TinyLlama-1.1B-Chat-rust-cpp-encodings/LORAs/tinyllama-rust/checkpoint-10000/trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{ | |
"epoch": 0.0032, | |
"grad_norm": NaN, | |
"learning_rate": 9.999948122981575e-05, | |
"loss": 1.0507, | |
"step": 32 | |
}, | |
{ | |
"epoch": 0.0064, | |
"grad_norm": 1.0899442434310913, | |
"learning_rate": 9.999770471768777e-05, | |
"loss": 1.016, | |
"step": 64 | |
}, | |
{ | |
"epoch": 0.0096, | |
"grad_norm": 2.957014799118042, | |
"learning_rate": 9.999466495684926e-05, | |
"loss": 0.9928, | |
"step": 96 | |
}, | |
{ | |
"epoch": 0.0128, | |
"grad_norm": 1.3575732707977295, | |
"learning_rate": 9.999036202410325e-05, | |
"loss": 0.8757, | |
"step": 128 | |
}, | |
{ | |
"epoch": 0.016, | |
"grad_norm": 1.0611408948898315, | |
"learning_rate": 9.998498908285819e-05, | |
"loss": 0.8615, | |
"step": 160 | |
}, | |
{ | |
"epoch": 0.0192, | |
"grad_norm": 1.382992148399353, | |
"learning_rate": 9.997819962824957e-05, | |
"loss": 0.8216, | |
"step": 192 | |
}, | |
{ | |
"epoch": 0.0224, | |
"grad_norm": 2.863276481628418, | |
"learning_rate": 9.997014741774866e-05, | |
"loss": 0.7406, | |
"step": 224 | |
}, | |
{ | |
"epoch": 0.0256, | |
"grad_norm": 1.1629211902618408, | |
"learning_rate": 9.996083265480365e-05, | |
"loss": 0.8171, | |
"step": 256 | |
}, | |
{ | |
"epoch": 0.0288, | |
"grad_norm": 2.2264232635498047, | |
"learning_rate": 9.995025557476261e-05, | |
"loss": 0.8835, | |
"step": 288 | |
}, | |
{ | |
"epoch": 0.032, | |
"grad_norm": 1.7896003723144531, | |
"learning_rate": 9.993841644486747e-05, | |
"loss": 0.7303, | |
"step": 320 | |
}, | |
{ | |
"epoch": 0.0352, | |
"grad_norm": 1.403260350227356, | |
"learning_rate": 9.992531556424726e-05, | |
"loss": 0.7384, | |
"step": 352 | |
}, | |
{ | |
"epoch": 0.0384, | |
"grad_norm": 2.308896780014038, | |
"learning_rate": 9.99109532639106e-05, | |
"loss": 0.8211, | |
"step": 384 | |
}, | |
{ | |
"epoch": 0.0416, | |
"grad_norm": 1.282929539680481, | |
"learning_rate": 9.989532990673728e-05, | |
"loss": 0.7211, | |
"step": 416 | |
}, | |
{ | |
"epoch": 0.0448, | |
"grad_norm": 2.4921414852142334, | |
"learning_rate": 9.987844588746915e-05, | |
"loss": 0.8204, | |
"step": 448 | |
}, | |
{ | |
"epoch": 0.048, | |
"grad_norm": 1.3490195274353027, | |
"learning_rate": 9.986030163270011e-05, | |
"loss": 0.7623, | |
"step": 480 | |
}, | |
{ | |
"epoch": 0.0512, | |
"grad_norm": 1.436516523361206, | |
"learning_rate": 9.98408976008653e-05, | |
"loss": 0.7981, | |
"step": 512 | |
}, | |
{ | |
"epoch": 0.0544, | |
"grad_norm": 2.3144304752349854, | |
"learning_rate": 9.982023428222962e-05, | |
"loss": 0.7422, | |
"step": 544 | |
}, | |
{ | |
"epoch": 0.0576, | |
"grad_norm": 1.2702479362487793, | |
"learning_rate": 9.979831219887525e-05, | |
"loss": 0.8107, | |
"step": 576 | |
}, | |
{ | |
"epoch": 0.0608, | |
"grad_norm": 3.110814332962036, | |
"learning_rate": 9.977513190468848e-05, | |
"loss": 0.8395, | |
"step": 608 | |
}, | |
{ | |
"epoch": 0.064, | |
"grad_norm": 4.934881687164307, | |
"learning_rate": 9.975069398534574e-05, | |
"loss": 0.8456, | |
"step": 640 | |
}, | |
{ | |
"epoch": 0.0672, | |
"grad_norm": 1.5248093605041504, | |
"learning_rate": 9.972499905829875e-05, | |
"loss": 0.7604, | |
"step": 672 | |
}, | |
{ | |
"epoch": 0.0704, | |
"grad_norm": 1.5269616842269897, | |
"learning_rate": 9.9698047772759e-05, | |
"loss": 0.7557, | |
"step": 704 | |
}, | |
{ | |
"epoch": 0.0736, | |
"grad_norm": 1.523474097251892, | |
"learning_rate": 9.966984080968128e-05, | |
"loss": 0.7622, | |
"step": 736 | |
}, | |
{ | |
"epoch": 0.0768, | |
"grad_norm": 1.3121402263641357, | |
"learning_rate": 9.96403788817465e-05, | |
"loss": 0.6912, | |
"step": 768 | |
}, | |
{ | |
"epoch": 0.08, | |
"grad_norm": 0.9180154800415039, | |
"learning_rate": 9.96096627333437e-05, | |
"loss": 0.8783, | |
"step": 800 | |
}, | |
{ | |
"epoch": 0.0832, | |
"grad_norm": 2.254473924636841, | |
"learning_rate": 9.957769314055117e-05, | |
"loss": 0.7987, | |
"step": 832 | |
}, | |
{ | |
"epoch": 0.0864, | |
"grad_norm": 1.9398365020751953, | |
"learning_rate": 9.954447091111694e-05, | |
"loss": 0.7703, | |
"step": 864 | |
}, | |
{ | |
"epoch": 0.0896, | |
"grad_norm": 1.4880696535110474, | |
"learning_rate": 9.950999688443833e-05, | |
"loss": 0.7258, | |
"step": 896 | |
}, | |
{ | |
"epoch": 0.0928, | |
"grad_norm": 1.8427962064743042, | |
"learning_rate": 9.947427193154071e-05, | |
"loss": 0.6981, | |
"step": 928 | |
}, | |
{ | |
"epoch": 0.096, | |
"grad_norm": 3.3647401332855225, | |
"learning_rate": 9.943729695505552e-05, | |
"loss": 0.7862, | |
"step": 960 | |
}, | |
{ | |
"epoch": 0.0992, | |
"grad_norm": 1.852992296218872, | |
"learning_rate": 9.939907288919747e-05, | |
"loss": 0.8016, | |
"step": 992 | |
}, | |
{ | |
"epoch": 0.1024, | |
"grad_norm": 1.2231330871582031, | |
"learning_rate": 9.935960069974096e-05, | |
"loss": 0.8001, | |
"step": 1024 | |
}, | |
{ | |
"epoch": 0.1056, | |
"grad_norm": 1.2329598665237427, | |
"learning_rate": 9.931888138399561e-05, | |
"loss": 0.7656, | |
"step": 1056 | |
}, | |
{ | |
"epoch": 0.1088, | |
"grad_norm": 1.4887111186981201, | |
"learning_rate": 9.927691597078108e-05, | |
"loss": 0.7772, | |
"step": 1088 | |
}, | |
{ | |
"epoch": 0.112, | |
"grad_norm": 1.1879202127456665, | |
"learning_rate": 9.923370552040116e-05, | |
"loss": 0.7368, | |
"step": 1120 | |
}, | |
{ | |
"epoch": 0.1152, | |
"grad_norm": 1.4578642845153809, | |
"learning_rate": 9.918925112461688e-05, | |
"loss": 0.7226, | |
"step": 1152 | |
}, | |
{ | |
"epoch": 0.1184, | |
"grad_norm": 3.8356716632843018, | |
"learning_rate": 9.914355390661896e-05, | |
"loss": 0.7468, | |
"step": 1184 | |
}, | |
{ | |
"epoch": 0.1216, | |
"grad_norm": 3.390878200531006, | |
"learning_rate": 9.909661502099943e-05, | |
"loss": 0.7163, | |
"step": 1216 | |
}, | |
{ | |
"epoch": 0.1248, | |
"grad_norm": 2.217479944229126, | |
"learning_rate": 9.904843565372248e-05, | |
"loss": 0.7805, | |
"step": 1248 | |
}, | |
{ | |
"epoch": 0.128, | |
"grad_norm": 0.7309045195579529, | |
"learning_rate": 9.899901702209445e-05, | |
"loss": 0.6929, | |
"step": 1280 | |
}, | |
{ | |
"epoch": 0.1312, | |
"grad_norm": 1.173700213432312, | |
"learning_rate": 9.89483603747331e-05, | |
"loss": 0.726, | |
"step": 1312 | |
}, | |
{ | |
"epoch": 0.1344, | |
"grad_norm": 1.4089820384979248, | |
"learning_rate": 9.88964669915361e-05, | |
"loss": 0.8606, | |
"step": 1344 | |
}, | |
{ | |
"epoch": 0.1376, | |
"grad_norm": 1.0375796556472778, | |
"learning_rate": 9.884333818364861e-05, | |
"loss": 0.721, | |
"step": 1376 | |
}, | |
{ | |
"epoch": 0.1408, | |
"grad_norm": 2.082084894180298, | |
"learning_rate": 9.878897529343023e-05, | |
"loss": 0.7884, | |
"step": 1408 | |
}, | |
{ | |
"epoch": 0.144, | |
"grad_norm": 0.7961512804031372, | |
"learning_rate": 9.873337969442101e-05, | |
"loss": 0.774, | |
"step": 1440 | |
}, | |
{ | |
"epoch": 0.1472, | |
"grad_norm": 1.3074238300323486, | |
"learning_rate": 9.867655279130683e-05, | |
"loss": 0.7392, | |
"step": 1472 | |
}, | |
{ | |
"epoch": 0.1504, | |
"grad_norm": 1.5205963850021362, | |
"learning_rate": 9.861849601988383e-05, | |
"loss": 0.7731, | |
"step": 1504 | |
}, | |
{ | |
"epoch": 0.1536, | |
"grad_norm": 1.4995771646499634, | |
"learning_rate": 9.855921084702219e-05, | |
"loss": 0.8281, | |
"step": 1536 | |
}, | |
{ | |
"epoch": 0.1568, | |
"grad_norm": 1.0279921293258667, | |
"learning_rate": 9.849869877062902e-05, | |
"loss": 0.6942, | |
"step": 1568 | |
}, | |
{ | |
"epoch": 0.16, | |
"grad_norm": 2.8020853996276855, | |
"learning_rate": 9.843696131961058e-05, | |
"loss": 0.7389, | |
"step": 1600 | |
}, | |
{ | |
"epoch": 0.1632, | |
"grad_norm": 2.99129056930542, | |
"learning_rate": 9.837400005383354e-05, | |
"loss": 0.7483, | |
"step": 1632 | |
}, | |
{ | |
"epoch": 0.1664, | |
"grad_norm": 2.325167179107666, | |
"learning_rate": 9.830981656408574e-05, | |
"loss": 0.7483, | |
"step": 1664 | |
}, | |
{ | |
"epoch": 0.1696, | |
"grad_norm": 0.7245140671730042, | |
"learning_rate": 9.824441247203579e-05, | |
"loss": 0.7633, | |
"step": 1696 | |
}, | |
{ | |
"epoch": 0.1728, | |
"grad_norm": 2.7938778400421143, | |
"learning_rate": 9.817778943019228e-05, | |
"loss": 0.7812, | |
"step": 1728 | |
}, | |
{ | |
"epoch": 0.176, | |
"grad_norm": 1.2263625860214233, | |
"learning_rate": 9.810994912186189e-05, | |
"loss": 0.7712, | |
"step": 1760 | |
}, | |
{ | |
"epoch": 0.1792, | |
"grad_norm": 1.2694672346115112, | |
"learning_rate": 9.804089326110697e-05, | |
"loss": 0.7297, | |
"step": 1792 | |
}, | |
{ | |
"epoch": 0.1824, | |
"grad_norm": 1.255414366722107, | |
"learning_rate": 9.797062359270215e-05, | |
"loss": 0.735, | |
"step": 1824 | |
}, | |
{ | |
"epoch": 0.1856, | |
"grad_norm": 1.3175591230392456, | |
"learning_rate": 9.789914189209029e-05, | |
"loss": 0.7633, | |
"step": 1856 | |
}, | |
{ | |
"epoch": 0.1888, | |
"grad_norm": 1.0326446294784546, | |
"learning_rate": 9.78264499653376e-05, | |
"loss": 0.7955, | |
"step": 1888 | |
}, | |
{ | |
"epoch": 0.192, | |
"grad_norm": 1.093620777130127, | |
"learning_rate": 9.775254964908807e-05, | |
"loss": 0.766, | |
"step": 1920 | |
}, | |
{ | |
"epoch": 0.1952, | |
"grad_norm": 1.4234970808029175, | |
"learning_rate": 9.767744281051701e-05, | |
"loss": 0.6725, | |
"step": 1952 | |
}, | |
{ | |
"epoch": 0.1984, | |
"grad_norm": 0.7571769952774048, | |
"learning_rate": 9.760113134728384e-05, | |
"loss": 0.6953, | |
"step": 1984 | |
}, | |
{ | |
"epoch": 0.2016, | |
"grad_norm": 1.7207865715026855, | |
"learning_rate": 9.752361718748423e-05, | |
"loss": 0.7356, | |
"step": 2016 | |
}, | |
{ | |
"epoch": 0.2048, | |
"grad_norm": 2.240748882293701, | |
"learning_rate": 9.744490228960138e-05, | |
"loss": 0.8067, | |
"step": 2048 | |
}, | |
{ | |
"epoch": 0.208, | |
"grad_norm": 1.2544214725494385, | |
"learning_rate": 9.736498864245638e-05, | |
"loss": 0.7618, | |
"step": 2080 | |
}, | |
{ | |
"epoch": 0.2112, | |
"grad_norm": 5.976646900177002, | |
"learning_rate": 9.728387826515819e-05, | |
"loss": 0.6825, | |
"step": 2112 | |
}, | |
{ | |
"epoch": 0.2144, | |
"grad_norm": 4.557011127471924, | |
"learning_rate": 9.72015732070525e-05, | |
"loss": 0.7623, | |
"step": 2144 | |
}, | |
{ | |
"epoch": 0.2176, | |
"grad_norm": 0.8000884056091309, | |
"learning_rate": 9.71180755476699e-05, | |
"loss": 0.7719, | |
"step": 2176 | |
}, | |
{ | |
"epoch": 0.2208, | |
"grad_norm": 1.115488052368164, | |
"learning_rate": 9.703338739667346e-05, | |
"loss": 0.7913, | |
"step": 2208 | |
}, | |
{ | |
"epoch": 0.224, | |
"grad_norm": 1.3180317878723145, | |
"learning_rate": 9.694751089380536e-05, | |
"loss": 0.7452, | |
"step": 2240 | |
}, | |
{ | |
"epoch": 0.2272, | |
"grad_norm": 2.9995932579040527, | |
"learning_rate": 9.686044820883285e-05, | |
"loss": 0.7962, | |
"step": 2272 | |
}, | |
{ | |
"epoch": 0.2304, | |
"grad_norm": 1.234027624130249, | |
"learning_rate": 9.677220154149336e-05, | |
"loss": 0.828, | |
"step": 2304 | |
}, | |
{ | |
"epoch": 0.2336, | |
"grad_norm": 1.6579309701919556, | |
"learning_rate": 9.668277312143907e-05, | |
"loss": 0.7569, | |
"step": 2336 | |
}, | |
{ | |
"epoch": 0.2368, | |
"grad_norm": 1.5580084323883057, | |
"learning_rate": 9.65921652081804e-05, | |
"loss": 0.7947, | |
"step": 2368 | |
}, | |
{ | |
"epoch": 0.24, | |
"grad_norm": 0.6711795330047607, | |
"learning_rate": 9.650038009102905e-05, | |
"loss": 0.7461, | |
"step": 2400 | |
}, | |
{ | |
"epoch": 0.2432, | |
"grad_norm": 1.2285038232803345, | |
"learning_rate": 9.640742008904005e-05, | |
"loss": 0.6587, | |
"step": 2432 | |
}, | |
{ | |
"epoch": 0.2464, | |
"grad_norm": 0.7901808619499207, | |
"learning_rate": 9.631328755095334e-05, | |
"loss": 0.7182, | |
"step": 2464 | |
}, | |
{ | |
"epoch": 0.2496, | |
"grad_norm": 0.6125284433364868, | |
"learning_rate": 9.62179848551342e-05, | |
"loss": 0.709, | |
"step": 2496 | |
}, | |
{ | |
"epoch": 0.2528, | |
"grad_norm": 1.1602981090545654, | |
"learning_rate": 9.612151440951334e-05, | |
"loss": 0.7039, | |
"step": 2528 | |
}, | |
{ | |
"epoch": 0.256, | |
"grad_norm": 2.366184711456299, | |
"learning_rate": 9.602387865152597e-05, | |
"loss": 0.8669, | |
"step": 2560 | |
}, | |
{ | |
"epoch": 0.2592, | |
"grad_norm": 2.583352565765381, | |
"learning_rate": 9.592508004805023e-05, | |
"loss": 0.7258, | |
"step": 2592 | |
}, | |
{ | |
"epoch": 0.2624, | |
"grad_norm": 2.132749557495117, | |
"learning_rate": 9.58251210953449e-05, | |
"loss": 0.6971, | |
"step": 2624 | |
}, | |
{ | |
"epoch": 0.2656, | |
"grad_norm": 1.4479436874389648, | |
"learning_rate": 9.572400431898627e-05, | |
"loss": 0.8086, | |
"step": 2656 | |
}, | |
{ | |
"epoch": 0.2688, | |
"grad_norm": 1.2764617204666138, | |
"learning_rate": 9.562173227380436e-05, | |
"loss": 0.7426, | |
"step": 2688 | |
}, | |
{ | |
"epoch": 0.272, | |
"grad_norm": 3.4120121002197266, | |
"learning_rate": 9.55183075438184e-05, | |
"loss": 0.7382, | |
"step": 2720 | |
}, | |
{ | |
"epoch": 0.2752, | |
"grad_norm": 1.9773039817810059, | |
"learning_rate": 9.541373274217145e-05, | |
"loss": 0.7903, | |
"step": 2752 | |
}, | |
{ | |
"epoch": 0.2784, | |
"grad_norm": 1.4097728729248047, | |
"learning_rate": 9.530801051106449e-05, | |
"loss": 0.7713, | |
"step": 2784 | |
}, | |
{ | |
"epoch": 0.2816, | |
"grad_norm": 1.0817668437957764, | |
"learning_rate": 9.520114352168958e-05, | |
"loss": 0.7275, | |
"step": 2816 | |
}, | |
{ | |
"epoch": 0.2848, | |
"grad_norm": 1.2667794227600098, | |
"learning_rate": 9.509313447416242e-05, | |
"loss": 0.6648, | |
"step": 2848 | |
}, | |
{ | |
"epoch": 0.288, | |
"grad_norm": 1.8679159879684448, | |
"learning_rate": 9.498398609745405e-05, | |
"loss": 0.7445, | |
"step": 2880 | |
}, | |
{ | |
"epoch": 0.2912, | |
"grad_norm": 2.8598544597625732, | |
"learning_rate": 9.487370114932202e-05, | |
"loss": 0.733, | |
"step": 2912 | |
}, | |
{ | |
"epoch": 0.2944, | |
"grad_norm": 0.9554559588432312, | |
"learning_rate": 9.476228241624059e-05, | |
"loss": 0.7487, | |
"step": 2944 | |
}, | |
{ | |
"epoch": 0.2976, | |
"grad_norm": 1.926672101020813, | |
"learning_rate": 9.464973271333042e-05, | |
"loss": 0.8864, | |
"step": 2976 | |
}, | |
{ | |
"epoch": 0.3008, | |
"grad_norm": 0.8425309658050537, | |
"learning_rate": 9.45360548842874e-05, | |
"loss": 0.7295, | |
"step": 3008 | |
}, | |
{ | |
"epoch": 0.304, | |
"grad_norm": 1.3110431432724, | |
"learning_rate": 9.442125180131078e-05, | |
"loss": 0.7547, | |
"step": 3040 | |
}, | |
{ | |
"epoch": 0.3072, | |
"grad_norm": 0.9774306416511536, | |
"learning_rate": 9.430532636503068e-05, | |
"loss": 0.7099, | |
"step": 3072 | |
}, | |
{ | |
"epoch": 0.3104, | |
"grad_norm": 0.6718234419822693, | |
"learning_rate": 9.418828150443469e-05, | |
"loss": 0.7636, | |
"step": 3104 | |
}, | |
{ | |
"epoch": 0.3136, | |
"grad_norm": 1.2758376598358154, | |
"learning_rate": 9.407012017679393e-05, | |
"loss": 0.7066, | |
"step": 3136 | |
}, | |
{ | |
"epoch": 0.3168, | |
"grad_norm": 1.3185311555862427, | |
"learning_rate": 9.395084536758838e-05, | |
"loss": 0.6785, | |
"step": 3168 | |
}, | |
{ | |
"epoch": 0.32, | |
"grad_norm": 1.550795078277588, | |
"learning_rate": 9.383046009043134e-05, | |
"loss": 0.7451, | |
"step": 3200 | |
}, | |
{ | |
"epoch": 0.3232, | |
"grad_norm": 1.1686354875564575, | |
"learning_rate": 9.370896738699339e-05, | |
"loss": 0.6652, | |
"step": 3232 | |
}, | |
{ | |
"epoch": 0.3264, | |
"grad_norm": 0.848976194858551, | |
"learning_rate": 9.358637032692545e-05, | |
"loss": 0.7705, | |
"step": 3264 | |
}, | |
{ | |
"epoch": 0.3296, | |
"grad_norm": 1.3812384605407715, | |
"learning_rate": 9.346267200778126e-05, | |
"loss": 0.7168, | |
"step": 3296 | |
}, | |
{ | |
"epoch": 0.3328, | |
"grad_norm": 1.008135199546814, | |
"learning_rate": 9.333787555493914e-05, | |
"loss": 0.7352, | |
"step": 3328 | |
}, | |
{ | |
"epoch": 0.336, | |
"grad_norm": 1.2273484468460083, | |
"learning_rate": 9.321198412152301e-05, | |
"loss": 0.7979, | |
"step": 3360 | |
}, | |
{ | |
"epoch": 0.3392, | |
"grad_norm": 0.8740741610527039, | |
"learning_rate": 9.308500088832272e-05, | |
"loss": 0.6846, | |
"step": 3392 | |
}, | |
{ | |
"epoch": 0.3424, | |
"grad_norm": 1.3684589862823486, | |
"learning_rate": 9.295692906371363e-05, | |
"loss": 0.7758, | |
"step": 3424 | |
}, | |
{ | |
"epoch": 0.3456, | |
"grad_norm": 1.2861257791519165, | |
"learning_rate": 9.282777188357565e-05, | |
"loss": 0.6581, | |
"step": 3456 | |
}, | |
{ | |
"epoch": 0.3488, | |
"grad_norm": 0.8915108442306519, | |
"learning_rate": 9.269753261121138e-05, | |
"loss": 0.7935, | |
"step": 3488 | |
}, | |
{ | |
"epoch": 0.352, | |
"grad_norm": 1.1308799982070923, | |
"learning_rate": 9.256621453726379e-05, | |
"loss": 0.7759, | |
"step": 3520 | |
}, | |
{ | |
"epoch": 0.3552, | |
"grad_norm": 1.0886152982711792, | |
"learning_rate": 9.243382097963291e-05, | |
"loss": 0.7207, | |
"step": 3552 | |
}, | |
{ | |
"epoch": 0.3584, | |
"grad_norm": 0.675757110118866, | |
"learning_rate": 9.230035528339211e-05, | |
"loss": 0.6876, | |
"step": 3584 | |
}, | |
{ | |
"epoch": 0.3616, | |
"grad_norm": 3.258622884750366, | |
"learning_rate": 9.216582082070358e-05, | |
"loss": 0.7498, | |
"step": 3616 | |
}, | |
{ | |
"epoch": 0.3648, | |
"grad_norm": 3.8826818466186523, | |
"learning_rate": 9.203022099073309e-05, | |
"loss": 0.7993, | |
"step": 3648 | |
}, | |
{ | |
"epoch": 0.368, | |
"grad_norm": 0.9782927632331848, | |
"learning_rate": 9.189355921956412e-05, | |
"loss": 0.7005, | |
"step": 3680 | |
}, | |
{ | |
"epoch": 0.3712, | |
"grad_norm": 1.1662654876708984, | |
"learning_rate": 9.175583896011131e-05, | |
"loss": 0.6732, | |
"step": 3712 | |
}, | |
{ | |
"epoch": 0.3744, | |
"grad_norm": 1.038501501083374, | |
"learning_rate": 9.161706369203317e-05, | |
"loss": 0.7414, | |
"step": 3744 | |
}, | |
{ | |
"epoch": 0.3776, | |
"grad_norm": 2.3218936920166016, | |
"learning_rate": 9.147723692164427e-05, | |
"loss": 0.8008, | |
"step": 3776 | |
}, | |
{ | |
"epoch": 0.3808, | |
"grad_norm": 2.1190292835235596, | |
"learning_rate": 9.13363621818265e-05, | |
"loss": 0.711, | |
"step": 3808 | |
}, | |
{ | |
"epoch": 0.384, | |
"grad_norm": 1.8653652667999268, | |
"learning_rate": 9.119444303193996e-05, | |
"loss": 0.7641, | |
"step": 3840 | |
}, | |
{ | |
"epoch": 0.3872, | |
"grad_norm": 1.760704517364502, | |
"learning_rate": 9.10514830577329e-05, | |
"loss": 0.7231, | |
"step": 3872 | |
}, | |
{ | |
"epoch": 0.3904, | |
"grad_norm": 0.8437080979347229, | |
"learning_rate": 9.090748587125118e-05, | |
"loss": 0.7089, | |
"step": 3904 | |
}, | |
{ | |
"epoch": 0.3936, | |
"grad_norm": 1.6417099237442017, | |
"learning_rate": 9.076245511074703e-05, | |
"loss": 0.7645, | |
"step": 3936 | |
}, | |
{ | |
"epoch": 0.3968, | |
"grad_norm": 1.0280499458312988, | |
"learning_rate": 9.06163944405871e-05, | |
"loss": 0.78, | |
"step": 3968 | |
}, | |
{ | |
"epoch": 0.4, | |
"grad_norm": 2.645205020904541, | |
"learning_rate": 9.046930755115985e-05, | |
"loss": 0.7443, | |
"step": 4000 | |
}, | |
{ | |
"epoch": 0.4032, | |
"grad_norm": 1.4332572221755981, | |
"learning_rate": 9.032119815878236e-05, | |
"loss": 0.7138, | |
"step": 4032 | |
}, | |
{ | |
"epoch": 0.4064, | |
"grad_norm": 1.3062529563903809, | |
"learning_rate": 9.017207000560639e-05, | |
"loss": 0.6866, | |
"step": 4064 | |
}, | |
{ | |
"epoch": 0.4096, | |
"grad_norm": 1.559920072555542, | |
"learning_rate": 9.002192685952385e-05, | |
"loss": 0.7289, | |
"step": 4096 | |
}, | |
{ | |
"epoch": 0.4128, | |
"grad_norm": 2.111950635910034, | |
"learning_rate": 8.987077251407158e-05, | |
"loss": 0.7011, | |
"step": 4128 | |
}, | |
{ | |
"epoch": 0.416, | |
"grad_norm": 0.8812033534049988, | |
"learning_rate": 8.971861078833557e-05, | |
"loss": 0.7469, | |
"step": 4160 | |
}, | |
{ | |
"epoch": 0.4192, | |
"grad_norm": 0.8479238748550415, | |
"learning_rate": 8.956544552685437e-05, | |
"loss": 0.7263, | |
"step": 4192 | |
}, | |
{ | |
"epoch": 0.4224, | |
"grad_norm": 1.0125929117202759, | |
"learning_rate": 8.941128059952201e-05, | |
"loss": 0.6762, | |
"step": 4224 | |
}, | |
{ | |
"epoch": 0.4256, | |
"grad_norm": 0.9122424721717834, | |
"learning_rate": 8.925611990149021e-05, | |
"loss": 0.7076, | |
"step": 4256 | |
}, | |
{ | |
"epoch": 0.4288, | |
"grad_norm": 1.814253330230713, | |
"learning_rate": 8.909996735306996e-05, | |
"loss": 0.7143, | |
"step": 4288 | |
}, | |
{ | |
"epoch": 0.432, | |
"grad_norm": 1.4890289306640625, | |
"learning_rate": 8.894282689963251e-05, | |
"loss": 0.6931, | |
"step": 4320 | |
}, | |
{ | |
"epoch": 0.4352, | |
"grad_norm": 1.908116340637207, | |
"learning_rate": 8.878470251150959e-05, | |
"loss": 0.701, | |
"step": 4352 | |
}, | |
{ | |
"epoch": 0.4384, | |
"grad_norm": 1.2831019163131714, | |
"learning_rate": 8.862559818389322e-05, | |
"loss": 0.7625, | |
"step": 4384 | |
}, | |
{ | |
"epoch": 0.4416, | |
"grad_norm": 0.923768162727356, | |
"learning_rate": 8.846551793673467e-05, | |
"loss": 0.7902, | |
"step": 4416 | |
}, | |
{ | |
"epoch": 0.4448, | |
"grad_norm": 1.6989527940750122, | |
"learning_rate": 8.83044658146429e-05, | |
"loss": 0.7006, | |
"step": 4448 | |
}, | |
{ | |
"epoch": 0.448, | |
"grad_norm": 1.203029990196228, | |
"learning_rate": 8.814244588678245e-05, | |
"loss": 0.7588, | |
"step": 4480 | |
}, | |
{ | |
"epoch": 0.4512, | |
"grad_norm": 1.8377019166946411, | |
"learning_rate": 8.797946224677052e-05, | |
"loss": 0.6975, | |
"step": 4512 | |
}, | |
{ | |
"epoch": 0.4544, | |
"grad_norm": 1.4714457988739014, | |
"learning_rate": 8.78155190125736e-05, | |
"loss": 0.6502, | |
"step": 4544 | |
}, | |
{ | |
"epoch": 0.4576, | |
"grad_norm": 1.6311497688293457, | |
"learning_rate": 8.765062032640346e-05, | |
"loss": 0.7536, | |
"step": 4576 | |
}, | |
{ | |
"epoch": 0.4608, | |
"grad_norm": 1.8238953351974487, | |
"learning_rate": 8.748477035461238e-05, | |
"loss": 0.7899, | |
"step": 4608 | |
}, | |
{ | |
"epoch": 0.464, | |
"grad_norm": 1.5541362762451172, | |
"learning_rate": 8.7317973287588e-05, | |
"loss": 0.6904, | |
"step": 4640 | |
}, | |
{ | |
"epoch": 0.4672, | |
"grad_norm": 1.032272219657898, | |
"learning_rate": 8.715023333964736e-05, | |
"loss": 0.7395, | |
"step": 4672 | |
}, | |
{ | |
"epoch": 0.4704, | |
"grad_norm": 1.3095510005950928, | |
"learning_rate": 8.69815547489305e-05, | |
"loss": 0.6854, | |
"step": 4704 | |
}, | |
{ | |
"epoch": 0.4736, | |
"grad_norm": 1.5274263620376587, | |
"learning_rate": 8.681194177729327e-05, | |
"loss": 0.7498, | |
"step": 4736 | |
}, | |
{ | |
"epoch": 0.4768, | |
"grad_norm": 1.4236122369766235, | |
"learning_rate": 8.66413987101998e-05, | |
"loss": 0.7356, | |
"step": 4768 | |
}, | |
{ | |
"epoch": 0.48, | |
"grad_norm": 1.2118279933929443, | |
"learning_rate": 8.646992985661404e-05, | |
"loss": 0.7178, | |
"step": 4800 | |
}, | |
{ | |
"epoch": 0.4832, | |
"grad_norm": 3.3495805263519287, | |
"learning_rate": 8.629753954889107e-05, | |
"loss": 0.7326, | |
"step": 4832 | |
}, | |
{ | |
"epoch": 0.4864, | |
"grad_norm": 0.6829349398612976, | |
"learning_rate": 8.612423214266749e-05, | |
"loss": 0.7838, | |
"step": 4864 | |
}, | |
{ | |
"epoch": 0.4896, | |
"grad_norm": 0.8314148187637329, | |
"learning_rate": 8.595001201675147e-05, | |
"loss": 0.7007, | |
"step": 4896 | |
}, | |
{ | |
"epoch": 0.4928, | |
"grad_norm": 1.2672547101974487, | |
"learning_rate": 8.577488357301209e-05, | |
"loss": 0.7377, | |
"step": 4928 | |
}, | |
{ | |
"epoch": 0.496, | |
"grad_norm": 1.3968323469161987, | |
"learning_rate": 8.559885123626807e-05, | |
"loss": 0.6774, | |
"step": 4960 | |
}, | |
{ | |
"epoch": 0.4992, | |
"grad_norm": 1.2808008193969727, | |
"learning_rate": 8.542191945417601e-05, | |
"loss": 0.6807, | |
"step": 4992 | |
}, | |
{ | |
"epoch": 0.5024, | |
"grad_norm": 1.9290404319763184, | |
"learning_rate": 8.524409269711807e-05, | |
"loss": 0.7376, | |
"step": 5024 | |
}, | |
{ | |
"epoch": 0.5056, | |
"grad_norm": 1.3726913928985596, | |
"learning_rate": 8.506537545808892e-05, | |
"loss": 0.7402, | |
"step": 5056 | |
}, | |
{ | |
"epoch": 0.5088, | |
"grad_norm": 1.7894905805587769, | |
"learning_rate": 8.48857722525823e-05, | |
"loss": 0.6991, | |
"step": 5088 | |
}, | |
{ | |
"epoch": 0.512, | |
"grad_norm": 1.1462016105651855, | |
"learning_rate": 8.470528761847684e-05, | |
"loss": 0.7989, | |
"step": 5120 | |
}, | |
{ | |
"epoch": 0.5152, | |
"grad_norm": 0.7457314729690552, | |
"learning_rate": 8.452392611592153e-05, | |
"loss": 0.7616, | |
"step": 5152 | |
}, | |
{ | |
"epoch": 0.5184, | |
"grad_norm": 1.728968858718872, | |
"learning_rate": 8.434169232722043e-05, | |
"loss": 0.6324, | |
"step": 5184 | |
}, | |
{ | |
"epoch": 0.5216, | |
"grad_norm": 0.9103218913078308, | |
"learning_rate": 8.415859085671683e-05, | |
"loss": 0.7222, | |
"step": 5216 | |
}, | |
{ | |
"epoch": 0.5248, | |
"grad_norm": 1.602072834968567, | |
"learning_rate": 8.397462633067704e-05, | |
"loss": 0.7265, | |
"step": 5248 | |
}, | |
{ | |
"epoch": 0.528, | |
"grad_norm": 0.9967379570007324, | |
"learning_rate": 8.378980339717349e-05, | |
"loss": 0.7042, | |
"step": 5280 | |
}, | |
{ | |
"epoch": 0.5312, | |
"grad_norm": 1.9905532598495483, | |
"learning_rate": 8.360412672596712e-05, | |
"loss": 0.8098, | |
"step": 5312 | |
}, | |
{ | |
"epoch": 0.5344, | |
"grad_norm": 1.1432929039001465, | |
"learning_rate": 8.341760100838965e-05, | |
"loss": 0.7591, | |
"step": 5344 | |
}, | |
{ | |
"epoch": 0.5376, | |
"grad_norm": 2.1654775142669678, | |
"learning_rate": 8.323023095722486e-05, | |
"loss": 0.8071, | |
"step": 5376 | |
}, | |
{ | |
"epoch": 0.5408, | |
"grad_norm": 1.2390097379684448, | |
"learning_rate": 8.304202130658959e-05, | |
"loss": 0.834, | |
"step": 5408 | |
}, | |
{ | |
"epoch": 0.544, | |
"grad_norm": 1.0290433168411255, | |
"learning_rate": 8.285297681181408e-05, | |
"loss": 0.8228, | |
"step": 5440 | |
}, | |
{ | |
"epoch": 0.5472, | |
"grad_norm": 1.299111008644104, | |
"learning_rate": 8.26631022493219e-05, | |
"loss": 0.7099, | |
"step": 5472 | |
}, | |
{ | |
"epoch": 0.5504, | |
"grad_norm": 0.8850242495536804, | |
"learning_rate": 8.247240241650918e-05, | |
"loss": 0.7646, | |
"step": 5504 | |
}, | |
{ | |
"epoch": 0.5536, | |
"grad_norm": 1.980812907218933, | |
"learning_rate": 8.22808821316235e-05, | |
"loss": 0.7312, | |
"step": 5536 | |
}, | |
{ | |
"epoch": 0.5568, | |
"grad_norm": 1.0378026962280273, | |
"learning_rate": 8.208854623364202e-05, | |
"loss": 0.7277, | |
"step": 5568 | |
}, | |
{ | |
"epoch": 0.56, | |
"grad_norm": 1.6820452213287354, | |
"learning_rate": 8.189539958214935e-05, | |
"loss": 0.7654, | |
"step": 5600 | |
}, | |
{ | |
"epoch": 0.5632, | |
"grad_norm": 1.494661808013916, | |
"learning_rate": 8.170144705721465e-05, | |
"loss": 0.7208, | |
"step": 5632 | |
}, | |
{ | |
"epoch": 0.5664, | |
"grad_norm": 0.9761049747467041, | |
"learning_rate": 8.150669355926846e-05, | |
"loss": 0.6898, | |
"step": 5664 | |
}, | |
{ | |
"epoch": 0.5696, | |
"grad_norm": 1.3057583570480347, | |
"learning_rate": 8.131114400897874e-05, | |
"loss": 0.7887, | |
"step": 5696 | |
}, | |
{ | |
"epoch": 0.5728, | |
"grad_norm": 1.0025156736373901, | |
"learning_rate": 8.111480334712665e-05, | |
"loss": 0.6483, | |
"step": 5728 | |
}, | |
{ | |
"epoch": 0.576, | |
"grad_norm": 0.9818746447563171, | |
"learning_rate": 8.091767653448167e-05, | |
"loss": 0.8385, | |
"step": 5760 | |
}, | |
{ | |
"epoch": 0.5792, | |
"grad_norm": 1.1921987533569336, | |
"learning_rate": 8.071976855167629e-05, | |
"loss": 0.6707, | |
"step": 5792 | |
}, | |
{ | |
"epoch": 0.5824, | |
"grad_norm": 1.5055749416351318, | |
"learning_rate": 8.052108439908013e-05, | |
"loss": 0.7086, | |
"step": 5824 | |
}, | |
{ | |
"epoch": 0.5856, | |
"grad_norm": 1.7581650018692017, | |
"learning_rate": 8.032162909667362e-05, | |
"loss": 0.6696, | |
"step": 5856 | |
}, | |
{ | |
"epoch": 0.5888, | |
"grad_norm": 1.8909873962402344, | |
"learning_rate": 8.01214076839212e-05, | |
"loss": 0.7471, | |
"step": 5888 | |
}, | |
{ | |
"epoch": 0.592, | |
"grad_norm": 1.3570644855499268, | |
"learning_rate": 7.992042521964389e-05, | |
"loss": 0.655, | |
"step": 5920 | |
}, | |
{ | |
"epoch": 0.5952, | |
"grad_norm": 0.6561287641525269, | |
"learning_rate": 7.971868678189161e-05, | |
"loss": 0.719, | |
"step": 5952 | |
}, | |
{ | |
"epoch": 0.5984, | |
"grad_norm": 1.3650476932525635, | |
"learning_rate": 7.951619746781474e-05, | |
"loss": 0.7405, | |
"step": 5984 | |
}, | |
{ | |
"epoch": 0.6016, | |
"grad_norm": 2.8344266414642334, | |
"learning_rate": 7.931296239353544e-05, | |
"loss": 0.7192, | |
"step": 6016 | |
}, | |
{ | |
"epoch": 0.6048, | |
"grad_norm": 1.6202623844146729, | |
"learning_rate": 7.910898669401839e-05, | |
"loss": 0.7671, | |
"step": 6048 | |
}, | |
{ | |
"epoch": 0.608, | |
"grad_norm": 1.1194038391113281, | |
"learning_rate": 7.890427552294093e-05, | |
"loss": 0.7915, | |
"step": 6080 | |
}, | |
{ | |
"epoch": 0.6112, | |
"grad_norm": 0.8267541527748108, | |
"learning_rate": 7.869883405256295e-05, | |
"loss": 0.7441, | |
"step": 6112 | |
}, | |
{ | |
"epoch": 0.6144, | |
"grad_norm": 1.229134202003479, | |
"learning_rate": 7.849266747359619e-05, | |
"loss": 0.6548, | |
"step": 6144 | |
}, | |
{ | |
"epoch": 0.6176, | |
"grad_norm": 1.151248812675476, | |
"learning_rate": 7.828578099507308e-05, | |
"loss": 0.6795, | |
"step": 6176 | |
}, | |
{ | |
"epoch": 0.6208, | |
"grad_norm": 1.620975375175476, | |
"learning_rate": 7.80781798442151e-05, | |
"loss": 0.6352, | |
"step": 6208 | |
}, | |
{ | |
"epoch": 0.624, | |
"grad_norm": 0.9030219912528992, | |
"learning_rate": 7.786986926630078e-05, | |
"loss": 0.7185, | |
"step": 6240 | |
}, | |
{ | |
"epoch": 0.6272, | |
"grad_norm": 1.2997703552246094, | |
"learning_rate": 7.766085452453312e-05, | |
"loss": 0.6523, | |
"step": 6272 | |
}, | |
{ | |
"epoch": 0.6304, | |
"grad_norm": 1.208347201347351, | |
"learning_rate": 7.74511408999066e-05, | |
"loss": 0.6928, | |
"step": 6304 | |
}, | |
{ | |
"epoch": 0.6336, | |
"grad_norm": 0.723646879196167, | |
"learning_rate": 7.724073369107376e-05, | |
"loss": 0.6603, | |
"step": 6336 | |
}, | |
{ | |
"epoch": 0.6368, | |
"grad_norm": 1.125978946685791, | |
"learning_rate": 7.702963821421133e-05, | |
"loss": 0.7328, | |
"step": 6368 | |
}, | |
{ | |
"epoch": 0.64, | |
"grad_norm": 2.039461135864258, | |
"learning_rate": 7.6817859802886e-05, | |
"loss": 0.7545, | |
"step": 6400 | |
}, | |
{ | |
"epoch": 0.6432, | |
"grad_norm": 1.3743586540222168, | |
"learning_rate": 7.660540380791942e-05, | |
"loss": 0.67, | |
"step": 6432 | |
}, | |
{ | |
"epoch": 0.6464, | |
"grad_norm": 1.402256727218628, | |
"learning_rate": 7.639227559725332e-05, | |
"loss": 0.636, | |
"step": 6464 | |
}, | |
{ | |
"epoch": 0.6496, | |
"grad_norm": 1.0240074396133423, | |
"learning_rate": 7.617848055581361e-05, | |
"loss": 0.8179, | |
"step": 6496 | |
}, | |
{ | |
"epoch": 0.6528, | |
"grad_norm": 0.8905365467071533, | |
"learning_rate": 7.596402408537443e-05, | |
"loss": 0.7542, | |
"step": 6528 | |
}, | |
{ | |
"epoch": 0.656, | |
"grad_norm": 1.8598270416259766, | |
"learning_rate": 7.574891160442179e-05, | |
"loss": 0.7266, | |
"step": 6560 | |
}, | |
{ | |
"epoch": 0.6592, | |
"grad_norm": 0.9146720170974731, | |
"learning_rate": 7.553314854801641e-05, | |
"loss": 0.7861, | |
"step": 6592 | |
}, | |
{ | |
"epoch": 0.6624, | |
"grad_norm": 1.8956897258758545, | |
"learning_rate": 7.531674036765662e-05, | |
"loss": 0.7113, | |
"step": 6624 | |
}, | |
{ | |
"epoch": 0.6656, | |
"grad_norm": 1.0353283882141113, | |
"learning_rate": 7.509969253114055e-05, | |
"loss": 0.6984, | |
"step": 6656 | |
}, | |
{ | |
"epoch": 0.6688, | |
"grad_norm": 1.890493631362915, | |
"learning_rate": 7.488201052242789e-05, | |
"loss": 0.6687, | |
"step": 6688 | |
}, | |
{ | |
"epoch": 0.672, | |
"grad_norm": 0.9367122054100037, | |
"learning_rate": 7.46636998415015e-05, | |
"loss": 0.719, | |
"step": 6720 | |
}, | |
{ | |
"epoch": 0.6752, | |
"grad_norm": 1.1989344358444214, | |
"learning_rate": 7.444476600422828e-05, | |
"loss": 0.775, | |
"step": 6752 | |
}, | |
{ | |
"epoch": 0.6784, | |
"grad_norm": 0.8481733202934265, | |
"learning_rate": 7.42252145422199e-05, | |
"loss": 0.7667, | |
"step": 6784 | |
}, | |
{ | |
"epoch": 0.6816, | |
"grad_norm": 1.0271095037460327, | |
"learning_rate": 7.400505100269307e-05, | |
"loss": 0.653, | |
"step": 6816 | |
}, | |
{ | |
"epoch": 0.6848, | |
"grad_norm": 1.3998816013336182, | |
"learning_rate": 7.378428094832931e-05, | |
"loss": 0.6651, | |
"step": 6848 | |
}, | |
{ | |
"epoch": 0.688, | |
"grad_norm": 1.3338642120361328, | |
"learning_rate": 7.356290995713437e-05, | |
"loss": 0.6266, | |
"step": 6880 | |
}, | |
{ | |
"epoch": 0.6912, | |
"grad_norm": 0.8170168995857239, | |
"learning_rate": 7.334094362229739e-05, | |
"loss": 0.765, | |
"step": 6912 | |
}, | |
{ | |
"epoch": 0.6944, | |
"grad_norm": 1.4982614517211914, | |
"learning_rate": 7.311838755204959e-05, | |
"loss": 0.641, | |
"step": 6944 | |
}, | |
{ | |
"epoch": 0.6976, | |
"grad_norm": 1.623159646987915, | |
"learning_rate": 7.290222928580347e-05, | |
"loss": 0.6462, | |
"step": 6976 | |
}, | |
{ | |
"epoch": 0.7008, | |
"grad_norm": 1.169145941734314, | |
"learning_rate": 7.267852862072673e-05, | |
"loss": 0.7506, | |
"step": 7008 | |
}, | |
{ | |
"epoch": 0.704, | |
"grad_norm": 1.011816382408142, | |
"learning_rate": 7.245425495690538e-05, | |
"loss": 0.7183, | |
"step": 7040 | |
}, | |
{ | |
"epoch": 0.7072, | |
"grad_norm": 3.0435078144073486, | |
"learning_rate": 7.222941396086789e-05, | |
"loss": 0.7948, | |
"step": 7072 | |
}, | |
{ | |
"epoch": 0.7104, | |
"grad_norm": 0.802679717540741, | |
"learning_rate": 7.2004011313477e-05, | |
"loss": 0.8216, | |
"step": 7104 | |
}, | |
{ | |
"epoch": 0.7136, | |
"grad_norm": 0.7551457285881042, | |
"learning_rate": 7.17780527097862e-05, | |
"loss": 0.7823, | |
"step": 7136 | |
}, | |
{ | |
"epoch": 0.7168, | |
"grad_norm": 1.3118380308151245, | |
"learning_rate": 7.155154385889589e-05, | |
"loss": 0.7803, | |
"step": 7168 | |
}, | |
{ | |
"epoch": 0.72, | |
"grad_norm": 1.1100643873214722, | |
"learning_rate": 7.132449048380907e-05, | |
"loss": 0.7425, | |
"step": 7200 | |
}, | |
{ | |
"epoch": 0.7232, | |
"grad_norm": 0.8792561888694763, | |
"learning_rate": 7.109689832128673e-05, | |
"loss": 0.7515, | |
"step": 7232 | |
}, | |
{ | |
"epoch": 0.7264, | |
"grad_norm": 0.8382082581520081, | |
"learning_rate": 7.0868773121703e-05, | |
"loss": 0.8134, | |
"step": 7264 | |
}, | |
{ | |
"epoch": 0.7296, | |
"grad_norm": 1.7332772016525269, | |
"learning_rate": 7.064012064889971e-05, | |
"loss": 0.6971, | |
"step": 7296 | |
}, | |
{ | |
"epoch": 0.7328, | |
"grad_norm": 1.4402042627334595, | |
"learning_rate": 7.041094668004093e-05, | |
"loss": 0.6845, | |
"step": 7328 | |
}, | |
{ | |
"epoch": 0.736, | |
"grad_norm": 1.1810777187347412, | |
"learning_rate": 7.018125700546683e-05, | |
"loss": 0.7472, | |
"step": 7360 | |
}, | |
{ | |
"epoch": 0.7392, | |
"grad_norm": 0.9390580058097839, | |
"learning_rate": 6.995105742854759e-05, | |
"loss": 0.8127, | |
"step": 7392 | |
}, | |
{ | |
"epoch": 0.7424, | |
"grad_norm": 1.570432186126709, | |
"learning_rate": 6.972035376553656e-05, | |
"loss": 0.7071, | |
"step": 7424 | |
}, | |
{ | |
"epoch": 0.7456, | |
"grad_norm": 1.168547511100769, | |
"learning_rate": 6.94891518454234e-05, | |
"loss": 0.7017, | |
"step": 7456 | |
}, | |
{ | |
"epoch": 0.7488, | |
"grad_norm": 1.1337932348251343, | |
"learning_rate": 6.925745750978686e-05, | |
"loss": 0.6738, | |
"step": 7488 | |
}, | |
{ | |
"epoch": 0.752, | |
"grad_norm": 1.351352334022522, | |
"learning_rate": 6.902527661264701e-05, | |
"loss": 0.7548, | |
"step": 7520 | |
}, | |
{ | |
"epoch": 0.7552, | |
"grad_norm": 0.6679269671440125, | |
"learning_rate": 6.87926150203176e-05, | |
"loss": 0.7106, | |
"step": 7552 | |
}, | |
{ | |
"epoch": 0.7584, | |
"grad_norm": 1.3825992345809937, | |
"learning_rate": 6.855947861125759e-05, | |
"loss": 0.6443, | |
"step": 7584 | |
}, | |
{ | |
"epoch": 0.7616, | |
"grad_norm": 1.1650683879852295, | |
"learning_rate": 6.832587327592275e-05, | |
"loss": 0.7547, | |
"step": 7616 | |
}, | |
{ | |
"epoch": 0.7648, | |
"grad_norm": 1.5112355947494507, | |
"learning_rate": 6.809180491661678e-05, | |
"loss": 0.7076, | |
"step": 7648 | |
}, | |
{ | |
"epoch": 0.768, | |
"grad_norm": 0.8795199990272522, | |
"learning_rate": 6.785727944734228e-05, | |
"loss": 0.7345, | |
"step": 7680 | |
}, | |
{ | |
"epoch": 0.7712, | |
"grad_norm": 1.6340776681900024, | |
"learning_rate": 6.762230279365114e-05, | |
"loss": 0.7517, | |
"step": 7712 | |
}, | |
{ | |
"epoch": 0.7744, | |
"grad_norm": 1.022924542427063, | |
"learning_rate": 6.738688089249502e-05, | |
"loss": 0.6874, | |
"step": 7744 | |
}, | |
{ | |
"epoch": 0.7776, | |
"grad_norm": 1.2930107116699219, | |
"learning_rate": 6.715101969207525e-05, | |
"loss": 0.7479, | |
"step": 7776 | |
}, | |
{ | |
"epoch": 0.7808, | |
"grad_norm": 1.9842311143875122, | |
"learning_rate": 6.691472515169251e-05, | |
"loss": 0.7479, | |
"step": 7808 | |
}, | |
{ | |
"epoch": 0.784, | |
"grad_norm": 1.5960675477981567, | |
"learning_rate": 6.667800324159636e-05, | |
"loss": 0.7928, | |
"step": 7840 | |
}, | |
{ | |
"epoch": 0.7872, | |
"grad_norm": 3.447913885116577, | |
"learning_rate": 6.644085994283433e-05, | |
"loss": 0.6924, | |
"step": 7872 | |
}, | |
{ | |
"epoch": 0.7904, | |
"grad_norm": 0.8809865713119507, | |
"learning_rate": 6.620330124710077e-05, | |
"loss": 0.7955, | |
"step": 7904 | |
}, | |
{ | |
"epoch": 0.7936, | |
"grad_norm": 1.3761461973190308, | |
"learning_rate": 6.596533315658555e-05, | |
"loss": 0.6842, | |
"step": 7936 | |
}, | |
{ | |
"epoch": 0.7968, | |
"grad_norm": 0.9557456374168396, | |
"learning_rate": 6.572696168382235e-05, | |
"loss": 0.7285, | |
"step": 7968 | |
}, | |
{ | |
"epoch": 0.8, | |
"grad_norm": 0.7569695115089417, | |
"learning_rate": 6.548819285153676e-05, | |
"loss": 0.6431, | |
"step": 8000 | |
}, | |
{ | |
"epoch": 0.8032, | |
"grad_norm": 1.2884209156036377, | |
"learning_rate": 6.524903269249411e-05, | |
"loss": 0.739, | |
"step": 8032 | |
}, | |
{ | |
"epoch": 0.8064, | |
"grad_norm": 1.033050775527954, | |
"learning_rate": 6.500948724934703e-05, | |
"loss": 0.6759, | |
"step": 8064 | |
}, | |
{ | |
"epoch": 0.8096, | |
"grad_norm": 0.9404661655426025, | |
"learning_rate": 6.47695625744828e-05, | |
"loss": 0.696, | |
"step": 8096 | |
}, | |
{ | |
"epoch": 0.8128, | |
"grad_norm": 0.8363805413246155, | |
"learning_rate": 6.452926472987044e-05, | |
"loss": 0.7273, | |
"step": 8128 | |
}, | |
{ | |
"epoch": 0.816, | |
"grad_norm": 0.7976164817810059, | |
"learning_rate": 6.428859978690748e-05, | |
"loss": 0.6671, | |
"step": 8160 | |
}, | |
{ | |
"epoch": 0.8192, | |
"grad_norm": 1.6969666481018066, | |
"learning_rate": 6.404757382626669e-05, | |
"loss": 0.6968, | |
"step": 8192 | |
}, | |
{ | |
"epoch": 0.8224, | |
"grad_norm": 1.061860203742981, | |
"learning_rate": 6.380619293774223e-05, | |
"loss": 0.7424, | |
"step": 8224 | |
}, | |
{ | |
"epoch": 0.8256, | |
"grad_norm": 1.2336043119430542, | |
"learning_rate": 6.356446322009607e-05, | |
"loss": 0.6786, | |
"step": 8256 | |
}, | |
{ | |
"epoch": 0.8288, | |
"grad_norm": 1.3530735969543457, | |
"learning_rate": 6.332239078090358e-05, | |
"loss": 0.7042, | |
"step": 8288 | |
}, | |
{ | |
"epoch": 0.832, | |
"grad_norm": 0.9186837673187256, | |
"learning_rate": 6.307998173639954e-05, | |
"loss": 0.7433, | |
"step": 8320 | |
}, | |
{ | |
"epoch": 0.8352, | |
"grad_norm": 1.0583479404449463, | |
"learning_rate": 6.283724221132333e-05, | |
"loss": 0.6515, | |
"step": 8352 | |
}, | |
{ | |
"epoch": 0.8384, | |
"grad_norm": 1.468887209892273, | |
"learning_rate": 6.259417833876432e-05, | |
"loss": 0.7033, | |
"step": 8384 | |
}, | |
{ | |
"epoch": 0.8416, | |
"grad_norm": 0.7726921439170837, | |
"learning_rate": 6.235079626000694e-05, | |
"loss": 0.721, | |
"step": 8416 | |
}, | |
{ | |
"epoch": 0.8448, | |
"grad_norm": 1.8641211986541748, | |
"learning_rate": 6.21071021243754e-05, | |
"loss": 0.626, | |
"step": 8448 | |
}, | |
{ | |
"epoch": 0.848, | |
"grad_norm": 1.9702180624008179, | |
"learning_rate": 6.186310208907839e-05, | |
"loss": 0.6017, | |
"step": 8480 | |
}, | |
{ | |
"epoch": 0.8512, | |
"grad_norm": 2.057535171508789, | |
"learning_rate": 6.161880231905354e-05, | |
"loss": 0.7612, | |
"step": 8512 | |
}, | |
{ | |
"epoch": 0.8544, | |
"grad_norm": 2.2840230464935303, | |
"learning_rate": 6.137420898681158e-05, | |
"loss": 0.6609, | |
"step": 8544 | |
}, | |
{ | |
"epoch": 0.8576, | |
"grad_norm": 1.7856135368347168, | |
"learning_rate": 6.112932827228044e-05, | |
"loss": 0.7015, | |
"step": 8576 | |
}, | |
{ | |
"epoch": 0.8608, | |
"grad_norm": 1.0354335308074951, | |
"learning_rate": 6.0884166362649075e-05, | |
"loss": 0.6714, | |
"step": 8608 | |
}, | |
{ | |
"epoch": 0.864, | |
"grad_norm": 1.054237961769104, | |
"learning_rate": 6.063872945221118e-05, | |
"loss": 0.6928, | |
"step": 8640 | |
}, | |
{ | |
"epoch": 0.8672, | |
"grad_norm": 1.004862904548645, | |
"learning_rate": 6.039302374220861e-05, | |
"loss": 0.7676, | |
"step": 8672 | |
}, | |
{ | |
"epoch": 0.8704, | |
"grad_norm": 0.8693735003471375, | |
"learning_rate": 6.0147055440674795e-05, | |
"loss": 0.7562, | |
"step": 8704 | |
}, | |
{ | |
"epoch": 0.8736, | |
"grad_norm": 1.6824612617492676, | |
"learning_rate": 5.990083076227782e-05, | |
"loss": 0.6509, | |
"step": 8736 | |
}, | |
{ | |
"epoch": 0.8768, | |
"grad_norm": 3.1215667724609375, | |
"learning_rate": 5.9654355928163416e-05, | |
"loss": 0.7553, | |
"step": 8768 | |
}, | |
{ | |
"epoch": 0.88, | |
"grad_norm": 1.4479137659072876, | |
"learning_rate": 5.9407637165797793e-05, | |
"loss": 0.8046, | |
"step": 8800 | |
}, | |
{ | |
"epoch": 0.8832, | |
"grad_norm": 2.769347906112671, | |
"learning_rate": 5.916068070881026e-05, | |
"loss": 0.6869, | |
"step": 8832 | |
}, | |
{ | |
"epoch": 0.8864, | |
"grad_norm": 1.338932752609253, | |
"learning_rate": 5.891349279683578e-05, | |
"loss": 0.6742, | |
"step": 8864 | |
}, | |
{ | |
"epoch": 0.8896, | |
"grad_norm": 1.15195631980896, | |
"learning_rate": 5.8666079675357285e-05, | |
"loss": 0.6972, | |
"step": 8896 | |
}, | |
{ | |
"epoch": 0.8928, | |
"grad_norm": 1.0247623920440674, | |
"learning_rate": 5.841844759554787e-05, | |
"loss": 0.7107, | |
"step": 8928 | |
}, | |
{ | |
"epoch": 0.896, | |
"grad_norm": 1.4130921363830566, | |
"learning_rate": 5.817060281411284e-05, | |
"loss": 0.7327, | |
"step": 8960 | |
}, | |
{ | |
"epoch": 0.8992, | |
"grad_norm": 0.6436507701873779, | |
"learning_rate": 5.792255159313169e-05, | |
"loss": 0.6418, | |
"step": 8992 | |
}, | |
{ | |
"epoch": 0.9024, | |
"grad_norm": 0.9555985331535339, | |
"learning_rate": 5.7674300199899834e-05, | |
"loss": 0.7157, | |
"step": 9024 | |
}, | |
{ | |
"epoch": 0.9056, | |
"grad_norm": 0.8774769306182861, | |
"learning_rate": 5.742585490677024e-05, | |
"loss": 0.6197, | |
"step": 9056 | |
}, | |
{ | |
"epoch": 0.9088, | |
"grad_norm": 0.9347734451293945, | |
"learning_rate": 5.7177221990995e-05, | |
"loss": 0.6672, | |
"step": 9088 | |
}, | |
{ | |
"epoch": 0.912, | |
"grad_norm": 1.2730952501296997, | |
"learning_rate": 5.692840773456669e-05, | |
"loss": 0.7524, | |
"step": 9120 | |
}, | |
{ | |
"epoch": 0.9152, | |
"grad_norm": 1.3449304103851318, | |
"learning_rate": 5.667941842405968e-05, | |
"loss": 0.7106, | |
"step": 9152 | |
}, | |
{ | |
"epoch": 0.9184, | |
"grad_norm": 2.288444757461548, | |
"learning_rate": 5.643026035047128e-05, | |
"loss": 0.7239, | |
"step": 9184 | |
}, | |
{ | |
"epoch": 0.9216, | |
"grad_norm": 1.1817107200622559, | |
"learning_rate": 5.618093980906276e-05, | |
"loss": 0.7342, | |
"step": 9216 | |
}, | |
{ | |
"epoch": 0.9248, | |
"grad_norm": 1.4276821613311768, | |
"learning_rate": 5.5931463099200355e-05, | |
"loss": 0.6198, | |
"step": 9248 | |
}, | |
{ | |
"epoch": 0.928, | |
"grad_norm": 1.0878974199295044, | |
"learning_rate": 5.568183652419606e-05, | |
"loss": 0.7204, | |
"step": 9280 | |
}, | |
{ | |
"epoch": 0.9312, | |
"grad_norm": 1.5497533082962036, | |
"learning_rate": 5.54320663911484e-05, | |
"loss": 0.7218, | |
"step": 9312 | |
}, | |
{ | |
"epoch": 0.9344, | |
"grad_norm": 0.5286266207695007, | |
"learning_rate": 5.518215901078302e-05, | |
"loss": 0.8243, | |
"step": 9344 | |
}, | |
{ | |
"epoch": 0.9376, | |
"grad_norm": 1.9889594316482544, | |
"learning_rate": 5.493212069729332e-05, | |
"loss": 0.6849, | |
"step": 9376 | |
}, | |
{ | |
"epoch": 0.9408, | |
"grad_norm": 1.6639822721481323, | |
"learning_rate": 5.468195776818084e-05, | |
"loss": 0.682, | |
"step": 9408 | |
}, | |
{ | |
"epoch": 0.944, | |
"grad_norm": 3.0651698112487793, | |
"learning_rate": 5.4431676544095676e-05, | |
"loss": 0.8112, | |
"step": 9440 | |
}, | |
{ | |
"epoch": 0.9472, | |
"grad_norm": 1.0381174087524414, | |
"learning_rate": 5.4181283348676806e-05, | |
"loss": 0.6497, | |
"step": 9472 | |
}, | |
{ | |
"epoch": 0.9504, | |
"grad_norm": 1.0353689193725586, | |
"learning_rate": 5.393078450839228e-05, | |
"loss": 0.6654, | |
"step": 9504 | |
}, | |
{ | |
"epoch": 0.9536, | |
"grad_norm": 1.6130503416061401, | |
"learning_rate": 5.368018635237936e-05, | |
"loss": 0.7351, | |
"step": 9536 | |
}, | |
{ | |
"epoch": 0.9568, | |
"grad_norm": 1.171970248222351, | |
"learning_rate": 5.3429495212284665e-05, | |
"loss": 0.7099, | |
"step": 9568 | |
}, | |
{ | |
"epoch": 0.96, | |
"grad_norm": 1.937739610671997, | |
"learning_rate": 5.3178717422104144e-05, | |
"loss": 0.6366, | |
"step": 9600 | |
}, | |
{ | |
"epoch": 0.9632, | |
"grad_norm": 1.8911631107330322, | |
"learning_rate": 5.2927859318023073e-05, | |
"loss": 0.6813, | |
"step": 9632 | |
}, | |
{ | |
"epoch": 0.9664, | |
"grad_norm": 1.1599578857421875, | |
"learning_rate": 5.2676927238255946e-05, | |
"loss": 0.7155, | |
"step": 9664 | |
}, | |
{ | |
"epoch": 0.9696, | |
"grad_norm": 1.2809479236602783, | |
"learning_rate": 5.242592752288632e-05, | |
"loss": 0.7051, | |
"step": 9696 | |
}, | |
{ | |
"epoch": 0.9728, | |
"grad_norm": 2.0790278911590576, | |
"learning_rate": 5.2174866513706646e-05, | |
"loss": 0.7387, | |
"step": 9728 | |
}, | |
{ | |
"epoch": 0.976, | |
"grad_norm": 1.0074536800384521, | |
"learning_rate": 5.1923750554058084e-05, | |
"loss": 0.6751, | |
"step": 9760 | |
}, | |
{ | |
"epoch": 0.9792, | |
"grad_norm": 1.3937727212905884, | |
"learning_rate": 5.16725859886701e-05, | |
"loss": 0.6902, | |
"step": 9792 | |
}, | |
{ | |
"epoch": 0.9824, | |
"grad_norm": 0.8866567015647888, | |
"learning_rate": 5.142137916350028e-05, | |
"loss": 0.7443, | |
"step": 9824 | |
}, | |
{ | |
"epoch": 0.9856, | |
"grad_norm": 0.857765793800354, | |
"learning_rate": 5.1170136425573956e-05, | |
"loss": 0.7032, | |
"step": 9856 | |
}, | |
{ | |
"epoch": 0.9888, | |
"grad_norm": 0.6846195459365845, | |
"learning_rate": 5.0918864122823816e-05, | |
"loss": 0.6508, | |
"step": 9888 | |
}, | |
{ | |
"epoch": 0.992, | |
"grad_norm": 0.9779634475708008, | |
"learning_rate": 5.066756860392956e-05, | |
"loss": 0.7161, | |
"step": 9920 | |
}, | |
{ | |
"epoch": 0.9952, | |
"grad_norm": 1.3198580741882324, | |
"learning_rate": 5.0416256218157476e-05, | |
"loss": 0.6885, | |
"step": 9952 | |
}, | |
{ | |
"epoch": 0.9984, | |
"grad_norm": 0.8396392464637756, | |
"learning_rate": 5.0164933315199955e-05, | |
"loss": 0.7511, | |
"step": 9984 | |
} | |
],
"logging_steps": 32,
"max_steps": 20000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.0363220663791616e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
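
The `log_history` array above can be read back for quick inspection of the loss curve. A minimal sketch, assuming the standard `trainer_state.json` layout shown above (the checkpoint path below is illustrative, not taken from this repo):

```python
import json

# Path is an assumption for illustration; point it at the checkpoint shown above.
with open("checkpoint-10000/trainer_state.json") as f:
    state = json.load(f)  # Python's json module accepts the NaN literal the Trainer emits

history = state["log_history"]
steps = [rec["step"] for rec in history]
losses = [rec["loss"] for rec in history]

# Coarse view of how the training loss evolves across the logged steps.
for step, loss in zip(steps[::32], losses[::32]):
    print(f"step {step:>5}: loss {loss:.4f}")
```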