{
  "best_metric": 1.0082145929336548,
  "best_model_checkpoint": "ckpt/llama2_13b_fuze15_no_sys/alpaca_no_sys/checkpoint-2000",
  "epoch": 1.158161418747738,
  "eval_steps": 200,
  "global_step": 3200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0, "grad_norm": 0.2189224660396576, "learning_rate": 5e-05, "loss": 1.4369, "step": 10},
    {"epoch": 0.01, "grad_norm": 0.4817655384540558, "learning_rate": 0.0001, "loss": 1.3624, "step": 20},
    {"epoch": 0.01, "grad_norm": 0.35323551297187805, "learning_rate": 9.999996763266864e-05, "loss": 1.1589, "step": 30},
    {"epoch": 0.01, "grad_norm": 0.2697048485279083, "learning_rate": 9.999987053071647e-05, "loss": 1.1103, "step": 40},
    {"epoch": 0.02, "grad_norm": 0.34059372544288635, "learning_rate": 9.99997086942692e-05, "loss": 1.0601, "step": 50},
    {"epoch": 0.02, "grad_norm": 0.2907443344593048, "learning_rate": 9.999948212353635e-05, "loss": 1.0302, "step": 60},
    {"epoch": 0.03, "grad_norm": 0.4002208113670349, "learning_rate": 9.999919081881129e-05, "loss": 1.114, "step": 70},
    {"epoch": 0.03, "grad_norm": 0.4364459216594696, "learning_rate": 9.999883478047113e-05, "loss": 1.0913, "step": 80},
    {"epoch": 0.03, "grad_norm": 0.322396844625473, "learning_rate": 9.999841400897687e-05, "loss": 1.0778, "step": 90},
    {"epoch": 0.04, "grad_norm": 0.5678238868713379, "learning_rate": 9.999792850487325e-05, "loss": 1.0493, "step": 100},
    {"epoch": 0.04, "grad_norm": 0.2919568717479706, "learning_rate": 9.999737826878886e-05, "loss": 1.0249, "step": 110},
    {"epoch": 0.04, "grad_norm": 0.3787660300731659, "learning_rate": 9.99967633014361e-05, "loss": 1.0594, "step": 120},
    {"epoch": 0.05, "grad_norm": 0.33062055706977844, "learning_rate": 9.999608360361113e-05, "loss": 1.0527, "step": 130},
    {"epoch": 0.05, "grad_norm": 0.3306855857372284, "learning_rate": 9.999533917619399e-05, "loss": 1.0051, "step": 140},
    {"epoch": 0.05, "grad_norm": 0.41762664914131165, "learning_rate": 9.999453002014846e-05, "loss": 0.9906, "step": 150},
    {"epoch": 0.06, "grad_norm": 0.291189044713974, "learning_rate": 9.999365613652217e-05, "loss": 1.0197, "step": 160},
    {"epoch": 0.06, "grad_norm": 0.30276551842689514, "learning_rate": 9.999271752644649e-05, "loss": 1.0356, "step": 170},
    {"epoch": 0.07, "grad_norm": 0.25866344571113586, "learning_rate": 9.999171419113666e-05, "loss": 1.0332, "step": 180},
    {"epoch": 0.07, "grad_norm": 0.1927756369113922, "learning_rate": 9.999064613189171e-05, "loss": 1.0126, "step": 190},
    {"epoch": 0.07, "grad_norm": 0.2776283621788025, "learning_rate": 9.998951335009442e-05, "loss": 1.0429, "step": 200},
    {"epoch": 0.07, "eval_loss": 1.029819130897522, "eval_runtime": 124.6792, "eval_samples_per_second": 62.569, "eval_steps_per_second": 3.914, "step": 200},
    {"epoch": 0.08, "grad_norm": 0.320551335811615, "learning_rate": 9.998831584721141e-05, "loss": 1.0431, "step": 210},
    {"epoch": 0.08, "grad_norm": 0.46670058369636536, "learning_rate": 9.998705362479307e-05, "loss": 1.0374, "step": 220},
    {"epoch": 0.08, "grad_norm": 0.30959388613700867, "learning_rate": 9.99857266844736e-05, "loss": 1.1065, "step": 230},
    {"epoch": 0.09, "grad_norm": 0.3016811013221741, "learning_rate": 9.998433502797095e-05, "loss": 1.1105, "step": 240},
    {"epoch": 0.09, "grad_norm": 0.356992244720459, "learning_rate": 9.998287865708694e-05, "loss": 0.9839, "step": 250},
    {"epoch": 0.09, "grad_norm": 0.29836413264274597, "learning_rate": 9.998135757370708e-05, "loss": 1.0401, "step": 260},
    {"epoch": 0.1, "grad_norm": 0.4305395483970642, "learning_rate": 9.997977177980074e-05, "loss": 1.0461, "step": 270},
    {"epoch": 0.1, "grad_norm": 0.2959505021572113, "learning_rate": 9.9978121277421e-05, "loss": 1.0662, "step": 280},
    {"epoch": 0.1, "grad_norm": 0.2577110826969147, "learning_rate": 9.99764060687048e-05, "loss": 1.0736, "step": 290},
    {"epoch": 0.11, "grad_norm": 0.2583490014076233, "learning_rate": 9.997462615587276e-05, "loss": 0.9963, "step": 300},
    {"epoch": 0.11, "grad_norm": 0.29901596903800964, "learning_rate": 9.997278154122935e-05, "loss": 1.044, "step": 310},
    {"epoch": 0.12, "grad_norm": 0.24256502091884613, "learning_rate": 9.997087222716278e-05, "loss": 1.0713, "step": 320},
    {"epoch": 0.12, "grad_norm": 0.267166405916214, "learning_rate": 9.996889821614502e-05, "loss": 1.0721, "step": 330},
    {"epoch": 0.12, "grad_norm": 0.21612702310085297, "learning_rate": 9.996685951073182e-05, "loss": 1.0414, "step": 340},
    {"epoch": 0.13, "grad_norm": 0.3107874095439911, "learning_rate": 9.996475611356264e-05, "loss": 0.9856, "step": 350},
    {"epoch": 0.13, "grad_norm": 0.27626070380210876, "learning_rate": 9.996258802736079e-05, "loss": 1.0121, "step": 360},
    {"epoch": 0.13, "grad_norm": 0.2957281172275543, "learning_rate": 9.996035525493322e-05, "loss": 1.0785, "step": 370},
    {"epoch": 0.14, "grad_norm": 0.3168753981590271, "learning_rate": 9.995805779917073e-05, "loss": 0.996, "step": 380},
    {"epoch": 0.14, "grad_norm": 0.24823521077632904, "learning_rate": 9.99556956630478e-05, "loss": 1.0557, "step": 390},
    {"epoch": 0.14, "grad_norm": 0.3291969895362854, "learning_rate": 9.995326884962268e-05, "loss": 1.0505, "step": 400},
    {"epoch": 0.14, "eval_loss": 1.023820400238037, "eval_runtime": 124.7265, "eval_samples_per_second": 62.545, "eval_steps_per_second": 3.913, "step": 400},
    {"epoch": 0.15, "grad_norm": 0.3567464351654053, "learning_rate": 9.995077736203733e-05, "loss": 0.9919, "step": 410},
    {"epoch": 0.15, "grad_norm": 0.2938403785228729, "learning_rate": 9.99482212035175e-05, "loss": 1.0736, "step": 420},
    {"epoch": 0.16, "grad_norm": 0.27481499314308167, "learning_rate": 9.994560037737259e-05, "loss": 1.0633, "step": 430},
    {"epoch": 0.16, "grad_norm": 0.34652218222618103, "learning_rate": 9.994291488699579e-05, "loss": 1.049, "step": 440},
    {"epoch": 0.16, "grad_norm": 0.23733928799629211, "learning_rate": 9.994016473586398e-05, "loss": 1.0022, "step": 450},
    {"epoch": 0.17, "grad_norm": 0.2666071653366089, "learning_rate": 9.993734992753777e-05, "loss": 1.0076, "step": 460},
    {"epoch": 0.17, "grad_norm": 0.22843866050243378, "learning_rate": 9.993447046566146e-05, "loss": 1.0298, "step": 470},
    {"epoch": 0.17, "grad_norm": 0.4334356486797333, "learning_rate": 9.993152635396308e-05, "loss": 1.0635, "step": 480},
    {"epoch": 0.18, "grad_norm": 0.25845977663993835, "learning_rate": 9.992851759625433e-05, "loss": 1.0183, "step": 490},
    {"epoch": 0.18, "grad_norm": 0.26029086112976074, "learning_rate": 9.992544419643066e-05, "loss": 0.963, "step": 500},
    {"epoch": 0.18, "grad_norm": 0.23090577125549316, "learning_rate": 9.992230615847116e-05, "loss": 0.9691, "step": 510},
    {"epoch": 0.19, "grad_norm": 0.2835213243961334, "learning_rate": 9.991910348643865e-05, "loss": 1.0309, "step": 520},
    {"epoch": 0.19, "grad_norm": 0.2612157166004181, "learning_rate": 9.991583618447958e-05, "loss": 1.0232, "step": 530},
    {"epoch": 0.2, "grad_norm": 0.43860122561454773, "learning_rate": 9.99125042568241e-05, "loss": 1.0308, "step": 540},
    {"epoch": 0.2, "grad_norm": 0.2504933476448059, "learning_rate": 9.990910770778606e-05, "loss": 1.0581, "step": 550},
    {"epoch": 0.2, "grad_norm": 0.2778143286705017, "learning_rate": 9.990564654176293e-05, "loss": 0.958, "step": 560},
    {"epoch": 0.21, "grad_norm": 0.29035818576812744, "learning_rate": 9.990212076323586e-05, "loss": 1.0258, "step": 570},
    {"epoch": 0.21, "grad_norm": 0.307841032743454, "learning_rate": 9.989853037676965e-05, "loss": 1.0724, "step": 580},
    {"epoch": 0.21, "grad_norm": 0.3011914789676666, "learning_rate": 9.989487538701279e-05, "loss": 0.9847, "step": 590},
    {"epoch": 0.22, "grad_norm": 0.27195674180984497, "learning_rate": 9.989115579869732e-05, "loss": 1.044, "step": 600},
    {"epoch": 0.22, "eval_loss": 1.0194298028945923, "eval_runtime": 124.7334, "eval_samples_per_second": 62.541, "eval_steps_per_second": 3.912, "step": 600},
    {"epoch": 0.22, "grad_norm": 0.2725551724433899, "learning_rate": 9.988737161663898e-05, "loss": 1.0244, "step": 610},
    {"epoch": 0.22, "grad_norm": 0.2821577787399292, "learning_rate": 9.988352284573713e-05, "loss": 1.0254, "step": 620},
    {"epoch": 0.23, "grad_norm": 0.3664613664150238, "learning_rate": 9.987960949097475e-05, "loss": 1.1093, "step": 630},
    {"epoch": 0.23, "grad_norm": 0.3072526156902313, "learning_rate": 9.987563155741842e-05, "loss": 1.0196, "step": 640},
    {"epoch": 0.24, "grad_norm": 0.24550805985927582, "learning_rate": 9.987158905021836e-05, "loss": 1.012, "step": 650},
    {"epoch": 0.24, "grad_norm": 0.2521149814128876, "learning_rate": 9.986748197460837e-05, "loss": 1.0219, "step": 660},
    {"epoch": 0.24, "grad_norm": 0.34175044298171997, "learning_rate": 9.986331033590586e-05, "loss": 1.015, "step": 670},
    {"epoch": 0.25, "grad_norm": 0.30103522539138794, "learning_rate": 9.98590741395118e-05, "loss": 1.1113, "step": 680},
    {"epoch": 0.25, "grad_norm": 0.2344699651002884, "learning_rate": 9.985477339091078e-05, "loss": 1.0456, "step": 690},
    {"epoch": 0.25, "grad_norm": 0.26754796504974365, "learning_rate": 9.985040809567097e-05, "loss": 1.0102, "step": 700},
    {"epoch": 0.26, "grad_norm": 0.31665658950805664, "learning_rate": 9.984597825944405e-05, "loss": 1.0057, "step": 710},
    {"epoch": 0.26, "grad_norm": 0.2716057300567627, "learning_rate": 9.984148388796532e-05, "loss": 0.9937, "step": 720},
    {"epoch": 0.26, "grad_norm": 0.2589300274848938, "learning_rate": 9.983692498705361e-05, "loss": 0.9937, "step": 730},
    {"epoch": 0.27, "grad_norm": 0.2215312272310257, "learning_rate": 9.983230156261132e-05, "loss": 1.0205, "step": 740},
    {"epoch": 0.27, "grad_norm": 0.26202231645584106, "learning_rate": 9.982761362062432e-05, "loss": 1.0486, "step": 750},
    {"epoch": 0.28, "grad_norm": 0.21432209014892578, "learning_rate": 9.982286116716208e-05, "loss": 1.0679, "step": 760},
    {"epoch": 0.28, "grad_norm": 0.4230276048183441, "learning_rate": 9.98180442083776e-05, "loss": 1.0051, "step": 770},
    {"epoch": 0.28, "grad_norm": 0.26559358835220337, "learning_rate": 9.981316275050731e-05, "loss": 1.0398, "step": 780},
    {"epoch": 0.29, "grad_norm": 0.2559758722782135, "learning_rate": 9.980821679987125e-05, "loss": 1.0365, "step": 790},
    {"epoch": 0.29, "grad_norm": 0.34101855754852295, "learning_rate": 9.980320636287285e-05, "loss": 1.0169, "step": 800},
    {"epoch": 0.29, "eval_loss": 1.0172123908996582, "eval_runtime": 124.7169, "eval_samples_per_second": 62.55, "eval_steps_per_second": 3.913, "step": 800},
    {"epoch": 0.29, "grad_norm": 0.3401408791542053, "learning_rate": 9.979813144599915e-05, "loss": 1.0165, "step": 810},
    {"epoch": 0.3, "grad_norm": 0.34302470088005066, "learning_rate": 9.979299205582057e-05, "loss": 1.0314, "step": 820},
    {"epoch": 0.3, "grad_norm": 0.2908473610877991, "learning_rate": 9.978778819899109e-05, "loss": 0.9779, "step": 830},
    {"epoch": 0.3, "grad_norm": 0.229986771941185, "learning_rate": 9.978251988224804e-05, "loss": 0.9564, "step": 840},
    {"epoch": 0.31, "grad_norm": 0.441243052482605, "learning_rate": 9.977718711241233e-05, "loss": 1.0275, "step": 850},
    {"epoch": 0.31, "grad_norm": 0.2620699107646942, "learning_rate": 9.977178989638822e-05, "loss": 1.0293, "step": 860},
    {"epoch": 0.31, "grad_norm": 0.27257561683654785, "learning_rate": 9.97663282411635e-05, "loss": 1.0508, "step": 870},
    {"epoch": 0.32, "grad_norm": 0.306587278842926, "learning_rate": 9.97608021538093e-05, "loss": 0.9949, "step": 880},
    {"epoch": 0.32, "grad_norm": 0.30046141147613525, "learning_rate": 9.97552116414802e-05, "loss": 1.0752, "step": 890},
    {"epoch": 0.33, "grad_norm": 0.2749102711677551, "learning_rate": 9.974955671141424e-05, "loss": 0.9947, "step": 900},
    {"epoch": 0.33, "grad_norm": 0.38608163595199585, "learning_rate": 9.974383737093279e-05, "loss": 1.0362, "step": 910},
    {"epoch": 0.33, "grad_norm": 0.24529774487018585, "learning_rate": 9.973805362744064e-05, "loss": 1.0469, "step": 920},
    {"epoch": 0.34, "grad_norm": 0.33143192529678345, "learning_rate": 9.973220548842598e-05, "loss": 0.9705, "step": 930},
    {"epoch": 0.34, "grad_norm": 0.3112998306751251, "learning_rate": 9.972629296146035e-05, "loss": 0.9956, "step": 940},
    {"epoch": 0.34, "grad_norm": 0.32970279455184937, "learning_rate": 9.972031605419864e-05, "loss": 1.0232, "step": 950},
    {"epoch": 0.35, "grad_norm": 0.256101131439209, "learning_rate": 9.971427477437914e-05, "loss": 1.0471, "step": 960},
    {"epoch": 0.35, "grad_norm": 0.4258672595024109, "learning_rate": 9.970816912982344e-05, "loss": 0.9652, "step": 970},
    {"epoch": 0.35, "grad_norm": 0.3143826425075531, "learning_rate": 9.970199912843648e-05, "loss": 0.9894, "step": 980},
    {"epoch": 0.36, "grad_norm": 0.2868054509162903, "learning_rate": 9.96957647782065e-05, "loss": 1.0437, "step": 990},
    {"epoch": 0.36, "grad_norm": 0.2594622075557709, "learning_rate": 9.968946608720511e-05, "loss": 1.02, "step": 1000},
    {"epoch": 0.36, "eval_loss": 1.0154483318328857, "eval_runtime": 124.672, "eval_samples_per_second": 62.572, "eval_steps_per_second": 3.914, "step": 1000},
    {"epoch": 0.37, "grad_norm": 0.2359086573123932, "learning_rate": 9.968310306358715e-05, "loss": 1.0676, "step": 1010},
    {"epoch": 0.37, "grad_norm": 0.22080975770950317, "learning_rate": 9.967667571559081e-05, "loss": 1.027, "step": 1020},
    {"epoch": 0.37, "grad_norm": 0.3211756944656372, "learning_rate": 9.967018405153749e-05, "loss": 1.0004, "step": 1030},
    {"epoch": 0.38, "grad_norm": 0.3681553602218628, "learning_rate": 9.966362807983196e-05, "loss": 1.0395, "step": 1040},
    {"epoch": 0.38, "grad_norm": 0.3180038332939148, "learning_rate": 9.965700780896216e-05, "loss": 0.9948, "step": 1050},
    {"epoch": 0.38, "grad_norm": 0.25071969628334045, "learning_rate": 9.965032324749932e-05, "loss": 1.0281, "step": 1060},
    {"epoch": 0.39, "grad_norm": 0.2274983674287796, "learning_rate": 9.964357440409789e-05, "loss": 1.0094, "step": 1070},
    {"epoch": 0.39, "grad_norm": 0.24825724959373474, "learning_rate": 9.963676128749553e-05, "loss": 1.0272, "step": 1080},
    {"epoch": 0.39, "grad_norm": 0.3256381154060364, "learning_rate": 9.96298839065132e-05, "loss": 1.0191, "step": 1090},
    {"epoch": 0.4, "grad_norm": 0.31695234775543213, "learning_rate": 9.962294227005493e-05, "loss": 1.08, "step": 1100},
    {"epoch": 0.4, "grad_norm": 0.288083553314209, "learning_rate": 9.961593638710804e-05, "loss": 0.9954, "step": 1110},
    {"epoch": 0.41, "grad_norm": 0.29730525612831116, "learning_rate": 9.960886626674302e-05, "loss": 1.071, "step": 1120},
    {"epoch": 0.41, "grad_norm": 0.2090187519788742, "learning_rate": 9.960173191811348e-05, "loss": 0.9725, "step": 1130},
    {"epoch": 0.41, "grad_norm": 0.2811983525753021, "learning_rate": 9.959453335045622e-05, "loss": 1.0071, "step": 1140},
    {"epoch": 0.42, "grad_norm": 0.27806761860847473, "learning_rate": 9.958727057309115e-05, "loss": 1.0108, "step": 1150},
    {"epoch": 0.42, "grad_norm": 0.2864569127559662, "learning_rate": 9.957994359542138e-05, "loss": 1.0495, "step": 1160},
    {"epoch": 0.42, "grad_norm": 0.3440109193325043, "learning_rate": 9.957255242693308e-05, "loss": 1.0015, "step": 1170},
    {"epoch": 0.43, "grad_norm": 0.2824917435646057, "learning_rate": 9.956509707719555e-05, "loss": 1.0559, "step": 1180},
    {"epoch": 0.43, "grad_norm": 0.3080492317676544, "learning_rate": 9.955757755586119e-05, "loss": 1.0134, "step": 1190},
    {"epoch": 0.43, "grad_norm": 0.2890901565551758, "learning_rate": 9.954999387266546e-05, "loss": 0.9492, "step": 1200},
    {"epoch": 0.43, "eval_loss": 1.0133627653121948, "eval_runtime": 124.7104, "eval_samples_per_second": 62.553, "eval_steps_per_second": 3.913, "step": 1200},
    {"epoch": 0.44, "grad_norm": 0.33987322449684143, "learning_rate": 9.95423460374269e-05, "loss": 0.9629, "step": 1210},
    {"epoch": 0.44, "grad_norm": 0.29403063654899597, "learning_rate": 9.953463406004713e-05, "loss": 1.0384, "step": 1220},
    {"epoch": 0.45, "grad_norm": 0.20130111277103424, "learning_rate": 9.952685795051077e-05, "loss": 1.0235, "step": 1230},
    {"epoch": 0.45, "grad_norm": 0.1973690539598465, "learning_rate": 9.951901771888552e-05, "loss": 1.0395, "step": 1240},
    {"epoch": 0.45, "grad_norm": 0.24519580602645874, "learning_rate": 9.951111337532205e-05, "loss": 1.0914, "step": 1250},
    {"epoch": 0.46, "grad_norm": 0.2706618309020996, "learning_rate": 9.950314493005408e-05, "loss": 1.0714, "step": 1260},
    {"epoch": 0.46, "grad_norm": 0.23367558419704437, "learning_rate": 9.949511239339831e-05, "loss": 1.0224, "step": 1270},
    {"epoch": 0.46, "grad_norm": 0.30005407333374023, "learning_rate": 9.948701577575439e-05, "loss": 1.0152, "step": 1280},
    {"epoch": 0.47, "grad_norm": 0.3130083382129669, "learning_rate": 9.947885508760496e-05, "loss": 0.8988, "step": 1290},
    {"epoch": 0.47, "grad_norm": 0.23657679557800293, "learning_rate": 9.94706303395156e-05, "loss": 1.0242, "step": 1300},
    {"epoch": 0.47, "grad_norm": 0.40966659784317017, "learning_rate": 9.946234154213487e-05, "loss": 1.0145, "step": 1310},
    {"epoch": 0.48, "grad_norm": 0.35292962193489075, "learning_rate": 9.94539887061942e-05, "loss": 1.0197, "step": 1320},
    {"epoch": 0.48, "grad_norm": 0.38793638348579407, "learning_rate": 9.944557184250794e-05, "loss": 1.0273, "step": 1330},
    {"epoch": 0.48, "grad_norm": 0.27373677492141724, "learning_rate": 9.943709096197335e-05, "loss": 0.9561, "step": 1340},
    {"epoch": 0.49, "grad_norm": 0.24536257982254028, "learning_rate": 9.942854607557057e-05, "loss": 0.9678, "step": 1350},
    {"epoch": 0.49, "grad_norm": 0.4609609842300415, "learning_rate": 9.941993719436262e-05, "loss": 1.0429, "step": 1360},
    {"epoch": 0.5, "grad_norm": 0.27118805050849915, "learning_rate": 9.941126432949535e-05, "loss": 1.0506, "step": 1370},
    {"epoch": 0.5, "grad_norm": 0.27538400888442993, "learning_rate": 9.940252749219746e-05, "loss": 1.0326, "step": 1380},
    {"epoch": 0.5, "grad_norm": 0.2451954036951065, "learning_rate": 9.939372669378048e-05, "loss": 1.0413, "step": 1390},
    {"epoch": 0.51, "grad_norm": 0.2622232437133789, "learning_rate": 9.938486194563875e-05, "loss": 1.0051, "step": 1400},
    {"epoch": 0.51, "eval_loss": 1.011703372001648, "eval_runtime": 124.6726, "eval_samples_per_second": 62.572, "eval_steps_per_second": 3.914, "step": 1400},
    {"epoch": 0.51, "grad_norm": 0.2616746425628662, "learning_rate": 9.937593325924937e-05, "loss": 1.0277, "step": 1410},
    {"epoch": 0.51, "grad_norm": 0.2952045202255249, "learning_rate": 9.936694064617227e-05, "loss": 0.9802, "step": 1420},
    {"epoch": 0.52, "grad_norm": 0.2611790895462036, "learning_rate": 9.935788411805011e-05, "loss": 0.9811, "step": 1430},
    {"epoch": 0.52, "grad_norm": 0.3291374742984772, "learning_rate": 9.934876368660836e-05, "loss": 0.9972, "step": 1440},
    {"epoch": 0.52, "grad_norm": 0.32888704538345337, "learning_rate": 9.933957936365515e-05, "loss": 1.1006, "step": 1450},
    {"epoch": 0.53, "grad_norm": 0.20011785626411438, "learning_rate": 9.933033116108134e-05, "loss": 1.0139, "step": 1460},
    {"epoch": 0.53, "grad_norm": 0.3157961666584015, "learning_rate": 9.932101909086056e-05, "loss": 0.993, "step": 1470},
    {"epoch": 0.54, "grad_norm": 0.22981207072734833, "learning_rate": 9.931164316504904e-05, "loss": 1.0539, "step": 1480},
    {"epoch": 0.54, "grad_norm": 0.23787029087543488, "learning_rate": 9.930220339578576e-05, "loss": 0.9599, "step": 1490},
    {"epoch": 0.54, "grad_norm": 0.2633046507835388, "learning_rate": 9.929269979529232e-05, "loss": 0.9813, "step": 1500},
    {"epoch": 0.55, "grad_norm": 0.2666633725166321, "learning_rate": 9.928313237587296e-05, "loss": 0.9637, "step": 1510},
    {"epoch": 0.55, "grad_norm": 0.26092538237571716, "learning_rate": 9.927350114991456e-05, "loss": 1.0375, "step": 1520},
    {"epoch": 0.55, "grad_norm": 0.2837240397930145, "learning_rate": 9.92638061298866e-05, "loss": 1.0053, "step": 1530},
    {"epoch": 0.56, "grad_norm": 0.2586491107940674, "learning_rate": 9.925404732834117e-05, "loss": 1.0631, "step": 1540},
    {"epoch": 0.56, "grad_norm": 0.43321874737739563, "learning_rate": 9.924422475791288e-05, "loss": 1.0134, "step": 1550},
    {"epoch": 0.56, "grad_norm": 0.19062629342079163, "learning_rate": 9.923433843131901e-05, "loss": 0.9989, "step": 1560},
    {"epoch": 0.57, "grad_norm": 0.34545308351516724, "learning_rate": 9.922438836135928e-05, "loss": 1.0896, "step": 1570},
    {"epoch": 0.57, "grad_norm": 0.2846600115299225, "learning_rate": 9.921437456091596e-05, "loss": 0.9954, "step": 1580},
    {"epoch": 0.58, "grad_norm": 0.25403323769569397, "learning_rate": 9.920429704295391e-05, "loss": 0.9937, "step": 1590},
    {"epoch": 0.58, "grad_norm": 0.23549498617649078, "learning_rate": 9.919415582052036e-05, "loss": 1.0469, "step": 1600},
    {"epoch": 0.58, "eval_loss": 1.0105613470077515, "eval_runtime": 124.7139, "eval_samples_per_second": 62.551, "eval_steps_per_second": 3.913, "step": 1600},
    {"epoch": 0.58, "grad_norm": 0.21466514468193054, "learning_rate": 9.918395090674514e-05, "loss": 1.0408, "step": 1610},
    {"epoch": 0.59, "grad_norm": 0.21247586607933044, "learning_rate": 9.917368231484045e-05, "loss": 0.9893, "step": 1620},
    {"epoch": 0.59, "grad_norm": 0.26590731739997864, "learning_rate": 9.916335005810095e-05, "loss": 1.0563, "step": 1630},
    {"epoch": 0.59, "grad_norm": 0.2346472591161728, "learning_rate": 9.91529541499038e-05, "loss": 1.0061, "step": 1640},
    {"epoch": 0.6, "grad_norm": 0.27766481041908264, "learning_rate": 9.914249460370846e-05, "loss": 0.9639, "step": 1650},
    {"epoch": 0.6, "grad_norm": 0.24883978068828583, "learning_rate": 9.913197143305684e-05, "loss": 1.0289, "step": 1660},
    {"epoch": 0.6, "grad_norm": 0.2379382699728012, "learning_rate": 9.912138465157325e-05, "loss": 1.0154, "step": 1670},
    {"epoch": 0.61, "grad_norm": 0.17160119116306305, "learning_rate": 9.91107342729643e-05, "loss": 1.0002, "step": 1680},
    {"epoch": 0.61, "grad_norm": 0.2804344892501831, "learning_rate": 9.910002031101895e-05, "loss": 0.9887, "step": 1690},
    {"epoch": 0.62, "grad_norm": 0.2296508252620697, "learning_rate": 9.908924277960854e-05, "loss": 1.0703, "step": 1700},
    {"epoch": 0.62, "grad_norm": 0.22265523672103882, "learning_rate": 9.907840169268662e-05, "loss": 0.9495, "step": 1710},
    {"epoch": 0.62, "grad_norm": 0.3383825123310089, "learning_rate": 9.90674970642891e-05, "loss": 0.9878, "step": 1720},
    {"epoch": 0.63, "grad_norm": 0.2603285312652588, "learning_rate": 9.905652890853411e-05, "loss": 1.0351, "step": 1730},
    {"epoch": 0.63, "grad_norm": 0.27001509070396423, "learning_rate": 9.904549723962206e-05, "loss": 1.0528, "step": 1740},
    {"epoch": 0.63, "grad_norm": 0.34035804867744446, "learning_rate": 9.903440207183558e-05, "loss": 1.0159, "step": 1750},
    {"epoch": 0.64, "grad_norm": 0.3518404960632324, "learning_rate": 9.90232434195395e-05, "loss": 0.9879, "step": 1760},
    {"epoch": 0.64, "grad_norm": 0.24958577752113342, "learning_rate": 9.901202129718086e-05, "loss": 1.0221, "step": 1770},
    {"epoch": 0.64, "grad_norm": 0.23898568749427795, "learning_rate": 9.900073571928886e-05, "loss": 1.037, "step": 1780},
    {"epoch": 0.65, "grad_norm": 0.22275009751319885, "learning_rate": 9.898938670047486e-05, "loss": 1.0008, "step": 1790},
    {"epoch": 0.65, "grad_norm": 0.2770971655845642, "learning_rate": 9.897797425543236e-05, "loss": 0.9994, "step": 1800},
    {"epoch": 0.65, "eval_loss": 1.0094062089920044, "eval_runtime": 124.6598, "eval_samples_per_second": 62.578, "eval_steps_per_second": 3.915, "step": 1800},
    {"epoch": 0.66, "grad_norm": 0.2470710575580597, "learning_rate": 9.896649839893699e-05, "loss": 1.0093, "step": 1810},
    {"epoch": 0.66, "grad_norm": 0.31282275915145874, "learning_rate": 9.895495914584643e-05, "loss": 1.0124, "step": 1820},
    {"epoch": 0.66, "grad_norm": 0.2757389545440674, "learning_rate": 9.894335651110051e-05, "loss": 1.0197, "step": 1830},
    {"epoch": 0.67, "grad_norm": 0.3123573362827301, "learning_rate": 9.893169050972106e-05, "loss": 0.9469, "step": 1840},
    {"epoch": 0.67, "grad_norm": 0.4073740839958191, "learning_rate": 9.8919961156812e-05, "loss": 1.0153, "step": 1850},
    {"epoch": 0.67, "grad_norm": 0.24388962984085083, "learning_rate": 9.89081684675592e-05, "loss": 1.0124, "step": 1860},
    {"epoch": 0.68, "grad_norm": 0.27508777379989624, "learning_rate": 9.88963124572306e-05, "loss": 0.96, "step": 1870},
    {"epoch": 0.68, "grad_norm": 0.2843553125858307, "learning_rate": 9.88843931411761e-05, "loss": 1.0448, "step": 1880},
    {"epoch": 0.68, "grad_norm": 0.25155389308929443, "learning_rate": 9.887241053482757e-05, "loss": 1.0362, "step": 1890},
    {"epoch": 0.69, "grad_norm": 0.21977895498275757, "learning_rate": 9.886036465369877e-05, "loss": 1.0658, "step": 1900},
    {"epoch": 0.69, "grad_norm": 0.22326160967350006, "learning_rate": 9.884825551338546e-05, "loss": 1.0068, "step": 1910},
    {"epoch": 0.69, "grad_norm": 0.3339684307575226, "learning_rate": 9.883608312956524e-05, "loss": 1.0147, "step": 1920},
    {"epoch": 0.7, "grad_norm": 0.26512840390205383, "learning_rate": 9.882384751799762e-05, "loss": 0.9421, "step": 1930},
    {"epoch": 0.7, "grad_norm": 0.313123881816864, "learning_rate": 9.881154869452395e-05, "loss": 1.0032, "step": 1940},
    {"epoch": 0.71, "grad_norm": 0.3562926948070526, "learning_rate": 9.879918667506748e-05, "loss": 1.0491, "step": 1950},
    {"epoch": 0.71, "grad_norm": 0.373032808303833, "learning_rate": 9.87867614756332e-05, "loss": 0.9823, "step": 1960},
    {"epoch": 0.71, "grad_norm": 0.2701728641986847, "learning_rate": 9.87742731123079e-05, "loss": 1.0326, "step": 1970},
    {"epoch": 0.72, "grad_norm": 0.4167492687702179, "learning_rate": 9.876172160126024e-05, "loss": 1.0256, "step": 1980},
    {"epoch": 0.72, "grad_norm": 0.2636062800884247, "learning_rate": 9.874910695874053e-05, "loss": 1.0301, "step": 1990},
    {"epoch": 0.72, "grad_norm": 0.27048760652542114, "learning_rate": 9.873642920108091e-05, "loss": 1.0141, "step": 2000},
    {"epoch": 0.72, "eval_loss": 1.0082145929336548, "eval_runtime": 124.7209, "eval_samples_per_second": 62.548, "eval_steps_per_second": 3.913, "step": 2000},
    {"epoch": 0.73, "grad_norm": 0.26596397161483765, "learning_rate": 9.872368834469514e-05, "loss": 0.9554, "step": 2010},
    {"epoch": 0.73, "grad_norm": 0.3881726861000061, "learning_rate": 9.871088440607874e-05, "loss": 1.0374, "step": 2020},
    {"epoch": 0.73, "grad_norm": 0.345869243144989, "learning_rate": 9.869801740180889e-05, "loss": 1.01, "step": 2030},
    {"epoch": 0.74, "grad_norm": 0.3740908205509186, "learning_rate": 9.86850873485444e-05, "loss": 1.0244, "step": 2040},
    {"epoch": 0.74, "grad_norm": 0.3265666663646698, "learning_rate": 9.867209426302572e-05, "loss": 0.9303, "step": 2050},
    {"epoch": 0.75, "grad_norm": 0.381783664226532, "learning_rate": 9.865903816207493e-05, "loss": 1.0851, "step": 2060},
    {"epoch": 0.75, "grad_norm": 0.30846527218818665, "learning_rate": 9.864591906259568e-05, "loss": 1.0042, "step": 2070},
    {"epoch": 0.75, "grad_norm": 0.36899617314338684, "learning_rate": 9.863273698157315e-05, "loss": 0.9866, "step": 2080},
    {"epoch": 0.76, "grad_norm": 0.25415265560150146, "learning_rate": 9.861949193607411e-05, "loss": 1.056, "step": 2090},
    {"epoch": 0.76, "grad_norm": 0.3369081914424896, "learning_rate": 9.860618394324682e-05, "loss": 0.9988, "step": 2100},
    {"epoch": 0.76, "grad_norm": 0.19644911587238312, "learning_rate": 9.859281302032106e-05, "loss": 0.9562, "step": 2110},
    {"epoch": 0.77, "grad_norm": 0.3449130356311798, "learning_rate": 9.857937918460808e-05, "loss": 1.0325, "step": 2120},
    {"epoch": 0.77, "grad_norm": 0.2639143764972687, "learning_rate": 9.856588245350056e-05, "loss": 1.0458, "step": 2130},
    {"epoch": 0.77, "grad_norm": 0.2752164602279663, "learning_rate": 9.855232284447262e-05, "loss": 1.089, "step": 2140},
    {"epoch": 0.78, "grad_norm": 0.31700417399406433, "learning_rate": 9.853870037507983e-05, "loss": 1.0398, "step": 2150},
    {"epoch": 0.78, "grad_norm": 0.24685466289520264, "learning_rate": 9.852501506295907e-05, "loss": 1.0038, "step": 2160},
    {"epoch": 0.79, "grad_norm": 0.28860118985176086, "learning_rate": 9.851126692582864e-05, "loss": 1.0343, "step": 2170},
    {"epoch": 0.79, "grad_norm": 0.2774854898452759, "learning_rate": 9.849745598148817e-05, "loss": 0.9986, "step": 2180},
    {"epoch": 0.79, "grad_norm": 0.28867611289024353, "learning_rate": 9.848358224781857e-05, "loss": 1.035, "step": 2190},
    {"epoch": 0.8, "grad_norm": 0.2703929841518402, "learning_rate": 9.84696457427821e-05, "loss": 1.0891, "step": 2200},
    {"epoch": 0.8, "eval_loss": 1.0072919130325317, "eval_runtime": 125.0779, "eval_samples_per_second": 62.369, "eval_steps_per_second": 3.902, "step": 2200},
    {"epoch": 0.8, "grad_norm": 0.3247489035129547, "learning_rate": 9.845564648442222e-05, "loss": 1.0259, "step": 2210},
    {"epoch": 0.8, "grad_norm": 0.2535197138786316, "learning_rate": 9.844158449086371e-05, "loss": 1.0457, "step": 2220},
    {"epoch": 0.81, "grad_norm": 0.26780492067337036, "learning_rate": 9.842745978031253e-05, "loss": 0.9869, "step": 2230},
    {"epoch": 0.81, "grad_norm": 0.29711589217185974, "learning_rate": 9.841327237105585e-05, "loss": 1.0158, "step": 2240},
    {"epoch": 0.81, "grad_norm": 0.239434614777565, "learning_rate": 9.8399022281462e-05, "loss": 0.997, "step": 2250},
    {"epoch": 0.82, "grad_norm": 0.2368830293416977, "learning_rate": 9.838470952998049e-05, "loss": 1.0148, "step": 2260},
    {"epoch": 0.82, "grad_norm": 0.2554934322834015, "learning_rate": 9.837033413514191e-05, "loss": 0.9787, "step": 2270},
    {"epoch": 0.83, "grad_norm": 0.2310570627450943, "learning_rate": 9.835589611555805e-05, "loss": 0.9656, "step": 2280},
    {"epoch": 0.83, "grad_norm": 0.22654668986797333, "learning_rate": 9.834139548992165e-05, "loss": 0.9837, "step": 2290},
    {"epoch": 0.83, "grad_norm": 0.25957950949668884, "learning_rate": 9.832683227700661e-05, "loss": 1.0513, "step": 2300},
    {"epoch": 0.84, "grad_norm": 0.20669637620449066, "learning_rate": 9.831220649566782e-05, "loss": 0.9649, "step": 2310},
    {"epoch": 0.84, "grad_norm": 0.24330663681030273, "learning_rate": 9.829751816484116e-05, "loss": 1.0208, "step": 2320},
    {"epoch": 0.84, "grad_norm": 0.28211724758148193, "learning_rate": 9.828276730354353e-05, "loss": 0.9512, "step": 2330},
    {"epoch": 0.85, "grad_norm": 0.23784276843070984, "learning_rate": 9.826795393087278e-05, "loss": 0.976, "step": 2340},
    {"epoch": 0.85, "grad_norm": 0.2881389260292053, "learning_rate": 9.825307806600765e-05, "loss": 1.0036, "step": 2350},
    {"epoch": 0.85, "grad_norm": 0.27906882762908936, "learning_rate": 9.823813972820786e-05, "loss": 1.0555, "step": 2360},
    {"epoch": 0.86, "grad_norm": 0.25142115354537964, "learning_rate": 9.822313893681397e-05, "loss": 1.0483, "step": 2370},
    {"epoch": 0.86, "grad_norm": 0.244681715965271, "learning_rate": 9.820807571124738e-05, "loss": 1.0102, "step": 2380},
    {"epoch": 0.87, "grad_norm": 0.3696367144584656, "learning_rate": 9.819295007101035e-05, "loss": 1.0626, "step": 2390},
    {"epoch": 0.87, "grad_norm": 0.26112619042396545, "learning_rate": 9.817776203568596e-05, "loss": 1.0141, "step": 2400},
    {"epoch": 0.87, "eval_loss": 1.0063296556472778, "eval_runtime": 125.7335, "eval_samples_per_second": 62.044, "eval_steps_per_second": 3.881, "step": 2400},
    {"epoch": 0.87, "grad_norm": 0.25221410393714905, "learning_rate": 9.816251162493804e-05, "loss": 1.0222, "step": 2410},
    {"epoch": 0.88, "grad_norm": 0.19672074913978577, "learning_rate": 9.814719885851121e-05, "loss": 0.9891, "step": 2420},
    {"epoch": 0.88, "grad_norm": 0.3084292411804199, "learning_rate": 9.81318237562308e-05, "loss": 0.9785, "step": 2430},
    {"epoch": 0.88, "grad_norm": 0.3434545397758484, "learning_rate": 9.811638633800287e-05, "loss": 0.9357, "step": 2440},
    {"epoch": 0.89, "grad_norm": 0.23335447907447815, "learning_rate": 9.81008866238141e-05, "loss": 1.0485, "step": 2450},
    {"epoch": 0.89, "grad_norm": 0.2942172586917877, "learning_rate": 9.808532463373188e-05, "loss": 1.0138, "step": 2460},
    {"epoch": 0.89, "grad_norm": 0.22536420822143555, "learning_rate": 9.806970038790423e-05, "loss": 1.0421, "step": 2470},
    {"epoch": 0.9, "grad_norm": 0.30886924266815186, "learning_rate": 9.805401390655975e-05, "loss": 0.9926, "step": 2480},
    {"epoch": 0.9, "grad_norm": 0.34105512499809265, "learning_rate": 9.803826521000761e-05, "loss": 1.0013, "step": 2490},
    {"epoch": 0.9, "grad_norm": 0.261643648147583, "learning_rate": 9.802245431863757e-05, "loss": 0.9937, "step": 2500},
    {"epoch": 0.91, "grad_norm": 0.3864617347717285, "learning_rate": 9.800658125291984e-05, "loss": 0.9986, "step": 2510},
    {"epoch": 0.91, "grad_norm": 0.31850436329841614, "learning_rate": 9.79906460334052e-05, "loss": 0.9984, "step": 2520},
    {"epoch": 0.92, "grad_norm": 0.25421255826950073, "learning_rate": 9.797464868072488e-05, "loss": 1.0273, "step": 2530},
    {"epoch": 0.92, "grad_norm": 0.34440311789512634, "learning_rate": 9.795858921559052e-05, "loss": 1.0346, "step": 2540},
    {"epoch": 0.92, "grad_norm": 0.33147209882736206, "learning_rate": 9.79424676587942e-05, "loss": 1.0691, "step": 2550},
    {"epoch": 0.93, "grad_norm": 0.2778458893299103, "learning_rate": 9.792628403120842e-05, "loss": 1.009, "step": 2560},
    {"epoch": 0.93, "grad_norm": 0.29282572865486145, "learning_rate": 9.791003835378598e-05, "loss": 1.0015, "step": 2570},
    {"epoch": 0.93, "grad_norm": 0.25391730666160583, "learning_rate": 9.789373064756008e-05, "loss": 1.0177, "step": 2580},
    {"epoch": 0.94, "grad_norm": 0.23779381811618805, "learning_rate": 9.787736093364416e-05, "loss": 1.0935, "step": 2590},
    {"epoch": 0.94, "grad_norm": 0.2965840995311737, "learning_rate": 9.786092923323203e-05, "loss": 1.0002, "step": 2600},
    {"epoch": 0.94, "eval_loss": 1.005922555923462, "eval_runtime": 125.0587, "eval_samples_per_second": 62.379, "eval_steps_per_second": 3.902, "step": 2600},
    {"epoch": 0.94, "grad_norm": 0.23760788142681122, "learning_rate": 9.784443556759766e-05, "loss": 1.0305, "step": 2610},
    {"epoch": 0.95, "grad_norm": 0.22895409166812897, "learning_rate": 9.78278799580953e-05, "loss": 1.0427, "step": 2620},
    {"epoch": 0.95, "grad_norm": 0.36007368564605713, "learning_rate": 9.781126242615939e-05, "loss": 1.0059, "step": 2630},
    {"epoch": 0.96, "grad_norm": 0.2813151776790619, "learning_rate": 9.779458299330452e-05, "loss": 1.0418, "step": 2640},
    {"epoch": 0.96, "grad_norm": 0.27038782835006714, "learning_rate": 9.777784168112545e-05, "loss": 1.0092, "step": 2650},
    {"epoch": 0.96, "grad_norm": 0.22898097336292267, "learning_rate": 9.776103851129706e-05, "loss": 0.9883, "step": 2660},
    {"epoch": 0.97, "grad_norm": 0.2213810682296753, "learning_rate": 9.774417350557428e-05, "loss": 1.0753, "step": 2670},
    {"epoch": 0.97, "grad_norm": 0.22410623729228973, "learning_rate": 9.772724668579212e-05, "loss": 1.0524, "step": 2680},
    {"epoch": 0.97, "grad_norm": 0.3005650043487549, "learning_rate": 9.771025807386562e-05, "loss": 1.0562, "step": 2690},
    {"epoch": 0.98, "grad_norm": 0.3941683769226074, "learning_rate": 9.769320769178983e-05, "loss": 0.9925, "step": 2700},
    {"epoch": 0.98, "grad_norm": 0.2829142212867737, "learning_rate": 9.767609556163977e-05, "loss": 1.014, "step": 2710},
    {"epoch": 0.98, "grad_norm": 0.29680418968200684, "learning_rate": 9.765892170557038e-05, "loss": 0.9677, "step": 2720},
    {"epoch": 0.99, "grad_norm": 0.22002767026424408, "learning_rate": 9.764168614581655e-05, "loss": 0.9954, "step": 2730},
    {"epoch": 0.99, "grad_norm": 0.2758820354938507, "learning_rate": 9.762438890469304e-05, "loss": 1.0029, "step": 2740},
    {"epoch": 1.0, "grad_norm": 0.2981850802898407, "learning_rate": 9.760703000459446e-05, "loss": 1.0555, "step": 2750},
    {"epoch": 1.0, "grad_norm": 0.22340857982635498, "learning_rate": 9.758960946799528e-05, "loss": 1.0394, "step": 2760},
    {"epoch": 1.0, "grad_norm": 0.19991633296012878, "learning_rate": 9.757212731744974e-05, "loss": 0.9325, "step": 2770},
    {"epoch": 1.01, "grad_norm": 0.30030888319015503, "learning_rate": 9.755458357559186e-05, "loss": 0.9711, "step": 2780},
    {"epoch": 1.01, "grad_norm": 0.3804832696914673, "learning_rate": 9.753697826513541e-05, "loss": 0.9651, "step": 2790},
    {"epoch": 1.01, "grad_norm": 0.46047547459602356, "learning_rate": 9.751931140887387e-05, "loss": 0.9686, "step": 2800},
    {"epoch": 1.01, "eval_loss": 1.0086077451705933, "eval_runtime": 124.6354, "eval_samples_per_second": 62.591, "eval_steps_per_second": 3.915, "step": 2800},
    {"epoch": 1.02, "grad_norm": 0.30646952986717224, "learning_rate": 9.750158302968039e-05, "loss": 0.9267, "step": 2810},
    {"epoch": 1.02, "grad_norm": 0.3007545471191406, "learning_rate": 9.748379315050778e-05, "loss": 1.0193, "step": 2820},
    {"epoch": 1.02, "grad_norm": 0.2814784049987793, "learning_rate": 9.74659417943885e-05, "loss": 0.8893, "step": 2830},
    {"epoch": 1.03, "grad_norm": 0.2728348970413208, "learning_rate": 9.744802898443456e-05, "loss": 0.937, "step": 2840},
    {"epoch": 1.03, "grad_norm": 0.2994844913482666, "learning_rate": 9.743005474383755e-05, "loss": 0.949, "step": 2850},
    {"epoch": 1.04, "grad_norm": 0.43111738562583923, "learning_rate": 9.741201909586861e-05, "loss": 0.9897, "step": 2860},
    {"epoch": 1.04, "grad_norm": 0.29551658034324646, "learning_rate": 9.739392206387838e-05, "loss": 0.9393, "step": 2870},
    {"epoch": 1.04, "grad_norm": 0.40380623936653137, "learning_rate": 9.737576367129694e-05, "loss": 0.9365, "step": 2880},
    {"epoch": 1.05, "grad_norm": 0.2757427394390106, "learning_rate": 9.735754394163386e-05, "loss": 1.0074, "step": 2890},
    {"epoch": 1.05, "grad_norm": 0.35594430565834045, "learning_rate": 9.73392628984781e-05, "loss": 0.9682, "step": 2900},
    {"epoch": 1.05, "grad_norm": 0.32288888096809387, "learning_rate": 9.732092056549799e-05, "loss": 0.9753, "step": 2910},
    {"epoch": 1.06, "grad_norm": 0.3491690158843994, "learning_rate": 9.730251696644122e-05, "loss": 0.926, "step": 2920},
    {"epoch": 1.06, "grad_norm": 0.41806405782699585, "learning_rate": 9.728405212513483e-05, "loss": 0.9993, "step": 2930},
    {"epoch": 1.06, "grad_norm": 0.4885188043117523, "learning_rate": 9.726552606548512e-05, "loss": 0.9879, "step": 2940},
    {"epoch": 1.07, "grad_norm": 0.41796302795410156, "learning_rate": 9.724693881147761e-05, "loss": 0.9626, "step": 2950},
    {"epoch": 1.07, "grad_norm": 0.39677631855010986, "learning_rate": 9.722829038717717e-05, "loss": 0.9767, "step": 2960},
    {"epoch": 1.07, "grad_norm": 0.5329232215881348, "learning_rate": 9.720958081672773e-05, "loss": 0.9357, "step": 2970},
    {"epoch": 1.08, "grad_norm": 0.4468931257724762, "learning_rate": 9.719081012435247e-05, "loss": 0.9705, "step": 2980},
    {"epoch": 1.08, "grad_norm": 0.4029316306114197, "learning_rate": 9.717197833435367e-05, "loss": 0.9727, "step": 2990},
    {"epoch": 1.09, "grad_norm": 0.37598028779029846, "learning_rate": 9.715308547111273e-05, "loss": 0.9767, "step": 3000},
    {"epoch": 1.09, "eval_loss": 1.014098048210144, "eval_runtime": 125.4232, "eval_samples_per_second": 62.197, "eval_steps_per_second": 3.891, "step": 3000},
    {"epoch": 1.09, "grad_norm": 0.3833357095718384, "learning_rate": 9.713413155909009e-05, "loss": 0.9605, "step": 3010},
    {"epoch": 1.09, "grad_norm": 0.4391871988773346, "learning_rate": 9.711511662282527e-05, "loss": 0.9611, "step": 3020},
    {"epoch": 1.1, "grad_norm": 0.39860454201698303, "learning_rate": 9.709604068693679e-05, "loss": 0.9222, "step": 3030},
    {"epoch": 1.1, "grad_norm": 0.33882561326026917, "learning_rate": 9.707690377612211e-05, "loss": 0.9369, "step": 3040},
    {"epoch": 1.1, "grad_norm": 0.3763039708137512, "learning_rate": 9.705770591515768e-05, "loss": 0.8864, "step": 3050},
    {"epoch": 1.11, "grad_norm": 0.3221600353717804, "learning_rate": 9.703844712889884e-05, "loss": 0.9753, "step": 3060},
    {"epoch": 1.11, "grad_norm": 0.3342023491859436, "learning_rate": 9.701912744227979e-05, "loss": 0.9233, "step": 3070},
    {"epoch": 1.11, "grad_norm": 0.4082651734352112, "learning_rate": 9.699974688031363e-05, "loss": 0.987, "step": 3080},
    {"epoch": 1.12, "grad_norm": 0.4198564291000366, "learning_rate": 9.69803054680922e-05, "loss": 0.8833, "step": 3090},
    {"epoch": 1.12, "grad_norm": 0.3833492398262024, "learning_rate": 9.696080323078621e-05, "loss": 0.9894, "step": 3100},
    {"epoch": 1.13, "grad_norm": 0.35935208201408386, "learning_rate": 9.694124019364505e-05, "loss": 0.9417, "step": 3110},
    {"epoch": 1.13, "grad_norm": 0.3433043658733368, "learning_rate": 9.692161638199686e-05, "loss": 0.9251, "step": 3120},
    {"epoch": 1.13, "grad_norm": 0.30163127183914185, "learning_rate": 9.690193182124844e-05, "loss": 0.9447, "step": 3130},
    {"epoch": 1.14, "grad_norm": 0.4361821711063385, "learning_rate": 9.68821865368853e-05, "loss": 0.9984, "step": 3140},
    {"epoch": 1.14, "grad_norm": 0.4263075888156891, "learning_rate": 9.686238055447148e-05, "loss": 0.9422, "step": 3150},
    {"epoch": 1.14, "grad_norm": 0.33963072299957275, "learning_rate": 9.684251389964967e-05, "loss": 0.9199, "step": 3160},
    {"epoch": 1.15, "grad_norm": 0.41040754318237305, "learning_rate": 9.68225865981411e-05, "loss": 0.9249, "step": 3170},
    {"epoch": 1.15, "grad_norm": 0.3697950839996338, "learning_rate": 9.680259867574552e-05, "loss": 0.947, "step": 3180},
    {"epoch": 1.15, "grad_norm": 0.3211696743965149, "learning_rate": 9.678255015834112e-05, "loss": 0.9956, "step": 3190},
    {"epoch": 1.16, "grad_norm": 0.4463675022125244, "learning_rate": 9.676244107188463e-05, "loss": 0.9494, "step": 3200},
    {"epoch": 1.16, "eval_loss": 1.0160499811172485, "eval_runtime": 124.6588, "eval_samples_per_second": 62.579, "eval_steps_per_second": 3.915, "step": 3200},
    {"epoch": 1.16, "step": 3200, "total_flos": 8.146148608211681e+17, "train_loss": 1.015018144249916, "train_runtime": 4695.8401, "train_samples_per_second": 94.128, "train_steps_per_second": 5.884}
  ],
  "logging_steps": 10,
  "max_steps": 27630,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 1000,
  "total_flos": 8.146148608211681e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}