{ "best_metric": 1.0082145929336548, "best_model_checkpoint": "ckpt/llama2_13b_fuze15_no_sys/alpaca_no_sys/checkpoint-2000", "epoch": 1.158161418747738, "eval_steps": 200, "global_step": 3200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.2189224660396576, "learning_rate": 5e-05, "loss": 1.4369, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.4817655384540558, "learning_rate": 0.0001, "loss": 1.3624, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.35323551297187805, "learning_rate": 9.999996763266864e-05, "loss": 1.1589, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.2697048485279083, "learning_rate": 9.999987053071647e-05, "loss": 1.1103, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.34059372544288635, "learning_rate": 9.99997086942692e-05, "loss": 1.0601, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.2907443344593048, "learning_rate": 9.999948212353635e-05, "loss": 1.0302, "step": 60 }, { "epoch": 0.03, "grad_norm": 0.4002208113670349, "learning_rate": 9.999919081881129e-05, "loss": 1.114, "step": 70 }, { "epoch": 0.03, "grad_norm": 0.4364459216594696, "learning_rate": 9.999883478047113e-05, "loss": 1.0913, "step": 80 }, { "epoch": 0.03, "grad_norm": 0.322396844625473, "learning_rate": 9.999841400897687e-05, "loss": 1.0778, "step": 90 }, { "epoch": 0.04, "grad_norm": 0.5678238868713379, "learning_rate": 9.999792850487325e-05, "loss": 1.0493, "step": 100 }, { "epoch": 0.04, "grad_norm": 0.2919568717479706, "learning_rate": 9.999737826878886e-05, "loss": 1.0249, "step": 110 }, { "epoch": 0.04, "grad_norm": 0.3787660300731659, "learning_rate": 9.99967633014361e-05, "loss": 1.0594, "step": 120 }, { "epoch": 0.05, "grad_norm": 0.33062055706977844, "learning_rate": 9.999608360361113e-05, "loss": 1.0527, "step": 130 }, { "epoch": 0.05, "grad_norm": 0.3306855857372284, "learning_rate": 9.999533917619399e-05, "loss": 1.0051, "step": 140 }, { "epoch": 0.05, "grad_norm": 0.41762664914131165, "learning_rate": 9.999453002014846e-05, "loss": 0.9906, "step": 150 }, { "epoch": 0.06, "grad_norm": 0.291189044713974, "learning_rate": 9.999365613652217e-05, "loss": 1.0197, "step": 160 }, { "epoch": 0.06, "grad_norm": 0.30276551842689514, "learning_rate": 9.999271752644649e-05, "loss": 1.0356, "step": 170 }, { "epoch": 0.07, "grad_norm": 0.25866344571113586, "learning_rate": 9.999171419113666e-05, "loss": 1.0332, "step": 180 }, { "epoch": 0.07, "grad_norm": 0.1927756369113922, "learning_rate": 9.999064613189171e-05, "loss": 1.0126, "step": 190 }, { "epoch": 0.07, "grad_norm": 0.2776283621788025, "learning_rate": 9.998951335009442e-05, "loss": 1.0429, "step": 200 }, { "epoch": 0.07, "eval_loss": 1.029819130897522, "eval_runtime": 124.6792, "eval_samples_per_second": 62.569, "eval_steps_per_second": 3.914, "step": 200 }, { "epoch": 0.08, "grad_norm": 0.320551335811615, "learning_rate": 9.998831584721141e-05, "loss": 1.0431, "step": 210 }, { "epoch": 0.08, "grad_norm": 0.46670058369636536, "learning_rate": 9.998705362479307e-05, "loss": 1.0374, "step": 220 }, { "epoch": 0.08, "grad_norm": 0.30959388613700867, "learning_rate": 9.99857266844736e-05, "loss": 1.1065, "step": 230 }, { "epoch": 0.09, "grad_norm": 0.3016811013221741, "learning_rate": 9.998433502797095e-05, "loss": 1.1105, "step": 240 }, { "epoch": 0.09, "grad_norm": 0.356992244720459, "learning_rate": 9.998287865708694e-05, "loss": 0.9839, "step": 250 }, { "epoch": 0.09, "grad_norm": 0.29836413264274597, "learning_rate": 9.998135757370708e-05, "loss": 1.0401, "step": 260 }, { "epoch": 0.1, "grad_norm": 0.4305395483970642, "learning_rate": 9.997977177980074e-05, "loss": 1.0461, "step": 270 }, { "epoch": 0.1, "grad_norm": 0.2959505021572113, "learning_rate": 9.9978121277421e-05, "loss": 1.0662, "step": 280 }, { "epoch": 0.1, "grad_norm": 0.2577110826969147, "learning_rate": 9.99764060687048e-05, "loss": 1.0736, "step": 290 }, { "epoch": 0.11, "grad_norm": 0.2583490014076233, "learning_rate": 9.997462615587276e-05, "loss": 0.9963, "step": 300 }, { "epoch": 0.11, "grad_norm": 0.29901596903800964, "learning_rate": 9.997278154122935e-05, "loss": 1.044, "step": 310 }, { "epoch": 0.12, "grad_norm": 0.24256502091884613, "learning_rate": 9.997087222716278e-05, "loss": 1.0713, "step": 320 }, { "epoch": 0.12, "grad_norm": 0.267166405916214, "learning_rate": 9.996889821614502e-05, "loss": 1.0721, "step": 330 }, { "epoch": 0.12, "grad_norm": 0.21612702310085297, "learning_rate": 9.996685951073182e-05, "loss": 1.0414, "step": 340 }, { "epoch": 0.13, "grad_norm": 0.3107874095439911, "learning_rate": 9.996475611356264e-05, "loss": 0.9856, "step": 350 }, { "epoch": 0.13, "grad_norm": 0.27626070380210876, "learning_rate": 9.996258802736079e-05, "loss": 1.0121, "step": 360 }, { "epoch": 0.13, "grad_norm": 0.2957281172275543, "learning_rate": 9.996035525493322e-05, "loss": 1.0785, "step": 370 }, { "epoch": 0.14, "grad_norm": 0.3168753981590271, "learning_rate": 9.995805779917073e-05, "loss": 0.996, "step": 380 }, { "epoch": 0.14, "grad_norm": 0.24823521077632904, "learning_rate": 9.99556956630478e-05, "loss": 1.0557, "step": 390 }, { "epoch": 0.14, "grad_norm": 0.3291969895362854, "learning_rate": 9.995326884962268e-05, "loss": 1.0505, "step": 400 }, { "epoch": 0.14, "eval_loss": 1.023820400238037, "eval_runtime": 124.7265, "eval_samples_per_second": 62.545, "eval_steps_per_second": 3.913, "step": 400 }, { "epoch": 0.15, "grad_norm": 0.3567464351654053, "learning_rate": 9.995077736203733e-05, "loss": 0.9919, "step": 410 }, { "epoch": 0.15, "grad_norm": 0.2938403785228729, "learning_rate": 9.99482212035175e-05, "loss": 1.0736, "step": 420 }, { "epoch": 0.16, "grad_norm": 0.27481499314308167, "learning_rate": 9.994560037737259e-05, "loss": 1.0633, "step": 430 }, { "epoch": 0.16, "grad_norm": 0.34652218222618103, "learning_rate": 9.994291488699579e-05, "loss": 1.049, "step": 440 }, { "epoch": 0.16, "grad_norm": 0.23733928799629211, "learning_rate": 9.994016473586398e-05, "loss": 1.0022, "step": 450 }, { "epoch": 0.17, "grad_norm": 0.2666071653366089, "learning_rate": 9.993734992753777e-05, "loss": 1.0076, "step": 460 }, { "epoch": 0.17, "grad_norm": 0.22843866050243378, "learning_rate": 9.993447046566146e-05, "loss": 1.0298, "step": 470 }, { "epoch": 0.17, "grad_norm": 0.4334356486797333, "learning_rate": 9.993152635396308e-05, "loss": 1.0635, "step": 480 }, { "epoch": 0.18, "grad_norm": 0.25845977663993835, "learning_rate": 9.992851759625433e-05, "loss": 1.0183, "step": 490 }, { "epoch": 0.18, "grad_norm": 0.26029086112976074, "learning_rate": 9.992544419643066e-05, "loss": 0.963, "step": 500 }, { "epoch": 0.18, "grad_norm": 0.23090577125549316, "learning_rate": 9.992230615847116e-05, "loss": 0.9691, "step": 510 }, { "epoch": 0.19, "grad_norm": 0.2835213243961334, "learning_rate": 9.991910348643865e-05, "loss": 1.0309, "step": 520 }, { "epoch": 0.19, "grad_norm": 0.2612157166004181, "learning_rate": 9.991583618447958e-05, "loss": 1.0232, "step": 530 }, { "epoch": 0.2, "grad_norm": 0.43860122561454773, "learning_rate": 9.99125042568241e-05, "loss": 1.0308, "step": 540 }, { "epoch": 0.2, "grad_norm": 0.2504933476448059, "learning_rate": 9.990910770778606e-05, "loss": 1.0581, "step": 550 }, { "epoch": 0.2, "grad_norm": 0.2778143286705017, "learning_rate": 9.990564654176293e-05, "loss": 0.958, "step": 560 }, { "epoch": 0.21, "grad_norm": 0.29035818576812744, "learning_rate": 9.990212076323586e-05, "loss": 1.0258, "step": 570 }, { "epoch": 0.21, "grad_norm": 0.307841032743454, "learning_rate": 9.989853037676965e-05, "loss": 1.0724, "step": 580 }, { "epoch": 0.21, "grad_norm": 0.3011914789676666, "learning_rate": 9.989487538701279e-05, "loss": 0.9847, "step": 590 }, { "epoch": 0.22, "grad_norm": 0.27195674180984497, "learning_rate": 9.989115579869732e-05, "loss": 1.044, "step": 600 }, { "epoch": 0.22, "eval_loss": 1.0194298028945923, "eval_runtime": 124.7334, "eval_samples_per_second": 62.541, "eval_steps_per_second": 3.912, "step": 600 }, { "epoch": 0.22, "grad_norm": 0.2725551724433899, "learning_rate": 9.988737161663898e-05, "loss": 1.0244, "step": 610 }, { "epoch": 0.22, "grad_norm": 0.2821577787399292, "learning_rate": 9.988352284573713e-05, "loss": 1.0254, "step": 620 }, { "epoch": 0.23, "grad_norm": 0.3664613664150238, "learning_rate": 9.987960949097475e-05, "loss": 1.1093, "step": 630 }, { "epoch": 0.23, "grad_norm": 0.3072526156902313, "learning_rate": 9.987563155741842e-05, "loss": 1.0196, "step": 640 }, { "epoch": 0.24, "grad_norm": 0.24550805985927582, "learning_rate": 9.987158905021836e-05, "loss": 1.012, "step": 650 }, { "epoch": 0.24, "grad_norm": 0.2521149814128876, "learning_rate": 9.986748197460837e-05, "loss": 1.0219, "step": 660 }, { "epoch": 0.24, "grad_norm": 0.34175044298171997, "learning_rate": 9.986331033590586e-05, "loss": 1.015, "step": 670 }, { "epoch": 0.25, "grad_norm": 0.30103522539138794, "learning_rate": 9.98590741395118e-05, "loss": 1.1113, "step": 680 }, { "epoch": 0.25, "grad_norm": 0.2344699651002884, "learning_rate": 9.985477339091078e-05, "loss": 1.0456, "step": 690 }, { "epoch": 0.25, "grad_norm": 0.26754796504974365, "learning_rate": 9.985040809567097e-05, "loss": 1.0102, "step": 700 }, { "epoch": 0.26, "grad_norm": 0.31665658950805664, "learning_rate": 9.984597825944405e-05, "loss": 1.0057, "step": 710 }, { "epoch": 0.26, "grad_norm": 0.2716057300567627, "learning_rate": 9.984148388796532e-05, "loss": 0.9937, "step": 720 }, { "epoch": 0.26, "grad_norm": 0.2589300274848938, "learning_rate": 9.983692498705361e-05, "loss": 0.9937, "step": 730 }, { "epoch": 0.27, "grad_norm": 0.2215312272310257, "learning_rate": 9.983230156261132e-05, "loss": 1.0205, "step": 740 }, { "epoch": 0.27, "grad_norm": 0.26202231645584106, "learning_rate": 9.982761362062432e-05, "loss": 1.0486, "step": 750 }, { "epoch": 0.28, "grad_norm": 0.21432209014892578, "learning_rate": 9.982286116716208e-05, "loss": 1.0679, "step": 760 }, { "epoch": 0.28, "grad_norm": 0.4230276048183441, "learning_rate": 9.98180442083776e-05, "loss": 1.0051, "step": 770 }, { "epoch": 0.28, "grad_norm": 0.26559358835220337, "learning_rate": 9.981316275050731e-05, "loss": 1.0398, "step": 780 }, { "epoch": 0.29, "grad_norm": 0.2559758722782135, "learning_rate": 9.980821679987125e-05, "loss": 1.0365, "step": 790 }, { "epoch": 0.29, "grad_norm": 0.34101855754852295, "learning_rate": 9.980320636287285e-05, "loss": 1.0169, "step": 800 }, { "epoch": 0.29, "eval_loss": 1.0172123908996582, "eval_runtime": 124.7169, "eval_samples_per_second": 62.55, "eval_steps_per_second": 3.913, "step": 800 }, { "epoch": 0.29, "grad_norm": 0.3401408791542053, "learning_rate": 9.979813144599915e-05, "loss": 1.0165, "step": 810 }, { "epoch": 0.3, "grad_norm": 0.34302470088005066, "learning_rate": 9.979299205582057e-05, "loss": 1.0314, "step": 820 }, { "epoch": 0.3, "grad_norm": 0.2908473610877991, "learning_rate": 9.978778819899109e-05, "loss": 0.9779, "step": 830 }, { "epoch": 0.3, "grad_norm": 0.229986771941185, "learning_rate": 9.978251988224804e-05, "loss": 0.9564, "step": 840 }, { "epoch": 0.31, "grad_norm": 0.441243052482605, "learning_rate": 9.977718711241233e-05, "loss": 1.0275, "step": 850 }, { "epoch": 0.31, "grad_norm": 0.2620699107646942, "learning_rate": 9.977178989638822e-05, "loss": 1.0293, "step": 860 }, { "epoch": 0.31, "grad_norm": 0.27257561683654785, "learning_rate": 9.97663282411635e-05, "loss": 1.0508, "step": 870 }, { "epoch": 0.32, "grad_norm": 0.306587278842926, "learning_rate": 9.97608021538093e-05, "loss": 0.9949, "step": 880 }, { "epoch": 0.32, "grad_norm": 0.30046141147613525, "learning_rate": 9.97552116414802e-05, "loss": 1.0752, "step": 890 }, { "epoch": 0.33, "grad_norm": 0.2749102711677551, "learning_rate": 9.974955671141424e-05, "loss": 0.9947, "step": 900 }, { "epoch": 0.33, "grad_norm": 0.38608163595199585, "learning_rate": 9.974383737093279e-05, "loss": 1.0362, "step": 910 }, { "epoch": 0.33, "grad_norm": 0.24529774487018585, "learning_rate": 9.973805362744064e-05, "loss": 1.0469, "step": 920 }, { "epoch": 0.34, "grad_norm": 0.33143192529678345, "learning_rate": 9.973220548842598e-05, "loss": 0.9705, "step": 930 }, { "epoch": 0.34, "grad_norm": 0.3112998306751251, "learning_rate": 9.972629296146035e-05, "loss": 0.9956, "step": 940 }, { "epoch": 0.34, "grad_norm": 0.32970279455184937, "learning_rate": 9.972031605419864e-05, "loss": 1.0232, "step": 950 }, { "epoch": 0.35, "grad_norm": 0.256101131439209, "learning_rate": 9.971427477437914e-05, "loss": 1.0471, "step": 960 }, { "epoch": 0.35, "grad_norm": 0.4258672595024109, "learning_rate": 9.970816912982344e-05, "loss": 0.9652, "step": 970 }, { "epoch": 0.35, "grad_norm": 0.3143826425075531, "learning_rate": 9.970199912843648e-05, "loss": 0.9894, "step": 980 }, { "epoch": 0.36, "grad_norm": 0.2868054509162903, "learning_rate": 9.96957647782065e-05, "loss": 1.0437, "step": 990 }, { "epoch": 0.36, "grad_norm": 0.2594622075557709, "learning_rate": 9.968946608720511e-05, "loss": 1.02, "step": 1000 }, { "epoch": 0.36, "eval_loss": 1.0154483318328857, "eval_runtime": 124.672, "eval_samples_per_second": 62.572, "eval_steps_per_second": 3.914, "step": 1000 }, { "epoch": 0.37, "grad_norm": 0.2359086573123932, "learning_rate": 9.968310306358715e-05, "loss": 1.0676, "step": 1010 }, { "epoch": 0.37, "grad_norm": 0.22080975770950317, "learning_rate": 9.967667571559081e-05, "loss": 1.027, "step": 1020 }, { "epoch": 0.37, "grad_norm": 0.3211756944656372, "learning_rate": 9.967018405153749e-05, "loss": 1.0004, "step": 1030 }, { "epoch": 0.38, "grad_norm": 0.3681553602218628, "learning_rate": 9.966362807983196e-05, "loss": 1.0395, "step": 1040 }, { "epoch": 0.38, "grad_norm": 0.3180038332939148, "learning_rate": 9.965700780896216e-05, "loss": 0.9948, "step": 1050 }, { "epoch": 0.38, "grad_norm": 0.25071969628334045, "learning_rate": 9.965032324749932e-05, "loss": 1.0281, "step": 1060 }, { "epoch": 0.39, "grad_norm": 0.2274983674287796, "learning_rate": 9.964357440409789e-05, "loss": 1.0094, "step": 1070 }, { "epoch": 0.39, "grad_norm": 0.24825724959373474, "learning_rate": 9.963676128749553e-05, "loss": 1.0272, "step": 1080 }, { "epoch": 0.39, "grad_norm": 0.3256381154060364, "learning_rate": 9.96298839065132e-05, "loss": 1.0191, "step": 1090 }, { "epoch": 0.4, "grad_norm": 0.31695234775543213, "learning_rate": 9.962294227005493e-05, "loss": 1.08, "step": 1100 }, { "epoch": 0.4, "grad_norm": 0.288083553314209, "learning_rate": 9.961593638710804e-05, "loss": 0.9954, "step": 1110 }, { "epoch": 0.41, "grad_norm": 0.29730525612831116, "learning_rate": 9.960886626674302e-05, "loss": 1.071, "step": 1120 }, { "epoch": 0.41, "grad_norm": 0.2090187519788742, "learning_rate": 9.960173191811348e-05, "loss": 0.9725, "step": 1130 }, { "epoch": 0.41, "grad_norm": 0.2811983525753021, "learning_rate": 9.959453335045622e-05, "loss": 1.0071, "step": 1140 }, { "epoch": 0.42, "grad_norm": 0.27806761860847473, "learning_rate": 9.958727057309115e-05, "loss": 1.0108, "step": 1150 }, { "epoch": 0.42, "grad_norm": 0.2864569127559662, "learning_rate": 9.957994359542138e-05, "loss": 1.0495, "step": 1160 }, { "epoch": 0.42, "grad_norm": 0.3440109193325043, "learning_rate": 9.957255242693308e-05, "loss": 1.0015, "step": 1170 }, { "epoch": 0.43, "grad_norm": 0.2824917435646057, "learning_rate": 9.956509707719555e-05, "loss": 1.0559, "step": 1180 }, { "epoch": 0.43, "grad_norm": 0.3080492317676544, "learning_rate": 9.955757755586119e-05, "loss": 1.0134, "step": 1190 }, { "epoch": 0.43, "grad_norm": 0.2890901565551758, "learning_rate": 9.954999387266546e-05, "loss": 0.9492, "step": 1200 }, { "epoch": 0.43, "eval_loss": 1.0133627653121948, "eval_runtime": 124.7104, "eval_samples_per_second": 62.553, "eval_steps_per_second": 3.913, "step": 1200 }, { "epoch": 0.44, "grad_norm": 0.33987322449684143, "learning_rate": 9.95423460374269e-05, "loss": 0.9629, "step": 1210 }, { "epoch": 0.44, "grad_norm": 0.29403063654899597, "learning_rate": 9.953463406004713e-05, "loss": 1.0384, "step": 1220 }, { "epoch": 0.45, "grad_norm": 0.20130111277103424, "learning_rate": 9.952685795051077e-05, "loss": 1.0235, "step": 1230 }, { "epoch": 0.45, "grad_norm": 0.1973690539598465, "learning_rate": 9.951901771888552e-05, "loss": 1.0395, "step": 1240 }, { "epoch": 0.45, "grad_norm": 0.24519580602645874, "learning_rate": 9.951111337532205e-05, "loss": 1.0914, "step": 1250 }, { "epoch": 0.46, "grad_norm": 0.2706618309020996, "learning_rate": 9.950314493005408e-05, "loss": 1.0714, "step": 1260 }, { "epoch": 0.46, "grad_norm": 0.23367558419704437, "learning_rate": 9.949511239339831e-05, "loss": 1.0224, "step": 1270 }, { "epoch": 0.46, "grad_norm": 0.30005407333374023, "learning_rate": 9.948701577575439e-05, "loss": 1.0152, "step": 1280 }, { "epoch": 0.47, "grad_norm": 0.3130083382129669, "learning_rate": 9.947885508760496e-05, "loss": 0.8988, "step": 1290 }, { "epoch": 0.47, "grad_norm": 0.23657679557800293, "learning_rate": 9.94706303395156e-05, "loss": 1.0242, "step": 1300 }, { "epoch": 0.47, "grad_norm": 0.40966659784317017, "learning_rate": 9.946234154213487e-05, "loss": 1.0145, "step": 1310 }, { "epoch": 0.48, "grad_norm": 0.35292962193489075, "learning_rate": 9.94539887061942e-05, "loss": 1.0197, "step": 1320 }, { "epoch": 0.48, "grad_norm": 0.38793638348579407, "learning_rate": 9.944557184250794e-05, "loss": 1.0273, "step": 1330 }, { "epoch": 0.48, "grad_norm": 0.27373677492141724, "learning_rate": 9.943709096197335e-05, "loss": 0.9561, "step": 1340 }, { "epoch": 0.49, "grad_norm": 0.24536257982254028, "learning_rate": 9.942854607557057e-05, "loss": 0.9678, "step": 1350 }, { "epoch": 0.49, "grad_norm": 0.4609609842300415, "learning_rate": 9.941993719436262e-05, "loss": 1.0429, "step": 1360 }, { "epoch": 0.5, "grad_norm": 0.27118805050849915, "learning_rate": 9.941126432949535e-05, "loss": 1.0506, "step": 1370 }, { "epoch": 0.5, "grad_norm": 0.27538400888442993, "learning_rate": 9.940252749219746e-05, "loss": 1.0326, "step": 1380 }, { "epoch": 0.5, "grad_norm": 0.2451954036951065, "learning_rate": 9.939372669378048e-05, "loss": 1.0413, "step": 1390 }, { "epoch": 0.51, "grad_norm": 0.2622232437133789, "learning_rate": 9.938486194563875e-05, "loss": 1.0051, "step": 1400 }, { "epoch": 0.51, "eval_loss": 1.011703372001648, "eval_runtime": 124.6726, "eval_samples_per_second": 62.572, "eval_steps_per_second": 3.914, "step": 1400 }, { "epoch": 0.51, "grad_norm": 0.2616746425628662, "learning_rate": 9.937593325924937e-05, "loss": 1.0277, "step": 1410 }, { "epoch": 0.51, "grad_norm": 0.2952045202255249, "learning_rate": 9.936694064617227e-05, "loss": 0.9802, "step": 1420 }, { "epoch": 0.52, "grad_norm": 0.2611790895462036, "learning_rate": 9.935788411805011e-05, "loss": 0.9811, "step": 1430 }, { "epoch": 0.52, "grad_norm": 0.3291374742984772, "learning_rate": 9.934876368660836e-05, "loss": 0.9972, "step": 1440 }, { "epoch": 0.52, "grad_norm": 0.32888704538345337, "learning_rate": 9.933957936365515e-05, "loss": 1.1006, "step": 1450 }, { "epoch": 0.53, "grad_norm": 0.20011785626411438, "learning_rate": 9.933033116108134e-05, "loss": 1.0139, "step": 1460 }, { "epoch": 0.53, "grad_norm": 0.3157961666584015, "learning_rate": 9.932101909086056e-05, "loss": 0.993, "step": 1470 }, { "epoch": 0.54, "grad_norm": 0.22981207072734833, "learning_rate": 9.931164316504904e-05, "loss": 1.0539, "step": 1480 }, { "epoch": 0.54, "grad_norm": 0.23787029087543488, "learning_rate": 9.930220339578576e-05, "loss": 0.9599, "step": 1490 }, { "epoch": 0.54, "grad_norm": 0.2633046507835388, "learning_rate": 9.929269979529232e-05, "loss": 0.9813, "step": 1500 }, { "epoch": 0.55, "grad_norm": 0.2666633725166321, "learning_rate": 9.928313237587296e-05, "loss": 0.9637, "step": 1510 }, { "epoch": 0.55, "grad_norm": 0.26092538237571716, "learning_rate": 9.927350114991456e-05, "loss": 1.0375, "step": 1520 }, { "epoch": 0.55, "grad_norm": 0.2837240397930145, "learning_rate": 9.92638061298866e-05, "loss": 1.0053, "step": 1530 }, { "epoch": 0.56, "grad_norm": 0.2586491107940674, "learning_rate": 9.925404732834117e-05, "loss": 1.0631, "step": 1540 }, { "epoch": 0.56, "grad_norm": 0.43321874737739563, "learning_rate": 9.924422475791288e-05, "loss": 1.0134, "step": 1550 }, { "epoch": 0.56, "grad_norm": 0.19062629342079163, "learning_rate": 9.923433843131901e-05, "loss": 0.9989, "step": 1560 }, { "epoch": 0.57, "grad_norm": 0.34545308351516724, "learning_rate": 9.922438836135928e-05, "loss": 1.0896, "step": 1570 }, { "epoch": 0.57, "grad_norm": 0.2846600115299225, "learning_rate": 9.921437456091596e-05, "loss": 0.9954, "step": 1580 }, { "epoch": 0.58, "grad_norm": 0.25403323769569397, "learning_rate": 9.920429704295391e-05, "loss": 0.9937, "step": 1590 }, { "epoch": 0.58, "grad_norm": 0.23549498617649078, "learning_rate": 9.919415582052036e-05, "loss": 1.0469, "step": 1600 }, { "epoch": 0.58, "eval_loss": 1.0105613470077515, "eval_runtime": 124.7139, "eval_samples_per_second": 62.551, "eval_steps_per_second": 3.913, "step": 1600 }, { "epoch": 0.58, "grad_norm": 0.21466514468193054, "learning_rate": 9.918395090674514e-05, "loss": 1.0408, "step": 1610 }, { "epoch": 0.59, "grad_norm": 0.21247586607933044, "learning_rate": 9.917368231484045e-05, "loss": 0.9893, "step": 1620 }, { "epoch": 0.59, "grad_norm": 0.26590731739997864, "learning_rate": 9.916335005810095e-05, "loss": 1.0563, "step": 1630 }, { "epoch": 0.59, "grad_norm": 0.2346472591161728, "learning_rate": 9.91529541499038e-05, "loss": 1.0061, "step": 1640 }, { "epoch": 0.6, "grad_norm": 0.27766481041908264, "learning_rate": 9.914249460370846e-05, "loss": 0.9639, "step": 1650 }, { "epoch": 0.6, "grad_norm": 0.24883978068828583, "learning_rate": 9.913197143305684e-05, "loss": 1.0289, "step": 1660 }, { "epoch": 0.6, "grad_norm": 0.2379382699728012, "learning_rate": 9.912138465157325e-05, "loss": 1.0154, "step": 1670 }, { "epoch": 0.61, "grad_norm": 0.17160119116306305, "learning_rate": 9.91107342729643e-05, "loss": 1.0002, "step": 1680 }, { "epoch": 0.61, "grad_norm": 0.2804344892501831, "learning_rate": 9.910002031101895e-05, "loss": 0.9887, "step": 1690 }, { "epoch": 0.62, "grad_norm": 0.2296508252620697, "learning_rate": 9.908924277960854e-05, "loss": 1.0703, "step": 1700 }, { "epoch": 0.62, "grad_norm": 0.22265523672103882, "learning_rate": 9.907840169268662e-05, "loss": 0.9495, "step": 1710 }, { "epoch": 0.62, "grad_norm": 0.3383825123310089, "learning_rate": 9.90674970642891e-05, "loss": 0.9878, "step": 1720 }, { "epoch": 0.63, "grad_norm": 0.2603285312652588, "learning_rate": 9.905652890853411e-05, "loss": 1.0351, "step": 1730 }, { "epoch": 0.63, "grad_norm": 0.27001509070396423, "learning_rate": 9.904549723962206e-05, "loss": 1.0528, "step": 1740 }, { "epoch": 0.63, "grad_norm": 0.34035804867744446, "learning_rate": 9.903440207183558e-05, "loss": 1.0159, "step": 1750 }, { "epoch": 0.64, "grad_norm": 0.3518404960632324, "learning_rate": 9.90232434195395e-05, "loss": 0.9879, "step": 1760 }, { "epoch": 0.64, "grad_norm": 0.24958577752113342, "learning_rate": 9.901202129718086e-05, "loss": 1.0221, "step": 1770 }, { "epoch": 0.64, "grad_norm": 0.23898568749427795, "learning_rate": 9.900073571928886e-05, "loss": 1.037, "step": 1780 }, { "epoch": 0.65, "grad_norm": 0.22275009751319885, "learning_rate": 9.898938670047486e-05, "loss": 1.0008, "step": 1790 }, { "epoch": 0.65, "grad_norm": 0.2770971655845642, "learning_rate": 9.897797425543236e-05, "loss": 0.9994, "step": 1800 }, { "epoch": 0.65, "eval_loss": 1.0094062089920044, "eval_runtime": 124.6598, "eval_samples_per_second": 62.578, "eval_steps_per_second": 3.915, "step": 1800 }, { "epoch": 0.66, "grad_norm": 0.2470710575580597, "learning_rate": 9.896649839893699e-05, "loss": 1.0093, "step": 1810 }, { "epoch": 0.66, "grad_norm": 0.31282275915145874, "learning_rate": 9.895495914584643e-05, "loss": 1.0124, "step": 1820 }, { "epoch": 0.66, "grad_norm": 0.2757389545440674, "learning_rate": 9.894335651110051e-05, "loss": 1.0197, "step": 1830 }, { "epoch": 0.67, "grad_norm": 0.3123573362827301, "learning_rate": 9.893169050972106e-05, "loss": 0.9469, "step": 1840 }, { "epoch": 0.67, "grad_norm": 0.4073740839958191, "learning_rate": 9.8919961156812e-05, "loss": 1.0153, "step": 1850 }, { "epoch": 0.67, "grad_norm": 0.24388962984085083, "learning_rate": 9.89081684675592e-05, "loss": 1.0124, "step": 1860 }, { "epoch": 0.68, "grad_norm": 0.27508777379989624, "learning_rate": 9.88963124572306e-05, "loss": 0.96, "step": 1870 }, { "epoch": 0.68, "grad_norm": 0.2843553125858307, "learning_rate": 9.88843931411761e-05, "loss": 1.0448, "step": 1880 }, { "epoch": 0.68, "grad_norm": 0.25155389308929443, "learning_rate": 9.887241053482757e-05, "loss": 1.0362, "step": 1890 }, { "epoch": 0.69, "grad_norm": 0.21977895498275757, "learning_rate": 9.886036465369877e-05, "loss": 1.0658, "step": 1900 }, { "epoch": 0.69, "grad_norm": 0.22326160967350006, "learning_rate": 9.884825551338546e-05, "loss": 1.0068, "step": 1910 }, { "epoch": 0.69, "grad_norm": 0.3339684307575226, "learning_rate": 9.883608312956524e-05, "loss": 1.0147, "step": 1920 }, { "epoch": 0.7, "grad_norm": 0.26512840390205383, "learning_rate": 9.882384751799762e-05, "loss": 0.9421, "step": 1930 }, { "epoch": 0.7, "grad_norm": 0.313123881816864, "learning_rate": 9.881154869452395e-05, "loss": 1.0032, "step": 1940 }, { "epoch": 0.71, "grad_norm": 0.3562926948070526, "learning_rate": 9.879918667506748e-05, "loss": 1.0491, "step": 1950 }, { "epoch": 0.71, "grad_norm": 0.373032808303833, "learning_rate": 9.87867614756332e-05, "loss": 0.9823, "step": 1960 }, { "epoch": 0.71, "grad_norm": 0.2701728641986847, "learning_rate": 9.87742731123079e-05, "loss": 1.0326, "step": 1970 }, { "epoch": 0.72, "grad_norm": 0.4167492687702179, "learning_rate": 9.876172160126024e-05, "loss": 1.0256, "step": 1980 }, { "epoch": 0.72, "grad_norm": 0.2636062800884247, "learning_rate": 9.874910695874053e-05, "loss": 1.0301, "step": 1990 }, { "epoch": 0.72, "grad_norm": 0.27048760652542114, "learning_rate": 9.873642920108091e-05, "loss": 1.0141, "step": 2000 }, { "epoch": 0.72, "eval_loss": 1.0082145929336548, "eval_runtime": 124.7209, "eval_samples_per_second": 62.548, "eval_steps_per_second": 3.913, "step": 2000 }, { "epoch": 0.73, "grad_norm": 0.26596397161483765, "learning_rate": 9.872368834469514e-05, "loss": 0.9554, "step": 2010 }, { "epoch": 0.73, "grad_norm": 0.3881726861000061, "learning_rate": 9.871088440607874e-05, "loss": 1.0374, "step": 2020 }, { "epoch": 0.73, "grad_norm": 0.345869243144989, "learning_rate": 9.869801740180889e-05, "loss": 1.01, "step": 2030 }, { "epoch": 0.74, "grad_norm": 0.3740908205509186, "learning_rate": 9.86850873485444e-05, "loss": 1.0244, "step": 2040 }, { "epoch": 0.74, "grad_norm": 0.3265666663646698, "learning_rate": 9.867209426302572e-05, "loss": 0.9303, "step": 2050 }, { "epoch": 0.75, "grad_norm": 0.381783664226532, "learning_rate": 9.865903816207493e-05, "loss": 1.0851, "step": 2060 }, { "epoch": 0.75, "grad_norm": 0.30846527218818665, "learning_rate": 9.864591906259568e-05, "loss": 1.0042, "step": 2070 }, { "epoch": 0.75, "grad_norm": 0.36899617314338684, "learning_rate": 9.863273698157315e-05, "loss": 0.9866, "step": 2080 }, { "epoch": 0.76, "grad_norm": 0.25415265560150146, "learning_rate": 9.861949193607411e-05, "loss": 1.056, "step": 2090 }, { "epoch": 0.76, "grad_norm": 0.3369081914424896, "learning_rate": 9.860618394324682e-05, "loss": 0.9988, "step": 2100 }, { "epoch": 0.76, "grad_norm": 0.19644911587238312, "learning_rate": 9.859281302032106e-05, "loss": 0.9562, "step": 2110 }, { "epoch": 0.77, "grad_norm": 0.3449130356311798, "learning_rate": 9.857937918460808e-05, "loss": 1.0325, "step": 2120 }, { "epoch": 0.77, "grad_norm": 0.2639143764972687, "learning_rate": 9.856588245350056e-05, "loss": 1.0458, "step": 2130 }, { "epoch": 0.77, "grad_norm": 0.2752164602279663, "learning_rate": 9.855232284447262e-05, "loss": 1.089, "step": 2140 }, { "epoch": 0.78, "grad_norm": 0.31700417399406433, "learning_rate": 9.853870037507983e-05, "loss": 1.0398, "step": 2150 }, { "epoch": 0.78, "grad_norm": 0.24685466289520264, "learning_rate": 9.852501506295907e-05, "loss": 1.0038, "step": 2160 }, { "epoch": 0.79, "grad_norm": 0.28860118985176086, "learning_rate": 9.851126692582864e-05, "loss": 1.0343, "step": 2170 }, { "epoch": 0.79, "grad_norm": 0.2774854898452759, "learning_rate": 9.849745598148817e-05, "loss": 0.9986, "step": 2180 }, { "epoch": 0.79, "grad_norm": 0.28867611289024353, "learning_rate": 9.848358224781857e-05, "loss": 1.035, "step": 2190 }, { "epoch": 0.8, "grad_norm": 0.2703929841518402, "learning_rate": 9.84696457427821e-05, "loss": 1.0891, "step": 2200 }, { "epoch": 0.8, "eval_loss": 1.0072919130325317, "eval_runtime": 125.0779, "eval_samples_per_second": 62.369, "eval_steps_per_second": 3.902, "step": 2200 }, { "epoch": 0.8, "grad_norm": 0.3247489035129547, "learning_rate": 9.845564648442222e-05, "loss": 1.0259, "step": 2210 }, { "epoch": 0.8, "grad_norm": 0.2535197138786316, "learning_rate": 9.844158449086371e-05, "loss": 1.0457, "step": 2220 }, { "epoch": 0.81, "grad_norm": 0.26780492067337036, "learning_rate": 9.842745978031253e-05, "loss": 0.9869, "step": 2230 }, { "epoch": 0.81, "grad_norm": 0.29711589217185974, "learning_rate": 9.841327237105585e-05, "loss": 1.0158, "step": 2240 }, { "epoch": 0.81, "grad_norm": 0.239434614777565, "learning_rate": 9.8399022281462e-05, "loss": 0.997, "step": 2250 }, { "epoch": 0.82, "grad_norm": 0.2368830293416977, "learning_rate": 9.838470952998049e-05, "loss": 1.0148, "step": 2260 }, { "epoch": 0.82, "grad_norm": 0.2554934322834015, "learning_rate": 9.837033413514191e-05, "loss": 0.9787, "step": 2270 }, { "epoch": 0.83, "grad_norm": 0.2310570627450943, "learning_rate": 9.835589611555805e-05, "loss": 0.9656, "step": 2280 }, { "epoch": 0.83, "grad_norm": 0.22654668986797333, "learning_rate": 9.834139548992165e-05, "loss": 0.9837, "step": 2290 }, { "epoch": 0.83, "grad_norm": 0.25957950949668884, "learning_rate": 9.832683227700661e-05, "loss": 1.0513, "step": 2300 }, { "epoch": 0.84, "grad_norm": 0.20669637620449066, "learning_rate": 9.831220649566782e-05, "loss": 0.9649, "step": 2310 }, { "epoch": 0.84, "grad_norm": 0.24330663681030273, "learning_rate": 9.829751816484116e-05, "loss": 1.0208, "step": 2320 }, { "epoch": 0.84, "grad_norm": 0.28211724758148193, "learning_rate": 9.828276730354353e-05, "loss": 0.9512, "step": 2330 }, { "epoch": 0.85, "grad_norm": 0.23784276843070984, "learning_rate": 9.826795393087278e-05, "loss": 0.976, "step": 2340 }, { "epoch": 0.85, "grad_norm": 0.2881389260292053, "learning_rate": 9.825307806600765e-05, "loss": 1.0036, "step": 2350 }, { "epoch": 0.85, "grad_norm": 0.27906882762908936, "learning_rate": 9.823813972820786e-05, "loss": 1.0555, "step": 2360 }, { "epoch": 0.86, "grad_norm": 0.25142115354537964, "learning_rate": 9.822313893681397e-05, "loss": 1.0483, "step": 2370 }, { "epoch": 0.86, "grad_norm": 0.244681715965271, "learning_rate": 9.820807571124738e-05, "loss": 1.0102, "step": 2380 }, { "epoch": 0.87, "grad_norm": 0.3696367144584656, "learning_rate": 9.819295007101035e-05, "loss": 1.0626, "step": 2390 }, { "epoch": 0.87, "grad_norm": 0.26112619042396545, "learning_rate": 9.817776203568596e-05, "loss": 1.0141, "step": 2400 }, { "epoch": 0.87, "eval_loss": 1.0063296556472778, "eval_runtime": 125.7335, "eval_samples_per_second": 62.044, "eval_steps_per_second": 3.881, "step": 2400 }, { "epoch": 0.87, "grad_norm": 0.25221410393714905, "learning_rate": 9.816251162493804e-05, "loss": 1.0222, "step": 2410 }, { "epoch": 0.88, "grad_norm": 0.19672074913978577, "learning_rate": 9.814719885851121e-05, "loss": 0.9891, "step": 2420 }, { "epoch": 0.88, "grad_norm": 0.3084292411804199, "learning_rate": 9.81318237562308e-05, "loss": 0.9785, "step": 2430 }, { "epoch": 0.88, "grad_norm": 0.3434545397758484, "learning_rate": 9.811638633800287e-05, "loss": 0.9357, "step": 2440 }, { "epoch": 0.89, "grad_norm": 0.23335447907447815, "learning_rate": 9.81008866238141e-05, "loss": 1.0485, "step": 2450 }, { "epoch": 0.89, "grad_norm": 0.2942172586917877, "learning_rate": 9.808532463373188e-05, "loss": 1.0138, "step": 2460 }, { "epoch": 0.89, "grad_norm": 0.22536420822143555, "learning_rate": 9.806970038790423e-05, "loss": 1.0421, "step": 2470 }, { "epoch": 0.9, "grad_norm": 0.30886924266815186, "learning_rate": 9.805401390655975e-05, "loss": 0.9926, "step": 2480 }, { "epoch": 0.9, "grad_norm": 0.34105512499809265, "learning_rate": 9.803826521000761e-05, "loss": 1.0013, "step": 2490 }, { "epoch": 0.9, "grad_norm": 0.261643648147583, "learning_rate": 9.802245431863757e-05, "loss": 0.9937, "step": 2500 }, { "epoch": 0.91, "grad_norm": 0.3864617347717285, "learning_rate": 9.800658125291984e-05, "loss": 0.9986, "step": 2510 }, { "epoch": 0.91, "grad_norm": 0.31850436329841614, "learning_rate": 9.79906460334052e-05, "loss": 0.9984, "step": 2520 }, { "epoch": 0.92, "grad_norm": 0.25421255826950073, "learning_rate": 9.797464868072488e-05, "loss": 1.0273, "step": 2530 }, { "epoch": 0.92, "grad_norm": 0.34440311789512634, "learning_rate": 9.795858921559052e-05, "loss": 1.0346, "step": 2540 }, { "epoch": 0.92, "grad_norm": 0.33147209882736206, "learning_rate": 9.79424676587942e-05, "loss": 1.0691, "step": 2550 }, { "epoch": 0.93, "grad_norm": 0.2778458893299103, "learning_rate": 9.792628403120842e-05, "loss": 1.009, "step": 2560 }, { "epoch": 0.93, "grad_norm": 0.29282572865486145, "learning_rate": 9.791003835378598e-05, "loss": 1.0015, "step": 2570 }, { "epoch": 0.93, "grad_norm": 0.25391730666160583, "learning_rate": 9.789373064756008e-05, "loss": 1.0177, "step": 2580 }, { "epoch": 0.94, "grad_norm": 0.23779381811618805, "learning_rate": 9.787736093364416e-05, "loss": 1.0935, "step": 2590 }, { "epoch": 0.94, "grad_norm": 0.2965840995311737, "learning_rate": 9.786092923323203e-05, "loss": 1.0002, "step": 2600 }, { "epoch": 0.94, "eval_loss": 1.005922555923462, "eval_runtime": 125.0587, "eval_samples_per_second": 62.379, "eval_steps_per_second": 3.902, "step": 2600 }, { "epoch": 0.94, "grad_norm": 0.23760788142681122, "learning_rate": 9.784443556759766e-05, "loss": 1.0305, "step": 2610 }, { "epoch": 0.95, "grad_norm": 0.22895409166812897, "learning_rate": 9.78278799580953e-05, "loss": 1.0427, "step": 2620 }, { "epoch": 0.95, "grad_norm": 0.36007368564605713, "learning_rate": 9.781126242615939e-05, "loss": 1.0059, "step": 2630 }, { "epoch": 0.96, "grad_norm": 0.2813151776790619, "learning_rate": 9.779458299330452e-05, "loss": 1.0418, "step": 2640 }, { "epoch": 0.96, "grad_norm": 0.27038782835006714, "learning_rate": 9.777784168112545e-05, "loss": 1.0092, "step": 2650 }, { "epoch": 0.96, "grad_norm": 0.22898097336292267, "learning_rate": 9.776103851129706e-05, "loss": 0.9883, "step": 2660 }, { "epoch": 0.97, "grad_norm": 0.2213810682296753, "learning_rate": 9.774417350557428e-05, "loss": 1.0753, "step": 2670 }, { "epoch": 0.97, "grad_norm": 0.22410623729228973, "learning_rate": 9.772724668579212e-05, "loss": 1.0524, "step": 2680 }, { "epoch": 0.97, "grad_norm": 0.3005650043487549, "learning_rate": 9.771025807386562e-05, "loss": 1.0562, "step": 2690 }, { "epoch": 0.98, "grad_norm": 0.3941683769226074, "learning_rate": 9.769320769178983e-05, "loss": 0.9925, "step": 2700 }, { "epoch": 0.98, "grad_norm": 0.2829142212867737, "learning_rate": 9.767609556163977e-05, "loss": 1.014, "step": 2710 }, { "epoch": 0.98, "grad_norm": 0.29680418968200684, "learning_rate": 9.765892170557038e-05, "loss": 0.9677, "step": 2720 }, { "epoch": 0.99, "grad_norm": 0.22002767026424408, "learning_rate": 9.764168614581655e-05, "loss": 0.9954, "step": 2730 }, { "epoch": 0.99, "grad_norm": 0.2758820354938507, "learning_rate": 9.762438890469304e-05, "loss": 1.0029, "step": 2740 }, { "epoch": 1.0, "grad_norm": 0.2981850802898407, "learning_rate": 9.760703000459446e-05, "loss": 1.0555, "step": 2750 }, { "epoch": 1.0, "grad_norm": 0.22340857982635498, "learning_rate": 9.758960946799528e-05, "loss": 1.0394, "step": 2760 }, { "epoch": 1.0, "grad_norm": 0.19991633296012878, "learning_rate": 9.757212731744974e-05, "loss": 0.9325, "step": 2770 }, { "epoch": 1.01, "grad_norm": 0.30030888319015503, "learning_rate": 9.755458357559186e-05, "loss": 0.9711, "step": 2780 }, { "epoch": 1.01, "grad_norm": 0.3804832696914673, "learning_rate": 9.753697826513541e-05, "loss": 0.9651, "step": 2790 }, { "epoch": 1.01, "grad_norm": 0.46047547459602356, "learning_rate": 9.751931140887387e-05, "loss": 0.9686, "step": 2800 }, { "epoch": 1.01, "eval_loss": 1.0086077451705933, "eval_runtime": 124.6354, "eval_samples_per_second": 62.591, "eval_steps_per_second": 3.915, "step": 2800 }, { "epoch": 1.02, "grad_norm": 0.30646952986717224, "learning_rate": 9.750158302968039e-05, "loss": 0.9267, "step": 2810 }, { "epoch": 1.02, "grad_norm": 0.3007545471191406, "learning_rate": 9.748379315050778e-05, "loss": 1.0193, "step": 2820 }, { "epoch": 1.02, "grad_norm": 0.2814784049987793, "learning_rate": 9.74659417943885e-05, "loss": 0.8893, "step": 2830 }, { "epoch": 1.03, "grad_norm": 0.2728348970413208, "learning_rate": 9.744802898443456e-05, "loss": 0.937, "step": 2840 }, { "epoch": 1.03, "grad_norm": 0.2994844913482666, "learning_rate": 9.743005474383755e-05, "loss": 0.949, "step": 2850 }, { "epoch": 1.04, "grad_norm": 0.43111738562583923, "learning_rate": 9.741201909586861e-05, "loss": 0.9897, "step": 2860 }, { "epoch": 1.04, "grad_norm": 0.29551658034324646, "learning_rate": 9.739392206387838e-05, "loss": 0.9393, "step": 2870 }, { "epoch": 1.04, "grad_norm": 0.40380623936653137, "learning_rate": 9.737576367129694e-05, "loss": 0.9365, "step": 2880 }, { "epoch": 1.05, "grad_norm": 0.2757427394390106, "learning_rate": 9.735754394163386e-05, "loss": 1.0074, "step": 2890 }, { "epoch": 1.05, "grad_norm": 0.35594430565834045, "learning_rate": 9.73392628984781e-05, "loss": 0.9682, "step": 2900 }, { "epoch": 1.05, "grad_norm": 0.32288888096809387, "learning_rate": 9.732092056549799e-05, "loss": 0.9753, "step": 2910 }, { "epoch": 1.06, "grad_norm": 0.3491690158843994, "learning_rate": 9.730251696644122e-05, "loss": 0.926, "step": 2920 }, { "epoch": 1.06, "grad_norm": 0.41806405782699585, "learning_rate": 9.728405212513483e-05, "loss": 0.9993, "step": 2930 }, { "epoch": 1.06, "grad_norm": 0.4885188043117523, "learning_rate": 9.726552606548512e-05, "loss": 0.9879, "step": 2940 }, { "epoch": 1.07, "grad_norm": 0.41796302795410156, "learning_rate": 9.724693881147761e-05, "loss": 0.9626, "step": 2950 }, { "epoch": 1.07, "grad_norm": 0.39677631855010986, "learning_rate": 9.722829038717717e-05, "loss": 0.9767, "step": 2960 }, { "epoch": 1.07, "grad_norm": 0.5329232215881348, "learning_rate": 9.720958081672773e-05, "loss": 0.9357, "step": 2970 }, { "epoch": 1.08, "grad_norm": 0.4468931257724762, "learning_rate": 9.719081012435247e-05, "loss": 0.9705, "step": 2980 }, { "epoch": 1.08, "grad_norm": 0.4029316306114197, "learning_rate": 9.717197833435367e-05, "loss": 0.9727, "step": 2990 }, { "epoch": 1.09, "grad_norm": 0.37598028779029846, "learning_rate": 9.715308547111273e-05, "loss": 0.9767, "step": 3000 }, { "epoch": 1.09, "eval_loss": 1.014098048210144, "eval_runtime": 125.4232, "eval_samples_per_second": 62.197, "eval_steps_per_second": 3.891, "step": 3000 }, { "epoch": 1.09, "grad_norm": 0.3833357095718384, "learning_rate": 9.713413155909009e-05, "loss": 0.9605, "step": 3010 }, { "epoch": 1.09, "grad_norm": 0.4391871988773346, "learning_rate": 9.711511662282527e-05, "loss": 0.9611, "step": 3020 }, { "epoch": 1.1, "grad_norm": 0.39860454201698303, "learning_rate": 9.709604068693679e-05, "loss": 0.9222, "step": 3030 }, { "epoch": 1.1, "grad_norm": 0.33882561326026917, "learning_rate": 9.707690377612211e-05, "loss": 0.9369, "step": 3040 }, { "epoch": 1.1, "grad_norm": 0.3763039708137512, "learning_rate": 9.705770591515768e-05, "loss": 0.8864, "step": 3050 }, { "epoch": 1.11, "grad_norm": 0.3221600353717804, "learning_rate": 9.703844712889884e-05, "loss": 0.9753, "step": 3060 }, { "epoch": 1.11, "grad_norm": 0.3342023491859436, "learning_rate": 9.701912744227979e-05, "loss": 0.9233, "step": 3070 }, { "epoch": 1.11, "grad_norm": 0.4082651734352112, "learning_rate": 9.699974688031363e-05, "loss": 0.987, "step": 3080 }, { "epoch": 1.12, "grad_norm": 0.4198564291000366, "learning_rate": 9.69803054680922e-05, "loss": 0.8833, "step": 3090 }, { "epoch": 1.12, "grad_norm": 0.3833492398262024, "learning_rate": 9.696080323078621e-05, "loss": 0.9894, "step": 3100 }, { "epoch": 1.13, "grad_norm": 0.35935208201408386, "learning_rate": 9.694124019364505e-05, "loss": 0.9417, "step": 3110 }, { "epoch": 1.13, "grad_norm": 0.3433043658733368, "learning_rate": 9.692161638199686e-05, "loss": 0.9251, "step": 3120 }, { "epoch": 1.13, "grad_norm": 0.30163127183914185, "learning_rate": 9.690193182124844e-05, "loss": 0.9447, "step": 3130 }, { "epoch": 1.14, "grad_norm": 0.4361821711063385, "learning_rate": 9.68821865368853e-05, "loss": 0.9984, "step": 3140 }, { "epoch": 1.14, "grad_norm": 0.4263075888156891, "learning_rate": 9.686238055447148e-05, "loss": 0.9422, "step": 3150 }, { "epoch": 1.14, "grad_norm": 0.33963072299957275, "learning_rate": 9.684251389964967e-05, "loss": 0.9199, "step": 3160 }, { "epoch": 1.15, "grad_norm": 0.41040754318237305, "learning_rate": 9.68225865981411e-05, "loss": 0.9249, "step": 3170 }, { "epoch": 1.15, "grad_norm": 0.3697950839996338, "learning_rate": 9.680259867574552e-05, "loss": 0.947, "step": 3180 }, { "epoch": 1.15, "grad_norm": 0.3211696743965149, "learning_rate": 9.678255015834112e-05, "loss": 0.9956, "step": 3190 }, { "epoch": 1.16, "grad_norm": 0.4463675022125244, "learning_rate": 9.676244107188463e-05, "loss": 0.9494, "step": 3200 }, { "epoch": 1.16, "eval_loss": 1.0160499811172485, "eval_runtime": 124.6588, "eval_samples_per_second": 62.579, "eval_steps_per_second": 3.915, "step": 3200 }, { "epoch": 1.16, "step": 3200, "total_flos": 8.146148608211681e+17, "train_loss": 1.015018144249916, "train_runtime": 4695.8401, "train_samples_per_second": 94.128, "train_steps_per_second": 5.884 } ], "logging_steps": 10, "max_steps": 27630, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "total_flos": 8.146148608211681e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }