{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9997795576456756, "eval_steps": 500, "global_step": 25515, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011756925563965023, "grad_norm": 4.4454169273376465, "learning_rate": 1.0000000000000002e-06, "loss": 3.2731, "step": 10 }, { "epoch": 0.0023513851127930046, "grad_norm": 2.7126429080963135, "learning_rate": 2.0000000000000003e-06, "loss": 3.1464, "step": 20 }, { "epoch": 0.0035270776691895067, "grad_norm": 3.6091959476470947, "learning_rate": 3e-06, "loss": 3.0459, "step": 30 }, { "epoch": 0.004702770225586009, "grad_norm": 3.0747861862182617, "learning_rate": 4.000000000000001e-06, "loss": 2.9821, "step": 40 }, { "epoch": 0.005878462781982512, "grad_norm": 2.1256041526794434, "learning_rate": 5e-06, "loss": 2.8511, "step": 50 }, { "epoch": 0.0070541553383790135, "grad_norm": 2.50280499458313, "learning_rate": 6e-06, "loss": 2.6829, "step": 60 }, { "epoch": 0.008229847894775516, "grad_norm": 1.748313307762146, "learning_rate": 7.000000000000001e-06, "loss": 2.7084, "step": 70 }, { "epoch": 0.009405540451172019, "grad_norm": 1.8512229919433594, "learning_rate": 8.000000000000001e-06, "loss": 2.7276, "step": 80 }, { "epoch": 0.010581233007568521, "grad_norm": 2.021779775619507, "learning_rate": 9e-06, "loss": 2.6647, "step": 90 }, { "epoch": 0.011756925563965024, "grad_norm": 1.8054291009902954, "learning_rate": 1e-05, "loss": 2.5862, "step": 100 }, { "epoch": 0.012932618120361526, "grad_norm": 1.9738070964813232, "learning_rate": 1.1000000000000001e-05, "loss": 2.6172, "step": 110 }, { "epoch": 0.014108310676758027, "grad_norm": 1.8077218532562256, "learning_rate": 1.2e-05, "loss": 2.6131, "step": 120 }, { "epoch": 0.01528400323315453, "grad_norm": 1.859117865562439, "learning_rate": 1.3000000000000001e-05, "loss": 2.6017, "step": 130 }, { "epoch": 0.016459695789551032, "grad_norm": 1.7283823490142822, "learning_rate": 1.4000000000000001e-05, "loss": 2.6334, "step": 140 }, { "epoch": 0.017635388345947536, "grad_norm": 1.682303547859192, "learning_rate": 1.5e-05, "loss": 2.4886, "step": 150 }, { "epoch": 0.018811080902344037, "grad_norm": 1.977339744567871, "learning_rate": 1.6000000000000003e-05, "loss": 2.452, "step": 160 }, { "epoch": 0.019986773458740538, "grad_norm": 1.6340947151184082, "learning_rate": 1.7000000000000003e-05, "loss": 2.4404, "step": 170 }, { "epoch": 0.021162466015137042, "grad_norm": 1.7341827154159546, "learning_rate": 1.8e-05, "loss": 2.4223, "step": 180 }, { "epoch": 0.022338158571533543, "grad_norm": 1.7967009544372559, "learning_rate": 1.9e-05, "loss": 2.4856, "step": 190 }, { "epoch": 0.023513851127930047, "grad_norm": 1.9840071201324463, "learning_rate": 2e-05, "loss": 2.422, "step": 200 }, { "epoch": 0.024689543684326548, "grad_norm": 1.973362684249878, "learning_rate": 2.1e-05, "loss": 2.4914, "step": 210 }, { "epoch": 0.025865236240723052, "grad_norm": 1.7343276739120483, "learning_rate": 2.2000000000000003e-05, "loss": 2.4045, "step": 220 }, { "epoch": 0.027040928797119553, "grad_norm": 2.343691110610962, "learning_rate": 2.3000000000000003e-05, "loss": 2.4852, "step": 230 }, { "epoch": 0.028216621353516054, "grad_norm": 1.6044285297393799, "learning_rate": 2.4e-05, "loss": 2.4881, "step": 240 }, { "epoch": 0.02939231390991256, "grad_norm": 1.7488620281219482, "learning_rate": 2.5e-05, "loss": 2.4813, "step": 250 }, { "epoch": 0.03056800646630906, "grad_norm": 2.3864474296569824, "learning_rate": 2.6000000000000002e-05, "loss": 2.4173, "step": 260 }, { "epoch": 0.03174369902270556, "grad_norm": 1.741749882698059, "learning_rate": 2.7000000000000002e-05, "loss": 2.4052, "step": 270 }, { "epoch": 0.032919391579102064, "grad_norm": 1.8738642930984497, "learning_rate": 2.8000000000000003e-05, "loss": 2.3897, "step": 280 }, { "epoch": 0.03409508413549857, "grad_norm": 1.5419807434082031, "learning_rate": 2.9e-05, "loss": 2.3752, "step": 290 }, { "epoch": 0.03527077669189507, "grad_norm": 1.693520188331604, "learning_rate": 3e-05, "loss": 2.35, "step": 300 }, { "epoch": 0.03644646924829157, "grad_norm": 2.145747661590576, "learning_rate": 3.1e-05, "loss": 2.3721, "step": 310 }, { "epoch": 0.037622161804688074, "grad_norm": 1.524639368057251, "learning_rate": 3.2000000000000005e-05, "loss": 2.3524, "step": 320 }, { "epoch": 0.03879785436108458, "grad_norm": 1.8486028909683228, "learning_rate": 3.3e-05, "loss": 2.3539, "step": 330 }, { "epoch": 0.039973546917481076, "grad_norm": 1.8892922401428223, "learning_rate": 3.4000000000000007e-05, "loss": 2.3279, "step": 340 }, { "epoch": 0.04114923947387758, "grad_norm": 1.534327507019043, "learning_rate": 3.5e-05, "loss": 2.2188, "step": 350 }, { "epoch": 0.042324932030274084, "grad_norm": 1.8371981382369995, "learning_rate": 3.6e-05, "loss": 2.3541, "step": 360 }, { "epoch": 0.04350062458667059, "grad_norm": 1.448330044746399, "learning_rate": 3.7e-05, "loss": 2.3253, "step": 370 }, { "epoch": 0.044676317143067086, "grad_norm": 2.0190629959106445, "learning_rate": 3.8e-05, "loss": 2.3429, "step": 380 }, { "epoch": 0.04585200969946359, "grad_norm": 1.4914367198944092, "learning_rate": 3.9000000000000006e-05, "loss": 2.3195, "step": 390 }, { "epoch": 0.047027702255860095, "grad_norm": 1.8572663068771362, "learning_rate": 4e-05, "loss": 2.2525, "step": 400 }, { "epoch": 0.04820339481225659, "grad_norm": 1.813152551651001, "learning_rate": 4.1e-05, "loss": 2.3449, "step": 410 }, { "epoch": 0.049379087368653096, "grad_norm": 1.827502965927124, "learning_rate": 4.2e-05, "loss": 2.2682, "step": 420 }, { "epoch": 0.0505547799250496, "grad_norm": 1.4535824060440063, "learning_rate": 4.3e-05, "loss": 2.2863, "step": 430 }, { "epoch": 0.051730472481446105, "grad_norm": 1.6425749063491821, "learning_rate": 4.4000000000000006e-05, "loss": 2.2749, "step": 440 }, { "epoch": 0.0529061650378426, "grad_norm": 2.2492101192474365, "learning_rate": 4.5e-05, "loss": 2.2876, "step": 450 }, { "epoch": 0.054081857594239106, "grad_norm": 1.864755392074585, "learning_rate": 4.600000000000001e-05, "loss": 2.2666, "step": 460 }, { "epoch": 0.05525755015063561, "grad_norm": 1.8749364614486694, "learning_rate": 4.7e-05, "loss": 2.2591, "step": 470 }, { "epoch": 0.05643324270703211, "grad_norm": 1.7484126091003418, "learning_rate": 4.8e-05, "loss": 2.3078, "step": 480 }, { "epoch": 0.05760893526342861, "grad_norm": 1.8384519815444946, "learning_rate": 4.9e-05, "loss": 2.2968, "step": 490 }, { "epoch": 0.05878462781982512, "grad_norm": 1.9133697748184204, "learning_rate": 5e-05, "loss": 2.285, "step": 500 }, { "epoch": 0.05878462781982512, "eval_loss": 2.241626501083374, "eval_runtime": 1917.5412, "eval_samples_per_second": 31.543, "eval_steps_per_second": 3.943, "step": 500 }, { "epoch": 0.05996032037622162, "grad_norm": 1.70958411693573, "learning_rate": 5.1000000000000006e-05, "loss": 2.2052, "step": 510 }, { "epoch": 0.06113601293261812, "grad_norm": 1.755081295967102, "learning_rate": 5.2000000000000004e-05, "loss": 2.2342, "step": 520 }, { "epoch": 0.06231170548901462, "grad_norm": 1.4573886394500732, "learning_rate": 5.300000000000001e-05, "loss": 2.154, "step": 530 }, { "epoch": 0.06348739804541112, "grad_norm": 1.4183944463729858, "learning_rate": 5.4000000000000005e-05, "loss": 2.2262, "step": 540 }, { "epoch": 0.06466309060180762, "grad_norm": 1.8631272315979004, "learning_rate": 5.500000000000001e-05, "loss": 2.2634, "step": 550 }, { "epoch": 0.06583878315820413, "grad_norm": 1.9564274549484253, "learning_rate": 5.6000000000000006e-05, "loss": 2.214, "step": 560 }, { "epoch": 0.06701447571460063, "grad_norm": 1.5721197128295898, "learning_rate": 5.6999999999999996e-05, "loss": 2.2146, "step": 570 }, { "epoch": 0.06819016827099714, "grad_norm": 1.9322727918624878, "learning_rate": 5.8e-05, "loss": 2.1535, "step": 580 }, { "epoch": 0.06936586082739364, "grad_norm": 1.416174292564392, "learning_rate": 5.9e-05, "loss": 2.1998, "step": 590 }, { "epoch": 0.07054155338379015, "grad_norm": 1.9789154529571533, "learning_rate": 6e-05, "loss": 2.2373, "step": 600 }, { "epoch": 0.07171724594018664, "grad_norm": 1.8227792978286743, "learning_rate": 6.1e-05, "loss": 2.1972, "step": 610 }, { "epoch": 0.07289293849658314, "grad_norm": 1.8196603059768677, "learning_rate": 6.2e-05, "loss": 2.1889, "step": 620 }, { "epoch": 0.07406863105297964, "grad_norm": 1.7306127548217773, "learning_rate": 6.3e-05, "loss": 2.2504, "step": 630 }, { "epoch": 0.07524432360937615, "grad_norm": 1.458371639251709, "learning_rate": 6.400000000000001e-05, "loss": 2.0934, "step": 640 }, { "epoch": 0.07642001616577265, "grad_norm": 1.4244716167449951, "learning_rate": 6.500000000000001e-05, "loss": 2.2458, "step": 650 }, { "epoch": 0.07759570872216916, "grad_norm": 1.5873785018920898, "learning_rate": 6.6e-05, "loss": 2.156, "step": 660 }, { "epoch": 0.07877140127856566, "grad_norm": 2.0175890922546387, "learning_rate": 6.7e-05, "loss": 2.1549, "step": 670 }, { "epoch": 0.07994709383496215, "grad_norm": 1.9688979387283325, "learning_rate": 6.800000000000001e-05, "loss": 2.2466, "step": 680 }, { "epoch": 0.08112278639135866, "grad_norm": 2.2819833755493164, "learning_rate": 6.9e-05, "loss": 2.246, "step": 690 }, { "epoch": 0.08229847894775516, "grad_norm": 1.7715764045715332, "learning_rate": 7e-05, "loss": 2.2416, "step": 700 }, { "epoch": 0.08347417150415166, "grad_norm": 1.6172609329223633, "learning_rate": 7.1e-05, "loss": 2.1812, "step": 710 }, { "epoch": 0.08464986406054817, "grad_norm": 1.6439348459243774, "learning_rate": 7.2e-05, "loss": 2.157, "step": 720 }, { "epoch": 0.08582555661694467, "grad_norm": 1.4415756464004517, "learning_rate": 7.3e-05, "loss": 2.1815, "step": 730 }, { "epoch": 0.08700124917334118, "grad_norm": 1.463126301765442, "learning_rate": 7.4e-05, "loss": 2.1148, "step": 740 }, { "epoch": 0.08817694172973767, "grad_norm": 1.4844375848770142, "learning_rate": 7.500000000000001e-05, "loss": 2.2032, "step": 750 }, { "epoch": 0.08935263428613417, "grad_norm": 1.4137177467346191, "learning_rate": 7.6e-05, "loss": 2.1237, "step": 760 }, { "epoch": 0.09052832684253068, "grad_norm": 1.7228816747665405, "learning_rate": 7.7e-05, "loss": 2.2461, "step": 770 }, { "epoch": 0.09170401939892718, "grad_norm": 1.723456621170044, "learning_rate": 7.800000000000001e-05, "loss": 2.1863, "step": 780 }, { "epoch": 0.09287971195532368, "grad_norm": 1.66473388671875, "learning_rate": 7.900000000000001e-05, "loss": 2.2265, "step": 790 }, { "epoch": 0.09405540451172019, "grad_norm": 1.3827208280563354, "learning_rate": 8e-05, "loss": 2.2013, "step": 800 }, { "epoch": 0.0952310970681167, "grad_norm": 1.6661202907562256, "learning_rate": 8.1e-05, "loss": 2.1643, "step": 810 }, { "epoch": 0.09640678962451318, "grad_norm": 1.9432927370071411, "learning_rate": 8.2e-05, "loss": 2.0811, "step": 820 }, { "epoch": 0.09758248218090969, "grad_norm": 1.7929891347885132, "learning_rate": 8.3e-05, "loss": 2.197, "step": 830 }, { "epoch": 0.09875817473730619, "grad_norm": 1.9254796504974365, "learning_rate": 8.4e-05, "loss": 2.171, "step": 840 }, { "epoch": 0.0999338672937027, "grad_norm": 1.529528260231018, "learning_rate": 8.5e-05, "loss": 2.1423, "step": 850 }, { "epoch": 0.1011095598500992, "grad_norm": 1.489100694656372, "learning_rate": 8.6e-05, "loss": 2.0931, "step": 860 }, { "epoch": 0.1022852524064957, "grad_norm": 1.574889898300171, "learning_rate": 8.7e-05, "loss": 2.1302, "step": 870 }, { "epoch": 0.10346094496289221, "grad_norm": 1.3673583269119263, "learning_rate": 8.800000000000001e-05, "loss": 2.0766, "step": 880 }, { "epoch": 0.1046366375192887, "grad_norm": 1.745687484741211, "learning_rate": 8.900000000000001e-05, "loss": 2.1073, "step": 890 }, { "epoch": 0.1058123300756852, "grad_norm": 1.2604305744171143, "learning_rate": 9e-05, "loss": 2.1772, "step": 900 }, { "epoch": 0.10698802263208171, "grad_norm": 1.7562814950942993, "learning_rate": 9.1e-05, "loss": 2.106, "step": 910 }, { "epoch": 0.10816371518847821, "grad_norm": 1.5899475812911987, "learning_rate": 9.200000000000001e-05, "loss": 2.1613, "step": 920 }, { "epoch": 0.10933940774487472, "grad_norm": 1.42015540599823, "learning_rate": 9.300000000000001e-05, "loss": 2.1603, "step": 930 }, { "epoch": 0.11051510030127122, "grad_norm": 1.6306182146072388, "learning_rate": 9.4e-05, "loss": 2.2011, "step": 940 }, { "epoch": 0.11169079285766773, "grad_norm": 1.6548503637313843, "learning_rate": 9.5e-05, "loss": 2.1827, "step": 950 }, { "epoch": 0.11286648541406422, "grad_norm": 1.549340844154358, "learning_rate": 9.6e-05, "loss": 2.1046, "step": 960 }, { "epoch": 0.11404217797046072, "grad_norm": 1.5246042013168335, "learning_rate": 9.7e-05, "loss": 2.1177, "step": 970 }, { "epoch": 0.11521787052685722, "grad_norm": 1.5823726654052734, "learning_rate": 9.8e-05, "loss": 2.1358, "step": 980 }, { "epoch": 0.11639356308325373, "grad_norm": 1.5865737199783325, "learning_rate": 9.900000000000001e-05, "loss": 2.1992, "step": 990 }, { "epoch": 0.11756925563965023, "grad_norm": 3.2883830070495605, "learning_rate": 0.0001, "loss": 2.0921, "step": 1000 }, { "epoch": 0.11756925563965023, "eval_loss": 2.127145767211914, "eval_runtime": 1913.8228, "eval_samples_per_second": 31.604, "eval_steps_per_second": 3.951, "step": 1000 }, { "epoch": 0.11874494819604674, "grad_norm": 1.7023414373397827, "learning_rate": 9.99999589440695e-05, "loss": 2.1346, "step": 1010 }, { "epoch": 0.11992064075244324, "grad_norm": 1.6721221208572388, "learning_rate": 9.999983577634545e-05, "loss": 2.0962, "step": 1020 }, { "epoch": 0.12109633330883973, "grad_norm": 1.2532472610473633, "learning_rate": 9.999963049703009e-05, "loss": 2.1405, "step": 1030 }, { "epoch": 0.12227202586523624, "grad_norm": 1.6252515316009521, "learning_rate": 9.999934310646055e-05, "loss": 2.029, "step": 1040 }, { "epoch": 0.12344771842163274, "grad_norm": 1.4914478063583374, "learning_rate": 9.999897360510882e-05, "loss": 2.1596, "step": 1050 }, { "epoch": 0.12462341097802924, "grad_norm": 1.5309652090072632, "learning_rate": 9.999852199358166e-05, "loss": 2.1189, "step": 1060 }, { "epoch": 0.12579910353442575, "grad_norm": 1.301114559173584, "learning_rate": 9.999798827262075e-05, "loss": 2.1064, "step": 1070 }, { "epoch": 0.12697479609082224, "grad_norm": 1.4409784078598022, "learning_rate": 9.999737244310259e-05, "loss": 2.0553, "step": 1080 }, { "epoch": 0.12815048864721876, "grad_norm": 1.3894290924072266, "learning_rate": 9.99966745060385e-05, "loss": 2.1225, "step": 1090 }, { "epoch": 0.12932618120361525, "grad_norm": 2.0358502864837646, "learning_rate": 9.99958944625747e-05, "loss": 2.1177, "step": 1100 }, { "epoch": 0.13050187376001177, "grad_norm": 1.7554755210876465, "learning_rate": 9.999503231399215e-05, "loss": 2.1162, "step": 1110 }, { "epoch": 0.13167756631640826, "grad_norm": 1.3448805809020996, "learning_rate": 9.999408806170672e-05, "loss": 2.1453, "step": 1120 }, { "epoch": 0.13285325887280477, "grad_norm": 1.1961629390716553, "learning_rate": 9.999306170726913e-05, "loss": 2.1375, "step": 1130 }, { "epoch": 0.13402895142920127, "grad_norm": 1.7377305030822754, "learning_rate": 9.999195325236486e-05, "loss": 2.0772, "step": 1140 }, { "epoch": 0.13520464398559776, "grad_norm": 1.4622732400894165, "learning_rate": 9.999076269881427e-05, "loss": 2.0511, "step": 1150 }, { "epoch": 0.13638033654199427, "grad_norm": 1.3627151250839233, "learning_rate": 9.998949004857253e-05, "loss": 2.0622, "step": 1160 }, { "epoch": 0.13755602909839076, "grad_norm": 1.3070886135101318, "learning_rate": 9.998813530372964e-05, "loss": 2.0528, "step": 1170 }, { "epoch": 0.13873172165478728, "grad_norm": 1.3218578100204468, "learning_rate": 9.99866984665104e-05, "loss": 2.1242, "step": 1180 }, { "epoch": 0.13990741421118377, "grad_norm": 1.610963225364685, "learning_rate": 9.998517953927444e-05, "loss": 2.0625, "step": 1190 }, { "epoch": 0.1410831067675803, "grad_norm": 1.1151573657989502, "learning_rate": 9.998357852451622e-05, "loss": 2.1307, "step": 1200 }, { "epoch": 0.14225879932397678, "grad_norm": 1.3126977682113647, "learning_rate": 9.998189542486496e-05, "loss": 2.1, "step": 1210 }, { "epoch": 0.14343449188037327, "grad_norm": 1.6003894805908203, "learning_rate": 9.998013024308471e-05, "loss": 2.1281, "step": 1220 }, { "epoch": 0.1446101844367698, "grad_norm": 1.4869587421417236, "learning_rate": 9.997828298207432e-05, "loss": 2.0865, "step": 1230 }, { "epoch": 0.14578587699316628, "grad_norm": 1.525930643081665, "learning_rate": 9.997635364486747e-05, "loss": 2.1076, "step": 1240 }, { "epoch": 0.1469615695495628, "grad_norm": 1.2709434032440186, "learning_rate": 9.997434223463251e-05, "loss": 2.0918, "step": 1250 }, { "epoch": 0.1481372621059593, "grad_norm": 1.5626436471939087, "learning_rate": 9.997224875467273e-05, "loss": 2.0987, "step": 1260 }, { "epoch": 0.1493129546623558, "grad_norm": 1.679423451423645, "learning_rate": 9.997007320842606e-05, "loss": 2.0608, "step": 1270 }, { "epoch": 0.1504886472187523, "grad_norm": 1.2951703071594238, "learning_rate": 9.996781559946532e-05, "loss": 2.0802, "step": 1280 }, { "epoch": 0.1516643397751488, "grad_norm": 1.1203179359436035, "learning_rate": 9.9965475931498e-05, "loss": 2.0514, "step": 1290 }, { "epoch": 0.1528400323315453, "grad_norm": 1.6683197021484375, "learning_rate": 9.99630542083664e-05, "loss": 2.0784, "step": 1300 }, { "epoch": 0.1540157248879418, "grad_norm": 1.4730511903762817, "learning_rate": 9.996055043404756e-05, "loss": 2.1081, "step": 1310 }, { "epoch": 0.15519141744433831, "grad_norm": 1.3895132541656494, "learning_rate": 9.995796461265328e-05, "loss": 2.1624, "step": 1320 }, { "epoch": 0.1563671100007348, "grad_norm": 1.4618767499923706, "learning_rate": 9.99552967484301e-05, "loss": 2.0822, "step": 1330 }, { "epoch": 0.15754280255713132, "grad_norm": 1.7244621515274048, "learning_rate": 9.995254684575925e-05, "loss": 2.1163, "step": 1340 }, { "epoch": 0.1587184951135278, "grad_norm": 1.547454595565796, "learning_rate": 9.994971490915675e-05, "loss": 2.0822, "step": 1350 }, { "epoch": 0.1598941876699243, "grad_norm": 1.6287070512771606, "learning_rate": 9.994680094327333e-05, "loss": 2.0437, "step": 1360 }, { "epoch": 0.16106988022632082, "grad_norm": 1.5066560506820679, "learning_rate": 9.994380495289437e-05, "loss": 2.126, "step": 1370 }, { "epoch": 0.1622455727827173, "grad_norm": 1.3296915292739868, "learning_rate": 9.994072694294003e-05, "loss": 2.1096, "step": 1380 }, { "epoch": 0.16342126533911383, "grad_norm": 1.9835737943649292, "learning_rate": 9.993756691846512e-05, "loss": 2.1479, "step": 1390 }, { "epoch": 0.16459695789551032, "grad_norm": 1.5376664400100708, "learning_rate": 9.993432488465914e-05, "loss": 2.0743, "step": 1400 }, { "epoch": 0.16577265045190684, "grad_norm": 1.3664755821228027, "learning_rate": 9.99310008468463e-05, "loss": 2.0771, "step": 1410 }, { "epoch": 0.16694834300830333, "grad_norm": 1.5814976692199707, "learning_rate": 9.992759481048543e-05, "loss": 2.0825, "step": 1420 }, { "epoch": 0.16812403556469982, "grad_norm": 1.7187494039535522, "learning_rate": 9.992410678117009e-05, "loss": 2.0161, "step": 1430 }, { "epoch": 0.16929972812109634, "grad_norm": 1.522679328918457, "learning_rate": 9.992053676462842e-05, "loss": 2.1195, "step": 1440 }, { "epoch": 0.17047542067749283, "grad_norm": 1.572156548500061, "learning_rate": 9.991688476672325e-05, "loss": 1.9897, "step": 1450 }, { "epoch": 0.17165111323388935, "grad_norm": 1.5181337594985962, "learning_rate": 9.9913150793452e-05, "loss": 1.9996, "step": 1460 }, { "epoch": 0.17282680579028584, "grad_norm": 1.214157223701477, "learning_rate": 9.990933485094678e-05, "loss": 2.034, "step": 1470 }, { "epoch": 0.17400249834668235, "grad_norm": 1.5123066902160645, "learning_rate": 9.990543694547425e-05, "loss": 1.9953, "step": 1480 }, { "epoch": 0.17517819090307885, "grad_norm": 1.3879735469818115, "learning_rate": 9.990145708343571e-05, "loss": 2.0473, "step": 1490 }, { "epoch": 0.17635388345947534, "grad_norm": 1.2244585752487183, "learning_rate": 9.989739527136698e-05, "loss": 2.1212, "step": 1500 }, { "epoch": 0.17635388345947534, "eval_loss": 2.045675754547119, "eval_runtime": 1913.0403, "eval_samples_per_second": 31.617, "eval_steps_per_second": 3.952, "step": 1500 }, { "epoch": 0.17752957601587185, "grad_norm": 1.6319860219955444, "learning_rate": 9.989325151593861e-05, "loss": 1.9897, "step": 1510 }, { "epoch": 0.17870526857226834, "grad_norm": 1.2844350337982178, "learning_rate": 9.988902582395557e-05, "loss": 2.0188, "step": 1520 }, { "epoch": 0.17988096112866486, "grad_norm": 1.5271180868148804, "learning_rate": 9.988471820235746e-05, "loss": 2.0701, "step": 1530 }, { "epoch": 0.18105665368506135, "grad_norm": 1.18831467628479, "learning_rate": 9.988032865821842e-05, "loss": 2.0054, "step": 1540 }, { "epoch": 0.18223234624145787, "grad_norm": 1.2955900430679321, "learning_rate": 9.987585719874713e-05, "loss": 2.0252, "step": 1550 }, { "epoch": 0.18340803879785436, "grad_norm": 1.3982799053192139, "learning_rate": 9.987130383128678e-05, "loss": 2.0084, "step": 1560 }, { "epoch": 0.18458373135425085, "grad_norm": 1.1469404697418213, "learning_rate": 9.986666856331506e-05, "loss": 2.0958, "step": 1570 }, { "epoch": 0.18575942391064737, "grad_norm": 1.4466460943222046, "learning_rate": 9.986195140244421e-05, "loss": 1.9897, "step": 1580 }, { "epoch": 0.18693511646704386, "grad_norm": 1.2830710411071777, "learning_rate": 9.985715235642091e-05, "loss": 2.0198, "step": 1590 }, { "epoch": 0.18811080902344038, "grad_norm": 1.6278154850006104, "learning_rate": 9.985227143312635e-05, "loss": 2.0932, "step": 1600 }, { "epoch": 0.18928650157983687, "grad_norm": 1.4252407550811768, "learning_rate": 9.984730864057614e-05, "loss": 1.9686, "step": 1610 }, { "epoch": 0.1904621941362334, "grad_norm": 1.3854957818984985, "learning_rate": 9.984226398692039e-05, "loss": 2.0832, "step": 1620 }, { "epoch": 0.19163788669262988, "grad_norm": 1.3185784816741943, "learning_rate": 9.983713748044357e-05, "loss": 1.9973, "step": 1630 }, { "epoch": 0.19281357924902637, "grad_norm": 1.6201872825622559, "learning_rate": 9.983192912956467e-05, "loss": 1.9967, "step": 1640 }, { "epoch": 0.19398927180542289, "grad_norm": 1.346369981765747, "learning_rate": 9.982663894283702e-05, "loss": 2.089, "step": 1650 }, { "epoch": 0.19516496436181938, "grad_norm": 1.5156384706497192, "learning_rate": 9.982126692894838e-05, "loss": 2.0348, "step": 1660 }, { "epoch": 0.1963406569182159, "grad_norm": 1.3613094091415405, "learning_rate": 9.981581309672082e-05, "loss": 2.0767, "step": 1670 }, { "epoch": 0.19751634947461238, "grad_norm": 1.787123203277588, "learning_rate": 9.981027745511087e-05, "loss": 2.063, "step": 1680 }, { "epoch": 0.1986920420310089, "grad_norm": 1.211165189743042, "learning_rate": 9.980466001320936e-05, "loss": 2.0268, "step": 1690 }, { "epoch": 0.1998677345874054, "grad_norm": 1.4914475679397583, "learning_rate": 9.979896078024145e-05, "loss": 2.0484, "step": 1700 }, { "epoch": 0.20104342714380188, "grad_norm": 1.5200754404067993, "learning_rate": 9.979317976556665e-05, "loss": 2.0298, "step": 1710 }, { "epoch": 0.2022191197001984, "grad_norm": 1.1227608919143677, "learning_rate": 9.978731697867874e-05, "loss": 2.054, "step": 1720 }, { "epoch": 0.2033948122565949, "grad_norm": 1.5919827222824097, "learning_rate": 9.978137242920583e-05, "loss": 2.0302, "step": 1730 }, { "epoch": 0.2045705048129914, "grad_norm": 1.4626191854476929, "learning_rate": 9.977534612691024e-05, "loss": 1.9836, "step": 1740 }, { "epoch": 0.2057461973693879, "grad_norm": 1.4425270557403564, "learning_rate": 9.976923808168861e-05, "loss": 2.052, "step": 1750 }, { "epoch": 0.20692188992578442, "grad_norm": 1.4434897899627686, "learning_rate": 9.976304830357181e-05, "loss": 2.0678, "step": 1760 }, { "epoch": 0.2080975824821809, "grad_norm": 1.638001561164856, "learning_rate": 9.975677680272493e-05, "loss": 1.9868, "step": 1770 }, { "epoch": 0.2092732750385774, "grad_norm": 1.3118942975997925, "learning_rate": 9.975042358944724e-05, "loss": 1.9802, "step": 1780 }, { "epoch": 0.21044896759497392, "grad_norm": 1.5371990203857422, "learning_rate": 9.974398867417223e-05, "loss": 2.0166, "step": 1790 }, { "epoch": 0.2116246601513704, "grad_norm": 1.5281438827514648, "learning_rate": 9.973747206746755e-05, "loss": 1.973, "step": 1800 }, { "epoch": 0.21280035270776693, "grad_norm": 1.569236159324646, "learning_rate": 9.973087378003503e-05, "loss": 2.0535, "step": 1810 }, { "epoch": 0.21397604526416342, "grad_norm": 1.4628781080245972, "learning_rate": 9.97241938227106e-05, "loss": 2.0128, "step": 1820 }, { "epoch": 0.21515173782055994, "grad_norm": 1.8319720029830933, "learning_rate": 9.971743220646436e-05, "loss": 2.0109, "step": 1830 }, { "epoch": 0.21632743037695643, "grad_norm": 1.3796851634979248, "learning_rate": 9.97105889424005e-05, "loss": 2.036, "step": 1840 }, { "epoch": 0.21750312293335292, "grad_norm": 1.1975823640823364, "learning_rate": 9.970366404175724e-05, "loss": 1.9788, "step": 1850 }, { "epoch": 0.21867881548974943, "grad_norm": 1.2848585844039917, "learning_rate": 9.969665751590693e-05, "loss": 2.0196, "step": 1860 }, { "epoch": 0.21985450804614592, "grad_norm": 1.2799772024154663, "learning_rate": 9.968956937635595e-05, "loss": 2.0079, "step": 1870 }, { "epoch": 0.22103020060254244, "grad_norm": 1.399195671081543, "learning_rate": 9.96823996347447e-05, "loss": 1.9905, "step": 1880 }, { "epoch": 0.22220589315893893, "grad_norm": 1.2057019472122192, "learning_rate": 9.96751483028476e-05, "loss": 1.939, "step": 1890 }, { "epoch": 0.22338158571533545, "grad_norm": 1.2673991918563843, "learning_rate": 9.966781539257309e-05, "loss": 2.0075, "step": 1900 }, { "epoch": 0.22455727827173194, "grad_norm": 1.2878953218460083, "learning_rate": 9.966040091596348e-05, "loss": 1.9819, "step": 1910 }, { "epoch": 0.22573297082812843, "grad_norm": 1.4482604265213013, "learning_rate": 9.965290488519515e-05, "loss": 2.1423, "step": 1920 }, { "epoch": 0.22690866338452495, "grad_norm": 1.439816951751709, "learning_rate": 9.964532731257834e-05, "loss": 2.0218, "step": 1930 }, { "epoch": 0.22808435594092144, "grad_norm": 1.3476266860961914, "learning_rate": 9.963766821055725e-05, "loss": 1.9961, "step": 1940 }, { "epoch": 0.22926004849731796, "grad_norm": 1.330733299255371, "learning_rate": 9.96299275917099e-05, "loss": 1.9915, "step": 1950 }, { "epoch": 0.23043574105371445, "grad_norm": 1.3161274194717407, "learning_rate": 9.962210546874824e-05, "loss": 2.0419, "step": 1960 }, { "epoch": 0.23161143361011097, "grad_norm": 1.5587944984436035, "learning_rate": 9.961420185451806e-05, "loss": 2.035, "step": 1970 }, { "epoch": 0.23278712616650746, "grad_norm": 1.1499826908111572, "learning_rate": 9.960621676199897e-05, "loss": 1.9598, "step": 1980 }, { "epoch": 0.23396281872290395, "grad_norm": 1.2795321941375732, "learning_rate": 9.959815020430439e-05, "loss": 2.0008, "step": 1990 }, { "epoch": 0.23513851127930047, "grad_norm": 1.2844736576080322, "learning_rate": 9.959000219468149e-05, "loss": 1.9794, "step": 2000 }, { "epoch": 0.23513851127930047, "eval_loss": 1.9954005479812622, "eval_runtime": 1913.5094, "eval_samples_per_second": 31.609, "eval_steps_per_second": 3.951, "step": 2000 }, { "epoch": 0.23631420383569696, "grad_norm": 1.1324113607406616, "learning_rate": 9.958177274651126e-05, "loss": 2.0213, "step": 2010 }, { "epoch": 0.23748989639209347, "grad_norm": 1.1880581378936768, "learning_rate": 9.95734618733084e-05, "loss": 1.8996, "step": 2020 }, { "epoch": 0.23866558894848997, "grad_norm": 1.4424279928207397, "learning_rate": 9.956506958872135e-05, "loss": 1.9545, "step": 2030 }, { "epoch": 0.23984128150488648, "grad_norm": 1.3166203498840332, "learning_rate": 9.955659590653222e-05, "loss": 1.993, "step": 2040 }, { "epoch": 0.24101697406128297, "grad_norm": 1.2510100603103638, "learning_rate": 9.954804084065681e-05, "loss": 1.9624, "step": 2050 }, { "epoch": 0.24219266661767946, "grad_norm": 1.2646335363388062, "learning_rate": 9.953940440514454e-05, "loss": 1.9974, "step": 2060 }, { "epoch": 0.24336835917407598, "grad_norm": 1.616574764251709, "learning_rate": 9.953068661417852e-05, "loss": 1.9615, "step": 2070 }, { "epoch": 0.24454405173047247, "grad_norm": 1.2705035209655762, "learning_rate": 9.952188748207543e-05, "loss": 2.0039, "step": 2080 }, { "epoch": 0.245719744286869, "grad_norm": 1.341627597808838, "learning_rate": 9.951300702328553e-05, "loss": 1.9448, "step": 2090 }, { "epoch": 0.24689543684326548, "grad_norm": 1.4436321258544922, "learning_rate": 9.950404525239261e-05, "loss": 1.9751, "step": 2100 }, { "epoch": 0.248071129399662, "grad_norm": 1.109281063079834, "learning_rate": 9.949500218411405e-05, "loss": 1.9293, "step": 2110 }, { "epoch": 0.2492468219560585, "grad_norm": 1.4411486387252808, "learning_rate": 9.948587783330072e-05, "loss": 2.0172, "step": 2120 }, { "epoch": 0.250422514512455, "grad_norm": 1.5718107223510742, "learning_rate": 9.947667221493695e-05, "loss": 1.9218, "step": 2130 }, { "epoch": 0.2515982070688515, "grad_norm": 1.4463196992874146, "learning_rate": 9.946738534414058e-05, "loss": 1.9956, "step": 2140 }, { "epoch": 0.252773899625248, "grad_norm": 1.167048692703247, "learning_rate": 9.94580172361628e-05, "loss": 2.0015, "step": 2150 }, { "epoch": 0.2539495921816445, "grad_norm": 1.0568406581878662, "learning_rate": 9.94485679063883e-05, "loss": 1.9322, "step": 2160 }, { "epoch": 0.255125284738041, "grad_norm": 1.3389767408370972, "learning_rate": 9.943903737033513e-05, "loss": 2.0442, "step": 2170 }, { "epoch": 0.2563009772944375, "grad_norm": 1.3226016759872437, "learning_rate": 9.94294256436547e-05, "loss": 1.9154, "step": 2180 }, { "epoch": 0.257476669850834, "grad_norm": 1.3603910207748413, "learning_rate": 9.941973274213169e-05, "loss": 1.9047, "step": 2190 }, { "epoch": 0.2586523624072305, "grad_norm": 1.1901447772979736, "learning_rate": 9.940995868168419e-05, "loss": 1.9544, "step": 2200 }, { "epoch": 0.259828054963627, "grad_norm": 1.247434139251709, "learning_rate": 9.940010347836352e-05, "loss": 1.896, "step": 2210 }, { "epoch": 0.26100374752002353, "grad_norm": 1.4494489431381226, "learning_rate": 9.939016714835425e-05, "loss": 1.9457, "step": 2220 }, { "epoch": 0.26217944007642, "grad_norm": 1.1708606481552124, "learning_rate": 9.938014970797421e-05, "loss": 1.9778, "step": 2230 }, { "epoch": 0.2633551326328165, "grad_norm": 1.4373843669891357, "learning_rate": 9.937005117367438e-05, "loss": 1.9618, "step": 2240 }, { "epoch": 0.264530825189213, "grad_norm": 1.5014970302581787, "learning_rate": 9.935987156203899e-05, "loss": 1.9321, "step": 2250 }, { "epoch": 0.26570651774560955, "grad_norm": 1.5885076522827148, "learning_rate": 9.934961088978533e-05, "loss": 1.9985, "step": 2260 }, { "epoch": 0.26688221030200604, "grad_norm": 1.512679100036621, "learning_rate": 9.933926917376392e-05, "loss": 2.0183, "step": 2270 }, { "epoch": 0.26805790285840253, "grad_norm": 1.301005244255066, "learning_rate": 9.932884643095825e-05, "loss": 1.9979, "step": 2280 }, { "epoch": 0.269233595414799, "grad_norm": 1.406798005104065, "learning_rate": 9.931834267848497e-05, "loss": 2.007, "step": 2290 }, { "epoch": 0.2704092879711955, "grad_norm": 1.1532948017120361, "learning_rate": 9.930775793359372e-05, "loss": 1.964, "step": 2300 }, { "epoch": 0.27158498052759206, "grad_norm": 1.6937193870544434, "learning_rate": 9.929709221366717e-05, "loss": 2.0062, "step": 2310 }, { "epoch": 0.27276067308398855, "grad_norm": 1.3567652702331543, "learning_rate": 9.928634553622096e-05, "loss": 1.9522, "step": 2320 }, { "epoch": 0.27393636564038504, "grad_norm": 1.2926207780838013, "learning_rate": 9.927551791890369e-05, "loss": 2.0055, "step": 2330 }, { "epoch": 0.27511205819678153, "grad_norm": 1.0983537435531616, "learning_rate": 9.926460937949686e-05, "loss": 1.9779, "step": 2340 }, { "epoch": 0.276287750753178, "grad_norm": 1.5475362539291382, "learning_rate": 9.925361993591489e-05, "loss": 1.9921, "step": 2350 }, { "epoch": 0.27746344330957456, "grad_norm": 1.3673510551452637, "learning_rate": 9.924254960620505e-05, "loss": 1.9652, "step": 2360 }, { "epoch": 0.27863913586597105, "grad_norm": 1.2896283864974976, "learning_rate": 9.923139840854744e-05, "loss": 1.9837, "step": 2370 }, { "epoch": 0.27981482842236755, "grad_norm": 1.3939844369888306, "learning_rate": 9.9220166361255e-05, "loss": 1.9384, "step": 2380 }, { "epoch": 0.28099052097876404, "grad_norm": 1.6982284784317017, "learning_rate": 9.92088534827734e-05, "loss": 1.9727, "step": 2390 }, { "epoch": 0.2821662135351606, "grad_norm": 1.436413049697876, "learning_rate": 9.919745979168105e-05, "loss": 2.0238, "step": 2400 }, { "epoch": 0.28334190609155707, "grad_norm": 1.3806825876235962, "learning_rate": 9.918598530668912e-05, "loss": 1.9488, "step": 2410 }, { "epoch": 0.28451759864795356, "grad_norm": 1.2545872926712036, "learning_rate": 9.917443004664141e-05, "loss": 1.9164, "step": 2420 }, { "epoch": 0.28569329120435005, "grad_norm": 1.3380101919174194, "learning_rate": 9.916279403051445e-05, "loss": 1.9597, "step": 2430 }, { "epoch": 0.28686898376074654, "grad_norm": 1.3045512437820435, "learning_rate": 9.915107727741728e-05, "loss": 1.9356, "step": 2440 }, { "epoch": 0.2880446763171431, "grad_norm": 1.4536439180374146, "learning_rate": 9.913927980659161e-05, "loss": 1.9327, "step": 2450 }, { "epoch": 0.2892203688735396, "grad_norm": 1.361449956893921, "learning_rate": 9.91274016374117e-05, "loss": 1.9357, "step": 2460 }, { "epoch": 0.29039606142993607, "grad_norm": 1.5559766292572021, "learning_rate": 9.911544278938429e-05, "loss": 1.9269, "step": 2470 }, { "epoch": 0.29157175398633256, "grad_norm": 1.0727676153182983, "learning_rate": 9.910340328214869e-05, "loss": 1.9611, "step": 2480 }, { "epoch": 0.29274744654272905, "grad_norm": 1.32298743724823, "learning_rate": 9.909128313547659e-05, "loss": 1.9938, "step": 2490 }, { "epoch": 0.2939231390991256, "grad_norm": 1.457543134689331, "learning_rate": 9.907908236927215e-05, "loss": 1.8983, "step": 2500 }, { "epoch": 0.2939231390991256, "eval_loss": 1.9546035528182983, "eval_runtime": 1913.4365, "eval_samples_per_second": 31.611, "eval_steps_per_second": 3.952, "step": 2500 }, { "epoch": 0.2950988316555221, "grad_norm": 1.5958725214004517, "learning_rate": 9.906680100357195e-05, "loss": 1.9234, "step": 2510 }, { "epoch": 0.2962745242119186, "grad_norm": 1.1926506757736206, "learning_rate": 9.905443905854487e-05, "loss": 1.9491, "step": 2520 }, { "epoch": 0.29745021676831507, "grad_norm": 1.2101242542266846, "learning_rate": 9.904199655449218e-05, "loss": 1.9603, "step": 2530 }, { "epoch": 0.2986259093247116, "grad_norm": 1.2021504640579224, "learning_rate": 9.90294735118474e-05, "loss": 1.9779, "step": 2540 }, { "epoch": 0.2998016018811081, "grad_norm": 1.2972944974899292, "learning_rate": 9.901686995117637e-05, "loss": 1.9794, "step": 2550 }, { "epoch": 0.3009772944375046, "grad_norm": 1.572240948677063, "learning_rate": 9.900418589317709e-05, "loss": 1.9146, "step": 2560 }, { "epoch": 0.3021529869939011, "grad_norm": 1.24315345287323, "learning_rate": 9.899142135867983e-05, "loss": 1.9624, "step": 2570 }, { "epoch": 0.3033286795502976, "grad_norm": 1.6560978889465332, "learning_rate": 9.897857636864696e-05, "loss": 1.9518, "step": 2580 }, { "epoch": 0.3045043721066941, "grad_norm": 1.5916625261306763, "learning_rate": 9.896565094417298e-05, "loss": 1.9169, "step": 2590 }, { "epoch": 0.3056800646630906, "grad_norm": 1.387374758720398, "learning_rate": 9.895264510648456e-05, "loss": 1.9652, "step": 2600 }, { "epoch": 0.3068557572194871, "grad_norm": 1.269538402557373, "learning_rate": 9.893955887694033e-05, "loss": 1.8832, "step": 2610 }, { "epoch": 0.3080314497758836, "grad_norm": 1.4394878149032593, "learning_rate": 9.892639227703099e-05, "loss": 1.914, "step": 2620 }, { "epoch": 0.3092071423322801, "grad_norm": 1.2762129306793213, "learning_rate": 9.891314532837922e-05, "loss": 1.9752, "step": 2630 }, { "epoch": 0.31038283488867663, "grad_norm": 1.3599573373794556, "learning_rate": 9.889981805273966e-05, "loss": 1.9066, "step": 2640 }, { "epoch": 0.3115585274450731, "grad_norm": 1.5158650875091553, "learning_rate": 9.888641047199885e-05, "loss": 1.8255, "step": 2650 }, { "epoch": 0.3127342200014696, "grad_norm": 1.1718757152557373, "learning_rate": 9.887292260817523e-05, "loss": 1.9115, "step": 2660 }, { "epoch": 0.3139099125578661, "grad_norm": 1.3039108514785767, "learning_rate": 9.885935448341903e-05, "loss": 2.0043, "step": 2670 }, { "epoch": 0.31508560511426265, "grad_norm": 1.6466155052185059, "learning_rate": 9.884570612001239e-05, "loss": 1.9005, "step": 2680 }, { "epoch": 0.31626129767065914, "grad_norm": 1.264749526977539, "learning_rate": 9.883197754036913e-05, "loss": 1.9459, "step": 2690 }, { "epoch": 0.3174369902270556, "grad_norm": 1.3210339546203613, "learning_rate": 9.881816876703484e-05, "loss": 1.9747, "step": 2700 }, { "epoch": 0.3186126827834521, "grad_norm": 1.2149466276168823, "learning_rate": 9.880427982268679e-05, "loss": 1.9035, "step": 2710 }, { "epoch": 0.3197883753398486, "grad_norm": 1.536145567893982, "learning_rate": 9.879031073013393e-05, "loss": 1.9502, "step": 2720 }, { "epoch": 0.32096406789624515, "grad_norm": 1.2581171989440918, "learning_rate": 9.877626151231682e-05, "loss": 1.9215, "step": 2730 }, { "epoch": 0.32213976045264164, "grad_norm": 1.0164381265640259, "learning_rate": 9.876213219230764e-05, "loss": 1.9243, "step": 2740 }, { "epoch": 0.32331545300903813, "grad_norm": 1.551082968711853, "learning_rate": 9.874792279331002e-05, "loss": 1.9173, "step": 2750 }, { "epoch": 0.3244911455654346, "grad_norm": 1.2365858554840088, "learning_rate": 9.873363333865923e-05, "loss": 1.9419, "step": 2760 }, { "epoch": 0.3256668381218311, "grad_norm": 1.0733628273010254, "learning_rate": 9.87192638518219e-05, "loss": 1.9388, "step": 2770 }, { "epoch": 0.32684253067822766, "grad_norm": 1.458593487739563, "learning_rate": 9.870481435639616e-05, "loss": 1.9337, "step": 2780 }, { "epoch": 0.32801822323462415, "grad_norm": 1.1137982606887817, "learning_rate": 9.869028487611149e-05, "loss": 1.9067, "step": 2790 }, { "epoch": 0.32919391579102064, "grad_norm": 1.406600832939148, "learning_rate": 9.867567543482877e-05, "loss": 1.9677, "step": 2800 }, { "epoch": 0.33036960834741713, "grad_norm": 1.3324565887451172, "learning_rate": 9.866098605654014e-05, "loss": 1.9731, "step": 2810 }, { "epoch": 0.3315453009038137, "grad_norm": 1.4308537244796753, "learning_rate": 9.864621676536905e-05, "loss": 1.9179, "step": 2820 }, { "epoch": 0.33272099346021017, "grad_norm": 1.1073017120361328, "learning_rate": 9.86313675855702e-05, "loss": 1.9146, "step": 2830 }, { "epoch": 0.33389668601660666, "grad_norm": 1.4958479404449463, "learning_rate": 9.861643854152944e-05, "loss": 1.9291, "step": 2840 }, { "epoch": 0.33507237857300315, "grad_norm": 1.1945546865463257, "learning_rate": 9.860142965776382e-05, "loss": 1.9191, "step": 2850 }, { "epoch": 0.33624807112939964, "grad_norm": 1.4630643129348755, "learning_rate": 9.858634095892149e-05, "loss": 1.8994, "step": 2860 }, { "epoch": 0.3374237636857962, "grad_norm": 1.296429991722107, "learning_rate": 9.857117246978165e-05, "loss": 1.9315, "step": 2870 }, { "epoch": 0.3385994562421927, "grad_norm": 1.7550448179244995, "learning_rate": 9.855592421525457e-05, "loss": 1.9238, "step": 2880 }, { "epoch": 0.33977514879858917, "grad_norm": 1.1610223054885864, "learning_rate": 9.854059622038153e-05, "loss": 1.9194, "step": 2890 }, { "epoch": 0.34095084135498566, "grad_norm": 1.2789753675460815, "learning_rate": 9.852518851033467e-05, "loss": 1.9272, "step": 2900 }, { "epoch": 0.34212653391138215, "grad_norm": 1.1512668132781982, "learning_rate": 9.850970111041715e-05, "loss": 1.9058, "step": 2910 }, { "epoch": 0.3433022264677787, "grad_norm": 1.4260399341583252, "learning_rate": 9.849413404606296e-05, "loss": 1.9399, "step": 2920 }, { "epoch": 0.3444779190241752, "grad_norm": 1.1138675212860107, "learning_rate": 9.847848734283689e-05, "loss": 1.9115, "step": 2930 }, { "epoch": 0.3456536115805717, "grad_norm": 1.5060944557189941, "learning_rate": 9.846276102643453e-05, "loss": 2.0083, "step": 2940 }, { "epoch": 0.34682930413696816, "grad_norm": 1.3354663848876953, "learning_rate": 9.844695512268226e-05, "loss": 1.9591, "step": 2950 }, { "epoch": 0.3480049966933647, "grad_norm": 1.4079502820968628, "learning_rate": 9.84310696575371e-05, "loss": 1.9259, "step": 2960 }, { "epoch": 0.3491806892497612, "grad_norm": 1.3686866760253906, "learning_rate": 9.841510465708675e-05, "loss": 1.9649, "step": 2970 }, { "epoch": 0.3503563818061577, "grad_norm": 1.2066538333892822, "learning_rate": 9.839906014754953e-05, "loss": 1.8669, "step": 2980 }, { "epoch": 0.3515320743625542, "grad_norm": 1.4340181350708008, "learning_rate": 9.838293615527433e-05, "loss": 1.9273, "step": 2990 }, { "epoch": 0.35270776691895067, "grad_norm": 1.6686608791351318, "learning_rate": 9.836673270674058e-05, "loss": 1.8976, "step": 3000 }, { "epoch": 0.35270776691895067, "eval_loss": 1.9214105606079102, "eval_runtime": 1912.5439, "eval_samples_per_second": 31.625, "eval_steps_per_second": 3.953, "step": 3000 }, { "epoch": 0.3538834594753472, "grad_norm": 1.2992982864379883, "learning_rate": 9.835044982855817e-05, "loss": 1.8891, "step": 3010 }, { "epoch": 0.3550591520317437, "grad_norm": 1.511050820350647, "learning_rate": 9.833408754746747e-05, "loss": 1.916, "step": 3020 }, { "epoch": 0.3562348445881402, "grad_norm": 1.4789292812347412, "learning_rate": 9.83176458903392e-05, "loss": 1.8215, "step": 3030 }, { "epoch": 0.3574105371445367, "grad_norm": 1.217375636100769, "learning_rate": 9.830112488417449e-05, "loss": 1.9693, "step": 3040 }, { "epoch": 0.3585862297009332, "grad_norm": 1.6017667055130005, "learning_rate": 9.828452455610473e-05, "loss": 1.8754, "step": 3050 }, { "epoch": 0.3597619222573297, "grad_norm": 1.3842079639434814, "learning_rate": 9.82678449333916e-05, "loss": 1.934, "step": 3060 }, { "epoch": 0.3609376148137262, "grad_norm": 1.51304292678833, "learning_rate": 9.825108604342701e-05, "loss": 1.9147, "step": 3070 }, { "epoch": 0.3621133073701227, "grad_norm": 1.3016083240509033, "learning_rate": 9.823424791373302e-05, "loss": 1.8561, "step": 3080 }, { "epoch": 0.3632889999265192, "grad_norm": 1.2409002780914307, "learning_rate": 9.821733057196184e-05, "loss": 1.8387, "step": 3090 }, { "epoch": 0.36446469248291574, "grad_norm": 1.4731556177139282, "learning_rate": 9.820033404589576e-05, "loss": 1.8985, "step": 3100 }, { "epoch": 0.36564038503931223, "grad_norm": 1.3212507963180542, "learning_rate": 9.81832583634471e-05, "loss": 1.9136, "step": 3110 }, { "epoch": 0.3668160775957087, "grad_norm": 1.7940925359725952, "learning_rate": 9.816610355265818e-05, "loss": 2.0206, "step": 3120 }, { "epoch": 0.3679917701521052, "grad_norm": 1.2001186609268188, "learning_rate": 9.814886964170127e-05, "loss": 1.9023, "step": 3130 }, { "epoch": 0.3691674627085017, "grad_norm": 1.2480732202529907, "learning_rate": 9.813155665887855e-05, "loss": 1.9034, "step": 3140 }, { "epoch": 0.37034315526489825, "grad_norm": 1.3460354804992676, "learning_rate": 9.811416463262205e-05, "loss": 1.8897, "step": 3150 }, { "epoch": 0.37151884782129474, "grad_norm": 1.2092111110687256, "learning_rate": 9.809669359149357e-05, "loss": 1.8878, "step": 3160 }, { "epoch": 0.37269454037769123, "grad_norm": 1.4529813528060913, "learning_rate": 9.807914356418473e-05, "loss": 1.9226, "step": 3170 }, { "epoch": 0.3738702329340877, "grad_norm": 1.2951313257217407, "learning_rate": 9.806151457951685e-05, "loss": 1.9212, "step": 3180 }, { "epoch": 0.3750459254904842, "grad_norm": 1.5956711769104004, "learning_rate": 9.804380666644086e-05, "loss": 1.894, "step": 3190 }, { "epoch": 0.37622161804688076, "grad_norm": 1.3913791179656982, "learning_rate": 9.802601985403741e-05, "loss": 1.8819, "step": 3200 }, { "epoch": 0.37739731060327725, "grad_norm": 1.24259352684021, "learning_rate": 9.80081541715166e-05, "loss": 1.9106, "step": 3210 }, { "epoch": 0.37857300315967374, "grad_norm": 1.4305282831192017, "learning_rate": 9.79902096482182e-05, "loss": 1.9139, "step": 3220 }, { "epoch": 0.37974869571607023, "grad_norm": 1.5311501026153564, "learning_rate": 9.79721863136113e-05, "loss": 1.8682, "step": 3230 }, { "epoch": 0.3809243882724668, "grad_norm": 1.359506607055664, "learning_rate": 9.795408419729454e-05, "loss": 1.9419, "step": 3240 }, { "epoch": 0.38210008082886326, "grad_norm": 1.3718581199645996, "learning_rate": 9.793590332899586e-05, "loss": 1.853, "step": 3250 }, { "epoch": 0.38327577338525975, "grad_norm": 1.0316749811172485, "learning_rate": 9.791764373857257e-05, "loss": 1.8678, "step": 3260 }, { "epoch": 0.38445146594165625, "grad_norm": 1.1559522151947021, "learning_rate": 9.789930545601125e-05, "loss": 1.9753, "step": 3270 }, { "epoch": 0.38562715849805274, "grad_norm": 1.4552001953125, "learning_rate": 9.78808885114277e-05, "loss": 1.9412, "step": 3280 }, { "epoch": 0.3868028510544493, "grad_norm": 1.3433363437652588, "learning_rate": 9.786239293506692e-05, "loss": 1.8856, "step": 3290 }, { "epoch": 0.38797854361084577, "grad_norm": 1.0146565437316895, "learning_rate": 9.784381875730304e-05, "loss": 1.9086, "step": 3300 }, { "epoch": 0.38915423616724226, "grad_norm": 1.436052680015564, "learning_rate": 9.782516600863927e-05, "loss": 1.893, "step": 3310 }, { "epoch": 0.39032992872363875, "grad_norm": 1.0915888547897339, "learning_rate": 9.780643471970781e-05, "loss": 1.8603, "step": 3320 }, { "epoch": 0.39150562128003524, "grad_norm": 1.0220108032226562, "learning_rate": 9.778762492126994e-05, "loss": 1.8912, "step": 3330 }, { "epoch": 0.3926813138364318, "grad_norm": 1.405739665031433, "learning_rate": 9.776873664421575e-05, "loss": 1.9341, "step": 3340 }, { "epoch": 0.3938570063928283, "grad_norm": 1.0864759683609009, "learning_rate": 9.774976991956433e-05, "loss": 1.9372, "step": 3350 }, { "epoch": 0.39503269894922477, "grad_norm": 1.400607705116272, "learning_rate": 9.773072477846348e-05, "loss": 1.9445, "step": 3360 }, { "epoch": 0.39620839150562126, "grad_norm": 1.0864202976226807, "learning_rate": 9.77116012521899e-05, "loss": 1.8841, "step": 3370 }, { "epoch": 0.3973840840620178, "grad_norm": 1.1078921556472778, "learning_rate": 9.769239937214892e-05, "loss": 1.908, "step": 3380 }, { "epoch": 0.3985597766184143, "grad_norm": 1.1477378606796265, "learning_rate": 9.767311916987457e-05, "loss": 1.8863, "step": 3390 }, { "epoch": 0.3997354691748108, "grad_norm": 1.3231672048568726, "learning_rate": 9.765376067702955e-05, "loss": 1.8957, "step": 3400 }, { "epoch": 0.4009111617312073, "grad_norm": 1.2159173488616943, "learning_rate": 9.763432392540507e-05, "loss": 1.8669, "step": 3410 }, { "epoch": 0.40208685428760377, "grad_norm": 1.1029443740844727, "learning_rate": 9.761480894692093e-05, "loss": 1.8178, "step": 3420 }, { "epoch": 0.4032625468440003, "grad_norm": 1.1831114292144775, "learning_rate": 9.75952157736253e-05, "loss": 1.9641, "step": 3430 }, { "epoch": 0.4044382394003968, "grad_norm": 1.1129118204116821, "learning_rate": 9.757554443769485e-05, "loss": 1.8513, "step": 3440 }, { "epoch": 0.4056139319567933, "grad_norm": 1.3526332378387451, "learning_rate": 9.755579497143457e-05, "loss": 1.868, "step": 3450 }, { "epoch": 0.4067896245131898, "grad_norm": 1.3435856103897095, "learning_rate": 9.753596740727777e-05, "loss": 1.8749, "step": 3460 }, { "epoch": 0.40796531706958633, "grad_norm": 1.4072470664978027, "learning_rate": 9.751606177778603e-05, "loss": 1.9006, "step": 3470 }, { "epoch": 0.4091410096259828, "grad_norm": 1.088450312614441, "learning_rate": 9.749607811564908e-05, "loss": 1.8717, "step": 3480 }, { "epoch": 0.4103167021823793, "grad_norm": 1.1382876634597778, "learning_rate": 9.747601645368488e-05, "loss": 1.9136, "step": 3490 }, { "epoch": 0.4114923947387758, "grad_norm": 1.0758370161056519, "learning_rate": 9.745587682483939e-05, "loss": 1.9345, "step": 3500 }, { "epoch": 0.4114923947387758, "eval_loss": 1.8950259685516357, "eval_runtime": 1912.9997, "eval_samples_per_second": 31.618, "eval_steps_per_second": 3.952, "step": 3500 }, { "epoch": 0.4126680872951723, "grad_norm": 1.157543659210205, "learning_rate": 9.743565926218668e-05, "loss": 1.8515, "step": 3510 }, { "epoch": 0.41384377985156884, "grad_norm": 1.534197449684143, "learning_rate": 9.74153637989288e-05, "loss": 1.9282, "step": 3520 }, { "epoch": 0.41501947240796533, "grad_norm": 1.2924381494522095, "learning_rate": 9.739499046839568e-05, "loss": 1.8344, "step": 3530 }, { "epoch": 0.4161951649643618, "grad_norm": 1.2050232887268066, "learning_rate": 9.737453930404518e-05, "loss": 1.8694, "step": 3540 }, { "epoch": 0.4173708575207583, "grad_norm": 1.5699467658996582, "learning_rate": 9.735401033946299e-05, "loss": 1.8378, "step": 3550 }, { "epoch": 0.4185465500771548, "grad_norm": 1.331260085105896, "learning_rate": 9.73334036083625e-05, "loss": 1.833, "step": 3560 }, { "epoch": 0.41972224263355135, "grad_norm": 1.3369814157485962, "learning_rate": 9.731271914458486e-05, "loss": 1.8554, "step": 3570 }, { "epoch": 0.42089793518994784, "grad_norm": 1.1554583311080933, "learning_rate": 9.729195698209886e-05, "loss": 1.9161, "step": 3580 }, { "epoch": 0.4220736277463443, "grad_norm": 1.0841094255447388, "learning_rate": 9.727111715500092e-05, "loss": 1.9166, "step": 3590 }, { "epoch": 0.4232493203027408, "grad_norm": 1.1740009784698486, "learning_rate": 9.725019969751497e-05, "loss": 1.8034, "step": 3600 }, { "epoch": 0.42442501285913736, "grad_norm": 1.0715843439102173, "learning_rate": 9.722920464399244e-05, "loss": 1.8694, "step": 3610 }, { "epoch": 0.42560070541553385, "grad_norm": 1.238390564918518, "learning_rate": 9.720813202891217e-05, "loss": 1.8835, "step": 3620 }, { "epoch": 0.42677639797193034, "grad_norm": 1.139062523841858, "learning_rate": 9.718698188688041e-05, "loss": 1.9161, "step": 3630 }, { "epoch": 0.42795209052832683, "grad_norm": 1.2943507432937622, "learning_rate": 9.71657542526307e-05, "loss": 1.8165, "step": 3640 }, { "epoch": 0.4291277830847233, "grad_norm": 1.1939334869384766, "learning_rate": 9.714444916102388e-05, "loss": 1.9616, "step": 3650 }, { "epoch": 0.43030347564111987, "grad_norm": 1.1633938550949097, "learning_rate": 9.71230666470479e-05, "loss": 1.8982, "step": 3660 }, { "epoch": 0.43147916819751636, "grad_norm": 1.1644477844238281, "learning_rate": 9.710160674581801e-05, "loss": 1.803, "step": 3670 }, { "epoch": 0.43265486075391285, "grad_norm": 1.2325592041015625, "learning_rate": 9.708006949257638e-05, "loss": 1.9193, "step": 3680 }, { "epoch": 0.43383055331030934, "grad_norm": 1.1891539096832275, "learning_rate": 9.705845492269232e-05, "loss": 1.8212, "step": 3690 }, { "epoch": 0.43500624586670583, "grad_norm": 1.2826625108718872, "learning_rate": 9.70367630716621e-05, "loss": 1.8638, "step": 3700 }, { "epoch": 0.4361819384231024, "grad_norm": 1.0185095071792603, "learning_rate": 9.701499397510883e-05, "loss": 1.861, "step": 3710 }, { "epoch": 0.43735763097949887, "grad_norm": 1.4706171751022339, "learning_rate": 9.69931476687826e-05, "loss": 1.8561, "step": 3720 }, { "epoch": 0.43853332353589536, "grad_norm": 1.029944658279419, "learning_rate": 9.697122418856018e-05, "loss": 1.855, "step": 3730 }, { "epoch": 0.43970901609229185, "grad_norm": 1.632054090499878, "learning_rate": 9.694922357044514e-05, "loss": 1.908, "step": 3740 }, { "epoch": 0.4408847086486884, "grad_norm": 1.2101844549179077, "learning_rate": 9.69271458505677e-05, "loss": 1.8298, "step": 3750 }, { "epoch": 0.4420604012050849, "grad_norm": 1.308363676071167, "learning_rate": 9.690499106518473e-05, "loss": 1.8748, "step": 3760 }, { "epoch": 0.4432360937614814, "grad_norm": 1.5883780717849731, "learning_rate": 9.688275925067965e-05, "loss": 1.8565, "step": 3770 }, { "epoch": 0.44441178631787787, "grad_norm": 1.617079734802246, "learning_rate": 9.686045044356235e-05, "loss": 1.8998, "step": 3780 }, { "epoch": 0.44558747887427436, "grad_norm": 1.5866354703903198, "learning_rate": 9.683806468046922e-05, "loss": 1.885, "step": 3790 }, { "epoch": 0.4467631714306709, "grad_norm": 1.2109373807907104, "learning_rate": 9.681560199816294e-05, "loss": 1.8667, "step": 3800 }, { "epoch": 0.4479388639870674, "grad_norm": 1.5942695140838623, "learning_rate": 9.67930624335326e-05, "loss": 1.8969, "step": 3810 }, { "epoch": 0.4491145565434639, "grad_norm": 1.2186177968978882, "learning_rate": 9.67704460235935e-05, "loss": 1.8807, "step": 3820 }, { "epoch": 0.4502902490998604, "grad_norm": 1.3832145929336548, "learning_rate": 9.674775280548715e-05, "loss": 1.9075, "step": 3830 }, { "epoch": 0.45146594165625686, "grad_norm": 1.3819901943206787, "learning_rate": 9.672498281648121e-05, "loss": 1.868, "step": 3840 }, { "epoch": 0.4526416342126534, "grad_norm": 1.7274998426437378, "learning_rate": 9.67021360939694e-05, "loss": 1.8947, "step": 3850 }, { "epoch": 0.4538173267690499, "grad_norm": 1.2887579202651978, "learning_rate": 9.667921267547145e-05, "loss": 1.8121, "step": 3860 }, { "epoch": 0.4549930193254464, "grad_norm": 1.195483922958374, "learning_rate": 9.665621259863304e-05, "loss": 1.8632, "step": 3870 }, { "epoch": 0.4561687118818429, "grad_norm": 1.545079231262207, "learning_rate": 9.663313590122577e-05, "loss": 1.8202, "step": 3880 }, { "epoch": 0.4573444044382394, "grad_norm": 1.2770529985427856, "learning_rate": 9.660998262114707e-05, "loss": 1.8784, "step": 3890 }, { "epoch": 0.4585200969946359, "grad_norm": 1.3703734874725342, "learning_rate": 9.658675279642008e-05, "loss": 1.9059, "step": 3900 }, { "epoch": 0.4596957895510324, "grad_norm": 1.367532730102539, "learning_rate": 9.656344646519369e-05, "loss": 1.8936, "step": 3910 }, { "epoch": 0.4608714821074289, "grad_norm": 1.1961573362350464, "learning_rate": 9.654006366574244e-05, "loss": 1.8996, "step": 3920 }, { "epoch": 0.4620471746638254, "grad_norm": 1.011743187904358, "learning_rate": 9.651660443646644e-05, "loss": 1.8779, "step": 3930 }, { "epoch": 0.46322286722022193, "grad_norm": 1.2063390016555786, "learning_rate": 9.649306881589127e-05, "loss": 1.8821, "step": 3940 }, { "epoch": 0.4643985597766184, "grad_norm": 1.147666573524475, "learning_rate": 9.646945684266805e-05, "loss": 1.9144, "step": 3950 }, { "epoch": 0.4655742523330149, "grad_norm": 1.0469412803649902, "learning_rate": 9.644576855557322e-05, "loss": 1.7677, "step": 3960 }, { "epoch": 0.4667499448894114, "grad_norm": 1.9739069938659668, "learning_rate": 9.642200399350855e-05, "loss": 1.8635, "step": 3970 }, { "epoch": 0.4679256374458079, "grad_norm": 1.1250691413879395, "learning_rate": 9.63981631955011e-05, "loss": 1.8496, "step": 3980 }, { "epoch": 0.46910133000220444, "grad_norm": 1.439513087272644, "learning_rate": 9.637424620070314e-05, "loss": 1.8252, "step": 3990 }, { "epoch": 0.47027702255860093, "grad_norm": 1.2039324045181274, "learning_rate": 9.635025304839203e-05, "loss": 1.8782, "step": 4000 }, { "epoch": 0.47027702255860093, "eval_loss": 1.8705339431762695, "eval_runtime": 1914.2298, "eval_samples_per_second": 31.598, "eval_steps_per_second": 3.95, "step": 4000 }, { "epoch": 0.4714527151149974, "grad_norm": 1.1046302318572998, "learning_rate": 9.63261837779702e-05, "loss": 1.8232, "step": 4010 }, { "epoch": 0.4726284076713939, "grad_norm": 1.3088709115982056, "learning_rate": 9.630203842896513e-05, "loss": 1.8608, "step": 4020 }, { "epoch": 0.47380410022779046, "grad_norm": 1.5824846029281616, "learning_rate": 9.62778170410292e-05, "loss": 1.8569, "step": 4030 }, { "epoch": 0.47497979278418695, "grad_norm": 1.153314471244812, "learning_rate": 9.625351965393967e-05, "loss": 1.8566, "step": 4040 }, { "epoch": 0.47615548534058344, "grad_norm": 1.1716610193252563, "learning_rate": 9.622914630759862e-05, "loss": 1.813, "step": 4050 }, { "epoch": 0.47733117789697993, "grad_norm": 1.1216520071029663, "learning_rate": 9.620469704203286e-05, "loss": 1.8189, "step": 4060 }, { "epoch": 0.4785068704533764, "grad_norm": 1.238127589225769, "learning_rate": 9.61801718973939e-05, "loss": 1.8968, "step": 4070 }, { "epoch": 0.47968256300977297, "grad_norm": 1.307554006576538, "learning_rate": 9.615557091395781e-05, "loss": 1.8584, "step": 4080 }, { "epoch": 0.48085825556616946, "grad_norm": 1.554139256477356, "learning_rate": 9.613089413212529e-05, "loss": 1.8449, "step": 4090 }, { "epoch": 0.48203394812256595, "grad_norm": 1.4059886932373047, "learning_rate": 9.610614159242144e-05, "loss": 1.8635, "step": 4100 }, { "epoch": 0.48320964067896244, "grad_norm": 1.8455514907836914, "learning_rate": 9.608131333549579e-05, "loss": 1.8354, "step": 4110 }, { "epoch": 0.48438533323535893, "grad_norm": 1.1919498443603516, "learning_rate": 9.605640940212226e-05, "loss": 1.8873, "step": 4120 }, { "epoch": 0.4855610257917555, "grad_norm": 1.211358666419983, "learning_rate": 9.6031429833199e-05, "loss": 1.891, "step": 4130 }, { "epoch": 0.48673671834815196, "grad_norm": 1.385921835899353, "learning_rate": 9.600637466974838e-05, "loss": 1.8565, "step": 4140 }, { "epoch": 0.48791241090454845, "grad_norm": 1.282532811164856, "learning_rate": 9.598124395291692e-05, "loss": 1.8405, "step": 4150 }, { "epoch": 0.48908810346094495, "grad_norm": 1.230760931968689, "learning_rate": 9.595603772397524e-05, "loss": 1.9059, "step": 4160 }, { "epoch": 0.4902637960173415, "grad_norm": 1.1014673709869385, "learning_rate": 9.593075602431794e-05, "loss": 1.8255, "step": 4170 }, { "epoch": 0.491439488573738, "grad_norm": 1.5924718379974365, "learning_rate": 9.590539889546356e-05, "loss": 1.8307, "step": 4180 }, { "epoch": 0.49261518113013447, "grad_norm": 1.190051555633545, "learning_rate": 9.587996637905452e-05, "loss": 1.7917, "step": 4190 }, { "epoch": 0.49379087368653096, "grad_norm": 1.5432175397872925, "learning_rate": 9.585445851685706e-05, "loss": 1.8767, "step": 4200 }, { "epoch": 0.49496656624292745, "grad_norm": 1.2771213054656982, "learning_rate": 9.582887535076112e-05, "loss": 1.8976, "step": 4210 }, { "epoch": 0.496142258799324, "grad_norm": 1.2366210222244263, "learning_rate": 9.580321692278033e-05, "loss": 1.8992, "step": 4220 }, { "epoch": 0.4973179513557205, "grad_norm": 1.502581000328064, "learning_rate": 9.577748327505194e-05, "loss": 1.7727, "step": 4230 }, { "epoch": 0.498493643912117, "grad_norm": 1.5243933200836182, "learning_rate": 9.575167444983668e-05, "loss": 1.9028, "step": 4240 }, { "epoch": 0.49966933646851347, "grad_norm": 1.1629050970077515, "learning_rate": 9.572579048951877e-05, "loss": 1.8671, "step": 4250 }, { "epoch": 0.50084502902491, "grad_norm": 1.374474287033081, "learning_rate": 9.569983143660581e-05, "loss": 1.8009, "step": 4260 }, { "epoch": 0.5020207215813065, "grad_norm": 1.4215635061264038, "learning_rate": 9.567379733372875e-05, "loss": 1.8686, "step": 4270 }, { "epoch": 0.503196414137703, "grad_norm": 1.4737588167190552, "learning_rate": 9.564768822364172e-05, "loss": 1.8958, "step": 4280 }, { "epoch": 0.5043721066940995, "grad_norm": 1.3877507448196411, "learning_rate": 9.562150414922208e-05, "loss": 1.8099, "step": 4290 }, { "epoch": 0.505547799250496, "grad_norm": 1.0765511989593506, "learning_rate": 9.559524515347031e-05, "loss": 1.8659, "step": 4300 }, { "epoch": 0.5067234918068925, "grad_norm": 1.1707931756973267, "learning_rate": 9.556891127950992e-05, "loss": 1.8015, "step": 4310 }, { "epoch": 0.507899184363289, "grad_norm": 1.5194308757781982, "learning_rate": 9.554250257058735e-05, "loss": 1.8737, "step": 4320 }, { "epoch": 0.5090748769196854, "grad_norm": 1.506947636604309, "learning_rate": 9.551601907007198e-05, "loss": 1.834, "step": 4330 }, { "epoch": 0.510250569476082, "grad_norm": 1.076819658279419, "learning_rate": 9.548946082145599e-05, "loss": 1.8134, "step": 4340 }, { "epoch": 0.5114262620324785, "grad_norm": 1.1456104516983032, "learning_rate": 9.546282786835433e-05, "loss": 1.8273, "step": 4350 }, { "epoch": 0.512601954588875, "grad_norm": 1.2105098962783813, "learning_rate": 9.543612025450464e-05, "loss": 1.8334, "step": 4360 }, { "epoch": 0.5137776471452715, "grad_norm": 1.2587809562683105, "learning_rate": 9.540933802376712e-05, "loss": 1.8447, "step": 4370 }, { "epoch": 0.514953339701668, "grad_norm": 1.6158201694488525, "learning_rate": 9.53824812201246e-05, "loss": 1.865, "step": 4380 }, { "epoch": 0.5161290322580645, "grad_norm": 1.2261019945144653, "learning_rate": 9.535554988768227e-05, "loss": 1.8406, "step": 4390 }, { "epoch": 0.517304724814461, "grad_norm": 1.1431225538253784, "learning_rate": 9.532854407066781e-05, "loss": 1.9169, "step": 4400 }, { "epoch": 0.5184804173708575, "grad_norm": 1.2393782138824463, "learning_rate": 9.530146381343114e-05, "loss": 1.7719, "step": 4410 }, { "epoch": 0.519656109927254, "grad_norm": 1.3479174375534058, "learning_rate": 9.527430916044451e-05, "loss": 1.8354, "step": 4420 }, { "epoch": 0.5208318024836506, "grad_norm": 1.3852227926254272, "learning_rate": 9.524708015630225e-05, "loss": 1.7088, "step": 4430 }, { "epoch": 0.5220074950400471, "grad_norm": 0.9925305247306824, "learning_rate": 9.521977684572089e-05, "loss": 1.8787, "step": 4440 }, { "epoch": 0.5231831875964436, "grad_norm": 1.5413284301757812, "learning_rate": 9.51923992735389e-05, "loss": 1.8305, "step": 4450 }, { "epoch": 0.52435888015284, "grad_norm": 1.3170039653778076, "learning_rate": 9.516494748471681e-05, "loss": 1.821, "step": 4460 }, { "epoch": 0.5255345727092365, "grad_norm": 1.3034051656723022, "learning_rate": 9.513742152433689e-05, "loss": 1.826, "step": 4470 }, { "epoch": 0.526710265265633, "grad_norm": 1.937544584274292, "learning_rate": 9.510982143760336e-05, "loss": 1.854, "step": 4480 }, { "epoch": 0.5278859578220295, "grad_norm": 1.1502320766448975, "learning_rate": 9.508214726984208e-05, "loss": 1.8843, "step": 4490 }, { "epoch": 0.529061650378426, "grad_norm": 1.1828551292419434, "learning_rate": 9.50543990665006e-05, "loss": 1.806, "step": 4500 }, { "epoch": 0.529061650378426, "eval_loss": 1.8493072986602783, "eval_runtime": 1915.6911, "eval_samples_per_second": 31.573, "eval_steps_per_second": 3.947, "step": 4500 }, { "epoch": 0.5302373429348225, "grad_norm": 1.1107194423675537, "learning_rate": 9.502657687314807e-05, "loss": 1.8472, "step": 4510 }, { "epoch": 0.5314130354912191, "grad_norm": 1.0959134101867676, "learning_rate": 9.499868073547511e-05, "loss": 1.836, "step": 4520 }, { "epoch": 0.5325887280476156, "grad_norm": 1.3925580978393555, "learning_rate": 9.49707106992938e-05, "loss": 1.8028, "step": 4530 }, { "epoch": 0.5337644206040121, "grad_norm": 1.2634612321853638, "learning_rate": 9.49426668105376e-05, "loss": 1.8373, "step": 4540 }, { "epoch": 0.5349401131604086, "grad_norm": 1.327492117881775, "learning_rate": 9.491454911526119e-05, "loss": 1.8354, "step": 4550 }, { "epoch": 0.5361158057168051, "grad_norm": 1.0847629308700562, "learning_rate": 9.488635765964051e-05, "loss": 1.8814, "step": 4560 }, { "epoch": 0.5372914982732016, "grad_norm": 1.3326648473739624, "learning_rate": 9.485809248997262e-05, "loss": 1.8219, "step": 4570 }, { "epoch": 0.538467190829598, "grad_norm": 1.4907851219177246, "learning_rate": 9.482975365267566e-05, "loss": 1.7934, "step": 4580 }, { "epoch": 0.5396428833859945, "grad_norm": 1.0509204864501953, "learning_rate": 9.480134119428867e-05, "loss": 1.8239, "step": 4590 }, { "epoch": 0.540818575942391, "grad_norm": 1.3526304960250854, "learning_rate": 9.477285516147167e-05, "loss": 1.9096, "step": 4600 }, { "epoch": 0.5419942684987875, "grad_norm": 1.253490924835205, "learning_rate": 9.47442956010055e-05, "loss": 1.8355, "step": 4610 }, { "epoch": 0.5431699610551841, "grad_norm": 1.321663737297058, "learning_rate": 9.471566255979173e-05, "loss": 1.8475, "step": 4620 }, { "epoch": 0.5443456536115806, "grad_norm": 1.1134142875671387, "learning_rate": 9.468695608485259e-05, "loss": 1.8459, "step": 4630 }, { "epoch": 0.5455213461679771, "grad_norm": 1.3343265056610107, "learning_rate": 9.465817622333092e-05, "loss": 1.8308, "step": 4640 }, { "epoch": 0.5466970387243736, "grad_norm": 1.2862119674682617, "learning_rate": 9.46293230224901e-05, "loss": 1.868, "step": 4650 }, { "epoch": 0.5478727312807701, "grad_norm": 1.215512752532959, "learning_rate": 9.460039652971391e-05, "loss": 1.7888, "step": 4660 }, { "epoch": 0.5490484238371666, "grad_norm": 1.0800265073776245, "learning_rate": 9.457139679250651e-05, "loss": 1.8408, "step": 4670 }, { "epoch": 0.5502241163935631, "grad_norm": 1.3795661926269531, "learning_rate": 9.454232385849238e-05, "loss": 1.8374, "step": 4680 }, { "epoch": 0.5513998089499595, "grad_norm": 1.3008549213409424, "learning_rate": 9.451317777541615e-05, "loss": 1.8642, "step": 4690 }, { "epoch": 0.552575501506356, "grad_norm": 1.241731882095337, "learning_rate": 9.448395859114259e-05, "loss": 1.8563, "step": 4700 }, { "epoch": 0.5537511940627526, "grad_norm": 1.2900316715240479, "learning_rate": 9.445466635365657e-05, "loss": 1.8151, "step": 4710 }, { "epoch": 0.5549268866191491, "grad_norm": 1.5909968614578247, "learning_rate": 9.442530111106286e-05, "loss": 1.8631, "step": 4720 }, { "epoch": 0.5561025791755456, "grad_norm": 1.1197429895401, "learning_rate": 9.439586291158616e-05, "loss": 1.7967, "step": 4730 }, { "epoch": 0.5572782717319421, "grad_norm": 1.1088523864746094, "learning_rate": 9.436635180357099e-05, "loss": 1.822, "step": 4740 }, { "epoch": 0.5584539642883386, "grad_norm": 1.569267988204956, "learning_rate": 9.433676783548157e-05, "loss": 1.8265, "step": 4750 }, { "epoch": 0.5596296568447351, "grad_norm": 1.1369186639785767, "learning_rate": 9.430711105590182e-05, "loss": 1.8911, "step": 4760 }, { "epoch": 0.5608053494011316, "grad_norm": 1.2979038953781128, "learning_rate": 9.427738151353518e-05, "loss": 1.8151, "step": 4770 }, { "epoch": 0.5619810419575281, "grad_norm": 1.3625456094741821, "learning_rate": 9.424757925720464e-05, "loss": 1.8106, "step": 4780 }, { "epoch": 0.5631567345139246, "grad_norm": 1.2157680988311768, "learning_rate": 9.421770433585255e-05, "loss": 1.7908, "step": 4790 }, { "epoch": 0.5643324270703212, "grad_norm": 1.4438010454177856, "learning_rate": 9.418775679854062e-05, "loss": 1.7806, "step": 4800 }, { "epoch": 0.5655081196267177, "grad_norm": 1.409374475479126, "learning_rate": 9.415773669444981e-05, "loss": 1.828, "step": 4810 }, { "epoch": 0.5666838121831141, "grad_norm": 1.3449857234954834, "learning_rate": 9.412764407288028e-05, "loss": 1.8167, "step": 4820 }, { "epoch": 0.5678595047395106, "grad_norm": 1.248065710067749, "learning_rate": 9.40974789832512e-05, "loss": 1.8099, "step": 4830 }, { "epoch": 0.5690351972959071, "grad_norm": 1.1458079814910889, "learning_rate": 9.406724147510087e-05, "loss": 1.8626, "step": 4840 }, { "epoch": 0.5702108898523036, "grad_norm": 1.1551117897033691, "learning_rate": 9.40369315980864e-05, "loss": 1.77, "step": 4850 }, { "epoch": 0.5713865824087001, "grad_norm": 1.2762501239776611, "learning_rate": 9.400654940198381e-05, "loss": 1.8856, "step": 4860 }, { "epoch": 0.5725622749650966, "grad_norm": 1.7265418767929077, "learning_rate": 9.397609493668789e-05, "loss": 1.803, "step": 4870 }, { "epoch": 0.5737379675214931, "grad_norm": 1.4114713668823242, "learning_rate": 9.394556825221209e-05, "loss": 1.8103, "step": 4880 }, { "epoch": 0.5749136600778896, "grad_norm": 1.0132843255996704, "learning_rate": 9.391496939868843e-05, "loss": 1.7907, "step": 4890 }, { "epoch": 0.5760893526342862, "grad_norm": 1.0062189102172852, "learning_rate": 9.388429842636755e-05, "loss": 1.8426, "step": 4900 }, { "epoch": 0.5772650451906827, "grad_norm": 1.3358486890792847, "learning_rate": 9.38535553856184e-05, "loss": 1.8569, "step": 4910 }, { "epoch": 0.5784407377470792, "grad_norm": 1.4988279342651367, "learning_rate": 9.38227403269284e-05, "loss": 1.7974, "step": 4920 }, { "epoch": 0.5796164303034756, "grad_norm": 1.1461288928985596, "learning_rate": 9.379185330090315e-05, "loss": 1.85, "step": 4930 }, { "epoch": 0.5807921228598721, "grad_norm": 1.2740187644958496, "learning_rate": 9.376089435826649e-05, "loss": 1.7794, "step": 4940 }, { "epoch": 0.5819678154162686, "grad_norm": 1.0652244091033936, "learning_rate": 9.372986354986033e-05, "loss": 1.8502, "step": 4950 }, { "epoch": 0.5831435079726651, "grad_norm": 1.5481996536254883, "learning_rate": 9.369876092664465e-05, "loss": 1.8535, "step": 4960 }, { "epoch": 0.5843192005290616, "grad_norm": 1.3265409469604492, "learning_rate": 9.366758653969731e-05, "loss": 1.7899, "step": 4970 }, { "epoch": 0.5854948930854581, "grad_norm": 1.3259570598602295, "learning_rate": 9.363634044021406e-05, "loss": 1.8289, "step": 4980 }, { "epoch": 0.5866705856418547, "grad_norm": 1.6625652313232422, "learning_rate": 9.360502267950839e-05, "loss": 1.81, "step": 4990 }, { "epoch": 0.5878462781982512, "grad_norm": 1.0666922330856323, "learning_rate": 9.357363330901152e-05, "loss": 1.8282, "step": 5000 }, { "epoch": 0.5878462781982512, "eval_loss": 1.8275103569030762, "eval_runtime": 1916.5581, "eval_samples_per_second": 31.559, "eval_steps_per_second": 3.945, "step": 5000 }, { "epoch": 0.5890219707546477, "grad_norm": 1.2334250211715698, "learning_rate": 9.354217238027223e-05, "loss": 1.7126, "step": 5010 }, { "epoch": 0.5901976633110442, "grad_norm": 1.1140841245651245, "learning_rate": 9.351063994495681e-05, "loss": 1.8397, "step": 5020 }, { "epoch": 0.5913733558674407, "grad_norm": 1.1550763845443726, "learning_rate": 9.347903605484904e-05, "loss": 1.8392, "step": 5030 }, { "epoch": 0.5925490484238372, "grad_norm": 1.2639172077178955, "learning_rate": 9.344736076184996e-05, "loss": 1.8739, "step": 5040 }, { "epoch": 0.5937247409802336, "grad_norm": 1.4475167989730835, "learning_rate": 9.341561411797795e-05, "loss": 1.7713, "step": 5050 }, { "epoch": 0.5949004335366301, "grad_norm": 1.2050482034683228, "learning_rate": 9.33837961753685e-05, "loss": 1.759, "step": 5060 }, { "epoch": 0.5960761260930266, "grad_norm": 1.17839777469635, "learning_rate": 9.335190698627425e-05, "loss": 1.7805, "step": 5070 }, { "epoch": 0.5972518186494232, "grad_norm": 1.3823720216751099, "learning_rate": 9.331994660306478e-05, "loss": 1.7355, "step": 5080 }, { "epoch": 0.5984275112058197, "grad_norm": 1.2561278343200684, "learning_rate": 9.328791507822666e-05, "loss": 1.7615, "step": 5090 }, { "epoch": 0.5996032037622162, "grad_norm": 1.5722140073776245, "learning_rate": 9.325581246436322e-05, "loss": 1.8085, "step": 5100 }, { "epoch": 0.6007788963186127, "grad_norm": 1.3253744840621948, "learning_rate": 9.322363881419457e-05, "loss": 1.8256, "step": 5110 }, { "epoch": 0.6019545888750092, "grad_norm": 1.1960384845733643, "learning_rate": 9.31913941805575e-05, "loss": 1.8716, "step": 5120 }, { "epoch": 0.6031302814314057, "grad_norm": 1.411815881729126, "learning_rate": 9.315907861640532e-05, "loss": 1.7204, "step": 5130 }, { "epoch": 0.6043059739878022, "grad_norm": 1.359009861946106, "learning_rate": 9.312669217480787e-05, "loss": 1.7989, "step": 5140 }, { "epoch": 0.6054816665441987, "grad_norm": 1.124862551689148, "learning_rate": 9.309423490895137e-05, "loss": 1.7603, "step": 5150 }, { "epoch": 0.6066573591005952, "grad_norm": 1.4581263065338135, "learning_rate": 9.306170687213833e-05, "loss": 1.811, "step": 5160 }, { "epoch": 0.6078330516569918, "grad_norm": 1.2764766216278076, "learning_rate": 9.302910811778752e-05, "loss": 1.7525, "step": 5170 }, { "epoch": 0.6090087442133882, "grad_norm": 1.0223078727722168, "learning_rate": 9.299643869943384e-05, "loss": 1.7784, "step": 5180 }, { "epoch": 0.6101844367697847, "grad_norm": 1.1139816045761108, "learning_rate": 9.296369867072819e-05, "loss": 1.8081, "step": 5190 }, { "epoch": 0.6113601293261812, "grad_norm": 1.2981748580932617, "learning_rate": 9.293088808543748e-05, "loss": 1.8077, "step": 5200 }, { "epoch": 0.6125358218825777, "grad_norm": 1.2992781400680542, "learning_rate": 9.289800699744451e-05, "loss": 1.8457, "step": 5210 }, { "epoch": 0.6137115144389742, "grad_norm": 1.4568043947219849, "learning_rate": 9.286505546074777e-05, "loss": 1.7507, "step": 5220 }, { "epoch": 0.6148872069953707, "grad_norm": 1.1658329963684082, "learning_rate": 9.283203352946152e-05, "loss": 1.8552, "step": 5230 }, { "epoch": 0.6160628995517672, "grad_norm": 1.2394460439682007, "learning_rate": 9.27989412578156e-05, "loss": 1.7927, "step": 5240 }, { "epoch": 0.6172385921081637, "grad_norm": 1.3162946701049805, "learning_rate": 9.276577870015542e-05, "loss": 1.8536, "step": 5250 }, { "epoch": 0.6184142846645602, "grad_norm": 1.2978202104568481, "learning_rate": 9.273254591094169e-05, "loss": 1.8043, "step": 5260 }, { "epoch": 0.6195899772209568, "grad_norm": 1.3194001913070679, "learning_rate": 9.269924294475058e-05, "loss": 1.7834, "step": 5270 }, { "epoch": 0.6207656697773533, "grad_norm": 1.1423587799072266, "learning_rate": 9.266586985627343e-05, "loss": 1.7899, "step": 5280 }, { "epoch": 0.6219413623337497, "grad_norm": 1.221191644668579, "learning_rate": 9.263242670031682e-05, "loss": 1.7621, "step": 5290 }, { "epoch": 0.6231170548901462, "grad_norm": 1.3119646310806274, "learning_rate": 9.259891353180228e-05, "loss": 1.8262, "step": 5300 }, { "epoch": 0.6242927474465427, "grad_norm": 1.4040793180465698, "learning_rate": 9.256533040576645e-05, "loss": 1.7059, "step": 5310 }, { "epoch": 0.6254684400029392, "grad_norm": 1.21645188331604, "learning_rate": 9.253167737736073e-05, "loss": 1.7717, "step": 5320 }, { "epoch": 0.6266441325593357, "grad_norm": 1.1716586351394653, "learning_rate": 9.24979545018514e-05, "loss": 1.8254, "step": 5330 }, { "epoch": 0.6278198251157322, "grad_norm": 1.3239357471466064, "learning_rate": 9.246416183461944e-05, "loss": 1.8314, "step": 5340 }, { "epoch": 0.6289955176721287, "grad_norm": 1.18232262134552, "learning_rate": 9.243029943116039e-05, "loss": 1.8568, "step": 5350 }, { "epoch": 0.6301712102285253, "grad_norm": 1.4461965560913086, "learning_rate": 9.239636734708438e-05, "loss": 1.8252, "step": 5360 }, { "epoch": 0.6313469027849218, "grad_norm": 1.3155122995376587, "learning_rate": 9.236236563811592e-05, "loss": 1.6954, "step": 5370 }, { "epoch": 0.6325225953413183, "grad_norm": 1.401957631111145, "learning_rate": 9.232829436009389e-05, "loss": 1.8277, "step": 5380 }, { "epoch": 0.6336982878977148, "grad_norm": 1.2141048908233643, "learning_rate": 9.229415356897142e-05, "loss": 1.8633, "step": 5390 }, { "epoch": 0.6348739804541113, "grad_norm": 1.0278942584991455, "learning_rate": 9.225994332081577e-05, "loss": 1.8119, "step": 5400 }, { "epoch": 0.6360496730105077, "grad_norm": 1.3392034769058228, "learning_rate": 9.22256636718083e-05, "loss": 1.7625, "step": 5410 }, { "epoch": 0.6372253655669042, "grad_norm": 1.3481065034866333, "learning_rate": 9.219131467824432e-05, "loss": 1.8549, "step": 5420 }, { "epoch": 0.6384010581233007, "grad_norm": 1.37552011013031, "learning_rate": 9.215689639653301e-05, "loss": 1.7749, "step": 5430 }, { "epoch": 0.6395767506796972, "grad_norm": 1.2834831476211548, "learning_rate": 9.212240888319737e-05, "loss": 1.8312, "step": 5440 }, { "epoch": 0.6407524432360938, "grad_norm": 1.3401875495910645, "learning_rate": 9.208785219487408e-05, "loss": 1.8159, "step": 5450 }, { "epoch": 0.6419281357924903, "grad_norm": 1.1253060102462769, "learning_rate": 9.205322638831342e-05, "loss": 1.8785, "step": 5460 }, { "epoch": 0.6431038283488868, "grad_norm": 1.1685224771499634, "learning_rate": 9.201853152037915e-05, "loss": 1.7725, "step": 5470 }, { "epoch": 0.6442795209052833, "grad_norm": 1.3135749101638794, "learning_rate": 9.198376764804852e-05, "loss": 1.7871, "step": 5480 }, { "epoch": 0.6454552134616798, "grad_norm": 1.2319329977035522, "learning_rate": 9.194893482841202e-05, "loss": 1.7512, "step": 5490 }, { "epoch": 0.6466309060180763, "grad_norm": 1.126560926437378, "learning_rate": 9.191403311867344e-05, "loss": 1.7949, "step": 5500 }, { "epoch": 0.6466309060180763, "eval_loss": 1.8114657402038574, "eval_runtime": 1917.9552, "eval_samples_per_second": 31.536, "eval_steps_per_second": 3.942, "step": 5500 }, { "epoch": 0.6478065985744728, "grad_norm": 1.4446791410446167, "learning_rate": 9.18790625761496e-05, "loss": 1.796, "step": 5510 }, { "epoch": 0.6489822911308692, "grad_norm": 1.10252046585083, "learning_rate": 9.18440232582705e-05, "loss": 1.8507, "step": 5520 }, { "epoch": 0.6501579836872657, "grad_norm": 1.2583657503128052, "learning_rate": 9.180891522257896e-05, "loss": 1.7451, "step": 5530 }, { "epoch": 0.6513336762436622, "grad_norm": 1.3297507762908936, "learning_rate": 9.177373852673073e-05, "loss": 1.7613, "step": 5540 }, { "epoch": 0.6525093688000588, "grad_norm": 1.5515707731246948, "learning_rate": 9.173849322849428e-05, "loss": 1.7783, "step": 5550 }, { "epoch": 0.6536850613564553, "grad_norm": 1.2178689241409302, "learning_rate": 9.170317938575075e-05, "loss": 1.8289, "step": 5560 }, { "epoch": 0.6548607539128518, "grad_norm": 1.072955846786499, "learning_rate": 9.166779705649386e-05, "loss": 1.786, "step": 5570 }, { "epoch": 0.6560364464692483, "grad_norm": 1.1388579607009888, "learning_rate": 9.163234629882976e-05, "loss": 1.8184, "step": 5580 }, { "epoch": 0.6572121390256448, "grad_norm": 1.3352762460708618, "learning_rate": 9.159682717097703e-05, "loss": 1.7569, "step": 5590 }, { "epoch": 0.6583878315820413, "grad_norm": 1.351258397102356, "learning_rate": 9.15612397312665e-05, "loss": 1.7732, "step": 5600 }, { "epoch": 0.6595635241384378, "grad_norm": 1.1882327795028687, "learning_rate": 9.152558403814117e-05, "loss": 1.8679, "step": 5610 }, { "epoch": 0.6607392166948343, "grad_norm": 1.3938547372817993, "learning_rate": 9.148986015015617e-05, "loss": 1.7673, "step": 5620 }, { "epoch": 0.6619149092512308, "grad_norm": 1.2497762441635132, "learning_rate": 9.145406812597858e-05, "loss": 1.857, "step": 5630 }, { "epoch": 0.6630906018076274, "grad_norm": 1.2405085563659668, "learning_rate": 9.14182080243874e-05, "loss": 1.7884, "step": 5640 }, { "epoch": 0.6642662943640238, "grad_norm": 1.138134479522705, "learning_rate": 9.138227990427342e-05, "loss": 1.7733, "step": 5650 }, { "epoch": 0.6654419869204203, "grad_norm": 1.152241587638855, "learning_rate": 9.134628382463917e-05, "loss": 1.6719, "step": 5660 }, { "epoch": 0.6666176794768168, "grad_norm": 1.364266037940979, "learning_rate": 9.13102198445987e-05, "loss": 1.8082, "step": 5670 }, { "epoch": 0.6677933720332133, "grad_norm": 1.2657530307769775, "learning_rate": 9.127408802337764e-05, "loss": 1.8546, "step": 5680 }, { "epoch": 0.6689690645896098, "grad_norm": 1.2973246574401855, "learning_rate": 9.123788842031302e-05, "loss": 1.76, "step": 5690 }, { "epoch": 0.6701447571460063, "grad_norm": 1.169185996055603, "learning_rate": 9.120162109485317e-05, "loss": 1.806, "step": 5700 }, { "epoch": 0.6713204497024028, "grad_norm": 1.450462818145752, "learning_rate": 9.116528610655765e-05, "loss": 1.7759, "step": 5710 }, { "epoch": 0.6724961422587993, "grad_norm": 1.6770098209381104, "learning_rate": 9.112888351509711e-05, "loss": 1.7806, "step": 5720 }, { "epoch": 0.6736718348151959, "grad_norm": 1.2061808109283447, "learning_rate": 9.109241338025327e-05, "loss": 1.7445, "step": 5730 }, { "epoch": 0.6748475273715924, "grad_norm": 1.156482458114624, "learning_rate": 9.105587576191871e-05, "loss": 1.7985, "step": 5740 }, { "epoch": 0.6760232199279889, "grad_norm": 1.1578704118728638, "learning_rate": 9.10192707200969e-05, "loss": 1.8118, "step": 5750 }, { "epoch": 0.6771989124843854, "grad_norm": 1.307572603225708, "learning_rate": 9.098259831490197e-05, "loss": 1.823, "step": 5760 }, { "epoch": 0.6783746050407818, "grad_norm": 1.2954407930374146, "learning_rate": 9.094585860655873e-05, "loss": 1.7519, "step": 5770 }, { "epoch": 0.6795502975971783, "grad_norm": 1.35243821144104, "learning_rate": 9.090905165540248e-05, "loss": 1.7599, "step": 5780 }, { "epoch": 0.6807259901535748, "grad_norm": 1.6381586790084839, "learning_rate": 9.087217752187897e-05, "loss": 1.855, "step": 5790 }, { "epoch": 0.6819016827099713, "grad_norm": 1.304528832435608, "learning_rate": 9.083523626654431e-05, "loss": 1.8288, "step": 5800 }, { "epoch": 0.6830773752663678, "grad_norm": 1.4514285326004028, "learning_rate": 9.079822795006474e-05, "loss": 1.7438, "step": 5810 }, { "epoch": 0.6842530678227643, "grad_norm": 1.050034999847412, "learning_rate": 9.076115263321676e-05, "loss": 1.7842, "step": 5820 }, { "epoch": 0.6854287603791609, "grad_norm": 1.1894317865371704, "learning_rate": 9.072401037688678e-05, "loss": 1.7779, "step": 5830 }, { "epoch": 0.6866044529355574, "grad_norm": 1.2237564325332642, "learning_rate": 9.068680124207123e-05, "loss": 1.8146, "step": 5840 }, { "epoch": 0.6877801454919539, "grad_norm": 1.3124136924743652, "learning_rate": 9.064952528987633e-05, "loss": 1.8126, "step": 5850 }, { "epoch": 0.6889558380483504, "grad_norm": 1.3544930219650269, "learning_rate": 9.061218258151803e-05, "loss": 1.7738, "step": 5860 }, { "epoch": 0.6901315306047469, "grad_norm": 1.1713837385177612, "learning_rate": 9.057477317832191e-05, "loss": 1.7682, "step": 5870 }, { "epoch": 0.6913072231611433, "grad_norm": 1.4300111532211304, "learning_rate": 9.05372971417231e-05, "loss": 1.7836, "step": 5880 }, { "epoch": 0.6924829157175398, "grad_norm": 1.292567491531372, "learning_rate": 9.049975453326611e-05, "loss": 1.7414, "step": 5890 }, { "epoch": 0.6936586082739363, "grad_norm": 1.1864491701126099, "learning_rate": 9.046214541460487e-05, "loss": 1.7327, "step": 5900 }, { "epoch": 0.6948343008303328, "grad_norm": 1.2983736991882324, "learning_rate": 9.042446984750242e-05, "loss": 1.805, "step": 5910 }, { "epoch": 0.6960099933867294, "grad_norm": 1.562773585319519, "learning_rate": 9.038672789383098e-05, "loss": 1.7647, "step": 5920 }, { "epoch": 0.6971856859431259, "grad_norm": 1.1496193408966064, "learning_rate": 9.034891961557184e-05, "loss": 1.8198, "step": 5930 }, { "epoch": 0.6983613784995224, "grad_norm": 1.4592268466949463, "learning_rate": 9.031104507481512e-05, "loss": 1.8303, "step": 5940 }, { "epoch": 0.6995370710559189, "grad_norm": 1.2525300979614258, "learning_rate": 9.027310433375979e-05, "loss": 1.7714, "step": 5950 }, { "epoch": 0.7007127636123154, "grad_norm": 1.189684271812439, "learning_rate": 9.023509745471356e-05, "loss": 1.883, "step": 5960 }, { "epoch": 0.7018884561687119, "grad_norm": 1.4038972854614258, "learning_rate": 9.019702450009278e-05, "loss": 1.7984, "step": 5970 }, { "epoch": 0.7030641487251084, "grad_norm": 1.3394795656204224, "learning_rate": 9.015888553242222e-05, "loss": 1.7692, "step": 5980 }, { "epoch": 0.7042398412815049, "grad_norm": 1.058933973312378, "learning_rate": 9.012068061433515e-05, "loss": 1.756, "step": 5990 }, { "epoch": 0.7054155338379013, "grad_norm": 1.3692445755004883, "learning_rate": 9.008240980857307e-05, "loss": 1.7408, "step": 6000 }, { "epoch": 0.7054155338379013, "eval_loss": 1.7942686080932617, "eval_runtime": 1919.2489, "eval_samples_per_second": 31.515, "eval_steps_per_second": 3.94, "step": 6000 }, { "epoch": 0.7065912263942979, "grad_norm": 1.4479010105133057, "learning_rate": 9.004407317798577e-05, "loss": 1.7262, "step": 6010 }, { "epoch": 0.7077669189506944, "grad_norm": 0.9974729418754578, "learning_rate": 9.000567078553105e-05, "loss": 1.7412, "step": 6020 }, { "epoch": 0.7089426115070909, "grad_norm": 1.5402100086212158, "learning_rate": 8.996720269427476e-05, "loss": 1.7808, "step": 6030 }, { "epoch": 0.7101183040634874, "grad_norm": 1.214643955230713, "learning_rate": 8.992866896739065e-05, "loss": 1.749, "step": 6040 }, { "epoch": 0.7112939966198839, "grad_norm": 1.5298250913619995, "learning_rate": 8.98900696681602e-05, "loss": 1.7728, "step": 6050 }, { "epoch": 0.7124696891762804, "grad_norm": 1.3863435983657837, "learning_rate": 8.985140485997266e-05, "loss": 1.7868, "step": 6060 }, { "epoch": 0.7136453817326769, "grad_norm": 1.4470679759979248, "learning_rate": 8.981267460632478e-05, "loss": 1.7795, "step": 6070 }, { "epoch": 0.7148210742890734, "grad_norm": 1.0822720527648926, "learning_rate": 8.977387897082087e-05, "loss": 1.7933, "step": 6080 }, { "epoch": 0.7159967668454699, "grad_norm": 1.2606309652328491, "learning_rate": 8.973501801717252e-05, "loss": 1.7083, "step": 6090 }, { "epoch": 0.7171724594018664, "grad_norm": 1.3774864673614502, "learning_rate": 8.969609180919866e-05, "loss": 1.7514, "step": 6100 }, { "epoch": 0.718348151958263, "grad_norm": 1.174729585647583, "learning_rate": 8.965710041082534e-05, "loss": 1.7465, "step": 6110 }, { "epoch": 0.7195238445146594, "grad_norm": 1.2690554857254028, "learning_rate": 8.96180438860857e-05, "loss": 1.8205, "step": 6120 }, { "epoch": 0.7206995370710559, "grad_norm": 1.3486568927764893, "learning_rate": 8.957892229911981e-05, "loss": 1.8657, "step": 6130 }, { "epoch": 0.7218752296274524, "grad_norm": 1.3336251974105835, "learning_rate": 8.953973571417463e-05, "loss": 1.7855, "step": 6140 }, { "epoch": 0.7230509221838489, "grad_norm": 1.2952755689620972, "learning_rate": 8.950048419560377e-05, "loss": 1.8401, "step": 6150 }, { "epoch": 0.7242266147402454, "grad_norm": 1.303263783454895, "learning_rate": 8.946116780786758e-05, "loss": 1.7403, "step": 6160 }, { "epoch": 0.7254023072966419, "grad_norm": 1.1422659158706665, "learning_rate": 8.942178661553287e-05, "loss": 1.7391, "step": 6170 }, { "epoch": 0.7265779998530384, "grad_norm": 1.2580105066299438, "learning_rate": 8.938234068327291e-05, "loss": 1.6976, "step": 6180 }, { "epoch": 0.7277536924094349, "grad_norm": 1.118611454963684, "learning_rate": 8.934283007586728e-05, "loss": 1.7309, "step": 6190 }, { "epoch": 0.7289293849658315, "grad_norm": 1.3629348278045654, "learning_rate": 8.930325485820177e-05, "loss": 1.7137, "step": 6200 }, { "epoch": 0.730105077522228, "grad_norm": 1.7530487775802612, "learning_rate": 8.926361509526826e-05, "loss": 1.7017, "step": 6210 }, { "epoch": 0.7312807700786245, "grad_norm": 1.5331019163131714, "learning_rate": 8.922391085216466e-05, "loss": 1.7582, "step": 6220 }, { "epoch": 0.732456462635021, "grad_norm": 1.5436840057373047, "learning_rate": 8.918414219409476e-05, "loss": 1.7712, "step": 6230 }, { "epoch": 0.7336321551914174, "grad_norm": 1.5363048315048218, "learning_rate": 8.914430918636813e-05, "loss": 1.7263, "step": 6240 }, { "epoch": 0.7348078477478139, "grad_norm": 1.1293503046035767, "learning_rate": 8.91044118944e-05, "loss": 1.7018, "step": 6250 }, { "epoch": 0.7359835403042104, "grad_norm": 1.4453774690628052, "learning_rate": 8.90644503837112e-05, "loss": 1.7722, "step": 6260 }, { "epoch": 0.7371592328606069, "grad_norm": 1.3364787101745605, "learning_rate": 8.902442471992802e-05, "loss": 1.7646, "step": 6270 }, { "epoch": 0.7383349254170034, "grad_norm": 1.163448452949524, "learning_rate": 8.898433496878207e-05, "loss": 1.7929, "step": 6280 }, { "epoch": 0.7395106179734, "grad_norm": 1.2181710004806519, "learning_rate": 8.894418119611025e-05, "loss": 1.7536, "step": 6290 }, { "epoch": 0.7406863105297965, "grad_norm": 1.225786805152893, "learning_rate": 8.890396346785457e-05, "loss": 1.7559, "step": 6300 }, { "epoch": 0.741862003086193, "grad_norm": 1.0332603454589844, "learning_rate": 8.886368185006208e-05, "loss": 1.7313, "step": 6310 }, { "epoch": 0.7430376956425895, "grad_norm": 1.510768175125122, "learning_rate": 8.882333640888478e-05, "loss": 1.7077, "step": 6320 }, { "epoch": 0.744213388198986, "grad_norm": 1.1689822673797607, "learning_rate": 8.878292721057942e-05, "loss": 1.736, "step": 6330 }, { "epoch": 0.7453890807553825, "grad_norm": 1.2468162775039673, "learning_rate": 8.874245432150751e-05, "loss": 1.6871, "step": 6340 }, { "epoch": 0.746564773311779, "grad_norm": 1.2123504877090454, "learning_rate": 8.870191780813513e-05, "loss": 1.7079, "step": 6350 }, { "epoch": 0.7477404658681754, "grad_norm": 1.4285656213760376, "learning_rate": 8.866131773703283e-05, "loss": 1.7667, "step": 6360 }, { "epoch": 0.7489161584245719, "grad_norm": 1.2181369066238403, "learning_rate": 8.862065417487558e-05, "loss": 1.7418, "step": 6370 }, { "epoch": 0.7500918509809684, "grad_norm": 1.2653862237930298, "learning_rate": 8.857992718844261e-05, "loss": 1.7303, "step": 6380 }, { "epoch": 0.751267543537365, "grad_norm": 1.0364800691604614, "learning_rate": 8.853913684461725e-05, "loss": 1.7252, "step": 6390 }, { "epoch": 0.7524432360937615, "grad_norm": 1.0992807149887085, "learning_rate": 8.849828321038695e-05, "loss": 1.7312, "step": 6400 }, { "epoch": 0.753618928650158, "grad_norm": 1.2547990083694458, "learning_rate": 8.845736635284308e-05, "loss": 1.7292, "step": 6410 }, { "epoch": 0.7547946212065545, "grad_norm": 1.6516164541244507, "learning_rate": 8.84163863391808e-05, "loss": 1.7613, "step": 6420 }, { "epoch": 0.755970313762951, "grad_norm": 1.5158637762069702, "learning_rate": 8.837534323669903e-05, "loss": 1.7663, "step": 6430 }, { "epoch": 0.7571460063193475, "grad_norm": 1.3658159971237183, "learning_rate": 8.833423711280027e-05, "loss": 1.8369, "step": 6440 }, { "epoch": 0.758321698875744, "grad_norm": 1.195811152458191, "learning_rate": 8.829306803499053e-05, "loss": 1.7288, "step": 6450 }, { "epoch": 0.7594973914321405, "grad_norm": 1.1292695999145508, "learning_rate": 8.825183607087919e-05, "loss": 1.7939, "step": 6460 }, { "epoch": 0.760673083988537, "grad_norm": 1.4026774168014526, "learning_rate": 8.821054128817894e-05, "loss": 1.6923, "step": 6470 }, { "epoch": 0.7618487765449335, "grad_norm": 1.1430468559265137, "learning_rate": 8.81691837547056e-05, "loss": 1.7366, "step": 6480 }, { "epoch": 0.76302446910133, "grad_norm": 1.2885632514953613, "learning_rate": 8.812776353837805e-05, "loss": 1.7338, "step": 6490 }, { "epoch": 0.7642001616577265, "grad_norm": 1.185829520225525, "learning_rate": 8.808628070721811e-05, "loss": 1.6978, "step": 6500 }, { "epoch": 0.7642001616577265, "eval_loss": 1.7781524658203125, "eval_runtime": 1920.8491, "eval_samples_per_second": 31.489, "eval_steps_per_second": 3.936, "step": 6500 }, { "epoch": 0.765375854214123, "grad_norm": 1.0641669034957886, "learning_rate": 8.804473532935043e-05, "loss": 1.6954, "step": 6510 }, { "epoch": 0.7665515467705195, "grad_norm": 1.3596010208129883, "learning_rate": 8.800312747300237e-05, "loss": 1.7458, "step": 6520 }, { "epoch": 0.767727239326916, "grad_norm": 1.3293750286102295, "learning_rate": 8.79614572065039e-05, "loss": 1.7893, "step": 6530 }, { "epoch": 0.7689029318833125, "grad_norm": 1.1252282857894897, "learning_rate": 8.79197245982875e-05, "loss": 1.6925, "step": 6540 }, { "epoch": 0.770078624439709, "grad_norm": 1.104527473449707, "learning_rate": 8.7877929716888e-05, "loss": 1.7371, "step": 6550 }, { "epoch": 0.7712543169961055, "grad_norm": 1.2074071168899536, "learning_rate": 8.783607263094251e-05, "loss": 1.8002, "step": 6560 }, { "epoch": 0.7724300095525021, "grad_norm": 0.9008570909500122, "learning_rate": 8.779415340919028e-05, "loss": 1.7733, "step": 6570 }, { "epoch": 0.7736057021088986, "grad_norm": 1.2692817449569702, "learning_rate": 8.775217212047264e-05, "loss": 1.7327, "step": 6580 }, { "epoch": 0.774781394665295, "grad_norm": 1.1430948972702026, "learning_rate": 8.771012883373282e-05, "loss": 1.7234, "step": 6590 }, { "epoch": 0.7759570872216915, "grad_norm": 1.104250431060791, "learning_rate": 8.766802361801587e-05, "loss": 1.7934, "step": 6600 }, { "epoch": 0.777132779778088, "grad_norm": 1.922797441482544, "learning_rate": 8.762585654246853e-05, "loss": 1.7619, "step": 6610 }, { "epoch": 0.7783084723344845, "grad_norm": 1.3816494941711426, "learning_rate": 8.758362767633915e-05, "loss": 1.7347, "step": 6620 }, { "epoch": 0.779484164890881, "grad_norm": 1.2552838325500488, "learning_rate": 8.754133708897755e-05, "loss": 1.7565, "step": 6630 }, { "epoch": 0.7806598574472775, "grad_norm": 1.3074336051940918, "learning_rate": 8.74989848498349e-05, "loss": 1.851, "step": 6640 }, { "epoch": 0.781835550003674, "grad_norm": 1.0835909843444824, "learning_rate": 8.745657102846362e-05, "loss": 1.7645, "step": 6650 }, { "epoch": 0.7830112425600705, "grad_norm": 1.254091501235962, "learning_rate": 8.741409569451728e-05, "loss": 1.7799, "step": 6660 }, { "epoch": 0.7841869351164671, "grad_norm": 1.1105377674102783, "learning_rate": 8.737155891775043e-05, "loss": 1.7457, "step": 6670 }, { "epoch": 0.7853626276728636, "grad_norm": 1.1483558416366577, "learning_rate": 8.732896076801857e-05, "loss": 1.8059, "step": 6680 }, { "epoch": 0.7865383202292601, "grad_norm": 1.194258451461792, "learning_rate": 8.728630131527798e-05, "loss": 1.7209, "step": 6690 }, { "epoch": 0.7877140127856566, "grad_norm": 1.3407483100891113, "learning_rate": 8.724358062958555e-05, "loss": 1.7398, "step": 6700 }, { "epoch": 0.788889705342053, "grad_norm": 1.4359384775161743, "learning_rate": 8.720079878109883e-05, "loss": 1.7258, "step": 6710 }, { "epoch": 0.7900653978984495, "grad_norm": 1.5075762271881104, "learning_rate": 8.715795584007574e-05, "loss": 1.7631, "step": 6720 }, { "epoch": 0.791241090454846, "grad_norm": 1.048915982246399, "learning_rate": 8.711505187687456e-05, "loss": 1.7121, "step": 6730 }, { "epoch": 0.7924167830112425, "grad_norm": 1.1689430475234985, "learning_rate": 8.707208696195377e-05, "loss": 1.7564, "step": 6740 }, { "epoch": 0.793592475567639, "grad_norm": 1.0688875913619995, "learning_rate": 8.702906116587198e-05, "loss": 1.6675, "step": 6750 }, { "epoch": 0.7947681681240356, "grad_norm": 1.1660627126693726, "learning_rate": 8.69859745592877e-05, "loss": 1.7847, "step": 6760 }, { "epoch": 0.7959438606804321, "grad_norm": 1.0722157955169678, "learning_rate": 8.694282721295941e-05, "loss": 1.7236, "step": 6770 }, { "epoch": 0.7971195532368286, "grad_norm": 1.3707668781280518, "learning_rate": 8.689961919774525e-05, "loss": 1.7614, "step": 6780 }, { "epoch": 0.7982952457932251, "grad_norm": 1.2857524156570435, "learning_rate": 8.685635058460304e-05, "loss": 1.8287, "step": 6790 }, { "epoch": 0.7994709383496216, "grad_norm": 1.4464191198349, "learning_rate": 8.681302144459013e-05, "loss": 1.7617, "step": 6800 }, { "epoch": 0.8006466309060181, "grad_norm": 1.3305095434188843, "learning_rate": 8.676963184886322e-05, "loss": 1.7636, "step": 6810 }, { "epoch": 0.8018223234624146, "grad_norm": 1.4113892316818237, "learning_rate": 8.672618186867836e-05, "loss": 1.7464, "step": 6820 }, { "epoch": 0.802998016018811, "grad_norm": 1.1204049587249756, "learning_rate": 8.668267157539066e-05, "loss": 1.7128, "step": 6830 }, { "epoch": 0.8041737085752075, "grad_norm": 1.1180636882781982, "learning_rate": 8.663910104045439e-05, "loss": 1.7446, "step": 6840 }, { "epoch": 0.8053494011316041, "grad_norm": 1.1383891105651855, "learning_rate": 8.659547033542269e-05, "loss": 1.7562, "step": 6850 }, { "epoch": 0.8065250936880006, "grad_norm": 1.393744945526123, "learning_rate": 8.655177953194753e-05, "loss": 1.804, "step": 6860 }, { "epoch": 0.8077007862443971, "grad_norm": 1.4647397994995117, "learning_rate": 8.650802870177957e-05, "loss": 1.8066, "step": 6870 }, { "epoch": 0.8088764788007936, "grad_norm": 1.3927286863327026, "learning_rate": 8.646421791676806e-05, "loss": 1.7149, "step": 6880 }, { "epoch": 0.8100521713571901, "grad_norm": 1.38978910446167, "learning_rate": 8.642034724886069e-05, "loss": 1.7086, "step": 6890 }, { "epoch": 0.8112278639135866, "grad_norm": 1.1612343788146973, "learning_rate": 8.637641677010352e-05, "loss": 1.7456, "step": 6900 }, { "epoch": 0.8124035564699831, "grad_norm": 1.0588033199310303, "learning_rate": 8.63324265526408e-05, "loss": 1.6882, "step": 6910 }, { "epoch": 0.8135792490263796, "grad_norm": 1.1298272609710693, "learning_rate": 8.62883766687149e-05, "loss": 1.6617, "step": 6920 }, { "epoch": 0.8147549415827761, "grad_norm": 1.242858648300171, "learning_rate": 8.624426719066621e-05, "loss": 1.6882, "step": 6930 }, { "epoch": 0.8159306341391727, "grad_norm": 1.2309958934783936, "learning_rate": 8.620009819093293e-05, "loss": 1.6996, "step": 6940 }, { "epoch": 0.8171063266955692, "grad_norm": 1.0722143650054932, "learning_rate": 8.615586974205103e-05, "loss": 1.6816, "step": 6950 }, { "epoch": 0.8182820192519656, "grad_norm": 1.272240161895752, "learning_rate": 8.611158191665413e-05, "loss": 1.7564, "step": 6960 }, { "epoch": 0.8194577118083621, "grad_norm": 1.3289477825164795, "learning_rate": 8.606723478747333e-05, "loss": 1.691, "step": 6970 }, { "epoch": 0.8206334043647586, "grad_norm": 1.3592630624771118, "learning_rate": 8.602282842733717e-05, "loss": 1.7236, "step": 6980 }, { "epoch": 0.8218090969211551, "grad_norm": 1.423283576965332, "learning_rate": 8.59783629091714e-05, "loss": 1.7012, "step": 6990 }, { "epoch": 0.8229847894775516, "grad_norm": 1.0251657962799072, "learning_rate": 8.593383830599895e-05, "loss": 1.7152, "step": 7000 }, { "epoch": 0.8229847894775516, "eval_loss": 1.764369010925293, "eval_runtime": 1920.8825, "eval_samples_per_second": 31.488, "eval_steps_per_second": 3.936, "step": 7000 }, { "epoch": 0.8241604820339481, "grad_norm": 1.302617073059082, "learning_rate": 8.588925469093978e-05, "loss": 1.738, "step": 7010 }, { "epoch": 0.8253361745903446, "grad_norm": 1.474129557609558, "learning_rate": 8.584461213721077e-05, "loss": 1.7405, "step": 7020 }, { "epoch": 0.8265118671467411, "grad_norm": 1.5024540424346924, "learning_rate": 8.579991071812557e-05, "loss": 1.712, "step": 7030 }, { "epoch": 0.8276875597031377, "grad_norm": 1.2512673139572144, "learning_rate": 8.575515050709453e-05, "loss": 1.6544, "step": 7040 }, { "epoch": 0.8288632522595342, "grad_norm": 1.3266698122024536, "learning_rate": 8.571033157762453e-05, "loss": 1.7435, "step": 7050 }, { "epoch": 0.8300389448159307, "grad_norm": 1.4083542823791504, "learning_rate": 8.566545400331888e-05, "loss": 1.6554, "step": 7060 }, { "epoch": 0.8312146373723271, "grad_norm": 1.2147207260131836, "learning_rate": 8.562051785787721e-05, "loss": 1.8097, "step": 7070 }, { "epoch": 0.8323903299287236, "grad_norm": 1.2530720233917236, "learning_rate": 8.557552321509532e-05, "loss": 1.7239, "step": 7080 }, { "epoch": 0.8335660224851201, "grad_norm": 1.2593536376953125, "learning_rate": 8.553047014886509e-05, "loss": 1.7842, "step": 7090 }, { "epoch": 0.8347417150415166, "grad_norm": 1.3620553016662598, "learning_rate": 8.548535873317435e-05, "loss": 1.7097, "step": 7100 }, { "epoch": 0.8359174075979131, "grad_norm": 1.2021887302398682, "learning_rate": 8.544018904210674e-05, "loss": 1.7297, "step": 7110 }, { "epoch": 0.8370931001543096, "grad_norm": 1.2101328372955322, "learning_rate": 8.53949611498416e-05, "loss": 1.7089, "step": 7120 }, { "epoch": 0.8382687927107062, "grad_norm": 1.6435177326202393, "learning_rate": 8.534967513065387e-05, "loss": 1.7781, "step": 7130 }, { "epoch": 0.8394444852671027, "grad_norm": 1.3966532945632935, "learning_rate": 8.530433105891393e-05, "loss": 1.6373, "step": 7140 }, { "epoch": 0.8406201778234992, "grad_norm": 1.2677472829818726, "learning_rate": 8.525892900908752e-05, "loss": 1.6765, "step": 7150 }, { "epoch": 0.8417958703798957, "grad_norm": 1.2972750663757324, "learning_rate": 8.521346905573555e-05, "loss": 1.7536, "step": 7160 }, { "epoch": 0.8429715629362922, "grad_norm": 1.601100206375122, "learning_rate": 8.516795127351407e-05, "loss": 1.7121, "step": 7170 }, { "epoch": 0.8441472554926887, "grad_norm": 1.2231731414794922, "learning_rate": 8.512237573717406e-05, "loss": 1.7623, "step": 7180 }, { "epoch": 0.8453229480490851, "grad_norm": 1.226750135421753, "learning_rate": 8.507674252156136e-05, "loss": 1.6846, "step": 7190 }, { "epoch": 0.8464986406054816, "grad_norm": 1.2346446514129639, "learning_rate": 8.503105170161653e-05, "loss": 1.7202, "step": 7200 }, { "epoch": 0.8476743331618781, "grad_norm": 1.1077731847763062, "learning_rate": 8.498530335237478e-05, "loss": 1.7165, "step": 7210 }, { "epoch": 0.8488500257182747, "grad_norm": 1.3647215366363525, "learning_rate": 8.493949754896571e-05, "loss": 1.8124, "step": 7220 }, { "epoch": 0.8500257182746712, "grad_norm": 1.7647404670715332, "learning_rate": 8.48936343666133e-05, "loss": 1.7447, "step": 7230 }, { "epoch": 0.8512014108310677, "grad_norm": 1.1972322463989258, "learning_rate": 8.484771388063582e-05, "loss": 1.7202, "step": 7240 }, { "epoch": 0.8523771033874642, "grad_norm": 1.3993496894836426, "learning_rate": 8.480173616644558e-05, "loss": 1.748, "step": 7250 }, { "epoch": 0.8535527959438607, "grad_norm": 1.133462905883789, "learning_rate": 8.475570129954888e-05, "loss": 1.6829, "step": 7260 }, { "epoch": 0.8547284885002572, "grad_norm": 1.277855634689331, "learning_rate": 8.470960935554593e-05, "loss": 1.759, "step": 7270 }, { "epoch": 0.8559041810566537, "grad_norm": 1.4254156351089478, "learning_rate": 8.466346041013058e-05, "loss": 1.753, "step": 7280 }, { "epoch": 0.8570798736130502, "grad_norm": 1.2740952968597412, "learning_rate": 8.46172545390904e-05, "loss": 1.7306, "step": 7290 }, { "epoch": 0.8582555661694466, "grad_norm": 1.2641578912734985, "learning_rate": 8.457099181830637e-05, "loss": 1.7035, "step": 7300 }, { "epoch": 0.8594312587258431, "grad_norm": 1.02485191822052, "learning_rate": 8.452467232375286e-05, "loss": 1.7618, "step": 7310 }, { "epoch": 0.8606069512822397, "grad_norm": 1.0210273265838623, "learning_rate": 8.447829613149743e-05, "loss": 1.6806, "step": 7320 }, { "epoch": 0.8617826438386362, "grad_norm": 1.3862189054489136, "learning_rate": 8.443186331770083e-05, "loss": 1.7394, "step": 7330 }, { "epoch": 0.8629583363950327, "grad_norm": 1.1662148237228394, "learning_rate": 8.438537395861674e-05, "loss": 1.7172, "step": 7340 }, { "epoch": 0.8641340289514292, "grad_norm": 1.1813554763793945, "learning_rate": 8.433882813059173e-05, "loss": 1.7795, "step": 7350 }, { "epoch": 0.8653097215078257, "grad_norm": 1.0662617683410645, "learning_rate": 8.429222591006507e-05, "loss": 1.8031, "step": 7360 }, { "epoch": 0.8664854140642222, "grad_norm": 1.255322813987732, "learning_rate": 8.424556737356868e-05, "loss": 1.7017, "step": 7370 }, { "epoch": 0.8676611066206187, "grad_norm": 1.4458709955215454, "learning_rate": 8.419885259772691e-05, "loss": 1.7121, "step": 7380 }, { "epoch": 0.8688367991770152, "grad_norm": 1.1645170450210571, "learning_rate": 8.415208165925656e-05, "loss": 1.7033, "step": 7390 }, { "epoch": 0.8700124917334117, "grad_norm": 1.270965337753296, "learning_rate": 8.410525463496656e-05, "loss": 1.7348, "step": 7400 }, { "epoch": 0.8711881842898083, "grad_norm": 1.3405966758728027, "learning_rate": 8.405837160175802e-05, "loss": 1.7691, "step": 7410 }, { "epoch": 0.8723638768462048, "grad_norm": 1.1563912630081177, "learning_rate": 8.401143263662399e-05, "loss": 1.7591, "step": 7420 }, { "epoch": 0.8735395694026012, "grad_norm": 1.2369190454483032, "learning_rate": 8.396443781664937e-05, "loss": 1.6753, "step": 7430 }, { "epoch": 0.8747152619589977, "grad_norm": 1.6924042701721191, "learning_rate": 8.391738721901083e-05, "loss": 1.6758, "step": 7440 }, { "epoch": 0.8758909545153942, "grad_norm": 1.2625638246536255, "learning_rate": 8.38702809209766e-05, "loss": 1.6359, "step": 7450 }, { "epoch": 0.8770666470717907, "grad_norm": 0.9536723494529724, "learning_rate": 8.38231189999064e-05, "loss": 1.6809, "step": 7460 }, { "epoch": 0.8782423396281872, "grad_norm": 1.1828417778015137, "learning_rate": 8.377590153325128e-05, "loss": 1.7301, "step": 7470 }, { "epoch": 0.8794180321845837, "grad_norm": 1.3231292963027954, "learning_rate": 8.372862859855352e-05, "loss": 1.6659, "step": 7480 }, { "epoch": 0.8805937247409802, "grad_norm": 1.3111695051193237, "learning_rate": 8.368130027344653e-05, "loss": 1.6789, "step": 7490 }, { "epoch": 0.8817694172973768, "grad_norm": 1.0433690547943115, "learning_rate": 8.36339166356546e-05, "loss": 1.7186, "step": 7500 }, { "epoch": 0.8817694172973768, "eval_loss": 1.7510801553726196, "eval_runtime": 1914.7817, "eval_samples_per_second": 31.588, "eval_steps_per_second": 3.949, "step": 7500 }, { "epoch": 0.8829451098537733, "grad_norm": 1.5551807880401611, "learning_rate": 8.358647776299293e-05, "loss": 1.6998, "step": 7510 }, { "epoch": 0.8841208024101698, "grad_norm": 1.1200600862503052, "learning_rate": 8.35389837333674e-05, "loss": 1.7122, "step": 7520 }, { "epoch": 0.8852964949665663, "grad_norm": 1.4827282428741455, "learning_rate": 8.349143462477446e-05, "loss": 1.6873, "step": 7530 }, { "epoch": 0.8864721875229628, "grad_norm": 1.3009493350982666, "learning_rate": 8.344383051530104e-05, "loss": 1.7233, "step": 7540 }, { "epoch": 0.8876478800793592, "grad_norm": 1.3974863290786743, "learning_rate": 8.33961714831244e-05, "loss": 1.7816, "step": 7550 }, { "epoch": 0.8888235726357557, "grad_norm": 1.3270416259765625, "learning_rate": 8.33484576065119e-05, "loss": 1.6929, "step": 7560 }, { "epoch": 0.8899992651921522, "grad_norm": 1.028071403503418, "learning_rate": 8.330068896382114e-05, "loss": 1.7703, "step": 7570 }, { "epoch": 0.8911749577485487, "grad_norm": 1.3957843780517578, "learning_rate": 8.32528656334995e-05, "loss": 1.8037, "step": 7580 }, { "epoch": 0.8923506503049452, "grad_norm": 1.2548609972000122, "learning_rate": 8.320498769408425e-05, "loss": 1.7619, "step": 7590 }, { "epoch": 0.8935263428613418, "grad_norm": 1.31625497341156, "learning_rate": 8.315705522420234e-05, "loss": 1.7668, "step": 7600 }, { "epoch": 0.8947020354177383, "grad_norm": 1.0914857387542725, "learning_rate": 8.310906830257023e-05, "loss": 1.6949, "step": 7610 }, { "epoch": 0.8958777279741348, "grad_norm": 1.1386011838912964, "learning_rate": 8.306102700799385e-05, "loss": 1.6925, "step": 7620 }, { "epoch": 0.8970534205305313, "grad_norm": 1.4839836359024048, "learning_rate": 8.301293141936837e-05, "loss": 1.7464, "step": 7630 }, { "epoch": 0.8982291130869278, "grad_norm": 1.2525554895401, "learning_rate": 8.29647816156782e-05, "loss": 1.7475, "step": 7640 }, { "epoch": 0.8994048056433243, "grad_norm": 1.3611942529678345, "learning_rate": 8.291657767599672e-05, "loss": 1.8047, "step": 7650 }, { "epoch": 0.9005804981997207, "grad_norm": 1.2995916604995728, "learning_rate": 8.286831967948622e-05, "loss": 1.7267, "step": 7660 }, { "epoch": 0.9017561907561172, "grad_norm": 1.091174840927124, "learning_rate": 8.28200077053978e-05, "loss": 1.7381, "step": 7670 }, { "epoch": 0.9029318833125137, "grad_norm": 1.4284354448318481, "learning_rate": 8.277164183307117e-05, "loss": 1.7242, "step": 7680 }, { "epoch": 0.9041075758689103, "grad_norm": 1.2479182481765747, "learning_rate": 8.272322214193457e-05, "loss": 1.716, "step": 7690 }, { "epoch": 0.9052832684253068, "grad_norm": 1.1378108263015747, "learning_rate": 8.267474871150461e-05, "loss": 1.6824, "step": 7700 }, { "epoch": 0.9064589609817033, "grad_norm": 1.1451480388641357, "learning_rate": 8.262622162138616e-05, "loss": 1.665, "step": 7710 }, { "epoch": 0.9076346535380998, "grad_norm": 1.2406892776489258, "learning_rate": 8.257764095127223e-05, "loss": 1.7339, "step": 7720 }, { "epoch": 0.9088103460944963, "grad_norm": 1.378743290901184, "learning_rate": 8.252900678094382e-05, "loss": 1.6985, "step": 7730 }, { "epoch": 0.9099860386508928, "grad_norm": 1.2900267839431763, "learning_rate": 8.248031919026972e-05, "loss": 1.6659, "step": 7740 }, { "epoch": 0.9111617312072893, "grad_norm": 1.2047442197799683, "learning_rate": 8.243157825920655e-05, "loss": 1.7541, "step": 7750 }, { "epoch": 0.9123374237636858, "grad_norm": 1.4542067050933838, "learning_rate": 8.238278406779846e-05, "loss": 1.7218, "step": 7760 }, { "epoch": 0.9135131163200823, "grad_norm": 1.2063121795654297, "learning_rate": 8.23339366961771e-05, "loss": 1.7318, "step": 7770 }, { "epoch": 0.9146888088764789, "grad_norm": 1.3786412477493286, "learning_rate": 8.228503622456143e-05, "loss": 1.7443, "step": 7780 }, { "epoch": 0.9158645014328753, "grad_norm": 1.1597720384597778, "learning_rate": 8.223608273325763e-05, "loss": 1.6639, "step": 7790 }, { "epoch": 0.9170401939892718, "grad_norm": 1.4014465808868408, "learning_rate": 8.218707630265897e-05, "loss": 1.6411, "step": 7800 }, { "epoch": 0.9182158865456683, "grad_norm": 1.317675232887268, "learning_rate": 8.213801701324557e-05, "loss": 1.7375, "step": 7810 }, { "epoch": 0.9193915791020648, "grad_norm": 1.0871098041534424, "learning_rate": 8.208890494558448e-05, "loss": 1.6502, "step": 7820 }, { "epoch": 0.9205672716584613, "grad_norm": 1.2457531690597534, "learning_rate": 8.203974018032933e-05, "loss": 1.6816, "step": 7830 }, { "epoch": 0.9217429642148578, "grad_norm": 1.233129620552063, "learning_rate": 8.199052279822036e-05, "loss": 1.7169, "step": 7840 }, { "epoch": 0.9229186567712543, "grad_norm": 1.3486992120742798, "learning_rate": 8.194125288008416e-05, "loss": 1.7401, "step": 7850 }, { "epoch": 0.9240943493276508, "grad_norm": 1.1744529008865356, "learning_rate": 8.189193050683365e-05, "loss": 1.7591, "step": 7860 }, { "epoch": 0.9252700418840473, "grad_norm": 1.0788921117782593, "learning_rate": 8.184255575946784e-05, "loss": 1.6656, "step": 7870 }, { "epoch": 0.9264457344404439, "grad_norm": 1.1894562244415283, "learning_rate": 8.179312871907179e-05, "loss": 1.7524, "step": 7880 }, { "epoch": 0.9276214269968404, "grad_norm": 1.567104458808899, "learning_rate": 8.174364946681642e-05, "loss": 1.765, "step": 7890 }, { "epoch": 0.9287971195532368, "grad_norm": 1.122937798500061, "learning_rate": 8.169411808395839e-05, "loss": 1.6972, "step": 7900 }, { "epoch": 0.9299728121096333, "grad_norm": 1.4217126369476318, "learning_rate": 8.164453465184002e-05, "loss": 1.7829, "step": 7910 }, { "epoch": 0.9311485046660298, "grad_norm": 0.8702628016471863, "learning_rate": 8.159489925188904e-05, "loss": 1.7574, "step": 7920 }, { "epoch": 0.9323241972224263, "grad_norm": 1.3506906032562256, "learning_rate": 8.154521196561855e-05, "loss": 1.703, "step": 7930 }, { "epoch": 0.9334998897788228, "grad_norm": 1.116306185722351, "learning_rate": 8.149547287462684e-05, "loss": 1.6898, "step": 7940 }, { "epoch": 0.9346755823352193, "grad_norm": 1.2266786098480225, "learning_rate": 8.144568206059735e-05, "loss": 1.6358, "step": 7950 }, { "epoch": 0.9358512748916158, "grad_norm": 1.1994613409042358, "learning_rate": 8.139583960529837e-05, "loss": 1.6338, "step": 7960 }, { "epoch": 0.9370269674480124, "grad_norm": 1.4255046844482422, "learning_rate": 8.134594559058304e-05, "loss": 1.7201, "step": 7970 }, { "epoch": 0.9382026600044089, "grad_norm": 1.2738600969314575, "learning_rate": 8.129600009838917e-05, "loss": 1.6886, "step": 7980 }, { "epoch": 0.9393783525608054, "grad_norm": 1.0920809507369995, "learning_rate": 8.12460032107391e-05, "loss": 1.7466, "step": 7990 }, { "epoch": 0.9405540451172019, "grad_norm": 1.4128170013427734, "learning_rate": 8.11959550097396e-05, "loss": 1.6821, "step": 8000 }, { "epoch": 0.9405540451172019, "eval_loss": 1.7356935739517212, "eval_runtime": 1914.7531, "eval_samples_per_second": 31.589, "eval_steps_per_second": 3.949, "step": 8000 }, { "epoch": 0.9417297376735984, "grad_norm": 1.073804497718811, "learning_rate": 8.114585557758168e-05, "loss": 1.6728, "step": 8010 }, { "epoch": 0.9429054302299948, "grad_norm": 1.0565173625946045, "learning_rate": 8.109570499654048e-05, "loss": 1.7378, "step": 8020 }, { "epoch": 0.9440811227863913, "grad_norm": 1.1836392879486084, "learning_rate": 8.104550334897517e-05, "loss": 1.6899, "step": 8030 }, { "epoch": 0.9452568153427878, "grad_norm": 1.1896892786026, "learning_rate": 8.099525071732874e-05, "loss": 1.7031, "step": 8040 }, { "epoch": 0.9464325078991843, "grad_norm": 1.219974398612976, "learning_rate": 8.094494718412795e-05, "loss": 1.6835, "step": 8050 }, { "epoch": 0.9476082004555809, "grad_norm": 1.1254374980926514, "learning_rate": 8.089459283198313e-05, "loss": 1.8062, "step": 8060 }, { "epoch": 0.9487838930119774, "grad_norm": 1.199526071548462, "learning_rate": 8.084418774358806e-05, "loss": 1.6389, "step": 8070 }, { "epoch": 0.9499595855683739, "grad_norm": 0.9960947036743164, "learning_rate": 8.07937320017199e-05, "loss": 1.7042, "step": 8080 }, { "epoch": 0.9511352781247704, "grad_norm": 1.1944634914398193, "learning_rate": 8.074322568923887e-05, "loss": 1.7263, "step": 8090 }, { "epoch": 0.9523109706811669, "grad_norm": 1.3616657257080078, "learning_rate": 8.069266888908837e-05, "loss": 1.6948, "step": 8100 }, { "epoch": 0.9534866632375634, "grad_norm": 1.2831649780273438, "learning_rate": 8.064206168429464e-05, "loss": 1.6251, "step": 8110 }, { "epoch": 0.9546623557939599, "grad_norm": 1.1297916173934937, "learning_rate": 8.059140415796674e-05, "loss": 1.7045, "step": 8120 }, { "epoch": 0.9558380483503564, "grad_norm": 1.2400109767913818, "learning_rate": 8.054069639329631e-05, "loss": 1.6252, "step": 8130 }, { "epoch": 0.9570137409067528, "grad_norm": 1.3805961608886719, "learning_rate": 8.048993847355754e-05, "loss": 1.6467, "step": 8140 }, { "epoch": 0.9581894334631493, "grad_norm": 1.3529020547866821, "learning_rate": 8.043913048210698e-05, "loss": 1.7521, "step": 8150 }, { "epoch": 0.9593651260195459, "grad_norm": 1.3094087839126587, "learning_rate": 8.038827250238341e-05, "loss": 1.693, "step": 8160 }, { "epoch": 0.9605408185759424, "grad_norm": 1.4284802675247192, "learning_rate": 8.03373646179077e-05, "loss": 1.6931, "step": 8170 }, { "epoch": 0.9617165111323389, "grad_norm": 1.21076500415802, "learning_rate": 8.028640691228266e-05, "loss": 1.6499, "step": 8180 }, { "epoch": 0.9628922036887354, "grad_norm": 1.313759446144104, "learning_rate": 8.023539946919293e-05, "loss": 1.6686, "step": 8190 }, { "epoch": 0.9640678962451319, "grad_norm": 1.143612265586853, "learning_rate": 8.018434237240484e-05, "loss": 1.6896, "step": 8200 }, { "epoch": 0.9652435888015284, "grad_norm": 1.1504859924316406, "learning_rate": 8.013323570576625e-05, "loss": 1.6988, "step": 8210 }, { "epoch": 0.9664192813579249, "grad_norm": 1.297460675239563, "learning_rate": 8.008207955320643e-05, "loss": 1.7171, "step": 8220 }, { "epoch": 0.9675949739143214, "grad_norm": 1.5811901092529297, "learning_rate": 8.003087399873592e-05, "loss": 1.6961, "step": 8230 }, { "epoch": 0.9687706664707179, "grad_norm": 1.252623438835144, "learning_rate": 7.997961912644639e-05, "loss": 1.6909, "step": 8240 }, { "epoch": 0.9699463590271145, "grad_norm": 1.2874550819396973, "learning_rate": 7.992831502051048e-05, "loss": 1.8046, "step": 8250 }, { "epoch": 0.971122051583511, "grad_norm": 1.6053591966629028, "learning_rate": 7.987696176518173e-05, "loss": 1.6516, "step": 8260 }, { "epoch": 0.9722977441399074, "grad_norm": 1.267842411994934, "learning_rate": 7.982555944479435e-05, "loss": 1.7338, "step": 8270 }, { "epoch": 0.9734734366963039, "grad_norm": 1.4833852052688599, "learning_rate": 7.977410814376316e-05, "loss": 1.6699, "step": 8280 }, { "epoch": 0.9746491292527004, "grad_norm": 1.4590785503387451, "learning_rate": 7.972260794658337e-05, "loss": 1.6794, "step": 8290 }, { "epoch": 0.9758248218090969, "grad_norm": 1.384879231452942, "learning_rate": 7.967105893783056e-05, "loss": 1.7021, "step": 8300 }, { "epoch": 0.9770005143654934, "grad_norm": 1.2897634506225586, "learning_rate": 7.961946120216042e-05, "loss": 1.7078, "step": 8310 }, { "epoch": 0.9781762069218899, "grad_norm": 1.3479253053665161, "learning_rate": 7.956781482430864e-05, "loss": 1.6861, "step": 8320 }, { "epoch": 0.9793518994782864, "grad_norm": 1.3666346073150635, "learning_rate": 7.951611988909086e-05, "loss": 1.6958, "step": 8330 }, { "epoch": 0.980527592034683, "grad_norm": 1.4483178853988647, "learning_rate": 7.94643764814024e-05, "loss": 1.6681, "step": 8340 }, { "epoch": 0.9817032845910795, "grad_norm": 1.5546255111694336, "learning_rate": 7.941258468621824e-05, "loss": 1.7503, "step": 8350 }, { "epoch": 0.982878977147476, "grad_norm": 1.2555524110794067, "learning_rate": 7.936074458859277e-05, "loss": 1.652, "step": 8360 }, { "epoch": 0.9840546697038725, "grad_norm": 1.4022319316864014, "learning_rate": 7.930885627365973e-05, "loss": 1.6279, "step": 8370 }, { "epoch": 0.9852303622602689, "grad_norm": 1.4331746101379395, "learning_rate": 7.925691982663206e-05, "loss": 1.6599, "step": 8380 }, { "epoch": 0.9864060548166654, "grad_norm": 1.133203148841858, "learning_rate": 7.920493533280171e-05, "loss": 1.6717, "step": 8390 }, { "epoch": 0.9875817473730619, "grad_norm": 1.373866081237793, "learning_rate": 7.915290287753955e-05, "loss": 1.7507, "step": 8400 }, { "epoch": 0.9887574399294584, "grad_norm": 1.1928794384002686, "learning_rate": 7.910082254629523e-05, "loss": 1.7065, "step": 8410 }, { "epoch": 0.9899331324858549, "grad_norm": 1.1747695207595825, "learning_rate": 7.904869442459699e-05, "loss": 1.6871, "step": 8420 }, { "epoch": 0.9911088250422515, "grad_norm": 1.202128291130066, "learning_rate": 7.899651859805159e-05, "loss": 1.7197, "step": 8430 }, { "epoch": 0.992284517598648, "grad_norm": 1.2725143432617188, "learning_rate": 7.894429515234409e-05, "loss": 1.64, "step": 8440 }, { "epoch": 0.9934602101550445, "grad_norm": 1.2057924270629883, "learning_rate": 7.88920241732378e-05, "loss": 1.7316, "step": 8450 }, { "epoch": 0.994635902711441, "grad_norm": 1.025951623916626, "learning_rate": 7.883970574657405e-05, "loss": 1.7747, "step": 8460 }, { "epoch": 0.9958115952678375, "grad_norm": 1.2480911016464233, "learning_rate": 7.878733995827212e-05, "loss": 1.7192, "step": 8470 }, { "epoch": 0.996987287824234, "grad_norm": 1.337084412574768, "learning_rate": 7.873492689432907e-05, "loss": 1.6596, "step": 8480 }, { "epoch": 0.9981629803806304, "grad_norm": 1.103783130645752, "learning_rate": 7.868246664081954e-05, "loss": 1.715, "step": 8490 }, { "epoch": 0.9993386729370269, "grad_norm": 1.1674913167953491, "learning_rate": 7.862995928389574e-05, "loss": 1.6238, "step": 8500 }, { "epoch": 0.9993386729370269, "eval_loss": 1.7210807800292969, "eval_runtime": 1913.7241, "eval_samples_per_second": 31.606, "eval_steps_per_second": 3.951, "step": 8500 }, { "epoch": 1.0005143654934234, "grad_norm": 1.487837553024292, "learning_rate": 7.85774049097872e-05, "loss": 1.5972, "step": 8510 }, { "epoch": 1.00169005804982, "grad_norm": 1.1322617530822754, "learning_rate": 7.852480360480067e-05, "loss": 1.4787, "step": 8520 }, { "epoch": 1.0028657506062164, "grad_norm": 1.1087088584899902, "learning_rate": 7.847215545531998e-05, "loss": 1.4712, "step": 8530 }, { "epoch": 1.004041443162613, "grad_norm": 1.4394466876983643, "learning_rate": 7.841946054780589e-05, "loss": 1.5169, "step": 8540 }, { "epoch": 1.0052171357190094, "grad_norm": 1.2923603057861328, "learning_rate": 7.83667189687959e-05, "loss": 1.5115, "step": 8550 }, { "epoch": 1.006392828275406, "grad_norm": 1.1750625371932983, "learning_rate": 7.831393080490423e-05, "loss": 1.5157, "step": 8560 }, { "epoch": 1.0075685208318026, "grad_norm": 1.3513410091400146, "learning_rate": 7.826109614282154e-05, "loss": 1.4885, "step": 8570 }, { "epoch": 1.008744213388199, "grad_norm": 1.0138983726501465, "learning_rate": 7.82082150693149e-05, "loss": 1.4895, "step": 8580 }, { "epoch": 1.0099199059445956, "grad_norm": 1.3042219877243042, "learning_rate": 7.815528767122758e-05, "loss": 1.4737, "step": 8590 }, { "epoch": 1.011095598500992, "grad_norm": 1.306340217590332, "learning_rate": 7.81023140354789e-05, "loss": 1.4321, "step": 8600 }, { "epoch": 1.0122712910573886, "grad_norm": 1.0528134107589722, "learning_rate": 7.804929424906414e-05, "loss": 1.4642, "step": 8610 }, { "epoch": 1.013446983613785, "grad_norm": 1.1151111125946045, "learning_rate": 7.799622839905438e-05, "loss": 1.4523, "step": 8620 }, { "epoch": 1.0146226761701815, "grad_norm": 1.2096871137619019, "learning_rate": 7.794311657259634e-05, "loss": 1.5355, "step": 8630 }, { "epoch": 1.015798368726578, "grad_norm": 1.4151655435562134, "learning_rate": 7.788995885691222e-05, "loss": 1.5162, "step": 8640 }, { "epoch": 1.0169740612829745, "grad_norm": 1.2592905759811401, "learning_rate": 7.78367553392996e-05, "loss": 1.5619, "step": 8650 }, { "epoch": 1.018149753839371, "grad_norm": 1.255204677581787, "learning_rate": 7.77835061071313e-05, "loss": 1.462, "step": 8660 }, { "epoch": 1.0193254463957675, "grad_norm": 1.2716405391693115, "learning_rate": 7.773021124785516e-05, "loss": 1.4727, "step": 8670 }, { "epoch": 1.020501138952164, "grad_norm": 1.535090446472168, "learning_rate": 7.7676870848994e-05, "loss": 1.5158, "step": 8680 }, { "epoch": 1.0216768315085605, "grad_norm": 1.3599311113357544, "learning_rate": 7.762348499814543e-05, "loss": 1.4921, "step": 8690 }, { "epoch": 1.022852524064957, "grad_norm": 1.4668912887573242, "learning_rate": 7.757005378298161e-05, "loss": 1.4606, "step": 8700 }, { "epoch": 1.0240282166213535, "grad_norm": 1.9533289670944214, "learning_rate": 7.751657729124934e-05, "loss": 1.5018, "step": 8710 }, { "epoch": 1.02520390917775, "grad_norm": 1.223655104637146, "learning_rate": 7.746305561076968e-05, "loss": 1.444, "step": 8720 }, { "epoch": 1.0263796017341464, "grad_norm": 1.6113215684890747, "learning_rate": 7.740948882943792e-05, "loss": 1.5162, "step": 8730 }, { "epoch": 1.027555294290543, "grad_norm": 1.0953630208969116, "learning_rate": 7.735587703522345e-05, "loss": 1.5143, "step": 8740 }, { "epoch": 1.0287309868469394, "grad_norm": 1.1346840858459473, "learning_rate": 7.730222031616951e-05, "loss": 1.4874, "step": 8750 }, { "epoch": 1.029906679403336, "grad_norm": 0.9722471237182617, "learning_rate": 7.724851876039318e-05, "loss": 1.4768, "step": 8760 }, { "epoch": 1.0310823719597326, "grad_norm": 1.2295154333114624, "learning_rate": 7.719477245608517e-05, "loss": 1.4282, "step": 8770 }, { "epoch": 1.032258064516129, "grad_norm": 1.2539094686508179, "learning_rate": 7.714098149150966e-05, "loss": 1.5078, "step": 8780 }, { "epoch": 1.0334337570725256, "grad_norm": 1.129237174987793, "learning_rate": 7.708714595500415e-05, "loss": 1.4951, "step": 8790 }, { "epoch": 1.034609449628922, "grad_norm": 1.1453620195388794, "learning_rate": 7.703326593497937e-05, "loss": 1.4858, "step": 8800 }, { "epoch": 1.0357851421853186, "grad_norm": 1.1309852600097656, "learning_rate": 7.69793415199191e-05, "loss": 1.4566, "step": 8810 }, { "epoch": 1.036960834741715, "grad_norm": 1.1791269779205322, "learning_rate": 7.692537279838003e-05, "loss": 1.5053, "step": 8820 }, { "epoch": 1.0381365272981116, "grad_norm": 1.028153657913208, "learning_rate": 7.68713598589916e-05, "loss": 1.4622, "step": 8830 }, { "epoch": 1.039312219854508, "grad_norm": 1.1585650444030762, "learning_rate": 7.681730279045584e-05, "loss": 1.463, "step": 8840 }, { "epoch": 1.0404879124109045, "grad_norm": 1.4461066722869873, "learning_rate": 7.676320168154731e-05, "loss": 1.4554, "step": 8850 }, { "epoch": 1.0416636049673011, "grad_norm": 1.0177234411239624, "learning_rate": 7.670905662111287e-05, "loss": 1.4966, "step": 8860 }, { "epoch": 1.0428392975236975, "grad_norm": 1.147196650505066, "learning_rate": 7.665486769807153e-05, "loss": 1.4427, "step": 8870 }, { "epoch": 1.0440149900800941, "grad_norm": 1.0796765089035034, "learning_rate": 7.660063500141437e-05, "loss": 1.4764, "step": 8880 }, { "epoch": 1.0451906826364905, "grad_norm": 1.069022297859192, "learning_rate": 7.654635862020434e-05, "loss": 1.4918, "step": 8890 }, { "epoch": 1.046366375192887, "grad_norm": 1.1652644872665405, "learning_rate": 7.649203864357613e-05, "loss": 1.4211, "step": 8900 }, { "epoch": 1.0475420677492835, "grad_norm": 1.0817160606384277, "learning_rate": 7.643767516073605e-05, "loss": 1.4749, "step": 8910 }, { "epoch": 1.04871776030568, "grad_norm": 1.0584523677825928, "learning_rate": 7.63832682609618e-05, "loss": 1.4651, "step": 8920 }, { "epoch": 1.0498934528620765, "grad_norm": 1.505861520767212, "learning_rate": 7.632881803360244e-05, "loss": 1.5002, "step": 8930 }, { "epoch": 1.051069145418473, "grad_norm": 1.366989016532898, "learning_rate": 7.627432456807815e-05, "loss": 1.5552, "step": 8940 }, { "epoch": 1.0522448379748697, "grad_norm": 1.0168781280517578, "learning_rate": 7.621978795388012e-05, "loss": 1.3868, "step": 8950 }, { "epoch": 1.053420530531266, "grad_norm": 1.4444361925125122, "learning_rate": 7.616520828057042e-05, "loss": 1.5193, "step": 8960 }, { "epoch": 1.0545962230876627, "grad_norm": 1.1714551448822021, "learning_rate": 7.611058563778181e-05, "loss": 1.4522, "step": 8970 }, { "epoch": 1.055771915644059, "grad_norm": 1.0245108604431152, "learning_rate": 7.605592011521768e-05, "loss": 1.4363, "step": 8980 }, { "epoch": 1.0569476082004556, "grad_norm": 1.276252269744873, "learning_rate": 7.600121180265169e-05, "loss": 1.5068, "step": 8990 }, { "epoch": 1.058123300756852, "grad_norm": 1.3044743537902832, "learning_rate": 7.594646078992794e-05, "loss": 1.4753, "step": 9000 }, { "epoch": 1.058123300756852, "eval_loss": 1.7176666259765625, "eval_runtime": 1915.6331, "eval_samples_per_second": 31.574, "eval_steps_per_second": 3.947, "step": 9000 }, { "epoch": 1.0592989933132486, "grad_norm": 0.9948422312736511, "learning_rate": 7.589166716696055e-05, "loss": 1.4318, "step": 9010 }, { "epoch": 1.060474685869645, "grad_norm": 1.1191619634628296, "learning_rate": 7.583683102373364e-05, "loss": 1.4834, "step": 9020 }, { "epoch": 1.0616503784260416, "grad_norm": 1.3023236989974976, "learning_rate": 7.578195245030122e-05, "loss": 1.4856, "step": 9030 }, { "epoch": 1.062826070982438, "grad_norm": 1.264155387878418, "learning_rate": 7.572703153678687e-05, "loss": 1.4543, "step": 9040 }, { "epoch": 1.0640017635388346, "grad_norm": 1.010107398033142, "learning_rate": 7.567206837338377e-05, "loss": 1.4872, "step": 9050 }, { "epoch": 1.0651774560952312, "grad_norm": 0.9714999198913574, "learning_rate": 7.561706305035448e-05, "loss": 1.5143, "step": 9060 }, { "epoch": 1.0663531486516276, "grad_norm": 1.019950270652771, "learning_rate": 7.55620156580308e-05, "loss": 1.4887, "step": 9070 }, { "epoch": 1.0675288412080242, "grad_norm": 1.1102795600891113, "learning_rate": 7.550692628681357e-05, "loss": 1.5453, "step": 9080 }, { "epoch": 1.0687045337644205, "grad_norm": 1.2663441896438599, "learning_rate": 7.545179502717266e-05, "loss": 1.4828, "step": 9090 }, { "epoch": 1.0698802263208171, "grad_norm": 0.9711494445800781, "learning_rate": 7.539662196964662e-05, "loss": 1.5392, "step": 9100 }, { "epoch": 1.0710559188772135, "grad_norm": 1.0910563468933105, "learning_rate": 7.534140720484273e-05, "loss": 1.5294, "step": 9110 }, { "epoch": 1.0722316114336101, "grad_norm": 1.0717806816101074, "learning_rate": 7.528615082343673e-05, "loss": 1.5389, "step": 9120 }, { "epoch": 1.0734073039900065, "grad_norm": 1.530964732170105, "learning_rate": 7.52308529161727e-05, "loss": 1.4399, "step": 9130 }, { "epoch": 1.074582996546403, "grad_norm": 1.1488016843795776, "learning_rate": 7.517551357386292e-05, "loss": 1.4461, "step": 9140 }, { "epoch": 1.0757586891027997, "grad_norm": 1.1175764799118042, "learning_rate": 7.512013288738772e-05, "loss": 1.4665, "step": 9150 }, { "epoch": 1.076934381659196, "grad_norm": 1.2313203811645508, "learning_rate": 7.506471094769535e-05, "loss": 1.5168, "step": 9160 }, { "epoch": 1.0781100742155927, "grad_norm": 1.2752115726470947, "learning_rate": 7.500924784580174e-05, "loss": 1.4769, "step": 9170 }, { "epoch": 1.079285766771989, "grad_norm": 1.0810673236846924, "learning_rate": 7.495374367279048e-05, "loss": 1.4828, "step": 9180 }, { "epoch": 1.0804614593283857, "grad_norm": 1.2415813207626343, "learning_rate": 7.48981985198126e-05, "loss": 1.4695, "step": 9190 }, { "epoch": 1.081637151884782, "grad_norm": 1.1464005708694458, "learning_rate": 7.484261247808638e-05, "loss": 1.4858, "step": 9200 }, { "epoch": 1.0828128444411786, "grad_norm": 1.1929455995559692, "learning_rate": 7.478698563889732e-05, "loss": 1.4697, "step": 9210 }, { "epoch": 1.083988536997575, "grad_norm": 1.397558331489563, "learning_rate": 7.473131809359788e-05, "loss": 1.4543, "step": 9220 }, { "epoch": 1.0851642295539716, "grad_norm": 1.1237040758132935, "learning_rate": 7.467560993360738e-05, "loss": 1.3845, "step": 9230 }, { "epoch": 1.0863399221103682, "grad_norm": 1.2384823560714722, "learning_rate": 7.461986125041182e-05, "loss": 1.4605, "step": 9240 }, { "epoch": 1.0875156146667646, "grad_norm": 1.3171871900558472, "learning_rate": 7.456407213556377e-05, "loss": 1.4831, "step": 9250 }, { "epoch": 1.0886913072231612, "grad_norm": 1.5083237886428833, "learning_rate": 7.450824268068219e-05, "loss": 1.482, "step": 9260 }, { "epoch": 1.0898669997795576, "grad_norm": 0.99432772397995, "learning_rate": 7.445237297745228e-05, "loss": 1.4676, "step": 9270 }, { "epoch": 1.0910426923359542, "grad_norm": 1.0726395845413208, "learning_rate": 7.439646311762536e-05, "loss": 1.3919, "step": 9280 }, { "epoch": 1.0922183848923506, "grad_norm": 1.2037619352340698, "learning_rate": 7.434051319301868e-05, "loss": 1.4936, "step": 9290 }, { "epoch": 1.0933940774487472, "grad_norm": 1.26396644115448, "learning_rate": 7.428452329551527e-05, "loss": 1.4948, "step": 9300 }, { "epoch": 1.0945697700051435, "grad_norm": 1.228286623954773, "learning_rate": 7.422849351706385e-05, "loss": 1.4737, "step": 9310 }, { "epoch": 1.0957454625615402, "grad_norm": 1.0259971618652344, "learning_rate": 7.417242394967862e-05, "loss": 1.4054, "step": 9320 }, { "epoch": 1.0969211551179368, "grad_norm": 1.1449753046035767, "learning_rate": 7.411631468543906e-05, "loss": 1.4675, "step": 9330 }, { "epoch": 1.0980968476743331, "grad_norm": 1.3073805570602417, "learning_rate": 7.406016581648991e-05, "loss": 1.5025, "step": 9340 }, { "epoch": 1.0992725402307297, "grad_norm": 1.3375186920166016, "learning_rate": 7.400397743504096e-05, "loss": 1.4368, "step": 9350 }, { "epoch": 1.1004482327871261, "grad_norm": 0.9765594005584717, "learning_rate": 7.394774963336684e-05, "loss": 1.5238, "step": 9360 }, { "epoch": 1.1016239253435227, "grad_norm": 1.1424318552017212, "learning_rate": 7.389148250380696e-05, "loss": 1.5086, "step": 9370 }, { "epoch": 1.102799617899919, "grad_norm": 1.1976008415222168, "learning_rate": 7.383517613876525e-05, "loss": 1.501, "step": 9380 }, { "epoch": 1.1039753104563157, "grad_norm": 1.2896246910095215, "learning_rate": 7.377883063071014e-05, "loss": 1.4872, "step": 9390 }, { "epoch": 1.105151003012712, "grad_norm": 1.5570365190505981, "learning_rate": 7.372244607217434e-05, "loss": 1.4858, "step": 9400 }, { "epoch": 1.1063266955691087, "grad_norm": 1.2404149770736694, "learning_rate": 7.366602255575465e-05, "loss": 1.446, "step": 9410 }, { "epoch": 1.1075023881255053, "grad_norm": 1.2718952894210815, "learning_rate": 7.36095601741119e-05, "loss": 1.4977, "step": 9420 }, { "epoch": 1.1086780806819017, "grad_norm": 1.476306676864624, "learning_rate": 7.355305901997065e-05, "loss": 1.457, "step": 9430 }, { "epoch": 1.1098537732382983, "grad_norm": 1.3666908740997314, "learning_rate": 7.349651918611926e-05, "loss": 1.4712, "step": 9440 }, { "epoch": 1.1110294657946946, "grad_norm": 1.306581735610962, "learning_rate": 7.343994076540952e-05, "loss": 1.4747, "step": 9450 }, { "epoch": 1.1122051583510912, "grad_norm": 1.486339807510376, "learning_rate": 7.338332385075662e-05, "loss": 1.471, "step": 9460 }, { "epoch": 1.1133808509074876, "grad_norm": 1.0081589221954346, "learning_rate": 7.332666853513897e-05, "loss": 1.5021, "step": 9470 }, { "epoch": 1.1145565434638842, "grad_norm": 1.3591985702514648, "learning_rate": 7.326997491159804e-05, "loss": 1.497, "step": 9480 }, { "epoch": 1.1157322360202806, "grad_norm": 1.24358069896698, "learning_rate": 7.321324307323822e-05, "loss": 1.4508, "step": 9490 }, { "epoch": 1.1169079285766772, "grad_norm": 1.2822545766830444, "learning_rate": 7.315647311322663e-05, "loss": 1.4412, "step": 9500 }, { "epoch": 1.1169079285766772, "eval_loss": 1.7047855854034424, "eval_runtime": 1914.4301, "eval_samples_per_second": 31.594, "eval_steps_per_second": 3.949, "step": 9500 }, { "epoch": 1.1180836211330738, "grad_norm": 1.1646616458892822, "learning_rate": 7.309966512479301e-05, "loss": 1.3966, "step": 9510 }, { "epoch": 1.1192593136894702, "grad_norm": 1.0051647424697876, "learning_rate": 7.304281920122955e-05, "loss": 1.4565, "step": 9520 }, { "epoch": 1.1204350062458668, "grad_norm": 1.3270152807235718, "learning_rate": 7.298593543589076e-05, "loss": 1.5099, "step": 9530 }, { "epoch": 1.1216106988022632, "grad_norm": 1.0629124641418457, "learning_rate": 7.292901392219325e-05, "loss": 1.4004, "step": 9540 }, { "epoch": 1.1227863913586598, "grad_norm": 1.256546139717102, "learning_rate": 7.287205475361568e-05, "loss": 1.4673, "step": 9550 }, { "epoch": 1.1239620839150561, "grad_norm": 1.281146764755249, "learning_rate": 7.281505802369849e-05, "loss": 1.4484, "step": 9560 }, { "epoch": 1.1251377764714527, "grad_norm": 1.2016042470932007, "learning_rate": 7.275802382604384e-05, "loss": 1.4549, "step": 9570 }, { "epoch": 1.1263134690278491, "grad_norm": 1.2219256162643433, "learning_rate": 7.270095225431543e-05, "loss": 1.5027, "step": 9580 }, { "epoch": 1.1274891615842457, "grad_norm": 1.2971493005752563, "learning_rate": 7.264384340223828e-05, "loss": 1.4685, "step": 9590 }, { "epoch": 1.1286648541406423, "grad_norm": 1.07282555103302, "learning_rate": 7.258669736359872e-05, "loss": 1.4693, "step": 9600 }, { "epoch": 1.1298405466970387, "grad_norm": 1.1905196905136108, "learning_rate": 7.252951423224408e-05, "loss": 1.4351, "step": 9610 }, { "epoch": 1.1310162392534353, "grad_norm": 1.1704360246658325, "learning_rate": 7.24722941020826e-05, "loss": 1.5518, "step": 9620 }, { "epoch": 1.1321919318098317, "grad_norm": 1.2388302087783813, "learning_rate": 7.241503706708335e-05, "loss": 1.4903, "step": 9630 }, { "epoch": 1.1333676243662283, "grad_norm": 1.377185583114624, "learning_rate": 7.235774322127593e-05, "loss": 1.5007, "step": 9640 }, { "epoch": 1.1345433169226247, "grad_norm": 1.3142633438110352, "learning_rate": 7.230041265875044e-05, "loss": 1.4848, "step": 9650 }, { "epoch": 1.1357190094790213, "grad_norm": 1.0539777278900146, "learning_rate": 7.224304547365728e-05, "loss": 1.5092, "step": 9660 }, { "epoch": 1.1368947020354176, "grad_norm": 1.6200063228607178, "learning_rate": 7.218564176020696e-05, "loss": 1.4287, "step": 9670 }, { "epoch": 1.1380703945918142, "grad_norm": 1.4243866205215454, "learning_rate": 7.212820161266997e-05, "loss": 1.4142, "step": 9680 }, { "epoch": 1.1392460871482109, "grad_norm": 0.9475681781768799, "learning_rate": 7.207072512537672e-05, "loss": 1.518, "step": 9690 }, { "epoch": 1.1404217797046072, "grad_norm": 1.077085256576538, "learning_rate": 7.201321239271717e-05, "loss": 1.4654, "step": 9700 }, { "epoch": 1.1415974722610038, "grad_norm": 1.2881745100021362, "learning_rate": 7.195566350914094e-05, "loss": 1.4471, "step": 9710 }, { "epoch": 1.1427731648174002, "grad_norm": 1.1081621646881104, "learning_rate": 7.189807856915689e-05, "loss": 1.3928, "step": 9720 }, { "epoch": 1.1439488573737968, "grad_norm": 1.105178713798523, "learning_rate": 7.184045766733317e-05, "loss": 1.4677, "step": 9730 }, { "epoch": 1.1451245499301932, "grad_norm": 1.1361236572265625, "learning_rate": 7.178280089829698e-05, "loss": 1.4853, "step": 9740 }, { "epoch": 1.1463002424865898, "grad_norm": 1.0730022192001343, "learning_rate": 7.172510835673439e-05, "loss": 1.4898, "step": 9750 }, { "epoch": 1.1474759350429862, "grad_norm": 1.283571481704712, "learning_rate": 7.166738013739025e-05, "loss": 1.5105, "step": 9760 }, { "epoch": 1.1486516275993828, "grad_norm": 1.201699137687683, "learning_rate": 7.1609616335068e-05, "loss": 1.5137, "step": 9770 }, { "epoch": 1.1498273201557794, "grad_norm": 1.320861577987671, "learning_rate": 7.15518170446295e-05, "loss": 1.4468, "step": 9780 }, { "epoch": 1.1510030127121758, "grad_norm": 1.3528327941894531, "learning_rate": 7.149398236099489e-05, "loss": 1.5218, "step": 9790 }, { "epoch": 1.1521787052685724, "grad_norm": 1.2877815961837769, "learning_rate": 7.143611237914246e-05, "loss": 1.4254, "step": 9800 }, { "epoch": 1.1533543978249687, "grad_norm": 1.4409797191619873, "learning_rate": 7.137820719410842e-05, "loss": 1.4901, "step": 9810 }, { "epoch": 1.1545300903813653, "grad_norm": 1.1461670398712158, "learning_rate": 7.132026690098683e-05, "loss": 1.4607, "step": 9820 }, { "epoch": 1.1557057829377617, "grad_norm": 1.239444375038147, "learning_rate": 7.126229159492942e-05, "loss": 1.4685, "step": 9830 }, { "epoch": 1.1568814754941583, "grad_norm": 1.1030553579330444, "learning_rate": 7.120428137114535e-05, "loss": 1.492, "step": 9840 }, { "epoch": 1.1580571680505547, "grad_norm": 1.0500524044036865, "learning_rate": 7.11462363249012e-05, "loss": 1.4897, "step": 9850 }, { "epoch": 1.1592328606069513, "grad_norm": 1.0450153350830078, "learning_rate": 7.108815655152071e-05, "loss": 1.4317, "step": 9860 }, { "epoch": 1.160408553163348, "grad_norm": 1.3527547121047974, "learning_rate": 7.103004214638464e-05, "loss": 1.4412, "step": 9870 }, { "epoch": 1.1615842457197443, "grad_norm": 0.9922232031822205, "learning_rate": 7.09718932049306e-05, "loss": 1.4898, "step": 9880 }, { "epoch": 1.1627599382761409, "grad_norm": 1.2802048921585083, "learning_rate": 7.091370982265298e-05, "loss": 1.4775, "step": 9890 }, { "epoch": 1.1639356308325373, "grad_norm": 1.0011050701141357, "learning_rate": 7.085549209510269e-05, "loss": 1.4635, "step": 9900 }, { "epoch": 1.1651113233889339, "grad_norm": 1.0631232261657715, "learning_rate": 7.079724011788703e-05, "loss": 1.4763, "step": 9910 }, { "epoch": 1.1662870159453302, "grad_norm": 1.2858209609985352, "learning_rate": 7.073895398666958e-05, "loss": 1.4718, "step": 9920 }, { "epoch": 1.1674627085017268, "grad_norm": 1.6219468116760254, "learning_rate": 7.068063379717003e-05, "loss": 1.5238, "step": 9930 }, { "epoch": 1.1686384010581232, "grad_norm": 1.3149330615997314, "learning_rate": 7.06222796451639e-05, "loss": 1.4878, "step": 9940 }, { "epoch": 1.1698140936145198, "grad_norm": 1.2998390197753906, "learning_rate": 7.056389162648258e-05, "loss": 1.4949, "step": 9950 }, { "epoch": 1.1709897861709164, "grad_norm": 1.1299713850021362, "learning_rate": 7.050546983701305e-05, "loss": 1.5355, "step": 9960 }, { "epoch": 1.1721654787273128, "grad_norm": 1.2159680128097534, "learning_rate": 7.044701437269775e-05, "loss": 1.4276, "step": 9970 }, { "epoch": 1.1733411712837094, "grad_norm": 1.2199612855911255, "learning_rate": 7.03885253295344e-05, "loss": 1.4403, "step": 9980 }, { "epoch": 1.1745168638401058, "grad_norm": 1.3903744220733643, "learning_rate": 7.03300028035759e-05, "loss": 1.4396, "step": 9990 }, { "epoch": 1.1756925563965024, "grad_norm": 1.079207181930542, "learning_rate": 7.027144689093012e-05, "loss": 1.4273, "step": 10000 }, { "epoch": 1.1756925563965024, "eval_loss": 1.6991071701049805, "eval_runtime": 1918.5024, "eval_samples_per_second": 31.527, "eval_steps_per_second": 3.941, "step": 10000 }, { "epoch": 1.1768682489528988, "grad_norm": 1.3104126453399658, "learning_rate": 7.021285768775976e-05, "loss": 1.4519, "step": 10010 }, { "epoch": 1.1780439415092954, "grad_norm": 1.2057169675827026, "learning_rate": 7.015423529028218e-05, "loss": 1.4107, "step": 10020 }, { "epoch": 1.1792196340656917, "grad_norm": 1.2831695079803467, "learning_rate": 7.009557979476927e-05, "loss": 1.4476, "step": 10030 }, { "epoch": 1.1803953266220883, "grad_norm": 1.2142335176467896, "learning_rate": 7.003689129754727e-05, "loss": 1.4016, "step": 10040 }, { "epoch": 1.181571019178485, "grad_norm": 1.2298212051391602, "learning_rate": 6.997816989499663e-05, "loss": 1.4267, "step": 10050 }, { "epoch": 1.1827467117348813, "grad_norm": 0.9501388669013977, "learning_rate": 6.991941568355178e-05, "loss": 1.4507, "step": 10060 }, { "epoch": 1.1839224042912777, "grad_norm": 1.112906813621521, "learning_rate": 6.986062875970111e-05, "loss": 1.4564, "step": 10070 }, { "epoch": 1.1850980968476743, "grad_norm": 1.3455804586410522, "learning_rate": 6.980180921998668e-05, "loss": 1.4679, "step": 10080 }, { "epoch": 1.186273789404071, "grad_norm": 1.301292896270752, "learning_rate": 6.974295716100414e-05, "loss": 1.4587, "step": 10090 }, { "epoch": 1.1874494819604673, "grad_norm": 1.274287462234497, "learning_rate": 6.968407267940251e-05, "loss": 1.4595, "step": 10100 }, { "epoch": 1.188625174516864, "grad_norm": 1.212729573249817, "learning_rate": 6.962515587188408e-05, "loss": 1.5006, "step": 10110 }, { "epoch": 1.1898008670732603, "grad_norm": 1.2274562120437622, "learning_rate": 6.956620683520426e-05, "loss": 1.4945, "step": 10120 }, { "epoch": 1.1909765596296569, "grad_norm": 1.3652760982513428, "learning_rate": 6.950722566617131e-05, "loss": 1.451, "step": 10130 }, { "epoch": 1.1921522521860533, "grad_norm": 0.9665593504905701, "learning_rate": 6.94482124616463e-05, "loss": 1.3902, "step": 10140 }, { "epoch": 1.1933279447424499, "grad_norm": 1.181997299194336, "learning_rate": 6.938916731854294e-05, "loss": 1.4632, "step": 10150 }, { "epoch": 1.1945036372988462, "grad_norm": 1.0824170112609863, "learning_rate": 6.933009033382736e-05, "loss": 1.4964, "step": 10160 }, { "epoch": 1.1956793298552428, "grad_norm": 1.5983858108520508, "learning_rate": 6.927098160451794e-05, "loss": 1.5201, "step": 10170 }, { "epoch": 1.1968550224116394, "grad_norm": 1.0041390657424927, "learning_rate": 6.921184122768531e-05, "loss": 1.4789, "step": 10180 }, { "epoch": 1.1980307149680358, "grad_norm": 1.1514314413070679, "learning_rate": 6.915266930045194e-05, "loss": 1.4356, "step": 10190 }, { "epoch": 1.1992064075244324, "grad_norm": 1.3452905416488647, "learning_rate": 6.909346591999217e-05, "loss": 1.469, "step": 10200 }, { "epoch": 1.2003821000808288, "grad_norm": 1.3640415668487549, "learning_rate": 6.903423118353202e-05, "loss": 1.502, "step": 10210 }, { "epoch": 1.2015577926372254, "grad_norm": 1.3941380977630615, "learning_rate": 6.897496518834898e-05, "loss": 1.4887, "step": 10220 }, { "epoch": 1.2027334851936218, "grad_norm": 1.6356010437011719, "learning_rate": 6.891566803177185e-05, "loss": 1.454, "step": 10230 }, { "epoch": 1.2039091777500184, "grad_norm": 1.5237528085708618, "learning_rate": 6.885633981118065e-05, "loss": 1.4277, "step": 10240 }, { "epoch": 1.2050848703064148, "grad_norm": 1.2682058811187744, "learning_rate": 6.879698062400638e-05, "loss": 1.4504, "step": 10250 }, { "epoch": 1.2062605628628114, "grad_norm": 1.3118491172790527, "learning_rate": 6.873759056773091e-05, "loss": 1.4674, "step": 10260 }, { "epoch": 1.207436255419208, "grad_norm": 1.111470103263855, "learning_rate": 6.86781697398868e-05, "loss": 1.4846, "step": 10270 }, { "epoch": 1.2086119479756043, "grad_norm": 1.1193264722824097, "learning_rate": 6.861871823805715e-05, "loss": 1.4905, "step": 10280 }, { "epoch": 1.209787640532001, "grad_norm": 1.2230027914047241, "learning_rate": 6.85592361598754e-05, "loss": 1.4432, "step": 10290 }, { "epoch": 1.2109633330883973, "grad_norm": 1.174304485321045, "learning_rate": 6.849972360302527e-05, "loss": 1.5097, "step": 10300 }, { "epoch": 1.212139025644794, "grad_norm": 1.1213397979736328, "learning_rate": 6.844018066524051e-05, "loss": 1.472, "step": 10310 }, { "epoch": 1.2133147182011903, "grad_norm": 1.2787407636642456, "learning_rate": 6.83806074443047e-05, "loss": 1.5078, "step": 10320 }, { "epoch": 1.214490410757587, "grad_norm": 1.0966283082962036, "learning_rate": 6.832100403805122e-05, "loss": 1.473, "step": 10330 }, { "epoch": 1.2156661033139833, "grad_norm": 1.2529081106185913, "learning_rate": 6.826137054436302e-05, "loss": 1.4288, "step": 10340 }, { "epoch": 1.2168417958703799, "grad_norm": 1.033864140510559, "learning_rate": 6.820170706117242e-05, "loss": 1.4527, "step": 10350 }, { "epoch": 1.2180174884267765, "grad_norm": 1.1423323154449463, "learning_rate": 6.814201368646102e-05, "loss": 1.4526, "step": 10360 }, { "epoch": 1.2191931809831729, "grad_norm": 1.4255110025405884, "learning_rate": 6.808229051825949e-05, "loss": 1.4532, "step": 10370 }, { "epoch": 1.2203688735395695, "grad_norm": 1.1847636699676514, "learning_rate": 6.802253765464747e-05, "loss": 1.4748, "step": 10380 }, { "epoch": 1.2215445660959658, "grad_norm": 1.1389960050582886, "learning_rate": 6.79627551937533e-05, "loss": 1.3775, "step": 10390 }, { "epoch": 1.2227202586523624, "grad_norm": 1.369521975517273, "learning_rate": 6.790294323375399e-05, "loss": 1.4233, "step": 10400 }, { "epoch": 1.2238959512087588, "grad_norm": 1.3316912651062012, "learning_rate": 6.784310187287496e-05, "loss": 1.4663, "step": 10410 }, { "epoch": 1.2250716437651554, "grad_norm": 1.4464448690414429, "learning_rate": 6.778323120938992e-05, "loss": 1.4919, "step": 10420 }, { "epoch": 1.2262473363215518, "grad_norm": 1.1009833812713623, "learning_rate": 6.77233313416207e-05, "loss": 1.4645, "step": 10430 }, { "epoch": 1.2274230288779484, "grad_norm": 1.3995048999786377, "learning_rate": 6.766340236793708e-05, "loss": 1.4584, "step": 10440 }, { "epoch": 1.228598721434345, "grad_norm": 1.3022229671478271, "learning_rate": 6.760344438675668e-05, "loss": 1.4824, "step": 10450 }, { "epoch": 1.2297744139907414, "grad_norm": 0.9838978052139282, "learning_rate": 6.754345749654469e-05, "loss": 1.4457, "step": 10460 }, { "epoch": 1.230950106547138, "grad_norm": 1.2882027626037598, "learning_rate": 6.748344179581386e-05, "loss": 1.5321, "step": 10470 }, { "epoch": 1.2321257991035344, "grad_norm": 1.3871498107910156, "learning_rate": 6.742339738312418e-05, "loss": 1.4626, "step": 10480 }, { "epoch": 1.233301491659931, "grad_norm": 1.148050308227539, "learning_rate": 6.73633243570828e-05, "loss": 1.4569, "step": 10490 }, { "epoch": 1.2344771842163274, "grad_norm": 1.4487998485565186, "learning_rate": 6.730322281634392e-05, "loss": 1.4464, "step": 10500 }, { "epoch": 1.2344771842163274, "eval_loss": 1.6839934587478638, "eval_runtime": 1918.5445, "eval_samples_per_second": 31.527, "eval_steps_per_second": 3.941, "step": 10500 }, { "epoch": 1.235652876772724, "grad_norm": 1.195565104484558, "learning_rate": 6.72430928596085e-05, "loss": 1.4493, "step": 10510 }, { "epoch": 1.2368285693291203, "grad_norm": 1.1839338541030884, "learning_rate": 6.718293458562422e-05, "loss": 1.3946, "step": 10520 }, { "epoch": 1.238004261885517, "grad_norm": 1.2473584413528442, "learning_rate": 6.712274809318519e-05, "loss": 1.3872, "step": 10530 }, { "epoch": 1.2391799544419135, "grad_norm": 1.3128076791763306, "learning_rate": 6.706253348113194e-05, "loss": 1.4931, "step": 10540 }, { "epoch": 1.24035564699831, "grad_norm": 1.1814830303192139, "learning_rate": 6.700229084835116e-05, "loss": 1.4388, "step": 10550 }, { "epoch": 1.2415313395547065, "grad_norm": 1.48375403881073, "learning_rate": 6.694202029377551e-05, "loss": 1.4934, "step": 10560 }, { "epoch": 1.242707032111103, "grad_norm": 1.175952672958374, "learning_rate": 6.688172191638358e-05, "loss": 1.4523, "step": 10570 }, { "epoch": 1.2438827246674995, "grad_norm": 1.3456496000289917, "learning_rate": 6.682139581519956e-05, "loss": 1.4988, "step": 10580 }, { "epoch": 1.2450584172238959, "grad_norm": 1.3549906015396118, "learning_rate": 6.676104208929325e-05, "loss": 1.4312, "step": 10590 }, { "epoch": 1.2462341097802925, "grad_norm": 1.263985514640808, "learning_rate": 6.67006608377798e-05, "loss": 1.4649, "step": 10600 }, { "epoch": 1.2474098023366889, "grad_norm": 1.335316777229309, "learning_rate": 6.66402521598195e-05, "loss": 1.5259, "step": 10610 }, { "epoch": 1.2485854948930855, "grad_norm": 1.5303469896316528, "learning_rate": 6.657981615461777e-05, "loss": 1.5071, "step": 10620 }, { "epoch": 1.249761187449482, "grad_norm": 1.1267995834350586, "learning_rate": 6.651935292142485e-05, "loss": 1.4798, "step": 10630 }, { "epoch": 1.2509368800058784, "grad_norm": 1.2747951745986938, "learning_rate": 6.645886255953571e-05, "loss": 1.4585, "step": 10640 }, { "epoch": 1.252112572562275, "grad_norm": 1.234013319015503, "learning_rate": 6.639834516828989e-05, "loss": 1.4886, "step": 10650 }, { "epoch": 1.2532882651186714, "grad_norm": 0.9654020071029663, "learning_rate": 6.633780084707127e-05, "loss": 1.4835, "step": 10660 }, { "epoch": 1.254463957675068, "grad_norm": 1.1089553833007812, "learning_rate": 6.627722969530802e-05, "loss": 1.452, "step": 10670 }, { "epoch": 1.2556396502314644, "grad_norm": 1.5390143394470215, "learning_rate": 6.621663181247234e-05, "loss": 1.4417, "step": 10680 }, { "epoch": 1.256815342787861, "grad_norm": 1.300284743309021, "learning_rate": 6.615600729808031e-05, "loss": 1.4539, "step": 10690 }, { "epoch": 1.2579910353442574, "grad_norm": 1.2501838207244873, "learning_rate": 6.609535625169177e-05, "loss": 1.4369, "step": 10700 }, { "epoch": 1.259166727900654, "grad_norm": 1.476755142211914, "learning_rate": 6.603467877291014e-05, "loss": 1.469, "step": 10710 }, { "epoch": 1.2603424204570506, "grad_norm": 1.3323252201080322, "learning_rate": 6.597397496138222e-05, "loss": 1.4684, "step": 10720 }, { "epoch": 1.261518113013447, "grad_norm": 1.6422585248947144, "learning_rate": 6.591324491679807e-05, "loss": 1.4577, "step": 10730 }, { "epoch": 1.2626938055698436, "grad_norm": 1.1776736974716187, "learning_rate": 6.585248873889081e-05, "loss": 1.4315, "step": 10740 }, { "epoch": 1.26386949812624, "grad_norm": 1.3768630027770996, "learning_rate": 6.579170652743653e-05, "loss": 1.4495, "step": 10750 }, { "epoch": 1.2650451906826365, "grad_norm": 1.0895339250564575, "learning_rate": 6.573089838225404e-05, "loss": 1.4124, "step": 10760 }, { "epoch": 1.266220883239033, "grad_norm": 1.213144063949585, "learning_rate": 6.56700644032047e-05, "loss": 1.4655, "step": 10770 }, { "epoch": 1.2673965757954295, "grad_norm": 1.2948362827301025, "learning_rate": 6.560920469019237e-05, "loss": 1.4097, "step": 10780 }, { "epoch": 1.268572268351826, "grad_norm": 1.6014686822891235, "learning_rate": 6.554831934316314e-05, "loss": 1.499, "step": 10790 }, { "epoch": 1.2697479609082225, "grad_norm": 1.3647328615188599, "learning_rate": 6.548740846210515e-05, "loss": 1.5025, "step": 10800 }, { "epoch": 1.270923653464619, "grad_norm": 1.193146824836731, "learning_rate": 6.542647214704858e-05, "loss": 1.4346, "step": 10810 }, { "epoch": 1.2720993460210155, "grad_norm": 1.3852726221084595, "learning_rate": 6.536551049806527e-05, "loss": 1.4349, "step": 10820 }, { "epoch": 1.273275038577412, "grad_norm": 0.9971378445625305, "learning_rate": 6.53045236152687e-05, "loss": 1.4668, "step": 10830 }, { "epoch": 1.2744507311338085, "grad_norm": 1.0527534484863281, "learning_rate": 6.524351159881383e-05, "loss": 1.4606, "step": 10840 }, { "epoch": 1.275626423690205, "grad_norm": 1.3037880659103394, "learning_rate": 6.518247454889686e-05, "loss": 1.4789, "step": 10850 }, { "epoch": 1.2768021162466014, "grad_norm": 1.0713800191879272, "learning_rate": 6.51214125657551e-05, "loss": 1.4595, "step": 10860 }, { "epoch": 1.277977808802998, "grad_norm": 1.161318302154541, "learning_rate": 6.506032574966679e-05, "loss": 1.4833, "step": 10870 }, { "epoch": 1.2791535013593944, "grad_norm": 1.1459256410598755, "learning_rate": 6.499921420095101e-05, "loss": 1.4716, "step": 10880 }, { "epoch": 1.280329193915791, "grad_norm": 1.479053020477295, "learning_rate": 6.493807801996738e-05, "loss": 1.5157, "step": 10890 }, { "epoch": 1.2815048864721876, "grad_norm": 1.3093435764312744, "learning_rate": 6.487691730711604e-05, "loss": 1.3668, "step": 10900 }, { "epoch": 1.282680579028584, "grad_norm": 1.3011060953140259, "learning_rate": 6.481573216283739e-05, "loss": 1.3903, "step": 10910 }, { "epoch": 1.2838562715849804, "grad_norm": 1.1963512897491455, "learning_rate": 6.475452268761194e-05, "loss": 1.3942, "step": 10920 }, { "epoch": 1.285031964141377, "grad_norm": 1.1473850011825562, "learning_rate": 6.469328898196017e-05, "loss": 1.4782, "step": 10930 }, { "epoch": 1.2862076566977736, "grad_norm": 1.371085524559021, "learning_rate": 6.463203114644236e-05, "loss": 1.4722, "step": 10940 }, { "epoch": 1.28738334925417, "grad_norm": 1.4423400163650513, "learning_rate": 6.45707492816584e-05, "loss": 1.4568, "step": 10950 }, { "epoch": 1.2885590418105666, "grad_norm": 1.3598648309707642, "learning_rate": 6.450944348824765e-05, "loss": 1.4447, "step": 10960 }, { "epoch": 1.289734734366963, "grad_norm": 1.419168472290039, "learning_rate": 6.444811386688875e-05, "loss": 1.517, "step": 10970 }, { "epoch": 1.2909104269233596, "grad_norm": 1.3234745264053345, "learning_rate": 6.438676051829953e-05, "loss": 1.5122, "step": 10980 }, { "epoch": 1.2920861194797562, "grad_norm": 1.2817275524139404, "learning_rate": 6.43253835432367e-05, "loss": 1.4985, "step": 10990 }, { "epoch": 1.2932618120361525, "grad_norm": 1.22225821018219, "learning_rate": 6.426398304249581e-05, "loss": 1.4484, "step": 11000 }, { "epoch": 1.2932618120361525, "eval_loss": 1.674931526184082, "eval_runtime": 1919.0508, "eval_samples_per_second": 31.518, "eval_steps_per_second": 3.94, "step": 11000 }, { "epoch": 1.294437504592549, "grad_norm": 1.293758511543274, "learning_rate": 6.420255911691109e-05, "loss": 1.4678, "step": 11010 }, { "epoch": 1.2956131971489455, "grad_norm": 1.4137052297592163, "learning_rate": 6.414111186735516e-05, "loss": 1.3933, "step": 11020 }, { "epoch": 1.2967888897053421, "grad_norm": 1.2664361000061035, "learning_rate": 6.4079641394739e-05, "loss": 1.5151, "step": 11030 }, { "epoch": 1.2979645822617385, "grad_norm": 1.4995366334915161, "learning_rate": 6.401814780001169e-05, "loss": 1.4321, "step": 11040 }, { "epoch": 1.299140274818135, "grad_norm": 1.298086166381836, "learning_rate": 6.395663118416031e-05, "loss": 1.4222, "step": 11050 }, { "epoch": 1.3003159673745315, "grad_norm": 1.3930251598358154, "learning_rate": 6.389509164820974e-05, "loss": 1.4986, "step": 11060 }, { "epoch": 1.301491659930928, "grad_norm": 1.3640013933181763, "learning_rate": 6.38335292932225e-05, "loss": 1.4143, "step": 11070 }, { "epoch": 1.3026673524873247, "grad_norm": 1.234474778175354, "learning_rate": 6.377194422029857e-05, "loss": 1.4893, "step": 11080 }, { "epoch": 1.303843045043721, "grad_norm": 1.2913360595703125, "learning_rate": 6.371033653057524e-05, "loss": 1.514, "step": 11090 }, { "epoch": 1.3050187376001174, "grad_norm": 0.9980594515800476, "learning_rate": 6.3648706325227e-05, "loss": 1.4093, "step": 11100 }, { "epoch": 1.306194430156514, "grad_norm": 1.5111900568008423, "learning_rate": 6.358705370546519e-05, "loss": 1.4129, "step": 11110 }, { "epoch": 1.3073701227129106, "grad_norm": 1.3089832067489624, "learning_rate": 6.352537877253809e-05, "loss": 1.4147, "step": 11120 }, { "epoch": 1.308545815269307, "grad_norm": 1.1320339441299438, "learning_rate": 6.346368162773055e-05, "loss": 1.4309, "step": 11130 }, { "epoch": 1.3097215078257036, "grad_norm": 1.3170768022537231, "learning_rate": 6.340196237236395e-05, "loss": 1.4922, "step": 11140 }, { "epoch": 1.3108972003821, "grad_norm": 1.2673604488372803, "learning_rate": 6.334022110779592e-05, "loss": 1.457, "step": 11150 }, { "epoch": 1.3120728929384966, "grad_norm": 1.2616866827011108, "learning_rate": 6.327845793542024e-05, "loss": 1.4533, "step": 11160 }, { "epoch": 1.3132485854948932, "grad_norm": 1.099026083946228, "learning_rate": 6.321667295666674e-05, "loss": 1.5088, "step": 11170 }, { "epoch": 1.3144242780512896, "grad_norm": 1.660454273223877, "learning_rate": 6.315486627300098e-05, "loss": 1.4372, "step": 11180 }, { "epoch": 1.315599970607686, "grad_norm": 1.4206299781799316, "learning_rate": 6.30930379859242e-05, "loss": 1.4076, "step": 11190 }, { "epoch": 1.3167756631640826, "grad_norm": 1.2144578695297241, "learning_rate": 6.303118819697313e-05, "loss": 1.4991, "step": 11200 }, { "epoch": 1.3179513557204792, "grad_norm": 1.1893341541290283, "learning_rate": 6.296931700771978e-05, "loss": 1.4055, "step": 11210 }, { "epoch": 1.3191270482768755, "grad_norm": 1.1036959886550903, "learning_rate": 6.290742451977133e-05, "loss": 1.4593, "step": 11220 }, { "epoch": 1.3203027408332721, "grad_norm": 1.5781663656234741, "learning_rate": 6.284551083476992e-05, "loss": 1.4709, "step": 11230 }, { "epoch": 1.3214784333896685, "grad_norm": 1.3911752700805664, "learning_rate": 6.278357605439251e-05, "loss": 1.4351, "step": 11240 }, { "epoch": 1.3226541259460651, "grad_norm": 1.2755422592163086, "learning_rate": 6.27216202803507e-05, "loss": 1.3866, "step": 11250 }, { "epoch": 1.3238298185024617, "grad_norm": 1.1078753471374512, "learning_rate": 6.265964361439057e-05, "loss": 1.4816, "step": 11260 }, { "epoch": 1.325005511058858, "grad_norm": 1.1804518699645996, "learning_rate": 6.25976461582925e-05, "loss": 1.4443, "step": 11270 }, { "epoch": 1.3261812036152545, "grad_norm": 1.1190757751464844, "learning_rate": 6.253562801387103e-05, "loss": 1.5221, "step": 11280 }, { "epoch": 1.327356896171651, "grad_norm": 1.1433650255203247, "learning_rate": 6.247358928297467e-05, "loss": 1.4739, "step": 11290 }, { "epoch": 1.3285325887280477, "grad_norm": 1.370600938796997, "learning_rate": 6.241153006748571e-05, "loss": 1.4822, "step": 11300 }, { "epoch": 1.329708281284444, "grad_norm": 1.267935872077942, "learning_rate": 6.234945046932014e-05, "loss": 1.4748, "step": 11310 }, { "epoch": 1.3308839738408407, "grad_norm": 1.2385363578796387, "learning_rate": 6.228735059042736e-05, "loss": 1.4708, "step": 11320 }, { "epoch": 1.332059666397237, "grad_norm": 1.1453253030776978, "learning_rate": 6.222523053279009e-05, "loss": 1.485, "step": 11330 }, { "epoch": 1.3332353589536337, "grad_norm": 1.083616018295288, "learning_rate": 6.216309039842423e-05, "loss": 1.4594, "step": 11340 }, { "epoch": 1.3344110515100303, "grad_norm": 1.3208450078964233, "learning_rate": 6.210093028937862e-05, "loss": 1.4629, "step": 11350 }, { "epoch": 1.3355867440664266, "grad_norm": 1.1633281707763672, "learning_rate": 6.203875030773488e-05, "loss": 1.4643, "step": 11360 }, { "epoch": 1.336762436622823, "grad_norm": 1.466296672821045, "learning_rate": 6.197655055560732e-05, "loss": 1.4578, "step": 11370 }, { "epoch": 1.3379381291792196, "grad_norm": 1.0865249633789062, "learning_rate": 6.191433113514264e-05, "loss": 1.4083, "step": 11380 }, { "epoch": 1.3391138217356162, "grad_norm": 1.2550734281539917, "learning_rate": 6.185209214851995e-05, "loss": 1.4403, "step": 11390 }, { "epoch": 1.3402895142920126, "grad_norm": 1.1243187189102173, "learning_rate": 6.178983369795041e-05, "loss": 1.4725, "step": 11400 }, { "epoch": 1.3414652068484092, "grad_norm": 1.043884038925171, "learning_rate": 6.172755588567713e-05, "loss": 1.4738, "step": 11410 }, { "epoch": 1.3426408994048056, "grad_norm": 1.3856217861175537, "learning_rate": 6.16652588139751e-05, "loss": 1.3903, "step": 11420 }, { "epoch": 1.3438165919612022, "grad_norm": 1.5508580207824707, "learning_rate": 6.160294258515085e-05, "loss": 1.3903, "step": 11430 }, { "epoch": 1.3449922845175988, "grad_norm": 1.3118549585342407, "learning_rate": 6.154060730154243e-05, "loss": 1.4299, "step": 11440 }, { "epoch": 1.3461679770739952, "grad_norm": 1.2776038646697998, "learning_rate": 6.147825306551917e-05, "loss": 1.4072, "step": 11450 }, { "epoch": 1.3473436696303915, "grad_norm": 1.3047503232955933, "learning_rate": 6.141587997948149e-05, "loss": 1.4465, "step": 11460 }, { "epoch": 1.3485193621867881, "grad_norm": 1.2295664548873901, "learning_rate": 6.135348814586083e-05, "loss": 1.4288, "step": 11470 }, { "epoch": 1.3496950547431847, "grad_norm": 1.128860354423523, "learning_rate": 6.129107766711935e-05, "loss": 1.4378, "step": 11480 }, { "epoch": 1.3508707472995811, "grad_norm": 1.0664775371551514, "learning_rate": 6.122864864574988e-05, "loss": 1.4547, "step": 11490 }, { "epoch": 1.3520464398559777, "grad_norm": 1.1868221759796143, "learning_rate": 6.116620118427568e-05, "loss": 1.4752, "step": 11500 }, { "epoch": 1.3520464398559777, "eval_loss": 1.6665728092193604, "eval_runtime": 1918.1325, "eval_samples_per_second": 31.533, "eval_steps_per_second": 3.942, "step": 11500 }, { "epoch": 1.353222132412374, "grad_norm": 1.1563061475753784, "learning_rate": 6.110373538525027e-05, "loss": 1.4158, "step": 11510 }, { "epoch": 1.3543978249687707, "grad_norm": 1.3682633638381958, "learning_rate": 6.104125135125735e-05, "loss": 1.4663, "step": 11520 }, { "epoch": 1.3555735175251673, "grad_norm": 1.6941479444503784, "learning_rate": 6.097874918491051e-05, "loss": 1.4481, "step": 11530 }, { "epoch": 1.3567492100815637, "grad_norm": 1.0947091579437256, "learning_rate": 6.091622898885313e-05, "loss": 1.4722, "step": 11540 }, { "epoch": 1.35792490263796, "grad_norm": 1.3617501258850098, "learning_rate": 6.085369086575819e-05, "loss": 1.4272, "step": 11550 }, { "epoch": 1.3591005951943567, "grad_norm": 1.665128469467163, "learning_rate": 6.079113491832815e-05, "loss": 1.4917, "step": 11560 }, { "epoch": 1.3602762877507533, "grad_norm": 1.1929265260696411, "learning_rate": 6.0728561249294705e-05, "loss": 1.4803, "step": 11570 }, { "epoch": 1.3614519803071496, "grad_norm": 1.5296154022216797, "learning_rate": 6.066596996141867e-05, "loss": 1.4793, "step": 11580 }, { "epoch": 1.3626276728635462, "grad_norm": 1.159460186958313, "learning_rate": 6.060336115748977e-05, "loss": 1.5065, "step": 11590 }, { "epoch": 1.3638033654199426, "grad_norm": 1.1856430768966675, "learning_rate": 6.054073494032651e-05, "loss": 1.451, "step": 11600 }, { "epoch": 1.3649790579763392, "grad_norm": 0.9621641635894775, "learning_rate": 6.047809141277603e-05, "loss": 1.4998, "step": 11610 }, { "epoch": 1.3661547505327356, "grad_norm": 1.3030149936676025, "learning_rate": 6.0415430677713825e-05, "loss": 1.4096, "step": 11620 }, { "epoch": 1.3673304430891322, "grad_norm": 1.409255027770996, "learning_rate": 6.03527528380437e-05, "loss": 1.4665, "step": 11630 }, { "epoch": 1.3685061356455286, "grad_norm": 1.2834994792938232, "learning_rate": 6.029005799669753e-05, "loss": 1.4477, "step": 11640 }, { "epoch": 1.3696818282019252, "grad_norm": 1.2183235883712769, "learning_rate": 6.022734625663515e-05, "loss": 1.4808, "step": 11650 }, { "epoch": 1.3708575207583218, "grad_norm": 1.3459337949752808, "learning_rate": 6.0164617720844076e-05, "loss": 1.4034, "step": 11660 }, { "epoch": 1.3720332133147182, "grad_norm": 1.0359379053115845, "learning_rate": 6.010187249233944e-05, "loss": 1.4123, "step": 11670 }, { "epoch": 1.3732089058711148, "grad_norm": 1.105823040008545, "learning_rate": 6.003911067416382e-05, "loss": 1.4142, "step": 11680 }, { "epoch": 1.3743845984275112, "grad_norm": 1.409528374671936, "learning_rate": 5.9976332369387e-05, "loss": 1.4484, "step": 11690 }, { "epoch": 1.3755602909839078, "grad_norm": 1.4041920900344849, "learning_rate": 5.991353768110585e-05, "loss": 1.5151, "step": 11700 }, { "epoch": 1.3767359835403041, "grad_norm": 1.5800915956497192, "learning_rate": 5.9850726712444125e-05, "loss": 1.4609, "step": 11710 }, { "epoch": 1.3779116760967007, "grad_norm": 1.1779450178146362, "learning_rate": 5.9787899566552354e-05, "loss": 1.4108, "step": 11720 }, { "epoch": 1.3790873686530971, "grad_norm": 1.2733160257339478, "learning_rate": 5.9725056346607625e-05, "loss": 1.436, "step": 11730 }, { "epoch": 1.3802630612094937, "grad_norm": 0.9557298421859741, "learning_rate": 5.9662197155813396e-05, "loss": 1.4546, "step": 11740 }, { "epoch": 1.3814387537658903, "grad_norm": 1.3464919328689575, "learning_rate": 5.959932209739936e-05, "loss": 1.4681, "step": 11750 }, { "epoch": 1.3826144463222867, "grad_norm": 1.1906040906906128, "learning_rate": 5.9536431274621295e-05, "loss": 1.3746, "step": 11760 }, { "epoch": 1.3837901388786833, "grad_norm": 1.4017455577850342, "learning_rate": 5.947352479076086e-05, "loss": 1.4689, "step": 11770 }, { "epoch": 1.3849658314350797, "grad_norm": 1.3524692058563232, "learning_rate": 5.94106027491254e-05, "loss": 1.4071, "step": 11780 }, { "epoch": 1.3861415239914763, "grad_norm": 1.391163945198059, "learning_rate": 5.934766525304783e-05, "loss": 1.4777, "step": 11790 }, { "epoch": 1.3873172165478727, "grad_norm": 1.556793451309204, "learning_rate": 5.928471240588649e-05, "loss": 1.402, "step": 11800 }, { "epoch": 1.3884929091042693, "grad_norm": 1.3402925729751587, "learning_rate": 5.922174431102484e-05, "loss": 1.4918, "step": 11810 }, { "epoch": 1.3896686016606656, "grad_norm": 1.1938923597335815, "learning_rate": 5.915876107187146e-05, "loss": 1.4668, "step": 11820 }, { "epoch": 1.3908442942170622, "grad_norm": 1.3402364253997803, "learning_rate": 5.9095762791859776e-05, "loss": 1.4595, "step": 11830 }, { "epoch": 1.3920199867734588, "grad_norm": 1.0470637083053589, "learning_rate": 5.903274957444787e-05, "loss": 1.5226, "step": 11840 }, { "epoch": 1.3931956793298552, "grad_norm": 1.2324721813201904, "learning_rate": 5.8969721523118424e-05, "loss": 1.4428, "step": 11850 }, { "epoch": 1.3943713718862518, "grad_norm": 1.6477957963943481, "learning_rate": 5.890667874137844e-05, "loss": 1.4112, "step": 11860 }, { "epoch": 1.3955470644426482, "grad_norm": 1.1557326316833496, "learning_rate": 5.8843621332759134e-05, "loss": 1.4408, "step": 11870 }, { "epoch": 1.3967227569990448, "grad_norm": 1.331052541732788, "learning_rate": 5.87805494008157e-05, "loss": 1.4226, "step": 11880 }, { "epoch": 1.3978984495554412, "grad_norm": 1.1075398921966553, "learning_rate": 5.871746304912724e-05, "loss": 1.4674, "step": 11890 }, { "epoch": 1.3990741421118378, "grad_norm": 1.351260781288147, "learning_rate": 5.8654362381296504e-05, "loss": 1.5033, "step": 11900 }, { "epoch": 1.4002498346682342, "grad_norm": 1.279270887374878, "learning_rate": 5.859124750094974e-05, "loss": 1.434, "step": 11910 }, { "epoch": 1.4014255272246308, "grad_norm": 1.2027990818023682, "learning_rate": 5.852811851173656e-05, "loss": 1.4063, "step": 11920 }, { "epoch": 1.4026012197810274, "grad_norm": 1.2059873342514038, "learning_rate": 5.846497551732976e-05, "loss": 1.4314, "step": 11930 }, { "epoch": 1.4037769123374237, "grad_norm": 1.2089678049087524, "learning_rate": 5.8401818621425095e-05, "loss": 1.4911, "step": 11940 }, { "epoch": 1.4049526048938203, "grad_norm": 1.4122209548950195, "learning_rate": 5.833864792774117e-05, "loss": 1.5013, "step": 11950 }, { "epoch": 1.4061282974502167, "grad_norm": 1.4281269311904907, "learning_rate": 5.827546354001927e-05, "loss": 1.4459, "step": 11960 }, { "epoch": 1.4073039900066133, "grad_norm": 1.2282943725585938, "learning_rate": 5.821226556202313e-05, "loss": 1.4415, "step": 11970 }, { "epoch": 1.4084796825630097, "grad_norm": 1.310762882232666, "learning_rate": 5.8149054097538815e-05, "loss": 1.4285, "step": 11980 }, { "epoch": 1.4096553751194063, "grad_norm": 1.2515947818756104, "learning_rate": 5.808582925037457e-05, "loss": 1.4647, "step": 11990 }, { "epoch": 1.4108310676758027, "grad_norm": 1.4609304666519165, "learning_rate": 5.802259112436056e-05, "loss": 1.4023, "step": 12000 }, { "epoch": 1.4108310676758027, "eval_loss": 1.6601688861846924, "eval_runtime": 1916.742, "eval_samples_per_second": 31.556, "eval_steps_per_second": 3.945, "step": 12000 }, { "epoch": 1.4120067602321993, "grad_norm": 1.4450361728668213, "learning_rate": 5.7959339823348814e-05, "loss": 1.4991, "step": 12010 }, { "epoch": 1.4131824527885959, "grad_norm": 1.3296563625335693, "learning_rate": 5.789607545121296e-05, "loss": 1.4561, "step": 12020 }, { "epoch": 1.4143581453449923, "grad_norm": 1.4998587369918823, "learning_rate": 5.7832798111848127e-05, "loss": 1.4341, "step": 12030 }, { "epoch": 1.4155338379013889, "grad_norm": 1.5106745958328247, "learning_rate": 5.7769507909170683e-05, "loss": 1.4553, "step": 12040 }, { "epoch": 1.4167095304577852, "grad_norm": 1.7369840145111084, "learning_rate": 5.770620494711818e-05, "loss": 1.484, "step": 12050 }, { "epoch": 1.4178852230141819, "grad_norm": 1.1620473861694336, "learning_rate": 5.7642889329649075e-05, "loss": 1.4106, "step": 12060 }, { "epoch": 1.4190609155705782, "grad_norm": 1.253677487373352, "learning_rate": 5.7579561160742655e-05, "loss": 1.4272, "step": 12070 }, { "epoch": 1.4202366081269748, "grad_norm": 1.0931650400161743, "learning_rate": 5.75162205443988e-05, "loss": 1.4735, "step": 12080 }, { "epoch": 1.4214123006833712, "grad_norm": 0.9987673759460449, "learning_rate": 5.74528675846378e-05, "loss": 1.4319, "step": 12090 }, { "epoch": 1.4225879932397678, "grad_norm": 1.2928924560546875, "learning_rate": 5.738950238550026e-05, "loss": 1.4317, "step": 12100 }, { "epoch": 1.4237636857961644, "grad_norm": 1.093039631843567, "learning_rate": 5.732612505104686e-05, "loss": 1.463, "step": 12110 }, { "epoch": 1.4249393783525608, "grad_norm": 1.1787341833114624, "learning_rate": 5.726273568535825e-05, "loss": 1.4351, "step": 12120 }, { "epoch": 1.4261150709089572, "grad_norm": 1.2007547616958618, "learning_rate": 5.719933439253475e-05, "loss": 1.4439, "step": 12130 }, { "epoch": 1.4272907634653538, "grad_norm": 1.4807531833648682, "learning_rate": 5.713592127669637e-05, "loss": 1.4445, "step": 12140 }, { "epoch": 1.4284664560217504, "grad_norm": 1.3713806867599487, "learning_rate": 5.707249644198247e-05, "loss": 1.4376, "step": 12150 }, { "epoch": 1.4296421485781468, "grad_norm": 1.2078429460525513, "learning_rate": 5.700905999255167e-05, "loss": 1.4544, "step": 12160 }, { "epoch": 1.4308178411345434, "grad_norm": 1.3264617919921875, "learning_rate": 5.694561203258168e-05, "loss": 1.4136, "step": 12170 }, { "epoch": 1.4319935336909397, "grad_norm": 1.4408338069915771, "learning_rate": 5.6882152666269094e-05, "loss": 1.4352, "step": 12180 }, { "epoch": 1.4331692262473363, "grad_norm": 1.0763658285140991, "learning_rate": 5.681868199782925e-05, "loss": 1.4596, "step": 12190 }, { "epoch": 1.434344918803733, "grad_norm": 1.1817296743392944, "learning_rate": 5.6755200131496026e-05, "loss": 1.4521, "step": 12200 }, { "epoch": 1.4355206113601293, "grad_norm": 1.1701550483703613, "learning_rate": 5.6691707171521736e-05, "loss": 1.4751, "step": 12210 }, { "epoch": 1.4366963039165257, "grad_norm": 1.2752610445022583, "learning_rate": 5.662820322217686e-05, "loss": 1.4286, "step": 12220 }, { "epoch": 1.4378719964729223, "grad_norm": 1.3754462003707886, "learning_rate": 5.656468838774994e-05, "loss": 1.4016, "step": 12230 }, { "epoch": 1.439047689029319, "grad_norm": 1.0563693046569824, "learning_rate": 5.650116277254742e-05, "loss": 1.4815, "step": 12240 }, { "epoch": 1.4402233815857153, "grad_norm": 1.4206534624099731, "learning_rate": 5.643762648089344e-05, "loss": 1.4579, "step": 12250 }, { "epoch": 1.4413990741421119, "grad_norm": 1.3142942190170288, "learning_rate": 5.637407961712964e-05, "loss": 1.388, "step": 12260 }, { "epoch": 1.4425747666985083, "grad_norm": 1.2211089134216309, "learning_rate": 5.631052228561503e-05, "loss": 1.3801, "step": 12270 }, { "epoch": 1.4437504592549049, "grad_norm": 1.4033881425857544, "learning_rate": 5.624695459072587e-05, "loss": 1.5123, "step": 12280 }, { "epoch": 1.4449261518113015, "grad_norm": 0.9911127686500549, "learning_rate": 5.618337663685536e-05, "loss": 1.4775, "step": 12290 }, { "epoch": 1.4461018443676978, "grad_norm": 1.2543293237686157, "learning_rate": 5.6119788528413594e-05, "loss": 1.4974, "step": 12300 }, { "epoch": 1.4472775369240942, "grad_norm": 1.4553948640823364, "learning_rate": 5.6056190369827324e-05, "loss": 1.4235, "step": 12310 }, { "epoch": 1.4484532294804908, "grad_norm": 1.2525317668914795, "learning_rate": 5.5992582265539815e-05, "loss": 1.452, "step": 12320 }, { "epoch": 1.4496289220368874, "grad_norm": 1.0818679332733154, "learning_rate": 5.592896432001068e-05, "loss": 1.4649, "step": 12330 }, { "epoch": 1.4508046145932838, "grad_norm": 1.367707371711731, "learning_rate": 5.586533663771567e-05, "loss": 1.3661, "step": 12340 }, { "epoch": 1.4519803071496804, "grad_norm": 1.3952076435089111, "learning_rate": 5.580169932314651e-05, "loss": 1.463, "step": 12350 }, { "epoch": 1.4531559997060768, "grad_norm": 1.116111397743225, "learning_rate": 5.573805248081079e-05, "loss": 1.4032, "step": 12360 }, { "epoch": 1.4543316922624734, "grad_norm": 1.4246304035186768, "learning_rate": 5.5674396215231715e-05, "loss": 1.4259, "step": 12370 }, { "epoch": 1.45550738481887, "grad_norm": 1.1432337760925293, "learning_rate": 5.561073063094798e-05, "loss": 1.3955, "step": 12380 }, { "epoch": 1.4566830773752664, "grad_norm": 1.2595884799957275, "learning_rate": 5.554705583251356e-05, "loss": 1.424, "step": 12390 }, { "epoch": 1.4578587699316627, "grad_norm": 1.2963684797286987, "learning_rate": 5.5483371924497606e-05, "loss": 1.41, "step": 12400 }, { "epoch": 1.4590344624880593, "grad_norm": 1.1225638389587402, "learning_rate": 5.541967901148419e-05, "loss": 1.3897, "step": 12410 }, { "epoch": 1.460210155044456, "grad_norm": 1.175368070602417, "learning_rate": 5.535597719807217e-05, "loss": 1.4107, "step": 12420 }, { "epoch": 1.4613858476008523, "grad_norm": 1.010724425315857, "learning_rate": 5.529226658887505e-05, "loss": 1.4256, "step": 12430 }, { "epoch": 1.462561540157249, "grad_norm": 1.1452133655548096, "learning_rate": 5.522854728852076e-05, "loss": 1.4667, "step": 12440 }, { "epoch": 1.4637372327136453, "grad_norm": 1.2505030632019043, "learning_rate": 5.5164819401651514e-05, "loss": 1.4229, "step": 12450 }, { "epoch": 1.464912925270042, "grad_norm": 1.3987067937850952, "learning_rate": 5.510108303292361e-05, "loss": 1.4023, "step": 12460 }, { "epoch": 1.4660886178264385, "grad_norm": 1.2496756315231323, "learning_rate": 5.503733828700729e-05, "loss": 1.3907, "step": 12470 }, { "epoch": 1.467264310382835, "grad_norm": 1.5330246686935425, "learning_rate": 5.4973585268586535e-05, "loss": 1.4322, "step": 12480 }, { "epoch": 1.4684400029392313, "grad_norm": 1.1474759578704834, "learning_rate": 5.490982408235894e-05, "loss": 1.4457, "step": 12490 }, { "epoch": 1.4696156954956279, "grad_norm": 1.3515642881393433, "learning_rate": 5.484605483303551e-05, "loss": 1.3717, "step": 12500 }, { "epoch": 1.4696156954956279, "eval_loss": 1.646672248840332, "eval_runtime": 1917.4327, "eval_samples_per_second": 31.545, "eval_steps_per_second": 3.943, "step": 12500 }, { "epoch": 1.4707913880520245, "grad_norm": 1.155239462852478, "learning_rate": 5.478227762534045e-05, "loss": 1.4463, "step": 12510 }, { "epoch": 1.4719670806084209, "grad_norm": 1.2111297845840454, "learning_rate": 5.4718492564011084e-05, "loss": 1.4597, "step": 12520 }, { "epoch": 1.4731427731648175, "grad_norm": 1.464544653892517, "learning_rate": 5.46546997537976e-05, "loss": 1.4587, "step": 12530 }, { "epoch": 1.4743184657212138, "grad_norm": 1.3224107027053833, "learning_rate": 5.459089929946295e-05, "loss": 1.4237, "step": 12540 }, { "epoch": 1.4754941582776104, "grad_norm": 1.4278076887130737, "learning_rate": 5.452709130578257e-05, "loss": 1.5183, "step": 12550 }, { "epoch": 1.476669850834007, "grad_norm": 1.2120144367218018, "learning_rate": 5.446327587754437e-05, "loss": 1.3733, "step": 12560 }, { "epoch": 1.4778455433904034, "grad_norm": 1.237852692604065, "learning_rate": 5.439945311954839e-05, "loss": 1.4372, "step": 12570 }, { "epoch": 1.4790212359467998, "grad_norm": 1.1000795364379883, "learning_rate": 5.433562313660676e-05, "loss": 1.4137, "step": 12580 }, { "epoch": 1.4801969285031964, "grad_norm": 0.9907048344612122, "learning_rate": 5.427178603354346e-05, "loss": 1.422, "step": 12590 }, { "epoch": 1.481372621059593, "grad_norm": 1.5363129377365112, "learning_rate": 5.4207941915194114e-05, "loss": 1.4037, "step": 12600 }, { "epoch": 1.4825483136159894, "grad_norm": 1.21452796459198, "learning_rate": 5.4144090886405954e-05, "loss": 1.4564, "step": 12610 }, { "epoch": 1.483724006172386, "grad_norm": 1.4411814212799072, "learning_rate": 5.408023305203748e-05, "loss": 1.4798, "step": 12620 }, { "epoch": 1.4848996987287824, "grad_norm": 1.097219467163086, "learning_rate": 5.401636851695844e-05, "loss": 1.3747, "step": 12630 }, { "epoch": 1.486075391285179, "grad_norm": 1.4491606950759888, "learning_rate": 5.395249738604953e-05, "loss": 1.3934, "step": 12640 }, { "epoch": 1.4872510838415756, "grad_norm": 1.401915192604065, "learning_rate": 5.388861976420228e-05, "loss": 1.4416, "step": 12650 }, { "epoch": 1.488426776397972, "grad_norm": 1.5187549591064453, "learning_rate": 5.382473575631895e-05, "loss": 1.4636, "step": 12660 }, { "epoch": 1.4896024689543683, "grad_norm": 1.331545352935791, "learning_rate": 5.3760845467312195e-05, "loss": 1.4657, "step": 12670 }, { "epoch": 1.490778161510765, "grad_norm": 1.2393823862075806, "learning_rate": 5.3696949002105024e-05, "loss": 1.4123, "step": 12680 }, { "epoch": 1.4919538540671615, "grad_norm": 1.2795801162719727, "learning_rate": 5.363304646563061e-05, "loss": 1.3864, "step": 12690 }, { "epoch": 1.493129546623558, "grad_norm": 1.2784656286239624, "learning_rate": 5.356913796283207e-05, "loss": 1.4393, "step": 12700 }, { "epoch": 1.4943052391799545, "grad_norm": 1.167173981666565, "learning_rate": 5.350522359866232e-05, "loss": 1.4008, "step": 12710 }, { "epoch": 1.4954809317363509, "grad_norm": 1.1122100353240967, "learning_rate": 5.344130347808393e-05, "loss": 1.4324, "step": 12720 }, { "epoch": 1.4966566242927475, "grad_norm": 1.106329083442688, "learning_rate": 5.337737770606886e-05, "loss": 1.4171, "step": 12730 }, { "epoch": 1.497832316849144, "grad_norm": 1.2726444005966187, "learning_rate": 5.331344638759843e-05, "loss": 1.4391, "step": 12740 }, { "epoch": 1.4990080094055405, "grad_norm": 1.2554010152816772, "learning_rate": 5.324950962766302e-05, "loss": 1.4296, "step": 12750 }, { "epoch": 1.5001837019619368, "grad_norm": 1.0687004327774048, "learning_rate": 5.3185567531261935e-05, "loss": 1.4171, "step": 12760 }, { "epoch": 1.5013593945183334, "grad_norm": 1.1617909669876099, "learning_rate": 5.31216202034033e-05, "loss": 1.4091, "step": 12770 }, { "epoch": 1.50253508707473, "grad_norm": 1.3394794464111328, "learning_rate": 5.305766774910377e-05, "loss": 1.464, "step": 12780 }, { "epoch": 1.5037107796311264, "grad_norm": 1.4066137075424194, "learning_rate": 5.299371027338848e-05, "loss": 1.4306, "step": 12790 }, { "epoch": 1.5048864721875228, "grad_norm": 1.081097960472107, "learning_rate": 5.2929747881290725e-05, "loss": 1.4276, "step": 12800 }, { "epoch": 1.5060621647439194, "grad_norm": 1.2471295595169067, "learning_rate": 5.286578067785197e-05, "loss": 1.4046, "step": 12810 }, { "epoch": 1.507237857300316, "grad_norm": 1.6198123693466187, "learning_rate": 5.280180876812151e-05, "loss": 1.3819, "step": 12820 }, { "epoch": 1.5084135498567126, "grad_norm": 1.501482605934143, "learning_rate": 5.2737832257156414e-05, "loss": 1.476, "step": 12830 }, { "epoch": 1.509589242413109, "grad_norm": 1.1524819135665894, "learning_rate": 5.2673851250021287e-05, "loss": 1.4394, "step": 12840 }, { "epoch": 1.5107649349695054, "grad_norm": 1.1725422143936157, "learning_rate": 5.260986585178812e-05, "loss": 1.4179, "step": 12850 }, { "epoch": 1.511940627525902, "grad_norm": 1.5252307653427124, "learning_rate": 5.254587616753611e-05, "loss": 1.4291, "step": 12860 }, { "epoch": 1.5131163200822986, "grad_norm": 1.2233588695526123, "learning_rate": 5.2481882302351494e-05, "loss": 1.4464, "step": 12870 }, { "epoch": 1.514292012638695, "grad_norm": 1.0808680057525635, "learning_rate": 5.2417884361327404e-05, "loss": 1.4087, "step": 12880 }, { "epoch": 1.5154677051950913, "grad_norm": 1.2674560546875, "learning_rate": 5.235388244956361e-05, "loss": 1.4205, "step": 12890 }, { "epoch": 1.516643397751488, "grad_norm": 1.2737966775894165, "learning_rate": 5.228987667216644e-05, "loss": 1.3931, "step": 12900 }, { "epoch": 1.5178190903078845, "grad_norm": 1.552759051322937, "learning_rate": 5.222586713424858e-05, "loss": 1.4183, "step": 12910 }, { "epoch": 1.5189947828642811, "grad_norm": 1.1709100008010864, "learning_rate": 5.2161853940928864e-05, "loss": 1.4093, "step": 12920 }, { "epoch": 1.5201704754206775, "grad_norm": 1.1601662635803223, "learning_rate": 5.209783719733214e-05, "loss": 1.4307, "step": 12930 }, { "epoch": 1.521346167977074, "grad_norm": 1.6590371131896973, "learning_rate": 5.203381700858909e-05, "loss": 1.4555, "step": 12940 }, { "epoch": 1.5225218605334705, "grad_norm": 1.902848720550537, "learning_rate": 5.1969793479836046e-05, "loss": 1.4058, "step": 12950 }, { "epoch": 1.523697553089867, "grad_norm": 1.176027536392212, "learning_rate": 5.1905766716214835e-05, "loss": 1.4355, "step": 12960 }, { "epoch": 1.5248732456462635, "grad_norm": 1.24137282371521, "learning_rate": 5.18417368228726e-05, "loss": 1.4329, "step": 12970 }, { "epoch": 1.5260489382026599, "grad_norm": 1.3687187433242798, "learning_rate": 5.1777703904961604e-05, "loss": 1.3904, "step": 12980 }, { "epoch": 1.5272246307590565, "grad_norm": 1.2047253847122192, "learning_rate": 5.1713668067639076e-05, "loss": 1.4351, "step": 12990 }, { "epoch": 1.528400323315453, "grad_norm": 1.194820761680603, "learning_rate": 5.1649629416067083e-05, "loss": 1.411, "step": 13000 }, { "epoch": 1.528400323315453, "eval_loss": 1.6375658512115479, "eval_runtime": 1917.3224, "eval_samples_per_second": 31.547, "eval_steps_per_second": 3.944, "step": 13000 }, { "epoch": 1.5295760158718497, "grad_norm": 1.2510570287704468, "learning_rate": 5.158558805541226e-05, "loss": 1.4265, "step": 13010 }, { "epoch": 1.530751708428246, "grad_norm": 1.1159077882766724, "learning_rate": 5.1521544090845705e-05, "loss": 1.4121, "step": 13020 }, { "epoch": 1.5319274009846424, "grad_norm": 1.0746954679489136, "learning_rate": 5.1457497627542816e-05, "loss": 1.4422, "step": 13030 }, { "epoch": 1.533103093541039, "grad_norm": 1.6446961164474487, "learning_rate": 5.13934487706831e-05, "loss": 1.4665, "step": 13040 }, { "epoch": 1.5342787860974356, "grad_norm": 1.2507566213607788, "learning_rate": 5.1329397625449916e-05, "loss": 1.3899, "step": 13050 }, { "epoch": 1.535454478653832, "grad_norm": 1.1560121774673462, "learning_rate": 5.126534429703048e-05, "loss": 1.4337, "step": 13060 }, { "epoch": 1.5366301712102284, "grad_norm": 1.3064507246017456, "learning_rate": 5.120128889061554e-05, "loss": 1.4738, "step": 13070 }, { "epoch": 1.537805863766625, "grad_norm": 1.1583871841430664, "learning_rate": 5.1137231511399286e-05, "loss": 1.4864, "step": 13080 }, { "epoch": 1.5389815563230216, "grad_norm": 1.4249876737594604, "learning_rate": 5.1073172264579104e-05, "loss": 1.4316, "step": 13090 }, { "epoch": 1.5401572488794182, "grad_norm": 1.1145589351654053, "learning_rate": 5.10091112553555e-05, "loss": 1.3653, "step": 13100 }, { "epoch": 1.5413329414358146, "grad_norm": 0.9300178289413452, "learning_rate": 5.094504858893182e-05, "loss": 1.407, "step": 13110 }, { "epoch": 1.542508633992211, "grad_norm": 1.3214335441589355, "learning_rate": 5.088098437051417e-05, "loss": 1.4354, "step": 13120 }, { "epoch": 1.5436843265486075, "grad_norm": 1.2193971872329712, "learning_rate": 5.081691870531121e-05, "loss": 1.4078, "step": 13130 }, { "epoch": 1.5448600191050041, "grad_norm": 1.2514294385910034, "learning_rate": 5.075285169853394e-05, "loss": 1.3903, "step": 13140 }, { "epoch": 1.5460357116614005, "grad_norm": 1.391670823097229, "learning_rate": 5.06887834553956e-05, "loss": 1.4311, "step": 13150 }, { "epoch": 1.547211404217797, "grad_norm": 1.1714516878128052, "learning_rate": 5.0624714081111416e-05, "loss": 1.4408, "step": 13160 }, { "epoch": 1.5483870967741935, "grad_norm": 1.2090966701507568, "learning_rate": 5.056064368089854e-05, "loss": 1.4257, "step": 13170 }, { "epoch": 1.54956278933059, "grad_norm": 1.1011195182800293, "learning_rate": 5.0496572359975725e-05, "loss": 1.3263, "step": 13180 }, { "epoch": 1.5507384818869867, "grad_norm": 1.082807183265686, "learning_rate": 5.0432500223563296e-05, "loss": 1.4079, "step": 13190 }, { "epoch": 1.551914174443383, "grad_norm": 1.0877431631088257, "learning_rate": 5.0368427376882896e-05, "loss": 1.4298, "step": 13200 }, { "epoch": 1.5530898669997795, "grad_norm": 1.080137014389038, "learning_rate": 5.030435392515737e-05, "loss": 1.4605, "step": 13210 }, { "epoch": 1.554265559556176, "grad_norm": 1.2629669904708862, "learning_rate": 5.024027997361047e-05, "loss": 1.3773, "step": 13220 }, { "epoch": 1.5554412521125727, "grad_norm": 1.40390944480896, "learning_rate": 5.0176205627466886e-05, "loss": 1.4027, "step": 13230 }, { "epoch": 1.556616944668969, "grad_norm": 1.2147927284240723, "learning_rate": 5.011213099195185e-05, "loss": 1.3934, "step": 13240 }, { "epoch": 1.5577926372253654, "grad_norm": 1.1927661895751953, "learning_rate": 5.0048056172291115e-05, "loss": 1.4182, "step": 13250 }, { "epoch": 1.558968329781762, "grad_norm": 1.3597666025161743, "learning_rate": 4.998398127371075e-05, "loss": 1.4473, "step": 13260 }, { "epoch": 1.5601440223381586, "grad_norm": 1.3652582168579102, "learning_rate": 4.991990640143693e-05, "loss": 1.3582, "step": 13270 }, { "epoch": 1.5613197148945552, "grad_norm": 1.2156258821487427, "learning_rate": 4.9855831660695805e-05, "loss": 1.4187, "step": 13280 }, { "epoch": 1.5624954074509516, "grad_norm": 1.1029983758926392, "learning_rate": 4.97917571567133e-05, "loss": 1.4148, "step": 13290 }, { "epoch": 1.563671100007348, "grad_norm": 1.2351984977722168, "learning_rate": 4.9727682994714936e-05, "loss": 1.3714, "step": 13300 }, { "epoch": 1.5648467925637446, "grad_norm": 1.4573674201965332, "learning_rate": 4.966360927992569e-05, "loss": 1.4044, "step": 13310 }, { "epoch": 1.5660224851201412, "grad_norm": 1.2415186166763306, "learning_rate": 4.95995361175698e-05, "loss": 1.3859, "step": 13320 }, { "epoch": 1.5671981776765376, "grad_norm": 1.2496088743209839, "learning_rate": 4.9535463612870634e-05, "loss": 1.3686, "step": 13330 }, { "epoch": 1.568373870232934, "grad_norm": 1.3265630006790161, "learning_rate": 4.9471391871050394e-05, "loss": 1.459, "step": 13340 }, { "epoch": 1.5695495627893306, "grad_norm": 1.350673794746399, "learning_rate": 4.9407320997330086e-05, "loss": 1.4068, "step": 13350 }, { "epoch": 1.5707252553457272, "grad_norm": 1.5469472408294678, "learning_rate": 4.9343251096929306e-05, "loss": 1.4055, "step": 13360 }, { "epoch": 1.5719009479021238, "grad_norm": 1.0151551961898804, "learning_rate": 4.927918227506602e-05, "loss": 1.3698, "step": 13370 }, { "epoch": 1.5730766404585201, "grad_norm": 1.6176893711090088, "learning_rate": 4.921511463695643e-05, "loss": 1.4155, "step": 13380 }, { "epoch": 1.5742523330149165, "grad_norm": 1.2751104831695557, "learning_rate": 4.915104828781479e-05, "loss": 1.4171, "step": 13390 }, { "epoch": 1.5754280255713131, "grad_norm": 1.172865390777588, "learning_rate": 4.9086983332853245e-05, "loss": 1.4412, "step": 13400 }, { "epoch": 1.5766037181277097, "grad_norm": 1.361809253692627, "learning_rate": 4.9022919877281666e-05, "loss": 1.4486, "step": 13410 }, { "epoch": 1.577779410684106, "grad_norm": 1.204079031944275, "learning_rate": 4.895885802630743e-05, "loss": 1.4076, "step": 13420 }, { "epoch": 1.5789551032405025, "grad_norm": 1.3683494329452515, "learning_rate": 4.889479788513529e-05, "loss": 1.3991, "step": 13430 }, { "epoch": 1.580130795796899, "grad_norm": 1.1117258071899414, "learning_rate": 4.88307395589672e-05, "loss": 1.3806, "step": 13440 }, { "epoch": 1.5813064883532957, "grad_norm": 1.2629603147506714, "learning_rate": 4.876668315300212e-05, "loss": 1.4295, "step": 13450 }, { "epoch": 1.5824821809096923, "grad_norm": 1.5813183784484863, "learning_rate": 4.8702628772435874e-05, "loss": 1.4528, "step": 13460 }, { "epoch": 1.5836578734660887, "grad_norm": 1.210327386856079, "learning_rate": 4.863857652246095e-05, "loss": 1.4075, "step": 13470 }, { "epoch": 1.584833566022485, "grad_norm": 1.15656316280365, "learning_rate": 4.8574526508266326e-05, "loss": 1.4262, "step": 13480 }, { "epoch": 1.5860092585788816, "grad_norm": 1.123010516166687, "learning_rate": 4.851047883503733e-05, "loss": 1.3975, "step": 13490 }, { "epoch": 1.5871849511352782, "grad_norm": 1.2387384176254272, "learning_rate": 4.8446433607955426e-05, "loss": 1.41, "step": 13500 }, { "epoch": 1.5871849511352782, "eval_loss": 1.629834771156311, "eval_runtime": 1916.8333, "eval_samples_per_second": 31.555, "eval_steps_per_second": 3.945, "step": 13500 }, { "epoch": 1.5883606436916746, "grad_norm": 1.094812273979187, "learning_rate": 4.838239093219808e-05, "loss": 1.4601, "step": 13510 }, { "epoch": 1.589536336248071, "grad_norm": 1.2237190008163452, "learning_rate": 4.8318350912938546e-05, "loss": 1.4445, "step": 13520 }, { "epoch": 1.5907120288044676, "grad_norm": 1.2271169424057007, "learning_rate": 4.825431365534574e-05, "loss": 1.4092, "step": 13530 }, { "epoch": 1.5918877213608642, "grad_norm": 1.3426430225372314, "learning_rate": 4.8190279264584024e-05, "loss": 1.3471, "step": 13540 }, { "epoch": 1.5930634139172606, "grad_norm": 1.1924805641174316, "learning_rate": 4.812624784581305e-05, "loss": 1.4388, "step": 13550 }, { "epoch": 1.5942391064736572, "grad_norm": 1.1941908597946167, "learning_rate": 4.8062219504187614e-05, "loss": 1.4511, "step": 13560 }, { "epoch": 1.5954147990300536, "grad_norm": 1.376799464225769, "learning_rate": 4.7998194344857424e-05, "loss": 1.4132, "step": 13570 }, { "epoch": 1.5965904915864502, "grad_norm": 1.132213830947876, "learning_rate": 4.793417247296701e-05, "loss": 1.3702, "step": 13580 }, { "epoch": 1.5977661841428468, "grad_norm": 1.3068040609359741, "learning_rate": 4.787015399365543e-05, "loss": 1.3687, "step": 13590 }, { "epoch": 1.5989418766992431, "grad_norm": 1.3216030597686768, "learning_rate": 4.780613901205623e-05, "loss": 1.4332, "step": 13600 }, { "epoch": 1.6001175692556395, "grad_norm": 1.6191517114639282, "learning_rate": 4.77421276332972e-05, "loss": 1.3745, "step": 13610 }, { "epoch": 1.6012932618120361, "grad_norm": 1.5119205713272095, "learning_rate": 4.767811996250022e-05, "loss": 1.3606, "step": 13620 }, { "epoch": 1.6024689543684327, "grad_norm": 1.4570772647857666, "learning_rate": 4.761411610478105e-05, "loss": 1.5055, "step": 13630 }, { "epoch": 1.603644646924829, "grad_norm": 1.0544387102127075, "learning_rate": 4.755011616524922e-05, "loss": 1.4011, "step": 13640 }, { "epoch": 1.6048203394812257, "grad_norm": 1.092721700668335, "learning_rate": 4.748612024900779e-05, "loss": 1.405, "step": 13650 }, { "epoch": 1.605996032037622, "grad_norm": 1.6766278743743896, "learning_rate": 4.742212846115326e-05, "loss": 1.4366, "step": 13660 }, { "epoch": 1.6071717245940187, "grad_norm": 1.2171212434768677, "learning_rate": 4.7358140906775325e-05, "loss": 1.3771, "step": 13670 }, { "epoch": 1.6083474171504153, "grad_norm": 1.2568343877792358, "learning_rate": 4.729415769095673e-05, "loss": 1.3925, "step": 13680 }, { "epoch": 1.6095231097068117, "grad_norm": 1.5166115760803223, "learning_rate": 4.7230178918773074e-05, "loss": 1.377, "step": 13690 }, { "epoch": 1.610698802263208, "grad_norm": 1.038155436515808, "learning_rate": 4.71662046952927e-05, "loss": 1.4313, "step": 13700 }, { "epoch": 1.6118744948196047, "grad_norm": 1.516806960105896, "learning_rate": 4.7102235125576436e-05, "loss": 1.3933, "step": 13710 }, { "epoch": 1.6130501873760013, "grad_norm": 1.0455366373062134, "learning_rate": 4.703827031467751e-05, "loss": 1.4509, "step": 13720 }, { "epoch": 1.6142258799323976, "grad_norm": 1.2189064025878906, "learning_rate": 4.69743103676413e-05, "loss": 1.3947, "step": 13730 }, { "epoch": 1.6154015724887942, "grad_norm": 1.3343465328216553, "learning_rate": 4.691035538950524e-05, "loss": 1.3902, "step": 13740 }, { "epoch": 1.6165772650451906, "grad_norm": 1.101352334022522, "learning_rate": 4.684640548529854e-05, "loss": 1.4073, "step": 13750 }, { "epoch": 1.6177529576015872, "grad_norm": 1.2120623588562012, "learning_rate": 4.678246076004213e-05, "loss": 1.3969, "step": 13760 }, { "epoch": 1.6189286501579838, "grad_norm": 1.154954195022583, "learning_rate": 4.671852131874841e-05, "loss": 1.4308, "step": 13770 }, { "epoch": 1.6201043427143802, "grad_norm": 1.1759564876556396, "learning_rate": 4.6654587266421125e-05, "loss": 1.4002, "step": 13780 }, { "epoch": 1.6212800352707766, "grad_norm": 1.3416194915771484, "learning_rate": 4.659065870805515e-05, "loss": 1.3405, "step": 13790 }, { "epoch": 1.6224557278271732, "grad_norm": 1.9740723371505737, "learning_rate": 4.652673574863633e-05, "loss": 1.4173, "step": 13800 }, { "epoch": 1.6236314203835698, "grad_norm": 1.0842951536178589, "learning_rate": 4.646281849314134e-05, "loss": 1.3755, "step": 13810 }, { "epoch": 1.6248071129399662, "grad_norm": 1.0973657369613647, "learning_rate": 4.6398907046537474e-05, "loss": 1.4044, "step": 13820 }, { "epoch": 1.6259828054963625, "grad_norm": 1.3992210626602173, "learning_rate": 4.633500151378251e-05, "loss": 1.4427, "step": 13830 }, { "epoch": 1.6271584980527591, "grad_norm": 1.4417972564697266, "learning_rate": 4.6271101999824444e-05, "loss": 1.4117, "step": 13840 }, { "epoch": 1.6283341906091557, "grad_norm": 1.002854585647583, "learning_rate": 4.620720860960147e-05, "loss": 1.4405, "step": 13850 }, { "epoch": 1.6295098831655523, "grad_norm": 1.159307837486267, "learning_rate": 4.614332144804168e-05, "loss": 1.4118, "step": 13860 }, { "epoch": 1.6306855757219487, "grad_norm": 1.396193265914917, "learning_rate": 4.6079440620062955e-05, "loss": 1.4059, "step": 13870 }, { "epoch": 1.631861268278345, "grad_norm": 1.3227903842926025, "learning_rate": 4.601556623057278e-05, "loss": 1.3746, "step": 13880 }, { "epoch": 1.6330369608347417, "grad_norm": 1.0722737312316895, "learning_rate": 4.595169838446802e-05, "loss": 1.3836, "step": 13890 }, { "epoch": 1.6342126533911383, "grad_norm": 1.3737506866455078, "learning_rate": 4.588783718663486e-05, "loss": 1.4101, "step": 13900 }, { "epoch": 1.6353883459475347, "grad_norm": 1.0628955364227295, "learning_rate": 4.582398274194852e-05, "loss": 1.3642, "step": 13910 }, { "epoch": 1.636564038503931, "grad_norm": 1.390060305595398, "learning_rate": 4.576013515527315e-05, "loss": 1.3528, "step": 13920 }, { "epoch": 1.6377397310603277, "grad_norm": 1.076391339302063, "learning_rate": 4.5696294531461656e-05, "loss": 1.4224, "step": 13930 }, { "epoch": 1.6389154236167243, "grad_norm": 1.5216970443725586, "learning_rate": 4.563246097535545e-05, "loss": 1.3597, "step": 13940 }, { "epoch": 1.6400911161731209, "grad_norm": 1.2652708292007446, "learning_rate": 4.556863459178438e-05, "loss": 1.4225, "step": 13950 }, { "epoch": 1.6412668087295172, "grad_norm": 1.2204153537750244, "learning_rate": 4.550481548556653e-05, "loss": 1.3917, "step": 13960 }, { "epoch": 1.6424425012859136, "grad_norm": 1.0641429424285889, "learning_rate": 4.5441003761507986e-05, "loss": 1.4181, "step": 13970 }, { "epoch": 1.6436181938423102, "grad_norm": 1.185088872909546, "learning_rate": 4.537719952440276e-05, "loss": 1.4224, "step": 13980 }, { "epoch": 1.6447938863987068, "grad_norm": 1.2390542030334473, "learning_rate": 4.531340287903255e-05, "loss": 1.4155, "step": 13990 }, { "epoch": 1.6459695789551032, "grad_norm": 1.2120683193206787, "learning_rate": 4.5249613930166544e-05, "loss": 1.4263, "step": 14000 }, { "epoch": 1.6459695789551032, "eval_loss": 1.6193097829818726, "eval_runtime": 1918.5098, "eval_samples_per_second": 31.527, "eval_steps_per_second": 3.941, "step": 14000 }, { "epoch": 1.6471452715114996, "grad_norm": 1.3035722970962524, "learning_rate": 4.518583278256136e-05, "loss": 1.4196, "step": 14010 }, { "epoch": 1.6483209640678962, "grad_norm": 1.300072431564331, "learning_rate": 4.512205954096076e-05, "loss": 1.4153, "step": 14020 }, { "epoch": 1.6494966566242928, "grad_norm": 1.1177761554718018, "learning_rate": 4.505829431009553e-05, "loss": 1.3734, "step": 14030 }, { "epoch": 1.6506723491806894, "grad_norm": 1.5174025297164917, "learning_rate": 4.499453719468332e-05, "loss": 1.4519, "step": 14040 }, { "epoch": 1.6518480417370858, "grad_norm": 1.3092105388641357, "learning_rate": 4.493078829942844e-05, "loss": 1.3711, "step": 14050 }, { "epoch": 1.6530237342934821, "grad_norm": 1.0056978464126587, "learning_rate": 4.486704772902168e-05, "loss": 1.3982, "step": 14060 }, { "epoch": 1.6541994268498788, "grad_norm": 1.303484320640564, "learning_rate": 4.480331558814018e-05, "loss": 1.4381, "step": 14070 }, { "epoch": 1.6553751194062754, "grad_norm": 1.0847115516662598, "learning_rate": 4.4739591981447254e-05, "loss": 1.3864, "step": 14080 }, { "epoch": 1.6565508119626717, "grad_norm": 1.0375665426254272, "learning_rate": 4.467587701359218e-05, "loss": 1.358, "step": 14090 }, { "epoch": 1.6577265045190681, "grad_norm": 1.1457220315933228, "learning_rate": 4.461217078921002e-05, "loss": 1.3669, "step": 14100 }, { "epoch": 1.6589021970754647, "grad_norm": 1.2553225755691528, "learning_rate": 4.454847341292152e-05, "loss": 1.4028, "step": 14110 }, { "epoch": 1.6600778896318613, "grad_norm": 1.2484334707260132, "learning_rate": 4.448478498933289e-05, "loss": 1.4256, "step": 14120 }, { "epoch": 1.661253582188258, "grad_norm": 1.0624066591262817, "learning_rate": 4.442110562303563e-05, "loss": 1.3932, "step": 14130 }, { "epoch": 1.6624292747446543, "grad_norm": 0.9936274290084839, "learning_rate": 4.435743541860636e-05, "loss": 1.454, "step": 14140 }, { "epoch": 1.6636049673010507, "grad_norm": 1.1188641786575317, "learning_rate": 4.429377448060665e-05, "loss": 1.3933, "step": 14150 }, { "epoch": 1.6647806598574473, "grad_norm": 1.802079677581787, "learning_rate": 4.423012291358288e-05, "loss": 1.4061, "step": 14160 }, { "epoch": 1.6659563524138439, "grad_norm": 1.048601746559143, "learning_rate": 4.416648082206602e-05, "loss": 1.4571, "step": 14170 }, { "epoch": 1.6671320449702403, "grad_norm": 1.2052475214004517, "learning_rate": 4.410284831057146e-05, "loss": 1.3607, "step": 14180 }, { "epoch": 1.6683077375266366, "grad_norm": 1.31851327419281, "learning_rate": 4.403922548359892e-05, "loss": 1.4223, "step": 14190 }, { "epoch": 1.6694834300830332, "grad_norm": 1.2235910892486572, "learning_rate": 4.397561244563213e-05, "loss": 1.3972, "step": 14200 }, { "epoch": 1.6706591226394298, "grad_norm": 1.219156265258789, "learning_rate": 4.3912009301138814e-05, "loss": 1.3798, "step": 14210 }, { "epoch": 1.6718348151958264, "grad_norm": 1.4245223999023438, "learning_rate": 4.384841615457041e-05, "loss": 1.4394, "step": 14220 }, { "epoch": 1.6730105077522228, "grad_norm": 1.6273034811019897, "learning_rate": 4.378483311036197e-05, "loss": 1.4046, "step": 14230 }, { "epoch": 1.6741862003086192, "grad_norm": 1.2237838506698608, "learning_rate": 4.372126027293192e-05, "loss": 1.3792, "step": 14240 }, { "epoch": 1.6753618928650158, "grad_norm": 1.3924391269683838, "learning_rate": 4.3657697746681944e-05, "loss": 1.3388, "step": 14250 }, { "epoch": 1.6765375854214124, "grad_norm": 1.2760390043258667, "learning_rate": 4.359414563599678e-05, "loss": 1.4107, "step": 14260 }, { "epoch": 1.6777132779778088, "grad_norm": 1.0688505172729492, "learning_rate": 4.3530604045244086e-05, "loss": 1.3811, "step": 14270 }, { "epoch": 1.6788889705342052, "grad_norm": 1.1820533275604248, "learning_rate": 4.346707307877421e-05, "loss": 1.3558, "step": 14280 }, { "epoch": 1.6800646630906018, "grad_norm": 0.9331909418106079, "learning_rate": 4.3403552840920074e-05, "loss": 1.4422, "step": 14290 }, { "epoch": 1.6812403556469984, "grad_norm": 1.2958511114120483, "learning_rate": 4.3340043435997e-05, "loss": 1.4502, "step": 14300 }, { "epoch": 1.682416048203395, "grad_norm": 1.2539433240890503, "learning_rate": 4.327654496830247e-05, "loss": 1.3921, "step": 14310 }, { "epoch": 1.6835917407597913, "grad_norm": 1.0444159507751465, "learning_rate": 4.321305754211603e-05, "loss": 1.3559, "step": 14320 }, { "epoch": 1.6847674333161877, "grad_norm": 1.2587738037109375, "learning_rate": 4.314958126169911e-05, "loss": 1.4351, "step": 14330 }, { "epoch": 1.6859431258725843, "grad_norm": 1.3120955228805542, "learning_rate": 4.308611623129483e-05, "loss": 1.3577, "step": 14340 }, { "epoch": 1.687118818428981, "grad_norm": 1.284230351448059, "learning_rate": 4.302266255512779e-05, "loss": 1.3527, "step": 14350 }, { "epoch": 1.6882945109853773, "grad_norm": 1.4632539749145508, "learning_rate": 4.2959220337403996e-05, "loss": 1.4039, "step": 14360 }, { "epoch": 1.6894702035417737, "grad_norm": 1.4516350030899048, "learning_rate": 4.289578968231062e-05, "loss": 1.4265, "step": 14370 }, { "epoch": 1.6906458960981703, "grad_norm": 1.2250744104385376, "learning_rate": 4.2832370694015854e-05, "loss": 1.4052, "step": 14380 }, { "epoch": 1.6918215886545669, "grad_norm": 1.071190357208252, "learning_rate": 4.276896347666871e-05, "loss": 1.3412, "step": 14390 }, { "epoch": 1.6929972812109635, "grad_norm": 1.3842849731445312, "learning_rate": 4.2705568134398866e-05, "loss": 1.4091, "step": 14400 }, { "epoch": 1.6941729737673599, "grad_norm": 1.1283613443374634, "learning_rate": 4.264218477131654e-05, "loss": 1.3817, "step": 14410 }, { "epoch": 1.6953486663237562, "grad_norm": 1.0353924036026, "learning_rate": 4.2578813491512235e-05, "loss": 1.3851, "step": 14420 }, { "epoch": 1.6965243588801528, "grad_norm": 1.437709093093872, "learning_rate": 4.251545439905663e-05, "loss": 1.3453, "step": 14430 }, { "epoch": 1.6977000514365495, "grad_norm": 1.4583485126495361, "learning_rate": 4.24521075980004e-05, "loss": 1.374, "step": 14440 }, { "epoch": 1.6988757439929458, "grad_norm": 1.2199286222457886, "learning_rate": 4.2388773192373985e-05, "loss": 1.3909, "step": 14450 }, { "epoch": 1.7000514365493422, "grad_norm": 1.2895337343215942, "learning_rate": 4.232545128618753e-05, "loss": 1.4548, "step": 14460 }, { "epoch": 1.7012271291057388, "grad_norm": 1.3643852472305298, "learning_rate": 4.226214198343062e-05, "loss": 1.4542, "step": 14470 }, { "epoch": 1.7024028216621354, "grad_norm": 1.10687255859375, "learning_rate": 4.219884538807213e-05, "loss": 1.3708, "step": 14480 }, { "epoch": 1.703578514218532, "grad_norm": 1.4055023193359375, "learning_rate": 4.213556160406011e-05, "loss": 1.4669, "step": 14490 }, { "epoch": 1.7047542067749284, "grad_norm": 1.511447548866272, "learning_rate": 4.207229073532155e-05, "loss": 1.3655, "step": 14500 }, { "epoch": 1.7047542067749284, "eval_loss": 1.6107721328735352, "eval_runtime": 1920.0915, "eval_samples_per_second": 31.501, "eval_steps_per_second": 3.938, "step": 14500 }, { "epoch": 1.7059298993313248, "grad_norm": 1.046398639678955, "learning_rate": 4.20090328857622e-05, "loss": 1.3807, "step": 14510 }, { "epoch": 1.7071055918877214, "grad_norm": 1.1084301471710205, "learning_rate": 4.194578815926647e-05, "loss": 1.3526, "step": 14520 }, { "epoch": 1.708281284444118, "grad_norm": 1.576453685760498, "learning_rate": 4.1882556659697195e-05, "loss": 1.3905, "step": 14530 }, { "epoch": 1.7094569770005144, "grad_norm": 1.3708398342132568, "learning_rate": 4.1819338490895504e-05, "loss": 1.427, "step": 14540 }, { "epoch": 1.7106326695569107, "grad_norm": 1.3129656314849854, "learning_rate": 4.175613375668063e-05, "loss": 1.4353, "step": 14550 }, { "epoch": 1.7118083621133073, "grad_norm": 1.2023372650146484, "learning_rate": 4.1692942560849744e-05, "loss": 1.3849, "step": 14560 }, { "epoch": 1.712984054669704, "grad_norm": 1.2315034866333008, "learning_rate": 4.1629765007177754e-05, "loss": 1.4379, "step": 14570 }, { "epoch": 1.7141597472261005, "grad_norm": 1.3749507665634155, "learning_rate": 4.156660119941722e-05, "loss": 1.3962, "step": 14580 }, { "epoch": 1.715335439782497, "grad_norm": 1.3156503438949585, "learning_rate": 4.150345124129808e-05, "loss": 1.4747, "step": 14590 }, { "epoch": 1.7165111323388933, "grad_norm": 1.429336428642273, "learning_rate": 4.144031523652757e-05, "loss": 1.3936, "step": 14600 }, { "epoch": 1.71768682489529, "grad_norm": 1.2707250118255615, "learning_rate": 4.137719328878995e-05, "loss": 1.3692, "step": 14610 }, { "epoch": 1.7188625174516865, "grad_norm": 1.0630991458892822, "learning_rate": 4.1314085501746455e-05, "loss": 1.3544, "step": 14620 }, { "epoch": 1.7200382100080829, "grad_norm": 1.192599892616272, "learning_rate": 4.125099197903503e-05, "loss": 1.371, "step": 14630 }, { "epoch": 1.7212139025644793, "grad_norm": 1.2120085954666138, "learning_rate": 4.118791282427022e-05, "loss": 1.3616, "step": 14640 }, { "epoch": 1.7223895951208759, "grad_norm": 1.4883129596710205, "learning_rate": 4.1124848141042954e-05, "loss": 1.3985, "step": 14650 }, { "epoch": 1.7235652876772725, "grad_norm": 1.1468342542648315, "learning_rate": 4.10617980329204e-05, "loss": 1.3856, "step": 14660 }, { "epoch": 1.7247409802336688, "grad_norm": 1.1722649335861206, "learning_rate": 4.099876260344579e-05, "loss": 1.4203, "step": 14670 }, { "epoch": 1.7259166727900654, "grad_norm": 1.029269814491272, "learning_rate": 4.093574195613826e-05, "loss": 1.43, "step": 14680 }, { "epoch": 1.7270923653464618, "grad_norm": 1.312094807624817, "learning_rate": 4.087273619449267e-05, "loss": 1.4208, "step": 14690 }, { "epoch": 1.7282680579028584, "grad_norm": 1.62395441532135, "learning_rate": 4.08097454219794e-05, "loss": 1.3993, "step": 14700 }, { "epoch": 1.729443750459255, "grad_norm": 1.4500747919082642, "learning_rate": 4.074676974204426e-05, "loss": 1.4361, "step": 14710 }, { "epoch": 1.7306194430156514, "grad_norm": 1.0754508972167969, "learning_rate": 4.0683809258108255e-05, "loss": 1.3794, "step": 14720 }, { "epoch": 1.7317951355720478, "grad_norm": 1.4175310134887695, "learning_rate": 4.062086407356743e-05, "loss": 1.3498, "step": 14730 }, { "epoch": 1.7329708281284444, "grad_norm": 1.6454514265060425, "learning_rate": 4.055793429179272e-05, "loss": 1.4264, "step": 14740 }, { "epoch": 1.734146520684841, "grad_norm": 1.301708459854126, "learning_rate": 4.0495020016129756e-05, "loss": 1.3642, "step": 14750 }, { "epoch": 1.7353222132412374, "grad_norm": 1.1629868745803833, "learning_rate": 4.043212134989868e-05, "loss": 1.4046, "step": 14760 }, { "epoch": 1.736497905797634, "grad_norm": 1.004142165184021, "learning_rate": 4.0369238396394035e-05, "loss": 1.3101, "step": 14770 }, { "epoch": 1.7376735983540303, "grad_norm": 1.3700332641601562, "learning_rate": 4.030637125888456e-05, "loss": 1.334, "step": 14780 }, { "epoch": 1.738849290910427, "grad_norm": 1.354294776916504, "learning_rate": 4.024352004061299e-05, "loss": 1.4031, "step": 14790 }, { "epoch": 1.7400249834668235, "grad_norm": 1.3119330406188965, "learning_rate": 4.018068484479595e-05, "loss": 1.404, "step": 14800 }, { "epoch": 1.74120067602322, "grad_norm": 1.2852725982666016, "learning_rate": 4.0117865774623735e-05, "loss": 1.4466, "step": 14810 }, { "epoch": 1.7423763685796163, "grad_norm": 1.5884336233139038, "learning_rate": 4.0055062933260154e-05, "loss": 1.3542, "step": 14820 }, { "epoch": 1.743552061136013, "grad_norm": 1.2475671768188477, "learning_rate": 3.999227642384236e-05, "loss": 1.4323, "step": 14830 }, { "epoch": 1.7447277536924095, "grad_norm": 1.248392939567566, "learning_rate": 3.992950634948072e-05, "loss": 1.3712, "step": 14840 }, { "epoch": 1.745903446248806, "grad_norm": 1.1730457544326782, "learning_rate": 3.986675281325859e-05, "loss": 1.403, "step": 14850 }, { "epoch": 1.7470791388052025, "grad_norm": 1.2906088829040527, "learning_rate": 3.9804015918232126e-05, "loss": 1.4523, "step": 14860 }, { "epoch": 1.7482548313615989, "grad_norm": 1.1519922018051147, "learning_rate": 3.974129576743021e-05, "loss": 1.3873, "step": 14870 }, { "epoch": 1.7494305239179955, "grad_norm": 1.0067074298858643, "learning_rate": 3.9678592463854224e-05, "loss": 1.361, "step": 14880 }, { "epoch": 1.750606216474392, "grad_norm": 1.2146000862121582, "learning_rate": 3.9615906110477854e-05, "loss": 1.3571, "step": 14890 }, { "epoch": 1.7517819090307885, "grad_norm": 1.1484278440475464, "learning_rate": 3.9553236810246955e-05, "loss": 1.3851, "step": 14900 }, { "epoch": 1.7529576015871848, "grad_norm": 1.379613995552063, "learning_rate": 3.94905846660794e-05, "loss": 1.3468, "step": 14910 }, { "epoch": 1.7541332941435814, "grad_norm": 1.2184128761291504, "learning_rate": 3.942794978086485e-05, "loss": 1.4087, "step": 14920 }, { "epoch": 1.755308986699978, "grad_norm": 1.207550048828125, "learning_rate": 3.936533225746467e-05, "loss": 1.3319, "step": 14930 }, { "epoch": 1.7564846792563744, "grad_norm": 1.4277403354644775, "learning_rate": 3.930273219871168e-05, "loss": 1.4288, "step": 14940 }, { "epoch": 1.7576603718127708, "grad_norm": 1.1799743175506592, "learning_rate": 3.9240149707410026e-05, "loss": 1.4275, "step": 14950 }, { "epoch": 1.7588360643691674, "grad_norm": 1.2195091247558594, "learning_rate": 3.9177584886335e-05, "loss": 1.4133, "step": 14960 }, { "epoch": 1.760011756925564, "grad_norm": 1.2547407150268555, "learning_rate": 3.911503783823287e-05, "loss": 1.411, "step": 14970 }, { "epoch": 1.7611874494819606, "grad_norm": 1.323590636253357, "learning_rate": 3.905250866582074e-05, "loss": 1.3661, "step": 14980 }, { "epoch": 1.762363142038357, "grad_norm": 1.274086833000183, "learning_rate": 3.898999747178635e-05, "loss": 1.423, "step": 14990 }, { "epoch": 1.7635388345947534, "grad_norm": 1.0019911527633667, "learning_rate": 3.89275043587879e-05, "loss": 1.3813, "step": 15000 }, { "epoch": 1.7635388345947534, "eval_loss": 1.6027170419692993, "eval_runtime": 1920.5915, "eval_samples_per_second": 31.493, "eval_steps_per_second": 3.937, "step": 15000 }, { "epoch": 1.76471452715115, "grad_norm": 1.0274415016174316, "learning_rate": 3.886502942945391e-05, "loss": 1.3716, "step": 15010 }, { "epoch": 1.7658902197075466, "grad_norm": 1.4481539726257324, "learning_rate": 3.880257278638303e-05, "loss": 1.3921, "step": 15020 }, { "epoch": 1.767065912263943, "grad_norm": 1.4604376554489136, "learning_rate": 3.874013453214389e-05, "loss": 1.4483, "step": 15030 }, { "epoch": 1.7682416048203393, "grad_norm": 1.079111933708191, "learning_rate": 3.86777147692749e-05, "loss": 1.4168, "step": 15040 }, { "epoch": 1.769417297376736, "grad_norm": 1.0565489530563354, "learning_rate": 3.861531360028413e-05, "loss": 1.3653, "step": 15050 }, { "epoch": 1.7705929899331325, "grad_norm": 1.108343243598938, "learning_rate": 3.8552931127649125e-05, "loss": 1.3723, "step": 15060 }, { "epoch": 1.7717686824895291, "grad_norm": 1.1659613847732544, "learning_rate": 3.8490567453816655e-05, "loss": 1.3792, "step": 15070 }, { "epoch": 1.7729443750459255, "grad_norm": 1.2933045625686646, "learning_rate": 3.84282226812027e-05, "loss": 1.3555, "step": 15080 }, { "epoch": 1.7741200676023219, "grad_norm": 1.2733385562896729, "learning_rate": 3.836589691219215e-05, "loss": 1.4109, "step": 15090 }, { "epoch": 1.7752957601587185, "grad_norm": 1.3054280281066895, "learning_rate": 3.830359024913873e-05, "loss": 1.3463, "step": 15100 }, { "epoch": 1.776471452715115, "grad_norm": 1.2928621768951416, "learning_rate": 3.824130279436473e-05, "loss": 1.3891, "step": 15110 }, { "epoch": 1.7776471452715115, "grad_norm": 1.499068260192871, "learning_rate": 3.817903465016093e-05, "loss": 1.4413, "step": 15120 }, { "epoch": 1.7788228378279078, "grad_norm": 1.2286087274551392, "learning_rate": 3.81167859187864e-05, "loss": 1.4325, "step": 15130 }, { "epoch": 1.7799985303843044, "grad_norm": 1.403412103652954, "learning_rate": 3.805455670246833e-05, "loss": 1.3842, "step": 15140 }, { "epoch": 1.781174222940701, "grad_norm": 1.374221920967102, "learning_rate": 3.799234710340186e-05, "loss": 1.3957, "step": 15150 }, { "epoch": 1.7823499154970976, "grad_norm": 1.2268645763397217, "learning_rate": 3.7930157223749886e-05, "loss": 1.4313, "step": 15160 }, { "epoch": 1.783525608053494, "grad_norm": 1.1371427774429321, "learning_rate": 3.786798716564295e-05, "loss": 1.3173, "step": 15170 }, { "epoch": 1.7847013006098904, "grad_norm": 1.350148320198059, "learning_rate": 3.780583703117905e-05, "loss": 1.3641, "step": 15180 }, { "epoch": 1.785876993166287, "grad_norm": 1.3751076459884644, "learning_rate": 3.7743706922423446e-05, "loss": 1.3716, "step": 15190 }, { "epoch": 1.7870526857226836, "grad_norm": 1.5074763298034668, "learning_rate": 3.7681596941408516e-05, "loss": 1.4179, "step": 15200 }, { "epoch": 1.78822837827908, "grad_norm": 1.06928288936615, "learning_rate": 3.761950719013356e-05, "loss": 1.3114, "step": 15210 }, { "epoch": 1.7894040708354764, "grad_norm": 1.7292083501815796, "learning_rate": 3.7557437770564705e-05, "loss": 1.3722, "step": 15220 }, { "epoch": 1.790579763391873, "grad_norm": 1.1733286380767822, "learning_rate": 3.7495388784634646e-05, "loss": 1.4256, "step": 15230 }, { "epoch": 1.7917554559482696, "grad_norm": 1.1812050342559814, "learning_rate": 3.743336033424255e-05, "loss": 1.3554, "step": 15240 }, { "epoch": 1.7929311485046662, "grad_norm": 1.2673903703689575, "learning_rate": 3.7371352521253845e-05, "loss": 1.3919, "step": 15250 }, { "epoch": 1.7941068410610626, "grad_norm": 1.0686511993408203, "learning_rate": 3.7309365447500066e-05, "loss": 1.4077, "step": 15260 }, { "epoch": 1.795282533617459, "grad_norm": 1.1646027565002441, "learning_rate": 3.7247399214778684e-05, "loss": 1.3867, "step": 15270 }, { "epoch": 1.7964582261738555, "grad_norm": 1.720690369606018, "learning_rate": 3.7185453924852964e-05, "loss": 1.4134, "step": 15280 }, { "epoch": 1.7976339187302521, "grad_norm": 1.063223123550415, "learning_rate": 3.712352967945176e-05, "loss": 1.3306, "step": 15290 }, { "epoch": 1.7988096112866485, "grad_norm": 1.2662612199783325, "learning_rate": 3.706162658026937e-05, "loss": 1.3686, "step": 15300 }, { "epoch": 1.799985303843045, "grad_norm": 1.4283301830291748, "learning_rate": 3.699974472896538e-05, "loss": 1.3835, "step": 15310 }, { "epoch": 1.8011609963994415, "grad_norm": 1.3000984191894531, "learning_rate": 3.693788422716448e-05, "loss": 1.3958, "step": 15320 }, { "epoch": 1.802336688955838, "grad_norm": 1.6394623517990112, "learning_rate": 3.6876045176456255e-05, "loss": 1.3976, "step": 15330 }, { "epoch": 1.8035123815122347, "grad_norm": 1.0994157791137695, "learning_rate": 3.681422767839511e-05, "loss": 1.3376, "step": 15340 }, { "epoch": 1.804688074068631, "grad_norm": 1.6435993909835815, "learning_rate": 3.675243183450005e-05, "loss": 1.4131, "step": 15350 }, { "epoch": 1.8058637666250275, "grad_norm": 1.3077291250228882, "learning_rate": 3.6690657746254505e-05, "loss": 1.4246, "step": 15360 }, { "epoch": 1.807039459181424, "grad_norm": 1.2055766582489014, "learning_rate": 3.6628905515106185e-05, "loss": 1.3771, "step": 15370 }, { "epoch": 1.8082151517378207, "grad_norm": 1.4046354293823242, "learning_rate": 3.656717524246688e-05, "loss": 1.4231, "step": 15380 }, { "epoch": 1.809390844294217, "grad_norm": 1.0653280019760132, "learning_rate": 3.650546702971236e-05, "loss": 1.3569, "step": 15390 }, { "epoch": 1.8105665368506134, "grad_norm": 1.2533693313598633, "learning_rate": 3.6443780978182166e-05, "loss": 1.4063, "step": 15400 }, { "epoch": 1.81174222940701, "grad_norm": 1.3817572593688965, "learning_rate": 3.638211718917939e-05, "loss": 1.457, "step": 15410 }, { "epoch": 1.8129179219634066, "grad_norm": 1.2264130115509033, "learning_rate": 3.632047576397063e-05, "loss": 1.4624, "step": 15420 }, { "epoch": 1.8140936145198032, "grad_norm": 1.3649808168411255, "learning_rate": 3.6258856803785714e-05, "loss": 1.3984, "step": 15430 }, { "epoch": 1.8152693070761996, "grad_norm": 1.3798654079437256, "learning_rate": 3.61972604098176e-05, "loss": 1.3828, "step": 15440 }, { "epoch": 1.816444999632596, "grad_norm": 1.2947481870651245, "learning_rate": 3.613568668322217e-05, "loss": 1.3855, "step": 15450 }, { "epoch": 1.8176206921889926, "grad_norm": 1.1872769594192505, "learning_rate": 3.6074135725118116e-05, "loss": 1.427, "step": 15460 }, { "epoch": 1.8187963847453892, "grad_norm": 1.320824384689331, "learning_rate": 3.601260763658667e-05, "loss": 1.4096, "step": 15470 }, { "epoch": 1.8199720773017856, "grad_norm": 1.2179646492004395, "learning_rate": 3.595110251867157e-05, "loss": 1.4259, "step": 15480 }, { "epoch": 1.821147769858182, "grad_norm": 1.3142775297164917, "learning_rate": 3.588962047237883e-05, "loss": 1.361, "step": 15490 }, { "epoch": 1.8223234624145785, "grad_norm": 1.3480842113494873, "learning_rate": 3.582816159867652e-05, "loss": 1.3913, "step": 15500 }, { "epoch": 1.8223234624145785, "eval_loss": 1.5947504043579102, "eval_runtime": 1919.5677, "eval_samples_per_second": 31.51, "eval_steps_per_second": 3.939, "step": 15500 }, { "epoch": 1.8234991549709751, "grad_norm": 1.6714318990707397, "learning_rate": 3.576672599849472e-05, "loss": 1.4134, "step": 15510 }, { "epoch": 1.8246748475273717, "grad_norm": 1.6184147596359253, "learning_rate": 3.5705313772725234e-05, "loss": 1.4411, "step": 15520 }, { "epoch": 1.8258505400837681, "grad_norm": 1.3702915906906128, "learning_rate": 3.564392502222151e-05, "loss": 1.3707, "step": 15530 }, { "epoch": 1.8270262326401645, "grad_norm": 1.2322367429733276, "learning_rate": 3.558255984779846e-05, "loss": 1.4202, "step": 15540 }, { "epoch": 1.828201925196561, "grad_norm": 1.2651376724243164, "learning_rate": 3.552121835023223e-05, "loss": 1.4452, "step": 15550 }, { "epoch": 1.8293776177529577, "grad_norm": 1.087331771850586, "learning_rate": 3.545990063026012e-05, "loss": 1.3608, "step": 15560 }, { "epoch": 1.830553310309354, "grad_norm": 1.1548032760620117, "learning_rate": 3.539860678858039e-05, "loss": 1.3826, "step": 15570 }, { "epoch": 1.8317290028657505, "grad_norm": 1.0907747745513916, "learning_rate": 3.533733692585205e-05, "loss": 1.4506, "step": 15580 }, { "epoch": 1.832904695422147, "grad_norm": 1.1709752082824707, "learning_rate": 3.527609114269474e-05, "loss": 1.4085, "step": 15590 }, { "epoch": 1.8340803879785437, "grad_norm": 1.2285505533218384, "learning_rate": 3.5214869539688576e-05, "loss": 1.3559, "step": 15600 }, { "epoch": 1.8352560805349403, "grad_norm": 0.9928135871887207, "learning_rate": 3.5153672217373976e-05, "loss": 1.3132, "step": 15610 }, { "epoch": 1.8364317730913366, "grad_norm": 1.1534303426742554, "learning_rate": 3.509249927625142e-05, "loss": 1.3157, "step": 15620 }, { "epoch": 1.837607465647733, "grad_norm": 1.3549995422363281, "learning_rate": 3.503135081678141e-05, "loss": 1.403, "step": 15630 }, { "epoch": 1.8387831582041296, "grad_norm": 1.3790690898895264, "learning_rate": 3.4970226939384215e-05, "loss": 1.3276, "step": 15640 }, { "epoch": 1.8399588507605262, "grad_norm": 1.2716064453125, "learning_rate": 3.490912774443975e-05, "loss": 1.3754, "step": 15650 }, { "epoch": 1.8411345433169226, "grad_norm": 1.2430634498596191, "learning_rate": 3.484805333228738e-05, "loss": 1.3823, "step": 15660 }, { "epoch": 1.842310235873319, "grad_norm": 1.120430827140808, "learning_rate": 3.4787003803225785e-05, "loss": 1.4021, "step": 15670 }, { "epoch": 1.8434859284297156, "grad_norm": 1.359606385231018, "learning_rate": 3.4725979257512756e-05, "loss": 1.3988, "step": 15680 }, { "epoch": 1.8446616209861122, "grad_norm": 1.141626238822937, "learning_rate": 3.4664979795365086e-05, "loss": 1.3366, "step": 15690 }, { "epoch": 1.8458373135425088, "grad_norm": 1.4904969930648804, "learning_rate": 3.460400551695837e-05, "loss": 1.3766, "step": 15700 }, { "epoch": 1.8470130060989052, "grad_norm": 0.996296763420105, "learning_rate": 3.454305652242684e-05, "loss": 1.3935, "step": 15710 }, { "epoch": 1.8481886986553016, "grad_norm": 1.4294956922531128, "learning_rate": 3.4482132911863176e-05, "loss": 1.4034, "step": 15720 }, { "epoch": 1.8493643912116982, "grad_norm": 1.347923994064331, "learning_rate": 3.4421234785318426e-05, "loss": 1.3728, "step": 15730 }, { "epoch": 1.8505400837680948, "grad_norm": 1.4255729913711548, "learning_rate": 3.436036224280174e-05, "loss": 1.3994, "step": 15740 }, { "epoch": 1.8517157763244911, "grad_norm": 1.6196274757385254, "learning_rate": 3.429951538428029e-05, "loss": 1.3544, "step": 15750 }, { "epoch": 1.8528914688808875, "grad_norm": 1.1594067811965942, "learning_rate": 3.423869430967904e-05, "loss": 1.3402, "step": 15760 }, { "epoch": 1.8540671614372841, "grad_norm": 1.634135365486145, "learning_rate": 3.417789911888064e-05, "loss": 1.3599, "step": 15770 }, { "epoch": 1.8552428539936807, "grad_norm": 1.5500917434692383, "learning_rate": 3.411712991172519e-05, "loss": 1.3805, "step": 15780 }, { "epoch": 1.8564185465500773, "grad_norm": 1.4346470832824707, "learning_rate": 3.405638678801015e-05, "loss": 1.4366, "step": 15790 }, { "epoch": 1.8575942391064737, "grad_norm": 1.1692893505096436, "learning_rate": 3.399566984749016e-05, "loss": 1.3731, "step": 15800 }, { "epoch": 1.85876993166287, "grad_norm": 1.4070589542388916, "learning_rate": 3.3934979189876816e-05, "loss": 1.3664, "step": 15810 }, { "epoch": 1.8599456242192667, "grad_norm": 1.240797758102417, "learning_rate": 3.387431491483858e-05, "loss": 1.3176, "step": 15820 }, { "epoch": 1.8611213167756633, "grad_norm": 1.3436665534973145, "learning_rate": 3.3813677122000596e-05, "loss": 1.3505, "step": 15830 }, { "epoch": 1.8622970093320597, "grad_norm": 1.6068922281265259, "learning_rate": 3.3753065910944495e-05, "loss": 1.3874, "step": 15840 }, { "epoch": 1.863472701888456, "grad_norm": 1.1065675020217896, "learning_rate": 3.369248138120825e-05, "loss": 1.4005, "step": 15850 }, { "epoch": 1.8646483944448526, "grad_norm": 1.2501178979873657, "learning_rate": 3.363192363228604e-05, "loss": 1.3907, "step": 15860 }, { "epoch": 1.8658240870012492, "grad_norm": 1.2084951400756836, "learning_rate": 3.357139276362809e-05, "loss": 1.3372, "step": 15870 }, { "epoch": 1.8669997795576456, "grad_norm": 1.1865402460098267, "learning_rate": 3.351088887464039e-05, "loss": 1.4415, "step": 15880 }, { "epoch": 1.8681754721140422, "grad_norm": 1.381511926651001, "learning_rate": 3.34504120646847e-05, "loss": 1.3919, "step": 15890 }, { "epoch": 1.8693511646704386, "grad_norm": 1.3626618385314941, "learning_rate": 3.338996243307829e-05, "loss": 1.4589, "step": 15900 }, { "epoch": 1.8705268572268352, "grad_norm": 1.4033019542694092, "learning_rate": 3.3329540079093805e-05, "loss": 1.3884, "step": 15910 }, { "epoch": 1.8717025497832318, "grad_norm": 1.3798521757125854, "learning_rate": 3.3269145101959056e-05, "loss": 1.3833, "step": 15920 }, { "epoch": 1.8728782423396282, "grad_norm": 1.157533049583435, "learning_rate": 3.3208777600856946e-05, "loss": 1.3725, "step": 15930 }, { "epoch": 1.8740539348960246, "grad_norm": 1.0262092351913452, "learning_rate": 3.314843767492523e-05, "loss": 1.3637, "step": 15940 }, { "epoch": 1.8752296274524212, "grad_norm": 1.402050495147705, "learning_rate": 3.308812542325637e-05, "loss": 1.372, "step": 15950 }, { "epoch": 1.8764053200088178, "grad_norm": 1.5211067199707031, "learning_rate": 3.3027840944897405e-05, "loss": 1.3447, "step": 15960 }, { "epoch": 1.8775810125652141, "grad_norm": 1.2061314582824707, "learning_rate": 3.296758433884973e-05, "loss": 1.3438, "step": 15970 }, { "epoch": 1.8787567051216107, "grad_norm": 1.146067500114441, "learning_rate": 3.2907355704069005e-05, "loss": 1.3877, "step": 15980 }, { "epoch": 1.8799323976780071, "grad_norm": 1.487655758857727, "learning_rate": 3.284715513946492e-05, "loss": 1.3933, "step": 15990 }, { "epoch": 1.8811080902344037, "grad_norm": 1.1700481176376343, "learning_rate": 3.278698274390109e-05, "loss": 1.4214, "step": 16000 }, { "epoch": 1.8811080902344037, "eval_loss": 1.5871803760528564, "eval_runtime": 1919.3514, "eval_samples_per_second": 31.513, "eval_steps_per_second": 3.939, "step": 16000 }, { "epoch": 1.8822837827908003, "grad_norm": 1.2562119960784912, "learning_rate": 3.2726838616194863e-05, "loss": 1.3789, "step": 16010 }, { "epoch": 1.8834594753471967, "grad_norm": 1.1218231916427612, "learning_rate": 3.266672285511718e-05, "loss": 1.3529, "step": 16020 }, { "epoch": 1.884635167903593, "grad_norm": 1.2632538080215454, "learning_rate": 3.260663555939235e-05, "loss": 1.3752, "step": 16030 }, { "epoch": 1.8858108604599897, "grad_norm": 1.5393279790878296, "learning_rate": 3.254657682769798e-05, "loss": 1.342, "step": 16040 }, { "epoch": 1.8869865530163863, "grad_norm": 1.4300042390823364, "learning_rate": 3.248654675866476e-05, "loss": 1.3838, "step": 16050 }, { "epoch": 1.8881622455727827, "grad_norm": 1.6408071517944336, "learning_rate": 3.24265454508763e-05, "loss": 1.4261, "step": 16060 }, { "epoch": 1.8893379381291793, "grad_norm": 1.2367130517959595, "learning_rate": 3.2366573002868984e-05, "loss": 1.3953, "step": 16070 }, { "epoch": 1.8905136306855757, "grad_norm": 1.0545554161071777, "learning_rate": 3.23066295131318e-05, "loss": 1.4037, "step": 16080 }, { "epoch": 1.8916893232419723, "grad_norm": 1.2486355304718018, "learning_rate": 3.224671508010617e-05, "loss": 1.4135, "step": 16090 }, { "epoch": 1.8928650157983689, "grad_norm": 1.0999302864074707, "learning_rate": 3.21868298021858e-05, "loss": 1.449, "step": 16100 }, { "epoch": 1.8940407083547652, "grad_norm": 1.1480196714401245, "learning_rate": 3.212697377771653e-05, "loss": 1.3628, "step": 16110 }, { "epoch": 1.8952164009111616, "grad_norm": 1.2098793983459473, "learning_rate": 3.206714710499618e-05, "loss": 1.393, "step": 16120 }, { "epoch": 1.8963920934675582, "grad_norm": 1.329991102218628, "learning_rate": 3.200734988227429e-05, "loss": 1.3578, "step": 16130 }, { "epoch": 1.8975677860239548, "grad_norm": 1.2006151676177979, "learning_rate": 3.19475822077521e-05, "loss": 1.352, "step": 16140 }, { "epoch": 1.8987434785803512, "grad_norm": 1.4445840120315552, "learning_rate": 3.1887844179582325e-05, "loss": 1.4379, "step": 16150 }, { "epoch": 1.8999191711367476, "grad_norm": 1.3683501482009888, "learning_rate": 3.1828135895868974e-05, "loss": 1.4329, "step": 16160 }, { "epoch": 1.9010948636931442, "grad_norm": 1.0535725355148315, "learning_rate": 3.17684574546672e-05, "loss": 1.3972, "step": 16170 }, { "epoch": 1.9022705562495408, "grad_norm": 1.1165684461593628, "learning_rate": 3.170880895398317e-05, "loss": 1.4274, "step": 16180 }, { "epoch": 1.9034462488059374, "grad_norm": 1.1849948167800903, "learning_rate": 3.164919049177386e-05, "loss": 1.3417, "step": 16190 }, { "epoch": 1.9046219413623338, "grad_norm": 1.0730726718902588, "learning_rate": 3.1589602165946946e-05, "loss": 1.2951, "step": 16200 }, { "epoch": 1.9057976339187301, "grad_norm": 1.3689391613006592, "learning_rate": 3.153004407436059e-05, "loss": 1.316, "step": 16210 }, { "epoch": 1.9069733264751267, "grad_norm": 1.1396808624267578, "learning_rate": 3.147051631482331e-05, "loss": 1.3554, "step": 16220 }, { "epoch": 1.9081490190315233, "grad_norm": 1.1064128875732422, "learning_rate": 3.141101898509378e-05, "loss": 1.3372, "step": 16230 }, { "epoch": 1.9093247115879197, "grad_norm": 1.5126372575759888, "learning_rate": 3.1351552182880765e-05, "loss": 1.3355, "step": 16240 }, { "epoch": 1.910500404144316, "grad_norm": 1.2867069244384766, "learning_rate": 3.1292116005842835e-05, "loss": 1.4055, "step": 16250 }, { "epoch": 1.9116760967007127, "grad_norm": 1.3774924278259277, "learning_rate": 3.123271055158831e-05, "loss": 1.3442, "step": 16260 }, { "epoch": 1.9128517892571093, "grad_norm": 1.627793788909912, "learning_rate": 3.117333591767503e-05, "loss": 1.3295, "step": 16270 }, { "epoch": 1.914027481813506, "grad_norm": 1.2382993698120117, "learning_rate": 3.1113992201610245e-05, "loss": 1.2819, "step": 16280 }, { "epoch": 1.9152031743699023, "grad_norm": 1.1053544282913208, "learning_rate": 3.1054679500850394e-05, "loss": 1.3361, "step": 16290 }, { "epoch": 1.9163788669262987, "grad_norm": 1.3346683979034424, "learning_rate": 3.099539791280099e-05, "loss": 1.3722, "step": 16300 }, { "epoch": 1.9175545594826953, "grad_norm": 1.4136950969696045, "learning_rate": 3.093614753481649e-05, "loss": 1.4448, "step": 16310 }, { "epoch": 1.9187302520390919, "grad_norm": 1.3898134231567383, "learning_rate": 3.0876928464200075e-05, "loss": 1.3579, "step": 16320 }, { "epoch": 1.9199059445954882, "grad_norm": 1.526136040687561, "learning_rate": 3.081774079820349e-05, "loss": 1.3657, "step": 16330 }, { "epoch": 1.9210816371518846, "grad_norm": 1.5271230936050415, "learning_rate": 3.075858463402691e-05, "loss": 1.4094, "step": 16340 }, { "epoch": 1.9222573297082812, "grad_norm": 1.4990878105163574, "learning_rate": 3.0699460068818815e-05, "loss": 1.3539, "step": 16350 }, { "epoch": 1.9234330222646778, "grad_norm": 1.4070910215377808, "learning_rate": 3.0640367199675754e-05, "loss": 1.4168, "step": 16360 }, { "epoch": 1.9246087148210744, "grad_norm": 1.268892526626587, "learning_rate": 3.058130612364226e-05, "loss": 1.3449, "step": 16370 }, { "epoch": 1.9257844073774708, "grad_norm": 1.4335588216781616, "learning_rate": 3.052227693771059e-05, "loss": 1.393, "step": 16380 }, { "epoch": 1.9269600999338672, "grad_norm": 1.2772289514541626, "learning_rate": 3.0463279738820683e-05, "loss": 1.3851, "step": 16390 }, { "epoch": 1.9281357924902638, "grad_norm": 1.409557819366455, "learning_rate": 3.0404314623859947e-05, "loss": 1.3156, "step": 16400 }, { "epoch": 1.9293114850466604, "grad_norm": 1.3062139749526978, "learning_rate": 3.034538168966309e-05, "loss": 1.3612, "step": 16410 }, { "epoch": 1.9304871776030568, "grad_norm": 1.0328842401504517, "learning_rate": 3.028648103301196e-05, "loss": 1.4075, "step": 16420 }, { "epoch": 1.9316628701594531, "grad_norm": 1.0252045392990112, "learning_rate": 3.0227612750635405e-05, "loss": 1.3838, "step": 16430 }, { "epoch": 1.9328385627158498, "grad_norm": 1.3185124397277832, "learning_rate": 3.016877693920912e-05, "loss": 1.3123, "step": 16440 }, { "epoch": 1.9340142552722464, "grad_norm": 1.3827104568481445, "learning_rate": 3.0109973695355453e-05, "loss": 1.3261, "step": 16450 }, { "epoch": 1.935189947828643, "grad_norm": 1.284814715385437, "learning_rate": 3.005120311564329e-05, "loss": 1.3448, "step": 16460 }, { "epoch": 1.9363656403850393, "grad_norm": 1.2721716165542603, "learning_rate": 2.9992465296587867e-05, "loss": 1.4189, "step": 16470 }, { "epoch": 1.9375413329414357, "grad_norm": 1.380735993385315, "learning_rate": 2.9933760334650607e-05, "loss": 1.3678, "step": 16480 }, { "epoch": 1.9387170254978323, "grad_norm": 1.1989301443099976, "learning_rate": 2.9875088326238987e-05, "loss": 1.3103, "step": 16490 }, { "epoch": 1.939892718054229, "grad_norm": 1.5440247058868408, "learning_rate": 2.9816449367706356e-05, "loss": 1.3626, "step": 16500 }, { "epoch": 1.939892718054229, "eval_loss": 1.580987811088562, "eval_runtime": 1920.0091, "eval_samples_per_second": 31.502, "eval_steps_per_second": 3.938, "step": 16500 }, { "epoch": 1.9410684106106253, "grad_norm": 1.2056163549423218, "learning_rate": 2.9757843555351805e-05, "loss": 1.3663, "step": 16510 }, { "epoch": 1.9422441031670217, "grad_norm": 1.522611141204834, "learning_rate": 2.969927098541997e-05, "loss": 1.4143, "step": 16520 }, { "epoch": 1.9434197957234183, "grad_norm": 1.287111759185791, "learning_rate": 2.9640731754100924e-05, "loss": 1.3146, "step": 16530 }, { "epoch": 1.9445954882798149, "grad_norm": 1.3071000576019287, "learning_rate": 2.958222595752995e-05, "loss": 1.388, "step": 16540 }, { "epoch": 1.9457711808362115, "grad_norm": 1.2528523206710815, "learning_rate": 2.952375369178746e-05, "loss": 1.4459, "step": 16550 }, { "epoch": 1.9469468733926079, "grad_norm": 1.1895349025726318, "learning_rate": 2.9465315052898778e-05, "loss": 1.3807, "step": 16560 }, { "epoch": 1.9481225659490042, "grad_norm": 1.3007057905197144, "learning_rate": 2.940691013683401e-05, "loss": 1.401, "step": 16570 }, { "epoch": 1.9492982585054008, "grad_norm": 1.1840049028396606, "learning_rate": 2.9348539039507882e-05, "loss": 1.2719, "step": 16580 }, { "epoch": 1.9504739510617974, "grad_norm": 1.0901087522506714, "learning_rate": 2.92902018567796e-05, "loss": 1.3565, "step": 16590 }, { "epoch": 1.9516496436181938, "grad_norm": 1.2461532354354858, "learning_rate": 2.923189868445263e-05, "loss": 1.4045, "step": 16600 }, { "epoch": 1.9528253361745902, "grad_norm": 1.4192414283752441, "learning_rate": 2.9173629618274628e-05, "loss": 1.3194, "step": 16610 }, { "epoch": 1.9540010287309868, "grad_norm": 1.069959282875061, "learning_rate": 2.911539475393722e-05, "loss": 1.3395, "step": 16620 }, { "epoch": 1.9551767212873834, "grad_norm": 1.2145971059799194, "learning_rate": 2.9057194187075887e-05, "loss": 1.3558, "step": 16630 }, { "epoch": 1.95635241384378, "grad_norm": 1.496598482131958, "learning_rate": 2.8999028013269724e-05, "loss": 1.3572, "step": 16640 }, { "epoch": 1.9575281064001764, "grad_norm": 1.2683483362197876, "learning_rate": 2.8940896328041415e-05, "loss": 1.3466, "step": 16650 }, { "epoch": 1.9587037989565728, "grad_norm": 1.1922287940979004, "learning_rate": 2.888279922685697e-05, "loss": 1.3283, "step": 16660 }, { "epoch": 1.9598794915129694, "grad_norm": 1.2484605312347412, "learning_rate": 2.8824736805125613e-05, "loss": 1.3568, "step": 16670 }, { "epoch": 1.961055184069366, "grad_norm": 1.3944629430770874, "learning_rate": 2.8766709158199613e-05, "loss": 1.3633, "step": 16680 }, { "epoch": 1.9622308766257623, "grad_norm": 1.2556707859039307, "learning_rate": 2.870871638137413e-05, "loss": 1.3802, "step": 16690 }, { "epoch": 1.9634065691821587, "grad_norm": 1.1421706676483154, "learning_rate": 2.8650758569887083e-05, "loss": 1.3523, "step": 16700 }, { "epoch": 1.9645822617385553, "grad_norm": 1.3912410736083984, "learning_rate": 2.8592835818918905e-05, "loss": 1.3752, "step": 16710 }, { "epoch": 1.965757954294952, "grad_norm": 1.1782499551773071, "learning_rate": 2.853494822359252e-05, "loss": 1.3268, "step": 16720 }, { "epoch": 1.9669336468513485, "grad_norm": 1.4111839532852173, "learning_rate": 2.8477095878973088e-05, "loss": 1.4254, "step": 16730 }, { "epoch": 1.968109339407745, "grad_norm": 1.4424852132797241, "learning_rate": 2.841927888006788e-05, "loss": 1.3898, "step": 16740 }, { "epoch": 1.9692850319641413, "grad_norm": 1.3100436925888062, "learning_rate": 2.836149732182612e-05, "loss": 1.3646, "step": 16750 }, { "epoch": 1.9704607245205379, "grad_norm": 1.3602650165557861, "learning_rate": 2.830375129913884e-05, "loss": 1.3781, "step": 16760 }, { "epoch": 1.9716364170769345, "grad_norm": 1.2236242294311523, "learning_rate": 2.824604090683871e-05, "loss": 1.4037, "step": 16770 }, { "epoch": 1.9728121096333309, "grad_norm": 0.915988564491272, "learning_rate": 2.818836623969988e-05, "loss": 1.3754, "step": 16780 }, { "epoch": 1.9739878021897272, "grad_norm": 1.3108983039855957, "learning_rate": 2.8130727392437837e-05, "loss": 1.3285, "step": 16790 }, { "epoch": 1.9751634947461238, "grad_norm": 1.5392028093338013, "learning_rate": 2.807312445970924e-05, "loss": 1.376, "step": 16800 }, { "epoch": 1.9763391873025205, "grad_norm": 1.1481285095214844, "learning_rate": 2.801555753611178e-05, "loss": 1.2916, "step": 16810 }, { "epoch": 1.977514879858917, "grad_norm": 1.3150627613067627, "learning_rate": 2.7958026716183982e-05, "loss": 1.3997, "step": 16820 }, { "epoch": 1.9786905724153134, "grad_norm": 1.3300386667251587, "learning_rate": 2.7900532094405108e-05, "loss": 1.3508, "step": 16830 }, { "epoch": 1.9798662649717098, "grad_norm": 1.1853376626968384, "learning_rate": 2.784307376519496e-05, "loss": 1.3808, "step": 16840 }, { "epoch": 1.9810419575281064, "grad_norm": 1.2599533796310425, "learning_rate": 2.778565182291375e-05, "loss": 1.3761, "step": 16850 }, { "epoch": 1.982217650084503, "grad_norm": 1.2426036596298218, "learning_rate": 2.7728266361861932e-05, "loss": 1.315, "step": 16860 }, { "epoch": 1.9833933426408994, "grad_norm": 1.0143743753433228, "learning_rate": 2.7670917476280046e-05, "loss": 1.3809, "step": 16870 }, { "epoch": 1.9845690351972958, "grad_norm": 1.1778843402862549, "learning_rate": 2.7613605260348586e-05, "loss": 1.3997, "step": 16880 }, { "epoch": 1.9857447277536924, "grad_norm": 1.5868544578552246, "learning_rate": 2.7556329808187757e-05, "loss": 1.3675, "step": 16890 }, { "epoch": 1.986920420310089, "grad_norm": 1.194251537322998, "learning_rate": 2.7499091213857474e-05, "loss": 1.3748, "step": 16900 }, { "epoch": 1.9880961128664856, "grad_norm": 1.4506046772003174, "learning_rate": 2.744188957135707e-05, "loss": 1.4021, "step": 16910 }, { "epoch": 1.989271805422882, "grad_norm": 1.195387601852417, "learning_rate": 2.738472497462523e-05, "loss": 1.352, "step": 16920 }, { "epoch": 1.9904474979792783, "grad_norm": 1.173092007637024, "learning_rate": 2.7327597517539764e-05, "loss": 1.3571, "step": 16930 }, { "epoch": 1.991623190535675, "grad_norm": 1.387646198272705, "learning_rate": 2.7270507293917524e-05, "loss": 1.2966, "step": 16940 }, { "epoch": 1.9927988830920715, "grad_norm": 1.3981930017471313, "learning_rate": 2.721345439751421e-05, "loss": 1.3847, "step": 16950 }, { "epoch": 1.993974575648468, "grad_norm": 1.2530605792999268, "learning_rate": 2.7156438922024173e-05, "loss": 1.3405, "step": 16960 }, { "epoch": 1.9951502682048643, "grad_norm": 1.143169641494751, "learning_rate": 2.709946096108037e-05, "loss": 1.3515, "step": 16970 }, { "epoch": 1.996325960761261, "grad_norm": 1.6384285688400269, "learning_rate": 2.7042520608254135e-05, "loss": 1.3431, "step": 16980 }, { "epoch": 1.9975016533176575, "grad_norm": 1.1815955638885498, "learning_rate": 2.6985617957055027e-05, "loss": 1.3444, "step": 16990 }, { "epoch": 1.998677345874054, "grad_norm": 1.3578486442565918, "learning_rate": 2.6928753100930705e-05, "loss": 1.4187, "step": 17000 }, { "epoch": 1.998677345874054, "eval_loss": 1.5737470388412476, "eval_runtime": 1918.335, "eval_samples_per_second": 31.53, "eval_steps_per_second": 3.941, "step": 17000 }, { "epoch": 1.9998530384304505, "grad_norm": 1.1813344955444336, "learning_rate": 2.6871926133266733e-05, "loss": 1.3718, "step": 17010 }, { "epoch": 2.001028730986847, "grad_norm": 1.4456655979156494, "learning_rate": 2.6815137147386506e-05, "loss": 1.2121, "step": 17020 }, { "epoch": 2.0022044235432435, "grad_norm": 1.131664752960205, "learning_rate": 2.6758386236550982e-05, "loss": 1.204, "step": 17030 }, { "epoch": 2.00338011609964, "grad_norm": 0.9896606802940369, "learning_rate": 2.6701673493958622e-05, "loss": 1.1947, "step": 17040 }, { "epoch": 2.0045558086560367, "grad_norm": 1.3326860666275024, "learning_rate": 2.664499901274522e-05, "loss": 1.2234, "step": 17050 }, { "epoch": 2.005731501212433, "grad_norm": 1.2499990463256836, "learning_rate": 2.6588362885983704e-05, "loss": 1.1894, "step": 17060 }, { "epoch": 2.0069071937688294, "grad_norm": 1.032412052154541, "learning_rate": 2.6531765206684052e-05, "loss": 1.2059, "step": 17070 }, { "epoch": 2.008082886325226, "grad_norm": 1.158316731452942, "learning_rate": 2.647520606779304e-05, "loss": 1.1611, "step": 17080 }, { "epoch": 2.0092585788816226, "grad_norm": 1.0926722288131714, "learning_rate": 2.6418685562194213e-05, "loss": 1.1433, "step": 17090 }, { "epoch": 2.010434271438019, "grad_norm": 1.1307834386825562, "learning_rate": 2.6362203782707644e-05, "loss": 1.1902, "step": 17100 }, { "epoch": 2.0116099639944154, "grad_norm": 1.2237147092819214, "learning_rate": 2.630576082208982e-05, "loss": 1.1127, "step": 17110 }, { "epoch": 2.012785656550812, "grad_norm": 1.021829605102539, "learning_rate": 2.6249356773033472e-05, "loss": 1.1219, "step": 17120 }, { "epoch": 2.0139613491072086, "grad_norm": 1.4557090997695923, "learning_rate": 2.6192991728167415e-05, "loss": 1.2042, "step": 17130 }, { "epoch": 2.015137041663605, "grad_norm": 1.0786534547805786, "learning_rate": 2.6136665780056464e-05, "loss": 1.152, "step": 17140 }, { "epoch": 2.0163127342200013, "grad_norm": 1.3776435852050781, "learning_rate": 2.6080379021201134e-05, "loss": 1.1779, "step": 17150 }, { "epoch": 2.017488426776398, "grad_norm": 1.1511669158935547, "learning_rate": 2.6024131544037656e-05, "loss": 1.1746, "step": 17160 }, { "epoch": 2.0186641193327945, "grad_norm": 1.1800464391708374, "learning_rate": 2.5967923440937734e-05, "loss": 1.1862, "step": 17170 }, { "epoch": 2.019839811889191, "grad_norm": 1.3135852813720703, "learning_rate": 2.591175480420841e-05, "loss": 1.2001, "step": 17180 }, { "epoch": 2.0210155044455873, "grad_norm": 1.281931757926941, "learning_rate": 2.5855625726091905e-05, "loss": 1.1994, "step": 17190 }, { "epoch": 2.022191197001984, "grad_norm": 0.966310441493988, "learning_rate": 2.5799536298765483e-05, "loss": 1.1566, "step": 17200 }, { "epoch": 2.0233668895583805, "grad_norm": 1.4731669425964355, "learning_rate": 2.5743486614341304e-05, "loss": 1.1764, "step": 17210 }, { "epoch": 2.024542582114777, "grad_norm": 1.2987688779830933, "learning_rate": 2.568747676486621e-05, "loss": 1.193, "step": 17220 }, { "epoch": 2.0257182746711733, "grad_norm": 1.048346996307373, "learning_rate": 2.563150684232168e-05, "loss": 1.1774, "step": 17230 }, { "epoch": 2.02689396722757, "grad_norm": 1.1753878593444824, "learning_rate": 2.5575576938623603e-05, "loss": 1.1864, "step": 17240 }, { "epoch": 2.0280696597839665, "grad_norm": 1.3254859447479248, "learning_rate": 2.5519687145622152e-05, "loss": 1.1406, "step": 17250 }, { "epoch": 2.029245352340363, "grad_norm": 1.0237337350845337, "learning_rate": 2.5463837555101615e-05, "loss": 1.1924, "step": 17260 }, { "epoch": 2.0304210448967597, "grad_norm": 1.1925485134124756, "learning_rate": 2.540802825878028e-05, "loss": 1.1375, "step": 17270 }, { "epoch": 2.031596737453156, "grad_norm": 1.0898339748382568, "learning_rate": 2.5352259348310247e-05, "loss": 1.1857, "step": 17280 }, { "epoch": 2.0327724300095524, "grad_norm": 1.0958776473999023, "learning_rate": 2.52965309152773e-05, "loss": 1.1812, "step": 17290 }, { "epoch": 2.033948122565949, "grad_norm": 1.4090725183486938, "learning_rate": 2.5240843051200734e-05, "loss": 1.2177, "step": 17300 }, { "epoch": 2.0351238151223456, "grad_norm": 1.2127999067306519, "learning_rate": 2.518519584753325e-05, "loss": 1.1995, "step": 17310 }, { "epoch": 2.036299507678742, "grad_norm": 1.1389780044555664, "learning_rate": 2.512958939566076e-05, "loss": 1.198, "step": 17320 }, { "epoch": 2.0374752002351384, "grad_norm": 1.1708487272262573, "learning_rate": 2.5074023786902224e-05, "loss": 1.2043, "step": 17330 }, { "epoch": 2.038650892791535, "grad_norm": 1.2045083045959473, "learning_rate": 2.5018499112509552e-05, "loss": 1.1235, "step": 17340 }, { "epoch": 2.0398265853479316, "grad_norm": 1.0825096368789673, "learning_rate": 2.496301546366745e-05, "loss": 1.1697, "step": 17350 }, { "epoch": 2.041002277904328, "grad_norm": 1.0741770267486572, "learning_rate": 2.4907572931493227e-05, "loss": 1.2148, "step": 17360 }, { "epoch": 2.0421779704607244, "grad_norm": 0.9029036164283752, "learning_rate": 2.4852171607036668e-05, "loss": 1.1383, "step": 17370 }, { "epoch": 2.043353663017121, "grad_norm": 1.4491058588027954, "learning_rate": 2.4796811581279893e-05, "loss": 1.1463, "step": 17380 }, { "epoch": 2.0445293555735176, "grad_norm": 1.0515573024749756, "learning_rate": 2.4741492945137216e-05, "loss": 1.1621, "step": 17390 }, { "epoch": 2.045705048129914, "grad_norm": 1.1995779275894165, "learning_rate": 2.4686215789454926e-05, "loss": 1.1137, "step": 17400 }, { "epoch": 2.0468807406863103, "grad_norm": 1.0032734870910645, "learning_rate": 2.463098020501124e-05, "loss": 1.2309, "step": 17410 }, { "epoch": 2.048056433242707, "grad_norm": 1.3321430683135986, "learning_rate": 2.4575786282516084e-05, "loss": 1.2148, "step": 17420 }, { "epoch": 2.0492321257991035, "grad_norm": 1.3980544805526733, "learning_rate": 2.4520634112610984e-05, "loss": 1.1725, "step": 17430 }, { "epoch": 2.0504078183555, "grad_norm": 1.2117373943328857, "learning_rate": 2.4465523785868875e-05, "loss": 1.1696, "step": 17440 }, { "epoch": 2.0515835109118967, "grad_norm": 0.8990176916122437, "learning_rate": 2.4410455392794002e-05, "loss": 1.2209, "step": 17450 }, { "epoch": 2.052759203468293, "grad_norm": 1.0289480686187744, "learning_rate": 2.4355429023821734e-05, "loss": 1.2124, "step": 17460 }, { "epoch": 2.0539348960246895, "grad_norm": 1.001167893409729, "learning_rate": 2.430044476931839e-05, "loss": 1.1643, "step": 17470 }, { "epoch": 2.055110588581086, "grad_norm": 1.367915391921997, "learning_rate": 2.4245502719581183e-05, "loss": 1.1517, "step": 17480 }, { "epoch": 2.0562862811374827, "grad_norm": 1.3795180320739746, "learning_rate": 2.419060296483798e-05, "loss": 1.1658, "step": 17490 }, { "epoch": 2.057461973693879, "grad_norm": 1.0062705278396606, "learning_rate": 2.413574559524721e-05, "loss": 1.154, "step": 17500 }, { "epoch": 2.057461973693879, "eval_loss": 1.58790922164917, "eval_runtime": 1920.0106, "eval_samples_per_second": 31.502, "eval_steps_per_second": 3.938, "step": 17500 }, { "epoch": 2.0586376662502754, "grad_norm": 1.2457820177078247, "learning_rate": 2.4080930700897687e-05, "loss": 1.1384, "step": 17510 }, { "epoch": 2.059813358806672, "grad_norm": 0.9834398627281189, "learning_rate": 2.4026158371808472e-05, "loss": 1.2125, "step": 17520 }, { "epoch": 2.0609890513630686, "grad_norm": 1.0193132162094116, "learning_rate": 2.3971428697928717e-05, "loss": 1.1622, "step": 17530 }, { "epoch": 2.0621647439194652, "grad_norm": 1.402212381362915, "learning_rate": 2.391674176913753e-05, "loss": 1.1423, "step": 17540 }, { "epoch": 2.0633404364758614, "grad_norm": 1.1553230285644531, "learning_rate": 2.3862097675243822e-05, "loss": 1.2286, "step": 17550 }, { "epoch": 2.064516129032258, "grad_norm": 1.1064622402191162, "learning_rate": 2.3807496505986164e-05, "loss": 1.1728, "step": 17560 }, { "epoch": 2.0656918215886546, "grad_norm": 1.064599633216858, "learning_rate": 2.3752938351032623e-05, "loss": 1.1459, "step": 17570 }, { "epoch": 2.066867514145051, "grad_norm": 1.2186393737792969, "learning_rate": 2.3698423299980648e-05, "loss": 1.1799, "step": 17580 }, { "epoch": 2.0680432067014474, "grad_norm": 1.410469889640808, "learning_rate": 2.3643951442356853e-05, "loss": 1.1325, "step": 17590 }, { "epoch": 2.069218899257844, "grad_norm": 1.241778016090393, "learning_rate": 2.3589522867616964e-05, "loss": 1.1797, "step": 17600 }, { "epoch": 2.0703945918142406, "grad_norm": 1.6064949035644531, "learning_rate": 2.3535137665145618e-05, "loss": 1.1718, "step": 17610 }, { "epoch": 2.071570284370637, "grad_norm": 1.2114245891571045, "learning_rate": 2.348079592425621e-05, "loss": 1.2042, "step": 17620 }, { "epoch": 2.0727459769270338, "grad_norm": 1.0890687704086304, "learning_rate": 2.3426497734190776e-05, "loss": 1.2062, "step": 17630 }, { "epoch": 2.07392166948343, "grad_norm": 1.1578255891799927, "learning_rate": 2.337224318411984e-05, "loss": 1.1866, "step": 17640 }, { "epoch": 2.0750973620398265, "grad_norm": 1.1084034442901611, "learning_rate": 2.3318032363142212e-05, "loss": 1.1728, "step": 17650 }, { "epoch": 2.076273054596223, "grad_norm": 1.2607510089874268, "learning_rate": 2.3263865360284936e-05, "loss": 1.1341, "step": 17660 }, { "epoch": 2.0774487471526197, "grad_norm": 1.6853026151657104, "learning_rate": 2.3209742264503077e-05, "loss": 1.1772, "step": 17670 }, { "epoch": 2.078624439709016, "grad_norm": 1.2155693769454956, "learning_rate": 2.3155663164679598e-05, "loss": 1.1977, "step": 17680 }, { "epoch": 2.0798001322654125, "grad_norm": 0.9608173966407776, "learning_rate": 2.3101628149625208e-05, "loss": 1.1515, "step": 17690 }, { "epoch": 2.080975824821809, "grad_norm": 0.8812496066093445, "learning_rate": 2.3047637308078223e-05, "loss": 1.2189, "step": 17700 }, { "epoch": 2.0821515173782057, "grad_norm": 1.350563883781433, "learning_rate": 2.2993690728704436e-05, "loss": 1.2182, "step": 17710 }, { "epoch": 2.0833272099346023, "grad_norm": 1.1628788709640503, "learning_rate": 2.2939788500096882e-05, "loss": 1.1649, "step": 17720 }, { "epoch": 2.0845029024909985, "grad_norm": 1.1747791767120361, "learning_rate": 2.288593071077583e-05, "loss": 1.1584, "step": 17730 }, { "epoch": 2.085678595047395, "grad_norm": 1.205520510673523, "learning_rate": 2.283211744918854e-05, "loss": 1.1685, "step": 17740 }, { "epoch": 2.0868542876037917, "grad_norm": 1.0954421758651733, "learning_rate": 2.277834880370916e-05, "loss": 1.1221, "step": 17750 }, { "epoch": 2.0880299801601883, "grad_norm": 1.0718857049942017, "learning_rate": 2.2724624862638562e-05, "loss": 1.2125, "step": 17760 }, { "epoch": 2.0892056727165844, "grad_norm": 1.3436187505722046, "learning_rate": 2.2670945714204195e-05, "loss": 1.206, "step": 17770 }, { "epoch": 2.090381365272981, "grad_norm": 1.2940738201141357, "learning_rate": 2.261731144655996e-05, "loss": 1.1363, "step": 17780 }, { "epoch": 2.0915570578293776, "grad_norm": 1.3948806524276733, "learning_rate": 2.2563722147786042e-05, "loss": 1.1375, "step": 17790 }, { "epoch": 2.092732750385774, "grad_norm": 1.1506513357162476, "learning_rate": 2.2510177905888785e-05, "loss": 1.1416, "step": 17800 }, { "epoch": 2.093908442942171, "grad_norm": 1.3147773742675781, "learning_rate": 2.245667880880054e-05, "loss": 1.1618, "step": 17810 }, { "epoch": 2.095084135498567, "grad_norm": 1.341098427772522, "learning_rate": 2.2403224944379508e-05, "loss": 1.2718, "step": 17820 }, { "epoch": 2.0962598280549636, "grad_norm": 1.3428281545639038, "learning_rate": 2.2349816400409646e-05, "loss": 1.1938, "step": 17830 }, { "epoch": 2.09743552061136, "grad_norm": 1.0362529754638672, "learning_rate": 2.2296453264600398e-05, "loss": 1.1848, "step": 17840 }, { "epoch": 2.098611213167757, "grad_norm": 0.976139485836029, "learning_rate": 2.224313562458672e-05, "loss": 1.206, "step": 17850 }, { "epoch": 2.099786905724153, "grad_norm": 1.1379119157791138, "learning_rate": 2.2189863567928826e-05, "loss": 1.136, "step": 17860 }, { "epoch": 2.1009625982805495, "grad_norm": 1.1178689002990723, "learning_rate": 2.213663718211207e-05, "loss": 1.2008, "step": 17870 }, { "epoch": 2.102138290836946, "grad_norm": 1.2207353115081787, "learning_rate": 2.20834565545468e-05, "loss": 1.2114, "step": 17880 }, { "epoch": 2.1033139833933427, "grad_norm": 1.2144713401794434, "learning_rate": 2.2030321772568223e-05, "loss": 1.1927, "step": 17890 }, { "epoch": 2.1044896759497393, "grad_norm": 1.3641352653503418, "learning_rate": 2.197723292343628e-05, "loss": 1.2278, "step": 17900 }, { "epoch": 2.1056653685061355, "grad_norm": 1.1422010660171509, "learning_rate": 2.1924190094335406e-05, "loss": 1.119, "step": 17910 }, { "epoch": 2.106841061062532, "grad_norm": 1.6008025407791138, "learning_rate": 2.1871193372374544e-05, "loss": 1.1534, "step": 17920 }, { "epoch": 2.1080167536189287, "grad_norm": 1.2250964641571045, "learning_rate": 2.1818242844586867e-05, "loss": 1.1952, "step": 17930 }, { "epoch": 2.1091924461753253, "grad_norm": 1.0745422840118408, "learning_rate": 2.1765338597929713e-05, "loss": 1.1775, "step": 17940 }, { "epoch": 2.1103681387317215, "grad_norm": 1.3886727094650269, "learning_rate": 2.17124807192844e-05, "loss": 1.2199, "step": 17950 }, { "epoch": 2.111543831288118, "grad_norm": 1.3466503620147705, "learning_rate": 2.1659669295456104e-05, "loss": 1.2002, "step": 17960 }, { "epoch": 2.1127195238445147, "grad_norm": 1.0780349969863892, "learning_rate": 2.1606904413173733e-05, "loss": 1.1792, "step": 17970 }, { "epoch": 2.1138952164009113, "grad_norm": 1.0444084405899048, "learning_rate": 2.15541861590897e-05, "loss": 1.1508, "step": 17980 }, { "epoch": 2.115070908957308, "grad_norm": 1.0701510906219482, "learning_rate": 2.1501514619779905e-05, "loss": 1.1826, "step": 17990 }, { "epoch": 2.116246601513704, "grad_norm": 1.150806188583374, "learning_rate": 2.144888988174351e-05, "loss": 1.2142, "step": 18000 }, { "epoch": 2.116246601513704, "eval_loss": 1.5825936794281006, "eval_runtime": 1922.0621, "eval_samples_per_second": 31.469, "eval_steps_per_second": 3.934, "step": 18000 }, { "epoch": 2.1174222940701006, "grad_norm": 1.3464100360870361, "learning_rate": 2.1396312031402815e-05, "loss": 1.2233, "step": 18010 }, { "epoch": 2.1185979866264972, "grad_norm": 1.2402698993682861, "learning_rate": 2.1343781155103125e-05, "loss": 1.1911, "step": 18020 }, { "epoch": 2.119773679182894, "grad_norm": 1.195701003074646, "learning_rate": 2.1291297339112604e-05, "loss": 1.1737, "step": 18030 }, { "epoch": 2.12094937173929, "grad_norm": 1.0973713397979736, "learning_rate": 2.1238860669622125e-05, "loss": 1.1808, "step": 18040 }, { "epoch": 2.1221250642956866, "grad_norm": 0.9375945925712585, "learning_rate": 2.1186471232745137e-05, "loss": 1.1613, "step": 18050 }, { "epoch": 2.123300756852083, "grad_norm": 1.2476454973220825, "learning_rate": 2.113412911451752e-05, "loss": 1.1175, "step": 18060 }, { "epoch": 2.12447644940848, "grad_norm": 0.9893251061439514, "learning_rate": 2.108183440089746e-05, "loss": 1.1313, "step": 18070 }, { "epoch": 2.125652141964876, "grad_norm": 0.9714356660842896, "learning_rate": 2.1029587177765287e-05, "loss": 1.1501, "step": 18080 }, { "epoch": 2.1268278345212726, "grad_norm": 1.5072606801986694, "learning_rate": 2.097738753092331e-05, "loss": 1.2041, "step": 18090 }, { "epoch": 2.128003527077669, "grad_norm": 1.297592043876648, "learning_rate": 2.092523554609574e-05, "loss": 1.175, "step": 18100 }, { "epoch": 2.1291792196340658, "grad_norm": 1.1861096620559692, "learning_rate": 2.0873131308928518e-05, "loss": 1.1607, "step": 18110 }, { "epoch": 2.1303549121904624, "grad_norm": 1.3976826667785645, "learning_rate": 2.0821074904989152e-05, "loss": 1.1996, "step": 18120 }, { "epoch": 2.1315306047468585, "grad_norm": 1.2374169826507568, "learning_rate": 2.0769066419766612e-05, "loss": 1.1694, "step": 18130 }, { "epoch": 2.132706297303255, "grad_norm": 1.2603273391723633, "learning_rate": 2.071710593867117e-05, "loss": 1.1713, "step": 18140 }, { "epoch": 2.1338819898596517, "grad_norm": 1.0034083127975464, "learning_rate": 2.066519354703427e-05, "loss": 1.2232, "step": 18150 }, { "epoch": 2.1350576824160483, "grad_norm": 1.023352026939392, "learning_rate": 2.0613329330108354e-05, "loss": 1.1501, "step": 18160 }, { "epoch": 2.136233374972445, "grad_norm": 1.163655400276184, "learning_rate": 2.056151337306677e-05, "loss": 1.1385, "step": 18170 }, { "epoch": 2.137409067528841, "grad_norm": 1.4285978078842163, "learning_rate": 2.0509745761003623e-05, "loss": 1.172, "step": 18180 }, { "epoch": 2.1385847600852377, "grad_norm": 1.3053101301193237, "learning_rate": 2.045802657893361e-05, "loss": 1.1711, "step": 18190 }, { "epoch": 2.1397604526416343, "grad_norm": 1.0417462587356567, "learning_rate": 2.040635591179189e-05, "loss": 1.1343, "step": 18200 }, { "epoch": 2.140936145198031, "grad_norm": 1.2198718786239624, "learning_rate": 2.0354733844433966e-05, "loss": 1.1718, "step": 18210 }, { "epoch": 2.142111837754427, "grad_norm": 1.1064785718917847, "learning_rate": 2.0303160461635524e-05, "loss": 1.2283, "step": 18220 }, { "epoch": 2.1432875303108236, "grad_norm": 1.150686264038086, "learning_rate": 2.025163584809227e-05, "loss": 1.1786, "step": 18230 }, { "epoch": 2.1444632228672202, "grad_norm": 1.021859884262085, "learning_rate": 2.020016008841985e-05, "loss": 1.1305, "step": 18240 }, { "epoch": 2.145638915423617, "grad_norm": 1.4452184438705444, "learning_rate": 2.0148733267153686e-05, "loss": 1.1565, "step": 18250 }, { "epoch": 2.146814607980013, "grad_norm": 1.2597455978393555, "learning_rate": 2.0097355468748797e-05, "loss": 1.1582, "step": 18260 }, { "epoch": 2.1479903005364096, "grad_norm": 1.1630361080169678, "learning_rate": 2.0046026777579734e-05, "loss": 1.2388, "step": 18270 }, { "epoch": 2.149165993092806, "grad_norm": 1.2438710927963257, "learning_rate": 1.999474727794038e-05, "loss": 1.1131, "step": 18280 }, { "epoch": 2.150341685649203, "grad_norm": 1.3585916757583618, "learning_rate": 1.9943517054043832e-05, "loss": 1.1223, "step": 18290 }, { "epoch": 2.1515173782055994, "grad_norm": 1.1814576387405396, "learning_rate": 1.989233619002227e-05, "loss": 1.1401, "step": 18300 }, { "epoch": 2.1526930707619956, "grad_norm": 1.4893028736114502, "learning_rate": 1.984120476992682e-05, "loss": 1.0854, "step": 18310 }, { "epoch": 2.153868763318392, "grad_norm": 1.0900344848632812, "learning_rate": 1.97901228777274e-05, "loss": 1.2708, "step": 18320 }, { "epoch": 2.1550444558747888, "grad_norm": 1.171908974647522, "learning_rate": 1.9739090597312603e-05, "loss": 1.2086, "step": 18330 }, { "epoch": 2.1562201484311854, "grad_norm": 1.0144342184066772, "learning_rate": 1.9688108012489542e-05, "loss": 1.1879, "step": 18340 }, { "epoch": 2.157395840987582, "grad_norm": 1.142566442489624, "learning_rate": 1.9637175206983692e-05, "loss": 1.1681, "step": 18350 }, { "epoch": 2.158571533543978, "grad_norm": 1.1247198581695557, "learning_rate": 1.958629226443881e-05, "loss": 1.1983, "step": 18360 }, { "epoch": 2.1597472261003747, "grad_norm": 1.3419592380523682, "learning_rate": 1.9535459268416766e-05, "loss": 1.1883, "step": 18370 }, { "epoch": 2.1609229186567713, "grad_norm": 1.1084849834442139, "learning_rate": 1.9484676302397397e-05, "loss": 1.1662, "step": 18380 }, { "epoch": 2.162098611213168, "grad_norm": 1.1538888216018677, "learning_rate": 1.943394344977838e-05, "loss": 1.1889, "step": 18390 }, { "epoch": 2.163274303769564, "grad_norm": 1.2092742919921875, "learning_rate": 1.9383260793875086e-05, "loss": 1.1091, "step": 18400 }, { "epoch": 2.1644499963259607, "grad_norm": 1.192043423652649, "learning_rate": 1.9332628417920485e-05, "loss": 1.1498, "step": 18410 }, { "epoch": 2.1656256888823573, "grad_norm": 1.4232511520385742, "learning_rate": 1.9282046405064913e-05, "loss": 1.1769, "step": 18420 }, { "epoch": 2.166801381438754, "grad_norm": 1.2204766273498535, "learning_rate": 1.9231514838376046e-05, "loss": 1.1524, "step": 18430 }, { "epoch": 2.16797707399515, "grad_norm": 1.2995812892913818, "learning_rate": 1.9181033800838716e-05, "loss": 1.1593, "step": 18440 }, { "epoch": 2.1691527665515467, "grad_norm": 1.035925269126892, "learning_rate": 1.913060337535475e-05, "loss": 1.139, "step": 18450 }, { "epoch": 2.1703284591079433, "grad_norm": 1.138511061668396, "learning_rate": 1.9080223644742872e-05, "loss": 1.1083, "step": 18460 }, { "epoch": 2.17150415166434, "grad_norm": 1.2462079524993896, "learning_rate": 1.9029894691738553e-05, "loss": 1.157, "step": 18470 }, { "epoch": 2.1726798442207365, "grad_norm": 1.3438496589660645, "learning_rate": 1.8979616598993888e-05, "loss": 1.1652, "step": 18480 }, { "epoch": 2.1738555367771326, "grad_norm": 1.3186845779418945, "learning_rate": 1.89293894490774e-05, "loss": 1.1813, "step": 18490 }, { "epoch": 2.175031229333529, "grad_norm": 1.3384515047073364, "learning_rate": 1.8879213324474005e-05, "loss": 1.1634, "step": 18500 }, { "epoch": 2.175031229333529, "eval_loss": 1.5811468362808228, "eval_runtime": 1921.565, "eval_samples_per_second": 31.477, "eval_steps_per_second": 3.935, "step": 18500 }, { "epoch": 2.176206921889926, "grad_norm": 1.1734453439712524, "learning_rate": 1.882908830758479e-05, "loss": 1.1209, "step": 18510 }, { "epoch": 2.1773826144463224, "grad_norm": 1.4251855611801147, "learning_rate": 1.8779014480726935e-05, "loss": 1.1669, "step": 18520 }, { "epoch": 2.178558307002719, "grad_norm": 1.272411823272705, "learning_rate": 1.8728991926133538e-05, "loss": 1.1346, "step": 18530 }, { "epoch": 2.179733999559115, "grad_norm": 1.1146141290664673, "learning_rate": 1.8679020725953504e-05, "loss": 1.1977, "step": 18540 }, { "epoch": 2.1809096921155118, "grad_norm": 1.016687273979187, "learning_rate": 1.8629100962251387e-05, "loss": 1.1364, "step": 18550 }, { "epoch": 2.1820853846719084, "grad_norm": 1.1324809789657593, "learning_rate": 1.8579232717007295e-05, "loss": 1.1506, "step": 18560 }, { "epoch": 2.183261077228305, "grad_norm": 1.0038975477218628, "learning_rate": 1.8529416072116707e-05, "loss": 1.15, "step": 18570 }, { "epoch": 2.184436769784701, "grad_norm": 1.242782473564148, "learning_rate": 1.8479651109390373e-05, "loss": 1.2168, "step": 18580 }, { "epoch": 2.1856124623410977, "grad_norm": 1.4568074941635132, "learning_rate": 1.8429937910554183e-05, "loss": 1.223, "step": 18590 }, { "epoch": 2.1867881548974943, "grad_norm": 1.1878129243850708, "learning_rate": 1.8380276557248978e-05, "loss": 1.1132, "step": 18600 }, { "epoch": 2.187963847453891, "grad_norm": 1.1372034549713135, "learning_rate": 1.833066713103048e-05, "loss": 1.1741, "step": 18610 }, { "epoch": 2.189139540010287, "grad_norm": 1.0757704973220825, "learning_rate": 1.8281109713369154e-05, "loss": 1.2394, "step": 18620 }, { "epoch": 2.1903152325666837, "grad_norm": 1.4496842622756958, "learning_rate": 1.8231604385650026e-05, "loss": 1.1537, "step": 18630 }, { "epoch": 2.1914909251230803, "grad_norm": 1.1333884000778198, "learning_rate": 1.8182151229172583e-05, "loss": 1.2107, "step": 18640 }, { "epoch": 2.192666617679477, "grad_norm": 1.218798279762268, "learning_rate": 1.813275032515065e-05, "loss": 1.1837, "step": 18650 }, { "epoch": 2.1938423102358735, "grad_norm": 1.185654878616333, "learning_rate": 1.808340175471224e-05, "loss": 1.1967, "step": 18660 }, { "epoch": 2.1950180027922697, "grad_norm": 1.0079882144927979, "learning_rate": 1.8034105598899386e-05, "loss": 1.1504, "step": 18670 }, { "epoch": 2.1961936953486663, "grad_norm": 1.0848162174224854, "learning_rate": 1.7984861938668073e-05, "loss": 1.1233, "step": 18680 }, { "epoch": 2.197369387905063, "grad_norm": 1.2376140356063843, "learning_rate": 1.7935670854888087e-05, "loss": 1.1103, "step": 18690 }, { "epoch": 2.1985450804614595, "grad_norm": 1.0789793729782104, "learning_rate": 1.788653242834285e-05, "loss": 1.1435, "step": 18700 }, { "epoch": 2.1997207730178556, "grad_norm": 1.4154555797576904, "learning_rate": 1.7837446739729315e-05, "loss": 1.1677, "step": 18710 }, { "epoch": 2.2008964655742522, "grad_norm": 1.0390441417694092, "learning_rate": 1.7788413869657827e-05, "loss": 1.1565, "step": 18720 }, { "epoch": 2.202072158130649, "grad_norm": 1.1114355325698853, "learning_rate": 1.7739433898652008e-05, "loss": 1.2292, "step": 18730 }, { "epoch": 2.2032478506870454, "grad_norm": 1.2076687812805176, "learning_rate": 1.769050690714856e-05, "loss": 1.1886, "step": 18740 }, { "epoch": 2.204423543243442, "grad_norm": 1.4034390449523926, "learning_rate": 1.7641632975497215e-05, "loss": 1.1553, "step": 18750 }, { "epoch": 2.205599235799838, "grad_norm": 1.1046481132507324, "learning_rate": 1.7592812183960572e-05, "loss": 1.1698, "step": 18760 }, { "epoch": 2.206774928356235, "grad_norm": 1.0523169040679932, "learning_rate": 1.754404461271395e-05, "loss": 1.1982, "step": 18770 }, { "epoch": 2.2079506209126314, "grad_norm": 1.2754162549972534, "learning_rate": 1.7495330341845266e-05, "loss": 1.1759, "step": 18780 }, { "epoch": 2.209126313469028, "grad_norm": 1.1759934425354004, "learning_rate": 1.7446669451354915e-05, "loss": 1.1365, "step": 18790 }, { "epoch": 2.210302006025424, "grad_norm": 1.5871251821517944, "learning_rate": 1.739806202115562e-05, "loss": 1.1275, "step": 18800 }, { "epoch": 2.2114776985818207, "grad_norm": 1.055656909942627, "learning_rate": 1.7349508131072307e-05, "loss": 1.1728, "step": 18810 }, { "epoch": 2.2126533911382174, "grad_norm": 0.9734097123146057, "learning_rate": 1.7301007860841988e-05, "loss": 1.1853, "step": 18820 }, { "epoch": 2.213829083694614, "grad_norm": 1.6303898096084595, "learning_rate": 1.725256129011361e-05, "loss": 1.2047, "step": 18830 }, { "epoch": 2.2150047762510106, "grad_norm": 1.1266059875488281, "learning_rate": 1.720416849844793e-05, "loss": 1.1575, "step": 18840 }, { "epoch": 2.2161804688074067, "grad_norm": 1.3701244592666626, "learning_rate": 1.715582956531742e-05, "loss": 1.191, "step": 18850 }, { "epoch": 2.2173561613638033, "grad_norm": 1.1883749961853027, "learning_rate": 1.710754457010603e-05, "loss": 1.1589, "step": 18860 }, { "epoch": 2.2185318539202, "grad_norm": 0.9824060797691345, "learning_rate": 1.70593135921092e-05, "loss": 1.1635, "step": 18870 }, { "epoch": 2.2197075464765965, "grad_norm": 1.1506266593933105, "learning_rate": 1.701113671053363e-05, "loss": 1.2683, "step": 18880 }, { "epoch": 2.2208832390329927, "grad_norm": 1.1196383237838745, "learning_rate": 1.6963014004497198e-05, "loss": 1.1606, "step": 18890 }, { "epoch": 2.2220589315893893, "grad_norm": 1.1435905694961548, "learning_rate": 1.6914945553028793e-05, "loss": 1.1808, "step": 18900 }, { "epoch": 2.223234624145786, "grad_norm": 1.3259692192077637, "learning_rate": 1.686693143506824e-05, "loss": 1.1327, "step": 18910 }, { "epoch": 2.2244103167021825, "grad_norm": 0.9766810536384583, "learning_rate": 1.6818971729466072e-05, "loss": 1.1576, "step": 18920 }, { "epoch": 2.225586009258579, "grad_norm": 1.0368012189865112, "learning_rate": 1.677106651498352e-05, "loss": 1.161, "step": 18930 }, { "epoch": 2.2267617018149752, "grad_norm": 1.8915616273880005, "learning_rate": 1.6723215870292306e-05, "loss": 1.1533, "step": 18940 }, { "epoch": 2.227937394371372, "grad_norm": 1.2589011192321777, "learning_rate": 1.6675419873974553e-05, "loss": 1.1237, "step": 18950 }, { "epoch": 2.2291130869277684, "grad_norm": 1.5867886543273926, "learning_rate": 1.6627678604522617e-05, "loss": 1.1508, "step": 18960 }, { "epoch": 2.230288779484165, "grad_norm": 1.1096458435058594, "learning_rate": 1.6579992140338985e-05, "loss": 1.1081, "step": 18970 }, { "epoch": 2.231464472040561, "grad_norm": 0.9539268016815186, "learning_rate": 1.6532360559736158e-05, "loss": 1.1492, "step": 18980 }, { "epoch": 2.232640164596958, "grad_norm": 1.404564619064331, "learning_rate": 1.6484783940936472e-05, "loss": 1.0947, "step": 18990 }, { "epoch": 2.2338158571533544, "grad_norm": 1.3855056762695312, "learning_rate": 1.643726236207202e-05, "loss": 1.1774, "step": 19000 }, { "epoch": 2.2338158571533544, "eval_loss": 1.574950098991394, "eval_runtime": 1922.1688, "eval_samples_per_second": 31.467, "eval_steps_per_second": 3.934, "step": 19000 }, { "epoch": 2.234991549709751, "grad_norm": 1.0890169143676758, "learning_rate": 1.638979590118452e-05, "loss": 1.1471, "step": 19010 }, { "epoch": 2.2361672422661476, "grad_norm": 1.0967589616775513, "learning_rate": 1.634238463622515e-05, "loss": 1.1916, "step": 19020 }, { "epoch": 2.2373429348225438, "grad_norm": 1.3717865943908691, "learning_rate": 1.629502864505446e-05, "loss": 1.1706, "step": 19030 }, { "epoch": 2.2385186273789404, "grad_norm": 1.2884374856948853, "learning_rate": 1.6247728005442224e-05, "loss": 1.1354, "step": 19040 }, { "epoch": 2.239694319935337, "grad_norm": 1.0294396877288818, "learning_rate": 1.6200482795067313e-05, "loss": 1.2366, "step": 19050 }, { "epoch": 2.2408700124917336, "grad_norm": 1.2356408834457397, "learning_rate": 1.6153293091517564e-05, "loss": 1.0955, "step": 19060 }, { "epoch": 2.2420457050481297, "grad_norm": 1.4343918561935425, "learning_rate": 1.6106158972289666e-05, "loss": 1.1758, "step": 19070 }, { "epoch": 2.2432213976045263, "grad_norm": 1.179949402809143, "learning_rate": 1.6059080514789026e-05, "loss": 1.1857, "step": 19080 }, { "epoch": 2.244397090160923, "grad_norm": 1.448632836341858, "learning_rate": 1.6012057796329637e-05, "loss": 1.1483, "step": 19090 }, { "epoch": 2.2455727827173195, "grad_norm": 1.18316650390625, "learning_rate": 1.5965090894133976e-05, "loss": 1.161, "step": 19100 }, { "epoch": 2.246748475273716, "grad_norm": 1.1989649534225464, "learning_rate": 1.591817988533281e-05, "loss": 1.1124, "step": 19110 }, { "epoch": 2.2479241678301123, "grad_norm": 1.268152117729187, "learning_rate": 1.587132484696515e-05, "loss": 1.1677, "step": 19120 }, { "epoch": 2.249099860386509, "grad_norm": 1.2328910827636719, "learning_rate": 1.582452585597809e-05, "loss": 1.1746, "step": 19130 }, { "epoch": 2.2502755529429055, "grad_norm": 1.0572108030319214, "learning_rate": 1.5777782989226676e-05, "loss": 1.1573, "step": 19140 }, { "epoch": 2.251451245499302, "grad_norm": 1.4675514698028564, "learning_rate": 1.5731096323473776e-05, "loss": 1.1894, "step": 19150 }, { "epoch": 2.2526269380556982, "grad_norm": 1.2786047458648682, "learning_rate": 1.5684465935389976e-05, "loss": 1.1623, "step": 19160 }, { "epoch": 2.253802630612095, "grad_norm": 1.171980857849121, "learning_rate": 1.5637891901553446e-05, "loss": 1.1563, "step": 19170 }, { "epoch": 2.2549783231684914, "grad_norm": 1.0144617557525635, "learning_rate": 1.559137429844978e-05, "loss": 1.1699, "step": 19180 }, { "epoch": 2.256154015724888, "grad_norm": 1.2853516340255737, "learning_rate": 1.5544913202471916e-05, "loss": 1.1847, "step": 19190 }, { "epoch": 2.2573297082812847, "grad_norm": 1.133269190788269, "learning_rate": 1.5498508689920004e-05, "loss": 1.1452, "step": 19200 }, { "epoch": 2.258505400837681, "grad_norm": 1.1545765399932861, "learning_rate": 1.5452160837001256e-05, "loss": 1.1609, "step": 19210 }, { "epoch": 2.2596810933940774, "grad_norm": 1.1900852918624878, "learning_rate": 1.5405869719829852e-05, "loss": 1.1163, "step": 19220 }, { "epoch": 2.260856785950474, "grad_norm": 1.2840280532836914, "learning_rate": 1.5359635414426777e-05, "loss": 1.1458, "step": 19230 }, { "epoch": 2.2620324785068706, "grad_norm": 1.3721604347229004, "learning_rate": 1.5313457996719743e-05, "loss": 1.2207, "step": 19240 }, { "epoch": 2.2632081710632668, "grad_norm": 1.130231261253357, "learning_rate": 1.5267337542543004e-05, "loss": 1.1339, "step": 19250 }, { "epoch": 2.2643838636196634, "grad_norm": 1.0239814519882202, "learning_rate": 1.5221274127637292e-05, "loss": 1.1188, "step": 19260 }, { "epoch": 2.26555955617606, "grad_norm": 1.0587518215179443, "learning_rate": 1.5175267827649664e-05, "loss": 1.1926, "step": 19270 }, { "epoch": 2.2667352487324566, "grad_norm": 1.2831840515136719, "learning_rate": 1.5129318718133378e-05, "loss": 1.1648, "step": 19280 }, { "epoch": 2.2679109412888527, "grad_norm": 1.3657269477844238, "learning_rate": 1.5083426874547774e-05, "loss": 1.2233, "step": 19290 }, { "epoch": 2.2690866338452493, "grad_norm": 1.4194868803024292, "learning_rate": 1.5037592372258147e-05, "loss": 1.1886, "step": 19300 }, { "epoch": 2.270262326401646, "grad_norm": 1.060125708580017, "learning_rate": 1.4991815286535615e-05, "loss": 1.1457, "step": 19310 }, { "epoch": 2.2714380189580425, "grad_norm": 1.263488531112671, "learning_rate": 1.4946095692557022e-05, "loss": 1.1403, "step": 19320 }, { "epoch": 2.272613711514439, "grad_norm": 1.3042213916778564, "learning_rate": 1.4900433665404778e-05, "loss": 1.178, "step": 19330 }, { "epoch": 2.2737894040708353, "grad_norm": 1.2620720863342285, "learning_rate": 1.4854829280066768e-05, "loss": 1.1316, "step": 19340 }, { "epoch": 2.274965096627232, "grad_norm": 1.1428825855255127, "learning_rate": 1.4809282611436227e-05, "loss": 1.2199, "step": 19350 }, { "epoch": 2.2761407891836285, "grad_norm": 1.260157823562622, "learning_rate": 1.4763793734311554e-05, "loss": 1.1956, "step": 19360 }, { "epoch": 2.277316481740025, "grad_norm": 1.1738663911819458, "learning_rate": 1.4718362723396295e-05, "loss": 1.1479, "step": 19370 }, { "epoch": 2.2784921742964217, "grad_norm": 1.4211270809173584, "learning_rate": 1.4672989653298946e-05, "loss": 1.0981, "step": 19380 }, { "epoch": 2.279667866852818, "grad_norm": 1.6622451543807983, "learning_rate": 1.4627674598532858e-05, "loss": 1.2617, "step": 19390 }, { "epoch": 2.2808435594092145, "grad_norm": 1.1325006484985352, "learning_rate": 1.4582417633516088e-05, "loss": 1.1488, "step": 19400 }, { "epoch": 2.282019251965611, "grad_norm": 1.2660483121871948, "learning_rate": 1.453721883257132e-05, "loss": 1.2077, "step": 19410 }, { "epoch": 2.2831949445220077, "grad_norm": 1.0817116498947144, "learning_rate": 1.4492078269925718e-05, "loss": 1.1327, "step": 19420 }, { "epoch": 2.284370637078404, "grad_norm": 1.5454682111740112, "learning_rate": 1.4446996019710763e-05, "loss": 1.1599, "step": 19430 }, { "epoch": 2.2855463296348004, "grad_norm": 1.2672510147094727, "learning_rate": 1.4401972155962219e-05, "loss": 1.179, "step": 19440 }, { "epoch": 2.286722022191197, "grad_norm": 1.3579559326171875, "learning_rate": 1.4357006752619951e-05, "loss": 1.1425, "step": 19450 }, { "epoch": 2.2878977147475936, "grad_norm": 1.0969511270523071, "learning_rate": 1.4312099883527818e-05, "loss": 1.1877, "step": 19460 }, { "epoch": 2.28907340730399, "grad_norm": 0.9994701743125916, "learning_rate": 1.4267251622433553e-05, "loss": 1.109, "step": 19470 }, { "epoch": 2.2902490998603864, "grad_norm": 1.4596900939941406, "learning_rate": 1.4222462042988644e-05, "loss": 1.1865, "step": 19480 }, { "epoch": 2.291424792416783, "grad_norm": 1.3092703819274902, "learning_rate": 1.4177731218748208e-05, "loss": 1.1327, "step": 19490 }, { "epoch": 2.2926004849731796, "grad_norm": 1.2148137092590332, "learning_rate": 1.4133059223170852e-05, "loss": 1.196, "step": 19500 }, { "epoch": 2.2926004849731796, "eval_loss": 1.5731528997421265, "eval_runtime": 1919.8261, "eval_samples_per_second": 31.505, "eval_steps_per_second": 3.938, "step": 19500 }, { "epoch": 2.293776177529576, "grad_norm": 0.9516290426254272, "learning_rate": 1.4088446129618599e-05, "loss": 1.1889, "step": 19510 }, { "epoch": 2.2949518700859723, "grad_norm": 1.0004019737243652, "learning_rate": 1.4043892011356729e-05, "loss": 1.1793, "step": 19520 }, { "epoch": 2.296127562642369, "grad_norm": 1.2669215202331543, "learning_rate": 1.3999396941553683e-05, "loss": 1.1735, "step": 19530 }, { "epoch": 2.2973032551987655, "grad_norm": 1.096100926399231, "learning_rate": 1.3954960993280907e-05, "loss": 1.1457, "step": 19540 }, { "epoch": 2.298478947755162, "grad_norm": 1.3750991821289062, "learning_rate": 1.3910584239512786e-05, "loss": 1.1559, "step": 19550 }, { "epoch": 2.2996546403115588, "grad_norm": 1.1424857378005981, "learning_rate": 1.3866266753126462e-05, "loss": 1.1473, "step": 19560 }, { "epoch": 2.300830332867955, "grad_norm": 1.2224271297454834, "learning_rate": 1.3822008606901766e-05, "loss": 1.2112, "step": 19570 }, { "epoch": 2.3020060254243515, "grad_norm": 1.176493763923645, "learning_rate": 1.3777809873521069e-05, "loss": 1.0975, "step": 19580 }, { "epoch": 2.303181717980748, "grad_norm": 1.1921324729919434, "learning_rate": 1.373367062556919e-05, "loss": 1.1405, "step": 19590 }, { "epoch": 2.3043574105371447, "grad_norm": 1.386183261871338, "learning_rate": 1.3689590935533226e-05, "loss": 1.147, "step": 19600 }, { "epoch": 2.305533103093541, "grad_norm": 1.037595510482788, "learning_rate": 1.3645570875802511e-05, "loss": 1.1502, "step": 19610 }, { "epoch": 2.3067087956499375, "grad_norm": 1.1665080785751343, "learning_rate": 1.3601610518668395e-05, "loss": 1.2234, "step": 19620 }, { "epoch": 2.307884488206334, "grad_norm": 0.9666273593902588, "learning_rate": 1.3557709936324225e-05, "loss": 1.1442, "step": 19630 }, { "epoch": 2.3090601807627307, "grad_norm": 1.1648529767990112, "learning_rate": 1.3513869200865171e-05, "loss": 1.1877, "step": 19640 }, { "epoch": 2.310235873319127, "grad_norm": 1.1383777856826782, "learning_rate": 1.3470088384288127e-05, "loss": 1.1667, "step": 19650 }, { "epoch": 2.3114115658755234, "grad_norm": 1.311725378036499, "learning_rate": 1.3426367558491566e-05, "loss": 1.1867, "step": 19660 }, { "epoch": 2.31258725843192, "grad_norm": 1.0311052799224854, "learning_rate": 1.3382706795275468e-05, "loss": 1.1462, "step": 19670 }, { "epoch": 2.3137629509883166, "grad_norm": 1.196685552597046, "learning_rate": 1.333910616634117e-05, "loss": 1.1165, "step": 19680 }, { "epoch": 2.3149386435447132, "grad_norm": 1.1472617387771606, "learning_rate": 1.3295565743291228e-05, "loss": 1.1792, "step": 19690 }, { "epoch": 2.3161143361011094, "grad_norm": 1.3739311695098877, "learning_rate": 1.325208559762935e-05, "loss": 1.1927, "step": 19700 }, { "epoch": 2.317290028657506, "grad_norm": 1.0228461027145386, "learning_rate": 1.3208665800760251e-05, "loss": 1.122, "step": 19710 }, { "epoch": 2.3184657212139026, "grad_norm": 1.2850650548934937, "learning_rate": 1.3165306423989544e-05, "loss": 1.113, "step": 19720 }, { "epoch": 2.319641413770299, "grad_norm": 1.389392614364624, "learning_rate": 1.3122007538523607e-05, "loss": 1.187, "step": 19730 }, { "epoch": 2.320817106326696, "grad_norm": 1.17593514919281, "learning_rate": 1.3078769215469477e-05, "loss": 1.1193, "step": 19740 }, { "epoch": 2.321992798883092, "grad_norm": 1.18215012550354, "learning_rate": 1.3035591525834762e-05, "loss": 1.1781, "step": 19750 }, { "epoch": 2.3231684914394886, "grad_norm": 1.2043921947479248, "learning_rate": 1.2992474540527433e-05, "loss": 1.1862, "step": 19760 }, { "epoch": 2.324344183995885, "grad_norm": 1.1793807744979858, "learning_rate": 1.2949418330355829e-05, "loss": 1.1172, "step": 19770 }, { "epoch": 2.3255198765522818, "grad_norm": 1.34282648563385, "learning_rate": 1.2906422966028453e-05, "loss": 1.1631, "step": 19780 }, { "epoch": 2.326695569108678, "grad_norm": 1.51482093334198, "learning_rate": 1.2863488518153898e-05, "loss": 1.1662, "step": 19790 }, { "epoch": 2.3278712616650745, "grad_norm": 0.9960753321647644, "learning_rate": 1.2820615057240714e-05, "loss": 1.1182, "step": 19800 }, { "epoch": 2.329046954221471, "grad_norm": 1.5088419914245605, "learning_rate": 1.2777802653697291e-05, "loss": 1.149, "step": 19810 }, { "epoch": 2.3302226467778677, "grad_norm": 1.531351923942566, "learning_rate": 1.2735051377831752e-05, "loss": 1.1688, "step": 19820 }, { "epoch": 2.331398339334264, "grad_norm": 1.1877542734146118, "learning_rate": 1.2692361299851834e-05, "loss": 1.1675, "step": 19830 }, { "epoch": 2.3325740318906605, "grad_norm": 0.98676997423172, "learning_rate": 1.2649732489864774e-05, "loss": 1.1292, "step": 19840 }, { "epoch": 2.333749724447057, "grad_norm": 1.1608295440673828, "learning_rate": 1.2607165017877192e-05, "loss": 1.1671, "step": 19850 }, { "epoch": 2.3349254170034537, "grad_norm": 1.2410483360290527, "learning_rate": 1.2564658953794978e-05, "loss": 1.196, "step": 19860 }, { "epoch": 2.3361011095598503, "grad_norm": 1.3083305358886719, "learning_rate": 1.2522214367423157e-05, "loss": 1.1715, "step": 19870 }, { "epoch": 2.3372768021162464, "grad_norm": 1.2579611539840698, "learning_rate": 1.2479831328465813e-05, "loss": 1.1295, "step": 19880 }, { "epoch": 2.338452494672643, "grad_norm": 1.1195005178451538, "learning_rate": 1.2437509906525957e-05, "loss": 1.1043, "step": 19890 }, { "epoch": 2.3396281872290396, "grad_norm": 0.9687024354934692, "learning_rate": 1.23952501711054e-05, "loss": 1.1276, "step": 19900 }, { "epoch": 2.3408038797854362, "grad_norm": 1.2788686752319336, "learning_rate": 1.2353052191604652e-05, "loss": 1.21, "step": 19910 }, { "epoch": 2.341979572341833, "grad_norm": 1.267418622970581, "learning_rate": 1.2310916037322806e-05, "loss": 1.2225, "step": 19920 }, { "epoch": 2.343155264898229, "grad_norm": 1.511842131614685, "learning_rate": 1.226884177745744e-05, "loss": 1.1841, "step": 19930 }, { "epoch": 2.3443309574546256, "grad_norm": 1.5916218757629395, "learning_rate": 1.2226829481104435e-05, "loss": 1.1341, "step": 19940 }, { "epoch": 2.345506650011022, "grad_norm": 1.1926460266113281, "learning_rate": 1.2184879217257971e-05, "loss": 1.1597, "step": 19950 }, { "epoch": 2.346682342567419, "grad_norm": 1.2192529439926147, "learning_rate": 1.214299105481032e-05, "loss": 1.1417, "step": 19960 }, { "epoch": 2.347858035123815, "grad_norm": 1.2714020013809204, "learning_rate": 1.2101165062551795e-05, "loss": 1.1153, "step": 19970 }, { "epoch": 2.3490337276802116, "grad_norm": 1.286171555519104, "learning_rate": 1.2059401309170593e-05, "loss": 1.1559, "step": 19980 }, { "epoch": 2.350209420236608, "grad_norm": 1.5117440223693848, "learning_rate": 1.2017699863252696e-05, "loss": 1.2017, "step": 19990 }, { "epoch": 2.3513851127930048, "grad_norm": 1.351479411125183, "learning_rate": 1.1976060793281796e-05, "loss": 1.1546, "step": 20000 }, { "epoch": 2.3513851127930048, "eval_loss": 1.569747805595398, "eval_runtime": 1919.7486, "eval_samples_per_second": 31.507, "eval_steps_per_second": 3.939, "step": 20000 }, { "epoch": 2.352560805349401, "grad_norm": 1.219213843345642, "learning_rate": 1.1934484167639088e-05, "loss": 1.1744, "step": 20010 }, { "epoch": 2.3537364979057975, "grad_norm": 1.1660232543945312, "learning_rate": 1.189297005460327e-05, "loss": 1.0998, "step": 20020 }, { "epoch": 2.354912190462194, "grad_norm": 1.644038200378418, "learning_rate": 1.1851518522350362e-05, "loss": 1.155, "step": 20030 }, { "epoch": 2.3560878830185907, "grad_norm": 1.7003331184387207, "learning_rate": 1.1810129638953615e-05, "loss": 1.1625, "step": 20040 }, { "epoch": 2.3572635755749873, "grad_norm": 1.2617863416671753, "learning_rate": 1.17688034723834e-05, "loss": 1.1423, "step": 20050 }, { "epoch": 2.3584392681313835, "grad_norm": 1.1132947206497192, "learning_rate": 1.1727540090507078e-05, "loss": 1.0841, "step": 20060 }, { "epoch": 2.35961496068778, "grad_norm": 1.488434910774231, "learning_rate": 1.168633956108891e-05, "loss": 1.1546, "step": 20070 }, { "epoch": 2.3607906532441767, "grad_norm": 1.638696551322937, "learning_rate": 1.1645201951789941e-05, "loss": 1.1672, "step": 20080 }, { "epoch": 2.3619663458005733, "grad_norm": 1.0244743824005127, "learning_rate": 1.1604127330167885e-05, "loss": 1.1615, "step": 20090 }, { "epoch": 2.36314203835697, "grad_norm": 1.1126619577407837, "learning_rate": 1.1563115763677013e-05, "loss": 1.1612, "step": 20100 }, { "epoch": 2.364317730913366, "grad_norm": 1.2811470031738281, "learning_rate": 1.1522167319668048e-05, "loss": 1.0873, "step": 20110 }, { "epoch": 2.3654934234697627, "grad_norm": 1.3559036254882812, "learning_rate": 1.1481282065388066e-05, "loss": 1.1294, "step": 20120 }, { "epoch": 2.3666691160261593, "grad_norm": 1.3687350749969482, "learning_rate": 1.1440460067980314e-05, "loss": 1.1934, "step": 20130 }, { "epoch": 2.3678448085825554, "grad_norm": 1.2044767141342163, "learning_rate": 1.139970139448422e-05, "loss": 1.1784, "step": 20140 }, { "epoch": 2.369020501138952, "grad_norm": 1.0433107614517212, "learning_rate": 1.135900611183519e-05, "loss": 1.1797, "step": 20150 }, { "epoch": 2.3701961936953486, "grad_norm": 1.8864936828613281, "learning_rate": 1.131837428686453e-05, "loss": 1.2457, "step": 20160 }, { "epoch": 2.371371886251745, "grad_norm": 1.4011973142623901, "learning_rate": 1.1277805986299344e-05, "loss": 1.1714, "step": 20170 }, { "epoch": 2.372547578808142, "grad_norm": 1.0713647603988647, "learning_rate": 1.1237301276762396e-05, "loss": 1.2142, "step": 20180 }, { "epoch": 2.373723271364538, "grad_norm": 1.3823329210281372, "learning_rate": 1.1196860224772043e-05, "loss": 1.162, "step": 20190 }, { "epoch": 2.3748989639209346, "grad_norm": 1.421966791152954, "learning_rate": 1.1156482896742065e-05, "loss": 1.175, "step": 20200 }, { "epoch": 2.376074656477331, "grad_norm": 1.246587872505188, "learning_rate": 1.1116169358981615e-05, "loss": 1.1916, "step": 20210 }, { "epoch": 2.377250349033728, "grad_norm": 1.0320056676864624, "learning_rate": 1.1075919677695095e-05, "loss": 1.205, "step": 20220 }, { "epoch": 2.3784260415901244, "grad_norm": 1.346003770828247, "learning_rate": 1.1035733918982027e-05, "loss": 1.2213, "step": 20230 }, { "epoch": 2.3796017341465205, "grad_norm": 1.1149684190750122, "learning_rate": 1.0995612148836965e-05, "loss": 1.1465, "step": 20240 }, { "epoch": 2.380777426702917, "grad_norm": 1.3097283840179443, "learning_rate": 1.0955554433149373e-05, "loss": 1.1526, "step": 20250 }, { "epoch": 2.3819531192593137, "grad_norm": 1.0739415884017944, "learning_rate": 1.0915560837703503e-05, "loss": 1.1628, "step": 20260 }, { "epoch": 2.3831288118157103, "grad_norm": 1.227655053138733, "learning_rate": 1.0875631428178346e-05, "loss": 1.1732, "step": 20270 }, { "epoch": 2.3843045043721065, "grad_norm": 1.0961395502090454, "learning_rate": 1.0835766270147451e-05, "loss": 1.1827, "step": 20280 }, { "epoch": 2.385480196928503, "grad_norm": 1.16946542263031, "learning_rate": 1.079596542907888e-05, "loss": 1.1318, "step": 20290 }, { "epoch": 2.3866558894848997, "grad_norm": 1.3102200031280518, "learning_rate": 1.075622897033504e-05, "loss": 1.2168, "step": 20300 }, { "epoch": 2.3878315820412963, "grad_norm": 1.1627963781356812, "learning_rate": 1.0716556959172635e-05, "loss": 1.219, "step": 20310 }, { "epoch": 2.3890072745976925, "grad_norm": 1.0842041969299316, "learning_rate": 1.0676949460742513e-05, "loss": 1.1093, "step": 20320 }, { "epoch": 2.390182967154089, "grad_norm": 1.183977484703064, "learning_rate": 1.0637406540089578e-05, "loss": 1.1526, "step": 20330 }, { "epoch": 2.3913586597104857, "grad_norm": 1.2887957096099854, "learning_rate": 1.0597928262152695e-05, "loss": 1.1347, "step": 20340 }, { "epoch": 2.3925343522668823, "grad_norm": 1.0565372705459595, "learning_rate": 1.0558514691764555e-05, "loss": 1.2048, "step": 20350 }, { "epoch": 2.393710044823279, "grad_norm": 1.1657594442367554, "learning_rate": 1.0519165893651594e-05, "loss": 1.0957, "step": 20360 }, { "epoch": 2.394885737379675, "grad_norm": 1.1411945819854736, "learning_rate": 1.0479881932433883e-05, "loss": 1.1527, "step": 20370 }, { "epoch": 2.3960614299360716, "grad_norm": 1.1936817169189453, "learning_rate": 1.044066287262498e-05, "loss": 1.183, "step": 20380 }, { "epoch": 2.3972371224924682, "grad_norm": 1.112101435661316, "learning_rate": 1.0401508778631896e-05, "loss": 1.1616, "step": 20390 }, { "epoch": 2.398412815048865, "grad_norm": 1.0031119585037231, "learning_rate": 1.0362419714754945e-05, "loss": 1.0922, "step": 20400 }, { "epoch": 2.3995885076052614, "grad_norm": 0.9744369983673096, "learning_rate": 1.0323395745187637e-05, "loss": 1.0597, "step": 20410 }, { "epoch": 2.4007642001616576, "grad_norm": 1.1253435611724854, "learning_rate": 1.0284436934016595e-05, "loss": 1.163, "step": 20420 }, { "epoch": 2.401939892718054, "grad_norm": 1.014455795288086, "learning_rate": 1.0245543345221425e-05, "loss": 1.1535, "step": 20430 }, { "epoch": 2.403115585274451, "grad_norm": 1.131753921508789, "learning_rate": 1.0206715042674642e-05, "loss": 1.1705, "step": 20440 }, { "epoch": 2.4042912778308474, "grad_norm": 1.303338885307312, "learning_rate": 1.0167952090141497e-05, "loss": 1.1518, "step": 20450 }, { "epoch": 2.4054669703872436, "grad_norm": 1.0720384120941162, "learning_rate": 1.0129254551279971e-05, "loss": 1.0921, "step": 20460 }, { "epoch": 2.40664266294364, "grad_norm": 1.203493595123291, "learning_rate": 1.00906224896406e-05, "loss": 1.1712, "step": 20470 }, { "epoch": 2.4078183555000368, "grad_norm": 1.1288865804672241, "learning_rate": 1.0052055968666396e-05, "loss": 1.1555, "step": 20480 }, { "epoch": 2.4089940480564334, "grad_norm": 1.0200477838516235, "learning_rate": 1.0013555051692736e-05, "loss": 1.1388, "step": 20490 }, { "epoch": 2.4101697406128295, "grad_norm": 1.1588919162750244, "learning_rate": 9.975119801947253e-06, "loss": 1.1804, "step": 20500 }, { "epoch": 2.4101697406128295, "eval_loss": 1.5665874481201172, "eval_runtime": 1919.8521, "eval_samples_per_second": 31.505, "eval_steps_per_second": 3.938, "step": 20500 }, { "epoch": 2.411345433169226, "grad_norm": 1.0977870225906372, "learning_rate": 9.936750282549767e-06, "loss": 1.1046, "step": 20510 }, { "epoch": 2.4125211257256227, "grad_norm": 1.2836697101593018, "learning_rate": 9.898446556512097e-06, "loss": 1.1422, "step": 20520 }, { "epoch": 2.4136968182820193, "grad_norm": 1.4498416185379028, "learning_rate": 9.860208686738065e-06, "loss": 1.1765, "step": 20530 }, { "epoch": 2.414872510838416, "grad_norm": 1.0913317203521729, "learning_rate": 9.822036736023322e-06, "loss": 1.1785, "step": 20540 }, { "epoch": 2.416048203394812, "grad_norm": 1.1909239292144775, "learning_rate": 9.783930767055266e-06, "loss": 1.1534, "step": 20550 }, { "epoch": 2.4172238959512087, "grad_norm": 1.517229437828064, "learning_rate": 9.745890842412942e-06, "loss": 1.1641, "step": 20560 }, { "epoch": 2.4183995885076053, "grad_norm": 1.369382619857788, "learning_rate": 9.70791702456692e-06, "loss": 1.2041, "step": 20570 }, { "epoch": 2.419575281064002, "grad_norm": 1.2977288961410522, "learning_rate": 9.670009375879224e-06, "loss": 1.2415, "step": 20580 }, { "epoch": 2.4207509736203985, "grad_norm": 1.0789836645126343, "learning_rate": 9.632167958603211e-06, "loss": 1.1095, "step": 20590 }, { "epoch": 2.4219266661767946, "grad_norm": 1.2617942094802856, "learning_rate": 9.594392834883453e-06, "loss": 1.15, "step": 20600 }, { "epoch": 2.4231023587331912, "grad_norm": 1.1650317907333374, "learning_rate": 9.556684066755672e-06, "loss": 1.2032, "step": 20610 }, { "epoch": 2.424278051289588, "grad_norm": 1.672120451927185, "learning_rate": 9.519041716146621e-06, "loss": 1.1804, "step": 20620 }, { "epoch": 2.4254537438459844, "grad_norm": 1.1764438152313232, "learning_rate": 9.481465844873943e-06, "loss": 1.1614, "step": 20630 }, { "epoch": 2.4266294364023806, "grad_norm": 1.3771190643310547, "learning_rate": 9.443956514646146e-06, "loss": 1.1594, "step": 20640 }, { "epoch": 2.427805128958777, "grad_norm": 0.9567354917526245, "learning_rate": 9.406513787062448e-06, "loss": 1.1716, "step": 20650 }, { "epoch": 2.428980821515174, "grad_norm": 0.9942293763160706, "learning_rate": 9.369137723612692e-06, "loss": 1.1336, "step": 20660 }, { "epoch": 2.4301565140715704, "grad_norm": 1.1756943464279175, "learning_rate": 9.331828385677238e-06, "loss": 1.191, "step": 20670 }, { "epoch": 2.4313322066279666, "grad_norm": 1.1289403438568115, "learning_rate": 9.294585834526865e-06, "loss": 1.1824, "step": 20680 }, { "epoch": 2.432507899184363, "grad_norm": 1.1648211479187012, "learning_rate": 9.257410131322702e-06, "loss": 1.1701, "step": 20690 }, { "epoch": 2.4336835917407598, "grad_norm": 1.0929818153381348, "learning_rate": 9.220301337116033e-06, "loss": 1.1561, "step": 20700 }, { "epoch": 2.4348592842971564, "grad_norm": 1.2472970485687256, "learning_rate": 9.183259512848324e-06, "loss": 1.1549, "step": 20710 }, { "epoch": 2.436034976853553, "grad_norm": 1.1672176122665405, "learning_rate": 9.146284719351033e-06, "loss": 1.1229, "step": 20720 }, { "epoch": 2.437210669409949, "grad_norm": 1.171133041381836, "learning_rate": 9.109377017345544e-06, "loss": 1.1351, "step": 20730 }, { "epoch": 2.4383863619663457, "grad_norm": 1.312781810760498, "learning_rate": 9.072536467443055e-06, "loss": 1.1445, "step": 20740 }, { "epoch": 2.4395620545227423, "grad_norm": 1.4054301977157593, "learning_rate": 9.035763130144493e-06, "loss": 1.1431, "step": 20750 }, { "epoch": 2.440737747079139, "grad_norm": 1.1739004850387573, "learning_rate": 8.999057065840405e-06, "loss": 1.2079, "step": 20760 }, { "epoch": 2.4419134396355355, "grad_norm": 1.9097973108291626, "learning_rate": 8.962418334810847e-06, "loss": 1.1393, "step": 20770 }, { "epoch": 2.4430891321919317, "grad_norm": 1.0853099822998047, "learning_rate": 8.925846997225302e-06, "loss": 1.174, "step": 20780 }, { "epoch": 2.4442648247483283, "grad_norm": 1.0487487316131592, "learning_rate": 8.889343113142596e-06, "loss": 1.1449, "step": 20790 }, { "epoch": 2.445440517304725, "grad_norm": 1.4475690126419067, "learning_rate": 8.852906742510758e-06, "loss": 1.148, "step": 20800 }, { "epoch": 2.4466162098611215, "grad_norm": 1.759809136390686, "learning_rate": 8.816537945166953e-06, "loss": 1.1666, "step": 20810 }, { "epoch": 2.4477919024175177, "grad_norm": 1.6363978385925293, "learning_rate": 8.78023678083737e-06, "loss": 1.2248, "step": 20820 }, { "epoch": 2.4489675949739143, "grad_norm": 1.2229710817337036, "learning_rate": 8.744003309137144e-06, "loss": 1.1464, "step": 20830 }, { "epoch": 2.450143287530311, "grad_norm": 1.1274008750915527, "learning_rate": 8.707837589570216e-06, "loss": 1.1404, "step": 20840 }, { "epoch": 2.4513189800867075, "grad_norm": 1.047499179840088, "learning_rate": 8.67173968152929e-06, "loss": 1.2246, "step": 20850 }, { "epoch": 2.4524946726431036, "grad_norm": 1.0809823274612427, "learning_rate": 8.635709644295687e-06, "loss": 1.1989, "step": 20860 }, { "epoch": 2.4536703651995, "grad_norm": 1.2451667785644531, "learning_rate": 8.599747537039277e-06, "loss": 1.1784, "step": 20870 }, { "epoch": 2.454846057755897, "grad_norm": 1.2494577169418335, "learning_rate": 8.56385341881838e-06, "loss": 1.14, "step": 20880 }, { "epoch": 2.4560217503122934, "grad_norm": 1.3148537874221802, "learning_rate": 8.52802734857963e-06, "loss": 1.134, "step": 20890 }, { "epoch": 2.45719744286869, "grad_norm": 1.3112386465072632, "learning_rate": 8.492269385157953e-06, "loss": 1.2371, "step": 20900 }, { "epoch": 2.458373135425086, "grad_norm": 1.2255370616912842, "learning_rate": 8.456579587276398e-06, "loss": 1.1203, "step": 20910 }, { "epoch": 2.4595488279814828, "grad_norm": 1.0833566188812256, "learning_rate": 8.420958013546076e-06, "loss": 1.1745, "step": 20920 }, { "epoch": 2.4607245205378794, "grad_norm": 1.1466819047927856, "learning_rate": 8.385404722466073e-06, "loss": 1.1934, "step": 20930 }, { "epoch": 2.461900213094276, "grad_norm": 1.3600071668624878, "learning_rate": 8.34991977242332e-06, "loss": 1.1712, "step": 20940 }, { "epoch": 2.4630759056506726, "grad_norm": 1.3750544786453247, "learning_rate": 8.314503221692537e-06, "loss": 1.2277, "step": 20950 }, { "epoch": 2.4642515982070687, "grad_norm": 1.1938422918319702, "learning_rate": 8.279155128436077e-06, "loss": 1.1863, "step": 20960 }, { "epoch": 2.4654272907634653, "grad_norm": 1.0401078462600708, "learning_rate": 8.243875550703911e-06, "loss": 1.12, "step": 20970 }, { "epoch": 2.466602983319862, "grad_norm": 1.3430391550064087, "learning_rate": 8.208664546433465e-06, "loss": 1.1725, "step": 20980 }, { "epoch": 2.4677786758762585, "grad_norm": 1.2467933893203735, "learning_rate": 8.173522173449567e-06, "loss": 1.1804, "step": 20990 }, { "epoch": 2.4689543684326547, "grad_norm": 1.146432638168335, "learning_rate": 8.138448489464324e-06, "loss": 1.1517, "step": 21000 }, { "epoch": 2.4689543684326547, "eval_loss": 1.564630150794983, "eval_runtime": 1920.2145, "eval_samples_per_second": 31.499, "eval_steps_per_second": 3.938, "step": 21000 }, { "epoch": 2.4701300609890513, "grad_norm": 1.208878517150879, "learning_rate": 8.103443552077056e-06, "loss": 1.1608, "step": 21010 }, { "epoch": 2.471305753545448, "grad_norm": 1.161390781402588, "learning_rate": 8.068507418774173e-06, "loss": 1.1685, "step": 21020 }, { "epoch": 2.4724814461018445, "grad_norm": 1.169883370399475, "learning_rate": 8.033640146929083e-06, "loss": 1.1515, "step": 21030 }, { "epoch": 2.4736571386582407, "grad_norm": 1.1499937772750854, "learning_rate": 7.99884179380212e-06, "loss": 1.1699, "step": 21040 }, { "epoch": 2.4748328312146373, "grad_norm": 1.2490509748458862, "learning_rate": 7.964112416540437e-06, "loss": 1.1675, "step": 21050 }, { "epoch": 2.476008523771034, "grad_norm": 0.9924104809761047, "learning_rate": 7.929452072177911e-06, "loss": 1.1782, "step": 21060 }, { "epoch": 2.4771842163274305, "grad_norm": 1.0224708318710327, "learning_rate": 7.894860817635053e-06, "loss": 1.1493, "step": 21070 }, { "epoch": 2.478359908883827, "grad_norm": 1.1614015102386475, "learning_rate": 7.8603387097189e-06, "loss": 1.2214, "step": 21080 }, { "epoch": 2.4795356014402232, "grad_norm": 1.1573618650436401, "learning_rate": 7.825885805122946e-06, "loss": 1.12, "step": 21090 }, { "epoch": 2.48071129399662, "grad_norm": 1.1875364780426025, "learning_rate": 7.79150216042704e-06, "loss": 1.1563, "step": 21100 }, { "epoch": 2.4818869865530164, "grad_norm": 1.3660136461257935, "learning_rate": 7.757187832097273e-06, "loss": 1.1003, "step": 21110 }, { "epoch": 2.483062679109413, "grad_norm": 1.3918377161026, "learning_rate": 7.722942876485923e-06, "loss": 1.1446, "step": 21120 }, { "epoch": 2.4842383716658096, "grad_norm": 1.4332300424575806, "learning_rate": 7.688767349831327e-06, "loss": 1.1195, "step": 21130 }, { "epoch": 2.485414064222206, "grad_norm": 1.2892094850540161, "learning_rate": 7.654661308257798e-06, "loss": 1.1387, "step": 21140 }, { "epoch": 2.4865897567786024, "grad_norm": 1.3496962785720825, "learning_rate": 7.620624807775556e-06, "loss": 1.206, "step": 21150 }, { "epoch": 2.487765449334999, "grad_norm": 1.4049124717712402, "learning_rate": 7.586657904280603e-06, "loss": 1.1047, "step": 21160 }, { "epoch": 2.4889411418913956, "grad_norm": 1.3614282608032227, "learning_rate": 7.552760653554658e-06, "loss": 1.0974, "step": 21170 }, { "epoch": 2.4901168344477917, "grad_norm": 1.376314401626587, "learning_rate": 7.518933111265042e-06, "loss": 1.1834, "step": 21180 }, { "epoch": 2.4912925270041884, "grad_norm": 1.3075319528579712, "learning_rate": 7.485175332964606e-06, "loss": 1.2256, "step": 21190 }, { "epoch": 2.492468219560585, "grad_norm": 1.4354541301727295, "learning_rate": 7.451487374091648e-06, "loss": 1.1907, "step": 21200 }, { "epoch": 2.4936439121169816, "grad_norm": 1.6145875453948975, "learning_rate": 7.417869289969753e-06, "loss": 1.0859, "step": 21210 }, { "epoch": 2.4948196046733777, "grad_norm": 1.1258864402770996, "learning_rate": 7.384321135807814e-06, "loss": 1.1513, "step": 21220 }, { "epoch": 2.4959952972297743, "grad_norm": 1.0480154752731323, "learning_rate": 7.35084296669985e-06, "loss": 1.1802, "step": 21230 }, { "epoch": 2.497170989786171, "grad_norm": 1.4380848407745361, "learning_rate": 7.317434837624959e-06, "loss": 1.1646, "step": 21240 }, { "epoch": 2.4983466823425675, "grad_norm": 1.2404568195343018, "learning_rate": 7.28409680344721e-06, "loss": 1.1714, "step": 21250 }, { "epoch": 2.499522374898964, "grad_norm": 1.1649876832962036, "learning_rate": 7.250828918915575e-06, "loss": 1.1805, "step": 21260 }, { "epoch": 2.5006980674553603, "grad_norm": 1.3094823360443115, "learning_rate": 7.217631238663813e-06, "loss": 1.1677, "step": 21270 }, { "epoch": 2.501873760011757, "grad_norm": 1.2323296070098877, "learning_rate": 7.184503817210369e-06, "loss": 1.1478, "step": 21280 }, { "epoch": 2.5030494525681535, "grad_norm": 1.3259717226028442, "learning_rate": 7.151446708958337e-06, "loss": 1.1982, "step": 21290 }, { "epoch": 2.50422514512455, "grad_norm": 1.1401108503341675, "learning_rate": 7.11845996819534e-06, "loss": 1.1409, "step": 21300 }, { "epoch": 2.5054008376809467, "grad_norm": 1.1258550882339478, "learning_rate": 7.085543649093423e-06, "loss": 1.1521, "step": 21310 }, { "epoch": 2.506576530237343, "grad_norm": 1.249906063079834, "learning_rate": 7.052697805708991e-06, "loss": 1.1581, "step": 21320 }, { "epoch": 2.5077522227937394, "grad_norm": 1.1476536989212036, "learning_rate": 7.0199224919827165e-06, "loss": 1.1999, "step": 21330 }, { "epoch": 2.508927915350136, "grad_norm": 0.9508269429206848, "learning_rate": 6.9872177617394286e-06, "loss": 1.1699, "step": 21340 }, { "epoch": 2.510103607906532, "grad_norm": 1.178539514541626, "learning_rate": 6.954583668688064e-06, "loss": 1.1144, "step": 21350 }, { "epoch": 2.511279300462929, "grad_norm": 1.3245329856872559, "learning_rate": 6.922020266421542e-06, "loss": 1.1331, "step": 21360 }, { "epoch": 2.5124549930193254, "grad_norm": 1.3100366592407227, "learning_rate": 6.889527608416685e-06, "loss": 1.1266, "step": 21370 }, { "epoch": 2.513630685575722, "grad_norm": 1.3140509128570557, "learning_rate": 6.857105748034159e-06, "loss": 1.1511, "step": 21380 }, { "epoch": 2.5148063781321186, "grad_norm": 1.1794075965881348, "learning_rate": 6.824754738518352e-06, "loss": 1.1987, "step": 21390 }, { "epoch": 2.5159820706885148, "grad_norm": 1.3353517055511475, "learning_rate": 6.7924746329972755e-06, "loss": 1.2194, "step": 21400 }, { "epoch": 2.5171577632449114, "grad_norm": 1.173120379447937, "learning_rate": 6.760265484482531e-06, "loss": 1.1139, "step": 21410 }, { "epoch": 2.518333455801308, "grad_norm": 1.1765965223312378, "learning_rate": 6.728127345869184e-06, "loss": 1.1189, "step": 21420 }, { "epoch": 2.5195091483577046, "grad_norm": 1.2401111125946045, "learning_rate": 6.696060269935677e-06, "loss": 1.1761, "step": 21430 }, { "epoch": 2.520684840914101, "grad_norm": 1.2000371217727661, "learning_rate": 6.664064309343759e-06, "loss": 1.2378, "step": 21440 }, { "epoch": 2.5218605334704973, "grad_norm": 1.120064377784729, "learning_rate": 6.632139516638386e-06, "loss": 1.1597, "step": 21450 }, { "epoch": 2.523036226026894, "grad_norm": 1.203062653541565, "learning_rate": 6.600285944247658e-06, "loss": 1.182, "step": 21460 }, { "epoch": 2.5242119185832905, "grad_norm": 1.0040241479873657, "learning_rate": 6.568503644482666e-06, "loss": 1.1594, "step": 21470 }, { "epoch": 2.525387611139687, "grad_norm": 1.040872573852539, "learning_rate": 6.5367926695375e-06, "loss": 1.1932, "step": 21480 }, { "epoch": 2.5265633036960837, "grad_norm": 1.3334248065948486, "learning_rate": 6.505153071489101e-06, "loss": 1.1041, "step": 21490 }, { "epoch": 2.52773899625248, "grad_norm": 1.162282109260559, "learning_rate": 6.4735849022972025e-06, "loss": 1.1941, "step": 21500 }, { "epoch": 2.52773899625248, "eval_loss": 1.56331205368042, "eval_runtime": 1918.2178, "eval_samples_per_second": 31.532, "eval_steps_per_second": 3.942, "step": 21500 }, { "epoch": 2.5289146888088765, "grad_norm": 1.0784111022949219, "learning_rate": 6.442088213804215e-06, "loss": 1.1552, "step": 21510 }, { "epoch": 2.530090381365273, "grad_norm": 1.2352368831634521, "learning_rate": 6.4106630577351835e-06, "loss": 1.1117, "step": 21520 }, { "epoch": 2.5312660739216692, "grad_norm": 1.2179832458496094, "learning_rate": 6.379309485697676e-06, "loss": 1.1823, "step": 21530 }, { "epoch": 2.532441766478066, "grad_norm": 1.6056621074676514, "learning_rate": 6.348027549181673e-06, "loss": 1.1745, "step": 21540 }, { "epoch": 2.5336174590344624, "grad_norm": 1.1853609085083008, "learning_rate": 6.316817299559546e-06, "loss": 1.1541, "step": 21550 }, { "epoch": 2.534793151590859, "grad_norm": 0.9505048394203186, "learning_rate": 6.28567878808593e-06, "loss": 1.183, "step": 21560 }, { "epoch": 2.5359688441472557, "grad_norm": 1.6677607297897339, "learning_rate": 6.254612065897642e-06, "loss": 1.1569, "step": 21570 }, { "epoch": 2.537144536703652, "grad_norm": 1.183386206626892, "learning_rate": 6.223617184013619e-06, "loss": 1.1642, "step": 21580 }, { "epoch": 2.5383202292600484, "grad_norm": 1.3047291040420532, "learning_rate": 6.192694193334797e-06, "loss": 1.2273, "step": 21590 }, { "epoch": 2.539495921816445, "grad_norm": 1.0434406995773315, "learning_rate": 6.161843144644075e-06, "loss": 1.1428, "step": 21600 }, { "epoch": 2.5406716143728416, "grad_norm": 1.1240142583847046, "learning_rate": 6.1310640886061865e-06, "loss": 1.1198, "step": 21610 }, { "epoch": 2.541847306929238, "grad_norm": 1.2498174905776978, "learning_rate": 6.1003570757676485e-06, "loss": 1.1742, "step": 21620 }, { "epoch": 2.5430229994856344, "grad_norm": 1.4519636631011963, "learning_rate": 6.0697221565566506e-06, "loss": 1.1514, "step": 21630 }, { "epoch": 2.544198692042031, "grad_norm": 1.1900054216384888, "learning_rate": 6.039159381283016e-06, "loss": 1.188, "step": 21640 }, { "epoch": 2.5453743845984276, "grad_norm": 1.1230286359786987, "learning_rate": 6.008668800138045e-06, "loss": 1.1569, "step": 21650 }, { "epoch": 2.546550077154824, "grad_norm": 1.0666030645370483, "learning_rate": 5.978250463194523e-06, "loss": 1.1171, "step": 21660 }, { "epoch": 2.5477257697112208, "grad_norm": 1.1910102367401123, "learning_rate": 5.94790442040657e-06, "loss": 1.1855, "step": 21670 }, { "epoch": 2.548901462267617, "grad_norm": 1.255710482597351, "learning_rate": 5.917630721609585e-06, "loss": 1.1999, "step": 21680 }, { "epoch": 2.5500771548240135, "grad_norm": 1.273930549621582, "learning_rate": 5.887429416520169e-06, "loss": 1.2043, "step": 21690 }, { "epoch": 2.55125284738041, "grad_norm": 1.2258172035217285, "learning_rate": 5.857300554736022e-06, "loss": 1.1818, "step": 21700 }, { "epoch": 2.5524285399368063, "grad_norm": 1.36130690574646, "learning_rate": 5.827244185735903e-06, "loss": 1.1823, "step": 21710 }, { "epoch": 2.553604232493203, "grad_norm": 1.0412468910217285, "learning_rate": 5.797260358879469e-06, "loss": 1.1438, "step": 21720 }, { "epoch": 2.5547799250495995, "grad_norm": 1.711956262588501, "learning_rate": 5.767349123407289e-06, "loss": 1.1244, "step": 21730 }, { "epoch": 2.555955617605996, "grad_norm": 1.3877557516098022, "learning_rate": 5.737510528440709e-06, "loss": 1.138, "step": 21740 }, { "epoch": 2.5571313101623927, "grad_norm": 1.3246116638183594, "learning_rate": 5.707744622981781e-06, "loss": 1.1432, "step": 21750 }, { "epoch": 2.558307002718789, "grad_norm": 1.071755051612854, "learning_rate": 5.67805145591318e-06, "loss": 1.2171, "step": 21760 }, { "epoch": 2.5594826952751855, "grad_norm": 1.3515435457229614, "learning_rate": 5.648431075998134e-06, "loss": 1.1624, "step": 21770 }, { "epoch": 2.560658387831582, "grad_norm": 1.3174890279769897, "learning_rate": 5.618883531880337e-06, "loss": 1.1863, "step": 21780 }, { "epoch": 2.5618340803879787, "grad_norm": 1.2698324918746948, "learning_rate": 5.5894088720838565e-06, "loss": 1.1768, "step": 21790 }, { "epoch": 2.5630097729443753, "grad_norm": 1.005827784538269, "learning_rate": 5.560007145013074e-06, "loss": 1.0948, "step": 21800 }, { "epoch": 2.5641854655007714, "grad_norm": 1.121984839439392, "learning_rate": 5.53067839895261e-06, "loss": 1.1419, "step": 21810 }, { "epoch": 2.565361158057168, "grad_norm": 1.2479227781295776, "learning_rate": 5.501422682067214e-06, "loss": 1.1468, "step": 21820 }, { "epoch": 2.5665368506135646, "grad_norm": 0.9800752401351929, "learning_rate": 5.4722400424017165e-06, "loss": 1.1209, "step": 21830 }, { "epoch": 2.567712543169961, "grad_norm": 1.4367278814315796, "learning_rate": 5.443130527880935e-06, "loss": 1.1733, "step": 21840 }, { "epoch": 2.568888235726358, "grad_norm": 1.3022068738937378, "learning_rate": 5.414094186309604e-06, "loss": 1.1571, "step": 21850 }, { "epoch": 2.570063928282754, "grad_norm": 1.0060622692108154, "learning_rate": 5.385131065372273e-06, "loss": 1.1882, "step": 21860 }, { "epoch": 2.5712396208391506, "grad_norm": 1.1531273126602173, "learning_rate": 5.356241212633267e-06, "loss": 1.1227, "step": 21870 }, { "epoch": 2.572415313395547, "grad_norm": 1.0826479196548462, "learning_rate": 5.327424675536569e-06, "loss": 1.1976, "step": 21880 }, { "epoch": 2.5735910059519433, "grad_norm": 1.1506080627441406, "learning_rate": 5.298681501405783e-06, "loss": 1.1952, "step": 21890 }, { "epoch": 2.57476669850834, "grad_norm": 1.140075445175171, "learning_rate": 5.270011737444003e-06, "loss": 1.1932, "step": 21900 }, { "epoch": 2.5759423910647365, "grad_norm": 1.0892587900161743, "learning_rate": 5.241415430733787e-06, "loss": 1.1771, "step": 21910 }, { "epoch": 2.577118083621133, "grad_norm": 1.2352505922317505, "learning_rate": 5.212892628237054e-06, "loss": 1.1362, "step": 21920 }, { "epoch": 2.5782937761775298, "grad_norm": 1.1315137147903442, "learning_rate": 5.184443376795012e-06, "loss": 1.1519, "step": 21930 }, { "epoch": 2.579469468733926, "grad_norm": 1.2026236057281494, "learning_rate": 5.156067723128089e-06, "loss": 1.171, "step": 21940 }, { "epoch": 2.5806451612903225, "grad_norm": 1.32729971408844, "learning_rate": 5.127765713835825e-06, "loss": 1.1542, "step": 21950 }, { "epoch": 2.581820853846719, "grad_norm": 1.1550610065460205, "learning_rate": 5.099537395396858e-06, "loss": 1.1721, "step": 21960 }, { "epoch": 2.5829965464031157, "grad_norm": 1.4563357830047607, "learning_rate": 5.071382814168752e-06, "loss": 1.2166, "step": 21970 }, { "epoch": 2.5841722389595123, "grad_norm": 1.5898511409759521, "learning_rate": 5.043302016388024e-06, "loss": 1.154, "step": 21980 }, { "epoch": 2.5853479315159085, "grad_norm": 1.2787408828735352, "learning_rate": 5.0152950481700045e-06, "loss": 1.1222, "step": 21990 }, { "epoch": 2.586523624072305, "grad_norm": 1.3789772987365723, "learning_rate": 4.987361955508774e-06, "loss": 1.1836, "step": 22000 }, { "epoch": 2.586523624072305, "eval_loss": 1.5610768795013428, "eval_runtime": 1918.1566, "eval_samples_per_second": 31.533, "eval_steps_per_second": 3.942, "step": 22000 }, { "epoch": 2.5876993166287017, "grad_norm": 1.1998192071914673, "learning_rate": 4.959502784277104e-06, "loss": 1.155, "step": 22010 }, { "epoch": 2.588875009185098, "grad_norm": 1.194258213043213, "learning_rate": 4.931717580226364e-06, "loss": 1.187, "step": 22020 }, { "epoch": 2.590050701741495, "grad_norm": 1.6292840242385864, "learning_rate": 4.9040063889864485e-06, "loss": 1.1573, "step": 22030 }, { "epoch": 2.591226394297891, "grad_norm": 1.1854287385940552, "learning_rate": 4.876369256065699e-06, "loss": 1.2101, "step": 22040 }, { "epoch": 2.5924020868542876, "grad_norm": 1.4082568883895874, "learning_rate": 4.848806226850849e-06, "loss": 1.1499, "step": 22050 }, { "epoch": 2.5935777794106842, "grad_norm": 1.3633288145065308, "learning_rate": 4.8213173466069295e-06, "loss": 1.1123, "step": 22060 }, { "epoch": 2.5947534719670804, "grad_norm": 1.2654571533203125, "learning_rate": 4.793902660477206e-06, "loss": 1.1446, "step": 22070 }, { "epoch": 2.595929164523477, "grad_norm": 1.3165539503097534, "learning_rate": 4.7665622134830945e-06, "loss": 1.2052, "step": 22080 }, { "epoch": 2.5971048570798736, "grad_norm": 1.276818037033081, "learning_rate": 4.739296050524094e-06, "loss": 1.1766, "step": 22090 }, { "epoch": 2.59828054963627, "grad_norm": 1.09111487865448, "learning_rate": 4.712104216377716e-06, "loss": 1.1939, "step": 22100 }, { "epoch": 2.599456242192667, "grad_norm": 1.0535634756088257, "learning_rate": 4.6849867556994e-06, "loss": 1.2084, "step": 22110 }, { "epoch": 2.600631934749063, "grad_norm": 1.3824292421340942, "learning_rate": 4.657943713022444e-06, "loss": 1.1234, "step": 22120 }, { "epoch": 2.6018076273054596, "grad_norm": 1.4353539943695068, "learning_rate": 4.630975132757947e-06, "loss": 1.2131, "step": 22130 }, { "epoch": 2.602983319861856, "grad_norm": 1.1485735177993774, "learning_rate": 4.604081059194715e-06, "loss": 1.1336, "step": 22140 }, { "epoch": 2.6041590124182528, "grad_norm": 1.291395664215088, "learning_rate": 4.577261536499205e-06, "loss": 1.1746, "step": 22150 }, { "epoch": 2.6053347049746494, "grad_norm": 1.1014269590377808, "learning_rate": 4.550516608715411e-06, "loss": 1.1299, "step": 22160 }, { "epoch": 2.6065103975310455, "grad_norm": 1.2593562602996826, "learning_rate": 4.523846319764863e-06, "loss": 1.1478, "step": 22170 }, { "epoch": 2.607686090087442, "grad_norm": 1.525715708732605, "learning_rate": 4.4972507134465045e-06, "loss": 1.1373, "step": 22180 }, { "epoch": 2.6088617826438387, "grad_norm": 1.214255690574646, "learning_rate": 4.470729833436626e-06, "loss": 1.1412, "step": 22190 }, { "epoch": 2.610037475200235, "grad_norm": 1.2298465967178345, "learning_rate": 4.4442837232888055e-06, "loss": 1.114, "step": 22200 }, { "epoch": 2.6112131677566315, "grad_norm": 1.1266010999679565, "learning_rate": 4.417912426433829e-06, "loss": 1.1513, "step": 22210 }, { "epoch": 2.612388860313028, "grad_norm": 1.1691452264785767, "learning_rate": 4.391615986179626e-06, "loss": 1.1942, "step": 22220 }, { "epoch": 2.6135645528694247, "grad_norm": 1.2871342897415161, "learning_rate": 4.36539444571118e-06, "loss": 1.2163, "step": 22230 }, { "epoch": 2.6147402454258213, "grad_norm": 1.249171257019043, "learning_rate": 4.339247848090488e-06, "loss": 1.1705, "step": 22240 }, { "epoch": 2.6159159379822174, "grad_norm": 0.9809420108795166, "learning_rate": 4.313176236256461e-06, "loss": 1.1382, "step": 22250 }, { "epoch": 2.617091630538614, "grad_norm": 1.132043719291687, "learning_rate": 4.287179653024876e-06, "loss": 1.1886, "step": 22260 }, { "epoch": 2.6182673230950106, "grad_norm": 1.0613380670547485, "learning_rate": 4.261258141088281e-06, "loss": 1.1786, "step": 22270 }, { "epoch": 2.6194430156514072, "grad_norm": 0.9947808384895325, "learning_rate": 4.235411743015955e-06, "loss": 1.1452, "step": 22280 }, { "epoch": 2.620618708207804, "grad_norm": 1.1724342107772827, "learning_rate": 4.209640501253825e-06, "loss": 1.1713, "step": 22290 }, { "epoch": 2.6217944007642, "grad_norm": 0.8840771913528442, "learning_rate": 4.183944458124361e-06, "loss": 1.1433, "step": 22300 }, { "epoch": 2.6229700933205966, "grad_norm": 1.2750728130340576, "learning_rate": 4.158323655826568e-06, "loss": 1.1948, "step": 22310 }, { "epoch": 2.624145785876993, "grad_norm": 1.1792434453964233, "learning_rate": 4.132778136435889e-06, "loss": 1.1235, "step": 22320 }, { "epoch": 2.62532147843339, "grad_norm": 1.2135009765625, "learning_rate": 4.107307941904121e-06, "loss": 1.1418, "step": 22330 }, { "epoch": 2.6264971709897864, "grad_norm": 1.103952407836914, "learning_rate": 4.081913114059372e-06, "loss": 1.1646, "step": 22340 }, { "epoch": 2.6276728635461826, "grad_norm": 1.322338342666626, "learning_rate": 4.0565936946059644e-06, "loss": 1.154, "step": 22350 }, { "epoch": 2.628848556102579, "grad_norm": 1.3219256401062012, "learning_rate": 4.031349725124395e-06, "loss": 1.1831, "step": 22360 }, { "epoch": 2.6300242486589758, "grad_norm": 1.5240068435668945, "learning_rate": 4.006181247071256e-06, "loss": 1.1841, "step": 22370 }, { "epoch": 2.631199941215372, "grad_norm": 1.1477108001708984, "learning_rate": 3.9810883017791455e-06, "loss": 1.1238, "step": 22380 }, { "epoch": 2.6323756337717685, "grad_norm": 1.1719671487808228, "learning_rate": 3.956070930456646e-06, "loss": 1.1044, "step": 22390 }, { "epoch": 2.633551326328165, "grad_norm": 1.2114938497543335, "learning_rate": 3.93112917418822e-06, "loss": 1.0918, "step": 22400 }, { "epoch": 2.6347270188845617, "grad_norm": 1.388814926147461, "learning_rate": 3.906263073934124e-06, "loss": 1.1479, "step": 22410 }, { "epoch": 2.6359027114409583, "grad_norm": 1.1698169708251953, "learning_rate": 3.881472670530406e-06, "loss": 1.2341, "step": 22420 }, { "epoch": 2.6370784039973545, "grad_norm": 1.5400314331054688, "learning_rate": 3.8567580046887985e-06, "loss": 1.147, "step": 22430 }, { "epoch": 2.638254096553751, "grad_norm": 1.1561321020126343, "learning_rate": 3.8321191169966296e-06, "loss": 1.1313, "step": 22440 }, { "epoch": 2.6394297891101477, "grad_norm": 1.4464151859283447, "learning_rate": 3.8075560479168103e-06, "loss": 1.0998, "step": 22450 }, { "epoch": 2.6406054816665443, "grad_norm": 1.1175357103347778, "learning_rate": 3.7830688377877235e-06, "loss": 1.1855, "step": 22460 }, { "epoch": 2.641781174222941, "grad_norm": 1.5485281944274902, "learning_rate": 3.7586575268231784e-06, "loss": 1.1341, "step": 22470 }, { "epoch": 2.642956866779337, "grad_norm": 1.197023630142212, "learning_rate": 3.734322155112335e-06, "loss": 1.1002, "step": 22480 }, { "epoch": 2.6441325593357337, "grad_norm": 1.143330454826355, "learning_rate": 3.710062762619643e-06, "loss": 1.1691, "step": 22490 }, { "epoch": 2.6453082518921303, "grad_norm": 1.3081790208816528, "learning_rate": 3.685879389184782e-06, "loss": 1.1603, "step": 22500 }, { "epoch": 2.6453082518921303, "eval_loss": 1.5599348545074463, "eval_runtime": 1919.4604, "eval_samples_per_second": 31.511, "eval_steps_per_second": 3.939, "step": 22500 }, { "epoch": 2.646483944448527, "grad_norm": 1.0720388889312744, "learning_rate": 3.661772074522596e-06, "loss": 1.0775, "step": 22510 }, { "epoch": 2.6476596370049235, "grad_norm": 1.334944486618042, "learning_rate": 3.6377408582230044e-06, "loss": 1.0907, "step": 22520 }, { "epoch": 2.6488353295613196, "grad_norm": 1.1238764524459839, "learning_rate": 3.6137857797509665e-06, "loss": 1.133, "step": 22530 }, { "epoch": 2.650011022117716, "grad_norm": 1.315901279449463, "learning_rate": 3.5899068784464197e-06, "loss": 1.1458, "step": 22540 }, { "epoch": 2.651186714674113, "grad_norm": 1.151055097579956, "learning_rate": 3.5661041935241613e-06, "loss": 1.0922, "step": 22550 }, { "epoch": 2.652362407230509, "grad_norm": 1.2967767715454102, "learning_rate": 3.5423777640738566e-06, "loss": 1.1356, "step": 22560 }, { "epoch": 2.6535380997869056, "grad_norm": 1.2143229246139526, "learning_rate": 3.5187276290599314e-06, "loss": 1.1626, "step": 22570 }, { "epoch": 2.654713792343302, "grad_norm": 1.068017840385437, "learning_rate": 3.4951538273215124e-06, "loss": 1.1743, "step": 22580 }, { "epoch": 2.655889484899699, "grad_norm": 1.1613034009933472, "learning_rate": 3.4716563975723815e-06, "loss": 1.1897, "step": 22590 }, { "epoch": 2.6570651774560954, "grad_norm": 1.5198922157287598, "learning_rate": 3.4482353784008936e-06, "loss": 1.1581, "step": 22600 }, { "epoch": 2.6582408700124915, "grad_norm": 1.056044578552246, "learning_rate": 3.4248908082699093e-06, "loss": 1.1622, "step": 22610 }, { "epoch": 2.659416562568888, "grad_norm": 1.206364631652832, "learning_rate": 3.4016227255167565e-06, "loss": 1.1202, "step": 22620 }, { "epoch": 2.6605922551252847, "grad_norm": 1.1014162302017212, "learning_rate": 3.3784311683531523e-06, "loss": 1.1435, "step": 22630 }, { "epoch": 2.6617679476816813, "grad_norm": 1.1177345514297485, "learning_rate": 3.3553161748651195e-06, "loss": 1.1735, "step": 22640 }, { "epoch": 2.662943640238078, "grad_norm": 0.9363469481468201, "learning_rate": 3.332277783012977e-06, "loss": 1.0963, "step": 22650 }, { "epoch": 2.664119332794474, "grad_norm": 1.1700936555862427, "learning_rate": 3.3093160306312265e-06, "loss": 1.1924, "step": 22660 }, { "epoch": 2.6652950253508707, "grad_norm": 1.218223214149475, "learning_rate": 3.286430955428499e-06, "loss": 1.1539, "step": 22670 }, { "epoch": 2.6664707179072673, "grad_norm": 1.078202247619629, "learning_rate": 3.2636225949875256e-06, "loss": 1.1674, "step": 22680 }, { "epoch": 2.667646410463664, "grad_norm": 1.2088229656219482, "learning_rate": 3.2408909867650494e-06, "loss": 1.131, "step": 22690 }, { "epoch": 2.6688221030200605, "grad_norm": 1.2704145908355713, "learning_rate": 3.218236168091754e-06, "loss": 1.1104, "step": 22700 }, { "epoch": 2.6699977955764567, "grad_norm": 1.1240147352218628, "learning_rate": 3.1956581761722336e-06, "loss": 1.1122, "step": 22710 }, { "epoch": 2.6711734881328533, "grad_norm": 1.3395013809204102, "learning_rate": 3.1731570480849015e-06, "loss": 1.1079, "step": 22720 }, { "epoch": 2.67234918068925, "grad_norm": 1.2912640571594238, "learning_rate": 3.15073282078196e-06, "loss": 1.095, "step": 22730 }, { "epoch": 2.673524873245646, "grad_norm": 1.3903371095657349, "learning_rate": 3.128385531089295e-06, "loss": 1.1399, "step": 22740 }, { "epoch": 2.6747005658020426, "grad_norm": 1.30147123336792, "learning_rate": 3.106115215706462e-06, "loss": 1.204, "step": 22750 }, { "epoch": 2.6758762583584392, "grad_norm": 1.3236415386199951, "learning_rate": 3.0839219112065996e-06, "loss": 1.1358, "step": 22760 }, { "epoch": 2.677051950914836, "grad_norm": 1.1601516008377075, "learning_rate": 3.0618056540363814e-06, "loss": 1.1305, "step": 22770 }, { "epoch": 2.6782276434712324, "grad_norm": 1.4008907079696655, "learning_rate": 3.0397664805159486e-06, "loss": 1.156, "step": 22780 }, { "epoch": 2.6794033360276286, "grad_norm": 1.0688070058822632, "learning_rate": 3.017804426838855e-06, "loss": 1.112, "step": 22790 }, { "epoch": 2.680579028584025, "grad_norm": 1.4241867065429688, "learning_rate": 2.995919529072e-06, "loss": 1.1427, "step": 22800 }, { "epoch": 2.681754721140422, "grad_norm": 1.1876939535140991, "learning_rate": 2.974111823155573e-06, "loss": 1.1644, "step": 22810 }, { "epoch": 2.6829304136968184, "grad_norm": 1.1153663396835327, "learning_rate": 2.952381344903005e-06, "loss": 1.121, "step": 22820 }, { "epoch": 2.684106106253215, "grad_norm": 1.4539724588394165, "learning_rate": 2.9307281300008927e-06, "loss": 1.1094, "step": 22830 }, { "epoch": 2.685281798809611, "grad_norm": 1.0259804725646973, "learning_rate": 2.9091522140089533e-06, "loss": 1.1379, "step": 22840 }, { "epoch": 2.6864574913660078, "grad_norm": 0.8979310393333435, "learning_rate": 2.8876536323599655e-06, "loss": 1.1253, "step": 22850 }, { "epoch": 2.6876331839224044, "grad_norm": 1.1438335180282593, "learning_rate": 2.866232420359688e-06, "loss": 1.1374, "step": 22860 }, { "epoch": 2.688808876478801, "grad_norm": 1.2025328874588013, "learning_rate": 2.8448886131868412e-06, "loss": 1.1611, "step": 22870 }, { "epoch": 2.6899845690351976, "grad_norm": 1.163631558418274, "learning_rate": 2.823622245893015e-06, "loss": 1.1323, "step": 22880 }, { "epoch": 2.6911602615915937, "grad_norm": 1.3451321125030518, "learning_rate": 2.8024333534026335e-06, "loss": 1.1849, "step": 22890 }, { "epoch": 2.6923359541479903, "grad_norm": 1.4892504215240479, "learning_rate": 2.7813219705128846e-06, "loss": 1.1904, "step": 22900 }, { "epoch": 2.693511646704387, "grad_norm": 0.9732836484909058, "learning_rate": 2.760288131893668e-06, "loss": 1.1695, "step": 22910 }, { "epoch": 2.694687339260783, "grad_norm": 1.577136516571045, "learning_rate": 2.7393318720875304e-06, "loss": 1.1075, "step": 22920 }, { "epoch": 2.6958630318171797, "grad_norm": 1.0686618089675903, "learning_rate": 2.718453225509626e-06, "loss": 1.1217, "step": 22930 }, { "epoch": 2.6970387243735763, "grad_norm": 1.198993444442749, "learning_rate": 2.697652226447639e-06, "loss": 1.1897, "step": 22940 }, { "epoch": 2.698214416929973, "grad_norm": 1.1849384307861328, "learning_rate": 2.676928909061749e-06, "loss": 1.1103, "step": 22950 }, { "epoch": 2.6993901094863695, "grad_norm": 1.0488717555999756, "learning_rate": 2.656283307384566e-06, "loss": 1.123, "step": 22960 }, { "epoch": 2.7005658020427656, "grad_norm": 1.0016475915908813, "learning_rate": 2.635715455321053e-06, "loss": 1.1399, "step": 22970 }, { "epoch": 2.7017414945991622, "grad_norm": 1.2014634609222412, "learning_rate": 2.6152253866485176e-06, "loss": 1.135, "step": 22980 }, { "epoch": 2.702917187155559, "grad_norm": 0.9741714000701904, "learning_rate": 2.594813135016494e-06, "loss": 1.1179, "step": 22990 }, { "epoch": 2.7040928797119554, "grad_norm": 1.3688714504241943, "learning_rate": 2.5744787339467557e-06, "loss": 1.2281, "step": 23000 }, { "epoch": 2.7040928797119554, "eval_loss": 1.5587701797485352, "eval_runtime": 1919.1039, "eval_samples_per_second": 31.517, "eval_steps_per_second": 3.94, "step": 23000 }, { "epoch": 2.705268572268352, "grad_norm": 1.211548089981079, "learning_rate": 2.554222216833202e-06, "loss": 1.1432, "step": 23010 }, { "epoch": 2.706444264824748, "grad_norm": 1.4171385765075684, "learning_rate": 2.5340436169418503e-06, "loss": 1.1364, "step": 23020 }, { "epoch": 2.707619957381145, "grad_norm": 1.2138334512710571, "learning_rate": 2.5139429674107486e-06, "loss": 1.1916, "step": 23030 }, { "epoch": 2.7087956499375414, "grad_norm": 1.126107931137085, "learning_rate": 2.493920301249919e-06, "loss": 1.1045, "step": 23040 }, { "epoch": 2.7099713424939376, "grad_norm": 1.0342949628829956, "learning_rate": 2.473975651341348e-06, "loss": 1.1313, "step": 23050 }, { "epoch": 2.7111470350503346, "grad_norm": 1.1416676044464111, "learning_rate": 2.454109050438863e-06, "loss": 1.1649, "step": 23060 }, { "epoch": 2.7123227276067308, "grad_norm": 1.419245719909668, "learning_rate": 2.434320531168144e-06, "loss": 1.1499, "step": 23070 }, { "epoch": 2.7134984201631274, "grad_norm": 1.2786529064178467, "learning_rate": 2.414610126026634e-06, "loss": 1.1473, "step": 23080 }, { "epoch": 2.714674112719524, "grad_norm": 1.1608556509017944, "learning_rate": 2.394977867383491e-06, "loss": 1.2051, "step": 23090 }, { "epoch": 2.71584980527592, "grad_norm": 1.1766761541366577, "learning_rate": 2.375423787479547e-06, "loss": 1.1426, "step": 23100 }, { "epoch": 2.7170254978323167, "grad_norm": 1.646866798400879, "learning_rate": 2.3559479184272317e-06, "loss": 1.1383, "step": 23110 }, { "epoch": 2.7182011903887133, "grad_norm": 1.2091033458709717, "learning_rate": 2.3365502922105486e-06, "loss": 1.1834, "step": 23120 }, { "epoch": 2.71937688294511, "grad_norm": 1.1884089708328247, "learning_rate": 2.3172309406850002e-06, "loss": 1.1301, "step": 23130 }, { "epoch": 2.7205525755015065, "grad_norm": 1.1031832695007324, "learning_rate": 2.2979898955775402e-06, "loss": 1.1943, "step": 23140 }, { "epoch": 2.7217282680579027, "grad_norm": 1.7939648628234863, "learning_rate": 2.2788271884865375e-06, "loss": 1.1773, "step": 23150 }, { "epoch": 2.7229039606142993, "grad_norm": 1.330185055732727, "learning_rate": 2.259742850881702e-06, "loss": 1.173, "step": 23160 }, { "epoch": 2.724079653170696, "grad_norm": 1.272143840789795, "learning_rate": 2.240736914104036e-06, "loss": 1.1051, "step": 23170 }, { "epoch": 2.7252553457270925, "grad_norm": 1.4415802955627441, "learning_rate": 2.221809409365794e-06, "loss": 1.181, "step": 23180 }, { "epoch": 2.726431038283489, "grad_norm": 1.1886862516403198, "learning_rate": 2.202960367750434e-06, "loss": 1.1023, "step": 23190 }, { "epoch": 2.7276067308398853, "grad_norm": 1.0761234760284424, "learning_rate": 2.184189820212562e-06, "loss": 1.102, "step": 23200 }, { "epoch": 2.728782423396282, "grad_norm": 1.1269996166229248, "learning_rate": 2.1654977975778533e-06, "loss": 1.1673, "step": 23210 }, { "epoch": 2.7299581159526785, "grad_norm": 1.1329606771469116, "learning_rate": 2.1468843305430585e-06, "loss": 1.0868, "step": 23220 }, { "epoch": 2.7311338085090746, "grad_norm": 2.0541670322418213, "learning_rate": 2.128349449675898e-06, "loss": 1.1566, "step": 23230 }, { "epoch": 2.732309501065471, "grad_norm": 1.172316312789917, "learning_rate": 2.1098931854150407e-06, "loss": 1.1391, "step": 23240 }, { "epoch": 2.733485193621868, "grad_norm": 1.2649376392364502, "learning_rate": 2.091515568070057e-06, "loss": 1.199, "step": 23250 }, { "epoch": 2.7346608861782644, "grad_norm": 1.2650139331817627, "learning_rate": 2.0732166278213507e-06, "loss": 1.212, "step": 23260 }, { "epoch": 2.735836578734661, "grad_norm": 0.9937518239021301, "learning_rate": 2.0549963947201212e-06, "loss": 1.0961, "step": 23270 }, { "epoch": 2.737012271291057, "grad_norm": 1.3294414281845093, "learning_rate": 2.036854898688317e-06, "loss": 1.1581, "step": 23280 }, { "epoch": 2.7381879638474538, "grad_norm": 1.3171072006225586, "learning_rate": 2.0187921695185742e-06, "loss": 1.1911, "step": 23290 }, { "epoch": 2.7393636564038504, "grad_norm": 1.505049705505371, "learning_rate": 2.000808236874191e-06, "loss": 1.2242, "step": 23300 }, { "epoch": 2.740539348960247, "grad_norm": 1.433174967765808, "learning_rate": 1.982903130289032e-06, "loss": 1.1497, "step": 23310 }, { "epoch": 2.7417150415166436, "grad_norm": 1.1023048162460327, "learning_rate": 1.9650768791675377e-06, "loss": 1.0681, "step": 23320 }, { "epoch": 2.7428907340730397, "grad_norm": 1.4512269496917725, "learning_rate": 1.947329512784646e-06, "loss": 1.1494, "step": 23330 }, { "epoch": 2.7440664266294363, "grad_norm": 1.1900670528411865, "learning_rate": 1.929661060285737e-06, "loss": 1.1318, "step": 23340 }, { "epoch": 2.745242119185833, "grad_norm": 1.1529614925384521, "learning_rate": 1.9120715506866005e-06, "loss": 1.1511, "step": 23350 }, { "epoch": 2.7464178117422295, "grad_norm": 1.056344985961914, "learning_rate": 1.8945610128733914e-06, "loss": 1.1744, "step": 23360 }, { "epoch": 2.747593504298626, "grad_norm": 1.2787485122680664, "learning_rate": 1.877129475602557e-06, "loss": 1.1243, "step": 23370 }, { "epoch": 2.7487691968550223, "grad_norm": 1.2937983274459839, "learning_rate": 1.8597769675008258e-06, "loss": 1.1414, "step": 23380 }, { "epoch": 2.749944889411419, "grad_norm": 1.214745283126831, "learning_rate": 1.8425035170651196e-06, "loss": 1.1465, "step": 23390 }, { "epoch": 2.7511205819678155, "grad_norm": 1.3193306922912598, "learning_rate": 1.8253091526625577e-06, "loss": 1.1566, "step": 23400 }, { "epoch": 2.7522962745242117, "grad_norm": 1.1812658309936523, "learning_rate": 1.808193902530353e-06, "loss": 1.1614, "step": 23410 }, { "epoch": 2.7534719670806083, "grad_norm": 1.0444519519805908, "learning_rate": 1.7911577947758162e-06, "loss": 1.2012, "step": 23420 }, { "epoch": 2.754647659637005, "grad_norm": 1.2699439525604248, "learning_rate": 1.774200857376268e-06, "loss": 1.1573, "step": 23430 }, { "epoch": 2.7558233521934015, "grad_norm": 1.0471104383468628, "learning_rate": 1.7573231181790273e-06, "loss": 1.1616, "step": 23440 }, { "epoch": 2.756999044749798, "grad_norm": 1.2009565830230713, "learning_rate": 1.7405246049013447e-06, "loss": 1.2323, "step": 23450 }, { "epoch": 2.7581747373061942, "grad_norm": 1.214815616607666, "learning_rate": 1.723805345130358e-06, "loss": 1.1561, "step": 23460 }, { "epoch": 2.759350429862591, "grad_norm": 1.193826675415039, "learning_rate": 1.7071653663230659e-06, "loss": 1.1669, "step": 23470 }, { "epoch": 2.7605261224189874, "grad_norm": 1.1392747163772583, "learning_rate": 1.6906046958062637e-06, "loss": 1.1851, "step": 23480 }, { "epoch": 2.761701814975384, "grad_norm": 1.280152440071106, "learning_rate": 1.6741233607764971e-06, "loss": 1.1599, "step": 23490 }, { "epoch": 2.7628775075317806, "grad_norm": 1.0802146196365356, "learning_rate": 1.6577213883000209e-06, "loss": 1.1626, "step": 23500 }, { "epoch": 2.7628775075317806, "eval_loss": 1.557753324508667, "eval_runtime": 1919.8299, "eval_samples_per_second": 31.505, "eval_steps_per_second": 3.938, "step": 23500 }, { "epoch": 2.764053200088177, "grad_norm": 1.4651931524276733, "learning_rate": 1.641398805312766e-06, "loss": 1.1393, "step": 23510 }, { "epoch": 2.7652288926445734, "grad_norm": 1.3749213218688965, "learning_rate": 1.625155638620296e-06, "loss": 1.1273, "step": 23520 }, { "epoch": 2.76640458520097, "grad_norm": 1.3777464628219604, "learning_rate": 1.6089919148977396e-06, "loss": 1.1594, "step": 23530 }, { "epoch": 2.7675802777573666, "grad_norm": 1.1670223474502563, "learning_rate": 1.5929076606897574e-06, "loss": 1.1017, "step": 23540 }, { "epoch": 2.768755970313763, "grad_norm": 1.0072933435440063, "learning_rate": 1.576902902410521e-06, "loss": 1.1576, "step": 23550 }, { "epoch": 2.7699316628701594, "grad_norm": 1.291451334953308, "learning_rate": 1.5609776663436381e-06, "loss": 1.1316, "step": 23560 }, { "epoch": 2.771107355426556, "grad_norm": 1.0116820335388184, "learning_rate": 1.5451319786421225e-06, "loss": 1.1662, "step": 23570 }, { "epoch": 2.7722830479829526, "grad_norm": 1.0701724290847778, "learning_rate": 1.529365865328347e-06, "loss": 1.1991, "step": 23580 }, { "epoch": 2.7734587405393487, "grad_norm": 0.8560823798179626, "learning_rate": 1.5136793522940173e-06, "loss": 1.1764, "step": 23590 }, { "epoch": 2.7746344330957453, "grad_norm": 1.2107146978378296, "learning_rate": 1.4980724653001043e-06, "loss": 1.1404, "step": 23600 }, { "epoch": 2.775810125652142, "grad_norm": 1.1777063608169556, "learning_rate": 1.4825452299768173e-06, "loss": 1.192, "step": 23610 }, { "epoch": 2.7769858182085385, "grad_norm": 1.2553075551986694, "learning_rate": 1.4670976718235695e-06, "loss": 1.1494, "step": 23620 }, { "epoch": 2.778161510764935, "grad_norm": 1.2251709699630737, "learning_rate": 1.4517298162089022e-06, "loss": 1.1701, "step": 23630 }, { "epoch": 2.7793372033213313, "grad_norm": 1.2784150838851929, "learning_rate": 1.4364416883704934e-06, "loss": 1.1144, "step": 23640 }, { "epoch": 2.780512895877728, "grad_norm": 1.216147541999817, "learning_rate": 1.4212333134150657e-06, "loss": 1.1764, "step": 23650 }, { "epoch": 2.7816885884341245, "grad_norm": 1.4232467412948608, "learning_rate": 1.406104716318385e-06, "loss": 1.2009, "step": 23660 }, { "epoch": 2.782864280990521, "grad_norm": 1.1044601202011108, "learning_rate": 1.3910559219251995e-06, "loss": 1.1159, "step": 23670 }, { "epoch": 2.7840399735469177, "grad_norm": 1.609036922454834, "learning_rate": 1.376086954949185e-06, "loss": 1.2252, "step": 23680 }, { "epoch": 2.785215666103314, "grad_norm": 1.2237164974212646, "learning_rate": 1.3611978399729498e-06, "loss": 1.1254, "step": 23690 }, { "epoch": 2.7863913586597104, "grad_norm": 1.2777374982833862, "learning_rate": 1.3463886014479455e-06, "loss": 1.1273, "step": 23700 }, { "epoch": 2.787567051216107, "grad_norm": 1.2509548664093018, "learning_rate": 1.331659263694457e-06, "loss": 1.1172, "step": 23710 }, { "epoch": 2.7887427437725036, "grad_norm": 1.1575068235397339, "learning_rate": 1.3170098509015516e-06, "loss": 1.1697, "step": 23720 }, { "epoch": 2.7899184363289002, "grad_norm": 1.1010932922363281, "learning_rate": 1.3024403871270408e-06, "loss": 1.1108, "step": 23730 }, { "epoch": 2.7910941288852964, "grad_norm": 1.025123119354248, "learning_rate": 1.2879508962974407e-06, "loss": 1.1399, "step": 23740 }, { "epoch": 2.792269821441693, "grad_norm": 1.1506879329681396, "learning_rate": 1.2735414022079284e-06, "loss": 1.1664, "step": 23750 }, { "epoch": 2.7934455139980896, "grad_norm": 1.1540110111236572, "learning_rate": 1.2592119285223136e-06, "loss": 1.1114, "step": 23760 }, { "epoch": 2.7946212065544858, "grad_norm": 1.0166974067687988, "learning_rate": 1.2449624987729947e-06, "loss": 1.0946, "step": 23770 }, { "epoch": 2.7957968991108824, "grad_norm": 1.1716296672821045, "learning_rate": 1.2307931363609083e-06, "loss": 1.1227, "step": 23780 }, { "epoch": 2.796972591667279, "grad_norm": 1.1935173273086548, "learning_rate": 1.2167038645555185e-06, "loss": 1.1791, "step": 23790 }, { "epoch": 2.7981482842236756, "grad_norm": 1.2963321208953857, "learning_rate": 1.2026947064947392e-06, "loss": 1.0776, "step": 23800 }, { "epoch": 2.799323976780072, "grad_norm": 1.3039040565490723, "learning_rate": 1.1887656851849504e-06, "loss": 1.1658, "step": 23810 }, { "epoch": 2.8004996693364683, "grad_norm": 1.1612639427185059, "learning_rate": 1.1749168235008878e-06, "loss": 1.1697, "step": 23820 }, { "epoch": 2.801675361892865, "grad_norm": 1.2395830154418945, "learning_rate": 1.1611481441856808e-06, "loss": 1.1299, "step": 23830 }, { "epoch": 2.8028510544492615, "grad_norm": 1.222507119178772, "learning_rate": 1.147459669850759e-06, "loss": 1.0911, "step": 23840 }, { "epoch": 2.804026747005658, "grad_norm": 0.9755324125289917, "learning_rate": 1.1338514229758512e-06, "loss": 1.212, "step": 23850 }, { "epoch": 2.8052024395620547, "grad_norm": 1.8187111616134644, "learning_rate": 1.1203234259089257e-06, "loss": 1.1512, "step": 23860 }, { "epoch": 2.806378132118451, "grad_norm": 1.0976676940917969, "learning_rate": 1.1068757008661612e-06, "loss": 1.199, "step": 23870 }, { "epoch": 2.8075538246748475, "grad_norm": 1.307944893836975, "learning_rate": 1.0935082699319088e-06, "loss": 1.1884, "step": 23880 }, { "epoch": 2.808729517231244, "grad_norm": 1.304331660270691, "learning_rate": 1.0802211550586693e-06, "loss": 1.1764, "step": 23890 }, { "epoch": 2.8099052097876407, "grad_norm": 1.1319321393966675, "learning_rate": 1.0670143780670327e-06, "loss": 1.1841, "step": 23900 }, { "epoch": 2.8110809023440373, "grad_norm": 1.0646966695785522, "learning_rate": 1.053887960645661e-06, "loss": 1.1013, "step": 23910 }, { "epoch": 2.8122565949004334, "grad_norm": 1.1673672199249268, "learning_rate": 1.040841924351238e-06, "loss": 1.1455, "step": 23920 }, { "epoch": 2.81343228745683, "grad_norm": 1.4770869016647339, "learning_rate": 1.0278762906084705e-06, "loss": 1.1272, "step": 23930 }, { "epoch": 2.8146079800132267, "grad_norm": 1.563546895980835, "learning_rate": 1.0149910807099816e-06, "loss": 1.1129, "step": 23940 }, { "epoch": 2.815783672569623, "grad_norm": 1.0824576616287231, "learning_rate": 1.0021863158163502e-06, "loss": 1.1592, "step": 23950 }, { "epoch": 2.8169593651260194, "grad_norm": 1.2945562601089478, "learning_rate": 9.894620169560388e-07, "loss": 1.1599, "step": 23960 }, { "epoch": 2.818135057682416, "grad_norm": 1.14474356174469, "learning_rate": 9.768182050253705e-07, "loss": 1.1221, "step": 23970 }, { "epoch": 2.8193107502388126, "grad_norm": 1.253293514251709, "learning_rate": 9.64254900788475e-07, "loss": 1.0916, "step": 23980 }, { "epoch": 2.820486442795209, "grad_norm": 1.101318597793579, "learning_rate": 9.517721248772815e-07, "loss": 1.1834, "step": 23990 }, { "epoch": 2.8216621353516054, "grad_norm": 1.2081878185272217, "learning_rate": 9.3936989779147e-07, "loss": 1.077, "step": 24000 }, { "epoch": 2.8216621353516054, "eval_loss": 1.5579417943954468, "eval_runtime": 1920.9062, "eval_samples_per_second": 31.488, "eval_steps_per_second": 3.936, "step": 24000 }, { "epoch": 2.822837827908002, "grad_norm": 1.100292444229126, "learning_rate": 9.270482398984426e-07, "loss": 1.1693, "step": 24010 }, { "epoch": 2.8240135204643986, "grad_norm": 1.3238283395767212, "learning_rate": 9.14807171433274e-07, "loss": 1.0858, "step": 24020 }, { "epoch": 2.825189213020795, "grad_norm": 1.287351131439209, "learning_rate": 9.026467124987115e-07, "loss": 1.1186, "step": 24030 }, { "epoch": 2.8263649055771918, "grad_norm": 1.1819802522659302, "learning_rate": 8.905668830651137e-07, "loss": 1.1487, "step": 24040 }, { "epoch": 2.827540598133588, "grad_norm": 1.0516037940979004, "learning_rate": 8.785677029704231e-07, "loss": 1.1494, "step": 24050 }, { "epoch": 2.8287162906899845, "grad_norm": 1.2637062072753906, "learning_rate": 8.666491919201381e-07, "loss": 1.1518, "step": 24060 }, { "epoch": 2.829891983246381, "grad_norm": 1.4629069566726685, "learning_rate": 8.54811369487285e-07, "loss": 1.1235, "step": 24070 }, { "epoch": 2.8310676758027777, "grad_norm": 1.3523629903793335, "learning_rate": 8.430542551123744e-07, "loss": 1.1056, "step": 24080 }, { "epoch": 2.8322433683591743, "grad_norm": 1.1275618076324463, "learning_rate": 8.313778681033779e-07, "loss": 1.1679, "step": 24090 }, { "epoch": 2.8334190609155705, "grad_norm": 1.3627299070358276, "learning_rate": 8.197822276356904e-07, "loss": 1.1368, "step": 24100 }, { "epoch": 2.834594753471967, "grad_norm": 1.583010196685791, "learning_rate": 8.082673527521012e-07, "loss": 1.1939, "step": 24110 }, { "epoch": 2.8357704460283637, "grad_norm": 1.5272597074508667, "learning_rate": 7.968332623627784e-07, "loss": 1.1243, "step": 24120 }, { "epoch": 2.83694613858476, "grad_norm": 1.1384254693984985, "learning_rate": 7.854799752452014e-07, "loss": 1.1818, "step": 24130 }, { "epoch": 2.8381218311411565, "grad_norm": 1.0555572509765625, "learning_rate": 7.74207510044167e-07, "loss": 1.0874, "step": 24140 }, { "epoch": 2.839297523697553, "grad_norm": 1.1546967029571533, "learning_rate": 7.630158852717284e-07, "loss": 1.1385, "step": 24150 }, { "epoch": 2.8404732162539497, "grad_norm": 1.3868951797485352, "learning_rate": 7.519051193071947e-07, "loss": 1.2095, "step": 24160 }, { "epoch": 2.8416489088103463, "grad_norm": 1.0660392045974731, "learning_rate": 7.408752303970756e-07, "loss": 1.1965, "step": 24170 }, { "epoch": 2.8428246013667424, "grad_norm": 1.1958320140838623, "learning_rate": 7.299262366550763e-07, "loss": 1.1508, "step": 24180 }, { "epoch": 2.844000293923139, "grad_norm": 0.9313267469406128, "learning_rate": 7.190581560620302e-07, "loss": 1.1754, "step": 24190 }, { "epoch": 2.8451759864795356, "grad_norm": 1.2544128894805908, "learning_rate": 7.082710064658993e-07, "loss": 1.1296, "step": 24200 }, { "epoch": 2.8463516790359322, "grad_norm": 1.0625300407409668, "learning_rate": 6.975648055817574e-07, "loss": 1.1724, "step": 24210 }, { "epoch": 2.847527371592329, "grad_norm": 1.2196499109268188, "learning_rate": 6.869395709917125e-07, "loss": 1.1338, "step": 24220 }, { "epoch": 2.848703064148725, "grad_norm": 1.3948079347610474, "learning_rate": 6.763953201449291e-07, "loss": 1.1377, "step": 24230 }, { "epoch": 2.8498787567051216, "grad_norm": 1.1738827228546143, "learning_rate": 6.659320703575611e-07, "loss": 1.1362, "step": 24240 }, { "epoch": 2.851054449261518, "grad_norm": 1.194001317024231, "learning_rate": 6.555498388127579e-07, "loss": 1.1306, "step": 24250 }, { "epoch": 2.8522301418179143, "grad_norm": 1.5260933637619019, "learning_rate": 6.452486425605975e-07, "loss": 1.1161, "step": 24260 }, { "epoch": 2.8534058343743114, "grad_norm": 0.7872841954231262, "learning_rate": 6.350284985180865e-07, "loss": 1.2083, "step": 24270 }, { "epoch": 2.8545815269307075, "grad_norm": 1.2123403549194336, "learning_rate": 6.248894234691327e-07, "loss": 1.162, "step": 24280 }, { "epoch": 2.855757219487104, "grad_norm": 1.2504714727401733, "learning_rate": 6.148314340644945e-07, "loss": 1.1505, "step": 24290 }, { "epoch": 2.8569329120435007, "grad_norm": 1.040280818939209, "learning_rate": 6.04854546821787e-07, "loss": 1.1088, "step": 24300 }, { "epoch": 2.858108604599897, "grad_norm": 1.059801459312439, "learning_rate": 5.949587781254151e-07, "loss": 1.1923, "step": 24310 }, { "epoch": 2.8592842971562935, "grad_norm": 1.2096402645111084, "learning_rate": 5.851441442265904e-07, "loss": 1.1219, "step": 24320 }, { "epoch": 2.86045998971269, "grad_norm": 1.0836652517318726, "learning_rate": 5.754106612432531e-07, "loss": 1.1373, "step": 24330 }, { "epoch": 2.8616356822690867, "grad_norm": 1.3796131610870361, "learning_rate": 5.657583451601056e-07, "loss": 1.1842, "step": 24340 }, { "epoch": 2.8628113748254833, "grad_norm": 1.3925491571426392, "learning_rate": 5.56187211828535e-07, "loss": 1.1924, "step": 24350 }, { "epoch": 2.8639870673818795, "grad_norm": 1.1930854320526123, "learning_rate": 5.466972769666124e-07, "loss": 1.162, "step": 24360 }, { "epoch": 2.865162759938276, "grad_norm": 1.2747124433517456, "learning_rate": 5.372885561590657e-07, "loss": 1.1582, "step": 24370 }, { "epoch": 2.8663384524946727, "grad_norm": 1.477066993713379, "learning_rate": 5.279610648572408e-07, "loss": 1.1599, "step": 24380 }, { "epoch": 2.8675141450510693, "grad_norm": 1.3650463819503784, "learning_rate": 5.187148183790957e-07, "loss": 1.1547, "step": 24390 }, { "epoch": 2.868689837607466, "grad_norm": 1.2066184282302856, "learning_rate": 5.095498319091618e-07, "loss": 1.1162, "step": 24400 }, { "epoch": 2.869865530163862, "grad_norm": 1.4237841367721558, "learning_rate": 5.004661204985162e-07, "loss": 1.1256, "step": 24410 }, { "epoch": 2.8710412227202586, "grad_norm": 1.1365289688110352, "learning_rate": 4.914636990647703e-07, "loss": 1.1834, "step": 24420 }, { "epoch": 2.8722169152766552, "grad_norm": 1.4668817520141602, "learning_rate": 4.82542582392037e-07, "loss": 1.1566, "step": 24430 }, { "epoch": 2.8733926078330514, "grad_norm": 1.1685539484024048, "learning_rate": 4.737027851309028e-07, "loss": 1.1612, "step": 24440 }, { "epoch": 2.874568300389448, "grad_norm": 1.0939958095550537, "learning_rate": 4.6494432179841065e-07, "loss": 1.1739, "step": 24450 }, { "epoch": 2.8757439929458446, "grad_norm": 1.0863882303237915, "learning_rate": 4.562672067780383e-07, "loss": 1.1172, "step": 24460 }, { "epoch": 2.876919685502241, "grad_norm": 1.105719804763794, "learning_rate": 4.476714543196703e-07, "loss": 1.1805, "step": 24470 }, { "epoch": 2.878095378058638, "grad_norm": 1.0927035808563232, "learning_rate": 4.3915707853955936e-07, "loss": 1.1503, "step": 24480 }, { "epoch": 2.879271070615034, "grad_norm": 1.1344505548477173, "learning_rate": 4.3072409342034826e-07, "loss": 1.0852, "step": 24490 }, { "epoch": 2.8804467631714306, "grad_norm": 1.3309226036071777, "learning_rate": 4.2237251281098677e-07, "loss": 1.1677, "step": 24500 }, { "epoch": 2.8804467631714306, "eval_loss": 1.55752432346344, "eval_runtime": 1921.8185, "eval_samples_per_second": 31.473, "eval_steps_per_second": 3.934, "step": 24500 }, { "epoch": 2.881622455727827, "grad_norm": 1.2822154760360718, "learning_rate": 4.1410235042674826e-07, "loss": 1.176, "step": 24510 }, { "epoch": 2.8827981482842238, "grad_norm": 1.3878905773162842, "learning_rate": 4.0591361984921306e-07, "loss": 1.1552, "step": 24520 }, { "epoch": 2.8839738408406204, "grad_norm": 1.1055859327316284, "learning_rate": 3.978063345262073e-07, "loss": 1.1756, "step": 24530 }, { "epoch": 2.8851495333970165, "grad_norm": 1.2042357921600342, "learning_rate": 3.897805077718253e-07, "loss": 1.1731, "step": 24540 }, { "epoch": 2.886325225953413, "grad_norm": 1.5468255281448364, "learning_rate": 3.8183615276637387e-07, "loss": 1.1922, "step": 24550 }, { "epoch": 2.8875009185098097, "grad_norm": 1.457342267036438, "learning_rate": 3.7397328255637243e-07, "loss": 1.1577, "step": 24560 }, { "epoch": 2.8886766110662063, "grad_norm": 1.2725560665130615, "learning_rate": 3.661919100545197e-07, "loss": 1.115, "step": 24570 }, { "epoch": 2.889852303622603, "grad_norm": 1.177162766456604, "learning_rate": 3.5849204803966586e-07, "loss": 1.1638, "step": 24580 }, { "epoch": 2.891027996178999, "grad_norm": 1.4513847827911377, "learning_rate": 3.5087370915682373e-07, "loss": 1.1521, "step": 24590 }, { "epoch": 2.8922036887353957, "grad_norm": 1.215273380279541, "learning_rate": 3.433369059171021e-07, "loss": 1.1601, "step": 24600 }, { "epoch": 2.8933793812917923, "grad_norm": 1.2829140424728394, "learning_rate": 3.358816506977225e-07, "loss": 1.1557, "step": 24610 }, { "epoch": 2.8945550738481884, "grad_norm": 1.1659669876098633, "learning_rate": 3.285079557419857e-07, "loss": 1.1308, "step": 24620 }, { "epoch": 2.895730766404585, "grad_norm": 1.236364722251892, "learning_rate": 3.2121583315924964e-07, "loss": 1.1988, "step": 24630 }, { "epoch": 2.8969064589609816, "grad_norm": 1.2171268463134766, "learning_rate": 3.140052949249017e-07, "loss": 1.1729, "step": 24640 }, { "epoch": 2.8980821515173782, "grad_norm": 1.2187587022781372, "learning_rate": 3.068763528803642e-07, "loss": 1.071, "step": 24650 }, { "epoch": 2.899257844073775, "grad_norm": 1.120725154876709, "learning_rate": 2.9982901873304994e-07, "loss": 1.1382, "step": 24660 }, { "epoch": 2.900433536630171, "grad_norm": 1.2168409824371338, "learning_rate": 2.9286330405634554e-07, "loss": 1.2336, "step": 24670 }, { "epoch": 2.9016092291865676, "grad_norm": 1.1205121278762817, "learning_rate": 2.859792202896172e-07, "loss": 1.1319, "step": 24680 }, { "epoch": 2.902784921742964, "grad_norm": 1.3653805255889893, "learning_rate": 2.7917677873815496e-07, "loss": 1.1453, "step": 24690 }, { "epoch": 2.903960614299361, "grad_norm": 1.2413114309310913, "learning_rate": 2.7245599057318937e-07, "loss": 1.2014, "step": 24700 }, { "epoch": 2.9051363068557574, "grad_norm": 1.0825512409210205, "learning_rate": 2.658168668318417e-07, "loss": 1.1349, "step": 24710 }, { "epoch": 2.9063119994121536, "grad_norm": 1.2527415752410889, "learning_rate": 2.5925941841713483e-07, "loss": 1.0921, "step": 24720 }, { "epoch": 2.90748769196855, "grad_norm": 1.332753300666809, "learning_rate": 2.527836560979491e-07, "loss": 1.1491, "step": 24730 }, { "epoch": 2.9086633845249468, "grad_norm": 1.3713496923446655, "learning_rate": 2.463895905090219e-07, "loss": 1.1835, "step": 24740 }, { "epoch": 2.9098390770813434, "grad_norm": 1.0393855571746826, "learning_rate": 2.4007723215093146e-07, "loss": 1.1885, "step": 24750 }, { "epoch": 2.91101476963774, "grad_norm": 1.0722877979278564, "learning_rate": 2.3384659139006337e-07, "loss": 1.1509, "step": 24760 }, { "epoch": 2.912190462194136, "grad_norm": 1.2309484481811523, "learning_rate": 2.2769767845861045e-07, "loss": 1.1408, "step": 24770 }, { "epoch": 2.9133661547505327, "grad_norm": 1.0550230741500854, "learning_rate": 2.2163050345453962e-07, "loss": 1.1733, "step": 24780 }, { "epoch": 2.9145418473069293, "grad_norm": 1.505908727645874, "learning_rate": 2.1564507634160292e-07, "loss": 1.206, "step": 24790 }, { "epoch": 2.9157175398633255, "grad_norm": 1.5790512561798096, "learning_rate": 2.0974140694928202e-07, "loss": 1.1932, "step": 24800 }, { "epoch": 2.916893232419722, "grad_norm": 1.162100911140442, "learning_rate": 2.039195049728049e-07, "loss": 1.1445, "step": 24810 }, { "epoch": 2.9180689249761187, "grad_norm": 1.1965610980987549, "learning_rate": 1.9817937997311797e-07, "loss": 1.2171, "step": 24820 }, { "epoch": 2.9192446175325153, "grad_norm": 1.1473971605300903, "learning_rate": 1.9252104137686965e-07, "loss": 1.1797, "step": 24830 }, { "epoch": 2.920420310088912, "grad_norm": 1.292829990386963, "learning_rate": 1.8694449847638794e-07, "loss": 1.1451, "step": 24840 }, { "epoch": 2.921596002645308, "grad_norm": 1.2526473999023438, "learning_rate": 1.8144976042968055e-07, "loss": 1.1905, "step": 24850 }, { "epoch": 2.9227716952017047, "grad_norm": 1.0829849243164062, "learning_rate": 1.7603683626041257e-07, "loss": 1.1781, "step": 24860 }, { "epoch": 2.9239473877581013, "grad_norm": 1.1450783014297485, "learning_rate": 1.7070573485789554e-07, "loss": 1.1761, "step": 24870 }, { "epoch": 2.925123080314498, "grad_norm": 1.1385432481765747, "learning_rate": 1.6545646497704847e-07, "loss": 1.1371, "step": 24880 }, { "epoch": 2.9262987728708945, "grad_norm": 1.7975353002548218, "learning_rate": 1.602890352384312e-07, "loss": 1.1607, "step": 24890 }, { "epoch": 2.9274744654272906, "grad_norm": 1.124440312385559, "learning_rate": 1.5520345412818327e-07, "loss": 1.1762, "step": 24900 }, { "epoch": 2.928650157983687, "grad_norm": 1.2971079349517822, "learning_rate": 1.501997299980351e-07, "loss": 1.1739, "step": 24910 }, { "epoch": 2.929825850540084, "grad_norm": 1.0885522365570068, "learning_rate": 1.4527787106529133e-07, "loss": 1.1825, "step": 24920 }, { "epoch": 2.9310015430964804, "grad_norm": 0.9646847248077393, "learning_rate": 1.4043788541280856e-07, "loss": 1.1454, "step": 24930 }, { "epoch": 2.932177235652877, "grad_norm": 1.3883405923843384, "learning_rate": 1.3567978098899536e-07, "loss": 1.1551, "step": 24940 }, { "epoch": 2.933352928209273, "grad_norm": 1.162287712097168, "learning_rate": 1.3100356560778458e-07, "loss": 1.1569, "step": 24950 }, { "epoch": 2.93452862076567, "grad_norm": 1.053520679473877, "learning_rate": 1.2640924694862777e-07, "loss": 1.2158, "step": 24960 }, { "epoch": 2.9357043133220664, "grad_norm": 1.1270520687103271, "learning_rate": 1.2189683255649508e-07, "loss": 1.1346, "step": 24970 }, { "epoch": 2.9368800058784625, "grad_norm": 1.1189196109771729, "learning_rate": 1.1746632984183659e-07, "loss": 1.18, "step": 24980 }, { "epoch": 2.938055698434859, "grad_norm": 1.0846132040023804, "learning_rate": 1.1311774608058212e-07, "loss": 1.1666, "step": 24990 }, { "epoch": 2.9392313909912557, "grad_norm": 1.1636384725570679, "learning_rate": 1.0885108841415249e-07, "loss": 1.1624, "step": 25000 }, { "epoch": 2.9392313909912557, "eval_loss": 1.5574328899383545, "eval_runtime": 1923.1945, "eval_samples_per_second": 31.45, "eval_steps_per_second": 3.931, "step": 25000 }, { "epoch": 2.9404070835476523, "grad_norm": 1.3450044393539429, "learning_rate": 1.0466636384940387e-07, "loss": 1.1912, "step": 25010 }, { "epoch": 2.941582776104049, "grad_norm": 1.356529712677002, "learning_rate": 1.0056357925863902e-07, "loss": 1.1282, "step": 25020 }, { "epoch": 2.942758468660445, "grad_norm": 1.6541097164154053, "learning_rate": 9.654274137961827e-08, "loss": 1.1227, "step": 25030 }, { "epoch": 2.9439341612168417, "grad_norm": 1.3433836698532104, "learning_rate": 9.260385681549855e-08, "loss": 1.1757, "step": 25040 }, { "epoch": 2.9451098537732383, "grad_norm": 1.1945890188217163, "learning_rate": 8.874693203487216e-08, "loss": 1.1653, "step": 25050 }, { "epoch": 2.946285546329635, "grad_norm": 1.2607718706130981, "learning_rate": 8.497197337171691e-08, "loss": 1.1938, "step": 25060 }, { "epoch": 2.9474612388860315, "grad_norm": 1.0359092950820923, "learning_rate": 8.127898702541826e-08, "loss": 1.1747, "step": 25070 }, { "epoch": 2.9486369314424277, "grad_norm": 1.163474678993225, "learning_rate": 7.766797906073042e-08, "loss": 1.1633, "step": 25080 }, { "epoch": 2.9498126239988243, "grad_norm": 1.108085036277771, "learning_rate": 7.413895540778204e-08, "loss": 1.1189, "step": 25090 }, { "epoch": 2.950988316555221, "grad_norm": 1.2205270528793335, "learning_rate": 7.069192186207607e-08, "loss": 1.1319, "step": 25100 }, { "epoch": 2.9521640091116175, "grad_norm": 1.2022536993026733, "learning_rate": 6.732688408445098e-08, "loss": 1.1568, "step": 25110 }, { "epoch": 2.953339701668014, "grad_norm": 1.0304259061813354, "learning_rate": 6.404384760110294e-08, "loss": 1.2173, "step": 25120 }, { "epoch": 2.9545153942244102, "grad_norm": 1.237220287322998, "learning_rate": 6.08428178035525e-08, "loss": 1.1675, "step": 25130 }, { "epoch": 2.955691086780807, "grad_norm": 1.4498248100280762, "learning_rate": 5.772379994865018e-08, "loss": 1.0752, "step": 25140 }, { "epoch": 2.9568667793372034, "grad_norm": 1.3433541059494019, "learning_rate": 5.468679915857089e-08, "loss": 1.1828, "step": 25150 }, { "epoch": 2.9580424718935996, "grad_norm": 1.3184983730316162, "learning_rate": 5.173182042078062e-08, "loss": 1.1421, "step": 25160 }, { "epoch": 2.959218164449996, "grad_norm": 1.4728854894638062, "learning_rate": 4.885886858805866e-08, "loss": 1.1365, "step": 25170 }, { "epoch": 2.960393857006393, "grad_norm": 1.430730938911438, "learning_rate": 4.606794837847539e-08, "loss": 1.1656, "step": 25180 }, { "epoch": 2.9615695495627894, "grad_norm": 1.1683475971221924, "learning_rate": 4.335906437538118e-08, "loss": 1.1474, "step": 25190 }, { "epoch": 2.962745242119186, "grad_norm": 1.2339345216751099, "learning_rate": 4.073222102740637e-08, "loss": 1.1824, "step": 25200 }, { "epoch": 2.963920934675582, "grad_norm": 1.2131662368774414, "learning_rate": 3.8187422648450214e-08, "loss": 1.1144, "step": 25210 }, { "epoch": 2.9650966272319788, "grad_norm": 1.4206774234771729, "learning_rate": 3.572467341768082e-08, "loss": 1.152, "step": 25220 }, { "epoch": 2.9662723197883754, "grad_norm": 1.0693501234054565, "learning_rate": 3.3343977379513006e-08, "loss": 1.0998, "step": 25230 }, { "epoch": 2.967448012344772, "grad_norm": 1.0940967798233032, "learning_rate": 3.104533844360824e-08, "loss": 1.176, "step": 25240 }, { "epoch": 2.9686237049011686, "grad_norm": 1.324896216392517, "learning_rate": 2.882876038489135e-08, "loss": 1.14, "step": 25250 }, { "epoch": 2.9697993974575647, "grad_norm": 1.2778147459030151, "learning_rate": 2.6694246843494973e-08, "loss": 1.1914, "step": 25260 }, { "epoch": 2.9709750900139613, "grad_norm": 1.1371055841445923, "learning_rate": 2.464180132480398e-08, "loss": 1.1536, "step": 25270 }, { "epoch": 2.972150782570358, "grad_norm": 1.0526599884033203, "learning_rate": 2.2671427199416618e-08, "loss": 1.1154, "step": 25280 }, { "epoch": 2.9733264751267545, "grad_norm": 1.5525157451629639, "learning_rate": 2.0783127703161154e-08, "loss": 1.1256, "step": 25290 }, { "epoch": 2.974502167683151, "grad_norm": 1.1302651166915894, "learning_rate": 1.8976905937068134e-08, "loss": 1.1441, "step": 25300 }, { "epoch": 2.9756778602395473, "grad_norm": 1.3815456628799438, "learning_rate": 1.7252764867381478e-08, "loss": 1.121, "step": 25310 }, { "epoch": 2.976853552795944, "grad_norm": 1.4163875579833984, "learning_rate": 1.561070732555292e-08, "loss": 1.1262, "step": 25320 }, { "epoch": 2.9780292453523405, "grad_norm": 1.1246237754821777, "learning_rate": 1.4050736008230925e-08, "loss": 1.2155, "step": 25330 }, { "epoch": 2.9792049379087366, "grad_norm": 1.296768069267273, "learning_rate": 1.2572853477260672e-08, "loss": 1.1124, "step": 25340 }, { "epoch": 2.9803806304651332, "grad_norm": 1.2463260889053345, "learning_rate": 1.1177062159667406e-08, "loss": 1.2017, "step": 25350 }, { "epoch": 2.98155632302153, "grad_norm": 1.4516847133636475, "learning_rate": 9.863364347673098e-09, "loss": 1.1281, "step": 25360 }, { "epoch": 2.9827320155779264, "grad_norm": 1.19976007938385, "learning_rate": 8.631762198690885e-09, "loss": 1.0959, "step": 25370 }, { "epoch": 2.983907708134323, "grad_norm": 1.1681883335113525, "learning_rate": 7.482257735291765e-09, "loss": 1.1142, "step": 25380 }, { "epoch": 2.985083400690719, "grad_norm": 1.233157992362976, "learning_rate": 6.414852845243458e-09, "loss": 1.1774, "step": 25390 }, { "epoch": 2.986259093247116, "grad_norm": 1.4146987199783325, "learning_rate": 5.429549281471546e-09, "loss": 1.2295, "step": 25400 }, { "epoch": 2.9874347858035124, "grad_norm": 1.0754578113555908, "learning_rate": 4.526348662081681e-09, "loss": 1.1624, "step": 25410 }, { "epoch": 2.988610478359909, "grad_norm": 0.9882297515869141, "learning_rate": 3.705252470348475e-09, "loss": 1.1941, "step": 25420 }, { "epoch": 2.9897861709163056, "grad_norm": 1.1574598550796509, "learning_rate": 2.966262054698854e-09, "loss": 1.1411, "step": 25430 }, { "epoch": 2.9909618634727018, "grad_norm": 1.2991571426391602, "learning_rate": 2.309378628728709e-09, "loss": 1.1287, "step": 25440 }, { "epoch": 2.9921375560290984, "grad_norm": 1.2248836755752563, "learning_rate": 1.7346032712028948e-09, "loss": 1.1326, "step": 25450 }, { "epoch": 2.993313248585495, "grad_norm": 1.4950770139694214, "learning_rate": 1.2419369260385782e-09, "loss": 1.0967, "step": 25460 }, { "epoch": 2.994488941141891, "grad_norm": 1.128186821937561, "learning_rate": 8.313804023107885e-10, "loss": 1.0779, "step": 25470 }, { "epoch": 2.995664633698288, "grad_norm": 1.1225956678390503, "learning_rate": 5.029343742468662e-10, "loss": 1.0904, "step": 25480 }, { "epoch": 2.9968403262546843, "grad_norm": 1.0800658464431763, "learning_rate": 2.5659938123201444e-10, "loss": 1.1466, "step": 25490 }, { "epoch": 2.998016018811081, "grad_norm": 1.2926615476608276, "learning_rate": 9.237582780929898e-11, "loss": 1.217, "step": 25500 }, { "epoch": 2.998016018811081, "eval_loss": 1.557464361190796, "eval_runtime": 1922.8488, "eval_samples_per_second": 31.456, "eval_steps_per_second": 3.932, "step": 25500 }, { "epoch": 2.9991917113674775, "grad_norm": 1.5157231092453003, "learning_rate": 1.0263983679648093e-11, "loss": 1.1172, "step": 25510 }, { "epoch": 2.9997795576456756, "step": 25515, "total_flos": 1.101945505799209e+18, "train_loss": 1.4957824454362687, "train_runtime": 245984.0173, "train_samples_per_second": 6.639, "train_steps_per_second": 0.104 } ], "logging_steps": 10, "max_steps": 25515, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 4000, "total_flos": 1.101945505799209e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }