{ "best_metric": 0.9777191259513872, "best_model_checkpoint": "mobilenet_v2_1.0_224-finetuned-plantdisease/checkpoint-9164", "epoch": 9.995635093845483, "eval_steps": 500, "global_step": 11450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008729812309035356, "grad_norm": 5.370448589324951, "learning_rate": 4.366812227074236e-07, "loss": 3.7534, "step": 10 }, { "epoch": 0.017459624618070713, "grad_norm": 5.382869720458984, "learning_rate": 8.733624454148472e-07, "loss": 3.7447, "step": 20 }, { "epoch": 0.026189436927106065, "grad_norm": 5.5538249015808105, "learning_rate": 1.3100436681222706e-06, "loss": 3.7409, "step": 30 }, { "epoch": 0.034919249236141425, "grad_norm": 5.5582685470581055, "learning_rate": 1.7467248908296944e-06, "loss": 3.7325, "step": 40 }, { "epoch": 0.04364906154517678, "grad_norm": 5.354773998260498, "learning_rate": 2.1834061135371177e-06, "loss": 3.7135, "step": 50 }, { "epoch": 0.05237887385421213, "grad_norm": 5.103704929351807, "learning_rate": 2.6200873362445413e-06, "loss": 3.714, "step": 60 }, { "epoch": 0.06110868616324749, "grad_norm": 5.359903812408447, "learning_rate": 3.056768558951965e-06, "loss": 3.6754, "step": 70 }, { "epoch": 0.06983849847228285, "grad_norm": 5.380093097686768, "learning_rate": 3.493449781659389e-06, "loss": 3.6767, "step": 80 }, { "epoch": 0.0785683107813182, "grad_norm": 5.455582141876221, "learning_rate": 3.930131004366813e-06, "loss": 3.6431, "step": 90 }, { "epoch": 0.08729812309035356, "grad_norm": 5.4371490478515625, "learning_rate": 4.3668122270742355e-06, "loss": 3.6285, "step": 100 }, { "epoch": 0.09602793539938892, "grad_norm": 5.358748435974121, "learning_rate": 4.80349344978166e-06, "loss": 3.594, "step": 110 }, { "epoch": 0.10475774770842426, "grad_norm": 5.466850757598877, "learning_rate": 5.240174672489083e-06, "loss": 3.5717, "step": 120 }, { "epoch": 0.11348756001745962, "grad_norm": 5.143874168395996, "learning_rate": 5.676855895196507e-06, "loss": 3.5456, "step": 130 }, { "epoch": 0.12221737232649497, "grad_norm": 5.1142683029174805, "learning_rate": 6.11353711790393e-06, "loss": 3.5045, "step": 140 }, { "epoch": 0.13094718463553034, "grad_norm": 5.174585819244385, "learning_rate": 6.550218340611354e-06, "loss": 3.452, "step": 150 }, { "epoch": 0.1396769969445657, "grad_norm": 5.233867168426514, "learning_rate": 6.986899563318778e-06, "loss": 3.4269, "step": 160 }, { "epoch": 0.14840680925360106, "grad_norm": 4.996443271636963, "learning_rate": 7.423580786026202e-06, "loss": 3.3866, "step": 170 }, { "epoch": 0.1571366215626364, "grad_norm": 5.243433475494385, "learning_rate": 7.860262008733626e-06, "loss": 3.3293, "step": 180 }, { "epoch": 0.16586643387167177, "grad_norm": 5.2202911376953125, "learning_rate": 8.296943231441049e-06, "loss": 3.2767, "step": 190 }, { "epoch": 0.17459624618070713, "grad_norm": 4.891767501831055, "learning_rate": 8.733624454148471e-06, "loss": 3.2698, "step": 200 }, { "epoch": 0.18332605848974248, "grad_norm": 4.960419654846191, "learning_rate": 9.170305676855896e-06, "loss": 3.2333, "step": 210 }, { "epoch": 0.19205587079877784, "grad_norm": 4.834344387054443, "learning_rate": 9.60698689956332e-06, "loss": 3.1378, "step": 220 }, { "epoch": 0.2007856831078132, "grad_norm": 5.046371936798096, "learning_rate": 1.0043668122270743e-05, "loss": 3.0687, "step": 230 }, { "epoch": 0.20951549541684852, "grad_norm": 5.039073467254639, "learning_rate": 1.0480349344978165e-05, "loss": 3.0194, "step": 240 }, { "epoch": 0.21824530772588388, "grad_norm": 5.1495771408081055, "learning_rate": 1.091703056768559e-05, "loss": 2.9677, "step": 250 }, { "epoch": 0.22697512003491924, "grad_norm": 4.857588768005371, "learning_rate": 1.1353711790393014e-05, "loss": 2.9015, "step": 260 }, { "epoch": 0.2357049323439546, "grad_norm": 4.682951927185059, "learning_rate": 1.1790393013100438e-05, "loss": 2.8179, "step": 270 }, { "epoch": 0.24443474465298995, "grad_norm": 4.9485602378845215, "learning_rate": 1.222707423580786e-05, "loss": 2.8279, "step": 280 }, { "epoch": 0.25316455696202533, "grad_norm": 4.801344871520996, "learning_rate": 1.2663755458515283e-05, "loss": 2.7402, "step": 290 }, { "epoch": 0.2618943692710607, "grad_norm": 4.586411952972412, "learning_rate": 1.3100436681222708e-05, "loss": 2.7108, "step": 300 }, { "epoch": 0.27062418158009605, "grad_norm": 4.808084487915039, "learning_rate": 1.3537117903930133e-05, "loss": 2.585, "step": 310 }, { "epoch": 0.2793539938891314, "grad_norm": 4.550442695617676, "learning_rate": 1.3973799126637555e-05, "loss": 2.6376, "step": 320 }, { "epoch": 0.28808380619816676, "grad_norm": 4.389446258544922, "learning_rate": 1.4410480349344979e-05, "loss": 2.511, "step": 330 }, { "epoch": 0.2968136185072021, "grad_norm": 4.278360366821289, "learning_rate": 1.4847161572052404e-05, "loss": 2.4379, "step": 340 }, { "epoch": 0.30554343081623747, "grad_norm": 4.738358497619629, "learning_rate": 1.5283842794759826e-05, "loss": 2.387, "step": 350 }, { "epoch": 0.3142732431252728, "grad_norm": 4.373560428619385, "learning_rate": 1.572052401746725e-05, "loss": 2.3534, "step": 360 }, { "epoch": 0.3230030554343082, "grad_norm": 4.679076671600342, "learning_rate": 1.6157205240174673e-05, "loss": 2.321, "step": 370 }, { "epoch": 0.33173286774334354, "grad_norm": 4.373522758483887, "learning_rate": 1.6593886462882098e-05, "loss": 2.2783, "step": 380 }, { "epoch": 0.3404626800523789, "grad_norm": 4.47551155090332, "learning_rate": 1.703056768558952e-05, "loss": 2.1886, "step": 390 }, { "epoch": 0.34919249236141425, "grad_norm": 4.3290181159973145, "learning_rate": 1.7467248908296942e-05, "loss": 2.1986, "step": 400 }, { "epoch": 0.3579223046704496, "grad_norm": 4.084609031677246, "learning_rate": 1.7903930131004367e-05, "loss": 2.0736, "step": 410 }, { "epoch": 0.36665211697948497, "grad_norm": 4.143791198730469, "learning_rate": 1.8340611353711792e-05, "loss": 2.045, "step": 420 }, { "epoch": 0.3753819292885203, "grad_norm": 4.159194469451904, "learning_rate": 1.8777292576419214e-05, "loss": 1.9542, "step": 430 }, { "epoch": 0.3841117415975557, "grad_norm": 4.298882007598877, "learning_rate": 1.921397379912664e-05, "loss": 1.9863, "step": 440 }, { "epoch": 0.39284155390659103, "grad_norm": 4.460660934448242, "learning_rate": 1.965065502183406e-05, "loss": 1.9066, "step": 450 }, { "epoch": 0.4015713662156264, "grad_norm": 4.154500484466553, "learning_rate": 2.0087336244541487e-05, "loss": 1.7901, "step": 460 }, { "epoch": 0.41030117852466175, "grad_norm": 4.019309043884277, "learning_rate": 2.052401746724891e-05, "loss": 1.8314, "step": 470 }, { "epoch": 0.41903099083369705, "grad_norm": 4.08663272857666, "learning_rate": 2.096069868995633e-05, "loss": 1.7671, "step": 480 }, { "epoch": 0.4277608031427324, "grad_norm": 4.213662624359131, "learning_rate": 2.1397379912663756e-05, "loss": 1.7467, "step": 490 }, { "epoch": 0.43649061545176776, "grad_norm": 3.9979076385498047, "learning_rate": 2.183406113537118e-05, "loss": 1.6146, "step": 500 }, { "epoch": 0.4452204277608031, "grad_norm": 3.973045825958252, "learning_rate": 2.2270742358078603e-05, "loss": 1.6054, "step": 510 }, { "epoch": 0.4539502400698385, "grad_norm": 4.335777282714844, "learning_rate": 2.2707423580786028e-05, "loss": 1.6053, "step": 520 }, { "epoch": 0.46268005237887383, "grad_norm": 4.511986255645752, "learning_rate": 2.3144104803493453e-05, "loss": 1.5458, "step": 530 }, { "epoch": 0.4714098646879092, "grad_norm": 3.8047099113464355, "learning_rate": 2.3580786026200875e-05, "loss": 1.5328, "step": 540 }, { "epoch": 0.48013967699694454, "grad_norm": 3.791404962539673, "learning_rate": 2.4017467248908297e-05, "loss": 1.4993, "step": 550 }, { "epoch": 0.4888694893059799, "grad_norm": 4.048672199249268, "learning_rate": 2.445414847161572e-05, "loss": 1.4274, "step": 560 }, { "epoch": 0.49759930161501525, "grad_norm": 4.073417663574219, "learning_rate": 2.4890829694323144e-05, "loss": 1.3728, "step": 570 }, { "epoch": 0.5063291139240507, "grad_norm": 4.089795112609863, "learning_rate": 2.5327510917030566e-05, "loss": 1.3612, "step": 580 }, { "epoch": 0.515058926233086, "grad_norm": 3.6688883304595947, "learning_rate": 2.576419213973799e-05, "loss": 1.2784, "step": 590 }, { "epoch": 0.5237887385421214, "grad_norm": 3.9120841026306152, "learning_rate": 2.6200873362445416e-05, "loss": 1.2878, "step": 600 }, { "epoch": 0.5325185508511567, "grad_norm": 3.715569257736206, "learning_rate": 2.663755458515284e-05, "loss": 1.1857, "step": 610 }, { "epoch": 0.5412483631601921, "grad_norm": 3.895285129547119, "learning_rate": 2.7074235807860267e-05, "loss": 1.1727, "step": 620 }, { "epoch": 0.5499781754692274, "grad_norm": 3.671576738357544, "learning_rate": 2.7510917030567685e-05, "loss": 1.1431, "step": 630 }, { "epoch": 0.5587079877782628, "grad_norm": 4.0363993644714355, "learning_rate": 2.794759825327511e-05, "loss": 1.1228, "step": 640 }, { "epoch": 0.5674378000872982, "grad_norm": 4.259347915649414, "learning_rate": 2.8384279475982532e-05, "loss": 1.1352, "step": 650 }, { "epoch": 0.5761676123963335, "grad_norm": 3.2646217346191406, "learning_rate": 2.8820960698689958e-05, "loss": 1.0213, "step": 660 }, { "epoch": 0.5848974247053689, "grad_norm": 3.9578239917755127, "learning_rate": 2.9257641921397383e-05, "loss": 1.0741, "step": 670 }, { "epoch": 0.5936272370144042, "grad_norm": 3.894042730331421, "learning_rate": 2.9694323144104808e-05, "loss": 1.0305, "step": 680 }, { "epoch": 0.6023570493234396, "grad_norm": 4.07789945602417, "learning_rate": 3.0131004366812227e-05, "loss": 1.0185, "step": 690 }, { "epoch": 0.6110868616324749, "grad_norm": 4.043015003204346, "learning_rate": 3.056768558951965e-05, "loss": 1.0215, "step": 700 }, { "epoch": 0.6198166739415103, "grad_norm": 3.8258657455444336, "learning_rate": 3.1004366812227074e-05, "loss": 0.9917, "step": 710 }, { "epoch": 0.6285464862505457, "grad_norm": 3.633079767227173, "learning_rate": 3.14410480349345e-05, "loss": 0.9212, "step": 720 }, { "epoch": 0.637276298559581, "grad_norm": 4.321951866149902, "learning_rate": 3.1877729257641924e-05, "loss": 0.9052, "step": 730 }, { "epoch": 0.6460061108686164, "grad_norm": 3.5607919692993164, "learning_rate": 3.2314410480349346e-05, "loss": 0.894, "step": 740 }, { "epoch": 0.6547359231776517, "grad_norm": 3.680312156677246, "learning_rate": 3.275109170305677e-05, "loss": 0.8355, "step": 750 }, { "epoch": 0.6634657354866871, "grad_norm": 4.108597755432129, "learning_rate": 3.3187772925764197e-05, "loss": 0.7819, "step": 760 }, { "epoch": 0.6721955477957224, "grad_norm": 4.298781871795654, "learning_rate": 3.362445414847162e-05, "loss": 0.823, "step": 770 }, { "epoch": 0.6809253601047578, "grad_norm": 3.7209835052490234, "learning_rate": 3.406113537117904e-05, "loss": 0.8231, "step": 780 }, { "epoch": 0.6896551724137931, "grad_norm": 3.6912119388580322, "learning_rate": 3.449781659388647e-05, "loss": 0.7911, "step": 790 }, { "epoch": 0.6983849847228285, "grad_norm": 3.601679563522339, "learning_rate": 3.4934497816593884e-05, "loss": 0.7401, "step": 800 }, { "epoch": 0.7071147970318639, "grad_norm": 3.856973171234131, "learning_rate": 3.537117903930131e-05, "loss": 0.7565, "step": 810 }, { "epoch": 0.7158446093408992, "grad_norm": 4.325893878936768, "learning_rate": 3.5807860262008734e-05, "loss": 0.7557, "step": 820 }, { "epoch": 0.7245744216499346, "grad_norm": 4.065762996673584, "learning_rate": 3.624454148471616e-05, "loss": 0.7348, "step": 830 }, { "epoch": 0.7333042339589699, "grad_norm": 3.6065175533294678, "learning_rate": 3.6681222707423585e-05, "loss": 0.7162, "step": 840 }, { "epoch": 0.7420340462680053, "grad_norm": 3.7237186431884766, "learning_rate": 3.711790393013101e-05, "loss": 0.6657, "step": 850 }, { "epoch": 0.7507638585770406, "grad_norm": 3.3277764320373535, "learning_rate": 3.755458515283843e-05, "loss": 0.6956, "step": 860 }, { "epoch": 0.759493670886076, "grad_norm": 4.210085868835449, "learning_rate": 3.799126637554585e-05, "loss": 0.6844, "step": 870 }, { "epoch": 0.7682234831951114, "grad_norm": 3.41595458984375, "learning_rate": 3.842794759825328e-05, "loss": 0.6489, "step": 880 }, { "epoch": 0.7769532955041467, "grad_norm": 4.264623641967773, "learning_rate": 3.88646288209607e-05, "loss": 0.6868, "step": 890 }, { "epoch": 0.7856831078131821, "grad_norm": 3.9044318199157715, "learning_rate": 3.930131004366812e-05, "loss": 0.6235, "step": 900 }, { "epoch": 0.7944129201222174, "grad_norm": 3.767618179321289, "learning_rate": 3.9737991266375545e-05, "loss": 0.6105, "step": 910 }, { "epoch": 0.8031427324312528, "grad_norm": 3.6234493255615234, "learning_rate": 4.017467248908297e-05, "loss": 0.6195, "step": 920 }, { "epoch": 0.8118725447402881, "grad_norm": 3.8668875694274902, "learning_rate": 4.0611353711790395e-05, "loss": 0.6341, "step": 930 }, { "epoch": 0.8206023570493235, "grad_norm": 3.911647081375122, "learning_rate": 4.104803493449782e-05, "loss": 0.6803, "step": 940 }, { "epoch": 0.8293321693583589, "grad_norm": 4.5981950759887695, "learning_rate": 4.1484716157205246e-05, "loss": 0.5992, "step": 950 }, { "epoch": 0.8380619816673941, "grad_norm": 3.940392017364502, "learning_rate": 4.192139737991266e-05, "loss": 0.5896, "step": 960 }, { "epoch": 0.8467917939764295, "grad_norm": 3.4633290767669678, "learning_rate": 4.235807860262009e-05, "loss": 0.5743, "step": 970 }, { "epoch": 0.8555216062854648, "grad_norm": 3.471381187438965, "learning_rate": 4.279475982532751e-05, "loss": 0.5745, "step": 980 }, { "epoch": 0.8642514185945002, "grad_norm": 3.305868625640869, "learning_rate": 4.323144104803494e-05, "loss": 0.6056, "step": 990 }, { "epoch": 0.8729812309035355, "grad_norm": 3.885556221008301, "learning_rate": 4.366812227074236e-05, "loss": 0.5638, "step": 1000 }, { "epoch": 0.8817110432125709, "grad_norm": 3.944361448287964, "learning_rate": 4.4104803493449784e-05, "loss": 0.524, "step": 1010 }, { "epoch": 0.8904408555216062, "grad_norm": 3.539358377456665, "learning_rate": 4.4541484716157205e-05, "loss": 0.5493, "step": 1020 }, { "epoch": 0.8991706678306416, "grad_norm": 3.6482980251312256, "learning_rate": 4.497816593886463e-05, "loss": 0.5493, "step": 1030 }, { "epoch": 0.907900480139677, "grad_norm": 3.9914190769195557, "learning_rate": 4.5414847161572056e-05, "loss": 0.4909, "step": 1040 }, { "epoch": 0.9166302924487123, "grad_norm": 4.449773788452148, "learning_rate": 4.585152838427948e-05, "loss": 0.5089, "step": 1050 }, { "epoch": 0.9253601047577477, "grad_norm": 4.087109565734863, "learning_rate": 4.6288209606986906e-05, "loss": 0.5272, "step": 1060 }, { "epoch": 0.934089917066783, "grad_norm": 4.070830345153809, "learning_rate": 4.672489082969432e-05, "loss": 0.5054, "step": 1070 }, { "epoch": 0.9428197293758184, "grad_norm": 3.926940679550171, "learning_rate": 4.716157205240175e-05, "loss": 0.5005, "step": 1080 }, { "epoch": 0.9515495416848537, "grad_norm": 3.5378997325897217, "learning_rate": 4.759825327510917e-05, "loss": 0.5203, "step": 1090 }, { "epoch": 0.9602793539938891, "grad_norm": 3.6640408039093018, "learning_rate": 4.8034934497816594e-05, "loss": 0.4391, "step": 1100 }, { "epoch": 0.9690091663029244, "grad_norm": 3.9486780166625977, "learning_rate": 4.847161572052402e-05, "loss": 0.4608, "step": 1110 }, { "epoch": 0.9777389786119598, "grad_norm": 3.081714153289795, "learning_rate": 4.890829694323144e-05, "loss": 0.4607, "step": 1120 }, { "epoch": 0.9864687909209952, "grad_norm": 4.005092620849609, "learning_rate": 4.9344978165938866e-05, "loss": 0.4745, "step": 1130 }, { "epoch": 0.9951986032300305, "grad_norm": 3.1498849391937256, "learning_rate": 4.978165938864629e-05, "loss": 0.3974, "step": 1140 }, { "epoch": 0.9995635093845482, "eval_accuracy": 0.8978639823226123, "eval_loss": 0.3598543405532837, "eval_runtime": 62.6632, "eval_samples_per_second": 259.993, "eval_steps_per_second": 8.139, "step": 1145 }, { "epoch": 1.003928415539066, "grad_norm": 3.5931997299194336, "learning_rate": 4.997573993207181e-05, "loss": 0.4407, "step": 1150 }, { "epoch": 1.0126582278481013, "grad_norm": 2.662714719772339, "learning_rate": 4.9927219796215426e-05, "loss": 0.4129, "step": 1160 }, { "epoch": 1.0213880401571367, "grad_norm": 3.9502806663513184, "learning_rate": 4.9878699660359054e-05, "loss": 0.4622, "step": 1170 }, { "epoch": 1.030117852466172, "grad_norm": 3.2790305614471436, "learning_rate": 4.9830179524502674e-05, "loss": 0.4446, "step": 1180 }, { "epoch": 1.0388476647752074, "grad_norm": 3.6010549068450928, "learning_rate": 4.978165938864629e-05, "loss": 0.4411, "step": 1190 }, { "epoch": 1.0475774770842428, "grad_norm": 3.6778202056884766, "learning_rate": 4.9733139252789915e-05, "loss": 0.4662, "step": 1200 }, { "epoch": 1.0563072893932781, "grad_norm": 3.7837071418762207, "learning_rate": 4.968461911693353e-05, "loss": 0.4305, "step": 1210 }, { "epoch": 1.0650371017023135, "grad_norm": 3.306307554244995, "learning_rate": 4.963609898107715e-05, "loss": 0.4111, "step": 1220 }, { "epoch": 1.0737669140113488, "grad_norm": 3.2483878135681152, "learning_rate": 4.9587578845220763e-05, "loss": 0.3983, "step": 1230 }, { "epoch": 1.0824967263203842, "grad_norm": 3.236823558807373, "learning_rate": 4.953905870936439e-05, "loss": 0.386, "step": 1240 }, { "epoch": 1.0912265386294195, "grad_norm": 3.8218398094177246, "learning_rate": 4.949053857350801e-05, "loss": 0.4355, "step": 1250 }, { "epoch": 1.099956350938455, "grad_norm": 2.8009932041168213, "learning_rate": 4.9442018437651625e-05, "loss": 0.4203, "step": 1260 }, { "epoch": 1.1086861632474903, "grad_norm": 3.2365312576293945, "learning_rate": 4.9393498301795246e-05, "loss": 0.3648, "step": 1270 }, { "epoch": 1.1174159755565256, "grad_norm": 3.3708131313323975, "learning_rate": 4.9344978165938866e-05, "loss": 0.4026, "step": 1280 }, { "epoch": 1.126145787865561, "grad_norm": 3.124727487564087, "learning_rate": 4.929645803008249e-05, "loss": 0.3873, "step": 1290 }, { "epoch": 1.1348756001745963, "grad_norm": 3.1519768238067627, "learning_rate": 4.92479378942261e-05, "loss": 0.3864, "step": 1300 }, { "epoch": 1.1436054124836317, "grad_norm": 3.662675619125366, "learning_rate": 4.919941775836973e-05, "loss": 0.4071, "step": 1310 }, { "epoch": 1.152335224792667, "grad_norm": 3.7715182304382324, "learning_rate": 4.915089762251335e-05, "loss": 0.37, "step": 1320 }, { "epoch": 1.1610650371017024, "grad_norm": 3.4320363998413086, "learning_rate": 4.910237748665696e-05, "loss": 0.4143, "step": 1330 }, { "epoch": 1.1697948494107377, "grad_norm": 2.953200101852417, "learning_rate": 4.905385735080058e-05, "loss": 0.3522, "step": 1340 }, { "epoch": 1.178524661719773, "grad_norm": 3.239183187484741, "learning_rate": 4.90053372149442e-05, "loss": 0.3704, "step": 1350 }, { "epoch": 1.1872544740288085, "grad_norm": 3.6990933418273926, "learning_rate": 4.8956817079087824e-05, "loss": 0.392, "step": 1360 }, { "epoch": 1.1959842863378438, "grad_norm": 3.497069835662842, "learning_rate": 4.890829694323144e-05, "loss": 0.3628, "step": 1370 }, { "epoch": 1.2047140986468792, "grad_norm": 4.022006034851074, "learning_rate": 4.8859776807375065e-05, "loss": 0.3842, "step": 1380 }, { "epoch": 1.2134439109559145, "grad_norm": 3.68331241607666, "learning_rate": 4.8811256671518685e-05, "loss": 0.3386, "step": 1390 }, { "epoch": 1.2221737232649499, "grad_norm": 2.9863877296447754, "learning_rate": 4.87627365356623e-05, "loss": 0.3398, "step": 1400 }, { "epoch": 1.2309035355739852, "grad_norm": 3.9146687984466553, "learning_rate": 4.871421639980592e-05, "loss": 0.3543, "step": 1410 }, { "epoch": 1.2396333478830206, "grad_norm": 3.9005401134490967, "learning_rate": 4.866569626394954e-05, "loss": 0.4041, "step": 1420 }, { "epoch": 1.248363160192056, "grad_norm": 2.9291443824768066, "learning_rate": 4.861717612809316e-05, "loss": 0.3363, "step": 1430 }, { "epoch": 1.2570929725010913, "grad_norm": 3.2885146141052246, "learning_rate": 4.8568655992236775e-05, "loss": 0.375, "step": 1440 }, { "epoch": 1.2658227848101267, "grad_norm": 3.188255548477173, "learning_rate": 4.85201358563804e-05, "loss": 0.3467, "step": 1450 }, { "epoch": 1.274552597119162, "grad_norm": 3.08927059173584, "learning_rate": 4.847161572052402e-05, "loss": 0.3863, "step": 1460 }, { "epoch": 1.2832824094281974, "grad_norm": 2.3255248069763184, "learning_rate": 4.8423095584667636e-05, "loss": 0.3498, "step": 1470 }, { "epoch": 1.2920122217372327, "grad_norm": 3.268420696258545, "learning_rate": 4.837457544881126e-05, "loss": 0.3092, "step": 1480 }, { "epoch": 1.300742034046268, "grad_norm": 3.7126426696777344, "learning_rate": 4.832605531295488e-05, "loss": 0.3402, "step": 1490 }, { "epoch": 1.3094718463553034, "grad_norm": 2.976686954498291, "learning_rate": 4.82775351770985e-05, "loss": 0.3038, "step": 1500 }, { "epoch": 1.3182016586643388, "grad_norm": 3.393733501434326, "learning_rate": 4.822901504124212e-05, "loss": 0.3353, "step": 1510 }, { "epoch": 1.3269314709733742, "grad_norm": 3.2993807792663574, "learning_rate": 4.818049490538574e-05, "loss": 0.3427, "step": 1520 }, { "epoch": 1.3356612832824095, "grad_norm": 3.6997506618499756, "learning_rate": 4.813197476952936e-05, "loss": 0.3535, "step": 1530 }, { "epoch": 1.3443910955914449, "grad_norm": 3.4463653564453125, "learning_rate": 4.808345463367297e-05, "loss": 0.3431, "step": 1540 }, { "epoch": 1.3531209079004802, "grad_norm": 3.227196455001831, "learning_rate": 4.8034934497816594e-05, "loss": 0.3114, "step": 1550 }, { "epoch": 1.3618507202095156, "grad_norm": 3.5765860080718994, "learning_rate": 4.7986414361960214e-05, "loss": 0.3203, "step": 1560 }, { "epoch": 1.370580532518551, "grad_norm": 4.020304203033447, "learning_rate": 4.7937894226103835e-05, "loss": 0.3496, "step": 1570 }, { "epoch": 1.3793103448275863, "grad_norm": 3.5590262413024902, "learning_rate": 4.7889374090247456e-05, "loss": 0.3294, "step": 1580 }, { "epoch": 1.3880401571366217, "grad_norm": 3.3192408084869385, "learning_rate": 4.7840853954391076e-05, "loss": 0.3334, "step": 1590 }, { "epoch": 1.396769969445657, "grad_norm": 3.537508726119995, "learning_rate": 4.77923338185347e-05, "loss": 0.3349, "step": 1600 }, { "epoch": 1.4054997817546924, "grad_norm": 4.00840425491333, "learning_rate": 4.774381368267831e-05, "loss": 0.334, "step": 1610 }, { "epoch": 1.4142295940637277, "grad_norm": 3.3885114192962646, "learning_rate": 4.769529354682193e-05, "loss": 0.2888, "step": 1620 }, { "epoch": 1.422959406372763, "grad_norm": 3.376528739929199, "learning_rate": 4.764677341096555e-05, "loss": 0.3122, "step": 1630 }, { "epoch": 1.4316892186817984, "grad_norm": 3.8087868690490723, "learning_rate": 4.759825327510917e-05, "loss": 0.3433, "step": 1640 }, { "epoch": 1.4404190309908338, "grad_norm": 2.8861641883850098, "learning_rate": 4.754973313925279e-05, "loss": 0.2913, "step": 1650 }, { "epoch": 1.4491488432998691, "grad_norm": 3.975708246231079, "learning_rate": 4.750121300339641e-05, "loss": 0.3294, "step": 1660 }, { "epoch": 1.4578786556089045, "grad_norm": 4.433221817016602, "learning_rate": 4.7452692867540034e-05, "loss": 0.2937, "step": 1670 }, { "epoch": 1.4666084679179399, "grad_norm": 3.9927902221679688, "learning_rate": 4.740417273168365e-05, "loss": 0.2853, "step": 1680 }, { "epoch": 1.4753382802269752, "grad_norm": 3.353848695755005, "learning_rate": 4.735565259582727e-05, "loss": 0.3477, "step": 1690 }, { "epoch": 1.4840680925360106, "grad_norm": 3.72670841217041, "learning_rate": 4.730713245997089e-05, "loss": 0.3342, "step": 1700 }, { "epoch": 1.492797904845046, "grad_norm": 3.5278804302215576, "learning_rate": 4.725861232411451e-05, "loss": 0.2832, "step": 1710 }, { "epoch": 1.5015277171540813, "grad_norm": 3.5316507816314697, "learning_rate": 4.721009218825813e-05, "loss": 0.2962, "step": 1720 }, { "epoch": 1.5102575294631166, "grad_norm": 3.522723913192749, "learning_rate": 4.716157205240175e-05, "loss": 0.3165, "step": 1730 }, { "epoch": 1.518987341772152, "grad_norm": 3.0365447998046875, "learning_rate": 4.711305191654537e-05, "loss": 0.2921, "step": 1740 }, { "epoch": 1.5277171540811874, "grad_norm": 3.355435848236084, "learning_rate": 4.7064531780688984e-05, "loss": 0.2887, "step": 1750 }, { "epoch": 1.5364469663902227, "grad_norm": 4.480086326599121, "learning_rate": 4.7016011644832605e-05, "loss": 0.28, "step": 1760 }, { "epoch": 1.545176778699258, "grad_norm": 3.315585136413574, "learning_rate": 4.6967491508976226e-05, "loss": 0.3122, "step": 1770 }, { "epoch": 1.5539065910082934, "grad_norm": 3.043123483657837, "learning_rate": 4.6918971373119846e-05, "loss": 0.2679, "step": 1780 }, { "epoch": 1.5626364033173288, "grad_norm": 2.6386468410491943, "learning_rate": 4.687045123726347e-05, "loss": 0.2796, "step": 1790 }, { "epoch": 1.5713662156263641, "grad_norm": 3.1339259147644043, "learning_rate": 4.682193110140709e-05, "loss": 0.2942, "step": 1800 }, { "epoch": 1.5800960279353995, "grad_norm": 3.921851873397827, "learning_rate": 4.677341096555071e-05, "loss": 0.3087, "step": 1810 }, { "epoch": 1.5888258402444349, "grad_norm": 3.262922525405884, "learning_rate": 4.672489082969432e-05, "loss": 0.2879, "step": 1820 }, { "epoch": 1.5975556525534702, "grad_norm": 3.206650733947754, "learning_rate": 4.667637069383794e-05, "loss": 0.278, "step": 1830 }, { "epoch": 1.6062854648625056, "grad_norm": 3.8127830028533936, "learning_rate": 4.662785055798157e-05, "loss": 0.2656, "step": 1840 }, { "epoch": 1.615015277171541, "grad_norm": 3.552639961242676, "learning_rate": 4.657933042212518e-05, "loss": 0.2858, "step": 1850 }, { "epoch": 1.6237450894805763, "grad_norm": 3.771047353744507, "learning_rate": 4.6530810286268804e-05, "loss": 0.3176, "step": 1860 }, { "epoch": 1.6324749017896116, "grad_norm": 3.8466198444366455, "learning_rate": 4.6482290150412424e-05, "loss": 0.2594, "step": 1870 }, { "epoch": 1.641204714098647, "grad_norm": 3.0491561889648438, "learning_rate": 4.6433770014556045e-05, "loss": 0.3074, "step": 1880 }, { "epoch": 1.6499345264076823, "grad_norm": 3.4488463401794434, "learning_rate": 4.638524987869966e-05, "loss": 0.2625, "step": 1890 }, { "epoch": 1.6586643387167177, "grad_norm": 3.1591031551361084, "learning_rate": 4.633672974284328e-05, "loss": 0.2824, "step": 1900 }, { "epoch": 1.667394151025753, "grad_norm": 3.56533145904541, "learning_rate": 4.6288209606986906e-05, "loss": 0.2855, "step": 1910 }, { "epoch": 1.6761239633347884, "grad_norm": 3.567021131515503, "learning_rate": 4.623968947113052e-05, "loss": 0.2823, "step": 1920 }, { "epoch": 1.6848537756438238, "grad_norm": 4.153507709503174, "learning_rate": 4.619116933527414e-05, "loss": 0.2907, "step": 1930 }, { "epoch": 1.6935835879528591, "grad_norm": 4.215022087097168, "learning_rate": 4.614264919941776e-05, "loss": 0.2699, "step": 1940 }, { "epoch": 1.7023134002618945, "grad_norm": 3.388324499130249, "learning_rate": 4.609412906356138e-05, "loss": 0.2915, "step": 1950 }, { "epoch": 1.7110432125709298, "grad_norm": 3.654491901397705, "learning_rate": 4.6045608927704996e-05, "loss": 0.2804, "step": 1960 }, { "epoch": 1.7197730248799652, "grad_norm": 3.2709882259368896, "learning_rate": 4.5997088791848616e-05, "loss": 0.271, "step": 1970 }, { "epoch": 1.7285028371890006, "grad_norm": 3.5081441402435303, "learning_rate": 4.5948568655992244e-05, "loss": 0.2807, "step": 1980 }, { "epoch": 1.737232649498036, "grad_norm": 2.9276814460754395, "learning_rate": 4.590004852013586e-05, "loss": 0.2756, "step": 1990 }, { "epoch": 1.7459624618070713, "grad_norm": 3.9031786918640137, "learning_rate": 4.585152838427948e-05, "loss": 0.2964, "step": 2000 }, { "epoch": 1.7546922741161066, "grad_norm": 3.184328079223633, "learning_rate": 4.58030082484231e-05, "loss": 0.2366, "step": 2010 }, { "epoch": 1.763422086425142, "grad_norm": 3.6933434009552, "learning_rate": 4.575448811256672e-05, "loss": 0.2574, "step": 2020 }, { "epoch": 1.7721518987341773, "grad_norm": 3.177960157394409, "learning_rate": 4.570596797671033e-05, "loss": 0.2602, "step": 2030 }, { "epoch": 1.7808817110432127, "grad_norm": 4.310092926025391, "learning_rate": 4.565744784085395e-05, "loss": 0.2821, "step": 2040 }, { "epoch": 1.789611523352248, "grad_norm": 2.7888848781585693, "learning_rate": 4.560892770499758e-05, "loss": 0.2409, "step": 2050 }, { "epoch": 1.7983413356612834, "grad_norm": 3.6411709785461426, "learning_rate": 4.5560407569141194e-05, "loss": 0.2675, "step": 2060 }, { "epoch": 1.8070711479703188, "grad_norm": 2.790893077850342, "learning_rate": 4.5511887433284815e-05, "loss": 0.3142, "step": 2070 }, { "epoch": 1.8158009602793541, "grad_norm": 4.2732768058776855, "learning_rate": 4.5463367297428435e-05, "loss": 0.2925, "step": 2080 }, { "epoch": 1.8245307725883895, "grad_norm": 3.220660448074341, "learning_rate": 4.5414847161572056e-05, "loss": 0.2705, "step": 2090 }, { "epoch": 1.8332605848974248, "grad_norm": 4.008994102478027, "learning_rate": 4.5366327025715677e-05, "loss": 0.256, "step": 2100 }, { "epoch": 1.8419903972064602, "grad_norm": 3.519592761993408, "learning_rate": 4.531780688985929e-05, "loss": 0.2503, "step": 2110 }, { "epoch": 1.8507202095154955, "grad_norm": 3.621957540512085, "learning_rate": 4.526928675400292e-05, "loss": 0.2641, "step": 2120 }, { "epoch": 1.859450021824531, "grad_norm": 3.7663767337799072, "learning_rate": 4.522076661814653e-05, "loss": 0.2256, "step": 2130 }, { "epoch": 1.8681798341335663, "grad_norm": 2.5823781490325928, "learning_rate": 4.517224648229015e-05, "loss": 0.267, "step": 2140 }, { "epoch": 1.8769096464426016, "grad_norm": 2.787856340408325, "learning_rate": 4.512372634643377e-05, "loss": 0.2841, "step": 2150 }, { "epoch": 1.885639458751637, "grad_norm": 3.87716007232666, "learning_rate": 4.507520621057739e-05, "loss": 0.2001, "step": 2160 }, { "epoch": 1.8943692710606723, "grad_norm": 3.641904830932617, "learning_rate": 4.5026686074721014e-05, "loss": 0.2586, "step": 2170 }, { "epoch": 1.9030990833697077, "grad_norm": 3.6821677684783936, "learning_rate": 4.497816593886463e-05, "loss": 0.2698, "step": 2180 }, { "epoch": 1.911828895678743, "grad_norm": 2.7858848571777344, "learning_rate": 4.4929645803008255e-05, "loss": 0.2734, "step": 2190 }, { "epoch": 1.9205587079877784, "grad_norm": 3.4211864471435547, "learning_rate": 4.488112566715187e-05, "loss": 0.2387, "step": 2200 }, { "epoch": 1.9292885202968137, "grad_norm": 2.879937171936035, "learning_rate": 4.483260553129549e-05, "loss": 0.2375, "step": 2210 }, { "epoch": 1.938018332605849, "grad_norm": 3.930103063583374, "learning_rate": 4.478408539543911e-05, "loss": 0.2767, "step": 2220 }, { "epoch": 1.9467481449148845, "grad_norm": 3.938791275024414, "learning_rate": 4.473556525958273e-05, "loss": 0.2536, "step": 2230 }, { "epoch": 1.9554779572239198, "grad_norm": 2.574296236038208, "learning_rate": 4.468704512372635e-05, "loss": 0.2188, "step": 2240 }, { "epoch": 1.9642077695329552, "grad_norm": 4.026519298553467, "learning_rate": 4.4638524987869964e-05, "loss": 0.2557, "step": 2250 }, { "epoch": 1.9729375818419905, "grad_norm": 3.5013089179992676, "learning_rate": 4.459000485201359e-05, "loss": 0.2438, "step": 2260 }, { "epoch": 1.9816673941510259, "grad_norm": 2.124563694000244, "learning_rate": 4.4541484716157205e-05, "loss": 0.2489, "step": 2270 }, { "epoch": 1.9903972064600612, "grad_norm": 2.5535762310028076, "learning_rate": 4.4492964580300826e-05, "loss": 0.2468, "step": 2280 }, { "epoch": 1.9991270187690966, "grad_norm": 3.7411351203918457, "learning_rate": 4.4444444444444447e-05, "loss": 0.2155, "step": 2290 }, { "epoch": 2.0, "eval_accuracy": 0.9602872575497177, "eval_loss": 0.15250827372074127, "eval_runtime": 62.7478, "eval_samples_per_second": 259.643, "eval_steps_per_second": 8.128, "step": 2291 }, { "epoch": 2.007856831078132, "grad_norm": 3.3374812602996826, "learning_rate": 4.439592430858807e-05, "loss": 0.2331, "step": 2300 }, { "epoch": 2.0165866433871673, "grad_norm": 2.8195152282714844, "learning_rate": 4.434740417273169e-05, "loss": 0.2219, "step": 2310 }, { "epoch": 2.0253164556962027, "grad_norm": 3.702488899230957, "learning_rate": 4.42988840368753e-05, "loss": 0.2212, "step": 2320 }, { "epoch": 2.034046268005238, "grad_norm": 2.593886613845825, "learning_rate": 4.425036390101893e-05, "loss": 0.2278, "step": 2330 }, { "epoch": 2.0427760803142734, "grad_norm": 3.5325887203216553, "learning_rate": 4.420184376516254e-05, "loss": 0.2197, "step": 2340 }, { "epoch": 2.0515058926233087, "grad_norm": 4.355841159820557, "learning_rate": 4.415332362930616e-05, "loss": 0.2384, "step": 2350 }, { "epoch": 2.060235704932344, "grad_norm": 3.8640925884246826, "learning_rate": 4.4104803493449784e-05, "loss": 0.2102, "step": 2360 }, { "epoch": 2.0689655172413794, "grad_norm": 3.6872246265411377, "learning_rate": 4.4056283357593404e-05, "loss": 0.2536, "step": 2370 }, { "epoch": 2.077695329550415, "grad_norm": 3.1011269092559814, "learning_rate": 4.4007763221737025e-05, "loss": 0.2231, "step": 2380 }, { "epoch": 2.08642514185945, "grad_norm": 3.202810525894165, "learning_rate": 4.395924308588064e-05, "loss": 0.2609, "step": 2390 }, { "epoch": 2.0951549541684855, "grad_norm": 3.2603683471679688, "learning_rate": 4.3910722950024266e-05, "loss": 0.2367, "step": 2400 }, { "epoch": 2.103884766477521, "grad_norm": 2.8735013008117676, "learning_rate": 4.386220281416788e-05, "loss": 0.2528, "step": 2410 }, { "epoch": 2.1126145787865562, "grad_norm": 2.745392322540283, "learning_rate": 4.38136826783115e-05, "loss": 0.2259, "step": 2420 }, { "epoch": 2.1213443910955916, "grad_norm": 2.783156633377075, "learning_rate": 4.376516254245512e-05, "loss": 0.2285, "step": 2430 }, { "epoch": 2.130074203404627, "grad_norm": 3.6078295707702637, "learning_rate": 4.371664240659874e-05, "loss": 0.2222, "step": 2440 }, { "epoch": 2.1388040157136623, "grad_norm": 3.379075527191162, "learning_rate": 4.366812227074236e-05, "loss": 0.2574, "step": 2450 }, { "epoch": 2.1475338280226977, "grad_norm": 2.965963840484619, "learning_rate": 4.3619602134885976e-05, "loss": 0.2305, "step": 2460 }, { "epoch": 2.156263640331733, "grad_norm": 3.110180616378784, "learning_rate": 4.35710819990296e-05, "loss": 0.2127, "step": 2470 }, { "epoch": 2.1649934526407684, "grad_norm": 2.2609825134277344, "learning_rate": 4.352256186317322e-05, "loss": 0.1914, "step": 2480 }, { "epoch": 2.1737232649498037, "grad_norm": 3.6740341186523438, "learning_rate": 4.347404172731684e-05, "loss": 0.2116, "step": 2490 }, { "epoch": 2.182453077258839, "grad_norm": 4.600437641143799, "learning_rate": 4.342552159146046e-05, "loss": 0.2694, "step": 2500 }, { "epoch": 2.1911828895678744, "grad_norm": 2.320915460586548, "learning_rate": 4.337700145560408e-05, "loss": 0.2063, "step": 2510 }, { "epoch": 2.19991270187691, "grad_norm": 3.467024326324463, "learning_rate": 4.33284813197477e-05, "loss": 0.2474, "step": 2520 }, { "epoch": 2.208642514185945, "grad_norm": 2.8249664306640625, "learning_rate": 4.327996118389131e-05, "loss": 0.2376, "step": 2530 }, { "epoch": 2.2173723264949805, "grad_norm": 3.371345281600952, "learning_rate": 4.323144104803494e-05, "loss": 0.2286, "step": 2540 }, { "epoch": 2.226102138804016, "grad_norm": 2.801051139831543, "learning_rate": 4.3182920912178554e-05, "loss": 0.1987, "step": 2550 }, { "epoch": 2.234831951113051, "grad_norm": 2.635571002960205, "learning_rate": 4.3134400776322174e-05, "loss": 0.2364, "step": 2560 }, { "epoch": 2.2435617634220866, "grad_norm": 3.7617597579956055, "learning_rate": 4.3085880640465795e-05, "loss": 0.2607, "step": 2570 }, { "epoch": 2.252291575731122, "grad_norm": 2.7652156352996826, "learning_rate": 4.3037360504609415e-05, "loss": 0.2227, "step": 2580 }, { "epoch": 2.2610213880401573, "grad_norm": 2.7807669639587402, "learning_rate": 4.2988840368753036e-05, "loss": 0.2403, "step": 2590 }, { "epoch": 2.2697512003491926, "grad_norm": 3.653930425643921, "learning_rate": 4.294032023289665e-05, "loss": 0.226, "step": 2600 }, { "epoch": 2.278481012658228, "grad_norm": 2.039379358291626, "learning_rate": 4.289180009704028e-05, "loss": 0.2299, "step": 2610 }, { "epoch": 2.2872108249672634, "grad_norm": 2.568441390991211, "learning_rate": 4.284327996118389e-05, "loss": 0.2354, "step": 2620 }, { "epoch": 2.2959406372762987, "grad_norm": 3.425060987472534, "learning_rate": 4.279475982532751e-05, "loss": 0.2334, "step": 2630 }, { "epoch": 2.304670449585334, "grad_norm": 3.790520429611206, "learning_rate": 4.274623968947113e-05, "loss": 0.2126, "step": 2640 }, { "epoch": 2.3134002618943694, "grad_norm": 3.2851438522338867, "learning_rate": 4.269771955361475e-05, "loss": 0.202, "step": 2650 }, { "epoch": 2.322130074203405, "grad_norm": 2.7088074684143066, "learning_rate": 4.264919941775837e-05, "loss": 0.2, "step": 2660 }, { "epoch": 2.33085988651244, "grad_norm": 2.6394031047821045, "learning_rate": 4.260067928190199e-05, "loss": 0.2323, "step": 2670 }, { "epoch": 2.3395896988214755, "grad_norm": 3.6911752223968506, "learning_rate": 4.2552159146045614e-05, "loss": 0.2426, "step": 2680 }, { "epoch": 2.348319511130511, "grad_norm": 2.896843671798706, "learning_rate": 4.250363901018923e-05, "loss": 0.205, "step": 2690 }, { "epoch": 2.357049323439546, "grad_norm": 3.258150339126587, "learning_rate": 4.245511887433285e-05, "loss": 0.2154, "step": 2700 }, { "epoch": 2.3657791357485816, "grad_norm": 2.3189845085144043, "learning_rate": 4.240659873847647e-05, "loss": 0.1969, "step": 2710 }, { "epoch": 2.374508948057617, "grad_norm": 3.0700485706329346, "learning_rate": 4.235807860262009e-05, "loss": 0.2196, "step": 2720 }, { "epoch": 2.3832387603666523, "grad_norm": 2.3715126514434814, "learning_rate": 4.230955846676371e-05, "loss": 0.2214, "step": 2730 }, { "epoch": 2.3919685726756876, "grad_norm": 3.8778791427612305, "learning_rate": 4.2261038330907324e-05, "loss": 0.2229, "step": 2740 }, { "epoch": 2.400698384984723, "grad_norm": 2.7153546810150146, "learning_rate": 4.221251819505095e-05, "loss": 0.1865, "step": 2750 }, { "epoch": 2.4094281972937583, "grad_norm": 2.5187482833862305, "learning_rate": 4.216399805919457e-05, "loss": 0.2027, "step": 2760 }, { "epoch": 2.4181580096027937, "grad_norm": 3.387876510620117, "learning_rate": 4.2115477923338185e-05, "loss": 0.2097, "step": 2770 }, { "epoch": 2.426887821911829, "grad_norm": 2.9012184143066406, "learning_rate": 4.2066957787481806e-05, "loss": 0.1936, "step": 2780 }, { "epoch": 2.4356176342208644, "grad_norm": 3.1651363372802734, "learning_rate": 4.2018437651625426e-05, "loss": 0.2453, "step": 2790 }, { "epoch": 2.4443474465298998, "grad_norm": 3.4698972702026367, "learning_rate": 4.196991751576905e-05, "loss": 0.2172, "step": 2800 }, { "epoch": 2.453077258838935, "grad_norm": 3.2747743129730225, "learning_rate": 4.192139737991266e-05, "loss": 0.244, "step": 2810 }, { "epoch": 2.4618070711479705, "grad_norm": 4.365453720092773, "learning_rate": 4.187287724405629e-05, "loss": 0.2208, "step": 2820 }, { "epoch": 2.470536883457006, "grad_norm": 3.2108471393585205, "learning_rate": 4.182435710819991e-05, "loss": 0.2253, "step": 2830 }, { "epoch": 2.479266695766041, "grad_norm": 3.678192377090454, "learning_rate": 4.177583697234352e-05, "loss": 0.231, "step": 2840 }, { "epoch": 2.4879965080750766, "grad_norm": 3.295539617538452, "learning_rate": 4.172731683648714e-05, "loss": 0.24, "step": 2850 }, { "epoch": 2.496726320384112, "grad_norm": 3.0643677711486816, "learning_rate": 4.1678796700630764e-05, "loss": 0.2142, "step": 2860 }, { "epoch": 2.505456132693147, "grad_norm": 3.3195557594299316, "learning_rate": 4.1630276564774384e-05, "loss": 0.2218, "step": 2870 }, { "epoch": 2.5141859450021826, "grad_norm": 2.9350225925445557, "learning_rate": 4.1581756428918e-05, "loss": 0.1869, "step": 2880 }, { "epoch": 2.5229157573112175, "grad_norm": 3.483301877975464, "learning_rate": 4.1533236293061625e-05, "loss": 0.1986, "step": 2890 }, { "epoch": 2.5316455696202533, "grad_norm": 3.9944510459899902, "learning_rate": 4.1484716157205246e-05, "loss": 0.2112, "step": 2900 }, { "epoch": 2.5403753819292882, "grad_norm": 3.825562000274658, "learning_rate": 4.143619602134886e-05, "loss": 0.2102, "step": 2910 }, { "epoch": 2.549105194238324, "grad_norm": 3.575993776321411, "learning_rate": 4.138767588549248e-05, "loss": 0.2303, "step": 2920 }, { "epoch": 2.557835006547359, "grad_norm": 2.6293749809265137, "learning_rate": 4.13391557496361e-05, "loss": 0.1759, "step": 2930 }, { "epoch": 2.5665648188563948, "grad_norm": 3.9990651607513428, "learning_rate": 4.129063561377972e-05, "loss": 0.247, "step": 2940 }, { "epoch": 2.5752946311654297, "grad_norm": 3.152682065963745, "learning_rate": 4.1242115477923335e-05, "loss": 0.22, "step": 2950 }, { "epoch": 2.5840244434744655, "grad_norm": 2.9134416580200195, "learning_rate": 4.119359534206696e-05, "loss": 0.1966, "step": 2960 }, { "epoch": 2.5927542557835004, "grad_norm": 4.1252288818359375, "learning_rate": 4.114507520621058e-05, "loss": 0.1766, "step": 2970 }, { "epoch": 2.601484068092536, "grad_norm": 3.3701207637786865, "learning_rate": 4.1096555070354197e-05, "loss": 0.2062, "step": 2980 }, { "epoch": 2.610213880401571, "grad_norm": 3.7040228843688965, "learning_rate": 4.104803493449782e-05, "loss": 0.1997, "step": 2990 }, { "epoch": 2.618943692710607, "grad_norm": 3.8628756999969482, "learning_rate": 4.099951479864144e-05, "loss": 0.2126, "step": 3000 }, { "epoch": 2.627673505019642, "grad_norm": 3.3776793479919434, "learning_rate": 4.095099466278506e-05, "loss": 0.2277, "step": 3010 }, { "epoch": 2.6364033173286776, "grad_norm": 1.8289759159088135, "learning_rate": 4.090247452692867e-05, "loss": 0.2033, "step": 3020 }, { "epoch": 2.6451331296377125, "grad_norm": 3.379987955093384, "learning_rate": 4.08539543910723e-05, "loss": 0.216, "step": 3030 }, { "epoch": 2.6538629419467483, "grad_norm": 3.1933844089508057, "learning_rate": 4.080543425521592e-05, "loss": 0.175, "step": 3040 }, { "epoch": 2.6625927542557832, "grad_norm": 4.522855758666992, "learning_rate": 4.0756914119359534e-05, "loss": 0.2276, "step": 3050 }, { "epoch": 2.671322566564819, "grad_norm": 3.2988367080688477, "learning_rate": 4.0708393983503154e-05, "loss": 0.2112, "step": 3060 }, { "epoch": 2.680052378873854, "grad_norm": 3.88380765914917, "learning_rate": 4.0659873847646775e-05, "loss": 0.1831, "step": 3070 }, { "epoch": 2.6887821911828897, "grad_norm": 2.2652664184570312, "learning_rate": 4.0611353711790395e-05, "loss": 0.2062, "step": 3080 }, { "epoch": 2.6975120034919247, "grad_norm": 2.7714035511016846, "learning_rate": 4.0562833575934016e-05, "loss": 0.2016, "step": 3090 }, { "epoch": 2.7062418158009605, "grad_norm": 3.18550181388855, "learning_rate": 4.0514313440077636e-05, "loss": 0.2043, "step": 3100 }, { "epoch": 2.7149716281099954, "grad_norm": 4.249231815338135, "learning_rate": 4.046579330422126e-05, "loss": 0.1868, "step": 3110 }, { "epoch": 2.723701440419031, "grad_norm": 3.2999160289764404, "learning_rate": 4.041727316836487e-05, "loss": 0.1849, "step": 3120 }, { "epoch": 2.732431252728066, "grad_norm": 3.1966731548309326, "learning_rate": 4.036875303250849e-05, "loss": 0.2143, "step": 3130 }, { "epoch": 2.741161065037102, "grad_norm": 3.071214199066162, "learning_rate": 4.032023289665211e-05, "loss": 0.1541, "step": 3140 }, { "epoch": 2.749890877346137, "grad_norm": 2.8251476287841797, "learning_rate": 4.027171276079573e-05, "loss": 0.1818, "step": 3150 }, { "epoch": 2.7586206896551726, "grad_norm": 3.3325555324554443, "learning_rate": 4.022319262493935e-05, "loss": 0.2099, "step": 3160 }, { "epoch": 2.7673505019642075, "grad_norm": 3.3313956260681152, "learning_rate": 4.017467248908297e-05, "loss": 0.2064, "step": 3170 }, { "epoch": 2.7760803142732433, "grad_norm": 3.0525896549224854, "learning_rate": 4.0126152353226594e-05, "loss": 0.1882, "step": 3180 }, { "epoch": 2.7848101265822782, "grad_norm": 2.8672525882720947, "learning_rate": 4.007763221737021e-05, "loss": 0.1722, "step": 3190 }, { "epoch": 2.793539938891314, "grad_norm": 3.050518751144409, "learning_rate": 4.002911208151383e-05, "loss": 0.1933, "step": 3200 }, { "epoch": 2.802269751200349, "grad_norm": 3.527492046356201, "learning_rate": 3.998059194565745e-05, "loss": 0.1614, "step": 3210 }, { "epoch": 2.8109995635093847, "grad_norm": 4.246466159820557, "learning_rate": 3.993207180980107e-05, "loss": 0.2099, "step": 3220 }, { "epoch": 2.8197293758184196, "grad_norm": 3.365166425704956, "learning_rate": 3.988355167394469e-05, "loss": 0.2046, "step": 3230 }, { "epoch": 2.8284591881274554, "grad_norm": 2.6562325954437256, "learning_rate": 3.983503153808831e-05, "loss": 0.1593, "step": 3240 }, { "epoch": 2.8371890004364904, "grad_norm": 3.081038236618042, "learning_rate": 3.978651140223193e-05, "loss": 0.1784, "step": 3250 }, { "epoch": 2.845918812745526, "grad_norm": 3.1560895442962646, "learning_rate": 3.9737991266375545e-05, "loss": 0.2313, "step": 3260 }, { "epoch": 2.854648625054561, "grad_norm": 3.1035749912261963, "learning_rate": 3.9689471130519165e-05, "loss": 0.1988, "step": 3270 }, { "epoch": 2.863378437363597, "grad_norm": 3.2834436893463135, "learning_rate": 3.9640950994662786e-05, "loss": 0.2095, "step": 3280 }, { "epoch": 2.872108249672632, "grad_norm": 2.712871789932251, "learning_rate": 3.9592430858806406e-05, "loss": 0.1723, "step": 3290 }, { "epoch": 2.8808380619816676, "grad_norm": 2.6281795501708984, "learning_rate": 3.954391072295003e-05, "loss": 0.1921, "step": 3300 }, { "epoch": 2.8895678742907025, "grad_norm": 2.3966925144195557, "learning_rate": 3.949539058709365e-05, "loss": 0.1927, "step": 3310 }, { "epoch": 2.8982976865997383, "grad_norm": 3.192667245864868, "learning_rate": 3.944687045123727e-05, "loss": 0.2165, "step": 3320 }, { "epoch": 2.907027498908773, "grad_norm": 3.994009256362915, "learning_rate": 3.939835031538088e-05, "loss": 0.2054, "step": 3330 }, { "epoch": 2.915757311217809, "grad_norm": 3.6502673625946045, "learning_rate": 3.93498301795245e-05, "loss": 0.2229, "step": 3340 }, { "epoch": 2.924487123526844, "grad_norm": 3.1017322540283203, "learning_rate": 3.930131004366812e-05, "loss": 0.1659, "step": 3350 }, { "epoch": 2.9332169358358797, "grad_norm": 3.0115444660186768, "learning_rate": 3.9252789907811743e-05, "loss": 0.182, "step": 3360 }, { "epoch": 2.9419467481449146, "grad_norm": 3.564761161804199, "learning_rate": 3.9204269771955364e-05, "loss": 0.2281, "step": 3370 }, { "epoch": 2.9506765604539504, "grad_norm": 2.8959414958953857, "learning_rate": 3.9155749636098985e-05, "loss": 0.1896, "step": 3380 }, { "epoch": 2.9594063727629853, "grad_norm": 2.9973928928375244, "learning_rate": 3.9107229500242605e-05, "loss": 0.2058, "step": 3390 }, { "epoch": 2.968136185072021, "grad_norm": 3.4878735542297363, "learning_rate": 3.905870936438622e-05, "loss": 0.1839, "step": 3400 }, { "epoch": 2.976865997381056, "grad_norm": 2.474815607070923, "learning_rate": 3.901018922852984e-05, "loss": 0.2162, "step": 3410 }, { "epoch": 2.985595809690092, "grad_norm": 3.7291135787963867, "learning_rate": 3.896166909267347e-05, "loss": 0.1895, "step": 3420 }, { "epoch": 2.9943256219991268, "grad_norm": 3.1307461261749268, "learning_rate": 3.891314895681708e-05, "loss": 0.2058, "step": 3430 }, { "epoch": 2.999563509384548, "eval_accuracy": 0.9558679106309845, "eval_loss": 0.14920948445796967, "eval_runtime": 61.5134, "eval_samples_per_second": 264.853, "eval_steps_per_second": 8.291, "step": 3436 }, { "epoch": 3.0030554343081626, "grad_norm": 3.64821457862854, "learning_rate": 3.88646288209607e-05, "loss": 0.1772, "step": 3440 }, { "epoch": 3.011785246617198, "grad_norm": 2.404139280319214, "learning_rate": 3.881610868510432e-05, "loss": 0.1475, "step": 3450 }, { "epoch": 3.0205150589262333, "grad_norm": 2.3943281173706055, "learning_rate": 3.876758854924794e-05, "loss": 0.1839, "step": 3460 }, { "epoch": 3.0292448712352686, "grad_norm": 2.9818949699401855, "learning_rate": 3.8719068413391556e-05, "loss": 0.1981, "step": 3470 }, { "epoch": 3.037974683544304, "grad_norm": 4.090831279754639, "learning_rate": 3.8670548277535176e-05, "loss": 0.1597, "step": 3480 }, { "epoch": 3.0467044958533394, "grad_norm": 3.2076456546783447, "learning_rate": 3.8622028141678804e-05, "loss": 0.2012, "step": 3490 }, { "epoch": 3.0554343081623747, "grad_norm": 3.2840895652770996, "learning_rate": 3.857350800582242e-05, "loss": 0.1901, "step": 3500 }, { "epoch": 3.06416412047141, "grad_norm": 2.6999125480651855, "learning_rate": 3.852498786996604e-05, "loss": 0.1808, "step": 3510 }, { "epoch": 3.0728939327804454, "grad_norm": 2.939896821975708, "learning_rate": 3.847646773410966e-05, "loss": 0.185, "step": 3520 }, { "epoch": 3.081623745089481, "grad_norm": 3.7036166191101074, "learning_rate": 3.842794759825328e-05, "loss": 0.2125, "step": 3530 }, { "epoch": 3.090353557398516, "grad_norm": 3.631962299346924, "learning_rate": 3.837942746239689e-05, "loss": 0.1828, "step": 3540 }, { "epoch": 3.0990833697075515, "grad_norm": 2.847594976425171, "learning_rate": 3.8330907326540513e-05, "loss": 0.1442, "step": 3550 }, { "epoch": 3.107813182016587, "grad_norm": 2.8544461727142334, "learning_rate": 3.828238719068414e-05, "loss": 0.1772, "step": 3560 }, { "epoch": 3.116542994325622, "grad_norm": 2.833056688308716, "learning_rate": 3.8233867054827755e-05, "loss": 0.1847, "step": 3570 }, { "epoch": 3.1252728066346576, "grad_norm": 3.2772698402404785, "learning_rate": 3.8185346918971375e-05, "loss": 0.1872, "step": 3580 }, { "epoch": 3.1340026189436925, "grad_norm": 2.8444337844848633, "learning_rate": 3.8136826783114996e-05, "loss": 0.1829, "step": 3590 }, { "epoch": 3.1427324312527283, "grad_norm": 4.018392562866211, "learning_rate": 3.8088306647258616e-05, "loss": 0.1709, "step": 3600 }, { "epoch": 3.151462243561763, "grad_norm": 3.0900518894195557, "learning_rate": 3.803978651140223e-05, "loss": 0.1551, "step": 3610 }, { "epoch": 3.160192055870799, "grad_norm": 2.1413562297821045, "learning_rate": 3.799126637554585e-05, "loss": 0.1867, "step": 3620 }, { "epoch": 3.168921868179834, "grad_norm": 3.050671100616455, "learning_rate": 3.794274623968948e-05, "loss": 0.1884, "step": 3630 }, { "epoch": 3.1776516804888697, "grad_norm": 3.41404390335083, "learning_rate": 3.789422610383309e-05, "loss": 0.1864, "step": 3640 }, { "epoch": 3.1863814927979046, "grad_norm": 3.2197773456573486, "learning_rate": 3.784570596797671e-05, "loss": 0.1954, "step": 3650 }, { "epoch": 3.1951113051069404, "grad_norm": 4.097252368927002, "learning_rate": 3.779718583212033e-05, "loss": 0.1938, "step": 3660 }, { "epoch": 3.2038411174159753, "grad_norm": 3.3386523723602295, "learning_rate": 3.774866569626395e-05, "loss": 0.1995, "step": 3670 }, { "epoch": 3.212570929725011, "grad_norm": 2.2313122749328613, "learning_rate": 3.770014556040757e-05, "loss": 0.1557, "step": 3680 }, { "epoch": 3.221300742034046, "grad_norm": 2.797321081161499, "learning_rate": 3.765162542455119e-05, "loss": 0.1771, "step": 3690 }, { "epoch": 3.230030554343082, "grad_norm": 3.4761507511138916, "learning_rate": 3.7603105288694815e-05, "loss": 0.1603, "step": 3700 }, { "epoch": 3.2387603666521168, "grad_norm": 3.6318702697753906, "learning_rate": 3.755458515283843e-05, "loss": 0.1596, "step": 3710 }, { "epoch": 3.2474901789611526, "grad_norm": 2.6632442474365234, "learning_rate": 3.750606501698205e-05, "loss": 0.1559, "step": 3720 }, { "epoch": 3.2562199912701875, "grad_norm": 3.2897541522979736, "learning_rate": 3.745754488112567e-05, "loss": 0.149, "step": 3730 }, { "epoch": 3.2649498035792233, "grad_norm": 2.372366189956665, "learning_rate": 3.740902474526929e-05, "loss": 0.1909, "step": 3740 }, { "epoch": 3.273679615888258, "grad_norm": 4.001101493835449, "learning_rate": 3.736050460941291e-05, "loss": 0.1891, "step": 3750 }, { "epoch": 3.282409428197294, "grad_norm": 3.442195177078247, "learning_rate": 3.7311984473556525e-05, "loss": 0.1883, "step": 3760 }, { "epoch": 3.291139240506329, "grad_norm": 3.3341877460479736, "learning_rate": 3.726346433770015e-05, "loss": 0.183, "step": 3770 }, { "epoch": 3.2998690528153647, "grad_norm": 2.7835216522216797, "learning_rate": 3.7214944201843766e-05, "loss": 0.1593, "step": 3780 }, { "epoch": 3.3085988651243996, "grad_norm": 4.420855522155762, "learning_rate": 3.7166424065987386e-05, "loss": 0.1795, "step": 3790 }, { "epoch": 3.3173286774334354, "grad_norm": 2.7410755157470703, "learning_rate": 3.711790393013101e-05, "loss": 0.1519, "step": 3800 }, { "epoch": 3.3260584897424703, "grad_norm": 3.839470863342285, "learning_rate": 3.706938379427463e-05, "loss": 0.2107, "step": 3810 }, { "epoch": 3.334788302051506, "grad_norm": 2.829495668411255, "learning_rate": 3.702086365841825e-05, "loss": 0.1765, "step": 3820 }, { "epoch": 3.343518114360541, "grad_norm": 3.0382578372955322, "learning_rate": 3.697234352256186e-05, "loss": 0.1899, "step": 3830 }, { "epoch": 3.352247926669577, "grad_norm": 2.4491844177246094, "learning_rate": 3.692382338670549e-05, "loss": 0.1659, "step": 3840 }, { "epoch": 3.3609777389786117, "grad_norm": 3.5624337196350098, "learning_rate": 3.68753032508491e-05, "loss": 0.1681, "step": 3850 }, { "epoch": 3.3697075512876475, "grad_norm": 3.9547486305236816, "learning_rate": 3.682678311499272e-05, "loss": 0.2041, "step": 3860 }, { "epoch": 3.3784373635966825, "grad_norm": 3.0073511600494385, "learning_rate": 3.6778262979136344e-05, "loss": 0.1754, "step": 3870 }, { "epoch": 3.3871671759057183, "grad_norm": 3.345168352127075, "learning_rate": 3.6729742843279964e-05, "loss": 0.1714, "step": 3880 }, { "epoch": 3.395896988214753, "grad_norm": 2.464186668395996, "learning_rate": 3.6681222707423585e-05, "loss": 0.1861, "step": 3890 }, { "epoch": 3.404626800523789, "grad_norm": 2.6841201782226562, "learning_rate": 3.66327025715672e-05, "loss": 0.1676, "step": 3900 }, { "epoch": 3.413356612832824, "grad_norm": 2.905978202819824, "learning_rate": 3.6584182435710826e-05, "loss": 0.1952, "step": 3910 }, { "epoch": 3.4220864251418597, "grad_norm": 2.9638357162475586, "learning_rate": 3.653566229985444e-05, "loss": 0.1617, "step": 3920 }, { "epoch": 3.4308162374508946, "grad_norm": 3.0130560398101807, "learning_rate": 3.648714216399806e-05, "loss": 0.1554, "step": 3930 }, { "epoch": 3.4395460497599304, "grad_norm": 3.267518997192383, "learning_rate": 3.643862202814168e-05, "loss": 0.1896, "step": 3940 }, { "epoch": 3.4482758620689653, "grad_norm": 2.6104979515075684, "learning_rate": 3.63901018922853e-05, "loss": 0.1693, "step": 3950 }, { "epoch": 3.457005674378001, "grad_norm": 2.9736759662628174, "learning_rate": 3.634158175642892e-05, "loss": 0.1654, "step": 3960 }, { "epoch": 3.465735486687036, "grad_norm": 3.04263973236084, "learning_rate": 3.6293061620572536e-05, "loss": 0.1655, "step": 3970 }, { "epoch": 3.474465298996072, "grad_norm": 3.322094202041626, "learning_rate": 3.624454148471616e-05, "loss": 0.1718, "step": 3980 }, { "epoch": 3.4831951113051067, "grad_norm": 2.8771190643310547, "learning_rate": 3.619602134885978e-05, "loss": 0.188, "step": 3990 }, { "epoch": 3.4919249236141425, "grad_norm": 3.3343777656555176, "learning_rate": 3.61475012130034e-05, "loss": 0.1765, "step": 4000 }, { "epoch": 3.5006547359231774, "grad_norm": 3.087955951690674, "learning_rate": 3.609898107714702e-05, "loss": 0.1557, "step": 4010 }, { "epoch": 3.5093845482322132, "grad_norm": 2.9191036224365234, "learning_rate": 3.605046094129064e-05, "loss": 0.1657, "step": 4020 }, { "epoch": 3.518114360541248, "grad_norm": 3.2453222274780273, "learning_rate": 3.600194080543426e-05, "loss": 0.1921, "step": 4030 }, { "epoch": 3.526844172850284, "grad_norm": 3.4277548789978027, "learning_rate": 3.595342066957787e-05, "loss": 0.1583, "step": 4040 }, { "epoch": 3.535573985159319, "grad_norm": 2.132359504699707, "learning_rate": 3.59049005337215e-05, "loss": 0.168, "step": 4050 }, { "epoch": 3.5443037974683547, "grad_norm": 3.881998062133789, "learning_rate": 3.5856380397865114e-05, "loss": 0.1992, "step": 4060 }, { "epoch": 3.5530336097773896, "grad_norm": 3.376692771911621, "learning_rate": 3.5807860262008734e-05, "loss": 0.1804, "step": 4070 }, { "epoch": 3.5617634220864254, "grad_norm": 2.9599671363830566, "learning_rate": 3.5759340126152355e-05, "loss": 0.1561, "step": 4080 }, { "epoch": 3.5704932343954603, "grad_norm": 3.439408302307129, "learning_rate": 3.5710819990295976e-05, "loss": 0.1471, "step": 4090 }, { "epoch": 3.579223046704496, "grad_norm": 3.4653496742248535, "learning_rate": 3.5662299854439596e-05, "loss": 0.189, "step": 4100 }, { "epoch": 3.587952859013531, "grad_norm": 3.479707717895508, "learning_rate": 3.561377971858321e-05, "loss": 0.1771, "step": 4110 }, { "epoch": 3.596682671322567, "grad_norm": 3.077345848083496, "learning_rate": 3.556525958272684e-05, "loss": 0.1764, "step": 4120 }, { "epoch": 3.6054124836316017, "grad_norm": 2.612633228302002, "learning_rate": 3.551673944687045e-05, "loss": 0.1336, "step": 4130 }, { "epoch": 3.6141422959406375, "grad_norm": 2.649402379989624, "learning_rate": 3.546821931101407e-05, "loss": 0.1609, "step": 4140 }, { "epoch": 3.6228721082496724, "grad_norm": 3.7441020011901855, "learning_rate": 3.541969917515769e-05, "loss": 0.1758, "step": 4150 }, { "epoch": 3.6316019205587082, "grad_norm": 2.341038465499878, "learning_rate": 3.537117903930131e-05, "loss": 0.1431, "step": 4160 }, { "epoch": 3.640331732867743, "grad_norm": 2.990847587585449, "learning_rate": 3.532265890344493e-05, "loss": 0.1621, "step": 4170 }, { "epoch": 3.649061545176779, "grad_norm": 3.005911111831665, "learning_rate": 3.527413876758855e-05, "loss": 0.1481, "step": 4180 }, { "epoch": 3.657791357485814, "grad_norm": 3.05849027633667, "learning_rate": 3.5225618631732174e-05, "loss": 0.1781, "step": 4190 }, { "epoch": 3.6665211697948497, "grad_norm": 2.643735647201538, "learning_rate": 3.517709849587579e-05, "loss": 0.1688, "step": 4200 }, { "epoch": 3.6752509821038846, "grad_norm": 3.1337473392486572, "learning_rate": 3.512857836001941e-05, "loss": 0.1463, "step": 4210 }, { "epoch": 3.6839807944129204, "grad_norm": 2.945814609527588, "learning_rate": 3.508005822416303e-05, "loss": 0.1545, "step": 4220 }, { "epoch": 3.6927106067219553, "grad_norm": 2.5990426540374756, "learning_rate": 3.503153808830665e-05, "loss": 0.1525, "step": 4230 }, { "epoch": 3.701440419030991, "grad_norm": 2.7641971111297607, "learning_rate": 3.498301795245027e-05, "loss": 0.1661, "step": 4240 }, { "epoch": 3.710170231340026, "grad_norm": 2.606818437576294, "learning_rate": 3.4934497816593884e-05, "loss": 0.1906, "step": 4250 }, { "epoch": 3.718900043649062, "grad_norm": 3.2500600814819336, "learning_rate": 3.488597768073751e-05, "loss": 0.1404, "step": 4260 }, { "epoch": 3.7276298559580967, "grad_norm": 3.6567978858947754, "learning_rate": 3.4837457544881125e-05, "loss": 0.1715, "step": 4270 }, { "epoch": 3.7363596682671325, "grad_norm": 2.9746952056884766, "learning_rate": 3.4788937409024746e-05, "loss": 0.1633, "step": 4280 }, { "epoch": 3.7450894805761674, "grad_norm": 2.986154079437256, "learning_rate": 3.4740417273168366e-05, "loss": 0.141, "step": 4290 }, { "epoch": 3.753819292885203, "grad_norm": 2.4456489086151123, "learning_rate": 3.469189713731199e-05, "loss": 0.1576, "step": 4300 }, { "epoch": 3.762549105194238, "grad_norm": 2.2719054222106934, "learning_rate": 3.464337700145561e-05, "loss": 0.158, "step": 4310 }, { "epoch": 3.771278917503274, "grad_norm": 2.6917176246643066, "learning_rate": 3.459485686559922e-05, "loss": 0.1624, "step": 4320 }, { "epoch": 3.780008729812309, "grad_norm": 3.041710615158081, "learning_rate": 3.454633672974285e-05, "loss": 0.1573, "step": 4330 }, { "epoch": 3.7887385421213446, "grad_norm": 2.517232894897461, "learning_rate": 3.449781659388647e-05, "loss": 0.1256, "step": 4340 }, { "epoch": 3.7974683544303796, "grad_norm": 3.19498348236084, "learning_rate": 3.444929645803008e-05, "loss": 0.1399, "step": 4350 }, { "epoch": 3.8061981667394154, "grad_norm": 3.773149013519287, "learning_rate": 3.44007763221737e-05, "loss": 0.1627, "step": 4360 }, { "epoch": 3.8149279790484503, "grad_norm": 3.985891819000244, "learning_rate": 3.4352256186317324e-05, "loss": 0.161, "step": 4370 }, { "epoch": 3.823657791357486, "grad_norm": 2.994173049926758, "learning_rate": 3.4303736050460944e-05, "loss": 0.1604, "step": 4380 }, { "epoch": 3.832387603666521, "grad_norm": 3.8228750228881836, "learning_rate": 3.425521591460456e-05, "loss": 0.1609, "step": 4390 }, { "epoch": 3.841117415975557, "grad_norm": 3.412660598754883, "learning_rate": 3.4206695778748185e-05, "loss": 0.1591, "step": 4400 }, { "epoch": 3.8498472282845917, "grad_norm": 2.460543632507324, "learning_rate": 3.4158175642891806e-05, "loss": 0.171, "step": 4410 }, { "epoch": 3.8585770405936275, "grad_norm": 2.8546485900878906, "learning_rate": 3.410965550703542e-05, "loss": 0.1662, "step": 4420 }, { "epoch": 3.8673068529026624, "grad_norm": 3.296644926071167, "learning_rate": 3.406113537117904e-05, "loss": 0.1669, "step": 4430 }, { "epoch": 3.876036665211698, "grad_norm": 3.5040674209594727, "learning_rate": 3.401261523532266e-05, "loss": 0.1694, "step": 4440 }, { "epoch": 3.884766477520733, "grad_norm": 3.1331686973571777, "learning_rate": 3.396409509946628e-05, "loss": 0.1507, "step": 4450 }, { "epoch": 3.893496289829769, "grad_norm": 3.2440221309661865, "learning_rate": 3.3915574963609895e-05, "loss": 0.1727, "step": 4460 }, { "epoch": 3.902226102138804, "grad_norm": 2.514347553253174, "learning_rate": 3.386705482775352e-05, "loss": 0.1401, "step": 4470 }, { "epoch": 3.9109559144478396, "grad_norm": 3.354827404022217, "learning_rate": 3.381853469189714e-05, "loss": 0.1472, "step": 4480 }, { "epoch": 3.9196857267568745, "grad_norm": 2.985978126525879, "learning_rate": 3.377001455604076e-05, "loss": 0.1783, "step": 4490 }, { "epoch": 3.9284155390659103, "grad_norm": 2.6409835815429688, "learning_rate": 3.372149442018438e-05, "loss": 0.1468, "step": 4500 }, { "epoch": 3.9371453513749453, "grad_norm": 2.718919515609741, "learning_rate": 3.3672974284328e-05, "loss": 0.1672, "step": 4510 }, { "epoch": 3.945875163683981, "grad_norm": 3.5522565841674805, "learning_rate": 3.362445414847162e-05, "loss": 0.1666, "step": 4520 }, { "epoch": 3.954604975993016, "grad_norm": 3.1858441829681396, "learning_rate": 3.357593401261523e-05, "loss": 0.181, "step": 4530 }, { "epoch": 3.9633347883020518, "grad_norm": 3.0110580921173096, "learning_rate": 3.352741387675886e-05, "loss": 0.1729, "step": 4540 }, { "epoch": 3.9720646006110867, "grad_norm": 3.0811493396759033, "learning_rate": 3.347889374090248e-05, "loss": 0.1651, "step": 4550 }, { "epoch": 3.9807944129201225, "grad_norm": 2.8543362617492676, "learning_rate": 3.3430373605046094e-05, "loss": 0.1338, "step": 4560 }, { "epoch": 3.9895242252291574, "grad_norm": 3.2236084938049316, "learning_rate": 3.3381853469189714e-05, "loss": 0.1349, "step": 4570 }, { "epoch": 3.998254037538193, "grad_norm": 3.425377368927002, "learning_rate": 3.3333333333333335e-05, "loss": 0.1524, "step": 4580 }, { "epoch": 4.0, "eval_accuracy": 0.9694328504787626, "eval_loss": 0.10254349559545517, "eval_runtime": 60.9682, "eval_samples_per_second": 267.221, "eval_steps_per_second": 8.365, "step": 4582 }, { "epoch": 4.006983849847228, "grad_norm": 3.464984655380249, "learning_rate": 3.3284813197476955e-05, "loss": 0.1749, "step": 4590 }, { "epoch": 4.015713662156264, "grad_norm": 3.4568910598754883, "learning_rate": 3.323629306162057e-05, "loss": 0.1637, "step": 4600 }, { "epoch": 4.024443474465299, "grad_norm": 4.279798984527588, "learning_rate": 3.3187772925764197e-05, "loss": 0.1673, "step": 4610 }, { "epoch": 4.033173286774335, "grad_norm": 2.7573869228363037, "learning_rate": 3.313925278990782e-05, "loss": 0.1473, "step": 4620 }, { "epoch": 4.0419030990833695, "grad_norm": 1.6597820520401, "learning_rate": 3.309073265405143e-05, "loss": 0.1903, "step": 4630 }, { "epoch": 4.050632911392405, "grad_norm": 2.1218221187591553, "learning_rate": 3.304221251819505e-05, "loss": 0.136, "step": 4640 }, { "epoch": 4.05936272370144, "grad_norm": 2.941210985183716, "learning_rate": 3.299369238233867e-05, "loss": 0.1613, "step": 4650 }, { "epoch": 4.068092536010476, "grad_norm": 2.4769389629364014, "learning_rate": 3.294517224648229e-05, "loss": 0.1299, "step": 4660 }, { "epoch": 4.076822348319511, "grad_norm": 2.917901039123535, "learning_rate": 3.289665211062591e-05, "loss": 0.1522, "step": 4670 }, { "epoch": 4.085552160628547, "grad_norm": 2.216571092605591, "learning_rate": 3.2848131974769534e-05, "loss": 0.1137, "step": 4680 }, { "epoch": 4.094281972937582, "grad_norm": 3.3516063690185547, "learning_rate": 3.2799611838913154e-05, "loss": 0.1479, "step": 4690 }, { "epoch": 4.1030117852466175, "grad_norm": 2.5216290950775146, "learning_rate": 3.275109170305677e-05, "loss": 0.1792, "step": 4700 }, { "epoch": 4.111741597555652, "grad_norm": 3.8577330112457275, "learning_rate": 3.270257156720039e-05, "loss": 0.1647, "step": 4710 }, { "epoch": 4.120471409864688, "grad_norm": 3.3492794036865234, "learning_rate": 3.265405143134401e-05, "loss": 0.1646, "step": 4720 }, { "epoch": 4.129201222173723, "grad_norm": 2.7044191360473633, "learning_rate": 3.260553129548763e-05, "loss": 0.1373, "step": 4730 }, { "epoch": 4.137931034482759, "grad_norm": 2.3833065032958984, "learning_rate": 3.255701115963125e-05, "loss": 0.1606, "step": 4740 }, { "epoch": 4.146660846791794, "grad_norm": 2.541414260864258, "learning_rate": 3.250849102377487e-05, "loss": 0.1249, "step": 4750 }, { "epoch": 4.15539065910083, "grad_norm": 3.2906863689422607, "learning_rate": 3.245997088791849e-05, "loss": 0.1458, "step": 4760 }, { "epoch": 4.1641204714098645, "grad_norm": 2.537529706954956, "learning_rate": 3.2411450752062105e-05, "loss": 0.1559, "step": 4770 }, { "epoch": 4.1728502837189, "grad_norm": 3.309135675430298, "learning_rate": 3.2362930616205726e-05, "loss": 0.1478, "step": 4780 }, { "epoch": 4.181580096027935, "grad_norm": 3.2824246883392334, "learning_rate": 3.2314410480349346e-05, "loss": 0.1532, "step": 4790 }, { "epoch": 4.190309908336971, "grad_norm": 2.130497694015503, "learning_rate": 3.226589034449297e-05, "loss": 0.1552, "step": 4800 }, { "epoch": 4.199039720646006, "grad_norm": 2.358793020248413, "learning_rate": 3.221737020863659e-05, "loss": 0.1633, "step": 4810 }, { "epoch": 4.207769532955042, "grad_norm": 2.5943782329559326, "learning_rate": 3.216885007278021e-05, "loss": 0.1277, "step": 4820 }, { "epoch": 4.216499345264077, "grad_norm": 2.494443416595459, "learning_rate": 3.212032993692383e-05, "loss": 0.1644, "step": 4830 }, { "epoch": 4.2252291575731125, "grad_norm": 2.339279890060425, "learning_rate": 3.207180980106744e-05, "loss": 0.1607, "step": 4840 }, { "epoch": 4.233958969882147, "grad_norm": 2.0483193397521973, "learning_rate": 3.202328966521106e-05, "loss": 0.1386, "step": 4850 }, { "epoch": 4.242688782191183, "grad_norm": 3.7117726802825928, "learning_rate": 3.197476952935468e-05, "loss": 0.1412, "step": 4860 }, { "epoch": 4.251418594500218, "grad_norm": 1.738502860069275, "learning_rate": 3.1926249393498304e-05, "loss": 0.141, "step": 4870 }, { "epoch": 4.260148406809254, "grad_norm": 3.4214649200439453, "learning_rate": 3.1877729257641924e-05, "loss": 0.164, "step": 4880 }, { "epoch": 4.268878219118289, "grad_norm": 3.121246576309204, "learning_rate": 3.1829209121785545e-05, "loss": 0.1476, "step": 4890 }, { "epoch": 4.277608031427325, "grad_norm": 2.7231833934783936, "learning_rate": 3.1780688985929165e-05, "loss": 0.1419, "step": 4900 }, { "epoch": 4.2863378437363595, "grad_norm": 2.7895748615264893, "learning_rate": 3.173216885007278e-05, "loss": 0.161, "step": 4910 }, { "epoch": 4.295067656045395, "grad_norm": 2.6406960487365723, "learning_rate": 3.16836487142164e-05, "loss": 0.1358, "step": 4920 }, { "epoch": 4.30379746835443, "grad_norm": 2.4051973819732666, "learning_rate": 3.163512857836002e-05, "loss": 0.1544, "step": 4930 }, { "epoch": 4.312527280663466, "grad_norm": 2.49039626121521, "learning_rate": 3.158660844250364e-05, "loss": 0.1551, "step": 4940 }, { "epoch": 4.321257092972501, "grad_norm": 2.592521905899048, "learning_rate": 3.153808830664726e-05, "loss": 0.1836, "step": 4950 }, { "epoch": 4.329986905281537, "grad_norm": 3.2178955078125, "learning_rate": 3.148956817079088e-05, "loss": 0.1562, "step": 4960 }, { "epoch": 4.338716717590572, "grad_norm": 2.5670714378356934, "learning_rate": 3.14410480349345e-05, "loss": 0.1412, "step": 4970 }, { "epoch": 4.3474465298996074, "grad_norm": 4.081165790557861, "learning_rate": 3.1392527899078116e-05, "loss": 0.166, "step": 4980 }, { "epoch": 4.356176342208642, "grad_norm": 2.3925557136535645, "learning_rate": 3.134400776322174e-05, "loss": 0.1351, "step": 4990 }, { "epoch": 4.364906154517678, "grad_norm": 2.4587152004241943, "learning_rate": 3.1295487627365364e-05, "loss": 0.1389, "step": 5000 }, { "epoch": 4.373635966826713, "grad_norm": 3.3576841354370117, "learning_rate": 3.124696749150898e-05, "loss": 0.1431, "step": 5010 }, { "epoch": 4.382365779135749, "grad_norm": 3.1087608337402344, "learning_rate": 3.11984473556526e-05, "loss": 0.157, "step": 5020 }, { "epoch": 4.391095591444784, "grad_norm": 3.4390625953674316, "learning_rate": 3.114992721979622e-05, "loss": 0.1437, "step": 5030 }, { "epoch": 4.39982540375382, "grad_norm": 2.3669025897979736, "learning_rate": 3.110140708393984e-05, "loss": 0.1229, "step": 5040 }, { "epoch": 4.4085552160628545, "grad_norm": 3.4715614318847656, "learning_rate": 3.105288694808345e-05, "loss": 0.1436, "step": 5050 }, { "epoch": 4.41728502837189, "grad_norm": 3.359426975250244, "learning_rate": 3.1004366812227074e-05, "loss": 0.1563, "step": 5060 }, { "epoch": 4.426014840680925, "grad_norm": 3.365325450897217, "learning_rate": 3.09558466763707e-05, "loss": 0.1411, "step": 5070 }, { "epoch": 4.434744652989961, "grad_norm": 2.716036319732666, "learning_rate": 3.0907326540514315e-05, "loss": 0.1535, "step": 5080 }, { "epoch": 4.443474465298996, "grad_norm": 2.797657012939453, "learning_rate": 3.0858806404657935e-05, "loss": 0.1314, "step": 5090 }, { "epoch": 4.452204277608032, "grad_norm": 2.7117514610290527, "learning_rate": 3.0810286268801556e-05, "loss": 0.1436, "step": 5100 }, { "epoch": 4.460934089917067, "grad_norm": 2.1483347415924072, "learning_rate": 3.0761766132945176e-05, "loss": 0.1591, "step": 5110 }, { "epoch": 4.469663902226102, "grad_norm": 3.0980610847473145, "learning_rate": 3.071324599708879e-05, "loss": 0.1443, "step": 5120 }, { "epoch": 4.478393714535137, "grad_norm": 3.3128209114074707, "learning_rate": 3.066472586123241e-05, "loss": 0.1313, "step": 5130 }, { "epoch": 4.487123526844173, "grad_norm": 3.275357484817505, "learning_rate": 3.061620572537604e-05, "loss": 0.1744, "step": 5140 }, { "epoch": 4.495853339153208, "grad_norm": 3.222581148147583, "learning_rate": 3.056768558951965e-05, "loss": 0.1584, "step": 5150 }, { "epoch": 4.504583151462244, "grad_norm": 3.389233350753784, "learning_rate": 3.051916545366327e-05, "loss": 0.1477, "step": 5160 }, { "epoch": 4.513312963771279, "grad_norm": 3.1281654834747314, "learning_rate": 3.047064531780689e-05, "loss": 0.1283, "step": 5170 }, { "epoch": 4.522042776080315, "grad_norm": 3.123300075531006, "learning_rate": 3.042212518195051e-05, "loss": 0.1496, "step": 5180 }, { "epoch": 4.5307725883893495, "grad_norm": 2.7296836376190186, "learning_rate": 3.0373605046094127e-05, "loss": 0.133, "step": 5190 }, { "epoch": 4.539502400698385, "grad_norm": 2.5812196731567383, "learning_rate": 3.032508491023775e-05, "loss": 0.1388, "step": 5200 }, { "epoch": 4.54823221300742, "grad_norm": 3.827601671218872, "learning_rate": 3.0276564774381372e-05, "loss": 0.145, "step": 5210 }, { "epoch": 4.556962025316456, "grad_norm": 2.3603134155273438, "learning_rate": 3.022804463852499e-05, "loss": 0.1126, "step": 5220 }, { "epoch": 4.565691837625491, "grad_norm": 2.6947975158691406, "learning_rate": 3.017952450266861e-05, "loss": 0.1282, "step": 5230 }, { "epoch": 4.574421649934527, "grad_norm": 2.6288444995880127, "learning_rate": 3.0131004366812227e-05, "loss": 0.1371, "step": 5240 }, { "epoch": 4.583151462243562, "grad_norm": 3.6533212661743164, "learning_rate": 3.0082484230955847e-05, "loss": 0.1555, "step": 5250 }, { "epoch": 4.591881274552597, "grad_norm": 2.927175760269165, "learning_rate": 3.0033964095099464e-05, "loss": 0.1396, "step": 5260 }, { "epoch": 4.600611086861632, "grad_norm": 3.4521915912628174, "learning_rate": 2.9985443959243088e-05, "loss": 0.1399, "step": 5270 }, { "epoch": 4.609340899170668, "grad_norm": 3.1614108085632324, "learning_rate": 2.993692382338671e-05, "loss": 0.139, "step": 5280 }, { "epoch": 4.618070711479703, "grad_norm": 2.77795672416687, "learning_rate": 2.9888403687530326e-05, "loss": 0.1623, "step": 5290 }, { "epoch": 4.626800523788739, "grad_norm": 3.930523633956909, "learning_rate": 2.9839883551673947e-05, "loss": 0.1534, "step": 5300 }, { "epoch": 4.635530336097774, "grad_norm": 2.8020386695861816, "learning_rate": 2.9791363415817564e-05, "loss": 0.1309, "step": 5310 }, { "epoch": 4.64426014840681, "grad_norm": 2.3614084720611572, "learning_rate": 2.9742843279961184e-05, "loss": 0.1422, "step": 5320 }, { "epoch": 4.6529899607158445, "grad_norm": 3.849187135696411, "learning_rate": 2.9694323144104808e-05, "loss": 0.129, "step": 5330 }, { "epoch": 4.66171977302488, "grad_norm": 3.5703964233398438, "learning_rate": 2.9645803008248425e-05, "loss": 0.1403, "step": 5340 }, { "epoch": 4.670449585333915, "grad_norm": 2.742767810821533, "learning_rate": 2.9597282872392046e-05, "loss": 0.1234, "step": 5350 }, { "epoch": 4.679179397642951, "grad_norm": 2.6478323936462402, "learning_rate": 2.9548762736535663e-05, "loss": 0.1395, "step": 5360 }, { "epoch": 4.687909209951986, "grad_norm": 2.8150362968444824, "learning_rate": 2.9500242600679284e-05, "loss": 0.1378, "step": 5370 }, { "epoch": 4.696639022261022, "grad_norm": 3.0545525550842285, "learning_rate": 2.94517224648229e-05, "loss": 0.1244, "step": 5380 }, { "epoch": 4.705368834570057, "grad_norm": 3.3390815258026123, "learning_rate": 2.940320232896652e-05, "loss": 0.1433, "step": 5390 }, { "epoch": 4.714098646879092, "grad_norm": 2.237645149230957, "learning_rate": 2.9354682193110145e-05, "loss": 0.1386, "step": 5400 }, { "epoch": 4.722828459188127, "grad_norm": 2.8226351737976074, "learning_rate": 2.9306162057253762e-05, "loss": 0.1468, "step": 5410 }, { "epoch": 4.731558271497163, "grad_norm": 2.2140517234802246, "learning_rate": 2.9257641921397383e-05, "loss": 0.1271, "step": 5420 }, { "epoch": 4.740288083806198, "grad_norm": 2.853294610977173, "learning_rate": 2.9209121785541e-05, "loss": 0.1357, "step": 5430 }, { "epoch": 4.749017896115234, "grad_norm": 3.399142265319824, "learning_rate": 2.916060164968462e-05, "loss": 0.1517, "step": 5440 }, { "epoch": 4.757747708424269, "grad_norm": 1.853452205657959, "learning_rate": 2.9112081513828238e-05, "loss": 0.1323, "step": 5450 }, { "epoch": 4.7664775207333046, "grad_norm": 2.7379791736602783, "learning_rate": 2.906356137797186e-05, "loss": 0.1471, "step": 5460 }, { "epoch": 4.7752073330423395, "grad_norm": 3.7588677406311035, "learning_rate": 2.9015041242115482e-05, "loss": 0.1315, "step": 5470 }, { "epoch": 4.783937145351375, "grad_norm": 2.9723212718963623, "learning_rate": 2.89665211062591e-05, "loss": 0.131, "step": 5480 }, { "epoch": 4.79266695766041, "grad_norm": 2.8574113845825195, "learning_rate": 2.891800097040272e-05, "loss": 0.1495, "step": 5490 }, { "epoch": 4.801396769969446, "grad_norm": 2.9763901233673096, "learning_rate": 2.8869480834546337e-05, "loss": 0.1388, "step": 5500 }, { "epoch": 4.810126582278481, "grad_norm": 3.1920862197875977, "learning_rate": 2.8820960698689958e-05, "loss": 0.1523, "step": 5510 }, { "epoch": 4.818856394587517, "grad_norm": 2.83996844291687, "learning_rate": 2.8772440562833575e-05, "loss": 0.1126, "step": 5520 }, { "epoch": 4.827586206896552, "grad_norm": 2.066861391067505, "learning_rate": 2.8723920426977195e-05, "loss": 0.1337, "step": 5530 }, { "epoch": 4.836316019205587, "grad_norm": 3.4259138107299805, "learning_rate": 2.867540029112082e-05, "loss": 0.1575, "step": 5540 }, { "epoch": 4.845045831514622, "grad_norm": 3.806020975112915, "learning_rate": 2.8626880155264436e-05, "loss": 0.148, "step": 5550 }, { "epoch": 4.853775643823658, "grad_norm": 4.127275466918945, "learning_rate": 2.8578360019408057e-05, "loss": 0.1487, "step": 5560 }, { "epoch": 4.862505456132693, "grad_norm": 3.0096826553344727, "learning_rate": 2.8529839883551674e-05, "loss": 0.1627, "step": 5570 }, { "epoch": 4.871235268441729, "grad_norm": 3.183000087738037, "learning_rate": 2.8481319747695295e-05, "loss": 0.1241, "step": 5580 }, { "epoch": 4.879965080750764, "grad_norm": 2.7466609477996826, "learning_rate": 2.843279961183892e-05, "loss": 0.1518, "step": 5590 }, { "epoch": 4.8886948930597995, "grad_norm": 2.0740559101104736, "learning_rate": 2.8384279475982532e-05, "loss": 0.143, "step": 5600 }, { "epoch": 4.8974247053688345, "grad_norm": 2.2618136405944824, "learning_rate": 2.8335759340126156e-05, "loss": 0.145, "step": 5610 }, { "epoch": 4.90615451767787, "grad_norm": 3.275090217590332, "learning_rate": 2.8287239204269774e-05, "loss": 0.141, "step": 5620 }, { "epoch": 4.914884329986905, "grad_norm": 2.9142794609069824, "learning_rate": 2.8238719068413394e-05, "loss": 0.1263, "step": 5630 }, { "epoch": 4.923614142295941, "grad_norm": 3.941188097000122, "learning_rate": 2.819019893255701e-05, "loss": 0.167, "step": 5640 }, { "epoch": 4.932343954604976, "grad_norm": 3.0273237228393555, "learning_rate": 2.8141678796700632e-05, "loss": 0.1444, "step": 5650 }, { "epoch": 4.941073766914012, "grad_norm": 1.5899831056594849, "learning_rate": 2.8093158660844256e-05, "loss": 0.1089, "step": 5660 }, { "epoch": 4.949803579223047, "grad_norm": 4.153631687164307, "learning_rate": 2.804463852498787e-05, "loss": 0.1728, "step": 5670 }, { "epoch": 4.958533391532082, "grad_norm": 2.7368574142456055, "learning_rate": 2.7996118389131493e-05, "loss": 0.1371, "step": 5680 }, { "epoch": 4.967263203841117, "grad_norm": 3.6088647842407227, "learning_rate": 2.794759825327511e-05, "loss": 0.1402, "step": 5690 }, { "epoch": 4.975993016150153, "grad_norm": 2.830106496810913, "learning_rate": 2.789907811741873e-05, "loss": 0.1483, "step": 5700 }, { "epoch": 4.984722828459188, "grad_norm": 2.925632953643799, "learning_rate": 2.7850557981562348e-05, "loss": 0.1268, "step": 5710 }, { "epoch": 4.993452640768224, "grad_norm": 2.303786277770996, "learning_rate": 2.780203784570597e-05, "loss": 0.1274, "step": 5720 }, { "epoch": 4.999563509384548, "eval_accuracy": 0.9705990670267616, "eval_loss": 0.09282852709293365, "eval_runtime": 61.4449, "eval_samples_per_second": 265.148, "eval_steps_per_second": 8.3, "step": 5727 }, { "epoch": 5.002182453077259, "grad_norm": 3.6829633712768555, "learning_rate": 2.7753517709849593e-05, "loss": 0.1331, "step": 5730 }, { "epoch": 5.0109122653862945, "grad_norm": 2.4418623447418213, "learning_rate": 2.7704997573993207e-05, "loss": 0.1386, "step": 5740 }, { "epoch": 5.019642077695329, "grad_norm": 2.420471429824829, "learning_rate": 2.765647743813683e-05, "loss": 0.1287, "step": 5750 }, { "epoch": 5.028371890004365, "grad_norm": 1.8955364227294922, "learning_rate": 2.7607957302280448e-05, "loss": 0.1353, "step": 5760 }, { "epoch": 5.0371017023134, "grad_norm": 3.353316307067871, "learning_rate": 2.7559437166424068e-05, "loss": 0.1445, "step": 5770 }, { "epoch": 5.045831514622436, "grad_norm": 2.9570837020874023, "learning_rate": 2.7510917030567685e-05, "loss": 0.1374, "step": 5780 }, { "epoch": 5.054561326931471, "grad_norm": 3.622004747390747, "learning_rate": 2.7462396894711306e-05, "loss": 0.1298, "step": 5790 }, { "epoch": 5.063291139240507, "grad_norm": 2.643129587173462, "learning_rate": 2.741387675885493e-05, "loss": 0.146, "step": 5800 }, { "epoch": 5.072020951549542, "grad_norm": 2.9872100353240967, "learning_rate": 2.7365356622998544e-05, "loss": 0.17, "step": 5810 }, { "epoch": 5.080750763858577, "grad_norm": 3.114686965942383, "learning_rate": 2.7316836487142168e-05, "loss": 0.1391, "step": 5820 }, { "epoch": 5.089480576167612, "grad_norm": 2.8469395637512207, "learning_rate": 2.7268316351285785e-05, "loss": 0.1388, "step": 5830 }, { "epoch": 5.098210388476648, "grad_norm": 3.2871453762054443, "learning_rate": 2.7219796215429405e-05, "loss": 0.1221, "step": 5840 }, { "epoch": 5.106940200785683, "grad_norm": 4.159573078155518, "learning_rate": 2.7171276079573022e-05, "loss": 0.1357, "step": 5850 }, { "epoch": 5.115670013094719, "grad_norm": 2.4209814071655273, "learning_rate": 2.7122755943716643e-05, "loss": 0.1315, "step": 5860 }, { "epoch": 5.124399825403754, "grad_norm": 3.1330792903900146, "learning_rate": 2.7074235807860267e-05, "loss": 0.144, "step": 5870 }, { "epoch": 5.1331296377127895, "grad_norm": 3.4734888076782227, "learning_rate": 2.702571567200388e-05, "loss": 0.1255, "step": 5880 }, { "epoch": 5.141859450021824, "grad_norm": 2.6538310050964355, "learning_rate": 2.6977195536147505e-05, "loss": 0.1309, "step": 5890 }, { "epoch": 5.15058926233086, "grad_norm": 2.8028576374053955, "learning_rate": 2.6928675400291122e-05, "loss": 0.128, "step": 5900 }, { "epoch": 5.159319074639895, "grad_norm": 2.747344970703125, "learning_rate": 2.6880155264434742e-05, "loss": 0.1287, "step": 5910 }, { "epoch": 5.168048886948931, "grad_norm": 2.56372332572937, "learning_rate": 2.6831635128578363e-05, "loss": 0.141, "step": 5920 }, { "epoch": 5.176778699257966, "grad_norm": 2.5955655574798584, "learning_rate": 2.678311499272198e-05, "loss": 0.1405, "step": 5930 }, { "epoch": 5.185508511567002, "grad_norm": 2.351151943206787, "learning_rate": 2.6734594856865604e-05, "loss": 0.1145, "step": 5940 }, { "epoch": 5.194238323876037, "grad_norm": 3.593594551086426, "learning_rate": 2.6686074721009218e-05, "loss": 0.1321, "step": 5950 }, { "epoch": 5.202968136185072, "grad_norm": 2.935622453689575, "learning_rate": 2.663755458515284e-05, "loss": 0.1233, "step": 5960 }, { "epoch": 5.211697948494107, "grad_norm": 2.3054494857788086, "learning_rate": 2.658903444929646e-05, "loss": 0.1238, "step": 5970 }, { "epoch": 5.220427760803143, "grad_norm": 3.240325927734375, "learning_rate": 2.654051431344008e-05, "loss": 0.1613, "step": 5980 }, { "epoch": 5.229157573112178, "grad_norm": 2.3457300662994385, "learning_rate": 2.64919941775837e-05, "loss": 0.1498, "step": 5990 }, { "epoch": 5.237887385421214, "grad_norm": 3.284043550491333, "learning_rate": 2.6443474041727317e-05, "loss": 0.1328, "step": 6000 }, { "epoch": 5.246617197730249, "grad_norm": 2.991929531097412, "learning_rate": 2.639495390587094e-05, "loss": 0.1334, "step": 6010 }, { "epoch": 5.2553470100392845, "grad_norm": 3.4685487747192383, "learning_rate": 2.6346433770014555e-05, "loss": 0.1463, "step": 6020 }, { "epoch": 5.264076822348319, "grad_norm": 3.8685977458953857, "learning_rate": 2.629791363415818e-05, "loss": 0.1219, "step": 6030 }, { "epoch": 5.272806634657355, "grad_norm": 2.8441107273101807, "learning_rate": 2.6249393498301796e-05, "loss": 0.1243, "step": 6040 }, { "epoch": 5.28153644696639, "grad_norm": 3.628505229949951, "learning_rate": 2.6200873362445416e-05, "loss": 0.1465, "step": 6050 }, { "epoch": 5.290266259275426, "grad_norm": 3.1612138748168945, "learning_rate": 2.6152353226589037e-05, "loss": 0.124, "step": 6060 }, { "epoch": 5.298996071584461, "grad_norm": 2.9345169067382812, "learning_rate": 2.6103833090732654e-05, "loss": 0.153, "step": 6070 }, { "epoch": 5.307725883893497, "grad_norm": 2.2323479652404785, "learning_rate": 2.6055312954876278e-05, "loss": 0.1307, "step": 6080 }, { "epoch": 5.3164556962025316, "grad_norm": 2.986766815185547, "learning_rate": 2.6006792819019892e-05, "loss": 0.1137, "step": 6090 }, { "epoch": 5.325185508511567, "grad_norm": 3.07307767868042, "learning_rate": 2.5958272683163516e-05, "loss": 0.1372, "step": 6100 }, { "epoch": 5.333915320820602, "grad_norm": 2.370492935180664, "learning_rate": 2.5909752547307133e-05, "loss": 0.1549, "step": 6110 }, { "epoch": 5.342645133129638, "grad_norm": 1.7776038646697998, "learning_rate": 2.5861232411450753e-05, "loss": 0.1218, "step": 6120 }, { "epoch": 5.351374945438673, "grad_norm": 3.1490137577056885, "learning_rate": 2.5812712275594374e-05, "loss": 0.1556, "step": 6130 }, { "epoch": 5.360104757747709, "grad_norm": 2.9647440910339355, "learning_rate": 2.576419213973799e-05, "loss": 0.126, "step": 6140 }, { "epoch": 5.368834570056744, "grad_norm": 3.315322160720825, "learning_rate": 2.5715672003881615e-05, "loss": 0.133, "step": 6150 }, { "epoch": 5.3775643823657795, "grad_norm": 2.5604379177093506, "learning_rate": 2.566715186802523e-05, "loss": 0.1437, "step": 6160 }, { "epoch": 5.386294194674814, "grad_norm": 1.5733325481414795, "learning_rate": 2.5618631732168853e-05, "loss": 0.1374, "step": 6170 }, { "epoch": 5.39502400698385, "grad_norm": 2.330185890197754, "learning_rate": 2.5570111596312467e-05, "loss": 0.1158, "step": 6180 }, { "epoch": 5.403753819292885, "grad_norm": 3.2574543952941895, "learning_rate": 2.552159146045609e-05, "loss": 0.1374, "step": 6190 }, { "epoch": 5.412483631601921, "grad_norm": 3.3817057609558105, "learning_rate": 2.547307132459971e-05, "loss": 0.1272, "step": 6200 }, { "epoch": 5.421213443910956, "grad_norm": 3.8969085216522217, "learning_rate": 2.5424551188743328e-05, "loss": 0.1297, "step": 6210 }, { "epoch": 5.429943256219992, "grad_norm": 2.5738420486450195, "learning_rate": 2.5376031052886952e-05, "loss": 0.121, "step": 6220 }, { "epoch": 5.4386730685290265, "grad_norm": 3.007840633392334, "learning_rate": 2.5327510917030566e-05, "loss": 0.1561, "step": 6230 }, { "epoch": 5.447402880838062, "grad_norm": 2.9330294132232666, "learning_rate": 2.527899078117419e-05, "loss": 0.1304, "step": 6240 }, { "epoch": 5.456132693147097, "grad_norm": 2.3257663249969482, "learning_rate": 2.523047064531781e-05, "loss": 0.1273, "step": 6250 }, { "epoch": 5.464862505456133, "grad_norm": 2.6171205043792725, "learning_rate": 2.5181950509461428e-05, "loss": 0.1135, "step": 6260 }, { "epoch": 5.473592317765168, "grad_norm": 3.440798759460449, "learning_rate": 2.5133430373605048e-05, "loss": 0.1246, "step": 6270 }, { "epoch": 5.482322130074204, "grad_norm": 3.418937921524048, "learning_rate": 2.5084910237748665e-05, "loss": 0.1354, "step": 6280 }, { "epoch": 5.491051942383239, "grad_norm": 2.9420177936553955, "learning_rate": 2.503639010189229e-05, "loss": 0.1405, "step": 6290 }, { "epoch": 5.4997817546922745, "grad_norm": 3.042564630508423, "learning_rate": 2.4987869966035906e-05, "loss": 0.1357, "step": 6300 }, { "epoch": 5.508511567001309, "grad_norm": 2.1227240562438965, "learning_rate": 2.4939349830179527e-05, "loss": 0.1417, "step": 6310 }, { "epoch": 5.517241379310345, "grad_norm": 3.1265482902526855, "learning_rate": 2.4890829694323144e-05, "loss": 0.1382, "step": 6320 }, { "epoch": 5.52597119161938, "grad_norm": 2.782801389694214, "learning_rate": 2.4842309558466765e-05, "loss": 0.1635, "step": 6330 }, { "epoch": 5.534701003928416, "grad_norm": 3.488227605819702, "learning_rate": 2.4793789422610382e-05, "loss": 0.1303, "step": 6340 }, { "epoch": 5.543430816237451, "grad_norm": 2.8877530097961426, "learning_rate": 2.4745269286754006e-05, "loss": 0.1676, "step": 6350 }, { "epoch": 5.552160628546487, "grad_norm": 2.2188923358917236, "learning_rate": 2.4696749150897623e-05, "loss": 0.1118, "step": 6360 }, { "epoch": 5.5608904408555215, "grad_norm": 2.9819607734680176, "learning_rate": 2.4648229015041243e-05, "loss": 0.1603, "step": 6370 }, { "epoch": 5.569620253164557, "grad_norm": 3.2080116271972656, "learning_rate": 2.4599708879184864e-05, "loss": 0.1304, "step": 6380 }, { "epoch": 5.578350065473592, "grad_norm": 1.6882622241973877, "learning_rate": 2.455118874332848e-05, "loss": 0.1171, "step": 6390 }, { "epoch": 5.587079877782628, "grad_norm": 2.5788047313690186, "learning_rate": 2.45026686074721e-05, "loss": 0.1193, "step": 6400 }, { "epoch": 5.595809690091663, "grad_norm": 3.859628915786743, "learning_rate": 2.445414847161572e-05, "loss": 0.124, "step": 6410 }, { "epoch": 5.604539502400699, "grad_norm": 1.5977929830551147, "learning_rate": 2.4405628335759343e-05, "loss": 0.1194, "step": 6420 }, { "epoch": 5.613269314709734, "grad_norm": 3.073011875152588, "learning_rate": 2.435710819990296e-05, "loss": 0.1344, "step": 6430 }, { "epoch": 5.6219991270187695, "grad_norm": 2.9999372959136963, "learning_rate": 2.430858806404658e-05, "loss": 0.1487, "step": 6440 }, { "epoch": 5.630728939327804, "grad_norm": 2.077570676803589, "learning_rate": 2.42600679281902e-05, "loss": 0.118, "step": 6450 }, { "epoch": 5.63945875163684, "grad_norm": 3.1440892219543457, "learning_rate": 2.4211547792333818e-05, "loss": 0.1394, "step": 6460 }, { "epoch": 5.648188563945875, "grad_norm": 2.836007833480835, "learning_rate": 2.416302765647744e-05, "loss": 0.1308, "step": 6470 }, { "epoch": 5.656918376254911, "grad_norm": 2.675652027130127, "learning_rate": 2.411450752062106e-05, "loss": 0.1208, "step": 6480 }, { "epoch": 5.665648188563946, "grad_norm": 2.833317756652832, "learning_rate": 2.406598738476468e-05, "loss": 0.1244, "step": 6490 }, { "epoch": 5.674378000872982, "grad_norm": 2.444568157196045, "learning_rate": 2.4017467248908297e-05, "loss": 0.1153, "step": 6500 }, { "epoch": 5.6831078131820165, "grad_norm": 3.581678628921509, "learning_rate": 2.3968947113051917e-05, "loss": 0.1431, "step": 6510 }, { "epoch": 5.691837625491052, "grad_norm": 1.7067352533340454, "learning_rate": 2.3920426977195538e-05, "loss": 0.1322, "step": 6520 }, { "epoch": 5.700567437800087, "grad_norm": 4.281068325042725, "learning_rate": 2.3871906841339155e-05, "loss": 0.1349, "step": 6530 }, { "epoch": 5.709297250109123, "grad_norm": 2.958136558532715, "learning_rate": 2.3823386705482776e-05, "loss": 0.1319, "step": 6540 }, { "epoch": 5.718027062418158, "grad_norm": 2.9312613010406494, "learning_rate": 2.3774866569626396e-05, "loss": 0.103, "step": 6550 }, { "epoch": 5.726756874727194, "grad_norm": 2.7693583965301514, "learning_rate": 2.3726346433770017e-05, "loss": 0.1283, "step": 6560 }, { "epoch": 5.735486687036229, "grad_norm": 2.7267799377441406, "learning_rate": 2.3677826297913634e-05, "loss": 0.1131, "step": 6570 }, { "epoch": 5.7442164993452645, "grad_norm": 2.184208631515503, "learning_rate": 2.3629306162057255e-05, "loss": 0.1289, "step": 6580 }, { "epoch": 5.752946311654299, "grad_norm": 2.8971517086029053, "learning_rate": 2.3580786026200875e-05, "loss": 0.1185, "step": 6590 }, { "epoch": 5.761676123963335, "grad_norm": 3.3640193939208984, "learning_rate": 2.3532265890344492e-05, "loss": 0.139, "step": 6600 }, { "epoch": 5.77040593627237, "grad_norm": 3.1321167945861816, "learning_rate": 2.3483745754488113e-05, "loss": 0.1594, "step": 6610 }, { "epoch": 5.779135748581406, "grad_norm": 2.4703168869018555, "learning_rate": 2.3435225618631733e-05, "loss": 0.1528, "step": 6620 }, { "epoch": 5.787865560890441, "grad_norm": 2.4260854721069336, "learning_rate": 2.3386705482775354e-05, "loss": 0.1272, "step": 6630 }, { "epoch": 5.796595373199477, "grad_norm": 2.809253692626953, "learning_rate": 2.333818534691897e-05, "loss": 0.1421, "step": 6640 }, { "epoch": 5.8053251855085115, "grad_norm": 2.53581166267395, "learning_rate": 2.328966521106259e-05, "loss": 0.1395, "step": 6650 }, { "epoch": 5.814054997817547, "grad_norm": 2.912879467010498, "learning_rate": 2.3241145075206212e-05, "loss": 0.1269, "step": 6660 }, { "epoch": 5.822784810126582, "grad_norm": 2.0264980792999268, "learning_rate": 2.319262493934983e-05, "loss": 0.1335, "step": 6670 }, { "epoch": 5.831514622435618, "grad_norm": 2.8756725788116455, "learning_rate": 2.3144104803493453e-05, "loss": 0.1246, "step": 6680 }, { "epoch": 5.840244434744653, "grad_norm": 2.49039363861084, "learning_rate": 2.309558466763707e-05, "loss": 0.1292, "step": 6690 }, { "epoch": 5.848974247053688, "grad_norm": 2.6733226776123047, "learning_rate": 2.304706453178069e-05, "loss": 0.1064, "step": 6700 }, { "epoch": 5.857704059362724, "grad_norm": 2.1536784172058105, "learning_rate": 2.2998544395924308e-05, "loss": 0.138, "step": 6710 }, { "epoch": 5.8664338716717594, "grad_norm": 3.428746461868286, "learning_rate": 2.295002426006793e-05, "loss": 0.1362, "step": 6720 }, { "epoch": 5.875163683980794, "grad_norm": 3.4653851985931396, "learning_rate": 2.290150412421155e-05, "loss": 0.1235, "step": 6730 }, { "epoch": 5.883893496289829, "grad_norm": 2.283843755722046, "learning_rate": 2.2852983988355166e-05, "loss": 0.1081, "step": 6740 }, { "epoch": 5.892623308598865, "grad_norm": 3.812995433807373, "learning_rate": 2.280446385249879e-05, "loss": 0.1305, "step": 6750 }, { "epoch": 5.901353120907901, "grad_norm": 2.6548011302948, "learning_rate": 2.2755943716642407e-05, "loss": 0.135, "step": 6760 }, { "epoch": 5.910082933216936, "grad_norm": 2.09759783744812, "learning_rate": 2.2707423580786028e-05, "loss": 0.1066, "step": 6770 }, { "epoch": 5.918812745525971, "grad_norm": 2.5961296558380127, "learning_rate": 2.2658903444929645e-05, "loss": 0.1312, "step": 6780 }, { "epoch": 5.9275425578350065, "grad_norm": 2.6198313236236572, "learning_rate": 2.2610383309073266e-05, "loss": 0.1213, "step": 6790 }, { "epoch": 5.936272370144042, "grad_norm": 2.5672667026519775, "learning_rate": 2.2561863173216886e-05, "loss": 0.1097, "step": 6800 }, { "epoch": 5.945002182453077, "grad_norm": 1.9707518815994263, "learning_rate": 2.2513343037360507e-05, "loss": 0.1015, "step": 6810 }, { "epoch": 5.953731994762112, "grad_norm": 2.9768049716949463, "learning_rate": 2.2464822901504127e-05, "loss": 0.1246, "step": 6820 }, { "epoch": 5.962461807071148, "grad_norm": 3.3005049228668213, "learning_rate": 2.2416302765647744e-05, "loss": 0.1286, "step": 6830 }, { "epoch": 5.971191619380184, "grad_norm": 4.172717094421387, "learning_rate": 2.2367782629791365e-05, "loss": 0.1346, "step": 6840 }, { "epoch": 5.979921431689219, "grad_norm": 3.4869165420532227, "learning_rate": 2.2319262493934982e-05, "loss": 0.1205, "step": 6850 }, { "epoch": 5.9886512439982535, "grad_norm": 3.327515125274658, "learning_rate": 2.2270742358078603e-05, "loss": 0.1267, "step": 6860 }, { "epoch": 5.997381056307289, "grad_norm": 3.0895607471466064, "learning_rate": 2.2222222222222223e-05, "loss": 0.1141, "step": 6870 }, { "epoch": 6.0, "eval_accuracy": 0.9722563221212865, "eval_loss": 0.08740793168544769, "eval_runtime": 61.534, "eval_samples_per_second": 264.764, "eval_steps_per_second": 8.288, "step": 6873 }, { "epoch": 6.006110868616325, "grad_norm": 2.24489688873291, "learning_rate": 2.2173702086365844e-05, "loss": 0.1132, "step": 6880 }, { "epoch": 6.01484068092536, "grad_norm": 3.8057761192321777, "learning_rate": 2.2125181950509464e-05, "loss": 0.123, "step": 6890 }, { "epoch": 6.023570493234396, "grad_norm": 2.2050459384918213, "learning_rate": 2.207666181465308e-05, "loss": 0.1365, "step": 6900 }, { "epoch": 6.032300305543431, "grad_norm": 3.0642194747924805, "learning_rate": 2.2028141678796702e-05, "loss": 0.1423, "step": 6910 }, { "epoch": 6.041030117852467, "grad_norm": 2.7323355674743652, "learning_rate": 2.197962154294032e-05, "loss": 0.1451, "step": 6920 }, { "epoch": 6.0497599301615015, "grad_norm": 3.7342848777770996, "learning_rate": 2.193110140708394e-05, "loss": 0.1157, "step": 6930 }, { "epoch": 6.058489742470537, "grad_norm": 1.9272093772888184, "learning_rate": 2.188258127122756e-05, "loss": 0.1019, "step": 6940 }, { "epoch": 6.067219554779572, "grad_norm": 2.0700807571411133, "learning_rate": 2.183406113537118e-05, "loss": 0.1223, "step": 6950 }, { "epoch": 6.075949367088608, "grad_norm": 2.6618826389312744, "learning_rate": 2.17855409995148e-05, "loss": 0.1181, "step": 6960 }, { "epoch": 6.084679179397643, "grad_norm": 4.103969573974609, "learning_rate": 2.173702086365842e-05, "loss": 0.1357, "step": 6970 }, { "epoch": 6.093408991706679, "grad_norm": 2.845344066619873, "learning_rate": 2.168850072780204e-05, "loss": 0.1183, "step": 6980 }, { "epoch": 6.102138804015714, "grad_norm": 2.7917563915252686, "learning_rate": 2.1639980591945656e-05, "loss": 0.1287, "step": 6990 }, { "epoch": 6.110868616324749, "grad_norm": 2.6179635524749756, "learning_rate": 2.1591460456089277e-05, "loss": 0.1346, "step": 7000 }, { "epoch": 6.119598428633784, "grad_norm": 1.979193091392517, "learning_rate": 2.1542940320232897e-05, "loss": 0.125, "step": 7010 }, { "epoch": 6.12832824094282, "grad_norm": 2.7642877101898193, "learning_rate": 2.1494420184376518e-05, "loss": 0.1275, "step": 7020 }, { "epoch": 6.137058053251855, "grad_norm": 2.8622918128967285, "learning_rate": 2.144590004852014e-05, "loss": 0.1109, "step": 7030 }, { "epoch": 6.145787865560891, "grad_norm": 2.6740987300872803, "learning_rate": 2.1397379912663756e-05, "loss": 0.1144, "step": 7040 }, { "epoch": 6.154517677869926, "grad_norm": 2.9361984729766846, "learning_rate": 2.1348859776807376e-05, "loss": 0.1058, "step": 7050 }, { "epoch": 6.163247490178962, "grad_norm": 2.7780208587646484, "learning_rate": 2.1300339640950993e-05, "loss": 0.1391, "step": 7060 }, { "epoch": 6.1719773024879965, "grad_norm": 3.0886313915252686, "learning_rate": 2.1251819505094614e-05, "loss": 0.134, "step": 7070 }, { "epoch": 6.180707114797032, "grad_norm": 2.369558334350586, "learning_rate": 2.1203299369238234e-05, "loss": 0.1589, "step": 7080 }, { "epoch": 6.189436927106067, "grad_norm": 2.826566219329834, "learning_rate": 2.1154779233381855e-05, "loss": 0.1125, "step": 7090 }, { "epoch": 6.198166739415103, "grad_norm": 3.705179452896118, "learning_rate": 2.1106259097525476e-05, "loss": 0.1422, "step": 7100 }, { "epoch": 6.206896551724138, "grad_norm": 2.0699045658111572, "learning_rate": 2.1057738961669093e-05, "loss": 0.1174, "step": 7110 }, { "epoch": 6.215626364033174, "grad_norm": 1.7472636699676514, "learning_rate": 2.1009218825812713e-05, "loss": 0.1, "step": 7120 }, { "epoch": 6.224356176342209, "grad_norm": 3.3158297538757324, "learning_rate": 2.096069868995633e-05, "loss": 0.1296, "step": 7130 }, { "epoch": 6.233085988651244, "grad_norm": 3.7028303146362305, "learning_rate": 2.0912178554099954e-05, "loss": 0.1189, "step": 7140 }, { "epoch": 6.241815800960279, "grad_norm": 3.309446334838867, "learning_rate": 2.086365841824357e-05, "loss": 0.1197, "step": 7150 }, { "epoch": 6.250545613269315, "grad_norm": 2.997817277908325, "learning_rate": 2.0815138282387192e-05, "loss": 0.095, "step": 7160 }, { "epoch": 6.25927542557835, "grad_norm": 3.9455857276916504, "learning_rate": 2.0766618146530813e-05, "loss": 0.1141, "step": 7170 }, { "epoch": 6.268005237887385, "grad_norm": 2.8850817680358887, "learning_rate": 2.071809801067443e-05, "loss": 0.1241, "step": 7180 }, { "epoch": 6.276735050196421, "grad_norm": 3.4822731018066406, "learning_rate": 2.066957787481805e-05, "loss": 0.1447, "step": 7190 }, { "epoch": 6.2854648625054566, "grad_norm": 1.834179401397705, "learning_rate": 2.0621057738961667e-05, "loss": 0.113, "step": 7200 }, { "epoch": 6.2941946748144915, "grad_norm": 2.8471169471740723, "learning_rate": 2.057253760310529e-05, "loss": 0.1095, "step": 7210 }, { "epoch": 6.302924487123526, "grad_norm": 2.841275930404663, "learning_rate": 2.052401746724891e-05, "loss": 0.1174, "step": 7220 }, { "epoch": 6.311654299432562, "grad_norm": 3.3309431076049805, "learning_rate": 2.047549733139253e-05, "loss": 0.1182, "step": 7230 }, { "epoch": 6.320384111741598, "grad_norm": 2.214808225631714, "learning_rate": 2.042697719553615e-05, "loss": 0.1345, "step": 7240 }, { "epoch": 6.329113924050633, "grad_norm": 2.0826220512390137, "learning_rate": 2.0378457059679767e-05, "loss": 0.1114, "step": 7250 }, { "epoch": 6.337843736359668, "grad_norm": 2.8715829849243164, "learning_rate": 2.0329936923823387e-05, "loss": 0.1335, "step": 7260 }, { "epoch": 6.346573548668704, "grad_norm": 2.2511582374572754, "learning_rate": 2.0281416787967008e-05, "loss": 0.1558, "step": 7270 }, { "epoch": 6.355303360977739, "grad_norm": 3.4435458183288574, "learning_rate": 2.023289665211063e-05, "loss": 0.1433, "step": 7280 }, { "epoch": 6.364033173286774, "grad_norm": 2.411850929260254, "learning_rate": 2.0184376516254246e-05, "loss": 0.1224, "step": 7290 }, { "epoch": 6.372762985595809, "grad_norm": 2.7871601581573486, "learning_rate": 2.0135856380397866e-05, "loss": 0.1206, "step": 7300 }, { "epoch": 6.381492797904845, "grad_norm": 2.9544012546539307, "learning_rate": 2.0087336244541487e-05, "loss": 0.133, "step": 7310 }, { "epoch": 6.390222610213881, "grad_norm": 3.283802032470703, "learning_rate": 2.0038816108685104e-05, "loss": 0.1259, "step": 7320 }, { "epoch": 6.398952422522916, "grad_norm": 2.1609585285186768, "learning_rate": 1.9990295972828724e-05, "loss": 0.1499, "step": 7330 }, { "epoch": 6.407682234831951, "grad_norm": 2.2527894973754883, "learning_rate": 1.9941775836972345e-05, "loss": 0.1206, "step": 7340 }, { "epoch": 6.4164120471409865, "grad_norm": 3.909329891204834, "learning_rate": 1.9893255701115965e-05, "loss": 0.099, "step": 7350 }, { "epoch": 6.425141859450022, "grad_norm": 2.997509002685547, "learning_rate": 1.9844735565259583e-05, "loss": 0.1399, "step": 7360 }, { "epoch": 6.433871671759057, "grad_norm": 2.639704465866089, "learning_rate": 1.9796215429403203e-05, "loss": 0.1224, "step": 7370 }, { "epoch": 6.442601484068092, "grad_norm": 3.1105806827545166, "learning_rate": 1.9747695293546824e-05, "loss": 0.1106, "step": 7380 }, { "epoch": 6.451331296377128, "grad_norm": 2.972954273223877, "learning_rate": 1.969917515769044e-05, "loss": 0.1216, "step": 7390 }, { "epoch": 6.460061108686164, "grad_norm": 3.996022939682007, "learning_rate": 1.965065502183406e-05, "loss": 0.1279, "step": 7400 }, { "epoch": 6.468790920995199, "grad_norm": 3.0898971557617188, "learning_rate": 1.9602134885977682e-05, "loss": 0.1247, "step": 7410 }, { "epoch": 6.4775207333042335, "grad_norm": 2.5472044944763184, "learning_rate": 1.9553614750121303e-05, "loss": 0.1411, "step": 7420 }, { "epoch": 6.486250545613269, "grad_norm": 3.311650514602661, "learning_rate": 1.950509461426492e-05, "loss": 0.1053, "step": 7430 }, { "epoch": 6.494980357922305, "grad_norm": 2.9363017082214355, "learning_rate": 1.945657447840854e-05, "loss": 0.1191, "step": 7440 }, { "epoch": 6.50371017023134, "grad_norm": 3.120633602142334, "learning_rate": 1.940805434255216e-05, "loss": 0.1202, "step": 7450 }, { "epoch": 6.512439982540375, "grad_norm": 3.49874210357666, "learning_rate": 1.9359534206695778e-05, "loss": 0.1128, "step": 7460 }, { "epoch": 6.521169794849411, "grad_norm": 2.98953914642334, "learning_rate": 1.9311014070839402e-05, "loss": 0.1191, "step": 7470 }, { "epoch": 6.5298996071584465, "grad_norm": 2.440988779067993, "learning_rate": 1.926249393498302e-05, "loss": 0.12, "step": 7480 }, { "epoch": 6.538629419467481, "grad_norm": 2.9546585083007812, "learning_rate": 1.921397379912664e-05, "loss": 0.143, "step": 7490 }, { "epoch": 6.547359231776516, "grad_norm": 1.7305012941360474, "learning_rate": 1.9165453663270257e-05, "loss": 0.1083, "step": 7500 }, { "epoch": 6.556089044085552, "grad_norm": 2.084421396255493, "learning_rate": 1.9116933527413877e-05, "loss": 0.1228, "step": 7510 }, { "epoch": 6.564818856394588, "grad_norm": 3.237086057662964, "learning_rate": 1.9068413391557498e-05, "loss": 0.1265, "step": 7520 }, { "epoch": 6.573548668703623, "grad_norm": 2.24908185005188, "learning_rate": 1.9019893255701115e-05, "loss": 0.0891, "step": 7530 }, { "epoch": 6.582278481012658, "grad_norm": 3.3673548698425293, "learning_rate": 1.897137311984474e-05, "loss": 0.1377, "step": 7540 }, { "epoch": 6.591008293321694, "grad_norm": 2.560291051864624, "learning_rate": 1.8922852983988356e-05, "loss": 0.1114, "step": 7550 }, { "epoch": 6.599738105630729, "grad_norm": 1.9948253631591797, "learning_rate": 1.8874332848131977e-05, "loss": 0.1261, "step": 7560 }, { "epoch": 6.608467917939764, "grad_norm": 3.7410624027252197, "learning_rate": 1.8825812712275594e-05, "loss": 0.1111, "step": 7570 }, { "epoch": 6.617197730248799, "grad_norm": 3.520691156387329, "learning_rate": 1.8777292576419214e-05, "loss": 0.1396, "step": 7580 }, { "epoch": 6.625927542557835, "grad_norm": 2.289947986602783, "learning_rate": 1.8728772440562835e-05, "loss": 0.1211, "step": 7590 }, { "epoch": 6.634657354866871, "grad_norm": 2.078162431716919, "learning_rate": 1.8680252304706455e-05, "loss": 0.1258, "step": 7600 }, { "epoch": 6.643387167175906, "grad_norm": 2.3327958583831787, "learning_rate": 1.8631732168850076e-05, "loss": 0.1177, "step": 7610 }, { "epoch": 6.652116979484941, "grad_norm": 2.689931631088257, "learning_rate": 1.8583212032993693e-05, "loss": 0.1144, "step": 7620 }, { "epoch": 6.660846791793976, "grad_norm": 2.395716428756714, "learning_rate": 1.8534691897137314e-05, "loss": 0.1202, "step": 7630 }, { "epoch": 6.669576604103012, "grad_norm": 2.892514705657959, "learning_rate": 1.848617176128093e-05, "loss": 0.1227, "step": 7640 }, { "epoch": 6.678306416412047, "grad_norm": 2.741924285888672, "learning_rate": 1.843765162542455e-05, "loss": 0.0994, "step": 7650 }, { "epoch": 6.687036228721082, "grad_norm": 3.1253838539123535, "learning_rate": 1.8389131489568172e-05, "loss": 0.1212, "step": 7660 }, { "epoch": 6.695766041030118, "grad_norm": 3.520061492919922, "learning_rate": 1.8340611353711792e-05, "loss": 0.1113, "step": 7670 }, { "epoch": 6.704495853339154, "grad_norm": 1.9933674335479736, "learning_rate": 1.8292091217855413e-05, "loss": 0.0858, "step": 7680 }, { "epoch": 6.713225665648189, "grad_norm": 2.0083909034729004, "learning_rate": 1.824357108199903e-05, "loss": 0.1199, "step": 7690 }, { "epoch": 6.7219554779572235, "grad_norm": 3.1717681884765625, "learning_rate": 1.819505094614265e-05, "loss": 0.1207, "step": 7700 }, { "epoch": 6.730685290266259, "grad_norm": 1.9426745176315308, "learning_rate": 1.8146530810286268e-05, "loss": 0.1157, "step": 7710 }, { "epoch": 6.739415102575295, "grad_norm": 2.9203710556030273, "learning_rate": 1.809801067442989e-05, "loss": 0.1251, "step": 7720 }, { "epoch": 6.74814491488433, "grad_norm": 3.200591564178467, "learning_rate": 1.804949053857351e-05, "loss": 0.1481, "step": 7730 }, { "epoch": 6.756874727193365, "grad_norm": 4.032431125640869, "learning_rate": 1.800097040271713e-05, "loss": 0.135, "step": 7740 }, { "epoch": 6.765604539502401, "grad_norm": 2.4872477054595947, "learning_rate": 1.795245026686075e-05, "loss": 0.0975, "step": 7750 }, { "epoch": 6.7743343518114365, "grad_norm": 2.5336356163024902, "learning_rate": 1.7903930131004367e-05, "loss": 0.1311, "step": 7760 }, { "epoch": 6.783064164120471, "grad_norm": 2.6946463584899902, "learning_rate": 1.7855409995147988e-05, "loss": 0.1073, "step": 7770 }, { "epoch": 6.791793976429506, "grad_norm": 3.247837543487549, "learning_rate": 1.7806889859291605e-05, "loss": 0.1126, "step": 7780 }, { "epoch": 6.800523788738542, "grad_norm": 2.301456928253174, "learning_rate": 1.7758369723435225e-05, "loss": 0.1314, "step": 7790 }, { "epoch": 6.809253601047578, "grad_norm": 1.9753727912902832, "learning_rate": 1.7709849587578846e-05, "loss": 0.1, "step": 7800 }, { "epoch": 6.817983413356613, "grad_norm": 2.194286584854126, "learning_rate": 1.7661329451722467e-05, "loss": 0.1114, "step": 7810 }, { "epoch": 6.826713225665648, "grad_norm": 2.973609447479248, "learning_rate": 1.7612809315866087e-05, "loss": 0.1065, "step": 7820 }, { "epoch": 6.8354430379746836, "grad_norm": 2.5955142974853516, "learning_rate": 1.7564289180009704e-05, "loss": 0.1326, "step": 7830 }, { "epoch": 6.844172850283719, "grad_norm": 2.9520103931427, "learning_rate": 1.7515769044153325e-05, "loss": 0.1285, "step": 7840 }, { "epoch": 6.852902662592754, "grad_norm": 3.955249071121216, "learning_rate": 1.7467248908296942e-05, "loss": 0.1029, "step": 7850 }, { "epoch": 6.861632474901789, "grad_norm": 2.9063422679901123, "learning_rate": 1.7418728772440563e-05, "loss": 0.0971, "step": 7860 }, { "epoch": 6.870362287210825, "grad_norm": 2.6652753353118896, "learning_rate": 1.7370208636584183e-05, "loss": 0.1306, "step": 7870 }, { "epoch": 6.879092099519861, "grad_norm": 2.687307834625244, "learning_rate": 1.7321688500727804e-05, "loss": 0.1265, "step": 7880 }, { "epoch": 6.887821911828896, "grad_norm": 2.639251708984375, "learning_rate": 1.7273168364871424e-05, "loss": 0.1162, "step": 7890 }, { "epoch": 6.896551724137931, "grad_norm": 3.628904104232788, "learning_rate": 1.722464822901504e-05, "loss": 0.1368, "step": 7900 }, { "epoch": 6.905281536446966, "grad_norm": 2.818596124649048, "learning_rate": 1.7176128093158662e-05, "loss": 0.1304, "step": 7910 }, { "epoch": 6.914011348756002, "grad_norm": 2.2759499549865723, "learning_rate": 1.712760795730228e-05, "loss": 0.1264, "step": 7920 }, { "epoch": 6.922741161065037, "grad_norm": 3.426044225692749, "learning_rate": 1.7079087821445903e-05, "loss": 0.1131, "step": 7930 }, { "epoch": 6.931470973374072, "grad_norm": 1.9681727886199951, "learning_rate": 1.703056768558952e-05, "loss": 0.1231, "step": 7940 }, { "epoch": 6.940200785683108, "grad_norm": 1.9072273969650269, "learning_rate": 1.698204754973314e-05, "loss": 0.096, "step": 7950 }, { "epoch": 6.948930597992144, "grad_norm": 2.6058807373046875, "learning_rate": 1.693352741387676e-05, "loss": 0.1137, "step": 7960 }, { "epoch": 6.9576604103011785, "grad_norm": 1.815313696861267, "learning_rate": 1.688500727802038e-05, "loss": 0.0869, "step": 7970 }, { "epoch": 6.9663902226102135, "grad_norm": 3.1374306678771973, "learning_rate": 1.6836487142164e-05, "loss": 0.1256, "step": 7980 }, { "epoch": 6.975120034919249, "grad_norm": 1.8264007568359375, "learning_rate": 1.6787967006307616e-05, "loss": 0.0917, "step": 7990 }, { "epoch": 6.983849847228285, "grad_norm": 2.1605708599090576, "learning_rate": 1.673944687045124e-05, "loss": 0.134, "step": 8000 }, { "epoch": 6.99257965953732, "grad_norm": 2.8631725311279297, "learning_rate": 1.6690926734594857e-05, "loss": 0.1275, "step": 8010 }, { "epoch": 6.999563509384548, "eval_accuracy": 0.9620058924625583, "eval_loss": 0.12257199734449387, "eval_runtime": 61.6953, "eval_samples_per_second": 264.072, "eval_steps_per_second": 8.266, "step": 8018 }, { "epoch": 7.001309471846355, "grad_norm": 2.7233829498291016, "learning_rate": 1.6642406598738478e-05, "loss": 0.1128, "step": 8020 }, { "epoch": 7.010039284155391, "grad_norm": 2.6346898078918457, "learning_rate": 1.6593886462882098e-05, "loss": 0.107, "step": 8030 }, { "epoch": 7.018769096464426, "grad_norm": 3.0289077758789062, "learning_rate": 1.6545366327025715e-05, "loss": 0.1282, "step": 8040 }, { "epoch": 7.027498908773461, "grad_norm": 3.5590922832489014, "learning_rate": 1.6496846191169336e-05, "loss": 0.1248, "step": 8050 }, { "epoch": 7.036228721082496, "grad_norm": 2.1778759956359863, "learning_rate": 1.6448326055312957e-05, "loss": 0.1206, "step": 8060 }, { "epoch": 7.044958533391532, "grad_norm": 3.13328218460083, "learning_rate": 1.6399805919456577e-05, "loss": 0.1064, "step": 8070 }, { "epoch": 7.053688345700567, "grad_norm": 2.862576484680176, "learning_rate": 1.6351285783600194e-05, "loss": 0.1133, "step": 8080 }, { "epoch": 7.062418158009603, "grad_norm": 2.9792587757110596, "learning_rate": 1.6302765647743815e-05, "loss": 0.1542, "step": 8090 }, { "epoch": 7.071147970318638, "grad_norm": 3.1806788444519043, "learning_rate": 1.6254245511887435e-05, "loss": 0.1284, "step": 8100 }, { "epoch": 7.0798777826276735, "grad_norm": 2.1008007526397705, "learning_rate": 1.6205725376031052e-05, "loss": 0.0993, "step": 8110 }, { "epoch": 7.0886075949367084, "grad_norm": 3.443948984146118, "learning_rate": 1.6157205240174673e-05, "loss": 0.1321, "step": 8120 }, { "epoch": 7.097337407245744, "grad_norm": 3.075568675994873, "learning_rate": 1.6108685104318294e-05, "loss": 0.1135, "step": 8130 }, { "epoch": 7.106067219554779, "grad_norm": 2.7205452919006348, "learning_rate": 1.6060164968461914e-05, "loss": 0.1045, "step": 8140 }, { "epoch": 7.114797031863815, "grad_norm": 1.9306424856185913, "learning_rate": 1.601164483260553e-05, "loss": 0.1155, "step": 8150 }, { "epoch": 7.12352684417285, "grad_norm": 1.848753571510315, "learning_rate": 1.5963124696749152e-05, "loss": 0.121, "step": 8160 }, { "epoch": 7.132256656481886, "grad_norm": 2.816012144088745, "learning_rate": 1.5914604560892772e-05, "loss": 0.1195, "step": 8170 }, { "epoch": 7.140986468790921, "grad_norm": 1.6042299270629883, "learning_rate": 1.586608442503639e-05, "loss": 0.109, "step": 8180 }, { "epoch": 7.149716281099956, "grad_norm": 1.9858665466308594, "learning_rate": 1.581756428918001e-05, "loss": 0.1218, "step": 8190 }, { "epoch": 7.158446093408991, "grad_norm": 3.661896228790283, "learning_rate": 1.576904415332363e-05, "loss": 0.1162, "step": 8200 }, { "epoch": 7.167175905718027, "grad_norm": 3.4283740520477295, "learning_rate": 1.572052401746725e-05, "loss": 0.1192, "step": 8210 }, { "epoch": 7.175905718027062, "grad_norm": 2.0922162532806396, "learning_rate": 1.567200388161087e-05, "loss": 0.1228, "step": 8220 }, { "epoch": 7.184635530336098, "grad_norm": 3.018186330795288, "learning_rate": 1.562348374575449e-05, "loss": 0.1239, "step": 8230 }, { "epoch": 7.193365342645133, "grad_norm": 3.431612491607666, "learning_rate": 1.557496360989811e-05, "loss": 0.1199, "step": 8240 }, { "epoch": 7.2020951549541685, "grad_norm": 1.7211098670959473, "learning_rate": 1.5526443474041727e-05, "loss": 0.1241, "step": 8250 }, { "epoch": 7.210824967263203, "grad_norm": 3.201613664627075, "learning_rate": 1.547792333818535e-05, "loss": 0.1528, "step": 8260 }, { "epoch": 7.219554779572239, "grad_norm": 2.8337209224700928, "learning_rate": 1.5429403202328968e-05, "loss": 0.1037, "step": 8270 }, { "epoch": 7.228284591881274, "grad_norm": 2.8004138469696045, "learning_rate": 1.5380883066472588e-05, "loss": 0.1221, "step": 8280 }, { "epoch": 7.23701440419031, "grad_norm": 4.067490577697754, "learning_rate": 1.5332362930616205e-05, "loss": 0.1399, "step": 8290 }, { "epoch": 7.245744216499345, "grad_norm": 3.5075736045837402, "learning_rate": 1.5283842794759826e-05, "loss": 0.1412, "step": 8300 }, { "epoch": 7.254474028808381, "grad_norm": 3.4626917839050293, "learning_rate": 1.5235322658903445e-05, "loss": 0.1311, "step": 8310 }, { "epoch": 7.263203841117416, "grad_norm": 3.102483034133911, "learning_rate": 1.5186802523047064e-05, "loss": 0.1054, "step": 8320 }, { "epoch": 7.271933653426451, "grad_norm": 3.181889533996582, "learning_rate": 1.5138282387190686e-05, "loss": 0.1469, "step": 8330 }, { "epoch": 7.280663465735486, "grad_norm": 2.642778158187866, "learning_rate": 1.5089762251334305e-05, "loss": 0.1359, "step": 8340 }, { "epoch": 7.289393278044522, "grad_norm": 2.339479446411133, "learning_rate": 1.5041242115477924e-05, "loss": 0.0924, "step": 8350 }, { "epoch": 7.298123090353557, "grad_norm": 3.010129690170288, "learning_rate": 1.4992721979621544e-05, "loss": 0.1211, "step": 8360 }, { "epoch": 7.306852902662593, "grad_norm": 2.169443130493164, "learning_rate": 1.4944201843765163e-05, "loss": 0.0962, "step": 8370 }, { "epoch": 7.315582714971628, "grad_norm": 2.3417961597442627, "learning_rate": 1.4895681707908782e-05, "loss": 0.1263, "step": 8380 }, { "epoch": 7.3243125272806635, "grad_norm": 2.7159245014190674, "learning_rate": 1.4847161572052404e-05, "loss": 0.0773, "step": 8390 }, { "epoch": 7.333042339589698, "grad_norm": 1.89850652217865, "learning_rate": 1.4798641436196023e-05, "loss": 0.1134, "step": 8400 }, { "epoch": 7.341772151898734, "grad_norm": 2.856013536453247, "learning_rate": 1.4750121300339642e-05, "loss": 0.1278, "step": 8410 }, { "epoch": 7.350501964207769, "grad_norm": 2.531646728515625, "learning_rate": 1.470160116448326e-05, "loss": 0.1234, "step": 8420 }, { "epoch": 7.359231776516805, "grad_norm": 3.2163310050964355, "learning_rate": 1.4653081028626881e-05, "loss": 0.1112, "step": 8430 }, { "epoch": 7.36796158882584, "grad_norm": 2.611832618713379, "learning_rate": 1.46045608927705e-05, "loss": 0.1146, "step": 8440 }, { "epoch": 7.376691401134876, "grad_norm": 3.106451988220215, "learning_rate": 1.4556040756914119e-05, "loss": 0.1088, "step": 8450 }, { "epoch": 7.385421213443911, "grad_norm": 2.7591700553894043, "learning_rate": 1.4507520621057741e-05, "loss": 0.1313, "step": 8460 }, { "epoch": 7.394151025752946, "grad_norm": 2.3061065673828125, "learning_rate": 1.445900048520136e-05, "loss": 0.1187, "step": 8470 }, { "epoch": 7.402880838061981, "grad_norm": 2.7407071590423584, "learning_rate": 1.4410480349344979e-05, "loss": 0.1076, "step": 8480 }, { "epoch": 7.411610650371017, "grad_norm": 3.5780117511749268, "learning_rate": 1.4361960213488598e-05, "loss": 0.1261, "step": 8490 }, { "epoch": 7.420340462680052, "grad_norm": 2.2158961296081543, "learning_rate": 1.4313440077632218e-05, "loss": 0.1042, "step": 8500 }, { "epoch": 7.429070274989088, "grad_norm": 3.758617639541626, "learning_rate": 1.4264919941775837e-05, "loss": 0.1146, "step": 8510 }, { "epoch": 7.437800087298123, "grad_norm": 3.035114049911499, "learning_rate": 1.421639980591946e-05, "loss": 0.1149, "step": 8520 }, { "epoch": 7.4465298996071585, "grad_norm": 3.016139030456543, "learning_rate": 1.4167879670063078e-05, "loss": 0.0983, "step": 8530 }, { "epoch": 7.455259711916193, "grad_norm": 1.1190143823623657, "learning_rate": 1.4119359534206697e-05, "loss": 0.1016, "step": 8540 }, { "epoch": 7.463989524225229, "grad_norm": 1.6610057353973389, "learning_rate": 1.4070839398350316e-05, "loss": 0.1098, "step": 8550 }, { "epoch": 7.472719336534264, "grad_norm": 2.246140956878662, "learning_rate": 1.4022319262493935e-05, "loss": 0.1226, "step": 8560 }, { "epoch": 7.4814491488433, "grad_norm": 3.7668988704681396, "learning_rate": 1.3973799126637555e-05, "loss": 0.0909, "step": 8570 }, { "epoch": 7.490178961152335, "grad_norm": 1.9712340831756592, "learning_rate": 1.3925278990781174e-05, "loss": 0.1065, "step": 8580 }, { "epoch": 7.498908773461371, "grad_norm": 2.9757723808288574, "learning_rate": 1.3876758854924796e-05, "loss": 0.1222, "step": 8590 }, { "epoch": 7.5076385857704055, "grad_norm": 3.4910058975219727, "learning_rate": 1.3828238719068415e-05, "loss": 0.1156, "step": 8600 }, { "epoch": 7.516368398079441, "grad_norm": 2.6508007049560547, "learning_rate": 1.3779718583212034e-05, "loss": 0.099, "step": 8610 }, { "epoch": 7.525098210388476, "grad_norm": 1.8022457361221313, "learning_rate": 1.3731198447355653e-05, "loss": 0.0829, "step": 8620 }, { "epoch": 7.533828022697512, "grad_norm": 2.3336634635925293, "learning_rate": 1.3682678311499272e-05, "loss": 0.1409, "step": 8630 }, { "epoch": 7.542557835006547, "grad_norm": 3.1852900981903076, "learning_rate": 1.3634158175642892e-05, "loss": 0.125, "step": 8640 }, { "epoch": 7.551287647315583, "grad_norm": 2.8601016998291016, "learning_rate": 1.3585638039786511e-05, "loss": 0.1035, "step": 8650 }, { "epoch": 7.560017459624618, "grad_norm": 2.5289573669433594, "learning_rate": 1.3537117903930133e-05, "loss": 0.1093, "step": 8660 }, { "epoch": 7.5687472719336535, "grad_norm": 2.4447848796844482, "learning_rate": 1.3488597768073752e-05, "loss": 0.1076, "step": 8670 }, { "epoch": 7.577477084242688, "grad_norm": 3.016014337539673, "learning_rate": 1.3440077632217371e-05, "loss": 0.1189, "step": 8680 }, { "epoch": 7.586206896551724, "grad_norm": 2.6992433071136475, "learning_rate": 1.339155749636099e-05, "loss": 0.1088, "step": 8690 }, { "epoch": 7.594936708860759, "grad_norm": 2.2244362831115723, "learning_rate": 1.3343037360504609e-05, "loss": 0.0965, "step": 8700 }, { "epoch": 7.603666521169795, "grad_norm": 2.8653414249420166, "learning_rate": 1.329451722464823e-05, "loss": 0.1022, "step": 8710 }, { "epoch": 7.61239633347883, "grad_norm": 3.810765027999878, "learning_rate": 1.324599708879185e-05, "loss": 0.1208, "step": 8720 }, { "epoch": 7.621126145787866, "grad_norm": 2.089237689971924, "learning_rate": 1.319747695293547e-05, "loss": 0.0989, "step": 8730 }, { "epoch": 7.6298559580969005, "grad_norm": 1.9120042324066162, "learning_rate": 1.314895681707909e-05, "loss": 0.112, "step": 8740 }, { "epoch": 7.638585770405936, "grad_norm": 2.164149284362793, "learning_rate": 1.3100436681222708e-05, "loss": 0.1183, "step": 8750 }, { "epoch": 7.647315582714971, "grad_norm": 1.7820502519607544, "learning_rate": 1.3051916545366327e-05, "loss": 0.1187, "step": 8760 }, { "epoch": 7.656045395024007, "grad_norm": 3.0987606048583984, "learning_rate": 1.3003396409509946e-05, "loss": 0.1192, "step": 8770 }, { "epoch": 7.664775207333042, "grad_norm": 2.332767963409424, "learning_rate": 1.2954876273653566e-05, "loss": 0.1219, "step": 8780 }, { "epoch": 7.673505019642078, "grad_norm": 2.8338541984558105, "learning_rate": 1.2906356137797187e-05, "loss": 0.0953, "step": 8790 }, { "epoch": 7.682234831951113, "grad_norm": 2.000577926635742, "learning_rate": 1.2857836001940808e-05, "loss": 0.0969, "step": 8800 }, { "epoch": 7.6909646442601485, "grad_norm": 3.364076614379883, "learning_rate": 1.2809315866084426e-05, "loss": 0.1117, "step": 8810 }, { "epoch": 7.699694456569183, "grad_norm": 4.16958475112915, "learning_rate": 1.2760795730228045e-05, "loss": 0.1216, "step": 8820 }, { "epoch": 7.708424268878219, "grad_norm": 2.2611634731292725, "learning_rate": 1.2712275594371664e-05, "loss": 0.0929, "step": 8830 }, { "epoch": 7.717154081187254, "grad_norm": 2.592312812805176, "learning_rate": 1.2663755458515283e-05, "loss": 0.1147, "step": 8840 }, { "epoch": 7.72588389349629, "grad_norm": 3.680349588394165, "learning_rate": 1.2615235322658905e-05, "loss": 0.1184, "step": 8850 }, { "epoch": 7.734613705805325, "grad_norm": 3.146328926086426, "learning_rate": 1.2566715186802524e-05, "loss": 0.1083, "step": 8860 }, { "epoch": 7.743343518114361, "grad_norm": 3.165249824523926, "learning_rate": 1.2518195050946145e-05, "loss": 0.1303, "step": 8870 }, { "epoch": 7.7520733304233955, "grad_norm": 3.2401604652404785, "learning_rate": 1.2469674915089763e-05, "loss": 0.1032, "step": 8880 }, { "epoch": 7.760803142732431, "grad_norm": 1.9936988353729248, "learning_rate": 1.2421154779233382e-05, "loss": 0.1187, "step": 8890 }, { "epoch": 7.769532955041466, "grad_norm": 2.625159502029419, "learning_rate": 1.2372634643377003e-05, "loss": 0.1062, "step": 8900 }, { "epoch": 7.778262767350502, "grad_norm": 1.672642707824707, "learning_rate": 1.2324114507520622e-05, "loss": 0.1183, "step": 8910 }, { "epoch": 7.786992579659537, "grad_norm": 2.378697156906128, "learning_rate": 1.227559437166424e-05, "loss": 0.1197, "step": 8920 }, { "epoch": 7.795722391968573, "grad_norm": 2.9251904487609863, "learning_rate": 1.222707423580786e-05, "loss": 0.1141, "step": 8930 }, { "epoch": 7.804452204277608, "grad_norm": 1.902925968170166, "learning_rate": 1.217855409995148e-05, "loss": 0.1034, "step": 8940 }, { "epoch": 7.8131820165866435, "grad_norm": 2.213522434234619, "learning_rate": 1.21300339640951e-05, "loss": 0.1124, "step": 8950 }, { "epoch": 7.821911828895678, "grad_norm": 3.5406954288482666, "learning_rate": 1.208151382823872e-05, "loss": 0.0988, "step": 8960 }, { "epoch": 7.830641641204714, "grad_norm": 2.070265531539917, "learning_rate": 1.203299369238234e-05, "loss": 0.1182, "step": 8970 }, { "epoch": 7.839371453513749, "grad_norm": 3.4907076358795166, "learning_rate": 1.1984473556525959e-05, "loss": 0.1009, "step": 8980 }, { "epoch": 7.848101265822785, "grad_norm": 2.2645416259765625, "learning_rate": 1.1935953420669578e-05, "loss": 0.1021, "step": 8990 }, { "epoch": 7.85683107813182, "grad_norm": 2.2879273891448975, "learning_rate": 1.1887433284813198e-05, "loss": 0.1166, "step": 9000 }, { "epoch": 7.865560890440856, "grad_norm": 2.280569553375244, "learning_rate": 1.1838913148956817e-05, "loss": 0.1145, "step": 9010 }, { "epoch": 7.8742907027498905, "grad_norm": 2.2898128032684326, "learning_rate": 1.1790393013100438e-05, "loss": 0.095, "step": 9020 }, { "epoch": 7.883020515058926, "grad_norm": 2.112298011779785, "learning_rate": 1.1741872877244056e-05, "loss": 0.0834, "step": 9030 }, { "epoch": 7.891750327367961, "grad_norm": 3.349541425704956, "learning_rate": 1.1693352741387677e-05, "loss": 0.1125, "step": 9040 }, { "epoch": 7.900480139676997, "grad_norm": 2.482618570327759, "learning_rate": 1.1644832605531296e-05, "loss": 0.1059, "step": 9050 }, { "epoch": 7.909209951986032, "grad_norm": 2.8729076385498047, "learning_rate": 1.1596312469674915e-05, "loss": 0.1312, "step": 9060 }, { "epoch": 7.917939764295068, "grad_norm": 2.159461498260498, "learning_rate": 1.1547792333818535e-05, "loss": 0.1032, "step": 9070 }, { "epoch": 7.926669576604103, "grad_norm": 1.993208646774292, "learning_rate": 1.1499272197962154e-05, "loss": 0.1139, "step": 9080 }, { "epoch": 7.9353993889131385, "grad_norm": 2.4738166332244873, "learning_rate": 1.1450752062105775e-05, "loss": 0.1021, "step": 9090 }, { "epoch": 7.944129201222173, "grad_norm": 2.5491795539855957, "learning_rate": 1.1402231926249395e-05, "loss": 0.1195, "step": 9100 }, { "epoch": 7.952859013531209, "grad_norm": 2.6320948600769043, "learning_rate": 1.1353711790393014e-05, "loss": 0.0882, "step": 9110 }, { "epoch": 7.961588825840244, "grad_norm": 2.642817497253418, "learning_rate": 1.1305191654536633e-05, "loss": 0.1278, "step": 9120 }, { "epoch": 7.97031863814928, "grad_norm": 1.8427996635437012, "learning_rate": 1.1256671518680253e-05, "loss": 0.1091, "step": 9130 }, { "epoch": 7.979048450458315, "grad_norm": 3.3067104816436768, "learning_rate": 1.1208151382823872e-05, "loss": 0.1177, "step": 9140 }, { "epoch": 7.987778262767351, "grad_norm": 1.9407857656478882, "learning_rate": 1.1159631246967491e-05, "loss": 0.1146, "step": 9150 }, { "epoch": 7.9965080750763855, "grad_norm": 2.7859206199645996, "learning_rate": 1.1111111111111112e-05, "loss": 0.1323, "step": 9160 }, { "epoch": 8.0, "eval_accuracy": 0.9777191259513872, "eval_loss": 0.07020638883113861, "eval_runtime": 60.5088, "eval_samples_per_second": 269.25, "eval_steps_per_second": 8.429, "step": 9164 }, { "epoch": 8.005237887385421, "grad_norm": 3.0209567546844482, "learning_rate": 1.1062590975254732e-05, "loss": 0.1044, "step": 9170 }, { "epoch": 8.013967699694456, "grad_norm": 3.3612194061279297, "learning_rate": 1.1014070839398351e-05, "loss": 0.1367, "step": 9180 }, { "epoch": 8.022697512003491, "grad_norm": 3.807859182357788, "learning_rate": 1.096555070354197e-05, "loss": 0.0977, "step": 9190 }, { "epoch": 8.031427324312528, "grad_norm": 1.820942759513855, "learning_rate": 1.091703056768559e-05, "loss": 0.1027, "step": 9200 }, { "epoch": 8.040157136621563, "grad_norm": 1.7831873893737793, "learning_rate": 1.086851043182921e-05, "loss": 0.114, "step": 9210 }, { "epoch": 8.048886948930598, "grad_norm": 2.5594382286071777, "learning_rate": 1.0819990295972828e-05, "loss": 0.1036, "step": 9220 }, { "epoch": 8.057616761239633, "grad_norm": 2.1451315879821777, "learning_rate": 1.0771470160116449e-05, "loss": 0.1002, "step": 9230 }, { "epoch": 8.06634657354867, "grad_norm": 2.5839290618896484, "learning_rate": 1.072295002426007e-05, "loss": 0.1026, "step": 9240 }, { "epoch": 8.075076385857704, "grad_norm": 2.6166088581085205, "learning_rate": 1.0674429888403688e-05, "loss": 0.1, "step": 9250 }, { "epoch": 8.083806198166739, "grad_norm": 3.226916790008545, "learning_rate": 1.0625909752547307e-05, "loss": 0.0857, "step": 9260 }, { "epoch": 8.092536010475774, "grad_norm": 2.4332773685455322, "learning_rate": 1.0577389616690927e-05, "loss": 0.0862, "step": 9270 }, { "epoch": 8.10126582278481, "grad_norm": 2.966545820236206, "learning_rate": 1.0528869480834546e-05, "loss": 0.1025, "step": 9280 }, { "epoch": 8.109995635093846, "grad_norm": 2.112661123275757, "learning_rate": 1.0480349344978165e-05, "loss": 0.0996, "step": 9290 }, { "epoch": 8.11872544740288, "grad_norm": 3.0900607109069824, "learning_rate": 1.0431829209121786e-05, "loss": 0.1052, "step": 9300 }, { "epoch": 8.127455259711915, "grad_norm": 2.729537010192871, "learning_rate": 1.0383309073265406e-05, "loss": 0.1012, "step": 9310 }, { "epoch": 8.136185072020952, "grad_norm": 2.780118227005005, "learning_rate": 1.0334788937409025e-05, "loss": 0.1136, "step": 9320 }, { "epoch": 8.144914884329987, "grad_norm": 1.7321745157241821, "learning_rate": 1.0286268801552646e-05, "loss": 0.1115, "step": 9330 }, { "epoch": 8.153644696639022, "grad_norm": 2.258513927459717, "learning_rate": 1.0237748665696265e-05, "loss": 0.1353, "step": 9340 }, { "epoch": 8.162374508948057, "grad_norm": 3.355522632598877, "learning_rate": 1.0189228529839883e-05, "loss": 0.0798, "step": 9350 }, { "epoch": 8.171104321257094, "grad_norm": 1.8760606050491333, "learning_rate": 1.0140708393983504e-05, "loss": 0.1136, "step": 9360 }, { "epoch": 8.179834133566128, "grad_norm": 2.638148546218872, "learning_rate": 1.0092188258127123e-05, "loss": 0.1096, "step": 9370 }, { "epoch": 8.188563945875163, "grad_norm": 3.003227472305298, "learning_rate": 1.0043668122270743e-05, "loss": 0.1027, "step": 9380 }, { "epoch": 8.197293758184198, "grad_norm": 2.2695999145507812, "learning_rate": 9.995147986414362e-06, "loss": 0.1027, "step": 9390 }, { "epoch": 8.206023570493235, "grad_norm": 2.753089666366577, "learning_rate": 9.946627850557983e-06, "loss": 0.1131, "step": 9400 }, { "epoch": 8.21475338280227, "grad_norm": 2.087954521179199, "learning_rate": 9.898107714701602e-06, "loss": 0.1096, "step": 9410 }, { "epoch": 8.223483195111305, "grad_norm": 2.1234874725341797, "learning_rate": 9.84958757884522e-06, "loss": 0.1281, "step": 9420 }, { "epoch": 8.23221300742034, "grad_norm": 2.573425531387329, "learning_rate": 9.801067442988841e-06, "loss": 0.1221, "step": 9430 }, { "epoch": 8.240942819729376, "grad_norm": 2.972698211669922, "learning_rate": 9.75254730713246e-06, "loss": 0.1063, "step": 9440 }, { "epoch": 8.249672632038411, "grad_norm": 3.052992105484009, "learning_rate": 9.70402717127608e-06, "loss": 0.1117, "step": 9450 }, { "epoch": 8.258402444347446, "grad_norm": 2.1158103942871094, "learning_rate": 9.655507035419701e-06, "loss": 0.092, "step": 9460 }, { "epoch": 8.267132256656481, "grad_norm": 2.4370791912078857, "learning_rate": 9.60698689956332e-06, "loss": 0.1092, "step": 9470 }, { "epoch": 8.275862068965518, "grad_norm": 3.0041496753692627, "learning_rate": 9.558466763706939e-06, "loss": 0.0869, "step": 9480 }, { "epoch": 8.284591881274553, "grad_norm": 1.6758767366409302, "learning_rate": 9.509946627850557e-06, "loss": 0.1012, "step": 9490 }, { "epoch": 8.293321693583588, "grad_norm": 2.937406301498413, "learning_rate": 9.461426491994178e-06, "loss": 0.1152, "step": 9500 }, { "epoch": 8.302051505892623, "grad_norm": 2.93967866897583, "learning_rate": 9.412906356137797e-06, "loss": 0.099, "step": 9510 }, { "epoch": 8.31078131820166, "grad_norm": 2.0390257835388184, "learning_rate": 9.364386220281417e-06, "loss": 0.0926, "step": 9520 }, { "epoch": 8.319511130510694, "grad_norm": 2.6134767532348633, "learning_rate": 9.315866084425038e-06, "loss": 0.1139, "step": 9530 }, { "epoch": 8.328240942819729, "grad_norm": 3.398548126220703, "learning_rate": 9.267345948568657e-06, "loss": 0.1051, "step": 9540 }, { "epoch": 8.336970755128764, "grad_norm": 3.847663164138794, "learning_rate": 9.218825812712276e-06, "loss": 0.119, "step": 9550 }, { "epoch": 8.3457005674378, "grad_norm": 3.1890709400177, "learning_rate": 9.170305676855896e-06, "loss": 0.1459, "step": 9560 }, { "epoch": 8.354430379746836, "grad_norm": 1.5987610816955566, "learning_rate": 9.121785540999515e-06, "loss": 0.0985, "step": 9570 }, { "epoch": 8.36316019205587, "grad_norm": 2.2405405044555664, "learning_rate": 9.073265405143134e-06, "loss": 0.1044, "step": 9580 }, { "epoch": 8.371890004364905, "grad_norm": 2.4690017700195312, "learning_rate": 9.024745269286754e-06, "loss": 0.0976, "step": 9590 }, { "epoch": 8.380619816673942, "grad_norm": 2.168797254562378, "learning_rate": 8.976225133430375e-06, "loss": 0.0933, "step": 9600 }, { "epoch": 8.389349628982977, "grad_norm": 2.74342942237854, "learning_rate": 8.927704997573994e-06, "loss": 0.099, "step": 9610 }, { "epoch": 8.398079441292012, "grad_norm": 3.6532704830169678, "learning_rate": 8.879184861717613e-06, "loss": 0.1303, "step": 9620 }, { "epoch": 8.406809253601047, "grad_norm": 2.5002212524414062, "learning_rate": 8.830664725861233e-06, "loss": 0.1139, "step": 9630 }, { "epoch": 8.415539065910083, "grad_norm": 2.5484557151794434, "learning_rate": 8.782144590004852e-06, "loss": 0.1261, "step": 9640 }, { "epoch": 8.424268878219118, "grad_norm": 2.8932912349700928, "learning_rate": 8.733624454148471e-06, "loss": 0.1126, "step": 9650 }, { "epoch": 8.432998690528153, "grad_norm": 2.334230899810791, "learning_rate": 8.685104318292092e-06, "loss": 0.1021, "step": 9660 }, { "epoch": 8.441728502837188, "grad_norm": 2.884859561920166, "learning_rate": 8.636584182435712e-06, "loss": 0.0958, "step": 9670 }, { "epoch": 8.450458315146225, "grad_norm": 1.8755512237548828, "learning_rate": 8.588064046579331e-06, "loss": 0.0928, "step": 9680 }, { "epoch": 8.45918812745526, "grad_norm": 2.3897783756256104, "learning_rate": 8.539543910722951e-06, "loss": 0.0897, "step": 9690 }, { "epoch": 8.467917939764295, "grad_norm": 3.4187004566192627, "learning_rate": 8.49102377486657e-06, "loss": 0.0978, "step": 9700 }, { "epoch": 8.47664775207333, "grad_norm": 3.359574317932129, "learning_rate": 8.44250363901019e-06, "loss": 0.1029, "step": 9710 }, { "epoch": 8.485377564382366, "grad_norm": 2.7510204315185547, "learning_rate": 8.393983503153808e-06, "loss": 0.1115, "step": 9720 }, { "epoch": 8.494107376691401, "grad_norm": 2.2325894832611084, "learning_rate": 8.345463367297429e-06, "loss": 0.1237, "step": 9730 }, { "epoch": 8.502837189000436, "grad_norm": 2.400143623352051, "learning_rate": 8.296943231441049e-06, "loss": 0.1202, "step": 9740 }, { "epoch": 8.511567001309471, "grad_norm": 3.032205581665039, "learning_rate": 8.248423095584668e-06, "loss": 0.092, "step": 9750 }, { "epoch": 8.520296813618508, "grad_norm": 3.307790517807007, "learning_rate": 8.199902959728289e-06, "loss": 0.1069, "step": 9760 }, { "epoch": 8.529026625927543, "grad_norm": 3.3414857387542725, "learning_rate": 8.151382823871907e-06, "loss": 0.1315, "step": 9770 }, { "epoch": 8.537756438236578, "grad_norm": 2.243468999862671, "learning_rate": 8.102862688015526e-06, "loss": 0.0986, "step": 9780 }, { "epoch": 8.546486250545613, "grad_norm": 2.635031223297119, "learning_rate": 8.054342552159147e-06, "loss": 0.1067, "step": 9790 }, { "epoch": 8.55521606285465, "grad_norm": 2.037966728210449, "learning_rate": 8.005822416302766e-06, "loss": 0.1053, "step": 9800 }, { "epoch": 8.563945875163684, "grad_norm": 3.361685276031494, "learning_rate": 7.957302280446386e-06, "loss": 0.1166, "step": 9810 }, { "epoch": 8.572675687472719, "grad_norm": 1.941724419593811, "learning_rate": 7.908782144590005e-06, "loss": 0.0954, "step": 9820 }, { "epoch": 8.581405499781754, "grad_norm": 2.7277681827545166, "learning_rate": 7.860262008733626e-06, "loss": 0.1212, "step": 9830 }, { "epoch": 8.59013531209079, "grad_norm": 2.5248844623565674, "learning_rate": 7.811741872877244e-06, "loss": 0.1009, "step": 9840 }, { "epoch": 8.598865124399826, "grad_norm": 0.636101245880127, "learning_rate": 7.763221737020863e-06, "loss": 0.0992, "step": 9850 }, { "epoch": 8.60759493670886, "grad_norm": 2.8368330001831055, "learning_rate": 7.714701601164484e-06, "loss": 0.0886, "step": 9860 }, { "epoch": 8.616324749017895, "grad_norm": 2.3569271564483643, "learning_rate": 7.666181465308103e-06, "loss": 0.0835, "step": 9870 }, { "epoch": 8.625054561326932, "grad_norm": 2.118471622467041, "learning_rate": 7.617661329451722e-06, "loss": 0.1155, "step": 9880 }, { "epoch": 8.633784373635967, "grad_norm": 1.9520134925842285, "learning_rate": 7.569141193595343e-06, "loss": 0.0935, "step": 9890 }, { "epoch": 8.642514185945002, "grad_norm": 2.778207540512085, "learning_rate": 7.520621057738962e-06, "loss": 0.1167, "step": 9900 }, { "epoch": 8.651243998254037, "grad_norm": 3.1950080394744873, "learning_rate": 7.4721009218825815e-06, "loss": 0.124, "step": 9910 }, { "epoch": 8.659973810563073, "grad_norm": 1.913509726524353, "learning_rate": 7.423580786026202e-06, "loss": 0.0922, "step": 9920 }, { "epoch": 8.668703622872108, "grad_norm": 2.9603874683380127, "learning_rate": 7.375060650169821e-06, "loss": 0.1225, "step": 9930 }, { "epoch": 8.677433435181143, "grad_norm": 2.5559980869293213, "learning_rate": 7.326540514313441e-06, "loss": 0.1159, "step": 9940 }, { "epoch": 8.686163247490178, "grad_norm": 2.817579507827759, "learning_rate": 7.2780203784570594e-06, "loss": 0.069, "step": 9950 }, { "epoch": 8.694893059799215, "grad_norm": 2.3405838012695312, "learning_rate": 7.22950024260068e-06, "loss": 0.083, "step": 9960 }, { "epoch": 8.70362287210825, "grad_norm": 4.057965278625488, "learning_rate": 7.180980106744299e-06, "loss": 0.1346, "step": 9970 }, { "epoch": 8.712352684417285, "grad_norm": 3.50754976272583, "learning_rate": 7.1324599708879185e-06, "loss": 0.113, "step": 9980 }, { "epoch": 8.72108249672632, "grad_norm": 2.4381942749023438, "learning_rate": 7.083939835031539e-06, "loss": 0.0959, "step": 9990 }, { "epoch": 8.729812309035356, "grad_norm": 1.3491463661193848, "learning_rate": 7.035419699175158e-06, "loss": 0.1141, "step": 10000 }, { "epoch": 8.738542121344391, "grad_norm": 3.835019588470459, "learning_rate": 6.986899563318778e-06, "loss": 0.0997, "step": 10010 }, { "epoch": 8.747271933653426, "grad_norm": 1.8673381805419922, "learning_rate": 6.938379427462398e-06, "loss": 0.1044, "step": 10020 }, { "epoch": 8.756001745962461, "grad_norm": 3.303786039352417, "learning_rate": 6.889859291606017e-06, "loss": 0.1368, "step": 10030 }, { "epoch": 8.764731558271498, "grad_norm": 2.6152758598327637, "learning_rate": 6.841339155749636e-06, "loss": 0.1186, "step": 10040 }, { "epoch": 8.773461370580533, "grad_norm": 3.460881471633911, "learning_rate": 6.792819019893256e-06, "loss": 0.1263, "step": 10050 }, { "epoch": 8.782191182889568, "grad_norm": 2.3491194248199463, "learning_rate": 6.744298884036876e-06, "loss": 0.1147, "step": 10060 }, { "epoch": 8.790920995198602, "grad_norm": 2.6336238384246826, "learning_rate": 6.695778748180495e-06, "loss": 0.1044, "step": 10070 }, { "epoch": 8.79965080750764, "grad_norm": 2.4493720531463623, "learning_rate": 6.647258612324115e-06, "loss": 0.0914, "step": 10080 }, { "epoch": 8.808380619816674, "grad_norm": 2.0005953311920166, "learning_rate": 6.598738476467735e-06, "loss": 0.137, "step": 10090 }, { "epoch": 8.817110432125709, "grad_norm": 2.4096193313598633, "learning_rate": 6.550218340611354e-06, "loss": 0.1235, "step": 10100 }, { "epoch": 8.825840244434744, "grad_norm": 2.0157690048217773, "learning_rate": 6.501698204754973e-06, "loss": 0.0981, "step": 10110 }, { "epoch": 8.83457005674378, "grad_norm": 1.8452261686325073, "learning_rate": 6.4531780688985935e-06, "loss": 0.0971, "step": 10120 }, { "epoch": 8.843299869052816, "grad_norm": 2.7903761863708496, "learning_rate": 6.404657933042213e-06, "loss": 0.0935, "step": 10130 }, { "epoch": 8.85202968136185, "grad_norm": 2.508514165878296, "learning_rate": 6.356137797185832e-06, "loss": 0.0991, "step": 10140 }, { "epoch": 8.860759493670885, "grad_norm": 2.214094638824463, "learning_rate": 6.307617661329453e-06, "loss": 0.11, "step": 10150 }, { "epoch": 8.869489305979922, "grad_norm": 2.2677242755889893, "learning_rate": 6.259097525473072e-06, "loss": 0.1491, "step": 10160 }, { "epoch": 8.878219118288957, "grad_norm": 1.6787909269332886, "learning_rate": 6.210577389616691e-06, "loss": 0.1153, "step": 10170 }, { "epoch": 8.886948930597992, "grad_norm": 2.0860350131988525, "learning_rate": 6.162057253760311e-06, "loss": 0.0906, "step": 10180 }, { "epoch": 8.895678742907027, "grad_norm": 2.6599624156951904, "learning_rate": 6.11353711790393e-06, "loss": 0.1011, "step": 10190 }, { "epoch": 8.904408555216063, "grad_norm": 2.9594058990478516, "learning_rate": 6.06501698204755e-06, "loss": 0.1179, "step": 10200 }, { "epoch": 8.913138367525098, "grad_norm": 2.69802188873291, "learning_rate": 6.01649684619117e-06, "loss": 0.1086, "step": 10210 }, { "epoch": 8.921868179834133, "grad_norm": 2.7719838619232178, "learning_rate": 5.967976710334789e-06, "loss": 0.1064, "step": 10220 }, { "epoch": 8.930597992143168, "grad_norm": 2.729365587234497, "learning_rate": 5.9194565744784085e-06, "loss": 0.108, "step": 10230 }, { "epoch": 8.939327804452205, "grad_norm": 3.4504830837249756, "learning_rate": 5.870936438622028e-06, "loss": 0.095, "step": 10240 }, { "epoch": 8.94805761676124, "grad_norm": 2.7254908084869385, "learning_rate": 5.822416302765648e-06, "loss": 0.0973, "step": 10250 }, { "epoch": 8.956787429070275, "grad_norm": 2.1603481769561768, "learning_rate": 5.773896166909268e-06, "loss": 0.0989, "step": 10260 }, { "epoch": 8.96551724137931, "grad_norm": 3.3005552291870117, "learning_rate": 5.725376031052887e-06, "loss": 0.0931, "step": 10270 }, { "epoch": 8.974247053688346, "grad_norm": 2.790048122406006, "learning_rate": 5.676855895196507e-06, "loss": 0.1102, "step": 10280 }, { "epoch": 8.982976865997381, "grad_norm": 2.8936121463775635, "learning_rate": 5.628335759340127e-06, "loss": 0.1204, "step": 10290 }, { "epoch": 8.991706678306416, "grad_norm": 3.3234341144561768, "learning_rate": 5.5798156234837455e-06, "loss": 0.1212, "step": 10300 }, { "epoch": 8.999563509384549, "eval_accuracy": 0.9607169162779278, "eval_loss": 0.12570950388908386, "eval_runtime": 61.7961, "eval_samples_per_second": 263.641, "eval_steps_per_second": 8.253, "step": 10309 }, { "epoch": 9.000436490615451, "grad_norm": 4.348520755767822, "learning_rate": 5.531295487627366e-06, "loss": 0.132, "step": 10310 }, { "epoch": 9.009166302924488, "grad_norm": 2.5040526390075684, "learning_rate": 5.482775351770985e-06, "loss": 0.116, "step": 10320 }, { "epoch": 9.017896115233523, "grad_norm": 2.754953622817993, "learning_rate": 5.434255215914605e-06, "loss": 0.1035, "step": 10330 }, { "epoch": 9.026625927542558, "grad_norm": 3.0102930068969727, "learning_rate": 5.385735080058224e-06, "loss": 0.1104, "step": 10340 }, { "epoch": 9.035355739851592, "grad_norm": 2.0556371212005615, "learning_rate": 5.337214944201844e-06, "loss": 0.1387, "step": 10350 }, { "epoch": 9.04408555216063, "grad_norm": 2.688657760620117, "learning_rate": 5.288694808345464e-06, "loss": 0.1082, "step": 10360 }, { "epoch": 9.052815364469664, "grad_norm": 3.8439743518829346, "learning_rate": 5.240174672489083e-06, "loss": 0.1144, "step": 10370 }, { "epoch": 9.061545176778699, "grad_norm": 2.683326482772827, "learning_rate": 5.191654536632703e-06, "loss": 0.1075, "step": 10380 }, { "epoch": 9.070274989087734, "grad_norm": 2.6530981063842773, "learning_rate": 5.143134400776323e-06, "loss": 0.0948, "step": 10390 }, { "epoch": 9.07900480139677, "grad_norm": 3.5326459407806396, "learning_rate": 5.094614264919942e-06, "loss": 0.1101, "step": 10400 }, { "epoch": 9.087734613705805, "grad_norm": 3.0565762519836426, "learning_rate": 5.046094129063561e-06, "loss": 0.103, "step": 10410 }, { "epoch": 9.09646442601484, "grad_norm": 2.949897050857544, "learning_rate": 4.997573993207181e-06, "loss": 0.136, "step": 10420 }, { "epoch": 9.105194238323875, "grad_norm": 2.1031014919281006, "learning_rate": 4.949053857350801e-06, "loss": 0.117, "step": 10430 }, { "epoch": 9.113924050632912, "grad_norm": 2.6953930854797363, "learning_rate": 4.9005337214944205e-06, "loss": 0.0944, "step": 10440 }, { "epoch": 9.122653862941947, "grad_norm": 2.513293981552124, "learning_rate": 4.85201358563804e-06, "loss": 0.1121, "step": 10450 }, { "epoch": 9.131383675250982, "grad_norm": 2.8910365104675293, "learning_rate": 4.80349344978166e-06, "loss": 0.0821, "step": 10460 }, { "epoch": 9.140113487560017, "grad_norm": 2.991837978363037, "learning_rate": 4.754973313925279e-06, "loss": 0.115, "step": 10470 }, { "epoch": 9.148843299869053, "grad_norm": 3.1621274948120117, "learning_rate": 4.7064531780688984e-06, "loss": 0.0987, "step": 10480 }, { "epoch": 9.157573112178088, "grad_norm": 2.6590192317962646, "learning_rate": 4.657933042212519e-06, "loss": 0.0924, "step": 10490 }, { "epoch": 9.166302924487123, "grad_norm": 1.3833907842636108, "learning_rate": 4.609412906356138e-06, "loss": 0.0772, "step": 10500 }, { "epoch": 9.175032736796158, "grad_norm": 2.954089879989624, "learning_rate": 4.5608927704997575e-06, "loss": 0.1372, "step": 10510 }, { "epoch": 9.183762549105195, "grad_norm": 2.8369104862213135, "learning_rate": 4.512372634643377e-06, "loss": 0.1054, "step": 10520 }, { "epoch": 9.19249236141423, "grad_norm": 1.4422260522842407, "learning_rate": 4.463852498786997e-06, "loss": 0.1025, "step": 10530 }, { "epoch": 9.201222173723265, "grad_norm": 1.9895925521850586, "learning_rate": 4.415332362930617e-06, "loss": 0.0992, "step": 10540 }, { "epoch": 9.2099519860323, "grad_norm": 2.9992966651916504, "learning_rate": 4.3668122270742355e-06, "loss": 0.0927, "step": 10550 }, { "epoch": 9.218681798341336, "grad_norm": 2.2719290256500244, "learning_rate": 4.318292091217856e-06, "loss": 0.0987, "step": 10560 }, { "epoch": 9.227411610650371, "grad_norm": 3.9194726943969727, "learning_rate": 4.269771955361476e-06, "loss": 0.0936, "step": 10570 }, { "epoch": 9.236141422959406, "grad_norm": 2.4642674922943115, "learning_rate": 4.221251819505095e-06, "loss": 0.1074, "step": 10580 }, { "epoch": 9.244871235268441, "grad_norm": 3.479707956314087, "learning_rate": 4.172731683648714e-06, "loss": 0.1051, "step": 10590 }, { "epoch": 9.253601047577478, "grad_norm": 3.2341325283050537, "learning_rate": 4.124211547792334e-06, "loss": 0.0878, "step": 10600 }, { "epoch": 9.262330859886513, "grad_norm": 2.302276849746704, "learning_rate": 4.075691411935954e-06, "loss": 0.1036, "step": 10610 }, { "epoch": 9.271060672195548, "grad_norm": 2.9073214530944824, "learning_rate": 4.027171276079573e-06, "loss": 0.1181, "step": 10620 }, { "epoch": 9.279790484504582, "grad_norm": 2.0145514011383057, "learning_rate": 3.978651140223193e-06, "loss": 0.0956, "step": 10630 }, { "epoch": 9.28852029681362, "grad_norm": 4.683056831359863, "learning_rate": 3.930131004366813e-06, "loss": 0.138, "step": 10640 }, { "epoch": 9.297250109122654, "grad_norm": 2.681694507598877, "learning_rate": 3.881610868510432e-06, "loss": 0.08, "step": 10650 }, { "epoch": 9.305979921431689, "grad_norm": 2.887195348739624, "learning_rate": 3.833090732654051e-06, "loss": 0.1113, "step": 10660 }, { "epoch": 9.314709733740724, "grad_norm": 2.9222280979156494, "learning_rate": 3.7845705967976715e-06, "loss": 0.1074, "step": 10670 }, { "epoch": 9.32343954604976, "grad_norm": 2.944333076477051, "learning_rate": 3.7360504609412907e-06, "loss": 0.106, "step": 10680 }, { "epoch": 9.332169358358795, "grad_norm": 2.468632459640503, "learning_rate": 3.6875303250849104e-06, "loss": 0.1049, "step": 10690 }, { "epoch": 9.34089917066783, "grad_norm": 0.9164177775382996, "learning_rate": 3.6390101892285297e-06, "loss": 0.0718, "step": 10700 }, { "epoch": 9.349628982976865, "grad_norm": 2.6750760078430176, "learning_rate": 3.5904900533721494e-06, "loss": 0.1086, "step": 10710 }, { "epoch": 9.358358795285902, "grad_norm": 1.8694539070129395, "learning_rate": 3.5419699175157695e-06, "loss": 0.1099, "step": 10720 }, { "epoch": 9.367088607594937, "grad_norm": 2.571378707885742, "learning_rate": 3.493449781659389e-06, "loss": 0.0856, "step": 10730 }, { "epoch": 9.375818419903972, "grad_norm": 1.7155669927597046, "learning_rate": 3.4449296458030085e-06, "loss": 0.1044, "step": 10740 }, { "epoch": 9.384548232213007, "grad_norm": 2.1449294090270996, "learning_rate": 3.396409509946628e-06, "loss": 0.096, "step": 10750 }, { "epoch": 9.393278044522043, "grad_norm": 2.918898344039917, "learning_rate": 3.3478893740902475e-06, "loss": 0.0983, "step": 10760 }, { "epoch": 9.402007856831078, "grad_norm": 2.2519733905792236, "learning_rate": 3.2993692382338676e-06, "loss": 0.1027, "step": 10770 }, { "epoch": 9.410737669140113, "grad_norm": 4.343425750732422, "learning_rate": 3.2508491023774865e-06, "loss": 0.1118, "step": 10780 }, { "epoch": 9.419467481449148, "grad_norm": 2.5257129669189453, "learning_rate": 3.2023289665211066e-06, "loss": 0.1093, "step": 10790 }, { "epoch": 9.428197293758185, "grad_norm": 1.197124719619751, "learning_rate": 3.1538088306647263e-06, "loss": 0.1083, "step": 10800 }, { "epoch": 9.43692710606722, "grad_norm": 3.033644437789917, "learning_rate": 3.1052886948083456e-06, "loss": 0.1076, "step": 10810 }, { "epoch": 9.445656918376255, "grad_norm": 2.3595004081726074, "learning_rate": 3.056768558951965e-06, "loss": 0.0772, "step": 10820 }, { "epoch": 9.45438673068529, "grad_norm": 1.9762500524520874, "learning_rate": 3.008248423095585e-06, "loss": 0.1033, "step": 10830 }, { "epoch": 9.463116542994326, "grad_norm": 3.2624242305755615, "learning_rate": 2.9597282872392042e-06, "loss": 0.1018, "step": 10840 }, { "epoch": 9.471846355303361, "grad_norm": 2.5619966983795166, "learning_rate": 2.911208151382824e-06, "loss": 0.1084, "step": 10850 }, { "epoch": 9.480576167612396, "grad_norm": 2.26334547996521, "learning_rate": 2.8626880155264436e-06, "loss": 0.099, "step": 10860 }, { "epoch": 9.489305979921431, "grad_norm": 2.397780656814575, "learning_rate": 2.8141678796700633e-06, "loss": 0.1217, "step": 10870 }, { "epoch": 9.498035792230468, "grad_norm": 3.3402929306030273, "learning_rate": 2.765647743813683e-06, "loss": 0.1199, "step": 10880 }, { "epoch": 9.506765604539503, "grad_norm": 3.3935964107513428, "learning_rate": 2.7171276079573023e-06, "loss": 0.1158, "step": 10890 }, { "epoch": 9.515495416848538, "grad_norm": 2.4392571449279785, "learning_rate": 2.668607472100922e-06, "loss": 0.1306, "step": 10900 }, { "epoch": 9.524225229157572, "grad_norm": 2.4739038944244385, "learning_rate": 2.6200873362445413e-06, "loss": 0.0886, "step": 10910 }, { "epoch": 9.532955041466609, "grad_norm": 2.2183120250701904, "learning_rate": 2.5715672003881614e-06, "loss": 0.1015, "step": 10920 }, { "epoch": 9.541684853775644, "grad_norm": 3.256857395172119, "learning_rate": 2.5230470645317807e-06, "loss": 0.1328, "step": 10930 }, { "epoch": 9.550414666084679, "grad_norm": 2.740285634994507, "learning_rate": 2.4745269286754004e-06, "loss": 0.0996, "step": 10940 }, { "epoch": 9.559144478393714, "grad_norm": 2.5595703125, "learning_rate": 2.42600679281902e-06, "loss": 0.075, "step": 10950 }, { "epoch": 9.56787429070275, "grad_norm": 3.089890480041504, "learning_rate": 2.3774866569626394e-06, "loss": 0.1229, "step": 10960 }, { "epoch": 9.576604103011785, "grad_norm": 3.369868755340576, "learning_rate": 2.3289665211062595e-06, "loss": 0.1194, "step": 10970 }, { "epoch": 9.58533391532082, "grad_norm": 2.359818458557129, "learning_rate": 2.2804463852498788e-06, "loss": 0.107, "step": 10980 }, { "epoch": 9.594063727629855, "grad_norm": 2.449138641357422, "learning_rate": 2.2319262493934985e-06, "loss": 0.1078, "step": 10990 }, { "epoch": 9.602793539938892, "grad_norm": 1.8725260496139526, "learning_rate": 2.1834061135371177e-06, "loss": 0.0882, "step": 11000 }, { "epoch": 9.611523352247927, "grad_norm": 1.452510952949524, "learning_rate": 2.134885977680738e-06, "loss": 0.1029, "step": 11010 }, { "epoch": 9.620253164556962, "grad_norm": 4.510222911834717, "learning_rate": 2.086365841824357e-06, "loss": 0.1253, "step": 11020 }, { "epoch": 9.628982976865997, "grad_norm": 2.815927267074585, "learning_rate": 2.037845705967977e-06, "loss": 0.1132, "step": 11030 }, { "epoch": 9.637712789175033, "grad_norm": 3.04374098777771, "learning_rate": 1.9893255701115965e-06, "loss": 0.1043, "step": 11040 }, { "epoch": 9.646442601484068, "grad_norm": 3.407210111618042, "learning_rate": 1.940805434255216e-06, "loss": 0.1207, "step": 11050 }, { "epoch": 9.655172413793103, "grad_norm": 3.3913309574127197, "learning_rate": 1.8922852983988357e-06, "loss": 0.1153, "step": 11060 }, { "epoch": 9.663902226102138, "grad_norm": 1.3927017450332642, "learning_rate": 1.8437651625424552e-06, "loss": 0.0865, "step": 11070 }, { "epoch": 9.672632038411175, "grad_norm": 1.9914878606796265, "learning_rate": 1.7952450266860747e-06, "loss": 0.1071, "step": 11080 }, { "epoch": 9.68136185072021, "grad_norm": 2.445969820022583, "learning_rate": 1.7467248908296944e-06, "loss": 0.0921, "step": 11090 }, { "epoch": 9.690091663029245, "grad_norm": 3.2452139854431152, "learning_rate": 1.698204754973314e-06, "loss": 0.1383, "step": 11100 }, { "epoch": 9.69882147533828, "grad_norm": 1.981126070022583, "learning_rate": 1.6496846191169338e-06, "loss": 0.1139, "step": 11110 }, { "epoch": 9.707551287647316, "grad_norm": 1.5634502172470093, "learning_rate": 1.6011644832605533e-06, "loss": 0.0922, "step": 11120 }, { "epoch": 9.716281099956351, "grad_norm": 3.633537769317627, "learning_rate": 1.5526443474041728e-06, "loss": 0.107, "step": 11130 }, { "epoch": 9.725010912265386, "grad_norm": 2.735826015472412, "learning_rate": 1.5041242115477925e-06, "loss": 0.1027, "step": 11140 }, { "epoch": 9.733740724574421, "grad_norm": 2.9639649391174316, "learning_rate": 1.455604075691412e-06, "loss": 0.0849, "step": 11150 }, { "epoch": 9.742470536883458, "grad_norm": 2.332066297531128, "learning_rate": 1.4070839398350317e-06, "loss": 0.1055, "step": 11160 }, { "epoch": 9.751200349192493, "grad_norm": 2.8028910160064697, "learning_rate": 1.3585638039786512e-06, "loss": 0.1135, "step": 11170 }, { "epoch": 9.759930161501527, "grad_norm": 3.3625102043151855, "learning_rate": 1.3100436681222706e-06, "loss": 0.1219, "step": 11180 }, { "epoch": 9.768659973810562, "grad_norm": 3.788644790649414, "learning_rate": 1.2615235322658903e-06, "loss": 0.1035, "step": 11190 }, { "epoch": 9.777389786119599, "grad_norm": 3.0981266498565674, "learning_rate": 1.21300339640951e-06, "loss": 0.0964, "step": 11200 }, { "epoch": 9.786119598428634, "grad_norm": 2.1601572036743164, "learning_rate": 1.1644832605531297e-06, "loss": 0.09, "step": 11210 }, { "epoch": 9.794849410737669, "grad_norm": 2.264108419418335, "learning_rate": 1.1159631246967492e-06, "loss": 0.0925, "step": 11220 }, { "epoch": 9.803579223046704, "grad_norm": 4.96955680847168, "learning_rate": 1.067442988840369e-06, "loss": 0.1429, "step": 11230 }, { "epoch": 9.81230903535574, "grad_norm": 2.7126574516296387, "learning_rate": 1.0189228529839884e-06, "loss": 0.1139, "step": 11240 }, { "epoch": 9.821038847664775, "grad_norm": 2.6963400840759277, "learning_rate": 9.70402717127608e-07, "loss": 0.1106, "step": 11250 }, { "epoch": 9.82976865997381, "grad_norm": 2.730841636657715, "learning_rate": 9.218825812712276e-07, "loss": 0.1031, "step": 11260 }, { "epoch": 9.838498472282845, "grad_norm": 3.486825704574585, "learning_rate": 8.733624454148472e-07, "loss": 0.1023, "step": 11270 }, { "epoch": 9.847228284591882, "grad_norm": 3.1530649662017822, "learning_rate": 8.248423095584669e-07, "loss": 0.084, "step": 11280 }, { "epoch": 9.855958096900917, "grad_norm": 1.514291524887085, "learning_rate": 7.763221737020864e-07, "loss": 0.1088, "step": 11290 }, { "epoch": 9.864687909209952, "grad_norm": 1.9659836292266846, "learning_rate": 7.27802037845706e-07, "loss": 0.0722, "step": 11300 }, { "epoch": 9.873417721518987, "grad_norm": 2.1583216190338135, "learning_rate": 6.792819019893256e-07, "loss": 0.1086, "step": 11310 }, { "epoch": 9.882147533828023, "grad_norm": 3.034625291824341, "learning_rate": 6.307617661329452e-07, "loss": 0.1046, "step": 11320 }, { "epoch": 9.890877346137058, "grad_norm": 2.7435011863708496, "learning_rate": 5.822416302765649e-07, "loss": 0.1083, "step": 11330 }, { "epoch": 9.899607158446093, "grad_norm": 2.3298757076263428, "learning_rate": 5.337214944201845e-07, "loss": 0.11, "step": 11340 }, { "epoch": 9.908336970755128, "grad_norm": 3.238316774368286, "learning_rate": 4.85201358563804e-07, "loss": 0.1281, "step": 11350 }, { "epoch": 9.917066783064165, "grad_norm": 2.9456119537353516, "learning_rate": 4.366812227074236e-07, "loss": 0.0932, "step": 11360 }, { "epoch": 9.9257965953732, "grad_norm": 1.8803859949111938, "learning_rate": 3.881610868510432e-07, "loss": 0.1022, "step": 11370 }, { "epoch": 9.934526407682235, "grad_norm": 2.5518546104431152, "learning_rate": 3.396409509946628e-07, "loss": 0.0876, "step": 11380 }, { "epoch": 9.94325621999127, "grad_norm": 2.5071511268615723, "learning_rate": 2.9112081513828244e-07, "loss": 0.1075, "step": 11390 }, { "epoch": 9.951986032300306, "grad_norm": 4.196099758148193, "learning_rate": 2.42600679281902e-07, "loss": 0.1197, "step": 11400 }, { "epoch": 9.960715844609341, "grad_norm": 2.428870916366577, "learning_rate": 1.940805434255216e-07, "loss": 0.1262, "step": 11410 }, { "epoch": 9.969445656918376, "grad_norm": 2.2001659870147705, "learning_rate": 1.4556040756914122e-07, "loss": 0.1013, "step": 11420 }, { "epoch": 9.978175469227411, "grad_norm": 3.1071674823760986, "learning_rate": 9.70402717127608e-08, "loss": 0.0883, "step": 11430 }, { "epoch": 9.986905281536448, "grad_norm": 2.7770655155181885, "learning_rate": 4.85201358563804e-08, "loss": 0.1187, "step": 11440 }, { "epoch": 9.995635093845483, "grad_norm": 2.909649133682251, "learning_rate": 0.0, "loss": 0.0981, "step": 11450 }, { "epoch": 9.995635093845483, "eval_accuracy": 0.9750797937638105, "eval_loss": 0.07497124373912811, "eval_runtime": 61.2653, "eval_samples_per_second": 265.925, "eval_steps_per_second": 8.324, "step": 11450 }, { "epoch": 9.995635093845483, "step": 11450, "total_flos": 3.92899376185344e+18, "train_loss": 0.3148357209992721, "train_runtime": 13104.719, "train_samples_per_second": 111.886, "train_steps_per_second": 0.874 } ], "logging_steps": 10, "max_steps": 11450, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.92899376185344e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }