{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9929408114188193, "eval_steps": 500, "global_step": 3200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 20.729074478149414, "learning_rate": 2.5e-06, "loss": 8.0612, "step": 5 }, { "epoch": 0.0, "grad_norm": 16.320600509643555, "learning_rate": 5e-06, "loss": 7.3007, "step": 10 }, { "epoch": 0.0, "grad_norm": 17.508378982543945, "learning_rate": 7.5e-06, "loss": 7.7541, "step": 15 }, { "epoch": 0.01, "grad_norm": 18.7609920501709, "learning_rate": 1e-05, "loss": 7.0762, "step": 20 }, { "epoch": 0.01, "grad_norm": 10.039741516113281, "learning_rate": 1.25e-05, "loss": 6.3794, "step": 25 }, { "epoch": 0.01, "grad_norm": 10.681583404541016, "learning_rate": 1.5e-05, "loss": 5.7463, "step": 30 }, { "epoch": 0.01, "grad_norm": 8.521218299865723, "learning_rate": 1.75e-05, "loss": 5.1425, "step": 35 }, { "epoch": 0.01, "grad_norm": 8.024609565734863, "learning_rate": 2e-05, "loss": 4.8565, "step": 40 }, { "epoch": 0.01, "grad_norm": 6.419050216674805, "learning_rate": 2.25e-05, "loss": 4.4552, "step": 45 }, { "epoch": 0.02, "grad_norm": 7.1052398681640625, "learning_rate": 2.5e-05, "loss": 4.1432, "step": 50 }, { "epoch": 0.02, "grad_norm": 7.79315710067749, "learning_rate": 2.7500000000000004e-05, "loss": 3.9919, "step": 55 }, { "epoch": 0.02, "grad_norm": 5.008393287658691, "learning_rate": 3e-05, "loss": 3.3339, "step": 60 }, { "epoch": 0.02, "grad_norm": 8.750615119934082, "learning_rate": 3.2500000000000004e-05, "loss": 3.3154, "step": 65 }, { "epoch": 0.02, "grad_norm": 5.283076286315918, "learning_rate": 3.5e-05, "loss": 2.8296, "step": 70 }, { "epoch": 0.02, "grad_norm": 6.005578517913818, "learning_rate": 3.7500000000000003e-05, "loss": 2.8239, "step": 75 }, { "epoch": 0.02, "grad_norm": 7.009499549865723, "learning_rate": 4e-05, "loss": 3.0532, "step": 80 }, { "epoch": 0.03, "grad_norm": 5.712557315826416, "learning_rate": 4.25e-05, "loss": 2.8819, "step": 85 }, { "epoch": 0.03, "grad_norm": 4.914234638214111, "learning_rate": 4.5e-05, "loss": 2.8031, "step": 90 }, { "epoch": 0.03, "grad_norm": 7.396793842315674, "learning_rate": 4.75e-05, "loss": 2.6904, "step": 95 }, { "epoch": 0.03, "grad_norm": 5.087535381317139, "learning_rate": 5e-05, "loss": 2.772, "step": 100 }, { "epoch": 0.03, "grad_norm": 6.230583190917969, "learning_rate": 4.9999683566063894e-05, "loss": 2.6301, "step": 105 }, { "epoch": 0.03, "grad_norm": 4.741369724273682, "learning_rate": 4.9998734272266e-05, "loss": 2.5966, "step": 110 }, { "epoch": 0.04, "grad_norm": 4.758203506469727, "learning_rate": 4.9997152142637426e-05, "loss": 2.4406, "step": 115 }, { "epoch": 0.04, "grad_norm": 4.093080997467041, "learning_rate": 4.999493721722933e-05, "loss": 2.6457, "step": 120 }, { "epoch": 0.04, "grad_norm": 5.253550052642822, "learning_rate": 4.999208955211192e-05, "loss": 2.5449, "step": 125 }, { "epoch": 0.04, "grad_norm": 5.3556294441223145, "learning_rate": 4.998860921937302e-05, "loss": 2.5182, "step": 130 }, { "epoch": 0.04, "grad_norm": 3.888378620147705, "learning_rate": 4.998449630711627e-05, "loss": 2.6575, "step": 135 }, { "epoch": 0.04, "grad_norm": 4.9733967781066895, "learning_rate": 4.997975091945886e-05, "loss": 2.5669, "step": 140 }, { "epoch": 0.04, "grad_norm": 3.3941574096679688, "learning_rate": 4.997437317652894e-05, "loss": 2.5628, "step": 145 }, { "epoch": 0.05, "grad_norm": 3.743703842163086, "learning_rate": 4.996836321446253e-05, "loss": 2.6051, "step": 150 }, { "epoch": 0.05, "grad_norm": 3.359017848968506, "learning_rate": 4.99617211854001e-05, "loss": 2.2357, "step": 155 }, { "epoch": 0.05, "grad_norm": 4.703392028808594, "learning_rate": 4.995444725748274e-05, "loss": 2.4146, "step": 160 }, { "epoch": 0.05, "grad_norm": 4.182121753692627, "learning_rate": 4.994654161484784e-05, "loss": 2.4228, "step": 165 }, { "epoch": 0.05, "grad_norm": 4.623451232910156, "learning_rate": 4.993800445762451e-05, "loss": 2.4149, "step": 170 }, { "epoch": 0.05, "grad_norm": 3.7832231521606445, "learning_rate": 4.992883600192844e-05, "loss": 2.4566, "step": 175 }, { "epoch": 0.06, "grad_norm": 3.907249689102173, "learning_rate": 4.991903647985646e-05, "loss": 2.403, "step": 180 }, { "epoch": 0.06, "grad_norm": 3.4823191165924072, "learning_rate": 4.990860613948071e-05, "loss": 2.518, "step": 185 }, { "epoch": 0.06, "grad_norm": 4.531657695770264, "learning_rate": 4.989754524484225e-05, "loss": 2.4007, "step": 190 }, { "epoch": 0.06, "grad_norm": 4.945577621459961, "learning_rate": 4.988585407594449e-05, "loss": 2.3891, "step": 195 }, { "epoch": 0.06, "grad_norm": 3.9174554347991943, "learning_rate": 4.9873532928746036e-05, "loss": 2.2904, "step": 200 }, { "epoch": 0.06, "grad_norm": 3.8385236263275146, "learning_rate": 4.986058211515321e-05, "loss": 2.2802, "step": 205 }, { "epoch": 0.07, "grad_norm": 4.326376914978027, "learning_rate": 4.9847001963012176e-05, "loss": 2.295, "step": 210 }, { "epoch": 0.07, "grad_norm": 5.581832408905029, "learning_rate": 4.9832792816100605e-05, "loss": 2.4895, "step": 215 }, { "epoch": 0.07, "grad_norm": 3.5401458740234375, "learning_rate": 4.981795503411901e-05, "loss": 2.3254, "step": 220 }, { "epoch": 0.07, "grad_norm": 4.960626125335693, "learning_rate": 4.9802488992681594e-05, "loss": 2.2977, "step": 225 }, { "epoch": 0.07, "grad_norm": 4.908995628356934, "learning_rate": 4.978639508330681e-05, "loss": 2.3534, "step": 230 }, { "epoch": 0.07, "grad_norm": 4.865789890289307, "learning_rate": 4.976967371340736e-05, "loss": 2.3781, "step": 235 }, { "epoch": 0.07, "grad_norm": 4.27896785736084, "learning_rate": 4.975232530627998e-05, "loss": 2.3221, "step": 240 }, { "epoch": 0.08, "grad_norm": 3.9018704891204834, "learning_rate": 4.973435030109463e-05, "loss": 2.407, "step": 245 }, { "epoch": 0.08, "grad_norm": 3.4363269805908203, "learning_rate": 4.971574915288345e-05, "loss": 2.3857, "step": 250 }, { "epoch": 0.08, "grad_norm": 4.802529335021973, "learning_rate": 4.9696522332529205e-05, "loss": 2.183, "step": 255 }, { "epoch": 0.08, "grad_norm": 4.064101696014404, "learning_rate": 4.967667032675337e-05, "loss": 2.2134, "step": 260 }, { "epoch": 0.08, "grad_norm": 5.066267490386963, "learning_rate": 4.965619363810381e-05, "loss": 2.2722, "step": 265 }, { "epoch": 0.08, "grad_norm": 4.149215221405029, "learning_rate": 4.9635092784942064e-05, "loss": 2.3393, "step": 270 }, { "epoch": 0.09, "grad_norm": 3.8846592903137207, "learning_rate": 4.9613368301430194e-05, "loss": 2.2163, "step": 275 }, { "epoch": 0.09, "grad_norm": 4.181525230407715, "learning_rate": 4.9591020737517335e-05, "loss": 2.4478, "step": 280 }, { "epoch": 0.09, "grad_norm": 3.1801464557647705, "learning_rate": 4.956805065892568e-05, "loss": 2.2887, "step": 285 }, { "epoch": 0.09, "grad_norm": 5.8738250732421875, "learning_rate": 4.954445864713622e-05, "loss": 2.29, "step": 290 }, { "epoch": 0.09, "grad_norm": 4.968664646148682, "learning_rate": 4.9520245299374014e-05, "loss": 2.2801, "step": 295 }, { "epoch": 0.09, "grad_norm": 5.4960784912109375, "learning_rate": 4.949541122859305e-05, "loss": 2.3109, "step": 300 }, { "epoch": 0.09, "grad_norm": 3.6677656173706055, "learning_rate": 4.9469957063460747e-05, "loss": 2.2748, "step": 305 }, { "epoch": 0.1, "grad_norm": 2.90336275100708, "learning_rate": 4.944388344834205e-05, "loss": 2.2016, "step": 310 }, { "epoch": 0.1, "grad_norm": 3.515296459197998, "learning_rate": 4.9417191043283086e-05, "loss": 2.3607, "step": 315 }, { "epoch": 0.1, "grad_norm": 3.070936679840088, "learning_rate": 4.938988052399447e-05, "loss": 2.3314, "step": 320 }, { "epoch": 0.1, "grad_norm": 3.801671028137207, "learning_rate": 4.936195258183422e-05, "loss": 2.2395, "step": 325 }, { "epoch": 0.1, "grad_norm": 4.183629035949707, "learning_rate": 4.933340792379023e-05, "loss": 2.4527, "step": 330 }, { "epoch": 0.1, "grad_norm": 3.9023029804229736, "learning_rate": 4.930424727246238e-05, "loss": 2.2828, "step": 335 }, { "epoch": 0.11, "grad_norm": 3.6366467475891113, "learning_rate": 4.927447136604424e-05, "loss": 2.2859, "step": 340 }, { "epoch": 0.11, "grad_norm": 3.219228506088257, "learning_rate": 4.924408095830439e-05, "loss": 2.3497, "step": 345 }, { "epoch": 0.11, "grad_norm": 3.768355369567871, "learning_rate": 4.921307681856735e-05, "loss": 2.1229, "step": 350 }, { "epoch": 0.11, "grad_norm": 3.8723647594451904, "learning_rate": 4.9181459731694054e-05, "loss": 2.3544, "step": 355 }, { "epoch": 0.11, "grad_norm": 3.512420892715454, "learning_rate": 4.914923049806207e-05, "loss": 1.9489, "step": 360 }, { "epoch": 0.11, "grad_norm": 3.77095627784729, "learning_rate": 4.911638993354524e-05, "loss": 2.2499, "step": 365 }, { "epoch": 0.11, "grad_norm": 3.8103721141815186, "learning_rate": 4.90829388694931e-05, "loss": 2.1032, "step": 370 }, { "epoch": 0.12, "grad_norm": 3.6579902172088623, "learning_rate": 4.9048878152709785e-05, "loss": 2.2104, "step": 375 }, { "epoch": 0.12, "grad_norm": 5.087968826293945, "learning_rate": 4.901420864543265e-05, "loss": 2.2601, "step": 380 }, { "epoch": 0.12, "grad_norm": 3.773608684539795, "learning_rate": 4.8978931225310375e-05, "loss": 2.1831, "step": 385 }, { "epoch": 0.12, "grad_norm": 6.229213714599609, "learning_rate": 4.8943046785380795e-05, "loss": 2.2507, "step": 390 }, { "epoch": 0.12, "grad_norm": 5.113283634185791, "learning_rate": 4.890655623404828e-05, "loss": 2.2868, "step": 395 }, { "epoch": 0.12, "grad_norm": 3.9976158142089844, "learning_rate": 4.8869460495060726e-05, "loss": 2.264, "step": 400 }, { "epoch": 0.13, "grad_norm": 4.450018405914307, "learning_rate": 4.883176050748619e-05, "loss": 2.2319, "step": 405 }, { "epoch": 0.13, "grad_norm": 3.610208511352539, "learning_rate": 4.879345722568911e-05, "loss": 2.1011, "step": 410 }, { "epoch": 0.13, "grad_norm": 3.5385842323303223, "learning_rate": 4.875455161930614e-05, "loss": 2.2372, "step": 415 }, { "epoch": 0.13, "grad_norm": 3.5152907371520996, "learning_rate": 4.871504467322162e-05, "loss": 2.3424, "step": 420 }, { "epoch": 0.13, "grad_norm": 3.0804309844970703, "learning_rate": 4.867493738754263e-05, "loss": 1.9902, "step": 425 }, { "epoch": 0.13, "grad_norm": 4.568037033081055, "learning_rate": 4.8634230777573655e-05, "loss": 2.216, "step": 430 }, { "epoch": 0.13, "grad_norm": 3.0766966342926025, "learning_rate": 4.859292587379094e-05, "loss": 2.2049, "step": 435 }, { "epoch": 0.14, "grad_norm": 3.8717846870422363, "learning_rate": 4.855102372181634e-05, "loss": 2.179, "step": 440 }, { "epoch": 0.14, "grad_norm": 3.963639497756958, "learning_rate": 4.8508525382390876e-05, "loss": 2.3567, "step": 445 }, { "epoch": 0.14, "grad_norm": 3.3204896450042725, "learning_rate": 4.8465431931347904e-05, "loss": 2.1157, "step": 450 }, { "epoch": 0.14, "grad_norm": 4.884645938873291, "learning_rate": 4.842174445958585e-05, "loss": 2.192, "step": 455 }, { "epoch": 0.14, "grad_norm": 5.058561325073242, "learning_rate": 4.837746407304061e-05, "loss": 2.2785, "step": 460 }, { "epoch": 0.14, "grad_norm": 4.240612983703613, "learning_rate": 4.833259189265753e-05, "loss": 2.3115, "step": 465 }, { "epoch": 0.15, "grad_norm": 3.628058433532715, "learning_rate": 4.8287129054363076e-05, "loss": 2.3267, "step": 470 }, { "epoch": 0.15, "grad_norm": 3.4856207370758057, "learning_rate": 4.8241076709036036e-05, "loss": 2.1803, "step": 475 }, { "epoch": 0.15, "grad_norm": 4.317348480224609, "learning_rate": 4.8194436022478404e-05, "loss": 2.1224, "step": 480 }, { "epoch": 0.15, "grad_norm": 3.6160759925842285, "learning_rate": 4.814720817538585e-05, "loss": 2.1848, "step": 485 }, { "epoch": 0.15, "grad_norm": 3.2244794368743896, "learning_rate": 4.809939436331786e-05, "loss": 2.2176, "step": 490 }, { "epoch": 0.15, "grad_norm": 3.645427942276001, "learning_rate": 4.805099579666748e-05, "loss": 2.1778, "step": 495 }, { "epoch": 0.16, "grad_norm": 3.9020988941192627, "learning_rate": 4.800201370063059e-05, "loss": 2.2817, "step": 500 }, { "epoch": 0.16, "grad_norm": 4.484887599945068, "learning_rate": 4.7952449315174996e-05, "loss": 1.9207, "step": 505 }, { "epoch": 0.16, "grad_norm": 4.281662464141846, "learning_rate": 4.790230389500901e-05, "loss": 2.2251, "step": 510 }, { "epoch": 0.16, "grad_norm": 3.9683914184570312, "learning_rate": 4.785157870954961e-05, "loss": 2.22, "step": 515 }, { "epoch": 0.16, "grad_norm": 3.39128041267395, "learning_rate": 4.780027504289042e-05, "loss": 2.3237, "step": 520 }, { "epoch": 0.16, "grad_norm": 3.148158550262451, "learning_rate": 4.774839419376914e-05, "loss": 2.1838, "step": 525 }, { "epoch": 0.16, "grad_norm": 4.339906692504883, "learning_rate": 4.769593747553468e-05, "loss": 2.0075, "step": 530 }, { "epoch": 0.17, "grad_norm": 3.3067688941955566, "learning_rate": 4.764290621611388e-05, "loss": 2.1666, "step": 535 }, { "epoch": 0.17, "grad_norm": 4.491573810577393, "learning_rate": 4.758930175797797e-05, "loss": 2.3295, "step": 540 }, { "epoch": 0.17, "grad_norm": 3.894711494445801, "learning_rate": 4.753512545810851e-05, "loss": 2.1021, "step": 545 }, { "epoch": 0.17, "grad_norm": 2.7983195781707764, "learning_rate": 4.7480378687963114e-05, "loss": 2.2335, "step": 550 }, { "epoch": 0.17, "grad_norm": 3.40674090385437, "learning_rate": 4.7425062833440634e-05, "loss": 2.0456, "step": 555 }, { "epoch": 0.17, "grad_norm": 3.834815263748169, "learning_rate": 4.736917929484616e-05, "loss": 2.3161, "step": 560 }, { "epoch": 0.18, "grad_norm": 3.8907999992370605, "learning_rate": 4.731272948685554e-05, "loss": 2.1104, "step": 565 }, { "epoch": 0.18, "grad_norm": 3.7746763229370117, "learning_rate": 4.725571483847958e-05, "loss": 2.0498, "step": 570 }, { "epoch": 0.18, "grad_norm": 4.495760917663574, "learning_rate": 4.719813679302784e-05, "loss": 2.231, "step": 575 }, { "epoch": 0.18, "grad_norm": 3.9231886863708496, "learning_rate": 4.713999680807211e-05, "loss": 2.1878, "step": 580 }, { "epoch": 0.18, "grad_norm": 4.197574138641357, "learning_rate": 4.708129635540955e-05, "loss": 2.1897, "step": 585 }, { "epoch": 0.18, "grad_norm": 4.721147060394287, "learning_rate": 4.702203692102539e-05, "loss": 2.1359, "step": 590 }, { "epoch": 0.18, "grad_norm": 2.4958722591400146, "learning_rate": 4.696222000505529e-05, "loss": 2.1873, "step": 595 }, { "epoch": 0.19, "grad_norm": 3.5209269523620605, "learning_rate": 4.6901847121747455e-05, "loss": 2.0386, "step": 600 }, { "epoch": 0.19, "grad_norm": 3.6823954582214355, "learning_rate": 4.6840919799424186e-05, "loss": 2.0325, "step": 605 }, { "epoch": 0.19, "grad_norm": 4.033428192138672, "learning_rate": 4.677943958044329e-05, "loss": 2.13, "step": 610 }, { "epoch": 0.19, "grad_norm": 3.907592535018921, "learning_rate": 4.671740802115897e-05, "loss": 2.0553, "step": 615 }, { "epoch": 0.19, "grad_norm": 3.318100690841675, "learning_rate": 4.665482669188248e-05, "loss": 2.0218, "step": 620 }, { "epoch": 0.19, "grad_norm": 4.057621479034424, "learning_rate": 4.659169717684232e-05, "loss": 2.1056, "step": 625 }, { "epoch": 0.2, "grad_norm": 4.882345199584961, "learning_rate": 4.6528021074144165e-05, "loss": 2.1249, "step": 630 }, { "epoch": 0.2, "grad_norm": 4.954129219055176, "learning_rate": 4.646379999573039e-05, "loss": 2.1942, "step": 635 }, { "epoch": 0.2, "grad_norm": 4.156874656677246, "learning_rate": 4.639903556733931e-05, "loss": 2.175, "step": 640 }, { "epoch": 0.2, "grad_norm": 4.1573710441589355, "learning_rate": 4.633372942846393e-05, "loss": 2.0856, "step": 645 }, { "epoch": 0.2, "grad_norm": 5.385977745056152, "learning_rate": 4.6267883232310575e-05, "loss": 2.2399, "step": 650 }, { "epoch": 0.2, "grad_norm": 4.143659591674805, "learning_rate": 4.620149864575689e-05, "loss": 2.17, "step": 655 }, { "epoch": 0.2, "grad_norm": 3.286294460296631, "learning_rate": 4.613457734930978e-05, "loss": 2.0458, "step": 660 }, { "epoch": 0.21, "grad_norm": 4.520682334899902, "learning_rate": 4.606712103706278e-05, "loss": 2.1244, "step": 665 }, { "epoch": 0.21, "grad_norm": 3.6921236515045166, "learning_rate": 4.59991314166532e-05, "loss": 2.0801, "step": 670 }, { "epoch": 0.21, "grad_norm": 3.1880507469177246, "learning_rate": 4.593061020921889e-05, "loss": 2.3062, "step": 675 }, { "epoch": 0.21, "grad_norm": 3.380157709121704, "learning_rate": 4.586155914935469e-05, "loss": 2.0267, "step": 680 }, { "epoch": 0.21, "grad_norm": 3.0647785663604736, "learning_rate": 4.57919799850685e-05, "loss": 2.1566, "step": 685 }, { "epoch": 0.21, "grad_norm": 3.353318691253662, "learning_rate": 4.5721874477737006e-05, "loss": 2.0618, "step": 690 }, { "epoch": 0.22, "grad_norm": 3.342336654663086, "learning_rate": 4.5651244402061144e-05, "loss": 1.9534, "step": 695 }, { "epoch": 0.22, "grad_norm": 4.064236640930176, "learning_rate": 4.558009154602115e-05, "loss": 2.1573, "step": 700 }, { "epoch": 0.22, "grad_norm": 3.5223772525787354, "learning_rate": 4.550841771083129e-05, "loss": 2.0089, "step": 705 }, { "epoch": 0.22, "grad_norm": 4.3469557762146, "learning_rate": 4.543622471089426e-05, "loss": 2.1214, "step": 710 }, { "epoch": 0.22, "grad_norm": 3.922893762588501, "learning_rate": 4.536351437375526e-05, "loss": 2.0982, "step": 715 }, { "epoch": 0.22, "grad_norm": 3.053823947906494, "learning_rate": 4.529028854005576e-05, "loss": 2.0791, "step": 720 }, { "epoch": 0.22, "grad_norm": 3.636437177658081, "learning_rate": 4.521654906348687e-05, "loss": 2.1326, "step": 725 }, { "epoch": 0.23, "grad_norm": 4.3226318359375, "learning_rate": 4.51422978107424e-05, "loss": 2.2037, "step": 730 }, { "epoch": 0.23, "grad_norm": 4.59119987487793, "learning_rate": 4.506753666147163e-05, "loss": 2.1187, "step": 735 }, { "epoch": 0.23, "grad_norm": 5.592061996459961, "learning_rate": 4.499226750823177e-05, "loss": 2.3031, "step": 740 }, { "epoch": 0.23, "grad_norm": 4.18353271484375, "learning_rate": 4.491649225643996e-05, "loss": 2.0337, "step": 745 }, { "epoch": 0.23, "grad_norm": 3.2864906787872314, "learning_rate": 4.484021282432509e-05, "loss": 2.0575, "step": 750 }, { "epoch": 0.23, "grad_norm": 3.3072474002838135, "learning_rate": 4.476343114287924e-05, "loss": 2.0173, "step": 755 }, { "epoch": 0.24, "grad_norm": 4.088031768798828, "learning_rate": 4.468614915580879e-05, "loss": 2.1929, "step": 760 }, { "epoch": 0.24, "grad_norm": 4.264316082000732, "learning_rate": 4.4608368819485204e-05, "loss": 2.0457, "step": 765 }, { "epoch": 0.24, "grad_norm": 4.678459644317627, "learning_rate": 4.453009210289551e-05, "loss": 2.031, "step": 770 }, { "epoch": 0.24, "grad_norm": 3.3418045043945312, "learning_rate": 4.445132098759249e-05, "loss": 2.1464, "step": 775 }, { "epoch": 0.24, "grad_norm": 3.89583420753479, "learning_rate": 4.4372057467644455e-05, "loss": 2.1509, "step": 780 }, { "epoch": 0.24, "grad_norm": 2.6973416805267334, "learning_rate": 4.4292303549584816e-05, "loss": 2.072, "step": 785 }, { "epoch": 0.25, "grad_norm": 4.848878383636475, "learning_rate": 4.421206125236128e-05, "loss": 2.166, "step": 790 }, { "epoch": 0.25, "grad_norm": 3.48630428314209, "learning_rate": 4.4131332607284706e-05, "loss": 1.9686, "step": 795 }, { "epoch": 0.25, "grad_norm": 3.4183597564697266, "learning_rate": 4.405011965797775e-05, "loss": 2.0781, "step": 800 }, { "epoch": 0.25, "grad_norm": 3.5883586406707764, "learning_rate": 4.3968424460323047e-05, "loss": 2.0631, "step": 805 }, { "epoch": 0.25, "grad_norm": 3.683375835418701, "learning_rate": 4.388624908241124e-05, "loss": 2.0533, "step": 810 }, { "epoch": 0.25, "grad_norm": 3.0786943435668945, "learning_rate": 4.3803595604488595e-05, "loss": 1.8946, "step": 815 }, { "epoch": 0.25, "grad_norm": 3.2280662059783936, "learning_rate": 4.372046611890434e-05, "loss": 2.0221, "step": 820 }, { "epoch": 0.26, "grad_norm": 3.1918365955352783, "learning_rate": 4.36368627300577e-05, "loss": 2.0023, "step": 825 }, { "epoch": 0.26, "grad_norm": 4.814984321594238, "learning_rate": 4.3552787554344634e-05, "loss": 2.0967, "step": 830 }, { "epoch": 0.26, "grad_norm": 5.989580154418945, "learning_rate": 4.346824272010423e-05, "loss": 1.9698, "step": 835 }, { "epoch": 0.26, "grad_norm": 3.2674803733825684, "learning_rate": 4.338323036756488e-05, "loss": 2.0381, "step": 840 }, { "epoch": 0.26, "grad_norm": 3.6016860008239746, "learning_rate": 4.3297752648790035e-05, "loss": 2.0444, "step": 845 }, { "epoch": 0.26, "grad_norm": 4.092184543609619, "learning_rate": 4.321181172762379e-05, "loss": 2.1514, "step": 850 }, { "epoch": 0.27, "grad_norm": 3.5366742610931396, "learning_rate": 4.312540977963604e-05, "loss": 2.0518, "step": 855 }, { "epoch": 0.27, "grad_norm": 4.222804069519043, "learning_rate": 4.303854899206749e-05, "loss": 1.9858, "step": 860 }, { "epoch": 0.27, "grad_norm": 4.207810401916504, "learning_rate": 4.295123156377419e-05, "loss": 2.0067, "step": 865 }, { "epoch": 0.27, "grad_norm": 3.15069842338562, "learning_rate": 4.2863459705171945e-05, "loss": 1.9234, "step": 870 }, { "epoch": 0.27, "grad_norm": 3.337561845779419, "learning_rate": 4.2775235638180344e-05, "loss": 1.974, "step": 875 }, { "epoch": 0.27, "grad_norm": 5.987912178039551, "learning_rate": 4.2686561596166487e-05, "loss": 2.1928, "step": 880 }, { "epoch": 0.27, "grad_norm": 3.9456374645233154, "learning_rate": 4.259743982388845e-05, "loss": 2.023, "step": 885 }, { "epoch": 0.28, "grad_norm": 4.308691501617432, "learning_rate": 4.250787257743851e-05, "loss": 2.1075, "step": 890 }, { "epoch": 0.28, "grad_norm": 3.699410915374756, "learning_rate": 4.2417862124185955e-05, "loss": 2.0471, "step": 895 }, { "epoch": 0.28, "grad_norm": 4.254593372344971, "learning_rate": 4.232741074271977e-05, "loss": 2.0331, "step": 900 }, { "epoch": 0.28, "grad_norm": 3.2899739742279053, "learning_rate": 4.2236520722790855e-05, "loss": 2.0153, "step": 905 }, { "epoch": 0.28, "grad_norm": 5.5724616050720215, "learning_rate": 4.214519436525418e-05, "loss": 2.1466, "step": 910 }, { "epoch": 0.28, "grad_norm": 3.673755168914795, "learning_rate": 4.2053433982010436e-05, "loss": 2.1062, "step": 915 }, { "epoch": 0.29, "grad_norm": 4.009172439575195, "learning_rate": 4.1961241895947554e-05, "loss": 2.013, "step": 920 }, { "epoch": 0.29, "grad_norm": 3.0359890460968018, "learning_rate": 4.1868620440881925e-05, "loss": 2.1153, "step": 925 }, { "epoch": 0.29, "grad_norm": 4.953378200531006, "learning_rate": 4.177557196149927e-05, "loss": 2.0847, "step": 930 }, { "epoch": 0.29, "grad_norm": 3.580415964126587, "learning_rate": 4.168209881329531e-05, "loss": 1.9907, "step": 935 }, { "epoch": 0.29, "grad_norm": 3.3144888877868652, "learning_rate": 4.1588203362516153e-05, "loss": 2.0741, "step": 940 }, { "epoch": 0.29, "grad_norm": 4.115612983703613, "learning_rate": 4.149388798609836e-05, "loss": 1.9596, "step": 945 }, { "epoch": 0.29, "grad_norm": 5.178717613220215, "learning_rate": 4.1399155071608774e-05, "loss": 2.142, "step": 950 }, { "epoch": 0.3, "grad_norm": 3.350316286087036, "learning_rate": 4.1304007017184146e-05, "loss": 2.06, "step": 955 }, { "epoch": 0.3, "grad_norm": 4.030082702636719, "learning_rate": 4.120844623147033e-05, "loss": 2.0618, "step": 960 }, { "epoch": 0.3, "grad_norm": 5.1543707847595215, "learning_rate": 4.1112475133561376e-05, "loss": 2.3692, "step": 965 }, { "epoch": 0.3, "grad_norm": 3.9695091247558594, "learning_rate": 4.101609615293827e-05, "loss": 2.0065, "step": 970 }, { "epoch": 0.3, "grad_norm": 3.1106691360473633, "learning_rate": 4.0919311729407416e-05, "loss": 2.0318, "step": 975 }, { "epoch": 0.3, "grad_norm": 3.532636880874634, "learning_rate": 4.0822124313038904e-05, "loss": 2.139, "step": 980 }, { "epoch": 0.31, "grad_norm": 4.04263162612915, "learning_rate": 4.072453636410448e-05, "loss": 2.1352, "step": 985 }, { "epoch": 0.31, "grad_norm": 4.174222946166992, "learning_rate": 4.0626550353015236e-05, "loss": 2.0269, "step": 990 }, { "epoch": 0.31, "grad_norm": 4.390026569366455, "learning_rate": 4.052816876025912e-05, "loss": 2.0775, "step": 995 }, { "epoch": 0.31, "grad_norm": 4.04339075088501, "learning_rate": 4.042939407633808e-05, "loss": 2.0042, "step": 1000 }, { "epoch": 0.31, "grad_norm": 3.5550975799560547, "learning_rate": 4.03302288017051e-05, "loss": 1.9624, "step": 1005 }, { "epoch": 0.31, "grad_norm": 4.015019416809082, "learning_rate": 4.023067544670082e-05, "loss": 2.142, "step": 1010 }, { "epoch": 0.31, "grad_norm": 3.452937126159668, "learning_rate": 4.013073653149005e-05, "loss": 2.0798, "step": 1015 }, { "epoch": 0.32, "grad_norm": 4.2777509689331055, "learning_rate": 4.0030414585997925e-05, "loss": 2.0245, "step": 1020 }, { "epoch": 0.32, "grad_norm": 5.5015459060668945, "learning_rate": 3.99297121498459e-05, "loss": 2.0897, "step": 1025 }, { "epoch": 0.32, "grad_norm": 4.524988651275635, "learning_rate": 3.982863177228743e-05, "loss": 2.182, "step": 1030 }, { "epoch": 0.32, "grad_norm": 4.300734043121338, "learning_rate": 3.972717601214345e-05, "loss": 2.0477, "step": 1035 }, { "epoch": 0.32, "grad_norm": 3.456317186355591, "learning_rate": 3.962534743773761e-05, "loss": 2.1261, "step": 1040 }, { "epoch": 0.32, "grad_norm": 3.567162275314331, "learning_rate": 3.9523148626831234e-05, "loss": 2.119, "step": 1045 }, { "epoch": 0.33, "grad_norm": 3.5200531482696533, "learning_rate": 3.942058216655808e-05, "loss": 1.9731, "step": 1050 }, { "epoch": 0.33, "grad_norm": 4.380658149719238, "learning_rate": 3.931765065335886e-05, "loss": 1.9642, "step": 1055 }, { "epoch": 0.33, "grad_norm": 4.44472074508667, "learning_rate": 3.921435669291547e-05, "loss": 1.8666, "step": 1060 }, { "epoch": 0.33, "grad_norm": 5.24396276473999, "learning_rate": 3.9110702900085064e-05, "loss": 2.0983, "step": 1065 }, { "epoch": 0.33, "grad_norm": 4.166001319885254, "learning_rate": 3.900669189883386e-05, "loss": 1.9032, "step": 1070 }, { "epoch": 0.33, "grad_norm": 3.893059730529785, "learning_rate": 3.890232632217071e-05, "loss": 1.9269, "step": 1075 }, { "epoch": 0.34, "grad_norm": 3.5707895755767822, "learning_rate": 3.879760881208042e-05, "loss": 1.9055, "step": 1080 }, { "epoch": 0.34, "grad_norm": 4.270632743835449, "learning_rate": 3.869254201945692e-05, "loss": 1.9936, "step": 1085 }, { "epoch": 0.34, "grad_norm": 4.152591228485107, "learning_rate": 3.858712860403608e-05, "loss": 2.1007, "step": 1090 }, { "epoch": 0.34, "grad_norm": 3.5370168685913086, "learning_rate": 3.848137123432848e-05, "loss": 2.1225, "step": 1095 }, { "epoch": 0.34, "grad_norm": 3.657259941101074, "learning_rate": 3.837527258755177e-05, "loss": 1.9526, "step": 1100 }, { "epoch": 0.34, "grad_norm": 4.236551761627197, "learning_rate": 3.8268835349562946e-05, "loss": 1.9357, "step": 1105 }, { "epoch": 0.34, "grad_norm": 3.312053680419922, "learning_rate": 3.816206221479034e-05, "loss": 1.9833, "step": 1110 }, { "epoch": 0.35, "grad_norm": 3.346323013305664, "learning_rate": 3.8054955886165427e-05, "loss": 1.9351, "step": 1115 }, { "epoch": 0.35, "grad_norm": 3.557433843612671, "learning_rate": 3.7947519075054364e-05, "loss": 2.0037, "step": 1120 }, { "epoch": 0.35, "grad_norm": 3.824169635772705, "learning_rate": 3.7839754501189406e-05, "loss": 2.1035, "step": 1125 }, { "epoch": 0.35, "grad_norm": 4.1984968185424805, "learning_rate": 3.7731664892600004e-05, "loss": 1.9416, "step": 1130 }, { "epoch": 0.35, "grad_norm": 2.998347520828247, "learning_rate": 3.762325298554379e-05, "loss": 1.9615, "step": 1135 }, { "epoch": 0.35, "grad_norm": 4.985104560852051, "learning_rate": 3.751452152443728e-05, "loss": 1.912, "step": 1140 }, { "epoch": 0.36, "grad_norm": 3.560026168823242, "learning_rate": 3.74054732617864e-05, "loss": 1.9317, "step": 1145 }, { "epoch": 0.36, "grad_norm": 3.894937515258789, "learning_rate": 3.7296110958116844e-05, "loss": 1.9516, "step": 1150 }, { "epoch": 0.36, "grad_norm": 3.1330158710479736, "learning_rate": 3.718643738190414e-05, "loss": 1.8787, "step": 1155 }, { "epoch": 0.36, "grad_norm": 3.924584150314331, "learning_rate": 3.707645530950361e-05, "loss": 1.9294, "step": 1160 }, { "epoch": 0.36, "grad_norm": 3.2176225185394287, "learning_rate": 3.6966167525080056e-05, "loss": 2.1003, "step": 1165 }, { "epoch": 0.36, "grad_norm": 3.9685873985290527, "learning_rate": 3.6855576820537277e-05, "loss": 1.9088, "step": 1170 }, { "epoch": 0.36, "grad_norm": 4.544212818145752, "learning_rate": 3.674468599544746e-05, "loss": 2.0211, "step": 1175 }, { "epoch": 0.37, "grad_norm": 3.6609127521514893, "learning_rate": 3.663349785698021e-05, "loss": 2.0021, "step": 1180 }, { "epoch": 0.37, "grad_norm": 4.17726469039917, "learning_rate": 3.6522015219831546e-05, "loss": 2.0828, "step": 1185 }, { "epoch": 0.37, "grad_norm": 3.6899638175964355, "learning_rate": 3.641024090615265e-05, "loss": 1.9462, "step": 1190 }, { "epoch": 0.37, "grad_norm": 3.7764229774475098, "learning_rate": 3.62981777454784e-05, "loss": 2.0825, "step": 1195 }, { "epoch": 0.37, "grad_norm": 4.037018775939941, "learning_rate": 3.6185828574655766e-05, "loss": 1.8715, "step": 1200 }, { "epoch": 0.37, "grad_norm": 3.727513074874878, "learning_rate": 3.607319623777196e-05, "loss": 1.9394, "step": 1205 }, { "epoch": 0.38, "grad_norm": 4.162086009979248, "learning_rate": 3.59602835860825e-05, "loss": 1.89, "step": 1210 }, { "epoch": 0.38, "grad_norm": 3.546518564224243, "learning_rate": 3.5847093477938956e-05, "loss": 1.8102, "step": 1215 }, { "epoch": 0.38, "grad_norm": 4.054803371429443, "learning_rate": 3.5733628778716646e-05, "loss": 1.8825, "step": 1220 }, { "epoch": 0.38, "grad_norm": 3.638885498046875, "learning_rate": 3.5619892360742075e-05, "loss": 2.0755, "step": 1225 }, { "epoch": 0.38, "grad_norm": 3.433565378189087, "learning_rate": 3.5505887103220254e-05, "loss": 2.0261, "step": 1230 }, { "epoch": 0.38, "grad_norm": 3.5785629749298096, "learning_rate": 3.5391615892161754e-05, "loss": 2.1362, "step": 1235 }, { "epoch": 0.38, "grad_norm": 3.4514031410217285, "learning_rate": 3.527708162030971e-05, "loss": 1.8821, "step": 1240 }, { "epoch": 0.39, "grad_norm": 4.2519073486328125, "learning_rate": 3.516228718706656e-05, "loss": 2.112, "step": 1245 }, { "epoch": 0.39, "grad_norm": 3.0281126499176025, "learning_rate": 3.504723549842066e-05, "loss": 1.8516, "step": 1250 }, { "epoch": 0.39, "grad_norm": 3.3636157512664795, "learning_rate": 3.4931929466872685e-05, "loss": 1.9612, "step": 1255 }, { "epoch": 0.39, "grad_norm": 3.7413578033447266, "learning_rate": 3.481637201136197e-05, "loss": 1.9865, "step": 1260 }, { "epoch": 0.39, "grad_norm": 3.007408618927002, "learning_rate": 3.4700566057192544e-05, "loss": 1.9493, "step": 1265 }, { "epoch": 0.39, "grad_norm": 4.331480979919434, "learning_rate": 3.4584514535959114e-05, "loss": 2.1174, "step": 1270 }, { "epoch": 0.4, "grad_norm": 4.286431312561035, "learning_rate": 3.446822038547287e-05, "loss": 1.883, "step": 1275 }, { "epoch": 0.4, "grad_norm": 3.356170177459717, "learning_rate": 3.435168654968706e-05, "loss": 1.9707, "step": 1280 }, { "epoch": 0.4, "grad_norm": 3.436434507369995, "learning_rate": 3.423491597862251e-05, "loss": 1.8922, "step": 1285 }, { "epoch": 0.4, "grad_norm": 3.307274580001831, "learning_rate": 3.411791162829294e-05, "loss": 2.0583, "step": 1290 }, { "epoch": 0.4, "grad_norm": 4.032553195953369, "learning_rate": 3.4000676460630126e-05, "loss": 2.0121, "step": 1295 }, { "epoch": 0.4, "grad_norm": 3.4915122985839844, "learning_rate": 3.3883213443408903e-05, "loss": 1.9361, "step": 1300 }, { "epoch": 0.4, "grad_norm": 3.969005823135376, "learning_rate": 3.3765525550172066e-05, "loss": 1.8782, "step": 1305 }, { "epoch": 0.41, "grad_norm": 3.772780179977417, "learning_rate": 3.364761576015507e-05, "loss": 2.0914, "step": 1310 }, { "epoch": 0.41, "grad_norm": 2.9640040397644043, "learning_rate": 3.352948705821065e-05, "loss": 1.9143, "step": 1315 }, { "epoch": 0.41, "grad_norm": 5.698980331420898, "learning_rate": 3.341114243473319e-05, "loss": 1.9417, "step": 1320 }, { "epoch": 0.41, "grad_norm": 3.4275810718536377, "learning_rate": 3.3292584885583114e-05, "loss": 1.9053, "step": 1325 }, { "epoch": 0.41, "grad_norm": 3.2752602100372314, "learning_rate": 3.317381741201097e-05, "loss": 2.0126, "step": 1330 }, { "epoch": 0.41, "grad_norm": 4.166382312774658, "learning_rate": 3.305484302058148e-05, "loss": 1.9256, "step": 1335 }, { "epoch": 0.42, "grad_norm": 3.7549707889556885, "learning_rate": 3.293566472309746e-05, "loss": 2.0742, "step": 1340 }, { "epoch": 0.42, "grad_norm": 3.449774980545044, "learning_rate": 3.2816285536523515e-05, "loss": 1.9322, "step": 1345 }, { "epoch": 0.42, "grad_norm": 3.590756416320801, "learning_rate": 3.269670848290973e-05, "loss": 1.9619, "step": 1350 }, { "epoch": 0.42, "grad_norm": 4.403102874755859, "learning_rate": 3.2576936589315124e-05, "loss": 1.9513, "step": 1355 }, { "epoch": 0.42, "grad_norm": 4.1176676750183105, "learning_rate": 3.245697288773102e-05, "loss": 2.0274, "step": 1360 }, { "epoch": 0.42, "grad_norm": 4.0299859046936035, "learning_rate": 3.233682041500433e-05, "loss": 1.9853, "step": 1365 }, { "epoch": 0.43, "grad_norm": 4.306421279907227, "learning_rate": 3.2216482212760646e-05, "loss": 1.949, "step": 1370 }, { "epoch": 0.43, "grad_norm": 3.9233736991882324, "learning_rate": 3.209596132732725e-05, "loss": 1.9009, "step": 1375 }, { "epoch": 0.43, "grad_norm": 3.82336163520813, "learning_rate": 3.197526080965598e-05, "loss": 2.1035, "step": 1380 }, { "epoch": 0.43, "grad_norm": 3.946753740310669, "learning_rate": 3.185438371524605e-05, "loss": 1.9775, "step": 1385 }, { "epoch": 0.43, "grad_norm": 4.122159481048584, "learning_rate": 3.173333310406662e-05, "loss": 1.7694, "step": 1390 }, { "epoch": 0.43, "grad_norm": 3.5491435527801514, "learning_rate": 3.161211204047943e-05, "loss": 2.0022, "step": 1395 }, { "epoch": 0.43, "grad_norm": 4.0456438064575195, "learning_rate": 3.1490723593161096e-05, "loss": 2.1332, "step": 1400 }, { "epoch": 0.44, "grad_norm": 3.476616621017456, "learning_rate": 3.1369170835025594e-05, "loss": 1.9567, "step": 1405 }, { "epoch": 0.44, "grad_norm": 3.3506128787994385, "learning_rate": 3.124745684314633e-05, "loss": 2.1015, "step": 1410 }, { "epoch": 0.44, "grad_norm": 3.737765312194824, "learning_rate": 3.112558469867829e-05, "loss": 1.9677, "step": 1415 }, { "epoch": 0.44, "grad_norm": 3.6628215312957764, "learning_rate": 3.100355748678009e-05, "loss": 2.1167, "step": 1420 }, { "epoch": 0.44, "grad_norm": 3.3631627559661865, "learning_rate": 3.0881378296535784e-05, "loss": 1.928, "step": 1425 }, { "epoch": 0.44, "grad_norm": 4.281042575836182, "learning_rate": 3.075905022087675e-05, "loss": 1.9394, "step": 1430 }, { "epoch": 0.45, "grad_norm": 3.994631290435791, "learning_rate": 3.063657635650335e-05, "loss": 1.8533, "step": 1435 }, { "epoch": 0.45, "grad_norm": 5.131731033325195, "learning_rate": 3.0513959803806526e-05, "loss": 1.9484, "step": 1440 }, { "epoch": 0.45, "grad_norm": 3.4644176959991455, "learning_rate": 3.039120366678937e-05, "loss": 1.9492, "step": 1445 }, { "epoch": 0.45, "grad_norm": 3.832453966140747, "learning_rate": 3.0268311052988473e-05, "loss": 1.869, "step": 1450 }, { "epoch": 0.45, "grad_norm": 3.8497562408447266, "learning_rate": 3.0145285073395334e-05, "loss": 1.8965, "step": 1455 }, { "epoch": 0.45, "grad_norm": 3.4898972511291504, "learning_rate": 3.0022128842377534e-05, "loss": 2.0029, "step": 1460 }, { "epoch": 0.45, "grad_norm": 4.340991020202637, "learning_rate": 2.9898845477599963e-05, "loss": 1.9139, "step": 1465 }, { "epoch": 0.46, "grad_norm": 5.687810897827148, "learning_rate": 2.9775438099945836e-05, "loss": 2.0196, "step": 1470 }, { "epoch": 0.46, "grad_norm": 3.468388795852661, "learning_rate": 2.965190983343774e-05, "loss": 2.0382, "step": 1475 }, { "epoch": 0.46, "grad_norm": 3.2167277336120605, "learning_rate": 2.9528263805158524e-05, "loss": 2.0924, "step": 1480 }, { "epoch": 0.46, "grad_norm": 4.481842041015625, "learning_rate": 2.940450314517214e-05, "loss": 2.0535, "step": 1485 }, { "epoch": 0.46, "grad_norm": 4.334501266479492, "learning_rate": 2.92806309864444e-05, "loss": 1.9523, "step": 1490 }, { "epoch": 0.46, "grad_norm": 4.137599945068359, "learning_rate": 2.9156650464763713e-05, "loss": 2.0247, "step": 1495 }, { "epoch": 0.47, "grad_norm": 3.5023269653320312, "learning_rate": 2.9032564718661603e-05, "loss": 2.0151, "step": 1500 }, { "epoch": 0.47, "grad_norm": 4.225565433502197, "learning_rate": 2.8908376889333376e-05, "loss": 1.9438, "step": 1505 }, { "epoch": 0.47, "grad_norm": 3.86175799369812, "learning_rate": 2.8784090120558515e-05, "loss": 2.0108, "step": 1510 }, { "epoch": 0.47, "grad_norm": 2.7544214725494385, "learning_rate": 2.865970755862114e-05, "loss": 1.943, "step": 1515 }, { "epoch": 0.47, "grad_norm": 3.8477399349212646, "learning_rate": 2.8535232352230345e-05, "loss": 1.891, "step": 1520 }, { "epoch": 0.47, "grad_norm": 3.7875800132751465, "learning_rate": 2.8410667652440482e-05, "loss": 1.9343, "step": 1525 }, { "epoch": 0.47, "grad_norm": 3.8977842330932617, "learning_rate": 2.828601661257142e-05, "loss": 1.8978, "step": 1530 }, { "epoch": 0.48, "grad_norm": 3.39017915725708, "learning_rate": 2.8161282388128696e-05, "loss": 1.9368, "step": 1535 }, { "epoch": 0.48, "grad_norm": 4.3148322105407715, "learning_rate": 2.8036468136723627e-05, "loss": 1.9393, "step": 1540 }, { "epoch": 0.48, "grad_norm": 3.528031587600708, "learning_rate": 2.7911577017993412e-05, "loss": 1.831, "step": 1545 }, { "epoch": 0.48, "grad_norm": 4.506915092468262, "learning_rate": 2.778661219352111e-05, "loss": 2.1384, "step": 1550 }, { "epoch": 0.48, "grad_norm": 4.252208709716797, "learning_rate": 2.766157682675562e-05, "loss": 1.9593, "step": 1555 }, { "epoch": 0.48, "grad_norm": 3.718641996383667, "learning_rate": 2.753647408293161e-05, "loss": 1.9347, "step": 1560 }, { "epoch": 0.49, "grad_norm": 3.7793309688568115, "learning_rate": 2.7411307128989368e-05, "loss": 1.9519, "step": 1565 }, { "epoch": 0.49, "grad_norm": 3.7921085357666016, "learning_rate": 2.728607913349464e-05, "loss": 1.8966, "step": 1570 }, { "epoch": 0.49, "grad_norm": 3.735579252243042, "learning_rate": 2.7160793266558443e-05, "loss": 1.8972, "step": 1575 }, { "epoch": 0.49, "grad_norm": 4.979485511779785, "learning_rate": 2.7035452699756768e-05, "loss": 1.9879, "step": 1580 }, { "epoch": 0.49, "grad_norm": 3.672161102294922, "learning_rate": 2.6910060606050324e-05, "loss": 1.895, "step": 1585 }, { "epoch": 0.49, "grad_norm": 3.2381715774536133, "learning_rate": 2.6784620159704222e-05, "loss": 1.9259, "step": 1590 }, { "epoch": 0.49, "grad_norm": 5.407585620880127, "learning_rate": 2.6659134536207587e-05, "loss": 1.9021, "step": 1595 }, { "epoch": 0.5, "grad_norm": 3.894399642944336, "learning_rate": 2.6533606912193216e-05, "loss": 2.0666, "step": 1600 }, { "epoch": 0.5, "grad_norm": 3.4516754150390625, "learning_rate": 2.6408040465357097e-05, "loss": 1.9388, "step": 1605 }, { "epoch": 0.5, "grad_norm": 5.389581203460693, "learning_rate": 2.628243837437806e-05, "loss": 1.9731, "step": 1610 }, { "epoch": 0.5, "grad_norm": 3.623656988143921, "learning_rate": 2.6156803818837204e-05, "loss": 1.8931, "step": 1615 }, { "epoch": 0.5, "grad_norm": 3.5042312145233154, "learning_rate": 2.6031139979137492e-05, "loss": 1.8365, "step": 1620 }, { "epoch": 0.5, "grad_norm": 5.07073974609375, "learning_rate": 2.59054500364232e-05, "loss": 2.0215, "step": 1625 }, { "epoch": 0.51, "grad_norm": 4.199176788330078, "learning_rate": 2.5779737172499396e-05, "loss": 1.967, "step": 1630 }, { "epoch": 0.51, "grad_norm": 4.009402751922607, "learning_rate": 2.565400456975138e-05, "loss": 2.0154, "step": 1635 }, { "epoch": 0.51, "grad_norm": 3.114271640777588, "learning_rate": 2.552825541106414e-05, "loss": 1.9405, "step": 1640 }, { "epoch": 0.51, "grad_norm": 3.4758782386779785, "learning_rate": 2.540249287974178e-05, "loss": 1.94, "step": 1645 }, { "epoch": 0.51, "grad_norm": 6.038011074066162, "learning_rate": 2.527672015942693e-05, "loss": 2.1653, "step": 1650 }, { "epoch": 0.51, "grad_norm": 3.370410203933716, "learning_rate": 2.5150940434020132e-05, "loss": 1.9588, "step": 1655 }, { "epoch": 0.52, "grad_norm": 3.766829252243042, "learning_rate": 2.5025156887599288e-05, "loss": 1.8133, "step": 1660 }, { "epoch": 0.52, "grad_norm": 3.650520086288452, "learning_rate": 2.489937270433901e-05, "loss": 1.9111, "step": 1665 }, { "epoch": 0.52, "grad_norm": 3.1080238819122314, "learning_rate": 2.4773591068430018e-05, "loss": 1.8758, "step": 1670 }, { "epoch": 0.52, "grad_norm": 3.3637783527374268, "learning_rate": 2.4647815163998585e-05, "loss": 1.7589, "step": 1675 }, { "epoch": 0.52, "grad_norm": 4.043179988861084, "learning_rate": 2.452204817502587e-05, "loss": 1.9339, "step": 1680 }, { "epoch": 0.52, "grad_norm": 4.033404350280762, "learning_rate": 2.4396293285267327e-05, "loss": 1.9412, "step": 1685 }, { "epoch": 0.52, "grad_norm": 4.043616771697998, "learning_rate": 2.427055367817214e-05, "loss": 1.8728, "step": 1690 }, { "epoch": 0.53, "grad_norm": 4.840696811676025, "learning_rate": 2.4144832536802628e-05, "loss": 1.9966, "step": 1695 }, { "epoch": 0.53, "grad_norm": 4.977992057800293, "learning_rate": 2.4019133043753628e-05, "loss": 1.9621, "step": 1700 }, { "epoch": 0.53, "grad_norm": 3.1471240520477295, "learning_rate": 2.3893458381071964e-05, "loss": 2.0315, "step": 1705 }, { "epoch": 0.53, "grad_norm": 5.21504020690918, "learning_rate": 2.376781173017589e-05, "loss": 1.9859, "step": 1710 }, { "epoch": 0.53, "grad_norm": 3.4117472171783447, "learning_rate": 2.3642196271774568e-05, "loss": 1.905, "step": 1715 }, { "epoch": 0.53, "grad_norm": 3.8640167713165283, "learning_rate": 2.3516615185787494e-05, "loss": 2.0321, "step": 1720 }, { "epoch": 0.54, "grad_norm": 3.5830259323120117, "learning_rate": 2.3391071651264064e-05, "loss": 1.9936, "step": 1725 }, { "epoch": 0.54, "grad_norm": 5.528283596038818, "learning_rate": 2.3265568846303054e-05, "loss": 1.8955, "step": 1730 }, { "epoch": 0.54, "grad_norm": 3.968691110610962, "learning_rate": 2.3140109947972204e-05, "loss": 1.9137, "step": 1735 }, { "epoch": 0.54, "grad_norm": 3.56799054145813, "learning_rate": 2.3014698132227735e-05, "loss": 1.9854, "step": 1740 }, { "epoch": 0.54, "grad_norm": 4.353531360626221, "learning_rate": 2.2889336573834027e-05, "loss": 1.8967, "step": 1745 }, { "epoch": 0.54, "grad_norm": 3.8630661964416504, "learning_rate": 2.276402844628317e-05, "loss": 1.8833, "step": 1750 }, { "epoch": 0.54, "grad_norm": 3.5117268562316895, "learning_rate": 2.2638776921714696e-05, "loss": 1.8493, "step": 1755 }, { "epoch": 0.55, "grad_norm": 4.000200271606445, "learning_rate": 2.251358517083524e-05, "loss": 1.8717, "step": 1760 }, { "epoch": 0.55, "grad_norm": 3.0542423725128174, "learning_rate": 2.2388456362838283e-05, "loss": 1.9941, "step": 1765 }, { "epoch": 0.55, "grad_norm": 4.117686748504639, "learning_rate": 2.2263393665323907e-05, "loss": 2.0925, "step": 1770 }, { "epoch": 0.55, "grad_norm": 5.376316070556641, "learning_rate": 2.2138400244218665e-05, "loss": 2.0568, "step": 1775 }, { "epoch": 0.55, "grad_norm": 3.879211187362671, "learning_rate": 2.2013479263695368e-05, "loss": 1.9256, "step": 1780 }, { "epoch": 0.55, "grad_norm": 4.660920143127441, "learning_rate": 2.1888633886093017e-05, "loss": 2.092, "step": 1785 }, { "epoch": 0.56, "grad_norm": 3.143937587738037, "learning_rate": 2.176386727183676e-05, "loss": 1.7624, "step": 1790 }, { "epoch": 0.56, "grad_norm": 4.354220390319824, "learning_rate": 2.1639182579357846e-05, "loss": 1.8961, "step": 1795 }, { "epoch": 0.56, "grad_norm": 5.339317798614502, "learning_rate": 2.151458296501374e-05, "loss": 1.9361, "step": 1800 }, { "epoch": 0.56, "grad_norm": 3.080310344696045, "learning_rate": 2.139007158300814e-05, "loss": 1.8459, "step": 1805 }, { "epoch": 0.56, "grad_norm": 3.5018744468688965, "learning_rate": 2.126565158531119e-05, "loss": 1.9086, "step": 1810 }, { "epoch": 0.56, "grad_norm": 5.1605072021484375, "learning_rate": 2.1141326121579638e-05, "loss": 1.9395, "step": 1815 }, { "epoch": 0.56, "grad_norm": 4.0767998695373535, "learning_rate": 2.1017098339077176e-05, "loss": 2.005, "step": 1820 }, { "epoch": 0.57, "grad_norm": 4.308762073516846, "learning_rate": 2.0892971382594694e-05, "loss": 1.8772, "step": 1825 }, { "epoch": 0.57, "grad_norm": 3.049802541732788, "learning_rate": 2.0768948394370702e-05, "loss": 1.9591, "step": 1830 }, { "epoch": 0.57, "grad_norm": 3.853872060775757, "learning_rate": 2.0645032514011773e-05, "loss": 1.8408, "step": 1835 }, { "epoch": 0.57, "grad_norm": 3.8186545372009277, "learning_rate": 2.052122687841311e-05, "loss": 1.9765, "step": 1840 }, { "epoch": 0.57, "grad_norm": 3.238193988800049, "learning_rate": 2.0397534621679075e-05, "loss": 1.931, "step": 1845 }, { "epoch": 0.57, "grad_norm": 3.316253662109375, "learning_rate": 2.0273958875043874e-05, "loss": 1.9787, "step": 1850 }, { "epoch": 0.58, "grad_norm": 4.303181171417236, "learning_rate": 2.0150502766792298e-05, "loss": 1.9991, "step": 1855 }, { "epoch": 0.58, "grad_norm": 3.6812000274658203, "learning_rate": 2.0027169422180546e-05, "loss": 1.8782, "step": 1860 }, { "epoch": 0.58, "grad_norm": 5.033133506774902, "learning_rate": 1.990396196335706e-05, "loss": 1.8406, "step": 1865 }, { "epoch": 0.58, "grad_norm": 4.612210750579834, "learning_rate": 1.9780883509283526e-05, "loss": 2.0226, "step": 1870 }, { "epoch": 0.58, "grad_norm": 4.63312292098999, "learning_rate": 1.9657937175655922e-05, "loss": 1.9403, "step": 1875 }, { "epoch": 0.58, "grad_norm": 3.5263733863830566, "learning_rate": 1.9535126074825647e-05, "loss": 1.9812, "step": 1880 }, { "epoch": 0.58, "grad_norm": 3.100794792175293, "learning_rate": 1.941245331572068e-05, "loss": 1.8332, "step": 1885 }, { "epoch": 0.59, "grad_norm": 4.041380405426025, "learning_rate": 1.9289922003766962e-05, "loss": 1.9352, "step": 1890 }, { "epoch": 0.59, "grad_norm": 3.329756736755371, "learning_rate": 1.9167535240809703e-05, "loss": 1.9084, "step": 1895 }, { "epoch": 0.59, "grad_norm": 3.596053123474121, "learning_rate": 1.904529612503493e-05, "loss": 1.8971, "step": 1900 }, { "epoch": 0.59, "grad_norm": 3.9134511947631836, "learning_rate": 1.8923207750890992e-05, "loss": 2.0642, "step": 1905 }, { "epoch": 0.59, "grad_norm": 3.707994222640991, "learning_rate": 1.8801273209010284e-05, "loss": 1.8276, "step": 1910 }, { "epoch": 0.59, "grad_norm": 4.338993072509766, "learning_rate": 1.8679495586130952e-05, "loss": 1.9576, "step": 1915 }, { "epoch": 0.6, "grad_norm": 3.758429765701294, "learning_rate": 1.8557877965018817e-05, "loss": 1.9956, "step": 1920 }, { "epoch": 0.6, "grad_norm": 3.7816905975341797, "learning_rate": 1.843642342438928e-05, "loss": 1.9079, "step": 1925 }, { "epoch": 0.6, "grad_norm": 5.009194850921631, "learning_rate": 1.8315135038829406e-05, "loss": 1.9509, "step": 1930 }, { "epoch": 0.6, "grad_norm": 3.4465157985687256, "learning_rate": 1.8194015878720084e-05, "loss": 2.0019, "step": 1935 }, { "epoch": 0.6, "grad_norm": 3.6948273181915283, "learning_rate": 1.8073069010158334e-05, "loss": 2.0043, "step": 1940 }, { "epoch": 0.6, "grad_norm": 3.3850791454315186, "learning_rate": 1.795229749487965e-05, "loss": 1.9031, "step": 1945 }, { "epoch": 0.61, "grad_norm": 5.051716327667236, "learning_rate": 1.7831704390180498e-05, "loss": 1.8958, "step": 1950 }, { "epoch": 0.61, "grad_norm": 2.8910887241363525, "learning_rate": 1.7711292748840943e-05, "loss": 1.8856, "step": 1955 }, { "epoch": 0.61, "grad_norm": 3.8123810291290283, "learning_rate": 1.759106561904737e-05, "loss": 1.8229, "step": 1960 }, { "epoch": 0.61, "grad_norm": 4.154626369476318, "learning_rate": 1.747102604431528e-05, "loss": 1.9509, "step": 1965 }, { "epoch": 0.61, "grad_norm": 4.20812463760376, "learning_rate": 1.7351177063412276e-05, "loss": 1.9501, "step": 1970 }, { "epoch": 0.61, "grad_norm": 3.2041704654693604, "learning_rate": 1.723152171028114e-05, "loss": 1.9888, "step": 1975 }, { "epoch": 0.61, "grad_norm": 3.133105754852295, "learning_rate": 1.7112063013963044e-05, "loss": 2.0086, "step": 1980 }, { "epoch": 0.62, "grad_norm": 4.227274417877197, "learning_rate": 1.6992803998520794e-05, "loss": 1.9373, "step": 1985 }, { "epoch": 0.62, "grad_norm": 3.2231645584106445, "learning_rate": 1.6873747682962394e-05, "loss": 1.7439, "step": 1990 }, { "epoch": 0.62, "grad_norm": 2.90924334526062, "learning_rate": 1.67548970811645e-05, "loss": 1.8914, "step": 1995 }, { "epoch": 0.62, "grad_norm": 3.2363147735595703, "learning_rate": 1.6636255201796237e-05, "loss": 1.9674, "step": 2000 }, { "epoch": 0.62, "grad_norm": 4.925014019012451, "learning_rate": 1.6517825048242936e-05, "loss": 1.8693, "step": 2005 }, { "epoch": 0.62, "grad_norm": 3.2326242923736572, "learning_rate": 1.6399609618530183e-05, "loss": 1.8776, "step": 2010 }, { "epoch": 0.63, "grad_norm": 3.984081506729126, "learning_rate": 1.6281611905247855e-05, "loss": 1.881, "step": 2015 }, { "epoch": 0.63, "grad_norm": 3.8823959827423096, "learning_rate": 1.6163834895474445e-05, "loss": 1.9769, "step": 2020 }, { "epoch": 0.63, "grad_norm": 4.131060600280762, "learning_rate": 1.604628157070136e-05, "loss": 1.9811, "step": 2025 }, { "epoch": 0.63, "grad_norm": 4.516271591186523, "learning_rate": 1.5928954906757515e-05, "loss": 1.995, "step": 2030 }, { "epoch": 0.63, "grad_norm": 3.9269816875457764, "learning_rate": 1.5811857873733942e-05, "loss": 1.8224, "step": 2035 }, { "epoch": 0.63, "grad_norm": 3.7068333625793457, "learning_rate": 1.5694993435908646e-05, "loss": 1.8799, "step": 2040 }, { "epoch": 0.63, "grad_norm": 4.0933756828308105, "learning_rate": 1.557836455167157e-05, "loss": 1.9251, "step": 2045 }, { "epoch": 0.64, "grad_norm": 4.189598560333252, "learning_rate": 1.546197417344965e-05, "loss": 2.032, "step": 2050 }, { "epoch": 0.64, "grad_norm": 3.609545946121216, "learning_rate": 1.5345825247632135e-05, "loss": 1.9399, "step": 2055 }, { "epoch": 0.64, "grad_norm": 3.9929699897766113, "learning_rate": 1.5229920714495948e-05, "loss": 1.8803, "step": 2060 }, { "epoch": 0.64, "grad_norm": 3.578582286834717, "learning_rate": 1.5114263508131327e-05, "loss": 1.8303, "step": 2065 }, { "epoch": 0.64, "grad_norm": 3.167156457901001, "learning_rate": 1.499885655636746e-05, "loss": 2.0741, "step": 2070 }, { "epoch": 0.64, "grad_norm": 3.376950263977051, "learning_rate": 1.4883702780698433e-05, "loss": 1.8935, "step": 2075 }, { "epoch": 0.65, "grad_norm": 7.022952556610107, "learning_rate": 1.4768805096209231e-05, "loss": 1.9285, "step": 2080 }, { "epoch": 0.65, "grad_norm": 4.465900897979736, "learning_rate": 1.4654166411502002e-05, "loss": 1.9464, "step": 2085 }, { "epoch": 0.65, "grad_norm": 2.990349292755127, "learning_rate": 1.4539789628622347e-05, "loss": 1.8252, "step": 2090 }, { "epoch": 0.65, "grad_norm": 3.1683619022369385, "learning_rate": 1.4425677642985924e-05, "loss": 1.8346, "step": 2095 }, { "epoch": 0.65, "grad_norm": 3.782841444015503, "learning_rate": 1.4311833343305097e-05, "loss": 1.8584, "step": 2100 }, { "epoch": 0.65, "grad_norm": 3.302788257598877, "learning_rate": 1.4198259611515886e-05, "loss": 1.9615, "step": 2105 }, { "epoch": 0.65, "grad_norm": 4.179065227508545, "learning_rate": 1.4084959322704893e-05, "loss": 2.0387, "step": 2110 }, { "epoch": 0.66, "grad_norm": 3.3860225677490234, "learning_rate": 1.3971935345036657e-05, "loss": 1.7267, "step": 2115 }, { "epoch": 0.66, "grad_norm": 4.326015472412109, "learning_rate": 1.3859190539680927e-05, "loss": 1.9828, "step": 2120 }, { "epoch": 0.66, "grad_norm": 3.4805123805999756, "learning_rate": 1.3746727760740328e-05, "loss": 1.8873, "step": 2125 }, { "epoch": 0.66, "grad_norm": 2.8176207542419434, "learning_rate": 1.3634549855178028e-05, "loss": 2.0302, "step": 2130 }, { "epoch": 0.66, "grad_norm": 2.756837844848633, "learning_rate": 1.3522659662745723e-05, "loss": 1.9893, "step": 2135 }, { "epoch": 0.66, "grad_norm": 4.258969783782959, "learning_rate": 1.3411060015911734e-05, "loss": 1.847, "step": 2140 }, { "epoch": 0.67, "grad_norm": 5.707541465759277, "learning_rate": 1.32997537397893e-05, "loss": 1.8802, "step": 2145 }, { "epoch": 0.67, "grad_norm": 3.7876532077789307, "learning_rate": 1.3188743652065083e-05, "loss": 1.9015, "step": 2150 }, { "epoch": 0.67, "grad_norm": 3.91947340965271, "learning_rate": 1.3078032562927788e-05, "loss": 1.8293, "step": 2155 }, { "epoch": 0.67, "grad_norm": 4.129434108734131, "learning_rate": 1.296762327499707e-05, "loss": 1.786, "step": 2160 }, { "epoch": 0.67, "grad_norm": 3.0605030059814453, "learning_rate": 1.2857518583252587e-05, "loss": 1.9754, "step": 2165 }, { "epoch": 0.67, "grad_norm": 3.6712772846221924, "learning_rate": 1.2747721274963214e-05, "loss": 1.8931, "step": 2170 }, { "epoch": 0.67, "grad_norm": 3.6777453422546387, "learning_rate": 1.2638234129616488e-05, "loss": 1.9122, "step": 2175 }, { "epoch": 0.68, "grad_norm": 3.1498284339904785, "learning_rate": 1.2529059918848296e-05, "loss": 1.8041, "step": 2180 }, { "epoch": 0.68, "grad_norm": 3.7665841579437256, "learning_rate": 1.2420201406372662e-05, "loss": 1.7802, "step": 2185 }, { "epoch": 0.68, "grad_norm": 3.147603988647461, "learning_rate": 1.2311661347911783e-05, "loss": 1.9658, "step": 2190 }, { "epoch": 0.68, "grad_norm": 3.327116012573242, "learning_rate": 1.220344249112629e-05, "loss": 1.8795, "step": 2195 }, { "epoch": 0.68, "grad_norm": 3.689382553100586, "learning_rate": 1.2095547575545686e-05, "loss": 1.942, "step": 2200 }, { "epoch": 0.68, "grad_norm": 3.967803955078125, "learning_rate": 1.1987979332499011e-05, "loss": 1.8653, "step": 2205 }, { "epoch": 0.69, "grad_norm": 3.113976001739502, "learning_rate": 1.1880740485045649e-05, "loss": 1.8737, "step": 2210 }, { "epoch": 0.69, "grad_norm": 3.3383049964904785, "learning_rate": 1.1773833747906471e-05, "loss": 1.9163, "step": 2215 }, { "epoch": 0.69, "grad_norm": 3.971327304840088, "learning_rate": 1.1667261827395035e-05, "loss": 2.0355, "step": 2220 }, { "epoch": 0.69, "grad_norm": 3.8071823120117188, "learning_rate": 1.1561027421349117e-05, "loss": 1.7467, "step": 2225 }, { "epoch": 0.69, "grad_norm": 3.7409048080444336, "learning_rate": 1.145513321906243e-05, "loss": 1.847, "step": 2230 }, { "epoch": 0.69, "grad_norm": 5.195309162139893, "learning_rate": 1.1349581901216514e-05, "loss": 2.0805, "step": 2235 }, { "epoch": 0.7, "grad_norm": 2.922433376312256, "learning_rate": 1.1244376139812867e-05, "loss": 1.7545, "step": 2240 }, { "epoch": 0.7, "grad_norm": 5.311805725097656, "learning_rate": 1.1139518598105358e-05, "loss": 1.9093, "step": 2245 }, { "epoch": 0.7, "grad_norm": 3.9856057167053223, "learning_rate": 1.1035011930532771e-05, "loss": 1.8777, "step": 2250 }, { "epoch": 0.7, "grad_norm": 3.006605386734009, "learning_rate": 1.0930858782651585e-05, "loss": 1.9631, "step": 2255 }, { "epoch": 0.7, "grad_norm": 3.3158912658691406, "learning_rate": 1.0827061791069045e-05, "loss": 1.8097, "step": 2260 }, { "epoch": 0.7, "grad_norm": 4.086146831512451, "learning_rate": 1.0723623583376392e-05, "loss": 1.9171, "step": 2265 }, { "epoch": 0.7, "grad_norm": 4.822931289672852, "learning_rate": 1.062054677808238e-05, "loss": 2.1704, "step": 2270 }, { "epoch": 0.71, "grad_norm": 3.8096282482147217, "learning_rate": 1.0517833984546923e-05, "loss": 1.9599, "step": 2275 }, { "epoch": 0.71, "grad_norm": 5.096799373626709, "learning_rate": 1.0415487802915133e-05, "loss": 1.9463, "step": 2280 }, { "epoch": 0.71, "grad_norm": 3.9913666248321533, "learning_rate": 1.0313510824051393e-05, "loss": 1.9045, "step": 2285 }, { "epoch": 0.71, "grad_norm": 3.0718228816986084, "learning_rate": 1.0211905629473866e-05, "loss": 1.7678, "step": 2290 }, { "epoch": 0.71, "grad_norm": 5.186037540435791, "learning_rate": 1.0110674791289079e-05, "loss": 1.9355, "step": 2295 }, { "epoch": 0.71, "grad_norm": 3.739786386489868, "learning_rate": 1.0009820872126835e-05, "loss": 2.015, "step": 2300 }, { "epoch": 0.72, "grad_norm": 3.730051040649414, "learning_rate": 9.909346425075335e-06, "loss": 1.9639, "step": 2305 }, { "epoch": 0.72, "grad_norm": 4.366475582122803, "learning_rate": 9.809253993616569e-06, "loss": 2.1142, "step": 2310 }, { "epoch": 0.72, "grad_norm": 2.9198176860809326, "learning_rate": 9.709546111561913e-06, "loss": 1.8616, "step": 2315 }, { "epoch": 0.72, "grad_norm": 3.5179014205932617, "learning_rate": 9.610225302987961e-06, "loss": 1.8651, "step": 2320 }, { "epoch": 0.72, "grad_norm": 3.9303548336029053, "learning_rate": 9.511294082172653e-06, "loss": 2.0002, "step": 2325 }, { "epoch": 0.72, "grad_norm": 3.435821771621704, "learning_rate": 9.412754953531663e-06, "loss": 1.8817, "step": 2330 }, { "epoch": 0.72, "grad_norm": 4.4535932540893555, "learning_rate": 9.314610411554925e-06, "loss": 1.8213, "step": 2335 }, { "epoch": 0.73, "grad_norm": 3.345769166946411, "learning_rate": 9.216862940743529e-06, "loss": 1.8374, "step": 2340 }, { "epoch": 0.73, "grad_norm": 4.314777851104736, "learning_rate": 9.119515015546836e-06, "loss": 2.0438, "step": 2345 }, { "epoch": 0.73, "grad_norm": 4.599632263183594, "learning_rate": 9.02256910029983e-06, "loss": 1.8459, "step": 2350 }, { "epoch": 0.73, "grad_norm": 3.590637683868408, "learning_rate": 8.926027649160704e-06, "loss": 1.8009, "step": 2355 }, { "epoch": 0.73, "grad_norm": 3.119189500808716, "learning_rate": 8.82989310604877e-06, "loss": 1.9651, "step": 2360 }, { "epoch": 0.73, "grad_norm": 3.1386303901672363, "learning_rate": 8.734167904582566e-06, "loss": 1.7791, "step": 2365 }, { "epoch": 0.74, "grad_norm": 3.6528995037078857, "learning_rate": 8.638854468018296e-06, "loss": 1.9259, "step": 2370 }, { "epoch": 0.74, "grad_norm": 4.182424545288086, "learning_rate": 8.543955209188412e-06, "loss": 1.8853, "step": 2375 }, { "epoch": 0.74, "grad_norm": 5.662861347198486, "learning_rate": 8.449472530440612e-06, "loss": 1.9349, "step": 2380 }, { "epoch": 0.74, "grad_norm": 4.169982433319092, "learning_rate": 8.355408823576951e-06, "loss": 1.9554, "step": 2385 }, { "epoch": 0.74, "grad_norm": 3.808478832244873, "learning_rate": 8.261766469793373e-06, "loss": 1.8309, "step": 2390 }, { "epoch": 0.74, "grad_norm": 3.801201343536377, "learning_rate": 8.168547839619352e-06, "loss": 1.8714, "step": 2395 }, { "epoch": 0.74, "grad_norm": 3.8212218284606934, "learning_rate": 8.075755292857933e-06, "loss": 1.844, "step": 2400 }, { "epoch": 0.75, "grad_norm": 4.7147650718688965, "learning_rate": 7.983391178525979e-06, "loss": 1.9004, "step": 2405 }, { "epoch": 0.75, "grad_norm": 3.4768807888031006, "learning_rate": 7.89145783479471e-06, "loss": 1.947, "step": 2410 }, { "epoch": 0.75, "grad_norm": 3.307199478149414, "learning_rate": 7.799957588930523e-06, "loss": 1.9069, "step": 2415 }, { "epoch": 0.75, "grad_norm": 4.613658905029297, "learning_rate": 7.708892757236047e-06, "loss": 1.917, "step": 2420 }, { "epoch": 0.75, "grad_norm": 2.8293955326080322, "learning_rate": 7.618265644991535e-06, "loss": 1.8854, "step": 2425 }, { "epoch": 0.75, "grad_norm": 3.302823066711426, "learning_rate": 7.528078546396481e-06, "loss": 2.0073, "step": 2430 }, { "epoch": 0.76, "grad_norm": 2.862478494644165, "learning_rate": 7.438333744511591e-06, "loss": 1.9243, "step": 2435 }, { "epoch": 0.76, "grad_norm": 4.1902899742126465, "learning_rate": 7.3490335112009225e-06, "loss": 1.8696, "step": 2440 }, { "epoch": 0.76, "grad_norm": 3.4848709106445312, "learning_rate": 7.260180107074438e-06, "loss": 2.0236, "step": 2445 }, { "epoch": 0.76, "grad_norm": 2.9219446182250977, "learning_rate": 7.171775781430712e-06, "loss": 1.9218, "step": 2450 }, { "epoch": 0.76, "grad_norm": 3.458622694015503, "learning_rate": 7.083822772200058e-06, "loss": 1.9155, "step": 2455 }, { "epoch": 0.76, "grad_norm": 3.5859556198120117, "learning_rate": 6.996323305887822e-06, "loss": 1.9701, "step": 2460 }, { "epoch": 0.76, "grad_norm": 3.7645373344421387, "learning_rate": 6.909279597518048e-06, "loss": 1.9555, "step": 2465 }, { "epoch": 0.77, "grad_norm": 5.934003829956055, "learning_rate": 6.822693850577385e-06, "loss": 1.9963, "step": 2470 }, { "epoch": 0.77, "grad_norm": 4.152750015258789, "learning_rate": 6.7365682569593496e-06, "loss": 1.8777, "step": 2475 }, { "epoch": 0.77, "grad_norm": 3.7498714923858643, "learning_rate": 6.6509049969087715e-06, "loss": 1.9313, "step": 2480 }, { "epoch": 0.77, "grad_norm": 2.86311411857605, "learning_rate": 6.565706238966671e-06, "loss": 1.7692, "step": 2485 }, { "epoch": 0.77, "grad_norm": 4.296627521514893, "learning_rate": 6.480974139915297e-06, "loss": 1.942, "step": 2490 }, { "epoch": 0.77, "grad_norm": 3.102341890335083, "learning_rate": 6.396710844723597e-06, "loss": 1.9011, "step": 2495 }, { "epoch": 0.78, "grad_norm": 4.467423439025879, "learning_rate": 6.312918486492855e-06, "loss": 1.8276, "step": 2500 }, { "epoch": 0.78, "grad_norm": 4.662038803100586, "learning_rate": 6.229599186402729e-06, "loss": 1.8927, "step": 2505 }, { "epoch": 0.78, "grad_norm": 6.194324493408203, "learning_rate": 6.146755053657541e-06, "loss": 1.8046, "step": 2510 }, { "epoch": 0.78, "grad_norm": 3.2271151542663574, "learning_rate": 6.064388185432898e-06, "loss": 1.7897, "step": 2515 }, { "epoch": 0.78, "grad_norm": 3.0152978897094727, "learning_rate": 5.9825006668225905e-06, "loss": 1.8203, "step": 2520 }, { "epoch": 0.78, "grad_norm": 3.5677027702331543, "learning_rate": 5.901094570785798e-06, "loss": 1.9312, "step": 2525 }, { "epoch": 0.79, "grad_norm": 3.464501142501831, "learning_rate": 5.820171958094628e-06, "loss": 1.9227, "step": 2530 }, { "epoch": 0.79, "grad_norm": 4.184050559997559, "learning_rate": 5.73973487728196e-06, "loss": 1.8542, "step": 2535 }, { "epoch": 0.79, "grad_norm": 3.7280945777893066, "learning_rate": 5.659785364589556e-06, "loss": 2.0387, "step": 2540 }, { "epoch": 0.79, "grad_norm": 3.863532543182373, "learning_rate": 5.580325443916526e-06, "loss": 1.8824, "step": 2545 }, { "epoch": 0.79, "grad_norm": 3.403118133544922, "learning_rate": 5.501357126768117e-06, "loss": 1.8999, "step": 2550 }, { "epoch": 0.79, "grad_norm": 3.203178644180298, "learning_rate": 5.422882412204766e-06, "loss": 2.0521, "step": 2555 }, { "epoch": 0.79, "grad_norm": 3.8374898433685303, "learning_rate": 5.344903286791494e-06, "loss": 1.8838, "step": 2560 }, { "epoch": 0.8, "grad_norm": 3.570945978164673, "learning_rate": 5.267421724547627e-06, "loss": 1.9615, "step": 2565 }, { "epoch": 0.8, "grad_norm": 6.397089004516602, "learning_rate": 5.1904396868968195e-06, "loss": 1.9624, "step": 2570 }, { "epoch": 0.8, "grad_norm": 3.234090805053711, "learning_rate": 5.113959122617412e-06, "loss": 1.9239, "step": 2575 }, { "epoch": 0.8, "grad_norm": 3.1682183742523193, "learning_rate": 5.037981967793076e-06, "loss": 1.8498, "step": 2580 }, { "epoch": 0.8, "grad_norm": 4.0839152336120605, "learning_rate": 4.9625101457638376e-06, "loss": 1.9856, "step": 2585 }, { "epoch": 0.8, "grad_norm": 3.629542589187622, "learning_rate": 4.887545567077337e-06, "loss": 1.8867, "step": 2590 }, { "epoch": 0.81, "grad_norm": 4.0674638748168945, "learning_rate": 4.8130901294405255e-06, "loss": 2.0402, "step": 2595 }, { "epoch": 0.81, "grad_norm": 3.093059539794922, "learning_rate": 4.739145717671572e-06, "loss": 1.9107, "step": 2600 }, { "epoch": 0.81, "grad_norm": 6.425740718841553, "learning_rate": 4.665714203652177e-06, "loss": 1.8893, "step": 2605 }, { "epoch": 0.81, "grad_norm": 3.764960765838623, "learning_rate": 4.592797446280178e-06, "loss": 1.8649, "step": 2610 }, { "epoch": 0.81, "grad_norm": 3.2027156352996826, "learning_rate": 4.520397291422501e-06, "loss": 1.991, "step": 2615 }, { "epoch": 0.81, "grad_norm": 4.535457134246826, "learning_rate": 4.448515571868434e-06, "loss": 1.8798, "step": 2620 }, { "epoch": 0.81, "grad_norm": 3.6848881244659424, "learning_rate": 4.3771541072832045e-06, "loss": 1.9349, "step": 2625 }, { "epoch": 0.82, "grad_norm": 3.817534923553467, "learning_rate": 4.306314704161937e-06, "loss": 1.8637, "step": 2630 }, { "epoch": 0.82, "grad_norm": 3.4655098915100098, "learning_rate": 4.23599915578394e-06, "loss": 1.8615, "step": 2635 }, { "epoch": 0.82, "grad_norm": 2.829066276550293, "learning_rate": 4.16620924216726e-06, "loss": 1.7928, "step": 2640 }, { "epoch": 0.82, "grad_norm": 4.525213241577148, "learning_rate": 4.096946730023662e-06, "loss": 1.903, "step": 2645 }, { "epoch": 0.82, "grad_norm": 3.8306119441986084, "learning_rate": 4.028213372713904e-06, "loss": 1.9473, "step": 2650 }, { "epoch": 0.82, "grad_norm": 4.448178768157959, "learning_rate": 3.960010910203319e-06, "loss": 1.959, "step": 2655 }, { "epoch": 0.83, "grad_norm": 3.6487441062927246, "learning_rate": 3.892341069017808e-06, "loss": 1.9932, "step": 2660 }, { "epoch": 0.83, "grad_norm": 3.487689256668091, "learning_rate": 3.825205562200101e-06, "loss": 1.9578, "step": 2665 }, { "epoch": 0.83, "grad_norm": 3.0234782695770264, "learning_rate": 3.75860608926642e-06, "loss": 1.9083, "step": 2670 }, { "epoch": 0.83, "grad_norm": 3.328275203704834, "learning_rate": 3.69254433616342e-06, "loss": 2.0128, "step": 2675 }, { "epoch": 0.83, "grad_norm": 2.9996497631073, "learning_rate": 3.627021975225553e-06, "loss": 1.633, "step": 2680 }, { "epoch": 0.83, "grad_norm": 3.9526045322418213, "learning_rate": 3.562040665132715e-06, "loss": 1.8948, "step": 2685 }, { "epoch": 0.83, "grad_norm": 4.027220249176025, "learning_rate": 3.4976020508682344e-06, "loss": 1.8918, "step": 2690 }, { "epoch": 0.84, "grad_norm": 4.6429829597473145, "learning_rate": 3.4337077636772547e-06, "loss": 1.8865, "step": 2695 }, { "epoch": 0.84, "grad_norm": 4.5367865562438965, "learning_rate": 3.3703594210254487e-06, "loss": 1.895, "step": 2700 }, { "epoch": 0.84, "grad_norm": 3.4687774181365967, "learning_rate": 3.3075586265580494e-06, "loss": 1.8908, "step": 2705 }, { "epoch": 0.84, "grad_norm": 4.654914855957031, "learning_rate": 3.24530697005925e-06, "loss": 1.7785, "step": 2710 }, { "epoch": 0.84, "grad_norm": 4.516482353210449, "learning_rate": 3.183606027411998e-06, "loss": 1.7936, "step": 2715 }, { "epoch": 0.84, "grad_norm": 4.209545135498047, "learning_rate": 3.1224573605580648e-06, "loss": 1.9851, "step": 2720 }, { "epoch": 0.85, "grad_norm": 4.1666178703308105, "learning_rate": 3.061862517458519e-06, "loss": 1.858, "step": 2725 }, { "epoch": 0.85, "grad_norm": 5.190033912658691, "learning_rate": 3.001823032054532e-06, "loss": 1.9802, "step": 2730 }, { "epoch": 0.85, "grad_norm": 4.3511528968811035, "learning_rate": 2.942340424228554e-06, "loss": 1.9403, "step": 2735 }, { "epoch": 0.85, "grad_norm": 4.630067348480225, "learning_rate": 2.8834161997658565e-06, "loss": 1.7726, "step": 2740 }, { "epoch": 0.85, "grad_norm": 3.705087184906006, "learning_rate": 2.825051850316371e-06, "loss": 1.8286, "step": 2745 }, { "epoch": 0.85, "grad_norm": 3.315842628479004, "learning_rate": 2.767248853356971e-06, "loss": 1.8397, "step": 2750 }, { "epoch": 0.85, "grad_norm": 5.60033655166626, "learning_rate": 2.710008672154035e-06, "loss": 1.994, "step": 2755 }, { "epoch": 0.86, "grad_norm": 4.465238571166992, "learning_rate": 2.65333275572644e-06, "loss": 1.9824, "step": 2760 }, { "epoch": 0.86, "grad_norm": 3.8040528297424316, "learning_rate": 2.5972225388088497e-06, "loss": 1.8507, "step": 2765 }, { "epoch": 0.86, "grad_norm": 3.2600059509277344, "learning_rate": 2.5416794418154035e-06, "loss": 1.992, "step": 2770 }, { "epoch": 0.86, "grad_norm": 4.9075703620910645, "learning_rate": 2.486704870803763e-06, "loss": 1.8189, "step": 2775 }, { "epoch": 0.86, "grad_norm": 4.047214508056641, "learning_rate": 2.432300217439526e-06, "loss": 1.9156, "step": 2780 }, { "epoch": 0.86, "grad_norm": 4.082090854644775, "learning_rate": 2.3784668589609814e-06, "loss": 1.8582, "step": 2785 }, { "epoch": 0.87, "grad_norm": 3.8980605602264404, "learning_rate": 2.3252061581442496e-06, "loss": 1.8418, "step": 2790 }, { "epoch": 0.87, "grad_norm": 4.5113372802734375, "learning_rate": 2.2725194632687795e-06, "loss": 1.8942, "step": 2795 }, { "epoch": 0.87, "grad_norm": 4.78348445892334, "learning_rate": 2.220408108083244e-06, "loss": 1.868, "step": 2800 }, { "epoch": 0.87, "grad_norm": 3.327033281326294, "learning_rate": 2.1688734117717295e-06, "loss": 1.9177, "step": 2805 }, { "epoch": 0.87, "grad_norm": 3.6453311443328857, "learning_rate": 2.117916678920384e-06, "loss": 1.8282, "step": 2810 }, { "epoch": 0.87, "grad_norm": 3.0697853565216064, "learning_rate": 2.0675391994843695e-06, "loss": 1.8374, "step": 2815 }, { "epoch": 0.88, "grad_norm": 3.6173019409179688, "learning_rate": 2.017742248755225e-06, "loss": 1.9797, "step": 2820 }, { "epoch": 0.88, "grad_norm": 3.858684539794922, "learning_rate": 1.9685270873285505e-06, "loss": 1.9083, "step": 2825 }, { "epoch": 0.88, "grad_norm": 3.6615593433380127, "learning_rate": 1.9198949610721273e-06, "loss": 2.0119, "step": 2830 }, { "epoch": 0.88, "grad_norm": 4.125614643096924, "learning_rate": 1.8718471010943623e-06, "loss": 1.8927, "step": 2835 }, { "epoch": 0.88, "grad_norm": 3.79669451713562, "learning_rate": 1.8243847237131406e-06, "loss": 1.8407, "step": 2840 }, { "epoch": 0.88, "grad_norm": 3.5093576908111572, "learning_rate": 1.7775090304250065e-06, "loss": 1.9293, "step": 2845 }, { "epoch": 0.88, "grad_norm": 3.6266543865203857, "learning_rate": 1.7312212078747781e-06, "loss": 1.6496, "step": 2850 }, { "epoch": 0.89, "grad_norm": 4.086301326751709, "learning_rate": 1.6855224278254812e-06, "loss": 1.9496, "step": 2855 }, { "epoch": 0.89, "grad_norm": 3.14742374420166, "learning_rate": 1.6404138471286966e-06, "loss": 1.8646, "step": 2860 }, { "epoch": 0.89, "grad_norm": 2.868939161300659, "learning_rate": 1.5958966076952992e-06, "loss": 1.9593, "step": 2865 }, { "epoch": 0.89, "grad_norm": 3.424562931060791, "learning_rate": 1.5519718364665009e-06, "loss": 1.7344, "step": 2870 }, { "epoch": 0.89, "grad_norm": 3.9741764068603516, "learning_rate": 1.5086406453853646e-06, "loss": 1.7876, "step": 2875 }, { "epoch": 0.89, "grad_norm": 4.209314346313477, "learning_rate": 1.4659041313686366e-06, "loss": 2.1263, "step": 2880 }, { "epoch": 0.9, "grad_norm": 4.095180034637451, "learning_rate": 1.4237633762789942e-06, "loss": 1.7563, "step": 2885 }, { "epoch": 0.9, "grad_norm": 4.4438066482543945, "learning_rate": 1.3822194468976284e-06, "loss": 1.8099, "step": 2890 }, { "epoch": 0.9, "grad_norm": 4.844168663024902, "learning_rate": 1.3412733948972688e-06, "loss": 1.8867, "step": 2895 }, { "epoch": 0.9, "grad_norm": 3.2806739807128906, "learning_rate": 1.300926256815546e-06, "loss": 1.9385, "step": 2900 }, { "epoch": 0.9, "grad_norm": 3.7914087772369385, "learning_rate": 1.2611790540287633e-06, "loss": 1.7425, "step": 2905 }, { "epoch": 0.9, "grad_norm": 4.138453960418701, "learning_rate": 1.2220327927260161e-06, "loss": 1.9172, "step": 2910 }, { "epoch": 0.9, "grad_norm": 3.3346848487854004, "learning_rate": 1.1834884638837613e-06, "loss": 1.9754, "step": 2915 }, { "epoch": 0.91, "grad_norm": 3.6204893589019775, "learning_rate": 1.1455470432406829e-06, "loss": 1.7101, "step": 2920 }, { "epoch": 0.91, "grad_norm": 4.972575664520264, "learning_rate": 1.108209491273035e-06, "loss": 1.8861, "step": 2925 }, { "epoch": 0.91, "grad_norm": 3.620809316635132, "learning_rate": 1.0714767531702973e-06, "loss": 1.8525, "step": 2930 }, { "epoch": 0.91, "grad_norm": 3.33205509185791, "learning_rate": 1.035349758811263e-06, "loss": 1.8453, "step": 2935 }, { "epoch": 0.91, "grad_norm": 3.7018685340881348, "learning_rate": 9.998294227404863e-07, "loss": 2.0806, "step": 2940 }, { "epoch": 0.91, "grad_norm": 4.9941864013671875, "learning_rate": 9.649166441451557e-07, "loss": 1.94, "step": 2945 }, { "epoch": 0.92, "grad_norm": 4.217085361480713, "learning_rate": 9.306123068323097e-07, "loss": 1.9168, "step": 2950 }, { "epoch": 0.92, "grad_norm": 3.2208547592163086, "learning_rate": 8.969172792064634e-07, "loss": 1.8819, "step": 2955 }, { "epoch": 0.92, "grad_norm": 3.9018375873565674, "learning_rate": 8.638324142476284e-07, "loss": 1.9311, "step": 2960 }, { "epoch": 0.92, "grad_norm": 3.776543140411377, "learning_rate": 8.313585494897385e-07, "loss": 1.762, "step": 2965 }, { "epoch": 0.92, "grad_norm": 6.1161603927612305, "learning_rate": 7.994965069994142e-07, "loss": 1.8604, "step": 2970 }, { "epoch": 0.92, "grad_norm": 3.6044158935546875, "learning_rate": 7.682470933551761e-07, "loss": 1.7736, "step": 2975 }, { "epoch": 0.92, "grad_norm": 4.38954496383667, "learning_rate": 7.376110996270281e-07, "loss": 1.9429, "step": 2980 }, { "epoch": 0.93, "grad_norm": 4.361955165863037, "learning_rate": 7.075893013564123e-07, "loss": 1.8157, "step": 2985 }, { "epoch": 0.93, "grad_norm": 3.799809217453003, "learning_rate": 6.781824585365915e-07, "loss": 1.9094, "step": 2990 }, { "epoch": 0.93, "grad_norm": 4.269566059112549, "learning_rate": 6.493913155934117e-07, "loss": 1.9207, "step": 2995 }, { "epoch": 0.93, "grad_norm": 4.451285362243652, "learning_rate": 6.212166013664422e-07, "loss": 1.6652, "step": 3000 }, { "epoch": 0.93, "grad_norm": 3.91097092628479, "learning_rate": 5.93659029090543e-07, "loss": 1.9185, "step": 3005 }, { "epoch": 0.93, "grad_norm": 3.952296257019043, "learning_rate": 5.667192963778017e-07, "loss": 1.7982, "step": 3010 }, { "epoch": 0.94, "grad_norm": 3.8603575229644775, "learning_rate": 5.403980851998669e-07, "loss": 1.8665, "step": 3015 }, { "epoch": 0.94, "grad_norm": 4.040564060211182, "learning_rate": 5.146960618706981e-07, "loss": 1.8744, "step": 3020 }, { "epoch": 0.94, "grad_norm": 3.266788959503174, "learning_rate": 4.896138770296876e-07, "loss": 1.8463, "step": 3025 }, { "epoch": 0.94, "grad_norm": 3.374309539794922, "learning_rate": 4.6515216562519615e-07, "loss": 1.8195, "step": 3030 }, { "epoch": 0.94, "grad_norm": 3.7271621227264404, "learning_rate": 4.41311546898468e-07, "loss": 1.788, "step": 3035 }, { "epoch": 0.94, "grad_norm": 3.1484320163726807, "learning_rate": 4.180926243679689e-07, "loss": 1.8316, "step": 3040 }, { "epoch": 0.94, "grad_norm": 3.443974256515503, "learning_rate": 3.954959858141066e-07, "loss": 1.9071, "step": 3045 }, { "epoch": 0.95, "grad_norm": 3.8171606063842773, "learning_rate": 3.735222032643426e-07, "loss": 2.1321, "step": 3050 }, { "epoch": 0.95, "grad_norm": 3.141526699066162, "learning_rate": 3.521718329787177e-07, "loss": 1.8597, "step": 3055 }, { "epoch": 0.95, "grad_norm": 3.848994255065918, "learning_rate": 3.314454154357688e-07, "loss": 1.9906, "step": 3060 }, { "epoch": 0.95, "grad_norm": 3.9238314628601074, "learning_rate": 3.1134347531884267e-07, "loss": 1.9433, "step": 3065 }, { "epoch": 0.95, "grad_norm": 4.169834136962891, "learning_rate": 2.9186652150282603e-07, "loss": 1.7679, "step": 3070 }, { "epoch": 0.95, "grad_norm": 6.12147331237793, "learning_rate": 2.7301504704125016e-07, "loss": 1.6556, "step": 3075 }, { "epoch": 0.96, "grad_norm": 3.5053157806396484, "learning_rate": 2.547895291538177e-07, "loss": 1.9142, "step": 3080 }, { "epoch": 0.96, "grad_norm": 4.274362087249756, "learning_rate": 2.371904292143151e-07, "loss": 1.8754, "step": 3085 }, { "epoch": 0.96, "grad_norm": 3.843151569366455, "learning_rate": 2.2021819273894127e-07, "loss": 1.7239, "step": 3090 }, { "epoch": 0.96, "grad_norm": 3.5693886280059814, "learning_rate": 2.0387324937502505e-07, "loss": 1.8063, "step": 3095 }, { "epoch": 0.96, "grad_norm": 4.155526161193848, "learning_rate": 1.8815601289014496e-07, "loss": 1.8008, "step": 3100 }, { "epoch": 0.96, "grad_norm": 4.957355499267578, "learning_rate": 1.730668811616598e-07, "loss": 1.9108, "step": 3105 }, { "epoch": 0.97, "grad_norm": 5.035935878753662, "learning_rate": 1.5860623616664184e-07, "loss": 2.0325, "step": 3110 }, { "epoch": 0.97, "grad_norm": 4.176791667938232, "learning_rate": 1.4477444397219542e-07, "loss": 1.8947, "step": 3115 }, { "epoch": 0.97, "grad_norm": 3.648829460144043, "learning_rate": 1.3157185472619516e-07, "loss": 1.8535, "step": 3120 }, { "epoch": 0.97, "grad_norm": 3.8320178985595703, "learning_rate": 1.1899880264842068e-07, "loss": 1.8678, "step": 3125 }, { "epoch": 0.97, "grad_norm": 3.046886682510376, "learning_rate": 1.0705560602210784e-07, "loss": 1.8263, "step": 3130 }, { "epoch": 0.97, "grad_norm": 5.341119766235352, "learning_rate": 9.574256718586639e-08, "loss": 1.9319, "step": 3135 }, { "epoch": 0.97, "grad_norm": 3.0084095001220703, "learning_rate": 8.505997252605258e-08, "loss": 1.7669, "step": 3140 }, { "epoch": 0.98, "grad_norm": 3.5134646892547607, "learning_rate": 7.500809246950569e-08, "loss": 1.824, "step": 3145 }, { "epoch": 0.98, "grad_norm": 3.576869249343872, "learning_rate": 6.558718147670339e-08, "loss": 1.8971, "step": 3150 }, { "epoch": 0.98, "grad_norm": 3.1408050060272217, "learning_rate": 5.679747803531699e-08, "loss": 1.9365, "step": 3155 }, { "epoch": 0.98, "grad_norm": 4.063467979431152, "learning_rate": 4.863920465418836e-08, "loss": 1.8272, "step": 3160 }, { "epoch": 0.98, "grad_norm": 3.66452693939209, "learning_rate": 4.111256785767903e-08, "loss": 1.7885, "step": 3165 }, { "epoch": 0.98, "grad_norm": 3.7975409030914307, "learning_rate": 3.421775818045481e-08, "loss": 1.879, "step": 3170 }, { "epoch": 0.99, "grad_norm": 4.497860908508301, "learning_rate": 2.7954950162656367e-08, "loss": 1.828, "step": 3175 }, { "epoch": 0.99, "grad_norm": 3.815382242202759, "learning_rate": 2.2324302345483327e-08, "loss": 1.9715, "step": 3180 }, { "epoch": 0.99, "grad_norm": 5.165794849395752, "learning_rate": 1.7325957267180782e-08, "loss": 1.8856, "step": 3185 }, { "epoch": 0.99, "grad_norm": 4.661296367645264, "learning_rate": 1.2960041459425532e-08, "loss": 1.9542, "step": 3190 }, { "epoch": 0.99, "grad_norm": 4.152047157287598, "learning_rate": 9.226665444136973e-09, "loss": 1.9453, "step": 3195 }, { "epoch": 0.99, "grad_norm": 3.161618232727051, "learning_rate": 6.1259237306599e-09, "loss": 1.7805, "step": 3200 } ], "logging_steps": 5, "max_steps": 3222, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 4.797270917531566e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }