{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.703940362087327, "eval_steps": 500, "global_step": 3800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002242026792220167, "grad_norm": 1.0706044435501099, "learning_rate": 9.999986217521373e-05, "loss": 2.3592, "step": 5 }, { "epoch": 0.004484053584440334, "grad_norm": 1.1121090650558472, "learning_rate": 9.999944870161475e-05, "loss": 2.2216, "step": 10 }, { "epoch": 0.006726080376660501, "grad_norm": 1.4128837585449219, "learning_rate": 9.999875958148252e-05, "loss": 2.0607, "step": 15 }, { "epoch": 0.008968107168880668, "grad_norm": 1.3161660432815552, "learning_rate": 9.99977948186162e-05, "loss": 1.7687, "step": 20 }, { "epoch": 0.011210133961100835, "grad_norm": 1.2509621381759644, "learning_rate": 9.999655441833445e-05, "loss": 1.8135, "step": 25 }, { "epoch": 0.013452160753321002, "grad_norm": 1.2357916831970215, "learning_rate": 9.999503838747563e-05, "loss": 1.6791, "step": 30 }, { "epoch": 0.01569418754554117, "grad_norm": 1.3020069599151611, "learning_rate": 9.999324673439762e-05, "loss": 1.5995, "step": 35 }, { "epoch": 0.017936214337761335, "grad_norm": 1.2886899709701538, "learning_rate": 9.999117946897775e-05, "loss": 1.5247, "step": 40 }, { "epoch": 0.020178241129981504, "grad_norm": 1.5385006666183472, "learning_rate": 9.998883660261285e-05, "loss": 1.4753, "step": 45 }, { "epoch": 0.02242026792220167, "grad_norm": 1.3205918073654175, "learning_rate": 9.998621814821914e-05, "loss": 1.5195, "step": 50 }, { "epoch": 0.02466229471442184, "grad_norm": 1.1984736919403076, "learning_rate": 9.99833241202321e-05, "loss": 1.5227, "step": 55 }, { "epoch": 0.026904321506642005, "grad_norm": 1.4019296169281006, "learning_rate": 9.998015453460651e-05, "loss": 1.4167, "step": 60 }, { "epoch": 0.02914634829886217, "grad_norm": 1.3315256834030151, "learning_rate": 9.997670940881627e-05, "loss": 1.4096, "step": 65 }, { "epoch": 0.03138837509108234, "grad_norm": 1.1818488836288452, "learning_rate": 9.99729887618543e-05, "loss": 1.4869, "step": 70 }, { "epoch": 0.033630401883302505, "grad_norm": 1.41718327999115, "learning_rate": 9.996899261423254e-05, "loss": 1.3926, "step": 75 }, { "epoch": 0.03587242867552267, "grad_norm": 1.1350888013839722, "learning_rate": 9.996472098798169e-05, "loss": 1.5258, "step": 80 }, { "epoch": 0.03811445546774284, "grad_norm": 1.2058793306350708, "learning_rate": 9.996017390665118e-05, "loss": 1.4866, "step": 85 }, { "epoch": 0.04035648225996301, "grad_norm": 1.3261053562164307, "learning_rate": 9.995535139530904e-05, "loss": 1.4103, "step": 90 }, { "epoch": 0.042598509052183174, "grad_norm": 1.4682806730270386, "learning_rate": 9.995025348054175e-05, "loss": 1.3903, "step": 95 }, { "epoch": 0.04484053584440334, "grad_norm": 1.4472965002059937, "learning_rate": 9.994488019045405e-05, "loss": 1.4353, "step": 100 }, { "epoch": 0.047082562636623505, "grad_norm": 1.1997061967849731, "learning_rate": 9.993923155466884e-05, "loss": 1.4072, "step": 105 }, { "epoch": 0.04932458942884368, "grad_norm": 1.140612006187439, "learning_rate": 9.993330760432703e-05, "loss": 1.4188, "step": 110 }, { "epoch": 0.05156661622106384, "grad_norm": 1.353933572769165, "learning_rate": 9.992710837208726e-05, "loss": 1.3812, "step": 115 }, { "epoch": 0.05380864301328401, "grad_norm": 1.6113260984420776, "learning_rate": 9.992063389212589e-05, "loss": 1.4144, "step": 120 }, { "epoch": 0.056050669805504175, "grad_norm": 1.1334792375564575, 
"learning_rate": 9.991388420013666e-05, "loss": 1.4366, "step": 125 }, { "epoch": 0.05829269659772434, "grad_norm": 1.274436593055725, "learning_rate": 9.990685933333054e-05, "loss": 1.3906, "step": 130 }, { "epoch": 0.06053472338994451, "grad_norm": 1.1813853979110718, "learning_rate": 9.98995593304356e-05, "loss": 1.4297, "step": 135 }, { "epoch": 0.06277675018216468, "grad_norm": 1.367594599723816, "learning_rate": 9.989198423169666e-05, "loss": 1.3987, "step": 140 }, { "epoch": 0.06501877697438484, "grad_norm": 1.1809221506118774, "learning_rate": 9.98841340788752e-05, "loss": 1.4149, "step": 145 }, { "epoch": 0.06726080376660501, "grad_norm": 1.1612411737442017, "learning_rate": 9.987600891524902e-05, "loss": 1.4063, "step": 150 }, { "epoch": 0.06950283055882518, "grad_norm": 1.2665232419967651, "learning_rate": 9.986760878561209e-05, "loss": 1.3765, "step": 155 }, { "epoch": 0.07174485735104534, "grad_norm": 1.1557029485702515, "learning_rate": 9.985893373627426e-05, "loss": 1.3657, "step": 160 }, { "epoch": 0.07398688414326551, "grad_norm": 1.2104707956314087, "learning_rate": 9.984998381506099e-05, "loss": 1.4443, "step": 165 }, { "epoch": 0.07622891093548569, "grad_norm": 1.7405874729156494, "learning_rate": 9.984075907131314e-05, "loss": 1.4025, "step": 170 }, { "epoch": 0.07847093772770584, "grad_norm": 1.366018295288086, "learning_rate": 9.983125955588662e-05, "loss": 1.3552, "step": 175 }, { "epoch": 0.08071296451992602, "grad_norm": 1.171950101852417, "learning_rate": 9.982148532115218e-05, "loss": 1.3902, "step": 180 }, { "epoch": 0.08295499131214618, "grad_norm": 1.1674119234085083, "learning_rate": 9.98114364209951e-05, "loss": 1.3777, "step": 185 }, { "epoch": 0.08519701810436635, "grad_norm": 1.069298505783081, "learning_rate": 9.980111291081488e-05, "loss": 1.4023, "step": 190 }, { "epoch": 0.08743904489658652, "grad_norm": 1.3680400848388672, "learning_rate": 9.979051484752496e-05, "loss": 1.3485, "step": 195 }, { "epoch": 0.08968107168880668, "grad_norm": 1.2846928834915161, "learning_rate": 9.977964228955232e-05, "loss": 1.4068, "step": 200 }, { "epoch": 0.09192309848102685, "grad_norm": 1.1173886060714722, "learning_rate": 9.976849529683734e-05, "loss": 1.4086, "step": 205 }, { "epoch": 0.09416512527324701, "grad_norm": 1.1485246419906616, "learning_rate": 9.975707393083327e-05, "loss": 1.3981, "step": 210 }, { "epoch": 0.09640715206546718, "grad_norm": 1.503753900527954, "learning_rate": 9.974537825450598e-05, "loss": 1.3903, "step": 215 }, { "epoch": 0.09864917885768736, "grad_norm": 1.1287413835525513, "learning_rate": 9.973340833233369e-05, "loss": 1.4314, "step": 220 }, { "epoch": 0.10089120564990751, "grad_norm": 1.2056329250335693, "learning_rate": 9.972116423030641e-05, "loss": 1.402, "step": 225 }, { "epoch": 0.10313323244212769, "grad_norm": 1.5625340938568115, "learning_rate": 9.970864601592583e-05, "loss": 1.3223, "step": 230 }, { "epoch": 0.10537525923434785, "grad_norm": 1.3235552310943604, "learning_rate": 9.969585375820474e-05, "loss": 1.3783, "step": 235 }, { "epoch": 0.10761728602656802, "grad_norm": 1.2155221700668335, "learning_rate": 9.968278752766672e-05, "loss": 1.422, "step": 240 }, { "epoch": 0.10985931281878819, "grad_norm": 1.470157504081726, "learning_rate": 9.966944739634581e-05, "loss": 1.4048, "step": 245 }, { "epoch": 0.11210133961100835, "grad_norm": 1.295973300933838, "learning_rate": 9.965583343778605e-05, "loss": 1.4016, "step": 250 }, { "epoch": 0.11434336640322852, "grad_norm": 1.306946873664856, "learning_rate": 
9.964194572704106e-05, "loss": 1.3679, "step": 255 }, { "epoch": 0.11658539319544868, "grad_norm": 1.2880349159240723, "learning_rate": 9.962778434067368e-05, "loss": 1.4271, "step": 260 }, { "epoch": 0.11882741998766885, "grad_norm": 1.165236473083496, "learning_rate": 9.96133493567555e-05, "loss": 1.4263, "step": 265 }, { "epoch": 0.12106944677988903, "grad_norm": 1.293133020401001, "learning_rate": 9.959864085486648e-05, "loss": 1.4395, "step": 270 }, { "epoch": 0.12331147357210918, "grad_norm": 1.165431022644043, "learning_rate": 9.958365891609444e-05, "loss": 1.3845, "step": 275 }, { "epoch": 0.12555350036432936, "grad_norm": 1.262797474861145, "learning_rate": 9.956840362303473e-05, "loss": 1.4234, "step": 280 }, { "epoch": 0.12779552715654952, "grad_norm": 1.3967214822769165, "learning_rate": 9.955287505978959e-05, "loss": 1.3922, "step": 285 }, { "epoch": 0.13003755394876967, "grad_norm": 1.1680519580841064, "learning_rate": 9.953707331196787e-05, "loss": 1.3595, "step": 290 }, { "epoch": 0.13227958074098986, "grad_norm": 1.2890777587890625, "learning_rate": 9.95209984666845e-05, "loss": 1.3109, "step": 295 }, { "epoch": 0.13452160753321002, "grad_norm": 1.3940659761428833, "learning_rate": 9.950465061255996e-05, "loss": 1.4059, "step": 300 }, { "epoch": 0.13676363432543018, "grad_norm": 1.1973896026611328, "learning_rate": 9.948802983971981e-05, "loss": 1.377, "step": 305 }, { "epoch": 0.13900566111765036, "grad_norm": 1.222718358039856, "learning_rate": 9.947113623979422e-05, "loss": 1.3703, "step": 310 }, { "epoch": 0.14124768790987052, "grad_norm": 1.3250412940979004, "learning_rate": 9.945396990591751e-05, "loss": 1.2982, "step": 315 }, { "epoch": 0.14348971470209068, "grad_norm": 1.1987338066101074, "learning_rate": 9.943653093272749e-05, "loss": 1.3408, "step": 320 }, { "epoch": 0.14573174149431087, "grad_norm": 1.0673400163650513, "learning_rate": 9.941881941636506e-05, "loss": 1.3754, "step": 325 }, { "epoch": 0.14797376828653103, "grad_norm": 1.1687766313552856, "learning_rate": 9.94008354544737e-05, "loss": 1.352, "step": 330 }, { "epoch": 0.15021579507875119, "grad_norm": 1.1456810235977173, "learning_rate": 9.938257914619882e-05, "loss": 1.3725, "step": 335 }, { "epoch": 0.15245782187097137, "grad_norm": 1.0828138589859009, "learning_rate": 9.936405059218728e-05, "loss": 1.3499, "step": 340 }, { "epoch": 0.15469984866319153, "grad_norm": 1.4693915843963623, "learning_rate": 9.934524989458684e-05, "loss": 1.3956, "step": 345 }, { "epoch": 0.1569418754554117, "grad_norm": 1.2608237266540527, "learning_rate": 9.932617715704562e-05, "loss": 1.3734, "step": 350 }, { "epoch": 0.15918390224763185, "grad_norm": 1.2090539932250977, "learning_rate": 9.930683248471142e-05, "loss": 1.4775, "step": 355 }, { "epoch": 0.16142592903985203, "grad_norm": 1.1096559762954712, "learning_rate": 9.928721598423125e-05, "loss": 1.3189, "step": 360 }, { "epoch": 0.1636679558320722, "grad_norm": 1.3460302352905273, "learning_rate": 9.926732776375073e-05, "loss": 1.3477, "step": 365 }, { "epoch": 0.16590998262429235, "grad_norm": 1.372318983078003, "learning_rate": 9.924716793291346e-05, "loss": 1.3753, "step": 370 }, { "epoch": 0.16815200941651254, "grad_norm": 1.4617116451263428, "learning_rate": 9.922673660286039e-05, "loss": 1.3927, "step": 375 }, { "epoch": 0.1703940362087327, "grad_norm": 1.1783477067947388, "learning_rate": 9.920603388622928e-05, "loss": 1.3698, "step": 380 }, { "epoch": 0.17263606300095286, "grad_norm": 1.087998628616333, "learning_rate": 9.918505989715403e-05, "loss": 
1.3449, "step": 385 }, { "epoch": 0.17487808979317304, "grad_norm": 1.1929972171783447, "learning_rate": 9.916381475126407e-05, "loss": 1.4253, "step": 390 }, { "epoch": 0.1771201165853932, "grad_norm": 1.3316450119018555, "learning_rate": 9.914229856568369e-05, "loss": 1.4232, "step": 395 }, { "epoch": 0.17936214337761336, "grad_norm": 1.0855283737182617, "learning_rate": 9.912051145903144e-05, "loss": 1.3813, "step": 400 }, { "epoch": 0.18160417016983352, "grad_norm": 1.1286154985427856, "learning_rate": 9.909845355141946e-05, "loss": 1.3733, "step": 405 }, { "epoch": 0.1838461969620537, "grad_norm": 1.1122159957885742, "learning_rate": 9.90761249644528e-05, "loss": 1.3546, "step": 410 }, { "epoch": 0.18608822375427386, "grad_norm": 1.0875800848007202, "learning_rate": 9.905352582122878e-05, "loss": 1.4501, "step": 415 }, { "epoch": 0.18833025054649402, "grad_norm": 1.130376935005188, "learning_rate": 9.903065624633628e-05, "loss": 1.404, "step": 420 }, { "epoch": 0.1905722773387142, "grad_norm": 1.267067790031433, "learning_rate": 9.900751636585506e-05, "loss": 1.3379, "step": 425 }, { "epoch": 0.19281430413093437, "grad_norm": 1.1137257814407349, "learning_rate": 9.898410630735509e-05, "loss": 1.3062, "step": 430 }, { "epoch": 0.19505633092315453, "grad_norm": 1.3096929788589478, "learning_rate": 9.896042619989581e-05, "loss": 1.4038, "step": 435 }, { "epoch": 0.1972983577153747, "grad_norm": 1.3187003135681152, "learning_rate": 9.893647617402548e-05, "loss": 1.3936, "step": 440 }, { "epoch": 0.19954038450759487, "grad_norm": 1.1851136684417725, "learning_rate": 9.891225636178037e-05, "loss": 1.3456, "step": 445 }, { "epoch": 0.20178241129981503, "grad_norm": 1.2681955099105835, "learning_rate": 9.88877668966841e-05, "loss": 1.3961, "step": 450 }, { "epoch": 0.2040244380920352, "grad_norm": 1.2412629127502441, "learning_rate": 9.886300791374688e-05, "loss": 1.3843, "step": 455 }, { "epoch": 0.20626646488425537, "grad_norm": 1.3303419351577759, "learning_rate": 9.883797954946476e-05, "loss": 1.3459, "step": 460 }, { "epoch": 0.20850849167647553, "grad_norm": 1.100720763206482, "learning_rate": 9.881268194181892e-05, "loss": 1.4156, "step": 465 }, { "epoch": 0.2107505184686957, "grad_norm": 1.0826263427734375, "learning_rate": 9.878711523027484e-05, "loss": 1.3297, "step": 470 }, { "epoch": 0.21299254526091588, "grad_norm": 1.4636311531066895, "learning_rate": 9.876127955578158e-05, "loss": 1.3662, "step": 475 }, { "epoch": 0.21523457205313604, "grad_norm": 1.1484990119934082, "learning_rate": 9.873517506077101e-05, "loss": 1.36, "step": 480 }, { "epoch": 0.2174765988453562, "grad_norm": 1.2333481311798096, "learning_rate": 9.870880188915698e-05, "loss": 1.3587, "step": 485 }, { "epoch": 0.21971862563757638, "grad_norm": 1.1720088720321655, "learning_rate": 9.868216018633456e-05, "loss": 1.2882, "step": 490 }, { "epoch": 0.22196065242979654, "grad_norm": 1.2749361991882324, "learning_rate": 9.865525009917921e-05, "loss": 1.3564, "step": 495 }, { "epoch": 0.2242026792220167, "grad_norm": 1.1952840089797974, "learning_rate": 9.862807177604602e-05, "loss": 1.3956, "step": 500 }, { "epoch": 0.22644470601423686, "grad_norm": 1.215401530265808, "learning_rate": 9.860062536676888e-05, "loss": 1.3836, "step": 505 }, { "epoch": 0.22868673280645704, "grad_norm": 1.2550543546676636, "learning_rate": 9.857291102265959e-05, "loss": 1.3626, "step": 510 }, { "epoch": 0.2309287595986772, "grad_norm": 1.1673667430877686, "learning_rate": 9.854492889650709e-05, "loss": 1.3601, "step": 515 }, { "epoch": 
0.23317078639089736, "grad_norm": 1.2657443284988403, "learning_rate": 9.851667914257661e-05, "loss": 1.3216, "step": 520 }, { "epoch": 0.23541281318311755, "grad_norm": 1.3874006271362305, "learning_rate": 9.848816191660878e-05, "loss": 1.3565, "step": 525 }, { "epoch": 0.2376548399753377, "grad_norm": 1.3701063394546509, "learning_rate": 9.845937737581885e-05, "loss": 1.3676, "step": 530 }, { "epoch": 0.23989686676755786, "grad_norm": 1.1547927856445312, "learning_rate": 9.843032567889572e-05, "loss": 1.3882, "step": 535 }, { "epoch": 0.24213889355977805, "grad_norm": 1.2529016733169556, "learning_rate": 9.840100698600118e-05, "loss": 1.4058, "step": 540 }, { "epoch": 0.2443809203519982, "grad_norm": 1.250368595123291, "learning_rate": 9.837142145876892e-05, "loss": 1.3544, "step": 545 }, { "epoch": 0.24662294714421837, "grad_norm": 1.0099977254867554, "learning_rate": 9.834156926030368e-05, "loss": 1.3435, "step": 550 }, { "epoch": 0.24886497393643853, "grad_norm": 1.169044017791748, "learning_rate": 9.831145055518039e-05, "loss": 1.3226, "step": 555 }, { "epoch": 0.2511070007286587, "grad_norm": 1.1605632305145264, "learning_rate": 9.828106550944322e-05, "loss": 1.3941, "step": 560 }, { "epoch": 0.2533490275208789, "grad_norm": 1.1079938411712646, "learning_rate": 9.825041429060466e-05, "loss": 1.3151, "step": 565 }, { "epoch": 0.25559105431309903, "grad_norm": 1.39505934715271, "learning_rate": 9.821949706764463e-05, "loss": 1.34, "step": 570 }, { "epoch": 0.2578330811053192, "grad_norm": 1.3362523317337036, "learning_rate": 9.81883140110095e-05, "loss": 1.3317, "step": 575 }, { "epoch": 0.26007510789753935, "grad_norm": 1.131722092628479, "learning_rate": 9.815686529261119e-05, "loss": 1.3044, "step": 580 }, { "epoch": 0.26231713468975953, "grad_norm": 1.227959394454956, "learning_rate": 9.812515108582622e-05, "loss": 1.3882, "step": 585 }, { "epoch": 0.2645591614819797, "grad_norm": 1.2343266010284424, "learning_rate": 9.809317156549476e-05, "loss": 1.3132, "step": 590 }, { "epoch": 0.26680118827419985, "grad_norm": 1.3055241107940674, "learning_rate": 9.806092690791962e-05, "loss": 1.3804, "step": 595 }, { "epoch": 0.26904321506642004, "grad_norm": 1.1219717264175415, "learning_rate": 9.80284172908653e-05, "loss": 1.4393, "step": 600 }, { "epoch": 0.2712852418586402, "grad_norm": 1.1093581914901733, "learning_rate": 9.799564289355707e-05, "loss": 1.4185, "step": 605 }, { "epoch": 0.27352726865086036, "grad_norm": 1.181302785873413, "learning_rate": 9.79626038966799e-05, "loss": 1.3762, "step": 610 }, { "epoch": 0.27576929544308054, "grad_norm": 1.238273024559021, "learning_rate": 9.79293004823775e-05, "loss": 1.4161, "step": 615 }, { "epoch": 0.27801132223530073, "grad_norm": 1.2279235124588013, "learning_rate": 9.789573283425126e-05, "loss": 1.3645, "step": 620 }, { "epoch": 0.28025334902752086, "grad_norm": 1.0937743186950684, "learning_rate": 9.78619011373594e-05, "loss": 1.374, "step": 625 }, { "epoch": 0.28249537581974105, "grad_norm": 1.0662868022918701, "learning_rate": 9.782780557821576e-05, "loss": 1.2941, "step": 630 }, { "epoch": 0.28473740261196123, "grad_norm": 1.2285066843032837, "learning_rate": 9.779344634478884e-05, "loss": 1.3532, "step": 635 }, { "epoch": 0.28697942940418136, "grad_norm": 1.336958646774292, "learning_rate": 9.775882362650083e-05, "loss": 1.3376, "step": 640 }, { "epoch": 0.28922145619640155, "grad_norm": 1.1303439140319824, "learning_rate": 9.772393761422645e-05, "loss": 1.3311, "step": 645 }, { "epoch": 0.29146348298862174, "grad_norm": 
1.155773401260376, "learning_rate": 9.768878850029201e-05, "loss": 1.3207, "step": 650 }, { "epoch": 0.29370550978084187, "grad_norm": 1.1738945245742798, "learning_rate": 9.765337647847429e-05, "loss": 1.3448, "step": 655 }, { "epoch": 0.29594753657306205, "grad_norm": 1.1708767414093018, "learning_rate": 9.761770174399943e-05, "loss": 1.4237, "step": 660 }, { "epoch": 0.29818956336528224, "grad_norm": 1.1160731315612793, "learning_rate": 9.758176449354194e-05, "loss": 1.3669, "step": 665 }, { "epoch": 0.30043159015750237, "grad_norm": 1.2477370500564575, "learning_rate": 9.754556492522359e-05, "loss": 1.3638, "step": 670 }, { "epoch": 0.30267361694972256, "grad_norm": 1.1834142208099365, "learning_rate": 9.750910323861228e-05, "loss": 1.3193, "step": 675 }, { "epoch": 0.30491564374194274, "grad_norm": 1.1948530673980713, "learning_rate": 9.747237963472098e-05, "loss": 1.3663, "step": 680 }, { "epoch": 0.3071576705341629, "grad_norm": 1.172042965888977, "learning_rate": 9.743539431600661e-05, "loss": 1.3777, "step": 685 }, { "epoch": 0.30939969732638306, "grad_norm": 1.276157259941101, "learning_rate": 9.739814748636891e-05, "loss": 1.3025, "step": 690 }, { "epoch": 0.3116417241186032, "grad_norm": 1.0595532655715942, "learning_rate": 9.736063935114934e-05, "loss": 1.3276, "step": 695 }, { "epoch": 0.3138837509108234, "grad_norm": 1.1211802959442139, "learning_rate": 9.732287011712992e-05, "loss": 1.3408, "step": 700 }, { "epoch": 0.31612577770304356, "grad_norm": 1.0752381086349487, "learning_rate": 9.72848399925321e-05, "loss": 1.3546, "step": 705 }, { "epoch": 0.3183678044952637, "grad_norm": 1.1245768070220947, "learning_rate": 9.724654918701568e-05, "loss": 1.3702, "step": 710 }, { "epoch": 0.3206098312874839, "grad_norm": 1.1155140399932861, "learning_rate": 9.720799791167749e-05, "loss": 1.4169, "step": 715 }, { "epoch": 0.32285185807970407, "grad_norm": 1.0726211071014404, "learning_rate": 9.716918637905041e-05, "loss": 1.3408, "step": 720 }, { "epoch": 0.3250938848719242, "grad_norm": 1.0829260349273682, "learning_rate": 9.713011480310208e-05, "loss": 1.3408, "step": 725 }, { "epoch": 0.3273359116641444, "grad_norm": 1.2374547719955444, "learning_rate": 9.709078339923377e-05, "loss": 1.4005, "step": 730 }, { "epoch": 0.32957793845636457, "grad_norm": 1.0573582649230957, "learning_rate": 9.705119238427915e-05, "loss": 1.3704, "step": 735 }, { "epoch": 0.3318199652485847, "grad_norm": 1.245229959487915, "learning_rate": 9.701134197650318e-05, "loss": 1.359, "step": 740 }, { "epoch": 0.3340619920408049, "grad_norm": 1.2916717529296875, "learning_rate": 9.697123239560081e-05, "loss": 1.3983, "step": 745 }, { "epoch": 0.3363040188330251, "grad_norm": 1.0935052633285522, "learning_rate": 9.693086386269581e-05, "loss": 1.2974, "step": 750 }, { "epoch": 0.3385460456252452, "grad_norm": 1.2657032012939453, "learning_rate": 9.689023660033956e-05, "loss": 1.4309, "step": 755 }, { "epoch": 0.3407880724174654, "grad_norm": 1.198128581047058, "learning_rate": 9.684935083250979e-05, "loss": 1.361, "step": 760 }, { "epoch": 0.3430300992096856, "grad_norm": 1.0504889488220215, "learning_rate": 9.680820678460941e-05, "loss": 1.3671, "step": 765 }, { "epoch": 0.3452721260019057, "grad_norm": 0.9644368290901184, "learning_rate": 9.676680468346521e-05, "loss": 1.3223, "step": 770 }, { "epoch": 0.3475141527941259, "grad_norm": 1.2863438129425049, "learning_rate": 9.672514475732659e-05, "loss": 1.2832, "step": 775 }, { "epoch": 0.3497561795863461, "grad_norm": 1.2852818965911865, "learning_rate": 
9.66832272358644e-05, "loss": 1.3586, "step": 780 }, { "epoch": 0.3519982063785662, "grad_norm": 1.088563323020935, "learning_rate": 9.664105235016955e-05, "loss": 1.3518, "step": 785 }, { "epoch": 0.3542402331707864, "grad_norm": 1.138024926185608, "learning_rate": 9.659862033275187e-05, "loss": 1.4139, "step": 790 }, { "epoch": 0.35648225996300653, "grad_norm": 1.1250396966934204, "learning_rate": 9.655593141753865e-05, "loss": 1.3991, "step": 795 }, { "epoch": 0.3587242867552267, "grad_norm": 1.3767707347869873, "learning_rate": 9.651298583987353e-05, "loss": 1.3365, "step": 800 }, { "epoch": 0.3609663135474469, "grad_norm": 1.0378060340881348, "learning_rate": 9.646978383651515e-05, "loss": 1.3198, "step": 805 }, { "epoch": 0.36320834033966704, "grad_norm": 1.138748049736023, "learning_rate": 9.642632564563576e-05, "loss": 1.3889, "step": 810 }, { "epoch": 0.3654503671318872, "grad_norm": 1.2029573917388916, "learning_rate": 9.638261150681998e-05, "loss": 1.3673, "step": 815 }, { "epoch": 0.3676923939241074, "grad_norm": 1.0248106718063354, "learning_rate": 9.63386416610635e-05, "loss": 1.3291, "step": 820 }, { "epoch": 0.36993442071632754, "grad_norm": 1.1774693727493286, "learning_rate": 9.62944163507717e-05, "loss": 1.3082, "step": 825 }, { "epoch": 0.3721764475085477, "grad_norm": 1.075829029083252, "learning_rate": 9.624993581975833e-05, "loss": 1.3391, "step": 830 }, { "epoch": 0.3744184743007679, "grad_norm": 1.2112139463424683, "learning_rate": 9.62052003132442e-05, "loss": 1.3169, "step": 835 }, { "epoch": 0.37666050109298804, "grad_norm": 1.2244364023208618, "learning_rate": 9.616021007785576e-05, "loss": 1.3267, "step": 840 }, { "epoch": 0.37890252788520823, "grad_norm": 1.0250012874603271, "learning_rate": 9.611496536162379e-05, "loss": 1.3162, "step": 845 }, { "epoch": 0.3811445546774284, "grad_norm": 1.1248130798339844, "learning_rate": 9.606946641398203e-05, "loss": 1.3244, "step": 850 }, { "epoch": 0.38338658146964855, "grad_norm": 1.1805521249771118, "learning_rate": 9.602371348576577e-05, "loss": 1.3474, "step": 855 }, { "epoch": 0.38562860826186873, "grad_norm": 1.129887342453003, "learning_rate": 9.597770682921055e-05, "loss": 1.317, "step": 860 }, { "epoch": 0.3878706350540889, "grad_norm": 1.2311817407608032, "learning_rate": 9.593144669795066e-05, "loss": 1.3399, "step": 865 }, { "epoch": 0.39011266184630905, "grad_norm": 1.1044973134994507, "learning_rate": 9.588493334701777e-05, "loss": 1.3829, "step": 870 }, { "epoch": 0.39235468863852924, "grad_norm": 1.142473816871643, "learning_rate": 9.583816703283965e-05, "loss": 1.3408, "step": 875 }, { "epoch": 0.3945967154307494, "grad_norm": 1.054578423500061, "learning_rate": 9.579114801323854e-05, "loss": 1.2964, "step": 880 }, { "epoch": 0.39683874222296955, "grad_norm": 1.1482200622558594, "learning_rate": 9.574387654742992e-05, "loss": 1.3412, "step": 885 }, { "epoch": 0.39908076901518974, "grad_norm": 1.0244128704071045, "learning_rate": 9.569635289602097e-05, "loss": 1.3605, "step": 890 }, { "epoch": 0.40132279580740987, "grad_norm": 0.9474136233329773, "learning_rate": 9.564857732100916e-05, "loss": 1.3675, "step": 895 }, { "epoch": 0.40356482259963006, "grad_norm": 1.0347918272018433, "learning_rate": 9.560055008578085e-05, "loss": 1.3696, "step": 900 }, { "epoch": 0.40580684939185024, "grad_norm": 1.2021688222885132, "learning_rate": 9.555227145510977e-05, "loss": 1.32, "step": 905 }, { "epoch": 0.4080488761840704, "grad_norm": 1.1408722400665283, "learning_rate": 9.550374169515557e-05, "loss": 1.3757, 
"step": 910 }, { "epoch": 0.41029090297629056, "grad_norm": 1.015257716178894, "learning_rate": 9.545496107346244e-05, "loss": 1.3332, "step": 915 }, { "epoch": 0.41253292976851075, "grad_norm": 1.5246819257736206, "learning_rate": 9.540592985895752e-05, "loss": 1.2848, "step": 920 }, { "epoch": 0.4147749565607309, "grad_norm": 1.1320191621780396, "learning_rate": 9.535664832194946e-05, "loss": 1.3324, "step": 925 }, { "epoch": 0.41701698335295107, "grad_norm": 1.169104814529419, "learning_rate": 9.530711673412698e-05, "loss": 1.3697, "step": 930 }, { "epoch": 0.41925901014517125, "grad_norm": 1.03293776512146, "learning_rate": 9.525733536855728e-05, "loss": 1.3582, "step": 935 }, { "epoch": 0.4215010369373914, "grad_norm": 1.3983210325241089, "learning_rate": 9.520730449968461e-05, "loss": 1.3631, "step": 940 }, { "epoch": 0.42374306372961157, "grad_norm": 1.2297945022583008, "learning_rate": 9.515702440332869e-05, "loss": 1.4169, "step": 945 }, { "epoch": 0.42598509052183176, "grad_norm": 1.3570704460144043, "learning_rate": 9.510649535668332e-05, "loss": 1.3588, "step": 950 }, { "epoch": 0.4282271173140519, "grad_norm": 1.1815954446792603, "learning_rate": 9.505571763831468e-05, "loss": 1.364, "step": 955 }, { "epoch": 0.4304691441062721, "grad_norm": 1.199096441268921, "learning_rate": 9.500469152815988e-05, "loss": 1.3813, "step": 960 }, { "epoch": 0.43271117089849226, "grad_norm": 1.0751597881317139, "learning_rate": 9.495341730752543e-05, "loss": 1.3479, "step": 965 }, { "epoch": 0.4349531976907124, "grad_norm": 1.121031641960144, "learning_rate": 9.490189525908569e-05, "loss": 1.2976, "step": 970 }, { "epoch": 0.4371952244829326, "grad_norm": 1.0710008144378662, "learning_rate": 9.485012566688127e-05, "loss": 1.33, "step": 975 }, { "epoch": 0.43943725127515276, "grad_norm": 1.1103382110595703, "learning_rate": 9.479810881631747e-05, "loss": 1.3194, "step": 980 }, { "epoch": 0.4416792780673729, "grad_norm": 1.1765540838241577, "learning_rate": 9.474584499416275e-05, "loss": 1.4135, "step": 985 }, { "epoch": 0.4439213048595931, "grad_norm": 1.1305935382843018, "learning_rate": 9.469333448854713e-05, "loss": 1.2884, "step": 990 }, { "epoch": 0.44616333165181327, "grad_norm": 1.0487785339355469, "learning_rate": 9.464057758896055e-05, "loss": 1.3547, "step": 995 }, { "epoch": 0.4484053584440334, "grad_norm": 1.071997046470642, "learning_rate": 9.458757458625138e-05, "loss": 1.3376, "step": 1000 }, { "epoch": 0.4506473852362536, "grad_norm": 1.2403199672698975, "learning_rate": 9.453432577262471e-05, "loss": 1.3056, "step": 1005 }, { "epoch": 0.4528894120284737, "grad_norm": 1.2904599905014038, "learning_rate": 9.448083144164077e-05, "loss": 1.3357, "step": 1010 }, { "epoch": 0.4551314388206939, "grad_norm": 0.9735843539237976, "learning_rate": 9.442709188821337e-05, "loss": 1.3731, "step": 1015 }, { "epoch": 0.4573734656129141, "grad_norm": 1.131100058555603, "learning_rate": 9.437310740860822e-05, "loss": 1.3478, "step": 1020 }, { "epoch": 0.4596154924051342, "grad_norm": 1.149906873703003, "learning_rate": 9.431887830044129e-05, "loss": 1.3439, "step": 1025 }, { "epoch": 0.4618575191973544, "grad_norm": 1.177563190460205, "learning_rate": 9.426440486267716e-05, "loss": 1.4656, "step": 1030 }, { "epoch": 0.4640995459895746, "grad_norm": 1.1288046836853027, "learning_rate": 9.420968739562744e-05, "loss": 1.4185, "step": 1035 }, { "epoch": 0.4663415727817947, "grad_norm": 1.2524133920669556, "learning_rate": 9.415472620094909e-05, "loss": 1.3401, "step": 1040 }, { "epoch": 
0.4685835995740149, "grad_norm": 1.212417721748352, "learning_rate": 9.409952158164266e-05, "loss": 1.3573, "step": 1045 }, { "epoch": 0.4708256263662351, "grad_norm": 1.0959070920944214, "learning_rate": 9.404407384205078e-05, "loss": 1.2674, "step": 1050 }, { "epoch": 0.4730676531584552, "grad_norm": 1.1945031881332397, "learning_rate": 9.398838328785635e-05, "loss": 1.3511, "step": 1055 }, { "epoch": 0.4753096799506754, "grad_norm": 1.1044509410858154, "learning_rate": 9.393245022608091e-05, "loss": 1.3917, "step": 1060 }, { "epoch": 0.4775517067428956, "grad_norm": 1.4578787088394165, "learning_rate": 9.387627496508298e-05, "loss": 1.3883, "step": 1065 }, { "epoch": 0.47979373353511573, "grad_norm": 1.1177469491958618, "learning_rate": 9.381985781455625e-05, "loss": 1.3079, "step": 1070 }, { "epoch": 0.4820357603273359, "grad_norm": 1.0329993963241577, "learning_rate": 9.376319908552803e-05, "loss": 1.3693, "step": 1075 }, { "epoch": 0.4842777871195561, "grad_norm": 1.0311007499694824, "learning_rate": 9.37062990903574e-05, "loss": 1.2942, "step": 1080 }, { "epoch": 0.48651981391177623, "grad_norm": 1.036125659942627, "learning_rate": 9.364915814273351e-05, "loss": 1.3083, "step": 1085 }, { "epoch": 0.4887618407039964, "grad_norm": 1.1864365339279175, "learning_rate": 9.359177655767396e-05, "loss": 1.3344, "step": 1090 }, { "epoch": 0.4910038674962166, "grad_norm": 1.2126179933547974, "learning_rate": 9.353415465152293e-05, "loss": 1.3113, "step": 1095 }, { "epoch": 0.49324589428843674, "grad_norm": 1.0252037048339844, "learning_rate": 9.34762927419495e-05, "loss": 1.3527, "step": 1100 }, { "epoch": 0.4954879210806569, "grad_norm": 1.058380126953125, "learning_rate": 9.341819114794584e-05, "loss": 1.327, "step": 1105 }, { "epoch": 0.49772994787287705, "grad_norm": 1.0073350667953491, "learning_rate": 9.335985018982559e-05, "loss": 1.3563, "step": 1110 }, { "epoch": 0.49997197466509724, "grad_norm": 1.0354520082473755, "learning_rate": 9.330127018922194e-05, "loss": 1.311, "step": 1115 }, { "epoch": 0.5022140014573174, "grad_norm": 1.4187575578689575, "learning_rate": 9.324245146908592e-05, "loss": 1.386, "step": 1120 }, { "epoch": 0.5044560282495376, "grad_norm": 1.1989063024520874, "learning_rate": 9.318339435368464e-05, "loss": 1.3826, "step": 1125 }, { "epoch": 0.5066980550417578, "grad_norm": 1.1496927738189697, "learning_rate": 9.312409916859948e-05, "loss": 1.3464, "step": 1130 }, { "epoch": 0.5089400818339779, "grad_norm": 1.0821688175201416, "learning_rate": 9.306456624072426e-05, "loss": 1.319, "step": 1135 }, { "epoch": 0.5111821086261981, "grad_norm": 0.9903674721717834, "learning_rate": 9.300479589826355e-05, "loss": 1.331, "step": 1140 }, { "epoch": 0.5134241354184182, "grad_norm": 1.0320252180099487, "learning_rate": 9.294478847073069e-05, "loss": 1.3697, "step": 1145 }, { "epoch": 0.5156661622106384, "grad_norm": 1.0023198127746582, "learning_rate": 9.288454428894615e-05, "loss": 1.2954, "step": 1150 }, { "epoch": 0.5179081890028586, "grad_norm": 0.9635931253433228, "learning_rate": 9.282406368503556e-05, "loss": 1.3488, "step": 1155 }, { "epoch": 0.5201502157950787, "grad_norm": 1.0810673236846924, "learning_rate": 9.276334699242799e-05, "loss": 1.3507, "step": 1160 }, { "epoch": 0.5223922425872989, "grad_norm": 1.1213270425796509, "learning_rate": 9.270239454585404e-05, "loss": 1.3535, "step": 1165 }, { "epoch": 0.5246342693795191, "grad_norm": 1.1882630586624146, "learning_rate": 9.264120668134405e-05, "loss": 1.3221, "step": 1170 }, { "epoch": 0.5268762961717393, 
"grad_norm": 1.1680420637130737, "learning_rate": 9.257978373622615e-05, "loss": 1.3585, "step": 1175 }, { "epoch": 0.5291183229639594, "grad_norm": 1.063761591911316, "learning_rate": 9.251812604912453e-05, "loss": 1.3171, "step": 1180 }, { "epoch": 0.5313603497561796, "grad_norm": 1.2708847522735596, "learning_rate": 9.245623395995751e-05, "loss": 1.3829, "step": 1185 }, { "epoch": 0.5336023765483997, "grad_norm": 1.1421536207199097, "learning_rate": 9.239410780993564e-05, "loss": 1.3211, "step": 1190 }, { "epoch": 0.5358444033406199, "grad_norm": 1.2646090984344482, "learning_rate": 9.233174794155985e-05, "loss": 1.3228, "step": 1195 }, { "epoch": 0.5380864301328401, "grad_norm": 1.1613190174102783, "learning_rate": 9.226915469861956e-05, "loss": 1.3229, "step": 1200 }, { "epoch": 0.5403284569250603, "grad_norm": 1.0214089155197144, "learning_rate": 9.220632842619079e-05, "loss": 1.3952, "step": 1205 }, { "epoch": 0.5425704837172804, "grad_norm": 1.172778844833374, "learning_rate": 9.214326947063423e-05, "loss": 1.3208, "step": 1210 }, { "epoch": 0.5448125105095006, "grad_norm": 1.251479983329773, "learning_rate": 9.207997817959338e-05, "loss": 1.3232, "step": 1215 }, { "epoch": 0.5470545373017207, "grad_norm": 1.2302333116531372, "learning_rate": 9.201645490199256e-05, "loss": 1.2792, "step": 1220 }, { "epoch": 0.5492965640939409, "grad_norm": 1.0342446565628052, "learning_rate": 9.195269998803507e-05, "loss": 1.3181, "step": 1225 }, { "epoch": 0.5515385908861611, "grad_norm": 0.9909287095069885, "learning_rate": 9.188871378920122e-05, "loss": 1.339, "step": 1230 }, { "epoch": 0.5537806176783813, "grad_norm": 1.1493330001831055, "learning_rate": 9.182449665824636e-05, "loss": 1.3659, "step": 1235 }, { "epoch": 0.5560226444706015, "grad_norm": 1.094141960144043, "learning_rate": 9.1760048949199e-05, "loss": 1.3464, "step": 1240 }, { "epoch": 0.5582646712628216, "grad_norm": 1.0574826002120972, "learning_rate": 9.169537101735879e-05, "loss": 1.2936, "step": 1245 }, { "epoch": 0.5605066980550417, "grad_norm": 0.9537421464920044, "learning_rate": 9.163046321929462e-05, "loss": 1.2573, "step": 1250 }, { "epoch": 0.5627487248472619, "grad_norm": 1.3234580755233765, "learning_rate": 9.156532591284263e-05, "loss": 1.3271, "step": 1255 }, { "epoch": 0.5649907516394821, "grad_norm": 1.078527808189392, "learning_rate": 9.149995945710423e-05, "loss": 1.3126, "step": 1260 }, { "epoch": 0.5672327784317023, "grad_norm": 1.2018640041351318, "learning_rate": 9.143436421244416e-05, "loss": 1.3642, "step": 1265 }, { "epoch": 0.5694748052239225, "grad_norm": 1.0188864469528198, "learning_rate": 9.136854054048838e-05, "loss": 1.2751, "step": 1270 }, { "epoch": 0.5717168320161425, "grad_norm": 1.2628931999206543, "learning_rate": 9.130248880412229e-05, "loss": 1.364, "step": 1275 }, { "epoch": 0.5739588588083627, "grad_norm": 1.1243770122528076, "learning_rate": 9.123620936748853e-05, "loss": 1.3668, "step": 1280 }, { "epoch": 0.5762008856005829, "grad_norm": 1.1701164245605469, "learning_rate": 9.116970259598505e-05, "loss": 1.3434, "step": 1285 }, { "epoch": 0.5784429123928031, "grad_norm": 1.0601651668548584, "learning_rate": 9.110296885626314e-05, "loss": 1.2645, "step": 1290 }, { "epoch": 0.5806849391850233, "grad_norm": 1.2184094190597534, "learning_rate": 9.103600851622531e-05, "loss": 1.3468, "step": 1295 }, { "epoch": 0.5829269659772435, "grad_norm": 1.5354876518249512, "learning_rate": 9.096882194502337e-05, "loss": 1.4595, "step": 1300 }, { "epoch": 0.5851689927694635, "grad_norm": 
1.0867820978164673, "learning_rate": 9.09014095130563e-05, "loss": 1.385, "step": 1305 }, { "epoch": 0.5874110195616837, "grad_norm": 1.2308603525161743, "learning_rate": 9.083377159196825e-05, "loss": 1.3503, "step": 1310 }, { "epoch": 0.5896530463539039, "grad_norm": 1.2136027812957764, "learning_rate": 9.07659085546465e-05, "loss": 1.3986, "step": 1315 }, { "epoch": 0.5918950731461241, "grad_norm": 0.9775259494781494, "learning_rate": 9.069782077521943e-05, "loss": 1.4075, "step": 1320 }, { "epoch": 0.5941370999383443, "grad_norm": 1.0732626914978027, "learning_rate": 9.062950862905432e-05, "loss": 1.3594, "step": 1325 }, { "epoch": 0.5963791267305645, "grad_norm": 0.9587686061859131, "learning_rate": 9.056097249275553e-05, "loss": 1.3741, "step": 1330 }, { "epoch": 0.5986211535227846, "grad_norm": 0.9384256601333618, "learning_rate": 9.049221274416213e-05, "loss": 1.3553, "step": 1335 }, { "epoch": 0.6008631803150047, "grad_norm": 1.0991201400756836, "learning_rate": 9.042322976234606e-05, "loss": 1.3533, "step": 1340 }, { "epoch": 0.6031052071072249, "grad_norm": 1.215849757194519, "learning_rate": 9.035402392760988e-05, "loss": 1.3747, "step": 1345 }, { "epoch": 0.6053472338994451, "grad_norm": 1.416882872581482, "learning_rate": 9.02845956214848e-05, "loss": 1.3554, "step": 1350 }, { "epoch": 0.6075892606916653, "grad_norm": 1.1282700300216675, "learning_rate": 9.021494522672845e-05, "loss": 1.3741, "step": 1355 }, { "epoch": 0.6098312874838855, "grad_norm": 0.9101713299751282, "learning_rate": 9.014507312732285e-05, "loss": 1.3012, "step": 1360 }, { "epoch": 0.6120733142761056, "grad_norm": 1.0538674592971802, "learning_rate": 9.007497970847234e-05, "loss": 1.3273, "step": 1365 }, { "epoch": 0.6143153410683257, "grad_norm": 1.3435333967208862, "learning_rate": 9.000466535660129e-05, "loss": 1.4025, "step": 1370 }, { "epoch": 0.6165573678605459, "grad_norm": 1.0372601747512817, "learning_rate": 8.993413045935215e-05, "loss": 1.3212, "step": 1375 }, { "epoch": 0.6187993946527661, "grad_norm": 1.0866960287094116, "learning_rate": 8.986337540558318e-05, "loss": 1.3799, "step": 1380 }, { "epoch": 0.6210414214449863, "grad_norm": 1.0121322870254517, "learning_rate": 8.97924005853664e-05, "loss": 1.3493, "step": 1385 }, { "epoch": 0.6232834482372064, "grad_norm": 1.071612000465393, "learning_rate": 8.972120638998539e-05, "loss": 1.3564, "step": 1390 }, { "epoch": 0.6255254750294266, "grad_norm": 1.103440523147583, "learning_rate": 8.964979321193314e-05, "loss": 1.2915, "step": 1395 }, { "epoch": 0.6277675018216468, "grad_norm": 0.9943517446517944, "learning_rate": 8.957816144490989e-05, "loss": 1.3424, "step": 1400 }, { "epoch": 0.6300095286138669, "grad_norm": 1.0576980113983154, "learning_rate": 8.950631148382095e-05, "loss": 1.3101, "step": 1405 }, { "epoch": 0.6322515554060871, "grad_norm": 1.0375151634216309, "learning_rate": 8.943424372477455e-05, "loss": 1.3308, "step": 1410 }, { "epoch": 0.6344935821983073, "grad_norm": 1.1026891469955444, "learning_rate": 8.936195856507962e-05, "loss": 1.3229, "step": 1415 }, { "epoch": 0.6367356089905274, "grad_norm": 1.1137725114822388, "learning_rate": 8.928945640324364e-05, "loss": 1.2864, "step": 1420 }, { "epoch": 0.6389776357827476, "grad_norm": 1.0674328804016113, "learning_rate": 8.921673763897041e-05, "loss": 1.339, "step": 1425 }, { "epoch": 0.6412196625749678, "grad_norm": 1.1279280185699463, "learning_rate": 8.914380267315782e-05, "loss": 1.3516, "step": 1430 }, { "epoch": 0.643461689367188, "grad_norm": 0.9942423105239868, 
"learning_rate": 8.907065190789577e-05, "loss": 1.3102, "step": 1435 }, { "epoch": 0.6457037161594081, "grad_norm": 1.1335337162017822, "learning_rate": 8.899728574646376e-05, "loss": 1.304, "step": 1440 }, { "epoch": 0.6479457429516283, "grad_norm": 1.0654945373535156, "learning_rate": 8.892370459332883e-05, "loss": 1.273, "step": 1445 }, { "epoch": 0.6501877697438484, "grad_norm": 1.0929509401321411, "learning_rate": 8.884990885414326e-05, "loss": 1.3298, "step": 1450 }, { "epoch": 0.6524297965360686, "grad_norm": 1.157837986946106, "learning_rate": 8.87758989357423e-05, "loss": 1.3395, "step": 1455 }, { "epoch": 0.6546718233282888, "grad_norm": 1.1370052099227905, "learning_rate": 8.8701675246142e-05, "loss": 1.3823, "step": 1460 }, { "epoch": 0.656913850120509, "grad_norm": 1.096897840499878, "learning_rate": 8.862723819453696e-05, "loss": 1.2579, "step": 1465 }, { "epoch": 0.6591558769127291, "grad_norm": 1.028351902961731, "learning_rate": 8.855258819129796e-05, "loss": 1.323, "step": 1470 }, { "epoch": 0.6613979037049492, "grad_norm": 1.2492655515670776, "learning_rate": 8.847772564796987e-05, "loss": 1.3316, "step": 1475 }, { "epoch": 0.6636399304971694, "grad_norm": 1.1350480318069458, "learning_rate": 8.840265097726923e-05, "loss": 1.3331, "step": 1480 }, { "epoch": 0.6658819572893896, "grad_norm": 1.057501196861267, "learning_rate": 8.832736459308207e-05, "loss": 1.3092, "step": 1485 }, { "epoch": 0.6681239840816098, "grad_norm": 0.9846299290657043, "learning_rate": 8.825186691046157e-05, "loss": 1.3709, "step": 1490 }, { "epoch": 0.67036601087383, "grad_norm": 1.2653725147247314, "learning_rate": 8.817615834562583e-05, "loss": 1.368, "step": 1495 }, { "epoch": 0.6726080376660502, "grad_norm": 0.9622915387153625, "learning_rate": 8.81002393159555e-05, "loss": 1.3952, "step": 1500 }, { "epoch": 0.6748500644582702, "grad_norm": 1.1680620908737183, "learning_rate": 8.802411023999153e-05, "loss": 1.378, "step": 1505 }, { "epoch": 0.6770920912504904, "grad_norm": 1.2336018085479736, "learning_rate": 8.79477715374329e-05, "loss": 1.3017, "step": 1510 }, { "epoch": 0.6793341180427106, "grad_norm": 1.0431910753250122, "learning_rate": 8.78712236291342e-05, "loss": 1.2801, "step": 1515 }, { "epoch": 0.6815761448349308, "grad_norm": 0.9812450408935547, "learning_rate": 8.779446693710341e-05, "loss": 1.4084, "step": 1520 }, { "epoch": 0.683818171627151, "grad_norm": 0.9624593257904053, "learning_rate": 8.771750188449951e-05, "loss": 1.324, "step": 1525 }, { "epoch": 0.6860601984193712, "grad_norm": 0.9403428435325623, "learning_rate": 8.764032889563017e-05, "loss": 1.3739, "step": 1530 }, { "epoch": 0.6883022252115912, "grad_norm": 1.0417587757110596, "learning_rate": 8.756294839594943e-05, "loss": 1.2942, "step": 1535 }, { "epoch": 0.6905442520038114, "grad_norm": 1.1011159420013428, "learning_rate": 8.74853608120553e-05, "loss": 1.391, "step": 1540 }, { "epoch": 0.6927862787960316, "grad_norm": 1.0298092365264893, "learning_rate": 8.74075665716875e-05, "loss": 1.2973, "step": 1545 }, { "epoch": 0.6950283055882518, "grad_norm": 1.156357765197754, "learning_rate": 8.732956610372499e-05, "loss": 1.2932, "step": 1550 }, { "epoch": 0.697270332380472, "grad_norm": 0.9823068976402283, "learning_rate": 8.725135983818369e-05, "loss": 1.3696, "step": 1555 }, { "epoch": 0.6995123591726922, "grad_norm": 1.0234986543655396, "learning_rate": 8.717294820621407e-05, "loss": 1.3504, "step": 1560 }, { "epoch": 0.7017543859649122, "grad_norm": 1.3021448850631714, "learning_rate": 8.70943316400988e-05, 
"loss": 1.3624, "step": 1565 }, { "epoch": 0.7039964127571324, "grad_norm": 1.166528582572937, "learning_rate": 8.70155105732503e-05, "loss": 1.3469, "step": 1570 }, { "epoch": 0.7062384395493526, "grad_norm": 1.2379478216171265, "learning_rate": 8.693648544020847e-05, "loss": 1.3586, "step": 1575 }, { "epoch": 0.7084804663415728, "grad_norm": 0.9946653842926025, "learning_rate": 8.68572566766382e-05, "loss": 1.3349, "step": 1580 }, { "epoch": 0.710722493133793, "grad_norm": 1.184866189956665, "learning_rate": 8.677782471932696e-05, "loss": 1.2896, "step": 1585 }, { "epoch": 0.7129645199260131, "grad_norm": 1.2160494327545166, "learning_rate": 8.669819000618246e-05, "loss": 1.3714, "step": 1590 }, { "epoch": 0.7152065467182332, "grad_norm": 1.096117615699768, "learning_rate": 8.66183529762302e-05, "loss": 1.3556, "step": 1595 }, { "epoch": 0.7174485735104534, "grad_norm": 0.9968474507331848, "learning_rate": 8.653831406961105e-05, "loss": 1.3476, "step": 1600 }, { "epoch": 0.7196906003026736, "grad_norm": 1.093274474143982, "learning_rate": 8.64580737275788e-05, "loss": 1.3187, "step": 1605 }, { "epoch": 0.7219326270948938, "grad_norm": 1.1728419065475464, "learning_rate": 8.637763239249777e-05, "loss": 1.3481, "step": 1610 }, { "epoch": 0.724174653887114, "grad_norm": 1.1466108560562134, "learning_rate": 8.629699050784038e-05, "loss": 1.3226, "step": 1615 }, { "epoch": 0.7264166806793341, "grad_norm": 1.0177853107452393, "learning_rate": 8.621614851818461e-05, "loss": 1.3065, "step": 1620 }, { "epoch": 0.7286587074715543, "grad_norm": 0.9964995384216309, "learning_rate": 8.61351068692117e-05, "loss": 1.3096, "step": 1625 }, { "epoch": 0.7309007342637744, "grad_norm": 0.9439290165901184, "learning_rate": 8.605386600770353e-05, "loss": 1.2842, "step": 1630 }, { "epoch": 0.7331427610559946, "grad_norm": 1.1577221155166626, "learning_rate": 8.59724263815403e-05, "loss": 1.3666, "step": 1635 }, { "epoch": 0.7353847878482148, "grad_norm": 1.0668253898620605, "learning_rate": 8.589078843969796e-05, "loss": 1.3084, "step": 1640 }, { "epoch": 0.737626814640435, "grad_norm": 1.0648199319839478, "learning_rate": 8.580895263224578e-05, "loss": 1.3817, "step": 1645 }, { "epoch": 0.7398688414326551, "grad_norm": 1.081084132194519, "learning_rate": 8.572691941034389e-05, "loss": 1.2961, "step": 1650 }, { "epoch": 0.7421108682248753, "grad_norm": 0.9493741393089294, "learning_rate": 8.564468922624073e-05, "loss": 1.3692, "step": 1655 }, { "epoch": 0.7443528950170955, "grad_norm": 1.156214952468872, "learning_rate": 8.556226253327059e-05, "loss": 1.3368, "step": 1660 }, { "epoch": 0.7465949218093156, "grad_norm": 1.098140001296997, "learning_rate": 8.547963978585114e-05, "loss": 1.3045, "step": 1665 }, { "epoch": 0.7488369486015358, "grad_norm": 1.1900348663330078, "learning_rate": 8.539682143948087e-05, "loss": 1.3388, "step": 1670 }, { "epoch": 0.751078975393756, "grad_norm": 1.0908799171447754, "learning_rate": 8.531380795073662e-05, "loss": 1.2893, "step": 1675 }, { "epoch": 0.7533210021859761, "grad_norm": 1.1332789659500122, "learning_rate": 8.523059977727103e-05, "loss": 1.278, "step": 1680 }, { "epoch": 0.7555630289781963, "grad_norm": 1.1658406257629395, "learning_rate": 8.514719737781008e-05, "loss": 1.38, "step": 1685 }, { "epoch": 0.7578050557704165, "grad_norm": 1.1062614917755127, "learning_rate": 8.506360121215045e-05, "loss": 1.2967, "step": 1690 }, { "epoch": 0.7600470825626366, "grad_norm": 1.1336619853973389, "learning_rate": 8.497981174115712e-05, "loss": 1.3339, "step": 1695 }, { 
"epoch": 0.7622891093548568, "grad_norm": 0.9592335820198059, "learning_rate": 8.48958294267607e-05, "loss": 1.3373, "step": 1700 }, { "epoch": 0.7645311361470769, "grad_norm": 1.2497416734695435, "learning_rate": 8.4811654731955e-05, "loss": 1.3679, "step": 1705 }, { "epoch": 0.7667731629392971, "grad_norm": 1.078972578048706, "learning_rate": 8.472728812079436e-05, "loss": 1.3833, "step": 1710 }, { "epoch": 0.7690151897315173, "grad_norm": 1.0341068506240845, "learning_rate": 8.464273005839119e-05, "loss": 1.304, "step": 1715 }, { "epoch": 0.7712572165237375, "grad_norm": 0.9276494383811951, "learning_rate": 8.455798101091338e-05, "loss": 1.3569, "step": 1720 }, { "epoch": 0.7734992433159577, "grad_norm": 1.232210397720337, "learning_rate": 8.447304144558171e-05, "loss": 1.3199, "step": 1725 }, { "epoch": 0.7757412701081778, "grad_norm": 1.031119704246521, "learning_rate": 8.438791183066728e-05, "loss": 1.3693, "step": 1730 }, { "epoch": 0.7779832969003979, "grad_norm": 0.9429606795310974, "learning_rate": 8.43025926354889e-05, "loss": 1.3712, "step": 1735 }, { "epoch": 0.7802253236926181, "grad_norm": 1.0232348442077637, "learning_rate": 8.421708433041058e-05, "loss": 1.2815, "step": 1740 }, { "epoch": 0.7824673504848383, "grad_norm": 1.1679573059082031, "learning_rate": 8.413138738683887e-05, "loss": 1.2576, "step": 1745 }, { "epoch": 0.7847093772770585, "grad_norm": 1.3956390619277954, "learning_rate": 8.40455022772203e-05, "loss": 1.3678, "step": 1750 }, { "epoch": 0.7869514040692787, "grad_norm": 1.1722822189331055, "learning_rate": 8.395942947503874e-05, "loss": 1.2261, "step": 1755 }, { "epoch": 0.7891934308614988, "grad_norm": 1.1038949489593506, "learning_rate": 8.38731694548128e-05, "loss": 1.3066, "step": 1760 }, { "epoch": 0.7914354576537189, "grad_norm": 1.0882680416107178, "learning_rate": 8.378672269209326e-05, "loss": 1.388, "step": 1765 }, { "epoch": 0.7936774844459391, "grad_norm": 0.9000134468078613, "learning_rate": 8.370008966346037e-05, "loss": 1.3099, "step": 1770 }, { "epoch": 0.7959195112381593, "grad_norm": 0.993665874004364, "learning_rate": 8.361327084652126e-05, "loss": 1.2892, "step": 1775 }, { "epoch": 0.7981615380303795, "grad_norm": 1.091774344444275, "learning_rate": 8.352626671990735e-05, "loss": 1.3601, "step": 1780 }, { "epoch": 0.8004035648225997, "grad_norm": 1.1141952276229858, "learning_rate": 8.343907776327164e-05, "loss": 1.3546, "step": 1785 }, { "epoch": 0.8026455916148197, "grad_norm": 0.9900937676429749, "learning_rate": 8.335170445728608e-05, "loss": 1.3254, "step": 1790 }, { "epoch": 0.8048876184070399, "grad_norm": 0.959354817867279, "learning_rate": 8.326414728363899e-05, "loss": 1.3446, "step": 1795 }, { "epoch": 0.8071296451992601, "grad_norm": 1.1290162801742554, "learning_rate": 8.317640672503231e-05, "loss": 1.3338, "step": 1800 }, { "epoch": 0.8093716719914803, "grad_norm": 0.9364314675331116, "learning_rate": 8.308848326517897e-05, "loss": 1.2879, "step": 1805 }, { "epoch": 0.8116136987837005, "grad_norm": 1.0674771070480347, "learning_rate": 8.300037738880029e-05, "loss": 1.3129, "step": 1810 }, { "epoch": 0.8138557255759207, "grad_norm": 1.0436745882034302, "learning_rate": 8.291208958162317e-05, "loss": 1.3547, "step": 1815 }, { "epoch": 0.8160977523681407, "grad_norm": 1.097528100013733, "learning_rate": 8.282362033037758e-05, "loss": 1.3301, "step": 1820 }, { "epoch": 0.8183397791603609, "grad_norm": 1.0497652292251587, "learning_rate": 8.273497012279371e-05, "loss": 1.2466, "step": 1825 }, { "epoch": 0.8205818059525811, 
"grad_norm": 1.011123776435852, "learning_rate": 8.264613944759943e-05, "loss": 1.3085, "step": 1830 }, { "epoch": 0.8228238327448013, "grad_norm": 1.0443741083145142, "learning_rate": 8.255712879451747e-05, "loss": 1.281, "step": 1835 }, { "epoch": 0.8250658595370215, "grad_norm": 1.2140185832977295, "learning_rate": 8.246793865426279e-05, "loss": 1.3612, "step": 1840 }, { "epoch": 0.8273078863292417, "grad_norm": 1.128836989402771, "learning_rate": 8.237856951853989e-05, "loss": 1.322, "step": 1845 }, { "epoch": 0.8295499131214618, "grad_norm": 1.0461573600769043, "learning_rate": 8.228902188004004e-05, "loss": 1.2147, "step": 1850 }, { "epoch": 0.8317919399136819, "grad_norm": 1.025303602218628, "learning_rate": 8.219929623243862e-05, "loss": 1.3644, "step": 1855 }, { "epoch": 0.8340339667059021, "grad_norm": 1.1245356798171997, "learning_rate": 8.210939307039234e-05, "loss": 1.2791, "step": 1860 }, { "epoch": 0.8362759934981223, "grad_norm": 1.0641727447509766, "learning_rate": 8.201931288953657e-05, "loss": 1.3585, "step": 1865 }, { "epoch": 0.8385180202903425, "grad_norm": 1.0719192028045654, "learning_rate": 8.19290561864826e-05, "loss": 1.3353, "step": 1870 }, { "epoch": 0.8407600470825627, "grad_norm": 1.0135860443115234, "learning_rate": 8.183862345881483e-05, "loss": 1.3111, "step": 1875 }, { "epoch": 0.8430020738747828, "grad_norm": 1.0956032276153564, "learning_rate": 8.174801520508813e-05, "loss": 1.3599, "step": 1880 }, { "epoch": 0.845244100667003, "grad_norm": 1.2083892822265625, "learning_rate": 8.165723192482502e-05, "loss": 1.2641, "step": 1885 }, { "epoch": 0.8474861274592231, "grad_norm": 1.0608189105987549, "learning_rate": 8.156627411851295e-05, "loss": 1.3246, "step": 1890 }, { "epoch": 0.8497281542514433, "grad_norm": 1.099736213684082, "learning_rate": 8.147514228760153e-05, "loss": 1.294, "step": 1895 }, { "epoch": 0.8519701810436635, "grad_norm": 1.0537753105163574, "learning_rate": 8.138383693449978e-05, "loss": 1.3138, "step": 1900 }, { "epoch": 0.8542122078358836, "grad_norm": 1.1678063869476318, "learning_rate": 8.12923585625733e-05, "loss": 1.3333, "step": 1905 }, { "epoch": 0.8564542346281038, "grad_norm": 1.0176467895507812, "learning_rate": 8.120070767614161e-05, "loss": 1.2475, "step": 1910 }, { "epoch": 0.858696261420324, "grad_norm": 1.0608762502670288, "learning_rate": 8.110888478047523e-05, "loss": 1.3255, "step": 1915 }, { "epoch": 0.8609382882125441, "grad_norm": 0.9746761918067932, "learning_rate": 8.101689038179299e-05, "loss": 1.2848, "step": 1920 }, { "epoch": 0.8631803150047643, "grad_norm": 1.1493169069290161, "learning_rate": 8.092472498725927e-05, "loss": 1.3407, "step": 1925 }, { "epoch": 0.8654223417969845, "grad_norm": 1.04042649269104, "learning_rate": 8.083238910498108e-05, "loss": 1.3759, "step": 1930 }, { "epoch": 0.8676643685892046, "grad_norm": 1.1784476041793823, "learning_rate": 8.073988324400535e-05, "loss": 1.3276, "step": 1935 }, { "epoch": 0.8699063953814248, "grad_norm": 1.0766850709915161, "learning_rate": 8.064720791431608e-05, "loss": 1.4061, "step": 1940 }, { "epoch": 0.872148422173645, "grad_norm": 1.1751985549926758, "learning_rate": 8.055436362683158e-05, "loss": 1.3455, "step": 1945 }, { "epoch": 0.8743904489658652, "grad_norm": 1.0339034795761108, "learning_rate": 8.046135089340164e-05, "loss": 1.3087, "step": 1950 }, { "epoch": 0.8766324757580853, "grad_norm": 1.1246895790100098, "learning_rate": 8.036817022680466e-05, "loss": 1.2804, "step": 1955 }, { "epoch": 0.8788745025503055, "grad_norm": 0.9990755319595337, 
"learning_rate": 8.027482214074482e-05, "loss": 1.3058, "step": 1960 }, { "epoch": 0.8811165293425256, "grad_norm": 1.0636389255523682, "learning_rate": 8.018130714984933e-05, "loss": 1.3505, "step": 1965 }, { "epoch": 0.8833585561347458, "grad_norm": 1.2202845811843872, "learning_rate": 8.008762576966557e-05, "loss": 1.3404, "step": 1970 }, { "epoch": 0.885600582926966, "grad_norm": 1.0653436183929443, "learning_rate": 7.999377851665817e-05, "loss": 1.3974, "step": 1975 }, { "epoch": 0.8878426097191862, "grad_norm": 1.3170489072799683, "learning_rate": 7.989976590820623e-05, "loss": 1.314, "step": 1980 }, { "epoch": 0.8900846365114063, "grad_norm": 1.0469021797180176, "learning_rate": 7.980558846260044e-05, "loss": 1.3115, "step": 1985 }, { "epoch": 0.8923266633036265, "grad_norm": 1.0016125440597534, "learning_rate": 7.971124669904029e-05, "loss": 1.2834, "step": 1990 }, { "epoch": 0.8945686900958466, "grad_norm": 1.1542069911956787, "learning_rate": 7.961674113763109e-05, "loss": 1.2743, "step": 1995 }, { "epoch": 0.8968107168880668, "grad_norm": 1.0665364265441895, "learning_rate": 7.952207229938119e-05, "loss": 1.3778, "step": 2000 }, { "epoch": 0.899052743680287, "grad_norm": 1.06927490234375, "learning_rate": 7.942724070619911e-05, "loss": 1.3158, "step": 2005 }, { "epoch": 0.9012947704725072, "grad_norm": 1.1074497699737549, "learning_rate": 7.933224688089059e-05, "loss": 1.3796, "step": 2010 }, { "epoch": 0.9035367972647274, "grad_norm": 0.9936386942863464, "learning_rate": 7.923709134715577e-05, "loss": 1.3099, "step": 2015 }, { "epoch": 0.9057788240569474, "grad_norm": 1.0144227743148804, "learning_rate": 7.914177462958631e-05, "loss": 1.3097, "step": 2020 }, { "epoch": 0.9080208508491676, "grad_norm": 1.1205965280532837, "learning_rate": 7.904629725366247e-05, "loss": 1.3218, "step": 2025 }, { "epoch": 0.9102628776413878, "grad_norm": 1.0762195587158203, "learning_rate": 7.895065974575017e-05, "loss": 1.3102, "step": 2030 }, { "epoch": 0.912504904433608, "grad_norm": 1.1134177446365356, "learning_rate": 7.885486263309823e-05, "loss": 1.2953, "step": 2035 }, { "epoch": 0.9147469312258282, "grad_norm": 1.171975016593933, "learning_rate": 7.875890644383525e-05, "loss": 1.3812, "step": 2040 }, { "epoch": 0.9169889580180484, "grad_norm": 1.035203456878662, "learning_rate": 7.866279170696693e-05, "loss": 1.3105, "step": 2045 }, { "epoch": 0.9192309848102684, "grad_norm": 0.9938043355941772, "learning_rate": 7.856651895237297e-05, "loss": 1.2807, "step": 2050 }, { "epoch": 0.9214730116024886, "grad_norm": 1.012306571006775, "learning_rate": 7.847008871080423e-05, "loss": 1.2452, "step": 2055 }, { "epoch": 0.9237150383947088, "grad_norm": 1.160154938697815, "learning_rate": 7.837350151387985e-05, "loss": 1.3946, "step": 2060 }, { "epoch": 0.925957065186929, "grad_norm": 1.1950114965438843, "learning_rate": 7.827675789408417e-05, "loss": 1.3793, "step": 2065 }, { "epoch": 0.9281990919791492, "grad_norm": 0.9952568411827087, "learning_rate": 7.817985838476398e-05, "loss": 1.3438, "step": 2070 }, { "epoch": 0.9304411187713694, "grad_norm": 0.9820153713226318, "learning_rate": 7.808280352012544e-05, "loss": 1.2817, "step": 2075 }, { "epoch": 0.9326831455635894, "grad_norm": 1.062547206878662, "learning_rate": 7.798559383523116e-05, "loss": 1.2524, "step": 2080 }, { "epoch": 0.9349251723558096, "grad_norm": 0.9531433582305908, "learning_rate": 7.788822986599733e-05, "loss": 1.3326, "step": 2085 }, { "epoch": 0.9371671991480298, "grad_norm": 0.9412059783935547, "learning_rate": 
7.779071214919066e-05, "loss": 1.3434, "step": 2090 }, { "epoch": 0.93940922594025, "grad_norm": 1.337913990020752, "learning_rate": 7.769304122242551e-05, "loss": 1.3211, "step": 2095 }, { "epoch": 0.9416512527324702, "grad_norm": 0.9646030068397522, "learning_rate": 7.759521762416084e-05, "loss": 1.2644, "step": 2100 }, { "epoch": 0.9438932795246903, "grad_norm": 1.146712303161621, "learning_rate": 7.749724189369735e-05, "loss": 1.3066, "step": 2105 }, { "epoch": 0.9461353063169105, "grad_norm": 0.9840266704559326, "learning_rate": 7.739911457117437e-05, "loss": 1.337, "step": 2110 }, { "epoch": 0.9483773331091306, "grad_norm": 1.027145504951477, "learning_rate": 7.730083619756698e-05, "loss": 1.3583, "step": 2115 }, { "epoch": 0.9506193599013508, "grad_norm": 0.9447183609008789, "learning_rate": 7.720240731468306e-05, "loss": 1.2966, "step": 2120 }, { "epoch": 0.952861386693571, "grad_norm": 0.9172132015228271, "learning_rate": 7.710382846516017e-05, "loss": 1.324, "step": 2125 }, { "epoch": 0.9551034134857912, "grad_norm": 1.004164218902588, "learning_rate": 7.700510019246266e-05, "loss": 1.3354, "step": 2130 }, { "epoch": 0.9573454402780113, "grad_norm": 1.1161928176879883, "learning_rate": 7.690622304087865e-05, "loss": 1.2743, "step": 2135 }, { "epoch": 0.9595874670702315, "grad_norm": 1.2197874784469604, "learning_rate": 7.680719755551707e-05, "loss": 1.2924, "step": 2140 }, { "epoch": 0.9618294938624516, "grad_norm": 1.1961028575897217, "learning_rate": 7.670802428230452e-05, "loss": 1.3233, "step": 2145 }, { "epoch": 0.9640715206546718, "grad_norm": 1.09461510181427, "learning_rate": 7.660870376798244e-05, "loss": 1.3149, "step": 2150 }, { "epoch": 0.966313547446892, "grad_norm": 1.1680549383163452, "learning_rate": 7.650923656010398e-05, "loss": 1.3106, "step": 2155 }, { "epoch": 0.9685555742391122, "grad_norm": 1.0645558834075928, "learning_rate": 7.6409623207031e-05, "loss": 1.2427, "step": 2160 }, { "epoch": 0.9707976010313323, "grad_norm": 1.3543119430541992, "learning_rate": 7.630986425793105e-05, "loss": 1.257, "step": 2165 }, { "epoch": 0.9730396278235525, "grad_norm": 0.9309380650520325, "learning_rate": 7.620996026277438e-05, "loss": 1.3291, "step": 2170 }, { "epoch": 0.9752816546157727, "grad_norm": 1.0483700037002563, "learning_rate": 7.610991177233085e-05, "loss": 1.3066, "step": 2175 }, { "epoch": 0.9775236814079928, "grad_norm": 1.029461145401001, "learning_rate": 7.600971933816695e-05, "loss": 1.3153, "step": 2180 }, { "epoch": 0.979765708200213, "grad_norm": 1.025608777999878, "learning_rate": 7.590938351264269e-05, "loss": 1.2595, "step": 2185 }, { "epoch": 0.9820077349924332, "grad_norm": 1.1784470081329346, "learning_rate": 7.580890484890864e-05, "loss": 1.3677, "step": 2190 }, { "epoch": 0.9842497617846533, "grad_norm": 1.0288585424423218, "learning_rate": 7.570828390090279e-05, "loss": 1.2931, "step": 2195 }, { "epoch": 0.9864917885768735, "grad_norm": 0.9635973572731018, "learning_rate": 7.560752122334757e-05, "loss": 1.2542, "step": 2200 }, { "epoch": 0.9887338153690937, "grad_norm": 1.0460883378982544, "learning_rate": 7.55066173717468e-05, "loss": 1.2744, "step": 2205 }, { "epoch": 0.9909758421613138, "grad_norm": 1.0957541465759277, "learning_rate": 7.54055729023825e-05, "loss": 1.3375, "step": 2210 }, { "epoch": 0.993217868953534, "grad_norm": 1.203940510749817, "learning_rate": 7.5304388372312e-05, "loss": 1.363, "step": 2215 }, { "epoch": 0.9954598957457541, "grad_norm": 1.2144309282302856, "learning_rate": 7.520306433936473e-05, "loss": 1.3041, 
"step": 2220 }, { "epoch": 0.9977019225379743, "grad_norm": 1.1100728511810303, "learning_rate": 7.510160136213921e-05, "loss": 1.2448, "step": 2225 }, { "epoch": 0.9999439493301945, "grad_norm": 1.0487066507339478, "learning_rate": 7.500000000000001e-05, "loss": 1.2796, "step": 2230 }, { "epoch": 1.0021859761224146, "grad_norm": 0.9101243615150452, "learning_rate": 7.489826081307452e-05, "loss": 1.2459, "step": 2235 }, { "epoch": 1.0044280029146349, "grad_norm": 0.9735124707221985, "learning_rate": 7.479638436225003e-05, "loss": 1.271, "step": 2240 }, { "epoch": 1.006670029706855, "grad_norm": 1.0015895366668701, "learning_rate": 7.469437120917054e-05, "loss": 1.331, "step": 2245 }, { "epoch": 1.0089120564990752, "grad_norm": 1.1906746625900269, "learning_rate": 7.459222191623369e-05, "loss": 1.2832, "step": 2250 }, { "epoch": 1.0111540832912953, "grad_norm": 1.022809386253357, "learning_rate": 7.448993704658766e-05, "loss": 1.2637, "step": 2255 }, { "epoch": 1.0133961100835156, "grad_norm": 1.0506726503372192, "learning_rate": 7.438751716412807e-05, "loss": 1.2623, "step": 2260 }, { "epoch": 1.0156381368757357, "grad_norm": 1.114424705505371, "learning_rate": 7.428496283349483e-05, "loss": 1.2747, "step": 2265 }, { "epoch": 1.0178801636679558, "grad_norm": 1.1833229064941406, "learning_rate": 7.418227462006912e-05, "loss": 1.387, "step": 2270 }, { "epoch": 1.020122190460176, "grad_norm": 1.137563943862915, "learning_rate": 7.407945308997017e-05, "loss": 1.3009, "step": 2275 }, { "epoch": 1.0223642172523961, "grad_norm": 1.0473971366882324, "learning_rate": 7.39764988100522e-05, "loss": 1.2309, "step": 2280 }, { "epoch": 1.0246062440446164, "grad_norm": 1.0773533582687378, "learning_rate": 7.387341234790124e-05, "loss": 1.2865, "step": 2285 }, { "epoch": 1.0268482708368365, "grad_norm": 1.1596111059188843, "learning_rate": 7.377019427183212e-05, "loss": 1.3355, "step": 2290 }, { "epoch": 1.0290902976290566, "grad_norm": 1.0251152515411377, "learning_rate": 7.366684515088521e-05, "loss": 1.3117, "step": 2295 }, { "epoch": 1.0313323244212769, "grad_norm": 1.039408802986145, "learning_rate": 7.356336555482332e-05, "loss": 1.3272, "step": 2300 }, { "epoch": 1.033574351213497, "grad_norm": 0.9818054437637329, "learning_rate": 7.345975605412855e-05, "loss": 1.3615, "step": 2305 }, { "epoch": 1.0358163780057172, "grad_norm": 1.0482890605926514, "learning_rate": 7.335601721999922e-05, "loss": 1.3027, "step": 2310 }, { "epoch": 1.0380584047979373, "grad_norm": 1.1090137958526611, "learning_rate": 7.325214962434665e-05, "loss": 1.2632, "step": 2315 }, { "epoch": 1.0403004315901576, "grad_norm": 1.005988597869873, "learning_rate": 7.314815383979198e-05, "loss": 1.2945, "step": 2320 }, { "epoch": 1.0425424583823777, "grad_norm": 1.1132372617721558, "learning_rate": 7.304403043966309e-05, "loss": 1.3651, "step": 2325 }, { "epoch": 1.0447844851745978, "grad_norm": 1.154373049736023, "learning_rate": 7.29397799979914e-05, "loss": 1.2766, "step": 2330 }, { "epoch": 1.047026511966818, "grad_norm": 1.1040149927139282, "learning_rate": 7.283540308950867e-05, "loss": 1.2856, "step": 2335 }, { "epoch": 1.0492685387590381, "grad_norm": 1.1813440322875977, "learning_rate": 7.273090028964396e-05, "loss": 1.2265, "step": 2340 }, { "epoch": 1.0515105655512584, "grad_norm": 1.099605679512024, "learning_rate": 7.262627217452027e-05, "loss": 1.2973, "step": 2345 }, { "epoch": 1.0537525923434785, "grad_norm": 1.2352324724197388, "learning_rate": 7.252151932095154e-05, "loss": 1.2729, "step": 2350 }, { "epoch": 
1.0559946191356986, "grad_norm": 1.2545338869094849, "learning_rate": 7.241664230643931e-05, "loss": 1.293, "step": 2355 }, { "epoch": 1.0582366459279189, "grad_norm": 1.1900233030319214, "learning_rate": 7.23116417091697e-05, "loss": 1.3372, "step": 2360 }, { "epoch": 1.060478672720139, "grad_norm": 1.1750991344451904, "learning_rate": 7.220651810801009e-05, "loss": 1.2848, "step": 2365 }, { "epoch": 1.0627206995123593, "grad_norm": 1.1401137113571167, "learning_rate": 7.210127208250599e-05, "loss": 1.2853, "step": 2370 }, { "epoch": 1.0649627263045793, "grad_norm": 1.0749046802520752, "learning_rate": 7.199590421287788e-05, "loss": 1.3066, "step": 2375 }, { "epoch": 1.0672047530967994, "grad_norm": 1.2057609558105469, "learning_rate": 7.189041508001786e-05, "loss": 1.3053, "step": 2380 }, { "epoch": 1.0694467798890197, "grad_norm": 1.0309621095657349, "learning_rate": 7.178480526548666e-05, "loss": 1.3314, "step": 2385 }, { "epoch": 1.0716888066812398, "grad_norm": 1.1735321283340454, "learning_rate": 7.167907535151027e-05, "loss": 1.2538, "step": 2390 }, { "epoch": 1.07393083347346, "grad_norm": 1.0819196701049805, "learning_rate": 7.157322592097682e-05, "loss": 1.3022, "step": 2395 }, { "epoch": 1.0761728602656802, "grad_norm": 1.0149192810058594, "learning_rate": 7.146725755743329e-05, "loss": 1.3713, "step": 2400 }, { "epoch": 1.0784148870579005, "grad_norm": 0.8954042196273804, "learning_rate": 7.136117084508237e-05, "loss": 1.2962, "step": 2405 }, { "epoch": 1.0806569138501205, "grad_norm": 1.0265322923660278, "learning_rate": 7.125496636877922e-05, "loss": 1.3084, "step": 2410 }, { "epoch": 1.0828989406423406, "grad_norm": 1.0515128374099731, "learning_rate": 7.114864471402818e-05, "loss": 1.2758, "step": 2415 }, { "epoch": 1.085140967434561, "grad_norm": 1.1725807189941406, "learning_rate": 7.104220646697962e-05, "loss": 1.3046, "step": 2420 }, { "epoch": 1.087382994226781, "grad_norm": 1.1021701097488403, "learning_rate": 7.093565221442672e-05, "loss": 1.2635, "step": 2425 }, { "epoch": 1.0896250210190013, "grad_norm": 1.1437387466430664, "learning_rate": 7.082898254380214e-05, "loss": 1.323, "step": 2430 }, { "epoch": 1.0918670478112213, "grad_norm": 0.998076856136322, "learning_rate": 7.072219804317488e-05, "loss": 1.1992, "step": 2435 }, { "epoch": 1.0941090746034414, "grad_norm": 1.0903971195220947, "learning_rate": 7.061529930124695e-05, "loss": 1.2515, "step": 2440 }, { "epoch": 1.0963511013956617, "grad_norm": 1.143904209136963, "learning_rate": 7.050828690735022e-05, "loss": 1.286, "step": 2445 }, { "epoch": 1.0985931281878818, "grad_norm": 1.1476929187774658, "learning_rate": 7.040116145144311e-05, "loss": 1.2324, "step": 2450 }, { "epoch": 1.100835154980102, "grad_norm": 1.0371499061584473, "learning_rate": 7.029392352410733e-05, "loss": 1.2511, "step": 2455 }, { "epoch": 1.1030771817723222, "grad_norm": 1.069429636001587, "learning_rate": 7.018657371654464e-05, "loss": 1.3456, "step": 2460 }, { "epoch": 1.1053192085645422, "grad_norm": 0.9130118489265442, "learning_rate": 7.007911262057365e-05, "loss": 1.3043, "step": 2465 }, { "epoch": 1.1075612353567625, "grad_norm": 1.152266502380371, "learning_rate": 6.997154082862644e-05, "loss": 1.2775, "step": 2470 }, { "epoch": 1.1098032621489826, "grad_norm": 1.117077112197876, "learning_rate": 6.986385893374537e-05, "loss": 1.315, "step": 2475 }, { "epoch": 1.112045288941203, "grad_norm": 1.4062610864639282, "learning_rate": 6.975606752957984e-05, "loss": 1.2661, "step": 2480 }, { "epoch": 1.114287315733423, "grad_norm": 
1.1715933084487915, "learning_rate": 6.96481672103829e-05, "loss": 1.3384, "step": 2485 }, { "epoch": 1.1165293425256433, "grad_norm": 0.937059760093689, "learning_rate": 6.95401585710081e-05, "loss": 1.2838, "step": 2490 }, { "epoch": 1.1187713693178634, "grad_norm": 1.0344353914260864, "learning_rate": 6.943204220690616e-05, "loss": 1.2396, "step": 2495 }, { "epoch": 1.1210133961100834, "grad_norm": 1.1878572702407837, "learning_rate": 6.932381871412167e-05, "loss": 1.329, "step": 2500 }, { "epoch": 1.1232554229023037, "grad_norm": 1.155254602432251, "learning_rate": 6.92154886892898e-05, "loss": 1.2652, "step": 2505 }, { "epoch": 1.1254974496945238, "grad_norm": 1.0120606422424316, "learning_rate": 6.910705272963307e-05, "loss": 1.2904, "step": 2510 }, { "epoch": 1.127739476486744, "grad_norm": 1.284738540649414, "learning_rate": 6.899851143295799e-05, "loss": 1.236, "step": 2515 }, { "epoch": 1.1299815032789642, "grad_norm": 1.1446951627731323, "learning_rate": 6.888986539765181e-05, "loss": 1.3456, "step": 2520 }, { "epoch": 1.1322235300711843, "grad_norm": 1.183556079864502, "learning_rate": 6.878111522267917e-05, "loss": 1.3006, "step": 2525 }, { "epoch": 1.1344655568634046, "grad_norm": 1.1089967489242554, "learning_rate": 6.867226150757888e-05, "loss": 1.3098, "step": 2530 }, { "epoch": 1.1367075836556246, "grad_norm": 1.1036224365234375, "learning_rate": 6.856330485246054e-05, "loss": 1.2543, "step": 2535 }, { "epoch": 1.138949610447845, "grad_norm": 1.2652587890625, "learning_rate": 6.845424585800123e-05, "loss": 1.2941, "step": 2540 }, { "epoch": 1.141191637240065, "grad_norm": 1.0114392042160034, "learning_rate": 6.834508512544228e-05, "loss": 1.306, "step": 2545 }, { "epoch": 1.143433664032285, "grad_norm": 1.0309230089187622, "learning_rate": 6.823582325658588e-05, "loss": 1.2697, "step": 2550 }, { "epoch": 1.1456756908245054, "grad_norm": 1.490627408027649, "learning_rate": 6.812646085379178e-05, "loss": 1.2784, "step": 2555 }, { "epoch": 1.1479177176167255, "grad_norm": 1.1522798538208008, "learning_rate": 6.801699851997393e-05, "loss": 1.2499, "step": 2560 }, { "epoch": 1.1501597444089458, "grad_norm": 1.0427577495574951, "learning_rate": 6.790743685859728e-05, "loss": 1.2711, "step": 2565 }, { "epoch": 1.1524017712011658, "grad_norm": 1.0645527839660645, "learning_rate": 6.779777647367434e-05, "loss": 1.2498, "step": 2570 }, { "epoch": 1.1546437979933861, "grad_norm": 1.0170249938964844, "learning_rate": 6.768801796976183e-05, "loss": 1.2622, "step": 2575 }, { "epoch": 1.1568858247856062, "grad_norm": 1.1332886219024658, "learning_rate": 6.75781619519575e-05, "loss": 1.3146, "step": 2580 }, { "epoch": 1.1591278515778263, "grad_norm": 1.1379398107528687, "learning_rate": 6.746820902589659e-05, "loss": 1.2898, "step": 2585 }, { "epoch": 1.1613698783700466, "grad_norm": 1.0116194486618042, "learning_rate": 6.735815979774866e-05, "loss": 1.3308, "step": 2590 }, { "epoch": 1.1636119051622666, "grad_norm": 0.9840161204338074, "learning_rate": 6.724801487421416e-05, "loss": 1.2739, "step": 2595 }, { "epoch": 1.165853931954487, "grad_norm": 1.3689374923706055, "learning_rate": 6.713777486252113e-05, "loss": 1.273, "step": 2600 }, { "epoch": 1.168095958746707, "grad_norm": 1.1147258281707764, "learning_rate": 6.702744037042179e-05, "loss": 1.3653, "step": 2605 }, { "epoch": 1.170337985538927, "grad_norm": 1.0359976291656494, "learning_rate": 6.691701200618925e-05, "loss": 1.2928, "step": 2610 }, { "epoch": 1.1725800123311474, "grad_norm": 1.0808576345443726, "learning_rate": 
6.680649037861416e-05, "loss": 1.2834, "step": 2615 }, { "epoch": 1.1748220391233675, "grad_norm": 1.2251567840576172, "learning_rate": 6.669587609700129e-05, "loss": 1.206, "step": 2620 }, { "epoch": 1.1770640659155878, "grad_norm": 1.0829846858978271, "learning_rate": 6.658516977116623e-05, "loss": 1.2292, "step": 2625 }, { "epoch": 1.1793060927078078, "grad_norm": 1.1904149055480957, "learning_rate": 6.647437201143201e-05, "loss": 1.275, "step": 2630 }, { "epoch": 1.181548119500028, "grad_norm": 1.223581314086914, "learning_rate": 6.636348342862575e-05, "loss": 1.2954, "step": 2635 }, { "epoch": 1.1837901462922482, "grad_norm": 1.1710941791534424, "learning_rate": 6.625250463407522e-05, "loss": 1.2927, "step": 2640 }, { "epoch": 1.1860321730844683, "grad_norm": 1.0496562719345093, "learning_rate": 6.61414362396056e-05, "loss": 1.2966, "step": 2645 }, { "epoch": 1.1882741998766886, "grad_norm": 1.0458779335021973, "learning_rate": 6.603027885753598e-05, "loss": 1.3081, "step": 2650 }, { "epoch": 1.1905162266689087, "grad_norm": 1.2921910285949707, "learning_rate": 6.591903310067608e-05, "loss": 1.2511, "step": 2655 }, { "epoch": 1.192758253461129, "grad_norm": 1.0614089965820312, "learning_rate": 6.580769958232279e-05, "loss": 1.2995, "step": 2660 }, { "epoch": 1.195000280253349, "grad_norm": 1.2062052488327026, "learning_rate": 6.569627891625683e-05, "loss": 1.3231, "step": 2665 }, { "epoch": 1.197242307045569, "grad_norm": 1.065064549446106, "learning_rate": 6.558477171673941e-05, "loss": 1.3189, "step": 2670 }, { "epoch": 1.1994843338377894, "grad_norm": 1.0669735670089722, "learning_rate": 6.547317859850875e-05, "loss": 1.3024, "step": 2675 }, { "epoch": 1.2017263606300095, "grad_norm": 1.1397708654403687, "learning_rate": 6.536150017677675e-05, "loss": 1.342, "step": 2680 }, { "epoch": 1.2039683874222298, "grad_norm": 1.1043004989624023, "learning_rate": 6.524973706722562e-05, "loss": 1.3442, "step": 2685 }, { "epoch": 1.2062104142144499, "grad_norm": 1.1043583154678345, "learning_rate": 6.513788988600441e-05, "loss": 1.2344, "step": 2690 }, { "epoch": 1.2084524410066702, "grad_norm": 1.1633187532424927, "learning_rate": 6.502595924972565e-05, "loss": 1.3185, "step": 2695 }, { "epoch": 1.2106944677988902, "grad_norm": 1.2432576417922974, "learning_rate": 6.491394577546204e-05, "loss": 1.2941, "step": 2700 }, { "epoch": 1.2129364945911103, "grad_norm": 1.0130048990249634, "learning_rate": 6.480185008074284e-05, "loss": 1.2495, "step": 2705 }, { "epoch": 1.2151785213833306, "grad_norm": 1.1565743684768677, "learning_rate": 6.468967278355072e-05, "loss": 1.2585, "step": 2710 }, { "epoch": 1.2174205481755507, "grad_norm": 0.9963768124580383, "learning_rate": 6.457741450231812e-05, "loss": 1.3497, "step": 2715 }, { "epoch": 1.2196625749677708, "grad_norm": 1.1197139024734497, "learning_rate": 6.446507585592399e-05, "loss": 1.2958, "step": 2720 }, { "epoch": 1.221904601759991, "grad_norm": 1.1450271606445312, "learning_rate": 6.435265746369033e-05, "loss": 1.3259, "step": 2725 }, { "epoch": 1.2241466285522111, "grad_norm": 1.0894269943237305, "learning_rate": 6.424015994537877e-05, "loss": 1.272, "step": 2730 }, { "epoch": 1.2263886553444314, "grad_norm": 1.1631505489349365, "learning_rate": 6.412758392118718e-05, "loss": 1.3315, "step": 2735 }, { "epoch": 1.2286306821366515, "grad_norm": 1.213643193244934, "learning_rate": 6.40149300117462e-05, "loss": 1.3228, "step": 2740 }, { "epoch": 1.2308727089288718, "grad_norm": 1.0162944793701172, "learning_rate": 6.390219883811591e-05, "loss": 
1.2519, "step": 2745 }, { "epoch": 1.2331147357210919, "grad_norm": 1.1782135963439941, "learning_rate": 6.378939102178225e-05, "loss": 1.3281, "step": 2750 }, { "epoch": 1.235356762513312, "grad_norm": 1.062117576599121, "learning_rate": 6.367650718465379e-05, "loss": 1.2671, "step": 2755 }, { "epoch": 1.2375987893055322, "grad_norm": 1.3144171237945557, "learning_rate": 6.356354794905814e-05, "loss": 1.3392, "step": 2760 }, { "epoch": 1.2398408160977523, "grad_norm": 1.0592882633209229, "learning_rate": 6.345051393773861e-05, "loss": 1.2902, "step": 2765 }, { "epoch": 1.2420828428899726, "grad_norm": 1.2294663190841675, "learning_rate": 6.333740577385074e-05, "loss": 1.3081, "step": 2770 }, { "epoch": 1.2443248696821927, "grad_norm": 1.0388215780258179, "learning_rate": 6.322422408095886e-05, "loss": 1.2917, "step": 2775 }, { "epoch": 1.246566896474413, "grad_norm": 1.094425916671753, "learning_rate": 6.311096948303264e-05, "loss": 1.3252, "step": 2780 }, { "epoch": 1.248808923266633, "grad_norm": 1.3590023517608643, "learning_rate": 6.299764260444378e-05, "loss": 1.2825, "step": 2785 }, { "epoch": 1.2510509500588531, "grad_norm": 1.1007918119430542, "learning_rate": 6.288424406996238e-05, "loss": 1.2437, "step": 2790 }, { "epoch": 1.2532929768510734, "grad_norm": 1.2783552408218384, "learning_rate": 6.277077450475354e-05, "loss": 1.3539, "step": 2795 }, { "epoch": 1.2555350036432935, "grad_norm": 1.2107961177825928, "learning_rate": 6.265723453437404e-05, "loss": 1.3215, "step": 2800 }, { "epoch": 1.2577770304355136, "grad_norm": 1.0384870767593384, "learning_rate": 6.254362478476878e-05, "loss": 1.2514, "step": 2805 }, { "epoch": 1.2600190572277339, "grad_norm": 1.3173192739486694, "learning_rate": 6.242994588226731e-05, "loss": 1.3129, "step": 2810 }, { "epoch": 1.262261084019954, "grad_norm": 0.9206444621086121, "learning_rate": 6.231619845358045e-05, "loss": 1.3108, "step": 2815 }, { "epoch": 1.2645031108121743, "grad_norm": 1.13257896900177, "learning_rate": 6.220238312579682e-05, "loss": 1.286, "step": 2820 }, { "epoch": 1.2667451376043943, "grad_norm": 1.3280197381973267, "learning_rate": 6.208850052637933e-05, "loss": 1.2462, "step": 2825 }, { "epoch": 1.2689871643966146, "grad_norm": 1.2632642984390259, "learning_rate": 6.197455128316178e-05, "loss": 1.2761, "step": 2830 }, { "epoch": 1.2712291911888347, "grad_norm": 1.0141961574554443, "learning_rate": 6.186053602434539e-05, "loss": 1.2421, "step": 2835 }, { "epoch": 1.2734712179810548, "grad_norm": 1.2043393850326538, "learning_rate": 6.174645537849529e-05, "loss": 1.333, "step": 2840 }, { "epoch": 1.275713244773275, "grad_norm": 1.1990864276885986, "learning_rate": 6.163230997453712e-05, "loss": 1.3188, "step": 2845 }, { "epoch": 1.2779552715654952, "grad_norm": 1.0753861665725708, "learning_rate": 6.15181004417535e-05, "loss": 1.3231, "step": 2850 }, { "epoch": 1.2801972983577155, "grad_norm": 1.089961290359497, "learning_rate": 6.140382740978062e-05, "loss": 1.258, "step": 2855 }, { "epoch": 1.2824393251499355, "grad_norm": 1.217774510383606, "learning_rate": 6.12894915086047e-05, "loss": 1.2642, "step": 2860 }, { "epoch": 1.2846813519421558, "grad_norm": 1.180626630783081, "learning_rate": 6.117509336855865e-05, "loss": 1.2759, "step": 2865 }, { "epoch": 1.286923378734376, "grad_norm": 1.2082866430282593, "learning_rate": 6.106063362031838e-05, "loss": 1.3255, "step": 2870 }, { "epoch": 1.289165405526596, "grad_norm": 1.1015843152999878, "learning_rate": 6.094611289489951e-05, "loss": 1.3282, "step": 2875 }, { "epoch": 
1.2914074323188163, "grad_norm": 1.1207735538482666, "learning_rate": 6.083153182365383e-05, "loss": 1.2982, "step": 2880 }, { "epoch": 1.2936494591110363, "grad_norm": 1.1439082622528076, "learning_rate": 6.071689103826582e-05, "loss": 1.3463, "step": 2885 }, { "epoch": 1.2958914859032564, "grad_norm": 1.1893078088760376, "learning_rate": 6.060219117074913e-05, "loss": 1.2573, "step": 2890 }, { "epoch": 1.2981335126954767, "grad_norm": 1.2720766067504883, "learning_rate": 6.048743285344317e-05, "loss": 1.3029, "step": 2895 }, { "epoch": 1.3003755394876968, "grad_norm": 1.0983214378356934, "learning_rate": 6.037261671900953e-05, "loss": 1.2845, "step": 2900 }, { "epoch": 1.302617566279917, "grad_norm": 1.1721152067184448, "learning_rate": 6.02577434004286e-05, "loss": 1.3025, "step": 2905 }, { "epoch": 1.3048595930721372, "grad_norm": 1.1017165184020996, "learning_rate": 6.0142813530996e-05, "loss": 1.3166, "step": 2910 }, { "epoch": 1.3071016198643575, "grad_norm": 1.1608681678771973, "learning_rate": 6.002782774431911e-05, "loss": 1.259, "step": 2915 }, { "epoch": 1.3093436466565775, "grad_norm": 1.249861478805542, "learning_rate": 5.9912786674313614e-05, "loss": 1.2469, "step": 2920 }, { "epoch": 1.3115856734487976, "grad_norm": 1.2426470518112183, "learning_rate": 5.9797690955199926e-05, "loss": 1.2541, "step": 2925 }, { "epoch": 1.313827700241018, "grad_norm": 1.0516715049743652, "learning_rate": 5.968254122149974e-05, "loss": 1.277, "step": 2930 }, { "epoch": 1.316069727033238, "grad_norm": 1.4431426525115967, "learning_rate": 5.95673381080326e-05, "loss": 1.3182, "step": 2935 }, { "epoch": 1.3183117538254583, "grad_norm": 1.2504661083221436, "learning_rate": 5.945208224991226e-05, "loss": 1.3503, "step": 2940 }, { "epoch": 1.3205537806176784, "grad_norm": 1.1429800987243652, "learning_rate": 5.933677428254328e-05, "loss": 1.2767, "step": 2945 }, { "epoch": 1.3227958074098987, "grad_norm": 1.2553044557571411, "learning_rate": 5.922141484161751e-05, "loss": 1.2817, "step": 2950 }, { "epoch": 1.3250378342021187, "grad_norm": 1.2155333757400513, "learning_rate": 5.910600456311055e-05, "loss": 1.3347, "step": 2955 }, { "epoch": 1.3272798609943388, "grad_norm": 1.2551952600479126, "learning_rate": 5.8990544083278285e-05, "loss": 1.2119, "step": 2960 }, { "epoch": 1.329521887786559, "grad_norm": 1.1889550685882568, "learning_rate": 5.887503403865333e-05, "loss": 1.3307, "step": 2965 }, { "epoch": 1.3317639145787792, "grad_norm": 1.1134368181228638, "learning_rate": 5.8759475066041624e-05, "loss": 1.3094, "step": 2970 }, { "epoch": 1.3340059413709993, "grad_norm": 1.2652761936187744, "learning_rate": 5.8643867802518756e-05, "loss": 1.3296, "step": 2975 }, { "epoch": 1.3362479681632196, "grad_norm": 1.6688954830169678, "learning_rate": 5.852821288542658e-05, "loss": 1.3148, "step": 2980 }, { "epoch": 1.3384899949554399, "grad_norm": 0.9661517143249512, "learning_rate": 5.841251095236969e-05, "loss": 1.3197, "step": 2985 }, { "epoch": 1.34073202174766, "grad_norm": 1.1682339906692505, "learning_rate": 5.829676264121183e-05, "loss": 1.3328, "step": 2990 }, { "epoch": 1.34297404853988, "grad_norm": 1.109320044517517, "learning_rate": 5.818096859007247e-05, "loss": 1.2575, "step": 2995 }, { "epoch": 1.3452160753321003, "grad_norm": 1.2029309272766113, "learning_rate": 5.8065129437323206e-05, "loss": 1.3296, "step": 3000 }, { "epoch": 1.3474581021243204, "grad_norm": 1.104525089263916, "learning_rate": 5.794924582158431e-05, "loss": 1.2558, "step": 3005 }, { "epoch": 1.3497001289165405, 
"grad_norm": 1.1124447584152222, "learning_rate": 5.783331838172116e-05, "loss": 1.3036, "step": 3010 }, { "epoch": 1.3519421557087608, "grad_norm": 1.1220247745513916, "learning_rate": 5.771734775684072e-05, "loss": 1.3161, "step": 3015 }, { "epoch": 1.3541841825009808, "grad_norm": 1.143099069595337, "learning_rate": 5.760133458628809e-05, "loss": 1.3066, "step": 3020 }, { "epoch": 1.3564262092932011, "grad_norm": 0.9693493247032166, "learning_rate": 5.7485279509642885e-05, "loss": 1.3089, "step": 3025 }, { "epoch": 1.3586682360854212, "grad_norm": 1.0467145442962646, "learning_rate": 5.736918316671572e-05, "loss": 1.2631, "step": 3030 }, { "epoch": 1.3609102628776415, "grad_norm": 1.1905845403671265, "learning_rate": 5.7253046197544754e-05, "loss": 1.2759, "step": 3035 }, { "epoch": 1.3631522896698616, "grad_norm": 1.0732934474945068, "learning_rate": 5.713686924239211e-05, "loss": 1.296, "step": 3040 }, { "epoch": 1.3653943164620816, "grad_norm": 1.2555313110351562, "learning_rate": 5.702065294174036e-05, "loss": 1.2306, "step": 3045 }, { "epoch": 1.367636343254302, "grad_norm": 1.033304214477539, "learning_rate": 5.690439793628896e-05, "loss": 1.3072, "step": 3050 }, { "epoch": 1.369878370046522, "grad_norm": 1.058167576789856, "learning_rate": 5.6788104866950754e-05, "loss": 1.3995, "step": 3055 }, { "epoch": 1.372120396838742, "grad_norm": 1.0705965757369995, "learning_rate": 5.667177437484845e-05, "loss": 1.3035, "step": 3060 }, { "epoch": 1.3743624236309624, "grad_norm": 1.052674651145935, "learning_rate": 5.655540710131105e-05, "loss": 1.3247, "step": 3065 }, { "epoch": 1.3766044504231827, "grad_norm": 1.2467668056488037, "learning_rate": 5.643900368787036e-05, "loss": 1.3106, "step": 3070 }, { "epoch": 1.3788464772154028, "grad_norm": 1.1554597616195679, "learning_rate": 5.632256477625739e-05, "loss": 1.2686, "step": 3075 }, { "epoch": 1.3810885040076228, "grad_norm": 1.0708049535751343, "learning_rate": 5.6206091008398866e-05, "loss": 1.2774, "step": 3080 }, { "epoch": 1.3833305307998431, "grad_norm": 1.1895546913146973, "learning_rate": 5.608958302641364e-05, "loss": 1.1813, "step": 3085 }, { "epoch": 1.3855725575920632, "grad_norm": 1.4244434833526611, "learning_rate": 5.597304147260927e-05, "loss": 1.3678, "step": 3090 }, { "epoch": 1.3878145843842833, "grad_norm": 1.2443078756332397, "learning_rate": 5.5856466989478325e-05, "loss": 1.2248, "step": 3095 }, { "epoch": 1.3900566111765036, "grad_norm": 1.0258877277374268, "learning_rate": 5.573986021969494e-05, "loss": 1.2725, "step": 3100 }, { "epoch": 1.3922986379687237, "grad_norm": 1.0962164402008057, "learning_rate": 5.5623221806111224e-05, "loss": 1.2393, "step": 3105 }, { "epoch": 1.394540664760944, "grad_norm": 1.3782082796096802, "learning_rate": 5.550655239175377e-05, "loss": 1.2817, "step": 3110 }, { "epoch": 1.396782691553164, "grad_norm": 1.2269506454467773, "learning_rate": 5.538985261982006e-05, "loss": 1.2376, "step": 3115 }, { "epoch": 1.3990247183453843, "grad_norm": 1.2134568691253662, "learning_rate": 5.527312313367492e-05, "loss": 1.2925, "step": 3120 }, { "epoch": 1.4012667451376044, "grad_norm": 1.275994896888733, "learning_rate": 5.515636457684705e-05, "loss": 1.351, "step": 3125 }, { "epoch": 1.4035087719298245, "grad_norm": 1.0931142568588257, "learning_rate": 5.5039577593025335e-05, "loss": 1.3186, "step": 3130 }, { "epoch": 1.4057507987220448, "grad_norm": 1.0260752439498901, "learning_rate": 5.492276282605544e-05, "loss": 1.2835, "step": 3135 }, { "epoch": 1.4079928255142649, "grad_norm": 
1.2498077154159546, "learning_rate": 5.480592091993616e-05, "loss": 1.3022, "step": 3140 }, { "epoch": 1.410234852306485, "grad_norm": 1.0274704694747925, "learning_rate": 5.4689052518815954e-05, "loss": 1.2354, "step": 3145 }, { "epoch": 1.4124768790987052, "grad_norm": 1.3377097845077515, "learning_rate": 5.457215826698928e-05, "loss": 1.3043, "step": 3150 }, { "epoch": 1.4147189058909255, "grad_norm": 1.2201504707336426, "learning_rate": 5.4455238808893185e-05, "loss": 1.36, "step": 3155 }, { "epoch": 1.4169609326831456, "grad_norm": 1.1458607912063599, "learning_rate": 5.433829478910362e-05, "loss": 1.285, "step": 3160 }, { "epoch": 1.4192029594753657, "grad_norm": 1.1673274040222168, "learning_rate": 5.4221326852331965e-05, "loss": 1.3474, "step": 3165 }, { "epoch": 1.421444986267586, "grad_norm": 1.141150951385498, "learning_rate": 5.410433564342146e-05, "loss": 1.3101, "step": 3170 }, { "epoch": 1.423687013059806, "grad_norm": 1.18087899684906, "learning_rate": 5.398732180734365e-05, "loss": 1.3324, "step": 3175 }, { "epoch": 1.4259290398520261, "grad_norm": 1.1001255512237549, "learning_rate": 5.3870285989194814e-05, "loss": 1.3046, "step": 3180 }, { "epoch": 1.4281710666442464, "grad_norm": 1.2380887269973755, "learning_rate": 5.3753228834192384e-05, "loss": 1.283, "step": 3185 }, { "epoch": 1.4304130934364665, "grad_norm": 1.2417025566101074, "learning_rate": 5.3636150987671496e-05, "loss": 1.2536, "step": 3190 }, { "epoch": 1.4326551202286868, "grad_norm": 1.2791988849639893, "learning_rate": 5.35190530950813e-05, "loss": 1.314, "step": 3195 }, { "epoch": 1.4348971470209069, "grad_norm": 1.0879089832305908, "learning_rate": 5.3401935801981464e-05, "loss": 1.2726, "step": 3200 }, { "epoch": 1.4371391738131272, "grad_norm": 1.1599972248077393, "learning_rate": 5.328479975403864e-05, "loss": 1.3082, "step": 3205 }, { "epoch": 1.4393812006053472, "grad_norm": 1.1873195171356201, "learning_rate": 5.316764559702285e-05, "loss": 1.2853, "step": 3210 }, { "epoch": 1.4416232273975673, "grad_norm": 1.049641489982605, "learning_rate": 5.3050473976803974e-05, "loss": 1.3048, "step": 3215 }, { "epoch": 1.4438652541897876, "grad_norm": 1.0594843626022339, "learning_rate": 5.293328553934813e-05, "loss": 1.2845, "step": 3220 }, { "epoch": 1.4461072809820077, "grad_norm": 1.007035732269287, "learning_rate": 5.2816080930714194e-05, "loss": 1.3099, "step": 3225 }, { "epoch": 1.4483493077742278, "grad_norm": 1.0724034309387207, "learning_rate": 5.269886079705018e-05, "loss": 1.28, "step": 3230 }, { "epoch": 1.450591334566448, "grad_norm": 1.023113489151001, "learning_rate": 5.258162578458963e-05, "loss": 1.3397, "step": 3235 }, { "epoch": 1.4528333613586684, "grad_norm": 1.005807638168335, "learning_rate": 5.246437653964822e-05, "loss": 1.2121, "step": 3240 }, { "epoch": 1.4550753881508884, "grad_norm": 1.0102343559265137, "learning_rate": 5.234711370862001e-05, "loss": 1.276, "step": 3245 }, { "epoch": 1.4573174149431085, "grad_norm": 1.173030138015747, "learning_rate": 5.2229837937974e-05, "loss": 1.3212, "step": 3250 }, { "epoch": 1.4595594417353288, "grad_norm": 1.0489596128463745, "learning_rate": 5.2112549874250495e-05, "loss": 1.32, "step": 3255 }, { "epoch": 1.4618014685275489, "grad_norm": 1.084416389465332, "learning_rate": 5.199525016405759e-05, "loss": 1.2529, "step": 3260 }, { "epoch": 1.464043495319769, "grad_norm": 1.0936429500579834, "learning_rate": 5.187793945406759e-05, "loss": 1.241, "step": 3265 }, { "epoch": 1.4662855221119893, "grad_norm": 1.199352502822876, "learning_rate": 
5.1760618391013424e-05, "loss": 1.2246, "step": 3270 }, { "epoch": 1.4685275489042093, "grad_norm": 1.1133605241775513, "learning_rate": 5.164328762168514e-05, "loss": 1.3192, "step": 3275 }, { "epoch": 1.4707695756964296, "grad_norm": 1.165466070175171, "learning_rate": 5.152594779292624e-05, "loss": 1.3289, "step": 3280 }, { "epoch": 1.4730116024886497, "grad_norm": 1.1635582447052002, "learning_rate": 5.140859955163021e-05, "loss": 1.2729, "step": 3285 }, { "epoch": 1.47525362928087, "grad_norm": 1.1590099334716797, "learning_rate": 5.1291243544736875e-05, "loss": 1.3111, "step": 3290 }, { "epoch": 1.47749565607309, "grad_norm": 1.157904863357544, "learning_rate": 5.1173880419228935e-05, "loss": 1.2491, "step": 3295 }, { "epoch": 1.4797376828653102, "grad_norm": 1.103690266609192, "learning_rate": 5.105651082212828e-05, "loss": 1.2776, "step": 3300 }, { "epoch": 1.4819797096575305, "grad_norm": 1.020355463027954, "learning_rate": 5.093913540049249e-05, "loss": 1.1967, "step": 3305 }, { "epoch": 1.4842217364497505, "grad_norm": 1.2379658222198486, "learning_rate": 5.082175480141126e-05, "loss": 1.2427, "step": 3310 }, { "epoch": 1.4864637632419706, "grad_norm": 1.072657585144043, "learning_rate": 5.0704369672002835e-05, "loss": 1.325, "step": 3315 }, { "epoch": 1.488705790034191, "grad_norm": 1.2614028453826904, "learning_rate": 5.0586980659410434e-05, "loss": 1.3126, "step": 3320 }, { "epoch": 1.4909478168264112, "grad_norm": 1.1547425985336304, "learning_rate": 5.0469588410798676e-05, "loss": 1.2616, "step": 3325 }, { "epoch": 1.4931898436186313, "grad_norm": 1.2182773351669312, "learning_rate": 5.035219357335001e-05, "loss": 1.2992, "step": 3330 }, { "epoch": 1.4954318704108513, "grad_norm": 1.2161564826965332, "learning_rate": 5.023479679426122e-05, "loss": 1.2788, "step": 3335 }, { "epoch": 1.4976738972030716, "grad_norm": 1.122253656387329, "learning_rate": 5.011739872073968e-05, "loss": 1.3072, "step": 3340 }, { "epoch": 1.4999159239952917, "grad_norm": 0.9858971834182739, "learning_rate": 5e-05, "loss": 1.2049, "step": 3345 }, { "epoch": 1.5021579507875118, "grad_norm": 1.0259901285171509, "learning_rate": 4.9882601279260324e-05, "loss": 1.3184, "step": 3350 }, { "epoch": 1.504399977579732, "grad_norm": 1.0712144374847412, "learning_rate": 4.9765203205738805e-05, "loss": 1.2826, "step": 3355 }, { "epoch": 1.5066420043719524, "grad_norm": 1.0313420295715332, "learning_rate": 4.964780642664999e-05, "loss": 1.3633, "step": 3360 }, { "epoch": 1.5088840311641722, "grad_norm": 1.1968498229980469, "learning_rate": 4.953041158920133e-05, "loss": 1.2564, "step": 3365 }, { "epoch": 1.5111260579563925, "grad_norm": 1.0766561031341553, "learning_rate": 4.9413019340589585e-05, "loss": 1.2387, "step": 3370 }, { "epoch": 1.5133680847486128, "grad_norm": 1.2741787433624268, "learning_rate": 4.929563032799717e-05, "loss": 1.2113, "step": 3375 }, { "epoch": 1.515610111540833, "grad_norm": 1.1092220544815063, "learning_rate": 4.917824519858875e-05, "loss": 1.2782, "step": 3380 }, { "epoch": 1.517852138333053, "grad_norm": 1.1351913213729858, "learning_rate": 4.906086459950753e-05, "loss": 1.2667, "step": 3385 }, { "epoch": 1.5200941651252733, "grad_norm": 1.1670454740524292, "learning_rate": 4.8943489177871735e-05, "loss": 1.2764, "step": 3390 }, { "epoch": 1.5223361919174934, "grad_norm": 1.1347793340682983, "learning_rate": 4.882611958077108e-05, "loss": 1.3095, "step": 3395 }, { "epoch": 1.5245782187097134, "grad_norm": 1.0640754699707031, "learning_rate": 4.870875645526313e-05, "loss": 1.2696, 
"step": 3400 }, { "epoch": 1.5268202455019337, "grad_norm": 1.1215641498565674, "learning_rate": 4.859140044836979e-05, "loss": 1.2618, "step": 3405 }, { "epoch": 1.529062272294154, "grad_norm": 0.9714592695236206, "learning_rate": 4.847405220707377e-05, "loss": 1.3044, "step": 3410 }, { "epoch": 1.531304299086374, "grad_norm": 1.055709719657898, "learning_rate": 4.8356712378314876e-05, "loss": 1.3893, "step": 3415 }, { "epoch": 1.5335463258785942, "grad_norm": 1.0931789875030518, "learning_rate": 4.823938160898657e-05, "loss": 1.3075, "step": 3420 }, { "epoch": 1.5357883526708145, "grad_norm": 1.0338480472564697, "learning_rate": 4.812206054593242e-05, "loss": 1.334, "step": 3425 }, { "epoch": 1.5380303794630346, "grad_norm": 1.1629575490951538, "learning_rate": 4.800474983594242e-05, "loss": 1.2991, "step": 3430 }, { "epoch": 1.5402724062552546, "grad_norm": 0.9702677726745605, "learning_rate": 4.788745012574952e-05, "loss": 1.2372, "step": 3435 }, { "epoch": 1.542514433047475, "grad_norm": 1.1541732549667358, "learning_rate": 4.777016206202602e-05, "loss": 1.3549, "step": 3440 }, { "epoch": 1.5447564598396952, "grad_norm": 1.108521580696106, "learning_rate": 4.765288629137999e-05, "loss": 1.2351, "step": 3445 }, { "epoch": 1.5469984866319153, "grad_norm": 1.2135175466537476, "learning_rate": 4.753562346035178e-05, "loss": 1.2808, "step": 3450 }, { "epoch": 1.5492405134241354, "grad_norm": 1.0196810960769653, "learning_rate": 4.7418374215410374e-05, "loss": 1.2797, "step": 3455 }, { "epoch": 1.5514825402163557, "grad_norm": 1.1233173608779907, "learning_rate": 4.730113920294983e-05, "loss": 1.2932, "step": 3460 }, { "epoch": 1.5537245670085758, "grad_norm": 1.0524299144744873, "learning_rate": 4.7183919069285804e-05, "loss": 1.2907, "step": 3465 }, { "epoch": 1.5559665938007958, "grad_norm": 1.174949288368225, "learning_rate": 4.706671446065188e-05, "loss": 1.2596, "step": 3470 }, { "epoch": 1.5582086205930161, "grad_norm": 1.152764081954956, "learning_rate": 4.694952602319603e-05, "loss": 1.3416, "step": 3475 }, { "epoch": 1.5604506473852362, "grad_norm": 1.1165378093719482, "learning_rate": 4.683235440297717e-05, "loss": 1.2781, "step": 3480 }, { "epoch": 1.5626926741774563, "grad_norm": 1.0606844425201416, "learning_rate": 4.671520024596137e-05, "loss": 1.3009, "step": 3485 }, { "epoch": 1.5649347009696766, "grad_norm": 1.023228645324707, "learning_rate": 4.659806419801855e-05, "loss": 1.3042, "step": 3490 }, { "epoch": 1.5671767277618969, "grad_norm": 1.2059510946273804, "learning_rate": 4.6480946904918735e-05, "loss": 1.2997, "step": 3495 }, { "epoch": 1.569418754554117, "grad_norm": 1.0934103727340698, "learning_rate": 4.636384901232852e-05, "loss": 1.2771, "step": 3500 }, { "epoch": 1.571660781346337, "grad_norm": 1.032578945159912, "learning_rate": 4.6246771165807614e-05, "loss": 1.2553, "step": 3505 }, { "epoch": 1.5739028081385573, "grad_norm": 1.055982232093811, "learning_rate": 4.612971401080521e-05, "loss": 1.2673, "step": 3510 }, { "epoch": 1.5761448349307774, "grad_norm": 1.00336754322052, "learning_rate": 4.6012678192656364e-05, "loss": 1.2102, "step": 3515 }, { "epoch": 1.5783868617229975, "grad_norm": 1.0931719541549683, "learning_rate": 4.589566435657854e-05, "loss": 1.242, "step": 3520 }, { "epoch": 1.5806288885152178, "grad_norm": 1.1765341758728027, "learning_rate": 4.5778673147668053e-05, "loss": 1.2747, "step": 3525 }, { "epoch": 1.582870915307438, "grad_norm": 1.2692338228225708, "learning_rate": 4.5661705210896395e-05, "loss": 1.3241, "step": 3530 }, { "epoch": 
1.5851129420996581, "grad_norm": 1.2092036008834839, "learning_rate": 4.5544761191106826e-05, "loss": 1.271, "step": 3535 }, { "epoch": 1.5873549688918782, "grad_norm": 1.2053848505020142, "learning_rate": 4.542784173301072e-05, "loss": 1.2828, "step": 3540 }, { "epoch": 1.5895969956840985, "grad_norm": 1.1756088733673096, "learning_rate": 4.5310947481184064e-05, "loss": 1.2556, "step": 3545 }, { "epoch": 1.5918390224763186, "grad_norm": 1.1956021785736084, "learning_rate": 4.5194079080063835e-05, "loss": 1.2561, "step": 3550 }, { "epoch": 1.5940810492685387, "grad_norm": 1.0988577604293823, "learning_rate": 4.5077237173944576e-05, "loss": 1.4031, "step": 3555 }, { "epoch": 1.596323076060759, "grad_norm": 1.1947277784347534, "learning_rate": 4.496042240697467e-05, "loss": 1.2634, "step": 3560 }, { "epoch": 1.5985651028529793, "grad_norm": 1.0463786125183105, "learning_rate": 4.484363542315297e-05, "loss": 1.2856, "step": 3565 }, { "epoch": 1.6008071296451991, "grad_norm": 1.0513739585876465, "learning_rate": 4.4726876866325086e-05, "loss": 1.2232, "step": 3570 }, { "epoch": 1.6030491564374194, "grad_norm": 1.1749991178512573, "learning_rate": 4.461014738017995e-05, "loss": 1.3407, "step": 3575 }, { "epoch": 1.6052911832296397, "grad_norm": 1.0796834230422974, "learning_rate": 4.4493447608246253e-05, "loss": 1.2917, "step": 3580 }, { "epoch": 1.6075332100218598, "grad_norm": 1.0776811838150024, "learning_rate": 4.437677819388879e-05, "loss": 1.3028, "step": 3585 }, { "epoch": 1.6097752368140799, "grad_norm": 1.1453006267547607, "learning_rate": 4.4260139780305074e-05, "loss": 1.2752, "step": 3590 }, { "epoch": 1.6120172636063002, "grad_norm": 0.9714769124984741, "learning_rate": 4.4143533010521686e-05, "loss": 1.2274, "step": 3595 }, { "epoch": 1.6142592903985202, "grad_norm": 1.298377513885498, "learning_rate": 4.4026958527390735e-05, "loss": 1.2982, "step": 3600 }, { "epoch": 1.6165013171907403, "grad_norm": 1.1102139949798584, "learning_rate": 4.391041697358636e-05, "loss": 1.3122, "step": 3605 }, { "epoch": 1.6187433439829606, "grad_norm": 1.0720750093460083, "learning_rate": 4.3793908991601166e-05, "loss": 1.3212, "step": 3610 }, { "epoch": 1.620985370775181, "grad_norm": 1.0826951265335083, "learning_rate": 4.367743522374261e-05, "loss": 1.2706, "step": 3615 }, { "epoch": 1.623227397567401, "grad_norm": 0.9729198217391968, "learning_rate": 4.3560996312129636e-05, "loss": 1.3026, "step": 3620 }, { "epoch": 1.625469424359621, "grad_norm": 1.081446647644043, "learning_rate": 4.344459289868895e-05, "loss": 1.2997, "step": 3625 }, { "epoch": 1.6277114511518413, "grad_norm": 1.1220719814300537, "learning_rate": 4.3328225625151553e-05, "loss": 1.2356, "step": 3630 }, { "epoch": 1.6299534779440614, "grad_norm": 1.0425808429718018, "learning_rate": 4.3211895133049244e-05, "loss": 1.2756, "step": 3635 }, { "epoch": 1.6321955047362815, "grad_norm": 1.0694538354873657, "learning_rate": 4.309560206371106e-05, "loss": 1.316, "step": 3640 }, { "epoch": 1.6344375315285018, "grad_norm": 1.0898274183273315, "learning_rate": 4.297934705825966e-05, "loss": 1.3316, "step": 3645 }, { "epoch": 1.636679558320722, "grad_norm": 1.176999807357788, "learning_rate": 4.2863130757607906e-05, "loss": 1.2538, "step": 3650 }, { "epoch": 1.638921585112942, "grad_norm": 1.2387757301330566, "learning_rate": 4.274695380245526e-05, "loss": 1.3211, "step": 3655 }, { "epoch": 1.6411636119051622, "grad_norm": 1.3879566192626953, "learning_rate": 4.263081683328429e-05, "loss": 1.2902, "step": 3660 }, { "epoch": 
1.6434056386973825, "grad_norm": 1.071897268295288, "learning_rate": 4.2514720490357134e-05, "loss": 1.251, "step": 3665 }, { "epoch": 1.6456476654896026, "grad_norm": 1.221535086631775, "learning_rate": 4.239866541371192e-05, "loss": 1.2478, "step": 3670 }, { "epoch": 1.6478896922818227, "grad_norm": 1.0815098285675049, "learning_rate": 4.2282652243159276e-05, "loss": 1.2811, "step": 3675 }, { "epoch": 1.650131719074043, "grad_norm": 1.1960694789886475, "learning_rate": 4.216668161827887e-05, "loss": 1.2937, "step": 3680 }, { "epoch": 1.652373745866263, "grad_norm": 1.307964563369751, "learning_rate": 4.20507541784157e-05, "loss": 1.2725, "step": 3685 }, { "epoch": 1.6546157726584831, "grad_norm": 1.150295376777649, "learning_rate": 4.193487056267679e-05, "loss": 1.2542, "step": 3690 }, { "epoch": 1.6568577994507034, "grad_norm": 1.0650702714920044, "learning_rate": 4.181903140992754e-05, "loss": 1.1894, "step": 3695 }, { "epoch": 1.6590998262429237, "grad_norm": 1.0896923542022705, "learning_rate": 4.170323735878818e-05, "loss": 1.3178, "step": 3700 }, { "epoch": 1.6613418530351438, "grad_norm": 1.1696605682373047, "learning_rate": 4.1587489047630314e-05, "loss": 1.2414, "step": 3705 }, { "epoch": 1.6635838798273639, "grad_norm": 1.0459294319152832, "learning_rate": 4.1471787114573426e-05, "loss": 1.3447, "step": 3710 }, { "epoch": 1.6658259066195842, "grad_norm": 1.2181882858276367, "learning_rate": 4.135613219748125e-05, "loss": 1.2815, "step": 3715 }, { "epoch": 1.6680679334118043, "grad_norm": 1.2734391689300537, "learning_rate": 4.124052493395838e-05, "loss": 1.2832, "step": 3720 }, { "epoch": 1.6703099602040243, "grad_norm": 1.102831244468689, "learning_rate": 4.112496596134667e-05, "loss": 1.2647, "step": 3725 }, { "epoch": 1.6725519869962446, "grad_norm": 1.2140913009643555, "learning_rate": 4.100945591672173e-05, "loss": 1.258, "step": 3730 }, { "epoch": 1.674794013788465, "grad_norm": 1.0504231452941895, "learning_rate": 4.089399543688947e-05, "loss": 1.2588, "step": 3735 }, { "epoch": 1.6770360405806848, "grad_norm": 1.1240216493606567, "learning_rate": 4.07785851583825e-05, "loss": 1.2755, "step": 3740 }, { "epoch": 1.679278067372905, "grad_norm": 1.1572685241699219, "learning_rate": 4.066322571745673e-05, "loss": 1.2768, "step": 3745 }, { "epoch": 1.6815200941651254, "grad_norm": 0.9713603854179382, "learning_rate": 4.054791775008775e-05, "loss": 1.2288, "step": 3750 }, { "epoch": 1.6837621209573455, "grad_norm": 1.1444541215896606, "learning_rate": 4.043266189196741e-05, "loss": 1.2193, "step": 3755 }, { "epoch": 1.6860041477495655, "grad_norm": 1.0875182151794434, "learning_rate": 4.031745877850026e-05, "loss": 1.2802, "step": 3760 }, { "epoch": 1.6882461745417858, "grad_norm": 1.1171207427978516, "learning_rate": 4.02023090448001e-05, "loss": 1.307, "step": 3765 }, { "epoch": 1.690488201334006, "grad_norm": 0.991519570350647, "learning_rate": 4.008721332568639e-05, "loss": 1.2811, "step": 3770 }, { "epoch": 1.692730228126226, "grad_norm": 1.1010782718658447, "learning_rate": 3.9972172255680886e-05, "loss": 1.2631, "step": 3775 }, { "epoch": 1.6949722549184463, "grad_norm": 1.1108042001724243, "learning_rate": 3.985718646900402e-05, "loss": 1.3115, "step": 3780 }, { "epoch": 1.6972142817106666, "grad_norm": 1.009866714477539, "learning_rate": 3.974225659957141e-05, "loss": 1.2613, "step": 3785 }, { "epoch": 1.6994563085028866, "grad_norm": 1.0787150859832764, "learning_rate": 3.9627383280990474e-05, "loss": 1.353, "step": 3790 }, { "epoch": 1.7016983352951067, "grad_norm": 
1.0613850355148315, "learning_rate": 3.951256714655685e-05, "loss": 1.3048, "step": 3795 }, { "epoch": 1.703940362087327, "grad_norm": 1.1148884296417236, "learning_rate": 3.939780882925088e-05, "loss": 1.2918, "step": 3800 } ], "logging_steps": 5, "max_steps": 6690, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.325845119889375e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }
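The state above follows the layout of the `trainer_state.json` file that the Hugging Face `transformers` Trainer writes next to each checkpoint: `log_history` is a list of per-logging-step records (one every 5 optimizer steps here, per `logging_steps`), each carrying `epoch`, `grad_norm`, `learning_rate`, `loss`, and `step`, followed by run-level fields such as `max_steps` and `total_flos`. Below is a minimal sketch, not part of the original file, assuming the state is saved as `trainer_state.json` in the working directory and that `matplotlib` is installed; the filename and output path are illustrative only.

```python
import json

import matplotlib.pyplot as plt

# Load the Trainer checkpoint state; "log_history" holds one record per logging step.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only training records (they carry a "loss" key, unlike eval/summary entries).
records = [r for r in state["log_history"] if "loss" in r]
steps = [r["step"] for r in records]
losses = [r["loss"] for r in records]
lrs = [r["learning_rate"] for r in records]

fig, ax_loss = plt.subplots()
ax_loss.plot(steps, losses, label="training loss")
ax_loss.set_xlabel("global step")
ax_loss.set_ylabel("loss")

# Second y-axis for the learning-rate schedule logged alongside the loss.
ax_lr = ax_loss.twinx()
ax_lr.plot(steps, lrs, color="tab:orange", label="learning rate")
ax_lr.set_ylabel("learning rate")

fig.tight_layout()
fig.savefig("training_curve.png")  # illustrative output path
```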