openchat36_lora-3800-3005 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.703940362087327,
"eval_steps": 500,
"global_step": 3800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002242026792220167,
"grad_norm": 1.0706044435501099,
"learning_rate": 9.999986217521373e-05,
"loss": 2.3592,
"step": 5
},
{
"epoch": 0.004484053584440334,
"grad_norm": 1.1121090650558472,
"learning_rate": 9.999944870161475e-05,
"loss": 2.2216,
"step": 10
},
{
"epoch": 0.006726080376660501,
"grad_norm": 1.4128837585449219,
"learning_rate": 9.999875958148252e-05,
"loss": 2.0607,
"step": 15
},
{
"epoch": 0.008968107168880668,
"grad_norm": 1.3161660432815552,
"learning_rate": 9.99977948186162e-05,
"loss": 1.7687,
"step": 20
},
{
"epoch": 0.011210133961100835,
"grad_norm": 1.2509621381759644,
"learning_rate": 9.999655441833445e-05,
"loss": 1.8135,
"step": 25
},
{
"epoch": 0.013452160753321002,
"grad_norm": 1.2357916831970215,
"learning_rate": 9.999503838747563e-05,
"loss": 1.6791,
"step": 30
},
{
"epoch": 0.01569418754554117,
"grad_norm": 1.3020069599151611,
"learning_rate": 9.999324673439762e-05,
"loss": 1.5995,
"step": 35
},
{
"epoch": 0.017936214337761335,
"grad_norm": 1.2886899709701538,
"learning_rate": 9.999117946897775e-05,
"loss": 1.5247,
"step": 40
},
{
"epoch": 0.020178241129981504,
"grad_norm": 1.5385006666183472,
"learning_rate": 9.998883660261285e-05,
"loss": 1.4753,
"step": 45
},
{
"epoch": 0.02242026792220167,
"grad_norm": 1.3205918073654175,
"learning_rate": 9.998621814821914e-05,
"loss": 1.5195,
"step": 50
},
{
"epoch": 0.02466229471442184,
"grad_norm": 1.1984736919403076,
"learning_rate": 9.99833241202321e-05,
"loss": 1.5227,
"step": 55
},
{
"epoch": 0.026904321506642005,
"grad_norm": 1.4019296169281006,
"learning_rate": 9.998015453460651e-05,
"loss": 1.4167,
"step": 60
},
{
"epoch": 0.02914634829886217,
"grad_norm": 1.3315256834030151,
"learning_rate": 9.997670940881627e-05,
"loss": 1.4096,
"step": 65
},
{
"epoch": 0.03138837509108234,
"grad_norm": 1.1818488836288452,
"learning_rate": 9.99729887618543e-05,
"loss": 1.4869,
"step": 70
},
{
"epoch": 0.033630401883302505,
"grad_norm": 1.41718327999115,
"learning_rate": 9.996899261423254e-05,
"loss": 1.3926,
"step": 75
},
{
"epoch": 0.03587242867552267,
"grad_norm": 1.1350888013839722,
"learning_rate": 9.996472098798169e-05,
"loss": 1.5258,
"step": 80
},
{
"epoch": 0.03811445546774284,
"grad_norm": 1.2058793306350708,
"learning_rate": 9.996017390665118e-05,
"loss": 1.4866,
"step": 85
},
{
"epoch": 0.04035648225996301,
"grad_norm": 1.3261053562164307,
"learning_rate": 9.995535139530904e-05,
"loss": 1.4103,
"step": 90
},
{
"epoch": 0.042598509052183174,
"grad_norm": 1.4682806730270386,
"learning_rate": 9.995025348054175e-05,
"loss": 1.3903,
"step": 95
},
{
"epoch": 0.04484053584440334,
"grad_norm": 1.4472965002059937,
"learning_rate": 9.994488019045405e-05,
"loss": 1.4353,
"step": 100
},
{
"epoch": 0.047082562636623505,
"grad_norm": 1.1997061967849731,
"learning_rate": 9.993923155466884e-05,
"loss": 1.4072,
"step": 105
},
{
"epoch": 0.04932458942884368,
"grad_norm": 1.140612006187439,
"learning_rate": 9.993330760432703e-05,
"loss": 1.4188,
"step": 110
},
{
"epoch": 0.05156661622106384,
"grad_norm": 1.353933572769165,
"learning_rate": 9.992710837208726e-05,
"loss": 1.3812,
"step": 115
},
{
"epoch": 0.05380864301328401,
"grad_norm": 1.6113260984420776,
"learning_rate": 9.992063389212589e-05,
"loss": 1.4144,
"step": 120
},
{
"epoch": 0.056050669805504175,
"grad_norm": 1.1334792375564575,
"learning_rate": 9.991388420013666e-05,
"loss": 1.4366,
"step": 125
},
{
"epoch": 0.05829269659772434,
"grad_norm": 1.274436593055725,
"learning_rate": 9.990685933333054e-05,
"loss": 1.3906,
"step": 130
},
{
"epoch": 0.06053472338994451,
"grad_norm": 1.1813853979110718,
"learning_rate": 9.98995593304356e-05,
"loss": 1.4297,
"step": 135
},
{
"epoch": 0.06277675018216468,
"grad_norm": 1.367594599723816,
"learning_rate": 9.989198423169666e-05,
"loss": 1.3987,
"step": 140
},
{
"epoch": 0.06501877697438484,
"grad_norm": 1.1809221506118774,
"learning_rate": 9.98841340788752e-05,
"loss": 1.4149,
"step": 145
},
{
"epoch": 0.06726080376660501,
"grad_norm": 1.1612411737442017,
"learning_rate": 9.987600891524902e-05,
"loss": 1.4063,
"step": 150
},
{
"epoch": 0.06950283055882518,
"grad_norm": 1.2665232419967651,
"learning_rate": 9.986760878561209e-05,
"loss": 1.3765,
"step": 155
},
{
"epoch": 0.07174485735104534,
"grad_norm": 1.1557029485702515,
"learning_rate": 9.985893373627426e-05,
"loss": 1.3657,
"step": 160
},
{
"epoch": 0.07398688414326551,
"grad_norm": 1.2104707956314087,
"learning_rate": 9.984998381506099e-05,
"loss": 1.4443,
"step": 165
},
{
"epoch": 0.07622891093548569,
"grad_norm": 1.7405874729156494,
"learning_rate": 9.984075907131314e-05,
"loss": 1.4025,
"step": 170
},
{
"epoch": 0.07847093772770584,
"grad_norm": 1.366018295288086,
"learning_rate": 9.983125955588662e-05,
"loss": 1.3552,
"step": 175
},
{
"epoch": 0.08071296451992602,
"grad_norm": 1.171950101852417,
"learning_rate": 9.982148532115218e-05,
"loss": 1.3902,
"step": 180
},
{
"epoch": 0.08295499131214618,
"grad_norm": 1.1674119234085083,
"learning_rate": 9.98114364209951e-05,
"loss": 1.3777,
"step": 185
},
{
"epoch": 0.08519701810436635,
"grad_norm": 1.069298505783081,
"learning_rate": 9.980111291081488e-05,
"loss": 1.4023,
"step": 190
},
{
"epoch": 0.08743904489658652,
"grad_norm": 1.3680400848388672,
"learning_rate": 9.979051484752496e-05,
"loss": 1.3485,
"step": 195
},
{
"epoch": 0.08968107168880668,
"grad_norm": 1.2846928834915161,
"learning_rate": 9.977964228955232e-05,
"loss": 1.4068,
"step": 200
},
{
"epoch": 0.09192309848102685,
"grad_norm": 1.1173886060714722,
"learning_rate": 9.976849529683734e-05,
"loss": 1.4086,
"step": 205
},
{
"epoch": 0.09416512527324701,
"grad_norm": 1.1485246419906616,
"learning_rate": 9.975707393083327e-05,
"loss": 1.3981,
"step": 210
},
{
"epoch": 0.09640715206546718,
"grad_norm": 1.503753900527954,
"learning_rate": 9.974537825450598e-05,
"loss": 1.3903,
"step": 215
},
{
"epoch": 0.09864917885768736,
"grad_norm": 1.1287413835525513,
"learning_rate": 9.973340833233369e-05,
"loss": 1.4314,
"step": 220
},
{
"epoch": 0.10089120564990751,
"grad_norm": 1.2056329250335693,
"learning_rate": 9.972116423030641e-05,
"loss": 1.402,
"step": 225
},
{
"epoch": 0.10313323244212769,
"grad_norm": 1.5625340938568115,
"learning_rate": 9.970864601592583e-05,
"loss": 1.3223,
"step": 230
},
{
"epoch": 0.10537525923434785,
"grad_norm": 1.3235552310943604,
"learning_rate": 9.969585375820474e-05,
"loss": 1.3783,
"step": 235
},
{
"epoch": 0.10761728602656802,
"grad_norm": 1.2155221700668335,
"learning_rate": 9.968278752766672e-05,
"loss": 1.422,
"step": 240
},
{
"epoch": 0.10985931281878819,
"grad_norm": 1.470157504081726,
"learning_rate": 9.966944739634581e-05,
"loss": 1.4048,
"step": 245
},
{
"epoch": 0.11210133961100835,
"grad_norm": 1.295973300933838,
"learning_rate": 9.965583343778605e-05,
"loss": 1.4016,
"step": 250
},
{
"epoch": 0.11434336640322852,
"grad_norm": 1.306946873664856,
"learning_rate": 9.964194572704106e-05,
"loss": 1.3679,
"step": 255
},
{
"epoch": 0.11658539319544868,
"grad_norm": 1.2880349159240723,
"learning_rate": 9.962778434067368e-05,
"loss": 1.4271,
"step": 260
},
{
"epoch": 0.11882741998766885,
"grad_norm": 1.165236473083496,
"learning_rate": 9.96133493567555e-05,
"loss": 1.4263,
"step": 265
},
{
"epoch": 0.12106944677988903,
"grad_norm": 1.293133020401001,
"learning_rate": 9.959864085486648e-05,
"loss": 1.4395,
"step": 270
},
{
"epoch": 0.12331147357210918,
"grad_norm": 1.165431022644043,
"learning_rate": 9.958365891609444e-05,
"loss": 1.3845,
"step": 275
},
{
"epoch": 0.12555350036432936,
"grad_norm": 1.262797474861145,
"learning_rate": 9.956840362303473e-05,
"loss": 1.4234,
"step": 280
},
{
"epoch": 0.12779552715654952,
"grad_norm": 1.3967214822769165,
"learning_rate": 9.955287505978959e-05,
"loss": 1.3922,
"step": 285
},
{
"epoch": 0.13003755394876967,
"grad_norm": 1.1680519580841064,
"learning_rate": 9.953707331196787e-05,
"loss": 1.3595,
"step": 290
},
{
"epoch": 0.13227958074098986,
"grad_norm": 1.2890777587890625,
"learning_rate": 9.95209984666845e-05,
"loss": 1.3109,
"step": 295
},
{
"epoch": 0.13452160753321002,
"grad_norm": 1.3940659761428833,
"learning_rate": 9.950465061255996e-05,
"loss": 1.4059,
"step": 300
},
{
"epoch": 0.13676363432543018,
"grad_norm": 1.1973896026611328,
"learning_rate": 9.948802983971981e-05,
"loss": 1.377,
"step": 305
},
{
"epoch": 0.13900566111765036,
"grad_norm": 1.222718358039856,
"learning_rate": 9.947113623979422e-05,
"loss": 1.3703,
"step": 310
},
{
"epoch": 0.14124768790987052,
"grad_norm": 1.3250412940979004,
"learning_rate": 9.945396990591751e-05,
"loss": 1.2982,
"step": 315
},
{
"epoch": 0.14348971470209068,
"grad_norm": 1.1987338066101074,
"learning_rate": 9.943653093272749e-05,
"loss": 1.3408,
"step": 320
},
{
"epoch": 0.14573174149431087,
"grad_norm": 1.0673400163650513,
"learning_rate": 9.941881941636506e-05,
"loss": 1.3754,
"step": 325
},
{
"epoch": 0.14797376828653103,
"grad_norm": 1.1687766313552856,
"learning_rate": 9.94008354544737e-05,
"loss": 1.352,
"step": 330
},
{
"epoch": 0.15021579507875119,
"grad_norm": 1.1456810235977173,
"learning_rate": 9.938257914619882e-05,
"loss": 1.3725,
"step": 335
},
{
"epoch": 0.15245782187097137,
"grad_norm": 1.0828138589859009,
"learning_rate": 9.936405059218728e-05,
"loss": 1.3499,
"step": 340
},
{
"epoch": 0.15469984866319153,
"grad_norm": 1.4693915843963623,
"learning_rate": 9.934524989458684e-05,
"loss": 1.3956,
"step": 345
},
{
"epoch": 0.1569418754554117,
"grad_norm": 1.2608237266540527,
"learning_rate": 9.932617715704562e-05,
"loss": 1.3734,
"step": 350
},
{
"epoch": 0.15918390224763185,
"grad_norm": 1.2090539932250977,
"learning_rate": 9.930683248471142e-05,
"loss": 1.4775,
"step": 355
},
{
"epoch": 0.16142592903985203,
"grad_norm": 1.1096559762954712,
"learning_rate": 9.928721598423125e-05,
"loss": 1.3189,
"step": 360
},
{
"epoch": 0.1636679558320722,
"grad_norm": 1.3460302352905273,
"learning_rate": 9.926732776375073e-05,
"loss": 1.3477,
"step": 365
},
{
"epoch": 0.16590998262429235,
"grad_norm": 1.372318983078003,
"learning_rate": 9.924716793291346e-05,
"loss": 1.3753,
"step": 370
},
{
"epoch": 0.16815200941651254,
"grad_norm": 1.4617116451263428,
"learning_rate": 9.922673660286039e-05,
"loss": 1.3927,
"step": 375
},
{
"epoch": 0.1703940362087327,
"grad_norm": 1.1783477067947388,
"learning_rate": 9.920603388622928e-05,
"loss": 1.3698,
"step": 380
},
{
"epoch": 0.17263606300095286,
"grad_norm": 1.087998628616333,
"learning_rate": 9.918505989715403e-05,
"loss": 1.3449,
"step": 385
},
{
"epoch": 0.17487808979317304,
"grad_norm": 1.1929972171783447,
"learning_rate": 9.916381475126407e-05,
"loss": 1.4253,
"step": 390
},
{
"epoch": 0.1771201165853932,
"grad_norm": 1.3316450119018555,
"learning_rate": 9.914229856568369e-05,
"loss": 1.4232,
"step": 395
},
{
"epoch": 0.17936214337761336,
"grad_norm": 1.0855283737182617,
"learning_rate": 9.912051145903144e-05,
"loss": 1.3813,
"step": 400
},
{
"epoch": 0.18160417016983352,
"grad_norm": 1.1286154985427856,
"learning_rate": 9.909845355141946e-05,
"loss": 1.3733,
"step": 405
},
{
"epoch": 0.1838461969620537,
"grad_norm": 1.1122159957885742,
"learning_rate": 9.90761249644528e-05,
"loss": 1.3546,
"step": 410
},
{
"epoch": 0.18608822375427386,
"grad_norm": 1.0875800848007202,
"learning_rate": 9.905352582122878e-05,
"loss": 1.4501,
"step": 415
},
{
"epoch": 0.18833025054649402,
"grad_norm": 1.130376935005188,
"learning_rate": 9.903065624633628e-05,
"loss": 1.404,
"step": 420
},
{
"epoch": 0.1905722773387142,
"grad_norm": 1.267067790031433,
"learning_rate": 9.900751636585506e-05,
"loss": 1.3379,
"step": 425
},
{
"epoch": 0.19281430413093437,
"grad_norm": 1.1137257814407349,
"learning_rate": 9.898410630735509e-05,
"loss": 1.3062,
"step": 430
},
{
"epoch": 0.19505633092315453,
"grad_norm": 1.3096929788589478,
"learning_rate": 9.896042619989581e-05,
"loss": 1.4038,
"step": 435
},
{
"epoch": 0.1972983577153747,
"grad_norm": 1.3187003135681152,
"learning_rate": 9.893647617402548e-05,
"loss": 1.3936,
"step": 440
},
{
"epoch": 0.19954038450759487,
"grad_norm": 1.1851136684417725,
"learning_rate": 9.891225636178037e-05,
"loss": 1.3456,
"step": 445
},
{
"epoch": 0.20178241129981503,
"grad_norm": 1.2681955099105835,
"learning_rate": 9.88877668966841e-05,
"loss": 1.3961,
"step": 450
},
{
"epoch": 0.2040244380920352,
"grad_norm": 1.2412629127502441,
"learning_rate": 9.886300791374688e-05,
"loss": 1.3843,
"step": 455
},
{
"epoch": 0.20626646488425537,
"grad_norm": 1.3303419351577759,
"learning_rate": 9.883797954946476e-05,
"loss": 1.3459,
"step": 460
},
{
"epoch": 0.20850849167647553,
"grad_norm": 1.100720763206482,
"learning_rate": 9.881268194181892e-05,
"loss": 1.4156,
"step": 465
},
{
"epoch": 0.2107505184686957,
"grad_norm": 1.0826263427734375,
"learning_rate": 9.878711523027484e-05,
"loss": 1.3297,
"step": 470
},
{
"epoch": 0.21299254526091588,
"grad_norm": 1.4636311531066895,
"learning_rate": 9.876127955578158e-05,
"loss": 1.3662,
"step": 475
},
{
"epoch": 0.21523457205313604,
"grad_norm": 1.1484990119934082,
"learning_rate": 9.873517506077101e-05,
"loss": 1.36,
"step": 480
},
{
"epoch": 0.2174765988453562,
"grad_norm": 1.2333481311798096,
"learning_rate": 9.870880188915698e-05,
"loss": 1.3587,
"step": 485
},
{
"epoch": 0.21971862563757638,
"grad_norm": 1.1720088720321655,
"learning_rate": 9.868216018633456e-05,
"loss": 1.2882,
"step": 490
},
{
"epoch": 0.22196065242979654,
"grad_norm": 1.2749361991882324,
"learning_rate": 9.865525009917921e-05,
"loss": 1.3564,
"step": 495
},
{
"epoch": 0.2242026792220167,
"grad_norm": 1.1952840089797974,
"learning_rate": 9.862807177604602e-05,
"loss": 1.3956,
"step": 500
},
{
"epoch": 0.22644470601423686,
"grad_norm": 1.215401530265808,
"learning_rate": 9.860062536676888e-05,
"loss": 1.3836,
"step": 505
},
{
"epoch": 0.22868673280645704,
"grad_norm": 1.2550543546676636,
"learning_rate": 9.857291102265959e-05,
"loss": 1.3626,
"step": 510
},
{
"epoch": 0.2309287595986772,
"grad_norm": 1.1673667430877686,
"learning_rate": 9.854492889650709e-05,
"loss": 1.3601,
"step": 515
},
{
"epoch": 0.23317078639089736,
"grad_norm": 1.2657443284988403,
"learning_rate": 9.851667914257661e-05,
"loss": 1.3216,
"step": 520
},
{
"epoch": 0.23541281318311755,
"grad_norm": 1.3874006271362305,
"learning_rate": 9.848816191660878e-05,
"loss": 1.3565,
"step": 525
},
{
"epoch": 0.2376548399753377,
"grad_norm": 1.3701063394546509,
"learning_rate": 9.845937737581885e-05,
"loss": 1.3676,
"step": 530
},
{
"epoch": 0.23989686676755786,
"grad_norm": 1.1547927856445312,
"learning_rate": 9.843032567889572e-05,
"loss": 1.3882,
"step": 535
},
{
"epoch": 0.24213889355977805,
"grad_norm": 1.2529016733169556,
"learning_rate": 9.840100698600118e-05,
"loss": 1.4058,
"step": 540
},
{
"epoch": 0.2443809203519982,
"grad_norm": 1.250368595123291,
"learning_rate": 9.837142145876892e-05,
"loss": 1.3544,
"step": 545
},
{
"epoch": 0.24662294714421837,
"grad_norm": 1.0099977254867554,
"learning_rate": 9.834156926030368e-05,
"loss": 1.3435,
"step": 550
},
{
"epoch": 0.24886497393643853,
"grad_norm": 1.169044017791748,
"learning_rate": 9.831145055518039e-05,
"loss": 1.3226,
"step": 555
},
{
"epoch": 0.2511070007286587,
"grad_norm": 1.1605632305145264,
"learning_rate": 9.828106550944322e-05,
"loss": 1.3941,
"step": 560
},
{
"epoch": 0.2533490275208789,
"grad_norm": 1.1079938411712646,
"learning_rate": 9.825041429060466e-05,
"loss": 1.3151,
"step": 565
},
{
"epoch": 0.25559105431309903,
"grad_norm": 1.39505934715271,
"learning_rate": 9.821949706764463e-05,
"loss": 1.34,
"step": 570
},
{
"epoch": 0.2578330811053192,
"grad_norm": 1.3362523317337036,
"learning_rate": 9.81883140110095e-05,
"loss": 1.3317,
"step": 575
},
{
"epoch": 0.26007510789753935,
"grad_norm": 1.131722092628479,
"learning_rate": 9.815686529261119e-05,
"loss": 1.3044,
"step": 580
},
{
"epoch": 0.26231713468975953,
"grad_norm": 1.227959394454956,
"learning_rate": 9.812515108582622e-05,
"loss": 1.3882,
"step": 585
},
{
"epoch": 0.2645591614819797,
"grad_norm": 1.2343266010284424,
"learning_rate": 9.809317156549476e-05,
"loss": 1.3132,
"step": 590
},
{
"epoch": 0.26680118827419985,
"grad_norm": 1.3055241107940674,
"learning_rate": 9.806092690791962e-05,
"loss": 1.3804,
"step": 595
},
{
"epoch": 0.26904321506642004,
"grad_norm": 1.1219717264175415,
"learning_rate": 9.80284172908653e-05,
"loss": 1.4393,
"step": 600
},
{
"epoch": 0.2712852418586402,
"grad_norm": 1.1093581914901733,
"learning_rate": 9.799564289355707e-05,
"loss": 1.4185,
"step": 605
},
{
"epoch": 0.27352726865086036,
"grad_norm": 1.181302785873413,
"learning_rate": 9.79626038966799e-05,
"loss": 1.3762,
"step": 610
},
{
"epoch": 0.27576929544308054,
"grad_norm": 1.238273024559021,
"learning_rate": 9.79293004823775e-05,
"loss": 1.4161,
"step": 615
},
{
"epoch": 0.27801132223530073,
"grad_norm": 1.2279235124588013,
"learning_rate": 9.789573283425126e-05,
"loss": 1.3645,
"step": 620
},
{
"epoch": 0.28025334902752086,
"grad_norm": 1.0937743186950684,
"learning_rate": 9.78619011373594e-05,
"loss": 1.374,
"step": 625
},
{
"epoch": 0.28249537581974105,
"grad_norm": 1.0662868022918701,
"learning_rate": 9.782780557821576e-05,
"loss": 1.2941,
"step": 630
},
{
"epoch": 0.28473740261196123,
"grad_norm": 1.2285066843032837,
"learning_rate": 9.779344634478884e-05,
"loss": 1.3532,
"step": 635
},
{
"epoch": 0.28697942940418136,
"grad_norm": 1.336958646774292,
"learning_rate": 9.775882362650083e-05,
"loss": 1.3376,
"step": 640
},
{
"epoch": 0.28922145619640155,
"grad_norm": 1.1303439140319824,
"learning_rate": 9.772393761422645e-05,
"loss": 1.3311,
"step": 645
},
{
"epoch": 0.29146348298862174,
"grad_norm": 1.155773401260376,
"learning_rate": 9.768878850029201e-05,
"loss": 1.3207,
"step": 650
},
{
"epoch": 0.29370550978084187,
"grad_norm": 1.1738945245742798,
"learning_rate": 9.765337647847429e-05,
"loss": 1.3448,
"step": 655
},
{
"epoch": 0.29594753657306205,
"grad_norm": 1.1708767414093018,
"learning_rate": 9.761770174399943e-05,
"loss": 1.4237,
"step": 660
},
{
"epoch": 0.29818956336528224,
"grad_norm": 1.1160731315612793,
"learning_rate": 9.758176449354194e-05,
"loss": 1.3669,
"step": 665
},
{
"epoch": 0.30043159015750237,
"grad_norm": 1.2477370500564575,
"learning_rate": 9.754556492522359e-05,
"loss": 1.3638,
"step": 670
},
{
"epoch": 0.30267361694972256,
"grad_norm": 1.1834142208099365,
"learning_rate": 9.750910323861228e-05,
"loss": 1.3193,
"step": 675
},
{
"epoch": 0.30491564374194274,
"grad_norm": 1.1948530673980713,
"learning_rate": 9.747237963472098e-05,
"loss": 1.3663,
"step": 680
},
{
"epoch": 0.3071576705341629,
"grad_norm": 1.172042965888977,
"learning_rate": 9.743539431600661e-05,
"loss": 1.3777,
"step": 685
},
{
"epoch": 0.30939969732638306,
"grad_norm": 1.276157259941101,
"learning_rate": 9.739814748636891e-05,
"loss": 1.3025,
"step": 690
},
{
"epoch": 0.3116417241186032,
"grad_norm": 1.0595532655715942,
"learning_rate": 9.736063935114934e-05,
"loss": 1.3276,
"step": 695
},
{
"epoch": 0.3138837509108234,
"grad_norm": 1.1211802959442139,
"learning_rate": 9.732287011712992e-05,
"loss": 1.3408,
"step": 700
},
{
"epoch": 0.31612577770304356,
"grad_norm": 1.0752381086349487,
"learning_rate": 9.72848399925321e-05,
"loss": 1.3546,
"step": 705
},
{
"epoch": 0.3183678044952637,
"grad_norm": 1.1245768070220947,
"learning_rate": 9.724654918701568e-05,
"loss": 1.3702,
"step": 710
},
{
"epoch": 0.3206098312874839,
"grad_norm": 1.1155140399932861,
"learning_rate": 9.720799791167749e-05,
"loss": 1.4169,
"step": 715
},
{
"epoch": 0.32285185807970407,
"grad_norm": 1.0726211071014404,
"learning_rate": 9.716918637905041e-05,
"loss": 1.3408,
"step": 720
},
{
"epoch": 0.3250938848719242,
"grad_norm": 1.0829260349273682,
"learning_rate": 9.713011480310208e-05,
"loss": 1.3408,
"step": 725
},
{
"epoch": 0.3273359116641444,
"grad_norm": 1.2374547719955444,
"learning_rate": 9.709078339923377e-05,
"loss": 1.4005,
"step": 730
},
{
"epoch": 0.32957793845636457,
"grad_norm": 1.0573582649230957,
"learning_rate": 9.705119238427915e-05,
"loss": 1.3704,
"step": 735
},
{
"epoch": 0.3318199652485847,
"grad_norm": 1.245229959487915,
"learning_rate": 9.701134197650318e-05,
"loss": 1.359,
"step": 740
},
{
"epoch": 0.3340619920408049,
"grad_norm": 1.2916717529296875,
"learning_rate": 9.697123239560081e-05,
"loss": 1.3983,
"step": 745
},
{
"epoch": 0.3363040188330251,
"grad_norm": 1.0935052633285522,
"learning_rate": 9.693086386269581e-05,
"loss": 1.2974,
"step": 750
},
{
"epoch": 0.3385460456252452,
"grad_norm": 1.2657032012939453,
"learning_rate": 9.689023660033956e-05,
"loss": 1.4309,
"step": 755
},
{
"epoch": 0.3407880724174654,
"grad_norm": 1.198128581047058,
"learning_rate": 9.684935083250979e-05,
"loss": 1.361,
"step": 760
},
{
"epoch": 0.3430300992096856,
"grad_norm": 1.0504889488220215,
"learning_rate": 9.680820678460941e-05,
"loss": 1.3671,
"step": 765
},
{
"epoch": 0.3452721260019057,
"grad_norm": 0.9644368290901184,
"learning_rate": 9.676680468346521e-05,
"loss": 1.3223,
"step": 770
},
{
"epoch": 0.3475141527941259,
"grad_norm": 1.2863438129425049,
"learning_rate": 9.672514475732659e-05,
"loss": 1.2832,
"step": 775
},
{
"epoch": 0.3497561795863461,
"grad_norm": 1.2852818965911865,
"learning_rate": 9.66832272358644e-05,
"loss": 1.3586,
"step": 780
},
{
"epoch": 0.3519982063785662,
"grad_norm": 1.088563323020935,
"learning_rate": 9.664105235016955e-05,
"loss": 1.3518,
"step": 785
},
{
"epoch": 0.3542402331707864,
"grad_norm": 1.138024926185608,
"learning_rate": 9.659862033275187e-05,
"loss": 1.4139,
"step": 790
},
{
"epoch": 0.35648225996300653,
"grad_norm": 1.1250396966934204,
"learning_rate": 9.655593141753865e-05,
"loss": 1.3991,
"step": 795
},
{
"epoch": 0.3587242867552267,
"grad_norm": 1.3767707347869873,
"learning_rate": 9.651298583987353e-05,
"loss": 1.3365,
"step": 800
},
{
"epoch": 0.3609663135474469,
"grad_norm": 1.0378060340881348,
"learning_rate": 9.646978383651515e-05,
"loss": 1.3198,
"step": 805
},
{
"epoch": 0.36320834033966704,
"grad_norm": 1.138748049736023,
"learning_rate": 9.642632564563576e-05,
"loss": 1.3889,
"step": 810
},
{
"epoch": 0.3654503671318872,
"grad_norm": 1.2029573917388916,
"learning_rate": 9.638261150681998e-05,
"loss": 1.3673,
"step": 815
},
{
"epoch": 0.3676923939241074,
"grad_norm": 1.0248106718063354,
"learning_rate": 9.63386416610635e-05,
"loss": 1.3291,
"step": 820
},
{
"epoch": 0.36993442071632754,
"grad_norm": 1.1774693727493286,
"learning_rate": 9.62944163507717e-05,
"loss": 1.3082,
"step": 825
},
{
"epoch": 0.3721764475085477,
"grad_norm": 1.075829029083252,
"learning_rate": 9.624993581975833e-05,
"loss": 1.3391,
"step": 830
},
{
"epoch": 0.3744184743007679,
"grad_norm": 1.2112139463424683,
"learning_rate": 9.62052003132442e-05,
"loss": 1.3169,
"step": 835
},
{
"epoch": 0.37666050109298804,
"grad_norm": 1.2244364023208618,
"learning_rate": 9.616021007785576e-05,
"loss": 1.3267,
"step": 840
},
{
"epoch": 0.37890252788520823,
"grad_norm": 1.0250012874603271,
"learning_rate": 9.611496536162379e-05,
"loss": 1.3162,
"step": 845
},
{
"epoch": 0.3811445546774284,
"grad_norm": 1.1248130798339844,
"learning_rate": 9.606946641398203e-05,
"loss": 1.3244,
"step": 850
},
{
"epoch": 0.38338658146964855,
"grad_norm": 1.1805521249771118,
"learning_rate": 9.602371348576577e-05,
"loss": 1.3474,
"step": 855
},
{
"epoch": 0.38562860826186873,
"grad_norm": 1.129887342453003,
"learning_rate": 9.597770682921055e-05,
"loss": 1.317,
"step": 860
},
{
"epoch": 0.3878706350540889,
"grad_norm": 1.2311817407608032,
"learning_rate": 9.593144669795066e-05,
"loss": 1.3399,
"step": 865
},
{
"epoch": 0.39011266184630905,
"grad_norm": 1.1044973134994507,
"learning_rate": 9.588493334701777e-05,
"loss": 1.3829,
"step": 870
},
{
"epoch": 0.39235468863852924,
"grad_norm": 1.142473816871643,
"learning_rate": 9.583816703283965e-05,
"loss": 1.3408,
"step": 875
},
{
"epoch": 0.3945967154307494,
"grad_norm": 1.054578423500061,
"learning_rate": 9.579114801323854e-05,
"loss": 1.2964,
"step": 880
},
{
"epoch": 0.39683874222296955,
"grad_norm": 1.1482200622558594,
"learning_rate": 9.574387654742992e-05,
"loss": 1.3412,
"step": 885
},
{
"epoch": 0.39908076901518974,
"grad_norm": 1.0244128704071045,
"learning_rate": 9.569635289602097e-05,
"loss": 1.3605,
"step": 890
},
{
"epoch": 0.40132279580740987,
"grad_norm": 0.9474136233329773,
"learning_rate": 9.564857732100916e-05,
"loss": 1.3675,
"step": 895
},
{
"epoch": 0.40356482259963006,
"grad_norm": 1.0347918272018433,
"learning_rate": 9.560055008578085e-05,
"loss": 1.3696,
"step": 900
},
{
"epoch": 0.40580684939185024,
"grad_norm": 1.2021688222885132,
"learning_rate": 9.555227145510977e-05,
"loss": 1.32,
"step": 905
},
{
"epoch": 0.4080488761840704,
"grad_norm": 1.1408722400665283,
"learning_rate": 9.550374169515557e-05,
"loss": 1.3757,
"step": 910
},
{
"epoch": 0.41029090297629056,
"grad_norm": 1.015257716178894,
"learning_rate": 9.545496107346244e-05,
"loss": 1.3332,
"step": 915
},
{
"epoch": 0.41253292976851075,
"grad_norm": 1.5246819257736206,
"learning_rate": 9.540592985895752e-05,
"loss": 1.2848,
"step": 920
},
{
"epoch": 0.4147749565607309,
"grad_norm": 1.1320191621780396,
"learning_rate": 9.535664832194946e-05,
"loss": 1.3324,
"step": 925
},
{
"epoch": 0.41701698335295107,
"grad_norm": 1.169104814529419,
"learning_rate": 9.530711673412698e-05,
"loss": 1.3697,
"step": 930
},
{
"epoch": 0.41925901014517125,
"grad_norm": 1.03293776512146,
"learning_rate": 9.525733536855728e-05,
"loss": 1.3582,
"step": 935
},
{
"epoch": 0.4215010369373914,
"grad_norm": 1.3983210325241089,
"learning_rate": 9.520730449968461e-05,
"loss": 1.3631,
"step": 940
},
{
"epoch": 0.42374306372961157,
"grad_norm": 1.2297945022583008,
"learning_rate": 9.515702440332869e-05,
"loss": 1.4169,
"step": 945
},
{
"epoch": 0.42598509052183176,
"grad_norm": 1.3570704460144043,
"learning_rate": 9.510649535668332e-05,
"loss": 1.3588,
"step": 950
},
{
"epoch": 0.4282271173140519,
"grad_norm": 1.1815954446792603,
"learning_rate": 9.505571763831468e-05,
"loss": 1.364,
"step": 955
},
{
"epoch": 0.4304691441062721,
"grad_norm": 1.199096441268921,
"learning_rate": 9.500469152815988e-05,
"loss": 1.3813,
"step": 960
},
{
"epoch": 0.43271117089849226,
"grad_norm": 1.0751597881317139,
"learning_rate": 9.495341730752543e-05,
"loss": 1.3479,
"step": 965
},
{
"epoch": 0.4349531976907124,
"grad_norm": 1.121031641960144,
"learning_rate": 9.490189525908569e-05,
"loss": 1.2976,
"step": 970
},
{
"epoch": 0.4371952244829326,
"grad_norm": 1.0710008144378662,
"learning_rate": 9.485012566688127e-05,
"loss": 1.33,
"step": 975
},
{
"epoch": 0.43943725127515276,
"grad_norm": 1.1103382110595703,
"learning_rate": 9.479810881631747e-05,
"loss": 1.3194,
"step": 980
},
{
"epoch": 0.4416792780673729,
"grad_norm": 1.1765540838241577,
"learning_rate": 9.474584499416275e-05,
"loss": 1.4135,
"step": 985
},
{
"epoch": 0.4439213048595931,
"grad_norm": 1.1305935382843018,
"learning_rate": 9.469333448854713e-05,
"loss": 1.2884,
"step": 990
},
{
"epoch": 0.44616333165181327,
"grad_norm": 1.0487785339355469,
"learning_rate": 9.464057758896055e-05,
"loss": 1.3547,
"step": 995
},
{
"epoch": 0.4484053584440334,
"grad_norm": 1.071997046470642,
"learning_rate": 9.458757458625138e-05,
"loss": 1.3376,
"step": 1000
},
{
"epoch": 0.4506473852362536,
"grad_norm": 1.2403199672698975,
"learning_rate": 9.453432577262471e-05,
"loss": 1.3056,
"step": 1005
},
{
"epoch": 0.4528894120284737,
"grad_norm": 1.2904599905014038,
"learning_rate": 9.448083144164077e-05,
"loss": 1.3357,
"step": 1010
},
{
"epoch": 0.4551314388206939,
"grad_norm": 0.9735843539237976,
"learning_rate": 9.442709188821337e-05,
"loss": 1.3731,
"step": 1015
},
{
"epoch": 0.4573734656129141,
"grad_norm": 1.131100058555603,
"learning_rate": 9.437310740860822e-05,
"loss": 1.3478,
"step": 1020
},
{
"epoch": 0.4596154924051342,
"grad_norm": 1.149906873703003,
"learning_rate": 9.431887830044129e-05,
"loss": 1.3439,
"step": 1025
},
{
"epoch": 0.4618575191973544,
"grad_norm": 1.177563190460205,
"learning_rate": 9.426440486267716e-05,
"loss": 1.4656,
"step": 1030
},
{
"epoch": 0.4640995459895746,
"grad_norm": 1.1288046836853027,
"learning_rate": 9.420968739562744e-05,
"loss": 1.4185,
"step": 1035
},
{
"epoch": 0.4663415727817947,
"grad_norm": 1.2524133920669556,
"learning_rate": 9.415472620094909e-05,
"loss": 1.3401,
"step": 1040
},
{
"epoch": 0.4685835995740149,
"grad_norm": 1.212417721748352,
"learning_rate": 9.409952158164266e-05,
"loss": 1.3573,
"step": 1045
},
{
"epoch": 0.4708256263662351,
"grad_norm": 1.0959070920944214,
"learning_rate": 9.404407384205078e-05,
"loss": 1.2674,
"step": 1050
},
{
"epoch": 0.4730676531584552,
"grad_norm": 1.1945031881332397,
"learning_rate": 9.398838328785635e-05,
"loss": 1.3511,
"step": 1055
},
{
"epoch": 0.4753096799506754,
"grad_norm": 1.1044509410858154,
"learning_rate": 9.393245022608091e-05,
"loss": 1.3917,
"step": 1060
},
{
"epoch": 0.4775517067428956,
"grad_norm": 1.4578787088394165,
"learning_rate": 9.387627496508298e-05,
"loss": 1.3883,
"step": 1065
},
{
"epoch": 0.47979373353511573,
"grad_norm": 1.1177469491958618,
"learning_rate": 9.381985781455625e-05,
"loss": 1.3079,
"step": 1070
},
{
"epoch": 0.4820357603273359,
"grad_norm": 1.0329993963241577,
"learning_rate": 9.376319908552803e-05,
"loss": 1.3693,
"step": 1075
},
{
"epoch": 0.4842777871195561,
"grad_norm": 1.0311007499694824,
"learning_rate": 9.37062990903574e-05,
"loss": 1.2942,
"step": 1080
},
{
"epoch": 0.48651981391177623,
"grad_norm": 1.036125659942627,
"learning_rate": 9.364915814273351e-05,
"loss": 1.3083,
"step": 1085
},
{
"epoch": 0.4887618407039964,
"grad_norm": 1.1864365339279175,
"learning_rate": 9.359177655767396e-05,
"loss": 1.3344,
"step": 1090
},
{
"epoch": 0.4910038674962166,
"grad_norm": 1.2126179933547974,
"learning_rate": 9.353415465152293e-05,
"loss": 1.3113,
"step": 1095
},
{
"epoch": 0.49324589428843674,
"grad_norm": 1.0252037048339844,
"learning_rate": 9.34762927419495e-05,
"loss": 1.3527,
"step": 1100
},
{
"epoch": 0.4954879210806569,
"grad_norm": 1.058380126953125,
"learning_rate": 9.341819114794584e-05,
"loss": 1.327,
"step": 1105
},
{
"epoch": 0.49772994787287705,
"grad_norm": 1.0073350667953491,
"learning_rate": 9.335985018982559e-05,
"loss": 1.3563,
"step": 1110
},
{
"epoch": 0.49997197466509724,
"grad_norm": 1.0354520082473755,
"learning_rate": 9.330127018922194e-05,
"loss": 1.311,
"step": 1115
},
{
"epoch": 0.5022140014573174,
"grad_norm": 1.4187575578689575,
"learning_rate": 9.324245146908592e-05,
"loss": 1.386,
"step": 1120
},
{
"epoch": 0.5044560282495376,
"grad_norm": 1.1989063024520874,
"learning_rate": 9.318339435368464e-05,
"loss": 1.3826,
"step": 1125
},
{
"epoch": 0.5066980550417578,
"grad_norm": 1.1496927738189697,
"learning_rate": 9.312409916859948e-05,
"loss": 1.3464,
"step": 1130
},
{
"epoch": 0.5089400818339779,
"grad_norm": 1.0821688175201416,
"learning_rate": 9.306456624072426e-05,
"loss": 1.319,
"step": 1135
},
{
"epoch": 0.5111821086261981,
"grad_norm": 0.9903674721717834,
"learning_rate": 9.300479589826355e-05,
"loss": 1.331,
"step": 1140
},
{
"epoch": 0.5134241354184182,
"grad_norm": 1.0320252180099487,
"learning_rate": 9.294478847073069e-05,
"loss": 1.3697,
"step": 1145
},
{
"epoch": 0.5156661622106384,
"grad_norm": 1.0023198127746582,
"learning_rate": 9.288454428894615e-05,
"loss": 1.2954,
"step": 1150
},
{
"epoch": 0.5179081890028586,
"grad_norm": 0.9635931253433228,
"learning_rate": 9.282406368503556e-05,
"loss": 1.3488,
"step": 1155
},
{
"epoch": 0.5201502157950787,
"grad_norm": 1.0810673236846924,
"learning_rate": 9.276334699242799e-05,
"loss": 1.3507,
"step": 1160
},
{
"epoch": 0.5223922425872989,
"grad_norm": 1.1213270425796509,
"learning_rate": 9.270239454585404e-05,
"loss": 1.3535,
"step": 1165
},
{
"epoch": 0.5246342693795191,
"grad_norm": 1.1882630586624146,
"learning_rate": 9.264120668134405e-05,
"loss": 1.3221,
"step": 1170
},
{
"epoch": 0.5268762961717393,
"grad_norm": 1.1680420637130737,
"learning_rate": 9.257978373622615e-05,
"loss": 1.3585,
"step": 1175
},
{
"epoch": 0.5291183229639594,
"grad_norm": 1.063761591911316,
"learning_rate": 9.251812604912453e-05,
"loss": 1.3171,
"step": 1180
},
{
"epoch": 0.5313603497561796,
"grad_norm": 1.2708847522735596,
"learning_rate": 9.245623395995751e-05,
"loss": 1.3829,
"step": 1185
},
{
"epoch": 0.5336023765483997,
"grad_norm": 1.1421536207199097,
"learning_rate": 9.239410780993564e-05,
"loss": 1.3211,
"step": 1190
},
{
"epoch": 0.5358444033406199,
"grad_norm": 1.2646090984344482,
"learning_rate": 9.233174794155985e-05,
"loss": 1.3228,
"step": 1195
},
{
"epoch": 0.5380864301328401,
"grad_norm": 1.1613190174102783,
"learning_rate": 9.226915469861956e-05,
"loss": 1.3229,
"step": 1200
},
{
"epoch": 0.5403284569250603,
"grad_norm": 1.0214089155197144,
"learning_rate": 9.220632842619079e-05,
"loss": 1.3952,
"step": 1205
},
{
"epoch": 0.5425704837172804,
"grad_norm": 1.172778844833374,
"learning_rate": 9.214326947063423e-05,
"loss": 1.3208,
"step": 1210
},
{
"epoch": 0.5448125105095006,
"grad_norm": 1.251479983329773,
"learning_rate": 9.207997817959338e-05,
"loss": 1.3232,
"step": 1215
},
{
"epoch": 0.5470545373017207,
"grad_norm": 1.2302333116531372,
"learning_rate": 9.201645490199256e-05,
"loss": 1.2792,
"step": 1220
},
{
"epoch": 0.5492965640939409,
"grad_norm": 1.0342446565628052,
"learning_rate": 9.195269998803507e-05,
"loss": 1.3181,
"step": 1225
},
{
"epoch": 0.5515385908861611,
"grad_norm": 0.9909287095069885,
"learning_rate": 9.188871378920122e-05,
"loss": 1.339,
"step": 1230
},
{
"epoch": 0.5537806176783813,
"grad_norm": 1.1493330001831055,
"learning_rate": 9.182449665824636e-05,
"loss": 1.3659,
"step": 1235
},
{
"epoch": 0.5560226444706015,
"grad_norm": 1.094141960144043,
"learning_rate": 9.1760048949199e-05,
"loss": 1.3464,
"step": 1240
},
{
"epoch": 0.5582646712628216,
"grad_norm": 1.0574826002120972,
"learning_rate": 9.169537101735879e-05,
"loss": 1.2936,
"step": 1245
},
{
"epoch": 0.5605066980550417,
"grad_norm": 0.9537421464920044,
"learning_rate": 9.163046321929462e-05,
"loss": 1.2573,
"step": 1250
},
{
"epoch": 0.5627487248472619,
"grad_norm": 1.3234580755233765,
"learning_rate": 9.156532591284263e-05,
"loss": 1.3271,
"step": 1255
},
{
"epoch": 0.5649907516394821,
"grad_norm": 1.078527808189392,
"learning_rate": 9.149995945710423e-05,
"loss": 1.3126,
"step": 1260
},
{
"epoch": 0.5672327784317023,
"grad_norm": 1.2018640041351318,
"learning_rate": 9.143436421244416e-05,
"loss": 1.3642,
"step": 1265
},
{
"epoch": 0.5694748052239225,
"grad_norm": 1.0188864469528198,
"learning_rate": 9.136854054048838e-05,
"loss": 1.2751,
"step": 1270
},
{
"epoch": 0.5717168320161425,
"grad_norm": 1.2628931999206543,
"learning_rate": 9.130248880412229e-05,
"loss": 1.364,
"step": 1275
},
{
"epoch": 0.5739588588083627,
"grad_norm": 1.1243770122528076,
"learning_rate": 9.123620936748853e-05,
"loss": 1.3668,
"step": 1280
},
{
"epoch": 0.5762008856005829,
"grad_norm": 1.1701164245605469,
"learning_rate": 9.116970259598505e-05,
"loss": 1.3434,
"step": 1285
},
{
"epoch": 0.5784429123928031,
"grad_norm": 1.0601651668548584,
"learning_rate": 9.110296885626314e-05,
"loss": 1.2645,
"step": 1290
},
{
"epoch": 0.5806849391850233,
"grad_norm": 1.2184094190597534,
"learning_rate": 9.103600851622531e-05,
"loss": 1.3468,
"step": 1295
},
{
"epoch": 0.5829269659772435,
"grad_norm": 1.5354876518249512,
"learning_rate": 9.096882194502337e-05,
"loss": 1.4595,
"step": 1300
},
{
"epoch": 0.5851689927694635,
"grad_norm": 1.0867820978164673,
"learning_rate": 9.09014095130563e-05,
"loss": 1.385,
"step": 1305
},
{
"epoch": 0.5874110195616837,
"grad_norm": 1.2308603525161743,
"learning_rate": 9.083377159196825e-05,
"loss": 1.3503,
"step": 1310
},
{
"epoch": 0.5896530463539039,
"grad_norm": 1.2136027812957764,
"learning_rate": 9.07659085546465e-05,
"loss": 1.3986,
"step": 1315
},
{
"epoch": 0.5918950731461241,
"grad_norm": 0.9775259494781494,
"learning_rate": 9.069782077521943e-05,
"loss": 1.4075,
"step": 1320
},
{
"epoch": 0.5941370999383443,
"grad_norm": 1.0732626914978027,
"learning_rate": 9.062950862905432e-05,
"loss": 1.3594,
"step": 1325
},
{
"epoch": 0.5963791267305645,
"grad_norm": 0.9587686061859131,
"learning_rate": 9.056097249275553e-05,
"loss": 1.3741,
"step": 1330
},
{
"epoch": 0.5986211535227846,
"grad_norm": 0.9384256601333618,
"learning_rate": 9.049221274416213e-05,
"loss": 1.3553,
"step": 1335
},
{
"epoch": 0.6008631803150047,
"grad_norm": 1.0991201400756836,
"learning_rate": 9.042322976234606e-05,
"loss": 1.3533,
"step": 1340
},
{
"epoch": 0.6031052071072249,
"grad_norm": 1.215849757194519,
"learning_rate": 9.035402392760988e-05,
"loss": 1.3747,
"step": 1345
},
{
"epoch": 0.6053472338994451,
"grad_norm": 1.416882872581482,
"learning_rate": 9.02845956214848e-05,
"loss": 1.3554,
"step": 1350
},
{
"epoch": 0.6075892606916653,
"grad_norm": 1.1282700300216675,
"learning_rate": 9.021494522672845e-05,
"loss": 1.3741,
"step": 1355
},
{
"epoch": 0.6098312874838855,
"grad_norm": 0.9101713299751282,
"learning_rate": 9.014507312732285e-05,
"loss": 1.3012,
"step": 1360
},
{
"epoch": 0.6120733142761056,
"grad_norm": 1.0538674592971802,
"learning_rate": 9.007497970847234e-05,
"loss": 1.3273,
"step": 1365
},
{
"epoch": 0.6143153410683257,
"grad_norm": 1.3435333967208862,
"learning_rate": 9.000466535660129e-05,
"loss": 1.4025,
"step": 1370
},
{
"epoch": 0.6165573678605459,
"grad_norm": 1.0372601747512817,
"learning_rate": 8.993413045935215e-05,
"loss": 1.3212,
"step": 1375
},
{
"epoch": 0.6187993946527661,
"grad_norm": 1.0866960287094116,
"learning_rate": 8.986337540558318e-05,
"loss": 1.3799,
"step": 1380
},
{
"epoch": 0.6210414214449863,
"grad_norm": 1.0121322870254517,
"learning_rate": 8.97924005853664e-05,
"loss": 1.3493,
"step": 1385
},
{
"epoch": 0.6232834482372064,
"grad_norm": 1.071612000465393,
"learning_rate": 8.972120638998539e-05,
"loss": 1.3564,
"step": 1390
},
{
"epoch": 0.6255254750294266,
"grad_norm": 1.103440523147583,
"learning_rate": 8.964979321193314e-05,
"loss": 1.2915,
"step": 1395
},
{
"epoch": 0.6277675018216468,
"grad_norm": 0.9943517446517944,
"learning_rate": 8.957816144490989e-05,
"loss": 1.3424,
"step": 1400
},
{
"epoch": 0.6300095286138669,
"grad_norm": 1.0576980113983154,
"learning_rate": 8.950631148382095e-05,
"loss": 1.3101,
"step": 1405
},
{
"epoch": 0.6322515554060871,
"grad_norm": 1.0375151634216309,
"learning_rate": 8.943424372477455e-05,
"loss": 1.3308,
"step": 1410
},
{
"epoch": 0.6344935821983073,
"grad_norm": 1.1026891469955444,
"learning_rate": 8.936195856507962e-05,
"loss": 1.3229,
"step": 1415
},
{
"epoch": 0.6367356089905274,
"grad_norm": 1.1137725114822388,
"learning_rate": 8.928945640324364e-05,
"loss": 1.2864,
"step": 1420
},
{
"epoch": 0.6389776357827476,
"grad_norm": 1.0674328804016113,
"learning_rate": 8.921673763897041e-05,
"loss": 1.339,
"step": 1425
},
{
"epoch": 0.6412196625749678,
"grad_norm": 1.1279280185699463,
"learning_rate": 8.914380267315782e-05,
"loss": 1.3516,
"step": 1430
},
{
"epoch": 0.643461689367188,
"grad_norm": 0.9942423105239868,
"learning_rate": 8.907065190789577e-05,
"loss": 1.3102,
"step": 1435
},
{
"epoch": 0.6457037161594081,
"grad_norm": 1.1335337162017822,
"learning_rate": 8.899728574646376e-05,
"loss": 1.304,
"step": 1440
},
{
"epoch": 0.6479457429516283,
"grad_norm": 1.0654945373535156,
"learning_rate": 8.892370459332883e-05,
"loss": 1.273,
"step": 1445
},
{
"epoch": 0.6501877697438484,
"grad_norm": 1.0929509401321411,
"learning_rate": 8.884990885414326e-05,
"loss": 1.3298,
"step": 1450
},
{
"epoch": 0.6524297965360686,
"grad_norm": 1.157837986946106,
"learning_rate": 8.87758989357423e-05,
"loss": 1.3395,
"step": 1455
},
{
"epoch": 0.6546718233282888,
"grad_norm": 1.1370052099227905,
"learning_rate": 8.8701675246142e-05,
"loss": 1.3823,
"step": 1460
},
{
"epoch": 0.656913850120509,
"grad_norm": 1.096897840499878,
"learning_rate": 8.862723819453696e-05,
"loss": 1.2579,
"step": 1465
},
{
"epoch": 0.6591558769127291,
"grad_norm": 1.028351902961731,
"learning_rate": 8.855258819129796e-05,
"loss": 1.323,
"step": 1470
},
{
"epoch": 0.6613979037049492,
"grad_norm": 1.2492655515670776,
"learning_rate": 8.847772564796987e-05,
"loss": 1.3316,
"step": 1475
},
{
"epoch": 0.6636399304971694,
"grad_norm": 1.1350480318069458,
"learning_rate": 8.840265097726923e-05,
"loss": 1.3331,
"step": 1480
},
{
"epoch": 0.6658819572893896,
"grad_norm": 1.057501196861267,
"learning_rate": 8.832736459308207e-05,
"loss": 1.3092,
"step": 1485
},
{
"epoch": 0.6681239840816098,
"grad_norm": 0.9846299290657043,
"learning_rate": 8.825186691046157e-05,
"loss": 1.3709,
"step": 1490
},
{
"epoch": 0.67036601087383,
"grad_norm": 1.2653725147247314,
"learning_rate": 8.817615834562583e-05,
"loss": 1.368,
"step": 1495
},
{
"epoch": 0.6726080376660502,
"grad_norm": 0.9622915387153625,
"learning_rate": 8.81002393159555e-05,
"loss": 1.3952,
"step": 1500
},
{
"epoch": 0.6748500644582702,
"grad_norm": 1.1680620908737183,
"learning_rate": 8.802411023999153e-05,
"loss": 1.378,
"step": 1505
},
{
"epoch": 0.6770920912504904,
"grad_norm": 1.2336018085479736,
"learning_rate": 8.79477715374329e-05,
"loss": 1.3017,
"step": 1510
},
{
"epoch": 0.6793341180427106,
"grad_norm": 1.0431910753250122,
"learning_rate": 8.78712236291342e-05,
"loss": 1.2801,
"step": 1515
},
{
"epoch": 0.6815761448349308,
"grad_norm": 0.9812450408935547,
"learning_rate": 8.779446693710341e-05,
"loss": 1.4084,
"step": 1520
},
{
"epoch": 0.683818171627151,
"grad_norm": 0.9624593257904053,
"learning_rate": 8.771750188449951e-05,
"loss": 1.324,
"step": 1525
},
{
"epoch": 0.6860601984193712,
"grad_norm": 0.9403428435325623,
"learning_rate": 8.764032889563017e-05,
"loss": 1.3739,
"step": 1530
},
{
"epoch": 0.6883022252115912,
"grad_norm": 1.0417587757110596,
"learning_rate": 8.756294839594943e-05,
"loss": 1.2942,
"step": 1535
},
{
"epoch": 0.6905442520038114,
"grad_norm": 1.1011159420013428,
"learning_rate": 8.74853608120553e-05,
"loss": 1.391,
"step": 1540
},
{
"epoch": 0.6927862787960316,
"grad_norm": 1.0298092365264893,
"learning_rate": 8.74075665716875e-05,
"loss": 1.2973,
"step": 1545
},
{
"epoch": 0.6950283055882518,
"grad_norm": 1.156357765197754,
"learning_rate": 8.732956610372499e-05,
"loss": 1.2932,
"step": 1550
},
{
"epoch": 0.697270332380472,
"grad_norm": 0.9823068976402283,
"learning_rate": 8.725135983818369e-05,
"loss": 1.3696,
"step": 1555
},
{
"epoch": 0.6995123591726922,
"grad_norm": 1.0234986543655396,
"learning_rate": 8.717294820621407e-05,
"loss": 1.3504,
"step": 1560
},
{
"epoch": 0.7017543859649122,
"grad_norm": 1.3021448850631714,
"learning_rate": 8.70943316400988e-05,
"loss": 1.3624,
"step": 1565
},
{
"epoch": 0.7039964127571324,
"grad_norm": 1.166528582572937,
"learning_rate": 8.70155105732503e-05,
"loss": 1.3469,
"step": 1570
},
{
"epoch": 0.7062384395493526,
"grad_norm": 1.2379478216171265,
"learning_rate": 8.693648544020847e-05,
"loss": 1.3586,
"step": 1575
},
{
"epoch": 0.7084804663415728,
"grad_norm": 0.9946653842926025,
"learning_rate": 8.68572566766382e-05,
"loss": 1.3349,
"step": 1580
},
{
"epoch": 0.710722493133793,
"grad_norm": 1.184866189956665,
"learning_rate": 8.677782471932696e-05,
"loss": 1.2896,
"step": 1585
},
{
"epoch": 0.7129645199260131,
"grad_norm": 1.2160494327545166,
"learning_rate": 8.669819000618246e-05,
"loss": 1.3714,
"step": 1590
},
{
"epoch": 0.7152065467182332,
"grad_norm": 1.096117615699768,
"learning_rate": 8.66183529762302e-05,
"loss": 1.3556,
"step": 1595
},
{
"epoch": 0.7174485735104534,
"grad_norm": 0.9968474507331848,
"learning_rate": 8.653831406961105e-05,
"loss": 1.3476,
"step": 1600
},
{
"epoch": 0.7196906003026736,
"grad_norm": 1.093274474143982,
"learning_rate": 8.64580737275788e-05,
"loss": 1.3187,
"step": 1605
},
{
"epoch": 0.7219326270948938,
"grad_norm": 1.1728419065475464,
"learning_rate": 8.637763239249777e-05,
"loss": 1.3481,
"step": 1610
},
{
"epoch": 0.724174653887114,
"grad_norm": 1.1466108560562134,
"learning_rate": 8.629699050784038e-05,
"loss": 1.3226,
"step": 1615
},
{
"epoch": 0.7264166806793341,
"grad_norm": 1.0177853107452393,
"learning_rate": 8.621614851818461e-05,
"loss": 1.3065,
"step": 1620
},
{
"epoch": 0.7286587074715543,
"grad_norm": 0.9964995384216309,
"learning_rate": 8.61351068692117e-05,
"loss": 1.3096,
"step": 1625
},
{
"epoch": 0.7309007342637744,
"grad_norm": 0.9439290165901184,
"learning_rate": 8.605386600770353e-05,
"loss": 1.2842,
"step": 1630
},
{
"epoch": 0.7331427610559946,
"grad_norm": 1.1577221155166626,
"learning_rate": 8.59724263815403e-05,
"loss": 1.3666,
"step": 1635
},
{
"epoch": 0.7353847878482148,
"grad_norm": 1.0668253898620605,
"learning_rate": 8.589078843969796e-05,
"loss": 1.3084,
"step": 1640
},
{
"epoch": 0.737626814640435,
"grad_norm": 1.0648199319839478,
"learning_rate": 8.580895263224578e-05,
"loss": 1.3817,
"step": 1645
},
{
"epoch": 0.7398688414326551,
"grad_norm": 1.081084132194519,
"learning_rate": 8.572691941034389e-05,
"loss": 1.2961,
"step": 1650
},
{
"epoch": 0.7421108682248753,
"grad_norm": 0.9493741393089294,
"learning_rate": 8.564468922624073e-05,
"loss": 1.3692,
"step": 1655
},
{
"epoch": 0.7443528950170955,
"grad_norm": 1.156214952468872,
"learning_rate": 8.556226253327059e-05,
"loss": 1.3368,
"step": 1660
},
{
"epoch": 0.7465949218093156,
"grad_norm": 1.098140001296997,
"learning_rate": 8.547963978585114e-05,
"loss": 1.3045,
"step": 1665
},
{
"epoch": 0.7488369486015358,
"grad_norm": 1.1900348663330078,
"learning_rate": 8.539682143948087e-05,
"loss": 1.3388,
"step": 1670
},
{
"epoch": 0.751078975393756,
"grad_norm": 1.0908799171447754,
"learning_rate": 8.531380795073662e-05,
"loss": 1.2893,
"step": 1675
},
{
"epoch": 0.7533210021859761,
"grad_norm": 1.1332789659500122,
"learning_rate": 8.523059977727103e-05,
"loss": 1.278,
"step": 1680
},
{
"epoch": 0.7555630289781963,
"grad_norm": 1.1658406257629395,
"learning_rate": 8.514719737781008e-05,
"loss": 1.38,
"step": 1685
},
{
"epoch": 0.7578050557704165,
"grad_norm": 1.1062614917755127,
"learning_rate": 8.506360121215045e-05,
"loss": 1.2967,
"step": 1690
},
{
"epoch": 0.7600470825626366,
"grad_norm": 1.1336619853973389,
"learning_rate": 8.497981174115712e-05,
"loss": 1.3339,
"step": 1695
},
{
"epoch": 0.7622891093548568,
"grad_norm": 0.9592335820198059,
"learning_rate": 8.48958294267607e-05,
"loss": 1.3373,
"step": 1700
},
{
"epoch": 0.7645311361470769,
"grad_norm": 1.2497416734695435,
"learning_rate": 8.4811654731955e-05,
"loss": 1.3679,
"step": 1705
},
{
"epoch": 0.7667731629392971,
"grad_norm": 1.078972578048706,
"learning_rate": 8.472728812079436e-05,
"loss": 1.3833,
"step": 1710
},
{
"epoch": 0.7690151897315173,
"grad_norm": 1.0341068506240845,
"learning_rate": 8.464273005839119e-05,
"loss": 1.304,
"step": 1715
},
{
"epoch": 0.7712572165237375,
"grad_norm": 0.9276494383811951,
"learning_rate": 8.455798101091338e-05,
"loss": 1.3569,
"step": 1720
},
{
"epoch": 0.7734992433159577,
"grad_norm": 1.232210397720337,
"learning_rate": 8.447304144558171e-05,
"loss": 1.3199,
"step": 1725
},
{
"epoch": 0.7757412701081778,
"grad_norm": 1.031119704246521,
"learning_rate": 8.438791183066728e-05,
"loss": 1.3693,
"step": 1730
},
{
"epoch": 0.7779832969003979,
"grad_norm": 0.9429606795310974,
"learning_rate": 8.43025926354889e-05,
"loss": 1.3712,
"step": 1735
},
{
"epoch": 0.7802253236926181,
"grad_norm": 1.0232348442077637,
"learning_rate": 8.421708433041058e-05,
"loss": 1.2815,
"step": 1740
},
{
"epoch": 0.7824673504848383,
"grad_norm": 1.1679573059082031,
"learning_rate": 8.413138738683887e-05,
"loss": 1.2576,
"step": 1745
},
{
"epoch": 0.7847093772770585,
"grad_norm": 1.3956390619277954,
"learning_rate": 8.40455022772203e-05,
"loss": 1.3678,
"step": 1750
},
{
"epoch": 0.7869514040692787,
"grad_norm": 1.1722822189331055,
"learning_rate": 8.395942947503874e-05,
"loss": 1.2261,
"step": 1755
},
{
"epoch": 0.7891934308614988,
"grad_norm": 1.1038949489593506,
"learning_rate": 8.38731694548128e-05,
"loss": 1.3066,
"step": 1760
},
{
"epoch": 0.7914354576537189,
"grad_norm": 1.0882680416107178,
"learning_rate": 8.378672269209326e-05,
"loss": 1.388,
"step": 1765
},
{
"epoch": 0.7936774844459391,
"grad_norm": 0.9000134468078613,
"learning_rate": 8.370008966346037e-05,
"loss": 1.3099,
"step": 1770
},
{
"epoch": 0.7959195112381593,
"grad_norm": 0.993665874004364,
"learning_rate": 8.361327084652126e-05,
"loss": 1.2892,
"step": 1775
},
{
"epoch": 0.7981615380303795,
"grad_norm": 1.091774344444275,
"learning_rate": 8.352626671990735e-05,
"loss": 1.3601,
"step": 1780
},
{
"epoch": 0.8004035648225997,
"grad_norm": 1.1141952276229858,
"learning_rate": 8.343907776327164e-05,
"loss": 1.3546,
"step": 1785
},
{
"epoch": 0.8026455916148197,
"grad_norm": 0.9900937676429749,
"learning_rate": 8.335170445728608e-05,
"loss": 1.3254,
"step": 1790
},
{
"epoch": 0.8048876184070399,
"grad_norm": 0.959354817867279,
"learning_rate": 8.326414728363899e-05,
"loss": 1.3446,
"step": 1795
},
{
"epoch": 0.8071296451992601,
"grad_norm": 1.1290162801742554,
"learning_rate": 8.317640672503231e-05,
"loss": 1.3338,
"step": 1800
},
{
"epoch": 0.8093716719914803,
"grad_norm": 0.9364314675331116,
"learning_rate": 8.308848326517897e-05,
"loss": 1.2879,
"step": 1805
},
{
"epoch": 0.8116136987837005,
"grad_norm": 1.0674771070480347,
"learning_rate": 8.300037738880029e-05,
"loss": 1.3129,
"step": 1810
},
{
"epoch": 0.8138557255759207,
"grad_norm": 1.0436745882034302,
"learning_rate": 8.291208958162317e-05,
"loss": 1.3547,
"step": 1815
},
{
"epoch": 0.8160977523681407,
"grad_norm": 1.097528100013733,
"learning_rate": 8.282362033037758e-05,
"loss": 1.3301,
"step": 1820
},
{
"epoch": 0.8183397791603609,
"grad_norm": 1.0497652292251587,
"learning_rate": 8.273497012279371e-05,
"loss": 1.2466,
"step": 1825
},
{
"epoch": 0.8205818059525811,
"grad_norm": 1.011123776435852,
"learning_rate": 8.264613944759943e-05,
"loss": 1.3085,
"step": 1830
},
{
"epoch": 0.8228238327448013,
"grad_norm": 1.0443741083145142,
"learning_rate": 8.255712879451747e-05,
"loss": 1.281,
"step": 1835
},
{
"epoch": 0.8250658595370215,
"grad_norm": 1.2140185832977295,
"learning_rate": 8.246793865426279e-05,
"loss": 1.3612,
"step": 1840
},
{
"epoch": 0.8273078863292417,
"grad_norm": 1.128836989402771,
"learning_rate": 8.237856951853989e-05,
"loss": 1.322,
"step": 1845
},
{
"epoch": 0.8295499131214618,
"grad_norm": 1.0461573600769043,
"learning_rate": 8.228902188004004e-05,
"loss": 1.2147,
"step": 1850
},
{
"epoch": 0.8317919399136819,
"grad_norm": 1.025303602218628,
"learning_rate": 8.219929623243862e-05,
"loss": 1.3644,
"step": 1855
},
{
"epoch": 0.8340339667059021,
"grad_norm": 1.1245356798171997,
"learning_rate": 8.210939307039234e-05,
"loss": 1.2791,
"step": 1860
},
{
"epoch": 0.8362759934981223,
"grad_norm": 1.0641727447509766,
"learning_rate": 8.201931288953657e-05,
"loss": 1.3585,
"step": 1865
},
{
"epoch": 0.8385180202903425,
"grad_norm": 1.0719192028045654,
"learning_rate": 8.19290561864826e-05,
"loss": 1.3353,
"step": 1870
},
{
"epoch": 0.8407600470825627,
"grad_norm": 1.0135860443115234,
"learning_rate": 8.183862345881483e-05,
"loss": 1.3111,
"step": 1875
},
{
"epoch": 0.8430020738747828,
"grad_norm": 1.0956032276153564,
"learning_rate": 8.174801520508813e-05,
"loss": 1.3599,
"step": 1880
},
{
"epoch": 0.845244100667003,
"grad_norm": 1.2083892822265625,
"learning_rate": 8.165723192482502e-05,
"loss": 1.2641,
"step": 1885
},
{
"epoch": 0.8474861274592231,
"grad_norm": 1.0608189105987549,
"learning_rate": 8.156627411851295e-05,
"loss": 1.3246,
"step": 1890
},
{
"epoch": 0.8497281542514433,
"grad_norm": 1.099736213684082,
"learning_rate": 8.147514228760153e-05,
"loss": 1.294,
"step": 1895
},
{
"epoch": 0.8519701810436635,
"grad_norm": 1.0537753105163574,
"learning_rate": 8.138383693449978e-05,
"loss": 1.3138,
"step": 1900
},
{
"epoch": 0.8542122078358836,
"grad_norm": 1.1678063869476318,
"learning_rate": 8.12923585625733e-05,
"loss": 1.3333,
"step": 1905
},
{
"epoch": 0.8564542346281038,
"grad_norm": 1.0176467895507812,
"learning_rate": 8.120070767614161e-05,
"loss": 1.2475,
"step": 1910
},
{
"epoch": 0.858696261420324,
"grad_norm": 1.0608762502670288,
"learning_rate": 8.110888478047523e-05,
"loss": 1.3255,
"step": 1915
},
{
"epoch": 0.8609382882125441,
"grad_norm": 0.9746761918067932,
"learning_rate": 8.101689038179299e-05,
"loss": 1.2848,
"step": 1920
},
{
"epoch": 0.8631803150047643,
"grad_norm": 1.1493169069290161,
"learning_rate": 8.092472498725927e-05,
"loss": 1.3407,
"step": 1925
},
{
"epoch": 0.8654223417969845,
"grad_norm": 1.04042649269104,
"learning_rate": 8.083238910498108e-05,
"loss": 1.3759,
"step": 1930
},
{
"epoch": 0.8676643685892046,
"grad_norm": 1.1784476041793823,
"learning_rate": 8.073988324400535e-05,
"loss": 1.3276,
"step": 1935
},
{
"epoch": 0.8699063953814248,
"grad_norm": 1.0766850709915161,
"learning_rate": 8.064720791431608e-05,
"loss": 1.4061,
"step": 1940
},
{
"epoch": 0.872148422173645,
"grad_norm": 1.1751985549926758,
"learning_rate": 8.055436362683158e-05,
"loss": 1.3455,
"step": 1945
},
{
"epoch": 0.8743904489658652,
"grad_norm": 1.0339034795761108,
"learning_rate": 8.046135089340164e-05,
"loss": 1.3087,
"step": 1950
},
{
"epoch": 0.8766324757580853,
"grad_norm": 1.1246895790100098,
"learning_rate": 8.036817022680466e-05,
"loss": 1.2804,
"step": 1955
},
{
"epoch": 0.8788745025503055,
"grad_norm": 0.9990755319595337,
"learning_rate": 8.027482214074482e-05,
"loss": 1.3058,
"step": 1960
},
{
"epoch": 0.8811165293425256,
"grad_norm": 1.0636389255523682,
"learning_rate": 8.018130714984933e-05,
"loss": 1.3505,
"step": 1965
},
{
"epoch": 0.8833585561347458,
"grad_norm": 1.2202845811843872,
"learning_rate": 8.008762576966557e-05,
"loss": 1.3404,
"step": 1970
},
{
"epoch": 0.885600582926966,
"grad_norm": 1.0653436183929443,
"learning_rate": 7.999377851665817e-05,
"loss": 1.3974,
"step": 1975
},
{
"epoch": 0.8878426097191862,
"grad_norm": 1.3170489072799683,
"learning_rate": 7.989976590820623e-05,
"loss": 1.314,
"step": 1980
},
{
"epoch": 0.8900846365114063,
"grad_norm": 1.0469021797180176,
"learning_rate": 7.980558846260044e-05,
"loss": 1.3115,
"step": 1985
},
{
"epoch": 0.8923266633036265,
"grad_norm": 1.0016125440597534,
"learning_rate": 7.971124669904029e-05,
"loss": 1.2834,
"step": 1990
},
{
"epoch": 0.8945686900958466,
"grad_norm": 1.1542069911956787,
"learning_rate": 7.961674113763109e-05,
"loss": 1.2743,
"step": 1995
},
{
"epoch": 0.8968107168880668,
"grad_norm": 1.0665364265441895,
"learning_rate": 7.952207229938119e-05,
"loss": 1.3778,
"step": 2000
},
{
"epoch": 0.899052743680287,
"grad_norm": 1.06927490234375,
"learning_rate": 7.942724070619911e-05,
"loss": 1.3158,
"step": 2005
},
{
"epoch": 0.9012947704725072,
"grad_norm": 1.1074497699737549,
"learning_rate": 7.933224688089059e-05,
"loss": 1.3796,
"step": 2010
},
{
"epoch": 0.9035367972647274,
"grad_norm": 0.9936386942863464,
"learning_rate": 7.923709134715577e-05,
"loss": 1.3099,
"step": 2015
},
{
"epoch": 0.9057788240569474,
"grad_norm": 1.0144227743148804,
"learning_rate": 7.914177462958631e-05,
"loss": 1.3097,
"step": 2020
},
{
"epoch": 0.9080208508491676,
"grad_norm": 1.1205965280532837,
"learning_rate": 7.904629725366247e-05,
"loss": 1.3218,
"step": 2025
},
{
"epoch": 0.9102628776413878,
"grad_norm": 1.0762195587158203,
"learning_rate": 7.895065974575017e-05,
"loss": 1.3102,
"step": 2030
},
{
"epoch": 0.912504904433608,
"grad_norm": 1.1134177446365356,
"learning_rate": 7.885486263309823e-05,
"loss": 1.2953,
"step": 2035
},
{
"epoch": 0.9147469312258282,
"grad_norm": 1.171975016593933,
"learning_rate": 7.875890644383525e-05,
"loss": 1.3812,
"step": 2040
},
{
"epoch": 0.9169889580180484,
"grad_norm": 1.035203456878662,
"learning_rate": 7.866279170696693e-05,
"loss": 1.3105,
"step": 2045
},
{
"epoch": 0.9192309848102684,
"grad_norm": 0.9938043355941772,
"learning_rate": 7.856651895237297e-05,
"loss": 1.2807,
"step": 2050
},
{
"epoch": 0.9214730116024886,
"grad_norm": 1.012306571006775,
"learning_rate": 7.847008871080423e-05,
"loss": 1.2452,
"step": 2055
},
{
"epoch": 0.9237150383947088,
"grad_norm": 1.160154938697815,
"learning_rate": 7.837350151387985e-05,
"loss": 1.3946,
"step": 2060
},
{
"epoch": 0.925957065186929,
"grad_norm": 1.1950114965438843,
"learning_rate": 7.827675789408417e-05,
"loss": 1.3793,
"step": 2065
},
{
"epoch": 0.9281990919791492,
"grad_norm": 0.9952568411827087,
"learning_rate": 7.817985838476398e-05,
"loss": 1.3438,
"step": 2070
},
{
"epoch": 0.9304411187713694,
"grad_norm": 0.9820153713226318,
"learning_rate": 7.808280352012544e-05,
"loss": 1.2817,
"step": 2075
},
{
"epoch": 0.9326831455635894,
"grad_norm": 1.062547206878662,
"learning_rate": 7.798559383523116e-05,
"loss": 1.2524,
"step": 2080
},
{
"epoch": 0.9349251723558096,
"grad_norm": 0.9531433582305908,
"learning_rate": 7.788822986599733e-05,
"loss": 1.3326,
"step": 2085
},
{
"epoch": 0.9371671991480298,
"grad_norm": 0.9412059783935547,
"learning_rate": 7.779071214919066e-05,
"loss": 1.3434,
"step": 2090
},
{
"epoch": 0.93940922594025,
"grad_norm": 1.337913990020752,
"learning_rate": 7.769304122242551e-05,
"loss": 1.3211,
"step": 2095
},
{
"epoch": 0.9416512527324702,
"grad_norm": 0.9646030068397522,
"learning_rate": 7.759521762416084e-05,
"loss": 1.2644,
"step": 2100
},
{
"epoch": 0.9438932795246903,
"grad_norm": 1.146712303161621,
"learning_rate": 7.749724189369735e-05,
"loss": 1.3066,
"step": 2105
},
{
"epoch": 0.9461353063169105,
"grad_norm": 0.9840266704559326,
"learning_rate": 7.739911457117437e-05,
"loss": 1.337,
"step": 2110
},
{
"epoch": 0.9483773331091306,
"grad_norm": 1.027145504951477,
"learning_rate": 7.730083619756698e-05,
"loss": 1.3583,
"step": 2115
},
{
"epoch": 0.9506193599013508,
"grad_norm": 0.9447183609008789,
"learning_rate": 7.720240731468306e-05,
"loss": 1.2966,
"step": 2120
},
{
"epoch": 0.952861386693571,
"grad_norm": 0.9172132015228271,
"learning_rate": 7.710382846516017e-05,
"loss": 1.324,
"step": 2125
},
{
"epoch": 0.9551034134857912,
"grad_norm": 1.004164218902588,
"learning_rate": 7.700510019246266e-05,
"loss": 1.3354,
"step": 2130
},
{
"epoch": 0.9573454402780113,
"grad_norm": 1.1161928176879883,
"learning_rate": 7.690622304087865e-05,
"loss": 1.2743,
"step": 2135
},
{
"epoch": 0.9595874670702315,
"grad_norm": 1.2197874784469604,
"learning_rate": 7.680719755551707e-05,
"loss": 1.2924,
"step": 2140
},
{
"epoch": 0.9618294938624516,
"grad_norm": 1.1961028575897217,
"learning_rate": 7.670802428230452e-05,
"loss": 1.3233,
"step": 2145
},
{
"epoch": 0.9640715206546718,
"grad_norm": 1.09461510181427,
"learning_rate": 7.660870376798244e-05,
"loss": 1.3149,
"step": 2150
},
{
"epoch": 0.966313547446892,
"grad_norm": 1.1680549383163452,
"learning_rate": 7.650923656010398e-05,
"loss": 1.3106,
"step": 2155
},
{
"epoch": 0.9685555742391122,
"grad_norm": 1.0645558834075928,
"learning_rate": 7.6409623207031e-05,
"loss": 1.2427,
"step": 2160
},
{
"epoch": 0.9707976010313323,
"grad_norm": 1.3543119430541992,
"learning_rate": 7.630986425793105e-05,
"loss": 1.257,
"step": 2165
},
{
"epoch": 0.9730396278235525,
"grad_norm": 0.9309380650520325,
"learning_rate": 7.620996026277438e-05,
"loss": 1.3291,
"step": 2170
},
{
"epoch": 0.9752816546157727,
"grad_norm": 1.0483700037002563,
"learning_rate": 7.610991177233085e-05,
"loss": 1.3066,
"step": 2175
},
{
"epoch": 0.9775236814079928,
"grad_norm": 1.029461145401001,
"learning_rate": 7.600971933816695e-05,
"loss": 1.3153,
"step": 2180
},
{
"epoch": 0.979765708200213,
"grad_norm": 1.025608777999878,
"learning_rate": 7.590938351264269e-05,
"loss": 1.2595,
"step": 2185
},
{
"epoch": 0.9820077349924332,
"grad_norm": 1.1784470081329346,
"learning_rate": 7.580890484890864e-05,
"loss": 1.3677,
"step": 2190
},
{
"epoch": 0.9842497617846533,
"grad_norm": 1.0288585424423218,
"learning_rate": 7.570828390090279e-05,
"loss": 1.2931,
"step": 2195
},
{
"epoch": 0.9864917885768735,
"grad_norm": 0.9635973572731018,
"learning_rate": 7.560752122334757e-05,
"loss": 1.2542,
"step": 2200
},
{
"epoch": 0.9887338153690937,
"grad_norm": 1.0460883378982544,
"learning_rate": 7.55066173717468e-05,
"loss": 1.2744,
"step": 2205
},
{
"epoch": 0.9909758421613138,
"grad_norm": 1.0957541465759277,
"learning_rate": 7.54055729023825e-05,
"loss": 1.3375,
"step": 2210
},
{
"epoch": 0.993217868953534,
"grad_norm": 1.203940510749817,
"learning_rate": 7.5304388372312e-05,
"loss": 1.363,
"step": 2215
},
{
"epoch": 0.9954598957457541,
"grad_norm": 1.2144309282302856,
"learning_rate": 7.520306433936473e-05,
"loss": 1.3041,
"step": 2220
},
{
"epoch": 0.9977019225379743,
"grad_norm": 1.1100728511810303,
"learning_rate": 7.510160136213921e-05,
"loss": 1.2448,
"step": 2225
},
{
"epoch": 0.9999439493301945,
"grad_norm": 1.0487066507339478,
"learning_rate": 7.500000000000001e-05,
"loss": 1.2796,
"step": 2230
},
{
"epoch": 1.0021859761224146,
"grad_norm": 0.9101243615150452,
"learning_rate": 7.489826081307452e-05,
"loss": 1.2459,
"step": 2235
},
{
"epoch": 1.0044280029146349,
"grad_norm": 0.9735124707221985,
"learning_rate": 7.479638436225003e-05,
"loss": 1.271,
"step": 2240
},
{
"epoch": 1.006670029706855,
"grad_norm": 1.0015895366668701,
"learning_rate": 7.469437120917054e-05,
"loss": 1.331,
"step": 2245
},
{
"epoch": 1.0089120564990752,
"grad_norm": 1.1906746625900269,
"learning_rate": 7.459222191623369e-05,
"loss": 1.2832,
"step": 2250
},
{
"epoch": 1.0111540832912953,
"grad_norm": 1.022809386253357,
"learning_rate": 7.448993704658766e-05,
"loss": 1.2637,
"step": 2255
},
{
"epoch": 1.0133961100835156,
"grad_norm": 1.0506726503372192,
"learning_rate": 7.438751716412807e-05,
"loss": 1.2623,
"step": 2260
},
{
"epoch": 1.0156381368757357,
"grad_norm": 1.114424705505371,
"learning_rate": 7.428496283349483e-05,
"loss": 1.2747,
"step": 2265
},
{
"epoch": 1.0178801636679558,
"grad_norm": 1.1833229064941406,
"learning_rate": 7.418227462006912e-05,
"loss": 1.387,
"step": 2270
},
{
"epoch": 1.020122190460176,
"grad_norm": 1.137563943862915,
"learning_rate": 7.407945308997017e-05,
"loss": 1.3009,
"step": 2275
},
{
"epoch": 1.0223642172523961,
"grad_norm": 1.0473971366882324,
"learning_rate": 7.39764988100522e-05,
"loss": 1.2309,
"step": 2280
},
{
"epoch": 1.0246062440446164,
"grad_norm": 1.0773533582687378,
"learning_rate": 7.387341234790124e-05,
"loss": 1.2865,
"step": 2285
},
{
"epoch": 1.0268482708368365,
"grad_norm": 1.1596111059188843,
"learning_rate": 7.377019427183212e-05,
"loss": 1.3355,
"step": 2290
},
{
"epoch": 1.0290902976290566,
"grad_norm": 1.0251152515411377,
"learning_rate": 7.366684515088521e-05,
"loss": 1.3117,
"step": 2295
},
{
"epoch": 1.0313323244212769,
"grad_norm": 1.039408802986145,
"learning_rate": 7.356336555482332e-05,
"loss": 1.3272,
"step": 2300
},
{
"epoch": 1.033574351213497,
"grad_norm": 0.9818054437637329,
"learning_rate": 7.345975605412855e-05,
"loss": 1.3615,
"step": 2305
},
{
"epoch": 1.0358163780057172,
"grad_norm": 1.0482890605926514,
"learning_rate": 7.335601721999922e-05,
"loss": 1.3027,
"step": 2310
},
{
"epoch": 1.0380584047979373,
"grad_norm": 1.1090137958526611,
"learning_rate": 7.325214962434665e-05,
"loss": 1.2632,
"step": 2315
},
{
"epoch": 1.0403004315901576,
"grad_norm": 1.005988597869873,
"learning_rate": 7.314815383979198e-05,
"loss": 1.2945,
"step": 2320
},
{
"epoch": 1.0425424583823777,
"grad_norm": 1.1132372617721558,
"learning_rate": 7.304403043966309e-05,
"loss": 1.3651,
"step": 2325
},
{
"epoch": 1.0447844851745978,
"grad_norm": 1.154373049736023,
"learning_rate": 7.29397799979914e-05,
"loss": 1.2766,
"step": 2330
},
{
"epoch": 1.047026511966818,
"grad_norm": 1.1040149927139282,
"learning_rate": 7.283540308950867e-05,
"loss": 1.2856,
"step": 2335
},
{
"epoch": 1.0492685387590381,
"grad_norm": 1.1813440322875977,
"learning_rate": 7.273090028964396e-05,
"loss": 1.2265,
"step": 2340
},
{
"epoch": 1.0515105655512584,
"grad_norm": 1.099605679512024,
"learning_rate": 7.262627217452027e-05,
"loss": 1.2973,
"step": 2345
},
{
"epoch": 1.0537525923434785,
"grad_norm": 1.2352324724197388,
"learning_rate": 7.252151932095154e-05,
"loss": 1.2729,
"step": 2350
},
{
"epoch": 1.0559946191356986,
"grad_norm": 1.2545338869094849,
"learning_rate": 7.241664230643931e-05,
"loss": 1.293,
"step": 2355
},
{
"epoch": 1.0582366459279189,
"grad_norm": 1.1900233030319214,
"learning_rate": 7.23116417091697e-05,
"loss": 1.3372,
"step": 2360
},
{
"epoch": 1.060478672720139,
"grad_norm": 1.1750991344451904,
"learning_rate": 7.220651810801009e-05,
"loss": 1.2848,
"step": 2365
},
{
"epoch": 1.0627206995123593,
"grad_norm": 1.1401137113571167,
"learning_rate": 7.210127208250599e-05,
"loss": 1.2853,
"step": 2370
},
{
"epoch": 1.0649627263045793,
"grad_norm": 1.0749046802520752,
"learning_rate": 7.199590421287788e-05,
"loss": 1.3066,
"step": 2375
},
{
"epoch": 1.0672047530967994,
"grad_norm": 1.2057609558105469,
"learning_rate": 7.189041508001786e-05,
"loss": 1.3053,
"step": 2380
},
{
"epoch": 1.0694467798890197,
"grad_norm": 1.0309621095657349,
"learning_rate": 7.178480526548666e-05,
"loss": 1.3314,
"step": 2385
},
{
"epoch": 1.0716888066812398,
"grad_norm": 1.1735321283340454,
"learning_rate": 7.167907535151027e-05,
"loss": 1.2538,
"step": 2390
},
{
"epoch": 1.07393083347346,
"grad_norm": 1.0819196701049805,
"learning_rate": 7.157322592097682e-05,
"loss": 1.3022,
"step": 2395
},
{
"epoch": 1.0761728602656802,
"grad_norm": 1.0149192810058594,
"learning_rate": 7.146725755743329e-05,
"loss": 1.3713,
"step": 2400
},
{
"epoch": 1.0784148870579005,
"grad_norm": 0.8954042196273804,
"learning_rate": 7.136117084508237e-05,
"loss": 1.2962,
"step": 2405
},
{
"epoch": 1.0806569138501205,
"grad_norm": 1.0265322923660278,
"learning_rate": 7.125496636877922e-05,
"loss": 1.3084,
"step": 2410
},
{
"epoch": 1.0828989406423406,
"grad_norm": 1.0515128374099731,
"learning_rate": 7.114864471402818e-05,
"loss": 1.2758,
"step": 2415
},
{
"epoch": 1.085140967434561,
"grad_norm": 1.1725807189941406,
"learning_rate": 7.104220646697962e-05,
"loss": 1.3046,
"step": 2420
},
{
"epoch": 1.087382994226781,
"grad_norm": 1.1021701097488403,
"learning_rate": 7.093565221442672e-05,
"loss": 1.2635,
"step": 2425
},
{
"epoch": 1.0896250210190013,
"grad_norm": 1.1437387466430664,
"learning_rate": 7.082898254380214e-05,
"loss": 1.323,
"step": 2430
},
{
"epoch": 1.0918670478112213,
"grad_norm": 0.998076856136322,
"learning_rate": 7.072219804317488e-05,
"loss": 1.1992,
"step": 2435
},
{
"epoch": 1.0941090746034414,
"grad_norm": 1.0903971195220947,
"learning_rate": 7.061529930124695e-05,
"loss": 1.2515,
"step": 2440
},
{
"epoch": 1.0963511013956617,
"grad_norm": 1.143904209136963,
"learning_rate": 7.050828690735022e-05,
"loss": 1.286,
"step": 2445
},
{
"epoch": 1.0985931281878818,
"grad_norm": 1.1476929187774658,
"learning_rate": 7.040116145144311e-05,
"loss": 1.2324,
"step": 2450
},
{
"epoch": 1.100835154980102,
"grad_norm": 1.0371499061584473,
"learning_rate": 7.029392352410733e-05,
"loss": 1.2511,
"step": 2455
},
{
"epoch": 1.1030771817723222,
"grad_norm": 1.069429636001587,
"learning_rate": 7.018657371654464e-05,
"loss": 1.3456,
"step": 2460
},
{
"epoch": 1.1053192085645422,
"grad_norm": 0.9130118489265442,
"learning_rate": 7.007911262057365e-05,
"loss": 1.3043,
"step": 2465
},
{
"epoch": 1.1075612353567625,
"grad_norm": 1.152266502380371,
"learning_rate": 6.997154082862644e-05,
"loss": 1.2775,
"step": 2470
},
{
"epoch": 1.1098032621489826,
"grad_norm": 1.117077112197876,
"learning_rate": 6.986385893374537e-05,
"loss": 1.315,
"step": 2475
},
{
"epoch": 1.112045288941203,
"grad_norm": 1.4062610864639282,
"learning_rate": 6.975606752957984e-05,
"loss": 1.2661,
"step": 2480
},
{
"epoch": 1.114287315733423,
"grad_norm": 1.1715933084487915,
"learning_rate": 6.96481672103829e-05,
"loss": 1.3384,
"step": 2485
},
{
"epoch": 1.1165293425256433,
"grad_norm": 0.937059760093689,
"learning_rate": 6.95401585710081e-05,
"loss": 1.2838,
"step": 2490
},
{
"epoch": 1.1187713693178634,
"grad_norm": 1.0344353914260864,
"learning_rate": 6.943204220690616e-05,
"loss": 1.2396,
"step": 2495
},
{
"epoch": 1.1210133961100834,
"grad_norm": 1.1878572702407837,
"learning_rate": 6.932381871412167e-05,
"loss": 1.329,
"step": 2500
},
{
"epoch": 1.1232554229023037,
"grad_norm": 1.155254602432251,
"learning_rate": 6.92154886892898e-05,
"loss": 1.2652,
"step": 2505
},
{
"epoch": 1.1254974496945238,
"grad_norm": 1.0120606422424316,
"learning_rate": 6.910705272963307e-05,
"loss": 1.2904,
"step": 2510
},
{
"epoch": 1.127739476486744,
"grad_norm": 1.284738540649414,
"learning_rate": 6.899851143295799e-05,
"loss": 1.236,
"step": 2515
},
{
"epoch": 1.1299815032789642,
"grad_norm": 1.1446951627731323,
"learning_rate": 6.888986539765181e-05,
"loss": 1.3456,
"step": 2520
},
{
"epoch": 1.1322235300711843,
"grad_norm": 1.183556079864502,
"learning_rate": 6.878111522267917e-05,
"loss": 1.3006,
"step": 2525
},
{
"epoch": 1.1344655568634046,
"grad_norm": 1.1089967489242554,
"learning_rate": 6.867226150757888e-05,
"loss": 1.3098,
"step": 2530
},
{
"epoch": 1.1367075836556246,
"grad_norm": 1.1036224365234375,
"learning_rate": 6.856330485246054e-05,
"loss": 1.2543,
"step": 2535
},
{
"epoch": 1.138949610447845,
"grad_norm": 1.2652587890625,
"learning_rate": 6.845424585800123e-05,
"loss": 1.2941,
"step": 2540
},
{
"epoch": 1.141191637240065,
"grad_norm": 1.0114392042160034,
"learning_rate": 6.834508512544228e-05,
"loss": 1.306,
"step": 2545
},
{
"epoch": 1.143433664032285,
"grad_norm": 1.0309230089187622,
"learning_rate": 6.823582325658588e-05,
"loss": 1.2697,
"step": 2550
},
{
"epoch": 1.1456756908245054,
"grad_norm": 1.490627408027649,
"learning_rate": 6.812646085379178e-05,
"loss": 1.2784,
"step": 2555
},
{
"epoch": 1.1479177176167255,
"grad_norm": 1.1522798538208008,
"learning_rate": 6.801699851997393e-05,
"loss": 1.2499,
"step": 2560
},
{
"epoch": 1.1501597444089458,
"grad_norm": 1.0427577495574951,
"learning_rate": 6.790743685859728e-05,
"loss": 1.2711,
"step": 2565
},
{
"epoch": 1.1524017712011658,
"grad_norm": 1.0645527839660645,
"learning_rate": 6.779777647367434e-05,
"loss": 1.2498,
"step": 2570
},
{
"epoch": 1.1546437979933861,
"grad_norm": 1.0170249938964844,
"learning_rate": 6.768801796976183e-05,
"loss": 1.2622,
"step": 2575
},
{
"epoch": 1.1568858247856062,
"grad_norm": 1.1332886219024658,
"learning_rate": 6.75781619519575e-05,
"loss": 1.3146,
"step": 2580
},
{
"epoch": 1.1591278515778263,
"grad_norm": 1.1379398107528687,
"learning_rate": 6.746820902589659e-05,
"loss": 1.2898,
"step": 2585
},
{
"epoch": 1.1613698783700466,
"grad_norm": 1.0116194486618042,
"learning_rate": 6.735815979774866e-05,
"loss": 1.3308,
"step": 2590
},
{
"epoch": 1.1636119051622666,
"grad_norm": 0.9840161204338074,
"learning_rate": 6.724801487421416e-05,
"loss": 1.2739,
"step": 2595
},
{
"epoch": 1.165853931954487,
"grad_norm": 1.3689374923706055,
"learning_rate": 6.713777486252113e-05,
"loss": 1.273,
"step": 2600
},
{
"epoch": 1.168095958746707,
"grad_norm": 1.1147258281707764,
"learning_rate": 6.702744037042179e-05,
"loss": 1.3653,
"step": 2605
},
{
"epoch": 1.170337985538927,
"grad_norm": 1.0359976291656494,
"learning_rate": 6.691701200618925e-05,
"loss": 1.2928,
"step": 2610
},
{
"epoch": 1.1725800123311474,
"grad_norm": 1.0808576345443726,
"learning_rate": 6.680649037861416e-05,
"loss": 1.2834,
"step": 2615
},
{
"epoch": 1.1748220391233675,
"grad_norm": 1.2251567840576172,
"learning_rate": 6.669587609700129e-05,
"loss": 1.206,
"step": 2620
},
{
"epoch": 1.1770640659155878,
"grad_norm": 1.0829846858978271,
"learning_rate": 6.658516977116623e-05,
"loss": 1.2292,
"step": 2625
},
{
"epoch": 1.1793060927078078,
"grad_norm": 1.1904149055480957,
"learning_rate": 6.647437201143201e-05,
"loss": 1.275,
"step": 2630
},
{
"epoch": 1.181548119500028,
"grad_norm": 1.223581314086914,
"learning_rate": 6.636348342862575e-05,
"loss": 1.2954,
"step": 2635
},
{
"epoch": 1.1837901462922482,
"grad_norm": 1.1710941791534424,
"learning_rate": 6.625250463407522e-05,
"loss": 1.2927,
"step": 2640
},
{
"epoch": 1.1860321730844683,
"grad_norm": 1.0496562719345093,
"learning_rate": 6.61414362396056e-05,
"loss": 1.2966,
"step": 2645
},
{
"epoch": 1.1882741998766886,
"grad_norm": 1.0458779335021973,
"learning_rate": 6.603027885753598e-05,
"loss": 1.3081,
"step": 2650
},
{
"epoch": 1.1905162266689087,
"grad_norm": 1.2921910285949707,
"learning_rate": 6.591903310067608e-05,
"loss": 1.2511,
"step": 2655
},
{
"epoch": 1.192758253461129,
"grad_norm": 1.0614089965820312,
"learning_rate": 6.580769958232279e-05,
"loss": 1.2995,
"step": 2660
},
{
"epoch": 1.195000280253349,
"grad_norm": 1.2062052488327026,
"learning_rate": 6.569627891625683e-05,
"loss": 1.3231,
"step": 2665
},
{
"epoch": 1.197242307045569,
"grad_norm": 1.065064549446106,
"learning_rate": 6.558477171673941e-05,
"loss": 1.3189,
"step": 2670
},
{
"epoch": 1.1994843338377894,
"grad_norm": 1.0669735670089722,
"learning_rate": 6.547317859850875e-05,
"loss": 1.3024,
"step": 2675
},
{
"epoch": 1.2017263606300095,
"grad_norm": 1.1397708654403687,
"learning_rate": 6.536150017677675e-05,
"loss": 1.342,
"step": 2680
},
{
"epoch": 1.2039683874222298,
"grad_norm": 1.1043004989624023,
"learning_rate": 6.524973706722562e-05,
"loss": 1.3442,
"step": 2685
},
{
"epoch": 1.2062104142144499,
"grad_norm": 1.1043583154678345,
"learning_rate": 6.513788988600441e-05,
"loss": 1.2344,
"step": 2690
},
{
"epoch": 1.2084524410066702,
"grad_norm": 1.1633187532424927,
"learning_rate": 6.502595924972565e-05,
"loss": 1.3185,
"step": 2695
},
{
"epoch": 1.2106944677988902,
"grad_norm": 1.2432576417922974,
"learning_rate": 6.491394577546204e-05,
"loss": 1.2941,
"step": 2700
},
{
"epoch": 1.2129364945911103,
"grad_norm": 1.0130048990249634,
"learning_rate": 6.480185008074284e-05,
"loss": 1.2495,
"step": 2705
},
{
"epoch": 1.2151785213833306,
"grad_norm": 1.1565743684768677,
"learning_rate": 6.468967278355072e-05,
"loss": 1.2585,
"step": 2710
},
{
"epoch": 1.2174205481755507,
"grad_norm": 0.9963768124580383,
"learning_rate": 6.457741450231812e-05,
"loss": 1.3497,
"step": 2715
},
{
"epoch": 1.2196625749677708,
"grad_norm": 1.1197139024734497,
"learning_rate": 6.446507585592399e-05,
"loss": 1.2958,
"step": 2720
},
{
"epoch": 1.221904601759991,
"grad_norm": 1.1450271606445312,
"learning_rate": 6.435265746369033e-05,
"loss": 1.3259,
"step": 2725
},
{
"epoch": 1.2241466285522111,
"grad_norm": 1.0894269943237305,
"learning_rate": 6.424015994537877e-05,
"loss": 1.272,
"step": 2730
},
{
"epoch": 1.2263886553444314,
"grad_norm": 1.1631505489349365,
"learning_rate": 6.412758392118718e-05,
"loss": 1.3315,
"step": 2735
},
{
"epoch": 1.2286306821366515,
"grad_norm": 1.213643193244934,
"learning_rate": 6.40149300117462e-05,
"loss": 1.3228,
"step": 2740
},
{
"epoch": 1.2308727089288718,
"grad_norm": 1.0162944793701172,
"learning_rate": 6.390219883811591e-05,
"loss": 1.2519,
"step": 2745
},
{
"epoch": 1.2331147357210919,
"grad_norm": 1.1782135963439941,
"learning_rate": 6.378939102178225e-05,
"loss": 1.3281,
"step": 2750
},
{
"epoch": 1.235356762513312,
"grad_norm": 1.062117576599121,
"learning_rate": 6.367650718465379e-05,
"loss": 1.2671,
"step": 2755
},
{
"epoch": 1.2375987893055322,
"grad_norm": 1.3144171237945557,
"learning_rate": 6.356354794905814e-05,
"loss": 1.3392,
"step": 2760
},
{
"epoch": 1.2398408160977523,
"grad_norm": 1.0592882633209229,
"learning_rate": 6.345051393773861e-05,
"loss": 1.2902,
"step": 2765
},
{
"epoch": 1.2420828428899726,
"grad_norm": 1.2294663190841675,
"learning_rate": 6.333740577385074e-05,
"loss": 1.3081,
"step": 2770
},
{
"epoch": 1.2443248696821927,
"grad_norm": 1.0388215780258179,
"learning_rate": 6.322422408095886e-05,
"loss": 1.2917,
"step": 2775
},
{
"epoch": 1.246566896474413,
"grad_norm": 1.094425916671753,
"learning_rate": 6.311096948303264e-05,
"loss": 1.3252,
"step": 2780
},
{
"epoch": 1.248808923266633,
"grad_norm": 1.3590023517608643,
"learning_rate": 6.299764260444378e-05,
"loss": 1.2825,
"step": 2785
},
{
"epoch": 1.2510509500588531,
"grad_norm": 1.1007918119430542,
"learning_rate": 6.288424406996238e-05,
"loss": 1.2437,
"step": 2790
},
{
"epoch": 1.2532929768510734,
"grad_norm": 1.2783552408218384,
"learning_rate": 6.277077450475354e-05,
"loss": 1.3539,
"step": 2795
},
{
"epoch": 1.2555350036432935,
"grad_norm": 1.2107961177825928,
"learning_rate": 6.265723453437404e-05,
"loss": 1.3215,
"step": 2800
},
{
"epoch": 1.2577770304355136,
"grad_norm": 1.0384870767593384,
"learning_rate": 6.254362478476878e-05,
"loss": 1.2514,
"step": 2805
},
{
"epoch": 1.2600190572277339,
"grad_norm": 1.3173192739486694,
"learning_rate": 6.242994588226731e-05,
"loss": 1.3129,
"step": 2810
},
{
"epoch": 1.262261084019954,
"grad_norm": 0.9206444621086121,
"learning_rate": 6.231619845358045e-05,
"loss": 1.3108,
"step": 2815
},
{
"epoch": 1.2645031108121743,
"grad_norm": 1.13257896900177,
"learning_rate": 6.220238312579682e-05,
"loss": 1.286,
"step": 2820
},
{
"epoch": 1.2667451376043943,
"grad_norm": 1.3280197381973267,
"learning_rate": 6.208850052637933e-05,
"loss": 1.2462,
"step": 2825
},
{
"epoch": 1.2689871643966146,
"grad_norm": 1.2632642984390259,
"learning_rate": 6.197455128316178e-05,
"loss": 1.2761,
"step": 2830
},
{
"epoch": 1.2712291911888347,
"grad_norm": 1.0141961574554443,
"learning_rate": 6.186053602434539e-05,
"loss": 1.2421,
"step": 2835
},
{
"epoch": 1.2734712179810548,
"grad_norm": 1.2043393850326538,
"learning_rate": 6.174645537849529e-05,
"loss": 1.333,
"step": 2840
},
{
"epoch": 1.275713244773275,
"grad_norm": 1.1990864276885986,
"learning_rate": 6.163230997453712e-05,
"loss": 1.3188,
"step": 2845
},
{
"epoch": 1.2779552715654952,
"grad_norm": 1.0753861665725708,
"learning_rate": 6.15181004417535e-05,
"loss": 1.3231,
"step": 2850
},
{
"epoch": 1.2801972983577155,
"grad_norm": 1.089961290359497,
"learning_rate": 6.140382740978062e-05,
"loss": 1.258,
"step": 2855
},
{
"epoch": 1.2824393251499355,
"grad_norm": 1.217774510383606,
"learning_rate": 6.12894915086047e-05,
"loss": 1.2642,
"step": 2860
},
{
"epoch": 1.2846813519421558,
"grad_norm": 1.180626630783081,
"learning_rate": 6.117509336855865e-05,
"loss": 1.2759,
"step": 2865
},
{
"epoch": 1.286923378734376,
"grad_norm": 1.2082866430282593,
"learning_rate": 6.106063362031838e-05,
"loss": 1.3255,
"step": 2870
},
{
"epoch": 1.289165405526596,
"grad_norm": 1.1015843152999878,
"learning_rate": 6.094611289489951e-05,
"loss": 1.3282,
"step": 2875
},
{
"epoch": 1.2914074323188163,
"grad_norm": 1.1207735538482666,
"learning_rate": 6.083153182365383e-05,
"loss": 1.2982,
"step": 2880
},
{
"epoch": 1.2936494591110363,
"grad_norm": 1.1439082622528076,
"learning_rate": 6.071689103826582e-05,
"loss": 1.3463,
"step": 2885
},
{
"epoch": 1.2958914859032564,
"grad_norm": 1.1893078088760376,
"learning_rate": 6.060219117074913e-05,
"loss": 1.2573,
"step": 2890
},
{
"epoch": 1.2981335126954767,
"grad_norm": 1.2720766067504883,
"learning_rate": 6.048743285344317e-05,
"loss": 1.3029,
"step": 2895
},
{
"epoch": 1.3003755394876968,
"grad_norm": 1.0983214378356934,
"learning_rate": 6.037261671900953e-05,
"loss": 1.2845,
"step": 2900
},
{
"epoch": 1.302617566279917,
"grad_norm": 1.1721152067184448,
"learning_rate": 6.02577434004286e-05,
"loss": 1.3025,
"step": 2905
},
{
"epoch": 1.3048595930721372,
"grad_norm": 1.1017165184020996,
"learning_rate": 6.0142813530996e-05,
"loss": 1.3166,
"step": 2910
},
{
"epoch": 1.3071016198643575,
"grad_norm": 1.1608681678771973,
"learning_rate": 6.002782774431911e-05,
"loss": 1.259,
"step": 2915
},
{
"epoch": 1.3093436466565775,
"grad_norm": 1.249861478805542,
"learning_rate": 5.9912786674313614e-05,
"loss": 1.2469,
"step": 2920
},
{
"epoch": 1.3115856734487976,
"grad_norm": 1.2426470518112183,
"learning_rate": 5.9797690955199926e-05,
"loss": 1.2541,
"step": 2925
},
{
"epoch": 1.313827700241018,
"grad_norm": 1.0516715049743652,
"learning_rate": 5.968254122149974e-05,
"loss": 1.277,
"step": 2930
},
{
"epoch": 1.316069727033238,
"grad_norm": 1.4431426525115967,
"learning_rate": 5.95673381080326e-05,
"loss": 1.3182,
"step": 2935
},
{
"epoch": 1.3183117538254583,
"grad_norm": 1.2504661083221436,
"learning_rate": 5.945208224991226e-05,
"loss": 1.3503,
"step": 2940
},
{
"epoch": 1.3205537806176784,
"grad_norm": 1.1429800987243652,
"learning_rate": 5.933677428254328e-05,
"loss": 1.2767,
"step": 2945
},
{
"epoch": 1.3227958074098987,
"grad_norm": 1.2553044557571411,
"learning_rate": 5.922141484161751e-05,
"loss": 1.2817,
"step": 2950
},
{
"epoch": 1.3250378342021187,
"grad_norm": 1.2155333757400513,
"learning_rate": 5.910600456311055e-05,
"loss": 1.3347,
"step": 2955
},
{
"epoch": 1.3272798609943388,
"grad_norm": 1.2551952600479126,
"learning_rate": 5.8990544083278285e-05,
"loss": 1.2119,
"step": 2960
},
{
"epoch": 1.329521887786559,
"grad_norm": 1.1889550685882568,
"learning_rate": 5.887503403865333e-05,
"loss": 1.3307,
"step": 2965
},
{
"epoch": 1.3317639145787792,
"grad_norm": 1.1134368181228638,
"learning_rate": 5.8759475066041624e-05,
"loss": 1.3094,
"step": 2970
},
{
"epoch": 1.3340059413709993,
"grad_norm": 1.2652761936187744,
"learning_rate": 5.8643867802518756e-05,
"loss": 1.3296,
"step": 2975
},
{
"epoch": 1.3362479681632196,
"grad_norm": 1.6688954830169678,
"learning_rate": 5.852821288542658e-05,
"loss": 1.3148,
"step": 2980
},
{
"epoch": 1.3384899949554399,
"grad_norm": 0.9661517143249512,
"learning_rate": 5.841251095236969e-05,
"loss": 1.3197,
"step": 2985
},
{
"epoch": 1.34073202174766,
"grad_norm": 1.1682339906692505,
"learning_rate": 5.829676264121183e-05,
"loss": 1.3328,
"step": 2990
},
{
"epoch": 1.34297404853988,
"grad_norm": 1.109320044517517,
"learning_rate": 5.818096859007247e-05,
"loss": 1.2575,
"step": 2995
},
{
"epoch": 1.3452160753321003,
"grad_norm": 1.2029309272766113,
"learning_rate": 5.8065129437323206e-05,
"loss": 1.3296,
"step": 3000
},
{
"epoch": 1.3474581021243204,
"grad_norm": 1.104525089263916,
"learning_rate": 5.794924582158431e-05,
"loss": 1.2558,
"step": 3005
},
{
"epoch": 1.3497001289165405,
"grad_norm": 1.1124447584152222,
"learning_rate": 5.783331838172116e-05,
"loss": 1.3036,
"step": 3010
},
{
"epoch": 1.3519421557087608,
"grad_norm": 1.1220247745513916,
"learning_rate": 5.771734775684072e-05,
"loss": 1.3161,
"step": 3015
},
{
"epoch": 1.3541841825009808,
"grad_norm": 1.143099069595337,
"learning_rate": 5.760133458628809e-05,
"loss": 1.3066,
"step": 3020
},
{
"epoch": 1.3564262092932011,
"grad_norm": 0.9693493247032166,
"learning_rate": 5.7485279509642885e-05,
"loss": 1.3089,
"step": 3025
},
{
"epoch": 1.3586682360854212,
"grad_norm": 1.0467145442962646,
"learning_rate": 5.736918316671572e-05,
"loss": 1.2631,
"step": 3030
},
{
"epoch": 1.3609102628776415,
"grad_norm": 1.1905845403671265,
"learning_rate": 5.7253046197544754e-05,
"loss": 1.2759,
"step": 3035
},
{
"epoch": 1.3631522896698616,
"grad_norm": 1.0732934474945068,
"learning_rate": 5.713686924239211e-05,
"loss": 1.296,
"step": 3040
},
{
"epoch": 1.3653943164620816,
"grad_norm": 1.2555313110351562,
"learning_rate": 5.702065294174036e-05,
"loss": 1.2306,
"step": 3045
},
{
"epoch": 1.367636343254302,
"grad_norm": 1.033304214477539,
"learning_rate": 5.690439793628896e-05,
"loss": 1.3072,
"step": 3050
},
{
"epoch": 1.369878370046522,
"grad_norm": 1.058167576789856,
"learning_rate": 5.6788104866950754e-05,
"loss": 1.3995,
"step": 3055
},
{
"epoch": 1.372120396838742,
"grad_norm": 1.0705965757369995,
"learning_rate": 5.667177437484845e-05,
"loss": 1.3035,
"step": 3060
},
{
"epoch": 1.3743624236309624,
"grad_norm": 1.052674651145935,
"learning_rate": 5.655540710131105e-05,
"loss": 1.3247,
"step": 3065
},
{
"epoch": 1.3766044504231827,
"grad_norm": 1.2467668056488037,
"learning_rate": 5.643900368787036e-05,
"loss": 1.3106,
"step": 3070
},
{
"epoch": 1.3788464772154028,
"grad_norm": 1.1554597616195679,
"learning_rate": 5.632256477625739e-05,
"loss": 1.2686,
"step": 3075
},
{
"epoch": 1.3810885040076228,
"grad_norm": 1.0708049535751343,
"learning_rate": 5.6206091008398866e-05,
"loss": 1.2774,
"step": 3080
},
{
"epoch": 1.3833305307998431,
"grad_norm": 1.1895546913146973,
"learning_rate": 5.608958302641364e-05,
"loss": 1.1813,
"step": 3085
},
{
"epoch": 1.3855725575920632,
"grad_norm": 1.4244434833526611,
"learning_rate": 5.597304147260927e-05,
"loss": 1.3678,
"step": 3090
},
{
"epoch": 1.3878145843842833,
"grad_norm": 1.2443078756332397,
"learning_rate": 5.5856466989478325e-05,
"loss": 1.2248,
"step": 3095
},
{
"epoch": 1.3900566111765036,
"grad_norm": 1.0258877277374268,
"learning_rate": 5.573986021969494e-05,
"loss": 1.2725,
"step": 3100
},
{
"epoch": 1.3922986379687237,
"grad_norm": 1.0962164402008057,
"learning_rate": 5.5623221806111224e-05,
"loss": 1.2393,
"step": 3105
},
{
"epoch": 1.394540664760944,
"grad_norm": 1.3782082796096802,
"learning_rate": 5.550655239175377e-05,
"loss": 1.2817,
"step": 3110
},
{
"epoch": 1.396782691553164,
"grad_norm": 1.2269506454467773,
"learning_rate": 5.538985261982006e-05,
"loss": 1.2376,
"step": 3115
},
{
"epoch": 1.3990247183453843,
"grad_norm": 1.2134568691253662,
"learning_rate": 5.527312313367492e-05,
"loss": 1.2925,
"step": 3120
},
{
"epoch": 1.4012667451376044,
"grad_norm": 1.275994896888733,
"learning_rate": 5.515636457684705e-05,
"loss": 1.351,
"step": 3125
},
{
"epoch": 1.4035087719298245,
"grad_norm": 1.0931142568588257,
"learning_rate": 5.5039577593025335e-05,
"loss": 1.3186,
"step": 3130
},
{
"epoch": 1.4057507987220448,
"grad_norm": 1.0260752439498901,
"learning_rate": 5.492276282605544e-05,
"loss": 1.2835,
"step": 3135
},
{
"epoch": 1.4079928255142649,
"grad_norm": 1.2498077154159546,
"learning_rate": 5.480592091993616e-05,
"loss": 1.3022,
"step": 3140
},
{
"epoch": 1.410234852306485,
"grad_norm": 1.0274704694747925,
"learning_rate": 5.4689052518815954e-05,
"loss": 1.2354,
"step": 3145
},
{
"epoch": 1.4124768790987052,
"grad_norm": 1.3377097845077515,
"learning_rate": 5.457215826698928e-05,
"loss": 1.3043,
"step": 3150
},
{
"epoch": 1.4147189058909255,
"grad_norm": 1.2201504707336426,
"learning_rate": 5.4455238808893185e-05,
"loss": 1.36,
"step": 3155
},
{
"epoch": 1.4169609326831456,
"grad_norm": 1.1458607912063599,
"learning_rate": 5.433829478910362e-05,
"loss": 1.285,
"step": 3160
},
{
"epoch": 1.4192029594753657,
"grad_norm": 1.1673274040222168,
"learning_rate": 5.4221326852331965e-05,
"loss": 1.3474,
"step": 3165
},
{
"epoch": 1.421444986267586,
"grad_norm": 1.141150951385498,
"learning_rate": 5.410433564342146e-05,
"loss": 1.3101,
"step": 3170
},
{
"epoch": 1.423687013059806,
"grad_norm": 1.18087899684906,
"learning_rate": 5.398732180734365e-05,
"loss": 1.3324,
"step": 3175
},
{
"epoch": 1.4259290398520261,
"grad_norm": 1.1001255512237549,
"learning_rate": 5.3870285989194814e-05,
"loss": 1.3046,
"step": 3180
},
{
"epoch": 1.4281710666442464,
"grad_norm": 1.2380887269973755,
"learning_rate": 5.3753228834192384e-05,
"loss": 1.283,
"step": 3185
},
{
"epoch": 1.4304130934364665,
"grad_norm": 1.2417025566101074,
"learning_rate": 5.3636150987671496e-05,
"loss": 1.2536,
"step": 3190
},
{
"epoch": 1.4326551202286868,
"grad_norm": 1.2791988849639893,
"learning_rate": 5.35190530950813e-05,
"loss": 1.314,
"step": 3195
},
{
"epoch": 1.4348971470209069,
"grad_norm": 1.0879089832305908,
"learning_rate": 5.3401935801981464e-05,
"loss": 1.2726,
"step": 3200
},
{
"epoch": 1.4371391738131272,
"grad_norm": 1.1599972248077393,
"learning_rate": 5.328479975403864e-05,
"loss": 1.3082,
"step": 3205
},
{
"epoch": 1.4393812006053472,
"grad_norm": 1.1873195171356201,
"learning_rate": 5.316764559702285e-05,
"loss": 1.2853,
"step": 3210
},
{
"epoch": 1.4416232273975673,
"grad_norm": 1.049641489982605,
"learning_rate": 5.3050473976803974e-05,
"loss": 1.3048,
"step": 3215
},
{
"epoch": 1.4438652541897876,
"grad_norm": 1.0594843626022339,
"learning_rate": 5.293328553934813e-05,
"loss": 1.2845,
"step": 3220
},
{
"epoch": 1.4461072809820077,
"grad_norm": 1.007035732269287,
"learning_rate": 5.2816080930714194e-05,
"loss": 1.3099,
"step": 3225
},
{
"epoch": 1.4483493077742278,
"grad_norm": 1.0724034309387207,
"learning_rate": 5.269886079705018e-05,
"loss": 1.28,
"step": 3230
},
{
"epoch": 1.450591334566448,
"grad_norm": 1.023113489151001,
"learning_rate": 5.258162578458963e-05,
"loss": 1.3397,
"step": 3235
},
{
"epoch": 1.4528333613586684,
"grad_norm": 1.005807638168335,
"learning_rate": 5.246437653964822e-05,
"loss": 1.2121,
"step": 3240
},
{
"epoch": 1.4550753881508884,
"grad_norm": 1.0102343559265137,
"learning_rate": 5.234711370862001e-05,
"loss": 1.276,
"step": 3245
},
{
"epoch": 1.4573174149431085,
"grad_norm": 1.173030138015747,
"learning_rate": 5.2229837937974e-05,
"loss": 1.3212,
"step": 3250
},
{
"epoch": 1.4595594417353288,
"grad_norm": 1.0489596128463745,
"learning_rate": 5.2112549874250495e-05,
"loss": 1.32,
"step": 3255
},
{
"epoch": 1.4618014685275489,
"grad_norm": 1.084416389465332,
"learning_rate": 5.199525016405759e-05,
"loss": 1.2529,
"step": 3260
},
{
"epoch": 1.464043495319769,
"grad_norm": 1.0936429500579834,
"learning_rate": 5.187793945406759e-05,
"loss": 1.241,
"step": 3265
},
{
"epoch": 1.4662855221119893,
"grad_norm": 1.199352502822876,
"learning_rate": 5.1760618391013424e-05,
"loss": 1.2246,
"step": 3270
},
{
"epoch": 1.4685275489042093,
"grad_norm": 1.1133605241775513,
"learning_rate": 5.164328762168514e-05,
"loss": 1.3192,
"step": 3275
},
{
"epoch": 1.4707695756964296,
"grad_norm": 1.165466070175171,
"learning_rate": 5.152594779292624e-05,
"loss": 1.3289,
"step": 3280
},
{
"epoch": 1.4730116024886497,
"grad_norm": 1.1635582447052002,
"learning_rate": 5.140859955163021e-05,
"loss": 1.2729,
"step": 3285
},
{
"epoch": 1.47525362928087,
"grad_norm": 1.1590099334716797,
"learning_rate": 5.1291243544736875e-05,
"loss": 1.3111,
"step": 3290
},
{
"epoch": 1.47749565607309,
"grad_norm": 1.157904863357544,
"learning_rate": 5.1173880419228935e-05,
"loss": 1.2491,
"step": 3295
},
{
"epoch": 1.4797376828653102,
"grad_norm": 1.103690266609192,
"learning_rate": 5.105651082212828e-05,
"loss": 1.2776,
"step": 3300
},
{
"epoch": 1.4819797096575305,
"grad_norm": 1.020355463027954,
"learning_rate": 5.093913540049249e-05,
"loss": 1.1967,
"step": 3305
},
{
"epoch": 1.4842217364497505,
"grad_norm": 1.2379658222198486,
"learning_rate": 5.082175480141126e-05,
"loss": 1.2427,
"step": 3310
},
{
"epoch": 1.4864637632419706,
"grad_norm": 1.072657585144043,
"learning_rate": 5.0704369672002835e-05,
"loss": 1.325,
"step": 3315
},
{
"epoch": 1.488705790034191,
"grad_norm": 1.2614028453826904,
"learning_rate": 5.0586980659410434e-05,
"loss": 1.3126,
"step": 3320
},
{
"epoch": 1.4909478168264112,
"grad_norm": 1.1547425985336304,
"learning_rate": 5.0469588410798676e-05,
"loss": 1.2616,
"step": 3325
},
{
"epoch": 1.4931898436186313,
"grad_norm": 1.2182773351669312,
"learning_rate": 5.035219357335001e-05,
"loss": 1.2992,
"step": 3330
},
{
"epoch": 1.4954318704108513,
"grad_norm": 1.2161564826965332,
"learning_rate": 5.023479679426122e-05,
"loss": 1.2788,
"step": 3335
},
{
"epoch": 1.4976738972030716,
"grad_norm": 1.122253656387329,
"learning_rate": 5.011739872073968e-05,
"loss": 1.3072,
"step": 3340
},
{
"epoch": 1.4999159239952917,
"grad_norm": 0.9858971834182739,
"learning_rate": 5e-05,
"loss": 1.2049,
"step": 3345
},
{
"epoch": 1.5021579507875118,
"grad_norm": 1.0259901285171509,
"learning_rate": 4.9882601279260324e-05,
"loss": 1.3184,
"step": 3350
},
{
"epoch": 1.504399977579732,
"grad_norm": 1.0712144374847412,
"learning_rate": 4.9765203205738805e-05,
"loss": 1.2826,
"step": 3355
},
{
"epoch": 1.5066420043719524,
"grad_norm": 1.0313420295715332,
"learning_rate": 4.964780642664999e-05,
"loss": 1.3633,
"step": 3360
},
{
"epoch": 1.5088840311641722,
"grad_norm": 1.1968498229980469,
"learning_rate": 4.953041158920133e-05,
"loss": 1.2564,
"step": 3365
},
{
"epoch": 1.5111260579563925,
"grad_norm": 1.0766561031341553,
"learning_rate": 4.9413019340589585e-05,
"loss": 1.2387,
"step": 3370
},
{
"epoch": 1.5133680847486128,
"grad_norm": 1.2741787433624268,
"learning_rate": 4.929563032799717e-05,
"loss": 1.2113,
"step": 3375
},
{
"epoch": 1.515610111540833,
"grad_norm": 1.1092220544815063,
"learning_rate": 4.917824519858875e-05,
"loss": 1.2782,
"step": 3380
},
{
"epoch": 1.517852138333053,
"grad_norm": 1.1351913213729858,
"learning_rate": 4.906086459950753e-05,
"loss": 1.2667,
"step": 3385
},
{
"epoch": 1.5200941651252733,
"grad_norm": 1.1670454740524292,
"learning_rate": 4.8943489177871735e-05,
"loss": 1.2764,
"step": 3390
},
{
"epoch": 1.5223361919174934,
"grad_norm": 1.1347793340682983,
"learning_rate": 4.882611958077108e-05,
"loss": 1.3095,
"step": 3395
},
{
"epoch": 1.5245782187097134,
"grad_norm": 1.0640754699707031,
"learning_rate": 4.870875645526313e-05,
"loss": 1.2696,
"step": 3400
},
{
"epoch": 1.5268202455019337,
"grad_norm": 1.1215641498565674,
"learning_rate": 4.859140044836979e-05,
"loss": 1.2618,
"step": 3405
},
{
"epoch": 1.529062272294154,
"grad_norm": 0.9714592695236206,
"learning_rate": 4.847405220707377e-05,
"loss": 1.3044,
"step": 3410
},
{
"epoch": 1.531304299086374,
"grad_norm": 1.055709719657898,
"learning_rate": 4.8356712378314876e-05,
"loss": 1.3893,
"step": 3415
},
{
"epoch": 1.5335463258785942,
"grad_norm": 1.0931789875030518,
"learning_rate": 4.823938160898657e-05,
"loss": 1.3075,
"step": 3420
},
{
"epoch": 1.5357883526708145,
"grad_norm": 1.0338480472564697,
"learning_rate": 4.812206054593242e-05,
"loss": 1.334,
"step": 3425
},
{
"epoch": 1.5380303794630346,
"grad_norm": 1.1629575490951538,
"learning_rate": 4.800474983594242e-05,
"loss": 1.2991,
"step": 3430
},
{
"epoch": 1.5402724062552546,
"grad_norm": 0.9702677726745605,
"learning_rate": 4.788745012574952e-05,
"loss": 1.2372,
"step": 3435
},
{
"epoch": 1.542514433047475,
"grad_norm": 1.1541732549667358,
"learning_rate": 4.777016206202602e-05,
"loss": 1.3549,
"step": 3440
},
{
"epoch": 1.5447564598396952,
"grad_norm": 1.108521580696106,
"learning_rate": 4.765288629137999e-05,
"loss": 1.2351,
"step": 3445
},
{
"epoch": 1.5469984866319153,
"grad_norm": 1.2135175466537476,
"learning_rate": 4.753562346035178e-05,
"loss": 1.2808,
"step": 3450
},
{
"epoch": 1.5492405134241354,
"grad_norm": 1.0196810960769653,
"learning_rate": 4.7418374215410374e-05,
"loss": 1.2797,
"step": 3455
},
{
"epoch": 1.5514825402163557,
"grad_norm": 1.1233173608779907,
"learning_rate": 4.730113920294983e-05,
"loss": 1.2932,
"step": 3460
},
{
"epoch": 1.5537245670085758,
"grad_norm": 1.0524299144744873,
"learning_rate": 4.7183919069285804e-05,
"loss": 1.2907,
"step": 3465
},
{
"epoch": 1.5559665938007958,
"grad_norm": 1.174949288368225,
"learning_rate": 4.706671446065188e-05,
"loss": 1.2596,
"step": 3470
},
{
"epoch": 1.5582086205930161,
"grad_norm": 1.152764081954956,
"learning_rate": 4.694952602319603e-05,
"loss": 1.3416,
"step": 3475
},
{
"epoch": 1.5604506473852362,
"grad_norm": 1.1165378093719482,
"learning_rate": 4.683235440297717e-05,
"loss": 1.2781,
"step": 3480
},
{
"epoch": 1.5626926741774563,
"grad_norm": 1.0606844425201416,
"learning_rate": 4.671520024596137e-05,
"loss": 1.3009,
"step": 3485
},
{
"epoch": 1.5649347009696766,
"grad_norm": 1.023228645324707,
"learning_rate": 4.659806419801855e-05,
"loss": 1.3042,
"step": 3490
},
{
"epoch": 1.5671767277618969,
"grad_norm": 1.2059510946273804,
"learning_rate": 4.6480946904918735e-05,
"loss": 1.2997,
"step": 3495
},
{
"epoch": 1.569418754554117,
"grad_norm": 1.0934103727340698,
"learning_rate": 4.636384901232852e-05,
"loss": 1.2771,
"step": 3500
},
{
"epoch": 1.571660781346337,
"grad_norm": 1.032578945159912,
"learning_rate": 4.6246771165807614e-05,
"loss": 1.2553,
"step": 3505
},
{
"epoch": 1.5739028081385573,
"grad_norm": 1.055982232093811,
"learning_rate": 4.612971401080521e-05,
"loss": 1.2673,
"step": 3510
},
{
"epoch": 1.5761448349307774,
"grad_norm": 1.00336754322052,
"learning_rate": 4.6012678192656364e-05,
"loss": 1.2102,
"step": 3515
},
{
"epoch": 1.5783868617229975,
"grad_norm": 1.0931719541549683,
"learning_rate": 4.589566435657854e-05,
"loss": 1.242,
"step": 3520
},
{
"epoch": 1.5806288885152178,
"grad_norm": 1.1765341758728027,
"learning_rate": 4.5778673147668053e-05,
"loss": 1.2747,
"step": 3525
},
{
"epoch": 1.582870915307438,
"grad_norm": 1.2692338228225708,
"learning_rate": 4.5661705210896395e-05,
"loss": 1.3241,
"step": 3530
},
{
"epoch": 1.5851129420996581,
"grad_norm": 1.2092036008834839,
"learning_rate": 4.5544761191106826e-05,
"loss": 1.271,
"step": 3535
},
{
"epoch": 1.5873549688918782,
"grad_norm": 1.2053848505020142,
"learning_rate": 4.542784173301072e-05,
"loss": 1.2828,
"step": 3540
},
{
"epoch": 1.5895969956840985,
"grad_norm": 1.1756088733673096,
"learning_rate": 4.5310947481184064e-05,
"loss": 1.2556,
"step": 3545
},
{
"epoch": 1.5918390224763186,
"grad_norm": 1.1956021785736084,
"learning_rate": 4.5194079080063835e-05,
"loss": 1.2561,
"step": 3550
},
{
"epoch": 1.5940810492685387,
"grad_norm": 1.0988577604293823,
"learning_rate": 4.5077237173944576e-05,
"loss": 1.4031,
"step": 3555
},
{
"epoch": 1.596323076060759,
"grad_norm": 1.1947277784347534,
"learning_rate": 4.496042240697467e-05,
"loss": 1.2634,
"step": 3560
},
{
"epoch": 1.5985651028529793,
"grad_norm": 1.0463786125183105,
"learning_rate": 4.484363542315297e-05,
"loss": 1.2856,
"step": 3565
},
{
"epoch": 1.6008071296451991,
"grad_norm": 1.0513739585876465,
"learning_rate": 4.4726876866325086e-05,
"loss": 1.2232,
"step": 3570
},
{
"epoch": 1.6030491564374194,
"grad_norm": 1.1749991178512573,
"learning_rate": 4.461014738017995e-05,
"loss": 1.3407,
"step": 3575
},
{
"epoch": 1.6052911832296397,
"grad_norm": 1.0796834230422974,
"learning_rate": 4.4493447608246253e-05,
"loss": 1.2917,
"step": 3580
},
{
"epoch": 1.6075332100218598,
"grad_norm": 1.0776811838150024,
"learning_rate": 4.437677819388879e-05,
"loss": 1.3028,
"step": 3585
},
{
"epoch": 1.6097752368140799,
"grad_norm": 1.1453006267547607,
"learning_rate": 4.4260139780305074e-05,
"loss": 1.2752,
"step": 3590
},
{
"epoch": 1.6120172636063002,
"grad_norm": 0.9714769124984741,
"learning_rate": 4.4143533010521686e-05,
"loss": 1.2274,
"step": 3595
},
{
"epoch": 1.6142592903985202,
"grad_norm": 1.298377513885498,
"learning_rate": 4.4026958527390735e-05,
"loss": 1.2982,
"step": 3600
},
{
"epoch": 1.6165013171907403,
"grad_norm": 1.1102139949798584,
"learning_rate": 4.391041697358636e-05,
"loss": 1.3122,
"step": 3605
},
{
"epoch": 1.6187433439829606,
"grad_norm": 1.0720750093460083,
"learning_rate": 4.3793908991601166e-05,
"loss": 1.3212,
"step": 3610
},
{
"epoch": 1.620985370775181,
"grad_norm": 1.0826951265335083,
"learning_rate": 4.367743522374261e-05,
"loss": 1.2706,
"step": 3615
},
{
"epoch": 1.623227397567401,
"grad_norm": 0.9729198217391968,
"learning_rate": 4.3560996312129636e-05,
"loss": 1.3026,
"step": 3620
},
{
"epoch": 1.625469424359621,
"grad_norm": 1.081446647644043,
"learning_rate": 4.344459289868895e-05,
"loss": 1.2997,
"step": 3625
},
{
"epoch": 1.6277114511518413,
"grad_norm": 1.1220719814300537,
"learning_rate": 4.3328225625151553e-05,
"loss": 1.2356,
"step": 3630
},
{
"epoch": 1.6299534779440614,
"grad_norm": 1.0425808429718018,
"learning_rate": 4.3211895133049244e-05,
"loss": 1.2756,
"step": 3635
},
{
"epoch": 1.6321955047362815,
"grad_norm": 1.0694538354873657,
"learning_rate": 4.309560206371106e-05,
"loss": 1.316,
"step": 3640
},
{
"epoch": 1.6344375315285018,
"grad_norm": 1.0898274183273315,
"learning_rate": 4.297934705825966e-05,
"loss": 1.3316,
"step": 3645
},
{
"epoch": 1.636679558320722,
"grad_norm": 1.176999807357788,
"learning_rate": 4.2863130757607906e-05,
"loss": 1.2538,
"step": 3650
},
{
"epoch": 1.638921585112942,
"grad_norm": 1.2387757301330566,
"learning_rate": 4.274695380245526e-05,
"loss": 1.3211,
"step": 3655
},
{
"epoch": 1.6411636119051622,
"grad_norm": 1.3879566192626953,
"learning_rate": 4.263081683328429e-05,
"loss": 1.2902,
"step": 3660
},
{
"epoch": 1.6434056386973825,
"grad_norm": 1.071897268295288,
"learning_rate": 4.2514720490357134e-05,
"loss": 1.251,
"step": 3665
},
{
"epoch": 1.6456476654896026,
"grad_norm": 1.221535086631775,
"learning_rate": 4.239866541371192e-05,
"loss": 1.2478,
"step": 3670
},
{
"epoch": 1.6478896922818227,
"grad_norm": 1.0815098285675049,
"learning_rate": 4.2282652243159276e-05,
"loss": 1.2811,
"step": 3675
},
{
"epoch": 1.650131719074043,
"grad_norm": 1.1960694789886475,
"learning_rate": 4.216668161827887e-05,
"loss": 1.2937,
"step": 3680
},
{
"epoch": 1.652373745866263,
"grad_norm": 1.307964563369751,
"learning_rate": 4.20507541784157e-05,
"loss": 1.2725,
"step": 3685
},
{
"epoch": 1.6546157726584831,
"grad_norm": 1.150295376777649,
"learning_rate": 4.193487056267679e-05,
"loss": 1.2542,
"step": 3690
},
{
"epoch": 1.6568577994507034,
"grad_norm": 1.0650702714920044,
"learning_rate": 4.181903140992754e-05,
"loss": 1.1894,
"step": 3695
},
{
"epoch": 1.6590998262429237,
"grad_norm": 1.0896923542022705,
"learning_rate": 4.170323735878818e-05,
"loss": 1.3178,
"step": 3700
},
{
"epoch": 1.6613418530351438,
"grad_norm": 1.1696605682373047,
"learning_rate": 4.1587489047630314e-05,
"loss": 1.2414,
"step": 3705
},
{
"epoch": 1.6635838798273639,
"grad_norm": 1.0459294319152832,
"learning_rate": 4.1471787114573426e-05,
"loss": 1.3447,
"step": 3710
},
{
"epoch": 1.6658259066195842,
"grad_norm": 1.2181882858276367,
"learning_rate": 4.135613219748125e-05,
"loss": 1.2815,
"step": 3715
},
{
"epoch": 1.6680679334118043,
"grad_norm": 1.2734391689300537,
"learning_rate": 4.124052493395838e-05,
"loss": 1.2832,
"step": 3720
},
{
"epoch": 1.6703099602040243,
"grad_norm": 1.102831244468689,
"learning_rate": 4.112496596134667e-05,
"loss": 1.2647,
"step": 3725
},
{
"epoch": 1.6725519869962446,
"grad_norm": 1.2140913009643555,
"learning_rate": 4.100945591672173e-05,
"loss": 1.258,
"step": 3730
},
{
"epoch": 1.674794013788465,
"grad_norm": 1.0504231452941895,
"learning_rate": 4.089399543688947e-05,
"loss": 1.2588,
"step": 3735
},
{
"epoch": 1.6770360405806848,
"grad_norm": 1.1240216493606567,
"learning_rate": 4.07785851583825e-05,
"loss": 1.2755,
"step": 3740
},
{
"epoch": 1.679278067372905,
"grad_norm": 1.1572685241699219,
"learning_rate": 4.066322571745673e-05,
"loss": 1.2768,
"step": 3745
},
{
"epoch": 1.6815200941651254,
"grad_norm": 0.9713603854179382,
"learning_rate": 4.054791775008775e-05,
"loss": 1.2288,
"step": 3750
},
{
"epoch": 1.6837621209573455,
"grad_norm": 1.1444541215896606,
"learning_rate": 4.043266189196741e-05,
"loss": 1.2193,
"step": 3755
},
{
"epoch": 1.6860041477495655,
"grad_norm": 1.0875182151794434,
"learning_rate": 4.031745877850026e-05,
"loss": 1.2802,
"step": 3760
},
{
"epoch": 1.6882461745417858,
"grad_norm": 1.1171207427978516,
"learning_rate": 4.02023090448001e-05,
"loss": 1.307,
"step": 3765
},
{
"epoch": 1.690488201334006,
"grad_norm": 0.991519570350647,
"learning_rate": 4.008721332568639e-05,
"loss": 1.2811,
"step": 3770
},
{
"epoch": 1.692730228126226,
"grad_norm": 1.1010782718658447,
"learning_rate": 3.9972172255680886e-05,
"loss": 1.2631,
"step": 3775
},
{
"epoch": 1.6949722549184463,
"grad_norm": 1.1108042001724243,
"learning_rate": 3.985718646900402e-05,
"loss": 1.3115,
"step": 3780
},
{
"epoch": 1.6972142817106666,
"grad_norm": 1.009866714477539,
"learning_rate": 3.974225659957141e-05,
"loss": 1.2613,
"step": 3785
},
{
"epoch": 1.6994563085028866,
"grad_norm": 1.0787150859832764,
"learning_rate": 3.9627383280990474e-05,
"loss": 1.353,
"step": 3790
},
{
"epoch": 1.7016983352951067,
"grad_norm": 1.0613850355148315,
"learning_rate": 3.951256714655685e-05,
"loss": 1.3048,
"step": 3795
},
{
"epoch": 1.703940362087327,
"grad_norm": 1.1148884296417236,
"learning_rate": 3.939780882925088e-05,
"loss": 1.2918,
"step": 3800
}
],
"logging_steps": 5,
"max_steps": 6690,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.325845119889375e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}