{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.78184172591561,
"eval_steps": 500,
"global_step": 16000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004886510786972562,
"grad_norm": 550.610107421875,
"learning_rate": 4.396678065461651e-08,
"loss": 8.0618,
"step": 10
},
{
"epoch": 0.0009773021573945123,
"grad_norm": 547.8289794921875,
"learning_rate": 9.281875915974597e-08,
"loss": 8.0357,
"step": 20
},
{
"epoch": 0.0014659532360917686,
"grad_norm": 534.9840087890625,
"learning_rate": 1.4167073766487544e-07,
"loss": 7.8875,
"step": 30
},
{
"epoch": 0.0019546043147890247,
"grad_norm": 550.1026000976562,
"learning_rate": 1.905227161700049e-07,
"loss": 7.5113,
"step": 40
},
{
"epoch": 0.002443255393486281,
"grad_norm": 519.316650390625,
"learning_rate": 2.3937469467513437e-07,
"loss": 6.2256,
"step": 50
},
{
"epoch": 0.0029319064721835372,
"grad_norm": 198.8031768798828,
"learning_rate": 2.8822667318026384e-07,
"loss": 4.6185,
"step": 60
},
{
"epoch": 0.0034205575508807935,
"grad_norm": 113.63694763183594,
"learning_rate": 3.3707865168539325e-07,
"loss": 2.404,
"step": 70
},
{
"epoch": 0.003909208629578049,
"grad_norm": 77.9081802368164,
"learning_rate": 3.859306301905227e-07,
"loss": 1.9127,
"step": 80
},
{
"epoch": 0.004397859708275306,
"grad_norm": 92.41219329833984,
"learning_rate": 4.347826086956522e-07,
"loss": 1.5194,
"step": 90
},
{
"epoch": 0.004886510786972562,
"grad_norm": 20.305315017700195,
"learning_rate": 4.836345872007817e-07,
"loss": 1.3619,
"step": 100
},
{
"epoch": 0.005375161865669819,
"grad_norm": 15.185184478759766,
"learning_rate": 5.324865657059111e-07,
"loss": 1.2711,
"step": 110
},
{
"epoch": 0.0058638129443670745,
"grad_norm": 17.12596321105957,
"learning_rate": 5.813385442110406e-07,
"loss": 1.2171,
"step": 120
},
{
"epoch": 0.006352464023064331,
"grad_norm": 24.50565528869629,
"learning_rate": 6.3019052271617e-07,
"loss": 1.1917,
"step": 130
},
{
"epoch": 0.006841115101761587,
"grad_norm": 9.890480995178223,
"learning_rate": 6.790425012212995e-07,
"loss": 1.1793,
"step": 140
},
{
"epoch": 0.007329766180458844,
"grad_norm": 16.53375816345215,
"learning_rate": 7.278944797264289e-07,
"loss": 1.1749,
"step": 150
},
{
"epoch": 0.007818417259156099,
"grad_norm": 19.722103118896484,
"learning_rate": 7.767464582315585e-07,
"loss": 1.1696,
"step": 160
},
{
"epoch": 0.008307068337853355,
"grad_norm": 21.687253952026367,
"learning_rate": 8.255984367366879e-07,
"loss": 1.1658,
"step": 170
},
{
"epoch": 0.008795719416550612,
"grad_norm": 20.212892532348633,
"learning_rate": 8.744504152418174e-07,
"loss": 1.1672,
"step": 180
},
{
"epoch": 0.009284370495247869,
"grad_norm": 14.680685043334961,
"learning_rate": 9.233023937469468e-07,
"loss": 1.164,
"step": 190
},
{
"epoch": 0.009773021573945124,
"grad_norm": 15.129215240478516,
"learning_rate": 9.721543722520762e-07,
"loss": 1.1606,
"step": 200
},
{
"epoch": 0.01026167265264238,
"grad_norm": 6.895666122436523,
"learning_rate": 1.0210063507572057e-06,
"loss": 1.161,
"step": 210
},
{
"epoch": 0.010750323731339637,
"grad_norm": 6.139767646789551,
"learning_rate": 1.0698583292623353e-06,
"loss": 1.1602,
"step": 220
},
{
"epoch": 0.011238974810036894,
"grad_norm": 25.940549850463867,
"learning_rate": 1.1187103077674646e-06,
"loss": 1.158,
"step": 230
},
{
"epoch": 0.011727625888734149,
"grad_norm": 5.9631829261779785,
"learning_rate": 1.167562286272594e-06,
"loss": 1.1565,
"step": 240
},
{
"epoch": 0.012216276967431406,
"grad_norm": 42.288856506347656,
"learning_rate": 1.2164142647777236e-06,
"loss": 1.1634,
"step": 250
},
{
"epoch": 0.012704928046128662,
"grad_norm": 23.973031997680664,
"learning_rate": 1.265266243282853e-06,
"loss": 1.1622,
"step": 260
},
{
"epoch": 0.013193579124825917,
"grad_norm": 32.71512985229492,
"learning_rate": 1.3141182217879824e-06,
"loss": 1.1632,
"step": 270
},
{
"epoch": 0.013682230203523174,
"grad_norm": 45.931095123291016,
"learning_rate": 1.362970200293112e-06,
"loss": 1.1611,
"step": 280
},
{
"epoch": 0.01417088128222043,
"grad_norm": 31.298593521118164,
"learning_rate": 1.4118221787982415e-06,
"loss": 1.1609,
"step": 290
},
{
"epoch": 0.014659532360917688,
"grad_norm": 37.475921630859375,
"learning_rate": 1.4606741573033708e-06,
"loss": 1.159,
"step": 300
},
{
"epoch": 0.015148183439614942,
"grad_norm": 56.90618896484375,
"learning_rate": 1.5095261358085003e-06,
"loss": 1.1611,
"step": 310
},
{
"epoch": 0.015636834518312197,
"grad_norm": 26.84503746032715,
"learning_rate": 1.5583781143136298e-06,
"loss": 1.1605,
"step": 320
},
{
"epoch": 0.016125485597009454,
"grad_norm": 31.706214904785156,
"learning_rate": 1.6072300928187593e-06,
"loss": 1.1636,
"step": 330
},
{
"epoch": 0.01661413667570671,
"grad_norm": 20.083066940307617,
"learning_rate": 1.6560820713238887e-06,
"loss": 1.155,
"step": 340
},
{
"epoch": 0.017102787754403968,
"grad_norm": 20.37936782836914,
"learning_rate": 1.7049340498290182e-06,
"loss": 1.1544,
"step": 350
},
{
"epoch": 0.017591438833101224,
"grad_norm": 29.238149642944336,
"learning_rate": 1.7537860283341477e-06,
"loss": 1.1526,
"step": 360
},
{
"epoch": 0.01808008991179848,
"grad_norm": 24.459911346435547,
"learning_rate": 1.802638006839277e-06,
"loss": 1.1542,
"step": 370
},
{
"epoch": 0.018568740990495738,
"grad_norm": 25.11469841003418,
"learning_rate": 1.8514899853444065e-06,
"loss": 1.153,
"step": 380
},
{
"epoch": 0.019057392069192994,
"grad_norm": 19.211380004882812,
"learning_rate": 1.900341963849536e-06,
"loss": 1.1524,
"step": 390
},
{
"epoch": 0.019546043147890248,
"grad_norm": 29.28157615661621,
"learning_rate": 1.9491939423546656e-06,
"loss": 1.1507,
"step": 400
},
{
"epoch": 0.020034694226587504,
"grad_norm": 18.40865707397461,
"learning_rate": 1.998045920859795e-06,
"loss": 1.1514,
"step": 410
},
{
"epoch": 0.02052334530528476,
"grad_norm": 32.68934631347656,
"learning_rate": 2.046897899364924e-06,
"loss": 1.1509,
"step": 420
},
{
"epoch": 0.021011996383982018,
"grad_norm": 28.276269912719727,
"learning_rate": 2.0957498778700537e-06,
"loss": 1.1508,
"step": 430
},
{
"epoch": 0.021500647462679275,
"grad_norm": 29.66724967956543,
"learning_rate": 2.1446018563751832e-06,
"loss": 1.1507,
"step": 440
},
{
"epoch": 0.02198929854137653,
"grad_norm": 33.39693069458008,
"learning_rate": 2.1934538348803127e-06,
"loss": 1.1505,
"step": 450
},
{
"epoch": 0.022477949620073788,
"grad_norm": 28.482940673828125,
"learning_rate": 2.2423058133854423e-06,
"loss": 1.1488,
"step": 460
},
{
"epoch": 0.02296660069877104,
"grad_norm": 22.53483009338379,
"learning_rate": 2.2911577918905718e-06,
"loss": 1.1495,
"step": 470
},
{
"epoch": 0.023455251777468298,
"grad_norm": 20.745651245117188,
"learning_rate": 2.3400097703957013e-06,
"loss": 1.1477,
"step": 480
},
{
"epoch": 0.023943902856165555,
"grad_norm": 27.499927520751953,
"learning_rate": 2.388861748900831e-06,
"loss": 1.1491,
"step": 490
},
{
"epoch": 0.02443255393486281,
"grad_norm": 17.32890510559082,
"learning_rate": 2.43771372740596e-06,
"loss": 1.1483,
"step": 500
},
{
"epoch": 0.02443255393486281,
"eval_loss": 1.1245596408843994,
"eval_runtime": 728.0762,
"eval_samples_per_second": 242.98,
"eval_steps_per_second": 0.475,
"step": 500
},
{
"epoch": 0.024921205013560068,
"grad_norm": 28.78767967224121,
"learning_rate": 2.4865657059110894e-06,
"loss": 1.1466,
"step": 510
},
{
"epoch": 0.025409856092257325,
"grad_norm": 19.717103958129883,
"learning_rate": 2.5354176844162194e-06,
"loss": 1.1464,
"step": 520
},
{
"epoch": 0.02589850717095458,
"grad_norm": 27.598007202148438,
"learning_rate": 2.584269662921349e-06,
"loss": 1.1465,
"step": 530
},
{
"epoch": 0.026387158249651835,
"grad_norm": 23.63872528076172,
"learning_rate": 2.633121641426478e-06,
"loss": 1.1453,
"step": 540
},
{
"epoch": 0.02687580932834909,
"grad_norm": 26.8532772064209,
"learning_rate": 2.6819736199316075e-06,
"loss": 1.1466,
"step": 550
},
{
"epoch": 0.027364460407046348,
"grad_norm": 22.594478607177734,
"learning_rate": 2.730825598436737e-06,
"loss": 1.146,
"step": 560
},
{
"epoch": 0.027853111485743605,
"grad_norm": 22.817705154418945,
"learning_rate": 2.7796775769418666e-06,
"loss": 1.1437,
"step": 570
},
{
"epoch": 0.02834176256444086,
"grad_norm": 7.5399250984191895,
"learning_rate": 2.828529555446996e-06,
"loss": 1.1443,
"step": 580
},
{
"epoch": 0.02883041364313812,
"grad_norm": 34.241981506347656,
"learning_rate": 2.8773815339521256e-06,
"loss": 1.1723,
"step": 590
},
{
"epoch": 0.029319064721835375,
"grad_norm": 41.769996643066406,
"learning_rate": 2.926233512457255e-06,
"loss": 1.1566,
"step": 600
},
{
"epoch": 0.02980771580053263,
"grad_norm": 60.579036712646484,
"learning_rate": 2.9750854909623842e-06,
"loss": 1.1581,
"step": 610
},
{
"epoch": 0.030296366879229885,
"grad_norm": 9.520317077636719,
"learning_rate": 3.0239374694675137e-06,
"loss": 1.1521,
"step": 620
},
{
"epoch": 0.03078501795792714,
"grad_norm": 48.57497787475586,
"learning_rate": 3.0727894479726433e-06,
"loss": 1.1721,
"step": 630
},
{
"epoch": 0.031273669036624395,
"grad_norm": 44.00090026855469,
"learning_rate": 3.1216414264777728e-06,
"loss": 1.1517,
"step": 640
},
{
"epoch": 0.03176232011532165,
"grad_norm": 25.236425399780273,
"learning_rate": 3.1704934049829023e-06,
"loss": 1.1657,
"step": 650
},
{
"epoch": 0.03225097119401891,
"grad_norm": 26.25858497619629,
"learning_rate": 3.219345383488032e-06,
"loss": 1.146,
"step": 660
},
{
"epoch": 0.032739622272716165,
"grad_norm": 33.179771423339844,
"learning_rate": 3.2681973619931613e-06,
"loss": 1.1452,
"step": 670
},
{
"epoch": 0.03322827335141342,
"grad_norm": 26.868507385253906,
"learning_rate": 3.3170493404982904e-06,
"loss": 1.1399,
"step": 680
},
{
"epoch": 0.03371692443011068,
"grad_norm": 14.93895149230957,
"learning_rate": 3.36590131900342e-06,
"loss": 1.142,
"step": 690
},
{
"epoch": 0.034205575508807935,
"grad_norm": 10.044151306152344,
"learning_rate": 3.4147532975085495e-06,
"loss": 1.1391,
"step": 700
},
{
"epoch": 0.03469422658750519,
"grad_norm": 22.18167495727539,
"learning_rate": 3.463605276013679e-06,
"loss": 1.1439,
"step": 710
},
{
"epoch": 0.03518287766620245,
"grad_norm": 17.276782989501953,
"learning_rate": 3.5124572545188085e-06,
"loss": 1.1402,
"step": 720
},
{
"epoch": 0.035671528744899705,
"grad_norm": 22.945816040039062,
"learning_rate": 3.561309233023938e-06,
"loss": 1.1409,
"step": 730
},
{
"epoch": 0.03616017982359696,
"grad_norm": 28.2205810546875,
"learning_rate": 3.6101612115290676e-06,
"loss": 1.1425,
"step": 740
},
{
"epoch": 0.03664883090229422,
"grad_norm": 29.494741439819336,
"learning_rate": 3.6590131900341966e-06,
"loss": 1.1423,
"step": 750
},
{
"epoch": 0.037137481980991476,
"grad_norm": 12.51252269744873,
"learning_rate": 3.707865168539326e-06,
"loss": 1.14,
"step": 760
},
{
"epoch": 0.03762613305968873,
"grad_norm": 19.594589233398438,
"learning_rate": 3.7567171470444557e-06,
"loss": 1.1472,
"step": 770
},
{
"epoch": 0.03811478413838599,
"grad_norm": 8.805150032043457,
"learning_rate": 3.805569125549585e-06,
"loss": 1.143,
"step": 780
},
{
"epoch": 0.03860343521708324,
"grad_norm": 16.11551856994629,
"learning_rate": 3.854421104054714e-06,
"loss": 1.1394,
"step": 790
},
{
"epoch": 0.039092086295780495,
"grad_norm": 30.43114471435547,
"learning_rate": 3.903273082559844e-06,
"loss": 1.1412,
"step": 800
},
{
"epoch": 0.03958073737447775,
"grad_norm": 24.658550262451172,
"learning_rate": 3.952125061064973e-06,
"loss": 1.1379,
"step": 810
},
{
"epoch": 0.04006938845317501,
"grad_norm": 30.698740005493164,
"learning_rate": 4.000977039570103e-06,
"loss": 1.1402,
"step": 820
},
{
"epoch": 0.040558039531872266,
"grad_norm": 15.285526275634766,
"learning_rate": 4.049829018075232e-06,
"loss": 1.1397,
"step": 830
},
{
"epoch": 0.04104669061056952,
"grad_norm": 16.959575653076172,
"learning_rate": 4.098680996580362e-06,
"loss": 1.143,
"step": 840
},
{
"epoch": 0.04153534168926678,
"grad_norm": 9.172962188720703,
"learning_rate": 4.1475329750854914e-06,
"loss": 1.1394,
"step": 850
},
{
"epoch": 0.042023992767964036,
"grad_norm": 34.94649887084961,
"learning_rate": 4.196384953590621e-06,
"loss": 1.1586,
"step": 860
},
{
"epoch": 0.04251264384666129,
"grad_norm": 31.494056701660156,
"learning_rate": 4.2452369320957505e-06,
"loss": 1.1561,
"step": 870
},
{
"epoch": 0.04300129492535855,
"grad_norm": 30.298629760742188,
"learning_rate": 4.29408891060088e-06,
"loss": 1.1579,
"step": 880
},
{
"epoch": 0.043489946004055806,
"grad_norm": 8.591865539550781,
"learning_rate": 4.3429408891060095e-06,
"loss": 1.1441,
"step": 890
},
{
"epoch": 0.04397859708275306,
"grad_norm": 22.960233688354492,
"learning_rate": 4.391792867611139e-06,
"loss": 1.1444,
"step": 900
},
{
"epoch": 0.04446724816145032,
"grad_norm": 14.041954040527344,
"learning_rate": 4.4406448461162685e-06,
"loss": 1.141,
"step": 910
},
{
"epoch": 0.044955899240147576,
"grad_norm": 25.523542404174805,
"learning_rate": 4.489496824621398e-06,
"loss": 1.1335,
"step": 920
},
{
"epoch": 0.045444550318844826,
"grad_norm": 12.317065238952637,
"learning_rate": 4.538348803126527e-06,
"loss": 1.1412,
"step": 930
},
{
"epoch": 0.04593320139754208,
"grad_norm": 6.889744758605957,
"learning_rate": 4.587200781631656e-06,
"loss": 1.1431,
"step": 940
},
{
"epoch": 0.04642185247623934,
"grad_norm": 14.626124382019043,
"learning_rate": 4.636052760136786e-06,
"loss": 1.1369,
"step": 950
},
{
"epoch": 0.046910503554936596,
"grad_norm": 8.889772415161133,
"learning_rate": 4.684904738641915e-06,
"loss": 1.1327,
"step": 960
},
{
"epoch": 0.04739915463363385,
"grad_norm": 22.604360580444336,
"learning_rate": 4.733756717147045e-06,
"loss": 1.1311,
"step": 970
},
{
"epoch": 0.04788780571233111,
"grad_norm": 18.373239517211914,
"learning_rate": 4.782608695652174e-06,
"loss": 1.1329,
"step": 980
},
{
"epoch": 0.048376456791028366,
"grad_norm": 18.741851806640625,
"learning_rate": 4.831460674157304e-06,
"loss": 1.1275,
"step": 990
},
{
"epoch": 0.04886510786972562,
"grad_norm": 21.531051635742188,
"learning_rate": 4.880312652662433e-06,
"loss": 1.1242,
"step": 1000
},
{
"epoch": 0.04886510786972562,
"eval_loss": 1.0948448181152344,
"eval_runtime": 728.3165,
"eval_samples_per_second": 242.9,
"eval_steps_per_second": 0.475,
"step": 1000
},
{
"epoch": 0.04935375894842288,
"grad_norm": 9.384544372558594,
"learning_rate": 4.929164631167563e-06,
"loss": 1.1484,
"step": 1010
},
{
"epoch": 0.049842410027120136,
"grad_norm": 25.287551879882812,
"learning_rate": 4.978016609672692e-06,
"loss": 1.1365,
"step": 1020
},
{
"epoch": 0.05033106110581739,
"grad_norm": 25.104299545288086,
"learning_rate": 5.026868588177821e-06,
"loss": 1.1073,
"step": 1030
},
{
"epoch": 0.05081971218451465,
"grad_norm": 5.135197639465332,
"learning_rate": 5.0757205666829515e-06,
"loss": 1.0908,
"step": 1040
},
{
"epoch": 0.051308363263211906,
"grad_norm": 10.835426330566406,
"learning_rate": 5.12457254518808e-06,
"loss": 1.0344,
"step": 1050
},
{
"epoch": 0.05179701434190916,
"grad_norm": 17.45260238647461,
"learning_rate": 5.1734245236932105e-06,
"loss": 0.9916,
"step": 1060
},
{
"epoch": 0.05228566542060641,
"grad_norm": 18.409074783325195,
"learning_rate": 5.222276502198339e-06,
"loss": 0.9616,
"step": 1070
},
{
"epoch": 0.05277431649930367,
"grad_norm": 13.753133773803711,
"learning_rate": 5.271128480703469e-06,
"loss": 0.9379,
"step": 1080
},
{
"epoch": 0.053262967578000926,
"grad_norm": 16.086511611938477,
"learning_rate": 5.319980459208598e-06,
"loss": 0.922,
"step": 1090
},
{
"epoch": 0.05375161865669818,
"grad_norm": 14.6001558303833,
"learning_rate": 5.368832437713728e-06,
"loss": 0.9061,
"step": 1100
},
{
"epoch": 0.05424026973539544,
"grad_norm": 22.474435806274414,
"learning_rate": 5.417684416218857e-06,
"loss": 0.9022,
"step": 1110
},
{
"epoch": 0.054728920814092696,
"grad_norm": 22.234281539916992,
"learning_rate": 5.466536394723987e-06,
"loss": 0.9091,
"step": 1120
},
{
"epoch": 0.05521757189278995,
"grad_norm": 10.945754051208496,
"learning_rate": 5.5153883732291154e-06,
"loss": 0.903,
"step": 1130
},
{
"epoch": 0.05570622297148721,
"grad_norm": 12.38178539276123,
"learning_rate": 5.564240351734246e-06,
"loss": 0.882,
"step": 1140
},
{
"epoch": 0.056194874050184467,
"grad_norm": 18.168428421020508,
"learning_rate": 5.6130923302393745e-06,
"loss": 0.8671,
"step": 1150
},
{
"epoch": 0.05668352512888172,
"grad_norm": 13.480072975158691,
"learning_rate": 5.661944308744505e-06,
"loss": 0.8604,
"step": 1160
},
{
"epoch": 0.05717217620757898,
"grad_norm": 15.529900550842285,
"learning_rate": 5.7107962872496335e-06,
"loss": 0.8467,
"step": 1170
},
{
"epoch": 0.05766082728627624,
"grad_norm": 12.60476016998291,
"learning_rate": 5.759648265754764e-06,
"loss": 0.8376,
"step": 1180
},
{
"epoch": 0.05814947836497349,
"grad_norm": 12.737000465393066,
"learning_rate": 5.8085002442598926e-06,
"loss": 0.8262,
"step": 1190
},
{
"epoch": 0.05863812944367075,
"grad_norm": 11.14971923828125,
"learning_rate": 5.857352222765023e-06,
"loss": 0.8164,
"step": 1200
},
{
"epoch": 0.059126780522368,
"grad_norm": 13.185476303100586,
"learning_rate": 5.906204201270152e-06,
"loss": 0.8107,
"step": 1210
},
{
"epoch": 0.05961543160106526,
"grad_norm": 19.2025203704834,
"learning_rate": 5.955056179775281e-06,
"loss": 0.8026,
"step": 1220
},
{
"epoch": 0.06010408267976251,
"grad_norm": 15.930268287658691,
"learning_rate": 6.003908158280411e-06,
"loss": 0.8046,
"step": 1230
},
{
"epoch": 0.06059273375845977,
"grad_norm": 9.219900131225586,
"learning_rate": 6.05276013678554e-06,
"loss": 0.7913,
"step": 1240
},
{
"epoch": 0.06108138483715703,
"grad_norm": 9.282882690429688,
"learning_rate": 6.10161211529067e-06,
"loss": 0.7798,
"step": 1250
},
{
"epoch": 0.06157003591585428,
"grad_norm": 10.684017181396484,
"learning_rate": 6.150464093795799e-06,
"loss": 0.7746,
"step": 1260
},
{
"epoch": 0.06205868699455154,
"grad_norm": 19.030454635620117,
"learning_rate": 6.199316072300928e-06,
"loss": 0.7789,
"step": 1270
},
{
"epoch": 0.06254733807324879,
"grad_norm": 14.472164154052734,
"learning_rate": 6.248168050806058e-06,
"loss": 0.7645,
"step": 1280
},
{
"epoch": 0.06303598915194605,
"grad_norm": 15.92104721069336,
"learning_rate": 6.297020029311187e-06,
"loss": 0.7601,
"step": 1290
},
{
"epoch": 0.0635246402306433,
"grad_norm": 12.93683910369873,
"learning_rate": 6.345872007816317e-06,
"loss": 0.7508,
"step": 1300
},
{
"epoch": 0.06401329130934057,
"grad_norm": 12.283439636230469,
"learning_rate": 6.394723986321446e-06,
"loss": 0.7441,
"step": 1310
},
{
"epoch": 0.06450194238803782,
"grad_norm": 11.30448055267334,
"learning_rate": 6.443575964826576e-06,
"loss": 0.7359,
"step": 1320
},
{
"epoch": 0.06499059346673508,
"grad_norm": 10.10558795928955,
"learning_rate": 6.492427943331705e-06,
"loss": 0.7312,
"step": 1330
},
{
"epoch": 0.06547924454543233,
"grad_norm": 10.84056282043457,
"learning_rate": 6.541279921836835e-06,
"loss": 0.7257,
"step": 1340
},
{
"epoch": 0.0659678956241296,
"grad_norm": 11.601236343383789,
"learning_rate": 6.590131900341964e-06,
"loss": 0.7205,
"step": 1350
},
{
"epoch": 0.06645654670282684,
"grad_norm": 9.640713691711426,
"learning_rate": 6.6389838788470936e-06,
"loss": 0.7179,
"step": 1360
},
{
"epoch": 0.06694519778152411,
"grad_norm": 7.962968826293945,
"learning_rate": 6.687835857352223e-06,
"loss": 0.7218,
"step": 1370
},
{
"epoch": 0.06743384886022136,
"grad_norm": 6.22469425201416,
"learning_rate": 6.736687835857353e-06,
"loss": 0.7158,
"step": 1380
},
{
"epoch": 0.06792249993891862,
"grad_norm": 11.041025161743164,
"learning_rate": 6.785539814362482e-06,
"loss": 0.7075,
"step": 1390
},
{
"epoch": 0.06841115101761587,
"grad_norm": 11.427011489868164,
"learning_rate": 6.834391792867612e-06,
"loss": 0.7097,
"step": 1400
},
{
"epoch": 0.06889980209631313,
"grad_norm": 5.472978115081787,
"learning_rate": 6.88324377137274e-06,
"loss": 0.7058,
"step": 1410
},
{
"epoch": 0.06938845317501038,
"grad_norm": 11.191500663757324,
"learning_rate": 6.932095749877871e-06,
"loss": 0.7033,
"step": 1420
},
{
"epoch": 0.06987710425370763,
"grad_norm": 9.202252388000488,
"learning_rate": 6.980947728382999e-06,
"loss": 0.7019,
"step": 1430
},
{
"epoch": 0.0703657553324049,
"grad_norm": 5.8239216804504395,
"learning_rate": 7.02979970688813e-06,
"loss": 0.6908,
"step": 1440
},
{
"epoch": 0.07085440641110215,
"grad_norm": 6.5890092849731445,
"learning_rate": 7.078651685393258e-06,
"loss": 0.6821,
"step": 1450
},
{
"epoch": 0.07134305748979941,
"grad_norm": 5.046870231628418,
"learning_rate": 7.127503663898389e-06,
"loss": 0.6782,
"step": 1460
},
{
"epoch": 0.07183170856849666,
"grad_norm": 6.238111972808838,
"learning_rate": 7.1763556424035174e-06,
"loss": 0.6597,
"step": 1470
},
{
"epoch": 0.07232035964719392,
"grad_norm": 14.37743091583252,
"learning_rate": 7.225207620908648e-06,
"loss": 0.6472,
"step": 1480
},
{
"epoch": 0.07280901072589117,
"grad_norm": 8.147233963012695,
"learning_rate": 7.2740595994137765e-06,
"loss": 0.6435,
"step": 1490
},
{
"epoch": 0.07329766180458844,
"grad_norm": 10.538481712341309,
"learning_rate": 7.322911577918906e-06,
"loss": 0.6375,
"step": 1500
},
{
"epoch": 0.07329766180458844,
"eval_loss": 0.6336132884025574,
"eval_runtime": 729.8371,
"eval_samples_per_second": 242.394,
"eval_steps_per_second": 0.474,
"step": 1500
},
{
"epoch": 0.07378631288328569,
"grad_norm": 8.202781677246094,
"learning_rate": 7.3717635564240355e-06,
"loss": 0.627,
"step": 1510
},
{
"epoch": 0.07427496396198295,
"grad_norm": 9.16813850402832,
"learning_rate": 7.420615534929165e-06,
"loss": 0.6136,
"step": 1520
},
{
"epoch": 0.0747636150406802,
"grad_norm": 4.204853057861328,
"learning_rate": 7.4694675134342946e-06,
"loss": 0.6092,
"step": 1530
},
{
"epoch": 0.07525226611937746,
"grad_norm": 7.2187652587890625,
"learning_rate": 7.518319491939424e-06,
"loss": 0.619,
"step": 1540
},
{
"epoch": 0.07574091719807471,
"grad_norm": 6.75137996673584,
"learning_rate": 7.567171470444553e-06,
"loss": 0.6183,
"step": 1550
},
{
"epoch": 0.07622956827677198,
"grad_norm": 12.58353042602539,
"learning_rate": 7.616023448949683e-06,
"loss": 0.6053,
"step": 1560
},
{
"epoch": 0.07671821935546923,
"grad_norm": 5.846193313598633,
"learning_rate": 7.664875427454813e-06,
"loss": 0.607,
"step": 1570
},
{
"epoch": 0.07720687043416648,
"grad_norm": 7.444247722625732,
"learning_rate": 7.713727405959941e-06,
"loss": 0.5934,
"step": 1580
},
{
"epoch": 0.07769552151286374,
"grad_norm": 3.659825563430786,
"learning_rate": 7.762579384465072e-06,
"loss": 0.5938,
"step": 1590
},
{
"epoch": 0.07818417259156099,
"grad_norm": 6.078113079071045,
"learning_rate": 7.8114313629702e-06,
"loss": 0.5942,
"step": 1600
},
{
"epoch": 0.07867282367025825,
"grad_norm": 7.572592735290527,
"learning_rate": 7.86028334147533e-06,
"loss": 0.6032,
"step": 1610
},
{
"epoch": 0.0791614747489555,
"grad_norm": 6.511207103729248,
"learning_rate": 7.90913531998046e-06,
"loss": 0.5873,
"step": 1620
},
{
"epoch": 0.07965012582765277,
"grad_norm": 6.170757293701172,
"learning_rate": 7.957987298485588e-06,
"loss": 0.5804,
"step": 1630
},
{
"epoch": 0.08013877690635002,
"grad_norm": 14.552532196044922,
"learning_rate": 8.006839276990718e-06,
"loss": 0.5753,
"step": 1640
},
{
"epoch": 0.08062742798504728,
"grad_norm": 8.183059692382812,
"learning_rate": 8.055691255495847e-06,
"loss": 0.5739,
"step": 1650
},
{
"epoch": 0.08111607906374453,
"grad_norm": 4.893775463104248,
"learning_rate": 8.104543234000977e-06,
"loss": 0.5722,
"step": 1660
},
{
"epoch": 0.0816047301424418,
"grad_norm": 9.298670768737793,
"learning_rate": 8.153395212506106e-06,
"loss": 0.5696,
"step": 1670
},
{
"epoch": 0.08209338122113904,
"grad_norm": 5.700584888458252,
"learning_rate": 8.202247191011237e-06,
"loss": 0.568,
"step": 1680
},
{
"epoch": 0.08258203229983631,
"grad_norm": 14.690134048461914,
"learning_rate": 8.251099169516365e-06,
"loss": 0.5739,
"step": 1690
},
{
"epoch": 0.08307068337853356,
"grad_norm": 12.68682861328125,
"learning_rate": 8.299951148021496e-06,
"loss": 0.5801,
"step": 1700
},
{
"epoch": 0.08355933445723081,
"grad_norm": 8.979551315307617,
"learning_rate": 8.348803126526624e-06,
"loss": 0.5791,
"step": 1710
},
{
"epoch": 0.08404798553592807,
"grad_norm": 5.448888301849365,
"learning_rate": 8.397655105031755e-06,
"loss": 0.5657,
"step": 1720
},
{
"epoch": 0.08453663661462532,
"grad_norm": 8.006872177124023,
"learning_rate": 8.446507083536883e-06,
"loss": 0.5484,
"step": 1730
},
{
"epoch": 0.08502528769332258,
"grad_norm": 6.7078046798706055,
"learning_rate": 8.495359062042014e-06,
"loss": 0.5555,
"step": 1740
},
{
"epoch": 0.08551393877201983,
"grad_norm": 8.614073753356934,
"learning_rate": 8.544211040547142e-06,
"loss": 0.5606,
"step": 1750
},
{
"epoch": 0.0860025898507171,
"grad_norm": 4.551246643066406,
"learning_rate": 8.593063019052273e-06,
"loss": 0.5544,
"step": 1760
},
{
"epoch": 0.08649124092941435,
"grad_norm": 3.444021463394165,
"learning_rate": 8.641914997557401e-06,
"loss": 0.5411,
"step": 1770
},
{
"epoch": 0.08697989200811161,
"grad_norm": 17.660511016845703,
"learning_rate": 8.690766976062532e-06,
"loss": 0.5427,
"step": 1780
},
{
"epoch": 0.08746854308680886,
"grad_norm": 7.721867561340332,
"learning_rate": 8.73961895456766e-06,
"loss": 0.5526,
"step": 1790
},
{
"epoch": 0.08795719416550613,
"grad_norm": 3.451046943664551,
"learning_rate": 8.78847093307279e-06,
"loss": 0.5425,
"step": 1800
},
{
"epoch": 0.08844584524420337,
"grad_norm": 4.078919887542725,
"learning_rate": 8.83732291157792e-06,
"loss": 0.5543,
"step": 1810
},
{
"epoch": 0.08893449632290064,
"grad_norm": 4.645016193389893,
"learning_rate": 8.88617489008305e-06,
"loss": 0.5463,
"step": 1820
},
{
"epoch": 0.08942314740159789,
"grad_norm": 8.30947208404541,
"learning_rate": 8.935026868588178e-06,
"loss": 0.5452,
"step": 1830
},
{
"epoch": 0.08991179848029515,
"grad_norm": 5.685572147369385,
"learning_rate": 8.983878847093309e-06,
"loss": 0.5369,
"step": 1840
},
{
"epoch": 0.0904004495589924,
"grad_norm": 9.45528793334961,
"learning_rate": 9.032730825598438e-06,
"loss": 0.5299,
"step": 1850
},
{
"epoch": 0.09088910063768965,
"grad_norm": 10.99970817565918,
"learning_rate": 9.081582804103566e-06,
"loss": 0.5287,
"step": 1860
},
{
"epoch": 0.09137775171638692,
"grad_norm": 6.199814796447754,
"learning_rate": 9.130434782608697e-06,
"loss": 0.5397,
"step": 1870
},
{
"epoch": 0.09186640279508416,
"grad_norm": 5.611557483673096,
"learning_rate": 9.179286761113825e-06,
"loss": 0.5472,
"step": 1880
},
{
"epoch": 0.09235505387378143,
"grad_norm": 4.567397594451904,
"learning_rate": 9.228138739618956e-06,
"loss": 0.5353,
"step": 1890
},
{
"epoch": 0.09284370495247868,
"grad_norm": 17.8961238861084,
"learning_rate": 9.276990718124084e-06,
"loss": 0.5263,
"step": 1900
},
{
"epoch": 0.09333235603117594,
"grad_norm": 8.548867225646973,
"learning_rate": 9.325842696629213e-06,
"loss": 0.5301,
"step": 1910
},
{
"epoch": 0.09382100710987319,
"grad_norm": 5.053003787994385,
"learning_rate": 9.374694675134343e-06,
"loss": 0.5316,
"step": 1920
},
{
"epoch": 0.09430965818857046,
"grad_norm": 10.809515953063965,
"learning_rate": 9.423546653639472e-06,
"loss": 0.5255,
"step": 1930
},
{
"epoch": 0.0947983092672677,
"grad_norm": 4.784992218017578,
"learning_rate": 9.472398632144602e-06,
"loss": 0.5232,
"step": 1940
},
{
"epoch": 0.09528696034596497,
"grad_norm": 6.658916473388672,
"learning_rate": 9.521250610649731e-06,
"loss": 0.5225,
"step": 1950
},
{
"epoch": 0.09577561142466222,
"grad_norm": 5.238591194152832,
"learning_rate": 9.570102589154861e-06,
"loss": 0.52,
"step": 1960
},
{
"epoch": 0.09626426250335948,
"grad_norm": 6.568732261657715,
"learning_rate": 9.61895456765999e-06,
"loss": 0.5117,
"step": 1970
},
{
"epoch": 0.09675291358205673,
"grad_norm": 11.915630340576172,
"learning_rate": 9.66780654616512e-06,
"loss": 0.5113,
"step": 1980
},
{
"epoch": 0.09724156466075398,
"grad_norm": 3.4283180236816406,
"learning_rate": 9.716658524670249e-06,
"loss": 0.5206,
"step": 1990
},
{
"epoch": 0.09773021573945125,
"grad_norm": 7.299953937530518,
"learning_rate": 9.76551050317538e-06,
"loss": 0.5319,
"step": 2000
},
{
"epoch": 0.09773021573945125,
"eval_loss": 0.5161277055740356,
"eval_runtime": 728.8014,
"eval_samples_per_second": 242.738,
"eval_steps_per_second": 0.475,
"step": 2000
},
{
"epoch": 0.0982188668181485,
"grad_norm": 4.911329746246338,
"learning_rate": 9.814362481680508e-06,
"loss": 0.5179,
"step": 2010
},
{
"epoch": 0.09870751789684576,
"grad_norm": 3.644986152648926,
"learning_rate": 9.863214460185639e-06,
"loss": 0.5148,
"step": 2020
},
{
"epoch": 0.09919616897554301,
"grad_norm": 5.680597305297852,
"learning_rate": 9.912066438690767e-06,
"loss": 0.5119,
"step": 2030
},
{
"epoch": 0.09968482005424027,
"grad_norm": 6.847180366516113,
"learning_rate": 9.960918417195898e-06,
"loss": 0.5135,
"step": 2040
},
{
"epoch": 0.10017347113293752,
"grad_norm": 4.022679328918457,
"learning_rate": 9.999999709052384e-06,
"loss": 0.5307,
"step": 2050
},
{
"epoch": 0.10066212221163479,
"grad_norm": 8.008437156677246,
"learning_rate": 9.999989525889357e-06,
"loss": 0.5135,
"step": 2060
},
{
"epoch": 0.10115077329033204,
"grad_norm": 3.9152987003326416,
"learning_rate": 9.99996479537936e-06,
"loss": 0.5098,
"step": 2070
},
{
"epoch": 0.1016394243690293,
"grad_norm": 4.81342887878418,
"learning_rate": 9.999925517594343e-06,
"loss": 0.5229,
"step": 2080
},
{
"epoch": 0.10212807544772655,
"grad_norm": 4.663543224334717,
"learning_rate": 9.999871692648587e-06,
"loss": 0.5198,
"step": 2090
},
{
"epoch": 0.10261672652642381,
"grad_norm": 3.905458927154541,
"learning_rate": 9.999803320698692e-06,
"loss": 0.5074,
"step": 2100
},
{
"epoch": 0.10310537760512106,
"grad_norm": 7.694464206695557,
"learning_rate": 9.999720401943584e-06,
"loss": 0.503,
"step": 2110
},
{
"epoch": 0.10359402868381833,
"grad_norm": 4.2866668701171875,
"learning_rate": 9.999622936624515e-06,
"loss": 0.5052,
"step": 2120
},
{
"epoch": 0.10408267976251558,
"grad_norm": 7.022489070892334,
"learning_rate": 9.999510925025058e-06,
"loss": 0.5087,
"step": 2130
},
{
"epoch": 0.10457133084121283,
"grad_norm": 2.201606273651123,
"learning_rate": 9.999384367471108e-06,
"loss": 0.5051,
"step": 2140
},
{
"epoch": 0.10505998191991009,
"grad_norm": 4.468674659729004,
"learning_rate": 9.99924326433088e-06,
"loss": 0.5123,
"step": 2150
},
{
"epoch": 0.10554863299860734,
"grad_norm": 3.214961528778076,
"learning_rate": 9.999087616014909e-06,
"loss": 0.5045,
"step": 2160
},
{
"epoch": 0.1060372840773046,
"grad_norm": 8.839011192321777,
"learning_rate": 9.998917422976053e-06,
"loss": 0.5057,
"step": 2170
},
{
"epoch": 0.10652593515600185,
"grad_norm": 3.3649775981903076,
"learning_rate": 9.998732685709482e-06,
"loss": 0.5026,
"step": 2180
},
{
"epoch": 0.10701458623469912,
"grad_norm": 5.231264591217041,
"learning_rate": 9.998533404752686e-06,
"loss": 0.4967,
"step": 2190
},
{
"epoch": 0.10750323731339637,
"grad_norm": 10.444920539855957,
"learning_rate": 9.998319580685467e-06,
"loss": 0.4978,
"step": 2200
},
{
"epoch": 0.10799188839209363,
"grad_norm": 3.976793050765991,
"learning_rate": 9.998091214129943e-06,
"loss": 0.5012,
"step": 2210
},
{
"epoch": 0.10848053947079088,
"grad_norm": 4.761758327484131,
"learning_rate": 9.997848305750538e-06,
"loss": 0.4948,
"step": 2220
},
{
"epoch": 0.10896919054948814,
"grad_norm": 4.317152976989746,
"learning_rate": 9.997590856253988e-06,
"loss": 0.4991,
"step": 2230
},
{
"epoch": 0.10945784162818539,
"grad_norm": 3.9865562915802,
"learning_rate": 9.99731886638934e-06,
"loss": 0.4973,
"step": 2240
},
{
"epoch": 0.10994649270688266,
"grad_norm": 3.0519254207611084,
"learning_rate": 9.997032336947938e-06,
"loss": 0.4968,
"step": 2250
},
{
"epoch": 0.1104351437855799,
"grad_norm": 3.462034225463867,
"learning_rate": 9.996731268763434e-06,
"loss": 0.4908,
"step": 2260
},
{
"epoch": 0.11092379486427716,
"grad_norm": 4.285225868225098,
"learning_rate": 9.996415662711779e-06,
"loss": 0.4906,
"step": 2270
},
{
"epoch": 0.11141244594297442,
"grad_norm": 2.549806833267212,
"learning_rate": 9.996085519711218e-06,
"loss": 0.4934,
"step": 2280
},
{
"epoch": 0.11190109702167167,
"grad_norm": 6.287642478942871,
"learning_rate": 9.995740840722297e-06,
"loss": 0.4969,
"step": 2290
},
{
"epoch": 0.11238974810036893,
"grad_norm": 6.043119430541992,
"learning_rate": 9.99538162674785e-06,
"loss": 0.4959,
"step": 2300
},
{
"epoch": 0.11287839917906618,
"grad_norm": 3.221782922744751,
"learning_rate": 9.995007878833001e-06,
"loss": 0.4895,
"step": 2310
},
{
"epoch": 0.11336705025776345,
"grad_norm": 7.820531368255615,
"learning_rate": 9.994619598065162e-06,
"loss": 0.4921,
"step": 2320
},
{
"epoch": 0.1138557013364607,
"grad_norm": 1.8136892318725586,
"learning_rate": 9.994216785574024e-06,
"loss": 0.4893,
"step": 2330
},
{
"epoch": 0.11434435241515796,
"grad_norm": 2.453530788421631,
"learning_rate": 9.993799442531562e-06,
"loss": 0.4874,
"step": 2340
},
{
"epoch": 0.11483300349385521,
"grad_norm": 2.470960855484009,
"learning_rate": 9.993367570152024e-06,
"loss": 0.4876,
"step": 2350
},
{
"epoch": 0.11532165457255247,
"grad_norm": 5.889760971069336,
"learning_rate": 9.992921169691934e-06,
"loss": 0.485,
"step": 2360
},
{
"epoch": 0.11581030565124972,
"grad_norm": 1.6044597625732422,
"learning_rate": 9.992460242450081e-06,
"loss": 0.4857,
"step": 2370
},
{
"epoch": 0.11629895672994699,
"grad_norm": 3.4553425312042236,
"learning_rate": 9.991984789767521e-06,
"loss": 0.4894,
"step": 2380
},
{
"epoch": 0.11678760780864424,
"grad_norm": 4.581337928771973,
"learning_rate": 9.991494813027576e-06,
"loss": 0.4915,
"step": 2390
},
{
"epoch": 0.1172762588873415,
"grad_norm": 3.9853124618530273,
"learning_rate": 9.990990313655817e-06,
"loss": 0.4885,
"step": 2400
},
{
"epoch": 0.11776490996603875,
"grad_norm": 2.2269527912139893,
"learning_rate": 9.990471293120074e-06,
"loss": 0.4868,
"step": 2410
},
{
"epoch": 0.118253561044736,
"grad_norm": 5.388997554779053,
"learning_rate": 9.989937752930426e-06,
"loss": 0.4958,
"step": 2420
},
{
"epoch": 0.11874221212343326,
"grad_norm": 4.705722332000732,
"learning_rate": 9.989389694639194e-06,
"loss": 0.4916,
"step": 2430
},
{
"epoch": 0.11923086320213051,
"grad_norm": 3.4011592864990234,
"learning_rate": 9.988827119840937e-06,
"loss": 0.4879,
"step": 2440
},
{
"epoch": 0.11971951428082778,
"grad_norm": 4.242159366607666,
"learning_rate": 9.98825003017246e-06,
"loss": 0.4856,
"step": 2450
},
{
"epoch": 0.12020816535952503,
"grad_norm": 3.563094139099121,
"learning_rate": 9.987658427312785e-06,
"loss": 0.4838,
"step": 2460
},
{
"epoch": 0.12069681643822229,
"grad_norm": 3.6437556743621826,
"learning_rate": 9.987052312983168e-06,
"loss": 0.4803,
"step": 2470
},
{
"epoch": 0.12118546751691954,
"grad_norm": 7.271683216094971,
"learning_rate": 9.986431688947083e-06,
"loss": 0.4855,
"step": 2480
},
{
"epoch": 0.1216741185956168,
"grad_norm": 4.0858941078186035,
"learning_rate": 9.98579655701022e-06,
"loss": 0.4878,
"step": 2490
},
{
"epoch": 0.12216276967431405,
"grad_norm": 4.186237335205078,
"learning_rate": 9.985146919020483e-06,
"loss": 0.4849,
"step": 2500
},
{
"epoch": 0.12216276967431405,
"eval_loss": 0.46980607509613037,
"eval_runtime": 728.0838,
"eval_samples_per_second": 242.978,
"eval_steps_per_second": 0.475,
"step": 2500
},
{
"epoch": 0.12265142075301132,
"grad_norm": 4.360340595245361,
"learning_rate": 9.984482776867975e-06,
"loss": 0.4824,
"step": 2510
},
{
"epoch": 0.12314007183170857,
"grad_norm": 2.920182228088379,
"learning_rate": 9.983804132485003e-06,
"loss": 0.4813,
"step": 2520
},
{
"epoch": 0.12362872291040583,
"grad_norm": 2.6488723754882812,
"learning_rate": 9.983110987846063e-06,
"loss": 0.4811,
"step": 2530
},
{
"epoch": 0.12411737398910308,
"grad_norm": 2.2960548400878906,
"learning_rate": 9.982403344967847e-06,
"loss": 0.4755,
"step": 2540
},
{
"epoch": 0.12460602506780034,
"grad_norm": 2.8793044090270996,
"learning_rate": 9.98168120590922e-06,
"loss": 0.4792,
"step": 2550
},
{
"epoch": 0.12509467614649758,
"grad_norm": 2.4910120964050293,
"learning_rate": 9.980944572771231e-06,
"loss": 0.4839,
"step": 2560
},
{
"epoch": 0.12558332722519486,
"grad_norm": 6.9705891609191895,
"learning_rate": 9.980193447697095e-06,
"loss": 0.4792,
"step": 2570
},
{
"epoch": 0.1260719783038921,
"grad_norm": 2.401073694229126,
"learning_rate": 9.979427832872191e-06,
"loss": 0.4788,
"step": 2580
},
{
"epoch": 0.12656062938258936,
"grad_norm": 2.653182029724121,
"learning_rate": 9.97864773052406e-06,
"loss": 0.4804,
"step": 2590
},
{
"epoch": 0.1270492804612866,
"grad_norm": 2.8506484031677246,
"learning_rate": 9.977853142922386e-06,
"loss": 0.4769,
"step": 2600
},
{
"epoch": 0.12753793153998388,
"grad_norm": 3.2540268898010254,
"learning_rate": 9.977044072379006e-06,
"loss": 0.4797,
"step": 2610
},
{
"epoch": 0.12802658261868113,
"grad_norm": 6.425643444061279,
"learning_rate": 9.976220521247888e-06,
"loss": 0.4872,
"step": 2620
},
{
"epoch": 0.12851523369737838,
"grad_norm": 3.4844772815704346,
"learning_rate": 9.975382491925137e-06,
"loss": 0.4775,
"step": 2630
},
{
"epoch": 0.12900388477607563,
"grad_norm": 2.7126948833465576,
"learning_rate": 9.974529986848976e-06,
"loss": 0.4795,
"step": 2640
},
{
"epoch": 0.1294925358547729,
"grad_norm": 3.378321409225464,
"learning_rate": 9.973663008499748e-06,
"loss": 0.4851,
"step": 2650
},
{
"epoch": 0.12998118693347016,
"grad_norm": 2.3212387561798096,
"learning_rate": 9.972781559399906e-06,
"loss": 0.4765,
"step": 2660
},
{
"epoch": 0.1304698380121674,
"grad_norm": 3.0284295082092285,
"learning_rate": 9.971885642114006e-06,
"loss": 0.4779,
"step": 2670
},
{
"epoch": 0.13095848909086466,
"grad_norm": 2.1346194744110107,
"learning_rate": 9.970975259248696e-06,
"loss": 0.4765,
"step": 2680
},
{
"epoch": 0.13144714016956194,
"grad_norm": 2.0011963844299316,
"learning_rate": 9.97005041345271e-06,
"loss": 0.4813,
"step": 2690
},
{
"epoch": 0.1319357912482592,
"grad_norm": 3.866771936416626,
"learning_rate": 9.969111107416867e-06,
"loss": 0.4766,
"step": 2700
},
{
"epoch": 0.13242444232695644,
"grad_norm": 6.982947826385498,
"learning_rate": 9.968157343874056e-06,
"loss": 0.4773,
"step": 2710
},
{
"epoch": 0.1329130934056537,
"grad_norm": 4.293519973754883,
"learning_rate": 9.967189125599228e-06,
"loss": 0.4818,
"step": 2720
},
{
"epoch": 0.13340174448435094,
"grad_norm": 3.3985178470611572,
"learning_rate": 9.966206455409386e-06,
"loss": 0.4778,
"step": 2730
},
{
"epoch": 0.13389039556304821,
"grad_norm": 1.5569087266921997,
"learning_rate": 9.96520933616359e-06,
"loss": 0.4737,
"step": 2740
},
{
"epoch": 0.13437904664174546,
"grad_norm": 4.966946125030518,
"learning_rate": 9.964197770762933e-06,
"loss": 0.4762,
"step": 2750
},
{
"epoch": 0.13486769772044271,
"grad_norm": 2.4373340606689453,
"learning_rate": 9.96317176215054e-06,
"loss": 0.4764,
"step": 2760
},
{
"epoch": 0.13535634879913996,
"grad_norm": 4.127823352813721,
"learning_rate": 9.962131313311555e-06,
"loss": 0.4753,
"step": 2770
},
{
"epoch": 0.13584499987783724,
"grad_norm": 2.2819466590881348,
"learning_rate": 9.96107642727314e-06,
"loss": 0.475,
"step": 2780
},
{
"epoch": 0.1363336509565345,
"grad_norm": 5.689523696899414,
"learning_rate": 9.960007107104462e-06,
"loss": 0.4748,
"step": 2790
},
{
"epoch": 0.13682230203523174,
"grad_norm": 2.3338582515716553,
"learning_rate": 9.958923355916682e-06,
"loss": 0.4774,
"step": 2800
},
{
"epoch": 0.137310953113929,
"grad_norm": 5.458847522735596,
"learning_rate": 9.95782517686294e-06,
"loss": 0.474,
"step": 2810
},
{
"epoch": 0.13779960419262627,
"grad_norm": 1.634664535522461,
"learning_rate": 9.956712573138371e-06,
"loss": 0.4737,
"step": 2820
},
{
"epoch": 0.13828825527132352,
"grad_norm": 1.757805585861206,
"learning_rate": 9.955585547980065e-06,
"loss": 0.4713,
"step": 2830
},
{
"epoch": 0.13877690635002077,
"grad_norm": 1.5585452318191528,
"learning_rate": 9.954444104667071e-06,
"loss": 0.4734,
"step": 2840
},
{
"epoch": 0.13926555742871802,
"grad_norm": 8.348752975463867,
"learning_rate": 9.953288246520393e-06,
"loss": 0.4754,
"step": 2850
},
{
"epoch": 0.13975420850741527,
"grad_norm": 2.4966542720794678,
"learning_rate": 9.95211797690297e-06,
"loss": 0.4719,
"step": 2860
},
{
"epoch": 0.14024285958611254,
"grad_norm": 2.205169439315796,
"learning_rate": 9.950933299219676e-06,
"loss": 0.4705,
"step": 2870
},
{
"epoch": 0.1407315106648098,
"grad_norm": 2.2777152061462402,
"learning_rate": 9.949734216917301e-06,
"loss": 0.4687,
"step": 2880
},
{
"epoch": 0.14122016174350704,
"grad_norm": 1.1067817211151123,
"learning_rate": 9.948520733484543e-06,
"loss": 0.4673,
"step": 2890
},
{
"epoch": 0.1417088128222043,
"grad_norm": 3.3773841857910156,
"learning_rate": 9.947292852452003e-06,
"loss": 0.4707,
"step": 2900
},
{
"epoch": 0.14219746390090157,
"grad_norm": 1.1769728660583496,
"learning_rate": 9.946050577392173e-06,
"loss": 0.4703,
"step": 2910
},
{
"epoch": 0.14268611497959882,
"grad_norm": 7.464486122131348,
"learning_rate": 9.94479391191942e-06,
"loss": 0.4723,
"step": 2920
},
{
"epoch": 0.14317476605829607,
"grad_norm": 2.232747793197632,
"learning_rate": 9.94352285968998e-06,
"loss": 0.4735,
"step": 2930
},
{
"epoch": 0.14366341713699332,
"grad_norm": 3.54618239402771,
"learning_rate": 9.942237424401952e-06,
"loss": 0.4695,
"step": 2940
},
{
"epoch": 0.1441520682156906,
"grad_norm": 1.840293526649475,
"learning_rate": 9.940937609795276e-06,
"loss": 0.471,
"step": 2950
},
{
"epoch": 0.14464071929438785,
"grad_norm": 3.1132638454437256,
"learning_rate": 9.939623419651732e-06,
"loss": 0.47,
"step": 2960
},
{
"epoch": 0.1451293703730851,
"grad_norm": 1.120263695716858,
"learning_rate": 9.93829485779492e-06,
"loss": 0.47,
"step": 2970
},
{
"epoch": 0.14561802145178235,
"grad_norm": 4.9828057289123535,
"learning_rate": 9.936951928090266e-06,
"loss": 0.4731,
"step": 2980
},
{
"epoch": 0.1461066725304796,
"grad_norm": 2.0490236282348633,
"learning_rate": 9.935594634444985e-06,
"loss": 0.4707,
"step": 2990
},
{
"epoch": 0.14659532360917688,
"grad_norm": 3.313997268676758,
"learning_rate": 9.93422298080809e-06,
"loss": 0.4675,
"step": 3000
},
{
"epoch": 0.14659532360917688,
"eval_loss": 0.45171666145324707,
"eval_runtime": 727.5975,
"eval_samples_per_second": 243.14,
"eval_steps_per_second": 0.476,
"step": 3000
},
{
"epoch": 0.14708397468787412,
"grad_norm": 3.7955098152160645,
"learning_rate": 9.932836971170375e-06,
"loss": 0.4759,
"step": 3010
},
{
"epoch": 0.14757262576657137,
"grad_norm": 0.955259382724762,
"learning_rate": 9.931436609564402e-06,
"loss": 0.4676,
"step": 3020
},
{
"epoch": 0.14806127684526862,
"grad_norm": 1.6290405988693237,
"learning_rate": 9.930021900064486e-06,
"loss": 0.47,
"step": 3030
},
{
"epoch": 0.1485499279239659,
"grad_norm": 4.106773376464844,
"learning_rate": 9.928592846786693e-06,
"loss": 0.4693,
"step": 3040
},
{
"epoch": 0.14903857900266315,
"grad_norm": 1.7998560667037964,
"learning_rate": 9.927149453888814e-06,
"loss": 0.4679,
"step": 3050
},
{
"epoch": 0.1495272300813604,
"grad_norm": 3.9935462474823,
"learning_rate": 9.92569172557037e-06,
"loss": 0.4675,
"step": 3060
},
{
"epoch": 0.15001588116005765,
"grad_norm": 1.6421153545379639,
"learning_rate": 9.924219666072584e-06,
"loss": 0.469,
"step": 3070
},
{
"epoch": 0.15050453223875493,
"grad_norm": 6.065347671508789,
"learning_rate": 9.922733279678376e-06,
"loss": 0.478,
"step": 3080
},
{
"epoch": 0.15099318331745218,
"grad_norm": 4.049252986907959,
"learning_rate": 9.921232570712351e-06,
"loss": 0.4734,
"step": 3090
},
{
"epoch": 0.15148183439614943,
"grad_norm": 3.9484283924102783,
"learning_rate": 9.919717543540786e-06,
"loss": 0.4702,
"step": 3100
},
{
"epoch": 0.15197048547484668,
"grad_norm": 3.8022537231445312,
"learning_rate": 9.918188202571615e-06,
"loss": 0.4674,
"step": 3110
},
{
"epoch": 0.15245913655354396,
"grad_norm": 3.4525346755981445,
"learning_rate": 9.916644552254417e-06,
"loss": 0.4724,
"step": 3120
},
{
"epoch": 0.1529477876322412,
"grad_norm": 1.305325984954834,
"learning_rate": 9.915086597080407e-06,
"loss": 0.468,
"step": 3130
},
{
"epoch": 0.15343643871093846,
"grad_norm": 2.5436055660247803,
"learning_rate": 9.913514341582415e-06,
"loss": 0.4706,
"step": 3140
},
{
"epoch": 0.1539250897896357,
"grad_norm": 2.798241376876831,
"learning_rate": 9.911927790334882e-06,
"loss": 0.4695,
"step": 3150
},
{
"epoch": 0.15441374086833295,
"grad_norm": 2.0094220638275146,
"learning_rate": 9.910326947953838e-06,
"loss": 0.4694,
"step": 3160
},
{
"epoch": 0.15490239194703023,
"grad_norm": 2.610715389251709,
"learning_rate": 9.908711819096897e-06,
"loss": 0.4668,
"step": 3170
},
{
"epoch": 0.15539104302572748,
"grad_norm": 4.6221232414245605,
"learning_rate": 9.907082408463234e-06,
"loss": 0.4679,
"step": 3180
},
{
"epoch": 0.15587969410442473,
"grad_norm": 5.538655757904053,
"learning_rate": 9.905438720793582e-06,
"loss": 0.474,
"step": 3190
},
{
"epoch": 0.15636834518312198,
"grad_norm": 2.6583926677703857,
"learning_rate": 9.903780760870208e-06,
"loss": 0.475,
"step": 3200
},
{
"epoch": 0.15685699626181926,
"grad_norm": 4.67283821105957,
"learning_rate": 9.902108533516907e-06,
"loss": 0.4693,
"step": 3210
},
{
"epoch": 0.1573456473405165,
"grad_norm": 2.7513134479522705,
"learning_rate": 9.900422043598982e-06,
"loss": 0.4675,
"step": 3220
},
{
"epoch": 0.15783429841921376,
"grad_norm": 1.7802903652191162,
"learning_rate": 9.898721296023234e-06,
"loss": 0.466,
"step": 3230
},
{
"epoch": 0.158322949497911,
"grad_norm": 2.868180751800537,
"learning_rate": 9.89700629573795e-06,
"loss": 0.4652,
"step": 3240
},
{
"epoch": 0.15881160057660829,
"grad_norm": 2.2115590572357178,
"learning_rate": 9.895277047732879e-06,
"loss": 0.4649,
"step": 3250
},
{
"epoch": 0.15930025165530554,
"grad_norm": 2.7699434757232666,
"learning_rate": 9.893533557039223e-06,
"loss": 0.466,
"step": 3260
},
{
"epoch": 0.15978890273400279,
"grad_norm": 2.4520747661590576,
"learning_rate": 9.891775828729628e-06,
"loss": 0.4639,
"step": 3270
},
{
"epoch": 0.16027755381270004,
"grad_norm": 2.2992360591888428,
"learning_rate": 9.890003867918162e-06,
"loss": 0.4643,
"step": 3280
},
{
"epoch": 0.16076620489139729,
"grad_norm": 2.04976224899292,
"learning_rate": 9.888217679760303e-06,
"loss": 0.4649,
"step": 3290
},
{
"epoch": 0.16125485597009456,
"grad_norm": 1.9434853792190552,
"learning_rate": 9.886417269452918e-06,
"loss": 0.4665,
"step": 3300
},
{
"epoch": 0.1617435070487918,
"grad_norm": 2.6264779567718506,
"learning_rate": 9.884602642234257e-06,
"loss": 0.4647,
"step": 3310
},
{
"epoch": 0.16223215812748906,
"grad_norm": 3.206934690475464,
"learning_rate": 9.882773803383934e-06,
"loss": 0.4675,
"step": 3320
},
{
"epoch": 0.1627208092061863,
"grad_norm": 7.612506866455078,
"learning_rate": 9.880930758222912e-06,
"loss": 0.4728,
"step": 3330
},
{
"epoch": 0.1632094602848836,
"grad_norm": 1.3091853857040405,
"learning_rate": 9.879073512113487e-06,
"loss": 0.4691,
"step": 3340
},
{
"epoch": 0.16369811136358084,
"grad_norm": 3.0943753719329834,
"learning_rate": 9.877202070459268e-06,
"loss": 0.4657,
"step": 3350
},
{
"epoch": 0.1641867624422781,
"grad_norm": 1.4435592889785767,
"learning_rate": 9.87531643870517e-06,
"loss": 0.465,
"step": 3360
},
{
"epoch": 0.16467541352097534,
"grad_norm": 1.4803426265716553,
"learning_rate": 9.87341662233739e-06,
"loss": 0.4637,
"step": 3370
},
{
"epoch": 0.16516406459967262,
"grad_norm": 0.7361840605735779,
"learning_rate": 9.871502626883403e-06,
"loss": 0.463,
"step": 3380
},
{
"epoch": 0.16565271567836987,
"grad_norm": 36.265968322753906,
"learning_rate": 9.869574457911925e-06,
"loss": 0.4701,
"step": 3390
},
{
"epoch": 0.16614136675706712,
"grad_norm": 1.7011586427688599,
"learning_rate": 9.86763212103292e-06,
"loss": 0.4701,
"step": 3400
},
{
"epoch": 0.16663001783576437,
"grad_norm": 2.6717042922973633,
"learning_rate": 9.865675621897571e-06,
"loss": 0.4644,
"step": 3410
},
{
"epoch": 0.16711866891446162,
"grad_norm": 2.1945064067840576,
"learning_rate": 9.86370496619826e-06,
"loss": 0.4641,
"step": 3420
},
{
"epoch": 0.1676073199931589,
"grad_norm": 1.7178106307983398,
"learning_rate": 9.861720159668566e-06,
"loss": 0.4628,
"step": 3430
},
{
"epoch": 0.16809597107185614,
"grad_norm": 1.945646047592163,
"learning_rate": 9.85972120808323e-06,
"loss": 0.4623,
"step": 3440
},
{
"epoch": 0.1685846221505534,
"grad_norm": 1.8980379104614258,
"learning_rate": 9.857708117258158e-06,
"loss": 0.4621,
"step": 3450
},
{
"epoch": 0.16907327322925064,
"grad_norm": 1.8674242496490479,
"learning_rate": 9.855680893050384e-06,
"loss": 0.4621,
"step": 3460
},
{
"epoch": 0.16956192430794792,
"grad_norm": 1.6167813539505005,
"learning_rate": 9.853639541358069e-06,
"loss": 0.4629,
"step": 3470
},
{
"epoch": 0.17005057538664517,
"grad_norm": 2.2286250591278076,
"learning_rate": 9.851584068120477e-06,
"loss": 0.4634,
"step": 3480
},
{
"epoch": 0.17053922646534242,
"grad_norm": 1.4811843633651733,
"learning_rate": 9.849514479317955e-06,
"loss": 0.4614,
"step": 3490
},
{
"epoch": 0.17102787754403967,
"grad_norm": 3.2708358764648438,
"learning_rate": 9.84743078097192e-06,
"loss": 0.4616,
"step": 3500
},
{
"epoch": 0.17102787754403967,
"eval_loss": 0.4388451874256134,
"eval_runtime": 729.7082,
"eval_samples_per_second": 242.437,
"eval_steps_per_second": 0.474,
"step": 3500
},
{
"epoch": 0.17151652862273695,
"grad_norm": 2.3546407222747803,
"learning_rate": 9.845332979144845e-06,
"loss": 0.4629,
"step": 3510
},
{
"epoch": 0.1720051797014342,
"grad_norm": 2.703920841217041,
"learning_rate": 9.84322107994023e-06,
"loss": 0.4624,
"step": 3520
},
{
"epoch": 0.17249383078013145,
"grad_norm": 2.2422356605529785,
"learning_rate": 9.841095089502595e-06,
"loss": 0.4625,
"step": 3530
},
{
"epoch": 0.1729824818588287,
"grad_norm": 1.0636337995529175,
"learning_rate": 9.838955014017455e-06,
"loss": 0.46,
"step": 3540
},
{
"epoch": 0.17347113293752595,
"grad_norm": 3.9872353076934814,
"learning_rate": 9.836800859711311e-06,
"loss": 0.4601,
"step": 3550
},
{
"epoch": 0.17395978401622322,
"grad_norm": 1.2745929956436157,
"learning_rate": 9.83463263285162e-06,
"loss": 0.4628,
"step": 3560
},
{
"epoch": 0.17444843509492047,
"grad_norm": 2.2762491703033447,
"learning_rate": 9.832450339746785e-06,
"loss": 0.4622,
"step": 3570
},
{
"epoch": 0.17493708617361772,
"grad_norm": 1.6486016511917114,
"learning_rate": 9.830253986746134e-06,
"loss": 0.4699,
"step": 3580
},
{
"epoch": 0.17542573725231497,
"grad_norm": 1.5666919946670532,
"learning_rate": 9.8280435802399e-06,
"loss": 0.4646,
"step": 3590
},
{
"epoch": 0.17591438833101225,
"grad_norm": 1.8892680406570435,
"learning_rate": 9.825819126659214e-06,
"loss": 0.4646,
"step": 3600
},
{
"epoch": 0.1764030394097095,
"grad_norm": 2.9722862243652344,
"learning_rate": 9.823580632476062e-06,
"loss": 0.4598,
"step": 3610
},
{
"epoch": 0.17689169048840675,
"grad_norm": 2.4820001125335693,
"learning_rate": 9.82132810420329e-06,
"loss": 0.4629,
"step": 3620
},
{
"epoch": 0.177380341567104,
"grad_norm": 2.4330859184265137,
"learning_rate": 9.819061548394574e-06,
"loss": 0.4611,
"step": 3630
},
{
"epoch": 0.17786899264580128,
"grad_norm": 3.4170515537261963,
"learning_rate": 9.816780971644403e-06,
"loss": 0.4647,
"step": 3640
},
{
"epoch": 0.17835764372449853,
"grad_norm": 1.6359800100326538,
"learning_rate": 9.814486380588058e-06,
"loss": 0.4629,
"step": 3650
},
{
"epoch": 0.17884629480319578,
"grad_norm": 1.80881929397583,
"learning_rate": 9.812177781901597e-06,
"loss": 0.4607,
"step": 3660
},
{
"epoch": 0.17933494588189303,
"grad_norm": 1.8495383262634277,
"learning_rate": 9.80985518230183e-06,
"loss": 0.4598,
"step": 3670
},
{
"epoch": 0.1798235969605903,
"grad_norm": 2.4074761867523193,
"learning_rate": 9.807518588546305e-06,
"loss": 0.4609,
"step": 3680
},
{
"epoch": 0.18031224803928755,
"grad_norm": 3.074289321899414,
"learning_rate": 9.805168007433283e-06,
"loss": 0.4599,
"step": 3690
},
{
"epoch": 0.1808008991179848,
"grad_norm": 3.443209648132324,
"learning_rate": 9.802803445801723e-06,
"loss": 0.4589,
"step": 3700
},
{
"epoch": 0.18128955019668205,
"grad_norm": 1.7589000463485718,
"learning_rate": 9.800424910531256e-06,
"loss": 0.4608,
"step": 3710
},
{
"epoch": 0.1817782012753793,
"grad_norm": 1.6772186756134033,
"learning_rate": 9.798032408542177e-06,
"loss": 0.4614,
"step": 3720
},
{
"epoch": 0.18226685235407658,
"grad_norm": 2.250244617462158,
"learning_rate": 9.79562594679541e-06,
"loss": 0.4601,
"step": 3730
},
{
"epoch": 0.18275550343277383,
"grad_norm": 1.436660647392273,
"learning_rate": 9.793205532292496e-06,
"loss": 0.459,
"step": 3740
},
{
"epoch": 0.18324415451147108,
"grad_norm": 2.040019989013672,
"learning_rate": 9.79077117207557e-06,
"loss": 0.4691,
"step": 3750
},
{
"epoch": 0.18373280559016833,
"grad_norm": 2.091820240020752,
"learning_rate": 9.788322873227347e-06,
"loss": 0.4624,
"step": 3760
},
{
"epoch": 0.1842214566688656,
"grad_norm": 2.1219372749328613,
"learning_rate": 9.78586064287109e-06,
"loss": 0.4614,
"step": 3770
},
{
"epoch": 0.18471010774756286,
"grad_norm": 1.5753206014633179,
"learning_rate": 9.783384488170598e-06,
"loss": 0.4635,
"step": 3780
},
{
"epoch": 0.1851987588262601,
"grad_norm": 2.6877732276916504,
"learning_rate": 9.780894416330182e-06,
"loss": 0.4626,
"step": 3790
},
{
"epoch": 0.18568740990495736,
"grad_norm": 1.7835508584976196,
"learning_rate": 9.778390434594647e-06,
"loss": 0.461,
"step": 3800
},
{
"epoch": 0.18617606098365463,
"grad_norm": 2.014145851135254,
"learning_rate": 9.775872550249266e-06,
"loss": 0.4595,
"step": 3810
},
{
"epoch": 0.18666471206235188,
"grad_norm": 1.9438420534133911,
"learning_rate": 9.77334077061976e-06,
"loss": 0.459,
"step": 3820
},
{
"epoch": 0.18715336314104913,
"grad_norm": 1.6419105529785156,
"learning_rate": 9.770795103072281e-06,
"loss": 0.4572,
"step": 3830
},
{
"epoch": 0.18764201421974638,
"grad_norm": 1.0788559913635254,
"learning_rate": 9.768235555013385e-06,
"loss": 0.4582,
"step": 3840
},
{
"epoch": 0.18813066529844363,
"grad_norm": 1.149911642074585,
"learning_rate": 9.765662133890017e-06,
"loss": 0.4573,
"step": 3850
},
{
"epoch": 0.1886193163771409,
"grad_norm": 1.5427783727645874,
"learning_rate": 9.763074847189483e-06,
"loss": 0.4637,
"step": 3860
},
{
"epoch": 0.18910796745583816,
"grad_norm": 2.3992674350738525,
"learning_rate": 9.760473702439426e-06,
"loss": 0.4629,
"step": 3870
},
{
"epoch": 0.1895966185345354,
"grad_norm": 2.0136971473693848,
"learning_rate": 9.757858707207815e-06,
"loss": 0.4584,
"step": 3880
},
{
"epoch": 0.19008526961323266,
"grad_norm": 0.9144098162651062,
"learning_rate": 9.755229869102916e-06,
"loss": 0.4597,
"step": 3890
},
{
"epoch": 0.19057392069192994,
"grad_norm": 4.437480449676514,
"learning_rate": 9.752587195773268e-06,
"loss": 0.4584,
"step": 3900
},
{
"epoch": 0.1910625717706272,
"grad_norm": 2.2352585792541504,
"learning_rate": 9.749930694907666e-06,
"loss": 0.4584,
"step": 3910
},
{
"epoch": 0.19155122284932444,
"grad_norm": 1.6085118055343628,
"learning_rate": 9.74726037423513e-06,
"loss": 0.4598,
"step": 3920
},
{
"epoch": 0.1920398739280217,
"grad_norm": 0.8404253721237183,
"learning_rate": 9.744576241524895e-06,
"loss": 0.4571,
"step": 3930
},
{
"epoch": 0.19252852500671896,
"grad_norm": 1.5468897819519043,
"learning_rate": 9.741878304586379e-06,
"loss": 0.4586,
"step": 3940
},
{
"epoch": 0.19301717608541621,
"grad_norm": 3.8875999450683594,
"learning_rate": 9.739166571269166e-06,
"loss": 0.4601,
"step": 3950
},
{
"epoch": 0.19350582716411346,
"grad_norm": 2.8351383209228516,
"learning_rate": 9.736441049462973e-06,
"loss": 0.4598,
"step": 3960
},
{
"epoch": 0.19399447824281071,
"grad_norm": 2.2148053646087646,
"learning_rate": 9.733701747097641e-06,
"loss": 0.4604,
"step": 3970
},
{
"epoch": 0.19448312932150796,
"grad_norm": 2.287990093231201,
"learning_rate": 9.730948672143105e-06,
"loss": 0.4576,
"step": 3980
},
{
"epoch": 0.19497178040020524,
"grad_norm": 1.138305902481079,
"learning_rate": 9.728181832609366e-06,
"loss": 0.458,
"step": 3990
},
{
"epoch": 0.1954604314789025,
"grad_norm": 0.8007479906082153,
"learning_rate": 9.725401236546476e-06,
"loss": 0.4593,
"step": 4000
},
{
"epoch": 0.1954604314789025,
"eval_loss": 0.43675804138183594,
"eval_runtime": 728.6855,
"eval_samples_per_second": 242.777,
"eval_steps_per_second": 0.475,
"step": 4000
},
{
"epoch": 0.19594908255759974,
"grad_norm": 3.4049859046936035,
"learning_rate": 9.722606892044516e-06,
"loss": 0.4573,
"step": 4010
},
{
"epoch": 0.196437733636297,
"grad_norm": 1.6767579317092896,
"learning_rate": 9.719798807233555e-06,
"loss": 0.461,
"step": 4020
},
{
"epoch": 0.19692638471499427,
"grad_norm": 1.4466297626495361,
"learning_rate": 9.716976990283654e-06,
"loss": 0.4629,
"step": 4030
},
{
"epoch": 0.19741503579369152,
"grad_norm": 8.470952033996582,
"learning_rate": 9.714141449404815e-06,
"loss": 0.4857,
"step": 4040
},
{
"epoch": 0.19790368687238877,
"grad_norm": 1.1967353820800781,
"learning_rate": 9.711292192846979e-06,
"loss": 0.4613,
"step": 4050
},
{
"epoch": 0.19839233795108602,
"grad_norm": 1.025749921798706,
"learning_rate": 9.708429228899984e-06,
"loss": 0.4579,
"step": 4060
},
{
"epoch": 0.1988809890297833,
"grad_norm": 2.6049964427948,
"learning_rate": 9.705552565893557e-06,
"loss": 0.46,
"step": 4070
},
{
"epoch": 0.19936964010848054,
"grad_norm": 1.4117764234542847,
"learning_rate": 9.702662212197277e-06,
"loss": 0.4598,
"step": 4080
},
{
"epoch": 0.1998582911871778,
"grad_norm": 1.602464199066162,
"learning_rate": 9.699758176220558e-06,
"loss": 0.4579,
"step": 4090
},
{
"epoch": 0.20034694226587504,
"grad_norm": 2.6832380294799805,
"learning_rate": 9.696840466412619e-06,
"loss": 0.4582,
"step": 4100
},
{
"epoch": 0.20083559334457232,
"grad_norm": 1.2473195791244507,
"learning_rate": 9.693909091262467e-06,
"loss": 0.457,
"step": 4110
},
{
"epoch": 0.20132424442326957,
"grad_norm": 1.5877009630203247,
"learning_rate": 9.690964059298866e-06,
"loss": 0.4565,
"step": 4120
},
{
"epoch": 0.20181289550196682,
"grad_norm": 2.6137261390686035,
"learning_rate": 9.688005379090315e-06,
"loss": 0.4566,
"step": 4130
},
{
"epoch": 0.20230154658066407,
"grad_norm": 2.4244110584259033,
"learning_rate": 9.68503305924502e-06,
"loss": 0.4567,
"step": 4140
},
{
"epoch": 0.20279019765936132,
"grad_norm": 2.0475914478302,
"learning_rate": 9.682047108410875e-06,
"loss": 0.458,
"step": 4150
},
{
"epoch": 0.2032788487380586,
"grad_norm": 0.8052435517311096,
"learning_rate": 9.679047535275427e-06,
"loss": 0.4567,
"step": 4160
},
{
"epoch": 0.20376749981675585,
"grad_norm": 3.230631113052368,
"learning_rate": 9.676034348565865e-06,
"loss": 0.4569,
"step": 4170
},
{
"epoch": 0.2042561508954531,
"grad_norm": 2.166372776031494,
"learning_rate": 9.673007557048981e-06,
"loss": 0.4564,
"step": 4180
},
{
"epoch": 0.20474480197415035,
"grad_norm": 1.2645494937896729,
"learning_rate": 9.669967169531148e-06,
"loss": 0.4547,
"step": 4190
},
{
"epoch": 0.20523345305284763,
"grad_norm": 2.206819772720337,
"learning_rate": 9.666913194858301e-06,
"loss": 0.4563,
"step": 4200
},
{
"epoch": 0.20572210413154488,
"grad_norm": 0.8847692608833313,
"learning_rate": 9.663845641915901e-06,
"loss": 0.4581,
"step": 4210
},
{
"epoch": 0.20621075521024212,
"grad_norm": 2.756206512451172,
"learning_rate": 9.660764519628925e-06,
"loss": 0.458,
"step": 4220
},
{
"epoch": 0.20669940628893937,
"grad_norm": 8.863795280456543,
"learning_rate": 9.657669836961816e-06,
"loss": 0.458,
"step": 4230
},
{
"epoch": 0.20718805736763665,
"grad_norm": 1.3385523557662964,
"learning_rate": 9.654561602918481e-06,
"loss": 0.4597,
"step": 4240
},
{
"epoch": 0.2076767084463339,
"grad_norm": 0.9093275666236877,
"learning_rate": 9.651439826542252e-06,
"loss": 0.4561,
"step": 4250
},
{
"epoch": 0.20816535952503115,
"grad_norm": 5.106472492218018,
"learning_rate": 9.648304516915856e-06,
"loss": 0.457,
"step": 4260
},
{
"epoch": 0.2086540106037284,
"grad_norm": 1.5481184720993042,
"learning_rate": 9.645155683161405e-06,
"loss": 0.4607,
"step": 4270
},
{
"epoch": 0.20914266168242565,
"grad_norm": 1.1324431896209717,
"learning_rate": 9.641993334440349e-06,
"loss": 0.4578,
"step": 4280
},
{
"epoch": 0.20963131276112293,
"grad_norm": 2.1077959537506104,
"learning_rate": 9.638817479953466e-06,
"loss": 0.4551,
"step": 4290
},
{
"epoch": 0.21011996383982018,
"grad_norm": 1.6443114280700684,
"learning_rate": 9.635628128940827e-06,
"loss": 0.4564,
"step": 4300
},
{
"epoch": 0.21060861491851743,
"grad_norm": 6.366663455963135,
"learning_rate": 9.632425290681771e-06,
"loss": 0.455,
"step": 4310
},
{
"epoch": 0.21109726599721468,
"grad_norm": 1.1640760898590088,
"learning_rate": 9.629208974494876e-06,
"loss": 0.4568,
"step": 4320
},
{
"epoch": 0.21158591707591196,
"grad_norm": 1.8168015480041504,
"learning_rate": 9.625979189737935e-06,
"loss": 0.4551,
"step": 4330
},
{
"epoch": 0.2120745681546092,
"grad_norm": 1.9787592887878418,
"learning_rate": 9.62273594580793e-06,
"loss": 0.4578,
"step": 4340
},
{
"epoch": 0.21256321923330646,
"grad_norm": 1.0346274375915527,
"learning_rate": 9.619479252141e-06,
"loss": 0.4559,
"step": 4350
},
{
"epoch": 0.2130518703120037,
"grad_norm": 1.2450488805770874,
"learning_rate": 9.61620911821241e-06,
"loss": 0.454,
"step": 4360
},
{
"epoch": 0.21354052139070098,
"grad_norm": 6.733485221862793,
"learning_rate": 9.61292555353654e-06,
"loss": 0.4844,
"step": 4370
},
{
"epoch": 0.21402917246939823,
"grad_norm": 3.55191707611084,
"learning_rate": 9.609628567666838e-06,
"loss": 0.4783,
"step": 4380
},
{
"epoch": 0.21451782354809548,
"grad_norm": 1.3721413612365723,
"learning_rate": 9.606318170195805e-06,
"loss": 0.4614,
"step": 4390
},
{
"epoch": 0.21500647462679273,
"grad_norm": 1.7321209907531738,
"learning_rate": 9.602994370754962e-06,
"loss": 0.457,
"step": 4400
},
{
"epoch": 0.21549512570548998,
"grad_norm": 4.538996696472168,
"learning_rate": 9.599657179014821e-06,
"loss": 0.4573,
"step": 4410
},
{
"epoch": 0.21598377678418726,
"grad_norm": 1.3797237873077393,
"learning_rate": 9.596306604684859e-06,
"loss": 0.4569,
"step": 4420
},
{
"epoch": 0.2164724278628845,
"grad_norm": 1.5394078493118286,
"learning_rate": 9.59294265751349e-06,
"loss": 0.454,
"step": 4430
},
{
"epoch": 0.21696107894158176,
"grad_norm": 5.713876247406006,
"learning_rate": 9.589565347288036e-06,
"loss": 0.4559,
"step": 4440
},
{
"epoch": 0.217449730020279,
"grad_norm": 1.5455598831176758,
"learning_rate": 9.5861746838347e-06,
"loss": 0.4556,
"step": 4450
},
{
"epoch": 0.21793838109897629,
"grad_norm": 1.5440407991409302,
"learning_rate": 9.58277067701853e-06,
"loss": 0.4548,
"step": 4460
},
{
"epoch": 0.21842703217767354,
"grad_norm": 2.495877981185913,
"learning_rate": 9.579353336743406e-06,
"loss": 0.4551,
"step": 4470
},
{
"epoch": 0.21891568325637079,
"grad_norm": 1.8251252174377441,
"learning_rate": 9.575922672951992e-06,
"loss": 0.4543,
"step": 4480
},
{
"epoch": 0.21940433433506804,
"grad_norm": 1.808957815170288,
"learning_rate": 9.572478695625722e-06,
"loss": 0.4533,
"step": 4490
},
{
"epoch": 0.2198929854137653,
"grad_norm": 1.994743824005127,
"learning_rate": 9.56902141478476e-06,
"loss": 0.4536,
"step": 4500
},
{
"epoch": 0.2198929854137653,
"eval_loss": 0.42861247062683105,
"eval_runtime": 729.5786,
"eval_samples_per_second": 242.48,
"eval_steps_per_second": 0.474,
"step": 4500
},
{
"epoch": 0.22038163649246256,
"grad_norm": 1.9364984035491943,
"learning_rate": 9.565550840487987e-06,
"loss": 0.4548,
"step": 4510
},
{
"epoch": 0.2208702875711598,
"grad_norm": 2.0791361331939697,
"learning_rate": 9.562066982832945e-06,
"loss": 0.4546,
"step": 4520
},
{
"epoch": 0.22135893864985706,
"grad_norm": 1.759068489074707,
"learning_rate": 9.55856985195584e-06,
"loss": 0.455,
"step": 4530
},
{
"epoch": 0.2218475897285543,
"grad_norm": 1.7903980016708374,
"learning_rate": 9.555059458031485e-06,
"loss": 0.4536,
"step": 4540
},
{
"epoch": 0.2223362408072516,
"grad_norm": 1.3520255088806152,
"learning_rate": 9.551535811273285e-06,
"loss": 0.4521,
"step": 4550
},
{
"epoch": 0.22282489188594884,
"grad_norm": 1.4286073446273804,
"learning_rate": 9.547998921933203e-06,
"loss": 0.4541,
"step": 4560
},
{
"epoch": 0.2233135429646461,
"grad_norm": 1.2026102542877197,
"learning_rate": 9.544448800301736e-06,
"loss": 0.4531,
"step": 4570
},
{
"epoch": 0.22380219404334334,
"grad_norm": 3.257838010787964,
"learning_rate": 9.54088545670787e-06,
"loss": 0.4614,
"step": 4580
},
{
"epoch": 0.22429084512204062,
"grad_norm": 1.2527670860290527,
"learning_rate": 9.537308901519073e-06,
"loss": 0.4606,
"step": 4590
},
{
"epoch": 0.22477949620073787,
"grad_norm": 4.201780319213867,
"learning_rate": 9.533719145141239e-06,
"loss": 0.4577,
"step": 4600
},
{
"epoch": 0.22526814727943512,
"grad_norm": 1.9157164096832275,
"learning_rate": 9.530116198018677e-06,
"loss": 0.4566,
"step": 4610
},
{
"epoch": 0.22575679835813237,
"grad_norm": 1.9841718673706055,
"learning_rate": 9.526500070634075e-06,
"loss": 0.4561,
"step": 4620
},
{
"epoch": 0.22624544943682964,
"grad_norm": 1.592416524887085,
"learning_rate": 9.522870773508466e-06,
"loss": 0.4538,
"step": 4630
},
{
"epoch": 0.2267341005155269,
"grad_norm": 1.8579721450805664,
"learning_rate": 9.519228317201201e-06,
"loss": 0.4565,
"step": 4640
},
{
"epoch": 0.22722275159422414,
"grad_norm": 0.8354905247688293,
"learning_rate": 9.51557271230992e-06,
"loss": 0.4535,
"step": 4650
},
{
"epoch": 0.2277114026729214,
"grad_norm": 1.6300770044326782,
"learning_rate": 9.51190396947051e-06,
"loss": 0.4555,
"step": 4660
},
{
"epoch": 0.22820005375161867,
"grad_norm": 2.1904120445251465,
"learning_rate": 9.508222099357094e-06,
"loss": 0.455,
"step": 4670
},
{
"epoch": 0.22868870483031592,
"grad_norm": 2.8534996509552,
"learning_rate": 9.504527112681978e-06,
"loss": 0.4551,
"step": 4680
},
{
"epoch": 0.22917735590901317,
"grad_norm": 1.0719540119171143,
"learning_rate": 9.50081902019564e-06,
"loss": 0.4531,
"step": 4690
},
{
"epoch": 0.22966600698771042,
"grad_norm": 1.4179500341415405,
"learning_rate": 9.497097832686682e-06,
"loss": 0.4531,
"step": 4700
},
{
"epoch": 0.23015465806640767,
"grad_norm": 3.2865960597991943,
"learning_rate": 9.493363560981808e-06,
"loss": 0.4531,
"step": 4710
},
{
"epoch": 0.23064330914510495,
"grad_norm": 1.4232662916183472,
"learning_rate": 9.489616215945788e-06,
"loss": 0.4542,
"step": 4720
},
{
"epoch": 0.2311319602238022,
"grad_norm": 1.7004929780960083,
"learning_rate": 9.485855808481434e-06,
"loss": 0.4537,
"step": 4730
},
{
"epoch": 0.23162061130249945,
"grad_norm": 1.8315871953964233,
"learning_rate": 9.482082349529558e-06,
"loss": 0.4544,
"step": 4740
},
{
"epoch": 0.2321092623811967,
"grad_norm": 1.7571625709533691,
"learning_rate": 9.478295850068945e-06,
"loss": 0.4528,
"step": 4750
},
{
"epoch": 0.23259791345989397,
"grad_norm": 2.470423936843872,
"learning_rate": 9.474496321116324e-06,
"loss": 0.4523,
"step": 4760
},
{
"epoch": 0.23308656453859122,
"grad_norm": 1.8932669162750244,
"learning_rate": 9.470683773726331e-06,
"loss": 0.4543,
"step": 4770
},
{
"epoch": 0.23357521561728847,
"grad_norm": 0.8342353105545044,
"learning_rate": 9.466858218991477e-06,
"loss": 0.4537,
"step": 4780
},
{
"epoch": 0.23406386669598572,
"grad_norm": 5.9539055824279785,
"learning_rate": 9.463019668042123e-06,
"loss": 0.4672,
"step": 4790
},
{
"epoch": 0.234552517774683,
"grad_norm": 1.770120620727539,
"learning_rate": 9.459168132046438e-06,
"loss": 0.4571,
"step": 4800
},
{
"epoch": 0.23504116885338025,
"grad_norm": 1.4648096561431885,
"learning_rate": 9.455303622210371e-06,
"loss": 0.4557,
"step": 4810
},
{
"epoch": 0.2355298199320775,
"grad_norm": 1.6342428922653198,
"learning_rate": 9.451426149777617e-06,
"loss": 0.4531,
"step": 4820
},
{
"epoch": 0.23601847101077475,
"grad_norm": 4.125144958496094,
"learning_rate": 9.447535726029593e-06,
"loss": 0.4532,
"step": 4830
},
{
"epoch": 0.236507122089472,
"grad_norm": 1.3270002603530884,
"learning_rate": 9.443632362285385e-06,
"loss": 0.4571,
"step": 4840
},
{
"epoch": 0.23699577316816928,
"grad_norm": 1.3583444356918335,
"learning_rate": 9.439716069901735e-06,
"loss": 0.4553,
"step": 4850
},
{
"epoch": 0.23748442424686653,
"grad_norm": 1.0656195878982544,
"learning_rate": 9.435786860273003e-06,
"loss": 0.4501,
"step": 4860
},
{
"epoch": 0.23797307532556378,
"grad_norm": 1.0093538761138916,
"learning_rate": 9.431844744831126e-06,
"loss": 0.4525,
"step": 4870
},
{
"epoch": 0.23846172640426103,
"grad_norm": 1.5308141708374023,
"learning_rate": 9.427889735045593e-06,
"loss": 0.4533,
"step": 4880
},
{
"epoch": 0.2389503774829583,
"grad_norm": 1.855191707611084,
"learning_rate": 9.423921842423406e-06,
"loss": 0.454,
"step": 4890
},
{
"epoch": 0.23943902856165555,
"grad_norm": 1.1727728843688965,
"learning_rate": 9.419941078509054e-06,
"loss": 0.4523,
"step": 4900
},
{
"epoch": 0.2399276796403528,
"grad_norm": 0.5924420356750488,
"learning_rate": 9.415947454884471e-06,
"loss": 0.4522,
"step": 4910
},
{
"epoch": 0.24041633071905005,
"grad_norm": 2.744570732116699,
"learning_rate": 9.411940983169006e-06,
"loss": 0.4529,
"step": 4920
},
{
"epoch": 0.24090498179774733,
"grad_norm": 1.7564640045166016,
"learning_rate": 9.407921675019393e-06,
"loss": 0.4532,
"step": 4930
},
{
"epoch": 0.24139363287644458,
"grad_norm": 1.4309080839157104,
"learning_rate": 9.403889542129707e-06,
"loss": 0.4533,
"step": 4940
},
{
"epoch": 0.24188228395514183,
"grad_norm": 1.6379081010818481,
"learning_rate": 9.399844596231343e-06,
"loss": 0.4515,
"step": 4950
},
{
"epoch": 0.24237093503383908,
"grad_norm": 0.7086363434791565,
"learning_rate": 9.39578684909297e-06,
"loss": 0.4527,
"step": 4960
},
{
"epoch": 0.24285958611253633,
"grad_norm": 0.987898051738739,
"learning_rate": 9.391716312520503e-06,
"loss": 0.453,
"step": 4970
},
{
"epoch": 0.2433482371912336,
"grad_norm": 1.1785643100738525,
"learning_rate": 9.387632998357073e-06,
"loss": 0.4532,
"step": 4980
},
{
"epoch": 0.24383688826993086,
"grad_norm": 2.234311819076538,
"learning_rate": 9.383536918482976e-06,
"loss": 0.4541,
"step": 4990
},
{
"epoch": 0.2443255393486281,
"grad_norm": 0.9595701098442078,
"learning_rate": 9.37942808481566e-06,
"loss": 0.4532,
"step": 5000
},
{
"epoch": 0.2443255393486281,
"eval_loss": 0.42559880018234253,
"eval_runtime": 729.4388,
"eval_samples_per_second": 242.526,
"eval_steps_per_second": 0.474,
"step": 5000
},
{
"epoch": 0.24481419042732536,
"grad_norm": 1.8065398931503296,
"learning_rate": 9.375306509309676e-06,
"loss": 0.4532,
"step": 5010
},
{
"epoch": 0.24530284150602263,
"grad_norm": 1.7283066511154175,
"learning_rate": 9.371172203956646e-06,
"loss": 0.4534,
"step": 5020
},
{
"epoch": 0.24579149258471988,
"grad_norm": 1.2136019468307495,
"learning_rate": 9.367025180785229e-06,
"loss": 0.4536,
"step": 5030
},
{
"epoch": 0.24628014366341713,
"grad_norm": 0.9906538724899292,
"learning_rate": 9.36286545186109e-06,
"loss": 0.4536,
"step": 5040
},
{
"epoch": 0.24676879474211438,
"grad_norm": 1.390766978263855,
"learning_rate": 9.358693029286855e-06,
"loss": 0.4514,
"step": 5050
},
{
"epoch": 0.24725744582081166,
"grad_norm": 1.2268085479736328,
"learning_rate": 9.354507925202088e-06,
"loss": 0.4516,
"step": 5060
},
{
"epoch": 0.2477460968995089,
"grad_norm": 2.122887372970581,
"learning_rate": 9.350310151783244e-06,
"loss": 0.4491,
"step": 5070
},
{
"epoch": 0.24823474797820616,
"grad_norm": 1.7110397815704346,
"learning_rate": 9.346099721243646e-06,
"loss": 0.4522,
"step": 5080
},
{
"epoch": 0.2487233990569034,
"grad_norm": 0.9016057252883911,
"learning_rate": 9.341876645833434e-06,
"loss": 0.4515,
"step": 5090
},
{
"epoch": 0.2492120501356007,
"grad_norm": 1.6355116367340088,
"learning_rate": 9.337640937839544e-06,
"loss": 0.4545,
"step": 5100
},
{
"epoch": 0.24970070121429794,
"grad_norm": 0.9655557870864868,
"learning_rate": 9.333392609585667e-06,
"loss": 0.455,
"step": 5110
},
{
"epoch": 0.25018935229299516,
"grad_norm": 1.8593004941940308,
"learning_rate": 9.329131673432208e-06,
"loss": 0.4522,
"step": 5120
},
{
"epoch": 0.25067800337169244,
"grad_norm": 1.764728307723999,
"learning_rate": 9.324858141776254e-06,
"loss": 0.4541,
"step": 5130
},
{
"epoch": 0.2511666544503897,
"grad_norm": 0.7554106116294861,
"learning_rate": 9.320572027051544e-06,
"loss": 0.4566,
"step": 5140
},
{
"epoch": 0.25165530552908694,
"grad_norm": 2.081536054611206,
"learning_rate": 9.316273341728423e-06,
"loss": 0.4518,
"step": 5150
},
{
"epoch": 0.2521439566077842,
"grad_norm": 0.8827464580535889,
"learning_rate": 9.311962098313809e-06,
"loss": 0.4502,
"step": 5160
},
{
"epoch": 0.2526326076864815,
"grad_norm": 0.9885526895523071,
"learning_rate": 9.307638309351162e-06,
"loss": 0.4533,
"step": 5170
},
{
"epoch": 0.2531212587651787,
"grad_norm": 1.7395132780075073,
"learning_rate": 9.303301987420436e-06,
"loss": 0.4516,
"step": 5180
},
{
"epoch": 0.253609909843876,
"grad_norm": 0.679470419883728,
"learning_rate": 9.298953145138057e-06,
"loss": 0.4514,
"step": 5190
},
{
"epoch": 0.2540985609225732,
"grad_norm": 1.4864360094070435,
"learning_rate": 9.294591795156873e-06,
"loss": 0.4502,
"step": 5200
},
{
"epoch": 0.2545872120012705,
"grad_norm": 1.3630485534667969,
"learning_rate": 9.290217950166125e-06,
"loss": 0.4508,
"step": 5210
},
{
"epoch": 0.25507586307996777,
"grad_norm": 1.3194668292999268,
"learning_rate": 9.285831622891409e-06,
"loss": 0.4511,
"step": 5220
},
{
"epoch": 0.255564514158665,
"grad_norm": 1.602607250213623,
"learning_rate": 9.281432826094635e-06,
"loss": 0.4523,
"step": 5230
},
{
"epoch": 0.25605316523736227,
"grad_norm": 0.8694007992744446,
"learning_rate": 9.277021572573996e-06,
"loss": 0.4522,
"step": 5240
},
{
"epoch": 0.2565418163160595,
"grad_norm": 0.9949777722358704,
"learning_rate": 9.272597875163925e-06,
"loss": 0.4532,
"step": 5250
},
{
"epoch": 0.25703046739475677,
"grad_norm": 1.0901665687561035,
"learning_rate": 9.268161746735063e-06,
"loss": 0.4509,
"step": 5260
},
{
"epoch": 0.25751911847345405,
"grad_norm": 0.87218177318573,
"learning_rate": 9.263713200194212e-06,
"loss": 0.4506,
"step": 5270
},
{
"epoch": 0.25800776955215127,
"grad_norm": 2.6825311183929443,
"learning_rate": 9.259252248484317e-06,
"loss": 0.4508,
"step": 5280
},
{
"epoch": 0.25849642063084854,
"grad_norm": 1.7756139039993286,
"learning_rate": 9.2547789045844e-06,
"loss": 0.4524,
"step": 5290
},
{
"epoch": 0.2589850717095458,
"grad_norm": 1.459425449371338,
"learning_rate": 9.250293181509551e-06,
"loss": 0.4525,
"step": 5300
},
{
"epoch": 0.25947372278824304,
"grad_norm": 0.5840021371841431,
"learning_rate": 9.245795092310867e-06,
"loss": 0.4508,
"step": 5310
},
{
"epoch": 0.2599623738669403,
"grad_norm": 1.1396574974060059,
"learning_rate": 9.241284650075432e-06,
"loss": 0.4498,
"step": 5320
},
{
"epoch": 0.26045102494563754,
"grad_norm": 2.9981930255889893,
"learning_rate": 9.236761867926264e-06,
"loss": 0.4538,
"step": 5330
},
{
"epoch": 0.2609396760243348,
"grad_norm": 1.627025842666626,
"learning_rate": 9.23222675902229e-06,
"loss": 0.4542,
"step": 5340
},
{
"epoch": 0.2614283271030321,
"grad_norm": 2.1768600940704346,
"learning_rate": 9.227679336558295e-06,
"loss": 0.4514,
"step": 5350
},
{
"epoch": 0.2619169781817293,
"grad_norm": 0.6379441618919373,
"learning_rate": 9.223119613764895e-06,
"loss": 0.4504,
"step": 5360
},
{
"epoch": 0.2624056292604266,
"grad_norm": 1.7971820831298828,
"learning_rate": 9.21854760390849e-06,
"loss": 0.4503,
"step": 5370
},
{
"epoch": 0.2628942803391239,
"grad_norm": 2.099776029586792,
"learning_rate": 9.213963320291232e-06,
"loss": 0.4509,
"step": 5380
},
{
"epoch": 0.2633829314178211,
"grad_norm": 1.0017653703689575,
"learning_rate": 9.209366776250984e-06,
"loss": 0.4504,
"step": 5390
},
{
"epoch": 0.2638715824965184,
"grad_norm": 1.0879532098770142,
"learning_rate": 9.204757985161274e-06,
"loss": 0.4501,
"step": 5400
},
{
"epoch": 0.2643602335752156,
"grad_norm": 1.28214693069458,
"learning_rate": 9.20013696043127e-06,
"loss": 0.4483,
"step": 5410
},
{
"epoch": 0.2648488846539129,
"grad_norm": 2.457913398742676,
"learning_rate": 9.195503715505729e-06,
"loss": 0.4517,
"step": 5420
},
{
"epoch": 0.26533753573261015,
"grad_norm": 0.9251576662063599,
"learning_rate": 9.190858263864963e-06,
"loss": 0.4515,
"step": 5430
},
{
"epoch": 0.2658261868113074,
"grad_norm": 1.5031663179397583,
"learning_rate": 9.1862006190248e-06,
"loss": 0.4499,
"step": 5440
},
{
"epoch": 0.26631483789000465,
"grad_norm": 1.5385842323303223,
"learning_rate": 9.181530794536544e-06,
"loss": 0.4497,
"step": 5450
},
{
"epoch": 0.2668034889687019,
"grad_norm": 1.0565071105957031,
"learning_rate": 9.176848803986934e-06,
"loss": 0.451,
"step": 5460
},
{
"epoch": 0.26729214004739915,
"grad_norm": 0.9009528756141663,
"learning_rate": 9.172154660998108e-06,
"loss": 0.4507,
"step": 5470
},
{
"epoch": 0.26778079112609643,
"grad_norm": 0.7359398007392883,
"learning_rate": 9.167448379227558e-06,
"loss": 0.4493,
"step": 5480
},
{
"epoch": 0.26826944220479365,
"grad_norm": 4.481854438781738,
"learning_rate": 9.162729972368098e-06,
"loss": 0.4516,
"step": 5490
},
{
"epoch": 0.26875809328349093,
"grad_norm": 1.0901057720184326,
"learning_rate": 9.157999454147814e-06,
"loss": 0.4518,
"step": 5500
},
{
"epoch": 0.26875809328349093,
"eval_loss": 0.4273635745048523,
"eval_runtime": 728.6534,
"eval_samples_per_second": 242.788,
"eval_steps_per_second": 0.475,
"step": 5500
},
{
"epoch": 0.2692467443621882,
"grad_norm": 1.3341872692108154,
"learning_rate": 9.153256838330035e-06,
"loss": 0.4499,
"step": 5510
},
{
"epoch": 0.26973539544088543,
"grad_norm": 1.7751141786575317,
"learning_rate": 9.148502138713286e-06,
"loss": 0.4491,
"step": 5520
},
{
"epoch": 0.2702240465195827,
"grad_norm": 1.0976356267929077,
"learning_rate": 9.143735369131249e-06,
"loss": 0.4496,
"step": 5530
},
{
"epoch": 0.2707126975982799,
"grad_norm": 2.7799429893493652,
"learning_rate": 9.13895654345272e-06,
"loss": 0.4501,
"step": 5540
},
{
"epoch": 0.2712013486769772,
"grad_norm": 1.4997122287750244,
"learning_rate": 9.134165675581579e-06,
"loss": 0.4494,
"step": 5550
},
{
"epoch": 0.2716899997556745,
"grad_norm": 1.3157509565353394,
"learning_rate": 9.129362779456737e-06,
"loss": 0.4505,
"step": 5560
},
{
"epoch": 0.2721786508343717,
"grad_norm": 2.182624101638794,
"learning_rate": 9.124547869052103e-06,
"loss": 0.4499,
"step": 5570
},
{
"epoch": 0.272667301913069,
"grad_norm": 0.6629562377929688,
"learning_rate": 9.11972095837654e-06,
"loss": 0.4501,
"step": 5580
},
{
"epoch": 0.2731559529917662,
"grad_norm": 0.7715067863464355,
"learning_rate": 9.114882061473827e-06,
"loss": 0.4496,
"step": 5590
},
{
"epoch": 0.2736446040704635,
"grad_norm": 1.0679346323013306,
"learning_rate": 9.110031192422613e-06,
"loss": 0.4488,
"step": 5600
},
{
"epoch": 0.27413325514916076,
"grad_norm": 2.0973806381225586,
"learning_rate": 9.105168365336389e-06,
"loss": 0.4505,
"step": 5610
},
{
"epoch": 0.274621906227858,
"grad_norm": 1.7515530586242676,
"learning_rate": 9.100293594363425e-06,
"loss": 0.4498,
"step": 5620
},
{
"epoch": 0.27511055730655526,
"grad_norm": 1.3219352960586548,
"learning_rate": 9.095406893686752e-06,
"loss": 0.45,
"step": 5630
},
{
"epoch": 0.27559920838525254,
"grad_norm": 1.7914499044418335,
"learning_rate": 9.090508277524103e-06,
"loss": 0.4506,
"step": 5640
},
{
"epoch": 0.27608785946394976,
"grad_norm": 1.048553228378296,
"learning_rate": 9.085597760127884e-06,
"loss": 0.4479,
"step": 5650
},
{
"epoch": 0.27657651054264704,
"grad_norm": 0.9424349069595337,
"learning_rate": 9.080675355785123e-06,
"loss": 0.4479,
"step": 5660
},
{
"epoch": 0.27706516162134426,
"grad_norm": 2.2007129192352295,
"learning_rate": 9.075741078817435e-06,
"loss": 0.4517,
"step": 5670
},
{
"epoch": 0.27755381270004154,
"grad_norm": 1.4200412034988403,
"learning_rate": 9.070794943580978e-06,
"loss": 0.4503,
"step": 5680
},
{
"epoch": 0.2780424637787388,
"grad_norm": 3.359553575515747,
"learning_rate": 9.065836964466412e-06,
"loss": 0.4504,
"step": 5690
},
{
"epoch": 0.27853111485743604,
"grad_norm": 1.0638636350631714,
"learning_rate": 9.060867155898856e-06,
"loss": 0.4503,
"step": 5700
},
{
"epoch": 0.2790197659361333,
"grad_norm": 1.592399001121521,
"learning_rate": 9.055885532337847e-06,
"loss": 0.4485,
"step": 5710
},
{
"epoch": 0.27950841701483053,
"grad_norm": 0.6336447596549988,
"learning_rate": 9.050892108277292e-06,
"loss": 0.4486,
"step": 5720
},
{
"epoch": 0.2799970680935278,
"grad_norm": 2.1107187271118164,
"learning_rate": 9.045886898245441e-06,
"loss": 0.451,
"step": 5730
},
{
"epoch": 0.2804857191722251,
"grad_norm": 1.656101107597351,
"learning_rate": 9.040869916804827e-06,
"loss": 0.4494,
"step": 5740
},
{
"epoch": 0.2809743702509223,
"grad_norm": 1.3328661918640137,
"learning_rate": 9.035841178552236e-06,
"loss": 0.4492,
"step": 5750
},
{
"epoch": 0.2814630213296196,
"grad_norm": 0.48556625843048096,
"learning_rate": 9.030800698118658e-06,
"loss": 0.4494,
"step": 5760
},
{
"epoch": 0.28195167240831687,
"grad_norm": 2.595662832260132,
"learning_rate": 9.025748490169248e-06,
"loss": 0.4498,
"step": 5770
},
{
"epoch": 0.2824403234870141,
"grad_norm": 0.8997907042503357,
"learning_rate": 9.02068456940328e-06,
"loss": 0.4482,
"step": 5780
},
{
"epoch": 0.28292897456571137,
"grad_norm": 1.9101444482803345,
"learning_rate": 9.01560895055411e-06,
"loss": 0.4495,
"step": 5790
},
{
"epoch": 0.2834176256444086,
"grad_norm": 0.7567463517189026,
"learning_rate": 9.010521648389122e-06,
"loss": 0.4501,
"step": 5800
},
{
"epoch": 0.28390627672310587,
"grad_norm": 3.035726547241211,
"learning_rate": 9.005422677709701e-06,
"loss": 0.4499,
"step": 5810
},
{
"epoch": 0.28439492780180314,
"grad_norm": 1.5301775932312012,
"learning_rate": 9.000312053351175e-06,
"loss": 0.4484,
"step": 5820
},
{
"epoch": 0.28488357888050037,
"grad_norm": 1.8312554359436035,
"learning_rate": 8.995189790182782e-06,
"loss": 0.4486,
"step": 5830
},
{
"epoch": 0.28537222995919764,
"grad_norm": 1.362288236618042,
"learning_rate": 8.99005590310762e-06,
"loss": 0.4497,
"step": 5840
},
{
"epoch": 0.28586088103789487,
"grad_norm": 1.4402492046356201,
"learning_rate": 8.984910407062608e-06,
"loss": 0.4496,
"step": 5850
},
{
"epoch": 0.28634953211659214,
"grad_norm": 0.9459155201911926,
"learning_rate": 8.97975331701844e-06,
"loss": 0.4485,
"step": 5860
},
{
"epoch": 0.2868381831952894,
"grad_norm": 1.4187127351760864,
"learning_rate": 8.974584647979546e-06,
"loss": 0.449,
"step": 5870
},
{
"epoch": 0.28732683427398664,
"grad_norm": 2.6295182704925537,
"learning_rate": 8.969404414984035e-06,
"loss": 0.4493,
"step": 5880
},
{
"epoch": 0.2878154853526839,
"grad_norm": 1.6124824285507202,
"learning_rate": 8.964212633103674e-06,
"loss": 0.4496,
"step": 5890
},
{
"epoch": 0.2883041364313812,
"grad_norm": 0.6683453321456909,
"learning_rate": 8.959009317443825e-06,
"loss": 0.4484,
"step": 5900
},
{
"epoch": 0.2887927875100784,
"grad_norm": 1.6014492511749268,
"learning_rate": 8.953794483143406e-06,
"loss": 0.4483,
"step": 5910
},
{
"epoch": 0.2892814385887757,
"grad_norm": 1.014033317565918,
"learning_rate": 8.948568145374849e-06,
"loss": 0.449,
"step": 5920
},
{
"epoch": 0.2897700896674729,
"grad_norm": 1.4940074682235718,
"learning_rate": 8.943330319344055e-06,
"loss": 0.4496,
"step": 5930
},
{
"epoch": 0.2902587407461702,
"grad_norm": 0.863261342048645,
"learning_rate": 8.938081020290352e-06,
"loss": 0.4495,
"step": 5940
},
{
"epoch": 0.2907473918248675,
"grad_norm": 1.5809766054153442,
"learning_rate": 8.932820263486447e-06,
"loss": 0.4493,
"step": 5950
},
{
"epoch": 0.2912360429035647,
"grad_norm": 0.7684280276298523,
"learning_rate": 8.927548064238383e-06,
"loss": 0.4492,
"step": 5960
},
{
"epoch": 0.291724693982262,
"grad_norm": 2.1927716732025146,
"learning_rate": 8.922264437885492e-06,
"loss": 0.451,
"step": 5970
},
{
"epoch": 0.2922133450609592,
"grad_norm": 1.0817362070083618,
"learning_rate": 8.916969399800359e-06,
"loss": 0.4506,
"step": 5980
},
{
"epoch": 0.2927019961396565,
"grad_norm": 0.7948960661888123,
"learning_rate": 8.911662965388765e-06,
"loss": 0.4499,
"step": 5990
},
{
"epoch": 0.29319064721835375,
"grad_norm": 0.9926490187644958,
"learning_rate": 8.906345150089652e-06,
"loss": 0.4486,
"step": 6000
},
{
"epoch": 0.29319064721835375,
"eval_loss": 0.4233919382095337,
"eval_runtime": 728.6738,
"eval_samples_per_second": 242.781,
"eval_steps_per_second": 0.475,
"step": 6000
},
{
"epoch": 0.293679298297051,
"grad_norm": 1.070708155632019,
"learning_rate": 8.901015969375074e-06,
"loss": 0.4497,
"step": 6010
},
{
"epoch": 0.29416794937574825,
"grad_norm": 1.2505017518997192,
"learning_rate": 8.89567543875015e-06,
"loss": 0.4479,
"step": 6020
},
{
"epoch": 0.2946566004544455,
"grad_norm": 0.6546292304992676,
"learning_rate": 8.890323573753023e-06,
"loss": 0.4495,
"step": 6030
},
{
"epoch": 0.29514525153314275,
"grad_norm": 5.781423091888428,
"learning_rate": 8.884960389954813e-06,
"loss": 0.4478,
"step": 6040
},
{
"epoch": 0.29563390261184,
"grad_norm": 1.123044490814209,
"learning_rate": 8.879585902959573e-06,
"loss": 0.4493,
"step": 6050
},
{
"epoch": 0.29612255369053725,
"grad_norm": 1.8155452013015747,
"learning_rate": 8.874200128404242e-06,
"loss": 0.4504,
"step": 6060
},
{
"epoch": 0.2966112047692345,
"grad_norm": 1.4578708410263062,
"learning_rate": 8.868803081958597e-06,
"loss": 0.4503,
"step": 6070
},
{
"epoch": 0.2970998558479318,
"grad_norm": 1.241621971130371,
"learning_rate": 8.863394779325212e-06,
"loss": 0.4495,
"step": 6080
},
{
"epoch": 0.297588506926629,
"grad_norm": 0.9442185759544373,
"learning_rate": 8.857975236239412e-06,
"loss": 0.4484,
"step": 6090
},
{
"epoch": 0.2980771580053263,
"grad_norm": 1.3439468145370483,
"learning_rate": 8.852544468469224e-06,
"loss": 0.4488,
"step": 6100
},
{
"epoch": 0.2985658090840235,
"grad_norm": 2.7450032234191895,
"learning_rate": 8.847102491815336e-06,
"loss": 0.4488,
"step": 6110
},
{
"epoch": 0.2990544601627208,
"grad_norm": 1.1001813411712646,
"learning_rate": 8.841649322111044e-06,
"loss": 0.4501,
"step": 6120
},
{
"epoch": 0.2995431112414181,
"grad_norm": 0.6491206884384155,
"learning_rate": 8.836184975222212e-06,
"loss": 0.4474,
"step": 6130
},
{
"epoch": 0.3000317623201153,
"grad_norm": 0.40915462374687195,
"learning_rate": 8.830709467047223e-06,
"loss": 0.4486,
"step": 6140
},
{
"epoch": 0.3005204133988126,
"grad_norm": 0.9558333158493042,
"learning_rate": 8.825222813516933e-06,
"loss": 0.4468,
"step": 6150
},
{
"epoch": 0.30100906447750986,
"grad_norm": 1.2985563278198242,
"learning_rate": 8.819725030594626e-06,
"loss": 0.4484,
"step": 6160
},
{
"epoch": 0.3014977155562071,
"grad_norm": 1.1261284351348877,
"learning_rate": 8.81421613427597e-06,
"loss": 0.4493,
"step": 6170
},
{
"epoch": 0.30198636663490436,
"grad_norm": 1.677819848060608,
"learning_rate": 8.80869614058896e-06,
"loss": 0.4476,
"step": 6180
},
{
"epoch": 0.3024750177136016,
"grad_norm": 1.6651966571807861,
"learning_rate": 8.803165065593884e-06,
"loss": 0.4473,
"step": 6190
},
{
"epoch": 0.30296366879229886,
"grad_norm": 0.8978771567344666,
"learning_rate": 8.797622925383267e-06,
"loss": 0.4478,
"step": 6200
},
{
"epoch": 0.30345231987099613,
"grad_norm": 0.6011471748352051,
"learning_rate": 8.792069736081835e-06,
"loss": 0.4478,
"step": 6210
},
{
"epoch": 0.30394097094969336,
"grad_norm": 3.1353821754455566,
"learning_rate": 8.78650551384645e-06,
"loss": 0.4515,
"step": 6220
},
{
"epoch": 0.30442962202839063,
"grad_norm": 1.1291117668151855,
"learning_rate": 8.780930274866084e-06,
"loss": 0.4498,
"step": 6230
},
{
"epoch": 0.3049182731070879,
"grad_norm": 0.6393253803253174,
"learning_rate": 8.775344035361758e-06,
"loss": 0.4489,
"step": 6240
},
{
"epoch": 0.30540692418578513,
"grad_norm": 1.493739366531372,
"learning_rate": 8.7697468115865e-06,
"loss": 0.4498,
"step": 6250
},
{
"epoch": 0.3058955752644824,
"grad_norm": 1.8243303298950195,
"learning_rate": 8.76413861982529e-06,
"loss": 0.4492,
"step": 6260
},
{
"epoch": 0.30638422634317963,
"grad_norm": 0.7140172719955444,
"learning_rate": 8.758519476395029e-06,
"loss": 0.4478,
"step": 6270
},
{
"epoch": 0.3068728774218769,
"grad_norm": 0.9651872515678406,
"learning_rate": 8.752889397644478e-06,
"loss": 0.4484,
"step": 6280
},
{
"epoch": 0.3073615285005742,
"grad_norm": 0.4499496817588806,
"learning_rate": 8.747248399954212e-06,
"loss": 0.4475,
"step": 6290
},
{
"epoch": 0.3078501795792714,
"grad_norm": 1.09201180934906,
"learning_rate": 8.741596499736573e-06,
"loss": 0.4491,
"step": 6300
},
{
"epoch": 0.3083388306579687,
"grad_norm": 0.835132360458374,
"learning_rate": 8.735933713435627e-06,
"loss": 0.4479,
"step": 6310
},
{
"epoch": 0.3088274817366659,
"grad_norm": 0.7163196802139282,
"learning_rate": 8.730260057527116e-06,
"loss": 0.4484,
"step": 6320
},
{
"epoch": 0.3093161328153632,
"grad_norm": 1.1830068826675415,
"learning_rate": 8.724575548518397e-06,
"loss": 0.4475,
"step": 6330
},
{
"epoch": 0.30980478389406046,
"grad_norm": 1.2740248441696167,
"learning_rate": 8.718880202948414e-06,
"loss": 0.447,
"step": 6340
},
{
"epoch": 0.3102934349727577,
"grad_norm": 1.1490364074707031,
"learning_rate": 8.713174037387633e-06,
"loss": 0.447,
"step": 6350
},
{
"epoch": 0.31078208605145496,
"grad_norm": 1.9249966144561768,
"learning_rate": 8.707457068438004e-06,
"loss": 0.4477,
"step": 6360
},
{
"epoch": 0.31127073713015224,
"grad_norm": 1.1233280897140503,
"learning_rate": 8.701729312732907e-06,
"loss": 0.45,
"step": 6370
},
{
"epoch": 0.31175938820884946,
"grad_norm": 0.5614790916442871,
"learning_rate": 8.695990786937109e-06,
"loss": 0.447,
"step": 6380
},
{
"epoch": 0.31224803928754674,
"grad_norm": 0.8090300559997559,
"learning_rate": 8.690241507746706e-06,
"loss": 0.4493,
"step": 6390
},
{
"epoch": 0.31273669036624396,
"grad_norm": 0.9170634746551514,
"learning_rate": 8.68448149188909e-06,
"loss": 0.4479,
"step": 6400
},
{
"epoch": 0.31322534144494124,
"grad_norm": 0.8162520527839661,
"learning_rate": 8.67871075612288e-06,
"loss": 0.4473,
"step": 6410
},
{
"epoch": 0.3137139925236385,
"grad_norm": 2.09964656829834,
"learning_rate": 8.672929317237897e-06,
"loss": 0.4466,
"step": 6420
},
{
"epoch": 0.31420264360233574,
"grad_norm": 1.2079427242279053,
"learning_rate": 8.667137192055093e-06,
"loss": 0.4483,
"step": 6430
},
{
"epoch": 0.314691294681033,
"grad_norm": 0.8319594860076904,
"learning_rate": 8.661334397426511e-06,
"loss": 0.4457,
"step": 6440
},
{
"epoch": 0.31517994575973024,
"grad_norm": 1.2110413312911987,
"learning_rate": 8.655520950235243e-06,
"loss": 0.449,
"step": 6450
},
{
"epoch": 0.3156685968384275,
"grad_norm": 1.1097526550292969,
"learning_rate": 8.649696867395372e-06,
"loss": 0.4482,
"step": 6460
},
{
"epoch": 0.3161572479171248,
"grad_norm": 0.4162759482860565,
"learning_rate": 8.643862165851922e-06,
"loss": 0.4465,
"step": 6470
},
{
"epoch": 0.316645898995822,
"grad_norm": 0.8267191052436829,
"learning_rate": 8.638016862580814e-06,
"loss": 0.4469,
"step": 6480
},
{
"epoch": 0.3171345500745193,
"grad_norm": 1.518624186515808,
"learning_rate": 8.632160974588817e-06,
"loss": 0.4482,
"step": 6490
},
{
"epoch": 0.31762320115321657,
"grad_norm": 0.7973819375038147,
"learning_rate": 8.62629451891349e-06,
"loss": 0.448,
"step": 6500
},
{
"epoch": 0.31762320115321657,
"eval_loss": 0.4212668538093567,
"eval_runtime": 728.4104,
"eval_samples_per_second": 242.869,
"eval_steps_per_second": 0.475,
"step": 6500
},
{
"epoch": 0.3181118522319138,
"grad_norm": 1.7393572330474854,
"learning_rate": 8.620417512623145e-06,
"loss": 0.4462,
"step": 6510
},
{
"epoch": 0.31860050331061107,
"grad_norm": 0.8156083226203918,
"learning_rate": 8.614529972816787e-06,
"loss": 0.4478,
"step": 6520
},
{
"epoch": 0.3190891543893083,
"grad_norm": 0.6622930765151978,
"learning_rate": 8.608631916624069e-06,
"loss": 0.4468,
"step": 6530
},
{
"epoch": 0.31957780546800557,
"grad_norm": 1.1308300495147705,
"learning_rate": 8.602723361205241e-06,
"loss": 0.4467,
"step": 6540
},
{
"epoch": 0.32006645654670285,
"grad_norm": 0.8318139314651489,
"learning_rate": 8.596804323751098e-06,
"loss": 0.4471,
"step": 6550
},
{
"epoch": 0.32055510762540007,
"grad_norm": 0.5246617794036865,
"learning_rate": 8.590874821482937e-06,
"loss": 0.446,
"step": 6560
},
{
"epoch": 0.32104375870409735,
"grad_norm": 0.8752800226211548,
"learning_rate": 8.584934871652498e-06,
"loss": 0.4468,
"step": 6570
},
{
"epoch": 0.32153240978279457,
"grad_norm": 1.248165249824524,
"learning_rate": 8.57898449154192e-06,
"loss": 0.448,
"step": 6580
},
{
"epoch": 0.32202106086149185,
"grad_norm": 1.0610485076904297,
"learning_rate": 8.573023698463689e-06,
"loss": 0.4468,
"step": 6590
},
{
"epoch": 0.3225097119401891,
"grad_norm": 3.7733728885650635,
"learning_rate": 8.567052509760586e-06,
"loss": 0.4538,
"step": 6600
},
{
"epoch": 0.32299836301888635,
"grad_norm": 3.644801616668701,
"learning_rate": 8.561070942805636e-06,
"loss": 0.449,
"step": 6610
},
{
"epoch": 0.3234870140975836,
"grad_norm": 0.774163544178009,
"learning_rate": 8.555079015002063e-06,
"loss": 0.4471,
"step": 6620
},
{
"epoch": 0.3239756651762809,
"grad_norm": 1.7043198347091675,
"learning_rate": 8.549076743783236e-06,
"loss": 0.4474,
"step": 6630
},
{
"epoch": 0.3244643162549781,
"grad_norm": 1.1995218992233276,
"learning_rate": 8.543064146612612e-06,
"loss": 0.4477,
"step": 6640
},
{
"epoch": 0.3249529673336754,
"grad_norm": 1.5275466442108154,
"learning_rate": 8.5370412409837e-06,
"loss": 0.448,
"step": 6650
},
{
"epoch": 0.3254416184123726,
"grad_norm": 0.8573246002197266,
"learning_rate": 8.53100804441999e-06,
"loss": 0.4474,
"step": 6660
},
{
"epoch": 0.3259302694910699,
"grad_norm": 1.1308470964431763,
"learning_rate": 8.524964574474925e-06,
"loss": 0.4466,
"step": 6670
},
{
"epoch": 0.3264189205697672,
"grad_norm": 1.240512728691101,
"learning_rate": 8.51891084873183e-06,
"loss": 0.4463,
"step": 6680
},
{
"epoch": 0.3269075716484644,
"grad_norm": 2.6846487522125244,
"learning_rate": 8.512846884803874e-06,
"loss": 0.4476,
"step": 6690
},
{
"epoch": 0.3273962227271617,
"grad_norm": 0.7580792307853699,
"learning_rate": 8.506772700334008e-06,
"loss": 0.4463,
"step": 6700
},
{
"epoch": 0.3278848738058589,
"grad_norm": 0.49652209877967834,
"learning_rate": 8.500688312994925e-06,
"loss": 0.4471,
"step": 6710
},
{
"epoch": 0.3283735248845562,
"grad_norm": 2.0272531509399414,
"learning_rate": 8.494593740489e-06,
"loss": 0.4465,
"step": 6720
},
{
"epoch": 0.32886217596325346,
"grad_norm": 1.3837034702301025,
"learning_rate": 8.488489000548244e-06,
"loss": 0.4493,
"step": 6730
},
{
"epoch": 0.3293508270419507,
"grad_norm": 1.1367080211639404,
"learning_rate": 8.482374110934246e-06,
"loss": 0.4474,
"step": 6740
},
{
"epoch": 0.32983947812064796,
"grad_norm": 1.121301531791687,
"learning_rate": 8.476249089438129e-06,
"loss": 0.4459,
"step": 6750
},
{
"epoch": 0.33032812919934523,
"grad_norm": 0.9756953120231628,
"learning_rate": 8.470113953880493e-06,
"loss": 0.4468,
"step": 6760
},
{
"epoch": 0.33081678027804245,
"grad_norm": 1.3827910423278809,
"learning_rate": 8.463968722111362e-06,
"loss": 0.4473,
"step": 6770
},
{
"epoch": 0.33130543135673973,
"grad_norm": 0.6767109632492065,
"learning_rate": 8.45781341201014e-06,
"loss": 0.447,
"step": 6780
},
{
"epoch": 0.33179408243543695,
"grad_norm": 1.0480477809906006,
"learning_rate": 8.451648041485551e-06,
"loss": 0.4469,
"step": 6790
},
{
"epoch": 0.33228273351413423,
"grad_norm": 1.5709936618804932,
"learning_rate": 8.445472628475588e-06,
"loss": 0.4471,
"step": 6800
},
{
"epoch": 0.3327713845928315,
"grad_norm": 1.5795131921768188,
"learning_rate": 8.439287190947464e-06,
"loss": 0.447,
"step": 6810
},
{
"epoch": 0.33326003567152873,
"grad_norm": 1.1700830459594727,
"learning_rate": 8.433091746897559e-06,
"loss": 0.4455,
"step": 6820
},
{
"epoch": 0.333748686750226,
"grad_norm": 1.7184573411941528,
"learning_rate": 8.426886314351363e-06,
"loss": 0.4458,
"step": 6830
},
{
"epoch": 0.33423733782892323,
"grad_norm": 0.4313448667526245,
"learning_rate": 8.420670911363433e-06,
"loss": 0.447,
"step": 6840
},
{
"epoch": 0.3347259889076205,
"grad_norm": 1.0812926292419434,
"learning_rate": 8.41444555601733e-06,
"loss": 0.4456,
"step": 6850
},
{
"epoch": 0.3352146399863178,
"grad_norm": 1.1345865726470947,
"learning_rate": 8.40821026642557e-06,
"loss": 0.447,
"step": 6860
},
{
"epoch": 0.335703291065015,
"grad_norm": 0.6373735070228577,
"learning_rate": 8.401965060729582e-06,
"loss": 0.4451,
"step": 6870
},
{
"epoch": 0.3361919421437123,
"grad_norm": 6.616238594055176,
"learning_rate": 8.395709957099633e-06,
"loss": 0.4475,
"step": 6880
},
{
"epoch": 0.33668059322240956,
"grad_norm": 0.9826495051383972,
"learning_rate": 8.389444973734797e-06,
"loss": 0.4486,
"step": 6890
},
{
"epoch": 0.3371692443011068,
"grad_norm": 1.7973625659942627,
"learning_rate": 8.383170128862887e-06,
"loss": 0.4473,
"step": 6900
},
{
"epoch": 0.33765789537980406,
"grad_norm": 0.9026411175727844,
"learning_rate": 8.376885440740414e-06,
"loss": 0.4472,
"step": 6910
},
{
"epoch": 0.3381465464585013,
"grad_norm": 0.9952638149261475,
"learning_rate": 8.37059092765252e-06,
"loss": 0.4461,
"step": 6920
},
{
"epoch": 0.33863519753719856,
"grad_norm": 2.210338830947876,
"learning_rate": 8.364286607912938e-06,
"loss": 0.4487,
"step": 6930
},
{
"epoch": 0.33912384861589584,
"grad_norm": 1.286643385887146,
"learning_rate": 8.357972499863933e-06,
"loss": 0.4469,
"step": 6940
},
{
"epoch": 0.33961249969459306,
"grad_norm": 1.2331130504608154,
"learning_rate": 8.351648621876248e-06,
"loss": 0.4479,
"step": 6950
},
{
"epoch": 0.34010115077329034,
"grad_norm": 0.7784949541091919,
"learning_rate": 8.345314992349047e-06,
"loss": 0.4468,
"step": 6960
},
{
"epoch": 0.34058980185198756,
"grad_norm": 3.558990955352783,
"learning_rate": 8.338971629709873e-06,
"loss": 0.4455,
"step": 6970
},
{
"epoch": 0.34107845293068484,
"grad_norm": 0.712576150894165,
"learning_rate": 8.332618552414585e-06,
"loss": 0.4461,
"step": 6980
},
{
"epoch": 0.3415671040093821,
"grad_norm": 1.1077570915222168,
"learning_rate": 8.326255778947303e-06,
"loss": 0.4453,
"step": 6990
},
{
"epoch": 0.34205575508807934,
"grad_norm": 1.3067269325256348,
"learning_rate": 8.319883327820363e-06,
"loss": 0.4462,
"step": 7000
},
{
"epoch": 0.34205575508807934,
"eval_loss": 0.4191921055316925,
"eval_runtime": 728.4719,
"eval_samples_per_second": 242.848,
"eval_steps_per_second": 0.475,
"step": 7000
},
{
"epoch": 0.3425444061667766,
"grad_norm": 1.001678705215454,
"learning_rate": 8.313501217574253e-06,
"loss": 0.4465,
"step": 7010
},
{
"epoch": 0.3430330572454739,
"grad_norm": 0.7304960489273071,
"learning_rate": 8.307109466777567e-06,
"loss": 0.4458,
"step": 7020
},
{
"epoch": 0.3435217083241711,
"grad_norm": 0.7707636952400208,
"learning_rate": 8.30070809402695e-06,
"loss": 0.4441,
"step": 7030
},
{
"epoch": 0.3440103594028684,
"grad_norm": 0.9046769142150879,
"learning_rate": 8.294297117947035e-06,
"loss": 0.4445,
"step": 7040
},
{
"epoch": 0.3444990104815656,
"grad_norm": 0.8245752453804016,
"learning_rate": 8.287876557190402e-06,
"loss": 0.444,
"step": 7050
},
{
"epoch": 0.3449876615602629,
"grad_norm": 1.746430516242981,
"learning_rate": 8.281446430437516e-06,
"loss": 0.4469,
"step": 7060
},
{
"epoch": 0.34547631263896017,
"grad_norm": 1.3313848972320557,
"learning_rate": 8.27500675639667e-06,
"loss": 0.4473,
"step": 7070
},
{
"epoch": 0.3459649637176574,
"grad_norm": 1.182501196861267,
"learning_rate": 8.26855755380394e-06,
"loss": 0.4453,
"step": 7080
},
{
"epoch": 0.34645361479635467,
"grad_norm": 2.6568055152893066,
"learning_rate": 8.262098841423126e-06,
"loss": 0.4462,
"step": 7090
},
{
"epoch": 0.3469422658750519,
"grad_norm": 1.4778715372085571,
"learning_rate": 8.255630638045685e-06,
"loss": 0.4463,
"step": 7100
},
{
"epoch": 0.34743091695374917,
"grad_norm": 1.463995099067688,
"learning_rate": 8.249152962490705e-06,
"loss": 0.4468,
"step": 7110
},
{
"epoch": 0.34791956803244645,
"grad_norm": 0.9242321848869324,
"learning_rate": 8.242665833604818e-06,
"loss": 0.446,
"step": 7120
},
{
"epoch": 0.34840821911114367,
"grad_norm": 0.8648793697357178,
"learning_rate": 8.236169270262168e-06,
"loss": 0.4447,
"step": 7130
},
{
"epoch": 0.34889687018984095,
"grad_norm": 0.7932630777359009,
"learning_rate": 8.229663291364349e-06,
"loss": 0.4458,
"step": 7140
},
{
"epoch": 0.3493855212685382,
"grad_norm": 2.303868055343628,
"learning_rate": 8.223147915840347e-06,
"loss": 0.446,
"step": 7150
},
{
"epoch": 0.34987417234723545,
"grad_norm": 0.47625330090522766,
"learning_rate": 8.216623162646487e-06,
"loss": 0.4469,
"step": 7160
},
{
"epoch": 0.3503628234259327,
"grad_norm": 0.5169132947921753,
"learning_rate": 8.210089050766374e-06,
"loss": 0.4461,
"step": 7170
},
{
"epoch": 0.35085147450462995,
"grad_norm": 1.1093195676803589,
"learning_rate": 8.203545599210851e-06,
"loss": 0.4457,
"step": 7180
},
{
"epoch": 0.3513401255833272,
"grad_norm": 1.9182569980621338,
"learning_rate": 8.19699282701793e-06,
"loss": 0.4453,
"step": 7190
},
{
"epoch": 0.3518287766620245,
"grad_norm": 0.5894930958747864,
"learning_rate": 8.190430753252742e-06,
"loss": 0.4462,
"step": 7200
},
{
"epoch": 0.3523174277407217,
"grad_norm": 1.633952260017395,
"learning_rate": 8.183859397007476e-06,
"loss": 0.4446,
"step": 7210
},
{
"epoch": 0.352806078819419,
"grad_norm": 1.9727741479873657,
"learning_rate": 8.177278777401332e-06,
"loss": 0.448,
"step": 7220
},
{
"epoch": 0.3532947298981163,
"grad_norm": 1.4541544914245605,
"learning_rate": 8.170688913580465e-06,
"loss": 0.4474,
"step": 7230
},
{
"epoch": 0.3537833809768135,
"grad_norm": 2.3945956230163574,
"learning_rate": 8.16408982471792e-06,
"loss": 0.4456,
"step": 7240
},
{
"epoch": 0.3542720320555108,
"grad_norm": 0.821062445640564,
"learning_rate": 8.157481530013586e-06,
"loss": 0.4459,
"step": 7250
},
{
"epoch": 0.354760683134208,
"grad_norm": 0.6615464687347412,
"learning_rate": 8.150864048694132e-06,
"loss": 0.4458,
"step": 7260
},
{
"epoch": 0.3552493342129053,
"grad_norm": 0.6758638620376587,
"learning_rate": 8.14423740001296e-06,
"loss": 0.4441,
"step": 7270
},
{
"epoch": 0.35573798529160255,
"grad_norm": 1.2416491508483887,
"learning_rate": 8.137601603250139e-06,
"loss": 0.4454,
"step": 7280
},
{
"epoch": 0.3562266363702998,
"grad_norm": 0.828959584236145,
"learning_rate": 8.13095667771236e-06,
"loss": 0.4444,
"step": 7290
},
{
"epoch": 0.35671528744899705,
"grad_norm": 0.5700317025184631,
"learning_rate": 8.124302642732871e-06,
"loss": 0.4459,
"step": 7300
},
{
"epoch": 0.3572039385276943,
"grad_norm": 0.6910264492034912,
"learning_rate": 8.117639517671421e-06,
"loss": 0.4446,
"step": 7310
},
{
"epoch": 0.35769258960639155,
"grad_norm": 1.0732626914978027,
"learning_rate": 8.11096732191421e-06,
"loss": 0.4457,
"step": 7320
},
{
"epoch": 0.35818124068508883,
"grad_norm": 0.9882492423057556,
"learning_rate": 8.10428607487383e-06,
"loss": 0.445,
"step": 7330
},
{
"epoch": 0.35866989176378605,
"grad_norm": 0.5441588163375854,
"learning_rate": 8.097595795989203e-06,
"loss": 0.4453,
"step": 7340
},
{
"epoch": 0.35915854284248333,
"grad_norm": 0.8513416647911072,
"learning_rate": 8.090896504725534e-06,
"loss": 0.4455,
"step": 7350
},
{
"epoch": 0.3596471939211806,
"grad_norm": 0.5936821103096008,
"learning_rate": 8.084188220574244e-06,
"loss": 0.444,
"step": 7360
},
{
"epoch": 0.36013584499987783,
"grad_norm": 4.0613017082214355,
"learning_rate": 8.077470963052922e-06,
"loss": 0.447,
"step": 7370
},
{
"epoch": 0.3606244960785751,
"grad_norm": 0.7625659704208374,
"learning_rate": 8.070744751705267e-06,
"loss": 0.4463,
"step": 7380
},
{
"epoch": 0.36111314715727233,
"grad_norm": 0.8564379811286926,
"learning_rate": 8.064009606101023e-06,
"loss": 0.4452,
"step": 7390
},
{
"epoch": 0.3616017982359696,
"grad_norm": 0.671668291091919,
"learning_rate": 8.05726554583593e-06,
"loss": 0.4458,
"step": 7400
},
{
"epoch": 0.3620904493146669,
"grad_norm": 1.2709118127822876,
"learning_rate": 8.050512590531669e-06,
"loss": 0.4454,
"step": 7410
},
{
"epoch": 0.3625791003933641,
"grad_norm": 0.7745212912559509,
"learning_rate": 8.043750759835795e-06,
"loss": 0.446,
"step": 7420
},
{
"epoch": 0.3630677514720614,
"grad_norm": 0.7901990413665771,
"learning_rate": 8.036980073421693e-06,
"loss": 0.4444,
"step": 7430
},
{
"epoch": 0.3635564025507586,
"grad_norm": 1.0258527994155884,
"learning_rate": 8.030200550988505e-06,
"loss": 0.4437,
"step": 7440
},
{
"epoch": 0.3640450536294559,
"grad_norm": 1.6445204019546509,
"learning_rate": 8.023412212261088e-06,
"loss": 0.444,
"step": 7450
},
{
"epoch": 0.36453370470815316,
"grad_norm": 1.1179972887039185,
"learning_rate": 8.016615076989947e-06,
"loss": 0.4449,
"step": 7460
},
{
"epoch": 0.3650223557868504,
"grad_norm": 0.4461180567741394,
"learning_rate": 8.009809164951176e-06,
"loss": 0.4446,
"step": 7470
},
{
"epoch": 0.36551100686554766,
"grad_norm": 0.6667689681053162,
"learning_rate": 8.002994495946415e-06,
"loss": 0.4443,
"step": 7480
},
{
"epoch": 0.36599965794424494,
"grad_norm": 0.691374659538269,
"learning_rate": 7.996171089802774e-06,
"loss": 0.4445,
"step": 7490
},
{
"epoch": 0.36648830902294216,
"grad_norm": 1.3462163209915161,
"learning_rate": 7.989338966372787e-06,
"loss": 0.4431,
"step": 7500
},
{
"epoch": 0.36648830902294216,
"eval_loss": 0.4194032549858093,
"eval_runtime": 728.4338,
"eval_samples_per_second": 242.861,
"eval_steps_per_second": 0.475,
"step": 7500
},
{
"epoch": 0.36697696010163944,
"grad_norm": 1.0293834209442139,
"learning_rate": 7.982498145534348e-06,
"loss": 0.4454,
"step": 7510
},
{
"epoch": 0.36746561118033666,
"grad_norm": 1.0880999565124512,
"learning_rate": 7.97564864719066e-06,
"loss": 0.4435,
"step": 7520
},
{
"epoch": 0.36795426225903394,
"grad_norm": 3.1764519214630127,
"learning_rate": 7.968790491270165e-06,
"loss": 0.4451,
"step": 7530
},
{
"epoch": 0.3684429133377312,
"grad_norm": 0.6520982980728149,
"learning_rate": 7.961923697726506e-06,
"loss": 0.4464,
"step": 7540
},
{
"epoch": 0.36893156441642844,
"grad_norm": 1.566203236579895,
"learning_rate": 7.955048286538448e-06,
"loss": 0.4455,
"step": 7550
},
{
"epoch": 0.3694202154951257,
"grad_norm": 1.396600365638733,
"learning_rate": 7.948164277709831e-06,
"loss": 0.4466,
"step": 7560
},
{
"epoch": 0.36990886657382294,
"grad_norm": 39.281192779541016,
"learning_rate": 7.941271691269511e-06,
"loss": 0.4899,
"step": 7570
},
{
"epoch": 0.3703975176525202,
"grad_norm": 2.0359652042388916,
"learning_rate": 7.934370547271297e-06,
"loss": 0.4587,
"step": 7580
},
{
"epoch": 0.3708861687312175,
"grad_norm": 0.7175349593162537,
"learning_rate": 7.9274608657939e-06,
"loss": 0.4484,
"step": 7590
},
{
"epoch": 0.3713748198099147,
"grad_norm": 1.1124777793884277,
"learning_rate": 7.920542666940871e-06,
"loss": 0.4465,
"step": 7600
},
{
"epoch": 0.371863470888612,
"grad_norm": 1.0177866220474243,
"learning_rate": 7.913615970840535e-06,
"loss": 0.4447,
"step": 7610
},
{
"epoch": 0.37235212196730927,
"grad_norm": 0.7671780586242676,
"learning_rate": 7.90668079764595e-06,
"loss": 0.4455,
"step": 7620
},
{
"epoch": 0.3728407730460065,
"grad_norm": 1.171650767326355,
"learning_rate": 7.899737167534827e-06,
"loss": 0.4456,
"step": 7630
},
{
"epoch": 0.37332942412470377,
"grad_norm": 0.5443609356880188,
"learning_rate": 7.892785100709492e-06,
"loss": 0.4461,
"step": 7640
},
{
"epoch": 0.373818075203401,
"grad_norm": 1.2549580335617065,
"learning_rate": 7.885824617396812e-06,
"loss": 0.4451,
"step": 7650
},
{
"epoch": 0.37430672628209827,
"grad_norm": 0.7662185430526733,
"learning_rate": 7.878855737848139e-06,
"loss": 0.4446,
"step": 7660
},
{
"epoch": 0.37479537736079555,
"grad_norm": 1.3419959545135498,
"learning_rate": 7.871878482339264e-06,
"loss": 0.4468,
"step": 7670
},
{
"epoch": 0.37528402843949277,
"grad_norm": 1.2521858215332031,
"learning_rate": 7.864892871170335e-06,
"loss": 0.4451,
"step": 7680
},
{
"epoch": 0.37577267951819004,
"grad_norm": 2.5343024730682373,
"learning_rate": 7.857898924665817e-06,
"loss": 0.4458,
"step": 7690
},
{
"epoch": 0.37626133059688727,
"grad_norm": 0.9986534118652344,
"learning_rate": 7.85089666317443e-06,
"loss": 0.4451,
"step": 7700
},
{
"epoch": 0.37674998167558454,
"grad_norm": 0.8709741830825806,
"learning_rate": 7.843886107069077e-06,
"loss": 0.4439,
"step": 7710
},
{
"epoch": 0.3772386327542818,
"grad_norm": 0.8361919522285461,
"learning_rate": 7.836867276746805e-06,
"loss": 0.4444,
"step": 7720
},
{
"epoch": 0.37772728383297904,
"grad_norm": 1.1930742263793945,
"learning_rate": 7.829840192628723e-06,
"loss": 0.4461,
"step": 7730
},
{
"epoch": 0.3782159349116763,
"grad_norm": 1.6097028255462646,
"learning_rate": 7.822804875159962e-06,
"loss": 0.4444,
"step": 7740
},
{
"epoch": 0.3787045859903736,
"grad_norm": 0.6868306994438171,
"learning_rate": 7.815761344809609e-06,
"loss": 0.4457,
"step": 7750
},
{
"epoch": 0.3791932370690708,
"grad_norm": 0.5000033974647522,
"learning_rate": 7.808709622070639e-06,
"loss": 0.4449,
"step": 7760
},
{
"epoch": 0.3796818881477681,
"grad_norm": 0.3964043855667114,
"learning_rate": 7.801649727459868e-06,
"loss": 0.4439,
"step": 7770
},
{
"epoch": 0.3801705392264653,
"grad_norm": 1.3012721538543701,
"learning_rate": 7.794581681517886e-06,
"loss": 0.4454,
"step": 7780
},
{
"epoch": 0.3806591903051626,
"grad_norm": 0.6892145276069641,
"learning_rate": 7.787505504808997e-06,
"loss": 0.4456,
"step": 7790
},
{
"epoch": 0.3811478413838599,
"grad_norm": 0.48608964681625366,
"learning_rate": 7.780421217921169e-06,
"loss": 0.4439,
"step": 7800
},
{
"epoch": 0.3816364924625571,
"grad_norm": 0.7753750085830688,
"learning_rate": 7.773328841465958e-06,
"loss": 0.4438,
"step": 7810
},
{
"epoch": 0.3821251435412544,
"grad_norm": 0.5739250183105469,
"learning_rate": 7.766228396078458e-06,
"loss": 0.4444,
"step": 7820
},
{
"epoch": 0.3826137946199516,
"grad_norm": 0.6620212197303772,
"learning_rate": 7.759119902417244e-06,
"loss": 0.445,
"step": 7830
},
{
"epoch": 0.3831024456986489,
"grad_norm": 0.5474065542221069,
"learning_rate": 7.7520033811643e-06,
"loss": 0.4436,
"step": 7840
},
{
"epoch": 0.38359109677734615,
"grad_norm": 1.7903695106506348,
"learning_rate": 7.744878853024976e-06,
"loss": 0.444,
"step": 7850
},
{
"epoch": 0.3840797478560434,
"grad_norm": 0.9528830051422119,
"learning_rate": 7.737746338727908e-06,
"loss": 0.4436,
"step": 7860
},
{
"epoch": 0.38456839893474065,
"grad_norm": 0.9075807332992554,
"learning_rate": 7.730605859024971e-06,
"loss": 0.4433,
"step": 7870
},
{
"epoch": 0.38505705001343793,
"grad_norm": 1.1544967889785767,
"learning_rate": 7.723457434691216e-06,
"loss": 0.4456,
"step": 7880
},
{
"epoch": 0.38554570109213515,
"grad_norm": 1.7026115655899048,
"learning_rate": 7.71630108652481e-06,
"loss": 0.4458,
"step": 7890
},
{
"epoch": 0.38603435217083243,
"grad_norm": 0.6825501918792725,
"learning_rate": 7.709136835346973e-06,
"loss": 0.4447,
"step": 7900
},
{
"epoch": 0.38652300324952965,
"grad_norm": 1.6804189682006836,
"learning_rate": 7.701964702001916e-06,
"loss": 0.4446,
"step": 7910
},
{
"epoch": 0.38701165432822693,
"grad_norm": 3.464137077331543,
"learning_rate": 7.694784707356786e-06,
"loss": 0.4467,
"step": 7920
},
{
"epoch": 0.3875003054069242,
"grad_norm": 0.6467346549034119,
"learning_rate": 7.687596872301603e-06,
"loss": 0.4446,
"step": 7930
},
{
"epoch": 0.38798895648562143,
"grad_norm": 1.6307556629180908,
"learning_rate": 7.680401217749194e-06,
"loss": 0.4454,
"step": 7940
},
{
"epoch": 0.3884776075643187,
"grad_norm": 1.3172680139541626,
"learning_rate": 7.67319776463514e-06,
"loss": 0.447,
"step": 7950
},
{
"epoch": 0.3889662586430159,
"grad_norm": 0.94371497631073,
"learning_rate": 7.665986533917715e-06,
"loss": 0.4443,
"step": 7960
},
{
"epoch": 0.3894549097217132,
"grad_norm": 1.032759666442871,
"learning_rate": 7.658767546577815e-06,
"loss": 0.4435,
"step": 7970
},
{
"epoch": 0.3899435608004105,
"grad_norm": 0.6555205583572388,
"learning_rate": 7.651540823618906e-06,
"loss": 0.4456,
"step": 7980
},
{
"epoch": 0.3904322118791077,
"grad_norm": 0.8276070952415466,
"learning_rate": 7.644306386066964e-06,
"loss": 0.4437,
"step": 7990
},
{
"epoch": 0.390920862957805,
"grad_norm": 0.9051567912101746,
"learning_rate": 7.637064254970404e-06,
"loss": 0.4439,
"step": 8000
},
{
"epoch": 0.390920862957805,
"eval_loss": 0.41898027062416077,
"eval_runtime": 729.9138,
"eval_samples_per_second": 242.368,
"eval_steps_per_second": 0.474,
"step": 8000
},
{
"epoch": 0.39140951403650226,
"grad_norm": 0.7855016589164734,
"learning_rate": 7.629814451400034e-06,
"loss": 0.4434,
"step": 8010
},
{
"epoch": 0.3918981651151995,
"grad_norm": 1.8473398685455322,
"learning_rate": 7.622556996448973e-06,
"loss": 0.4441,
"step": 8020
},
{
"epoch": 0.39238681619389676,
"grad_norm": 1.2307816743850708,
"learning_rate": 7.615291911232614e-06,
"loss": 0.4426,
"step": 8030
},
{
"epoch": 0.392875467272594,
"grad_norm": 0.9610106945037842,
"learning_rate": 7.6080192168885436e-06,
"loss": 0.4439,
"step": 8040
},
{
"epoch": 0.39336411835129126,
"grad_norm": 0.8011897206306458,
"learning_rate": 7.600738934576484e-06,
"loss": 0.4424,
"step": 8050
},
{
"epoch": 0.39385276942998854,
"grad_norm": 0.9333787560462952,
"learning_rate": 7.593451085478243e-06,
"loss": 0.443,
"step": 8060
},
{
"epoch": 0.39434142050868576,
"grad_norm": 0.5144811868667603,
"learning_rate": 7.586155690797636e-06,
"loss": 0.4446,
"step": 8070
},
{
"epoch": 0.39483007158738304,
"grad_norm": 1.6834224462509155,
"learning_rate": 7.578852771760437e-06,
"loss": 0.4443,
"step": 8080
},
{
"epoch": 0.39531872266608026,
"grad_norm": 1.0620421171188354,
"learning_rate": 7.571542349614307e-06,
"loss": 0.4436,
"step": 8090
},
{
"epoch": 0.39580737374477754,
"grad_norm": 0.8550513386726379,
"learning_rate": 7.564224445628741e-06,
"loss": 0.4439,
"step": 8100
},
{
"epoch": 0.3962960248234748,
"grad_norm": 0.5044734477996826,
"learning_rate": 7.556899081095004e-06,
"loss": 0.4446,
"step": 8110
},
{
"epoch": 0.39678467590217203,
"grad_norm": 0.8119836449623108,
"learning_rate": 7.549566277326061e-06,
"loss": 0.4438,
"step": 8120
},
{
"epoch": 0.3972733269808693,
"grad_norm": 10.883358001708984,
"learning_rate": 7.542226055656527e-06,
"loss": 0.4461,
"step": 8130
},
{
"epoch": 0.3977619780595666,
"grad_norm": 1.7727267742156982,
"learning_rate": 7.534878437442597e-06,
"loss": 0.4482,
"step": 8140
},
{
"epoch": 0.3982506291382638,
"grad_norm": 1.0288087129592896,
"learning_rate": 7.527523444061984e-06,
"loss": 0.4443,
"step": 8150
},
{
"epoch": 0.3987392802169611,
"grad_norm": 1.184952974319458,
"learning_rate": 7.520161096913863e-06,
"loss": 0.4466,
"step": 8160
},
{
"epoch": 0.3992279312956583,
"grad_norm": 0.9457073211669922,
"learning_rate": 7.512791417418802e-06,
"loss": 0.4454,
"step": 8170
},
{
"epoch": 0.3997165823743556,
"grad_norm": 0.771334171295166,
"learning_rate": 7.505414427018704e-06,
"loss": 0.445,
"step": 8180
},
{
"epoch": 0.40020523345305287,
"grad_norm": 1.0723953247070312,
"learning_rate": 7.4980301471767404e-06,
"loss": 0.4449,
"step": 8190
},
{
"epoch": 0.4006938845317501,
"grad_norm": 0.9210856556892395,
"learning_rate": 7.490638599377291e-06,
"loss": 0.4432,
"step": 8200
},
{
"epoch": 0.40118253561044737,
"grad_norm": 0.8094615340232849,
"learning_rate": 7.483239805125886e-06,
"loss": 0.4443,
"step": 8210
},
{
"epoch": 0.40167118668914464,
"grad_norm": 1.3815480470657349,
"learning_rate": 7.475833785949134e-06,
"loss": 0.4431,
"step": 8220
},
{
"epoch": 0.40215983776784187,
"grad_norm": 1.4028229713439941,
"learning_rate": 7.468420563394667e-06,
"loss": 0.4449,
"step": 8230
},
{
"epoch": 0.40264848884653914,
"grad_norm": 0.7880713939666748,
"learning_rate": 7.461000159031073e-06,
"loss": 0.4444,
"step": 8240
},
{
"epoch": 0.40313713992523637,
"grad_norm": 0.573472797870636,
"learning_rate": 7.45357259444784e-06,
"loss": 0.4432,
"step": 8250
},
{
"epoch": 0.40362579100393364,
"grad_norm": 1.1918740272521973,
"learning_rate": 7.4461378912552806e-06,
"loss": 0.4428,
"step": 8260
},
{
"epoch": 0.4041144420826309,
"grad_norm": 0.6638442277908325,
"learning_rate": 7.438696071084483e-06,
"loss": 0.4447,
"step": 8270
},
{
"epoch": 0.40460309316132814,
"grad_norm": 1.2030208110809326,
"learning_rate": 7.431247155587243e-06,
"loss": 0.4436,
"step": 8280
},
{
"epoch": 0.4050917442400254,
"grad_norm": 0.3726930320262909,
"learning_rate": 7.423791166435997e-06,
"loss": 0.4433,
"step": 8290
},
{
"epoch": 0.40558039531872264,
"grad_norm": 0.8080679178237915,
"learning_rate": 7.4163281253237604e-06,
"loss": 0.4437,
"step": 8300
},
{
"epoch": 0.4060690463974199,
"grad_norm": 0.7469872832298279,
"learning_rate": 7.40885805396407e-06,
"loss": 0.4427,
"step": 8310
},
{
"epoch": 0.4065576974761172,
"grad_norm": 1.38739812374115,
"learning_rate": 7.4013809740909135e-06,
"loss": 0.443,
"step": 8320
},
{
"epoch": 0.4070463485548144,
"grad_norm": 0.823733389377594,
"learning_rate": 7.393896907458674e-06,
"loss": 0.4427,
"step": 8330
},
{
"epoch": 0.4075349996335117,
"grad_norm": 0.47151875495910645,
"learning_rate": 7.3864058758420595e-06,
"loss": 0.445,
"step": 8340
},
{
"epoch": 0.408023650712209,
"grad_norm": 0.34016215801239014,
"learning_rate": 7.378907901036042e-06,
"loss": 0.4437,
"step": 8350
},
{
"epoch": 0.4085123017909062,
"grad_norm": 0.9797572493553162,
"learning_rate": 7.3714030048557935e-06,
"loss": 0.4431,
"step": 8360
},
{
"epoch": 0.4090009528696035,
"grad_norm": 0.8803391456604004,
"learning_rate": 7.363891209136631e-06,
"loss": 0.4431,
"step": 8370
},
{
"epoch": 0.4094896039483007,
"grad_norm": 0.9852266907691956,
"learning_rate": 7.356372535733934e-06,
"loss": 0.443,
"step": 8380
},
{
"epoch": 0.409978255026998,
"grad_norm": 1.409609317779541,
"learning_rate": 7.348847006523103e-06,
"loss": 0.4447,
"step": 8390
},
{
"epoch": 0.41046690610569525,
"grad_norm": 0.47717586159706116,
"learning_rate": 7.341314643399479e-06,
"loss": 0.4443,
"step": 8400
},
{
"epoch": 0.4109555571843925,
"grad_norm": 0.3413306176662445,
"learning_rate": 7.333775468278285e-06,
"loss": 0.443,
"step": 8410
},
{
"epoch": 0.41144420826308975,
"grad_norm": 0.5356876254081726,
"learning_rate": 7.326229503094573e-06,
"loss": 0.4429,
"step": 8420
},
{
"epoch": 0.41193285934178697,
"grad_norm": 0.5036433339118958,
"learning_rate": 7.318676769803137e-06,
"loss": 0.4441,
"step": 8430
},
{
"epoch": 0.41242151042048425,
"grad_norm": 0.9086324572563171,
"learning_rate": 7.311117290378473e-06,
"loss": 0.4431,
"step": 8440
},
{
"epoch": 0.4129101614991815,
"grad_norm": 0.827485203742981,
"learning_rate": 7.303551086814702e-06,
"loss": 0.4428,
"step": 8450
},
{
"epoch": 0.41339881257787875,
"grad_norm": 1.1920230388641357,
"learning_rate": 7.295978181125503e-06,
"loss": 0.445,
"step": 8460
},
{
"epoch": 0.413887463656576,
"grad_norm": 0.9056548476219177,
"learning_rate": 7.2883985953440636e-06,
"loss": 0.4442,
"step": 8470
},
{
"epoch": 0.4143761147352733,
"grad_norm": 0.5254775881767273,
"learning_rate": 7.280812351523003e-06,
"loss": 0.4432,
"step": 8480
},
{
"epoch": 0.4148647658139705,
"grad_norm": 0.6151171922683716,
"learning_rate": 7.27321947173431e-06,
"loss": 0.4442,
"step": 8490
},
{
"epoch": 0.4153534168926678,
"grad_norm": 0.3920780420303345,
"learning_rate": 7.265619978069281e-06,
"loss": 0.4432,
"step": 8500
},
{
"epoch": 0.4153534168926678,
"eval_loss": 0.41748544573783875,
"eval_runtime": 729.7588,
"eval_samples_per_second": 242.42,
"eval_steps_per_second": 0.474,
"step": 8500
},
{
"epoch": 0.415842067971365,
"grad_norm": 0.5901490449905396,
"learning_rate": 7.25801389263846e-06,
"loss": 0.4442,
"step": 8510
},
{
"epoch": 0.4163307190500623,
"grad_norm": 0.5799441337585449,
"learning_rate": 7.2504012375715645e-06,
"loss": 0.4427,
"step": 8520
},
{
"epoch": 0.4168193701287596,
"grad_norm": 0.9592375755310059,
"learning_rate": 7.242782035017428e-06,
"loss": 0.4439,
"step": 8530
},
{
"epoch": 0.4173080212074568,
"grad_norm": 0.6781924962997437,
"learning_rate": 7.235156307143933e-06,
"loss": 0.4429,
"step": 8540
},
{
"epoch": 0.4177966722861541,
"grad_norm": 0.37766560912132263,
"learning_rate": 7.2275240761379464e-06,
"loss": 0.4422,
"step": 8550
},
{
"epoch": 0.4182853233648513,
"grad_norm": 1.2287683486938477,
"learning_rate": 7.2198853642052615e-06,
"loss": 0.4426,
"step": 8560
},
{
"epoch": 0.4187739744435486,
"grad_norm": 0.9670842289924622,
"learning_rate": 7.212240193570519e-06,
"loss": 0.4434,
"step": 8570
},
{
"epoch": 0.41926262552224586,
"grad_norm": 0.5393080115318298,
"learning_rate": 7.204588586477157e-06,
"loss": 0.4433,
"step": 8580
},
{
"epoch": 0.4197512766009431,
"grad_norm": 0.5459208488464355,
"learning_rate": 7.196930565187341e-06,
"loss": 0.4433,
"step": 8590
},
{
"epoch": 0.42023992767964036,
"grad_norm": 0.8376490473747253,
"learning_rate": 7.189266151981893e-06,
"loss": 0.4424,
"step": 8600
},
{
"epoch": 0.42072857875833763,
"grad_norm": 3.4486372470855713,
"learning_rate": 7.181595369160237e-06,
"loss": 0.4425,
"step": 8610
},
{
"epoch": 0.42121722983703486,
"grad_norm": 2.3472955226898193,
"learning_rate": 7.173918239040329e-06,
"loss": 0.445,
"step": 8620
},
{
"epoch": 0.42170588091573213,
"grad_norm": 2.3312840461730957,
"learning_rate": 7.166234783958587e-06,
"loss": 0.4447,
"step": 8630
},
{
"epoch": 0.42219453199442936,
"grad_norm": 0.7450709342956543,
"learning_rate": 7.158545026269838e-06,
"loss": 0.4438,
"step": 8640
},
{
"epoch": 0.42268318307312663,
"grad_norm": 1.204588532447815,
"learning_rate": 7.150848988347244e-06,
"loss": 0.4441,
"step": 8650
},
{
"epoch": 0.4231718341518239,
"grad_norm": 0.7559615969657898,
"learning_rate": 7.143146692582237e-06,
"loss": 0.4423,
"step": 8660
},
{
"epoch": 0.42366048523052113,
"grad_norm": 1.6019837856292725,
"learning_rate": 7.135438161384458e-06,
"loss": 0.4436,
"step": 8670
},
{
"epoch": 0.4241491363092184,
"grad_norm": 1.278933048248291,
"learning_rate": 7.127723417181691e-06,
"loss": 0.4429,
"step": 8680
},
{
"epoch": 0.42463778738791563,
"grad_norm": 0.6044679284095764,
"learning_rate": 7.1200024824197945e-06,
"loss": 0.442,
"step": 8690
},
{
"epoch": 0.4251264384666129,
"grad_norm": 0.771743655204773,
"learning_rate": 7.1122753795626385e-06,
"loss": 0.4429,
"step": 8700
},
{
"epoch": 0.4256150895453102,
"grad_norm": 1.0729281902313232,
"learning_rate": 7.1045421310920386e-06,
"loss": 0.4436,
"step": 8710
},
{
"epoch": 0.4261037406240074,
"grad_norm": 0.48893994092941284,
"learning_rate": 7.096802759507693e-06,
"loss": 0.4427,
"step": 8720
},
{
"epoch": 0.4265923917027047,
"grad_norm": 0.5487367510795593,
"learning_rate": 7.0890572873271125e-06,
"loss": 0.4435,
"step": 8730
},
{
"epoch": 0.42708104278140196,
"grad_norm": 0.39890584349632263,
"learning_rate": 7.08130573708556e-06,
"loss": 0.4427,
"step": 8740
},
{
"epoch": 0.4275696938600992,
"grad_norm": 0.437925785779953,
"learning_rate": 7.07354813133598e-06,
"loss": 0.4423,
"step": 8750
},
{
"epoch": 0.42805834493879646,
"grad_norm": 1.0761085748672485,
"learning_rate": 7.065784492648937e-06,
"loss": 0.4447,
"step": 8760
},
{
"epoch": 0.4285469960174937,
"grad_norm": 0.6409640312194824,
"learning_rate": 7.058014843612546e-06,
"loss": 0.4432,
"step": 8770
},
{
"epoch": 0.42903564709619096,
"grad_norm": 0.8142459988594055,
"learning_rate": 7.050239206832412e-06,
"loss": 0.4431,
"step": 8780
},
{
"epoch": 0.42952429817488824,
"grad_norm": 0.7957897782325745,
"learning_rate": 7.042457604931558e-06,
"loss": 0.4427,
"step": 8790
},
{
"epoch": 0.43001294925358546,
"grad_norm": 0.8293124437332153,
"learning_rate": 7.034670060550367e-06,
"loss": 0.4425,
"step": 8800
},
{
"epoch": 0.43050160033228274,
"grad_norm": 0.3750956654548645,
"learning_rate": 7.026876596346505e-06,
"loss": 0.4416,
"step": 8810
},
{
"epoch": 0.43099025141097996,
"grad_norm": 0.755920946598053,
"learning_rate": 7.019077234994865e-06,
"loss": 0.443,
"step": 8820
},
{
"epoch": 0.43147890248967724,
"grad_norm": 0.6560993194580078,
"learning_rate": 7.0112719991875025e-06,
"loss": 0.443,
"step": 8830
},
{
"epoch": 0.4319675535683745,
"grad_norm": 0.3859688341617584,
"learning_rate": 7.003460911633555e-06,
"loss": 0.443,
"step": 8840
},
{
"epoch": 0.43245620464707174,
"grad_norm": 0.6885735988616943,
"learning_rate": 6.9956439950591915e-06,
"loss": 0.4418,
"step": 8850
},
{
"epoch": 0.432944855725769,
"grad_norm": 1.1823225021362305,
"learning_rate": 6.98782127220754e-06,
"loss": 0.4433,
"step": 8860
},
{
"epoch": 0.4334335068044663,
"grad_norm": 0.9184996485710144,
"learning_rate": 6.979992765838619e-06,
"loss": 0.4439,
"step": 8870
},
{
"epoch": 0.4339221578831635,
"grad_norm": 0.6856487989425659,
"learning_rate": 6.97215849872928e-06,
"loss": 0.4431,
"step": 8880
},
{
"epoch": 0.4344108089618608,
"grad_norm": 0.4063749611377716,
"learning_rate": 6.964318493673126e-06,
"loss": 0.4435,
"step": 8890
},
{
"epoch": 0.434899460040558,
"grad_norm": 1.1154191493988037,
"learning_rate": 6.956472773480463e-06,
"loss": 0.4435,
"step": 8900
},
{
"epoch": 0.4353881111192553,
"grad_norm": 0.4631388485431671,
"learning_rate": 6.948621360978221e-06,
"loss": 0.4424,
"step": 8910
},
{
"epoch": 0.43587676219795257,
"grad_norm": 0.6873944997787476,
"learning_rate": 6.94076427900989e-06,
"loss": 0.443,
"step": 8920
},
{
"epoch": 0.4363654132766498,
"grad_norm": 0.37667331099510193,
"learning_rate": 6.9329015504354605e-06,
"loss": 0.4422,
"step": 8930
},
{
"epoch": 0.43685406435534707,
"grad_norm": 1.4186402559280396,
"learning_rate": 6.925033198131347e-06,
"loss": 0.4428,
"step": 8940
},
{
"epoch": 0.4373427154340443,
"grad_norm": 0.6768743395805359,
"learning_rate": 6.917159244990328e-06,
"loss": 0.443,
"step": 8950
},
{
"epoch": 0.43783136651274157,
"grad_norm": 0.6607493162155151,
"learning_rate": 6.909279713921477e-06,
"loss": 0.4429,
"step": 8960
},
{
"epoch": 0.43832001759143885,
"grad_norm": 1.2457571029663086,
"learning_rate": 6.9013946278500964e-06,
"loss": 0.4431,
"step": 8970
},
{
"epoch": 0.43880866867013607,
"grad_norm": 0.506984531879425,
"learning_rate": 6.89350400971765e-06,
"loss": 0.444,
"step": 8980
},
{
"epoch": 0.43929731974883335,
"grad_norm": 0.9251278638839722,
"learning_rate": 6.885607882481699e-06,
"loss": 0.4426,
"step": 8990
},
{
"epoch": 0.4397859708275306,
"grad_norm": 1.2666517496109009,
"learning_rate": 6.8777062691158335e-06,
"loss": 0.4428,
"step": 9000
},
{
"epoch": 0.4397859708275306,
"eval_loss": 0.4181945323944092,
"eval_runtime": 729.4373,
"eval_samples_per_second": 242.527,
"eval_steps_per_second": 0.474,
"step": 9000
},
{
"epoch": 0.44027462190622785,
"grad_norm": 0.909946620464325,
"learning_rate": 6.869799192609602e-06,
"loss": 0.4423,
"step": 9010
},
{
"epoch": 0.4407632729849251,
"grad_norm": 0.6974407434463501,
"learning_rate": 6.8618866759684496e-06,
"loss": 0.4421,
"step": 9020
},
{
"epoch": 0.44125192406362235,
"grad_norm": 1.4556212425231934,
"learning_rate": 6.85396874221365e-06,
"loss": 0.4421,
"step": 9030
},
{
"epoch": 0.4417405751423196,
"grad_norm": 0.7077080607414246,
"learning_rate": 6.846045414382237e-06,
"loss": 0.4415,
"step": 9040
},
{
"epoch": 0.4422292262210169,
"grad_norm": 1.2867698669433594,
"learning_rate": 6.838116715526941e-06,
"loss": 0.4431,
"step": 9050
},
{
"epoch": 0.4427178772997141,
"grad_norm": 0.350985586643219,
"learning_rate": 6.8301826687161135e-06,
"loss": 0.4425,
"step": 9060
},
{
"epoch": 0.4432065283784114,
"grad_norm": 0.9761406779289246,
"learning_rate": 6.822243297033671e-06,
"loss": 0.4415,
"step": 9070
},
{
"epoch": 0.4436951794571086,
"grad_norm": 0.7296372652053833,
"learning_rate": 6.814298623579021e-06,
"loss": 0.4432,
"step": 9080
},
{
"epoch": 0.4441838305358059,
"grad_norm": 0.8322256803512573,
"learning_rate": 6.806348671466996e-06,
"loss": 0.442,
"step": 9090
},
{
"epoch": 0.4446724816145032,
"grad_norm": 0.6768003106117249,
"learning_rate": 6.798393463827786e-06,
"loss": 0.442,
"step": 9100
},
{
"epoch": 0.4451611326932004,
"grad_norm": 0.9105594754219055,
"learning_rate": 6.790433023806874e-06,
"loss": 0.4426,
"step": 9110
},
{
"epoch": 0.4456497837718977,
"grad_norm": 0.8735663890838623,
"learning_rate": 6.782467374564964e-06,
"loss": 0.4414,
"step": 9120
},
{
"epoch": 0.44613843485059496,
"grad_norm": 0.4745177626609802,
"learning_rate": 6.774496539277917e-06,
"loss": 0.4428,
"step": 9130
},
{
"epoch": 0.4466270859292922,
"grad_norm": 0.35364508628845215,
"learning_rate": 6.766520541136684e-06,
"loss": 0.4425,
"step": 9140
},
{
"epoch": 0.44711573700798946,
"grad_norm": 1.5570448637008667,
"learning_rate": 6.758539403347235e-06,
"loss": 0.4423,
"step": 9150
},
{
"epoch": 0.4476043880866867,
"grad_norm": 0.6677067279815674,
"learning_rate": 6.750553149130498e-06,
"loss": 0.4425,
"step": 9160
},
{
"epoch": 0.44809303916538396,
"grad_norm": 0.5844752192497253,
"learning_rate": 6.74256180172228e-06,
"loss": 0.4427,
"step": 9170
},
{
"epoch": 0.44858169024408123,
"grad_norm": 0.5263113379478455,
"learning_rate": 6.734565384373211e-06,
"loss": 0.4419,
"step": 9180
},
{
"epoch": 0.44907034132277845,
"grad_norm": 0.7214266061782837,
"learning_rate": 6.726563920348671e-06,
"loss": 0.442,
"step": 9190
},
{
"epoch": 0.44955899240147573,
"grad_norm": 1.2973275184631348,
"learning_rate": 6.718557432928725e-06,
"loss": 0.4428,
"step": 9200
},
{
"epoch": 0.450047643480173,
"grad_norm": 1.9566432237625122,
"learning_rate": 6.7105459454080535e-06,
"loss": 0.4444,
"step": 9210
},
{
"epoch": 0.45053629455887023,
"grad_norm": 1.5999767780303955,
"learning_rate": 6.7025294810958785e-06,
"loss": 0.4439,
"step": 9220
},
{
"epoch": 0.4510249456375675,
"grad_norm": 1.2058864831924438,
"learning_rate": 6.6945080633159096e-06,
"loss": 0.4428,
"step": 9230
},
{
"epoch": 0.45151359671626473,
"grad_norm": 0.682574987411499,
"learning_rate": 6.686481715406264e-06,
"loss": 0.442,
"step": 9240
},
{
"epoch": 0.452002247794962,
"grad_norm": 0.6059571504592896,
"learning_rate": 6.678450460719405e-06,
"loss": 0.4428,
"step": 9250
},
{
"epoch": 0.4524908988736593,
"grad_norm": 0.9549880027770996,
"learning_rate": 6.670414322622072e-06,
"loss": 0.4421,
"step": 9260
},
{
"epoch": 0.4529795499523565,
"grad_norm": 0.7796644568443298,
"learning_rate": 6.66237332449521e-06,
"loss": 0.4428,
"step": 9270
},
{
"epoch": 0.4534682010310538,
"grad_norm": 1.1869465112686157,
"learning_rate": 6.6543274897339075e-06,
"loss": 0.4439,
"step": 9280
},
{
"epoch": 0.453956852109751,
"grad_norm": 4.104377269744873,
"learning_rate": 6.6462768417473215e-06,
"loss": 0.4455,
"step": 9290
},
{
"epoch": 0.4544455031884483,
"grad_norm": 0.8395638465881348,
"learning_rate": 6.638221403958616e-06,
"loss": 0.443,
"step": 9300
},
{
"epoch": 0.45493415426714556,
"grad_norm": 0.7057262659072876,
"learning_rate": 6.63016119980489e-06,
"loss": 0.443,
"step": 9310
},
{
"epoch": 0.4554228053458428,
"grad_norm": 1.067874789237976,
"learning_rate": 6.622096252737111e-06,
"loss": 0.4434,
"step": 9320
},
{
"epoch": 0.45591145642454006,
"grad_norm": 1.1366690397262573,
"learning_rate": 6.614026586220043e-06,
"loss": 0.4442,
"step": 9330
},
{
"epoch": 0.45640010750323734,
"grad_norm": 0.8740336298942566,
"learning_rate": 6.605952223732183e-06,
"loss": 0.4419,
"step": 9340
},
{
"epoch": 0.45688875858193456,
"grad_norm": 1.2686458826065063,
"learning_rate": 6.597873188765693e-06,
"loss": 0.4413,
"step": 9350
},
{
"epoch": 0.45737740966063184,
"grad_norm": 0.4457259774208069,
"learning_rate": 6.589789504826325e-06,
"loss": 0.4421,
"step": 9360
},
{
"epoch": 0.45786606073932906,
"grad_norm": 0.5987876057624817,
"learning_rate": 6.581701195433358e-06,
"loss": 0.4418,
"step": 9370
},
{
"epoch": 0.45835471181802634,
"grad_norm": 0.430936336517334,
"learning_rate": 6.573608284119536e-06,
"loss": 0.4415,
"step": 9380
},
{
"epoch": 0.4588433628967236,
"grad_norm": 0.9248373508453369,
"learning_rate": 6.565510794430978e-06,
"loss": 0.4408,
"step": 9390
},
{
"epoch": 0.45933201397542084,
"grad_norm": 0.5061573386192322,
"learning_rate": 6.557408749927139e-06,
"loss": 0.4436,
"step": 9400
},
{
"epoch": 0.4598206650541181,
"grad_norm": 0.6956728100776672,
"learning_rate": 6.5493021741807125e-06,
"loss": 0.4424,
"step": 9410
},
{
"epoch": 0.46030931613281534,
"grad_norm": 0.5525333881378174,
"learning_rate": 6.541191090777586e-06,
"loss": 0.4419,
"step": 9420
},
{
"epoch": 0.4607979672115126,
"grad_norm": 0.5926039218902588,
"learning_rate": 6.5330755233167586e-06,
"loss": 0.4417,
"step": 9430
},
{
"epoch": 0.4612866182902099,
"grad_norm": 0.7355937361717224,
"learning_rate": 6.524955495410271e-06,
"loss": 0.441,
"step": 9440
},
{
"epoch": 0.4617752693689071,
"grad_norm": 0.9713565111160278,
"learning_rate": 6.516831030683148e-06,
"loss": 0.4412,
"step": 9450
},
{
"epoch": 0.4622639204476044,
"grad_norm": 1.2393561601638794,
"learning_rate": 6.508702152773323e-06,
"loss": 0.4418,
"step": 9460
},
{
"epoch": 0.46275257152630167,
"grad_norm": 0.83049476146698,
"learning_rate": 6.5005688853315615e-06,
"loss": 0.4432,
"step": 9470
},
{
"epoch": 0.4632412226049989,
"grad_norm": 0.4689672291278839,
"learning_rate": 6.492431252021408e-06,
"loss": 0.4425,
"step": 9480
},
{
"epoch": 0.46372987368369617,
"grad_norm": 0.5514821410179138,
"learning_rate": 6.484289276519109e-06,
"loss": 0.442,
"step": 9490
},
{
"epoch": 0.4642185247623934,
"grad_norm": 0.4042249321937561,
"learning_rate": 6.47614298251354e-06,
"loss": 0.442,
"step": 9500
},
{
"epoch": 0.4642185247623934,
"eval_loss": 0.41554516553878784,
"eval_runtime": 729.5945,
"eval_samples_per_second": 242.474,
"eval_steps_per_second": 0.474,
"step": 9500
},
{
"epoch": 0.46470717584109067,
"grad_norm": 0.44334179162979126,
"learning_rate": 6.467992393706147e-06,
"loss": 0.4403,
"step": 9510
},
{
"epoch": 0.46519582691978795,
"grad_norm": 0.49329543113708496,
"learning_rate": 6.4598375338108656e-06,
"loss": 0.4418,
"step": 9520
},
{
"epoch": 0.46568447799848517,
"grad_norm": 0.5903816223144531,
"learning_rate": 6.451678426554061e-06,
"loss": 0.4409,
"step": 9530
},
{
"epoch": 0.46617312907718245,
"grad_norm": 1.2968121767044067,
"learning_rate": 6.443515095674456e-06,
"loss": 0.443,
"step": 9540
},
{
"epoch": 0.46666178015587967,
"grad_norm": 0.7769078612327576,
"learning_rate": 6.435347564923062e-06,
"loss": 0.4432,
"step": 9550
},
{
"epoch": 0.46715043123457695,
"grad_norm": 0.8744146823883057,
"learning_rate": 6.42717585806311e-06,
"loss": 0.4411,
"step": 9560
},
{
"epoch": 0.4676390823132742,
"grad_norm": 0.47742319107055664,
"learning_rate": 6.418999998869982e-06,
"loss": 0.4426,
"step": 9570
},
{
"epoch": 0.46812773339197145,
"grad_norm": 0.4284425973892212,
"learning_rate": 6.4108200111311355e-06,
"loss": 0.4426,
"step": 9580
},
{
"epoch": 0.4686163844706687,
"grad_norm": 0.37580737471580505,
"learning_rate": 6.402635918646049e-06,
"loss": 0.4425,
"step": 9590
},
{
"epoch": 0.469105035549366,
"grad_norm": 0.3638119399547577,
"learning_rate": 6.394447745226137e-06,
"loss": 0.4411,
"step": 9600
},
{
"epoch": 0.4695936866280632,
"grad_norm": 2.9128997325897217,
"learning_rate": 6.386255514694688e-06,
"loss": 0.4418,
"step": 9610
},
{
"epoch": 0.4700823377067605,
"grad_norm": 0.9645544290542603,
"learning_rate": 6.378059250886799e-06,
"loss": 0.4419,
"step": 9620
},
{
"epoch": 0.4705709887854577,
"grad_norm": 0.43301165103912354,
"learning_rate": 6.369858977649297e-06,
"loss": 0.4429,
"step": 9630
},
{
"epoch": 0.471059639864155,
"grad_norm": 1.5179802179336548,
"learning_rate": 6.361654718840675e-06,
"loss": 0.4414,
"step": 9640
},
{
"epoch": 0.4715482909428523,
"grad_norm": 0.3464379608631134,
"learning_rate": 6.353446498331024e-06,
"loss": 0.4428,
"step": 9650
},
{
"epoch": 0.4720369420215495,
"grad_norm": 0.89571613073349,
"learning_rate": 6.34523434000196e-06,
"loss": 0.441,
"step": 9660
},
{
"epoch": 0.4725255931002468,
"grad_norm": 0.6052807569503784,
"learning_rate": 6.337018267746558e-06,
"loss": 0.4412,
"step": 9670
},
{
"epoch": 0.473014244178944,
"grad_norm": 1.2590041160583496,
"learning_rate": 6.328798305469278e-06,
"loss": 0.4415,
"step": 9680
},
{
"epoch": 0.4735028952576413,
"grad_norm": 0.6158220171928406,
"learning_rate": 6.3205744770858965e-06,
"loss": 0.4419,
"step": 9690
},
{
"epoch": 0.47399154633633855,
"grad_norm": 0.46361032128334045,
"learning_rate": 6.312346806523444e-06,
"loss": 0.4417,
"step": 9700
},
{
"epoch": 0.4744801974150358,
"grad_norm": 1.2066395282745361,
"learning_rate": 6.304115317720123e-06,
"loss": 0.4415,
"step": 9710
},
{
"epoch": 0.47496884849373305,
"grad_norm": 0.9531863331794739,
"learning_rate": 6.295880034625251e-06,
"loss": 0.4421,
"step": 9720
},
{
"epoch": 0.47545749957243033,
"grad_norm": 0.3842741549015045,
"learning_rate": 6.287640981199183e-06,
"loss": 0.4412,
"step": 9730
},
{
"epoch": 0.47594615065112755,
"grad_norm": 0.3795391023159027,
"learning_rate": 6.27939818141324e-06,
"loss": 0.4414,
"step": 9740
},
{
"epoch": 0.47643480172982483,
"grad_norm": 0.5748067498207092,
"learning_rate": 6.2711516592496455e-06,
"loss": 0.4411,
"step": 9750
},
{
"epoch": 0.47692345280852205,
"grad_norm": 0.7015309929847717,
"learning_rate": 6.262901438701459e-06,
"loss": 0.4417,
"step": 9760
},
{
"epoch": 0.47741210388721933,
"grad_norm": 0.4260580539703369,
"learning_rate": 6.254647543772489e-06,
"loss": 0.4419,
"step": 9770
},
{
"epoch": 0.4779007549659166,
"grad_norm": 0.9640613198280334,
"learning_rate": 6.246389998477245e-06,
"loss": 0.4405,
"step": 9780
},
{
"epoch": 0.47838940604461383,
"grad_norm": 0.7557575106620789,
"learning_rate": 6.23812882684085e-06,
"loss": 0.4409,
"step": 9790
},
{
"epoch": 0.4788780571233111,
"grad_norm": 1.2757539749145508,
"learning_rate": 6.22986405289898e-06,
"loss": 0.4421,
"step": 9800
},
{
"epoch": 0.47936670820200833,
"grad_norm": 1.3108956813812256,
"learning_rate": 6.221595700697794e-06,
"loss": 0.4434,
"step": 9810
},
{
"epoch": 0.4798553592807056,
"grad_norm": 0.7379423379898071,
"learning_rate": 6.2133237942938594e-06,
"loss": 0.4423,
"step": 9820
},
{
"epoch": 0.4803440103594029,
"grad_norm": 0.6387200951576233,
"learning_rate": 6.2050483577540845e-06,
"loss": 0.4419,
"step": 9830
},
{
"epoch": 0.4808326614381001,
"grad_norm": 1.4142051935195923,
"learning_rate": 6.19676941515565e-06,
"loss": 0.4422,
"step": 9840
},
{
"epoch": 0.4813213125167974,
"grad_norm": 0.9402855038642883,
"learning_rate": 6.188486990585936e-06,
"loss": 0.4415,
"step": 9850
},
{
"epoch": 0.48180996359549466,
"grad_norm": 1.5236409902572632,
"learning_rate": 6.180201108142454e-06,
"loss": 0.4409,
"step": 9860
},
{
"epoch": 0.4822986146741919,
"grad_norm": 1.1364696025848389,
"learning_rate": 6.171911791932774e-06,
"loss": 0.4414,
"step": 9870
},
{
"epoch": 0.48278726575288916,
"grad_norm": 0.48199930787086487,
"learning_rate": 6.163619066074462e-06,
"loss": 0.4403,
"step": 9880
},
{
"epoch": 0.4832759168315864,
"grad_norm": 0.3505820333957672,
"learning_rate": 6.1553229546949975e-06,
"loss": 0.4394,
"step": 9890
},
{
"epoch": 0.48376456791028366,
"grad_norm": 1.0344538688659668,
"learning_rate": 6.147023481931716e-06,
"loss": 0.4408,
"step": 9900
},
{
"epoch": 0.48425321898898094,
"grad_norm": 0.39767566323280334,
"learning_rate": 6.138720671931726e-06,
"loss": 0.4408,
"step": 9910
},
{
"epoch": 0.48474187006767816,
"grad_norm": 0.6819673180580139,
"learning_rate": 6.130414548851854e-06,
"loss": 0.4412,
"step": 9920
},
{
"epoch": 0.48523052114637544,
"grad_norm": 1.247071623802185,
"learning_rate": 6.122105136858558e-06,
"loss": 0.4402,
"step": 9930
},
{
"epoch": 0.48571917222507266,
"grad_norm": 1.1983033418655396,
"learning_rate": 6.113792460127872e-06,
"loss": 0.442,
"step": 9940
},
{
"epoch": 0.48620782330376994,
"grad_norm": 0.9668486714363098,
"learning_rate": 6.105476542845324e-06,
"loss": 0.4421,
"step": 9950
},
{
"epoch": 0.4866964743824672,
"grad_norm": 0.5192340016365051,
"learning_rate": 6.097157409205867e-06,
"loss": 0.4415,
"step": 9960
},
{
"epoch": 0.48718512546116444,
"grad_norm": 1.4621678590774536,
"learning_rate": 6.088835083413823e-06,
"loss": 0.4413,
"step": 9970
},
{
"epoch": 0.4876737765398617,
"grad_norm": 0.4883491098880768,
"learning_rate": 6.080509589682793e-06,
"loss": 0.4417,
"step": 9980
},
{
"epoch": 0.488162427618559,
"grad_norm": 0.4201609194278717,
"learning_rate": 6.072180952235593e-06,
"loss": 0.4414,
"step": 9990
},
{
"epoch": 0.4886510786972562,
"grad_norm": 0.8927011489868164,
"learning_rate": 6.063849195304194e-06,
"loss": 0.4404,
"step": 10000
},
{
"epoch": 0.4886510786972562,
"eval_loss": 0.4168914556503296,
"eval_runtime": 729.6388,
"eval_samples_per_second": 242.46,
"eval_steps_per_second": 0.474,
"step": 10000
},
{
"epoch": 0.4891397297759535,
"grad_norm": 1.4013339281082153,
"learning_rate": 6.055514343129638e-06,
"loss": 0.4427,
"step": 10010
},
{
"epoch": 0.4896283808546507,
"grad_norm": 0.5623286366462708,
"learning_rate": 6.047176419961972e-06,
"loss": 0.4414,
"step": 10020
},
{
"epoch": 0.490117031933348,
"grad_norm": 0.5934634804725647,
"learning_rate": 6.038835450060181e-06,
"loss": 0.4419,
"step": 10030
},
{
"epoch": 0.49060568301204527,
"grad_norm": 0.5222377181053162,
"learning_rate": 6.030491457692108e-06,
"loss": 0.4415,
"step": 10040
},
{
"epoch": 0.4910943340907425,
"grad_norm": 0.4764785170555115,
"learning_rate": 6.022144467134399e-06,
"loss": 0.4407,
"step": 10050
},
{
"epoch": 0.49158298516943977,
"grad_norm": 0.738297700881958,
"learning_rate": 6.013794502672415e-06,
"loss": 0.442,
"step": 10060
},
{
"epoch": 0.49207163624813705,
"grad_norm": 5.993337631225586,
"learning_rate": 6.005441588600176e-06,
"loss": 0.4424,
"step": 10070
},
{
"epoch": 0.49256028732683427,
"grad_norm": 2.3225927352905273,
"learning_rate": 5.99708574922028e-06,
"loss": 0.4487,
"step": 10080
},
{
"epoch": 0.49304893840553154,
"grad_norm": 0.8819851875305176,
"learning_rate": 5.988727008843834e-06,
"loss": 0.443,
"step": 10090
},
{
"epoch": 0.49353758948422877,
"grad_norm": 0.8179062008857727,
"learning_rate": 5.980365391790392e-06,
"loss": 0.4415,
"step": 10100
},
{
"epoch": 0.49402624056292604,
"grad_norm": 1.1633132696151733,
"learning_rate": 5.97200092238787e-06,
"loss": 0.4415,
"step": 10110
},
{
"epoch": 0.4945148916416233,
"grad_norm": 0.5630601048469543,
"learning_rate": 5.963633624972491e-06,
"loss": 0.4421,
"step": 10120
},
{
"epoch": 0.49500354272032054,
"grad_norm": 0.95186847448349,
"learning_rate": 5.955263523888699e-06,
"loss": 0.4424,
"step": 10130
},
{
"epoch": 0.4954921937990178,
"grad_norm": 0.9137486219406128,
"learning_rate": 5.9468906434890995e-06,
"loss": 0.4409,
"step": 10140
},
{
"epoch": 0.49598084487771504,
"grad_norm": 0.5341358184814453,
"learning_rate": 5.938515008134381e-06,
"loss": 0.4407,
"step": 10150
},
{
"epoch": 0.4964694959564123,
"grad_norm": 0.8407842516899109,
"learning_rate": 5.9301366421932505e-06,
"loss": 0.4404,
"step": 10160
},
{
"epoch": 0.4969581470351096,
"grad_norm": 0.7001408338546753,
"learning_rate": 5.921755570042358e-06,
"loss": 0.4412,
"step": 10170
},
{
"epoch": 0.4974467981138068,
"grad_norm": 0.8030371069908142,
"learning_rate": 5.913371816066226e-06,
"loss": 0.4415,
"step": 10180
},
{
"epoch": 0.4979354491925041,
"grad_norm": 0.9030990600585938,
"learning_rate": 5.904985404657187e-06,
"loss": 0.4409,
"step": 10190
},
{
"epoch": 0.4984241002712014,
"grad_norm": 1.0445612668991089,
"learning_rate": 5.896596360215292e-06,
"loss": 0.4419,
"step": 10200
},
{
"epoch": 0.4989127513498986,
"grad_norm": 0.8249901533126831,
"learning_rate": 5.888204707148263e-06,
"loss": 0.4406,
"step": 10210
},
{
"epoch": 0.4994014024285959,
"grad_norm": 0.4994339048862457,
"learning_rate": 5.8798104698714095e-06,
"loss": 0.4397,
"step": 10220
},
{
"epoch": 0.4998900535072931,
"grad_norm": 0.5726603865623474,
"learning_rate": 5.87141367280756e-06,
"loss": 0.4403,
"step": 10230
},
{
"epoch": 0.5003787045859903,
"grad_norm": 0.7047241926193237,
"learning_rate": 5.863014340386988e-06,
"loss": 0.4416,
"step": 10240
},
{
"epoch": 0.5008673556646877,
"grad_norm": 0.730197012424469,
"learning_rate": 5.854612497047347e-06,
"loss": 0.4419,
"step": 10250
},
{
"epoch": 0.5013560067433849,
"grad_norm": 0.6394559741020203,
"learning_rate": 5.846208167233593e-06,
"loss": 0.4407,
"step": 10260
},
{
"epoch": 0.5018446578220821,
"grad_norm": 0.4507567882537842,
"learning_rate": 5.837801375397916e-06,
"loss": 0.4399,
"step": 10270
},
{
"epoch": 0.5023333089007794,
"grad_norm": 0.6874068975448608,
"learning_rate": 5.829392145999673e-06,
"loss": 0.442,
"step": 10280
},
{
"epoch": 0.5028219599794767,
"grad_norm": 0.48060235381126404,
"learning_rate": 5.820980503505311e-06,
"loss": 0.4397,
"step": 10290
},
{
"epoch": 0.5033106110581739,
"grad_norm": 0.4969087541103363,
"learning_rate": 5.812566472388298e-06,
"loss": 0.4399,
"step": 10300
},
{
"epoch": 0.5037992621368712,
"grad_norm": 0.8934044241905212,
"learning_rate": 5.804150077129049e-06,
"loss": 0.4406,
"step": 10310
},
{
"epoch": 0.5042879132155684,
"grad_norm": 0.6583065390586853,
"learning_rate": 5.795731342214861e-06,
"loss": 0.4406,
"step": 10320
},
{
"epoch": 0.5047765642942657,
"grad_norm": 0.7381777167320251,
"learning_rate": 5.787310292139837e-06,
"loss": 0.4414,
"step": 10330
},
{
"epoch": 0.505265215372963,
"grad_norm": 0.5181640386581421,
"learning_rate": 5.778886951404816e-06,
"loss": 0.4409,
"step": 10340
},
{
"epoch": 0.5057538664516602,
"grad_norm": 0.44236427545547485,
"learning_rate": 5.770461344517302e-06,
"loss": 0.4415,
"step": 10350
},
{
"epoch": 0.5062425175303574,
"grad_norm": 0.40523165464401245,
"learning_rate": 5.76203349599139e-06,
"loss": 0.4404,
"step": 10360
},
{
"epoch": 0.5067311686090546,
"grad_norm": 0.36556363105773926,
"learning_rate": 5.753603430347699e-06,
"loss": 0.443,
"step": 10370
},
{
"epoch": 0.507219819687752,
"grad_norm": 0.3584481477737427,
"learning_rate": 5.7451711721133e-06,
"loss": 0.44,
"step": 10380
},
{
"epoch": 0.5077084707664492,
"grad_norm": 0.5849773287773132,
"learning_rate": 5.736736745821641e-06,
"loss": 0.4398,
"step": 10390
},
{
"epoch": 0.5081971218451464,
"grad_norm": 1.39704167842865,
"learning_rate": 5.728300176012476e-06,
"loss": 0.4406,
"step": 10400
},
{
"epoch": 0.5086857729238438,
"grad_norm": 1.3421454429626465,
"learning_rate": 5.719861487231802e-06,
"loss": 0.4411,
"step": 10410
},
{
"epoch": 0.509174424002541,
"grad_norm": 0.8897213935852051,
"learning_rate": 5.711420704031774e-06,
"loss": 0.4418,
"step": 10420
},
{
"epoch": 0.5096630750812382,
"grad_norm": 0.8177825212478638,
"learning_rate": 5.702977850970646e-06,
"loss": 0.4414,
"step": 10430
},
{
"epoch": 0.5101517261599355,
"grad_norm": 0.5944052934646606,
"learning_rate": 5.694532952612692e-06,
"loss": 0.4406,
"step": 10440
},
{
"epoch": 0.5106403772386328,
"grad_norm": 0.48135659098625183,
"learning_rate": 5.686086033528135e-06,
"loss": 0.4409,
"step": 10450
},
{
"epoch": 0.51112902831733,
"grad_norm": 0.6524203419685364,
"learning_rate": 5.67763711829308e-06,
"loss": 0.4413,
"step": 10460
},
{
"epoch": 0.5116176793960273,
"grad_norm": 0.8007875084877014,
"learning_rate": 5.66918623148944e-06,
"loss": 0.4399,
"step": 10470
},
{
"epoch": 0.5121063304747245,
"grad_norm": 0.9331921339035034,
"learning_rate": 5.660733397704861e-06,
"loss": 0.4407,
"step": 10480
},
{
"epoch": 0.5125949815534218,
"grad_norm": 0.5154340863227844,
"learning_rate": 5.652278641532657e-06,
"loss": 0.4399,
"step": 10490
},
{
"epoch": 0.513083632632119,
"grad_norm": 0.5443922877311707,
"learning_rate": 5.643821987571732e-06,
"loss": 0.4418,
"step": 10500
},
{
"epoch": 0.513083632632119,
"eval_loss": 0.41731706261634827,
"eval_runtime": 729.1332,
"eval_samples_per_second": 242.628,
"eval_steps_per_second": 0.475,
"step": 10500
},
{
"epoch": 0.5135722837108163,
"grad_norm": 0.7409442067146301,
"learning_rate": 5.635363460426516e-06,
"loss": 0.4416,
"step": 10510
},
{
"epoch": 0.5140609347895135,
"grad_norm": 0.5923414826393127,
"learning_rate": 5.6269030847068855e-06,
"loss": 0.4398,
"step": 10520
},
{
"epoch": 0.5145495858682108,
"grad_norm": 0.4530554711818695,
"learning_rate": 5.6184408850280955e-06,
"loss": 0.4408,
"step": 10530
},
{
"epoch": 0.5150382369469081,
"grad_norm": 0.49950364232063293,
"learning_rate": 5.609976886010708e-06,
"loss": 0.4409,
"step": 10540
},
{
"epoch": 0.5155268880256053,
"grad_norm": 2.171323776245117,
"learning_rate": 5.601511112280525e-06,
"loss": 0.4396,
"step": 10550
},
{
"epoch": 0.5160155391043025,
"grad_norm": 0.5502694249153137,
"learning_rate": 5.593043588468502e-06,
"loss": 0.4399,
"step": 10560
},
{
"epoch": 0.5165041901829999,
"grad_norm": 0.3139466941356659,
"learning_rate": 5.584574339210694e-06,
"loss": 0.4405,
"step": 10570
},
{
"epoch": 0.5169928412616971,
"grad_norm": 0.756894588470459,
"learning_rate": 5.576103389148175e-06,
"loss": 0.4401,
"step": 10580
},
{
"epoch": 0.5174814923403943,
"grad_norm": 0.5437245965003967,
"learning_rate": 5.567630762926967e-06,
"loss": 0.4412,
"step": 10590
},
{
"epoch": 0.5179701434190916,
"grad_norm": 0.796293318271637,
"learning_rate": 5.559156485197967e-06,
"loss": 0.441,
"step": 10600
},
{
"epoch": 0.5184587944977889,
"grad_norm": 0.642201840877533,
"learning_rate": 5.550680580616878e-06,
"loss": 0.4412,
"step": 10610
},
{
"epoch": 0.5189474455764861,
"grad_norm": 0.2663089632987976,
"learning_rate": 5.542203073844139e-06,
"loss": 0.441,
"step": 10620
},
{
"epoch": 0.5194360966551834,
"grad_norm": 0.45160502195358276,
"learning_rate": 5.533723989544844e-06,
"loss": 0.4404,
"step": 10630
},
{
"epoch": 0.5199247477338806,
"grad_norm": 0.4790808856487274,
"learning_rate": 5.525243352388686e-06,
"loss": 0.4402,
"step": 10640
},
{
"epoch": 0.5204133988125779,
"grad_norm": 0.3323618471622467,
"learning_rate": 5.5167611870498676e-06,
"loss": 0.4398,
"step": 10650
},
{
"epoch": 0.5209020498912751,
"grad_norm": 0.3828358054161072,
"learning_rate": 5.508277518207042e-06,
"loss": 0.4402,
"step": 10660
},
{
"epoch": 0.5213907009699724,
"grad_norm": 4.394709587097168,
"learning_rate": 5.499792370543236e-06,
"loss": 0.4401,
"step": 10670
},
{
"epoch": 0.5218793520486696,
"grad_norm": 0.34605780243873596,
"learning_rate": 5.491305768745776e-06,
"loss": 0.4409,
"step": 10680
},
{
"epoch": 0.5223680031273669,
"grad_norm": 0.41763895750045776,
"learning_rate": 5.4828177375062255e-06,
"loss": 0.4398,
"step": 10690
},
{
"epoch": 0.5228566542060642,
"grad_norm": 1.0943188667297363,
"learning_rate": 5.474328301520302e-06,
"loss": 0.4395,
"step": 10700
},
{
"epoch": 0.5233453052847614,
"grad_norm": 0.8608265519142151,
"learning_rate": 5.465837485487813e-06,
"loss": 0.4413,
"step": 10710
},
{
"epoch": 0.5238339563634586,
"grad_norm": 1.6863247156143188,
"learning_rate": 5.457345314112577e-06,
"loss": 0.4413,
"step": 10720
},
{
"epoch": 0.524322607442156,
"grad_norm": 0.5766188502311707,
"learning_rate": 5.448851812102357e-06,
"loss": 0.4406,
"step": 10730
},
{
"epoch": 0.5248112585208532,
"grad_norm": 0.84405517578125,
"learning_rate": 5.440357004168795e-06,
"loss": 0.441,
"step": 10740
},
{
"epoch": 0.5252999095995504,
"grad_norm": 0.7851320505142212,
"learning_rate": 5.431860915027321e-06,
"loss": 0.4402,
"step": 10750
},
{
"epoch": 0.5257885606782478,
"grad_norm": 0.4214421510696411,
"learning_rate": 5.423363569397101e-06,
"loss": 0.441,
"step": 10760
},
{
"epoch": 0.526277211756945,
"grad_norm": 1.1546157598495483,
"learning_rate": 5.4148649920009534e-06,
"loss": 0.4394,
"step": 10770
},
{
"epoch": 0.5267658628356422,
"grad_norm": 0.7156729102134705,
"learning_rate": 5.4063652075652786e-06,
"loss": 0.4404,
"step": 10780
},
{
"epoch": 0.5272545139143394,
"grad_norm": 1.8909116983413696,
"learning_rate": 5.3978642408199934e-06,
"loss": 0.4409,
"step": 10790
},
{
"epoch": 0.5277431649930368,
"grad_norm": 0.5709353685379028,
"learning_rate": 5.3893621164984524e-06,
"loss": 0.4403,
"step": 10800
},
{
"epoch": 0.528231816071734,
"grad_norm": 0.8182409405708313,
"learning_rate": 5.380858859337375e-06,
"loss": 0.4404,
"step": 10810
},
{
"epoch": 0.5287204671504312,
"grad_norm": 0.432432621717453,
"learning_rate": 5.372354494076784e-06,
"loss": 0.4402,
"step": 10820
},
{
"epoch": 0.5292091182291285,
"grad_norm": 0.8491529226303101,
"learning_rate": 5.363849045459918e-06,
"loss": 0.44,
"step": 10830
},
{
"epoch": 0.5296977693078257,
"grad_norm": 0.4220905900001526,
"learning_rate": 5.355342538233172e-06,
"loss": 0.4399,
"step": 10840
},
{
"epoch": 0.530186420386523,
"grad_norm": 1.0726776123046875,
"learning_rate": 5.346834997146023e-06,
"loss": 0.44,
"step": 10850
},
{
"epoch": 0.5306750714652203,
"grad_norm": 0.43123483657836914,
"learning_rate": 5.3383264469509484e-06,
"loss": 0.4411,
"step": 10860
},
{
"epoch": 0.5311637225439175,
"grad_norm": 0.3041502833366394,
"learning_rate": 5.32981691240337e-06,
"loss": 0.4414,
"step": 10870
},
{
"epoch": 0.5316523736226147,
"grad_norm": 0.7714064121246338,
"learning_rate": 5.321306418261572e-06,
"loss": 0.4402,
"step": 10880
},
{
"epoch": 0.5321410247013121,
"grad_norm": 0.441977322101593,
"learning_rate": 5.31279498928662e-06,
"loss": 0.44,
"step": 10890
},
{
"epoch": 0.5326296757800093,
"grad_norm": 1.5782145261764526,
"learning_rate": 5.304282650242318e-06,
"loss": 0.4406,
"step": 10900
},
{
"epoch": 0.5331183268587065,
"grad_norm": 0.678400993347168,
"learning_rate": 5.295769425895102e-06,
"loss": 0.4412,
"step": 10910
},
{
"epoch": 0.5336069779374037,
"grad_norm": 0.9773678183555603,
"learning_rate": 5.28725534101399e-06,
"loss": 0.4407,
"step": 10920
},
{
"epoch": 0.5340956290161011,
"grad_norm": 0.6579413414001465,
"learning_rate": 5.278740420370506e-06,
"loss": 0.442,
"step": 10930
},
{
"epoch": 0.5345842800947983,
"grad_norm": 0.760147213935852,
"learning_rate": 5.2702246887386e-06,
"loss": 0.4407,
"step": 10940
},
{
"epoch": 0.5350729311734955,
"grad_norm": 0.9420449137687683,
"learning_rate": 5.261708170894585e-06,
"loss": 0.4395,
"step": 10950
},
{
"epoch": 0.5355615822521929,
"grad_norm": 1.1415859460830688,
"learning_rate": 5.253190891617063e-06,
"loss": 0.4402,
"step": 10960
},
{
"epoch": 0.5360502333308901,
"grad_norm": 0.4278971552848816,
"learning_rate": 5.244672875686847e-06,
"loss": 0.4405,
"step": 10970
},
{
"epoch": 0.5365388844095873,
"grad_norm": 0.6837897300720215,
"learning_rate": 5.236154147886896e-06,
"loss": 0.4399,
"step": 10980
},
{
"epoch": 0.5370275354882846,
"grad_norm": 0.7087698578834534,
"learning_rate": 5.227634733002241e-06,
"loss": 0.4397,
"step": 10990
},
{
"epoch": 0.5375161865669819,
"grad_norm": 1.1717066764831543,
"learning_rate": 5.219114655819909e-06,
"loss": 0.4408,
"step": 11000
},
{
"epoch": 0.5375161865669819,
"eval_loss": 0.4167872965335846,
"eval_runtime": 729.0747,
"eval_samples_per_second": 242.647,
"eval_steps_per_second": 0.475,
"step": 11000
},
{
"epoch": 0.5380048376456791,
"grad_norm": 0.7513532638549805,
"learning_rate": 5.210593941128858e-06,
"loss": 0.4408,
"step": 11010
},
{
"epoch": 0.5384934887243764,
"grad_norm": 0.6454597115516663,
"learning_rate": 5.202072613719895e-06,
"loss": 0.4406,
"step": 11020
},
{
"epoch": 0.5389821398030736,
"grad_norm": 0.459091454744339,
"learning_rate": 5.193550698385616e-06,
"loss": 0.4411,
"step": 11030
},
{
"epoch": 0.5394707908817709,
"grad_norm": 0.40384477376937866,
"learning_rate": 5.185028219920325e-06,
"loss": 0.4406,
"step": 11040
},
{
"epoch": 0.5399594419604681,
"grad_norm": 0.44627973437309265,
"learning_rate": 5.1765052031199626e-06,
"loss": 0.4393,
"step": 11050
},
{
"epoch": 0.5404480930391654,
"grad_norm": 0.9470422267913818,
"learning_rate": 5.167981672782038e-06,
"loss": 0.4395,
"step": 11060
},
{
"epoch": 0.5409367441178626,
"grad_norm": 0.968473494052887,
"learning_rate": 5.1594576537055555e-06,
"loss": 0.4401,
"step": 11070
},
{
"epoch": 0.5414253951965599,
"grad_norm": 0.4251641631126404,
"learning_rate": 5.150933170690936e-06,
"loss": 0.439,
"step": 11080
},
{
"epoch": 0.5419140462752572,
"grad_norm": 0.5823407173156738,
"learning_rate": 5.142408248539956e-06,
"loss": 0.4398,
"step": 11090
},
{
"epoch": 0.5424026973539544,
"grad_norm": 0.7198439836502075,
"learning_rate": 5.133882912055669e-06,
"loss": 0.439,
"step": 11100
},
{
"epoch": 0.5428913484326516,
"grad_norm": 0.8078601360321045,
"learning_rate": 5.125357186042329e-06,
"loss": 0.44,
"step": 11110
},
{
"epoch": 0.543379999511349,
"grad_norm": 0.713046133518219,
"learning_rate": 5.116831095305331e-06,
"loss": 0.4398,
"step": 11120
},
{
"epoch": 0.5438686505900462,
"grad_norm": 0.5632086396217346,
"learning_rate": 5.108304664651123e-06,
"loss": 0.4398,
"step": 11130
},
{
"epoch": 0.5443573016687434,
"grad_norm": 1.3256471157073975,
"learning_rate": 5.099777918887149e-06,
"loss": 0.4396,
"step": 11140
},
{
"epoch": 0.5448459527474407,
"grad_norm": 0.9530927538871765,
"learning_rate": 5.0912508828217645e-06,
"loss": 0.4389,
"step": 11150
},
{
"epoch": 0.545334603826138,
"grad_norm": 2.566054582595825,
"learning_rate": 5.082723581264174e-06,
"loss": 0.44,
"step": 11160
},
{
"epoch": 0.5458232549048352,
"grad_norm": 0.6221000552177429,
"learning_rate": 5.074196039024351e-06,
"loss": 0.4399,
"step": 11170
},
{
"epoch": 0.5463119059835324,
"grad_norm": 0.5202614665031433,
"learning_rate": 5.065668280912972e-06,
"loss": 0.4394,
"step": 11180
},
{
"epoch": 0.5468005570622297,
"grad_norm": 0.9228209257125854,
"learning_rate": 5.057140331741337e-06,
"loss": 0.4402,
"step": 11190
},
{
"epoch": 0.547289208140927,
"grad_norm": 0.3940802216529846,
"learning_rate": 5.048612216321311e-06,
"loss": 0.4393,
"step": 11200
},
{
"epoch": 0.5477778592196242,
"grad_norm": 1.3075381517410278,
"learning_rate": 5.04008395946523e-06,
"loss": 0.4407,
"step": 11210
},
{
"epoch": 0.5482665102983215,
"grad_norm": 0.4319058060646057,
"learning_rate": 5.031555585985852e-06,
"loss": 0.4396,
"step": 11220
},
{
"epoch": 0.5487551613770187,
"grad_norm": 0.9323760867118835,
"learning_rate": 5.023027120696271e-06,
"loss": 0.4395,
"step": 11230
},
{
"epoch": 0.549243812455716,
"grad_norm": 0.726767361164093,
"learning_rate": 5.014498588409847e-06,
"loss": 0.4403,
"step": 11240
},
{
"epoch": 0.5497324635344133,
"grad_norm": 0.6504103541374207,
"learning_rate": 5.005970013940133e-06,
"loss": 0.4397,
"step": 11250
},
{
"epoch": 0.5502211146131105,
"grad_norm": 1.1144918203353882,
"learning_rate": 4.9974414221008125e-06,
"loss": 0.4412,
"step": 11260
},
{
"epoch": 0.5507097656918077,
"grad_norm": 0.6615655422210693,
"learning_rate": 4.98891283770561e-06,
"loss": 0.4397,
"step": 11270
},
{
"epoch": 0.5511984167705051,
"grad_norm": 0.5302955508232117,
"learning_rate": 4.980384285568235e-06,
"loss": 0.4395,
"step": 11280
},
{
"epoch": 0.5516870678492023,
"grad_norm": 0.4470592439174652,
"learning_rate": 4.9718557905023e-06,
"loss": 0.4402,
"step": 11290
},
{
"epoch": 0.5521757189278995,
"grad_norm": 0.4626651108264923,
"learning_rate": 4.963327377321253e-06,
"loss": 0.4382,
"step": 11300
},
{
"epoch": 0.5526643700065967,
"grad_norm": 0.48710301518440247,
"learning_rate": 4.954799070838304e-06,
"loss": 0.4404,
"step": 11310
},
{
"epoch": 0.5531530210852941,
"grad_norm": 0.31981727480888367,
"learning_rate": 4.946270895866347e-06,
"loss": 0.4391,
"step": 11320
},
{
"epoch": 0.5536416721639913,
"grad_norm": 1.153678297996521,
"learning_rate": 4.937742877217906e-06,
"loss": 0.4403,
"step": 11330
},
{
"epoch": 0.5541303232426885,
"grad_norm": 1.0284217596054077,
"learning_rate": 4.929215039705035e-06,
"loss": 0.4402,
"step": 11340
},
{
"epoch": 0.5546189743213858,
"grad_norm": 0.7204963564872742,
"learning_rate": 4.920687408139271e-06,
"loss": 0.439,
"step": 11350
},
{
"epoch": 0.5551076254000831,
"grad_norm": 0.6162496209144592,
"learning_rate": 4.91216000733155e-06,
"loss": 0.439,
"step": 11360
},
{
"epoch": 0.5555962764787803,
"grad_norm": 0.5891590118408203,
"learning_rate": 4.903632862092135e-06,
"loss": 0.439,
"step": 11370
},
{
"epoch": 0.5560849275574776,
"grad_norm": 0.5290629863739014,
"learning_rate": 4.895105997230544e-06,
"loss": 0.4407,
"step": 11380
},
{
"epoch": 0.5565735786361748,
"grad_norm": 1.0910426378250122,
"learning_rate": 4.886579437555484e-06,
"loss": 0.4386,
"step": 11390
},
{
"epoch": 0.5570622297148721,
"grad_norm": 0.4426107108592987,
"learning_rate": 4.878053207874771e-06,
"loss": 0.4393,
"step": 11400
},
{
"epoch": 0.5575508807935694,
"grad_norm": 0.7471179366111755,
"learning_rate": 4.8695273329952605e-06,
"loss": 0.4396,
"step": 11410
},
{
"epoch": 0.5580395318722666,
"grad_norm": 0.6447209119796753,
"learning_rate": 4.861001837722775e-06,
"loss": 0.4401,
"step": 11420
},
{
"epoch": 0.5585281829509638,
"grad_norm": 0.42997971177101135,
"learning_rate": 4.852476746862036e-06,
"loss": 0.4389,
"step": 11430
},
{
"epoch": 0.5590168340296611,
"grad_norm": 2.535978317260742,
"learning_rate": 4.8439520852165874e-06,
"loss": 0.4398,
"step": 11440
},
{
"epoch": 0.5595054851083584,
"grad_norm": 0.5462396740913391,
"learning_rate": 4.8354278775887215e-06,
"loss": 0.4402,
"step": 11450
},
{
"epoch": 0.5599941361870556,
"grad_norm": 0.6172703504562378,
"learning_rate": 4.8269041487794115e-06,
"loss": 0.4396,
"step": 11460
},
{
"epoch": 0.5604827872657528,
"grad_norm": 0.6260773539543152,
"learning_rate": 4.81838092358824e-06,
"loss": 0.4387,
"step": 11470
},
{
"epoch": 0.5609714383444502,
"grad_norm": 0.45732706785202026,
"learning_rate": 4.809858226813317e-06,
"loss": 0.4398,
"step": 11480
},
{
"epoch": 0.5614600894231474,
"grad_norm": 0.5570266246795654,
"learning_rate": 4.801336083251224e-06,
"loss": 0.4393,
"step": 11490
},
{
"epoch": 0.5619487405018446,
"grad_norm": 0.4119241535663605,
"learning_rate": 4.792814517696927e-06,
"loss": 0.4403,
"step": 11500
},
{
"epoch": 0.5619487405018446,
"eval_loss": 0.414816677570343,
"eval_runtime": 728.7625,
"eval_samples_per_second": 242.751,
"eval_steps_per_second": 0.475,
"step": 11500
},
{
"epoch": 0.562437391580542,
"grad_norm": 0.9447699785232544,
"learning_rate": 4.784293554943712e-06,
"loss": 0.4389,
"step": 11510
},
{
"epoch": 0.5629260426592392,
"grad_norm": 0.5308591723442078,
"learning_rate": 4.775773219783112e-06,
"loss": 0.4406,
"step": 11520
},
{
"epoch": 0.5634146937379364,
"grad_norm": 1.3727697134017944,
"learning_rate": 4.767253537004832e-06,
"loss": 0.4401,
"step": 11530
},
{
"epoch": 0.5639033448166337,
"grad_norm": 0.9330416321754456,
"learning_rate": 4.7587345313966815e-06,
"loss": 0.4406,
"step": 11540
},
{
"epoch": 0.564391995895331,
"grad_norm": 0.32605278491973877,
"learning_rate": 4.7502162277445e-06,
"loss": 0.44,
"step": 11550
},
{
"epoch": 0.5648806469740282,
"grad_norm": 0.7518082857131958,
"learning_rate": 4.741698650832081e-06,
"loss": 0.4393,
"step": 11560
},
{
"epoch": 0.5653692980527254,
"grad_norm": 0.4452798068523407,
"learning_rate": 4.7331818254411046e-06,
"loss": 0.44,
"step": 11570
},
{
"epoch": 0.5658579491314227,
"grad_norm": 0.46914970874786377,
"learning_rate": 4.724665776351069e-06,
"loss": 0.44,
"step": 11580
},
{
"epoch": 0.56634660021012,
"grad_norm": 0.7172492146492004,
"learning_rate": 4.716150528339208e-06,
"loss": 0.4404,
"step": 11590
},
{
"epoch": 0.5668352512888172,
"grad_norm": 0.5215288996696472,
"learning_rate": 4.7076361061804264e-06,
"loss": 0.4399,
"step": 11600
},
{
"epoch": 0.5673239023675145,
"grad_norm": 0.5500718355178833,
"learning_rate": 4.69912253464723e-06,
"loss": 0.4399,
"step": 11610
},
{
"epoch": 0.5678125534462117,
"grad_norm": 0.9018455147743225,
"learning_rate": 4.690609838509642e-06,
"loss": 0.4396,
"step": 11620
},
{
"epoch": 0.568301204524909,
"grad_norm": 0.46901988983154297,
"learning_rate": 4.682098042535145e-06,
"loss": 0.4382,
"step": 11630
},
{
"epoch": 0.5687898556036063,
"grad_norm": 1.1770741939544678,
"learning_rate": 4.673587171488601e-06,
"loss": 0.4402,
"step": 11640
},
{
"epoch": 0.5692785066823035,
"grad_norm": 0.3521255552768707,
"learning_rate": 4.665077250132183e-06,
"loss": 0.4388,
"step": 11650
},
{
"epoch": 0.5697671577610007,
"grad_norm": 0.4423331618309021,
"learning_rate": 4.656568303225296e-06,
"loss": 0.4402,
"step": 11660
},
{
"epoch": 0.5702558088396981,
"grad_norm": 0.4402877390384674,
"learning_rate": 4.648060355524512e-06,
"loss": 0.4391,
"step": 11670
},
{
"epoch": 0.5707444599183953,
"grad_norm": 0.3995070457458496,
"learning_rate": 4.639553431783498e-06,
"loss": 0.4404,
"step": 11680
},
{
"epoch": 0.5712331109970925,
"grad_norm": 0.5264308452606201,
"learning_rate": 4.63104755675294e-06,
"loss": 0.4389,
"step": 11690
},
{
"epoch": 0.5717217620757897,
"grad_norm": 0.28230753540992737,
"learning_rate": 4.622542755180471e-06,
"loss": 0.4389,
"step": 11700
},
{
"epoch": 0.5722104131544871,
"grad_norm": 0.7925990223884583,
"learning_rate": 4.6140390518106034e-06,
"loss": 0.4395,
"step": 11710
},
{
"epoch": 0.5726990642331843,
"grad_norm": 1.0194525718688965,
"learning_rate": 4.605536471384656e-06,
"loss": 0.44,
"step": 11720
},
{
"epoch": 0.5731877153118815,
"grad_norm": 0.510903000831604,
"learning_rate": 4.597035038640676e-06,
"loss": 0.439,
"step": 11730
},
{
"epoch": 0.5736763663905788,
"grad_norm": 0.42407867312431335,
"learning_rate": 4.5885347783133725e-06,
"loss": 0.4401,
"step": 11740
},
{
"epoch": 0.5741650174692761,
"grad_norm": 0.5859852433204651,
"learning_rate": 4.580035715134047e-06,
"loss": 0.4381,
"step": 11750
},
{
"epoch": 0.5746536685479733,
"grad_norm": 0.5147973895072937,
"learning_rate": 4.571537873830515e-06,
"loss": 0.4399,
"step": 11760
},
{
"epoch": 0.5751423196266706,
"grad_norm": 0.6203701496124268,
"learning_rate": 4.563041279127038e-06,
"loss": 0.4389,
"step": 11770
},
{
"epoch": 0.5756309707053678,
"grad_norm": 0.4585236608982086,
"learning_rate": 4.554545955744247e-06,
"loss": 0.4383,
"step": 11780
},
{
"epoch": 0.5761196217840651,
"grad_norm": 0.41942375898361206,
"learning_rate": 4.546051928399081e-06,
"loss": 0.4386,
"step": 11790
},
{
"epoch": 0.5766082728627624,
"grad_norm": 0.5585193037986755,
"learning_rate": 4.537559221804703e-06,
"loss": 0.4389,
"step": 11800
},
{
"epoch": 0.5770969239414596,
"grad_norm": 0.4607734680175781,
"learning_rate": 4.529067860670433e-06,
"loss": 0.4388,
"step": 11810
},
{
"epoch": 0.5775855750201568,
"grad_norm": 0.6180665493011475,
"learning_rate": 4.520577869701679e-06,
"loss": 0.4382,
"step": 11820
},
{
"epoch": 0.5780742260988541,
"grad_norm": 0.7965272068977356,
"learning_rate": 4.5120892735998636e-06,
"loss": 0.4387,
"step": 11830
},
{
"epoch": 0.5785628771775514,
"grad_norm": 0.37461355328559875,
"learning_rate": 4.503602097062344e-06,
"loss": 0.4395,
"step": 11840
},
{
"epoch": 0.5790515282562486,
"grad_norm": 0.5917596817016602,
"learning_rate": 4.4951163647823595e-06,
"loss": 0.4385,
"step": 11850
},
{
"epoch": 0.5795401793349458,
"grad_norm": 0.47392183542251587,
"learning_rate": 4.486632101448935e-06,
"loss": 0.4372,
"step": 11860
},
{
"epoch": 0.5800288304136432,
"grad_norm": 0.43549230694770813,
"learning_rate": 4.478149331746829e-06,
"loss": 0.4387,
"step": 11870
},
{
"epoch": 0.5805174814923404,
"grad_norm": 0.5697550177574158,
"learning_rate": 4.469668080356451e-06,
"loss": 0.4387,
"step": 11880
},
{
"epoch": 0.5810061325710376,
"grad_norm": 0.3437957763671875,
"learning_rate": 4.461188371953795e-06,
"loss": 0.4388,
"step": 11890
},
{
"epoch": 0.581494783649735,
"grad_norm": 1.4066935777664185,
"learning_rate": 4.4527102312103624e-06,
"loss": 0.4402,
"step": 11900
},
{
"epoch": 0.5819834347284322,
"grad_norm": 0.5635364055633545,
"learning_rate": 4.4442336827930995e-06,
"loss": 0.4387,
"step": 11910
},
{
"epoch": 0.5824720858071294,
"grad_norm": 0.42688384652137756,
"learning_rate": 4.435758751364312e-06,
"loss": 0.4408,
"step": 11920
},
{
"epoch": 0.5829607368858267,
"grad_norm": 0.5010594725608826,
"learning_rate": 4.427285461581609e-06,
"loss": 0.4385,
"step": 11930
},
{
"epoch": 0.583449387964524,
"grad_norm": 0.6035897135734558,
"learning_rate": 4.418813838097815e-06,
"loss": 0.4402,
"step": 11940
},
{
"epoch": 0.5839380390432212,
"grad_norm": 0.7641412019729614,
"learning_rate": 4.410343905560916e-06,
"loss": 0.4391,
"step": 11950
},
{
"epoch": 0.5844266901219184,
"grad_norm": 0.4700312614440918,
"learning_rate": 4.401875688613971e-06,
"loss": 0.4379,
"step": 11960
},
{
"epoch": 0.5849153412006157,
"grad_norm": 0.9198450446128845,
"learning_rate": 4.3934092118950485e-06,
"loss": 0.4374,
"step": 11970
},
{
"epoch": 0.585403992279313,
"grad_norm": 0.896514356136322,
"learning_rate": 4.384944500037156e-06,
"loss": 0.4384,
"step": 11980
},
{
"epoch": 0.5858926433580102,
"grad_norm": 0.49591732025146484,
"learning_rate": 4.376481577668167e-06,
"loss": 0.44,
"step": 11990
},
{
"epoch": 0.5863812944367075,
"grad_norm": 0.5625073909759521,
"learning_rate": 4.368020469410742e-06,
"loss": 0.4389,
"step": 12000
},
{
"epoch": 0.5863812944367075,
"eval_loss": 0.41703999042510986,
"eval_runtime": 727.8065,
"eval_samples_per_second": 243.07,
"eval_steps_per_second": 0.475,
"step": 12000
},
{
"epoch": 0.5868699455154047,
"grad_norm": 0.6674771904945374,
"learning_rate": 4.359561199882272e-06,
"loss": 0.4393,
"step": 12010
},
{
"epoch": 0.587358596594102,
"grad_norm": 0.5143821239471436,
"learning_rate": 4.351103793694794e-06,
"loss": 0.4375,
"step": 12020
},
{
"epoch": 0.5878472476727993,
"grad_norm": 0.4788214862346649,
"learning_rate": 4.342648275454922e-06,
"loss": 0.4386,
"step": 12030
},
{
"epoch": 0.5883358987514965,
"grad_norm": 0.5421459078788757,
"learning_rate": 4.334194669763781e-06,
"loss": 0.4386,
"step": 12040
},
{
"epoch": 0.5888245498301937,
"grad_norm": 0.6345226168632507,
"learning_rate": 4.325743001216926e-06,
"loss": 0.4388,
"step": 12050
},
{
"epoch": 0.589313200908891,
"grad_norm": 1.1048717498779297,
"learning_rate": 4.317293294404285e-06,
"loss": 0.44,
"step": 12060
},
{
"epoch": 0.5898018519875883,
"grad_norm": 0.5707539916038513,
"learning_rate": 4.308845573910071e-06,
"loss": 0.4379,
"step": 12070
},
{
"epoch": 0.5902905030662855,
"grad_norm": 0.7084303498268127,
"learning_rate": 4.300399864312718e-06,
"loss": 0.4388,
"step": 12080
},
{
"epoch": 0.5907791541449827,
"grad_norm": 0.5199768543243408,
"learning_rate": 4.291956190184811e-06,
"loss": 0.4385,
"step": 12090
},
{
"epoch": 0.59126780522368,
"grad_norm": 0.35853302478790283,
"learning_rate": 4.283514576093015e-06,
"loss": 0.44,
"step": 12100
},
{
"epoch": 0.5917564563023773,
"grad_norm": 0.6634894609451294,
"learning_rate": 4.275075046597997e-06,
"loss": 0.4386,
"step": 12110
},
{
"epoch": 0.5922451073810745,
"grad_norm": 0.3389874994754791,
"learning_rate": 4.266637626254363e-06,
"loss": 0.439,
"step": 12120
},
{
"epoch": 0.5927337584597718,
"grad_norm": 0.38937532901763916,
"learning_rate": 4.258202339610581e-06,
"loss": 0.4389,
"step": 12130
},
{
"epoch": 0.593222409538469,
"grad_norm": 0.47301584482192993,
"learning_rate": 4.2497692112089086e-06,
"loss": 0.4382,
"step": 12140
},
{
"epoch": 0.5937110606171663,
"grad_norm": 0.4262164533138275,
"learning_rate": 4.241338265585327e-06,
"loss": 0.4384,
"step": 12150
},
{
"epoch": 0.5941997116958636,
"grad_norm": 0.3946975767612457,
"learning_rate": 4.232909527269465e-06,
"loss": 0.4389,
"step": 12160
},
{
"epoch": 0.5946883627745608,
"grad_norm": 0.30611652135849,
"learning_rate": 4.2244830207845335e-06,
"loss": 0.4384,
"step": 12170
},
{
"epoch": 0.595177013853258,
"grad_norm": 0.5015509128570557,
"learning_rate": 4.2160587706472445e-06,
"loss": 0.4386,
"step": 12180
},
{
"epoch": 0.5956656649319554,
"grad_norm": 2.779911518096924,
"learning_rate": 4.207636801367746e-06,
"loss": 0.4388,
"step": 12190
},
{
"epoch": 0.5961543160106526,
"grad_norm": 0.940437912940979,
"learning_rate": 4.199217137449553e-06,
"loss": 0.4403,
"step": 12200
},
{
"epoch": 0.5966429670893498,
"grad_norm": 1.1815273761749268,
"learning_rate": 4.190799803389472e-06,
"loss": 0.4384,
"step": 12210
},
{
"epoch": 0.597131618168047,
"grad_norm": 0.4534102976322174,
"learning_rate": 4.182384823677527e-06,
"loss": 0.4385,
"step": 12220
},
{
"epoch": 0.5976202692467444,
"grad_norm": 0.694245457649231,
"learning_rate": 4.173972222796897e-06,
"loss": 0.4382,
"step": 12230
},
{
"epoch": 0.5981089203254416,
"grad_norm": 0.5328917503356934,
"learning_rate": 4.165562025223839e-06,
"loss": 0.4392,
"step": 12240
},
{
"epoch": 0.5985975714041388,
"grad_norm": 0.7805267572402954,
"learning_rate": 4.157154255427613e-06,
"loss": 0.4394,
"step": 12250
},
{
"epoch": 0.5990862224828362,
"grad_norm": 0.6959843635559082,
"learning_rate": 4.148748937870425e-06,
"loss": 0.4366,
"step": 12260
},
{
"epoch": 0.5995748735615334,
"grad_norm": 0.9793679714202881,
"learning_rate": 4.140346097007336e-06,
"loss": 0.4383,
"step": 12270
},
{
"epoch": 0.6000635246402306,
"grad_norm": 0.3973505198955536,
"learning_rate": 4.1319457572862066e-06,
"loss": 0.4394,
"step": 12280
},
{
"epoch": 0.6005521757189279,
"grad_norm": 0.5687869191169739,
"learning_rate": 4.123547943147621e-06,
"loss": 0.4388,
"step": 12290
},
{
"epoch": 0.6010408267976252,
"grad_norm": 0.4026346802711487,
"learning_rate": 4.115152679024811e-06,
"loss": 0.4391,
"step": 12300
},
{
"epoch": 0.6015294778763224,
"grad_norm": 0.513808012008667,
"learning_rate": 4.106759989343594e-06,
"loss": 0.4381,
"step": 12310
},
{
"epoch": 0.6020181289550197,
"grad_norm": 0.36706215143203735,
"learning_rate": 4.0983698985222935e-06,
"loss": 0.4384,
"step": 12320
},
{
"epoch": 0.6025067800337169,
"grad_norm": 0.5302925705909729,
"learning_rate": 4.089982430971673e-06,
"loss": 0.4387,
"step": 12330
},
{
"epoch": 0.6029954311124142,
"grad_norm": 0.6953673362731934,
"learning_rate": 4.081597611094864e-06,
"loss": 0.4385,
"step": 12340
},
{
"epoch": 0.6034840821911114,
"grad_norm": 0.46951332688331604,
"learning_rate": 4.073215463287296e-06,
"loss": 0.4385,
"step": 12350
},
{
"epoch": 0.6039727332698087,
"grad_norm": 0.30504921078681946,
"learning_rate": 4.064836011936618e-06,
"loss": 0.4378,
"step": 12360
},
{
"epoch": 0.6044613843485059,
"grad_norm": 0.34291717410087585,
"learning_rate": 4.056459281422644e-06,
"loss": 0.4367,
"step": 12370
},
{
"epoch": 0.6049500354272032,
"grad_norm": 0.3311258852481842,
"learning_rate": 4.0480852961172635e-06,
"loss": 0.4387,
"step": 12380
},
{
"epoch": 0.6054386865059005,
"grad_norm": 0.48355287313461304,
"learning_rate": 4.039714080384381e-06,
"loss": 0.4385,
"step": 12390
},
{
"epoch": 0.6059273375845977,
"grad_norm": 0.6378800868988037,
"learning_rate": 4.031345658579846e-06,
"loss": 0.438,
"step": 12400
},
{
"epoch": 0.6064159886632949,
"grad_norm": 0.3167429566383362,
"learning_rate": 4.022980055051372e-06,
"loss": 0.4395,
"step": 12410
},
{
"epoch": 0.6069046397419923,
"grad_norm": 1.2204922437667847,
"learning_rate": 4.014617294138482e-06,
"loss": 0.4394,
"step": 12420
},
{
"epoch": 0.6073932908206895,
"grad_norm": 0.775138258934021,
"learning_rate": 4.006257400172422e-06,
"loss": 0.4393,
"step": 12430
},
{
"epoch": 0.6078819418993867,
"grad_norm": 0.5826382637023926,
"learning_rate": 3.9979003974760985e-06,
"loss": 0.4379,
"step": 12440
},
{
"epoch": 0.608370592978084,
"grad_norm": 0.5798311233520508,
"learning_rate": 3.989546310364005e-06,
"loss": 0.4379,
"step": 12450
},
{
"epoch": 0.6088592440567813,
"grad_norm": 0.749792218208313,
"learning_rate": 3.981195163142154e-06,
"loss": 0.4379,
"step": 12460
},
{
"epoch": 0.6093478951354785,
"grad_norm": 0.514415979385376,
"learning_rate": 3.972846980108005e-06,
"loss": 0.4391,
"step": 12470
},
{
"epoch": 0.6098365462141758,
"grad_norm": 0.38157758116722107,
"learning_rate": 3.964501785550392e-06,
"loss": 0.4375,
"step": 12480
},
{
"epoch": 0.610325197292873,
"grad_norm": 0.915421724319458,
"learning_rate": 3.956159603749452e-06,
"loss": 0.437,
"step": 12490
},
{
"epoch": 0.6108138483715703,
"grad_norm": 0.5357415080070496,
"learning_rate": 3.947820458976559e-06,
"loss": 0.4388,
"step": 12500
},
{
"epoch": 0.6108138483715703,
"eval_loss": 0.41651272773742676,
"eval_runtime": 728.6136,
"eval_samples_per_second": 242.801,
"eval_steps_per_second": 0.475,
"step": 12500
},
{
"epoch": 0.6113024994502675,
"grad_norm": 0.8443652391433716,
"learning_rate": 3.939484375494252e-06,
"loss": 0.4405,
"step": 12510
},
{
"epoch": 0.6117911505289648,
"grad_norm": 0.604301929473877,
"learning_rate": 3.931151377556159e-06,
"loss": 0.4383,
"step": 12520
},
{
"epoch": 0.612279801607662,
"grad_norm": 0.36815837025642395,
"learning_rate": 3.922821489406935e-06,
"loss": 0.4386,
"step": 12530
},
{
"epoch": 0.6127684526863593,
"grad_norm": 0.6259467005729675,
"learning_rate": 3.914494735282185e-06,
"loss": 0.4392,
"step": 12540
},
{
"epoch": 0.6132571037650566,
"grad_norm": 0.6359371542930603,
"learning_rate": 3.9061711394083965e-06,
"loss": 0.4392,
"step": 12550
},
{
"epoch": 0.6137457548437538,
"grad_norm": 0.5041322112083435,
"learning_rate": 3.897850726002864e-06,
"loss": 0.4399,
"step": 12560
},
{
"epoch": 0.614234405922451,
"grad_norm": 0.5972697138786316,
"learning_rate": 3.889533519273633e-06,
"loss": 0.4391,
"step": 12570
},
{
"epoch": 0.6147230570011484,
"grad_norm": 0.7836823463439941,
"learning_rate": 3.881219543419407e-06,
"loss": 0.4387,
"step": 12580
},
{
"epoch": 0.6152117080798456,
"grad_norm": 0.44390979409217834,
"learning_rate": 3.8729088226294995e-06,
"loss": 0.4384,
"step": 12590
},
{
"epoch": 0.6157003591585428,
"grad_norm": 0.32042884826660156,
"learning_rate": 3.8646013810837445e-06,
"loss": 0.4379,
"step": 12600
},
{
"epoch": 0.6161890102372402,
"grad_norm": 0.5421732664108276,
"learning_rate": 3.856297242952442e-06,
"loss": 0.4384,
"step": 12610
},
{
"epoch": 0.6166776613159374,
"grad_norm": 0.5136971473693848,
"learning_rate": 3.847996432396279e-06,
"loss": 0.4371,
"step": 12620
},
{
"epoch": 0.6171663123946346,
"grad_norm": 0.46279609203338623,
"learning_rate": 3.839698973566258e-06,
"loss": 0.4378,
"step": 12630
},
{
"epoch": 0.6176549634733318,
"grad_norm": 0.7376791834831238,
"learning_rate": 3.831404890603634e-06,
"loss": 0.4381,
"step": 12640
},
{
"epoch": 0.6181436145520292,
"grad_norm": 0.5303279757499695,
"learning_rate": 3.823114207639838e-06,
"loss": 0.4386,
"step": 12650
},
{
"epoch": 0.6186322656307264,
"grad_norm": 0.7225260138511658,
"learning_rate": 3.814826948796404e-06,
"loss": 0.438,
"step": 12660
},
{
"epoch": 0.6191209167094236,
"grad_norm": 0.8428411483764648,
"learning_rate": 3.8065431381849178e-06,
"loss": 0.4385,
"step": 12670
},
{
"epoch": 0.6196095677881209,
"grad_norm": 0.40499812364578247,
"learning_rate": 3.7982627999069164e-06,
"loss": 0.4382,
"step": 12680
},
{
"epoch": 0.6200982188668182,
"grad_norm": 0.44530633091926575,
"learning_rate": 3.7899859580538436e-06,
"loss": 0.4386,
"step": 12690
},
{
"epoch": 0.6205868699455154,
"grad_norm": 0.4268031418323517,
"learning_rate": 3.7817126367069674e-06,
"loss": 0.4374,
"step": 12700
},
{
"epoch": 0.6210755210242127,
"grad_norm": 0.2745535373687744,
"learning_rate": 3.773442859937313e-06,
"loss": 0.4383,
"step": 12710
},
{
"epoch": 0.6215641721029099,
"grad_norm": 0.5120725035667419,
"learning_rate": 3.765176651805593e-06,
"loss": 0.4383,
"step": 12720
},
{
"epoch": 0.6220528231816072,
"grad_norm": 0.3301103413105011,
"learning_rate": 3.7569140363621393e-06,
"loss": 0.4384,
"step": 12730
},
{
"epoch": 0.6225414742603045,
"grad_norm": 0.34257206320762634,
"learning_rate": 3.7486550376468266e-06,
"loss": 0.4366,
"step": 12740
},
{
"epoch": 0.6230301253390017,
"grad_norm": 0.37387409806251526,
"learning_rate": 3.7403996796890096e-06,
"loss": 0.4381,
"step": 12750
},
{
"epoch": 0.6235187764176989,
"grad_norm": 0.5832339525222778,
"learning_rate": 3.732147986507453e-06,
"loss": 0.4389,
"step": 12760
},
{
"epoch": 0.6240074274963961,
"grad_norm": 0.39319491386413574,
"learning_rate": 3.723899982110249e-06,
"loss": 0.4379,
"step": 12770
},
{
"epoch": 0.6244960785750935,
"grad_norm": 1.1208192110061646,
"learning_rate": 3.7156556904947725e-06,
"loss": 0.4374,
"step": 12780
},
{
"epoch": 0.6249847296537907,
"grad_norm": 1.2163150310516357,
"learning_rate": 3.7074151356475828e-06,
"loss": 0.4386,
"step": 12790
},
{
"epoch": 0.6254733807324879,
"grad_norm": 0.5749249458312988,
"learning_rate": 3.6991783415443726e-06,
"loss": 0.4376,
"step": 12800
},
{
"epoch": 0.6259620318111853,
"grad_norm": 0.3662860095500946,
"learning_rate": 3.6909453321498954e-06,
"loss": 0.4387,
"step": 12810
},
{
"epoch": 0.6264506828898825,
"grad_norm": 0.7711629271507263,
"learning_rate": 3.682716131417887e-06,
"loss": 0.4387,
"step": 12820
},
{
"epoch": 0.6269393339685797,
"grad_norm": 0.4106141924858093,
"learning_rate": 3.6744907632910064e-06,
"loss": 0.4376,
"step": 12830
},
{
"epoch": 0.627427985047277,
"grad_norm": 0.8427706956863403,
"learning_rate": 3.6662692517007613e-06,
"loss": 0.4376,
"step": 12840
},
{
"epoch": 0.6279166361259743,
"grad_norm": 0.4671982526779175,
"learning_rate": 3.6580516205674367e-06,
"loss": 0.4375,
"step": 12850
},
{
"epoch": 0.6284052872046715,
"grad_norm": 0.643839955329895,
"learning_rate": 3.64983789380003e-06,
"loss": 0.4382,
"step": 12860
},
{
"epoch": 0.6288939382833688,
"grad_norm": 0.3143644630908966,
"learning_rate": 3.6416280952961756e-06,
"loss": 0.4378,
"step": 12870
},
{
"epoch": 0.629382589362066,
"grad_norm": 0.5174784064292908,
"learning_rate": 3.6334222489420845e-06,
"loss": 0.4386,
"step": 12880
},
{
"epoch": 0.6298712404407633,
"grad_norm": 0.35816308856010437,
"learning_rate": 3.625220378612465e-06,
"loss": 0.4381,
"step": 12890
},
{
"epoch": 0.6303598915194605,
"grad_norm": 0.4106110632419586,
"learning_rate": 3.617022508170456e-06,
"loss": 0.4372,
"step": 12900
},
{
"epoch": 0.6308485425981578,
"grad_norm": 1.5037391185760498,
"learning_rate": 3.608828661467561e-06,
"loss": 0.4366,
"step": 12910
},
{
"epoch": 0.631337193676855,
"grad_norm": 0.6403370499610901,
"learning_rate": 3.6006388623435778e-06,
"loss": 0.4373,
"step": 12920
},
{
"epoch": 0.6318258447555523,
"grad_norm": 0.4930186867713928,
"learning_rate": 3.5924531346265235e-06,
"loss": 0.4379,
"step": 12930
},
{
"epoch": 0.6323144958342496,
"grad_norm": 0.3067891001701355,
"learning_rate": 3.5842715021325745e-06,
"loss": 0.4368,
"step": 12940
},
{
"epoch": 0.6328031469129468,
"grad_norm": 0.7694682478904724,
"learning_rate": 3.5760939886659896e-06,
"loss": 0.438,
"step": 12950
},
{
"epoch": 0.633291797991644,
"grad_norm": 0.5115815997123718,
"learning_rate": 3.567920618019043e-06,
"loss": 0.4377,
"step": 12960
},
{
"epoch": 0.6337804490703414,
"grad_norm": 0.6964974999427795,
"learning_rate": 3.559751413971955e-06,
"loss": 0.4375,
"step": 12970
},
{
"epoch": 0.6342691001490386,
"grad_norm": 0.5830658078193665,
"learning_rate": 3.551586400292828e-06,
"loss": 0.4381,
"step": 12980
},
{
"epoch": 0.6347577512277358,
"grad_norm": 0.8513720035552979,
"learning_rate": 3.5434256007375666e-06,
"loss": 0.4376,
"step": 12990
},
{
"epoch": 0.6352464023064331,
"grad_norm": 0.4420766234397888,
"learning_rate": 3.535269039049819e-06,
"loss": 0.436,
"step": 13000
},
{
"epoch": 0.6352464023064331,
"eval_loss": 0.41433966159820557,
"eval_runtime": 729.8346,
"eval_samples_per_second": 242.395,
"eval_steps_per_second": 0.474,
"step": 13000
},
{
"epoch": 0.6357350533851304,
"grad_norm": 1.997313380241394,
"learning_rate": 3.5271167389608996e-06,
"loss": 0.4376,
"step": 13010
},
{
"epoch": 0.6362237044638276,
"grad_norm": 0.5262558460235596,
"learning_rate": 3.518968724189727e-06,
"loss": 0.439,
"step": 13020
},
{
"epoch": 0.6367123555425248,
"grad_norm": 0.7942774295806885,
"learning_rate": 3.5108250184427507e-06,
"loss": 0.4368,
"step": 13030
},
{
"epoch": 0.6372010066212221,
"grad_norm": 0.44571954011917114,
"learning_rate": 3.50268564541388e-06,
"loss": 0.4386,
"step": 13040
},
{
"epoch": 0.6376896576999194,
"grad_norm": 0.3043385148048401,
"learning_rate": 3.4945506287844245e-06,
"loss": 0.4377,
"step": 13050
},
{
"epoch": 0.6381783087786166,
"grad_norm": 0.41446077823638916,
"learning_rate": 3.4864199922230156e-06,
"loss": 0.4376,
"step": 13060
},
{
"epoch": 0.6386669598573139,
"grad_norm": 0.361817866563797,
"learning_rate": 3.4782937593855386e-06,
"loss": 0.4368,
"step": 13070
},
{
"epoch": 0.6391556109360111,
"grad_norm": 0.2429763674736023,
"learning_rate": 3.4701719539150692e-06,
"loss": 0.4384,
"step": 13080
},
{
"epoch": 0.6396442620147084,
"grad_norm": 0.6479557156562805,
"learning_rate": 3.4620545994418044e-06,
"loss": 0.4369,
"step": 13090
},
{
"epoch": 0.6401329130934057,
"grad_norm": 0.2731977105140686,
"learning_rate": 3.453941719582985e-06,
"loss": 0.4367,
"step": 13100
},
{
"epoch": 0.6406215641721029,
"grad_norm": 0.3237663209438324,
"learning_rate": 3.445833337942838e-06,
"loss": 0.4389,
"step": 13110
},
{
"epoch": 0.6411102152508001,
"grad_norm": 0.6228281855583191,
"learning_rate": 3.4377294781124997e-06,
"loss": 0.4361,
"step": 13120
},
{
"epoch": 0.6415988663294975,
"grad_norm": 0.7347028255462646,
"learning_rate": 3.4296301636699527e-06,
"loss": 0.4378,
"step": 13130
},
{
"epoch": 0.6420875174081947,
"grad_norm": 0.8885689377784729,
"learning_rate": 3.421535418179953e-06,
"loss": 0.4379,
"step": 13140
},
{
"epoch": 0.6425761684868919,
"grad_norm": 0.7151497602462769,
"learning_rate": 3.413445265193964e-06,
"loss": 0.4373,
"step": 13150
},
{
"epoch": 0.6430648195655891,
"grad_norm": 0.46183907985687256,
"learning_rate": 3.4053597282500882e-06,
"loss": 0.4373,
"step": 13160
},
{
"epoch": 0.6435534706442865,
"grad_norm": 0.7960475087165833,
"learning_rate": 3.397278830872998e-06,
"loss": 0.4358,
"step": 13170
},
{
"epoch": 0.6440421217229837,
"grad_norm": 0.5535709261894226,
"learning_rate": 3.3892025965738616e-06,
"loss": 0.4373,
"step": 13180
},
{
"epoch": 0.6445307728016809,
"grad_norm": 0.8837286829948425,
"learning_rate": 3.3811310488502924e-06,
"loss": 0.4372,
"step": 13190
},
{
"epoch": 0.6450194238803783,
"grad_norm": 0.5701731443405151,
"learning_rate": 3.3730642111862543e-06,
"loss": 0.4381,
"step": 13200
},
{
"epoch": 0.6455080749590755,
"grad_norm": 0.449485182762146,
"learning_rate": 3.365002107052017e-06,
"loss": 0.4381,
"step": 13210
},
{
"epoch": 0.6459967260377727,
"grad_norm": 0.43208661675453186,
"learning_rate": 3.356944759904075e-06,
"loss": 0.4387,
"step": 13220
},
{
"epoch": 0.64648537711647,
"grad_norm": 0.5452390313148499,
"learning_rate": 3.3488921931850794e-06,
"loss": 0.4374,
"step": 13230
},
{
"epoch": 0.6469740281951673,
"grad_norm": 0.37224072217941284,
"learning_rate": 3.3408444303237786e-06,
"loss": 0.4376,
"step": 13240
},
{
"epoch": 0.6474626792738645,
"grad_norm": 0.6517390608787537,
"learning_rate": 3.3328014947349406e-06,
"loss": 0.4377,
"step": 13250
},
{
"epoch": 0.6479513303525618,
"grad_norm": 0.3955247402191162,
"learning_rate": 3.3247634098192884e-06,
"loss": 0.4388,
"step": 13260
},
{
"epoch": 0.648439981431259,
"grad_norm": 0.3447047770023346,
"learning_rate": 3.316730198963433e-06,
"loss": 0.4377,
"step": 13270
},
{
"epoch": 0.6489286325099562,
"grad_norm": 0.8046542406082153,
"learning_rate": 3.3087018855398045e-06,
"loss": 0.4374,
"step": 13280
},
{
"epoch": 0.6494172835886535,
"grad_norm": 0.5053970217704773,
"learning_rate": 3.300678492906586e-06,
"loss": 0.4377,
"step": 13290
},
{
"epoch": 0.6499059346673508,
"grad_norm": 1.129328727722168,
"learning_rate": 3.292660044407642e-06,
"loss": 0.4373,
"step": 13300
},
{
"epoch": 0.650394585746048,
"grad_norm": 1.0235140323638916,
"learning_rate": 3.2846465633724487e-06,
"loss": 0.438,
"step": 13310
},
{
"epoch": 0.6508832368247452,
"grad_norm": 1.2467355728149414,
"learning_rate": 3.2766380731160342e-06,
"loss": 0.4376,
"step": 13320
},
{
"epoch": 0.6513718879034426,
"grad_norm": 0.42103078961372375,
"learning_rate": 3.268634596938906e-06,
"loss": 0.4369,
"step": 13330
},
{
"epoch": 0.6518605389821398,
"grad_norm": 0.6862124800682068,
"learning_rate": 3.26063615812698e-06,
"loss": 0.4384,
"step": 13340
},
{
"epoch": 0.652349190060837,
"grad_norm": 0.4259004294872284,
"learning_rate": 3.252642779951518e-06,
"loss": 0.4385,
"step": 13350
},
{
"epoch": 0.6528378411395344,
"grad_norm": 0.3901737630367279,
"learning_rate": 3.2446544856690595e-06,
"loss": 0.4375,
"step": 13360
},
{
"epoch": 0.6533264922182316,
"grad_norm": 0.5543071627616882,
"learning_rate": 3.236671298521349e-06,
"loss": 0.4373,
"step": 13370
},
{
"epoch": 0.6538151432969288,
"grad_norm": 0.4534682333469391,
"learning_rate": 3.228693241735274e-06,
"loss": 0.4379,
"step": 13380
},
{
"epoch": 0.6543037943756261,
"grad_norm": 0.25920426845550537,
"learning_rate": 3.220720338522795e-06,
"loss": 0.4371,
"step": 13390
},
{
"epoch": 0.6547924454543234,
"grad_norm": 0.3495095372200012,
"learning_rate": 3.2127526120808807e-06,
"loss": 0.4386,
"step": 13400
},
{
"epoch": 0.6552810965330206,
"grad_norm": 0.5815818309783936,
"learning_rate": 3.204790085591435e-06,
"loss": 0.4386,
"step": 13410
},
{
"epoch": 0.6557697476117178,
"grad_norm": 0.48536261916160583,
"learning_rate": 3.1968327822212325e-06,
"loss": 0.4376,
"step": 13420
},
{
"epoch": 0.6562583986904151,
"grad_norm": 0.46819832921028137,
"learning_rate": 3.1888807251218524e-06,
"loss": 0.4364,
"step": 13430
},
{
"epoch": 0.6567470497691124,
"grad_norm": 0.6933978199958801,
"learning_rate": 3.180933937429612e-06,
"loss": 0.4366,
"step": 13440
},
{
"epoch": 0.6572357008478096,
"grad_norm": 0.6232645511627197,
"learning_rate": 3.1729924422654917e-06,
"loss": 0.4372,
"step": 13450
},
{
"epoch": 0.6577243519265069,
"grad_norm": 0.6195946335792542,
"learning_rate": 3.1650562627350797e-06,
"loss": 0.4379,
"step": 13460
},
{
"epoch": 0.6582130030052041,
"grad_norm": 0.47936639189720154,
"learning_rate": 3.157125421928496e-06,
"loss": 0.4375,
"step": 13470
},
{
"epoch": 0.6587016540839014,
"grad_norm": 0.7483202219009399,
"learning_rate": 3.1491999429203253e-06,
"loss": 0.4375,
"step": 13480
},
{
"epoch": 0.6591903051625987,
"grad_norm": 0.6134311556816101,
"learning_rate": 3.141279848769555e-06,
"loss": 0.4373,
"step": 13490
},
{
"epoch": 0.6596789562412959,
"grad_norm": 0.623481810092926,
"learning_rate": 3.1333651625195065e-06,
"loss": 0.4377,
"step": 13500
},
{
"epoch": 0.6596789562412959,
"eval_loss": 0.41565391421318054,
"eval_runtime": 728.9931,
"eval_samples_per_second": 242.674,
"eval_steps_per_second": 0.475,
"step": 13500
},
{
"epoch": 0.6601676073199931,
"grad_norm": 0.6795092225074768,
"learning_rate": 3.125455907197765e-06,
"loss": 0.4376,
"step": 13510
},
{
"epoch": 0.6606562583986905,
"grad_norm": 0.766830325126648,
"learning_rate": 3.117552105816116e-06,
"loss": 0.4361,
"step": 13520
},
{
"epoch": 0.6611449094773877,
"grad_norm": 1.3391401767730713,
"learning_rate": 3.109653781370473e-06,
"loss": 0.4367,
"step": 13530
},
{
"epoch": 0.6616335605560849,
"grad_norm": 0.4134541451931,
"learning_rate": 3.101760956840819e-06,
"loss": 0.4379,
"step": 13540
},
{
"epoch": 0.6621222116347821,
"grad_norm": 0.8742764592170715,
"learning_rate": 3.093873655191135e-06,
"loss": 0.4365,
"step": 13550
},
{
"epoch": 0.6626108627134795,
"grad_norm": 0.5794457197189331,
"learning_rate": 3.0859918993693294e-06,
"loss": 0.4375,
"step": 13560
},
{
"epoch": 0.6630995137921767,
"grad_norm": 0.5920469760894775,
"learning_rate": 3.0781157123071782e-06,
"loss": 0.4366,
"step": 13570
},
{
"epoch": 0.6635881648708739,
"grad_norm": 0.8090350031852722,
"learning_rate": 3.070245116920255e-06,
"loss": 0.4375,
"step": 13580
},
{
"epoch": 0.6640768159495712,
"grad_norm": 0.6335200071334839,
"learning_rate": 3.062380136107863e-06,
"loss": 0.4372,
"step": 13590
},
{
"epoch": 0.6645654670282685,
"grad_norm": 0.5986902117729187,
"learning_rate": 3.054520792752973e-06,
"loss": 0.4369,
"step": 13600
},
{
"epoch": 0.6650541181069657,
"grad_norm": 0.5582621693611145,
"learning_rate": 3.0466671097221506e-06,
"loss": 0.4373,
"step": 13610
},
{
"epoch": 0.665542769185663,
"grad_norm": 0.40097367763519287,
"learning_rate": 3.038819109865495e-06,
"loss": 0.437,
"step": 13620
},
{
"epoch": 0.6660314202643602,
"grad_norm": 0.3417106866836548,
"learning_rate": 3.0309768160165697e-06,
"loss": 0.4367,
"step": 13630
},
{
"epoch": 0.6665200713430575,
"grad_norm": 0.8929291367530823,
"learning_rate": 3.0231402509923347e-06,
"loss": 0.4371,
"step": 13640
},
{
"epoch": 0.6670087224217548,
"grad_norm": 0.5947937965393066,
"learning_rate": 3.015309437593084e-06,
"loss": 0.4381,
"step": 13650
},
{
"epoch": 0.667497373500452,
"grad_norm": 0.42644399404525757,
"learning_rate": 3.00748439860238e-06,
"loss": 0.4368,
"step": 13660
},
{
"epoch": 0.6679860245791492,
"grad_norm": 0.378689706325531,
"learning_rate": 2.9996651567869784e-06,
"loss": 0.4358,
"step": 13670
},
{
"epoch": 0.6684746756578465,
"grad_norm": 0.5530552268028259,
"learning_rate": 2.9918517348967734e-06,
"loss": 0.4376,
"step": 13680
},
{
"epoch": 0.6689633267365438,
"grad_norm": 0.5646296739578247,
"learning_rate": 2.9840441556647247e-06,
"loss": 0.4371,
"step": 13690
},
{
"epoch": 0.669451977815241,
"grad_norm": 0.7569136619567871,
"learning_rate": 2.9762424418067905e-06,
"loss": 0.4373,
"step": 13700
},
{
"epoch": 0.6699406288939382,
"grad_norm": 1.0428658723831177,
"learning_rate": 2.968446616021868e-06,
"loss": 0.4379,
"step": 13710
},
{
"epoch": 0.6704292799726356,
"grad_norm": 0.7594118714332581,
"learning_rate": 2.9606567009917218e-06,
"loss": 0.4375,
"step": 13720
},
{
"epoch": 0.6709179310513328,
"grad_norm": 2.9701902866363525,
"learning_rate": 2.952872719380917e-06,
"loss": 0.4379,
"step": 13730
},
{
"epoch": 0.67140658213003,
"grad_norm": 0.9236831665039062,
"learning_rate": 2.94509469383676e-06,
"loss": 0.4371,
"step": 13740
},
{
"epoch": 0.6718952332087273,
"grad_norm": 1.1951662302017212,
"learning_rate": 2.9373226469892223e-06,
"loss": 0.4377,
"step": 13750
},
{
"epoch": 0.6723838842874246,
"grad_norm": 0.3135634958744049,
"learning_rate": 2.9295566014508853e-06,
"loss": 0.4369,
"step": 13760
},
{
"epoch": 0.6728725353661218,
"grad_norm": 0.4447099566459656,
"learning_rate": 2.9217965798168685e-06,
"loss": 0.4376,
"step": 13770
},
{
"epoch": 0.6733611864448191,
"grad_norm": 0.3684927821159363,
"learning_rate": 2.914042604664764e-06,
"loss": 0.4373,
"step": 13780
},
{
"epoch": 0.6738498375235163,
"grad_norm": 0.3362119495868683,
"learning_rate": 2.9062946985545707e-06,
"loss": 0.4371,
"step": 13790
},
{
"epoch": 0.6743384886022136,
"grad_norm": 0.6925981640815735,
"learning_rate": 2.898552884028634e-06,
"loss": 0.4371,
"step": 13800
},
{
"epoch": 0.6748271396809108,
"grad_norm": 0.49009522795677185,
"learning_rate": 2.8908171836115736e-06,
"loss": 0.4382,
"step": 13810
},
{
"epoch": 0.6753157907596081,
"grad_norm": 0.49105721712112427,
"learning_rate": 2.8830876198102176e-06,
"loss": 0.4369,
"step": 13820
},
{
"epoch": 0.6758044418383053,
"grad_norm": 0.5330390930175781,
"learning_rate": 2.875364215113547e-06,
"loss": 0.4365,
"step": 13830
},
{
"epoch": 0.6762930929170026,
"grad_norm": 0.43516021966934204,
"learning_rate": 2.8676469919926152e-06,
"loss": 0.437,
"step": 13840
},
{
"epoch": 0.6767817439956999,
"grad_norm": 0.4716795086860657,
"learning_rate": 2.859935972900492e-06,
"loss": 0.4361,
"step": 13850
},
{
"epoch": 0.6772703950743971,
"grad_norm": 0.38898736238479614,
"learning_rate": 2.8522311802722038e-06,
"loss": 0.4369,
"step": 13860
},
{
"epoch": 0.6777590461530943,
"grad_norm": 0.34487384557724,
"learning_rate": 2.8445326365246516e-06,
"loss": 0.4381,
"step": 13870
},
{
"epoch": 0.6782476972317917,
"grad_norm": 0.30314865708351135,
"learning_rate": 2.836840364056559e-06,
"loss": 0.4371,
"step": 13880
},
{
"epoch": 0.6787363483104889,
"grad_norm": 0.5969054102897644,
"learning_rate": 2.829154385248409e-06,
"loss": 0.4367,
"step": 13890
},
{
"epoch": 0.6792249993891861,
"grad_norm": 0.32903793454170227,
"learning_rate": 2.8214747224623627e-06,
"loss": 0.4357,
"step": 13900
},
{
"epoch": 0.6797136504678835,
"grad_norm": 0.3053576648235321,
"learning_rate": 2.8138013980422164e-06,
"loss": 0.4365,
"step": 13910
},
{
"epoch": 0.6802023015465807,
"grad_norm": 0.43716076016426086,
"learning_rate": 2.8061344343133144e-06,
"loss": 0.4364,
"step": 13920
},
{
"epoch": 0.6806909526252779,
"grad_norm": 0.4108024537563324,
"learning_rate": 2.7984738535825044e-06,
"loss": 0.4379,
"step": 13930
},
{
"epoch": 0.6811796037039751,
"grad_norm": 0.45622798800468445,
"learning_rate": 2.790819678138056e-06,
"loss": 0.4368,
"step": 13940
},
{
"epoch": 0.6816682547826725,
"grad_norm": 0.416840523481369,
"learning_rate": 2.783171930249603e-06,
"loss": 0.4374,
"step": 13950
},
{
"epoch": 0.6821569058613697,
"grad_norm": 0.5477868914604187,
"learning_rate": 2.775530632168084e-06,
"loss": 0.4372,
"step": 13960
},
{
"epoch": 0.6826455569400669,
"grad_norm": 0.3617335259914398,
"learning_rate": 2.7678958061256667e-06,
"loss": 0.4363,
"step": 13970
},
{
"epoch": 0.6831342080187642,
"grad_norm": 0.45384445786476135,
"learning_rate": 2.7602674743356893e-06,
"loss": 0.4349,
"step": 13980
},
{
"epoch": 0.6836228590974615,
"grad_norm": 0.3232516944408417,
"learning_rate": 2.752645658992599e-06,
"loss": 0.4369,
"step": 13990
},
{
"epoch": 0.6841115101761587,
"grad_norm": 0.4313335418701172,
"learning_rate": 2.745030382271879e-06,
"loss": 0.4378,
"step": 14000
},
{
"epoch": 0.6841115101761587,
"eval_loss": 0.41417059302330017,
"eval_runtime": 727.8695,
"eval_samples_per_second": 243.049,
"eval_steps_per_second": 0.475,
"step": 14000
},
{
"epoch": 0.684600161254856,
"grad_norm": 0.5871222615242004,
"learning_rate": 2.737421666329987e-06,
"loss": 0.4373,
"step": 14010
},
{
"epoch": 0.6850888123335532,
"grad_norm": 0.4553307294845581,
"learning_rate": 2.7298195333043022e-06,
"loss": 0.4372,
"step": 14020
},
{
"epoch": 0.6855774634122505,
"grad_norm": 0.49893367290496826,
"learning_rate": 2.722224005313041e-06,
"loss": 0.4366,
"step": 14030
},
{
"epoch": 0.6860661144909478,
"grad_norm": 0.401821494102478,
"learning_rate": 2.7146351044552045e-06,
"loss": 0.4372,
"step": 14040
},
{
"epoch": 0.686554765569645,
"grad_norm": 0.2880902588367462,
"learning_rate": 2.7070528528105165e-06,
"loss": 0.4366,
"step": 14050
},
{
"epoch": 0.6870434166483422,
"grad_norm": 0.4244653880596161,
"learning_rate": 2.6994772724393516e-06,
"loss": 0.4368,
"step": 14060
},
{
"epoch": 0.6875320677270395,
"grad_norm": 0.4931180775165558,
"learning_rate": 2.6919083853826724e-06,
"loss": 0.4371,
"step": 14070
},
{
"epoch": 0.6880207188057368,
"grad_norm": 0.5409946441650391,
"learning_rate": 2.684346213661974e-06,
"loss": 0.4363,
"step": 14080
},
{
"epoch": 0.688509369884434,
"grad_norm": 0.4695432484149933,
"learning_rate": 2.676790779279209e-06,
"loss": 0.4369,
"step": 14090
},
{
"epoch": 0.6889980209631312,
"grad_norm": 4.034379005432129,
"learning_rate": 2.669242104216725e-06,
"loss": 0.4363,
"step": 14100
},
{
"epoch": 0.6894866720418286,
"grad_norm": 0.6742619872093201,
"learning_rate": 2.6617002104372096e-06,
"loss": 0.4373,
"step": 14110
},
{
"epoch": 0.6899753231205258,
"grad_norm": 0.6923062801361084,
"learning_rate": 2.6541651198836207e-06,
"loss": 0.4365,
"step": 14120
},
{
"epoch": 0.690463974199223,
"grad_norm": 0.6054366230964661,
"learning_rate": 2.6466368544791164e-06,
"loss": 0.4364,
"step": 14130
},
{
"epoch": 0.6909526252779203,
"grad_norm": 0.809479296207428,
"learning_rate": 2.639115436126999e-06,
"loss": 0.4358,
"step": 14140
},
{
"epoch": 0.6914412763566176,
"grad_norm": 0.458893358707428,
"learning_rate": 2.6316008867106547e-06,
"loss": 0.4365,
"step": 14150
},
{
"epoch": 0.6919299274353148,
"grad_norm": 1.5249381065368652,
"learning_rate": 2.6240932280934794e-06,
"loss": 0.4353,
"step": 14160
},
{
"epoch": 0.6924185785140121,
"grad_norm": 0.435376912355423,
"learning_rate": 2.616592482118818e-06,
"loss": 0.4358,
"step": 14170
},
{
"epoch": 0.6929072295927093,
"grad_norm": 0.5011893510818481,
"learning_rate": 2.6090986706099135e-06,
"loss": 0.4361,
"step": 14180
},
{
"epoch": 0.6933958806714066,
"grad_norm": 0.42486095428466797,
"learning_rate": 2.6016118153698235e-06,
"loss": 0.4374,
"step": 14190
},
{
"epoch": 0.6938845317501038,
"grad_norm": 0.29725852608680725,
"learning_rate": 2.594131938181368e-06,
"loss": 0.4367,
"step": 14200
},
{
"epoch": 0.6943731828288011,
"grad_norm": 1.0349030494689941,
"learning_rate": 2.586659060807068e-06,
"loss": 0.4382,
"step": 14210
},
{
"epoch": 0.6948618339074983,
"grad_norm": 0.3708353340625763,
"learning_rate": 2.579193204989079e-06,
"loss": 0.4373,
"step": 14220
},
{
"epoch": 0.6953504849861956,
"grad_norm": 0.4205668568611145,
"learning_rate": 2.5717343924491224e-06,
"loss": 0.4362,
"step": 14230
},
{
"epoch": 0.6958391360648929,
"grad_norm": 0.6266738772392273,
"learning_rate": 2.564282644888434e-06,
"loss": 0.4372,
"step": 14240
},
{
"epoch": 0.6963277871435901,
"grad_norm": 0.43474340438842773,
"learning_rate": 2.5568379839876883e-06,
"loss": 0.4359,
"step": 14250
},
{
"epoch": 0.6968164382222873,
"grad_norm": 0.7086150646209717,
"learning_rate": 2.5494004314069422e-06,
"loss": 0.4357,
"step": 14260
},
{
"epoch": 0.6973050893009847,
"grad_norm": 0.6918942332267761,
"learning_rate": 2.5419700087855765e-06,
"loss": 0.4358,
"step": 14270
},
{
"epoch": 0.6977937403796819,
"grad_norm": 0.7701777219772339,
"learning_rate": 2.5345467377422216e-06,
"loss": 0.4369,
"step": 14280
},
{
"epoch": 0.6982823914583791,
"grad_norm": 0.40936869382858276,
"learning_rate": 2.527130639874701e-06,
"loss": 0.4364,
"step": 14290
},
{
"epoch": 0.6987710425370764,
"grad_norm": 0.432035356760025,
"learning_rate": 2.5197217367599726e-06,
"loss": 0.4366,
"step": 14300
},
{
"epoch": 0.6992596936157737,
"grad_norm": 0.41449683904647827,
"learning_rate": 2.512320049954056e-06,
"loss": 0.4359,
"step": 14310
},
{
"epoch": 0.6997483446944709,
"grad_norm": 0.49594905972480774,
"learning_rate": 2.50492560099198e-06,
"loss": 0.4364,
"step": 14320
},
{
"epoch": 0.7002369957731681,
"grad_norm": 0.38190391659736633,
"learning_rate": 2.4975384113877093e-06,
"loss": 0.4362,
"step": 14330
},
{
"epoch": 0.7007256468518654,
"grad_norm": 0.8239844441413879,
"learning_rate": 2.490158502634095e-06,
"loss": 0.4361,
"step": 14340
},
{
"epoch": 0.7012142979305627,
"grad_norm": 0.5367412567138672,
"learning_rate": 2.4827858962027994e-06,
"loss": 0.4355,
"step": 14350
},
{
"epoch": 0.7017029490092599,
"grad_norm": 0.4261118173599243,
"learning_rate": 2.475420613544237e-06,
"loss": 0.4357,
"step": 14360
},
{
"epoch": 0.7021916000879572,
"grad_norm": 0.7066627144813538,
"learning_rate": 2.468062676087522e-06,
"loss": 0.4379,
"step": 14370
},
{
"epoch": 0.7026802511666544,
"grad_norm": 0.7751229405403137,
"learning_rate": 2.4607121052403903e-06,
"loss": 0.4358,
"step": 14380
},
{
"epoch": 0.7031689022453517,
"grad_norm": 0.3944869041442871,
"learning_rate": 2.4533689223891466e-06,
"loss": 0.4371,
"step": 14390
},
{
"epoch": 0.703657553324049,
"grad_norm": 0.5122698545455933,
"learning_rate": 2.446033148898605e-06,
"loss": 0.4361,
"step": 14400
},
{
"epoch": 0.7041462044027462,
"grad_norm": 0.4192598760128021,
"learning_rate": 2.438704806112016e-06,
"loss": 0.4361,
"step": 14410
},
{
"epoch": 0.7046348554814434,
"grad_norm": 0.4704056680202484,
"learning_rate": 2.4313839153510112e-06,
"loss": 0.4359,
"step": 14420
},
{
"epoch": 0.7051235065601408,
"grad_norm": 0.3789515197277069,
"learning_rate": 2.4240704979155484e-06,
"loss": 0.436,
"step": 14430
},
{
"epoch": 0.705612157638838,
"grad_norm": 0.48638996481895447,
"learning_rate": 2.4167645750838336e-06,
"loss": 0.4366,
"step": 14440
},
{
"epoch": 0.7061008087175352,
"grad_norm": 0.3896729052066803,
"learning_rate": 2.4094661681122684e-06,
"loss": 0.4372,
"step": 14450
},
{
"epoch": 0.7065894597962326,
"grad_norm": 0.5547624826431274,
"learning_rate": 2.4021752982353918e-06,
"loss": 0.435,
"step": 14460
},
{
"epoch": 0.7070781108749298,
"grad_norm": 0.4325717091560364,
"learning_rate": 2.394891986665811e-06,
"loss": 0.4353,
"step": 14470
},
{
"epoch": 0.707566761953627,
"grad_norm": 0.46477776765823364,
"learning_rate": 2.387616254594139e-06,
"loss": 0.4372,
"step": 14480
},
{
"epoch": 0.7080554130323242,
"grad_norm": 0.39680078625679016,
"learning_rate": 2.3803481231889443e-06,
"loss": 0.4359,
"step": 14490
},
{
"epoch": 0.7085440641110216,
"grad_norm": 0.34461086988449097,
"learning_rate": 2.3730876135966746e-06,
"loss": 0.4377,
"step": 14500
},
{
"epoch": 0.7085440641110216,
"eval_loss": 0.4154199957847595,
"eval_runtime": 729.053,
"eval_samples_per_second": 242.654,
"eval_steps_per_second": 0.475,
"step": 14500
},
{
"epoch": 0.7090327151897188,
"grad_norm": 0.4224153757095337,
"learning_rate": 2.3658347469416037e-06,
"loss": 0.4366,
"step": 14510
},
{
"epoch": 0.709521366268416,
"grad_norm": 0.32037585973739624,
"learning_rate": 2.3585895443257705e-06,
"loss": 0.4364,
"step": 14520
},
{
"epoch": 0.7100100173471133,
"grad_norm": 0.6405905485153198,
"learning_rate": 2.351352026828917e-06,
"loss": 0.4359,
"step": 14530
},
{
"epoch": 0.7104986684258106,
"grad_norm": 0.4093703627586365,
"learning_rate": 2.3441222155084196e-06,
"loss": 0.4372,
"step": 14540
},
{
"epoch": 0.7109873195045078,
"grad_norm": 0.31071528792381287,
"learning_rate": 2.3369001313992373e-06,
"loss": 0.4367,
"step": 14550
},
{
"epoch": 0.7114759705832051,
"grad_norm": 0.502044141292572,
"learning_rate": 2.3296857955138493e-06,
"loss": 0.4365,
"step": 14560
},
{
"epoch": 0.7119646216619023,
"grad_norm": 0.5427960753440857,
"learning_rate": 2.3224792288421873e-06,
"loss": 0.4372,
"step": 14570
},
{
"epoch": 0.7124532727405996,
"grad_norm": 0.6338086128234863,
"learning_rate": 2.3152804523515787e-06,
"loss": 0.4358,
"step": 14580
},
{
"epoch": 0.7129419238192969,
"grad_norm": 0.36875244975090027,
"learning_rate": 2.3080894869866906e-06,
"loss": 0.436,
"step": 14590
},
{
"epoch": 0.7134305748979941,
"grad_norm": 0.39585214853286743,
"learning_rate": 2.3009063536694588e-06,
"loss": 0.4334,
"step": 14600
},
{
"epoch": 0.7139192259766913,
"grad_norm": 0.4556538164615631,
"learning_rate": 2.293731073299032e-06,
"loss": 0.4367,
"step": 14610
},
{
"epoch": 0.7144078770553886,
"grad_norm": 0.4585372507572174,
"learning_rate": 2.286563666751714e-06,
"loss": 0.4344,
"step": 14620
},
{
"epoch": 0.7148965281340859,
"grad_norm": 0.3792722821235657,
"learning_rate": 2.2794041548809013e-06,
"loss": 0.4372,
"step": 14630
},
{
"epoch": 0.7153851792127831,
"grad_norm": 0.5071465969085693,
"learning_rate": 2.2722525585170136e-06,
"loss": 0.437,
"step": 14640
},
{
"epoch": 0.7158738302914803,
"grad_norm": 0.47391828894615173,
"learning_rate": 2.265108898467449e-06,
"loss": 0.4361,
"step": 14650
},
{
"epoch": 0.7163624813701777,
"grad_norm": 0.450090229511261,
"learning_rate": 2.2579731955165098e-06,
"loss": 0.435,
"step": 14660
},
{
"epoch": 0.7168511324488749,
"grad_norm": 0.4352344870567322,
"learning_rate": 2.250845470425346e-06,
"loss": 0.4358,
"step": 14670
},
{
"epoch": 0.7173397835275721,
"grad_norm": 1.0980722904205322,
"learning_rate": 2.2437257439319045e-06,
"loss": 0.4349,
"step": 14680
},
{
"epoch": 0.7178284346062694,
"grad_norm": 0.7365118265151978,
"learning_rate": 2.2366140367508515e-06,
"loss": 0.436,
"step": 14690
},
{
"epoch": 0.7183170856849667,
"grad_norm": 0.3632850646972656,
"learning_rate": 2.2295103695735237e-06,
"loss": 0.437,
"step": 14700
},
{
"epoch": 0.7188057367636639,
"grad_norm": 0.4772653877735138,
"learning_rate": 2.2224147630678698e-06,
"loss": 0.434,
"step": 14710
},
{
"epoch": 0.7192943878423612,
"grad_norm": 0.533318042755127,
"learning_rate": 2.2153272378783823e-06,
"loss": 0.4348,
"step": 14720
},
{
"epoch": 0.7197830389210584,
"grad_norm": 0.649156928062439,
"learning_rate": 2.2082478146260394e-06,
"loss": 0.4354,
"step": 14730
},
{
"epoch": 0.7202716899997557,
"grad_norm": 0.5530617833137512,
"learning_rate": 2.2011765139082514e-06,
"loss": 0.436,
"step": 14740
},
{
"epoch": 0.7207603410784529,
"grad_norm": 0.48404207825660706,
"learning_rate": 2.194113356298796e-06,
"loss": 0.4359,
"step": 14750
},
{
"epoch": 0.7212489921571502,
"grad_norm": 0.6402378082275391,
"learning_rate": 2.1870583623477554e-06,
"loss": 0.4366,
"step": 14760
},
{
"epoch": 0.7217376432358474,
"grad_norm": 0.4514593183994293,
"learning_rate": 2.1800115525814604e-06,
"loss": 0.4347,
"step": 14770
},
{
"epoch": 0.7222262943145447,
"grad_norm": 0.4350273013114929,
"learning_rate": 2.1729729475024337e-06,
"loss": 0.437,
"step": 14780
},
{
"epoch": 0.722714945393242,
"grad_norm": 0.7733496427536011,
"learning_rate": 2.165942567589324e-06,
"loss": 0.4362,
"step": 14790
},
{
"epoch": 0.7232035964719392,
"grad_norm": 0.3570731282234192,
"learning_rate": 2.158920433296846e-06,
"loss": 0.435,
"step": 14800
},
{
"epoch": 0.7236922475506364,
"grad_norm": 0.45792272686958313,
"learning_rate": 2.151906565055732e-06,
"loss": 0.4359,
"step": 14810
},
{
"epoch": 0.7241808986293338,
"grad_norm": 0.3383428454399109,
"learning_rate": 2.1449009832726576e-06,
"loss": 0.4367,
"step": 14820
},
{
"epoch": 0.724669549708031,
"grad_norm": 0.4315878450870514,
"learning_rate": 2.137903708330188e-06,
"loss": 0.4359,
"step": 14830
},
{
"epoch": 0.7251582007867282,
"grad_norm": 0.5013752579689026,
"learning_rate": 2.130914760586729e-06,
"loss": 0.4346,
"step": 14840
},
{
"epoch": 0.7256468518654255,
"grad_norm": 0.5946633815765381,
"learning_rate": 2.1239341603764506e-06,
"loss": 0.4355,
"step": 14850
},
{
"epoch": 0.7261355029441228,
"grad_norm": 1.4556235074996948,
"learning_rate": 2.1169619280092362e-06,
"loss": 0.4352,
"step": 14860
},
{
"epoch": 0.72662415402282,
"grad_norm": 0.49753642082214355,
"learning_rate": 2.109998083770628e-06,
"loss": 0.4369,
"step": 14870
},
{
"epoch": 0.7271128051015172,
"grad_norm": 0.3729608654975891,
"learning_rate": 2.103042647921758e-06,
"loss": 0.4356,
"step": 14880
},
{
"epoch": 0.7276014561802145,
"grad_norm": 0.39122653007507324,
"learning_rate": 2.096095640699295e-06,
"loss": 0.4368,
"step": 14890
},
{
"epoch": 0.7280901072589118,
"grad_norm": 0.42691490054130554,
"learning_rate": 2.08915708231539e-06,
"loss": 0.4357,
"step": 14900
},
{
"epoch": 0.728578758337609,
"grad_norm": 0.38435041904449463,
"learning_rate": 2.0822269929576066e-06,
"loss": 0.4363,
"step": 14910
},
{
"epoch": 0.7290674094163063,
"grad_norm": 0.8433852195739746,
"learning_rate": 2.075305392788868e-06,
"loss": 0.4366,
"step": 14920
},
{
"epoch": 0.7295560604950035,
"grad_norm": 0.5046951174736023,
"learning_rate": 2.0683923019474016e-06,
"loss": 0.4358,
"step": 14930
},
{
"epoch": 0.7300447115737008,
"grad_norm": 0.9538094401359558,
"learning_rate": 2.061487740546679e-06,
"loss": 0.4358,
"step": 14940
},
{
"epoch": 0.7305333626523981,
"grad_norm": 0.542107343673706,
"learning_rate": 2.0545917286753494e-06,
"loss": 0.437,
"step": 14950
},
{
"epoch": 0.7310220137310953,
"grad_norm": 0.2896505296230316,
"learning_rate": 2.047704286397188e-06,
"loss": 0.4355,
"step": 14960
},
{
"epoch": 0.7315106648097925,
"grad_norm": 0.43803542852401733,
"learning_rate": 2.040825433751044e-06,
"loss": 0.4363,
"step": 14970
},
{
"epoch": 0.7319993158884899,
"grad_norm": 0.5424397587776184,
"learning_rate": 2.0339551907507687e-06,
"loss": 0.4366,
"step": 14980
},
{
"epoch": 0.7324879669671871,
"grad_norm": 0.5848090648651123,
"learning_rate": 2.027093577385163e-06,
"loss": 0.4349,
"step": 14990
},
{
"epoch": 0.7329766180458843,
"grad_norm": 0.3782629072666168,
"learning_rate": 2.0202406136179275e-06,
"loss": 0.4372,
"step": 15000
},
{
"epoch": 0.7329766180458843,
"eval_loss": 0.4146045744419098,
"eval_runtime": 728.9472,
"eval_samples_per_second": 242.69,
"eval_steps_per_second": 0.475,
"step": 15000
},
{
"epoch": 0.7334652691245815,
"grad_norm": 0.27179810404777527,
"learning_rate": 2.01339631938759e-06,
"loss": 0.4349,
"step": 15010
},
{
"epoch": 0.7339539202032789,
"grad_norm": 0.6157824397087097,
"learning_rate": 2.006560714607455e-06,
"loss": 0.436,
"step": 15020
},
{
"epoch": 0.7344425712819761,
"grad_norm": 0.38568001985549927,
"learning_rate": 1.99973381916555e-06,
"loss": 0.4353,
"step": 15030
},
{
"epoch": 0.7349312223606733,
"grad_norm": 0.3673468232154846,
"learning_rate": 1.992915652924558e-06,
"loss": 0.4365,
"step": 15040
},
{
"epoch": 0.7354198734393707,
"grad_norm": 0.4711572229862213,
"learning_rate": 1.986106235721769e-06,
"loss": 0.4348,
"step": 15050
},
{
"epoch": 0.7359085245180679,
"grad_norm": 0.30081677436828613,
"learning_rate": 1.9793055873690115e-06,
"loss": 0.4361,
"step": 15060
},
{
"epoch": 0.7363971755967651,
"grad_norm": 0.49421292543411255,
"learning_rate": 1.9725137276526098e-06,
"loss": 0.436,
"step": 15070
},
{
"epoch": 0.7368858266754624,
"grad_norm": 0.4806350767612457,
"learning_rate": 1.965730676333309e-06,
"loss": 0.4352,
"step": 15080
},
{
"epoch": 0.7373744777541597,
"grad_norm": 0.7303268909454346,
"learning_rate": 1.9589564531462344e-06,
"loss": 0.4351,
"step": 15090
},
{
"epoch": 0.7378631288328569,
"grad_norm": 0.3639063537120819,
"learning_rate": 1.952191077800821e-06,
"loss": 0.4361,
"step": 15100
},
{
"epoch": 0.7383517799115542,
"grad_norm": 0.3184981048107147,
"learning_rate": 1.94543456998076e-06,
"loss": 0.4361,
"step": 15110
},
{
"epoch": 0.7388404309902514,
"grad_norm": 0.4460330605506897,
"learning_rate": 1.9386869493439485e-06,
"loss": 0.4367,
"step": 15120
},
{
"epoch": 0.7393290820689487,
"grad_norm": 0.2961271107196808,
"learning_rate": 1.9319482355224235e-06,
"loss": 0.435,
"step": 15130
},
{
"epoch": 0.7398177331476459,
"grad_norm": 0.4846443235874176,
"learning_rate": 1.9252184481223033e-06,
"loss": 0.4354,
"step": 15140
},
{
"epoch": 0.7403063842263432,
"grad_norm": 0.35571032762527466,
"learning_rate": 1.918497606723744e-06,
"loss": 0.436,
"step": 15150
},
{
"epoch": 0.7407950353050404,
"grad_norm": 0.5735732913017273,
"learning_rate": 1.9117857308808687e-06,
"loss": 0.4358,
"step": 15160
},
{
"epoch": 0.7412836863837376,
"grad_norm": 0.5794824361801147,
"learning_rate": 1.9050828401217142e-06,
"loss": 0.436,
"step": 15170
},
{
"epoch": 0.741772337462435,
"grad_norm": 0.25915631651878357,
"learning_rate": 1.8983889539481754e-06,
"loss": 0.4357,
"step": 15180
},
{
"epoch": 0.7422609885411322,
"grad_norm": 0.582955002784729,
"learning_rate": 1.891704091835953e-06,
"loss": 0.4368,
"step": 15190
},
{
"epoch": 0.7427496396198294,
"grad_norm": 0.42489010095596313,
"learning_rate": 1.8850282732344887e-06,
"loss": 0.4354,
"step": 15200
},
{
"epoch": 0.7432382906985268,
"grad_norm": 0.31416329741477966,
"learning_rate": 1.8783615175669106e-06,
"loss": 0.4354,
"step": 15210
},
{
"epoch": 0.743726941777224,
"grad_norm": 4.887961387634277,
"learning_rate": 1.871703844229985e-06,
"loss": 0.4385,
"step": 15220
},
{
"epoch": 0.7442155928559212,
"grad_norm": 1.1010756492614746,
"learning_rate": 1.8650552725940468e-06,
"loss": 0.4357,
"step": 15230
},
{
"epoch": 0.7447042439346185,
"grad_norm": 0.46031710505485535,
"learning_rate": 1.8584158220029514e-06,
"loss": 0.4363,
"step": 15240
},
{
"epoch": 0.7451928950133158,
"grad_norm": 0.8031518459320068,
"learning_rate": 1.851785511774018e-06,
"loss": 0.4355,
"step": 15250
},
{
"epoch": 0.745681546092013,
"grad_norm": 0.33631330728530884,
"learning_rate": 1.8451643611979746e-06,
"loss": 0.4358,
"step": 15260
},
{
"epoch": 0.7461701971707102,
"grad_norm": 0.3979465365409851,
"learning_rate": 1.838552389538894e-06,
"loss": 0.4353,
"step": 15270
},
{
"epoch": 0.7466588482494075,
"grad_norm": 0.5838291049003601,
"learning_rate": 1.831949616034145e-06,
"loss": 0.4369,
"step": 15280
},
{
"epoch": 0.7471474993281048,
"grad_norm": 0.34562933444976807,
"learning_rate": 1.8253560598943377e-06,
"loss": 0.4373,
"step": 15290
},
{
"epoch": 0.747636150406802,
"grad_norm": 0.42259690165519714,
"learning_rate": 1.81877174030326e-06,
"loss": 0.436,
"step": 15300
},
{
"epoch": 0.7481248014854993,
"grad_norm": 0.47943800687789917,
"learning_rate": 1.8121966764178278e-06,
"loss": 0.4341,
"step": 15310
},
{
"epoch": 0.7486134525641965,
"grad_norm": 0.4682493805885315,
"learning_rate": 1.8056308873680316e-06,
"loss": 0.4361,
"step": 15320
},
{
"epoch": 0.7491021036428938,
"grad_norm": 0.5536458492279053,
"learning_rate": 1.7990743922568699e-06,
"loss": 0.4359,
"step": 15330
},
{
"epoch": 0.7495907547215911,
"grad_norm": 0.3631746768951416,
"learning_rate": 1.7925272101603076e-06,
"loss": 0.4358,
"step": 15340
},
{
"epoch": 0.7500794058002883,
"grad_norm": 0.480092853307724,
"learning_rate": 1.7859893601272077e-06,
"loss": 0.4362,
"step": 15350
},
{
"epoch": 0.7505680568789855,
"grad_norm": 0.4252304434776306,
"learning_rate": 1.7794608611792873e-06,
"loss": 0.4339,
"step": 15360
},
{
"epoch": 0.7510567079576829,
"grad_norm": 0.37599998712539673,
"learning_rate": 1.772941732311052e-06,
"loss": 0.4346,
"step": 15370
},
{
"epoch": 0.7515453590363801,
"grad_norm": 0.5096463561058044,
"learning_rate": 1.7664319924897493e-06,
"loss": 0.4361,
"step": 15380
},
{
"epoch": 0.7520340101150773,
"grad_norm": 0.402937650680542,
"learning_rate": 1.7599316606553074e-06,
"loss": 0.4345,
"step": 15390
},
{
"epoch": 0.7525226611937745,
"grad_norm": 0.5899362564086914,
"learning_rate": 1.75344075572028e-06,
"loss": 0.4354,
"step": 15400
},
{
"epoch": 0.7530113122724719,
"grad_norm": 0.6552911996841431,
"learning_rate": 1.7469592965697985e-06,
"loss": 0.4367,
"step": 15410
},
{
"epoch": 0.7534999633511691,
"grad_norm": 0.34461089968681335,
"learning_rate": 1.7404873020615092e-06,
"loss": 0.4356,
"step": 15420
},
{
"epoch": 0.7539886144298663,
"grad_norm": 0.34054285287857056,
"learning_rate": 1.7340247910255193e-06,
"loss": 0.4347,
"step": 15430
},
{
"epoch": 0.7544772655085636,
"grad_norm": 0.548925518989563,
"learning_rate": 1.7275717822643496e-06,
"loss": 0.4356,
"step": 15440
},
{
"epoch": 0.7549659165872609,
"grad_norm": 0.3071838319301605,
"learning_rate": 1.7211282945528667e-06,
"loss": 0.4346,
"step": 15450
},
{
"epoch": 0.7554545676659581,
"grad_norm": 0.32380637526512146,
"learning_rate": 1.714694346638245e-06,
"loss": 0.4363,
"step": 15460
},
{
"epoch": 0.7559432187446554,
"grad_norm": 0.3220982253551483,
"learning_rate": 1.7082699572398941e-06,
"loss": 0.4356,
"step": 15470
},
{
"epoch": 0.7564318698233526,
"grad_norm": 0.48519644141197205,
"learning_rate": 1.7018551450494208e-06,
"loss": 0.4337,
"step": 15480
},
{
"epoch": 0.7569205209020499,
"grad_norm": 0.49619343876838684,
"learning_rate": 1.6954499287305625e-06,
"loss": 0.4359,
"step": 15490
},
{
"epoch": 0.7574091719807472,
"grad_norm": 0.31478312611579895,
"learning_rate": 1.6890543269191372e-06,
"loss": 0.4353,
"step": 15500
},
{
"epoch": 0.7574091719807472,
"eval_loss": 0.4151374399662018,
"eval_runtime": 729.456,
"eval_samples_per_second": 242.52,
"eval_steps_per_second": 0.474,
"step": 15500
},
{
"epoch": 0.7578978230594444,
"grad_norm": 0.5134409666061401,
"learning_rate": 4.396678065461651e-08,
"loss": 0.4363,
"step": 15510
},
{
"epoch": 0.7583864741381416,
"grad_norm": 0.3412030041217804,
"learning_rate": 9.281875915974597e-08,
"loss": 0.435,
"step": 15520
},
{
"epoch": 0.7588751252168389,
"grad_norm": 0.3823215365409851,
"learning_rate": 1.4167073766487544e-07,
"loss": 0.4359,
"step": 15530
},
{
"epoch": 0.7593637762955362,
"grad_norm": 0.3025282323360443,
"learning_rate": 1.905227161700049e-07,
"loss": 0.4355,
"step": 15540
},
{
"epoch": 0.7598524273742334,
"grad_norm": 0.4344797730445862,
"learning_rate": 2.3937469467513437e-07,
"loss": 0.4356,
"step": 15550
},
{
"epoch": 0.7603410784529306,
"grad_norm": 0.28436729311943054,
"learning_rate": 2.8822667318026384e-07,
"loss": 0.436,
"step": 15560
},
{
"epoch": 0.760829729531628,
"grad_norm": 0.3204064667224884,
"learning_rate": 3.3707865168539325e-07,
"loss": 0.4361,
"step": 15570
},
{
"epoch": 0.7613183806103252,
"grad_norm": 0.3875465989112854,
"learning_rate": 3.859306301905227e-07,
"loss": 0.4341,
"step": 15580
},
{
"epoch": 0.7618070316890224,
"grad_norm": 0.368078351020813,
"learning_rate": 4.347826086956522e-07,
"loss": 0.4351,
"step": 15590
},
{
"epoch": 0.7622956827677198,
"grad_norm": 0.36300018429756165,
"learning_rate": 4.836345872007817e-07,
"loss": 0.4344,
"step": 15600
},
{
"epoch": 0.762784333846417,
"grad_norm": 0.42110690474510193,
"learning_rate": 5.324865657059111e-07,
"loss": 0.434,
"step": 15610
},
{
"epoch": 0.7632729849251142,
"grad_norm": 0.37072572112083435,
"learning_rate": 5.813385442110406e-07,
"loss": 0.4354,
"step": 15620
},
{
"epoch": 0.7637616360038115,
"grad_norm": 0.5293629169464111,
"learning_rate": 6.3019052271617e-07,
"loss": 0.4342,
"step": 15630
},
{
"epoch": 0.7642502870825088,
"grad_norm": 0.31591010093688965,
"learning_rate": 6.790425012212995e-07,
"loss": 0.4343,
"step": 15640
},
{
"epoch": 0.764738938161206,
"grad_norm": 0.27564629912376404,
"learning_rate": 7.278944797264289e-07,
"loss": 0.4364,
"step": 15650
},
{
"epoch": 0.7652275892399032,
"grad_norm": 0.29514557123184204,
"learning_rate": 7.767464582315585e-07,
"loss": 0.4349,
"step": 15660
},
{
"epoch": 0.7657162403186005,
"grad_norm": 0.26547813415527344,
"learning_rate": 8.255984367366879e-07,
"loss": 0.4357,
"step": 15670
},
{
"epoch": 0.7662048913972977,
"grad_norm": 0.3546208441257477,
"learning_rate": 8.744504152418174e-07,
"loss": 0.4342,
"step": 15680
},
{
"epoch": 0.766693542475995,
"grad_norm": 0.6953465938568115,
"learning_rate": 9.233023937469468e-07,
"loss": 0.4339,
"step": 15690
},
{
"epoch": 0.7671821935546923,
"grad_norm": 0.37491822242736816,
"learning_rate": 9.721543722520762e-07,
"loss": 0.4357,
"step": 15700
},
{
"epoch": 0.7676708446333895,
"grad_norm": 0.4774235486984253,
"learning_rate": 1.0210063507572057e-06,
"loss": 0.435,
"step": 15710
},
{
"epoch": 0.7681594957120867,
"grad_norm": 0.47825121879577637,
"learning_rate": 1.0698583292623353e-06,
"loss": 0.4345,
"step": 15720
},
{
"epoch": 0.7686481467907841,
"grad_norm": 0.35943761467933655,
"learning_rate": 1.1187103077674646e-06,
"loss": 0.4345,
"step": 15730
},
{
"epoch": 0.7691367978694813,
"grad_norm": 0.41238027811050415,
"learning_rate": 1.167562286272594e-06,
"loss": 0.4351,
"step": 15740
},
{
"epoch": 0.7696254489481785,
"grad_norm": 0.5406340956687927,
"learning_rate": 1.2164142647777236e-06,
"loss": 0.4347,
"step": 15750
},
{
"epoch": 0.7701141000268759,
"grad_norm": 0.3181721568107605,
"learning_rate": 1.265266243282853e-06,
"loss": 0.4354,
"step": 15760
},
{
"epoch": 0.7706027511055731,
"grad_norm": 0.37955865263938904,
"learning_rate": 1.3141182217879824e-06,
"loss": 0.4347,
"step": 15770
},
{
"epoch": 0.7710914021842703,
"grad_norm": 0.3683488667011261,
"learning_rate": 1.362970200293112e-06,
"loss": 0.4361,
"step": 15780
},
{
"epoch": 0.7715800532629675,
"grad_norm": 0.3671647012233734,
"learning_rate": 1.4118221787982415e-06,
"loss": 0.4348,
"step": 15790
},
{
"epoch": 0.7720687043416649,
"grad_norm": 0.4749736189842224,
"learning_rate": 1.4606741573033708e-06,
"loss": 0.4354,
"step": 15800
},
{
"epoch": 0.7725573554203621,
"grad_norm": 0.2920779883861542,
"learning_rate": 1.5095261358085003e-06,
"loss": 0.4349,
"step": 15810
},
{
"epoch": 0.7730460064990593,
"grad_norm": 0.5698887705802917,
"learning_rate": 1.5583781143136298e-06,
"loss": 0.4349,
"step": 15820
},
{
"epoch": 0.7735346575777566,
"grad_norm": 0.4958445131778717,
"learning_rate": 1.6072300928187593e-06,
"loss": 0.4373,
"step": 15830
},
{
"epoch": 0.7740233086564539,
"grad_norm": 0.37633660435676575,
"learning_rate": 1.6560820713238887e-06,
"loss": 0.4356,
"step": 15840
},
{
"epoch": 0.7745119597351511,
"grad_norm": 0.3820544183254242,
"learning_rate": 1.7049340498290182e-06,
"loss": 0.4351,
"step": 15850
},
{
"epoch": 0.7750006108138484,
"grad_norm": 0.3899173140525818,
"learning_rate": 1.7537860283341477e-06,
"loss": 0.4355,
"step": 15860
},
{
"epoch": 0.7754892618925456,
"grad_norm": 0.36729347705841064,
"learning_rate": 1.802638006839277e-06,
"loss": 0.4353,
"step": 15870
},
{
"epoch": 0.7759779129712429,
"grad_norm": 0.442569762468338,
"learning_rate": 1.8514899853444065e-06,
"loss": 0.4363,
"step": 15880
},
{
"epoch": 0.7764665640499402,
"grad_norm": 0.5207741260528564,
"learning_rate": 1.900341963849536e-06,
"loss": 0.4362,
"step": 15890
},
{
"epoch": 0.7769552151286374,
"grad_norm": 0.901549756526947,
"learning_rate": 1.9491939423546656e-06,
"loss": 0.4359,
"step": 15900
},
{
"epoch": 0.7774438662073346,
"grad_norm": 0.5226088166236877,
"learning_rate": 1.998045920859795e-06,
"loss": 0.4362,
"step": 15910
},
{
"epoch": 0.7779325172860319,
"grad_norm": 0.7250573635101318,
"learning_rate": 2.046897899364924e-06,
"loss": 0.4374,
"step": 15920
},
{
"epoch": 0.7784211683647292,
"grad_norm": 0.34755152463912964,
"learning_rate": 2.0957498778700537e-06,
"loss": 0.4355,
"step": 15930
},
{
"epoch": 0.7789098194434264,
"grad_norm": 0.37030619382858276,
"learning_rate": 2.1446018563751832e-06,
"loss": 0.4356,
"step": 15940
},
{
"epoch": 0.7793984705221236,
"grad_norm": 0.44449082016944885,
"learning_rate": 2.1934538348803127e-06,
"loss": 0.4349,
"step": 15950
},
{
"epoch": 0.779887121600821,
"grad_norm": 1.273260235786438,
"learning_rate": 2.2423058133854423e-06,
"loss": 0.4356,
"step": 15960
},
{
"epoch": 0.7803757726795182,
"grad_norm": 0.6899981498718262,
"learning_rate": 2.2911577918905718e-06,
"loss": 0.4355,
"step": 15970
},
{
"epoch": 0.7808644237582154,
"grad_norm": 0.5005556344985962,
"learning_rate": 2.3400097703957013e-06,
"loss": 0.4347,
"step": 15980
},
{
"epoch": 0.7813530748369127,
"grad_norm": 0.5572786331176758,
"learning_rate": 2.388861748900831e-06,
"loss": 0.4357,
"step": 15990
},
{
"epoch": 0.78184172591561,
"grad_norm": 0.6249894499778748,
"learning_rate": 2.43771372740596e-06,
"loss": 0.4348,
"step": 16000
},
{
"epoch": 0.78184172591561,
"eval_loss": 0.4141230583190918,
"eval_runtime": 729.2488,
"eval_samples_per_second": 242.589,
"eval_steps_per_second": 0.474,
"step": 16000
}
],
"logging_steps": 10,
"max_steps": 20465,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.7679175174984827e+19,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}