ChlorophyllChampion's picture
Upload 11 files
f3d9d7e verified
raw
history blame contribute delete
No virus
211 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.15198026808855228,
"eval_steps": 500,
"global_step": 131000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.28216952085494995,
"learning_rate": 9.998839845281768e-05,
"loss": 5.404,
"step": 100
},
{
"epoch": 0.0,
"grad_norm": 0.7618443965911865,
"learning_rate": 9.997679690563535e-05,
"loss": 4.9594,
"step": 200
},
{
"epoch": 0.0,
"grad_norm": 0.8141049742698669,
"learning_rate": 9.996519535845302e-05,
"loss": 4.9031,
"step": 300
},
{
"epoch": 0.0,
"grad_norm": 2.8989076614379883,
"learning_rate": 9.995359381127067e-05,
"loss": 4.8882,
"step": 400
},
{
"epoch": 0.0,
"grad_norm": 1.622833251953125,
"learning_rate": 9.994199226408834e-05,
"loss": 4.8752,
"step": 500
},
{
"epoch": 0.0,
"grad_norm": 0.2884034812450409,
"learning_rate": 9.993039071690601e-05,
"loss": 4.8623,
"step": 600
},
{
"epoch": 0.0,
"grad_norm": 3.4078500270843506,
"learning_rate": 9.991878916972368e-05,
"loss": 4.8539,
"step": 700
},
{
"epoch": 0.0,
"grad_norm": 4.187527179718018,
"learning_rate": 9.990718762254134e-05,
"loss": 4.8479,
"step": 800
},
{
"epoch": 0.0,
"grad_norm": 1.675564169883728,
"learning_rate": 9.989558607535901e-05,
"loss": 4.8473,
"step": 900
},
{
"epoch": 0.0,
"grad_norm": 1.7946183681488037,
"learning_rate": 9.988398452817668e-05,
"loss": 4.8428,
"step": 1000
},
{
"epoch": 0.0,
"grad_norm": 3.8930752277374268,
"learning_rate": 9.987238298099435e-05,
"loss": 4.8403,
"step": 1100
},
{
"epoch": 0.0,
"grad_norm": 1.8201618194580078,
"learning_rate": 9.986078143381202e-05,
"loss": 4.8341,
"step": 1200
},
{
"epoch": 0.0,
"grad_norm": 1.5597014427185059,
"learning_rate": 9.984917988662968e-05,
"loss": 4.8324,
"step": 1300
},
{
"epoch": 0.0,
"grad_norm": 3.2638113498687744,
"learning_rate": 9.983757833944736e-05,
"loss": 4.8274,
"step": 1400
},
{
"epoch": 0.0,
"grad_norm": 1.5054211616516113,
"learning_rate": 9.982597679226502e-05,
"loss": 4.8262,
"step": 1500
},
{
"epoch": 0.0,
"grad_norm": 2.0371103286743164,
"learning_rate": 9.981437524508269e-05,
"loss": 4.826,
"step": 1600
},
{
"epoch": 0.0,
"grad_norm": 2.8935482501983643,
"learning_rate": 9.980277369790035e-05,
"loss": 4.823,
"step": 1700
},
{
"epoch": 0.0,
"grad_norm": 1.6551401615142822,
"learning_rate": 9.979117215071803e-05,
"loss": 4.8232,
"step": 1800
},
{
"epoch": 0.0,
"grad_norm": 1.8115674257278442,
"learning_rate": 9.977957060353569e-05,
"loss": 4.8182,
"step": 1900
},
{
"epoch": 0.0,
"grad_norm": 3.1292033195495605,
"learning_rate": 9.976796905635336e-05,
"loss": 4.8205,
"step": 2000
},
{
"epoch": 0.0,
"grad_norm": 1.3206559419631958,
"learning_rate": 9.975636750917103e-05,
"loss": 4.8176,
"step": 2100
},
{
"epoch": 0.0,
"grad_norm": 2.540377140045166,
"learning_rate": 9.97447659619887e-05,
"loss": 4.8147,
"step": 2200
},
{
"epoch": 0.0,
"grad_norm": 4.180980682373047,
"learning_rate": 9.973316441480637e-05,
"loss": 4.8146,
"step": 2300
},
{
"epoch": 0.0,
"grad_norm": 1.8660849332809448,
"learning_rate": 9.972156286762403e-05,
"loss": 4.8105,
"step": 2400
},
{
"epoch": 0.0,
"grad_norm": 2.911318778991699,
"learning_rate": 9.97099613204417e-05,
"loss": 4.8131,
"step": 2500
},
{
"epoch": 0.0,
"grad_norm": 4.366623878479004,
"learning_rate": 9.969835977325937e-05,
"loss": 4.8113,
"step": 2600
},
{
"epoch": 0.0,
"grad_norm": 1.6765426397323608,
"learning_rate": 9.968675822607704e-05,
"loss": 4.8092,
"step": 2700
},
{
"epoch": 0.0,
"grad_norm": 2.4829912185668945,
"learning_rate": 9.96751566788947e-05,
"loss": 4.8102,
"step": 2800
},
{
"epoch": 0.0,
"grad_norm": 3.6401925086975098,
"learning_rate": 9.966355513171237e-05,
"loss": 4.8039,
"step": 2900
},
{
"epoch": 0.0,
"grad_norm": 2.1426470279693604,
"learning_rate": 9.965195358453004e-05,
"loss": 4.8059,
"step": 3000
},
{
"epoch": 0.0,
"grad_norm": 2.311486005783081,
"learning_rate": 9.96403520373477e-05,
"loss": 4.8068,
"step": 3100
},
{
"epoch": 0.0,
"grad_norm": 2.069549083709717,
"learning_rate": 9.962875049016536e-05,
"loss": 4.8054,
"step": 3200
},
{
"epoch": 0.0,
"grad_norm": 3.6770570278167725,
"learning_rate": 9.961714894298305e-05,
"loss": 4.8008,
"step": 3300
},
{
"epoch": 0.0,
"grad_norm": 1.9226771593093872,
"learning_rate": 9.960554739580072e-05,
"loss": 4.8004,
"step": 3400
},
{
"epoch": 0.0,
"grad_norm": 3.2032277584075928,
"learning_rate": 9.959394584861837e-05,
"loss": 4.7977,
"step": 3500
},
{
"epoch": 0.0,
"grad_norm": 2.310493230819702,
"learning_rate": 9.958234430143604e-05,
"loss": 4.7997,
"step": 3600
},
{
"epoch": 0.0,
"grad_norm": 2.559783935546875,
"learning_rate": 9.957074275425372e-05,
"loss": 4.7961,
"step": 3700
},
{
"epoch": 0.0,
"grad_norm": 2.714775562286377,
"learning_rate": 9.955914120707139e-05,
"loss": 4.7943,
"step": 3800
},
{
"epoch": 0.0,
"grad_norm": 2.100062847137451,
"learning_rate": 9.954753965988904e-05,
"loss": 4.7961,
"step": 3900
},
{
"epoch": 0.0,
"grad_norm": 5.665365695953369,
"learning_rate": 9.953593811270671e-05,
"loss": 4.7992,
"step": 4000
},
{
"epoch": 0.0,
"grad_norm": 2.1073904037475586,
"learning_rate": 9.952433656552438e-05,
"loss": 4.7957,
"step": 4100
},
{
"epoch": 0.0,
"grad_norm": 3.3605892658233643,
"learning_rate": 9.951273501834205e-05,
"loss": 4.7959,
"step": 4200
},
{
"epoch": 0.0,
"grad_norm": 4.566422939300537,
"learning_rate": 9.950113347115971e-05,
"loss": 4.7975,
"step": 4300
},
{
"epoch": 0.01,
"grad_norm": 1.812922716140747,
"learning_rate": 9.948953192397738e-05,
"loss": 4.7942,
"step": 4400
},
{
"epoch": 0.01,
"grad_norm": 3.2284045219421387,
"learning_rate": 9.947793037679507e-05,
"loss": 4.7942,
"step": 4500
},
{
"epoch": 0.01,
"grad_norm": 4.2551093101501465,
"learning_rate": 9.946632882961272e-05,
"loss": 4.791,
"step": 4600
},
{
"epoch": 0.01,
"grad_norm": 1.8002365827560425,
"learning_rate": 9.945472728243039e-05,
"loss": 4.7905,
"step": 4700
},
{
"epoch": 0.01,
"grad_norm": 2.5465445518493652,
"learning_rate": 9.944312573524805e-05,
"loss": 4.7939,
"step": 4800
},
{
"epoch": 0.01,
"grad_norm": 4.329401969909668,
"learning_rate": 9.943152418806573e-05,
"loss": 4.795,
"step": 4900
},
{
"epoch": 0.01,
"grad_norm": 2.2818596363067627,
"learning_rate": 9.941992264088339e-05,
"loss": 4.7952,
"step": 5000
},
{
"epoch": 0.01,
"grad_norm": 2.9942195415496826,
"learning_rate": 9.940832109370106e-05,
"loss": 4.791,
"step": 5100
},
{
"epoch": 0.01,
"grad_norm": 5.174030780792236,
"learning_rate": 9.939671954651872e-05,
"loss": 4.7921,
"step": 5200
},
{
"epoch": 0.01,
"grad_norm": 1.8438475131988525,
"learning_rate": 9.93851179993364e-05,
"loss": 4.79,
"step": 5300
},
{
"epoch": 0.01,
"grad_norm": 3.8706634044647217,
"learning_rate": 9.937351645215407e-05,
"loss": 4.7902,
"step": 5400
},
{
"epoch": 0.01,
"grad_norm": 5.187880516052246,
"learning_rate": 9.936191490497173e-05,
"loss": 4.7917,
"step": 5500
},
{
"epoch": 0.01,
"grad_norm": 2.2282915115356445,
"learning_rate": 9.93503133577894e-05,
"loss": 4.7869,
"step": 5600
},
{
"epoch": 0.01,
"grad_norm": 3.4940683841705322,
"learning_rate": 9.933871181060707e-05,
"loss": 4.7932,
"step": 5700
},
{
"epoch": 0.01,
"grad_norm": 4.8060383796691895,
"learning_rate": 9.932711026342474e-05,
"loss": 4.788,
"step": 5800
},
{
"epoch": 0.01,
"grad_norm": 2.129607677459717,
"learning_rate": 9.93155087162424e-05,
"loss": 4.7885,
"step": 5900
},
{
"epoch": 0.01,
"grad_norm": 3.5312142372131348,
"learning_rate": 9.930390716906007e-05,
"loss": 4.786,
"step": 6000
},
{
"epoch": 0.01,
"grad_norm": 4.944730281829834,
"learning_rate": 9.929230562187774e-05,
"loss": 4.7867,
"step": 6100
},
{
"epoch": 0.01,
"grad_norm": 1.5370301008224487,
"learning_rate": 9.928070407469541e-05,
"loss": 4.7864,
"step": 6200
},
{
"epoch": 0.01,
"grad_norm": 2.7849390506744385,
"learning_rate": 9.926910252751306e-05,
"loss": 4.7885,
"step": 6300
},
{
"epoch": 0.01,
"grad_norm": 5.041503429412842,
"learning_rate": 9.925750098033075e-05,
"loss": 4.7867,
"step": 6400
},
{
"epoch": 0.01,
"grad_norm": 1.8032560348510742,
"learning_rate": 9.924589943314842e-05,
"loss": 4.7851,
"step": 6500
},
{
"epoch": 0.01,
"grad_norm": 3.296067953109741,
"learning_rate": 9.923429788596608e-05,
"loss": 4.7847,
"step": 6600
},
{
"epoch": 0.01,
"grad_norm": 5.063502311706543,
"learning_rate": 9.922269633878375e-05,
"loss": 4.7832,
"step": 6700
},
{
"epoch": 0.01,
"grad_norm": 2.149693012237549,
"learning_rate": 9.921109479160142e-05,
"loss": 4.779,
"step": 6800
},
{
"epoch": 0.01,
"grad_norm": 3.4809441566467285,
"learning_rate": 9.919949324441909e-05,
"loss": 4.7823,
"step": 6900
},
{
"epoch": 0.01,
"grad_norm": 4.948490142822266,
"learning_rate": 9.918789169723674e-05,
"loss": 4.7842,
"step": 7000
},
{
"epoch": 0.01,
"grad_norm": 1.5616728067398071,
"learning_rate": 9.917629015005441e-05,
"loss": 4.7861,
"step": 7100
},
{
"epoch": 0.01,
"grad_norm": 3.1099443435668945,
"learning_rate": 9.916468860287209e-05,
"loss": 4.7829,
"step": 7200
},
{
"epoch": 0.01,
"grad_norm": 5.170409679412842,
"learning_rate": 9.915308705568976e-05,
"loss": 4.7817,
"step": 7300
},
{
"epoch": 0.01,
"grad_norm": 1.6011829376220703,
"learning_rate": 9.914148550850741e-05,
"loss": 4.7805,
"step": 7400
},
{
"epoch": 0.01,
"grad_norm": 3.222562789916992,
"learning_rate": 9.912988396132508e-05,
"loss": 4.7802,
"step": 7500
},
{
"epoch": 0.01,
"grad_norm": 4.803954601287842,
"learning_rate": 9.911828241414277e-05,
"loss": 4.7828,
"step": 7600
},
{
"epoch": 0.01,
"grad_norm": 1.8963723182678223,
"learning_rate": 9.910668086696042e-05,
"loss": 4.7823,
"step": 7700
},
{
"epoch": 0.01,
"grad_norm": 2.4876952171325684,
"learning_rate": 9.90950793197781e-05,
"loss": 4.7768,
"step": 7800
},
{
"epoch": 0.01,
"grad_norm": 5.709277629852295,
"learning_rate": 9.908347777259575e-05,
"loss": 4.7806,
"step": 7900
},
{
"epoch": 0.01,
"grad_norm": 1.8382807970046997,
"learning_rate": 9.907187622541343e-05,
"loss": 4.7791,
"step": 8000
},
{
"epoch": 0.01,
"grad_norm": 3.2304890155792236,
"learning_rate": 9.906027467823109e-05,
"loss": 4.778,
"step": 8100
},
{
"epoch": 0.01,
"grad_norm": 5.252920150756836,
"learning_rate": 9.904867313104876e-05,
"loss": 4.7796,
"step": 8200
},
{
"epoch": 0.01,
"grad_norm": 1.16917884349823,
"learning_rate": 9.903707158386642e-05,
"loss": 4.7817,
"step": 8300
},
{
"epoch": 0.01,
"grad_norm": 2.6509056091308594,
"learning_rate": 9.90254700366841e-05,
"loss": 4.778,
"step": 8400
},
{
"epoch": 0.01,
"grad_norm": 1.1537126302719116,
"learning_rate": 9.901386848950176e-05,
"loss": 4.7755,
"step": 8500
},
{
"epoch": 0.01,
"grad_norm": 1.7410359382629395,
"learning_rate": 9.900226694231943e-05,
"loss": 4.7751,
"step": 8600
},
{
"epoch": 0.01,
"grad_norm": 3.2682690620422363,
"learning_rate": 9.89906653951371e-05,
"loss": 4.7741,
"step": 8700
},
{
"epoch": 0.01,
"grad_norm": 1.825622320175171,
"learning_rate": 9.897906384795477e-05,
"loss": 4.7759,
"step": 8800
},
{
"epoch": 0.01,
"grad_norm": 0.8567586541175842,
"learning_rate": 9.896746230077244e-05,
"loss": 4.7738,
"step": 8900
},
{
"epoch": 0.01,
"grad_norm": 0.5261878967285156,
"learning_rate": 9.89558607535901e-05,
"loss": 4.7722,
"step": 9000
},
{
"epoch": 0.01,
"grad_norm": 0.7664969563484192,
"learning_rate": 9.894425920640777e-05,
"loss": 4.7757,
"step": 9100
},
{
"epoch": 0.01,
"grad_norm": 1.6178815364837646,
"learning_rate": 9.893265765922544e-05,
"loss": 4.7715,
"step": 9200
},
{
"epoch": 0.01,
"grad_norm": 5.013388156890869,
"learning_rate": 9.892105611204311e-05,
"loss": 4.7718,
"step": 9300
},
{
"epoch": 0.01,
"grad_norm": 0.765504777431488,
"learning_rate": 9.890945456486077e-05,
"loss": 4.7707,
"step": 9400
},
{
"epoch": 0.01,
"grad_norm": 1.0577245950698853,
"learning_rate": 9.889785301767844e-05,
"loss": 4.7709,
"step": 9500
},
{
"epoch": 0.01,
"grad_norm": 0.5963281393051147,
"learning_rate": 9.888625147049611e-05,
"loss": 4.7738,
"step": 9600
},
{
"epoch": 0.01,
"grad_norm": 1.5044456720352173,
"learning_rate": 9.887464992331378e-05,
"loss": 4.772,
"step": 9700
},
{
"epoch": 0.01,
"grad_norm": 2.449915647506714,
"learning_rate": 9.886304837613145e-05,
"loss": 4.7695,
"step": 9800
},
{
"epoch": 0.01,
"grad_norm": 4.757066249847412,
"learning_rate": 9.885144682894912e-05,
"loss": 4.7723,
"step": 9900
},
{
"epoch": 0.01,
"grad_norm": 2.8173577785491943,
"learning_rate": 9.883984528176679e-05,
"loss": 4.7711,
"step": 10000
},
{
"epoch": 0.01,
"grad_norm": 3.4169297218322754,
"learning_rate": 9.882824373458445e-05,
"loss": 4.7739,
"step": 10100
},
{
"epoch": 0.01,
"grad_norm": 5.771091938018799,
"learning_rate": 9.881664218740212e-05,
"loss": 4.7747,
"step": 10200
},
{
"epoch": 0.01,
"grad_norm": 2.6634373664855957,
"learning_rate": 9.880504064021979e-05,
"loss": 4.7732,
"step": 10300
},
{
"epoch": 0.01,
"grad_norm": 4.215581893920898,
"learning_rate": 9.879343909303746e-05,
"loss": 4.7704,
"step": 10400
},
{
"epoch": 0.01,
"grad_norm": 6.152705192565918,
"learning_rate": 9.878183754585511e-05,
"loss": 4.7699,
"step": 10500
},
{
"epoch": 0.01,
"grad_norm": 1.5891577005386353,
"learning_rate": 9.877023599867278e-05,
"loss": 4.7717,
"step": 10600
},
{
"epoch": 0.01,
"grad_norm": 4.256149768829346,
"learning_rate": 9.875863445149045e-05,
"loss": 4.7664,
"step": 10700
},
{
"epoch": 0.01,
"grad_norm": 4.941390037536621,
"learning_rate": 9.874703290430813e-05,
"loss": 4.7721,
"step": 10800
},
{
"epoch": 0.01,
"grad_norm": 1.2469244003295898,
"learning_rate": 9.87354313571258e-05,
"loss": 4.7676,
"step": 10900
},
{
"epoch": 0.01,
"grad_norm": 1.073249340057373,
"learning_rate": 9.872382980994345e-05,
"loss": 4.7644,
"step": 11000
},
{
"epoch": 0.01,
"grad_norm": 1.6434860229492188,
"learning_rate": 9.871222826276114e-05,
"loss": 4.7645,
"step": 11100
},
{
"epoch": 0.01,
"grad_norm": 0.8313368558883667,
"learning_rate": 9.87006267155788e-05,
"loss": 4.7637,
"step": 11200
},
{
"epoch": 0.01,
"grad_norm": 0.8201664090156555,
"learning_rate": 9.868902516839646e-05,
"loss": 4.7667,
"step": 11300
},
{
"epoch": 0.01,
"grad_norm": 0.47900694608688354,
"learning_rate": 9.867742362121412e-05,
"loss": 4.7652,
"step": 11400
},
{
"epoch": 0.01,
"grad_norm": 0.9311307072639465,
"learning_rate": 9.86658220740318e-05,
"loss": 4.766,
"step": 11500
},
{
"epoch": 0.01,
"grad_norm": 8.805180549621582,
"learning_rate": 9.865422052684946e-05,
"loss": 4.7641,
"step": 11600
},
{
"epoch": 0.01,
"grad_norm": 9.388609886169434,
"learning_rate": 9.864261897966713e-05,
"loss": 4.7717,
"step": 11700
},
{
"epoch": 0.01,
"grad_norm": 2.2487683296203613,
"learning_rate": 9.86310174324848e-05,
"loss": 4.7689,
"step": 11800
},
{
"epoch": 0.01,
"grad_norm": 4.081148624420166,
"learning_rate": 9.861941588530247e-05,
"loss": 4.7673,
"step": 11900
},
{
"epoch": 0.01,
"grad_norm": 7.44789981842041,
"learning_rate": 9.860781433812014e-05,
"loss": 4.7722,
"step": 12000
},
{
"epoch": 0.01,
"grad_norm": 1.649463176727295,
"learning_rate": 9.85962127909378e-05,
"loss": 4.7667,
"step": 12100
},
{
"epoch": 0.01,
"grad_norm": 3.827794075012207,
"learning_rate": 9.858461124375547e-05,
"loss": 4.768,
"step": 12200
},
{
"epoch": 0.01,
"grad_norm": 6.945425510406494,
"learning_rate": 9.857300969657314e-05,
"loss": 4.7689,
"step": 12300
},
{
"epoch": 0.01,
"grad_norm": 1.1629202365875244,
"learning_rate": 9.856140814939081e-05,
"loss": 4.7656,
"step": 12400
},
{
"epoch": 0.01,
"grad_norm": 0.5653738379478455,
"learning_rate": 9.854980660220847e-05,
"loss": 4.7631,
"step": 12500
},
{
"epoch": 0.01,
"grad_norm": 0.724161684513092,
"learning_rate": 9.853820505502614e-05,
"loss": 4.7622,
"step": 12600
},
{
"epoch": 0.01,
"grad_norm": 0.5713109970092773,
"learning_rate": 9.852660350784381e-05,
"loss": 4.7633,
"step": 12700
},
{
"epoch": 0.01,
"grad_norm": 0.9219802021980286,
"learning_rate": 9.851500196066148e-05,
"loss": 4.7616,
"step": 12800
},
{
"epoch": 0.01,
"grad_norm": 0.7306973338127136,
"learning_rate": 9.850340041347914e-05,
"loss": 4.7616,
"step": 12900
},
{
"epoch": 0.02,
"grad_norm": 2.1123743057250977,
"learning_rate": 9.849179886629682e-05,
"loss": 4.7635,
"step": 13000
},
{
"epoch": 0.02,
"grad_norm": 0.5764673352241516,
"learning_rate": 9.848019731911449e-05,
"loss": 4.7602,
"step": 13100
},
{
"epoch": 0.02,
"grad_norm": 1.456514835357666,
"learning_rate": 9.846859577193215e-05,
"loss": 4.762,
"step": 13200
},
{
"epoch": 0.02,
"grad_norm": 0.6817034482955933,
"learning_rate": 9.845699422474982e-05,
"loss": 4.7585,
"step": 13300
},
{
"epoch": 0.02,
"grad_norm": 1.166177749633789,
"learning_rate": 9.844539267756749e-05,
"loss": 4.7653,
"step": 13400
},
{
"epoch": 0.02,
"grad_norm": 1.546487808227539,
"learning_rate": 9.843379113038516e-05,
"loss": 4.7607,
"step": 13500
},
{
"epoch": 0.02,
"grad_norm": 0.610224187374115,
"learning_rate": 9.842218958320282e-05,
"loss": 4.7598,
"step": 13600
},
{
"epoch": 0.02,
"grad_norm": 0.5171063542366028,
"learning_rate": 9.841058803602049e-05,
"loss": 4.7598,
"step": 13700
},
{
"epoch": 0.02,
"grad_norm": 2.632072687149048,
"learning_rate": 9.839898648883816e-05,
"loss": 4.7634,
"step": 13800
},
{
"epoch": 0.02,
"grad_norm": 0.6531881093978882,
"learning_rate": 9.838738494165583e-05,
"loss": 4.7585,
"step": 13900
},
{
"epoch": 0.02,
"grad_norm": 0.6771488189697266,
"learning_rate": 9.837578339447348e-05,
"loss": 4.757,
"step": 14000
},
{
"epoch": 0.02,
"grad_norm": 1.341787338256836,
"learning_rate": 9.836418184729115e-05,
"loss": 4.7579,
"step": 14100
},
{
"epoch": 0.02,
"grad_norm": 2.1839771270751953,
"learning_rate": 9.835258030010884e-05,
"loss": 4.7581,
"step": 14200
},
{
"epoch": 0.02,
"grad_norm": 1.8916560411453247,
"learning_rate": 9.83409787529265e-05,
"loss": 4.7599,
"step": 14300
},
{
"epoch": 0.02,
"grad_norm": 2.7338578701019287,
"learning_rate": 9.832937720574417e-05,
"loss": 4.7607,
"step": 14400
},
{
"epoch": 0.02,
"grad_norm": 0.7070731520652771,
"learning_rate": 9.831777565856182e-05,
"loss": 4.7587,
"step": 14500
},
{
"epoch": 0.02,
"grad_norm": 0.5772482752799988,
"learning_rate": 9.83061741113795e-05,
"loss": 4.763,
"step": 14600
},
{
"epoch": 0.02,
"grad_norm": 0.9034737348556519,
"learning_rate": 9.829457256419716e-05,
"loss": 4.7597,
"step": 14700
},
{
"epoch": 0.02,
"grad_norm": 2.127034902572632,
"learning_rate": 9.828297101701483e-05,
"loss": 4.7573,
"step": 14800
},
{
"epoch": 0.02,
"grad_norm": 1.8689446449279785,
"learning_rate": 9.82713694698325e-05,
"loss": 4.757,
"step": 14900
},
{
"epoch": 0.02,
"grad_norm": 0.7055838108062744,
"learning_rate": 9.825976792265017e-05,
"loss": 4.7557,
"step": 15000
},
{
"epoch": 0.02,
"grad_norm": 1.9538499116897583,
"learning_rate": 9.824816637546783e-05,
"loss": 4.7611,
"step": 15100
},
{
"epoch": 0.02,
"grad_norm": 1.487042784690857,
"learning_rate": 9.82365648282855e-05,
"loss": 4.7574,
"step": 15200
},
{
"epoch": 0.02,
"grad_norm": 2.1112914085388184,
"learning_rate": 9.822496328110317e-05,
"loss": 4.7566,
"step": 15300
},
{
"epoch": 0.02,
"grad_norm": 0.9239519834518433,
"learning_rate": 9.821336173392084e-05,
"loss": 4.7587,
"step": 15400
},
{
"epoch": 0.02,
"grad_norm": 3.975541353225708,
"learning_rate": 9.820176018673851e-05,
"loss": 4.7577,
"step": 15500
},
{
"epoch": 0.02,
"grad_norm": 0.5043540000915527,
"learning_rate": 9.819015863955617e-05,
"loss": 4.7565,
"step": 15600
},
{
"epoch": 0.02,
"grad_norm": 0.8457772731781006,
"learning_rate": 9.817855709237384e-05,
"loss": 4.7573,
"step": 15700
},
{
"epoch": 0.02,
"grad_norm": 0.5468181371688843,
"learning_rate": 9.816695554519151e-05,
"loss": 4.7555,
"step": 15800
},
{
"epoch": 0.02,
"grad_norm": 5.87544584274292,
"learning_rate": 9.815535399800918e-05,
"loss": 4.7588,
"step": 15900
},
{
"epoch": 0.02,
"grad_norm": 1.0000234842300415,
"learning_rate": 9.814375245082684e-05,
"loss": 4.753,
"step": 16000
},
{
"epoch": 0.02,
"grad_norm": 0.6940212249755859,
"learning_rate": 9.813215090364452e-05,
"loss": 4.7543,
"step": 16100
},
{
"epoch": 0.02,
"grad_norm": 0.9902929663658142,
"learning_rate": 9.812054935646219e-05,
"loss": 4.7562,
"step": 16200
},
{
"epoch": 0.02,
"grad_norm": 17.795963287353516,
"learning_rate": 9.810894780927985e-05,
"loss": 4.7551,
"step": 16300
},
{
"epoch": 0.02,
"grad_norm": 7.341447830200195,
"learning_rate": 9.809734626209752e-05,
"loss": 4.7569,
"step": 16400
},
{
"epoch": 0.02,
"grad_norm": 7.844032287597656,
"learning_rate": 9.808574471491519e-05,
"loss": 4.7546,
"step": 16500
},
{
"epoch": 0.02,
"grad_norm": 0.8450008034706116,
"learning_rate": 9.807414316773286e-05,
"loss": 4.7554,
"step": 16600
},
{
"epoch": 0.02,
"grad_norm": 1.3422025442123413,
"learning_rate": 9.806254162055052e-05,
"loss": 4.7524,
"step": 16700
},
{
"epoch": 0.02,
"grad_norm": 0.6606966853141785,
"learning_rate": 9.805094007336819e-05,
"loss": 4.7533,
"step": 16800
},
{
"epoch": 0.02,
"grad_norm": 0.6223414540290833,
"learning_rate": 9.803933852618586e-05,
"loss": 4.7544,
"step": 16900
},
{
"epoch": 0.02,
"grad_norm": 0.53001469373703,
"learning_rate": 9.802773697900353e-05,
"loss": 4.7529,
"step": 17000
},
{
"epoch": 0.02,
"grad_norm": 0.538820743560791,
"learning_rate": 9.801613543182119e-05,
"loss": 4.7514,
"step": 17100
},
{
"epoch": 0.02,
"grad_norm": 6.79536247253418,
"learning_rate": 9.800453388463886e-05,
"loss": 4.7522,
"step": 17200
},
{
"epoch": 0.02,
"grad_norm": 1.8284698724746704,
"learning_rate": 9.799293233745654e-05,
"loss": 4.7532,
"step": 17300
},
{
"epoch": 0.02,
"grad_norm": 0.8321298956871033,
"learning_rate": 9.79813307902742e-05,
"loss": 4.7553,
"step": 17400
},
{
"epoch": 0.02,
"grad_norm": 5.962058067321777,
"learning_rate": 9.796972924309187e-05,
"loss": 4.7641,
"step": 17500
},
{
"epoch": 0.02,
"grad_norm": 11.184019088745117,
"learning_rate": 9.795812769590952e-05,
"loss": 4.7608,
"step": 17600
},
{
"epoch": 0.02,
"grad_norm": 1.882926106452942,
"learning_rate": 9.794652614872721e-05,
"loss": 4.7582,
"step": 17700
},
{
"epoch": 0.02,
"grad_norm": 4.5953168869018555,
"learning_rate": 9.793492460154486e-05,
"loss": 4.756,
"step": 17800
},
{
"epoch": 0.02,
"grad_norm": 7.986889839172363,
"learning_rate": 9.792332305436253e-05,
"loss": 4.759,
"step": 17900
},
{
"epoch": 0.02,
"grad_norm": 1.7497138977050781,
"learning_rate": 9.791172150718019e-05,
"loss": 4.7567,
"step": 18000
},
{
"epoch": 0.02,
"grad_norm": 0.4782524108886719,
"learning_rate": 9.790011995999788e-05,
"loss": 4.753,
"step": 18100
},
{
"epoch": 0.02,
"grad_norm": 0.7002395987510681,
"learning_rate": 9.788851841281553e-05,
"loss": 4.7513,
"step": 18200
},
{
"epoch": 0.02,
"grad_norm": 0.49617066979408264,
"learning_rate": 9.78769168656332e-05,
"loss": 4.7508,
"step": 18300
},
{
"epoch": 0.02,
"grad_norm": 0.459750235080719,
"learning_rate": 9.786531531845087e-05,
"loss": 4.7518,
"step": 18400
},
{
"epoch": 0.02,
"grad_norm": 0.6577441692352295,
"learning_rate": 9.785371377126854e-05,
"loss": 4.7512,
"step": 18500
},
{
"epoch": 0.02,
"grad_norm": 1.284149408340454,
"learning_rate": 9.784211222408621e-05,
"loss": 4.752,
"step": 18600
},
{
"epoch": 0.02,
"grad_norm": 2.703396797180176,
"learning_rate": 9.783051067690387e-05,
"loss": 4.754,
"step": 18700
},
{
"epoch": 0.02,
"grad_norm": 0.5742882490158081,
"learning_rate": 9.781890912972154e-05,
"loss": 4.7521,
"step": 18800
},
{
"epoch": 0.02,
"grad_norm": 0.5588614344596863,
"learning_rate": 9.780730758253921e-05,
"loss": 4.7541,
"step": 18900
},
{
"epoch": 0.02,
"grad_norm": 1.6749471426010132,
"learning_rate": 9.779570603535688e-05,
"loss": 4.7521,
"step": 19000
},
{
"epoch": 0.02,
"grad_norm": 1.4725801944732666,
"learning_rate": 9.778410448817454e-05,
"loss": 4.7503,
"step": 19100
},
{
"epoch": 0.02,
"grad_norm": 0.7287809252738953,
"learning_rate": 9.777250294099222e-05,
"loss": 4.7492,
"step": 19200
},
{
"epoch": 0.02,
"grad_norm": 3.7486460208892822,
"learning_rate": 9.776090139380988e-05,
"loss": 4.7488,
"step": 19300
},
{
"epoch": 0.02,
"grad_norm": 2.859056234359741,
"learning_rate": 9.774929984662755e-05,
"loss": 4.7492,
"step": 19400
},
{
"epoch": 0.02,
"grad_norm": 0.5980411171913147,
"learning_rate": 9.773769829944522e-05,
"loss": 4.7534,
"step": 19500
},
{
"epoch": 0.02,
"grad_norm": 0.6939360499382019,
"learning_rate": 9.772609675226289e-05,
"loss": 4.7484,
"step": 19600
},
{
"epoch": 0.02,
"grad_norm": 7.793490886688232,
"learning_rate": 9.771449520508056e-05,
"loss": 4.751,
"step": 19700
},
{
"epoch": 0.02,
"grad_norm": 0.7629129886627197,
"learning_rate": 9.770289365789822e-05,
"loss": 4.7474,
"step": 19800
},
{
"epoch": 0.02,
"grad_norm": 2.9731252193450928,
"learning_rate": 9.769129211071589e-05,
"loss": 4.7499,
"step": 19900
},
{
"epoch": 0.02,
"grad_norm": 0.9709353446960449,
"learning_rate": 9.767969056353356e-05,
"loss": 4.7488,
"step": 20000
},
{
"epoch": 0.02,
"grad_norm": 0.8048213720321655,
"learning_rate": 9.766808901635123e-05,
"loss": 4.7483,
"step": 20100
},
{
"epoch": 0.02,
"grad_norm": 0.6946632862091064,
"learning_rate": 9.765648746916889e-05,
"loss": 4.748,
"step": 20200
},
{
"epoch": 0.02,
"grad_norm": 13.102263450622559,
"learning_rate": 9.764488592198656e-05,
"loss": 4.7483,
"step": 20300
},
{
"epoch": 0.02,
"grad_norm": 0.7083731889724731,
"learning_rate": 9.763328437480423e-05,
"loss": 4.7495,
"step": 20400
},
{
"epoch": 0.02,
"grad_norm": 0.5328574180603027,
"learning_rate": 9.76216828276219e-05,
"loss": 4.7492,
"step": 20500
},
{
"epoch": 0.02,
"grad_norm": 2.2578125,
"learning_rate": 9.761008128043957e-05,
"loss": 4.7474,
"step": 20600
},
{
"epoch": 0.02,
"grad_norm": 7.049394607543945,
"learning_rate": 9.759847973325723e-05,
"loss": 4.7484,
"step": 20700
},
{
"epoch": 0.02,
"grad_norm": 0.5719186663627625,
"learning_rate": 9.758687818607491e-05,
"loss": 4.7476,
"step": 20800
},
{
"epoch": 0.02,
"grad_norm": 1.7516430616378784,
"learning_rate": 9.757527663889257e-05,
"loss": 4.7468,
"step": 20900
},
{
"epoch": 0.02,
"grad_norm": 0.7269846200942993,
"learning_rate": 9.756367509171024e-05,
"loss": 4.7482,
"step": 21000
},
{
"epoch": 0.02,
"grad_norm": 1.0633318424224854,
"learning_rate": 9.75520735445279e-05,
"loss": 4.7469,
"step": 21100
},
{
"epoch": 0.02,
"grad_norm": 0.5097702145576477,
"learning_rate": 9.754047199734558e-05,
"loss": 4.7482,
"step": 21200
},
{
"epoch": 0.02,
"grad_norm": 0.5308384895324707,
"learning_rate": 9.752887045016323e-05,
"loss": 4.7444,
"step": 21300
},
{
"epoch": 0.02,
"grad_norm": 0.5017550587654114,
"learning_rate": 9.75172689029809e-05,
"loss": 4.7474,
"step": 21400
},
{
"epoch": 0.02,
"grad_norm": 0.526594340801239,
"learning_rate": 9.750566735579858e-05,
"loss": 4.7463,
"step": 21500
},
{
"epoch": 0.03,
"grad_norm": 2.675493001937866,
"learning_rate": 9.749406580861625e-05,
"loss": 4.7475,
"step": 21600
},
{
"epoch": 0.03,
"grad_norm": 5.905905723571777,
"learning_rate": 9.748246426143392e-05,
"loss": 4.7499,
"step": 21700
},
{
"epoch": 0.03,
"grad_norm": 4.458377838134766,
"learning_rate": 9.747086271425157e-05,
"loss": 4.7474,
"step": 21800
},
{
"epoch": 0.03,
"grad_norm": 1.3677464723587036,
"learning_rate": 9.745926116706924e-05,
"loss": 4.7502,
"step": 21900
},
{
"epoch": 0.03,
"grad_norm": 9.98462200164795,
"learning_rate": 9.744765961988691e-05,
"loss": 4.7486,
"step": 22000
},
{
"epoch": 0.03,
"grad_norm": 6.801756858825684,
"learning_rate": 9.743605807270458e-05,
"loss": 4.7458,
"step": 22100
},
{
"epoch": 0.03,
"grad_norm": 0.5047688484191895,
"learning_rate": 9.742445652552224e-05,
"loss": 4.7574,
"step": 22200
},
{
"epoch": 0.03,
"grad_norm": 0.5461822152137756,
"learning_rate": 9.741285497833991e-05,
"loss": 4.7441,
"step": 22300
},
{
"epoch": 0.03,
"grad_norm": 5.560943603515625,
"learning_rate": 9.740125343115758e-05,
"loss": 4.7447,
"step": 22400
},
{
"epoch": 0.03,
"grad_norm": 0.5233502984046936,
"learning_rate": 9.738965188397525e-05,
"loss": 4.7474,
"step": 22500
},
{
"epoch": 0.03,
"grad_norm": 0.5336612462997437,
"learning_rate": 9.737805033679291e-05,
"loss": 4.7425,
"step": 22600
},
{
"epoch": 0.03,
"grad_norm": 2.945470094680786,
"learning_rate": 9.736644878961059e-05,
"loss": 4.7482,
"step": 22700
},
{
"epoch": 0.03,
"grad_norm": 0.4918281137943268,
"learning_rate": 9.735484724242826e-05,
"loss": 4.7469,
"step": 22800
},
{
"epoch": 0.03,
"grad_norm": 2.591059923171997,
"learning_rate": 9.734324569524592e-05,
"loss": 4.7458,
"step": 22900
},
{
"epoch": 0.03,
"grad_norm": 0.5464999079704285,
"learning_rate": 9.733164414806359e-05,
"loss": 4.7443,
"step": 23000
},
{
"epoch": 0.03,
"grad_norm": 6.337867259979248,
"learning_rate": 9.732004260088126e-05,
"loss": 4.7453,
"step": 23100
},
{
"epoch": 0.03,
"grad_norm": 0.4762401878833771,
"learning_rate": 9.730844105369893e-05,
"loss": 4.7418,
"step": 23200
},
{
"epoch": 0.03,
"grad_norm": 2.0473411083221436,
"learning_rate": 9.729683950651659e-05,
"loss": 4.7454,
"step": 23300
},
{
"epoch": 0.03,
"grad_norm": 0.72366863489151,
"learning_rate": 9.728523795933426e-05,
"loss": 4.7425,
"step": 23400
},
{
"epoch": 0.03,
"grad_norm": 0.5242981314659119,
"learning_rate": 9.727363641215193e-05,
"loss": 4.7461,
"step": 23500
},
{
"epoch": 0.03,
"grad_norm": 0.6705228090286255,
"learning_rate": 9.72620348649696e-05,
"loss": 4.7495,
"step": 23600
},
{
"epoch": 0.03,
"grad_norm": 1.549325704574585,
"learning_rate": 9.725043331778726e-05,
"loss": 4.7436,
"step": 23700
},
{
"epoch": 0.03,
"grad_norm": 0.599336564540863,
"learning_rate": 9.723883177060493e-05,
"loss": 4.743,
"step": 23800
},
{
"epoch": 0.03,
"grad_norm": 1.103011965751648,
"learning_rate": 9.722723022342261e-05,
"loss": 4.7437,
"step": 23900
},
{
"epoch": 0.03,
"grad_norm": 3.4058070182800293,
"learning_rate": 9.721562867624027e-05,
"loss": 4.7466,
"step": 24000
},
{
"epoch": 0.03,
"grad_norm": 1.0680614709854126,
"learning_rate": 9.720402712905794e-05,
"loss": 4.7424,
"step": 24100
},
{
"epoch": 0.03,
"grad_norm": 0.5411515235900879,
"learning_rate": 9.71924255818756e-05,
"loss": 4.7408,
"step": 24200
},
{
"epoch": 0.03,
"grad_norm": 8.111778259277344,
"learning_rate": 9.718082403469328e-05,
"loss": 4.7454,
"step": 24300
},
{
"epoch": 0.03,
"grad_norm": 0.44278204441070557,
"learning_rate": 9.716922248751094e-05,
"loss": 4.7599,
"step": 24400
},
{
"epoch": 0.03,
"grad_norm": 0.6122348308563232,
"learning_rate": 9.71576209403286e-05,
"loss": 4.7427,
"step": 24500
},
{
"epoch": 0.03,
"grad_norm": 0.5633386373519897,
"learning_rate": 9.714601939314628e-05,
"loss": 4.746,
"step": 24600
},
{
"epoch": 0.03,
"grad_norm": 0.8872265219688416,
"learning_rate": 9.713441784596395e-05,
"loss": 4.7439,
"step": 24700
},
{
"epoch": 0.03,
"grad_norm": 1.6827236413955688,
"learning_rate": 9.71228162987816e-05,
"loss": 4.7413,
"step": 24800
},
{
"epoch": 0.03,
"grad_norm": 0.6328549385070801,
"learning_rate": 9.711121475159927e-05,
"loss": 4.7408,
"step": 24900
},
{
"epoch": 0.03,
"grad_norm": 0.4887780249118805,
"learning_rate": 9.709961320441694e-05,
"loss": 4.7425,
"step": 25000
},
{
"epoch": 0.03,
"grad_norm": 2.9315319061279297,
"learning_rate": 9.708801165723462e-05,
"loss": 4.7431,
"step": 25100
},
{
"epoch": 0.03,
"grad_norm": 0.6957463026046753,
"learning_rate": 9.707641011005229e-05,
"loss": 4.7414,
"step": 25200
},
{
"epoch": 0.03,
"grad_norm": 0.5213463306427002,
"learning_rate": 9.706480856286994e-05,
"loss": 4.7429,
"step": 25300
},
{
"epoch": 0.03,
"grad_norm": 0.486128032207489,
"learning_rate": 9.705320701568761e-05,
"loss": 4.7405,
"step": 25400
},
{
"epoch": 0.03,
"grad_norm": 2.561840295791626,
"learning_rate": 9.704160546850528e-05,
"loss": 4.741,
"step": 25500
},
{
"epoch": 0.03,
"grad_norm": 5.198677062988281,
"learning_rate": 9.703000392132295e-05,
"loss": 4.7447,
"step": 25600
},
{
"epoch": 0.03,
"grad_norm": 3.6826071739196777,
"learning_rate": 9.701840237414061e-05,
"loss": 4.7435,
"step": 25700
},
{
"epoch": 0.03,
"grad_norm": 0.7129253149032593,
"learning_rate": 9.70068008269583e-05,
"loss": 4.7432,
"step": 25800
},
{
"epoch": 0.03,
"grad_norm": 2.101804733276367,
"learning_rate": 9.699519927977595e-05,
"loss": 4.7518,
"step": 25900
},
{
"epoch": 0.03,
"grad_norm": 0.5510717034339905,
"learning_rate": 9.698359773259362e-05,
"loss": 4.7417,
"step": 26000
},
{
"epoch": 0.03,
"grad_norm": 0.45920300483703613,
"learning_rate": 9.697199618541129e-05,
"loss": 4.7421,
"step": 26100
},
{
"epoch": 0.03,
"grad_norm": 0.5336456894874573,
"learning_rate": 9.696039463822896e-05,
"loss": 4.7411,
"step": 26200
},
{
"epoch": 0.03,
"grad_norm": 0.5019949078559875,
"learning_rate": 9.694879309104663e-05,
"loss": 4.7436,
"step": 26300
},
{
"epoch": 0.03,
"grad_norm": 33.536495208740234,
"learning_rate": 9.693719154386429e-05,
"loss": 4.7436,
"step": 26400
},
{
"epoch": 0.03,
"grad_norm": 1.4645074605941772,
"learning_rate": 9.692558999668196e-05,
"loss": 4.7447,
"step": 26500
},
{
"epoch": 0.03,
"grad_norm": 3.655017614364624,
"learning_rate": 9.691398844949963e-05,
"loss": 4.7404,
"step": 26600
},
{
"epoch": 0.03,
"grad_norm": 0.49820154905319214,
"learning_rate": 9.69023869023173e-05,
"loss": 4.7432,
"step": 26700
},
{
"epoch": 0.03,
"grad_norm": 0.6071832180023193,
"learning_rate": 9.689078535513496e-05,
"loss": 4.7436,
"step": 26800
},
{
"epoch": 0.03,
"grad_norm": 0.5458092093467712,
"learning_rate": 9.687918380795263e-05,
"loss": 4.7438,
"step": 26900
},
{
"epoch": 0.03,
"grad_norm": 0.5005242824554443,
"learning_rate": 9.686758226077031e-05,
"loss": 4.741,
"step": 27000
},
{
"epoch": 0.03,
"grad_norm": 0.5511128306388855,
"learning_rate": 9.685598071358797e-05,
"loss": 4.743,
"step": 27100
},
{
"epoch": 0.03,
"grad_norm": 17.676786422729492,
"learning_rate": 9.684437916640564e-05,
"loss": 4.7423,
"step": 27200
},
{
"epoch": 0.03,
"grad_norm": 1.3633731603622437,
"learning_rate": 9.68327776192233e-05,
"loss": 4.7408,
"step": 27300
},
{
"epoch": 0.03,
"grad_norm": 1.5432199239730835,
"learning_rate": 9.682117607204098e-05,
"loss": 4.7391,
"step": 27400
},
{
"epoch": 0.03,
"grad_norm": 5.02588415145874,
"learning_rate": 9.680957452485864e-05,
"loss": 4.7416,
"step": 27500
},
{
"epoch": 0.03,
"grad_norm": 2.1455700397491455,
"learning_rate": 9.679797297767631e-05,
"loss": 4.7398,
"step": 27600
},
{
"epoch": 0.03,
"grad_norm": 1.6886168718338013,
"learning_rate": 9.678637143049398e-05,
"loss": 4.7418,
"step": 27700
},
{
"epoch": 0.03,
"grad_norm": 0.5645326375961304,
"learning_rate": 9.677476988331165e-05,
"loss": 4.7381,
"step": 27800
},
{
"epoch": 0.03,
"grad_norm": 6.7613911628723145,
"learning_rate": 9.67631683361293e-05,
"loss": 4.7404,
"step": 27900
},
{
"epoch": 0.03,
"grad_norm": 0.5324685573577881,
"learning_rate": 9.675156678894698e-05,
"loss": 4.7378,
"step": 28000
},
{
"epoch": 0.03,
"grad_norm": 4.839243412017822,
"learning_rate": 9.673996524176465e-05,
"loss": 4.7395,
"step": 28100
},
{
"epoch": 0.03,
"grad_norm": 0.458783894777298,
"learning_rate": 9.672836369458232e-05,
"loss": 4.7422,
"step": 28200
},
{
"epoch": 0.03,
"grad_norm": 0.9166305065155029,
"learning_rate": 9.671676214739999e-05,
"loss": 4.7428,
"step": 28300
},
{
"epoch": 0.03,
"grad_norm": 1.108514428138733,
"learning_rate": 9.670516060021764e-05,
"loss": 4.7403,
"step": 28400
},
{
"epoch": 0.03,
"grad_norm": 1.5330324172973633,
"learning_rate": 9.669355905303531e-05,
"loss": 4.7366,
"step": 28500
},
{
"epoch": 0.03,
"grad_norm": 0.4888221025466919,
"learning_rate": 9.668195750585298e-05,
"loss": 4.7384,
"step": 28600
},
{
"epoch": 0.03,
"grad_norm": 0.600286602973938,
"learning_rate": 9.667035595867066e-05,
"loss": 4.7367,
"step": 28700
},
{
"epoch": 0.03,
"grad_norm": 2.486511468887329,
"learning_rate": 9.665875441148831e-05,
"loss": 4.7391,
"step": 28800
},
{
"epoch": 0.03,
"grad_norm": 0.9454842209815979,
"learning_rate": 9.6647152864306e-05,
"loss": 4.7384,
"step": 28900
},
{
"epoch": 0.03,
"grad_norm": 1.6840468645095825,
"learning_rate": 9.663555131712365e-05,
"loss": 4.7379,
"step": 29000
},
{
"epoch": 0.03,
"grad_norm": 0.795483410358429,
"learning_rate": 9.662394976994132e-05,
"loss": 4.7393,
"step": 29100
},
{
"epoch": 0.03,
"grad_norm": 0.8666725158691406,
"learning_rate": 9.6612348222759e-05,
"loss": 4.7389,
"step": 29200
},
{
"epoch": 0.03,
"grad_norm": 4.16463565826416,
"learning_rate": 9.660074667557666e-05,
"loss": 4.7395,
"step": 29300
},
{
"epoch": 0.03,
"grad_norm": 0.6866464614868164,
"learning_rate": 9.658914512839433e-05,
"loss": 4.7401,
"step": 29400
},
{
"epoch": 0.03,
"grad_norm": 23.52367401123047,
"learning_rate": 9.657754358121199e-05,
"loss": 4.7562,
"step": 29500
},
{
"epoch": 0.03,
"grad_norm": 4.992708683013916,
"learning_rate": 9.656594203402966e-05,
"loss": 4.7743,
"step": 29600
},
{
"epoch": 0.03,
"grad_norm": 4.8701677322387695,
"learning_rate": 9.655434048684733e-05,
"loss": 4.7586,
"step": 29700
},
{
"epoch": 0.03,
"grad_norm": 4.0893425941467285,
"learning_rate": 9.6542738939665e-05,
"loss": 4.7572,
"step": 29800
},
{
"epoch": 0.03,
"grad_norm": 4.048985958099365,
"learning_rate": 9.653113739248266e-05,
"loss": 4.7514,
"step": 29900
},
{
"epoch": 0.03,
"grad_norm": 4.577606678009033,
"learning_rate": 9.651953584530033e-05,
"loss": 4.7529,
"step": 30000
},
{
"epoch": 0.03,
"grad_norm": 4.831415176391602,
"learning_rate": 9.6507934298118e-05,
"loss": 4.7512,
"step": 30100
},
{
"epoch": 0.04,
"grad_norm": 4.870159149169922,
"learning_rate": 9.649633275093567e-05,
"loss": 4.7494,
"step": 30200
},
{
"epoch": 0.04,
"grad_norm": 4.836753845214844,
"learning_rate": 9.648473120375334e-05,
"loss": 4.7493,
"step": 30300
},
{
"epoch": 0.04,
"grad_norm": 4.718664169311523,
"learning_rate": 9.6473129656571e-05,
"loss": 4.7511,
"step": 30400
},
{
"epoch": 0.04,
"grad_norm": 5.49000358581543,
"learning_rate": 9.646152810938868e-05,
"loss": 4.7494,
"step": 30500
},
{
"epoch": 0.04,
"grad_norm": 4.819366931915283,
"learning_rate": 9.644992656220634e-05,
"loss": 4.7504,
"step": 30600
},
{
"epoch": 0.04,
"grad_norm": 0.7323962450027466,
"learning_rate": 9.643832501502401e-05,
"loss": 4.7461,
"step": 30700
},
{
"epoch": 0.04,
"grad_norm": 0.8463295102119446,
"learning_rate": 9.642672346784168e-05,
"loss": 4.7376,
"step": 30800
},
{
"epoch": 0.04,
"grad_norm": 0.5474389791488647,
"learning_rate": 9.641512192065935e-05,
"loss": 4.7394,
"step": 30900
},
{
"epoch": 0.04,
"grad_norm": 0.6245602965354919,
"learning_rate": 9.640352037347701e-05,
"loss": 4.7363,
"step": 31000
},
{
"epoch": 0.04,
"grad_norm": 1.6616430282592773,
"learning_rate": 9.639191882629468e-05,
"loss": 4.7375,
"step": 31100
},
{
"epoch": 0.04,
"grad_norm": 2.0729475021362305,
"learning_rate": 9.638031727911235e-05,
"loss": 4.7393,
"step": 31200
},
{
"epoch": 0.04,
"grad_norm": 0.4616137146949768,
"learning_rate": 9.636871573193002e-05,
"loss": 4.7374,
"step": 31300
},
{
"epoch": 0.04,
"grad_norm": 1.6392802000045776,
"learning_rate": 9.635711418474769e-05,
"loss": 4.7383,
"step": 31400
},
{
"epoch": 0.04,
"grad_norm": 0.5004580616950989,
"learning_rate": 9.634551263756535e-05,
"loss": 4.7378,
"step": 31500
},
{
"epoch": 0.04,
"grad_norm": 0.5481105446815491,
"learning_rate": 9.633391109038302e-05,
"loss": 4.7382,
"step": 31600
},
{
"epoch": 0.04,
"grad_norm": 0.6281868815422058,
"learning_rate": 9.632230954320069e-05,
"loss": 4.7383,
"step": 31700
},
{
"epoch": 0.04,
"grad_norm": 0.664202868938446,
"learning_rate": 9.631070799601836e-05,
"loss": 4.739,
"step": 31800
},
{
"epoch": 0.04,
"grad_norm": 0.6041770577430725,
"learning_rate": 9.629910644883601e-05,
"loss": 4.7359,
"step": 31900
},
{
"epoch": 0.04,
"grad_norm": 3.237818717956543,
"learning_rate": 9.62875049016537e-05,
"loss": 4.7384,
"step": 32000
},
{
"epoch": 0.04,
"grad_norm": 0.4940323829650879,
"learning_rate": 9.627590335447135e-05,
"loss": 4.7363,
"step": 32100
},
{
"epoch": 0.04,
"grad_norm": 0.5114046335220337,
"learning_rate": 9.626430180728902e-05,
"loss": 4.7383,
"step": 32200
},
{
"epoch": 0.04,
"grad_norm": 0.9840266704559326,
"learning_rate": 9.625270026010668e-05,
"loss": 4.7378,
"step": 32300
},
{
"epoch": 0.04,
"grad_norm": 0.5324087738990784,
"learning_rate": 9.624109871292437e-05,
"loss": 4.7398,
"step": 32400
},
{
"epoch": 0.04,
"grad_norm": 4.636378765106201,
"learning_rate": 9.622949716574204e-05,
"loss": 4.7336,
"step": 32500
},
{
"epoch": 0.04,
"grad_norm": 0.4898914396762848,
"learning_rate": 9.621789561855969e-05,
"loss": 4.7374,
"step": 32600
},
{
"epoch": 0.04,
"grad_norm": 0.6327505111694336,
"learning_rate": 9.620629407137736e-05,
"loss": 4.7321,
"step": 32700
},
{
"epoch": 0.04,
"grad_norm": 10.219440460205078,
"learning_rate": 9.619469252419503e-05,
"loss": 4.737,
"step": 32800
},
{
"epoch": 0.04,
"grad_norm": 0.5579794049263,
"learning_rate": 9.61830909770127e-05,
"loss": 4.7367,
"step": 32900
},
{
"epoch": 0.04,
"grad_norm": 0.46085384488105774,
"learning_rate": 9.617148942983036e-05,
"loss": 4.7379,
"step": 33000
},
{
"epoch": 0.04,
"grad_norm": 0.5021482110023499,
"learning_rate": 9.615988788264803e-05,
"loss": 4.7368,
"step": 33100
},
{
"epoch": 0.04,
"grad_norm": 15.20097541809082,
"learning_rate": 9.61482863354657e-05,
"loss": 4.736,
"step": 33200
},
{
"epoch": 0.04,
"grad_norm": 0.7271726727485657,
"learning_rate": 9.613668478828337e-05,
"loss": 4.7366,
"step": 33300
},
{
"epoch": 0.04,
"grad_norm": 0.48880913853645325,
"learning_rate": 9.612508324110103e-05,
"loss": 4.7366,
"step": 33400
},
{
"epoch": 0.04,
"grad_norm": 0.5590758323669434,
"learning_rate": 9.61134816939187e-05,
"loss": 4.7391,
"step": 33500
},
{
"epoch": 0.04,
"grad_norm": 0.4674455225467682,
"learning_rate": 9.610188014673638e-05,
"loss": 4.7379,
"step": 33600
},
{
"epoch": 0.04,
"grad_norm": 0.517918050289154,
"learning_rate": 9.609027859955404e-05,
"loss": 4.7367,
"step": 33700
},
{
"epoch": 0.04,
"grad_norm": 1.2398717403411865,
"learning_rate": 9.607867705237171e-05,
"loss": 4.738,
"step": 33800
},
{
"epoch": 0.04,
"grad_norm": 0.5215381979942322,
"learning_rate": 9.606707550518937e-05,
"loss": 4.7352,
"step": 33900
},
{
"epoch": 0.04,
"grad_norm": 2.9021263122558594,
"learning_rate": 9.605547395800705e-05,
"loss": 4.7348,
"step": 34000
},
{
"epoch": 0.04,
"grad_norm": 0.46950528025627136,
"learning_rate": 9.604387241082471e-05,
"loss": 4.7348,
"step": 34100
},
{
"epoch": 0.04,
"grad_norm": 0.5117696523666382,
"learning_rate": 9.603227086364238e-05,
"loss": 4.7371,
"step": 34200
},
{
"epoch": 0.04,
"grad_norm": 0.7093480825424194,
"learning_rate": 9.602066931646005e-05,
"loss": 4.7383,
"step": 34300
},
{
"epoch": 0.04,
"grad_norm": 0.4637995958328247,
"learning_rate": 9.600906776927772e-05,
"loss": 4.7371,
"step": 34400
},
{
"epoch": 0.04,
"grad_norm": 0.5002011060714722,
"learning_rate": 9.599746622209538e-05,
"loss": 4.7359,
"step": 34500
},
{
"epoch": 0.04,
"grad_norm": 0.7651986479759216,
"learning_rate": 9.598586467491305e-05,
"loss": 4.7303,
"step": 34600
},
{
"epoch": 0.04,
"grad_norm": 1.2940102815628052,
"learning_rate": 9.597426312773072e-05,
"loss": 4.7331,
"step": 34700
},
{
"epoch": 0.04,
"grad_norm": 0.5338752269744873,
"learning_rate": 9.596266158054839e-05,
"loss": 4.7361,
"step": 34800
},
{
"epoch": 0.04,
"grad_norm": 1.049210786819458,
"learning_rate": 9.595106003336606e-05,
"loss": 4.7347,
"step": 34900
},
{
"epoch": 0.04,
"grad_norm": 2.3599870204925537,
"learning_rate": 9.593945848618372e-05,
"loss": 4.7375,
"step": 35000
},
{
"epoch": 0.04,
"grad_norm": 0.5492005348205566,
"learning_rate": 9.59278569390014e-05,
"loss": 4.7352,
"step": 35100
},
{
"epoch": 0.04,
"grad_norm": 0.5505391955375671,
"learning_rate": 9.591625539181906e-05,
"loss": 4.7332,
"step": 35200
},
{
"epoch": 0.04,
"grad_norm": 0.6726617813110352,
"learning_rate": 9.590465384463673e-05,
"loss": 4.7315,
"step": 35300
},
{
"epoch": 0.04,
"grad_norm": 0.558929979801178,
"learning_rate": 9.589305229745438e-05,
"loss": 4.732,
"step": 35400
},
{
"epoch": 0.04,
"grad_norm": 1.7398250102996826,
"learning_rate": 9.588145075027207e-05,
"loss": 4.7329,
"step": 35500
},
{
"epoch": 0.04,
"grad_norm": 0.5553580522537231,
"learning_rate": 9.586984920308972e-05,
"loss": 4.7355,
"step": 35600
},
{
"epoch": 0.04,
"grad_norm": 0.5289317965507507,
"learning_rate": 9.58582476559074e-05,
"loss": 4.7363,
"step": 35700
},
{
"epoch": 0.04,
"grad_norm": 3.049525499343872,
"learning_rate": 9.584664610872507e-05,
"loss": 4.7342,
"step": 35800
},
{
"epoch": 0.04,
"grad_norm": 0.4871656000614166,
"learning_rate": 9.583504456154274e-05,
"loss": 4.7356,
"step": 35900
},
{
"epoch": 0.04,
"grad_norm": 2.0833821296691895,
"learning_rate": 9.58234430143604e-05,
"loss": 4.7345,
"step": 36000
},
{
"epoch": 0.04,
"grad_norm": 5.624002456665039,
"learning_rate": 9.581184146717806e-05,
"loss": 4.7323,
"step": 36100
},
{
"epoch": 0.04,
"grad_norm": 0.6378651261329651,
"learning_rate": 9.580023991999573e-05,
"loss": 4.7349,
"step": 36200
},
{
"epoch": 0.04,
"grad_norm": 2.1615560054779053,
"learning_rate": 9.57886383728134e-05,
"loss": 4.7345,
"step": 36300
},
{
"epoch": 0.04,
"grad_norm": 0.5154897570610046,
"learning_rate": 9.577703682563107e-05,
"loss": 4.7357,
"step": 36400
},
{
"epoch": 0.04,
"grad_norm": 6.502463340759277,
"learning_rate": 9.576543527844873e-05,
"loss": 4.7354,
"step": 36500
},
{
"epoch": 0.04,
"grad_norm": 0.5349368453025818,
"learning_rate": 9.57538337312664e-05,
"loss": 4.7342,
"step": 36600
},
{
"epoch": 0.04,
"grad_norm": 1.0265626907348633,
"learning_rate": 9.574223218408407e-05,
"loss": 4.7327,
"step": 36700
},
{
"epoch": 0.04,
"grad_norm": 0.8190938830375671,
"learning_rate": 9.573063063690174e-05,
"loss": 4.732,
"step": 36800
},
{
"epoch": 0.04,
"grad_norm": 9.420807838439941,
"learning_rate": 9.571902908971941e-05,
"loss": 4.7322,
"step": 36900
},
{
"epoch": 0.04,
"grad_norm": 0.5019901990890503,
"learning_rate": 9.570742754253707e-05,
"loss": 4.7314,
"step": 37000
},
{
"epoch": 0.04,
"grad_norm": 2.35811448097229,
"learning_rate": 9.569582599535475e-05,
"loss": 4.7314,
"step": 37100
},
{
"epoch": 0.04,
"grad_norm": 6.828240871429443,
"learning_rate": 9.568422444817241e-05,
"loss": 4.7329,
"step": 37200
},
{
"epoch": 0.04,
"grad_norm": 0.4542797803878784,
"learning_rate": 9.567262290099008e-05,
"loss": 4.7297,
"step": 37300
},
{
"epoch": 0.04,
"grad_norm": 0.4769699275493622,
"learning_rate": 9.566102135380775e-05,
"loss": 4.7334,
"step": 37400
},
{
"epoch": 0.04,
"grad_norm": 0.6346319317817688,
"learning_rate": 9.564941980662542e-05,
"loss": 4.7324,
"step": 37500
},
{
"epoch": 0.04,
"grad_norm": 0.621337890625,
"learning_rate": 9.563781825944308e-05,
"loss": 4.7335,
"step": 37600
},
{
"epoch": 0.04,
"grad_norm": 0.49926477670669556,
"learning_rate": 9.562621671226075e-05,
"loss": 4.7334,
"step": 37700
},
{
"epoch": 0.04,
"grad_norm": 1.4097819328308105,
"learning_rate": 9.561461516507842e-05,
"loss": 4.7316,
"step": 37800
},
{
"epoch": 0.04,
"grad_norm": 0.4786432087421417,
"learning_rate": 9.560301361789609e-05,
"loss": 4.7323,
"step": 37900
},
{
"epoch": 0.04,
"grad_norm": 0.7441820502281189,
"learning_rate": 9.559141207071376e-05,
"loss": 4.7322,
"step": 38000
},
{
"epoch": 0.04,
"grad_norm": 0.5333019495010376,
"learning_rate": 9.557981052353142e-05,
"loss": 4.7326,
"step": 38100
},
{
"epoch": 0.04,
"grad_norm": 0.9219884872436523,
"learning_rate": 9.556820897634909e-05,
"loss": 4.732,
"step": 38200
},
{
"epoch": 0.04,
"grad_norm": 0.7354302406311035,
"learning_rate": 9.555660742916676e-05,
"loss": 4.7322,
"step": 38300
},
{
"epoch": 0.04,
"grad_norm": 0.43798136711120605,
"learning_rate": 9.554500588198443e-05,
"loss": 4.7319,
"step": 38400
},
{
"epoch": 0.04,
"grad_norm": 0.5193877220153809,
"learning_rate": 9.553340433480208e-05,
"loss": 4.7313,
"step": 38500
},
{
"epoch": 0.04,
"grad_norm": 0.6578642725944519,
"learning_rate": 9.552180278761977e-05,
"loss": 4.7301,
"step": 38600
},
{
"epoch": 0.04,
"grad_norm": 0.481916606426239,
"learning_rate": 9.551020124043743e-05,
"loss": 4.7336,
"step": 38700
},
{
"epoch": 0.05,
"grad_norm": 0.495345801115036,
"learning_rate": 9.54985996932551e-05,
"loss": 4.7292,
"step": 38800
},
{
"epoch": 0.05,
"grad_norm": 1.2032209634780884,
"learning_rate": 9.548699814607277e-05,
"loss": 4.731,
"step": 38900
},
{
"epoch": 0.05,
"grad_norm": 0.45609620213508606,
"learning_rate": 9.547539659889044e-05,
"loss": 4.7309,
"step": 39000
},
{
"epoch": 0.05,
"grad_norm": 0.46040889620780945,
"learning_rate": 9.546379505170811e-05,
"loss": 4.7319,
"step": 39100
},
{
"epoch": 0.05,
"grad_norm": 0.8751170635223389,
"learning_rate": 9.545219350452576e-05,
"loss": 4.7304,
"step": 39200
},
{
"epoch": 0.05,
"grad_norm": 0.4785304665565491,
"learning_rate": 9.544059195734343e-05,
"loss": 4.729,
"step": 39300
},
{
"epoch": 0.05,
"grad_norm": 4.840359210968018,
"learning_rate": 9.54289904101611e-05,
"loss": 4.7368,
"step": 39400
},
{
"epoch": 0.05,
"grad_norm": 0.5387877225875854,
"learning_rate": 9.541738886297878e-05,
"loss": 4.7325,
"step": 39500
},
{
"epoch": 0.05,
"grad_norm": 1.0035640001296997,
"learning_rate": 9.540578731579643e-05,
"loss": 4.7307,
"step": 39600
},
{
"epoch": 0.05,
"grad_norm": 0.5036232471466064,
"learning_rate": 9.53941857686141e-05,
"loss": 4.7324,
"step": 39700
},
{
"epoch": 0.05,
"grad_norm": 0.8626024127006531,
"learning_rate": 9.538258422143177e-05,
"loss": 4.7286,
"step": 39800
},
{
"epoch": 0.05,
"grad_norm": 6.899303436279297,
"learning_rate": 9.537098267424944e-05,
"loss": 4.7311,
"step": 39900
},
{
"epoch": 0.05,
"grad_norm": 0.5646871328353882,
"learning_rate": 9.535938112706711e-05,
"loss": 4.7303,
"step": 40000
},
{
"epoch": 0.05,
"grad_norm": 0.9518368244171143,
"learning_rate": 9.534777957988477e-05,
"loss": 4.7314,
"step": 40100
},
{
"epoch": 0.05,
"grad_norm": 0.5652722120285034,
"learning_rate": 9.533617803270245e-05,
"loss": 4.7304,
"step": 40200
},
{
"epoch": 0.05,
"grad_norm": 0.5541896224021912,
"learning_rate": 9.532457648552011e-05,
"loss": 4.73,
"step": 40300
},
{
"epoch": 0.05,
"grad_norm": 0.49006637930870056,
"learning_rate": 9.531297493833778e-05,
"loss": 4.7311,
"step": 40400
},
{
"epoch": 0.05,
"grad_norm": 0.5993065237998962,
"learning_rate": 9.530137339115545e-05,
"loss": 4.7309,
"step": 40500
},
{
"epoch": 0.05,
"grad_norm": 0.5642876029014587,
"learning_rate": 9.528977184397312e-05,
"loss": 4.7299,
"step": 40600
},
{
"epoch": 0.05,
"grad_norm": 7.888554096221924,
"learning_rate": 9.527817029679078e-05,
"loss": 4.7435,
"step": 40700
},
{
"epoch": 0.05,
"grad_norm": 12.3725004196167,
"learning_rate": 9.526656874960845e-05,
"loss": 4.7401,
"step": 40800
},
{
"epoch": 0.05,
"grad_norm": 2.139461040496826,
"learning_rate": 9.525496720242612e-05,
"loss": 4.7359,
"step": 40900
},
{
"epoch": 0.05,
"grad_norm": 0.4835149645805359,
"learning_rate": 9.524336565524379e-05,
"loss": 4.7327,
"step": 41000
},
{
"epoch": 0.05,
"grad_norm": 0.4717291593551636,
"learning_rate": 9.523176410806146e-05,
"loss": 4.7305,
"step": 41100
},
{
"epoch": 0.05,
"grad_norm": 0.48854538798332214,
"learning_rate": 9.522016256087912e-05,
"loss": 4.7265,
"step": 41200
},
{
"epoch": 0.05,
"grad_norm": 0.48703619837760925,
"learning_rate": 9.520856101369679e-05,
"loss": 4.7299,
"step": 41300
},
{
"epoch": 0.05,
"grad_norm": 0.49546846747398376,
"learning_rate": 9.519695946651446e-05,
"loss": 4.7297,
"step": 41400
},
{
"epoch": 0.05,
"grad_norm": 0.4952056109905243,
"learning_rate": 9.518535791933213e-05,
"loss": 4.7326,
"step": 41500
},
{
"epoch": 0.05,
"grad_norm": 8.376193046569824,
"learning_rate": 9.517375637214979e-05,
"loss": 4.7306,
"step": 41600
},
{
"epoch": 0.05,
"grad_norm": 0.4306395351886749,
"learning_rate": 9.516215482496747e-05,
"loss": 4.7286,
"step": 41700
},
{
"epoch": 0.05,
"grad_norm": 0.9443186521530151,
"learning_rate": 9.515055327778513e-05,
"loss": 4.7308,
"step": 41800
},
{
"epoch": 0.05,
"grad_norm": 0.47867128252983093,
"learning_rate": 9.51389517306028e-05,
"loss": 4.7289,
"step": 41900
},
{
"epoch": 0.05,
"grad_norm": 1.2091776132583618,
"learning_rate": 9.512735018342045e-05,
"loss": 4.7378,
"step": 42000
},
{
"epoch": 0.05,
"grad_norm": 0.4395917057991028,
"learning_rate": 9.511574863623814e-05,
"loss": 4.7328,
"step": 42100
},
{
"epoch": 0.05,
"grad_norm": 0.723639965057373,
"learning_rate": 9.510414708905581e-05,
"loss": 4.7288,
"step": 42200
},
{
"epoch": 0.05,
"grad_norm": 5.487166881561279,
"learning_rate": 9.509254554187347e-05,
"loss": 4.7298,
"step": 42300
},
{
"epoch": 0.05,
"grad_norm": 2.0608367919921875,
"learning_rate": 9.508094399469114e-05,
"loss": 4.7312,
"step": 42400
},
{
"epoch": 0.05,
"grad_norm": 0.465772807598114,
"learning_rate": 9.50693424475088e-05,
"loss": 4.7308,
"step": 42500
},
{
"epoch": 0.05,
"grad_norm": 0.4866921007633209,
"learning_rate": 9.505774090032648e-05,
"loss": 4.7295,
"step": 42600
},
{
"epoch": 0.05,
"grad_norm": 1.1015464067459106,
"learning_rate": 9.504613935314413e-05,
"loss": 4.7296,
"step": 42700
},
{
"epoch": 0.05,
"grad_norm": 0.6402974128723145,
"learning_rate": 9.50345378059618e-05,
"loss": 4.7285,
"step": 42800
},
{
"epoch": 0.05,
"grad_norm": 2.76873517036438,
"learning_rate": 9.502293625877947e-05,
"loss": 4.7306,
"step": 42900
},
{
"epoch": 0.05,
"grad_norm": 0.7945021390914917,
"learning_rate": 9.501133471159715e-05,
"loss": 4.729,
"step": 43000
},
{
"epoch": 0.05,
"grad_norm": 0.3986837565898895,
"learning_rate": 9.49997331644148e-05,
"loss": 4.7306,
"step": 43100
},
{
"epoch": 0.05,
"grad_norm": 0.4017668068408966,
"learning_rate": 9.498813161723247e-05,
"loss": 4.7277,
"step": 43200
},
{
"epoch": 0.05,
"grad_norm": 1.6943633556365967,
"learning_rate": 9.497653007005016e-05,
"loss": 4.7301,
"step": 43300
},
{
"epoch": 0.05,
"grad_norm": 2.686843156814575,
"learning_rate": 9.496492852286781e-05,
"loss": 4.728,
"step": 43400
},
{
"epoch": 0.05,
"grad_norm": 0.7867078185081482,
"learning_rate": 9.495332697568548e-05,
"loss": 4.7272,
"step": 43500
},
{
"epoch": 0.05,
"grad_norm": 1.0648784637451172,
"learning_rate": 9.494172542850315e-05,
"loss": 4.7263,
"step": 43600
},
{
"epoch": 0.05,
"grad_norm": 1.6653295755386353,
"learning_rate": 9.493012388132082e-05,
"loss": 4.7309,
"step": 43700
},
{
"epoch": 0.05,
"grad_norm": 0.9347316026687622,
"learning_rate": 9.491852233413848e-05,
"loss": 4.7303,
"step": 43800
},
{
"epoch": 0.05,
"grad_norm": 1.3211824893951416,
"learning_rate": 9.490692078695615e-05,
"loss": 4.7294,
"step": 43900
},
{
"epoch": 0.05,
"grad_norm": 0.8230929970741272,
"learning_rate": 9.489531923977382e-05,
"loss": 4.7282,
"step": 44000
},
{
"epoch": 0.05,
"grad_norm": 1.3526966571807861,
"learning_rate": 9.488371769259149e-05,
"loss": 4.7267,
"step": 44100
},
{
"epoch": 0.05,
"grad_norm": 0.4441579282283783,
"learning_rate": 9.487211614540915e-05,
"loss": 4.7265,
"step": 44200
},
{
"epoch": 0.05,
"grad_norm": 5.511246204376221,
"learning_rate": 9.486051459822682e-05,
"loss": 4.7302,
"step": 44300
},
{
"epoch": 0.05,
"grad_norm": 0.5045779943466187,
"learning_rate": 9.484891305104449e-05,
"loss": 4.73,
"step": 44400
},
{
"epoch": 0.05,
"grad_norm": 0.4756096601486206,
"learning_rate": 9.483731150386216e-05,
"loss": 4.7291,
"step": 44500
},
{
"epoch": 0.05,
"grad_norm": 0.4292340576648712,
"learning_rate": 9.482570995667983e-05,
"loss": 4.7272,
"step": 44600
},
{
"epoch": 0.05,
"grad_norm": 0.6878976821899414,
"learning_rate": 9.481410840949749e-05,
"loss": 4.726,
"step": 44700
},
{
"epoch": 0.05,
"grad_norm": 0.502358078956604,
"learning_rate": 9.480250686231517e-05,
"loss": 4.7266,
"step": 44800
},
{
"epoch": 0.05,
"grad_norm": 0.6721329092979431,
"learning_rate": 9.479090531513283e-05,
"loss": 4.7294,
"step": 44900
},
{
"epoch": 0.05,
"grad_norm": 0.438326895236969,
"learning_rate": 9.47793037679505e-05,
"loss": 4.728,
"step": 45000
},
{
"epoch": 0.05,
"grad_norm": 0.722699761390686,
"learning_rate": 9.476770222076816e-05,
"loss": 4.7298,
"step": 45100
},
{
"epoch": 0.05,
"grad_norm": 7.889718532562256,
"learning_rate": 9.475610067358584e-05,
"loss": 4.724,
"step": 45200
},
{
"epoch": 0.05,
"grad_norm": 25.698381423950195,
"learning_rate": 9.47444991264035e-05,
"loss": 4.7291,
"step": 45300
},
{
"epoch": 0.05,
"grad_norm": 0.5010745525360107,
"learning_rate": 9.473289757922117e-05,
"loss": 4.7344,
"step": 45400
},
{
"epoch": 0.05,
"grad_norm": 3.237755537033081,
"learning_rate": 9.472129603203884e-05,
"loss": 4.7266,
"step": 45500
},
{
"epoch": 0.05,
"grad_norm": 0.9018293619155884,
"learning_rate": 9.470969448485651e-05,
"loss": 4.728,
"step": 45600
},
{
"epoch": 0.05,
"grad_norm": 0.5156495571136475,
"learning_rate": 9.469809293767418e-05,
"loss": 4.7259,
"step": 45700
},
{
"epoch": 0.05,
"grad_norm": 5.2465362548828125,
"learning_rate": 9.468649139049184e-05,
"loss": 4.7269,
"step": 45800
},
{
"epoch": 0.05,
"grad_norm": 0.610849916934967,
"learning_rate": 9.46748898433095e-05,
"loss": 4.7292,
"step": 45900
},
{
"epoch": 0.05,
"grad_norm": 2.3922882080078125,
"learning_rate": 9.466328829612718e-05,
"loss": 4.7283,
"step": 46000
},
{
"epoch": 0.05,
"grad_norm": 0.42459169030189514,
"learning_rate": 9.465168674894485e-05,
"loss": 4.727,
"step": 46100
},
{
"epoch": 0.05,
"grad_norm": 0.4948953092098236,
"learning_rate": 9.46400852017625e-05,
"loss": 4.7247,
"step": 46200
},
{
"epoch": 0.05,
"grad_norm": 0.4495919346809387,
"learning_rate": 9.462848365458017e-05,
"loss": 4.7275,
"step": 46300
},
{
"epoch": 0.05,
"grad_norm": 1.1093862056732178,
"learning_rate": 9.461688210739784e-05,
"loss": 4.7297,
"step": 46400
},
{
"epoch": 0.05,
"grad_norm": 0.4342365860939026,
"learning_rate": 9.460528056021551e-05,
"loss": 4.7306,
"step": 46500
},
{
"epoch": 0.05,
"grad_norm": 25.432937622070312,
"learning_rate": 9.459367901303319e-05,
"loss": 4.732,
"step": 46600
},
{
"epoch": 0.05,
"grad_norm": 2.0753910541534424,
"learning_rate": 9.458207746585086e-05,
"loss": 4.7368,
"step": 46700
},
{
"epoch": 0.05,
"grad_norm": 4.888164520263672,
"learning_rate": 9.457047591866853e-05,
"loss": 4.7355,
"step": 46800
},
{
"epoch": 0.05,
"grad_norm": 0.5124868750572205,
"learning_rate": 9.455887437148618e-05,
"loss": 4.7255,
"step": 46900
},
{
"epoch": 0.05,
"grad_norm": 0.4563619792461395,
"learning_rate": 9.454727282430385e-05,
"loss": 4.7266,
"step": 47000
},
{
"epoch": 0.05,
"grad_norm": 0.6962786316871643,
"learning_rate": 9.453567127712152e-05,
"loss": 4.7266,
"step": 47100
},
{
"epoch": 0.05,
"grad_norm": 0.6182125210762024,
"learning_rate": 9.45240697299392e-05,
"loss": 4.7262,
"step": 47200
},
{
"epoch": 0.05,
"grad_norm": 0.44259679317474365,
"learning_rate": 9.451246818275685e-05,
"loss": 4.7289,
"step": 47300
},
{
"epoch": 0.05,
"grad_norm": 1.4451597929000854,
"learning_rate": 9.450086663557452e-05,
"loss": 4.7253,
"step": 47400
},
{
"epoch": 0.06,
"grad_norm": 6.354624271392822,
"learning_rate": 9.448926508839219e-05,
"loss": 4.7252,
"step": 47500
},
{
"epoch": 0.06,
"grad_norm": 0.49475088715553284,
"learning_rate": 9.447766354120986e-05,
"loss": 4.7261,
"step": 47600
},
{
"epoch": 0.06,
"grad_norm": 5.40777587890625,
"learning_rate": 9.446606199402753e-05,
"loss": 4.7274,
"step": 47700
},
{
"epoch": 0.06,
"grad_norm": 2.0609936714172363,
"learning_rate": 9.445446044684519e-05,
"loss": 4.7283,
"step": 47800
},
{
"epoch": 0.06,
"grad_norm": 1.6895414590835571,
"learning_rate": 9.444285889966287e-05,
"loss": 4.7247,
"step": 47900
},
{
"epoch": 0.06,
"grad_norm": 0.4589858949184418,
"learning_rate": 9.443125735248053e-05,
"loss": 4.7275,
"step": 48000
},
{
"epoch": 0.06,
"grad_norm": 3.8931922912597656,
"learning_rate": 9.44196558052982e-05,
"loss": 4.7245,
"step": 48100
},
{
"epoch": 0.06,
"grad_norm": 0.5537588000297546,
"learning_rate": 9.440805425811586e-05,
"loss": 4.7283,
"step": 48200
},
{
"epoch": 0.06,
"grad_norm": 4.216842174530029,
"learning_rate": 9.439645271093354e-05,
"loss": 4.7238,
"step": 48300
},
{
"epoch": 0.06,
"grad_norm": 0.4659540355205536,
"learning_rate": 9.43848511637512e-05,
"loss": 4.7269,
"step": 48400
},
{
"epoch": 0.06,
"grad_norm": 0.4347868859767914,
"learning_rate": 9.437324961656887e-05,
"loss": 4.7272,
"step": 48500
},
{
"epoch": 0.06,
"grad_norm": 0.4908686578273773,
"learning_rate": 9.436164806938654e-05,
"loss": 4.7217,
"step": 48600
},
{
"epoch": 0.06,
"grad_norm": 0.4566729664802551,
"learning_rate": 9.435004652220421e-05,
"loss": 4.7259,
"step": 48700
},
{
"epoch": 0.06,
"grad_norm": 0.6793931126594543,
"learning_rate": 9.433844497502188e-05,
"loss": 4.7272,
"step": 48800
},
{
"epoch": 0.06,
"grad_norm": 0.4818676710128784,
"learning_rate": 9.432684342783954e-05,
"loss": 4.7261,
"step": 48900
},
{
"epoch": 0.06,
"grad_norm": 0.475321888923645,
"learning_rate": 9.431524188065721e-05,
"loss": 4.7261,
"step": 49000
},
{
"epoch": 0.06,
"grad_norm": 0.4802456200122833,
"learning_rate": 9.430364033347488e-05,
"loss": 4.7276,
"step": 49100
},
{
"epoch": 0.06,
"grad_norm": 1.9790329933166504,
"learning_rate": 9.429203878629255e-05,
"loss": 4.7238,
"step": 49200
},
{
"epoch": 0.06,
"grad_norm": 0.4709641933441162,
"learning_rate": 9.42804372391102e-05,
"loss": 4.7238,
"step": 49300
},
{
"epoch": 0.06,
"grad_norm": 0.43033114075660706,
"learning_rate": 9.426883569192788e-05,
"loss": 4.7242,
"step": 49400
},
{
"epoch": 0.06,
"grad_norm": 0.4878010153770447,
"learning_rate": 9.425723414474555e-05,
"loss": 4.7291,
"step": 49500
},
{
"epoch": 0.06,
"grad_norm": 0.6672825813293457,
"learning_rate": 9.424563259756322e-05,
"loss": 4.7276,
"step": 49600
},
{
"epoch": 0.06,
"grad_norm": 2.705127239227295,
"learning_rate": 9.423403105038089e-05,
"loss": 4.7258,
"step": 49700
},
{
"epoch": 0.06,
"grad_norm": 0.4465647339820862,
"learning_rate": 9.422242950319854e-05,
"loss": 4.7254,
"step": 49800
},
{
"epoch": 0.06,
"grad_norm": 1.3617794513702393,
"learning_rate": 9.421082795601623e-05,
"loss": 4.7275,
"step": 49900
},
{
"epoch": 0.06,
"grad_norm": 1.3789376020431519,
"learning_rate": 9.419922640883388e-05,
"loss": 4.727,
"step": 50000
},
{
"epoch": 0.06,
"grad_norm": 0.8773742318153381,
"learning_rate": 9.418762486165156e-05,
"loss": 4.7279,
"step": 50100
},
{
"epoch": 0.06,
"grad_norm": 0.47493115067481995,
"learning_rate": 9.417602331446923e-05,
"loss": 4.7244,
"step": 50200
},
{
"epoch": 0.06,
"grad_norm": 0.4560215175151825,
"learning_rate": 9.41644217672869e-05,
"loss": 4.7258,
"step": 50300
},
{
"epoch": 0.06,
"grad_norm": 0.5064975023269653,
"learning_rate": 9.415282022010455e-05,
"loss": 4.7246,
"step": 50400
},
{
"epoch": 0.06,
"grad_norm": 1.276395320892334,
"learning_rate": 9.414121867292222e-05,
"loss": 4.73,
"step": 50500
},
{
"epoch": 0.06,
"grad_norm": 0.6075153350830078,
"learning_rate": 9.41296171257399e-05,
"loss": 4.726,
"step": 50600
},
{
"epoch": 0.06,
"grad_norm": 3.9961352348327637,
"learning_rate": 9.411801557855756e-05,
"loss": 4.7256,
"step": 50700
},
{
"epoch": 0.06,
"grad_norm": 0.469164103269577,
"learning_rate": 9.410641403137523e-05,
"loss": 4.7264,
"step": 50800
},
{
"epoch": 0.06,
"grad_norm": 0.9661677479743958,
"learning_rate": 9.409481248419289e-05,
"loss": 4.7256,
"step": 50900
},
{
"epoch": 0.06,
"grad_norm": 4.348484039306641,
"learning_rate": 9.408321093701058e-05,
"loss": 4.7207,
"step": 51000
},
{
"epoch": 0.06,
"grad_norm": 0.4262404143810272,
"learning_rate": 9.407160938982823e-05,
"loss": 4.7248,
"step": 51100
},
{
"epoch": 0.06,
"grad_norm": 2.119818687438965,
"learning_rate": 9.40600078426459e-05,
"loss": 4.7234,
"step": 51200
},
{
"epoch": 0.06,
"grad_norm": 0.43322816491127014,
"learning_rate": 9.404840629546356e-05,
"loss": 4.7262,
"step": 51300
},
{
"epoch": 0.06,
"grad_norm": 0.43069151043891907,
"learning_rate": 9.403680474828124e-05,
"loss": 4.725,
"step": 51400
},
{
"epoch": 0.06,
"grad_norm": 0.5061689019203186,
"learning_rate": 9.40252032010989e-05,
"loss": 4.7281,
"step": 51500
},
{
"epoch": 0.06,
"grad_norm": 1.387876272201538,
"learning_rate": 9.401360165391657e-05,
"loss": 4.7265,
"step": 51600
},
{
"epoch": 0.06,
"grad_norm": 0.48391860723495483,
"learning_rate": 9.400200010673423e-05,
"loss": 4.7293,
"step": 51700
},
{
"epoch": 0.06,
"grad_norm": 0.8254903554916382,
"learning_rate": 9.399039855955191e-05,
"loss": 4.7232,
"step": 51800
},
{
"epoch": 0.06,
"grad_norm": 3.460291862487793,
"learning_rate": 9.397879701236958e-05,
"loss": 4.7237,
"step": 51900
},
{
"epoch": 0.06,
"grad_norm": 1.3960816860198975,
"learning_rate": 9.396719546518724e-05,
"loss": 4.7254,
"step": 52000
},
{
"epoch": 0.06,
"grad_norm": 6.771088600158691,
"learning_rate": 9.395559391800491e-05,
"loss": 4.7251,
"step": 52100
},
{
"epoch": 0.06,
"grad_norm": 0.49326056241989136,
"learning_rate": 9.394399237082258e-05,
"loss": 4.7239,
"step": 52200
},
{
"epoch": 0.06,
"grad_norm": 0.45919495820999146,
"learning_rate": 9.393239082364025e-05,
"loss": 4.7262,
"step": 52300
},
{
"epoch": 0.06,
"grad_norm": 0.7638592720031738,
"learning_rate": 9.392078927645791e-05,
"loss": 4.7253,
"step": 52400
},
{
"epoch": 0.06,
"grad_norm": 1.0643978118896484,
"learning_rate": 9.390918772927558e-05,
"loss": 4.722,
"step": 52500
},
{
"epoch": 0.06,
"grad_norm": 0.4550251364707947,
"learning_rate": 9.389758618209325e-05,
"loss": 4.7239,
"step": 52600
},
{
"epoch": 0.06,
"grad_norm": 0.7114538550376892,
"learning_rate": 9.388598463491092e-05,
"loss": 4.7239,
"step": 52700
},
{
"epoch": 0.06,
"grad_norm": 0.4556228220462799,
"learning_rate": 9.387438308772857e-05,
"loss": 4.725,
"step": 52800
},
{
"epoch": 0.06,
"grad_norm": 0.4266548752784729,
"learning_rate": 9.386278154054625e-05,
"loss": 4.7238,
"step": 52900
},
{
"epoch": 0.06,
"grad_norm": 1.829122543334961,
"learning_rate": 9.385117999336393e-05,
"loss": 4.7246,
"step": 53000
},
{
"epoch": 0.06,
"grad_norm": 0.4608537554740906,
"learning_rate": 9.383957844618159e-05,
"loss": 4.723,
"step": 53100
},
{
"epoch": 0.06,
"grad_norm": 2.6241915225982666,
"learning_rate": 9.382797689899926e-05,
"loss": 4.7239,
"step": 53200
},
{
"epoch": 0.06,
"grad_norm": 0.4583094120025635,
"learning_rate": 9.381637535181693e-05,
"loss": 4.722,
"step": 53300
},
{
"epoch": 0.06,
"grad_norm": 1.0573890209197998,
"learning_rate": 9.38047738046346e-05,
"loss": 4.726,
"step": 53400
},
{
"epoch": 0.06,
"grad_norm": 1.0593518018722534,
"learning_rate": 9.379317225745225e-05,
"loss": 4.7224,
"step": 53500
},
{
"epoch": 0.06,
"grad_norm": 15.059647560119629,
"learning_rate": 9.378157071026992e-05,
"loss": 4.7271,
"step": 53600
},
{
"epoch": 0.06,
"grad_norm": 0.4176161587238312,
"learning_rate": 9.37699691630876e-05,
"loss": 4.7219,
"step": 53700
},
{
"epoch": 0.06,
"grad_norm": 7.073505401611328,
"learning_rate": 9.375836761590527e-05,
"loss": 4.7261,
"step": 53800
},
{
"epoch": 0.06,
"grad_norm": 0.4442752003669739,
"learning_rate": 9.374676606872292e-05,
"loss": 4.7203,
"step": 53900
},
{
"epoch": 0.06,
"grad_norm": 0.9274020195007324,
"learning_rate": 9.373516452154059e-05,
"loss": 4.7251,
"step": 54000
},
{
"epoch": 0.06,
"grad_norm": 0.5794118046760559,
"learning_rate": 9.372356297435826e-05,
"loss": 4.7229,
"step": 54100
},
{
"epoch": 0.06,
"grad_norm": 0.49662327766418457,
"learning_rate": 9.371196142717593e-05,
"loss": 4.7233,
"step": 54200
},
{
"epoch": 0.06,
"grad_norm": 1.1663075685501099,
"learning_rate": 9.37003598799936e-05,
"loss": 4.7245,
"step": 54300
},
{
"epoch": 0.06,
"grad_norm": 0.5266515612602234,
"learning_rate": 9.368875833281126e-05,
"loss": 4.7285,
"step": 54400
},
{
"epoch": 0.06,
"grad_norm": 0.9966477751731873,
"learning_rate": 9.367715678562894e-05,
"loss": 4.7221,
"step": 54500
},
{
"epoch": 0.06,
"grad_norm": 0.42000776529312134,
"learning_rate": 9.36655552384466e-05,
"loss": 4.7277,
"step": 54600
},
{
"epoch": 0.06,
"grad_norm": 0.4215773344039917,
"learning_rate": 9.365395369126427e-05,
"loss": 4.7204,
"step": 54700
},
{
"epoch": 0.06,
"grad_norm": 0.4621349573135376,
"learning_rate": 9.364235214408193e-05,
"loss": 4.7214,
"step": 54800
},
{
"epoch": 0.06,
"grad_norm": 0.4330434203147888,
"learning_rate": 9.363075059689961e-05,
"loss": 4.7244,
"step": 54900
},
{
"epoch": 0.06,
"grad_norm": 0.45555201172828674,
"learning_rate": 9.361914904971727e-05,
"loss": 4.7201,
"step": 55000
},
{
"epoch": 0.06,
"grad_norm": 0.45038706064224243,
"learning_rate": 9.360754750253494e-05,
"loss": 4.7254,
"step": 55100
},
{
"epoch": 0.06,
"grad_norm": 1.2518788576126099,
"learning_rate": 9.359594595535261e-05,
"loss": 4.7237,
"step": 55200
},
{
"epoch": 0.06,
"grad_norm": 0.44323647022247314,
"learning_rate": 9.358434440817028e-05,
"loss": 4.7269,
"step": 55300
},
{
"epoch": 0.06,
"grad_norm": 1.72452974319458,
"learning_rate": 9.357274286098795e-05,
"loss": 4.7218,
"step": 55400
},
{
"epoch": 0.06,
"grad_norm": 10.425994873046875,
"learning_rate": 9.356114131380561e-05,
"loss": 4.7237,
"step": 55500
},
{
"epoch": 0.06,
"grad_norm": 1.400721549987793,
"learning_rate": 9.354953976662328e-05,
"loss": 4.7282,
"step": 55600
},
{
"epoch": 0.06,
"grad_norm": 0.49108538031578064,
"learning_rate": 9.353793821944095e-05,
"loss": 4.7187,
"step": 55700
},
{
"epoch": 0.06,
"grad_norm": 0.47107619047164917,
"learning_rate": 9.352633667225862e-05,
"loss": 4.7217,
"step": 55800
},
{
"epoch": 0.06,
"grad_norm": 0.43458929657936096,
"learning_rate": 9.351473512507628e-05,
"loss": 4.72,
"step": 55900
},
{
"epoch": 0.06,
"grad_norm": 0.4294680655002594,
"learning_rate": 9.350313357789395e-05,
"loss": 4.7232,
"step": 56000
},
{
"epoch": 0.07,
"grad_norm": 0.8897697329521179,
"learning_rate": 9.349153203071162e-05,
"loss": 4.721,
"step": 56100
},
{
"epoch": 0.07,
"grad_norm": 0.4212525188922882,
"learning_rate": 9.347993048352929e-05,
"loss": 4.7234,
"step": 56200
},
{
"epoch": 0.07,
"grad_norm": 0.9617418646812439,
"learning_rate": 9.346832893634696e-05,
"loss": 4.7209,
"step": 56300
},
{
"epoch": 0.07,
"grad_norm": 0.46716544032096863,
"learning_rate": 9.345672738916463e-05,
"loss": 4.7212,
"step": 56400
},
{
"epoch": 0.07,
"grad_norm": 0.8855605721473694,
"learning_rate": 9.34451258419823e-05,
"loss": 4.7255,
"step": 56500
},
{
"epoch": 0.07,
"grad_norm": 0.47797083854675293,
"learning_rate": 9.343352429479996e-05,
"loss": 4.7252,
"step": 56600
},
{
"epoch": 0.07,
"grad_norm": 0.5321144461631775,
"learning_rate": 9.342192274761763e-05,
"loss": 4.7234,
"step": 56700
},
{
"epoch": 0.07,
"grad_norm": 1.9607291221618652,
"learning_rate": 9.34103212004353e-05,
"loss": 4.7244,
"step": 56800
},
{
"epoch": 0.07,
"grad_norm": 0.45025816559791565,
"learning_rate": 9.339871965325297e-05,
"loss": 4.7214,
"step": 56900
},
{
"epoch": 0.07,
"grad_norm": 0.46686848998069763,
"learning_rate": 9.338711810607062e-05,
"loss": 4.7222,
"step": 57000
},
{
"epoch": 0.07,
"grad_norm": 0.4851933419704437,
"learning_rate": 9.33755165588883e-05,
"loss": 4.7246,
"step": 57100
},
{
"epoch": 0.07,
"grad_norm": 0.9222490787506104,
"learning_rate": 9.336391501170596e-05,
"loss": 4.7234,
"step": 57200
},
{
"epoch": 0.07,
"grad_norm": 2.9428153038024902,
"learning_rate": 9.335231346452364e-05,
"loss": 4.7218,
"step": 57300
},
{
"epoch": 0.07,
"grad_norm": 0.5928464531898499,
"learning_rate": 9.33407119173413e-05,
"loss": 4.7201,
"step": 57400
},
{
"epoch": 0.07,
"grad_norm": 0.47166940569877625,
"learning_rate": 9.332911037015896e-05,
"loss": 4.719,
"step": 57500
},
{
"epoch": 0.07,
"grad_norm": 0.5025938153266907,
"learning_rate": 9.331750882297665e-05,
"loss": 4.7216,
"step": 57600
},
{
"epoch": 0.07,
"grad_norm": 0.40926048159599304,
"learning_rate": 9.33059072757943e-05,
"loss": 4.7214,
"step": 57700
},
{
"epoch": 0.07,
"grad_norm": 0.5747073292732239,
"learning_rate": 9.329430572861197e-05,
"loss": 4.7236,
"step": 57800
},
{
"epoch": 0.07,
"grad_norm": 0.4234209656715393,
"learning_rate": 9.328270418142963e-05,
"loss": 4.7218,
"step": 57900
},
{
"epoch": 0.07,
"grad_norm": 0.47158312797546387,
"learning_rate": 9.327110263424731e-05,
"loss": 4.7212,
"step": 58000
},
{
"epoch": 0.07,
"grad_norm": 0.48128268122673035,
"learning_rate": 9.325950108706497e-05,
"loss": 4.7217,
"step": 58100
},
{
"epoch": 0.07,
"grad_norm": 0.6812758445739746,
"learning_rate": 9.324789953988264e-05,
"loss": 4.7229,
"step": 58200
},
{
"epoch": 0.07,
"grad_norm": 0.4417859613895416,
"learning_rate": 9.32362979927003e-05,
"loss": 4.7203,
"step": 58300
},
{
"epoch": 0.07,
"grad_norm": 0.41741812229156494,
"learning_rate": 9.322469644551798e-05,
"loss": 4.7253,
"step": 58400
},
{
"epoch": 0.07,
"grad_norm": 0.9673991799354553,
"learning_rate": 9.321309489833565e-05,
"loss": 4.7205,
"step": 58500
},
{
"epoch": 0.07,
"grad_norm": 0.4821186363697052,
"learning_rate": 9.320149335115331e-05,
"loss": 4.7232,
"step": 58600
},
{
"epoch": 0.07,
"grad_norm": 0.46331697702407837,
"learning_rate": 9.318989180397098e-05,
"loss": 4.7198,
"step": 58700
},
{
"epoch": 0.07,
"grad_norm": 0.4690409004688263,
"learning_rate": 9.317829025678865e-05,
"loss": 4.7235,
"step": 58800
},
{
"epoch": 0.07,
"grad_norm": 5.2196197509765625,
"learning_rate": 9.316668870960632e-05,
"loss": 4.7232,
"step": 58900
},
{
"epoch": 0.07,
"grad_norm": 0.4771330952644348,
"learning_rate": 9.315508716242398e-05,
"loss": 4.7252,
"step": 59000
},
{
"epoch": 0.07,
"grad_norm": 0.5050112009048462,
"learning_rate": 9.314348561524165e-05,
"loss": 4.7231,
"step": 59100
},
{
"epoch": 0.07,
"grad_norm": 5.69675874710083,
"learning_rate": 9.313188406805932e-05,
"loss": 4.7234,
"step": 59200
},
{
"epoch": 0.07,
"grad_norm": 0.6608816981315613,
"learning_rate": 9.312028252087699e-05,
"loss": 4.721,
"step": 59300
},
{
"epoch": 0.07,
"grad_norm": 0.42843931913375854,
"learning_rate": 9.310868097369466e-05,
"loss": 4.7207,
"step": 59400
},
{
"epoch": 0.07,
"grad_norm": 0.5942014455795288,
"learning_rate": 9.309707942651233e-05,
"loss": 4.7235,
"step": 59500
},
{
"epoch": 0.07,
"grad_norm": 4.165734767913818,
"learning_rate": 9.308547787933e-05,
"loss": 4.7201,
"step": 59600
},
{
"epoch": 0.07,
"grad_norm": 0.4411238729953766,
"learning_rate": 9.307387633214766e-05,
"loss": 4.7214,
"step": 59700
},
{
"epoch": 0.07,
"grad_norm": 0.42770957946777344,
"learning_rate": 9.306227478496533e-05,
"loss": 4.7232,
"step": 59800
},
{
"epoch": 0.07,
"grad_norm": 1.3136483430862427,
"learning_rate": 9.3050673237783e-05,
"loss": 4.7232,
"step": 59900
},
{
"epoch": 0.07,
"grad_norm": 3.146456241607666,
"learning_rate": 9.303907169060067e-05,
"loss": 4.7231,
"step": 60000
},
{
"epoch": 0.07,
"grad_norm": 0.6570073366165161,
"learning_rate": 9.302747014341833e-05,
"loss": 4.7241,
"step": 60100
},
{
"epoch": 0.07,
"grad_norm": 0.44109871983528137,
"learning_rate": 9.3015868596236e-05,
"loss": 4.7227,
"step": 60200
},
{
"epoch": 0.07,
"grad_norm": 4.838292598724365,
"learning_rate": 9.300426704905367e-05,
"loss": 4.7241,
"step": 60300
},
{
"epoch": 0.07,
"grad_norm": 0.6892091631889343,
"learning_rate": 9.299266550187134e-05,
"loss": 4.7245,
"step": 60400
},
{
"epoch": 0.07,
"grad_norm": 0.4367011785507202,
"learning_rate": 9.298106395468901e-05,
"loss": 4.7186,
"step": 60500
},
{
"epoch": 0.07,
"grad_norm": 0.45093631744384766,
"learning_rate": 9.296946240750666e-05,
"loss": 4.7211,
"step": 60600
},
{
"epoch": 0.07,
"grad_norm": 0.8475301861763,
"learning_rate": 9.295786086032435e-05,
"loss": 4.7201,
"step": 60700
},
{
"epoch": 0.07,
"grad_norm": 0.5478389263153076,
"learning_rate": 9.2946259313142e-05,
"loss": 4.7238,
"step": 60800
},
{
"epoch": 0.07,
"grad_norm": 0.43729883432388306,
"learning_rate": 9.293465776595968e-05,
"loss": 4.7177,
"step": 60900
},
{
"epoch": 0.07,
"grad_norm": 1.243775725364685,
"learning_rate": 9.292305621877733e-05,
"loss": 4.7232,
"step": 61000
},
{
"epoch": 0.07,
"grad_norm": 0.4283261299133301,
"learning_rate": 9.291145467159502e-05,
"loss": 4.7211,
"step": 61100
},
{
"epoch": 0.07,
"grad_norm": 0.5506263375282288,
"learning_rate": 9.289985312441267e-05,
"loss": 4.7194,
"step": 61200
},
{
"epoch": 0.07,
"grad_norm": 2.6256954669952393,
"learning_rate": 9.288825157723034e-05,
"loss": 4.7229,
"step": 61300
},
{
"epoch": 0.07,
"grad_norm": 0.47135502099990845,
"learning_rate": 9.2876650030048e-05,
"loss": 4.7194,
"step": 61400
},
{
"epoch": 0.07,
"grad_norm": 0.4235610067844391,
"learning_rate": 9.286504848286568e-05,
"loss": 4.7228,
"step": 61500
},
{
"epoch": 0.07,
"grad_norm": 0.691027820110321,
"learning_rate": 9.285344693568335e-05,
"loss": 4.7178,
"step": 61600
},
{
"epoch": 0.07,
"grad_norm": 1.3785357475280762,
"learning_rate": 9.284184538850101e-05,
"loss": 4.722,
"step": 61700
},
{
"epoch": 0.07,
"grad_norm": 0.7555792927742004,
"learning_rate": 9.283024384131868e-05,
"loss": 4.7192,
"step": 61800
},
{
"epoch": 0.07,
"grad_norm": 0.45149528980255127,
"learning_rate": 9.281864229413635e-05,
"loss": 4.7223,
"step": 61900
},
{
"epoch": 0.07,
"grad_norm": 0.3915523886680603,
"learning_rate": 9.280704074695402e-05,
"loss": 4.7281,
"step": 62000
},
{
"epoch": 0.07,
"grad_norm": 0.6808405518531799,
"learning_rate": 9.279543919977168e-05,
"loss": 4.7186,
"step": 62100
},
{
"epoch": 0.07,
"grad_norm": 0.4248678982257843,
"learning_rate": 9.278383765258935e-05,
"loss": 4.7212,
"step": 62200
},
{
"epoch": 0.07,
"grad_norm": 0.4634566605091095,
"learning_rate": 9.277223610540702e-05,
"loss": 4.7232,
"step": 62300
},
{
"epoch": 0.07,
"grad_norm": 0.435062438249588,
"learning_rate": 9.276063455822469e-05,
"loss": 4.7195,
"step": 62400
},
{
"epoch": 0.07,
"grad_norm": 0.44273582100868225,
"learning_rate": 9.274903301104235e-05,
"loss": 4.7201,
"step": 62500
},
{
"epoch": 0.07,
"grad_norm": 1.6790070533752441,
"learning_rate": 9.273743146386002e-05,
"loss": 4.7233,
"step": 62600
},
{
"epoch": 0.07,
"grad_norm": 0.6105982065200806,
"learning_rate": 9.27258299166777e-05,
"loss": 4.7195,
"step": 62700
},
{
"epoch": 0.07,
"grad_norm": 0.4533052444458008,
"learning_rate": 9.271422836949536e-05,
"loss": 4.7196,
"step": 62800
},
{
"epoch": 0.07,
"grad_norm": 2.9967572689056396,
"learning_rate": 9.270262682231303e-05,
"loss": 4.7203,
"step": 62900
},
{
"epoch": 0.07,
"grad_norm": 0.4764550030231476,
"learning_rate": 9.26910252751307e-05,
"loss": 4.7199,
"step": 63000
},
{
"epoch": 0.07,
"grad_norm": 0.6913366913795471,
"learning_rate": 9.267942372794837e-05,
"loss": 4.7205,
"step": 63100
},
{
"epoch": 0.07,
"grad_norm": 0.43220970034599304,
"learning_rate": 9.266782218076603e-05,
"loss": 4.7217,
"step": 63200
},
{
"epoch": 0.07,
"grad_norm": 0.44269081950187683,
"learning_rate": 9.26562206335837e-05,
"loss": 4.7206,
"step": 63300
},
{
"epoch": 0.07,
"grad_norm": 0.4221411645412445,
"learning_rate": 9.264461908640137e-05,
"loss": 4.7208,
"step": 63400
},
{
"epoch": 0.07,
"grad_norm": 2.0757687091827393,
"learning_rate": 9.263301753921904e-05,
"loss": 4.7207,
"step": 63500
},
{
"epoch": 0.07,
"grad_norm": 3.6522767543792725,
"learning_rate": 9.26214159920367e-05,
"loss": 4.721,
"step": 63600
},
{
"epoch": 0.07,
"grad_norm": 0.5005024075508118,
"learning_rate": 9.260981444485437e-05,
"loss": 4.7216,
"step": 63700
},
{
"epoch": 0.07,
"grad_norm": 0.45725876092910767,
"learning_rate": 9.259821289767205e-05,
"loss": 4.7182,
"step": 63800
},
{
"epoch": 0.07,
"grad_norm": 7.862248420715332,
"learning_rate": 9.25866113504897e-05,
"loss": 4.7165,
"step": 63900
},
{
"epoch": 0.07,
"grad_norm": 0.43823152780532837,
"learning_rate": 9.257500980330738e-05,
"loss": 4.7194,
"step": 64000
},
{
"epoch": 0.07,
"grad_norm": 0.4907792806625366,
"learning_rate": 9.256340825612503e-05,
"loss": 4.7216,
"step": 64100
},
{
"epoch": 0.07,
"grad_norm": 0.4658365547657013,
"learning_rate": 9.255180670894272e-05,
"loss": 4.7176,
"step": 64200
},
{
"epoch": 0.07,
"grad_norm": 0.7001955509185791,
"learning_rate": 9.254020516176037e-05,
"loss": 4.7205,
"step": 64300
},
{
"epoch": 0.07,
"grad_norm": 0.4331388771533966,
"learning_rate": 9.252860361457805e-05,
"loss": 4.7193,
"step": 64400
},
{
"epoch": 0.07,
"grad_norm": 1.5930231809616089,
"learning_rate": 9.25170020673957e-05,
"loss": 4.7191,
"step": 64500
},
{
"epoch": 0.07,
"grad_norm": 0.5135887265205383,
"learning_rate": 9.250540052021339e-05,
"loss": 4.7156,
"step": 64600
},
{
"epoch": 0.08,
"grad_norm": 13.119867324829102,
"learning_rate": 9.249379897303104e-05,
"loss": 4.717,
"step": 64700
},
{
"epoch": 0.08,
"grad_norm": 0.43670549988746643,
"learning_rate": 9.248219742584871e-05,
"loss": 4.7208,
"step": 64800
},
{
"epoch": 0.08,
"grad_norm": 1.4829180240631104,
"learning_rate": 9.247059587866638e-05,
"loss": 4.717,
"step": 64900
},
{
"epoch": 0.08,
"grad_norm": 0.5183930397033691,
"learning_rate": 9.245899433148405e-05,
"loss": 4.7213,
"step": 65000
},
{
"epoch": 0.08,
"grad_norm": 0.3946682810783386,
"learning_rate": 9.244739278430172e-05,
"loss": 4.7212,
"step": 65100
},
{
"epoch": 0.08,
"grad_norm": 0.46105512976646423,
"learning_rate": 9.243579123711938e-05,
"loss": 4.7192,
"step": 65200
},
{
"epoch": 0.08,
"grad_norm": 0.44661739468574524,
"learning_rate": 9.242418968993705e-05,
"loss": 4.7211,
"step": 65300
},
{
"epoch": 0.08,
"grad_norm": 0.6695325374603271,
"learning_rate": 9.241258814275472e-05,
"loss": 4.7193,
"step": 65400
},
{
"epoch": 0.08,
"grad_norm": 2.9194817543029785,
"learning_rate": 9.240098659557239e-05,
"loss": 4.7224,
"step": 65500
},
{
"epoch": 0.08,
"grad_norm": 0.4088338017463684,
"learning_rate": 9.238938504839005e-05,
"loss": 4.7186,
"step": 65600
},
{
"epoch": 0.08,
"grad_norm": 0.5131499767303467,
"learning_rate": 9.237778350120772e-05,
"loss": 4.7158,
"step": 65700
},
{
"epoch": 0.08,
"grad_norm": 9.862509727478027,
"learning_rate": 9.236618195402539e-05,
"loss": 4.716,
"step": 65800
},
{
"epoch": 0.08,
"grad_norm": 0.40794941782951355,
"learning_rate": 9.235458040684306e-05,
"loss": 4.718,
"step": 65900
},
{
"epoch": 0.08,
"grad_norm": 0.40948912501335144,
"learning_rate": 9.234297885966073e-05,
"loss": 4.7162,
"step": 66000
},
{
"epoch": 0.08,
"grad_norm": 3.3764171600341797,
"learning_rate": 9.23313773124784e-05,
"loss": 4.7209,
"step": 66100
},
{
"epoch": 0.08,
"grad_norm": 0.44073447585105896,
"learning_rate": 9.231977576529607e-05,
"loss": 4.7222,
"step": 66200
},
{
"epoch": 0.08,
"grad_norm": 0.5790233016014099,
"learning_rate": 9.230817421811373e-05,
"loss": 4.7167,
"step": 66300
},
{
"epoch": 0.08,
"grad_norm": 0.9112139344215393,
"learning_rate": 9.22965726709314e-05,
"loss": 4.7198,
"step": 66400
},
{
"epoch": 0.08,
"grad_norm": 0.5049501061439514,
"learning_rate": 9.228497112374907e-05,
"loss": 4.7158,
"step": 66500
},
{
"epoch": 0.08,
"grad_norm": 0.6256558895111084,
"learning_rate": 9.227336957656674e-05,
"loss": 4.7217,
"step": 66600
},
{
"epoch": 0.08,
"grad_norm": 0.392869234085083,
"learning_rate": 9.22617680293844e-05,
"loss": 4.72,
"step": 66700
},
{
"epoch": 0.08,
"grad_norm": 0.4745340347290039,
"learning_rate": 9.225016648220207e-05,
"loss": 4.719,
"step": 66800
},
{
"epoch": 0.08,
"grad_norm": 0.4279547333717346,
"learning_rate": 9.223856493501974e-05,
"loss": 4.7191,
"step": 66900
},
{
"epoch": 0.08,
"grad_norm": 3.0647037029266357,
"learning_rate": 9.222696338783741e-05,
"loss": 4.7241,
"step": 67000
},
{
"epoch": 0.08,
"grad_norm": 0.4187820255756378,
"learning_rate": 9.221536184065508e-05,
"loss": 4.7211,
"step": 67100
},
{
"epoch": 0.08,
"grad_norm": 0.9273259043693542,
"learning_rate": 9.220376029347274e-05,
"loss": 4.7245,
"step": 67200
},
{
"epoch": 0.08,
"grad_norm": 0.49627095460891724,
"learning_rate": 9.219215874629042e-05,
"loss": 4.7182,
"step": 67300
},
{
"epoch": 0.08,
"grad_norm": 1.4430867433547974,
"learning_rate": 9.218055719910808e-05,
"loss": 4.7196,
"step": 67400
},
{
"epoch": 0.08,
"grad_norm": 0.4100867509841919,
"learning_rate": 9.216895565192575e-05,
"loss": 4.7207,
"step": 67500
},
{
"epoch": 0.08,
"grad_norm": 0.49010923504829407,
"learning_rate": 9.21573541047434e-05,
"loss": 4.7169,
"step": 67600
},
{
"epoch": 0.08,
"grad_norm": 0.4209338426589966,
"learning_rate": 9.214575255756109e-05,
"loss": 4.7162,
"step": 67700
},
{
"epoch": 0.08,
"grad_norm": 0.45588409900665283,
"learning_rate": 9.213415101037874e-05,
"loss": 4.7177,
"step": 67800
},
{
"epoch": 0.08,
"grad_norm": 0.405272901058197,
"learning_rate": 9.212254946319641e-05,
"loss": 4.7175,
"step": 67900
},
{
"epoch": 0.08,
"grad_norm": 2.1375739574432373,
"learning_rate": 9.211094791601409e-05,
"loss": 4.7211,
"step": 68000
},
{
"epoch": 0.08,
"grad_norm": 0.4474073648452759,
"learning_rate": 9.209934636883176e-05,
"loss": 4.7152,
"step": 68100
},
{
"epoch": 0.08,
"grad_norm": 3.441603422164917,
"learning_rate": 9.208774482164943e-05,
"loss": 4.7209,
"step": 68200
},
{
"epoch": 0.08,
"grad_norm": 0.562470018863678,
"learning_rate": 9.207614327446708e-05,
"loss": 4.7188,
"step": 68300
},
{
"epoch": 0.08,
"grad_norm": 0.4169847071170807,
"learning_rate": 9.206454172728475e-05,
"loss": 4.7166,
"step": 68400
},
{
"epoch": 0.08,
"grad_norm": 0.43674352765083313,
"learning_rate": 9.205294018010242e-05,
"loss": 4.7136,
"step": 68500
},
{
"epoch": 0.08,
"grad_norm": 0.39852142333984375,
"learning_rate": 9.20413386329201e-05,
"loss": 4.7176,
"step": 68600
},
{
"epoch": 0.08,
"grad_norm": 0.41788631677627563,
"learning_rate": 9.202973708573775e-05,
"loss": 4.7202,
"step": 68700
},
{
"epoch": 0.08,
"grad_norm": 0.4472859501838684,
"learning_rate": 9.201813553855542e-05,
"loss": 4.7204,
"step": 68800
},
{
"epoch": 0.08,
"grad_norm": 0.3944435119628906,
"learning_rate": 9.200653399137309e-05,
"loss": 4.7181,
"step": 68900
},
{
"epoch": 0.08,
"grad_norm": 0.444679319858551,
"learning_rate": 9.199493244419076e-05,
"loss": 4.723,
"step": 69000
},
{
"epoch": 0.08,
"grad_norm": 0.4338514804840088,
"learning_rate": 9.198333089700842e-05,
"loss": 4.7213,
"step": 69100
},
{
"epoch": 0.08,
"grad_norm": 0.4399813711643219,
"learning_rate": 9.19717293498261e-05,
"loss": 4.7167,
"step": 69200
},
{
"epoch": 0.08,
"grad_norm": 0.46220558881759644,
"learning_rate": 9.196012780264377e-05,
"loss": 4.7158,
"step": 69300
},
{
"epoch": 0.08,
"grad_norm": 1.313628911972046,
"learning_rate": 9.194852625546143e-05,
"loss": 4.7165,
"step": 69400
},
{
"epoch": 0.08,
"grad_norm": 0.5480996370315552,
"learning_rate": 9.19369247082791e-05,
"loss": 4.7174,
"step": 69500
},
{
"epoch": 0.08,
"grad_norm": 0.4436434805393219,
"learning_rate": 9.192532316109677e-05,
"loss": 4.7198,
"step": 69600
},
{
"epoch": 0.08,
"grad_norm": 5.995100498199463,
"learning_rate": 9.191372161391444e-05,
"loss": 4.7176,
"step": 69700
},
{
"epoch": 0.08,
"grad_norm": 0.44759735465049744,
"learning_rate": 9.19021200667321e-05,
"loss": 4.7166,
"step": 69800
},
{
"epoch": 0.08,
"grad_norm": 0.9924051761627197,
"learning_rate": 9.189051851954977e-05,
"loss": 4.7191,
"step": 69900
},
{
"epoch": 0.08,
"grad_norm": 3.259697675704956,
"learning_rate": 9.187891697236744e-05,
"loss": 4.7166,
"step": 70000
},
{
"epoch": 0.08,
"grad_norm": 0.5200223326683044,
"learning_rate": 9.186731542518511e-05,
"loss": 4.72,
"step": 70100
},
{
"epoch": 0.08,
"grad_norm": 0.5573539733886719,
"learning_rate": 9.185571387800278e-05,
"loss": 4.7173,
"step": 70200
},
{
"epoch": 0.08,
"grad_norm": 0.46177276968955994,
"learning_rate": 9.184411233082044e-05,
"loss": 4.718,
"step": 70300
},
{
"epoch": 0.08,
"grad_norm": 0.4673067033290863,
"learning_rate": 9.183251078363812e-05,
"loss": 4.7181,
"step": 70400
},
{
"epoch": 0.08,
"grad_norm": 0.4644356667995453,
"learning_rate": 9.182090923645578e-05,
"loss": 4.7146,
"step": 70500
},
{
"epoch": 0.08,
"grad_norm": 0.4181482195854187,
"learning_rate": 9.180930768927345e-05,
"loss": 4.7132,
"step": 70600
},
{
"epoch": 0.08,
"grad_norm": 0.42789730429649353,
"learning_rate": 9.17977061420911e-05,
"loss": 4.7172,
"step": 70700
},
{
"epoch": 0.08,
"grad_norm": 3.978273391723633,
"learning_rate": 9.178610459490879e-05,
"loss": 4.7201,
"step": 70800
},
{
"epoch": 0.08,
"grad_norm": 6.412673473358154,
"learning_rate": 9.177450304772645e-05,
"loss": 4.7201,
"step": 70900
},
{
"epoch": 0.08,
"grad_norm": 0.45621249079704285,
"learning_rate": 9.176290150054412e-05,
"loss": 4.7167,
"step": 71000
},
{
"epoch": 0.08,
"grad_norm": 0.4043485224246979,
"learning_rate": 9.175129995336179e-05,
"loss": 4.7154,
"step": 71100
},
{
"epoch": 0.08,
"grad_norm": 0.45778888463974,
"learning_rate": 9.173969840617946e-05,
"loss": 4.7193,
"step": 71200
},
{
"epoch": 0.08,
"grad_norm": 0.42115822434425354,
"learning_rate": 9.172809685899713e-05,
"loss": 4.7165,
"step": 71300
},
{
"epoch": 0.08,
"grad_norm": 0.421622633934021,
"learning_rate": 9.171649531181478e-05,
"loss": 4.717,
"step": 71400
},
{
"epoch": 0.08,
"grad_norm": 0.4167540967464447,
"learning_rate": 9.170489376463245e-05,
"loss": 4.7161,
"step": 71500
},
{
"epoch": 0.08,
"grad_norm": 0.7092134356498718,
"learning_rate": 9.169329221745013e-05,
"loss": 4.7197,
"step": 71600
},
{
"epoch": 0.08,
"grad_norm": 0.5543451905250549,
"learning_rate": 9.16816906702678e-05,
"loss": 4.7185,
"step": 71700
},
{
"epoch": 0.08,
"grad_norm": 0.44146397709846497,
"learning_rate": 9.167008912308545e-05,
"loss": 4.7146,
"step": 71800
},
{
"epoch": 0.08,
"grad_norm": 0.5362756848335266,
"learning_rate": 9.165848757590312e-05,
"loss": 4.72,
"step": 71900
},
{
"epoch": 0.08,
"grad_norm": 0.4360448122024536,
"learning_rate": 9.16468860287208e-05,
"loss": 4.7198,
"step": 72000
},
{
"epoch": 0.08,
"grad_norm": 0.40648597478866577,
"learning_rate": 9.163528448153846e-05,
"loss": 4.7168,
"step": 72100
},
{
"epoch": 0.08,
"grad_norm": 0.8124480843544006,
"learning_rate": 9.162368293435612e-05,
"loss": 4.7185,
"step": 72200
},
{
"epoch": 0.08,
"grad_norm": 0.48873409628868103,
"learning_rate": 9.16120813871738e-05,
"loss": 4.7192,
"step": 72300
},
{
"epoch": 0.08,
"grad_norm": 1.013222098350525,
"learning_rate": 9.160047983999148e-05,
"loss": 4.7156,
"step": 72400
},
{
"epoch": 0.08,
"grad_norm": 0.4423314034938812,
"learning_rate": 9.158887829280913e-05,
"loss": 4.7172,
"step": 72500
},
{
"epoch": 0.08,
"grad_norm": 0.5054183602333069,
"learning_rate": 9.15772767456268e-05,
"loss": 4.7199,
"step": 72600
},
{
"epoch": 0.08,
"grad_norm": 0.4140661060810089,
"learning_rate": 9.156567519844447e-05,
"loss": 4.7142,
"step": 72700
},
{
"epoch": 0.08,
"grad_norm": 1.734790325164795,
"learning_rate": 9.155407365126214e-05,
"loss": 4.7135,
"step": 72800
},
{
"epoch": 0.08,
"grad_norm": 0.3996482193470001,
"learning_rate": 9.15424721040798e-05,
"loss": 4.7194,
"step": 72900
},
{
"epoch": 0.08,
"grad_norm": 0.4260541796684265,
"learning_rate": 9.153087055689747e-05,
"loss": 4.7143,
"step": 73000
},
{
"epoch": 0.08,
"grad_norm": 2.339716672897339,
"learning_rate": 9.151926900971514e-05,
"loss": 4.7185,
"step": 73100
},
{
"epoch": 0.08,
"grad_norm": 0.48179367184638977,
"learning_rate": 9.150766746253281e-05,
"loss": 4.7161,
"step": 73200
},
{
"epoch": 0.09,
"grad_norm": 0.4243863821029663,
"learning_rate": 9.149606591535047e-05,
"loss": 4.7168,
"step": 73300
},
{
"epoch": 0.09,
"grad_norm": 0.38295459747314453,
"learning_rate": 9.148446436816814e-05,
"loss": 4.715,
"step": 73400
},
{
"epoch": 0.09,
"grad_norm": 0.5023996829986572,
"learning_rate": 9.147286282098582e-05,
"loss": 4.7115,
"step": 73500
},
{
"epoch": 0.09,
"grad_norm": 0.5002963542938232,
"learning_rate": 9.146126127380348e-05,
"loss": 4.7145,
"step": 73600
},
{
"epoch": 0.09,
"grad_norm": 9.655466079711914,
"learning_rate": 9.144965972662115e-05,
"loss": 4.7166,
"step": 73700
},
{
"epoch": 0.09,
"grad_norm": 0.5054044723510742,
"learning_rate": 9.14380581794388e-05,
"loss": 4.7193,
"step": 73800
},
{
"epoch": 0.09,
"grad_norm": 0.453080415725708,
"learning_rate": 9.142645663225649e-05,
"loss": 4.7161,
"step": 73900
},
{
"epoch": 0.09,
"grad_norm": 0.44624748826026917,
"learning_rate": 9.141485508507415e-05,
"loss": 4.7187,
"step": 74000
},
{
"epoch": 0.09,
"grad_norm": 0.4639555811882019,
"learning_rate": 9.140325353789182e-05,
"loss": 4.7176,
"step": 74100
},
{
"epoch": 0.09,
"grad_norm": 0.41725417971611023,
"learning_rate": 9.139165199070947e-05,
"loss": 4.7163,
"step": 74200
},
{
"epoch": 0.09,
"grad_norm": 0.866028904914856,
"learning_rate": 9.138005044352716e-05,
"loss": 4.716,
"step": 74300
},
{
"epoch": 0.09,
"grad_norm": 0.46224868297576904,
"learning_rate": 9.136844889634482e-05,
"loss": 4.7139,
"step": 74400
},
{
"epoch": 0.09,
"grad_norm": 0.43956613540649414,
"learning_rate": 9.135684734916249e-05,
"loss": 4.7171,
"step": 74500
},
{
"epoch": 0.09,
"grad_norm": 0.4600673019886017,
"learning_rate": 9.134524580198016e-05,
"loss": 4.7157,
"step": 74600
},
{
"epoch": 0.09,
"grad_norm": 1.0996651649475098,
"learning_rate": 9.133364425479783e-05,
"loss": 4.7146,
"step": 74700
},
{
"epoch": 0.09,
"grad_norm": 0.5937373042106628,
"learning_rate": 9.13220427076155e-05,
"loss": 4.719,
"step": 74800
},
{
"epoch": 0.09,
"grad_norm": 3.145916223526001,
"learning_rate": 9.131044116043315e-05,
"loss": 4.7144,
"step": 74900
},
{
"epoch": 0.09,
"grad_norm": 5.668306827545166,
"learning_rate": 9.129883961325082e-05,
"loss": 4.7148,
"step": 75000
},
{
"epoch": 0.09,
"grad_norm": 0.5367663502693176,
"learning_rate": 9.12872380660685e-05,
"loss": 4.7152,
"step": 75100
},
{
"epoch": 0.09,
"grad_norm": 0.4246484637260437,
"learning_rate": 9.127563651888617e-05,
"loss": 4.7177,
"step": 75200
},
{
"epoch": 0.09,
"grad_norm": 0.47237107157707214,
"learning_rate": 9.126403497170382e-05,
"loss": 4.7166,
"step": 75300
},
{
"epoch": 0.09,
"grad_norm": 0.5941634178161621,
"learning_rate": 9.12524334245215e-05,
"loss": 4.7158,
"step": 75400
},
{
"epoch": 0.09,
"grad_norm": 0.4129926562309265,
"learning_rate": 9.124083187733916e-05,
"loss": 4.7151,
"step": 75500
},
{
"epoch": 0.09,
"grad_norm": 0.5305066704750061,
"learning_rate": 9.122923033015683e-05,
"loss": 4.7151,
"step": 75600
},
{
"epoch": 0.09,
"grad_norm": 0.43299585580825806,
"learning_rate": 9.12176287829745e-05,
"loss": 4.716,
"step": 75700
},
{
"epoch": 0.09,
"grad_norm": 10.120293617248535,
"learning_rate": 9.120602723579217e-05,
"loss": 4.7183,
"step": 75800
},
{
"epoch": 0.09,
"grad_norm": 0.393725723028183,
"learning_rate": 9.119442568860984e-05,
"loss": 4.7173,
"step": 75900
},
{
"epoch": 0.09,
"grad_norm": 5.1044440269470215,
"learning_rate": 9.11828241414275e-05,
"loss": 4.7144,
"step": 76000
},
{
"epoch": 0.09,
"grad_norm": 0.38623175024986267,
"learning_rate": 9.117122259424517e-05,
"loss": 4.7165,
"step": 76100
},
{
"epoch": 0.09,
"grad_norm": 2.17921781539917,
"learning_rate": 9.115962104706284e-05,
"loss": 4.7171,
"step": 76200
},
{
"epoch": 0.09,
"grad_norm": 0.4206479489803314,
"learning_rate": 9.114801949988051e-05,
"loss": 4.7119,
"step": 76300
},
{
"epoch": 0.09,
"grad_norm": 0.46056804060935974,
"learning_rate": 9.113641795269817e-05,
"loss": 4.7129,
"step": 76400
},
{
"epoch": 0.09,
"grad_norm": 0.4047752320766449,
"learning_rate": 9.112481640551584e-05,
"loss": 4.7133,
"step": 76500
},
{
"epoch": 0.09,
"grad_norm": 0.4942108988761902,
"learning_rate": 9.111321485833351e-05,
"loss": 4.7143,
"step": 76600
},
{
"epoch": 0.09,
"grad_norm": 1.5642993450164795,
"learning_rate": 9.110161331115118e-05,
"loss": 4.7126,
"step": 76700
},
{
"epoch": 0.09,
"grad_norm": 0.4183765947818756,
"learning_rate": 9.109001176396885e-05,
"loss": 4.7138,
"step": 76800
},
{
"epoch": 0.09,
"grad_norm": 0.4382226765155792,
"learning_rate": 9.107841021678651e-05,
"loss": 4.7156,
"step": 76900
},
{
"epoch": 0.09,
"grad_norm": 0.429971843957901,
"learning_rate": 9.106680866960419e-05,
"loss": 4.7152,
"step": 77000
},
{
"epoch": 0.09,
"grad_norm": 0.4051077663898468,
"learning_rate": 9.105520712242185e-05,
"loss": 4.7146,
"step": 77100
},
{
"epoch": 0.09,
"grad_norm": 0.4416183829307556,
"learning_rate": 9.104360557523952e-05,
"loss": 4.716,
"step": 77200
},
{
"epoch": 0.09,
"grad_norm": 1.668931007385254,
"learning_rate": 9.103200402805718e-05,
"loss": 4.7116,
"step": 77300
},
{
"epoch": 0.09,
"grad_norm": 0.42298269271850586,
"learning_rate": 9.102040248087486e-05,
"loss": 4.7173,
"step": 77400
},
{
"epoch": 0.09,
"grad_norm": 0.7401153445243835,
"learning_rate": 9.100880093369252e-05,
"loss": 4.7148,
"step": 77500
},
{
"epoch": 0.09,
"grad_norm": 0.6153188943862915,
"learning_rate": 9.099719938651019e-05,
"loss": 4.7157,
"step": 77600
},
{
"epoch": 0.09,
"grad_norm": 0.41621410846710205,
"learning_rate": 9.098559783932786e-05,
"loss": 4.7169,
"step": 77700
},
{
"epoch": 0.09,
"grad_norm": 0.4147420823574066,
"learning_rate": 9.097399629214553e-05,
"loss": 4.7155,
"step": 77800
},
{
"epoch": 0.09,
"grad_norm": 8.179230690002441,
"learning_rate": 9.09623947449632e-05,
"loss": 4.7142,
"step": 77900
},
{
"epoch": 0.09,
"grad_norm": 0.8721107244491577,
"learning_rate": 9.095079319778086e-05,
"loss": 4.7145,
"step": 78000
},
{
"epoch": 0.09,
"grad_norm": 1.0575512647628784,
"learning_rate": 9.093919165059853e-05,
"loss": 4.7156,
"step": 78100
},
{
"epoch": 0.09,
"grad_norm": 0.7179790139198303,
"learning_rate": 9.09275901034162e-05,
"loss": 4.7145,
"step": 78200
},
{
"epoch": 0.09,
"grad_norm": 1.8817191123962402,
"learning_rate": 9.091598855623387e-05,
"loss": 4.7154,
"step": 78300
},
{
"epoch": 0.09,
"grad_norm": 0.47256988286972046,
"learning_rate": 9.090438700905152e-05,
"loss": 4.7176,
"step": 78400
},
{
"epoch": 0.09,
"grad_norm": 0.7659066319465637,
"learning_rate": 9.08927854618692e-05,
"loss": 4.7176,
"step": 78500
},
{
"epoch": 0.09,
"grad_norm": 0.4704316556453705,
"learning_rate": 9.088118391468686e-05,
"loss": 4.7173,
"step": 78600
},
{
"epoch": 0.09,
"grad_norm": 0.47578001022338867,
"learning_rate": 9.086958236750454e-05,
"loss": 4.7122,
"step": 78700
},
{
"epoch": 0.09,
"grad_norm": 3.0918359756469727,
"learning_rate": 9.085798082032219e-05,
"loss": 4.712,
"step": 78800
},
{
"epoch": 0.09,
"grad_norm": 0.712931752204895,
"learning_rate": 9.084637927313988e-05,
"loss": 4.7125,
"step": 78900
},
{
"epoch": 0.09,
"grad_norm": 0.39107823371887207,
"learning_rate": 9.083477772595755e-05,
"loss": 4.7133,
"step": 79000
},
{
"epoch": 0.09,
"grad_norm": 0.45276641845703125,
"learning_rate": 9.08231761787752e-05,
"loss": 4.7135,
"step": 79100
},
{
"epoch": 0.09,
"grad_norm": 0.4419955611228943,
"learning_rate": 9.081157463159287e-05,
"loss": 4.7133,
"step": 79200
},
{
"epoch": 0.09,
"grad_norm": 0.39920610189437866,
"learning_rate": 9.079997308441054e-05,
"loss": 4.7121,
"step": 79300
},
{
"epoch": 0.09,
"grad_norm": 0.8428456783294678,
"learning_rate": 9.078837153722821e-05,
"loss": 4.7135,
"step": 79400
},
{
"epoch": 0.09,
"grad_norm": 0.664378821849823,
"learning_rate": 9.077676999004587e-05,
"loss": 4.7165,
"step": 79500
},
{
"epoch": 0.09,
"grad_norm": 0.4127453863620758,
"learning_rate": 9.076516844286354e-05,
"loss": 4.7149,
"step": 79600
},
{
"epoch": 0.09,
"grad_norm": 1.8230032920837402,
"learning_rate": 9.075356689568121e-05,
"loss": 4.7123,
"step": 79700
},
{
"epoch": 0.09,
"grad_norm": 0.4852809011936188,
"learning_rate": 9.074196534849888e-05,
"loss": 4.7155,
"step": 79800
},
{
"epoch": 0.09,
"grad_norm": 10.022642135620117,
"learning_rate": 9.073036380131654e-05,
"loss": 4.7138,
"step": 79900
},
{
"epoch": 0.09,
"grad_norm": 0.4676394462585449,
"learning_rate": 9.071876225413421e-05,
"loss": 4.7148,
"step": 80000
},
{
"epoch": 0.09,
"grad_norm": 0.4943694770336151,
"learning_rate": 9.07071607069519e-05,
"loss": 4.7136,
"step": 80100
},
{
"epoch": 0.09,
"grad_norm": 0.45253655314445496,
"learning_rate": 9.069555915976955e-05,
"loss": 4.7134,
"step": 80200
},
{
"epoch": 0.09,
"grad_norm": 0.42640450596809387,
"learning_rate": 9.068395761258722e-05,
"loss": 4.7107,
"step": 80300
},
{
"epoch": 0.09,
"grad_norm": 0.409261018037796,
"learning_rate": 9.067235606540488e-05,
"loss": 4.7141,
"step": 80400
},
{
"epoch": 0.09,
"grad_norm": 0.9824286103248596,
"learning_rate": 9.066075451822256e-05,
"loss": 4.7141,
"step": 80500
},
{
"epoch": 0.09,
"grad_norm": 0.6486682891845703,
"learning_rate": 9.064915297104022e-05,
"loss": 4.7161,
"step": 80600
},
{
"epoch": 0.09,
"grad_norm": 0.587768018245697,
"learning_rate": 9.063755142385789e-05,
"loss": 4.712,
"step": 80700
},
{
"epoch": 0.09,
"grad_norm": 0.4047750234603882,
"learning_rate": 9.062594987667556e-05,
"loss": 4.7133,
"step": 80800
},
{
"epoch": 0.09,
"grad_norm": 0.6553632616996765,
"learning_rate": 9.061434832949323e-05,
"loss": 4.7139,
"step": 80900
},
{
"epoch": 0.09,
"grad_norm": 0.46661463379859924,
"learning_rate": 9.06027467823109e-05,
"loss": 4.7138,
"step": 81000
},
{
"epoch": 0.09,
"grad_norm": 0.44048547744750977,
"learning_rate": 9.059114523512856e-05,
"loss": 4.7159,
"step": 81100
},
{
"epoch": 0.09,
"grad_norm": 0.9623928666114807,
"learning_rate": 9.057954368794623e-05,
"loss": 4.7134,
"step": 81200
},
{
"epoch": 0.09,
"grad_norm": 0.40359175205230713,
"learning_rate": 9.05679421407639e-05,
"loss": 4.7147,
"step": 81300
},
{
"epoch": 0.09,
"grad_norm": 0.6080948114395142,
"learning_rate": 9.055634059358157e-05,
"loss": 4.7132,
"step": 81400
},
{
"epoch": 0.09,
"grad_norm": 2.992509126663208,
"learning_rate": 9.054473904639923e-05,
"loss": 4.7137,
"step": 81500
},
{
"epoch": 0.09,
"grad_norm": 0.4392452538013458,
"learning_rate": 9.05331374992169e-05,
"loss": 4.7122,
"step": 81600
},
{
"epoch": 0.09,
"grad_norm": 0.9614498019218445,
"learning_rate": 9.052153595203457e-05,
"loss": 4.71,
"step": 81700
},
{
"epoch": 0.09,
"grad_norm": 0.4181789755821228,
"learning_rate": 9.050993440485224e-05,
"loss": 4.7127,
"step": 81800
},
{
"epoch": 0.1,
"grad_norm": 1.0023537874221802,
"learning_rate": 9.04983328576699e-05,
"loss": 4.7144,
"step": 81900
},
{
"epoch": 0.1,
"grad_norm": 9.406643867492676,
"learning_rate": 9.048673131048758e-05,
"loss": 4.7134,
"step": 82000
},
{
"epoch": 0.1,
"grad_norm": 0.4215865433216095,
"learning_rate": 9.047512976330525e-05,
"loss": 4.7137,
"step": 82100
},
{
"epoch": 0.1,
"grad_norm": 0.4328692853450775,
"learning_rate": 9.04635282161229e-05,
"loss": 4.7125,
"step": 82200
},
{
"epoch": 0.1,
"grad_norm": 0.38356509804725647,
"learning_rate": 9.045192666894058e-05,
"loss": 4.7195,
"step": 82300
},
{
"epoch": 0.1,
"grad_norm": 0.4107106626033783,
"learning_rate": 9.044032512175825e-05,
"loss": 4.7097,
"step": 82400
},
{
"epoch": 0.1,
"grad_norm": 18.908138275146484,
"learning_rate": 9.042872357457592e-05,
"loss": 4.7187,
"step": 82500
},
{
"epoch": 0.1,
"grad_norm": 0.48430249094963074,
"learning_rate": 9.041712202739357e-05,
"loss": 4.7174,
"step": 82600
},
{
"epoch": 0.1,
"grad_norm": 0.4645005166530609,
"learning_rate": 9.040552048021124e-05,
"loss": 4.7146,
"step": 82700
},
{
"epoch": 0.1,
"grad_norm": 1.38825261592865,
"learning_rate": 9.039391893302891e-05,
"loss": 4.7159,
"step": 82800
},
{
"epoch": 0.1,
"grad_norm": 1.1587923765182495,
"learning_rate": 9.038231738584658e-05,
"loss": 4.7182,
"step": 82900
},
{
"epoch": 0.1,
"grad_norm": 0.42529210448265076,
"learning_rate": 9.037071583866424e-05,
"loss": 4.7118,
"step": 83000
},
{
"epoch": 0.1,
"grad_norm": 6.866576671600342,
"learning_rate": 9.035911429148191e-05,
"loss": 4.7124,
"step": 83100
},
{
"epoch": 0.1,
"grad_norm": 0.415056973695755,
"learning_rate": 9.03475127442996e-05,
"loss": 4.7136,
"step": 83200
},
{
"epoch": 0.1,
"grad_norm": 0.4472216069698334,
"learning_rate": 9.033591119711725e-05,
"loss": 4.71,
"step": 83300
},
{
"epoch": 0.1,
"grad_norm": 0.5460192561149597,
"learning_rate": 9.032430964993492e-05,
"loss": 4.7155,
"step": 83400
},
{
"epoch": 0.1,
"grad_norm": 1.7862334251403809,
"learning_rate": 9.031270810275258e-05,
"loss": 4.7135,
"step": 83500
},
{
"epoch": 0.1,
"grad_norm": 0.4369088411331177,
"learning_rate": 9.030110655557026e-05,
"loss": 4.7136,
"step": 83600
},
{
"epoch": 0.1,
"grad_norm": 0.40611496567726135,
"learning_rate": 9.028950500838792e-05,
"loss": 4.7144,
"step": 83700
},
{
"epoch": 0.1,
"grad_norm": 0.434088796377182,
"learning_rate": 9.027790346120559e-05,
"loss": 4.7161,
"step": 83800
},
{
"epoch": 0.1,
"grad_norm": 0.3754402995109558,
"learning_rate": 9.026630191402326e-05,
"loss": 4.7112,
"step": 83900
},
{
"epoch": 0.1,
"grad_norm": 0.507156491279602,
"learning_rate": 9.025470036684093e-05,
"loss": 4.7091,
"step": 84000
},
{
"epoch": 0.1,
"grad_norm": 0.43876662850379944,
"learning_rate": 9.024309881965859e-05,
"loss": 4.7107,
"step": 84100
},
{
"epoch": 0.1,
"grad_norm": 1.3095866441726685,
"learning_rate": 9.023149727247626e-05,
"loss": 4.7114,
"step": 84200
},
{
"epoch": 0.1,
"grad_norm": 1.327343225479126,
"learning_rate": 9.021989572529393e-05,
"loss": 4.713,
"step": 84300
},
{
"epoch": 0.1,
"grad_norm": 0.4155600965023041,
"learning_rate": 9.02082941781116e-05,
"loss": 4.7136,
"step": 84400
},
{
"epoch": 0.1,
"grad_norm": 0.4112358093261719,
"learning_rate": 9.019669263092927e-05,
"loss": 4.7096,
"step": 84500
},
{
"epoch": 0.1,
"grad_norm": 1.422731637954712,
"learning_rate": 9.018509108374693e-05,
"loss": 4.7132,
"step": 84600
},
{
"epoch": 0.1,
"grad_norm": 0.42826709151268005,
"learning_rate": 9.01734895365646e-05,
"loss": 4.7125,
"step": 84700
},
{
"epoch": 0.1,
"grad_norm": 0.4018416702747345,
"learning_rate": 9.016188798938227e-05,
"loss": 4.7123,
"step": 84800
},
{
"epoch": 0.1,
"grad_norm": 0.3829653561115265,
"learning_rate": 9.015028644219994e-05,
"loss": 4.7143,
"step": 84900
},
{
"epoch": 0.1,
"grad_norm": 0.45742008090019226,
"learning_rate": 9.01386848950176e-05,
"loss": 4.7114,
"step": 85000
},
{
"epoch": 0.1,
"grad_norm": 2.7736401557922363,
"learning_rate": 9.012708334783528e-05,
"loss": 4.7138,
"step": 85100
},
{
"epoch": 0.1,
"grad_norm": 0.40791991353034973,
"learning_rate": 9.011548180065294e-05,
"loss": 4.7116,
"step": 85200
},
{
"epoch": 0.1,
"grad_norm": 0.40508711338043213,
"learning_rate": 9.01038802534706e-05,
"loss": 4.7109,
"step": 85300
},
{
"epoch": 0.1,
"grad_norm": 0.41179603338241577,
"learning_rate": 9.009227870628828e-05,
"loss": 4.7147,
"step": 85400
},
{
"epoch": 0.1,
"grad_norm": 0.49404746294021606,
"learning_rate": 9.008067715910595e-05,
"loss": 4.714,
"step": 85500
},
{
"epoch": 0.1,
"grad_norm": 0.4049994647502899,
"learning_rate": 9.006907561192362e-05,
"loss": 4.7131,
"step": 85600
},
{
"epoch": 0.1,
"grad_norm": 0.3943819999694824,
"learning_rate": 9.005747406474127e-05,
"loss": 4.7099,
"step": 85700
},
{
"epoch": 0.1,
"grad_norm": 0.3848678171634674,
"learning_rate": 9.004587251755894e-05,
"loss": 4.7105,
"step": 85800
},
{
"epoch": 0.1,
"grad_norm": 0.39097893238067627,
"learning_rate": 9.003427097037662e-05,
"loss": 4.7134,
"step": 85900
},
{
"epoch": 0.1,
"grad_norm": 0.5381259918212891,
"learning_rate": 9.002266942319429e-05,
"loss": 4.7111,
"step": 86000
},
{
"epoch": 0.1,
"grad_norm": 0.44293051958084106,
"learning_rate": 9.001106787601194e-05,
"loss": 4.7132,
"step": 86100
},
{
"epoch": 0.1,
"grad_norm": 2.449153423309326,
"learning_rate": 8.999946632882961e-05,
"loss": 4.7108,
"step": 86200
},
{
"epoch": 0.1,
"grad_norm": 1.677003026008606,
"learning_rate": 8.998786478164728e-05,
"loss": 4.7117,
"step": 86300
},
{
"epoch": 0.1,
"grad_norm": 0.4721989035606384,
"learning_rate": 8.997626323446495e-05,
"loss": 4.7084,
"step": 86400
},
{
"epoch": 0.1,
"grad_norm": 0.4921315908432007,
"learning_rate": 8.996466168728262e-05,
"loss": 4.7157,
"step": 86500
},
{
"epoch": 0.1,
"grad_norm": 0.6939030885696411,
"learning_rate": 8.995306014010028e-05,
"loss": 4.7114,
"step": 86600
},
{
"epoch": 0.1,
"grad_norm": 1.3602544069290161,
"learning_rate": 8.994145859291797e-05,
"loss": 4.7116,
"step": 86700
},
{
"epoch": 0.1,
"grad_norm": 0.40210890769958496,
"learning_rate": 8.992985704573562e-05,
"loss": 4.7132,
"step": 86800
},
{
"epoch": 0.1,
"grad_norm": 0.41714897751808167,
"learning_rate": 8.991825549855329e-05,
"loss": 4.713,
"step": 86900
},
{
"epoch": 0.1,
"grad_norm": 3.1149511337280273,
"learning_rate": 8.990665395137096e-05,
"loss": 4.709,
"step": 87000
},
{
"epoch": 0.1,
"grad_norm": 1.3356086015701294,
"learning_rate": 8.989505240418863e-05,
"loss": 4.7102,
"step": 87100
},
{
"epoch": 0.1,
"grad_norm": 0.36242830753326416,
"learning_rate": 8.988345085700629e-05,
"loss": 4.7091,
"step": 87200
},
{
"epoch": 0.1,
"grad_norm": 0.45975425839424133,
"learning_rate": 8.987184930982396e-05,
"loss": 4.7098,
"step": 87300
},
{
"epoch": 0.1,
"grad_norm": 0.38699838519096375,
"learning_rate": 8.986024776264163e-05,
"loss": 4.7113,
"step": 87400
},
{
"epoch": 0.1,
"grad_norm": 2.0604562759399414,
"learning_rate": 8.98486462154593e-05,
"loss": 4.7108,
"step": 87500
},
{
"epoch": 0.1,
"grad_norm": 0.39834097027778625,
"learning_rate": 8.983704466827697e-05,
"loss": 4.7131,
"step": 87600
},
{
"epoch": 0.1,
"grad_norm": 0.40465226769447327,
"learning_rate": 8.982544312109463e-05,
"loss": 4.7101,
"step": 87700
},
{
"epoch": 0.1,
"grad_norm": 0.43859535455703735,
"learning_rate": 8.98138415739123e-05,
"loss": 4.7118,
"step": 87800
},
{
"epoch": 0.1,
"grad_norm": 0.5537000894546509,
"learning_rate": 8.980224002672997e-05,
"loss": 4.7106,
"step": 87900
},
{
"epoch": 0.1,
"grad_norm": 0.5020445585250854,
"learning_rate": 8.979063847954764e-05,
"loss": 4.7094,
"step": 88000
},
{
"epoch": 0.1,
"grad_norm": 0.419494092464447,
"learning_rate": 8.97790369323653e-05,
"loss": 4.7111,
"step": 88100
},
{
"epoch": 0.1,
"grad_norm": 0.4324280023574829,
"learning_rate": 8.976743538518298e-05,
"loss": 4.7151,
"step": 88200
},
{
"epoch": 0.1,
"grad_norm": 0.410915732383728,
"learning_rate": 8.975583383800064e-05,
"loss": 4.7126,
"step": 88300
},
{
"epoch": 0.1,
"grad_norm": 0.4660918414592743,
"learning_rate": 8.974423229081831e-05,
"loss": 4.7097,
"step": 88400
},
{
"epoch": 0.1,
"grad_norm": 1.4695265293121338,
"learning_rate": 8.973263074363596e-05,
"loss": 4.7094,
"step": 88500
},
{
"epoch": 0.1,
"grad_norm": 0.7962493896484375,
"learning_rate": 8.972102919645365e-05,
"loss": 4.7106,
"step": 88600
},
{
"epoch": 0.1,
"grad_norm": 0.41975751519203186,
"learning_rate": 8.970942764927132e-05,
"loss": 4.7082,
"step": 88700
},
{
"epoch": 0.1,
"grad_norm": 2.06545090675354,
"learning_rate": 8.969782610208898e-05,
"loss": 4.7115,
"step": 88800
},
{
"epoch": 0.1,
"grad_norm": 0.5522103309631348,
"learning_rate": 8.968622455490665e-05,
"loss": 4.7116,
"step": 88900
},
{
"epoch": 0.1,
"grad_norm": 0.4133060574531555,
"learning_rate": 8.967462300772432e-05,
"loss": 4.7117,
"step": 89000
},
{
"epoch": 0.1,
"grad_norm": 0.6902609467506409,
"learning_rate": 8.966302146054199e-05,
"loss": 4.7114,
"step": 89100
},
{
"epoch": 0.1,
"grad_norm": 0.39645135402679443,
"learning_rate": 8.965141991335964e-05,
"loss": 4.7075,
"step": 89200
},
{
"epoch": 0.1,
"grad_norm": 0.702682614326477,
"learning_rate": 8.963981836617731e-05,
"loss": 4.7132,
"step": 89300
},
{
"epoch": 0.1,
"grad_norm": 0.4847519099712372,
"learning_rate": 8.962821681899499e-05,
"loss": 4.7107,
"step": 89400
},
{
"epoch": 0.1,
"grad_norm": 0.4338213801383972,
"learning_rate": 8.961661527181266e-05,
"loss": 4.7112,
"step": 89500
},
{
"epoch": 0.1,
"grad_norm": 0.5931529402732849,
"learning_rate": 8.960501372463031e-05,
"loss": 4.713,
"step": 89600
},
{
"epoch": 0.1,
"grad_norm": 0.4164227545261383,
"learning_rate": 8.959341217744798e-05,
"loss": 4.7124,
"step": 89700
},
{
"epoch": 0.1,
"grad_norm": 0.41698023676872253,
"learning_rate": 8.958181063026567e-05,
"loss": 4.7098,
"step": 89800
},
{
"epoch": 0.1,
"grad_norm": 0.45001962780952454,
"learning_rate": 8.957020908308332e-05,
"loss": 4.7099,
"step": 89900
},
{
"epoch": 0.1,
"grad_norm": 0.5672646164894104,
"learning_rate": 8.9558607535901e-05,
"loss": 4.7101,
"step": 90000
},
{
"epoch": 0.1,
"grad_norm": 0.4138546586036682,
"learning_rate": 8.954700598871865e-05,
"loss": 4.7114,
"step": 90100
},
{
"epoch": 0.1,
"grad_norm": 0.5982836484909058,
"learning_rate": 8.953540444153633e-05,
"loss": 4.7077,
"step": 90200
},
{
"epoch": 0.1,
"grad_norm": 0.39376482367515564,
"learning_rate": 8.952380289435399e-05,
"loss": 4.713,
"step": 90300
},
{
"epoch": 0.1,
"grad_norm": 0.39758920669555664,
"learning_rate": 8.951220134717166e-05,
"loss": 4.7105,
"step": 90400
},
{
"epoch": 0.1,
"grad_norm": 1.4163142442703247,
"learning_rate": 8.950059979998933e-05,
"loss": 4.7071,
"step": 90500
},
{
"epoch": 0.11,
"grad_norm": 0.4065834581851959,
"learning_rate": 8.9488998252807e-05,
"loss": 4.7136,
"step": 90600
},
{
"epoch": 0.11,
"grad_norm": 0.4078928530216217,
"learning_rate": 8.947739670562466e-05,
"loss": 4.7081,
"step": 90700
},
{
"epoch": 0.11,
"grad_norm": 2.143505096435547,
"learning_rate": 8.946579515844233e-05,
"loss": 4.7119,
"step": 90800
},
{
"epoch": 0.11,
"grad_norm": 0.4692690074443817,
"learning_rate": 8.945419361126e-05,
"loss": 4.71,
"step": 90900
},
{
"epoch": 0.11,
"grad_norm": 0.4491908848285675,
"learning_rate": 8.944259206407767e-05,
"loss": 4.709,
"step": 91000
},
{
"epoch": 0.11,
"grad_norm": 0.4015776515007019,
"learning_rate": 8.943099051689534e-05,
"loss": 4.7133,
"step": 91100
},
{
"epoch": 0.11,
"grad_norm": 12.54626178741455,
"learning_rate": 8.9419388969713e-05,
"loss": 4.7115,
"step": 91200
},
{
"epoch": 0.11,
"grad_norm": 0.4521196484565735,
"learning_rate": 8.940778742253068e-05,
"loss": 4.7098,
"step": 91300
},
{
"epoch": 0.11,
"grad_norm": 0.40918394923210144,
"learning_rate": 8.939618587534834e-05,
"loss": 4.7064,
"step": 91400
},
{
"epoch": 0.11,
"grad_norm": 0.7197582721710205,
"learning_rate": 8.938458432816601e-05,
"loss": 4.7083,
"step": 91500
},
{
"epoch": 0.11,
"grad_norm": 0.4415791630744934,
"learning_rate": 8.937298278098367e-05,
"loss": 4.7121,
"step": 91600
},
{
"epoch": 0.11,
"grad_norm": 0.37231072783470154,
"learning_rate": 8.936138123380135e-05,
"loss": 4.7076,
"step": 91700
},
{
"epoch": 0.11,
"grad_norm": 0.5812079906463623,
"learning_rate": 8.934977968661902e-05,
"loss": 4.7103,
"step": 91800
},
{
"epoch": 0.11,
"grad_norm": 0.42401689291000366,
"learning_rate": 8.933817813943668e-05,
"loss": 4.7087,
"step": 91900
},
{
"epoch": 0.11,
"grad_norm": 0.433369904756546,
"learning_rate": 8.932657659225435e-05,
"loss": 4.7102,
"step": 92000
},
{
"epoch": 0.11,
"grad_norm": 0.4144749045372009,
"learning_rate": 8.931497504507202e-05,
"loss": 4.711,
"step": 92100
},
{
"epoch": 0.11,
"grad_norm": 0.3901808559894562,
"learning_rate": 8.930337349788969e-05,
"loss": 4.712,
"step": 92200
},
{
"epoch": 0.11,
"grad_norm": 0.41925275325775146,
"learning_rate": 8.929177195070735e-05,
"loss": 4.7123,
"step": 92300
},
{
"epoch": 0.11,
"grad_norm": 0.44110241532325745,
"learning_rate": 8.928017040352502e-05,
"loss": 4.7107,
"step": 92400
},
{
"epoch": 0.11,
"grad_norm": 0.3761931359767914,
"learning_rate": 8.926856885634269e-05,
"loss": 4.7067,
"step": 92500
},
{
"epoch": 0.11,
"grad_norm": 0.37371134757995605,
"learning_rate": 8.925696730916036e-05,
"loss": 4.7076,
"step": 92600
},
{
"epoch": 0.11,
"grad_norm": 2.6820991039276123,
"learning_rate": 8.924536576197801e-05,
"loss": 4.7101,
"step": 92700
},
{
"epoch": 0.11,
"grad_norm": 8.034706115722656,
"learning_rate": 8.923376421479568e-05,
"loss": 4.7052,
"step": 92800
},
{
"epoch": 0.11,
"grad_norm": 0.6186811327934265,
"learning_rate": 8.922216266761337e-05,
"loss": 4.7108,
"step": 92900
},
{
"epoch": 0.11,
"grad_norm": 16.127689361572266,
"learning_rate": 8.921056112043103e-05,
"loss": 4.7109,
"step": 93000
},
{
"epoch": 0.11,
"grad_norm": 0.4097209572792053,
"learning_rate": 8.91989595732487e-05,
"loss": 4.7107,
"step": 93100
},
{
"epoch": 0.11,
"grad_norm": 0.4645543098449707,
"learning_rate": 8.918735802606635e-05,
"loss": 4.706,
"step": 93200
},
{
"epoch": 0.11,
"grad_norm": 0.4279698431491852,
"learning_rate": 8.917575647888404e-05,
"loss": 4.7081,
"step": 93300
},
{
"epoch": 0.11,
"grad_norm": 0.3847067654132843,
"learning_rate": 8.91641549317017e-05,
"loss": 4.7104,
"step": 93400
},
{
"epoch": 0.11,
"grad_norm": 0.9405023455619812,
"learning_rate": 8.915255338451936e-05,
"loss": 4.7077,
"step": 93500
},
{
"epoch": 0.11,
"grad_norm": 0.4204134941101074,
"learning_rate": 8.914095183733703e-05,
"loss": 4.7102,
"step": 93600
},
{
"epoch": 0.11,
"grad_norm": 0.411493718624115,
"learning_rate": 8.91293502901547e-05,
"loss": 4.7128,
"step": 93700
},
{
"epoch": 0.11,
"grad_norm": 0.4645240604877472,
"learning_rate": 8.911774874297236e-05,
"loss": 4.708,
"step": 93800
},
{
"epoch": 0.11,
"grad_norm": 8.295918464660645,
"learning_rate": 8.910614719579003e-05,
"loss": 4.7087,
"step": 93900
},
{
"epoch": 0.11,
"grad_norm": 0.5342881083488464,
"learning_rate": 8.90945456486077e-05,
"loss": 4.7076,
"step": 94000
},
{
"epoch": 0.11,
"grad_norm": 0.41708967089653015,
"learning_rate": 8.908294410142537e-05,
"loss": 4.7096,
"step": 94100
},
{
"epoch": 0.11,
"grad_norm": 0.50910484790802,
"learning_rate": 8.907134255424304e-05,
"loss": 4.7074,
"step": 94200
},
{
"epoch": 0.11,
"grad_norm": 0.8689035773277283,
"learning_rate": 8.90597410070607e-05,
"loss": 4.7084,
"step": 94300
},
{
"epoch": 0.11,
"grad_norm": 1.477888584136963,
"learning_rate": 8.904813945987837e-05,
"loss": 4.7089,
"step": 94400
},
{
"epoch": 0.11,
"grad_norm": 0.3866770565509796,
"learning_rate": 8.903653791269604e-05,
"loss": 4.71,
"step": 94500
},
{
"epoch": 0.11,
"grad_norm": 0.4197094440460205,
"learning_rate": 8.902493636551371e-05,
"loss": 4.7062,
"step": 94600
},
{
"epoch": 0.11,
"grad_norm": 0.4031221866607666,
"learning_rate": 8.901333481833137e-05,
"loss": 4.7105,
"step": 94700
},
{
"epoch": 0.11,
"grad_norm": 0.4098550081253052,
"learning_rate": 8.900173327114905e-05,
"loss": 4.7099,
"step": 94800
},
{
"epoch": 0.11,
"grad_norm": 0.37702327966690063,
"learning_rate": 8.899013172396671e-05,
"loss": 4.7073,
"step": 94900
},
{
"epoch": 0.11,
"grad_norm": 0.4298543632030487,
"learning_rate": 8.897853017678438e-05,
"loss": 4.7092,
"step": 95000
},
{
"epoch": 0.11,
"grad_norm": 0.49779465794563293,
"learning_rate": 8.896692862960205e-05,
"loss": 4.7059,
"step": 95100
},
{
"epoch": 0.11,
"grad_norm": 0.3974045515060425,
"learning_rate": 8.895532708241972e-05,
"loss": 4.7116,
"step": 95200
},
{
"epoch": 0.11,
"grad_norm": 4.801963806152344,
"learning_rate": 8.894372553523739e-05,
"loss": 4.7103,
"step": 95300
},
{
"epoch": 0.11,
"grad_norm": 0.6826298236846924,
"learning_rate": 8.893212398805505e-05,
"loss": 4.7124,
"step": 95400
},
{
"epoch": 0.11,
"grad_norm": 0.8550306558609009,
"learning_rate": 8.892052244087272e-05,
"loss": 4.7078,
"step": 95500
},
{
"epoch": 0.11,
"grad_norm": 0.4076642394065857,
"learning_rate": 8.890892089369039e-05,
"loss": 4.7065,
"step": 95600
},
{
"epoch": 0.11,
"grad_norm": 0.4062858521938324,
"learning_rate": 8.889731934650806e-05,
"loss": 4.7086,
"step": 95700
},
{
"epoch": 0.11,
"grad_norm": 2.432161808013916,
"learning_rate": 8.888571779932572e-05,
"loss": 4.7049,
"step": 95800
},
{
"epoch": 0.11,
"grad_norm": 0.41037145256996155,
"learning_rate": 8.887411625214339e-05,
"loss": 4.7042,
"step": 95900
},
{
"epoch": 0.11,
"grad_norm": 0.3852427005767822,
"learning_rate": 8.886251470496106e-05,
"loss": 4.7051,
"step": 96000
},
{
"epoch": 0.11,
"grad_norm": 0.43618643283843994,
"learning_rate": 8.885091315777873e-05,
"loss": 4.7092,
"step": 96100
},
{
"epoch": 0.11,
"grad_norm": 0.4727887511253357,
"learning_rate": 8.88393116105964e-05,
"loss": 4.7076,
"step": 96200
},
{
"epoch": 0.11,
"grad_norm": 0.4139232337474823,
"learning_rate": 8.882771006341405e-05,
"loss": 4.7078,
"step": 96300
},
{
"epoch": 0.11,
"grad_norm": 2.8521788120269775,
"learning_rate": 8.881610851623174e-05,
"loss": 4.7086,
"step": 96400
},
{
"epoch": 0.11,
"grad_norm": 0.43892329931259155,
"learning_rate": 8.88045069690494e-05,
"loss": 4.7067,
"step": 96500
},
{
"epoch": 0.11,
"grad_norm": 0.40643131732940674,
"learning_rate": 8.879290542186707e-05,
"loss": 4.7078,
"step": 96600
},
{
"epoch": 0.11,
"grad_norm": 0.44639840722084045,
"learning_rate": 8.878130387468474e-05,
"loss": 4.7092,
"step": 96700
},
{
"epoch": 0.11,
"grad_norm": 10.743888854980469,
"learning_rate": 8.87697023275024e-05,
"loss": 4.7048,
"step": 96800
},
{
"epoch": 0.11,
"grad_norm": 0.39078572392463684,
"learning_rate": 8.875810078032006e-05,
"loss": 4.7015,
"step": 96900
},
{
"epoch": 0.11,
"grad_norm": 0.4048124849796295,
"learning_rate": 8.874649923313773e-05,
"loss": 4.7126,
"step": 97000
},
{
"epoch": 0.11,
"grad_norm": 0.5833559632301331,
"learning_rate": 8.87348976859554e-05,
"loss": 4.7057,
"step": 97100
},
{
"epoch": 0.11,
"grad_norm": 0.4627319276332855,
"learning_rate": 8.872329613877307e-05,
"loss": 4.7091,
"step": 97200
},
{
"epoch": 0.11,
"grad_norm": 0.460225909948349,
"learning_rate": 8.871169459159074e-05,
"loss": 4.7079,
"step": 97300
},
{
"epoch": 0.11,
"grad_norm": 0.44441863894462585,
"learning_rate": 8.87000930444084e-05,
"loss": 4.7039,
"step": 97400
},
{
"epoch": 0.11,
"grad_norm": 0.40614771842956543,
"learning_rate": 8.868849149722607e-05,
"loss": 4.7067,
"step": 97500
},
{
"epoch": 0.11,
"grad_norm": 0.48935455083847046,
"learning_rate": 8.867688995004374e-05,
"loss": 4.7072,
"step": 97600
},
{
"epoch": 0.11,
"grad_norm": 1.503010869026184,
"learning_rate": 8.866528840286141e-05,
"loss": 4.7051,
"step": 97700
},
{
"epoch": 0.11,
"grad_norm": 0.40835824608802795,
"learning_rate": 8.865368685567907e-05,
"loss": 4.7085,
"step": 97800
},
{
"epoch": 0.11,
"grad_norm": 0.47787487506866455,
"learning_rate": 8.864208530849675e-05,
"loss": 4.7042,
"step": 97900
},
{
"epoch": 0.11,
"grad_norm": 0.44062966108322144,
"learning_rate": 8.863048376131441e-05,
"loss": 4.7088,
"step": 98000
},
{
"epoch": 0.11,
"grad_norm": 0.5162871479988098,
"learning_rate": 8.861888221413208e-05,
"loss": 4.705,
"step": 98100
},
{
"epoch": 0.11,
"grad_norm": 0.7115480899810791,
"learning_rate": 8.860728066694974e-05,
"loss": 4.7069,
"step": 98200
},
{
"epoch": 0.11,
"grad_norm": 0.43299001455307007,
"learning_rate": 8.859567911976742e-05,
"loss": 4.7037,
"step": 98300
},
{
"epoch": 0.11,
"grad_norm": 0.5156663656234741,
"learning_rate": 8.858407757258509e-05,
"loss": 4.7057,
"step": 98400
},
{
"epoch": 0.11,
"grad_norm": 1.3159596920013428,
"learning_rate": 8.857247602540275e-05,
"loss": 4.705,
"step": 98500
},
{
"epoch": 0.11,
"grad_norm": 5.239011287689209,
"learning_rate": 8.856087447822042e-05,
"loss": 4.7104,
"step": 98600
},
{
"epoch": 0.11,
"grad_norm": 0.5143164992332458,
"learning_rate": 8.854927293103809e-05,
"loss": 4.7075,
"step": 98700
},
{
"epoch": 0.11,
"grad_norm": 0.4091811180114746,
"learning_rate": 8.853767138385576e-05,
"loss": 4.706,
"step": 98800
},
{
"epoch": 0.11,
"grad_norm": 0.3682583272457123,
"learning_rate": 8.852606983667342e-05,
"loss": 4.7049,
"step": 98900
},
{
"epoch": 0.11,
"grad_norm": 1.9574017524719238,
"learning_rate": 8.851446828949109e-05,
"loss": 4.7085,
"step": 99000
},
{
"epoch": 0.11,
"grad_norm": 0.4863421320915222,
"learning_rate": 8.850286674230876e-05,
"loss": 4.7042,
"step": 99100
},
{
"epoch": 0.12,
"grad_norm": 0.41971755027770996,
"learning_rate": 8.849126519512643e-05,
"loss": 4.7046,
"step": 99200
},
{
"epoch": 0.12,
"grad_norm": 0.45720914006233215,
"learning_rate": 8.847966364794409e-05,
"loss": 4.705,
"step": 99300
},
{
"epoch": 0.12,
"grad_norm": 0.3879556953907013,
"learning_rate": 8.846806210076176e-05,
"loss": 4.7051,
"step": 99400
},
{
"epoch": 0.12,
"grad_norm": 0.6103081703186035,
"learning_rate": 8.845646055357944e-05,
"loss": 4.7018,
"step": 99500
},
{
"epoch": 0.12,
"grad_norm": 0.4066198468208313,
"learning_rate": 8.84448590063971e-05,
"loss": 4.7039,
"step": 99600
},
{
"epoch": 0.12,
"grad_norm": 0.568277895450592,
"learning_rate": 8.843325745921477e-05,
"loss": 4.7046,
"step": 99700
},
{
"epoch": 0.12,
"grad_norm": 0.41322973370552063,
"learning_rate": 8.842165591203244e-05,
"loss": 4.7051,
"step": 99800
},
{
"epoch": 0.12,
"grad_norm": 3.952580690383911,
"learning_rate": 8.841005436485011e-05,
"loss": 4.7017,
"step": 99900
},
{
"epoch": 0.12,
"grad_norm": 0.40354979038238525,
"learning_rate": 8.839845281766776e-05,
"loss": 4.7076,
"step": 100000
},
{
"epoch": 0.12,
"grad_norm": 0.44302529096603394,
"learning_rate": 8.838685127048543e-05,
"loss": 4.7071,
"step": 100100
},
{
"epoch": 0.12,
"grad_norm": 0.5012074112892151,
"learning_rate": 8.83752497233031e-05,
"loss": 4.7024,
"step": 100200
},
{
"epoch": 0.12,
"grad_norm": 0.4387538433074951,
"learning_rate": 8.836364817612078e-05,
"loss": 4.7019,
"step": 100300
},
{
"epoch": 0.12,
"grad_norm": 4.33089017868042,
"learning_rate": 8.835204662893843e-05,
"loss": 4.7034,
"step": 100400
},
{
"epoch": 0.12,
"grad_norm": 0.6666650176048279,
"learning_rate": 8.83404450817561e-05,
"loss": 4.7006,
"step": 100500
},
{
"epoch": 0.12,
"grad_norm": 0.43695616722106934,
"learning_rate": 8.832884353457377e-05,
"loss": 4.7028,
"step": 100600
},
{
"epoch": 0.12,
"grad_norm": 0.39040979743003845,
"learning_rate": 8.831724198739144e-05,
"loss": 4.703,
"step": 100700
},
{
"epoch": 0.12,
"grad_norm": 0.4081096053123474,
"learning_rate": 8.830564044020911e-05,
"loss": 4.7062,
"step": 100800
},
{
"epoch": 0.12,
"grad_norm": 0.4371529519557953,
"learning_rate": 8.829403889302677e-05,
"loss": 4.7027,
"step": 100900
},
{
"epoch": 0.12,
"grad_norm": 0.4820528030395508,
"learning_rate": 8.828243734584446e-05,
"loss": 4.7026,
"step": 101000
},
{
"epoch": 0.12,
"grad_norm": 0.4896353781223297,
"learning_rate": 8.827083579866211e-05,
"loss": 4.7052,
"step": 101100
},
{
"epoch": 0.12,
"grad_norm": 0.4941895604133606,
"learning_rate": 8.825923425147978e-05,
"loss": 4.7065,
"step": 101200
},
{
"epoch": 0.12,
"grad_norm": 0.4064702093601227,
"learning_rate": 8.824763270429744e-05,
"loss": 4.7032,
"step": 101300
},
{
"epoch": 0.12,
"grad_norm": 0.7341260313987732,
"learning_rate": 8.823603115711512e-05,
"loss": 4.7029,
"step": 101400
},
{
"epoch": 0.12,
"grad_norm": 0.41835737228393555,
"learning_rate": 8.822442960993278e-05,
"loss": 4.7012,
"step": 101500
},
{
"epoch": 0.12,
"grad_norm": 0.4256590008735657,
"learning_rate": 8.821282806275045e-05,
"loss": 4.7038,
"step": 101600
},
{
"epoch": 0.12,
"grad_norm": 0.43846938014030457,
"learning_rate": 8.820122651556812e-05,
"loss": 4.7014,
"step": 101700
},
{
"epoch": 0.12,
"grad_norm": 0.4327394962310791,
"learning_rate": 8.818962496838579e-05,
"loss": 4.7022,
"step": 101800
},
{
"epoch": 0.12,
"grad_norm": 0.42558759450912476,
"learning_rate": 8.817802342120346e-05,
"loss": 4.7001,
"step": 101900
},
{
"epoch": 0.12,
"grad_norm": 0.41578829288482666,
"learning_rate": 8.816642187402112e-05,
"loss": 4.6996,
"step": 102000
},
{
"epoch": 0.12,
"grad_norm": 1.1245514154434204,
"learning_rate": 8.815482032683879e-05,
"loss": 4.7055,
"step": 102100
},
{
"epoch": 0.12,
"grad_norm": 0.43621131777763367,
"learning_rate": 8.814321877965646e-05,
"loss": 4.7011,
"step": 102200
},
{
"epoch": 0.12,
"grad_norm": 0.555548906326294,
"learning_rate": 8.813161723247413e-05,
"loss": 4.7026,
"step": 102300
},
{
"epoch": 0.12,
"grad_norm": 0.5187997221946716,
"learning_rate": 8.812001568529179e-05,
"loss": 4.7046,
"step": 102400
},
{
"epoch": 0.12,
"grad_norm": 0.8702124953269958,
"learning_rate": 8.810841413810946e-05,
"loss": 4.7009,
"step": 102500
},
{
"epoch": 0.12,
"grad_norm": 0.4865017533302307,
"learning_rate": 8.809681259092714e-05,
"loss": 4.7012,
"step": 102600
},
{
"epoch": 0.12,
"grad_norm": 0.4582134783267975,
"learning_rate": 8.80852110437448e-05,
"loss": 4.7024,
"step": 102700
},
{
"epoch": 0.12,
"grad_norm": 0.48400819301605225,
"learning_rate": 8.807360949656247e-05,
"loss": 4.7009,
"step": 102800
},
{
"epoch": 0.12,
"grad_norm": 0.4129534363746643,
"learning_rate": 8.806200794938013e-05,
"loss": 4.6953,
"step": 102900
},
{
"epoch": 0.12,
"grad_norm": 0.5257245302200317,
"learning_rate": 8.805040640219781e-05,
"loss": 4.6998,
"step": 103000
},
{
"epoch": 0.12,
"grad_norm": 0.6813905239105225,
"learning_rate": 8.803880485501547e-05,
"loss": 4.7037,
"step": 103100
},
{
"epoch": 0.12,
"grad_norm": 0.6715384721755981,
"learning_rate": 8.802720330783314e-05,
"loss": 4.7011,
"step": 103200
},
{
"epoch": 0.12,
"grad_norm": 0.41265252232551575,
"learning_rate": 8.801560176065081e-05,
"loss": 4.7019,
"step": 103300
},
{
"epoch": 0.12,
"grad_norm": 0.37935546040534973,
"learning_rate": 8.800400021346848e-05,
"loss": 4.6995,
"step": 103400
},
{
"epoch": 0.12,
"grad_norm": 2.578775644302368,
"learning_rate": 8.799239866628613e-05,
"loss": 4.7025,
"step": 103500
},
{
"epoch": 0.12,
"grad_norm": 0.5321478843688965,
"learning_rate": 8.79807971191038e-05,
"loss": 4.6983,
"step": 103600
},
{
"epoch": 0.12,
"grad_norm": 0.4280731976032257,
"learning_rate": 8.796919557192148e-05,
"loss": 4.6947,
"step": 103700
},
{
"epoch": 0.12,
"grad_norm": 0.559546709060669,
"learning_rate": 8.795759402473915e-05,
"loss": 4.7027,
"step": 103800
},
{
"epoch": 0.12,
"grad_norm": 0.39881765842437744,
"learning_rate": 8.794599247755682e-05,
"loss": 4.7024,
"step": 103900
},
{
"epoch": 0.12,
"grad_norm": 0.5303432941436768,
"learning_rate": 8.793439093037447e-05,
"loss": 4.6984,
"step": 104000
},
{
"epoch": 0.12,
"grad_norm": 0.4243880808353424,
"learning_rate": 8.792278938319216e-05,
"loss": 4.7011,
"step": 104100
},
{
"epoch": 0.12,
"grad_norm": 1.8046822547912598,
"learning_rate": 8.791118783600981e-05,
"loss": 4.703,
"step": 104200
},
{
"epoch": 0.12,
"grad_norm": 0.45566943287849426,
"learning_rate": 8.789958628882748e-05,
"loss": 4.702,
"step": 104300
},
{
"epoch": 0.12,
"grad_norm": 4.0667338371276855,
"learning_rate": 8.788798474164514e-05,
"loss": 4.6984,
"step": 104400
},
{
"epoch": 0.12,
"grad_norm": 0.4209063947200775,
"learning_rate": 8.787638319446282e-05,
"loss": 4.6985,
"step": 104500
},
{
"epoch": 0.12,
"grad_norm": 0.41932806372642517,
"learning_rate": 8.786478164728048e-05,
"loss": 4.6987,
"step": 104600
},
{
"epoch": 0.12,
"grad_norm": 3.0371077060699463,
"learning_rate": 8.785318010009815e-05,
"loss": 4.7016,
"step": 104700
},
{
"epoch": 0.12,
"grad_norm": 0.5310882925987244,
"learning_rate": 8.784157855291582e-05,
"loss": 4.6996,
"step": 104800
},
{
"epoch": 0.12,
"grad_norm": 0.43061837553977966,
"learning_rate": 8.782997700573349e-05,
"loss": 4.6987,
"step": 104900
},
{
"epoch": 0.12,
"grad_norm": 0.5050943493843079,
"learning_rate": 8.781837545855116e-05,
"loss": 4.7012,
"step": 105000
},
{
"epoch": 0.12,
"grad_norm": 0.39062759280204773,
"learning_rate": 8.780677391136882e-05,
"loss": 4.6966,
"step": 105100
},
{
"epoch": 0.12,
"grad_norm": 4.528038024902344,
"learning_rate": 8.779517236418649e-05,
"loss": 4.6964,
"step": 105200
},
{
"epoch": 0.12,
"grad_norm": 0.3973850905895233,
"learning_rate": 8.778357081700416e-05,
"loss": 4.6989,
"step": 105300
},
{
"epoch": 0.12,
"grad_norm": 0.42976003885269165,
"learning_rate": 8.777196926982183e-05,
"loss": 4.6996,
"step": 105400
},
{
"epoch": 0.12,
"grad_norm": 0.43507105112075806,
"learning_rate": 8.776036772263949e-05,
"loss": 4.698,
"step": 105500
},
{
"epoch": 0.12,
"grad_norm": 0.4329013526439667,
"learning_rate": 8.774876617545716e-05,
"loss": 4.6967,
"step": 105600
},
{
"epoch": 0.12,
"grad_norm": 0.40144628286361694,
"learning_rate": 8.773716462827483e-05,
"loss": 4.6944,
"step": 105700
},
{
"epoch": 0.12,
"grad_norm": 0.5410645008087158,
"learning_rate": 8.77255630810925e-05,
"loss": 4.6962,
"step": 105800
},
{
"epoch": 0.12,
"grad_norm": 0.709266722202301,
"learning_rate": 8.771396153391017e-05,
"loss": 4.696,
"step": 105900
},
{
"epoch": 0.12,
"grad_norm": 0.4841790199279785,
"learning_rate": 8.770235998672783e-05,
"loss": 4.7003,
"step": 106000
},
{
"epoch": 0.12,
"grad_norm": 0.48105648159980774,
"learning_rate": 8.769075843954551e-05,
"loss": 4.6925,
"step": 106100
},
{
"epoch": 0.12,
"grad_norm": 0.4711418151855469,
"learning_rate": 8.767915689236317e-05,
"loss": 4.6967,
"step": 106200
},
{
"epoch": 0.12,
"grad_norm": 0.4943852126598358,
"learning_rate": 8.766755534518084e-05,
"loss": 4.6958,
"step": 106300
},
{
"epoch": 0.12,
"grad_norm": 1.1731679439544678,
"learning_rate": 8.765595379799851e-05,
"loss": 4.697,
"step": 106400
},
{
"epoch": 0.12,
"grad_norm": 2.6942670345306396,
"learning_rate": 8.764435225081618e-05,
"loss": 4.6988,
"step": 106500
},
{
"epoch": 0.12,
"grad_norm": 0.5932111144065857,
"learning_rate": 8.763275070363384e-05,
"loss": 4.7008,
"step": 106600
},
{
"epoch": 0.12,
"grad_norm": 0.498844176530838,
"learning_rate": 8.76211491564515e-05,
"loss": 4.6966,
"step": 106700
},
{
"epoch": 0.12,
"grad_norm": 0.525398850440979,
"learning_rate": 8.760954760926918e-05,
"loss": 4.6992,
"step": 106800
},
{
"epoch": 0.12,
"grad_norm": 2.405345916748047,
"learning_rate": 8.759794606208685e-05,
"loss": 4.6968,
"step": 106900
},
{
"epoch": 0.12,
"grad_norm": 0.4208034873008728,
"learning_rate": 8.758634451490452e-05,
"loss": 4.6919,
"step": 107000
},
{
"epoch": 0.12,
"grad_norm": 0.3912876844406128,
"learning_rate": 8.757474296772217e-05,
"loss": 4.6921,
"step": 107100
},
{
"epoch": 0.12,
"grad_norm": 0.4323633909225464,
"learning_rate": 8.756314142053984e-05,
"loss": 4.6966,
"step": 107200
},
{
"epoch": 0.12,
"grad_norm": 0.39893805980682373,
"learning_rate": 8.755153987335752e-05,
"loss": 4.6928,
"step": 107300
},
{
"epoch": 0.12,
"grad_norm": 0.42258456349372864,
"learning_rate": 8.753993832617519e-05,
"loss": 4.6955,
"step": 107400
},
{
"epoch": 0.12,
"grad_norm": 0.40447258949279785,
"learning_rate": 8.752833677899284e-05,
"loss": 4.6972,
"step": 107500
},
{
"epoch": 0.12,
"grad_norm": 0.5002463459968567,
"learning_rate": 8.751673523181053e-05,
"loss": 4.6977,
"step": 107600
},
{
"epoch": 0.12,
"grad_norm": 5.072608470916748,
"learning_rate": 8.750513368462818e-05,
"loss": 4.6992,
"step": 107700
},
{
"epoch": 0.13,
"grad_norm": 0.4288334250450134,
"learning_rate": 8.749353213744585e-05,
"loss": 4.6984,
"step": 107800
},
{
"epoch": 0.13,
"grad_norm": 0.529216468334198,
"learning_rate": 8.748193059026351e-05,
"loss": 4.698,
"step": 107900
},
{
"epoch": 0.13,
"grad_norm": 0.42044612765312195,
"learning_rate": 8.74703290430812e-05,
"loss": 4.6957,
"step": 108000
},
{
"epoch": 0.13,
"grad_norm": 0.8280035853385925,
"learning_rate": 8.745872749589886e-05,
"loss": 4.6916,
"step": 108100
},
{
"epoch": 0.13,
"grad_norm": 0.41592320799827576,
"learning_rate": 8.744712594871652e-05,
"loss": 4.6946,
"step": 108200
},
{
"epoch": 0.13,
"grad_norm": 0.425014466047287,
"learning_rate": 8.743552440153419e-05,
"loss": 4.6955,
"step": 108300
},
{
"epoch": 0.13,
"grad_norm": 0.40150442719459534,
"learning_rate": 8.742392285435186e-05,
"loss": 4.6946,
"step": 108400
},
{
"epoch": 0.13,
"grad_norm": 0.441785991191864,
"learning_rate": 8.741232130716953e-05,
"loss": 4.6908,
"step": 108500
},
{
"epoch": 0.13,
"grad_norm": 1.035128116607666,
"learning_rate": 8.740071975998719e-05,
"loss": 4.6926,
"step": 108600
},
{
"epoch": 0.13,
"grad_norm": 0.4290466606616974,
"learning_rate": 8.738911821280486e-05,
"loss": 4.6926,
"step": 108700
},
{
"epoch": 0.13,
"grad_norm": 0.4321587085723877,
"learning_rate": 8.737751666562253e-05,
"loss": 4.6937,
"step": 108800
},
{
"epoch": 0.13,
"grad_norm": 0.40536269545555115,
"learning_rate": 8.73659151184402e-05,
"loss": 4.69,
"step": 108900
},
{
"epoch": 0.13,
"grad_norm": 0.3988741934299469,
"learning_rate": 8.735431357125786e-05,
"loss": 4.6935,
"step": 109000
},
{
"epoch": 0.13,
"grad_norm": 0.8648415803909302,
"learning_rate": 8.734271202407553e-05,
"loss": 4.6978,
"step": 109100
},
{
"epoch": 0.13,
"grad_norm": 0.6086846590042114,
"learning_rate": 8.733111047689321e-05,
"loss": 4.6929,
"step": 109200
},
{
"epoch": 0.13,
"grad_norm": 0.38623344898223877,
"learning_rate": 8.731950892971087e-05,
"loss": 4.697,
"step": 109300
},
{
"epoch": 0.13,
"grad_norm": 0.7714725732803345,
"learning_rate": 8.730790738252854e-05,
"loss": 4.6926,
"step": 109400
},
{
"epoch": 0.13,
"grad_norm": 0.37906432151794434,
"learning_rate": 8.729630583534621e-05,
"loss": 4.6927,
"step": 109500
},
{
"epoch": 0.13,
"grad_norm": 0.4156550168991089,
"learning_rate": 8.728470428816388e-05,
"loss": 4.6953,
"step": 109600
},
{
"epoch": 0.13,
"grad_norm": 0.39191532135009766,
"learning_rate": 8.727310274098154e-05,
"loss": 4.6935,
"step": 109700
},
{
"epoch": 0.13,
"grad_norm": 0.4391324818134308,
"learning_rate": 8.726150119379921e-05,
"loss": 4.6939,
"step": 109800
},
{
"epoch": 0.13,
"grad_norm": 0.42475369572639465,
"learning_rate": 8.724989964661688e-05,
"loss": 4.6903,
"step": 109900
},
{
"epoch": 0.13,
"grad_norm": 0.4388252794742584,
"learning_rate": 8.723829809943455e-05,
"loss": 4.6914,
"step": 110000
},
{
"epoch": 0.13,
"grad_norm": 0.4022296667098999,
"learning_rate": 8.72266965522522e-05,
"loss": 4.6948,
"step": 110100
},
{
"epoch": 0.13,
"grad_norm": 0.4492380917072296,
"learning_rate": 8.721509500506988e-05,
"loss": 4.6911,
"step": 110200
},
{
"epoch": 0.13,
"grad_norm": 0.4049302637577057,
"learning_rate": 8.720349345788755e-05,
"loss": 4.6902,
"step": 110300
},
{
"epoch": 0.13,
"grad_norm": 1.9303175210952759,
"learning_rate": 8.719189191070522e-05,
"loss": 4.6892,
"step": 110400
},
{
"epoch": 0.13,
"grad_norm": 0.48216354846954346,
"learning_rate": 8.718029036352289e-05,
"loss": 4.6882,
"step": 110500
},
{
"epoch": 0.13,
"grad_norm": 1.4418731927871704,
"learning_rate": 8.716868881634054e-05,
"loss": 4.6893,
"step": 110600
},
{
"epoch": 0.13,
"grad_norm": 0.5436264276504517,
"learning_rate": 8.715708726915823e-05,
"loss": 4.6892,
"step": 110700
},
{
"epoch": 0.13,
"grad_norm": 0.3900034427642822,
"learning_rate": 8.714548572197588e-05,
"loss": 4.6862,
"step": 110800
},
{
"epoch": 0.13,
"grad_norm": 0.4286581575870514,
"learning_rate": 8.713388417479356e-05,
"loss": 4.6902,
"step": 110900
},
{
"epoch": 0.13,
"grad_norm": 2.8317441940307617,
"learning_rate": 8.712228262761121e-05,
"loss": 4.692,
"step": 111000
},
{
"epoch": 0.13,
"grad_norm": 0.4669002890586853,
"learning_rate": 8.71106810804289e-05,
"loss": 4.6933,
"step": 111100
},
{
"epoch": 0.13,
"grad_norm": 12.876310348510742,
"learning_rate": 8.709907953324655e-05,
"loss": 4.6933,
"step": 111200
},
{
"epoch": 0.13,
"grad_norm": 0.4043096601963043,
"learning_rate": 8.708747798606422e-05,
"loss": 4.6905,
"step": 111300
},
{
"epoch": 0.13,
"grad_norm": 0.6544508934020996,
"learning_rate": 8.70758764388819e-05,
"loss": 4.6862,
"step": 111400
},
{
"epoch": 0.13,
"grad_norm": 0.4368617534637451,
"learning_rate": 8.706427489169956e-05,
"loss": 4.6938,
"step": 111500
},
{
"epoch": 0.13,
"grad_norm": 0.8235392570495605,
"learning_rate": 8.705267334451723e-05,
"loss": 4.6894,
"step": 111600
},
{
"epoch": 0.13,
"grad_norm": 0.44141313433647156,
"learning_rate": 8.704107179733489e-05,
"loss": 4.6899,
"step": 111700
},
{
"epoch": 0.13,
"grad_norm": 0.4662952423095703,
"learning_rate": 8.702947025015256e-05,
"loss": 4.6862,
"step": 111800
},
{
"epoch": 0.13,
"grad_norm": 0.5402454137802124,
"learning_rate": 8.701786870297023e-05,
"loss": 4.6885,
"step": 111900
},
{
"epoch": 0.13,
"grad_norm": 0.5668596029281616,
"learning_rate": 8.70062671557879e-05,
"loss": 4.6842,
"step": 112000
},
{
"epoch": 0.13,
"grad_norm": 0.586438000202179,
"learning_rate": 8.699466560860556e-05,
"loss": 4.6893,
"step": 112100
},
{
"epoch": 0.13,
"grad_norm": 0.4443139135837555,
"learning_rate": 8.698306406142323e-05,
"loss": 4.6914,
"step": 112200
},
{
"epoch": 0.13,
"grad_norm": 0.4333912432193756,
"learning_rate": 8.69714625142409e-05,
"loss": 4.691,
"step": 112300
},
{
"epoch": 0.13,
"grad_norm": 0.5807641744613647,
"learning_rate": 8.695986096705857e-05,
"loss": 4.6935,
"step": 112400
},
{
"epoch": 0.13,
"grad_norm": 0.3854735493659973,
"learning_rate": 8.694825941987624e-05,
"loss": 4.6889,
"step": 112500
},
{
"epoch": 0.13,
"grad_norm": 0.40090426802635193,
"learning_rate": 8.693665787269391e-05,
"loss": 4.6874,
"step": 112600
},
{
"epoch": 0.13,
"grad_norm": 0.40820324420928955,
"learning_rate": 8.692505632551158e-05,
"loss": 4.6902,
"step": 112700
},
{
"epoch": 0.13,
"grad_norm": 1.9625461101531982,
"learning_rate": 8.691345477832924e-05,
"loss": 4.6876,
"step": 112800
},
{
"epoch": 0.13,
"grad_norm": 0.39590463042259216,
"learning_rate": 8.690185323114691e-05,
"loss": 4.6895,
"step": 112900
},
{
"epoch": 0.13,
"grad_norm": 0.4090369641780853,
"learning_rate": 8.689025168396458e-05,
"loss": 4.6921,
"step": 113000
},
{
"epoch": 0.13,
"grad_norm": 0.48455721139907837,
"learning_rate": 8.687865013678225e-05,
"loss": 4.686,
"step": 113100
},
{
"epoch": 0.13,
"grad_norm": 0.5107011795043945,
"learning_rate": 8.686704858959991e-05,
"loss": 4.6863,
"step": 113200
},
{
"epoch": 0.13,
"grad_norm": 0.4489879608154297,
"learning_rate": 8.685544704241758e-05,
"loss": 4.689,
"step": 113300
},
{
"epoch": 0.13,
"grad_norm": 0.7847305536270142,
"learning_rate": 8.684384549523525e-05,
"loss": 4.687,
"step": 113400
},
{
"epoch": 0.13,
"grad_norm": 0.4807884693145752,
"learning_rate": 8.683224394805292e-05,
"loss": 4.6844,
"step": 113500
},
{
"epoch": 0.13,
"grad_norm": 0.4227710962295532,
"learning_rate": 8.682064240087059e-05,
"loss": 4.6882,
"step": 113600
},
{
"epoch": 0.13,
"grad_norm": 0.425329327583313,
"learning_rate": 8.680904085368825e-05,
"loss": 4.6876,
"step": 113700
},
{
"epoch": 0.13,
"grad_norm": 0.39754214882850647,
"learning_rate": 8.679743930650593e-05,
"loss": 4.6928,
"step": 113800
},
{
"epoch": 0.13,
"grad_norm": 0.41271573305130005,
"learning_rate": 8.678583775932359e-05,
"loss": 4.6871,
"step": 113900
},
{
"epoch": 0.13,
"grad_norm": 0.6075764298439026,
"learning_rate": 8.677423621214126e-05,
"loss": 4.6846,
"step": 114000
},
{
"epoch": 0.13,
"grad_norm": 0.4607229232788086,
"learning_rate": 8.676263466495891e-05,
"loss": 4.6907,
"step": 114100
},
{
"epoch": 0.13,
"grad_norm": 0.49409857392311096,
"learning_rate": 8.67510331177766e-05,
"loss": 4.6835,
"step": 114200
},
{
"epoch": 0.13,
"grad_norm": 1.1049710512161255,
"learning_rate": 8.673943157059425e-05,
"loss": 4.6851,
"step": 114300
},
{
"epoch": 0.13,
"grad_norm": 0.4263964593410492,
"learning_rate": 8.672783002341192e-05,
"loss": 4.6881,
"step": 114400
},
{
"epoch": 0.13,
"grad_norm": 1.2732186317443848,
"learning_rate": 8.67162284762296e-05,
"loss": 4.6919,
"step": 114500
},
{
"epoch": 0.13,
"grad_norm": 2.2248380184173584,
"learning_rate": 8.670462692904727e-05,
"loss": 4.6864,
"step": 114600
},
{
"epoch": 0.13,
"grad_norm": 1.1274431943893433,
"learning_rate": 8.669302538186494e-05,
"loss": 4.6905,
"step": 114700
},
{
"epoch": 0.13,
"grad_norm": 0.41689029335975647,
"learning_rate": 8.668142383468259e-05,
"loss": 4.6888,
"step": 114800
},
{
"epoch": 0.13,
"grad_norm": 0.39221587777137756,
"learning_rate": 8.666982228750026e-05,
"loss": 4.6836,
"step": 114900
},
{
"epoch": 0.13,
"grad_norm": 0.4654403626918793,
"learning_rate": 8.665822074031793e-05,
"loss": 4.6869,
"step": 115000
},
{
"epoch": 0.13,
"grad_norm": 0.5727940201759338,
"learning_rate": 8.66466191931356e-05,
"loss": 4.6911,
"step": 115100
},
{
"epoch": 0.13,
"grad_norm": 0.3905259966850281,
"learning_rate": 8.663501764595326e-05,
"loss": 4.6837,
"step": 115200
},
{
"epoch": 0.13,
"grad_norm": 1.094152808189392,
"learning_rate": 8.662341609877093e-05,
"loss": 4.6847,
"step": 115300
},
{
"epoch": 0.13,
"grad_norm": 0.39949262142181396,
"learning_rate": 8.66118145515886e-05,
"loss": 4.6877,
"step": 115400
},
{
"epoch": 0.13,
"grad_norm": 0.3685074746608734,
"learning_rate": 8.660021300440627e-05,
"loss": 4.6849,
"step": 115500
},
{
"epoch": 0.13,
"grad_norm": 0.41748788952827454,
"learning_rate": 8.658861145722394e-05,
"loss": 4.6861,
"step": 115600
},
{
"epoch": 0.13,
"grad_norm": 5.671760082244873,
"learning_rate": 8.657700991004161e-05,
"loss": 4.6914,
"step": 115700
},
{
"epoch": 0.13,
"grad_norm": 0.40038684010505676,
"learning_rate": 8.656540836285928e-05,
"loss": 4.6923,
"step": 115800
},
{
"epoch": 0.13,
"grad_norm": 0.4368140995502472,
"learning_rate": 8.655380681567694e-05,
"loss": 4.6875,
"step": 115900
},
{
"epoch": 0.13,
"grad_norm": 0.4060142934322357,
"learning_rate": 8.654220526849461e-05,
"loss": 4.6899,
"step": 116000
},
{
"epoch": 0.13,
"grad_norm": 0.4354085922241211,
"learning_rate": 8.653060372131228e-05,
"loss": 4.6814,
"step": 116100
},
{
"epoch": 0.13,
"grad_norm": 0.6066785454750061,
"learning_rate": 8.651900217412995e-05,
"loss": 4.6823,
"step": 116200
},
{
"epoch": 0.13,
"grad_norm": 0.35917678475379944,
"learning_rate": 8.650740062694761e-05,
"loss": 4.6882,
"step": 116300
},
{
"epoch": 0.14,
"grad_norm": 0.41072890162467957,
"learning_rate": 8.649579907976528e-05,
"loss": 4.6804,
"step": 116400
},
{
"epoch": 0.14,
"grad_norm": 0.494043231010437,
"learning_rate": 8.648419753258295e-05,
"loss": 4.6811,
"step": 116500
},
{
"epoch": 0.14,
"grad_norm": 0.4103025197982788,
"learning_rate": 8.647259598540062e-05,
"loss": 4.682,
"step": 116600
},
{
"epoch": 0.14,
"grad_norm": 0.5634316802024841,
"learning_rate": 8.646099443821829e-05,
"loss": 4.6873,
"step": 116700
},
{
"epoch": 0.14,
"grad_norm": 0.4846033453941345,
"learning_rate": 8.644939289103595e-05,
"loss": 4.6852,
"step": 116800
},
{
"epoch": 0.14,
"grad_norm": 0.8417425751686096,
"learning_rate": 8.643779134385363e-05,
"loss": 4.6835,
"step": 116900
},
{
"epoch": 0.14,
"grad_norm": 0.9649463295936584,
"learning_rate": 8.642618979667129e-05,
"loss": 4.6895,
"step": 117000
},
{
"epoch": 0.14,
"grad_norm": 0.4397778809070587,
"learning_rate": 8.641458824948896e-05,
"loss": 4.6847,
"step": 117100
},
{
"epoch": 0.14,
"grad_norm": 0.40810275077819824,
"learning_rate": 8.640298670230662e-05,
"loss": 4.6849,
"step": 117200
},
{
"epoch": 0.14,
"grad_norm": 0.39354971051216125,
"learning_rate": 8.63913851551243e-05,
"loss": 4.687,
"step": 117300
},
{
"epoch": 0.14,
"grad_norm": 0.6670671701431274,
"learning_rate": 8.637978360794196e-05,
"loss": 4.6858,
"step": 117400
},
{
"epoch": 0.14,
"grad_norm": 0.8771277070045471,
"learning_rate": 8.636818206075963e-05,
"loss": 4.6842,
"step": 117500
},
{
"epoch": 0.14,
"grad_norm": 0.6346202492713928,
"learning_rate": 8.635658051357728e-05,
"loss": 4.6821,
"step": 117600
},
{
"epoch": 0.14,
"grad_norm": 0.3756822347640991,
"learning_rate": 8.634497896639497e-05,
"loss": 4.6852,
"step": 117700
},
{
"epoch": 0.14,
"grad_norm": 0.809264063835144,
"learning_rate": 8.633337741921264e-05,
"loss": 4.6809,
"step": 117800
},
{
"epoch": 0.14,
"grad_norm": 0.4695824980735779,
"learning_rate": 8.63217758720303e-05,
"loss": 4.6829,
"step": 117900
},
{
"epoch": 0.14,
"grad_norm": 0.3882189095020294,
"learning_rate": 8.631017432484797e-05,
"loss": 4.6824,
"step": 118000
},
{
"epoch": 0.14,
"grad_norm": 0.46574699878692627,
"learning_rate": 8.629857277766564e-05,
"loss": 4.6873,
"step": 118100
},
{
"epoch": 0.14,
"grad_norm": 1.8306362628936768,
"learning_rate": 8.62869712304833e-05,
"loss": 4.6865,
"step": 118200
},
{
"epoch": 0.14,
"grad_norm": 0.5190596580505371,
"learning_rate": 8.627536968330096e-05,
"loss": 4.6839,
"step": 118300
},
{
"epoch": 0.14,
"grad_norm": 0.5554775595664978,
"learning_rate": 8.626376813611863e-05,
"loss": 4.6833,
"step": 118400
},
{
"epoch": 0.14,
"grad_norm": 0.5029674768447876,
"learning_rate": 8.62521665889363e-05,
"loss": 4.6858,
"step": 118500
},
{
"epoch": 0.14,
"grad_norm": 0.5281274914741516,
"learning_rate": 8.624056504175397e-05,
"loss": 4.6801,
"step": 118600
},
{
"epoch": 0.14,
"grad_norm": 0.4034444987773895,
"learning_rate": 8.622896349457163e-05,
"loss": 4.6797,
"step": 118700
},
{
"epoch": 0.14,
"grad_norm": 0.4782603085041046,
"learning_rate": 8.62173619473893e-05,
"loss": 4.6808,
"step": 118800
},
{
"epoch": 0.14,
"grad_norm": 0.505681574344635,
"learning_rate": 8.620576040020699e-05,
"loss": 4.6805,
"step": 118900
},
{
"epoch": 0.14,
"grad_norm": 0.45551398396492004,
"learning_rate": 8.619415885302464e-05,
"loss": 4.681,
"step": 119000
},
{
"epoch": 0.14,
"grad_norm": 0.46791940927505493,
"learning_rate": 8.618255730584231e-05,
"loss": 4.6804,
"step": 119100
},
{
"epoch": 0.14,
"grad_norm": 0.40453869104385376,
"learning_rate": 8.617095575865998e-05,
"loss": 4.6849,
"step": 119200
},
{
"epoch": 0.14,
"grad_norm": 0.5323479771614075,
"learning_rate": 8.615935421147765e-05,
"loss": 4.6856,
"step": 119300
},
{
"epoch": 0.14,
"grad_norm": 0.3924371600151062,
"learning_rate": 8.614775266429531e-05,
"loss": 4.6811,
"step": 119400
},
{
"epoch": 0.14,
"grad_norm": 0.4504171311855316,
"learning_rate": 8.613615111711298e-05,
"loss": 4.6807,
"step": 119500
},
{
"epoch": 0.14,
"grad_norm": 1.3027788400650024,
"learning_rate": 8.612454956993065e-05,
"loss": 4.6847,
"step": 119600
},
{
"epoch": 0.14,
"grad_norm": 0.5537983179092407,
"learning_rate": 8.611294802274832e-05,
"loss": 4.681,
"step": 119700
},
{
"epoch": 0.14,
"grad_norm": 0.3960654139518738,
"learning_rate": 8.610134647556598e-05,
"loss": 4.686,
"step": 119800
},
{
"epoch": 0.14,
"grad_norm": 0.4028005003929138,
"learning_rate": 8.608974492838365e-05,
"loss": 4.6777,
"step": 119900
},
{
"epoch": 0.14,
"grad_norm": 0.42540809512138367,
"learning_rate": 8.607814338120133e-05,
"loss": 4.6831,
"step": 120000
},
{
"epoch": 0.14,
"grad_norm": 1.0470972061157227,
"learning_rate": 8.606654183401899e-05,
"loss": 4.6818,
"step": 120100
},
{
"epoch": 0.14,
"grad_norm": 0.4977876842021942,
"learning_rate": 8.605494028683666e-05,
"loss": 4.6789,
"step": 120200
},
{
"epoch": 0.14,
"grad_norm": 0.5205646753311157,
"learning_rate": 8.604333873965432e-05,
"loss": 4.6806,
"step": 120300
},
{
"epoch": 0.14,
"grad_norm": 0.3748117983341217,
"learning_rate": 8.6031737192472e-05,
"loss": 4.6847,
"step": 120400
},
{
"epoch": 0.14,
"grad_norm": 0.4421583414077759,
"learning_rate": 8.602013564528966e-05,
"loss": 4.6772,
"step": 120500
},
{
"epoch": 0.14,
"grad_norm": 0.596479594707489,
"learning_rate": 8.600853409810733e-05,
"loss": 4.6808,
"step": 120600
},
{
"epoch": 0.14,
"grad_norm": 1.599827527999878,
"learning_rate": 8.599693255092498e-05,
"loss": 4.6842,
"step": 120700
},
{
"epoch": 0.14,
"grad_norm": 0.47555598616600037,
"learning_rate": 8.598533100374267e-05,
"loss": 4.6842,
"step": 120800
},
{
"epoch": 0.14,
"grad_norm": 0.5468947887420654,
"learning_rate": 8.597372945656033e-05,
"loss": 4.6863,
"step": 120900
},
{
"epoch": 0.14,
"grad_norm": 0.4049210548400879,
"learning_rate": 8.5962127909378e-05,
"loss": 4.6802,
"step": 121000
},
{
"epoch": 0.14,
"grad_norm": 0.39633214473724365,
"learning_rate": 8.595052636219567e-05,
"loss": 4.681,
"step": 121100
},
{
"epoch": 0.14,
"grad_norm": 0.3883484899997711,
"learning_rate": 8.593892481501334e-05,
"loss": 4.6846,
"step": 121200
},
{
"epoch": 0.14,
"grad_norm": 0.4120100736618042,
"learning_rate": 8.592732326783101e-05,
"loss": 4.6829,
"step": 121300
},
{
"epoch": 0.14,
"grad_norm": 0.554779052734375,
"learning_rate": 8.591572172064866e-05,
"loss": 4.6828,
"step": 121400
},
{
"epoch": 0.14,
"grad_norm": 1.2129675149917603,
"learning_rate": 8.590412017346633e-05,
"loss": 4.6802,
"step": 121500
},
{
"epoch": 0.14,
"grad_norm": 0.413712739944458,
"learning_rate": 8.5892518626284e-05,
"loss": 4.6826,
"step": 121600
},
{
"epoch": 0.14,
"grad_norm": 1.532698392868042,
"learning_rate": 8.588091707910168e-05,
"loss": 4.6837,
"step": 121700
},
{
"epoch": 0.14,
"grad_norm": 0.3998155891895294,
"learning_rate": 8.586931553191933e-05,
"loss": 4.6829,
"step": 121800
},
{
"epoch": 0.14,
"grad_norm": 0.41272029280662537,
"learning_rate": 8.5857713984737e-05,
"loss": 4.6809,
"step": 121900
},
{
"epoch": 0.14,
"grad_norm": 0.40928176045417786,
"learning_rate": 8.584611243755467e-05,
"loss": 4.6827,
"step": 122000
},
{
"epoch": 0.14,
"grad_norm": 0.6598598957061768,
"learning_rate": 8.583451089037234e-05,
"loss": 4.6869,
"step": 122100
},
{
"epoch": 0.14,
"grad_norm": 0.47744888067245483,
"learning_rate": 8.582290934319001e-05,
"loss": 4.6834,
"step": 122200
},
{
"epoch": 0.14,
"grad_norm": 0.42328187823295593,
"learning_rate": 8.581130779600768e-05,
"loss": 4.6823,
"step": 122300
},
{
"epoch": 0.14,
"grad_norm": 0.5083212852478027,
"learning_rate": 8.579970624882535e-05,
"loss": 4.6799,
"step": 122400
},
{
"epoch": 0.14,
"grad_norm": 0.41937893629074097,
"learning_rate": 8.578810470164301e-05,
"loss": 4.6823,
"step": 122500
},
{
"epoch": 0.14,
"grad_norm": 0.46155425906181335,
"learning_rate": 8.577650315446068e-05,
"loss": 4.6847,
"step": 122600
},
{
"epoch": 0.14,
"grad_norm": 0.4063146710395813,
"learning_rate": 8.576490160727835e-05,
"loss": 4.6777,
"step": 122700
},
{
"epoch": 0.14,
"grad_norm": 0.9644930362701416,
"learning_rate": 8.575330006009602e-05,
"loss": 4.6816,
"step": 122800
},
{
"epoch": 0.14,
"grad_norm": 0.4703579246997833,
"learning_rate": 8.574169851291368e-05,
"loss": 4.6841,
"step": 122900
},
{
"epoch": 0.14,
"grad_norm": 0.3799445629119873,
"learning_rate": 8.573009696573135e-05,
"loss": 4.6874,
"step": 123000
},
{
"epoch": 0.14,
"grad_norm": 1.223569393157959,
"learning_rate": 8.571849541854902e-05,
"loss": 4.685,
"step": 123100
},
{
"epoch": 0.14,
"grad_norm": 1.3866465091705322,
"learning_rate": 8.570689387136669e-05,
"loss": 4.6814,
"step": 123200
},
{
"epoch": 0.14,
"grad_norm": 0.8391069769859314,
"learning_rate": 8.569529232418436e-05,
"loss": 4.676,
"step": 123300
},
{
"epoch": 0.14,
"grad_norm": 0.41561365127563477,
"learning_rate": 8.568369077700202e-05,
"loss": 4.6841,
"step": 123400
},
{
"epoch": 0.14,
"grad_norm": 1.2650662660598755,
"learning_rate": 8.56720892298197e-05,
"loss": 4.6828,
"step": 123500
},
{
"epoch": 0.14,
"grad_norm": 0.6244620084762573,
"learning_rate": 8.566048768263736e-05,
"loss": 4.6762,
"step": 123600
},
{
"epoch": 0.14,
"grad_norm": 0.36179304122924805,
"learning_rate": 8.564888613545503e-05,
"loss": 4.6765,
"step": 123700
},
{
"epoch": 0.14,
"grad_norm": 0.4196653366088867,
"learning_rate": 8.563728458827269e-05,
"loss": 4.6831,
"step": 123800
},
{
"epoch": 0.14,
"grad_norm": 0.3799944818019867,
"learning_rate": 8.562568304109037e-05,
"loss": 4.6783,
"step": 123900
},
{
"epoch": 0.14,
"grad_norm": 0.47710663080215454,
"learning_rate": 8.561408149390803e-05,
"loss": 4.681,
"step": 124000
},
{
"epoch": 0.14,
"grad_norm": 0.38043680787086487,
"learning_rate": 8.56024799467257e-05,
"loss": 4.6831,
"step": 124100
},
{
"epoch": 0.14,
"grad_norm": 3.3860080242156982,
"learning_rate": 8.559087839954337e-05,
"loss": 4.6771,
"step": 124200
},
{
"epoch": 0.14,
"grad_norm": 0.5481556057929993,
"learning_rate": 8.557927685236104e-05,
"loss": 4.6815,
"step": 124300
},
{
"epoch": 0.14,
"grad_norm": 0.5027362704277039,
"learning_rate": 8.556767530517871e-05,
"loss": 4.6813,
"step": 124400
},
{
"epoch": 0.14,
"grad_norm": 15.900703430175781,
"learning_rate": 8.555607375799637e-05,
"loss": 4.6881,
"step": 124500
},
{
"epoch": 0.14,
"grad_norm": 0.3703874349594116,
"learning_rate": 8.554447221081404e-05,
"loss": 4.6873,
"step": 124600
},
{
"epoch": 0.14,
"grad_norm": 0.3847333490848541,
"learning_rate": 8.55328706636317e-05,
"loss": 4.6814,
"step": 124700
},
{
"epoch": 0.14,
"grad_norm": 0.6109323501586914,
"learning_rate": 8.552126911644938e-05,
"loss": 4.6812,
"step": 124800
},
{
"epoch": 0.14,
"grad_norm": 0.39739909768104553,
"learning_rate": 8.550966756926703e-05,
"loss": 4.6791,
"step": 124900
},
{
"epoch": 0.15,
"grad_norm": 0.4672738313674927,
"learning_rate": 8.54980660220847e-05,
"loss": 4.682,
"step": 125000
},
{
"epoch": 0.15,
"grad_norm": 0.3909046947956085,
"learning_rate": 8.548646447490237e-05,
"loss": 4.676,
"step": 125100
},
{
"epoch": 0.15,
"grad_norm": 0.7728373408317566,
"learning_rate": 8.547486292772005e-05,
"loss": 4.6798,
"step": 125200
},
{
"epoch": 0.15,
"grad_norm": 0.6358431577682495,
"learning_rate": 8.546326138053772e-05,
"loss": 4.682,
"step": 125300
},
{
"epoch": 0.15,
"grad_norm": 0.8379632234573364,
"learning_rate": 8.545165983335539e-05,
"loss": 4.6806,
"step": 125400
},
{
"epoch": 0.15,
"grad_norm": 0.4461159110069275,
"learning_rate": 8.544005828617306e-05,
"loss": 4.6822,
"step": 125500
},
{
"epoch": 0.15,
"grad_norm": 1.537218689918518,
"learning_rate": 8.542845673899071e-05,
"loss": 4.6802,
"step": 125600
},
{
"epoch": 0.15,
"grad_norm": 0.43007928133010864,
"learning_rate": 8.541685519180838e-05,
"loss": 4.6779,
"step": 125700
},
{
"epoch": 0.15,
"grad_norm": 0.5105913281440735,
"learning_rate": 8.540525364462605e-05,
"loss": 4.6817,
"step": 125800
},
{
"epoch": 0.15,
"grad_norm": 0.5022956728935242,
"learning_rate": 8.539365209744372e-05,
"loss": 4.678,
"step": 125900
},
{
"epoch": 0.15,
"grad_norm": 0.45969635248184204,
"learning_rate": 8.538205055026138e-05,
"loss": 4.6779,
"step": 126000
},
{
"epoch": 0.15,
"grad_norm": 0.8507632613182068,
"learning_rate": 8.537044900307905e-05,
"loss": 4.6747,
"step": 126100
},
{
"epoch": 0.15,
"grad_norm": 0.4339176118373871,
"learning_rate": 8.535884745589672e-05,
"loss": 4.6818,
"step": 126200
},
{
"epoch": 0.15,
"grad_norm": 0.814734160900116,
"learning_rate": 8.534724590871439e-05,
"loss": 4.6786,
"step": 126300
},
{
"epoch": 0.15,
"grad_norm": 0.4143752455711365,
"learning_rate": 8.533564436153206e-05,
"loss": 4.6783,
"step": 126400
},
{
"epoch": 0.15,
"grad_norm": 0.44699016213417053,
"learning_rate": 8.532404281434972e-05,
"loss": 4.6752,
"step": 126500
},
{
"epoch": 0.15,
"grad_norm": 0.38036221265792847,
"learning_rate": 8.53124412671674e-05,
"loss": 4.679,
"step": 126600
},
{
"epoch": 0.15,
"grad_norm": 0.38587382435798645,
"learning_rate": 8.530083971998506e-05,
"loss": 4.6801,
"step": 126700
},
{
"epoch": 0.15,
"grad_norm": 0.4019007682800293,
"learning_rate": 8.528923817280273e-05,
"loss": 4.681,
"step": 126800
},
{
"epoch": 0.15,
"grad_norm": 0.43139323592185974,
"learning_rate": 8.527763662562039e-05,
"loss": 4.6849,
"step": 126900
},
{
"epoch": 0.15,
"grad_norm": 0.3975641131401062,
"learning_rate": 8.526603507843807e-05,
"loss": 4.6796,
"step": 127000
},
{
"epoch": 0.15,
"grad_norm": 1.2765636444091797,
"learning_rate": 8.525443353125573e-05,
"loss": 4.6789,
"step": 127100
},
{
"epoch": 0.15,
"grad_norm": 0.3746165335178375,
"learning_rate": 8.52428319840734e-05,
"loss": 4.6801,
"step": 127200
},
{
"epoch": 0.15,
"grad_norm": 3.3519067764282227,
"learning_rate": 8.523123043689106e-05,
"loss": 4.6818,
"step": 127300
},
{
"epoch": 0.15,
"grad_norm": 0.41541022062301636,
"learning_rate": 8.521962888970874e-05,
"loss": 4.674,
"step": 127400
},
{
"epoch": 0.15,
"grad_norm": 0.41416847705841064,
"learning_rate": 8.520802734252641e-05,
"loss": 4.6802,
"step": 127500
},
{
"epoch": 0.15,
"grad_norm": 0.4105985164642334,
"learning_rate": 8.519642579534407e-05,
"loss": 4.6818,
"step": 127600
},
{
"epoch": 0.15,
"grad_norm": 0.5681924223899841,
"learning_rate": 8.518482424816174e-05,
"loss": 4.6801,
"step": 127700
},
{
"epoch": 0.15,
"grad_norm": 0.5585260391235352,
"learning_rate": 8.517322270097941e-05,
"loss": 4.6775,
"step": 127800
},
{
"epoch": 0.15,
"grad_norm": 0.5471286177635193,
"learning_rate": 8.516162115379708e-05,
"loss": 4.6785,
"step": 127900
},
{
"epoch": 0.15,
"grad_norm": 0.39895522594451904,
"learning_rate": 8.515001960661474e-05,
"loss": 4.6804,
"step": 128000
},
{
"epoch": 0.15,
"grad_norm": 0.38762810826301575,
"learning_rate": 8.51384180594324e-05,
"loss": 4.6781,
"step": 128100
},
{
"epoch": 0.15,
"grad_norm": 0.42689523100852966,
"learning_rate": 8.512681651225008e-05,
"loss": 4.6797,
"step": 128200
},
{
"epoch": 0.15,
"grad_norm": 0.39311423897743225,
"learning_rate": 8.511521496506775e-05,
"loss": 4.6883,
"step": 128300
},
{
"epoch": 0.15,
"grad_norm": 0.5697504878044128,
"learning_rate": 8.51036134178854e-05,
"loss": 4.6743,
"step": 128400
},
{
"epoch": 0.15,
"grad_norm": 0.42035338282585144,
"learning_rate": 8.509201187070309e-05,
"loss": 4.6797,
"step": 128500
},
{
"epoch": 0.15,
"grad_norm": 0.7120895981788635,
"learning_rate": 8.508041032352076e-05,
"loss": 4.6778,
"step": 128600
},
{
"epoch": 0.15,
"grad_norm": 0.38941749930381775,
"learning_rate": 8.506880877633841e-05,
"loss": 4.6787,
"step": 128700
},
{
"epoch": 0.15,
"grad_norm": 0.7306220531463623,
"learning_rate": 8.505720722915609e-05,
"loss": 4.6772,
"step": 128800
},
{
"epoch": 0.15,
"grad_norm": 0.38047537207603455,
"learning_rate": 8.504560568197376e-05,
"loss": 4.6804,
"step": 128900
},
{
"epoch": 0.15,
"grad_norm": 2.079150676727295,
"learning_rate": 8.503400413479143e-05,
"loss": 4.6788,
"step": 129000
},
{
"epoch": 0.15,
"grad_norm": 0.44098183512687683,
"learning_rate": 8.502240258760908e-05,
"loss": 4.6749,
"step": 129100
},
{
"epoch": 0.15,
"grad_norm": 1.4035288095474243,
"learning_rate": 8.501080104042675e-05,
"loss": 4.6691,
"step": 129200
},
{
"epoch": 0.15,
"grad_norm": 0.43114110827445984,
"learning_rate": 8.499919949324442e-05,
"loss": 4.6797,
"step": 129300
},
{
"epoch": 0.15,
"grad_norm": 0.5038126707077026,
"learning_rate": 8.49875979460621e-05,
"loss": 4.6818,
"step": 129400
},
{
"epoch": 0.15,
"grad_norm": 0.44660595059394836,
"learning_rate": 8.497599639887975e-05,
"loss": 4.6804,
"step": 129500
},
{
"epoch": 0.15,
"grad_norm": 0.3962138593196869,
"learning_rate": 8.496439485169742e-05,
"loss": 4.6718,
"step": 129600
},
{
"epoch": 0.15,
"grad_norm": 0.5164237022399902,
"learning_rate": 8.49527933045151e-05,
"loss": 4.6833,
"step": 129700
},
{
"epoch": 0.15,
"grad_norm": 1.7946994304656982,
"learning_rate": 8.494119175733276e-05,
"loss": 4.678,
"step": 129800
},
{
"epoch": 0.15,
"grad_norm": 0.39110898971557617,
"learning_rate": 8.492959021015043e-05,
"loss": 4.6782,
"step": 129900
},
{
"epoch": 0.15,
"grad_norm": 8.766246795654297,
"learning_rate": 8.491798866296809e-05,
"loss": 4.6728,
"step": 130000
},
{
"epoch": 0.15,
"grad_norm": 7.374971866607666,
"learning_rate": 8.490638711578577e-05,
"loss": 4.6776,
"step": 130100
},
{
"epoch": 0.15,
"grad_norm": 0.5221861600875854,
"learning_rate": 8.489478556860343e-05,
"loss": 4.679,
"step": 130200
},
{
"epoch": 0.15,
"grad_norm": 0.3894909620285034,
"learning_rate": 8.48831840214211e-05,
"loss": 4.6727,
"step": 130300
},
{
"epoch": 0.15,
"grad_norm": 0.39896321296691895,
"learning_rate": 8.487158247423876e-05,
"loss": 4.6779,
"step": 130400
},
{
"epoch": 0.15,
"grad_norm": 0.4062064588069916,
"learning_rate": 8.485998092705644e-05,
"loss": 4.6774,
"step": 130500
},
{
"epoch": 0.15,
"grad_norm": 0.5894352197647095,
"learning_rate": 8.48483793798741e-05,
"loss": 4.679,
"step": 130600
},
{
"epoch": 0.15,
"grad_norm": 0.7942706942558289,
"learning_rate": 8.483677783269177e-05,
"loss": 4.673,
"step": 130700
},
{
"epoch": 0.15,
"grad_norm": 0.4256235361099243,
"learning_rate": 8.482517628550944e-05,
"loss": 4.6752,
"step": 130800
},
{
"epoch": 0.15,
"grad_norm": 0.4181482791900635,
"learning_rate": 8.481357473832711e-05,
"loss": 4.6809,
"step": 130900
},
{
"epoch": 0.15,
"grad_norm": 0.37271997332572937,
"learning_rate": 8.480197319114478e-05,
"loss": 4.678,
"step": 131000
}
],
"logging_steps": 100,
"max_steps": 861954,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 3.271273415079469e+18,
"train_batch_size": 192,
"trial_name": null,
"trial_params": null
}