m8x7b-final-vision-adaptee / trainer_state.json
kloodia's picture
Upload folder using huggingface_hub
b71fd7e verified
raw
history blame contribute delete
No virus
152 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4999552075681293,
"eval_steps": 500,
"global_step": 872,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005733431279451024,
"grad_norm": 343.8302001953125,
"learning_rate": 3.7037037037037036e-07,
"loss": 15.7056,
"step": 1
},
{
"epoch": 0.0011466862558902047,
"grad_norm": 354.2492370605469,
"learning_rate": 7.407407407407407e-07,
"loss": 15.7685,
"step": 2
},
{
"epoch": 0.0017200293838353072,
"grad_norm": 332.5870666503906,
"learning_rate": 1.111111111111111e-06,
"loss": 15.7735,
"step": 3
},
{
"epoch": 0.0022933725117804094,
"grad_norm": 337.5965576171875,
"learning_rate": 1.4814814814814815e-06,
"loss": 15.6978,
"step": 4
},
{
"epoch": 0.002866715639725512,
"grad_norm": 337.68743896484375,
"learning_rate": 1.8518518518518519e-06,
"loss": 15.7328,
"step": 5
},
{
"epoch": 0.0034400587676706143,
"grad_norm": 352.12841796875,
"learning_rate": 2.222222222222222e-06,
"loss": 15.7606,
"step": 6
},
{
"epoch": 0.004013401895615717,
"grad_norm": 350.4210510253906,
"learning_rate": 2.5925925925925925e-06,
"loss": 15.7352,
"step": 7
},
{
"epoch": 0.004586745023560819,
"grad_norm": 326.6189270019531,
"learning_rate": 2.962962962962963e-06,
"loss": 15.7233,
"step": 8
},
{
"epoch": 0.005160088151505922,
"grad_norm": 337.8202819824219,
"learning_rate": 3.3333333333333333e-06,
"loss": 15.7473,
"step": 9
},
{
"epoch": 0.005733431279451024,
"grad_norm": 346.01373291015625,
"learning_rate": 3.7037037037037037e-06,
"loss": 15.7525,
"step": 10
},
{
"epoch": 0.006306774407396127,
"grad_norm": 345.3333435058594,
"learning_rate": 4.074074074074074e-06,
"loss": 15.7312,
"step": 11
},
{
"epoch": 0.006880117535341229,
"grad_norm": 366.6058044433594,
"learning_rate": 4.444444444444444e-06,
"loss": 15.8701,
"step": 12
},
{
"epoch": 0.0074534606632863315,
"grad_norm": 343.2021484375,
"learning_rate": 4.814814814814815e-06,
"loss": 15.7247,
"step": 13
},
{
"epoch": 0.008026803791231434,
"grad_norm": 355.9106750488281,
"learning_rate": 5.185185185185185e-06,
"loss": 15.8606,
"step": 14
},
{
"epoch": 0.008600146919176536,
"grad_norm": 348.240966796875,
"learning_rate": 5.555555555555557e-06,
"loss": 15.7669,
"step": 15
},
{
"epoch": 0.009173490047121638,
"grad_norm": 338.352783203125,
"learning_rate": 5.925925925925926e-06,
"loss": 15.7275,
"step": 16
},
{
"epoch": 0.00974683317506674,
"grad_norm": 345.3967590332031,
"learning_rate": 6.296296296296297e-06,
"loss": 15.6908,
"step": 17
},
{
"epoch": 0.010320176303011843,
"grad_norm": 337.52459716796875,
"learning_rate": 6.666666666666667e-06,
"loss": 15.7063,
"step": 18
},
{
"epoch": 0.010893519430956946,
"grad_norm": 348.9382019042969,
"learning_rate": 7.0370370370370375e-06,
"loss": 15.7312,
"step": 19
},
{
"epoch": 0.011466862558902047,
"grad_norm": 367.71954345703125,
"learning_rate": 7.4074074074074075e-06,
"loss": 15.6506,
"step": 20
},
{
"epoch": 0.01204020568684715,
"grad_norm": 340.8507080078125,
"learning_rate": 7.77777777777778e-06,
"loss": 15.6737,
"step": 21
},
{
"epoch": 0.012613548814792253,
"grad_norm": 342.04400634765625,
"learning_rate": 8.148148148148148e-06,
"loss": 15.7023,
"step": 22
},
{
"epoch": 0.013186891942737354,
"grad_norm": 341.126708984375,
"learning_rate": 8.518518518518519e-06,
"loss": 15.7475,
"step": 23
},
{
"epoch": 0.013760235070682457,
"grad_norm": 355.2649841308594,
"learning_rate": 8.888888888888888e-06,
"loss": 15.719,
"step": 24
},
{
"epoch": 0.01433357819862756,
"grad_norm": 332.40753173828125,
"learning_rate": 9.25925925925926e-06,
"loss": 15.7316,
"step": 25
},
{
"epoch": 0.014906921326572663,
"grad_norm": 337.4178771972656,
"learning_rate": 9.62962962962963e-06,
"loss": 15.7153,
"step": 26
},
{
"epoch": 0.015480264454517764,
"grad_norm": 337.7605895996094,
"learning_rate": 1e-05,
"loss": 15.6421,
"step": 27
},
{
"epoch": 0.016053607582462867,
"grad_norm": 339.8226013183594,
"learning_rate": 9.999965443811378e-06,
"loss": 15.7325,
"step": 28
},
{
"epoch": 0.01662695071040797,
"grad_norm": 338.6639709472656,
"learning_rate": 9.999861775723162e-06,
"loss": 15.5632,
"step": 29
},
{
"epoch": 0.017200293838353073,
"grad_norm": 313.73358154296875,
"learning_rate": 9.999688997168301e-06,
"loss": 15.6312,
"step": 30
},
{
"epoch": 0.017773636966298176,
"grad_norm": 324.01812744140625,
"learning_rate": 9.999447110535026e-06,
"loss": 15.568,
"step": 31
},
{
"epoch": 0.018346980094243275,
"grad_norm": 350.6103820800781,
"learning_rate": 9.999136119166803e-06,
"loss": 15.6763,
"step": 32
},
{
"epoch": 0.018920323222188378,
"grad_norm": 345.98101806640625,
"learning_rate": 9.998756027362308e-06,
"loss": 15.6779,
"step": 33
},
{
"epoch": 0.01949366635013348,
"grad_norm": 332.9476013183594,
"learning_rate": 9.99830684037535e-06,
"loss": 15.5666,
"step": 34
},
{
"epoch": 0.020067009478078584,
"grad_norm": 323.1610412597656,
"learning_rate": 9.9977885644148e-06,
"loss": 15.5645,
"step": 35
},
{
"epoch": 0.020640352606023687,
"grad_norm": 345.10504150390625,
"learning_rate": 9.997201206644522e-06,
"loss": 15.6663,
"step": 36
},
{
"epoch": 0.02121369573396879,
"grad_norm": 327.5580749511719,
"learning_rate": 9.99654477518325e-06,
"loss": 15.5561,
"step": 37
},
{
"epoch": 0.021787038861913893,
"grad_norm": 321.0745849609375,
"learning_rate": 9.995819279104494e-06,
"loss": 15.6465,
"step": 38
},
{
"epoch": 0.022360381989858992,
"grad_norm": 331.85736083984375,
"learning_rate": 9.995024728436402e-06,
"loss": 15.6326,
"step": 39
},
{
"epoch": 0.022933725117804095,
"grad_norm": 317.6231994628906,
"learning_rate": 9.994161134161635e-06,
"loss": 15.5944,
"step": 40
},
{
"epoch": 0.023507068245749198,
"grad_norm": 329.2905578613281,
"learning_rate": 9.993228508217201e-06,
"loss": 15.6057,
"step": 41
},
{
"epoch": 0.0240804113736943,
"grad_norm": 331.61407470703125,
"learning_rate": 9.9922268634943e-06,
"loss": 15.5034,
"step": 42
},
{
"epoch": 0.024653754501639404,
"grad_norm": 308.8927917480469,
"learning_rate": 9.991156213838143e-06,
"loss": 15.5626,
"step": 43
},
{
"epoch": 0.025227097629584506,
"grad_norm": 331.08624267578125,
"learning_rate": 9.990016574047757e-06,
"loss": 15.628,
"step": 44
},
{
"epoch": 0.02580044075752961,
"grad_norm": 343.9646911621094,
"learning_rate": 9.988807959875785e-06,
"loss": 15.52,
"step": 45
},
{
"epoch": 0.02637378388547471,
"grad_norm": 317.4025573730469,
"learning_rate": 9.987530388028269e-06,
"loss": 15.5872,
"step": 46
},
{
"epoch": 0.02694712701341981,
"grad_norm": 322.307373046875,
"learning_rate": 9.986183876164412e-06,
"loss": 15.4988,
"step": 47
},
{
"epoch": 0.027520470141364915,
"grad_norm": 317.954833984375,
"learning_rate": 9.984768442896342e-06,
"loss": 15.4338,
"step": 48
},
{
"epoch": 0.028093813269310017,
"grad_norm": 316.19512939453125,
"learning_rate": 9.983284107788852e-06,
"loss": 15.464,
"step": 49
},
{
"epoch": 0.02866715639725512,
"grad_norm": 309.9515686035156,
"learning_rate": 9.981730891359123e-06,
"loss": 15.4762,
"step": 50
},
{
"epoch": 0.029240499525200223,
"grad_norm": 325.8763427734375,
"learning_rate": 9.980108815076456e-06,
"loss": 15.4914,
"step": 51
},
{
"epoch": 0.029813842653145326,
"grad_norm": 310.78424072265625,
"learning_rate": 9.978417901361958e-06,
"loss": 15.5108,
"step": 52
},
{
"epoch": 0.030387185781090426,
"grad_norm": 335.9707946777344,
"learning_rate": 9.976658173588244e-06,
"loss": 15.5588,
"step": 53
},
{
"epoch": 0.03096052890903553,
"grad_norm": 325.359375,
"learning_rate": 9.974829656079106e-06,
"loss": 15.5329,
"step": 54
},
{
"epoch": 0.03153387203698063,
"grad_norm": 313.94293212890625,
"learning_rate": 9.972932374109184e-06,
"loss": 15.4339,
"step": 55
},
{
"epoch": 0.032107215164925734,
"grad_norm": 328.7725830078125,
"learning_rate": 9.97096635390361e-06,
"loss": 15.5493,
"step": 56
},
{
"epoch": 0.03268055829287084,
"grad_norm": 329.2198791503906,
"learning_rate": 9.968931622637652e-06,
"loss": 15.5452,
"step": 57
},
{
"epoch": 0.03325390142081594,
"grad_norm": 323.4937438964844,
"learning_rate": 9.966828208436332e-06,
"loss": 15.4639,
"step": 58
},
{
"epoch": 0.03382724454876104,
"grad_norm": 318.2495422363281,
"learning_rate": 9.96465614037404e-06,
"loss": 15.4536,
"step": 59
},
{
"epoch": 0.034400587676706146,
"grad_norm": 315.8673095703125,
"learning_rate": 9.962415448474134e-06,
"loss": 15.4971,
"step": 60
},
{
"epoch": 0.03497393080465125,
"grad_norm": 313.1468505859375,
"learning_rate": 9.960106163708522e-06,
"loss": 15.4236,
"step": 61
},
{
"epoch": 0.03554727393259635,
"grad_norm": 311.7138977050781,
"learning_rate": 9.95772831799724e-06,
"loss": 15.4216,
"step": 62
},
{
"epoch": 0.03612061706054145,
"grad_norm": 306.1082763671875,
"learning_rate": 9.955281944207998e-06,
"loss": 15.4497,
"step": 63
},
{
"epoch": 0.03669396018848655,
"grad_norm": 320.2241516113281,
"learning_rate": 9.95276707615574e-06,
"loss": 15.4434,
"step": 64
},
{
"epoch": 0.03726730331643165,
"grad_norm": 314.4595642089844,
"learning_rate": 9.950183748602164e-06,
"loss": 15.4662,
"step": 65
},
{
"epoch": 0.037840646444376756,
"grad_norm": 329.10784912109375,
"learning_rate": 9.947531997255256e-06,
"loss": 15.4808,
"step": 66
},
{
"epoch": 0.03841398957232186,
"grad_norm": 312.20135498046875,
"learning_rate": 9.944811858768782e-06,
"loss": 15.4633,
"step": 67
},
{
"epoch": 0.03898733270026696,
"grad_norm": 331.2897033691406,
"learning_rate": 9.94202337074179e-06,
"loss": 15.4861,
"step": 68
},
{
"epoch": 0.039560675828212065,
"grad_norm": 325.3959655761719,
"learning_rate": 9.939166571718086e-06,
"loss": 15.349,
"step": 69
},
{
"epoch": 0.04013401895615717,
"grad_norm": 312.45513916015625,
"learning_rate": 9.936241501185706e-06,
"loss": 15.476,
"step": 70
},
{
"epoch": 0.04070736208410227,
"grad_norm": 318.12322998046875,
"learning_rate": 9.933248199576366e-06,
"loss": 15.5243,
"step": 71
},
{
"epoch": 0.041280705212047374,
"grad_norm": 313.1653137207031,
"learning_rate": 9.930186708264902e-06,
"loss": 15.4566,
"step": 72
},
{
"epoch": 0.041854048339992476,
"grad_norm": 310.0314636230469,
"learning_rate": 9.927057069568704e-06,
"loss": 15.4365,
"step": 73
},
{
"epoch": 0.04242739146793758,
"grad_norm": 319.7293701171875,
"learning_rate": 9.923859326747125e-06,
"loss": 15.4605,
"step": 74
},
{
"epoch": 0.04300073459588268,
"grad_norm": 306.68524169921875,
"learning_rate": 9.920593524000887e-06,
"loss": 15.3812,
"step": 75
},
{
"epoch": 0.043574077723827785,
"grad_norm": 309.94500732421875,
"learning_rate": 9.917259706471469e-06,
"loss": 15.3971,
"step": 76
},
{
"epoch": 0.04414742085177288,
"grad_norm": 321.57647705078125,
"learning_rate": 9.913857920240481e-06,
"loss": 15.471,
"step": 77
},
{
"epoch": 0.044720763979717984,
"grad_norm": 310.59991455078125,
"learning_rate": 9.91038821232903e-06,
"loss": 15.4669,
"step": 78
},
{
"epoch": 0.04529410710766309,
"grad_norm": 298.2730407714844,
"learning_rate": 9.906850630697068e-06,
"loss": 15.4534,
"step": 79
},
{
"epoch": 0.04586745023560819,
"grad_norm": 303.1147766113281,
"learning_rate": 9.903245224242732e-06,
"loss": 15.3767,
"step": 80
},
{
"epoch": 0.04644079336355329,
"grad_norm": 299.9115905761719,
"learning_rate": 9.899572042801662e-06,
"loss": 15.3181,
"step": 81
},
{
"epoch": 0.047014136491498396,
"grad_norm": 299.8761901855469,
"learning_rate": 9.895831137146319e-06,
"loss": 15.3273,
"step": 82
},
{
"epoch": 0.0475874796194435,
"grad_norm": 305.44244384765625,
"learning_rate": 9.89202255898528e-06,
"loss": 15.3504,
"step": 83
},
{
"epoch": 0.0481608227473886,
"grad_norm": 302.8594055175781,
"learning_rate": 9.888146360962523e-06,
"loss": 15.4113,
"step": 84
},
{
"epoch": 0.048734165875333704,
"grad_norm": 310.83587646484375,
"learning_rate": 9.8842025966567e-06,
"loss": 15.4274,
"step": 85
},
{
"epoch": 0.04930750900327881,
"grad_norm": 292.6897277832031,
"learning_rate": 9.880191320580396e-06,
"loss": 15.2777,
"step": 86
},
{
"epoch": 0.04988085213122391,
"grad_norm": 308.5329895019531,
"learning_rate": 9.876112588179378e-06,
"loss": 15.3073,
"step": 87
},
{
"epoch": 0.05045419525916901,
"grad_norm": 295.7265319824219,
"learning_rate": 9.87196645583182e-06,
"loss": 15.3201,
"step": 88
},
{
"epoch": 0.051027538387114116,
"grad_norm": 300.5785827636719,
"learning_rate": 9.86775298084754e-06,
"loss": 15.4455,
"step": 89
},
{
"epoch": 0.05160088151505922,
"grad_norm": 293.6327819824219,
"learning_rate": 9.863472221467189e-06,
"loss": 15.4047,
"step": 90
},
{
"epoch": 0.052174224643004315,
"grad_norm": 314.74468994140625,
"learning_rate": 9.85912423686146e-06,
"loss": 15.4229,
"step": 91
},
{
"epoch": 0.05274756777094942,
"grad_norm": 292.306640625,
"learning_rate": 9.854709087130261e-06,
"loss": 15.3212,
"step": 92
},
{
"epoch": 0.05332091089889452,
"grad_norm": 321.5920715332031,
"learning_rate": 9.850226833301893e-06,
"loss": 15.3655,
"step": 93
},
{
"epoch": 0.05389425402683962,
"grad_norm": 288.67535400390625,
"learning_rate": 9.8456775373322e-06,
"loss": 15.3491,
"step": 94
},
{
"epoch": 0.054467597154784726,
"grad_norm": 301.61151123046875,
"learning_rate": 9.841061262103713e-06,
"loss": 15.4396,
"step": 95
},
{
"epoch": 0.05504094028272983,
"grad_norm": 291.6568908691406,
"learning_rate": 9.836378071424782e-06,
"loss": 15.3401,
"step": 96
},
{
"epoch": 0.05561428341067493,
"grad_norm": 292.19915771484375,
"learning_rate": 9.831628030028698e-06,
"loss": 15.3169,
"step": 97
},
{
"epoch": 0.056187626538620035,
"grad_norm": 291.9767150878906,
"learning_rate": 9.826811203572785e-06,
"loss": 15.3443,
"step": 98
},
{
"epoch": 0.05676096966656514,
"grad_norm": 304.70599365234375,
"learning_rate": 9.821927658637518e-06,
"loss": 15.3755,
"step": 99
},
{
"epoch": 0.05733431279451024,
"grad_norm": 302.005859375,
"learning_rate": 9.81697746272557e-06,
"loss": 15.332,
"step": 100
},
{
"epoch": 0.057907655922455344,
"grad_norm": 302.4617004394531,
"learning_rate": 9.811960684260907e-06,
"loss": 15.4224,
"step": 101
},
{
"epoch": 0.058480999050400446,
"grad_norm": 298.9280700683594,
"learning_rate": 9.80687739258782e-06,
"loss": 15.377,
"step": 102
},
{
"epoch": 0.05905434217834555,
"grad_norm": 287.72869873046875,
"learning_rate": 9.801727657969988e-06,
"loss": 15.3631,
"step": 103
},
{
"epoch": 0.05962768530629065,
"grad_norm": 275.61376953125,
"learning_rate": 9.796511551589492e-06,
"loss": 15.2808,
"step": 104
},
{
"epoch": 0.06020102843423575,
"grad_norm": 283.3959655761719,
"learning_rate": 9.791229145545832e-06,
"loss": 15.3636,
"step": 105
},
{
"epoch": 0.06077437156218085,
"grad_norm": 295.7449035644531,
"learning_rate": 9.785880512854937e-06,
"loss": 15.2886,
"step": 106
},
{
"epoch": 0.061347714690125954,
"grad_norm": 287.8319091796875,
"learning_rate": 9.78046572744815e-06,
"loss": 15.2978,
"step": 107
},
{
"epoch": 0.06192105781807106,
"grad_norm": 294.1200256347656,
"learning_rate": 9.77498486417121e-06,
"loss": 15.2803,
"step": 108
},
{
"epoch": 0.06249440094601616,
"grad_norm": 281.8334655761719,
"learning_rate": 9.769437998783216e-06,
"loss": 15.3278,
"step": 109
},
{
"epoch": 0.06306774407396126,
"grad_norm": 283.81732177734375,
"learning_rate": 9.763825207955577e-06,
"loss": 15.2408,
"step": 110
},
{
"epoch": 0.06364108720190637,
"grad_norm": 289.8335876464844,
"learning_rate": 9.758146569270957e-06,
"loss": 15.2072,
"step": 111
},
{
"epoch": 0.06421443032985147,
"grad_norm": 283.79541015625,
"learning_rate": 9.7524021612222e-06,
"loss": 15.2841,
"step": 112
},
{
"epoch": 0.06478777345779657,
"grad_norm": 278.183349609375,
"learning_rate": 9.746592063211247e-06,
"loss": 15.2678,
"step": 113
},
{
"epoch": 0.06536111658574167,
"grad_norm": 285.3382568359375,
"learning_rate": 9.74071635554803e-06,
"loss": 15.2402,
"step": 114
},
{
"epoch": 0.06593445971368678,
"grad_norm": 278.955078125,
"learning_rate": 9.73477511944938e-06,
"loss": 15.3042,
"step": 115
},
{
"epoch": 0.06650780284163188,
"grad_norm": 279.9234924316406,
"learning_rate": 9.728768437037882e-06,
"loss": 15.2099,
"step": 116
},
{
"epoch": 0.06708114596957698,
"grad_norm": 279.9479064941406,
"learning_rate": 9.722696391340762e-06,
"loss": 15.3344,
"step": 117
},
{
"epoch": 0.06765448909752209,
"grad_norm": 286.9149169921875,
"learning_rate": 9.716559066288716e-06,
"loss": 15.2665,
"step": 118
},
{
"epoch": 0.06822783222546719,
"grad_norm": 306.3753356933594,
"learning_rate": 9.710356546714774e-06,
"loss": 15.2843,
"step": 119
},
{
"epoch": 0.06880117535341229,
"grad_norm": 276.3395690917969,
"learning_rate": 9.704088918353108e-06,
"loss": 15.2029,
"step": 120
},
{
"epoch": 0.0693745184813574,
"grad_norm": 283.53411865234375,
"learning_rate": 9.697756267837856e-06,
"loss": 15.2337,
"step": 121
},
{
"epoch": 0.0699478616093025,
"grad_norm": 288.1679382324219,
"learning_rate": 9.691358682701927e-06,
"loss": 15.1838,
"step": 122
},
{
"epoch": 0.0705212047372476,
"grad_norm": 275.3481750488281,
"learning_rate": 9.684896251375784e-06,
"loss": 15.214,
"step": 123
},
{
"epoch": 0.0710945478651927,
"grad_norm": 271.927490234375,
"learning_rate": 9.678369063186224e-06,
"loss": 15.2439,
"step": 124
},
{
"epoch": 0.0716678909931378,
"grad_norm": 280.0527648925781,
"learning_rate": 9.671777208355146e-06,
"loss": 15.2833,
"step": 125
},
{
"epoch": 0.0722412341210829,
"grad_norm": 286.959228515625,
"learning_rate": 9.665120777998303e-06,
"loss": 15.3076,
"step": 126
},
{
"epoch": 0.072814577249028,
"grad_norm": 268.98553466796875,
"learning_rate": 9.658399864124037e-06,
"loss": 15.3274,
"step": 127
},
{
"epoch": 0.0733879203769731,
"grad_norm": 261.5675964355469,
"learning_rate": 9.65161455963202e-06,
"loss": 15.2216,
"step": 128
},
{
"epoch": 0.0739612635049182,
"grad_norm": 272.29425048828125,
"learning_rate": 9.64476495831195e-06,
"loss": 15.2385,
"step": 129
},
{
"epoch": 0.0745346066328633,
"grad_norm": 282.3617248535156,
"learning_rate": 9.637851154842279e-06,
"loss": 15.2864,
"step": 130
},
{
"epoch": 0.07510794976080841,
"grad_norm": 260.4862976074219,
"learning_rate": 9.630873244788884e-06,
"loss": 15.3039,
"step": 131
},
{
"epoch": 0.07568129288875351,
"grad_norm": 268.15582275390625,
"learning_rate": 9.623831324603755e-06,
"loss": 15.2402,
"step": 132
},
{
"epoch": 0.07625463601669862,
"grad_norm": 275.54180908203125,
"learning_rate": 9.61672549162366e-06,
"loss": 15.2216,
"step": 133
},
{
"epoch": 0.07682797914464372,
"grad_norm": 274.50299072265625,
"learning_rate": 9.6095558440688e-06,
"loss": 15.2265,
"step": 134
},
{
"epoch": 0.07740132227258882,
"grad_norm": 274.8090515136719,
"learning_rate": 9.602322481041457e-06,
"loss": 15.2518,
"step": 135
},
{
"epoch": 0.07797466540053392,
"grad_norm": 264.6287841796875,
"learning_rate": 9.595025502524609e-06,
"loss": 15.2621,
"step": 136
},
{
"epoch": 0.07854800852847903,
"grad_norm": 261.9557189941406,
"learning_rate": 9.587665009380565e-06,
"loss": 15.2255,
"step": 137
},
{
"epoch": 0.07912135165642413,
"grad_norm": 264.4668273925781,
"learning_rate": 9.580241103349562e-06,
"loss": 15.1974,
"step": 138
},
{
"epoch": 0.07969469478436923,
"grad_norm": 268.053955078125,
"learning_rate": 9.572753887048353e-06,
"loss": 15.1732,
"step": 139
},
{
"epoch": 0.08026803791231434,
"grad_norm": 275.5241394042969,
"learning_rate": 9.565203463968808e-06,
"loss": 15.2277,
"step": 140
},
{
"epoch": 0.08084138104025944,
"grad_norm": 270.20001220703125,
"learning_rate": 9.557589938476462e-06,
"loss": 15.2393,
"step": 141
},
{
"epoch": 0.08141472416820454,
"grad_norm": 274.71453857421875,
"learning_rate": 9.549913415809084e-06,
"loss": 15.1832,
"step": 142
},
{
"epoch": 0.08198806729614964,
"grad_norm": 266.2647399902344,
"learning_rate": 9.542174002075221e-06,
"loss": 15.1934,
"step": 143
},
{
"epoch": 0.08256141042409475,
"grad_norm": 270.1286315917969,
"learning_rate": 9.534371804252727e-06,
"loss": 15.1652,
"step": 144
},
{
"epoch": 0.08313475355203985,
"grad_norm": 253.12673950195312,
"learning_rate": 9.526506930187294e-06,
"loss": 15.2471,
"step": 145
},
{
"epoch": 0.08370809667998495,
"grad_norm": 266.5976867675781,
"learning_rate": 9.518579488590947e-06,
"loss": 15.26,
"step": 146
},
{
"epoch": 0.08428143980793006,
"grad_norm": 264.99481201171875,
"learning_rate": 9.510589589040554e-06,
"loss": 15.1794,
"step": 147
},
{
"epoch": 0.08485478293587516,
"grad_norm": 255.4492950439453,
"learning_rate": 9.502537341976305e-06,
"loss": 15.2214,
"step": 148
},
{
"epoch": 0.08542812606382026,
"grad_norm": 264.4046325683594,
"learning_rate": 9.494422858700188e-06,
"loss": 15.1397,
"step": 149
},
{
"epoch": 0.08600146919176536,
"grad_norm": 276.0195007324219,
"learning_rate": 9.48624625137445e-06,
"loss": 15.2514,
"step": 150
},
{
"epoch": 0.08657481231971047,
"grad_norm": 261.25848388671875,
"learning_rate": 9.478007633020043e-06,
"loss": 15.1633,
"step": 151
},
{
"epoch": 0.08714815544765557,
"grad_norm": 273.81439208984375,
"learning_rate": 9.469707117515068e-06,
"loss": 15.3146,
"step": 152
},
{
"epoch": 0.08772149857560067,
"grad_norm": 278.4958801269531,
"learning_rate": 9.461344819593194e-06,
"loss": 15.2173,
"step": 153
},
{
"epoch": 0.08829484170354576,
"grad_norm": 270.7554931640625,
"learning_rate": 9.452920854842085e-06,
"loss": 15.2049,
"step": 154
},
{
"epoch": 0.08886818483149087,
"grad_norm": 277.895751953125,
"learning_rate": 9.44443533970178e-06,
"loss": 15.2012,
"step": 155
},
{
"epoch": 0.08944152795943597,
"grad_norm": 260.6186828613281,
"learning_rate": 9.435888391463108e-06,
"loss": 15.1519,
"step": 156
},
{
"epoch": 0.09001487108738107,
"grad_norm": 266.2400817871094,
"learning_rate": 9.427280128266049e-06,
"loss": 15.1982,
"step": 157
},
{
"epoch": 0.09058821421532617,
"grad_norm": 258.3689270019531,
"learning_rate": 9.418610669098114e-06,
"loss": 15.2358,
"step": 158
},
{
"epoch": 0.09116155734327128,
"grad_norm": 255.73751831054688,
"learning_rate": 9.409880133792684e-06,
"loss": 15.2167,
"step": 159
},
{
"epoch": 0.09173490047121638,
"grad_norm": 257.6156311035156,
"learning_rate": 9.40108864302737e-06,
"loss": 15.1499,
"step": 160
},
{
"epoch": 0.09230824359916148,
"grad_norm": 259.1768493652344,
"learning_rate": 9.392236318322339e-06,
"loss": 15.1413,
"step": 161
},
{
"epoch": 0.09288158672710659,
"grad_norm": 259.98583984375,
"learning_rate": 9.383323282038632e-06,
"loss": 15.2688,
"step": 162
},
{
"epoch": 0.09345492985505169,
"grad_norm": 270.8675537109375,
"learning_rate": 9.374349657376473e-06,
"loss": 15.19,
"step": 163
},
{
"epoch": 0.09402827298299679,
"grad_norm": 252.3112030029297,
"learning_rate": 9.365315568373569e-06,
"loss": 15.1946,
"step": 164
},
{
"epoch": 0.0946016161109419,
"grad_norm": 271.8454284667969,
"learning_rate": 9.356221139903395e-06,
"loss": 15.1801,
"step": 165
},
{
"epoch": 0.095174959238887,
"grad_norm": 252.07545471191406,
"learning_rate": 9.347066497673462e-06,
"loss": 15.169,
"step": 166
},
{
"epoch": 0.0957483023668321,
"grad_norm": 274.141357421875,
"learning_rate": 9.337851768223589e-06,
"loss": 15.2279,
"step": 167
},
{
"epoch": 0.0963216454947772,
"grad_norm": 257.8874206542969,
"learning_rate": 9.328577078924151e-06,
"loss": 15.2368,
"step": 168
},
{
"epoch": 0.0968949886227223,
"grad_norm": 259.5989990234375,
"learning_rate": 9.319242557974306e-06,
"loss": 15.1261,
"step": 169
},
{
"epoch": 0.09746833175066741,
"grad_norm": 268.7466735839844,
"learning_rate": 9.309848334400247e-06,
"loss": 15.1956,
"step": 170
},
{
"epoch": 0.09804167487861251,
"grad_norm": 250.24107360839844,
"learning_rate": 9.300394538053395e-06,
"loss": 15.2186,
"step": 171
},
{
"epoch": 0.09861501800655761,
"grad_norm": 242.27389526367188,
"learning_rate": 9.29088129960862e-06,
"loss": 15.208,
"step": 172
},
{
"epoch": 0.09918836113450272,
"grad_norm": 257.0928649902344,
"learning_rate": 9.281308750562426e-06,
"loss": 15.2165,
"step": 173
},
{
"epoch": 0.09976170426244782,
"grad_norm": 252.54974365234375,
"learning_rate": 9.271677023231137e-06,
"loss": 15.2131,
"step": 174
},
{
"epoch": 0.10033504739039292,
"grad_norm": 257.41192626953125,
"learning_rate": 9.261986250749068e-06,
"loss": 15.1474,
"step": 175
},
{
"epoch": 0.10090839051833803,
"grad_norm": 260.325439453125,
"learning_rate": 9.252236567066686e-06,
"loss": 15.1335,
"step": 176
},
{
"epoch": 0.10148173364628313,
"grad_norm": 265.9437561035156,
"learning_rate": 9.242428106948748e-06,
"loss": 15.2201,
"step": 177
},
{
"epoch": 0.10205507677422823,
"grad_norm": 255.51026916503906,
"learning_rate": 9.23256100597246e-06,
"loss": 15.167,
"step": 178
},
{
"epoch": 0.10262841990217333,
"grad_norm": 254.357666015625,
"learning_rate": 9.22263540052558e-06,
"loss": 15.2428,
"step": 179
},
{
"epoch": 0.10320176303011844,
"grad_norm": 253.48025512695312,
"learning_rate": 9.212651427804544e-06,
"loss": 15.0791,
"step": 180
},
{
"epoch": 0.10377510615806354,
"grad_norm": 258.47149658203125,
"learning_rate": 9.202609225812572e-06,
"loss": 15.2475,
"step": 181
},
{
"epoch": 0.10434844928600863,
"grad_norm": 257.2544860839844,
"learning_rate": 9.192508933357753e-06,
"loss": 15.1288,
"step": 182
},
{
"epoch": 0.10492179241395373,
"grad_norm": 250.79588317871094,
"learning_rate": 9.182350690051134e-06,
"loss": 15.1739,
"step": 183
},
{
"epoch": 0.10549513554189884,
"grad_norm": 275.7869873046875,
"learning_rate": 9.172134636304783e-06,
"loss": 15.1487,
"step": 184
},
{
"epoch": 0.10606847866984394,
"grad_norm": 256.7626647949219,
"learning_rate": 9.16186091332985e-06,
"loss": 15.1919,
"step": 185
},
{
"epoch": 0.10664182179778904,
"grad_norm": 255.94090270996094,
"learning_rate": 9.15152966313462e-06,
"loss": 15.1635,
"step": 186
},
{
"epoch": 0.10721516492573414,
"grad_norm": 267.16448974609375,
"learning_rate": 9.141141028522544e-06,
"loss": 15.1597,
"step": 187
},
{
"epoch": 0.10778850805367925,
"grad_norm": 258.8427734375,
"learning_rate": 9.130695153090272e-06,
"loss": 15.1459,
"step": 188
},
{
"epoch": 0.10836185118162435,
"grad_norm": 253.86849975585938,
"learning_rate": 9.120192181225658e-06,
"loss": 15.1216,
"step": 189
},
{
"epoch": 0.10893519430956945,
"grad_norm": 265.7057189941406,
"learning_rate": 9.109632258105771e-06,
"loss": 15.1723,
"step": 190
},
{
"epoch": 0.10950853743751456,
"grad_norm": 250.55398559570312,
"learning_rate": 9.099015529694894e-06,
"loss": 15.026,
"step": 191
},
{
"epoch": 0.11008188056545966,
"grad_norm": 255.69390869140625,
"learning_rate": 9.088342142742493e-06,
"loss": 15.1254,
"step": 192
},
{
"epoch": 0.11065522369340476,
"grad_norm": 254.11236572265625,
"learning_rate": 9.077612244781196e-06,
"loss": 15.079,
"step": 193
},
{
"epoch": 0.11122856682134986,
"grad_norm": 247.76478576660156,
"learning_rate": 9.066825984124751e-06,
"loss": 15.1122,
"step": 194
},
{
"epoch": 0.11180190994929497,
"grad_norm": 265.3432922363281,
"learning_rate": 9.055983509865988e-06,
"loss": 15.305,
"step": 195
},
{
"epoch": 0.11237525307724007,
"grad_norm": 244.3975067138672,
"learning_rate": 9.045084971874738e-06,
"loss": 15.1207,
"step": 196
},
{
"epoch": 0.11294859620518517,
"grad_norm": 245.3219757080078,
"learning_rate": 9.034130520795774e-06,
"loss": 15.2254,
"step": 197
},
{
"epoch": 0.11352193933313028,
"grad_norm": 248.05052185058594,
"learning_rate": 9.023120308046726e-06,
"loss": 15.0549,
"step": 198
},
{
"epoch": 0.11409528246107538,
"grad_norm": 249.66659545898438,
"learning_rate": 9.012054485815995e-06,
"loss": 15.0402,
"step": 199
},
{
"epoch": 0.11466862558902048,
"grad_norm": 247.83876037597656,
"learning_rate": 9.00093320706063e-06,
"loss": 15.1167,
"step": 200
},
{
"epoch": 0.11524196871696558,
"grad_norm": 241.92027282714844,
"learning_rate": 8.989756625504237e-06,
"loss": 15.0883,
"step": 201
},
{
"epoch": 0.11581531184491069,
"grad_norm": 247.57127380371094,
"learning_rate": 8.978524895634842e-06,
"loss": 15.0762,
"step": 202
},
{
"epoch": 0.11638865497285579,
"grad_norm": 260.8078918457031,
"learning_rate": 8.967238172702754e-06,
"loss": 15.1708,
"step": 203
},
{
"epoch": 0.11696199810080089,
"grad_norm": 234.99139404296875,
"learning_rate": 8.95589661271842e-06,
"loss": 15.0437,
"step": 204
},
{
"epoch": 0.117535341228746,
"grad_norm": 252.7474822998047,
"learning_rate": 8.94450037245028e-06,
"loss": 15.1181,
"step": 205
},
{
"epoch": 0.1181086843566911,
"grad_norm": 254.7908477783203,
"learning_rate": 8.933049609422582e-06,
"loss": 15.053,
"step": 206
},
{
"epoch": 0.1186820274846362,
"grad_norm": 249.38302612304688,
"learning_rate": 8.921544481913218e-06,
"loss": 15.1128,
"step": 207
},
{
"epoch": 0.1192553706125813,
"grad_norm": 244.7653350830078,
"learning_rate": 8.909985148951528e-06,
"loss": 15.0565,
"step": 208
},
{
"epoch": 0.11982871374052641,
"grad_norm": 246.28976440429688,
"learning_rate": 8.898371770316113e-06,
"loss": 14.9964,
"step": 209
},
{
"epoch": 0.1204020568684715,
"grad_norm": 260.9649658203125,
"learning_rate": 8.886704506532611e-06,
"loss": 15.0536,
"step": 210
},
{
"epoch": 0.1209753999964166,
"grad_norm": 254.28854370117188,
"learning_rate": 8.874983518871488e-06,
"loss": 15.1222,
"step": 211
},
{
"epoch": 0.1215487431243617,
"grad_norm": 233.388427734375,
"learning_rate": 8.86320896934581e-06,
"loss": 15.1175,
"step": 212
},
{
"epoch": 0.1221220862523068,
"grad_norm": 264.84063720703125,
"learning_rate": 8.851381020709e-06,
"loss": 15.0966,
"step": 213
},
{
"epoch": 0.12269542938025191,
"grad_norm": 238.38485717773438,
"learning_rate": 8.839499836452584e-06,
"loss": 15.0013,
"step": 214
},
{
"epoch": 0.12326877250819701,
"grad_norm": 251.5662384033203,
"learning_rate": 8.827565580803944e-06,
"loss": 15.1437,
"step": 215
},
{
"epoch": 0.12384211563614211,
"grad_norm": 250.67286682128906,
"learning_rate": 8.815578418724031e-06,
"loss": 15.0635,
"step": 216
},
{
"epoch": 0.12441545876408722,
"grad_norm": 262.172607421875,
"learning_rate": 8.803538515905102e-06,
"loss": 15.1516,
"step": 217
},
{
"epoch": 0.12498880189203232,
"grad_norm": 241.4354705810547,
"learning_rate": 8.791446038768416e-06,
"loss": 15.1259,
"step": 218
},
{
"epoch": 0.12556214501997742,
"grad_norm": 247.70347595214844,
"learning_rate": 8.779301154461945e-06,
"loss": 15.1325,
"step": 219
},
{
"epoch": 0.12613548814792253,
"grad_norm": 234.08982849121094,
"learning_rate": 8.76710403085805e-06,
"loss": 15.01,
"step": 220
},
{
"epoch": 0.12670883127586763,
"grad_norm": 245.60804748535156,
"learning_rate": 8.754854836551174e-06,
"loss": 15.0905,
"step": 221
},
{
"epoch": 0.12728217440381273,
"grad_norm": 254.1485137939453,
"learning_rate": 8.742553740855507e-06,
"loss": 15.1127,
"step": 222
},
{
"epoch": 0.12785551753175783,
"grad_norm": 238.38563537597656,
"learning_rate": 8.730200913802638e-06,
"loss": 15.0614,
"step": 223
},
{
"epoch": 0.12842886065970294,
"grad_norm": 248.24403381347656,
"learning_rate": 8.717796526139218e-06,
"loss": 15.0618,
"step": 224
},
{
"epoch": 0.12900220378764804,
"grad_norm": 246.24209594726562,
"learning_rate": 8.70534074932459e-06,
"loss": 15.0455,
"step": 225
},
{
"epoch": 0.12957554691559314,
"grad_norm": 237.25454711914062,
"learning_rate": 8.692833755528426e-06,
"loss": 15.0558,
"step": 226
},
{
"epoch": 0.13014889004353825,
"grad_norm": 246.07095336914062,
"learning_rate": 8.680275717628336e-06,
"loss": 15.0205,
"step": 227
},
{
"epoch": 0.13072223317148335,
"grad_norm": 242.9619903564453,
"learning_rate": 8.667666809207495e-06,
"loss": 15.142,
"step": 228
},
{
"epoch": 0.13129557629942845,
"grad_norm": 242.89532470703125,
"learning_rate": 8.655007204552228e-06,
"loss": 15.0199,
"step": 229
},
{
"epoch": 0.13186891942737355,
"grad_norm": 254.67239379882812,
"learning_rate": 8.64229707864961e-06,
"loss": 15.088,
"step": 230
},
{
"epoch": 0.13244226255531866,
"grad_norm": 240.30972290039062,
"learning_rate": 8.629536607185042e-06,
"loss": 15.1037,
"step": 231
},
{
"epoch": 0.13301560568326376,
"grad_norm": 250.13949584960938,
"learning_rate": 8.616725966539831e-06,
"loss": 15.0717,
"step": 232
},
{
"epoch": 0.13358894881120886,
"grad_norm": 237.8465576171875,
"learning_rate": 8.60386533378874e-06,
"loss": 15.05,
"step": 233
},
{
"epoch": 0.13416229193915397,
"grad_norm": 244.82315063476562,
"learning_rate": 8.590954886697554e-06,
"loss": 15.101,
"step": 234
},
{
"epoch": 0.13473563506709907,
"grad_norm": 237.0764923095703,
"learning_rate": 8.577994803720605e-06,
"loss": 15.0211,
"step": 235
},
{
"epoch": 0.13530897819504417,
"grad_norm": 241.53424072265625,
"learning_rate": 8.564985263998327e-06,
"loss": 15.0495,
"step": 236
},
{
"epoch": 0.13588232132298927,
"grad_norm": 232.84251403808594,
"learning_rate": 8.551926447354759e-06,
"loss": 14.9438,
"step": 237
},
{
"epoch": 0.13645566445093438,
"grad_norm": 242.9515838623047,
"learning_rate": 8.538818534295076e-06,
"loss": 15.028,
"step": 238
},
{
"epoch": 0.13702900757887948,
"grad_norm": 248.1451416015625,
"learning_rate": 8.525661706003083e-06,
"loss": 15.0705,
"step": 239
},
{
"epoch": 0.13760235070682458,
"grad_norm": 253.95338439941406,
"learning_rate": 8.512456144338717e-06,
"loss": 15.097,
"step": 240
},
{
"epoch": 0.1381756938347697,
"grad_norm": 243.39439392089844,
"learning_rate": 8.499202031835532e-06,
"loss": 15.0549,
"step": 241
},
{
"epoch": 0.1387490369627148,
"grad_norm": 247.52191162109375,
"learning_rate": 8.485899551698166e-06,
"loss": 15.1328,
"step": 242
},
{
"epoch": 0.1393223800906599,
"grad_norm": 236.9805908203125,
"learning_rate": 8.472548887799833e-06,
"loss": 15.0222,
"step": 243
},
{
"epoch": 0.139895723218605,
"grad_norm": 239.95289611816406,
"learning_rate": 8.45915022467975e-06,
"loss": 15.0937,
"step": 244
},
{
"epoch": 0.1404690663465501,
"grad_norm": 254.6737060546875,
"learning_rate": 8.445703747540614e-06,
"loss": 15.06,
"step": 245
},
{
"epoch": 0.1410424094744952,
"grad_norm": 247.96080017089844,
"learning_rate": 8.43220964224602e-06,
"loss": 15.0793,
"step": 246
},
{
"epoch": 0.1416157526024403,
"grad_norm": 241.89292907714844,
"learning_rate": 8.418668095317912e-06,
"loss": 15.0339,
"step": 247
},
{
"epoch": 0.1421890957303854,
"grad_norm": 245.7707061767578,
"learning_rate": 8.405079293933986e-06,
"loss": 15.0187,
"step": 248
},
{
"epoch": 0.1427624388583305,
"grad_norm": 244.69918823242188,
"learning_rate": 8.391443425925118e-06,
"loss": 14.9716,
"step": 249
},
{
"epoch": 0.1433357819862756,
"grad_norm": 247.9059295654297,
"learning_rate": 8.37776067977276e-06,
"loss": 15.0733,
"step": 250
},
{
"epoch": 0.14390912511422072,
"grad_norm": 238.36126708984375,
"learning_rate": 8.36403124460633e-06,
"loss": 14.9511,
"step": 251
},
{
"epoch": 0.1444824682421658,
"grad_norm": 239.73057556152344,
"learning_rate": 8.350255310200611e-06,
"loss": 15.0428,
"step": 252
},
{
"epoch": 0.1450558113701109,
"grad_norm": 230.3163299560547,
"learning_rate": 8.336433066973122e-06,
"loss": 14.997,
"step": 253
},
{
"epoch": 0.145629154498056,
"grad_norm": 237.23446655273438,
"learning_rate": 8.322564705981476e-06,
"loss": 14.973,
"step": 254
},
{
"epoch": 0.1462024976260011,
"grad_norm": 230.16468811035156,
"learning_rate": 8.308650418920751e-06,
"loss": 15.0256,
"step": 255
},
{
"epoch": 0.1467758407539462,
"grad_norm": 233.07260131835938,
"learning_rate": 8.294690398120843e-06,
"loss": 14.945,
"step": 256
},
{
"epoch": 0.1473491838818913,
"grad_norm": 240.12940979003906,
"learning_rate": 8.280684836543794e-06,
"loss": 14.9974,
"step": 257
},
{
"epoch": 0.1479225270098364,
"grad_norm": 243.80523681640625,
"learning_rate": 8.266633927781135e-06,
"loss": 15.0705,
"step": 258
},
{
"epoch": 0.1484958701377815,
"grad_norm": 244.0867462158203,
"learning_rate": 8.25253786605121e-06,
"loss": 15.0141,
"step": 259
},
{
"epoch": 0.1490692132657266,
"grad_norm": 247.33151245117188,
"learning_rate": 8.238396846196483e-06,
"loss": 15.0344,
"step": 260
},
{
"epoch": 0.14964255639367172,
"grad_norm": 250.08273315429688,
"learning_rate": 8.224211063680854e-06,
"loss": 14.9305,
"step": 261
},
{
"epoch": 0.15021589952161682,
"grad_norm": 257.2216491699219,
"learning_rate": 8.209980714586955e-06,
"loss": 14.9938,
"step": 262
},
{
"epoch": 0.15078924264956192,
"grad_norm": 238.5064239501953,
"learning_rate": 8.195705995613436e-06,
"loss": 15.0064,
"step": 263
},
{
"epoch": 0.15136258577750702,
"grad_norm": 232.31155395507812,
"learning_rate": 8.181387104072252e-06,
"loss": 14.9449,
"step": 264
},
{
"epoch": 0.15193592890545213,
"grad_norm": 227.94029235839844,
"learning_rate": 8.167024237885927e-06,
"loss": 14.8337,
"step": 265
},
{
"epoch": 0.15250927203339723,
"grad_norm": 240.96424865722656,
"learning_rate": 8.152617595584827e-06,
"loss": 15.0939,
"step": 266
},
{
"epoch": 0.15308261516134233,
"grad_norm": 237.70541381835938,
"learning_rate": 8.138167376304411e-06,
"loss": 14.909,
"step": 267
},
{
"epoch": 0.15365595828928744,
"grad_norm": 233.10304260253906,
"learning_rate": 8.123673779782481e-06,
"loss": 14.9505,
"step": 268
},
{
"epoch": 0.15422930141723254,
"grad_norm": 240.28123474121094,
"learning_rate": 8.10913700635642e-06,
"loss": 14.9045,
"step": 269
},
{
"epoch": 0.15480264454517764,
"grad_norm": 233.11627197265625,
"learning_rate": 8.094557256960419e-06,
"loss": 14.9225,
"step": 270
},
{
"epoch": 0.15537598767312275,
"grad_norm": 244.76693725585938,
"learning_rate": 8.079934733122708e-06,
"loss": 14.9717,
"step": 271
},
{
"epoch": 0.15594933080106785,
"grad_norm": 240.1745147705078,
"learning_rate": 8.065269636962765e-06,
"loss": 15.0261,
"step": 272
},
{
"epoch": 0.15652267392901295,
"grad_norm": 246.17298889160156,
"learning_rate": 8.05056217118852e-06,
"loss": 14.9933,
"step": 273
},
{
"epoch": 0.15709601705695805,
"grad_norm": 244.8893585205078,
"learning_rate": 8.035812539093557e-06,
"loss": 15.0351,
"step": 274
},
{
"epoch": 0.15766936018490316,
"grad_norm": 244.82302856445312,
"learning_rate": 8.021020944554305e-06,
"loss": 14.9442,
"step": 275
},
{
"epoch": 0.15824270331284826,
"grad_norm": 243.9514923095703,
"learning_rate": 8.006187592027215e-06,
"loss": 14.9621,
"step": 276
},
{
"epoch": 0.15881604644079336,
"grad_norm": 230.46597290039062,
"learning_rate": 7.991312686545939e-06,
"loss": 14.8903,
"step": 277
},
{
"epoch": 0.15938938956873847,
"grad_norm": 249.49838256835938,
"learning_rate": 7.976396433718492e-06,
"loss": 14.9777,
"step": 278
},
{
"epoch": 0.15996273269668357,
"grad_norm": 243.70870971679688,
"learning_rate": 7.961439039724413e-06,
"loss": 15.0312,
"step": 279
},
{
"epoch": 0.16053607582462867,
"grad_norm": 230.47183227539062,
"learning_rate": 7.946440711311913e-06,
"loss": 14.9198,
"step": 280
},
{
"epoch": 0.16110941895257377,
"grad_norm": 236.70082092285156,
"learning_rate": 7.931401655795021e-06,
"loss": 14.9223,
"step": 281
},
{
"epoch": 0.16168276208051888,
"grad_norm": 234.71527099609375,
"learning_rate": 7.916322081050708e-06,
"loss": 14.9188,
"step": 282
},
{
"epoch": 0.16225610520846398,
"grad_norm": 235.15675354003906,
"learning_rate": 7.90120219551603e-06,
"loss": 14.9309,
"step": 283
},
{
"epoch": 0.16282944833640908,
"grad_norm": 229.10137939453125,
"learning_rate": 7.88604220818523e-06,
"loss": 14.8877,
"step": 284
},
{
"epoch": 0.16340279146435419,
"grad_norm": 237.02072143554688,
"learning_rate": 7.870842328606863e-06,
"loss": 15.0099,
"step": 285
},
{
"epoch": 0.1639761345922993,
"grad_norm": 236.75343322753906,
"learning_rate": 7.85560276688089e-06,
"loss": 14.8486,
"step": 286
},
{
"epoch": 0.1645494777202444,
"grad_norm": 233.91934204101562,
"learning_rate": 7.84032373365578e-06,
"loss": 14.897,
"step": 287
},
{
"epoch": 0.1651228208481895,
"grad_norm": 230.60330200195312,
"learning_rate": 7.825005440125595e-06,
"loss": 14.9105,
"step": 288
},
{
"epoch": 0.1656961639761346,
"grad_norm": 235.03897094726562,
"learning_rate": 7.809648098027067e-06,
"loss": 14.994,
"step": 289
},
{
"epoch": 0.1662695071040797,
"grad_norm": 233.12936401367188,
"learning_rate": 7.794251919636687e-06,
"loss": 14.9753,
"step": 290
},
{
"epoch": 0.1668428502320248,
"grad_norm": 231.44244384765625,
"learning_rate": 7.778817117767748e-06,
"loss": 14.994,
"step": 291
},
{
"epoch": 0.1674161933599699,
"grad_norm": 228.026611328125,
"learning_rate": 7.76334390576742e-06,
"loss": 14.9458,
"step": 292
},
{
"epoch": 0.167989536487915,
"grad_norm": 231.06951904296875,
"learning_rate": 7.747832497513797e-06,
"loss": 14.9729,
"step": 293
},
{
"epoch": 0.1685628796158601,
"grad_norm": 239.63568115234375,
"learning_rate": 7.732283107412938e-06,
"loss": 14.9274,
"step": 294
},
{
"epoch": 0.16913622274380521,
"grad_norm": 220.87551879882812,
"learning_rate": 7.71669595039591e-06,
"loss": 14.9327,
"step": 295
},
{
"epoch": 0.16970956587175032,
"grad_norm": 214.35519409179688,
"learning_rate": 7.701071241915804e-06,
"loss": 14.8955,
"step": 296
},
{
"epoch": 0.17028290899969542,
"grad_norm": 229.36508178710938,
"learning_rate": 7.685409197944768e-06,
"loss": 14.903,
"step": 297
},
{
"epoch": 0.17085625212764052,
"grad_norm": 224.3822021484375,
"learning_rate": 7.669710034971025e-06,
"loss": 14.9543,
"step": 298
},
{
"epoch": 0.17142959525558563,
"grad_norm": 228.7742462158203,
"learning_rate": 7.653973969995866e-06,
"loss": 14.9022,
"step": 299
},
{
"epoch": 0.17200293838353073,
"grad_norm": 228.00148010253906,
"learning_rate": 7.638201220530664e-06,
"loss": 14.8216,
"step": 300
},
{
"epoch": 0.17257628151147583,
"grad_norm": 216.36854553222656,
"learning_rate": 7.622392004593862e-06,
"loss": 14.8582,
"step": 301
},
{
"epoch": 0.17314962463942093,
"grad_norm": 221.77157592773438,
"learning_rate": 7.60654654070796e-06,
"loss": 14.9161,
"step": 302
},
{
"epoch": 0.17372296776736604,
"grad_norm": 223.14935302734375,
"learning_rate": 7.59066504789649e-06,
"loss": 14.9057,
"step": 303
},
{
"epoch": 0.17429631089531114,
"grad_norm": 219.07955932617188,
"learning_rate": 7.574747745681e-06,
"loss": 14.8669,
"step": 304
},
{
"epoch": 0.17486965402325624,
"grad_norm": 226.2716827392578,
"learning_rate": 7.558794854078006e-06,
"loss": 14.8365,
"step": 305
},
{
"epoch": 0.17544299715120135,
"grad_norm": 243.78469848632812,
"learning_rate": 7.542806593595963e-06,
"loss": 14.9013,
"step": 306
},
{
"epoch": 0.17601634027914642,
"grad_norm": 214.9324188232422,
"learning_rate": 7.526783185232208e-06,
"loss": 14.7971,
"step": 307
},
{
"epoch": 0.17658968340709152,
"grad_norm": 217.00315856933594,
"learning_rate": 7.51072485046991e-06,
"loss": 14.8198,
"step": 308
},
{
"epoch": 0.17716302653503663,
"grad_norm": 230.4095001220703,
"learning_rate": 7.494631811275008e-06,
"loss": 14.8371,
"step": 309
},
{
"epoch": 0.17773636966298173,
"grad_norm": 236.96478271484375,
"learning_rate": 7.478504290093138e-06,
"loss": 14.8929,
"step": 310
},
{
"epoch": 0.17830971279092683,
"grad_norm": 222.3997344970703,
"learning_rate": 7.462342509846571e-06,
"loss": 14.9166,
"step": 311
},
{
"epoch": 0.17888305591887194,
"grad_norm": 230.09429931640625,
"learning_rate": 7.446146693931111e-06,
"loss": 14.8528,
"step": 312
},
{
"epoch": 0.17945639904681704,
"grad_norm": 222.97035217285156,
"learning_rate": 7.42991706621303e-06,
"loss": 14.8732,
"step": 313
},
{
"epoch": 0.18002974217476214,
"grad_norm": 226.1836700439453,
"learning_rate": 7.413653851025959e-06,
"loss": 14.7586,
"step": 314
},
{
"epoch": 0.18060308530270724,
"grad_norm": 222.79554748535156,
"learning_rate": 7.397357273167789e-06,
"loss": 14.8905,
"step": 315
},
{
"epoch": 0.18117642843065235,
"grad_norm": 230.41497802734375,
"learning_rate": 7.381027557897568e-06,
"loss": 14.7686,
"step": 316
},
{
"epoch": 0.18174977155859745,
"grad_norm": 207.27145385742188,
"learning_rate": 7.364664930932385e-06,
"loss": 14.8313,
"step": 317
},
{
"epoch": 0.18232311468654255,
"grad_norm": 224.7344207763672,
"learning_rate": 7.348269618444248e-06,
"loss": 14.7949,
"step": 318
},
{
"epoch": 0.18289645781448766,
"grad_norm": 227.63766479492188,
"learning_rate": 7.331841847056962e-06,
"loss": 14.7235,
"step": 319
},
{
"epoch": 0.18346980094243276,
"grad_norm": 214.8011932373047,
"learning_rate": 7.315381843842995e-06,
"loss": 14.7835,
"step": 320
},
{
"epoch": 0.18404314407037786,
"grad_norm": 217.45916748046875,
"learning_rate": 7.298889836320334e-06,
"loss": 14.8223,
"step": 321
},
{
"epoch": 0.18461648719832296,
"grad_norm": 221.9704132080078,
"learning_rate": 7.282366052449351e-06,
"loss": 14.871,
"step": 322
},
{
"epoch": 0.18518983032626807,
"grad_norm": 222.32537841796875,
"learning_rate": 7.265810720629643e-06,
"loss": 14.8007,
"step": 323
},
{
"epoch": 0.18576317345421317,
"grad_norm": 227.74884033203125,
"learning_rate": 7.249224069696876e-06,
"loss": 14.8103,
"step": 324
},
{
"epoch": 0.18633651658215827,
"grad_norm": 219.51748657226562,
"learning_rate": 7.232606328919627e-06,
"loss": 14.7732,
"step": 325
},
{
"epoch": 0.18690985971010338,
"grad_norm": 217.20773315429688,
"learning_rate": 7.215957727996208e-06,
"loss": 14.7552,
"step": 326
},
{
"epoch": 0.18748320283804848,
"grad_norm": 209.55203247070312,
"learning_rate": 7.199278497051498e-06,
"loss": 14.7018,
"step": 327
},
{
"epoch": 0.18805654596599358,
"grad_norm": 214.1074676513672,
"learning_rate": 7.182568866633757e-06,
"loss": 14.7702,
"step": 328
},
{
"epoch": 0.18862988909393869,
"grad_norm": 229.8917236328125,
"learning_rate": 7.16582906771144e-06,
"loss": 14.7891,
"step": 329
},
{
"epoch": 0.1892032322218838,
"grad_norm": 217.26866149902344,
"learning_rate": 7.149059331670009e-06,
"loss": 14.7741,
"step": 330
},
{
"epoch": 0.1897765753498289,
"grad_norm": 210.88253784179688,
"learning_rate": 7.132259890308726e-06,
"loss": 14.715,
"step": 331
},
{
"epoch": 0.190349918477774,
"grad_norm": 231.31787109375,
"learning_rate": 7.115430975837457e-06,
"loss": 14.7906,
"step": 332
},
{
"epoch": 0.1909232616057191,
"grad_norm": 224.2241973876953,
"learning_rate": 7.098572820873461e-06,
"loss": 14.7868,
"step": 333
},
{
"epoch": 0.1914966047336642,
"grad_norm": 220.03028869628906,
"learning_rate": 7.081685658438173e-06,
"loss": 14.7613,
"step": 334
},
{
"epoch": 0.1920699478616093,
"grad_norm": 213.73609924316406,
"learning_rate": 7.064769721953975e-06,
"loss": 14.7319,
"step": 335
},
{
"epoch": 0.1926432909895544,
"grad_norm": 223.67706298828125,
"learning_rate": 7.047825245240989e-06,
"loss": 14.8181,
"step": 336
},
{
"epoch": 0.1932166341174995,
"grad_norm": 207.2647705078125,
"learning_rate": 7.030852462513827e-06,
"loss": 14.7896,
"step": 337
},
{
"epoch": 0.1937899772454446,
"grad_norm": 213.09942626953125,
"learning_rate": 7.013851608378359e-06,
"loss": 14.727,
"step": 338
},
{
"epoch": 0.19436332037338971,
"grad_norm": 229.02037048339844,
"learning_rate": 6.9968229178284775e-06,
"loss": 14.7458,
"step": 339
},
{
"epoch": 0.19493666350133482,
"grad_norm": 222.83213806152344,
"learning_rate": 6.979766626242839e-06,
"loss": 14.7459,
"step": 340
},
{
"epoch": 0.19551000662927992,
"grad_norm": 220.72726440429688,
"learning_rate": 6.9626829693816135e-06,
"loss": 14.7011,
"step": 341
},
{
"epoch": 0.19608334975722502,
"grad_norm": 214.8241424560547,
"learning_rate": 6.945572183383229e-06,
"loss": 14.7731,
"step": 342
},
{
"epoch": 0.19665669288517013,
"grad_norm": 222.2461700439453,
"learning_rate": 6.928434504761106e-06,
"loss": 14.681,
"step": 343
},
{
"epoch": 0.19723003601311523,
"grad_norm": 223.89845275878906,
"learning_rate": 6.911270170400385e-06,
"loss": 14.7092,
"step": 344
},
{
"epoch": 0.19780337914106033,
"grad_norm": 219.92869567871094,
"learning_rate": 6.894079417554657e-06,
"loss": 14.8403,
"step": 345
},
{
"epoch": 0.19837672226900543,
"grad_norm": 219.98406982421875,
"learning_rate": 6.8768624838426815e-06,
"loss": 14.7576,
"step": 346
},
{
"epoch": 0.19895006539695054,
"grad_norm": 207.61367797851562,
"learning_rate": 6.859619607245102e-06,
"loss": 14.7059,
"step": 347
},
{
"epoch": 0.19952340852489564,
"grad_norm": 206.98719787597656,
"learning_rate": 6.842351026101155e-06,
"loss": 14.6511,
"step": 348
},
{
"epoch": 0.20009675165284074,
"grad_norm": 210.80372619628906,
"learning_rate": 6.825056979105382e-06,
"loss": 14.7222,
"step": 349
},
{
"epoch": 0.20067009478078585,
"grad_norm": 213.69117736816406,
"learning_rate": 6.807737705304324e-06,
"loss": 14.7251,
"step": 350
},
{
"epoch": 0.20124343790873095,
"grad_norm": 219.47328186035156,
"learning_rate": 6.790393444093214e-06,
"loss": 14.7487,
"step": 351
},
{
"epoch": 0.20181678103667605,
"grad_norm": 214.07040405273438,
"learning_rate": 6.773024435212678e-06,
"loss": 14.6365,
"step": 352
},
{
"epoch": 0.20239012416462115,
"grad_norm": 214.93496704101562,
"learning_rate": 6.7556309187454185e-06,
"loss": 14.6673,
"step": 353
},
{
"epoch": 0.20296346729256626,
"grad_norm": 206.5713348388672,
"learning_rate": 6.738213135112884e-06,
"loss": 14.7522,
"step": 354
},
{
"epoch": 0.20353681042051136,
"grad_norm": 210.60606384277344,
"learning_rate": 6.720771325071965e-06,
"loss": 14.6979,
"step": 355
},
{
"epoch": 0.20411015354845646,
"grad_norm": 212.65887451171875,
"learning_rate": 6.703305729711653e-06,
"loss": 14.7409,
"step": 356
},
{
"epoch": 0.20468349667640157,
"grad_norm": 216.2197723388672,
"learning_rate": 6.685816590449708e-06,
"loss": 14.7433,
"step": 357
},
{
"epoch": 0.20525683980434667,
"grad_norm": 210.51260375976562,
"learning_rate": 6.668304149029331e-06,
"loss": 14.7338,
"step": 358
},
{
"epoch": 0.20583018293229177,
"grad_norm": 210.6771697998047,
"learning_rate": 6.650768647515813e-06,
"loss": 14.7397,
"step": 359
},
{
"epoch": 0.20640352606023687,
"grad_norm": 216.00897216796875,
"learning_rate": 6.63321032829319e-06,
"loss": 14.8058,
"step": 360
},
{
"epoch": 0.20697686918818198,
"grad_norm": 206.54159545898438,
"learning_rate": 6.615629434060903e-06,
"loss": 14.6842,
"step": 361
},
{
"epoch": 0.20755021231612708,
"grad_norm": 213.61300659179688,
"learning_rate": 6.598026207830428e-06,
"loss": 14.6042,
"step": 362
},
{
"epoch": 0.20812355544407216,
"grad_norm": 217.9312744140625,
"learning_rate": 6.5804008929219284e-06,
"loss": 14.7647,
"step": 363
},
{
"epoch": 0.20869689857201726,
"grad_norm": 220.873291015625,
"learning_rate": 6.562753732960887e-06,
"loss": 14.7314,
"step": 364
},
{
"epoch": 0.20927024169996236,
"grad_norm": 223.9777069091797,
"learning_rate": 6.545084971874738e-06,
"loss": 14.7555,
"step": 365
},
{
"epoch": 0.20984358482790746,
"grad_norm": 217.828125,
"learning_rate": 6.527394853889499e-06,
"loss": 14.7245,
"step": 366
},
{
"epoch": 0.21041692795585257,
"grad_norm": 224.16778564453125,
"learning_rate": 6.5096836235263904e-06,
"loss": 14.7414,
"step": 367
},
{
"epoch": 0.21099027108379767,
"grad_norm": 216.91224670410156,
"learning_rate": 6.491951525598461e-06,
"loss": 14.6045,
"step": 368
},
{
"epoch": 0.21156361421174277,
"grad_norm": 209.5393829345703,
"learning_rate": 6.4741988052071965e-06,
"loss": 14.6805,
"step": 369
},
{
"epoch": 0.21213695733968788,
"grad_norm": 222.77627563476562,
"learning_rate": 6.45642570773914e-06,
"loss": 14.746,
"step": 370
},
{
"epoch": 0.21271030046763298,
"grad_norm": 216.05712890625,
"learning_rate": 6.438632478862495e-06,
"loss": 14.6645,
"step": 371
},
{
"epoch": 0.21328364359557808,
"grad_norm": 206.27911376953125,
"learning_rate": 6.4208193645237314e-06,
"loss": 14.6834,
"step": 372
},
{
"epoch": 0.21385698672352318,
"grad_norm": 215.7952880859375,
"learning_rate": 6.402986610944183e-06,
"loss": 14.7863,
"step": 373
},
{
"epoch": 0.2144303298514683,
"grad_norm": 212.9938201904297,
"learning_rate": 6.385134464616649e-06,
"loss": 14.7525,
"step": 374
},
{
"epoch": 0.2150036729794134,
"grad_norm": 200.97154235839844,
"learning_rate": 6.367263172301985e-06,
"loss": 14.649,
"step": 375
},
{
"epoch": 0.2155770161073585,
"grad_norm": 222.55943298339844,
"learning_rate": 6.3493729810256895e-06,
"loss": 14.7005,
"step": 376
},
{
"epoch": 0.2161503592353036,
"grad_norm": 220.4983367919922,
"learning_rate": 6.331464138074493e-06,
"loss": 14.7608,
"step": 377
},
{
"epoch": 0.2167237023632487,
"grad_norm": 213.09095764160156,
"learning_rate": 6.313536890992935e-06,
"loss": 14.5953,
"step": 378
},
{
"epoch": 0.2172970454911938,
"grad_norm": 211.12828063964844,
"learning_rate": 6.29559148757995e-06,
"loss": 14.6474,
"step": 379
},
{
"epoch": 0.2178703886191389,
"grad_norm": 222.33969116210938,
"learning_rate": 6.277628175885437e-06,
"loss": 14.7324,
"step": 380
},
{
"epoch": 0.218443731747084,
"grad_norm": 209.89747619628906,
"learning_rate": 6.2596472042068275e-06,
"loss": 14.622,
"step": 381
},
{
"epoch": 0.2190170748750291,
"grad_norm": 219.60342407226562,
"learning_rate": 6.241648821085666e-06,
"loss": 14.6497,
"step": 382
},
{
"epoch": 0.2195904180029742,
"grad_norm": 221.1376953125,
"learning_rate": 6.223633275304157e-06,
"loss": 14.7248,
"step": 383
},
{
"epoch": 0.22016376113091932,
"grad_norm": 217.87611389160156,
"learning_rate": 6.205600815881741e-06,
"loss": 14.7175,
"step": 384
},
{
"epoch": 0.22073710425886442,
"grad_norm": 210.81985473632812,
"learning_rate": 6.187551692071648e-06,
"loss": 14.7288,
"step": 385
},
{
"epoch": 0.22131044738680952,
"grad_norm": 218.46176147460938,
"learning_rate": 6.1694861533574445e-06,
"loss": 14.6473,
"step": 386
},
{
"epoch": 0.22188379051475463,
"grad_norm": 211.04080200195312,
"learning_rate": 6.1514044494496e-06,
"loss": 14.728,
"step": 387
},
{
"epoch": 0.22245713364269973,
"grad_norm": 214.88522338867188,
"learning_rate": 6.133306830282021e-06,
"loss": 14.5944,
"step": 388
},
{
"epoch": 0.22303047677064483,
"grad_norm": 214.91293334960938,
"learning_rate": 6.115193546008602e-06,
"loss": 14.6812,
"step": 389
},
{
"epoch": 0.22360381989858993,
"grad_norm": 218.2246856689453,
"learning_rate": 6.097064846999774e-06,
"loss": 14.6757,
"step": 390
},
{
"epoch": 0.22417716302653504,
"grad_norm": 209.82518005371094,
"learning_rate": 6.078920983839032e-06,
"loss": 14.6697,
"step": 391
},
{
"epoch": 0.22475050615448014,
"grad_norm": 219.08514404296875,
"learning_rate": 6.060762207319479e-06,
"loss": 14.663,
"step": 392
},
{
"epoch": 0.22532384928242524,
"grad_norm": 224.61856079101562,
"learning_rate": 6.042588768440358e-06,
"loss": 14.6559,
"step": 393
},
{
"epoch": 0.22589719241037035,
"grad_norm": 216.43028259277344,
"learning_rate": 6.024400918403581e-06,
"loss": 14.6848,
"step": 394
},
{
"epoch": 0.22647053553831545,
"grad_norm": 217.51576232910156,
"learning_rate": 6.006198908610261e-06,
"loss": 14.6885,
"step": 395
},
{
"epoch": 0.22704387866626055,
"grad_norm": 194.5399627685547,
"learning_rate": 5.987982990657229e-06,
"loss": 14.589,
"step": 396
},
{
"epoch": 0.22761722179420565,
"grad_norm": 214.05809020996094,
"learning_rate": 5.9697534163335645e-06,
"loss": 14.6364,
"step": 397
},
{
"epoch": 0.22819056492215076,
"grad_norm": 212.87832641601562,
"learning_rate": 5.95151043761711e-06,
"loss": 14.7834,
"step": 398
},
{
"epoch": 0.22876390805009586,
"grad_norm": 203.37142944335938,
"learning_rate": 5.933254306670995e-06,
"loss": 14.5586,
"step": 399
},
{
"epoch": 0.22933725117804096,
"grad_norm": 217.5912322998047,
"learning_rate": 5.914985275840135e-06,
"loss": 14.7334,
"step": 400
},
{
"epoch": 0.22991059430598607,
"grad_norm": 201.1334991455078,
"learning_rate": 5.896703597647765e-06,
"loss": 14.6263,
"step": 401
},
{
"epoch": 0.23048393743393117,
"grad_norm": 206.36265563964844,
"learning_rate": 5.878409524791931e-06,
"loss": 14.6252,
"step": 402
},
{
"epoch": 0.23105728056187627,
"grad_norm": 213.31422424316406,
"learning_rate": 5.8601033101420055e-06,
"loss": 14.718,
"step": 403
},
{
"epoch": 0.23163062368982137,
"grad_norm": 213.38626098632812,
"learning_rate": 5.841785206735192e-06,
"loss": 14.5727,
"step": 404
},
{
"epoch": 0.23220396681776648,
"grad_norm": 189.9121551513672,
"learning_rate": 5.823455467773027e-06,
"loss": 14.5197,
"step": 405
},
{
"epoch": 0.23277730994571158,
"grad_norm": 198.7380828857422,
"learning_rate": 5.805114346617874e-06,
"loss": 14.5848,
"step": 406
},
{
"epoch": 0.23335065307365668,
"grad_norm": 212.24783325195312,
"learning_rate": 5.786762096789431e-06,
"loss": 14.6107,
"step": 407
},
{
"epoch": 0.23392399620160179,
"grad_norm": 219.87643432617188,
"learning_rate": 5.768398971961221e-06,
"loss": 14.7092,
"step": 408
},
{
"epoch": 0.2344973393295469,
"grad_norm": 206.90530395507812,
"learning_rate": 5.750025225957086e-06,
"loss": 14.5481,
"step": 409
},
{
"epoch": 0.235070682457492,
"grad_norm": 202.2758331298828,
"learning_rate": 5.731641112747679e-06,
"loss": 14.6385,
"step": 410
},
{
"epoch": 0.2356440255854371,
"grad_norm": 215.7546844482422,
"learning_rate": 5.713246886446954e-06,
"loss": 14.5969,
"step": 411
},
{
"epoch": 0.2362173687133822,
"grad_norm": 208.98550415039062,
"learning_rate": 5.694842801308651e-06,
"loss": 14.6304,
"step": 412
},
{
"epoch": 0.2367907118413273,
"grad_norm": 207.6781005859375,
"learning_rate": 5.676429111722786e-06,
"loss": 14.6177,
"step": 413
},
{
"epoch": 0.2373640549692724,
"grad_norm": 201.2788543701172,
"learning_rate": 5.6580060722121325e-06,
"loss": 14.5918,
"step": 414
},
{
"epoch": 0.2379373980972175,
"grad_norm": 213.871826171875,
"learning_rate": 5.639573937428699e-06,
"loss": 14.5532,
"step": 415
},
{
"epoch": 0.2385107412251626,
"grad_norm": 196.2823486328125,
"learning_rate": 5.621132962150216e-06,
"loss": 14.5558,
"step": 416
},
{
"epoch": 0.2390840843531077,
"grad_norm": 199.7825927734375,
"learning_rate": 5.6026834012766155e-06,
"loss": 14.5658,
"step": 417
},
{
"epoch": 0.23965742748105281,
"grad_norm": 192.31263732910156,
"learning_rate": 5.584225509826497e-06,
"loss": 14.5083,
"step": 418
},
{
"epoch": 0.2402307706089979,
"grad_norm": 201.0004119873047,
"learning_rate": 5.565759542933612e-06,
"loss": 14.6235,
"step": 419
},
{
"epoch": 0.240804113736943,
"grad_norm": 197.17825317382812,
"learning_rate": 5.547285755843334e-06,
"loss": 14.5237,
"step": 420
},
{
"epoch": 0.2413774568648881,
"grad_norm": 209.01620483398438,
"learning_rate": 5.5288044039091335e-06,
"loss": 14.596,
"step": 421
},
{
"epoch": 0.2419507999928332,
"grad_norm": 204.07884216308594,
"learning_rate": 5.510315742589042e-06,
"loss": 14.617,
"step": 422
},
{
"epoch": 0.2425241431207783,
"grad_norm": 208.53651428222656,
"learning_rate": 5.491820027442126e-06,
"loss": 14.6785,
"step": 423
},
{
"epoch": 0.2430974862487234,
"grad_norm": 199.32315063476562,
"learning_rate": 5.473317514124958e-06,
"loss": 14.512,
"step": 424
},
{
"epoch": 0.2436708293766685,
"grad_norm": 206.72837829589844,
"learning_rate": 5.454808458388069e-06,
"loss": 14.6038,
"step": 425
},
{
"epoch": 0.2442441725046136,
"grad_norm": 196.9921112060547,
"learning_rate": 5.436293116072431e-06,
"loss": 14.5451,
"step": 426
},
{
"epoch": 0.2448175156325587,
"grad_norm": 207.21530151367188,
"learning_rate": 5.417771743105908e-06,
"loss": 14.551,
"step": 427
},
{
"epoch": 0.24539085876050382,
"grad_norm": 201.5275115966797,
"learning_rate": 5.399244595499721e-06,
"loss": 14.5262,
"step": 428
},
{
"epoch": 0.24596420188844892,
"grad_norm": 204.6480712890625,
"learning_rate": 5.380711929344915e-06,
"loss": 14.4846,
"step": 429
},
{
"epoch": 0.24653754501639402,
"grad_norm": 194.9892120361328,
"learning_rate": 5.362174000808813e-06,
"loss": 14.5942,
"step": 430
},
{
"epoch": 0.24711088814433912,
"grad_norm": 199.96047973632812,
"learning_rate": 5.343631066131476e-06,
"loss": 14.6091,
"step": 431
},
{
"epoch": 0.24768423127228423,
"grad_norm": 212.93307495117188,
"learning_rate": 5.325083381622165e-06,
"loss": 14.5455,
"step": 432
},
{
"epoch": 0.24825757440022933,
"grad_norm": 194.9511260986328,
"learning_rate": 5.30653120365579e-06,
"loss": 14.5044,
"step": 433
},
{
"epoch": 0.24883091752817443,
"grad_norm": 200.14315795898438,
"learning_rate": 5.28797478866938e-06,
"loss": 14.6439,
"step": 434
},
{
"epoch": 0.24940426065611954,
"grad_norm": 197.60902404785156,
"learning_rate": 5.269414393158523e-06,
"loss": 14.5721,
"step": 435
},
{
"epoch": 0.24997760378406464,
"grad_norm": 192.06671142578125,
"learning_rate": 5.250850273673831e-06,
"loss": 14.5812,
"step": 436
},
{
"epoch": 0.25055094691200974,
"grad_norm": 189.84034729003906,
"learning_rate": 5.232282686817392e-06,
"loss": 14.6002,
"step": 437
},
{
"epoch": 0.25112429003995484,
"grad_norm": 195.87533569335938,
"learning_rate": 5.213711889239214e-06,
"loss": 14.4797,
"step": 438
},
{
"epoch": 0.25169763316789995,
"grad_norm": 186.12464904785156,
"learning_rate": 5.195138137633695e-06,
"loss": 14.5298,
"step": 439
},
{
"epoch": 0.25227097629584505,
"grad_norm": 189.66380310058594,
"learning_rate": 5.17656168873606e-06,
"loss": 14.4488,
"step": 440
},
{
"epoch": 0.25284431942379015,
"grad_norm": 196.0492401123047,
"learning_rate": 5.157982799318817e-06,
"loss": 14.5268,
"step": 441
},
{
"epoch": 0.25341766255173526,
"grad_norm": 192.8926239013672,
"learning_rate": 5.139401726188208e-06,
"loss": 14.555,
"step": 442
},
{
"epoch": 0.25399100567968036,
"grad_norm": 201.20632934570312,
"learning_rate": 5.120818726180662e-06,
"loss": 14.4914,
"step": 443
},
{
"epoch": 0.25456434880762546,
"grad_norm": 200.86207580566406,
"learning_rate": 5.1022340561592396e-06,
"loss": 14.5471,
"step": 444
},
{
"epoch": 0.25513769193557057,
"grad_norm": 203.37557983398438,
"learning_rate": 5.083647973010085e-06,
"loss": 14.5438,
"step": 445
},
{
"epoch": 0.25571103506351567,
"grad_norm": 193.55697631835938,
"learning_rate": 5.065060733638878e-06,
"loss": 14.4965,
"step": 446
},
{
"epoch": 0.25628437819146077,
"grad_norm": 195.2728271484375,
"learning_rate": 5.046472594967279e-06,
"loss": 14.5723,
"step": 447
},
{
"epoch": 0.2568577213194059,
"grad_norm": 197.77818298339844,
"learning_rate": 5.027883813929374e-06,
"loss": 14.4772,
"step": 448
},
{
"epoch": 0.257431064447351,
"grad_norm": 196.05238342285156,
"learning_rate": 5.009294647468137e-06,
"loss": 14.5655,
"step": 449
},
{
"epoch": 0.2580044075752961,
"grad_norm": 194.8416290283203,
"learning_rate": 4.990705352531864e-06,
"loss": 14.5701,
"step": 450
},
{
"epoch": 0.2585777507032412,
"grad_norm": 193.21575927734375,
"learning_rate": 4.972116186070626e-06,
"loss": 14.5292,
"step": 451
},
{
"epoch": 0.2591510938311863,
"grad_norm": 189.819580078125,
"learning_rate": 4.953527405032723e-06,
"loss": 14.4925,
"step": 452
},
{
"epoch": 0.2597244369591314,
"grad_norm": 194.4360809326172,
"learning_rate": 4.934939266361123e-06,
"loss": 14.4965,
"step": 453
},
{
"epoch": 0.2602977800870765,
"grad_norm": 198.99061584472656,
"learning_rate": 4.916352026989914e-06,
"loss": 14.484,
"step": 454
},
{
"epoch": 0.2608711232150216,
"grad_norm": 193.81446838378906,
"learning_rate": 4.897765943840761e-06,
"loss": 14.527,
"step": 455
},
{
"epoch": 0.2614444663429667,
"grad_norm": 189.20484924316406,
"learning_rate": 4.87918127381934e-06,
"loss": 14.4895,
"step": 456
},
{
"epoch": 0.2620178094709118,
"grad_norm": 190.6830291748047,
"learning_rate": 4.860598273811793e-06,
"loss": 14.4308,
"step": 457
},
{
"epoch": 0.2625911525988569,
"grad_norm": 189.31912231445312,
"learning_rate": 4.842017200681185e-06,
"loss": 14.5519,
"step": 458
},
{
"epoch": 0.263164495726802,
"grad_norm": 188.8474578857422,
"learning_rate": 4.823438311263943e-06,
"loss": 14.4147,
"step": 459
},
{
"epoch": 0.2637378388547471,
"grad_norm": 192.68406677246094,
"learning_rate": 4.804861862366306e-06,
"loss": 14.471,
"step": 460
},
{
"epoch": 0.2643111819826922,
"grad_norm": 188.2942657470703,
"learning_rate": 4.786288110760787e-06,
"loss": 14.5164,
"step": 461
},
{
"epoch": 0.2648845251106373,
"grad_norm": 191.98313903808594,
"learning_rate": 4.767717313182611e-06,
"loss": 14.3865,
"step": 462
},
{
"epoch": 0.2654578682385824,
"grad_norm": 197.7642364501953,
"learning_rate": 4.74914972632617e-06,
"loss": 14.6162,
"step": 463
},
{
"epoch": 0.2660312113665275,
"grad_norm": 199.40097045898438,
"learning_rate": 4.730585606841479e-06,
"loss": 14.4812,
"step": 464
},
{
"epoch": 0.2666045544944726,
"grad_norm": 191.48199462890625,
"learning_rate": 4.7120252113306216e-06,
"loss": 14.445,
"step": 465
},
{
"epoch": 0.2671778976224177,
"grad_norm": 195.9621124267578,
"learning_rate": 4.693468796344211e-06,
"loss": 14.4466,
"step": 466
},
{
"epoch": 0.26775124075036283,
"grad_norm": 193.89913940429688,
"learning_rate": 4.6749166183778375e-06,
"loss": 14.4653,
"step": 467
},
{
"epoch": 0.26832458387830793,
"grad_norm": 185.12448120117188,
"learning_rate": 4.656368933868525e-06,
"loss": 14.4962,
"step": 468
},
{
"epoch": 0.26889792700625303,
"grad_norm": 188.17173767089844,
"learning_rate": 4.637825999191189e-06,
"loss": 14.4282,
"step": 469
},
{
"epoch": 0.26947127013419814,
"grad_norm": 179.78378295898438,
"learning_rate": 4.619288070655086e-06,
"loss": 14.4112,
"step": 470
},
{
"epoch": 0.27004461326214324,
"grad_norm": 184.57598876953125,
"learning_rate": 4.600755404500281e-06,
"loss": 14.4972,
"step": 471
},
{
"epoch": 0.27061795639008834,
"grad_norm": 190.61500549316406,
"learning_rate": 4.582228256894093e-06,
"loss": 14.4585,
"step": 472
},
{
"epoch": 0.27119129951803345,
"grad_norm": 191.43365478515625,
"learning_rate": 4.56370688392757e-06,
"loss": 14.3984,
"step": 473
},
{
"epoch": 0.27176464264597855,
"grad_norm": 189.6448211669922,
"learning_rate": 4.545191541611933e-06,
"loss": 14.4596,
"step": 474
},
{
"epoch": 0.27233798577392365,
"grad_norm": 199.958740234375,
"learning_rate": 4.526682485875044e-06,
"loss": 14.5124,
"step": 475
},
{
"epoch": 0.27291132890186875,
"grad_norm": 187.1591033935547,
"learning_rate": 4.508179972557875e-06,
"loss": 14.4502,
"step": 476
},
{
"epoch": 0.27348467202981386,
"grad_norm": 212.7876739501953,
"learning_rate": 4.489684257410959e-06,
"loss": 14.4952,
"step": 477
},
{
"epoch": 0.27405801515775896,
"grad_norm": 197.2154541015625,
"learning_rate": 4.471195596090867e-06,
"loss": 14.5392,
"step": 478
},
{
"epoch": 0.27463135828570406,
"grad_norm": 193.218505859375,
"learning_rate": 4.452714244156667e-06,
"loss": 14.5221,
"step": 479
},
{
"epoch": 0.27520470141364917,
"grad_norm": 195.3530731201172,
"learning_rate": 4.434240457066388e-06,
"loss": 14.4045,
"step": 480
},
{
"epoch": 0.27577804454159427,
"grad_norm": 191.08155822753906,
"learning_rate": 4.415774490173504e-06,
"loss": 14.363,
"step": 481
},
{
"epoch": 0.2763513876695394,
"grad_norm": 205.4665985107422,
"learning_rate": 4.397316598723385e-06,
"loss": 14.5536,
"step": 482
},
{
"epoch": 0.2769247307974845,
"grad_norm": 202.93714904785156,
"learning_rate": 4.3788670378497836e-06,
"loss": 14.4253,
"step": 483
},
{
"epoch": 0.2774980739254296,
"grad_norm": 199.6490020751953,
"learning_rate": 4.360426062571303e-06,
"loss": 14.5529,
"step": 484
},
{
"epoch": 0.2780714170533747,
"grad_norm": 198.09494018554688,
"learning_rate": 4.341993927787871e-06,
"loss": 14.4701,
"step": 485
},
{
"epoch": 0.2786447601813198,
"grad_norm": 194.7907257080078,
"learning_rate": 4.323570888277215e-06,
"loss": 14.4267,
"step": 486
},
{
"epoch": 0.2792181033092649,
"grad_norm": 204.8142852783203,
"learning_rate": 4.305157198691351e-06,
"loss": 14.4313,
"step": 487
},
{
"epoch": 0.27979144643721,
"grad_norm": 199.0611572265625,
"learning_rate": 4.286753113553049e-06,
"loss": 14.4615,
"step": 488
},
{
"epoch": 0.2803647895651551,
"grad_norm": 188.00750732421875,
"learning_rate": 4.268358887252322e-06,
"loss": 14.3631,
"step": 489
},
{
"epoch": 0.2809381326931002,
"grad_norm": 191.73825073242188,
"learning_rate": 4.249974774042915e-06,
"loss": 14.4741,
"step": 490
},
{
"epoch": 0.2815114758210453,
"grad_norm": 188.29759216308594,
"learning_rate": 4.231601028038781e-06,
"loss": 14.446,
"step": 491
},
{
"epoch": 0.2820848189489904,
"grad_norm": 197.5531768798828,
"learning_rate": 4.2132379032105695e-06,
"loss": 14.4405,
"step": 492
},
{
"epoch": 0.2826581620769355,
"grad_norm": 190.16937255859375,
"learning_rate": 4.194885653382128e-06,
"loss": 14.3906,
"step": 493
},
{
"epoch": 0.2832315052048806,
"grad_norm": 188.8497772216797,
"learning_rate": 4.176544532226974e-06,
"loss": 14.4415,
"step": 494
},
{
"epoch": 0.2838048483328257,
"grad_norm": 186.59799194335938,
"learning_rate": 4.158214793264808e-06,
"loss": 14.4197,
"step": 495
},
{
"epoch": 0.2843781914607708,
"grad_norm": 184.35581970214844,
"learning_rate": 4.139896689857995e-06,
"loss": 14.3536,
"step": 496
},
{
"epoch": 0.2849515345887159,
"grad_norm": 199.46311950683594,
"learning_rate": 4.121590475208071e-06,
"loss": 14.4356,
"step": 497
},
{
"epoch": 0.285524877716661,
"grad_norm": 200.33966064453125,
"learning_rate": 4.1032964023522366e-06,
"loss": 14.4552,
"step": 498
},
{
"epoch": 0.2860982208446061,
"grad_norm": 189.87977600097656,
"learning_rate": 4.085014724159866e-06,
"loss": 14.3919,
"step": 499
},
{
"epoch": 0.2866715639725512,
"grad_norm": 196.80152893066406,
"learning_rate": 4.066745693329008e-06,
"loss": 14.5031,
"step": 500
},
{
"epoch": 0.2872449071004963,
"grad_norm": 193.42140197753906,
"learning_rate": 4.0484895623828906e-06,
"loss": 14.4403,
"step": 501
},
{
"epoch": 0.28781825022844143,
"grad_norm": 194.4940948486328,
"learning_rate": 4.030246583666437e-06,
"loss": 14.4734,
"step": 502
},
{
"epoch": 0.2883915933563865,
"grad_norm": 192.37107849121094,
"learning_rate": 4.012017009342773e-06,
"loss": 14.4512,
"step": 503
},
{
"epoch": 0.2889649364843316,
"grad_norm": 181.2819366455078,
"learning_rate": 3.99380109138974e-06,
"loss": 14.4906,
"step": 504
},
{
"epoch": 0.2895382796122767,
"grad_norm": 199.6365509033203,
"learning_rate": 3.97559908159642e-06,
"loss": 14.4517,
"step": 505
},
{
"epoch": 0.2901116227402218,
"grad_norm": 182.9588165283203,
"learning_rate": 3.9574112315596425e-06,
"loss": 14.4496,
"step": 506
},
{
"epoch": 0.2906849658681669,
"grad_norm": 183.8024139404297,
"learning_rate": 3.9392377926805226e-06,
"loss": 14.403,
"step": 507
},
{
"epoch": 0.291258308996112,
"grad_norm": 195.86257934570312,
"learning_rate": 3.92107901616097e-06,
"loss": 14.3586,
"step": 508
},
{
"epoch": 0.2918316521240571,
"grad_norm": 193.3267822265625,
"learning_rate": 3.9029351530002264e-06,
"loss": 14.4352,
"step": 509
},
{
"epoch": 0.2924049952520022,
"grad_norm": 189.76773071289062,
"learning_rate": 3.884806453991399e-06,
"loss": 14.3374,
"step": 510
},
{
"epoch": 0.2929783383799473,
"grad_norm": 190.036865234375,
"learning_rate": 3.866693169717982e-06,
"loss": 14.3719,
"step": 511
},
{
"epoch": 0.2935516815078924,
"grad_norm": 187.96229553222656,
"learning_rate": 3.848595550550401e-06,
"loss": 14.4594,
"step": 512
},
{
"epoch": 0.2941250246358375,
"grad_norm": 189.76959228515625,
"learning_rate": 3.830513846642556e-06,
"loss": 14.3997,
"step": 513
},
{
"epoch": 0.2946983677637826,
"grad_norm": 188.51016235351562,
"learning_rate": 3.8124483079283546e-06,
"loss": 14.3977,
"step": 514
},
{
"epoch": 0.2952717108917277,
"grad_norm": 182.27618408203125,
"learning_rate": 3.7943991841182586e-06,
"loss": 14.3342,
"step": 515
},
{
"epoch": 0.2958450540196728,
"grad_norm": 194.53384399414062,
"learning_rate": 3.7763667246958447e-06,
"loss": 14.3353,
"step": 516
},
{
"epoch": 0.2964183971476179,
"grad_norm": 186.60391235351562,
"learning_rate": 3.758351178914336e-06,
"loss": 14.3462,
"step": 517
},
{
"epoch": 0.296991740275563,
"grad_norm": 208.77110290527344,
"learning_rate": 3.7403527957931716e-06,
"loss": 14.4527,
"step": 518
},
{
"epoch": 0.2975650834035081,
"grad_norm": 192.8214111328125,
"learning_rate": 3.7223718241145646e-06,
"loss": 14.3971,
"step": 519
},
{
"epoch": 0.2981384265314532,
"grad_norm": 185.70005798339844,
"learning_rate": 3.7044085124200517e-06,
"loss": 14.3432,
"step": 520
},
{
"epoch": 0.29871176965939833,
"grad_norm": 196.39981079101562,
"learning_rate": 3.6864631090070656e-06,
"loss": 14.5102,
"step": 521
},
{
"epoch": 0.29928511278734343,
"grad_norm": 187.2920684814453,
"learning_rate": 3.668535861925509e-06,
"loss": 14.4782,
"step": 522
},
{
"epoch": 0.29985845591528854,
"grad_norm": 186.00146484375,
"learning_rate": 3.650627018974312e-06,
"loss": 14.4494,
"step": 523
},
{
"epoch": 0.30043179904323364,
"grad_norm": 189.43801879882812,
"learning_rate": 3.632736827698015e-06,
"loss": 14.3908,
"step": 524
},
{
"epoch": 0.30100514217117874,
"grad_norm": 201.06126403808594,
"learning_rate": 3.6148655353833518e-06,
"loss": 14.458,
"step": 525
},
{
"epoch": 0.30157848529912384,
"grad_norm": 190.3157501220703,
"learning_rate": 3.5970133890558184e-06,
"loss": 14.3939,
"step": 526
},
{
"epoch": 0.30215182842706895,
"grad_norm": 203.18019104003906,
"learning_rate": 3.5791806354762702e-06,
"loss": 14.4642,
"step": 527
},
{
"epoch": 0.30272517155501405,
"grad_norm": 186.1299285888672,
"learning_rate": 3.5613675211375066e-06,
"loss": 14.3403,
"step": 528
},
{
"epoch": 0.30329851468295915,
"grad_norm": 188.37765502929688,
"learning_rate": 3.5435742922608618e-06,
"loss": 14.3578,
"step": 529
},
{
"epoch": 0.30387185781090426,
"grad_norm": 184.9286346435547,
"learning_rate": 3.525801194792805e-06,
"loss": 14.3543,
"step": 530
},
{
"epoch": 0.30444520093884936,
"grad_norm": 193.71884155273438,
"learning_rate": 3.508048474401541e-06,
"loss": 14.3639,
"step": 531
},
{
"epoch": 0.30501854406679446,
"grad_norm": 187.72390747070312,
"learning_rate": 3.4903163764736104e-06,
"loss": 14.2493,
"step": 532
},
{
"epoch": 0.30559188719473956,
"grad_norm": 195.72886657714844,
"learning_rate": 3.4726051461105016e-06,
"loss": 14.4045,
"step": 533
},
{
"epoch": 0.30616523032268467,
"grad_norm": 185.08929443359375,
"learning_rate": 3.4549150281252635e-06,
"loss": 14.4521,
"step": 534
},
{
"epoch": 0.30673857345062977,
"grad_norm": 182.60292053222656,
"learning_rate": 3.437246267039115e-06,
"loss": 14.3866,
"step": 535
},
{
"epoch": 0.3073119165785749,
"grad_norm": 181.70509338378906,
"learning_rate": 3.419599107078073e-06,
"loss": 14.4036,
"step": 536
},
{
"epoch": 0.30788525970652,
"grad_norm": 187.29672241210938,
"learning_rate": 3.401973792169574e-06,
"loss": 14.3734,
"step": 537
},
{
"epoch": 0.3084586028344651,
"grad_norm": 187.84115600585938,
"learning_rate": 3.384370565939098e-06,
"loss": 14.4167,
"step": 538
},
{
"epoch": 0.3090319459624102,
"grad_norm": 200.47061157226562,
"learning_rate": 3.3667896717068105e-06,
"loss": 14.4517,
"step": 539
},
{
"epoch": 0.3096052890903553,
"grad_norm": 192.6443634033203,
"learning_rate": 3.34923135248419e-06,
"loss": 14.4143,
"step": 540
},
{
"epoch": 0.3101786322183004,
"grad_norm": 189.818115234375,
"learning_rate": 3.33169585097067e-06,
"loss": 14.3478,
"step": 541
},
{
"epoch": 0.3107519753462455,
"grad_norm": 185.73080444335938,
"learning_rate": 3.314183409550293e-06,
"loss": 14.3765,
"step": 542
},
{
"epoch": 0.3113253184741906,
"grad_norm": 183.9041290283203,
"learning_rate": 3.2966942702883494e-06,
"loss": 14.3506,
"step": 543
},
{
"epoch": 0.3118986616021357,
"grad_norm": 188.9761505126953,
"learning_rate": 3.279228674928035e-06,
"loss": 14.4349,
"step": 544
},
{
"epoch": 0.3124720047300808,
"grad_norm": 190.45909118652344,
"learning_rate": 3.261786864887117e-06,
"loss": 14.3562,
"step": 545
},
{
"epoch": 0.3130453478580259,
"grad_norm": 191.3506317138672,
"learning_rate": 3.244369081254585e-06,
"loss": 14.2781,
"step": 546
},
{
"epoch": 0.313618690985971,
"grad_norm": 181.74490356445312,
"learning_rate": 3.226975564787322e-06,
"loss": 14.3264,
"step": 547
},
{
"epoch": 0.3141920341139161,
"grad_norm": 186.11990356445312,
"learning_rate": 3.209606555906788e-06,
"loss": 14.3599,
"step": 548
},
{
"epoch": 0.3147653772418612,
"grad_norm": 192.1141357421875,
"learning_rate": 3.192262294695679e-06,
"loss": 14.3444,
"step": 549
},
{
"epoch": 0.3153387203698063,
"grad_norm": 193.52890014648438,
"learning_rate": 3.174943020894618e-06,
"loss": 14.4323,
"step": 550
},
{
"epoch": 0.3159120634977514,
"grad_norm": 183.9879150390625,
"learning_rate": 3.1576489738988457e-06,
"loss": 14.2539,
"step": 551
},
{
"epoch": 0.3164854066256965,
"grad_norm": 186.39529418945312,
"learning_rate": 3.140380392754901e-06,
"loss": 14.3633,
"step": 552
},
{
"epoch": 0.3170587497536416,
"grad_norm": 193.56439208984375,
"learning_rate": 3.12313751615732e-06,
"loss": 14.3256,
"step": 553
},
{
"epoch": 0.3176320928815867,
"grad_norm": 187.15281677246094,
"learning_rate": 3.1059205824453446e-06,
"loss": 14.3763,
"step": 554
},
{
"epoch": 0.31820543600953183,
"grad_norm": 188.94200134277344,
"learning_rate": 3.0887298295996183e-06,
"loss": 14.3864,
"step": 555
},
{
"epoch": 0.31877877913747693,
"grad_norm": 186.75950622558594,
"learning_rate": 3.0715654952388957e-06,
"loss": 14.3803,
"step": 556
},
{
"epoch": 0.31935212226542203,
"grad_norm": 193.4385223388672,
"learning_rate": 3.054427816616773e-06,
"loss": 14.2965,
"step": 557
},
{
"epoch": 0.31992546539336714,
"grad_norm": 188.6703338623047,
"learning_rate": 3.0373170306183885e-06,
"loss": 14.4114,
"step": 558
},
{
"epoch": 0.32049880852131224,
"grad_norm": 194.2964630126953,
"learning_rate": 3.020233373757162e-06,
"loss": 14.2351,
"step": 559
},
{
"epoch": 0.32107215164925734,
"grad_norm": 204.58041381835938,
"learning_rate": 3.0031770821715233e-06,
"loss": 14.3925,
"step": 560
},
{
"epoch": 0.32164549477720245,
"grad_norm": 182.63665771484375,
"learning_rate": 2.9861483916216404e-06,
"loss": 14.371,
"step": 561
},
{
"epoch": 0.32221883790514755,
"grad_norm": 201.17764282226562,
"learning_rate": 2.969147537486175e-06,
"loss": 14.301,
"step": 562
},
{
"epoch": 0.32279218103309265,
"grad_norm": 187.64376831054688,
"learning_rate": 2.952174754759012e-06,
"loss": 14.3037,
"step": 563
},
{
"epoch": 0.32336552416103775,
"grad_norm": 182.01651000976562,
"learning_rate": 2.935230278046025e-06,
"loss": 14.2326,
"step": 564
},
{
"epoch": 0.32393886728898286,
"grad_norm": 184.65011596679688,
"learning_rate": 2.9183143415618297e-06,
"loss": 14.3121,
"step": 565
},
{
"epoch": 0.32451221041692796,
"grad_norm": 172.1057891845703,
"learning_rate": 2.9014271791265403e-06,
"loss": 14.203,
"step": 566
},
{
"epoch": 0.32508555354487306,
"grad_norm": 178.24777221679688,
"learning_rate": 2.8845690241625437e-06,
"loss": 14.3961,
"step": 567
},
{
"epoch": 0.32565889667281817,
"grad_norm": 198.43179321289062,
"learning_rate": 2.867740109691277e-06,
"loss": 14.3644,
"step": 568
},
{
"epoch": 0.32623223980076327,
"grad_norm": 184.53721618652344,
"learning_rate": 2.850940668329996e-06,
"loss": 14.3736,
"step": 569
},
{
"epoch": 0.32680558292870837,
"grad_norm": 186.57337951660156,
"learning_rate": 2.8341709322885624e-06,
"loss": 14.2914,
"step": 570
},
{
"epoch": 0.3273789260566535,
"grad_norm": 194.31634521484375,
"learning_rate": 2.817431133366246e-06,
"loss": 14.3647,
"step": 571
},
{
"epoch": 0.3279522691845986,
"grad_norm": 189.49636840820312,
"learning_rate": 2.800721502948506e-06,
"loss": 14.4111,
"step": 572
},
{
"epoch": 0.3285256123125437,
"grad_norm": 194.70204162597656,
"learning_rate": 2.7840422720037943e-06,
"loss": 14.4538,
"step": 573
},
{
"epoch": 0.3290989554404888,
"grad_norm": 191.64688110351562,
"learning_rate": 2.767393671080376e-06,
"loss": 14.2899,
"step": 574
},
{
"epoch": 0.3296722985684339,
"grad_norm": 193.7047576904297,
"learning_rate": 2.7507759303031257e-06,
"loss": 14.3198,
"step": 575
},
{
"epoch": 0.330245641696379,
"grad_norm": 189.0587158203125,
"learning_rate": 2.7341892793703594e-06,
"loss": 14.3457,
"step": 576
},
{
"epoch": 0.3308189848243241,
"grad_norm": 188.7035675048828,
"learning_rate": 2.7176339475506515e-06,
"loss": 14.2817,
"step": 577
},
{
"epoch": 0.3313923279522692,
"grad_norm": 184.22344970703125,
"learning_rate": 2.7011101636796677e-06,
"loss": 14.3146,
"step": 578
},
{
"epoch": 0.3319656710802143,
"grad_norm": 180.2777557373047,
"learning_rate": 2.6846181561570085e-06,
"loss": 14.3799,
"step": 579
},
{
"epoch": 0.3325390142081594,
"grad_norm": 185.93838500976562,
"learning_rate": 2.668158152943039e-06,
"loss": 14.3632,
"step": 580
},
{
"epoch": 0.3331123573361045,
"grad_norm": 183.86941528320312,
"learning_rate": 2.651730381555754e-06,
"loss": 14.3327,
"step": 581
},
{
"epoch": 0.3336857004640496,
"grad_norm": 184.0933074951172,
"learning_rate": 2.635335069067617e-06,
"loss": 14.3807,
"step": 582
},
{
"epoch": 0.3342590435919947,
"grad_norm": 183.67532348632812,
"learning_rate": 2.618972442102432e-06,
"loss": 14.4402,
"step": 583
},
{
"epoch": 0.3348323867199398,
"grad_norm": 185.25009155273438,
"learning_rate": 2.602642726832212e-06,
"loss": 14.3258,
"step": 584
},
{
"epoch": 0.3354057298478849,
"grad_norm": 186.76087951660156,
"learning_rate": 2.5863461489740403e-06,
"loss": 14.2503,
"step": 585
},
{
"epoch": 0.33597907297583,
"grad_norm": 183.74209594726562,
"learning_rate": 2.57008293378697e-06,
"loss": 14.282,
"step": 586
},
{
"epoch": 0.3365524161037751,
"grad_norm": 185.21743774414062,
"learning_rate": 2.553853306068888e-06,
"loss": 14.3058,
"step": 587
},
{
"epoch": 0.3371257592317202,
"grad_norm": 180.64405822753906,
"learning_rate": 2.5376574901534303e-06,
"loss": 14.2191,
"step": 588
},
{
"epoch": 0.3376991023596653,
"grad_norm": 197.49221801757812,
"learning_rate": 2.5214957099068613e-06,
"loss": 14.2684,
"step": 589
},
{
"epoch": 0.33827244548761043,
"grad_norm": 178.35708618164062,
"learning_rate": 2.5053681887249916e-06,
"loss": 14.2358,
"step": 590
},
{
"epoch": 0.33884578861555553,
"grad_norm": 181.4188995361328,
"learning_rate": 2.4892751495300893e-06,
"loss": 14.3204,
"step": 591
},
{
"epoch": 0.33941913174350063,
"grad_norm": 178.8732452392578,
"learning_rate": 2.4732168147677927e-06,
"loss": 14.2609,
"step": 592
},
{
"epoch": 0.33999247487144574,
"grad_norm": 191.7628631591797,
"learning_rate": 2.4571934064040364e-06,
"loss": 14.2528,
"step": 593
},
{
"epoch": 0.34056581799939084,
"grad_norm": 193.52305603027344,
"learning_rate": 2.4412051459219945e-06,
"loss": 14.3341,
"step": 594
},
{
"epoch": 0.34113916112733594,
"grad_norm": 198.21897888183594,
"learning_rate": 2.425252254319002e-06,
"loss": 14.3828,
"step": 595
},
{
"epoch": 0.34171250425528105,
"grad_norm": 191.85609436035156,
"learning_rate": 2.4093349521035105e-06,
"loss": 14.3309,
"step": 596
},
{
"epoch": 0.34228584738322615,
"grad_norm": 185.22528076171875,
"learning_rate": 2.3934534592920416e-06,
"loss": 14.2623,
"step": 597
},
{
"epoch": 0.34285919051117125,
"grad_norm": 188.74754333496094,
"learning_rate": 2.3776079954061385e-06,
"loss": 14.4269,
"step": 598
},
{
"epoch": 0.34343253363911636,
"grad_norm": 178.31825256347656,
"learning_rate": 2.3617987794693358e-06,
"loss": 14.2489,
"step": 599
},
{
"epoch": 0.34400587676706146,
"grad_norm": 188.00209045410156,
"learning_rate": 2.3460260300041355e-06,
"loss": 14.3401,
"step": 600
},
{
"epoch": 0.34457921989500656,
"grad_norm": 191.75465393066406,
"learning_rate": 2.3302899650289773e-06,
"loss": 14.3273,
"step": 601
},
{
"epoch": 0.34515256302295166,
"grad_norm": 185.55166625976562,
"learning_rate": 2.314590802055232e-06,
"loss": 14.3695,
"step": 602
},
{
"epoch": 0.34572590615089677,
"grad_norm": 177.90130615234375,
"learning_rate": 2.2989287580841985e-06,
"loss": 14.3113,
"step": 603
},
{
"epoch": 0.34629924927884187,
"grad_norm": 189.20179748535156,
"learning_rate": 2.2833040496040925e-06,
"loss": 14.2244,
"step": 604
},
{
"epoch": 0.346872592406787,
"grad_norm": 192.3074493408203,
"learning_rate": 2.267716892587062e-06,
"loss": 14.28,
"step": 605
},
{
"epoch": 0.3474459355347321,
"grad_norm": 192.26055908203125,
"learning_rate": 2.252167502486205e-06,
"loss": 14.2554,
"step": 606
},
{
"epoch": 0.3480192786626772,
"grad_norm": 184.66305541992188,
"learning_rate": 2.2366560942325833e-06,
"loss": 14.3175,
"step": 607
},
{
"epoch": 0.3485926217906223,
"grad_norm": 186.08566284179688,
"learning_rate": 2.2211828822322547e-06,
"loss": 14.2586,
"step": 608
},
{
"epoch": 0.3491659649185674,
"grad_norm": 183.10336303710938,
"learning_rate": 2.205748080363316e-06,
"loss": 14.3051,
"step": 609
},
{
"epoch": 0.3497393080465125,
"grad_norm": 188.01463317871094,
"learning_rate": 2.190351901972935e-06,
"loss": 14.2597,
"step": 610
},
{
"epoch": 0.3503126511744576,
"grad_norm": 186.796630859375,
"learning_rate": 2.1749945598744076e-06,
"loss": 14.3121,
"step": 611
},
{
"epoch": 0.3508859943024027,
"grad_norm": 197.26966857910156,
"learning_rate": 2.159676266344222e-06,
"loss": 14.3272,
"step": 612
},
{
"epoch": 0.3514593374303478,
"grad_norm": 188.6767578125,
"learning_rate": 2.144397233119112e-06,
"loss": 14.2799,
"step": 613
},
{
"epoch": 0.35203268055829284,
"grad_norm": 185.7920684814453,
"learning_rate": 2.1291576713931382e-06,
"loss": 14.3654,
"step": 614
},
{
"epoch": 0.35260602368623795,
"grad_norm": 183.85186767578125,
"learning_rate": 2.1139577918147715e-06,
"loss": 14.2435,
"step": 615
},
{
"epoch": 0.35317936681418305,
"grad_norm": 188.81492614746094,
"learning_rate": 2.0987978044839707e-06,
"loss": 14.3787,
"step": 616
},
{
"epoch": 0.35375270994212815,
"grad_norm": 181.99166870117188,
"learning_rate": 2.0836779189492925e-06,
"loss": 14.3489,
"step": 617
},
{
"epoch": 0.35432605307007325,
"grad_norm": 182.6253204345703,
"learning_rate": 2.068598344204981e-06,
"loss": 14.2816,
"step": 618
},
{
"epoch": 0.35489939619801836,
"grad_norm": 178.6793975830078,
"learning_rate": 2.053559288688086e-06,
"loss": 14.2392,
"step": 619
},
{
"epoch": 0.35547273932596346,
"grad_norm": 190.26219177246094,
"learning_rate": 2.0385609602755878e-06,
"loss": 14.2875,
"step": 620
},
{
"epoch": 0.35604608245390856,
"grad_norm": 199.85971069335938,
"learning_rate": 2.02360356628151e-06,
"loss": 14.3167,
"step": 621
},
{
"epoch": 0.35661942558185367,
"grad_norm": 199.51605224609375,
"learning_rate": 2.0086873134540626e-06,
"loss": 14.336,
"step": 622
},
{
"epoch": 0.35719276870979877,
"grad_norm": 183.92247009277344,
"learning_rate": 1.9938124079727874e-06,
"loss": 14.2201,
"step": 623
},
{
"epoch": 0.35776611183774387,
"grad_norm": 193.48175048828125,
"learning_rate": 1.9789790554456977e-06,
"loss": 14.2868,
"step": 624
},
{
"epoch": 0.358339454965689,
"grad_norm": 189.4330291748047,
"learning_rate": 1.9641874609064443e-06,
"loss": 14.2538,
"step": 625
},
{
"epoch": 0.3589127980936341,
"grad_norm": 182.5979461669922,
"learning_rate": 1.9494378288114816e-06,
"loss": 14.2463,
"step": 626
},
{
"epoch": 0.3594861412215792,
"grad_norm": 177.77850341796875,
"learning_rate": 1.9347303630372373e-06,
"loss": 14.1946,
"step": 627
},
{
"epoch": 0.3600594843495243,
"grad_norm": 182.85313415527344,
"learning_rate": 1.9200652668772924e-06,
"loss": 14.2852,
"step": 628
},
{
"epoch": 0.3606328274774694,
"grad_norm": 189.149169921875,
"learning_rate": 1.9054427430395828e-06,
"loss": 14.2522,
"step": 629
},
{
"epoch": 0.3612061706054145,
"grad_norm": 186.2698211669922,
"learning_rate": 1.890862993643583e-06,
"loss": 14.2526,
"step": 630
},
{
"epoch": 0.3617795137333596,
"grad_norm": 188.8157196044922,
"learning_rate": 1.8763262202175204e-06,
"loss": 14.2772,
"step": 631
},
{
"epoch": 0.3623528568613047,
"grad_norm": 184.87147521972656,
"learning_rate": 1.8618326236955908e-06,
"loss": 14.3395,
"step": 632
},
{
"epoch": 0.3629261999892498,
"grad_norm": 185.856201171875,
"learning_rate": 1.8473824044151762e-06,
"loss": 14.2998,
"step": 633
},
{
"epoch": 0.3634995431171949,
"grad_norm": 184.26248168945312,
"learning_rate": 1.8329757621140748e-06,
"loss": 14.2654,
"step": 634
},
{
"epoch": 0.36407288624514,
"grad_norm": 186.35105895996094,
"learning_rate": 1.81861289592775e-06,
"loss": 14.2294,
"step": 635
},
{
"epoch": 0.3646462293730851,
"grad_norm": 187.1624298095703,
"learning_rate": 1.8042940043865658e-06,
"loss": 14.3037,
"step": 636
},
{
"epoch": 0.3652195725010302,
"grad_norm": 176.15463256835938,
"learning_rate": 1.7900192854130465e-06,
"loss": 14.2271,
"step": 637
},
{
"epoch": 0.3657929156289753,
"grad_norm": 188.59449768066406,
"learning_rate": 1.7757889363191484e-06,
"loss": 14.3419,
"step": 638
},
{
"epoch": 0.3663662587569204,
"grad_norm": 180.50051879882812,
"learning_rate": 1.7616031538035189e-06,
"loss": 14.2815,
"step": 639
},
{
"epoch": 0.3669396018848655,
"grad_norm": 185.34474182128906,
"learning_rate": 1.7474621339487925e-06,
"loss": 14.2534,
"step": 640
},
{
"epoch": 0.3675129450128106,
"grad_norm": 184.1910858154297,
"learning_rate": 1.7333660722188667e-06,
"loss": 14.2397,
"step": 641
},
{
"epoch": 0.3680862881407557,
"grad_norm": 185.2908477783203,
"learning_rate": 1.7193151634562071e-06,
"loss": 14.2306,
"step": 642
},
{
"epoch": 0.3686596312687008,
"grad_norm": 183.8131103515625,
"learning_rate": 1.7053096018791588e-06,
"loss": 14.2843,
"step": 643
},
{
"epoch": 0.36923297439664593,
"grad_norm": 189.00628662109375,
"learning_rate": 1.691349581079249e-06,
"loss": 14.1944,
"step": 644
},
{
"epoch": 0.36980631752459103,
"grad_norm": 189.68801879882812,
"learning_rate": 1.6774352940185269e-06,
"loss": 14.2894,
"step": 645
},
{
"epoch": 0.37037966065253614,
"grad_norm": 193.29290771484375,
"learning_rate": 1.663566933026879e-06,
"loss": 14.3125,
"step": 646
},
{
"epoch": 0.37095300378048124,
"grad_norm": 189.4978790283203,
"learning_rate": 1.6497446897993885e-06,
"loss": 14.1912,
"step": 647
},
{
"epoch": 0.37152634690842634,
"grad_norm": 187.17823791503906,
"learning_rate": 1.6359687553936714e-06,
"loss": 14.2728,
"step": 648
},
{
"epoch": 0.37209969003637144,
"grad_norm": 180.1759033203125,
"learning_rate": 1.6222393202272414e-06,
"loss": 14.2409,
"step": 649
},
{
"epoch": 0.37267303316431655,
"grad_norm": 175.7593536376953,
"learning_rate": 1.6085565740748825e-06,
"loss": 14.1765,
"step": 650
},
{
"epoch": 0.37324637629226165,
"grad_norm": 183.71810913085938,
"learning_rate": 1.5949207060660138e-06,
"loss": 14.2563,
"step": 651
},
{
"epoch": 0.37381971942020675,
"grad_norm": 185.6693572998047,
"learning_rate": 1.581331904682089e-06,
"loss": 14.3579,
"step": 652
},
{
"epoch": 0.37439306254815186,
"grad_norm": 189.27444458007812,
"learning_rate": 1.5677903577539806e-06,
"loss": 14.2853,
"step": 653
},
{
"epoch": 0.37496640567609696,
"grad_norm": 190.42837524414062,
"learning_rate": 1.5542962524593869e-06,
"loss": 14.2187,
"step": 654
},
{
"epoch": 0.37553974880404206,
"grad_norm": 177.54698181152344,
"learning_rate": 1.54084977532025e-06,
"loss": 14.1745,
"step": 655
},
{
"epoch": 0.37611309193198716,
"grad_norm": 183.06019592285156,
"learning_rate": 1.5274511122001684e-06,
"loss": 14.2742,
"step": 656
},
{
"epoch": 0.37668643505993227,
"grad_norm": 190.93809509277344,
"learning_rate": 1.5141004483018323e-06,
"loss": 14.3287,
"step": 657
},
{
"epoch": 0.37725977818787737,
"grad_norm": 195.81625366210938,
"learning_rate": 1.5007979681644696e-06,
"loss": 14.2384,
"step": 658
},
{
"epoch": 0.3778331213158225,
"grad_norm": 187.17530822753906,
"learning_rate": 1.4875438556612836e-06,
"loss": 14.25,
"step": 659
},
{
"epoch": 0.3784064644437676,
"grad_norm": 183.16397094726562,
"learning_rate": 1.474338293996917e-06,
"loss": 14.3265,
"step": 660
},
{
"epoch": 0.3789798075717127,
"grad_norm": 177.78402709960938,
"learning_rate": 1.4611814657049257e-06,
"loss": 14.1526,
"step": 661
},
{
"epoch": 0.3795531506996578,
"grad_norm": 187.61419677734375,
"learning_rate": 1.4480735526452427e-06,
"loss": 14.2041,
"step": 662
},
{
"epoch": 0.3801264938276029,
"grad_norm": 181.4232635498047,
"learning_rate": 1.4350147360016743e-06,
"loss": 14.2766,
"step": 663
},
{
"epoch": 0.380699836955548,
"grad_norm": 185.21261596679688,
"learning_rate": 1.4220051962793952e-06,
"loss": 14.216,
"step": 664
},
{
"epoch": 0.3812731800834931,
"grad_norm": 187.9059295654297,
"learning_rate": 1.4090451133024473e-06,
"loss": 14.2696,
"step": 665
},
{
"epoch": 0.3818465232114382,
"grad_norm": 184.80746459960938,
"learning_rate": 1.3961346662112585e-06,
"loss": 14.2777,
"step": 666
},
{
"epoch": 0.3824198663393833,
"grad_norm": 178.53359985351562,
"learning_rate": 1.3832740334601692e-06,
"loss": 14.2119,
"step": 667
},
{
"epoch": 0.3829932094673284,
"grad_norm": 186.9265594482422,
"learning_rate": 1.3704633928149575e-06,
"loss": 14.278,
"step": 668
},
{
"epoch": 0.3835665525952735,
"grad_norm": 181.26290893554688,
"learning_rate": 1.3577029213503911e-06,
"loss": 14.2922,
"step": 669
},
{
"epoch": 0.3841398957232186,
"grad_norm": 182.86557006835938,
"learning_rate": 1.3449927954477732e-06,
"loss": 14.2855,
"step": 670
},
{
"epoch": 0.3847132388511637,
"grad_norm": 182.475830078125,
"learning_rate": 1.3323331907925046e-06,
"loss": 14.2958,
"step": 671
},
{
"epoch": 0.3852865819791088,
"grad_norm": 189.7706756591797,
"learning_rate": 1.319724282371664e-06,
"loss": 14.2176,
"step": 672
},
{
"epoch": 0.3858599251070539,
"grad_norm": 193.93069458007812,
"learning_rate": 1.307166244471576e-06,
"loss": 14.2117,
"step": 673
},
{
"epoch": 0.386433268234999,
"grad_norm": 179.2334442138672,
"learning_rate": 1.2946592506754097e-06,
"loss": 14.3632,
"step": 674
},
{
"epoch": 0.3870066113629441,
"grad_norm": 189.32432556152344,
"learning_rate": 1.282203473860783e-06,
"loss": 14.1928,
"step": 675
},
{
"epoch": 0.3875799544908892,
"grad_norm": 182.27935791015625,
"learning_rate": 1.2697990861973635e-06,
"loss": 14.2161,
"step": 676
},
{
"epoch": 0.3881532976188343,
"grad_norm": 181.55154418945312,
"learning_rate": 1.257446259144494e-06,
"loss": 14.2658,
"step": 677
},
{
"epoch": 0.38872664074677943,
"grad_norm": 183.76902770996094,
"learning_rate": 1.2451451634488264e-06,
"loss": 14.3169,
"step": 678
},
{
"epoch": 0.38929998387472453,
"grad_norm": 179.52069091796875,
"learning_rate": 1.2328959691419517e-06,
"loss": 14.261,
"step": 679
},
{
"epoch": 0.38987332700266963,
"grad_norm": 187.97842407226562,
"learning_rate": 1.2206988455380558e-06,
"loss": 14.1935,
"step": 680
},
{
"epoch": 0.39044667013061474,
"grad_norm": 177.58485412597656,
"learning_rate": 1.2085539612315844e-06,
"loss": 14.0745,
"step": 681
},
{
"epoch": 0.39102001325855984,
"grad_norm": 178.7311248779297,
"learning_rate": 1.1964614840949002e-06,
"loss": 14.223,
"step": 682
},
{
"epoch": 0.39159335638650494,
"grad_norm": 181.48497009277344,
"learning_rate": 1.1844215812759708e-06,
"loss": 14.1863,
"step": 683
},
{
"epoch": 0.39216669951445005,
"grad_norm": 183.38412475585938,
"learning_rate": 1.1724344191960591e-06,
"loss": 14.2664,
"step": 684
},
{
"epoch": 0.39274004264239515,
"grad_norm": 190.3087921142578,
"learning_rate": 1.1605001635474183e-06,
"loss": 14.3032,
"step": 685
},
{
"epoch": 0.39331338577034025,
"grad_norm": 179.9006805419922,
"learning_rate": 1.1486189792910024e-06,
"loss": 14.2501,
"step": 686
},
{
"epoch": 0.39388672889828535,
"grad_norm": 186.22154235839844,
"learning_rate": 1.1367910306541918e-06,
"loss": 14.1971,
"step": 687
},
{
"epoch": 0.39446007202623046,
"grad_norm": 180.23377990722656,
"learning_rate": 1.1250164811285148e-06,
"loss": 14.2892,
"step": 688
},
{
"epoch": 0.39503341515417556,
"grad_norm": 177.89480590820312,
"learning_rate": 1.1132954934673911e-06,
"loss": 14.1728,
"step": 689
},
{
"epoch": 0.39560675828212066,
"grad_norm": 187.4567108154297,
"learning_rate": 1.1016282296838887e-06,
"loss": 14.2579,
"step": 690
},
{
"epoch": 0.39618010141006577,
"grad_norm": 176.47003173828125,
"learning_rate": 1.090014851048473e-06,
"loss": 14.2398,
"step": 691
},
{
"epoch": 0.39675344453801087,
"grad_norm": 182.03118896484375,
"learning_rate": 1.078455518086784e-06,
"loss": 14.2395,
"step": 692
},
{
"epoch": 0.39732678766595597,
"grad_norm": 181.1314697265625,
"learning_rate": 1.0669503905774198e-06,
"loss": 14.1643,
"step": 693
},
{
"epoch": 0.3979001307939011,
"grad_norm": 189.62818908691406,
"learning_rate": 1.055499627549722e-06,
"loss": 14.1924,
"step": 694
},
{
"epoch": 0.3984734739218462,
"grad_norm": 180.246337890625,
"learning_rate": 1.0441033872815804e-06,
"loss": 14.2148,
"step": 695
},
{
"epoch": 0.3990468170497913,
"grad_norm": 180.3937530517578,
"learning_rate": 1.0327618272972484e-06,
"loss": 14.2263,
"step": 696
},
{
"epoch": 0.3996201601777364,
"grad_norm": 189.4615478515625,
"learning_rate": 1.0214751043651582e-06,
"loss": 14.2253,
"step": 697
},
{
"epoch": 0.4001935033056815,
"grad_norm": 177.67706298828125,
"learning_rate": 1.010243374495763e-06,
"loss": 14.1809,
"step": 698
},
{
"epoch": 0.4007668464336266,
"grad_norm": 176.24996948242188,
"learning_rate": 9.990667929393715e-07,
"loss": 14.0939,
"step": 699
},
{
"epoch": 0.4013401895615717,
"grad_norm": 184.9351806640625,
"learning_rate": 9.879455141840067e-07,
"loss": 14.3538,
"step": 700
},
{
"epoch": 0.4019135326895168,
"grad_norm": 189.4423370361328,
"learning_rate": 9.768796919532742e-07,
"loss": 14.2778,
"step": 701
},
{
"epoch": 0.4024868758174619,
"grad_norm": 190.33895874023438,
"learning_rate": 9.658694792042284e-07,
"loss": 14.3299,
"step": 702
},
{
"epoch": 0.403060218945407,
"grad_norm": 183.4825897216797,
"learning_rate": 9.549150281252633e-07,
"loss": 14.1587,
"step": 703
},
{
"epoch": 0.4036335620733521,
"grad_norm": 184.19715881347656,
"learning_rate": 9.440164901340127e-07,
"loss": 14.1235,
"step": 704
},
{
"epoch": 0.4042069052012972,
"grad_norm": 191.84231567382812,
"learning_rate": 9.331740158752495e-07,
"loss": 14.1645,
"step": 705
},
{
"epoch": 0.4047802483292423,
"grad_norm": 181.87342834472656,
"learning_rate": 9.223877552188065e-07,
"loss": 14.2719,
"step": 706
},
{
"epoch": 0.4053535914571874,
"grad_norm": 183.34930419921875,
"learning_rate": 9.116578572575091e-07,
"loss": 14.2534,
"step": 707
},
{
"epoch": 0.4059269345851325,
"grad_norm": 174.75514221191406,
"learning_rate": 9.009844703051063e-07,
"loss": 14.3114,
"step": 708
},
{
"epoch": 0.4065002777130776,
"grad_norm": 176.34121704101562,
"learning_rate": 8.903677418942292e-07,
"loss": 14.2201,
"step": 709
},
{
"epoch": 0.4070736208410227,
"grad_norm": 183.08766174316406,
"learning_rate": 8.79807818774343e-07,
"loss": 14.1528,
"step": 710
},
{
"epoch": 0.4076469639689678,
"grad_norm": 189.90757751464844,
"learning_rate": 8.693048469097293e-07,
"loss": 14.2383,
"step": 711
},
{
"epoch": 0.4082203070969129,
"grad_norm": 181.50448608398438,
"learning_rate": 8.58858971477457e-07,
"loss": 14.262,
"step": 712
},
{
"epoch": 0.40879365022485803,
"grad_norm": 178.92880249023438,
"learning_rate": 8.484703368653812e-07,
"loss": 14.1923,
"step": 713
},
{
"epoch": 0.40936699335280313,
"grad_norm": 186.92608642578125,
"learning_rate": 8.381390866701517e-07,
"loss": 14.1751,
"step": 714
},
{
"epoch": 0.40994033648074824,
"grad_norm": 183.1122589111328,
"learning_rate": 8.278653636952177e-07,
"loss": 14.2072,
"step": 715
},
{
"epoch": 0.41051367960869334,
"grad_norm": 172.70138549804688,
"learning_rate": 8.176493099488664e-07,
"loss": 14.209,
"step": 716
},
{
"epoch": 0.41108702273663844,
"grad_norm": 193.0767822265625,
"learning_rate": 8.074910666422475e-07,
"loss": 14.2055,
"step": 717
},
{
"epoch": 0.41166036586458354,
"grad_norm": 181.7238006591797,
"learning_rate": 7.973907741874287e-07,
"loss": 14.2313,
"step": 718
},
{
"epoch": 0.41223370899252865,
"grad_norm": 196.82655334472656,
"learning_rate": 7.873485721954572e-07,
"loss": 14.3521,
"step": 719
},
{
"epoch": 0.41280705212047375,
"grad_norm": 184.25498962402344,
"learning_rate": 7.773645994744222e-07,
"loss": 14.2955,
"step": 720
},
{
"epoch": 0.41338039524841885,
"grad_norm": 179.338623046875,
"learning_rate": 7.674389940275406e-07,
"loss": 14.1519,
"step": 721
},
{
"epoch": 0.41395373837636396,
"grad_norm": 179.32083129882812,
"learning_rate": 7.575718930512516e-07,
"loss": 14.2179,
"step": 722
},
{
"epoch": 0.41452708150430906,
"grad_norm": 178.83621215820312,
"learning_rate": 7.47763432933315e-07,
"loss": 14.2179,
"step": 723
},
{
"epoch": 0.41510042463225416,
"grad_norm": 184.3859100341797,
"learning_rate": 7.380137492509309e-07,
"loss": 14.2816,
"step": 724
},
{
"epoch": 0.41567376776019926,
"grad_norm": 178.84129333496094,
"learning_rate": 7.283229767688627e-07,
"loss": 14.2278,
"step": 725
},
{
"epoch": 0.4162471108881443,
"grad_norm": 171.81666564941406,
"learning_rate": 7.186912494375736e-07,
"loss": 14.1466,
"step": 726
},
{
"epoch": 0.4168204540160894,
"grad_norm": 194.59820556640625,
"learning_rate": 7.091187003913802e-07,
"loss": 14.2792,
"step": 727
},
{
"epoch": 0.4173937971440345,
"grad_norm": 180.1846160888672,
"learning_rate": 6.996054619466053e-07,
"loss": 14.1733,
"step": 728
},
{
"epoch": 0.4179671402719796,
"grad_norm": 180.338134765625,
"learning_rate": 6.901516655997536e-07,
"loss": 14.1878,
"step": 729
},
{
"epoch": 0.4185404833999247,
"grad_norm": 182.3441162109375,
"learning_rate": 6.80757442025694e-07,
"loss": 14.2232,
"step": 730
},
{
"epoch": 0.4191138265278698,
"grad_norm": 180.3588104248047,
"learning_rate": 6.714229210758516e-07,
"loss": 14.2163,
"step": 731
},
{
"epoch": 0.41968716965581493,
"grad_norm": 181.55784606933594,
"learning_rate": 6.621482317764105e-07,
"loss": 14.1579,
"step": 732
},
{
"epoch": 0.42026051278376003,
"grad_norm": 184.411376953125,
"learning_rate": 6.529335023265387e-07,
"loss": 14.2631,
"step": 733
},
{
"epoch": 0.42083385591170513,
"grad_norm": 182.96253967285156,
"learning_rate": 6.437788600966066e-07,
"loss": 14.285,
"step": 734
},
{
"epoch": 0.42140719903965024,
"grad_norm": 192.8575897216797,
"learning_rate": 6.346844316264312e-07,
"loss": 14.1554,
"step": 735
},
{
"epoch": 0.42198054216759534,
"grad_norm": 176.40582275390625,
"learning_rate": 6.256503426235277e-07,
"loss": 14.2083,
"step": 736
},
{
"epoch": 0.42255388529554044,
"grad_norm": 183.86581420898438,
"learning_rate": 6.166767179613691e-07,
"loss": 14.2304,
"step": 737
},
{
"epoch": 0.42312722842348555,
"grad_norm": 190.2710723876953,
"learning_rate": 6.077636816776611e-07,
"loss": 14.2459,
"step": 738
},
{
"epoch": 0.42370057155143065,
"grad_norm": 183.04217529296875,
"learning_rate": 5.989113569726312e-07,
"loss": 14.1955,
"step": 739
},
{
"epoch": 0.42427391467937575,
"grad_norm": 176.5095672607422,
"learning_rate": 5.901198662073188e-07,
"loss": 14.2403,
"step": 740
},
{
"epoch": 0.42484725780732085,
"grad_norm": 175.92588806152344,
"learning_rate": 5.813893309018881e-07,
"loss": 14.2281,
"step": 741
},
{
"epoch": 0.42542060093526596,
"grad_norm": 190.41502380371094,
"learning_rate": 5.727198717339511e-07,
"loss": 14.239,
"step": 742
},
{
"epoch": 0.42599394406321106,
"grad_norm": 179.48741149902344,
"learning_rate": 5.641116085368931e-07,
"loss": 14.2565,
"step": 743
},
{
"epoch": 0.42656728719115616,
"grad_norm": 195.33184814453125,
"learning_rate": 5.555646602982207e-07,
"loss": 14.3216,
"step": 744
},
{
"epoch": 0.42714063031910127,
"grad_norm": 185.87525939941406,
"learning_rate": 5.470791451579172e-07,
"loss": 14.242,
"step": 745
},
{
"epoch": 0.42771397344704637,
"grad_norm": 188.23599243164062,
"learning_rate": 5.386551804068063e-07,
"loss": 14.2882,
"step": 746
},
{
"epoch": 0.4282873165749915,
"grad_norm": 177.58998107910156,
"learning_rate": 5.302928824849335e-07,
"loss": 14.2356,
"step": 747
},
{
"epoch": 0.4288606597029366,
"grad_norm": 186.3286895751953,
"learning_rate": 5.219923669799587e-07,
"loss": 14.2915,
"step": 748
},
{
"epoch": 0.4294340028308817,
"grad_norm": 180.6791229248047,
"learning_rate": 5.137537486255517e-07,
"loss": 14.2342,
"step": 749
},
{
"epoch": 0.4300073459588268,
"grad_norm": 194.50714111328125,
"learning_rate": 5.055771412998122e-07,
"loss": 14.2382,
"step": 750
},
{
"epoch": 0.4305806890867719,
"grad_norm": 181.63011169433594,
"learning_rate": 4.974626580236957e-07,
"loss": 14.1548,
"step": 751
},
{
"epoch": 0.431154032214717,
"grad_norm": 185.96437072753906,
"learning_rate": 4.894104109594466e-07,
"loss": 14.2133,
"step": 752
},
{
"epoch": 0.4317273753426621,
"grad_norm": 177.23391723632812,
"learning_rate": 4.814205114090543e-07,
"loss": 14.213,
"step": 753
},
{
"epoch": 0.4323007184706072,
"grad_norm": 178.10658264160156,
"learning_rate": 4.734930698127077e-07,
"loss": 14.216,
"step": 754
},
{
"epoch": 0.4328740615985523,
"grad_norm": 178.41822814941406,
"learning_rate": 4.6562819574727304e-07,
"loss": 14.0747,
"step": 755
},
{
"epoch": 0.4334474047264974,
"grad_norm": 192.12301635742188,
"learning_rate": 4.578259979247801e-07,
"loss": 14.2543,
"step": 756
},
{
"epoch": 0.4340207478544425,
"grad_norm": 182.95399475097656,
"learning_rate": 4.500865841909169e-07,
"loss": 14.1967,
"step": 757
},
{
"epoch": 0.4345940909823876,
"grad_norm": 182.12098693847656,
"learning_rate": 4.4241006152353885e-07,
"loss": 14.233,
"step": 758
},
{
"epoch": 0.4351674341103327,
"grad_norm": 185.19178771972656,
"learning_rate": 4.3479653603119287e-07,
"loss": 14.1932,
"step": 759
},
{
"epoch": 0.4357407772382778,
"grad_norm": 175.16232299804688,
"learning_rate": 4.2724611295164755e-07,
"loss": 14.2061,
"step": 760
},
{
"epoch": 0.4363141203662229,
"grad_norm": 171.42161560058594,
"learning_rate": 4.197588966504401e-07,
"loss": 14.1964,
"step": 761
},
{
"epoch": 0.436887463494168,
"grad_norm": 179.8773193359375,
"learning_rate": 4.123349906194357e-07,
"loss": 14.1541,
"step": 762
},
{
"epoch": 0.4374608066221131,
"grad_norm": 179.10585021972656,
"learning_rate": 4.0497449747539217e-07,
"loss": 14.1968,
"step": 763
},
{
"epoch": 0.4380341497500582,
"grad_norm": 191.01058959960938,
"learning_rate": 3.9767751895854467e-07,
"loss": 14.2196,
"step": 764
},
{
"epoch": 0.4386074928780033,
"grad_norm": 183.64254760742188,
"learning_rate": 3.904441559312006e-07,
"loss": 14.2129,
"step": 765
},
{
"epoch": 0.4391808360059484,
"grad_norm": 186.27633666992188,
"learning_rate": 3.8327450837634284e-07,
"loss": 14.1771,
"step": 766
},
{
"epoch": 0.43975417913389353,
"grad_norm": 189.0173797607422,
"learning_rate": 3.7616867539624733e-07,
"loss": 14.275,
"step": 767
},
{
"epoch": 0.44032752226183863,
"grad_norm": 187.9246368408203,
"learning_rate": 3.691267552111183e-07,
"loss": 14.2115,
"step": 768
},
{
"epoch": 0.44090086538978374,
"grad_norm": 185.96083068847656,
"learning_rate": 3.621488451577221e-07,
"loss": 14.1871,
"step": 769
},
{
"epoch": 0.44147420851772884,
"grad_norm": 180.14927673339844,
"learning_rate": 3.552350416880507e-07,
"loss": 14.1769,
"step": 770
},
{
"epoch": 0.44204755164567394,
"grad_norm": 190.77037048339844,
"learning_rate": 3.483854403679832e-07,
"loss": 14.159,
"step": 771
},
{
"epoch": 0.44262089477361904,
"grad_norm": 179.29052734375,
"learning_rate": 3.416001358759635e-07,
"loss": 14.2194,
"step": 772
},
{
"epoch": 0.44319423790156415,
"grad_norm": 187.48687744140625,
"learning_rate": 3.3487922200169944e-07,
"loss": 14.2782,
"step": 773
},
{
"epoch": 0.44376758102950925,
"grad_norm": 175.16188049316406,
"learning_rate": 3.2822279164485494e-07,
"loss": 14.1779,
"step": 774
},
{
"epoch": 0.44434092415745435,
"grad_norm": 182.10446166992188,
"learning_rate": 3.2163093681377765e-07,
"loss": 14.1585,
"step": 775
},
{
"epoch": 0.44491426728539946,
"grad_norm": 179.84536743164062,
"learning_rate": 3.151037486242181e-07,
"loss": 14.1605,
"step": 776
},
{
"epoch": 0.44548761041334456,
"grad_norm": 179.8004608154297,
"learning_rate": 3.08641317298074e-07,
"loss": 14.231,
"step": 777
},
{
"epoch": 0.44606095354128966,
"grad_norm": 190.25631713867188,
"learning_rate": 3.022437321621452e-07,
"loss": 14.2661,
"step": 778
},
{
"epoch": 0.44663429666923476,
"grad_norm": 177.27598571777344,
"learning_rate": 2.959110816468935e-07,
"loss": 14.3369,
"step": 779
},
{
"epoch": 0.44720763979717987,
"grad_norm": 180.63668823242188,
"learning_rate": 2.896434532852277e-07,
"loss": 14.1925,
"step": 780
},
{
"epoch": 0.44778098292512497,
"grad_norm": 172.8029022216797,
"learning_rate": 2.834409337112842e-07,
"loss": 14.2616,
"step": 781
},
{
"epoch": 0.4483543260530701,
"grad_norm": 182.10931396484375,
"learning_rate": 2.7730360865923954e-07,
"loss": 14.2489,
"step": 782
},
{
"epoch": 0.4489276691810152,
"grad_norm": 182.58995056152344,
"learning_rate": 2.712315629621176e-07,
"loss": 14.2247,
"step": 783
},
{
"epoch": 0.4495010123089603,
"grad_norm": 182.17227172851562,
"learning_rate": 2.6522488055062076e-07,
"loss": 14.251,
"step": 784
},
{
"epoch": 0.4500743554369054,
"grad_norm": 179.82858276367188,
"learning_rate": 2.5928364445196975e-07,
"loss": 14.2028,
"step": 785
},
{
"epoch": 0.4506476985648505,
"grad_norm": 177.07699584960938,
"learning_rate": 2.534079367887549e-07,
"loss": 14.1402,
"step": 786
},
{
"epoch": 0.4512210416927956,
"grad_norm": 174.88539123535156,
"learning_rate": 2.475978387778e-07,
"loss": 14.2159,
"step": 787
},
{
"epoch": 0.4517943848207407,
"grad_norm": 182.9810028076172,
"learning_rate": 2.4185343072904376e-07,
"loss": 14.2624,
"step": 788
},
{
"epoch": 0.4523677279486858,
"grad_norm": 180.19107055664062,
"learning_rate": 2.3617479204442462e-07,
"loss": 14.2149,
"step": 789
},
{
"epoch": 0.4529410710766309,
"grad_norm": 181.24143981933594,
"learning_rate": 2.305620012167853e-07,
"loss": 14.1732,
"step": 790
},
{
"epoch": 0.453514414204576,
"grad_norm": 174.54727172851562,
"learning_rate": 2.2501513582879108e-07,
"loss": 14.1911,
"step": 791
},
{
"epoch": 0.4540877573325211,
"grad_norm": 181.31564331054688,
"learning_rate": 2.1953427255185122e-07,
"loss": 14.2618,
"step": 792
},
{
"epoch": 0.4546611004604662,
"grad_norm": 179.88681030273438,
"learning_rate": 2.1411948714506414e-07,
"loss": 14.2918,
"step": 793
},
{
"epoch": 0.4552344435884113,
"grad_norm": 175.06451416015625,
"learning_rate": 2.0877085445416889e-07,
"loss": 14.1995,
"step": 794
},
{
"epoch": 0.4558077867163564,
"grad_norm": 181.00540161132812,
"learning_rate": 2.034884484105093e-07,
"loss": 14.1838,
"step": 795
},
{
"epoch": 0.4563811298443015,
"grad_norm": 182.90286254882812,
"learning_rate": 1.98272342030012e-07,
"loss": 14.2421,
"step": 796
},
{
"epoch": 0.4569544729722466,
"grad_norm": 188.80038452148438,
"learning_rate": 1.9312260741218114e-07,
"loss": 14.2287,
"step": 797
},
{
"epoch": 0.4575278161001917,
"grad_norm": 189.58168029785156,
"learning_rate": 1.8803931573909584e-07,
"loss": 14.1547,
"step": 798
},
{
"epoch": 0.4581011592281368,
"grad_norm": 182.40635681152344,
"learning_rate": 1.8302253727443041e-07,
"loss": 14.1816,
"step": 799
},
{
"epoch": 0.4586745023560819,
"grad_norm": 177.64756774902344,
"learning_rate": 1.7807234136248296e-07,
"loss": 14.0972,
"step": 800
},
{
"epoch": 0.45924784548402703,
"grad_norm": 179.09646606445312,
"learning_rate": 1.731887964272144e-07,
"loss": 14.2329,
"step": 801
},
{
"epoch": 0.45982118861197213,
"grad_norm": 182.92236328125,
"learning_rate": 1.6837196997130434e-07,
"loss": 14.129,
"step": 802
},
{
"epoch": 0.46039453173991723,
"grad_norm": 180.4651641845703,
"learning_rate": 1.6362192857521942e-07,
"loss": 14.2359,
"step": 803
},
{
"epoch": 0.46096787486786234,
"grad_norm": 188.5583038330078,
"learning_rate": 1.5893873789628812e-07,
"loss": 14.3177,
"step": 804
},
{
"epoch": 0.46154121799580744,
"grad_norm": 179.5811767578125,
"learning_rate": 1.5432246266780083e-07,
"loss": 14.1691,
"step": 805
},
{
"epoch": 0.46211456112375254,
"grad_norm": 174.0120391845703,
"learning_rate": 1.4977316669810782e-07,
"loss": 14.1824,
"step": 806
},
{
"epoch": 0.46268790425169765,
"grad_norm": 191.203369140625,
"learning_rate": 1.4529091286973994e-07,
"loss": 14.1955,
"step": 807
},
{
"epoch": 0.46326124737964275,
"grad_norm": 183.5585479736328,
"learning_rate": 1.4087576313854212e-07,
"loss": 14.2568,
"step": 808
},
{
"epoch": 0.46383459050758785,
"grad_norm": 183.38294982910156,
"learning_rate": 1.365277785328123e-07,
"loss": 14.1888,
"step": 809
},
{
"epoch": 0.46440793363553295,
"grad_norm": 178.88182067871094,
"learning_rate": 1.3224701915246053e-07,
"loss": 14.1905,
"step": 810
},
{
"epoch": 0.46498127676347806,
"grad_norm": 176.96397399902344,
"learning_rate": 1.280335441681796e-07,
"loss": 14.2524,
"step": 811
},
{
"epoch": 0.46555461989142316,
"grad_norm": 182.11790466308594,
"learning_rate": 1.2388741182062348e-07,
"loss": 14.161,
"step": 812
},
{
"epoch": 0.46612796301936826,
"grad_norm": 178.43495178222656,
"learning_rate": 1.198086794196035e-07,
"loss": 14.2621,
"step": 813
},
{
"epoch": 0.46670130614731337,
"grad_norm": 172.70196533203125,
"learning_rate": 1.1579740334330014e-07,
"loss": 14.1181,
"step": 814
},
{
"epoch": 0.46727464927525847,
"grad_norm": 187.98484802246094,
"learning_rate": 1.1185363903747748e-07,
"loss": 14.269,
"step": 815
},
{
"epoch": 0.46784799240320357,
"grad_norm": 172.6577911376953,
"learning_rate": 1.0797744101472052e-07,
"loss": 14.1737,
"step": 816
},
{
"epoch": 0.4684213355311487,
"grad_norm": 181.676025390625,
"learning_rate": 1.0416886285368188e-07,
"loss": 14.2495,
"step": 817
},
{
"epoch": 0.4689946786590938,
"grad_norm": 176.9022216796875,
"learning_rate": 1.0042795719833964e-07,
"loss": 14.1739,
"step": 818
},
{
"epoch": 0.4695680217870389,
"grad_norm": 174.6312255859375,
"learning_rate": 9.675477575726954e-08,
"loss": 14.2219,
"step": 819
},
{
"epoch": 0.470141364914984,
"grad_norm": 176.77125549316406,
"learning_rate": 9.314936930293283e-08,
"loss": 14.1415,
"step": 820
},
{
"epoch": 0.4707147080429291,
"grad_norm": 180.0824432373047,
"learning_rate": 8.961178767097178e-08,
"loss": 14.2163,
"step": 821
},
{
"epoch": 0.4712880511708742,
"grad_norm": 179.52847290039062,
"learning_rate": 8.614207975952083e-08,
"loss": 14.2163,
"step": 822
},
{
"epoch": 0.4718613942988193,
"grad_norm": 175.8388671875,
"learning_rate": 8.274029352853264e-08,
"loss": 14.1408,
"step": 823
},
{
"epoch": 0.4724347374267644,
"grad_norm": 179.85772705078125,
"learning_rate": 7.940647599911477e-08,
"loss": 14.2558,
"step": 824
},
{
"epoch": 0.4730080805547095,
"grad_norm": 176.0479278564453,
"learning_rate": 7.614067325287632e-08,
"loss": 14.1834,
"step": 825
},
{
"epoch": 0.4735814236826546,
"grad_norm": 179.95497131347656,
"learning_rate": 7.294293043129785e-08,
"loss": 14.2747,
"step": 826
},
{
"epoch": 0.4741547668105997,
"grad_norm": 187.21307373046875,
"learning_rate": 6.981329173509909e-08,
"loss": 14.235,
"step": 827
},
{
"epoch": 0.4747281099385448,
"grad_norm": 183.1041717529297,
"learning_rate": 6.675180042363505e-08,
"loss": 14.2802,
"step": 828
},
{
"epoch": 0.4753014530664899,
"grad_norm": 177.82183837890625,
"learning_rate": 6.375849881429418e-08,
"loss": 14.2127,
"step": 829
},
{
"epoch": 0.475874796194435,
"grad_norm": 185.0269775390625,
"learning_rate": 6.083342828191453e-08,
"loss": 14.1445,
"step": 830
},
{
"epoch": 0.4764481393223801,
"grad_norm": 184.57952880859375,
"learning_rate": 5.797662925821068e-08,
"loss": 14.2531,
"step": 831
},
{
"epoch": 0.4770214824503252,
"grad_norm": 184.90017700195312,
"learning_rate": 5.518814123121885e-08,
"loss": 14.1998,
"step": 832
},
{
"epoch": 0.4775948255782703,
"grad_norm": 178.5499267578125,
"learning_rate": 5.246800274474439e-08,
"loss": 14.1822,
"step": 833
},
{
"epoch": 0.4781681687062154,
"grad_norm": 176.92861938476562,
"learning_rate": 4.981625139783619e-08,
"loss": 14.1861,
"step": 834
},
{
"epoch": 0.4787415118341605,
"grad_norm": 176.53111267089844,
"learning_rate": 4.723292384426203e-08,
"loss": 14.1773,
"step": 835
},
{
"epoch": 0.47931485496210563,
"grad_norm": 198.08750915527344,
"learning_rate": 4.471805579200239e-08,
"loss": 14.3216,
"step": 836
},
{
"epoch": 0.47988819809005073,
"grad_norm": 180.246826171875,
"learning_rate": 4.227168200276077e-08,
"loss": 14.0681,
"step": 837
},
{
"epoch": 0.4804615412179958,
"grad_norm": 181.2344970703125,
"learning_rate": 3.989383629147747e-08,
"loss": 14.239,
"step": 838
},
{
"epoch": 0.4810348843459409,
"grad_norm": 182.63856506347656,
"learning_rate": 3.758455152586715e-08,
"loss": 14.1785,
"step": 839
},
{
"epoch": 0.481608227473886,
"grad_norm": 176.11099243164062,
"learning_rate": 3.534385962596143e-08,
"loss": 14.1423,
"step": 840
},
{
"epoch": 0.4821815706018311,
"grad_norm": 175.12725830078125,
"learning_rate": 3.3171791563669785e-08,
"loss": 14.2053,
"step": 841
},
{
"epoch": 0.4827549137297762,
"grad_norm": 185.15928649902344,
"learning_rate": 3.10683773623488e-08,
"loss": 14.146,
"step": 842
},
{
"epoch": 0.4833282568577213,
"grad_norm": 188.5362548828125,
"learning_rate": 2.9033646096390255e-08,
"loss": 14.2097,
"step": 843
},
{
"epoch": 0.4839015999856664,
"grad_norm": 187.72796630859375,
"learning_rate": 2.706762589081646e-08,
"loss": 14.1802,
"step": 844
},
{
"epoch": 0.4844749431136115,
"grad_norm": 177.17909240722656,
"learning_rate": 2.517034392089446e-08,
"loss": 14.1847,
"step": 845
},
{
"epoch": 0.4850482862415566,
"grad_norm": 174.41868591308594,
"learning_rate": 2.3341826411756863e-08,
"loss": 14.1541,
"step": 846
},
{
"epoch": 0.4856216293695017,
"grad_norm": 177.609130859375,
"learning_rate": 2.158209863804217e-08,
"loss": 14.2386,
"step": 847
},
{
"epoch": 0.4861949724974468,
"grad_norm": 182.3568115234375,
"learning_rate": 1.9891184923544472e-08,
"loss": 14.1531,
"step": 848
},
{
"epoch": 0.4867683156253919,
"grad_norm": 174.87728881835938,
"learning_rate": 1.826910864087761e-08,
"loss": 14.1399,
"step": 849
},
{
"epoch": 0.487341658753337,
"grad_norm": 183.8682403564453,
"learning_rate": 1.6715892211150442e-08,
"loss": 14.1189,
"step": 850
},
{
"epoch": 0.4879150018812821,
"grad_norm": 176.34315490722656,
"learning_rate": 1.5231557103658755e-08,
"loss": 14.2468,
"step": 851
},
{
"epoch": 0.4884883450092272,
"grad_norm": 177.79586791992188,
"learning_rate": 1.3816123835588835e-08,
"loss": 14.2414,
"step": 852
},
{
"epoch": 0.4890616881371723,
"grad_norm": 173.83486938476562,
"learning_rate": 1.2469611971731576e-08,
"loss": 14.1864,
"step": 853
},
{
"epoch": 0.4896350312651174,
"grad_norm": 181.15512084960938,
"learning_rate": 1.1192040124214931e-08,
"loss": 14.1471,
"step": 854
},
{
"epoch": 0.49020837439306253,
"grad_norm": 185.8532257080078,
"learning_rate": 9.983425952243552e-09,
"loss": 14.2145,
"step": 855
},
{
"epoch": 0.49078171752100763,
"grad_norm": 172.852783203125,
"learning_rate": 8.84378616185788e-09,
"loss": 14.1675,
"step": 856
},
{
"epoch": 0.49135506064895274,
"grad_norm": 183.1013946533203,
"learning_rate": 7.773136505700995e-09,
"loss": 14.1541,
"step": 857
},
{
"epoch": 0.49192840377689784,
"grad_norm": 173.04444885253906,
"learning_rate": 6.7714917828004545e-09,
"loss": 14.1,
"step": 858
},
{
"epoch": 0.49250174690484294,
"grad_norm": 173.59991455078125,
"learning_rate": 5.838865838366792e-09,
"loss": 14.1511,
"step": 859
},
{
"epoch": 0.49307509003278804,
"grad_norm": 197.82601928710938,
"learning_rate": 4.975271563599227e-09,
"loss": 14.2182,
"step": 860
},
{
"epoch": 0.49364843316073315,
"grad_norm": 183.84568786621094,
"learning_rate": 4.180720895508028e-09,
"loss": 14.1797,
"step": 861
},
{
"epoch": 0.49422177628867825,
"grad_norm": 194.36610412597656,
"learning_rate": 3.4552248167507576e-09,
"loss": 14.237,
"step": 862
},
{
"epoch": 0.49479511941662335,
"grad_norm": 172.23765563964844,
"learning_rate": 2.798793355478502e-09,
"loss": 14.094,
"step": 863
},
{
"epoch": 0.49536846254456846,
"grad_norm": 185.76551818847656,
"learning_rate": 2.2114355851993175e-09,
"loss": 14.1855,
"step": 864
},
{
"epoch": 0.49594180567251356,
"grad_norm": 175.8227081298828,
"learning_rate": 1.6931596246516636e-09,
"loss": 14.2448,
"step": 865
},
{
"epoch": 0.49651514880045866,
"grad_norm": 174.75340270996094,
"learning_rate": 1.24397263769227e-09,
"loss": 14.103,
"step": 866
},
{
"epoch": 0.49708849192840376,
"grad_norm": 175.39210510253906,
"learning_rate": 8.638808331973281e-10,
"loss": 14.1831,
"step": 867
},
{
"epoch": 0.49766183505634887,
"grad_norm": 180.33180236816406,
"learning_rate": 5.528894649758921e-10,
"loss": 14.1561,
"step": 868
},
{
"epoch": 0.49823517818429397,
"grad_norm": 168.7253875732422,
"learning_rate": 3.1100283169938074e-10,
"loss": 14.2352,
"step": 869
},
{
"epoch": 0.4988085213122391,
"grad_norm": 179.21212768554688,
"learning_rate": 1.3822427683884975e-10,
"loss": 14.2388,
"step": 870
},
{
"epoch": 0.4993818644401842,
"grad_norm": 181.83961486816406,
"learning_rate": 3.4556188622802964e-11,
"loss": 14.1703,
"step": 871
},
{
"epoch": 0.4999552075681293,
"grad_norm": 178.8787078857422,
"learning_rate": 0.0,
"loss": 14.2526,
"step": 872
},
{
"epoch": 0.4999552075681293,
"step": 872,
"total_flos": 7.585435033523978e+18,
"train_loss": 14.689219380737445,
"train_runtime": 70676.4546,
"train_samples_per_second": 3.948,
"train_steps_per_second": 0.012
}
],
"logging_steps": 1.0,
"max_steps": 872,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"total_flos": 7.585435033523978e+18,
"train_batch_size": 10,
"trial_name": null,
"trial_params": null
}