zephyr-7b-sft-full / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 2860,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 5.587369652925466,
"learning_rate": 1.7482517482517483e-08,
"loss": 0.8755,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 5.500934350042366,
"learning_rate": 8.741258741258742e-08,
"loss": 0.8733,
"step": 5
},
{
"epoch": 0.02,
"grad_norm": 4.778154128284402,
"learning_rate": 1.7482517482517484e-07,
"loss": 0.8812,
"step": 10
},
{
"epoch": 0.03,
"grad_norm": 4.157465345945996,
"learning_rate": 2.622377622377623e-07,
"loss": 0.8595,
"step": 15
},
{
"epoch": 0.03,
"grad_norm": 4.01463883737159,
"learning_rate": 3.496503496503497e-07,
"loss": 0.8627,
"step": 20
},
{
"epoch": 0.04,
"grad_norm": 2.545403355372151,
"learning_rate": 4.3706293706293707e-07,
"loss": 0.8508,
"step": 25
},
{
"epoch": 0.05,
"grad_norm": 2.2986226033618844,
"learning_rate": 5.244755244755246e-07,
"loss": 0.8633,
"step": 30
},
{
"epoch": 0.06,
"grad_norm": 2.0882287948426668,
"learning_rate": 6.118881118881119e-07,
"loss": 0.822,
"step": 35
},
{
"epoch": 0.07,
"grad_norm": 1.9905683806362928,
"learning_rate": 6.993006993006994e-07,
"loss": 0.8195,
"step": 40
},
{
"epoch": 0.08,
"grad_norm": 1.8546442781858452,
"learning_rate": 7.867132867132868e-07,
"loss": 0.8423,
"step": 45
},
{
"epoch": 0.09,
"grad_norm": 1.9539088930354693,
"learning_rate": 8.741258741258741e-07,
"loss": 0.7999,
"step": 50
},
{
"epoch": 0.1,
"grad_norm": 1.8944469761122955,
"learning_rate": 9.615384615384617e-07,
"loss": 0.8234,
"step": 55
},
{
"epoch": 0.1,
"grad_norm": 1.976723504907432,
"learning_rate": 1.0489510489510491e-06,
"loss": 0.8047,
"step": 60
},
{
"epoch": 0.11,
"grad_norm": 1.8383713435930058,
"learning_rate": 1.1363636363636364e-06,
"loss": 0.8046,
"step": 65
},
{
"epoch": 0.12,
"grad_norm": 1.9093043777289103,
"learning_rate": 1.2237762237762238e-06,
"loss": 0.7935,
"step": 70
},
{
"epoch": 0.13,
"grad_norm": 1.8651426939128903,
"learning_rate": 1.3111888111888113e-06,
"loss": 0.8036,
"step": 75
},
{
"epoch": 0.14,
"grad_norm": 1.8471862984018954,
"learning_rate": 1.3986013986013987e-06,
"loss": 0.8034,
"step": 80
},
{
"epoch": 0.15,
"grad_norm": 1.9262668779136634,
"learning_rate": 1.486013986013986e-06,
"loss": 0.8008,
"step": 85
},
{
"epoch": 0.16,
"grad_norm": 1.9222277764812408,
"learning_rate": 1.5734265734265736e-06,
"loss": 0.7835,
"step": 90
},
{
"epoch": 0.17,
"grad_norm": 1.8688469770089136,
"learning_rate": 1.660839160839161e-06,
"loss": 0.7967,
"step": 95
},
{
"epoch": 0.17,
"grad_norm": 1.8814770181802394,
"learning_rate": 1.7482517482517483e-06,
"loss": 0.7998,
"step": 100
},
{
"epoch": 0.18,
"grad_norm": 2.037977585299295,
"learning_rate": 1.8356643356643357e-06,
"loss": 0.8056,
"step": 105
},
{
"epoch": 0.19,
"grad_norm": 2.001270020691376,
"learning_rate": 1.9230769230769234e-06,
"loss": 0.7756,
"step": 110
},
{
"epoch": 0.2,
"grad_norm": 1.9295825867916314,
"learning_rate": 2.0104895104895104e-06,
"loss": 0.788,
"step": 115
},
{
"epoch": 0.21,
"grad_norm": 2.0467456581571577,
"learning_rate": 2.0979020979020983e-06,
"loss": 0.7852,
"step": 120
},
{
"epoch": 0.22,
"grad_norm": 2.140303916653523,
"learning_rate": 2.1853146853146857e-06,
"loss": 0.7749,
"step": 125
},
{
"epoch": 0.23,
"grad_norm": 2.093503142861556,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.7905,
"step": 130
},
{
"epoch": 0.24,
"grad_norm": 2.064301983027922,
"learning_rate": 2.36013986013986e-06,
"loss": 0.7659,
"step": 135
},
{
"epoch": 0.24,
"grad_norm": 1.997358747946483,
"learning_rate": 2.4475524475524477e-06,
"loss": 0.7831,
"step": 140
},
{
"epoch": 0.25,
"grad_norm": 2.1027121897036274,
"learning_rate": 2.534965034965035e-06,
"loss": 0.7702,
"step": 145
},
{
"epoch": 0.26,
"grad_norm": 2.1291996409910015,
"learning_rate": 2.6223776223776225e-06,
"loss": 0.7658,
"step": 150
},
{
"epoch": 0.27,
"grad_norm": 41.9980343074638,
"learning_rate": 2.70979020979021e-06,
"loss": 0.7668,
"step": 155
},
{
"epoch": 0.28,
"grad_norm": 2.198061904985496,
"learning_rate": 2.7972027972027974e-06,
"loss": 0.7742,
"step": 160
},
{
"epoch": 0.29,
"grad_norm": 2.129322746365122,
"learning_rate": 2.8846153846153845e-06,
"loss": 0.7458,
"step": 165
},
{
"epoch": 0.3,
"grad_norm": 2.0235052008880867,
"learning_rate": 2.972027972027972e-06,
"loss": 0.7434,
"step": 170
},
{
"epoch": 0.31,
"grad_norm": 2.0254135513369156,
"learning_rate": 3.0594405594405598e-06,
"loss": 0.7338,
"step": 175
},
{
"epoch": 0.31,
"grad_norm": 1.985119509495075,
"learning_rate": 3.1468531468531472e-06,
"loss": 0.7341,
"step": 180
},
{
"epoch": 0.32,
"grad_norm": 2.250959168282524,
"learning_rate": 3.2342657342657347e-06,
"loss": 0.7307,
"step": 185
},
{
"epoch": 0.33,
"grad_norm": 1.9968379378052319,
"learning_rate": 3.321678321678322e-06,
"loss": 0.7385,
"step": 190
},
{
"epoch": 0.34,
"grad_norm": 2.630990506587568,
"learning_rate": 3.409090909090909e-06,
"loss": 0.7328,
"step": 195
},
{
"epoch": 0.35,
"grad_norm": 3.57779109043985,
"learning_rate": 3.4965034965034966e-06,
"loss": 0.7429,
"step": 200
},
{
"epoch": 0.36,
"grad_norm": 26.759360492511966,
"learning_rate": 3.583916083916084e-06,
"loss": 0.7541,
"step": 205
},
{
"epoch": 0.37,
"grad_norm": 2.779819046550619,
"learning_rate": 3.6713286713286715e-06,
"loss": 0.7457,
"step": 210
},
{
"epoch": 0.38,
"grad_norm": 1.9676288431898625,
"learning_rate": 3.7587412587412593e-06,
"loss": 0.7347,
"step": 215
},
{
"epoch": 0.38,
"grad_norm": 1.7250524315438622,
"learning_rate": 3.846153846153847e-06,
"loss": 0.7383,
"step": 220
},
{
"epoch": 0.39,
"grad_norm": 1.8839750025590591,
"learning_rate": 3.933566433566433e-06,
"loss": 0.7318,
"step": 225
},
{
"epoch": 0.4,
"grad_norm": 2.6586996717057443,
"learning_rate": 4.020979020979021e-06,
"loss": 0.7227,
"step": 230
},
{
"epoch": 0.41,
"grad_norm": 3.585620352276218,
"learning_rate": 4.108391608391608e-06,
"loss": 0.7051,
"step": 235
},
{
"epoch": 0.42,
"grad_norm": 1.676940387665252,
"learning_rate": 4.195804195804197e-06,
"loss": 0.7296,
"step": 240
},
{
"epoch": 0.43,
"grad_norm": 1.4297941750638832,
"learning_rate": 4.283216783216784e-06,
"loss": 0.7034,
"step": 245
},
{
"epoch": 0.44,
"grad_norm": 1.5752492466565178,
"learning_rate": 4.3706293706293715e-06,
"loss": 0.7114,
"step": 250
},
{
"epoch": 0.45,
"grad_norm": 23.603921846562006,
"learning_rate": 4.458041958041958e-06,
"loss": 0.7189,
"step": 255
},
{
"epoch": 0.45,
"grad_norm": 1.4838961986379053,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.7134,
"step": 260
},
{
"epoch": 0.46,
"grad_norm": 1.553567707360373,
"learning_rate": 4.632867132867133e-06,
"loss": 0.7244,
"step": 265
},
{
"epoch": 0.47,
"grad_norm": 1.6872621755065427,
"learning_rate": 4.72027972027972e-06,
"loss": 0.7221,
"step": 270
},
{
"epoch": 0.48,
"grad_norm": 26.978554548474214,
"learning_rate": 4.807692307692308e-06,
"loss": 0.6954,
"step": 275
},
{
"epoch": 0.49,
"grad_norm": 1.5442233324126737,
"learning_rate": 4.895104895104895e-06,
"loss": 0.7098,
"step": 280
},
{
"epoch": 0.5,
"grad_norm": 1.5826467174296213,
"learning_rate": 4.982517482517483e-06,
"loss": 0.6956,
"step": 285
},
{
"epoch": 0.51,
"grad_norm": 1.529332769657097,
"learning_rate": 4.999970207167611e-06,
"loss": 0.7054,
"step": 290
},
{
"epoch": 0.52,
"grad_norm": 1.4134102929361827,
"learning_rate": 4.999849175003032e-06,
"loss": 0.6925,
"step": 295
},
{
"epoch": 0.52,
"grad_norm": 1.5561380803552274,
"learning_rate": 4.999635045958129e-06,
"loss": 0.6777,
"step": 300
},
{
"epoch": 0.53,
"grad_norm": 1.7122419800735684,
"learning_rate": 4.999327828007281e-06,
"loss": 0.6782,
"step": 305
},
{
"epoch": 0.54,
"grad_norm": 3.884234946585116,
"learning_rate": 4.998927532591592e-06,
"loss": 0.714,
"step": 310
},
{
"epoch": 0.55,
"grad_norm": 1.5140404948967683,
"learning_rate": 4.998434174618464e-06,
"loss": 0.6969,
"step": 315
},
{
"epoch": 0.56,
"grad_norm": 1.5157313273979758,
"learning_rate": 4.997847772461038e-06,
"loss": 0.6937,
"step": 320
},
{
"epoch": 0.57,
"grad_norm": 1.4006324095309106,
"learning_rate": 4.997168347957521e-06,
"loss": 0.6707,
"step": 325
},
{
"epoch": 0.58,
"grad_norm": 1.4555169305723101,
"learning_rate": 4.996395926410354e-06,
"loss": 0.6901,
"step": 330
},
{
"epoch": 0.59,
"grad_norm": 1.4699402702938433,
"learning_rate": 4.995530536585293e-06,
"loss": 0.6954,
"step": 335
},
{
"epoch": 0.59,
"grad_norm": 1.423966140797764,
"learning_rate": 4.994572210710315e-06,
"loss": 0.6834,
"step": 340
},
{
"epoch": 0.6,
"grad_norm": 1.4114672539255717,
"learning_rate": 4.993520984474435e-06,
"loss": 0.7091,
"step": 345
},
{
"epoch": 0.61,
"grad_norm": 1.4495020382615775,
"learning_rate": 4.9923768970263675e-06,
"loss": 0.6909,
"step": 350
},
{
"epoch": 0.62,
"grad_norm": 1.5759646703195747,
"learning_rate": 4.991139990973071e-06,
"loss": 0.6943,
"step": 355
},
{
"epoch": 0.63,
"grad_norm": 1.5472655442648218,
"learning_rate": 4.989810312378165e-06,
"loss": 0.699,
"step": 360
},
{
"epoch": 0.64,
"grad_norm": 1.4257086908354295,
"learning_rate": 4.988387910760206e-06,
"loss": 0.688,
"step": 365
},
{
"epoch": 0.65,
"grad_norm": 1.4575544094857862,
"learning_rate": 4.986872839090853e-06,
"loss": 0.6715,
"step": 370
},
{
"epoch": 0.66,
"grad_norm": 1.5103716324462753,
"learning_rate": 4.985265153792887e-06,
"loss": 0.6818,
"step": 375
},
{
"epoch": 0.66,
"grad_norm": 1.8027777365944309,
"learning_rate": 4.983564914738113e-06,
"loss": 0.6927,
"step": 380
},
{
"epoch": 0.67,
"grad_norm": 1.4256278816465284,
"learning_rate": 4.981772185245135e-06,
"loss": 0.6886,
"step": 385
},
{
"epoch": 0.68,
"grad_norm": 1.795771667278661,
"learning_rate": 4.9798870320769884e-06,
"loss": 0.6716,
"step": 390
},
{
"epoch": 0.69,
"grad_norm": 1.3370646701877693,
"learning_rate": 4.9779095254386605e-06,
"loss": 0.6939,
"step": 395
},
{
"epoch": 0.7,
"grad_norm": 1.4886996027080686,
"learning_rate": 4.975839738974473e-06,
"loss": 0.698,
"step": 400
},
{
"epoch": 0.71,
"grad_norm": 1.4334461526468911,
"learning_rate": 4.9736777497653425e-06,
"loss": 0.6706,
"step": 405
},
{
"epoch": 0.72,
"grad_norm": 1.500198901246391,
"learning_rate": 4.971423638325906e-06,
"loss": 0.6687,
"step": 410
},
{
"epoch": 0.73,
"grad_norm": 1.5396586819167384,
"learning_rate": 4.969077488601525e-06,
"loss": 0.6734,
"step": 415
},
{
"epoch": 0.73,
"grad_norm": 1.4725153996617695,
"learning_rate": 4.966639387965158e-06,
"loss": 0.6999,
"step": 420
},
{
"epoch": 0.74,
"grad_norm": 1.5885070209974053,
"learning_rate": 4.964109427214111e-06,
"loss": 0.6853,
"step": 425
},
{
"epoch": 0.75,
"grad_norm": 1.7768817102056875,
"learning_rate": 4.961487700566646e-06,
"loss": 0.6758,
"step": 430
},
{
"epoch": 0.76,
"grad_norm": 1.4156688514395073,
"learning_rate": 4.958774305658484e-06,
"loss": 0.682,
"step": 435
},
{
"epoch": 0.77,
"grad_norm": 1.615821708366966,
"learning_rate": 4.955969343539162e-06,
"loss": 0.6623,
"step": 440
},
{
"epoch": 0.78,
"grad_norm": 1.5778334799669915,
"learning_rate": 4.95307291866827e-06,
"loss": 0.7118,
"step": 445
},
{
"epoch": 0.79,
"grad_norm": 1.5459039639149776,
"learning_rate": 4.9500851389115645e-06,
"loss": 0.6822,
"step": 450
},
{
"epoch": 0.8,
"grad_norm": 1.950166551192635,
"learning_rate": 4.947006115536947e-06,
"loss": 0.6896,
"step": 455
},
{
"epoch": 0.8,
"grad_norm": 1.4583416417638757,
"learning_rate": 4.943835963210324e-06,
"loss": 0.6737,
"step": 460
},
{
"epoch": 0.81,
"grad_norm": 1.3818410898501687,
"learning_rate": 4.9405747999913355e-06,
"loss": 0.6663,
"step": 465
},
{
"epoch": 0.82,
"grad_norm": 1.642028714364656,
"learning_rate": 4.937222747328956e-06,
"loss": 0.6702,
"step": 470
},
{
"epoch": 0.83,
"grad_norm": 1.4162244952189953,
"learning_rate": 4.933779930056975e-06,
"loss": 0.6709,
"step": 475
},
{
"epoch": 0.84,
"grad_norm": 1.5391809516965524,
"learning_rate": 4.9302464763893474e-06,
"loss": 0.6581,
"step": 480
},
{
"epoch": 0.85,
"grad_norm": 1.4577041041003882,
"learning_rate": 4.926622517915417e-06,
"loss": 0.6523,
"step": 485
},
{
"epoch": 0.86,
"grad_norm": 1.4232896131606396,
"learning_rate": 4.9229081895950185e-06,
"loss": 0.6828,
"step": 490
},
{
"epoch": 0.87,
"grad_norm": 1.5308862157877259,
"learning_rate": 4.9191036297534455e-06,
"loss": 0.6671,
"step": 495
},
{
"epoch": 0.87,
"grad_norm": 1.4998597450310402,
"learning_rate": 4.91520898007631e-06,
"loss": 0.6559,
"step": 500
},
{
"epoch": 0.88,
"grad_norm": 1.5868254943762528,
"learning_rate": 4.911224385604255e-06,
"loss": 0.6731,
"step": 505
},
{
"epoch": 0.89,
"grad_norm": 1.5018477168697573,
"learning_rate": 4.907149994727559e-06,
"loss": 0.6643,
"step": 510
},
{
"epoch": 0.9,
"grad_norm": 1.4716960928109946,
"learning_rate": 4.902985959180608e-06,
"loss": 0.6556,
"step": 515
},
{
"epoch": 0.91,
"grad_norm": 1.4130064465598204,
"learning_rate": 4.8987324340362445e-06,
"loss": 0.6675,
"step": 520
},
{
"epoch": 0.92,
"grad_norm": 1.4999119765126108,
"learning_rate": 4.894389577699994e-06,
"loss": 0.6484,
"step": 525
},
{
"epoch": 0.93,
"grad_norm": 1.4835440823652937,
"learning_rate": 4.889957551904164e-06,
"loss": 0.6566,
"step": 530
},
{
"epoch": 0.94,
"grad_norm": 1.5704867480691889,
"learning_rate": 4.885436521701824e-06,
"loss": 0.6589,
"step": 535
},
{
"epoch": 0.94,
"grad_norm": 1.5154576884271134,
"learning_rate": 4.8808266554606535e-06,
"loss": 0.6476,
"step": 540
},
{
"epoch": 0.95,
"grad_norm": 1.4656996812743353,
"learning_rate": 4.876128124856676e-06,
"loss": 0.6652,
"step": 545
},
{
"epoch": 0.96,
"grad_norm": 1.6670494857895617,
"learning_rate": 4.8713411048678635e-06,
"loss": 0.6711,
"step": 550
},
{
"epoch": 0.97,
"grad_norm": 12.070438425219901,
"learning_rate": 4.866465773767625e-06,
"loss": 0.6941,
"step": 555
},
{
"epoch": 0.98,
"grad_norm": 1.9537677638893325,
"learning_rate": 4.861502313118157e-06,
"loss": 0.6606,
"step": 560
},
{
"epoch": 0.99,
"grad_norm": 2.0272499350973807,
"learning_rate": 4.856450907763693e-06,
"loss": 0.6506,
"step": 565
},
{
"epoch": 1.0,
"grad_norm": 1.7554436009464944,
"learning_rate": 4.851311745823616e-06,
"loss": 0.6695,
"step": 570
},
{
"epoch": 1.0,
"eval_loss": 0.6609551310539246,
"eval_runtime": 330.32,
"eval_samples_per_second": 22.769,
"eval_steps_per_second": 0.357,
"step": 572
},
{
"epoch": 1.01,
"grad_norm": 2.145254776756945,
"learning_rate": 4.846085018685449e-06,
"loss": 0.6166,
"step": 575
},
{
"epoch": 1.01,
"grad_norm": 1.7225148009689535,
"learning_rate": 4.84077092099773e-06,
"loss": 0.5855,
"step": 580
},
{
"epoch": 1.02,
"grad_norm": 1.5799570982565634,
"learning_rate": 4.835369650662767e-06,
"loss": 0.589,
"step": 585
},
{
"epoch": 1.03,
"grad_norm": 1.7146719749288577,
"learning_rate": 4.829881408829262e-06,
"loss": 0.5809,
"step": 590
},
{
"epoch": 1.04,
"grad_norm": 1.6210603375147141,
"learning_rate": 4.824306399884822e-06,
"loss": 0.5619,
"step": 595
},
{
"epoch": 1.05,
"grad_norm": 1.8183201949889993,
"learning_rate": 4.81864483144835e-06,
"loss": 0.5821,
"step": 600
},
{
"epoch": 1.06,
"grad_norm": 1.5245577439422557,
"learning_rate": 4.81289691436231e-06,
"loss": 0.5622,
"step": 605
},
{
"epoch": 1.07,
"grad_norm": 1.6453822729343333,
"learning_rate": 4.807062862684874e-06,
"loss": 0.6047,
"step": 610
},
{
"epoch": 1.08,
"grad_norm": 1.7446190682981846,
"learning_rate": 4.801142893681955e-06,
"loss": 0.5696,
"step": 615
},
{
"epoch": 1.08,
"grad_norm": 1.6147794969895501,
"learning_rate": 4.795137227819113e-06,
"loss": 0.5523,
"step": 620
},
{
"epoch": 1.09,
"grad_norm": 1.6086737084266636,
"learning_rate": 4.7890460887533415e-06,
"loss": 0.5871,
"step": 625
},
{
"epoch": 1.1,
"grad_norm": 1.5378514408370791,
"learning_rate": 4.782869703324746e-06,
"loss": 0.5687,
"step": 630
},
{
"epoch": 1.11,
"grad_norm": 1.6893413050433177,
"learning_rate": 4.7766083015480876e-06,
"loss": 0.5748,
"step": 635
},
{
"epoch": 1.12,
"grad_norm": 1.6460419873518315,
"learning_rate": 4.770262116604224e-06,
"loss": 0.566,
"step": 640
},
{
"epoch": 1.13,
"grad_norm": 1.5227788895596694,
"learning_rate": 4.763831384831421e-06,
"loss": 0.542,
"step": 645
},
{
"epoch": 1.14,
"grad_norm": 2.0335648439912455,
"learning_rate": 4.757316345716554e-06,
"loss": 0.5659,
"step": 650
},
{
"epoch": 1.15,
"grad_norm": 1.5041239731809826,
"learning_rate": 4.750717241886186e-06,
"loss": 0.5529,
"step": 655
},
{
"epoch": 1.15,
"grad_norm": 1.900645473248817,
"learning_rate": 4.744034319097536e-06,
"loss": 0.5726,
"step": 660
},
{
"epoch": 1.16,
"grad_norm": 1.9943298775514051,
"learning_rate": 4.7372678262293235e-06,
"loss": 0.5709,
"step": 665
},
{
"epoch": 1.17,
"grad_norm": 1.6801680941350656,
"learning_rate": 4.7304180152725035e-06,
"loss": 0.5592,
"step": 670
},
{
"epoch": 1.18,
"grad_norm": 1.6643180642477764,
"learning_rate": 4.723485141320877e-06,
"loss": 0.5755,
"step": 675
},
{
"epoch": 1.19,
"grad_norm": 1.7468633841451968,
"learning_rate": 4.716469462561595e-06,
"loss": 0.5737,
"step": 680
},
{
"epoch": 1.2,
"grad_norm": 1.565224204964231,
"learning_rate": 4.709371240265543e-06,
"loss": 0.5713,
"step": 685
},
{
"epoch": 1.21,
"grad_norm": 1.704523210767029,
"learning_rate": 4.702190738777608e-06,
"loss": 0.5765,
"step": 690
},
{
"epoch": 1.22,
"grad_norm": 1.643895772580424,
"learning_rate": 4.69492822550684e-06,
"loss": 0.5646,
"step": 695
},
{
"epoch": 1.22,
"grad_norm": 1.8094188227885684,
"learning_rate": 4.687583970916487e-06,
"loss": 0.5652,
"step": 700
},
{
"epoch": 1.23,
"grad_norm": 1.7680146082507486,
"learning_rate": 4.680158248513924e-06,
"loss": 0.5789,
"step": 705
},
{
"epoch": 1.24,
"grad_norm": 1.900227044096463,
"learning_rate": 4.6726513348404736e-06,
"loss": 0.5806,
"step": 710
},
{
"epoch": 1.25,
"grad_norm": 1.6673076686826593,
"learning_rate": 4.665063509461098e-06,
"loss": 0.5685,
"step": 715
},
{
"epoch": 1.26,
"grad_norm": 1.8123226938487078,
"learning_rate": 4.657395054953992e-06,
"loss": 0.5886,
"step": 720
},
{
"epoch": 1.27,
"grad_norm": 1.7238311806282647,
"learning_rate": 4.649646256900064e-06,
"loss": 0.5635,
"step": 725
},
{
"epoch": 1.28,
"grad_norm": 1.6188026163610223,
"learning_rate": 4.641817403872293e-06,
"loss": 0.5731,
"step": 730
},
{
"epoch": 1.28,
"grad_norm": 1.4913698739002144,
"learning_rate": 4.633908787424986e-06,
"loss": 0.5732,
"step": 735
},
{
"epoch": 1.29,
"grad_norm": 1.9372955869658495,
"learning_rate": 4.625920702082918e-06,
"loss": 0.5571,
"step": 740
},
{
"epoch": 1.3,
"grad_norm": 1.7995058667015358,
"learning_rate": 4.617853445330367e-06,
"loss": 0.5607,
"step": 745
},
{
"epoch": 1.31,
"grad_norm": 1.6274220841761087,
"learning_rate": 4.6097073176000325e-06,
"loss": 0.559,
"step": 750
},
{
"epoch": 1.32,
"grad_norm": 1.6385978867845423,
"learning_rate": 4.601482622261848e-06,
"loss": 0.5737,
"step": 755
},
{
"epoch": 1.33,
"grad_norm": 1.9049655839724005,
"learning_rate": 4.593179665611685e-06,
"loss": 0.5578,
"step": 760
},
{
"epoch": 1.34,
"grad_norm": 1.8343333848444217,
"learning_rate": 4.584798756859941e-06,
"loss": 0.5666,
"step": 765
},
{
"epoch": 1.35,
"grad_norm": 1.6845673541242205,
"learning_rate": 4.5763402081200295e-06,
"loss": 0.5718,
"step": 770
},
{
"epoch": 1.35,
"grad_norm": 1.772559396539105,
"learning_rate": 4.567804334396756e-06,
"loss": 0.5716,
"step": 775
},
{
"epoch": 1.36,
"grad_norm": 1.748801886170466,
"learning_rate": 4.559191453574582e-06,
"loss": 0.5426,
"step": 780
},
{
"epoch": 1.37,
"grad_norm": 1.5929925572389767,
"learning_rate": 4.550501886405795e-06,
"loss": 0.5748,
"step": 785
},
{
"epoch": 1.38,
"grad_norm": 1.9926799279560756,
"learning_rate": 4.541735956498555e-06,
"loss": 0.5704,
"step": 790
},
{
"epoch": 1.39,
"grad_norm": 1.6789209872224784,
"learning_rate": 4.532893990304848e-06,
"loss": 0.5581,
"step": 795
},
{
"epoch": 1.4,
"grad_norm": 1.7322217739168355,
"learning_rate": 4.523976317108326e-06,
"loss": 0.5608,
"step": 800
},
{
"epoch": 1.41,
"grad_norm": 29.58409303047747,
"learning_rate": 4.514983269012048e-06,
"loss": 0.5806,
"step": 805
},
{
"epoch": 1.42,
"grad_norm": 2.307561877023173,
"learning_rate": 4.5059151809261085e-06,
"loss": 0.5521,
"step": 810
},
{
"epoch": 1.42,
"grad_norm": 1.8846862840887817,
"learning_rate": 4.496772390555164e-06,
"loss": 0.5516,
"step": 815
},
{
"epoch": 1.43,
"grad_norm": 2.8880758982566186,
"learning_rate": 4.487555238385862e-06,
"loss": 0.5643,
"step": 820
},
{
"epoch": 1.44,
"grad_norm": 1.7726833097207872,
"learning_rate": 4.478264067674155e-06,
"loss": 0.5603,
"step": 825
},
{
"epoch": 1.45,
"grad_norm": 1.9926460899793028,
"learning_rate": 4.4688992244325215e-06,
"loss": 0.5648,
"step": 830
},
{
"epoch": 1.46,
"grad_norm": 1.959915507617199,
"learning_rate": 4.459461057417078e-06,
"loss": 0.5408,
"step": 835
},
{
"epoch": 1.47,
"grad_norm": 1.8617496123839707,
"learning_rate": 4.449949918114593e-06,
"loss": 0.5528,
"step": 840
},
{
"epoch": 1.48,
"grad_norm": 1.9538793006795439,
"learning_rate": 4.440366160729393e-06,
"loss": 0.5674,
"step": 845
},
{
"epoch": 1.49,
"grad_norm": 1.8693300905967214,
"learning_rate": 4.430710142170176e-06,
"loss": 0.5578,
"step": 850
},
{
"epoch": 1.49,
"grad_norm": 1.9612803924260176,
"learning_rate": 4.420982222036719e-06,
"loss": 0.5435,
"step": 855
},
{
"epoch": 1.5,
"grad_norm": 1.995791143771985,
"learning_rate": 4.411182762606484e-06,
"loss": 0.5613,
"step": 860
},
{
"epoch": 1.51,
"grad_norm": 5.458759085906611,
"learning_rate": 4.401312128821131e-06,
"loss": 0.5535,
"step": 865
},
{
"epoch": 1.52,
"grad_norm": 3.017681632112604,
"learning_rate": 4.391370688272919e-06,
"loss": 0.5761,
"step": 870
},
{
"epoch": 1.53,
"grad_norm": 1.6117486364142628,
"learning_rate": 4.381358811191025e-06,
"loss": 0.5424,
"step": 875
},
{
"epoch": 1.54,
"grad_norm": 1.6889717368205845,
"learning_rate": 4.3712768704277535e-06,
"loss": 0.5578,
"step": 880
},
{
"epoch": 1.55,
"grad_norm": 1.6263535777974591,
"learning_rate": 4.361125241444647e-06,
"loss": 0.5517,
"step": 885
},
{
"epoch": 1.56,
"grad_norm": 1.5391004007544808,
"learning_rate": 4.350904302298511e-06,
"loss": 0.5475,
"step": 890
},
{
"epoch": 1.56,
"grad_norm": 3.919504264594871,
"learning_rate": 4.3406144336273284e-06,
"loss": 0.5488,
"step": 895
},
{
"epoch": 1.57,
"grad_norm": 1.7006307109389633,
"learning_rate": 4.330256018636086e-06,
"loss": 0.5385,
"step": 900
},
{
"epoch": 1.58,
"grad_norm": 1.6821530119030959,
"learning_rate": 4.319829443082506e-06,
"loss": 0.5392,
"step": 905
},
{
"epoch": 1.59,
"grad_norm": 1.81327230754709,
"learning_rate": 4.309335095262675e-06,
"loss": 0.5463,
"step": 910
},
{
"epoch": 1.6,
"grad_norm": 1.7582986070443545,
"learning_rate": 4.298773365996591e-06,
"loss": 0.5698,
"step": 915
},
{
"epoch": 1.61,
"grad_norm": 1.6167984983083563,
"learning_rate": 4.288144648613601e-06,
"loss": 0.5451,
"step": 920
},
{
"epoch": 1.62,
"grad_norm": 3.2311390071409125,
"learning_rate": 4.277449338937754e-06,
"loss": 0.5516,
"step": 925
},
{
"epoch": 1.63,
"grad_norm": 1.6218961380455157,
"learning_rate": 4.266687835273071e-06,
"loss": 0.538,
"step": 930
},
{
"epoch": 1.63,
"grad_norm": 1.6454054434252245,
"learning_rate": 4.255860538388694e-06,
"loss": 0.5678,
"step": 935
},
{
"epoch": 1.64,
"grad_norm": 1.9639702122144873,
"learning_rate": 4.244967851503975e-06,
"loss": 0.5446,
"step": 940
},
{
"epoch": 1.65,
"grad_norm": 1.7028448251386163,
"learning_rate": 4.234010180273455e-06,
"loss": 0.5297,
"step": 945
},
{
"epoch": 1.66,
"grad_norm": 1.689519412982808,
"learning_rate": 4.2229879327717545e-06,
"loss": 0.5432,
"step": 950
},
{
"epoch": 1.67,
"grad_norm": 1.64362516189522,
"learning_rate": 4.211901519478382e-06,
"loss": 0.5414,
"step": 955
},
{
"epoch": 1.68,
"grad_norm": 1.9767208825933924,
"learning_rate": 4.200751353262442e-06,
"loss": 0.5578,
"step": 960
},
{
"epoch": 1.69,
"grad_norm": 1.7757696899561655,
"learning_rate": 4.1895378493672615e-06,
"loss": 0.5447,
"step": 965
},
{
"epoch": 1.7,
"grad_norm": 1.7488808503021864,
"learning_rate": 4.178261425394926e-06,
"loss": 0.5362,
"step": 970
},
{
"epoch": 1.7,
"grad_norm": 1.720469955142088,
"learning_rate": 4.16692250129073e-06,
"loss": 0.5339,
"step": 975
},
{
"epoch": 1.71,
"grad_norm": 1.656259449924422,
"learning_rate": 4.15552149932753e-06,
"loss": 0.5324,
"step": 980
},
{
"epoch": 1.72,
"grad_norm": 1.605065327847084,
"learning_rate": 4.144058844090032e-06,
"loss": 0.5421,
"step": 985
},
{
"epoch": 1.73,
"grad_norm": 2.400014331232814,
"learning_rate": 4.1325349624589625e-06,
"loss": 0.5345,
"step": 990
},
{
"epoch": 1.74,
"grad_norm": 1.8217771753224867,
"learning_rate": 4.120950283595188e-06,
"loss": 0.5645,
"step": 995
},
{
"epoch": 1.75,
"grad_norm": 1.7606143199953959,
"learning_rate": 4.109305238923718e-06,
"loss": 0.5385,
"step": 1000
},
{
"epoch": 1.76,
"grad_norm": 1.854649950712452,
"learning_rate": 4.09760026211765e-06,
"loss": 0.537,
"step": 1005
},
{
"epoch": 1.77,
"grad_norm": 1.834481039824983,
"learning_rate": 4.0858357890820115e-06,
"loss": 0.5256,
"step": 1010
},
{
"epoch": 1.77,
"grad_norm": 1.7881039817362976,
"learning_rate": 4.0740122579375284e-06,
"loss": 0.5492,
"step": 1015
},
{
"epoch": 1.78,
"grad_norm": 1.99136744573053,
"learning_rate": 4.062130109004313e-06,
"loss": 0.5423,
"step": 1020
},
{
"epoch": 1.79,
"grad_norm": 2.067114672482736,
"learning_rate": 4.0501897847854596e-06,
"loss": 0.5248,
"step": 1025
},
{
"epoch": 1.8,
"grad_norm": 2.3619737058273698,
"learning_rate": 4.038191729950569e-06,
"loss": 0.5548,
"step": 1030
},
{
"epoch": 1.81,
"grad_norm": 1.6422923327155017,
"learning_rate": 4.026136391319187e-06,
"loss": 0.5661,
"step": 1035
},
{
"epoch": 1.82,
"grad_norm": 1.7084270040887584,
"learning_rate": 4.014024217844167e-06,
"loss": 0.5413,
"step": 1040
},
{
"epoch": 1.83,
"grad_norm": 2.4216667822493387,
"learning_rate": 4.001855660594948e-06,
"loss": 0.5251,
"step": 1045
},
{
"epoch": 1.84,
"grad_norm": 1.6894496135090933,
"learning_rate": 3.989631172740756e-06,
"loss": 0.5282,
"step": 1050
},
{
"epoch": 1.84,
"grad_norm": 2.3374959162830224,
"learning_rate": 3.97735120953373e-06,
"loss": 0.5375,
"step": 1055
},
{
"epoch": 1.85,
"grad_norm": 1.6036838276324374,
"learning_rate": 3.965016228291966e-06,
"loss": 0.5482,
"step": 1060
},
{
"epoch": 1.86,
"grad_norm": 1.5838188110377205,
"learning_rate": 3.9526266883824865e-06,
"loss": 0.5406,
"step": 1065
},
{
"epoch": 1.87,
"grad_norm": 2.925850402777526,
"learning_rate": 3.940183051204133e-06,
"loss": 0.5514,
"step": 1070
},
{
"epoch": 1.88,
"grad_norm": 1.8345221327535834,
"learning_rate": 3.927685780170385e-06,
"loss": 0.5351,
"step": 1075
},
{
"epoch": 1.89,
"grad_norm": 1.7672217360317701,
"learning_rate": 3.915135340692098e-06,
"loss": 0.5431,
"step": 1080
},
{
"epoch": 1.9,
"grad_norm": 1.7690557869414958,
"learning_rate": 3.902532200160174e-06,
"loss": 0.5332,
"step": 1085
},
{
"epoch": 1.91,
"grad_norm": 1.8016714829119445,
"learning_rate": 3.889876827928156e-06,
"loss": 0.5547,
"step": 1090
},
{
"epoch": 1.91,
"grad_norm": 1.8101317872578178,
"learning_rate": 3.877169695294749e-06,
"loss": 0.5152,
"step": 1095
},
{
"epoch": 1.92,
"grad_norm": 1.863558573257381,
"learning_rate": 3.8644112754862614e-06,
"loss": 0.5496,
"step": 1100
},
{
"epoch": 1.93,
"grad_norm": 1.6834072929471855,
"learning_rate": 3.8516020436389945e-06,
"loss": 0.5403,
"step": 1105
},
{
"epoch": 1.94,
"grad_norm": 1.8343601221708872,
"learning_rate": 3.838742476781535e-06,
"loss": 0.5449,
"step": 1110
},
{
"epoch": 1.95,
"grad_norm": 1.8111180639710172,
"learning_rate": 3.825833053816998e-06,
"loss": 0.5177,
"step": 1115
},
{
"epoch": 1.96,
"grad_norm": 1.760616458847631,
"learning_rate": 3.812874255505191e-06,
"loss": 0.5282,
"step": 1120
},
{
"epoch": 1.97,
"grad_norm": 1.662863038395529,
"learning_rate": 3.7998665644447064e-06,
"loss": 0.5462,
"step": 1125
},
{
"epoch": 1.98,
"grad_norm": 1.5940854942690412,
"learning_rate": 3.786810465054953e-06,
"loss": 0.5155,
"step": 1130
},
{
"epoch": 1.98,
"grad_norm": 1.7490423857630772,
"learning_rate": 3.773706443558112e-06,
"loss": 0.5333,
"step": 1135
},
{
"epoch": 1.99,
"grad_norm": 1.7805842103529974,
"learning_rate": 3.7605549879610346e-06,
"loss": 0.523,
"step": 1140
},
{
"epoch": 2.0,
"eval_loss": 0.48952990770339966,
"eval_runtime": 329.9798,
"eval_samples_per_second": 22.792,
"eval_steps_per_second": 0.358,
"step": 1144
},
{
"epoch": 2.0,
"grad_norm": 3.530130888128286,
"learning_rate": 3.747356588037064e-06,
"loss": 0.4829,
"step": 1145
},
{
"epoch": 2.01,
"grad_norm": 2.3455796414814656,
"learning_rate": 3.7341117353077964e-06,
"loss": 0.4185,
"step": 1150
},
{
"epoch": 2.02,
"grad_norm": 11.689021838621713,
"learning_rate": 3.7208209230247785e-06,
"loss": 0.4431,
"step": 1155
},
{
"epoch": 2.03,
"grad_norm": 2.136634988191819,
"learning_rate": 3.7074846461511336e-06,
"loss": 0.4166,
"step": 1160
},
{
"epoch": 2.04,
"grad_norm": 1.8007601624193452,
"learning_rate": 3.694103401343136e-06,
"loss": 0.4247,
"step": 1165
},
{
"epoch": 2.05,
"grad_norm": 1.8280089181701968,
"learning_rate": 3.6806776869317074e-06,
"loss": 0.4184,
"step": 1170
},
{
"epoch": 2.05,
"grad_norm": 1.8498758253178664,
"learning_rate": 3.667208002903863e-06,
"loss": 0.4015,
"step": 1175
},
{
"epoch": 2.06,
"grad_norm": 1.7616805224163088,
"learning_rate": 3.6536948508840915e-06,
"loss": 0.4139,
"step": 1180
},
{
"epoch": 2.07,
"grad_norm": 1.7368842598902192,
"learning_rate": 3.6401387341156715e-06,
"loss": 0.3943,
"step": 1185
},
{
"epoch": 2.08,
"grad_norm": 1.735642439390496,
"learning_rate": 3.6265401574419316e-06,
"loss": 0.4362,
"step": 1190
},
{
"epoch": 2.09,
"grad_norm": 1.9606530829602056,
"learning_rate": 3.6128996272874523e-06,
"loss": 0.4177,
"step": 1195
},
{
"epoch": 2.1,
"grad_norm": 2.045121741468278,
"learning_rate": 3.5992176516392007e-06,
"loss": 0.4191,
"step": 1200
},
{
"epoch": 2.11,
"grad_norm": 1.8838455742670526,
"learning_rate": 3.5854947400276164e-06,
"loss": 0.414,
"step": 1205
},
{
"epoch": 2.12,
"grad_norm": 2.0713934176853366,
"learning_rate": 3.5717314035076355e-06,
"loss": 0.423,
"step": 1210
},
{
"epoch": 2.12,
"grad_norm": 1.9586979622942737,
"learning_rate": 3.5579281546396582e-06,
"loss": 0.4089,
"step": 1215
},
{
"epoch": 2.13,
"grad_norm": 1.6793543323852747,
"learning_rate": 3.54408550747046e-06,
"loss": 0.4045,
"step": 1220
},
{
"epoch": 2.14,
"grad_norm": 1.8609322408923432,
"learning_rate": 3.530203977514049e-06,
"loss": 0.4216,
"step": 1225
},
{
"epoch": 2.15,
"grad_norm": 1.9301155764363938,
"learning_rate": 3.516284081732466e-06,
"loss": 0.4141,
"step": 1230
},
{
"epoch": 2.16,
"grad_norm": 1.8203713381481375,
"learning_rate": 3.5023263385165346e-06,
"loss": 0.4193,
"step": 1235
},
{
"epoch": 2.17,
"grad_norm": 1.986035063455061,
"learning_rate": 3.4883312676665537e-06,
"loss": 0.4102,
"step": 1240
},
{
"epoch": 2.18,
"grad_norm": 1.932701768003649,
"learning_rate": 3.4742993903729423e-06,
"loss": 0.4096,
"step": 1245
},
{
"epoch": 2.19,
"grad_norm": 2.072592848243265,
"learning_rate": 3.460231229196826e-06,
"loss": 0.4356,
"step": 1250
},
{
"epoch": 2.19,
"grad_norm": 1.8608782853161558,
"learning_rate": 3.446127308050579e-06,
"loss": 0.422,
"step": 1255
},
{
"epoch": 2.2,
"grad_norm": 1.8587160248700436,
"learning_rate": 3.431988152178315e-06,
"loss": 0.4055,
"step": 1260
},
{
"epoch": 2.21,
"grad_norm": 1.7275839544864644,
"learning_rate": 3.4178142881363192e-06,
"loss": 0.3994,
"step": 1265
},
{
"epoch": 2.22,
"grad_norm": 1.7408220116241242,
"learning_rate": 3.4036062437734484e-06,
"loss": 0.4247,
"step": 1270
},
{
"epoch": 2.23,
"grad_norm": 1.8684812995394824,
"learning_rate": 3.3893645482114663e-06,
"loss": 0.4013,
"step": 1275
},
{
"epoch": 2.24,
"grad_norm": 1.9169356401008089,
"learning_rate": 3.3750897318253407e-06,
"loss": 0.4436,
"step": 1280
},
{
"epoch": 2.25,
"grad_norm": 1.9965429262903887,
"learning_rate": 3.3607823262234936e-06,
"loss": 0.4052,
"step": 1285
},
{
"epoch": 2.26,
"grad_norm": 1.676108249215827,
"learning_rate": 3.3464428642280004e-06,
"loss": 0.4043,
"step": 1290
},
{
"epoch": 2.26,
"grad_norm": 1.759052520530038,
"learning_rate": 3.3320718798547503e-06,
"loss": 0.3891,
"step": 1295
},
{
"epoch": 2.27,
"grad_norm": 1.8158002528007304,
"learning_rate": 3.3176699082935546e-06,
"loss": 0.4048,
"step": 1300
},
{
"epoch": 2.28,
"grad_norm": 1.8540661043983968,
"learning_rate": 3.303237485888221e-06,
"loss": 0.413,
"step": 1305
},
{
"epoch": 2.29,
"grad_norm": 1.8368083328897664,
"learning_rate": 3.2887751501165755e-06,
"loss": 0.4134,
"step": 1310
},
{
"epoch": 2.3,
"grad_norm": 1.943248250811944,
"learning_rate": 3.2742834395704486e-06,
"loss": 0.4225,
"step": 1315
},
{
"epoch": 2.31,
"grad_norm": 2.023022399396665,
"learning_rate": 3.2597628939356174e-06,
"loss": 0.4045,
"step": 1320
},
{
"epoch": 2.32,
"grad_norm": 2.0127015131643575,
"learning_rate": 3.2452140539717047e-06,
"loss": 0.4157,
"step": 1325
},
{
"epoch": 2.33,
"grad_norm": 1.9502787177069507,
"learning_rate": 3.2306374614920434e-06,
"loss": 0.4037,
"step": 1330
},
{
"epoch": 2.33,
"grad_norm": 1.938220761297596,
"learning_rate": 3.2160336593434977e-06,
"loss": 0.4065,
"step": 1335
},
{
"epoch": 2.34,
"grad_norm": 1.8287659925527244,
"learning_rate": 3.201403191386247e-06,
"loss": 0.4249,
"step": 1340
},
{
"epoch": 2.35,
"grad_norm": 2.115173595697826,
"learning_rate": 3.1867466024735327e-06,
"loss": 0.4083,
"step": 1345
},
{
"epoch": 2.36,
"grad_norm": 1.8653664947018882,
"learning_rate": 3.1720644384313647e-06,
"loss": 0.4311,
"step": 1350
},
{
"epoch": 2.37,
"grad_norm": 2.0817854592111025,
"learning_rate": 3.1573572460381992e-06,
"loss": 0.4184,
"step": 1355
},
{
"epoch": 2.38,
"grad_norm": 1.9348176314156256,
"learning_rate": 3.1426255730045703e-06,
"loss": 0.4231,
"step": 1360
},
{
"epoch": 2.39,
"grad_norm": 1.9693517430562117,
"learning_rate": 3.127869967952698e-06,
"loss": 0.4206,
"step": 1365
},
{
"epoch": 2.4,
"grad_norm": 1.8456135208525017,
"learning_rate": 3.1130909803960533e-06,
"loss": 0.3978,
"step": 1370
},
{
"epoch": 2.4,
"grad_norm": 1.9626926891274619,
"learning_rate": 3.0982891607188948e-06,
"loss": 0.4064,
"step": 1375
},
{
"epoch": 2.41,
"grad_norm": 2.026442414272127,
"learning_rate": 3.0834650601557724e-06,
"loss": 0.4149,
"step": 1380
},
{
"epoch": 2.42,
"grad_norm": 2.063023886136181,
"learning_rate": 3.068619230770999e-06,
"loss": 0.4118,
"step": 1385
},
{
"epoch": 2.43,
"grad_norm": 1.9784482842258986,
"learning_rate": 3.0537522254380902e-06,
"loss": 0.4026,
"step": 1390
},
{
"epoch": 2.44,
"grad_norm": 1.921786550245392,
"learning_rate": 3.0388645978191745e-06,
"loss": 0.4044,
"step": 1395
},
{
"epoch": 2.45,
"grad_norm": 1.9377855933194186,
"learning_rate": 3.0239569023443756e-06,
"loss": 0.4294,
"step": 1400
},
{
"epoch": 2.46,
"grad_norm": 1.8926358118848756,
"learning_rate": 3.0090296941911633e-06,
"loss": 0.4131,
"step": 1405
},
{
"epoch": 2.47,
"grad_norm": 1.8195567532258379,
"learning_rate": 2.9940835292636806e-06,
"loss": 0.416,
"step": 1410
},
{
"epoch": 2.47,
"grad_norm": 1.9219319571815525,
"learning_rate": 2.9791189641720385e-06,
"loss": 0.4023,
"step": 1415
},
{
"epoch": 2.48,
"grad_norm": 1.9615684580403125,
"learning_rate": 2.9641365562115886e-06,
"loss": 0.4256,
"step": 1420
},
{
"epoch": 2.49,
"grad_norm": 2.2610537590860145,
"learning_rate": 2.949136863342169e-06,
"loss": 0.4101,
"step": 1425
},
{
"epoch": 2.5,
"grad_norm": 1.9357969284665328,
"learning_rate": 2.9341204441673267e-06,
"loss": 0.4202,
"step": 1430
},
{
"epoch": 2.51,
"grad_norm": 1.8520492192305298,
"learning_rate": 2.9190878579135077e-06,
"loss": 0.4155,
"step": 1435
},
{
"epoch": 2.52,
"grad_norm": 1.9246616441113338,
"learning_rate": 2.904039664409244e-06,
"loss": 0.4056,
"step": 1440
},
{
"epoch": 2.53,
"grad_norm": 2.030321498137203,
"learning_rate": 2.888976424064289e-06,
"loss": 0.4176,
"step": 1445
},
{
"epoch": 2.53,
"grad_norm": 2.238812621763622,
"learning_rate": 2.8738986978487625e-06,
"loss": 0.4049,
"step": 1450
},
{
"epoch": 2.54,
"grad_norm": 2.259109743694307,
"learning_rate": 2.8588070472722486e-06,
"loss": 0.3983,
"step": 1455
},
{
"epoch": 2.55,
"grad_norm": 2.437226455840826,
"learning_rate": 2.8437020343628896e-06,
"loss": 0.3935,
"step": 1460
},
{
"epoch": 2.56,
"grad_norm": 2.346717301291877,
"learning_rate": 2.8285842216464544e-06,
"loss": 0.4093,
"step": 1465
},
{
"epoch": 2.57,
"grad_norm": 2.4385081373620197,
"learning_rate": 2.813454172125389e-06,
"loss": 0.4007,
"step": 1470
},
{
"epoch": 2.58,
"grad_norm": 2.0580498358246557,
"learning_rate": 2.79831244925785e-06,
"loss": 0.4012,
"step": 1475
},
{
"epoch": 2.59,
"grad_norm": 1.8904939031404155,
"learning_rate": 2.783159616936723e-06,
"loss": 0.4247,
"step": 1480
},
{
"epoch": 2.6,
"grad_norm": 1.939667430958579,
"learning_rate": 2.76799623946862e-06,
"loss": 0.3983,
"step": 1485
},
{
"epoch": 2.6,
"grad_norm": 1.9363488042991572,
"learning_rate": 2.7528228815528622e-06,
"loss": 0.4054,
"step": 1490
},
{
"epoch": 2.61,
"grad_norm": 1.9510664856850064,
"learning_rate": 2.7376401082604563e-06,
"loss": 0.4098,
"step": 1495
},
{
"epoch": 2.62,
"grad_norm": 1.838136670411088,
"learning_rate": 2.722448485013046e-06,
"loss": 0.3997,
"step": 1500
},
{
"epoch": 2.63,
"grad_norm": 1.792723761937039,
"learning_rate": 2.707248577561854e-06,
"loss": 0.4029,
"step": 1505
},
{
"epoch": 2.64,
"grad_norm": 1.8102828207451078,
"learning_rate": 2.6920409519666173e-06,
"loss": 0.4249,
"step": 1510
},
{
"epoch": 2.65,
"grad_norm": 2.1220720458624465,
"learning_rate": 2.6768261745745037e-06,
"loss": 0.4106,
"step": 1515
},
{
"epoch": 2.66,
"grad_norm": 1.812641713649887,
"learning_rate": 2.6616048119990214e-06,
"loss": 0.3944,
"step": 1520
},
{
"epoch": 2.67,
"grad_norm": 2.4648885003841046,
"learning_rate": 2.6463774310989154e-06,
"loss": 0.3971,
"step": 1525
},
{
"epoch": 2.67,
"grad_norm": 1.85108667649988,
"learning_rate": 2.6311445989570633e-06,
"loss": 0.3975,
"step": 1530
},
{
"epoch": 2.68,
"grad_norm": 1.7524914725271679,
"learning_rate": 2.615906882859349e-06,
"loss": 0.392,
"step": 1535
},
{
"epoch": 2.69,
"grad_norm": 2.0229317958148867,
"learning_rate": 2.6006648502735384e-06,
"loss": 0.4123,
"step": 1540
},
{
"epoch": 2.7,
"grad_norm": 1.8570564954534334,
"learning_rate": 2.585419068828152e-06,
"loss": 0.4174,
"step": 1545
},
{
"epoch": 2.71,
"grad_norm": 1.8937349012574363,
"learning_rate": 2.5701701062913194e-06,
"loss": 0.4265,
"step": 1550
},
{
"epoch": 2.72,
"grad_norm": 1.7590069556724997,
"learning_rate": 2.554918530549637e-06,
"loss": 0.3863,
"step": 1555
},
{
"epoch": 2.73,
"grad_norm": 2.096408205769661,
"learning_rate": 2.53966490958702e-06,
"loss": 0.396,
"step": 1560
},
{
"epoch": 2.74,
"grad_norm": 1.962464797958802,
"learning_rate": 2.5244098114635503e-06,
"loss": 0.4142,
"step": 1565
},
{
"epoch": 2.74,
"grad_norm": 1.8458575978769718,
"learning_rate": 2.5091538042943183e-06,
"loss": 0.3946,
"step": 1570
},
{
"epoch": 2.75,
"grad_norm": 1.8269976161284471,
"learning_rate": 2.4938974562282708e-06,
"loss": 0.4127,
"step": 1575
},
{
"epoch": 2.76,
"grad_norm": 2.01278268309966,
"learning_rate": 2.4786413354270494e-06,
"loss": 0.4032,
"step": 1580
},
{
"epoch": 2.77,
"grad_norm": 1.7849542357371841,
"learning_rate": 2.4633860100438317e-06,
"loss": 0.3905,
"step": 1585
},
{
"epoch": 2.78,
"grad_norm": 69.63657032212085,
"learning_rate": 2.4481320482021716e-06,
"loss": 0.4038,
"step": 1590
},
{
"epoch": 2.79,
"grad_norm": 2.1742738648421023,
"learning_rate": 2.4328800179748475e-06,
"loss": 0.3888,
"step": 1595
},
{
"epoch": 2.8,
"grad_norm": 2.8552125373844985,
"learning_rate": 2.4176304873626983e-06,
"loss": 0.4066,
"step": 1600
},
{
"epoch": 2.81,
"grad_norm": 2.1841551939924244,
"learning_rate": 2.4023840242734774e-06,
"loss": 0.3997,
"step": 1605
},
{
"epoch": 2.81,
"grad_norm": 2.194040510042996,
"learning_rate": 2.3871411965006985e-06,
"loss": 0.4102,
"step": 1610
},
{
"epoch": 2.82,
"grad_norm": 2.3916077476597475,
"learning_rate": 2.3719025717024946e-06,
"loss": 0.4014,
"step": 1615
},
{
"epoch": 2.83,
"grad_norm": 1.9141926801542115,
"learning_rate": 2.3566687173804747e-06,
"loss": 0.3977,
"step": 1620
},
{
"epoch": 2.84,
"grad_norm": 1.8144717702331585,
"learning_rate": 2.341440200858589e-06,
"loss": 0.3964,
"step": 1625
},
{
"epoch": 2.85,
"grad_norm": 1.851593659208308,
"learning_rate": 2.3262175892620064e-06,
"loss": 0.4085,
"step": 1630
},
{
"epoch": 2.86,
"grad_norm": 1.8686287238442707,
"learning_rate": 2.311001449495986e-06,
"loss": 0.4002,
"step": 1635
},
{
"epoch": 2.87,
"grad_norm": 1.898471354316753,
"learning_rate": 2.2957923482247745e-06,
"loss": 0.399,
"step": 1640
},
{
"epoch": 2.88,
"grad_norm": 1.8645673290000866,
"learning_rate": 2.280590851850493e-06,
"loss": 0.4025,
"step": 1645
},
{
"epoch": 2.88,
"grad_norm": 1.8195968879508377,
"learning_rate": 2.265397526492052e-06,
"loss": 0.3942,
"step": 1650
},
{
"epoch": 2.89,
"grad_norm": 2.133179133462322,
"learning_rate": 2.2502129379640644e-06,
"loss": 0.4135,
"step": 1655
},
{
"epoch": 2.9,
"grad_norm": 1.790567048936156,
"learning_rate": 2.235037651755773e-06,
"loss": 0.3889,
"step": 1660
},
{
"epoch": 2.91,
"grad_norm": 1.7690615361620154,
"learning_rate": 2.2198722330099964e-06,
"loss": 0.3971,
"step": 1665
},
{
"epoch": 2.92,
"grad_norm": 2.0091067058646543,
"learning_rate": 2.2047172465020757e-06,
"loss": 0.3953,
"step": 1670
},
{
"epoch": 2.93,
"grad_norm": 1.9832659283001421,
"learning_rate": 2.1895732566188475e-06,
"loss": 0.396,
"step": 1675
},
{
"epoch": 2.94,
"grad_norm": 1.7695360334830164,
"learning_rate": 2.1744408273376204e-06,
"loss": 0.3864,
"step": 1680
},
{
"epoch": 2.95,
"grad_norm": 1.9266047240521331,
"learning_rate": 2.159320522205179e-06,
"loss": 0.3902,
"step": 1685
},
{
"epoch": 2.95,
"grad_norm": 1.9067745089946206,
"learning_rate": 2.1442129043167877e-06,
"loss": 0.4048,
"step": 1690
},
{
"epoch": 2.96,
"grad_norm": 1.8121353203087656,
"learning_rate": 2.1291185362952274e-06,
"loss": 0.3949,
"step": 1695
},
{
"epoch": 2.97,
"grad_norm": 1.8495794192789976,
"learning_rate": 2.114037980269842e-06,
"loss": 0.4038,
"step": 1700
},
{
"epoch": 2.98,
"grad_norm": 1.912104342675586,
"learning_rate": 2.0989717978555992e-06,
"loss": 0.3919,
"step": 1705
},
{
"epoch": 2.99,
"grad_norm": 1.8275208022684235,
"learning_rate": 2.0839205501321844e-06,
"loss": 0.4066,
"step": 1710
},
{
"epoch": 3.0,
"grad_norm": 2.057130603594985,
"learning_rate": 2.0688847976230952e-06,
"loss": 0.3858,
"step": 1715
},
{
"epoch": 3.0,
"eval_loss": 0.377420037984848,
"eval_runtime": 330.5022,
"eval_samples_per_second": 22.756,
"eval_steps_per_second": 0.357,
"step": 1716
},
{
"epoch": 3.01,
"grad_norm": 3.3388932256530697,
"learning_rate": 2.0538651002747745e-06,
"loss": 0.2972,
"step": 1720
},
{
"epoch": 3.02,
"grad_norm": 2.8981240612419423,
"learning_rate": 2.0388620174357542e-06,
"loss": 0.3002,
"step": 1725
},
{
"epoch": 3.02,
"grad_norm": 2.205591361217027,
"learning_rate": 2.023876107835825e-06,
"loss": 0.3135,
"step": 1730
},
{
"epoch": 3.03,
"grad_norm": 2.1495938527408835,
"learning_rate": 2.008907929565231e-06,
"loss": 0.3126,
"step": 1735
},
{
"epoch": 3.04,
"grad_norm": 2.3520040686767167,
"learning_rate": 1.993958040053881e-06,
"loss": 0.3209,
"step": 1740
},
{
"epoch": 3.05,
"grad_norm": 2.0650913293200848,
"learning_rate": 1.9790269960505947e-06,
"loss": 0.2944,
"step": 1745
},
{
"epoch": 3.06,
"grad_norm": 2.1220138588069704,
"learning_rate": 1.9641153536023646e-06,
"loss": 0.308,
"step": 1750
},
{
"epoch": 3.07,
"grad_norm": 2.150988951055739,
"learning_rate": 1.9492236680336486e-06,
"loss": 0.2931,
"step": 1755
},
{
"epoch": 3.08,
"grad_norm": 2.124000470558878,
"learning_rate": 1.934352493925695e-06,
"loss": 0.2995,
"step": 1760
},
{
"epoch": 3.09,
"grad_norm": 1.9934393020695829,
"learning_rate": 1.9195023850958812e-06,
"loss": 0.2926,
"step": 1765
},
{
"epoch": 3.09,
"grad_norm": 2.154505232085453,
"learning_rate": 1.9046738945770932e-06,
"loss": 0.3057,
"step": 1770
},
{
"epoch": 3.1,
"grad_norm": 2.1900958746909565,
"learning_rate": 1.889867574597129e-06,
"loss": 0.3012,
"step": 1775
},
{
"epoch": 3.11,
"grad_norm": 1.8955650762693546,
"learning_rate": 1.875083976558136e-06,
"loss": 0.2838,
"step": 1780
},
{
"epoch": 3.12,
"grad_norm": 2.0442738100465783,
"learning_rate": 1.860323651016072e-06,
"loss": 0.2977,
"step": 1785
},
{
"epoch": 3.13,
"grad_norm": 2.110043278615698,
"learning_rate": 1.8455871476602023e-06,
"loss": 0.2984,
"step": 1790
},
{
"epoch": 3.14,
"grad_norm": 2.3136510034991282,
"learning_rate": 1.8308750152926338e-06,
"loss": 0.2927,
"step": 1795
},
{
"epoch": 3.15,
"grad_norm": 2.0131063879803826,
"learning_rate": 1.8161878018078693e-06,
"loss": 0.2908,
"step": 1800
},
{
"epoch": 3.16,
"grad_norm": 2.1355586799767674,
"learning_rate": 1.8015260541724128e-06,
"loss": 0.2905,
"step": 1805
},
{
"epoch": 3.16,
"grad_norm": 2.0664019629242087,
"learning_rate": 1.7868903184043888e-06,
"loss": 0.2955,
"step": 1810
},
{
"epoch": 3.17,
"grad_norm": 2.188481583652719,
"learning_rate": 1.772281139553218e-06,
"loss": 0.3071,
"step": 1815
},
{
"epoch": 3.18,
"grad_norm": 2.0847120701500774,
"learning_rate": 1.7576990616793139e-06,
"loss": 0.2908,
"step": 1820
},
{
"epoch": 3.19,
"grad_norm": 2.018064853932535,
"learning_rate": 1.74314462783382e-06,
"loss": 0.2967,
"step": 1825
},
{
"epoch": 3.2,
"grad_norm": 2.0242538886830843,
"learning_rate": 1.7286183800383937e-06,
"loss": 0.3017,
"step": 1830
},
{
"epoch": 3.21,
"grad_norm": 2.145977098033245,
"learning_rate": 1.714120859265011e-06,
"loss": 0.2872,
"step": 1835
},
{
"epoch": 3.22,
"grad_norm": 1.9029268616901622,
"learning_rate": 1.6996526054158283e-06,
"loss": 0.2888,
"step": 1840
},
{
"epoch": 3.23,
"grad_norm": 4.096519984140158,
"learning_rate": 1.685214157303069e-06,
"loss": 0.2966,
"step": 1845
},
{
"epoch": 3.23,
"grad_norm": 2.1188967978481013,
"learning_rate": 1.6708060526289648e-06,
"loss": 0.299,
"step": 1850
},
{
"epoch": 3.24,
"grad_norm": 2.7278823372957666,
"learning_rate": 1.6564288279657253e-06,
"loss": 0.3034,
"step": 1855
},
{
"epoch": 3.25,
"grad_norm": 2.0002017052238257,
"learning_rate": 1.6420830187355572e-06,
"loss": 0.2958,
"step": 1860
},
{
"epoch": 3.26,
"grad_norm": 3.6893974280355097,
"learning_rate": 1.6277691591907272e-06,
"loss": 0.2954,
"step": 1865
},
{
"epoch": 3.27,
"grad_norm": 1.8667517556916042,
"learning_rate": 1.613487782393661e-06,
"loss": 0.2933,
"step": 1870
},
{
"epoch": 3.28,
"grad_norm": 2.0838269309261364,
"learning_rate": 1.599239420197098e-06,
"loss": 0.297,
"step": 1875
},
{
"epoch": 3.29,
"grad_norm": 1.9755359084479687,
"learning_rate": 1.5850246032242766e-06,
"loss": 0.2909,
"step": 1880
},
{
"epoch": 3.3,
"grad_norm": 2096.7504686136917,
"learning_rate": 1.5708438608491816e-06,
"loss": 0.2951,
"step": 1885
},
{
"epoch": 3.3,
"grad_norm": 2.918573785412608,
"learning_rate": 1.556697721176823e-06,
"loss": 0.3012,
"step": 1890
},
{
"epoch": 3.31,
"grad_norm": 4.274328804415084,
"learning_rate": 1.5425867110235717e-06,
"loss": 0.2876,
"step": 1895
},
{
"epoch": 3.32,
"grad_norm": 5.92446369343208,
"learning_rate": 1.5285113558975429e-06,
"loss": 0.3466,
"step": 1900
},
{
"epoch": 3.33,
"grad_norm": 5.122609680763134,
"learning_rate": 1.5144721799790194e-06,
"loss": 0.3023,
"step": 1905
},
{
"epoch": 3.34,
"grad_norm": 5.036779121717785,
"learning_rate": 1.5004697061009372e-06,
"loss": 0.3013,
"step": 1910
},
{
"epoch": 3.35,
"grad_norm": 4.110935956575215,
"learning_rate": 1.486504455729408e-06,
"loss": 0.2915,
"step": 1915
},
{
"epoch": 3.36,
"grad_norm": 2.867011840176696,
"learning_rate": 1.4725769489443082e-06,
"loss": 0.2822,
"step": 1920
},
{
"epoch": 3.37,
"grad_norm": 2.9818315723679327,
"learning_rate": 1.4586877044199015e-06,
"loss": 0.286,
"step": 1925
},
{
"epoch": 3.37,
"grad_norm": 3.284507097488489,
"learning_rate": 1.4448372394055249e-06,
"loss": 0.3031,
"step": 1930
},
{
"epoch": 3.38,
"grad_norm": 2.6446231205609907,
"learning_rate": 1.431026069706335e-06,
"loss": 0.2951,
"step": 1935
},
{
"epoch": 3.39,
"grad_norm": 2.429736926817698,
"learning_rate": 1.4172547096640837e-06,
"loss": 0.2948,
"step": 1940
},
{
"epoch": 3.4,
"grad_norm": 2.4087174238529565,
"learning_rate": 1.4035236721379758e-06,
"loss": 0.3079,
"step": 1945
},
{
"epoch": 3.41,
"grad_norm": 2.346665896073256,
"learning_rate": 1.3898334684855647e-06,
"loss": 0.2884,
"step": 1950
},
{
"epoch": 3.42,
"grad_norm": 2.2112665369477806,
"learning_rate": 1.376184608543709e-06,
"loss": 0.2784,
"step": 1955
},
{
"epoch": 3.43,
"grad_norm": 2.3555732075174705,
"learning_rate": 1.3625776006095882e-06,
"loss": 0.2951,
"step": 1960
},
{
"epoch": 3.44,
"grad_norm": 2.453057863316602,
"learning_rate": 1.3490129514217665e-06,
"loss": 0.2912,
"step": 1965
},
{
"epoch": 3.44,
"grad_norm": 2.349215074609662,
"learning_rate": 1.3354911661413305e-06,
"loss": 0.3175,
"step": 1970
},
{
"epoch": 3.45,
"grad_norm": 2.369663823495378,
"learning_rate": 1.3220127483330714e-06,
"loss": 0.3115,
"step": 1975
},
{
"epoch": 3.46,
"grad_norm": 2.353086780550841,
"learning_rate": 1.3085781999467303e-06,
"loss": 0.2974,
"step": 1980
},
{
"epoch": 3.47,
"grad_norm": 2.283791495168621,
"learning_rate": 1.2951880212983106e-06,
"loss": 0.2963,
"step": 1985
},
{
"epoch": 3.48,
"grad_norm": 2.285838470923522,
"learning_rate": 1.2818427110514382e-06,
"loss": 0.2925,
"step": 1990
},
{
"epoch": 3.49,
"grad_norm": 2.0947843282235863,
"learning_rate": 1.2685427661987975e-06,
"loss": 0.2944,
"step": 1995
},
{
"epoch": 3.5,
"grad_norm": 2.181297764037496,
"learning_rate": 1.2552886820436208e-06,
"loss": 0.2821,
"step": 2000
},
{
"epoch": 3.51,
"grad_norm": 2.2572030477763847,
"learning_rate": 1.2420809521812406e-06,
"loss": 0.299,
"step": 2005
},
{
"epoch": 3.51,
"grad_norm": 2.2492785376097677,
"learning_rate": 1.2289200684807098e-06,
"loss": 0.3049,
"step": 2010
},
{
"epoch": 3.52,
"grad_norm": 2.334560206327675,
"learning_rate": 1.2158065210664848e-06,
"loss": 0.2877,
"step": 2015
},
{
"epoch": 3.53,
"grad_norm": 2.0972126060506375,
"learning_rate": 1.2027407983001683e-06,
"loss": 0.2818,
"step": 2020
},
{
"epoch": 3.54,
"grad_norm": 2.545800792955665,
"learning_rate": 1.189723386762328e-06,
"loss": 0.2927,
"step": 2025
},
{
"epoch": 3.55,
"grad_norm": 2.2889188636916624,
"learning_rate": 1.1767547712343722e-06,
"loss": 0.3015,
"step": 2030
},
{
"epoch": 3.56,
"grad_norm": 2.073915350133538,
"learning_rate": 1.1638354346804974e-06,
"loss": 0.2784,
"step": 2035
},
{
"epoch": 3.57,
"grad_norm": 2.2964925609294826,
"learning_rate": 1.1509658582297025e-06,
"loss": 0.2906,
"step": 2040
},
{
"epoch": 3.58,
"grad_norm": 2.12295197546072,
"learning_rate": 1.1381465211578673e-06,
"loss": 0.2894,
"step": 2045
},
{
"epoch": 3.58,
"grad_norm": 21.13479225301652,
"learning_rate": 1.1253779008699131e-06,
"loss": 0.2995,
"step": 2050
},
{
"epoch": 3.59,
"grad_norm": 1.9921395090376566,
"learning_rate": 1.1126604728820102e-06,
"loss": 0.2766,
"step": 2055
},
{
"epoch": 3.6,
"grad_norm": 2.1541306330776258,
"learning_rate": 1.0999947108038816e-06,
"loss": 0.3002,
"step": 2060
},
{
"epoch": 3.61,
"grad_norm": 2.3718321918672824,
"learning_rate": 1.0873810863211595e-06,
"loss": 0.2894,
"step": 2065
},
{
"epoch": 3.62,
"grad_norm": 2.106333020947831,
"learning_rate": 1.074820069177816e-06,
"loss": 0.3026,
"step": 2070
},
{
"epoch": 3.63,
"grad_norm": 2.232973434321066,
"learning_rate": 1.0623121271586806e-06,
"loss": 0.287,
"step": 2075
},
{
"epoch": 3.64,
"grad_norm": 2.150526937883329,
"learning_rate": 1.049857726072005e-06,
"loss": 0.3079,
"step": 2080
},
{
"epoch": 3.65,
"grad_norm": 2.4299034348212034,
"learning_rate": 1.037457329732127e-06,
"loss": 0.2912,
"step": 2085
},
{
"epoch": 3.65,
"grad_norm": 2.2110962609469644,
"learning_rate": 1.0251113999421936e-06,
"loss": 0.2983,
"step": 2090
},
{
"epoch": 3.66,
"grad_norm": 2.2895025988377857,
"learning_rate": 1.0128203964769602e-06,
"loss": 0.2804,
"step": 2095
},
{
"epoch": 3.67,
"grad_norm": 1.9663163475739167,
"learning_rate": 1.0005847770656757e-06,
"loss": 0.3099,
"step": 2100
},
{
"epoch": 3.68,
"grad_norm": 1.955773416240966,
"learning_rate": 9.884049973750268e-07,
"loss": 0.2843,
"step": 2105
},
{
"epoch": 3.69,
"grad_norm": 2.151841568064239,
"learning_rate": 9.762815109921762e-07,
"loss": 0.2898,
"step": 2110
},
{
"epoch": 3.7,
"grad_norm": 2.2055014532745925,
"learning_rate": 9.642147694078664e-07,
"loss": 0.2922,
"step": 2115
},
{
"epoch": 3.71,
"grad_norm": 2.0548163347288955,
"learning_rate": 9.522052219996072e-07,
"loss": 0.2875,
"step": 2120
},
{
"epoch": 3.72,
"grad_norm": 2.0264167858867554,
"learning_rate": 9.402533160149415e-07,
"loss": 0.2916,
"step": 2125
},
{
"epoch": 3.72,
"grad_norm": 1.9310228298997407,
"learning_rate": 9.283594965547846e-07,
"loss": 0.288,
"step": 2130
},
{
"epoch": 3.73,
"grad_norm": 2.0775492416338994,
"learning_rate": 9.165242065568547e-07,
"loss": 0.3013,
"step": 2135
},
{
"epoch": 3.74,
"grad_norm": 4.273341280017886,
"learning_rate": 9.047478867791732e-07,
"loss": 0.2963,
"step": 2140
},
{
"epoch": 3.75,
"grad_norm": 2.2046805609188,
"learning_rate": 8.930309757836517e-07,
"loss": 0.2884,
"step": 2145
},
{
"epoch": 3.76,
"grad_norm": 2.0943612862709022,
"learning_rate": 8.813739099197597e-07,
"loss": 0.288,
"step": 2150
},
{
"epoch": 3.77,
"grad_norm": 2.470082871683493,
"learning_rate": 8.697771233082744e-07,
"loss": 0.2958,
"step": 2155
},
{
"epoch": 3.78,
"grad_norm": 2.514580950169932,
"learning_rate": 8.582410478251119e-07,
"loss": 0.2877,
"step": 2160
},
{
"epoch": 3.78,
"grad_norm": 2.4225946060971326,
"learning_rate": 8.467661130852464e-07,
"loss": 0.2964,
"step": 2165
},
{
"epoch": 3.79,
"grad_norm": 2.5213364012816752,
"learning_rate": 8.353527464267105e-07,
"loss": 0.3139,
"step": 2170
},
{
"epoch": 3.8,
"grad_norm": 2.1383450216449766,
"learning_rate": 8.240013728946786e-07,
"loss": 0.3011,
"step": 2175
},
{
"epoch": 3.81,
"grad_norm": 2.275623309460668,
"learning_rate": 8.127124152256408e-07,
"loss": 0.2877,
"step": 2180
},
{
"epoch": 3.82,
"grad_norm": 2.3231659654583945,
"learning_rate": 8.014862938316542e-07,
"loss": 0.2877,
"step": 2185
},
{
"epoch": 3.83,
"grad_norm": 2.034432326023906,
"learning_rate": 7.903234267846965e-07,
"loss": 0.2891,
"step": 2190
},
{
"epoch": 3.84,
"grad_norm": 2.1427143907161024,
"learning_rate": 7.792242298010847e-07,
"loss": 0.2905,
"step": 2195
},
{
"epoch": 3.85,
"grad_norm": 1.992125321368494,
"learning_rate": 7.681891162260016e-07,
"loss": 0.2904,
"step": 2200
},
{
"epoch": 3.85,
"grad_norm": 2.2912621172371073,
"learning_rate": 7.572184970181005e-07,
"loss": 0.2931,
"step": 2205
},
{
"epoch": 3.86,
"grad_norm": 2.1841846462580303,
"learning_rate": 7.463127807341966e-07,
"loss": 0.2888,
"step": 2210
},
{
"epoch": 3.87,
"grad_norm": 2.4186238871530947,
"learning_rate": 7.354723735140609e-07,
"loss": 0.2972,
"step": 2215
},
{
"epoch": 3.88,
"grad_norm": 2.165303668523032,
"learning_rate": 7.246976790652843e-07,
"loss": 0.2894,
"step": 2220
},
{
"epoch": 3.89,
"grad_norm": 1.971010064873144,
"learning_rate": 7.139890986482515e-07,
"loss": 0.2801,
"step": 2225
},
{
"epoch": 3.9,
"grad_norm": 2.226759681716193,
"learning_rate": 7.033470310611945e-07,
"loss": 0.2936,
"step": 2230
},
{
"epoch": 3.91,
"grad_norm": 2.128956719636366,
"learning_rate": 6.927718726253379e-07,
"loss": 0.2873,
"step": 2235
},
{
"epoch": 3.92,
"grad_norm": 1.9680023419439248,
"learning_rate": 6.822640171701486e-07,
"loss": 0.2946,
"step": 2240
},
{
"epoch": 3.92,
"grad_norm": 2.2622423856131344,
"learning_rate": 6.718238560186572e-07,
"loss": 0.2825,
"step": 2245
},
{
"epoch": 3.93,
"grad_norm": 1.947066365404126,
"learning_rate": 6.614517779728943e-07,
"loss": 0.2893,
"step": 2250
},
{
"epoch": 3.94,
"grad_norm": 2.2897303224060064,
"learning_rate": 6.511481692994077e-07,
"loss": 0.2816,
"step": 2255
},
{
"epoch": 3.95,
"grad_norm": 2.1661490360203937,
"learning_rate": 6.409134137148737e-07,
"loss": 0.2961,
"step": 2260
},
{
"epoch": 3.96,
"grad_norm": 2.163171587598097,
"learning_rate": 6.307478923718171e-07,
"loss": 0.2858,
"step": 2265
},
{
"epoch": 3.97,
"grad_norm": 2.03844883078776,
"learning_rate": 6.206519838444044e-07,
"loss": 0.2802,
"step": 2270
},
{
"epoch": 3.98,
"grad_norm": 1.950829082933861,
"learning_rate": 6.106260641143547e-07,
"loss": 0.3058,
"step": 2275
},
{
"epoch": 3.99,
"grad_norm": 1.971999267799782,
"learning_rate": 6.006705065569329e-07,
"loss": 0.2809,
"step": 2280
},
{
"epoch": 3.99,
"grad_norm": 1.9523923513832082,
"learning_rate": 5.907856819270471e-07,
"loss": 0.281,
"step": 2285
},
{
"epoch": 4.0,
"eval_loss": 0.3295736014842987,
"eval_runtime": 330.2445,
"eval_samples_per_second": 22.774,
"eval_steps_per_second": 0.357,
"step": 2288
},
{
"epoch": 4.0,
"grad_norm": 4.044140540928026,
"learning_rate": 5.809719583454415e-07,
"loss": 0.2618,
"step": 2290
},
{
"epoch": 4.01,
"grad_norm": 2.5480857730573305,
"learning_rate": 5.712297012849826e-07,
"loss": 0.2349,
"step": 2295
},
{
"epoch": 4.02,
"grad_norm": 2.2011961310601276,
"learning_rate": 5.615592735570563e-07,
"loss": 0.2555,
"step": 2300
},
{
"epoch": 4.03,
"grad_norm": 2.1457420228511497,
"learning_rate": 5.519610352980501e-07,
"loss": 0.2208,
"step": 2305
},
{
"epoch": 4.04,
"grad_norm": 1.989514783963616,
"learning_rate": 5.424353439559446e-07,
"loss": 0.24,
"step": 2310
},
{
"epoch": 4.05,
"grad_norm": 5.657828438864317,
"learning_rate": 5.32982554277e-07,
"loss": 0.2344,
"step": 2315
},
{
"epoch": 4.06,
"grad_norm": 2.1124182295185783,
"learning_rate": 5.236030182925475e-07,
"loss": 0.2332,
"step": 2320
},
{
"epoch": 4.06,
"grad_norm": 2.1054589415417797,
"learning_rate": 5.142970853058743e-07,
"loss": 0.2264,
"step": 2325
},
{
"epoch": 4.07,
"grad_norm": 2.1186956472831557,
"learning_rate": 5.050651018792213e-07,
"loss": 0.2331,
"step": 2330
},
{
"epoch": 4.08,
"grad_norm": 1.9513447351491608,
"learning_rate": 4.959074118208726e-07,
"loss": 0.2243,
"step": 2335
},
{
"epoch": 4.09,
"grad_norm": 2.020062184794575,
"learning_rate": 4.868243561723535e-07,
"loss": 0.2467,
"step": 2340
},
{
"epoch": 4.1,
"grad_norm": 2.180369971274554,
"learning_rate": 4.7781627319573e-07,
"loss": 0.2396,
"step": 2345
},
{
"epoch": 4.11,
"grad_norm": 1.9826937910162328,
"learning_rate": 4.688834983610083e-07,
"loss": 0.23,
"step": 2350
},
{
"epoch": 4.12,
"grad_norm": 1.9374964597970792,
"learning_rate": 4.6002636433364836e-07,
"loss": 0.233,
"step": 2355
},
{
"epoch": 4.13,
"grad_norm": 2.2161679673685364,
"learning_rate": 4.512452009621665e-07,
"loss": 0.2355,
"step": 2360
},
{
"epoch": 4.13,
"grad_norm": 2.0656182008910258,
"learning_rate": 4.4254033526585917e-07,
"loss": 0.2298,
"step": 2365
},
{
"epoch": 4.14,
"grad_norm": 1.9301077360385739,
"learning_rate": 4.3391209142261996e-07,
"loss": 0.2244,
"step": 2370
},
{
"epoch": 4.15,
"grad_norm": 2.019772633601012,
"learning_rate": 4.2536079075686687e-07,
"loss": 0.239,
"step": 2375
},
{
"epoch": 4.16,
"grad_norm": 1.9236814219014258,
"learning_rate": 4.168867517275807e-07,
"loss": 0.2294,
"step": 2380
},
{
"epoch": 4.17,
"grad_norm": 1.9496838165175443,
"learning_rate": 4.0849028991643726e-07,
"loss": 0.2351,
"step": 2385
},
{
"epoch": 4.18,
"grad_norm": 2.0751662501448687,
"learning_rate": 4.0017171801606245e-07,
"loss": 0.2328,
"step": 2390
},
{
"epoch": 4.19,
"grad_norm": 2.030778712243648,
"learning_rate": 3.9193134581838375e-07,
"loss": 0.2193,
"step": 2395
},
{
"epoch": 4.2,
"grad_norm": 2.1108537998820696,
"learning_rate": 3.8376948020309083e-07,
"loss": 0.2259,
"step": 2400
},
{
"epoch": 4.2,
"grad_norm": 2.0388571280858003,
"learning_rate": 3.756864251262143e-07,
"loss": 0.2335,
"step": 2405
},
{
"epoch": 4.21,
"grad_norm": 2.4800994906904616,
"learning_rate": 3.6768248160879786e-07,
"loss": 0.2359,
"step": 2410
},
{
"epoch": 4.22,
"grad_norm": 2.0463977880911193,
"learning_rate": 3.597579477256932e-07,
"loss": 0.2296,
"step": 2415
},
{
"epoch": 4.23,
"grad_norm": 1.980879433572184,
"learning_rate": 3.51913118594458e-07,
"loss": 0.2244,
"step": 2420
},
{
"epoch": 4.24,
"grad_norm": 2.201271813612607,
"learning_rate": 3.4414828636436525e-07,
"loss": 0.2355,
"step": 2425
},
{
"epoch": 4.25,
"grad_norm": 1.9358796946633714,
"learning_rate": 3.364637402055235e-07,
"loss": 0.2235,
"step": 2430
},
{
"epoch": 4.26,
"grad_norm": 1.9943772531112582,
"learning_rate": 3.288597662981069e-07,
"loss": 0.2363,
"step": 2435
},
{
"epoch": 4.27,
"grad_norm": 1.9701699768295131,
"learning_rate": 3.2133664782169947e-07,
"loss": 0.2211,
"step": 2440
},
{
"epoch": 4.27,
"grad_norm": 1.9846134376402154,
"learning_rate": 3.138946649447483e-07,
"loss": 0.2421,
"step": 2445
},
{
"epoch": 4.28,
"grad_norm": 2.1267366719554084,
"learning_rate": 3.0653409481412906e-07,
"loss": 0.232,
"step": 2450
},
{
"epoch": 4.29,
"grad_norm": 2.025895740497588,
"learning_rate": 2.9925521154482577e-07,
"loss": 0.2327,
"step": 2455
},
{
"epoch": 4.3,
"grad_norm": 2.0801594057057304,
"learning_rate": 2.9205828620972267e-07,
"loss": 0.229,
"step": 2460
},
{
"epoch": 4.31,
"grad_norm": 2.1905576705856413,
"learning_rate": 2.8494358682950725e-07,
"loss": 0.2343,
"step": 2465
},
{
"epoch": 4.32,
"grad_norm": 2.104131043625037,
"learning_rate": 2.779113783626916e-07,
"loss": 0.2324,
"step": 2470
},
{
"epoch": 4.33,
"grad_norm": 1.9860664411869802,
"learning_rate": 2.70961922695743e-07,
"loss": 0.2238,
"step": 2475
},
{
"epoch": 4.34,
"grad_norm": 2.0126712420413324,
"learning_rate": 2.6409547863333246e-07,
"loss": 0.2203,
"step": 2480
},
{
"epoch": 4.34,
"grad_norm": 1.9772873060488734,
"learning_rate": 2.573123018886961e-07,
"loss": 0.231,
"step": 2485
},
{
"epoch": 4.35,
"grad_norm": 1.8887328536766113,
"learning_rate": 2.5061264507411057e-07,
"loss": 0.2135,
"step": 2490
},
{
"epoch": 4.36,
"grad_norm": 2.1093653326387933,
"learning_rate": 2.4399675769148784e-07,
"loss": 0.2262,
"step": 2495
},
{
"epoch": 4.37,
"grad_norm": 2.209645499404587,
"learning_rate": 2.37464886123083e-07,
"loss": 0.2307,
"step": 2500
},
{
"epoch": 4.38,
"grad_norm": 2.0867824783223208,
"learning_rate": 2.3101727362231762e-07,
"loss": 0.2363,
"step": 2505
},
{
"epoch": 4.39,
"grad_norm": 2.1576991729700814,
"learning_rate": 2.2465416030472227e-07,
"loss": 0.2331,
"step": 2510
},
{
"epoch": 4.4,
"grad_norm": 2.0751768584129495,
"learning_rate": 2.18375783138991e-07,
"loss": 0.2243,
"step": 2515
},
{
"epoch": 4.41,
"grad_norm": 1.8719170813482977,
"learning_rate": 2.1218237593816305e-07,
"loss": 0.2366,
"step": 2520
},
{
"epoch": 4.41,
"grad_norm": 1.9719981089033465,
"learning_rate": 2.0607416935090746e-07,
"loss": 0.2239,
"step": 2525
},
{
"epoch": 4.42,
"grad_norm": 2.1100472183339276,
"learning_rate": 2.0005139085293945e-07,
"loss": 0.2384,
"step": 2530
},
{
"epoch": 4.43,
"grad_norm": 2.194632892084896,
"learning_rate": 1.941142647385469e-07,
"loss": 0.2311,
"step": 2535
},
{
"epoch": 4.44,
"grad_norm": 2.0340858963446475,
"learning_rate": 1.882630121122353e-07,
"loss": 0.2358,
"step": 2540
},
{
"epoch": 4.45,
"grad_norm": 1.9791028474343204,
"learning_rate": 1.8249785088049894e-07,
"loss": 0.2389,
"step": 2545
},
{
"epoch": 4.46,
"grad_norm": 2.2469036964697082,
"learning_rate": 1.7681899574369916e-07,
"loss": 0.2373,
"step": 2550
},
{
"epoch": 4.47,
"grad_norm": 1.9763859240449058,
"learning_rate": 1.7122665818807478e-07,
"loss": 0.2242,
"step": 2555
},
{
"epoch": 4.48,
"grad_norm": 2.2911402959565224,
"learning_rate": 1.6572104647786247e-07,
"loss": 0.2411,
"step": 2560
},
{
"epoch": 4.48,
"grad_norm": 2.083914931198256,
"learning_rate": 1.6030236564754054e-07,
"loss": 0.2228,
"step": 2565
},
{
"epoch": 4.49,
"grad_norm": 1.9520019226109886,
"learning_rate": 1.5497081749419745e-07,
"loss": 0.223,
"step": 2570
},
{
"epoch": 4.5,
"grad_norm": 2.000958322889037,
"learning_rate": 1.497266005700107e-07,
"loss": 0.2366,
"step": 2575
},
{
"epoch": 4.51,
"grad_norm": 2.255516690447902,
"learning_rate": 1.4456991017485737e-07,
"loss": 0.2228,
"step": 2580
},
{
"epoch": 4.52,
"grad_norm": 1.9859817862123228,
"learning_rate": 1.3950093834903865e-07,
"loss": 0.2277,
"step": 2585
},
{
"epoch": 4.53,
"grad_norm": 1.9478379602583347,
"learning_rate": 1.3451987386612852e-07,
"loss": 0.2303,
"step": 2590
},
{
"epoch": 4.54,
"grad_norm": 2.0551925040719135,
"learning_rate": 1.2962690222594444e-07,
"loss": 0.2213,
"step": 2595
},
{
"epoch": 4.55,
"grad_norm": 1.9946612862734567,
"learning_rate": 1.2482220564763669e-07,
"loss": 0.2332,
"step": 2600
},
{
"epoch": 4.55,
"grad_norm": 2.013120854912158,
"learning_rate": 1.2010596306290588e-07,
"loss": 0.2171,
"step": 2605
},
{
"epoch": 4.56,
"grad_norm": 3.943001009546542,
"learning_rate": 1.154783501093365e-07,
"loss": 0.2351,
"step": 2610
},
{
"epoch": 4.57,
"grad_norm": 2.077111667283586,
"learning_rate": 1.1093953912385769e-07,
"loss": 0.2227,
"step": 2615
},
{
"epoch": 4.58,
"grad_norm": 1.9319320014149188,
"learning_rate": 1.0648969913632401e-07,
"loss": 0.2349,
"step": 2620
},
{
"epoch": 4.59,
"grad_norm": 2.0904879562505125,
"learning_rate": 1.0212899586322183e-07,
"loss": 0.2353,
"step": 2625
},
{
"epoch": 4.6,
"grad_norm": 1.9353613819189108,
"learning_rate": 9.785759170149622e-08,
"loss": 0.2207,
"step": 2630
},
{
"epoch": 4.61,
"grad_norm": 2.070941108809218,
"learning_rate": 9.36756457225052e-08,
"loss": 0.2333,
"step": 2635
},
{
"epoch": 4.62,
"grad_norm": 1.962848104620626,
"learning_rate": 8.958331366609424e-08,
"loss": 0.2268,
"step": 2640
},
{
"epoch": 4.62,
"grad_norm": 2.0575315291559857,
"learning_rate": 8.55807479347967e-08,
"loss": 0.2253,
"step": 2645
},
{
"epoch": 4.63,
"grad_norm": 2.107360332698912,
"learning_rate": 8.166809758815897e-08,
"loss": 0.224,
"step": 2650
},
{
"epoch": 4.64,
"grad_norm": 2.144090062153445,
"learning_rate": 7.784550833718707e-08,
"loss": 0.2365,
"step": 2655
},
{
"epoch": 4.65,
"grad_norm": 1.9482188832166278,
"learning_rate": 7.411312253892466e-08,
"loss": 0.2294,
"step": 2660
},
{
"epoch": 4.66,
"grad_norm": 2.1524181670324616,
"learning_rate": 7.047107919114588e-08,
"loss": 0.2336,
"step": 2665
},
{
"epoch": 4.67,
"grad_norm": 2.152296301431861,
"learning_rate": 6.691951392718332e-08,
"loss": 0.2258,
"step": 2670
},
{
"epoch": 4.68,
"grad_norm": 1.9081910207653139,
"learning_rate": 6.345855901087522e-08,
"loss": 0.2338,
"step": 2675
},
{
"epoch": 4.69,
"grad_norm": 1.938526307346203,
"learning_rate": 6.008834333163876e-08,
"loss": 0.2195,
"step": 2680
},
{
"epoch": 4.69,
"grad_norm": 1.9617786271701327,
"learning_rate": 5.680899239967369e-08,
"loss": 0.2337,
"step": 2685
},
{
"epoch": 4.7,
"grad_norm": 2.070106598519499,
"learning_rate": 5.3620628341283234e-08,
"loss": 0.2339,
"step": 2690
},
{
"epoch": 4.71,
"grad_norm": 2.063857678666163,
"learning_rate": 5.052336989433082e-08,
"loss": 0.2424,
"step": 2695
},
{
"epoch": 4.72,
"grad_norm": 1.9748870315279254,
"learning_rate": 4.75173324038139e-08,
"loss": 0.2284,
"step": 2700
},
{
"epoch": 4.73,
"grad_norm": 2.001809998998157,
"learning_rate": 4.4602627817571245e-08,
"loss": 0.2358,
"step": 2705
},
{
"epoch": 4.74,
"grad_norm": 1.9729922140795935,
"learning_rate": 4.1779364682113796e-08,
"loss": 0.233,
"step": 2710
},
{
"epoch": 4.75,
"grad_norm": 2.045685009725174,
"learning_rate": 3.904764813858014e-08,
"loss": 0.2323,
"step": 2715
},
{
"epoch": 4.76,
"grad_norm": 1.9591449865359234,
"learning_rate": 3.640757991882349e-08,
"loss": 0.2349,
"step": 2720
},
{
"epoch": 4.76,
"grad_norm": 1.9911019888259411,
"learning_rate": 3.385925834162113e-08,
"loss": 0.2257,
"step": 2725
},
{
"epoch": 4.77,
"grad_norm": 2.050714072028577,
"learning_rate": 3.1402778309014284e-08,
"loss": 0.2299,
"step": 2730
},
{
"epoch": 4.78,
"grad_norm": 2.0356703719044518,
"learning_rate": 2.903823130277289e-08,
"loss": 0.2277,
"step": 2735
},
{
"epoch": 4.79,
"grad_norm": 1.9823356155432006,
"learning_rate": 2.676570538098944e-08,
"loss": 0.2431,
"step": 2740
},
{
"epoch": 4.8,
"grad_norm": 2.0568572715035063,
"learning_rate": 2.4585285174799357e-08,
"loss": 0.2331,
"step": 2745
},
{
"epoch": 4.81,
"grad_norm": 2.195948388536599,
"learning_rate": 2.2497051885228825e-08,
"loss": 0.2308,
"step": 2750
},
{
"epoch": 4.82,
"grad_norm": 2.1647004516523523,
"learning_rate": 2.050108328017164e-08,
"loss": 0.2418,
"step": 2755
},
{
"epoch": 4.83,
"grad_norm": 2.3975608426150656,
"learning_rate": 1.8597453691492628e-08,
"loss": 0.2318,
"step": 2760
},
{
"epoch": 4.83,
"grad_norm": 2.0876506445205716,
"learning_rate": 1.678623401225876e-08,
"loss": 0.2227,
"step": 2765
},
{
"epoch": 4.84,
"grad_norm": 1.9235370393907136,
"learning_rate": 1.5067491694100156e-08,
"loss": 0.2317,
"step": 2770
},
{
"epoch": 4.85,
"grad_norm": 2.166013576809247,
"learning_rate": 1.3441290744697632e-08,
"loss": 0.2281,
"step": 2775
},
{
"epoch": 4.86,
"grad_norm": 1.9685518937742286,
"learning_rate": 1.1907691725398795e-08,
"loss": 0.2238,
"step": 2780
},
{
"epoch": 4.87,
"grad_norm": 1.9483454828149802,
"learning_rate": 1.0466751748963443e-08,
"loss": 0.2304,
"step": 2785
},
{
"epoch": 4.88,
"grad_norm": 1.9753685457102212,
"learning_rate": 9.118524477434999e-09,
"loss": 0.2365,
"step": 2790
},
{
"epoch": 4.89,
"grad_norm": 1.8854947658053076,
"learning_rate": 7.863060120144316e-09,
"loss": 0.2142,
"step": 2795
},
{
"epoch": 4.9,
"grad_norm": 2.154842589588192,
"learning_rate": 6.700405431837587e-09,
"loss": 0.2315,
"step": 2800
},
{
"epoch": 4.9,
"grad_norm": 2.0037090722639506,
"learning_rate": 5.6306037109371544e-09,
"loss": 0.2225,
"step": 2805
},
{
"epoch": 4.91,
"grad_norm": 1.9552444706301575,
"learning_rate": 4.653694797927544e-09,
"loss": 0.2331,
"step": 2810
},
{
"epoch": 4.92,
"grad_norm": 2.563219810008139,
"learning_rate": 3.769715073872749e-09,
"loss": 0.2407,
"step": 2815
},
{
"epoch": 4.93,
"grad_norm": 1.9200483422222523,
"learning_rate": 2.978697459060098e-09,
"loss": 0.23,
"step": 2820
},
{
"epoch": 4.94,
"grad_norm": 1.9640394162849764,
"learning_rate": 2.280671411776514e-09,
"loss": 0.2264,
"step": 2825
},
{
"epoch": 4.95,
"grad_norm": 2.1505121388781467,
"learning_rate": 1.6756629272085545e-09,
"loss": 0.2278,
"step": 2830
},
{
"epoch": 4.96,
"grad_norm": 1.9911030010645636,
"learning_rate": 1.1636945364768005e-09,
"loss": 0.2341,
"step": 2835
},
{
"epoch": 4.97,
"grad_norm": 2.0018264068324187,
"learning_rate": 7.447853057954146e-10,
"loss": 0.2185,
"step": 2840
},
{
"epoch": 4.97,
"grad_norm": 1.9862221260079735,
"learning_rate": 4.1895083576271035e-10,
"loss": 0.221,
"step": 2845
},
{
"epoch": 4.98,
"grad_norm": 1.9962774038489774,
"learning_rate": 1.8620326077967155e-10,
"loss": 0.2316,
"step": 2850
},
{
"epoch": 4.99,
"grad_norm": 1.9534249748428105,
"learning_rate": 4.6551248598647457e-11,
"loss": 0.2333,
"step": 2855
},
{
"epoch": 5.0,
"grad_norm": 2.0900193031037806,
"learning_rate": 0.0,
"loss": 0.229,
"step": 2860
},
{
"epoch": 5.0,
"eval_loss": 0.3284313678741455,
"eval_runtime": 329.743,
"eval_samples_per_second": 22.809,
"eval_steps_per_second": 0.358,
"step": 2860
},
{
"epoch": 5.0,
"step": 2860,
"total_flos": 2395303260979200.0,
"train_loss": 0.44255039914921446,
"train_runtime": 63778.4258,
"train_samples_per_second": 5.733,
"train_steps_per_second": 0.045
}
],
"logging_steps": 5,
"max_steps": 2860,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"total_flos": 2395303260979200.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}