{
"best_metric": 2.4689557552337646,
"best_model_checkpoint": "./output/training_results/C017_random_sample_llama3-8b-base_pretrain_20240504_182259/checkpoint-800",
"epoch": 4.0,
"eval_steps": 200,
"global_step": 3944,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010141987829614604,
"grad_norm": 4.267137538119642,
"learning_rate": 7.5e-07,
"loss": 2.7134,
"step": 1
},
{
"epoch": 0.005070993914807302,
"grad_norm": 4.879489677016923,
"learning_rate": 2.25e-06,
"loss": 2.7254,
"step": 5
},
{
"epoch": 0.010141987829614604,
"grad_norm": 2.7621009561709564,
"learning_rate": 6e-06,
"loss": 2.707,
"step": 10
},
{
"epoch": 0.015212981744421906,
"grad_norm": 2.404100845677231,
"learning_rate": 9e-06,
"loss": 2.6421,
"step": 15
},
{
"epoch": 0.02028397565922921,
"grad_norm": 2.4429846538599254,
"learning_rate": 1.275e-05,
"loss": 2.6682,
"step": 20
},
{
"epoch": 0.02535496957403651,
"grad_norm": 2.8575493026010625,
"learning_rate": 1.4916395742870319e-05,
"loss": 2.6639,
"step": 25
},
{
"epoch": 0.030425963488843813,
"grad_norm": 2.4347171369214538,
"learning_rate": 1.4709241308404976e-05,
"loss": 2.6624,
"step": 30
},
{
"epoch": 0.035496957403651115,
"grad_norm": 2.5792627004512942,
"learning_rate": 1.4504714365262738e-05,
"loss": 2.6351,
"step": 35
},
{
"epoch": 0.04056795131845842,
"grad_norm": 2.1789139866654366,
"learning_rate": 1.4302784881547452e-05,
"loss": 2.6055,
"step": 40
},
{
"epoch": 0.04563894523326572,
"grad_norm": 2.232485210798856,
"learning_rate": 1.4103423130872168e-05,
"loss": 2.5938,
"step": 45
},
{
"epoch": 0.05070993914807302,
"grad_norm": 2.2896589926745814,
"learning_rate": 1.390659968963626e-05,
"loss": 2.6334,
"step": 50
},
{
"epoch": 0.055780933062880324,
"grad_norm": 2.7780457428021985,
"learning_rate": 1.3712285434323396e-05,
"loss": 2.646,
"step": 55
},
{
"epoch": 0.060851926977687626,
"grad_norm": 1.9399001575023072,
"learning_rate": 1.352045153882017e-05,
"loss": 2.6182,
"step": 60
},
{
"epoch": 0.06592292089249494,
"grad_norm": 1.9083156579424998,
"learning_rate": 1.3331069471755332e-05,
"loss": 2.6056,
"step": 65
},
{
"epoch": 0.07099391480730223,
"grad_norm": 2.2298396560554683,
"learning_rate": 1.314411099385942e-05,
"loss": 2.6043,
"step": 70
},
{
"epoch": 0.07606490872210954,
"grad_norm": 1.9661711744318215,
"learning_rate": 1.2959548155344706e-05,
"loss": 2.6321,
"step": 75
},
{
"epoch": 0.08113590263691683,
"grad_norm": 2.1260634398939438,
"learning_rate": 1.2777353293305311e-05,
"loss": 2.5744,
"step": 80
},
{
"epoch": 0.08620689655172414,
"grad_norm": 2.171189842092272,
"learning_rate": 1.2597499029137354e-05,
"loss": 2.6102,
"step": 85
},
{
"epoch": 0.09127789046653144,
"grad_norm": 2.118995328928547,
"learning_rate": 1.2419958265979023e-05,
"loss": 2.6056,
"step": 90
},
{
"epoch": 0.09634888438133875,
"grad_norm": 2.1743656445294466,
"learning_rate": 1.2244704186170414e-05,
"loss": 2.591,
"step": 95
},
{
"epoch": 0.10141987829614604,
"grad_norm": 2.100620832387391,
"learning_rate": 1.2106129489565247e-05,
"loss": 2.6461,
"step": 100
},
{
"epoch": 0.10649087221095335,
"grad_norm": 2.02911049207023,
"learning_rate": 1.1934924740853141e-05,
"loss": 2.5878,
"step": 105
},
{
"epoch": 0.11156186612576065,
"grad_norm": 2.12870974325018,
"learning_rate": 1.1765933050017452e-05,
"loss": 2.5793,
"step": 110
},
{
"epoch": 0.11663286004056796,
"grad_norm": 1.9038783159180614,
"learning_rate": 1.1599128637544344e-05,
"loss": 2.5612,
"step": 115
},
{
"epoch": 0.12170385395537525,
"grad_norm": 1.9647399779959451,
"learning_rate": 1.1434485991200533e-05,
"loss": 2.6083,
"step": 120
},
{
"epoch": 0.12677484787018256,
"grad_norm": 1.88937427094592,
"learning_rate": 1.1271979863605386e-05,
"loss": 2.5561,
"step": 125
},
{
"epoch": 0.13184584178498987,
"grad_norm": 1.8208051471693176,
"learning_rate": 1.111158526982193e-05,
"loss": 2.5884,
"step": 130
},
{
"epoch": 0.13691683569979715,
"grad_norm": 1.771422341312915,
"learning_rate": 1.0953277484966689e-05,
"loss": 2.5509,
"step": 135
},
{
"epoch": 0.14198782961460446,
"grad_norm": 1.8296701053391813,
"learning_rate": 1.0797032041838185e-05,
"loss": 2.5784,
"step": 140
},
{
"epoch": 0.14705882352941177,
"grad_norm": 1.8139046565289612,
"learning_rate": 1.0642824728564022e-05,
"loss": 2.5624,
"step": 145
},
{
"epoch": 0.15212981744421908,
"grad_norm": 1.9862915107502803,
"learning_rate": 1.0490631586266381e-05,
"loss": 2.6007,
"step": 150
},
{
"epoch": 0.15720081135902636,
"grad_norm": 1.8392246134083736,
"learning_rate": 1.0340428906745863e-05,
"loss": 2.5775,
"step": 155
},
{
"epoch": 0.16227180527383367,
"grad_norm": 1.9250085841598776,
"learning_rate": 1.0192193230183505e-05,
"loss": 2.6045,
"step": 160
},
{
"epoch": 0.16734279918864098,
"grad_norm": 2.1119936162911825,
"learning_rate": 1.0045901342860905e-05,
"loss": 2.5838,
"step": 165
},
{
"epoch": 0.1724137931034483,
"grad_norm": 1.9416866546338962,
"learning_rate": 9.901530274898272e-06,
"loss": 2.5643,
"step": 170
},
{
"epoch": 0.17748478701825557,
"grad_norm": 1.871570899679003,
"learning_rate": 9.75905729801036e-06,
"loss": 2.5549,
"step": 175
},
{
"epoch": 0.18255578093306288,
"grad_norm": 2.0672616615182897,
"learning_rate": 9.61845992328009e-06,
"loss": 2.561,
"step": 180
},
{
"epoch": 0.1876267748478702,
"grad_norm": 1.8373271363293353,
"learning_rate": 9.479715898949807e-06,
"loss": 2.5728,
"step": 185
},
{
"epoch": 0.1926977687626775,
"grad_norm": 1.9497106449021773,
"learning_rate": 9.342803208230014e-06,
"loss": 2.5535,
"step": 190
},
{
"epoch": 0.19776876267748478,
"grad_norm": 1.913646357656738,
"learning_rate": 9.207700067125492e-06,
"loss": 2.5411,
"step": 195
},
{
"epoch": 0.2028397565922921,
"grad_norm": 1.7027113982701332,
"learning_rate": 9.074384922278684e-06,
"loss": 2.5442,
"step": 200
},
{
"epoch": 0.2028397565922921,
"eval_loss": 2.55521821975708,
"eval_runtime": 81.0607,
"eval_samples_per_second": 86.429,
"eval_steps_per_second": 0.679,
"step": 200
},
{
"epoch": 0.2079107505070994,
"grad_norm": 1.753576639879344,
"learning_rate": 8.942836448830213e-06,
"loss": 2.5264,
"step": 205
},
{
"epoch": 0.2129817444219067,
"grad_norm": 1.7785092188900598,
"learning_rate": 8.813033548296443e-06,
"loss": 2.5645,
"step": 210
},
{
"epoch": 0.21805273833671399,
"grad_norm": 1.7915296631060966,
"learning_rate": 8.684955346463971e-06,
"loss": 2.555,
"step": 215
},
{
"epoch": 0.2231237322515213,
"grad_norm": 1.7452346531223148,
"learning_rate": 8.558581191300906e-06,
"loss": 2.6118,
"step": 220
},
{
"epoch": 0.2281947261663286,
"grad_norm": 2.339774136223256,
"learning_rate": 8.433890650884857e-06,
"loss": 2.5284,
"step": 225
},
{
"epoch": 0.2332657200811359,
"grad_norm": 1.7961229516339332,
"learning_rate": 8.310863511347508e-06,
"loss": 2.558,
"step": 230
},
{
"epoch": 0.2383367139959432,
"grad_norm": 2.000305491613022,
"learning_rate": 8.189479774835651e-06,
"loss": 2.5312,
"step": 235
},
{
"epoch": 0.2434077079107505,
"grad_norm": 1.9162907270104979,
"learning_rate": 8.069719657488614e-06,
"loss": 2.4983,
"step": 240
},
{
"epoch": 0.2484787018255578,
"grad_norm": 1.9447544732938296,
"learning_rate": 7.951563587431902e-06,
"loss": 2.5462,
"step": 245
},
{
"epoch": 0.2535496957403651,
"grad_norm": 1.8244106804572084,
"learning_rate": 7.834992202787018e-06,
"loss": 2.5354,
"step": 250
},
{
"epoch": 0.25862068965517243,
"grad_norm": 1.714609238517639,
"learning_rate": 7.719986349697309e-06,
"loss": 2.5386,
"step": 255
},
{
"epoch": 0.26369168356997974,
"grad_norm": 1.795436681758725,
"learning_rate": 7.606527080369728e-06,
"loss": 2.5388,
"step": 260
},
{
"epoch": 0.268762677484787,
"grad_norm": 1.7081706265027667,
"learning_rate": 7.494595651132443e-06,
"loss": 2.568,
"step": 265
},
{
"epoch": 0.2738336713995943,
"grad_norm": 1.6958291617768828,
"learning_rate": 7.384173520508138e-06,
"loss": 2.5489,
"step": 270
},
{
"epoch": 0.2789046653144016,
"grad_norm": 1.6677502189962874,
"learning_rate": 7.275242347302937e-06,
"loss": 2.5666,
"step": 275
},
{
"epoch": 0.2839756592292089,
"grad_norm": 1.6916519769077745,
"learning_rate": 7.167783988710829e-06,
"loss": 2.5161,
"step": 280
},
{
"epoch": 0.28904665314401623,
"grad_norm": 1.9276199368209956,
"learning_rate": 7.061780498433485e-06,
"loss": 2.5461,
"step": 285
},
{
"epoch": 0.29411764705882354,
"grad_norm": 1.721858200785338,
"learning_rate": 6.957214124815376e-06,
"loss": 2.56,
"step": 290
},
{
"epoch": 0.29918864097363085,
"grad_norm": 1.7023218265873687,
"learning_rate": 6.854067308994081e-06,
"loss": 2.5252,
"step": 295
},
{
"epoch": 0.30425963488843816,
"grad_norm": 1.7702063060142263,
"learning_rate": 6.752322683065677e-06,
"loss": 2.5365,
"step": 300
},
{
"epoch": 0.3093306288032454,
"grad_norm": 1.807175965887596,
"learning_rate": 6.651963068265119e-06,
"loss": 2.5351,
"step": 305
},
{
"epoch": 0.3144016227180527,
"grad_norm": 1.7687398862728192,
"learning_rate": 6.5529714731614995e-06,
"loss": 2.5184,
"step": 310
},
{
"epoch": 0.31947261663286003,
"grad_norm": 1.808664958617461,
"learning_rate": 6.455331091868087e-06,
"loss": 2.5062,
"step": 315
},
{
"epoch": 0.32454361054766734,
"grad_norm": 1.9021979000655393,
"learning_rate": 6.359025302267049e-06,
"loss": 2.5225,
"step": 320
},
{
"epoch": 0.32961460446247465,
"grad_norm": 1.704473391712384,
"learning_rate": 6.264037664248752e-06,
"loss": 2.5233,
"step": 325
},
{
"epoch": 0.33468559837728196,
"grad_norm": 1.751379362669565,
"learning_rate": 6.17035191796554e-06,
"loss": 2.4854,
"step": 330
},
{
"epoch": 0.33975659229208927,
"grad_norm": 1.6980009285341724,
"learning_rate": 6.077951982099886e-06,
"loss": 2.5008,
"step": 335
},
{
"epoch": 0.3448275862068966,
"grad_norm": 1.6987141788770321,
"learning_rate": 5.986821952146847e-06,
"loss": 2.5438,
"step": 340
},
{
"epoch": 0.34989858012170383,
"grad_norm": 1.6781775461943316,
"learning_rate": 5.89694609871067e-06,
"loss": 2.5417,
"step": 345
},
{
"epoch": 0.35496957403651114,
"grad_norm": 1.7326892052245193,
"learning_rate": 5.808308865815513e-06,
"loss": 2.5185,
"step": 350
},
{
"epoch": 0.36004056795131845,
"grad_norm": 1.743645811121294,
"learning_rate": 5.720894869230136e-06,
"loss": 2.5094,
"step": 355
},
{
"epoch": 0.36511156186612576,
"grad_norm": 1.7256678519147217,
"learning_rate": 5.634688894806482e-06,
"loss": 2.5316,
"step": 360
},
{
"epoch": 0.37018255578093306,
"grad_norm": 1.6209115792712339,
"learning_rate": 5.549675896832072e-06,
"loss": 2.5164,
"step": 365
},
{
"epoch": 0.3752535496957404,
"grad_norm": 1.6497735310259896,
"learning_rate": 5.465840996396076e-06,
"loss": 2.5363,
"step": 370
},
{
"epoch": 0.3803245436105477,
"grad_norm": 1.665747208014539,
"learning_rate": 5.383169479769005e-06,
"loss": 2.5015,
"step": 375
},
{
"epoch": 0.385395537525355,
"grad_norm": 1.8360023746562857,
"learning_rate": 5.301646796795905e-06,
"loss": 2.4465,
"step": 380
},
{
"epoch": 0.39046653144016225,
"grad_norm": 1.721788501212322,
"learning_rate": 5.221258559302969e-06,
"loss": 2.5104,
"step": 385
},
{
"epoch": 0.39553752535496955,
"grad_norm": 1.7896539066797603,
"learning_rate": 5.141990539517474e-06,
"loss": 2.5406,
"step": 390
},
{
"epoch": 0.40060851926977686,
"grad_norm": 1.7026594592165973,
"learning_rate": 5.0638286685009445e-06,
"loss": 2.5403,
"step": 395
},
{
"epoch": 0.4056795131845842,
"grad_norm": 1.7666645373608338,
"learning_rate": 4.986759034595453e-06,
"loss": 2.5376,
"step": 400
},
{
"epoch": 0.4056795131845842,
"eval_loss": 2.509550094604492,
"eval_runtime": 81.0126,
"eval_samples_per_second": 86.48,
"eval_steps_per_second": 0.679,
"step": 400
},
{
"epoch": 0.4107505070993915,
"grad_norm": 1.702454460655481,
"learning_rate": 4.910767881882966e-06,
"loss": 2.5017,
"step": 405
},
{
"epoch": 0.4158215010141988,
"grad_norm": 1.6625424708509573,
"learning_rate": 4.83584160865765e-06,
"loss": 2.5271,
"step": 410
},
{
"epoch": 0.4208924949290061,
"grad_norm": 1.6622717975288752,
"learning_rate": 4.761966765911026e-06,
"loss": 2.5238,
"step": 415
},
{
"epoch": 0.4259634888438134,
"grad_norm": 1.6256800857720881,
"learning_rate": 4.689130055829907e-06,
"loss": 2.5191,
"step": 420
},
{
"epoch": 0.43103448275862066,
"grad_norm": 1.7950911413498376,
"learning_rate": 4.617318330307044e-06,
"loss": 2.4909,
"step": 425
},
{
"epoch": 0.43610547667342797,
"grad_norm": 1.5866160053351177,
"learning_rate": 4.5465185894642715e-06,
"loss": 2.5128,
"step": 430
},
{
"epoch": 0.4411764705882353,
"grad_norm": 1.6754882575554404,
"learning_rate": 4.476717980188313e-06,
"loss": 2.5028,
"step": 435
},
{
"epoch": 0.4462474645030426,
"grad_norm": 1.6606915353792953,
"learning_rate": 4.407903794678819e-06,
"loss": 2.5207,
"step": 440
},
{
"epoch": 0.4513184584178499,
"grad_norm": 1.8160247477825882,
"learning_rate": 4.340063469008923e-06,
"loss": 2.5017,
"step": 445
},
{
"epoch": 0.4563894523326572,
"grad_norm": 1.7663094048322825,
"learning_rate": 4.2731845816978475e-06,
"loss": 2.5021,
"step": 450
},
{
"epoch": 0.4614604462474645,
"grad_norm": 1.7799998175038592,
"learning_rate": 4.207254852295854e-06,
"loss": 2.4953,
"step": 455
},
{
"epoch": 0.4665314401622718,
"grad_norm": 1.6715645487953392,
"learning_rate": 4.142262139981073e-06,
"loss": 2.4435,
"step": 460
},
{
"epoch": 0.4716024340770791,
"grad_norm": 1.7256265015398793,
"learning_rate": 4.078194442168494e-06,
"loss": 2.5146,
"step": 465
},
{
"epoch": 0.4766734279918864,
"grad_norm": 1.6662015811964308,
"learning_rate": 4.015039893130705e-06,
"loss": 2.5187,
"step": 470
},
{
"epoch": 0.4817444219066937,
"grad_norm": 1.7649431318197315,
"learning_rate": 3.952786762630535e-06,
"loss": 2.5223,
"step": 475
},
{
"epoch": 0.486815415821501,
"grad_norm": 1.679617464261057,
"learning_rate": 3.891423454565385e-06,
"loss": 2.4394,
"step": 480
},
{
"epoch": 0.4918864097363083,
"grad_norm": 1.6233085596184735,
"learning_rate": 3.830938505623211e-06,
"loss": 2.512,
"step": 485
},
{
"epoch": 0.4969574036511156,
"grad_norm": 1.7195900327055993,
"learning_rate": 3.7713205839500707e-06,
"loss": 2.4649,
"step": 490
},
{
"epoch": 0.5020283975659229,
"grad_norm": 1.7034828407083669,
"learning_rate": 3.7125584878291374e-06,
"loss": 2.497,
"step": 495
},
{
"epoch": 0.5070993914807302,
"grad_norm": 1.7618287486879018,
"learning_rate": 3.6546411443711164e-06,
"loss": 2.5353,
"step": 500
},
{
"epoch": 0.5121703853955375,
"grad_norm": 1.6191614066287776,
"learning_rate": 3.597557608215969e-06,
"loss": 2.5052,
"step": 505
},
{
"epoch": 0.5172413793103449,
"grad_norm": 1.6450813134062763,
"learning_rate": 3.54129706024587e-06,
"loss": 2.5106,
"step": 510
},
{
"epoch": 0.5223123732251521,
"grad_norm": 1.7767916102532666,
"learning_rate": 3.4858488063093135e-06,
"loss": 2.4651,
"step": 515
},
{
"epoch": 0.5273833671399595,
"grad_norm": 1.6720237829560067,
"learning_rate": 3.431202275956285e-06,
"loss": 2.4908,
"step": 520
},
{
"epoch": 0.5324543610547667,
"grad_norm": 1.6484154917054958,
"learning_rate": 3.3773470211844283e-06,
"loss": 2.4856,
"step": 525
},
{
"epoch": 0.537525354969574,
"grad_norm": 1.651838194240797,
"learning_rate": 3.324272715196116e-06,
"loss": 2.4675,
"step": 530
},
{
"epoch": 0.5425963488843814,
"grad_norm": 1.6241151521510617,
"learning_rate": 3.2719691511663524e-06,
"loss": 2.4896,
"step": 535
},
{
"epoch": 0.5476673427991886,
"grad_norm": 1.6894175077795812,
"learning_rate": 3.2204262410214273e-06,
"loss": 2.4556,
"step": 540
},
{
"epoch": 0.552738336713996,
"grad_norm": 1.6686417855987385,
"learning_rate": 3.1696340142282437e-06,
"loss": 2.5062,
"step": 545
},
{
"epoch": 0.5578093306288032,
"grad_norm": 1.7200856267540612,
"learning_rate": 3.119582616594238e-06,
"loss": 2.4878,
"step": 550
},
{
"epoch": 0.5628803245436106,
"grad_norm": 1.672252633477676,
"learning_rate": 3.0702623090778174e-06,
"loss": 2.5077,
"step": 555
},
{
"epoch": 0.5679513184584178,
"grad_norm": 1.7008466667698958,
"learning_rate": 3.021663466609246e-06,
"loss": 2.4837,
"step": 560
},
{
"epoch": 0.5730223123732252,
"grad_norm": 1.6805676799462346,
"learning_rate": 2.973776576921883e-06,
"loss": 2.5062,
"step": 565
},
{
"epoch": 0.5780933062880325,
"grad_norm": 1.6136103005628197,
"learning_rate": 2.9265922393937183e-06,
"loss": 2.5035,
"step": 570
},
{
"epoch": 0.5831643002028397,
"grad_norm": 1.6014078073339035,
"learning_rate": 2.880101163899116e-06,
"loss": 2.5101,
"step": 575
},
{
"epoch": 0.5882352941176471,
"grad_norm": 1.7220406203120746,
"learning_rate": 2.8342941696706994e-06,
"loss": 2.5217,
"step": 580
},
{
"epoch": 0.5933062880324543,
"grad_norm": 1.6605964063316545,
"learning_rate": 2.789162184171294e-06,
"loss": 2.4756,
"step": 585
},
{
"epoch": 0.5983772819472617,
"grad_norm": 1.6566249973518374,
"learning_rate": 2.7446962419758632e-06,
"loss": 2.4739,
"step": 590
},
{
"epoch": 0.603448275862069,
"grad_norm": 1.6340883136536262,
"learning_rate": 2.700887483663357e-06,
"loss": 2.4869,
"step": 595
},
{
"epoch": 0.6085192697768763,
"grad_norm": 1.6233109361058542,
"learning_rate": 2.657727154718401e-06,
"loss": 2.4487,
"step": 600
},
{
"epoch": 0.6085192697768763,
"eval_loss": 2.4831416606903076,
"eval_runtime": 80.984,
"eval_samples_per_second": 86.511,
"eval_steps_per_second": 0.679,
"step": 600
},
{
"epoch": 0.6135902636916836,
"grad_norm": 1.616769928055098,
"learning_rate": 2.615206604442756e-06,
"loss": 2.4638,
"step": 605
},
{
"epoch": 0.6186612576064908,
"grad_norm": 1.6396235170920117,
"learning_rate": 2.5733172848764733e-06,
"loss": 2.4891,
"step": 610
},
{
"epoch": 0.6237322515212982,
"grad_norm": 1.5936144163067276,
"learning_rate": 2.5320507497286705e-06,
"loss": 2.4902,
"step": 615
},
{
"epoch": 0.6288032454361054,
"grad_norm": 1.6679977682798468,
"learning_rate": 2.491398653317866e-06,
"loss": 2.4695,
"step": 620
},
{
"epoch": 0.6338742393509128,
"grad_norm": 1.7008178983911084,
"learning_rate": 2.4513527495217875e-06,
"loss": 2.4626,
"step": 625
},
{
"epoch": 0.6389452332657201,
"grad_norm": 1.610985443276998,
"learning_rate": 2.4119048907365937e-06,
"loss": 2.4934,
"step": 630
},
{
"epoch": 0.6440162271805274,
"grad_norm": 1.6323121910464156,
"learning_rate": 2.3730470268454385e-06,
"loss": 2.4819,
"step": 635
},
{
"epoch": 0.6490872210953347,
"grad_norm": 1.6525382291119861,
"learning_rate": 2.3347712041962997e-06,
"loss": 2.5046,
"step": 640
},
{
"epoch": 0.654158215010142,
"grad_norm": 1.6380351817927594,
"learning_rate": 2.297069564589013e-06,
"loss": 2.4864,
"step": 645
},
{
"epoch": 0.6592292089249493,
"grad_norm": 1.6579813340009797,
"learning_rate": 2.259934344271433e-06,
"loss": 2.4715,
"step": 650
},
{
"epoch": 0.6643002028397565,
"grad_norm": 1.7919239246160015,
"learning_rate": 2.22335787294466e-06,
"loss": 2.4972,
"step": 655
},
{
"epoch": 0.6693711967545639,
"grad_norm": 1.586961409961355,
"learning_rate": 2.18733257277726e-06,
"loss": 2.4787,
"step": 660
},
{
"epoch": 0.6744421906693712,
"grad_norm": 1.684301176230389,
"learning_rate": 2.1518509574284106e-06,
"loss": 2.4158,
"step": 665
},
{
"epoch": 0.6795131845841785,
"grad_norm": 1.6178388175493554,
"learning_rate": 2.123852145211829e-06,
"loss": 2.5152,
"step": 670
},
{
"epoch": 0.6845841784989858,
"grad_norm": 1.704137336957441,
"learning_rate": 2.089330585293108e-06,
"loss": 2.4807,
"step": 675
},
{
"epoch": 0.6896551724137931,
"grad_norm": 1.653288753563856,
"learning_rate": 2.055332226962747e-06,
"loss": 2.4781,
"step": 680
},
{
"epoch": 0.6947261663286004,
"grad_norm": 1.6910190620923418,
"learning_rate": 2.0218499227907136e-06,
"loss": 2.5114,
"step": 685
},
{
"epoch": 0.6997971602434077,
"grad_norm": 1.6297896630103186,
"learning_rate": 1.988876612270826e-06,
"loss": 2.4963,
"step": 690
},
{
"epoch": 0.704868154158215,
"grad_norm": 1.6254042637268307,
"learning_rate": 1.9564053208943578e-06,
"loss": 2.4651,
"step": 695
},
{
"epoch": 0.7099391480730223,
"grad_norm": 1.849820644961665,
"learning_rate": 1.924429159232111e-06,
"loss": 2.4625,
"step": 700
},
{
"epoch": 0.7150101419878296,
"grad_norm": 1.6947938784926828,
"learning_rate": 1.892941322024907e-06,
"loss": 2.4683,
"step": 705
},
{
"epoch": 0.7200811359026369,
"grad_norm": 1.6500218076608433,
"learning_rate": 1.861935087282421e-06,
"loss": 2.474,
"step": 710
},
{
"epoch": 0.7251521298174443,
"grad_norm": 1.5695461599237197,
"learning_rate": 1.8314038153902991e-06,
"loss": 2.4626,
"step": 715
},
{
"epoch": 0.7302231237322515,
"grad_norm": 1.661274439764298,
"learning_rate": 1.8013409482254947e-06,
"loss": 2.4901,
"step": 720
},
{
"epoch": 0.7352941176470589,
"grad_norm": 1.5971717624468098,
"learning_rate": 1.7717400082797614e-06,
"loss": 2.498,
"step": 725
},
{
"epoch": 0.7403651115618661,
"grad_norm": 1.6006841184664817,
"learning_rate": 1.7425945977912387e-06,
"loss": 2.5096,
"step": 730
},
{
"epoch": 0.7454361054766734,
"grad_norm": 1.8078007149616142,
"learning_rate": 1.7138983978840686e-06,
"loss": 2.4733,
"step": 735
},
{
"epoch": 0.7505070993914807,
"grad_norm": 1.6080637102108633,
"learning_rate": 1.685645167715982e-06,
"loss": 2.4645,
"step": 740
},
{
"epoch": 0.755578093306288,
"grad_norm": 1.6034092883417612,
"learning_rate": 1.6578287436337897e-06,
"loss": 2.4874,
"step": 745
},
{
"epoch": 0.7606490872210954,
"grad_norm": 1.6562691168973722,
"learning_rate": 1.6304430383367233e-06,
"loss": 2.5147,
"step": 750
},
{
"epoch": 0.7657200811359026,
"grad_norm": 1.631836734297837,
"learning_rate": 1.6034820400475576e-06,
"loss": 2.449,
"step": 755
},
{
"epoch": 0.77079107505071,
"grad_norm": 2.633902381426751,
"learning_rate": 1.5769398116914607e-06,
"loss": 2.4502,
"step": 760
},
{
"epoch": 0.7758620689655172,
"grad_norm": 1.6338196504524252,
"learning_rate": 1.550810490082507e-06,
"loss": 2.4375,
"step": 765
},
{
"epoch": 0.7809330628803245,
"grad_norm": 1.6881605246261733,
"learning_rate": 1.5250882851177956e-06,
"loss": 2.4623,
"step": 770
},
{
"epoch": 0.7860040567951319,
"grad_norm": 1.7430128340035491,
"learning_rate": 1.4997674789791142e-06,
"loss": 2.4592,
"step": 775
},
{
"epoch": 0.7910750507099391,
"grad_norm": 1.6974037503954427,
"learning_rate": 1.4748424253420905e-06,
"loss": 2.5001,
"step": 780
},
{
"epoch": 0.7961460446247465,
"grad_norm": 1.6057434981804433,
"learning_rate": 1.4503075485927704e-06,
"loss": 2.4603,
"step": 785
},
{
"epoch": 0.8012170385395537,
"grad_norm": 1.5564356238507298,
"learning_rate": 1.4261573430515669e-06,
"loss": 2.4357,
"step": 790
},
{
"epoch": 0.8062880324543611,
"grad_norm": 1.7042405076576008,
"learning_rate": 1.4023863722045201e-06,
"loss": 2.4747,
"step": 795
},
{
"epoch": 0.8113590263691683,
"grad_norm": 1.5640034942530554,
"learning_rate": 1.3789892679418134e-06,
"loss": 2.5324,
"step": 800
},
{
"epoch": 0.8113590263691683,
"eval_loss": 2.4689557552337646,
"eval_runtime": 81.0232,
"eval_samples_per_second": 86.469,
"eval_steps_per_second": 0.679,
"step": 800
},
{
"epoch": 0.8164300202839757,
"grad_norm": 1.7227060519078905,
"learning_rate": 1.3559607298034838e-06,
"loss": 2.4806,
"step": 805
},
{
"epoch": 0.821501014198783,
"grad_norm": 1.5855673393298833,
"learning_rate": 1.333295524232277e-06,
"loss": 2.4642,
"step": 810
},
{
"epoch": 0.8265720081135902,
"grad_norm": 1.8155636812941185,
"learning_rate": 1.310988483833583e-06,
"loss": 2.4746,
"step": 815
},
{
"epoch": 0.8316430020283976,
"grad_norm": 1.6824796691575312,
"learning_rate": 1.289034506642401e-06,
"loss": 2.5168,
"step": 820
},
{
"epoch": 0.8367139959432048,
"grad_norm": 1.6084122349859742,
"learning_rate": 1.2674285553972776e-06,
"loss": 2.4112,
"step": 825
},
{
"epoch": 0.8417849898580122,
"grad_norm": 1.6807591569306923,
"learning_rate": 1.2461656568211607e-06,
"loss": 2.4555,
"step": 830
},
{
"epoch": 0.8468559837728195,
"grad_norm": 1.64520194930749,
"learning_rate": 1.2252409009091154e-06,
"loss": 2.5222,
"step": 835
},
{
"epoch": 0.8519269776876268,
"grad_norm": 1.642941398726877,
"learning_rate": 1.2046494402228485e-06,
"loss": 2.4607,
"step": 840
},
{
"epoch": 0.8569979716024341,
"grad_norm": 1.6323907187692908,
"learning_rate": 1.1843864891919843e-06,
"loss": 2.4724,
"step": 845
},
{
"epoch": 0.8620689655172413,
"grad_norm": 1.6489728444762863,
"learning_rate": 1.1644473234220412e-06,
"loss": 2.483,
"step": 850
},
{
"epoch": 0.8671399594320487,
"grad_norm": 1.5735584816022383,
"learning_rate": 1.1448272790090529e-06,
"loss": 2.4423,
"step": 855
},
{
"epoch": 0.8722109533468559,
"grad_norm": 1.6290164674794758,
"learning_rate": 1.1255217518607806e-06,
"loss": 2.4745,
"step": 860
},
{
"epoch": 0.8772819472616633,
"grad_norm": 1.9631129344699565,
"learning_rate": 1.1065261970244678e-06,
"loss": 2.4595,
"step": 865
},
{
"epoch": 0.8823529411764706,
"grad_norm": 1.8876833985138877,
"learning_rate": 1.0878361280210782e-06,
"loss": 2.4761,
"step": 870
},
{
"epoch": 0.8874239350912779,
"grad_norm": 1.7449962901668679,
"learning_rate": 1.0694471161859696e-06,
"loss": 2.4726,
"step": 875
},
{
"epoch": 0.8924949290060852,
"grad_norm": 1.6608657901001447,
"learning_rate": 1.051354790015952e-06,
"loss": 2.4817,
"step": 880
},
{
"epoch": 0.8975659229208925,
"grad_norm": 1.6370419920913908,
"learning_rate": 1.0335548345226733e-06,
"loss": 2.4861,
"step": 885
},
{
"epoch": 0.9026369168356998,
"grad_norm": 1.6266725844295284,
"learning_rate": 1.016042990592287e-06,
"loss": 2.4437,
"step": 890
},
{
"epoch": 0.907707910750507,
"grad_norm": 1.5909779389607082,
"learning_rate": 9.988150543513476e-07,
"loss": 2.4605,
"step": 895
},
{
"epoch": 0.9127789046653144,
"grad_norm": 1.5796802186393568,
"learning_rate": 9.818668765388872e-07,
"loss": 2.4863,
"step": 900
},
{
"epoch": 0.9178498985801217,
"grad_norm": 1.5779871460684796,
"learning_rate": 9.651943618846152e-07,
"loss": 2.4514,
"step": 905
},
{
"epoch": 0.922920892494929,
"grad_norm": 1.605102383763968,
"learning_rate": 9.487934684931995e-07,
"loss": 2.474,
"step": 910
},
{
"epoch": 0.9279918864097363,
"grad_norm": 1.6069103870683263,
"learning_rate": 9.326602072345758e-07,
"loss": 2.4828,
"step": 915
},
{
"epoch": 0.9330628803245437,
"grad_norm": 1.6236038441464034,
"learning_rate": 9.167906411402357e-07,
"loss": 2.4501,
"step": 920
},
{
"epoch": 0.9381338742393509,
"grad_norm": 1.6140284100171378,
"learning_rate": 9.011808848054445e-07,
"loss": 2.4441,
"step": 925
},
{
"epoch": 0.9432048681541582,
"grad_norm": 1.9823289784825078,
"learning_rate": 8.858271037973411e-07,
"loss": 2.4834,
"step": 930
},
{
"epoch": 0.9482758620689655,
"grad_norm": 1.7094985628575186,
"learning_rate": 8.707255140688767e-07,
"loss": 2.4428,
"step": 935
},
{
"epoch": 0.9533468559837728,
"grad_norm": 1.5851821971427773,
"learning_rate": 8.558723813785198e-07,
"loss": 2.4459,
"step": 940
},
{
"epoch": 0.9584178498985801,
"grad_norm": 1.8489283203955083,
"learning_rate": 8.412640207157327e-07,
"loss": 2.4671,
"step": 945
},
{
"epoch": 0.9634888438133874,
"grad_norm": 1.565327828926634,
"learning_rate": 8.268967957320976e-07,
"loss": 2.4762,
"step": 950
},
{
"epoch": 0.9685598377281948,
"grad_norm": 1.5753092524917698,
"learning_rate": 8.127671181781262e-07,
"loss": 2.487,
"step": 955
},
{
"epoch": 0.973630831643002,
"grad_norm": 1.5627741498336793,
"learning_rate": 7.988714473456279e-07,
"loss": 2.4899,
"step": 960
},
{
"epoch": 0.9787018255578094,
"grad_norm": 1.7322054425536324,
"learning_rate": 7.852062895156654e-07,
"loss": 2.4328,
"step": 965
},
{
"epoch": 0.9837728194726166,
"grad_norm": 1.5912533141539165,
"learning_rate": 7.717681974119764e-07,
"loss": 2.4887,
"step": 970
},
{
"epoch": 0.9888438133874239,
"grad_norm": 1.7127177872013957,
"learning_rate": 7.585537696598922e-07,
"loss": 2.4414,
"step": 975
},
{
"epoch": 0.9939148073022313,
"grad_norm": 1.6239111267541033,
"learning_rate": 7.455596502506312e-07,
"loss": 2.4962,
"step": 980
},
{
"epoch": 0.9989858012170385,
"grad_norm": 1.6117561424503084,
"learning_rate": 7.327825280109957e-07,
"loss": 2.4738,
"step": 985
},
{
"epoch": 1.0040567951318458,
"grad_norm": 1.9019039739296713,
"learning_rate": 7.20219136078357e-07,
"loss": 2.27,
"step": 990
},
{
"epoch": 1.0091277890466532,
"grad_norm": 1.7075178009820928,
"learning_rate": 7.078662513809528e-07,
"loss": 2.3072,
"step": 995
},
{
"epoch": 1.0141987829614605,
"grad_norm": 1.7844249258995124,
"learning_rate": 6.957206941233838e-07,
"loss": 2.265,
"step": 1000
},
{
"epoch": 1.0141987829614605,
"eval_loss": 2.473280668258667,
"eval_runtime": 81.0085,
"eval_samples_per_second": 86.485,
"eval_steps_per_second": 0.679,
"step": 1000
},
{
"epoch": 1.0192697768762677,
"grad_norm": 1.833316481131949,
"learning_rate": 6.837793272773345e-07,
"loss": 2.3069,
"step": 1005
},
{
"epoch": 1.024340770791075,
"grad_norm": 1.7388775994426842,
"learning_rate": 6.720390560774066e-07,
"loss": 2.266,
"step": 1010
},
{
"epoch": 1.0294117647058822,
"grad_norm": 1.6270190329782648,
"learning_rate": 6.604968275220875e-07,
"loss": 2.2664,
"step": 1015
},
{
"epoch": 1.0344827586206897,
"grad_norm": 1.7956207149367391,
"learning_rate": 6.491496298797458e-07,
"loss": 2.2394,
"step": 1020
},
{
"epoch": 1.039553752535497,
"grad_norm": 1.6994135825189252,
"learning_rate": 6.379944921996764e-07,
"loss": 2.2727,
"step": 1025
},
{
"epoch": 1.0446247464503042,
"grad_norm": 1.677197538792328,
"learning_rate": 6.270284838280882e-07,
"loss": 2.2072,
"step": 1030
},
{
"epoch": 1.0496957403651115,
"grad_norm": 1.719327046611783,
"learning_rate": 6.162487139290532e-07,
"loss": 2.3021,
"step": 1035
},
{
"epoch": 1.054766734279919,
"grad_norm": 1.7292340128968464,
"learning_rate": 6.056523310103172e-07,
"loss": 2.2737,
"step": 1040
},
{
"epoch": 1.0598377281947262,
"grad_norm": 1.7428974260955565,
"learning_rate": 5.95236522453988e-07,
"loss": 2.2556,
"step": 1045
},
{
"epoch": 1.0649087221095335,
"grad_norm": 1.694959472171586,
"learning_rate": 5.849985140519998e-07,
"loss": 2.2992,
"step": 1050
},
{
"epoch": 1.0699797160243407,
"grad_norm": 1.7439692178448947,
"learning_rate": 5.749355695463754e-07,
"loss": 2.2557,
"step": 1055
},
{
"epoch": 1.075050709939148,
"grad_norm": 1.7558636029085997,
"learning_rate": 5.650449901741813e-07,
"loss": 2.2474,
"step": 1060
},
{
"epoch": 1.0801217038539555,
"grad_norm": 1.785367595963534,
"learning_rate": 5.553241142171985e-07,
"loss": 2.267,
"step": 1065
},
{
"epoch": 1.0851926977687627,
"grad_norm": 1.7537584511707027,
"learning_rate": 5.45770316556211e-07,
"loss": 2.2823,
"step": 1070
},
{
"epoch": 1.09026369168357,
"grad_norm": 1.6825060417395732,
"learning_rate": 5.363810082299148e-07,
"loss": 2.2525,
"step": 1075
},
{
"epoch": 1.0953346855983772,
"grad_norm": 1.7339475460772475,
"learning_rate": 5.27153635998387e-07,
"loss": 2.3006,
"step": 1080
},
{
"epoch": 1.1004056795131847,
"grad_norm": 1.6977028436147512,
"learning_rate": 5.180856819110773e-07,
"loss": 2.2862,
"step": 1085
},
{
"epoch": 1.105476673427992,
"grad_norm": 1.7119437312783958,
"learning_rate": 5.091746628792904e-07,
"loss": 2.243,
"step": 1090
},
{
"epoch": 1.1105476673427992,
"grad_norm": 1.7918277133466605,
"learning_rate": 5.004181302531108e-07,
"loss": 2.2653,
"step": 1095
},
{
"epoch": 1.1156186612576064,
"grad_norm": 1.7198038075584687,
"learning_rate": 4.918136694027396e-07,
"loss": 2.2741,
"step": 1100
},
{
"epoch": 1.1206896551724137,
"grad_norm": 1.7122122501534425,
"learning_rate": 4.833588993041994e-07,
"loss": 2.2757,
"step": 1105
},
{
"epoch": 1.1257606490872212,
"grad_norm": 1.6934117050919777,
"learning_rate": 4.750514721293719e-07,
"loss": 2.2484,
"step": 1110
},
{
"epoch": 1.1308316430020284,
"grad_norm": 1.8096755323665539,
"learning_rate": 4.6688907284032994e-07,
"loss": 2.2329,
"step": 1115
},
{
"epoch": 1.1359026369168357,
"grad_norm": 1.7732841203420067,
"learning_rate": 4.588694187879258e-07,
"loss": 2.2636,
"step": 1120
},
{
"epoch": 1.140973630831643,
"grad_norm": 1.70514589311023,
"learning_rate": 4.5099025931459913e-07,
"loss": 2.2778,
"step": 1125
},
{
"epoch": 1.1460446247464504,
"grad_norm": 1.7135354540773058,
"learning_rate": 4.4324937536136735e-07,
"loss": 2.2905,
"step": 1130
},
{
"epoch": 1.1511156186612577,
"grad_norm": 1.6901713268949445,
"learning_rate": 4.3564457907896125e-07,
"loss": 2.302,
"step": 1135
},
{
"epoch": 1.156186612576065,
"grad_norm": 1.7350424488382163,
"learning_rate": 4.281737134430704e-07,
"loss": 2.2441,
"step": 1140
},
{
"epoch": 1.1612576064908722,
"grad_norm": 1.7433418190612922,
"learning_rate": 4.208346518736604e-07,
"loss": 2.2639,
"step": 1145
},
{
"epoch": 1.1663286004056794,
"grad_norm": 1.7278183208713844,
"learning_rate": 4.136252978583281e-07,
"loss": 2.272,
"step": 1150
},
{
"epoch": 1.171399594320487,
"grad_norm": 1.7049575091462312,
"learning_rate": 4.0654358457965706e-07,
"loss": 2.2822,
"step": 1155
},
{
"epoch": 1.1764705882352942,
"grad_norm": 1.7614119208994081,
"learning_rate": 3.995874745465392e-07,
"loss": 2.2882,
"step": 1160
},
{
"epoch": 1.1815415821501014,
"grad_norm": 1.7783667378053016,
"learning_rate": 3.927549592294267e-07,
"loss": 2.2779,
"step": 1165
},
{
"epoch": 1.1866125760649087,
"grad_norm": 1.7857803604726208,
"learning_rate": 3.8604405869947905e-07,
"loss": 2.2504,
"step": 1170
},
{
"epoch": 1.1916835699797161,
"grad_norm": 1.7894737586957659,
"learning_rate": 3.794528212715714e-07,
"loss": 2.2896,
"step": 1175
},
{
"epoch": 1.1967545638945234,
"grad_norm": 1.7605294591830605,
"learning_rate": 3.7297932315112855e-07,
"loss": 2.2803,
"step": 1180
},
{
"epoch": 1.2018255578093306,
"grad_norm": 1.7037189312181982,
"learning_rate": 3.6662166808475126e-07,
"loss": 2.2595,
"step": 1185
},
{
"epoch": 1.206896551724138,
"grad_norm": 1.802568691083643,
"learning_rate": 3.6037798701460037e-07,
"loss": 2.3097,
"step": 1190
},
{
"epoch": 1.2119675456389452,
"grad_norm": 1.7227242510965723,
"learning_rate": 3.5424643773650545e-07,
"loss": 2.2473,
"step": 1195
},
{
"epoch": 1.2170385395537526,
"grad_norm": 1.7126735182979083,
"learning_rate": 3.482252045617637e-07,
"loss": 2.3002,
"step": 1200
},
{
"epoch": 1.2170385395537526,
"eval_loss": 2.4735846519470215,
"eval_runtime": 81.0924,
"eval_samples_per_second": 86.395,
"eval_steps_per_second": 0.678,
"step": 1200
},
{
"epoch": 1.2221095334685599,
"grad_norm": 1.7418672417675343,
"learning_rate": 3.423124979825969e-07,
"loss": 2.2259,
"step": 1205
},
{
"epoch": 1.2271805273833671,
"grad_norm": 1.7536106052680211,
"learning_rate": 3.365065543412324e-07,
"loss": 2.2625,
"step": 1210
},
{
"epoch": 1.2322515212981744,
"grad_norm": 1.6738354256007202,
"learning_rate": 3.3080563550257607e-07,
"loss": 2.2762,
"step": 1215
},
{
"epoch": 1.2373225152129819,
"grad_norm": 1.7304199756653005,
"learning_rate": 3.2520802853044393e-07,
"loss": 2.2864,
"step": 1220
},
{
"epoch": 1.2423935091277891,
"grad_norm": 1.761088776037141,
"learning_rate": 3.197120453673215e-07,
"loss": 2.2665,
"step": 1225
},
{
"epoch": 1.2474645030425964,
"grad_norm": 1.7101358055188194,
"learning_rate": 3.143160225176168e-07,
"loss": 2.2775,
"step": 1230
},
{
"epoch": 1.2525354969574036,
"grad_norm": 1.7571854143932952,
"learning_rate": 3.0901832073437713e-07,
"loss": 2.2979,
"step": 1235
},
{
"epoch": 1.2576064908722109,
"grad_norm": 1.7216743809437804,
"learning_rate": 3.0381732470943653e-07,
"loss": 2.3094,
"step": 1240
},
{
"epoch": 1.2626774847870181,
"grad_norm": 1.6935950803242086,
"learning_rate": 2.9871144276696387e-07,
"loss": 2.2707,
"step": 1245
},
{
"epoch": 1.2677484787018256,
"grad_norm": 1.7158452472154153,
"learning_rate": 2.9369910656037903e-07,
"loss": 2.2532,
"step": 1250
},
{
"epoch": 1.2728194726166329,
"grad_norm": 1.7587458046328184,
"learning_rate": 2.8877877077260676e-07,
"loss": 2.2968,
"step": 1255
},
{
"epoch": 1.2778904665314401,
"grad_norm": 1.7348605445713965,
"learning_rate": 2.839489128196406e-07,
"loss": 2.2596,
"step": 1260
},
{
"epoch": 1.2829614604462476,
"grad_norm": 1.6962275978449755,
"learning_rate": 2.7920803255737635e-07,
"loss": 2.2579,
"step": 1265
},
{
"epoch": 1.2880324543610548,
"grad_norm": 1.7562952815143784,
"learning_rate": 2.7455465199170286e-07,
"loss": 2.2518,
"step": 1270
},
{
"epoch": 1.293103448275862,
"grad_norm": 1.6974150722131578,
"learning_rate": 2.699873149917968e-07,
"loss": 2.2504,
"step": 1275
},
{
"epoch": 1.2981744421906694,
"grad_norm": 1.7036916845012207,
"learning_rate": 2.655045870066172e-07,
"loss": 2.2861,
"step": 1280
},
{
"epoch": 1.3032454361054766,
"grad_norm": 1.7486208966066876,
"learning_rate": 2.6110505478454324e-07,
"loss": 2.2467,
"step": 1285
},
{
"epoch": 1.3083164300202839,
"grad_norm": 1.712258524308874,
"learning_rate": 2.5678732609615423e-07,
"loss": 2.2515,
"step": 1290
},
{
"epoch": 1.3133874239350913,
"grad_norm": 1.7341023622582277,
"learning_rate": 2.525500294600939e-07,
"loss": 2.2757,
"step": 1295
},
{
"epoch": 1.3184584178498986,
"grad_norm": 1.889990239211246,
"learning_rate": 2.4839181387201796e-07,
"loss": 2.2791,
"step": 1300
},
{
"epoch": 1.3235294117647058,
"grad_norm": 1.798861207791198,
"learning_rate": 2.4431134853656976e-07,
"loss": 2.2817,
"step": 1305
},
{
"epoch": 1.3286004056795133,
"grad_norm": 1.7472239831698717,
"learning_rate": 2.4030732260238086e-07,
"loss": 2.2521,
"step": 1310
},
{
"epoch": 1.3336713995943206,
"grad_norm": 1.782522588407923,
"learning_rate": 2.3637844490004408e-07,
"loss": 2.2316,
"step": 1315
},
{
"epoch": 1.3387423935091278,
"grad_norm": 1.6996053792107884,
"learning_rate": 2.325234436830538e-07,
"loss": 2.2734,
"step": 1320
},
{
"epoch": 1.343813387423935,
"grad_norm": 1.7994805518930097,
"learning_rate": 2.2874106637166403e-07,
"loss": 2.2484,
"step": 1325
},
{
"epoch": 1.3488843813387423,
"grad_norm": 1.7489331509437775,
"learning_rate": 2.2503007929965749e-07,
"loss": 2.28,
"step": 1330
},
{
"epoch": 1.3539553752535496,
"grad_norm": 1.7160678233869127,
"learning_rate": 2.2138926746397777e-07,
"loss": 2.2565,
"step": 1335
},
{
"epoch": 1.359026369168357,
"grad_norm": 1.814687918697313,
"learning_rate": 2.178174342772177e-07,
"loss": 2.2517,
"step": 1340
},
{
"epoch": 1.3640973630831643,
"grad_norm": 1.6987256946879317,
"learning_rate": 2.143134013229167e-07,
"loss": 2.2672,
"step": 1345
},
{
"epoch": 1.3691683569979716,
"grad_norm": 1.7371785897491874,
"learning_rate": 2.1087600811366032e-07,
"loss": 2.2628,
"step": 1350
},
{
"epoch": 1.3742393509127788,
"grad_norm": 1.745926263655127,
"learning_rate": 2.075041118519355e-07,
"loss": 2.2532,
"step": 1355
},
{
"epoch": 1.3793103448275863,
"grad_norm": 1.700613383279488,
"learning_rate": 2.0419658719373504e-07,
"loss": 2.2617,
"step": 1360
},
{
"epoch": 1.3843813387423936,
"grad_norm": 1.691103098158946,
"learning_rate": 2.009523260148652e-07,
"loss": 2.2391,
"step": 1365
},
{
"epoch": 1.3894523326572008,
"grad_norm": 1.6917956046319294,
"learning_rate": 1.977702371799498e-07,
"loss": 2.2973,
"step": 1370
},
{
"epoch": 1.394523326572008,
"grad_norm": 1.7504566996070137,
"learning_rate": 1.946492463140869e-07,
"loss": 2.3102,
"step": 1375
},
{
"epoch": 1.3995943204868153,
"grad_norm": 1.838843879022522,
"learning_rate": 1.9158829557714903e-07,
"loss": 2.2819,
"step": 1380
},
{
"epoch": 1.4046653144016228,
"grad_norm": 1.7034157869918263,
"learning_rate": 1.8858634344068625e-07,
"loss": 2.2463,
"step": 1385
},
{
"epoch": 1.40973630831643,
"grad_norm": 1.7726664220307162,
"learning_rate": 1.8564236446742146e-07,
"loss": 2.2458,
"step": 1390
},
{
"epoch": 1.4148073022312373,
"grad_norm": 1.7584441947795304,
"learning_rate": 1.8275534909329853e-07,
"loss": 2.2663,
"step": 1395
},
{
"epoch": 1.4198782961460445,
"grad_norm": 1.7548926938859895,
"learning_rate": 1.7992430341207304e-07,
"loss": 2.29,
"step": 1400
},
{
"epoch": 1.4198782961460445,
"eval_loss": 2.4734323024749756,
"eval_runtime": 81.002,
"eval_samples_per_second": 86.492,
"eval_steps_per_second": 0.679,
"step": 1400
},
{
"epoch": 1.424949290060852,
"grad_norm": 1.691411914276979,
"learning_rate": 1.7714824896240595e-07,
"loss": 2.2565,
"step": 1405
},
{
"epoch": 1.4300202839756593,
"grad_norm": 1.7523279327159709,
"learning_rate": 1.7442622251745125e-07,
"loss": 2.2582,
"step": 1410
},
{
"epoch": 1.4350912778904665,
"grad_norm": 1.6844227513504313,
"learning_rate": 1.717572758768978e-07,
"loss": 2.2416,
"step": 1415
},
{
"epoch": 1.4401622718052738,
"grad_norm": 2.2030630647830245,
"learning_rate": 1.6914047566145662e-07,
"loss": 2.2289,
"step": 1420
},
{
"epoch": 1.445233265720081,
"grad_norm": 1.7795541841017355,
"learning_rate": 1.6657490310975468e-07,
"loss": 2.2841,
"step": 1425
},
{
"epoch": 1.4503042596348885,
"grad_norm": 1.8134633165357201,
"learning_rate": 1.6405965387762636e-07,
"loss": 2.2542,
"step": 1430
},
{
"epoch": 1.4553752535496958,
"grad_norm": 1.7604092301048675,
"learning_rate": 1.615938378397648e-07,
"loss": 2.2493,
"step": 1435
},
{
"epoch": 1.460446247464503,
"grad_norm": 1.8595724042593027,
"learning_rate": 1.5917657889372315e-07,
"loss": 2.2484,
"step": 1440
},
{
"epoch": 1.4655172413793103,
"grad_norm": 1.7081713686615858,
"learning_rate": 1.568070147662311e-07,
"loss": 2.2744,
"step": 1445
},
{
"epoch": 1.4705882352941178,
"grad_norm": 14.41030902656843,
"learning_rate": 1.5448429682181186e-07,
"loss": 2.2609,
"step": 1450
},
{
"epoch": 1.475659229208925,
"grad_norm": 1.7702111899429174,
"learning_rate": 1.5220758987367309e-07,
"loss": 2.2955,
"step": 1455
},
{
"epoch": 1.4807302231237323,
"grad_norm": 1.7932941724173908,
"learning_rate": 1.4997607199684964e-07,
"loss": 2.2478,
"step": 1460
},
{
"epoch": 1.4858012170385395,
"grad_norm": 1.7327449633169845,
"learning_rate": 1.477889343435765e-07,
"loss": 2.2713,
"step": 1465
},
{
"epoch": 1.4908722109533468,
"grad_norm": 1.7047486187689578,
"learning_rate": 1.456453809608691e-07,
"loss": 2.2586,
"step": 1470
},
{
"epoch": 1.495943204868154,
"grad_norm": 1.7085975289965103,
"learning_rate": 1.4354462861028889e-07,
"loss": 2.2602,
"step": 1475
},
{
"epoch": 1.5010141987829615,
"grad_norm": 1.7708851051604204,
"learning_rate": 1.414859065898731e-07,
"loss": 2.2913,
"step": 1480
},
{
"epoch": 1.5060851926977687,
"grad_norm": 1.6849008491575197,
"learning_rate": 1.3946845655820588e-07,
"loss": 2.2129,
"step": 1485
},
{
"epoch": 1.5111561866125762,
"grad_norm": 1.6770410018579935,
"learning_rate": 1.374915323606102e-07,
"loss": 2.2641,
"step": 1490
},
{
"epoch": 1.5162271805273835,
"grad_norm": 1.7333889728562109,
"learning_rate": 1.3555439985743863e-07,
"loss": 2.3096,
"step": 1495
},
{
"epoch": 1.5212981744421907,
"grad_norm": 1.7381149429179856,
"learning_rate": 1.3365633675444236e-07,
"loss": 2.2449,
"step": 1500
},
{
"epoch": 1.526369168356998,
"grad_norm": 1.7508604376509869,
"learning_rate": 1.317966324351968e-07,
"loss": 2.3006,
"step": 1505
},
{
"epoch": 1.5314401622718052,
"grad_norm": 1.731173156378831,
"learning_rate": 1.2997458779556342e-07,
"loss": 2.2721,
"step": 1510
},
{
"epoch": 1.5365111561866125,
"grad_norm": 1.7880722742651989,
"learning_rate": 1.2818951508016706e-07,
"loss": 2.2839,
"step": 1515
},
{
"epoch": 1.5415821501014197,
"grad_norm": 1.766456825336907,
"learning_rate": 1.264407377208682e-07,
"loss": 2.2542,
"step": 1520
},
{
"epoch": 1.5466531440162272,
"grad_norm": 1.793293076179441,
"learning_rate": 1.2472759017720967e-07,
"loss": 2.2345,
"step": 1525
},
{
"epoch": 1.5517241379310345,
"grad_norm": 1.7255231286858488,
"learning_rate": 1.2304941777881816e-07,
"loss": 2.2587,
"step": 1530
},
{
"epoch": 1.556795131845842,
"grad_norm": 1.7107497208562314,
"learning_rate": 1.214055765697399e-07,
"loss": 2.2587,
"step": 1535
},
{
"epoch": 1.5618661257606492,
"grad_norm": 1.7448234273922532,
"learning_rate": 1.197954331546911e-07,
"loss": 2.2493,
"step": 1540
},
{
"epoch": 1.5669371196754565,
"grad_norm": 1.713933005233849,
"learning_rate": 1.1821836454720342e-07,
"loss": 2.3028,
"step": 1545
},
{
"epoch": 1.5720081135902637,
"grad_norm": 1.8430768650069782,
"learning_rate": 1.1667375801964492e-07,
"loss": 2.2595,
"step": 1550
},
{
"epoch": 1.577079107505071,
"grad_norm": 1.7903141506679578,
"learning_rate": 1.15161010955097e-07,
"loss": 2.2555,
"step": 1555
},
{
"epoch": 1.5821501014198782,
"grad_norm": 1.810165731715535,
"learning_rate": 1.136795307010685e-07,
"loss": 2.2728,
"step": 1560
},
{
"epoch": 1.5872210953346855,
"grad_norm": 1.7357274884238136,
"learning_rate": 1.1222873442502753e-07,
"loss": 2.2741,
"step": 1565
},
{
"epoch": 1.592292089249493,
"grad_norm": 1.7545984913046129,
"learning_rate": 1.108080489717326e-07,
"loss": 2.2609,
"step": 1570
},
{
"epoch": 1.5973630831643002,
"grad_norm": 1.8639925458297812,
"learning_rate": 1.0941691072234387e-07,
"loss": 2.2349,
"step": 1575
},
{
"epoch": 1.6024340770791075,
"grad_norm": 1.7125402909483072,
"learning_rate": 1.080547654552963e-07,
"loss": 2.2929,
"step": 1580
},
{
"epoch": 1.607505070993915,
"grad_norm": 1.7300627575439524,
"learning_rate": 1.0672106820891631e-07,
"loss": 2.2823,
"step": 1585
},
{
"epoch": 1.6125760649087222,
"grad_norm": 1.7190554348875562,
"learning_rate": 1.0541528314576339e-07,
"loss": 2.2708,
"step": 1590
},
{
"epoch": 1.6176470588235294,
"grad_norm": 1.724918915538896,
"learning_rate": 1.04136883418679e-07,
"loss": 2.2491,
"step": 1595
},
{
"epoch": 1.6227180527383367,
"grad_norm": 1.7342048226287368,
"learning_rate": 1.0288535103852444e-07,
"loss": 2.2566,
"step": 1600
},
{
"epoch": 1.6227180527383367,
"eval_loss": 2.472487688064575,
"eval_runtime": 81.0795,
"eval_samples_per_second": 86.409,
"eval_steps_per_second": 0.678,
"step": 1600
},
{
"epoch": 1.627789046653144,
"grad_norm": 1.752725508386252,
"learning_rate": 1.0166017674359012e-07,
"loss": 2.2115,
"step": 1605
},
{
"epoch": 1.6328600405679512,
"grad_norm": 1.7053034674622713,
"learning_rate": 1.0046085987065856e-07,
"loss": 2.2349,
"step": 1610
},
{
"epoch": 1.6379310344827587,
"grad_norm": 1.6910767224745546,
"learning_rate": 9.928690822770361e-08,
"loss": 2.2661,
"step": 1615
},
{
"epoch": 1.643002028397566,
"grad_norm": 1.9415101732879068,
"learning_rate": 9.81378379682085e-08,
"loss": 2.2355,
"step": 1620
},
{
"epoch": 1.6480730223123732,
"grad_norm": 1.7692640477521646,
"learning_rate": 9.70131734670856e-08,
"loss": 2.2605,
"step": 1625
},
{
"epoch": 1.6531440162271807,
"grad_norm": 1.7825871200246013,
"learning_rate": 9.59124471981808e-08,
"loss": 2.2842,
"step": 1630
},
{
"epoch": 1.658215010141988,
"grad_norm": 1.805395258521555,
"learning_rate": 9.483519961334607e-08,
"loss": 2.2543,
"step": 1635
},
{
"epoch": 1.6632860040567952,
"grad_norm": 1.7151309029731219,
"learning_rate": 9.378097902306157e-08,
"loss": 2.2507,
"step": 1640
},
{
"epoch": 1.6683569979716024,
"grad_norm": 1.7662462146082336,
"learning_rate": 9.274934147859458e-08,
"loss": 2.2822,
"step": 1645
},
{
"epoch": 1.6734279918864097,
"grad_norm": 1.7065430440445857,
"learning_rate": 9.173985065567343e-08,
"loss": 2.2727,
"step": 1650
},
{
"epoch": 1.678498985801217,
"grad_norm": 1.8167004072102202,
"learning_rate": 9.075207773966592e-08,
"loss": 2.2582,
"step": 1655
},
{
"epoch": 1.6835699797160242,
"grad_norm": 1.7276973068156511,
"learning_rate": 8.978560131224021e-08,
"loss": 2.2451,
"step": 1660
},
{
"epoch": 1.6886409736308317,
"grad_norm": 1.7787413203893692,
"learning_rate": 8.88400072394981e-08,
"loss": 2.2421,
"step": 1665
},
{
"epoch": 1.693711967545639,
"grad_norm": 0.8868153668800921,
"learning_rate": 8.791488856155857e-08,
"loss": 2.2354,
"step": 1670
},
{
"epoch": 1.6987829614604464,
"grad_norm": 1.6998265742091707,
"learning_rate": 8.700984538358205e-08,
"loss": 2.264,
"step": 1675
},
{
"epoch": 1.7038539553752536,
"grad_norm": 1.7045446815412617,
"learning_rate": 8.612448476821393e-08,
"loss": 2.2775,
"step": 1680
},
{
"epoch": 1.708924949290061,
"grad_norm": 1.7898247009022359,
"learning_rate": 8.525842062943714e-08,
"loss": 2.2733,
"step": 1685
},
{
"epoch": 1.7139959432048681,
"grad_norm": 1.7604334600933766,
"learning_rate": 8.441127362781345e-08,
"loss": 2.2704,
"step": 1690
},
{
"epoch": 1.7190669371196754,
"grad_norm": 1.8108867949678853,
"learning_rate": 8.358267106710315e-08,
"loss": 2.2626,
"step": 1695
},
{
"epoch": 1.7241379310344827,
"grad_norm": 1.6881452920332736,
"learning_rate": 8.277224679224312e-08,
"loss": 2.2694,
"step": 1700
},
{
"epoch": 1.72920892494929,
"grad_norm": 1.7530216839199022,
"learning_rate": 8.197964108867328e-08,
"loss": 2.2622,
"step": 1705
},
{
"epoch": 1.7342799188640974,
"grad_norm": 1.7278497657123897,
"learning_rate": 8.12045005829916e-08,
"loss": 2.2471,
"step": 1710
},
{
"epoch": 1.7393509127789046,
"grad_norm": 1.8213327178561642,
"learning_rate": 8.044647814492792e-08,
"loss": 2.2313,
"step": 1715
},
{
"epoch": 1.744421906693712,
"grad_norm": 1.8304362576609268,
"learning_rate": 7.970523279061717e-08,
"loss": 2.2738,
"step": 1720
},
{
"epoch": 1.7494929006085194,
"grad_norm": 1.7718300765439339,
"learning_rate": 7.898042958716228e-08,
"loss": 2.2308,
"step": 1725
},
{
"epoch": 1.7545638945233266,
"grad_norm": 1.7305535723288619,
"learning_rate": 7.827173955846786e-08,
"loss": 2.2513,
"step": 1730
},
{
"epoch": 1.7596348884381339,
"grad_norm": 1.7402125464421778,
"learning_rate": 7.757883959233495e-08,
"loss": 2.2429,
"step": 1735
},
{
"epoch": 1.7647058823529411,
"grad_norm": 1.8175975710441392,
"learning_rate": 7.690141234879847e-08,
"loss": 2.288,
"step": 1740
},
{
"epoch": 1.7697768762677484,
"grad_norm": 1.851991292226803,
"learning_rate": 7.623914616969753e-08,
"loss": 2.2644,
"step": 1745
},
{
"epoch": 1.7748478701825556,
"grad_norm": 1.6602366231900278,
"learning_rate": 7.559173498946088e-08,
"loss": 2.2733,
"step": 1750
},
{
"epoch": 1.779918864097363,
"grad_norm": 1.7034994512549433,
"learning_rate": 7.495887824709769e-08,
"loss": 2.2674,
"step": 1755
},
{
"epoch": 1.7849898580121704,
"grad_norm": 1.7102833212058115,
"learning_rate": 7.434028079937624e-08,
"loss": 2.2752,
"step": 1760
},
{
"epoch": 1.7900608519269778,
"grad_norm": 2.1016603731428067,
"learning_rate": 7.373565283518085e-08,
"loss": 2.2726,
"step": 1765
},
{
"epoch": 1.795131845841785,
"grad_norm": 1.7876491597075783,
"learning_rate": 7.314470979103019e-08,
"loss": 2.2188,
"step": 1770
},
{
"epoch": 1.8002028397565923,
"grad_norm": 1.7984832581935817,
"learning_rate": 7.256717226774701e-08,
"loss": 2.2772,
"step": 1775
},
{
"epoch": 1.8052738336713996,
"grad_norm": 1.7621637378160073,
"learning_rate": 7.200276594826329e-08,
"loss": 2.2466,
"step": 1780
},
{
"epoch": 1.8103448275862069,
"grad_norm": 1.7255493399444854,
"learning_rate": 7.145122151655066e-08,
"loss": 2.2633,
"step": 1785
},
{
"epoch": 1.815415821501014,
"grad_norm": 1.7774418294615342,
"learning_rate": 7.101906869364121e-08,
"loss": 2.2966,
"step": 1790
},
{
"epoch": 1.8204868154158214,
"grad_norm": 1.7397631305330485,
"learning_rate": 7.049001264123894e-08,
"loss": 2.2644,
"step": 1795
},
{
"epoch": 1.8255578093306288,
"grad_norm": 1.7641738767791946,
"learning_rate": 6.997309032084255e-08,
"loss": 2.3052,
"step": 1800
},
{
"epoch": 1.8255578093306288,
"eval_loss": 2.4720866680145264,
"eval_runtime": 81.0596,
"eval_samples_per_second": 86.43,
"eval_steps_per_second": 0.679,
"step": 1800
},
{
"epoch": 1.830628803245436,
"grad_norm": 1.730995593445214,
"learning_rate": 6.946805070044455e-08,
"loss": 2.2748,
"step": 1805
},
{
"epoch": 1.8356997971602436,
"grad_norm": 1.708076665562477,
"learning_rate": 6.897464737518235e-08,
"loss": 2.2709,
"step": 1810
},
{
"epoch": 1.8407707910750508,
"grad_norm": 1.7961247246527527,
"learning_rate": 6.849263849253629e-08,
"loss": 2.2756,
"step": 1815
},
{
"epoch": 1.845841784989858,
"grad_norm": 1.7873259024447121,
"learning_rate": 6.802178667856782e-08,
"loss": 2.2619,
"step": 1820
},
{
"epoch": 1.8509127789046653,
"grad_norm": 1.7208578483390204,
"learning_rate": 6.756185896518329e-08,
"loss": 2.2563,
"step": 1825
},
{
"epoch": 1.8559837728194726,
"grad_norm": 1.6824119656694438,
"learning_rate": 6.711262671841385e-08,
"loss": 2.2524,
"step": 1830
},
{
"epoch": 1.8610547667342798,
"grad_norm": 1.717042060961093,
"learning_rate": 6.667386556769717e-08,
"loss": 2.3135,
"step": 1835
},
{
"epoch": 1.866125760649087,
"grad_norm": 1.736419652896857,
"learning_rate": 6.624535533615173e-08,
"loss": 2.288,
"step": 1840
},
{
"epoch": 1.8711967545638946,
"grad_norm": 1.75637188785577,
"learning_rate": 6.582687997182971e-08,
"loss": 2.2392,
"step": 1845
},
{
"epoch": 1.8762677484787018,
"grad_norm": 1.7282509939601418,
"learning_rate": 6.54182274799391e-08,
"loss": 2.2662,
"step": 1850
},
{
"epoch": 1.8813387423935093,
"grad_norm": 1.7060962855685544,
"learning_rate": 6.501918985602177e-08,
"loss": 2.2935,
"step": 1855
},
{
"epoch": 1.8864097363083165,
"grad_norm": 1.7581616823404618,
"learning_rate": 6.462956302007797e-08,
"loss": 2.2478,
"step": 1860
},
{
"epoch": 1.8914807302231238,
"grad_norm": 1.7987997676993257,
"learning_rate": 6.424914675162432e-08,
"loss": 2.2853,
"step": 1865
},
{
"epoch": 1.896551724137931,
"grad_norm": 1.7116689993633696,
"learning_rate": 6.387774462567602e-08,
"loss": 2.2503,
"step": 1870
},
{
"epoch": 1.9016227180527383,
"grad_norm": 1.7086258587789072,
"learning_rate": 6.351516394964051e-08,
"loss": 2.2822,
"step": 1875
},
{
"epoch": 1.9066937119675456,
"grad_norm": 1.8235148496074345,
"learning_rate": 6.31612157011135e-08,
"loss": 2.2879,
"step": 1880
},
{
"epoch": 1.9117647058823528,
"grad_norm": 1.7448709638927917,
"learning_rate": 6.281571446656485e-08,
"loss": 2.2586,
"step": 1885
},
{
"epoch": 1.9168356997971603,
"grad_norm": 1.7421662505581106,
"learning_rate": 6.247847838090545e-08,
"loss": 2.2791,
"step": 1890
},
{
"epoch": 1.9219066937119675,
"grad_norm": 1.825830026911039,
"learning_rate": 6.21493290679226e-08,
"loss": 2.2385,
"step": 1895
},
{
"epoch": 1.9269776876267748,
"grad_norm": 1.796187481606512,
"learning_rate": 6.182809158157558e-08,
"loss": 2.2756,
"step": 1900
},
{
"epoch": 1.9320486815415823,
"grad_norm": 1.7552941496595575,
"learning_rate": 6.151459434813879e-08,
"loss": 2.2587,
"step": 1905
},
{
"epoch": 1.9371196754563895,
"grad_norm": 1.7522494947057408,
"learning_rate": 6.120866910918446e-08,
"loss": 2.2585,
"step": 1910
},
{
"epoch": 1.9421906693711968,
"grad_norm": 1.7522459962159465,
"learning_rate": 6.091015086539273e-08,
"loss": 2.251,
"step": 1915
},
{
"epoch": 1.947261663286004,
"grad_norm": 1.702096284758162,
"learning_rate": 6.061887782118077e-08,
"loss": 2.285,
"step": 1920
},
{
"epoch": 1.9523326572008113,
"grad_norm": 1.7643281133012019,
"learning_rate": 6.033469133013957e-08,
"loss": 2.2846,
"step": 1925
},
{
"epoch": 1.9574036511156185,
"grad_norm": 1.6926355627529537,
"learning_rate": 6.005743584126981e-08,
"loss": 2.2124,
"step": 1930
},
{
"epoch": 1.962474645030426,
"grad_norm": 1.6991484085258466,
"learning_rate": 5.984051918509233e-08,
"loss": 2.2919,
"step": 1935
},
{
"epoch": 1.9675456389452333,
"grad_norm": 1.6959402402475394,
"learning_rate": 5.957535718971899e-08,
"loss": 2.2133,
"step": 1940
},
{
"epoch": 1.9726166328600405,
"grad_norm": 1.7435422008262311,
"learning_rate": 5.931670667334593e-08,
"loss": 2.2272,
"step": 1945
},
{
"epoch": 1.977687626774848,
"grad_norm": 1.7235339509485863,
"learning_rate": 5.906442337098544e-08,
"loss": 2.2566,
"step": 1950
},
{
"epoch": 1.9827586206896552,
"grad_norm": 1.8046591422600013,
"learning_rate": 5.881836586579961e-08,
"loss": 2.295,
"step": 1955
},
{
"epoch": 1.9878296146044625,
"grad_norm": 1.8447312096680564,
"learning_rate": 5.8578395539777033e-08,
"loss": 2.29,
"step": 1960
},
{
"epoch": 1.9929006085192698,
"grad_norm": 1.6943108398877464,
"learning_rate": 5.834437652514426e-08,
"loss": 2.2188,
"step": 1965
},
{
"epoch": 1.997971602434077,
"grad_norm": 1.7174652428188777,
"learning_rate": 5.811617565650129e-08,
"loss": 2.2692,
"step": 1970
},
{
"epoch": 2.0030425963488843,
"grad_norm": 1.6831299340128894,
"learning_rate": 5.7893662423673665e-08,
"loss": 2.2025,
"step": 1975
},
{
"epoch": 2.0081135902636915,
"grad_norm": 1.826795197065323,
"learning_rate": 5.767670892527061e-08,
"loss": 2.2579,
"step": 1980
},
{
"epoch": 2.0131845841784988,
"grad_norm": 1.7520235012361185,
"learning_rate": 5.746518982294192e-08,
"loss": 2.2388,
"step": 1985
},
{
"epoch": 2.0182555780933065,
"grad_norm": 1.8440219249964744,
"learning_rate": 5.72589822963234e-08,
"loss": 2.2582,
"step": 1990
},
{
"epoch": 2.0233265720081137,
"grad_norm": 1.7151060194819,
"learning_rate": 5.705796599866345e-08,
"loss": 2.2156,
"step": 1995
},
{
"epoch": 2.028397565922921,
"grad_norm": 1.7333738899068507,
"learning_rate": 5.686202301312118e-08,
"loss": 2.2702,
"step": 2000
},
{
"epoch": 2.028397565922921,
"eval_loss": 2.4733877182006836,
"eval_runtime": 81.1205,
"eval_samples_per_second": 86.365,
"eval_steps_per_second": 0.678,
"step": 2000
},
{
"epoch": 2.0334685598377282,
"grad_norm": 1.7637474983877708,
"learning_rate": 5.667103780972823e-08,
"loss": 2.2378,
"step": 2005
},
{
"epoch": 2.0385395537525355,
"grad_norm": 1.7730571315134518,
"learning_rate": 5.648489720300554e-08,
"loss": 2.2513,
"step": 2010
},
{
"epoch": 2.0436105476673427,
"grad_norm": 1.774271074894755,
"learning_rate": 5.630349031022691e-08,
"loss": 2.2518,
"step": 2015
},
{
"epoch": 2.04868154158215,
"grad_norm": 1.6997020509374097,
"learning_rate": 5.6126708510320976e-08,
"loss": 2.2464,
"step": 2020
},
{
"epoch": 2.0537525354969572,
"grad_norm": 1.7833382557650153,
"learning_rate": 5.595444540340353e-08,
"loss": 2.2317,
"step": 2025
},
{
"epoch": 2.0588235294117645,
"grad_norm": 1.7296871432561252,
"learning_rate": 5.578659677093205e-08,
"loss": 2.231,
"step": 2030
},
{
"epoch": 2.063894523326572,
"grad_norm": 1.7166463945290173,
"learning_rate": 5.562306053647459e-08,
"loss": 2.2347,
"step": 2035
},
{
"epoch": 2.0689655172413794,
"grad_norm": 1.7948324654757548,
"learning_rate": 5.546373672708482e-08,
"loss": 2.2458,
"step": 2040
},
{
"epoch": 2.0740365111561867,
"grad_norm": 1.745646645076283,
"learning_rate": 5.530852743527571e-08,
"loss": 2.2504,
"step": 2045
},
{
"epoch": 2.079107505070994,
"grad_norm": 1.7778201657756552,
"learning_rate": 5.515733678158393e-08,
"loss": 2.26,
"step": 2050
},
{
"epoch": 2.084178498985801,
"grad_norm": 1.7226724662159607,
"learning_rate": 5.5010070877717374e-08,
"loss": 2.24,
"step": 2055
},
{
"epoch": 2.0892494929006085,
"grad_norm": 1.737085412071484,
"learning_rate": 5.486663779027808e-08,
"loss": 2.2138,
"step": 2060
},
{
"epoch": 2.0943204868154157,
"grad_norm": 1.7680067007098665,
"learning_rate": 5.4726947505053265e-08,
"loss": 2.2688,
"step": 2065
},
{
"epoch": 2.099391480730223,
"grad_norm": 1.7414742255329991,
"learning_rate": 5.459091189186688e-08,
"loss": 2.2591,
"step": 2070
},
{
"epoch": 2.1044624746450302,
"grad_norm": 1.7804223600059563,
"learning_rate": 5.4458444669984314e-08,
"loss": 2.2337,
"step": 2075
},
{
"epoch": 2.109533468559838,
"grad_norm": 1.7481822321590552,
"learning_rate": 5.432946137406314e-08,
"loss": 2.2792,
"step": 2080
},
{
"epoch": 2.114604462474645,
"grad_norm": 1.7497391573214505,
"learning_rate": 5.420387932064249e-08,
"loss": 2.2927,
"step": 2085
},
{
"epoch": 2.1196754563894524,
"grad_norm": 1.7279168540890797,
"learning_rate": 5.408161757516413e-08,
"loss": 2.2451,
"step": 2090
},
{
"epoch": 2.1247464503042597,
"grad_norm": 1.7394662730899328,
"learning_rate": 5.396259691951805e-08,
"loss": 2.2424,
"step": 2095
},
{
"epoch": 2.129817444219067,
"grad_norm": 1.77875077601377,
"learning_rate": 5.384673982010568e-08,
"loss": 2.2402,
"step": 2100
},
{
"epoch": 2.134888438133874,
"grad_norm": 1.7319261658863345,
"learning_rate": 5.373397039641377e-08,
"loss": 2.2287,
"step": 2105
},
{
"epoch": 2.1399594320486814,
"grad_norm": 1.751571162082358,
"learning_rate": 5.362421439009217e-08,
"loss": 2.2334,
"step": 2110
},
{
"epoch": 2.1450304259634887,
"grad_norm": 1.8093044605440316,
"learning_rate": 5.351739913452874e-08,
"loss": 2.271,
"step": 2115
},
{
"epoch": 2.150101419878296,
"grad_norm": 1.8469881188013633,
"learning_rate": 5.341345352491468e-08,
"loss": 2.2284,
"step": 2120
},
{
"epoch": 2.1551724137931036,
"grad_norm": 1.7711139740473771,
"learning_rate": 5.331230798879373e-08,
"loss": 2.2644,
"step": 2125
},
{
"epoch": 2.160243407707911,
"grad_norm": 1.7271859975777568,
"learning_rate": 5.3213894457088646e-08,
"loss": 2.2378,
"step": 2130
},
{
"epoch": 2.165314401622718,
"grad_norm": 1.8925272013685321,
"learning_rate": 5.3118146335598536e-08,
"loss": 2.265,
"step": 2135
},
{
"epoch": 2.1703853955375254,
"grad_norm": 1.7527393142771752,
"learning_rate": 5.3024998476960626e-08,
"loss": 2.2183,
"step": 2140
},
{
"epoch": 2.1754563894523327,
"grad_norm": 1.7698628867396988,
"learning_rate": 5.293438715307019e-08,
"loss": 2.233,
"step": 2145
},
{
"epoch": 2.18052738336714,
"grad_norm": 1.724950058777004,
"learning_rate": 5.2846250027952295e-08,
"loss": 2.249,
"step": 2150
},
{
"epoch": 2.185598377281947,
"grad_norm": 1.9072718835854334,
"learning_rate": 5.276052613107927e-08,
"loss": 2.2342,
"step": 2155
},
{
"epoch": 2.1906693711967544,
"grad_norm": 1.7983471937343785,
"learning_rate": 5.2677155831127696e-08,
"loss": 2.2707,
"step": 2160
},
{
"epoch": 2.1957403651115617,
"grad_norm": 1.7092533410568467,
"learning_rate": 5.259608081016899e-08,
"loss": 2.2479,
"step": 2165
},
{
"epoch": 2.2008113590263694,
"grad_norm": 1.7921254707864127,
"learning_rate": 5.2517244038287416e-08,
"loss": 2.229,
"step": 2170
},
{
"epoch": 2.2058823529411766,
"grad_norm": 1.75489401951672,
"learning_rate": 5.244058974861976e-08,
"loss": 2.2772,
"step": 2175
},
{
"epoch": 2.210953346855984,
"grad_norm": 1.8175479517709452,
"learning_rate": 5.236606341281078e-08,
"loss": 2.2356,
"step": 2180
},
{
"epoch": 2.216024340770791,
"grad_norm": 1.808556074117745,
"learning_rate": 5.229361171687859e-08,
"loss": 2.2553,
"step": 2185
},
{
"epoch": 2.2210953346855984,
"grad_norm": 1.7664667006627157,
"learning_rate": 5.2223182537484316e-08,
"loss": 2.2719,
"step": 2190
},
{
"epoch": 2.2261663286004056,
"grad_norm": 1.7502392717778497,
"learning_rate": 5.2154724918600314e-08,
"loss": 2.2583,
"step": 2195
},
{
"epoch": 2.231237322515213,
"grad_norm": 1.7242967584463027,
"learning_rate": 5.208818904857144e-08,
"loss": 2.2411,
"step": 2200
},
{
"epoch": 2.231237322515213,
"eval_loss": 2.474597930908203,
"eval_runtime": 81.0438,
"eval_samples_per_second": 86.447,
"eval_steps_per_second": 0.679,
"step": 2200
},
{
"epoch": 2.23630831643002,
"grad_norm": 1.760326712726159,
"learning_rate": 5.202352623756371e-08,
"loss": 2.2356,
"step": 2205
},
{
"epoch": 2.2413793103448274,
"grad_norm": 1.7625638663030738,
"learning_rate": 5.1960688895395006e-08,
"loss": 2.2441,
"step": 2210
},
{
"epoch": 2.2464503042596347,
"grad_norm": 1.7518142596486186,
"learning_rate": 5.189963050974238e-08,
"loss": 2.2674,
"step": 2215
},
{
"epoch": 2.2515212981744424,
"grad_norm": 1.8040378121090448,
"learning_rate": 5.184030562472053e-08,
"loss": 2.2233,
"step": 2220
},
{
"epoch": 2.2565922920892496,
"grad_norm": 1.769147010660197,
"learning_rate": 5.1782669819826294e-08,
"loss": 2.2445,
"step": 2225
},
{
"epoch": 2.261663286004057,
"grad_norm": 1.802360281392845,
"learning_rate": 5.1726679689243875e-08,
"loss": 2.234,
"step": 2230
},
{
"epoch": 2.266734279918864,
"grad_norm": 1.763707867667644,
"learning_rate": 5.1672292821505586e-08,
"loss": 2.2132,
"step": 2235
},
{
"epoch": 2.2718052738336714,
"grad_norm": 1.75034581686763,
"learning_rate": 5.161946777950308e-08,
"loss": 2.2381,
"step": 2240
},
{
"epoch": 2.2768762677484786,
"grad_norm": 1.7401836199474783,
"learning_rate": 5.1568164080844036e-08,
"loss": 2.2416,
"step": 2245
},
{
"epoch": 2.281947261663286,
"grad_norm": 1.7713650977668527,
"learning_rate": 5.1518342178549174e-08,
"loss": 2.224,
"step": 2250
},
{
"epoch": 2.287018255578093,
"grad_norm": 1.7671231076913356,
"learning_rate": 5.146996344208486e-08,
"loss": 2.2183,
"step": 2255
},
{
"epoch": 2.292089249492901,
"grad_norm": 1.7464419032652747,
"learning_rate": 5.142299013872629e-08,
"loss": 2.2419,
"step": 2260
},
{
"epoch": 2.297160243407708,
"grad_norm": 1.7990294085116565,
"learning_rate": 5.1377385415246445e-08,
"loss": 2.2311,
"step": 2265
},
{
"epoch": 2.3022312373225153,
"grad_norm": 1.7543351264072877,
"learning_rate": 5.1333113279926185e-08,
"loss": 2.238,
"step": 2270
},
{
"epoch": 2.3073022312373226,
"grad_norm": 1.6898279670163325,
"learning_rate": 5.129013858488057e-08,
"loss": 2.2308,
"step": 2275
},
{
"epoch": 2.31237322515213,
"grad_norm": 1.7334567047607963,
"learning_rate": 5.124842700869695e-08,
"loss": 2.3031,
"step": 2280
},
{
"epoch": 2.317444219066937,
"grad_norm": 1.760983319309442,
"learning_rate": 5.120794503938012e-08,
"loss": 2.2455,
"step": 2285
},
{
"epoch": 2.3225152129817443,
"grad_norm": 1.7621675205518297,
"learning_rate": 5.116865995760006e-08,
"loss": 2.228,
"step": 2290
},
{
"epoch": 2.3275862068965516,
"grad_norm": 1.8080633887862172,
"learning_rate": 5.113053982023768e-08,
"loss": 2.284,
"step": 2295
},
{
"epoch": 2.332657200811359,
"grad_norm": 1.7592998081055247,
"learning_rate": 5.1093553444224286e-08,
"loss": 2.2196,
"step": 2300
},
{
"epoch": 2.337728194726166,
"grad_norm": 1.7831607571885368,
"learning_rate": 5.105767039067024e-08,
"loss": 2.269,
"step": 2305
},
{
"epoch": 2.342799188640974,
"grad_norm": 1.7176459519033709,
"learning_rate": 5.102286094927856e-08,
"loss": 2.2435,
"step": 2310
},
{
"epoch": 2.347870182555781,
"grad_norm": 1.7512756209003166,
"learning_rate": 5.098909612303925e-08,
"loss": 2.2579,
"step": 2315
},
{
"epoch": 2.3529411764705883,
"grad_norm": 1.7419259056225642,
"learning_rate": 5.095634761319991e-08,
"loss": 2.268,
"step": 2320
},
{
"epoch": 2.3580121703853956,
"grad_norm": 1.7461469979215953,
"learning_rate": 5.092458780450876e-08,
"loss": 2.2252,
"step": 2325
},
{
"epoch": 2.363083164300203,
"grad_norm": 1.745083473021831,
"learning_rate": 5.089378975072569e-08,
"loss": 2.2591,
"step": 2330
},
{
"epoch": 2.36815415821501,
"grad_norm": 1.8343705825023535,
"learning_rate": 5.086392716039744e-08,
"loss": 2.2626,
"step": 2335
},
{
"epoch": 2.3732251521298173,
"grad_norm": 1.7515682941502182,
"learning_rate": 5.0834974382892763e-08,
"loss": 2.2378,
"step": 2340
},
{
"epoch": 2.3782961460446246,
"grad_norm": 1.772483228062822,
"learning_rate": 5.080690639469371e-08,
"loss": 2.2906,
"step": 2345
},
{
"epoch": 2.3833671399594323,
"grad_norm": 1.8298309311035177,
"learning_rate": 5.077969878593903e-08,
"loss": 2.2782,
"step": 2350
},
{
"epoch": 2.3884381338742395,
"grad_norm": 1.778228901931638,
"learning_rate": 5.0753327747215805e-08,
"loss": 2.2687,
"step": 2355
},
{
"epoch": 2.393509127789047,
"grad_norm": 1.9355725485663295,
"learning_rate": 5.0727770056595594e-08,
"loss": 2.25,
"step": 2360
},
{
"epoch": 2.398580121703854,
"grad_norm": 1.7876677525732199,
"learning_rate": 5.070300306691114e-08,
"loss": 2.2811,
"step": 2365
},
{
"epoch": 2.4036511156186613,
"grad_norm": 1.766450812020173,
"learning_rate": 5.067900469327011e-08,
"loss": 2.265,
"step": 2370
},
{
"epoch": 2.4087221095334685,
"grad_norm": 1.6988211316677768,
"learning_rate": 5.065575340080193e-08,
"loss": 2.2458,
"step": 2375
},
{
"epoch": 2.413793103448276,
"grad_norm": 1.777565241311822,
"learning_rate": 5.063322819263436e-08,
"loss": 2.289,
"step": 2380
},
{
"epoch": 2.418864097363083,
"grad_norm": 1.766648317811343,
"learning_rate": 5.061140859809592e-08,
"loss": 2.2263,
"step": 2385
},
{
"epoch": 2.4239350912778903,
"grad_norm": 1.760808570512941,
"learning_rate": 5.059027466114087e-08,
"loss": 2.2371,
"step": 2390
},
{
"epoch": 2.4290060851926976,
"grad_norm": 1.7497881623660254,
"learning_rate": 5.056980692899308e-08,
"loss": 2.2186,
"step": 2395
},
{
"epoch": 2.4340770791075053,
"grad_norm": 1.904368651484495,
"learning_rate": 5.0549986441005356e-08,
"loss": 2.2413,
"step": 2400
},
{
"epoch": 2.4340770791075053,
"eval_loss": 2.4748759269714355,
"eval_runtime": 81.0832,
"eval_samples_per_second": 86.405,
"eval_steps_per_second": 0.678,
"step": 2400
},
{
"epoch": 2.4391480730223125,
"grad_norm": 1.7410363640013542,
"learning_rate": 5.053079471773089e-08,
"loss": 2.2531,
"step": 2405
},
{
"epoch": 2.4442190669371198,
"grad_norm": 1.7518018775000213,
"learning_rate": 5.0512213750203305e-08,
"loss": 2.2473,
"step": 2410
},
{
"epoch": 2.449290060851927,
"grad_norm": 1.7662222396602074,
"learning_rate": 5.049422598942212e-08,
"loss": 2.2389,
"step": 2415
},
{
"epoch": 2.4543610547667343,
"grad_norm": 1.780666367007688,
"learning_rate": 5.0476814336040274e-08,
"loss": 2.197,
"step": 2420
},
{
"epoch": 2.4594320486815415,
"grad_norm": 1.7499711395815145,
"learning_rate": 5.04599621302504e-08,
"loss": 2.2261,
"step": 2425
},
{
"epoch": 2.464503042596349,
"grad_norm": 1.7882713122146334,
"learning_rate": 5.04436531418668e-08,
"loss": 2.2393,
"step": 2430
},
{
"epoch": 2.469574036511156,
"grad_norm": 1.75643986036064,
"learning_rate": 5.042787156059982e-08,
"loss": 2.2439,
"step": 2435
},
{
"epoch": 2.4746450304259637,
"grad_norm": 1.7353199942499,
"learning_rate": 5.041260198651953e-08,
"loss": 2.2275,
"step": 2440
},
{
"epoch": 2.479716024340771,
"grad_norm": 1.7683236873580634,
"learning_rate": 5.039782942070575e-08,
"loss": 2.2378,
"step": 2445
},
{
"epoch": 2.4847870182555782,
"grad_norm": 1.7482878827223234,
"learning_rate": 5.038353925608112e-08,
"loss": 2.2655,
"step": 2450
},
{
"epoch": 2.4898580121703855,
"grad_norm": 1.7553465772492238,
"learning_rate": 5.036971726842454e-08,
"loss": 2.2509,
"step": 2455
},
{
"epoch": 2.4949290060851927,
"grad_norm": 1.7194051175937297,
"learning_rate": 5.035634960756173e-08,
"loss": 2.2246,
"step": 2460
},
{
"epoch": 2.5,
"grad_norm": 1.780820717878673,
"learning_rate": 5.0345973520341744e-08,
"loss": 2.3116,
"step": 2465
},
{
"epoch": 2.5050709939148073,
"grad_norm": 1.7092302368812895,
"learning_rate": 5.0333389906255366e-08,
"loss": 2.2434,
"step": 2470
},
{
"epoch": 2.5101419878296145,
"grad_norm": 1.6995993050400164,
"learning_rate": 5.03212237555571e-08,
"loss": 2.234,
"step": 2475
},
{
"epoch": 2.5152129817444218,
"grad_norm": 1.7916125090755124,
"learning_rate": 5.030946256214713e-08,
"loss": 2.2365,
"step": 2480
},
{
"epoch": 2.520283975659229,
"grad_norm": 1.743409123646943,
"learning_rate": 5.0298094154063516e-08,
"loss": 2.2778,
"step": 2485
},
{
"epoch": 2.5253549695740363,
"grad_norm": 1.7989761193864806,
"learning_rate": 5.028710668564437e-08,
"loss": 2.2698,
"step": 2490
},
{
"epoch": 2.530425963488844,
"grad_norm": 1.768436463277154,
"learning_rate": 5.027648862984817e-08,
"loss": 2.2295,
"step": 2495
},
{
"epoch": 2.535496957403651,
"grad_norm": 1.7762161444449078,
"learning_rate": 5.026622877072948e-08,
"loss": 2.2772,
"step": 2500
},
{
"epoch": 2.5405679513184585,
"grad_norm": 1.7325943514517332,
"learning_rate": 5.0256316196067565e-08,
"loss": 2.2326,
"step": 2505
},
{
"epoch": 2.5456389452332657,
"grad_norm": 1.7568007182157335,
"learning_rate": 5.024674029014512e-08,
"loss": 2.2575,
"step": 2510
},
{
"epoch": 2.550709939148073,
"grad_norm": 1.7465474101311085,
"learning_rate": 5.023749072667476e-08,
"loss": 2.2398,
"step": 2515
},
{
"epoch": 2.5557809330628802,
"grad_norm": 1.7105972624166814,
"learning_rate": 5.022855746187064e-08,
"loss": 2.2348,
"step": 2520
},
{
"epoch": 2.5608519269776875,
"grad_norm": 1.759196327867933,
"learning_rate": 5.021993072766265e-08,
"loss": 2.2302,
"step": 2525
},
{
"epoch": 2.565922920892495,
"grad_norm": 1.7618696598564434,
"learning_rate": 5.0211601025050875e-08,
"loss": 2.2783,
"step": 2530
},
{
"epoch": 2.5709939148073024,
"grad_norm": 1.7357397604845723,
"learning_rate": 5.020355911759782e-08,
"loss": 2.2399,
"step": 2535
},
{
"epoch": 2.5760649087221097,
"grad_norm": 1.7797963559349856,
"learning_rate": 5.019579602505595e-08,
"loss": 2.3119,
"step": 2540
},
{
"epoch": 2.581135902636917,
"grad_norm": 1.7476476267237637,
"learning_rate": 5.0188303017128396e-08,
"loss": 2.2362,
"step": 2545
},
{
"epoch": 2.586206896551724,
"grad_norm": 1.7871655712678034,
"learning_rate": 5.018107160736018e-08,
"loss": 2.2684,
"step": 2550
},
{
"epoch": 2.5912778904665315,
"grad_norm": 1.8564365985849263,
"learning_rate": 5.0174093547158035e-08,
"loss": 2.2683,
"step": 2555
},
{
"epoch": 2.5963488843813387,
"grad_norm": 1.7498854511370805,
"learning_rate": 5.016736081993624e-08,
"loss": 2.2518,
"step": 2560
},
{
"epoch": 2.601419878296146,
"grad_norm": 1.7966977533010748,
"learning_rate": 5.016086563538651e-08,
"loss": 2.2218,
"step": 2565
},
{
"epoch": 2.606490872210953,
"grad_norm": 1.7558979371137615,
"learning_rate": 5.015460042386951e-08,
"loss": 2.2658,
"step": 2570
},
{
"epoch": 2.6115618661257605,
"grad_norm": 1.7805268954368878,
"learning_rate": 5.014855783092602e-08,
"loss": 2.2324,
"step": 2575
},
{
"epoch": 2.6166328600405677,
"grad_norm": 1.7547744035144406,
"learning_rate": 5.0142730711905564e-08,
"loss": 2.2635,
"step": 2580
},
{
"epoch": 2.6217038539553754,
"grad_norm": 1.7892043381738651,
"learning_rate": 5.013711212671024e-08,
"loss": 2.2174,
"step": 2585
},
{
"epoch": 2.6267748478701827,
"grad_norm": 1.7661048483256172,
"learning_rate": 5.013169533465201e-08,
"loss": 2.2411,
"step": 2590
},
{
"epoch": 2.63184584178499,
"grad_norm": 1.7714992602824393,
"learning_rate": 5.012647378942108e-08,
"loss": 2.2379,
"step": 2595
},
{
"epoch": 2.636916835699797,
"grad_norm": 1.757980523509378,
"learning_rate": 5.0121441134163554e-08,
"loss": 2.216,
"step": 2600
},
{
"epoch": 2.636916835699797,
"eval_loss": 2.4749209880828857,
"eval_runtime": 81.0391,
"eval_samples_per_second": 86.452,
"eval_steps_per_second": 0.679,
"step": 2600
},
{
"epoch": 2.6419878296146044,
"grad_norm": 1.8485215916583273,
"learning_rate": 5.011659119666631e-08,
"loss": 2.2233,
"step": 2605
},
{
"epoch": 2.6470588235294117,
"grad_norm": 1.7863067371305124,
"learning_rate": 5.0111917984647157e-08,
"loss": 2.244,
"step": 2610
},
{
"epoch": 2.652129817444219,
"grad_norm": 1.7146816358296353,
"learning_rate": 5.010741568114834e-08,
"loss": 2.2351,
"step": 2615
},
{
"epoch": 2.6572008113590266,
"grad_norm": 1.831188399230356,
"learning_rate": 5.0103078640031516e-08,
"loss": 2.2269,
"step": 2620
},
{
"epoch": 2.662271805273834,
"grad_norm": 1.7724728214531387,
"learning_rate": 5.009890138157231e-08,
"loss": 2.2075,
"step": 2625
},
{
"epoch": 2.667342799188641,
"grad_norm": 1.782021890238949,
"learning_rate": 5.009487858815262e-08,
"loss": 2.217,
"step": 2630
},
{
"epoch": 2.6724137931034484,
"grad_norm": 1.7481328251498853,
"learning_rate": 5.0091005100048845e-08,
"loss": 2.2719,
"step": 2635
},
{
"epoch": 2.6774847870182557,
"grad_norm": 1.7906104909059064,
"learning_rate": 5.0087275911314286e-08,
"loss": 2.236,
"step": 2640
},
{
"epoch": 2.682555780933063,
"grad_norm": 1.7602535674283515,
"learning_rate": 5.008368616575389e-08,
"loss": 2.2479,
"step": 2645
},
{
"epoch": 2.68762677484787,
"grad_norm": 1.775336072092801,
"learning_rate": 5.00802311529897e-08,
"loss": 2.2651,
"step": 2650
},
{
"epoch": 2.6926977687626774,
"grad_norm": 1.7553544981528668,
"learning_rate": 5.00769063046152e-08,
"loss": 2.2695,
"step": 2655
},
{
"epoch": 2.6977687626774847,
"grad_norm": 1.827043155040219,
"learning_rate": 5.0073707190436947e-08,
"loss": 2.2565,
"step": 2660
},
{
"epoch": 2.702839756592292,
"grad_norm": 1.7286161050152862,
"learning_rate": 5.00706295148018e-08,
"loss": 2.2447,
"step": 2665
},
{
"epoch": 2.707910750507099,
"grad_norm": 1.818175461042268,
"learning_rate": 5.0067669113008144e-08,
"loss": 2.2437,
"step": 2670
},
{
"epoch": 2.7129817444219064,
"grad_norm": 1.8017061603291116,
"learning_rate": 5.006482194779946e-08,
"loss": 2.2557,
"step": 2675
},
{
"epoch": 2.718052738336714,
"grad_norm": 1.7866064039916518,
"learning_rate": 5.006208410593867e-08,
"loss": 2.2752,
"step": 2680
},
{
"epoch": 2.7231237322515214,
"grad_norm": 1.7655940160674672,
"learning_rate": 5.0059451794861766e-08,
"loss": 2.2834,
"step": 2685
},
{
"epoch": 2.7281947261663286,
"grad_norm": 1.7936324116014108,
"learning_rate": 5.005692133940906e-08,
"loss": 2.2634,
"step": 2690
},
{
"epoch": 2.733265720081136,
"grad_norm": 1.7857563825463283,
"learning_rate": 5.00544891786327e-08,
"loss": 2.2741,
"step": 2695
},
{
"epoch": 2.738336713995943,
"grad_norm": 1.7472045814339527,
"learning_rate": 5.005215186267882e-08,
"loss": 2.2644,
"step": 2700
},
{
"epoch": 2.7434077079107504,
"grad_norm": 1.8795177703424921,
"learning_rate": 5.0049906049743e-08,
"loss": 2.3007,
"step": 2705
},
{
"epoch": 2.7484787018255576,
"grad_norm": 1.8521743861085576,
"learning_rate": 5.004774850309745e-08,
"loss": 2.2366,
"step": 2710
},
{
"epoch": 2.7535496957403653,
"grad_norm": 1.7735396381086006,
"learning_rate": 5.0045676088188616e-08,
"loss": 2.2481,
"step": 2715
},
{
"epoch": 2.7586206896551726,
"grad_norm": 1.750426755759642,
"learning_rate": 5.004368576980381e-08,
"loss": 2.2235,
"step": 2720
},
{
"epoch": 2.76369168356998,
"grad_norm": 1.7041388090684644,
"learning_rate": 5.004177460930539e-08,
"loss": 2.2231,
"step": 2725
},
{
"epoch": 2.768762677484787,
"grad_norm": 1.8140115420681437,
"learning_rate": 5.003993976193124e-08,
"loss": 2.2138,
"step": 2730
},
{
"epoch": 2.7738336713995944,
"grad_norm": 1.822513477317258,
"learning_rate": 5.0038178474160234e-08,
"loss": 2.2612,
"step": 2735
},
{
"epoch": 2.7789046653144016,
"grad_norm": 1.7108014551704207,
"learning_rate": 5.003648808114121e-08,
"loss": 2.2464,
"step": 2740
},
{
"epoch": 2.783975659229209,
"grad_norm": 1.7880353168056893,
"learning_rate": 5.0034866004184443e-08,
"loss": 2.2571,
"step": 2745
},
{
"epoch": 2.789046653144016,
"grad_norm": 1.738078469289302,
"learning_rate": 5.003330974831406e-08,
"loss": 2.2712,
"step": 2750
},
{
"epoch": 2.7941176470588234,
"grad_norm": 1.851997917147577,
"learning_rate": 5.0031816899880413e-08,
"loss": 2.266,
"step": 2755
},
{
"epoch": 2.7991886409736306,
"grad_norm": 1.7297614602052127,
"learning_rate": 5.0030385124230966e-08,
"loss": 2.2423,
"step": 2760
},
{
"epoch": 2.804259634888438,
"grad_norm": 1.8006816107770167,
"learning_rate": 5.002901216343864e-08,
"loss": 2.2506,
"step": 2765
},
{
"epoch": 2.8093306288032456,
"grad_norm": 1.8037373860257597,
"learning_rate": 5.002769583408638e-08,
"loss": 2.2504,
"step": 2770
},
{
"epoch": 2.814401622718053,
"grad_norm": 1.7406557827783702,
"learning_rate": 5.002643402510677e-08,
"loss": 2.2676,
"step": 2775
},
{
"epoch": 2.81947261663286,
"grad_norm": 1.7784795193672072,
"learning_rate": 5.0025224695675576e-08,
"loss": 2.2052,
"step": 2780
},
{
"epoch": 2.8245436105476673,
"grad_norm": 1.7627831810019972,
"learning_rate": 5.002406587315805e-08,
"loss": 2.2315,
"step": 2785
},
{
"epoch": 2.8296146044624746,
"grad_norm": 1.798869752268086,
"learning_rate": 5.0022955651106973e-08,
"loss": 2.2436,
"step": 2790
},
{
"epoch": 2.834685598377282,
"grad_norm": 1.712097491290732,
"learning_rate": 5.00218921873112e-08,
"loss": 2.274,
"step": 2795
},
{
"epoch": 2.839756592292089,
"grad_norm": 1.8197661388888422,
"learning_rate": 5.002087370189384e-08,
"loss": 2.2696,
"step": 2800
},
{
"epoch": 2.839756592292089,
"eval_loss": 2.4746689796447754,
"eval_runtime": 80.933,
"eval_samples_per_second": 86.565,
"eval_steps_per_second": 0.68,
"step": 2800
},
{
"epoch": 2.844827586206897,
"grad_norm": 1.7693694208988924,
"learning_rate": 5.001989847545882e-08,
"loss": 2.2054,
"step": 2805
},
{
"epoch": 2.849898580121704,
"grad_norm": 1.8223549799119019,
"learning_rate": 5.001896484728491e-08,
"loss": 2.2656,
"step": 2810
},
{
"epoch": 2.8549695740365113,
"grad_norm": 1.805868445642325,
"learning_rate": 5.00180712135662e-08,
"loss": 2.26,
"step": 2815
},
{
"epoch": 2.8600405679513186,
"grad_norm": 1.7505054153674502,
"learning_rate": 5.001721602569797e-08,
"loss": 2.2465,
"step": 2820
},
{
"epoch": 2.865111561866126,
"grad_norm": 1.8486977309170785,
"learning_rate": 5.0016397788606984e-08,
"loss": 2.2764,
"step": 2825
},
{
"epoch": 2.870182555780933,
"grad_norm": 1.7740829866432102,
"learning_rate": 5.0015615059125324e-08,
"loss": 2.2303,
"step": 2830
},
{
"epoch": 2.8752535496957403,
"grad_norm": 1.7656514305652502,
"learning_rate": 5.00148664444067e-08,
"loss": 2.238,
"step": 2835
},
{
"epoch": 2.8803245436105476,
"grad_norm": 1.7634420973902674,
"learning_rate": 5.001415060038435e-08,
"loss": 2.2489,
"step": 2840
},
{
"epoch": 2.885395537525355,
"grad_norm": 1.8143454888420456,
"learning_rate": 5.0013466230269694e-08,
"loss": 2.2607,
"step": 2845
},
{
"epoch": 2.890466531440162,
"grad_norm": 1.7405623983796592,
"learning_rate": 5.001281208309067e-08,
"loss": 2.2677,
"step": 2850
},
{
"epoch": 2.8955375253549693,
"grad_norm": 1.7692613071607504,
"learning_rate": 5.0012186952269086e-08,
"loss": 2.2499,
"step": 2855
},
{
"epoch": 2.900608519269777,
"grad_norm": 1.8007487263191868,
"learning_rate": 5.0011589674235926e-08,
"loss": 2.277,
"step": 2860
},
{
"epoch": 2.9056795131845843,
"grad_norm": 1.7487914626739638,
"learning_rate": 5.001101912708386e-08,
"loss": 2.2377,
"step": 2865
},
{
"epoch": 2.9107505070993915,
"grad_norm": 1.7555747509644022,
"learning_rate": 5.0010474229256126e-08,
"loss": 2.2532,
"step": 2870
},
{
"epoch": 2.915821501014199,
"grad_norm": 1.791874000591728,
"learning_rate": 5.0009953938270927e-08,
"loss": 2.234,
"step": 2875
},
{
"epoch": 2.920892494929006,
"grad_norm": 1.8071787232301668,
"learning_rate": 5.0009457249480536e-08,
"loss": 2.2316,
"step": 2880
},
{
"epoch": 2.9259634888438133,
"grad_norm": 1.7814343272445903,
"learning_rate": 5.000898319486436e-08,
"loss": 2.2427,
"step": 2885
},
{
"epoch": 2.9310344827586206,
"grad_norm": 1.8248593697919109,
"learning_rate": 5.000853084185513e-08,
"loss": 2.2027,
"step": 2890
},
{
"epoch": 2.9361054766734282,
"grad_norm": 1.7986268547334479,
"learning_rate": 5.00080992921975e-08,
"loss": 2.244,
"step": 2895
},
{
"epoch": 2.9411764705882355,
"grad_norm": 1.8701642658692874,
"learning_rate": 5.0007687680838296e-08,
"loss": 2.2341,
"step": 2900
},
{
"epoch": 2.9462474645030428,
"grad_norm": 1.7265239787323012,
"learning_rate": 5.000729517484766e-08,
"loss": 2.2781,
"step": 2905
},
{
"epoch": 2.95131845841785,
"grad_norm": 1.7596094154490194,
"learning_rate": 5.0006920972370384e-08,
"loss": 2.2184,
"step": 2910
},
{
"epoch": 2.9563894523326573,
"grad_norm": 1.775542548895703,
"learning_rate": 5.000656430160671e-08,
"loss": 2.2404,
"step": 2915
},
{
"epoch": 2.9614604462474645,
"grad_norm": 1.7859302210997496,
"learning_rate": 5.0006224419821984e-08,
"loss": 2.2567,
"step": 2920
},
{
"epoch": 2.9665314401622718,
"grad_norm": 1.8410867262560875,
"learning_rate": 5.000590061238431e-08,
"loss": 2.2288,
"step": 2925
},
{
"epoch": 2.971602434077079,
"grad_norm": 1.79261063919542,
"learning_rate": 5.0005592191829755e-08,
"loss": 2.2421,
"step": 2930
},
{
"epoch": 2.9766734279918863,
"grad_norm": 1.787266539908181,
"learning_rate": 5.0005298496954236e-08,
"loss": 2.2713,
"step": 2935
},
{
"epoch": 2.9817444219066935,
"grad_norm": 1.8046073077938924,
"learning_rate": 5.000501889193161e-08,
"loss": 2.2292,
"step": 2940
},
{
"epoch": 2.986815415821501,
"grad_norm": 1.785150585134779,
"learning_rate": 5.0004752765457286e-08,
"loss": 2.2557,
"step": 2945
},
{
"epoch": 2.991886409736308,
"grad_norm": 1.7007836630596234,
"learning_rate": 5.000449952991666e-08,
"loss": 2.2913,
"step": 2950
},
{
"epoch": 2.9969574036511157,
"grad_norm": 1.7834634363941848,
"learning_rate": 5.000425862057791e-08,
"loss": 2.2178,
"step": 2955
},
{
"epoch": 3.002028397565923,
"grad_norm": 1.7711499203458665,
"learning_rate": 5.000402949480845e-08,
"loss": 2.2302,
"step": 2960
},
{
"epoch": 3.0070993914807302,
"grad_norm": 1.757400702100505,
"learning_rate": 5.000381163131448e-08,
"loss": 2.228,
"step": 2965
},
{
"epoch": 3.0121703853955375,
"grad_norm": 1.7587243027978727,
"learning_rate": 5.0003604529403105e-08,
"loss": 2.2532,
"step": 2970
},
{
"epoch": 3.0172413793103448,
"grad_norm": 1.8076763012567914,
"learning_rate": 5.000340770826644e-08,
"loss": 2.2812,
"step": 2975
},
{
"epoch": 3.022312373225152,
"grad_norm": 1.7710168575859588,
"learning_rate": 5.000322070628711e-08,
"loss": 2.2227,
"step": 2980
},
{
"epoch": 3.0273833671399593,
"grad_norm": 1.7518567665908418,
"learning_rate": 5.0003043080364665e-08,
"loss": 2.267,
"step": 2985
},
{
"epoch": 3.032454361054767,
"grad_norm": 1.75371879782544,
"learning_rate": 5.0002874405262365e-08,
"loss": 2.2748,
"step": 2990
},
{
"epoch": 3.037525354969574,
"grad_norm": 1.7604102341237111,
"learning_rate": 5.000271427297382e-08,
"loss": 2.244,
"step": 2995
},
{
"epoch": 3.0425963488843815,
"grad_norm": 1.7473066315528492,
"learning_rate": 5.0002562292108974e-08,
"loss": 2.2455,
"step": 3000
},
{
"epoch": 3.0425963488843815,
"eval_loss": 2.475208282470703,
"eval_runtime": 81.0816,
"eval_samples_per_second": 86.407,
"eval_steps_per_second": 0.678,
"step": 3000
},
{
"epoch": 3.0476673427991887,
"grad_norm": 1.8183626105425974,
"learning_rate": 5.000241808729891e-08,
"loss": 2.2598,
"step": 3005
},
{
"epoch": 3.052738336713996,
"grad_norm": 1.776003383845723,
"learning_rate": 5.00022812986191e-08,
"loss": 2.2749,
"step": 3010
},
{
"epoch": 3.0578093306288032,
"grad_norm": 1.8405505191800016,
"learning_rate": 5.0002151581030434e-08,
"loss": 2.2201,
"step": 3015
},
{
"epoch": 3.0628803245436105,
"grad_norm": 1.7687042107524293,
"learning_rate": 5.00020286038378e-08,
"loss": 2.2398,
"step": 3020
},
{
"epoch": 3.0679513184584177,
"grad_norm": 1.7504153888466234,
"learning_rate": 5.000191205016553e-08,
"loss": 2.2221,
"step": 3025
},
{
"epoch": 3.073022312373225,
"grad_norm": 1.7642074409964643,
"learning_rate": 5.000180161644944e-08,
"loss": 2.2223,
"step": 3030
},
{
"epoch": 3.0780933062880322,
"grad_norm": 1.7392036544850287,
"learning_rate": 5.000169701194494e-08,
"loss": 2.2192,
"step": 3035
},
{
"epoch": 3.08316430020284,
"grad_norm": 1.720350344708903,
"learning_rate": 5.0001597958250776e-08,
"loss": 2.2315,
"step": 3040
},
{
"epoch": 3.088235294117647,
"grad_norm": 1.7724706443214726,
"learning_rate": 5.000150418884808e-08,
"loss": 2.2501,
"step": 3045
},
{
"epoch": 3.0933062880324544,
"grad_norm": 1.7924639073969963,
"learning_rate": 5.000141544865421e-08,
"loss": 2.2446,
"step": 3050
},
{
"epoch": 3.0983772819472617,
"grad_norm": 1.736852243176053,
"learning_rate": 5.000133149359102e-08,
"loss": 2.2457,
"step": 3055
},
{
"epoch": 3.103448275862069,
"grad_norm": 1.784090807966895,
"learning_rate": 5.000125209016723e-08,
"loss": 2.2521,
"step": 3060
},
{
"epoch": 3.108519269776876,
"grad_norm": 1.7552195819841987,
"learning_rate": 5.000117701507439e-08,
"loss": 2.2331,
"step": 3065
},
{
"epoch": 3.1135902636916835,
"grad_norm": 1.7588419707647238,
"learning_rate": 5.0001106054796176e-08,
"loss": 2.2465,
"step": 3070
},
{
"epoch": 3.1186612576064907,
"grad_norm": 1.731249391051153,
"learning_rate": 5.000103900523059e-08,
"loss": 2.2154,
"step": 3075
},
{
"epoch": 3.123732251521298,
"grad_norm": 1.86107961069035,
"learning_rate": 5.0000975671324725e-08,
"loss": 2.2498,
"step": 3080
},
{
"epoch": 3.1288032454361057,
"grad_norm": 1.7453958505335196,
"learning_rate": 5.000091586672176e-08,
"loss": 2.213,
"step": 3085
},
{
"epoch": 3.133874239350913,
"grad_norm": 1.739107722358469,
"learning_rate": 5.000085941341981e-08,
"loss": 2.2703,
"step": 3090
},
{
"epoch": 3.13894523326572,
"grad_norm": 1.723031377500322,
"learning_rate": 5.000080614144228e-08,
"loss": 2.256,
"step": 3095
},
{
"epoch": 3.1440162271805274,
"grad_norm": 1.7859618571141844,
"learning_rate": 5.0000755888519526e-08,
"loss": 2.2446,
"step": 3100
},
{
"epoch": 3.1490872210953347,
"grad_norm": 1.7642645902841112,
"learning_rate": 5.0000708499781274e-08,
"loss": 2.2365,
"step": 3105
},
{
"epoch": 3.154158215010142,
"grad_norm": 1.8188951223969028,
"learning_rate": 5.000066382745973e-08,
"loss": 2.2743,
"step": 3110
},
{
"epoch": 3.159229208924949,
"grad_norm": 1.8017937041457348,
"learning_rate": 5.000062173060291e-08,
"loss": 2.2501,
"step": 3115
},
{
"epoch": 3.1643002028397564,
"grad_norm": 1.7816544045204796,
"learning_rate": 5.0000582074797944e-08,
"loss": 2.2025,
"step": 3120
},
{
"epoch": 3.1693711967545637,
"grad_norm": 1.7911385432695703,
"learning_rate": 5.0000544731904076e-08,
"loss": 2.2284,
"step": 3125
},
{
"epoch": 3.1744421906693714,
"grad_norm": 1.9232399576032946,
"learning_rate": 5.000050957979507e-08,
"loss": 2.2407,
"step": 3130
},
{
"epoch": 3.1795131845841786,
"grad_norm": 1.7293397524348884,
"learning_rate": 5.000047650211071e-08,
"loss": 2.2468,
"step": 3135
},
{
"epoch": 3.184584178498986,
"grad_norm": 1.7870474846773756,
"learning_rate": 5.000044538801721e-08,
"loss": 2.2432,
"step": 3140
},
{
"epoch": 3.189655172413793,
"grad_norm": 1.7179456705770244,
"learning_rate": 5.000041613197611e-08,
"loss": 2.2478,
"step": 3145
},
{
"epoch": 3.1947261663286004,
"grad_norm": 1.782930312662543,
"learning_rate": 5.0000388633521626e-08,
"loss": 2.219,
"step": 3150
},
{
"epoch": 3.1997971602434077,
"grad_norm": 1.8396726211182168,
"learning_rate": 5.000036279704598e-08,
"loss": 2.2131,
"step": 3155
},
{
"epoch": 3.204868154158215,
"grad_norm": 1.7441223394696925,
"learning_rate": 5.000033853159261e-08,
"loss": 2.216,
"step": 3160
},
{
"epoch": 3.209939148073022,
"grad_norm": 1.79701015495686,
"learning_rate": 5.000031575065695e-08,
"loss": 2.2423,
"step": 3165
},
{
"epoch": 3.2150101419878294,
"grad_norm": 1.7824241551117812,
"learning_rate": 5.000029437199458e-08,
"loss": 2.245,
"step": 3170
},
{
"epoch": 3.220081135902637,
"grad_norm": 1.7859671284571614,
"learning_rate": 5.000027431743653e-08,
"loss": 2.2466,
"step": 3175
},
{
"epoch": 3.2251521298174444,
"grad_norm": 1.7508641392805016,
"learning_rate": 5.000025551271141e-08,
"loss": 2.2123,
"step": 3180
},
{
"epoch": 3.2302231237322516,
"grad_norm": 1.790375251718636,
"learning_rate": 5.000023788727435e-08,
"loss": 2.2387,
"step": 3185
},
{
"epoch": 3.235294117647059,
"grad_norm": 1.8347285573544698,
"learning_rate": 5.0000221374142326e-08,
"loss": 2.2024,
"step": 3190
},
{
"epoch": 3.240365111561866,
"grad_norm": 1.766020664546832,
"learning_rate": 5.0000205909735805e-08,
"loss": 2.25,
"step": 3195
},
{
"epoch": 3.2454361054766734,
"grad_norm": 1.7685652184853669,
"learning_rate": 5.000019143372644e-08,
"loss": 2.216,
"step": 3200
},
{
"epoch": 3.2454361054766734,
"eval_loss": 2.475315809249878,
"eval_runtime": 81.0728,
"eval_samples_per_second": 86.416,
"eval_steps_per_second": 0.678,
"step": 3200
},
{
"epoch": 3.2505070993914806,
"grad_norm": 1.8114020440458831,
"learning_rate": 5.000017788889067e-08,
"loss": 2.2909,
"step": 3205
},
{
"epoch": 3.255578093306288,
"grad_norm": 1.8044780174846506,
"learning_rate": 5.0000165220969006e-08,
"loss": 2.2682,
"step": 3210
},
{
"epoch": 3.260649087221095,
"grad_norm": 1.8227060747974817,
"learning_rate": 5.0000153378530776e-08,
"loss": 2.2551,
"step": 3215
},
{
"epoch": 3.2657200811359024,
"grad_norm": 1.712746112733307,
"learning_rate": 5.000014231284425e-08,
"loss": 2.2085,
"step": 3220
},
{
"epoch": 3.27079107505071,
"grad_norm": 1.7693643379563115,
"learning_rate": 5.000013197775189e-08,
"loss": 2.2089,
"step": 3225
},
{
"epoch": 3.2758620689655173,
"grad_norm": 1.742416891486272,
"learning_rate": 5.000012232955056e-08,
"loss": 2.2256,
"step": 3230
},
{
"epoch": 3.2809330628803246,
"grad_norm": 1.7588332712006007,
"learning_rate": 5.000011332687656e-08,
"loss": 2.2411,
"step": 3235
},
{
"epoch": 3.286004056795132,
"grad_norm": 1.748987632844159,
"learning_rate": 5.000010493059533e-08,
"loss": 2.2161,
"step": 3240
},
{
"epoch": 3.291075050709939,
"grad_norm": 1.7730209178260556,
"learning_rate": 5.000009710369558e-08,
"loss": 2.2454,
"step": 3245
},
{
"epoch": 3.2961460446247464,
"grad_norm": 1.7638994477476329,
"learning_rate": 5.000008981118782e-08,
"loss": 2.2762,
"step": 3250
},
{
"epoch": 3.3012170385395536,
"grad_norm": 1.8306906774843352,
"learning_rate": 5.000008302000705e-08,
"loss": 2.2484,
"step": 3255
},
{
"epoch": 3.306288032454361,
"grad_norm": 1.8155910247025784,
"learning_rate": 5.0000076698919504e-08,
"loss": 2.2172,
"step": 3260
},
{
"epoch": 3.3113590263691686,
"grad_norm": 1.9000838772092157,
"learning_rate": 5.0000070818433264e-08,
"loss": 2.2639,
"step": 3265
},
{
"epoch": 3.316430020283976,
"grad_norm": 1.8182257588876376,
"learning_rate": 5.000006535071267e-08,
"loss": 2.2302,
"step": 3270
},
{
"epoch": 3.321501014198783,
"grad_norm": 1.7421030430480422,
"learning_rate": 5.0000060269496374e-08,
"loss": 2.2618,
"step": 3275
},
{
"epoch": 3.3265720081135903,
"grad_norm": 1.7545361773998456,
"learning_rate": 5.0000055550018825e-08,
"loss": 2.2174,
"step": 3280
},
{
"epoch": 3.3316430020283976,
"grad_norm": 1.7382589137313635,
"learning_rate": 5.000005116893524e-08,
"loss": 2.2497,
"step": 3285
},
{
"epoch": 3.336713995943205,
"grad_norm": 1.7544110577796528,
"learning_rate": 5.000004710424972e-08,
"loss": 2.2386,
"step": 3290
},
{
"epoch": 3.341784989858012,
"grad_norm": 1.7756370830140873,
"learning_rate": 5.0000043335246576e-08,
"loss": 2.2124,
"step": 3295
},
{
"epoch": 3.3468559837728193,
"grad_norm": 1.7647740914276824,
"learning_rate": 5.0000039842424645e-08,
"loss": 2.2357,
"step": 3300
},
{
"epoch": 3.3519269776876266,
"grad_norm": 1.7614092517536837,
"learning_rate": 5.000003660743452e-08,
"loss": 2.2823,
"step": 3305
},
{
"epoch": 3.356997971602434,
"grad_norm": 1.7889494130903192,
"learning_rate": 5.000003361301858e-08,
"loss": 2.1835,
"step": 3310
},
{
"epoch": 3.3620689655172415,
"grad_norm": 1.7154434994558871,
"learning_rate": 5.000003084295374e-08,
"loss": 2.2724,
"step": 3315
},
{
"epoch": 3.367139959432049,
"grad_norm": 1.8155130093382392,
"learning_rate": 5.0000028281996743e-08,
"loss": 2.2823,
"step": 3320
},
{
"epoch": 3.372210953346856,
"grad_norm": 1.880078020122213,
"learning_rate": 5.0000025915832e-08,
"loss": 2.2421,
"step": 3325
},
{
"epoch": 3.3772819472616633,
"grad_norm": 1.7913171122885942,
"learning_rate": 5.000002373102181e-08,
"loss": 2.1806,
"step": 3330
},
{
"epoch": 3.3823529411764706,
"grad_norm": 1.8110141267464457,
"learning_rate": 5.000002171495887e-08,
"loss": 2.2315,
"step": 3335
},
{
"epoch": 3.387423935091278,
"grad_norm": 1.8187945379716748,
"learning_rate": 5.000001985582107e-08,
"loss": 2.2207,
"step": 3340
},
{
"epoch": 3.392494929006085,
"grad_norm": 1.7822827152937282,
"learning_rate": 5.000001814252828e-08,
"loss": 2.2411,
"step": 3345
},
{
"epoch": 3.3975659229208923,
"grad_norm": 1.7281310183638643,
"learning_rate": 5.0000016564701364e-08,
"loss": 2.2415,
"step": 3350
},
{
"epoch": 3.4026369168357,
"grad_norm": 1.7550793470914747,
"learning_rate": 5.000001511262302e-08,
"loss": 2.2464,
"step": 3355
},
{
"epoch": 3.4077079107505073,
"grad_norm": 1.7459578038518018,
"learning_rate": 5.0000013777200565e-08,
"loss": 2.2504,
"step": 3360
},
{
"epoch": 3.4127789046653145,
"grad_norm": 1.740338062654503,
"learning_rate": 5.000001254993049e-08,
"loss": 2.2292,
"step": 3365
},
{
"epoch": 3.417849898580122,
"grad_norm": 1.8005446847395141,
"learning_rate": 5.000001142286484e-08,
"loss": 2.2646,
"step": 3370
},
{
"epoch": 3.422920892494929,
"grad_norm": 1.8075984781615184,
"learning_rate": 5.000001038857911e-08,
"loss": 2.2549,
"step": 3375
},
{
"epoch": 3.4279918864097363,
"grad_norm": 1.7944612854944237,
"learning_rate": 5.000000944014192e-08,
"loss": 2.2607,
"step": 3380
},
{
"epoch": 3.4330628803245435,
"grad_norm": 1.8042996177778357,
"learning_rate": 5.000000857108604e-08,
"loss": 2.2129,
"step": 3385
},
{
"epoch": 3.438133874239351,
"grad_norm": 1.812331539187214,
"learning_rate": 5.0000007775380984e-08,
"loss": 2.247,
"step": 3390
},
{
"epoch": 3.443204868154158,
"grad_norm": 1.7634101221518121,
"learning_rate": 5.0000007047407e-08,
"loss": 2.2454,
"step": 3395
},
{
"epoch": 3.4482758620689653,
"grad_norm": 1.8137979752467785,
"learning_rate": 5.000000638193037e-08,
"loss": 2.2348,
"step": 3400
},
{
"epoch": 3.4482758620689653,
"eval_loss": 2.475677013397217,
"eval_runtime": 81.0429,
"eval_samples_per_second": 86.448,
"eval_steps_per_second": 0.679,
"step": 3400
},
{
"epoch": 3.453346855983773,
"grad_norm": 1.7639358988388496,
"learning_rate": 5.0000005774079994e-08,
"loss": 2.2434,
"step": 3405
},
{
"epoch": 3.4584178498985803,
"grad_norm": 1.8860372894717414,
"learning_rate": 5.0000005219325215e-08,
"loss": 2.2184,
"step": 3410
},
{
"epoch": 3.4634888438133875,
"grad_norm": 1.792302245525526,
"learning_rate": 5.000000471345483e-08,
"loss": 2.2405,
"step": 3415
},
{
"epoch": 3.4685598377281948,
"grad_norm": 1.7326646638681342,
"learning_rate": 5.000000425255718e-08,
"loss": 2.2582,
"step": 3420
},
{
"epoch": 3.473630831643002,
"grad_norm": 1.7944771245301363,
"learning_rate": 5.0000003833001365e-08,
"loss": 2.202,
"step": 3425
},
{
"epoch": 3.4787018255578093,
"grad_norm": 1.8158606522431084,
"learning_rate": 5.000000345141943e-08,
"loss": 2.2533,
"step": 3430
},
{
"epoch": 3.4837728194726165,
"grad_norm": 1.8541024781685664,
"learning_rate": 5.0000003104689555e-08,
"loss": 2.2387,
"step": 3435
},
{
"epoch": 3.4888438133874238,
"grad_norm": 1.7999921658917655,
"learning_rate": 5.0000002789920174e-08,
"loss": 2.2441,
"step": 3440
},
{
"epoch": 3.4939148073022315,
"grad_norm": 1.7685774604287066,
"learning_rate": 5.000000250443497e-08,
"loss": 2.3018,
"step": 3445
},
{
"epoch": 3.4989858012170387,
"grad_norm": 1.7777470112493552,
"learning_rate": 5.000000224575872e-08,
"loss": 2.2433,
"step": 3450
},
{
"epoch": 3.504056795131846,
"grad_norm": 1.7748253950125374,
"learning_rate": 5.000000201160396e-08,
"loss": 2.2782,
"step": 3455
},
{
"epoch": 3.5091277890466532,
"grad_norm": 1.7842700957790634,
"learning_rate": 5.000000179985839e-08,
"loss": 2.2659,
"step": 3460
},
{
"epoch": 3.5141987829614605,
"grad_norm": 1.798939281745875,
"learning_rate": 5.000000160857302e-08,
"loss": 2.2396,
"step": 3465
},
{
"epoch": 3.5192697768762677,
"grad_norm": 1.8045276757468276,
"learning_rate": 5.000000143595102e-08,
"loss": 2.2325,
"step": 3470
},
{
"epoch": 3.524340770791075,
"grad_norm": 1.7262031285233723,
"learning_rate": 5.0000001280337235e-08,
"loss": 2.243,
"step": 3475
},
{
"epoch": 3.5294117647058822,
"grad_norm": 1.8375261220518257,
"learning_rate": 5.000000114020828e-08,
"loss": 2.2075,
"step": 3480
},
{
"epoch": 3.5344827586206895,
"grad_norm": 1.8163152606406519,
"learning_rate": 5.0000001014163305e-08,
"loss": 2.2494,
"step": 3485
},
{
"epoch": 3.5395537525354968,
"grad_norm": 1.8525927462335219,
"learning_rate": 5.0000000900915245e-08,
"loss": 2.2163,
"step": 3490
},
{
"epoch": 3.544624746450304,
"grad_norm": 1.7805165281974848,
"learning_rate": 5.000000079928269e-08,
"loss": 2.2525,
"step": 3495
},
{
"epoch": 3.5496957403651117,
"grad_norm": 1.7990737454408499,
"learning_rate": 5.000000070818217e-08,
"loss": 2.2874,
"step": 3500
},
{
"epoch": 3.554766734279919,
"grad_norm": 1.8247781997920414,
"learning_rate": 5.000000062662102e-08,
"loss": 2.2215,
"step": 3505
},
{
"epoch": 3.559837728194726,
"grad_norm": 1.9826615858248522,
"learning_rate": 5.000000055369062e-08,
"loss": 2.2443,
"step": 3510
},
{
"epoch": 3.5649087221095335,
"grad_norm": 1.799487216606698,
"learning_rate": 5.000000048856012e-08,
"loss": 2.2266,
"step": 3515
},
{
"epoch": 3.5699797160243407,
"grad_norm": 1.8091696515518445,
"learning_rate": 5.0000000430470526e-08,
"loss": 2.2517,
"step": 3520
},
{
"epoch": 3.575050709939148,
"grad_norm": 1.7814535925288772,
"learning_rate": 5.0000000378729234e-08,
"loss": 2.2321,
"step": 3525
},
{
"epoch": 3.5801217038539552,
"grad_norm": 1.850742214981416,
"learning_rate": 5.000000033270488e-08,
"loss": 2.2597,
"step": 3530
},
{
"epoch": 3.585192697768763,
"grad_norm": 1.7822355084719033,
"learning_rate": 5.000000029182252e-08,
"loss": 2.2963,
"step": 3535
},
{
"epoch": 3.59026369168357,
"grad_norm": 1.7548584963433536,
"learning_rate": 5.0000000255559235e-08,
"loss": 2.2669,
"step": 3540
},
{
"epoch": 3.5953346855983774,
"grad_norm": 1.8526633444752874,
"learning_rate": 5.0000000223439884e-08,
"loss": 2.2367,
"step": 3545
},
{
"epoch": 3.6004056795131847,
"grad_norm": 1.8813498033155052,
"learning_rate": 5.0000000195033304e-08,
"loss": 2.2373,
"step": 3550
},
{
"epoch": 3.605476673427992,
"grad_norm": 1.7670822667081592,
"learning_rate": 5.0000000169948675e-08,
"loss": 2.2705,
"step": 3555
},
{
"epoch": 3.610547667342799,
"grad_norm": 1.7756286276528583,
"learning_rate": 5.000000014783217e-08,
"loss": 2.2979,
"step": 3560
},
{
"epoch": 3.6156186612576064,
"grad_norm": 1.7467172856710016,
"learning_rate": 5.000000012836387e-08,
"loss": 2.2538,
"step": 3565
},
{
"epoch": 3.6206896551724137,
"grad_norm": 1.7107623358426811,
"learning_rate": 5.000000011125491e-08,
"loss": 2.2807,
"step": 3570
},
{
"epoch": 3.625760649087221,
"grad_norm": 1.8431542462448438,
"learning_rate": 5.000000009624475e-08,
"loss": 2.252,
"step": 3575
},
{
"epoch": 3.630831643002028,
"grad_norm": 1.7683303237840782,
"learning_rate": 5.000000008309876e-08,
"loss": 2.2722,
"step": 3580
},
{
"epoch": 3.6359026369168355,
"grad_norm": 1.7463535795755278,
"learning_rate": 5.000000007160591e-08,
"loss": 2.2712,
"step": 3585
},
{
"epoch": 3.640973630831643,
"grad_norm": 1.8412435208194315,
"learning_rate": 5.0000000061576706e-08,
"loss": 2.2438,
"step": 3590
},
{
"epoch": 3.6460446247464504,
"grad_norm": 1.7731354966851007,
"learning_rate": 5.000000005284119e-08,
"loss": 2.2305,
"step": 3595
},
{
"epoch": 3.6511156186612577,
"grad_norm": 1.7263977118619886,
"learning_rate": 5.0000000045247174e-08,
"loss": 2.238,
"step": 3600
},
{
"epoch": 3.6511156186612577,
"eval_loss": 2.475299596786499,
"eval_runtime": 81.0503,
"eval_samples_per_second": 86.44,
"eval_steps_per_second": 0.679,
"step": 3600
},
{
"epoch": 3.656186612576065,
"grad_norm": 1.725184319305705,
"learning_rate": 5.000000003865863e-08,
"loss": 2.2283,
"step": 3605
},
{
"epoch": 3.661257606490872,
"grad_norm": 1.9023050674895976,
"learning_rate": 5.000000003295409e-08,
"loss": 2.21,
"step": 3610
},
{
"epoch": 3.6663286004056794,
"grad_norm": 1.8044353617499143,
"learning_rate": 5.0000000028025353e-08,
"loss": 2.2658,
"step": 3615
},
{
"epoch": 3.6713995943204867,
"grad_norm": 1.7560239895320502,
"learning_rate": 5.0000000023776127e-08,
"loss": 2.2558,
"step": 3620
},
{
"epoch": 3.6764705882352944,
"grad_norm": 1.9019670084185585,
"learning_rate": 5.00000000201209e-08,
"loss": 2.2154,
"step": 3625
},
{
"epoch": 3.6815415821501016,
"grad_norm": 1.835689830804529,
"learning_rate": 5.0000000016983875e-08,
"loss": 2.2586,
"step": 3630
},
{
"epoch": 3.686612576064909,
"grad_norm": 1.8589538257906977,
"learning_rate": 5.000000001429796e-08,
"loss": 2.2388,
"step": 3635
},
{
"epoch": 3.691683569979716,
"grad_norm": 1.8068715773945243,
"learning_rate": 5.000000001200391e-08,
"loss": 2.2571,
"step": 3640
},
{
"epoch": 3.6967545638945234,
"grad_norm": 1.7775448603509494,
"learning_rate": 5.0000000010049494e-08,
"loss": 2.2751,
"step": 3645
},
{
"epoch": 3.7018255578093306,
"grad_norm": 1.748064680879759,
"learning_rate": 5.0000000008388774e-08,
"loss": 2.2183,
"step": 3650
},
{
"epoch": 3.706896551724138,
"grad_norm": 1.752057568304335,
"learning_rate": 5.000000000698141e-08,
"loss": 2.2532,
"step": 3655
},
{
"epoch": 3.711967545638945,
"grad_norm": 1.7976874660325244,
"learning_rate": 5.000000000579206e-08,
"loss": 2.2447,
"step": 3660
},
{
"epoch": 3.7170385395537524,
"grad_norm": 1.8361658170177098,
"learning_rate": 5.000000000478986e-08,
"loss": 2.2274,
"step": 3665
},
{
"epoch": 3.7221095334685597,
"grad_norm": 1.7595086838837224,
"learning_rate": 5.0000000003947866e-08,
"loss": 2.2704,
"step": 3670
},
{
"epoch": 3.727180527383367,
"grad_norm": 1.7772692374868122,
"learning_rate": 5.0000000003242645e-08,
"loss": 2.2394,
"step": 3675
},
{
"epoch": 3.732251521298174,
"grad_norm": 1.7860835232171102,
"learning_rate": 5.000000000265387e-08,
"loss": 2.238,
"step": 3680
},
{
"epoch": 3.737322515212982,
"grad_norm": 1.7590689183822192,
"learning_rate": 5.000000000216394e-08,
"loss": 2.2764,
"step": 3685
},
{
"epoch": 3.742393509127789,
"grad_norm": 1.7659065260707336,
"learning_rate": 5.0000000001757664e-08,
"loss": 2.2459,
"step": 3690
},
{
"epoch": 3.7474645030425964,
"grad_norm": 1.8153822365083379,
"learning_rate": 5.0000000001421954e-08,
"loss": 2.2299,
"step": 3695
},
{
"epoch": 3.7525354969574036,
"grad_norm": 1.7704864144251407,
"learning_rate": 5.0000000001145583e-08,
"loss": 2.247,
"step": 3700
},
{
"epoch": 3.757606490872211,
"grad_norm": 1.7268180675977047,
"learning_rate": 5.000000000091894e-08,
"loss": 2.2483,
"step": 3705
},
{
"epoch": 3.762677484787018,
"grad_norm": 1.7808473093189052,
"learning_rate": 5.000000000073382e-08,
"loss": 2.2774,
"step": 3710
},
{
"epoch": 3.767748478701826,
"grad_norm": 1.7999930755140212,
"learning_rate": 5.0000000000583246e-08,
"loss": 2.2209,
"step": 3715
},
{
"epoch": 3.772819472616633,
"grad_norm": 1.7741565202241085,
"learning_rate": 5.0000000000461306e-08,
"loss": 2.2353,
"step": 3720
},
{
"epoch": 3.7778904665314403,
"grad_norm": 1.8046657930760326,
"learning_rate": 5.0000000000363e-08,
"loss": 2.2255,
"step": 3725
},
{
"epoch": 3.7829614604462476,
"grad_norm": 1.8038566902418574,
"learning_rate": 5.000000000028412e-08,
"loss": 2.2781,
"step": 3730
},
{
"epoch": 3.788032454361055,
"grad_norm": 1.7944584001789026,
"learning_rate": 5.0000000000221146e-08,
"loss": 2.272,
"step": 3735
},
{
"epoch": 3.793103448275862,
"grad_norm": 1.7491739462315397,
"learning_rate": 5.0000000000171125e-08,
"loss": 2.2293,
"step": 3740
},
{
"epoch": 3.7981744421906694,
"grad_norm": 1.7505007397811716,
"learning_rate": 5.000000000013161e-08,
"loss": 2.2373,
"step": 3745
},
{
"epoch": 3.8032454361054766,
"grad_norm": 1.8014769703402196,
"learning_rate": 5.000000000010057e-08,
"loss": 2.2552,
"step": 3750
},
{
"epoch": 3.808316430020284,
"grad_norm": 1.7608287864741985,
"learning_rate": 5.0000000000076337e-08,
"loss": 2.2277,
"step": 3755
},
{
"epoch": 3.813387423935091,
"grad_norm": 1.8323757058256038,
"learning_rate": 5.0000000000057536e-08,
"loss": 2.2341,
"step": 3760
},
{
"epoch": 3.8184584178498984,
"grad_norm": 1.7574657806555387,
"learning_rate": 5.000000000004304e-08,
"loss": 2.2223,
"step": 3765
},
{
"epoch": 3.8235294117647056,
"grad_norm": 1.7900689784727426,
"learning_rate": 5.000000000003194e-08,
"loss": 2.2445,
"step": 3770
},
{
"epoch": 3.8286004056795133,
"grad_norm": 1.7873969080046235,
"learning_rate": 5.000000000002351e-08,
"loss": 2.2692,
"step": 3775
},
{
"epoch": 3.8336713995943206,
"grad_norm": 1.7693343584107923,
"learning_rate": 5.000000000001716e-08,
"loss": 2.1982,
"step": 3780
},
{
"epoch": 3.838742393509128,
"grad_norm": 1.782049247288072,
"learning_rate": 5.00000000000124e-08,
"loss": 2.2417,
"step": 3785
},
{
"epoch": 3.843813387423935,
"grad_norm": 1.8357614780582354,
"learning_rate": 5.000000000000888e-08,
"loss": 2.2414,
"step": 3790
},
{
"epoch": 3.8488843813387423,
"grad_norm": 1.7593131764821546,
"learning_rate": 5.0000000000006284e-08,
"loss": 2.2721,
"step": 3795
},
{
"epoch": 3.8539553752535496,
"grad_norm": 1.8355045282767246,
"learning_rate": 5.0000000000004405e-08,
"loss": 2.2349,
"step": 3800
},
{
"epoch": 3.8539553752535496,
"eval_loss": 2.475205421447754,
"eval_runtime": 81.089,
"eval_samples_per_second": 86.399,
"eval_steps_per_second": 0.678,
"step": 3800
},
{
"epoch": 3.859026369168357,
"grad_norm": 1.7617334472370734,
"learning_rate": 5.000000000000305e-08,
"loss": 2.2796,
"step": 3805
},
{
"epoch": 3.8640973630831645,
"grad_norm": 1.7655616354078496,
"learning_rate": 5.000000000000208e-08,
"loss": 2.2904,
"step": 3810
},
{
"epoch": 3.869168356997972,
"grad_norm": 1.7499887502905194,
"learning_rate": 5.00000000000014e-08,
"loss": 2.2289,
"step": 3815
},
{
"epoch": 3.874239350912779,
"grad_norm": 1.7552158736441676,
"learning_rate": 5.000000000000093e-08,
"loss": 2.2524,
"step": 3820
},
{
"epoch": 3.8793103448275863,
"grad_norm": 1.779864718453615,
"learning_rate": 5.0000000000000607e-08,
"loss": 2.2557,
"step": 3825
},
{
"epoch": 3.8843813387423936,
"grad_norm": 1.8326086874257492,
"learning_rate": 5.000000000000039e-08,
"loss": 2.2642,
"step": 3830
},
{
"epoch": 3.889452332657201,
"grad_norm": 1.7709614684441606,
"learning_rate": 5.000000000000024e-08,
"loss": 2.2316,
"step": 3835
},
{
"epoch": 3.894523326572008,
"grad_norm": 1.8053802580849208,
"learning_rate": 5.000000000000015e-08,
"loss": 2.2568,
"step": 3840
},
{
"epoch": 3.8995943204868153,
"grad_norm": 1.7935470548184194,
"learning_rate": 5.0000000000000104e-08,
"loss": 2.2993,
"step": 3845
},
{
"epoch": 3.9046653144016226,
"grad_norm": 1.7497664491493299,
"learning_rate": 5.000000000000006e-08,
"loss": 2.1989,
"step": 3850
},
{
"epoch": 3.90973630831643,
"grad_norm": 1.754972418650299,
"learning_rate": 5.000000000000003e-08,
"loss": 2.2424,
"step": 3855
},
{
"epoch": 3.914807302231237,
"grad_norm": 1.7589479994346042,
"learning_rate": 5.000000000000002e-08,
"loss": 2.2632,
"step": 3860
},
{
"epoch": 3.9198782961460448,
"grad_norm": 1.7971848831669277,
"learning_rate": 5.000000000000001e-08,
"loss": 2.2336,
"step": 3865
},
{
"epoch": 3.924949290060852,
"grad_norm": 1.7639968737695348,
"learning_rate": 5.0000000000000004e-08,
"loss": 2.2296,
"step": 3870
},
{
"epoch": 3.9300202839756593,
"grad_norm": 1.72827012743299,
"learning_rate": 5e-08,
"loss": 2.2451,
"step": 3875
},
{
"epoch": 3.9350912778904665,
"grad_norm": 1.749153588059136,
"learning_rate": 5e-08,
"loss": 2.258,
"step": 3880
},
{
"epoch": 3.940162271805274,
"grad_norm": 1.753206456867822,
"learning_rate": 5e-08,
"loss": 2.2587,
"step": 3885
},
{
"epoch": 3.945233265720081,
"grad_norm": 1.7816747777928572,
"learning_rate": 5e-08,
"loss": 2.2532,
"step": 3890
},
{
"epoch": 3.9503042596348883,
"grad_norm": 1.7762615930524053,
"learning_rate": 5e-08,
"loss": 2.2331,
"step": 3895
},
{
"epoch": 3.955375253549696,
"grad_norm": 1.8039115341801395,
"learning_rate": 5e-08,
"loss": 2.2271,
"step": 3900
},
{
"epoch": 3.9604462474645032,
"grad_norm": 1.7530354888252304,
"learning_rate": 5e-08,
"loss": 2.2191,
"step": 3905
},
{
"epoch": 3.9655172413793105,
"grad_norm": 1.883699780217342,
"learning_rate": 5e-08,
"loss": 2.2059,
"step": 3910
},
{
"epoch": 3.9705882352941178,
"grad_norm": 1.7246634345468168,
"learning_rate": 5e-08,
"loss": 2.2482,
"step": 3915
},
{
"epoch": 3.975659229208925,
"grad_norm": 1.762677648630269,
"learning_rate": 5e-08,
"loss": 2.2521,
"step": 3920
},
{
"epoch": 3.9807302231237323,
"grad_norm": 1.786354894638501,
"learning_rate": 5e-08,
"loss": 2.2763,
"step": 3925
},
{
"epoch": 3.9858012170385395,
"grad_norm": 1.81100838850099,
"learning_rate": 5e-08,
"loss": 2.2326,
"step": 3930
},
{
"epoch": 3.9908722109533468,
"grad_norm": 1.8115971845880692,
"learning_rate": 5e-08,
"loss": 2.2409,
"step": 3935
},
{
"epoch": 3.995943204868154,
"grad_norm": 1.901268059775357,
"learning_rate": 5e-08,
"loss": 2.2217,
"step": 3940
},
{
"epoch": 4.0,
"step": 3944,
"total_flos": 411954472550400.0,
"train_loss": 2.318100369605283,
"train_runtime": 14372.236,
"train_samples_per_second": 17.546,
"train_steps_per_second": 0.274
}
],
"logging_steps": 5,
"max_steps": 3944,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 200,
"total_flos": 411954472550400.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}