{
"best_metric": 2.323676109313965,
"best_model_checkpoint": "./output/training_results/C019_random_sample_llama3-8b-base_pretrain_20240504_182259/checkpoint-1000",
"epoch": 4.0,
"eval_steps": 200,
"global_step": 4160,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009615384615384616,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 2.5996,
"step": 1
},
{
"epoch": 0.004807692307692308,
"grad_norm": 3.093270098005958,
"learning_rate": 2.25e-06,
"loss": 2.5704,
"step": 5
},
{
"epoch": 0.009615384615384616,
"grad_norm": 2.3983439225151337,
"learning_rate": 6e-06,
"loss": 2.598,
"step": 10
},
{
"epoch": 0.014423076923076924,
"grad_norm": 2.365104415775466,
"learning_rate": 9.75e-06,
"loss": 2.5213,
"step": 15
},
{
"epoch": 0.019230769230769232,
"grad_norm": 2.377061508613044,
"learning_rate": 1.3500000000000001e-05,
"loss": 2.5413,
"step": 20
},
{
"epoch": 0.02403846153846154,
"grad_norm": 2.7238687593360633,
"learning_rate": 1.488126415936146e-05,
"loss": 2.4619,
"step": 25
},
{
"epoch": 0.028846153846153848,
"grad_norm": 2.1821698028288496,
"learning_rate": 1.468527480858081e-05,
"loss": 2.4796,
"step": 30
},
{
"epoch": 0.03365384615384615,
"grad_norm": 2.209060379147765,
"learning_rate": 1.4491642768162611e-05,
"loss": 2.4632,
"step": 35
},
{
"epoch": 0.038461538461538464,
"grad_norm": 2.1033623949557465,
"learning_rate": 1.4376584414398205e-05,
"loss": 2.4363,
"step": 40
},
{
"epoch": 0.04326923076923077,
"grad_norm": 2.232481096526571,
"learning_rate": 1.4186671032101571e-05,
"loss": 2.4888,
"step": 45
},
{
"epoch": 0.04807692307692308,
"grad_norm": 2.1509113321913413,
"learning_rate": 1.3999049045545275e-05,
"loss": 2.4947,
"step": 50
},
{
"epoch": 0.052884615384615384,
"grad_norm": 2.35512436324606,
"learning_rate": 1.3813693542528815e-05,
"loss": 2.4788,
"step": 55
},
{
"epoch": 0.057692307692307696,
"grad_norm": 2.0401062809167683,
"learning_rate": 1.3630579851896082e-05,
"loss": 2.4441,
"step": 60
},
{
"epoch": 0.0625,
"grad_norm": 2.0096811058967425,
"learning_rate": 1.3449683541492259e-05,
"loss": 2.4552,
"step": 65
},
{
"epoch": 0.0673076923076923,
"grad_norm": 2.258689794653528,
"learning_rate": 1.3270980416135356e-05,
"loss": 2.48,
"step": 70
},
{
"epoch": 0.07211538461538461,
"grad_norm": 2.020330092733293,
"learning_rate": 1.3094446515602676e-05,
"loss": 2.4756,
"step": 75
},
{
"epoch": 0.07692307692307693,
"grad_norm": 2.062564685463297,
"learning_rate": 1.2920058112631874e-05,
"loss": 2.4676,
"step": 80
},
{
"epoch": 0.08173076923076923,
"grad_norm": 2.0801794381372196,
"learning_rate": 1.2747791710936666e-05,
"loss": 2.5349,
"step": 85
},
{
"epoch": 0.08653846153846154,
"grad_norm": 3.522036550275993,
"learning_rate": 1.2577624043237019e-05,
"loss": 2.4357,
"step": 90
},
{
"epoch": 0.09134615384615384,
"grad_norm": 2.096385210617988,
"learning_rate": 1.240953206930375e-05,
"loss": 2.4441,
"step": 95
},
{
"epoch": 0.09615384615384616,
"grad_norm": 2.0071639436136737,
"learning_rate": 1.2243492974017472e-05,
"loss": 2.4663,
"step": 100
},
{
"epoch": 0.10096153846153846,
"grad_norm": 2.1419668864903794,
"learning_rate": 1.2079484165441774e-05,
"loss": 2.5266,
"step": 105
},
{
"epoch": 0.10576923076923077,
"grad_norm": 1.853996222690424,
"learning_rate": 1.1917483272910544e-05,
"loss": 2.4803,
"step": 110
},
{
"epoch": 0.11057692307692307,
"grad_norm": 1.8741352536661482,
"learning_rate": 1.1757468145129383e-05,
"loss": 2.4532,
"step": 115
},
{
"epoch": 0.11538461538461539,
"grad_norm": 2.5986583647330344,
"learning_rate": 1.1599416848290976e-05,
"loss": 2.4519,
"step": 120
},
{
"epoch": 0.1201923076923077,
"grad_norm": 1.960401134525488,
"learning_rate": 1.1443307664204364e-05,
"loss": 2.4225,
"step": 125
},
{
"epoch": 0.125,
"grad_norm": 2.000854689144336,
"learning_rate": 1.1289119088438038e-05,
"loss": 2.4376,
"step": 130
},
{
"epoch": 0.12980769230769232,
"grad_norm": 2.0163596039348373,
"learning_rate": 1.1136829828476745e-05,
"loss": 2.4494,
"step": 135
},
{
"epoch": 0.1346153846153846,
"grad_norm": 2.000675810989018,
"learning_rate": 1.0986418801891934e-05,
"loss": 2.462,
"step": 140
},
{
"epoch": 0.13942307692307693,
"grad_norm": 2.0014951060919746,
"learning_rate": 1.0837865134525763e-05,
"loss": 2.4331,
"step": 145
},
{
"epoch": 0.14423076923076922,
"grad_norm": 1.9032594688995426,
"learning_rate": 1.069114815868857e-05,
"loss": 2.443,
"step": 150
},
{
"epoch": 0.14903846153846154,
"grad_norm": 2.344078595183246,
"learning_rate": 1.0546247411369744e-05,
"loss": 2.3993,
"step": 155
},
{
"epoch": 0.15384615384615385,
"grad_norm": 2.261655660998884,
"learning_rate": 1.0403142632461892e-05,
"loss": 2.427,
"step": 160
},
{
"epoch": 0.15865384615384615,
"grad_norm": 1.9697690775283647,
"learning_rate": 1.0261813762998242e-05,
"loss": 2.3969,
"step": 165
},
{
"epoch": 0.16346153846153846,
"grad_norm": 1.9785704107813238,
"learning_rate": 1.0122240943403124e-05,
"loss": 2.4541,
"step": 170
},
{
"epoch": 0.16826923076923078,
"grad_norm": 1.8261246917010026,
"learning_rate": 9.984404511755643e-06,
"loss": 2.4736,
"step": 175
},
{
"epoch": 0.17307692307692307,
"grad_norm": 1.99665744273795,
"learning_rate": 9.848285002066194e-06,
"loss": 2.353,
"step": 180
},
{
"epoch": 0.1778846153846154,
"grad_norm": 1.8159030807907148,
"learning_rate": 9.71386314256594e-06,
"loss": 2.4447,
"step": 185
},
{
"epoch": 0.18269230769230768,
"grad_norm": 1.9924841032422067,
"learning_rate": 9.581119854009096e-06,
"loss": 2.3577,
"step": 190
},
{
"epoch": 0.1875,
"grad_norm": 1.8364970229914088,
"learning_rate": 9.45003624798795e-06,
"loss": 2.4096,
"step": 195
},
{
"epoch": 0.19230769230769232,
"grad_norm": 1.9566999587123155,
"learning_rate": 9.320593625260526e-06,
"loss": 2.3809,
"step": 200
},
{
"epoch": 0.19230769230769232,
"eval_loss": 2.4206786155700684,
"eval_runtime": 85.4007,
"eval_samples_per_second": 86.592,
"eval_steps_per_second": 0.679,
"step": 200
},
{
"epoch": 0.1971153846153846,
"grad_norm": 1.958978215443068,
"learning_rate": 9.192773474090845e-06,
"loss": 2.3997,
"step": 205
},
{
"epoch": 0.20192307692307693,
"grad_norm": 1.999117184727505,
"learning_rate": 9.066557468601675e-06,
"loss": 2.3995,
"step": 210
},
{
"epoch": 0.20673076923076922,
"grad_norm": 2.0120971325180634,
"learning_rate": 8.966727451760845e-06,
"loss": 2.3394,
"step": 215
},
{
"epoch": 0.21153846153846154,
"grad_norm": 1.8965405647532796,
"learning_rate": 8.843353314292577e-06,
"loss": 2.4373,
"step": 220
},
{
"epoch": 0.21634615384615385,
"grad_norm": 1.793020827788288,
"learning_rate": 8.721532984948616e-06,
"loss": 2.4004,
"step": 225
},
{
"epoch": 0.22115384615384615,
"grad_norm": 1.8928727830060093,
"learning_rate": 8.601248829310043e-06,
"loss": 2.4425,
"step": 230
},
{
"epoch": 0.22596153846153846,
"grad_norm": 1.8359177916301768,
"learning_rate": 8.482483391081384e-06,
"loss": 2.4048,
"step": 235
},
{
"epoch": 0.23076923076923078,
"grad_norm": 1.771634179795241,
"learning_rate": 8.365219390514311e-06,
"loss": 2.3701,
"step": 240
},
{
"epoch": 0.23557692307692307,
"grad_norm": 2.2382487479171966,
"learning_rate": 8.249439722843319e-06,
"loss": 2.3873,
"step": 245
},
{
"epoch": 0.2403846153846154,
"grad_norm": 1.825838956406169,
"learning_rate": 8.135127456733292e-06,
"loss": 2.4484,
"step": 250
},
{
"epoch": 0.24519230769230768,
"grad_norm": 1.779047182560338,
"learning_rate": 8.022265832738892e-06,
"loss": 2.4533,
"step": 255
},
{
"epoch": 0.25,
"grad_norm": 1.8121397814224398,
"learning_rate": 7.9108382617757e-06,
"loss": 2.4032,
"step": 260
},
{
"epoch": 0.2548076923076923,
"grad_norm": 1.7304835073136142,
"learning_rate": 7.800828323603008e-06,
"loss": 2.3965,
"step": 265
},
{
"epoch": 0.25961538461538464,
"grad_norm": 1.9948337899573474,
"learning_rate": 7.692219765318242e-06,
"loss": 2.4174,
"step": 270
},
{
"epoch": 0.2644230769230769,
"grad_norm": 2.498650132767716,
"learning_rate": 7.584996499862861e-06,
"loss": 2.39,
"step": 275
},
{
"epoch": 0.2692307692307692,
"grad_norm": 1.9036689673638798,
"learning_rate": 7.479142604539756e-06,
"loss": 2.3903,
"step": 280
},
{
"epoch": 0.27403846153846156,
"grad_norm": 1.9727971553625547,
"learning_rate": 7.374642319541976e-06,
"loss": 2.352,
"step": 285
},
{
"epoch": 0.27884615384615385,
"grad_norm": 1.7682776753325222,
"learning_rate": 7.271480046492797e-06,
"loss": 2.3595,
"step": 290
},
{
"epoch": 0.28365384615384615,
"grad_norm": 2.466547945028361,
"learning_rate": 7.1696403469970005e-06,
"loss": 2.4387,
"step": 295
},
{
"epoch": 0.28846153846153844,
"grad_norm": 1.7588363798238758,
"learning_rate": 7.0691079412032825e-06,
"loss": 2.4327,
"step": 300
},
{
"epoch": 0.2932692307692308,
"grad_norm": 1.8462300982749367,
"learning_rate": 6.969867706377832e-06,
"loss": 2.4041,
"step": 305
},
{
"epoch": 0.2980769230769231,
"grad_norm": 2.0032200252529098,
"learning_rate": 6.87190467548884e-06,
"loss": 2.4022,
"step": 310
},
{
"epoch": 0.30288461538461536,
"grad_norm": 2.0051781024154383,
"learning_rate": 6.775204035801989e-06,
"loss": 2.3978,
"step": 315
},
{
"epoch": 0.3076923076923077,
"grad_norm": 1.7525097649477925,
"learning_rate": 6.679751127486818e-06,
"loss": 2.3874,
"step": 320
},
{
"epoch": 0.3125,
"grad_norm": 1.8163864310732767,
"learning_rate": 6.585531442233879e-06,
"loss": 2.3982,
"step": 325
},
{
"epoch": 0.3173076923076923,
"grad_norm": 1.8911617099161901,
"learning_rate": 6.492530621882634e-06,
"loss": 2.3816,
"step": 330
},
{
"epoch": 0.32211538461538464,
"grad_norm": 1.8956241442821822,
"learning_rate": 6.400734457060024e-06,
"loss": 2.3557,
"step": 335
},
{
"epoch": 0.3269230769230769,
"grad_norm": 1.8585394840952694,
"learning_rate": 6.310128885829607e-06,
"loss": 2.4309,
"step": 340
},
{
"epoch": 0.3317307692307692,
"grad_norm": 1.8977154535780991,
"learning_rate": 6.220699992351257e-06,
"loss": 2.4039,
"step": 345
},
{
"epoch": 0.33653846153846156,
"grad_norm": 1.803139553519876,
"learning_rate": 6.132434005551287e-06,
"loss": 2.4042,
"step": 350
},
{
"epoch": 0.34134615384615385,
"grad_norm": 1.757715074609487,
"learning_rate": 6.045317297802985e-06,
"loss": 2.3759,
"step": 355
},
{
"epoch": 0.34615384615384615,
"grad_norm": 1.8026638689606764,
"learning_rate": 5.95933638361746e-06,
"loss": 2.4149,
"step": 360
},
{
"epoch": 0.35096153846153844,
"grad_norm": 1.7463547692619898,
"learning_rate": 5.874477918344749e-06,
"loss": 2.3951,
"step": 365
},
{
"epoch": 0.3557692307692308,
"grad_norm": 1.869103918883084,
"learning_rate": 5.7907286968851065e-06,
"loss": 2.3785,
"step": 370
},
{
"epoch": 0.3605769230769231,
"grad_norm": 1.8694975836317,
"learning_rate": 5.708075652410414e-06,
"loss": 2.4295,
"step": 375
},
{
"epoch": 0.36538461538461536,
"grad_norm": 1.9186264569383331,
"learning_rate": 5.626505855095647e-06,
"loss": 2.4053,
"step": 380
},
{
"epoch": 0.3701923076923077,
"grad_norm": 1.8627599571104616,
"learning_rate": 5.546006510860341e-06,
"loss": 2.3935,
"step": 385
},
{
"epoch": 0.375,
"grad_norm": 1.7601694633490985,
"learning_rate": 5.466564960119934e-06,
"loss": 2.3533,
"step": 390
},
{
"epoch": 0.3798076923076923,
"grad_norm": 1.6940078427675656,
"learning_rate": 5.388168676547046e-06,
"loss": 2.3602,
"step": 395
},
{
"epoch": 0.38461538461538464,
"grad_norm": 2.3248960946347155,
"learning_rate": 5.31080526584248e-06,
"loss": 2.3057,
"step": 400
},
{
"epoch": 0.38461538461538464,
"eval_loss": 2.3750226497650146,
"eval_runtime": 85.4352,
"eval_samples_per_second": 86.557,
"eval_steps_per_second": 0.679,
"step": 400
},
{
"epoch": 0.3894230769230769,
"grad_norm": 1.7637614396329135,
"learning_rate": 5.234462464515984e-06,
"loss": 2.3852,
"step": 405
},
{
"epoch": 0.3942307692307692,
"grad_norm": 1.8306112577514888,
"learning_rate": 5.159128138676664e-06,
"loss": 2.3683,
"step": 410
},
{
"epoch": 0.39903846153846156,
"grad_norm": 1.88396403239199,
"learning_rate": 5.0847902828330104e-06,
"loss": 2.3303,
"step": 415
},
{
"epoch": 0.40384615384615385,
"grad_norm": 1.9387815046466974,
"learning_rate": 5.011437018702448e-06,
"loss": 2.3596,
"step": 420
},
{
"epoch": 0.40865384615384615,
"grad_norm": 1.797535293599832,
"learning_rate": 4.939056594030363e-06,
"loss": 2.3807,
"step": 425
},
{
"epoch": 0.41346153846153844,
"grad_norm": 1.7674969210476854,
"learning_rate": 4.867637381418548e-06,
"loss": 2.4203,
"step": 430
},
{
"epoch": 0.4182692307692308,
"grad_norm": 1.7330827184520308,
"learning_rate": 4.797167877162977e-06,
"loss": 2.4145,
"step": 435
},
{
"epoch": 0.4230769230769231,
"grad_norm": 1.7505951142772842,
"learning_rate": 4.72763670010088e-06,
"loss": 2.3664,
"step": 440
},
{
"epoch": 0.42788461538461536,
"grad_norm": 1.7277179266718043,
"learning_rate": 4.6590325904670434e-06,
"loss": 2.3618,
"step": 445
},
{
"epoch": 0.4326923076923077,
"grad_norm": 1.824045183697345,
"learning_rate": 4.5913444087592555e-06,
"loss": 2.3677,
"step": 450
},
{
"epoch": 0.4375,
"grad_norm": 2.541872533331478,
"learning_rate": 4.524561134612869e-06,
"loss": 2.3953,
"step": 455
},
{
"epoch": 0.4423076923076923,
"grad_norm": 1.8053852132874109,
"learning_rate": 4.4586718656843925e-06,
"loss": 2.4119,
"step": 460
},
{
"epoch": 0.44711538461538464,
"grad_norm": 1.6878117932040484,
"learning_rate": 4.39366581654407e-06,
"loss": 2.3864,
"step": 465
},
{
"epoch": 0.4519230769230769,
"grad_norm": 1.8260105801902033,
"learning_rate": 4.329532317577373e-06,
"loss": 2.387,
"step": 470
},
{
"epoch": 0.4567307692307692,
"grad_norm": 1.8118051823045696,
"learning_rate": 4.26626081389535e-06,
"loss": 2.4271,
"step": 475
},
{
"epoch": 0.46153846153846156,
"grad_norm": 2.3122157740257157,
"learning_rate": 4.2038408642537815e-06,
"loss": 2.3746,
"step": 480
},
{
"epoch": 0.46634615384615385,
"grad_norm": 2.0895941468983126,
"learning_rate": 4.142262139981073e-06,
"loss": 2.3491,
"step": 485
},
{
"epoch": 0.47115384615384615,
"grad_norm": 1.8059979746514452,
"learning_rate": 4.0815144239148194e-06,
"loss": 2.3499,
"step": 490
},
{
"epoch": 0.47596153846153844,
"grad_norm": 1.886181072515567,
"learning_rate": 4.0215876093470125e-06,
"loss": 2.3631,
"step": 495
},
{
"epoch": 0.4807692307692308,
"grad_norm": 1.8494449235344264,
"learning_rate": 3.962471698977794e-06,
"loss": 2.3689,
"step": 500
},
{
"epoch": 0.4855769230769231,
"grad_norm": 1.7530451717430282,
"learning_rate": 3.904156803877704e-06,
"loss": 2.3126,
"step": 505
},
{
"epoch": 0.49038461538461536,
"grad_norm": 1.7478042759208887,
"learning_rate": 3.846633142458427e-06,
"loss": 2.3706,
"step": 510
},
{
"epoch": 0.4951923076923077,
"grad_norm": 1.7582686186315075,
"learning_rate": 3.7898910394518715e-06,
"loss": 2.3913,
"step": 515
},
{
"epoch": 0.5,
"grad_norm": 1.719027129765464,
"learning_rate": 3.7339209248976165e-06,
"loss": 2.3352,
"step": 520
},
{
"epoch": 0.5048076923076923,
"grad_norm": 1.7460100588180303,
"learning_rate": 3.678713333138621e-06,
"loss": 2.3206,
"step": 525
},
{
"epoch": 0.5096153846153846,
"grad_norm": 1.82603479631214,
"learning_rate": 3.6242589018251656e-06,
"loss": 2.328,
"step": 530
},
{
"epoch": 0.5144230769230769,
"grad_norm": 2.909265992463998,
"learning_rate": 3.570548370926946e-06,
"loss": 2.3763,
"step": 535
},
{
"epoch": 0.5192307692307693,
"grad_norm": 1.8988240634311662,
"learning_rate": 3.5175725817532863e-06,
"loss": 2.3422,
"step": 540
},
{
"epoch": 0.5240384615384616,
"grad_norm": 1.8816807225199998,
"learning_rate": 3.4653224759813952e-06,
"loss": 2.31,
"step": 545
},
{
"epoch": 0.5288461538461539,
"grad_norm": 1.7734887040078462,
"learning_rate": 3.413789094692631e-06,
"loss": 2.3708,
"step": 550
},
{
"epoch": 0.5336538461538461,
"grad_norm": 14.829267205139884,
"learning_rate": 3.362963577416697e-06,
"loss": 2.353,
"step": 555
},
{
"epoch": 0.5384615384615384,
"grad_norm": 1.767298642358234,
"learning_rate": 3.312837161183736e-06,
"loss": 2.3772,
"step": 560
},
{
"epoch": 0.5432692307692307,
"grad_norm": 2.0381765168658714,
"learning_rate": 3.2634011795842525e-06,
"loss": 2.3277,
"step": 565
},
{
"epoch": 0.5480769230769231,
"grad_norm": 1.687367468245635,
"learning_rate": 3.2146470618368156e-06,
"loss": 2.3702,
"step": 570
},
{
"epoch": 0.5528846153846154,
"grad_norm": 1.7200567763349082,
"learning_rate": 3.1665663318634906e-06,
"loss": 2.2972,
"step": 575
},
{
"epoch": 0.5576923076923077,
"grad_norm": 1.7213863859635832,
"learning_rate": 3.119150607372941e-06,
"loss": 2.3279,
"step": 580
},
{
"epoch": 0.5625,
"grad_norm": 1.7895318194941465,
"learning_rate": 3.0723915989511547e-06,
"loss": 2.3264,
"step": 585
},
{
"epoch": 0.5673076923076923,
"grad_norm": 1.6926941348086333,
"learning_rate": 3.035451716037107e-06,
"loss": 2.4078,
"step": 590
},
{
"epoch": 0.5721153846153846,
"grad_norm": 1.835513287932842,
"learning_rate": 2.9898542002308595e-06,
"loss": 2.3339,
"step": 595
},
{
"epoch": 0.5769230769230769,
"grad_norm": 1.7870911584404572,
"learning_rate": 2.944890676594853e-06,
"loss": 2.35,
"step": 600
},
{
"epoch": 0.5769230769230769,
"eval_loss": 2.3476545810699463,
"eval_runtime": 85.4325,
"eval_samples_per_second": 86.56,
"eval_steps_per_second": 0.679,
"step": 600
},
{
"epoch": 0.5817307692307693,
"grad_norm": 1.7960612955748432,
"learning_rate": 2.900553200489045e-06,
"loss": 2.379,
"step": 605
},
{
"epoch": 0.5865384615384616,
"grad_norm": 2.662329393803985,
"learning_rate": 2.8568339158905825e-06,
"loss": 2.3121,
"step": 610
},
{
"epoch": 0.5913461538461539,
"grad_norm": 1.751319402693243,
"learning_rate": 2.8137250545276917e-06,
"loss": 2.3453,
"step": 615
},
{
"epoch": 0.5961538461538461,
"grad_norm": 2.2858590472007325,
"learning_rate": 2.77121893502082e-06,
"loss": 2.3469,
"step": 620
},
{
"epoch": 0.6009615384615384,
"grad_norm": 1.8051336435298304,
"learning_rate": 2.729307962031005e-06,
"loss": 2.3764,
"step": 625
},
{
"epoch": 0.6057692307692307,
"grad_norm": 1.7204864022940245,
"learning_rate": 2.6879846254154052e-06,
"loss": 2.3047,
"step": 630
},
{
"epoch": 0.6105769230769231,
"grad_norm": 1.6529012434786867,
"learning_rate": 2.647241499389928e-06,
"loss": 2.3594,
"step": 635
},
{
"epoch": 0.6153846153846154,
"grad_norm": 1.732240061787434,
"learning_rate": 2.607071241698958e-06,
"loss": 2.3265,
"step": 640
},
{
"epoch": 0.6201923076923077,
"grad_norm": 1.7491108722836675,
"learning_rate": 2.567466592792067e-06,
"loss": 2.3546,
"step": 645
},
{
"epoch": 0.625,
"grad_norm": 1.8515026129037757,
"learning_rate": 2.5284203750077018e-06,
"loss": 2.3665,
"step": 650
},
{
"epoch": 0.6298076923076923,
"grad_norm": 1.9236177470936695,
"learning_rate": 2.4899254917637856e-06,
"loss": 2.3532,
"step": 655
},
{
"epoch": 0.6346153846153846,
"grad_norm": 1.7377562070977945,
"learning_rate": 2.4519749267551924e-06,
"loss": 2.3056,
"step": 660
},
{
"epoch": 0.6394230769230769,
"grad_norm": 1.8604329624496534,
"learning_rate": 2.414561743158029e-06,
"loss": 2.4127,
"step": 665
},
{
"epoch": 0.6442307692307693,
"grad_norm": 1.7518401108851098,
"learning_rate": 2.3776790828406987e-06,
"loss": 2.3923,
"step": 670
},
{
"epoch": 0.6490384615384616,
"grad_norm": 1.931606951701668,
"learning_rate": 2.341320165581676e-06,
"loss": 2.3243,
"step": 675
},
{
"epoch": 0.6538461538461539,
"grad_norm": 1.812856790111344,
"learning_rate": 2.3054782882939655e-06,
"loss": 2.3149,
"step": 680
},
{
"epoch": 0.6586538461538461,
"grad_norm": 1.7938076588828502,
"learning_rate": 2.2701468242561784e-06,
"loss": 2.3098,
"step": 685
},
{
"epoch": 0.6634615384615384,
"grad_norm": 1.6875935166811342,
"learning_rate": 2.2353192223501965e-06,
"loss": 2.3627,
"step": 690
},
{
"epoch": 0.6682692307692307,
"grad_norm": 1.7370129856938976,
"learning_rate": 2.2009890063053612e-06,
"loss": 2.3905,
"step": 695
},
{
"epoch": 0.6730769230769231,
"grad_norm": 1.786880089249507,
"learning_rate": 2.167149773949154e-06,
"loss": 2.3904,
"step": 700
},
{
"epoch": 0.6778846153846154,
"grad_norm": 1.766140826477351,
"learning_rate": 2.133795196464315e-06,
"loss": 2.3069,
"step": 705
},
{
"epoch": 0.6826923076923077,
"grad_norm": 1.73381149404956,
"learning_rate": 2.100919017652352e-06,
"loss": 2.3367,
"step": 710
},
{
"epoch": 0.6875,
"grad_norm": 1.6802393388684402,
"learning_rate": 2.0685150532033913e-06,
"loss": 2.3349,
"step": 715
},
{
"epoch": 0.6923076923076923,
"grad_norm": 1.719597560705125,
"learning_rate": 2.036577189972352e-06,
"loss": 2.347,
"step": 720
},
{
"epoch": 0.6971153846153846,
"grad_norm": 1.7179306585516882,
"learning_rate": 2.005099385261351e-06,
"loss": 2.2808,
"step": 725
},
{
"epoch": 0.7019230769230769,
"grad_norm": 1.693677430438375,
"learning_rate": 1.9740756661083308e-06,
"loss": 2.3601,
"step": 730
},
{
"epoch": 0.7067307692307693,
"grad_norm": 1.7284703551106673,
"learning_rate": 1.9435001285818512e-06,
"loss": 2.3698,
"step": 735
},
{
"epoch": 0.7115384615384616,
"grad_norm": 1.7201691395467102,
"learning_rate": 1.913366937082008e-06,
"loss": 2.3383,
"step": 740
},
{
"epoch": 0.7163461538461539,
"grad_norm": 1.8376437399845924,
"learning_rate": 1.883670323647419e-06,
"loss": 2.3575,
"step": 745
},
{
"epoch": 0.7211538461538461,
"grad_norm": 1.7519138621360655,
"learning_rate": 1.8544045872682494e-06,
"loss": 2.4116,
"step": 750
},
{
"epoch": 0.7259615384615384,
"grad_norm": 1.6767007868001402,
"learning_rate": 1.8255640932052287e-06,
"loss": 2.3197,
"step": 755
},
{
"epoch": 0.7307692307692307,
"grad_norm": 1.8411908944181066,
"learning_rate": 1.7971432723146058e-06,
"loss": 2.3908,
"step": 760
},
{
"epoch": 0.7355769230769231,
"grad_norm": 1.7508438925830225,
"learning_rate": 1.769136620379013e-06,
"loss": 2.3188,
"step": 765
},
{
"epoch": 0.7403846153846154,
"grad_norm": 1.7436172155395409,
"learning_rate": 1.7415386974441854e-06,
"loss": 2.321,
"step": 770
},
{
"epoch": 0.7451923076923077,
"grad_norm": 1.8045595856913115,
"learning_rate": 1.7143441271614997e-06,
"loss": 2.3454,
"step": 775
},
{
"epoch": 0.75,
"grad_norm": 1.763756591492577,
"learning_rate": 1.687547596136285e-06,
"loss": 2.3234,
"step": 780
},
{
"epoch": 0.7548076923076923,
"grad_norm": 1.7186205772688097,
"learning_rate": 1.661143853281865e-06,
"loss": 2.2885,
"step": 785
},
{
"epoch": 0.7596153846153846,
"grad_norm": 1.7694258113773655,
"learning_rate": 1.6351277091792915e-06,
"loss": 2.3391,
"step": 790
},
{
"epoch": 0.7644230769230769,
"grad_norm": 1.725458209313717,
"learning_rate": 1.6094940354427228e-06,
"loss": 2.3098,
"step": 795
},
{
"epoch": 0.7692307692307693,
"grad_norm": 34.858863328576724,
"learning_rate": 1.5842377640904125e-06,
"loss": 2.3291,
"step": 800
},
{
"epoch": 0.7692307692307693,
"eval_loss": 2.3324432373046875,
"eval_runtime": 85.489,
"eval_samples_per_second": 86.502,
"eval_steps_per_second": 0.678,
"step": 800
},
{
"epoch": 0.7740384615384616,
"grad_norm": 1.7300557356264337,
"learning_rate": 1.5593538869212577e-06,
"loss": 2.3633,
"step": 805
},
{
"epoch": 0.7788461538461539,
"grad_norm": 1.6677853311569053,
"learning_rate": 1.5348374548968758e-06,
"loss": 2.31,
"step": 810
},
{
"epoch": 0.7836538461538461,
"grad_norm": 1.6959216377511328,
"learning_rate": 1.5106835775291604e-06,
"loss": 2.3239,
"step": 815
},
{
"epoch": 0.7884615384615384,
"grad_norm": 1.703559225147181,
"learning_rate": 1.4868874222732831e-06,
"loss": 2.324,
"step": 820
},
{
"epoch": 0.7932692307692307,
"grad_norm": 1.7178542423600203,
"learning_rate": 1.4634442139260933e-06,
"loss": 2.342,
"step": 825
},
{
"epoch": 0.7980769230769231,
"grad_norm": 1.6873420836748758,
"learning_rate": 1.440349234029883e-06,
"loss": 2.3434,
"step": 830
},
{
"epoch": 0.8028846153846154,
"grad_norm": 1.742480497378871,
"learning_rate": 1.417597820281471e-06,
"loss": 2.3966,
"step": 835
},
{
"epoch": 0.8076923076923077,
"grad_norm": 1.6566648049272492,
"learning_rate": 1.3951853659465747e-06,
"loss": 2.3217,
"step": 840
},
{
"epoch": 0.8125,
"grad_norm": 1.78249147233943,
"learning_rate": 1.3731073192794095e-06,
"loss": 2.3719,
"step": 845
},
{
"epoch": 0.8173076923076923,
"grad_norm": 1.8035253977271137,
"learning_rate": 1.3513591829475174e-06,
"loss": 2.317,
"step": 850
},
{
"epoch": 0.8221153846153846,
"grad_norm": 2.035309467875598,
"learning_rate": 1.3299365134617373e-06,
"loss": 2.313,
"step": 855
},
{
"epoch": 0.8269230769230769,
"grad_norm": 1.7174745299655327,
"learning_rate": 1.3088349206113118e-06,
"loss": 2.3239,
"step": 860
},
{
"epoch": 0.8317307692307693,
"grad_norm": 1.7333933814361635,
"learning_rate": 1.2880500669040793e-06,
"loss": 2.3025,
"step": 865
},
{
"epoch": 0.8365384615384616,
"grad_norm": 1.7754019490280168,
"learning_rate": 1.2675776670117165e-06,
"loss": 2.2899,
"step": 870
},
{
"epoch": 0.8413461538461539,
"grad_norm": 1.773766560162585,
"learning_rate": 1.2474134872199916e-06,
"loss": 2.3348,
"step": 875
},
{
"epoch": 0.8461538461538461,
"grad_norm": 1.6780258578572016,
"learning_rate": 1.2275533448839897e-06,
"loss": 2.3305,
"step": 880
},
{
"epoch": 0.8509615384615384,
"grad_norm": 1.733329835045473,
"learning_rate": 1.2079931078882769e-06,
"loss": 2.3059,
"step": 885
},
{
"epoch": 0.8557692307692307,
"grad_norm": 1.688022550790151,
"learning_rate": 1.1887286941119609e-06,
"loss": 2.2872,
"step": 890
},
{
"epoch": 0.8605769230769231,
"grad_norm": 1.7172166393971702,
"learning_rate": 1.1697560708986142e-06,
"loss": 2.3042,
"step": 895
},
{
"epoch": 0.8653846153846154,
"grad_norm": 1.6641411293848463,
"learning_rate": 1.1510712545310206e-06,
"loss": 2.2959,
"step": 900
},
{
"epoch": 0.8701923076923077,
"grad_norm": 1.7296381589810081,
"learning_rate": 1.1326703097107125e-06,
"loss": 2.339,
"step": 905
},
{
"epoch": 0.875,
"grad_norm": 1.6487202037599287,
"learning_rate": 1.1145493490422558e-06,
"loss": 2.309,
"step": 910
},
{
"epoch": 0.8798076923076923,
"grad_norm": 2.181232627254535,
"learning_rate": 1.096704532522256e-06,
"loss": 2.2499,
"step": 915
},
{
"epoch": 0.8846153846153846,
"grad_norm": 1.7663666904603283,
"learning_rate": 1.0791320670330332e-06,
"loss": 2.4002,
"step": 920
},
{
"epoch": 0.8894230769230769,
"grad_norm": 2.063321871244198,
"learning_rate": 1.061828205840956e-06,
"loss": 2.3313,
"step": 925
},
{
"epoch": 0.8942307692307693,
"grad_norm": 1.8140222643627664,
"learning_rate": 1.0447892480993706e-06,
"loss": 2.3454,
"step": 930
},
{
"epoch": 0.8990384615384616,
"grad_norm": 1.7048216508873255,
"learning_rate": 1.0280115383561078e-06,
"loss": 2.3296,
"step": 935
},
{
"epoch": 0.9038461538461539,
"grad_norm": 1.7706072815766516,
"learning_rate": 1.0114914660655272e-06,
"loss": 2.3379,
"step": 940
},
{
"epoch": 0.9086538461538461,
"grad_norm": 1.8968636807180728,
"learning_rate": 9.95225465105065e-07,
"loss": 2.3336,
"step": 945
},
{
"epoch": 0.9134615384615384,
"grad_norm": 1.8148188080264716,
"learning_rate": 9.792100132962467e-07,
"loss": 2.3244,
"step": 950
},
{
"epoch": 0.9182692307692307,
"grad_norm": 1.700784769345341,
"learning_rate": 9.634416319301388e-07,
"loss": 2.2875,
"step": 955
},
{
"epoch": 0.9230769230769231,
"grad_norm": 1.678153310810481,
"learning_rate": 9.479168852971943e-07,
"loss": 2.3299,
"step": 960
},
{
"epoch": 0.9278846153846154,
"grad_norm": 1.702217146168844,
"learning_rate": 9.326323802214668e-07,
"loss": 2.3312,
"step": 965
},
{
"epoch": 0.9326923076923077,
"grad_norm": 1.7687681371145616,
"learning_rate": 9.175847655991562e-07,
"loss": 2.3722,
"step": 970
},
{
"epoch": 0.9375,
"grad_norm": 1.7230729231020288,
"learning_rate": 9.027707319414495e-07,
"loss": 2.3735,
"step": 975
},
{
"epoch": 0.9423076923076923,
"grad_norm": 1.7291556590880472,
"learning_rate": 8.881870109216298e-07,
"loss": 2.3127,
"step": 980
},
{
"epoch": 0.9471153846153846,
"grad_norm": 1.7116649138045492,
"learning_rate": 8.73830374926414e-07,
"loss": 2.3561,
"step": 985
},
{
"epoch": 0.9519230769230769,
"grad_norm": 1.6739783387575036,
"learning_rate": 8.596976366114889e-07,
"loss": 2.351,
"step": 990
},
{
"epoch": 0.9567307692307693,
"grad_norm": 1.9461130756235225,
"learning_rate": 8.457856484612148e-07,
"loss": 2.3294,
"step": 995
},
{
"epoch": 0.9615384615384616,
"grad_norm": 1.8094460359927895,
"learning_rate": 8.320913023524591e-07,
"loss": 2.2998,
"step": 1000
},
{
"epoch": 0.9615384615384616,
"eval_loss": 2.323676109313965,
"eval_runtime": 85.3479,
"eval_samples_per_second": 86.645,
"eval_steps_per_second": 0.68,
"step": 1000
},
{
"epoch": 0.9663461538461539,
"grad_norm": 1.7191097995210292,
"learning_rate": 8.186115291225334e-07,
"loss": 2.3048,
"step": 1005
},
{
"epoch": 0.9711538461538461,
"grad_norm": 1.7123549721593059,
"learning_rate": 8.05343298141196e-07,
"loss": 2.2933,
"step": 1010
},
{
"epoch": 0.9759615384615384,
"grad_norm": 1.615875433552917,
"learning_rate": 7.922836168866939e-07,
"loss": 2.3564,
"step": 1015
},
{
"epoch": 0.9807692307692307,
"grad_norm": 1.928169845331568,
"learning_rate": 7.794295305258064e-07,
"loss": 2.304,
"step": 1020
},
{
"epoch": 0.9855769230769231,
"grad_norm": 1.6770198711392135,
"learning_rate": 7.667781214978637e-07,
"loss": 2.3152,
"step": 1025
},
{
"epoch": 0.9903846153846154,
"grad_norm": 1.942852074361696,
"learning_rate": 7.543265091027068e-07,
"loss": 2.2961,
"step": 1030
},
{
"epoch": 0.9951923076923077,
"grad_norm": 1.7644307035655395,
"learning_rate": 7.420718490925571e-07,
"loss": 2.3559,
"step": 1035
},
{
"epoch": 1.0,
"grad_norm": 1.6849031142151147,
"learning_rate": 7.300113332677667e-07,
"loss": 2.2943,
"step": 1040
},
{
"epoch": 1.0048076923076923,
"grad_norm": 2.0233629664399646,
"learning_rate": 7.181421890764176e-07,
"loss": 2.1536,
"step": 1045
},
{
"epoch": 1.0096153846153846,
"grad_norm": 1.6857528037531342,
"learning_rate": 7.064616792177334e-07,
"loss": 2.1437,
"step": 1050
},
{
"epoch": 1.0144230769230769,
"grad_norm": 1.856293792049413,
"learning_rate": 6.949671012492914e-07,
"loss": 2.0699,
"step": 1055
},
{
"epoch": 1.0192307692307692,
"grad_norm": 1.8179118022888037,
"learning_rate": 6.836557871979786e-07,
"loss": 2.0974,
"step": 1060
},
{
"epoch": 1.0240384615384615,
"grad_norm": 1.8749106071870572,
"learning_rate": 6.725251031746841e-07,
"loss": 2.1025,
"step": 1065
},
{
"epoch": 1.0288461538461537,
"grad_norm": 2.4469738972729442,
"learning_rate": 6.61572448992684e-07,
"loss": 2.0592,
"step": 1070
},
{
"epoch": 1.0336538461538463,
"grad_norm": 1.9600481862823989,
"learning_rate": 6.507952577896988e-07,
"loss": 2.1909,
"step": 1075
},
{
"epoch": 1.0384615384615385,
"grad_norm": 1.7683431826042773,
"learning_rate": 6.401909956535864e-07,
"loss": 2.0983,
"step": 1080
},
{
"epoch": 1.0432692307692308,
"grad_norm": 1.8700170966385194,
"learning_rate": 6.297571612516455e-07,
"loss": 2.1326,
"step": 1085
},
{
"epoch": 1.0480769230769231,
"grad_norm": 1.7984837423328528,
"learning_rate": 6.194912854635e-07,
"loss": 2.1085,
"step": 1090
},
{
"epoch": 1.0528846153846154,
"grad_norm": 1.8234811332020633,
"learning_rate": 6.093909310175343e-07,
"loss": 2.1227,
"step": 1095
},
{
"epoch": 1.0576923076923077,
"grad_norm": 1.8669294021521274,
"learning_rate": 5.994536921308514e-07,
"loss": 2.0538,
"step": 1100
},
{
"epoch": 1.0625,
"grad_norm": 1.834973873963248,
"learning_rate": 5.896771941527257e-07,
"loss": 2.163,
"step": 1105
},
{
"epoch": 1.0673076923076923,
"grad_norm": 1.7568094748940102,
"learning_rate": 5.800590932115227e-07,
"loss": 2.1596,
"step": 1110
},
{
"epoch": 1.0721153846153846,
"grad_norm": 1.9456491484317202,
"learning_rate": 5.705970758650521e-07,
"loss": 2.092,
"step": 1115
},
{
"epoch": 1.0769230769230769,
"grad_norm": 1.8020042844163735,
"learning_rate": 5.612888587543394e-07,
"loss": 2.1022,
"step": 1120
},
{
"epoch": 1.0817307692307692,
"grad_norm": 1.853382300488598,
"learning_rate": 5.521321882607727e-07,
"loss": 2.0697,
"step": 1125
},
{
"epoch": 1.0865384615384615,
"grad_norm": 1.8362880314320598,
"learning_rate": 5.431248401666053e-07,
"loss": 2.1201,
"step": 1130
},
{
"epoch": 1.0913461538461537,
"grad_norm": 1.8442235682625632,
"learning_rate": 5.342646193187874e-07,
"loss": 2.0395,
"step": 1135
},
{
"epoch": 1.0961538461538463,
"grad_norm": 1.8214033367021532,
"learning_rate": 5.255493592960974e-07,
"loss": 2.113,
"step": 1140
},
{
"epoch": 1.1009615384615385,
"grad_norm": 1.7988424197058015,
"learning_rate": 5.169769220795454e-07,
"loss": 2.131,
"step": 1145
},
{
"epoch": 1.1057692307692308,
"grad_norm": 1.8109458308469661,
"learning_rate": 5.085451977260232e-07,
"loss": 2.1636,
"step": 1150
},
{
"epoch": 1.1105769230769231,
"grad_norm": 1.8188669425027102,
"learning_rate": 5.00252104045174e-07,
"loss": 2.1307,
"step": 1155
},
{
"epoch": 1.1153846153846154,
"grad_norm": 1.7643977620250952,
"learning_rate": 4.920955862794543e-07,
"loss": 2.1029,
"step": 1160
},
{
"epoch": 1.1201923076923077,
"grad_norm": 1.871509851180396,
"learning_rate": 4.84073616787364e-07,
"loss": 2.106,
"step": 1165
},
{
"epoch": 1.125,
"grad_norm": 1.827457682413712,
"learning_rate": 4.7618419472981506e-07,
"loss": 2.1616,
"step": 1170
},
{
"epoch": 1.1298076923076923,
"grad_norm": 1.7536769808765222,
"learning_rate": 4.684253457596156e-07,
"loss": 2.1077,
"step": 1175
},
{
"epoch": 1.1346153846153846,
"grad_norm": 1.9063367359144818,
"learning_rate": 4.6079512171404304e-07,
"loss": 2.1849,
"step": 1180
},
{
"epoch": 1.1394230769230769,
"grad_norm": 2.145803926574076,
"learning_rate": 4.5329160031047875e-07,
"loss": 2.1577,
"step": 1185
},
{
"epoch": 1.1442307692307692,
"grad_norm": 1.8443487836196741,
"learning_rate": 4.4591288484508226e-07,
"loss": 2.064,
"step": 1190
},
{
"epoch": 1.1490384615384615,
"grad_norm": 1.815754689621411,
"learning_rate": 4.3865710389447586e-07,
"loss": 2.1008,
"step": 1195
},
{
"epoch": 1.1538461538461537,
"grad_norm": 1.8139614221776288,
"learning_rate": 4.315224110204174e-07,
"loss": 2.1248,
"step": 1200
},
{
"epoch": 1.1538461538461537,
"eval_loss": 2.336085319519043,
"eval_runtime": 85.3746,
"eval_samples_per_second": 86.618,
"eval_steps_per_second": 0.679,
"step": 1200
},
{
"epoch": 1.1586538461538463,
"grad_norm": 1.7983716043793538,
"learning_rate": 4.245069844774349e-07,
"loss": 2.0729,
"step": 1205
},
{
"epoch": 1.1634615384615385,
"grad_norm": 1.8990292619468592,
"learning_rate": 4.17609026923398e-07,
"loss": 2.1249,
"step": 1210
},
{
"epoch": 1.1682692307692308,
"grad_norm": 1.762763830487173,
"learning_rate": 4.1082676513300323e-07,
"loss": 2.154,
"step": 1215
},
{
"epoch": 1.1730769230769231,
"grad_norm": 1.759984272000879,
"learning_rate": 4.0415844971414616e-07,
"loss": 2.1299,
"step": 1220
},
{
"epoch": 1.1778846153846154,
"grad_norm": 1.7856327184643472,
"learning_rate": 3.976023548271586e-07,
"loss": 2.1663,
"step": 1225
},
{
"epoch": 1.1826923076923077,
"grad_norm": 1.8453273970913073,
"learning_rate": 3.9115677790688485e-07,
"loss": 2.1115,
"step": 1230
},
{
"epoch": 1.1875,
"grad_norm": 1.7711541036032603,
"learning_rate": 3.8482003938757386e-07,
"loss": 2.1207,
"step": 1235
},
{
"epoch": 1.1923076923076923,
"grad_norm": 1.7750356264689093,
"learning_rate": 3.78590482430564e-07,
"loss": 2.0857,
"step": 1240
},
{
"epoch": 1.1971153846153846,
"grad_norm": 1.7976368503882154,
"learning_rate": 3.724664726547351e-07,
"loss": 2.1386,
"step": 1245
},
{
"epoch": 1.2019230769230769,
"grad_norm": 1.829414461965732,
"learning_rate": 3.6644639786970623e-07,
"loss": 2.174,
"step": 1250
},
{
"epoch": 1.2067307692307692,
"grad_norm": 1.825361485465677,
"learning_rate": 3.6052866781175476e-07,
"loss": 2.1057,
"step": 1255
},
{
"epoch": 1.2115384615384615,
"grad_norm": 1.8292622951367188,
"learning_rate": 3.547117138824332e-07,
"loss": 2.08,
"step": 1260
},
{
"epoch": 1.2163461538461537,
"grad_norm": 1.8307121677285738,
"learning_rate": 3.48993988889863e-07,
"loss": 2.1154,
"step": 1265
},
{
"epoch": 1.2211538461538463,
"grad_norm": 1.862688434301242,
"learning_rate": 3.433739667926769e-07,
"loss": 2.0719,
"step": 1270
},
{
"epoch": 1.2259615384615385,
"grad_norm": 1.8172648051882496,
"learning_rate": 3.378501424465974e-07,
"loss": 2.08,
"step": 1275
},
{
"epoch": 1.2307692307692308,
"grad_norm": 1.831590098407615,
"learning_rate": 3.3242103135361645e-07,
"loss": 2.1313,
"step": 1280
},
{
"epoch": 1.2355769230769231,
"grad_norm": 1.8337034054812522,
"learning_rate": 3.2708516941376294e-07,
"loss": 2.1436,
"step": 1285
},
{
"epoch": 1.2403846153846154,
"grad_norm": 1.8090147347855563,
"learning_rate": 3.218411126794323e-07,
"loss": 2.1503,
"step": 1290
},
{
"epoch": 1.2451923076923077,
"grad_norm": 1.8544882033122045,
"learning_rate": 3.166874371122564e-07,
"loss": 2.1303,
"step": 1295
},
{
"epoch": 1.25,
"grad_norm": 1.781492016300762,
"learning_rate": 3.116227383424919e-07,
"loss": 2.0967,
"step": 1300
},
{
"epoch": 1.2548076923076923,
"grad_norm": 1.8889890359608847,
"learning_rate": 3.066456314309059e-07,
"loss": 2.0931,
"step": 1305
},
{
"epoch": 1.2596153846153846,
"grad_norm": 1.8232794987114287,
"learning_rate": 3.017547506331364e-07,
"loss": 2.1251,
"step": 1310
},
{
"epoch": 1.2644230769230769,
"grad_norm": 1.8856640991380471,
"learning_rate": 2.969487491665068e-07,
"loss": 2.1139,
"step": 1315
},
{
"epoch": 1.2692307692307692,
"grad_norm": 1.7930598313625747,
"learning_rate": 2.9222629897927087e-07,
"loss": 2.1204,
"step": 1320
},
{
"epoch": 1.2740384615384617,
"grad_norm": 1.8132589043201648,
"learning_rate": 2.8758609052227305e-07,
"loss": 2.034,
"step": 1325
},
{
"epoch": 1.2788461538461537,
"grad_norm": 1.8767260044973102,
"learning_rate": 2.830268325229947e-07,
"loss": 2.1215,
"step": 1330
},
{
"epoch": 1.2836538461538463,
"grad_norm": 1.8491028909697207,
"learning_rate": 2.785472517619713e-07,
"loss": 2.1328,
"step": 1335
},
{
"epoch": 1.2884615384615383,
"grad_norm": 1.9076802028303976,
"learning_rate": 2.74146092851559e-07,
"loss": 2.084,
"step": 1340
},
{
"epoch": 1.2932692307692308,
"grad_norm": 1.849289922308255,
"learning_rate": 2.698221180170271e-07,
"loss": 2.1259,
"step": 1345
},
{
"epoch": 1.2980769230769231,
"grad_norm": 1.7905203171901232,
"learning_rate": 2.6557410687996006e-07,
"loss": 2.1151,
"step": 1350
},
{
"epoch": 1.3028846153846154,
"grad_norm": 1.8830908621706892,
"learning_rate": 2.6140085624394526e-07,
"loss": 2.1457,
"step": 1355
},
{
"epoch": 1.3076923076923077,
"grad_norm": 1.8596784397686372,
"learning_rate": 2.573011798825286e-07,
"loss": 2.073,
"step": 1360
},
{
"epoch": 1.3125,
"grad_norm": 1.8448017924414952,
"learning_rate": 2.5327390832941644e-07,
"loss": 2.1286,
"step": 1365
},
{
"epoch": 1.3173076923076923,
"grad_norm": 2.0018781537530996,
"learning_rate": 2.4931788867090523e-07,
"loss": 2.09,
"step": 1370
},
{
"epoch": 1.3221153846153846,
"grad_norm": 1.8762757684058704,
"learning_rate": 2.4543198434051835e-07,
"loss": 2.075,
"step": 1375
},
{
"epoch": 1.3269230769230769,
"grad_norm": 1.952448677696025,
"learning_rate": 2.4161507491583033e-07,
"loss": 2.1256,
"step": 1380
},
{
"epoch": 1.3317307692307692,
"grad_norm": 1.8165760972158784,
"learning_rate": 2.3786605591746012e-07,
"loss": 2.0566,
"step": 1385
},
{
"epoch": 1.3365384615384617,
"grad_norm": 5.253827520965963,
"learning_rate": 2.341838386102127e-07,
"loss": 2.2116,
"step": 1390
},
{
"epoch": 1.3413461538461537,
"grad_norm": 1.8446995708115508,
"learning_rate": 2.3056734980635093e-07,
"loss": 2.1001,
"step": 1395
},
{
"epoch": 1.3461538461538463,
"grad_norm": 1.9617802338733952,
"learning_rate": 2.2701553167097801e-07,
"loss": 2.1239,
"step": 1400
},
{
"epoch": 1.3461538461538463,
"eval_loss": 2.334371566772461,
"eval_runtime": 85.4548,
"eval_samples_per_second": 86.537,
"eval_steps_per_second": 0.679,
"step": 1400
},
{
"epoch": 1.3509615384615383,
"grad_norm": 1.8285827211419716,
"learning_rate": 2.2352734152951196e-07,
"loss": 2.1184,
"step": 1405
},
{
"epoch": 1.3557692307692308,
"grad_norm": 2.0394120658337305,
"learning_rate": 2.2010175167723296e-07,
"loss": 2.0568,
"step": 1410
},
{
"epoch": 1.3605769230769231,
"grad_norm": 1.7875137882919705,
"learning_rate": 2.167377491908854e-07,
"loss": 2.0625,
"step": 1415
},
{
"epoch": 1.3653846153846154,
"grad_norm": 1.7866761410178333,
"learning_rate": 2.134343357423158e-07,
"loss": 2.0555,
"step": 1420
},
{
"epoch": 1.3701923076923077,
"grad_norm": 1.932563852514787,
"learning_rate": 2.101905274141283e-07,
"loss": 2.1069,
"step": 1425
},
{
"epoch": 1.375,
"grad_norm": 1.9475188936955665,
"learning_rate": 2.0700535451733951e-07,
"loss": 2.1086,
"step": 1430
},
{
"epoch": 1.3798076923076923,
"grad_norm": 1.8526120458954936,
"learning_rate": 2.0387786141101492e-07,
"loss": 2.1378,
"step": 1435
},
{
"epoch": 1.3846153846153846,
"grad_norm": 1.8562018803586509,
"learning_rate": 2.0080710632386802e-07,
"loss": 2.1353,
"step": 1440
},
{
"epoch": 1.3894230769230769,
"grad_norm": 1.8313311377456998,
"learning_rate": 1.9779216117780527e-07,
"loss": 2.1171,
"step": 1445
},
{
"epoch": 1.3942307692307692,
"grad_norm": 1.8142973032453498,
"learning_rate": 1.9483211141339894e-07,
"loss": 2.0766,
"step": 1450
},
{
"epoch": 1.3990384615384617,
"grad_norm": 1.8237674767411933,
"learning_rate": 1.9192605581726967e-07,
"loss": 2.1593,
"step": 1455
},
{
"epoch": 1.4038461538461537,
"grad_norm": 1.772508678674097,
"learning_rate": 1.8907310635136197e-07,
"loss": 2.1314,
"step": 1460
},
{
"epoch": 1.4086538461538463,
"grad_norm": 1.8899727080269664,
"learning_rate": 1.8627238798409526e-07,
"loss": 2.0845,
"step": 1465
},
{
"epoch": 1.4134615384615383,
"grad_norm": 1.90653257600126,
"learning_rate": 1.8352303852337284e-07,
"loss": 2.1508,
"step": 1470
},
{
"epoch": 1.4182692307692308,
"grad_norm": 1.8534900824085168,
"learning_rate": 1.8082420845143144e-07,
"loss": 2.0896,
"step": 1475
},
{
"epoch": 1.4230769230769231,
"grad_norm": 1.8066064812360683,
"learning_rate": 1.7817506076151663e-07,
"loss": 2.1493,
"step": 1480
},
{
"epoch": 1.4278846153846154,
"grad_norm": 1.8590166269045232,
"learning_rate": 1.7557477079636372e-07,
"loss": 2.0614,
"step": 1485
},
{
"epoch": 1.4326923076923077,
"grad_norm": 1.8782140024216563,
"learning_rate": 1.7302252608847008e-07,
"loss": 2.0691,
"step": 1490
},
{
"epoch": 1.4375,
"grad_norm": 1.8729309652922037,
"learning_rate": 1.7051752620214163e-07,
"loss": 2.0573,
"step": 1495
},
{
"epoch": 1.4423076923076923,
"grad_norm": 1.8894921416533526,
"learning_rate": 1.6805898257729673e-07,
"loss": 2.0936,
"step": 1500
},
{
"epoch": 1.4471153846153846,
"grad_norm": 1.9015071278716307,
"learning_rate": 1.6564611837501148e-07,
"loss": 2.0837,
"step": 1505
},
{
"epoch": 1.4519230769230769,
"grad_norm": 1.8197453987244108,
"learning_rate": 1.6327816832478985e-07,
"loss": 2.1064,
"step": 1510
},
{
"epoch": 1.4567307692307692,
"grad_norm": 1.8526075910672721,
"learning_rate": 1.6095437857354324e-07,
"loss": 2.0926,
"step": 1515
},
{
"epoch": 1.4615384615384617,
"grad_norm": 1.8572065984966375,
"learning_rate": 1.586740065362626e-07,
"loss": 2.0582,
"step": 1520
},
{
"epoch": 1.4663461538461537,
"grad_norm": 1.8156159477376175,
"learning_rate": 1.5643632074836825e-07,
"loss": 2.1037,
"step": 1525
},
{
"epoch": 1.4711538461538463,
"grad_norm": 1.8649198187665965,
"learning_rate": 1.5424060071972007e-07,
"loss": 2.125,
"step": 1530
},
{
"epoch": 1.4759615384615383,
"grad_norm": 1.8545497800311697,
"learning_rate": 1.5208613679027549e-07,
"loss": 2.0884,
"step": 1535
},
{
"epoch": 1.4807692307692308,
"grad_norm": 1.8606969338206512,
"learning_rate": 1.4997222998737582e-07,
"loss": 2.1157,
"step": 1540
},
{
"epoch": 1.4855769230769231,
"grad_norm": 1.8859903197241183,
"learning_rate": 1.478981918846486e-07,
"loss": 2.1273,
"step": 1545
},
{
"epoch": 1.4903846153846154,
"grad_norm": 1.8869329872162925,
"learning_rate": 1.4586334446250955e-07,
"loss": 2.1386,
"step": 1550
},
{
"epoch": 1.4951923076923077,
"grad_norm": 1.860329950662595,
"learning_rate": 1.43867019970249e-07,
"loss": 2.157,
"step": 1555
},
{
"epoch": 1.5,
"grad_norm": 1.8134076526838725,
"learning_rate": 1.419085607896877e-07,
"loss": 2.1129,
"step": 1560
},
{
"epoch": 1.5048076923076923,
"grad_norm": 1.8259889434431678,
"learning_rate": 1.3998731930038773e-07,
"loss": 2.1292,
"step": 1565
},
{
"epoch": 1.5096153846153846,
"grad_norm": 1.8908539458019609,
"learning_rate": 1.381026577464028e-07,
"loss": 2.1286,
"step": 1570
},
{
"epoch": 1.5144230769230769,
"grad_norm": 1.7930674977942935,
"learning_rate": 1.3625394810455382e-07,
"loss": 2.1092,
"step": 1575
},
{
"epoch": 1.5192307692307692,
"grad_norm": 1.8496202978075098,
"learning_rate": 1.3444057195421526e-07,
"loss": 2.1075,
"step": 1580
},
{
"epoch": 1.5240384615384617,
"grad_norm": 1.8344118160186549,
"learning_rate": 1.326619203485973e-07,
"loss": 2.1007,
"step": 1585
},
{
"epoch": 1.5288461538461537,
"grad_norm": 1.8585688089026406,
"learning_rate": 1.3091739368750989e-07,
"loss": 2.1521,
"step": 1590
},
{
"epoch": 1.5336538461538463,
"grad_norm": 2.0502623341105517,
"learning_rate": 1.292064015915944e-07,
"loss": 2.0904,
"step": 1595
},
{
"epoch": 1.5384615384615383,
"grad_norm": 1.8474141432895723,
"learning_rate": 1.2752836277800852e-07,
"loss": 2.1521,
"step": 1600
},
{
"epoch": 1.5384615384615383,
"eval_loss": 2.333831548690796,
"eval_runtime": 85.4542,
"eval_samples_per_second": 86.538,
"eval_steps_per_second": 0.679,
"step": 1600
},
{
"epoch": 1.5432692307692308,
"grad_norm": 1.908368834971653,
"learning_rate": 1.2588270493755057e-07,
"loss": 2.0545,
"step": 1605
},
{
"epoch": 1.5480769230769231,
"grad_norm": 1.8891697271029433,
"learning_rate": 1.242688646132092e-07,
"loss": 2.1085,
"step": 1610
},
{
"epoch": 1.5528846153846154,
"grad_norm": 1.8238620642049488,
"learning_rate": 1.22686287080125e-07,
"loss": 2.1416,
"step": 1615
},
{
"epoch": 1.5576923076923077,
"grad_norm": 1.845379742670226,
"learning_rate": 1.2113442622694955e-07,
"loss": 2.0587,
"step": 1620
},
{
"epoch": 1.5625,
"grad_norm": 1.760419766434776,
"learning_rate": 1.1961274443858932e-07,
"loss": 2.0988,
"step": 1625
},
{
"epoch": 1.5673076923076923,
"grad_norm": 1.9500128322951924,
"learning_rate": 1.1812071248031999e-07,
"loss": 2.1024,
"step": 1630
},
{
"epoch": 1.5721153846153846,
"grad_norm": 1.8158972995099203,
"learning_rate": 1.1665780938325871e-07,
"loss": 2.1387,
"step": 1635
},
{
"epoch": 1.5769230769230769,
"grad_norm": 1.86611749153697,
"learning_rate": 1.152235223311802e-07,
"loss": 2.1525,
"step": 1640
},
{
"epoch": 1.5817307692307692,
"grad_norm": 1.8447983570027537,
"learning_rate": 1.1381734654866389e-07,
"loss": 2.0554,
"step": 1645
},
{
"epoch": 1.5865384615384617,
"grad_norm": 1.828362228823549,
"learning_rate": 1.1243878519055928e-07,
"loss": 2.1187,
"step": 1650
},
{
"epoch": 1.5913461538461537,
"grad_norm": 1.947875660376608,
"learning_rate": 1.1108734923275605e-07,
"loss": 2.0531,
"step": 1655
},
{
"epoch": 1.5961538461538463,
"grad_norm": 1.818226522118368,
"learning_rate": 1.0976255736424637e-07,
"loss": 2.1036,
"step": 1660
},
{
"epoch": 1.6009615384615383,
"grad_norm": 1.9755891501080045,
"learning_rate": 1.0846393588046656e-07,
"loss": 2.1296,
"step": 1665
},
{
"epoch": 1.6057692307692308,
"grad_norm": 1.8165676756032596,
"learning_rate": 1.0719101857790552e-07,
"loss": 2.0842,
"step": 1670
},
{
"epoch": 1.6105769230769231,
"grad_norm": 1.8480994780626476,
"learning_rate": 1.0594334664996721e-07,
"loss": 2.0833,
"step": 1675
},
{
"epoch": 1.6153846153846154,
"grad_norm": 1.7568276519420272,
"learning_rate": 1.0472046858407492e-07,
"loss": 2.1152,
"step": 1680
},
{
"epoch": 1.6201923076923077,
"grad_norm": 1.8155268250435754,
"learning_rate": 1.0352194006000441e-07,
"loss": 2.1277,
"step": 1685
},
{
"epoch": 1.625,
"grad_norm": 1.8688450613110825,
"learning_rate": 1.0234732384943512e-07,
"loss": 2.055,
"step": 1690
},
{
"epoch": 1.6298076923076923,
"grad_norm": 1.834466807811679,
"learning_rate": 1.0119618971670507e-07,
"loss": 2.1648,
"step": 1695
},
{
"epoch": 1.6346153846153846,
"grad_norm": 1.9150332485014145,
"learning_rate": 1.0006811432075942e-07,
"loss": 2.0587,
"step": 1700
},
{
"epoch": 1.6394230769230769,
"grad_norm": 1.866607921147843,
"learning_rate": 9.896268111827943e-08,
"loss": 2.076,
"step": 1705
},
{
"epoch": 1.6442307692307692,
"grad_norm": 1.8656204113992287,
"learning_rate": 9.787948026798065e-08,
"loss": 2.1168,
"step": 1710
},
{
"epoch": 1.6490384615384617,
"grad_norm": 1.849474324070502,
"learning_rate": 9.68181085360681e-08,
"loss": 2.1075,
"step": 1715
},
{
"epoch": 1.6538461538461537,
"grad_norm": 1.8108526684678354,
"learning_rate": 9.57781692028372e-08,
"loss": 2.1368,
"step": 1720
},
{
"epoch": 1.6586538461538463,
"grad_norm": 1.8133873110154997,
"learning_rate": 9.475927197040834e-08,
"loss": 2.088,
"step": 1725
},
{
"epoch": 1.6634615384615383,
"grad_norm": 1.8155032971792053,
"learning_rate": 9.376103287158425e-08,
"loss": 2.1397,
"step": 1730
},
{
"epoch": 1.6682692307692308,
"grad_norm": 1.8962575557301127,
"learning_rate": 9.278307417981768e-08,
"loss": 2.116,
"step": 1735
},
{
"epoch": 1.6730769230769231,
"grad_norm": 1.8976326651339515,
"learning_rate": 9.182502432027988e-08,
"loss": 2.0869,
"step": 1740
},
{
"epoch": 1.6778846153846154,
"grad_norm": 1.805419356077963,
"learning_rate": 9.107267296696801e-08,
"loss": 2.0926,
"step": 1745
},
{
"epoch": 1.6826923076923077,
"grad_norm": 1.8237173931210868,
"learning_rate": 9.014954193734225e-08,
"loss": 2.07,
"step": 1750
},
{
"epoch": 1.6875,
"grad_norm": 1.874303236724565,
"learning_rate": 8.924531131396056e-08,
"loss": 2.0852,
"step": 1755
},
{
"epoch": 1.6923076923076923,
"grad_norm": 1.8446431514031785,
"learning_rate": 8.835963210651791e-08,
"loss": 2.0639,
"step": 1760
},
{
"epoch": 1.6971153846153846,
"grad_norm": 1.8962482308020339,
"learning_rate": 8.749216106451011e-08,
"loss": 2.1162,
"step": 1765
},
{
"epoch": 1.7019230769230769,
"grad_norm": 1.8192264354608538,
"learning_rate": 8.664256059446181e-08,
"loss": 2.1065,
"step": 1770
},
{
"epoch": 1.7067307692307692,
"grad_norm": 2.366332770975045,
"learning_rate": 8.581049867817956e-08,
"loss": 2.0625,
"step": 1775
},
{
"epoch": 1.7115384615384617,
"grad_norm": 1.8446173561965722,
"learning_rate": 8.499564879201958e-08,
"loss": 2.0537,
"step": 1780
},
{
"epoch": 1.7163461538461537,
"grad_norm": 1.8507785394900198,
"learning_rate": 8.419768982715971e-08,
"loss": 2.1093,
"step": 1785
},
{
"epoch": 1.7211538461538463,
"grad_norm": 1.9304487119438947,
"learning_rate": 8.341630601086485e-08,
"loss": 2.118,
"step": 1790
},
{
"epoch": 1.7259615384615383,
"grad_norm": 1.8294859378005517,
"learning_rate": 8.265118682873593e-08,
"loss": 2.1369,
"step": 1795
},
{
"epoch": 1.7307692307692308,
"grad_norm": 1.8613822811922678,
"learning_rate": 8.190202694793183e-08,
"loss": 2.1359,
"step": 1800
},
{
"epoch": 1.7307692307692308,
"eval_loss": 2.333617687225342,
"eval_runtime": 85.3403,
"eval_samples_per_second": 86.653,
"eval_steps_per_second": 0.68,
"step": 1800
},
{
"epoch": 1.7355769230769231,
"grad_norm": 1.8159457192343773,
"learning_rate": 8.116852614135445e-08,
"loss": 2.1222,
"step": 1805
},
{
"epoch": 1.7403846153846154,
"grad_norm": 1.857716576691175,
"learning_rate": 8.045038921278602e-08,
"loss": 2.1139,
"step": 1810
},
{
"epoch": 1.7451923076923077,
"grad_norm": 1.8694725467916173,
"learning_rate": 7.974732592297013e-08,
"loss": 2.094,
"step": 1815
},
{
"epoch": 1.75,
"grad_norm": 1.8560579082110327,
"learning_rate": 7.905905091662493e-08,
"loss": 2.1622,
"step": 1820
},
{
"epoch": 1.7548076923076923,
"grad_norm": 1.875970072144303,
"learning_rate": 7.838528365037967e-08,
"loss": 2.1179,
"step": 1825
},
{
"epoch": 1.7596153846153846,
"grad_norm": 1.9019026590876629,
"learning_rate": 7.77257483216247e-08,
"loss": 2.1137,
"step": 1830
},
{
"epoch": 1.7644230769230769,
"grad_norm": 1.8292496367699893,
"learning_rate": 7.708017379826487e-08,
"loss": 2.0573,
"step": 1835
},
{
"epoch": 1.7692307692307692,
"grad_norm": 1.8672483366732924,
"learning_rate": 7.644829354936725e-08,
"loss": 2.1275,
"step": 1840
},
{
"epoch": 1.7740384615384617,
"grad_norm": 1.734535999037372,
"learning_rate": 7.582984557669328e-08,
"loss": 2.0798,
"step": 1845
},
{
"epoch": 1.7788461538461537,
"grad_norm": 1.8512196694843002,
"learning_rate": 7.52245723471061e-08,
"loss": 2.1569,
"step": 1850
},
{
"epoch": 1.7836538461538463,
"grad_norm": 1.7836085149148238,
"learning_rate": 7.463222072584383e-08,
"loss": 2.1196,
"step": 1855
},
{
"epoch": 1.7884615384615383,
"grad_norm": 1.8793796811188046,
"learning_rate": 7.405254191064901e-08,
"loss": 2.0593,
"step": 1860
},
{
"epoch": 1.7932692307692308,
"grad_norm": 1.8737352256216766,
"learning_rate": 7.348529136674602e-08,
"loss": 2.0905,
"step": 1865
},
{
"epoch": 1.7980769230769231,
"grad_norm": 1.832908496175927,
"learning_rate": 7.293022876265624e-08,
"loss": 2.1636,
"step": 1870
},
{
"epoch": 1.8028846153846154,
"grad_norm": 1.914652585529052,
"learning_rate": 7.23871179068426e-08,
"loss": 2.1163,
"step": 1875
},
{
"epoch": 1.8076923076923077,
"grad_norm": 1.8575655442671353,
"learning_rate": 7.185572668517463e-08,
"loss": 2.0961,
"step": 1880
},
{
"epoch": 1.8125,
"grad_norm": 1.872595689449834,
"learning_rate": 7.133582699920455e-08,
"loss": 2.1504,
"step": 1885
},
{
"epoch": 1.8173076923076923,
"grad_norm": 1.8150069813971093,
"learning_rate": 7.082719470524635e-08,
"loss": 2.1249,
"step": 1890
},
{
"epoch": 1.8221153846153846,
"grad_norm": 1.892110067355825,
"learning_rate": 7.032960955424859e-08,
"loss": 2.0501,
"step": 1895
},
{
"epoch": 1.8269230769230769,
"grad_norm": 2.017115554517963,
"learning_rate": 6.98428551324525e-08,
"loss": 2.0568,
"step": 1900
},
{
"epoch": 1.8317307692307692,
"grad_norm": 1.8844252622464137,
"learning_rate": 6.936671880282684e-08,
"loss": 2.1413,
"step": 1905
},
{
"epoch": 1.8365384615384617,
"grad_norm": 1.8438419406531692,
"learning_rate": 6.890099164727089e-08,
"loss": 2.1635,
"step": 1910
},
{
"epoch": 1.8413461538461537,
"grad_norm": 1.8996214354229564,
"learning_rate": 6.844546840957736e-08,
"loss": 2.1141,
"step": 1915
},
{
"epoch": 1.8461538461538463,
"grad_norm": 1.7579428295336565,
"learning_rate": 6.799994743914665e-08,
"loss": 2.0918,
"step": 1920
},
{
"epoch": 1.8509615384615383,
"grad_norm": 1.7922772912832896,
"learning_rate": 6.756423063544432e-08,
"loss": 2.078,
"step": 1925
},
{
"epoch": 1.8557692307692308,
"grad_norm": 1.8562342019145215,
"learning_rate": 6.713812339319366e-08,
"loss": 2.1416,
"step": 1930
},
{
"epoch": 1.8605769230769231,
"grad_norm": 1.9439324971687737,
"learning_rate": 6.672143454829497e-08,
"loss": 2.1372,
"step": 1935
},
{
"epoch": 1.8653846153846154,
"grad_norm": 1.8774979949999377,
"learning_rate": 6.631397632446378e-08,
"loss": 2.1379,
"step": 1940
},
{
"epoch": 1.8701923076923077,
"grad_norm": 1.842493871682372,
"learning_rate": 6.591556428057989e-08,
"loss": 2.101,
"step": 1945
},
{
"epoch": 1.875,
"grad_norm": 1.7980810141414054,
"learning_rate": 6.552601725873927e-08,
"loss": 2.1336,
"step": 1950
},
{
"epoch": 1.8798076923076923,
"grad_norm": 1.909273446139313,
"learning_rate": 6.514515733300119e-08,
"loss": 2.1389,
"step": 1955
},
{
"epoch": 1.8846153846153846,
"grad_norm": 1.9398969365111554,
"learning_rate": 6.484660656765394e-08,
"loss": 2.1039,
"step": 1960
},
{
"epoch": 1.8894230769230769,
"grad_norm": 1.85453008710647,
"learning_rate": 6.448094516468652e-08,
"loss": 2.0795,
"step": 1965
},
{
"epoch": 1.8942307692307692,
"grad_norm": 1.7956663379402615,
"learning_rate": 6.412348943141603e-08,
"loss": 2.1183,
"step": 1970
},
{
"epoch": 1.8990384615384617,
"grad_norm": 2.078977441304735,
"learning_rate": 6.377407326795944e-08,
"loss": 2.0763,
"step": 1975
},
{
"epoch": 1.9038461538461537,
"grad_norm": 1.757810065596903,
"learning_rate": 6.343253356981554e-08,
"loss": 2.13,
"step": 1980
},
{
"epoch": 1.9086538461538463,
"grad_norm": 1.8683875085590016,
"learning_rate": 6.309871018049243e-08,
"loss": 2.0809,
"step": 1985
},
{
"epoch": 1.9134615384615383,
"grad_norm": 1.7848369332013463,
"learning_rate": 6.277244584477894e-08,
"loss": 2.1428,
"step": 1990
},
{
"epoch": 1.9182692307692308,
"grad_norm": 1.802325866848323,
"learning_rate": 6.245358616265204e-08,
"loss": 2.0786,
"step": 1995
},
{
"epoch": 1.9230769230769231,
"grad_norm": 1.807966959879067,
"learning_rate": 6.214197954381353e-08,
"loss": 2.0531,
"step": 2000
},
{
"epoch": 1.9230769230769231,
"eval_loss": 2.333247184753418,
"eval_runtime": 85.394,
"eval_samples_per_second": 86.599,
"eval_steps_per_second": 0.679,
"step": 2000
},
{
"epoch": 1.9278846153846154,
"grad_norm": 1.779659361884406,
"learning_rate": 6.183747716284858e-08,
"loss": 2.1421,
"step": 2005
},
{
"epoch": 1.9326923076923077,
"grad_norm": 1.9140174756953598,
"learning_rate": 6.153993291499917e-08,
"loss": 2.1539,
"step": 2010
},
{
"epoch": 1.9375,
"grad_norm": 1.8616242261169418,
"learning_rate": 6.124920337254512e-08,
"loss": 2.1089,
"step": 2015
},
{
"epoch": 1.9423076923076923,
"grad_norm": 1.88338038531167,
"learning_rate": 6.096514774178612e-08,
"loss": 2.0954,
"step": 2020
},
{
"epoch": 1.9471153846153846,
"grad_norm": 1.9384073065008345,
"learning_rate": 6.068762782061749e-08,
"loss": 2.1067,
"step": 2025
},
{
"epoch": 1.9519230769230769,
"grad_norm": 1.7842608425146953,
"learning_rate": 6.04165079566931e-08,
"loss": 2.0734,
"step": 2030
},
{
"epoch": 1.9567307692307692,
"grad_norm": 1.8980213968050201,
"learning_rate": 6.015165500616844e-08,
"loss": 2.1398,
"step": 2035
},
{
"epoch": 1.9615384615384617,
"grad_norm": 1.8854870321306716,
"learning_rate": 5.989293829301721e-08,
"loss": 2.0905,
"step": 2040
},
{
"epoch": 1.9663461538461537,
"grad_norm": 1.8366214101050582,
"learning_rate": 5.964022956891487e-08,
"loss": 2.1192,
"step": 2045
},
{
"epoch": 1.9711538461538463,
"grad_norm": 1.9702601160939885,
"learning_rate": 5.9393402973682475e-08,
"loss": 2.0562,
"step": 2050
},
{
"epoch": 1.9759615384615383,
"grad_norm": 1.7854608377655588,
"learning_rate": 5.915233499628401e-08,
"loss": 2.0958,
"step": 2055
},
{
"epoch": 1.9807692307692308,
"grad_norm": 1.8080366636915477,
"learning_rate": 5.8916904436371357e-08,
"loss": 2.118,
"step": 2060
},
{
"epoch": 1.9855769230769231,
"grad_norm": 1.7747943415915806,
"learning_rate": 5.868699236636974e-08,
"loss": 2.0928,
"step": 2065
},
{
"epoch": 1.9903846153846154,
"grad_norm": 2.0207986490578067,
"learning_rate": 5.846248209409795e-08,
"loss": 2.1142,
"step": 2070
},
{
"epoch": 1.9951923076923077,
"grad_norm": 1.7957289252600956,
"learning_rate": 5.824325912591659e-08,
"loss": 2.144,
"step": 2075
},
{
"epoch": 2.0,
"grad_norm": 1.8248097411911974,
"learning_rate": 5.802921113039837e-08,
"loss": 2.1047,
"step": 2080
},
{
"epoch": 2.0048076923076925,
"grad_norm": 1.7961928041751198,
"learning_rate": 5.782022790251414e-08,
"loss": 2.1187,
"step": 2085
},
{
"epoch": 2.0096153846153846,
"grad_norm": 1.8336585044351084,
"learning_rate": 5.761620132832865e-08,
"loss": 2.0685,
"step": 2090
},
{
"epoch": 2.014423076923077,
"grad_norm": 1.8219809800904603,
"learning_rate": 5.741702535019987e-08,
"loss": 2.0564,
"step": 2095
},
{
"epoch": 2.019230769230769,
"grad_norm": 1.819040393659182,
"learning_rate": 5.722259593247595e-08,
"loss": 2.1339,
"step": 2100
},
{
"epoch": 2.0240384615384617,
"grad_norm": 1.8732187096486306,
"learning_rate": 5.703281102768385e-08,
"loss": 2.0996,
"step": 2105
},
{
"epoch": 2.0288461538461537,
"grad_norm": 1.8473280371987284,
"learning_rate": 5.684757054320374e-08,
"loss": 2.1093,
"step": 2110
},
{
"epoch": 2.0336538461538463,
"grad_norm": 1.8326317747277034,
"learning_rate": 5.6666776308423326e-08,
"loss": 2.1007,
"step": 2115
},
{
"epoch": 2.0384615384615383,
"grad_norm": 1.7796391236885234,
"learning_rate": 5.649033204236644e-08,
"loss": 2.0974,
"step": 2120
},
{
"epoch": 2.043269230769231,
"grad_norm": 1.8279643679656394,
"learning_rate": 5.631814332179001e-08,
"loss": 2.1061,
"step": 2125
},
{
"epoch": 2.048076923076923,
"grad_norm": 1.915680312366823,
"learning_rate": 5.615011754974382e-08,
"loss": 2.095,
"step": 2130
},
{
"epoch": 2.0528846153846154,
"grad_norm": 1.8545098240752675,
"learning_rate": 5.5986163924587514e-08,
"loss": 2.0248,
"step": 2135
},
{
"epoch": 2.0576923076923075,
"grad_norm": 2.5876380487293065,
"learning_rate": 5.5826193409459206e-08,
"loss": 2.0417,
"step": 2140
},
{
"epoch": 2.0625,
"grad_norm": 1.8049671277672117,
"learning_rate": 5.567011870219021e-08,
"loss": 2.0592,
"step": 2145
},
{
"epoch": 2.0673076923076925,
"grad_norm": 1.875703854921943,
"learning_rate": 5.551785420566048e-08,
"loss": 2.0804,
"step": 2150
},
{
"epoch": 2.0721153846153846,
"grad_norm": 1.8546691508228774,
"learning_rate": 5.536931599858935e-08,
"loss": 2.0805,
"step": 2155
},
{
"epoch": 2.076923076923077,
"grad_norm": 1.773767471396823,
"learning_rate": 5.522442180675621e-08,
"loss": 2.056,
"step": 2160
},
{
"epoch": 2.081730769230769,
"grad_norm": 1.861161247873578,
"learning_rate": 5.508309097464585e-08,
"loss": 2.0671,
"step": 2165
},
{
"epoch": 2.0865384615384617,
"grad_norm": 1.7742050719059044,
"learning_rate": 5.494524443751328e-08,
"loss": 2.0738,
"step": 2170
},
{
"epoch": 2.0913461538461537,
"grad_norm": 1.8318030243960468,
"learning_rate": 5.481080469386275e-08,
"loss": 2.0907,
"step": 2175
},
{
"epoch": 2.0961538461538463,
"grad_norm": 1.778257367233478,
"learning_rate": 5.467969577833591e-08,
"loss": 2.0639,
"step": 2180
},
{
"epoch": 2.1009615384615383,
"grad_norm": 1.867111620525417,
"learning_rate": 5.455184323500402e-08,
"loss": 2.105,
"step": 2185
},
{
"epoch": 2.105769230769231,
"grad_norm": 1.8898912766644747,
"learning_rate": 5.442717409105915e-08,
"loss": 2.0611,
"step": 2190
},
{
"epoch": 2.110576923076923,
"grad_norm": 1.9217461466226302,
"learning_rate": 5.430561683089944e-08,
"loss": 2.0806,
"step": 2195
},
{
"epoch": 2.1153846153846154,
"grad_norm": 1.861293839223179,
"learning_rate": 5.418710137060338e-08,
"loss": 2.0783,
"step": 2200
},
{
"epoch": 2.1153846153846154,
"eval_loss": 2.3356776237487793,
"eval_runtime": 85.3872,
"eval_samples_per_second": 86.605,
"eval_steps_per_second": 0.679,
"step": 2200
},
{
"epoch": 2.1201923076923075,
"grad_norm": 1.8572146395283573,
"learning_rate": 5.4071559032788445e-08,
"loss": 2.026,
"step": 2205
},
{
"epoch": 2.125,
"grad_norm": 1.8919061510828592,
"learning_rate": 5.395892252184894e-08,
"loss": 2.0538,
"step": 2210
},
{
"epoch": 2.1298076923076925,
"grad_norm": 1.9423965048231926,
"learning_rate": 5.384912589956864e-08,
"loss": 2.1354,
"step": 2215
},
{
"epoch": 2.1346153846153846,
"grad_norm": 1.86358642820622,
"learning_rate": 5.37421045611031e-08,
"loss": 2.0615,
"step": 2220
},
{
"epoch": 2.139423076923077,
"grad_norm": 1.9498064656844925,
"learning_rate": 5.363779521132732e-08,
"loss": 2.1152,
"step": 2225
},
{
"epoch": 2.144230769230769,
"grad_norm": 1.838720387490978,
"learning_rate": 5.353613584154386e-08,
"loss": 2.0802,
"step": 2230
},
{
"epoch": 2.1490384615384617,
"grad_norm": 1.8736999627632185,
"learning_rate": 5.3437065706546936e-08,
"loss": 2.0794,
"step": 2235
},
{
"epoch": 2.1538461538461537,
"grad_norm": 1.8185612650303689,
"learning_rate": 5.334052530203788e-08,
"loss": 2.0371,
"step": 2240
},
{
"epoch": 2.1586538461538463,
"grad_norm": 1.9598826857016363,
"learning_rate": 5.3246456342387584e-08,
"loss": 2.142,
"step": 2245
},
{
"epoch": 2.1634615384615383,
"grad_norm": 1.8852398707927738,
"learning_rate": 5.315480173874134e-08,
"loss": 2.0632,
"step": 2250
},
{
"epoch": 2.168269230769231,
"grad_norm": 1.8471328295295872,
"learning_rate": 5.306550557746175e-08,
"loss": 2.1116,
"step": 2255
},
{
"epoch": 2.173076923076923,
"grad_norm": 1.8068482718199097,
"learning_rate": 5.297851309890534e-08,
"loss": 2.0509,
"step": 2260
},
{
"epoch": 2.1778846153846154,
"grad_norm": 1.9264454870094807,
"learning_rate": 5.2893770676528514e-08,
"loss": 2.1262,
"step": 2265
},
{
"epoch": 2.1826923076923075,
"grad_norm": 1.8408137576329833,
"learning_rate": 5.281122579631865e-08,
"loss": 2.0472,
"step": 2270
},
{
"epoch": 2.1875,
"grad_norm": 1.821289584580464,
"learning_rate": 5.273082703654604e-08,
"loss": 2.1308,
"step": 2275
},
{
"epoch": 2.1923076923076925,
"grad_norm": 1.856905589818333,
"learning_rate": 5.265252404783256e-08,
"loss": 2.1068,
"step": 2280
},
{
"epoch": 2.1971153846153846,
"grad_norm": 1.8604589823269795,
"learning_rate": 5.257626753353287e-08,
"loss": 2.0947,
"step": 2285
},
{
"epoch": 2.201923076923077,
"grad_norm": 1.8525412113722146,
"learning_rate": 5.250200923042405e-08,
"loss": 2.104,
"step": 2290
},
{
"epoch": 2.206730769230769,
"grad_norm": 1.851550872426419,
"learning_rate": 5.242970188969973e-08,
"loss": 2.1139,
"step": 2295
},
{
"epoch": 2.2115384615384617,
"grad_norm": 1.8371736291077507,
"learning_rate": 5.2359299258264526e-08,
"loss": 2.1049,
"step": 2300
},
{
"epoch": 2.2163461538461537,
"grad_norm": 1.8854850811887058,
"learning_rate": 5.229075606032495e-08,
"loss": 2.0936,
"step": 2305
},
{
"epoch": 2.2211538461538463,
"grad_norm": 1.8111275047358883,
"learning_rate": 5.222402797927284e-08,
"loss": 2.0958,
"step": 2310
},
{
"epoch": 2.2259615384615383,
"grad_norm": 1.9091134111717707,
"learning_rate": 5.2159071639857394e-08,
"loss": 2.0999,
"step": 2315
},
{
"epoch": 2.230769230769231,
"grad_norm": 1.8879383298945882,
"learning_rate": 5.209584459064199e-08,
"loss": 2.1623,
"step": 2320
},
{
"epoch": 2.235576923076923,
"grad_norm": 37.03097635246021,
"learning_rate": 5.2034305286741963e-08,
"loss": 2.135,
"step": 2325
},
{
"epoch": 2.2403846153846154,
"grad_norm": 1.870738678414933,
"learning_rate": 5.197441307283966e-08,
"loss": 2.118,
"step": 2330
},
{
"epoch": 2.2451923076923075,
"grad_norm": 1.8528184603825324,
"learning_rate": 5.191612816647293e-08,
"loss": 2.1268,
"step": 2335
},
{
"epoch": 2.25,
"grad_norm": 1.9400695194615212,
"learning_rate": 5.185941164159351e-08,
"loss": 2.076,
"step": 2340
},
{
"epoch": 2.2548076923076925,
"grad_norm": 1.9062576912141294,
"learning_rate": 5.180422541239147e-08,
"loss": 2.1306,
"step": 2345
},
{
"epoch": 2.2596153846153846,
"grad_norm": 1.9730673873781654,
"learning_rate": 5.175053221738239e-08,
"loss": 2.104,
"step": 2350
},
{
"epoch": 2.264423076923077,
"grad_norm": 1.8371019460322038,
"learning_rate": 5.169829560375344e-08,
"loss": 2.0874,
"step": 2355
},
{
"epoch": 2.269230769230769,
"grad_norm": 1.874231056452069,
"learning_rate": 5.164747991196499e-08,
"loss": 2.0847,
"step": 2360
},
{
"epoch": 2.2740384615384617,
"grad_norm": 1.8794376823061034,
"learning_rate": 5.159805026060424e-08,
"loss": 2.0682,
"step": 2365
},
{
"epoch": 2.2788461538461537,
"grad_norm": 1.8255930007868693,
"learning_rate": 5.15499725314874e-08,
"loss": 2.0599,
"step": 2370
},
{
"epoch": 2.2836538461538463,
"grad_norm": 2.0171761498440333,
"learning_rate": 5.150321335500705e-08,
"loss": 2.0613,
"step": 2375
},
{
"epoch": 2.2884615384615383,
"grad_norm": 1.888512163517087,
"learning_rate": 5.145774009572124e-08,
"loss": 2.0746,
"step": 2380
},
{
"epoch": 2.293269230769231,
"grad_norm": 1.963864155096598,
"learning_rate": 5.141352083818108e-08,
"loss": 2.0992,
"step": 2385
},
{
"epoch": 2.298076923076923,
"grad_norm": 1.887413641506116,
"learning_rate": 5.1370524372993444e-08,
"loss": 2.0665,
"step": 2390
},
{
"epoch": 2.3028846153846154,
"grad_norm": 1.8425396594889334,
"learning_rate": 5.132872018311563e-08,
"loss": 2.0938,
"step": 2395
},
{
"epoch": 2.3076923076923075,
"grad_norm": 1.8343062688513765,
"learning_rate": 5.128807843037861e-08,
"loss": 2.0952,
"step": 2400
},
{
"epoch": 2.3076923076923075,
"eval_loss": 2.3359732627868652,
"eval_runtime": 85.421,
"eval_samples_per_second": 86.571,
"eval_steps_per_second": 0.679,
"step": 2400
},
{
"epoch": 2.3125,
"grad_norm": 1.8257992505700218,
"learning_rate": 5.1248569942235814e-08,
"loss": 2.0523,
"step": 2405
},
{
"epoch": 2.3173076923076925,
"grad_norm": 1.8895070139431327,
"learning_rate": 5.1210166198734225e-08,
"loss": 2.0834,
"step": 2410
},
{
"epoch": 2.3221153846153846,
"grad_norm": 1.9125461978695824,
"learning_rate": 5.117283931970468e-08,
"loss": 2.1017,
"step": 2415
},
{
"epoch": 2.326923076923077,
"grad_norm": 1.9275823669446988,
"learning_rate": 5.113656205216831e-08,
"loss": 2.1226,
"step": 2420
},
{
"epoch": 2.331730769230769,
"grad_norm": 1.889535416833256,
"learning_rate": 5.1101307757956035e-08,
"loss": 2.0764,
"step": 2425
},
{
"epoch": 2.3365384615384617,
"grad_norm": 1.8514556811164167,
"learning_rate": 5.106705040153818e-08,
"loss": 1.9975,
"step": 2430
},
{
"epoch": 2.3413461538461537,
"grad_norm": 1.958278628755969,
"learning_rate": 5.103376453806111e-08,
"loss": 2.1202,
"step": 2435
},
{
"epoch": 2.3461538461538463,
"grad_norm": 1.910793379676731,
"learning_rate": 5.100142530158806e-08,
"loss": 2.1254,
"step": 2440
},
{
"epoch": 2.3509615384615383,
"grad_norm": 2.2904582126799875,
"learning_rate": 5.0970008393541184e-08,
"loss": 2.0487,
"step": 2445
},
{
"epoch": 2.355769230769231,
"grad_norm": 1.928870195572868,
"learning_rate": 5.093949007134195e-08,
"loss": 2.0428,
"step": 2450
},
{
"epoch": 2.360576923076923,
"grad_norm": 1.9109302889112307,
"learning_rate": 5.090984713724707e-08,
"loss": 2.1073,
"step": 2455
},
{
"epoch": 2.3653846153846154,
"grad_norm": 1.8446780789197135,
"learning_rate": 5.0881056927377075e-08,
"loss": 2.1346,
"step": 2460
},
{
"epoch": 2.3701923076923075,
"grad_norm": 1.9119026418605038,
"learning_rate": 5.0853097300934865e-08,
"loss": 2.0757,
"step": 2465
},
{
"epoch": 2.375,
"grad_norm": 1.952480119894523,
"learning_rate": 5.082594662961142e-08,
"loss": 2.0955,
"step": 2470
},
{
"epoch": 2.3798076923076925,
"grad_norm": 1.9160233774476225,
"learning_rate": 5.0799583787175916e-08,
"loss": 2.094,
"step": 2475
},
{
"epoch": 2.3846153846153846,
"grad_norm": 1.8139526421054863,
"learning_rate": 5.07739881392477e-08,
"loss": 2.0905,
"step": 2480
},
{
"epoch": 2.389423076923077,
"grad_norm": 1.8207559563475217,
"learning_rate": 5.074913953324727e-08,
"loss": 2.0863,
"step": 2485
},
{
"epoch": 2.394230769230769,
"grad_norm": 1.8507805248963738,
"learning_rate": 5.0725018288523865e-08,
"loss": 2.0771,
"step": 2490
},
{
"epoch": 2.3990384615384617,
"grad_norm": 1.8116379225558112,
"learning_rate": 5.0701605186656875e-08,
"loss": 2.063,
"step": 2495
},
{
"epoch": 2.4038461538461537,
"grad_norm": 1.8790784349307603,
"learning_rate": 5.067888146192865e-08,
"loss": 2.0535,
"step": 2500
},
{
"epoch": 2.4086538461538463,
"grad_norm": 1.8572351494806207,
"learning_rate": 5.06568287919661e-08,
"loss": 2.0588,
"step": 2505
},
{
"epoch": 2.4134615384615383,
"grad_norm": 1.7890661820190739,
"learning_rate": 5.063542928854859e-08,
"loss": 2.0719,
"step": 2510
},
{
"epoch": 2.418269230769231,
"grad_norm": 1.780938750209951,
"learning_rate": 5.061466548857974e-08,
"loss": 2.1399,
"step": 2515
},
{
"epoch": 2.423076923076923,
"grad_norm": 1.864652061283046,
"learning_rate": 5.059452034522056e-08,
"loss": 2.0946,
"step": 2520
},
{
"epoch": 2.4278846153846154,
"grad_norm": 1.8661367735575938,
"learning_rate": 5.057497721918164e-08,
"loss": 2.0811,
"step": 2525
},
{
"epoch": 2.4326923076923075,
"grad_norm": 1.7957946183317377,
"learning_rate": 5.055601987017185e-08,
"loss": 2.0997,
"step": 2530
},
{
"epoch": 2.4375,
"grad_norm": 1.8001974731925174,
"learning_rate": 5.053763244850147e-08,
"loss": 2.1219,
"step": 2535
},
{
"epoch": 2.4423076923076925,
"grad_norm": 1.8983691367559397,
"learning_rate": 5.0519799486837034e-08,
"loss": 2.1097,
"step": 2540
},
{
"epoch": 2.4471153846153846,
"grad_norm": 1.905238107904784,
"learning_rate": 5.050250589210597e-08,
"loss": 2.0688,
"step": 2545
},
{
"epoch": 2.451923076923077,
"grad_norm": 1.825345955550652,
"learning_rate": 5.048573693754852e-08,
"loss": 2.0937,
"step": 2550
},
{
"epoch": 2.456730769230769,
"grad_norm": 1.855436622240645,
"learning_rate": 5.0469478254914804e-08,
"loss": 2.1167,
"step": 2555
},
{
"epoch": 2.4615384615384617,
"grad_norm": 1.8976603753246268,
"learning_rate": 5.04537158268048e-08,
"loss": 2.0693,
"step": 2560
},
{
"epoch": 2.4663461538461537,
"grad_norm": 1.9048063196287657,
"learning_rate": 5.043843597914902e-08,
"loss": 2.0695,
"step": 2565
},
{
"epoch": 2.4711538461538463,
"grad_norm": 1.8780277621645116,
"learning_rate": 5.042362537382771e-08,
"loss": 2.0692,
"step": 2570
},
{
"epoch": 2.4759615384615383,
"grad_norm": 1.7927549821388442,
"learning_rate": 5.040927100142658e-08,
"loss": 2.0756,
"step": 2575
},
{
"epoch": 2.480769230769231,
"grad_norm": 1.9065399802572085,
"learning_rate": 5.03953601741267e-08,
"loss": 2.0273,
"step": 2580
},
{
"epoch": 2.485576923076923,
"grad_norm": 1.8711481004226065,
"learning_rate": 5.0381880518726784e-08,
"loss": 2.1434,
"step": 2585
},
{
"epoch": 2.4903846153846154,
"grad_norm": 1.8706391357800631,
"learning_rate": 5.03688199697955e-08,
"loss": 2.1032,
"step": 2590
},
{
"epoch": 2.4951923076923075,
"grad_norm": 1.9079920113146567,
"learning_rate": 5.0356166762952054e-08,
"loss": 2.0575,
"step": 2595
},
{
"epoch": 2.5,
"grad_norm": 1.8325624675703904,
"learning_rate": 5.0343909428272807e-08,
"loss": 2.1009,
"step": 2600
},
{
"epoch": 2.5,
"eval_loss": 2.3360962867736816,
"eval_runtime": 85.4584,
"eval_samples_per_second": 86.533,
"eval_steps_per_second": 0.679,
"step": 2600
},
{
"epoch": 2.5048076923076925,
"grad_norm": 1.9117983598651567,
"learning_rate": 5.033203678382215e-08,
"loss": 2.1034,
"step": 2605
},
{
"epoch": 2.5096153846153846,
"grad_norm": 1.8482924541401045,
"learning_rate": 5.032053792930553e-08,
"loss": 2.0938,
"step": 2610
},
{
"epoch": 2.5144230769230766,
"grad_norm": 1.8309284870035238,
"learning_rate": 5.030940223984276e-08,
"loss": 2.0545,
"step": 2615
},
{
"epoch": 2.519230769230769,
"grad_norm": 1.887238798925063,
"learning_rate": 5.0298619359859705e-08,
"loss": 2.0947,
"step": 2620
},
{
"epoch": 2.5240384615384617,
"grad_norm": 1.8229917506754332,
"learning_rate": 5.0288179197096475e-08,
"loss": 2.1367,
"step": 2625
},
{
"epoch": 2.5288461538461537,
"grad_norm": 1.8745480293774028,
"learning_rate": 5.027807191673022e-08,
"loss": 2.1263,
"step": 2630
},
{
"epoch": 2.5336538461538463,
"grad_norm": 1.8565511172706295,
"learning_rate": 5.026828793561077e-08,
"loss": 2.069,
"step": 2635
},
{
"epoch": 2.5384615384615383,
"grad_norm": 1.8435366151404853,
"learning_rate": 5.0258817916607186e-08,
"loss": 2.0715,
"step": 2640
},
{
"epoch": 2.543269230769231,
"grad_norm": 1.82801282007265,
"learning_rate": 5.024965276306364e-08,
"loss": 2.1124,
"step": 2645
},
{
"epoch": 2.5480769230769234,
"grad_norm": 1.871706442781542,
"learning_rate": 5.02407836133626e-08,
"loss": 2.0849,
"step": 2650
},
{
"epoch": 2.5528846153846154,
"grad_norm": 1.8633902158148148,
"learning_rate": 5.02322018355938e-08,
"loss": 2.0835,
"step": 2655
},
{
"epoch": 2.5576923076923075,
"grad_norm": 1.8664407309122704,
"learning_rate": 5.022389902232716e-08,
"loss": 2.058,
"step": 2660
},
{
"epoch": 2.5625,
"grad_norm": 1.8241814220396138,
"learning_rate": 5.0215866985488015e-08,
"loss": 2.1001,
"step": 2665
},
{
"epoch": 2.5673076923076925,
"grad_norm": 1.8728742912893366,
"learning_rate": 5.020809775133292e-08,
"loss": 2.0782,
"step": 2670
},
{
"epoch": 2.5721153846153846,
"grad_norm": 1.836951128615928,
"learning_rate": 5.020058355552443e-08,
"loss": 2.032,
"step": 2675
},
{
"epoch": 2.5769230769230766,
"grad_norm": 1.8159474479645261,
"learning_rate": 5.019331683830326e-08,
"loss": 2.0842,
"step": 2680
},
{
"epoch": 2.581730769230769,
"grad_norm": 1.8210257982061508,
"learning_rate": 5.018629023975606e-08,
"loss": 2.1517,
"step": 2685
},
{
"epoch": 2.5865384615384617,
"grad_norm": 1.8501212045264834,
"learning_rate": 5.0179496595177436e-08,
"loss": 2.0773,
"step": 2690
},
{
"epoch": 2.5913461538461537,
"grad_norm": 1.882222780292571,
"learning_rate": 5.017292893052448e-08,
"loss": 2.0555,
"step": 2695
},
{
"epoch": 2.5961538461538463,
"grad_norm": 1.843070652377049,
"learning_rate": 5.0166580457962346e-08,
"loss": 2.0461,
"step": 2700
},
{
"epoch": 2.6009615384615383,
"grad_norm": 1.847536413092705,
"learning_rate": 5.0160444571499293e-08,
"loss": 2.1485,
"step": 2705
},
{
"epoch": 2.605769230769231,
"grad_norm": 1.8266553603942388,
"learning_rate": 5.0154514842709816e-08,
"loss": 2.0737,
"step": 2710
},
{
"epoch": 2.6105769230769234,
"grad_norm": 1.9237223597123432,
"learning_rate": 5.014878501654416e-08,
"loss": 2.0757,
"step": 2715
},
{
"epoch": 2.6153846153846154,
"grad_norm": 1.8948119829446708,
"learning_rate": 5.0143249007222985e-08,
"loss": 2.1339,
"step": 2720
},
{
"epoch": 2.6201923076923075,
"grad_norm": 1.8301707716670057,
"learning_rate": 5.013790089421563e-08,
"loss": 2.0548,
"step": 2725
},
{
"epoch": 2.625,
"grad_norm": 1.8663429882080074,
"learning_rate": 5.0132734918300504e-08,
"loss": 2.1375,
"step": 2730
},
{
"epoch": 2.6298076923076925,
"grad_norm": 1.942647379328917,
"learning_rate": 5.012774547770629e-08,
"loss": 2.1396,
"step": 2735
},
{
"epoch": 2.6346153846153846,
"grad_norm": 1.8441092861484971,
"learning_rate": 5.012292712433258e-08,
"loss": 2.0696,
"step": 2740
},
{
"epoch": 2.6394230769230766,
"grad_norm": 1.9320657665881027,
"learning_rate": 5.011827456004847e-08,
"loss": 2.1119,
"step": 2745
},
{
"epoch": 2.644230769230769,
"grad_norm": 1.8427805768866328,
"learning_rate": 5.0113782633067863e-08,
"loss": 2.084,
"step": 2750
},
{
"epoch": 2.6490384615384617,
"grad_norm": 1.8440694033677212,
"learning_rate": 5.0109446334400176e-08,
"loss": 2.0882,
"step": 2755
},
{
"epoch": 2.6538461538461537,
"grad_norm": 1.893152979504229,
"learning_rate": 5.010526079437498e-08,
"loss": 2.1043,
"step": 2760
},
{
"epoch": 2.6586538461538463,
"grad_norm": 1.9949218255548784,
"learning_rate": 5.010122127923951e-08,
"loss": 2.1103,
"step": 2765
},
{
"epoch": 2.6634615384615383,
"grad_norm": 1.8456542683339325,
"learning_rate": 5.0097323187827586e-08,
"loss": 2.0738,
"step": 2770
},
{
"epoch": 2.668269230769231,
"grad_norm": 1.8984568625826008,
"learning_rate": 5.009356204829874e-08,
"loss": 2.0612,
"step": 2775
},
{
"epoch": 2.6730769230769234,
"grad_norm": 1.8703440919228778,
"learning_rate": 5.008993351494639e-08,
"loss": 2.1919,
"step": 2780
},
{
"epoch": 2.6778846153846154,
"grad_norm": 1.9243113440055457,
"learning_rate": 5.008643336507372e-08,
"loss": 2.0829,
"step": 2785
},
{
"epoch": 2.6826923076923075,
"grad_norm": 1.834031155910534,
"learning_rate": 5.0083057495936144e-08,
"loss": 2.0647,
"step": 2790
},
{
"epoch": 2.6875,
"grad_norm": 2.0300087855547897,
"learning_rate": 5.0079801921749176e-08,
"loss": 2.0993,
"step": 2795
},
{
"epoch": 2.6923076923076925,
"grad_norm": 1.8096967426995145,
"learning_rate": 5.007666277076042e-08,
"loss": 2.125,
"step": 2800
},
{
"epoch": 2.6923076923076925,
"eval_loss": 2.3360354900360107,
"eval_runtime": 85.4625,
"eval_samples_per_second": 86.529,
"eval_steps_per_second": 0.679,
"step": 2800
},
{
"epoch": 2.6971153846153846,
"grad_norm": 1.863239316605401,
"learning_rate": 5.0073636282384696e-08,
"loss": 2.1135,
"step": 2805
},
{
"epoch": 2.7019230769230766,
"grad_norm": 1.9593347265344716,
"learning_rate": 5.007071880440107e-08,
"loss": 2.087,
"step": 2810
},
{
"epoch": 2.706730769230769,
"grad_norm": 1.8698219251596924,
"learning_rate": 5.006790679021062e-08,
"loss": 2.1106,
"step": 2815
},
{
"epoch": 2.7115384615384617,
"grad_norm": 1.9096265265503567,
"learning_rate": 5.006519679615399e-08,
"loss": 2.1065,
"step": 2820
},
{
"epoch": 2.7163461538461537,
"grad_norm": 1.8385721642634492,
"learning_rate": 5.0062585478887454e-08,
"loss": 2.1307,
"step": 2825
},
{
"epoch": 2.7211538461538463,
"grad_norm": 2.045452351348729,
"learning_rate": 5.006006959281663e-08,
"loss": 2.0573,
"step": 2830
},
{
"epoch": 2.7259615384615383,
"grad_norm": 1.8727571024658705,
"learning_rate": 5.005764598758657e-08,
"loss": 2.1193,
"step": 2835
},
{
"epoch": 2.730769230769231,
"grad_norm": 1.9077767348853074,
"learning_rate": 5.005531160562734e-08,
"loss": 2.1097,
"step": 2840
},
{
"epoch": 2.7355769230769234,
"grad_norm": 1.8266187984214344,
"learning_rate": 5.005306347975403e-08,
"loss": 2.0879,
"step": 2845
},
{
"epoch": 2.7403846153846154,
"grad_norm": 1.9460294408394188,
"learning_rate": 5.0050898730820176e-08,
"loss": 2.0667,
"step": 2850
},
{
"epoch": 2.7451923076923075,
"grad_norm": 1.8751685321455078,
"learning_rate": 5.0048814565423524e-08,
"loss": 2.1122,
"step": 2855
},
{
"epoch": 2.75,
"grad_norm": 1.8138239598798986,
"learning_rate": 5.004680827366333e-08,
"loss": 2.0571,
"step": 2860
},
{
"epoch": 2.7548076923076925,
"grad_norm": 1.9103749761871995,
"learning_rate": 5.0044877226948085e-08,
"loss": 2.0773,
"step": 2865
},
{
"epoch": 2.7596153846153846,
"grad_norm": 1.8517186742525418,
"learning_rate": 5.004301887585273e-08,
"loss": 2.0633,
"step": 2870
},
{
"epoch": 2.7644230769230766,
"grad_norm": 1.8277041575262993,
"learning_rate": 5.0041230748024515e-08,
"loss": 2.0995,
"step": 2875
},
{
"epoch": 2.769230769230769,
"grad_norm": 1.8783284685972508,
"learning_rate": 5.0039510446136475e-08,
"loss": 2.0799,
"step": 2880
},
{
"epoch": 2.7740384615384617,
"grad_norm": 1.8214139607696012,
"learning_rate": 5.00378556458877e-08,
"loss": 2.1185,
"step": 2885
},
{
"epoch": 2.7788461538461537,
"grad_norm": 1.754546607125489,
"learning_rate": 5.0036264094049414e-08,
"loss": 2.1165,
"step": 2890
},
{
"epoch": 2.7836538461538463,
"grad_norm": 1.8605888233369712,
"learning_rate": 5.0034733606556126e-08,
"loss": 2.0909,
"step": 2895
},
{
"epoch": 2.7884615384615383,
"grad_norm": 1.903011452864366,
"learning_rate": 5.003326206664078e-08,
"loss": 2.0946,
"step": 2900
},
{
"epoch": 2.793269230769231,
"grad_norm": 1.7737987493209635,
"learning_rate": 5.003184742301327e-08,
"loss": 2.108,
"step": 2905
},
{
"epoch": 2.7980769230769234,
"grad_norm": 1.8885111840024975,
"learning_rate": 5.0030487688081324e-08,
"loss": 2.0753,
"step": 2910
},
{
"epoch": 2.8028846153846154,
"grad_norm": 1.8832929741438638,
"learning_rate": 5.002918093621301e-08,
"loss": 2.0825,
"step": 2915
},
{
"epoch": 2.8076923076923075,
"grad_norm": 1.8972739478097906,
"learning_rate": 5.0027925302039994e-08,
"loss": 2.1004,
"step": 2920
},
{
"epoch": 2.8125,
"grad_norm": 1.8077990099256764,
"learning_rate": 5.002671897880082e-08,
"loss": 2.0858,
"step": 2925
},
{
"epoch": 2.8173076923076925,
"grad_norm": 1.8611265826571517,
"learning_rate": 5.002556021672335e-08,
"loss": 2.0735,
"step": 2930
},
{
"epoch": 2.8221153846153846,
"grad_norm": 1.9313284111744764,
"learning_rate": 5.002444732144568e-08,
"loss": 2.1131,
"step": 2935
},
{
"epoch": 2.8269230769230766,
"grad_norm": 1.8676490764521987,
"learning_rate": 5.00233786524746e-08,
"loss": 2.1365,
"step": 2940
},
{
"epoch": 2.831730769230769,
"grad_norm": 1.8494289564318631,
"learning_rate": 5.002235262168107e-08,
"loss": 2.1757,
"step": 2945
},
{
"epoch": 2.8365384615384617,
"grad_norm": 1.85497440355638,
"learning_rate": 5.0021367691831825e-08,
"loss": 2.1242,
"step": 2950
},
{
"epoch": 2.8413461538461537,
"grad_norm": 1.8486274892842425,
"learning_rate": 5.002042237515639e-08,
"loss": 2.1245,
"step": 2955
},
{
"epoch": 2.8461538461538463,
"grad_norm": 1.895043426117041,
"learning_rate": 5.001951523194882e-08,
"loss": 2.0803,
"step": 2960
},
{
"epoch": 2.8509615384615383,
"grad_norm": 1.874846017392855,
"learning_rate": 5.001864486920352e-08,
"loss": 2.1229,
"step": 2965
},
{
"epoch": 2.855769230769231,
"grad_norm": 1.8257810113586723,
"learning_rate": 5.001780993928431e-08,
"loss": 2.0623,
"step": 2970
},
{
"epoch": 2.8605769230769234,
"grad_norm": 2.0410507440850743,
"learning_rate": 5.0017009138626176e-08,
"loss": 2.1375,
"step": 2975
},
{
"epoch": 2.8653846153846154,
"grad_norm": 1.8536732613204967,
"learning_rate": 5.001624120646899e-08,
"loss": 2.1198,
"step": 2980
},
{
"epoch": 2.8701923076923075,
"grad_norm": 1.8420057076108896,
"learning_rate": 5.0015504923622523e-08,
"loss": 2.0588,
"step": 2985
},
{
"epoch": 2.875,
"grad_norm": 2.06664054369849,
"learning_rate": 5.0014799111262185e-08,
"loss": 2.065,
"step": 2990
},
{
"epoch": 2.8798076923076925,
"grad_norm": 1.8942959478783434,
"learning_rate": 5.001412262975472e-08,
"loss": 2.0928,
"step": 2995
},
{
"epoch": 2.8846153846153846,
"grad_norm": 1.9095141517679362,
"learning_rate": 5.0013474377513345e-08,
"loss": 2.1206,
"step": 3000
},
{
"epoch": 2.8846153846153846,
"eval_loss": 2.335968494415283,
"eval_runtime": 85.3698,
"eval_samples_per_second": 86.623,
"eval_steps_per_second": 0.679,
"step": 3000
},
{
"epoch": 2.8894230769230766,
"grad_norm": 1.8262058020984504,
"learning_rate": 5.001285328988167e-08,
"loss": 2.095,
"step": 3005
},
{
"epoch": 2.894230769230769,
"grad_norm": 1.8525491687163678,
"learning_rate": 5.0012258338045814e-08,
"loss": 2.0854,
"step": 3010
},
{
"epoch": 2.8990384615384617,
"grad_norm": 1.876102814594601,
"learning_rate": 5.001168852797407e-08,
"loss": 2.0836,
"step": 3015
},
{
"epoch": 2.9038461538461537,
"grad_norm": 1.8864256560953125,
"learning_rate": 5.0011142899383596e-08,
"loss": 2.1177,
"step": 3020
},
{
"epoch": 2.9086538461538463,
"grad_norm": 1.8543259178498985,
"learning_rate": 5.001062052473354e-08,
"loss": 2.0708,
"step": 3025
},
{
"epoch": 2.9134615384615383,
"grad_norm": 1.8468081058935386,
"learning_rate": 5.0010120508243996e-08,
"loss": 2.0649,
"step": 3030
},
{
"epoch": 2.918269230769231,
"grad_norm": 1.870394880857915,
"learning_rate": 5.000964198494029e-08,
"loss": 2.0948,
"step": 3035
},
{
"epoch": 2.9230769230769234,
"grad_norm": 1.8291813927626337,
"learning_rate": 5.000918411972201e-08,
"loss": 2.0571,
"step": 3040
},
{
"epoch": 2.9278846153846154,
"grad_norm": 1.8345615836931617,
"learning_rate": 5.000874610645626e-08,
"loss": 2.0843,
"step": 3045
},
{
"epoch": 2.9326923076923075,
"grad_norm": 1.784288247829563,
"learning_rate": 5.000832716709459e-08,
"loss": 2.088,
"step": 3050
},
{
"epoch": 2.9375,
"grad_norm": 1.8828904166386582,
"learning_rate": 5.000792655081313e-08,
"loss": 2.1294,
"step": 3055
},
{
"epoch": 2.9423076923076925,
"grad_norm": 1.876834782651868,
"learning_rate": 5.00075435331754e-08,
"loss": 2.0835,
"step": 3060
},
{
"epoch": 2.9471153846153846,
"grad_norm": 1.7891832679275306,
"learning_rate": 5.000717741531722e-08,
"loss": 2.0758,
"step": 3065
},
{
"epoch": 2.9519230769230766,
"grad_norm": 1.9834817400632345,
"learning_rate": 5.000682752315336e-08,
"loss": 2.1172,
"step": 3070
},
{
"epoch": 2.956730769230769,
"grad_norm": 2.01686543949811,
"learning_rate": 5.000649320660537e-08,
"loss": 2.129,
"step": 3075
},
{
"epoch": 2.9615384615384617,
"grad_norm": 1.882159640395084,
"learning_rate": 5.0006173838850096e-08,
"loss": 2.0194,
"step": 3080
},
{
"epoch": 2.9663461538461537,
"grad_norm": 1.8632173120315059,
"learning_rate": 5.0005868815588486e-08,
"loss": 2.0399,
"step": 3085
},
{
"epoch": 2.9711538461538463,
"grad_norm": 1.899662124124679,
"learning_rate": 5.000557755433416e-08,
"loss": 2.0669,
"step": 3090
},
{
"epoch": 2.9759615384615383,
"grad_norm": 1.9288229898878364,
"learning_rate": 5.0005299493721366e-08,
"loss": 2.0695,
"step": 3095
},
{
"epoch": 2.980769230769231,
"grad_norm": 1.9430306138069855,
"learning_rate": 5.000503409283182e-08,
"loss": 2.0771,
"step": 3100
},
{
"epoch": 2.9855769230769234,
"grad_norm": 1.8642254344339084,
"learning_rate": 5.0004780830540004e-08,
"loss": 2.067,
"step": 3105
},
{
"epoch": 2.9903846153846154,
"grad_norm": 1.843625830841223,
"learning_rate": 5.0004539204876536e-08,
"loss": 2.0557,
"step": 3110
},
{
"epoch": 2.9951923076923075,
"grad_norm": 1.905040671688552,
"learning_rate": 5.000430873240919e-08,
"loss": 2.1085,
"step": 3115
},
{
"epoch": 3.0,
"grad_norm": 1.9724597892841456,
"learning_rate": 5.000408894764108e-08,
"loss": 2.1109,
"step": 3120
},
{
"epoch": 3.0048076923076925,
"grad_norm": 1.930998832905121,
"learning_rate": 5.0003879402425764e-08,
"loss": 2.1045,
"step": 3125
},
{
"epoch": 3.0096153846153846,
"grad_norm": 1.906832567119333,
"learning_rate": 5.0003679665398665e-08,
"loss": 2.0992,
"step": 3130
},
{
"epoch": 3.014423076923077,
"grad_norm": 1.880028734755099,
"learning_rate": 5.000348932142462e-08,
"loss": 2.0536,
"step": 3135
},
{
"epoch": 3.019230769230769,
"grad_norm": 1.8234161328010858,
"learning_rate": 5.000330797106105e-08,
"loss": 2.0425,
"step": 3140
},
{
"epoch": 3.0240384615384617,
"grad_norm": 1.9060969026597896,
"learning_rate": 5.000313523003646e-08,
"loss": 2.0724,
"step": 3145
},
{
"epoch": 3.0288461538461537,
"grad_norm": 1.9314817600599008,
"learning_rate": 5.000297072874381e-08,
"loss": 2.0856,
"step": 3150
},
{
"epoch": 3.0336538461538463,
"grad_norm": 2.205865819233671,
"learning_rate": 5.0002814111748496e-08,
"loss": 2.0542,
"step": 3155
},
{
"epoch": 3.0384615384615383,
"grad_norm": 1.9034298586292828,
"learning_rate": 5.000266503731057e-08,
"loss": 2.1181,
"step": 3160
},
{
"epoch": 3.043269230769231,
"grad_norm": 1.9630469467362441,
"learning_rate": 5.0002523176920756e-08,
"loss": 2.0769,
"step": 3165
},
{
"epoch": 3.048076923076923,
"grad_norm": 1.8387471826204973,
"learning_rate": 5.0002388214850104e-08,
"loss": 2.0357,
"step": 3170
},
{
"epoch": 3.0528846153846154,
"grad_norm": 1.8580705264609298,
"learning_rate": 5.000225984771277e-08,
"loss": 2.1436,
"step": 3175
},
{
"epoch": 3.0576923076923075,
"grad_norm": 1.8937514188796711,
"learning_rate": 5.0002137784041715e-08,
"loss": 2.0621,
"step": 3180
},
{
"epoch": 3.0625,
"grad_norm": 1.8887722007611465,
"learning_rate": 5.0002021743876964e-08,
"loss": 2.1001,
"step": 3185
},
{
"epoch": 3.0673076923076925,
"grad_norm": 2.058985773940214,
"learning_rate": 5.0001911458366104e-08,
"loss": 2.0544,
"step": 3190
},
{
"epoch": 3.0721153846153846,
"grad_norm": 1.8613730424507313,
"learning_rate": 5.000180666937676e-08,
"loss": 2.0672,
"step": 3195
},
{
"epoch": 3.076923076923077,
"grad_norm": 1.883209445623825,
"learning_rate": 5.0001707129120686e-08,
"loss": 2.0593,
"step": 3200
},
{
"epoch": 3.076923076923077,
"eval_loss": 2.336284875869751,
"eval_runtime": 85.4905,
"eval_samples_per_second": 86.501,
"eval_steps_per_second": 0.678,
"step": 3200
},
{
"epoch": 3.081730769230769,
"grad_norm": 1.800038407134164,
"learning_rate": 5.000161259978923e-08,
"loss": 2.1135,
"step": 3205
},
{
"epoch": 3.0865384615384617,
"grad_norm": 1.9214263061349197,
"learning_rate": 5.0001522853199856e-08,
"loss": 2.0604,
"step": 3210
},
{
"epoch": 3.0913461538461537,
"grad_norm": 1.7946344678576902,
"learning_rate": 5.000143767045347e-08,
"loss": 2.0379,
"step": 3215
},
{
"epoch": 3.0961538461538463,
"grad_norm": 1.9345308159393109,
"learning_rate": 5.000135684160221e-08,
"loss": 2.1086,
"step": 3220
},
{
"epoch": 3.1009615384615383,
"grad_norm": 1.9155941341236926,
"learning_rate": 5.000128016532757e-08,
"loss": 2.1086,
"step": 3225
},
{
"epoch": 3.105769230769231,
"grad_norm": 1.8746401629643195,
"learning_rate": 5.000120744862838e-08,
"loss": 2.085,
"step": 3230
},
{
"epoch": 3.110576923076923,
"grad_norm": 1.9247774915660303,
"learning_rate": 5.00011385065186e-08,
"loss": 2.1239,
"step": 3235
},
{
"epoch": 3.1153846153846154,
"grad_norm": 1.8464578404726741,
"learning_rate": 5.0001073161734515e-08,
"loss": 2.1166,
"step": 3240
},
{
"epoch": 3.1201923076923075,
"grad_norm": 1.891327266772356,
"learning_rate": 5.000101124445121e-08,
"loss": 2.0818,
"step": 3245
},
{
"epoch": 3.125,
"grad_norm": 1.859457845102101,
"learning_rate": 5.0000952592007933e-08,
"loss": 2.043,
"step": 3250
},
{
"epoch": 3.1298076923076925,
"grad_norm": 1.8626819779803672,
"learning_rate": 5.0000897048642266e-08,
"loss": 2.1099,
"step": 3255
},
{
"epoch": 3.1346153846153846,
"grad_norm": 1.848088739569789,
"learning_rate": 5.000084446523276e-08,
"loss": 2.0433,
"step": 3260
},
{
"epoch": 3.139423076923077,
"grad_norm": 1.8088561980329354,
"learning_rate": 5.0000794699049865e-08,
"loss": 2.0828,
"step": 3265
},
{
"epoch": 3.144230769230769,
"grad_norm": 1.8338377212136632,
"learning_rate": 5.000074761351487e-08,
"loss": 2.0958,
"step": 3270
},
{
"epoch": 3.1490384615384617,
"grad_norm": 1.9050955056716428,
"learning_rate": 5.000070307796674e-08,
"loss": 2.1296,
"step": 3275
},
{
"epoch": 3.1538461538461537,
"grad_norm": 1.9053203587270828,
"learning_rate": 5.0000660967436526e-08,
"loss": 2.127,
"step": 3280
},
{
"epoch": 3.1586538461538463,
"grad_norm": 1.878537794460004,
"learning_rate": 5.000062116242918e-08,
"loss": 2.1055,
"step": 3285
},
{
"epoch": 3.1634615384615383,
"grad_norm": 1.8810850477235284,
"learning_rate": 5.000058354871263e-08,
"loss": 2.087,
"step": 3290
},
{
"epoch": 3.168269230769231,
"grad_norm": 1.8129515946311003,
"learning_rate": 5.000054801711379e-08,
"loss": 2.0779,
"step": 3295
},
{
"epoch": 3.173076923076923,
"grad_norm": 2.0073035626574915,
"learning_rate": 5.0000514463321446e-08,
"loss": 2.1102,
"step": 3300
},
{
"epoch": 3.1778846153846154,
"grad_norm": 1.904610541350343,
"learning_rate": 5.000048278769574e-08,
"loss": 2.0952,
"step": 3305
},
{
"epoch": 3.1826923076923075,
"grad_norm": 1.808902174339809,
"learning_rate": 5.000045289508406e-08,
"loss": 2.0609,
"step": 3310
},
{
"epoch": 3.1875,
"grad_norm": 1.8554788011848724,
"learning_rate": 5.000042469464323e-08,
"loss": 2.0534,
"step": 3315
},
{
"epoch": 3.1923076923076925,
"grad_norm": 1.9599174090809928,
"learning_rate": 5.000039809966777e-08,
"loss": 2.0668,
"step": 3320
},
{
"epoch": 3.1971153846153846,
"grad_norm": 1.8859333707205377,
"learning_rate": 5.000037302742402e-08,
"loss": 2.073,
"step": 3325
},
{
"epoch": 3.201923076923077,
"grad_norm": 1.8053367407893148,
"learning_rate": 5.000034939899001e-08,
"loss": 2.058,
"step": 3330
},
{
"epoch": 3.206730769230769,
"grad_norm": 1.9093669207818855,
"learning_rate": 5.000032713910095e-08,
"loss": 2.0711,
"step": 3335
},
{
"epoch": 3.2115384615384617,
"grad_norm": 1.8573175727984386,
"learning_rate": 5.0000306175999996e-08,
"loss": 2.1104,
"step": 3340
},
{
"epoch": 3.2163461538461537,
"grad_norm": 1.818915273922553,
"learning_rate": 5.000028644129445e-08,
"loss": 2.0857,
"step": 3345
},
{
"epoch": 3.2211538461538463,
"grad_norm": 1.8159720078784984,
"learning_rate": 5.000026786981683e-08,
"loss": 2.0886,
"step": 3350
},
{
"epoch": 3.2259615384615383,
"grad_norm": 1.8959271365869055,
"learning_rate": 5.000025380834318e-08,
"loss": 2.1141,
"step": 3355
},
{
"epoch": 3.230769230769231,
"grad_norm": 1.8963113166938355,
"learning_rate": 5.000023717623903e-08,
"loss": 2.1259,
"step": 3360
},
{
"epoch": 3.235576923076923,
"grad_norm": 1.9029307905210568,
"learning_rate": 5.0000221540931055e-08,
"loss": 2.0854,
"step": 3365
},
{
"epoch": 3.2403846153846154,
"grad_norm": 1.838526466646601,
"learning_rate": 5.0000206848327065e-08,
"loss": 2.0741,
"step": 3370
},
{
"epoch": 3.2451923076923075,
"grad_norm": 1.8859567421929686,
"learning_rate": 5.000019304696002e-08,
"loss": 2.0582,
"step": 3375
},
{
"epoch": 3.25,
"grad_norm": 1.9217466457908856,
"learning_rate": 5.000018008787587e-08,
"loss": 2.0699,
"step": 3380
},
{
"epoch": 3.2548076923076925,
"grad_norm": 1.9074470673862487,
"learning_rate": 5.0000167924525525e-08,
"loss": 2.032,
"step": 3385
},
{
"epoch": 3.2596153846153846,
"grad_norm": 1.8425868401366883,
"learning_rate": 5.000015651266079e-08,
"loss": 2.1211,
"step": 3390
},
{
"epoch": 3.264423076923077,
"grad_norm": 1.8269121873511085,
"learning_rate": 5.00001458102343e-08,
"loss": 2.1272,
"step": 3395
},
{
"epoch": 3.269230769230769,
"grad_norm": 1.9274516851712518,
"learning_rate": 5.000013577730309e-08,
"loss": 2.0927,
"step": 3400
},
{
"epoch": 3.269230769230769,
"eval_loss": 2.3365249633789062,
"eval_runtime": 85.4018,
"eval_samples_per_second": 86.591,
"eval_steps_per_second": 0.679,
"step": 3400
},
{
"epoch": 3.2740384615384617,
"grad_norm": 1.889849662397209,
"learning_rate": 5.000012637593584e-08,
"loss": 2.0617,
"step": 3405
},
{
"epoch": 3.2788461538461537,
"grad_norm": 1.9502873503727838,
"learning_rate": 5.000011757012371e-08,
"loss": 2.1223,
"step": 3410
},
{
"epoch": 3.2836538461538463,
"grad_norm": 1.9403389617445832,
"learning_rate": 5.0000109325694494e-08,
"loss": 2.0963,
"step": 3415
},
{
"epoch": 3.2884615384615383,
"grad_norm": 1.9220338068487544,
"learning_rate": 5.0000101610230143e-08,
"loss": 2.0916,
"step": 3420
},
{
"epoch": 3.293269230769231,
"grad_norm": 1.9375048503232193,
"learning_rate": 5.000009439298745e-08,
"loss": 2.0717,
"step": 3425
},
{
"epoch": 3.298076923076923,
"grad_norm": 1.8438418543194979,
"learning_rate": 5.000008895827592e-08,
"loss": 2.1255,
"step": 3430
},
{
"epoch": 3.3028846153846154,
"grad_norm": 1.8629567514533452,
"learning_rate": 5.00000825654154e-08,
"loss": 2.0806,
"step": 3435
},
{
"epoch": 3.3076923076923075,
"grad_norm": 1.9106656016326038,
"learning_rate": 5.000007659296849e-08,
"loss": 2.1158,
"step": 3440
},
{
"epoch": 3.3125,
"grad_norm": 1.9013483711226824,
"learning_rate": 5.000007101588647e-08,
"loss": 2.1251,
"step": 3445
},
{
"epoch": 3.3173076923076925,
"grad_norm": 1.918508888857165,
"learning_rate": 5.0000065810456154e-08,
"loss": 2.0693,
"step": 3450
},
{
"epoch": 3.3221153846153846,
"grad_norm": 1.8062766125316954,
"learning_rate": 5.0000060954237113e-08,
"loss": 2.1227,
"step": 3455
},
{
"epoch": 3.326923076923077,
"grad_norm": 1.863020981136348,
"learning_rate": 5.000005642600152e-08,
"loss": 2.1291,
"step": 3460
},
{
"epoch": 3.331730769230769,
"grad_norm": 1.814260156227495,
"learning_rate": 5.000005220567642e-08,
"loss": 2.0376,
"step": 3465
},
{
"epoch": 3.3365384615384617,
"grad_norm": 1.860164501188251,
"learning_rate": 5.000004827428838e-08,
"loss": 2.0692,
"step": 3470
},
{
"epoch": 3.3413461538461537,
"grad_norm": 1.8559616510930068,
"learning_rate": 5.000004461391041e-08,
"loss": 2.1154,
"step": 3475
},
{
"epoch": 3.3461538461538463,
"grad_norm": 1.8531248832701233,
"learning_rate": 5.000004120761112e-08,
"loss": 2.1368,
"step": 3480
},
{
"epoch": 3.3509615384615383,
"grad_norm": 2.0855871097245697,
"learning_rate": 5.000003803940601e-08,
"loss": 2.0614,
"step": 3485
},
{
"epoch": 3.355769230769231,
"grad_norm": 1.849398364726841,
"learning_rate": 5.000003509421077e-08,
"loss": 2.0439,
"step": 3490
},
{
"epoch": 3.360576923076923,
"grad_norm": 1.8843707405312315,
"learning_rate": 5.000003235779665e-08,
"loss": 2.1177,
"step": 3495
},
{
"epoch": 3.3653846153846154,
"grad_norm": 1.8674622419471962,
"learning_rate": 5.0000029816747665e-08,
"loss": 2.0846,
"step": 3500
},
{
"epoch": 3.3701923076923075,
"grad_norm": 1.861783824284357,
"learning_rate": 5.000002745841968e-08,
"loss": 2.0955,
"step": 3505
},
{
"epoch": 3.375,
"grad_norm": 1.9278334626136537,
"learning_rate": 5.000002527090128e-08,
"loss": 2.059,
"step": 3510
},
{
"epoch": 3.3798076923076925,
"grad_norm": 1.8337005789104908,
"learning_rate": 5.0000023242976346e-08,
"loss": 2.0665,
"step": 3515
},
{
"epoch": 3.3846153846153846,
"grad_norm": 1.9024075084324792,
"learning_rate": 5.000002136408825e-08,
"loss": 2.1361,
"step": 3520
},
{
"epoch": 3.389423076923077,
"grad_norm": 1.8782715480203358,
"learning_rate": 5.0000019624305734e-08,
"loss": 2.1163,
"step": 3525
},
{
"epoch": 3.394230769230769,
"grad_norm": 1.86058034338409,
"learning_rate": 5.000001801429018e-08,
"loss": 2.1186,
"step": 3530
},
{
"epoch": 3.3990384615384617,
"grad_norm": 1.8881759634428155,
"learning_rate": 5.000001652526446e-08,
"loss": 2.0883,
"step": 3535
},
{
"epoch": 3.4038461538461537,
"grad_norm": 1.785713447960782,
"learning_rate": 5.000001514898321e-08,
"loss": 2.0527,
"step": 3540
},
{
"epoch": 3.4086538461538463,
"grad_norm": 1.9555165881816705,
"learning_rate": 5.0000013877704346e-08,
"loss": 2.1163,
"step": 3545
},
{
"epoch": 3.4134615384615383,
"grad_norm": 1.9223532202133446,
"learning_rate": 5.000001270416205e-08,
"loss": 2.0901,
"step": 3550
},
{
"epoch": 3.418269230769231,
"grad_norm": 1.9193635011123766,
"learning_rate": 5.000001162154087e-08,
"loss": 2.0746,
"step": 3555
},
{
"epoch": 3.423076923076923,
"grad_norm": 1.8733962144827436,
"learning_rate": 5.000001062345115e-08,
"loss": 2.0671,
"step": 3560
},
{
"epoch": 3.4278846153846154,
"grad_norm": 1.85873983452056,
"learning_rate": 5.0000009703905566e-08,
"loss": 2.1137,
"step": 3565
},
{
"epoch": 3.4326923076923075,
"grad_norm": 1.8503554423844921,
"learning_rate": 5.000000885729673e-08,
"loss": 2.0894,
"step": 3570
},
{
"epoch": 3.4375,
"grad_norm": 1.8222014591366218,
"learning_rate": 5.0000008078376005e-08,
"loss": 2.0432,
"step": 3575
},
{
"epoch": 3.4423076923076925,
"grad_norm": 1.7957714401504574,
"learning_rate": 5.0000007362233173e-08,
"loss": 2.1261,
"step": 3580
},
{
"epoch": 3.4471153846153846,
"grad_norm": 1.931908483475819,
"learning_rate": 5.000000670427727e-08,
"loss": 2.0361,
"step": 3585
},
{
"epoch": 3.451923076923077,
"grad_norm": 1.9002646238486756,
"learning_rate": 5.00000061002182e-08,
"loss": 2.0524,
"step": 3590
},
{
"epoch": 3.456730769230769,
"grad_norm": 1.8204343994860845,
"learning_rate": 5.0000005546049374e-08,
"loss": 2.0467,
"step": 3595
},
{
"epoch": 3.4615384615384617,
"grad_norm": 1.9057120685414555,
"learning_rate": 5.00000050380312e-08,
"loss": 2.093,
"step": 3600
},
{
"epoch": 3.4615384615384617,
"eval_loss": 2.3367574214935303,
"eval_runtime": 85.4244,
"eval_samples_per_second": 86.568,
"eval_steps_per_second": 0.679,
"step": 3600
},
{
"epoch": 3.4663461538461537,
"grad_norm": 1.9365323482683579,
"learning_rate": 5.000000457267532e-08,
"loss": 2.0553,
"step": 3605
},
{
"epoch": 3.4711538461538463,
"grad_norm": 1.8079565138425362,
"learning_rate": 5.0000004146729796e-08,
"loss": 2.089,
"step": 3610
},
{
"epoch": 3.4759615384615383,
"grad_norm": 1.8121185503245834,
"learning_rate": 5.0000003757164884e-08,
"loss": 2.0986,
"step": 3615
},
{
"epoch": 3.480769230769231,
"grad_norm": 1.8091507058120948,
"learning_rate": 5.00000034011597e-08,
"loss": 2.0754,
"step": 3620
},
{
"epoch": 3.485576923076923,
"grad_norm": 1.8733942037147027,
"learning_rate": 5.000000307608948e-08,
"loss": 2.0668,
"step": 3625
},
{
"epoch": 3.4903846153846154,
"grad_norm": 1.8821202627650557,
"learning_rate": 5.000000277951357e-08,
"loss": 1.9986,
"step": 3630
},
{
"epoch": 3.4951923076923075,
"grad_norm": 1.842855668232229,
"learning_rate": 5.0000002509163964e-08,
"loss": 2.0966,
"step": 3635
},
{
"epoch": 3.5,
"grad_norm": 1.8876473696523732,
"learning_rate": 5.0000002262934616e-08,
"loss": 2.0639,
"step": 3640
},
{
"epoch": 3.5048076923076925,
"grad_norm": 1.9962924727314426,
"learning_rate": 5.0000002038871134e-08,
"loss": 2.0818,
"step": 3645
},
{
"epoch": 3.5096153846153846,
"grad_norm": 1.9564800425998439,
"learning_rate": 5.0000001835161206e-08,
"loss": 2.1244,
"step": 3650
},
{
"epoch": 3.5144230769230766,
"grad_norm": 1.8523701031395317,
"learning_rate": 5.0000001650125436e-08,
"loss": 2.0887,
"step": 3655
},
{
"epoch": 3.519230769230769,
"grad_norm": 1.9350705828074954,
"learning_rate": 5.0000001482208764e-08,
"loss": 2.0847,
"step": 3660
},
{
"epoch": 3.5240384615384617,
"grad_norm": 1.946869882547775,
"learning_rate": 5.000000132997231e-08,
"loss": 2.0947,
"step": 3665
},
{
"epoch": 3.5288461538461537,
"grad_norm": 1.8459205035434865,
"learning_rate": 5.0000001192085726e-08,
"loss": 2.0312,
"step": 3670
},
{
"epoch": 3.5336538461538463,
"grad_norm": 1.919571637460775,
"learning_rate": 5.000000106731995e-08,
"loss": 2.0684,
"step": 3675
},
{
"epoch": 3.5384615384615383,
"grad_norm": 1.8251904058697088,
"learning_rate": 5.000000095454041e-08,
"loss": 2.0681,
"step": 3680
},
{
"epoch": 3.543269230769231,
"grad_norm": 1.8644080480328407,
"learning_rate": 5.000000085270059e-08,
"loss": 2.07,
"step": 3685
},
{
"epoch": 3.5480769230769234,
"grad_norm": 1.9449733940426817,
"learning_rate": 5.0000000760835994e-08,
"loss": 2.0474,
"step": 3690
},
{
"epoch": 3.5528846153846154,
"grad_norm": 1.8861381009831941,
"learning_rate": 5.000000067805847e-08,
"loss": 2.0788,
"step": 3695
},
{
"epoch": 3.5576923076923075,
"grad_norm": 1.9119855215360249,
"learning_rate": 5.000000060355086e-08,
"loss": 2.133,
"step": 3700
},
{
"epoch": 3.5625,
"grad_norm": 2.0025144773598713,
"learning_rate": 5.000000053656201e-08,
"loss": 2.0604,
"step": 3705
},
{
"epoch": 3.5673076923076925,
"grad_norm": 1.9599184161336376,
"learning_rate": 5.000000047640201e-08,
"loss": 2.0693,
"step": 3710
},
{
"epoch": 3.5721153846153846,
"grad_norm": 1.9332484541798294,
"learning_rate": 5.000000042243783e-08,
"loss": 2.1326,
"step": 3715
},
{
"epoch": 3.5769230769230766,
"grad_norm": 1.8373427956250443,
"learning_rate": 5.000000037408913e-08,
"loss": 2.0914,
"step": 3720
},
{
"epoch": 3.581730769230769,
"grad_norm": 1.8985422762821798,
"learning_rate": 5.000000033082442e-08,
"loss": 2.1263,
"step": 3725
},
{
"epoch": 3.5865384615384617,
"grad_norm": 1.8507361941632516,
"learning_rate": 5.000000029215739e-08,
"loss": 2.1016,
"step": 3730
},
{
"epoch": 3.5913461538461537,
"grad_norm": 1.918522522188892,
"learning_rate": 5.0000000257643545e-08,
"loss": 2.1104,
"step": 3735
},
{
"epoch": 3.5961538461538463,
"grad_norm": 1.9234648718431095,
"learning_rate": 5.0000000226876985e-08,
"loss": 2.0551,
"step": 3740
},
{
"epoch": 3.6009615384615383,
"grad_norm": 1.822481727821557,
"learning_rate": 5.000000019948749e-08,
"loss": 2.165,
"step": 3745
},
{
"epoch": 3.605769230769231,
"grad_norm": 1.8897986361161199,
"learning_rate": 5.000000017513769e-08,
"loss": 2.1189,
"step": 3750
},
{
"epoch": 3.6105769230769234,
"grad_norm": 1.8846334119765857,
"learning_rate": 5.0000000153520544e-08,
"loss": 2.0941,
"step": 3755
},
{
"epoch": 3.6153846153846154,
"grad_norm": 1.9439696562766058,
"learning_rate": 5.000000013435687e-08,
"loss": 2.0899,
"step": 3760
},
{
"epoch": 3.6201923076923075,
"grad_norm": 2.1285672502730897,
"learning_rate": 5.000000011739313e-08,
"loss": 2.0651,
"step": 3765
},
{
"epoch": 3.625,
"grad_norm": 1.9213014147357517,
"learning_rate": 5.000000010239938e-08,
"loss": 2.0956,
"step": 3770
},
{
"epoch": 3.6298076923076925,
"grad_norm": 2.0068609857257806,
"learning_rate": 5.0000000089167275e-08,
"loss": 2.1357,
"step": 3775
},
{
"epoch": 3.6346153846153846,
"grad_norm": 1.8705225726991637,
"learning_rate": 5.0000000077508284e-08,
"loss": 2.0578,
"step": 3780
},
{
"epoch": 3.6394230769230766,
"grad_norm": 1.8943581631321806,
"learning_rate": 5.000000006725204e-08,
"loss": 2.0315,
"step": 3785
},
{
"epoch": 3.644230769230769,
"grad_norm": 1.7746155655966087,
"learning_rate": 5.0000000058244776e-08,
"loss": 2.0558,
"step": 3790
},
{
"epoch": 3.6490384615384617,
"grad_norm": 1.9075711009896643,
"learning_rate": 5.00000000503479e-08,
"loss": 2.0978,
"step": 3795
},
{
"epoch": 3.6538461538461537,
"grad_norm": 1.850526459782874,
"learning_rate": 5.0000000043436655e-08,
"loss": 2.066,
"step": 3800
},
{
"epoch": 3.6538461538461537,
"eval_loss": 2.3363423347473145,
"eval_runtime": 85.3021,
"eval_samples_per_second": 86.692,
"eval_steps_per_second": 0.68,
"step": 3800
},
{
"epoch": 3.6586538461538463,
"grad_norm": 1.8690566333305048,
"learning_rate": 5.000000003739891e-08,
"loss": 2.0487,
"step": 3805
},
{
"epoch": 3.6634615384615383,
"grad_norm": 1.900722274652347,
"learning_rate": 5.000000003213401e-08,
"loss": 2.1207,
"step": 3810
},
{
"epoch": 3.668269230769231,
"grad_norm": 1.9465838080070361,
"learning_rate": 5.0000000027551756e-08,
"loss": 2.055,
"step": 3815
},
{
"epoch": 3.6730769230769234,
"grad_norm": 1.9044190775719372,
"learning_rate": 5.000000002357143e-08,
"loss": 2.0932,
"step": 3820
},
{
"epoch": 3.6778846153846154,
"grad_norm": 1.877437768825067,
"learning_rate": 5.00000000201209e-08,
"loss": 2.0378,
"step": 3825
},
{
"epoch": 3.6826923076923075,
"grad_norm": 1.9479165928017026,
"learning_rate": 5.0000000017135845e-08,
"loss": 2.12,
"step": 3830
},
{
"epoch": 3.6875,
"grad_norm": 1.8934460533416513,
"learning_rate": 5.000000001455896e-08,
"loss": 2.0638,
"step": 3835
},
{
"epoch": 3.6923076923076925,
"grad_norm": 1.8852430662362558,
"learning_rate": 5.00000000123393e-08,
"loss": 2.0684,
"step": 3840
},
{
"epoch": 3.6971153846153846,
"grad_norm": 1.860403694759792,
"learning_rate": 5.000000001043168e-08,
"loss": 2.0769,
"step": 3845
},
{
"epoch": 3.7019230769230766,
"grad_norm": 1.8537616298510589,
"learning_rate": 5.000000000879604e-08,
"loss": 2.0796,
"step": 3850
},
{
"epoch": 3.706730769230769,
"grad_norm": 1.9070836535172773,
"learning_rate": 5.0000000007396964e-08,
"loss": 2.0788,
"step": 3855
},
{
"epoch": 3.7115384615384617,
"grad_norm": 1.8144568187717154,
"learning_rate": 5.0000000006203204e-08,
"loss": 2.0824,
"step": 3860
},
{
"epoch": 3.7163461538461537,
"grad_norm": 1.891955133693288,
"learning_rate": 5.000000000518723e-08,
"loss": 2.0976,
"step": 3865
},
{
"epoch": 3.7211538461538463,
"grad_norm": 1.9703595895690142,
"learning_rate": 5.000000000432485e-08,
"loss": 2.0787,
"step": 3870
},
{
"epoch": 3.7259615384615383,
"grad_norm": 1.8460940153632612,
"learning_rate": 5.000000000359484e-08,
"loss": 2.1149,
"step": 3875
},
{
"epoch": 3.730769230769231,
"grad_norm": 1.9416809896930844,
"learning_rate": 5.000000000297862e-08,
"loss": 2.103,
"step": 3880
},
{
"epoch": 3.7355769230769234,
"grad_norm": 1.8235135326813838,
"learning_rate": 5.0000000002459973e-08,
"loss": 2.0464,
"step": 3885
},
{
"epoch": 3.7403846153846154,
"grad_norm": 1.8544605215958418,
"learning_rate": 5.000000000202477e-08,
"loss": 2.1148,
"step": 3890
},
{
"epoch": 3.7451923076923075,
"grad_norm": 1.9297008145685273,
"learning_rate": 5.000000000166072e-08,
"loss": 2.0917,
"step": 3895
},
{
"epoch": 3.75,
"grad_norm": 1.841810840824877,
"learning_rate": 5.000000000135718e-08,
"loss": 2.0486,
"step": 3900
},
{
"epoch": 3.7548076923076925,
"grad_norm": 1.8206643156132905,
"learning_rate": 5.0000000001104946e-08,
"loss": 2.0672,
"step": 3905
},
{
"epoch": 3.7596153846153846,
"grad_norm": 1.8759920863049961,
"learning_rate": 5.000000000089607e-08,
"loss": 2.0244,
"step": 3910
},
{
"epoch": 3.7644230769230766,
"grad_norm": 1.9048495951309699,
"learning_rate": 5.0000000000723734e-08,
"loss": 2.0743,
"step": 3915
},
{
"epoch": 3.769230769230769,
"grad_norm": 1.8193159595260147,
"learning_rate": 5.000000000058207e-08,
"loss": 2.0722,
"step": 3920
},
{
"epoch": 3.7740384615384617,
"grad_norm": 1.8691020909344,
"learning_rate": 5.0000000000466084e-08,
"loss": 2.1207,
"step": 3925
},
{
"epoch": 3.7788461538461537,
"grad_norm": 1.8608578096368507,
"learning_rate": 5.00000000003715e-08,
"loss": 2.1023,
"step": 3930
},
{
"epoch": 3.7836538461538463,
"grad_norm": 1.861692606774206,
"learning_rate": 5.00000000002947e-08,
"loss": 2.1159,
"step": 3935
},
{
"epoch": 3.7884615384615383,
"grad_norm": 1.9009512697877335,
"learning_rate": 5.0000000000232614e-08,
"loss": 2.0928,
"step": 3940
},
{
"epoch": 3.793269230769231,
"grad_norm": 1.8247326337722605,
"learning_rate": 5.000000000018266e-08,
"loss": 2.0607,
"step": 3945
},
{
"epoch": 3.7980769230769234,
"grad_norm": 1.838081967907657,
"learning_rate": 5.000000000014265e-08,
"loss": 2.1089,
"step": 3950
},
{
"epoch": 3.8028846153846154,
"grad_norm": 1.929918706709054,
"learning_rate": 5.000000000011078e-08,
"loss": 2.0905,
"step": 3955
},
{
"epoch": 3.8076923076923075,
"grad_norm": 1.8508307524707792,
"learning_rate": 5.0000000000085515e-08,
"loss": 2.1306,
"step": 3960
},
{
"epoch": 3.8125,
"grad_norm": 1.8695517798307058,
"learning_rate": 5.00000000000656e-08,
"loss": 2.0873,
"step": 3965
},
{
"epoch": 3.8173076923076925,
"grad_norm": 1.9513218569006434,
"learning_rate": 5.000000000005e-08,
"loss": 2.1049,
"step": 3970
},
{
"epoch": 3.8221153846153846,
"grad_norm": 1.8982042501595857,
"learning_rate": 5.000000000003784e-08,
"loss": 2.1205,
"step": 3975
},
{
"epoch": 3.8269230769230766,
"grad_norm": 1.8184591699240908,
"learning_rate": 5.000000000002844e-08,
"loss": 2.0395,
"step": 3980
},
{
"epoch": 3.831730769230769,
"grad_norm": 1.8444114349744394,
"learning_rate": 5.0000000000021207e-08,
"loss": 2.0824,
"step": 3985
},
{
"epoch": 3.8365384615384617,
"grad_norm": 1.8531735260873148,
"learning_rate": 5.000000000001569e-08,
"loss": 2.0544,
"step": 3990
},
{
"epoch": 3.8413461538461537,
"grad_norm": 1.8352559334251506,
"learning_rate": 5.0000000000011505e-08,
"loss": 2.0938,
"step": 3995
},
{
"epoch": 3.8461538461538463,
"grad_norm": 1.8424349150299684,
"learning_rate": 5.000000000000836e-08,
"loss": 2.1086,
"step": 4000
},
{
"epoch": 3.8461538461538463,
"eval_loss": 2.3361942768096924,
"eval_runtime": 85.4169,
"eval_samples_per_second": 86.575,
"eval_steps_per_second": 0.679,
"step": 4000
},
{
"epoch": 3.8509615384615383,
"grad_norm": 1.90467764249709,
"learning_rate": 5.000000000000602e-08,
"loss": 2.0919,
"step": 4005
},
{
"epoch": 3.855769230769231,
"grad_norm": 1.9147996032600165,
"learning_rate": 5.000000000000429e-08,
"loss": 2.0992,
"step": 4010
},
{
"epoch": 3.8605769230769234,
"grad_norm": 1.899917149171274,
"learning_rate": 5.000000000000303e-08,
"loss": 2.0772,
"step": 4015
},
{
"epoch": 3.8653846153846154,
"grad_norm": 1.8983270516331723,
"learning_rate": 5.000000000000211e-08,
"loss": 2.088,
"step": 4020
},
{
"epoch": 3.8701923076923075,
"grad_norm": 1.9175004513272587,
"learning_rate": 5.0000000000001454e-08,
"loss": 2.0511,
"step": 4025
},
{
"epoch": 3.875,
"grad_norm": 1.8660541755671598,
"learning_rate": 5.0000000000000984e-08,
"loss": 2.1061,
"step": 4030
},
{
"epoch": 3.8798076923076925,
"grad_norm": 1.8945222773765362,
"learning_rate": 5.000000000000066e-08,
"loss": 2.0912,
"step": 4035
},
{
"epoch": 3.8846153846153846,
"grad_norm": 1.9243273581552536,
"learning_rate": 5.0000000000000434e-08,
"loss": 2.126,
"step": 4040
},
{
"epoch": 3.8894230769230766,
"grad_norm": 1.8550808979879474,
"learning_rate": 5.000000000000028e-08,
"loss": 2.1042,
"step": 4045
},
{
"epoch": 3.894230769230769,
"grad_norm": 1.97506748062818,
"learning_rate": 5.0000000000000176e-08,
"loss": 2.1115,
"step": 4050
},
{
"epoch": 3.8990384615384617,
"grad_norm": 1.9079814987909542,
"learning_rate": 5.000000000000011e-08,
"loss": 2.049,
"step": 4055
},
{
"epoch": 3.9038461538461537,
"grad_norm": 1.9271203991857457,
"learning_rate": 5.000000000000007e-08,
"loss": 2.134,
"step": 4060
},
{
"epoch": 3.9086538461538463,
"grad_norm": 1.9736638939991642,
"learning_rate": 5.000000000000004e-08,
"loss": 2.1579,
"step": 4065
},
{
"epoch": 3.9134615384615383,
"grad_norm": 1.8949062426649275,
"learning_rate": 5.0000000000000024e-08,
"loss": 2.1017,
"step": 4070
},
{
"epoch": 3.918269230769231,
"grad_norm": 1.8881914290487865,
"learning_rate": 5.000000000000001e-08,
"loss": 2.0493,
"step": 4075
},
{
"epoch": 3.9230769230769234,
"grad_norm": 1.9185864408059423,
"learning_rate": 5.0000000000000004e-08,
"loss": 2.0971,
"step": 4080
},
{
"epoch": 3.9278846153846154,
"grad_norm": 1.910935901032547,
"learning_rate": 5.0000000000000004e-08,
"loss": 2.0574,
"step": 4085
},
{
"epoch": 3.9326923076923075,
"grad_norm": 1.8477236208599264,
"learning_rate": 5e-08,
"loss": 2.0316,
"step": 4090
},
{
"epoch": 3.9375,
"grad_norm": 1.8681233408771172,
"learning_rate": 5e-08,
"loss": 2.0406,
"step": 4095
},
{
"epoch": 3.9423076923076925,
"grad_norm": 1.976625704514766,
"learning_rate": 5e-08,
"loss": 2.1185,
"step": 4100
},
{
"epoch": 3.9471153846153846,
"grad_norm": 1.8722374970584073,
"learning_rate": 5e-08,
"loss": 2.0834,
"step": 4105
},
{
"epoch": 3.9519230769230766,
"grad_norm": 2.0555523827232234,
"learning_rate": 5e-08,
"loss": 2.0699,
"step": 4110
},
{
"epoch": 3.956730769230769,
"grad_norm": 1.8728593232700466,
"learning_rate": 5e-08,
"loss": 2.0932,
"step": 4115
},
{
"epoch": 3.9615384615384617,
"grad_norm": 1.8543407125566582,
"learning_rate": 5e-08,
"loss": 2.1006,
"step": 4120
},
{
"epoch": 3.9663461538461537,
"grad_norm": 1.8246615617187374,
"learning_rate": 5e-08,
"loss": 2.0577,
"step": 4125
},
{
"epoch": 3.9711538461538463,
"grad_norm": 1.9485201624855024,
"learning_rate": 5e-08,
"loss": 2.1165,
"step": 4130
},
{
"epoch": 3.9759615384615383,
"grad_norm": 1.988247558955116,
"learning_rate": 5e-08,
"loss": 2.0729,
"step": 4135
},
{
"epoch": 3.980769230769231,
"grad_norm": 1.9867643817669718,
"learning_rate": 5e-08,
"loss": 2.0647,
"step": 4140
},
{
"epoch": 3.9855769230769234,
"grad_norm": 1.9105220330651407,
"learning_rate": 5e-08,
"loss": 2.0665,
"step": 4145
},
{
"epoch": 3.9903846153846154,
"grad_norm": 1.8202876344304606,
"learning_rate": 5e-08,
"loss": 2.1232,
"step": 4150
},
{
"epoch": 3.9951923076923075,
"grad_norm": 1.9398674577857897,
"learning_rate": 5e-08,
"loss": 2.0924,
"step": 4155
},
{
"epoch": 4.0,
"grad_norm": 1.9383477945644347,
"learning_rate": 5e-08,
"loss": 2.1167,
"step": 4160
},
{
"epoch": 4.0,
"step": 4160,
"total_flos": 434462785536000.0,
"train_loss": 2.16538261238199,
"train_runtime": 15200.3368,
"train_samples_per_second": 17.512,
"train_steps_per_second": 0.274
}
],
"logging_steps": 5,
"max_steps": 4160,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 200,
"total_flos": 434462785536000.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}