korean_syllable_roberta_256 / trainer_state.json
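Below is the raw trainer_state.json logged by the Hugging Face Trainer during pre-training: top-level fields record the best eval metric and checkpoint path, and "log_history" holds one entry per logging step (loss, grad_norm, learning_rate) plus periodic eval entries (eval_loss, eval_accuracy, runtime). A minimal sketch of how one might load a local copy of this file and summarize the run is shown here; the local file path is an assumption, and only keys that actually appear in the JSON below are used.

# Hedged sketch, assuming a local copy of this file named "trainer_state.json".
import json

with open("trainer_state.json", encoding="utf-8") as f:
    state = json.load(f)

# Top-level summary fields written by the Trainer.
print("best eval loss:", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])

# Split log_history into training-step entries ("loss") and eval entries ("eval_loss").
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print("logged training points:", len(train_log))
for e in eval_log[:5]:
    print(f"step {e['step']:>6}  eval_loss {e['eval_loss']:.4f}  "
          f"eval_accuracy {e['eval_accuracy']:.4f}")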
{
"best_metric": 1.1103906631469727,
"best_model_checkpoint": "/home/wani/Desktop/roberta-pretrain/ckpt/roberta/pretrain/medium/256/checkpoint-12330",
"epoch": 10.386703853378108,
"eval_steps": 90,
"global_step": 12330,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008423928510444533,
"grad_norm": 5.073121070861816,
"learning_rate": 4.166666666666667e-06,
"loss": 7.2395,
"step": 10
},
{
"epoch": 0.016847857020889066,
"grad_norm": 4.587955474853516,
"learning_rate": 8.333333333333334e-06,
"loss": 7.0836,
"step": 20
},
{
"epoch": 0.0252717855313336,
"grad_norm": 3.8589327335357666,
"learning_rate": 1.25e-05,
"loss": 6.8156,
"step": 30
},
{
"epoch": 0.03369571404177813,
"grad_norm": 3.4427683353424072,
"learning_rate": 1.6666666666666667e-05,
"loss": 6.5549,
"step": 40
},
{
"epoch": 0.04211964255222266,
"grad_norm": 3.109060525894165,
"learning_rate": 2.0833333333333333e-05,
"loss": 6.3522,
"step": 50
},
{
"epoch": 0.0505435710626672,
"grad_norm": 2.86232590675354,
"learning_rate": 2.5e-05,
"loss": 6.1983,
"step": 60
},
{
"epoch": 0.05896749957311173,
"grad_norm": 2.6880924701690674,
"learning_rate": 2.9166666666666666e-05,
"loss": 6.0796,
"step": 70
},
{
"epoch": 0.06739142808355626,
"grad_norm": 2.490527629852295,
"learning_rate": 3.3333333333333335e-05,
"loss": 5.9754,
"step": 80
},
{
"epoch": 0.0758153565940008,
"grad_norm": 2.3156356811523438,
"learning_rate": 3.75e-05,
"loss": 5.8736,
"step": 90
},
{
"epoch": 0.0758153565940008,
"eval_accuracy": 0.22415329938580753,
"eval_loss": 5.8054423332214355,
"eval_runtime": 910.9652,
"eval_samples_per_second": 548.183,
"eval_steps_per_second": 5.076,
"step": 90
},
{
"epoch": 0.08423928510444532,
"grad_norm": 2.1557302474975586,
"learning_rate": 4.1666666666666665e-05,
"loss": 5.7691,
"step": 100
},
{
"epoch": 0.09266321361488987,
"grad_norm": 1.9360383749008179,
"learning_rate": 4.5833333333333334e-05,
"loss": 5.6653,
"step": 110
},
{
"epoch": 0.1010871421253344,
"grad_norm": 1.731399655342102,
"learning_rate": 5e-05,
"loss": 5.5598,
"step": 120
},
{
"epoch": 0.10951107063577893,
"grad_norm": 1.508693814277649,
"learning_rate": 5.416666666666667e-05,
"loss": 5.4574,
"step": 130
},
{
"epoch": 0.11793499914622346,
"grad_norm": 1.2835007905960083,
"learning_rate": 5.833333333333333e-05,
"loss": 5.3585,
"step": 140
},
{
"epoch": 0.126358927656668,
"grad_norm": 1.0747231245040894,
"learning_rate": 6.25e-05,
"loss": 5.2667,
"step": 150
},
{
"epoch": 0.13478285616711252,
"grad_norm": 0.852271318435669,
"learning_rate": 6.666666666666667e-05,
"loss": 5.1779,
"step": 160
},
{
"epoch": 0.14320678467755707,
"grad_norm": 0.7001814842224121,
"learning_rate": 7.083333333333334e-05,
"loss": 5.0965,
"step": 170
},
{
"epoch": 0.1516307131880016,
"grad_norm": 0.5657457709312439,
"learning_rate": 7.5e-05,
"loss": 5.0237,
"step": 180
},
{
"epoch": 0.1516307131880016,
"eval_accuracy": 0.23888299376264316,
"eval_loss": 4.981535911560059,
"eval_runtime": 882.341,
"eval_samples_per_second": 565.967,
"eval_steps_per_second": 5.241,
"step": 180
},
{
"epoch": 0.16005464169844613,
"grad_norm": 0.4981703758239746,
"learning_rate": 7.916666666666666e-05,
"loss": 4.9662,
"step": 190
},
{
"epoch": 0.16847857020889065,
"grad_norm": 0.40254291892051697,
"learning_rate": 8.333333333333333e-05,
"loss": 4.9195,
"step": 200
},
{
"epoch": 0.1769024987193352,
"grad_norm": 0.32726043462753296,
"learning_rate": 8.75e-05,
"loss": 4.8766,
"step": 210
},
{
"epoch": 0.18532642722977974,
"grad_norm": 0.2471727877855301,
"learning_rate": 9.166666666666667e-05,
"loss": 4.8458,
"step": 220
},
{
"epoch": 0.19375035574022426,
"grad_norm": 0.2568261921405792,
"learning_rate": 9.583333333333334e-05,
"loss": 4.8169,
"step": 230
},
{
"epoch": 0.2021742842506688,
"grad_norm": 0.19310955703258514,
"learning_rate": 0.0001,
"loss": 4.7926,
"step": 240
},
{
"epoch": 0.21059821276111332,
"grad_norm": 0.20584674179553986,
"learning_rate": 0.00010416666666666667,
"loss": 4.7714,
"step": 250
},
{
"epoch": 0.21902214127155786,
"grad_norm": 0.26360729336738586,
"learning_rate": 0.00010833333333333334,
"loss": 4.7511,
"step": 260
},
{
"epoch": 0.22744606978200238,
"grad_norm": 0.1681978851556778,
"learning_rate": 0.00011250000000000001,
"loss": 4.7309,
"step": 270
},
{
"epoch": 0.22744606978200238,
"eval_accuracy": 0.28488370423336357,
"eval_loss": 4.706047534942627,
"eval_runtime": 889.3977,
"eval_samples_per_second": 561.477,
"eval_steps_per_second": 5.199,
"step": 270
},
{
"epoch": 0.23586999829244693,
"grad_norm": 0.17959143221378326,
"learning_rate": 0.00011666666666666667,
"loss": 4.7148,
"step": 280
},
{
"epoch": 0.24429392680289147,
"grad_norm": 0.27109047770500183,
"learning_rate": 0.00012083333333333333,
"loss": 4.6989,
"step": 290
},
{
"epoch": 0.252717855313336,
"grad_norm": 0.2674080431461334,
"learning_rate": 0.000125,
"loss": 4.6826,
"step": 300
},
{
"epoch": 0.2611417838237805,
"grad_norm": 0.24386395514011383,
"learning_rate": 0.00012916666666666667,
"loss": 4.6707,
"step": 310
},
{
"epoch": 0.26956571233422505,
"grad_norm": 0.5274083614349365,
"learning_rate": 0.00013333333333333334,
"loss": 4.6553,
"step": 320
},
{
"epoch": 0.2779896408446696,
"grad_norm": 0.4005141258239746,
"learning_rate": 0.0001375,
"loss": 4.6446,
"step": 330
},
{
"epoch": 0.28641356935511414,
"grad_norm": 0.3732853829860687,
"learning_rate": 0.00014166666666666668,
"loss": 4.6315,
"step": 340
},
{
"epoch": 0.29483749786555863,
"grad_norm": 0.2742752730846405,
"learning_rate": 0.00014583333333333335,
"loss": 4.6221,
"step": 350
},
{
"epoch": 0.3032614263760032,
"grad_norm": 0.20482462644577026,
"learning_rate": 0.00015,
"loss": 4.6138,
"step": 360
},
{
"epoch": 0.3032614263760032,
"eval_accuracy": 0.28836420126551926,
"eval_loss": 4.5933918952941895,
"eval_runtime": 880.4452,
"eval_samples_per_second": 567.186,
"eval_steps_per_second": 5.252,
"step": 360
},
{
"epoch": 0.3116853548864477,
"grad_norm": 0.26613757014274597,
"learning_rate": 0.00015416666666666668,
"loss": 4.5983,
"step": 370
},
{
"epoch": 0.32010928339689226,
"grad_norm": 0.20205098390579224,
"learning_rate": 0.00015833333333333332,
"loss": 4.5922,
"step": 380
},
{
"epoch": 0.3285332119073368,
"grad_norm": 0.5084218978881836,
"learning_rate": 0.00016250000000000002,
"loss": 4.5826,
"step": 390
},
{
"epoch": 0.3369571404177813,
"grad_norm": 0.2835780084133148,
"learning_rate": 0.00016666666666666666,
"loss": 4.5771,
"step": 400
},
{
"epoch": 0.34538106892822584,
"grad_norm": 0.23976200819015503,
"learning_rate": 0.00017083333333333333,
"loss": 4.5726,
"step": 410
},
{
"epoch": 0.3538049974386704,
"grad_norm": 0.2275087982416153,
"learning_rate": 0.000175,
"loss": 4.5666,
"step": 420
},
{
"epoch": 0.36222892594911493,
"grad_norm": 0.27758899331092834,
"learning_rate": 0.00017916666666666667,
"loss": 4.5654,
"step": 430
},
{
"epoch": 0.3706528544595595,
"grad_norm": 0.18581350147724152,
"learning_rate": 0.00018333333333333334,
"loss": 4.5593,
"step": 440
},
{
"epoch": 0.37907678297000397,
"grad_norm": 0.1667676419019699,
"learning_rate": 0.0001875,
"loss": 4.5538,
"step": 450
},
{
"epoch": 0.37907678297000397,
"eval_accuracy": 0.28966679521500804,
"eval_loss": 4.547606468200684,
"eval_runtime": 890.3979,
"eval_samples_per_second": 560.846,
"eval_steps_per_second": 5.193,
"step": 450
},
{
"epoch": 0.3875007114804485,
"grad_norm": 0.32489290833473206,
"learning_rate": 0.00019166666666666667,
"loss": 4.5532,
"step": 460
},
{
"epoch": 0.39592463999089306,
"grad_norm": 0.7000045776367188,
"learning_rate": 0.00019583333333333334,
"loss": 4.5484,
"step": 470
},
{
"epoch": 0.4043485685013376,
"grad_norm": 0.43668240308761597,
"learning_rate": 0.0002,
"loss": 4.5489,
"step": 480
},
{
"epoch": 0.4127724970117821,
"grad_norm": 0.36716368794441223,
"learning_rate": 0.00020416666666666668,
"loss": 4.5459,
"step": 490
},
{
"epoch": 0.42119642552222664,
"grad_norm": 0.30332931876182556,
"learning_rate": 0.00020833333333333335,
"loss": 4.5418,
"step": 500
},
{
"epoch": 0.4296203540326712,
"grad_norm": 0.5920347571372986,
"learning_rate": 0.0002125,
"loss": 4.5406,
"step": 510
},
{
"epoch": 0.4380442825431157,
"grad_norm": 0.45020386576652527,
"learning_rate": 0.00021666666666666668,
"loss": 4.5372,
"step": 520
},
{
"epoch": 0.44646821105356027,
"grad_norm": 0.33357909321784973,
"learning_rate": 0.00022083333333333333,
"loss": 4.5367,
"step": 530
},
{
"epoch": 0.45489213956400476,
"grad_norm": 0.45888572931289673,
"learning_rate": 0.00022500000000000002,
"loss": 4.5344,
"step": 540
},
{
"epoch": 0.45489213956400476,
"eval_accuracy": 0.2902362393111046,
"eval_loss": 4.531790256500244,
"eval_runtime": 882.2427,
"eval_samples_per_second": 566.03,
"eval_steps_per_second": 5.241,
"step": 540
},
{
"epoch": 0.4633160680744493,
"grad_norm": 0.4458440840244293,
"learning_rate": 0.00022916666666666666,
"loss": 4.5328,
"step": 550
},
{
"epoch": 0.47173999658489385,
"grad_norm": 0.1917838305234909,
"learning_rate": 0.00023333333333333333,
"loss": 4.5296,
"step": 560
},
{
"epoch": 0.4801639250953384,
"grad_norm": 0.8310424089431763,
"learning_rate": 0.0002375,
"loss": 4.5275,
"step": 570
},
{
"epoch": 0.48858785360578294,
"grad_norm": 0.4216615855693817,
"learning_rate": 0.00024166666666666667,
"loss": 4.531,
"step": 580
},
{
"epoch": 0.49701178211622743,
"grad_norm": 0.2320231944322586,
"learning_rate": 0.0002458333333333333,
"loss": 4.5276,
"step": 590
},
{
"epoch": 0.505435710626672,
"grad_norm": 0.3115006983280182,
"learning_rate": 0.00025,
"loss": 4.5252,
"step": 600
},
{
"epoch": 0.5138596391371165,
"grad_norm": 0.13032270967960358,
"learning_rate": 0.00025416666666666665,
"loss": 4.5227,
"step": 610
},
{
"epoch": 0.522283567647561,
"grad_norm": 0.5333927273750305,
"learning_rate": 0.00025833333333333334,
"loss": 4.5214,
"step": 620
},
{
"epoch": 0.5307074961580056,
"grad_norm": 0.8976441025733948,
"learning_rate": 0.00026250000000000004,
"loss": 4.5218,
"step": 630
},
{
"epoch": 0.5307074961580056,
"eval_accuracy": 0.290083406000685,
"eval_loss": 4.522771835327148,
"eval_runtime": 892.1941,
"eval_samples_per_second": 559.717,
"eval_steps_per_second": 5.183,
"step": 630
},
{
"epoch": 0.5391314246684501,
"grad_norm": 0.1657322496175766,
"learning_rate": 0.0002666666666666667,
"loss": 4.523,
"step": 640
},
{
"epoch": 0.5475553531788947,
"grad_norm": 0.1890048235654831,
"learning_rate": 0.0002708333333333333,
"loss": 4.5185,
"step": 650
},
{
"epoch": 0.5559792816893392,
"grad_norm": 0.8254080414772034,
"learning_rate": 0.000275,
"loss": 4.5196,
"step": 660
},
{
"epoch": 0.5644032101997837,
"grad_norm": 0.1703944355249405,
"learning_rate": 0.00027916666666666666,
"loss": 4.52,
"step": 670
},
{
"epoch": 0.5728271387102283,
"grad_norm": 0.33486783504486084,
"learning_rate": 0.00028333333333333335,
"loss": 4.5139,
"step": 680
},
{
"epoch": 0.5812510672206728,
"grad_norm": 0.4759036600589752,
"learning_rate": 0.0002875,
"loss": 4.5158,
"step": 690
},
{
"epoch": 0.5896749957311173,
"grad_norm": 0.26314422488212585,
"learning_rate": 0.0002916666666666667,
"loss": 4.5135,
"step": 700
},
{
"epoch": 0.5980989242415619,
"grad_norm": 0.39898937940597534,
"learning_rate": 0.00029583333333333333,
"loss": 4.5114,
"step": 710
},
{
"epoch": 0.6065228527520063,
"grad_norm": 0.5003794431686401,
"learning_rate": 0.0003,
"loss": 4.5148,
"step": 720
},
{
"epoch": 0.6065228527520063,
"eval_accuracy": 0.2903979539286128,
"eval_loss": 4.508981704711914,
"eval_runtime": 878.8487,
"eval_samples_per_second": 568.216,
"eval_steps_per_second": 5.261,
"step": 720
},
{
"epoch": 0.614946781262451,
"grad_norm": 0.2276950627565384,
"learning_rate": 0.00030416666666666667,
"loss": 4.5111,
"step": 730
},
{
"epoch": 0.6233707097728954,
"grad_norm": 0.21725377440452576,
"learning_rate": 0.00030833333333333337,
"loss": 4.5088,
"step": 740
},
{
"epoch": 0.6317946382833399,
"grad_norm": 0.8084585666656494,
"learning_rate": 0.0003125,
"loss": 4.5074,
"step": 750
},
{
"epoch": 0.6402185667937845,
"grad_norm": 0.46915069222450256,
"learning_rate": 0.00031666666666666665,
"loss": 4.5072,
"step": 760
},
{
"epoch": 0.648642495304229,
"grad_norm": 0.15649260580539703,
"learning_rate": 0.00032083333333333334,
"loss": 4.5039,
"step": 770
},
{
"epoch": 0.6570664238146736,
"grad_norm": 0.42916274070739746,
"learning_rate": 0.00032500000000000004,
"loss": 4.5056,
"step": 780
},
{
"epoch": 0.6654903523251181,
"grad_norm": 0.287572979927063,
"learning_rate": 0.0003291666666666667,
"loss": 4.5045,
"step": 790
},
{
"epoch": 0.6739142808355626,
"grad_norm": 0.6869699358940125,
"learning_rate": 0.0003333333333333333,
"loss": 4.5029,
"step": 800
},
{
"epoch": 0.6823382093460072,
"grad_norm": 0.2973476052284241,
"learning_rate": 0.0003375,
"loss": 4.5009,
"step": 810
},
{
"epoch": 0.6823382093460072,
"eval_accuracy": 0.29041409279207236,
"eval_loss": 4.497637748718262,
"eval_runtime": 872.3603,
"eval_samples_per_second": 572.442,
"eval_steps_per_second": 5.301,
"step": 810
},
{
"epoch": 0.6907621378564517,
"grad_norm": 0.5773557424545288,
"learning_rate": 0.00034166666666666666,
"loss": 4.5024,
"step": 820
},
{
"epoch": 0.6991860663668963,
"grad_norm": 0.31921157240867615,
"learning_rate": 0.00034583333333333335,
"loss": 4.5006,
"step": 830
},
{
"epoch": 0.7076099948773408,
"grad_norm": 0.4232361912727356,
"learning_rate": 0.00035,
"loss": 4.5001,
"step": 840
},
{
"epoch": 0.7160339233877853,
"grad_norm": 0.30865538120269775,
"learning_rate": 0.0003541666666666667,
"loss": 4.4998,
"step": 850
},
{
"epoch": 0.7244578518982299,
"grad_norm": 0.6191368699073792,
"learning_rate": 0.00035833333333333333,
"loss": 4.4967,
"step": 860
},
{
"epoch": 0.7328817804086744,
"grad_norm": 0.3202773630619049,
"learning_rate": 0.0003625,
"loss": 4.499,
"step": 870
},
{
"epoch": 0.741305708919119,
"grad_norm": 0.3090028464794159,
"learning_rate": 0.00036666666666666667,
"loss": 4.4967,
"step": 880
},
{
"epoch": 0.7497296374295634,
"grad_norm": 0.9248805046081543,
"learning_rate": 0.00037083333333333337,
"loss": 4.4962,
"step": 890
},
{
"epoch": 0.7581535659400079,
"grad_norm": 0.27745822072029114,
"learning_rate": 0.000375,
"loss": 4.4956,
"step": 900
},
{
"epoch": 0.7581535659400079,
"eval_accuracy": 0.29047371761644103,
"eval_loss": 4.492140293121338,
"eval_runtime": 888.1144,
"eval_samples_per_second": 562.288,
"eval_steps_per_second": 5.207,
"step": 900
},
{
"epoch": 0.7665774944504525,
"grad_norm": 0.2972380518913269,
"learning_rate": 0.00037916666666666665,
"loss": 4.4936,
"step": 910
},
{
"epoch": 0.775001422960897,
"grad_norm": 1.4440104961395264,
"learning_rate": 0.00038333333333333334,
"loss": 4.4956,
"step": 920
},
{
"epoch": 0.7834253514713415,
"grad_norm": 0.2894129455089569,
"learning_rate": 0.00038750000000000004,
"loss": 4.4961,
"step": 930
},
{
"epoch": 0.7918492799817861,
"grad_norm": 0.22757315635681152,
"learning_rate": 0.0003916666666666667,
"loss": 4.495,
"step": 940
},
{
"epoch": 0.8002732084922306,
"grad_norm": 0.2084762305021286,
"learning_rate": 0.0003958333333333333,
"loss": 4.4921,
"step": 950
},
{
"epoch": 0.8086971370026752,
"grad_norm": 0.4823535084724426,
"learning_rate": 0.0004,
"loss": 4.4928,
"step": 960
},
{
"epoch": 0.8171210655131197,
"grad_norm": 0.22939594089984894,
"learning_rate": 0.00040416666666666666,
"loss": 4.4889,
"step": 970
},
{
"epoch": 0.8255449940235642,
"grad_norm": 0.4983462989330292,
"learning_rate": 0.00040833333333333336,
"loss": 4.4888,
"step": 980
},
{
"epoch": 0.8339689225340088,
"grad_norm": 0.7445792555809021,
"learning_rate": 0.0004125,
"loss": 4.4899,
"step": 990
},
{
"epoch": 0.8339689225340088,
"eval_accuracy": 0.2903607895100575,
"eval_loss": 4.490144729614258,
"eval_runtime": 872.9885,
"eval_samples_per_second": 572.03,
"eval_steps_per_second": 5.297,
"step": 990
},
{
"epoch": 0.8423928510444533,
"grad_norm": 0.3264559805393219,
"learning_rate": 0.0004166666666666667,
"loss": 4.4879,
"step": 1000
},
{
"epoch": 0.8508167795548979,
"grad_norm": 0.5130082964897156,
"learning_rate": 0.00042083333333333333,
"loss": 4.4881,
"step": 1010
},
{
"epoch": 0.8592407080653424,
"grad_norm": 0.2776341736316681,
"learning_rate": 0.000425,
"loss": 4.4872,
"step": 1020
},
{
"epoch": 0.8676646365757869,
"grad_norm": 0.9157618880271912,
"learning_rate": 0.00042916666666666667,
"loss": 4.4868,
"step": 1030
},
{
"epoch": 0.8760885650862315,
"grad_norm": 0.22099615633487701,
"learning_rate": 0.00043333333333333337,
"loss": 4.4877,
"step": 1040
},
{
"epoch": 0.8845124935966759,
"grad_norm": 0.2313142567873001,
"learning_rate": 0.0004375,
"loss": 4.4845,
"step": 1050
},
{
"epoch": 0.8929364221071205,
"grad_norm": 0.4353635907173157,
"learning_rate": 0.00044166666666666665,
"loss": 4.4888,
"step": 1060
},
{
"epoch": 0.901360350617565,
"grad_norm": 0.2390984743833542,
"learning_rate": 0.00044583333333333335,
"loss": 4.4827,
"step": 1070
},
{
"epoch": 0.9097842791280095,
"grad_norm": 0.31369632482528687,
"learning_rate": 0.00045000000000000004,
"loss": 4.4832,
"step": 1080
},
{
"epoch": 0.9097842791280095,
"eval_accuracy": 0.2904605834264481,
"eval_loss": 4.480494499206543,
"eval_runtime": 880.1337,
"eval_samples_per_second": 567.386,
"eval_steps_per_second": 5.254,
"step": 1080
},
{
"epoch": 0.9182082076384541,
"grad_norm": 0.6700971722602844,
"learning_rate": 0.0004541666666666667,
"loss": 4.483,
"step": 1090
},
{
"epoch": 0.9266321361488986,
"grad_norm": 0.25950998067855835,
"learning_rate": 0.0004583333333333333,
"loss": 4.4832,
"step": 1100
},
{
"epoch": 0.9350560646593432,
"grad_norm": 0.2840316593647003,
"learning_rate": 0.0004625,
"loss": 4.4819,
"step": 1110
},
{
"epoch": 0.9434799931697877,
"grad_norm": 0.6859279274940491,
"learning_rate": 0.00046666666666666666,
"loss": 4.4819,
"step": 1120
},
{
"epoch": 0.9519039216802322,
"grad_norm": 0.2865343391895294,
"learning_rate": 0.00047083333333333336,
"loss": 4.48,
"step": 1130
},
{
"epoch": 0.9603278501906768,
"grad_norm": 1.179539442062378,
"learning_rate": 0.000475,
"loss": 4.4762,
"step": 1140
},
{
"epoch": 0.9687517787011213,
"grad_norm": 0.4731704294681549,
"learning_rate": 0.0004791666666666667,
"loss": 4.4831,
"step": 1150
},
{
"epoch": 0.9771757072115659,
"grad_norm": 0.298757404088974,
"learning_rate": 0.00048333333333333334,
"loss": 4.4742,
"step": 1160
},
{
"epoch": 0.9855996357220104,
"grad_norm": 1.0954639911651611,
"learning_rate": 0.0004875,
"loss": 4.46,
"step": 1170
},
{
"epoch": 0.9855996357220104,
"eval_accuracy": 0.29021425691327735,
"eval_loss": 4.458162784576416,
"eval_runtime": 887.8161,
"eval_samples_per_second": 562.477,
"eval_steps_per_second": 5.208,
"step": 1170
},
{
"epoch": 0.9940235642324549,
"grad_norm": 0.441949725151062,
"learning_rate": 0.0004916666666666666,
"loss": 4.4549,
"step": 1180
},
{
"epoch": 1.0024474927428995,
"grad_norm": 0.5917736887931824,
"learning_rate": 0.0004958333333333334,
"loss": 4.4425,
"step": 1190
},
{
"epoch": 1.010871421253344,
"grad_norm": 0.3910304307937622,
"learning_rate": 0.0005,
"loss": 4.4376,
"step": 1200
},
{
"epoch": 1.0192953497637884,
"grad_norm": 0.446277916431427,
"learning_rate": 0.0005041666666666667,
"loss": 4.4284,
"step": 1210
},
{
"epoch": 1.027719278274233,
"grad_norm": 0.7843539118766785,
"learning_rate": 0.0005083333333333333,
"loss": 4.4216,
"step": 1220
},
{
"epoch": 1.0361432067846776,
"grad_norm": 0.5028587579727173,
"learning_rate": 0.0005124999999999999,
"loss": 4.418,
"step": 1230
},
{
"epoch": 1.044567135295122,
"grad_norm": 0.5062530636787415,
"learning_rate": 0.0005166666666666667,
"loss": 4.4099,
"step": 1240
},
{
"epoch": 1.0529910638055666,
"grad_norm": 0.4109475016593933,
"learning_rate": 0.0005208333333333334,
"loss": 4.4005,
"step": 1250
},
{
"epoch": 1.0614149923160112,
"grad_norm": 0.494357705116272,
"learning_rate": 0.0005250000000000001,
"loss": 4.3924,
"step": 1260
},
{
"epoch": 1.0614149923160112,
"eval_accuracy": 0.29121270831959656,
"eval_loss": 4.368500232696533,
"eval_runtime": 885.6194,
"eval_samples_per_second": 563.872,
"eval_steps_per_second": 5.221,
"step": 1260
},
{
"epoch": 1.0698389208264556,
"grad_norm": 0.4964124858379364,
"learning_rate": 0.0005291666666666667,
"loss": 4.3843,
"step": 1270
},
{
"epoch": 1.0782628493369002,
"grad_norm": 0.6328290700912476,
"learning_rate": 0.0005333333333333334,
"loss": 4.3756,
"step": 1280
},
{
"epoch": 1.0866867778473448,
"grad_norm": 0.8674759268760681,
"learning_rate": 0.0005375,
"loss": 4.3697,
"step": 1290
},
{
"epoch": 1.0951107063577892,
"grad_norm": 0.4631132185459137,
"learning_rate": 0.0005416666666666666,
"loss": 4.3676,
"step": 1300
},
{
"epoch": 1.1035346348682338,
"grad_norm": 0.5043870210647583,
"learning_rate": 0.0005458333333333333,
"loss": 4.3582,
"step": 1310
},
{
"epoch": 1.1119585633786784,
"grad_norm": 0.5791853666305542,
"learning_rate": 0.00055,
"loss": 4.3529,
"step": 1320
},
{
"epoch": 1.120382491889123,
"grad_norm": 0.6443321108818054,
"learning_rate": 0.0005541666666666667,
"loss": 4.3471,
"step": 1330
},
{
"epoch": 1.1288064203995674,
"grad_norm": 0.6193282008171082,
"learning_rate": 0.0005583333333333333,
"loss": 4.338,
"step": 1340
},
{
"epoch": 1.137230348910012,
"grad_norm": 0.6169930696487427,
"learning_rate": 0.0005625000000000001,
"loss": 4.3365,
"step": 1350
},
{
"epoch": 1.137230348910012,
"eval_accuracy": 0.2912005471998471,
"eval_loss": 4.2970428466796875,
"eval_runtime": 875.1704,
"eval_samples_per_second": 570.604,
"eval_steps_per_second": 5.284,
"step": 1350
},
{
"epoch": 1.1456542774204566,
"grad_norm": 0.8051270246505737,
"learning_rate": 0.0005666666666666667,
"loss": 4.3252,
"step": 1360
},
{
"epoch": 1.154078205930901,
"grad_norm": 0.7985979914665222,
"learning_rate": 0.0005708333333333333,
"loss": 4.3185,
"step": 1370
},
{
"epoch": 1.1625021344413455,
"grad_norm": 0.7459626793861389,
"learning_rate": 0.000575,
"loss": 4.3119,
"step": 1380
},
{
"epoch": 1.1709260629517901,
"grad_norm": 0.572289228439331,
"learning_rate": 0.0005791666666666667,
"loss": 4.3066,
"step": 1390
},
{
"epoch": 1.1793499914622347,
"grad_norm": 0.5565480589866638,
"learning_rate": 0.0005833333333333334,
"loss": 4.2973,
"step": 1400
},
{
"epoch": 1.1877739199726791,
"grad_norm": 0.789574384689331,
"learning_rate": 0.0005875,
"loss": 4.2922,
"step": 1410
},
{
"epoch": 1.1961978484831237,
"grad_norm": 1.0027601718902588,
"learning_rate": 0.0005916666666666667,
"loss": 4.2824,
"step": 1420
},
{
"epoch": 1.204621776993568,
"grad_norm": 0.8137519359588623,
"learning_rate": 0.0005958333333333333,
"loss": 4.2808,
"step": 1430
},
{
"epoch": 1.2130457055040127,
"grad_norm": 0.8705686330795288,
"learning_rate": 0.0006,
"loss": 4.2685,
"step": 1440
},
{
"epoch": 1.2130457055040127,
"eval_accuracy": 0.2922224943254529,
"eval_loss": 4.225285053253174,
"eval_runtime": 885.6768,
"eval_samples_per_second": 563.835,
"eval_steps_per_second": 5.221,
"step": 1440
},
{
"epoch": 1.2214696340144573,
"grad_norm": 1.0055943727493286,
"learning_rate": 0.0006041666666666666,
"loss": 4.2639,
"step": 1450
},
{
"epoch": 1.229893562524902,
"grad_norm": 0.9747255444526672,
"learning_rate": 0.0006083333333333333,
"loss": 4.2622,
"step": 1460
},
{
"epoch": 1.2383174910353463,
"grad_norm": 0.6799793243408203,
"learning_rate": 0.0006125000000000001,
"loss": 4.251,
"step": 1470
},
{
"epoch": 1.2467414195457909,
"grad_norm": 0.8863984942436218,
"learning_rate": 0.0006166666666666667,
"loss": 4.2476,
"step": 1480
},
{
"epoch": 1.2551653480562355,
"grad_norm": 0.891790509223938,
"learning_rate": 0.0006208333333333334,
"loss": 4.2434,
"step": 1490
},
{
"epoch": 1.2635892765666799,
"grad_norm": 0.731626033782959,
"learning_rate": 0.000625,
"loss": 4.233,
"step": 1500
},
{
"epoch": 1.2720132050771245,
"grad_norm": 0.7038396000862122,
"learning_rate": 0.0006291666666666667,
"loss": 4.2264,
"step": 1510
},
{
"epoch": 1.280437133587569,
"grad_norm": 1.0247654914855957,
"learning_rate": 0.0006333333333333333,
"loss": 4.2198,
"step": 1520
},
{
"epoch": 1.2888610620980137,
"grad_norm": 1.0854212045669556,
"learning_rate": 0.0006374999999999999,
"loss": 4.2126,
"step": 1530
},
{
"epoch": 1.2888610620980137,
"eval_accuracy": 0.2953678601775117,
"eval_loss": 4.152132034301758,
"eval_runtime": 880.7951,
"eval_samples_per_second": 566.96,
"eval_steps_per_second": 5.25,
"step": 1530
},
{
"epoch": 1.297284990608458,
"grad_norm": 0.8179611563682556,
"learning_rate": 0.0006416666666666667,
"loss": 4.2081,
"step": 1540
},
{
"epoch": 1.3057089191189026,
"grad_norm": 1.4174506664276123,
"learning_rate": 0.0006458333333333334,
"loss": 4.2027,
"step": 1550
},
{
"epoch": 1.314132847629347,
"grad_norm": 1.1611113548278809,
"learning_rate": 0.0006500000000000001,
"loss": 4.1992,
"step": 1560
},
{
"epoch": 1.3225567761397916,
"grad_norm": 1.1475598812103271,
"learning_rate": 0.0006541666666666667,
"loss": 4.1875,
"step": 1570
},
{
"epoch": 1.3309807046502362,
"grad_norm": 1.158115267753601,
"learning_rate": 0.0006583333333333334,
"loss": 4.1883,
"step": 1580
},
{
"epoch": 1.3394046331606808,
"grad_norm": 1.325655221939087,
"learning_rate": 0.0006625,
"loss": 4.181,
"step": 1590
},
{
"epoch": 1.3478285616711254,
"grad_norm": 1.077793836593628,
"learning_rate": 0.0006666666666666666,
"loss": 4.1727,
"step": 1600
},
{
"epoch": 1.3562524901815698,
"grad_norm": 1.2139134407043457,
"learning_rate": 0.0006708333333333333,
"loss": 4.1691,
"step": 1610
},
{
"epoch": 1.3646764186920144,
"grad_norm": 1.075778603553772,
"learning_rate": 0.000675,
"loss": 4.1563,
"step": 1620
},
{
"epoch": 1.3646764186920144,
"eval_accuracy": 0.2982954422675167,
"eval_loss": 4.0783562660217285,
"eval_runtime": 880.4076,
"eval_samples_per_second": 567.21,
"eval_steps_per_second": 5.252,
"step": 1620
},
{
"epoch": 1.3731003472024588,
"grad_norm": 1.8017152547836304,
"learning_rate": 0.0006791666666666667,
"loss": 4.1523,
"step": 1630
},
{
"epoch": 1.3815242757129034,
"grad_norm": 1.2614473104476929,
"learning_rate": 0.0006833333333333333,
"loss": 4.1481,
"step": 1640
},
{
"epoch": 1.389948204223348,
"grad_norm": 1.179167628288269,
"learning_rate": 0.0006875,
"loss": 4.1421,
"step": 1650
},
{
"epoch": 1.3983721327337926,
"grad_norm": 1.463998794555664,
"learning_rate": 0.0006916666666666667,
"loss": 4.1331,
"step": 1660
},
{
"epoch": 1.406796061244237,
"grad_norm": 1.086358666419983,
"learning_rate": 0.0006958333333333334,
"loss": 4.1276,
"step": 1670
},
{
"epoch": 1.4152199897546816,
"grad_norm": 1.3272647857666016,
"learning_rate": 0.0007,
"loss": 4.1357,
"step": 1680
},
{
"epoch": 1.4236439182651262,
"grad_norm": 1.4760971069335938,
"learning_rate": 0.0007041666666666667,
"loss": 4.1299,
"step": 1690
},
{
"epoch": 1.4320678467755705,
"grad_norm": 1.7591749429702759,
"learning_rate": 0.0007083333333333334,
"loss": 4.129,
"step": 1700
},
{
"epoch": 1.4404917752860151,
"grad_norm": 1.7945603132247925,
"learning_rate": 0.0007125,
"loss": 4.1221,
"step": 1710
},
{
"epoch": 1.4404917752860151,
"eval_accuracy": 0.3010639405026742,
"eval_loss": 4.012106895446777,
"eval_runtime": 881.7425,
"eval_samples_per_second": 566.351,
"eval_steps_per_second": 5.244,
"step": 1710
},
{
"epoch": 1.4489157037964597,
"grad_norm": 1.7016360759735107,
"learning_rate": 0.0007166666666666667,
"loss": 4.1043,
"step": 1720
},
{
"epoch": 1.4573396323069043,
"grad_norm": 1.8240207433700562,
"learning_rate": 0.0007208333333333333,
"loss": 4.1034,
"step": 1730
},
{
"epoch": 1.4657635608173487,
"grad_norm": 2.4510786533355713,
"learning_rate": 0.000725,
"loss": 4.0924,
"step": 1740
},
{
"epoch": 1.4741874893277933,
"grad_norm": 1.7411324977874756,
"learning_rate": 0.0007291666666666666,
"loss": 4.1041,
"step": 1750
},
{
"epoch": 1.4826114178382377,
"grad_norm": 1.1133612394332886,
"learning_rate": 0.0007333333333333333,
"loss": 4.1064,
"step": 1760
},
{
"epoch": 1.4910353463486823,
"grad_norm": 1.3936740159988403,
"learning_rate": 0.0007375000000000001,
"loss": 4.0954,
"step": 1770
},
{
"epoch": 1.499459274859127,
"grad_norm": 2.3855819702148438,
"learning_rate": 0.0007416666666666667,
"loss": 4.0836,
"step": 1780
},
{
"epoch": 1.5078832033695715,
"grad_norm": 1.2734453678131104,
"learning_rate": 0.0007458333333333334,
"loss": 4.0834,
"step": 1790
},
{
"epoch": 1.516307131880016,
"grad_norm": 1.432719349861145,
"learning_rate": 0.00075,
"loss": 4.0711,
"step": 1800
},
{
"epoch": 1.516307131880016,
"eval_accuracy": 0.3055703004736556,
"eval_loss": 3.976287841796875,
"eval_runtime": 881.3595,
"eval_samples_per_second": 566.597,
"eval_steps_per_second": 5.246,
"step": 1800
},
{
"epoch": 1.5247310603904605,
"grad_norm": 1.5839996337890625,
"learning_rate": 0.0007541666666666667,
"loss": 4.0712,
"step": 1810
},
{
"epoch": 1.5331549889009048,
"grad_norm": 3.0461270809173584,
"learning_rate": 0.0007583333333333333,
"loss": 4.0617,
"step": 1820
},
{
"epoch": 1.5415789174113494,
"grad_norm": 1.760568380355835,
"learning_rate": 0.0007624999999999999,
"loss": 4.0486,
"step": 1830
},
{
"epoch": 1.550002845921794,
"grad_norm": 1.6682184934616089,
"learning_rate": 0.0007666666666666667,
"loss": 4.0034,
"step": 1840
},
{
"epoch": 1.5584267744322386,
"grad_norm": 1.4350653886795044,
"learning_rate": 0.0007708333333333334,
"loss": 3.9644,
"step": 1850
},
{
"epoch": 1.5668507029426832,
"grad_norm": 1.4870712757110596,
"learning_rate": 0.0007750000000000001,
"loss": 3.9314,
"step": 1860
},
{
"epoch": 1.5752746314531276,
"grad_norm": 1.7954463958740234,
"learning_rate": 0.0007791666666666667,
"loss": 3.8939,
"step": 1870
},
{
"epoch": 1.5836985599635722,
"grad_norm": 2.1485602855682373,
"learning_rate": 0.0007833333333333334,
"loss": 3.8576,
"step": 1880
},
{
"epoch": 1.5921224884740166,
"grad_norm": 1.647570252418518,
"learning_rate": 0.0007875,
"loss": 3.8159,
"step": 1890
},
{
"epoch": 1.5921224884740166,
"eval_accuracy": 0.3353472770952767,
"eval_loss": 3.6341910362243652,
"eval_runtime": 881.1424,
"eval_samples_per_second": 566.737,
"eval_steps_per_second": 5.248,
"step": 1890
},
{
"epoch": 1.6005464169844612,
"grad_norm": 1.7171742916107178,
"learning_rate": 0.0007916666666666666,
"loss": 3.7812,
"step": 1900
},
{
"epoch": 1.6089703454949058,
"grad_norm": 2.12190580368042,
"learning_rate": 0.0007958333333333333,
"loss": 3.7402,
"step": 1910
},
{
"epoch": 1.6173942740053504,
"grad_norm": 1.7334414720535278,
"learning_rate": 0.0008,
"loss": 3.7025,
"step": 1920
},
{
"epoch": 1.625818202515795,
"grad_norm": 1.8880668878555298,
"learning_rate": 0.0008041666666666667,
"loss": 3.6808,
"step": 1930
},
{
"epoch": 1.6342421310262394,
"grad_norm": 2.3294591903686523,
"learning_rate": 0.0008083333333333333,
"loss": 3.6419,
"step": 1940
},
{
"epoch": 1.642666059536684,
"grad_norm": 2.4122796058654785,
"learning_rate": 0.0008125000000000001,
"loss": 3.6114,
"step": 1950
},
{
"epoch": 1.6510899880471284,
"grad_norm": 2.090388774871826,
"learning_rate": 0.0008166666666666667,
"loss": 3.5867,
"step": 1960
},
{
"epoch": 1.659513916557573,
"grad_norm": 2.267676830291748,
"learning_rate": 0.0008208333333333334,
"loss": 3.5501,
"step": 1970
},
{
"epoch": 1.6679378450680176,
"grad_norm": 2.253739833831787,
"learning_rate": 0.000825,
"loss": 3.5114,
"step": 1980
},
{
"epoch": 1.6679378450680176,
"eval_accuracy": 0.38861593633258434,
"eval_loss": 3.2597665786743164,
"eval_runtime": 889.3264,
"eval_samples_per_second": 561.522,
"eval_steps_per_second": 5.199,
"step": 1980
},
{
"epoch": 1.6763617735784622,
"grad_norm": 2.269505739212036,
"learning_rate": 0.0008291666666666667,
"loss": 3.4854,
"step": 1990
},
{
"epoch": 1.6847857020889065,
"grad_norm": 1.7237802743911743,
"learning_rate": 0.0008333333333333334,
"loss": 3.4651,
"step": 2000
},
{
"epoch": 1.6932096305993511,
"grad_norm": 2.1117663383483887,
"learning_rate": 0.0008375,
"loss": 3.4558,
"step": 2010
},
{
"epoch": 1.7016335591097955,
"grad_norm": 2.1351046562194824,
"learning_rate": 0.0008416666666666667,
"loss": 3.4256,
"step": 2020
},
{
"epoch": 1.7100574876202401,
"grad_norm": 2.326232671737671,
"learning_rate": 0.0008458333333333333,
"loss": 3.3998,
"step": 2030
},
{
"epoch": 1.7184814161306847,
"grad_norm": 2.1802730560302734,
"learning_rate": 0.00085,
"loss": 3.3865,
"step": 2040
},
{
"epoch": 1.7269053446411293,
"grad_norm": 2.042966604232788,
"learning_rate": 0.0008541666666666666,
"loss": 3.3539,
"step": 2050
},
{
"epoch": 1.735329273151574,
"grad_norm": 2.052464008331299,
"learning_rate": 0.0008583333333333333,
"loss": 3.3308,
"step": 2060
},
{
"epoch": 1.7437532016620183,
"grad_norm": 1.5790934562683105,
"learning_rate": 0.0008625000000000001,
"loss": 3.3122,
"step": 2070
},
{
"epoch": 1.7437532016620183,
"eval_accuracy": 0.41178756961484836,
"eval_loss": 3.0882680416107178,
"eval_runtime": 878.4742,
"eval_samples_per_second": 568.458,
"eval_steps_per_second": 5.264,
"step": 2070
},
{
"epoch": 1.752177130172463,
"grad_norm": 2.2859761714935303,
"learning_rate": 0.0008666666666666667,
"loss": 3.3034,
"step": 2080
},
{
"epoch": 1.7606010586829073,
"grad_norm": 2.912191867828369,
"learning_rate": 0.0008708333333333334,
"loss": 3.289,
"step": 2090
},
{
"epoch": 1.7690249871933519,
"grad_norm": 2.143118143081665,
"learning_rate": 0.000875,
"loss": 3.2547,
"step": 2100
},
{
"epoch": 1.7774489157037965,
"grad_norm": 1.8577404022216797,
"learning_rate": 0.0008791666666666667,
"loss": 3.2383,
"step": 2110
},
{
"epoch": 1.785872844214241,
"grad_norm": 1.9692562818527222,
"learning_rate": 0.0008833333333333333,
"loss": 3.2137,
"step": 2120
},
{
"epoch": 1.7942967727246857,
"grad_norm": 1.938915729522705,
"learning_rate": 0.0008874999999999999,
"loss": 3.1909,
"step": 2130
},
{
"epoch": 1.80272070123513,
"grad_norm": 1.395321011543274,
"learning_rate": 0.0008916666666666667,
"loss": 3.1346,
"step": 2140
},
{
"epoch": 1.8111446297455744,
"grad_norm": 1.8771544694900513,
"learning_rate": 0.0008958333333333334,
"loss": 3.1035,
"step": 2150
},
{
"epoch": 1.819568558256019,
"grad_norm": 1.5829336643218994,
"learning_rate": 0.0009000000000000001,
"loss": 3.0328,
"step": 2160
},
{
"epoch": 1.819568558256019,
"eval_accuracy": 0.45304088376136725,
"eval_loss": 2.8062996864318848,
"eval_runtime": 886.0675,
"eval_samples_per_second": 563.587,
"eval_steps_per_second": 5.219,
"step": 2160
},
{
"epoch": 1.8279924867664636,
"grad_norm": 1.5085866451263428,
"learning_rate": 0.0009041666666666667,
"loss": 3.0089,
"step": 2170
},
{
"epoch": 1.8364164152769082,
"grad_norm": 1.4988549947738647,
"learning_rate": 0.0009083333333333334,
"loss": 2.9786,
"step": 2180
},
{
"epoch": 1.8448403437873528,
"grad_norm": 1.5726799964904785,
"learning_rate": 0.0009125,
"loss": 2.936,
"step": 2190
},
{
"epoch": 1.8532642722977972,
"grad_norm": 1.2175358533859253,
"learning_rate": 0.0009166666666666666,
"loss": 2.8996,
"step": 2200
},
{
"epoch": 1.8616882008082418,
"grad_norm": 1.4195218086242676,
"learning_rate": 0.0009208333333333333,
"loss": 2.8664,
"step": 2210
},
{
"epoch": 1.8701121293186862,
"grad_norm": 1.1213312149047852,
"learning_rate": 0.000925,
"loss": 2.8382,
"step": 2220
},
{
"epoch": 1.8785360578291308,
"grad_norm": 1.169554591178894,
"learning_rate": 0.0009291666666666667,
"loss": 2.8026,
"step": 2230
},
{
"epoch": 1.8869599863395754,
"grad_norm": 1.4759305715560913,
"learning_rate": 0.0009333333333333333,
"loss": 2.7654,
"step": 2240
},
{
"epoch": 1.89538391485002,
"grad_norm": 1.3071763515472412,
"learning_rate": 0.0009375,
"loss": 2.7311,
"step": 2250
},
{
"epoch": 1.89538391485002,
"eval_accuracy": 0.4917409385648686,
"eval_loss": 2.5433878898620605,
"eval_runtime": 879.3794,
"eval_samples_per_second": 567.873,
"eval_steps_per_second": 5.258,
"step": 2250
},
{
"epoch": 1.9038078433604646,
"grad_norm": 0.9968194961547852,
"learning_rate": 0.0009416666666666667,
"loss": 2.7044,
"step": 2260
},
{
"epoch": 1.912231771870909,
"grad_norm": 1.1783692836761475,
"learning_rate": 0.0009458333333333334,
"loss": 2.6819,
"step": 2270
},
{
"epoch": 1.9206557003813534,
"grad_norm": 0.9856918454170227,
"learning_rate": 0.00095,
"loss": 2.6528,
"step": 2280
},
{
"epoch": 1.929079628891798,
"grad_norm": 1.0605028867721558,
"learning_rate": 0.0009541666666666667,
"loss": 2.6226,
"step": 2290
},
{
"epoch": 1.9375035574022426,
"grad_norm": 0.8553977608680725,
"learning_rate": 0.0009583333333333334,
"loss": 2.608,
"step": 2300
},
{
"epoch": 1.9459274859126872,
"grad_norm": 0.9543612599372864,
"learning_rate": 0.0009625,
"loss": 2.5865,
"step": 2310
},
{
"epoch": 1.9543514144231318,
"grad_norm": 1.1085282564163208,
"learning_rate": 0.0009666666666666667,
"loss": 2.5586,
"step": 2320
},
{
"epoch": 1.9627753429335761,
"grad_norm": 0.8689624667167664,
"learning_rate": 0.0009708333333333333,
"loss": 2.541,
"step": 2330
},
{
"epoch": 1.9711992714440207,
"grad_norm": 0.6790447235107422,
"learning_rate": 0.000975,
"loss": 2.5214,
"step": 2340
},
{
"epoch": 1.9711992714440207,
"eval_accuracy": 0.5198810557311793,
"eval_loss": 2.3582663536071777,
"eval_runtime": 891.4654,
"eval_samples_per_second": 560.174,
"eval_steps_per_second": 5.187,
"step": 2340
},
{
"epoch": 1.9796231999544651,
"grad_norm": 1.1572414636611938,
"learning_rate": 0.0009791666666666666,
"loss": 2.5126,
"step": 2350
},
{
"epoch": 1.9880471284649097,
"grad_norm": 0.8218650221824646,
"learning_rate": 0.0009833333333333332,
"loss": 2.4903,
"step": 2360
},
{
"epoch": 1.9964710569753543,
"grad_norm": 0.9195880889892578,
"learning_rate": 0.0009875,
"loss": 2.479,
"step": 2370
},
{
"epoch": 2.004894985485799,
"grad_norm": 0.6436383724212646,
"learning_rate": 0.0009916666666666667,
"loss": 2.4509,
"step": 2380
},
{
"epoch": 2.0133189139962435,
"grad_norm": 0.9757860898971558,
"learning_rate": 0.0009958333333333334,
"loss": 2.453,
"step": 2390
},
{
"epoch": 2.021742842506688,
"grad_norm": 0.8884423971176147,
"learning_rate": 0.001,
"loss": 2.428,
"step": 2400
},
{
"epoch": 2.0301667710171323,
"grad_norm": 1.097330093383789,
"learning_rate": 0.000999009900990099,
"loss": 2.4139,
"step": 2410
},
{
"epoch": 2.038590699527577,
"grad_norm": 1.095337152481079,
"learning_rate": 0.0009980198019801981,
"loss": 2.4024,
"step": 2420
},
{
"epoch": 2.0470146280380215,
"grad_norm": 1.0757551193237305,
"learning_rate": 0.000997029702970297,
"loss": 2.3853,
"step": 2430
},
{
"epoch": 2.0470146280380215,
"eval_accuracy": 0.538133837771306,
"eval_loss": 2.2352097034454346,
"eval_runtime": 883.4374,
"eval_samples_per_second": 565.265,
"eval_steps_per_second": 5.234,
"step": 2430
},
{
"epoch": 2.055438556548466,
"grad_norm": 0.9356153011322021,
"learning_rate": 0.000996039603960396,
"loss": 2.3669,
"step": 2440
},
{
"epoch": 2.0638624850589107,
"grad_norm": 0.8463107347488403,
"learning_rate": 0.000995049504950495,
"loss": 2.3604,
"step": 2450
},
{
"epoch": 2.0722864135693553,
"grad_norm": 0.8833483457565308,
"learning_rate": 0.0009940594059405941,
"loss": 2.3574,
"step": 2460
},
{
"epoch": 2.0807103420797994,
"grad_norm": 0.7081923484802246,
"learning_rate": 0.0009930693069306932,
"loss": 2.3338,
"step": 2470
},
{
"epoch": 2.089134270590244,
"grad_norm": 0.5993143916130066,
"learning_rate": 0.000992079207920792,
"loss": 2.3219,
"step": 2480
},
{
"epoch": 2.0975581991006886,
"grad_norm": 0.8431512117385864,
"learning_rate": 0.000991089108910891,
"loss": 2.3108,
"step": 2490
},
{
"epoch": 2.1059821276111332,
"grad_norm": 0.9983824491500854,
"learning_rate": 0.0009900990099009901,
"loss": 2.305,
"step": 2500
},
{
"epoch": 2.114406056121578,
"grad_norm": 0.6354156732559204,
"learning_rate": 0.0009891089108910892,
"loss": 2.2965,
"step": 2510
},
{
"epoch": 2.1228299846320224,
"grad_norm": 0.8491016626358032,
"learning_rate": 0.0009881188118811882,
"loss": 2.2763,
"step": 2520
},
{
"epoch": 2.1228299846320224,
"eval_accuracy": 0.5540495533549666,
"eval_loss": 2.135758399963379,
"eval_runtime": 895.5557,
"eval_samples_per_second": 557.616,
"eval_steps_per_second": 5.163,
"step": 2520
},
{
"epoch": 2.131253913142467,
"grad_norm": 0.6909253001213074,
"learning_rate": 0.000987128712871287,
"loss": 2.2696,
"step": 2530
},
{
"epoch": 2.139677841652911,
"grad_norm": 0.5072851181030273,
"learning_rate": 0.000986138613861386,
"loss": 2.2555,
"step": 2540
},
{
"epoch": 2.148101770163356,
"grad_norm": 0.7575969696044922,
"learning_rate": 0.0009851485148514852,
"loss": 2.2552,
"step": 2550
},
{
"epoch": 2.1565256986738004,
"grad_norm": 0.7418563365936279,
"learning_rate": 0.0009841584158415842,
"loss": 2.2439,
"step": 2560
},
{
"epoch": 2.164949627184245,
"grad_norm": 0.5893211960792542,
"learning_rate": 0.0009831683168316833,
"loss": 2.2282,
"step": 2570
},
{
"epoch": 2.1733735556946896,
"grad_norm": 0.892035186290741,
"learning_rate": 0.000982178217821782,
"loss": 2.2201,
"step": 2580
},
{
"epoch": 2.181797484205134,
"grad_norm": 0.688275933265686,
"learning_rate": 0.0009811881188118811,
"loss": 2.2174,
"step": 2590
},
{
"epoch": 2.1902214127155784,
"grad_norm": 0.5092687010765076,
"learning_rate": 0.0009801980198019802,
"loss": 2.2032,
"step": 2600
},
{
"epoch": 2.198645341226023,
"grad_norm": 0.6715185642242432,
"learning_rate": 0.0009792079207920793,
"loss": 2.189,
"step": 2610
},
{
"epoch": 2.198645341226023,
"eval_accuracy": 0.5674450081410035,
"eval_loss": 2.053079605102539,
"eval_runtime": 876.7453,
"eval_samples_per_second": 569.579,
"eval_steps_per_second": 5.274,
"step": 2610
},
{
"epoch": 2.2070692697364676,
"grad_norm": 0.5717750191688538,
"learning_rate": 0.0009782178217821783,
"loss": 2.1894,
"step": 2620
},
{
"epoch": 2.215493198246912,
"grad_norm": 0.7002500295639038,
"learning_rate": 0.0009772277227722771,
"loss": 2.1851,
"step": 2630
},
{
"epoch": 2.2239171267573568,
"grad_norm": 0.6041799783706665,
"learning_rate": 0.0009762376237623762,
"loss": 2.1899,
"step": 2640
},
{
"epoch": 2.2323410552678014,
"grad_norm": 0.40263745188713074,
"learning_rate": 0.0009752475247524752,
"loss": 2.1633,
"step": 2650
},
{
"epoch": 2.240764983778246,
"grad_norm": 0.47779303789138794,
"learning_rate": 0.0009742574257425743,
"loss": 2.1478,
"step": 2660
},
{
"epoch": 2.24918891228869,
"grad_norm": 0.8906975984573364,
"learning_rate": 0.0009732673267326732,
"loss": 2.1508,
"step": 2670
},
{
"epoch": 2.2576128407991347,
"grad_norm": 0.4588846266269684,
"learning_rate": 0.0009722772277227723,
"loss": 2.1422,
"step": 2680
},
{
"epoch": 2.2660367693095793,
"grad_norm": 0.6038916707038879,
"learning_rate": 0.0009712871287128712,
"loss": 2.1229,
"step": 2690
},
{
"epoch": 2.274460697820024,
"grad_norm": 0.792378842830658,
"learning_rate": 0.0009702970297029703,
"loss": 2.1262,
"step": 2700
},
{
"epoch": 2.274460697820024,
"eval_accuracy": 0.5767164906847645,
"eval_loss": 1.9968212842941284,
"eval_runtime": 890.0794,
"eval_samples_per_second": 561.047,
"eval_steps_per_second": 5.195,
"step": 2700
},
{
"epoch": 2.2828846263304685,
"grad_norm": 0.5215600728988647,
"learning_rate": 0.0009693069306930693,
"loss": 2.1315,
"step": 2710
},
{
"epoch": 2.291308554840913,
"grad_norm": 0.42443060874938965,
"learning_rate": 0.0009683168316831683,
"loss": 2.1075,
"step": 2720
},
{
"epoch": 2.2997324833513577,
"grad_norm": 0.7379765510559082,
"learning_rate": 0.0009673267326732673,
"loss": 2.0997,
"step": 2730
},
{
"epoch": 2.308156411861802,
"grad_norm": 0.532883882522583,
"learning_rate": 0.0009663366336633663,
"loss": 2.1009,
"step": 2740
},
{
"epoch": 2.3165803403722465,
"grad_norm": 0.4312550127506256,
"learning_rate": 0.0009653465346534653,
"loss": 2.0836,
"step": 2750
},
{
"epoch": 2.325004268882691,
"grad_norm": 0.42506101727485657,
"learning_rate": 0.0009643564356435644,
"loss": 2.0751,
"step": 2760
},
{
"epoch": 2.3334281973931357,
"grad_norm": 0.9728929400444031,
"learning_rate": 0.0009633663366336633,
"loss": 2.0755,
"step": 2770
},
{
"epoch": 2.3418521259035803,
"grad_norm": 0.4502295255661011,
"learning_rate": 0.0009623762376237624,
"loss": 2.0757,
"step": 2780
},
{
"epoch": 2.350276054414025,
"grad_norm": 0.6825786232948303,
"learning_rate": 0.0009613861386138613,
"loss": 2.0593,
"step": 2790
},
{
"epoch": 2.350276054414025,
"eval_accuracy": 0.5877788692302428,
"eval_loss": 1.932070255279541,
"eval_runtime": 877.2049,
"eval_samples_per_second": 569.281,
"eval_steps_per_second": 5.271,
"step": 2790
},
{
"epoch": 2.3586999829244695,
"grad_norm": 0.5142760276794434,
"learning_rate": 0.0009603960396039604,
"loss": 2.0529,
"step": 2800
},
{
"epoch": 2.3671239114349136,
"grad_norm": 0.613132119178772,
"learning_rate": 0.0009594059405940594,
"loss": 2.0423,
"step": 2810
},
{
"epoch": 2.3755478399453582,
"grad_norm": 0.7282253503799438,
"learning_rate": 0.0009584158415841584,
"loss": 2.0522,
"step": 2820
},
{
"epoch": 2.383971768455803,
"grad_norm": 0.37959426641464233,
"learning_rate": 0.0009574257425742574,
"loss": 2.0367,
"step": 2830
},
{
"epoch": 2.3923956969662474,
"grad_norm": 0.35326164960861206,
"learning_rate": 0.0009564356435643564,
"loss": 2.0233,
"step": 2840
},
{
"epoch": 2.400819625476692,
"grad_norm": 0.8196151256561279,
"learning_rate": 0.0009554455445544554,
"loss": 2.0264,
"step": 2850
},
{
"epoch": 2.409243553987136,
"grad_norm": 0.7122208476066589,
"learning_rate": 0.0009544554455445545,
"loss": 2.0308,
"step": 2860
},
{
"epoch": 2.417667482497581,
"grad_norm": 0.35665011405944824,
"learning_rate": 0.0009534653465346534,
"loss": 2.0133,
"step": 2870
},
{
"epoch": 2.4260914110080254,
"grad_norm": 0.3755228519439697,
"learning_rate": 0.0009524752475247525,
"loss": 1.9992,
"step": 2880
},
{
"epoch": 2.4260914110080254,
"eval_accuracy": 0.596780331496744,
"eval_loss": 1.8819479942321777,
"eval_runtime": 890.4504,
"eval_samples_per_second": 560.813,
"eval_steps_per_second": 5.193,
"step": 2880
},
{
"epoch": 2.43451533951847,
"grad_norm": 0.7018378376960754,
"learning_rate": 0.0009514851485148514,
"loss": 2.0013,
"step": 2890
},
{
"epoch": 2.4429392680289146,
"grad_norm": 0.4874301850795746,
"learning_rate": 0.0009504950495049505,
"loss": 1.9971,
"step": 2900
},
{
"epoch": 2.451363196539359,
"grad_norm": 0.45909377932548523,
"learning_rate": 0.0009495049504950495,
"loss": 1.9881,
"step": 2910
},
{
"epoch": 2.459787125049804,
"grad_norm": 0.4965904951095581,
"learning_rate": 0.0009485148514851485,
"loss": 1.989,
"step": 2920
},
{
"epoch": 2.468211053560248,
"grad_norm": 0.4780527949333191,
"learning_rate": 0.0009475247524752475,
"loss": 1.9795,
"step": 2930
},
{
"epoch": 2.4766349820706925,
"grad_norm": 0.5145118236541748,
"learning_rate": 0.0009465346534653465,
"loss": 1.973,
"step": 2940
},
{
"epoch": 2.485058910581137,
"grad_norm": 0.5469622015953064,
"learning_rate": 0.0009455445544554455,
"loss": 1.9692,
"step": 2950
},
{
"epoch": 2.4934828390915817,
"grad_norm": 0.5788788199424744,
"learning_rate": 0.0009445544554455446,
"loss": 1.9627,
"step": 2960
},
{
"epoch": 2.5019067676020263,
"grad_norm": 0.5380696654319763,
"learning_rate": 0.0009435643564356435,
"loss": 1.9624,
"step": 2970
},
{
"epoch": 2.5019067676020263,
"eval_accuracy": 0.6028271764812113,
"eval_loss": 1.8441975116729736,
"eval_runtime": 877.1334,
"eval_samples_per_second": 569.327,
"eval_steps_per_second": 5.272,
"step": 2970
},
{
"epoch": 2.510330696112471,
"grad_norm": 0.4939862787723541,
"learning_rate": 0.0009425742574257426,
"loss": 1.9576,
"step": 2980
},
{
"epoch": 2.5187546246229155,
"grad_norm": 0.4804815649986267,
"learning_rate": 0.0009415841584158415,
"loss": 1.948,
"step": 2990
},
{
"epoch": 2.5271785531333597,
"grad_norm": 0.529515266418457,
"learning_rate": 0.0009405940594059406,
"loss": 1.9414,
"step": 3000
},
{
"epoch": 2.5356024816438043,
"grad_norm": 0.5104151964187622,
"learning_rate": 0.0009396039603960396,
"loss": 1.9472,
"step": 3010
},
{
"epoch": 2.544026410154249,
"grad_norm": 0.36934202909469604,
"learning_rate": 0.0009386138613861386,
"loss": 1.9358,
"step": 3020
},
{
"epoch": 2.5524503386646935,
"grad_norm": 0.5956403017044067,
"learning_rate": 0.0009376237623762376,
"loss": 1.9272,
"step": 3030
},
{
"epoch": 2.560874267175138,
"grad_norm": 0.5035738348960876,
"learning_rate": 0.0009366336633663367,
"loss": 1.934,
"step": 3040
},
{
"epoch": 2.5692981956855827,
"grad_norm": 0.44133296608924866,
"learning_rate": 0.0009356435643564357,
"loss": 1.9192,
"step": 3050
},
{
"epoch": 2.5777221241960273,
"grad_norm": 0.617588996887207,
"learning_rate": 0.0009346534653465348,
"loss": 1.9189,
"step": 3060
},
{
"epoch": 2.5777221241960273,
"eval_accuracy": 0.6097417836200192,
"eval_loss": 1.806692123413086,
"eval_runtime": 890.173,
"eval_samples_per_second": 560.988,
"eval_steps_per_second": 5.194,
"step": 3060
},
{
"epoch": 2.5861460527064715,
"grad_norm": 0.4702962338924408,
"learning_rate": 0.0009336633663366337,
"loss": 1.9145,
"step": 3070
},
{
"epoch": 2.594569981216916,
"grad_norm": 0.37163108587265015,
"learning_rate": 0.0009326732673267328,
"loss": 1.907,
"step": 3080
},
{
"epoch": 2.6029939097273607,
"grad_norm": 0.8039525151252747,
"learning_rate": 0.0009316831683168317,
"loss": 1.9071,
"step": 3090
},
{
"epoch": 2.6114178382378053,
"grad_norm": 0.3594844341278076,
"learning_rate": 0.0009306930693069308,
"loss": 1.9109,
"step": 3100
},
{
"epoch": 2.61984176674825,
"grad_norm": 0.44677871465682983,
"learning_rate": 0.0009297029702970298,
"loss": 1.8948,
"step": 3110
},
{
"epoch": 2.628265695258694,
"grad_norm": 0.4496874511241913,
"learning_rate": 0.0009287128712871288,
"loss": 1.893,
"step": 3120
},
{
"epoch": 2.636689623769139,
"grad_norm": 0.44437769055366516,
"learning_rate": 0.0009277227722772278,
"loss": 1.8891,
"step": 3130
},
{
"epoch": 2.6451135522795832,
"grad_norm": 0.47511276602745056,
"learning_rate": 0.0009267326732673268,
"loss": 1.8828,
"step": 3140
},
{
"epoch": 2.653537480790028,
"grad_norm": 0.5357436537742615,
"learning_rate": 0.0009257425742574258,
"loss": 1.8802,
"step": 3150
},
{
"epoch": 2.653537480790028,
"eval_accuracy": 0.6167399590165771,
"eval_loss": 1.7698620557785034,
"eval_runtime": 887.5592,
"eval_samples_per_second": 562.64,
"eval_steps_per_second": 5.21,
"step": 3150
},
{
"epoch": 2.6619614093004724,
"grad_norm": 0.5014392137527466,
"learning_rate": 0.0009247524752475249,
"loss": 1.8819,
"step": 3160
},
{
"epoch": 2.670385337810917,
"grad_norm": 0.41872531175613403,
"learning_rate": 0.0009237623762376238,
"loss": 1.8736,
"step": 3170
},
{
"epoch": 2.6788092663213616,
"grad_norm": 0.4343492388725281,
"learning_rate": 0.0009227722772277229,
"loss": 1.8659,
"step": 3180
},
{
"epoch": 2.687233194831806,
"grad_norm": 0.45470404624938965,
"learning_rate": 0.0009217821782178218,
"loss": 1.8689,
"step": 3190
},
{
"epoch": 2.695657123342251,
"grad_norm": 0.4626518487930298,
"learning_rate": 0.0009207920792079209,
"loss": 1.8606,
"step": 3200
},
{
"epoch": 2.704081051852695,
"grad_norm": 0.4213305711746216,
"learning_rate": 0.0009198019801980199,
"loss": 1.8587,
"step": 3210
},
{
"epoch": 2.7125049803631396,
"grad_norm": 0.5036765336990356,
"learning_rate": 0.0009188118811881188,
"loss": 1.8514,
"step": 3220
},
{
"epoch": 2.720928908873584,
"grad_norm": 0.4738876223564148,
"learning_rate": 0.0009178217821782179,
"loss": 1.8506,
"step": 3230
},
{
"epoch": 2.729352837384029,
"grad_norm": 0.3712784945964813,
"learning_rate": 0.0009168316831683168,
"loss": 1.8461,
"step": 3240
},
{
"epoch": 2.729352837384029,
"eval_accuracy": 0.6231111347423419,
"eval_loss": 1.7313838005065918,
"eval_runtime": 889.784,
"eval_samples_per_second": 561.233,
"eval_steps_per_second": 5.197,
"step": 3240
},
{
"epoch": 2.7377767658944734,
"grad_norm": 0.45651596784591675,
"learning_rate": 0.0009158415841584159,
"loss": 1.8405,
"step": 3250
},
{
"epoch": 2.7462006944049175,
"grad_norm": 0.5253742933273315,
"learning_rate": 0.000914851485148515,
"loss": 1.839,
"step": 3260
},
{
"epoch": 2.754624622915362,
"grad_norm": 0.4810900390148163,
"learning_rate": 0.0009138613861386139,
"loss": 1.8352,
"step": 3270
},
{
"epoch": 2.7630485514258067,
"grad_norm": 0.42353251576423645,
"learning_rate": 0.0009128712871287129,
"loss": 1.8308,
"step": 3280
},
{
"epoch": 2.7714724799362513,
"grad_norm": 0.34494903683662415,
"learning_rate": 0.0009118811881188119,
"loss": 1.8271,
"step": 3290
},
{
"epoch": 2.779896408446696,
"grad_norm": 0.44857293367385864,
"learning_rate": 0.0009108910891089109,
"loss": 1.8272,
"step": 3300
},
{
"epoch": 2.7883203369571405,
"grad_norm": 0.32810303568840027,
"learning_rate": 0.00090990099009901,
"loss": 1.8201,
"step": 3310
},
{
"epoch": 2.796744265467585,
"grad_norm": 0.5814313292503357,
"learning_rate": 0.0009089108910891089,
"loss": 1.8181,
"step": 3320
},
{
"epoch": 2.8051681939780293,
"grad_norm": 0.6469531655311584,
"learning_rate": 0.000907920792079208,
"loss": 1.8228,
"step": 3330
},
{
"epoch": 2.8051681939780293,
"eval_accuracy": 0.627194729904968,
"eval_loss": 1.7094751596450806,
"eval_runtime": 879.8799,
"eval_samples_per_second": 567.55,
"eval_steps_per_second": 5.255,
"step": 3330
},
{
"epoch": 2.813592122488474,
"grad_norm": 0.37370234727859497,
"learning_rate": 0.0009069306930693069,
"loss": 1.8143,
"step": 3340
},
{
"epoch": 2.8220160509989185,
"grad_norm": 0.2818905711174011,
"learning_rate": 0.000905940594059406,
"loss": 1.8058,
"step": 3350
},
{
"epoch": 2.830439979509363,
"grad_norm": 0.40032240748405457,
"learning_rate": 0.000904950495049505,
"loss": 1.8037,
"step": 3360
},
{
"epoch": 2.8388639080198077,
"grad_norm": 0.4075703024864197,
"learning_rate": 0.000903960396039604,
"loss": 1.8042,
"step": 3370
},
{
"epoch": 2.8472878365302523,
"grad_norm": 0.4188884496688843,
"learning_rate": 0.000902970297029703,
"loss": 1.7954,
"step": 3380
},
{
"epoch": 2.855711765040697,
"grad_norm": 0.40151095390319824,
"learning_rate": 0.000901980198019802,
"loss": 1.8,
"step": 3390
},
{
"epoch": 2.864135693551141,
"grad_norm": 0.38640516996383667,
"learning_rate": 0.000900990099009901,
"loss": 1.7897,
"step": 3400
},
{
"epoch": 2.8725596220615857,
"grad_norm": 0.46775710582733154,
"learning_rate": 0.0009000000000000001,
"loss": 1.7889,
"step": 3410
},
{
"epoch": 2.8809835505720303,
"grad_norm": 0.5004317760467529,
"learning_rate": 0.000899009900990099,
"loss": 1.7838,
"step": 3420
},
{
"epoch": 2.8809835505720303,
"eval_accuracy": 0.6330453392339891,
"eval_loss": 1.6756778955459595,
"eval_runtime": 890.43,
"eval_samples_per_second": 560.826,
"eval_steps_per_second": 5.193,
"step": 3420
},
{
"epoch": 2.889407479082475,
"grad_norm": 0.44054290652275085,
"learning_rate": 0.0008980198019801981,
"loss": 1.7839,
"step": 3430
},
{
"epoch": 2.8978314075929195,
"grad_norm": 0.38003844022750854,
"learning_rate": 0.000897029702970297,
"loss": 1.7793,
"step": 3440
},
{
"epoch": 2.9062553361033636,
"grad_norm": 0.3714471757411957,
"learning_rate": 0.0008960396039603961,
"loss": 1.7765,
"step": 3450
},
{
"epoch": 2.9146792646138087,
"grad_norm": 0.4955293834209442,
"learning_rate": 0.0008950495049504951,
"loss": 1.7729,
"step": 3460
},
{
"epoch": 2.923103193124253,
"grad_norm": 0.367481529712677,
"learning_rate": 0.0008940594059405941,
"loss": 1.7666,
"step": 3470
},
{
"epoch": 2.9315271216346974,
"grad_norm": 0.48372742533683777,
"learning_rate": 0.0008930693069306931,
"loss": 1.7638,
"step": 3480
},
{
"epoch": 2.939951050145142,
"grad_norm": 0.5356625318527222,
"learning_rate": 0.0008920792079207921,
"loss": 1.7625,
"step": 3490
},
{
"epoch": 2.9483749786555866,
"grad_norm": 0.396090030670166,
"learning_rate": 0.0008910891089108911,
"loss": 1.7597,
"step": 3500
},
{
"epoch": 2.956798907166031,
"grad_norm": 0.3071458041667938,
"learning_rate": 0.0008900990099009902,
"loss": 1.7513,
"step": 3510
},
{
"epoch": 2.956798907166031,
"eval_accuracy": 0.640630813225039,
"eval_loss": 1.6351577043533325,
"eval_runtime": 887.1061,
"eval_samples_per_second": 562.927,
"eval_steps_per_second": 5.212,
"step": 3510
},
{
"epoch": 2.9652228356764754,
"grad_norm": 0.7265316247940063,
"learning_rate": 0.0008891089108910891,
"loss": 1.7482,
"step": 3520
},
{
"epoch": 2.97364676418692,
"grad_norm": 0.34152501821517944,
"learning_rate": 0.0008881188118811882,
"loss": 1.7454,
"step": 3530
},
{
"epoch": 2.9820706926973646,
"grad_norm": 0.5570985078811646,
"learning_rate": 0.0008871287128712871,
"loss": 1.736,
"step": 3540
},
{
"epoch": 2.990494621207809,
"grad_norm": 0.29268133640289307,
"learning_rate": 0.0008861386138613862,
"loss": 1.7323,
"step": 3550
},
{
"epoch": 2.998918549718254,
"grad_norm": 0.4475082755088806,
"learning_rate": 0.0008851485148514852,
"loss": 1.7207,
"step": 3560
},
{
"epoch": 3.0073424782286984,
"grad_norm": 0.39963921904563904,
"learning_rate": 0.0008841584158415842,
"loss": 1.7199,
"step": 3570
},
{
"epoch": 3.015766406739143,
"grad_norm": 0.3290662169456482,
"learning_rate": 0.0008831683168316832,
"loss": 1.7103,
"step": 3580
},
{
"epoch": 3.024190335249587,
"grad_norm": 0.4892579913139343,
"learning_rate": 0.0008821782178217822,
"loss": 1.7024,
"step": 3590
},
{
"epoch": 3.0326142637600317,
"grad_norm": 0.45102205872535706,
"learning_rate": 0.0008811881188118812,
"loss": 1.7012,
"step": 3600
},
{
"epoch": 3.0326142637600317,
"eval_accuracy": 0.65292687328356,
"eval_loss": 1.578561544418335,
"eval_runtime": 889.1801,
"eval_samples_per_second": 561.614,
"eval_steps_per_second": 5.2,
"step": 3600
},
{
"epoch": 3.0410381922704763,
"grad_norm": 0.38877975940704346,
"learning_rate": 0.0008801980198019803,
"loss": 1.6999,
"step": 3610
},
{
"epoch": 3.049462120780921,
"grad_norm": 0.32052722573280334,
"learning_rate": 0.0008792079207920792,
"loss": 1.6898,
"step": 3620
},
{
"epoch": 3.0578860492913655,
"grad_norm": 0.4076586365699768,
"learning_rate": 0.0008782178217821783,
"loss": 1.682,
"step": 3630
},
{
"epoch": 3.06630997780181,
"grad_norm": 0.3886164724826813,
"learning_rate": 0.0008772277227722772,
"loss": 1.6788,
"step": 3640
},
{
"epoch": 3.0747339063122547,
"grad_norm": 0.43478402495384216,
"learning_rate": 0.0008762376237623763,
"loss": 1.6757,
"step": 3650
},
{
"epoch": 3.083157834822699,
"grad_norm": 0.3681798279285431,
"learning_rate": 0.0008752475247524753,
"loss": 1.6725,
"step": 3660
},
{
"epoch": 3.0915817633331435,
"grad_norm": 0.44459056854248047,
"learning_rate": 0.0008742574257425743,
"loss": 1.6653,
"step": 3670
},
{
"epoch": 3.100005691843588,
"grad_norm": 0.3404163420200348,
"learning_rate": 0.0008732673267326733,
"loss": 1.6597,
"step": 3680
},
{
"epoch": 3.1084296203540327,
"grad_norm": 0.39622583985328674,
"learning_rate": 0.0008722772277227722,
"loss": 1.664,
"step": 3690
},
{
"epoch": 3.1084296203540327,
"eval_accuracy": 0.6616252383451875,
"eval_loss": 1.5378377437591553,
"eval_runtime": 880.004,
"eval_samples_per_second": 567.47,
"eval_steps_per_second": 5.255,
"step": 3690
},
{
"epoch": 3.1168535488644773,
"grad_norm": 0.36066505312919617,
"learning_rate": 0.0008712871287128713,
"loss": 1.6552,
"step": 3700
},
{
"epoch": 3.125277477374922,
"grad_norm": 0.45852380990982056,
"learning_rate": 0.0008702970297029704,
"loss": 1.6581,
"step": 3710
},
{
"epoch": 3.1337014058853665,
"grad_norm": 0.3647266924381256,
"learning_rate": 0.0008693069306930693,
"loss": 1.6493,
"step": 3720
},
{
"epoch": 3.1421253343958107,
"grad_norm": 0.4774695038795471,
"learning_rate": 0.0008683168316831684,
"loss": 1.6457,
"step": 3730
},
{
"epoch": 3.1505492629062553,
"grad_norm": 0.4143640398979187,
"learning_rate": 0.0008673267326732673,
"loss": 1.6436,
"step": 3740
},
{
"epoch": 3.1589731914167,
"grad_norm": 0.4920789897441864,
"learning_rate": 0.0008663366336633663,
"loss": 1.6431,
"step": 3750
},
{
"epoch": 3.1673971199271445,
"grad_norm": 0.40231600403785706,
"learning_rate": 0.0008653465346534654,
"loss": 1.6373,
"step": 3760
},
{
"epoch": 3.175821048437589,
"grad_norm": 0.35115131735801697,
"learning_rate": 0.0008643564356435643,
"loss": 1.6343,
"step": 3770
},
{
"epoch": 3.1842449769480337,
"grad_norm": 0.3814195990562439,
"learning_rate": 0.0008633663366336634,
"loss": 1.6345,
"step": 3780
},
{
"epoch": 3.1842449769480337,
"eval_accuracy": 0.6669776046149977,
"eval_loss": 1.5131778717041016,
"eval_runtime": 887.9268,
"eval_samples_per_second": 562.407,
"eval_steps_per_second": 5.208,
"step": 3780
},
{
"epoch": 3.192668905458478,
"grad_norm": 0.3229101896286011,
"learning_rate": 0.0008623762376237623,
"loss": 1.6281,
"step": 3790
},
{
"epoch": 3.2010928339689224,
"grad_norm": 0.4361475110054016,
"learning_rate": 0.0008613861386138614,
"loss": 1.6253,
"step": 3800
},
{
"epoch": 3.209516762479367,
"grad_norm": 0.3246362507343292,
"learning_rate": 0.0008603960396039604,
"loss": 1.6269,
"step": 3810
},
{
"epoch": 3.2179406909898116,
"grad_norm": 0.5126762390136719,
"learning_rate": 0.0008594059405940594,
"loss": 1.62,
"step": 3820
},
{
"epoch": 3.226364619500256,
"grad_norm": 0.3813638389110565,
"learning_rate": 0.0008584158415841584,
"loss": 1.6228,
"step": 3830
},
{
"epoch": 3.234788548010701,
"grad_norm": 0.5111351013183594,
"learning_rate": 0.0008574257425742574,
"loss": 1.6162,
"step": 3840
},
{
"epoch": 3.243212476521145,
"grad_norm": 0.3448195457458496,
"learning_rate": 0.0008564356435643564,
"loss": 1.6156,
"step": 3850
},
{
"epoch": 3.2516364050315896,
"grad_norm": 0.50129634141922,
"learning_rate": 0.0008554455445544555,
"loss": 1.6153,
"step": 3860
},
{
"epoch": 3.260060333542034,
"grad_norm": 0.3352351188659668,
"learning_rate": 0.0008544554455445544,
"loss": 1.6117,
"step": 3870
},
{
"epoch": 3.260060333542034,
"eval_accuracy": 0.6717362607348063,
"eval_loss": 1.4890562295913696,
"eval_runtime": 886.1465,
"eval_samples_per_second": 563.537,
"eval_steps_per_second": 5.218,
"step": 3870
},
{
"epoch": 3.2684842620524788,
"grad_norm": 0.38713541626930237,
"learning_rate": 0.0008534653465346535,
"loss": 1.6058,
"step": 3880
},
{
"epoch": 3.2769081905629234,
"grad_norm": 0.46299123764038086,
"learning_rate": 0.0008524752475247524,
"loss": 1.6053,
"step": 3890
},
{
"epoch": 3.285332119073368,
"grad_norm": 0.4045964181423187,
"learning_rate": 0.0008514851485148515,
"loss": 1.6064,
"step": 3900
},
{
"epoch": 3.2937560475838126,
"grad_norm": 0.37616729736328125,
"learning_rate": 0.0008504950495049505,
"loss": 1.6005,
"step": 3910
},
{
"epoch": 3.3021799760942567,
"grad_norm": 0.47833314538002014,
"learning_rate": 0.0008495049504950495,
"loss": 1.599,
"step": 3920
},
{
"epoch": 3.3106039046047013,
"grad_norm": 0.436625212430954,
"learning_rate": 0.0008485148514851485,
"loss": 1.5954,
"step": 3930
},
{
"epoch": 3.319027833115146,
"grad_norm": 0.3456842005252838,
"learning_rate": 0.0008475247524752475,
"loss": 1.5924,
"step": 3940
},
{
"epoch": 3.3274517616255905,
"grad_norm": 0.5403941869735718,
"learning_rate": 0.0008465346534653465,
"loss": 1.5915,
"step": 3950
},
{
"epoch": 3.335875690136035,
"grad_norm": 0.3622403144836426,
"learning_rate": 0.0008455445544554456,
"loss": 1.6013,
"step": 3960
},
{
"epoch": 3.335875690136035,
"eval_accuracy": 0.6740560565861919,
"eval_loss": 1.475487232208252,
"eval_runtime": 895.3114,
"eval_samples_per_second": 557.768,
"eval_steps_per_second": 5.165,
"step": 3960
},
{
"epoch": 3.3442996186464797,
"grad_norm": 0.2850242555141449,
"learning_rate": 0.0008445544554455445,
"loss": 1.5903,
"step": 3970
},
{
"epoch": 3.3527235471569243,
"grad_norm": 0.39831429719924927,
"learning_rate": 0.0008435643564356436,
"loss": 1.5846,
"step": 3980
},
{
"epoch": 3.3611474756673685,
"grad_norm": 0.4886794686317444,
"learning_rate": 0.0008425742574257425,
"loss": 1.5876,
"step": 3990
},
{
"epoch": 3.369571404177813,
"grad_norm": 0.35439977049827576,
"learning_rate": 0.0008415841584158416,
"loss": 1.5839,
"step": 4000
},
{
"epoch": 3.3779953326882577,
"grad_norm": 0.32369595766067505,
"learning_rate": 0.0008405940594059406,
"loss": 1.5797,
"step": 4010
},
{
"epoch": 3.3864192611987023,
"grad_norm": 0.48595139384269714,
"learning_rate": 0.0008396039603960396,
"loss": 1.58,
"step": 4020
},
{
"epoch": 3.394843189709147,
"grad_norm": 0.39331361651420593,
"learning_rate": 0.0008386138613861386,
"loss": 1.5786,
"step": 4030
},
{
"epoch": 3.4032671182195915,
"grad_norm": 0.31911513209342957,
"learning_rate": 0.0008376237623762376,
"loss": 1.5745,
"step": 4040
},
{
"epoch": 3.411691046730036,
"grad_norm": 0.319876104593277,
"learning_rate": 0.0008366336633663366,
"loss": 1.5749,
"step": 4050
},
{
"epoch": 3.411691046730036,
"eval_accuracy": 0.6780886041474171,
"eval_loss": 1.4578139781951904,
"eval_runtime": 880.4333,
"eval_samples_per_second": 567.193,
"eval_steps_per_second": 5.252,
"step": 4050
},
{
"epoch": 3.4201149752404802,
"grad_norm": 0.45969948172569275,
"learning_rate": 0.0008356435643564357,
"loss": 1.5759,
"step": 4060
},
{
"epoch": 3.428538903750925,
"grad_norm": 0.34449151158332825,
"learning_rate": 0.0008346534653465346,
"loss": 1.5707,
"step": 4070
},
{
"epoch": 3.4369628322613694,
"grad_norm": 0.3478371202945709,
"learning_rate": 0.0008336633663366337,
"loss": 1.5699,
"step": 4080
},
{
"epoch": 3.445386760771814,
"grad_norm": 0.5127679109573364,
"learning_rate": 0.0008326732673267326,
"loss": 1.5668,
"step": 4090
},
{
"epoch": 3.4538106892822587,
"grad_norm": 0.302216500043869,
"learning_rate": 0.0008316831683168317,
"loss": 1.5647,
"step": 4100
},
{
"epoch": 3.4622346177927033,
"grad_norm": 0.3295814096927643,
"learning_rate": 0.0008306930693069307,
"loss": 1.5628,
"step": 4110
},
{
"epoch": 3.4706585463031474,
"grad_norm": 0.4209032654762268,
"learning_rate": 0.0008297029702970297,
"loss": 1.5628,
"step": 4120
},
{
"epoch": 3.479082474813592,
"grad_norm": 0.34786614775657654,
"learning_rate": 0.0008287128712871287,
"loss": 1.5613,
"step": 4130
},
{
"epoch": 3.4875064033240366,
"grad_norm": 0.4870763421058655,
"learning_rate": 0.0008277227722772277,
"loss": 1.5584,
"step": 4140
},
{
"epoch": 3.4875064033240366,
"eval_accuracy": 0.6804383346028876,
"eval_loss": 1.4444972276687622,
"eval_runtime": 891.9286,
"eval_samples_per_second": 559.883,
"eval_steps_per_second": 5.184,
"step": 4140
},
{
"epoch": 3.495930331834481,
"grad_norm": 0.31641605496406555,
"learning_rate": 0.0008267326732673267,
"loss": 1.5581,
"step": 4150
},
{
"epoch": 3.504354260344926,
"grad_norm": 0.31303870677948,
"learning_rate": 0.0008257425742574258,
"loss": 1.5548,
"step": 4160
},
{
"epoch": 3.5127781888553704,
"grad_norm": 0.35413628816604614,
"learning_rate": 0.0008247524752475247,
"loss": 1.5506,
"step": 4170
},
{
"epoch": 3.5212021173658146,
"grad_norm": 0.39600226283073425,
"learning_rate": 0.0008237623762376238,
"loss": 1.5517,
"step": 4180
},
{
"epoch": 3.529626045876259,
"grad_norm": 0.3600960075855255,
"learning_rate": 0.0008227722772277227,
"loss": 1.5563,
"step": 4190
},
{
"epoch": 3.5380499743867038,
"grad_norm": 0.2877024710178375,
"learning_rate": 0.0008217821782178218,
"loss": 1.5467,
"step": 4200
},
{
"epoch": 3.5464739028971484,
"grad_norm": 0.42324578762054443,
"learning_rate": 0.0008207920792079208,
"loss": 1.546,
"step": 4210
},
{
"epoch": 3.554897831407593,
"grad_norm": 0.38907232880592346,
"learning_rate": 0.0008198019801980197,
"loss": 1.5458,
"step": 4220
},
{
"epoch": 3.5633217599180376,
"grad_norm": 0.34750425815582275,
"learning_rate": 0.0008188118811881188,
"loss": 1.5437,
"step": 4230
},
{
"epoch": 3.5633217599180376,
"eval_accuracy": 0.6840987986477044,
"eval_loss": 1.4261698722839355,
"eval_runtime": 886.2695,
"eval_samples_per_second": 563.458,
"eval_steps_per_second": 5.217,
"step": 4230
},
{
"epoch": 3.571745688428482,
"grad_norm": 0.3718611001968384,
"learning_rate": 0.0008178217821782177,
"loss": 1.546,
"step": 4240
},
{
"epoch": 3.5801696169389263,
"grad_norm": 0.39119917154312134,
"learning_rate": 0.0008168316831683168,
"loss": 1.5411,
"step": 4250
},
{
"epoch": 3.588593545449371,
"grad_norm": 0.45689284801483154,
"learning_rate": 0.0008158415841584159,
"loss": 1.5416,
"step": 4260
},
{
"epoch": 3.5970174739598155,
"grad_norm": 0.4029008150100708,
"learning_rate": 0.0008148514851485148,
"loss": 1.5364,
"step": 4270
},
{
"epoch": 3.60544140247026,
"grad_norm": 0.3843879997730255,
"learning_rate": 0.0008138613861386138,
"loss": 1.5368,
"step": 4280
},
{
"epoch": 3.6138653309807047,
"grad_norm": 0.33945897221565247,
"learning_rate": 0.0008128712871287128,
"loss": 1.5369,
"step": 4290
},
{
"epoch": 3.6222892594911493,
"grad_norm": 0.29753997921943665,
"learning_rate": 0.000811881188118812,
"loss": 1.5326,
"step": 4300
},
{
"epoch": 3.630713188001594,
"grad_norm": 0.4412858784198761,
"learning_rate": 0.000810891089108911,
"loss": 1.5316,
"step": 4310
},
{
"epoch": 3.639137116512038,
"grad_norm": 0.30377647280693054,
"learning_rate": 0.00080990099009901,
"loss": 1.5308,
"step": 4320
},
{
"epoch": 3.639137116512038,
"eval_accuracy": 0.6865785598346558,
"eval_loss": 1.4111888408660889,
"eval_runtime": 880.9823,
"eval_samples_per_second": 566.84,
"eval_steps_per_second": 5.249,
"step": 4320
},
{
"epoch": 3.6475610450224827,
"grad_norm": 0.3666999638080597,
"learning_rate": 0.000808910891089109,
"loss": 1.5279,
"step": 4330
},
{
"epoch": 3.6559849735329273,
"grad_norm": 0.3254301846027374,
"learning_rate": 0.0008079207920792079,
"loss": 1.5277,
"step": 4340
},
{
"epoch": 3.664408902043372,
"grad_norm": 0.4963987469673157,
"learning_rate": 0.000806930693069307,
"loss": 1.5286,
"step": 4350
},
{
"epoch": 3.6728328305538165,
"grad_norm": 0.34190070629119873,
"learning_rate": 0.000805940594059406,
"loss": 1.5294,
"step": 4360
},
{
"epoch": 3.6812567590642606,
"grad_norm": 0.35153254866600037,
"learning_rate": 0.000804950495049505,
"loss": 1.5217,
"step": 4370
},
{
"epoch": 3.6896806875747057,
"grad_norm": 0.345929354429245,
"learning_rate": 0.000803960396039604,
"loss": 1.52,
"step": 4380
},
{
"epoch": 3.69810461608515,
"grad_norm": 0.37540799379348755,
"learning_rate": 0.000802970297029703,
"loss": 1.5208,
"step": 4390
},
{
"epoch": 3.7065285445955944,
"grad_norm": 0.33499011397361755,
"learning_rate": 0.000801980198019802,
"loss": 1.5196,
"step": 4400
},
{
"epoch": 3.714952473106039,
"grad_norm": 0.3461949825286865,
"learning_rate": 0.0008009900990099011,
"loss": 1.5188,
"step": 4410
},
{
"epoch": 3.714952473106039,
"eval_accuracy": 0.6888913088166951,
"eval_loss": 1.40292227268219,
"eval_runtime": 882.772,
"eval_samples_per_second": 565.691,
"eval_steps_per_second": 5.238,
"step": 4410
},
{
"epoch": 3.7233764016164836,
"grad_norm": 0.36491358280181885,
"learning_rate": 0.0008,
"loss": 1.5171,
"step": 4420
},
{
"epoch": 3.7318003301269282,
"grad_norm": 0.2799367606639862,
"learning_rate": 0.0007990099009900991,
"loss": 1.5142,
"step": 4430
},
{
"epoch": 3.7402242586373724,
"grad_norm": 0.361971914768219,
"learning_rate": 0.000798019801980198,
"loss": 1.5145,
"step": 4440
},
{
"epoch": 3.7486481871478174,
"grad_norm": 0.2618056535720825,
"learning_rate": 0.0007970297029702971,
"loss": 1.5113,
"step": 4450
},
{
"epoch": 3.7570721156582616,
"grad_norm": 0.5228148698806763,
"learning_rate": 0.0007960396039603961,
"loss": 1.5111,
"step": 4460
},
{
"epoch": 3.765496044168706,
"grad_norm": 0.37740132212638855,
"learning_rate": 0.0007950495049504951,
"loss": 1.5121,
"step": 4470
},
{
"epoch": 3.773919972679151,
"grad_norm": 0.3701629340648651,
"learning_rate": 0.0007940594059405941,
"loss": 1.5083,
"step": 4480
},
{
"epoch": 3.7823439011895954,
"grad_norm": 0.3345108926296234,
"learning_rate": 0.0007930693069306931,
"loss": 1.5077,
"step": 4490
},
{
"epoch": 3.79076782970004,
"grad_norm": 0.3989773988723755,
"learning_rate": 0.0007920792079207921,
"loss": 1.5079,
"step": 4500
},
{
"epoch": 3.79076782970004,
"eval_accuracy": 0.6907081981543249,
"eval_loss": 1.3909889459609985,
"eval_runtime": 889.7203,
"eval_samples_per_second": 561.273,
"eval_steps_per_second": 5.197,
"step": 4500
},
{
"epoch": 3.799191758210484,
"grad_norm": 0.284728080034256,
"learning_rate": 0.0007910891089108912,
"loss": 1.5046,
"step": 4510
},
{
"epoch": 3.8076156867209288,
"grad_norm": 0.5029779672622681,
"learning_rate": 0.0007900990099009901,
"loss": 1.5049,
"step": 4520
},
{
"epoch": 3.8160396152313734,
"grad_norm": 0.32617345452308655,
"learning_rate": 0.0007891089108910892,
"loss": 1.5068,
"step": 4530
},
{
"epoch": 3.824463543741818,
"grad_norm": 0.36316540837287903,
"learning_rate": 0.0007881188118811881,
"loss": 1.4999,
"step": 4540
},
{
"epoch": 3.8328874722522626,
"grad_norm": 0.30240392684936523,
"learning_rate": 0.0007871287128712872,
"loss": 1.498,
"step": 4550
},
{
"epoch": 3.841311400762707,
"grad_norm": 0.3905390202999115,
"learning_rate": 0.0007861386138613862,
"loss": 1.4978,
"step": 4560
},
{
"epoch": 3.8497353292731518,
"grad_norm": 0.30473875999450684,
"learning_rate": 0.0007851485148514852,
"loss": 1.4965,
"step": 4570
},
{
"epoch": 3.858159257783596,
"grad_norm": 0.3675777316093445,
"learning_rate": 0.0007841584158415842,
"loss": 1.4957,
"step": 4580
},
{
"epoch": 3.8665831862940405,
"grad_norm": 0.394168883562088,
"learning_rate": 0.0007831683168316832,
"loss": 1.4936,
"step": 4590
},
{
"epoch": 3.8665831862940405,
"eval_accuracy": 0.6926193728848408,
"eval_loss": 1.3844850063323975,
"eval_runtime": 887.3028,
"eval_samples_per_second": 562.802,
"eval_steps_per_second": 5.211,
"step": 4590
},
{
"epoch": 3.875007114804485,
"grad_norm": 0.3404500186443329,
"learning_rate": 0.0007821782178217822,
"loss": 1.4956,
"step": 4600
},
{
"epoch": 3.8834310433149297,
"grad_norm": 0.3074527978897095,
"learning_rate": 0.0007811881188118813,
"loss": 1.4928,
"step": 4610
},
{
"epoch": 3.8918549718253743,
"grad_norm": 0.44941094517707825,
"learning_rate": 0.0007801980198019802,
"loss": 1.4911,
"step": 4620
},
{
"epoch": 3.900278900335819,
"grad_norm": 0.3098917603492737,
"learning_rate": 0.0007792079207920793,
"loss": 1.4918,
"step": 4630
},
{
"epoch": 3.9087028288462635,
"grad_norm": 0.37436243891716003,
"learning_rate": 0.0007782178217821782,
"loss": 1.4866,
"step": 4640
},
{
"epoch": 3.9171267573567077,
"grad_norm": 0.3058597445487976,
"learning_rate": 0.0007772277227722773,
"loss": 1.4896,
"step": 4650
},
{
"epoch": 3.9255506858671523,
"grad_norm": 0.34245744347572327,
"learning_rate": 0.0007762376237623763,
"loss": 1.4874,
"step": 4660
},
{
"epoch": 3.933974614377597,
"grad_norm": 0.3401254117488861,
"learning_rate": 0.0007752475247524753,
"loss": 1.4866,
"step": 4670
},
{
"epoch": 3.9423985428880415,
"grad_norm": 0.35778889060020447,
"learning_rate": 0.0007742574257425743,
"loss": 1.4818,
"step": 4680
},
{
"epoch": 3.9423985428880415,
"eval_accuracy": 0.6951155140000936,
"eval_loss": 1.3689333200454712,
"eval_runtime": 879.8095,
"eval_samples_per_second": 567.596,
"eval_steps_per_second": 5.256,
"step": 4680
},
{
"epoch": 3.950822471398486,
"grad_norm": 0.2895776927471161,
"learning_rate": 0.0007732673267326733,
"loss": 1.4822,
"step": 4690
},
{
"epoch": 3.9592463999089302,
"grad_norm": 0.3483330309391022,
"learning_rate": 0.0007722772277227723,
"loss": 1.4802,
"step": 4700
},
{
"epoch": 3.9676703284193753,
"grad_norm": 0.30115026235580444,
"learning_rate": 0.0007712871287128714,
"loss": 1.4838,
"step": 4710
},
{
"epoch": 3.9760942569298194,
"grad_norm": 0.32046666741371155,
"learning_rate": 0.0007702970297029703,
"loss": 1.4799,
"step": 4720
},
{
"epoch": 3.984518185440264,
"grad_norm": 0.3833225965499878,
"learning_rate": 0.0007693069306930694,
"loss": 1.4785,
"step": 4730
},
{
"epoch": 3.9929421139507086,
"grad_norm": 0.30888909101486206,
"learning_rate": 0.0007683168316831683,
"loss": 1.475,
"step": 4740
},
{
"epoch": 4.001366042461153,
"grad_norm": 0.32462459802627563,
"learning_rate": 0.0007673267326732674,
"loss": 1.4746,
"step": 4750
},
{
"epoch": 4.009789970971598,
"grad_norm": 0.3200187683105469,
"learning_rate": 0.0007663366336633664,
"loss": 1.4768,
"step": 4760
},
{
"epoch": 4.018213899482042,
"grad_norm": 0.3794704079627991,
"learning_rate": 0.0007653465346534654,
"loss": 1.4761,
"step": 4770
},
{
"epoch": 4.018213899482042,
"eval_accuracy": 0.6969660848927619,
"eval_loss": 1.3595411777496338,
"eval_runtime": 887.2228,
"eval_samples_per_second": 562.853,
"eval_steps_per_second": 5.212,
"step": 4770
},
{
"epoch": 4.026637827992487,
"grad_norm": 0.27933019399642944,
"learning_rate": 0.0007643564356435644,
"loss": 1.47,
"step": 4780
},
{
"epoch": 4.035061756502931,
"grad_norm": 0.32542508840560913,
"learning_rate": 0.0007633663366336634,
"loss": 1.4726,
"step": 4790
},
{
"epoch": 4.043485685013376,
"grad_norm": 0.3638169765472412,
"learning_rate": 0.0007623762376237624,
"loss": 1.4697,
"step": 4800
},
{
"epoch": 4.05190961352382,
"grad_norm": 0.3762564957141876,
"learning_rate": 0.0007613861386138615,
"loss": 1.4663,
"step": 4810
},
{
"epoch": 4.0603335420342646,
"grad_norm": 0.36758995056152344,
"learning_rate": 0.0007603960396039604,
"loss": 1.4729,
"step": 4820
},
{
"epoch": 4.06875747054471,
"grad_norm": 0.34590932726860046,
"learning_rate": 0.0007594059405940595,
"loss": 1.4665,
"step": 4830
},
{
"epoch": 4.077181399055154,
"grad_norm": 0.3242778182029724,
"learning_rate": 0.0007584158415841584,
"loss": 1.4639,
"step": 4840
},
{
"epoch": 4.085605327565599,
"grad_norm": 0.3849882185459137,
"learning_rate": 0.0007574257425742574,
"loss": 1.4613,
"step": 4850
},
{
"epoch": 4.094029256076043,
"grad_norm": 0.3495323061943054,
"learning_rate": 0.0007564356435643565,
"loss": 1.4598,
"step": 4860
},
{
"epoch": 4.094029256076043,
"eval_accuracy": 0.6996214986490302,
"eval_loss": 1.3455697298049927,
"eval_runtime": 887.3091,
"eval_samples_per_second": 562.798,
"eval_steps_per_second": 5.211,
"step": 4860
},
{
"epoch": 4.102453184586488,
"grad_norm": 0.3290145993232727,
"learning_rate": 0.0007554455445544554,
"loss": 1.4601,
"step": 4870
},
{
"epoch": 4.110877113096932,
"grad_norm": 0.34369096159935,
"learning_rate": 0.0007544554455445545,
"loss": 1.4603,
"step": 4880
},
{
"epoch": 4.119301041607376,
"grad_norm": 0.3350279629230499,
"learning_rate": 0.0007534653465346534,
"loss": 1.4609,
"step": 4890
},
{
"epoch": 4.127724970117821,
"grad_norm": 0.2575846016407013,
"learning_rate": 0.0007524752475247525,
"loss": 1.4565,
"step": 4900
},
{
"epoch": 4.1361488986282655,
"grad_norm": 0.3337861895561218,
"learning_rate": 0.0007514851485148515,
"loss": 1.4574,
"step": 4910
},
{
"epoch": 4.144572827138711,
"grad_norm": 0.3752147853374481,
"learning_rate": 0.0007504950495049505,
"loss": 1.4594,
"step": 4920
},
{
"epoch": 4.152996755649155,
"grad_norm": 0.29587122797966003,
"learning_rate": 0.0007495049504950495,
"loss": 1.4518,
"step": 4930
},
{
"epoch": 4.161420684159599,
"grad_norm": 0.2764742374420166,
"learning_rate": 0.0007485148514851485,
"loss": 1.4514,
"step": 4940
},
{
"epoch": 4.169844612670044,
"grad_norm": 0.4625591039657593,
"learning_rate": 0.0007475247524752475,
"loss": 1.4527,
"step": 4950
},
{
"epoch": 4.169844612670044,
"eval_accuracy": 0.701515475804278,
"eval_loss": 1.3361947536468506,
"eval_runtime": 883.9818,
"eval_samples_per_second": 564.917,
"eval_steps_per_second": 5.231,
"step": 4950
},
{
"epoch": 4.178268541180488,
"grad_norm": 0.29412004351615906,
"learning_rate": 0.0007465346534653466,
"loss": 1.4514,
"step": 4960
},
{
"epoch": 4.186692469690933,
"grad_norm": 0.3580242693424225,
"learning_rate": 0.0007455445544554455,
"loss": 1.4486,
"step": 4970
},
{
"epoch": 4.195116398201377,
"grad_norm": 0.46256908774375916,
"learning_rate": 0.0007445544554455446,
"loss": 1.4494,
"step": 4980
},
{
"epoch": 4.203540326711822,
"grad_norm": 0.3117842674255371,
"learning_rate": 0.0007435643564356435,
"loss": 1.4486,
"step": 4990
},
{
"epoch": 4.2119642552222665,
"grad_norm": 0.3382858335971832,
"learning_rate": 0.0007425742574257426,
"loss": 1.4452,
"step": 5000
},
{
"epoch": 4.220388183732711,
"grad_norm": 0.3153148889541626,
"learning_rate": 0.0007415841584158416,
"loss": 1.4465,
"step": 5010
},
{
"epoch": 4.228812112243156,
"grad_norm": 0.3635173439979553,
"learning_rate": 0.0007405940594059406,
"loss": 1.4443,
"step": 5020
},
{
"epoch": 4.2372360407536,
"grad_norm": 0.4260285794734955,
"learning_rate": 0.0007396039603960396,
"loss": 1.4454,
"step": 5030
},
{
"epoch": 4.245659969264045,
"grad_norm": 0.29188039898872375,
"learning_rate": 0.0007386138613861386,
"loss": 1.4442,
"step": 5040
},
{
"epoch": 4.245659969264045,
"eval_accuracy": 0.7031089800515327,
"eval_loss": 1.3285191059112549,
"eval_runtime": 890.9721,
"eval_samples_per_second": 560.484,
"eval_steps_per_second": 5.19,
"step": 5040
},
{
"epoch": 4.254083897774489,
"grad_norm": 0.5350555777549744,
"learning_rate": 0.0007376237623762376,
"loss": 1.4416,
"step": 5050
},
{
"epoch": 4.262507826284934,
"grad_norm": 0.35281315445899963,
"learning_rate": 0.0007366336633663367,
"loss": 1.4432,
"step": 5060
},
{
"epoch": 4.270931754795378,
"grad_norm": 0.37922871112823486,
"learning_rate": 0.0007356435643564356,
"loss": 1.4399,
"step": 5070
},
{
"epoch": 4.279355683305822,
"grad_norm": 0.3072182238101959,
"learning_rate": 0.0007346534653465347,
"loss": 1.4383,
"step": 5080
},
{
"epoch": 4.287779611816267,
"grad_norm": 0.30223241448402405,
"learning_rate": 0.0007336633663366336,
"loss": 1.4406,
"step": 5090
},
{
"epoch": 4.296203540326712,
"grad_norm": 0.5292770862579346,
"learning_rate": 0.0007326732673267327,
"loss": 1.4376,
"step": 5100
},
{
"epoch": 4.304627468837157,
"grad_norm": 0.35330840945243835,
"learning_rate": 0.0007316831683168317,
"loss": 1.4389,
"step": 5110
},
{
"epoch": 4.313051397347601,
"grad_norm": 0.30719104409217834,
"learning_rate": 0.0007306930693069307,
"loss": 1.4384,
"step": 5120
},
{
"epoch": 4.321475325858046,
"grad_norm": 0.34203872084617615,
"learning_rate": 0.0007297029702970297,
"loss": 1.4374,
"step": 5130
},
{
"epoch": 4.321475325858046,
"eval_accuracy": 0.7048288335521147,
"eval_loss": 1.3187906742095947,
"eval_runtime": 887.0787,
"eval_samples_per_second": 562.944,
"eval_steps_per_second": 5.213,
"step": 5130
},
{
"epoch": 4.32989925436849,
"grad_norm": 0.38140207529067993,
"learning_rate": 0.0007287128712871287,
"loss": 1.4353,
"step": 5140
},
{
"epoch": 4.338323182878934,
"grad_norm": 0.303752064704895,
"learning_rate": 0.0007277227722772277,
"loss": 1.4336,
"step": 5150
},
{
"epoch": 4.346747111389379,
"grad_norm": 0.290764719247818,
"learning_rate": 0.0007267326732673268,
"loss": 1.4304,
"step": 5160
},
{
"epoch": 4.355171039899823,
"grad_norm": 0.4335167407989502,
"learning_rate": 0.0007257425742574257,
"loss": 1.4327,
"step": 5170
},
{
"epoch": 4.363594968410268,
"grad_norm": 0.3198365271091461,
"learning_rate": 0.0007247524752475248,
"loss": 1.4319,
"step": 5180
},
{
"epoch": 4.3720188969207125,
"grad_norm": 0.41567763686180115,
"learning_rate": 0.0007237623762376237,
"loss": 1.4318,
"step": 5190
},
{
"epoch": 4.380442825431157,
"grad_norm": 0.3342703580856323,
"learning_rate": 0.0007227722772277228,
"loss": 1.4298,
"step": 5200
},
{
"epoch": 4.388866753941602,
"grad_norm": 0.25702279806137085,
"learning_rate": 0.0007217821782178218,
"loss": 1.4265,
"step": 5210
},
{
"epoch": 4.397290682452046,
"grad_norm": 0.26949411630630493,
"learning_rate": 0.0007207920792079208,
"loss": 1.4278,
"step": 5220
},
{
"epoch": 4.397290682452046,
"eval_accuracy": 0.7063243134470976,
"eval_loss": 1.3113943338394165,
"eval_runtime": 889.8031,
"eval_samples_per_second": 561.221,
"eval_steps_per_second": 5.197,
"step": 5220
},
{
"epoch": 4.405714610962491,
"grad_norm": 0.3861467242240906,
"learning_rate": 0.0007198019801980198,
"loss": 1.4318,
"step": 5230
},
{
"epoch": 4.414138539472935,
"grad_norm": 0.34858283400535583,
"learning_rate": 0.0007188118811881188,
"loss": 1.4291,
"step": 5240
},
{
"epoch": 4.42256246798338,
"grad_norm": 0.3346785604953766,
"learning_rate": 0.0007178217821782178,
"loss": 1.425,
"step": 5250
},
{
"epoch": 4.430986396493824,
"grad_norm": 0.3916323184967041,
"learning_rate": 0.0007168316831683169,
"loss": 1.4241,
"step": 5260
},
{
"epoch": 4.439410325004269,
"grad_norm": 0.2802947759628296,
"learning_rate": 0.0007158415841584158,
"loss": 1.4221,
"step": 5270
},
{
"epoch": 4.4478342535147135,
"grad_norm": 0.4092938303947449,
"learning_rate": 0.0007148514851485149,
"loss": 1.4236,
"step": 5280
},
{
"epoch": 4.456258182025158,
"grad_norm": 0.25096723437309265,
"learning_rate": 0.0007138613861386138,
"loss": 1.4235,
"step": 5290
},
{
"epoch": 4.464682110535603,
"grad_norm": 0.3570871949195862,
"learning_rate": 0.0007128712871287129,
"loss": 1.4216,
"step": 5300
},
{
"epoch": 4.473106039046047,
"grad_norm": 0.3168172240257263,
"learning_rate": 0.0007118811881188119,
"loss": 1.4236,
"step": 5310
},
{
"epoch": 4.473106039046047,
"eval_accuracy": 0.7076842136916008,
"eval_loss": 1.307774543762207,
"eval_runtime": 889.4836,
"eval_samples_per_second": 561.422,
"eval_steps_per_second": 5.199,
"step": 5310
},
{
"epoch": 4.481529967556492,
"grad_norm": 0.30059170722961426,
"learning_rate": 0.0007108910891089109,
"loss": 1.4193,
"step": 5320
},
{
"epoch": 4.489953896066936,
"grad_norm": 0.331824392080307,
"learning_rate": 0.0007099009900990099,
"loss": 1.4185,
"step": 5330
},
{
"epoch": 4.49837782457738,
"grad_norm": 0.3295821249485016,
"learning_rate": 0.0007089108910891088,
"loss": 1.4198,
"step": 5340
},
{
"epoch": 4.506801753087825,
"grad_norm": 0.3506734371185303,
"learning_rate": 0.0007079207920792079,
"loss": 1.4167,
"step": 5350
},
{
"epoch": 4.515225681598269,
"grad_norm": 0.3836129903793335,
"learning_rate": 0.000706930693069307,
"loss": 1.417,
"step": 5360
},
{
"epoch": 4.5236496101087145,
"grad_norm": 0.3046220541000366,
"learning_rate": 0.0007059405940594059,
"loss": 1.4177,
"step": 5370
},
{
"epoch": 4.532073538619159,
"grad_norm": 0.37655332684516907,
"learning_rate": 0.000704950495049505,
"loss": 1.4149,
"step": 5380
},
{
"epoch": 4.540497467129603,
"grad_norm": 0.32939672470092773,
"learning_rate": 0.0007039603960396039,
"loss": 1.4165,
"step": 5390
},
{
"epoch": 4.548921395640048,
"grad_norm": 0.2900882363319397,
"learning_rate": 0.0007029702970297029,
"loss": 1.4128,
"step": 5400
},
{
"epoch": 4.548921395640048,
"eval_accuracy": 0.7087959913049944,
"eval_loss": 1.3013147115707397,
"eval_runtime": 892.9333,
"eval_samples_per_second": 559.253,
"eval_steps_per_second": 5.178,
"step": 5400
},
{
"epoch": 4.557345324150492,
"grad_norm": 0.27651771903038025,
"learning_rate": 0.000701980198019802,
"loss": 1.4122,
"step": 5410
},
{
"epoch": 4.565769252660937,
"grad_norm": 0.4160715639591217,
"learning_rate": 0.0007009900990099009,
"loss": 1.4122,
"step": 5420
},
{
"epoch": 4.574193181171381,
"grad_norm": 0.2724072337150574,
"learning_rate": 0.0007,
"loss": 1.41,
"step": 5430
},
{
"epoch": 4.582617109681826,
"grad_norm": 0.35586145520210266,
"learning_rate": 0.0006990099009900989,
"loss": 1.4118,
"step": 5440
},
{
"epoch": 4.59104103819227,
"grad_norm": 0.3268265128135681,
"learning_rate": 0.000698019801980198,
"loss": 1.4117,
"step": 5450
},
{
"epoch": 4.599464966702715,
"grad_norm": 0.3230002522468567,
"learning_rate": 0.000697029702970297,
"loss": 1.4102,
"step": 5460
},
{
"epoch": 4.60788889521316,
"grad_norm": 0.25019174814224243,
"learning_rate": 0.000696039603960396,
"loss": 1.4102,
"step": 5470
},
{
"epoch": 4.616312823723604,
"grad_norm": 0.38475289940834045,
"learning_rate": 0.000695049504950495,
"loss": 1.4075,
"step": 5480
},
{
"epoch": 4.624736752234049,
"grad_norm": 0.39824309945106506,
"learning_rate": 0.000694059405940594,
"loss": 1.4077,
"step": 5490
},
{
"epoch": 4.624736752234049,
"eval_accuracy": 0.7098417264518991,
"eval_loss": 1.2926928997039795,
"eval_runtime": 881.9048,
"eval_samples_per_second": 566.247,
"eval_steps_per_second": 5.243,
"step": 5490
},
{
"epoch": 4.633160680744493,
"grad_norm": 0.3250022828578949,
"learning_rate": 0.000693069306930693,
"loss": 1.4068,
"step": 5500
},
{
"epoch": 4.641584609254938,
"grad_norm": 0.32388612627983093,
"learning_rate": 0.0006920792079207921,
"loss": 1.4062,
"step": 5510
},
{
"epoch": 4.650008537765382,
"grad_norm": 0.2806077003479004,
"learning_rate": 0.000691089108910891,
"loss": 1.4049,
"step": 5520
},
{
"epoch": 4.658432466275826,
"grad_norm": 0.33755025267601013,
"learning_rate": 0.0006900990099009901,
"loss": 1.4045,
"step": 5530
},
{
"epoch": 4.666856394786271,
"grad_norm": 0.4184636175632477,
"learning_rate": 0.000689108910891089,
"loss": 1.4042,
"step": 5540
},
{
"epoch": 4.6752803232967155,
"grad_norm": 0.34234240651130676,
"learning_rate": 0.0006881188118811881,
"loss": 1.4055,
"step": 5550
},
{
"epoch": 4.6837042518071605,
"grad_norm": 0.32120293378829956,
"learning_rate": 0.0006871287128712872,
"loss": 1.4014,
"step": 5560
},
{
"epoch": 4.692128180317605,
"grad_norm": 0.3810026943683624,
"learning_rate": 0.0006861386138613862,
"loss": 1.4039,
"step": 5570
},
{
"epoch": 4.70055210882805,
"grad_norm": 0.3171080946922302,
"learning_rate": 0.0006851485148514852,
"loss": 1.4025,
"step": 5580
},
{
"epoch": 4.70055210882805,
"eval_accuracy": 0.7115425686273988,
"eval_loss": 1.285227656364441,
"eval_runtime": 891.3368,
"eval_samples_per_second": 560.255,
"eval_steps_per_second": 5.188,
"step": 5580
},
{
"epoch": 4.708976037338494,
"grad_norm": 0.24618960916996002,
"learning_rate": 0.0006841584158415842,
"loss": 1.3983,
"step": 5590
},
{
"epoch": 4.717399965848939,
"grad_norm": 0.494895339012146,
"learning_rate": 0.0006831683168316832,
"loss": 1.4,
"step": 5600
},
{
"epoch": 4.725823894359383,
"grad_norm": 0.31908226013183594,
"learning_rate": 0.0006821782178217823,
"loss": 1.3983,
"step": 5610
},
{
"epoch": 4.734247822869827,
"grad_norm": 0.26488983631134033,
"learning_rate": 0.0006811881188118812,
"loss": 1.3956,
"step": 5620
},
{
"epoch": 4.742671751380272,
"grad_norm": 0.3156343102455139,
"learning_rate": 0.0006801980198019803,
"loss": 1.397,
"step": 5630
},
{
"epoch": 4.7510956798907165,
"grad_norm": 0.38938194513320923,
"learning_rate": 0.0006792079207920792,
"loss": 1.3987,
"step": 5640
},
{
"epoch": 4.7595196084011615,
"grad_norm": 0.27233967185020447,
"learning_rate": 0.0006782178217821783,
"loss": 1.3983,
"step": 5650
},
{
"epoch": 4.767943536911606,
"grad_norm": 0.347419410943985,
"learning_rate": 0.0006772277227722773,
"loss": 1.3953,
"step": 5660
},
{
"epoch": 4.77636746542205,
"grad_norm": 0.44131675362586975,
"learning_rate": 0.0006762376237623763,
"loss": 1.3956,
"step": 5670
},
{
"epoch": 4.77636746542205,
"eval_accuracy": 0.7112416746447588,
"eval_loss": 1.290834665298462,
"eval_runtime": 886.5668,
"eval_samples_per_second": 563.269,
"eval_steps_per_second": 5.216,
"step": 5670
},
{
"epoch": 4.784791393932495,
"grad_norm": 0.3185184895992279,
"learning_rate": 0.0006752475247524753,
"loss": 1.3976,
"step": 5680
},
{
"epoch": 4.793215322442939,
"grad_norm": 0.2549585998058319,
"learning_rate": 0.0006742574257425743,
"loss": 1.3931,
"step": 5690
},
{
"epoch": 4.801639250953384,
"grad_norm": 0.315294086933136,
"learning_rate": 0.0006732673267326733,
"loss": 1.393,
"step": 5700
},
{
"epoch": 4.810063179463828,
"grad_norm": 0.3866962492465973,
"learning_rate": 0.0006722772277227724,
"loss": 1.3923,
"step": 5710
},
{
"epoch": 4.818487107974272,
"grad_norm": 0.28364527225494385,
"learning_rate": 0.0006712871287128713,
"loss": 1.3924,
"step": 5720
},
{
"epoch": 4.826911036484717,
"grad_norm": 0.3253314793109894,
"learning_rate": 0.0006702970297029704,
"loss": 1.3914,
"step": 5730
},
{
"epoch": 4.835334964995162,
"grad_norm": 0.31215131282806396,
"learning_rate": 0.0006693069306930693,
"loss": 1.3903,
"step": 5740
},
{
"epoch": 4.843758893505607,
"grad_norm": 0.34929993748664856,
"learning_rate": 0.0006683168316831684,
"loss": 1.3894,
"step": 5750
},
{
"epoch": 4.852182822016051,
"grad_norm": 0.38991761207580566,
"learning_rate": 0.0006673267326732674,
"loss": 1.3924,
"step": 5760
},
{
"epoch": 4.852182822016051,
"eval_accuracy": 0.7133021748514282,
"eval_loss": 1.2766938209533691,
"eval_runtime": 881.7452,
"eval_samples_per_second": 566.35,
"eval_steps_per_second": 5.244,
"step": 5760
},
{
"epoch": 4.860606750526496,
"grad_norm": 0.2888573408126831,
"learning_rate": 0.0006663366336633664,
"loss": 1.3918,
"step": 5770
},
{
"epoch": 4.86903067903694,
"grad_norm": 0.3224232494831085,
"learning_rate": 0.0006653465346534654,
"loss": 1.3895,
"step": 5780
},
{
"epoch": 4.877454607547385,
"grad_norm": 0.3562750518321991,
"learning_rate": 0.0006643564356435644,
"loss": 1.387,
"step": 5790
},
{
"epoch": 4.885878536057829,
"grad_norm": 0.3339401185512543,
"learning_rate": 0.0006633663366336634,
"loss": 1.3886,
"step": 5800
},
{
"epoch": 4.894302464568273,
"grad_norm": 0.3022938072681427,
"learning_rate": 0.0006623762376237625,
"loss": 1.3858,
"step": 5810
},
{
"epoch": 4.902726393078718,
"grad_norm": 0.276065856218338,
"learning_rate": 0.0006613861386138614,
"loss": 1.386,
"step": 5820
},
{
"epoch": 4.9111503215891625,
"grad_norm": 0.3148975372314453,
"learning_rate": 0.0006603960396039605,
"loss": 1.385,
"step": 5830
},
{
"epoch": 4.919574250099608,
"grad_norm": 0.3374193608760834,
"learning_rate": 0.0006594059405940594,
"loss": 1.3842,
"step": 5840
},
{
"epoch": 4.927998178610052,
"grad_norm": 0.3293200135231018,
"learning_rate": 0.0006584158415841585,
"loss": 1.3835,
"step": 5850
},
{
"epoch": 4.927998178610052,
"eval_accuracy": 0.7147221912687882,
"eval_loss": 1.2681052684783936,
"eval_runtime": 890.793,
"eval_samples_per_second": 560.597,
"eval_steps_per_second": 5.191,
"step": 5850
},
{
"epoch": 4.936422107120496,
"grad_norm": 0.3032568693161011,
"learning_rate": 0.0006574257425742575,
"loss": 1.3828,
"step": 5860
},
{
"epoch": 4.944846035630941,
"grad_norm": 0.24251434206962585,
"learning_rate": 0.0006564356435643565,
"loss": 1.3818,
"step": 5870
},
{
"epoch": 4.953269964141385,
"grad_norm": 0.3096301257610321,
"learning_rate": 0.0006554455445544555,
"loss": 1.3814,
"step": 5880
},
{
"epoch": 4.96169389265183,
"grad_norm": 0.34841156005859375,
"learning_rate": 0.0006544554455445545,
"loss": 1.3823,
"step": 5890
},
{
"epoch": 4.970117821162274,
"grad_norm": 0.312688946723938,
"learning_rate": 0.0006534653465346535,
"loss": 1.3818,
"step": 5900
},
{
"epoch": 4.978541749672719,
"grad_norm": 0.30799320340156555,
"learning_rate": 0.0006524752475247526,
"loss": 1.379,
"step": 5910
},
{
"epoch": 4.9869656781831635,
"grad_norm": 0.3510371148586273,
"learning_rate": 0.0006514851485148515,
"loss": 1.3814,
"step": 5920
},
{
"epoch": 4.9953896066936085,
"grad_norm": 0.2894381582736969,
"learning_rate": 0.0006504950495049506,
"loss": 1.3812,
"step": 5930
},
{
"epoch": 5.003813535204053,
"grad_norm": 0.2685450315475464,
"learning_rate": 0.0006495049504950495,
"loss": 1.3788,
"step": 5940
},
{
"epoch": 5.003813535204053,
"eval_accuracy": 0.7160080315056353,
"eval_loss": 1.2630343437194824,
"eval_runtime": 883.8805,
"eval_samples_per_second": 564.981,
"eval_steps_per_second": 5.231,
"step": 5940
},
{
"epoch": 5.012237463714497,
"grad_norm": 0.38857927918434143,
"learning_rate": 0.0006485148514851485,
"loss": 1.3809,
"step": 5950
},
{
"epoch": 5.020661392224942,
"grad_norm": 0.2822309136390686,
"learning_rate": 0.0006475247524752476,
"loss": 1.3769,
"step": 5960
},
{
"epoch": 5.029085320735386,
"grad_norm": 0.2725491523742676,
"learning_rate": 0.0006465346534653465,
"loss": 1.3762,
"step": 5970
},
{
"epoch": 5.037509249245831,
"grad_norm": 0.32517486810684204,
"learning_rate": 0.0006455445544554456,
"loss": 1.377,
"step": 5980
},
{
"epoch": 5.045933177756275,
"grad_norm": 0.34373360872268677,
"learning_rate": 0.0006445544554455445,
"loss": 1.3774,
"step": 5990
},
{
"epoch": 5.054357106266719,
"grad_norm": 0.3029853403568268,
"learning_rate": 0.0006435643564356436,
"loss": 1.3746,
"step": 6000
},
{
"epoch": 5.0627810347771645,
"grad_norm": 0.5577653646469116,
"learning_rate": 0.0006425742574257426,
"loss": 1.378,
"step": 6010
},
{
"epoch": 5.071204963287609,
"grad_norm": 0.27967342734336853,
"learning_rate": 0.0006415841584158416,
"loss": 1.3779,
"step": 6020
},
{
"epoch": 5.079628891798054,
"grad_norm": 0.2680428624153137,
"learning_rate": 0.0006405940594059406,
"loss": 1.3733,
"step": 6030
},
{
"epoch": 5.079628891798054,
"eval_accuracy": 0.7168763989390342,
"eval_loss": 1.258245825767517,
"eval_runtime": 902.3568,
"eval_samples_per_second": 553.413,
"eval_steps_per_second": 5.124,
"step": 6030
},
{
"epoch": 5.088052820308498,
"grad_norm": 0.24522745609283447,
"learning_rate": 0.0006396039603960396,
"loss": 1.3692,
"step": 6040
},
{
"epoch": 5.096476748818943,
"grad_norm": 0.3076081871986389,
"learning_rate": 0.0006386138613861386,
"loss": 1.3724,
"step": 6050
},
{
"epoch": 5.104900677329387,
"grad_norm": 0.32096347212791443,
"learning_rate": 0.0006376237623762377,
"loss": 1.3737,
"step": 6060
},
{
"epoch": 5.113324605839831,
"grad_norm": 0.35196197032928467,
"learning_rate": 0.0006366336633663366,
"loss": 1.3719,
"step": 6070
},
{
"epoch": 5.121748534350276,
"grad_norm": 0.39065635204315186,
"learning_rate": 0.0006356435643564357,
"loss": 1.3719,
"step": 6080
},
{
"epoch": 5.13017246286072,
"grad_norm": 0.3439326882362366,
"learning_rate": 0.0006346534653465346,
"loss": 1.3749,
"step": 6090
},
{
"epoch": 5.138596391371165,
"grad_norm": 0.3175961673259735,
"learning_rate": 0.0006336633663366337,
"loss": 1.3679,
"step": 6100
},
{
"epoch": 5.14702031988161,
"grad_norm": 0.37071719765663147,
"learning_rate": 0.0006326732673267327,
"loss": 1.3706,
"step": 6110
},
{
"epoch": 5.155444248392055,
"grad_norm": 0.2499271035194397,
"learning_rate": 0.0006316831683168317,
"loss": 1.3685,
"step": 6120
},
{
"epoch": 5.155444248392055,
"eval_accuracy": 0.717981203712741,
"eval_loss": 1.2521748542785645,
"eval_runtime": 885.5528,
"eval_samples_per_second": 563.914,
"eval_steps_per_second": 5.222,
"step": 6120
},
{
"epoch": 5.163868176902499,
"grad_norm": 0.3951607346534729,
"learning_rate": 0.0006306930693069307,
"loss": 1.3671,
"step": 6130
},
{
"epoch": 5.172292105412943,
"grad_norm": 0.4264112114906311,
"learning_rate": 0.0006297029702970297,
"loss": 1.3652,
"step": 6140
},
{
"epoch": 5.180716033923388,
"grad_norm": 0.3097785711288452,
"learning_rate": 0.0006287128712871287,
"loss": 1.3695,
"step": 6150
},
{
"epoch": 5.189139962433832,
"grad_norm": 0.28887125849723816,
"learning_rate": 0.0006277227722772278,
"loss": 1.3658,
"step": 6160
},
{
"epoch": 5.197563890944277,
"grad_norm": 0.27163591980934143,
"learning_rate": 0.0006267326732673267,
"loss": 1.3655,
"step": 6170
},
{
"epoch": 5.205987819454721,
"grad_norm": 0.30266183614730835,
"learning_rate": 0.0006257425742574258,
"loss": 1.3631,
"step": 6180
},
{
"epoch": 5.2144117479651655,
"grad_norm": 0.3191784620285034,
"learning_rate": 0.0006247524752475247,
"loss": 1.3667,
"step": 6190
},
{
"epoch": 5.2228356764756105,
"grad_norm": 0.30907300114631653,
"learning_rate": 0.0006237623762376238,
"loss": 1.3667,
"step": 6200
},
{
"epoch": 5.231259604986055,
"grad_norm": 0.3120558559894562,
"learning_rate": 0.0006227722772277228,
"loss": 1.3638,
"step": 6210
},
{
"epoch": 5.231259604986055,
"eval_accuracy": 0.7190249020483522,
"eval_loss": 1.2470471858978271,
"eval_runtime": 893.7706,
"eval_samples_per_second": 558.73,
"eval_steps_per_second": 5.174,
"step": 6210
},
{
"epoch": 5.2396835334965,
"grad_norm": 0.35595396161079407,
"learning_rate": 0.0006217821782178218,
"loss": 1.3634,
"step": 6220
},
{
"epoch": 5.248107462006944,
"grad_norm": 0.33759573101997375,
"learning_rate": 0.0006207920792079208,
"loss": 1.3661,
"step": 6230
},
{
"epoch": 5.256531390517389,
"grad_norm": 0.26417672634124756,
"learning_rate": 0.0006198019801980198,
"loss": 1.3627,
"step": 6240
},
{
"epoch": 5.264955319027833,
"grad_norm": 0.28236111998558044,
"learning_rate": 0.0006188118811881188,
"loss": 1.362,
"step": 6250
},
{
"epoch": 5.273379247538277,
"grad_norm": 0.5903481245040894,
"learning_rate": 0.0006178217821782179,
"loss": 1.3619,
"step": 6260
},
{
"epoch": 5.281803176048722,
"grad_norm": 0.298475056886673,
"learning_rate": 0.0006168316831683168,
"loss": 1.3671,
"step": 6270
},
{
"epoch": 5.2902271045591664,
"grad_norm": 0.27397215366363525,
"learning_rate": 0.0006158415841584159,
"loss": 1.3611,
"step": 6280
},
{
"epoch": 5.2986510330696115,
"grad_norm": 0.28740593791007996,
"learning_rate": 0.0006148514851485148,
"loss": 1.3579,
"step": 6290
},
{
"epoch": 5.307074961580056,
"grad_norm": 0.274557888507843,
"learning_rate": 0.0006138613861386139,
"loss": 1.3587,
"step": 6300
},
{
"epoch": 5.307074961580056,
"eval_accuracy": 0.719703789624826,
"eval_loss": 1.2432972192764282,
"eval_runtime": 881.2394,
"eval_samples_per_second": 566.675,
"eval_steps_per_second": 5.247,
"step": 6300
},
{
"epoch": 5.315498890090501,
"grad_norm": 0.31431418657302856,
"learning_rate": 0.0006128712871287129,
"loss": 1.3565,
"step": 6310
},
{
"epoch": 5.323922818600945,
"grad_norm": 0.358239084482193,
"learning_rate": 0.0006118811881188119,
"loss": 1.3614,
"step": 6320
},
{
"epoch": 5.332346747111389,
"grad_norm": 0.3043140769004822,
"learning_rate": 0.0006108910891089109,
"loss": 1.3576,
"step": 6330
},
{
"epoch": 5.340770675621834,
"grad_norm": 0.2583385109901428,
"learning_rate": 0.0006099009900990099,
"loss": 1.3578,
"step": 6340
},
{
"epoch": 5.349194604132278,
"grad_norm": 0.3068407475948334,
"learning_rate": 0.0006089108910891089,
"loss": 1.3577,
"step": 6350
},
{
"epoch": 5.357618532642723,
"grad_norm": 0.2893878221511841,
"learning_rate": 0.000607920792079208,
"loss": 1.3569,
"step": 6360
},
{
"epoch": 5.366042461153167,
"grad_norm": 0.2883850634098053,
"learning_rate": 0.0006069306930693069,
"loss": 1.3555,
"step": 6370
},
{
"epoch": 5.3744663896636125,
"grad_norm": 0.3248838484287262,
"learning_rate": 0.000605940594059406,
"loss": 1.3561,
"step": 6380
},
{
"epoch": 5.382890318174057,
"grad_norm": 0.29167214035987854,
"learning_rate": 0.0006049504950495049,
"loss": 1.3582,
"step": 6390
},
{
"epoch": 5.382890318174057,
"eval_accuracy": 0.7203339064191229,
"eval_loss": 1.241172432899475,
"eval_runtime": 891.2006,
"eval_samples_per_second": 560.341,
"eval_steps_per_second": 5.189,
"step": 6390
},
{
"epoch": 5.391314246684501,
"grad_norm": 0.3090030550956726,
"learning_rate": 0.000603960396039604,
"loss": 1.3534,
"step": 6400
},
{
"epoch": 5.399738175194946,
"grad_norm": 0.25337210297584534,
"learning_rate": 0.000602970297029703,
"loss": 1.3564,
"step": 6410
},
{
"epoch": 5.40816210370539,
"grad_norm": 0.25656768679618835,
"learning_rate": 0.000601980198019802,
"loss": 1.3549,
"step": 6420
},
{
"epoch": 5.416586032215835,
"grad_norm": 0.2951459288597107,
"learning_rate": 0.000600990099009901,
"loss": 1.3518,
"step": 6430
},
{
"epoch": 5.425009960726279,
"grad_norm": 0.2697450816631317,
"learning_rate": 0.0006,
"loss": 1.3531,
"step": 6440
},
{
"epoch": 5.433433889236724,
"grad_norm": 0.28866857290267944,
"learning_rate": 0.000599009900990099,
"loss": 1.3524,
"step": 6450
},
{
"epoch": 5.441857817747168,
"grad_norm": 0.26775673031806946,
"learning_rate": 0.000598019801980198,
"loss": 1.3505,
"step": 6460
},
{
"epoch": 5.4502817462576125,
"grad_norm": 0.3911271393299103,
"learning_rate": 0.000597029702970297,
"loss": 1.3516,
"step": 6470
},
{
"epoch": 5.458705674768058,
"grad_norm": 0.3151527941226959,
"learning_rate": 0.000596039603960396,
"loss": 1.353,
"step": 6480
},
{
"epoch": 5.458705674768058,
"eval_accuracy": 0.7213715986510872,
"eval_loss": 1.2357591390609741,
"eval_runtime": 888.8097,
"eval_samples_per_second": 561.848,
"eval_steps_per_second": 5.202,
"step": 6480
},
{
"epoch": 5.467129603278502,
"grad_norm": 0.32286888360977173,
"learning_rate": 0.000595049504950495,
"loss": 1.3527,
"step": 6490
},
{
"epoch": 5.475553531788947,
"grad_norm": 0.3933228850364685,
"learning_rate": 0.000594059405940594,
"loss": 1.3511,
"step": 6500
},
{
"epoch": 5.483977460299391,
"grad_norm": 0.3246067762374878,
"learning_rate": 0.0005930693069306931,
"loss": 1.3524,
"step": 6510
},
{
"epoch": 5.492401388809835,
"grad_norm": 0.2912397086620331,
"learning_rate": 0.000592079207920792,
"loss": 1.3495,
"step": 6520
},
{
"epoch": 5.50082531732028,
"grad_norm": 0.3058258891105652,
"learning_rate": 0.0005910891089108911,
"loss": 1.3486,
"step": 6530
},
{
"epoch": 5.509249245830724,
"grad_norm": 0.310024231672287,
"learning_rate": 0.00059009900990099,
"loss": 1.3507,
"step": 6540
},
{
"epoch": 5.517673174341169,
"grad_norm": 0.289165198802948,
"learning_rate": 0.0005891089108910891,
"loss": 1.3475,
"step": 6550
},
{
"epoch": 5.5260971028516135,
"grad_norm": 0.324613094329834,
"learning_rate": 0.0005881188118811881,
"loss": 1.3489,
"step": 6560
},
{
"epoch": 5.5345210313620585,
"grad_norm": 0.3530217111110687,
"learning_rate": 0.0005871287128712871,
"loss": 1.3477,
"step": 6570
},
{
"epoch": 5.5345210313620585,
"eval_accuracy": 0.722217175302605,
"eval_loss": 1.2293946743011475,
"eval_runtime": 881.4092,
"eval_samples_per_second": 566.565,
"eval_steps_per_second": 5.246,
"step": 6570
},
{
"epoch": 5.542944959872503,
"grad_norm": 0.3527272045612335,
"learning_rate": 0.0005861386138613861,
"loss": 1.3447,
"step": 6580
},
{
"epoch": 5.551368888382948,
"grad_norm": 0.26519855856895447,
"learning_rate": 0.0005851485148514851,
"loss": 1.346,
"step": 6590
},
{
"epoch": 5.559792816893392,
"grad_norm": 0.29473376274108887,
"learning_rate": 0.0005841584158415841,
"loss": 1.3461,
"step": 6600
},
{
"epoch": 5.568216745403836,
"grad_norm": 0.31212469935417175,
"learning_rate": 0.0005831683168316832,
"loss": 1.3454,
"step": 6610
},
{
"epoch": 5.576640673914281,
"grad_norm": 0.2541083097457886,
"learning_rate": 0.0005821782178217821,
"loss": 1.3451,
"step": 6620
},
{
"epoch": 5.585064602424725,
"grad_norm": 0.28075823187828064,
"learning_rate": 0.0005811881188118812,
"loss": 1.3417,
"step": 6630
},
{
"epoch": 5.59348853093517,
"grad_norm": 0.286945641040802,
"learning_rate": 0.0005801980198019801,
"loss": 1.3439,
"step": 6640
},
{
"epoch": 5.6019124594456144,
"grad_norm": 0.2825601100921631,
"learning_rate": 0.0005792079207920792,
"loss": 1.3447,
"step": 6650
},
{
"epoch": 5.610336387956059,
"grad_norm": 0.3023243844509125,
"learning_rate": 0.0005782178217821782,
"loss": 1.3428,
"step": 6660
},
{
"epoch": 5.610336387956059,
"eval_accuracy": 0.7226627197479346,
"eval_loss": 1.2287484407424927,
"eval_runtime": 893.8585,
"eval_samples_per_second": 558.675,
"eval_steps_per_second": 5.173,
"step": 6660
},
{
"epoch": 5.618760316466504,
"grad_norm": 0.2548897862434387,
"learning_rate": 0.0005772277227722772,
"loss": 1.3441,
"step": 6670
},
{
"epoch": 5.627184244976948,
"grad_norm": 0.28277119994163513,
"learning_rate": 0.0005762376237623762,
"loss": 1.3421,
"step": 6680
},
{
"epoch": 5.635608173487393,
"grad_norm": 0.35963568091392517,
"learning_rate": 0.0005752475247524752,
"loss": 1.3421,
"step": 6690
},
{
"epoch": 5.644032101997837,
"grad_norm": 0.2753046452999115,
"learning_rate": 0.0005742574257425742,
"loss": 1.3449,
"step": 6700
},
{
"epoch": 5.652456030508281,
"grad_norm": 0.31272053718566895,
"learning_rate": 0.0005732673267326733,
"loss": 1.3418,
"step": 6710
},
{
"epoch": 5.660879959018726,
"grad_norm": 0.24427007138729095,
"learning_rate": 0.0005722772277227722,
"loss": 1.3409,
"step": 6720
},
{
"epoch": 5.66930388752917,
"grad_norm": 0.4038189649581909,
"learning_rate": 0.0005712871287128713,
"loss": 1.3387,
"step": 6730
},
{
"epoch": 5.677727816039615,
"grad_norm": 0.30009007453918457,
"learning_rate": 0.0005702970297029702,
"loss": 1.3425,
"step": 6740
},
{
"epoch": 5.68615174455006,
"grad_norm": 0.2813461720943451,
"learning_rate": 0.0005693069306930693,
"loss": 1.3396,
"step": 6750
},
{
"epoch": 5.68615174455006,
"eval_accuracy": 0.7239226758241876,
"eval_loss": 1.2240657806396484,
"eval_runtime": 898.7215,
"eval_samples_per_second": 555.652,
"eval_steps_per_second": 5.145,
"step": 6750
},
{
"epoch": 5.694575673060505,
"grad_norm": 0.4396764039993286,
"learning_rate": 0.0005683168316831683,
"loss": 1.3408,
"step": 6760
},
{
"epoch": 5.702999601570949,
"grad_norm": 0.2992042899131775,
"learning_rate": 0.0005673267326732673,
"loss": 1.3408,
"step": 6770
},
{
"epoch": 5.711423530081394,
"grad_norm": 0.2579440474510193,
"learning_rate": 0.0005663366336633663,
"loss": 1.3369,
"step": 6780
},
{
"epoch": 5.719847458591838,
"grad_norm": 0.32076653838157654,
"learning_rate": 0.0005653465346534653,
"loss": 1.3365,
"step": 6790
},
{
"epoch": 5.728271387102282,
"grad_norm": 0.3180268108844757,
"learning_rate": 0.0005643564356435643,
"loss": 1.339,
"step": 6800
},
{
"epoch": 5.736695315612727,
"grad_norm": 0.27663713693618774,
"learning_rate": 0.0005633663366336634,
"loss": 1.3373,
"step": 6810
},
{
"epoch": 5.745119244123171,
"grad_norm": 0.27103811502456665,
"learning_rate": 0.0005623762376237624,
"loss": 1.3332,
"step": 6820
},
{
"epoch": 5.753543172633616,
"grad_norm": 0.34022676944732666,
"learning_rate": 0.0005613861386138615,
"loss": 1.3373,
"step": 6830
},
{
"epoch": 5.7619671011440605,
"grad_norm": 0.36838725209236145,
"learning_rate": 0.0005603960396039604,
"loss": 1.3384,
"step": 6840
},
{
"epoch": 5.7619671011440605,
"eval_accuracy": 0.7243312842270887,
"eval_loss": 1.221815586090088,
"eval_runtime": 891.7897,
"eval_samples_per_second": 559.971,
"eval_steps_per_second": 5.185,
"step": 6840
},
{
"epoch": 5.770391029654505,
"grad_norm": 0.2968374490737915,
"learning_rate": 0.0005594059405940595,
"loss": 1.3353,
"step": 6850
},
{
"epoch": 5.77881495816495,
"grad_norm": 0.36536258459091187,
"learning_rate": 0.0005584158415841585,
"loss": 1.3331,
"step": 6860
},
{
"epoch": 5.787238886675394,
"grad_norm": 0.2985541522502899,
"learning_rate": 0.0005574257425742575,
"loss": 1.3313,
"step": 6870
},
{
"epoch": 5.795662815185839,
"grad_norm": 0.33506348729133606,
"learning_rate": 0.0005564356435643565,
"loss": 1.3349,
"step": 6880
},
{
"epoch": 5.804086743696283,
"grad_norm": 0.31232866644859314,
"learning_rate": 0.0005554455445544555,
"loss": 1.3335,
"step": 6890
},
{
"epoch": 5.812510672206728,
"grad_norm": 0.27576977014541626,
"learning_rate": 0.0005544554455445545,
"loss": 1.3309,
"step": 6900
},
{
"epoch": 5.820934600717172,
"grad_norm": 0.2526339590549469,
"learning_rate": 0.0005534653465346536,
"loss": 1.3318,
"step": 6910
},
{
"epoch": 5.829358529227616,
"grad_norm": 0.25774866342544556,
"learning_rate": 0.0005524752475247525,
"loss": 1.3329,
"step": 6920
},
{
"epoch": 5.8377824577380615,
"grad_norm": 0.34311917424201965,
"learning_rate": 0.0005514851485148516,
"loss": 1.3334,
"step": 6930
},
{
"epoch": 5.8377824577380615,
"eval_accuracy": 0.7251374384748042,
"eval_loss": 1.216299057006836,
"eval_runtime": 889.6984,
"eval_samples_per_second": 561.287,
"eval_steps_per_second": 5.197,
"step": 6930
},
{
"epoch": 5.846206386248506,
"grad_norm": 0.32087624073028564,
"learning_rate": 0.0005504950495049505,
"loss": 1.3338,
"step": 6940
},
{
"epoch": 5.854630314758951,
"grad_norm": 0.25447556376457214,
"learning_rate": 0.0005495049504950496,
"loss": 1.3315,
"step": 6950
},
{
"epoch": 5.863054243269395,
"grad_norm": 0.285826712846756,
"learning_rate": 0.0005485148514851486,
"loss": 1.3303,
"step": 6960
},
{
"epoch": 5.87147817177984,
"grad_norm": 0.2816094756126404,
"learning_rate": 0.0005475247524752476,
"loss": 1.3308,
"step": 6970
},
{
"epoch": 5.879902100290284,
"grad_norm": 0.30444055795669556,
"learning_rate": 0.0005465346534653466,
"loss": 1.3303,
"step": 6980
},
{
"epoch": 5.888326028800728,
"grad_norm": 0.3512563705444336,
"learning_rate": 0.0005455445544554456,
"loss": 1.3305,
"step": 6990
},
{
"epoch": 5.896749957311173,
"grad_norm": 0.2924775779247284,
"learning_rate": 0.0005445544554455446,
"loss": 1.3307,
"step": 7000
},
{
"epoch": 5.905173885821617,
"grad_norm": 0.3497087359428406,
"learning_rate": 0.0005435643564356437,
"loss": 1.3295,
"step": 7010
},
{
"epoch": 5.913597814332062,
"grad_norm": 0.2714064419269562,
"learning_rate": 0.0005425742574257426,
"loss": 1.329,
"step": 7020
},
{
"epoch": 5.913597814332062,
"eval_accuracy": 0.7261800107692413,
"eval_loss": 1.2115275859832764,
"eval_runtime": 893.0627,
"eval_samples_per_second": 559.172,
"eval_steps_per_second": 5.178,
"step": 7020
},
{
"epoch": 5.922021742842507,
"grad_norm": 0.277203232049942,
"learning_rate": 0.0005415841584158417,
"loss": 1.3269,
"step": 7030
},
{
"epoch": 5.930445671352951,
"grad_norm": 0.3769485354423523,
"learning_rate": 0.0005405940594059406,
"loss": 1.3268,
"step": 7040
},
{
"epoch": 5.938869599863396,
"grad_norm": 0.2526576817035675,
"learning_rate": 0.0005396039603960396,
"loss": 1.3262,
"step": 7050
},
{
"epoch": 5.94729352837384,
"grad_norm": 0.2670144736766815,
"learning_rate": 0.0005386138613861387,
"loss": 1.327,
"step": 7060
},
{
"epoch": 5.955717456884285,
"grad_norm": 0.26662877202033997,
"learning_rate": 0.0005376237623762376,
"loss": 1.3277,
"step": 7070
},
{
"epoch": 5.964141385394729,
"grad_norm": 0.3263689875602722,
"learning_rate": 0.0005366336633663367,
"loss": 1.3271,
"step": 7080
},
{
"epoch": 5.972565313905174,
"grad_norm": 0.26732614636421204,
"learning_rate": 0.0005356435643564356,
"loss": 1.3264,
"step": 7090
},
{
"epoch": 5.980989242415618,
"grad_norm": 0.3332139551639557,
"learning_rate": 0.0005346534653465347,
"loss": 1.3266,
"step": 7100
},
{
"epoch": 5.989413170926063,
"grad_norm": 0.3081839680671692,
"learning_rate": 0.0005336633663366337,
"loss": 1.325,
"step": 7110
},
{
"epoch": 5.989413170926063,
"eval_accuracy": 0.7263082386708871,
"eval_loss": 1.2105002403259277,
"eval_runtime": 893.0055,
"eval_samples_per_second": 559.208,
"eval_steps_per_second": 5.178,
"step": 7110
},
{
"epoch": 5.997837099436508,
"grad_norm": 0.2502419650554657,
"learning_rate": 0.0005326732673267327,
"loss": 1.3263,
"step": 7120
},
{
"epoch": 6.006261027946952,
"grad_norm": 0.2437312752008438,
"learning_rate": 0.0005316831683168317,
"loss": 1.3225,
"step": 7130
},
{
"epoch": 6.014684956457397,
"grad_norm": 0.3372795581817627,
"learning_rate": 0.0005306930693069307,
"loss": 1.3234,
"step": 7140
},
{
"epoch": 6.023108884967841,
"grad_norm": 0.2895912826061249,
"learning_rate": 0.0005297029702970297,
"loss": 1.3252,
"step": 7150
},
{
"epoch": 6.031532813478286,
"grad_norm": 0.28451213240623474,
"learning_rate": 0.0005287128712871288,
"loss": 1.3238,
"step": 7160
},
{
"epoch": 6.03995674198873,
"grad_norm": 0.2496078759431839,
"learning_rate": 0.0005277227722772277,
"loss": 1.323,
"step": 7170
},
{
"epoch": 6.048380670499174,
"grad_norm": 0.26850923895835876,
"learning_rate": 0.0005267326732673268,
"loss": 1.322,
"step": 7180
},
{
"epoch": 6.056804599009619,
"grad_norm": 0.30225685238838196,
"learning_rate": 0.0005257425742574257,
"loss": 1.3212,
"step": 7190
},
{
"epoch": 6.0652285275200635,
"grad_norm": 0.32349905371665955,
"learning_rate": 0.0005247524752475248,
"loss": 1.3219,
"step": 7200
},
{
"epoch": 6.0652285275200635,
"eval_accuracy": 0.727180971273756,
"eval_loss": 1.205489993095398,
"eval_runtime": 890.8938,
"eval_samples_per_second": 560.534,
"eval_steps_per_second": 5.19,
"step": 7200
},
{
"epoch": 6.0736524560305085,
"grad_norm": 0.29943209886550903,
"learning_rate": 0.0005237623762376238,
"loss": 1.3182,
"step": 7210
},
{
"epoch": 6.082076384540953,
"grad_norm": 0.30952343344688416,
"learning_rate": 0.0005227722772277228,
"loss": 1.3194,
"step": 7220
},
{
"epoch": 6.090500313051398,
"grad_norm": 0.3158267140388489,
"learning_rate": 0.0005217821782178218,
"loss": 1.319,
"step": 7230
},
{
"epoch": 6.098924241561842,
"grad_norm": 0.27009105682373047,
"learning_rate": 0.0005207920792079208,
"loss": 1.3212,
"step": 7240
},
{
"epoch": 6.107348170072286,
"grad_norm": 0.2660143971443176,
"learning_rate": 0.0005198019801980198,
"loss": 1.3181,
"step": 7250
},
{
"epoch": 6.115772098582731,
"grad_norm": 0.32289671897888184,
"learning_rate": 0.0005188118811881189,
"loss": 1.3166,
"step": 7260
},
{
"epoch": 6.124196027093175,
"grad_norm": 0.301577627658844,
"learning_rate": 0.0005178217821782178,
"loss": 1.3215,
"step": 7270
},
{
"epoch": 6.13261995560362,
"grad_norm": 0.26539114117622375,
"learning_rate": 0.0005168316831683169,
"loss": 1.3173,
"step": 7280
},
{
"epoch": 6.141043884114064,
"grad_norm": 0.30636703968048096,
"learning_rate": 0.0005158415841584158,
"loss": 1.319,
"step": 7290
},
{
"epoch": 6.141043884114064,
"eval_accuracy": 0.7278776618882268,
"eval_loss": 1.2021031379699707,
"eval_runtime": 893.3533,
"eval_samples_per_second": 558.99,
"eval_steps_per_second": 5.176,
"step": 7290
},
{
"epoch": 6.1494678126245095,
"grad_norm": 0.2906350791454315,
"learning_rate": 0.0005148514851485149,
"loss": 1.3177,
"step": 7300
},
{
"epoch": 6.157891741134954,
"grad_norm": 0.33962422609329224,
"learning_rate": 0.0005138613861386139,
"loss": 1.3173,
"step": 7310
},
{
"epoch": 6.166315669645398,
"grad_norm": 0.29772093892097473,
"learning_rate": 0.0005128712871287129,
"loss": 1.3194,
"step": 7320
},
{
"epoch": 6.174739598155843,
"grad_norm": 0.27262043952941895,
"learning_rate": 0.0005118811881188119,
"loss": 1.3159,
"step": 7330
},
{
"epoch": 6.183163526666287,
"grad_norm": 0.2678314745426178,
"learning_rate": 0.0005108910891089109,
"loss": 1.3167,
"step": 7340
},
{
"epoch": 6.191587455176732,
"grad_norm": 0.3115740716457367,
"learning_rate": 0.0005099009900990099,
"loss": 1.3142,
"step": 7350
},
{
"epoch": 6.200011383687176,
"grad_norm": 0.2983403205871582,
"learning_rate": 0.000508910891089109,
"loss": 1.3158,
"step": 7360
},
{
"epoch": 6.208435312197621,
"grad_norm": 0.2797269821166992,
"learning_rate": 0.0005079207920792079,
"loss": 1.3163,
"step": 7370
},
{
"epoch": 6.216859240708065,
"grad_norm": 0.29581907391548157,
"learning_rate": 0.000506930693069307,
"loss": 1.3156,
"step": 7380
},
{
"epoch": 6.216859240708065,
"eval_accuracy": 0.7285335214596267,
"eval_loss": 1.1984630823135376,
"eval_runtime": 881.1088,
"eval_samples_per_second": 566.759,
"eval_steps_per_second": 5.248,
"step": 7380
},
{
"epoch": 6.2252831692185095,
"grad_norm": 0.2843240797519684,
"learning_rate": 0.0005059405940594059,
"loss": 1.3162,
"step": 7390
},
{
"epoch": 6.233707097728955,
"grad_norm": 0.2662515938282013,
"learning_rate": 0.000504950495049505,
"loss": 1.314,
"step": 7400
},
{
"epoch": 6.242131026239399,
"grad_norm": 0.3370913565158844,
"learning_rate": 0.000503960396039604,
"loss": 1.3136,
"step": 7410
},
{
"epoch": 6.250554954749844,
"grad_norm": 0.29014459252357483,
"learning_rate": 0.000502970297029703,
"loss": 1.3127,
"step": 7420
},
{
"epoch": 6.258978883260288,
"grad_norm": 0.2779816687107086,
"learning_rate": 0.000501980198019802,
"loss": 1.3137,
"step": 7430
},
{
"epoch": 6.267402811770733,
"grad_norm": 0.2942447066307068,
"learning_rate": 0.000500990099009901,
"loss": 1.3138,
"step": 7440
},
{
"epoch": 6.275826740281177,
"grad_norm": 0.3536125719547272,
"learning_rate": 0.0005,
"loss": 1.3135,
"step": 7450
},
{
"epoch": 6.284250668791621,
"grad_norm": 0.29686686396598816,
"learning_rate": 0.0004990099009900991,
"loss": 1.3129,
"step": 7460
},
{
"epoch": 6.292674597302066,
"grad_norm": 0.30590084195137024,
"learning_rate": 0.000498019801980198,
"loss": 1.3114,
"step": 7470
},
{
"epoch": 6.292674597302066,
"eval_accuracy": 0.7293452386458654,
"eval_loss": 1.1951327323913574,
"eval_runtime": 893.3348,
"eval_samples_per_second": 559.002,
"eval_steps_per_second": 5.176,
"step": 7470
},
{
"epoch": 6.3010985258125105,
"grad_norm": 0.2687655985355377,
"learning_rate": 0.0004970297029702971,
"loss": 1.3125,
"step": 7480
},
{
"epoch": 6.3095224543229556,
"grad_norm": 0.31057268381118774,
"learning_rate": 0.000496039603960396,
"loss": 1.3106,
"step": 7490
},
{
"epoch": 6.3179463828334,
"grad_norm": 0.3097970187664032,
"learning_rate": 0.0004950495049504951,
"loss": 1.31,
"step": 7500
},
{
"epoch": 6.326370311343844,
"grad_norm": 0.28469330072402954,
"learning_rate": 0.0004940594059405941,
"loss": 1.3098,
"step": 7510
},
{
"epoch": 6.334794239854289,
"grad_norm": 0.2911768853664398,
"learning_rate": 0.000493069306930693,
"loss": 1.3103,
"step": 7520
},
{
"epoch": 6.343218168364733,
"grad_norm": 0.2990330755710602,
"learning_rate": 0.0004920792079207921,
"loss": 1.3108,
"step": 7530
},
{
"epoch": 6.351642096875178,
"grad_norm": 0.2908383905887604,
"learning_rate": 0.000491089108910891,
"loss": 1.3092,
"step": 7540
},
{
"epoch": 6.360066025385622,
"grad_norm": 0.306233674287796,
"learning_rate": 0.0004900990099009901,
"loss": 1.3107,
"step": 7550
},
{
"epoch": 6.368489953896067,
"grad_norm": 0.2749456465244293,
"learning_rate": 0.0004891089108910892,
"loss": 1.3073,
"step": 7560
},
{
"epoch": 6.368489953896067,
"eval_accuracy": 0.7300212582744398,
"eval_loss": 1.1918327808380127,
"eval_runtime": 886.4778,
"eval_samples_per_second": 563.326,
"eval_steps_per_second": 5.216,
"step": 7560
},
{
"epoch": 6.3769138824065115,
"grad_norm": 0.2799837291240692,
"learning_rate": 0.0004881188118811881,
"loss": 1.3084,
"step": 7570
},
{
"epoch": 6.385337810916956,
"grad_norm": 0.3050614893436432,
"learning_rate": 0.00048712871287128715,
"loss": 1.3082,
"step": 7580
},
{
"epoch": 6.393761739427401,
"grad_norm": 0.2900220453739166,
"learning_rate": 0.00048613861386138615,
"loss": 1.3087,
"step": 7590
},
{
"epoch": 6.402185667937845,
"grad_norm": 0.2592508792877197,
"learning_rate": 0.00048514851485148515,
"loss": 1.3082,
"step": 7600
},
{
"epoch": 6.41060959644829,
"grad_norm": 0.2503323256969452,
"learning_rate": 0.00048415841584158414,
"loss": 1.3066,
"step": 7610
},
{
"epoch": 6.419033524958734,
"grad_norm": 0.30254074931144714,
"learning_rate": 0.00048316831683168314,
"loss": 1.3079,
"step": 7620
},
{
"epoch": 6.427457453469179,
"grad_norm": 0.28869137167930603,
"learning_rate": 0.0004821782178217822,
"loss": 1.3061,
"step": 7630
},
{
"epoch": 6.435881381979623,
"grad_norm": 0.3226109445095062,
"learning_rate": 0.0004811881188118812,
"loss": 1.3051,
"step": 7640
},
{
"epoch": 6.444305310490067,
"grad_norm": 0.2900817096233368,
"learning_rate": 0.0004801980198019802,
"loss": 1.3062,
"step": 7650
},
{
"epoch": 6.444305310490067,
"eval_accuracy": 0.7304169114350704,
"eval_loss": 1.1914669275283813,
"eval_runtime": 888.5325,
"eval_samples_per_second": 562.023,
"eval_steps_per_second": 5.204,
"step": 7650
},
{
"epoch": 6.452729239000512,
"grad_norm": 0.3235354721546173,
"learning_rate": 0.0004792079207920792,
"loss": 1.3074,
"step": 7660
},
{
"epoch": 6.461153167510957,
"grad_norm": 0.26384827494621277,
"learning_rate": 0.0004782178217821782,
"loss": 1.3052,
"step": 7670
},
{
"epoch": 6.469577096021402,
"grad_norm": 0.27176037430763245,
"learning_rate": 0.00047722772277227724,
"loss": 1.3032,
"step": 7680
},
{
"epoch": 6.478001024531846,
"grad_norm": 0.27846911549568176,
"learning_rate": 0.00047623762376237624,
"loss": 1.3038,
"step": 7690
},
{
"epoch": 6.48642495304229,
"grad_norm": 0.32258498668670654,
"learning_rate": 0.00047524752475247524,
"loss": 1.3052,
"step": 7700
},
{
"epoch": 6.494848881552735,
"grad_norm": 0.3000924587249756,
"learning_rate": 0.00047425742574257423,
"loss": 1.3046,
"step": 7710
},
{
"epoch": 6.503272810063179,
"grad_norm": 0.22748370468616486,
"learning_rate": 0.00047326732673267323,
"loss": 1.3054,
"step": 7720
},
{
"epoch": 6.511696738573624,
"grad_norm": 0.3552054464817047,
"learning_rate": 0.0004722772277227723,
"loss": 1.3026,
"step": 7730
},
{
"epoch": 6.520120667084068,
"grad_norm": 0.2629605531692505,
"learning_rate": 0.0004712871287128713,
"loss": 1.3021,
"step": 7740
},
{
"epoch": 6.520120667084068,
"eval_accuracy": 0.7311149976881265,
"eval_loss": 1.1877076625823975,
"eval_runtime": 883.1573,
"eval_samples_per_second": 565.444,
"eval_steps_per_second": 5.236,
"step": 7740
},
{
"epoch": 6.528544595594513,
"grad_norm": 0.31692177057266235,
"learning_rate": 0.0004702970297029703,
"loss": 1.3048,
"step": 7750
},
{
"epoch": 6.5369685241049575,
"grad_norm": 0.3689730167388916,
"learning_rate": 0.0004693069306930693,
"loss": 1.3016,
"step": 7760
},
{
"epoch": 6.545392452615403,
"grad_norm": 0.2619648277759552,
"learning_rate": 0.00046831683168316833,
"loss": 1.3018,
"step": 7770
},
{
"epoch": 6.553816381125847,
"grad_norm": 0.29713907837867737,
"learning_rate": 0.0004673267326732674,
"loss": 1.3007,
"step": 7780
},
{
"epoch": 6.562240309636291,
"grad_norm": 0.3426944315433502,
"learning_rate": 0.0004663366336633664,
"loss": 1.302,
"step": 7790
},
{
"epoch": 6.570664238146736,
"grad_norm": 0.30286312103271484,
"learning_rate": 0.0004653465346534654,
"loss": 1.3024,
"step": 7800
},
{
"epoch": 6.57908816665718,
"grad_norm": 0.2533584237098694,
"learning_rate": 0.0004643564356435644,
"loss": 1.2991,
"step": 7810
},
{
"epoch": 6.587512095167625,
"grad_norm": 0.23465867340564728,
"learning_rate": 0.0004633663366336634,
"loss": 1.3007,
"step": 7820
},
{
"epoch": 6.595936023678069,
"grad_norm": 0.31729191541671753,
"learning_rate": 0.00046237623762376243,
"loss": 1.3,
"step": 7830
},
{
"epoch": 6.595936023678069,
"eval_accuracy": 0.7318502985148011,
"eval_loss": 1.1818432807922363,
"eval_runtime": 891.13,
"eval_samples_per_second": 560.385,
"eval_steps_per_second": 5.189,
"step": 7830
},
{
"epoch": 6.6043599521885135,
"grad_norm": 0.26264631748199463,
"learning_rate": 0.00046138613861386143,
"loss": 1.3003,
"step": 7840
},
{
"epoch": 6.6127838806989585,
"grad_norm": 0.26062801480293274,
"learning_rate": 0.0004603960396039604,
"loss": 1.2977,
"step": 7850
},
{
"epoch": 6.621207809209403,
"grad_norm": 0.2755686640739441,
"learning_rate": 0.0004594059405940594,
"loss": 1.2979,
"step": 7860
},
{
"epoch": 6.629631737719848,
"grad_norm": 0.32309025526046753,
"learning_rate": 0.0004584158415841584,
"loss": 1.297,
"step": 7870
},
{
"epoch": 6.638055666230292,
"grad_norm": 0.2709057927131653,
"learning_rate": 0.0004574257425742575,
"loss": 1.2999,
"step": 7880
},
{
"epoch": 6.646479594740737,
"grad_norm": 0.2785532772541046,
"learning_rate": 0.00045643564356435647,
"loss": 1.2959,
"step": 7890
},
{
"epoch": 6.654903523251181,
"grad_norm": 0.2822953164577484,
"learning_rate": 0.00045544554455445547,
"loss": 1.2984,
"step": 7900
},
{
"epoch": 6.663327451761625,
"grad_norm": 0.2704668641090393,
"learning_rate": 0.00045445544554455447,
"loss": 1.2956,
"step": 7910
},
{
"epoch": 6.67175138027207,
"grad_norm": 0.3228791058063507,
"learning_rate": 0.00045346534653465347,
"loss": 1.2984,
"step": 7920
},
{
"epoch": 6.67175138027207,
"eval_accuracy": 0.7318941432804211,
"eval_loss": 1.184158205986023,
"eval_runtime": 883.7641,
"eval_samples_per_second": 565.056,
"eval_steps_per_second": 5.232,
"step": 7920
},
{
"epoch": 6.680175308782514,
"grad_norm": 0.2641367018222809,
"learning_rate": 0.0004524752475247525,
"loss": 1.299,
"step": 7930
},
{
"epoch": 6.6885992372929595,
"grad_norm": 0.28555190563201904,
"learning_rate": 0.0004514851485148515,
"loss": 1.2985,
"step": 7940
},
{
"epoch": 6.697023165803404,
"grad_norm": 0.2615039050579071,
"learning_rate": 0.0004504950495049505,
"loss": 1.294,
"step": 7950
},
{
"epoch": 6.705447094313849,
"grad_norm": 0.25349870324134827,
"learning_rate": 0.0004495049504950495,
"loss": 1.295,
"step": 7960
},
{
"epoch": 6.713871022824293,
"grad_norm": 0.3342011272907257,
"learning_rate": 0.0004485148514851485,
"loss": 1.2963,
"step": 7970
},
{
"epoch": 6.722294951334737,
"grad_norm": 0.2608206570148468,
"learning_rate": 0.00044752475247524756,
"loss": 1.2957,
"step": 7980
},
{
"epoch": 6.730718879845182,
"grad_norm": 0.27476873993873596,
"learning_rate": 0.00044653465346534656,
"loss": 1.2939,
"step": 7990
},
{
"epoch": 6.739142808355626,
"grad_norm": 0.3241907060146332,
"learning_rate": 0.00044554455445544556,
"loss": 1.2965,
"step": 8000
},
{
"epoch": 6.747566736866071,
"grad_norm": 0.3494180142879486,
"learning_rate": 0.00044455445544554456,
"loss": 1.2962,
"step": 8010
},
{
"epoch": 6.747566736866071,
"eval_accuracy": 0.7322386411238602,
"eval_loss": 1.182516098022461,
"eval_runtime": 889.7545,
"eval_samples_per_second": 561.251,
"eval_steps_per_second": 5.197,
"step": 8010
},
{
"epoch": 6.755990665376515,
"grad_norm": 0.2616145610809326,
"learning_rate": 0.00044356435643564356,
"loss": 1.2958,
"step": 8020
},
{
"epoch": 6.7644145938869595,
"grad_norm": 0.29238995909690857,
"learning_rate": 0.0004425742574257426,
"loss": 1.293,
"step": 8030
},
{
"epoch": 6.772838522397405,
"grad_norm": 0.24060964584350586,
"learning_rate": 0.0004415841584158416,
"loss": 1.2948,
"step": 8040
},
{
"epoch": 6.781262450907849,
"grad_norm": 0.29363489151000977,
"learning_rate": 0.0004405940594059406,
"loss": 1.2928,
"step": 8050
},
{
"epoch": 6.789686379418294,
"grad_norm": 0.3320622444152832,
"learning_rate": 0.0004396039603960396,
"loss": 1.2925,
"step": 8060
},
{
"epoch": 6.798110307928738,
"grad_norm": 0.23857133090496063,
"learning_rate": 0.0004386138613861386,
"loss": 1.2943,
"step": 8070
},
{
"epoch": 6.806534236439183,
"grad_norm": 0.24713198840618134,
"learning_rate": 0.00043762376237623765,
"loss": 1.2938,
"step": 8080
},
{
"epoch": 6.814958164949627,
"grad_norm": 0.26270854473114014,
"learning_rate": 0.00043663366336633665,
"loss": 1.2916,
"step": 8090
},
{
"epoch": 6.823382093460072,
"grad_norm": 0.2450101524591446,
"learning_rate": 0.00043564356435643565,
"loss": 1.2931,
"step": 8100
},
{
"epoch": 6.823382093460072,
"eval_accuracy": 0.7332625526391774,
"eval_loss": 1.1757333278656006,
"eval_runtime": 889.0249,
"eval_samples_per_second": 561.712,
"eval_steps_per_second": 5.201,
"step": 8100
},
{
"epoch": 6.831806021970516,
"grad_norm": 0.27462685108184814,
"learning_rate": 0.00043465346534653465,
"loss": 1.2923,
"step": 8110
},
{
"epoch": 6.8402299504809605,
"grad_norm": 0.2707907259464264,
"learning_rate": 0.00043366336633663365,
"loss": 1.2925,
"step": 8120
},
{
"epoch": 6.8486538789914055,
"grad_norm": 0.24748317897319794,
"learning_rate": 0.0004326732673267327,
"loss": 1.2929,
"step": 8130
},
{
"epoch": 6.85707780750185,
"grad_norm": 0.226767897605896,
"learning_rate": 0.0004316831683168317,
"loss": 1.2883,
"step": 8140
},
{
"epoch": 6.865501736012295,
"grad_norm": 0.24889105558395386,
"learning_rate": 0.0004306930693069307,
"loss": 1.2893,
"step": 8150
},
{
"epoch": 6.873925664522739,
"grad_norm": 0.26075902581214905,
"learning_rate": 0.0004297029702970297,
"loss": 1.2893,
"step": 8160
},
{
"epoch": 6.882349593033183,
"grad_norm": 0.26210734248161316,
"learning_rate": 0.0004287128712871287,
"loss": 1.2868,
"step": 8170
},
{
"epoch": 6.890773521543628,
"grad_norm": 0.2559298872947693,
"learning_rate": 0.00042772277227722774,
"loss": 1.2886,
"step": 8180
},
{
"epoch": 6.899197450054072,
"grad_norm": 0.2503817081451416,
"learning_rate": 0.00042673267326732674,
"loss": 1.2883,
"step": 8190
},
{
"epoch": 6.899197450054072,
"eval_accuracy": 0.7335132915044345,
"eval_loss": 1.1744158267974854,
"eval_runtime": 885.5636,
"eval_samples_per_second": 563.908,
"eval_steps_per_second": 5.222,
"step": 8190
},
{
"epoch": 6.907621378564517,
"grad_norm": 0.24540117383003235,
"learning_rate": 0.00042574257425742574,
"loss": 1.2893,
"step": 8200
},
{
"epoch": 6.9160453070749615,
"grad_norm": 0.3089258670806885,
"learning_rate": 0.00042475247524752474,
"loss": 1.2896,
"step": 8210
},
{
"epoch": 6.9244692355854065,
"grad_norm": 0.26888999342918396,
"learning_rate": 0.00042376237623762374,
"loss": 1.2895,
"step": 8220
},
{
"epoch": 6.932893164095851,
"grad_norm": 0.24743571877479553,
"learning_rate": 0.0004227722772277228,
"loss": 1.2884,
"step": 8230
},
{
"epoch": 6.941317092606295,
"grad_norm": 0.24364733695983887,
"learning_rate": 0.0004217821782178218,
"loss": 1.2879,
"step": 8240
},
{
"epoch": 6.94974102111674,
"grad_norm": 0.2963743507862091,
"learning_rate": 0.0004207920792079208,
"loss": 1.2878,
"step": 8250
},
{
"epoch": 6.958164949627184,
"grad_norm": 0.2444639950990677,
"learning_rate": 0.0004198019801980198,
"loss": 1.2871,
"step": 8260
},
{
"epoch": 6.966588878137629,
"grad_norm": 0.27140820026397705,
"learning_rate": 0.0004188118811881188,
"loss": 1.2878,
"step": 8270
},
{
"epoch": 6.975012806648073,
"grad_norm": 0.2628765404224396,
"learning_rate": 0.00041782178217821784,
"loss": 1.2873,
"step": 8280
},
{
"epoch": 6.975012806648073,
"eval_accuracy": 0.734204579286565,
"eval_loss": 1.171156644821167,
"eval_runtime": 888.1172,
"eval_samples_per_second": 562.286,
"eval_steps_per_second": 5.207,
"step": 8280
},
{
"epoch": 6.983436735158518,
"grad_norm": 0.2539413869380951,
"learning_rate": 0.00041683168316831683,
"loss": 1.2874,
"step": 8290
},
{
"epoch": 6.991860663668962,
"grad_norm": 0.29522642493247986,
"learning_rate": 0.00041584158415841583,
"loss": 1.2859,
"step": 8300
},
{
"epoch": 7.000284592179407,
"grad_norm": 0.29553958773612976,
"learning_rate": 0.00041485148514851483,
"loss": 1.2878,
"step": 8310
},
{
"epoch": 7.008708520689852,
"grad_norm": 0.3111182153224945,
"learning_rate": 0.00041386138613861383,
"loss": 1.2874,
"step": 8320
},
{
"epoch": 7.017132449200296,
"grad_norm": 0.33146336674690247,
"learning_rate": 0.0004128712871287129,
"loss": 1.287,
"step": 8330
},
{
"epoch": 7.025556377710741,
"grad_norm": 0.27456361055374146,
"learning_rate": 0.0004118811881188119,
"loss": 1.2858,
"step": 8340
},
{
"epoch": 7.033980306221185,
"grad_norm": 0.29216212034225464,
"learning_rate": 0.0004108910891089109,
"loss": 1.2838,
"step": 8350
},
{
"epoch": 7.042404234731629,
"grad_norm": 0.24966631829738617,
"learning_rate": 0.0004099009900990099,
"loss": 1.2857,
"step": 8360
},
{
"epoch": 7.050828163242074,
"grad_norm": 0.2910294234752655,
"learning_rate": 0.0004089108910891089,
"loss": 1.2858,
"step": 8370
},
{
"epoch": 7.050828163242074,
"eval_accuracy": 0.7346228547150983,
"eval_loss": 1.169946551322937,
"eval_runtime": 890.9908,
"eval_samples_per_second": 560.473,
"eval_steps_per_second": 5.19,
"step": 8370
},
{
"epoch": 7.059252091752518,
"grad_norm": 0.26337358355522156,
"learning_rate": 0.0004079207920792079,
"loss": 1.2842,
"step": 8380
},
{
"epoch": 7.067676020262963,
"grad_norm": 0.2426845133304596,
"learning_rate": 0.0004069306930693069,
"loss": 1.2836,
"step": 8390
},
{
"epoch": 7.0760999487734075,
"grad_norm": 0.2740408778190613,
"learning_rate": 0.000405940594059406,
"loss": 1.2842,
"step": 8400
},
{
"epoch": 7.084523877283853,
"grad_norm": 0.27966201305389404,
"learning_rate": 0.000404950495049505,
"loss": 1.2841,
"step": 8410
},
{
"epoch": 7.092947805794297,
"grad_norm": 0.3083817660808563,
"learning_rate": 0.00040396039603960397,
"loss": 1.2823,
"step": 8420
},
{
"epoch": 7.101371734304741,
"grad_norm": 0.30730104446411133,
"learning_rate": 0.000402970297029703,
"loss": 1.2845,
"step": 8430
},
{
"epoch": 7.109795662815186,
"grad_norm": 0.2973144054412842,
"learning_rate": 0.000401980198019802,
"loss": 1.2814,
"step": 8440
},
{
"epoch": 7.11821959132563,
"grad_norm": 0.2775426208972931,
"learning_rate": 0.000400990099009901,
"loss": 1.2823,
"step": 8450
},
{
"epoch": 7.126643519836075,
"grad_norm": 0.2734345495700836,
"learning_rate": 0.0004,
"loss": 1.2819,
"step": 8460
},
{
"epoch": 7.126643519836075,
"eval_accuracy": 0.735104089750221,
"eval_loss": 1.1682698726654053,
"eval_runtime": 886.7497,
"eval_samples_per_second": 563.153,
"eval_steps_per_second": 5.215,
"step": 8460
},
{
"epoch": 7.135067448346519,
"grad_norm": 0.27912047505378723,
"learning_rate": 0.000399009900990099,
"loss": 1.2826,
"step": 8470
},
{
"epoch": 7.143491376856964,
"grad_norm": 0.3084285855293274,
"learning_rate": 0.00039801980198019807,
"loss": 1.2811,
"step": 8480
},
{
"epoch": 7.1519153053674085,
"grad_norm": 0.30194783210754395,
"learning_rate": 0.00039702970297029707,
"loss": 1.2828,
"step": 8490
},
{
"epoch": 7.160339233877853,
"grad_norm": 0.25307685136795044,
"learning_rate": 0.00039603960396039607,
"loss": 1.2791,
"step": 8500
},
{
"epoch": 7.168763162388298,
"grad_norm": 0.25018778443336487,
"learning_rate": 0.00039504950495049506,
"loss": 1.2796,
"step": 8510
},
{
"epoch": 7.177187090898742,
"grad_norm": 0.2541010081768036,
"learning_rate": 0.00039405940594059406,
"loss": 1.2812,
"step": 8520
},
{
"epoch": 7.185611019409187,
"grad_norm": 0.29745373129844666,
"learning_rate": 0.0003930693069306931,
"loss": 1.2828,
"step": 8530
},
{
"epoch": 7.194034947919631,
"grad_norm": 0.2740705907344818,
"learning_rate": 0.0003920792079207921,
"loss": 1.2812,
"step": 8540
},
{
"epoch": 7.202458876430076,
"grad_norm": 0.23998434841632843,
"learning_rate": 0.0003910891089108911,
"loss": 1.2781,
"step": 8550
},
{
"epoch": 7.202458876430076,
"eval_accuracy": 0.7354429371546514,
"eval_loss": 1.1649537086486816,
"eval_runtime": 891.9041,
"eval_samples_per_second": 559.899,
"eval_steps_per_second": 5.184,
"step": 8550
},
{
"epoch": 7.21088280494052,
"grad_norm": 0.2691722512245178,
"learning_rate": 0.0003900990099009901,
"loss": 1.2785,
"step": 8560
},
{
"epoch": 7.219306733450964,
"grad_norm": 0.28188225626945496,
"learning_rate": 0.0003891089108910891,
"loss": 1.2807,
"step": 8570
},
{
"epoch": 7.2277306619614095,
"grad_norm": 0.3311617970466614,
"learning_rate": 0.00038811881188118816,
"loss": 1.2809,
"step": 8580
},
{
"epoch": 7.236154590471854,
"grad_norm": 0.2717738747596741,
"learning_rate": 0.00038712871287128716,
"loss": 1.278,
"step": 8590
},
{
"epoch": 7.244578518982299,
"grad_norm": 0.27171820402145386,
"learning_rate": 0.00038613861386138616,
"loss": 1.2803,
"step": 8600
},
{
"epoch": 7.253002447492743,
"grad_norm": 0.249137282371521,
"learning_rate": 0.00038514851485148515,
"loss": 1.277,
"step": 8610
},
{
"epoch": 7.261426376003188,
"grad_norm": 0.26939263939857483,
"learning_rate": 0.00038415841584158415,
"loss": 1.2773,
"step": 8620
},
{
"epoch": 7.269850304513632,
"grad_norm": 0.3177802860736847,
"learning_rate": 0.0003831683168316832,
"loss": 1.2763,
"step": 8630
},
{
"epoch": 7.278274233024076,
"grad_norm": 0.2421504557132721,
"learning_rate": 0.0003821782178217822,
"loss": 1.2771,
"step": 8640
},
{
"epoch": 7.278274233024076,
"eval_accuracy": 0.7357238880776348,
"eval_loss": 1.1646403074264526,
"eval_runtime": 878.5966,
"eval_samples_per_second": 568.379,
"eval_steps_per_second": 5.263,
"step": 8640
},
{
"epoch": 7.286698161534521,
"grad_norm": 0.28808215260505676,
"learning_rate": 0.0003811881188118812,
"loss": 1.2744,
"step": 8650
},
{
"epoch": 7.295122090044965,
"grad_norm": 0.26363667845726013,
"learning_rate": 0.0003801980198019802,
"loss": 1.2788,
"step": 8660
},
{
"epoch": 7.30354601855541,
"grad_norm": 0.35491064190864563,
"learning_rate": 0.0003792079207920792,
"loss": 1.2792,
"step": 8670
},
{
"epoch": 7.311969947065855,
"grad_norm": 0.3273920714855194,
"learning_rate": 0.00037821782178217825,
"loss": 1.278,
"step": 8680
},
{
"epoch": 7.320393875576299,
"grad_norm": 0.28319239616394043,
"learning_rate": 0.00037722772277227725,
"loss": 1.2762,
"step": 8690
},
{
"epoch": 7.328817804086744,
"grad_norm": 0.28414586186408997,
"learning_rate": 0.00037623762376237625,
"loss": 1.2769,
"step": 8700
},
{
"epoch": 7.337241732597188,
"grad_norm": 0.25393033027648926,
"learning_rate": 0.00037524752475247524,
"loss": 1.2742,
"step": 8710
},
{
"epoch": 7.345665661107633,
"grad_norm": 0.25634288787841797,
"learning_rate": 0.00037425742574257424,
"loss": 1.2753,
"step": 8720
},
{
"epoch": 7.354089589618077,
"grad_norm": 0.2355813831090927,
"learning_rate": 0.0003732673267326733,
"loss": 1.2749,
"step": 8730
},
{
"epoch": 7.354089589618077,
"eval_accuracy": 0.7361996522899728,
"eval_loss": 1.160847544670105,
"eval_runtime": 889.4544,
"eval_samples_per_second": 561.441,
"eval_steps_per_second": 5.199,
"step": 8730
},
{
"epoch": 7.362513518128522,
"grad_norm": 0.24002189934253693,
"learning_rate": 0.0003722772277227723,
"loss": 1.2751,
"step": 8740
},
{
"epoch": 7.370937446638966,
"grad_norm": 0.2806450128555298,
"learning_rate": 0.0003712871287128713,
"loss": 1.275,
"step": 8750
},
{
"epoch": 7.3793613751494105,
"grad_norm": 0.24552834033966064,
"learning_rate": 0.0003702970297029703,
"loss": 1.2753,
"step": 8760
},
{
"epoch": 7.3877853036598555,
"grad_norm": 0.24814461171627045,
"learning_rate": 0.0003693069306930693,
"loss": 1.276,
"step": 8770
},
{
"epoch": 7.3962092321703,
"grad_norm": 0.26086533069610596,
"learning_rate": 0.00036831683168316834,
"loss": 1.2744,
"step": 8780
},
{
"epoch": 7.404633160680745,
"grad_norm": 0.2854679822921753,
"learning_rate": 0.00036732673267326734,
"loss": 1.2739,
"step": 8790
},
{
"epoch": 7.413057089191189,
"grad_norm": 0.24847003817558289,
"learning_rate": 0.00036633663366336634,
"loss": 1.2731,
"step": 8800
},
{
"epoch": 7.421481017701634,
"grad_norm": 0.3230905532836914,
"learning_rate": 0.00036534653465346533,
"loss": 1.2732,
"step": 8810
},
{
"epoch": 7.429904946212078,
"grad_norm": 0.30264076590538025,
"learning_rate": 0.00036435643564356433,
"loss": 1.273,
"step": 8820
},
{
"epoch": 7.429904946212078,
"eval_accuracy": 0.7366944357714759,
"eval_loss": 1.1585748195648193,
"eval_runtime": 884.7129,
"eval_samples_per_second": 564.45,
"eval_steps_per_second": 5.227,
"step": 8820
},
{
"epoch": 7.438328874722522,
"grad_norm": 0.25705888867378235,
"learning_rate": 0.0003633663366336634,
"loss": 1.2738,
"step": 8830
},
{
"epoch": 7.446752803232967,
"grad_norm": 0.2455236166715622,
"learning_rate": 0.0003623762376237624,
"loss": 1.2727,
"step": 8840
},
{
"epoch": 7.4551767317434114,
"grad_norm": 0.2877678871154785,
"learning_rate": 0.0003613861386138614,
"loss": 1.2733,
"step": 8850
},
{
"epoch": 7.4636006602538565,
"grad_norm": 0.2644253969192505,
"learning_rate": 0.0003603960396039604,
"loss": 1.2711,
"step": 8860
},
{
"epoch": 7.472024588764301,
"grad_norm": 0.25103089213371277,
"learning_rate": 0.0003594059405940594,
"loss": 1.2727,
"step": 8870
},
{
"epoch": 7.480448517274746,
"grad_norm": 0.28732746839523315,
"learning_rate": 0.00035841584158415843,
"loss": 1.2729,
"step": 8880
},
{
"epoch": 7.48887244578519,
"grad_norm": 0.3096875846385956,
"learning_rate": 0.00035742574257425743,
"loss": 1.2733,
"step": 8890
},
{
"epoch": 7.497296374295634,
"grad_norm": 0.27695363759994507,
"learning_rate": 0.0003564356435643564,
"loss": 1.2719,
"step": 8900
},
{
"epoch": 7.505720302806079,
"grad_norm": 0.26089048385620117,
"learning_rate": 0.0003554455445544554,
"loss": 1.2718,
"step": 8910
},
{
"epoch": 7.505720302806079,
"eval_accuracy": 0.7372118632602084,
"eval_loss": 1.1557950973510742,
"eval_runtime": 890.5411,
"eval_samples_per_second": 560.756,
"eval_steps_per_second": 5.192,
"step": 8910
},
{
"epoch": 7.514144231316523,
"grad_norm": 0.24578547477722168,
"learning_rate": 0.0003544554455445544,
"loss": 1.2723,
"step": 8920
},
{
"epoch": 7.522568159826968,
"grad_norm": 0.2624136209487915,
"learning_rate": 0.0003534653465346535,
"loss": 1.2708,
"step": 8930
},
{
"epoch": 7.530992088337412,
"grad_norm": 0.25748109817504883,
"learning_rate": 0.0003524752475247525,
"loss": 1.2708,
"step": 8940
},
{
"epoch": 7.5394160168478574,
"grad_norm": 0.28079208731651306,
"learning_rate": 0.00035148514851485147,
"loss": 1.2727,
"step": 8950
},
{
"epoch": 7.547839945358302,
"grad_norm": 0.2706407904624939,
"learning_rate": 0.00035049504950495047,
"loss": 1.2712,
"step": 8960
},
{
"epoch": 7.556263873868746,
"grad_norm": 0.27032172679901123,
"learning_rate": 0.00034950495049504947,
"loss": 1.2673,
"step": 8970
},
{
"epoch": 7.564687802379191,
"grad_norm": 0.24915465712547302,
"learning_rate": 0.0003485148514851485,
"loss": 1.2682,
"step": 8980
},
{
"epoch": 7.573111730889635,
"grad_norm": 0.24191108345985413,
"learning_rate": 0.0003475247524752475,
"loss": 1.2719,
"step": 8990
},
{
"epoch": 7.58153565940008,
"grad_norm": 0.2806965112686157,
"learning_rate": 0.0003465346534653465,
"loss": 1.2681,
"step": 9000
},
{
"epoch": 7.58153565940008,
"eval_accuracy": 0.7375367942915361,
"eval_loss": 1.1551363468170166,
"eval_runtime": 876.3936,
"eval_samples_per_second": 569.808,
"eval_steps_per_second": 5.276,
"step": 9000
},
{
"epoch": 7.589959587910524,
"grad_norm": 0.2909415364265442,
"learning_rate": 0.0003455445544554455,
"loss": 1.2687,
"step": 9010
},
{
"epoch": 7.598383516420968,
"grad_norm": 0.30222398042678833,
"learning_rate": 0.0003445544554455445,
"loss": 1.2684,
"step": 9020
},
{
"epoch": 7.606807444931413,
"grad_norm": 0.25246381759643555,
"learning_rate": 0.0003435643564356436,
"loss": 1.2689,
"step": 9030
},
{
"epoch": 7.6152313734418575,
"grad_norm": 0.25202953815460205,
"learning_rate": 0.0003425742574257426,
"loss": 1.2689,
"step": 9040
},
{
"epoch": 7.623655301952303,
"grad_norm": 0.2351432740688324,
"learning_rate": 0.0003415841584158416,
"loss": 1.2655,
"step": 9050
},
{
"epoch": 7.632079230462747,
"grad_norm": 0.26545044779777527,
"learning_rate": 0.0003405940594059406,
"loss": 1.2659,
"step": 9060
},
{
"epoch": 7.640503158973192,
"grad_norm": 0.248436838388443,
"learning_rate": 0.0003396039603960396,
"loss": 1.2677,
"step": 9070
},
{
"epoch": 7.648927087483636,
"grad_norm": 0.3021203279495239,
"learning_rate": 0.00033861386138613867,
"loss": 1.2692,
"step": 9080
},
{
"epoch": 7.657351015994081,
"grad_norm": 0.27577024698257446,
"learning_rate": 0.00033762376237623766,
"loss": 1.2672,
"step": 9090
},
{
"epoch": 7.657351015994081,
"eval_accuracy": 0.7378275299930978,
"eval_loss": 1.1522574424743652,
"eval_runtime": 891.8663,
"eval_samples_per_second": 559.923,
"eval_steps_per_second": 5.185,
"step": 9090
},
{
"epoch": 7.665774944504525,
"grad_norm": 0.2087612897157669,
"learning_rate": 0.00033663366336633666,
"loss": 1.2655,
"step": 9100
},
{
"epoch": 7.674198873014969,
"grad_norm": 0.24880866706371307,
"learning_rate": 0.00033564356435643566,
"loss": 1.2677,
"step": 9110
},
{
"epoch": 7.682622801525414,
"grad_norm": 0.26335397362709045,
"learning_rate": 0.00033465346534653466,
"loss": 1.2647,
"step": 9120
},
{
"epoch": 7.6910467300358585,
"grad_norm": 0.25413015484809875,
"learning_rate": 0.0003336633663366337,
"loss": 1.265,
"step": 9130
},
{
"epoch": 7.6994706585463035,
"grad_norm": 0.3119896650314331,
"learning_rate": 0.0003326732673267327,
"loss": 1.2674,
"step": 9140
},
{
"epoch": 7.707894587056748,
"grad_norm": 0.2269907146692276,
"learning_rate": 0.0003316831683168317,
"loss": 1.2647,
"step": 9150
},
{
"epoch": 7.716318515567192,
"grad_norm": 0.31745684146881104,
"learning_rate": 0.0003306930693069307,
"loss": 1.2668,
"step": 9160
},
{
"epoch": 7.724742444077637,
"grad_norm": 0.28096485137939453,
"learning_rate": 0.0003297029702970297,
"loss": 1.2658,
"step": 9170
},
{
"epoch": 7.733166372588081,
"grad_norm": 0.26646697521209717,
"learning_rate": 0.00032871287128712876,
"loss": 1.2664,
"step": 9180
},
{
"epoch": 7.733166372588081,
"eval_accuracy": 0.7381772885380696,
"eval_loss": 1.151962161064148,
"eval_runtime": 889.9446,
"eval_samples_per_second": 561.132,
"eval_steps_per_second": 5.196,
"step": 9180
},
{
"epoch": 7.741590301098526,
"grad_norm": 0.24463273584842682,
"learning_rate": 0.00032772277227722775,
"loss": 1.2663,
"step": 9190
},
{
"epoch": 7.75001422960897,
"grad_norm": 0.23978425562381744,
"learning_rate": 0.00032673267326732675,
"loss": 1.2634,
"step": 9200
},
{
"epoch": 7.758438158119414,
"grad_norm": 0.25662901997566223,
"learning_rate": 0.00032574257425742575,
"loss": 1.2651,
"step": 9210
},
{
"epoch": 7.766862086629859,
"grad_norm": 0.2697198688983917,
"learning_rate": 0.00032475247524752475,
"loss": 1.2628,
"step": 9220
},
{
"epoch": 7.775286015140304,
"grad_norm": 0.2753835618495941,
"learning_rate": 0.0003237623762376238,
"loss": 1.2632,
"step": 9230
},
{
"epoch": 7.783709943650749,
"grad_norm": 0.23303931951522827,
"learning_rate": 0.0003227722772277228,
"loss": 1.2625,
"step": 9240
},
{
"epoch": 7.792133872161193,
"grad_norm": 0.26077255606651306,
"learning_rate": 0.0003217821782178218,
"loss": 1.2648,
"step": 9250
},
{
"epoch": 7.800557800671638,
"grad_norm": 0.25494781136512756,
"learning_rate": 0.0003207920792079208,
"loss": 1.2648,
"step": 9260
},
{
"epoch": 7.808981729182082,
"grad_norm": 0.2447885125875473,
"learning_rate": 0.0003198019801980198,
"loss": 1.2645,
"step": 9270
},
{
"epoch": 7.808981729182082,
"eval_accuracy": 0.7385748699480129,
"eval_loss": 1.1492513418197632,
"eval_runtime": 885.3604,
"eval_samples_per_second": 564.037,
"eval_steps_per_second": 5.223,
"step": 9270
},
{
"epoch": 7.817405657692527,
"grad_norm": 0.23961922526359558,
"learning_rate": 0.00031881188118811885,
"loss": 1.2631,
"step": 9280
},
{
"epoch": 7.825829586202971,
"grad_norm": 0.2850695252418518,
"learning_rate": 0.00031782178217821784,
"loss": 1.2636,
"step": 9290
},
{
"epoch": 7.834253514713415,
"grad_norm": 0.257962167263031,
"learning_rate": 0.00031683168316831684,
"loss": 1.2647,
"step": 9300
},
{
"epoch": 7.84267744322386,
"grad_norm": 0.28995752334594727,
"learning_rate": 0.00031584158415841584,
"loss": 1.2613,
"step": 9310
},
{
"epoch": 7.851101371734305,
"grad_norm": 0.23544956743717194,
"learning_rate": 0.00031485148514851484,
"loss": 1.261,
"step": 9320
},
{
"epoch": 7.85952530024475,
"grad_norm": 0.27855780720710754,
"learning_rate": 0.0003138613861386139,
"loss": 1.2615,
"step": 9330
},
{
"epoch": 7.867949228755194,
"grad_norm": 0.2668914198875427,
"learning_rate": 0.0003128712871287129,
"loss": 1.2629,
"step": 9340
},
{
"epoch": 7.876373157265638,
"grad_norm": 0.2561187446117401,
"learning_rate": 0.0003118811881188119,
"loss": 1.2614,
"step": 9350
},
{
"epoch": 7.884797085776083,
"grad_norm": 0.23943807184696198,
"learning_rate": 0.0003108910891089109,
"loss": 1.2591,
"step": 9360
},
{
"epoch": 7.884797085776083,
"eval_accuracy": 0.7389714933005799,
"eval_loss": 1.1477636098861694,
"eval_runtime": 884.2901,
"eval_samples_per_second": 564.72,
"eval_steps_per_second": 5.229,
"step": 9360
},
{
"epoch": 7.893221014286527,
"grad_norm": 0.3144013583660126,
"learning_rate": 0.0003099009900990099,
"loss": 1.2606,
"step": 9370
},
{
"epoch": 7.901644942796972,
"grad_norm": 0.30694615840911865,
"learning_rate": 0.00030891089108910894,
"loss": 1.2607,
"step": 9380
},
{
"epoch": 7.910068871307416,
"grad_norm": 0.28703033924102783,
"learning_rate": 0.00030792079207920793,
"loss": 1.2625,
"step": 9390
},
{
"epoch": 7.918492799817861,
"grad_norm": 0.24160224199295044,
"learning_rate": 0.00030693069306930693,
"loss": 1.2594,
"step": 9400
},
{
"epoch": 7.9269167283283055,
"grad_norm": 0.26693734526634216,
"learning_rate": 0.00030594059405940593,
"loss": 1.2605,
"step": 9410
},
{
"epoch": 7.935340656838751,
"grad_norm": 0.23551449179649353,
"learning_rate": 0.00030495049504950493,
"loss": 1.2589,
"step": 9420
},
{
"epoch": 7.943764585349195,
"grad_norm": 0.23266945779323578,
"learning_rate": 0.000303960396039604,
"loss": 1.2575,
"step": 9430
},
{
"epoch": 7.952188513859639,
"grad_norm": 0.19307726621627808,
"learning_rate": 0.000302970297029703,
"loss": 1.2594,
"step": 9440
},
{
"epoch": 7.960612442370084,
"grad_norm": 0.2490869015455246,
"learning_rate": 0.000301980198019802,
"loss": 1.2594,
"step": 9450
},
{
"epoch": 7.960612442370084,
"eval_accuracy": 0.7392987654643606,
"eval_loss": 1.1463170051574707,
"eval_runtime": 887.3291,
"eval_samples_per_second": 562.786,
"eval_steps_per_second": 5.211,
"step": 9450
},
{
"epoch": 7.969036370880528,
"grad_norm": 0.24613766372203827,
"learning_rate": 0.000300990099009901,
"loss": 1.2586,
"step": 9460
},
{
"epoch": 7.977460299390973,
"grad_norm": 0.28653955459594727,
"learning_rate": 0.0003,
"loss": 1.2596,
"step": 9470
},
{
"epoch": 7.985884227901417,
"grad_norm": 0.2534151077270508,
"learning_rate": 0.000299009900990099,
"loss": 1.258,
"step": 9480
},
{
"epoch": 7.994308156411861,
"grad_norm": 0.2278260588645935,
"learning_rate": 0.000298019801980198,
"loss": 1.2596,
"step": 9490
},
{
"epoch": 8.002732084922306,
"grad_norm": 0.24955512583255768,
"learning_rate": 0.000297029702970297,
"loss": 1.2589,
"step": 9500
},
{
"epoch": 8.011156013432752,
"grad_norm": 0.24727576971054077,
"learning_rate": 0.000296039603960396,
"loss": 1.259,
"step": 9510
},
{
"epoch": 8.019579941943196,
"grad_norm": 0.23246212303638458,
"learning_rate": 0.000295049504950495,
"loss": 1.2569,
"step": 9520
},
{
"epoch": 8.02800387045364,
"grad_norm": 0.31031736731529236,
"learning_rate": 0.00029405940594059407,
"loss": 1.2576,
"step": 9530
},
{
"epoch": 8.036427798964084,
"grad_norm": 0.25005343556404114,
"learning_rate": 0.00029306930693069307,
"loss": 1.2586,
"step": 9540
},
{
"epoch": 8.036427798964084,
"eval_accuracy": 0.7396166114825387,
"eval_loss": 1.1443780660629272,
"eval_runtime": 886.7087,
"eval_samples_per_second": 563.179,
"eval_steps_per_second": 5.215,
"step": 9540
},
{
"epoch": 8.044851727474528,
"grad_norm": 0.26693809032440186,
"learning_rate": 0.00029207920792079207,
"loss": 1.2565,
"step": 9550
},
{
"epoch": 8.053275655984974,
"grad_norm": 0.2694302797317505,
"learning_rate": 0.00029108910891089107,
"loss": 1.2578,
"step": 9560
},
{
"epoch": 8.061699584495418,
"grad_norm": 0.28717589378356934,
"learning_rate": 0.00029009900990099006,
"loss": 1.257,
"step": 9570
},
{
"epoch": 8.070123513005862,
"grad_norm": 0.2473517805337906,
"learning_rate": 0.0002891089108910891,
"loss": 1.2584,
"step": 9580
},
{
"epoch": 8.078547441516307,
"grad_norm": 0.238663449883461,
"learning_rate": 0.0002881188118811881,
"loss": 1.2565,
"step": 9590
},
{
"epoch": 8.086971370026752,
"grad_norm": 0.25168007612228394,
"learning_rate": 0.0002871287128712871,
"loss": 1.2601,
"step": 9600
},
{
"epoch": 8.095395298537197,
"grad_norm": 0.2553163766860962,
"learning_rate": 0.0002861386138613861,
"loss": 1.2582,
"step": 9610
},
{
"epoch": 8.10381922704764,
"grad_norm": 0.22442133724689484,
"learning_rate": 0.0002851485148514851,
"loss": 1.2564,
"step": 9620
},
{
"epoch": 8.112243155558085,
"grad_norm": 0.2428729087114334,
"learning_rate": 0.00028415841584158416,
"loss": 1.2555,
"step": 9630
},
{
"epoch": 8.112243155558085,
"eval_accuracy": 0.7398516451845706,
"eval_loss": 1.1434710025787354,
"eval_runtime": 884.9135,
"eval_samples_per_second": 564.322,
"eval_steps_per_second": 5.225,
"step": 9630
},
{
"epoch": 8.120667084068529,
"grad_norm": 0.24635536968708038,
"learning_rate": 0.00028316831683168316,
"loss": 1.256,
"step": 9640
},
{
"epoch": 8.129091012578975,
"grad_norm": 0.25894826650619507,
"learning_rate": 0.00028217821782178216,
"loss": 1.2559,
"step": 9650
},
{
"epoch": 8.13751494108942,
"grad_norm": 0.28364095091819763,
"learning_rate": 0.0002811881188118812,
"loss": 1.2558,
"step": 9660
},
{
"epoch": 8.145938869599863,
"grad_norm": 0.27813902497291565,
"learning_rate": 0.0002801980198019802,
"loss": 1.2551,
"step": 9670
},
{
"epoch": 8.154362798110308,
"grad_norm": 0.25842994451522827,
"learning_rate": 0.00027920792079207926,
"loss": 1.2566,
"step": 9680
},
{
"epoch": 8.162786726620752,
"grad_norm": 0.28136196732521057,
"learning_rate": 0.00027821782178217826,
"loss": 1.2558,
"step": 9690
},
{
"epoch": 8.171210655131198,
"grad_norm": 0.24087685346603394,
"learning_rate": 0.00027722772277227726,
"loss": 1.2548,
"step": 9700
},
{
"epoch": 8.179634583641642,
"grad_norm": 0.24687226116657257,
"learning_rate": 0.00027623762376237626,
"loss": 1.2585,
"step": 9710
},
{
"epoch": 8.188058512152086,
"grad_norm": 0.22570998966693878,
"learning_rate": 0.00027524752475247525,
"loss": 1.2534,
"step": 9720
},
{
"epoch": 8.188058512152086,
"eval_accuracy": 0.7402963892075639,
"eval_loss": 1.1417516469955444,
"eval_runtime": 887.2248,
"eval_samples_per_second": 562.852,
"eval_steps_per_second": 5.212,
"step": 9720
},
{
"epoch": 8.19648244066253,
"grad_norm": 0.2180325835943222,
"learning_rate": 0.0002742574257425743,
"loss": 1.254,
"step": 9730
},
{
"epoch": 8.204906369172976,
"grad_norm": 0.24650686979293823,
"learning_rate": 0.0002732673267326733,
"loss": 1.2549,
"step": 9740
},
{
"epoch": 8.21333029768342,
"grad_norm": 0.23055210709571838,
"learning_rate": 0.0002722772277227723,
"loss": 1.2533,
"step": 9750
},
{
"epoch": 8.221754226193864,
"grad_norm": 0.2486119419336319,
"learning_rate": 0.0002712871287128713,
"loss": 1.2535,
"step": 9760
},
{
"epoch": 8.230178154704308,
"grad_norm": 0.2295829951763153,
"learning_rate": 0.0002702970297029703,
"loss": 1.2532,
"step": 9770
},
{
"epoch": 8.238602083214753,
"grad_norm": 0.24997445940971375,
"learning_rate": 0.00026930693069306935,
"loss": 1.2531,
"step": 9780
},
{
"epoch": 8.247026011725199,
"grad_norm": 0.26696640253067017,
"learning_rate": 0.00026831683168316835,
"loss": 1.2537,
"step": 9790
},
{
"epoch": 8.255449940235643,
"grad_norm": 0.26139459013938904,
"learning_rate": 0.00026732673267326735,
"loss": 1.255,
"step": 9800
},
{
"epoch": 8.263873868746087,
"grad_norm": 0.24359402060508728,
"learning_rate": 0.00026633663366336635,
"loss": 1.2531,
"step": 9810
},
{
"epoch": 8.263873868746087,
"eval_accuracy": 0.7405673501883495,
"eval_loss": 1.139613389968872,
"eval_runtime": 879.601,
"eval_samples_per_second": 567.73,
"eval_steps_per_second": 5.257,
"step": 9810
},
{
"epoch": 8.272297797256531,
"grad_norm": 0.2327917069196701,
"learning_rate": 0.00026534653465346534,
"loss": 1.2534,
"step": 9820
},
{
"epoch": 8.280721725766975,
"grad_norm": 0.25629815459251404,
"learning_rate": 0.0002643564356435644,
"loss": 1.2531,
"step": 9830
},
{
"epoch": 8.289145654277421,
"grad_norm": 0.22450138628482819,
"learning_rate": 0.0002633663366336634,
"loss": 1.2529,
"step": 9840
},
{
"epoch": 8.297569582787865,
"grad_norm": 0.2623524069786072,
"learning_rate": 0.0002623762376237624,
"loss": 1.2504,
"step": 9850
},
{
"epoch": 8.30599351129831,
"grad_norm": 0.2159668356180191,
"learning_rate": 0.0002613861386138614,
"loss": 1.2528,
"step": 9860
},
{
"epoch": 8.314417439808754,
"grad_norm": 0.24267102777957916,
"learning_rate": 0.0002603960396039604,
"loss": 1.2514,
"step": 9870
},
{
"epoch": 8.322841368319198,
"grad_norm": 0.2541745603084564,
"learning_rate": 0.00025940594059405944,
"loss": 1.2505,
"step": 9880
},
{
"epoch": 8.331265296829644,
"grad_norm": 0.28231385350227356,
"learning_rate": 0.00025841584158415844,
"loss": 1.2511,
"step": 9890
},
{
"epoch": 8.339689225340088,
"grad_norm": 0.2412833273410797,
"learning_rate": 0.00025742574257425744,
"loss": 1.2506,
"step": 9900
},
{
"epoch": 8.339689225340088,
"eval_accuracy": 0.740612444763646,
"eval_loss": 1.140478491783142,
"eval_runtime": 884.9323,
"eval_samples_per_second": 564.31,
"eval_steps_per_second": 5.225,
"step": 9900
},
{
"epoch": 8.348113153850532,
"grad_norm": 0.2641441524028778,
"learning_rate": 0.00025643564356435644,
"loss": 1.2519,
"step": 9910
},
{
"epoch": 8.356537082360976,
"grad_norm": 0.2675786316394806,
"learning_rate": 0.00025544554455445543,
"loss": 1.2516,
"step": 9920
},
{
"epoch": 8.364961010871422,
"grad_norm": 0.2118910253047943,
"learning_rate": 0.0002544554455445545,
"loss": 1.2511,
"step": 9930
},
{
"epoch": 8.373384939381866,
"grad_norm": 0.27223941683769226,
"learning_rate": 0.0002534653465346535,
"loss": 1.2519,
"step": 9940
},
{
"epoch": 8.38180886789231,
"grad_norm": 0.2487749308347702,
"learning_rate": 0.0002524752475247525,
"loss": 1.2506,
"step": 9950
},
{
"epoch": 8.390232796402755,
"grad_norm": 0.2320510894060135,
"learning_rate": 0.0002514851485148515,
"loss": 1.2534,
"step": 9960
},
{
"epoch": 8.398656724913199,
"grad_norm": 0.2474934607744217,
"learning_rate": 0.0002504950495049505,
"loss": 1.249,
"step": 9970
},
{
"epoch": 8.407080653423645,
"grad_norm": 0.23778343200683594,
"learning_rate": 0.00024950495049504953,
"loss": 1.2503,
"step": 9980
},
{
"epoch": 8.415504581934089,
"grad_norm": 0.2715946137905121,
"learning_rate": 0.00024851485148514853,
"loss": 1.2515,
"step": 9990
},
{
"epoch": 8.415504581934089,
"eval_accuracy": 0.7412818791412316,
"eval_loss": 1.137270450592041,
"eval_runtime": 885.4223,
"eval_samples_per_second": 563.998,
"eval_steps_per_second": 5.222,
"step": 9990
},
{
"epoch": 8.423928510444533,
"grad_norm": 0.26555290818214417,
"learning_rate": 0.00024752475247524753,
"loss": 1.2485,
"step": 10000
},
{
"epoch": 8.432352438954977,
"grad_norm": 0.23698092997074127,
"learning_rate": 0.0002465346534653465,
"loss": 1.2498,
"step": 10010
},
{
"epoch": 8.440776367465421,
"grad_norm": 0.23015616834163666,
"learning_rate": 0.0002455445544554455,
"loss": 1.2482,
"step": 10020
},
{
"epoch": 8.449200295975867,
"grad_norm": 0.22911451756954193,
"learning_rate": 0.0002445544554455446,
"loss": 1.2503,
"step": 10030
},
{
"epoch": 8.457624224486311,
"grad_norm": 0.24171452224254608,
"learning_rate": 0.00024356435643564357,
"loss": 1.2485,
"step": 10040
},
{
"epoch": 8.466048152996756,
"grad_norm": 0.24717497825622559,
"learning_rate": 0.00024257425742574257,
"loss": 1.2503,
"step": 10050
},
{
"epoch": 8.4744720815072,
"grad_norm": 0.23118732869625092,
"learning_rate": 0.00024158415841584157,
"loss": 1.2488,
"step": 10060
},
{
"epoch": 8.482896010017644,
"grad_norm": 0.22151467204093933,
"learning_rate": 0.0002405940594059406,
"loss": 1.2484,
"step": 10070
},
{
"epoch": 8.49131993852809,
"grad_norm": 0.2284466177225113,
"learning_rate": 0.0002396039603960396,
"loss": 1.2487,
"step": 10080
},
{
"epoch": 8.49131993852809,
"eval_accuracy": 0.7414350855696202,
"eval_loss": 1.134464144706726,
"eval_runtime": 887.5421,
"eval_samples_per_second": 562.65,
"eval_steps_per_second": 5.21,
"step": 10080
},
{
"epoch": 8.499743867038534,
"grad_norm": 0.2377534806728363,
"learning_rate": 0.00023861386138613862,
"loss": 1.2491,
"step": 10090
},
{
"epoch": 8.508167795548978,
"grad_norm": 0.2649644613265991,
"learning_rate": 0.00023762376237623762,
"loss": 1.2467,
"step": 10100
},
{
"epoch": 8.516591724059422,
"grad_norm": 0.22302138805389404,
"learning_rate": 0.00023663366336633662,
"loss": 1.2496,
"step": 10110
},
{
"epoch": 8.525015652569868,
"grad_norm": 0.24170257151126862,
"learning_rate": 0.00023564356435643564,
"loss": 1.2471,
"step": 10120
},
{
"epoch": 8.533439581080312,
"grad_norm": 0.2645774781703949,
"learning_rate": 0.00023465346534653464,
"loss": 1.2477,
"step": 10130
},
{
"epoch": 8.541863509590756,
"grad_norm": 0.24155734479427338,
"learning_rate": 0.0002336633663366337,
"loss": 1.2466,
"step": 10140
},
{
"epoch": 8.5502874381012,
"grad_norm": 0.23023132979869843,
"learning_rate": 0.0002326732673267327,
"loss": 1.2457,
"step": 10150
},
{
"epoch": 8.558711366611645,
"grad_norm": 0.2243080586194992,
"learning_rate": 0.0002316831683168317,
"loss": 1.2476,
"step": 10160
},
{
"epoch": 8.56713529512209,
"grad_norm": 0.278157114982605,
"learning_rate": 0.00023069306930693071,
"loss": 1.2462,
"step": 10170
},
{
"epoch": 8.56713529512209,
"eval_accuracy": 0.7417397824056636,
"eval_loss": 1.1336922645568848,
"eval_runtime": 892.4907,
"eval_samples_per_second": 559.531,
"eval_steps_per_second": 5.181,
"step": 10170
},
{
"epoch": 8.575559223632535,
"grad_norm": 0.24606026709079742,
"learning_rate": 0.0002297029702970297,
"loss": 1.2478,
"step": 10180
},
{
"epoch": 8.583983152142979,
"grad_norm": 0.23494498431682587,
"learning_rate": 0.00022871287128712874,
"loss": 1.2463,
"step": 10190
},
{
"epoch": 8.592407080653423,
"grad_norm": 0.21522320806980133,
"learning_rate": 0.00022772277227722774,
"loss": 1.2479,
"step": 10200
},
{
"epoch": 8.60083100916387,
"grad_norm": 0.2655723989009857,
"learning_rate": 0.00022673267326732673,
"loss": 1.2468,
"step": 10210
},
{
"epoch": 8.609254937674313,
"grad_norm": 0.2444898933172226,
"learning_rate": 0.00022574257425742576,
"loss": 1.246,
"step": 10220
},
{
"epoch": 8.617678866184757,
"grad_norm": 0.2277156114578247,
"learning_rate": 0.00022475247524752476,
"loss": 1.2466,
"step": 10230
},
{
"epoch": 8.626102794695202,
"grad_norm": 0.22111962735652924,
"learning_rate": 0.00022376237623762378,
"loss": 1.2451,
"step": 10240
},
{
"epoch": 8.634526723205646,
"grad_norm": 0.23199447989463806,
"learning_rate": 0.00022277227722772278,
"loss": 1.2463,
"step": 10250
},
{
"epoch": 8.642950651716092,
"grad_norm": 0.22960427403450012,
"learning_rate": 0.00022178217821782178,
"loss": 1.2465,
"step": 10260
},
{
"epoch": 8.642950651716092,
"eval_accuracy": 0.7420823467349104,
"eval_loss": 1.1322184801101685,
"eval_runtime": 883.7567,
"eval_samples_per_second": 565.061,
"eval_steps_per_second": 5.232,
"step": 10260
},
{
"epoch": 8.651374580226536,
"grad_norm": 0.290622353553772,
"learning_rate": 0.0002207920792079208,
"loss": 1.2444,
"step": 10270
},
{
"epoch": 8.65979850873698,
"grad_norm": 0.2639337480068207,
"learning_rate": 0.0002198019801980198,
"loss": 1.247,
"step": 10280
},
{
"epoch": 8.668222437247424,
"grad_norm": 0.22477252781391144,
"learning_rate": 0.00021881188118811883,
"loss": 1.2443,
"step": 10290
},
{
"epoch": 8.676646365757868,
"grad_norm": 0.2989983558654785,
"learning_rate": 0.00021782178217821783,
"loss": 1.2461,
"step": 10300
},
{
"epoch": 8.685070294268314,
"grad_norm": 0.22259776294231415,
"learning_rate": 0.00021683168316831682,
"loss": 1.2438,
"step": 10310
},
{
"epoch": 8.693494222778758,
"grad_norm": 0.21380363404750824,
"learning_rate": 0.00021584158415841585,
"loss": 1.2414,
"step": 10320
},
{
"epoch": 8.701918151289203,
"grad_norm": 0.23593538999557495,
"learning_rate": 0.00021485148514851485,
"loss": 1.2454,
"step": 10330
},
{
"epoch": 8.710342079799647,
"grad_norm": 0.25987499952316284,
"learning_rate": 0.00021386138613861387,
"loss": 1.2444,
"step": 10340
},
{
"epoch": 8.71876600831009,
"grad_norm": 0.21150009334087372,
"learning_rate": 0.00021287128712871287,
"loss": 1.2414,
"step": 10350
},
{
"epoch": 8.71876600831009,
"eval_accuracy": 0.7421671573662553,
"eval_loss": 1.1316900253295898,
"eval_runtime": 893.0033,
"eval_samples_per_second": 559.21,
"eval_steps_per_second": 5.178,
"step": 10350
},
{
"epoch": 8.727189936820537,
"grad_norm": 0.23628725111484528,
"learning_rate": 0.00021188118811881187,
"loss": 1.2432,
"step": 10360
},
{
"epoch": 8.735613865330981,
"grad_norm": 0.24477533996105194,
"learning_rate": 0.0002108910891089109,
"loss": 1.2447,
"step": 10370
},
{
"epoch": 8.744037793841425,
"grad_norm": 0.2156253159046173,
"learning_rate": 0.0002099009900990099,
"loss": 1.2452,
"step": 10380
},
{
"epoch": 8.75246172235187,
"grad_norm": 0.27982792258262634,
"learning_rate": 0.00020891089108910892,
"loss": 1.2434,
"step": 10390
},
{
"epoch": 8.760885650862313,
"grad_norm": 0.24025356769561768,
"learning_rate": 0.00020792079207920792,
"loss": 1.244,
"step": 10400
},
{
"epoch": 8.76930957937276,
"grad_norm": 0.22768454253673553,
"learning_rate": 0.00020693069306930691,
"loss": 1.2427,
"step": 10410
},
{
"epoch": 8.777733507883204,
"grad_norm": 0.2676762640476227,
"learning_rate": 0.00020594059405940594,
"loss": 1.244,
"step": 10420
},
{
"epoch": 8.786157436393648,
"grad_norm": 0.23502378165721893,
"learning_rate": 0.00020495049504950494,
"loss": 1.244,
"step": 10430
},
{
"epoch": 8.794581364904092,
"grad_norm": 0.23354895412921906,
"learning_rate": 0.00020396039603960396,
"loss": 1.2435,
"step": 10440
},
{
"epoch": 8.794581364904092,
"eval_accuracy": 0.7425177306861277,
"eval_loss": 1.1301963329315186,
"eval_runtime": 885.137,
"eval_samples_per_second": 564.179,
"eval_steps_per_second": 5.224,
"step": 10440
},
{
"epoch": 8.803005293414538,
"grad_norm": 0.22738757729530334,
"learning_rate": 0.000202970297029703,
"loss": 1.2426,
"step": 10450
},
{
"epoch": 8.811429221924982,
"grad_norm": 0.20702116191387177,
"learning_rate": 0.00020198019801980199,
"loss": 1.243,
"step": 10460
},
{
"epoch": 8.819853150435426,
"grad_norm": 0.20945468544960022,
"learning_rate": 0.000200990099009901,
"loss": 1.2411,
"step": 10470
},
{
"epoch": 8.82827707894587,
"grad_norm": 0.21654458343982697,
"learning_rate": 0.0002,
"loss": 1.2428,
"step": 10480
},
{
"epoch": 8.836701007456314,
"grad_norm": 0.2217228263616562,
"learning_rate": 0.00019900990099009903,
"loss": 1.2405,
"step": 10490
},
{
"epoch": 8.84512493596676,
"grad_norm": 0.27619633078575134,
"learning_rate": 0.00019801980198019803,
"loss": 1.2424,
"step": 10500
},
{
"epoch": 8.853548864477204,
"grad_norm": 0.2569934129714966,
"learning_rate": 0.00019702970297029703,
"loss": 1.2418,
"step": 10510
},
{
"epoch": 8.861972792987649,
"grad_norm": 0.2570299804210663,
"learning_rate": 0.00019603960396039606,
"loss": 1.2423,
"step": 10520
},
{
"epoch": 8.870396721498093,
"grad_norm": 0.22972337901592255,
"learning_rate": 0.00019504950495049505,
"loss": 1.2399,
"step": 10530
},
{
"epoch": 8.870396721498093,
"eval_accuracy": 0.7427001211705735,
"eval_loss": 1.1304486989974976,
"eval_runtime": 881.4454,
"eval_samples_per_second": 566.542,
"eval_steps_per_second": 5.246,
"step": 10530
},
{
"epoch": 8.878820650008539,
"grad_norm": 0.2365693300962448,
"learning_rate": 0.00019405940594059408,
"loss": 1.2426,
"step": 10540
},
{
"epoch": 8.887244578518983,
"grad_norm": 0.2252751588821411,
"learning_rate": 0.00019306930693069308,
"loss": 1.2406,
"step": 10550
},
{
"epoch": 8.895668507029427,
"grad_norm": 0.2205033302307129,
"learning_rate": 0.00019207920792079208,
"loss": 1.2419,
"step": 10560
},
{
"epoch": 8.904092435539871,
"grad_norm": 0.21468041837215424,
"learning_rate": 0.0001910891089108911,
"loss": 1.2406,
"step": 10570
},
{
"epoch": 8.912516364050315,
"grad_norm": 0.23669223487377167,
"learning_rate": 0.0001900990099009901,
"loss": 1.2401,
"step": 10580
},
{
"epoch": 8.920940292560761,
"grad_norm": 0.2412618100643158,
"learning_rate": 0.00018910891089108913,
"loss": 1.2402,
"step": 10590
},
{
"epoch": 8.929364221071205,
"grad_norm": 0.21675223112106323,
"learning_rate": 0.00018811881188118812,
"loss": 1.2417,
"step": 10600
},
{
"epoch": 8.93778814958165,
"grad_norm": 0.24683676660060883,
"learning_rate": 0.00018712871287128712,
"loss": 1.2417,
"step": 10610
},
{
"epoch": 8.946212078092094,
"grad_norm": 0.21681492030620575,
"learning_rate": 0.00018613861386138615,
"loss": 1.2408,
"step": 10620
},
{
"epoch": 8.946212078092094,
"eval_accuracy": 0.7428579001690714,
"eval_loss": 1.1290760040283203,
"eval_runtime": 889.1418,
"eval_samples_per_second": 561.638,
"eval_steps_per_second": 5.201,
"step": 10620
},
{
"epoch": 8.954636006602538,
"grad_norm": 0.22117485105991364,
"learning_rate": 0.00018514851485148514,
"loss": 1.2399,
"step": 10630
},
{
"epoch": 8.963059935112984,
"grad_norm": 0.2180255800485611,
"learning_rate": 0.00018415841584158417,
"loss": 1.2378,
"step": 10640
},
{
"epoch": 8.971483863623428,
"grad_norm": 0.23244567215442657,
"learning_rate": 0.00018316831683168317,
"loss": 1.2402,
"step": 10650
},
{
"epoch": 8.979907792133872,
"grad_norm": 0.23777294158935547,
"learning_rate": 0.00018217821782178217,
"loss": 1.2417,
"step": 10660
},
{
"epoch": 8.988331720644316,
"grad_norm": 0.26418906450271606,
"learning_rate": 0.0001811881188118812,
"loss": 1.238,
"step": 10670
},
{
"epoch": 8.99675564915476,
"grad_norm": 0.21142803132534027,
"learning_rate": 0.0001801980198019802,
"loss": 1.2384,
"step": 10680
},
{
"epoch": 9.005179577665206,
"grad_norm": 0.21976542472839355,
"learning_rate": 0.00017920792079207922,
"loss": 1.2399,
"step": 10690
},
{
"epoch": 9.01360350617565,
"grad_norm": 0.2216147631406784,
"learning_rate": 0.0001782178217821782,
"loss": 1.2391,
"step": 10700
},
{
"epoch": 9.022027434686095,
"grad_norm": 0.1873018890619278,
"learning_rate": 0.0001772277227722772,
"loss": 1.2368,
"step": 10710
},
{
"epoch": 9.022027434686095,
"eval_accuracy": 0.7431224622062498,
"eval_loss": 1.1265127658843994,
"eval_runtime": 891.5668,
"eval_samples_per_second": 560.111,
"eval_steps_per_second": 5.186,
"step": 10710
},
{
"epoch": 9.030451363196539,
"grad_norm": 0.23913191258907318,
"learning_rate": 0.00017623762376237624,
"loss": 1.2404,
"step": 10720
},
{
"epoch": 9.038875291706983,
"grad_norm": 0.21578449010849,
"learning_rate": 0.00017524752475247524,
"loss": 1.2388,
"step": 10730
},
{
"epoch": 9.047299220217429,
"grad_norm": 0.2038455754518509,
"learning_rate": 0.00017425742574257426,
"loss": 1.2402,
"step": 10740
},
{
"epoch": 9.055723148727873,
"grad_norm": 0.21903488039970398,
"learning_rate": 0.00017326732673267326,
"loss": 1.2383,
"step": 10750
},
{
"epoch": 9.064147077238317,
"grad_norm": 0.21970726549625397,
"learning_rate": 0.00017227722772277226,
"loss": 1.2386,
"step": 10760
},
{
"epoch": 9.072571005748761,
"grad_norm": 0.22701360285282135,
"learning_rate": 0.0001712871287128713,
"loss": 1.2391,
"step": 10770
},
{
"epoch": 9.080994934259207,
"grad_norm": 0.21777622401714325,
"learning_rate": 0.0001702970297029703,
"loss": 1.2388,
"step": 10780
},
{
"epoch": 9.089418862769651,
"grad_norm": 0.2336941659450531,
"learning_rate": 0.00016930693069306933,
"loss": 1.2383,
"step": 10790
},
{
"epoch": 9.097842791280096,
"grad_norm": 0.20545706152915955,
"learning_rate": 0.00016831683168316833,
"loss": 1.2376,
"step": 10800
},
{
"epoch": 9.097842791280096,
"eval_accuracy": 0.7435866345331611,
"eval_loss": 1.1250243186950684,
"eval_runtime": 885.3582,
"eval_samples_per_second": 564.038,
"eval_steps_per_second": 5.223,
"step": 10800
},
{
"epoch": 9.10626671979054,
"grad_norm": 0.23678459227085114,
"learning_rate": 0.00016732673267326733,
"loss": 1.2394,
"step": 10810
},
{
"epoch": 9.114690648300984,
"grad_norm": 0.24195948243141174,
"learning_rate": 0.00016633663366336635,
"loss": 1.238,
"step": 10820
},
{
"epoch": 9.12311457681143,
"grad_norm": 0.20026259124279022,
"learning_rate": 0.00016534653465346535,
"loss": 1.2364,
"step": 10830
},
{
"epoch": 9.131538505321874,
"grad_norm": 0.21753010153770447,
"learning_rate": 0.00016435643564356438,
"loss": 1.238,
"step": 10840
},
{
"epoch": 9.139962433832318,
"grad_norm": 0.20273657143115997,
"learning_rate": 0.00016336633663366338,
"loss": 1.2374,
"step": 10850
},
{
"epoch": 9.148386362342762,
"grad_norm": 0.21302086114883423,
"learning_rate": 0.00016237623762376237,
"loss": 1.2372,
"step": 10860
},
{
"epoch": 9.156810290853207,
"grad_norm": 0.23342467844486237,
"learning_rate": 0.0001613861386138614,
"loss": 1.2378,
"step": 10870
},
{
"epoch": 9.165234219363652,
"grad_norm": 0.24393875896930695,
"learning_rate": 0.0001603960396039604,
"loss": 1.2362,
"step": 10880
},
{
"epoch": 9.173658147874097,
"grad_norm": 0.19604717195034027,
"learning_rate": 0.00015940594059405942,
"loss": 1.237,
"step": 10890
},
{
"epoch": 9.173658147874097,
"eval_accuracy": 0.743667723412049,
"eval_loss": 1.124830722808838,
"eval_runtime": 887.4222,
"eval_samples_per_second": 562.727,
"eval_steps_per_second": 5.211,
"step": 10890
},
{
"epoch": 9.18208207638454,
"grad_norm": 0.19619697332382202,
"learning_rate": 0.00015841584158415842,
"loss": 1.2356,
"step": 10900
},
{
"epoch": 9.190506004894985,
"grad_norm": 0.20415499806404114,
"learning_rate": 0.00015742574257425742,
"loss": 1.2373,
"step": 10910
},
{
"epoch": 9.19892993340543,
"grad_norm": 0.21602529287338257,
"learning_rate": 0.00015643564356435644,
"loss": 1.2369,
"step": 10920
},
{
"epoch": 9.207353861915875,
"grad_norm": 0.2266259491443634,
"learning_rate": 0.00015544554455445544,
"loss": 1.236,
"step": 10930
},
{
"epoch": 9.21577779042632,
"grad_norm": 0.2172340452671051,
"learning_rate": 0.00015445544554455447,
"loss": 1.236,
"step": 10940
},
{
"epoch": 9.224201718936763,
"grad_norm": 0.21929994225502014,
"learning_rate": 0.00015346534653465347,
"loss": 1.2381,
"step": 10950
},
{
"epoch": 9.232625647447207,
"grad_norm": 0.20617130398750305,
"learning_rate": 0.00015247524752475246,
"loss": 1.2346,
"step": 10960
},
{
"epoch": 9.241049575957653,
"grad_norm": 0.2271021008491516,
"learning_rate": 0.0001514851485148515,
"loss": 1.2364,
"step": 10970
},
{
"epoch": 9.249473504468098,
"grad_norm": 0.22377552092075348,
"learning_rate": 0.0001504950495049505,
"loss": 1.2342,
"step": 10980
},
{
"epoch": 9.249473504468098,
"eval_accuracy": 0.7438243969178056,
"eval_loss": 1.124144434928894,
"eval_runtime": 880.0851,
"eval_samples_per_second": 567.418,
"eval_steps_per_second": 5.254,
"step": 10980
},
{
"epoch": 9.257897432978542,
"grad_norm": 0.23195216059684753,
"learning_rate": 0.0001495049504950495,
"loss": 1.2347,
"step": 10990
},
{
"epoch": 9.266321361488986,
"grad_norm": 0.19934554398059845,
"learning_rate": 0.0001485148514851485,
"loss": 1.2359,
"step": 11000
},
{
"epoch": 9.27474528999943,
"grad_norm": 0.19541287422180176,
"learning_rate": 0.0001475247524752475,
"loss": 1.2342,
"step": 11010
},
{
"epoch": 9.283169218509876,
"grad_norm": 0.2204955518245697,
"learning_rate": 0.00014653465346534653,
"loss": 1.2356,
"step": 11020
},
{
"epoch": 9.29159314702032,
"grad_norm": 0.22855669260025024,
"learning_rate": 0.00014554455445544553,
"loss": 1.2367,
"step": 11030
},
{
"epoch": 9.300017075530764,
"grad_norm": 0.20308193564414978,
"learning_rate": 0.00014455445544554456,
"loss": 1.235,
"step": 11040
},
{
"epoch": 9.308441004041208,
"grad_norm": 0.18201188743114471,
"learning_rate": 0.00014356435643564356,
"loss": 1.235,
"step": 11050
},
{
"epoch": 9.316864932551653,
"grad_norm": 0.199186772108078,
"learning_rate": 0.00014257425742574255,
"loss": 1.2348,
"step": 11060
},
{
"epoch": 9.325288861062099,
"grad_norm": 0.23214493691921234,
"learning_rate": 0.00014158415841584158,
"loss": 1.2335,
"step": 11070
},
{
"epoch": 9.325288861062099,
"eval_accuracy": 0.7438911749364814,
"eval_loss": 1.123384714126587,
"eval_runtime": 888.3176,
"eval_samples_per_second": 562.159,
"eval_steps_per_second": 5.205,
"step": 11070
},
{
"epoch": 9.333712789572543,
"grad_norm": 0.2128278762102127,
"learning_rate": 0.0001405940594059406,
"loss": 1.2337,
"step": 11080
},
{
"epoch": 9.342136718082987,
"grad_norm": 0.20257510244846344,
"learning_rate": 0.00013960396039603963,
"loss": 1.2357,
"step": 11090
},
{
"epoch": 9.350560646593431,
"grad_norm": 0.22038786113262177,
"learning_rate": 0.00013861386138613863,
"loss": 1.2333,
"step": 11100
},
{
"epoch": 9.358984575103877,
"grad_norm": 0.2351042628288269,
"learning_rate": 0.00013762376237623763,
"loss": 1.235,
"step": 11110
},
{
"epoch": 9.367408503614321,
"grad_norm": 0.2042153775691986,
"learning_rate": 0.00013663366336633665,
"loss": 1.2339,
"step": 11120
},
{
"epoch": 9.375832432124765,
"grad_norm": 0.20065917074680328,
"learning_rate": 0.00013564356435643565,
"loss": 1.234,
"step": 11130
},
{
"epoch": 9.38425636063521,
"grad_norm": 0.22544540464878082,
"learning_rate": 0.00013465346534653468,
"loss": 1.2319,
"step": 11140
},
{
"epoch": 9.392680289145654,
"grad_norm": 0.2352074533700943,
"learning_rate": 0.00013366336633663367,
"loss": 1.2347,
"step": 11150
},
{
"epoch": 9.4011042176561,
"grad_norm": 0.2452593892812729,
"learning_rate": 0.00013267326732673267,
"loss": 1.2343,
"step": 11160
},
{
"epoch": 9.4011042176561,
"eval_accuracy": 0.7445740208736444,
"eval_loss": 1.1202077865600586,
"eval_runtime": 879.3984,
"eval_samples_per_second": 567.861,
"eval_steps_per_second": 5.258,
"step": 11160
},
{
"epoch": 9.409528146166544,
"grad_norm": 0.20848217606544495,
"learning_rate": 0.0001316831683168317,
"loss": 1.2315,
"step": 11170
},
{
"epoch": 9.417952074676988,
"grad_norm": 0.20628029108047485,
"learning_rate": 0.0001306930693069307,
"loss": 1.2326,
"step": 11180
},
{
"epoch": 9.426376003187432,
"grad_norm": 0.199026957154274,
"learning_rate": 0.00012970297029702972,
"loss": 1.2329,
"step": 11190
},
{
"epoch": 9.434799931697876,
"grad_norm": 0.21373671293258667,
"learning_rate": 0.00012871287128712872,
"loss": 1.2326,
"step": 11200
},
{
"epoch": 9.443223860208322,
"grad_norm": 0.2015460729598999,
"learning_rate": 0.00012772277227722772,
"loss": 1.2327,
"step": 11210
},
{
"epoch": 9.451647788718766,
"grad_norm": 0.2228008210659027,
"learning_rate": 0.00012673267326732674,
"loss": 1.2334,
"step": 11220
},
{
"epoch": 9.46007171722921,
"grad_norm": 0.21561528742313385,
"learning_rate": 0.00012574257425742574,
"loss": 1.233,
"step": 11230
},
{
"epoch": 9.468495645739655,
"grad_norm": 0.2073032706975937,
"learning_rate": 0.00012475247524752477,
"loss": 1.2314,
"step": 11240
},
{
"epoch": 9.4769195742501,
"grad_norm": 0.19552037119865417,
"learning_rate": 0.00012376237623762376,
"loss": 1.2333,
"step": 11250
},
{
"epoch": 9.4769195742501,
"eval_accuracy": 0.744401638855597,
"eval_loss": 1.1210565567016602,
"eval_runtime": 888.2535,
"eval_samples_per_second": 562.2,
"eval_steps_per_second": 5.206,
"step": 11250
},
{
"epoch": 9.485343502760545,
"grad_norm": 0.20909276604652405,
"learning_rate": 0.00012277227722772276,
"loss": 1.2332,
"step": 11260
},
{
"epoch": 9.493767431270989,
"grad_norm": 0.210150346159935,
"learning_rate": 0.00012178217821782179,
"loss": 1.2308,
"step": 11270
},
{
"epoch": 9.502191359781433,
"grad_norm": 0.1982164978981018,
"learning_rate": 0.00012079207920792079,
"loss": 1.2305,
"step": 11280
},
{
"epoch": 9.510615288291877,
"grad_norm": 0.2049965262413025,
"learning_rate": 0.0001198019801980198,
"loss": 1.2334,
"step": 11290
},
{
"epoch": 9.519039216802323,
"grad_norm": 0.18243108689785004,
"learning_rate": 0.00011881188118811881,
"loss": 1.2335,
"step": 11300
},
{
"epoch": 9.527463145312767,
"grad_norm": 0.2009328156709671,
"learning_rate": 0.00011782178217821782,
"loss": 1.2313,
"step": 11310
},
{
"epoch": 9.535887073823211,
"grad_norm": 0.19226033985614777,
"learning_rate": 0.00011683168316831685,
"loss": 1.2332,
"step": 11320
},
{
"epoch": 9.544311002333655,
"grad_norm": 0.20206843316555023,
"learning_rate": 0.00011584158415841584,
"loss": 1.2333,
"step": 11330
},
{
"epoch": 9.5527349308441,
"grad_norm": 0.20852382481098175,
"learning_rate": 0.00011485148514851486,
"loss": 1.2322,
"step": 11340
},
{
"epoch": 9.5527349308441,
"eval_accuracy": 0.7448142064493213,
"eval_loss": 1.1182734966278076,
"eval_runtime": 889.106,
"eval_samples_per_second": 561.661,
"eval_steps_per_second": 5.201,
"step": 11340
},
{
"epoch": 9.561158859354546,
"grad_norm": 0.19330884516239166,
"learning_rate": 0.00011386138613861387,
"loss": 1.2294,
"step": 11350
},
{
"epoch": 9.56958278786499,
"grad_norm": 0.17878125607967377,
"learning_rate": 0.00011287128712871288,
"loss": 1.2301,
"step": 11360
},
{
"epoch": 9.578006716375434,
"grad_norm": 0.20679515600204468,
"learning_rate": 0.00011188118811881189,
"loss": 1.2302,
"step": 11370
},
{
"epoch": 9.586430644885878,
"grad_norm": 0.20949432253837585,
"learning_rate": 0.00011089108910891089,
"loss": 1.2308,
"step": 11380
},
{
"epoch": 9.594854573396322,
"grad_norm": 0.21771377325057983,
"learning_rate": 0.0001099009900990099,
"loss": 1.2313,
"step": 11390
},
{
"epoch": 9.603278501906768,
"grad_norm": 0.1953546106815338,
"learning_rate": 0.00010891089108910891,
"loss": 1.2305,
"step": 11400
},
{
"epoch": 9.611702430417212,
"grad_norm": 0.20105966925621033,
"learning_rate": 0.00010792079207920792,
"loss": 1.2294,
"step": 11410
},
{
"epoch": 9.620126358927656,
"grad_norm": 0.20625823736190796,
"learning_rate": 0.00010693069306930694,
"loss": 1.2287,
"step": 11420
},
{
"epoch": 9.6285502874381,
"grad_norm": 0.2024402767419815,
"learning_rate": 0.00010594059405940593,
"loss": 1.2309,
"step": 11430
},
{
"epoch": 9.6285502874381,
"eval_accuracy": 0.7450274546722492,
"eval_loss": 1.1177880764007568,
"eval_runtime": 889.3816,
"eval_samples_per_second": 561.487,
"eval_steps_per_second": 5.199,
"step": 11430
},
{
"epoch": 9.636974215948547,
"grad_norm": 0.20498992502689362,
"learning_rate": 0.00010495049504950495,
"loss": 1.228,
"step": 11440
},
{
"epoch": 9.64539814445899,
"grad_norm": 0.18760576844215393,
"learning_rate": 0.00010396039603960396,
"loss": 1.2287,
"step": 11450
},
{
"epoch": 9.653822072969435,
"grad_norm": 0.2059292048215866,
"learning_rate": 0.00010297029702970297,
"loss": 1.2284,
"step": 11460
},
{
"epoch": 9.662246001479879,
"grad_norm": 0.20898665487766266,
"learning_rate": 0.00010198019801980198,
"loss": 1.231,
"step": 11470
},
{
"epoch": 9.670669929990323,
"grad_norm": 0.20303255319595337,
"learning_rate": 0.00010099009900990099,
"loss": 1.2302,
"step": 11480
},
{
"epoch": 9.679093858500769,
"grad_norm": 0.20947200059890747,
"learning_rate": 0.0001,
"loss": 1.2314,
"step": 11490
},
{
"epoch": 9.687517787011213,
"grad_norm": 0.20898771286010742,
"learning_rate": 9.900990099009902e-05,
"loss": 1.2294,
"step": 11500
},
{
"epoch": 9.695941715521657,
"grad_norm": 0.18466849625110626,
"learning_rate": 9.801980198019803e-05,
"loss": 1.2309,
"step": 11510
},
{
"epoch": 9.704365644032102,
"grad_norm": 0.1769760698080063,
"learning_rate": 9.702970297029704e-05,
"loss": 1.2282,
"step": 11520
},
{
"epoch": 9.704365644032102,
"eval_accuracy": 0.7449189101862153,
"eval_loss": 1.118354082107544,
"eval_runtime": 879.3937,
"eval_samples_per_second": 567.864,
"eval_steps_per_second": 5.258,
"step": 11520
},
{
"epoch": 9.712789572542546,
"grad_norm": 0.18270480632781982,
"learning_rate": 9.603960396039604e-05,
"loss": 1.2286,
"step": 11530
},
{
"epoch": 9.721213501052992,
"grad_norm": 0.1812662035226822,
"learning_rate": 9.504950495049505e-05,
"loss": 1.2279,
"step": 11540
},
{
"epoch": 9.729637429563436,
"grad_norm": 0.20632152259349823,
"learning_rate": 9.405940594059406e-05,
"loss": 1.2295,
"step": 11550
},
{
"epoch": 9.73806135807388,
"grad_norm": 0.19512777030467987,
"learning_rate": 9.306930693069307e-05,
"loss": 1.2292,
"step": 11560
},
{
"epoch": 9.746485286584324,
"grad_norm": 0.19665522873401642,
"learning_rate": 9.207920792079209e-05,
"loss": 1.2294,
"step": 11570
},
{
"epoch": 9.75490921509477,
"grad_norm": 0.18540680408477783,
"learning_rate": 9.108910891089108e-05,
"loss": 1.2297,
"step": 11580
},
{
"epoch": 9.763333143605214,
"grad_norm": 0.21472424268722534,
"learning_rate": 9.00990099009901e-05,
"loss": 1.2277,
"step": 11590
},
{
"epoch": 9.771757072115658,
"grad_norm": 0.2189822793006897,
"learning_rate": 8.91089108910891e-05,
"loss": 1.2293,
"step": 11600
},
{
"epoch": 9.780181000626103,
"grad_norm": 0.19983939826488495,
"learning_rate": 8.811881188118812e-05,
"loss": 1.2287,
"step": 11610
},
{
"epoch": 9.780181000626103,
"eval_accuracy": 0.7452771934107217,
"eval_loss": 1.1166530847549438,
"eval_runtime": 886.9822,
"eval_samples_per_second": 563.006,
"eval_steps_per_second": 5.213,
"step": 11610
},
{
"epoch": 9.788604929136547,
"grad_norm": 0.1868014931678772,
"learning_rate": 8.712871287128713e-05,
"loss": 1.2296,
"step": 11620
},
{
"epoch": 9.797028857646993,
"grad_norm": 0.2048911601305008,
"learning_rate": 8.613861386138613e-05,
"loss": 1.2291,
"step": 11630
},
{
"epoch": 9.805452786157437,
"grad_norm": 0.2088802009820938,
"learning_rate": 8.514851485148515e-05,
"loss": 1.2271,
"step": 11640
},
{
"epoch": 9.813876714667881,
"grad_norm": 0.20058122277259827,
"learning_rate": 8.415841584158417e-05,
"loss": 1.2296,
"step": 11650
},
{
"epoch": 9.822300643178325,
"grad_norm": 0.1964656561613083,
"learning_rate": 8.316831683168318e-05,
"loss": 1.2272,
"step": 11660
},
{
"epoch": 9.83072457168877,
"grad_norm": 0.20214231312274933,
"learning_rate": 8.217821782178219e-05,
"loss": 1.2271,
"step": 11670
},
{
"epoch": 9.839148500199215,
"grad_norm": 0.19427910447120667,
"learning_rate": 8.118811881188119e-05,
"loss": 1.2264,
"step": 11680
},
{
"epoch": 9.84757242870966,
"grad_norm": 0.18842646479606628,
"learning_rate": 8.01980198019802e-05,
"loss": 1.2265,
"step": 11690
},
{
"epoch": 9.855996357220103,
"grad_norm": 0.18588952720165253,
"learning_rate": 7.920792079207921e-05,
"loss": 1.2279,
"step": 11700
},
{
"epoch": 9.855996357220103,
"eval_accuracy": 0.7454476541387279,
"eval_loss": 1.1153885126113892,
"eval_runtime": 879.2745,
"eval_samples_per_second": 567.941,
"eval_steps_per_second": 5.259,
"step": 11700
},
{
"epoch": 9.864420285730548,
"grad_norm": 0.18300525844097137,
"learning_rate": 7.821782178217822e-05,
"loss": 1.2268,
"step": 11710
},
{
"epoch": 9.872844214240992,
"grad_norm": 0.18436813354492188,
"learning_rate": 7.722772277227723e-05,
"loss": 1.2256,
"step": 11720
},
{
"epoch": 9.881268142751438,
"grad_norm": 0.19767363369464874,
"learning_rate": 7.623762376237623e-05,
"loss": 1.2246,
"step": 11730
},
{
"epoch": 9.889692071261882,
"grad_norm": 0.1749766319990158,
"learning_rate": 7.524752475247524e-05,
"loss": 1.2277,
"step": 11740
},
{
"epoch": 9.898115999772326,
"grad_norm": 0.17161355912685394,
"learning_rate": 7.425742574257426e-05,
"loss": 1.2262,
"step": 11750
},
{
"epoch": 9.90653992828277,
"grad_norm": 0.190937340259552,
"learning_rate": 7.326732673267327e-05,
"loss": 1.2276,
"step": 11760
},
{
"epoch": 9.914963856793216,
"grad_norm": 0.18256962299346924,
"learning_rate": 7.227722772277228e-05,
"loss": 1.2274,
"step": 11770
},
{
"epoch": 9.92338778530366,
"grad_norm": 0.1912631094455719,
"learning_rate": 7.128712871287128e-05,
"loss": 1.2243,
"step": 11780
},
{
"epoch": 9.931811713814104,
"grad_norm": 0.19331537187099457,
"learning_rate": 7.02970297029703e-05,
"loss": 1.2261,
"step": 11790
},
{
"epoch": 9.931811713814104,
"eval_accuracy": 0.7455543705350357,
"eval_loss": 1.115136981010437,
"eval_runtime": 887.3277,
"eval_samples_per_second": 562.786,
"eval_steps_per_second": 5.211,
"step": 11790
},
{
"epoch": 9.940235642324549,
"grad_norm": 0.17607170343399048,
"learning_rate": 6.930693069306931e-05,
"loss": 1.228,
"step": 11800
},
{
"epoch": 9.948659570834993,
"grad_norm": 0.17280788719654083,
"learning_rate": 6.831683168316833e-05,
"loss": 1.2269,
"step": 11810
},
{
"epoch": 9.957083499345439,
"grad_norm": 0.19290916621685028,
"learning_rate": 6.732673267326734e-05,
"loss": 1.2279,
"step": 11820
},
{
"epoch": 9.965507427855883,
"grad_norm": 0.19125664234161377,
"learning_rate": 6.633663366336634e-05,
"loss": 1.227,
"step": 11830
},
{
"epoch": 9.973931356366327,
"grad_norm": 0.18251217901706696,
"learning_rate": 6.534653465346535e-05,
"loss": 1.2254,
"step": 11840
},
{
"epoch": 9.982355284876771,
"grad_norm": 0.19647039473056793,
"learning_rate": 6.435643564356436e-05,
"loss": 1.2261,
"step": 11850
},
{
"epoch": 9.990779213387215,
"grad_norm": 0.17714038491249084,
"learning_rate": 6.336633663366337e-05,
"loss": 1.2276,
"step": 11860
},
{
"epoch": 9.999203141897661,
"grad_norm": 0.18365037441253662,
"learning_rate": 6.237623762376238e-05,
"loss": 1.2261,
"step": 11870
},
{
"epoch": 10.007627070408105,
"grad_norm": 0.1910678595304489,
"learning_rate": 6.138613861386138e-05,
"loss": 1.2244,
"step": 11880
},
{
"epoch": 10.007627070408105,
"eval_accuracy": 0.7456593741030724,
"eval_loss": 1.1154232025146484,
"eval_runtime": 887.0764,
"eval_samples_per_second": 562.946,
"eval_steps_per_second": 5.213,
"step": 11880
},
{
"epoch": 10.01605099891855,
"grad_norm": 0.18324702978134155,
"learning_rate": 6.039603960396039e-05,
"loss": 1.2267,
"step": 11890
},
{
"epoch": 10.024474927428994,
"grad_norm": 0.1686498522758484,
"learning_rate": 5.9405940594059404e-05,
"loss": 1.2242,
"step": 11900
},
{
"epoch": 10.03289885593944,
"grad_norm": 0.17256265878677368,
"learning_rate": 5.841584158415842e-05,
"loss": 1.2239,
"step": 11910
},
{
"epoch": 10.041322784449884,
"grad_norm": 0.19624483585357666,
"learning_rate": 5.742574257425743e-05,
"loss": 1.2258,
"step": 11920
},
{
"epoch": 10.049746712960328,
"grad_norm": 0.17262500524520874,
"learning_rate": 5.643564356435644e-05,
"loss": 1.2258,
"step": 11930
},
{
"epoch": 10.058170641470772,
"grad_norm": 0.1741054356098175,
"learning_rate": 5.5445544554455445e-05,
"loss": 1.2245,
"step": 11940
},
{
"epoch": 10.066594569981216,
"grad_norm": 0.17313139140605927,
"learning_rate": 5.4455445544554456e-05,
"loss": 1.2256,
"step": 11950
},
{
"epoch": 10.075018498491662,
"grad_norm": 0.18322905898094177,
"learning_rate": 5.346534653465347e-05,
"loss": 1.2243,
"step": 11960
},
{
"epoch": 10.083442427002106,
"grad_norm": 0.18261946737766266,
"learning_rate": 5.247524752475247e-05,
"loss": 1.2252,
"step": 11970
},
{
"epoch": 10.083442427002106,
"eval_accuracy": 0.7457714664313748,
"eval_loss": 1.1143237352371216,
"eval_runtime": 887.1041,
"eval_samples_per_second": 562.928,
"eval_steps_per_second": 5.212,
"step": 11970
},
{
"epoch": 10.09186635551255,
"grad_norm": 0.1877572238445282,
"learning_rate": 5.1485148514851485e-05,
"loss": 1.2249,
"step": 11980
},
{
"epoch": 10.100290284022995,
"grad_norm": 0.18356889486312866,
"learning_rate": 5.0495049504950497e-05,
"loss": 1.2255,
"step": 11990
},
{
"epoch": 10.108714212533439,
"grad_norm": 0.1898818463087082,
"learning_rate": 4.950495049504951e-05,
"loss": 1.2241,
"step": 12000
},
{
"epoch": 10.117138141043885,
"grad_norm": 0.17149324715137482,
"learning_rate": 4.851485148514852e-05,
"loss": 1.2257,
"step": 12010
},
{
"epoch": 10.125562069554329,
"grad_norm": 0.16672831773757935,
"learning_rate": 4.7524752475247525e-05,
"loss": 1.2255,
"step": 12020
},
{
"epoch": 10.133985998064773,
"grad_norm": 0.16820046305656433,
"learning_rate": 4.653465346534654e-05,
"loss": 1.225,
"step": 12030
},
{
"epoch": 10.142409926575217,
"grad_norm": 0.17770229279994965,
"learning_rate": 4.554455445544554e-05,
"loss": 1.227,
"step": 12040
},
{
"epoch": 10.150833855085661,
"grad_norm": 0.16082800924777985,
"learning_rate": 4.455445544554455e-05,
"loss": 1.2253,
"step": 12050
},
{
"epoch": 10.159257783596107,
"grad_norm": 0.1669086515903473,
"learning_rate": 4.3564356435643565e-05,
"loss": 1.2241,
"step": 12060
},
{
"epoch": 10.159257783596107,
"eval_accuracy": 0.7460534494522424,
"eval_loss": 1.1121779680252075,
"eval_runtime": 882.614,
"eval_samples_per_second": 565.792,
"eval_steps_per_second": 5.239,
"step": 12060
},
{
"epoch": 10.167681712106551,
"grad_norm": 0.17394189536571503,
"learning_rate": 4.257425742574258e-05,
"loss": 1.2238,
"step": 12070
},
{
"epoch": 10.176105640616996,
"grad_norm": 0.1611398160457611,
"learning_rate": 4.158415841584159e-05,
"loss": 1.2243,
"step": 12080
},
{
"epoch": 10.18452956912744,
"grad_norm": 0.16469168663024902,
"learning_rate": 4.0594059405940594e-05,
"loss": 1.2232,
"step": 12090
},
{
"epoch": 10.192953497637886,
"grad_norm": 0.1700202375650406,
"learning_rate": 3.9603960396039605e-05,
"loss": 1.2243,
"step": 12100
},
{
"epoch": 10.20137742614833,
"grad_norm": 0.16961273550987244,
"learning_rate": 3.861386138613862e-05,
"loss": 1.2244,
"step": 12110
},
{
"epoch": 10.209801354658774,
"grad_norm": 0.18176864087581635,
"learning_rate": 3.762376237623762e-05,
"loss": 1.2234,
"step": 12120
},
{
"epoch": 10.218225283169218,
"grad_norm": 0.17132678627967834,
"learning_rate": 3.6633663366336634e-05,
"loss": 1.2231,
"step": 12130
},
{
"epoch": 10.226649211679662,
"grad_norm": 0.1708788424730301,
"learning_rate": 3.564356435643564e-05,
"loss": 1.2228,
"step": 12140
},
{
"epoch": 10.235073140190108,
"grad_norm": 0.16924616694450378,
"learning_rate": 3.465346534653466e-05,
"loss": 1.2241,
"step": 12150
},
{
"epoch": 10.235073140190108,
"eval_accuracy": 0.7462807420235112,
"eval_loss": 1.1115893125534058,
"eval_runtime": 893.1249,
"eval_samples_per_second": 559.133,
"eval_steps_per_second": 5.177,
"step": 12150
},
{
"epoch": 10.243497068700552,
"grad_norm": 0.1617705076932907,
"learning_rate": 3.366336633663367e-05,
"loss": 1.2239,
"step": 12160
},
{
"epoch": 10.251920997210997,
"grad_norm": 0.17731362581253052,
"learning_rate": 3.2673267326732674e-05,
"loss": 1.2232,
"step": 12170
},
{
"epoch": 10.26034492572144,
"grad_norm": 0.17324230074882507,
"learning_rate": 3.1683168316831686e-05,
"loss": 1.224,
"step": 12180
},
{
"epoch": 10.268768854231885,
"grad_norm": 0.15266722440719604,
"learning_rate": 3.069306930693069e-05,
"loss": 1.224,
"step": 12190
},
{
"epoch": 10.27719278274233,
"grad_norm": 0.1547342985868454,
"learning_rate": 2.9702970297029702e-05,
"loss": 1.2232,
"step": 12200
},
{
"epoch": 10.285616711252775,
"grad_norm": 0.15873835980892181,
"learning_rate": 2.8712871287128714e-05,
"loss": 1.2221,
"step": 12210
},
{
"epoch": 10.29404063976322,
"grad_norm": 0.15968631207942963,
"learning_rate": 2.7722772277227722e-05,
"loss": 1.223,
"step": 12220
},
{
"epoch": 10.302464568273663,
"grad_norm": 0.15929782390594482,
"learning_rate": 2.6732673267326734e-05,
"loss": 1.2242,
"step": 12230
},
{
"epoch": 10.31088849678411,
"grad_norm": 0.1512889713048935,
"learning_rate": 2.5742574257425742e-05,
"loss": 1.2223,
"step": 12240
},
{
"epoch": 10.31088849678411,
"eval_accuracy": 0.7462616988558893,
"eval_loss": 1.1114362478256226,
"eval_runtime": 886.8923,
"eval_samples_per_second": 563.063,
"eval_steps_per_second": 5.214,
"step": 12240
},
{
"epoch": 10.319312425294553,
"grad_norm": 0.15943297743797302,
"learning_rate": 2.4752475247524754e-05,
"loss": 1.2224,
"step": 12250
},
{
"epoch": 10.327736353804998,
"grad_norm": 0.16134706139564514,
"learning_rate": 2.3762376237623762e-05,
"loss": 1.2218,
"step": 12260
},
{
"epoch": 10.336160282315442,
"grad_norm": 0.15525278449058533,
"learning_rate": 2.277227722772277e-05,
"loss": 1.2237,
"step": 12270
},
{
"epoch": 10.344584210825886,
"grad_norm": 0.1626599282026291,
"learning_rate": 2.1782178217821783e-05,
"loss": 1.2228,
"step": 12280
},
{
"epoch": 10.353008139336332,
"grad_norm": 0.1533862203359604,
"learning_rate": 2.0792079207920794e-05,
"loss": 1.221,
"step": 12290
},
{
"epoch": 10.361432067846776,
"grad_norm": 0.14988014101982117,
"learning_rate": 1.9801980198019803e-05,
"loss": 1.2238,
"step": 12300
},
{
"epoch": 10.36985599635722,
"grad_norm": 0.15282054245471954,
"learning_rate": 1.881188118811881e-05,
"loss": 1.2202,
"step": 12310
},
{
"epoch": 10.378279924867664,
"grad_norm": 0.1532844454050064,
"learning_rate": 1.782178217821782e-05,
"loss": 1.2222,
"step": 12320
},
{
"epoch": 10.386703853378108,
"grad_norm": 0.15041793882846832,
"learning_rate": 1.6831683168316834e-05,
"loss": 1.2233,
"step": 12330
},
{
"epoch": 10.386703853378108,
"eval_accuracy": 0.7464784909349403,
"eval_loss": 1.1103906631469727,
"eval_runtime": 893.2259,
"eval_samples_per_second": 559.07,
"eval_steps_per_second": 5.177,
"step": 12330
}
],
"logging_steps": 10,
"max_steps": 12500,
"num_input_tokens_seen": 0,
"num_train_epochs": 11,
"save_steps": 90,
"total_flos": 3.205415169974477e+18,
"train_batch_size": 108,
"trial_name": null,
"trial_params": null
}