{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2010,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004975124378109452,
"grad_norm": 11.875,
"learning_rate": 9.950248756218907e-07,
"loss": 4.8907,
"step": 1
},
{
"epoch": 0.0024875621890547263,
"grad_norm": 13.125,
"learning_rate": 4.975124378109453e-06,
"loss": 4.8447,
"step": 5
},
{
"epoch": 0.004975124378109453,
"grad_norm": 12.5,
"learning_rate": 9.950248756218906e-06,
"loss": 4.9268,
"step": 10
},
{
"epoch": 0.007462686567164179,
"grad_norm": 12.375,
"learning_rate": 1.4925373134328357e-05,
"loss": 4.9396,
"step": 15
},
{
"epoch": 0.009950248756218905,
"grad_norm": 14.6875,
"learning_rate": 1.990049751243781e-05,
"loss": 5.0521,
"step": 20
},
{
"epoch": 0.012437810945273632,
"grad_norm": 12.625,
"learning_rate": 2.4875621890547266e-05,
"loss": 4.7898,
"step": 25
},
{
"epoch": 0.014925373134328358,
"grad_norm": 12.875,
"learning_rate": 2.9850746268656714e-05,
"loss": 4.8798,
"step": 30
},
{
"epoch": 0.017412935323383085,
"grad_norm": 11.9375,
"learning_rate": 3.4825870646766175e-05,
"loss": 4.8204,
"step": 35
},
{
"epoch": 0.01990049751243781,
"grad_norm": 11.6875,
"learning_rate": 3.980099502487562e-05,
"loss": 4.684,
"step": 40
},
{
"epoch": 0.022388059701492536,
"grad_norm": 13.1875,
"learning_rate": 4.477611940298508e-05,
"loss": 4.6143,
"step": 45
},
{
"epoch": 0.024875621890547265,
"grad_norm": 10.6875,
"learning_rate": 4.975124378109453e-05,
"loss": 4.5667,
"step": 50
},
{
"epoch": 0.02736318407960199,
"grad_norm": 12.5625,
"learning_rate": 5.472636815920398e-05,
"loss": 4.6985,
"step": 55
},
{
"epoch": 0.029850746268656716,
"grad_norm": 13.375,
"learning_rate": 5.970149253731343e-05,
"loss": 4.5517,
"step": 60
},
{
"epoch": 0.03233830845771144,
"grad_norm": 11.5625,
"learning_rate": 6.46766169154229e-05,
"loss": 4.5351,
"step": 65
},
{
"epoch": 0.03482587064676617,
"grad_norm": 11.5625,
"learning_rate": 6.965174129353235e-05,
"loss": 4.454,
"step": 70
},
{
"epoch": 0.03731343283582089,
"grad_norm": 11.125,
"learning_rate": 7.46268656716418e-05,
"loss": 4.4039,
"step": 75
},
{
"epoch": 0.03980099502487562,
"grad_norm": 10.875,
"learning_rate": 7.960199004975125e-05,
"loss": 4.3716,
"step": 80
},
{
"epoch": 0.04228855721393035,
"grad_norm": 10.8125,
"learning_rate": 8.45771144278607e-05,
"loss": 4.3879,
"step": 85
},
{
"epoch": 0.04477611940298507,
"grad_norm": 10.25,
"learning_rate": 8.955223880597016e-05,
"loss": 4.1858,
"step": 90
},
{
"epoch": 0.0472636815920398,
"grad_norm": 10.5,
"learning_rate": 9.452736318407961e-05,
"loss": 4.421,
"step": 95
},
{
"epoch": 0.04975124378109453,
"grad_norm": 10.375,
"learning_rate": 9.950248756218906e-05,
"loss": 4.3506,
"step": 100
},
{
"epoch": 0.05223880597014925,
"grad_norm": 11.3125,
"learning_rate": 0.0001044776119402985,
"loss": 4.2671,
"step": 105
},
{
"epoch": 0.05472636815920398,
"grad_norm": 8.625,
"learning_rate": 0.00010945273631840796,
"loss": 4.296,
"step": 110
},
{
"epoch": 0.05721393034825871,
"grad_norm": 11.75,
"learning_rate": 0.00011442786069651741,
"loss": 4.1273,
"step": 115
},
{
"epoch": 0.05970149253731343,
"grad_norm": 8.25,
"learning_rate": 0.00011940298507462686,
"loss": 4.3068,
"step": 120
},
{
"epoch": 0.06218905472636816,
"grad_norm": 9.1875,
"learning_rate": 0.0001243781094527363,
"loss": 4.2837,
"step": 125
},
{
"epoch": 0.06467661691542288,
"grad_norm": 11.75,
"learning_rate": 0.0001293532338308458,
"loss": 4.3174,
"step": 130
},
{
"epoch": 0.06716417910447761,
"grad_norm": 10.0,
"learning_rate": 0.00013432835820895525,
"loss": 4.086,
"step": 135
},
{
"epoch": 0.06965174129353234,
"grad_norm": 9.3125,
"learning_rate": 0.0001393034825870647,
"loss": 4.4375,
"step": 140
},
{
"epoch": 0.07213930348258707,
"grad_norm": 8.8125,
"learning_rate": 0.00014427860696517416,
"loss": 4.2058,
"step": 145
},
{
"epoch": 0.07462686567164178,
"grad_norm": 8.6875,
"learning_rate": 0.0001492537313432836,
"loss": 4.1712,
"step": 150
},
{
"epoch": 0.07711442786069651,
"grad_norm": 9.1875,
"learning_rate": 0.00015422885572139304,
"loss": 4.0637,
"step": 155
},
{
"epoch": 0.07960199004975124,
"grad_norm": 9.0,
"learning_rate": 0.0001592039800995025,
"loss": 4.0861,
"step": 160
},
{
"epoch": 0.08208955223880597,
"grad_norm": 10.0625,
"learning_rate": 0.00016417910447761195,
"loss": 4.1879,
"step": 165
},
{
"epoch": 0.0845771144278607,
"grad_norm": 8.5625,
"learning_rate": 0.0001691542288557214,
"loss": 4.2878,
"step": 170
},
{
"epoch": 0.08706467661691543,
"grad_norm": 9.6875,
"learning_rate": 0.00017412935323383086,
"loss": 4.0535,
"step": 175
},
{
"epoch": 0.08955223880597014,
"grad_norm": 8.8125,
"learning_rate": 0.0001791044776119403,
"loss": 4.1332,
"step": 180
},
{
"epoch": 0.09203980099502487,
"grad_norm": 9.8125,
"learning_rate": 0.00018407960199004977,
"loss": 4.114,
"step": 185
},
{
"epoch": 0.0945273631840796,
"grad_norm": 9.25,
"learning_rate": 0.00018905472636815922,
"loss": 4.0242,
"step": 190
},
{
"epoch": 0.09701492537313433,
"grad_norm": 8.875,
"learning_rate": 0.00019402985074626867,
"loss": 4.1829,
"step": 195
},
{
"epoch": 0.09950248756218906,
"grad_norm": 11.375,
"learning_rate": 0.00019900497512437813,
"loss": 4.2259,
"step": 200
},
{
"epoch": 0.10199004975124377,
"grad_norm": 8.4375,
"learning_rate": 0.000199997587258178,
"loss": 4.302,
"step": 205
},
{
"epoch": 0.1044776119402985,
"grad_norm": 10.4375,
"learning_rate": 0.0001999877856940653,
"loss": 4.0564,
"step": 210
},
{
"epoch": 0.10696517412935323,
"grad_norm": 10.8125,
"learning_rate": 0.00019997044524974799,
"loss": 4.0902,
"step": 215
},
{
"epoch": 0.10945273631840796,
"grad_norm": 8.125,
"learning_rate": 0.00019994556723266103,
"loss": 4.0716,
"step": 220
},
{
"epoch": 0.11194029850746269,
"grad_norm": 8.0,
"learning_rate": 0.00019991315351855748,
"loss": 4.1908,
"step": 225
},
{
"epoch": 0.11442786069651742,
"grad_norm": 9.0,
"learning_rate": 0.00019987320655136693,
"loss": 4.2773,
"step": 230
},
{
"epoch": 0.11691542288557213,
"grad_norm": 9.6875,
"learning_rate": 0.00019982572934301122,
"loss": 4.1853,
"step": 235
},
{
"epoch": 0.11940298507462686,
"grad_norm": 9.0625,
"learning_rate": 0.0001997707254731775,
"loss": 4.1019,
"step": 240
},
{
"epoch": 0.12189054726368159,
"grad_norm": 9.3125,
"learning_rate": 0.00019970819908904814,
"loss": 4.198,
"step": 245
},
{
"epoch": 0.12437810945273632,
"grad_norm": 9.125,
"learning_rate": 0.00019963815490498817,
"loss": 4.1697,
"step": 250
},
{
"epoch": 0.12686567164179105,
"grad_norm": 10.625,
"learning_rate": 0.00019956059820218982,
"loss": 4.0784,
"step": 255
},
{
"epoch": 0.12935323383084577,
"grad_norm": 9.0,
"learning_rate": 0.00019947553482827418,
"loss": 4.1693,
"step": 260
},
{
"epoch": 0.1318407960199005,
"grad_norm": 8.0625,
"learning_rate": 0.00019938297119685054,
"loss": 4.0987,
"step": 265
},
{
"epoch": 0.13432835820895522,
"grad_norm": 7.9375,
"learning_rate": 0.00019928291428703262,
"loss": 4.1734,
"step": 270
},
{
"epoch": 0.13681592039800994,
"grad_norm": 7.5625,
"learning_rate": 0.00019917537164291244,
"loss": 4.0456,
"step": 275
},
{
"epoch": 0.13930348258706468,
"grad_norm": 8.25,
"learning_rate": 0.0001990603513729915,
"loss": 4.1025,
"step": 280
},
{
"epoch": 0.1417910447761194,
"grad_norm": 8.625,
"learning_rate": 0.00019893786214956945,
"loss": 4.1861,
"step": 285
},
{
"epoch": 0.14427860696517414,
"grad_norm": 8.5,
"learning_rate": 0.0001988079132080901,
"loss": 4.1516,
"step": 290
},
{
"epoch": 0.14676616915422885,
"grad_norm": 8.0625,
"learning_rate": 0.0001986705143464453,
"loss": 4.0148,
"step": 295
},
{
"epoch": 0.14925373134328357,
"grad_norm": 7.96875,
"learning_rate": 0.0001985256759242359,
"loss": 3.9918,
"step": 300
},
{
"epoch": 0.1517412935323383,
"grad_norm": 7.875,
"learning_rate": 0.00019837340886199096,
"loss": 4.0434,
"step": 305
},
{
"epoch": 0.15422885572139303,
"grad_norm": 7.65625,
"learning_rate": 0.00019821372464034416,
"loss": 4.1499,
"step": 310
},
{
"epoch": 0.15671641791044777,
"grad_norm": 8.75,
"learning_rate": 0.00019804663529916826,
"loss": 4.0495,
"step": 315
},
{
"epoch": 0.15920398009950248,
"grad_norm": 9.6875,
"learning_rate": 0.00019787215343666732,
"loss": 4.0981,
"step": 320
},
{
"epoch": 0.16169154228855723,
"grad_norm": 8.3125,
"learning_rate": 0.00019769029220842677,
"loss": 4.0678,
"step": 325
},
{
"epoch": 0.16417910447761194,
"grad_norm": 8.8125,
"learning_rate": 0.0001975010653264216,
"loss": 4.1043,
"step": 330
},
{
"epoch": 0.16666666666666666,
"grad_norm": 8.5,
"learning_rate": 0.00019730448705798239,
"loss": 4.1361,
"step": 335
},
{
"epoch": 0.1691542288557214,
"grad_norm": 8.1875,
"learning_rate": 0.00019710057222471967,
"loss": 3.9753,
"step": 340
},
{
"epoch": 0.17164179104477612,
"grad_norm": 7.875,
"learning_rate": 0.00019688933620140637,
"loss": 4.1972,
"step": 345
},
{
"epoch": 0.17412935323383086,
"grad_norm": 7.4375,
"learning_rate": 0.0001966707949148186,
"loss": 4.0355,
"step": 350
},
{
"epoch": 0.17661691542288557,
"grad_norm": 8.1875,
"learning_rate": 0.00019644496484253474,
"loss": 4.0079,
"step": 355
},
{
"epoch": 0.1791044776119403,
"grad_norm": 8.4375,
"learning_rate": 0.00019621186301169315,
"loss": 3.9721,
"step": 360
},
{
"epoch": 0.18159203980099503,
"grad_norm": 8.25,
"learning_rate": 0.00019597150699770835,
"loss": 4.1628,
"step": 365
},
{
"epoch": 0.18407960199004975,
"grad_norm": 8.0625,
"learning_rate": 0.0001957239149229458,
"loss": 3.9472,
"step": 370
},
{
"epoch": 0.1865671641791045,
"grad_norm": 8.125,
"learning_rate": 0.00019546910545535558,
"loss": 4.2425,
"step": 375
},
{
"epoch": 0.1890547263681592,
"grad_norm": 9.0625,
"learning_rate": 0.00019520709780706486,
"loss": 4.1314,
"step": 380
},
{
"epoch": 0.19154228855721392,
"grad_norm": 8.5,
"learning_rate": 0.00019493791173292923,
"loss": 4.0324,
"step": 385
},
{
"epoch": 0.19402985074626866,
"grad_norm": 8.0,
"learning_rate": 0.00019466156752904343,
"loss": 4.0719,
"step": 390
},
{
"epoch": 0.19651741293532338,
"grad_norm": 8.0625,
"learning_rate": 0.00019437808603121087,
"loss": 3.8774,
"step": 395
},
{
"epoch": 0.19900497512437812,
"grad_norm": 9.3125,
"learning_rate": 0.00019408748861337273,
"loss": 4.1163,
"step": 400
},
{
"epoch": 0.20149253731343283,
"grad_norm": 8.1875,
"learning_rate": 0.00019378979718599645,
"loss": 4.1658,
"step": 405
},
{
"epoch": 0.20398009950248755,
"grad_norm": 7.625,
"learning_rate": 0.0001934850341944237,
"loss": 4.1059,
"step": 410
},
{
"epoch": 0.2064676616915423,
"grad_norm": 7.9375,
"learning_rate": 0.00019317322261717794,
"loss": 4.0303,
"step": 415
},
{
"epoch": 0.208955223880597,
"grad_norm": 8.25,
"learning_rate": 0.00019285438596423204,
"loss": 3.9217,
"step": 420
},
{
"epoch": 0.21144278606965175,
"grad_norm": 7.375,
"learning_rate": 0.00019252854827523557,
"loss": 4.0601,
"step": 425
},
{
"epoch": 0.21393034825870647,
"grad_norm": 9.6875,
"learning_rate": 0.00019219573411770235,
"loss": 4.025,
"step": 430
},
{
"epoch": 0.21641791044776118,
"grad_norm": 7.71875,
"learning_rate": 0.000191855968585158,
"loss": 4.0082,
"step": 435
},
{
"epoch": 0.21890547263681592,
"grad_norm": 6.46875,
"learning_rate": 0.000191509277295248,
"loss": 3.9052,
"step": 440
},
{
"epoch": 0.22139303482587064,
"grad_norm": 9.3125,
"learning_rate": 0.00019115568638780622,
"loss": 3.8947,
"step": 445
},
{
"epoch": 0.22388059701492538,
"grad_norm": 7.40625,
"learning_rate": 0.00019079522252288386,
"loss": 3.8908,
"step": 450
},
{
"epoch": 0.2263681592039801,
"grad_norm": 8.5,
"learning_rate": 0.00019042791287873957,
"loss": 4.198,
"step": 455
},
{
"epoch": 0.22885572139303484,
"grad_norm": 8.1875,
"learning_rate": 0.00019005378514979008,
"loss": 4.06,
"step": 460
},
{
"epoch": 0.23134328358208955,
"grad_norm": 7.625,
"learning_rate": 0.00018967286754452214,
"loss": 4.1332,
"step": 465
},
{
"epoch": 0.23383084577114427,
"grad_norm": 7.9375,
"learning_rate": 0.0001892851887833657,
"loss": 4.0782,
"step": 470
},
{
"epoch": 0.236318407960199,
"grad_norm": 7.28125,
"learning_rate": 0.0001888907780965284,
"loss": 4.0219,
"step": 475
},
{
"epoch": 0.23880597014925373,
"grad_norm": 7.59375,
"learning_rate": 0.00018848966522179168,
"loss": 4.0916,
"step": 480
},
{
"epoch": 0.24129353233830847,
"grad_norm": 7.53125,
"learning_rate": 0.00018808188040226868,
"loss": 4.1352,
"step": 485
},
{
"epoch": 0.24378109452736318,
"grad_norm": 8.25,
"learning_rate": 0.00018766745438412384,
"loss": 4.0799,
"step": 490
},
{
"epoch": 0.2462686567164179,
"grad_norm": 8.375,
"learning_rate": 0.00018724641841425478,
"loss": 4.0501,
"step": 495
},
{
"epoch": 0.24875621890547264,
"grad_norm": 7.90625,
"learning_rate": 0.00018681880423793642,
"loss": 4.0131,
"step": 500
},
{
"epoch": 0.2512437810945274,
"grad_norm": 8.25,
"learning_rate": 0.00018638464409642723,
"loss": 4.2064,
"step": 505
},
{
"epoch": 0.2537313432835821,
"grad_norm": 7.0,
"learning_rate": 0.00018594397072453856,
"loss": 4.1475,
"step": 510
},
{
"epoch": 0.2562189054726368,
"grad_norm": 7.28125,
"learning_rate": 0.00018549681734816623,
"loss": 3.9829,
"step": 515
},
{
"epoch": 0.25870646766169153,
"grad_norm": 7.5625,
"learning_rate": 0.0001850432176817857,
"loss": 4.0752,
"step": 520
},
{
"epoch": 0.26119402985074625,
"grad_norm": 7.96875,
"learning_rate": 0.00018458320592590975,
"loss": 3.8724,
"step": 525
},
{
"epoch": 0.263681592039801,
"grad_norm": 7.3125,
"learning_rate": 0.00018411681676450999,
"loss": 4.0854,
"step": 530
},
{
"epoch": 0.26616915422885573,
"grad_norm": 7.84375,
"learning_rate": 0.0001836440853624017,
"loss": 4.0408,
"step": 535
},
{
"epoch": 0.26865671641791045,
"grad_norm": 8.6875,
"learning_rate": 0.00018316504736259255,
"loss": 4.0437,
"step": 540
},
{
"epoch": 0.27114427860696516,
"grad_norm": 6.8125,
"learning_rate": 0.00018267973888359509,
"loss": 4.0593,
"step": 545
},
{
"epoch": 0.2736318407960199,
"grad_norm": 7.625,
"learning_rate": 0.00018218819651670356,
"loss": 3.9724,
"step": 550
},
{
"epoch": 0.27611940298507465,
"grad_norm": 7.09375,
"learning_rate": 0.00018169045732323492,
"loss": 4.1018,
"step": 555
},
{
"epoch": 0.27860696517412936,
"grad_norm": 7.75,
"learning_rate": 0.00018118655883173456,
"loss": 4.1389,
"step": 560
},
{
"epoch": 0.2810945273631841,
"grad_norm": 7.75,
"learning_rate": 0.0001806765390351467,
"loss": 4.0369,
"step": 565
},
{
"epoch": 0.2835820895522388,
"grad_norm": 6.84375,
"learning_rate": 0.00018016043638794974,
"loss": 4.1131,
"step": 570
},
{
"epoch": 0.2860696517412935,
"grad_norm": 6.6875,
"learning_rate": 0.00017963828980325697,
"loss": 3.8789,
"step": 575
},
{
"epoch": 0.2885572139303483,
"grad_norm": 8.0625,
"learning_rate": 0.00017911013864988252,
"loss": 4.1892,
"step": 580
},
{
"epoch": 0.291044776119403,
"grad_norm": 7.4375,
"learning_rate": 0.00017857602274937308,
"loss": 4.0332,
"step": 585
},
{
"epoch": 0.2935323383084577,
"grad_norm": 7.6875,
"learning_rate": 0.00017803598237300537,
"loss": 4.0141,
"step": 590
},
{
"epoch": 0.2960199004975124,
"grad_norm": 7.34375,
"learning_rate": 0.00017749005823874988,
"loss": 3.9258,
"step": 595
},
{
"epoch": 0.29850746268656714,
"grad_norm": 6.9375,
"learning_rate": 0.00017693829150820068,
"loss": 4.072,
"step": 600
},
{
"epoch": 0.3009950248756219,
"grad_norm": 7.3125,
"learning_rate": 0.00017638072378347203,
"loss": 3.8492,
"step": 605
},
{
"epoch": 0.3034825870646766,
"grad_norm": 7.1875,
"learning_rate": 0.0001758173971040616,
"loss": 3.8323,
"step": 610
},
{
"epoch": 0.30597014925373134,
"grad_norm": 8.25,
"learning_rate": 0.00017524835394368065,
"loss": 3.9926,
"step": 615
},
{
"epoch": 0.30845771144278605,
"grad_norm": 7.53125,
"learning_rate": 0.00017467363720705204,
"loss": 4.0593,
"step": 620
},
{
"epoch": 0.31094527363184077,
"grad_norm": 7.90625,
"learning_rate": 0.0001740932902266747,
"loss": 3.8775,
"step": 625
},
{
"epoch": 0.31343283582089554,
"grad_norm": 7.8125,
"learning_rate": 0.00017350735675955697,
"loss": 4.1344,
"step": 630
},
{
"epoch": 0.31592039800995025,
"grad_norm": 8.4375,
"learning_rate": 0.000172915880983917,
"loss": 3.948,
"step": 635
},
{
"epoch": 0.31840796019900497,
"grad_norm": 6.96875,
"learning_rate": 0.0001723189074958521,
"loss": 3.9485,
"step": 640
},
{
"epoch": 0.3208955223880597,
"grad_norm": 7.625,
"learning_rate": 0.00017171648130597612,
"loss": 3.9687,
"step": 645
},
{
"epoch": 0.32338308457711445,
"grad_norm": 6.6875,
"learning_rate": 0.0001711086478360257,
"loss": 4.0554,
"step": 650
},
{
"epoch": 0.32587064676616917,
"grad_norm": 6.9375,
"learning_rate": 0.0001704954529154359,
"loss": 4.0395,
"step": 655
},
{
"epoch": 0.3283582089552239,
"grad_norm": 6.71875,
"learning_rate": 0.00016987694277788417,
"loss": 3.9427,
"step": 660
},
{
"epoch": 0.3308457711442786,
"grad_norm": 7.4375,
"learning_rate": 0.000169253164057805,
"loss": 3.9438,
"step": 665
},
{
"epoch": 0.3333333333333333,
"grad_norm": 7.6875,
"learning_rate": 0.0001686241637868734,
"loss": 4.1186,
"step": 670
},
{
"epoch": 0.3358208955223881,
"grad_norm": 6.59375,
"learning_rate": 0.00016798998939045895,
"loss": 4.0849,
"step": 675
},
{
"epoch": 0.3383084577114428,
"grad_norm": 7.375,
"learning_rate": 0.00016735068868404998,
"loss": 3.9868,
"step": 680
},
{
"epoch": 0.3407960199004975,
"grad_norm": 6.9375,
"learning_rate": 0.0001667063098696485,
"loss": 3.9275,
"step": 685
},
{
"epoch": 0.34328358208955223,
"grad_norm": 8.125,
"learning_rate": 0.0001660569015321357,
"loss": 4.0451,
"step": 690
},
{
"epoch": 0.34577114427860695,
"grad_norm": 8.3125,
"learning_rate": 0.00016540251263560878,
"loss": 3.9818,
"step": 695
},
{
"epoch": 0.3482587064676617,
"grad_norm": 8.875,
"learning_rate": 0.00016474319251968923,
"loss": 3.9491,
"step": 700
},
{
"epoch": 0.35074626865671643,
"grad_norm": 6.65625,
"learning_rate": 0.00016407899089580262,
"loss": 3.9901,
"step": 705
},
{
"epoch": 0.35323383084577115,
"grad_norm": 7.46875,
"learning_rate": 0.0001634099578434306,
"loss": 3.9471,
"step": 710
},
{
"epoch": 0.35572139303482586,
"grad_norm": 7.375,
"learning_rate": 0.00016273614380633484,
"loss": 3.897,
"step": 715
},
{
"epoch": 0.3582089552238806,
"grad_norm": 8.4375,
"learning_rate": 0.0001620575995887538,
"loss": 3.9658,
"step": 720
},
{
"epoch": 0.36069651741293535,
"grad_norm": 6.78125,
"learning_rate": 0.00016137437635157213,
"loss": 3.9457,
"step": 725
},
{
"epoch": 0.36318407960199006,
"grad_norm": 6.96875,
"learning_rate": 0.00016068652560846327,
"loss": 4.143,
"step": 730
},
{
"epoch": 0.3656716417910448,
"grad_norm": 7.21875,
"learning_rate": 0.0001599940992220053,
"loss": 4.0813,
"step": 735
},
{
"epoch": 0.3681592039800995,
"grad_norm": 7.40625,
"learning_rate": 0.0001592971493997709,
"loss": 4.019,
"step": 740
},
{
"epoch": 0.3706467661691542,
"grad_norm": 6.75,
"learning_rate": 0.00015859572869039064,
"loss": 4.0779,
"step": 745
},
{
"epoch": 0.373134328358209,
"grad_norm": 8.1875,
"learning_rate": 0.00015788988997959114,
"loss": 4.1056,
"step": 750
},
{
"epoch": 0.3756218905472637,
"grad_norm": 6.53125,
"learning_rate": 0.00015717968648620764,
"loss": 4.0207,
"step": 755
},
{
"epoch": 0.3781094527363184,
"grad_norm": 7.8125,
"learning_rate": 0.00015646517175817114,
"loss": 4.2123,
"step": 760
},
{
"epoch": 0.3805970149253731,
"grad_norm": 6.84375,
"learning_rate": 0.00015574639966847126,
"loss": 4.0826,
"step": 765
},
{
"epoch": 0.38308457711442784,
"grad_norm": 6.71875,
"learning_rate": 0.00015502342441109422,
"loss": 4.0236,
"step": 770
},
{
"epoch": 0.3855721393034826,
"grad_norm": 7.625,
"learning_rate": 0.00015429630049693674,
"loss": 3.9291,
"step": 775
},
{
"epoch": 0.3880597014925373,
"grad_norm": 7.875,
"learning_rate": 0.00015356508274969594,
"loss": 4.0301,
"step": 780
},
{
"epoch": 0.39054726368159204,
"grad_norm": 7.03125,
"learning_rate": 0.00015282982630173585,
"loss": 3.9478,
"step": 785
},
{
"epoch": 0.39303482587064675,
"grad_norm": 6.0625,
"learning_rate": 0.00015209058658993056,
"loss": 3.9102,
"step": 790
},
{
"epoch": 0.39552238805970147,
"grad_norm": 6.46875,
"learning_rate": 0.0001513474193514842,
"loss": 4.0111,
"step": 795
},
{
"epoch": 0.39800995024875624,
"grad_norm": 7.15625,
"learning_rate": 0.00015060038061972874,
"loss": 3.9447,
"step": 800
},
{
"epoch": 0.40049751243781095,
"grad_norm": 5.5,
"learning_rate": 0.000149849526719899,
"loss": 3.7303,
"step": 805
},
{
"epoch": 0.40298507462686567,
"grad_norm": 7.21875,
"learning_rate": 0.00014909491426488578,
"loss": 4.1654,
"step": 810
},
{
"epoch": 0.4054726368159204,
"grad_norm": 7.46875,
"learning_rate": 0.00014833660015096766,
"loss": 3.8909,
"step": 815
},
{
"epoch": 0.4079601990049751,
"grad_norm": 6.84375,
"learning_rate": 0.00014757464155352082,
"loss": 3.9657,
"step": 820
},
{
"epoch": 0.41044776119402987,
"grad_norm": 7.125,
"learning_rate": 0.0001468090959227082,
"loss": 3.9625,
"step": 825
},
{
"epoch": 0.4129353233830846,
"grad_norm": 7.34375,
"learning_rate": 0.00014604002097914806,
"loss": 3.8299,
"step": 830
},
{
"epoch": 0.4154228855721393,
"grad_norm": 7.15625,
"learning_rate": 0.00014526747470956176,
"loss": 3.9513,
"step": 835
},
{
"epoch": 0.417910447761194,
"grad_norm": 6.09375,
"learning_rate": 0.00014449151536240166,
"loss": 3.828,
"step": 840
},
{
"epoch": 0.42039800995024873,
"grad_norm": 6.0625,
"learning_rate": 0.00014371220144345954,
"loss": 3.9232,
"step": 845
},
{
"epoch": 0.4228855721393035,
"grad_norm": 5.96875,
"learning_rate": 0.0001429295917114551,
"loss": 3.8572,
"step": 850
},
{
"epoch": 0.4253731343283582,
"grad_norm": 7.03125,
"learning_rate": 0.00014214374517360575,
"loss": 3.9477,
"step": 855
},
{
"epoch": 0.42786069651741293,
"grad_norm": 7.90625,
"learning_rate": 0.00014135472108117787,
"loss": 4.2486,
"step": 860
},
{
"epoch": 0.43034825870646765,
"grad_norm": 6.34375,
"learning_rate": 0.00014056257892501885,
"loss": 3.9868,
"step": 865
},
{
"epoch": 0.43283582089552236,
"grad_norm": 6.5,
"learning_rate": 0.00013976737843107202,
"loss": 4.1234,
"step": 870
},
{
"epoch": 0.43532338308457713,
"grad_norm": 7.0625,
"learning_rate": 0.00013896917955587328,
"loss": 4.006,
"step": 875
},
{
"epoch": 0.43781094527363185,
"grad_norm": 7.375,
"learning_rate": 0.00013816804248203052,
"loss": 3.9775,
"step": 880
},
{
"epoch": 0.44029850746268656,
"grad_norm": 6.84375,
"learning_rate": 0.00013736402761368598,
"loss": 3.9257,
"step": 885
},
{
"epoch": 0.4427860696517413,
"grad_norm": 6.78125,
"learning_rate": 0.00013655719557196185,
"loss": 3.9621,
"step": 890
},
{
"epoch": 0.44527363184079605,
"grad_norm": 7.25,
"learning_rate": 0.0001357476071903896,
"loss": 3.8718,
"step": 895
},
{
"epoch": 0.44776119402985076,
"grad_norm": 7.4375,
"learning_rate": 0.0001349353235103232,
"loss": 3.9892,
"step": 900
},
{
"epoch": 0.4502487562189055,
"grad_norm": 7.625,
"learning_rate": 0.00013412040577633687,
"loss": 4.2505,
"step": 905
},
{
"epoch": 0.4527363184079602,
"grad_norm": 7.03125,
"learning_rate": 0.0001333029154316072,
"loss": 3.9349,
"step": 910
},
{
"epoch": 0.4552238805970149,
"grad_norm": 6.9375,
"learning_rate": 0.00013248291411328047,
"loss": 3.9892,
"step": 915
},
{
"epoch": 0.4577114427860697,
"grad_norm": 5.84375,
"learning_rate": 0.00013166046364782545,
"loss": 3.9654,
"step": 920
},
{
"epoch": 0.4601990049751244,
"grad_norm": 7.03125,
"learning_rate": 0.0001308356260463717,
"loss": 4.0497,
"step": 925
},
{
"epoch": 0.4626865671641791,
"grad_norm": 7.34375,
"learning_rate": 0.0001300084635000341,
"loss": 3.8808,
"step": 930
},
{
"epoch": 0.4651741293532338,
"grad_norm": 6.71875,
"learning_rate": 0.0001291790383752237,
"loss": 3.957,
"step": 935
},
{
"epoch": 0.46766169154228854,
"grad_norm": 7.59375,
"learning_rate": 0.00012834741320894553,
"loss": 3.936,
"step": 940
},
{
"epoch": 0.4701492537313433,
"grad_norm": 6.34375,
"learning_rate": 0.00012751365070408333,
"loss": 4.0231,
"step": 945
},
{
"epoch": 0.472636815920398,
"grad_norm": 5.96875,
"learning_rate": 0.00012667781372467202,
"loss": 4.0101,
"step": 950
},
{
"epoch": 0.47512437810945274,
"grad_norm": 7.15625,
"learning_rate": 0.00012583996529115762,
"loss": 3.9361,
"step": 955
},
{
"epoch": 0.47761194029850745,
"grad_norm": 6.5625,
"learning_rate": 0.00012500016857564585,
"loss": 4.0114,
"step": 960
},
{
"epoch": 0.48009950248756217,
"grad_norm": 6.875,
"learning_rate": 0.00012415848689713903,
"loss": 3.9577,
"step": 965
},
{
"epoch": 0.48258706467661694,
"grad_norm": 6.875,
"learning_rate": 0.00012331498371676204,
"loss": 3.8951,
"step": 970
},
{
"epoch": 0.48507462686567165,
"grad_norm": 7.0,
"learning_rate": 0.0001224697226329772,
"loss": 3.9695,
"step": 975
},
{
"epoch": 0.48756218905472637,
"grad_norm": 6.09375,
"learning_rate": 0.00012162276737678933,
"loss": 3.9444,
"step": 980
},
{
"epoch": 0.4900497512437811,
"grad_norm": 6.75,
"learning_rate": 0.0001207741818069405,
"loss": 3.9877,
"step": 985
},
{
"epoch": 0.4925373134328358,
"grad_norm": 6.875,
"learning_rate": 0.00011992402990509515,
"loss": 3.9706,
"step": 990
},
{
"epoch": 0.49502487562189057,
"grad_norm": 6.9375,
"learning_rate": 0.00011907237577101611,
"loss": 3.8701,
"step": 995
},
{
"epoch": 0.4975124378109453,
"grad_norm": 7.46875,
"learning_rate": 0.00011821928361773147,
"loss": 4.0109,
"step": 1000
},
{
"epoch": 0.5,
"grad_norm": 6.53125,
"learning_rate": 0.00011736481776669306,
"loss": 3.9716,
"step": 1005
},
{
"epoch": 0.5024875621890548,
"grad_norm": 6.8125,
"learning_rate": 0.00011650904264292687,
"loss": 3.9534,
"step": 1010
},
{
"epoch": 0.5049751243781094,
"grad_norm": 8.5,
"learning_rate": 0.00011565202277017551,
"loss": 4.0376,
"step": 1015
},
{
"epoch": 0.5074626865671642,
"grad_norm": 5.75,
"learning_rate": 0.000114793822766033,
"loss": 3.9223,
"step": 1020
},
{
"epoch": 0.5099502487562189,
"grad_norm": 7.0,
"learning_rate": 0.00011393450733707309,
"loss": 4.11,
"step": 1025
},
{
"epoch": 0.5124378109452736,
"grad_norm": 6.6875,
"learning_rate": 0.00011307414127397027,
"loss": 4.0138,
"step": 1030
},
{
"epoch": 0.5149253731343284,
"grad_norm": 7.09375,
"learning_rate": 0.00011221278944661473,
"loss": 3.8801,
"step": 1035
},
{
"epoch": 0.5174129353233831,
"grad_norm": 7.84375,
"learning_rate": 0.00011135051679922141,
"loss": 4.0368,
"step": 1040
},
{
"epoch": 0.5199004975124378,
"grad_norm": 6.4375,
"learning_rate": 0.00011048738834543319,
"loss": 3.8343,
"step": 1045
},
{
"epoch": 0.5223880597014925,
"grad_norm": 7.21875,
"learning_rate": 0.00010962346916341903,
"loss": 3.885,
"step": 1050
},
{
"epoch": 0.5248756218905473,
"grad_norm": 6.125,
"learning_rate": 0.00010875882439096729,
"loss": 3.9348,
"step": 1055
},
{
"epoch": 0.527363184079602,
"grad_norm": 6.375,
"learning_rate": 0.00010789351922057435,
"loss": 3.9439,
"step": 1060
},
{
"epoch": 0.5298507462686567,
"grad_norm": 6.4375,
"learning_rate": 0.0001070276188945293,
"loss": 3.7975,
"step": 1065
},
{
"epoch": 0.5323383084577115,
"grad_norm": 6.25,
"learning_rate": 0.00010616118869999483,
"loss": 3.8004,
"step": 1070
},
{
"epoch": 0.5348258706467661,
"grad_norm": 5.875,
"learning_rate": 0.00010529429396408452,
"loss": 3.967,
"step": 1075
},
{
"epoch": 0.5373134328358209,
"grad_norm": 7.15625,
"learning_rate": 0.00010442700004893764,
"loss": 3.8504,
"step": 1080
},
{
"epoch": 0.5398009950248757,
"grad_norm": 6.46875,
"learning_rate": 0.00010355937234679065,
"loss": 3.7783,
"step": 1085
},
{
"epoch": 0.5422885572139303,
"grad_norm": 7.0,
"learning_rate": 0.00010269147627504692,
"loss": 3.7741,
"step": 1090
},
{
"epoch": 0.5447761194029851,
"grad_norm": 6.3125,
"learning_rate": 0.0001018233772713443,
"loss": 3.9042,
"step": 1095
},
{
"epoch": 0.5472636815920398,
"grad_norm": 6.0,
"learning_rate": 0.00010095514078862147,
"loss": 4.0435,
"step": 1100
},
{
"epoch": 0.5497512437810945,
"grad_norm": 6.6875,
"learning_rate": 0.00010008683229018256,
"loss": 4.0422,
"step": 1105
},
{
"epoch": 0.5522388059701493,
"grad_norm": 7.0,
"learning_rate": 9.92185172447616e-05,
"loss": 3.9499,
"step": 1110
},
{
"epoch": 0.554726368159204,
"grad_norm": 7.125,
"learning_rate": 9.835026112158637e-05,
"loss": 3.9851,
"step": 1115
},
{
"epoch": 0.5572139303482587,
"grad_norm": 6.25,
"learning_rate": 9.74821293854419e-05,
"loss": 3.9625,
"step": 1120
},
{
"epoch": 0.5597014925373134,
"grad_norm": 6.75,
"learning_rate": 9.661418749173467e-05,
"loss": 3.9269,
"step": 1125
},
{
"epoch": 0.5621890547263682,
"grad_norm": 6.15625,
"learning_rate": 9.574650088155752e-05,
"loss": 4.0838,
"step": 1130
},
{
"epoch": 0.5646766169154229,
"grad_norm": 7.40625,
"learning_rate": 9.487913497675536e-05,
"loss": 4.0415,
"step": 1135
},
{
"epoch": 0.5671641791044776,
"grad_norm": 6.59375,
"learning_rate": 9.40121551749925e-05,
"loss": 4.0316,
"step": 1140
},
{
"epoch": 0.5696517412935324,
"grad_norm": 6.0,
"learning_rate": 9.314562684482202e-05,
"loss": 4.0425,
"step": 1145
},
{
"epoch": 0.572139303482587,
"grad_norm": 6.25,
"learning_rate": 9.227961532075671e-05,
"loss": 3.9342,
"step": 1150
},
{
"epoch": 0.5746268656716418,
"grad_norm": 6.65625,
"learning_rate": 9.141418589834339e-05,
"loss": 4.0811,
"step": 1155
},
{
"epoch": 0.5771144278606966,
"grad_norm": 7.5625,
"learning_rate": 9.054940382923953e-05,
"loss": 4.0697,
"step": 1160
},
{
"epoch": 0.5796019900497512,
"grad_norm": 7.4375,
"learning_rate": 8.96853343162934e-05,
"loss": 4.0852,
"step": 1165
},
{
"epoch": 0.582089552238806,
"grad_norm": 5.8125,
"learning_rate": 8.882204250862796e-05,
"loss": 4.0077,
"step": 1170
},
{
"epoch": 0.5845771144278606,
"grad_norm": 6.65625,
"learning_rate": 8.795959349672878e-05,
"loss": 3.9226,
"step": 1175
},
{
"epoch": 0.5870646766169154,
"grad_norm": 6.78125,
"learning_rate": 8.709805230753627e-05,
"loss": 4.0092,
"step": 1180
},
{
"epoch": 0.5895522388059702,
"grad_norm": 6.4375,
"learning_rate": 8.623748389954283e-05,
"loss": 3.9131,
"step": 1185
},
{
"epoch": 0.5920398009950248,
"grad_norm": 6.375,
"learning_rate": 8.537795315789509e-05,
"loss": 3.857,
"step": 1190
},
{
"epoch": 0.5945273631840796,
"grad_norm": 5.8125,
"learning_rate": 8.451952488950166e-05,
"loss": 3.9707,
"step": 1195
},
{
"epoch": 0.5970149253731343,
"grad_norm": 7.15625,
"learning_rate": 8.366226381814697e-05,
"loss": 3.9853,
"step": 1200
},
{
"epoch": 0.599502487562189,
"grad_norm": 5.9375,
"learning_rate": 8.280623457961102e-05,
"loss": 4.107,
"step": 1205
},
{
"epoch": 0.6019900497512438,
"grad_norm": 6.4375,
"learning_rate": 8.195150171679608e-05,
"loss": 3.7088,
"step": 1210
},
{
"epoch": 0.6044776119402985,
"grad_norm": 6.15625,
"learning_rate": 8.109812967486025e-05,
"loss": 3.9205,
"step": 1215
},
{
"epoch": 0.6069651741293532,
"grad_norm": 7.1875,
"learning_rate": 8.02461827963585e-05,
"loss": 3.9086,
"step": 1220
},
{
"epoch": 0.6094527363184079,
"grad_norm": 6.3125,
"learning_rate": 7.939572531639128e-05,
"loss": 3.9716,
"step": 1225
},
{
"epoch": 0.6119402985074627,
"grad_norm": 6.40625,
"learning_rate": 7.854682135776131e-05,
"loss": 3.9194,
"step": 1230
},
{
"epoch": 0.6144278606965174,
"grad_norm": 6.875,
"learning_rate": 7.769953492613899e-05,
"loss": 3.8653,
"step": 1235
},
{
"epoch": 0.6169154228855721,
"grad_norm": 6.625,
"learning_rate": 7.685392990523626e-05,
"loss": 4.043,
"step": 1240
},
{
"epoch": 0.6194029850746269,
"grad_norm": 6.6875,
"learning_rate": 7.601007005199021e-05,
"loss": 3.829,
"step": 1245
},
{
"epoch": 0.6218905472636815,
"grad_norm": 6.53125,
"learning_rate": 7.516801899175565e-05,
"loss": 3.9138,
"step": 1250
},
{
"epoch": 0.6243781094527363,
"grad_norm": 6.5,
"learning_rate": 7.432784021350796e-05,
"loss": 3.9103,
"step": 1255
},
{
"epoch": 0.6268656716417911,
"grad_norm": 6.15625,
"learning_rate": 7.348959706505626e-05,
"loss": 3.9792,
"step": 1260
},
{
"epoch": 0.6293532338308457,
"grad_norm": 6.28125,
"learning_rate": 7.265335274826704e-05,
"loss": 4.0775,
"step": 1265
},
{
"epoch": 0.6318407960199005,
"grad_norm": 6.59375,
"learning_rate": 7.181917031429874e-05,
"loss": 4.0234,
"step": 1270
},
{
"epoch": 0.6343283582089553,
"grad_norm": 7.03125,
"learning_rate": 7.09871126588481e-05,
"loss": 3.9329,
"step": 1275
},
{
"epoch": 0.6368159203980099,
"grad_norm": 6.0625,
"learning_rate": 7.015724251740766e-05,
"loss": 3.6704,
"step": 1280
},
{
"epoch": 0.6393034825870647,
"grad_norm": 6.71875,
"learning_rate": 6.932962246053577e-05,
"loss": 3.8563,
"step": 1285
},
{
"epoch": 0.6417910447761194,
"grad_norm": 5.84375,
"learning_rate": 6.850431488913895e-05,
"loss": 3.8506,
"step": 1290
},
{
"epoch": 0.6442786069651741,
"grad_norm": 6.8125,
"learning_rate": 6.76813820297669e-05,
"loss": 4.008,
"step": 1295
},
{
"epoch": 0.6467661691542289,
"grad_norm": 7.09375,
"learning_rate": 6.686088592992067e-05,
"loss": 4.0959,
"step": 1300
},
{
"epoch": 0.6492537313432836,
"grad_norm": 6.375,
"learning_rate": 6.604288845337453e-05,
"loss": 4.0365,
"step": 1305
},
{
"epoch": 0.6517412935323383,
"grad_norm": 6.96875,
"learning_rate": 6.522745127551158e-05,
"loss": 3.8927,
"step": 1310
},
{
"epoch": 0.654228855721393,
"grad_norm": 6.96875,
"learning_rate": 6.44146358786734e-05,
"loss": 3.9165,
"step": 1315
},
{
"epoch": 0.6567164179104478,
"grad_norm": 6.65625,
"learning_rate": 6.360450354752458e-05,
"loss": 4.1257,
"step": 1320
},
{
"epoch": 0.6592039800995025,
"grad_norm": 7.125,
"learning_rate": 6.279711536443185e-05,
"loss": 3.9571,
"step": 1325
},
{
"epoch": 0.6616915422885572,
"grad_norm": 6.125,
"learning_rate": 6.199253220485856e-05,
"loss": 3.7978,
"step": 1330
},
{
"epoch": 0.664179104477612,
"grad_norm": 6.59375,
"learning_rate": 6.119081473277501e-05,
"loss": 3.859,
"step": 1335
},
{
"epoch": 0.6666666666666666,
"grad_norm": 6.1875,
"learning_rate": 6.039202339608432e-05,
"loss": 4.015,
"step": 1340
},
{
"epoch": 0.6691542288557214,
"grad_norm": 6.625,
"learning_rate": 5.959621842206474e-05,
"loss": 4.0804,
"step": 1345
},
{
"epoch": 0.6716417910447762,
"grad_norm": 6.625,
"learning_rate": 5.880345981282876e-05,
"loss": 4.0607,
"step": 1350
},
{
"epoch": 0.6741293532338308,
"grad_norm": 6.46875,
"learning_rate": 5.801380734079907e-05,
"loss": 3.8616,
"step": 1355
},
{
"epoch": 0.6766169154228856,
"grad_norm": 6.375,
"learning_rate": 5.722732054420172e-05,
"loss": 3.8968,
"step": 1360
},
{
"epoch": 0.6791044776119403,
"grad_norm": 6.71875,
"learning_rate": 5.6444058722577165e-05,
"loss": 4.0431,
"step": 1365
},
{
"epoch": 0.681592039800995,
"grad_norm": 6.65625,
"learning_rate": 5.566408093230911e-05,
"loss": 3.9798,
"step": 1370
},
{
"epoch": 0.6840796019900498,
"grad_norm": 5.375,
"learning_rate": 5.4887445982171906e-05,
"loss": 3.7958,
"step": 1375
},
{
"epoch": 0.6865671641791045,
"grad_norm": 5.96875,
"learning_rate": 5.4114212428896424e-05,
"loss": 3.9962,
"step": 1380
},
{
"epoch": 0.6890547263681592,
"grad_norm": 6.0625,
"learning_rate": 5.334443857275487e-05,
"loss": 4.009,
"step": 1385
},
{
"epoch": 0.6915422885572139,
"grad_norm": 6.65625,
"learning_rate": 5.257818245316522e-05,
"loss": 3.9681,
"step": 1390
},
{
"epoch": 0.6940298507462687,
"grad_norm": 7.21875,
"learning_rate": 5.1815501844315105e-05,
"loss": 4.0784,
"step": 1395
},
{
"epoch": 0.6965174129353234,
"grad_norm": 7.375,
"learning_rate": 5.105645425080572e-05,
"loss": 4.0183,
"step": 1400
},
{
"epoch": 0.6990049751243781,
"grad_norm": 6.0625,
"learning_rate": 5.030109690331625e-05,
"loss": 3.9356,
"step": 1405
},
{
"epoch": 0.7014925373134329,
"grad_norm": 7.625,
"learning_rate": 4.954948675428853e-05,
"loss": 3.7845,
"step": 1410
},
{
"epoch": 0.7039800995024875,
"grad_norm": 6.1875,
"learning_rate": 4.880168047363312e-05,
"loss": 3.7763,
"step": 1415
},
{
"epoch": 0.7064676616915423,
"grad_norm": 6.46875,
"learning_rate": 4.8057734444456536e-05,
"loss": 4.0405,
"step": 1420
},
{
"epoch": 0.7089552238805971,
"grad_norm": 6.09375,
"learning_rate": 4.7317704758809946e-05,
"loss": 3.9666,
"step": 1425
},
{
"epoch": 0.7114427860696517,
"grad_norm": 7.0,
"learning_rate": 4.658164721345998e-05,
"loss": 3.9511,
"step": 1430
},
{
"epoch": 0.7139303482587065,
"grad_norm": 7.03125,
"learning_rate": 4.584961730568188e-05,
"loss": 4.0864,
"step": 1435
},
{
"epoch": 0.7164179104477612,
"grad_norm": 6.625,
"learning_rate": 4.512167022907494e-05,
"loss": 4.0077,
"step": 1440
},
{
"epoch": 0.7189054726368159,
"grad_norm": 6.53125,
"learning_rate": 4.439786086940115e-05,
"loss": 3.8572,
"step": 1445
},
{
"epoch": 0.7213930348258707,
"grad_norm": 6.5625,
"learning_rate": 4.3678243800446835e-05,
"loss": 3.812,
"step": 1450
},
{
"epoch": 0.7238805970149254,
"grad_norm": 5.375,
"learning_rate": 4.296287327990797e-05,
"loss": 3.816,
"step": 1455
},
{
"epoch": 0.7263681592039801,
"grad_norm": 6.03125,
"learning_rate": 4.225180324529917e-05,
"loss": 3.8844,
"step": 1460
},
{
"epoch": 0.7288557213930348,
"grad_norm": 7.46875,
"learning_rate": 4.1545087309887045e-05,
"loss": 3.9426,
"step": 1465
},
{
"epoch": 0.7313432835820896,
"grad_norm": 6.0,
"learning_rate": 4.084277875864776e-05,
"loss": 3.9788,
"step": 1470
},
{
"epoch": 0.7338308457711443,
"grad_norm": 6.34375,
"learning_rate": 4.014493054424944e-05,
"loss": 4.0493,
"step": 1475
},
{
"epoch": 0.736318407960199,
"grad_norm": 6.78125,
"learning_rate": 3.945159528305971e-05,
"loss": 4.0197,
"step": 1480
},
{
"epoch": 0.7388059701492538,
"grad_norm": 6.8125,
"learning_rate": 3.876282525117847e-05,
"loss": 3.9014,
"step": 1485
},
{
"epoch": 0.7412935323383084,
"grad_norm": 7.15625,
"learning_rate": 3.807867238049642e-05,
"loss": 3.987,
"step": 1490
},
{
"epoch": 0.7437810945273632,
"grad_norm": 6.0,
"learning_rate": 3.739918825477953e-05,
"loss": 3.9318,
"step": 1495
},
{
"epoch": 0.746268656716418,
"grad_norm": 5.5625,
"learning_rate": 3.672442410577965e-05,
"loss": 3.8518,
"step": 1500
},
{
"epoch": 0.7487562189054726,
"grad_norm": 5.59375,
"learning_rate": 3.605443080937172e-05,
"loss": 3.7997,
"step": 1505
},
{
"epoch": 0.7512437810945274,
"grad_norm": 6.09375,
"learning_rate": 3.5389258881718e-05,
"loss": 3.9,
"step": 1510
},
{
"epoch": 0.753731343283582,
"grad_norm": 6.4375,
"learning_rate": 3.472895847545905e-05,
"loss": 4.005,
"step": 1515
},
{
"epoch": 0.7562189054726368,
"grad_norm": 7.375,
"learning_rate": 3.407357937593237e-05,
"loss": 3.9962,
"step": 1520
},
{
"epoch": 0.7587064676616916,
"grad_norm": 6.5,
"learning_rate": 3.342317099741886e-05,
"loss": 3.9809,
"step": 1525
},
{
"epoch": 0.7611940298507462,
"grad_norm": 6.875,
"learning_rate": 3.27777823794168e-05,
"loss": 3.9891,
"step": 1530
},
{
"epoch": 0.763681592039801,
"grad_norm": 6.84375,
"learning_rate": 3.213746218294455e-05,
"loss": 4.0958,
"step": 1535
},
{
"epoch": 0.7661691542288557,
"grad_norm": 6.1875,
"learning_rate": 3.150225868687161e-05,
"loss": 3.838,
"step": 1540
},
{
"epoch": 0.7686567164179104,
"grad_norm": 7.46875,
"learning_rate": 3.0872219784278354e-05,
"loss": 3.9027,
"step": 1545
},
{
"epoch": 0.7711442786069652,
"grad_norm": 5.78125,
"learning_rate": 3.02473929788452e-05,
"loss": 3.9055,
"step": 1550
},
{
"epoch": 0.7736318407960199,
"grad_norm": 6.0,
"learning_rate": 2.96278253812707e-05,
"loss": 3.9548,
"step": 1555
},
{
"epoch": 0.7761194029850746,
"grad_norm": 6.71875,
"learning_rate": 2.901356370571967e-05,
"loss": 3.8413,
"step": 1560
},
{
"epoch": 0.7786069651741293,
"grad_norm": 6.625,
"learning_rate": 2.840465426630091e-05,
"loss": 4.1502,
"step": 1565
},
{
"epoch": 0.7810945273631841,
"grad_norm": 6.53125,
"learning_rate": 2.7801142973575243e-05,
"loss": 3.851,
"step": 1570
},
{
"epoch": 0.7835820895522388,
"grad_norm": 6.125,
"learning_rate": 2.7203075331094017e-05,
"loss": 4.0059,
"step": 1575
},
{
"epoch": 0.7860696517412935,
"grad_norm": 7.0,
"learning_rate": 2.6610496431968125e-05,
"loss": 3.8795,
"step": 1580
},
{
"epoch": 0.7885572139303483,
"grad_norm": 6.53125,
"learning_rate": 2.6023450955468176e-05,
"loss": 3.8933,
"step": 1585
},
{
"epoch": 0.7910447761194029,
"grad_norm": 5.96875,
"learning_rate": 2.54419831636557e-05,
"loss": 4.1032,
"step": 1590
},
{
"epoch": 0.7935323383084577,
"grad_norm": 5.8125,
"learning_rate": 2.4866136898045843e-05,
"loss": 3.8866,
"step": 1595
},
{
"epoch": 0.7960199004975125,
"grad_norm": 6.375,
"learning_rate": 2.4295955576301965e-05,
"loss": 4.0359,
"step": 1600
},
{
"epoch": 0.7985074626865671,
"grad_norm": 6.75,
"learning_rate": 2.3731482188961818e-05,
"loss": 3.8639,
"step": 1605
},
{
"epoch": 0.8009950248756219,
"grad_norm": 6.78125,
"learning_rate": 2.317275929619627e-05,
"loss": 4.0732,
"step": 1610
},
{
"epoch": 0.8034825870646766,
"grad_norm": 7.625,
"learning_rate": 2.261982902460039e-05,
"loss": 3.9888,
"step": 1615
},
{
"epoch": 0.8059701492537313,
"grad_norm": 6.0,
"learning_rate": 2.2072733064017103e-05,
"loss": 4.0829,
"step": 1620
},
{
"epoch": 0.8084577114427861,
"grad_norm": 8.0625,
"learning_rate": 2.1531512664393838e-05,
"loss": 4.1679,
"step": 1625
},
{
"epoch": 0.8109452736318408,
"grad_norm": 7.0,
"learning_rate": 2.0996208632672475e-05,
"loss": 4.0939,
"step": 1630
},
{
"epoch": 0.8134328358208955,
"grad_norm": 6.8125,
"learning_rate": 2.0466861329712473e-05,
"loss": 3.8609,
"step": 1635
},
{
"epoch": 0.8159203980099502,
"grad_norm": 6.71875,
"learning_rate": 1.9943510667247813e-05,
"loss": 4.083,
"step": 1640
},
{
"epoch": 0.818407960199005,
"grad_norm": 6.1875,
"learning_rate": 1.9426196104877735e-05,
"loss": 3.9825,
"step": 1645
},
{
"epoch": 0.8208955223880597,
"grad_norm": 6.96875,
"learning_rate": 1.89149566470915e-05,
"loss": 3.9559,
"step": 1650
},
{
"epoch": 0.8233830845771144,
"grad_norm": 6.59375,
"learning_rate": 1.8409830840327546e-05,
"loss": 4.0314,
"step": 1655
},
{
"epoch": 0.8258706467661692,
"grad_norm": 6.1875,
"learning_rate": 1.791085677006722e-05,
"loss": 3.8751,
"step": 1660
},
{
"epoch": 0.8283582089552238,
"grad_norm": 7.25,
"learning_rate": 1.741807205796314e-05,
"loss": 4.0051,
"step": 1665
},
{
"epoch": 0.8308457711442786,
"grad_norm": 5.59375,
"learning_rate": 1.6931513859002635e-05,
"loss": 3.9194,
"step": 1670
},
{
"epoch": 0.8333333333333334,
"grad_norm": 6.34375,
"learning_rate": 1.6451218858706374e-05,
"loss": 3.8528,
"step": 1675
},
{
"epoch": 0.835820895522388,
"grad_norm": 6.15625,
"learning_rate": 1.5977223270362196e-05,
"loss": 4.0617,
"step": 1680
},
{
"epoch": 0.8383084577114428,
"grad_norm": 7.125,
"learning_rate": 1.5509562832294944e-05,
"loss": 3.7389,
"step": 1685
},
{
"epoch": 0.8407960199004975,
"grad_norm": 6.25,
"learning_rate": 1.5048272805171615e-05,
"loss": 3.9292,
"step": 1690
},
{
"epoch": 0.8432835820895522,
"grad_norm": 5.90625,
"learning_rate": 1.459338796934293e-05,
"loss": 4.011,
"step": 1695
},
{
"epoch": 0.845771144278607,
"grad_norm": 6.53125,
"learning_rate": 1.4144942622220902e-05,
"loss": 3.8728,
"step": 1700
},
{
"epoch": 0.8482587064676617,
"grad_norm": 6.71875,
"learning_rate": 1.3702970575692975e-05,
"loss": 4.0874,
"step": 1705
},
{
"epoch": 0.8507462686567164,
"grad_norm": 6.9375,
"learning_rate": 1.3267505153572501e-05,
"loss": 4.0708,
"step": 1710
},
{
"epoch": 0.8532338308457711,
"grad_norm": 5.03125,
"learning_rate": 1.2838579189086353e-05,
"loss": 3.8598,
"step": 1715
},
{
"epoch": 0.8557213930348259,
"grad_norm": 7.46875,
"learning_rate": 1.2416225022399286e-05,
"loss": 4.0435,
"step": 1720
},
{
"epoch": 0.8582089552238806,
"grad_norm": 5.96875,
"learning_rate": 1.2000474498175552e-05,
"loss": 4.054,
"step": 1725
},
{
"epoch": 0.8606965174129353,
"grad_norm": 6.8125,
"learning_rate": 1.1591358963177923e-05,
"loss": 3.8522,
"step": 1730
},
{
"epoch": 0.8631840796019901,
"grad_norm": 5.34375,
"learning_rate": 1.118890926390419e-05,
"loss": 3.9089,
"step": 1735
},
{
"epoch": 0.8656716417910447,
"grad_norm": 6.8125,
"learning_rate": 1.0793155744261351e-05,
"loss": 4.0584,
"step": 1740
},
{
"epoch": 0.8681592039800995,
"grad_norm": 6.25,
"learning_rate": 1.0404128243277777e-05,
"loss": 3.9094,
"step": 1745
},
{
"epoch": 0.8706467661691543,
"grad_norm": 6.75,
"learning_rate": 1.0021856092853432e-05,
"loss": 3.9982,
"step": 1750
},
{
"epoch": 0.8731343283582089,
"grad_norm": 6.25,
"learning_rate": 9.646368115548232e-06,
"loss": 3.946,
"step": 1755
},
{
"epoch": 0.8756218905472637,
"grad_norm": 6.71875,
"learning_rate": 9.277692622409018e-06,
"loss": 3.8958,
"step": 1760
},
{
"epoch": 0.8781094527363185,
"grad_norm": 6.375,
"learning_rate": 8.915857410834794e-06,
"loss": 3.7367,
"step": 1765
},
{
"epoch": 0.8805970149253731,
"grad_norm": 6.28125,
"learning_rate": 8.56088976248095e-06,
"loss": 3.9724,
"step": 1770
},
{
"epoch": 0.8830845771144279,
"grad_norm": 6.125,
"learning_rate": 8.212816441202309e-06,
"loss": 4.0212,
"step": 1775
},
{
"epoch": 0.8855721393034826,
"grad_norm": 6.90625,
"learning_rate": 7.871663691035103e-06,
"loss": 3.6865,
"step": 1780
},
{
"epoch": 0.8880597014925373,
"grad_norm": 6.78125,
"learning_rate": 7.53745723421827e-06,
"loss": 3.9914,
"step": 1785
},
{
"epoch": 0.8905472636815921,
"grad_norm": 5.4375,
"learning_rate": 7.2102222692540415e-06,
"loss": 3.9573,
"step": 1790
},
{
"epoch": 0.8930348258706468,
"grad_norm": 6.78125,
"learning_rate": 6.889983469008055e-06,
"loss": 4.1287,
"step": 1795
},
{
"epoch": 0.8955223880597015,
"grad_norm": 5.84375,
"learning_rate": 6.576764978849004e-06,
"loss": 4.1117,
"step": 1800
},
{
"epoch": 0.8980099502487562,
"grad_norm": 6.8125,
"learning_rate": 6.27059041482817e-06,
"loss": 3.8274,
"step": 1805
},
{
"epoch": 0.900497512437811,
"grad_norm": 6.4375,
"learning_rate": 5.971482861898836e-06,
"loss": 3.8697,
"step": 1810
},
{
"epoch": 0.9029850746268657,
"grad_norm": 6.03125,
"learning_rate": 5.679464872175666e-06,
"loss": 3.9326,
"step": 1815
},
{
"epoch": 0.9054726368159204,
"grad_norm": 6.5,
"learning_rate": 5.394558463234378e-06,
"loss": 3.8915,
"step": 1820
},
{
"epoch": 0.9079601990049752,
"grad_norm": 5.9375,
"learning_rate": 5.116785116451661e-06,
"loss": 3.9306,
"step": 1825
},
{
"epoch": 0.9104477611940298,
"grad_norm": 5.96875,
"learning_rate": 4.846165775385459e-06,
"loss": 3.901,
"step": 1830
},
{
"epoch": 0.9129353233830846,
"grad_norm": 6.28125,
"learning_rate": 4.5827208441959424e-06,
"loss": 3.9952,
"step": 1835
},
{
"epoch": 0.9154228855721394,
"grad_norm": 5.9375,
"learning_rate": 4.3264701861070345e-06,
"loss": 4.0501,
"step": 1840
},
{
"epoch": 0.917910447761194,
"grad_norm": 5.90625,
"learning_rate": 4.077433121908747e-06,
"loss": 3.7784,
"step": 1845
},
{
"epoch": 0.9203980099502488,
"grad_norm": 6.8125,
"learning_rate": 3.835628428500515e-06,
"loss": 3.928,
"step": 1850
},
{
"epoch": 0.9228855721393034,
"grad_norm": 6.53125,
"learning_rate": 3.601074337475352e-06,
"loss": 3.9705,
"step": 1855
},
{
"epoch": 0.9253731343283582,
"grad_norm": 7.5,
"learning_rate": 3.3737885337452814e-06,
"loss": 4.0769,
"step": 1860
},
{
"epoch": 0.927860696517413,
"grad_norm": 6.15625,
"learning_rate": 3.153788154207926e-06,
"loss": 3.9098,
"step": 1865
},
{
"epoch": 0.9303482587064676,
"grad_norm": 6.875,
"learning_rate": 2.9410897864544206e-06,
"loss": 4.037,
"step": 1870
},
{
"epoch": 0.9328358208955224,
"grad_norm": 6.21875,
"learning_rate": 2.735709467518699e-06,
"loss": 3.9169,
"step": 1875
},
{
"epoch": 0.9353233830845771,
"grad_norm": 6.5,
"learning_rate": 2.5376626826683956e-06,
"loss": 3.9237,
"step": 1880
},
{
"epoch": 0.9378109452736318,
"grad_norm": 6.375,
"learning_rate": 2.3469643642372586e-06,
"loss": 3.9737,
"step": 1885
},
{
"epoch": 0.9402985074626866,
"grad_norm": 7.15625,
"learning_rate": 2.1636288904992585e-06,
"loss": 4.0875,
"step": 1890
},
{
"epoch": 0.9427860696517413,
"grad_norm": 6.25,
"learning_rate": 1.9876700845845475e-06,
"loss": 3.8926,
"step": 1895
},
{
"epoch": 0.945273631840796,
"grad_norm": 6.03125,
"learning_rate": 1.8191012134371577e-06,
"loss": 3.8843,
"step": 1900
},
{
"epoch": 0.9477611940298507,
"grad_norm": 6.59375,
"learning_rate": 1.6579349868147687e-06,
"loss": 3.9296,
"step": 1905
},
{
"epoch": 0.9502487562189055,
"grad_norm": 6.9375,
"learning_rate": 1.5041835563303742e-06,
"loss": 3.8406,
"step": 1910
},
{
"epoch": 0.9527363184079602,
"grad_norm": 6.78125,
"learning_rate": 1.3578585145360812e-06,
"loss": 4.1326,
"step": 1915
},
{
"epoch": 0.9552238805970149,
"grad_norm": 6.21875,
"learning_rate": 1.2189708940490652e-06,
"loss": 3.8935,
"step": 1920
},
{
"epoch": 0.9577114427860697,
"grad_norm": 5.6875,
"learning_rate": 1.0875311667196908e-06,
"loss": 3.7818,
"step": 1925
},
{
"epoch": 0.9601990049751243,
"grad_norm": 5.15625,
"learning_rate": 9.635492428420434e-07,
"loss": 3.6428,
"step": 1930
},
{
"epoch": 0.9626865671641791,
"grad_norm": 6.4375,
"learning_rate": 8.470344704066046e-07,
"loss": 3.8555,
"step": 1935
},
{
"epoch": 0.9651741293532339,
"grad_norm": 6.1875,
"learning_rate": 7.379956343955386e-07,
"loss": 3.8856,
"step": 1940
},
{
"epoch": 0.9676616915422885,
"grad_norm": 6.0,
"learning_rate": 6.364409561202323e-07,
"loss": 4.0294,
"step": 1945
},
{
"epoch": 0.9701492537313433,
"grad_norm": 7.15625,
"learning_rate": 5.42378092601481e-07,
"loss": 3.9593,
"step": 1950
},
{
"epoch": 0.972636815920398,
"grad_norm": 6.25,
"learning_rate": 4.558141359921386e-07,
"loss": 3.9914,
"step": 1955
},
{
"epoch": 0.9751243781094527,
"grad_norm": 8.375,
"learning_rate": 3.7675561304238994e-07,
"loss": 3.9707,
"step": 1960
},
{
"epoch": 0.9776119402985075,
"grad_norm": 5.65625,
"learning_rate": 3.0520848460765527e-07,
"loss": 3.7689,
"step": 1965
},
{
"epoch": 0.9800995024875622,
"grad_norm": 6.6875,
"learning_rate": 2.4117814519911684e-07,
"loss": 3.9225,
"step": 1970
},
{
"epoch": 0.9825870646766169,
"grad_norm": 6.125,
"learning_rate": 1.846694225770551e-07,
"loss": 3.9233,
"step": 1975
},
{
"epoch": 0.9850746268656716,
"grad_norm": 6.0625,
"learning_rate": 1.3568657738678435e-07,
"loss": 3.8331,
"step": 1980
},
{
"epoch": 0.9875621890547264,
"grad_norm": 6.40625,
"learning_rate": 9.423330283742093e-08,
"loss": 3.993,
"step": 1985
},
{
"epoch": 0.9900497512437811,
"grad_norm": 6.4375,
"learning_rate": 6.031272442341696e-08,
"loss": 3.8852,
"step": 1990
},
{
"epoch": 0.9925373134328358,
"grad_norm": 6.75,
"learning_rate": 3.392739968894887e-08,
"loss": 3.821,
"step": 1995
},
{
"epoch": 0.9950248756218906,
"grad_norm": 6.59375,
"learning_rate": 1.5079318035016164e-08,
"loss": 3.975,
"step": 2000
},
{
"epoch": 0.9975124378109452,
"grad_norm": 5.46875,
"learning_rate": 3.769900569505769e-09,
"loss": 4.0483,
"step": 2005
},
{
"epoch": 1.0,
"grad_norm": 8.5625,
"learning_rate": 0.0,
"loss": 3.9369,
"step": 2010
},
{
"epoch": 1.0,
"step": 2010,
"total_flos": 1275064289820672.0,
"train_loss": 4.025861802029966,
"train_runtime": 256.6461,
"train_samples_per_second": 125.278,
"train_steps_per_second": 7.832
}
],
"logging_steps": 5,
"max_steps": 2010,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1275064289820672.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}