{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.0,
"eval_steps": 500,
"global_step": 1092,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00641025641025641,
"grad_norm": 3.8148568052575884,
"learning_rate": 1.282051282051282e-07,
"loss": 4.889,
"step": 1
},
{
"epoch": 0.01282051282051282,
"grad_norm": 4.453444589892027,
"learning_rate": 2.564102564102564e-07,
"loss": 4.9097,
"step": 2
},
{
"epoch": 0.02564102564102564,
"grad_norm": 4.896614258621833,
"learning_rate": 5.128205128205128e-07,
"loss": 4.9099,
"step": 4
},
{
"epoch": 0.038461538461538464,
"grad_norm": 4.456576485464451,
"learning_rate": 7.692307692307694e-07,
"loss": 4.9102,
"step": 6
},
{
"epoch": 0.05128205128205128,
"grad_norm": 4.193427815120892,
"learning_rate": 1.0256410256410257e-06,
"loss": 4.8924,
"step": 8
},
{
"epoch": 0.0641025641025641,
"grad_norm": 3.6726747534666555,
"learning_rate": 1.282051282051282e-06,
"loss": 4.8372,
"step": 10
},
{
"epoch": 0.07692307692307693,
"grad_norm": 3.337981680961211,
"learning_rate": 1.5384615384615387e-06,
"loss": 4.7794,
"step": 12
},
{
"epoch": 0.08974358974358974,
"grad_norm": 2.675890453922504,
"learning_rate": 1.794871794871795e-06,
"loss": 4.6191,
"step": 14
},
{
"epoch": 0.10256410256410256,
"grad_norm": 2.398848700299253,
"learning_rate": 2.0512820512820513e-06,
"loss": 4.5723,
"step": 16
},
{
"epoch": 0.11538461538461539,
"grad_norm": 1.8159784961859098,
"learning_rate": 2.307692307692308e-06,
"loss": 4.3568,
"step": 18
},
{
"epoch": 0.1282051282051282,
"grad_norm": 1.6094220673057946,
"learning_rate": 2.564102564102564e-06,
"loss": 4.2686,
"step": 20
},
{
"epoch": 0.14102564102564102,
"grad_norm": 1.4349818434671497,
"learning_rate": 2.8205128205128207e-06,
"loss": 4.169,
"step": 22
},
{
"epoch": 0.15384615384615385,
"grad_norm": 1.4412559958198408,
"learning_rate": 3.0769230769230774e-06,
"loss": 4.0415,
"step": 24
},
{
"epoch": 0.16666666666666666,
"grad_norm": 1.3626982007755366,
"learning_rate": 3.3333333333333333e-06,
"loss": 3.8569,
"step": 26
},
{
"epoch": 0.1794871794871795,
"grad_norm": 1.3679096739652512,
"learning_rate": 3.58974358974359e-06,
"loss": 3.7409,
"step": 28
},
{
"epoch": 0.19230769230769232,
"grad_norm": 1.3396391976584703,
"learning_rate": 3.846153846153847e-06,
"loss": 3.6585,
"step": 30
},
{
"epoch": 0.20512820512820512,
"grad_norm": 1.294876480457606,
"learning_rate": 4.102564102564103e-06,
"loss": 3.4961,
"step": 32
},
{
"epoch": 0.21794871794871795,
"grad_norm": 1.103820056614455,
"learning_rate": 4.358974358974359e-06,
"loss": 3.3518,
"step": 34
},
{
"epoch": 0.23076923076923078,
"grad_norm": 1.0522131115906572,
"learning_rate": 4.615384615384616e-06,
"loss": 3.1984,
"step": 36
},
{
"epoch": 0.24358974358974358,
"grad_norm": 1.0081732884085817,
"learning_rate": 4.871794871794872e-06,
"loss": 3.054,
"step": 38
},
{
"epoch": 0.2564102564102564,
"grad_norm": 0.9214039999549644,
"learning_rate": 5.128205128205128e-06,
"loss": 2.8628,
"step": 40
},
{
"epoch": 0.2692307692307692,
"grad_norm": 0.8143994876297143,
"learning_rate": 5.384615384615385e-06,
"loss": 2.7475,
"step": 42
},
{
"epoch": 0.28205128205128205,
"grad_norm": 0.700891765547207,
"learning_rate": 5.641025641025641e-06,
"loss": 2.5869,
"step": 44
},
{
"epoch": 0.2948717948717949,
"grad_norm": 0.7510674065754775,
"learning_rate": 5.897435897435898e-06,
"loss": 2.4461,
"step": 46
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.6794074940373539,
"learning_rate": 6.153846153846155e-06,
"loss": 2.3477,
"step": 48
},
{
"epoch": 0.32051282051282054,
"grad_norm": 0.5162215042692575,
"learning_rate": 6.410256410256412e-06,
"loss": 2.2152,
"step": 50
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.5146975027904754,
"learning_rate": 6.666666666666667e-06,
"loss": 2.1975,
"step": 52
},
{
"epoch": 0.34615384615384615,
"grad_norm": 0.4474574545979082,
"learning_rate": 6.923076923076923e-06,
"loss": 2.0824,
"step": 54
},
{
"epoch": 0.358974358974359,
"grad_norm": 0.40379510918119965,
"learning_rate": 7.17948717948718e-06,
"loss": 2.0388,
"step": 56
},
{
"epoch": 0.3717948717948718,
"grad_norm": 0.4109144194248555,
"learning_rate": 7.435897435897437e-06,
"loss": 1.9699,
"step": 58
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.36878556755849573,
"learning_rate": 7.692307692307694e-06,
"loss": 1.9252,
"step": 60
},
{
"epoch": 0.3974358974358974,
"grad_norm": 0.33951214974325605,
"learning_rate": 7.948717948717949e-06,
"loss": 1.8773,
"step": 62
},
{
"epoch": 0.41025641025641024,
"grad_norm": 0.31625266306424027,
"learning_rate": 8.205128205128205e-06,
"loss": 1.7966,
"step": 64
},
{
"epoch": 0.4230769230769231,
"grad_norm": 0.7180890498799148,
"learning_rate": 8.461538461538462e-06,
"loss": 1.8108,
"step": 66
},
{
"epoch": 0.4358974358974359,
"grad_norm": 0.33704662479371716,
"learning_rate": 8.717948717948719e-06,
"loss": 1.7498,
"step": 68
},
{
"epoch": 0.44871794871794873,
"grad_norm": 0.2761824271642518,
"learning_rate": 8.974358974358976e-06,
"loss": 1.7124,
"step": 70
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.24386286193528572,
"learning_rate": 9.230769230769232e-06,
"loss": 1.6382,
"step": 72
},
{
"epoch": 0.47435897435897434,
"grad_norm": 0.25885451676676363,
"learning_rate": 9.487179487179487e-06,
"loss": 1.6588,
"step": 74
},
{
"epoch": 0.48717948717948717,
"grad_norm": 0.3040030663690383,
"learning_rate": 9.743589743589744e-06,
"loss": 1.6209,
"step": 76
},
{
"epoch": 0.5,
"grad_norm": 0.26598080566137733,
"learning_rate": 1e-05,
"loss": 1.6294,
"step": 78
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.22696288673824674,
"learning_rate": 9.99995506314361e-06,
"loss": 1.58,
"step": 80
},
{
"epoch": 0.5256410256410257,
"grad_norm": 0.21242259411358655,
"learning_rate": 9.99982025338217e-06,
"loss": 1.5439,
"step": 82
},
{
"epoch": 0.5384615384615384,
"grad_norm": 0.20291826899403465,
"learning_rate": 9.999595573138845e-06,
"loss": 1.5274,
"step": 84
},
{
"epoch": 0.5512820512820513,
"grad_norm": 0.1855444412322797,
"learning_rate": 9.99928102645221e-06,
"loss": 1.5161,
"step": 86
},
{
"epoch": 0.5641025641025641,
"grad_norm": 0.17883874148398324,
"learning_rate": 9.99887661897616e-06,
"loss": 1.4916,
"step": 88
},
{
"epoch": 0.5769230769230769,
"grad_norm": 0.17041478792908024,
"learning_rate": 9.99838235797981e-06,
"loss": 1.4679,
"step": 90
},
{
"epoch": 0.5897435897435898,
"grad_norm": 0.1904762198987749,
"learning_rate": 9.997798252347382e-06,
"loss": 1.471,
"step": 92
},
{
"epoch": 0.6025641025641025,
"grad_norm": 0.19077041355708335,
"learning_rate": 9.99712431257802e-06,
"loss": 1.4672,
"step": 94
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.1702104328191874,
"learning_rate": 9.996360550785619e-06,
"loss": 1.4455,
"step": 96
},
{
"epoch": 0.6282051282051282,
"grad_norm": 0.19039133859515542,
"learning_rate": 9.9955069806986e-06,
"loss": 1.4727,
"step": 98
},
{
"epoch": 0.6410256410256411,
"grad_norm": 0.15448238517128507,
"learning_rate": 9.994563617659665e-06,
"loss": 1.4257,
"step": 100
},
{
"epoch": 0.6538461538461539,
"grad_norm": 0.15202351051018634,
"learning_rate": 9.993530478625524e-06,
"loss": 1.4214,
"step": 102
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.16296598133044526,
"learning_rate": 9.992407582166582e-06,
"loss": 1.4213,
"step": 104
},
{
"epoch": 0.6794871794871795,
"grad_norm": 0.1462038294164801,
"learning_rate": 9.991194948466615e-06,
"loss": 1.3993,
"step": 106
},
{
"epoch": 0.6923076923076923,
"grad_norm": 0.14470989191451086,
"learning_rate": 9.989892599322404e-06,
"loss": 1.4014,
"step": 108
},
{
"epoch": 0.7051282051282052,
"grad_norm": 0.15440545758233384,
"learning_rate": 9.988500558143337e-06,
"loss": 1.3878,
"step": 110
},
{
"epoch": 0.717948717948718,
"grad_norm": 0.1412948019214843,
"learning_rate": 9.987018849950996e-06,
"loss": 1.355,
"step": 112
},
{
"epoch": 0.7307692307692307,
"grad_norm": 0.15156074653795895,
"learning_rate": 9.985447501378706e-06,
"loss": 1.3642,
"step": 114
},
{
"epoch": 0.7435897435897436,
"grad_norm": 0.3875845143038168,
"learning_rate": 9.983786540671052e-06,
"loss": 1.3797,
"step": 116
},
{
"epoch": 0.7564102564102564,
"grad_norm": 0.15788537547887518,
"learning_rate": 9.982035997683372e-06,
"loss": 1.3388,
"step": 118
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.15056320914445512,
"learning_rate": 9.980195903881231e-06,
"loss": 1.343,
"step": 120
},
{
"epoch": 0.782051282051282,
"grad_norm": 0.1555129283317706,
"learning_rate": 9.978266292339838e-06,
"loss": 1.328,
"step": 122
},
{
"epoch": 0.7948717948717948,
"grad_norm": 0.14999182496915453,
"learning_rate": 9.976247197743465e-06,
"loss": 1.352,
"step": 124
},
{
"epoch": 0.8076923076923077,
"grad_norm": 0.14124313426191026,
"learning_rate": 9.974138656384815e-06,
"loss": 1.3243,
"step": 126
},
{
"epoch": 0.8205128205128205,
"grad_norm": 0.1378326204862212,
"learning_rate": 9.97194070616438e-06,
"loss": 1.3241,
"step": 128
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.14227960534974604,
"learning_rate": 9.969653386589749e-06,
"loss": 1.3219,
"step": 130
},
{
"epoch": 0.8461538461538461,
"grad_norm": 0.12713543749272155,
"learning_rate": 9.967276738774897e-06,
"loss": 1.3096,
"step": 132
},
{
"epoch": 0.8589743589743589,
"grad_norm": 0.15061232362563903,
"learning_rate": 9.964810805439464e-06,
"loss": 1.3011,
"step": 134
},
{
"epoch": 0.8717948717948718,
"grad_norm": 0.14361563348990292,
"learning_rate": 9.962255630907964e-06,
"loss": 1.2827,
"step": 136
},
{
"epoch": 0.8846153846153846,
"grad_norm": 0.17754387209035652,
"learning_rate": 9.959611261108999e-06,
"loss": 1.3185,
"step": 138
},
{
"epoch": 0.8974358974358975,
"grad_norm": 0.1458623897430443,
"learning_rate": 9.956877743574437e-06,
"loss": 1.3286,
"step": 140
},
{
"epoch": 0.9102564102564102,
"grad_norm": 0.14084398418567437,
"learning_rate": 9.954055127438554e-06,
"loss": 1.3005,
"step": 142
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.13580861113069753,
"learning_rate": 9.951143463437145e-06,
"loss": 1.3165,
"step": 144
},
{
"epoch": 0.9358974358974359,
"grad_norm": 0.13622051889734035,
"learning_rate": 9.948142803906623e-06,
"loss": 1.2929,
"step": 146
},
{
"epoch": 0.9487179487179487,
"grad_norm": 0.12679082371935066,
"learning_rate": 9.94505320278307e-06,
"loss": 1.2833,
"step": 148
},
{
"epoch": 0.9615384615384616,
"grad_norm": 0.11939382079952243,
"learning_rate": 9.94187471560127e-06,
"loss": 1.2851,
"step": 150
},
{
"epoch": 0.9743589743589743,
"grad_norm": 0.11752490134274678,
"learning_rate": 9.938607399493714e-06,
"loss": 1.2559,
"step": 152
},
{
"epoch": 0.9871794871794872,
"grad_norm": 0.11807212671773365,
"learning_rate": 9.935251313189564e-06,
"loss": 1.285,
"step": 154
},
{
"epoch": 1.0,
"grad_norm": 0.1120761333795772,
"learning_rate": 9.931806517013612e-06,
"loss": 1.2491,
"step": 156
},
{
"epoch": 1.0128205128205128,
"grad_norm": 0.10750345822189263,
"learning_rate": 9.92827307288518e-06,
"loss": 1.2442,
"step": 158
},
{
"epoch": 1.0256410256410255,
"grad_norm": 0.10918642022881683,
"learning_rate": 9.924651044317017e-06,
"loss": 1.2286,
"step": 160
},
{
"epoch": 1.0384615384615385,
"grad_norm": 0.11225330042691335,
"learning_rate": 9.920940496414153e-06,
"loss": 1.2158,
"step": 162
},
{
"epoch": 1.0512820512820513,
"grad_norm": 0.11366482652198566,
"learning_rate": 9.917141495872733e-06,
"loss": 1.2074,
"step": 164
},
{
"epoch": 1.064102564102564,
"grad_norm": 0.12295651003296312,
"learning_rate": 9.913254110978812e-06,
"loss": 1.2003,
"step": 166
},
{
"epoch": 1.0769230769230769,
"grad_norm": 0.1144456030840293,
"learning_rate": 9.909278411607134e-06,
"loss": 1.206,
"step": 168
},
{
"epoch": 1.0897435897435896,
"grad_norm": 0.2468334129961725,
"learning_rate": 9.90521446921987e-06,
"loss": 1.2235,
"step": 170
},
{
"epoch": 1.1025641025641026,
"grad_norm": 0.127278158070263,
"learning_rate": 9.90106235686534e-06,
"loss": 1.1928,
"step": 172
},
{
"epoch": 1.1153846153846154,
"grad_norm": 0.1280282060730887,
"learning_rate": 9.896822149176695e-06,
"loss": 1.2068,
"step": 174
},
{
"epoch": 1.1282051282051282,
"grad_norm": 0.1142922422404122,
"learning_rate": 9.892493922370575e-06,
"loss": 1.217,
"step": 176
},
{
"epoch": 1.141025641025641,
"grad_norm": 0.17470470224878323,
"learning_rate": 9.888077754245741e-06,
"loss": 1.2099,
"step": 178
},
{
"epoch": 1.1538461538461537,
"grad_norm": 0.10477882692325258,
"learning_rate": 9.883573724181683e-06,
"loss": 1.1944,
"step": 180
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.114790034377695,
"learning_rate": 9.878981913137178e-06,
"loss": 1.172,
"step": 182
},
{
"epoch": 1.1794871794871795,
"grad_norm": 0.1044922535107306,
"learning_rate": 9.87430240364885e-06,
"loss": 1.2147,
"step": 184
},
{
"epoch": 1.1923076923076923,
"grad_norm": 0.09771283060341285,
"learning_rate": 9.869535279829674e-06,
"loss": 1.173,
"step": 186
},
{
"epoch": 1.205128205128205,
"grad_norm": 0.1013995999635824,
"learning_rate": 9.864680627367476e-06,
"loss": 1.2023,
"step": 188
},
{
"epoch": 1.217948717948718,
"grad_norm": 0.10273326452887067,
"learning_rate": 9.859738533523384e-06,
"loss": 1.1732,
"step": 190
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.09684048616936082,
"learning_rate": 9.854709087130261e-06,
"loss": 1.1952,
"step": 192
},
{
"epoch": 1.2435897435897436,
"grad_norm": 0.10827760658070901,
"learning_rate": 9.849592378591113e-06,
"loss": 1.1864,
"step": 194
},
{
"epoch": 1.2564102564102564,
"grad_norm": 0.09989527940011267,
"learning_rate": 9.844388499877457e-06,
"loss": 1.2016,
"step": 196
},
{
"epoch": 1.2692307692307692,
"grad_norm": 0.09930771667309381,
"learning_rate": 9.839097544527674e-06,
"loss": 1.1738,
"step": 198
},
{
"epoch": 1.282051282051282,
"grad_norm": 0.1032001919164007,
"learning_rate": 9.833719607645325e-06,
"loss": 1.176,
"step": 200
},
{
"epoch": 1.294871794871795,
"grad_norm": 0.09859412157061716,
"learning_rate": 9.82825478589744e-06,
"loss": 1.1682,
"step": 202
},
{
"epoch": 1.3076923076923077,
"grad_norm": 0.09558235334437347,
"learning_rate": 9.822703177512783e-06,
"loss": 1.181,
"step": 204
},
{
"epoch": 1.3205128205128205,
"grad_norm": 0.08733478657745303,
"learning_rate": 9.817064882280085e-06,
"loss": 1.1686,
"step": 206
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.09397505343456257,
"learning_rate": 9.811340001546252e-06,
"loss": 1.1778,
"step": 208
},
{
"epoch": 1.3461538461538463,
"grad_norm": 0.09590407825516856,
"learning_rate": 9.805528638214543e-06,
"loss": 1.1542,
"step": 210
},
{
"epoch": 1.358974358974359,
"grad_norm": 0.0912508440064145,
"learning_rate": 9.799630896742716e-06,
"loss": 1.1643,
"step": 212
},
{
"epoch": 1.3717948717948718,
"grad_norm": 0.09258955107744923,
"learning_rate": 9.793646883141155e-06,
"loss": 1.1686,
"step": 214
},
{
"epoch": 1.3846153846153846,
"grad_norm": 0.09889457149777804,
"learning_rate": 9.787576704970965e-06,
"loss": 1.1677,
"step": 216
},
{
"epoch": 1.3974358974358974,
"grad_norm": 0.09374670756166416,
"learning_rate": 9.781420471342035e-06,
"loss": 1.146,
"step": 218
},
{
"epoch": 1.4102564102564101,
"grad_norm": 0.09136677460744856,
"learning_rate": 9.77517829291108e-06,
"loss": 1.1594,
"step": 220
},
{
"epoch": 1.4230769230769231,
"grad_norm": 0.10584946030378292,
"learning_rate": 9.768850281879651e-06,
"loss": 1.1865,
"step": 222
},
{
"epoch": 1.435897435897436,
"grad_norm": 0.09187981607301214,
"learning_rate": 9.762436551992117e-06,
"loss": 1.1606,
"step": 224
},
{
"epoch": 1.4487179487179487,
"grad_norm": 0.09880449655805854,
"learning_rate": 9.755937218533622e-06,
"loss": 1.1586,
"step": 226
},
{
"epoch": 1.4615384615384617,
"grad_norm": 0.08704607108972029,
"learning_rate": 9.74935239832801e-06,
"loss": 1.1746,
"step": 228
},
{
"epoch": 1.4743589743589745,
"grad_norm": 0.08909112778091671,
"learning_rate": 9.742682209735727e-06,
"loss": 1.1575,
"step": 230
},
{
"epoch": 1.4871794871794872,
"grad_norm": 0.09035998053799675,
"learning_rate": 9.735926772651703e-06,
"loss": 1.1678,
"step": 232
},
{
"epoch": 1.5,
"grad_norm": 0.09500864788295198,
"learning_rate": 9.729086208503174e-06,
"loss": 1.1466,
"step": 234
},
{
"epoch": 1.5128205128205128,
"grad_norm": 0.09247434213683463,
"learning_rate": 9.722160640247523e-06,
"loss": 1.1687,
"step": 236
},
{
"epoch": 1.5256410256410255,
"grad_norm": 0.09322212100100113,
"learning_rate": 9.715150192370054e-06,
"loss": 1.1376,
"step": 238
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.08824919508271642,
"learning_rate": 9.708054990881763e-06,
"loss": 1.1523,
"step": 240
},
{
"epoch": 1.5512820512820513,
"grad_norm": 0.25559730635424294,
"learning_rate": 9.700875163317072e-06,
"loss": 1.1488,
"step": 242
},
{
"epoch": 1.564102564102564,
"grad_norm": 0.2487505162861363,
"learning_rate": 9.693610838731532e-06,
"loss": 1.1481,
"step": 244
},
{
"epoch": 1.5769230769230769,
"grad_norm": 0.12151469789600829,
"learning_rate": 9.686262147699507e-06,
"loss": 1.1483,
"step": 246
},
{
"epoch": 1.5897435897435899,
"grad_norm": 0.10407519891252137,
"learning_rate": 9.678829222311827e-06,
"loss": 1.13,
"step": 248
},
{
"epoch": 1.6025641025641026,
"grad_norm": 0.11236395690738615,
"learning_rate": 9.671312196173413e-06,
"loss": 1.1493,
"step": 250
},
{
"epoch": 1.6153846153846154,
"grad_norm": 0.1012523372817843,
"learning_rate": 9.663711204400872e-06,
"loss": 1.148,
"step": 252
},
{
"epoch": 1.6282051282051282,
"grad_norm": 0.09652583778417714,
"learning_rate": 9.656026383620076e-06,
"loss": 1.1074,
"step": 254
},
{
"epoch": 1.641025641025641,
"grad_norm": 0.09448533541138639,
"learning_rate": 9.6482578719637e-06,
"loss": 1.1486,
"step": 256
},
{
"epoch": 1.6538461538461537,
"grad_norm": 0.09453430664055591,
"learning_rate": 9.640405809068743e-06,
"loss": 1.1197,
"step": 258
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.0952812616531032,
"learning_rate": 9.632470336074009e-06,
"loss": 1.1337,
"step": 260
},
{
"epoch": 1.6794871794871795,
"grad_norm": 0.09048018082770859,
"learning_rate": 9.624451595617588e-06,
"loss": 1.0885,
"step": 262
},
{
"epoch": 1.6923076923076923,
"grad_norm": 0.0922717302732401,
"learning_rate": 9.616349731834271e-06,
"loss": 1.1294,
"step": 264
},
{
"epoch": 1.7051282051282053,
"grad_norm": 0.09113342238000427,
"learning_rate": 9.608164890352977e-06,
"loss": 1.0871,
"step": 266
},
{
"epoch": 1.717948717948718,
"grad_norm": 0.10188653395954697,
"learning_rate": 9.599897218294122e-06,
"loss": 1.1237,
"step": 268
},
{
"epoch": 1.7307692307692308,
"grad_norm": 0.08946291041522332,
"learning_rate": 9.591546864266983e-06,
"loss": 1.1129,
"step": 270
},
{
"epoch": 1.7435897435897436,
"grad_norm": 0.092702242157672,
"learning_rate": 9.583113978367026e-06,
"loss": 1.1089,
"step": 272
},
{
"epoch": 1.7564102564102564,
"grad_norm": 0.1140491779513373,
"learning_rate": 9.574598712173202e-06,
"loss": 1.1286,
"step": 274
},
{
"epoch": 1.7692307692307692,
"grad_norm": 0.09516237353719291,
"learning_rate": 9.56600121874523e-06,
"loss": 1.1122,
"step": 276
},
{
"epoch": 1.782051282051282,
"grad_norm": 0.08916708413619781,
"learning_rate": 9.557321652620839e-06,
"loss": 1.1048,
"step": 278
},
{
"epoch": 1.7948717948717947,
"grad_norm": 0.09140805156925046,
"learning_rate": 9.548560169812997e-06,
"loss": 1.1058,
"step": 280
},
{
"epoch": 1.8076923076923077,
"grad_norm": 0.08683635001330178,
"learning_rate": 9.539716927807102e-06,
"loss": 1.0925,
"step": 282
},
{
"epoch": 1.8205128205128205,
"grad_norm": 0.09284148179598711,
"learning_rate": 9.530792085558151e-06,
"loss": 1.0948,
"step": 284
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.08800610945553744,
"learning_rate": 9.521785803487888e-06,
"loss": 1.1116,
"step": 286
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.08758546749473674,
"learning_rate": 9.512698243481914e-06,
"loss": 1.1059,
"step": 288
},
{
"epoch": 1.858974358974359,
"grad_norm": 0.08336608124209365,
"learning_rate": 9.50352956888678e-06,
"loss": 1.1015,
"step": 290
},
{
"epoch": 1.8717948717948718,
"grad_norm": 0.09199580396288136,
"learning_rate": 9.49427994450705e-06,
"loss": 1.0828,
"step": 292
},
{
"epoch": 1.8846153846153846,
"grad_norm": 0.5410940704298627,
"learning_rate": 9.484949536602343e-06,
"loss": 1.1412,
"step": 294
},
{
"epoch": 1.8974358974358974,
"grad_norm": 0.08913430120295451,
"learning_rate": 9.47553851288434e-06,
"loss": 1.1073,
"step": 296
},
{
"epoch": 1.9102564102564101,
"grad_norm": 0.09420167495815907,
"learning_rate": 9.466047042513767e-06,
"loss": 1.0957,
"step": 298
},
{
"epoch": 1.9230769230769231,
"grad_norm": 0.08189970955203785,
"learning_rate": 9.45647529609736e-06,
"loss": 1.0909,
"step": 300
},
{
"epoch": 1.935897435897436,
"grad_norm": 0.09065809775757692,
"learning_rate": 9.4468234456848e-06,
"loss": 1.0896,
"step": 302
},
{
"epoch": 1.9487179487179487,
"grad_norm": 0.08763498764491487,
"learning_rate": 9.437091664765611e-06,
"loss": 1.1099,
"step": 304
},
{
"epoch": 1.9615384615384617,
"grad_norm": 0.09257403574026254,
"learning_rate": 9.427280128266049e-06,
"loss": 1.1236,
"step": 306
},
{
"epoch": 1.9743589743589745,
"grad_norm": 0.08983923370086075,
"learning_rate": 9.41738901254596e-06,
"loss": 1.0909,
"step": 308
},
{
"epoch": 1.9871794871794872,
"grad_norm": 0.086289850522152,
"learning_rate": 9.4074184953956e-06,
"loss": 1.0942,
"step": 310
},
{
"epoch": 2.0,
"grad_norm": 0.0874296283040965,
"learning_rate": 9.397368756032445e-06,
"loss": 1.0651,
"step": 312
},
{
"epoch": 2.0128205128205128,
"grad_norm": 0.0848953888966574,
"learning_rate": 9.38723997509798e-06,
"loss": 1.0569,
"step": 314
},
{
"epoch": 2.0256410256410255,
"grad_norm": 0.08790616172980993,
"learning_rate": 9.37703233465443e-06,
"loss": 1.035,
"step": 316
},
{
"epoch": 2.0384615384615383,
"grad_norm": 0.08376355574572536,
"learning_rate": 9.366746018181503e-06,
"loss": 1.0379,
"step": 318
},
{
"epoch": 2.051282051282051,
"grad_norm": 0.7353839032057593,
"learning_rate": 9.356381210573092e-06,
"loss": 1.0623,
"step": 320
},
{
"epoch": 2.064102564102564,
"grad_norm": 0.09158722362975955,
"learning_rate": 9.345938098133946e-06,
"loss": 1.0264,
"step": 322
},
{
"epoch": 2.076923076923077,
"grad_norm": 0.08819422670959466,
"learning_rate": 9.33541686857632e-06,
"loss": 1.0456,
"step": 324
},
{
"epoch": 2.08974358974359,
"grad_norm": 0.0905819981621342,
"learning_rate": 9.324817711016609e-06,
"loss": 1.0239,
"step": 326
},
{
"epoch": 2.1025641025641026,
"grad_norm": 0.08799589635983858,
"learning_rate": 9.31414081597194e-06,
"loss": 1.0498,
"step": 328
},
{
"epoch": 2.1153846153846154,
"grad_norm": 0.0847927160084877,
"learning_rate": 9.303386375356752e-06,
"loss": 1.0163,
"step": 330
},
{
"epoch": 2.128205128205128,
"grad_norm": 0.09169187613815971,
"learning_rate": 9.292554582479349e-06,
"loss": 1.0054,
"step": 332
},
{
"epoch": 2.141025641025641,
"grad_norm": 0.08905293788047657,
"learning_rate": 9.281645632038417e-06,
"loss": 1.062,
"step": 334
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.09229173633666073,
"learning_rate": 9.270659720119533e-06,
"loss": 1.039,
"step": 336
},
{
"epoch": 2.1666666666666665,
"grad_norm": 0.08430144514732368,
"learning_rate": 9.259597044191635e-06,
"loss": 1.0268,
"step": 338
},
{
"epoch": 2.1794871794871793,
"grad_norm": 0.08706427078942988,
"learning_rate": 9.248457803103476e-06,
"loss": 1.0038,
"step": 340
},
{
"epoch": 2.1923076923076925,
"grad_norm": 0.0851666955740436,
"learning_rate": 9.237242197080045e-06,
"loss": 1.0218,
"step": 342
},
{
"epoch": 2.2051282051282053,
"grad_norm": 0.08446573269728049,
"learning_rate": 9.225950427718974e-06,
"loss": 1.0254,
"step": 344
},
{
"epoch": 2.217948717948718,
"grad_norm": 0.08907279788471897,
"learning_rate": 9.21458269798691e-06,
"loss": 0.9916,
"step": 346
},
{
"epoch": 2.230769230769231,
"grad_norm": 0.09072043470187022,
"learning_rate": 9.203139212215868e-06,
"loss": 1.0103,
"step": 348
},
{
"epoch": 2.2435897435897436,
"grad_norm": 0.08618586552830075,
"learning_rate": 9.191620176099559e-06,
"loss": 0.9995,
"step": 350
},
{
"epoch": 2.2564102564102564,
"grad_norm": 0.09111342426909275,
"learning_rate": 9.180025796689692e-06,
"loss": 1.0292,
"step": 352
},
{
"epoch": 2.269230769230769,
"grad_norm": 0.2022564482536435,
"learning_rate": 9.168356282392253e-06,
"loss": 1.0226,
"step": 354
},
{
"epoch": 2.282051282051282,
"grad_norm": 0.1039362123101456,
"learning_rate": 9.156611842963753e-06,
"loss": 1.0152,
"step": 356
},
{
"epoch": 2.2948717948717947,
"grad_norm": 0.10035717927769394,
"learning_rate": 9.144792689507471e-06,
"loss": 1.0049,
"step": 358
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.08924064734394851,
"learning_rate": 9.132899034469648e-06,
"loss": 0.9962,
"step": 360
},
{
"epoch": 2.3205128205128207,
"grad_norm": 0.09443040073005612,
"learning_rate": 9.120931091635669e-06,
"loss": 0.9976,
"step": 362
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.09377508422363312,
"learning_rate": 9.108889076126226e-06,
"loss": 1.0306,
"step": 364
},
{
"epoch": 2.3461538461538463,
"grad_norm": 0.0895229930946655,
"learning_rate": 9.09677320439345e-06,
"loss": 1.0126,
"step": 366
},
{
"epoch": 2.358974358974359,
"grad_norm": 0.08795872722111464,
"learning_rate": 9.084583694217012e-06,
"loss": 0.9926,
"step": 368
},
{
"epoch": 2.371794871794872,
"grad_norm": 0.08704560136887454,
"learning_rate": 9.072320764700223e-06,
"loss": 0.9978,
"step": 370
},
{
"epoch": 2.3846153846153846,
"grad_norm": 0.0898387630341298,
"learning_rate": 9.059984636266082e-06,
"loss": 1.0042,
"step": 372
},
{
"epoch": 2.3974358974358974,
"grad_norm": 0.08357247562762515,
"learning_rate": 9.047575530653324e-06,
"loss": 1.0094,
"step": 374
},
{
"epoch": 2.41025641025641,
"grad_norm": 0.0843437057196144,
"learning_rate": 9.035093670912424e-06,
"loss": 0.9966,
"step": 376
},
{
"epoch": 2.423076923076923,
"grad_norm": 0.08357196997203281,
"learning_rate": 9.022539281401601e-06,
"loss": 1.0038,
"step": 378
},
{
"epoch": 2.435897435897436,
"grad_norm": 0.08859683961596204,
"learning_rate": 9.009912587782772e-06,
"loss": 1.0133,
"step": 380
},
{
"epoch": 2.448717948717949,
"grad_norm": 0.09024266497375917,
"learning_rate": 8.997213817017508e-06,
"loss": 0.9782,
"step": 382
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.0960929339414081,
"learning_rate": 8.984443197362938e-06,
"loss": 1.0013,
"step": 384
},
{
"epoch": 2.4743589743589745,
"grad_norm": 0.08862629313408348,
"learning_rate": 8.971600958367668e-06,
"loss": 1.0059,
"step": 386
},
{
"epoch": 2.4871794871794872,
"grad_norm": 0.09201716039902362,
"learning_rate": 8.958687330867634e-06,
"loss": 1.0263,
"step": 388
},
{
"epoch": 2.5,
"grad_norm": 0.08694363384662504,
"learning_rate": 8.94570254698197e-06,
"loss": 1.0163,
"step": 390
},
{
"epoch": 2.5128205128205128,
"grad_norm": 0.09205164914341211,
"learning_rate": 8.932646840108818e-06,
"loss": 0.9865,
"step": 392
},
{
"epoch": 2.5256410256410255,
"grad_norm": 0.09081872370987605,
"learning_rate": 8.919520444921153e-06,
"loss": 0.9819,
"step": 394
},
{
"epoch": 2.5384615384615383,
"grad_norm": 0.08905442630582544,
"learning_rate": 8.906323597362547e-06,
"loss": 1.0171,
"step": 396
},
{
"epoch": 2.551282051282051,
"grad_norm": 0.08717951944686292,
"learning_rate": 8.893056534642938e-06,
"loss": 1.0244,
"step": 398
},
{
"epoch": 2.564102564102564,
"grad_norm": 0.09573458066741532,
"learning_rate": 8.879719495234363e-06,
"loss": 0.9848,
"step": 400
},
{
"epoch": 2.5769230769230766,
"grad_norm": 0.0898624666623644,
"learning_rate": 8.866312718866669e-06,
"loss": 0.982,
"step": 402
},
{
"epoch": 2.58974358974359,
"grad_norm": 0.09305658353350323,
"learning_rate": 8.852836446523213e-06,
"loss": 0.9742,
"step": 404
},
{
"epoch": 2.6025641025641026,
"grad_norm": 0.08663704229153721,
"learning_rate": 8.83929092043652e-06,
"loss": 0.9783,
"step": 406
},
{
"epoch": 2.6153846153846154,
"grad_norm": 0.08983846726156959,
"learning_rate": 8.825676384083936e-06,
"loss": 0.998,
"step": 408
},
{
"epoch": 2.628205128205128,
"grad_norm": 0.09388895481313425,
"learning_rate": 8.811993082183243e-06,
"loss": 1.0005,
"step": 410
},
{
"epoch": 2.641025641025641,
"grad_norm": 0.09226783931828283,
"learning_rate": 8.798241260688273e-06,
"loss": 1.0055,
"step": 412
},
{
"epoch": 2.6538461538461537,
"grad_norm": 0.09021054214140613,
"learning_rate": 8.784421166784476e-06,
"loss": 0.9981,
"step": 414
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.0860573848233807,
"learning_rate": 8.770533048884483e-06,
"loss": 1.0017,
"step": 416
},
{
"epoch": 2.6794871794871797,
"grad_norm": 0.0880124822318372,
"learning_rate": 8.756577156623636e-06,
"loss": 0.9834,
"step": 418
},
{
"epoch": 2.6923076923076925,
"grad_norm": 0.0867421199146975,
"learning_rate": 8.742553740855507e-06,
"loss": 0.9983,
"step": 420
},
{
"epoch": 2.7051282051282053,
"grad_norm": 0.09006077507273828,
"learning_rate": 8.728463053647382e-06,
"loss": 0.9702,
"step": 422
},
{
"epoch": 2.717948717948718,
"grad_norm": 0.08669250030062742,
"learning_rate": 8.71430534827574e-06,
"loss": 0.9952,
"step": 424
},
{
"epoch": 2.730769230769231,
"grad_norm": 0.09026424854741899,
"learning_rate": 8.700080879221689e-06,
"loss": 1.0054,
"step": 426
},
{
"epoch": 2.7435897435897436,
"grad_norm": 0.087975640704094,
"learning_rate": 8.685789902166395e-06,
"loss": 0.9845,
"step": 428
},
{
"epoch": 2.7564102564102564,
"grad_norm": 0.08642431755631451,
"learning_rate": 8.671432673986493e-06,
"loss": 0.9791,
"step": 430
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.08649701419340423,
"learning_rate": 8.657009452749466e-06,
"loss": 0.9752,
"step": 432
},
{
"epoch": 2.782051282051282,
"grad_norm": 0.0879183947838203,
"learning_rate": 8.642520497709001e-06,
"loss": 0.9788,
"step": 434
},
{
"epoch": 2.7948717948717947,
"grad_norm": 0.08596416297337815,
"learning_rate": 8.627966069300332e-06,
"loss": 0.9807,
"step": 436
},
{
"epoch": 2.8076923076923075,
"grad_norm": 0.08918860363970792,
"learning_rate": 8.613346429135567e-06,
"loss": 0.9958,
"step": 438
},
{
"epoch": 2.8205128205128203,
"grad_norm": 0.08972585580799317,
"learning_rate": 8.598661839998972e-06,
"loss": 0.9895,
"step": 440
},
{
"epoch": 2.8333333333333335,
"grad_norm": 0.08703685151364528,
"learning_rate": 8.583912565842258e-06,
"loss": 0.9652,
"step": 442
},
{
"epoch": 2.8461538461538463,
"grad_norm": 0.08688465565057563,
"learning_rate": 8.569098871779828e-06,
"loss": 0.9984,
"step": 444
},
{
"epoch": 2.858974358974359,
"grad_norm": 0.08809758545326962,
"learning_rate": 8.554221024084019e-06,
"loss": 0.9905,
"step": 446
},
{
"epoch": 2.871794871794872,
"grad_norm": 0.08572911529655777,
"learning_rate": 8.539279290180315e-06,
"loss": 0.9692,
"step": 448
},
{
"epoch": 2.8846153846153846,
"grad_norm": 0.08836722634323343,
"learning_rate": 8.524273938642539e-06,
"loss": 0.9547,
"step": 450
},
{
"epoch": 2.8974358974358974,
"grad_norm": 0.09242854914045788,
"learning_rate": 8.509205239188017e-06,
"loss": 0.9838,
"step": 452
},
{
"epoch": 2.91025641025641,
"grad_norm": 0.08849881930024005,
"learning_rate": 8.494073462672743e-06,
"loss": 0.9615,
"step": 454
},
{
"epoch": 2.9230769230769234,
"grad_norm": 0.08854620618403236,
"learning_rate": 8.478878881086505e-06,
"loss": 0.9977,
"step": 456
},
{
"epoch": 2.935897435897436,
"grad_norm": 0.094665430731143,
"learning_rate": 8.463621767547998e-06,
"loss": 0.9927,
"step": 458
},
{
"epoch": 2.948717948717949,
"grad_norm": 0.09196410792880014,
"learning_rate": 8.448302396299906e-06,
"loss": 1.0113,
"step": 460
},
{
"epoch": 2.9615384615384617,
"grad_norm": 0.09036486236859728,
"learning_rate": 8.432921042703985e-06,
"loss": 0.9457,
"step": 462
},
{
"epoch": 2.9743589743589745,
"grad_norm": 0.08576032950610284,
"learning_rate": 8.417477983236107e-06,
"loss": 0.9645,
"step": 464
},
{
"epoch": 2.9871794871794872,
"grad_norm": 0.08403590001526823,
"learning_rate": 8.401973495481289e-06,
"loss": 0.9544,
"step": 466
},
{
"epoch": 3.0,
"grad_norm": 0.09355532269950335,
"learning_rate": 8.386407858128707e-06,
"loss": 0.9719,
"step": 468
},
{
"epoch": 3.0128205128205128,
"grad_norm": 0.08685232548889178,
"learning_rate": 8.370781350966683e-06,
"loss": 0.8933,
"step": 470
},
{
"epoch": 3.0256410256410255,
"grad_norm": 0.10917681684685593,
"learning_rate": 8.355094254877665e-06,
"loss": 0.9222,
"step": 472
},
{
"epoch": 3.0384615384615383,
"grad_norm": 0.09821414680349456,
"learning_rate": 8.339346851833163e-06,
"loss": 0.9187,
"step": 474
},
{
"epoch": 3.051282051282051,
"grad_norm": 0.0953257584501641,
"learning_rate": 8.323539424888695e-06,
"loss": 0.9068,
"step": 476
},
{
"epoch": 3.064102564102564,
"grad_norm": 0.10096821936698265,
"learning_rate": 8.30767225817869e-06,
"loss": 0.9005,
"step": 478
},
{
"epoch": 3.076923076923077,
"grad_norm": 0.09745049198474258,
"learning_rate": 8.291745636911382e-06,
"loss": 0.8955,
"step": 480
},
{
"epoch": 3.08974358974359,
"grad_norm": 0.09581071499737452,
"learning_rate": 8.27575984736369e-06,
"loss": 0.9034,
"step": 482
},
{
"epoch": 3.1025641025641026,
"grad_norm": 0.09048589565605356,
"learning_rate": 8.259715176876069e-06,
"loss": 0.8964,
"step": 484
},
{
"epoch": 3.1153846153846154,
"grad_norm": 0.09408149538192938,
"learning_rate": 8.243611913847337e-06,
"loss": 0.9157,
"step": 486
},
{
"epoch": 3.128205128205128,
"grad_norm": 0.0947487050346647,
"learning_rate": 8.2274503477295e-06,
"loss": 0.9053,
"step": 488
},
{
"epoch": 3.141025641025641,
"grad_norm": 0.09366500902355888,
"learning_rate": 8.211230769022552e-06,
"loss": 0.8925,
"step": 490
},
{
"epoch": 3.1538461538461537,
"grad_norm": 0.09167161100151112,
"learning_rate": 8.19495346926924e-06,
"loss": 0.9165,
"step": 492
},
{
"epoch": 3.1666666666666665,
"grad_norm": 0.09307041831758973,
"learning_rate": 8.178618741049841e-06,
"loss": 0.8989,
"step": 494
},
{
"epoch": 3.1794871794871793,
"grad_norm": 0.09585560939367876,
"learning_rate": 8.162226877976886e-06,
"loss": 0.9147,
"step": 496
},
{
"epoch": 3.1923076923076925,
"grad_norm": 0.09180060088840723,
"learning_rate": 8.145778174689897e-06,
"loss": 0.8882,
"step": 498
},
{
"epoch": 3.2051282051282053,
"grad_norm": 0.09609878354099273,
"learning_rate": 8.129272926850079e-06,
"loss": 0.8744,
"step": 500
},
{
"epoch": 3.217948717948718,
"grad_norm": 0.09691473472460625,
"learning_rate": 8.112711431135014e-06,
"loss": 0.8736,
"step": 502
},
{
"epoch": 3.230769230769231,
"grad_norm": 0.09236636322834278,
"learning_rate": 8.096093985233323e-06,
"loss": 0.848,
"step": 504
},
{
"epoch": 3.2435897435897436,
"grad_norm": 0.09704717599279773,
"learning_rate": 8.079420887839316e-06,
"loss": 0.8844,
"step": 506
},
{
"epoch": 3.2564102564102564,
"grad_norm": 0.09939291409466518,
"learning_rate": 8.062692438647628e-06,
"loss": 0.8866,
"step": 508
},
{
"epoch": 3.269230769230769,
"grad_norm": 0.09353962075083472,
"learning_rate": 8.045908938347828e-06,
"loss": 0.8742,
"step": 510
},
{
"epoch": 3.282051282051282,
"grad_norm": 0.09465310178443197,
"learning_rate": 8.029070688619013e-06,
"loss": 0.8833,
"step": 512
},
{
"epoch": 3.2948717948717947,
"grad_norm": 0.09443637715651476,
"learning_rate": 8.012177992124385e-06,
"loss": 0.8794,
"step": 514
},
{
"epoch": 3.3076923076923075,
"grad_norm": 0.09728431520292821,
"learning_rate": 7.995231152505815e-06,
"loss": 0.8732,
"step": 516
},
{
"epoch": 3.3205128205128207,
"grad_norm": 0.09428493650909285,
"learning_rate": 7.978230474378383e-06,
"loss": 0.8597,
"step": 518
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.09850772889396305,
"learning_rate": 7.961176263324902e-06,
"loss": 0.8624,
"step": 520
},
{
"epoch": 3.3461538461538463,
"grad_norm": 0.09087037549609535,
"learning_rate": 7.944068825890424e-06,
"loss": 0.8821,
"step": 522
},
{
"epoch": 3.358974358974359,
"grad_norm": 0.09180369503983593,
"learning_rate": 7.92690846957673e-06,
"loss": 0.8688,
"step": 524
},
{
"epoch": 3.371794871794872,
"grad_norm": 0.09491604280681391,
"learning_rate": 7.909695502836814e-06,
"loss": 0.8647,
"step": 526
},
{
"epoch": 3.3846153846153846,
"grad_norm": 0.09921876854138406,
"learning_rate": 7.892430235069317e-06,
"loss": 0.8869,
"step": 528
},
{
"epoch": 3.3974358974358974,
"grad_norm": 0.09457741703712105,
"learning_rate": 7.875112976612984e-06,
"loss": 0.8639,
"step": 530
},
{
"epoch": 3.41025641025641,
"grad_norm": 0.09583219613481893,
"learning_rate": 7.857744038741076e-06,
"loss": 0.8805,
"step": 532
},
{
"epoch": 3.423076923076923,
"grad_norm": 0.09260516206658106,
"learning_rate": 7.84032373365578e-06,
"loss": 0.8603,
"step": 534
},
{
"epoch": 3.435897435897436,
"grad_norm": 0.09932108403192164,
"learning_rate": 7.822852374482597e-06,
"loss": 0.8658,
"step": 536
},
{
"epoch": 3.448717948717949,
"grad_norm": 0.09728531208245553,
"learning_rate": 7.805330275264707e-06,
"loss": 0.8536,
"step": 538
},
{
"epoch": 3.4615384615384617,
"grad_norm": 0.09952432033061036,
"learning_rate": 7.787757750957335e-06,
"loss": 0.8763,
"step": 540
},
{
"epoch": 3.4743589743589745,
"grad_norm": 0.09845329832112057,
"learning_rate": 7.77013511742208e-06,
"loss": 0.8658,
"step": 542
},
{
"epoch": 3.4871794871794872,
"grad_norm": 0.10349699075619775,
"learning_rate": 7.752462691421245e-06,
"loss": 0.8538,
"step": 544
},
{
"epoch": 3.5,
"grad_norm": 0.15469316317671902,
"learning_rate": 7.734740790612137e-06,
"loss": 0.8644,
"step": 546
},
{
"epoch": 3.5128205128205128,
"grad_norm": 0.09649309700047885,
"learning_rate": 7.716969733541357e-06,
"loss": 0.8755,
"step": 548
},
{
"epoch": 3.5256410256410255,
"grad_norm": 0.09860823779259517,
"learning_rate": 7.699149839639086e-06,
"loss": 0.8471,
"step": 550
},
{
"epoch": 3.5384615384615383,
"grad_norm": 0.09867635522074884,
"learning_rate": 7.681281429213328e-06,
"loss": 0.8512,
"step": 552
},
{
"epoch": 3.551282051282051,
"grad_norm": 0.09856703594780034,
"learning_rate": 7.663364823444157e-06,
"loss": 0.8581,
"step": 554
},
{
"epoch": 3.564102564102564,
"grad_norm": 0.10120010505390695,
"learning_rate": 7.645400344377953e-06,
"loss": 0.8647,
"step": 556
},
{
"epoch": 3.5769230769230766,
"grad_norm": 0.09353647856294549,
"learning_rate": 7.627388314921602e-06,
"loss": 0.8563,
"step": 558
},
{
"epoch": 3.58974358974359,
"grad_norm": 0.097727849555005,
"learning_rate": 7.609329058836694e-06,
"loss": 0.8629,
"step": 560
},
{
"epoch": 3.6025641025641026,
"grad_norm": 0.09185843649741915,
"learning_rate": 7.59122290073371e-06,
"loss": 0.8517,
"step": 562
},
{
"epoch": 3.6153846153846154,
"grad_norm": 0.16467906411387448,
"learning_rate": 7.5730701660661795e-06,
"loss": 0.8588,
"step": 564
},
{
"epoch": 3.628205128205128,
"grad_norm": 0.10490078157659109,
"learning_rate": 7.554871181124836e-06,
"loss": 0.8916,
"step": 566
},
{
"epoch": 3.641025641025641,
"grad_norm": 0.09862237486460196,
"learning_rate": 7.536626273031747e-06,
"loss": 0.8486,
"step": 568
},
{
"epoch": 3.6538461538461537,
"grad_norm": 0.09855168103779419,
"learning_rate": 7.5183357697344395e-06,
"loss": 0.8532,
"step": 570
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.09943631897387811,
"learning_rate": 7.500000000000001e-06,
"loss": 0.8643,
"step": 572
},
{
"epoch": 3.6794871794871797,
"grad_norm": 0.09470558794565637,
"learning_rate": 7.481619293409173e-06,
"loss": 0.8705,
"step": 574
},
{
"epoch": 3.6923076923076925,
"grad_norm": 0.09434833275033037,
"learning_rate": 7.4631939803504215e-06,
"loss": 0.8597,
"step": 576
},
{
"epoch": 3.7051282051282053,
"grad_norm": 0.09852625213361811,
"learning_rate": 7.44472439201401e-06,
"loss": 0.8665,
"step": 578
},
{
"epoch": 3.717948717948718,
"grad_norm": 0.09522012579767557,
"learning_rate": 7.426210860386032e-06,
"loss": 0.8373,
"step": 580
},
{
"epoch": 3.730769230769231,
"grad_norm": 0.09872214935386595,
"learning_rate": 7.407653718242449e-06,
"loss": 0.8266,
"step": 582
},
{
"epoch": 3.7435897435897436,
"grad_norm": 0.09611754066886699,
"learning_rate": 7.3890532991431174e-06,
"loss": 0.8422,
"step": 584
},
{
"epoch": 3.7564102564102564,
"grad_norm": 0.09430702389773353,
"learning_rate": 7.370409937425781e-06,
"loss": 0.8349,
"step": 586
},
{
"epoch": 3.769230769230769,
"grad_norm": 0.10000120202753963,
"learning_rate": 7.3517239682000675e-06,
"loss": 0.8589,
"step": 588
},
{
"epoch": 3.782051282051282,
"grad_norm": 0.09477208728170344,
"learning_rate": 7.332995727341462e-06,
"loss": 0.8591,
"step": 590
},
{
"epoch": 3.7948717948717947,
"grad_norm": 0.09696166000717225,
"learning_rate": 7.314225551485273e-06,
"loss": 0.8397,
"step": 592
},
{
"epoch": 3.8076923076923075,
"grad_norm": 0.09621353397155066,
"learning_rate": 7.295413778020579e-06,
"loss": 0.8166,
"step": 594
},
{
"epoch": 3.8205128205128203,
"grad_norm": 0.09692687114207367,
"learning_rate": 7.276560745084167e-06,
"loss": 0.8521,
"step": 596
},
{
"epoch": 3.8333333333333335,
"grad_norm": 0.09885126357081214,
"learning_rate": 7.257666791554448e-06,
"loss": 0.8416,
"step": 598
},
{
"epoch": 3.8461538461538463,
"grad_norm": 0.10239714078021848,
"learning_rate": 7.2387322570453724e-06,
"loss": 0.8324,
"step": 600
},
{
"epoch": 3.858974358974359,
"grad_norm": 0.11251898784242197,
"learning_rate": 7.219757481900325e-06,
"loss": 0.835,
"step": 602
},
{
"epoch": 3.871794871794872,
"grad_norm": 0.1005799166719958,
"learning_rate": 7.2007428071860045e-06,
"loss": 0.8035,
"step": 604
},
{
"epoch": 3.8846153846153846,
"grad_norm": 0.10103534145014936,
"learning_rate": 7.181688574686292e-06,
"loss": 0.8709,
"step": 606
},
{
"epoch": 3.8974358974358974,
"grad_norm": 0.10027552225015914,
"learning_rate": 7.162595126896111e-06,
"loss": 0.8319,
"step": 608
},
{
"epoch": 3.91025641025641,
"grad_norm": 0.10075780749863547,
"learning_rate": 7.143462807015271e-06,
"loss": 0.8323,
"step": 610
},
{
"epoch": 3.9230769230769234,
"grad_norm": 0.09472929060217589,
"learning_rate": 7.1242919589422974e-06,
"loss": 0.8185,
"step": 612
},
{
"epoch": 3.935897435897436,
"grad_norm": 0.09472378350788888,
"learning_rate": 7.105082927268247e-06,
"loss": 0.8304,
"step": 614
},
{
"epoch": 3.948717948717949,
"grad_norm": 0.10337359146731352,
"learning_rate": 7.085836057270521e-06,
"loss": 0.8174,
"step": 616
},
{
"epoch": 3.9615384615384617,
"grad_norm": 0.0983672088113577,
"learning_rate": 7.066551694906651e-06,
"loss": 0.8322,
"step": 618
},
{
"epoch": 3.9743589743589745,
"grad_norm": 0.1019500525911841,
"learning_rate": 7.047230186808085e-06,
"loss": 0.8021,
"step": 620
},
{
"epoch": 3.9871794871794872,
"grad_norm": 0.09750574300751329,
"learning_rate": 7.027871880273959e-06,
"loss": 0.7983,
"step": 622
},
{
"epoch": 4.0,
"grad_norm": 0.10208128186441004,
"learning_rate": 7.008477123264849e-06,
"loss": 0.8239,
"step": 624
},
{
"epoch": 4.012820512820513,
"grad_norm": 0.10734300522197977,
"learning_rate": 6.989046264396516e-06,
"loss": 0.7678,
"step": 626
},
{
"epoch": 4.0256410256410255,
"grad_norm": 0.102980617519378,
"learning_rate": 6.96957965293365e-06,
"loss": 0.7377,
"step": 628
},
{
"epoch": 4.038461538461538,
"grad_norm": 0.12545611352143843,
"learning_rate": 6.9500776387835785e-06,
"loss": 0.7581,
"step": 630
},
{
"epoch": 4.051282051282051,
"grad_norm": 0.122707057481331,
"learning_rate": 6.9305405724899876e-06,
"loss": 0.7399,
"step": 632
},
{
"epoch": 4.064102564102564,
"grad_norm": 0.11397293701236821,
"learning_rate": 6.91096880522661e-06,
"loss": 0.7447,
"step": 634
},
{
"epoch": 4.076923076923077,
"grad_norm": 0.13487306338562802,
"learning_rate": 6.891362688790925e-06,
"loss": 0.7546,
"step": 636
},
{
"epoch": 4.089743589743589,
"grad_norm": 0.10896326697255375,
"learning_rate": 6.871722575597829e-06,
"loss": 0.7579,
"step": 638
},
{
"epoch": 4.102564102564102,
"grad_norm": 0.11165624709106642,
"learning_rate": 6.8520488186733e-06,
"loss": 0.7517,
"step": 640
},
{
"epoch": 4.115384615384615,
"grad_norm": 0.11518303790398043,
"learning_rate": 6.832341771648057e-06,
"loss": 0.7459,
"step": 642
},
{
"epoch": 4.128205128205128,
"grad_norm": 0.11119475069076129,
"learning_rate": 6.812601788751192e-06,
"loss": 0.7825,
"step": 644
},
{
"epoch": 4.141025641025641,
"grad_norm": 0.10743202492055963,
"learning_rate": 6.792829224803816e-06,
"loss": 0.7395,
"step": 646
},
{
"epoch": 4.153846153846154,
"grad_norm": 0.10582985557685172,
"learning_rate": 6.773024435212678e-06,
"loss": 0.7637,
"step": 648
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.10835455281881788,
"learning_rate": 6.753187775963773e-06,
"loss": 0.7576,
"step": 650
},
{
"epoch": 4.17948717948718,
"grad_norm": 0.1107213708183791,
"learning_rate": 6.733319603615941e-06,
"loss": 0.7519,
"step": 652
},
{
"epoch": 4.1923076923076925,
"grad_norm": 0.11143239841237282,
"learning_rate": 6.713420275294467e-06,
"loss": 0.7421,
"step": 654
},
{
"epoch": 4.205128205128205,
"grad_norm": 0.10135913047939792,
"learning_rate": 6.693490148684654e-06,
"loss": 0.7503,
"step": 656
},
{
"epoch": 4.217948717948718,
"grad_norm": 0.10935890173613132,
"learning_rate": 6.673529582025398e-06,
"loss": 0.7469,
"step": 658
},
{
"epoch": 4.230769230769231,
"grad_norm": 0.10682800250997206,
"learning_rate": 6.653538934102743e-06,
"loss": 0.7519,
"step": 660
},
{
"epoch": 4.243589743589744,
"grad_norm": 0.11174312070750286,
"learning_rate": 6.633518564243442e-06,
"loss": 0.7388,
"step": 662
},
{
"epoch": 4.256410256410256,
"grad_norm": 0.10996882792698588,
"learning_rate": 6.6134688323084884e-06,
"loss": 0.735,
"step": 664
},
{
"epoch": 4.269230769230769,
"grad_norm": 0.11513381552989353,
"learning_rate": 6.593390098686653e-06,
"loss": 0.7266,
"step": 666
},
{
"epoch": 4.282051282051282,
"grad_norm": 0.10383307615951057,
"learning_rate": 6.573282724288001e-06,
"loss": 0.7354,
"step": 668
},
{
"epoch": 4.294871794871795,
"grad_norm": 0.10064526192695795,
"learning_rate": 6.553147070537413e-06,
"loss": 0.7316,
"step": 670
},
{
"epoch": 4.3076923076923075,
"grad_norm": 0.10546529880700707,
"learning_rate": 6.532983499368078e-06,
"loss": 0.7345,
"step": 672
},
{
"epoch": 4.32051282051282,
"grad_norm": 0.10452514955349174,
"learning_rate": 6.512792373215e-06,
"loss": 0.7552,
"step": 674
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.10501851155628895,
"learning_rate": 6.492574055008474e-06,
"loss": 0.715,
"step": 676
},
{
"epoch": 4.346153846153846,
"grad_norm": 0.10411036896818421,
"learning_rate": 6.472328908167562e-06,
"loss": 0.729,
"step": 678
},
{
"epoch": 4.358974358974359,
"grad_norm": 0.11127049713049718,
"learning_rate": 6.452057296593568e-06,
"loss": 0.744,
"step": 680
},
{
"epoch": 4.371794871794872,
"grad_norm": 0.12676881136201423,
"learning_rate": 6.431759584663492e-06,
"loss": 0.7588,
"step": 682
},
{
"epoch": 4.384615384615385,
"grad_norm": 0.105870619579206,
"learning_rate": 6.411436137223479e-06,
"loss": 0.7247,
"step": 684
},
{
"epoch": 4.397435897435898,
"grad_norm": 0.10374120826824249,
"learning_rate": 6.391087319582264e-06,
"loss": 0.7309,
"step": 686
},
{
"epoch": 4.410256410256411,
"grad_norm": 0.10865846153479375,
"learning_rate": 6.370713497504607e-06,
"loss": 0.7482,
"step": 688
},
{
"epoch": 4.423076923076923,
"grad_norm": 0.11160085810481411,
"learning_rate": 6.350315037204714e-06,
"loss": 0.7254,
"step": 690
},
{
"epoch": 4.435897435897436,
"grad_norm": 0.10544486611527323,
"learning_rate": 6.329892305339659e-06,
"loss": 0.7053,
"step": 692
},
{
"epoch": 4.448717948717949,
"grad_norm": 0.10611707780750092,
"learning_rate": 6.309445669002787e-06,
"loss": 0.7078,
"step": 694
},
{
"epoch": 4.461538461538462,
"grad_norm": 0.10588157071847835,
"learning_rate": 6.288975495717124e-06,
"loss": 0.7412,
"step": 696
},
{
"epoch": 4.4743589743589745,
"grad_norm": 0.10785564192135899,
"learning_rate": 6.268482153428763e-06,
"loss": 0.7289,
"step": 698
},
{
"epoch": 4.487179487179487,
"grad_norm": 0.10456174831291559,
"learning_rate": 6.247966010500258e-06,
"loss": 0.7233,
"step": 700
},
{
"epoch": 4.5,
"grad_norm": 0.10739198046560715,
"learning_rate": 6.227427435703997e-06,
"loss": 0.7308,
"step": 702
},
{
"epoch": 4.512820512820513,
"grad_norm": 0.11062331534549659,
"learning_rate": 6.206866798215571e-06,
"loss": 0.7188,
"step": 704
},
{
"epoch": 4.5256410256410255,
"grad_norm": 0.1120412879852177,
"learning_rate": 6.186284467607149e-06,
"loss": 0.7149,
"step": 706
},
{
"epoch": 4.538461538461538,
"grad_norm": 0.10581044212948068,
"learning_rate": 6.165680813840822e-06,
"loss": 0.7286,
"step": 708
},
{
"epoch": 4.551282051282051,
"grad_norm": 0.10581925858925155,
"learning_rate": 6.1450562072619635e-06,
"loss": 0.6854,
"step": 710
},
{
"epoch": 4.564102564102564,
"grad_norm": 0.11850925857398317,
"learning_rate": 6.124411018592568e-06,
"loss": 0.7215,
"step": 712
},
{
"epoch": 4.576923076923077,
"grad_norm": 0.12029983367038724,
"learning_rate": 6.103745618924587e-06,
"loss": 0.7142,
"step": 714
},
{
"epoch": 4.589743589743589,
"grad_norm": 0.10567103079137533,
"learning_rate": 6.0830603797132574e-06,
"loss": 0.7162,
"step": 716
},
{
"epoch": 4.602564102564102,
"grad_norm": 0.10836686741052724,
"learning_rate": 6.0623556727704306e-06,
"loss": 0.7165,
"step": 718
},
{
"epoch": 4.615384615384615,
"grad_norm": 0.11249604087548312,
"learning_rate": 6.041631870257882e-06,
"loss": 0.7383,
"step": 720
},
{
"epoch": 4.628205128205128,
"grad_norm": 0.1082063599668396,
"learning_rate": 6.020889344680627e-06,
"loss": 0.6952,
"step": 722
},
{
"epoch": 4.641025641025641,
"grad_norm": 0.10282892185990167,
"learning_rate": 6.000128468880223e-06,
"loss": 0.7167,
"step": 724
},
{
"epoch": 4.653846153846154,
"grad_norm": 0.14775806059988206,
"learning_rate": 5.979349616028067e-06,
"loss": 0.7015,
"step": 726
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.1146560861251404,
"learning_rate": 5.958553159618693e-06,
"loss": 0.7213,
"step": 728
},
{
"epoch": 4.67948717948718,
"grad_norm": 0.10561771243314702,
"learning_rate": 5.937739473463047e-06,
"loss": 0.7296,
"step": 730
},
{
"epoch": 4.6923076923076925,
"grad_norm": 0.1030552904773058,
"learning_rate": 5.916908931681781e-06,
"loss": 0.7123,
"step": 732
},
{
"epoch": 4.705128205128205,
"grad_norm": 0.11007539115142843,
"learning_rate": 5.896061908698521e-06,
"loss": 0.7048,
"step": 734
},
{
"epoch": 4.717948717948718,
"grad_norm": 0.11416376306689043,
"learning_rate": 5.8751987792331365e-06,
"loss": 0.7137,
"step": 736
},
{
"epoch": 4.730769230769231,
"grad_norm": 0.10152180107259362,
"learning_rate": 5.854319918295012e-06,
"loss": 0.7051,
"step": 738
},
{
"epoch": 4.743589743589744,
"grad_norm": 0.11206883891832514,
"learning_rate": 5.833425701176294e-06,
"loss": 0.6923,
"step": 740
},
{
"epoch": 4.756410256410256,
"grad_norm": 0.10804199427828234,
"learning_rate": 5.812516503445158e-06,
"loss": 0.6955,
"step": 742
},
{
"epoch": 4.769230769230769,
"grad_norm": 0.10618536471151145,
"learning_rate": 5.79159270093905e-06,
"loss": 0.7051,
"step": 744
},
{
"epoch": 4.782051282051282,
"grad_norm": 0.112445946670164,
"learning_rate": 5.770654669757935e-06,
"loss": 0.6862,
"step": 746
},
{
"epoch": 4.794871794871795,
"grad_norm": 0.10623939719616725,
"learning_rate": 5.749702786257529e-06,
"loss": 0.7021,
"step": 748
},
{
"epoch": 4.8076923076923075,
"grad_norm": 0.11066503537728437,
"learning_rate": 5.7287374270425475e-06,
"loss": 0.7083,
"step": 750
},
{
"epoch": 4.82051282051282,
"grad_norm": 0.11956485729401434,
"learning_rate": 5.707758968959923e-06,
"loss": 0.7052,
"step": 752
},
{
"epoch": 4.833333333333333,
"grad_norm": 0.11607859173183654,
"learning_rate": 5.686767789092041e-06,
"loss": 0.7114,
"step": 754
},
{
"epoch": 4.846153846153846,
"grad_norm": 0.10875829497210732,
"learning_rate": 5.6657642647499545e-06,
"loss": 0.7159,
"step": 756
},
{
"epoch": 4.858974358974359,
"grad_norm": 0.10952816111674243,
"learning_rate": 5.644748773466606e-06,
"loss": 0.7036,
"step": 758
},
{
"epoch": 4.871794871794872,
"grad_norm": 0.10684780948629531,
"learning_rate": 5.62372169299004e-06,
"loss": 0.7225,
"step": 760
},
{
"epoch": 4.884615384615385,
"grad_norm": 0.1047662448976948,
"learning_rate": 5.6026834012766155e-06,
"loss": 0.6805,
"step": 762
},
{
"epoch": 4.897435897435898,
"grad_norm": 0.10955003114927836,
"learning_rate": 5.581634276484211e-06,
"loss": 0.6792,
"step": 764
},
{
"epoch": 4.910256410256411,
"grad_norm": 0.10878018550941551,
"learning_rate": 5.560574696965425e-06,
"loss": 0.6921,
"step": 766
},
{
"epoch": 4.923076923076923,
"grad_norm": 0.11093790171018045,
"learning_rate": 5.539505041260779e-06,
"loss": 0.6956,
"step": 768
},
{
"epoch": 4.935897435897436,
"grad_norm": 0.1115655815203421,
"learning_rate": 5.518425688091906e-06,
"loss": 0.7024,
"step": 770
},
{
"epoch": 4.948717948717949,
"grad_norm": 0.1131005595068268,
"learning_rate": 5.497337016354757e-06,
"loss": 0.7148,
"step": 772
},
{
"epoch": 4.961538461538462,
"grad_norm": 0.11347516336979874,
"learning_rate": 5.476239405112775e-06,
"loss": 0.6816,
"step": 774
},
{
"epoch": 4.9743589743589745,
"grad_norm": 0.10898186232548415,
"learning_rate": 5.45513323359009e-06,
"loss": 0.7273,
"step": 776
},
{
"epoch": 4.987179487179487,
"grad_norm": 0.11549198646562549,
"learning_rate": 5.434018881164702e-06,
"loss": 0.6917,
"step": 778
},
{
"epoch": 5.0,
"grad_norm": 0.10772346133987304,
"learning_rate": 5.412896727361663e-06,
"loss": 0.6863,
"step": 780
},
{
"epoch": 5.012820512820513,
"grad_norm": 0.12047900705924511,
"learning_rate": 5.391767151846247e-06,
"loss": 0.6282,
"step": 782
},
{
"epoch": 5.0256410256410255,
"grad_norm": 0.11697315416876589,
"learning_rate": 5.370630534417133e-06,
"loss": 0.6488,
"step": 784
},
{
"epoch": 5.038461538461538,
"grad_norm": 0.12319515362238828,
"learning_rate": 5.349487254999579e-06,
"loss": 0.6356,
"step": 786
},
{
"epoch": 5.051282051282051,
"grad_norm": 0.13800689390518142,
"learning_rate": 5.328337693638591e-06,
"loss": 0.6048,
"step": 788
},
{
"epoch": 5.064102564102564,
"grad_norm": 0.15216649155566264,
"learning_rate": 5.307182230492089e-06,
"loss": 0.6275,
"step": 790
},
{
"epoch": 5.076923076923077,
"grad_norm": 0.14128455530647008,
"learning_rate": 5.286021245824075e-06,
"loss": 0.6255,
"step": 792
},
{
"epoch": 5.089743589743589,
"grad_norm": 0.11688265140577293,
"learning_rate": 5.264855119997803e-06,
"loss": 0.6257,
"step": 794
},
{
"epoch": 5.102564102564102,
"grad_norm": 0.11464621859154618,
"learning_rate": 5.243684233468933e-06,
"loss": 0.6157,
"step": 796
},
{
"epoch": 5.115384615384615,
"grad_norm": 0.13981445654349306,
"learning_rate": 5.222508966778702e-06,
"loss": 0.6508,
"step": 798
},
{
"epoch": 5.128205128205128,
"grad_norm": 0.14392893164542925,
"learning_rate": 5.201329700547077e-06,
"loss": 0.6287,
"step": 800
},
{
"epoch": 5.141025641025641,
"grad_norm": 0.13170699750399084,
"learning_rate": 5.180146815465915e-06,
"loss": 0.614,
"step": 802
},
{
"epoch": 5.153846153846154,
"grad_norm": 0.10789843763721454,
"learning_rate": 5.158960692292122e-06,
"loss": 0.6078,
"step": 804
},
{
"epoch": 5.166666666666667,
"grad_norm": 0.10695990705186179,
"learning_rate": 5.137771711840811e-06,
"loss": 0.6034,
"step": 806
},
{
"epoch": 5.17948717948718,
"grad_norm": 0.11206607562131153,
"learning_rate": 5.116580254978447e-06,
"loss": 0.5992,
"step": 808
},
{
"epoch": 5.1923076923076925,
"grad_norm": 0.118691570735362,
"learning_rate": 5.095386702616012e-06,
"loss": 0.6111,
"step": 810
},
{
"epoch": 5.205128205128205,
"grad_norm": 0.10922869228321892,
"learning_rate": 5.074191435702155e-06,
"loss": 0.5879,
"step": 812
},
{
"epoch": 5.217948717948718,
"grad_norm": 0.11128326921959672,
"learning_rate": 5.05299483521634e-06,
"loss": 0.6165,
"step": 814
},
{
"epoch": 5.230769230769231,
"grad_norm": 0.10582354369475495,
"learning_rate": 5.031797282162007e-06,
"loss": 0.5897,
"step": 816
},
{
"epoch": 5.243589743589744,
"grad_norm": 0.11451654465777673,
"learning_rate": 5.010599157559713e-06,
"loss": 0.6062,
"step": 818
},
{
"epoch": 5.256410256410256,
"grad_norm": 0.10173363821181294,
"learning_rate": 4.98940084244029e-06,
"loss": 0.6285,
"step": 820
},
{
"epoch": 5.269230769230769,
"grad_norm": 0.11291157343417497,
"learning_rate": 4.968202717837996e-06,
"loss": 0.6049,
"step": 822
},
{
"epoch": 5.282051282051282,
"grad_norm": 0.12019898107948757,
"learning_rate": 4.947005164783661e-06,
"loss": 0.6208,
"step": 824
},
{
"epoch": 5.294871794871795,
"grad_norm": 0.114373764807783,
"learning_rate": 4.925808564297847e-06,
"loss": 0.6043,
"step": 826
},
{
"epoch": 5.3076923076923075,
"grad_norm": 0.11491095476092386,
"learning_rate": 4.9046132973839895e-06,
"loss": 0.6519,
"step": 828
},
{
"epoch": 5.32051282051282,
"grad_norm": 0.11758969246232907,
"learning_rate": 4.883419745021554e-06,
"loss": 0.6156,
"step": 830
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.11120312257140173,
"learning_rate": 4.862228288159191e-06,
"loss": 0.6217,
"step": 832
},
{
"epoch": 5.346153846153846,
"grad_norm": 0.11752943064735909,
"learning_rate": 4.841039307707878e-06,
"loss": 0.6234,
"step": 834
},
{
"epoch": 5.358974358974359,
"grad_norm": 0.11864432138040991,
"learning_rate": 4.819853184534085e-06,
"loss": 0.5947,
"step": 836
},
{
"epoch": 5.371794871794872,
"grad_norm": 0.11941298362678118,
"learning_rate": 4.798670299452926e-06,
"loss": 0.613,
"step": 838
},
{
"epoch": 5.384615384615385,
"grad_norm": 0.11307708710954777,
"learning_rate": 4.7774910332213005e-06,
"loss": 0.594,
"step": 840
},
{
"epoch": 5.397435897435898,
"grad_norm": 0.1093069103681417,
"learning_rate": 4.756315766531069e-06,
"loss": 0.6049,
"step": 842
},
{
"epoch": 5.410256410256411,
"grad_norm": 0.11381070200838837,
"learning_rate": 4.735144880002199e-06,
"loss": 0.6105,
"step": 844
},
{
"epoch": 5.423076923076923,
"grad_norm": 0.1318249567633448,
"learning_rate": 4.713978754175926e-06,
"loss": 0.6002,
"step": 846
},
{
"epoch": 5.435897435897436,
"grad_norm": 0.11599444919219819,
"learning_rate": 4.692817769507912e-06,
"loss": 0.6042,
"step": 848
},
{
"epoch": 5.448717948717949,
"grad_norm": 0.11182097821019421,
"learning_rate": 4.671662306361409e-06,
"loss": 0.6103,
"step": 850
},
{
"epoch": 5.461538461538462,
"grad_norm": 0.10954799492546995,
"learning_rate": 4.6505127450004216e-06,
"loss": 0.6016,
"step": 852
},
{
"epoch": 5.4743589743589745,
"grad_norm": 0.11308176387281334,
"learning_rate": 4.62936946558287e-06,
"loss": 0.6051,
"step": 854
},
{
"epoch": 5.487179487179487,
"grad_norm": 0.10759177792306925,
"learning_rate": 4.608232848153757e-06,
"loss": 0.6134,
"step": 856
},
{
"epoch": 5.5,
"grad_norm": 0.11581427783989148,
"learning_rate": 4.587103272638339e-06,
"loss": 0.6119,
"step": 858
},
{
"epoch": 5.512820512820513,
"grad_norm": 0.11905934839017085,
"learning_rate": 4.565981118835299e-06,
"loss": 0.5898,
"step": 860
},
{
"epoch": 5.5256410256410255,
"grad_norm": 0.11350808165481102,
"learning_rate": 4.5448667664099125e-06,
"loss": 0.5917,
"step": 862
},
{
"epoch": 5.538461538461538,
"grad_norm": 0.10584763781810944,
"learning_rate": 4.523760594887228e-06,
"loss": 0.5989,
"step": 864
},
{
"epoch": 5.551282051282051,
"grad_norm": 0.1052428154285861,
"learning_rate": 4.5026629836452445e-06,
"loss": 0.5957,
"step": 866
},
{
"epoch": 5.564102564102564,
"grad_norm": 0.10854637153715369,
"learning_rate": 4.481574311908096e-06,
"loss": 0.6091,
"step": 868
},
{
"epoch": 5.576923076923077,
"grad_norm": 0.1114814068673184,
"learning_rate": 4.460494958739223e-06,
"loss": 0.5982,
"step": 870
},
{
"epoch": 5.589743589743589,
"grad_norm": 0.10679516983860898,
"learning_rate": 4.439425303034576e-06,
"loss": 0.5781,
"step": 872
},
{
"epoch": 5.602564102564102,
"grad_norm": 0.11151279055313694,
"learning_rate": 4.418365723515791e-06,
"loss": 0.5875,
"step": 874
},
{
"epoch": 5.615384615384615,
"grad_norm": 0.11196990690511702,
"learning_rate": 4.397316598723385e-06,
"loss": 0.6389,
"step": 876
},
{
"epoch": 5.628205128205128,
"grad_norm": 0.11160421611762042,
"learning_rate": 4.376278307009962e-06,
"loss": 0.5966,
"step": 878
},
{
"epoch": 5.641025641025641,
"grad_norm": 0.11892010523420538,
"learning_rate": 4.355251226533396e-06,
"loss": 0.5967,
"step": 880
},
{
"epoch": 5.653846153846154,
"grad_norm": 0.10528377897572419,
"learning_rate": 4.334235735250047e-06,
"loss": 0.5872,
"step": 882
},
{
"epoch": 5.666666666666667,
"grad_norm": 0.11078816330375864,
"learning_rate": 4.313232210907959e-06,
"loss": 0.5839,
"step": 884
},
{
"epoch": 5.67948717948718,
"grad_norm": 0.11204326662361994,
"learning_rate": 4.292241031040077e-06,
"loss": 0.5985,
"step": 886
},
{
"epoch": 5.6923076923076925,
"grad_norm": 0.11136880270558686,
"learning_rate": 4.271262572957453e-06,
"loss": 0.5899,
"step": 888
},
{
"epoch": 5.705128205128205,
"grad_norm": 0.11504012535772838,
"learning_rate": 4.250297213742473e-06,
"loss": 0.5861,
"step": 890
},
{
"epoch": 5.717948717948718,
"grad_norm": 0.11477930854527991,
"learning_rate": 4.229345330242067e-06,
"loss": 0.603,
"step": 892
},
{
"epoch": 5.730769230769231,
"grad_norm": 0.10662143707078175,
"learning_rate": 4.2084072990609505e-06,
"loss": 0.6222,
"step": 894
},
{
"epoch": 5.743589743589744,
"grad_norm": 0.11014567262417806,
"learning_rate": 4.187483496554844e-06,
"loss": 0.607,
"step": 896
},
{
"epoch": 5.756410256410256,
"grad_norm": 0.11111661489395548,
"learning_rate": 4.166574298823707e-06,
"loss": 0.5932,
"step": 898
},
{
"epoch": 5.769230769230769,
"grad_norm": 0.11242007034134975,
"learning_rate": 4.145680081704989e-06,
"loss": 0.5995,
"step": 900
},
{
"epoch": 5.782051282051282,
"grad_norm": 0.11419724226540427,
"learning_rate": 4.1248012207668635e-06,
"loss": 0.5915,
"step": 902
},
{
"epoch": 5.794871794871795,
"grad_norm": 0.11706613959593416,
"learning_rate": 4.103938091301479e-06,
"loss": 0.618,
"step": 904
},
{
"epoch": 5.8076923076923075,
"grad_norm": 0.11209016426196475,
"learning_rate": 4.08309106831822e-06,
"loss": 0.6047,
"step": 906
},
{
"epoch": 5.82051282051282,
"grad_norm": 0.10918746883833244,
"learning_rate": 4.062260526536955e-06,
"loss": 0.5993,
"step": 908
},
{
"epoch": 5.833333333333333,
"grad_norm": 0.11265204962035352,
"learning_rate": 4.041446840381309e-06,
"loss": 0.6107,
"step": 910
},
{
"epoch": 5.846153846153846,
"grad_norm": 0.11006154535411454,
"learning_rate": 4.0206503839719335e-06,
"loss": 0.5968,
"step": 912
},
{
"epoch": 5.858974358974359,
"grad_norm": 0.10704556131214127,
"learning_rate": 3.999871531119779e-06,
"loss": 0.6004,
"step": 914
},
{
"epoch": 5.871794871794872,
"grad_norm": 0.11890492535370141,
"learning_rate": 3.9791106553193746e-06,
"loss": 0.6235,
"step": 916
},
{
"epoch": 5.884615384615385,
"grad_norm": 0.11125280463928439,
"learning_rate": 3.9583681297421194e-06,
"loss": 0.5936,
"step": 918
},
{
"epoch": 5.897435897435898,
"grad_norm": 0.11331294271299998,
"learning_rate": 3.937644327229572e-06,
"loss": 0.5935,
"step": 920
},
{
"epoch": 5.910256410256411,
"grad_norm": 0.121603183912255,
"learning_rate": 3.916939620286743e-06,
"loss": 0.5784,
"step": 922
},
{
"epoch": 5.923076923076923,
"grad_norm": 0.11031017606070566,
"learning_rate": 3.896254381075416e-06,
"loss": 0.572,
"step": 924
},
{
"epoch": 5.935897435897436,
"grad_norm": 0.11514051578131597,
"learning_rate": 3.875588981407433e-06,
"loss": 0.6112,
"step": 926
},
{
"epoch": 5.948717948717949,
"grad_norm": 0.10743392753449098,
"learning_rate": 3.854943792738037e-06,
"loss": 0.606,
"step": 928
},
{
"epoch": 5.961538461538462,
"grad_norm": 0.11236268397619992,
"learning_rate": 3.834319186159179e-06,
"loss": 0.5922,
"step": 930
},
{
"epoch": 5.9743589743589745,
"grad_norm": 0.12074864124016804,
"learning_rate": 3.8137155323928526e-06,
"loss": 0.5949,
"step": 932
},
{
"epoch": 5.987179487179487,
"grad_norm": 0.10827890126242279,
"learning_rate": 3.7931332017844302e-06,
"loss": 0.6203,
"step": 934
},
{
"epoch": 6.0,
"grad_norm": 0.10969453481755433,
"learning_rate": 3.7725725642960047e-06,
"loss": 0.5635,
"step": 936
},
{
"epoch": 6.012820512820513,
"grad_norm": 0.1031061811953427,
"learning_rate": 3.752033989499742e-06,
"loss": 0.5165,
"step": 938
},
{
"epoch": 6.0256410256410255,
"grad_norm": 0.1194475307958204,
"learning_rate": 3.7315178465712364e-06,
"loss": 0.5365,
"step": 940
},
{
"epoch": 6.038461538461538,
"grad_norm": 0.12267686021895448,
"learning_rate": 3.7110245042828786e-06,
"loss": 0.557,
"step": 942
},
{
"epoch": 6.051282051282051,
"grad_norm": 0.13498002529028316,
"learning_rate": 3.690554330997215e-06,
"loss": 0.5647,
"step": 944
},
{
"epoch": 6.064102564102564,
"grad_norm": 0.11313547950477416,
"learning_rate": 3.670107694660343e-06,
"loss": 0.5319,
"step": 946
},
{
"epoch": 6.076923076923077,
"grad_norm": 0.11337535375238234,
"learning_rate": 3.6496849627952875e-06,
"loss": 0.5337,
"step": 948
},
{
"epoch": 6.089743589743589,
"grad_norm": 0.12108662571350122,
"learning_rate": 3.6292865024953945e-06,
"loss": 0.5317,
"step": 950
},
{
"epoch": 6.102564102564102,
"grad_norm": 0.10860898068984597,
"learning_rate": 3.6089126804177373e-06,
"loss": 0.5144,
"step": 952
},
{
"epoch": 6.115384615384615,
"grad_norm": 0.10618003581452222,
"learning_rate": 3.5885638627765228e-06,
"loss": 0.5297,
"step": 954
},
{
"epoch": 6.128205128205128,
"grad_norm": 0.10830357191265076,
"learning_rate": 3.568240415336509e-06,
"loss": 0.5153,
"step": 956
},
{
"epoch": 6.141025641025641,
"grad_norm": 0.11656722880396314,
"learning_rate": 3.547942703406433e-06,
"loss": 0.5689,
"step": 958
},
{
"epoch": 6.153846153846154,
"grad_norm": 0.12409515119125744,
"learning_rate": 3.52767109183244e-06,
"loss": 0.5339,
"step": 960
},
{
"epoch": 6.166666666666667,
"grad_norm": 0.10533677158970278,
"learning_rate": 3.507425944991529e-06,
"loss": 0.5298,
"step": 962
},
{
"epoch": 6.17948717948718,
"grad_norm": 0.10779883016864458,
"learning_rate": 3.4872076267850015e-06,
"loss": 0.5393,
"step": 964
},
{
"epoch": 6.1923076923076925,
"grad_norm": 0.10689267808425468,
"learning_rate": 3.4670165006319236e-06,
"loss": 0.5385,
"step": 966
},
{
"epoch": 6.205128205128205,
"grad_norm": 0.10262511919512685,
"learning_rate": 3.4468529294625895e-06,
"loss": 0.5415,
"step": 968
},
{
"epoch": 6.217948717948718,
"grad_norm": 0.1057687129463025,
"learning_rate": 3.4267172757120005e-06,
"loss": 0.5424,
"step": 970
},
{
"epoch": 6.230769230769231,
"grad_norm": 0.10378572914010763,
"learning_rate": 3.406609901313349e-06,
"loss": 0.5112,
"step": 972
},
{
"epoch": 6.243589743589744,
"grad_norm": 0.12203819824027057,
"learning_rate": 3.386531167691512e-06,
"loss": 0.5384,
"step": 974
},
{
"epoch": 6.256410256410256,
"grad_norm": 0.10802288722224572,
"learning_rate": 3.36648143575656e-06,
"loss": 0.5028,
"step": 976
},
{
"epoch": 6.269230769230769,
"grad_norm": 0.1142664875757734,
"learning_rate": 3.3464610658972584e-06,
"loss": 0.5292,
"step": 978
},
{
"epoch": 6.282051282051282,
"grad_norm": 0.10568144050459893,
"learning_rate": 3.326470417974604e-06,
"loss": 0.5105,
"step": 980
},
{
"epoch": 6.294871794871795,
"grad_norm": 0.10752642025433035,
"learning_rate": 3.3065098513153473e-06,
"loss": 0.543,
"step": 982
},
{
"epoch": 6.3076923076923075,
"grad_norm": 0.10979310993514947,
"learning_rate": 3.2865797247055354e-06,
"loss": 0.5277,
"step": 984
},
{
"epoch": 6.32051282051282,
"grad_norm": 0.10254738368035236,
"learning_rate": 3.266680396384061e-06,
"loss": 0.5159,
"step": 986
},
{
"epoch": 6.333333333333333,
"grad_norm": 0.10564094033375905,
"learning_rate": 3.2468122240362287e-06,
"loss": 0.5314,
"step": 988
},
{
"epoch": 6.346153846153846,
"grad_norm": 0.11424837464444032,
"learning_rate": 3.226975564787322e-06,
"loss": 0.5584,
"step": 990
},
{
"epoch": 6.358974358974359,
"grad_norm": 0.10635885003440321,
"learning_rate": 3.2071707751961838e-06,
"loss": 0.542,
"step": 992
},
{
"epoch": 6.371794871794872,
"grad_norm": 0.11342627001310603,
"learning_rate": 3.187398211248811e-06,
"loss": 0.526,
"step": 994
},
{
"epoch": 6.384615384615385,
"grad_norm": 0.10721179464672517,
"learning_rate": 3.1676582283519454e-06,
"loss": 0.5161,
"step": 996
},
{
"epoch": 6.397435897435898,
"grad_norm": 0.10847381371938812,
"learning_rate": 3.1479511813267006e-06,
"loss": 0.5308,
"step": 998
},
{
"epoch": 6.410256410256411,
"grad_norm": 0.10553871633956305,
"learning_rate": 3.1282774244021717e-06,
"loss": 0.5389,
"step": 1000
},
{
"epoch": 6.423076923076923,
"grad_norm": 0.11217705883560118,
"learning_rate": 3.1086373112090762e-06,
"loss": 0.5157,
"step": 1002
},
{
"epoch": 6.435897435897436,
"grad_norm": 0.11158771311440012,
"learning_rate": 3.089031194773392e-06,
"loss": 0.5053,
"step": 1004
},
{
"epoch": 6.448717948717949,
"grad_norm": 0.1089332843436129,
"learning_rate": 3.069459427510014e-06,
"loss": 0.5347,
"step": 1006
},
{
"epoch": 6.461538461538462,
"grad_norm": 0.11195737620435117,
"learning_rate": 3.049922361216422e-06,
"loss": 0.5114,
"step": 1008
},
{
"epoch": 6.4743589743589745,
"grad_norm": 0.10286473273779988,
"learning_rate": 3.0304203470663507e-06,
"loss": 0.5234,
"step": 1010
},
{
"epoch": 6.487179487179487,
"grad_norm": 0.1097353874290811,
"learning_rate": 3.0109537356034856e-06,
"loss": 0.5149,
"step": 1012
},
{
"epoch": 6.5,
"grad_norm": 0.10411123643651161,
"learning_rate": 2.991522876735154e-06,
"loss": 0.5548,
"step": 1014
},
{
"epoch": 6.512820512820513,
"grad_norm": 0.11334395185933709,
"learning_rate": 2.9721281197260427e-06,
"loss": 0.51,
"step": 1016
},
{
"epoch": 6.5256410256410255,
"grad_norm": 0.10500841046128222,
"learning_rate": 2.9527698131919156e-06,
"loss": 0.5139,
"step": 1018
},
{
"epoch": 6.538461538461538,
"grad_norm": 0.11046543252263778,
"learning_rate": 2.9334483050933506e-06,
"loss": 0.5078,
"step": 1020
},
{
"epoch": 6.551282051282051,
"grad_norm": 0.10051525374525226,
"learning_rate": 2.91416394272948e-06,
"loss": 0.5231,
"step": 1022
},
{
"epoch": 6.564102564102564,
"grad_norm": 0.11091867450033485,
"learning_rate": 2.894917072731753e-06,
"loss": 0.5248,
"step": 1024
},
{
"epoch": 6.576923076923077,
"grad_norm": 0.11548978463749487,
"learning_rate": 2.8757080410577042e-06,
"loss": 0.5331,
"step": 1026
},
{
"epoch": 6.589743589743589,
"grad_norm": 0.10458731304307277,
"learning_rate": 2.8565371929847286e-06,
"loss": 0.5107,
"step": 1028
},
{
"epoch": 6.602564102564102,
"grad_norm": 0.1050325378040027,
"learning_rate": 2.83740487310389e-06,
"loss": 0.5477,
"step": 1030
},
{
"epoch": 6.615384615384615,
"grad_norm": 0.10848913413728074,
"learning_rate": 2.81831142531371e-06,
"loss": 0.5268,
"step": 1032
},
{
"epoch": 6.628205128205128,
"grad_norm": 0.10721120045068884,
"learning_rate": 2.7992571928139984e-06,
"loss": 0.5433,
"step": 1034
},
{
"epoch": 6.641025641025641,
"grad_norm": 0.11369155962674747,
"learning_rate": 2.780242518099675e-06,
"loss": 0.5359,
"step": 1036
},
{
"epoch": 6.653846153846154,
"grad_norm": 0.10077730135750612,
"learning_rate": 2.761267742954629e-06,
"loss": 0.5283,
"step": 1038
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.10299831989759317,
"learning_rate": 2.7423332084455543e-06,
"loss": 0.5012,
"step": 1040
},
{
"epoch": 6.67948717948718,
"grad_norm": 0.11366647060441272,
"learning_rate": 2.723439254915834e-06,
"loss": 0.5431,
"step": 1042
},
{
"epoch": 6.6923076923076925,
"grad_norm": 0.10337571007144457,
"learning_rate": 2.704586221979422e-06,
"loss": 0.5218,
"step": 1044
},
{
"epoch": 6.705128205128205,
"grad_norm": 0.10480130947183795,
"learning_rate": 2.6857744485147286e-06,
"loss": 0.5275,
"step": 1046
},
{
"epoch": 6.717948717948718,
"grad_norm": 0.10841475505701116,
"learning_rate": 2.667004272658541e-06,
"loss": 0.5239,
"step": 1048
},
{
"epoch": 6.730769230769231,
"grad_norm": 0.10494980003698792,
"learning_rate": 2.6482760317999338e-06,
"loss": 0.5307,
"step": 1050
},
{
"epoch": 6.743589743589744,
"grad_norm": 0.11015308961784986,
"learning_rate": 2.629590062574221e-06,
"loss": 0.5188,
"step": 1052
},
{
"epoch": 6.756410256410256,
"grad_norm": 0.10650485718812779,
"learning_rate": 2.610946700856885e-06,
"loss": 0.5289,
"step": 1054
},
{
"epoch": 6.769230769230769,
"grad_norm": 0.11042204753698427,
"learning_rate": 2.592346281757552e-06,
"loss": 0.5256,
"step": 1056
},
{
"epoch": 6.782051282051282,
"grad_norm": 0.1109413966927775,
"learning_rate": 2.5737891396139713e-06,
"loss": 0.5113,
"step": 1058
},
{
"epoch": 6.794871794871795,
"grad_norm": 0.0995906902958352,
"learning_rate": 2.5552756079859904e-06,
"loss": 0.5286,
"step": 1060
},
{
"epoch": 6.8076923076923075,
"grad_norm": 0.10719822108936067,
"learning_rate": 2.5368060196495785e-06,
"loss": 0.5346,
"step": 1062
},
{
"epoch": 6.82051282051282,
"grad_norm": 0.10359465789573513,
"learning_rate": 2.5183807065908296e-06,
"loss": 0.5345,
"step": 1064
},
{
"epoch": 6.833333333333333,
"grad_norm": 0.10868896685434068,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.5374,
"step": 1066
},
{
"epoch": 6.846153846153846,
"grad_norm": 0.09898844010754565,
"learning_rate": 2.4816642302655634e-06,
"loss": 0.525,
"step": 1068
},
{
"epoch": 6.858974358974359,
"grad_norm": 0.10379473426548341,
"learning_rate": 2.4633737269682546e-06,
"loss": 0.5016,
"step": 1070
},
{
"epoch": 6.871794871794872,
"grad_norm": 0.10543135697876233,
"learning_rate": 2.445128818875166e-06,
"loss": 0.5113,
"step": 1072
},
{
"epoch": 6.884615384615385,
"grad_norm": 0.7212115719506329,
"learning_rate": 2.4269298339338205e-06,
"loss": 0.5296,
"step": 1074
},
{
"epoch": 6.897435897435898,
"grad_norm": 0.10405167616760767,
"learning_rate": 2.408777099266291e-06,
"loss": 0.5305,
"step": 1076
},
{
"epoch": 6.910256410256411,
"grad_norm": 0.1105828209294347,
"learning_rate": 2.3906709411633073e-06,
"loss": 0.5249,
"step": 1078
},
{
"epoch": 6.923076923076923,
"grad_norm": 0.09948650821267507,
"learning_rate": 2.3726116850783987e-06,
"loss": 0.5053,
"step": 1080
},
{
"epoch": 6.935897435897436,
"grad_norm": 0.10462584074627808,
"learning_rate": 2.354599655622049e-06,
"loss": 0.5355,
"step": 1082
},
{
"epoch": 6.948717948717949,
"grad_norm": 0.10815781379459323,
"learning_rate": 2.3366351765558437e-06,
"loss": 0.518,
"step": 1084
},
{
"epoch": 6.961538461538462,
"grad_norm": 0.10164985194268733,
"learning_rate": 2.318718570786675e-06,
"loss": 0.5164,
"step": 1086
},
{
"epoch": 6.9743589743589745,
"grad_norm": 0.10415162261749741,
"learning_rate": 2.3008501603609147e-06,
"loss": 0.5027,
"step": 1088
},
{
"epoch": 6.987179487179487,
"grad_norm": 0.10190530521809464,
"learning_rate": 2.283030266458644e-06,
"loss": 0.5015,
"step": 1090
},
{
"epoch": 7.0,
"grad_norm": 0.10253347317045373,
"learning_rate": 2.265259209387867e-06,
"loss": 0.5162,
"step": 1092
}
],
"logging_steps": 2,
"max_steps": 1560,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.4209696878656225e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}