{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.999005469915465,
"eval_steps": 500,
"global_step": 2010,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000994530084535057,
"grad_norm": 23.116023523946204,
"learning_rate": 9.950248756218906e-08,
"loss": 1.4467,
"step": 1
},
{
"epoch": 0.004972650422675286,
"grad_norm": 22.614904504884613,
"learning_rate": 4.975124378109453e-07,
"loss": 1.4087,
"step": 5
},
{
"epoch": 0.009945300845350571,
"grad_norm": 16.569672699698376,
"learning_rate": 9.950248756218907e-07,
"loss": 1.3944,
"step": 10
},
{
"epoch": 0.014917951268025857,
"grad_norm": 3.581639215568655,
"learning_rate": 1.4925373134328358e-06,
"loss": 1.3012,
"step": 15
},
{
"epoch": 0.019890601690701143,
"grad_norm": 1.9954309309104792,
"learning_rate": 1.9900497512437813e-06,
"loss": 1.2372,
"step": 20
},
{
"epoch": 0.02486325211337643,
"grad_norm": 1.363643820597461,
"learning_rate": 2.4875621890547264e-06,
"loss": 1.1994,
"step": 25
},
{
"epoch": 0.029835902536051714,
"grad_norm": 0.9605594350100669,
"learning_rate": 2.9850746268656716e-06,
"loss": 1.1902,
"step": 30
},
{
"epoch": 0.034808552958727,
"grad_norm": 0.9786652533260872,
"learning_rate": 3.4825870646766175e-06,
"loss": 1.1542,
"step": 35
},
{
"epoch": 0.039781203381402286,
"grad_norm": 0.9984572750782378,
"learning_rate": 3.980099502487563e-06,
"loss": 1.1175,
"step": 40
},
{
"epoch": 0.04475385380407757,
"grad_norm": 0.7596880194893064,
"learning_rate": 4.477611940298508e-06,
"loss": 1.1268,
"step": 45
},
{
"epoch": 0.04972650422675286,
"grad_norm": 0.8716479829689806,
"learning_rate": 4.975124378109453e-06,
"loss": 1.1332,
"step": 50
},
{
"epoch": 0.05469915464942814,
"grad_norm": 0.791852622062893,
"learning_rate": 5.472636815920398e-06,
"loss": 1.132,
"step": 55
},
{
"epoch": 0.05967180507210343,
"grad_norm": 0.733246468747676,
"learning_rate": 5.970149253731343e-06,
"loss": 1.0988,
"step": 60
},
{
"epoch": 0.06464445549477872,
"grad_norm": 0.7512018991531512,
"learning_rate": 6.46766169154229e-06,
"loss": 1.1027,
"step": 65
},
{
"epoch": 0.069617105917454,
"grad_norm": 1.0504364963362558,
"learning_rate": 6.965174129353235e-06,
"loss": 1.1057,
"step": 70
},
{
"epoch": 0.07458975634012929,
"grad_norm": 0.8367942560809148,
"learning_rate": 7.46268656716418e-06,
"loss": 1.1211,
"step": 75
},
{
"epoch": 0.07956240676280457,
"grad_norm": 0.7659112396017037,
"learning_rate": 7.960199004975125e-06,
"loss": 1.1034,
"step": 80
},
{
"epoch": 0.08453505718547986,
"grad_norm": 0.7564136569017796,
"learning_rate": 8.45771144278607e-06,
"loss": 1.1095,
"step": 85
},
{
"epoch": 0.08950770760815514,
"grad_norm": 0.680667197827103,
"learning_rate": 8.955223880597016e-06,
"loss": 1.1228,
"step": 90
},
{
"epoch": 0.09448035803083044,
"grad_norm": 0.7778561000416293,
"learning_rate": 9.45273631840796e-06,
"loss": 1.1305,
"step": 95
},
{
"epoch": 0.09945300845350571,
"grad_norm": 0.7582042725036765,
"learning_rate": 9.950248756218906e-06,
"loss": 1.1032,
"step": 100
},
{
"epoch": 0.10442565887618101,
"grad_norm": 0.6743713824965428,
"learning_rate": 1.0447761194029851e-05,
"loss": 1.094,
"step": 105
},
{
"epoch": 0.10939830929885629,
"grad_norm": 0.8423183568175382,
"learning_rate": 1.0945273631840796e-05,
"loss": 1.1025,
"step": 110
},
{
"epoch": 0.11437095972153158,
"grad_norm": 0.7394348567397752,
"learning_rate": 1.1442786069651741e-05,
"loss": 1.105,
"step": 115
},
{
"epoch": 0.11934361014420686,
"grad_norm": 1.039483051036899,
"learning_rate": 1.1940298507462686e-05,
"loss": 1.1086,
"step": 120
},
{
"epoch": 0.12431626056688215,
"grad_norm": 0.8818107439680185,
"learning_rate": 1.2437810945273631e-05,
"loss": 1.1218,
"step": 125
},
{
"epoch": 0.12928891098955744,
"grad_norm": 0.7903765993701828,
"learning_rate": 1.293532338308458e-05,
"loss": 1.0911,
"step": 130
},
{
"epoch": 0.13426156141223272,
"grad_norm": 0.8928561309689899,
"learning_rate": 1.3432835820895525e-05,
"loss": 1.1035,
"step": 135
},
{
"epoch": 0.139234211834908,
"grad_norm": 0.8136039158436408,
"learning_rate": 1.393034825870647e-05,
"loss": 1.1187,
"step": 140
},
{
"epoch": 0.14420686225758328,
"grad_norm": 0.7064949127673769,
"learning_rate": 1.4427860696517415e-05,
"loss": 1.1357,
"step": 145
},
{
"epoch": 0.14917951268025859,
"grad_norm": 0.920457572603285,
"learning_rate": 1.492537313432836e-05,
"loss": 1.1218,
"step": 150
},
{
"epoch": 0.15415216310293386,
"grad_norm": 0.7265048207847127,
"learning_rate": 1.5422885572139307e-05,
"loss": 1.0863,
"step": 155
},
{
"epoch": 0.15912481352560914,
"grad_norm": 0.9780404914044643,
"learning_rate": 1.592039800995025e-05,
"loss": 1.1203,
"step": 160
},
{
"epoch": 0.16409746394828442,
"grad_norm": 1.0254663723419732,
"learning_rate": 1.6417910447761197e-05,
"loss": 1.1079,
"step": 165
},
{
"epoch": 0.16907011437095973,
"grad_norm": 0.8097421068534958,
"learning_rate": 1.691542288557214e-05,
"loss": 1.1086,
"step": 170
},
{
"epoch": 0.174042764793635,
"grad_norm": 0.7012708079715231,
"learning_rate": 1.7412935323383088e-05,
"loss": 1.0985,
"step": 175
},
{
"epoch": 0.1790154152163103,
"grad_norm": 0.8920950219504408,
"learning_rate": 1.791044776119403e-05,
"loss": 1.106,
"step": 180
},
{
"epoch": 0.1839880656389856,
"grad_norm": 0.7178834727892439,
"learning_rate": 1.8407960199004978e-05,
"loss": 1.1089,
"step": 185
},
{
"epoch": 0.18896071606166087,
"grad_norm": 0.8106510565314389,
"learning_rate": 1.890547263681592e-05,
"loss": 1.1303,
"step": 190
},
{
"epoch": 0.19393336648433615,
"grad_norm": 0.7760161666634037,
"learning_rate": 1.9402985074626868e-05,
"loss": 1.1,
"step": 195
},
{
"epoch": 0.19890601690701143,
"grad_norm": 0.9543725947472645,
"learning_rate": 1.990049751243781e-05,
"loss": 1.1113,
"step": 200
},
{
"epoch": 0.20387866732968674,
"grad_norm": 0.7825954675614248,
"learning_rate": 1.9999758725817802e-05,
"loss": 1.1142,
"step": 205
},
{
"epoch": 0.20885131775236201,
"grad_norm": 0.9041843566452702,
"learning_rate": 1.999877856940653e-05,
"loss": 1.1107,
"step": 210
},
{
"epoch": 0.2138239681750373,
"grad_norm": 0.8759171946616057,
"learning_rate": 1.9997044524974797e-05,
"loss": 1.1075,
"step": 215
},
{
"epoch": 0.21879661859771257,
"grad_norm": 0.6883474381832496,
"learning_rate": 1.9994556723266102e-05,
"loss": 1.0916,
"step": 220
},
{
"epoch": 0.22376926902038788,
"grad_norm": 0.935590319528177,
"learning_rate": 1.999131535185575e-05,
"loss": 1.1354,
"step": 225
},
{
"epoch": 0.22874191944306316,
"grad_norm": 0.7241812404841466,
"learning_rate": 1.9987320655136693e-05,
"loss": 1.091,
"step": 230
},
{
"epoch": 0.23371456986573844,
"grad_norm": 0.7126868228357645,
"learning_rate": 1.998257293430112e-05,
"loss": 1.1183,
"step": 235
},
{
"epoch": 0.23868722028841372,
"grad_norm": 0.7747343156203321,
"learning_rate": 1.997707254731775e-05,
"loss": 1.1183,
"step": 240
},
{
"epoch": 0.24365987071108902,
"grad_norm": 0.7040038125289689,
"learning_rate": 1.9970819908904815e-05,
"loss": 1.1129,
"step": 245
},
{
"epoch": 0.2486325211337643,
"grad_norm": 0.7279011261296227,
"learning_rate": 1.996381549049882e-05,
"loss": 1.1198,
"step": 250
},
{
"epoch": 0.2536051715564396,
"grad_norm": 0.7449816844457622,
"learning_rate": 1.9956059820218982e-05,
"loss": 1.112,
"step": 255
},
{
"epoch": 0.2585778219791149,
"grad_norm": 0.8826758475728962,
"learning_rate": 1.994755348282742e-05,
"loss": 1.1176,
"step": 260
},
{
"epoch": 0.26355047240179014,
"grad_norm": 0.7356262066526851,
"learning_rate": 1.9938297119685054e-05,
"loss": 1.0975,
"step": 265
},
{
"epoch": 0.26852312282446544,
"grad_norm": 0.7345831845947749,
"learning_rate": 1.9928291428703265e-05,
"loss": 1.1054,
"step": 270
},
{
"epoch": 0.27349577324714075,
"grad_norm": 0.6792936934909664,
"learning_rate": 1.9917537164291244e-05,
"loss": 1.0971,
"step": 275
},
{
"epoch": 0.278468423669816,
"grad_norm": 0.7211549142360152,
"learning_rate": 1.990603513729915e-05,
"loss": 1.1137,
"step": 280
},
{
"epoch": 0.2834410740924913,
"grad_norm": 0.7338829367672012,
"learning_rate": 1.9893786214956946e-05,
"loss": 1.1031,
"step": 285
},
{
"epoch": 0.28841372451516656,
"grad_norm": 0.8494824202658868,
"learning_rate": 1.9880791320809012e-05,
"loss": 1.0962,
"step": 290
},
{
"epoch": 0.29338637493784187,
"grad_norm": 0.7216958796251164,
"learning_rate": 1.9867051434644532e-05,
"loss": 1.1262,
"step": 295
},
{
"epoch": 0.29835902536051717,
"grad_norm": 0.7349825775644381,
"learning_rate": 1.985256759242359e-05,
"loss": 1.0938,
"step": 300
},
{
"epoch": 0.3033316757831924,
"grad_norm": 0.6477209491442245,
"learning_rate": 1.9837340886199097e-05,
"loss": 1.0925,
"step": 305
},
{
"epoch": 0.30830432620586773,
"grad_norm": 0.643732891793806,
"learning_rate": 1.9821372464034416e-05,
"loss": 1.116,
"step": 310
},
{
"epoch": 0.31327697662854304,
"grad_norm": 0.7127469375661294,
"learning_rate": 1.9804663529916825e-05,
"loss": 1.118,
"step": 315
},
{
"epoch": 0.3182496270512183,
"grad_norm": 0.6302238233373078,
"learning_rate": 1.9787215343666732e-05,
"loss": 1.0933,
"step": 320
},
{
"epoch": 0.3232222774738936,
"grad_norm": 0.6512766142325113,
"learning_rate": 1.9769029220842678e-05,
"loss": 1.1022,
"step": 325
},
{
"epoch": 0.32819492789656884,
"grad_norm": 0.6928608412763418,
"learning_rate": 1.975010653264216e-05,
"loss": 1.1057,
"step": 330
},
{
"epoch": 0.33316757831924415,
"grad_norm": 0.6801795596819155,
"learning_rate": 1.973044870579824e-05,
"loss": 1.099,
"step": 335
},
{
"epoch": 0.33814022874191946,
"grad_norm": 0.7155915889134151,
"learning_rate": 1.971005722247197e-05,
"loss": 1.1112,
"step": 340
},
{
"epoch": 0.3431128791645947,
"grad_norm": 0.7270391229967981,
"learning_rate": 1.9688933620140638e-05,
"loss": 1.0994,
"step": 345
},
{
"epoch": 0.34808552958727,
"grad_norm": 0.6781191980419545,
"learning_rate": 1.966707949148186e-05,
"loss": 1.0933,
"step": 350
},
{
"epoch": 0.3530581800099453,
"grad_norm": 0.8199363645347214,
"learning_rate": 1.9644496484253473e-05,
"loss": 1.0993,
"step": 355
},
{
"epoch": 0.3580308304326206,
"grad_norm": 0.7553462054946344,
"learning_rate": 1.9621186301169316e-05,
"loss": 1.1111,
"step": 360
},
{
"epoch": 0.3630034808552959,
"grad_norm": 0.6733526636292482,
"learning_rate": 1.9597150699770834e-05,
"loss": 1.1038,
"step": 365
},
{
"epoch": 0.3679761312779712,
"grad_norm": 0.63611657294842,
"learning_rate": 1.957239149229458e-05,
"loss": 1.0894,
"step": 370
},
{
"epoch": 0.37294878170064644,
"grad_norm": 0.6705699643223696,
"learning_rate": 1.954691054553556e-05,
"loss": 1.0908,
"step": 375
},
{
"epoch": 0.37792143212332174,
"grad_norm": 0.6042254858161937,
"learning_rate": 1.9520709780706485e-05,
"loss": 1.0968,
"step": 380
},
{
"epoch": 0.382894082545997,
"grad_norm": 0.7276058593761272,
"learning_rate": 1.9493791173292924e-05,
"loss": 1.0863,
"step": 385
},
{
"epoch": 0.3878667329686723,
"grad_norm": 0.7335666141482664,
"learning_rate": 1.9466156752904344e-05,
"loss": 1.0968,
"step": 390
},
{
"epoch": 0.3928393833913476,
"grad_norm": 0.6999656279264759,
"learning_rate": 1.9437808603121086e-05,
"loss": 1.1077,
"step": 395
},
{
"epoch": 0.39781203381402286,
"grad_norm": 0.7171558885154616,
"learning_rate": 1.9408748861337274e-05,
"loss": 1.0929,
"step": 400
},
{
"epoch": 0.40278468423669817,
"grad_norm": 0.7881416088231044,
"learning_rate": 1.9378979718599647e-05,
"loss": 1.1068,
"step": 405
},
{
"epoch": 0.40775733465937347,
"grad_norm": 0.7628447209491808,
"learning_rate": 1.934850341944237e-05,
"loss": 1.0979,
"step": 410
},
{
"epoch": 0.4127299850820487,
"grad_norm": 0.6407210873259641,
"learning_rate": 1.9317322261717794e-05,
"loss": 1.1028,
"step": 415
},
{
"epoch": 0.41770263550472403,
"grad_norm": 0.6676034803018082,
"learning_rate": 1.9285438596423204e-05,
"loss": 1.0943,
"step": 420
},
{
"epoch": 0.4226752859273993,
"grad_norm": 0.7550987050238746,
"learning_rate": 1.9252854827523557e-05,
"loss": 1.1066,
"step": 425
},
{
"epoch": 0.4276479363500746,
"grad_norm": 0.6729112307385414,
"learning_rate": 1.9219573411770235e-05,
"loss": 1.1008,
"step": 430
},
{
"epoch": 0.4326205867727499,
"grad_norm": 0.6703726778340369,
"learning_rate": 1.9185596858515797e-05,
"loss": 1.107,
"step": 435
},
{
"epoch": 0.43759323719542514,
"grad_norm": 0.6957611244541537,
"learning_rate": 1.91509277295248e-05,
"loss": 1.0803,
"step": 440
},
{
"epoch": 0.44256588761810045,
"grad_norm": 0.6302665113292966,
"learning_rate": 1.911556863878062e-05,
"loss": 1.089,
"step": 445
},
{
"epoch": 0.44753853804077576,
"grad_norm": 0.6619346129189808,
"learning_rate": 1.9079522252288387e-05,
"loss": 1.0998,
"step": 450
},
{
"epoch": 0.452511188463451,
"grad_norm": 0.6303067629333522,
"learning_rate": 1.9042791287873958e-05,
"loss": 1.0982,
"step": 455
},
{
"epoch": 0.4574838388861263,
"grad_norm": 0.6817551059292462,
"learning_rate": 1.900537851497901e-05,
"loss": 1.1123,
"step": 460
},
{
"epoch": 0.46245648930880157,
"grad_norm": 0.6883985523342773,
"learning_rate": 1.8967286754452214e-05,
"loss": 1.0994,
"step": 465
},
{
"epoch": 0.4674291397314769,
"grad_norm": 0.6303976451992164,
"learning_rate": 1.892851887833657e-05,
"loss": 1.0915,
"step": 470
},
{
"epoch": 0.4724017901541522,
"grad_norm": 0.6822781316090195,
"learning_rate": 1.8889077809652837e-05,
"loss": 1.0798,
"step": 475
},
{
"epoch": 0.47737444057682743,
"grad_norm": 0.6360054418246037,
"learning_rate": 1.884896652217917e-05,
"loss": 1.0939,
"step": 480
},
{
"epoch": 0.48234709099950274,
"grad_norm": 0.6992087315432453,
"learning_rate": 1.880818804022687e-05,
"loss": 1.0987,
"step": 485
},
{
"epoch": 0.48731974142217804,
"grad_norm": 0.6768633602943569,
"learning_rate": 1.8766745438412382e-05,
"loss": 1.1199,
"step": 490
},
{
"epoch": 0.4922923918448533,
"grad_norm": 0.6243661194702032,
"learning_rate": 1.872464184142548e-05,
"loss": 1.0883,
"step": 495
},
{
"epoch": 0.4972650422675286,
"grad_norm": 0.6589230792154801,
"learning_rate": 1.868188042379364e-05,
"loss": 1.1163,
"step": 500
},
{
"epoch": 0.5022376926902039,
"grad_norm": 0.6642790387561549,
"learning_rate": 1.8638464409642724e-05,
"loss": 1.0954,
"step": 505
},
{
"epoch": 0.5072103431128792,
"grad_norm": 0.6423332854509105,
"learning_rate": 1.8594397072453854e-05,
"loss": 1.076,
"step": 510
},
{
"epoch": 0.5121829935355544,
"grad_norm": 0.7506393578542332,
"learning_rate": 1.8549681734816624e-05,
"loss": 1.0985,
"step": 515
},
{
"epoch": 0.5171556439582298,
"grad_norm": 0.7180721263869316,
"learning_rate": 1.850432176817857e-05,
"loss": 1.098,
"step": 520
},
{
"epoch": 0.522128294380905,
"grad_norm": 0.6300038431215529,
"learning_rate": 1.8458320592590976e-05,
"loss": 1.083,
"step": 525
},
{
"epoch": 0.5271009448035803,
"grad_norm": 0.8081330388900698,
"learning_rate": 1.8411681676450998e-05,
"loss": 1.0852,
"step": 530
},
{
"epoch": 0.5320735952262556,
"grad_norm": 0.8167362817265476,
"learning_rate": 1.836440853624017e-05,
"loss": 1.1036,
"step": 535
},
{
"epoch": 0.5370462456489309,
"grad_norm": 0.6121720313365099,
"learning_rate": 1.8316504736259257e-05,
"loss": 1.0891,
"step": 540
},
{
"epoch": 0.5420188960716061,
"grad_norm": 0.6187125374867918,
"learning_rate": 1.826797388835951e-05,
"loss": 1.1023,
"step": 545
},
{
"epoch": 0.5469915464942815,
"grad_norm": 0.5866212425394042,
"learning_rate": 1.8218819651670356e-05,
"loss": 1.1075,
"step": 550
},
{
"epoch": 0.5519641969169568,
"grad_norm": 0.600120796062157,
"learning_rate": 1.8169045732323495e-05,
"loss": 1.0763,
"step": 555
},
{
"epoch": 0.556936847339632,
"grad_norm": 0.634846796553325,
"learning_rate": 1.8118655883173458e-05,
"loss": 1.0827,
"step": 560
},
{
"epoch": 0.5619094977623074,
"grad_norm": 0.6398611916918977,
"learning_rate": 1.8067653903514674e-05,
"loss": 1.0787,
"step": 565
},
{
"epoch": 0.5668821481849826,
"grad_norm": 0.6406203607058498,
"learning_rate": 1.8016043638794975e-05,
"loss": 1.0738,
"step": 570
},
{
"epoch": 0.5718547986076579,
"grad_norm": 0.6400875643577613,
"learning_rate": 1.7963828980325696e-05,
"loss": 1.0818,
"step": 575
},
{
"epoch": 0.5768274490303331,
"grad_norm": 0.649631239193607,
"learning_rate": 1.7911013864988254e-05,
"loss": 1.0801,
"step": 580
},
{
"epoch": 0.5818000994530085,
"grad_norm": 0.6384185175759214,
"learning_rate": 1.785760227493731e-05,
"loss": 1.0962,
"step": 585
},
{
"epoch": 0.5867727498756837,
"grad_norm": 0.6637419994479018,
"learning_rate": 1.780359823730054e-05,
"loss": 1.0986,
"step": 590
},
{
"epoch": 0.591745400298359,
"grad_norm": 0.6940445103521063,
"learning_rate": 1.774900582387499e-05,
"loss": 1.0849,
"step": 595
},
{
"epoch": 0.5967180507210343,
"grad_norm": 0.6352562893683872,
"learning_rate": 1.769382915082007e-05,
"loss": 1.0827,
"step": 600
},
{
"epoch": 0.6016907011437096,
"grad_norm": 0.644197961344711,
"learning_rate": 1.7638072378347205e-05,
"loss": 1.0782,
"step": 605
},
{
"epoch": 0.6066633515663848,
"grad_norm": 0.6217957456701444,
"learning_rate": 1.7581739710406158e-05,
"loss": 1.0979,
"step": 610
},
{
"epoch": 0.6116360019890602,
"grad_norm": 0.6697037211804547,
"learning_rate": 1.752483539436807e-05,
"loss": 1.0902,
"step": 615
},
{
"epoch": 0.6166086524117355,
"grad_norm": 0.584860135164739,
"learning_rate": 1.7467363720705204e-05,
"loss": 1.0779,
"step": 620
},
{
"epoch": 0.6215813028344107,
"grad_norm": 0.6961081581048412,
"learning_rate": 1.740932902266747e-05,
"loss": 1.072,
"step": 625
},
{
"epoch": 0.6265539532570861,
"grad_norm": 0.660941824477212,
"learning_rate": 1.7350735675955696e-05,
"loss": 1.0857,
"step": 630
},
{
"epoch": 0.6315266036797613,
"grad_norm": 0.6390929736380937,
"learning_rate": 1.72915880983917e-05,
"loss": 1.0748,
"step": 635
},
{
"epoch": 0.6364992541024366,
"grad_norm": 0.6136644106406868,
"learning_rate": 1.7231890749585208e-05,
"loss": 1.0704,
"step": 640
},
{
"epoch": 0.6414719045251119,
"grad_norm": 0.6500326781000102,
"learning_rate": 1.717164813059761e-05,
"loss": 1.0621,
"step": 645
},
{
"epoch": 0.6464445549477872,
"grad_norm": 0.6274957853270717,
"learning_rate": 1.711086478360257e-05,
"loss": 1.0882,
"step": 650
},
{
"epoch": 0.6514172053704624,
"grad_norm": 0.6776791597333586,
"learning_rate": 1.704954529154359e-05,
"loss": 1.069,
"step": 655
},
{
"epoch": 0.6563898557931377,
"grad_norm": 0.6307039768815268,
"learning_rate": 1.698769427778842e-05,
"loss": 1.0845,
"step": 660
},
{
"epoch": 0.661362506215813,
"grad_norm": 0.6017095125274678,
"learning_rate": 1.69253164057805e-05,
"loss": 1.0804,
"step": 665
},
{
"epoch": 0.6663351566384883,
"grad_norm": 0.6094011002277849,
"learning_rate": 1.686241637868734e-05,
"loss": 1.0681,
"step": 670
},
{
"epoch": 0.6713078070611636,
"grad_norm": 0.6098389254101954,
"learning_rate": 1.6798998939045893e-05,
"loss": 1.078,
"step": 675
},
{
"epoch": 0.6762804574838389,
"grad_norm": 0.6371210646880789,
"learning_rate": 1.6735068868405e-05,
"loss": 1.0776,
"step": 680
},
{
"epoch": 0.6812531079065142,
"grad_norm": 0.6519412959002887,
"learning_rate": 1.667063098696485e-05,
"loss": 1.093,
"step": 685
},
{
"epoch": 0.6862257583291894,
"grad_norm": 0.6296676825303947,
"learning_rate": 1.660569015321357e-05,
"loss": 1.079,
"step": 690
},
{
"epoch": 0.6911984087518648,
"grad_norm": 0.6515938103088492,
"learning_rate": 1.654025126356088e-05,
"loss": 1.0763,
"step": 695
},
{
"epoch": 0.69617105917454,
"grad_norm": 0.63614383650923,
"learning_rate": 1.647431925196892e-05,
"loss": 1.0726,
"step": 700
},
{
"epoch": 0.7011437095972153,
"grad_norm": 0.6733392483456604,
"learning_rate": 1.6407899089580263e-05,
"loss": 1.0808,
"step": 705
},
{
"epoch": 0.7061163600198906,
"grad_norm": 0.6177292064792552,
"learning_rate": 1.6340995784343058e-05,
"loss": 1.0662,
"step": 710
},
{
"epoch": 0.7110890104425659,
"grad_norm": 0.6282011292024681,
"learning_rate": 1.6273614380633484e-05,
"loss": 1.0756,
"step": 715
},
{
"epoch": 0.7160616608652411,
"grad_norm": 0.6382859967632786,
"learning_rate": 1.620575995887538e-05,
"loss": 1.0784,
"step": 720
},
{
"epoch": 0.7210343112879165,
"grad_norm": 0.5746413645512066,
"learning_rate": 1.6137437635157214e-05,
"loss": 1.0812,
"step": 725
},
{
"epoch": 0.7260069617105918,
"grad_norm": 0.5979846067324073,
"learning_rate": 1.6068652560846328e-05,
"loss": 1.0731,
"step": 730
},
{
"epoch": 0.730979612133267,
"grad_norm": 0.6150123084073673,
"learning_rate": 1.5999409922200534e-05,
"loss": 1.0836,
"step": 735
},
{
"epoch": 0.7359522625559424,
"grad_norm": 0.6300145625125909,
"learning_rate": 1.592971493997709e-05,
"loss": 1.0635,
"step": 740
},
{
"epoch": 0.7409249129786176,
"grad_norm": 0.6454552072886953,
"learning_rate": 1.5859572869039063e-05,
"loss": 1.0713,
"step": 745
},
{
"epoch": 0.7458975634012929,
"grad_norm": 0.5969281554213641,
"learning_rate": 1.5788988997959115e-05,
"loss": 1.0692,
"step": 750
},
{
"epoch": 0.7508702138239681,
"grad_norm": 0.6207791802320729,
"learning_rate": 1.571796864862076e-05,
"loss": 1.0789,
"step": 755
},
{
"epoch": 0.7558428642466435,
"grad_norm": 0.6191886764048089,
"learning_rate": 1.5646517175817114e-05,
"loss": 1.0714,
"step": 760
},
{
"epoch": 0.7608155146693187,
"grad_norm": 0.6281154744718429,
"learning_rate": 1.5574639966847128e-05,
"loss": 1.0661,
"step": 765
},
{
"epoch": 0.765788165091994,
"grad_norm": 0.6096651334127257,
"learning_rate": 1.5502342441109423e-05,
"loss": 1.0814,
"step": 770
},
{
"epoch": 0.7707608155146694,
"grad_norm": 0.6122390840080459,
"learning_rate": 1.5429630049693676e-05,
"loss": 1.0769,
"step": 775
},
{
"epoch": 0.7757334659373446,
"grad_norm": 0.6362786028351584,
"learning_rate": 1.5356508274969595e-05,
"loss": 1.0689,
"step": 780
},
{
"epoch": 0.7807061163600199,
"grad_norm": 0.5843854420963039,
"learning_rate": 1.5282982630173587e-05,
"loss": 1.0755,
"step": 785
},
{
"epoch": 0.7856787667826952,
"grad_norm": 0.6093020908006089,
"learning_rate": 1.5209058658993056e-05,
"loss": 1.0704,
"step": 790
},
{
"epoch": 0.7906514172053705,
"grad_norm": 0.5732577171485725,
"learning_rate": 1.513474193514842e-05,
"loss": 1.0824,
"step": 795
},
{
"epoch": 0.7956240676280457,
"grad_norm": 0.604193134815774,
"learning_rate": 1.5060038061972875e-05,
"loss": 1.0825,
"step": 800
},
{
"epoch": 0.8005967180507211,
"grad_norm": 0.5972633379846944,
"learning_rate": 1.49849526719899e-05,
"loss": 1.0786,
"step": 805
},
{
"epoch": 0.8055693684733963,
"grad_norm": 0.648881707010711,
"learning_rate": 1.4909491426488579e-05,
"loss": 1.071,
"step": 810
},
{
"epoch": 0.8105420188960716,
"grad_norm": 0.6747876304157826,
"learning_rate": 1.4833660015096767e-05,
"loss": 1.0881,
"step": 815
},
{
"epoch": 0.8155146693187469,
"grad_norm": 0.6061995987355869,
"learning_rate": 1.4757464155352082e-05,
"loss": 1.0836,
"step": 820
},
{
"epoch": 0.8204873197414222,
"grad_norm": 0.5651261625287576,
"learning_rate": 1.468090959227082e-05,
"loss": 1.0578,
"step": 825
},
{
"epoch": 0.8254599701640974,
"grad_norm": 0.6708302058482162,
"learning_rate": 1.4604002097914806e-05,
"loss": 1.0874,
"step": 830
},
{
"epoch": 0.8304326205867727,
"grad_norm": 0.5998607882192355,
"learning_rate": 1.4526747470956175e-05,
"loss": 1.078,
"step": 835
},
{
"epoch": 0.8354052710094481,
"grad_norm": 0.6239914515973155,
"learning_rate": 1.4449151536240167e-05,
"loss": 1.0691,
"step": 840
},
{
"epoch": 0.8403779214321233,
"grad_norm": 0.5757669163668722,
"learning_rate": 1.4371220144345954e-05,
"loss": 1.0644,
"step": 845
},
{
"epoch": 0.8453505718547986,
"grad_norm": 0.6156324529775569,
"learning_rate": 1.4292959171145509e-05,
"loss": 1.0918,
"step": 850
},
{
"epoch": 0.8503232222774739,
"grad_norm": 0.589513385399977,
"learning_rate": 1.4214374517360576e-05,
"loss": 1.0768,
"step": 855
},
{
"epoch": 0.8552958727001492,
"grad_norm": 0.5729612698754991,
"learning_rate": 1.4135472108117786e-05,
"loss": 1.0555,
"step": 860
},
{
"epoch": 0.8602685231228244,
"grad_norm": 0.5712525334541005,
"learning_rate": 1.4056257892501886e-05,
"loss": 1.0679,
"step": 865
},
{
"epoch": 0.8652411735454998,
"grad_norm": 0.6414106262572151,
"learning_rate": 1.3976737843107203e-05,
"loss": 1.0725,
"step": 870
},
{
"epoch": 0.870213823968175,
"grad_norm": 0.6169285531207347,
"learning_rate": 1.3896917955587328e-05,
"loss": 1.0695,
"step": 875
},
{
"epoch": 0.8751864743908503,
"grad_norm": 0.581169031342323,
"learning_rate": 1.3816804248203053e-05,
"loss": 1.0732,
"step": 880
},
{
"epoch": 0.8801591248135257,
"grad_norm": 0.5976235787701776,
"learning_rate": 1.3736402761368597e-05,
"loss": 1.057,
"step": 885
},
{
"epoch": 0.8851317752362009,
"grad_norm": 0.6033341957672993,
"learning_rate": 1.3655719557196185e-05,
"loss": 1.0778,
"step": 890
},
{
"epoch": 0.8901044256588762,
"grad_norm": 0.6026576916975591,
"learning_rate": 1.3574760719038959e-05,
"loss": 1.0659,
"step": 895
},
{
"epoch": 0.8950770760815515,
"grad_norm": 0.5976061942900717,
"learning_rate": 1.3493532351032318e-05,
"loss": 1.0444,
"step": 900
},
{
"epoch": 0.9000497265042268,
"grad_norm": 0.5679367137196055,
"learning_rate": 1.3412040577633687e-05,
"loss": 1.0505,
"step": 905
},
{
"epoch": 0.905022376926902,
"grad_norm": 0.5851846275345329,
"learning_rate": 1.333029154316072e-05,
"loss": 1.0561,
"step": 910
},
{
"epoch": 0.9099950273495774,
"grad_norm": 0.6070199211173483,
"learning_rate": 1.3248291411328048e-05,
"loss": 1.0718,
"step": 915
},
{
"epoch": 0.9149676777722526,
"grad_norm": 0.6061266848548619,
"learning_rate": 1.3166046364782545e-05,
"loss": 1.0608,
"step": 920
},
{
"epoch": 0.9199403281949279,
"grad_norm": 0.6073941055156064,
"learning_rate": 1.308356260463717e-05,
"loss": 1.0776,
"step": 925
},
{
"epoch": 0.9249129786176031,
"grad_norm": 0.670942497774792,
"learning_rate": 1.300084635000341e-05,
"loss": 1.0867,
"step": 930
},
{
"epoch": 0.9298856290402785,
"grad_norm": 0.6063652114093658,
"learning_rate": 1.291790383752237e-05,
"loss": 1.0726,
"step": 935
},
{
"epoch": 0.9348582794629537,
"grad_norm": 0.6504237055754567,
"learning_rate": 1.2834741320894554e-05,
"loss": 1.0747,
"step": 940
},
{
"epoch": 0.939830929885629,
"grad_norm": 0.6770849233282473,
"learning_rate": 1.2751365070408335e-05,
"loss": 1.0747,
"step": 945
},
{
"epoch": 0.9448035803083044,
"grad_norm": 0.6051457904824041,
"learning_rate": 1.2667781372467203e-05,
"loss": 1.0618,
"step": 950
},
{
"epoch": 0.9497762307309796,
"grad_norm": 0.6082598939008289,
"learning_rate": 1.2583996529115762e-05,
"loss": 1.0675,
"step": 955
},
{
"epoch": 0.9547488811536549,
"grad_norm": 0.6063786635274505,
"learning_rate": 1.2500016857564585e-05,
"loss": 1.089,
"step": 960
},
{
"epoch": 0.9597215315763302,
"grad_norm": 0.6004777766594889,
"learning_rate": 1.2415848689713904e-05,
"loss": 1.0761,
"step": 965
},
{
"epoch": 0.9646941819990055,
"grad_norm": 0.5838756120365949,
"learning_rate": 1.2331498371676206e-05,
"loss": 1.0641,
"step": 970
},
{
"epoch": 0.9696668324216807,
"grad_norm": 0.5881811224350209,
"learning_rate": 1.2246972263297718e-05,
"loss": 1.0556,
"step": 975
},
{
"epoch": 0.9746394828443561,
"grad_norm": 0.5659402723765035,
"learning_rate": 1.2162276737678934e-05,
"loss": 1.0535,
"step": 980
},
{
"epoch": 0.9796121332670313,
"grad_norm": 0.5667335140444246,
"learning_rate": 1.2077418180694049e-05,
"loss": 1.0575,
"step": 985
},
{
"epoch": 0.9845847836897066,
"grad_norm": 0.5707663035442004,
"learning_rate": 1.1992402990509515e-05,
"loss": 1.0486,
"step": 990
},
{
"epoch": 0.989557434112382,
"grad_norm": 0.5843296109709583,
"learning_rate": 1.1907237577101612e-05,
"loss": 1.0706,
"step": 995
},
{
"epoch": 0.9945300845350572,
"grad_norm": 0.5784451382415723,
"learning_rate": 1.1821928361773148e-05,
"loss": 1.0583,
"step": 1000
},
{
"epoch": 0.9995027349577325,
"grad_norm": 0.5601123703118762,
"learning_rate": 1.1736481776669307e-05,
"loss": 1.0638,
"step": 1005
},
{
"epoch": 0.9995027349577325,
"eval_loss": 1.0704214572906494,
"eval_runtime": 313.2095,
"eval_samples_per_second": 45.455,
"eval_steps_per_second": 0.712,
"step": 1005
},
{
"epoch": 1.0044753853804078,
"grad_norm": 0.7234842589362833,
"learning_rate": 1.1650904264292689e-05,
"loss": 0.9297,
"step": 1010
},
{
"epoch": 1.009448035803083,
"grad_norm": 0.7141499870480417,
"learning_rate": 1.1565202277017551e-05,
"loss": 0.9093,
"step": 1015
},
{
"epoch": 1.0144206862257583,
"grad_norm": 0.7209311229992112,
"learning_rate": 1.14793822766033e-05,
"loss": 0.8998,
"step": 1020
},
{
"epoch": 1.0193933366484336,
"grad_norm": 0.7103101581589907,
"learning_rate": 1.139345073370731e-05,
"loss": 0.9174,
"step": 1025
},
{
"epoch": 1.0243659870711088,
"grad_norm": 0.6866739397113207,
"learning_rate": 1.1307414127397028e-05,
"loss": 0.8991,
"step": 1030
},
{
"epoch": 1.0293386374937843,
"grad_norm": 0.629326336496965,
"learning_rate": 1.1221278944661474e-05,
"loss": 0.9109,
"step": 1035
},
{
"epoch": 1.0343112879164595,
"grad_norm": 0.6926758402006077,
"learning_rate": 1.1135051679922143e-05,
"loss": 0.9111,
"step": 1040
},
{
"epoch": 1.0392839383391348,
"grad_norm": 0.610723675260377,
"learning_rate": 1.104873883454332e-05,
"loss": 0.908,
"step": 1045
},
{
"epoch": 1.04425658876181,
"grad_norm": 0.636677273500329,
"learning_rate": 1.0962346916341904e-05,
"loss": 0.8833,
"step": 1050
},
{
"epoch": 1.0492292391844853,
"grad_norm": 0.6005893454455321,
"learning_rate": 1.087588243909673e-05,
"loss": 0.9091,
"step": 1055
},
{
"epoch": 1.0542018896071605,
"grad_norm": 0.6048311027567271,
"learning_rate": 1.0789351922057437e-05,
"loss": 0.9031,
"step": 1060
},
{
"epoch": 1.0591745400298358,
"grad_norm": 0.6157155501614046,
"learning_rate": 1.070276188945293e-05,
"loss": 0.8928,
"step": 1065
},
{
"epoch": 1.0641471904525113,
"grad_norm": 0.6465505729365048,
"learning_rate": 1.0616118869999484e-05,
"loss": 0.8942,
"step": 1070
},
{
"epoch": 1.0691198408751865,
"grad_norm": 0.6540283250849646,
"learning_rate": 1.0529429396408452e-05,
"loss": 0.9028,
"step": 1075
},
{
"epoch": 1.0740924912978618,
"grad_norm": 0.6319963673992531,
"learning_rate": 1.0442700004893764e-05,
"loss": 0.8908,
"step": 1080
},
{
"epoch": 1.079065141720537,
"grad_norm": 0.6255624407373807,
"learning_rate": 1.0355937234679065e-05,
"loss": 0.9039,
"step": 1085
},
{
"epoch": 1.0840377921432123,
"grad_norm": 0.5872134392284907,
"learning_rate": 1.0269147627504692e-05,
"loss": 0.9176,
"step": 1090
},
{
"epoch": 1.0890104425658875,
"grad_norm": 0.6687386627981542,
"learning_rate": 1.0182337727134431e-05,
"loss": 0.9118,
"step": 1095
},
{
"epoch": 1.093983092988563,
"grad_norm": 0.6382535990052277,
"learning_rate": 1.0095514078862147e-05,
"loss": 0.9082,
"step": 1100
},
{
"epoch": 1.0989557434112383,
"grad_norm": 0.6119367907834425,
"learning_rate": 1.0008683229018257e-05,
"loss": 0.9057,
"step": 1105
},
{
"epoch": 1.1039283938339135,
"grad_norm": 0.6013607999486498,
"learning_rate": 9.92185172447616e-06,
"loss": 0.9247,
"step": 1110
},
{
"epoch": 1.1089010442565888,
"grad_norm": 0.6313265309829375,
"learning_rate": 9.835026112158637e-06,
"loss": 0.9065,
"step": 1115
},
{
"epoch": 1.113873694679264,
"grad_norm": 0.6369037011412366,
"learning_rate": 9.748212938544188e-06,
"loss": 0.9217,
"step": 1120
},
{
"epoch": 1.1188463451019393,
"grad_norm": 0.6131706455581138,
"learning_rate": 9.661418749173467e-06,
"loss": 0.9161,
"step": 1125
},
{
"epoch": 1.1238189955246147,
"grad_norm": 0.6129393934682658,
"learning_rate": 9.574650088155752e-06,
"loss": 0.8958,
"step": 1130
},
{
"epoch": 1.12879164594729,
"grad_norm": 0.6436312212169086,
"learning_rate": 9.487913497675536e-06,
"loss": 0.9052,
"step": 1135
},
{
"epoch": 1.1337642963699652,
"grad_norm": 0.6721603259383877,
"learning_rate": 9.401215517499252e-06,
"loss": 0.9078,
"step": 1140
},
{
"epoch": 1.1387369467926405,
"grad_norm": 0.6531721143996831,
"learning_rate": 9.314562684482202e-06,
"loss": 0.8982,
"step": 1145
},
{
"epoch": 1.1437095972153157,
"grad_norm": 0.624269506566372,
"learning_rate": 9.22796153207567e-06,
"loss": 0.9006,
"step": 1150
},
{
"epoch": 1.148682247637991,
"grad_norm": 0.6430651884495883,
"learning_rate": 9.14141858983434e-06,
"loss": 0.9016,
"step": 1155
},
{
"epoch": 1.1536548980606662,
"grad_norm": 0.6676665306535187,
"learning_rate": 9.054940382923954e-06,
"loss": 0.8893,
"step": 1160
},
{
"epoch": 1.1586275484833417,
"grad_norm": 0.6252335256876467,
"learning_rate": 8.96853343162934e-06,
"loss": 0.8893,
"step": 1165
},
{
"epoch": 1.163600198906017,
"grad_norm": 0.5838746279233594,
"learning_rate": 8.882204250862796e-06,
"loss": 0.8992,
"step": 1170
},
{
"epoch": 1.1685728493286922,
"grad_norm": 0.6369545427202165,
"learning_rate": 8.795959349672878e-06,
"loss": 0.8902,
"step": 1175
},
{
"epoch": 1.1735454997513675,
"grad_norm": 0.6174021193552773,
"learning_rate": 8.709805230753628e-06,
"loss": 0.9053,
"step": 1180
},
{
"epoch": 1.1785181501740427,
"grad_norm": 0.6123198457341488,
"learning_rate": 8.623748389954284e-06,
"loss": 0.903,
"step": 1185
},
{
"epoch": 1.183490800596718,
"grad_norm": 0.5956552825535298,
"learning_rate": 8.53779531578951e-06,
"loss": 0.896,
"step": 1190
},
{
"epoch": 1.1884634510193934,
"grad_norm": 0.6113334565497545,
"learning_rate": 8.451952488950167e-06,
"loss": 0.8966,
"step": 1195
},
{
"epoch": 1.1934361014420687,
"grad_norm": 0.6957806045214961,
"learning_rate": 8.366226381814698e-06,
"loss": 0.9135,
"step": 1200
},
{
"epoch": 1.198408751864744,
"grad_norm": 0.6224006138182644,
"learning_rate": 8.280623457961102e-06,
"loss": 0.9092,
"step": 1205
},
{
"epoch": 1.2033814022874192,
"grad_norm": 0.6154668861281782,
"learning_rate": 8.195150171679608e-06,
"loss": 0.8961,
"step": 1210
},
{
"epoch": 1.2083540527100944,
"grad_norm": 0.601285614291192,
"learning_rate": 8.109812967486024e-06,
"loss": 0.8957,
"step": 1215
},
{
"epoch": 1.2133267031327697,
"grad_norm": 0.6200705833301541,
"learning_rate": 8.02461827963585e-06,
"loss": 0.9007,
"step": 1220
},
{
"epoch": 1.218299353555445,
"grad_norm": 0.621174344679376,
"learning_rate": 7.939572531639128e-06,
"loss": 0.9078,
"step": 1225
},
{
"epoch": 1.2232720039781204,
"grad_norm": 0.6237287223906944,
"learning_rate": 7.85468213577613e-06,
"loss": 0.9085,
"step": 1230
},
{
"epoch": 1.2282446544007957,
"grad_norm": 0.6313023907502137,
"learning_rate": 7.7699534926139e-06,
"loss": 0.9121,
"step": 1235
},
{
"epoch": 1.233217304823471,
"grad_norm": 0.6645038885414285,
"learning_rate": 7.685392990523628e-06,
"loss": 0.895,
"step": 1240
},
{
"epoch": 1.2381899552461462,
"grad_norm": 0.6182163241650108,
"learning_rate": 7.601007005199022e-06,
"loss": 0.8958,
"step": 1245
},
{
"epoch": 1.2431626056688214,
"grad_norm": 0.6152497076528807,
"learning_rate": 7.5168018991755645e-06,
"loss": 0.9123,
"step": 1250
},
{
"epoch": 1.248135256091497,
"grad_norm": 0.5828706125717531,
"learning_rate": 7.432784021350796e-06,
"loss": 0.9116,
"step": 1255
},
{
"epoch": 1.2531079065141721,
"grad_norm": 0.6380432502898638,
"learning_rate": 7.3489597065056274e-06,
"loss": 0.8931,
"step": 1260
},
{
"epoch": 1.2580805569368474,
"grad_norm": 0.6666672232856957,
"learning_rate": 7.265335274826704e-06,
"loss": 0.8985,
"step": 1265
},
{
"epoch": 1.2630532073595226,
"grad_norm": 0.6111003204096657,
"learning_rate": 7.1819170314298746e-06,
"loss": 0.9022,
"step": 1270
},
{
"epoch": 1.268025857782198,
"grad_norm": 0.5636007624972241,
"learning_rate": 7.09871126588481e-06,
"loss": 0.8926,
"step": 1275
},
{
"epoch": 1.2729985082048731,
"grad_norm": 0.6013817391842333,
"learning_rate": 7.015724251740766e-06,
"loss": 0.9104,
"step": 1280
},
{
"epoch": 1.2779711586275484,
"grad_norm": 0.6155188711752694,
"learning_rate": 6.932962246053577e-06,
"loss": 0.9095,
"step": 1285
},
{
"epoch": 1.2829438090502236,
"grad_norm": 0.6172811916494736,
"learning_rate": 6.8504314889138956e-06,
"loss": 0.8996,
"step": 1290
},
{
"epoch": 1.2879164594728991,
"grad_norm": 0.605702181756811,
"learning_rate": 6.768138202976691e-06,
"loss": 0.8974,
"step": 1295
},
{
"epoch": 1.2928891098955744,
"grad_norm": 0.6159076278297484,
"learning_rate": 6.686088592992067e-06,
"loss": 0.8946,
"step": 1300
},
{
"epoch": 1.2978617603182496,
"grad_norm": 0.6031330118847871,
"learning_rate": 6.604288845337453e-06,
"loss": 0.8899,
"step": 1305
},
{
"epoch": 1.3028344107409249,
"grad_norm": 0.5905887773391477,
"learning_rate": 6.522745127551158e-06,
"loss": 0.8783,
"step": 1310
},
{
"epoch": 1.3078070611636001,
"grad_norm": 0.6196998052199774,
"learning_rate": 6.441463587867341e-06,
"loss": 0.8913,
"step": 1315
},
{
"epoch": 1.3127797115862756,
"grad_norm": 0.6365018272765014,
"learning_rate": 6.360450354752459e-06,
"loss": 0.8971,
"step": 1320
},
{
"epoch": 1.3177523620089509,
"grad_norm": 0.6432175965282628,
"learning_rate": 6.279711536443185e-06,
"loss": 0.8997,
"step": 1325
},
{
"epoch": 1.322725012431626,
"grad_norm": 0.6190646458071539,
"learning_rate": 6.199253220485857e-06,
"loss": 0.8959,
"step": 1330
},
{
"epoch": 1.3276976628543014,
"grad_norm": 0.5944004518537376,
"learning_rate": 6.119081473277502e-06,
"loss": 0.8891,
"step": 1335
},
{
"epoch": 1.3326703132769766,
"grad_norm": 0.5966011460567672,
"learning_rate": 6.039202339608432e-06,
"loss": 0.8972,
"step": 1340
},
{
"epoch": 1.3376429636996519,
"grad_norm": 0.6043525331022274,
"learning_rate": 5.959621842206474e-06,
"loss": 0.8968,
"step": 1345
},
{
"epoch": 1.342615614122327,
"grad_norm": 0.6274316003330035,
"learning_rate": 5.880345981282877e-06,
"loss": 0.8975,
"step": 1350
},
{
"epoch": 1.3475882645450024,
"grad_norm": 0.6387485089736155,
"learning_rate": 5.801380734079906e-06,
"loss": 0.8882,
"step": 1355
},
{
"epoch": 1.3525609149676778,
"grad_norm": 0.6057683456315234,
"learning_rate": 5.722732054420172e-06,
"loss": 0.8968,
"step": 1360
},
{
"epoch": 1.357533565390353,
"grad_norm": 0.641291647959097,
"learning_rate": 5.644405872257716e-06,
"loss": 0.9089,
"step": 1365
},
{
"epoch": 1.3625062158130283,
"grad_norm": 0.6093344166563688,
"learning_rate": 5.566408093230911e-06,
"loss": 0.901,
"step": 1370
},
{
"epoch": 1.3674788662357036,
"grad_norm": 0.6137226994575677,
"learning_rate": 5.48874459821719e-06,
"loss": 0.8955,
"step": 1375
},
{
"epoch": 1.3724515166583788,
"grad_norm": 0.5988605927384872,
"learning_rate": 5.411421242889643e-06,
"loss": 0.8972,
"step": 1380
},
{
"epoch": 1.3774241670810543,
"grad_norm": 0.5972211719762297,
"learning_rate": 5.334443857275488e-06,
"loss": 0.8943,
"step": 1385
},
{
"epoch": 1.3823968175037296,
"grad_norm": 0.5959667581735492,
"learning_rate": 5.257818245316522e-06,
"loss": 0.8838,
"step": 1390
},
{
"epoch": 1.3873694679264048,
"grad_norm": 0.595064918048714,
"learning_rate": 5.181550184431511e-06,
"loss": 0.8969,
"step": 1395
},
{
"epoch": 1.39234211834908,
"grad_norm": 0.6244444134401027,
"learning_rate": 5.105645425080572e-06,
"loss": 0.8999,
"step": 1400
},
{
"epoch": 1.3973147687717553,
"grad_norm": 0.6143231676432872,
"learning_rate": 5.030109690331625e-06,
"loss": 0.8848,
"step": 1405
},
{
"epoch": 1.4022874191944306,
"grad_norm": 0.5932069860124085,
"learning_rate": 4.954948675428853e-06,
"loss": 0.9015,
"step": 1410
},
{
"epoch": 1.4072600696171058,
"grad_norm": 0.6037350510324395,
"learning_rate": 4.880168047363312e-06,
"loss": 0.904,
"step": 1415
},
{
"epoch": 1.4122327200397813,
"grad_norm": 0.6127392479761228,
"learning_rate": 4.805773444445654e-06,
"loss": 0.888,
"step": 1420
},
{
"epoch": 1.4172053704624565,
"grad_norm": 0.6177060602745384,
"learning_rate": 4.731770475880995e-06,
"loss": 0.8983,
"step": 1425
},
{
"epoch": 1.4221780208851318,
"grad_norm": 0.5872982950015017,
"learning_rate": 4.658164721345998e-06,
"loss": 0.8924,
"step": 1430
},
{
"epoch": 1.427150671307807,
"grad_norm": 0.5820337262775029,
"learning_rate": 4.584961730568188e-06,
"loss": 0.8748,
"step": 1435
},
{
"epoch": 1.4321233217304823,
"grad_norm": 0.5998336756778414,
"learning_rate": 4.512167022907494e-06,
"loss": 0.8957,
"step": 1440
},
{
"epoch": 1.4370959721531578,
"grad_norm": 0.5961438955138286,
"learning_rate": 4.439786086940116e-06,
"loss": 0.8961,
"step": 1445
},
{
"epoch": 1.442068622575833,
"grad_norm": 0.6078965414451544,
"learning_rate": 4.367824380044684e-06,
"loss": 0.8911,
"step": 1450
},
{
"epoch": 1.4470412729985083,
"grad_norm": 0.6288618597339264,
"learning_rate": 4.296287327990797e-06,
"loss": 0.9019,
"step": 1455
},
{
"epoch": 1.4520139234211835,
"grad_norm": 0.6151434227389634,
"learning_rate": 4.225180324529917e-06,
"loss": 0.8993,
"step": 1460
},
{
"epoch": 1.4569865738438588,
"grad_norm": 0.6404133470470939,
"learning_rate": 4.154508730988704e-06,
"loss": 0.889,
"step": 1465
},
{
"epoch": 1.461959224266534,
"grad_norm": 0.6164441372345689,
"learning_rate": 4.084277875864776e-06,
"loss": 0.8986,
"step": 1470
},
{
"epoch": 1.4669318746892093,
"grad_norm": 0.6109571828322322,
"learning_rate": 4.0144930544249436e-06,
"loss": 0.8946,
"step": 1475
},
{
"epoch": 1.4719045251118845,
"grad_norm": 0.5732121037549397,
"learning_rate": 3.945159528305971e-06,
"loss": 0.8917,
"step": 1480
},
{
"epoch": 1.47687717553456,
"grad_norm": 0.5956428501272881,
"learning_rate": 3.876282525117847e-06,
"loss": 0.906,
"step": 1485
},
{
"epoch": 1.4818498259572352,
"grad_norm": 0.6072276314986297,
"learning_rate": 3.8078672380496416e-06,
"loss": 0.8924,
"step": 1490
},
{
"epoch": 1.4868224763799105,
"grad_norm": 0.5896349716762431,
"learning_rate": 3.7399188254779527e-06,
"loss": 0.9039,
"step": 1495
},
{
"epoch": 1.4917951268025857,
"grad_norm": 0.6075888226854094,
"learning_rate": 3.6724424105779654e-06,
"loss": 0.8993,
"step": 1500
},
{
"epoch": 1.496767777225261,
"grad_norm": 0.6174215911692067,
"learning_rate": 3.6054430809371723e-06,
"loss": 0.9013,
"step": 1505
},
{
"epoch": 1.5017404276479365,
"grad_norm": 0.6152858958633052,
"learning_rate": 3.5389258881718003e-06,
"loss": 0.8818,
"step": 1510
},
{
"epoch": 1.5067130780706117,
"grad_norm": 0.617067387408615,
"learning_rate": 3.4728958475459052e-06,
"loss": 0.8879,
"step": 1515
},
{
"epoch": 1.511685728493287,
"grad_norm": 0.6005431823762195,
"learning_rate": 3.4073579375932377e-06,
"loss": 0.8917,
"step": 1520
},
{
"epoch": 1.5166583789159622,
"grad_norm": 0.5744056476076853,
"learning_rate": 3.342317099741886e-06,
"loss": 0.883,
"step": 1525
},
{
"epoch": 1.5216310293386375,
"grad_norm": 0.6483072296156038,
"learning_rate": 3.27777823794168e-06,
"loss": 0.911,
"step": 1530
},
{
"epoch": 1.5266036797613127,
"grad_norm": 0.6051760146177871,
"learning_rate": 3.2137462182944557e-06,
"loss": 0.898,
"step": 1535
},
{
"epoch": 1.531576330183988,
"grad_norm": 0.5905220877100078,
"learning_rate": 3.150225868687161e-06,
"loss": 0.8885,
"step": 1540
},
{
"epoch": 1.5365489806066632,
"grad_norm": 0.5972075844479279,
"learning_rate": 3.0872219784278357e-06,
"loss": 0.8754,
"step": 1545
},
{
"epoch": 1.5415216310293385,
"grad_norm": 0.5896259569827768,
"learning_rate": 3.0247392978845203e-06,
"loss": 0.8976,
"step": 1550
},
{
"epoch": 1.546494281452014,
"grad_norm": 0.5850464264225753,
"learning_rate": 2.9627825381270704e-06,
"loss": 0.8762,
"step": 1555
},
{
"epoch": 1.5514669318746892,
"grad_norm": 0.5803411751479154,
"learning_rate": 2.9013563705719673e-06,
"loss": 0.8914,
"step": 1560
},
{
"epoch": 1.5564395822973645,
"grad_norm": 0.6107318414430488,
"learning_rate": 2.840465426630091e-06,
"loss": 0.8927,
"step": 1565
},
{
"epoch": 1.56141223272004,
"grad_norm": 0.5883688758640033,
"learning_rate": 2.7801142973575245e-06,
"loss": 0.899,
"step": 1570
},
{
"epoch": 1.5663848831427152,
"grad_norm": 0.5806776595867678,
"learning_rate": 2.720307533109402e-06,
"loss": 0.8714,
"step": 1575
},
{
"epoch": 1.5713575335653904,
"grad_norm": 0.6072257831136263,
"learning_rate": 2.6610496431968125e-06,
"loss": 0.8909,
"step": 1580
},
{
"epoch": 1.5763301839880657,
"grad_norm": 0.5984905911242053,
"learning_rate": 2.6023450955468176e-06,
"loss": 0.8905,
"step": 1585
},
{
"epoch": 1.581302834410741,
"grad_norm": 0.5982169185055861,
"learning_rate": 2.5441983163655705e-06,
"loss": 0.893,
"step": 1590
},
{
"epoch": 1.5862754848334162,
"grad_norm": 0.6845641216521319,
"learning_rate": 2.4866136898045844e-06,
"loss": 0.888,
"step": 1595
},
{
"epoch": 1.5912481352560914,
"grad_norm": 0.6521336761418035,
"learning_rate": 2.4295955576301966e-06,
"loss": 0.8975,
"step": 1600
},
{
"epoch": 1.5962207856787667,
"grad_norm": 0.6026814731835343,
"learning_rate": 2.373148218896182e-06,
"loss": 0.8955,
"step": 1605
},
{
"epoch": 1.601193436101442,
"grad_norm": 0.6031328469823622,
"learning_rate": 2.3172759296196267e-06,
"loss": 0.8984,
"step": 1610
},
{
"epoch": 1.6061660865241174,
"grad_norm": 0.6085878347429402,
"learning_rate": 2.2619829024600394e-06,
"loss": 0.897,
"step": 1615
},
{
"epoch": 1.6111387369467927,
"grad_norm": 0.5913537335897251,
"learning_rate": 2.2072733064017104e-06,
"loss": 0.9019,
"step": 1620
},
{
"epoch": 1.616111387369468,
"grad_norm": 0.6046011617732275,
"learning_rate": 2.153151266439384e-06,
"loss": 0.89,
"step": 1625
},
{
"epoch": 1.6210840377921432,
"grad_norm": 0.5977994670480866,
"learning_rate": 2.0996208632672475e-06,
"loss": 0.8857,
"step": 1630
},
{
"epoch": 1.6260566882148186,
"grad_norm": 0.5798385899827562,
"learning_rate": 2.0466861329712473e-06,
"loss": 0.8893,
"step": 1635
},
{
"epoch": 1.6310293386374939,
"grad_norm": 0.6034790461256396,
"learning_rate": 1.994351066724781e-06,
"loss": 0.8841,
"step": 1640
},
{
"epoch": 1.6360019890601691,
"grad_norm": 0.6144777283053662,
"learning_rate": 1.9426196104877737e-06,
"loss": 0.8754,
"step": 1645
},
{
"epoch": 1.6409746394828444,
"grad_norm": 0.589100085406895,
"learning_rate": 1.8914956647091497e-06,
"loss": 0.8859,
"step": 1650
},
{
"epoch": 1.6459472899055196,
"grad_norm": 0.5907474864486845,
"learning_rate": 1.8409830840327546e-06,
"loss": 0.8906,
"step": 1655
},
{
"epoch": 1.650919940328195,
"grad_norm": 0.5944183790215095,
"learning_rate": 1.791085677006722e-06,
"loss": 0.8987,
"step": 1660
},
{
"epoch": 1.6558925907508701,
"grad_norm": 0.5862999400507831,
"learning_rate": 1.7418072057963143e-06,
"loss": 0.8846,
"step": 1665
},
{
"epoch": 1.6608652411735454,
"grad_norm": 0.6065039887812087,
"learning_rate": 1.6931513859002636e-06,
"loss": 0.8882,
"step": 1670
},
{
"epoch": 1.6658378915962206,
"grad_norm": 0.5829700290824299,
"learning_rate": 1.6451218858706374e-06,
"loss": 0.8899,
"step": 1675
},
{
"epoch": 1.6708105420188961,
"grad_norm": 0.5747340494639567,
"learning_rate": 1.5977223270362197e-06,
"loss": 0.8779,
"step": 1680
},
{
"epoch": 1.6757831924415714,
"grad_norm": 0.6236514063391653,
"learning_rate": 1.5509562832294944e-06,
"loss": 0.8906,
"step": 1685
},
{
"epoch": 1.6807558428642466,
"grad_norm": 0.5890477510202525,
"learning_rate": 1.5048272805171615e-06,
"loss": 0.8735,
"step": 1690
},
{
"epoch": 1.685728493286922,
"grad_norm": 0.5887405386066722,
"learning_rate": 1.459338796934293e-06,
"loss": 0.8787,
"step": 1695
},
{
"epoch": 1.6907011437095973,
"grad_norm": 0.5560854923827532,
"learning_rate": 1.4144942622220902e-06,
"loss": 0.8818,
"step": 1700
},
{
"epoch": 1.6956737941322726,
"grad_norm": 0.6127568360778808,
"learning_rate": 1.3702970575692975e-06,
"loss": 0.8969,
"step": 1705
},
{
"epoch": 1.7006464445549478,
"grad_norm": 0.576741886798026,
"learning_rate": 1.3267505153572502e-06,
"loss": 0.8913,
"step": 1710
},
{
"epoch": 1.705619094977623,
"grad_norm": 0.5786099553553555,
"learning_rate": 1.2838579189086352e-06,
"loss": 0.8836,
"step": 1715
},
{
"epoch": 1.7105917454002983,
"grad_norm": 0.5977190789902269,
"learning_rate": 1.2416225022399286e-06,
"loss": 0.8837,
"step": 1720
},
{
"epoch": 1.7155643958229736,
"grad_norm": 0.60144046208383,
"learning_rate": 1.2000474498175552e-06,
"loss": 0.8904,
"step": 1725
},
{
"epoch": 1.7205370462456488,
"grad_norm": 0.5695644106157955,
"learning_rate": 1.1591358963177924e-06,
"loss": 0.8999,
"step": 1730
},
{
"epoch": 1.725509696668324,
"grad_norm": 0.6030300408695916,
"learning_rate": 1.118890926390419e-06,
"loss": 0.8849,
"step": 1735
},
{
"epoch": 1.7304823470909994,
"grad_norm": 0.5754077081164456,
"learning_rate": 1.0793155744261352e-06,
"loss": 0.8809,
"step": 1740
},
{
"epoch": 1.7354549975136748,
"grad_norm": 0.5782023378111908,
"learning_rate": 1.0404128243277778e-06,
"loss": 0.8875,
"step": 1745
},
{
"epoch": 1.74042764793635,
"grad_norm": 0.589876052766199,
"learning_rate": 1.0021856092853433e-06,
"loss": 0.8843,
"step": 1750
},
{
"epoch": 1.7454002983590253,
"grad_norm": 0.594939794983627,
"learning_rate": 9.646368115548232e-07,
"loss": 0.8834,
"step": 1755
},
{
"epoch": 1.7503729487817008,
"grad_norm": 0.5953106465263236,
"learning_rate": 9.277692622409018e-07,
"loss": 0.8667,
"step": 1760
},
{
"epoch": 1.755345599204376,
"grad_norm": 0.6161359913787724,
"learning_rate": 8.915857410834793e-07,
"loss": 0.8891,
"step": 1765
},
{
"epoch": 1.7603182496270513,
"grad_norm": 0.6439397975128724,
"learning_rate": 8.560889762480951e-07,
"loss": 0.8768,
"step": 1770
},
{
"epoch": 1.7652909000497266,
"grad_norm": 0.5657607722214749,
"learning_rate": 8.212816441202309e-07,
"loss": 0.8886,
"step": 1775
},
{
"epoch": 1.7702635504724018,
"grad_norm": 0.6073482243771166,
"learning_rate": 7.871663691035103e-07,
"loss": 0.8901,
"step": 1780
},
{
"epoch": 1.775236200895077,
"grad_norm": 0.566216143109814,
"learning_rate": 7.537457234218271e-07,
"loss": 0.8844,
"step": 1785
},
{
"epoch": 1.7802088513177523,
"grad_norm": 0.5896090553262934,
"learning_rate": 7.210222269254041e-07,
"loss": 0.8897,
"step": 1790
},
{
"epoch": 1.7851815017404276,
"grad_norm": 0.5874428543517964,
"learning_rate": 6.889983469008055e-07,
"loss": 0.887,
"step": 1795
},
{
"epoch": 1.7901541521631028,
"grad_norm": 0.5923783661686378,
"learning_rate": 6.576764978849005e-07,
"loss": 0.89,
"step": 1800
},
{
"epoch": 1.795126802585778,
"grad_norm": 0.5747156271260179,
"learning_rate": 6.27059041482817e-07,
"loss": 0.8803,
"step": 1805
},
{
"epoch": 1.8000994530084535,
"grad_norm": 0.5843164968993672,
"learning_rate": 5.971482861898836e-07,
"loss": 0.8814,
"step": 1810
},
{
"epoch": 1.8050721034311288,
"grad_norm": 0.5925488819832142,
"learning_rate": 5.679464872175666e-07,
"loss": 0.8798,
"step": 1815
},
{
"epoch": 1.810044753853804,
"grad_norm": 0.5814849122329898,
"learning_rate": 5.394558463234378e-07,
"loss": 0.8928,
"step": 1820
},
{
"epoch": 1.8150174042764795,
"grad_norm": 0.5653073997150958,
"learning_rate": 5.116785116451661e-07,
"loss": 0.8858,
"step": 1825
},
{
"epoch": 1.8199900546991548,
"grad_norm": 0.5977924852737442,
"learning_rate": 4.846165775385459e-07,
"loss": 0.8868,
"step": 1830
},
{
"epoch": 1.82496270512183,
"grad_norm": 0.5993863609862281,
"learning_rate": 4.5827208441959426e-07,
"loss": 0.891,
"step": 1835
},
{
"epoch": 1.8299353555445053,
"grad_norm": 0.601479874243501,
"learning_rate": 4.326470186107035e-07,
"loss": 0.8791,
"step": 1840
},
{
"epoch": 1.8349080059671805,
"grad_norm": 0.592784564470986,
"learning_rate": 4.077433121908747e-07,
"loss": 0.8875,
"step": 1845
},
{
"epoch": 1.8398806563898558,
"grad_norm": 0.5707840764321231,
"learning_rate": 3.835628428500515e-07,
"loss": 0.8646,
"step": 1850
},
{
"epoch": 1.844853306812531,
"grad_norm": 0.5628275928965689,
"learning_rate": 3.601074337475352e-07,
"loss": 0.8769,
"step": 1855
},
{
"epoch": 1.8498259572352063,
"grad_norm": 0.6070264634956976,
"learning_rate": 3.3737885337452815e-07,
"loss": 0.8996,
"step": 1860
},
{
"epoch": 1.8547986076578815,
"grad_norm": 0.5987252942276654,
"learning_rate": 3.153788154207926e-07,
"loss": 0.9035,
"step": 1865
},
{
"epoch": 1.859771258080557,
"grad_norm": 0.603313374471039,
"learning_rate": 2.941089786454421e-07,
"loss": 0.8928,
"step": 1870
},
{
"epoch": 1.8647439085032322,
"grad_norm": 0.6133979767792208,
"learning_rate": 2.735709467518699e-07,
"loss": 0.8796,
"step": 1875
},
{
"epoch": 1.8697165589259075,
"grad_norm": 0.5708712405374609,
"learning_rate": 2.5376626826683956e-07,
"loss": 0.8783,
"step": 1880
},
{
"epoch": 1.8746892093485827,
"grad_norm": 0.5903635964464384,
"learning_rate": 2.3469643642372587e-07,
"loss": 0.8727,
"step": 1885
},
{
"epoch": 1.8796618597712582,
"grad_norm": 0.5860886967031799,
"learning_rate": 2.1636288904992585e-07,
"loss": 0.8911,
"step": 1890
},
{
"epoch": 1.8846345101939335,
"grad_norm": 0.5987205804929953,
"learning_rate": 1.9876700845845475e-07,
"loss": 0.8622,
"step": 1895
},
{
"epoch": 1.8896071606166087,
"grad_norm": 0.5864702159345376,
"learning_rate": 1.8191012134371576e-07,
"loss": 0.8997,
"step": 1900
},
{
"epoch": 1.894579811039284,
"grad_norm": 0.5941563218461623,
"learning_rate": 1.6579349868147688e-07,
"loss": 0.8801,
"step": 1905
},
{
"epoch": 1.8995524614619592,
"grad_norm": 0.5890830924362549,
"learning_rate": 1.504183556330374e-07,
"loss": 0.8752,
"step": 1910
},
{
"epoch": 1.9045251118846345,
"grad_norm": 0.5973613796458782,
"learning_rate": 1.3578585145360812e-07,
"loss": 0.888,
"step": 1915
},
{
"epoch": 1.9094977623073097,
"grad_norm": 0.6015109014180703,
"learning_rate": 1.2189708940490653e-07,
"loss": 0.8915,
"step": 1920
},
{
"epoch": 1.914470412729985,
"grad_norm": 0.5840101145509294,
"learning_rate": 1.0875311667196908e-07,
"loss": 0.8823,
"step": 1925
},
{
"epoch": 1.9194430631526602,
"grad_norm": 0.9953027707453141,
"learning_rate": 9.635492428420434e-08,
"loss": 0.8706,
"step": 1930
},
{
"epoch": 1.9244157135753357,
"grad_norm": 0.569992582848356,
"learning_rate": 8.470344704066047e-08,
"loss": 0.8935,
"step": 1935
},
{
"epoch": 1.929388363998011,
"grad_norm": 0.5820719941969129,
"learning_rate": 7.379956343955385e-08,
"loss": 0.8726,
"step": 1940
},
{
"epoch": 1.9343610144206862,
"grad_norm": 0.5883844683240437,
"learning_rate": 6.364409561202323e-08,
"loss": 0.8907,
"step": 1945
},
{
"epoch": 1.9393336648433617,
"grad_norm": 0.5888023020558691,
"learning_rate": 5.42378092601481e-08,
"loss": 0.8733,
"step": 1950
},
{
"epoch": 1.944306315266037,
"grad_norm": 0.6131742183657333,
"learning_rate": 4.558141359921386e-08,
"loss": 0.8835,
"step": 1955
},
{
"epoch": 1.9492789656887122,
"grad_norm": 0.5724474865234898,
"learning_rate": 3.7675561304238996e-08,
"loss": 0.9032,
"step": 1960
},
{
"epoch": 1.9542516161113874,
"grad_norm": 0.5716096201690882,
"learning_rate": 3.0520848460765525e-08,
"loss": 0.8891,
"step": 1965
},
{
"epoch": 1.9592242665340627,
"grad_norm": 0.5954898642826182,
"learning_rate": 2.4117814519911687e-08,
"loss": 0.8951,
"step": 1970
},
{
"epoch": 1.964196916956738,
"grad_norm": 0.5883075227671835,
"learning_rate": 1.846694225770551e-08,
"loss": 0.8799,
"step": 1975
},
{
"epoch": 1.9691695673794132,
"grad_norm": 0.5596221737627415,
"learning_rate": 1.3568657738678437e-08,
"loss": 0.8917,
"step": 1980
},
{
"epoch": 1.9741422178020884,
"grad_norm": 0.5985361336002101,
"learning_rate": 9.423330283742093e-09,
"loss": 0.8822,
"step": 1985
},
{
"epoch": 1.9791148682247637,
"grad_norm": 0.5774439559985055,
"learning_rate": 6.031272442341696e-09,
"loss": 0.8984,
"step": 1990
},
{
"epoch": 1.984087518647439,
"grad_norm": 0.5947058034013541,
"learning_rate": 3.3927399688948868e-09,
"loss": 0.88,
"step": 1995
},
{
"epoch": 1.9890601690701144,
"grad_norm": 0.5662497030571633,
"learning_rate": 1.5079318035016166e-09,
"loss": 0.8558,
"step": 2000
},
{
"epoch": 1.9940328194927897,
"grad_norm": 0.584469598380576,
"learning_rate": 3.7699005695057687e-10,
"loss": 0.8875,
"step": 2005
},
{
"epoch": 1.999005469915465,
"grad_norm": 0.5888373608660001,
"learning_rate": 0.0,
"loss": 0.8693,
"step": 2010
},
{
"epoch": 1.999005469915465,
"eval_loss": 1.057645320892334,
"eval_runtime": 312.6039,
"eval_samples_per_second": 45.543,
"eval_steps_per_second": 0.713,
"step": 2010
},
{
"epoch": 1.999005469915465,
"step": 2010,
"total_flos": 841287456522240.0,
"train_loss": 0.6627339932455946,
"train_runtime": 16962.808,
"train_samples_per_second": 15.173,
"train_steps_per_second": 0.118
}
],
"logging_steps": 5,
"max_steps": 2010,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 841287456522240.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}