{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.999005469915465,
  "eval_steps": 500,
  "global_step": 2010,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.000994530084535057, "grad_norm": 23.116023523946204, "learning_rate": 9.950248756218906e-08, "loss": 1.4467, "step": 1 },
    { "epoch": 0.004972650422675286, "grad_norm": 22.614904504884613, "learning_rate": 4.975124378109453e-07, "loss": 1.4087, "step": 5 },
    { "epoch": 0.009945300845350571, "grad_norm": 16.569672699698376, "learning_rate": 9.950248756218907e-07, "loss": 1.3944, "step": 10 },
    { "epoch": 0.014917951268025857, "grad_norm": 3.581639215568655, "learning_rate": 1.4925373134328358e-06, "loss": 1.3012, "step": 15 },
    { "epoch": 0.019890601690701143, "grad_norm": 1.9954309309104792, "learning_rate": 1.9900497512437813e-06, "loss": 1.2372, "step": 20 },
    { "epoch": 0.02486325211337643, "grad_norm": 1.363643820597461, "learning_rate": 2.4875621890547264e-06, "loss": 1.1994, "step": 25 },
    { "epoch": 0.029835902536051714, "grad_norm": 0.9605594350100669, "learning_rate": 2.9850746268656716e-06, "loss": 1.1902, "step": 30 },
    { "epoch": 0.034808552958727, "grad_norm": 0.9786652533260872, "learning_rate": 3.4825870646766175e-06, "loss": 1.1542, "step": 35 },
    { "epoch": 0.039781203381402286, "grad_norm": 0.9984572750782378, "learning_rate": 3.980099502487563e-06, "loss": 1.1175, "step": 40 },
    { "epoch": 0.04475385380407757, "grad_norm": 0.7596880194893064, "learning_rate": 4.477611940298508e-06, "loss": 1.1268, "step": 45 },
    { "epoch": 0.04972650422675286, "grad_norm": 0.8716479829689806, "learning_rate": 4.975124378109453e-06, "loss": 1.1332, "step": 50 },
    { "epoch": 0.05469915464942814, "grad_norm": 0.791852622062893, "learning_rate": 5.472636815920398e-06, "loss": 1.132, "step": 55 },
    { "epoch": 0.05967180507210343, "grad_norm": 0.733246468747676, "learning_rate": 5.970149253731343e-06, "loss": 1.0988, "step": 60 },
    { "epoch": 0.06464445549477872, "grad_norm": 0.7512018991531512, "learning_rate": 6.46766169154229e-06, "loss": 1.1027, "step": 65 },
    { "epoch": 0.069617105917454, "grad_norm": 1.0504364963362558, "learning_rate": 6.965174129353235e-06, "loss": 1.1057, "step": 70 },
    { "epoch": 0.07458975634012929, "grad_norm": 0.8367942560809148, "learning_rate": 7.46268656716418e-06, "loss": 1.1211, "step": 75 },
    { "epoch": 0.07956240676280457, "grad_norm": 0.7659112396017037, "learning_rate": 7.960199004975125e-06, "loss": 1.1034, "step": 80 },
    { "epoch": 0.08453505718547986, "grad_norm": 0.7564136569017796, "learning_rate": 8.45771144278607e-06, "loss": 1.1095, "step": 85 },
    { "epoch": 0.08950770760815514, "grad_norm": 0.680667197827103, "learning_rate": 8.955223880597016e-06, "loss": 1.1228, "step": 90 },
    { "epoch": 0.09448035803083044, "grad_norm": 0.7778561000416293, "learning_rate": 9.45273631840796e-06, "loss": 1.1305, "step": 95 },
    { "epoch": 0.09945300845350571, "grad_norm": 0.7582042725036765, "learning_rate": 9.950248756218906e-06, "loss": 1.1032, "step": 100 },
    { "epoch": 0.10442565887618101, "grad_norm": 0.6743713824965428, "learning_rate": 1.0447761194029851e-05, "loss": 1.094, "step": 105 },
    { "epoch": 0.10939830929885629, "grad_norm": 0.8423183568175382, "learning_rate": 1.0945273631840796e-05, "loss": 1.1025, "step": 110 },
    { "epoch": 0.11437095972153158, "grad_norm": 0.7394348567397752, "learning_rate": 1.1442786069651741e-05, "loss": 1.105, "step": 115 },
    { "epoch": 0.11934361014420686, "grad_norm": 1.039483051036899, "learning_rate": 1.1940298507462686e-05, "loss": 1.1086, "step": 120 },
    { "epoch": 0.12431626056688215, "grad_norm": 0.8818107439680185, "learning_rate": 1.2437810945273631e-05, "loss": 1.1218, "step": 125 },
    { "epoch": 0.12928891098955744, "grad_norm": 0.7903765993701828, "learning_rate": 1.293532338308458e-05, "loss": 1.0911, "step": 130 },
    { "epoch": 0.13426156141223272, "grad_norm": 0.8928561309689899, "learning_rate": 1.3432835820895525e-05, "loss": 1.1035, "step": 135 },
    { "epoch": 0.139234211834908, "grad_norm": 0.8136039158436408, "learning_rate": 1.393034825870647e-05, "loss": 1.1187, "step": 140 },
    { "epoch": 0.14420686225758328, "grad_norm": 0.7064949127673769, "learning_rate": 1.4427860696517415e-05, "loss": 1.1357, "step": 145 },
    { "epoch": 0.14917951268025859, "grad_norm": 0.920457572603285, "learning_rate": 1.492537313432836e-05, "loss": 1.1218, "step": 150 },
    { "epoch": 0.15415216310293386, "grad_norm": 0.7265048207847127, "learning_rate": 1.5422885572139307e-05, "loss": 1.0863, "step": 155 },
    { "epoch": 0.15912481352560914, "grad_norm": 0.9780404914044643, "learning_rate": 1.592039800995025e-05, "loss": 1.1203, "step": 160 },
    { "epoch": 0.16409746394828442, "grad_norm": 1.0254663723419732, "learning_rate": 1.6417910447761197e-05, "loss": 1.1079, "step": 165 },
    { "epoch": 0.16907011437095973, "grad_norm": 0.8097421068534958, "learning_rate": 1.691542288557214e-05, "loss": 1.1086, "step": 170 },
    { "epoch": 0.174042764793635, "grad_norm": 0.7012708079715231, "learning_rate": 1.7412935323383088e-05, "loss": 1.0985, "step": 175 },
    { "epoch": 0.1790154152163103, "grad_norm": 0.8920950219504408, "learning_rate": 1.791044776119403e-05, "loss": 1.106, "step": 180 },
    { "epoch": 0.1839880656389856, "grad_norm": 0.7178834727892439, "learning_rate": 1.8407960199004978e-05, "loss": 1.1089, "step": 185 },
    { "epoch": 0.18896071606166087, "grad_norm": 0.8106510565314389, "learning_rate": 1.890547263681592e-05, "loss": 1.1303, "step": 190 },
    { "epoch": 0.19393336648433615, "grad_norm": 0.7760161666634037, "learning_rate": 1.9402985074626868e-05, "loss": 1.1, "step": 195 },
    { "epoch": 0.19890601690701143, "grad_norm": 0.9543725947472645, "learning_rate": 1.990049751243781e-05, "loss": 1.1113, "step": 200 },
    { "epoch": 0.20387866732968674, "grad_norm": 0.7825954675614248, "learning_rate": 1.9999758725817802e-05, "loss": 1.1142, "step": 205 },
    { "epoch": 0.20885131775236201, "grad_norm": 0.9041843566452702, "learning_rate": 1.999877856940653e-05, "loss": 1.1107, "step": 210 },
    { "epoch": 0.2138239681750373, "grad_norm": 0.8759171946616057, "learning_rate": 1.9997044524974797e-05, "loss": 1.1075, "step": 215 },
    { "epoch": 0.21879661859771257, "grad_norm": 0.6883474381832496, "learning_rate": 1.9994556723266102e-05, "loss": 1.0916, "step": 220 },
    { "epoch": 0.22376926902038788, "grad_norm": 0.935590319528177, "learning_rate": 1.999131535185575e-05, "loss": 1.1354, "step": 225 },
    { "epoch": 0.22874191944306316, "grad_norm": 0.7241812404841466, "learning_rate": 1.9987320655136693e-05, "loss": 1.091, "step": 230 },
    { "epoch": 0.23371456986573844, "grad_norm": 0.7126868228357645, "learning_rate": 1.998257293430112e-05, "loss": 1.1183, "step": 235 },
    { "epoch": 0.23868722028841372, "grad_norm": 0.7747343156203321, "learning_rate": 1.997707254731775e-05, "loss": 1.1183, "step": 240 },
    { "epoch": 0.24365987071108902, "grad_norm": 0.7040038125289689, "learning_rate": 1.9970819908904815e-05, "loss": 1.1129, "step": 245 },
    { "epoch": 0.2486325211337643, "grad_norm": 0.7279011261296227, "learning_rate": 1.996381549049882e-05, "loss": 1.1198, "step": 250 },
    { "epoch": 0.2536051715564396, "grad_norm": 0.7449816844457622, "learning_rate": 1.9956059820218982e-05, "loss": 1.112, "step": 255 },
    { "epoch": 0.2585778219791149, "grad_norm": 0.8826758475728962, "learning_rate": 1.994755348282742e-05, "loss": 1.1176, "step": 260 },
    { "epoch": 0.26355047240179014, "grad_norm": 0.7356262066526851, "learning_rate": 1.9938297119685054e-05, "loss": 1.0975, "step": 265 },
    { "epoch": 0.26852312282446544, "grad_norm": 0.7345831845947749, "learning_rate": 1.9928291428703265e-05, "loss": 1.1054, "step": 270 },
    { "epoch": 0.27349577324714075, "grad_norm": 0.6792936934909664, "learning_rate": 1.9917537164291244e-05, "loss": 1.0971, "step": 275 },
    { "epoch": 0.278468423669816, "grad_norm": 0.7211549142360152, "learning_rate": 1.990603513729915e-05, "loss": 1.1137, "step": 280 },
    { "epoch": 0.2834410740924913, "grad_norm": 0.7338829367672012, "learning_rate": 1.9893786214956946e-05, "loss": 1.1031, "step": 285 },
    { "epoch": 0.28841372451516656, "grad_norm": 0.8494824202658868, "learning_rate": 1.9880791320809012e-05, "loss": 1.0962, "step": 290 },
    { "epoch": 0.29338637493784187, "grad_norm": 0.7216958796251164, "learning_rate": 1.9867051434644532e-05, "loss": 1.1262, "step": 295 },
    { "epoch": 0.29835902536051717, "grad_norm": 0.7349825775644381, "learning_rate": 1.985256759242359e-05, "loss": 1.0938, "step": 300 },
    { "epoch": 0.3033316757831924, "grad_norm": 0.6477209491442245, "learning_rate": 1.9837340886199097e-05, "loss": 1.0925, "step": 305 },
    { "epoch": 0.30830432620586773, "grad_norm": 0.643732891793806, "learning_rate": 1.9821372464034416e-05, "loss": 1.116, "step": 310 },
    { "epoch": 0.31327697662854304, "grad_norm": 0.7127469375661294, "learning_rate": 1.9804663529916825e-05, "loss": 1.118, "step": 315 },
    { "epoch": 0.3182496270512183, "grad_norm": 0.6302238233373078, "learning_rate": 1.9787215343666732e-05, "loss": 1.0933, "step": 320 },
    { "epoch": 0.3232222774738936, "grad_norm": 0.6512766142325113, "learning_rate": 1.9769029220842678e-05, "loss": 1.1022, "step": 325 },
    { "epoch": 0.32819492789656884, "grad_norm": 0.6928608412763418, "learning_rate": 1.975010653264216e-05, "loss": 1.1057, "step": 330 },
    { "epoch": 0.33316757831924415, "grad_norm": 0.6801795596819155, "learning_rate": 1.973044870579824e-05, "loss": 1.099, "step": 335 },
    { "epoch": 0.33814022874191946, "grad_norm": 0.7155915889134151, "learning_rate": 1.971005722247197e-05, "loss": 1.1112, "step": 340 },
    { "epoch": 0.3431128791645947, "grad_norm": 0.7270391229967981, "learning_rate": 1.9688933620140638e-05, "loss": 1.0994, "step": 345 },
    { "epoch": 0.34808552958727, "grad_norm": 0.6781191980419545, "learning_rate": 1.966707949148186e-05, "loss": 1.0933, "step": 350 },
    { "epoch": 0.3530581800099453, "grad_norm": 0.8199363645347214, "learning_rate": 1.9644496484253473e-05, "loss": 1.0993, "step": 355 },
    { "epoch": 0.3580308304326206, "grad_norm": 0.7553462054946344, "learning_rate": 1.9621186301169316e-05, "loss": 1.1111, "step": 360 },
    { "epoch": 0.3630034808552959, "grad_norm": 0.6733526636292482, "learning_rate": 1.9597150699770834e-05, "loss": 1.1038, "step": 365 },
    { "epoch": 0.3679761312779712, "grad_norm": 0.63611657294842, "learning_rate": 1.957239149229458e-05, "loss": 1.0894, "step": 370 },
    { "epoch": 0.37294878170064644, "grad_norm": 0.6705699643223696, "learning_rate": 1.954691054553556e-05, "loss": 1.0908, "step": 375 },
    { "epoch": 0.37792143212332174, "grad_norm": 0.6042254858161937, "learning_rate": 1.9520709780706485e-05, "loss": 1.0968, "step": 380 },
    { "epoch": 0.382894082545997, "grad_norm": 0.7276058593761272, "learning_rate": 1.9493791173292924e-05, "loss": 1.0863, "step": 385 },
    { "epoch": 0.3878667329686723, "grad_norm": 0.7335666141482664, "learning_rate": 1.9466156752904344e-05, "loss": 1.0968, "step": 390 },
    { "epoch": 0.3928393833913476, "grad_norm": 0.6999656279264759, "learning_rate": 1.9437808603121086e-05, "loss": 1.1077, "step": 395 },
    { "epoch": 0.39781203381402286, "grad_norm": 0.7171558885154616, "learning_rate": 1.9408748861337274e-05, "loss": 1.0929, "step": 400 },
    { "epoch": 0.40278468423669817, "grad_norm": 0.7881416088231044, "learning_rate": 1.9378979718599647e-05, "loss": 1.1068, "step": 405 },
    { "epoch": 0.40775733465937347, "grad_norm": 0.7628447209491808, "learning_rate": 1.934850341944237e-05, "loss": 1.0979, "step": 410 },
    { "epoch": 0.4127299850820487, "grad_norm": 0.6407210873259641, "learning_rate": 1.9317322261717794e-05, "loss": 1.1028, "step": 415 },
    { "epoch": 0.41770263550472403, "grad_norm": 0.6676034803018082, "learning_rate": 1.9285438596423204e-05, "loss": 1.0943, "step": 420 },
    { "epoch": 0.4226752859273993, "grad_norm": 0.7550987050238746, "learning_rate": 1.9252854827523557e-05, "loss": 1.1066, "step": 425 },
    { "epoch": 0.4276479363500746, "grad_norm": 0.6729112307385414, "learning_rate": 1.9219573411770235e-05, "loss": 1.1008, "step": 430 },
    { "epoch": 0.4326205867727499, "grad_norm": 0.6703726778340369, "learning_rate": 1.9185596858515797e-05, "loss": 1.107, "step": 435 },
    { "epoch": 0.43759323719542514, "grad_norm": 0.6957611244541537, "learning_rate": 1.91509277295248e-05, "loss": 1.0803, "step": 440 },
    { "epoch": 0.44256588761810045, "grad_norm": 0.6302665113292966, "learning_rate": 1.911556863878062e-05, "loss": 1.089, "step": 445 },
    { "epoch": 0.44753853804077576, "grad_norm": 0.6619346129189808, "learning_rate": 1.9079522252288387e-05, "loss": 1.0998, "step": 450 },
    { "epoch": 0.452511188463451, "grad_norm": 0.6303067629333522, "learning_rate": 1.9042791287873958e-05, "loss": 1.0982, "step": 455 },
    { "epoch": 0.4574838388861263, "grad_norm": 0.6817551059292462, "learning_rate": 1.900537851497901e-05, "loss": 1.1123, "step": 460 },
    { "epoch": 0.46245648930880157, "grad_norm": 0.6883985523342773, "learning_rate": 1.8967286754452214e-05, "loss": 1.0994, "step": 465 },
    { "epoch": 0.4674291397314769, "grad_norm": 0.6303976451992164, "learning_rate": 1.892851887833657e-05, "loss": 1.0915, "step": 470 },
    { "epoch": 0.4724017901541522, "grad_norm": 0.6822781316090195, "learning_rate": 1.8889077809652837e-05, "loss": 1.0798, "step": 475 },
    { "epoch": 0.47737444057682743, "grad_norm": 0.6360054418246037, "learning_rate": 1.884896652217917e-05, "loss": 1.0939, "step": 480 },
    { "epoch": 0.48234709099950274, "grad_norm": 0.6992087315432453, "learning_rate": 1.880818804022687e-05, "loss": 1.0987, "step": 485 },
    { "epoch": 0.48731974142217804, "grad_norm": 0.6768633602943569, "learning_rate": 1.8766745438412382e-05, "loss": 1.1199, "step": 490 },
    { "epoch": 0.4922923918448533, "grad_norm": 0.6243661194702032, "learning_rate": 1.872464184142548e-05, "loss": 1.0883, "step": 495 },
    { "epoch": 0.4972650422675286, "grad_norm": 0.6589230792154801, "learning_rate": 1.868188042379364e-05, "loss": 1.1163, "step": 500 },
    { "epoch": 0.5022376926902039, "grad_norm": 0.6642790387561549, "learning_rate": 1.8638464409642724e-05, "loss": 1.0954, "step": 505 },
    { "epoch": 0.5072103431128792, "grad_norm": 0.6423332854509105, "learning_rate": 1.8594397072453854e-05, "loss": 1.076, "step": 510 },
    { "epoch": 0.5121829935355544, "grad_norm": 0.7506393578542332, "learning_rate": 1.8549681734816624e-05, "loss": 1.0985, "step": 515 },
    { "epoch": 0.5171556439582298, "grad_norm": 0.7180721263869316, "learning_rate": 1.850432176817857e-05, "loss": 1.098, "step": 520 },
    { "epoch": 0.522128294380905, "grad_norm": 0.6300038431215529, "learning_rate": 1.8458320592590976e-05, "loss": 1.083, "step": 525 },
    { "epoch": 0.5271009448035803, "grad_norm": 0.8081330388900698, "learning_rate": 1.8411681676450998e-05, "loss": 1.0852, "step": 530 },
    { "epoch": 0.5320735952262556, "grad_norm": 0.8167362817265476, "learning_rate": 1.836440853624017e-05, "loss": 1.1036, "step": 535 },
    { "epoch": 0.5370462456489309, "grad_norm": 0.6121720313365099, "learning_rate": 1.8316504736259257e-05, "loss": 1.0891, "step": 540 },
    { "epoch": 0.5420188960716061, "grad_norm": 0.6187125374867918, "learning_rate": 1.826797388835951e-05, "loss": 1.1023, "step": 545 },
    { "epoch": 0.5469915464942815, "grad_norm": 0.5866212425394042, "learning_rate": 1.8218819651670356e-05, "loss": 1.1075, "step": 550 },
    { "epoch": 0.5519641969169568, "grad_norm": 0.600120796062157, "learning_rate": 1.8169045732323495e-05, "loss": 1.0763, "step": 555 },
    { "epoch": 0.556936847339632, "grad_norm": 0.634846796553325, "learning_rate": 1.8118655883173458e-05, "loss": 1.0827, "step": 560 },
    { "epoch": 0.5619094977623074, "grad_norm": 0.6398611916918977, "learning_rate": 1.8067653903514674e-05, "loss": 1.0787, "step": 565 },
    { "epoch": 0.5668821481849826, "grad_norm": 0.6406203607058498, "learning_rate": 1.8016043638794975e-05, "loss": 1.0738, "step": 570 },
    { "epoch": 0.5718547986076579, "grad_norm": 0.6400875643577613, "learning_rate": 1.7963828980325696e-05, "loss": 1.0818, "step": 575 },
    { "epoch": 0.5768274490303331, "grad_norm": 0.649631239193607, "learning_rate": 1.7911013864988254e-05, "loss": 1.0801, "step": 580 },
    { "epoch": 0.5818000994530085, "grad_norm": 0.6384185175759214, "learning_rate": 1.785760227493731e-05, "loss": 1.0962, "step": 585 },
    { "epoch": 0.5867727498756837, "grad_norm": 0.6637419994479018, "learning_rate": 1.780359823730054e-05, "loss": 1.0986, "step": 590 },
    { "epoch": 0.591745400298359, "grad_norm": 0.6940445103521063, "learning_rate": 1.774900582387499e-05, "loss": 1.0849, "step": 595 },
    { "epoch": 0.5967180507210343, "grad_norm": 0.6352562893683872, "learning_rate": 1.769382915082007e-05, "loss": 1.0827, "step": 600 },
    { "epoch": 0.6016907011437096, "grad_norm": 0.644197961344711, "learning_rate": 1.7638072378347205e-05, "loss": 1.0782, "step": 605 },
    { "epoch": 0.6066633515663848, "grad_norm": 0.6217957456701444, "learning_rate": 1.7581739710406158e-05, "loss": 1.0979, "step": 610 },
    { "epoch": 0.6116360019890602, "grad_norm": 0.6697037211804547, "learning_rate": 1.752483539436807e-05, "loss": 1.0902, "step": 615 },
    { "epoch": 0.6166086524117355, "grad_norm": 0.584860135164739, "learning_rate": 1.7467363720705204e-05, "loss": 1.0779, "step": 620 },
    { "epoch": 0.6215813028344107, "grad_norm": 0.6961081581048412, "learning_rate": 1.740932902266747e-05, "loss": 1.072, "step": 625 },
    { "epoch": 0.6265539532570861, "grad_norm": 0.660941824477212, "learning_rate": 1.7350735675955696e-05, "loss": 1.0857, "step": 630 },
    { "epoch": 0.6315266036797613, "grad_norm": 0.6390929736380937, "learning_rate": 1.72915880983917e-05, "loss": 1.0748, "step": 635 },
    { "epoch": 0.6364992541024366, "grad_norm": 0.6136644106406868, "learning_rate": 1.7231890749585208e-05, "loss": 1.0704, "step": 640 },
    { "epoch": 0.6414719045251119, "grad_norm": 0.6500326781000102, "learning_rate": 1.717164813059761e-05, "loss": 1.0621, "step": 645 },
    { "epoch": 0.6464445549477872, "grad_norm": 0.6274957853270717, "learning_rate": 1.711086478360257e-05, "loss": 1.0882, "step": 650 },
    { "epoch": 0.6514172053704624, "grad_norm": 0.6776791597333586, "learning_rate": 1.704954529154359e-05, "loss": 1.069, "step": 655 },
    { "epoch": 0.6563898557931377, "grad_norm": 0.6307039768815268, "learning_rate": 1.698769427778842e-05, "loss": 1.0845, "step": 660 },
    { "epoch": 0.661362506215813, "grad_norm": 0.6017095125274678, "learning_rate": 1.69253164057805e-05, "loss": 1.0804, "step": 665 },
    { "epoch": 0.6663351566384883, "grad_norm": 0.6094011002277849, "learning_rate": 1.686241637868734e-05, "loss": 1.0681, "step": 670 },
    { "epoch": 0.6713078070611636, "grad_norm": 0.6098389254101954, "learning_rate": 1.6798998939045893e-05, "loss": 1.078, "step": 675 },
    { "epoch": 0.6762804574838389, "grad_norm": 0.6371210646880789, "learning_rate": 1.6735068868405e-05, "loss": 1.0776, "step": 680 },
    { "epoch": 0.6812531079065142, "grad_norm": 0.6519412959002887, "learning_rate": 1.667063098696485e-05, "loss": 1.093, "step": 685 },
    { "epoch": 0.6862257583291894, "grad_norm": 0.6296676825303947, "learning_rate": 1.660569015321357e-05, "loss": 1.079, "step": 690 },
    { "epoch": 0.6911984087518648, "grad_norm": 0.6515938103088492, "learning_rate": 1.654025126356088e-05, "loss": 1.0763, "step": 695 },
    { "epoch": 0.69617105917454, "grad_norm": 0.63614383650923, "learning_rate": 1.647431925196892e-05, "loss": 1.0726, "step": 700 },
    { "epoch": 0.7011437095972153, "grad_norm": 0.6733392483456604, "learning_rate": 1.6407899089580263e-05, "loss": 1.0808, "step": 705 },
    { "epoch": 0.7061163600198906, "grad_norm": 0.6177292064792552, "learning_rate": 1.6340995784343058e-05, "loss": 1.0662, "step": 710 },
    { "epoch": 0.7110890104425659, "grad_norm": 0.6282011292024681, "learning_rate": 1.6273614380633484e-05, "loss": 1.0756, "step": 715 },
    { "epoch": 0.7160616608652411, "grad_norm": 0.6382859967632786, "learning_rate": 1.620575995887538e-05, "loss": 1.0784, "step": 720 },
    { "epoch": 0.7210343112879165, "grad_norm": 0.5746413645512066, "learning_rate": 1.6137437635157214e-05, "loss": 1.0812, "step": 725 },
    { "epoch": 0.7260069617105918, "grad_norm": 0.5979846067324073, "learning_rate": 1.6068652560846328e-05, "loss": 1.0731, "step": 730 },
    { "epoch": 0.730979612133267, "grad_norm": 0.6150123084073673, "learning_rate": 1.5999409922200534e-05, "loss": 1.0836, "step": 735 },
    { "epoch": 0.7359522625559424, "grad_norm": 0.6300145625125909, "learning_rate": 1.592971493997709e-05, "loss": 1.0635, "step": 740 },
    { "epoch": 0.7409249129786176, "grad_norm": 0.6454552072886953, "learning_rate": 1.5859572869039063e-05, "loss": 1.0713, "step": 745 },
    { "epoch": 0.7458975634012929, "grad_norm": 0.5969281554213641, "learning_rate": 1.5788988997959115e-05, "loss": 1.0692, "step": 750 },
    { "epoch": 0.7508702138239681, "grad_norm": 0.6207791802320729, "learning_rate": 1.571796864862076e-05, "loss": 1.0789, "step": 755 },
    { "epoch": 0.7558428642466435, "grad_norm": 0.6191886764048089, "learning_rate": 1.5646517175817114e-05, "loss": 1.0714, "step": 760 },
    { "epoch": 0.7608155146693187, "grad_norm": 0.6281154744718429, "learning_rate": 1.5574639966847128e-05, "loss": 1.0661, "step": 765 },
    { "epoch": 0.765788165091994, "grad_norm": 0.6096651334127257, "learning_rate": 1.5502342441109423e-05, "loss": 1.0814, "step": 770 },
    { "epoch": 0.7707608155146694, "grad_norm": 0.6122390840080459, "learning_rate": 1.5429630049693676e-05, "loss": 1.0769, "step": 775 },
    { "epoch": 0.7757334659373446, "grad_norm": 0.6362786028351584, "learning_rate": 1.5356508274969595e-05, "loss": 1.0689, "step": 780 },
    { "epoch": 0.7807061163600199, "grad_norm": 0.5843854420963039, "learning_rate": 1.5282982630173587e-05, "loss": 1.0755, "step": 785 },
    { "epoch": 0.7856787667826952, "grad_norm": 0.6093020908006089, "learning_rate": 1.5209058658993056e-05, "loss": 1.0704, "step": 790 },
    { "epoch": 0.7906514172053705, "grad_norm": 0.5732577171485725, "learning_rate": 1.513474193514842e-05, "loss": 1.0824, "step": 795 },
    { "epoch": 0.7956240676280457, "grad_norm": 0.604193134815774, "learning_rate": 1.5060038061972875e-05, "loss": 1.0825, "step": 800 },
    { "epoch": 0.8005967180507211, "grad_norm": 0.5972633379846944, "learning_rate": 1.49849526719899e-05, "loss": 1.0786, "step": 805 },
    { "epoch": 0.8055693684733963, "grad_norm": 0.648881707010711, "learning_rate": 1.4909491426488579e-05, "loss": 1.071, "step": 810 },
    { "epoch": 0.8105420188960716, "grad_norm": 0.6747876304157826, "learning_rate": 1.4833660015096767e-05, "loss": 1.0881, "step": 815 },
    { "epoch": 0.8155146693187469, "grad_norm": 0.6061995987355869, "learning_rate": 1.4757464155352082e-05, "loss": 1.0836, "step": 820 },
    { "epoch": 0.8204873197414222, "grad_norm": 0.5651261625287576, "learning_rate": 1.468090959227082e-05, "loss": 1.0578, "step": 825 },
    { "epoch": 0.8254599701640974, "grad_norm": 0.6708302058482162, "learning_rate": 1.4604002097914806e-05, "loss": 1.0874, "step": 830 },
    { "epoch": 0.8304326205867727, "grad_norm": 0.5998607882192355, "learning_rate": 1.4526747470956175e-05, "loss": 1.078, "step": 835 },
    { "epoch": 0.8354052710094481, "grad_norm": 0.6239914515973155, "learning_rate": 1.4449151536240167e-05, "loss": 1.0691, "step": 840 },
    { "epoch": 0.8403779214321233, "grad_norm": 0.5757669163668722, "learning_rate": 1.4371220144345954e-05, "loss": 1.0644, "step": 845 },
    { "epoch": 0.8453505718547986, "grad_norm": 0.6156324529775569, "learning_rate": 1.4292959171145509e-05, "loss": 1.0918, "step": 850 },
    { "epoch": 0.8503232222774739, "grad_norm": 0.589513385399977, "learning_rate": 1.4214374517360576e-05, "loss": 1.0768, "step": 855 },
    { "epoch": 0.8552958727001492, "grad_norm": 0.5729612698754991, "learning_rate": 1.4135472108117786e-05, "loss": 1.0555, "step": 860 },
    { "epoch": 0.8602685231228244, "grad_norm": 0.5712525334541005, "learning_rate": 1.4056257892501886e-05, "loss": 1.0679, "step": 865 },
    { "epoch": 0.8652411735454998, "grad_norm": 0.6414106262572151, "learning_rate": 1.3976737843107203e-05, "loss": 1.0725, "step": 870 },
    { "epoch": 0.870213823968175, "grad_norm": 0.6169285531207347, "learning_rate": 1.3896917955587328e-05, "loss": 1.0695, "step": 875 },
    { "epoch": 0.8751864743908503, "grad_norm": 0.581169031342323, "learning_rate": 1.3816804248203053e-05, "loss": 1.0732, "step": 880 },
    { "epoch": 0.8801591248135257, "grad_norm": 0.5976235787701776, "learning_rate": 1.3736402761368597e-05, "loss": 1.057, "step": 885 },
    { "epoch": 0.8851317752362009, "grad_norm": 0.6033341957672993, "learning_rate": 1.3655719557196185e-05, "loss": 1.0778, "step": 890 },
    { "epoch": 0.8901044256588762, "grad_norm": 0.6026576916975591, "learning_rate": 1.3574760719038959e-05, "loss": 1.0659, "step": 895 },
    { "epoch": 0.8950770760815515, "grad_norm": 0.5976061942900717, "learning_rate": 1.3493532351032318e-05, "loss": 1.0444, "step": 900 },
    { "epoch": 0.9000497265042268, "grad_norm": 0.5679367137196055, "learning_rate": 1.3412040577633687e-05, "loss": 1.0505, "step": 905 },
    { "epoch": 0.905022376926902, "grad_norm": 0.5851846275345329, "learning_rate": 1.333029154316072e-05, "loss": 1.0561, "step": 910 },
    { "epoch": 0.9099950273495774, "grad_norm": 0.6070199211173483, "learning_rate": 1.3248291411328048e-05, "loss": 1.0718, "step": 915 },
    { "epoch": 0.9149676777722526, "grad_norm": 0.6061266848548619, "learning_rate": 1.3166046364782545e-05, "loss": 1.0608, "step": 920 },
    { "epoch": 0.9199403281949279, "grad_norm": 0.6073941055156064, "learning_rate": 1.308356260463717e-05, "loss": 1.0776, "step": 925 },
    { "epoch": 0.9249129786176031, "grad_norm": 0.670942497774792, "learning_rate": 1.300084635000341e-05, "loss": 1.0867, "step": 930 },
    { "epoch": 0.9298856290402785, "grad_norm": 0.6063652114093658, "learning_rate": 1.291790383752237e-05, "loss": 1.0726, "step": 935 },
    { "epoch": 0.9348582794629537, "grad_norm": 0.6504237055754567, "learning_rate": 1.2834741320894554e-05, "loss": 1.0747, "step": 940 },
    { "epoch": 0.939830929885629, "grad_norm": 0.6770849233282473, "learning_rate": 1.2751365070408335e-05, "loss": 1.0747, "step": 945 },
    { "epoch": 0.9448035803083044, "grad_norm": 0.6051457904824041, "learning_rate": 1.2667781372467203e-05, "loss": 1.0618, "step": 950 },
    { "epoch": 0.9497762307309796, "grad_norm": 0.6082598939008289, "learning_rate": 1.2583996529115762e-05, "loss": 1.0675, "step": 955 },
    { "epoch": 0.9547488811536549, "grad_norm": 0.6063786635274505, "learning_rate": 1.2500016857564585e-05, "loss": 1.089, "step": 960 },
    { "epoch": 0.9597215315763302, "grad_norm": 0.6004777766594889, "learning_rate": 1.2415848689713904e-05, "loss": 1.0761, "step": 965 },
    { "epoch": 0.9646941819990055, "grad_norm": 0.5838756120365949, "learning_rate": 1.2331498371676206e-05, "loss": 1.0641, "step": 970 },
    { "epoch": 0.9696668324216807, "grad_norm": 0.5881811224350209, "learning_rate": 1.2246972263297718e-05, "loss": 1.0556, "step": 975 },
    { "epoch": 0.9746394828443561, "grad_norm": 0.5659402723765035, "learning_rate": 1.2162276737678934e-05, "loss": 1.0535, "step": 980 },
    { "epoch": 0.9796121332670313, "grad_norm": 0.5667335140444246, "learning_rate": 1.2077418180694049e-05, "loss": 1.0575, "step": 985 },
    { "epoch": 0.9845847836897066, "grad_norm": 0.5707663035442004, "learning_rate": 1.1992402990509515e-05, "loss": 1.0486, "step": 990 },
    { "epoch": 0.989557434112382, "grad_norm": 0.5843296109709583, "learning_rate": 1.1907237577101612e-05, "loss": 1.0706, "step": 995 },
    { "epoch": 0.9945300845350572, "grad_norm": 0.5784451382415723, "learning_rate": 1.1821928361773148e-05, "loss": 1.0583, "step": 1000 },
    { "epoch": 0.9995027349577325, "grad_norm": 0.5601123703118762, "learning_rate": 1.1736481776669307e-05, "loss": 1.0638, "step": 1005 },
    { "epoch": 0.9995027349577325, "eval_loss": 1.0704214572906494, "eval_runtime": 313.2095, "eval_samples_per_second": 45.455, "eval_steps_per_second": 0.712, "step": 1005 },
    { "epoch": 1.0044753853804078, "grad_norm": 0.7234842589362833, "learning_rate": 1.1650904264292689e-05, "loss": 0.9297, "step": 1010 },
    { "epoch": 1.009448035803083, "grad_norm": 0.7141499870480417, "learning_rate": 1.1565202277017551e-05, "loss": 0.9093, "step": 1015 },
    { "epoch": 1.0144206862257583, "grad_norm": 0.7209311229992112, "learning_rate": 1.14793822766033e-05, "loss": 0.8998, "step": 1020 },
    { "epoch": 1.0193933366484336, "grad_norm": 0.7103101581589907, "learning_rate": 1.139345073370731e-05, "loss": 0.9174, "step": 1025 },
    { "epoch": 1.0243659870711088, "grad_norm": 0.6866739397113207, "learning_rate": 1.1307414127397028e-05, "loss": 0.8991, "step": 1030 },
    { "epoch": 1.0293386374937843, "grad_norm": 0.629326336496965, "learning_rate": 1.1221278944661474e-05, "loss": 0.9109, "step": 1035 },
    { "epoch": 1.0343112879164595, "grad_norm": 0.6926758402006077, "learning_rate": 1.1135051679922143e-05, "loss": 0.9111, "step": 1040 },
    { "epoch": 1.0392839383391348, "grad_norm": 0.610723675260377, "learning_rate": 1.104873883454332e-05, "loss": 0.908, "step": 1045 },
    { "epoch": 1.04425658876181, "grad_norm": 0.636677273500329, "learning_rate": 1.0962346916341904e-05, "loss": 0.8833, "step": 1050 },
    { "epoch": 1.0492292391844853, "grad_norm": 0.6005893454455321, "learning_rate": 1.087588243909673e-05, "loss": 0.9091, "step": 1055 },
    { "epoch": 1.0542018896071605, "grad_norm": 0.6048311027567271, "learning_rate": 1.0789351922057437e-05, "loss": 0.9031, "step": 1060 },
    { "epoch": 1.0591745400298358, "grad_norm": 0.6157155501614046, "learning_rate": 1.070276188945293e-05, "loss": 0.8928, "step": 1065 },
    { "epoch": 1.0641471904525113, "grad_norm": 0.6465505729365048, "learning_rate": 1.0616118869999484e-05, "loss": 0.8942, "step": 1070 },
    { "epoch": 1.0691198408751865, "grad_norm": 0.6540283250849646, "learning_rate": 1.0529429396408452e-05, "loss": 0.9028, "step": 1075 },
    { "epoch": 1.0740924912978618, "grad_norm": 0.6319963673992531, "learning_rate": 1.0442700004893764e-05, "loss": 0.8908, "step": 1080 },
    { "epoch": 1.079065141720537, "grad_norm": 0.6255624407373807, "learning_rate": 1.0355937234679065e-05, "loss": 0.9039, "step": 1085 },
    { "epoch": 1.0840377921432123, "grad_norm": 0.5872134392284907, "learning_rate": 1.0269147627504692e-05, "loss": 0.9176, "step": 1090 },
    { "epoch": 1.0890104425658875, "grad_norm": 0.6687386627981542, "learning_rate": 1.0182337727134431e-05, "loss": 0.9118, "step": 1095 },
    { "epoch": 1.093983092988563, "grad_norm": 0.6382535990052277, "learning_rate": 1.0095514078862147e-05, "loss": 0.9082, "step": 1100 },
    { "epoch": 1.0989557434112383, "grad_norm": 0.6119367907834425, "learning_rate": 1.0008683229018257e-05, "loss": 0.9057, "step": 1105 },
    { "epoch": 1.1039283938339135, "grad_norm": 0.6013607999486498, "learning_rate": 9.92185172447616e-06, "loss": 0.9247, "step": 1110 },
    { "epoch": 1.1089010442565888, "grad_norm": 0.6313265309829375, "learning_rate": 9.835026112158637e-06, "loss": 0.9065, "step": 1115 },
    { "epoch": 1.113873694679264, "grad_norm": 0.6369037011412366, "learning_rate": 9.748212938544188e-06, "loss": 0.9217, "step": 1120 },
    { "epoch": 1.1188463451019393, "grad_norm": 0.6131706455581138, "learning_rate": 9.661418749173467e-06, "loss": 0.9161, "step": 1125 },
    { "epoch": 1.1238189955246147, "grad_norm": 0.6129393934682658, "learning_rate": 9.574650088155752e-06, "loss": 0.8958, "step": 1130 },
    { "epoch": 1.12879164594729, "grad_norm": 0.6436312212169086, "learning_rate": 9.487913497675536e-06, "loss": 0.9052, "step": 1135 },
    { "epoch": 1.1337642963699652, "grad_norm": 0.6721603259383877, "learning_rate": 9.401215517499252e-06, "loss": 0.9078, "step": 1140 },
    { "epoch": 1.1387369467926405, "grad_norm": 0.6531721143996831, "learning_rate": 9.314562684482202e-06, "loss": 0.8982, "step": 1145 },
    { "epoch": 1.1437095972153157, "grad_norm": 0.624269506566372, "learning_rate": 9.22796153207567e-06, "loss": 0.9006, "step": 1150 },
    { "epoch": 1.148682247637991, "grad_norm": 0.6430651884495883, "learning_rate": 9.14141858983434e-06, "loss": 0.9016, "step": 1155 },
    { "epoch": 1.1536548980606662, "grad_norm": 0.6676665306535187, "learning_rate": 9.054940382923954e-06, "loss": 0.8893, "step": 1160 },
    { "epoch": 1.1586275484833417, "grad_norm": 0.6252335256876467, "learning_rate": 8.96853343162934e-06, "loss": 0.8893, "step": 1165 },
    { "epoch": 1.163600198906017, "grad_norm": 0.5838746279233594, "learning_rate": 8.882204250862796e-06, "loss": 0.8992, "step": 1170 },
    { "epoch": 1.1685728493286922, "grad_norm": 0.6369545427202165, "learning_rate": 8.795959349672878e-06, "loss": 0.8902, "step": 1175 },
    { "epoch": 1.1735454997513675, "grad_norm": 0.6174021193552773, "learning_rate": 8.709805230753628e-06, "loss": 0.9053, "step": 1180 },
    { "epoch": 1.1785181501740427, "grad_norm": 0.6123198457341488, "learning_rate": 8.623748389954284e-06, "loss": 0.903, "step": 1185 },
    { "epoch": 1.183490800596718, "grad_norm": 0.5956552825535298, "learning_rate": 8.53779531578951e-06, "loss": 0.896, "step": 1190 },
    { "epoch": 1.1884634510193934, "grad_norm": 0.6113334565497545, "learning_rate": 8.451952488950167e-06, "loss": 0.8966, "step": 1195 },
    { "epoch": 1.1934361014420687, "grad_norm": 0.6957806045214961, "learning_rate": 8.366226381814698e-06, "loss": 0.9135, "step": 1200 },
    { "epoch": 1.198408751864744, "grad_norm": 0.6224006138182644, "learning_rate": 8.280623457961102e-06, "loss": 0.9092, "step": 1205 },
    { "epoch": 1.2033814022874192, "grad_norm": 0.6154668861281782, "learning_rate": 8.195150171679608e-06, "loss": 0.8961, "step": 1210 },
    { "epoch": 1.2083540527100944, "grad_norm": 0.601285614291192, "learning_rate": 8.109812967486024e-06, "loss": 0.8957, "step": 1215 },
    { "epoch": 1.2133267031327697, "grad_norm": 0.6200705833301541, "learning_rate": 8.02461827963585e-06, "loss": 0.9007, "step": 1220 },
    { "epoch": 1.218299353555445, "grad_norm": 0.621174344679376, "learning_rate": 7.939572531639128e-06, "loss": 0.9078, "step": 1225 },
    { "epoch": 1.2232720039781204, "grad_norm": 0.6237287223906944, "learning_rate": 7.85468213577613e-06, "loss": 0.9085, "step": 1230 },
    { "epoch": 1.2282446544007957, "grad_norm": 0.6313023907502137, "learning_rate": 7.7699534926139e-06, "loss": 0.9121, "step": 1235 },
    { "epoch": 1.233217304823471, "grad_norm": 0.6645038885414285, "learning_rate": 7.685392990523628e-06, "loss": 0.895, "step": 1240 },
    { "epoch": 1.2381899552461462, "grad_norm": 0.6182163241650108, "learning_rate": 7.601007005199022e-06, "loss": 0.8958, "step": 1245 },
    { "epoch": 1.2431626056688214, "grad_norm": 0.6152497076528807, "learning_rate": 7.5168018991755645e-06, "loss": 0.9123, "step": 1250 },
    { "epoch": 1.248135256091497, "grad_norm": 0.5828706125717531, "learning_rate": 7.432784021350796e-06, "loss": 0.9116, "step": 1255 },
    { "epoch": 1.2531079065141721, "grad_norm": 0.6380432502898638, "learning_rate": 7.3489597065056274e-06, "loss": 0.8931, "step": 1260 },
    { "epoch": 1.2580805569368474, "grad_norm": 0.6666672232856957, "learning_rate": 7.265335274826704e-06, "loss": 0.8985, "step": 1265 },
    { "epoch": 1.2630532073595226, "grad_norm": 0.6111003204096657, "learning_rate": 7.1819170314298746e-06, "loss": 0.9022, "step": 1270 },
    { "epoch": 1.268025857782198, "grad_norm": 0.5636007624972241, "learning_rate": 7.09871126588481e-06, "loss": 0.8926, "step": 1275 },
    { "epoch": 1.2729985082048731, "grad_norm": 0.6013817391842333, "learning_rate": 7.015724251740766e-06, "loss": 0.9104, "step": 1280 },
    { "epoch": 1.2779711586275484, "grad_norm": 0.6155188711752694, "learning_rate": 6.932962246053577e-06, "loss": 0.9095, "step": 1285 },
    { "epoch": 1.2829438090502236, "grad_norm": 0.6172811916494736, "learning_rate": 6.8504314889138956e-06, "loss": 0.8996, "step": 1290 },
    { "epoch": 1.2879164594728991, "grad_norm": 0.605702181756811, "learning_rate": 6.768138202976691e-06, "loss": 0.8974, "step": 1295 },
    { "epoch": 1.2928891098955744, "grad_norm": 0.6159076278297484, "learning_rate": 6.686088592992067e-06, "loss": 0.8946, "step": 1300 },
    { "epoch": 1.2978617603182496, "grad_norm": 0.6031330118847871, "learning_rate": 6.604288845337453e-06, "loss": 0.8899, "step": 1305 },
    { "epoch": 1.3028344107409249, "grad_norm": 0.5905887773391477, "learning_rate": 6.522745127551158e-06, "loss": 0.8783, "step": 1310 },
    { "epoch": 1.3078070611636001, "grad_norm": 0.6196998052199774, "learning_rate": 6.441463587867341e-06, "loss": 0.8913, "step": 1315 },
    { "epoch": 1.3127797115862756, "grad_norm": 0.6365018272765014, "learning_rate": 6.360450354752459e-06, "loss": 0.8971, "step": 1320 },
    { "epoch": 1.3177523620089509, "grad_norm": 0.6432175965282628, "learning_rate": 6.279711536443185e-06, "loss": 0.8997, "step": 1325 },
    { "epoch": 1.322725012431626, "grad_norm": 0.6190646458071539, "learning_rate": 6.199253220485857e-06, "loss": 0.8959, "step": 1330 },
    { "epoch": 1.3276976628543014, "grad_norm": 0.5944004518537376, "learning_rate": 6.119081473277502e-06, "loss": 0.8891, "step": 1335 },
    { "epoch": 1.3326703132769766, "grad_norm": 0.5966011460567672, "learning_rate": 6.039202339608432e-06, "loss": 0.8972, "step": 1340 },
    { "epoch": 1.3376429636996519, "grad_norm": 0.6043525331022274, "learning_rate": 5.959621842206474e-06, "loss": 0.8968, "step": 1345 },
    { "epoch": 1.342615614122327, "grad_norm": 0.6274316003330035, "learning_rate": 5.880345981282877e-06, "loss": 0.8975, "step": 1350 },
    { "epoch": 1.3475882645450024, "grad_norm": 0.6387485089736155, "learning_rate": 5.801380734079906e-06, "loss": 0.8882, "step": 1355 },
    { "epoch": 1.3525609149676778, "grad_norm": 0.6057683456315234, "learning_rate": 5.722732054420172e-06, "loss": 0.8968, "step": 1360 },
    { "epoch": 1.357533565390353, "grad_norm": 0.641291647959097, "learning_rate": 5.644405872257716e-06, "loss": 0.9089, "step": 1365 },
    { "epoch": 1.3625062158130283, "grad_norm": 0.6093344166563688, "learning_rate": 5.566408093230911e-06, "loss": 0.901, "step": 1370 },
    { "epoch": 1.3674788662357036, "grad_norm": 0.6137226994575677, "learning_rate": 5.48874459821719e-06, "loss": 0.8955, "step": 1375 },
    { "epoch": 1.3724515166583788, "grad_norm": 0.5988605927384872, "learning_rate": 5.411421242889643e-06, "loss": 0.8972, "step": 1380 },
    { "epoch": 1.3774241670810543, "grad_norm": 0.5972211719762297, "learning_rate": 5.334443857275488e-06, "loss": 0.8943, "step": 1385 },
    { "epoch": 1.3823968175037296, "grad_norm": 0.5959667581735492, "learning_rate": 5.257818245316522e-06, "loss": 0.8838, "step": 1390 },
    { "epoch": 1.3873694679264048, "grad_norm": 0.595064918048714, "learning_rate": 5.181550184431511e-06, "loss": 0.8969, "step": 1395 },
    { "epoch": 1.39234211834908, "grad_norm": 0.6244444134401027, "learning_rate": 5.105645425080572e-06, "loss": 0.8999, "step": 1400 },
    { "epoch": 1.3973147687717553, "grad_norm": 0.6143231676432872, "learning_rate": 5.030109690331625e-06, "loss": 0.8848, "step": 1405 },
    { "epoch": 1.4022874191944306, "grad_norm": 0.5932069860124085, "learning_rate": 4.954948675428853e-06, "loss": 0.9015, "step": 1410 },
    { "epoch": 1.4072600696171058, "grad_norm": 0.6037350510324395, "learning_rate": 4.880168047363312e-06, "loss": 0.904, "step": 1415 },
    { "epoch": 1.4122327200397813, "grad_norm": 0.6127392479761228, "learning_rate": 4.805773444445654e-06, "loss": 0.888, "step": 1420 },
    { "epoch": 1.4172053704624565, "grad_norm": 0.6177060602745384, "learning_rate": 4.731770475880995e-06, "loss": 0.8983, "step": 1425 },
    { "epoch": 1.4221780208851318, "grad_norm": 0.5872982950015017, "learning_rate": 4.658164721345998e-06, "loss": 0.8924, "step": 1430 },
    { "epoch": 1.427150671307807, "grad_norm": 0.5820337262775029, "learning_rate": 4.584961730568188e-06, "loss": 0.8748, "step": 1435 },
    { "epoch": 1.4321233217304823, "grad_norm": 0.5998336756778414, "learning_rate": 4.512167022907494e-06, "loss": 0.8957, "step": 1440 },
    { "epoch": 1.4370959721531578, "grad_norm": 0.5961438955138286, "learning_rate": 4.439786086940116e-06, "loss": 0.8961, "step": 1445 },
    { "epoch": 1.442068622575833, "grad_norm": 0.6078965414451544, "learning_rate": 4.367824380044684e-06, "loss": 0.8911, "step": 1450 },
    { "epoch": 1.4470412729985083, "grad_norm": 0.6288618597339264, "learning_rate": 4.296287327990797e-06, "loss": 0.9019, "step": 1455 },
    { "epoch": 1.4520139234211835, "grad_norm": 0.6151434227389634, "learning_rate": 4.225180324529917e-06, "loss": 0.8993, "step": 1460 },
    { "epoch": 1.4569865738438588, "grad_norm": 0.6404133470470939, "learning_rate": 4.154508730988704e-06, "loss": 0.889, "step": 1465 },
    { "epoch": 1.461959224266534, "grad_norm": 0.6164441372345689, "learning_rate": 4.084277875864776e-06, "loss": 0.8986, "step": 1470 },
    { "epoch": 1.4669318746892093, "grad_norm": 0.6109571828322322, "learning_rate": 4.0144930544249436e-06, "loss": 0.8946, "step": 1475 },
    { "epoch": 1.4719045251118845, "grad_norm": 0.5732121037549397, "learning_rate": 3.945159528305971e-06, "loss": 0.8917, "step": 1480 },
    { "epoch": 1.47687717553456, "grad_norm": 0.5956428501272881, "learning_rate": 3.876282525117847e-06, "loss": 0.906, "step": 1485 },
    { "epoch": 1.4818498259572352, "grad_norm": 0.6072276314986297, "learning_rate": 3.8078672380496416e-06, "loss": 0.8924, "step": 1490 },
    { "epoch": 1.4868224763799105, "grad_norm": 0.5896349716762431, "learning_rate": 3.7399188254779527e-06, "loss": 0.9039, "step": 1495 },
    { "epoch": 1.4917951268025857, "grad_norm": 0.6075888226854094, "learning_rate": 3.6724424105779654e-06, "loss": 0.8993, "step": 1500 },
    { "epoch": 1.496767777225261, "grad_norm": 0.6174215911692067, "learning_rate": 3.6054430809371723e-06, "loss": 0.9013, "step": 1505 },
    { "epoch": 1.5017404276479365, "grad_norm": 0.6152858958633052, "learning_rate": 3.5389258881718003e-06, "loss": 0.8818, "step": 1510 },
    { "epoch": 1.5067130780706117, "grad_norm": 0.617067387408615, "learning_rate": 3.4728958475459052e-06, "loss": 0.8879, "step": 1515 },
    { "epoch": 1.511685728493287, "grad_norm": 0.6005431823762195, "learning_rate": 3.4073579375932377e-06, "loss": 0.8917, "step": 1520 },
    { "epoch": 1.5166583789159622, "grad_norm": 0.5744056476076853, "learning_rate": 3.342317099741886e-06, "loss": 0.883, "step": 1525 },
    { "epoch": 1.5216310293386375, "grad_norm": 0.6483072296156038, "learning_rate": 3.27777823794168e-06, "loss": 0.911, "step": 1530 },
    { "epoch": 1.5266036797613127, "grad_norm": 0.6051760146177871, "learning_rate": 3.2137462182944557e-06, "loss": 0.898, "step": 1535 },
    { "epoch": 1.531576330183988, "grad_norm": 0.5905220877100078, "learning_rate": 3.150225868687161e-06, "loss": 0.8885, "step": 1540 },
    { "epoch": 1.5365489806066632, "grad_norm": 0.5972075844479279, "learning_rate": 3.0872219784278357e-06, "loss": 0.8754, "step": 1545 },
    { "epoch": 1.5415216310293385, "grad_norm": 0.5896259569827768, "learning_rate": 3.0247392978845203e-06, "loss": 0.8976, "step": 1550 },
    { "epoch": 1.546494281452014, "grad_norm": 0.5850464264225753, "learning_rate": 2.9627825381270704e-06, "loss": 0.8762, "step": 1555 },
    { "epoch": 1.5514669318746892, "grad_norm": 0.5803411751479154, "learning_rate": 2.9013563705719673e-06, "loss": 0.8914, "step": 1560 },
    { "epoch": 1.5564395822973645, "grad_norm": 0.6107318414430488, "learning_rate": 2.840465426630091e-06, "loss": 0.8927, "step": 1565 },
    { "epoch": 1.56141223272004, "grad_norm": 0.5883688758640033, "learning_rate": 2.7801142973575245e-06, "loss": 0.899, "step": 1570 },
    { "epoch": 1.5663848831427152, "grad_norm": 0.5806776595867678, "learning_rate": 2.720307533109402e-06, "loss": 0.8714, "step": 1575 },
    { "epoch": 1.5713575335653904, "grad_norm": 0.6072257831136263, "learning_rate": 2.6610496431968125e-06, "loss": 0.8909, "step": 1580 },
    { "epoch": 1.5763301839880657, "grad_norm": 0.5984905911242053, "learning_rate": 2.6023450955468176e-06, "loss": 0.8905, "step": 1585 },
    { "epoch": 1.581302834410741, "grad_norm": 0.5982169185055861, "learning_rate": 2.5441983163655705e-06, "loss": 0.893, "step": 1590 },
    { "epoch": 1.5862754848334162, "grad_norm": 0.6845641216521319, "learning_rate": 2.4866136898045844e-06, "loss": 0.888, "step": 1595 },
    { "epoch": 1.5912481352560914, "grad_norm": 0.6521336761418035, "learning_rate": 2.4295955576301966e-06, "loss": 0.8975, "step": 1600 },
    { "epoch": 1.5962207856787667, "grad_norm": 0.6026814731835343, "learning_rate": 2.373148218896182e-06, "loss": 0.8955, "step": 1605 },
    { "epoch": 1.601193436101442, "grad_norm": 0.6031328469823622, "learning_rate": 2.3172759296196267e-06, "loss": 0.8984, "step": 1610 },
    { "epoch": 1.6061660865241174, "grad_norm": 0.6085878347429402, "learning_rate": 2.2619829024600394e-06, "loss": 0.897, "step": 1615 },
    { "epoch": 1.6111387369467927, "grad_norm": 0.5913537335897251, "learning_rate": 2.2072733064017104e-06, "loss": 0.9019, "step": 1620 },
    { "epoch": 1.616111387369468, "grad_norm": 0.6046011617732275, "learning_rate": 2.153151266439384e-06, "loss": 0.89, "step": 1625 },
    { "epoch": 1.6210840377921432, "grad_norm": 0.5977994670480866, "learning_rate": 2.0996208632672475e-06, "loss": 0.8857, "step": 1630 },
    { "epoch": 1.6260566882148186, "grad_norm": 0.5798385899827562, "learning_rate": 2.0466861329712473e-06, "loss": 0.8893, "step": 1635 },
    { "epoch": 1.6310293386374939, "grad_norm": 0.6034790461256396, "learning_rate": 1.994351066724781e-06, "loss": 0.8841, "step": 1640 },
    { "epoch": 1.6360019890601691, "grad_norm": 0.6144777283053662, "learning_rate": 1.9426196104877737e-06, "loss": 0.8754, "step": 1645 },
    { "epoch": 1.6409746394828444, "grad_norm": 0.589100085406895, "learning_rate": 1.8914956647091497e-06, "loss": 0.8859, "step": 1650 },
    { "epoch": 1.6459472899055196, "grad_norm": 0.5907474864486845, "learning_rate": 1.8409830840327546e-06, "loss": 0.8906, "step": 1655 },
    { "epoch": 1.650919940328195, "grad_norm": 0.5944183790215095, "learning_rate": 1.791085677006722e-06, "loss": 0.8987, "step": 1660 },
    { "epoch": 1.6558925907508701, "grad_norm": 0.5862999400507831, "learning_rate": 1.7418072057963143e-06, "loss": 0.8846, "step": 1665 },
    { "epoch": 1.6608652411735454, "grad_norm": 0.6065039887812087, "learning_rate": 1.6931513859002636e-06, "loss": 0.8882, "step": 1670 },
    { "epoch": 1.6658378915962206, "grad_norm": 0.5829700290824299, "learning_rate": 1.6451218858706374e-06, "loss": 0.8899, "step": 1675 },
    { "epoch": 1.6708105420188961, "grad_norm": 0.5747340494639567, "learning_rate": 1.5977223270362197e-06, "loss": 0.8779, "step": 1680 },
    { "epoch": 1.6757831924415714, "grad_norm": 0.6236514063391653, "learning_rate": 1.5509562832294944e-06, "loss": 0.8906, "step": 1685 },
    { "epoch": 1.6807558428642466, "grad_norm": 0.5890477510202525, "learning_rate": 1.5048272805171615e-06, "loss": 0.8735, "step": 1690 },
    { "epoch": 1.685728493286922, "grad_norm": 0.5887405386066722, "learning_rate": 1.459338796934293e-06, "loss": 0.8787, "step": 1695 },
    { "epoch": 1.6907011437095973, "grad_norm": 0.5560854923827532, "learning_rate": 1.4144942622220902e-06, "loss": 0.8818, "step": 1700 },
    { "epoch": 1.6956737941322726, "grad_norm": 0.6127568360778808, "learning_rate": 1.3702970575692975e-06, "loss": 0.8969, "step": 1705 },
    { "epoch": 1.7006464445549478, "grad_norm": 0.576741886798026, "learning_rate": 1.3267505153572502e-06, "loss": 0.8913, "step": 1710 },
    { "epoch": 1.705619094977623, "grad_norm": 0.5786099553553555, "learning_rate": 1.2838579189086352e-06, "loss": 0.8836, "step": 1715 },
    { "epoch": 1.7105917454002983, "grad_norm": 0.5977190789902269, "learning_rate": 1.2416225022399286e-06, "loss": 0.8837, "step": 1720 },
    { "epoch": 1.7155643958229736, "grad_norm": 0.60144046208383, "learning_rate": 1.2000474498175552e-06, "loss": 0.8904, "step": 1725 },
    { "epoch": 1.7205370462456488, "grad_norm": 0.5695644106157955, "learning_rate": 1.1591358963177924e-06, "loss": 0.8999, "step": 1730 },
    { "epoch": 1.725509696668324, "grad_norm": 0.6030300408695916, "learning_rate": 1.118890926390419e-06, "loss": 0.8849, "step": 1735 },
    { "epoch": 1.7304823470909994, "grad_norm": 0.5754077081164456, "learning_rate": 1.0793155744261352e-06, "loss": 0.8809, "step": 1740 },
    { "epoch": 1.7354549975136748, "grad_norm": 0.5782023378111908, "learning_rate": 1.0404128243277778e-06, "loss": 0.8875, "step": 1745 },
    { "epoch": 1.74042764793635, "grad_norm": 0.589876052766199, "learning_rate": 1.0021856092853433e-06, "loss": 0.8843, "step": 1750 },
    { "epoch": 1.7454002983590253, "grad_norm": 0.594939794983627, "learning_rate": 9.646368115548232e-07, "loss": 0.8834, "step": 1755 },
    { "epoch": 1.7503729487817008, "grad_norm": 0.5953106465263236, "learning_rate": 9.277692622409018e-07, "loss": 0.8667, "step": 1760 },
    { "epoch": 1.755345599204376, "grad_norm": 0.6161359913787724, "learning_rate": 8.915857410834793e-07, "loss": 0.8891, "step": 1765 },
    { "epoch": 1.7603182496270513, "grad_norm": 0.6439397975128724, "learning_rate": 8.560889762480951e-07, "loss": 0.8768, "step": 1770 },
    { "epoch": 1.7652909000497266, "grad_norm": 0.5657607722214749, "learning_rate": 8.212816441202309e-07, "loss": 0.8886, "step": 1775 },
    { "epoch": 1.7702635504724018, "grad_norm": 0.6073482243771166, "learning_rate": 7.871663691035103e-07, "loss": 0.8901, "step": 1780 },
    { "epoch": 1.775236200895077, "grad_norm": 0.566216143109814, "learning_rate": 7.537457234218271e-07, "loss": 0.8844, "step": 1785 },
    { "epoch": 1.7802088513177523, "grad_norm": 0.5896090553262934, "learning_rate": 7.210222269254041e-07, "loss": 0.8897, "step": 1790 },
    { "epoch": 1.7851815017404276, "grad_norm": 0.5874428543517964, "learning_rate": 6.889983469008055e-07, "loss": 0.887, "step": 1795 },
    { "epoch": 1.7901541521631028, "grad_norm": 0.5923783661686378, "learning_rate": 6.576764978849005e-07, "loss": 0.89, "step": 1800 },
    { "epoch": 1.795126802585778, "grad_norm": 0.5747156271260179, "learning_rate": 6.27059041482817e-07, "loss": 0.8803, "step": 1805 },
    { "epoch": 1.8000994530084535, "grad_norm": 0.5843164968993672, "learning_rate": 5.971482861898836e-07, "loss": 0.8814, "step": 1810 },
    { "epoch": 1.8050721034311288, "grad_norm": 0.5925488819832142, "learning_rate": 5.679464872175666e-07, "loss": 0.8798, "step": 1815 },
    { "epoch": 1.810044753853804, "grad_norm": 0.5814849122329898, "learning_rate": 5.394558463234378e-07, "loss": 0.8928, "step": 1820 },
    { "epoch": 1.8150174042764795, "grad_norm": 0.5653073997150958, "learning_rate": 5.116785116451661e-07, "loss": 0.8858, "step": 1825 },
{ |
|
"epoch": 1.8199900546991548, |
|
"grad_norm": 0.5977924852737442, |
|
"learning_rate": 4.846165775385459e-07, |
|
"loss": 0.8868, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.82496270512183, |
|
"grad_norm": 0.5993863609862281, |
|
"learning_rate": 4.5827208441959426e-07, |
|
"loss": 0.891, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 1.8299353555445053, |
|
"grad_norm": 0.601479874243501, |
|
"learning_rate": 4.326470186107035e-07, |
|
"loss": 0.8791, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.8349080059671805, |
|
"grad_norm": 0.592784564470986, |
|
"learning_rate": 4.077433121908747e-07, |
|
"loss": 0.8875, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 1.8398806563898558, |
|
"grad_norm": 0.5707840764321231, |
|
"learning_rate": 3.835628428500515e-07, |
|
"loss": 0.8646, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.844853306812531, |
|
"grad_norm": 0.5628275928965689, |
|
"learning_rate": 3.601074337475352e-07, |
|
"loss": 0.8769, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 1.8498259572352063, |
|
"grad_norm": 0.6070264634956976, |
|
"learning_rate": 3.3737885337452815e-07, |
|
"loss": 0.8996, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.8547986076578815, |
|
"grad_norm": 0.5987252942276654, |
|
"learning_rate": 3.153788154207926e-07, |
|
"loss": 0.9035, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 1.859771258080557, |
|
"grad_norm": 0.603313374471039, |
|
"learning_rate": 2.941089786454421e-07, |
|
"loss": 0.8928, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.8647439085032322, |
|
"grad_norm": 0.6133979767792208, |
|
"learning_rate": 2.735709467518699e-07, |
|
"loss": 0.8796, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 1.8697165589259075, |
|
"grad_norm": 0.5708712405374609, |
|
"learning_rate": 2.5376626826683956e-07, |
|
"loss": 0.8783, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.8746892093485827, |
|
"grad_norm": 0.5903635964464384, |
|
"learning_rate": 2.3469643642372587e-07, |
|
"loss": 0.8727, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 1.8796618597712582, |
|
"grad_norm": 0.5860886967031799, |
|
"learning_rate": 2.1636288904992585e-07, |
|
"loss": 0.8911, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.8846345101939335, |
|
"grad_norm": 0.5987205804929953, |
|
"learning_rate": 1.9876700845845475e-07, |
|
"loss": 0.8622, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 1.8896071606166087, |
|
"grad_norm": 0.5864702159345376, |
|
"learning_rate": 1.8191012134371576e-07, |
|
"loss": 0.8997, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.894579811039284, |
|
"grad_norm": 0.5941563218461623, |
|
"learning_rate": 1.6579349868147688e-07, |
|
"loss": 0.8801, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 1.8995524614619592, |
|
"grad_norm": 0.5890830924362549, |
|
"learning_rate": 1.504183556330374e-07, |
|
"loss": 0.8752, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.9045251118846345, |
|
"grad_norm": 0.5973613796458782, |
|
"learning_rate": 1.3578585145360812e-07, |
|
"loss": 0.888, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 1.9094977623073097, |
|
"grad_norm": 0.6015109014180703, |
|
"learning_rate": 1.2189708940490653e-07, |
|
"loss": 0.8915, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.914470412729985, |
|
"grad_norm": 0.5840101145509294, |
|
"learning_rate": 1.0875311667196908e-07, |
|
"loss": 0.8823, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 1.9194430631526602, |
|
"grad_norm": 0.9953027707453141, |
|
"learning_rate": 9.635492428420434e-08, |
|
"loss": 0.8706, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.9244157135753357, |
|
"grad_norm": 0.569992582848356, |
|
"learning_rate": 8.470344704066047e-08, |
|
"loss": 0.8935, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 1.929388363998011, |
|
"grad_norm": 0.5820719941969129, |
|
"learning_rate": 7.379956343955385e-08, |
|
"loss": 0.8726, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.9343610144206862, |
|
"grad_norm": 0.5883844683240437, |
|
"learning_rate": 6.364409561202323e-08, |
|
"loss": 0.8907, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 1.9393336648433617, |
|
"grad_norm": 0.5888023020558691, |
|
"learning_rate": 5.42378092601481e-08, |
|
"loss": 0.8733, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.944306315266037, |
|
"grad_norm": 0.6131742183657333, |
|
"learning_rate": 4.558141359921386e-08, |
|
"loss": 0.8835, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 1.9492789656887122, |
|
"grad_norm": 0.5724474865234898, |
|
"learning_rate": 3.7675561304238996e-08, |
|
"loss": 0.9032, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.9542516161113874, |
|
"grad_norm": 0.5716096201690882, |
|
"learning_rate": 3.0520848460765525e-08, |
|
"loss": 0.8891, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 1.9592242665340627, |
|
"grad_norm": 0.5954898642826182, |
|
"learning_rate": 2.4117814519911687e-08, |
|
"loss": 0.8951, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.964196916956738, |
|
"grad_norm": 0.5883075227671835, |
|
"learning_rate": 1.846694225770551e-08, |
|
"loss": 0.8799, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 1.9691695673794132, |
|
"grad_norm": 0.5596221737627415, |
|
"learning_rate": 1.3568657738678437e-08, |
|
"loss": 0.8917, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.9741422178020884, |
|
"grad_norm": 0.5985361336002101, |
|
"learning_rate": 9.423330283742093e-09, |
|
"loss": 0.8822, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 1.9791148682247637, |
|
"grad_norm": 0.5774439559985055, |
|
"learning_rate": 6.031272442341696e-09, |
|
"loss": 0.8984, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.984087518647439, |
|
"grad_norm": 0.5947058034013541, |
|
"learning_rate": 3.3927399688948868e-09, |
|
"loss": 0.88, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 1.9890601690701144, |
|
"grad_norm": 0.5662497030571633, |
|
"learning_rate": 1.5079318035016166e-09, |
|
"loss": 0.8558, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.9940328194927897, |
|
"grad_norm": 0.584469598380576, |
|
"learning_rate": 3.7699005695057687e-10, |
|
"loss": 0.8875, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 1.999005469915465, |
|
"grad_norm": 0.5888373608660001, |
|
"learning_rate": 0.0, |
|
"loss": 0.8693, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.999005469915465, |
|
"eval_loss": 1.057645320892334, |
|
"eval_runtime": 312.6039, |
|
"eval_samples_per_second": 45.543, |
|
"eval_steps_per_second": 0.713, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.999005469915465, |
|
"step": 2010, |
|
"total_flos": 841287456522240.0, |
|
"train_loss": 0.6627339932455946, |
|
"train_runtime": 16962.808, |
|
"train_samples_per_second": 15.173, |
|
"train_steps_per_second": 0.118 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2010, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 841287456522240.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|