{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.999005469915465, "eval_steps": 500, "global_step": 2010, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000994530084535057, "grad_norm": 23.116023523946204, "learning_rate": 9.950248756218906e-08, "loss": 1.4467, "step": 1 }, { "epoch": 0.004972650422675286, "grad_norm": 22.614904504884613, "learning_rate": 4.975124378109453e-07, "loss": 1.4087, "step": 5 }, { "epoch": 0.009945300845350571, "grad_norm": 16.569672699698376, "learning_rate": 9.950248756218907e-07, "loss": 1.3944, "step": 10 }, { "epoch": 0.014917951268025857, "grad_norm": 3.581639215568655, "learning_rate": 1.4925373134328358e-06, "loss": 1.3012, "step": 15 }, { "epoch": 0.019890601690701143, "grad_norm": 1.9954309309104792, "learning_rate": 1.9900497512437813e-06, "loss": 1.2372, "step": 20 }, { "epoch": 0.02486325211337643, "grad_norm": 1.363643820597461, "learning_rate": 2.4875621890547264e-06, "loss": 1.1994, "step": 25 }, { "epoch": 0.029835902536051714, "grad_norm": 0.9605594350100669, "learning_rate": 2.9850746268656716e-06, "loss": 1.1902, "step": 30 }, { "epoch": 0.034808552958727, "grad_norm": 0.9786652533260872, "learning_rate": 3.4825870646766175e-06, "loss": 1.1542, "step": 35 }, { "epoch": 0.039781203381402286, "grad_norm": 0.9984572750782378, "learning_rate": 3.980099502487563e-06, "loss": 1.1175, "step": 40 }, { "epoch": 0.04475385380407757, "grad_norm": 0.7596880194893064, "learning_rate": 4.477611940298508e-06, "loss": 1.1268, "step": 45 }, { "epoch": 0.04972650422675286, "grad_norm": 0.8716479829689806, "learning_rate": 4.975124378109453e-06, "loss": 1.1332, "step": 50 }, { "epoch": 0.05469915464942814, "grad_norm": 0.791852622062893, "learning_rate": 5.472636815920398e-06, "loss": 1.132, "step": 55 }, { "epoch": 0.05967180507210343, "grad_norm": 0.733246468747676, "learning_rate": 5.970149253731343e-06, "loss": 1.0988, "step": 60 }, { "epoch": 0.06464445549477872, "grad_norm": 0.7512018991531512, "learning_rate": 6.46766169154229e-06, "loss": 1.1027, "step": 65 }, { "epoch": 0.069617105917454, "grad_norm": 1.0504364963362558, "learning_rate": 6.965174129353235e-06, "loss": 1.1057, "step": 70 }, { "epoch": 0.07458975634012929, "grad_norm": 0.8367942560809148, "learning_rate": 7.46268656716418e-06, "loss": 1.1211, "step": 75 }, { "epoch": 0.07956240676280457, "grad_norm": 0.7659112396017037, "learning_rate": 7.960199004975125e-06, "loss": 1.1034, "step": 80 }, { "epoch": 0.08453505718547986, "grad_norm": 0.7564136569017796, "learning_rate": 8.45771144278607e-06, "loss": 1.1095, "step": 85 }, { "epoch": 0.08950770760815514, "grad_norm": 0.680667197827103, "learning_rate": 8.955223880597016e-06, "loss": 1.1228, "step": 90 }, { "epoch": 0.09448035803083044, "grad_norm": 0.7778561000416293, "learning_rate": 9.45273631840796e-06, "loss": 1.1305, "step": 95 }, { "epoch": 0.09945300845350571, "grad_norm": 0.7582042725036765, "learning_rate": 9.950248756218906e-06, "loss": 1.1032, "step": 100 }, { "epoch": 0.10442565887618101, "grad_norm": 0.6743713824965428, "learning_rate": 1.0447761194029851e-05, "loss": 1.094, "step": 105 }, { "epoch": 0.10939830929885629, "grad_norm": 0.8423183568175382, "learning_rate": 1.0945273631840796e-05, "loss": 1.1025, "step": 110 }, { "epoch": 0.11437095972153158, "grad_norm": 0.7394348567397752, "learning_rate": 1.1442786069651741e-05, "loss": 1.105, "step": 115 }, { "epoch": 0.11934361014420686, "grad_norm": 1.039483051036899, "learning_rate": 
1.1940298507462686e-05, "loss": 1.1086, "step": 120 }, { "epoch": 0.12431626056688215, "grad_norm": 0.8818107439680185, "learning_rate": 1.2437810945273631e-05, "loss": 1.1218, "step": 125 }, { "epoch": 0.12928891098955744, "grad_norm": 0.7903765993701828, "learning_rate": 1.293532338308458e-05, "loss": 1.0911, "step": 130 }, { "epoch": 0.13426156141223272, "grad_norm": 0.8928561309689899, "learning_rate": 1.3432835820895525e-05, "loss": 1.1035, "step": 135 }, { "epoch": 0.139234211834908, "grad_norm": 0.8136039158436408, "learning_rate": 1.393034825870647e-05, "loss": 1.1187, "step": 140 }, { "epoch": 0.14420686225758328, "grad_norm": 0.7064949127673769, "learning_rate": 1.4427860696517415e-05, "loss": 1.1357, "step": 145 }, { "epoch": 0.14917951268025859, "grad_norm": 0.920457572603285, "learning_rate": 1.492537313432836e-05, "loss": 1.1218, "step": 150 }, { "epoch": 0.15415216310293386, "grad_norm": 0.7265048207847127, "learning_rate": 1.5422885572139307e-05, "loss": 1.0863, "step": 155 }, { "epoch": 0.15912481352560914, "grad_norm": 0.9780404914044643, "learning_rate": 1.592039800995025e-05, "loss": 1.1203, "step": 160 }, { "epoch": 0.16409746394828442, "grad_norm": 1.0254663723419732, "learning_rate": 1.6417910447761197e-05, "loss": 1.1079, "step": 165 }, { "epoch": 0.16907011437095973, "grad_norm": 0.8097421068534958, "learning_rate": 1.691542288557214e-05, "loss": 1.1086, "step": 170 }, { "epoch": 0.174042764793635, "grad_norm": 0.7012708079715231, "learning_rate": 1.7412935323383088e-05, "loss": 1.0985, "step": 175 }, { "epoch": 0.1790154152163103, "grad_norm": 0.8920950219504408, "learning_rate": 1.791044776119403e-05, "loss": 1.106, "step": 180 }, { "epoch": 0.1839880656389856, "grad_norm": 0.7178834727892439, "learning_rate": 1.8407960199004978e-05, "loss": 1.1089, "step": 185 }, { "epoch": 0.18896071606166087, "grad_norm": 0.8106510565314389, "learning_rate": 1.890547263681592e-05, "loss": 1.1303, "step": 190 }, { "epoch": 0.19393336648433615, "grad_norm": 0.7760161666634037, "learning_rate": 1.9402985074626868e-05, "loss": 1.1, "step": 195 }, { "epoch": 0.19890601690701143, "grad_norm": 0.9543725947472645, "learning_rate": 1.990049751243781e-05, "loss": 1.1113, "step": 200 }, { "epoch": 0.20387866732968674, "grad_norm": 0.7825954675614248, "learning_rate": 1.9999758725817802e-05, "loss": 1.1142, "step": 205 }, { "epoch": 0.20885131775236201, "grad_norm": 0.9041843566452702, "learning_rate": 1.999877856940653e-05, "loss": 1.1107, "step": 210 }, { "epoch": 0.2138239681750373, "grad_norm": 0.8759171946616057, "learning_rate": 1.9997044524974797e-05, "loss": 1.1075, "step": 215 }, { "epoch": 0.21879661859771257, "grad_norm": 0.6883474381832496, "learning_rate": 1.9994556723266102e-05, "loss": 1.0916, "step": 220 }, { "epoch": 0.22376926902038788, "grad_norm": 0.935590319528177, "learning_rate": 1.999131535185575e-05, "loss": 1.1354, "step": 225 }, { "epoch": 0.22874191944306316, "grad_norm": 0.7241812404841466, "learning_rate": 1.9987320655136693e-05, "loss": 1.091, "step": 230 }, { "epoch": 0.23371456986573844, "grad_norm": 0.7126868228357645, "learning_rate": 1.998257293430112e-05, "loss": 1.1183, "step": 235 }, { "epoch": 0.23868722028841372, "grad_norm": 0.7747343156203321, "learning_rate": 1.997707254731775e-05, "loss": 1.1183, "step": 240 }, { "epoch": 0.24365987071108902, "grad_norm": 0.7040038125289689, "learning_rate": 1.9970819908904815e-05, "loss": 1.1129, "step": 245 }, { "epoch": 0.2486325211337643, "grad_norm": 0.7279011261296227, "learning_rate": 
1.996381549049882e-05, "loss": 1.1198, "step": 250 }, { "epoch": 0.2536051715564396, "grad_norm": 0.7449816844457622, "learning_rate": 1.9956059820218982e-05, "loss": 1.112, "step": 255 }, { "epoch": 0.2585778219791149, "grad_norm": 0.8826758475728962, "learning_rate": 1.994755348282742e-05, "loss": 1.1176, "step": 260 }, { "epoch": 0.26355047240179014, "grad_norm": 0.7356262066526851, "learning_rate": 1.9938297119685054e-05, "loss": 1.0975, "step": 265 }, { "epoch": 0.26852312282446544, "grad_norm": 0.7345831845947749, "learning_rate": 1.9928291428703265e-05, "loss": 1.1054, "step": 270 }, { "epoch": 0.27349577324714075, "grad_norm": 0.6792936934909664, "learning_rate": 1.9917537164291244e-05, "loss": 1.0971, "step": 275 }, { "epoch": 0.278468423669816, "grad_norm": 0.7211549142360152, "learning_rate": 1.990603513729915e-05, "loss": 1.1137, "step": 280 }, { "epoch": 0.2834410740924913, "grad_norm": 0.7338829367672012, "learning_rate": 1.9893786214956946e-05, "loss": 1.1031, "step": 285 }, { "epoch": 0.28841372451516656, "grad_norm": 0.8494824202658868, "learning_rate": 1.9880791320809012e-05, "loss": 1.0962, "step": 290 }, { "epoch": 0.29338637493784187, "grad_norm": 0.7216958796251164, "learning_rate": 1.9867051434644532e-05, "loss": 1.1262, "step": 295 }, { "epoch": 0.29835902536051717, "grad_norm": 0.7349825775644381, "learning_rate": 1.985256759242359e-05, "loss": 1.0938, "step": 300 }, { "epoch": 0.3033316757831924, "grad_norm": 0.6477209491442245, "learning_rate": 1.9837340886199097e-05, "loss": 1.0925, "step": 305 }, { "epoch": 0.30830432620586773, "grad_norm": 0.643732891793806, "learning_rate": 1.9821372464034416e-05, "loss": 1.116, "step": 310 }, { "epoch": 0.31327697662854304, "grad_norm": 0.7127469375661294, "learning_rate": 1.9804663529916825e-05, "loss": 1.118, "step": 315 }, { "epoch": 0.3182496270512183, "grad_norm": 0.6302238233373078, "learning_rate": 1.9787215343666732e-05, "loss": 1.0933, "step": 320 }, { "epoch": 0.3232222774738936, "grad_norm": 0.6512766142325113, "learning_rate": 1.9769029220842678e-05, "loss": 1.1022, "step": 325 }, { "epoch": 0.32819492789656884, "grad_norm": 0.6928608412763418, "learning_rate": 1.975010653264216e-05, "loss": 1.1057, "step": 330 }, { "epoch": 0.33316757831924415, "grad_norm": 0.6801795596819155, "learning_rate": 1.973044870579824e-05, "loss": 1.099, "step": 335 }, { "epoch": 0.33814022874191946, "grad_norm": 0.7155915889134151, "learning_rate": 1.971005722247197e-05, "loss": 1.1112, "step": 340 }, { "epoch": 0.3431128791645947, "grad_norm": 0.7270391229967981, "learning_rate": 1.9688933620140638e-05, "loss": 1.0994, "step": 345 }, { "epoch": 0.34808552958727, "grad_norm": 0.6781191980419545, "learning_rate": 1.966707949148186e-05, "loss": 1.0933, "step": 350 }, { "epoch": 0.3530581800099453, "grad_norm": 0.8199363645347214, "learning_rate": 1.9644496484253473e-05, "loss": 1.0993, "step": 355 }, { "epoch": 0.3580308304326206, "grad_norm": 0.7553462054946344, "learning_rate": 1.9621186301169316e-05, "loss": 1.1111, "step": 360 }, { "epoch": 0.3630034808552959, "grad_norm": 0.6733526636292482, "learning_rate": 1.9597150699770834e-05, "loss": 1.1038, "step": 365 }, { "epoch": 0.3679761312779712, "grad_norm": 0.63611657294842, "learning_rate": 1.957239149229458e-05, "loss": 1.0894, "step": 370 }, { "epoch": 0.37294878170064644, "grad_norm": 0.6705699643223696, "learning_rate": 1.954691054553556e-05, "loss": 1.0908, "step": 375 }, { "epoch": 0.37792143212332174, "grad_norm": 0.6042254858161937, "learning_rate": 1.9520709780706485e-05, 
"loss": 1.0968, "step": 380 }, { "epoch": 0.382894082545997, "grad_norm": 0.7276058593761272, "learning_rate": 1.9493791173292924e-05, "loss": 1.0863, "step": 385 }, { "epoch": 0.3878667329686723, "grad_norm": 0.7335666141482664, "learning_rate": 1.9466156752904344e-05, "loss": 1.0968, "step": 390 }, { "epoch": 0.3928393833913476, "grad_norm": 0.6999656279264759, "learning_rate": 1.9437808603121086e-05, "loss": 1.1077, "step": 395 }, { "epoch": 0.39781203381402286, "grad_norm": 0.7171558885154616, "learning_rate": 1.9408748861337274e-05, "loss": 1.0929, "step": 400 }, { "epoch": 0.40278468423669817, "grad_norm": 0.7881416088231044, "learning_rate": 1.9378979718599647e-05, "loss": 1.1068, "step": 405 }, { "epoch": 0.40775733465937347, "grad_norm": 0.7628447209491808, "learning_rate": 1.934850341944237e-05, "loss": 1.0979, "step": 410 }, { "epoch": 0.4127299850820487, "grad_norm": 0.6407210873259641, "learning_rate": 1.9317322261717794e-05, "loss": 1.1028, "step": 415 }, { "epoch": 0.41770263550472403, "grad_norm": 0.6676034803018082, "learning_rate": 1.9285438596423204e-05, "loss": 1.0943, "step": 420 }, { "epoch": 0.4226752859273993, "grad_norm": 0.7550987050238746, "learning_rate": 1.9252854827523557e-05, "loss": 1.1066, "step": 425 }, { "epoch": 0.4276479363500746, "grad_norm": 0.6729112307385414, "learning_rate": 1.9219573411770235e-05, "loss": 1.1008, "step": 430 }, { "epoch": 0.4326205867727499, "grad_norm": 0.6703726778340369, "learning_rate": 1.9185596858515797e-05, "loss": 1.107, "step": 435 }, { "epoch": 0.43759323719542514, "grad_norm": 0.6957611244541537, "learning_rate": 1.91509277295248e-05, "loss": 1.0803, "step": 440 }, { "epoch": 0.44256588761810045, "grad_norm": 0.6302665113292966, "learning_rate": 1.911556863878062e-05, "loss": 1.089, "step": 445 }, { "epoch": 0.44753853804077576, "grad_norm": 0.6619346129189808, "learning_rate": 1.9079522252288387e-05, "loss": 1.0998, "step": 450 }, { "epoch": 0.452511188463451, "grad_norm": 0.6303067629333522, "learning_rate": 1.9042791287873958e-05, "loss": 1.0982, "step": 455 }, { "epoch": 0.4574838388861263, "grad_norm": 0.6817551059292462, "learning_rate": 1.900537851497901e-05, "loss": 1.1123, "step": 460 }, { "epoch": 0.46245648930880157, "grad_norm": 0.6883985523342773, "learning_rate": 1.8967286754452214e-05, "loss": 1.0994, "step": 465 }, { "epoch": 0.4674291397314769, "grad_norm": 0.6303976451992164, "learning_rate": 1.892851887833657e-05, "loss": 1.0915, "step": 470 }, { "epoch": 0.4724017901541522, "grad_norm": 0.6822781316090195, "learning_rate": 1.8889077809652837e-05, "loss": 1.0798, "step": 475 }, { "epoch": 0.47737444057682743, "grad_norm": 0.6360054418246037, "learning_rate": 1.884896652217917e-05, "loss": 1.0939, "step": 480 }, { "epoch": 0.48234709099950274, "grad_norm": 0.6992087315432453, "learning_rate": 1.880818804022687e-05, "loss": 1.0987, "step": 485 }, { "epoch": 0.48731974142217804, "grad_norm": 0.6768633602943569, "learning_rate": 1.8766745438412382e-05, "loss": 1.1199, "step": 490 }, { "epoch": 0.4922923918448533, "grad_norm": 0.6243661194702032, "learning_rate": 1.872464184142548e-05, "loss": 1.0883, "step": 495 }, { "epoch": 0.4972650422675286, "grad_norm": 0.6589230792154801, "learning_rate": 1.868188042379364e-05, "loss": 1.1163, "step": 500 }, { "epoch": 0.5022376926902039, "grad_norm": 0.6642790387561549, "learning_rate": 1.8638464409642724e-05, "loss": 1.0954, "step": 505 }, { "epoch": 0.5072103431128792, "grad_norm": 0.6423332854509105, "learning_rate": 1.8594397072453854e-05, "loss": 1.076, 
"step": 510 }, { "epoch": 0.5121829935355544, "grad_norm": 0.7506393578542332, "learning_rate": 1.8549681734816624e-05, "loss": 1.0985, "step": 515 }, { "epoch": 0.5171556439582298, "grad_norm": 0.7180721263869316, "learning_rate": 1.850432176817857e-05, "loss": 1.098, "step": 520 }, { "epoch": 0.522128294380905, "grad_norm": 0.6300038431215529, "learning_rate": 1.8458320592590976e-05, "loss": 1.083, "step": 525 }, { "epoch": 0.5271009448035803, "grad_norm": 0.8081330388900698, "learning_rate": 1.8411681676450998e-05, "loss": 1.0852, "step": 530 }, { "epoch": 0.5320735952262556, "grad_norm": 0.8167362817265476, "learning_rate": 1.836440853624017e-05, "loss": 1.1036, "step": 535 }, { "epoch": 0.5370462456489309, "grad_norm": 0.6121720313365099, "learning_rate": 1.8316504736259257e-05, "loss": 1.0891, "step": 540 }, { "epoch": 0.5420188960716061, "grad_norm": 0.6187125374867918, "learning_rate": 1.826797388835951e-05, "loss": 1.1023, "step": 545 }, { "epoch": 0.5469915464942815, "grad_norm": 0.5866212425394042, "learning_rate": 1.8218819651670356e-05, "loss": 1.1075, "step": 550 }, { "epoch": 0.5519641969169568, "grad_norm": 0.600120796062157, "learning_rate": 1.8169045732323495e-05, "loss": 1.0763, "step": 555 }, { "epoch": 0.556936847339632, "grad_norm": 0.634846796553325, "learning_rate": 1.8118655883173458e-05, "loss": 1.0827, "step": 560 }, { "epoch": 0.5619094977623074, "grad_norm": 0.6398611916918977, "learning_rate": 1.8067653903514674e-05, "loss": 1.0787, "step": 565 }, { "epoch": 0.5668821481849826, "grad_norm": 0.6406203607058498, "learning_rate": 1.8016043638794975e-05, "loss": 1.0738, "step": 570 }, { "epoch": 0.5718547986076579, "grad_norm": 0.6400875643577613, "learning_rate": 1.7963828980325696e-05, "loss": 1.0818, "step": 575 }, { "epoch": 0.5768274490303331, "grad_norm": 0.649631239193607, "learning_rate": 1.7911013864988254e-05, "loss": 1.0801, "step": 580 }, { "epoch": 0.5818000994530085, "grad_norm": 0.6384185175759214, "learning_rate": 1.785760227493731e-05, "loss": 1.0962, "step": 585 }, { "epoch": 0.5867727498756837, "grad_norm": 0.6637419994479018, "learning_rate": 1.780359823730054e-05, "loss": 1.0986, "step": 590 }, { "epoch": 0.591745400298359, "grad_norm": 0.6940445103521063, "learning_rate": 1.774900582387499e-05, "loss": 1.0849, "step": 595 }, { "epoch": 0.5967180507210343, "grad_norm": 0.6352562893683872, "learning_rate": 1.769382915082007e-05, "loss": 1.0827, "step": 600 }, { "epoch": 0.6016907011437096, "grad_norm": 0.644197961344711, "learning_rate": 1.7638072378347205e-05, "loss": 1.0782, "step": 605 }, { "epoch": 0.6066633515663848, "grad_norm": 0.6217957456701444, "learning_rate": 1.7581739710406158e-05, "loss": 1.0979, "step": 610 }, { "epoch": 0.6116360019890602, "grad_norm": 0.6697037211804547, "learning_rate": 1.752483539436807e-05, "loss": 1.0902, "step": 615 }, { "epoch": 0.6166086524117355, "grad_norm": 0.584860135164739, "learning_rate": 1.7467363720705204e-05, "loss": 1.0779, "step": 620 }, { "epoch": 0.6215813028344107, "grad_norm": 0.6961081581048412, "learning_rate": 1.740932902266747e-05, "loss": 1.072, "step": 625 }, { "epoch": 0.6265539532570861, "grad_norm": 0.660941824477212, "learning_rate": 1.7350735675955696e-05, "loss": 1.0857, "step": 630 }, { "epoch": 0.6315266036797613, "grad_norm": 0.6390929736380937, "learning_rate": 1.72915880983917e-05, "loss": 1.0748, "step": 635 }, { "epoch": 0.6364992541024366, "grad_norm": 0.6136644106406868, "learning_rate": 1.7231890749585208e-05, "loss": 1.0704, "step": 640 }, { "epoch": 
0.6414719045251119, "grad_norm": 0.6500326781000102, "learning_rate": 1.717164813059761e-05, "loss": 1.0621, "step": 645 }, { "epoch": 0.6464445549477872, "grad_norm": 0.6274957853270717, "learning_rate": 1.711086478360257e-05, "loss": 1.0882, "step": 650 }, { "epoch": 0.6514172053704624, "grad_norm": 0.6776791597333586, "learning_rate": 1.704954529154359e-05, "loss": 1.069, "step": 655 }, { "epoch": 0.6563898557931377, "grad_norm": 0.6307039768815268, "learning_rate": 1.698769427778842e-05, "loss": 1.0845, "step": 660 }, { "epoch": 0.661362506215813, "grad_norm": 0.6017095125274678, "learning_rate": 1.69253164057805e-05, "loss": 1.0804, "step": 665 }, { "epoch": 0.6663351566384883, "grad_norm": 0.6094011002277849, "learning_rate": 1.686241637868734e-05, "loss": 1.0681, "step": 670 }, { "epoch": 0.6713078070611636, "grad_norm": 0.6098389254101954, "learning_rate": 1.6798998939045893e-05, "loss": 1.078, "step": 675 }, { "epoch": 0.6762804574838389, "grad_norm": 0.6371210646880789, "learning_rate": 1.6735068868405e-05, "loss": 1.0776, "step": 680 }, { "epoch": 0.6812531079065142, "grad_norm": 0.6519412959002887, "learning_rate": 1.667063098696485e-05, "loss": 1.093, "step": 685 }, { "epoch": 0.6862257583291894, "grad_norm": 0.6296676825303947, "learning_rate": 1.660569015321357e-05, "loss": 1.079, "step": 690 }, { "epoch": 0.6911984087518648, "grad_norm": 0.6515938103088492, "learning_rate": 1.654025126356088e-05, "loss": 1.0763, "step": 695 }, { "epoch": 0.69617105917454, "grad_norm": 0.63614383650923, "learning_rate": 1.647431925196892e-05, "loss": 1.0726, "step": 700 }, { "epoch": 0.7011437095972153, "grad_norm": 0.6733392483456604, "learning_rate": 1.6407899089580263e-05, "loss": 1.0808, "step": 705 }, { "epoch": 0.7061163600198906, "grad_norm": 0.6177292064792552, "learning_rate": 1.6340995784343058e-05, "loss": 1.0662, "step": 710 }, { "epoch": 0.7110890104425659, "grad_norm": 0.6282011292024681, "learning_rate": 1.6273614380633484e-05, "loss": 1.0756, "step": 715 }, { "epoch": 0.7160616608652411, "grad_norm": 0.6382859967632786, "learning_rate": 1.620575995887538e-05, "loss": 1.0784, "step": 720 }, { "epoch": 0.7210343112879165, "grad_norm": 0.5746413645512066, "learning_rate": 1.6137437635157214e-05, "loss": 1.0812, "step": 725 }, { "epoch": 0.7260069617105918, "grad_norm": 0.5979846067324073, "learning_rate": 1.6068652560846328e-05, "loss": 1.0731, "step": 730 }, { "epoch": 0.730979612133267, "grad_norm": 0.6150123084073673, "learning_rate": 1.5999409922200534e-05, "loss": 1.0836, "step": 735 }, { "epoch": 0.7359522625559424, "grad_norm": 0.6300145625125909, "learning_rate": 1.592971493997709e-05, "loss": 1.0635, "step": 740 }, { "epoch": 0.7409249129786176, "grad_norm": 0.6454552072886953, "learning_rate": 1.5859572869039063e-05, "loss": 1.0713, "step": 745 }, { "epoch": 0.7458975634012929, "grad_norm": 0.5969281554213641, "learning_rate": 1.5788988997959115e-05, "loss": 1.0692, "step": 750 }, { "epoch": 0.7508702138239681, "grad_norm": 0.6207791802320729, "learning_rate": 1.571796864862076e-05, "loss": 1.0789, "step": 755 }, { "epoch": 0.7558428642466435, "grad_norm": 0.6191886764048089, "learning_rate": 1.5646517175817114e-05, "loss": 1.0714, "step": 760 }, { "epoch": 0.7608155146693187, "grad_norm": 0.6281154744718429, "learning_rate": 1.5574639966847128e-05, "loss": 1.0661, "step": 765 }, { "epoch": 0.765788165091994, "grad_norm": 0.6096651334127257, "learning_rate": 1.5502342441109423e-05, "loss": 1.0814, "step": 770 }, { "epoch": 0.7707608155146694, "grad_norm": 
0.6122390840080459, "learning_rate": 1.5429630049693676e-05, "loss": 1.0769, "step": 775 }, { "epoch": 0.7757334659373446, "grad_norm": 0.6362786028351584, "learning_rate": 1.5356508274969595e-05, "loss": 1.0689, "step": 780 }, { "epoch": 0.7807061163600199, "grad_norm": 0.5843854420963039, "learning_rate": 1.5282982630173587e-05, "loss": 1.0755, "step": 785 }, { "epoch": 0.7856787667826952, "grad_norm": 0.6093020908006089, "learning_rate": 1.5209058658993056e-05, "loss": 1.0704, "step": 790 }, { "epoch": 0.7906514172053705, "grad_norm": 0.5732577171485725, "learning_rate": 1.513474193514842e-05, "loss": 1.0824, "step": 795 }, { "epoch": 0.7956240676280457, "grad_norm": 0.604193134815774, "learning_rate": 1.5060038061972875e-05, "loss": 1.0825, "step": 800 }, { "epoch": 0.8005967180507211, "grad_norm": 0.5972633379846944, "learning_rate": 1.49849526719899e-05, "loss": 1.0786, "step": 805 }, { "epoch": 0.8055693684733963, "grad_norm": 0.648881707010711, "learning_rate": 1.4909491426488579e-05, "loss": 1.071, "step": 810 }, { "epoch": 0.8105420188960716, "grad_norm": 0.6747876304157826, "learning_rate": 1.4833660015096767e-05, "loss": 1.0881, "step": 815 }, { "epoch": 0.8155146693187469, "grad_norm": 0.6061995987355869, "learning_rate": 1.4757464155352082e-05, "loss": 1.0836, "step": 820 }, { "epoch": 0.8204873197414222, "grad_norm": 0.5651261625287576, "learning_rate": 1.468090959227082e-05, "loss": 1.0578, "step": 825 }, { "epoch": 0.8254599701640974, "grad_norm": 0.6708302058482162, "learning_rate": 1.4604002097914806e-05, "loss": 1.0874, "step": 830 }, { "epoch": 0.8304326205867727, "grad_norm": 0.5998607882192355, "learning_rate": 1.4526747470956175e-05, "loss": 1.078, "step": 835 }, { "epoch": 0.8354052710094481, "grad_norm": 0.6239914515973155, "learning_rate": 1.4449151536240167e-05, "loss": 1.0691, "step": 840 }, { "epoch": 0.8403779214321233, "grad_norm": 0.5757669163668722, "learning_rate": 1.4371220144345954e-05, "loss": 1.0644, "step": 845 }, { "epoch": 0.8453505718547986, "grad_norm": 0.6156324529775569, "learning_rate": 1.4292959171145509e-05, "loss": 1.0918, "step": 850 }, { "epoch": 0.8503232222774739, "grad_norm": 0.589513385399977, "learning_rate": 1.4214374517360576e-05, "loss": 1.0768, "step": 855 }, { "epoch": 0.8552958727001492, "grad_norm": 0.5729612698754991, "learning_rate": 1.4135472108117786e-05, "loss": 1.0555, "step": 860 }, { "epoch": 0.8602685231228244, "grad_norm": 0.5712525334541005, "learning_rate": 1.4056257892501886e-05, "loss": 1.0679, "step": 865 }, { "epoch": 0.8652411735454998, "grad_norm": 0.6414106262572151, "learning_rate": 1.3976737843107203e-05, "loss": 1.0725, "step": 870 }, { "epoch": 0.870213823968175, "grad_norm": 0.6169285531207347, "learning_rate": 1.3896917955587328e-05, "loss": 1.0695, "step": 875 }, { "epoch": 0.8751864743908503, "grad_norm": 0.581169031342323, "learning_rate": 1.3816804248203053e-05, "loss": 1.0732, "step": 880 }, { "epoch": 0.8801591248135257, "grad_norm": 0.5976235787701776, "learning_rate": 1.3736402761368597e-05, "loss": 1.057, "step": 885 }, { "epoch": 0.8851317752362009, "grad_norm": 0.6033341957672993, "learning_rate": 1.3655719557196185e-05, "loss": 1.0778, "step": 890 }, { "epoch": 0.8901044256588762, "grad_norm": 0.6026576916975591, "learning_rate": 1.3574760719038959e-05, "loss": 1.0659, "step": 895 }, { "epoch": 0.8950770760815515, "grad_norm": 0.5976061942900717, "learning_rate": 1.3493532351032318e-05, "loss": 1.0444, "step": 900 }, { "epoch": 0.9000497265042268, "grad_norm": 0.5679367137196055, 
"learning_rate": 1.3412040577633687e-05, "loss": 1.0505, "step": 905 }, { "epoch": 0.905022376926902, "grad_norm": 0.5851846275345329, "learning_rate": 1.333029154316072e-05, "loss": 1.0561, "step": 910 }, { "epoch": 0.9099950273495774, "grad_norm": 0.6070199211173483, "learning_rate": 1.3248291411328048e-05, "loss": 1.0718, "step": 915 }, { "epoch": 0.9149676777722526, "grad_norm": 0.6061266848548619, "learning_rate": 1.3166046364782545e-05, "loss": 1.0608, "step": 920 }, { "epoch": 0.9199403281949279, "grad_norm": 0.6073941055156064, "learning_rate": 1.308356260463717e-05, "loss": 1.0776, "step": 925 }, { "epoch": 0.9249129786176031, "grad_norm": 0.670942497774792, "learning_rate": 1.300084635000341e-05, "loss": 1.0867, "step": 930 }, { "epoch": 0.9298856290402785, "grad_norm": 0.6063652114093658, "learning_rate": 1.291790383752237e-05, "loss": 1.0726, "step": 935 }, { "epoch": 0.9348582794629537, "grad_norm": 0.6504237055754567, "learning_rate": 1.2834741320894554e-05, "loss": 1.0747, "step": 940 }, { "epoch": 0.939830929885629, "grad_norm": 0.6770849233282473, "learning_rate": 1.2751365070408335e-05, "loss": 1.0747, "step": 945 }, { "epoch": 0.9448035803083044, "grad_norm": 0.6051457904824041, "learning_rate": 1.2667781372467203e-05, "loss": 1.0618, "step": 950 }, { "epoch": 0.9497762307309796, "grad_norm": 0.6082598939008289, "learning_rate": 1.2583996529115762e-05, "loss": 1.0675, "step": 955 }, { "epoch": 0.9547488811536549, "grad_norm": 0.6063786635274505, "learning_rate": 1.2500016857564585e-05, "loss": 1.089, "step": 960 }, { "epoch": 0.9597215315763302, "grad_norm": 0.6004777766594889, "learning_rate": 1.2415848689713904e-05, "loss": 1.0761, "step": 965 }, { "epoch": 0.9646941819990055, "grad_norm": 0.5838756120365949, "learning_rate": 1.2331498371676206e-05, "loss": 1.0641, "step": 970 }, { "epoch": 0.9696668324216807, "grad_norm": 0.5881811224350209, "learning_rate": 1.2246972263297718e-05, "loss": 1.0556, "step": 975 }, { "epoch": 0.9746394828443561, "grad_norm": 0.5659402723765035, "learning_rate": 1.2162276737678934e-05, "loss": 1.0535, "step": 980 }, { "epoch": 0.9796121332670313, "grad_norm": 0.5667335140444246, "learning_rate": 1.2077418180694049e-05, "loss": 1.0575, "step": 985 }, { "epoch": 0.9845847836897066, "grad_norm": 0.5707663035442004, "learning_rate": 1.1992402990509515e-05, "loss": 1.0486, "step": 990 }, { "epoch": 0.989557434112382, "grad_norm": 0.5843296109709583, "learning_rate": 1.1907237577101612e-05, "loss": 1.0706, "step": 995 }, { "epoch": 0.9945300845350572, "grad_norm": 0.5784451382415723, "learning_rate": 1.1821928361773148e-05, "loss": 1.0583, "step": 1000 }, { "epoch": 0.9995027349577325, "grad_norm": 0.5601123703118762, "learning_rate": 1.1736481776669307e-05, "loss": 1.0638, "step": 1005 }, { "epoch": 0.9995027349577325, "eval_loss": 1.0704214572906494, "eval_runtime": 313.2095, "eval_samples_per_second": 45.455, "eval_steps_per_second": 0.712, "step": 1005 }, { "epoch": 1.0044753853804078, "grad_norm": 0.7234842589362833, "learning_rate": 1.1650904264292689e-05, "loss": 0.9297, "step": 1010 }, { "epoch": 1.009448035803083, "grad_norm": 0.7141499870480417, "learning_rate": 1.1565202277017551e-05, "loss": 0.9093, "step": 1015 }, { "epoch": 1.0144206862257583, "grad_norm": 0.7209311229992112, "learning_rate": 1.14793822766033e-05, "loss": 0.8998, "step": 1020 }, { "epoch": 1.0193933366484336, "grad_norm": 0.7103101581589907, "learning_rate": 1.139345073370731e-05, "loss": 0.9174, "step": 1025 }, { "epoch": 1.0243659870711088, "grad_norm": 
0.6866739397113207, "learning_rate": 1.1307414127397028e-05, "loss": 0.8991, "step": 1030 }, { "epoch": 1.0293386374937843, "grad_norm": 0.629326336496965, "learning_rate": 1.1221278944661474e-05, "loss": 0.9109, "step": 1035 }, { "epoch": 1.0343112879164595, "grad_norm": 0.6926758402006077, "learning_rate": 1.1135051679922143e-05, "loss": 0.9111, "step": 1040 }, { "epoch": 1.0392839383391348, "grad_norm": 0.610723675260377, "learning_rate": 1.104873883454332e-05, "loss": 0.908, "step": 1045 }, { "epoch": 1.04425658876181, "grad_norm": 0.636677273500329, "learning_rate": 1.0962346916341904e-05, "loss": 0.8833, "step": 1050 }, { "epoch": 1.0492292391844853, "grad_norm": 0.6005893454455321, "learning_rate": 1.087588243909673e-05, "loss": 0.9091, "step": 1055 }, { "epoch": 1.0542018896071605, "grad_norm": 0.6048311027567271, "learning_rate": 1.0789351922057437e-05, "loss": 0.9031, "step": 1060 }, { "epoch": 1.0591745400298358, "grad_norm": 0.6157155501614046, "learning_rate": 1.070276188945293e-05, "loss": 0.8928, "step": 1065 }, { "epoch": 1.0641471904525113, "grad_norm": 0.6465505729365048, "learning_rate": 1.0616118869999484e-05, "loss": 0.8942, "step": 1070 }, { "epoch": 1.0691198408751865, "grad_norm": 0.6540283250849646, "learning_rate": 1.0529429396408452e-05, "loss": 0.9028, "step": 1075 }, { "epoch": 1.0740924912978618, "grad_norm": 0.6319963673992531, "learning_rate": 1.0442700004893764e-05, "loss": 0.8908, "step": 1080 }, { "epoch": 1.079065141720537, "grad_norm": 0.6255624407373807, "learning_rate": 1.0355937234679065e-05, "loss": 0.9039, "step": 1085 }, { "epoch": 1.0840377921432123, "grad_norm": 0.5872134392284907, "learning_rate": 1.0269147627504692e-05, "loss": 0.9176, "step": 1090 }, { "epoch": 1.0890104425658875, "grad_norm": 0.6687386627981542, "learning_rate": 1.0182337727134431e-05, "loss": 0.9118, "step": 1095 }, { "epoch": 1.093983092988563, "grad_norm": 0.6382535990052277, "learning_rate": 1.0095514078862147e-05, "loss": 0.9082, "step": 1100 }, { "epoch": 1.0989557434112383, "grad_norm": 0.6119367907834425, "learning_rate": 1.0008683229018257e-05, "loss": 0.9057, "step": 1105 }, { "epoch": 1.1039283938339135, "grad_norm": 0.6013607999486498, "learning_rate": 9.92185172447616e-06, "loss": 0.9247, "step": 1110 }, { "epoch": 1.1089010442565888, "grad_norm": 0.6313265309829375, "learning_rate": 9.835026112158637e-06, "loss": 0.9065, "step": 1115 }, { "epoch": 1.113873694679264, "grad_norm": 0.6369037011412366, "learning_rate": 9.748212938544188e-06, "loss": 0.9217, "step": 1120 }, { "epoch": 1.1188463451019393, "grad_norm": 0.6131706455581138, "learning_rate": 9.661418749173467e-06, "loss": 0.9161, "step": 1125 }, { "epoch": 1.1238189955246147, "grad_norm": 0.6129393934682658, "learning_rate": 9.574650088155752e-06, "loss": 0.8958, "step": 1130 }, { "epoch": 1.12879164594729, "grad_norm": 0.6436312212169086, "learning_rate": 9.487913497675536e-06, "loss": 0.9052, "step": 1135 }, { "epoch": 1.1337642963699652, "grad_norm": 0.6721603259383877, "learning_rate": 9.401215517499252e-06, "loss": 0.9078, "step": 1140 }, { "epoch": 1.1387369467926405, "grad_norm": 0.6531721143996831, "learning_rate": 9.314562684482202e-06, "loss": 0.8982, "step": 1145 }, { "epoch": 1.1437095972153157, "grad_norm": 0.624269506566372, "learning_rate": 9.22796153207567e-06, "loss": 0.9006, "step": 1150 }, { "epoch": 1.148682247637991, "grad_norm": 0.6430651884495883, "learning_rate": 9.14141858983434e-06, "loss": 0.9016, "step": 1155 }, { "epoch": 1.1536548980606662, "grad_norm": 0.6676665306535187, 
"learning_rate": 9.054940382923954e-06, "loss": 0.8893, "step": 1160 }, { "epoch": 1.1586275484833417, "grad_norm": 0.6252335256876467, "learning_rate": 8.96853343162934e-06, "loss": 0.8893, "step": 1165 }, { "epoch": 1.163600198906017, "grad_norm": 0.5838746279233594, "learning_rate": 8.882204250862796e-06, "loss": 0.8992, "step": 1170 }, { "epoch": 1.1685728493286922, "grad_norm": 0.6369545427202165, "learning_rate": 8.795959349672878e-06, "loss": 0.8902, "step": 1175 }, { "epoch": 1.1735454997513675, "grad_norm": 0.6174021193552773, "learning_rate": 8.709805230753628e-06, "loss": 0.9053, "step": 1180 }, { "epoch": 1.1785181501740427, "grad_norm": 0.6123198457341488, "learning_rate": 8.623748389954284e-06, "loss": 0.903, "step": 1185 }, { "epoch": 1.183490800596718, "grad_norm": 0.5956552825535298, "learning_rate": 8.53779531578951e-06, "loss": 0.896, "step": 1190 }, { "epoch": 1.1884634510193934, "grad_norm": 0.6113334565497545, "learning_rate": 8.451952488950167e-06, "loss": 0.8966, "step": 1195 }, { "epoch": 1.1934361014420687, "grad_norm": 0.6957806045214961, "learning_rate": 8.366226381814698e-06, "loss": 0.9135, "step": 1200 }, { "epoch": 1.198408751864744, "grad_norm": 0.6224006138182644, "learning_rate": 8.280623457961102e-06, "loss": 0.9092, "step": 1205 }, { "epoch": 1.2033814022874192, "grad_norm": 0.6154668861281782, "learning_rate": 8.195150171679608e-06, "loss": 0.8961, "step": 1210 }, { "epoch": 1.2083540527100944, "grad_norm": 0.601285614291192, "learning_rate": 8.109812967486024e-06, "loss": 0.8957, "step": 1215 }, { "epoch": 1.2133267031327697, "grad_norm": 0.6200705833301541, "learning_rate": 8.02461827963585e-06, "loss": 0.9007, "step": 1220 }, { "epoch": 1.218299353555445, "grad_norm": 0.621174344679376, "learning_rate": 7.939572531639128e-06, "loss": 0.9078, "step": 1225 }, { "epoch": 1.2232720039781204, "grad_norm": 0.6237287223906944, "learning_rate": 7.85468213577613e-06, "loss": 0.9085, "step": 1230 }, { "epoch": 1.2282446544007957, "grad_norm": 0.6313023907502137, "learning_rate": 7.7699534926139e-06, "loss": 0.9121, "step": 1235 }, { "epoch": 1.233217304823471, "grad_norm": 0.6645038885414285, "learning_rate": 7.685392990523628e-06, "loss": 0.895, "step": 1240 }, { "epoch": 1.2381899552461462, "grad_norm": 0.6182163241650108, "learning_rate": 7.601007005199022e-06, "loss": 0.8958, "step": 1245 }, { "epoch": 1.2431626056688214, "grad_norm": 0.6152497076528807, "learning_rate": 7.5168018991755645e-06, "loss": 0.9123, "step": 1250 }, { "epoch": 1.248135256091497, "grad_norm": 0.5828706125717531, "learning_rate": 7.432784021350796e-06, "loss": 0.9116, "step": 1255 }, { "epoch": 1.2531079065141721, "grad_norm": 0.6380432502898638, "learning_rate": 7.3489597065056274e-06, "loss": 0.8931, "step": 1260 }, { "epoch": 1.2580805569368474, "grad_norm": 0.6666672232856957, "learning_rate": 7.265335274826704e-06, "loss": 0.8985, "step": 1265 }, { "epoch": 1.2630532073595226, "grad_norm": 0.6111003204096657, "learning_rate": 7.1819170314298746e-06, "loss": 0.9022, "step": 1270 }, { "epoch": 1.268025857782198, "grad_norm": 0.5636007624972241, "learning_rate": 7.09871126588481e-06, "loss": 0.8926, "step": 1275 }, { "epoch": 1.2729985082048731, "grad_norm": 0.6013817391842333, "learning_rate": 7.015724251740766e-06, "loss": 0.9104, "step": 1280 }, { "epoch": 1.2779711586275484, "grad_norm": 0.6155188711752694, "learning_rate": 6.932962246053577e-06, "loss": 0.9095, "step": 1285 }, { "epoch": 1.2829438090502236, "grad_norm": 0.6172811916494736, "learning_rate": 
6.8504314889138956e-06, "loss": 0.8996, "step": 1290 }, { "epoch": 1.2879164594728991, "grad_norm": 0.605702181756811, "learning_rate": 6.768138202976691e-06, "loss": 0.8974, "step": 1295 }, { "epoch": 1.2928891098955744, "grad_norm": 0.6159076278297484, "learning_rate": 6.686088592992067e-06, "loss": 0.8946, "step": 1300 }, { "epoch": 1.2978617603182496, "grad_norm": 0.6031330118847871, "learning_rate": 6.604288845337453e-06, "loss": 0.8899, "step": 1305 }, { "epoch": 1.3028344107409249, "grad_norm": 0.5905887773391477, "learning_rate": 6.522745127551158e-06, "loss": 0.8783, "step": 1310 }, { "epoch": 1.3078070611636001, "grad_norm": 0.6196998052199774, "learning_rate": 6.441463587867341e-06, "loss": 0.8913, "step": 1315 }, { "epoch": 1.3127797115862756, "grad_norm": 0.6365018272765014, "learning_rate": 6.360450354752459e-06, "loss": 0.8971, "step": 1320 }, { "epoch": 1.3177523620089509, "grad_norm": 0.6432175965282628, "learning_rate": 6.279711536443185e-06, "loss": 0.8997, "step": 1325 }, { "epoch": 1.322725012431626, "grad_norm": 0.6190646458071539, "learning_rate": 6.199253220485857e-06, "loss": 0.8959, "step": 1330 }, { "epoch": 1.3276976628543014, "grad_norm": 0.5944004518537376, "learning_rate": 6.119081473277502e-06, "loss": 0.8891, "step": 1335 }, { "epoch": 1.3326703132769766, "grad_norm": 0.5966011460567672, "learning_rate": 6.039202339608432e-06, "loss": 0.8972, "step": 1340 }, { "epoch": 1.3376429636996519, "grad_norm": 0.6043525331022274, "learning_rate": 5.959621842206474e-06, "loss": 0.8968, "step": 1345 }, { "epoch": 1.342615614122327, "grad_norm": 0.6274316003330035, "learning_rate": 5.880345981282877e-06, "loss": 0.8975, "step": 1350 }, { "epoch": 1.3475882645450024, "grad_norm": 0.6387485089736155, "learning_rate": 5.801380734079906e-06, "loss": 0.8882, "step": 1355 }, { "epoch": 1.3525609149676778, "grad_norm": 0.6057683456315234, "learning_rate": 5.722732054420172e-06, "loss": 0.8968, "step": 1360 }, { "epoch": 1.357533565390353, "grad_norm": 0.641291647959097, "learning_rate": 5.644405872257716e-06, "loss": 0.9089, "step": 1365 }, { "epoch": 1.3625062158130283, "grad_norm": 0.6093344166563688, "learning_rate": 5.566408093230911e-06, "loss": 0.901, "step": 1370 }, { "epoch": 1.3674788662357036, "grad_norm": 0.6137226994575677, "learning_rate": 5.48874459821719e-06, "loss": 0.8955, "step": 1375 }, { "epoch": 1.3724515166583788, "grad_norm": 0.5988605927384872, "learning_rate": 5.411421242889643e-06, "loss": 0.8972, "step": 1380 }, { "epoch": 1.3774241670810543, "grad_norm": 0.5972211719762297, "learning_rate": 5.334443857275488e-06, "loss": 0.8943, "step": 1385 }, { "epoch": 1.3823968175037296, "grad_norm": 0.5959667581735492, "learning_rate": 5.257818245316522e-06, "loss": 0.8838, "step": 1390 }, { "epoch": 1.3873694679264048, "grad_norm": 0.595064918048714, "learning_rate": 5.181550184431511e-06, "loss": 0.8969, "step": 1395 }, { "epoch": 1.39234211834908, "grad_norm": 0.6244444134401027, "learning_rate": 5.105645425080572e-06, "loss": 0.8999, "step": 1400 }, { "epoch": 1.3973147687717553, "grad_norm": 0.6143231676432872, "learning_rate": 5.030109690331625e-06, "loss": 0.8848, "step": 1405 }, { "epoch": 1.4022874191944306, "grad_norm": 0.5932069860124085, "learning_rate": 4.954948675428853e-06, "loss": 0.9015, "step": 1410 }, { "epoch": 1.4072600696171058, "grad_norm": 0.6037350510324395, "learning_rate": 4.880168047363312e-06, "loss": 0.904, "step": 1415 }, { "epoch": 1.4122327200397813, "grad_norm": 0.6127392479761228, "learning_rate": 4.805773444445654e-06, 
"loss": 0.888, "step": 1420 }, { "epoch": 1.4172053704624565, "grad_norm": 0.6177060602745384, "learning_rate": 4.731770475880995e-06, "loss": 0.8983, "step": 1425 }, { "epoch": 1.4221780208851318, "grad_norm": 0.5872982950015017, "learning_rate": 4.658164721345998e-06, "loss": 0.8924, "step": 1430 }, { "epoch": 1.427150671307807, "grad_norm": 0.5820337262775029, "learning_rate": 4.584961730568188e-06, "loss": 0.8748, "step": 1435 }, { "epoch": 1.4321233217304823, "grad_norm": 0.5998336756778414, "learning_rate": 4.512167022907494e-06, "loss": 0.8957, "step": 1440 }, { "epoch": 1.4370959721531578, "grad_norm": 0.5961438955138286, "learning_rate": 4.439786086940116e-06, "loss": 0.8961, "step": 1445 }, { "epoch": 1.442068622575833, "grad_norm": 0.6078965414451544, "learning_rate": 4.367824380044684e-06, "loss": 0.8911, "step": 1450 }, { "epoch": 1.4470412729985083, "grad_norm": 0.6288618597339264, "learning_rate": 4.296287327990797e-06, "loss": 0.9019, "step": 1455 }, { "epoch": 1.4520139234211835, "grad_norm": 0.6151434227389634, "learning_rate": 4.225180324529917e-06, "loss": 0.8993, "step": 1460 }, { "epoch": 1.4569865738438588, "grad_norm": 0.6404133470470939, "learning_rate": 4.154508730988704e-06, "loss": 0.889, "step": 1465 }, { "epoch": 1.461959224266534, "grad_norm": 0.6164441372345689, "learning_rate": 4.084277875864776e-06, "loss": 0.8986, "step": 1470 }, { "epoch": 1.4669318746892093, "grad_norm": 0.6109571828322322, "learning_rate": 4.0144930544249436e-06, "loss": 0.8946, "step": 1475 }, { "epoch": 1.4719045251118845, "grad_norm": 0.5732121037549397, "learning_rate": 3.945159528305971e-06, "loss": 0.8917, "step": 1480 }, { "epoch": 1.47687717553456, "grad_norm": 0.5956428501272881, "learning_rate": 3.876282525117847e-06, "loss": 0.906, "step": 1485 }, { "epoch": 1.4818498259572352, "grad_norm": 0.6072276314986297, "learning_rate": 3.8078672380496416e-06, "loss": 0.8924, "step": 1490 }, { "epoch": 1.4868224763799105, "grad_norm": 0.5896349716762431, "learning_rate": 3.7399188254779527e-06, "loss": 0.9039, "step": 1495 }, { "epoch": 1.4917951268025857, "grad_norm": 0.6075888226854094, "learning_rate": 3.6724424105779654e-06, "loss": 0.8993, "step": 1500 }, { "epoch": 1.496767777225261, "grad_norm": 0.6174215911692067, "learning_rate": 3.6054430809371723e-06, "loss": 0.9013, "step": 1505 }, { "epoch": 1.5017404276479365, "grad_norm": 0.6152858958633052, "learning_rate": 3.5389258881718003e-06, "loss": 0.8818, "step": 1510 }, { "epoch": 1.5067130780706117, "grad_norm": 0.617067387408615, "learning_rate": 3.4728958475459052e-06, "loss": 0.8879, "step": 1515 }, { "epoch": 1.511685728493287, "grad_norm": 0.6005431823762195, "learning_rate": 3.4073579375932377e-06, "loss": 0.8917, "step": 1520 }, { "epoch": 1.5166583789159622, "grad_norm": 0.5744056476076853, "learning_rate": 3.342317099741886e-06, "loss": 0.883, "step": 1525 }, { "epoch": 1.5216310293386375, "grad_norm": 0.6483072296156038, "learning_rate": 3.27777823794168e-06, "loss": 0.911, "step": 1530 }, { "epoch": 1.5266036797613127, "grad_norm": 0.6051760146177871, "learning_rate": 3.2137462182944557e-06, "loss": 0.898, "step": 1535 }, { "epoch": 1.531576330183988, "grad_norm": 0.5905220877100078, "learning_rate": 3.150225868687161e-06, "loss": 0.8885, "step": 1540 }, { "epoch": 1.5365489806066632, "grad_norm": 0.5972075844479279, "learning_rate": 3.0872219784278357e-06, "loss": 0.8754, "step": 1545 }, { "epoch": 1.5415216310293385, "grad_norm": 0.5896259569827768, "learning_rate": 3.0247392978845203e-06, "loss": 0.8976, "step": 
1550 }, { "epoch": 1.546494281452014, "grad_norm": 0.5850464264225753, "learning_rate": 2.9627825381270704e-06, "loss": 0.8762, "step": 1555 }, { "epoch": 1.5514669318746892, "grad_norm": 0.5803411751479154, "learning_rate": 2.9013563705719673e-06, "loss": 0.8914, "step": 1560 }, { "epoch": 1.5564395822973645, "grad_norm": 0.6107318414430488, "learning_rate": 2.840465426630091e-06, "loss": 0.8927, "step": 1565 }, { "epoch": 1.56141223272004, "grad_norm": 0.5883688758640033, "learning_rate": 2.7801142973575245e-06, "loss": 0.899, "step": 1570 }, { "epoch": 1.5663848831427152, "grad_norm": 0.5806776595867678, "learning_rate": 2.720307533109402e-06, "loss": 0.8714, "step": 1575 }, { "epoch": 1.5713575335653904, "grad_norm": 0.6072257831136263, "learning_rate": 2.6610496431968125e-06, "loss": 0.8909, "step": 1580 }, { "epoch": 1.5763301839880657, "grad_norm": 0.5984905911242053, "learning_rate": 2.6023450955468176e-06, "loss": 0.8905, "step": 1585 }, { "epoch": 1.581302834410741, "grad_norm": 0.5982169185055861, "learning_rate": 2.5441983163655705e-06, "loss": 0.893, "step": 1590 }, { "epoch": 1.5862754848334162, "grad_norm": 0.6845641216521319, "learning_rate": 2.4866136898045844e-06, "loss": 0.888, "step": 1595 }, { "epoch": 1.5912481352560914, "grad_norm": 0.6521336761418035, "learning_rate": 2.4295955576301966e-06, "loss": 0.8975, "step": 1600 }, { "epoch": 1.5962207856787667, "grad_norm": 0.6026814731835343, "learning_rate": 2.373148218896182e-06, "loss": 0.8955, "step": 1605 }, { "epoch": 1.601193436101442, "grad_norm": 0.6031328469823622, "learning_rate": 2.3172759296196267e-06, "loss": 0.8984, "step": 1610 }, { "epoch": 1.6061660865241174, "grad_norm": 0.6085878347429402, "learning_rate": 2.2619829024600394e-06, "loss": 0.897, "step": 1615 }, { "epoch": 1.6111387369467927, "grad_norm": 0.5913537335897251, "learning_rate": 2.2072733064017104e-06, "loss": 0.9019, "step": 1620 }, { "epoch": 1.616111387369468, "grad_norm": 0.6046011617732275, "learning_rate": 2.153151266439384e-06, "loss": 0.89, "step": 1625 }, { "epoch": 1.6210840377921432, "grad_norm": 0.5977994670480866, "learning_rate": 2.0996208632672475e-06, "loss": 0.8857, "step": 1630 }, { "epoch": 1.6260566882148186, "grad_norm": 0.5798385899827562, "learning_rate": 2.0466861329712473e-06, "loss": 0.8893, "step": 1635 }, { "epoch": 1.6310293386374939, "grad_norm": 0.6034790461256396, "learning_rate": 1.994351066724781e-06, "loss": 0.8841, "step": 1640 }, { "epoch": 1.6360019890601691, "grad_norm": 0.6144777283053662, "learning_rate": 1.9426196104877737e-06, "loss": 0.8754, "step": 1645 }, { "epoch": 1.6409746394828444, "grad_norm": 0.589100085406895, "learning_rate": 1.8914956647091497e-06, "loss": 0.8859, "step": 1650 }, { "epoch": 1.6459472899055196, "grad_norm": 0.5907474864486845, "learning_rate": 1.8409830840327546e-06, "loss": 0.8906, "step": 1655 }, { "epoch": 1.650919940328195, "grad_norm": 0.5944183790215095, "learning_rate": 1.791085677006722e-06, "loss": 0.8987, "step": 1660 }, { "epoch": 1.6558925907508701, "grad_norm": 0.5862999400507831, "learning_rate": 1.7418072057963143e-06, "loss": 0.8846, "step": 1665 }, { "epoch": 1.6608652411735454, "grad_norm": 0.6065039887812087, "learning_rate": 1.6931513859002636e-06, "loss": 0.8882, "step": 1670 }, { "epoch": 1.6658378915962206, "grad_norm": 0.5829700290824299, "learning_rate": 1.6451218858706374e-06, "loss": 0.8899, "step": 1675 }, { "epoch": 1.6708105420188961, "grad_norm": 0.5747340494639567, "learning_rate": 1.5977223270362197e-06, "loss": 0.8779, "step": 1680 }, { 
"epoch": 1.6757831924415714, "grad_norm": 0.6236514063391653, "learning_rate": 1.5509562832294944e-06, "loss": 0.8906, "step": 1685 }, { "epoch": 1.6807558428642466, "grad_norm": 0.5890477510202525, "learning_rate": 1.5048272805171615e-06, "loss": 0.8735, "step": 1690 }, { "epoch": 1.685728493286922, "grad_norm": 0.5887405386066722, "learning_rate": 1.459338796934293e-06, "loss": 0.8787, "step": 1695 }, { "epoch": 1.6907011437095973, "grad_norm": 0.5560854923827532, "learning_rate": 1.4144942622220902e-06, "loss": 0.8818, "step": 1700 }, { "epoch": 1.6956737941322726, "grad_norm": 0.6127568360778808, "learning_rate": 1.3702970575692975e-06, "loss": 0.8969, "step": 1705 }, { "epoch": 1.7006464445549478, "grad_norm": 0.576741886798026, "learning_rate": 1.3267505153572502e-06, "loss": 0.8913, "step": 1710 }, { "epoch": 1.705619094977623, "grad_norm": 0.5786099553553555, "learning_rate": 1.2838579189086352e-06, "loss": 0.8836, "step": 1715 }, { "epoch": 1.7105917454002983, "grad_norm": 0.5977190789902269, "learning_rate": 1.2416225022399286e-06, "loss": 0.8837, "step": 1720 }, { "epoch": 1.7155643958229736, "grad_norm": 0.60144046208383, "learning_rate": 1.2000474498175552e-06, "loss": 0.8904, "step": 1725 }, { "epoch": 1.7205370462456488, "grad_norm": 0.5695644106157955, "learning_rate": 1.1591358963177924e-06, "loss": 0.8999, "step": 1730 }, { "epoch": 1.725509696668324, "grad_norm": 0.6030300408695916, "learning_rate": 1.118890926390419e-06, "loss": 0.8849, "step": 1735 }, { "epoch": 1.7304823470909994, "grad_norm": 0.5754077081164456, "learning_rate": 1.0793155744261352e-06, "loss": 0.8809, "step": 1740 }, { "epoch": 1.7354549975136748, "grad_norm": 0.5782023378111908, "learning_rate": 1.0404128243277778e-06, "loss": 0.8875, "step": 1745 }, { "epoch": 1.74042764793635, "grad_norm": 0.589876052766199, "learning_rate": 1.0021856092853433e-06, "loss": 0.8843, "step": 1750 }, { "epoch": 1.7454002983590253, "grad_norm": 0.594939794983627, "learning_rate": 9.646368115548232e-07, "loss": 0.8834, "step": 1755 }, { "epoch": 1.7503729487817008, "grad_norm": 0.5953106465263236, "learning_rate": 9.277692622409018e-07, "loss": 0.8667, "step": 1760 }, { "epoch": 1.755345599204376, "grad_norm": 0.6161359913787724, "learning_rate": 8.915857410834793e-07, "loss": 0.8891, "step": 1765 }, { "epoch": 1.7603182496270513, "grad_norm": 0.6439397975128724, "learning_rate": 8.560889762480951e-07, "loss": 0.8768, "step": 1770 }, { "epoch": 1.7652909000497266, "grad_norm": 0.5657607722214749, "learning_rate": 8.212816441202309e-07, "loss": 0.8886, "step": 1775 }, { "epoch": 1.7702635504724018, "grad_norm": 0.6073482243771166, "learning_rate": 7.871663691035103e-07, "loss": 0.8901, "step": 1780 }, { "epoch": 1.775236200895077, "grad_norm": 0.566216143109814, "learning_rate": 7.537457234218271e-07, "loss": 0.8844, "step": 1785 }, { "epoch": 1.7802088513177523, "grad_norm": 0.5896090553262934, "learning_rate": 7.210222269254041e-07, "loss": 0.8897, "step": 1790 }, { "epoch": 1.7851815017404276, "grad_norm": 0.5874428543517964, "learning_rate": 6.889983469008055e-07, "loss": 0.887, "step": 1795 }, { "epoch": 1.7901541521631028, "grad_norm": 0.5923783661686378, "learning_rate": 6.576764978849005e-07, "loss": 0.89, "step": 1800 }, { "epoch": 1.795126802585778, "grad_norm": 0.5747156271260179, "learning_rate": 6.27059041482817e-07, "loss": 0.8803, "step": 1805 }, { "epoch": 1.8000994530084535, "grad_norm": 0.5843164968993672, "learning_rate": 5.971482861898836e-07, "loss": 0.8814, "step": 1810 }, { "epoch": 
1.8050721034311288, "grad_norm": 0.5925488819832142, "learning_rate": 5.679464872175666e-07, "loss": 0.8798, "step": 1815 }, { "epoch": 1.810044753853804, "grad_norm": 0.5814849122329898, "learning_rate": 5.394558463234378e-07, "loss": 0.8928, "step": 1820 }, { "epoch": 1.8150174042764795, "grad_norm": 0.5653073997150958, "learning_rate": 5.116785116451661e-07, "loss": 0.8858, "step": 1825 }, { "epoch": 1.8199900546991548, "grad_norm": 0.5977924852737442, "learning_rate": 4.846165775385459e-07, "loss": 0.8868, "step": 1830 }, { "epoch": 1.82496270512183, "grad_norm": 0.5993863609862281, "learning_rate": 4.5827208441959426e-07, "loss": 0.891, "step": 1835 }, { "epoch": 1.8299353555445053, "grad_norm": 0.601479874243501, "learning_rate": 4.326470186107035e-07, "loss": 0.8791, "step": 1840 }, { "epoch": 1.8349080059671805, "grad_norm": 0.592784564470986, "learning_rate": 4.077433121908747e-07, "loss": 0.8875, "step": 1845 }, { "epoch": 1.8398806563898558, "grad_norm": 0.5707840764321231, "learning_rate": 3.835628428500515e-07, "loss": 0.8646, "step": 1850 }, { "epoch": 1.844853306812531, "grad_norm": 0.5628275928965689, "learning_rate": 3.601074337475352e-07, "loss": 0.8769, "step": 1855 }, { "epoch": 1.8498259572352063, "grad_norm": 0.6070264634956976, "learning_rate": 3.3737885337452815e-07, "loss": 0.8996, "step": 1860 }, { "epoch": 1.8547986076578815, "grad_norm": 0.5987252942276654, "learning_rate": 3.153788154207926e-07, "loss": 0.9035, "step": 1865 }, { "epoch": 1.859771258080557, "grad_norm": 0.603313374471039, "learning_rate": 2.941089786454421e-07, "loss": 0.8928, "step": 1870 }, { "epoch": 1.8647439085032322, "grad_norm": 0.6133979767792208, "learning_rate": 2.735709467518699e-07, "loss": 0.8796, "step": 1875 }, { "epoch": 1.8697165589259075, "grad_norm": 0.5708712405374609, "learning_rate": 2.5376626826683956e-07, "loss": 0.8783, "step": 1880 }, { "epoch": 1.8746892093485827, "grad_norm": 0.5903635964464384, "learning_rate": 2.3469643642372587e-07, "loss": 0.8727, "step": 1885 }, { "epoch": 1.8796618597712582, "grad_norm": 0.5860886967031799, "learning_rate": 2.1636288904992585e-07, "loss": 0.8911, "step": 1890 }, { "epoch": 1.8846345101939335, "grad_norm": 0.5987205804929953, "learning_rate": 1.9876700845845475e-07, "loss": 0.8622, "step": 1895 }, { "epoch": 1.8896071606166087, "grad_norm": 0.5864702159345376, "learning_rate": 1.8191012134371576e-07, "loss": 0.8997, "step": 1900 }, { "epoch": 1.894579811039284, "grad_norm": 0.5941563218461623, "learning_rate": 1.6579349868147688e-07, "loss": 0.8801, "step": 1905 }, { "epoch": 1.8995524614619592, "grad_norm": 0.5890830924362549, "learning_rate": 1.504183556330374e-07, "loss": 0.8752, "step": 1910 }, { "epoch": 1.9045251118846345, "grad_norm": 0.5973613796458782, "learning_rate": 1.3578585145360812e-07, "loss": 0.888, "step": 1915 }, { "epoch": 1.9094977623073097, "grad_norm": 0.6015109014180703, "learning_rate": 1.2189708940490653e-07, "loss": 0.8915, "step": 1920 }, { "epoch": 1.914470412729985, "grad_norm": 0.5840101145509294, "learning_rate": 1.0875311667196908e-07, "loss": 0.8823, "step": 1925 }, { "epoch": 1.9194430631526602, "grad_norm": 0.9953027707453141, "learning_rate": 9.635492428420434e-08, "loss": 0.8706, "step": 1930 }, { "epoch": 1.9244157135753357, "grad_norm": 0.569992582848356, "learning_rate": 8.470344704066047e-08, "loss": 0.8935, "step": 1935 }, { "epoch": 1.929388363998011, "grad_norm": 0.5820719941969129, "learning_rate": 7.379956343955385e-08, "loss": 0.8726, "step": 1940 }, { "epoch": 1.9343610144206862, 
"grad_norm": 0.5883844683240437, "learning_rate": 6.364409561202323e-08, "loss": 0.8907, "step": 1945 }, { "epoch": 1.9393336648433617, "grad_norm": 0.5888023020558691, "learning_rate": 5.42378092601481e-08, "loss": 0.8733, "step": 1950 }, { "epoch": 1.944306315266037, "grad_norm": 0.6131742183657333, "learning_rate": 4.558141359921386e-08, "loss": 0.8835, "step": 1955 }, { "epoch": 1.9492789656887122, "grad_norm": 0.5724474865234898, "learning_rate": 3.7675561304238996e-08, "loss": 0.9032, "step": 1960 }, { "epoch": 1.9542516161113874, "grad_norm": 0.5716096201690882, "learning_rate": 3.0520848460765525e-08, "loss": 0.8891, "step": 1965 }, { "epoch": 1.9592242665340627, "grad_norm": 0.5954898642826182, "learning_rate": 2.4117814519911687e-08, "loss": 0.8951, "step": 1970 }, { "epoch": 1.964196916956738, "grad_norm": 0.5883075227671835, "learning_rate": 1.846694225770551e-08, "loss": 0.8799, "step": 1975 }, { "epoch": 1.9691695673794132, "grad_norm": 0.5596221737627415, "learning_rate": 1.3568657738678437e-08, "loss": 0.8917, "step": 1980 }, { "epoch": 1.9741422178020884, "grad_norm": 0.5985361336002101, "learning_rate": 9.423330283742093e-09, "loss": 0.8822, "step": 1985 }, { "epoch": 1.9791148682247637, "grad_norm": 0.5774439559985055, "learning_rate": 6.031272442341696e-09, "loss": 0.8984, "step": 1990 }, { "epoch": 1.984087518647439, "grad_norm": 0.5947058034013541, "learning_rate": 3.3927399688948868e-09, "loss": 0.88, "step": 1995 }, { "epoch": 1.9890601690701144, "grad_norm": 0.5662497030571633, "learning_rate": 1.5079318035016166e-09, "loss": 0.8558, "step": 2000 }, { "epoch": 1.9940328194927897, "grad_norm": 0.584469598380576, "learning_rate": 3.7699005695057687e-10, "loss": 0.8875, "step": 2005 }, { "epoch": 1.999005469915465, "grad_norm": 0.5888373608660001, "learning_rate": 0.0, "loss": 0.8693, "step": 2010 }, { "epoch": 1.999005469915465, "eval_loss": 1.057645320892334, "eval_runtime": 312.6039, "eval_samples_per_second": 45.543, "eval_steps_per_second": 0.713, "step": 2010 }, { "epoch": 1.999005469915465, "step": 2010, "total_flos": 841287456522240.0, "train_loss": 0.6627339932455946, "train_runtime": 16962.808, "train_samples_per_second": 15.173, "train_steps_per_second": 0.118 } ], "logging_steps": 5, "max_steps": 2010, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 841287456522240.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }