diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,93606 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 19.997756086615055, + "eval_steps": 500, + "global_step": 66840, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.4561903178691864, + "learning_rate": 4.9999999309637996e-05, + "loss": 1.7761, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6195425391197205, + "learning_rate": 4.999999723855201e-05, + "loss": 1.6886, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6293871402740479, + "learning_rate": 4.9999993786742175e-05, + "loss": 1.6967, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.6493249535560608, + "learning_rate": 4.999998895420865e-05, + "loss": 1.8346, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.6501443982124329, + "learning_rate": 4.999998274095173e-05, + "loss": 1.7955, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 1.5174015760421753, + "learning_rate": 4.999997514697176e-05, + "loss": 1.8013, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.7737800478935242, + "learning_rate": 4.999996617226913e-05, + "loss": 1.6485, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 1.2001756429672241, + "learning_rate": 4.999995581684437e-05, + "loss": 1.3865, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 1.1627835035324097, + "learning_rate": 4.9999944080698024e-05, + "loss": 1.2725, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 0.5961304306983948, + "learning_rate": 4.999993096383076e-05, + "loss": 1.5426, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7442084550857544, + "learning_rate": 4.999991646624329e-05, + "loss": 1.5626, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 0.9939375519752502, + "learning_rate": 4.9999900587936426e-05, + "loss": 1.427, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.8643019199371338, + "learning_rate": 4.9999883328911034e-05, + "loss": 1.8313, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 0.8557299375534058, + "learning_rate": 4.999986468916807e-05, + "loss": 1.4777, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.45538774132728577, + "learning_rate": 4.9999844668708574e-05, + "loss": 1.5458, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 0.6336299777030945, + "learning_rate": 4.9999823267533627e-05, + "loss": 1.3938, + "step": 80 + }, + { + "epoch": 0.03, + "grad_norm": 0.778620719909668, + "learning_rate": 4.9999800485644445e-05, + "loss": 1.4802, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.8213117718696594, + "learning_rate": 4.9999776323042255e-05, + "loss": 1.351, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.6166107058525085, + "learning_rate": 4.999975077972841e-05, + "loss": 1.4811, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.6074905395507812, + "learning_rate": 4.999972385570432e-05, + "loss": 1.4381, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 0.7994800209999084, + "learning_rate": 4.999969555097146e-05, + "loss": 1.4045, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.5199000239372253, + "learning_rate": 4.99996658655314e-05, + "loss": 1.479, + "step": 110 + }, + { + "epoch": 0.03, + "grad_norm": 0.5505474209785461, + "learning_rate": 4.999963479938577e-05, + "loss": 1.5272, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 1.027380108833313, + "learning_rate": 4.999960235253631e-05, + "loss": 1.4791, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.7694772481918335, + "learning_rate": 4.99995685249848e-05, + "loss": 1.4485, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9029272794723511, + "learning_rate": 4.99995333167331e-05, + "loss": 1.3591, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.46942275762557983, + "learning_rate": 4.999949672778316e-05, + "loss": 1.3628, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.8350025415420532, + "learning_rate": 4.999945875813701e-05, + "loss": 1.4336, + "step": 140 + }, + { + "epoch": 0.04, + "grad_norm": 0.6389840841293335, + "learning_rate": 4.999941940779673e-05, + "loss": 1.4277, + "step": 145 + }, + { + "epoch": 0.04, + "grad_norm": 0.38392823934555054, + "learning_rate": 4.99993786767645e-05, + "loss": 1.4192, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 1.229735255241394, + "learning_rate": 4.999933656504257e-05, + "loss": 1.514, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.3870686292648315, + "learning_rate": 4.9999293072633273e-05, + "loss": 1.4909, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 2.531494617462158, + "learning_rate": 4.9999248199539006e-05, + "loss": 1.4567, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.5871274471282959, + "learning_rate": 4.9999201945762244e-05, + "loss": 1.326, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.35784173011779785, + "learning_rate": 4.999915431130554e-05, + "loss": 1.4567, + "step": 175 + }, + { + "epoch": 0.05, + "grad_norm": 0.901470422744751, + "learning_rate": 4.9999105296171535e-05, + "loss": 1.5553, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.3752151727676392, + "learning_rate": 4.9999054900362915e-05, + "loss": 1.389, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.1029325723648071, + "learning_rate": 4.9999003123882494e-05, + "loss": 1.4993, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 1.255204200744629, + "learning_rate": 4.999894996673311e-05, + "loss": 1.5035, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 1.002902865409851, + "learning_rate": 4.9998895428917704e-05, + "loss": 1.3108, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.3146979808807373, + "learning_rate": 4.9998839510439286e-05, + "loss": 1.5487, + "step": 205 + }, + { + "epoch": 0.06, + "grad_norm": 0.5592719316482544, + "learning_rate": 4.999878221130095e-05, + "loss": 1.3763, + "step": 210 + }, + { + "epoch": 0.06, + "grad_norm": 0.5583714246749878, + "learning_rate": 4.999872353150586e-05, + "loss": 1.4305, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 1.0968598127365112, + "learning_rate": 4.999866347105725e-05, + "loss": 1.4064, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.7956498265266418, + "learning_rate": 4.999860202995844e-05, + "loss": 1.3986, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.1131305694580078, + "learning_rate": 4.999853920821283e-05, + "loss": 1.4269, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.5098061561584473, + "learning_rate": 4.999847500582388e-05, + "loss": 1.5158, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.8009808659553528, + "learning_rate": 4.999840942279514e-05, + "loss": 1.4029, + "step": 240 + }, + { + "epoch": 0.07, + "grad_norm": 0.5857890248298645, + "learning_rate": 4.999834245913023e-05, + "loss": 1.4703, + "step": 245 + }, + { + "epoch": 0.07, + "grad_norm": 0.9343911409378052, + "learning_rate": 4.9998274114832854e-05, + "loss": 1.358, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.430106520652771, + "learning_rate": 4.999820438990678e-05, + "loss": 1.3883, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.5393354296684265, + "learning_rate": 4.999813328435586e-05, + "loss": 1.4375, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 0.7708584666252136, + "learning_rate": 4.999806079818403e-05, + "loss": 1.4298, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 0.8902773261070251, + "learning_rate": 4.999798693139528e-05, + "loss": 1.4823, + "step": 270 + }, + { + "epoch": 0.08, + "grad_norm": 0.7855858206748962, + "learning_rate": 4.99979116839937e-05, + "loss": 1.3869, + "step": 275 + }, + { + "epoch": 0.08, + "grad_norm": 0.864590585231781, + "learning_rate": 4.9997835055983436e-05, + "loss": 1.3566, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 1.4891940355300903, + "learning_rate": 4.999775704736873e-05, + "loss": 1.3024, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 1.1072434186935425, + "learning_rate": 4.9997677658153885e-05, + "loss": 1.4124, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 3.1894373893737793, + "learning_rate": 4.999759688834329e-05, + "loss": 1.5361, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.6644517183303833, + "learning_rate": 4.99975147379414e-05, + "loss": 1.4811, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 1.1441906690597534, + "learning_rate": 4.999743120695275e-05, + "loss": 1.4017, + "step": 305 + }, + { + "epoch": 0.09, + "grad_norm": 0.748852550983429, + "learning_rate": 4.999734629538197e-05, + "loss": 1.3883, + "step": 310 + }, + { + "epoch": 0.09, + "grad_norm": 0.8531980514526367, + "learning_rate": 4.999726000323373e-05, + "loss": 1.3784, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 0.746377170085907, + "learning_rate": 4.99971723305128e-05, + "loss": 1.4622, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8359838128089905, + "learning_rate": 4.999708327722402e-05, + "loss": 1.2369, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 1.8505661487579346, + "learning_rate": 4.999699284337232e-05, + "loss": 1.6082, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 1.608825922012329, + "learning_rate": 4.99969010289627e-05, + "loss": 1.5367, + "step": 335 + }, + { + "epoch": 0.1, + "grad_norm": 1.0551233291625977, + "learning_rate": 4.99968078340002e-05, + "loss": 1.4654, + "step": 340 + }, + { + "epoch": 0.1, + "grad_norm": 1.0916029214859009, + "learning_rate": 4.999671325848999e-05, + "loss": 1.585, + "step": 345 + }, + { + "epoch": 0.1, + "grad_norm": 0.6593181490898132, + "learning_rate": 4.9996617302437296e-05, + "loss": 1.4262, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 0.575879693031311, + "learning_rate": 4.999651996584741e-05, + "loss": 1.4462, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 1.3045742511749268, + "learning_rate": 4.999642124872571e-05, + "loss": 1.492, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 0.6474262475967407, + "learning_rate": 4.999632115107764e-05, + "loss": 1.3665, + "step": 365 + }, + { + "epoch": 0.11, + "grad_norm": 0.9724390506744385, + "learning_rate": 4.999621967290874e-05, + "loss": 1.2883, + "step": 370 + }, + { + "epoch": 0.11, + "grad_norm": 0.917503297328949, + "learning_rate": 4.999611681422461e-05, + "loss": 1.5095, + "step": 375 + }, + { + "epoch": 0.11, + "grad_norm": 1.5174071788787842, + "learning_rate": 4.999601257503093e-05, + "loss": 1.5043, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.2477240562438965, + "learning_rate": 4.999590695533345e-05, + "loss": 1.4741, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0032185316085815, + "learning_rate": 4.9995799955138025e-05, + "loss": 1.3719, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 1.0506250858306885, + "learning_rate": 4.999569157445054e-05, + "loss": 1.4187, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 0.5169631242752075, + "learning_rate": 4.999558181327699e-05, + "loss": 1.3557, + "step": 400 + }, + { + "epoch": 0.12, + "grad_norm": 1.166700839996338, + "learning_rate": 4.9995470671623446e-05, + "loss": 1.5298, + "step": 405 + }, + { + "epoch": 0.12, + "grad_norm": 0.9644590616226196, + "learning_rate": 4.999535814949603e-05, + "loss": 1.4766, + "step": 410 + }, + { + "epoch": 0.12, + "grad_norm": 1.849117636680603, + "learning_rate": 4.999524424690097e-05, + "loss": 1.3298, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 0.5561323761940002, + "learning_rate": 4.9995128963844545e-05, + "loss": 1.5718, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 1.6733776330947876, + "learning_rate": 4.9995012300333134e-05, + "loss": 1.3693, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 0.8910932540893555, + "learning_rate": 4.999489425637317e-05, + "loss": 1.5194, + "step": 430 + }, + { + "epoch": 0.13, + "grad_norm": 3.7125866413116455, + "learning_rate": 4.9994774831971184e-05, + "loss": 1.6596, + "step": 435 + }, + { + "epoch": 0.13, + "grad_norm": 0.8103092312812805, + "learning_rate": 4.999465402713376e-05, + "loss": 1.4965, + "step": 440 + }, + { + "epoch": 0.13, + "grad_norm": 0.902019739151001, + "learning_rate": 4.999453184186757e-05, + "loss": 1.3179, + "step": 445 + }, + { + "epoch": 0.13, + "grad_norm": 0.9942508935928345, + "learning_rate": 4.999440827617938e-05, + "loss": 1.3365, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.8688194751739502, + "learning_rate": 4.9994283330075983e-05, + "loss": 1.3126, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 0.632107675075531, + "learning_rate": 4.9994157003564314e-05, + "loss": 1.4769, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 1.341265082359314, + "learning_rate": 4.999402929665133e-05, + "loss": 1.3436, + "step": 465 + }, + { + "epoch": 0.14, + "grad_norm": 1.463435173034668, + "learning_rate": 4.999390020934408e-05, + "loss": 1.3653, + "step": 470 + }, + { + "epoch": 0.14, + "grad_norm": 0.5955095291137695, + "learning_rate": 4.9993769741649707e-05, + "loss": 1.3658, + "step": 475 + }, + { + "epoch": 0.14, + "grad_norm": 0.7632045149803162, + "learning_rate": 4.999363789357541e-05, + "loss": 1.2948, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.9410882592201233, + "learning_rate": 4.9993504665128474e-05, + "loss": 1.4352, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 1.489148736000061, + "learning_rate": 4.999337005631625e-05, + "loss": 1.5849, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 1.0961707830429077, + "learning_rate": 4.9993234067146186e-05, + "loss": 1.4885, + "step": 495 + }, + { + "epoch": 0.15, + "grad_norm": 0.9856806993484497, + "learning_rate": 4.999309669762577e-05, + "loss": 1.6123, + "step": 500 + }, + { + "epoch": 0.15, + "grad_norm": 0.6104403734207153, + "learning_rate": 4.999295794776261e-05, + "loss": 1.4599, + "step": 505 + }, + { + "epoch": 0.15, + "grad_norm": 1.5283727645874023, + "learning_rate": 4.9992817817564366e-05, + "loss": 1.5948, + "step": 510 + }, + { + "epoch": 0.15, + "grad_norm": 2.664371967315674, + "learning_rate": 4.9992676307038765e-05, + "loss": 1.5192, + "step": 515 + }, + { + "epoch": 0.16, + "grad_norm": 0.813569962978363, + "learning_rate": 4.999253341619363e-05, + "loss": 1.3386, + "step": 520 + }, + { + "epoch": 0.16, + "grad_norm": 0.8261412382125854, + "learning_rate": 4.999238914503686e-05, + "loss": 1.3626, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 1.0505256652832031, + "learning_rate": 4.999224349357641e-05, + "loss": 1.3934, + "step": 530 + }, + { + "epoch": 0.16, + "grad_norm": 1.3349155187606812, + "learning_rate": 4.999209646182033e-05, + "loss": 1.3489, + "step": 535 + }, + { + "epoch": 0.16, + "grad_norm": 0.8616823554039001, + "learning_rate": 4.999194804977674e-05, + "loss": 1.4238, + "step": 540 + }, + { + "epoch": 0.16, + "grad_norm": 0.6977237462997437, + "learning_rate": 4.9991798257453834e-05, + "loss": 1.4843, + "step": 545 + }, + { + "epoch": 0.16, + "grad_norm": 1.441572666168213, + "learning_rate": 4.9991647084859894e-05, + "loss": 1.3559, + "step": 550 + }, + { + "epoch": 0.17, + "grad_norm": 1.1105183362960815, + "learning_rate": 4.9991494532003255e-05, + "loss": 1.5466, + "step": 555 + }, + { + "epoch": 0.17, + "grad_norm": 1.4936366081237793, + "learning_rate": 4.9991340598892355e-05, + "loss": 1.256, + "step": 560 + }, + { + "epoch": 0.17, + "grad_norm": 1.0292248725891113, + "learning_rate": 4.999118528553569e-05, + "loss": 1.4012, + "step": 565 + }, + { + "epoch": 0.17, + "grad_norm": 0.9438468217849731, + "learning_rate": 4.9991028591941846e-05, + "loss": 1.4391, + "step": 570 + }, + { + "epoch": 0.17, + "grad_norm": 1.2015315294265747, + "learning_rate": 4.9990870518119456e-05, + "loss": 1.2896, + "step": 575 + }, + { + "epoch": 0.17, + "grad_norm": 1.021628737449646, + "learning_rate": 4.9990711064077276e-05, + "loss": 1.4336, + "step": 580 + }, + { + "epoch": 0.18, + "grad_norm": 1.3274457454681396, + "learning_rate": 4.999055022982409e-05, + "loss": 1.3766, + "step": 585 + }, + { + "epoch": 0.18, + "grad_norm": 0.7466582655906677, + "learning_rate": 4.99903880153688e-05, + "loss": 1.4709, + "step": 590 + }, + { + "epoch": 0.18, + "grad_norm": 1.1090950965881348, + "learning_rate": 4.999022442072036e-05, + "loss": 1.505, + "step": 595 + }, + { + "epoch": 0.18, + "grad_norm": 1.0768952369689941, + "learning_rate": 4.999005944588778e-05, + "loss": 1.4515, + "step": 600 + }, + { + "epoch": 0.18, + "grad_norm": 1.0834194421768188, + "learning_rate": 4.998989309088021e-05, + "loss": 1.5076, + "step": 605 + }, + { + "epoch": 0.18, + "grad_norm": 1.119468092918396, + "learning_rate": 4.998972535570682e-05, + "loss": 1.3812, + "step": 610 + }, + { + "epoch": 0.18, + "grad_norm": 1.5594128370285034, + "learning_rate": 4.9989556240376864e-05, + "loss": 1.3452, + "step": 615 + }, + { + "epoch": 0.19, + "grad_norm": 1.9768519401550293, + "learning_rate": 4.9989385744899705e-05, + "loss": 1.5971, + "step": 620 + }, + { + "epoch": 0.19, + "grad_norm": 0.7751976847648621, + "learning_rate": 4.9989213869284734e-05, + "loss": 1.5723, + "step": 625 + }, + { + "epoch": 0.19, + "grad_norm": 0.8533837795257568, + "learning_rate": 4.998904061354146e-05, + "loss": 1.3348, + "step": 630 + }, + { + "epoch": 0.19, + "grad_norm": 0.8224301338195801, + "learning_rate": 4.9988865977679454e-05, + "loss": 1.4717, + "step": 635 + }, + { + "epoch": 0.19, + "grad_norm": 1.4539473056793213, + "learning_rate": 4.998868996170835e-05, + "loss": 1.4803, + "step": 640 + }, + { + "epoch": 0.19, + "grad_norm": 1.3216567039489746, + "learning_rate": 4.9988512565637866e-05, + "loss": 1.4922, + "step": 645 + }, + { + "epoch": 0.19, + "grad_norm": 0.8094632029533386, + "learning_rate": 4.998833378947781e-05, + "loss": 1.521, + "step": 650 + }, + { + "epoch": 0.2, + "grad_norm": 1.1426763534545898, + "learning_rate": 4.9988153633238065e-05, + "loss": 1.2602, + "step": 655 + }, + { + "epoch": 0.2, + "grad_norm": 0.7161766886711121, + "learning_rate": 4.998797209692856e-05, + "loss": 1.3155, + "step": 660 + }, + { + "epoch": 0.2, + "grad_norm": 0.38354432582855225, + "learning_rate": 4.998778918055933e-05, + "loss": 1.4266, + "step": 665 + }, + { + "epoch": 0.2, + "grad_norm": 0.6859254837036133, + "learning_rate": 4.9987604884140485e-05, + "loss": 1.3925, + "step": 670 + }, + { + "epoch": 0.2, + "grad_norm": 1.218139886856079, + "learning_rate": 4.9987419207682186e-05, + "loss": 1.4437, + "step": 675 + }, + { + "epoch": 0.2, + "grad_norm": 0.8014891743659973, + "learning_rate": 4.99872321511947e-05, + "loss": 1.398, + "step": 680 + }, + { + "epoch": 0.2, + "grad_norm": 1.764227271080017, + "learning_rate": 4.998704371468835e-05, + "loss": 1.4238, + "step": 685 + }, + { + "epoch": 0.21, + "grad_norm": 0.8913390636444092, + "learning_rate": 4.998685389817355e-05, + "loss": 1.5147, + "step": 690 + }, + { + "epoch": 0.21, + "grad_norm": 1.5515832901000977, + "learning_rate": 4.998666270166078e-05, + "loss": 1.5834, + "step": 695 + }, + { + "epoch": 0.21, + "grad_norm": 0.7237977385520935, + "learning_rate": 4.998647012516061e-05, + "loss": 1.3329, + "step": 700 + }, + { + "epoch": 0.21, + "grad_norm": 1.647252082824707, + "learning_rate": 4.998627616868366e-05, + "loss": 1.3771, + "step": 705 + }, + { + "epoch": 0.21, + "grad_norm": 0.8865868449211121, + "learning_rate": 4.9986080832240646e-05, + "loss": 1.4632, + "step": 710 + }, + { + "epoch": 0.21, + "grad_norm": 2.0670599937438965, + "learning_rate": 4.998588411584236e-05, + "loss": 1.3726, + "step": 715 + }, + { + "epoch": 0.22, + "grad_norm": 0.742981493473053, + "learning_rate": 4.998568601949968e-05, + "loss": 1.3809, + "step": 720 + }, + { + "epoch": 0.22, + "grad_norm": 0.7563424110412598, + "learning_rate": 4.998548654322351e-05, + "loss": 1.5463, + "step": 725 + }, + { + "epoch": 0.22, + "grad_norm": 1.2141450643539429, + "learning_rate": 4.998528568702491e-05, + "loss": 1.4768, + "step": 730 + }, + { + "epoch": 0.22, + "grad_norm": 1.1518967151641846, + "learning_rate": 4.998508345091494e-05, + "loss": 1.5653, + "step": 735 + }, + { + "epoch": 0.22, + "grad_norm": 0.5695226788520813, + "learning_rate": 4.998487983490478e-05, + "loss": 1.3969, + "step": 740 + }, + { + "epoch": 0.22, + "grad_norm": 0.9854166507720947, + "learning_rate": 4.998467483900568e-05, + "loss": 1.5679, + "step": 745 + }, + { + "epoch": 0.22, + "grad_norm": 1.0108509063720703, + "learning_rate": 4.998446846322897e-05, + "loss": 1.3145, + "step": 750 + }, + { + "epoch": 0.23, + "grad_norm": 0.8769714832305908, + "learning_rate": 4.998426070758603e-05, + "loss": 1.2757, + "step": 755 + }, + { + "epoch": 0.23, + "grad_norm": 1.4470043182373047, + "learning_rate": 4.998405157208833e-05, + "loss": 1.4435, + "step": 760 + }, + { + "epoch": 0.23, + "grad_norm": 1.0755447149276733, + "learning_rate": 4.9983841056747446e-05, + "loss": 1.3629, + "step": 765 + }, + { + "epoch": 0.23, + "grad_norm": 0.8289944529533386, + "learning_rate": 4.998362916157498e-05, + "loss": 1.4345, + "step": 770 + }, + { + "epoch": 0.23, + "grad_norm": 0.8254856467247009, + "learning_rate": 4.998341588658265e-05, + "loss": 1.4909, + "step": 775 + }, + { + "epoch": 0.23, + "grad_norm": 0.4825249910354614, + "learning_rate": 4.998320123178223e-05, + "loss": 1.3529, + "step": 780 + }, + { + "epoch": 0.23, + "grad_norm": 1.111685037612915, + "learning_rate": 4.998298519718557e-05, + "loss": 1.292, + "step": 785 + }, + { + "epoch": 0.24, + "grad_norm": 0.8751310706138611, + "learning_rate": 4.998276778280461e-05, + "loss": 1.2957, + "step": 790 + }, + { + "epoch": 0.24, + "grad_norm": 2.0307741165161133, + "learning_rate": 4.998254898865134e-05, + "loss": 1.364, + "step": 795 + }, + { + "epoch": 0.24, + "grad_norm": 1.2933695316314697, + "learning_rate": 4.998232881473787e-05, + "loss": 1.4821, + "step": 800 + }, + { + "epoch": 0.24, + "grad_norm": 1.1037958860397339, + "learning_rate": 4.998210726107635e-05, + "loss": 1.3466, + "step": 805 + }, + { + "epoch": 0.24, + "grad_norm": 1.1049686670303345, + "learning_rate": 4.9981884327679005e-05, + "loss": 1.3488, + "step": 810 + }, + { + "epoch": 0.24, + "grad_norm": 1.055496335029602, + "learning_rate": 4.9981660014558165e-05, + "loss": 1.4615, + "step": 815 + }, + { + "epoch": 0.25, + "grad_norm": 0.8814008831977844, + "learning_rate": 4.99814343217262e-05, + "loss": 1.4898, + "step": 820 + }, + { + "epoch": 0.25, + "grad_norm": 3.712348222732544, + "learning_rate": 4.998120724919559e-05, + "loss": 1.4881, + "step": 825 + }, + { + "epoch": 0.25, + "grad_norm": 0.7310431003570557, + "learning_rate": 4.9980978796978865e-05, + "loss": 1.3357, + "step": 830 + }, + { + "epoch": 0.25, + "grad_norm": 0.5869240164756775, + "learning_rate": 4.998074896508865e-05, + "loss": 1.4089, + "step": 835 + }, + { + "epoch": 0.25, + "grad_norm": 0.99127197265625, + "learning_rate": 4.9980517753537634e-05, + "loss": 1.5001, + "step": 840 + }, + { + "epoch": 0.25, + "grad_norm": 1.083500623703003, + "learning_rate": 4.998028516233859e-05, + "loss": 1.334, + "step": 845 + }, + { + "epoch": 0.25, + "grad_norm": 0.7491528391838074, + "learning_rate": 4.998005119150436e-05, + "loss": 1.5227, + "step": 850 + }, + { + "epoch": 0.26, + "grad_norm": 0.5255687832832336, + "learning_rate": 4.997981584104788e-05, + "loss": 1.3318, + "step": 855 + }, + { + "epoch": 0.26, + "grad_norm": 0.6178213357925415, + "learning_rate": 4.997957911098212e-05, + "loss": 1.4365, + "step": 860 + }, + { + "epoch": 0.26, + "grad_norm": 1.9565240144729614, + "learning_rate": 4.997934100132018e-05, + "loss": 1.3325, + "step": 865 + }, + { + "epoch": 0.26, + "grad_norm": 0.6852719187736511, + "learning_rate": 4.99791015120752e-05, + "loss": 1.4526, + "step": 870 + }, + { + "epoch": 0.26, + "grad_norm": 1.6580244302749634, + "learning_rate": 4.99788606432604e-05, + "loss": 1.585, + "step": 875 + }, + { + "epoch": 0.26, + "grad_norm": 1.2035952806472778, + "learning_rate": 4.9978618394889097e-05, + "loss": 1.3965, + "step": 880 + }, + { + "epoch": 0.26, + "grad_norm": 1.3621057271957397, + "learning_rate": 4.9978374766974664e-05, + "loss": 1.4035, + "step": 885 + }, + { + "epoch": 0.27, + "grad_norm": 1.326550006866455, + "learning_rate": 4.997812975953056e-05, + "loss": 1.6006, + "step": 890 + }, + { + "epoch": 0.27, + "grad_norm": 0.5781381726264954, + "learning_rate": 4.997788337257031e-05, + "loss": 1.376, + "step": 895 + }, + { + "epoch": 0.27, + "grad_norm": 0.650931715965271, + "learning_rate": 4.997763560610752e-05, + "loss": 1.4043, + "step": 900 + }, + { + "epoch": 0.27, + "grad_norm": 0.7104997038841248, + "learning_rate": 4.997738646015588e-05, + "loss": 1.3388, + "step": 905 + }, + { + "epoch": 0.27, + "grad_norm": 1.88402259349823, + "learning_rate": 4.997713593472915e-05, + "loss": 1.2928, + "step": 910 + }, + { + "epoch": 0.27, + "grad_norm": 1.100692868232727, + "learning_rate": 4.997688402984116e-05, + "loss": 1.3306, + "step": 915 + }, + { + "epoch": 0.28, + "grad_norm": 0.7148817777633667, + "learning_rate": 4.997663074550584e-05, + "loss": 1.2836, + "step": 920 + }, + { + "epoch": 0.28, + "grad_norm": 1.2554770708084106, + "learning_rate": 4.9976376081737154e-05, + "loss": 1.365, + "step": 925 + }, + { + "epoch": 0.28, + "grad_norm": 0.8165137767791748, + "learning_rate": 4.9976120038549187e-05, + "loss": 1.3327, + "step": 930 + }, + { + "epoch": 0.28, + "grad_norm": 1.1443816423416138, + "learning_rate": 4.997586261595606e-05, + "loss": 1.6129, + "step": 935 + }, + { + "epoch": 0.28, + "grad_norm": 0.6165381669998169, + "learning_rate": 4.997560381397201e-05, + "loss": 1.3688, + "step": 940 + }, + { + "epoch": 0.28, + "grad_norm": 0.8930396437644958, + "learning_rate": 4.997534363261132e-05, + "loss": 1.2127, + "step": 945 + }, + { + "epoch": 0.28, + "grad_norm": 1.0518968105316162, + "learning_rate": 4.997508207188836e-05, + "loss": 1.407, + "step": 950 + }, + { + "epoch": 0.29, + "grad_norm": 0.8033773303031921, + "learning_rate": 4.997481913181758e-05, + "loss": 1.4512, + "step": 955 + }, + { + "epoch": 0.29, + "grad_norm": 0.7851698398590088, + "learning_rate": 4.99745548124135e-05, + "loss": 1.4645, + "step": 960 + }, + { + "epoch": 0.29, + "grad_norm": 0.4588683843612671, + "learning_rate": 4.997428911369072e-05, + "loss": 1.3892, + "step": 965 + }, + { + "epoch": 0.29, + "grad_norm": 0.6735519170761108, + "learning_rate": 4.99740220356639e-05, + "loss": 1.4797, + "step": 970 + }, + { + "epoch": 0.29, + "grad_norm": 0.6732616424560547, + "learning_rate": 4.997375357834781e-05, + "loss": 1.4322, + "step": 975 + }, + { + "epoch": 0.29, + "grad_norm": 0.9135339856147766, + "learning_rate": 4.997348374175727e-05, + "loss": 1.489, + "step": 980 + }, + { + "epoch": 0.29, + "grad_norm": 0.7825419902801514, + "learning_rate": 4.9973212525907176e-05, + "loss": 1.4497, + "step": 985 + }, + { + "epoch": 0.3, + "grad_norm": 1.4693922996520996, + "learning_rate": 4.997293993081251e-05, + "loss": 1.4971, + "step": 990 + }, + { + "epoch": 0.3, + "grad_norm": 0.7837210893630981, + "learning_rate": 4.997266595648834e-05, + "loss": 1.4497, + "step": 995 + }, + { + "epoch": 0.3, + "grad_norm": 1.0610207319259644, + "learning_rate": 4.997239060294978e-05, + "loss": 1.3558, + "step": 1000 + }, + { + "epoch": 0.3, + "grad_norm": 0.9275992512702942, + "learning_rate": 4.997211387021204e-05, + "loss": 1.3527, + "step": 1005 + }, + { + "epoch": 0.3, + "grad_norm": 0.9326786994934082, + "learning_rate": 4.997183575829042e-05, + "loss": 1.4296, + "step": 1010 + }, + { + "epoch": 0.3, + "grad_norm": 1.0663551092147827, + "learning_rate": 4.997155626720026e-05, + "loss": 1.4676, + "step": 1015 + }, + { + "epoch": 0.31, + "grad_norm": 0.5471872091293335, + "learning_rate": 4.997127539695701e-05, + "loss": 1.2381, + "step": 1020 + }, + { + "epoch": 0.31, + "grad_norm": 1.2285401821136475, + "learning_rate": 4.997099314757617e-05, + "loss": 1.3078, + "step": 1025 + }, + { + "epoch": 0.31, + "grad_norm": 0.5964838862419128, + "learning_rate": 4.9970709519073334e-05, + "loss": 1.4322, + "step": 1030 + }, + { + "epoch": 0.31, + "grad_norm": 6.244211196899414, + "learning_rate": 4.997042451146417e-05, + "loss": 1.3203, + "step": 1035 + }, + { + "epoch": 0.31, + "grad_norm": 0.8410915732383728, + "learning_rate": 4.997013812476442e-05, + "loss": 1.6386, + "step": 1040 + }, + { + "epoch": 0.31, + "grad_norm": 1.518509030342102, + "learning_rate": 4.996985035898989e-05, + "loss": 1.4518, + "step": 1045 + }, + { + "epoch": 0.31, + "grad_norm": 0.8771889209747314, + "learning_rate": 4.996956121415648e-05, + "loss": 1.3745, + "step": 1050 + }, + { + "epoch": 0.32, + "grad_norm": 1.0236855745315552, + "learning_rate": 4.996927069028016e-05, + "loss": 1.3879, + "step": 1055 + }, + { + "epoch": 0.32, + "grad_norm": 1.9780511856079102, + "learning_rate": 4.996897878737697e-05, + "loss": 1.3488, + "step": 1060 + }, + { + "epoch": 0.32, + "grad_norm": 0.8157287836074829, + "learning_rate": 4.9968685505463036e-05, + "loss": 1.2988, + "step": 1065 + }, + { + "epoch": 0.32, + "grad_norm": 1.7686500549316406, + "learning_rate": 4.9968390844554556e-05, + "loss": 1.5909, + "step": 1070 + }, + { + "epoch": 0.32, + "grad_norm": 1.0402336120605469, + "learning_rate": 4.9968094804667805e-05, + "loss": 1.3257, + "step": 1075 + }, + { + "epoch": 0.32, + "grad_norm": 0.8256892561912537, + "learning_rate": 4.9967797385819135e-05, + "loss": 1.3697, + "step": 1080 + }, + { + "epoch": 0.32, + "grad_norm": 0.6995580792427063, + "learning_rate": 4.9967498588024956e-05, + "loss": 1.3315, + "step": 1085 + }, + { + "epoch": 0.33, + "grad_norm": 0.6209663152694702, + "learning_rate": 4.996719841130179e-05, + "loss": 1.3986, + "step": 1090 + }, + { + "epoch": 0.33, + "grad_norm": 0.9525293707847595, + "learning_rate": 4.996689685566621e-05, + "loss": 1.5316, + "step": 1095 + }, + { + "epoch": 0.33, + "grad_norm": 0.5040790438652039, + "learning_rate": 4.9966593921134863e-05, + "loss": 1.5408, + "step": 1100 + }, + { + "epoch": 0.33, + "grad_norm": 1.3051776885986328, + "learning_rate": 4.996628960772449e-05, + "loss": 1.4065, + "step": 1105 + }, + { + "epoch": 0.33, + "grad_norm": 1.1322470903396606, + "learning_rate": 4.996598391545189e-05, + "loss": 1.3907, + "step": 1110 + }, + { + "epoch": 0.33, + "grad_norm": 1.1070497035980225, + "learning_rate": 4.996567684433395e-05, + "loss": 1.4401, + "step": 1115 + }, + { + "epoch": 0.34, + "grad_norm": 2.0832529067993164, + "learning_rate": 4.996536839438762e-05, + "loss": 1.3753, + "step": 1120 + }, + { + "epoch": 0.34, + "grad_norm": 1.2609474658966064, + "learning_rate": 4.996505856562995e-05, + "loss": 1.4671, + "step": 1125 + }, + { + "epoch": 0.34, + "grad_norm": 0.5162214040756226, + "learning_rate": 4.996474735807805e-05, + "loss": 1.3594, + "step": 1130 + }, + { + "epoch": 0.34, + "grad_norm": 0.6403785347938538, + "learning_rate": 4.9964434771749105e-05, + "loss": 1.441, + "step": 1135 + }, + { + "epoch": 0.34, + "grad_norm": 1.0005476474761963, + "learning_rate": 4.996412080666036e-05, + "loss": 1.323, + "step": 1140 + }, + { + "epoch": 0.34, + "grad_norm": 0.7727034687995911, + "learning_rate": 4.996380546282919e-05, + "loss": 1.4661, + "step": 1145 + }, + { + "epoch": 0.34, + "grad_norm": 1.1217765808105469, + "learning_rate": 4.996348874027298e-05, + "loss": 1.4096, + "step": 1150 + }, + { + "epoch": 0.35, + "grad_norm": 0.5037316083908081, + "learning_rate": 4.9963170639009246e-05, + "loss": 1.3398, + "step": 1155 + }, + { + "epoch": 0.35, + "grad_norm": 1.459764838218689, + "learning_rate": 4.996285115905554e-05, + "loss": 1.5387, + "step": 1160 + }, + { + "epoch": 0.35, + "grad_norm": 0.856709897518158, + "learning_rate": 4.996253030042951e-05, + "loss": 1.4499, + "step": 1165 + }, + { + "epoch": 0.35, + "grad_norm": 0.6828152537345886, + "learning_rate": 4.996220806314888e-05, + "loss": 1.3039, + "step": 1170 + }, + { + "epoch": 0.35, + "grad_norm": 1.1198612451553345, + "learning_rate": 4.996188444723144e-05, + "loss": 1.356, + "step": 1175 + }, + { + "epoch": 0.35, + "grad_norm": 0.6652525067329407, + "learning_rate": 4.996155945269507e-05, + "loss": 1.3835, + "step": 1180 + }, + { + "epoch": 0.35, + "grad_norm": 1.4609639644622803, + "learning_rate": 4.996123307955773e-05, + "loss": 1.4632, + "step": 1185 + }, + { + "epoch": 0.36, + "grad_norm": 1.3598580360412598, + "learning_rate": 4.996090532783742e-05, + "loss": 1.4897, + "step": 1190 + }, + { + "epoch": 0.36, + "grad_norm": 1.24635910987854, + "learning_rate": 4.996057619755225e-05, + "loss": 1.417, + "step": 1195 + }, + { + "epoch": 0.36, + "grad_norm": 1.0729330778121948, + "learning_rate": 4.996024568872042e-05, + "loss": 1.3506, + "step": 1200 + }, + { + "epoch": 0.36, + "grad_norm": 0.6681000590324402, + "learning_rate": 4.9959913801360156e-05, + "loss": 1.3517, + "step": 1205 + }, + { + "epoch": 0.36, + "grad_norm": 0.8434261679649353, + "learning_rate": 4.995958053548979e-05, + "loss": 1.6053, + "step": 1210 + }, + { + "epoch": 0.36, + "grad_norm": 0.7810892462730408, + "learning_rate": 4.9959245891127745e-05, + "loss": 1.4025, + "step": 1215 + }, + { + "epoch": 0.37, + "grad_norm": 0.6281015276908875, + "learning_rate": 4.995890986829249e-05, + "loss": 1.2792, + "step": 1220 + }, + { + "epoch": 0.37, + "grad_norm": 1.185341477394104, + "learning_rate": 4.9958572467002586e-05, + "loss": 1.5974, + "step": 1225 + }, + { + "epoch": 0.37, + "grad_norm": 0.9366752505302429, + "learning_rate": 4.995823368727667e-05, + "loss": 1.3961, + "step": 1230 + }, + { + "epoch": 0.37, + "grad_norm": 2.3230159282684326, + "learning_rate": 4.995789352913345e-05, + "loss": 1.5296, + "step": 1235 + }, + { + "epoch": 0.37, + "grad_norm": 1.690661907196045, + "learning_rate": 4.995755199259171e-05, + "loss": 1.522, + "step": 1240 + }, + { + "epoch": 0.37, + "grad_norm": 1.1200650930404663, + "learning_rate": 4.995720907767031e-05, + "loss": 1.6149, + "step": 1245 + }, + { + "epoch": 0.37, + "grad_norm": 0.8390517830848694, + "learning_rate": 4.9956864784388204e-05, + "loss": 1.4568, + "step": 1250 + }, + { + "epoch": 0.38, + "grad_norm": 0.49734947085380554, + "learning_rate": 4.9956519112764385e-05, + "loss": 1.4331, + "step": 1255 + }, + { + "epoch": 0.38, + "grad_norm": 1.3497544527053833, + "learning_rate": 4.995617206281797e-05, + "loss": 1.4477, + "step": 1260 + }, + { + "epoch": 0.38, + "grad_norm": 0.4694572687149048, + "learning_rate": 4.9955823634568105e-05, + "loss": 1.3934, + "step": 1265 + }, + { + "epoch": 0.38, + "grad_norm": 0.5619333982467651, + "learning_rate": 4.9955473828034045e-05, + "loss": 1.5304, + "step": 1270 + }, + { + "epoch": 0.38, + "grad_norm": 0.7059439420700073, + "learning_rate": 4.99551226432351e-05, + "loss": 1.548, + "step": 1275 + }, + { + "epoch": 0.38, + "grad_norm": 2.1901280879974365, + "learning_rate": 4.995477008019067e-05, + "loss": 1.5823, + "step": 1280 + }, + { + "epoch": 0.38, + "grad_norm": 0.7577235698699951, + "learning_rate": 4.9954416138920235e-05, + "loss": 1.3385, + "step": 1285 + }, + { + "epoch": 0.39, + "grad_norm": 1.0513741970062256, + "learning_rate": 4.995406081944333e-05, + "loss": 1.5494, + "step": 1290 + }, + { + "epoch": 0.39, + "grad_norm": 0.9030594229698181, + "learning_rate": 4.995370412177959e-05, + "loss": 1.5816, + "step": 1295 + }, + { + "epoch": 0.39, + "grad_norm": 0.6190074682235718, + "learning_rate": 4.99533460459487e-05, + "loss": 1.4034, + "step": 1300 + }, + { + "epoch": 0.39, + "grad_norm": 0.6334771513938904, + "learning_rate": 4.995298659197045e-05, + "loss": 1.4598, + "step": 1305 + }, + { + "epoch": 0.39, + "grad_norm": 1.5323703289031982, + "learning_rate": 4.9952625759864694e-05, + "loss": 1.3749, + "step": 1310 + }, + { + "epoch": 0.39, + "grad_norm": 1.3500525951385498, + "learning_rate": 4.9952263549651346e-05, + "loss": 1.38, + "step": 1315 + }, + { + "epoch": 0.39, + "grad_norm": 0.885956883430481, + "learning_rate": 4.995189996135042e-05, + "loss": 1.4463, + "step": 1320 + }, + { + "epoch": 0.4, + "grad_norm": 0.774631917476654, + "learning_rate": 4.9951534994981994e-05, + "loss": 1.24, + "step": 1325 + }, + { + "epoch": 0.4, + "grad_norm": 0.670936644077301, + "learning_rate": 4.9951168650566226e-05, + "loss": 1.348, + "step": 1330 + }, + { + "epoch": 0.4, + "grad_norm": 0.5310854315757751, + "learning_rate": 4.995080092812335e-05, + "loss": 1.4768, + "step": 1335 + }, + { + "epoch": 0.4, + "grad_norm": 0.7614613771438599, + "learning_rate": 4.995043182767368e-05, + "loss": 1.4268, + "step": 1340 + }, + { + "epoch": 0.4, + "grad_norm": 1.5375897884368896, + "learning_rate": 4.995006134923759e-05, + "loss": 1.3634, + "step": 1345 + }, + { + "epoch": 0.4, + "grad_norm": 0.8461021780967712, + "learning_rate": 4.994968949283554e-05, + "loss": 1.4866, + "step": 1350 + }, + { + "epoch": 0.41, + "grad_norm": 0.7309428453445435, + "learning_rate": 4.994931625848808e-05, + "loss": 1.5075, + "step": 1355 + }, + { + "epoch": 0.41, + "grad_norm": 1.151624321937561, + "learning_rate": 4.994894164621581e-05, + "loss": 1.4178, + "step": 1360 + }, + { + "epoch": 0.41, + "grad_norm": 0.812978208065033, + "learning_rate": 4.9948565656039434e-05, + "loss": 1.2951, + "step": 1365 + }, + { + "epoch": 0.41, + "grad_norm": 0.8754581212997437, + "learning_rate": 4.99481882879797e-05, + "loss": 1.4593, + "step": 1370 + }, + { + "epoch": 0.41, + "grad_norm": 1.1998158693313599, + "learning_rate": 4.994780954205747e-05, + "loss": 1.3958, + "step": 1375 + }, + { + "epoch": 0.41, + "grad_norm": 0.9774079918861389, + "learning_rate": 4.994742941829364e-05, + "loss": 1.3993, + "step": 1380 + }, + { + "epoch": 0.41, + "grad_norm": 0.7645953297615051, + "learning_rate": 4.9947047916709224e-05, + "loss": 1.4456, + "step": 1385 + }, + { + "epoch": 0.42, + "grad_norm": 0.7388116717338562, + "learning_rate": 4.994666503732528e-05, + "loss": 1.4401, + "step": 1390 + }, + { + "epoch": 0.42, + "grad_norm": 1.9941197633743286, + "learning_rate": 4.994628078016296e-05, + "loss": 1.4012, + "step": 1395 + }, + { + "epoch": 0.42, + "grad_norm": 1.6217386722564697, + "learning_rate": 4.994589514524347e-05, + "loss": 1.5009, + "step": 1400 + }, + { + "epoch": 0.42, + "grad_norm": 1.7688075304031372, + "learning_rate": 4.9945508132588134e-05, + "loss": 1.332, + "step": 1405 + }, + { + "epoch": 0.42, + "grad_norm": 0.620936393737793, + "learning_rate": 4.994511974221831e-05, + "loss": 1.4864, + "step": 1410 + }, + { + "epoch": 0.42, + "grad_norm": 0.8650867938995361, + "learning_rate": 4.9944729974155444e-05, + "loss": 1.5008, + "step": 1415 + }, + { + "epoch": 0.42, + "grad_norm": 1.145591139793396, + "learning_rate": 4.994433882842108e-05, + "loss": 1.242, + "step": 1420 + }, + { + "epoch": 0.43, + "grad_norm": 1.6311044692993164, + "learning_rate": 4.9943946305036806e-05, + "loss": 1.6072, + "step": 1425 + }, + { + "epoch": 0.43, + "grad_norm": 0.777138888835907, + "learning_rate": 4.9943552404024303e-05, + "loss": 1.4589, + "step": 1430 + }, + { + "epoch": 0.43, + "grad_norm": 1.3018752336502075, + "learning_rate": 4.994315712540533e-05, + "loss": 1.5466, + "step": 1435 + }, + { + "epoch": 0.43, + "grad_norm": 1.3640024662017822, + "learning_rate": 4.994276046920172e-05, + "loss": 1.5863, + "step": 1440 + }, + { + "epoch": 0.43, + "grad_norm": 1.0018880367279053, + "learning_rate": 4.994236243543537e-05, + "loss": 1.3074, + "step": 1445 + }, + { + "epoch": 0.43, + "grad_norm": 0.5440133810043335, + "learning_rate": 4.994196302412827e-05, + "loss": 1.5295, + "step": 1450 + }, + { + "epoch": 0.44, + "grad_norm": 2.098235845565796, + "learning_rate": 4.994156223530248e-05, + "loss": 1.4089, + "step": 1455 + }, + { + "epoch": 0.44, + "grad_norm": 1.157474398612976, + "learning_rate": 4.994116006898013e-05, + "loss": 1.4596, + "step": 1460 + }, + { + "epoch": 0.44, + "grad_norm": 1.6245862245559692, + "learning_rate": 4.994075652518344e-05, + "loss": 1.3593, + "step": 1465 + }, + { + "epoch": 0.44, + "grad_norm": 1.1995525360107422, + "learning_rate": 4.994035160393469e-05, + "loss": 1.4744, + "step": 1470 + }, + { + "epoch": 0.44, + "grad_norm": 0.7864271402359009, + "learning_rate": 4.993994530525624e-05, + "loss": 1.3689, + "step": 1475 + }, + { + "epoch": 0.44, + "grad_norm": 0.8403075337409973, + "learning_rate": 4.9939537629170544e-05, + "loss": 1.4856, + "step": 1480 + }, + { + "epoch": 0.44, + "grad_norm": 0.5901763439178467, + "learning_rate": 4.99391285757001e-05, + "loss": 1.3932, + "step": 1485 + }, + { + "epoch": 0.45, + "grad_norm": 0.9339320063591003, + "learning_rate": 4.993871814486751e-05, + "loss": 1.5516, + "step": 1490 + }, + { + "epoch": 0.45, + "grad_norm": 1.3036901950836182, + "learning_rate": 4.993830633669544e-05, + "loss": 1.4868, + "step": 1495 + }, + { + "epoch": 0.45, + "grad_norm": 0.6879803538322449, + "learning_rate": 4.9937893151206626e-05, + "loss": 1.38, + "step": 1500 + }, + { + "epoch": 0.45, + "grad_norm": 0.7101956009864807, + "learning_rate": 4.99374785884239e-05, + "loss": 1.3969, + "step": 1505 + }, + { + "epoch": 0.45, + "grad_norm": 1.220004916191101, + "learning_rate": 4.9937062648370154e-05, + "loss": 1.4985, + "step": 1510 + }, + { + "epoch": 0.45, + "grad_norm": 0.5632555484771729, + "learning_rate": 4.993664533106835e-05, + "loss": 1.3545, + "step": 1515 + }, + { + "epoch": 0.45, + "grad_norm": 1.4013103246688843, + "learning_rate": 4.993622663654156e-05, + "loss": 1.4475, + "step": 1520 + }, + { + "epoch": 0.46, + "grad_norm": 0.9690436124801636, + "learning_rate": 4.993580656481288e-05, + "loss": 1.3327, + "step": 1525 + }, + { + "epoch": 0.46, + "grad_norm": 0.6049666404724121, + "learning_rate": 4.993538511590553e-05, + "loss": 1.1947, + "step": 1530 + }, + { + "epoch": 0.46, + "grad_norm": 0.7417116165161133, + "learning_rate": 4.993496228984278e-05, + "loss": 1.5365, + "step": 1535 + }, + { + "epoch": 0.46, + "grad_norm": 1.1022894382476807, + "learning_rate": 4.993453808664797e-05, + "loss": 1.4447, + "step": 1540 + }, + { + "epoch": 0.46, + "grad_norm": 0.834717333316803, + "learning_rate": 4.993411250634455e-05, + "loss": 1.4464, + "step": 1545 + }, + { + "epoch": 0.46, + "grad_norm": 0.6041282415390015, + "learning_rate": 4.9933685548956014e-05, + "loss": 1.388, + "step": 1550 + }, + { + "epoch": 0.47, + "grad_norm": 0.8697533011436462, + "learning_rate": 4.993325721450594e-05, + "loss": 1.425, + "step": 1555 + }, + { + "epoch": 0.47, + "grad_norm": 1.2372421026229858, + "learning_rate": 4.993282750301799e-05, + "loss": 1.3586, + "step": 1560 + }, + { + "epoch": 0.47, + "grad_norm": 2.791959285736084, + "learning_rate": 4.993239641451588e-05, + "loss": 1.7594, + "step": 1565 + }, + { + "epoch": 0.47, + "grad_norm": 3.356498956680298, + "learning_rate": 4.993196394902344e-05, + "loss": 1.4429, + "step": 1570 + }, + { + "epoch": 0.47, + "grad_norm": 0.8572433590888977, + "learning_rate": 4.993153010656455e-05, + "loss": 1.3813, + "step": 1575 + }, + { + "epoch": 0.47, + "grad_norm": 1.199879765510559, + "learning_rate": 4.993109488716316e-05, + "loss": 1.2936, + "step": 1580 + }, + { + "epoch": 0.47, + "grad_norm": 0.6142605543136597, + "learning_rate": 4.993065829084332e-05, + "loss": 1.3402, + "step": 1585 + }, + { + "epoch": 0.48, + "grad_norm": 0.4303763806819916, + "learning_rate": 4.993022031762914e-05, + "loss": 1.3431, + "step": 1590 + }, + { + "epoch": 0.48, + "grad_norm": 0.7792863249778748, + "learning_rate": 4.99297809675448e-05, + "loss": 1.5333, + "step": 1595 + }, + { + "epoch": 0.48, + "grad_norm": 1.015516996383667, + "learning_rate": 4.992934024061456e-05, + "loss": 1.5036, + "step": 1600 + }, + { + "epoch": 0.48, + "grad_norm": 0.5127567648887634, + "learning_rate": 4.992889813686279e-05, + "loss": 1.3214, + "step": 1605 + }, + { + "epoch": 0.48, + "grad_norm": 1.4127864837646484, + "learning_rate": 4.992845465631388e-05, + "loss": 1.4556, + "step": 1610 + }, + { + "epoch": 0.48, + "grad_norm": 2.0938849449157715, + "learning_rate": 4.992800979899233e-05, + "loss": 1.372, + "step": 1615 + }, + { + "epoch": 0.48, + "grad_norm": 0.7442615032196045, + "learning_rate": 4.992756356492271e-05, + "loss": 1.293, + "step": 1620 + }, + { + "epoch": 0.49, + "grad_norm": 0.879451334476471, + "learning_rate": 4.9927115954129665e-05, + "loss": 1.4263, + "step": 1625 + }, + { + "epoch": 0.49, + "grad_norm": 0.867012619972229, + "learning_rate": 4.9926666966637914e-05, + "loss": 1.3219, + "step": 1630 + }, + { + "epoch": 0.49, + "grad_norm": 2.4864187240600586, + "learning_rate": 4.992621660247226e-05, + "loss": 1.5152, + "step": 1635 + }, + { + "epoch": 0.49, + "grad_norm": 1.02019202709198, + "learning_rate": 4.992576486165758e-05, + "loss": 1.3683, + "step": 1640 + }, + { + "epoch": 0.49, + "grad_norm": 0.8681535720825195, + "learning_rate": 4.99253117442188e-05, + "loss": 1.4345, + "step": 1645 + }, + { + "epoch": 0.49, + "grad_norm": 0.6605016589164734, + "learning_rate": 4.992485725018097e-05, + "loss": 1.3767, + "step": 1650 + }, + { + "epoch": 0.5, + "grad_norm": 1.7456071376800537, + "learning_rate": 4.9924401379569174e-05, + "loss": 1.17, + "step": 1655 + }, + { + "epoch": 0.5, + "grad_norm": 1.299971342086792, + "learning_rate": 4.992394413240861e-05, + "loss": 1.545, + "step": 1660 + }, + { + "epoch": 0.5, + "grad_norm": 0.6303769946098328, + "learning_rate": 4.992348550872451e-05, + "loss": 1.3899, + "step": 1665 + }, + { + "epoch": 0.5, + "grad_norm": 0.5625646710395813, + "learning_rate": 4.992302550854221e-05, + "loss": 1.3778, + "step": 1670 + }, + { + "epoch": 0.5, + "grad_norm": 1.0175740718841553, + "learning_rate": 4.992256413188712e-05, + "loss": 1.3702, + "step": 1675 + }, + { + "epoch": 0.5, + "grad_norm": 0.6424248814582825, + "learning_rate": 4.992210137878472e-05, + "loss": 1.3868, + "step": 1680 + }, + { + "epoch": 0.5, + "grad_norm": 1.0193471908569336, + "learning_rate": 4.992163724926057e-05, + "loss": 1.3882, + "step": 1685 + }, + { + "epoch": 0.51, + "grad_norm": 0.6782774925231934, + "learning_rate": 4.992117174334029e-05, + "loss": 1.4042, + "step": 1690 + }, + { + "epoch": 0.51, + "grad_norm": 0.7803168892860413, + "learning_rate": 4.99207048610496e-05, + "loss": 1.2801, + "step": 1695 + }, + { + "epoch": 0.51, + "grad_norm": 0.7628136873245239, + "learning_rate": 4.992023660241429e-05, + "loss": 1.4438, + "step": 1700 + }, + { + "epoch": 0.51, + "grad_norm": 0.7955535054206848, + "learning_rate": 4.991976696746021e-05, + "loss": 1.2711, + "step": 1705 + }, + { + "epoch": 0.51, + "grad_norm": 1.0430067777633667, + "learning_rate": 4.991929595621331e-05, + "loss": 1.3843, + "step": 1710 + }, + { + "epoch": 0.51, + "grad_norm": 1.0455753803253174, + "learning_rate": 4.991882356869959e-05, + "loss": 1.3456, + "step": 1715 + }, + { + "epoch": 0.51, + "grad_norm": 0.5499788522720337, + "learning_rate": 4.9918349804945154e-05, + "loss": 1.4326, + "step": 1720 + }, + { + "epoch": 0.52, + "grad_norm": 0.6572498083114624, + "learning_rate": 4.991787466497615e-05, + "loss": 1.5624, + "step": 1725 + }, + { + "epoch": 0.52, + "grad_norm": 1.2623273134231567, + "learning_rate": 4.9917398148818836e-05, + "loss": 1.5035, + "step": 1730 + }, + { + "epoch": 0.52, + "grad_norm": 1.5261033773422241, + "learning_rate": 4.991692025649952e-05, + "loss": 1.492, + "step": 1735 + }, + { + "epoch": 0.52, + "grad_norm": 0.8616695404052734, + "learning_rate": 4.991644098804459e-05, + "loss": 1.3531, + "step": 1740 + }, + { + "epoch": 0.52, + "grad_norm": 1.3723504543304443, + "learning_rate": 4.991596034348053e-05, + "loss": 1.5159, + "step": 1745 + }, + { + "epoch": 0.52, + "grad_norm": 0.5867530703544617, + "learning_rate": 4.991547832283389e-05, + "loss": 1.5382, + "step": 1750 + }, + { + "epoch": 0.53, + "grad_norm": 1.0854787826538086, + "learning_rate": 4.9914994926131265e-05, + "loss": 1.4862, + "step": 1755 + }, + { + "epoch": 0.53, + "grad_norm": 1.023762822151184, + "learning_rate": 4.991451015339937e-05, + "loss": 1.4294, + "step": 1760 + }, + { + "epoch": 0.53, + "grad_norm": 1.219565749168396, + "learning_rate": 4.9914024004664986e-05, + "loss": 1.3301, + "step": 1765 + }, + { + "epoch": 0.53, + "grad_norm": 0.8434814214706421, + "learning_rate": 4.991353647995494e-05, + "loss": 1.5169, + "step": 1770 + }, + { + "epoch": 0.53, + "grad_norm": 1.8482433557510376, + "learning_rate": 4.9913047579296177e-05, + "loss": 1.4846, + "step": 1775 + }, + { + "epoch": 0.53, + "grad_norm": 0.639321506023407, + "learning_rate": 4.991255730271569e-05, + "loss": 1.3135, + "step": 1780 + }, + { + "epoch": 0.53, + "grad_norm": 1.1192939281463623, + "learning_rate": 4.991206565024056e-05, + "loss": 1.3945, + "step": 1785 + }, + { + "epoch": 0.54, + "grad_norm": 0.9012823700904846, + "learning_rate": 4.991157262189794e-05, + "loss": 1.2153, + "step": 1790 + }, + { + "epoch": 0.54, + "grad_norm": 1.0617835521697998, + "learning_rate": 4.991107821771506e-05, + "loss": 1.3468, + "step": 1795 + }, + { + "epoch": 0.54, + "grad_norm": 1.25299870967865, + "learning_rate": 4.991058243771922e-05, + "loss": 1.436, + "step": 1800 + }, + { + "epoch": 0.54, + "grad_norm": 0.779280960559845, + "learning_rate": 4.9910085281937804e-05, + "loss": 1.3551, + "step": 1805 + }, + { + "epoch": 0.54, + "grad_norm": 0.6145398020744324, + "learning_rate": 4.9909586750398274e-05, + "loss": 1.2627, + "step": 1810 + }, + { + "epoch": 0.54, + "grad_norm": 0.7118735313415527, + "learning_rate": 4.9909086843128154e-05, + "loss": 1.5455, + "step": 1815 + }, + { + "epoch": 0.54, + "grad_norm": 1.2611353397369385, + "learning_rate": 4.990858556015506e-05, + "loss": 1.4429, + "step": 1820 + }, + { + "epoch": 0.55, + "grad_norm": 1.1818840503692627, + "learning_rate": 4.990808290150668e-05, + "loss": 1.586, + "step": 1825 + }, + { + "epoch": 0.55, + "grad_norm": 0.6355205774307251, + "learning_rate": 4.990757886721077e-05, + "loss": 1.3538, + "step": 1830 + }, + { + "epoch": 0.55, + "grad_norm": 0.7018294334411621, + "learning_rate": 4.990707345729517e-05, + "loss": 1.5147, + "step": 1835 + }, + { + "epoch": 0.55, + "grad_norm": 1.5799555778503418, + "learning_rate": 4.99065666717878e-05, + "loss": 1.4748, + "step": 1840 + }, + { + "epoch": 0.55, + "grad_norm": 1.1363238096237183, + "learning_rate": 4.990605851071664e-05, + "loss": 1.3812, + "step": 1845 + }, + { + "epoch": 0.55, + "grad_norm": 0.8724756836891174, + "learning_rate": 4.9905548974109746e-05, + "loss": 1.4634, + "step": 1850 + }, + { + "epoch": 0.55, + "grad_norm": 1.3491142988204956, + "learning_rate": 4.990503806199527e-05, + "loss": 1.4406, + "step": 1855 + }, + { + "epoch": 0.56, + "grad_norm": 0.8592522740364075, + "learning_rate": 4.990452577440144e-05, + "loss": 1.55, + "step": 1860 + }, + { + "epoch": 0.56, + "grad_norm": 0.8113953471183777, + "learning_rate": 4.9904012111356536e-05, + "loss": 1.5138, + "step": 1865 + }, + { + "epoch": 0.56, + "grad_norm": 0.8070884943008423, + "learning_rate": 4.990349707288892e-05, + "loss": 1.5115, + "step": 1870 + }, + { + "epoch": 0.56, + "grad_norm": 1.018451452255249, + "learning_rate": 4.990298065902706e-05, + "loss": 1.4229, + "step": 1875 + }, + { + "epoch": 0.56, + "grad_norm": 0.5133267641067505, + "learning_rate": 4.9902462869799446e-05, + "loss": 1.1838, + "step": 1880 + }, + { + "epoch": 0.56, + "grad_norm": 0.795879065990448, + "learning_rate": 4.990194370523471e-05, + "loss": 1.5473, + "step": 1885 + }, + { + "epoch": 0.57, + "grad_norm": 0.8955612182617188, + "learning_rate": 4.990142316536149e-05, + "loss": 1.4088, + "step": 1890 + }, + { + "epoch": 0.57, + "grad_norm": 1.3825359344482422, + "learning_rate": 4.990090125020857e-05, + "loss": 1.3161, + "step": 1895 + }, + { + "epoch": 0.57, + "grad_norm": 0.9346737265586853, + "learning_rate": 4.990037795980474e-05, + "loss": 1.3944, + "step": 1900 + }, + { + "epoch": 0.57, + "grad_norm": 0.5142995119094849, + "learning_rate": 4.989985329417893e-05, + "loss": 1.3262, + "step": 1905 + }, + { + "epoch": 0.57, + "grad_norm": 1.0340321063995361, + "learning_rate": 4.98993272533601e-05, + "loss": 1.4552, + "step": 1910 + }, + { + "epoch": 0.57, + "grad_norm": 0.5901709794998169, + "learning_rate": 4.989879983737732e-05, + "loss": 1.2281, + "step": 1915 + }, + { + "epoch": 0.57, + "grad_norm": 1.0993763208389282, + "learning_rate": 4.989827104625969e-05, + "loss": 1.4189, + "step": 1920 + }, + { + "epoch": 0.58, + "grad_norm": 0.9149566292762756, + "learning_rate": 4.989774088003644e-05, + "loss": 1.5347, + "step": 1925 + }, + { + "epoch": 0.58, + "grad_norm": 0.6457690596580505, + "learning_rate": 4.989720933873683e-05, + "loss": 1.2577, + "step": 1930 + }, + { + "epoch": 0.58, + "grad_norm": 0.5968732237815857, + "learning_rate": 4.989667642239023e-05, + "loss": 1.3495, + "step": 1935 + }, + { + "epoch": 0.58, + "grad_norm": 0.9515772461891174, + "learning_rate": 4.989614213102608e-05, + "loss": 1.4536, + "step": 1940 + }, + { + "epoch": 0.58, + "grad_norm": 0.9571760892868042, + "learning_rate": 4.989560646467387e-05, + "loss": 1.4527, + "step": 1945 + }, + { + "epoch": 0.58, + "grad_norm": 0.5556336045265198, + "learning_rate": 4.989506942336319e-05, + "loss": 1.414, + "step": 1950 + }, + { + "epoch": 0.58, + "grad_norm": 1.146568775177002, + "learning_rate": 4.989453100712371e-05, + "loss": 1.3903, + "step": 1955 + }, + { + "epoch": 0.59, + "grad_norm": 0.47517749667167664, + "learning_rate": 4.989399121598515e-05, + "loss": 1.2966, + "step": 1960 + }, + { + "epoch": 0.59, + "grad_norm": 1.3402255773544312, + "learning_rate": 4.989345004997734e-05, + "loss": 1.4773, + "step": 1965 + }, + { + "epoch": 0.59, + "grad_norm": 1.027206540107727, + "learning_rate": 4.9892907509130156e-05, + "loss": 1.5167, + "step": 1970 + }, + { + "epoch": 0.59, + "grad_norm": 0.7391974925994873, + "learning_rate": 4.989236359347356e-05, + "loss": 1.492, + "step": 1975 + }, + { + "epoch": 0.59, + "grad_norm": 2.5391299724578857, + "learning_rate": 4.989181830303761e-05, + "loss": 1.3457, + "step": 1980 + }, + { + "epoch": 0.59, + "grad_norm": 0.5835283398628235, + "learning_rate": 4.9891271637852396e-05, + "loss": 1.3803, + "step": 1985 + }, + { + "epoch": 0.6, + "grad_norm": 1.2071964740753174, + "learning_rate": 4.9890723597948126e-05, + "loss": 1.4644, + "step": 1990 + }, + { + "epoch": 0.6, + "grad_norm": 0.8295852541923523, + "learning_rate": 4.989017418335507e-05, + "loss": 1.3478, + "step": 1995 + }, + { + "epoch": 0.6, + "grad_norm": 1.2899019718170166, + "learning_rate": 4.988962339410356e-05, + "loss": 1.5883, + "step": 2000 + }, + { + "epoch": 0.6, + "grad_norm": 0.8376094698905945, + "learning_rate": 4.988907123022401e-05, + "loss": 1.6507, + "step": 2005 + }, + { + "epoch": 0.6, + "grad_norm": 1.5938405990600586, + "learning_rate": 4.988851769174695e-05, + "loss": 1.5191, + "step": 2010 + }, + { + "epoch": 0.6, + "grad_norm": 0.7297775149345398, + "learning_rate": 4.988796277870291e-05, + "loss": 1.4755, + "step": 2015 + }, + { + "epoch": 0.6, + "grad_norm": 0.8325863480567932, + "learning_rate": 4.988740649112256e-05, + "loss": 1.455, + "step": 2020 + }, + { + "epoch": 0.61, + "grad_norm": 0.6817718148231506, + "learning_rate": 4.9886848829036624e-05, + "loss": 1.3709, + "step": 2025 + }, + { + "epoch": 0.61, + "grad_norm": 0.8563414812088013, + "learning_rate": 4.9886289792475894e-05, + "loss": 1.3933, + "step": 2030 + }, + { + "epoch": 0.61, + "grad_norm": 0.8419439792633057, + "learning_rate": 4.9885729381471244e-05, + "loss": 1.2793, + "step": 2035 + }, + { + "epoch": 0.61, + "grad_norm": 1.1437691450119019, + "learning_rate": 4.988516759605363e-05, + "loss": 1.4561, + "step": 2040 + }, + { + "epoch": 0.61, + "grad_norm": 1.0974056720733643, + "learning_rate": 4.9884604436254065e-05, + "loss": 1.3317, + "step": 2045 + }, + { + "epoch": 0.61, + "grad_norm": 0.7327514886856079, + "learning_rate": 4.9884039902103674e-05, + "loss": 1.5174, + "step": 2050 + }, + { + "epoch": 0.61, + "grad_norm": 0.5889623165130615, + "learning_rate": 4.9883473993633626e-05, + "loss": 1.4063, + "step": 2055 + }, + { + "epoch": 0.62, + "grad_norm": 1.461192011833191, + "learning_rate": 4.988290671087517e-05, + "loss": 1.4681, + "step": 2060 + }, + { + "epoch": 0.62, + "grad_norm": 0.7430127859115601, + "learning_rate": 4.9882338053859646e-05, + "loss": 1.3817, + "step": 2065 + }, + { + "epoch": 0.62, + "grad_norm": 1.4986989498138428, + "learning_rate": 4.988176802261845e-05, + "loss": 1.3504, + "step": 2070 + }, + { + "epoch": 0.62, + "grad_norm": 0.7772125601768494, + "learning_rate": 4.988119661718307e-05, + "loss": 1.36, + "step": 2075 + }, + { + "epoch": 0.62, + "grad_norm": 1.2904176712036133, + "learning_rate": 4.988062383758506e-05, + "loss": 1.4057, + "step": 2080 + }, + { + "epoch": 0.62, + "grad_norm": 1.2020084857940674, + "learning_rate": 4.9880049683856066e-05, + "loss": 1.5082, + "step": 2085 + }, + { + "epoch": 0.63, + "grad_norm": 0.7529141306877136, + "learning_rate": 4.987947415602778e-05, + "loss": 1.3725, + "step": 2090 + }, + { + "epoch": 0.63, + "grad_norm": 0.8067646622657776, + "learning_rate": 4.987889725413201e-05, + "loss": 1.3523, + "step": 2095 + }, + { + "epoch": 0.63, + "grad_norm": 0.7970091104507446, + "learning_rate": 4.987831897820059e-05, + "loss": 1.6039, + "step": 2100 + }, + { + "epoch": 0.63, + "grad_norm": 0.6938903331756592, + "learning_rate": 4.987773932826548e-05, + "loss": 1.4473, + "step": 2105 + }, + { + "epoch": 0.63, + "grad_norm": 0.8393217921257019, + "learning_rate": 4.9877158304358687e-05, + "loss": 1.4723, + "step": 2110 + }, + { + "epoch": 0.63, + "grad_norm": 1.2300384044647217, + "learning_rate": 4.987657590651229e-05, + "loss": 1.2791, + "step": 2115 + }, + { + "epoch": 0.63, + "grad_norm": 0.7245568633079529, + "learning_rate": 4.9875992134758476e-05, + "loss": 1.4415, + "step": 2120 + }, + { + "epoch": 0.64, + "grad_norm": 0.7715205550193787, + "learning_rate": 4.987540698912947e-05, + "loss": 1.4555, + "step": 2125 + }, + { + "epoch": 0.64, + "grad_norm": 0.9755416512489319, + "learning_rate": 4.987482046965759e-05, + "loss": 1.4118, + "step": 2130 + }, + { + "epoch": 0.64, + "grad_norm": 1.0845292806625366, + "learning_rate": 4.987423257637523e-05, + "loss": 1.4648, + "step": 2135 + }, + { + "epoch": 0.64, + "grad_norm": 0.7946663498878479, + "learning_rate": 4.9873643309314864e-05, + "loss": 1.4335, + "step": 2140 + }, + { + "epoch": 0.64, + "grad_norm": 1.767040491104126, + "learning_rate": 4.987305266850903e-05, + "loss": 1.3784, + "step": 2145 + }, + { + "epoch": 0.64, + "grad_norm": 1.0675404071807861, + "learning_rate": 4.987246065399035e-05, + "loss": 1.573, + "step": 2150 + }, + { + "epoch": 0.64, + "grad_norm": 1.144506573677063, + "learning_rate": 4.987186726579153e-05, + "loss": 1.3506, + "step": 2155 + }, + { + "epoch": 0.65, + "grad_norm": 0.6684775948524475, + "learning_rate": 4.987127250394532e-05, + "loss": 1.2895, + "step": 2160 + }, + { + "epoch": 0.65, + "grad_norm": 0.9211767911911011, + "learning_rate": 4.987067636848459e-05, + "loss": 1.2889, + "step": 2165 + }, + { + "epoch": 0.65, + "grad_norm": 1.1036546230316162, + "learning_rate": 4.987007885944226e-05, + "loss": 1.39, + "step": 2170 + }, + { + "epoch": 0.65, + "grad_norm": 0.8043128252029419, + "learning_rate": 4.986947997685132e-05, + "loss": 1.4297, + "step": 2175 + }, + { + "epoch": 0.65, + "grad_norm": 2.3548777103424072, + "learning_rate": 4.986887972074485e-05, + "loss": 1.4425, + "step": 2180 + }, + { + "epoch": 0.65, + "grad_norm": 1.4966070652008057, + "learning_rate": 4.9868278091156e-05, + "loss": 1.4042, + "step": 2185 + }, + { + "epoch": 0.66, + "grad_norm": 0.8505388498306274, + "learning_rate": 4.986767508811801e-05, + "loss": 1.4276, + "step": 2190 + }, + { + "epoch": 0.66, + "grad_norm": 1.0283442735671997, + "learning_rate": 4.986707071166417e-05, + "loss": 1.4598, + "step": 2195 + }, + { + "epoch": 0.66, + "grad_norm": 0.680404007434845, + "learning_rate": 4.9866464961827856e-05, + "loss": 1.4773, + "step": 2200 + }, + { + "epoch": 0.66, + "grad_norm": 0.575295090675354, + "learning_rate": 4.986585783864254e-05, + "loss": 1.3162, + "step": 2205 + }, + { + "epoch": 0.66, + "grad_norm": 0.7415833473205566, + "learning_rate": 4.9865249342141726e-05, + "loss": 1.4289, + "step": 2210 + }, + { + "epoch": 0.66, + "grad_norm": 0.8124527335166931, + "learning_rate": 4.986463947235905e-05, + "loss": 1.4606, + "step": 2215 + }, + { + "epoch": 0.66, + "grad_norm": 0.7351284623146057, + "learning_rate": 4.9864028229328186e-05, + "loss": 1.5149, + "step": 2220 + }, + { + "epoch": 0.67, + "grad_norm": 0.9113266468048096, + "learning_rate": 4.9863415613082876e-05, + "loss": 1.5249, + "step": 2225 + }, + { + "epoch": 0.67, + "grad_norm": 1.276861310005188, + "learning_rate": 4.986280162365697e-05, + "loss": 1.4372, + "step": 2230 + }, + { + "epoch": 0.67, + "grad_norm": 0.6544708013534546, + "learning_rate": 4.9862186261084374e-05, + "loss": 1.3714, + "step": 2235 + }, + { + "epoch": 0.67, + "grad_norm": 0.9754915833473206, + "learning_rate": 4.986156952539908e-05, + "loss": 1.3048, + "step": 2240 + }, + { + "epoch": 0.67, + "grad_norm": 0.7614801526069641, + "learning_rate": 4.9860951416635126e-05, + "loss": 1.4061, + "step": 2245 + }, + { + "epoch": 0.67, + "grad_norm": 1.1788151264190674, + "learning_rate": 4.986033193482668e-05, + "loss": 1.2955, + "step": 2250 + }, + { + "epoch": 0.67, + "grad_norm": 0.917193591594696, + "learning_rate": 4.9859711080007944e-05, + "loss": 1.351, + "step": 2255 + }, + { + "epoch": 0.68, + "grad_norm": 0.7633816003799438, + "learning_rate": 4.9859088852213196e-05, + "loss": 1.3889, + "step": 2260 + }, + { + "epoch": 0.68, + "grad_norm": 0.4886833429336548, + "learning_rate": 4.985846525147681e-05, + "loss": 1.4501, + "step": 2265 + }, + { + "epoch": 0.68, + "grad_norm": 0.9673319458961487, + "learning_rate": 4.9857840277833236e-05, + "loss": 1.4624, + "step": 2270 + }, + { + "epoch": 0.68, + "grad_norm": 0.7718215584754944, + "learning_rate": 4.9857213931316984e-05, + "loss": 1.4116, + "step": 2275 + }, + { + "epoch": 0.68, + "grad_norm": 0.6662101745605469, + "learning_rate": 4.985658621196263e-05, + "loss": 1.425, + "step": 2280 + }, + { + "epoch": 0.68, + "grad_norm": 0.7783111333847046, + "learning_rate": 4.985595711980486e-05, + "loss": 1.5157, + "step": 2285 + }, + { + "epoch": 0.69, + "grad_norm": 0.4051775634288788, + "learning_rate": 4.985532665487843e-05, + "loss": 1.3326, + "step": 2290 + }, + { + "epoch": 0.69, + "grad_norm": 0.8741587996482849, + "learning_rate": 4.9854694817218125e-05, + "loss": 1.3785, + "step": 2295 + }, + { + "epoch": 0.69, + "grad_norm": 0.6999533772468567, + "learning_rate": 4.985406160685887e-05, + "loss": 1.3553, + "step": 2300 + }, + { + "epoch": 0.69, + "grad_norm": 0.7438477277755737, + "learning_rate": 4.985342702383563e-05, + "loss": 1.4612, + "step": 2305 + }, + { + "epoch": 0.69, + "grad_norm": 1.4302748441696167, + "learning_rate": 4.985279106818345e-05, + "loss": 1.4045, + "step": 2310 + }, + { + "epoch": 0.69, + "grad_norm": 0.8926997780799866, + "learning_rate": 4.9852153739937444e-05, + "loss": 1.448, + "step": 2315 + }, + { + "epoch": 0.69, + "grad_norm": 0.7506667375564575, + "learning_rate": 4.9851515039132824e-05, + "loss": 1.3386, + "step": 2320 + }, + { + "epoch": 0.7, + "grad_norm": 0.7566189169883728, + "learning_rate": 4.985087496580485e-05, + "loss": 1.5076, + "step": 2325 + }, + { + "epoch": 0.7, + "grad_norm": 2.2634682655334473, + "learning_rate": 4.98502335199889e-05, + "loss": 1.5423, + "step": 2330 + }, + { + "epoch": 0.7, + "grad_norm": 1.0470796823501587, + "learning_rate": 4.984959070172037e-05, + "loss": 1.3496, + "step": 2335 + }, + { + "epoch": 0.7, + "grad_norm": 0.8911975026130676, + "learning_rate": 4.984894651103478e-05, + "loss": 1.4615, + "step": 2340 + }, + { + "epoch": 0.7, + "grad_norm": 0.8113840222358704, + "learning_rate": 4.98483009479677e-05, + "loss": 1.4067, + "step": 2345 + }, + { + "epoch": 0.7, + "grad_norm": 0.6011930704116821, + "learning_rate": 4.984765401255479e-05, + "loss": 1.3069, + "step": 2350 + }, + { + "epoch": 0.7, + "grad_norm": 0.4766760468482971, + "learning_rate": 4.984700570483178e-05, + "loss": 1.552, + "step": 2355 + }, + { + "epoch": 0.71, + "grad_norm": 1.0068391561508179, + "learning_rate": 4.984635602483447e-05, + "loss": 1.3886, + "step": 2360 + }, + { + "epoch": 0.71, + "grad_norm": 1.179116129875183, + "learning_rate": 4.984570497259874e-05, + "loss": 1.4761, + "step": 2365 + }, + { + "epoch": 0.71, + "grad_norm": 0.623014509677887, + "learning_rate": 4.9845052548160554e-05, + "loss": 1.627, + "step": 2370 + }, + { + "epoch": 0.71, + "grad_norm": 0.6420192718505859, + "learning_rate": 4.984439875155593e-05, + "loss": 1.2861, + "step": 2375 + }, + { + "epoch": 0.71, + "grad_norm": 0.6435323357582092, + "learning_rate": 4.9843743582821e-05, + "loss": 1.3463, + "step": 2380 + }, + { + "epoch": 0.71, + "grad_norm": 0.8503981828689575, + "learning_rate": 4.984308704199193e-05, + "loss": 1.4085, + "step": 2385 + }, + { + "epoch": 0.72, + "grad_norm": 0.8631937503814697, + "learning_rate": 4.984242912910499e-05, + "loss": 1.3638, + "step": 2390 + }, + { + "epoch": 0.72, + "grad_norm": 0.8973201513290405, + "learning_rate": 4.984176984419651e-05, + "loss": 1.3254, + "step": 2395 + }, + { + "epoch": 0.72, + "grad_norm": 0.8003751635551453, + "learning_rate": 4.9841109187302896e-05, + "loss": 1.3815, + "step": 2400 + }, + { + "epoch": 0.72, + "grad_norm": 1.4101213216781616, + "learning_rate": 4.984044715846065e-05, + "loss": 1.5289, + "step": 2405 + }, + { + "epoch": 0.72, + "grad_norm": 0.5882824063301086, + "learning_rate": 4.983978375770633e-05, + "loss": 1.369, + "step": 2410 + }, + { + "epoch": 0.72, + "grad_norm": 0.5322262644767761, + "learning_rate": 4.983911898507656e-05, + "loss": 1.6157, + "step": 2415 + }, + { + "epoch": 0.72, + "grad_norm": 1.2763606309890747, + "learning_rate": 4.9838452840608076e-05, + "loss": 1.4812, + "step": 2420 + }, + { + "epoch": 0.73, + "grad_norm": 0.9247528910636902, + "learning_rate": 4.983778532433766e-05, + "loss": 1.4012, + "step": 2425 + }, + { + "epoch": 0.73, + "grad_norm": 1.0083926916122437, + "learning_rate": 4.983711643630218e-05, + "loss": 1.4391, + "step": 2430 + }, + { + "epoch": 0.73, + "grad_norm": 1.7945544719696045, + "learning_rate": 4.983644617653857e-05, + "loss": 1.3384, + "step": 2435 + }, + { + "epoch": 0.73, + "grad_norm": 0.8284363150596619, + "learning_rate": 4.983577454508386e-05, + "loss": 1.497, + "step": 2440 + }, + { + "epoch": 0.73, + "grad_norm": 0.9625481367111206, + "learning_rate": 4.9835101541975125e-05, + "loss": 1.5237, + "step": 2445 + }, + { + "epoch": 0.73, + "grad_norm": 0.6448972821235657, + "learning_rate": 4.983442716724956e-05, + "loss": 1.3189, + "step": 2450 + }, + { + "epoch": 0.73, + "grad_norm": 0.6880027651786804, + "learning_rate": 4.983375142094439e-05, + "loss": 1.4792, + "step": 2455 + }, + { + "epoch": 0.74, + "grad_norm": 0.7837314605712891, + "learning_rate": 4.983307430309695e-05, + "loss": 1.4544, + "step": 2460 + }, + { + "epoch": 0.74, + "grad_norm": 0.7839219570159912, + "learning_rate": 4.9832395813744614e-05, + "loss": 1.4142, + "step": 2465 + }, + { + "epoch": 0.74, + "grad_norm": 1.2939424514770508, + "learning_rate": 4.983171595292489e-05, + "loss": 1.3861, + "step": 2470 + }, + { + "epoch": 0.74, + "grad_norm": 0.47123095393180847, + "learning_rate": 4.983103472067529e-05, + "loss": 1.4055, + "step": 2475 + }, + { + "epoch": 0.74, + "grad_norm": 0.9300950169563293, + "learning_rate": 4.983035211703345e-05, + "loss": 1.3981, + "step": 2480 + }, + { + "epoch": 0.74, + "grad_norm": 0.5792704820632935, + "learning_rate": 4.982966814203708e-05, + "loss": 1.2605, + "step": 2485 + }, + { + "epoch": 0.74, + "grad_norm": 1.7978287935256958, + "learning_rate": 4.9828982795723944e-05, + "loss": 1.573, + "step": 2490 + }, + { + "epoch": 0.75, + "grad_norm": 0.7604524493217468, + "learning_rate": 4.98282960781319e-05, + "loss": 1.4199, + "step": 2495 + }, + { + "epoch": 0.75, + "grad_norm": 0.9219200015068054, + "learning_rate": 4.982760798929887e-05, + "loss": 1.3895, + "step": 2500 + }, + { + "epoch": 0.75, + "grad_norm": 0.7405526638031006, + "learning_rate": 4.982691852926286e-05, + "loss": 1.3229, + "step": 2505 + }, + { + "epoch": 0.75, + "grad_norm": 0.8719181418418884, + "learning_rate": 4.982622769806193e-05, + "loss": 1.3892, + "step": 2510 + }, + { + "epoch": 0.75, + "grad_norm": 1.5789086818695068, + "learning_rate": 4.982553549573427e-05, + "loss": 1.3667, + "step": 2515 + }, + { + "epoch": 0.75, + "grad_norm": 0.6426258683204651, + "learning_rate": 4.982484192231808e-05, + "loss": 1.463, + "step": 2520 + }, + { + "epoch": 0.76, + "grad_norm": 0.7936124205589294, + "learning_rate": 4.982414697785168e-05, + "loss": 1.4307, + "step": 2525 + }, + { + "epoch": 0.76, + "grad_norm": 0.7490212917327881, + "learning_rate": 4.982345066237344e-05, + "loss": 1.3233, + "step": 2530 + }, + { + "epoch": 0.76, + "grad_norm": 0.6938835978507996, + "learning_rate": 4.9822752975921826e-05, + "loss": 1.4814, + "step": 2535 + }, + { + "epoch": 0.76, + "grad_norm": 0.6656054854393005, + "learning_rate": 4.982205391853536e-05, + "loss": 1.3243, + "step": 2540 + }, + { + "epoch": 0.76, + "grad_norm": 0.8422638773918152, + "learning_rate": 4.982135349025266e-05, + "loss": 1.4865, + "step": 2545 + }, + { + "epoch": 0.76, + "grad_norm": 0.7025824785232544, + "learning_rate": 4.982065169111241e-05, + "loss": 1.3322, + "step": 2550 + }, + { + "epoch": 0.76, + "grad_norm": 1.2645024061203003, + "learning_rate": 4.981994852115337e-05, + "loss": 1.405, + "step": 2555 + }, + { + "epoch": 0.77, + "grad_norm": 0.8437483310699463, + "learning_rate": 4.981924398041437e-05, + "loss": 1.49, + "step": 2560 + }, + { + "epoch": 0.77, + "grad_norm": 1.1599018573760986, + "learning_rate": 4.9818538068934314e-05, + "loss": 1.3704, + "step": 2565 + }, + { + "epoch": 0.77, + "grad_norm": 1.4388209581375122, + "learning_rate": 4.981783078675221e-05, + "loss": 1.3873, + "step": 2570 + }, + { + "epoch": 0.77, + "grad_norm": 1.019699215888977, + "learning_rate": 4.98171221339071e-05, + "loss": 1.2774, + "step": 2575 + }, + { + "epoch": 0.77, + "grad_norm": 0.7524467706680298, + "learning_rate": 4.981641211043814e-05, + "loss": 1.3507, + "step": 2580 + }, + { + "epoch": 0.77, + "grad_norm": 0.7237170934677124, + "learning_rate": 4.981570071638453e-05, + "loss": 1.4106, + "step": 2585 + }, + { + "epoch": 0.77, + "grad_norm": 2.224061965942383, + "learning_rate": 4.981498795178556e-05, + "loss": 1.4087, + "step": 2590 + }, + { + "epoch": 0.78, + "grad_norm": 0.733247697353363, + "learning_rate": 4.98142738166806e-05, + "loss": 1.4756, + "step": 2595 + }, + { + "epoch": 0.78, + "grad_norm": 0.9564744830131531, + "learning_rate": 4.9813558311109095e-05, + "loss": 1.4537, + "step": 2600 + }, + { + "epoch": 0.78, + "grad_norm": 0.5492211580276489, + "learning_rate": 4.981284143511055e-05, + "loss": 1.1261, + "step": 2605 + }, + { + "epoch": 0.78, + "grad_norm": 0.8463557362556458, + "learning_rate": 4.981212318872457e-05, + "loss": 1.4015, + "step": 2610 + }, + { + "epoch": 0.78, + "grad_norm": 1.0302629470825195, + "learning_rate": 4.981140357199081e-05, + "loss": 1.4069, + "step": 2615 + }, + { + "epoch": 0.78, + "grad_norm": 0.7704905867576599, + "learning_rate": 4.981068258494903e-05, + "loss": 1.3306, + "step": 2620 + }, + { + "epoch": 0.79, + "grad_norm": 1.499060034751892, + "learning_rate": 4.980996022763904e-05, + "loss": 1.5796, + "step": 2625 + }, + { + "epoch": 0.79, + "grad_norm": 0.660788357257843, + "learning_rate": 4.980923650010072e-05, + "loss": 1.401, + "step": 2630 + }, + { + "epoch": 0.79, + "grad_norm": 2.033163547515869, + "learning_rate": 4.980851140237407e-05, + "loss": 1.442, + "step": 2635 + }, + { + "epoch": 0.79, + "grad_norm": 1.5330935716629028, + "learning_rate": 4.9807784934499125e-05, + "loss": 1.5339, + "step": 2640 + }, + { + "epoch": 0.79, + "grad_norm": 3.3255956172943115, + "learning_rate": 4.9807057096516e-05, + "loss": 1.5884, + "step": 2645 + }, + { + "epoch": 0.79, + "grad_norm": 0.7848919630050659, + "learning_rate": 4.9806327888464885e-05, + "loss": 1.4167, + "step": 2650 + }, + { + "epoch": 0.79, + "grad_norm": 0.9711475968360901, + "learning_rate": 4.980559731038608e-05, + "loss": 1.4576, + "step": 2655 + }, + { + "epoch": 0.8, + "grad_norm": 0.9157351851463318, + "learning_rate": 4.9804865362319914e-05, + "loss": 1.3899, + "step": 2660 + }, + { + "epoch": 0.8, + "grad_norm": 1.0029135942459106, + "learning_rate": 4.980413204430682e-05, + "loss": 1.4998, + "step": 2665 + }, + { + "epoch": 0.8, + "grad_norm": 1.5365571975708008, + "learning_rate": 4.980339735638729e-05, + "loss": 1.3776, + "step": 2670 + }, + { + "epoch": 0.8, + "grad_norm": 0.6640515327453613, + "learning_rate": 4.980266129860191e-05, + "loss": 1.3586, + "step": 2675 + }, + { + "epoch": 0.8, + "grad_norm": 0.6773358583450317, + "learning_rate": 4.9801923870991326e-05, + "loss": 1.3586, + "step": 2680 + }, + { + "epoch": 0.8, + "grad_norm": 0.62249755859375, + "learning_rate": 4.9801185073596266e-05, + "loss": 1.5044, + "step": 2685 + }, + { + "epoch": 0.8, + "grad_norm": 1.2861700057983398, + "learning_rate": 4.980044490645754e-05, + "loss": 1.5279, + "step": 2690 + }, + { + "epoch": 0.81, + "grad_norm": 1.6400649547576904, + "learning_rate": 4.979970336961601e-05, + "loss": 1.4849, + "step": 2695 + }, + { + "epoch": 0.81, + "grad_norm": 1.2481294870376587, + "learning_rate": 4.9798960463112654e-05, + "loss": 1.3853, + "step": 2700 + }, + { + "epoch": 0.81, + "grad_norm": 1.038743019104004, + "learning_rate": 4.979821618698848e-05, + "loss": 1.3042, + "step": 2705 + }, + { + "epoch": 0.81, + "grad_norm": 1.1778807640075684, + "learning_rate": 4.979747054128461e-05, + "loss": 1.4474, + "step": 2710 + }, + { + "epoch": 0.81, + "grad_norm": 0.8384398818016052, + "learning_rate": 4.97967235260422e-05, + "loss": 1.3712, + "step": 2715 + }, + { + "epoch": 0.81, + "grad_norm": 1.2256791591644287, + "learning_rate": 4.9795975141302545e-05, + "loss": 1.6502, + "step": 2720 + }, + { + "epoch": 0.82, + "grad_norm": 0.5445688962936401, + "learning_rate": 4.979522538710695e-05, + "loss": 1.414, + "step": 2725 + }, + { + "epoch": 0.82, + "grad_norm": 1.1054037809371948, + "learning_rate": 4.979447426349682e-05, + "loss": 1.3975, + "step": 2730 + }, + { + "epoch": 0.82, + "grad_norm": 0.6710256934165955, + "learning_rate": 4.979372177051366e-05, + "loss": 1.3354, + "step": 2735 + }, + { + "epoch": 0.82, + "grad_norm": 1.0434719324111938, + "learning_rate": 4.979296790819901e-05, + "loss": 1.4389, + "step": 2740 + }, + { + "epoch": 0.82, + "grad_norm": 0.8868400454521179, + "learning_rate": 4.9792212676594516e-05, + "loss": 1.3882, + "step": 2745 + }, + { + "epoch": 0.82, + "grad_norm": 0.7899145483970642, + "learning_rate": 4.9791456075741895e-05, + "loss": 1.2246, + "step": 2750 + }, + { + "epoch": 0.82, + "grad_norm": 0.6027569770812988, + "learning_rate": 4.979069810568292e-05, + "loss": 1.4354, + "step": 2755 + }, + { + "epoch": 0.83, + "grad_norm": 1.1097197532653809, + "learning_rate": 4.9789938766459446e-05, + "loss": 1.4536, + "step": 2760 + }, + { + "epoch": 0.83, + "grad_norm": 0.8584468960762024, + "learning_rate": 4.9789178058113434e-05, + "loss": 1.3242, + "step": 2765 + }, + { + "epoch": 0.83, + "grad_norm": 0.6569173336029053, + "learning_rate": 4.978841598068688e-05, + "loss": 1.4388, + "step": 2770 + }, + { + "epoch": 0.83, + "grad_norm": 0.6124112010002136, + "learning_rate": 4.978765253422188e-05, + "loss": 1.3181, + "step": 2775 + }, + { + "epoch": 0.83, + "grad_norm": 0.7764132022857666, + "learning_rate": 4.978688771876059e-05, + "loss": 1.4396, + "step": 2780 + }, + { + "epoch": 0.83, + "grad_norm": 1.035800576210022, + "learning_rate": 4.9786121534345265e-05, + "loss": 1.3593, + "step": 2785 + }, + { + "epoch": 0.83, + "grad_norm": 0.6600098609924316, + "learning_rate": 4.97853539810182e-05, + "loss": 1.4073, + "step": 2790 + }, + { + "epoch": 0.84, + "grad_norm": 0.6001609563827515, + "learning_rate": 4.9784585058821807e-05, + "loss": 1.3929, + "step": 2795 + }, + { + "epoch": 0.84, + "grad_norm": 0.7586673498153687, + "learning_rate": 4.9783814767798545e-05, + "loss": 1.1887, + "step": 2800 + }, + { + "epoch": 0.84, + "grad_norm": 0.542594313621521, + "learning_rate": 4.9783043107990946e-05, + "loss": 1.4094, + "step": 2805 + }, + { + "epoch": 0.84, + "grad_norm": 1.1905423402786255, + "learning_rate": 4.978227007944164e-05, + "loss": 1.4199, + "step": 2810 + }, + { + "epoch": 0.84, + "grad_norm": 1.1273014545440674, + "learning_rate": 4.978149568219332e-05, + "loss": 1.5138, + "step": 2815 + }, + { + "epoch": 0.84, + "grad_norm": 0.8764979839324951, + "learning_rate": 4.978071991628875e-05, + "loss": 1.5155, + "step": 2820 + }, + { + "epoch": 0.85, + "grad_norm": 0.8033466339111328, + "learning_rate": 4.9779942781770776e-05, + "loss": 1.3844, + "step": 2825 + }, + { + "epoch": 0.85, + "grad_norm": 0.9946221113204956, + "learning_rate": 4.9779164278682324e-05, + "loss": 1.3978, + "step": 2830 + }, + { + "epoch": 0.85, + "grad_norm": 0.596258282661438, + "learning_rate": 4.977838440706638e-05, + "loss": 1.4013, + "step": 2835 + }, + { + "epoch": 0.85, + "grad_norm": 1.155501365661621, + "learning_rate": 4.977760316696603e-05, + "loss": 1.4486, + "step": 2840 + }, + { + "epoch": 0.85, + "grad_norm": 0.8394220471382141, + "learning_rate": 4.9776820558424396e-05, + "loss": 1.4483, + "step": 2845 + }, + { + "epoch": 0.85, + "grad_norm": 0.8399643301963806, + "learning_rate": 4.977603658148473e-05, + "loss": 1.4999, + "step": 2850 + }, + { + "epoch": 0.85, + "grad_norm": 1.7223281860351562, + "learning_rate": 4.977525123619031e-05, + "loss": 1.4209, + "step": 2855 + }, + { + "epoch": 0.86, + "grad_norm": 0.8585662245750427, + "learning_rate": 4.9774464522584516e-05, + "loss": 1.2834, + "step": 2860 + }, + { + "epoch": 0.86, + "grad_norm": 0.903371274471283, + "learning_rate": 4.9773676440710804e-05, + "loss": 1.514, + "step": 2865 + }, + { + "epoch": 0.86, + "grad_norm": 0.6225999593734741, + "learning_rate": 4.977288699061269e-05, + "loss": 1.4325, + "step": 2870 + }, + { + "epoch": 0.86, + "grad_norm": 0.8646724224090576, + "learning_rate": 4.977209617233378e-05, + "loss": 1.2028, + "step": 2875 + }, + { + "epoch": 0.86, + "grad_norm": 0.7862356305122375, + "learning_rate": 4.9771303985917745e-05, + "loss": 1.4069, + "step": 2880 + }, + { + "epoch": 0.86, + "grad_norm": 0.635420024394989, + "learning_rate": 4.977051043140834e-05, + "loss": 1.4109, + "step": 2885 + }, + { + "epoch": 0.86, + "grad_norm": 0.7677288055419922, + "learning_rate": 4.976971550884939e-05, + "loss": 1.1608, + "step": 2890 + }, + { + "epoch": 0.87, + "grad_norm": 0.845956027507782, + "learning_rate": 4.9768919218284804e-05, + "loss": 1.4072, + "step": 2895 + }, + { + "epoch": 0.87, + "grad_norm": 0.9149651527404785, + "learning_rate": 4.976812155975855e-05, + "loss": 1.3957, + "step": 2900 + }, + { + "epoch": 0.87, + "grad_norm": 0.9289765954017639, + "learning_rate": 4.9767322533314685e-05, + "loss": 1.4147, + "step": 2905 + }, + { + "epoch": 0.87, + "grad_norm": 1.1953096389770508, + "learning_rate": 4.9766522138997347e-05, + "loss": 1.4155, + "step": 2910 + }, + { + "epoch": 0.87, + "grad_norm": 0.9393072128295898, + "learning_rate": 4.976572037685073e-05, + "loss": 1.4166, + "step": 2915 + }, + { + "epoch": 0.87, + "grad_norm": 1.012731909751892, + "learning_rate": 4.9764917246919125e-05, + "loss": 1.4684, + "step": 2920 + }, + { + "epoch": 0.88, + "grad_norm": 0.9099532961845398, + "learning_rate": 4.9764112749246876e-05, + "loss": 1.4124, + "step": 2925 + }, + { + "epoch": 0.88, + "grad_norm": 0.69303959608078, + "learning_rate": 4.976330688387842e-05, + "loss": 1.4344, + "step": 2930 + }, + { + "epoch": 0.88, + "grad_norm": 0.6986569762229919, + "learning_rate": 4.9762499650858274e-05, + "loss": 1.3507, + "step": 2935 + }, + { + "epoch": 0.88, + "grad_norm": 0.5651214718818665, + "learning_rate": 4.976169105023101e-05, + "loss": 1.2479, + "step": 2940 + }, + { + "epoch": 0.88, + "grad_norm": 0.7775252461433411, + "learning_rate": 4.9760881082041275e-05, + "loss": 1.3942, + "step": 2945 + }, + { + "epoch": 0.88, + "grad_norm": 1.379727840423584, + "learning_rate": 4.976006974633383e-05, + "loss": 1.4428, + "step": 2950 + }, + { + "epoch": 0.88, + "grad_norm": 0.850517749786377, + "learning_rate": 4.9759257043153454e-05, + "loss": 1.4216, + "step": 2955 + }, + { + "epoch": 0.89, + "grad_norm": 0.9250400066375732, + "learning_rate": 4.975844297254506e-05, + "loss": 1.4909, + "step": 2960 + }, + { + "epoch": 0.89, + "grad_norm": 1.5304796695709229, + "learning_rate": 4.975762753455359e-05, + "loss": 1.4814, + "step": 2965 + }, + { + "epoch": 0.89, + "grad_norm": 1.1414655447006226, + "learning_rate": 4.975681072922409e-05, + "loss": 1.408, + "step": 2970 + }, + { + "epoch": 0.89, + "grad_norm": 0.5515819191932678, + "learning_rate": 4.975599255660166e-05, + "loss": 1.3775, + "step": 2975 + }, + { + "epoch": 0.89, + "grad_norm": 0.8293300867080688, + "learning_rate": 4.97551730167315e-05, + "loss": 1.3132, + "step": 2980 + }, + { + "epoch": 0.89, + "grad_norm": 1.0170328617095947, + "learning_rate": 4.9754352109658865e-05, + "loss": 1.4993, + "step": 2985 + }, + { + "epoch": 0.89, + "grad_norm": 0.937022864818573, + "learning_rate": 4.9753529835429094e-05, + "loss": 1.3501, + "step": 2990 + }, + { + "epoch": 0.9, + "grad_norm": 0.9053025245666504, + "learning_rate": 4.97527061940876e-05, + "loss": 1.4049, + "step": 2995 + }, + { + "epoch": 0.9, + "grad_norm": 0.49352753162384033, + "learning_rate": 4.975188118567986e-05, + "loss": 1.4339, + "step": 3000 + }, + { + "epoch": 0.9, + "grad_norm": 1.048647165298462, + "learning_rate": 4.975105481025146e-05, + "loss": 1.3139, + "step": 3005 + }, + { + "epoch": 0.9, + "grad_norm": 0.6800316572189331, + "learning_rate": 4.9750227067848034e-05, + "loss": 1.2836, + "step": 3010 + }, + { + "epoch": 0.9, + "grad_norm": 1.1411221027374268, + "learning_rate": 4.974939795851529e-05, + "loss": 1.4515, + "step": 3015 + }, + { + "epoch": 0.9, + "grad_norm": 0.7004803419113159, + "learning_rate": 4.9748567482299025e-05, + "loss": 1.551, + "step": 3020 + }, + { + "epoch": 0.91, + "grad_norm": 1.4398937225341797, + "learning_rate": 4.97477356392451e-05, + "loss": 1.4192, + "step": 3025 + }, + { + "epoch": 0.91, + "grad_norm": 0.5903295278549194, + "learning_rate": 4.974690242939946e-05, + "loss": 1.5211, + "step": 3030 + }, + { + "epoch": 0.91, + "grad_norm": 0.6511791348457336, + "learning_rate": 4.974606785280812e-05, + "loss": 1.5211, + "step": 3035 + }, + { + "epoch": 0.91, + "grad_norm": 1.2127279043197632, + "learning_rate": 4.9745231909517176e-05, + "loss": 1.3477, + "step": 3040 + }, + { + "epoch": 0.91, + "grad_norm": 2.202561855316162, + "learning_rate": 4.9744394599572795e-05, + "loss": 1.5234, + "step": 3045 + }, + { + "epoch": 0.91, + "grad_norm": 0.7070451378822327, + "learning_rate": 4.974355592302122e-05, + "loss": 1.4987, + "step": 3050 + }, + { + "epoch": 0.91, + "grad_norm": 0.9108610153198242, + "learning_rate": 4.974271587990877e-05, + "loss": 1.2861, + "step": 3055 + }, + { + "epoch": 0.92, + "grad_norm": 0.9873246550559998, + "learning_rate": 4.974187447028184e-05, + "loss": 1.4222, + "step": 3060 + }, + { + "epoch": 0.92, + "grad_norm": 1.2977701425552368, + "learning_rate": 4.9741031694186904e-05, + "loss": 1.4062, + "step": 3065 + }, + { + "epoch": 0.92, + "grad_norm": 1.025950312614441, + "learning_rate": 4.9740187551670505e-05, + "loss": 1.5189, + "step": 3070 + }, + { + "epoch": 0.92, + "grad_norm": 1.1865170001983643, + "learning_rate": 4.973934204277926e-05, + "loss": 1.3828, + "step": 3075 + }, + { + "epoch": 0.92, + "grad_norm": 0.818302571773529, + "learning_rate": 4.973849516755987e-05, + "loss": 1.467, + "step": 3080 + }, + { + "epoch": 0.92, + "grad_norm": 0.8224700689315796, + "learning_rate": 4.9737646926059104e-05, + "loss": 1.7332, + "step": 3085 + }, + { + "epoch": 0.92, + "grad_norm": 1.3279811143875122, + "learning_rate": 4.973679731832381e-05, + "loss": 1.5156, + "step": 3090 + }, + { + "epoch": 0.93, + "grad_norm": 0.5964494347572327, + "learning_rate": 4.973594634440092e-05, + "loss": 1.3524, + "step": 3095 + }, + { + "epoch": 0.93, + "grad_norm": 1.196366786956787, + "learning_rate": 4.9735094004337427e-05, + "loss": 1.5278, + "step": 3100 + }, + { + "epoch": 0.93, + "grad_norm": 0.7338801622390747, + "learning_rate": 4.97342402981804e-05, + "loss": 1.4148, + "step": 3105 + }, + { + "epoch": 0.93, + "grad_norm": 0.7936698794364929, + "learning_rate": 4.973338522597698e-05, + "loss": 1.4142, + "step": 3110 + }, + { + "epoch": 0.93, + "grad_norm": 0.5636515021324158, + "learning_rate": 4.9732528787774416e-05, + "loss": 1.5779, + "step": 3115 + }, + { + "epoch": 0.93, + "grad_norm": 0.5962295532226562, + "learning_rate": 4.973167098361999e-05, + "loss": 1.6513, + "step": 3120 + }, + { + "epoch": 0.93, + "grad_norm": 0.47786709666252136, + "learning_rate": 4.9730811813561083e-05, + "loss": 1.3843, + "step": 3125 + }, + { + "epoch": 0.94, + "grad_norm": 0.7690017223358154, + "learning_rate": 4.972995127764515e-05, + "loss": 1.2518, + "step": 3130 + }, + { + "epoch": 0.94, + "grad_norm": 0.7740575671195984, + "learning_rate": 4.97290893759197e-05, + "loss": 1.487, + "step": 3135 + }, + { + "epoch": 0.94, + "grad_norm": 0.7592212557792664, + "learning_rate": 4.972822610843236e-05, + "loss": 1.4385, + "step": 3140 + }, + { + "epoch": 0.94, + "grad_norm": 0.5696493983268738, + "learning_rate": 4.972736147523079e-05, + "loss": 1.4802, + "step": 3145 + }, + { + "epoch": 0.94, + "grad_norm": 0.5601330995559692, + "learning_rate": 4.9726495476362756e-05, + "loss": 1.4203, + "step": 3150 + }, + { + "epoch": 0.94, + "grad_norm": 1.0239603519439697, + "learning_rate": 4.972562811187608e-05, + "loss": 1.4046, + "step": 3155 + }, + { + "epoch": 0.95, + "grad_norm": 0.7492188811302185, + "learning_rate": 4.972475938181865e-05, + "loss": 1.3938, + "step": 3160 + }, + { + "epoch": 0.95, + "grad_norm": 1.0243171453475952, + "learning_rate": 4.972388928623847e-05, + "loss": 1.2638, + "step": 3165 + }, + { + "epoch": 0.95, + "grad_norm": 1.4914931058883667, + "learning_rate": 4.972301782518358e-05, + "loss": 1.4414, + "step": 3170 + }, + { + "epoch": 0.95, + "grad_norm": 1.6274807453155518, + "learning_rate": 4.972214499870212e-05, + "loss": 1.4207, + "step": 3175 + }, + { + "epoch": 0.95, + "grad_norm": 0.9303854703903198, + "learning_rate": 4.9721270806842277e-05, + "loss": 1.4353, + "step": 3180 + }, + { + "epoch": 0.95, + "grad_norm": 0.8997390866279602, + "learning_rate": 4.9720395249652355e-05, + "loss": 1.3493, + "step": 3185 + }, + { + "epoch": 0.95, + "grad_norm": 1.1290831565856934, + "learning_rate": 4.97195183271807e-05, + "loss": 1.4754, + "step": 3190 + }, + { + "epoch": 0.96, + "grad_norm": 0.7762711644172668, + "learning_rate": 4.971864003947573e-05, + "loss": 1.45, + "step": 3195 + }, + { + "epoch": 0.96, + "grad_norm": 0.6150183081626892, + "learning_rate": 4.971776038658598e-05, + "loss": 1.2738, + "step": 3200 + }, + { + "epoch": 0.96, + "grad_norm": 0.6307562589645386, + "learning_rate": 4.971687936856e-05, + "loss": 1.4348, + "step": 3205 + }, + { + "epoch": 0.96, + "grad_norm": 0.7655115127563477, + "learning_rate": 4.971599698544648e-05, + "loss": 1.2861, + "step": 3210 + }, + { + "epoch": 0.96, + "grad_norm": 0.8392760157585144, + "learning_rate": 4.971511323729412e-05, + "loss": 1.4351, + "step": 3215 + }, + { + "epoch": 0.96, + "grad_norm": 1.204959511756897, + "learning_rate": 4.9714228124151756e-05, + "loss": 1.4165, + "step": 3220 + }, + { + "epoch": 0.96, + "grad_norm": 0.6539895534515381, + "learning_rate": 4.9713341646068264e-05, + "loss": 1.3264, + "step": 3225 + }, + { + "epoch": 0.97, + "grad_norm": 0.6086089611053467, + "learning_rate": 4.97124538030926e-05, + "loss": 1.3271, + "step": 3230 + }, + { + "epoch": 0.97, + "grad_norm": 0.9812304377555847, + "learning_rate": 4.97115645952738e-05, + "loss": 1.4253, + "step": 3235 + }, + { + "epoch": 0.97, + "grad_norm": 0.5627865791320801, + "learning_rate": 4.9710674022660964e-05, + "loss": 1.4749, + "step": 3240 + }, + { + "epoch": 0.97, + "grad_norm": 1.062355399131775, + "learning_rate": 4.970978208530329e-05, + "loss": 1.3088, + "step": 3245 + }, + { + "epoch": 0.97, + "grad_norm": 0.5620614290237427, + "learning_rate": 4.9708888783250047e-05, + "loss": 1.3848, + "step": 3250 + }, + { + "epoch": 0.97, + "grad_norm": 1.4596127271652222, + "learning_rate": 4.970799411655055e-05, + "loss": 1.5558, + "step": 3255 + }, + { + "epoch": 0.98, + "grad_norm": 0.5726821422576904, + "learning_rate": 4.9707098085254224e-05, + "loss": 1.4792, + "step": 3260 + }, + { + "epoch": 0.98, + "grad_norm": 1.5486913919448853, + "learning_rate": 4.970620068941055e-05, + "loss": 1.3452, + "step": 3265 + }, + { + "epoch": 0.98, + "grad_norm": 0.5982449650764465, + "learning_rate": 4.9705301929069094e-05, + "loss": 1.3137, + "step": 3270 + }, + { + "epoch": 0.98, + "grad_norm": 1.129350185394287, + "learning_rate": 4.9704401804279495e-05, + "loss": 1.3673, + "step": 3275 + }, + { + "epoch": 0.98, + "grad_norm": 0.7439037561416626, + "learning_rate": 4.9703500315091455e-05, + "loss": 1.3652, + "step": 3280 + }, + { + "epoch": 0.98, + "grad_norm": 0.6493015885353088, + "learning_rate": 4.970259746155478e-05, + "loss": 1.3959, + "step": 3285 + }, + { + "epoch": 0.98, + "grad_norm": 1.614264726638794, + "learning_rate": 4.9701693243719324e-05, + "loss": 1.5581, + "step": 3290 + }, + { + "epoch": 0.99, + "grad_norm": 0.9825220108032227, + "learning_rate": 4.970078766163502e-05, + "loss": 1.3638, + "step": 3295 + }, + { + "epoch": 0.99, + "grad_norm": 0.509028971195221, + "learning_rate": 4.9699880715351884e-05, + "loss": 1.4559, + "step": 3300 + }, + { + "epoch": 0.99, + "grad_norm": 0.7303828001022339, + "learning_rate": 4.969897240492002e-05, + "loss": 1.2282, + "step": 3305 + }, + { + "epoch": 0.99, + "grad_norm": 0.7061027884483337, + "learning_rate": 4.9698062730389586e-05, + "loss": 1.4301, + "step": 3310 + }, + { + "epoch": 0.99, + "grad_norm": 0.5220503211021423, + "learning_rate": 4.9697151691810814e-05, + "loss": 1.5012, + "step": 3315 + }, + { + "epoch": 0.99, + "grad_norm": 1.2105666399002075, + "learning_rate": 4.969623928923402e-05, + "loss": 1.5438, + "step": 3320 + }, + { + "epoch": 0.99, + "grad_norm": 0.6172266006469727, + "learning_rate": 4.969532552270961e-05, + "loss": 1.3556, + "step": 3325 + }, + { + "epoch": 1.0, + "grad_norm": 1.089318037033081, + "learning_rate": 4.969441039228803e-05, + "loss": 1.3997, + "step": 3330 + }, + { + "epoch": 1.0, + "grad_norm": 0.8561093211174011, + "learning_rate": 4.969349389801984e-05, + "loss": 1.3716, + "step": 3335 + }, + { + "epoch": 1.0, + "grad_norm": 0.4685690701007843, + "learning_rate": 4.969257603995565e-05, + "loss": 1.4295, + "step": 3340 + }, + { + "epoch": 1.0, + "grad_norm": 1.1991277933120728, + "learning_rate": 4.9691656818146146e-05, + "loss": 1.3521, + "step": 3345 + }, + { + "epoch": 1.0, + "grad_norm": 1.0424867868423462, + "learning_rate": 4.969073623264211e-05, + "loss": 1.4858, + "step": 3350 + }, + { + "epoch": 1.0, + "grad_norm": 1.1299071311950684, + "learning_rate": 4.968981428349438e-05, + "loss": 1.3135, + "step": 3355 + }, + { + "epoch": 1.01, + "grad_norm": 0.6989724636077881, + "learning_rate": 4.9688890970753856e-05, + "loss": 1.4063, + "step": 3360 + }, + { + "epoch": 1.01, + "grad_norm": 0.4703225791454315, + "learning_rate": 4.968796629447155e-05, + "loss": 1.3124, + "step": 3365 + }, + { + "epoch": 1.01, + "grad_norm": 1.0257688760757446, + "learning_rate": 4.968704025469853e-05, + "loss": 1.3443, + "step": 3370 + }, + { + "epoch": 1.01, + "grad_norm": 0.8301354050636292, + "learning_rate": 4.968611285148594e-05, + "loss": 1.2692, + "step": 3375 + }, + { + "epoch": 1.01, + "grad_norm": 1.3049982786178589, + "learning_rate": 4.968518408488499e-05, + "loss": 1.4013, + "step": 3380 + }, + { + "epoch": 1.01, + "grad_norm": 1.38068425655365, + "learning_rate": 4.968425395494699e-05, + "loss": 1.3014, + "step": 3385 + }, + { + "epoch": 1.01, + "grad_norm": 0.8539844751358032, + "learning_rate": 4.96833224617233e-05, + "loss": 1.3768, + "step": 3390 + }, + { + "epoch": 1.02, + "grad_norm": 1.6281200647354126, + "learning_rate": 4.968238960526537e-05, + "loss": 1.3423, + "step": 3395 + }, + { + "epoch": 1.02, + "grad_norm": 1.638692855834961, + "learning_rate": 4.968145538562471e-05, + "loss": 1.4069, + "step": 3400 + }, + { + "epoch": 1.02, + "grad_norm": 1.0412074327468872, + "learning_rate": 4.968051980285293e-05, + "loss": 1.3327, + "step": 3405 + }, + { + "epoch": 1.02, + "grad_norm": 1.5191795825958252, + "learning_rate": 4.967958285700169e-05, + "loss": 1.4662, + "step": 3410 + }, + { + "epoch": 1.02, + "grad_norm": 1.035595417022705, + "learning_rate": 4.967864454812274e-05, + "loss": 1.4095, + "step": 3415 + }, + { + "epoch": 1.02, + "grad_norm": 1.3538129329681396, + "learning_rate": 4.967770487626791e-05, + "loss": 1.4562, + "step": 3420 + }, + { + "epoch": 1.02, + "grad_norm": 0.785943865776062, + "learning_rate": 4.9676763841489093e-05, + "loss": 1.2219, + "step": 3425 + }, + { + "epoch": 1.03, + "grad_norm": 0.9118176698684692, + "learning_rate": 4.967582144383826e-05, + "loss": 1.3501, + "step": 3430 + }, + { + "epoch": 1.03, + "grad_norm": 0.7986450791358948, + "learning_rate": 4.967487768336745e-05, + "loss": 1.5203, + "step": 3435 + }, + { + "epoch": 1.03, + "grad_norm": 0.6016656160354614, + "learning_rate": 4.967393256012879e-05, + "loss": 1.3272, + "step": 3440 + }, + { + "epoch": 1.03, + "grad_norm": 1.1561404466629028, + "learning_rate": 4.967298607417449e-05, + "loss": 1.3276, + "step": 3445 + }, + { + "epoch": 1.03, + "grad_norm": 1.0883400440216064, + "learning_rate": 4.9672038225556816e-05, + "loss": 1.4162, + "step": 3450 + }, + { + "epoch": 1.03, + "grad_norm": 1.2652868032455444, + "learning_rate": 4.967108901432811e-05, + "loss": 1.4396, + "step": 3455 + }, + { + "epoch": 1.04, + "grad_norm": 1.0570058822631836, + "learning_rate": 4.967013844054081e-05, + "loss": 1.2359, + "step": 3460 + }, + { + "epoch": 1.04, + "grad_norm": 0.9641575217247009, + "learning_rate": 4.96691865042474e-05, + "loss": 1.3216, + "step": 3465 + }, + { + "epoch": 1.04, + "grad_norm": 0.511062502861023, + "learning_rate": 4.966823320550047e-05, + "loss": 1.4389, + "step": 3470 + }, + { + "epoch": 1.04, + "grad_norm": 0.7163792252540588, + "learning_rate": 4.9667278544352653e-05, + "loss": 1.4032, + "step": 3475 + }, + { + "epoch": 1.04, + "grad_norm": 1.194694995880127, + "learning_rate": 4.966632252085668e-05, + "loss": 1.3817, + "step": 3480 + }, + { + "epoch": 1.04, + "grad_norm": 1.0525634288787842, + "learning_rate": 4.9665365135065365e-05, + "loss": 1.3344, + "step": 3485 + }, + { + "epoch": 1.04, + "grad_norm": 3.383202075958252, + "learning_rate": 4.966440638703156e-05, + "loss": 1.3635, + "step": 3490 + }, + { + "epoch": 1.05, + "grad_norm": 0.7507001757621765, + "learning_rate": 4.9663446276808235e-05, + "loss": 1.1945, + "step": 3495 + }, + { + "epoch": 1.05, + "grad_norm": 1.1535072326660156, + "learning_rate": 4.9662484804448404e-05, + "loss": 1.5449, + "step": 3500 + }, + { + "epoch": 1.05, + "grad_norm": 1.666061282157898, + "learning_rate": 4.966152197000517e-05, + "loss": 1.4527, + "step": 3505 + }, + { + "epoch": 1.05, + "grad_norm": 0.6894571781158447, + "learning_rate": 4.9660557773531723e-05, + "loss": 1.4377, + "step": 3510 + }, + { + "epoch": 1.05, + "grad_norm": 0.7343246936798096, + "learning_rate": 4.9659592215081296e-05, + "loss": 1.6301, + "step": 3515 + }, + { + "epoch": 1.05, + "grad_norm": 1.4154441356658936, + "learning_rate": 4.9658625294707226e-05, + "loss": 1.4235, + "step": 3520 + }, + { + "epoch": 1.05, + "grad_norm": 1.732731580734253, + "learning_rate": 4.9657657012462904e-05, + "loss": 1.491, + "step": 3525 + }, + { + "epoch": 1.06, + "grad_norm": 1.0298057794570923, + "learning_rate": 4.965668736840182e-05, + "loss": 1.4066, + "step": 3530 + }, + { + "epoch": 1.06, + "grad_norm": 0.9874246716499329, + "learning_rate": 4.9655716362577525e-05, + "loss": 1.4336, + "step": 3535 + }, + { + "epoch": 1.06, + "grad_norm": 0.7678321003913879, + "learning_rate": 4.965474399504364e-05, + "loss": 1.4643, + "step": 3540 + }, + { + "epoch": 1.06, + "grad_norm": 0.8920181393623352, + "learning_rate": 4.9653770265853874e-05, + "loss": 1.514, + "step": 3545 + }, + { + "epoch": 1.06, + "grad_norm": 0.7572281956672668, + "learning_rate": 4.9652795175062005e-05, + "loss": 1.3438, + "step": 3550 + }, + { + "epoch": 1.06, + "grad_norm": 0.7819041609764099, + "learning_rate": 4.965181872272188e-05, + "loss": 1.3153, + "step": 3555 + }, + { + "epoch": 1.07, + "grad_norm": 0.8489050269126892, + "learning_rate": 4.965084090888743e-05, + "loss": 1.3211, + "step": 3560 + }, + { + "epoch": 1.07, + "grad_norm": 2.9903485774993896, + "learning_rate": 4.9649861733612654e-05, + "loss": 1.3094, + "step": 3565 + }, + { + "epoch": 1.07, + "grad_norm": 0.8128799796104431, + "learning_rate": 4.9648881196951647e-05, + "loss": 1.3364, + "step": 3570 + }, + { + "epoch": 1.07, + "grad_norm": 1.2447357177734375, + "learning_rate": 4.964789929895855e-05, + "loss": 1.5058, + "step": 3575 + }, + { + "epoch": 1.07, + "grad_norm": 1.014669418334961, + "learning_rate": 4.96469160396876e-05, + "loss": 1.2516, + "step": 3580 + }, + { + "epoch": 1.07, + "grad_norm": 0.8545610308647156, + "learning_rate": 4.964593141919308e-05, + "loss": 1.3384, + "step": 3585 + }, + { + "epoch": 1.07, + "grad_norm": 1.7975364923477173, + "learning_rate": 4.96449454375294e-05, + "loss": 1.4322, + "step": 3590 + }, + { + "epoch": 1.08, + "grad_norm": 0.7121825218200684, + "learning_rate": 4.9643958094750996e-05, + "loss": 1.3345, + "step": 3595 + }, + { + "epoch": 1.08, + "grad_norm": 0.9037702083587646, + "learning_rate": 4.9642969390912394e-05, + "loss": 1.4319, + "step": 3600 + }, + { + "epoch": 1.08, + "grad_norm": 1.2663486003875732, + "learning_rate": 4.964197932606822e-05, + "loss": 1.6255, + "step": 3605 + }, + { + "epoch": 1.08, + "grad_norm": 0.8707221150398254, + "learning_rate": 4.964098790027314e-05, + "loss": 1.4954, + "step": 3610 + }, + { + "epoch": 1.08, + "grad_norm": 1.0182719230651855, + "learning_rate": 4.963999511358191e-05, + "loss": 1.2573, + "step": 3615 + }, + { + "epoch": 1.08, + "grad_norm": 0.5255160927772522, + "learning_rate": 4.963900096604936e-05, + "loss": 1.2647, + "step": 3620 + }, + { + "epoch": 1.08, + "grad_norm": 1.4247102737426758, + "learning_rate": 4.96380054577304e-05, + "loss": 1.3231, + "step": 3625 + }, + { + "epoch": 1.09, + "grad_norm": 0.5628049969673157, + "learning_rate": 4.963700858868e-05, + "loss": 1.2791, + "step": 3630 + }, + { + "epoch": 1.09, + "grad_norm": 0.755516529083252, + "learning_rate": 4.9636010358953235e-05, + "loss": 1.4192, + "step": 3635 + }, + { + "epoch": 1.09, + "grad_norm": 0.752839982509613, + "learning_rate": 4.963501076860522e-05, + "loss": 1.3058, + "step": 3640 + }, + { + "epoch": 1.09, + "grad_norm": 1.969702959060669, + "learning_rate": 4.963400981769117e-05, + "loss": 1.2481, + "step": 3645 + }, + { + "epoch": 1.09, + "grad_norm": 0.9159872531890869, + "learning_rate": 4.963300750626636e-05, + "loss": 1.4535, + "step": 3650 + }, + { + "epoch": 1.09, + "grad_norm": 0.8840392827987671, + "learning_rate": 4.963200383438615e-05, + "loss": 1.4412, + "step": 3655 + }, + { + "epoch": 1.1, + "grad_norm": 1.0793917179107666, + "learning_rate": 4.9630998802105975e-05, + "loss": 1.3668, + "step": 3660 + }, + { + "epoch": 1.1, + "grad_norm": 1.078147053718567, + "learning_rate": 4.962999240948134e-05, + "loss": 1.3659, + "step": 3665 + }, + { + "epoch": 1.1, + "grad_norm": 0.8372591137886047, + "learning_rate": 4.962898465656782e-05, + "loss": 1.3811, + "step": 3670 + }, + { + "epoch": 1.1, + "grad_norm": 1.0935574769973755, + "learning_rate": 4.962797554342108e-05, + "loss": 1.4259, + "step": 3675 + }, + { + "epoch": 1.1, + "grad_norm": 1.2438262701034546, + "learning_rate": 4.962696507009685e-05, + "loss": 1.4393, + "step": 3680 + }, + { + "epoch": 1.1, + "grad_norm": 0.8623482584953308, + "learning_rate": 4.962595323665094e-05, + "loss": 1.5026, + "step": 3685 + }, + { + "epoch": 1.1, + "grad_norm": 4.854569911956787, + "learning_rate": 4.962494004313923e-05, + "loss": 1.4825, + "step": 3690 + }, + { + "epoch": 1.11, + "grad_norm": 1.069012999534607, + "learning_rate": 4.962392548961767e-05, + "loss": 1.428, + "step": 3695 + }, + { + "epoch": 1.11, + "grad_norm": 1.0599278211593628, + "learning_rate": 4.9622909576142304e-05, + "loss": 1.5312, + "step": 3700 + }, + { + "epoch": 1.11, + "grad_norm": 1.7758433818817139, + "learning_rate": 4.962189230276925e-05, + "loss": 1.3304, + "step": 3705 + }, + { + "epoch": 1.11, + "grad_norm": 1.406180500984192, + "learning_rate": 4.962087366955466e-05, + "loss": 1.4616, + "step": 3710 + }, + { + "epoch": 1.11, + "grad_norm": 0.6990206837654114, + "learning_rate": 4.9619853676554814e-05, + "loss": 1.3334, + "step": 3715 + }, + { + "epoch": 1.11, + "grad_norm": 0.6970654726028442, + "learning_rate": 4.9618832323826036e-05, + "loss": 1.3963, + "step": 3720 + }, + { + "epoch": 1.11, + "grad_norm": 0.8413794040679932, + "learning_rate": 4.9617809611424745e-05, + "loss": 1.3082, + "step": 3725 + }, + { + "epoch": 1.12, + "grad_norm": 0.9142487645149231, + "learning_rate": 4.9616785539407414e-05, + "loss": 1.3984, + "step": 3730 + }, + { + "epoch": 1.12, + "grad_norm": 0.7050864696502686, + "learning_rate": 4.961576010783061e-05, + "loss": 1.3641, + "step": 3735 + }, + { + "epoch": 1.12, + "grad_norm": 0.7158265709877014, + "learning_rate": 4.961473331675096e-05, + "loss": 1.3355, + "step": 3740 + }, + { + "epoch": 1.12, + "grad_norm": 1.8010573387145996, + "learning_rate": 4.961370516622518e-05, + "loss": 1.3238, + "step": 3745 + }, + { + "epoch": 1.12, + "grad_norm": 0.9509638547897339, + "learning_rate": 4.961267565631004e-05, + "loss": 1.4979, + "step": 3750 + }, + { + "epoch": 1.12, + "grad_norm": 0.8768844604492188, + "learning_rate": 4.96116447870624e-05, + "loss": 1.3485, + "step": 3755 + }, + { + "epoch": 1.12, + "grad_norm": 1.029345989227295, + "learning_rate": 4.961061255853921e-05, + "loss": 1.4198, + "step": 3760 + }, + { + "epoch": 1.13, + "grad_norm": 1.0226683616638184, + "learning_rate": 4.960957897079748e-05, + "loss": 1.501, + "step": 3765 + }, + { + "epoch": 1.13, + "grad_norm": 1.1053155660629272, + "learning_rate": 4.9608544023894276e-05, + "loss": 1.5134, + "step": 3770 + }, + { + "epoch": 1.13, + "grad_norm": 0.7653586864471436, + "learning_rate": 4.960750771788676e-05, + "loss": 1.3514, + "step": 3775 + }, + { + "epoch": 1.13, + "grad_norm": 1.1900826692581177, + "learning_rate": 4.9606470052832174e-05, + "loss": 1.3914, + "step": 3780 + }, + { + "epoch": 1.13, + "grad_norm": 0.7287219762802124, + "learning_rate": 4.960543102878782e-05, + "loss": 1.2752, + "step": 3785 + }, + { + "epoch": 1.13, + "grad_norm": 0.8212308287620544, + "learning_rate": 4.96043906458111e-05, + "loss": 1.3529, + "step": 3790 + }, + { + "epoch": 1.14, + "grad_norm": 1.0006697177886963, + "learning_rate": 4.960334890395944e-05, + "loss": 1.2957, + "step": 3795 + }, + { + "epoch": 1.14, + "grad_norm": 0.5815529823303223, + "learning_rate": 4.96023058032904e-05, + "loss": 1.6203, + "step": 3800 + }, + { + "epoch": 1.14, + "grad_norm": 1.1902085542678833, + "learning_rate": 4.9601261343861586e-05, + "loss": 1.5133, + "step": 3805 + }, + { + "epoch": 1.14, + "grad_norm": 0.7083724141120911, + "learning_rate": 4.960021552573068e-05, + "loss": 1.415, + "step": 3810 + }, + { + "epoch": 1.14, + "grad_norm": 1.382954716682434, + "learning_rate": 4.959916834895544e-05, + "loss": 1.4189, + "step": 3815 + }, + { + "epoch": 1.14, + "grad_norm": 1.2294280529022217, + "learning_rate": 4.95981198135937e-05, + "loss": 1.5006, + "step": 3820 + }, + { + "epoch": 1.14, + "grad_norm": 0.6995631456375122, + "learning_rate": 4.9597069919703375e-05, + "loss": 1.5182, + "step": 3825 + }, + { + "epoch": 1.15, + "grad_norm": 0.8354824185371399, + "learning_rate": 4.959601866734245e-05, + "loss": 1.4081, + "step": 3830 + }, + { + "epoch": 1.15, + "grad_norm": 2.210512399673462, + "learning_rate": 4.959496605656897e-05, + "loss": 1.428, + "step": 3835 + }, + { + "epoch": 1.15, + "grad_norm": 0.7832702994346619, + "learning_rate": 4.959391208744108e-05, + "loss": 1.2983, + "step": 3840 + }, + { + "epoch": 1.15, + "grad_norm": 0.9913911819458008, + "learning_rate": 4.959285676001699e-05, + "loss": 1.4373, + "step": 3845 + }, + { + "epoch": 1.15, + "grad_norm": 0.5717589855194092, + "learning_rate": 4.9591800074354987e-05, + "loss": 1.446, + "step": 3850 + }, + { + "epoch": 1.15, + "grad_norm": 0.780748188495636, + "learning_rate": 4.959074203051343e-05, + "loss": 1.2086, + "step": 3855 + }, + { + "epoch": 1.15, + "grad_norm": 1.7067710161209106, + "learning_rate": 4.958968262855075e-05, + "loss": 1.4519, + "step": 3860 + }, + { + "epoch": 1.16, + "grad_norm": 1.3409600257873535, + "learning_rate": 4.958862186852545e-05, + "loss": 1.3027, + "step": 3865 + }, + { + "epoch": 1.16, + "grad_norm": 1.7507779598236084, + "learning_rate": 4.9587559750496135e-05, + "loss": 1.415, + "step": 3870 + }, + { + "epoch": 1.16, + "grad_norm": 0.7299233078956604, + "learning_rate": 4.9586496274521446e-05, + "loss": 1.3096, + "step": 3875 + }, + { + "epoch": 1.16, + "grad_norm": 0.76593017578125, + "learning_rate": 4.958543144066012e-05, + "loss": 1.4701, + "step": 3880 + }, + { + "epoch": 1.16, + "grad_norm": 1.9224169254302979, + "learning_rate": 4.958436524897098e-05, + "loss": 1.4677, + "step": 3885 + }, + { + "epoch": 1.16, + "grad_norm": 0.777151882648468, + "learning_rate": 4.95832976995129e-05, + "loss": 1.4735, + "step": 3890 + }, + { + "epoch": 1.17, + "grad_norm": 1.2696973085403442, + "learning_rate": 4.958222879234483e-05, + "loss": 1.4598, + "step": 3895 + }, + { + "epoch": 1.17, + "grad_norm": 1.122017741203308, + "learning_rate": 4.958115852752583e-05, + "loss": 1.2936, + "step": 3900 + }, + { + "epoch": 1.17, + "grad_norm": 1.7346524000167847, + "learning_rate": 4.9580086905114984e-05, + "loss": 1.4084, + "step": 3905 + }, + { + "epoch": 1.17, + "grad_norm": 1.49281907081604, + "learning_rate": 4.957901392517149e-05, + "loss": 1.2421, + "step": 3910 + }, + { + "epoch": 1.17, + "grad_norm": 0.9897387027740479, + "learning_rate": 4.957793958775461e-05, + "loss": 1.4164, + "step": 3915 + }, + { + "epoch": 1.17, + "grad_norm": 1.001846432685852, + "learning_rate": 4.9576863892923676e-05, + "loss": 1.1946, + "step": 3920 + }, + { + "epoch": 1.17, + "grad_norm": 0.9374973773956299, + "learning_rate": 4.9575786840738085e-05, + "loss": 1.4949, + "step": 3925 + }, + { + "epoch": 1.18, + "grad_norm": 1.216041088104248, + "learning_rate": 4.957470843125734e-05, + "loss": 1.4853, + "step": 3930 + }, + { + "epoch": 1.18, + "grad_norm": 1.8419601917266846, + "learning_rate": 4.9573628664540985e-05, + "loss": 1.2595, + "step": 3935 + }, + { + "epoch": 1.18, + "grad_norm": 3.0801103115081787, + "learning_rate": 4.957254754064867e-05, + "loss": 1.2968, + "step": 3940 + }, + { + "epoch": 1.18, + "grad_norm": 1.5771223306655884, + "learning_rate": 4.9571465059640094e-05, + "loss": 1.3501, + "step": 3945 + }, + { + "epoch": 1.18, + "grad_norm": 1.0766230821609497, + "learning_rate": 4.957038122157504e-05, + "loss": 1.4484, + "step": 3950 + }, + { + "epoch": 1.18, + "grad_norm": 1.5095306634902954, + "learning_rate": 4.9569296026513374e-05, + "loss": 1.3842, + "step": 3955 + }, + { + "epoch": 1.18, + "grad_norm": 1.0730971097946167, + "learning_rate": 4.956820947451503e-05, + "loss": 1.3855, + "step": 3960 + }, + { + "epoch": 1.19, + "grad_norm": 1.0428813695907593, + "learning_rate": 4.956712156564001e-05, + "loss": 1.4501, + "step": 3965 + }, + { + "epoch": 1.19, + "grad_norm": 1.020739197731018, + "learning_rate": 4.9566032299948394e-05, + "loss": 1.2739, + "step": 3970 + }, + { + "epoch": 1.19, + "grad_norm": 0.895963191986084, + "learning_rate": 4.956494167750036e-05, + "loss": 1.2692, + "step": 3975 + }, + { + "epoch": 1.19, + "grad_norm": 0.5474923849105835, + "learning_rate": 4.956384969835612e-05, + "loss": 1.348, + "step": 3980 + }, + { + "epoch": 1.19, + "grad_norm": 0.9486867785453796, + "learning_rate": 4.956275636257601e-05, + "loss": 1.3352, + "step": 3985 + }, + { + "epoch": 1.19, + "grad_norm": 0.9013283848762512, + "learning_rate": 4.9561661670220386e-05, + "loss": 1.4037, + "step": 3990 + }, + { + "epoch": 1.2, + "grad_norm": 0.8151796460151672, + "learning_rate": 4.956056562134972e-05, + "loss": 1.383, + "step": 3995 + }, + { + "epoch": 1.2, + "grad_norm": 1.0700517892837524, + "learning_rate": 4.955946821602455e-05, + "loss": 1.4735, + "step": 4000 + }, + { + "epoch": 1.2, + "grad_norm": 0.7493061423301697, + "learning_rate": 4.9558369454305476e-05, + "loss": 1.4365, + "step": 4005 + }, + { + "epoch": 1.2, + "grad_norm": 0.9456654191017151, + "learning_rate": 4.955726933625318e-05, + "loss": 1.2838, + "step": 4010 + }, + { + "epoch": 1.2, + "grad_norm": 1.0961682796478271, + "learning_rate": 4.955616786192843e-05, + "loss": 1.3222, + "step": 4015 + }, + { + "epoch": 1.2, + "grad_norm": 1.0026607513427734, + "learning_rate": 4.9555065031392044e-05, + "loss": 1.3279, + "step": 4020 + }, + { + "epoch": 1.2, + "grad_norm": 1.1015264987945557, + "learning_rate": 4.955396084470495e-05, + "loss": 1.2958, + "step": 4025 + }, + { + "epoch": 1.21, + "grad_norm": 1.213651180267334, + "learning_rate": 4.9552855301928114e-05, + "loss": 1.3827, + "step": 4030 + }, + { + "epoch": 1.21, + "grad_norm": 0.8415979743003845, + "learning_rate": 4.9551748403122604e-05, + "loss": 1.3701, + "step": 4035 + }, + { + "epoch": 1.21, + "grad_norm": 1.27717924118042, + "learning_rate": 4.955064014834955e-05, + "loss": 1.2613, + "step": 4040 + }, + { + "epoch": 1.21, + "grad_norm": 0.7791318297386169, + "learning_rate": 4.954953053767016e-05, + "loss": 1.2606, + "step": 4045 + }, + { + "epoch": 1.21, + "grad_norm": 0.8190057277679443, + "learning_rate": 4.954841957114572e-05, + "loss": 1.3877, + "step": 4050 + }, + { + "epoch": 1.21, + "grad_norm": 1.4794509410858154, + "learning_rate": 4.954730724883757e-05, + "loss": 1.3563, + "step": 4055 + }, + { + "epoch": 1.21, + "grad_norm": 0.701057493686676, + "learning_rate": 4.954619357080717e-05, + "loss": 1.3641, + "step": 4060 + }, + { + "epoch": 1.22, + "grad_norm": 0.7502091526985168, + "learning_rate": 4.954507853711601e-05, + "loss": 1.3876, + "step": 4065 + }, + { + "epoch": 1.22, + "grad_norm": 1.5233298540115356, + "learning_rate": 4.9543962147825675e-05, + "loss": 1.3748, + "step": 4070 + }, + { + "epoch": 1.22, + "grad_norm": 0.7826765775680542, + "learning_rate": 4.954284440299782e-05, + "loss": 1.3566, + "step": 4075 + }, + { + "epoch": 1.22, + "grad_norm": 0.7345137596130371, + "learning_rate": 4.9541725302694185e-05, + "loss": 1.3793, + "step": 4080 + }, + { + "epoch": 1.22, + "grad_norm": 0.9479917883872986, + "learning_rate": 4.954060484697657e-05, + "loss": 1.4317, + "step": 4085 + }, + { + "epoch": 1.22, + "grad_norm": 1.6998075246810913, + "learning_rate": 4.9539483035906854e-05, + "loss": 1.5301, + "step": 4090 + }, + { + "epoch": 1.23, + "grad_norm": 0.9664790034294128, + "learning_rate": 4.9538359869546996e-05, + "loss": 1.4804, + "step": 4095 + }, + { + "epoch": 1.23, + "grad_norm": 0.9688673615455627, + "learning_rate": 4.9537235347959034e-05, + "loss": 1.3919, + "step": 4100 + }, + { + "epoch": 1.23, + "grad_norm": 0.8500693440437317, + "learning_rate": 4.953610947120506e-05, + "loss": 1.4584, + "step": 4105 + }, + { + "epoch": 1.23, + "grad_norm": 1.2263480424880981, + "learning_rate": 4.953498223934727e-05, + "loss": 1.3839, + "step": 4110 + }, + { + "epoch": 1.23, + "grad_norm": 0.9976312518119812, + "learning_rate": 4.953385365244791e-05, + "loss": 1.4062, + "step": 4115 + }, + { + "epoch": 1.23, + "grad_norm": 1.165010690689087, + "learning_rate": 4.953272371056933e-05, + "loss": 1.3283, + "step": 4120 + }, + { + "epoch": 1.23, + "grad_norm": NaN, + "learning_rate": 4.953181878152334e-05, + "loss": 1.3901, + "step": 4125 + }, + { + "epoch": 1.24, + "grad_norm": 0.8692017793655396, + "learning_rate": 4.9530686400839435e-05, + "loss": 1.4443, + "step": 4130 + }, + { + "epoch": 1.24, + "grad_norm": 1.5308072566986084, + "learning_rate": 4.952955266535122e-05, + "loss": 1.3798, + "step": 4135 + }, + { + "epoch": 1.24, + "grad_norm": 1.0614334344863892, + "learning_rate": 4.952841757512131e-05, + "loss": 1.2199, + "step": 4140 + }, + { + "epoch": 1.24, + "grad_norm": 0.5995301604270935, + "learning_rate": 4.952728113021239e-05, + "loss": 1.3149, + "step": 4145 + }, + { + "epoch": 1.24, + "grad_norm": 0.9404197335243225, + "learning_rate": 4.9526143330687224e-05, + "loss": 1.4628, + "step": 4150 + }, + { + "epoch": 1.24, + "grad_norm": 1.0483546257019043, + "learning_rate": 4.9525004176608655e-05, + "loss": 1.4337, + "step": 4155 + }, + { + "epoch": 1.24, + "grad_norm": 1.1325901746749878, + "learning_rate": 4.9523863668039606e-05, + "loss": 1.3896, + "step": 4160 + }, + { + "epoch": 1.25, + "grad_norm": 2.8560702800750732, + "learning_rate": 4.952272180504306e-05, + "loss": 1.5117, + "step": 4165 + }, + { + "epoch": 1.25, + "grad_norm": 2.0659074783325195, + "learning_rate": 4.9521578587682074e-05, + "loss": 1.4522, + "step": 4170 + }, + { + "epoch": 1.25, + "grad_norm": 0.6918451189994812, + "learning_rate": 4.952043401601979e-05, + "loss": 1.2501, + "step": 4175 + }, + { + "epoch": 1.25, + "grad_norm": 1.1893597841262817, + "learning_rate": 4.951928809011942e-05, + "loss": 1.507, + "step": 4180 + }, + { + "epoch": 1.25, + "grad_norm": 1.9000897407531738, + "learning_rate": 4.951814081004426e-05, + "loss": 1.3378, + "step": 4185 + }, + { + "epoch": 1.25, + "grad_norm": 1.5121021270751953, + "learning_rate": 4.9516992175857665e-05, + "loss": 1.2653, + "step": 4190 + }, + { + "epoch": 1.26, + "grad_norm": 0.9484086632728577, + "learning_rate": 4.9515842187623076e-05, + "loss": 1.304, + "step": 4195 + }, + { + "epoch": 1.26, + "grad_norm": 0.6229005455970764, + "learning_rate": 4.951469084540401e-05, + "loss": 1.3531, + "step": 4200 + }, + { + "epoch": 1.26, + "grad_norm": 0.877293050289154, + "learning_rate": 4.951353814926405e-05, + "loss": 1.2548, + "step": 4205 + }, + { + "epoch": 1.26, + "grad_norm": 0.8774591088294983, + "learning_rate": 4.9512384099266854e-05, + "loss": 1.3658, + "step": 4210 + }, + { + "epoch": 1.26, + "grad_norm": 1.7639179229736328, + "learning_rate": 4.9511228695476165e-05, + "loss": 1.4088, + "step": 4215 + }, + { + "epoch": 1.26, + "grad_norm": 0.9273248314857483, + "learning_rate": 4.9510071937955794e-05, + "loss": 1.3634, + "step": 4220 + }, + { + "epoch": 1.26, + "grad_norm": 2.5506839752197266, + "learning_rate": 4.950891382676963e-05, + "loss": 1.4951, + "step": 4225 + }, + { + "epoch": 1.27, + "grad_norm": 1.1920723915100098, + "learning_rate": 4.9507754361981625e-05, + "loss": 1.3341, + "step": 4230 + }, + { + "epoch": 1.27, + "grad_norm": 0.6930038332939148, + "learning_rate": 4.9506593543655824e-05, + "loss": 1.4112, + "step": 4235 + }, + { + "epoch": 1.27, + "grad_norm": 0.8218011260032654, + "learning_rate": 4.9505431371856334e-05, + "loss": 1.2737, + "step": 4240 + }, + { + "epoch": 1.27, + "grad_norm": 1.1834139823913574, + "learning_rate": 4.950426784664734e-05, + "loss": 1.4452, + "step": 4245 + }, + { + "epoch": 1.27, + "grad_norm": 0.9107833504676819, + "learning_rate": 4.950310296809311e-05, + "loss": 1.3029, + "step": 4250 + }, + { + "epoch": 1.27, + "grad_norm": 0.9917646646499634, + "learning_rate": 4.950193673625796e-05, + "loss": 1.181, + "step": 4255 + }, + { + "epoch": 1.27, + "grad_norm": 0.8065804243087769, + "learning_rate": 4.9500769151206325e-05, + "loss": 1.3394, + "step": 4260 + }, + { + "epoch": 1.28, + "grad_norm": 1.371374249458313, + "learning_rate": 4.9499600213002673e-05, + "loss": 1.374, + "step": 4265 + }, + { + "epoch": 1.28, + "grad_norm": 1.1488021612167358, + "learning_rate": 4.9498429921711566e-05, + "loss": 1.3386, + "step": 4270 + }, + { + "epoch": 1.28, + "grad_norm": 0.6660547256469727, + "learning_rate": 4.9497258277397635e-05, + "loss": 1.4149, + "step": 4275 + }, + { + "epoch": 1.28, + "grad_norm": 1.1452709436416626, + "learning_rate": 4.94960852801256e-05, + "loss": 1.5156, + "step": 4280 + }, + { + "epoch": 1.28, + "grad_norm": 0.7714586853981018, + "learning_rate": 4.949491092996023e-05, + "loss": 1.3724, + "step": 4285 + }, + { + "epoch": 1.28, + "grad_norm": 1.2154569625854492, + "learning_rate": 4.94937352269664e-05, + "loss": 1.3114, + "step": 4290 + }, + { + "epoch": 1.29, + "grad_norm": 1.180679440498352, + "learning_rate": 4.949255817120903e-05, + "loss": 1.4339, + "step": 4295 + }, + { + "epoch": 1.29, + "grad_norm": 1.1234863996505737, + "learning_rate": 4.949137976275312e-05, + "loss": 1.238, + "step": 4300 + }, + { + "epoch": 1.29, + "grad_norm": 1.8907201290130615, + "learning_rate": 4.949020000166378e-05, + "loss": 1.1503, + "step": 4305 + }, + { + "epoch": 1.29, + "grad_norm": 0.9505663514137268, + "learning_rate": 4.9489018888006136e-05, + "loss": 1.3278, + "step": 4310 + }, + { + "epoch": 1.29, + "grad_norm": 2.0911295413970947, + "learning_rate": 4.948783642184544e-05, + "loss": 1.3792, + "step": 4315 + }, + { + "epoch": 1.29, + "grad_norm": 1.295822262763977, + "learning_rate": 4.948665260324699e-05, + "loss": 1.4859, + "step": 4320 + }, + { + "epoch": 1.29, + "grad_norm": 0.6218513250350952, + "learning_rate": 4.948546743227617e-05, + "loss": 1.2983, + "step": 4325 + }, + { + "epoch": 1.3, + "grad_norm": 0.901931881904602, + "learning_rate": 4.948428090899844e-05, + "loss": 1.4346, + "step": 4330 + }, + { + "epoch": 1.3, + "grad_norm": 2.49190616607666, + "learning_rate": 4.9483093033479324e-05, + "loss": 1.245, + "step": 4335 + }, + { + "epoch": 1.3, + "grad_norm": 0.7503476738929749, + "learning_rate": 4.948190380578442e-05, + "loss": 1.3196, + "step": 4340 + }, + { + "epoch": 1.3, + "grad_norm": 0.7911471724510193, + "learning_rate": 4.948071322597942e-05, + "loss": 1.3248, + "step": 4345 + }, + { + "epoch": 1.3, + "grad_norm": 2.4888834953308105, + "learning_rate": 4.947952129413008e-05, + "loss": 1.4045, + "step": 4350 + }, + { + "epoch": 1.3, + "grad_norm": 0.8818658590316772, + "learning_rate": 4.9478328010302225e-05, + "loss": 1.4385, + "step": 4355 + }, + { + "epoch": 1.3, + "grad_norm": 1.1271659135818481, + "learning_rate": 4.947713337456175e-05, + "loss": 1.3797, + "step": 4360 + }, + { + "epoch": 1.31, + "grad_norm": 4.77896785736084, + "learning_rate": 4.9475937386974645e-05, + "loss": 1.4799, + "step": 4365 + }, + { + "epoch": 1.31, + "grad_norm": 0.8201467394828796, + "learning_rate": 4.947474004760696e-05, + "loss": 1.4012, + "step": 4370 + }, + { + "epoch": 1.31, + "grad_norm": 1.4401566982269287, + "learning_rate": 4.947354135652482e-05, + "loss": 1.3238, + "step": 4375 + }, + { + "epoch": 1.31, + "grad_norm": 0.7641358971595764, + "learning_rate": 4.947234131379444e-05, + "loss": 1.4674, + "step": 4380 + }, + { + "epoch": 1.31, + "grad_norm": 1.366176962852478, + "learning_rate": 4.947113991948207e-05, + "loss": 1.3375, + "step": 4385 + }, + { + "epoch": 1.31, + "grad_norm": 1.671121597290039, + "learning_rate": 4.9469937173654094e-05, + "loss": 1.251, + "step": 4390 + }, + { + "epoch": 1.31, + "grad_norm": 0.6068382859230042, + "learning_rate": 4.9468733076376906e-05, + "loss": 1.4187, + "step": 4395 + }, + { + "epoch": 1.32, + "grad_norm": 0.8216054439544678, + "learning_rate": 4.9467527627717036e-05, + "loss": 1.3438, + "step": 4400 + }, + { + "epoch": 1.32, + "grad_norm": 0.8097289800643921, + "learning_rate": 4.946632082774104e-05, + "loss": 1.5979, + "step": 4405 + }, + { + "epoch": 1.32, + "grad_norm": 1.1140400171279907, + "learning_rate": 4.946511267651559e-05, + "loss": 1.2796, + "step": 4410 + }, + { + "epoch": 1.32, + "grad_norm": 0.9381734728813171, + "learning_rate": 4.9463903174107386e-05, + "loss": 1.3549, + "step": 4415 + }, + { + "epoch": 1.32, + "grad_norm": 1.5891177654266357, + "learning_rate": 4.9462692320583236e-05, + "loss": 1.4088, + "step": 4420 + }, + { + "epoch": 1.32, + "grad_norm": 2.4111642837524414, + "learning_rate": 4.946148011601003e-05, + "loss": 1.32, + "step": 4425 + }, + { + "epoch": 1.33, + "grad_norm": 1.5155541896820068, + "learning_rate": 4.9460266560454696e-05, + "loss": 1.2985, + "step": 4430 + }, + { + "epoch": 1.33, + "grad_norm": 1.3506906032562256, + "learning_rate": 4.9459051653984253e-05, + "loss": 1.2462, + "step": 4435 + }, + { + "epoch": 1.33, + "grad_norm": 2.700868844985962, + "learning_rate": 4.945783539666583e-05, + "loss": 1.5184, + "step": 4440 + }, + { + "epoch": 1.33, + "grad_norm": 1.9799214601516724, + "learning_rate": 4.9456617788566576e-05, + "loss": 1.4099, + "step": 4445 + }, + { + "epoch": 1.33, + "grad_norm": 1.3236010074615479, + "learning_rate": 4.945539882975373e-05, + "loss": 1.3994, + "step": 4450 + }, + { + "epoch": 1.33, + "grad_norm": 3.069741725921631, + "learning_rate": 4.9454178520294634e-05, + "loss": 1.5627, + "step": 4455 + }, + { + "epoch": 1.33, + "grad_norm": 3.1382434368133545, + "learning_rate": 4.9452956860256685e-05, + "loss": 1.415, + "step": 4460 + }, + { + "epoch": 1.34, + "grad_norm": 0.9453111886978149, + "learning_rate": 4.945173384970734e-05, + "loss": 1.3312, + "step": 4465 + }, + { + "epoch": 1.34, + "grad_norm": 1.3048676252365112, + "learning_rate": 4.9450509488714146e-05, + "loss": 1.3344, + "step": 4470 + }, + { + "epoch": 1.34, + "grad_norm": 0.9417716860771179, + "learning_rate": 4.9449283777344736e-05, + "loss": 1.4794, + "step": 4475 + }, + { + "epoch": 1.34, + "grad_norm": 2.0928821563720703, + "learning_rate": 4.944805671566679e-05, + "loss": 1.5453, + "step": 4480 + }, + { + "epoch": 1.34, + "grad_norm": 2.025001049041748, + "learning_rate": 4.944682830374809e-05, + "loss": 1.3634, + "step": 4485 + }, + { + "epoch": 1.34, + "grad_norm": 1.4281930923461914, + "learning_rate": 4.944559854165647e-05, + "loss": 1.2763, + "step": 4490 + }, + { + "epoch": 1.34, + "grad_norm": 1.3360381126403809, + "learning_rate": 4.944436742945985e-05, + "loss": 1.4286, + "step": 4495 + }, + { + "epoch": 1.35, + "grad_norm": 1.891915202140808, + "learning_rate": 4.944313496722623e-05, + "loss": 1.2857, + "step": 4500 + }, + { + "epoch": 1.35, + "grad_norm": 1.244935154914856, + "learning_rate": 4.944190115502367e-05, + "loss": 1.3384, + "step": 4505 + }, + { + "epoch": 1.35, + "grad_norm": 1.4173117876052856, + "learning_rate": 4.944066599292032e-05, + "loss": 1.4777, + "step": 4510 + }, + { + "epoch": 1.35, + "grad_norm": 1.03322434425354, + "learning_rate": 4.943942948098439e-05, + "loss": 1.4247, + "step": 4515 + }, + { + "epoch": 1.35, + "grad_norm": 0.9695542454719543, + "learning_rate": 4.943819161928417e-05, + "loss": 1.3148, + "step": 4520 + }, + { + "epoch": 1.35, + "grad_norm": 1.5252701044082642, + "learning_rate": 4.943695240788803e-05, + "loss": 1.4396, + "step": 4525 + }, + { + "epoch": 1.36, + "grad_norm": 1.4560002088546753, + "learning_rate": 4.943571184686441e-05, + "loss": 1.5466, + "step": 4530 + }, + { + "epoch": 1.36, + "grad_norm": 1.7746248245239258, + "learning_rate": 4.9434469936281825e-05, + "loss": 1.4468, + "step": 4535 + }, + { + "epoch": 1.36, + "grad_norm": 0.7101891040802002, + "learning_rate": 4.943322667620886e-05, + "loss": 1.3618, + "step": 4540 + }, + { + "epoch": 1.36, + "grad_norm": 0.8597649931907654, + "learning_rate": 4.9431982066714186e-05, + "loss": 1.3144, + "step": 4545 + }, + { + "epoch": 1.36, + "grad_norm": 1.0961127281188965, + "learning_rate": 4.9430736107866535e-05, + "loss": 1.3769, + "step": 4550 + }, + { + "epoch": 1.36, + "grad_norm": 1.1628168821334839, + "learning_rate": 4.942948879973472e-05, + "loss": 1.427, + "step": 4555 + }, + { + "epoch": 1.36, + "grad_norm": 0.946785569190979, + "learning_rate": 4.9428240142387646e-05, + "loss": 1.4057, + "step": 4560 + }, + { + "epoch": 1.37, + "grad_norm": 0.9042614698410034, + "learning_rate": 4.9426990135894245e-05, + "loss": 1.3838, + "step": 4565 + }, + { + "epoch": 1.37, + "grad_norm": 1.2647825479507446, + "learning_rate": 4.942573878032358e-05, + "loss": 1.3895, + "step": 4570 + }, + { + "epoch": 1.37, + "grad_norm": 1.29331636428833, + "learning_rate": 4.9424486075744745e-05, + "loss": 1.4304, + "step": 4575 + }, + { + "epoch": 1.37, + "grad_norm": 1.3520721197128296, + "learning_rate": 4.9423232022226936e-05, + "loss": 1.2225, + "step": 4580 + }, + { + "epoch": 1.37, + "grad_norm": 0.6012137532234192, + "learning_rate": 4.942197661983941e-05, + "loss": 1.3025, + "step": 4585 + }, + { + "epoch": 1.37, + "grad_norm": 0.9096260070800781, + "learning_rate": 4.9420719868651496e-05, + "loss": 1.3349, + "step": 4590 + }, + { + "epoch": 1.37, + "grad_norm": 1.007503628730774, + "learning_rate": 4.941946176873261e-05, + "loss": 1.3936, + "step": 4595 + }, + { + "epoch": 1.38, + "grad_norm": 1.1042200326919556, + "learning_rate": 4.941820232015223e-05, + "loss": 1.3694, + "step": 4600 + }, + { + "epoch": 1.38, + "grad_norm": 0.5444115400314331, + "learning_rate": 4.941694152297992e-05, + "loss": 1.3672, + "step": 4605 + }, + { + "epoch": 1.38, + "grad_norm": 0.9802212715148926, + "learning_rate": 4.9415679377285305e-05, + "loss": 1.4417, + "step": 4610 + }, + { + "epoch": 1.38, + "grad_norm": 0.9913745522499084, + "learning_rate": 4.9414415883138106e-05, + "loss": 1.3331, + "step": 4615 + }, + { + "epoch": 1.38, + "grad_norm": 1.2390682697296143, + "learning_rate": 4.941315104060808e-05, + "loss": 1.4843, + "step": 4620 + }, + { + "epoch": 1.38, + "grad_norm": 0.7250832915306091, + "learning_rate": 4.941188484976512e-05, + "loss": 1.4415, + "step": 4625 + }, + { + "epoch": 1.39, + "grad_norm": 0.6558685898780823, + "learning_rate": 4.941061731067912e-05, + "loss": 1.3587, + "step": 4630 + }, + { + "epoch": 1.39, + "grad_norm": 2.1380136013031006, + "learning_rate": 4.94093484234201e-05, + "loss": 1.3663, + "step": 4635 + }, + { + "epoch": 1.39, + "grad_norm": 2.294971227645874, + "learning_rate": 4.9408078188058145e-05, + "loss": 1.402, + "step": 4640 + }, + { + "epoch": 1.39, + "grad_norm": 2.2890818119049072, + "learning_rate": 4.940680660466339e-05, + "loss": 1.3872, + "step": 4645 + }, + { + "epoch": 1.39, + "grad_norm": 0.7086050510406494, + "learning_rate": 4.9405533673306094e-05, + "loss": 1.4863, + "step": 4650 + }, + { + "epoch": 1.39, + "grad_norm": 1.047402024269104, + "learning_rate": 4.940425939405653e-05, + "loss": 1.4587, + "step": 4655 + }, + { + "epoch": 1.39, + "grad_norm": 1.5536178350448608, + "learning_rate": 4.940298376698508e-05, + "loss": 1.333, + "step": 4660 + }, + { + "epoch": 1.4, + "grad_norm": 1.2016557455062866, + "learning_rate": 4.9401706792162215e-05, + "loss": 1.3375, + "step": 4665 + }, + { + "epoch": 1.4, + "grad_norm": 1.0145758390426636, + "learning_rate": 4.940042846965844e-05, + "loss": 1.3985, + "step": 4670 + }, + { + "epoch": 1.4, + "grad_norm": 1.1620700359344482, + "learning_rate": 4.939914879954437e-05, + "loss": 1.4384, + "step": 4675 + }, + { + "epoch": 1.4, + "grad_norm": 0.6000904440879822, + "learning_rate": 4.9397867781890665e-05, + "loss": 1.3831, + "step": 4680 + }, + { + "epoch": 1.4, + "grad_norm": 1.2584675550460815, + "learning_rate": 4.939658541676809e-05, + "loss": 1.3704, + "step": 4685 + }, + { + "epoch": 1.4, + "grad_norm": 1.9861582517623901, + "learning_rate": 4.939530170424745e-05, + "loss": 1.4437, + "step": 4690 + }, + { + "epoch": 1.4, + "grad_norm": 1.411476492881775, + "learning_rate": 4.9394016644399666e-05, + "loss": 1.3568, + "step": 4695 + }, + { + "epoch": 1.41, + "grad_norm": 0.8464270234107971, + "learning_rate": 4.93927302372957e-05, + "loss": 1.414, + "step": 4700 + }, + { + "epoch": 1.41, + "grad_norm": 1.0788897275924683, + "learning_rate": 4.939144248300659e-05, + "loss": 1.4118, + "step": 4705 + }, + { + "epoch": 1.41, + "grad_norm": 1.8137201070785522, + "learning_rate": 4.9390153381603466e-05, + "loss": 1.4496, + "step": 4710 + }, + { + "epoch": 1.41, + "grad_norm": 0.9756670594215393, + "learning_rate": 4.938886293315753e-05, + "loss": 1.4993, + "step": 4715 + }, + { + "epoch": 1.41, + "grad_norm": 2.05425763130188, + "learning_rate": 4.938757113774004e-05, + "loss": 1.2575, + "step": 4720 + }, + { + "epoch": 1.41, + "grad_norm": 1.3005917072296143, + "learning_rate": 4.938627799542235e-05, + "loss": 1.3646, + "step": 4725 + }, + { + "epoch": 1.42, + "grad_norm": 0.8430652618408203, + "learning_rate": 4.9384983506275864e-05, + "loss": 1.4074, + "step": 4730 + }, + { + "epoch": 1.42, + "grad_norm": 0.666009247303009, + "learning_rate": 4.9383687670372094e-05, + "loss": 1.2717, + "step": 4735 + }, + { + "epoch": 1.42, + "grad_norm": 0.5754490494728088, + "learning_rate": 4.9382390487782594e-05, + "loss": 1.3396, + "step": 4740 + }, + { + "epoch": 1.42, + "grad_norm": 0.9268550276756287, + "learning_rate": 4.938109195857902e-05, + "loss": 1.3328, + "step": 4745 + }, + { + "epoch": 1.42, + "grad_norm": 3.1448147296905518, + "learning_rate": 4.9379792082833076e-05, + "loss": 1.4236, + "step": 4750 + }, + { + "epoch": 1.42, + "grad_norm": 0.8515195250511169, + "learning_rate": 4.937849086061656e-05, + "loss": 1.3495, + "step": 4755 + }, + { + "epoch": 1.42, + "grad_norm": 1.5721395015716553, + "learning_rate": 4.937718829200132e-05, + "loss": 1.5374, + "step": 4760 + }, + { + "epoch": 1.43, + "grad_norm": 1.147706389427185, + "learning_rate": 4.9375884377059324e-05, + "loss": 1.2307, + "step": 4765 + }, + { + "epoch": 1.43, + "grad_norm": 0.7554968595504761, + "learning_rate": 4.937457911586256e-05, + "loss": 1.3832, + "step": 4770 + }, + { + "epoch": 1.43, + "grad_norm": 1.0911380052566528, + "learning_rate": 4.9373272508483135e-05, + "loss": 1.2698, + "step": 4775 + }, + { + "epoch": 1.43, + "grad_norm": 1.6409761905670166, + "learning_rate": 4.93719645549932e-05, + "loss": 1.2276, + "step": 4780 + }, + { + "epoch": 1.43, + "grad_norm": 1.3163679838180542, + "learning_rate": 4.9370655255464996e-05, + "loss": 1.3913, + "step": 4785 + }, + { + "epoch": 1.43, + "grad_norm": 0.9681575298309326, + "learning_rate": 4.9369344609970837e-05, + "loss": 1.3934, + "step": 4790 + }, + { + "epoch": 1.43, + "grad_norm": 0.6123712658882141, + "learning_rate": 4.9368032618583106e-05, + "loss": 1.3078, + "step": 4795 + }, + { + "epoch": 1.44, + "grad_norm": 1.4885551929473877, + "learning_rate": 4.9366719281374264e-05, + "loss": 1.3818, + "step": 4800 + }, + { + "epoch": 1.44, + "grad_norm": 0.8683892488479614, + "learning_rate": 4.936540459841684e-05, + "loss": 1.2567, + "step": 4805 + }, + { + "epoch": 1.44, + "grad_norm": 1.2614054679870605, + "learning_rate": 4.936408856978345e-05, + "loss": 1.3863, + "step": 4810 + }, + { + "epoch": 1.44, + "grad_norm": 0.7793049216270447, + "learning_rate": 4.9362771195546767e-05, + "loss": 1.3391, + "step": 4815 + }, + { + "epoch": 1.44, + "grad_norm": 0.7818357944488525, + "learning_rate": 4.936145247577956e-05, + "loss": 1.425, + "step": 4820 + }, + { + "epoch": 1.44, + "grad_norm": 2.959354877471924, + "learning_rate": 4.936013241055465e-05, + "loss": 1.4967, + "step": 4825 + }, + { + "epoch": 1.45, + "grad_norm": 1.2867703437805176, + "learning_rate": 4.935881099994495e-05, + "loss": 1.3091, + "step": 4830 + }, + { + "epoch": 1.45, + "grad_norm": 1.163822889328003, + "learning_rate": 4.935748824402344e-05, + "loss": 1.3299, + "step": 4835 + }, + { + "epoch": 1.45, + "grad_norm": 1.4018052816390991, + "learning_rate": 4.9356164142863174e-05, + "loss": 1.2329, + "step": 4840 + }, + { + "epoch": 1.45, + "grad_norm": 3.0731418132781982, + "learning_rate": 4.935483869653728e-05, + "loss": 1.448, + "step": 4845 + }, + { + "epoch": 1.45, + "grad_norm": 1.6475387811660767, + "learning_rate": 4.9353511905118954e-05, + "loss": 1.3146, + "step": 4850 + }, + { + "epoch": 1.45, + "grad_norm": 0.7880772948265076, + "learning_rate": 4.935218376868149e-05, + "loss": 1.3605, + "step": 4855 + }, + { + "epoch": 1.45, + "grad_norm": 1.6786603927612305, + "learning_rate": 4.9350854287298224e-05, + "loss": 1.2491, + "step": 4860 + }, + { + "epoch": 1.46, + "grad_norm": 0.7267608642578125, + "learning_rate": 4.9349523461042576e-05, + "loss": 1.4106, + "step": 4865 + }, + { + "epoch": 1.46, + "grad_norm": 1.2122886180877686, + "learning_rate": 4.9348191289988064e-05, + "loss": 1.4208, + "step": 4870 + }, + { + "epoch": 1.46, + "grad_norm": 1.0894712209701538, + "learning_rate": 4.934685777420827e-05, + "loss": 1.3931, + "step": 4875 + }, + { + "epoch": 1.46, + "grad_norm": 0.950336217880249, + "learning_rate": 4.934552291377681e-05, + "loss": 1.4107, + "step": 4880 + }, + { + "epoch": 1.46, + "grad_norm": 1.30669367313385, + "learning_rate": 4.934418670876743e-05, + "loss": 1.3144, + "step": 4885 + }, + { + "epoch": 1.46, + "grad_norm": 1.539128065109253, + "learning_rate": 4.934284915925392e-05, + "loss": 1.2911, + "step": 4890 + }, + { + "epoch": 1.46, + "grad_norm": 1.0481332540512085, + "learning_rate": 4.934151026531016e-05, + "loss": 1.3932, + "step": 4895 + }, + { + "epoch": 1.47, + "grad_norm": 0.9544585943222046, + "learning_rate": 4.934017002701009e-05, + "loss": 1.4928, + "step": 4900 + }, + { + "epoch": 1.47, + "grad_norm": 0.628352165222168, + "learning_rate": 4.933882844442773e-05, + "loss": 1.3738, + "step": 4905 + }, + { + "epoch": 1.47, + "grad_norm": 1.2132518291473389, + "learning_rate": 4.9337485517637174e-05, + "loss": 1.3586, + "step": 4910 + }, + { + "epoch": 1.47, + "grad_norm": 1.3063114881515503, + "learning_rate": 4.9336141246712585e-05, + "loss": 1.2829, + "step": 4915 + }, + { + "epoch": 1.47, + "grad_norm": 1.5061261653900146, + "learning_rate": 4.933479563172822e-05, + "loss": 1.3731, + "step": 4920 + }, + { + "epoch": 1.47, + "grad_norm": 1.1727710962295532, + "learning_rate": 4.933344867275837e-05, + "loss": 1.5842, + "step": 4925 + }, + { + "epoch": 1.47, + "grad_norm": 1.4313628673553467, + "learning_rate": 4.9332100369877457e-05, + "loss": 1.3292, + "step": 4930 + }, + { + "epoch": 1.48, + "grad_norm": 0.8247719407081604, + "learning_rate": 4.9330750723159924e-05, + "loss": 1.2706, + "step": 4935 + }, + { + "epoch": 1.48, + "grad_norm": 0.8011229634284973, + "learning_rate": 4.932939973268033e-05, + "loss": 1.5348, + "step": 4940 + }, + { + "epoch": 1.48, + "grad_norm": 1.4439839124679565, + "learning_rate": 4.932804739851327e-05, + "loss": 1.5019, + "step": 4945 + }, + { + "epoch": 1.48, + "grad_norm": 1.8708484172821045, + "learning_rate": 4.9326693720733434e-05, + "loss": 1.3857, + "step": 4950 + }, + { + "epoch": 1.48, + "grad_norm": 1.144667387008667, + "learning_rate": 4.93253386994156e-05, + "loss": 1.3401, + "step": 4955 + }, + { + "epoch": 1.48, + "grad_norm": 0.9807694554328918, + "learning_rate": 4.932398233463459e-05, + "loss": 1.3908, + "step": 4960 + }, + { + "epoch": 1.49, + "grad_norm": 1.2701613903045654, + "learning_rate": 4.932262462646532e-05, + "loss": 1.3377, + "step": 4965 + }, + { + "epoch": 1.49, + "grad_norm": 1.4072773456573486, + "learning_rate": 4.932126557498278e-05, + "loss": 1.4369, + "step": 4970 + }, + { + "epoch": 1.49, + "grad_norm": 0.901105523109436, + "learning_rate": 4.931990518026202e-05, + "loss": 1.4648, + "step": 4975 + }, + { + "epoch": 1.49, + "grad_norm": 1.1365903615951538, + "learning_rate": 4.931854344237816e-05, + "loss": 1.4489, + "step": 4980 + }, + { + "epoch": 1.49, + "grad_norm": 1.307646632194519, + "learning_rate": 4.931718036140644e-05, + "loss": 1.3409, + "step": 4985 + }, + { + "epoch": 1.49, + "grad_norm": 1.375880479812622, + "learning_rate": 4.9315815937422124e-05, + "loss": 1.3656, + "step": 4990 + }, + { + "epoch": 1.49, + "grad_norm": 0.6324434876441956, + "learning_rate": 4.931445017050056e-05, + "loss": 1.4595, + "step": 4995 + }, + { + "epoch": 1.5, + "grad_norm": 1.9427233934402466, + "learning_rate": 4.93130830607172e-05, + "loss": 1.3704, + "step": 5000 + }, + { + "epoch": 1.5, + "grad_norm": 1.6412770748138428, + "learning_rate": 4.931171460814753e-05, + "loss": 1.4681, + "step": 5005 + }, + { + "epoch": 1.5, + "grad_norm": 1.330474615097046, + "learning_rate": 4.931034481286713e-05, + "loss": 1.3628, + "step": 5010 + }, + { + "epoch": 1.5, + "grad_norm": 0.8582301735877991, + "learning_rate": 4.9308973674951656e-05, + "loss": 1.3646, + "step": 5015 + }, + { + "epoch": 1.5, + "grad_norm": 0.6373203992843628, + "learning_rate": 4.9307601194476825e-05, + "loss": 1.2416, + "step": 5020 + }, + { + "epoch": 1.5, + "grad_norm": 1.919718623161316, + "learning_rate": 4.9306227371518455e-05, + "loss": 1.3791, + "step": 5025 + }, + { + "epoch": 1.5, + "grad_norm": 0.8354704976081848, + "learning_rate": 4.9304852206152415e-05, + "loss": 1.393, + "step": 5030 + }, + { + "epoch": 1.51, + "grad_norm": 1.773903489112854, + "learning_rate": 4.9303475698454645e-05, + "loss": 1.283, + "step": 5035 + }, + { + "epoch": 1.51, + "grad_norm": 0.9701683521270752, + "learning_rate": 4.9302097848501176e-05, + "loss": 1.41, + "step": 5040 + }, + { + "epoch": 1.51, + "grad_norm": 1.3933402299880981, + "learning_rate": 4.9300718656368104e-05, + "loss": 1.3723, + "step": 5045 + }, + { + "epoch": 1.51, + "grad_norm": 1.408264398574829, + "learning_rate": 4.92993381221316e-05, + "loss": 1.274, + "step": 5050 + }, + { + "epoch": 1.51, + "grad_norm": 1.3229018449783325, + "learning_rate": 4.929795624586791e-05, + "loss": 1.3723, + "step": 5055 + }, + { + "epoch": 1.51, + "grad_norm": 0.9255488514900208, + "learning_rate": 4.9296573027653353e-05, + "loss": 1.2437, + "step": 5060 + }, + { + "epoch": 1.52, + "grad_norm": 1.3084803819656372, + "learning_rate": 4.9295188467564324e-05, + "loss": 1.2855, + "step": 5065 + }, + { + "epoch": 1.52, + "grad_norm": 1.0244333744049072, + "learning_rate": 4.9293802565677284e-05, + "loss": 1.3545, + "step": 5070 + }, + { + "epoch": 1.52, + "grad_norm": 1.627091646194458, + "learning_rate": 4.9292415322068785e-05, + "loss": 1.443, + "step": 5075 + }, + { + "epoch": 1.52, + "grad_norm": 0.5619713664054871, + "learning_rate": 4.929102673681544e-05, + "loss": 1.54, + "step": 5080 + }, + { + "epoch": 1.52, + "grad_norm": 1.0287528038024902, + "learning_rate": 4.928963680999393e-05, + "loss": 1.3472, + "step": 5085 + }, + { + "epoch": 1.52, + "grad_norm": 0.9265909194946289, + "learning_rate": 4.9288245541681036e-05, + "loss": 1.3337, + "step": 5090 + }, + { + "epoch": 1.52, + "grad_norm": 1.3812994956970215, + "learning_rate": 4.9286852931953576e-05, + "loss": 1.4148, + "step": 5095 + }, + { + "epoch": 1.53, + "grad_norm": 0.8753069043159485, + "learning_rate": 4.928545898088848e-05, + "loss": 1.4186, + "step": 5100 + }, + { + "epoch": 1.53, + "grad_norm": 0.826627254486084, + "learning_rate": 4.928406368856273e-05, + "loss": 1.509, + "step": 5105 + }, + { + "epoch": 1.53, + "grad_norm": 2.941596508026123, + "learning_rate": 4.928266705505338e-05, + "loss": 1.4212, + "step": 5110 + }, + { + "epoch": 1.53, + "grad_norm": 1.5150552988052368, + "learning_rate": 4.928126908043757e-05, + "loss": 1.3308, + "step": 5115 + }, + { + "epoch": 1.53, + "grad_norm": 0.9515051245689392, + "learning_rate": 4.927986976479251e-05, + "loss": 1.3392, + "step": 5120 + }, + { + "epoch": 1.53, + "grad_norm": 1.2691097259521484, + "learning_rate": 4.927846910819548e-05, + "loss": 1.2719, + "step": 5125 + }, + { + "epoch": 1.53, + "grad_norm": 0.8292602896690369, + "learning_rate": 4.927706711072383e-05, + "loss": 1.2852, + "step": 5130 + }, + { + "epoch": 1.54, + "grad_norm": 1.4237391948699951, + "learning_rate": 4.927566377245501e-05, + "loss": 1.5021, + "step": 5135 + }, + { + "epoch": 1.54, + "grad_norm": 0.7673839926719666, + "learning_rate": 4.92742590934665e-05, + "loss": 1.5058, + "step": 5140 + }, + { + "epoch": 1.54, + "grad_norm": 1.159803867340088, + "learning_rate": 4.92728530738359e-05, + "loss": 1.3966, + "step": 5145 + }, + { + "epoch": 1.54, + "grad_norm": 1.0266951322555542, + "learning_rate": 4.927144571364085e-05, + "loss": 1.387, + "step": 5150 + }, + { + "epoch": 1.54, + "grad_norm": 1.0855332612991333, + "learning_rate": 4.927003701295909e-05, + "loss": 1.5594, + "step": 5155 + }, + { + "epoch": 1.54, + "grad_norm": 1.604235053062439, + "learning_rate": 4.92686269718684e-05, + "loss": 1.4042, + "step": 5160 + }, + { + "epoch": 1.55, + "grad_norm": 1.2728831768035889, + "learning_rate": 4.926721559044668e-05, + "loss": 1.3788, + "step": 5165 + }, + { + "epoch": 1.55, + "grad_norm": 1.2990281581878662, + "learning_rate": 4.926580286877187e-05, + "loss": 1.3904, + "step": 5170 + }, + { + "epoch": 1.55, + "grad_norm": 0.8917083144187927, + "learning_rate": 4.926438880692198e-05, + "loss": 1.3893, + "step": 5175 + }, + { + "epoch": 1.55, + "grad_norm": 1.6582956314086914, + "learning_rate": 4.9262973404975124e-05, + "loss": 1.3848, + "step": 5180 + }, + { + "epoch": 1.55, + "grad_norm": 0.7999622225761414, + "learning_rate": 4.9261556663009465e-05, + "loss": 1.5204, + "step": 5185 + }, + { + "epoch": 1.55, + "grad_norm": 1.0454286336898804, + "learning_rate": 4.926013858110326e-05, + "loss": 1.4529, + "step": 5190 + }, + { + "epoch": 1.55, + "grad_norm": 0.8179815411567688, + "learning_rate": 4.92587191593348e-05, + "loss": 1.3022, + "step": 5195 + }, + { + "epoch": 1.56, + "grad_norm": 1.9816410541534424, + "learning_rate": 4.92572983977825e-05, + "loss": 1.3634, + "step": 5200 + }, + { + "epoch": 1.56, + "grad_norm": 0.7995482087135315, + "learning_rate": 4.925587629652483e-05, + "loss": 1.464, + "step": 5205 + }, + { + "epoch": 1.56, + "grad_norm": 1.1661982536315918, + "learning_rate": 4.925445285564032e-05, + "loss": 1.5452, + "step": 5210 + }, + { + "epoch": 1.56, + "grad_norm": 0.9501356482505798, + "learning_rate": 4.9253028075207595e-05, + "loss": 1.2182, + "step": 5215 + }, + { + "epoch": 1.56, + "grad_norm": 1.2546149492263794, + "learning_rate": 4.925160195530534e-05, + "loss": 1.5899, + "step": 5220 + }, + { + "epoch": 1.56, + "grad_norm": 0.617793619632721, + "learning_rate": 4.9250174496012316e-05, + "loss": 1.4278, + "step": 5225 + }, + { + "epoch": 1.56, + "grad_norm": 0.8397419452667236, + "learning_rate": 4.9248745697407353e-05, + "loss": 1.2532, + "step": 5230 + }, + { + "epoch": 1.57, + "grad_norm": 1.4492918252944946, + "learning_rate": 4.924731555956938e-05, + "loss": 1.3777, + "step": 5235 + }, + { + "epoch": 1.57, + "grad_norm": 1.5298033952713013, + "learning_rate": 4.924588408257736e-05, + "loss": 1.3783, + "step": 5240 + }, + { + "epoch": 1.57, + "grad_norm": 0.9098453521728516, + "learning_rate": 4.9244451266510384e-05, + "loss": 1.3461, + "step": 5245 + }, + { + "epoch": 1.57, + "grad_norm": 1.8338463306427002, + "learning_rate": 4.9243017111447556e-05, + "loss": 1.4105, + "step": 5250 + }, + { + "epoch": 1.57, + "grad_norm": 2.729804039001465, + "learning_rate": 4.924158161746809e-05, + "loss": 1.491, + "step": 5255 + }, + { + "epoch": 1.57, + "grad_norm": 3.264284610748291, + "learning_rate": 4.9240144784651265e-05, + "loss": 1.3767, + "step": 5260 + }, + { + "epoch": 1.58, + "grad_norm": 1.7262938022613525, + "learning_rate": 4.923870661307645e-05, + "loss": 1.3735, + "step": 5265 + }, + { + "epoch": 1.58, + "grad_norm": 0.7457590103149414, + "learning_rate": 4.923726710282305e-05, + "loss": 1.485, + "step": 5270 + }, + { + "epoch": 1.58, + "grad_norm": 0.9725492000579834, + "learning_rate": 4.923582625397059e-05, + "loss": 1.393, + "step": 5275 + }, + { + "epoch": 1.58, + "grad_norm": 2.8428640365600586, + "learning_rate": 4.923438406659864e-05, + "loss": 1.3126, + "step": 5280 + }, + { + "epoch": 1.58, + "grad_norm": 1.4272032976150513, + "learning_rate": 4.923294054078684e-05, + "loss": 1.3194, + "step": 5285 + }, + { + "epoch": 1.58, + "grad_norm": 0.783457338809967, + "learning_rate": 4.923149567661492e-05, + "loss": 1.3534, + "step": 5290 + }, + { + "epoch": 1.58, + "grad_norm": 2.0013182163238525, + "learning_rate": 4.9230049474162697e-05, + "loss": 1.3938, + "step": 5295 + }, + { + "epoch": 1.59, + "grad_norm": 0.9497959017753601, + "learning_rate": 4.922860193351002e-05, + "loss": 1.4592, + "step": 5300 + }, + { + "epoch": 1.59, + "grad_norm": 1.3006082773208618, + "learning_rate": 4.922715305473684e-05, + "loss": 1.3387, + "step": 5305 + }, + { + "epoch": 1.59, + "grad_norm": 1.0174909830093384, + "learning_rate": 4.922570283792318e-05, + "loss": 1.3116, + "step": 5310 + }, + { + "epoch": 1.59, + "grad_norm": 1.6044186353683472, + "learning_rate": 4.9224251283149136e-05, + "loss": 1.4901, + "step": 5315 + }, + { + "epoch": 1.59, + "grad_norm": 0.9575896859169006, + "learning_rate": 4.9222798390494874e-05, + "loss": 1.549, + "step": 5320 + }, + { + "epoch": 1.59, + "grad_norm": 0.8215287923812866, + "learning_rate": 4.922134416004063e-05, + "loss": 1.2743, + "step": 5325 + }, + { + "epoch": 1.59, + "grad_norm": 1.3678843975067139, + "learning_rate": 4.9219888591866725e-05, + "loss": 1.3831, + "step": 5330 + }, + { + "epoch": 1.6, + "grad_norm": 1.1509759426116943, + "learning_rate": 4.921843168605355e-05, + "loss": 1.4874, + "step": 5335 + }, + { + "epoch": 1.6, + "grad_norm": 0.6248118281364441, + "learning_rate": 4.921697344268157e-05, + "loss": 1.4002, + "step": 5340 + }, + { + "epoch": 1.6, + "grad_norm": 0.9303049445152283, + "learning_rate": 4.9215513861831316e-05, + "loss": 1.4764, + "step": 5345 + }, + { + "epoch": 1.6, + "grad_norm": 1.2478617429733276, + "learning_rate": 4.92140529435834e-05, + "loss": 1.3353, + "step": 5350 + }, + { + "epoch": 1.6, + "grad_norm": 1.1409193277359009, + "learning_rate": 4.921259068801851e-05, + "loss": 1.4389, + "step": 5355 + }, + { + "epoch": 1.6, + "grad_norm": 3.311739921569824, + "learning_rate": 4.921112709521741e-05, + "loss": 1.456, + "step": 5360 + }, + { + "epoch": 1.61, + "grad_norm": 1.399682879447937, + "learning_rate": 4.920966216526092e-05, + "loss": 1.3554, + "step": 5365 + }, + { + "epoch": 1.61, + "grad_norm": 1.2935621738433838, + "learning_rate": 4.920819589822995e-05, + "loss": 1.2121, + "step": 5370 + }, + { + "epoch": 1.61, + "grad_norm": 0.8797827363014221, + "learning_rate": 4.92067282942055e-05, + "loss": 1.2156, + "step": 5375 + }, + { + "epoch": 1.61, + "grad_norm": 0.6572903990745544, + "learning_rate": 4.9205259353268585e-05, + "loss": 1.4867, + "step": 5380 + }, + { + "epoch": 1.61, + "grad_norm": 1.5460227727890015, + "learning_rate": 4.920378907550037e-05, + "loss": 1.4242, + "step": 5385 + }, + { + "epoch": 1.61, + "grad_norm": 1.2212427854537964, + "learning_rate": 4.9202317460982037e-05, + "loss": 1.3154, + "step": 5390 + }, + { + "epoch": 1.61, + "grad_norm": 1.2694352865219116, + "learning_rate": 4.9200844509794876e-05, + "loss": 1.3032, + "step": 5395 + }, + { + "epoch": 1.62, + "grad_norm": 0.9184812903404236, + "learning_rate": 4.919937022202022e-05, + "loss": 1.3117, + "step": 5400 + }, + { + "epoch": 1.62, + "grad_norm": 1.309106469154358, + "learning_rate": 4.9197894597739505e-05, + "loss": 1.3399, + "step": 5405 + }, + { + "epoch": 1.62, + "grad_norm": 1.238127589225769, + "learning_rate": 4.919641763703422e-05, + "loss": 1.3066, + "step": 5410 + }, + { + "epoch": 1.62, + "grad_norm": 1.8524130582809448, + "learning_rate": 4.919493933998594e-05, + "loss": 1.5195, + "step": 5415 + }, + { + "epoch": 1.62, + "grad_norm": 1.081799030303955, + "learning_rate": 4.919345970667631e-05, + "loss": 1.2854, + "step": 5420 + }, + { + "epoch": 1.62, + "grad_norm": 0.8055438995361328, + "learning_rate": 4.919197873718705e-05, + "loss": 1.1592, + "step": 5425 + }, + { + "epoch": 1.62, + "grad_norm": 1.1697745323181152, + "learning_rate": 4.919049643159995e-05, + "loss": 1.3942, + "step": 5430 + }, + { + "epoch": 1.63, + "grad_norm": 1.0526283979415894, + "learning_rate": 4.918901278999687e-05, + "loss": 1.4919, + "step": 5435 + }, + { + "epoch": 1.63, + "grad_norm": 0.9364911913871765, + "learning_rate": 4.918752781245976e-05, + "loss": 1.2354, + "step": 5440 + }, + { + "epoch": 1.63, + "grad_norm": 1.036669135093689, + "learning_rate": 4.918604149907064e-05, + "loss": 1.4115, + "step": 5445 + }, + { + "epoch": 1.63, + "grad_norm": 1.323757529258728, + "learning_rate": 4.918455384991159e-05, + "loss": 1.4734, + "step": 5450 + }, + { + "epoch": 1.63, + "grad_norm": 1.043481707572937, + "learning_rate": 4.9183064865064756e-05, + "loss": 1.3592, + "step": 5455 + }, + { + "epoch": 1.63, + "grad_norm": 1.0515973567962646, + "learning_rate": 4.918157454461238e-05, + "loss": 1.3578, + "step": 5460 + }, + { + "epoch": 1.64, + "grad_norm": 1.5430241823196411, + "learning_rate": 4.918008288863679e-05, + "loss": 1.5445, + "step": 5465 + }, + { + "epoch": 1.64, + "grad_norm": 1.392940640449524, + "learning_rate": 4.917858989722036e-05, + "loss": 1.3484, + "step": 5470 + }, + { + "epoch": 1.64, + "grad_norm": 0.8935360312461853, + "learning_rate": 4.917709557044553e-05, + "loss": 1.35, + "step": 5475 + }, + { + "epoch": 1.64, + "grad_norm": 0.7475391626358032, + "learning_rate": 4.9175599908394854e-05, + "loss": 1.331, + "step": 5480 + }, + { + "epoch": 1.64, + "grad_norm": 1.0226927995681763, + "learning_rate": 4.917410291115092e-05, + "loss": 1.4033, + "step": 5485 + }, + { + "epoch": 1.64, + "grad_norm": 1.5844924449920654, + "learning_rate": 4.917260457879641e-05, + "loss": 1.3624, + "step": 5490 + }, + { + "epoch": 1.64, + "grad_norm": 2.3369362354278564, + "learning_rate": 4.917110491141407e-05, + "loss": 1.4102, + "step": 5495 + }, + { + "epoch": 1.65, + "grad_norm": 1.9585330486297607, + "learning_rate": 4.9169603909086736e-05, + "loss": 1.3082, + "step": 5500 + }, + { + "epoch": 1.65, + "grad_norm": 0.9265090227127075, + "learning_rate": 4.91681015718973e-05, + "loss": 1.3516, + "step": 5505 + }, + { + "epoch": 1.65, + "grad_norm": 0.8970497250556946, + "learning_rate": 4.9166597899928735e-05, + "loss": 1.2882, + "step": 5510 + }, + { + "epoch": 1.65, + "grad_norm": 1.3045713901519775, + "learning_rate": 4.9165092893264086e-05, + "loss": 1.4484, + "step": 5515 + }, + { + "epoch": 1.65, + "grad_norm": 1.7835524082183838, + "learning_rate": 4.9163586551986475e-05, + "loss": 1.4134, + "step": 5520 + }, + { + "epoch": 1.65, + "grad_norm": 0.7851400971412659, + "learning_rate": 4.9162078876179095e-05, + "loss": 1.1785, + "step": 5525 + }, + { + "epoch": 1.65, + "grad_norm": 0.5514227747917175, + "learning_rate": 4.916056986592522e-05, + "loss": 1.2639, + "step": 5530 + }, + { + "epoch": 1.66, + "grad_norm": 1.4473460912704468, + "learning_rate": 4.915905952130818e-05, + "loss": 1.3462, + "step": 5535 + }, + { + "epoch": 1.66, + "grad_norm": 1.3507695198059082, + "learning_rate": 4.9157547842411387e-05, + "loss": 1.2437, + "step": 5540 + }, + { + "epoch": 1.66, + "grad_norm": 1.2776063680648804, + "learning_rate": 4.9156034829318345e-05, + "loss": 1.3707, + "step": 5545 + }, + { + "epoch": 1.66, + "grad_norm": 2.255897045135498, + "learning_rate": 4.915452048211261e-05, + "loss": 1.2916, + "step": 5550 + }, + { + "epoch": 1.66, + "grad_norm": 1.0228066444396973, + "learning_rate": 4.915300480087781e-05, + "loss": 1.4058, + "step": 5555 + }, + { + "epoch": 1.66, + "grad_norm": 1.0739355087280273, + "learning_rate": 4.915148778569767e-05, + "loss": 1.4693, + "step": 5560 + }, + { + "epoch": 1.66, + "grad_norm": 1.723423719406128, + "learning_rate": 4.914996943665596e-05, + "loss": 1.0389, + "step": 5565 + }, + { + "epoch": 1.67, + "grad_norm": 1.2610810995101929, + "learning_rate": 4.9148449753836534e-05, + "loss": 1.3547, + "step": 5570 + }, + { + "epoch": 1.67, + "grad_norm": 0.9140579700469971, + "learning_rate": 4.9146928737323334e-05, + "loss": 1.3006, + "step": 5575 + }, + { + "epoch": 1.67, + "grad_norm": 0.7020224332809448, + "learning_rate": 4.914540638720035e-05, + "loss": 1.3814, + "step": 5580 + }, + { + "epoch": 1.67, + "grad_norm": 1.6752337217330933, + "learning_rate": 4.9143882703551685e-05, + "loss": 1.3788, + "step": 5585 + }, + { + "epoch": 1.67, + "grad_norm": 1.1528271436691284, + "learning_rate": 4.914235768646147e-05, + "loss": 1.4718, + "step": 5590 + }, + { + "epoch": 1.67, + "grad_norm": 0.9976640343666077, + "learning_rate": 4.9140831336013925e-05, + "loss": 1.4391, + "step": 5595 + }, + { + "epoch": 1.68, + "grad_norm": 0.7416092157363892, + "learning_rate": 4.9139303652293365e-05, + "loss": 1.2846, + "step": 5600 + }, + { + "epoch": 1.68, + "grad_norm": 0.8383505940437317, + "learning_rate": 4.913777463538416e-05, + "loss": 1.3933, + "step": 5605 + }, + { + "epoch": 1.68, + "grad_norm": 0.7380107045173645, + "learning_rate": 4.9136244285370746e-05, + "loss": 1.4613, + "step": 5610 + }, + { + "epoch": 1.68, + "grad_norm": 0.8241648077964783, + "learning_rate": 4.913471260233765e-05, + "loss": 1.3475, + "step": 5615 + }, + { + "epoch": 1.68, + "grad_norm": 1.6132783889770508, + "learning_rate": 4.913317958636946e-05, + "loss": 1.4238, + "step": 5620 + }, + { + "epoch": 1.68, + "grad_norm": 0.8694246411323547, + "learning_rate": 4.913164523755085e-05, + "loss": 1.6346, + "step": 5625 + }, + { + "epoch": 1.68, + "grad_norm": 1.2814140319824219, + "learning_rate": 4.9130109555966565e-05, + "loss": 1.3782, + "step": 5630 + }, + { + "epoch": 1.69, + "grad_norm": 1.7262847423553467, + "learning_rate": 4.91285725417014e-05, + "loss": 1.3747, + "step": 5635 + }, + { + "epoch": 1.69, + "grad_norm": 0.8753448128700256, + "learning_rate": 4.912703419484026e-05, + "loss": 1.3918, + "step": 5640 + }, + { + "epoch": 1.69, + "grad_norm": 1.002820611000061, + "learning_rate": 4.912549451546809e-05, + "loss": 1.2981, + "step": 5645 + }, + { + "epoch": 1.69, + "grad_norm": 1.3369395732879639, + "learning_rate": 4.912395350366994e-05, + "loss": 1.3446, + "step": 5650 + }, + { + "epoch": 1.69, + "grad_norm": 1.4645837545394897, + "learning_rate": 4.9122411159530916e-05, + "loss": 1.5831, + "step": 5655 + }, + { + "epoch": 1.69, + "grad_norm": 1.1211307048797607, + "learning_rate": 4.91208674831362e-05, + "loss": 1.3518, + "step": 5660 + }, + { + "epoch": 1.69, + "grad_norm": 1.3223671913146973, + "learning_rate": 4.911932247457104e-05, + "loss": 1.4612, + "step": 5665 + }, + { + "epoch": 1.7, + "grad_norm": 1.5868595838546753, + "learning_rate": 4.911777613392077e-05, + "loss": 1.2637, + "step": 5670 + }, + { + "epoch": 1.7, + "grad_norm": 1.2033578157424927, + "learning_rate": 4.91162284612708e-05, + "loss": 1.1865, + "step": 5675 + }, + { + "epoch": 1.7, + "grad_norm": 1.1434866189956665, + "learning_rate": 4.9114679456706594e-05, + "loss": 1.5031, + "step": 5680 + }, + { + "epoch": 1.7, + "grad_norm": 1.0561349391937256, + "learning_rate": 4.911312912031371e-05, + "loss": 1.3647, + "step": 5685 + }, + { + "epoch": 1.7, + "grad_norm": 2.2689900398254395, + "learning_rate": 4.911157745217776e-05, + "loss": 1.4179, + "step": 5690 + }, + { + "epoch": 1.7, + "grad_norm": 0.9906610250473022, + "learning_rate": 4.911002445238446e-05, + "loss": 1.4319, + "step": 5695 + }, + { + "epoch": 1.71, + "grad_norm": 2.0266451835632324, + "learning_rate": 4.9108470121019565e-05, + "loss": 1.2428, + "step": 5700 + }, + { + "epoch": 1.71, + "grad_norm": 1.1348594427108765, + "learning_rate": 4.910691445816893e-05, + "loss": 1.3954, + "step": 5705 + }, + { + "epoch": 1.71, + "grad_norm": 1.3033210039138794, + "learning_rate": 4.910535746391846e-05, + "loss": 1.3513, + "step": 5710 + }, + { + "epoch": 1.71, + "grad_norm": 1.4596507549285889, + "learning_rate": 4.910379913835416e-05, + "loss": 1.4849, + "step": 5715 + }, + { + "epoch": 1.71, + "grad_norm": 1.2325527667999268, + "learning_rate": 4.910223948156208e-05, + "loss": 1.231, + "step": 5720 + }, + { + "epoch": 1.71, + "grad_norm": 1.0921927690505981, + "learning_rate": 4.9100678493628374e-05, + "loss": 1.4783, + "step": 5725 + }, + { + "epoch": 1.71, + "grad_norm": 0.5836876034736633, + "learning_rate": 4.909911617463925e-05, + "loss": 1.3841, + "step": 5730 + }, + { + "epoch": 1.72, + "grad_norm": 0.6414868831634521, + "learning_rate": 4.909755252468098e-05, + "loss": 1.414, + "step": 5735 + }, + { + "epoch": 1.72, + "grad_norm": 1.1332942247390747, + "learning_rate": 4.909598754383994e-05, + "loss": 1.364, + "step": 5740 + }, + { + "epoch": 1.72, + "grad_norm": 1.3707212209701538, + "learning_rate": 4.909442123220255e-05, + "loss": 1.3071, + "step": 5745 + }, + { + "epoch": 1.72, + "grad_norm": 0.9404252171516418, + "learning_rate": 4.909285358985532e-05, + "loss": 1.332, + "step": 5750 + }, + { + "epoch": 1.72, + "grad_norm": 0.9168292880058289, + "learning_rate": 4.9091284616884824e-05, + "loss": 1.313, + "step": 5755 + }, + { + "epoch": 1.72, + "grad_norm": 0.628679633140564, + "learning_rate": 4.908971431337772e-05, + "loss": 1.3079, + "step": 5760 + }, + { + "epoch": 1.72, + "grad_norm": 2.0647614002227783, + "learning_rate": 4.908814267942074e-05, + "loss": 1.2487, + "step": 5765 + }, + { + "epoch": 1.73, + "grad_norm": 1.053024411201477, + "learning_rate": 4.908656971510068e-05, + "loss": 1.3222, + "step": 5770 + }, + { + "epoch": 1.73, + "grad_norm": 0.9378986358642578, + "learning_rate": 4.908499542050441e-05, + "loss": 1.3492, + "step": 5775 + }, + { + "epoch": 1.73, + "grad_norm": 0.9233798980712891, + "learning_rate": 4.9083419795718875e-05, + "loss": 1.4543, + "step": 5780 + }, + { + "epoch": 1.73, + "grad_norm": 1.063833236694336, + "learning_rate": 4.9081842840831104e-05, + "loss": 1.3152, + "step": 5785 + }, + { + "epoch": 1.73, + "grad_norm": 1.2852481603622437, + "learning_rate": 4.908026455592818e-05, + "loss": 1.2847, + "step": 5790 + }, + { + "epoch": 1.73, + "grad_norm": 1.1695432662963867, + "learning_rate": 4.9078684941097275e-05, + "loss": 1.2973, + "step": 5795 + }, + { + "epoch": 1.74, + "grad_norm": 0.908960223197937, + "learning_rate": 4.9077103996425625e-05, + "loss": 1.1404, + "step": 5800 + }, + { + "epoch": 1.74, + "grad_norm": 1.2946605682373047, + "learning_rate": 4.9075521722000556e-05, + "loss": 1.3848, + "step": 5805 + }, + { + "epoch": 1.74, + "grad_norm": 0.6911953091621399, + "learning_rate": 4.907393811790945e-05, + "loss": 1.4204, + "step": 5810 + }, + { + "epoch": 1.74, + "grad_norm": 0.9933083653450012, + "learning_rate": 4.907235318423975e-05, + "loss": 1.4721, + "step": 5815 + }, + { + "epoch": 1.74, + "grad_norm": 1.4515044689178467, + "learning_rate": 4.9070766921079014e-05, + "loss": 1.3615, + "step": 5820 + }, + { + "epoch": 1.74, + "grad_norm": 1.067295789718628, + "learning_rate": 4.906917932851484e-05, + "loss": 1.3254, + "step": 5825 + }, + { + "epoch": 1.74, + "grad_norm": 1.1964728832244873, + "learning_rate": 4.9067590406634914e-05, + "loss": 1.2453, + "step": 5830 + }, + { + "epoch": 1.75, + "grad_norm": 0.9193742275238037, + "learning_rate": 4.906600015552698e-05, + "loss": 1.5016, + "step": 5835 + }, + { + "epoch": 1.75, + "grad_norm": 1.4056816101074219, + "learning_rate": 4.906440857527888e-05, + "loss": 1.2901, + "step": 5840 + }, + { + "epoch": 1.75, + "grad_norm": 1.7364879846572876, + "learning_rate": 4.9062815665978504e-05, + "loss": 1.3488, + "step": 5845 + }, + { + "epoch": 1.75, + "grad_norm": 1.1966763734817505, + "learning_rate": 4.9061221427713835e-05, + "loss": 1.3019, + "step": 5850 + }, + { + "epoch": 1.75, + "grad_norm": 1.1406923532485962, + "learning_rate": 4.905962586057291e-05, + "loss": 1.27, + "step": 5855 + }, + { + "epoch": 1.75, + "grad_norm": 1.5411876440048218, + "learning_rate": 4.9058028964643865e-05, + "loss": 1.2951, + "step": 5860 + }, + { + "epoch": 1.75, + "grad_norm": 0.9900470972061157, + "learning_rate": 4.9056430740014883e-05, + "loss": 1.3227, + "step": 5865 + }, + { + "epoch": 1.76, + "grad_norm": 1.95987868309021, + "learning_rate": 4.905483118677423e-05, + "loss": 1.4196, + "step": 5870 + }, + { + "epoch": 1.76, + "grad_norm": 1.0949276685714722, + "learning_rate": 4.9053230305010264e-05, + "loss": 1.2241, + "step": 5875 + }, + { + "epoch": 1.76, + "grad_norm": 0.6950658559799194, + "learning_rate": 4.9051628094811386e-05, + "loss": 1.2839, + "step": 5880 + }, + { + "epoch": 1.76, + "grad_norm": 1.1980772018432617, + "learning_rate": 4.905002455626609e-05, + "loss": 1.458, + "step": 5885 + }, + { + "epoch": 1.76, + "grad_norm": 1.1213877201080322, + "learning_rate": 4.904841968946293e-05, + "loss": 1.4148, + "step": 5890 + }, + { + "epoch": 1.76, + "grad_norm": 1.1307765245437622, + "learning_rate": 4.904681349449056e-05, + "loss": 1.4134, + "step": 5895 + }, + { + "epoch": 1.77, + "grad_norm": 1.358079433441162, + "learning_rate": 4.904520597143767e-05, + "loss": 1.3841, + "step": 5900 + }, + { + "epoch": 1.77, + "grad_norm": 1.1629695892333984, + "learning_rate": 4.904359712039304e-05, + "loss": 1.39, + "step": 5905 + }, + { + "epoch": 1.77, + "grad_norm": 1.8049236536026, + "learning_rate": 4.904198694144554e-05, + "loss": 1.3112, + "step": 5910 + }, + { + "epoch": 1.77, + "grad_norm": 1.5202418565750122, + "learning_rate": 4.904037543468409e-05, + "loss": 1.2515, + "step": 5915 + }, + { + "epoch": 1.77, + "grad_norm": 1.6032389402389526, + "learning_rate": 4.90387626001977e-05, + "loss": 1.4008, + "step": 5920 + }, + { + "epoch": 1.77, + "grad_norm": 1.2656590938568115, + "learning_rate": 4.903714843807543e-05, + "loss": 1.4304, + "step": 5925 + }, + { + "epoch": 1.77, + "grad_norm": 1.622446894645691, + "learning_rate": 4.9035532948406436e-05, + "loss": 1.4896, + "step": 5930 + }, + { + "epoch": 1.78, + "grad_norm": 1.3047736883163452, + "learning_rate": 4.903391613127995e-05, + "loss": 1.3628, + "step": 5935 + }, + { + "epoch": 1.78, + "grad_norm": 0.8946179747581482, + "learning_rate": 4.903229798678525e-05, + "loss": 1.3129, + "step": 5940 + }, + { + "epoch": 1.78, + "grad_norm": 0.7396437525749207, + "learning_rate": 4.903067851501172e-05, + "loss": 1.4662, + "step": 5945 + }, + { + "epoch": 1.78, + "grad_norm": 0.7600730061531067, + "learning_rate": 4.9029057716048786e-05, + "loss": 1.3341, + "step": 5950 + }, + { + "epoch": 1.78, + "grad_norm": 1.3639189004898071, + "learning_rate": 4.902743558998597e-05, + "loss": 1.4028, + "step": 5955 + }, + { + "epoch": 1.78, + "grad_norm": 1.2151553630828857, + "learning_rate": 4.9025812136912874e-05, + "loss": 1.2892, + "step": 5960 + }, + { + "epoch": 1.78, + "grad_norm": 1.493870496749878, + "learning_rate": 4.902418735691914e-05, + "loss": 1.3913, + "step": 5965 + }, + { + "epoch": 1.79, + "grad_norm": 2.329575538635254, + "learning_rate": 4.90225612500945e-05, + "loss": 1.3196, + "step": 5970 + }, + { + "epoch": 1.79, + "grad_norm": 1.2628295421600342, + "learning_rate": 4.9020933816528784e-05, + "loss": 1.3211, + "step": 5975 + }, + { + "epoch": 1.79, + "grad_norm": 1.0996872186660767, + "learning_rate": 4.901930505631186e-05, + "loss": 1.402, + "step": 5980 + }, + { + "epoch": 1.79, + "grad_norm": 0.5428398251533508, + "learning_rate": 4.901767496953368e-05, + "loss": 1.5071, + "step": 5985 + }, + { + "epoch": 1.79, + "grad_norm": 0.6966472864151001, + "learning_rate": 4.9016043556284284e-05, + "loss": 1.4231, + "step": 5990 + }, + { + "epoch": 1.79, + "grad_norm": 1.003456473350525, + "learning_rate": 4.901441081665376e-05, + "loss": 1.2952, + "step": 5995 + }, + { + "epoch": 1.8, + "grad_norm": 1.4980223178863525, + "learning_rate": 4.9012776750732295e-05, + "loss": 1.2411, + "step": 6000 + }, + { + "epoch": 1.8, + "grad_norm": 1.0407806634902954, + "learning_rate": 4.901114135861012e-05, + "loss": 1.3536, + "step": 6005 + }, + { + "epoch": 1.8, + "grad_norm": 0.7962636947631836, + "learning_rate": 4.9009504640377565e-05, + "loss": 1.4762, + "step": 6010 + }, + { + "epoch": 1.8, + "grad_norm": 1.4784387350082397, + "learning_rate": 4.900786659612504e-05, + "loss": 1.3125, + "step": 6015 + }, + { + "epoch": 1.8, + "grad_norm": 0.9292671084403992, + "learning_rate": 4.900622722594299e-05, + "loss": 1.4477, + "step": 6020 + }, + { + "epoch": 1.8, + "grad_norm": 1.320400357246399, + "learning_rate": 4.9004586529921955e-05, + "loss": 1.3478, + "step": 6025 + }, + { + "epoch": 1.8, + "grad_norm": 0.593423068523407, + "learning_rate": 4.9002944508152567e-05, + "loss": 1.2153, + "step": 6030 + }, + { + "epoch": 1.81, + "grad_norm": 1.3448033332824707, + "learning_rate": 4.90013011607255e-05, + "loss": 1.4262, + "step": 6035 + }, + { + "epoch": 1.81, + "grad_norm": 1.65395987033844, + "learning_rate": 4.8999656487731516e-05, + "loss": 1.4308, + "step": 6040 + }, + { + "epoch": 1.81, + "grad_norm": 1.5153121948242188, + "learning_rate": 4.899801048926146e-05, + "loss": 1.3404, + "step": 6045 + }, + { + "epoch": 1.81, + "grad_norm": 0.942297637462616, + "learning_rate": 4.899636316540622e-05, + "loss": 1.4883, + "step": 6050 + }, + { + "epoch": 1.81, + "grad_norm": 0.9569664597511292, + "learning_rate": 4.899471451625678e-05, + "loss": 1.3667, + "step": 6055 + }, + { + "epoch": 1.81, + "grad_norm": 1.1966135501861572, + "learning_rate": 4.899306454190421e-05, + "loss": 1.1763, + "step": 6060 + }, + { + "epoch": 1.81, + "grad_norm": 1.0947861671447754, + "learning_rate": 4.899141324243962e-05, + "loss": 1.2947, + "step": 6065 + }, + { + "epoch": 1.82, + "grad_norm": 0.9983508586883545, + "learning_rate": 4.8989760617954215e-05, + "loss": 1.7462, + "step": 6070 + }, + { + "epoch": 1.82, + "grad_norm": 0.8864436745643616, + "learning_rate": 4.898810666853927e-05, + "loss": 1.526, + "step": 6075 + }, + { + "epoch": 1.82, + "grad_norm": 1.3127366304397583, + "learning_rate": 4.898645139428613e-05, + "loss": 1.3631, + "step": 6080 + }, + { + "epoch": 1.82, + "grad_norm": 0.9836210608482361, + "learning_rate": 4.8984794795286196e-05, + "loss": 1.2838, + "step": 6085 + }, + { + "epoch": 1.82, + "grad_norm": 1.2298458814620972, + "learning_rate": 4.8983136871630995e-05, + "loss": 1.3527, + "step": 6090 + }, + { + "epoch": 1.82, + "grad_norm": 1.1192662715911865, + "learning_rate": 4.8981477623412064e-05, + "loss": 1.4846, + "step": 6095 + }, + { + "epoch": 1.83, + "grad_norm": 0.9733280539512634, + "learning_rate": 4.897981705072105e-05, + "loss": 1.4492, + "step": 6100 + }, + { + "epoch": 1.83, + "grad_norm": 0.5876336097717285, + "learning_rate": 4.897815515364967e-05, + "loss": 1.3106, + "step": 6105 + }, + { + "epoch": 1.83, + "grad_norm": 1.3915202617645264, + "learning_rate": 4.89764919322897e-05, + "loss": 1.2493, + "step": 6110 + }, + { + "epoch": 1.83, + "grad_norm": 1.1110719442367554, + "learning_rate": 4.897482738673301e-05, + "loss": 1.5083, + "step": 6115 + }, + { + "epoch": 1.83, + "grad_norm": 0.9961562752723694, + "learning_rate": 4.897316151707152e-05, + "loss": 1.5106, + "step": 6120 + }, + { + "epoch": 1.83, + "grad_norm": 0.7797302603721619, + "learning_rate": 4.897149432339724e-05, + "loss": 1.3252, + "step": 6125 + }, + { + "epoch": 1.83, + "grad_norm": 1.6558862924575806, + "learning_rate": 4.896982580580224e-05, + "loss": 1.5234, + "step": 6130 + }, + { + "epoch": 1.84, + "grad_norm": 0.8228052854537964, + "learning_rate": 4.896815596437868e-05, + "loss": 1.3884, + "step": 6135 + }, + { + "epoch": 1.84, + "grad_norm": 0.7382698059082031, + "learning_rate": 4.896648479921878e-05, + "loss": 1.3653, + "step": 6140 + }, + { + "epoch": 1.84, + "grad_norm": 1.1932933330535889, + "learning_rate": 4.896481231041483e-05, + "loss": 1.6213, + "step": 6145 + }, + { + "epoch": 1.84, + "grad_norm": 1.208572506904602, + "learning_rate": 4.896313849805921e-05, + "loss": 1.4082, + "step": 6150 + }, + { + "epoch": 1.84, + "grad_norm": 1.3084590435028076, + "learning_rate": 4.896146336224436e-05, + "loss": 1.5231, + "step": 6155 + }, + { + "epoch": 1.84, + "grad_norm": 1.2458943128585815, + "learning_rate": 4.89597869030628e-05, + "loss": 1.2657, + "step": 6160 + }, + { + "epoch": 1.84, + "grad_norm": 1.221347689628601, + "learning_rate": 4.8958109120607117e-05, + "loss": 1.4708, + "step": 6165 + }, + { + "epoch": 1.85, + "grad_norm": 0.9371591210365295, + "learning_rate": 4.895643001496996e-05, + "loss": 1.3741, + "step": 6170 + }, + { + "epoch": 1.85, + "grad_norm": 1.7532001733779907, + "learning_rate": 4.8954749586244074e-05, + "loss": 1.3621, + "step": 6175 + }, + { + "epoch": 1.85, + "grad_norm": 1.8896582126617432, + "learning_rate": 4.895306783452228e-05, + "loss": 1.398, + "step": 6180 + }, + { + "epoch": 1.85, + "grad_norm": 1.096148133277893, + "learning_rate": 4.895138475989743e-05, + "loss": 1.2032, + "step": 6185 + }, + { + "epoch": 1.85, + "grad_norm": 0.7891479730606079, + "learning_rate": 4.894970036246251e-05, + "loss": 1.3164, + "step": 6190 + }, + { + "epoch": 1.85, + "grad_norm": 0.9778878092765808, + "learning_rate": 4.894801464231053e-05, + "loss": 1.4422, + "step": 6195 + }, + { + "epoch": 1.85, + "grad_norm": 1.088839054107666, + "learning_rate": 4.894632759953459e-05, + "loss": 1.3869, + "step": 6200 + }, + { + "epoch": 1.86, + "grad_norm": 1.0717918872833252, + "learning_rate": 4.894463923422787e-05, + "loss": 1.3477, + "step": 6205 + }, + { + "epoch": 1.86, + "grad_norm": 1.6605066061019897, + "learning_rate": 4.894294954648362e-05, + "loss": 1.2396, + "step": 6210 + }, + { + "epoch": 1.86, + "grad_norm": 1.642807126045227, + "learning_rate": 4.894125853639514e-05, + "loss": 1.3233, + "step": 6215 + }, + { + "epoch": 1.86, + "grad_norm": 2.835897207260132, + "learning_rate": 4.893956620405585e-05, + "loss": 1.4308, + "step": 6220 + }, + { + "epoch": 1.86, + "grad_norm": 0.6643049120903015, + "learning_rate": 4.893787254955919e-05, + "loss": 1.5178, + "step": 6225 + }, + { + "epoch": 1.86, + "grad_norm": 0.7968789935112, + "learning_rate": 4.893617757299872e-05, + "loss": 1.2129, + "step": 6230 + }, + { + "epoch": 1.87, + "grad_norm": 1.484046459197998, + "learning_rate": 4.893448127446805e-05, + "loss": 1.2537, + "step": 6235 + }, + { + "epoch": 1.87, + "grad_norm": 0.99761962890625, + "learning_rate": 4.8932783654060844e-05, + "loss": 1.3536, + "step": 6240 + }, + { + "epoch": 1.87, + "grad_norm": 0.965552806854248, + "learning_rate": 4.8931084711870876e-05, + "loss": 1.3365, + "step": 6245 + }, + { + "epoch": 1.87, + "grad_norm": 0.8811823129653931, + "learning_rate": 4.8929384447991974e-05, + "loss": 1.4075, + "step": 6250 + }, + { + "epoch": 1.87, + "grad_norm": 1.0002772808074951, + "learning_rate": 4.8927682862518044e-05, + "loss": 1.4315, + "step": 6255 + }, + { + "epoch": 1.87, + "grad_norm": 0.4986729919910431, + "learning_rate": 4.8925979955543067e-05, + "loss": 1.1747, + "step": 6260 + }, + { + "epoch": 1.87, + "grad_norm": 1.492148995399475, + "learning_rate": 4.892427572716108e-05, + "loss": 1.3318, + "step": 6265 + }, + { + "epoch": 1.88, + "grad_norm": 1.448102593421936, + "learning_rate": 4.892257017746621e-05, + "loss": 1.2832, + "step": 6270 + }, + { + "epoch": 1.88, + "grad_norm": 1.4307514429092407, + "learning_rate": 4.892086330655266e-05, + "loss": 1.5149, + "step": 6275 + }, + { + "epoch": 1.88, + "grad_norm": 0.8795152306556702, + "learning_rate": 4.8919155114514695e-05, + "loss": 1.4482, + "step": 6280 + }, + { + "epoch": 1.88, + "grad_norm": 0.9869243502616882, + "learning_rate": 4.891744560144666e-05, + "loss": 1.3061, + "step": 6285 + }, + { + "epoch": 1.88, + "grad_norm": 0.6511804461479187, + "learning_rate": 4.891573476744295e-05, + "loss": 1.4904, + "step": 6290 + }, + { + "epoch": 1.88, + "grad_norm": 0.5018410086631775, + "learning_rate": 4.891402261259807e-05, + "loss": 1.374, + "step": 6295 + }, + { + "epoch": 1.88, + "grad_norm": 0.5579232573509216, + "learning_rate": 4.891230913700659e-05, + "loss": 1.2526, + "step": 6300 + }, + { + "epoch": 1.89, + "grad_norm": 1.1983088254928589, + "learning_rate": 4.8910594340763126e-05, + "loss": 1.5322, + "step": 6305 + }, + { + "epoch": 1.89, + "grad_norm": 1.0148495435714722, + "learning_rate": 4.89088782239624e-05, + "loss": 1.4543, + "step": 6310 + }, + { + "epoch": 1.89, + "grad_norm": 0.739857017993927, + "learning_rate": 4.890716078669917e-05, + "loss": 1.373, + "step": 6315 + }, + { + "epoch": 1.89, + "grad_norm": 1.4641748666763306, + "learning_rate": 4.8905442029068296e-05, + "loss": 1.3945, + "step": 6320 + }, + { + "epoch": 1.89, + "grad_norm": 1.2002661228179932, + "learning_rate": 4.890372195116471e-05, + "loss": 1.3935, + "step": 6325 + }, + { + "epoch": 1.89, + "grad_norm": 0.9854932427406311, + "learning_rate": 4.890200055308342e-05, + "loss": 1.4561, + "step": 6330 + }, + { + "epoch": 1.9, + "grad_norm": 0.8449004888534546, + "learning_rate": 4.8900277834919475e-05, + "loss": 1.3978, + "step": 6335 + }, + { + "epoch": 1.9, + "grad_norm": 1.8262367248535156, + "learning_rate": 4.889855379676802e-05, + "loss": 1.3593, + "step": 6340 + }, + { + "epoch": 1.9, + "grad_norm": 1.1936458349227905, + "learning_rate": 4.889682843872429e-05, + "loss": 1.3747, + "step": 6345 + }, + { + "epoch": 1.9, + "grad_norm": 1.5638678073883057, + "learning_rate": 4.8895101760883566e-05, + "loss": 1.2618, + "step": 6350 + }, + { + "epoch": 1.9, + "grad_norm": 1.084822654724121, + "learning_rate": 4.88933737633412e-05, + "loss": 1.4142, + "step": 6355 + }, + { + "epoch": 1.9, + "grad_norm": 3.615086793899536, + "learning_rate": 4.889164444619264e-05, + "loss": 1.3423, + "step": 6360 + }, + { + "epoch": 1.9, + "grad_norm": 1.1368343830108643, + "learning_rate": 4.88899138095334e-05, + "loss": 1.325, + "step": 6365 + }, + { + "epoch": 1.91, + "grad_norm": 0.9688156247138977, + "learning_rate": 4.8888181853459046e-05, + "loss": 1.3005, + "step": 6370 + }, + { + "epoch": 1.91, + "grad_norm": 1.1333907842636108, + "learning_rate": 4.8886448578065236e-05, + "loss": 1.5213, + "step": 6375 + }, + { + "epoch": 1.91, + "grad_norm": 1.5067886114120483, + "learning_rate": 4.8884713983447704e-05, + "loss": 1.4821, + "step": 6380 + }, + { + "epoch": 1.91, + "grad_norm": 1.2759270668029785, + "learning_rate": 4.8882978069702246e-05, + "loss": 1.2604, + "step": 6385 + }, + { + "epoch": 1.91, + "grad_norm": 1.2852680683135986, + "learning_rate": 4.888124083692473e-05, + "loss": 1.4365, + "step": 6390 + }, + { + "epoch": 1.91, + "grad_norm": 1.2797343730926514, + "learning_rate": 4.887950228521111e-05, + "loss": 1.3997, + "step": 6395 + }, + { + "epoch": 1.91, + "grad_norm": 1.6342377662658691, + "learning_rate": 4.8877762414657394e-05, + "loss": 1.3325, + "step": 6400 + }, + { + "epoch": 1.92, + "grad_norm": 1.4419025182724, + "learning_rate": 4.8876021225359684e-05, + "loss": 1.3404, + "step": 6405 + }, + { + "epoch": 1.92, + "grad_norm": 0.9858295321464539, + "learning_rate": 4.887427871741414e-05, + "loss": 1.3812, + "step": 6410 + }, + { + "epoch": 1.92, + "grad_norm": 1.2529702186584473, + "learning_rate": 4.8872534890916996e-05, + "loss": 1.3797, + "step": 6415 + }, + { + "epoch": 1.92, + "grad_norm": 1.7771295309066772, + "learning_rate": 4.8870789745964566e-05, + "loss": 1.3254, + "step": 6420 + }, + { + "epoch": 1.92, + "grad_norm": 0.7619970440864563, + "learning_rate": 4.8869043282653234e-05, + "loss": 1.3069, + "step": 6425 + }, + { + "epoch": 1.92, + "grad_norm": 1.5197542905807495, + "learning_rate": 4.886729550107945e-05, + "loss": 1.4575, + "step": 6430 + }, + { + "epoch": 1.93, + "grad_norm": 0.9382418990135193, + "learning_rate": 4.8865546401339736e-05, + "loss": 1.4566, + "step": 6435 + }, + { + "epoch": 1.93, + "grad_norm": 0.7788457274436951, + "learning_rate": 4.886379598353071e-05, + "loss": 1.242, + "step": 6440 + }, + { + "epoch": 1.93, + "grad_norm": 1.4081120491027832, + "learning_rate": 4.8862044247749034e-05, + "loss": 1.4399, + "step": 6445 + }, + { + "epoch": 1.93, + "grad_norm": 1.1004774570465088, + "learning_rate": 4.886029119409146e-05, + "loss": 1.4171, + "step": 6450 + }, + { + "epoch": 1.93, + "grad_norm": 1.2400892972946167, + "learning_rate": 4.88585368226548e-05, + "loss": 1.3873, + "step": 6455 + }, + { + "epoch": 1.93, + "grad_norm": 0.6717667579650879, + "learning_rate": 4.8856781133535955e-05, + "loss": 1.1926, + "step": 6460 + }, + { + "epoch": 1.93, + "grad_norm": 0.7015379667282104, + "learning_rate": 4.8855024126831886e-05, + "loss": 1.2202, + "step": 6465 + }, + { + "epoch": 1.94, + "grad_norm": 1.3034001588821411, + "learning_rate": 4.8853265802639625e-05, + "loss": 1.2759, + "step": 6470 + }, + { + "epoch": 1.94, + "grad_norm": 1.2087301015853882, + "learning_rate": 4.8851506161056296e-05, + "loss": 1.1914, + "step": 6475 + }, + { + "epoch": 1.94, + "grad_norm": 0.8940001726150513, + "learning_rate": 4.8849745202179064e-05, + "loss": 1.4391, + "step": 6480 + }, + { + "epoch": 1.94, + "grad_norm": 1.4564568996429443, + "learning_rate": 4.8847982926105195e-05, + "loss": 1.4835, + "step": 6485 + }, + { + "epoch": 1.94, + "grad_norm": 1.3092164993286133, + "learning_rate": 4.884621933293203e-05, + "loss": 1.3326, + "step": 6490 + }, + { + "epoch": 1.94, + "grad_norm": 1.8018982410430908, + "learning_rate": 4.8844454422756946e-05, + "loss": 1.4825, + "step": 6495 + }, + { + "epoch": 1.94, + "grad_norm": 0.8499402403831482, + "learning_rate": 4.884268819567743e-05, + "loss": 1.38, + "step": 6500 + }, + { + "epoch": 1.95, + "grad_norm": 1.2163305282592773, + "learning_rate": 4.8840920651791036e-05, + "loss": 1.2982, + "step": 6505 + }, + { + "epoch": 1.95, + "grad_norm": 1.5904464721679688, + "learning_rate": 4.883915179119537e-05, + "loss": 1.3852, + "step": 6510 + }, + { + "epoch": 1.95, + "grad_norm": 1.2518161535263062, + "learning_rate": 4.883738161398813e-05, + "loss": 1.4104, + "step": 6515 + }, + { + "epoch": 1.95, + "grad_norm": 1.155625581741333, + "learning_rate": 4.883561012026708e-05, + "loss": 1.3967, + "step": 6520 + }, + { + "epoch": 1.95, + "grad_norm": 0.6556622982025146, + "learning_rate": 4.883383731013007e-05, + "loss": 1.3625, + "step": 6525 + }, + { + "epoch": 1.95, + "grad_norm": 0.9793677926063538, + "learning_rate": 4.883206318367499e-05, + "loss": 1.4226, + "step": 6530 + }, + { + "epoch": 1.96, + "grad_norm": 1.0961387157440186, + "learning_rate": 4.883028774099983e-05, + "loss": 1.4849, + "step": 6535 + }, + { + "epoch": 1.96, + "grad_norm": 0.9702796936035156, + "learning_rate": 4.882851098220265e-05, + "loss": 1.3409, + "step": 6540 + }, + { + "epoch": 1.96, + "grad_norm": 1.4797945022583008, + "learning_rate": 4.882673290738158e-05, + "loss": 1.3482, + "step": 6545 + }, + { + "epoch": 1.96, + "grad_norm": 1.1663824319839478, + "learning_rate": 4.8824953516634816e-05, + "loss": 1.2391, + "step": 6550 + }, + { + "epoch": 1.96, + "grad_norm": 0.7633179426193237, + "learning_rate": 4.882317281006064e-05, + "loss": 1.2115, + "step": 6555 + }, + { + "epoch": 1.96, + "grad_norm": 0.6997314095497131, + "learning_rate": 4.8821390787757384e-05, + "loss": 1.3422, + "step": 6560 + }, + { + "epoch": 1.96, + "grad_norm": 1.2793208360671997, + "learning_rate": 4.8819607449823476e-05, + "loss": 1.4855, + "step": 6565 + }, + { + "epoch": 1.97, + "grad_norm": 1.2282752990722656, + "learning_rate": 4.881782279635741e-05, + "loss": 1.5351, + "step": 6570 + }, + { + "epoch": 1.97, + "grad_norm": 1.7947735786437988, + "learning_rate": 4.8816036827457745e-05, + "loss": 1.2795, + "step": 6575 + }, + { + "epoch": 1.97, + "grad_norm": 1.136925458908081, + "learning_rate": 4.8814249543223125e-05, + "loss": 1.3417, + "step": 6580 + }, + { + "epoch": 1.97, + "grad_norm": 0.9631902575492859, + "learning_rate": 4.8812460943752256e-05, + "loss": 1.4113, + "step": 6585 + }, + { + "epoch": 1.97, + "grad_norm": 0.7148537039756775, + "learning_rate": 4.881067102914392e-05, + "loss": 1.3135, + "step": 6590 + }, + { + "epoch": 1.97, + "grad_norm": 2.7874670028686523, + "learning_rate": 4.880887979949698e-05, + "loss": 1.3566, + "step": 6595 + }, + { + "epoch": 1.97, + "grad_norm": 0.9266506433486938, + "learning_rate": 4.8807087254910344e-05, + "loss": 1.3909, + "step": 6600 + }, + { + "epoch": 1.98, + "grad_norm": 1.1458144187927246, + "learning_rate": 4.880529339548303e-05, + "loss": 1.4995, + "step": 6605 + }, + { + "epoch": 1.98, + "grad_norm": 0.6967890858650208, + "learning_rate": 4.8803498221314106e-05, + "loss": 1.5207, + "step": 6610 + }, + { + "epoch": 1.98, + "grad_norm": 1.142874836921692, + "learning_rate": 4.880170173250272e-05, + "loss": 1.335, + "step": 6615 + }, + { + "epoch": 1.98, + "grad_norm": 2.030205726623535, + "learning_rate": 4.879990392914809e-05, + "loss": 1.2267, + "step": 6620 + }, + { + "epoch": 1.98, + "grad_norm": 0.5845741033554077, + "learning_rate": 4.8798104811349496e-05, + "loss": 1.4528, + "step": 6625 + }, + { + "epoch": 1.98, + "grad_norm": 1.0889317989349365, + "learning_rate": 4.879630437920631e-05, + "loss": 1.3085, + "step": 6630 + }, + { + "epoch": 1.99, + "grad_norm": 1.1721889972686768, + "learning_rate": 4.8794502632817983e-05, + "loss": 1.3531, + "step": 6635 + }, + { + "epoch": 1.99, + "grad_norm": 1.2524924278259277, + "learning_rate": 4.8792699572283996e-05, + "loss": 1.482, + "step": 6640 + }, + { + "epoch": 1.99, + "grad_norm": 0.9633756875991821, + "learning_rate": 4.879089519770395e-05, + "loss": 1.3769, + "step": 6645 + }, + { + "epoch": 1.99, + "grad_norm": 1.5347559452056885, + "learning_rate": 4.8789089509177485e-05, + "loss": 1.3181, + "step": 6650 + }, + { + "epoch": 1.99, + "grad_norm": 0.992938220500946, + "learning_rate": 4.878728250680433e-05, + "loss": 1.376, + "step": 6655 + }, + { + "epoch": 1.99, + "grad_norm": 1.4422589540481567, + "learning_rate": 4.87854741906843e-05, + "loss": 1.2264, + "step": 6660 + }, + { + "epoch": 1.99, + "grad_norm": 1.2222764492034912, + "learning_rate": 4.878366456091724e-05, + "loss": 1.3726, + "step": 6665 + }, + { + "epoch": 2.0, + "grad_norm": 0.5847399830818176, + "learning_rate": 4.8781853617603116e-05, + "loss": 1.3855, + "step": 6670 + }, + { + "epoch": 2.0, + "grad_norm": 1.1310397386550903, + "learning_rate": 4.878004136084194e-05, + "loss": 1.4062, + "step": 6675 + }, + { + "epoch": 2.0, + "grad_norm": 2.0542705059051514, + "learning_rate": 4.877822779073379e-05, + "loss": 1.641, + "step": 6680 + }, + { + "epoch": 2.0, + "grad_norm": 1.2997204065322876, + "learning_rate": 4.877641290737884e-05, + "loss": 1.4196, + "step": 6685 + }, + { + "epoch": 2.0, + "grad_norm": 0.9382727146148682, + "learning_rate": 4.8774596710877315e-05, + "loss": 1.3389, + "step": 6690 + }, + { + "epoch": 2.0, + "grad_norm": 0.9488927125930786, + "learning_rate": 4.877277920132953e-05, + "loss": 1.1815, + "step": 6695 + }, + { + "epoch": 2.0, + "grad_norm": 0.7831886410713196, + "learning_rate": 4.877096037883586e-05, + "loss": 1.2372, + "step": 6700 + }, + { + "epoch": 2.01, + "grad_norm": 1.236717939376831, + "learning_rate": 4.876914024349676e-05, + "loss": 1.3009, + "step": 6705 + }, + { + "epoch": 2.01, + "grad_norm": 0.9065530896186829, + "learning_rate": 4.8767318795412746e-05, + "loss": 1.4108, + "step": 6710 + }, + { + "epoch": 2.01, + "grad_norm": 0.7689698934555054, + "learning_rate": 4.876549603468442e-05, + "loss": 1.3159, + "step": 6715 + }, + { + "epoch": 2.01, + "grad_norm": 1.1612657308578491, + "learning_rate": 4.876367196141245e-05, + "loss": 1.4331, + "step": 6720 + }, + { + "epoch": 2.01, + "grad_norm": 0.7282540798187256, + "learning_rate": 4.876184657569758e-05, + "loss": 1.2104, + "step": 6725 + }, + { + "epoch": 2.01, + "grad_norm": 1.3885889053344727, + "learning_rate": 4.876001987764063e-05, + "loss": 1.3829, + "step": 6730 + }, + { + "epoch": 2.02, + "grad_norm": 1.1671245098114014, + "learning_rate": 4.8758191867342465e-05, + "loss": 1.3192, + "step": 6735 + }, + { + "epoch": 2.02, + "grad_norm": 1.369300365447998, + "learning_rate": 4.875636254490406e-05, + "loss": 1.202, + "step": 6740 + }, + { + "epoch": 2.02, + "grad_norm": 0.8616713881492615, + "learning_rate": 4.875453191042646e-05, + "loss": 1.3398, + "step": 6745 + }, + { + "epoch": 2.02, + "grad_norm": 1.1366583108901978, + "learning_rate": 4.875269996401074e-05, + "loss": 1.306, + "step": 6750 + }, + { + "epoch": 2.02, + "grad_norm": 1.2424169778823853, + "learning_rate": 4.8750866705758106e-05, + "loss": 1.2174, + "step": 6755 + }, + { + "epoch": 2.02, + "grad_norm": 1.3406630754470825, + "learning_rate": 4.874903213576977e-05, + "loss": 1.2905, + "step": 6760 + }, + { + "epoch": 2.02, + "grad_norm": 0.9850149750709534, + "learning_rate": 4.874719625414709e-05, + "loss": 1.4, + "step": 6765 + }, + { + "epoch": 2.03, + "grad_norm": 1.6209102869033813, + "learning_rate": 4.874535906099144e-05, + "loss": 1.347, + "step": 6770 + }, + { + "epoch": 2.03, + "grad_norm": 1.3967558145523071, + "learning_rate": 4.874352055640429e-05, + "loss": 1.2893, + "step": 6775 + }, + { + "epoch": 2.03, + "grad_norm": 1.4221364259719849, + "learning_rate": 4.874204880857213e-05, + "loss": 1.3171, + "step": 6780 + }, + { + "epoch": 2.03, + "grad_norm": 1.1001510620117188, + "learning_rate": 4.8740207943664204e-05, + "loss": 1.2465, + "step": 6785 + }, + { + "epoch": 2.03, + "grad_norm": 0.5954471230506897, + "learning_rate": 4.8738365767609275e-05, + "loss": 1.2439, + "step": 6790 + }, + { + "epoch": 2.03, + "grad_norm": 0.8812218308448792, + "learning_rate": 4.873652228050908e-05, + "loss": 1.3691, + "step": 6795 + }, + { + "epoch": 2.03, + "grad_norm": 1.045620083808899, + "learning_rate": 4.873467748246543e-05, + "loss": 1.2654, + "step": 6800 + }, + { + "epoch": 2.04, + "grad_norm": 1.3172338008880615, + "learning_rate": 4.8732831373580216e-05, + "loss": 1.3099, + "step": 6805 + }, + { + "epoch": 2.04, + "grad_norm": 0.8830695748329163, + "learning_rate": 4.873098395395539e-05, + "loss": 1.1458, + "step": 6810 + }, + { + "epoch": 2.04, + "grad_norm": 1.0735907554626465, + "learning_rate": 4.872913522369299e-05, + "loss": 1.2193, + "step": 6815 + }, + { + "epoch": 2.04, + "grad_norm": 1.7402642965316772, + "learning_rate": 4.8727285182895124e-05, + "loss": 1.3219, + "step": 6820 + }, + { + "epoch": 2.04, + "grad_norm": 1.2762502431869507, + "learning_rate": 4.8725433831663944e-05, + "loss": 1.2906, + "step": 6825 + }, + { + "epoch": 2.04, + "grad_norm": 0.894281804561615, + "learning_rate": 4.872358117010173e-05, + "loss": 1.4766, + "step": 6830 + }, + { + "epoch": 2.04, + "grad_norm": 1.1440064907073975, + "learning_rate": 4.872172719831078e-05, + "loss": 1.2786, + "step": 6835 + }, + { + "epoch": 2.05, + "grad_norm": 1.659610629081726, + "learning_rate": 4.8719871916393495e-05, + "loss": 1.4152, + "step": 6840 + }, + { + "epoch": 2.05, + "grad_norm": 1.5597167015075684, + "learning_rate": 4.8718015324452336e-05, + "loss": 1.2945, + "step": 6845 + }, + { + "epoch": 2.05, + "grad_norm": 1.578195571899414, + "learning_rate": 4.8716157422589855e-05, + "loss": 1.4838, + "step": 6850 + }, + { + "epoch": 2.05, + "grad_norm": 0.8329493403434753, + "learning_rate": 4.8714298210908646e-05, + "loss": 1.2861, + "step": 6855 + }, + { + "epoch": 2.05, + "grad_norm": 1.347894310951233, + "learning_rate": 4.8712437689511395e-05, + "loss": 1.3531, + "step": 6860 + }, + { + "epoch": 2.05, + "grad_norm": 2.246103286743164, + "learning_rate": 4.871057585850085e-05, + "loss": 1.3114, + "step": 6865 + }, + { + "epoch": 2.06, + "grad_norm": 0.8596112132072449, + "learning_rate": 4.870871271797986e-05, + "loss": 1.4278, + "step": 6870 + }, + { + "epoch": 2.06, + "grad_norm": 0.986096203327179, + "learning_rate": 4.8706848268051305e-05, + "loss": 1.3325, + "step": 6875 + }, + { + "epoch": 2.06, + "grad_norm": 1.3321117162704468, + "learning_rate": 4.870498250881816e-05, + "loss": 1.3678, + "step": 6880 + }, + { + "epoch": 2.06, + "grad_norm": 1.3195933103561401, + "learning_rate": 4.8703115440383474e-05, + "loss": 1.2902, + "step": 6885 + }, + { + "epoch": 2.06, + "grad_norm": 0.8268415927886963, + "learning_rate": 4.8701247062850355e-05, + "loss": 1.2304, + "step": 6890 + }, + { + "epoch": 2.06, + "grad_norm": 0.8425890207290649, + "learning_rate": 4.8699377376322e-05, + "loss": 1.3488, + "step": 6895 + }, + { + "epoch": 2.06, + "grad_norm": 1.5140923261642456, + "learning_rate": 4.869750638090167e-05, + "loss": 1.3143, + "step": 6900 + }, + { + "epoch": 2.07, + "grad_norm": 1.7727601528167725, + "learning_rate": 4.8695634076692696e-05, + "loss": 1.4197, + "step": 6905 + }, + { + "epoch": 2.07, + "grad_norm": 1.5425249338150024, + "learning_rate": 4.8693760463798476e-05, + "loss": 1.2215, + "step": 6910 + }, + { + "epoch": 2.07, + "grad_norm": 1.0112886428833008, + "learning_rate": 4.869188554232249e-05, + "loss": 1.4421, + "step": 6915 + }, + { + "epoch": 2.07, + "grad_norm": 0.9172187447547913, + "learning_rate": 4.86900093123683e-05, + "loss": 1.4047, + "step": 6920 + }, + { + "epoch": 2.07, + "grad_norm": 1.2613557577133179, + "learning_rate": 4.868813177403952e-05, + "loss": 1.1333, + "step": 6925 + }, + { + "epoch": 2.07, + "grad_norm": 0.811103880405426, + "learning_rate": 4.8686252927439844e-05, + "loss": 1.2167, + "step": 6930 + }, + { + "epoch": 2.07, + "grad_norm": 2.1426024436950684, + "learning_rate": 4.868437277267304e-05, + "loss": 1.4477, + "step": 6935 + }, + { + "epoch": 2.08, + "grad_norm": 0.6741098165512085, + "learning_rate": 4.868249130984294e-05, + "loss": 1.4966, + "step": 6940 + }, + { + "epoch": 2.08, + "grad_norm": 0.6223343014717102, + "learning_rate": 4.868060853905346e-05, + "loss": 1.3415, + "step": 6945 + }, + { + "epoch": 2.08, + "grad_norm": 1.3729140758514404, + "learning_rate": 4.86787244604086e-05, + "loss": 1.2362, + "step": 6950 + }, + { + "epoch": 2.08, + "grad_norm": 1.3088891506195068, + "learning_rate": 4.8676839074012385e-05, + "loss": 1.4647, + "step": 6955 + }, + { + "epoch": 2.08, + "grad_norm": 1.7941012382507324, + "learning_rate": 4.867495237996897e-05, + "loss": 1.2942, + "step": 6960 + }, + { + "epoch": 2.08, + "grad_norm": 1.6377042531967163, + "learning_rate": 4.867306437838254e-05, + "loss": 1.4394, + "step": 6965 + }, + { + "epoch": 2.09, + "grad_norm": 1.298292636871338, + "learning_rate": 4.867117506935737e-05, + "loss": 1.3146, + "step": 6970 + }, + { + "epoch": 2.09, + "grad_norm": 1.6398062705993652, + "learning_rate": 4.8669284452997795e-05, + "loss": 1.1931, + "step": 6975 + }, + { + "epoch": 2.09, + "grad_norm": 0.8299776911735535, + "learning_rate": 4.866739252940826e-05, + "loss": 1.3339, + "step": 6980 + }, + { + "epoch": 2.09, + "grad_norm": 1.9005075693130493, + "learning_rate": 4.866549929869323e-05, + "loss": 1.289, + "step": 6985 + }, + { + "epoch": 2.09, + "grad_norm": 0.9177480340003967, + "learning_rate": 4.866360476095727e-05, + "loss": 1.4014, + "step": 6990 + }, + { + "epoch": 2.09, + "grad_norm": 1.6924612522125244, + "learning_rate": 4.866170891630502e-05, + "loss": 1.2609, + "step": 6995 + }, + { + "epoch": 2.09, + "grad_norm": 0.8772937059402466, + "learning_rate": 4.865981176484118e-05, + "loss": 1.2517, + "step": 7000 + }, + { + "epoch": 2.1, + "grad_norm": 1.7310841083526611, + "learning_rate": 4.865791330667053e-05, + "loss": 1.3634, + "step": 7005 + }, + { + "epoch": 2.1, + "grad_norm": 1.4149776697158813, + "learning_rate": 4.865601354189792e-05, + "loss": 1.28, + "step": 7010 + }, + { + "epoch": 2.1, + "grad_norm": 0.9106690287590027, + "learning_rate": 4.865411247062827e-05, + "loss": 1.4158, + "step": 7015 + }, + { + "epoch": 2.1, + "grad_norm": 1.424204707145691, + "learning_rate": 4.865221009296657e-05, + "loss": 1.4474, + "step": 7020 + }, + { + "epoch": 2.1, + "grad_norm": 0.823806881904602, + "learning_rate": 4.86503064090179e-05, + "loss": 1.1959, + "step": 7025 + }, + { + "epoch": 2.1, + "grad_norm": 1.7646584510803223, + "learning_rate": 4.864840141888739e-05, + "loss": 1.2428, + "step": 7030 + }, + { + "epoch": 2.1, + "grad_norm": 0.7892530560493469, + "learning_rate": 4.864649512268024e-05, + "loss": 1.4816, + "step": 7035 + }, + { + "epoch": 2.11, + "grad_norm": 1.802615761756897, + "learning_rate": 4.864458752050175e-05, + "loss": 1.2827, + "step": 7040 + }, + { + "epoch": 2.11, + "grad_norm": 1.3489775657653809, + "learning_rate": 4.864267861245727e-05, + "loss": 1.3001, + "step": 7045 + }, + { + "epoch": 2.11, + "grad_norm": 1.7209609746932983, + "learning_rate": 4.8640768398652224e-05, + "loss": 1.2558, + "step": 7050 + }, + { + "epoch": 2.11, + "grad_norm": 1.2249550819396973, + "learning_rate": 4.863885687919212e-05, + "loss": 1.4765, + "step": 7055 + }, + { + "epoch": 2.11, + "grad_norm": 1.2855435609817505, + "learning_rate": 4.863694405418251e-05, + "loss": 1.1862, + "step": 7060 + }, + { + "epoch": 2.11, + "grad_norm": 1.4414563179016113, + "learning_rate": 4.863502992372906e-05, + "loss": 1.1531, + "step": 7065 + }, + { + "epoch": 2.12, + "grad_norm": 0.9723905920982361, + "learning_rate": 4.863311448793746e-05, + "loss": 1.3402, + "step": 7070 + }, + { + "epoch": 2.12, + "grad_norm": 1.274014949798584, + "learning_rate": 4.8631197746913525e-05, + "loss": 1.5264, + "step": 7075 + }, + { + "epoch": 2.12, + "grad_norm": 0.9938822388648987, + "learning_rate": 4.86292797007631e-05, + "loss": 1.2517, + "step": 7080 + }, + { + "epoch": 2.12, + "grad_norm": 2.461939811706543, + "learning_rate": 4.862736034959211e-05, + "loss": 1.3553, + "step": 7085 + }, + { + "epoch": 2.12, + "grad_norm": 0.8186405897140503, + "learning_rate": 4.8625439693506576e-05, + "loss": 1.3186, + "step": 7090 + }, + { + "epoch": 2.12, + "grad_norm": 2.9292609691619873, + "learning_rate": 4.862351773261256e-05, + "loss": 1.3257, + "step": 7095 + }, + { + "epoch": 2.12, + "grad_norm": 0.6807072758674622, + "learning_rate": 4.8621594467016216e-05, + "loss": 1.2733, + "step": 7100 + }, + { + "epoch": 2.13, + "grad_norm": 0.9890395402908325, + "learning_rate": 4.8619669896823766e-05, + "loss": 1.4189, + "step": 7105 + }, + { + "epoch": 2.13, + "grad_norm": 0.6927660703659058, + "learning_rate": 4.8617744022141496e-05, + "loss": 1.0253, + "step": 7110 + }, + { + "epoch": 2.13, + "grad_norm": 2.2300829887390137, + "learning_rate": 4.861581684307577e-05, + "loss": 1.3179, + "step": 7115 + }, + { + "epoch": 2.13, + "grad_norm": 2.7834036350250244, + "learning_rate": 4.8613888359733035e-05, + "loss": 1.4093, + "step": 7120 + }, + { + "epoch": 2.13, + "grad_norm": 1.634334683418274, + "learning_rate": 4.861195857221978e-05, + "loss": 1.3292, + "step": 7125 + }, + { + "epoch": 2.13, + "grad_norm": 2.0878982543945312, + "learning_rate": 4.86100274806426e-05, + "loss": 1.2545, + "step": 7130 + }, + { + "epoch": 2.13, + "grad_norm": 1.6050611734390259, + "learning_rate": 4.8608095085108155e-05, + "loss": 1.2622, + "step": 7135 + }, + { + "epoch": 2.14, + "grad_norm": 1.9072321653366089, + "learning_rate": 4.8606161385723147e-05, + "loss": 1.2726, + "step": 7140 + }, + { + "epoch": 2.14, + "grad_norm": 1.5780624151229858, + "learning_rate": 4.860422638259439e-05, + "loss": 1.3947, + "step": 7145 + }, + { + "epoch": 2.14, + "grad_norm": 2.0767362117767334, + "learning_rate": 4.860229007582874e-05, + "loss": 1.4344, + "step": 7150 + }, + { + "epoch": 2.14, + "grad_norm": 1.11172354221344, + "learning_rate": 4.860035246553314e-05, + "loss": 1.3602, + "step": 7155 + }, + { + "epoch": 2.14, + "grad_norm": 0.8127298951148987, + "learning_rate": 4.859841355181461e-05, + "loss": 1.351, + "step": 7160 + }, + { + "epoch": 2.14, + "grad_norm": 1.1209958791732788, + "learning_rate": 4.8596473334780225e-05, + "loss": 1.279, + "step": 7165 + }, + { + "epoch": 2.15, + "grad_norm": 1.7135673761367798, + "learning_rate": 4.859453181453715e-05, + "loss": 1.5372, + "step": 7170 + }, + { + "epoch": 2.15, + "grad_norm": 1.5923312902450562, + "learning_rate": 4.85925889911926e-05, + "loss": 1.3879, + "step": 7175 + }, + { + "epoch": 2.15, + "grad_norm": 1.2418073415756226, + "learning_rate": 4.8590644864853886e-05, + "loss": 1.3691, + "step": 7180 + }, + { + "epoch": 2.15, + "grad_norm": 0.9829604625701904, + "learning_rate": 4.858869943562838e-05, + "loss": 1.3597, + "step": 7185 + }, + { + "epoch": 2.15, + "grad_norm": 1.1962437629699707, + "learning_rate": 4.8586752703623516e-05, + "loss": 1.3715, + "step": 7190 + }, + { + "epoch": 2.15, + "grad_norm": 1.7202482223510742, + "learning_rate": 4.8584804668946825e-05, + "loss": 1.446, + "step": 7195 + }, + { + "epoch": 2.15, + "grad_norm": 2.16395902633667, + "learning_rate": 4.858285533170589e-05, + "loss": 1.2798, + "step": 7200 + }, + { + "epoch": 2.16, + "grad_norm": 0.6793248057365417, + "learning_rate": 4.858090469200835e-05, + "loss": 1.3275, + "step": 7205 + }, + { + "epoch": 2.16, + "grad_norm": 0.6636670231819153, + "learning_rate": 4.8578952749961974e-05, + "loss": 1.433, + "step": 7210 + }, + { + "epoch": 2.16, + "grad_norm": 1.3391855955123901, + "learning_rate": 4.8576999505674546e-05, + "loss": 1.4, + "step": 7215 + }, + { + "epoch": 2.16, + "grad_norm": 0.8590707182884216, + "learning_rate": 4.857504495925393e-05, + "loss": 1.4144, + "step": 7220 + }, + { + "epoch": 2.16, + "grad_norm": 1.8004813194274902, + "learning_rate": 4.85730891108081e-05, + "loss": 1.2784, + "step": 7225 + }, + { + "epoch": 2.16, + "grad_norm": 1.125076413154602, + "learning_rate": 4.8571131960445046e-05, + "loss": 1.3068, + "step": 7230 + }, + { + "epoch": 2.16, + "grad_norm": 0.909472644329071, + "learning_rate": 4.856917350827289e-05, + "loss": 1.3304, + "step": 7235 + }, + { + "epoch": 2.17, + "grad_norm": 1.6477268934249878, + "learning_rate": 4.8567213754399764e-05, + "loss": 1.2217, + "step": 7240 + }, + { + "epoch": 2.17, + "grad_norm": 0.9648934006690979, + "learning_rate": 4.856525269893393e-05, + "loss": 1.4223, + "step": 7245 + }, + { + "epoch": 2.17, + "grad_norm": 2.0322964191436768, + "learning_rate": 4.856329034198368e-05, + "loss": 1.4918, + "step": 7250 + }, + { + "epoch": 2.17, + "grad_norm": 1.6450649499893188, + "learning_rate": 4.8561326683657405e-05, + "loss": 1.3366, + "step": 7255 + }, + { + "epoch": 2.17, + "grad_norm": 1.0612916946411133, + "learning_rate": 4.855936172406354e-05, + "loss": 1.2437, + "step": 7260 + }, + { + "epoch": 2.17, + "grad_norm": 1.8631542921066284, + "learning_rate": 4.855739546331062e-05, + "loss": 1.3482, + "step": 7265 + }, + { + "epoch": 2.18, + "grad_norm": 2.2706685066223145, + "learning_rate": 4.855542790150723e-05, + "loss": 1.3031, + "step": 7270 + }, + { + "epoch": 2.18, + "grad_norm": 1.1304792165756226, + "learning_rate": 4.855345903876204e-05, + "loss": 1.4319, + "step": 7275 + }, + { + "epoch": 2.18, + "grad_norm": 1.4051589965820312, + "learning_rate": 4.8551488875183794e-05, + "loss": 1.2979, + "step": 7280 + }, + { + "epoch": 2.18, + "grad_norm": 1.5046559572219849, + "learning_rate": 4.8549517410881296e-05, + "loss": 1.3051, + "step": 7285 + }, + { + "epoch": 2.18, + "grad_norm": 1.2949678897857666, + "learning_rate": 4.8547544645963435e-05, + "loss": 1.4025, + "step": 7290 + }, + { + "epoch": 2.18, + "grad_norm": 1.3546310663223267, + "learning_rate": 4.854557058053915e-05, + "loss": 1.2204, + "step": 7295 + }, + { + "epoch": 2.18, + "grad_norm": 1.055916666984558, + "learning_rate": 4.8543595214717486e-05, + "loss": 1.3216, + "step": 7300 + }, + { + "epoch": 2.19, + "grad_norm": 1.6973356008529663, + "learning_rate": 4.8541618548607525e-05, + "loss": 1.1717, + "step": 7305 + }, + { + "epoch": 2.19, + "grad_norm": 0.9131316542625427, + "learning_rate": 4.853964058231844e-05, + "loss": 1.224, + "step": 7310 + }, + { + "epoch": 2.19, + "grad_norm": 1.3941514492034912, + "learning_rate": 4.853766131595948e-05, + "loss": 1.4302, + "step": 7315 + }, + { + "epoch": 2.19, + "grad_norm": 0.6351519823074341, + "learning_rate": 4.853568074963994e-05, + "loss": 1.3049, + "step": 7320 + }, + { + "epoch": 2.19, + "grad_norm": 1.1367075443267822, + "learning_rate": 4.853369888346923e-05, + "loss": 1.4376, + "step": 7325 + }, + { + "epoch": 2.19, + "grad_norm": 1.2908804416656494, + "learning_rate": 4.853171571755679e-05, + "loss": 1.3539, + "step": 7330 + }, + { + "epoch": 2.19, + "grad_norm": 0.7789058089256287, + "learning_rate": 4.8529731252012145e-05, + "loss": 1.409, + "step": 7335 + }, + { + "epoch": 2.2, + "grad_norm": 1.526838779449463, + "learning_rate": 4.8527745486944906e-05, + "loss": 1.3515, + "step": 7340 + }, + { + "epoch": 2.2, + "grad_norm": 1.1574357748031616, + "learning_rate": 4.852575842246474e-05, + "loss": 1.3984, + "step": 7345 + }, + { + "epoch": 2.2, + "grad_norm": 2.089668035507202, + "learning_rate": 4.852377005868138e-05, + "loss": 1.3556, + "step": 7350 + }, + { + "epoch": 2.2, + "grad_norm": 1.4345306158065796, + "learning_rate": 4.852178039570466e-05, + "loss": 1.2993, + "step": 7355 + }, + { + "epoch": 2.2, + "grad_norm": 1.0445202589035034, + "learning_rate": 4.851978943364446e-05, + "loss": 1.4464, + "step": 7360 + }, + { + "epoch": 2.2, + "grad_norm": 2.4098684787750244, + "learning_rate": 4.851779717261072e-05, + "loss": 1.4884, + "step": 7365 + }, + { + "epoch": 2.21, + "grad_norm": 0.9191934466362, + "learning_rate": 4.851580361271351e-05, + "loss": 1.5069, + "step": 7370 + }, + { + "epoch": 2.21, + "grad_norm": 1.705700397491455, + "learning_rate": 4.8513808754062894e-05, + "loss": 1.2528, + "step": 7375 + }, + { + "epoch": 2.21, + "grad_norm": 1.1692149639129639, + "learning_rate": 4.851181259676907e-05, + "loss": 1.3765, + "step": 7380 + }, + { + "epoch": 2.21, + "grad_norm": 3.2977681159973145, + "learning_rate": 4.850981514094228e-05, + "loss": 1.2544, + "step": 7385 + }, + { + "epoch": 2.21, + "grad_norm": 1.1639879941940308, + "learning_rate": 4.850781638669283e-05, + "loss": 1.342, + "step": 7390 + }, + { + "epoch": 2.21, + "grad_norm": 0.9928296208381653, + "learning_rate": 4.8505816334131116e-05, + "loss": 1.2802, + "step": 7395 + }, + { + "epoch": 2.21, + "grad_norm": 0.9788908958435059, + "learning_rate": 4.85038149833676e-05, + "loss": 1.39, + "step": 7400 + }, + { + "epoch": 2.22, + "grad_norm": 1.9940471649169922, + "learning_rate": 4.850181233451281e-05, + "loss": 1.354, + "step": 7405 + }, + { + "epoch": 2.22, + "grad_norm": 1.0245496034622192, + "learning_rate": 4.849980838767736e-05, + "loss": 1.3212, + "step": 7410 + }, + { + "epoch": 2.22, + "grad_norm": 2.179013967514038, + "learning_rate": 4.849780314297191e-05, + "loss": 1.5085, + "step": 7415 + }, + { + "epoch": 2.22, + "grad_norm": 1.598319411277771, + "learning_rate": 4.8495796600507226e-05, + "loss": 1.4813, + "step": 7420 + }, + { + "epoch": 2.22, + "grad_norm": 1.519370675086975, + "learning_rate": 4.8493788760394115e-05, + "loss": 1.1946, + "step": 7425 + }, + { + "epoch": 2.22, + "grad_norm": 1.5374704599380493, + "learning_rate": 4.849177962274347e-05, + "loss": 1.5137, + "step": 7430 + }, + { + "epoch": 2.22, + "grad_norm": 0.7265501022338867, + "learning_rate": 4.8489769187666255e-05, + "loss": 1.3113, + "step": 7435 + }, + { + "epoch": 2.23, + "grad_norm": 1.2072696685791016, + "learning_rate": 4.848775745527351e-05, + "loss": 1.3973, + "step": 7440 + }, + { + "epoch": 2.23, + "grad_norm": 1.156033992767334, + "learning_rate": 4.848574442567633e-05, + "loss": 1.4471, + "step": 7445 + }, + { + "epoch": 2.23, + "grad_norm": 1.4983317852020264, + "learning_rate": 4.848373009898589e-05, + "loss": 1.3487, + "step": 7450 + }, + { + "epoch": 2.23, + "grad_norm": 1.2913556098937988, + "learning_rate": 4.848171447531346e-05, + "loss": 1.3145, + "step": 7455 + }, + { + "epoch": 2.23, + "grad_norm": 1.8600425720214844, + "learning_rate": 4.847969755477034e-05, + "loss": 1.3808, + "step": 7460 + }, + { + "epoch": 2.23, + "grad_norm": 1.028699278831482, + "learning_rate": 4.847767933746793e-05, + "loss": 1.3718, + "step": 7465 + }, + { + "epoch": 2.23, + "grad_norm": 1.1770877838134766, + "learning_rate": 4.8475659823517695e-05, + "loss": 1.4073, + "step": 7470 + }, + { + "epoch": 2.24, + "grad_norm": 0.6877673268318176, + "learning_rate": 4.847363901303117e-05, + "loss": 1.3549, + "step": 7475 + }, + { + "epoch": 2.24, + "grad_norm": 0.6186959743499756, + "learning_rate": 4.847161690611996e-05, + "loss": 1.2288, + "step": 7480 + }, + { + "epoch": 2.24, + "grad_norm": 1.7008882761001587, + "learning_rate": 4.846959350289575e-05, + "loss": 1.1874, + "step": 7485 + }, + { + "epoch": 2.24, + "grad_norm": 1.5733880996704102, + "learning_rate": 4.846756880347029e-05, + "loss": 1.3694, + "step": 7490 + }, + { + "epoch": 2.24, + "grad_norm": 0.7959824800491333, + "learning_rate": 4.846554280795539e-05, + "loss": 1.3386, + "step": 7495 + }, + { + "epoch": 2.24, + "grad_norm": 1.6827507019042969, + "learning_rate": 4.8463515516462946e-05, + "loss": 1.3071, + "step": 7500 + }, + { + "epoch": 2.25, + "grad_norm": 1.2505954504013062, + "learning_rate": 4.8461486929104936e-05, + "loss": 1.3838, + "step": 7505 + }, + { + "epoch": 2.25, + "grad_norm": 0.9889876246452332, + "learning_rate": 4.8459457045993396e-05, + "loss": 1.4403, + "step": 7510 + }, + { + "epoch": 2.25, + "grad_norm": 1.9707186222076416, + "learning_rate": 4.845742586724042e-05, + "loss": 1.3676, + "step": 7515 + }, + { + "epoch": 2.25, + "grad_norm": 2.145528554916382, + "learning_rate": 4.845539339295819e-05, + "loss": 1.5171, + "step": 7520 + }, + { + "epoch": 2.25, + "grad_norm": 1.8875707387924194, + "learning_rate": 4.845335962325897e-05, + "loss": 1.236, + "step": 7525 + }, + { + "epoch": 2.25, + "grad_norm": 1.5600254535675049, + "learning_rate": 4.845132455825508e-05, + "loss": 1.1585, + "step": 7530 + }, + { + "epoch": 2.25, + "grad_norm": 1.4640624523162842, + "learning_rate": 4.844928819805892e-05, + "loss": 1.2519, + "step": 7535 + }, + { + "epoch": 2.26, + "grad_norm": 0.7027156949043274, + "learning_rate": 4.844725054278293e-05, + "loss": 1.1937, + "step": 7540 + }, + { + "epoch": 2.26, + "grad_norm": 1.2013248205184937, + "learning_rate": 4.8445211592539674e-05, + "loss": 1.3886, + "step": 7545 + }, + { + "epoch": 2.26, + "grad_norm": 0.7057489156723022, + "learning_rate": 4.844317134744174e-05, + "loss": 1.3281, + "step": 7550 + }, + { + "epoch": 2.26, + "grad_norm": 0.8883240818977356, + "learning_rate": 4.8441129807601834e-05, + "loss": 1.3181, + "step": 7555 + }, + { + "epoch": 2.26, + "grad_norm": 1.322933554649353, + "learning_rate": 4.8439086973132684e-05, + "loss": 1.4801, + "step": 7560 + }, + { + "epoch": 2.26, + "grad_norm": 2.008849859237671, + "learning_rate": 4.843704284414713e-05, + "loss": 1.4635, + "step": 7565 + }, + { + "epoch": 2.26, + "grad_norm": 1.0462682247161865, + "learning_rate": 4.8434997420758065e-05, + "loss": 1.4163, + "step": 7570 + }, + { + "epoch": 2.27, + "grad_norm": 1.419507622718811, + "learning_rate": 4.843295070307844e-05, + "loss": 1.366, + "step": 7575 + }, + { + "epoch": 2.27, + "grad_norm": 1.5404951572418213, + "learning_rate": 4.8430902691221314e-05, + "loss": 1.4692, + "step": 7580 + }, + { + "epoch": 2.27, + "grad_norm": 1.5255093574523926, + "learning_rate": 4.842885338529979e-05, + "loss": 1.5013, + "step": 7585 + }, + { + "epoch": 2.27, + "grad_norm": 2.1905360221862793, + "learning_rate": 4.842680278542704e-05, + "loss": 1.3308, + "step": 7590 + }, + { + "epoch": 2.27, + "grad_norm": 1.8828074932098389, + "learning_rate": 4.842475089171632e-05, + "loss": 1.2902, + "step": 7595 + }, + { + "epoch": 2.27, + "grad_norm": 1.31754732131958, + "learning_rate": 4.842269770428096e-05, + "loss": 1.2712, + "step": 7600 + }, + { + "epoch": 2.28, + "grad_norm": 1.3499248027801514, + "learning_rate": 4.842064322323436e-05, + "loss": 1.419, + "step": 7605 + }, + { + "epoch": 2.28, + "grad_norm": 2.2709991931915283, + "learning_rate": 4.841858744868997e-05, + "loss": 1.3574, + "step": 7610 + }, + { + "epoch": 2.28, + "grad_norm": 0.6626339554786682, + "learning_rate": 4.8416530380761335e-05, + "loss": 1.3227, + "step": 7615 + }, + { + "epoch": 2.28, + "grad_norm": 1.2082407474517822, + "learning_rate": 4.841447201956208e-05, + "loss": 1.1635, + "step": 7620 + }, + { + "epoch": 2.28, + "grad_norm": 1.1640417575836182, + "learning_rate": 4.841241236520586e-05, + "loss": 1.2477, + "step": 7625 + }, + { + "epoch": 2.28, + "grad_norm": 1.2419222593307495, + "learning_rate": 4.8410351417806454e-05, + "loss": 1.1595, + "step": 7630 + }, + { + "epoch": 2.28, + "grad_norm": 2.1427175998687744, + "learning_rate": 4.840828917747766e-05, + "loss": 1.3203, + "step": 7635 + }, + { + "epoch": 2.29, + "grad_norm": 1.4415032863616943, + "learning_rate": 4.8406225644333395e-05, + "loss": 1.2786, + "step": 7640 + }, + { + "epoch": 2.29, + "grad_norm": 1.9205933809280396, + "learning_rate": 4.8404160818487615e-05, + "loss": 1.3197, + "step": 7645 + }, + { + "epoch": 2.29, + "grad_norm": 2.3150992393493652, + "learning_rate": 4.840209470005436e-05, + "loss": 1.3841, + "step": 7650 + }, + { + "epoch": 2.29, + "grad_norm": 2.114997148513794, + "learning_rate": 4.8400027289147746e-05, + "loss": 1.4485, + "step": 7655 + }, + { + "epoch": 2.29, + "grad_norm": 1.2688688039779663, + "learning_rate": 4.8397958585881934e-05, + "loss": 1.2421, + "step": 7660 + }, + { + "epoch": 2.29, + "grad_norm": 2.3500120639801025, + "learning_rate": 4.83958885903712e-05, + "loss": 1.4009, + "step": 7665 + }, + { + "epoch": 2.29, + "grad_norm": 0.87721848487854, + "learning_rate": 4.839381730272985e-05, + "loss": 1.2508, + "step": 7670 + }, + { + "epoch": 2.3, + "grad_norm": 2.1031057834625244, + "learning_rate": 4.83917447230723e-05, + "loss": 1.5297, + "step": 7675 + }, + { + "epoch": 2.3, + "grad_norm": 0.6989408731460571, + "learning_rate": 4.838967085151299e-05, + "loss": 1.1791, + "step": 7680 + }, + { + "epoch": 2.3, + "grad_norm": 1.4262398481369019, + "learning_rate": 4.8387595688166474e-05, + "loss": 1.4372, + "step": 7685 + }, + { + "epoch": 2.3, + "grad_norm": 1.9426538944244385, + "learning_rate": 4.8385519233147355e-05, + "loss": 1.2325, + "step": 7690 + }, + { + "epoch": 2.3, + "grad_norm": 1.5246938467025757, + "learning_rate": 4.838344148657033e-05, + "loss": 1.3084, + "step": 7695 + }, + { + "epoch": 2.3, + "grad_norm": 1.8816734552383423, + "learning_rate": 4.8381362448550126e-05, + "loss": 1.3766, + "step": 7700 + }, + { + "epoch": 2.31, + "grad_norm": 1.2604855298995972, + "learning_rate": 4.837928211920159e-05, + "loss": 1.2206, + "step": 7705 + }, + { + "epoch": 2.31, + "grad_norm": 1.575439691543579, + "learning_rate": 4.837720049863958e-05, + "loss": 1.4169, + "step": 7710 + }, + { + "epoch": 2.31, + "grad_norm": 0.9932951331138611, + "learning_rate": 4.837511758697911e-05, + "loss": 1.3783, + "step": 7715 + }, + { + "epoch": 2.31, + "grad_norm": 0.9457315802574158, + "learning_rate": 4.8373033384335185e-05, + "loss": 1.3412, + "step": 7720 + }, + { + "epoch": 2.31, + "grad_norm": 1.7361222505569458, + "learning_rate": 4.8370947890822914e-05, + "loss": 1.3247, + "step": 7725 + }, + { + "epoch": 2.31, + "grad_norm": 0.6409812569618225, + "learning_rate": 4.8368861106557494e-05, + "loss": 1.3878, + "step": 7730 + }, + { + "epoch": 2.31, + "grad_norm": 1.8458154201507568, + "learning_rate": 4.8366773031654155e-05, + "loss": 1.3113, + "step": 7735 + }, + { + "epoch": 2.32, + "grad_norm": 1.0293338298797607, + "learning_rate": 4.836468366622824e-05, + "loss": 1.5338, + "step": 7740 + }, + { + "epoch": 2.32, + "grad_norm": 1.2134134769439697, + "learning_rate": 4.836259301039513e-05, + "loss": 1.4984, + "step": 7745 + }, + { + "epoch": 2.32, + "grad_norm": 1.1873372793197632, + "learning_rate": 4.8360501064270293e-05, + "loss": 1.2362, + "step": 7750 + }, + { + "epoch": 2.32, + "grad_norm": 2.671053647994995, + "learning_rate": 4.835840782796925e-05, + "loss": 1.4207, + "step": 7755 + }, + { + "epoch": 2.32, + "grad_norm": 1.1975551843643188, + "learning_rate": 4.835631330160764e-05, + "loss": 1.3198, + "step": 7760 + }, + { + "epoch": 2.32, + "grad_norm": 1.6073672771453857, + "learning_rate": 4.835421748530112e-05, + "loss": 1.2571, + "step": 7765 + }, + { + "epoch": 2.32, + "grad_norm": 1.3273591995239258, + "learning_rate": 4.8352120379165444e-05, + "loss": 1.5365, + "step": 7770 + }, + { + "epoch": 2.33, + "grad_norm": 1.3509129285812378, + "learning_rate": 4.835002198331643e-05, + "loss": 1.4124, + "step": 7775 + }, + { + "epoch": 2.33, + "grad_norm": 1.4344903230667114, + "learning_rate": 4.834792229786997e-05, + "loss": 1.2365, + "step": 7780 + }, + { + "epoch": 2.33, + "grad_norm": 0.8779138922691345, + "learning_rate": 4.834582132294203e-05, + "loss": 1.3173, + "step": 7785 + }, + { + "epoch": 2.33, + "grad_norm": 1.2786595821380615, + "learning_rate": 4.834371905864865e-05, + "loss": 1.5267, + "step": 7790 + }, + { + "epoch": 2.33, + "grad_norm": 1.5323184728622437, + "learning_rate": 4.834161550510593e-05, + "loss": 1.2912, + "step": 7795 + }, + { + "epoch": 2.33, + "grad_norm": 0.7492801547050476, + "learning_rate": 4.8339510662430046e-05, + "loss": 1.3253, + "step": 7800 + }, + { + "epoch": 2.34, + "grad_norm": 0.9587426781654358, + "learning_rate": 4.833740453073725e-05, + "loss": 1.3364, + "step": 7805 + }, + { + "epoch": 2.34, + "grad_norm": 1.607540488243103, + "learning_rate": 4.833529711014386e-05, + "loss": 1.2618, + "step": 7810 + }, + { + "epoch": 2.34, + "grad_norm": 1.6319369077682495, + "learning_rate": 4.833318840076626e-05, + "loss": 1.2573, + "step": 7815 + }, + { + "epoch": 2.34, + "grad_norm": 1.2608624696731567, + "learning_rate": 4.833107840272092e-05, + "loss": 1.1074, + "step": 7820 + }, + { + "epoch": 2.34, + "grad_norm": 0.8057272434234619, + "learning_rate": 4.832896711612438e-05, + "loss": 1.3739, + "step": 7825 + }, + { + "epoch": 2.34, + "grad_norm": 1.2614953517913818, + "learning_rate": 4.832685454109323e-05, + "loss": 1.4891, + "step": 7830 + }, + { + "epoch": 2.34, + "grad_norm": 1.4242806434631348, + "learning_rate": 4.8324740677744154e-05, + "loss": 1.254, + "step": 7835 + }, + { + "epoch": 2.35, + "grad_norm": 1.3085395097732544, + "learning_rate": 4.832262552619389e-05, + "loss": 1.3165, + "step": 7840 + }, + { + "epoch": 2.35, + "grad_norm": 1.3229748010635376, + "learning_rate": 4.832050908655926e-05, + "loss": 1.4134, + "step": 7845 + }, + { + "epoch": 2.35, + "grad_norm": 2.4007372856140137, + "learning_rate": 4.831839135895716e-05, + "loss": 1.3104, + "step": 7850 + }, + { + "epoch": 2.35, + "grad_norm": 1.3286653757095337, + "learning_rate": 4.831627234350453e-05, + "loss": 1.2269, + "step": 7855 + }, + { + "epoch": 2.35, + "grad_norm": 1.3296312093734741, + "learning_rate": 4.831415204031843e-05, + "loss": 1.3776, + "step": 7860 + }, + { + "epoch": 2.35, + "grad_norm": 0.8926234841346741, + "learning_rate": 4.831203044951593e-05, + "loss": 1.3842, + "step": 7865 + }, + { + "epoch": 2.35, + "grad_norm": 1.3321502208709717, + "learning_rate": 4.8309907571214234e-05, + "loss": 1.2039, + "step": 7870 + }, + { + "epoch": 2.36, + "grad_norm": 0.7737197875976562, + "learning_rate": 4.830778340553057e-05, + "loss": 1.3269, + "step": 7875 + }, + { + "epoch": 2.36, + "grad_norm": 1.7216429710388184, + "learning_rate": 4.830565795258225e-05, + "loss": 1.4176, + "step": 7880 + }, + { + "epoch": 2.36, + "grad_norm": 7.1848835945129395, + "learning_rate": 4.830353121248667e-05, + "loss": 1.429, + "step": 7885 + }, + { + "epoch": 2.36, + "grad_norm": 1.1508842706680298, + "learning_rate": 4.830140318536128e-05, + "loss": 1.361, + "step": 7890 + }, + { + "epoch": 2.36, + "grad_norm": 0.9978459477424622, + "learning_rate": 4.829927387132362e-05, + "loss": 1.2397, + "step": 7895 + }, + { + "epoch": 2.36, + "grad_norm": 1.3493157625198364, + "learning_rate": 4.829714327049127e-05, + "loss": 1.3463, + "step": 7900 + }, + { + "epoch": 2.37, + "grad_norm": 1.3078923225402832, + "learning_rate": 4.829501138298192e-05, + "loss": 1.3451, + "step": 7905 + }, + { + "epoch": 2.37, + "grad_norm": 1.330299735069275, + "learning_rate": 4.829287820891332e-05, + "loss": 1.4147, + "step": 7910 + }, + { + "epoch": 2.37, + "grad_norm": 1.1133418083190918, + "learning_rate": 4.829074374840325e-05, + "loss": 1.2857, + "step": 7915 + }, + { + "epoch": 2.37, + "grad_norm": 1.8395391702651978, + "learning_rate": 4.828860800156961e-05, + "loss": 1.262, + "step": 7920 + }, + { + "epoch": 2.37, + "grad_norm": 1.1168737411499023, + "learning_rate": 4.8286470968530375e-05, + "loss": 1.364, + "step": 7925 + }, + { + "epoch": 2.37, + "grad_norm": 1.8225456476211548, + "learning_rate": 4.8284332649403534e-05, + "loss": 1.3554, + "step": 7930 + }, + { + "epoch": 2.37, + "grad_norm": 1.4843734502792358, + "learning_rate": 4.8282193044307213e-05, + "loss": 1.3841, + "step": 7935 + }, + { + "epoch": 2.38, + "grad_norm": 2.07279896736145, + "learning_rate": 4.8280052153359565e-05, + "loss": 1.1941, + "step": 7940 + }, + { + "epoch": 2.38, + "grad_norm": 0.881043016910553, + "learning_rate": 4.8277909976678847e-05, + "loss": 1.2924, + "step": 7945 + }, + { + "epoch": 2.38, + "grad_norm": 1.8441745042800903, + "learning_rate": 4.8275766514383346e-05, + "loss": 1.3775, + "step": 7950 + }, + { + "epoch": 2.38, + "grad_norm": 1.2822072505950928, + "learning_rate": 4.827362176659146e-05, + "loss": 1.3072, + "step": 7955 + }, + { + "epoch": 2.38, + "grad_norm": 0.9727440476417542, + "learning_rate": 4.8271475733421636e-05, + "loss": 1.4031, + "step": 7960 + }, + { + "epoch": 2.38, + "grad_norm": 1.616499900817871, + "learning_rate": 4.826932841499239e-05, + "loss": 1.383, + "step": 7965 + }, + { + "epoch": 2.38, + "grad_norm": 1.3687835931777954, + "learning_rate": 4.826717981142233e-05, + "loss": 1.2824, + "step": 7970 + }, + { + "epoch": 2.39, + "grad_norm": 6.598733901977539, + "learning_rate": 4.826502992283011e-05, + "loss": 1.2172, + "step": 7975 + }, + { + "epoch": 2.39, + "grad_norm": 1.1800307035446167, + "learning_rate": 4.826287874933446e-05, + "loss": 1.2832, + "step": 7980 + }, + { + "epoch": 2.39, + "grad_norm": 0.8195118308067322, + "learning_rate": 4.826072629105422e-05, + "loss": 1.2309, + "step": 7985 + }, + { + "epoch": 2.39, + "grad_norm": 3.522782564163208, + "learning_rate": 4.8258572548108226e-05, + "loss": 1.4097, + "step": 7990 + }, + { + "epoch": 2.39, + "grad_norm": 1.1727079153060913, + "learning_rate": 4.8256417520615446e-05, + "loss": 1.507, + "step": 7995 + }, + { + "epoch": 2.39, + "grad_norm": 0.7329881191253662, + "learning_rate": 4.825426120869491e-05, + "loss": 1.2724, + "step": 8000 + }, + { + "epoch": 2.4, + "grad_norm": 1.2820346355438232, + "learning_rate": 4.825210361246569e-05, + "loss": 1.4193, + "step": 8005 + }, + { + "epoch": 2.4, + "grad_norm": 3.400033712387085, + "learning_rate": 4.824994473204697e-05, + "loss": 1.2408, + "step": 8010 + }, + { + "epoch": 2.4, + "grad_norm": 1.259271502494812, + "learning_rate": 4.824778456755796e-05, + "loss": 1.1783, + "step": 8015 + }, + { + "epoch": 2.4, + "grad_norm": 1.4319287538528442, + "learning_rate": 4.824562311911798e-05, + "loss": 1.2565, + "step": 8020 + }, + { + "epoch": 2.4, + "grad_norm": 1.389338493347168, + "learning_rate": 4.824346038684638e-05, + "loss": 1.4043, + "step": 8025 + }, + { + "epoch": 2.4, + "grad_norm": 0.8132510185241699, + "learning_rate": 4.824129637086264e-05, + "loss": 1.3739, + "step": 8030 + }, + { + "epoch": 2.4, + "grad_norm": 1.4358786344528198, + "learning_rate": 4.823913107128626e-05, + "loss": 1.2356, + "step": 8035 + }, + { + "epoch": 2.41, + "grad_norm": 1.380957841873169, + "learning_rate": 4.823696448823681e-05, + "loss": 1.41, + "step": 8040 + }, + { + "epoch": 2.41, + "grad_norm": 0.9512829780578613, + "learning_rate": 4.823479662183398e-05, + "loss": 1.1268, + "step": 8045 + }, + { + "epoch": 2.41, + "grad_norm": 1.8032479286193848, + "learning_rate": 4.823262747219749e-05, + "loss": 1.3039, + "step": 8050 + }, + { + "epoch": 2.41, + "grad_norm": 1.2832984924316406, + "learning_rate": 4.823045703944712e-05, + "loss": 1.3076, + "step": 8055 + }, + { + "epoch": 2.41, + "grad_norm": 1.5915247201919556, + "learning_rate": 4.8228285323702754e-05, + "loss": 1.3724, + "step": 8060 + }, + { + "epoch": 2.41, + "grad_norm": 0.7308753132820129, + "learning_rate": 4.8226112325084335e-05, + "loss": 1.1823, + "step": 8065 + }, + { + "epoch": 2.41, + "grad_norm": 1.5579122304916382, + "learning_rate": 4.822393804371188e-05, + "loss": 1.2714, + "step": 8070 + }, + { + "epoch": 2.42, + "grad_norm": 1.8810973167419434, + "learning_rate": 4.822176247970547e-05, + "loss": 1.3433, + "step": 8075 + }, + { + "epoch": 2.42, + "grad_norm": 0.7145785689353943, + "learning_rate": 4.821958563318524e-05, + "loss": 1.2667, + "step": 8080 + }, + { + "epoch": 2.42, + "grad_norm": 1.8525540828704834, + "learning_rate": 4.8217407504271446e-05, + "loss": 1.3189, + "step": 8085 + }, + { + "epoch": 2.42, + "grad_norm": 1.6017142534255981, + "learning_rate": 4.821522809308436e-05, + "loss": 1.2954, + "step": 8090 + }, + { + "epoch": 2.42, + "grad_norm": 1.1823608875274658, + "learning_rate": 4.821304739974437e-05, + "loss": 1.4232, + "step": 8095 + }, + { + "epoch": 2.42, + "grad_norm": 1.2978624105453491, + "learning_rate": 4.821086542437189e-05, + "loss": 1.3639, + "step": 8100 + }, + { + "epoch": 2.42, + "grad_norm": 0.7997733354568481, + "learning_rate": 4.8208682167087436e-05, + "loss": 1.3463, + "step": 8105 + }, + { + "epoch": 2.43, + "grad_norm": 1.6938416957855225, + "learning_rate": 4.820649762801159e-05, + "loss": 1.3213, + "step": 8110 + }, + { + "epoch": 2.43, + "grad_norm": 0.9703941941261292, + "learning_rate": 4.820431180726501e-05, + "loss": 1.2537, + "step": 8115 + }, + { + "epoch": 2.43, + "grad_norm": 1.5119740962982178, + "learning_rate": 4.820212470496841e-05, + "loss": 1.2918, + "step": 8120 + }, + { + "epoch": 2.43, + "grad_norm": 1.7154700756072998, + "learning_rate": 4.8199936321242576e-05, + "loss": 1.4548, + "step": 8125 + }, + { + "epoch": 2.43, + "grad_norm": 0.7919926643371582, + "learning_rate": 4.819774665620837e-05, + "loss": 1.2193, + "step": 8130 + }, + { + "epoch": 2.43, + "grad_norm": 1.4892973899841309, + "learning_rate": 4.819555570998673e-05, + "loss": 1.325, + "step": 8135 + }, + { + "epoch": 2.44, + "grad_norm": 1.446234107017517, + "learning_rate": 4.819336348269866e-05, + "loss": 1.371, + "step": 8140 + }, + { + "epoch": 2.44, + "grad_norm": 1.9358090162277222, + "learning_rate": 4.8191169974465235e-05, + "loss": 1.3127, + "step": 8145 + }, + { + "epoch": 2.44, + "grad_norm": 1.372162938117981, + "learning_rate": 4.818897518540759e-05, + "loss": 1.3957, + "step": 8150 + }, + { + "epoch": 2.44, + "grad_norm": 1.8464466333389282, + "learning_rate": 4.818677911564696e-05, + "loss": 1.3159, + "step": 8155 + }, + { + "epoch": 2.44, + "grad_norm": 0.7592960000038147, + "learning_rate": 4.8184581765304616e-05, + "loss": 1.3153, + "step": 8160 + }, + { + "epoch": 2.44, + "grad_norm": 0.8226920366287231, + "learning_rate": 4.8182383134501915e-05, + "loss": 1.4496, + "step": 8165 + }, + { + "epoch": 2.44, + "grad_norm": 1.1266101598739624, + "learning_rate": 4.81801832233603e-05, + "loss": 1.2566, + "step": 8170 + }, + { + "epoch": 2.45, + "grad_norm": 1.9116572141647339, + "learning_rate": 4.817798203200126e-05, + "loss": 1.2411, + "step": 8175 + }, + { + "epoch": 2.45, + "grad_norm": 1.1480777263641357, + "learning_rate": 4.8175779560546357e-05, + "loss": 1.4541, + "step": 8180 + }, + { + "epoch": 2.45, + "grad_norm": 1.545530080795288, + "learning_rate": 4.8173575809117246e-05, + "loss": 1.2647, + "step": 8185 + }, + { + "epoch": 2.45, + "grad_norm": 2.0556013584136963, + "learning_rate": 4.817137077783562e-05, + "loss": 1.4301, + "step": 8190 + }, + { + "epoch": 2.45, + "grad_norm": 1.490013599395752, + "learning_rate": 4.816916446682328e-05, + "loss": 1.5971, + "step": 8195 + }, + { + "epoch": 2.45, + "grad_norm": 1.620077133178711, + "learning_rate": 4.8166956876202066e-05, + "loss": 1.2097, + "step": 8200 + }, + { + "epoch": 2.45, + "grad_norm": 1.2473911046981812, + "learning_rate": 4.816474800609391e-05, + "loss": 1.4232, + "step": 8205 + }, + { + "epoch": 2.46, + "grad_norm": 1.5744521617889404, + "learning_rate": 4.816253785662079e-05, + "loss": 1.2997, + "step": 8210 + }, + { + "epoch": 2.46, + "grad_norm": 2.0682075023651123, + "learning_rate": 4.816032642790479e-05, + "loss": 1.2491, + "step": 8215 + }, + { + "epoch": 2.46, + "grad_norm": 0.989682674407959, + "learning_rate": 4.815811372006803e-05, + "loss": 1.2638, + "step": 8220 + }, + { + "epoch": 2.46, + "grad_norm": 0.9861974120140076, + "learning_rate": 4.8155899733232724e-05, + "loss": 1.3134, + "step": 8225 + }, + { + "epoch": 2.46, + "grad_norm": 1.6601946353912354, + "learning_rate": 4.8153684467521145e-05, + "loss": 1.2995, + "step": 8230 + }, + { + "epoch": 2.46, + "grad_norm": 1.8159310817718506, + "learning_rate": 4.8151467923055636e-05, + "loss": 1.4021, + "step": 8235 + }, + { + "epoch": 2.47, + "grad_norm": 1.435726523399353, + "learning_rate": 4.814925009995862e-05, + "loss": 1.2735, + "step": 8240 + }, + { + "epoch": 2.47, + "grad_norm": 1.530027985572815, + "learning_rate": 4.8147030998352585e-05, + "loss": 1.436, + "step": 8245 + }, + { + "epoch": 2.47, + "grad_norm": 1.6053141355514526, + "learning_rate": 4.814481061836008e-05, + "loss": 1.2369, + "step": 8250 + }, + { + "epoch": 2.47, + "grad_norm": 1.5459723472595215, + "learning_rate": 4.814258896010375e-05, + "loss": 1.2969, + "step": 8255 + }, + { + "epoch": 2.47, + "grad_norm": 1.4241658449172974, + "learning_rate": 4.814036602370628e-05, + "loss": 1.2725, + "step": 8260 + }, + { + "epoch": 2.47, + "grad_norm": 1.7884745597839355, + "learning_rate": 4.813814180929046e-05, + "loss": 1.3672, + "step": 8265 + }, + { + "epoch": 2.47, + "grad_norm": 1.1821926832199097, + "learning_rate": 4.8135916316979114e-05, + "loss": 1.2665, + "step": 8270 + }, + { + "epoch": 2.48, + "grad_norm": 1.176249623298645, + "learning_rate": 4.813368954689516e-05, + "loss": 1.2877, + "step": 8275 + }, + { + "epoch": 2.48, + "grad_norm": 2.3001742362976074, + "learning_rate": 4.813146149916157e-05, + "loss": 1.1879, + "step": 8280 + }, + { + "epoch": 2.48, + "grad_norm": 1.2863318920135498, + "learning_rate": 4.812923217390141e-05, + "loss": 1.2886, + "step": 8285 + }, + { + "epoch": 2.48, + "grad_norm": 1.8792412281036377, + "learning_rate": 4.81270015712378e-05, + "loss": 1.3733, + "step": 8290 + }, + { + "epoch": 2.48, + "grad_norm": 1.0877119302749634, + "learning_rate": 4.8124769691293925e-05, + "loss": 1.2741, + "step": 8295 + }, + { + "epoch": 2.48, + "grad_norm": 1.0546271800994873, + "learning_rate": 4.812253653419306e-05, + "loss": 1.2784, + "step": 8300 + }, + { + "epoch": 2.48, + "grad_norm": 0.8137794137001038, + "learning_rate": 4.8120302100058545e-05, + "loss": 1.3681, + "step": 8305 + }, + { + "epoch": 2.49, + "grad_norm": 2.380187749862671, + "learning_rate": 4.811806638901377e-05, + "loss": 1.2771, + "step": 8310 + }, + { + "epoch": 2.49, + "grad_norm": 1.1586872339248657, + "learning_rate": 4.8115829401182224e-05, + "loss": 1.3558, + "step": 8315 + }, + { + "epoch": 2.49, + "grad_norm": 0.9364005327224731, + "learning_rate": 4.811359113668744e-05, + "loss": 1.4381, + "step": 8320 + }, + { + "epoch": 2.49, + "grad_norm": 1.160302758216858, + "learning_rate": 4.8111351595653044e-05, + "loss": 1.4166, + "step": 8325 + }, + { + "epoch": 2.49, + "grad_norm": 1.486975908279419, + "learning_rate": 4.810911077820273e-05, + "loss": 1.3687, + "step": 8330 + }, + { + "epoch": 2.49, + "grad_norm": 1.307428002357483, + "learning_rate": 4.810686868446024e-05, + "loss": 1.2001, + "step": 8335 + }, + { + "epoch": 2.5, + "grad_norm": 1.7174419164657593, + "learning_rate": 4.8104625314549414e-05, + "loss": 1.3438, + "step": 8340 + }, + { + "epoch": 2.5, + "grad_norm": 2.201108455657959, + "learning_rate": 4.810238066859415e-05, + "loss": 1.3922, + "step": 8345 + }, + { + "epoch": 2.5, + "grad_norm": 1.2420109510421753, + "learning_rate": 4.8100134746718405e-05, + "loss": 1.352, + "step": 8350 + }, + { + "epoch": 2.5, + "grad_norm": 1.2646502256393433, + "learning_rate": 4.809788754904624e-05, + "loss": 1.291, + "step": 8355 + }, + { + "epoch": 2.5, + "grad_norm": 1.1232274770736694, + "learning_rate": 4.809563907570175e-05, + "loss": 1.1746, + "step": 8360 + }, + { + "epoch": 2.5, + "grad_norm": 1.6200834512710571, + "learning_rate": 4.809338932680912e-05, + "loss": 1.3263, + "step": 8365 + }, + { + "epoch": 2.5, + "grad_norm": 1.9596359729766846, + "learning_rate": 4.809113830249261e-05, + "loss": 1.4203, + "step": 8370 + }, + { + "epoch": 2.51, + "grad_norm": 1.5598084926605225, + "learning_rate": 4.808888600287652e-05, + "loss": 1.2957, + "step": 8375 + }, + { + "epoch": 2.51, + "grad_norm": 0.9596273899078369, + "learning_rate": 4.808663242808526e-05, + "loss": 1.3509, + "step": 8380 + }, + { + "epoch": 2.51, + "grad_norm": 1.8248343467712402, + "learning_rate": 4.80843775782433e-05, + "loss": 1.4082, + "step": 8385 + }, + { + "epoch": 2.51, + "grad_norm": 1.0814073085784912, + "learning_rate": 4.808212145347515e-05, + "loss": 1.382, + "step": 8390 + }, + { + "epoch": 2.51, + "grad_norm": 1.3564740419387817, + "learning_rate": 4.807986405390543e-05, + "loss": 1.3745, + "step": 8395 + }, + { + "epoch": 2.51, + "grad_norm": 1.0324515104293823, + "learning_rate": 4.8077605379658804e-05, + "loss": 1.2376, + "step": 8400 + }, + { + "epoch": 2.51, + "grad_norm": 1.1277930736541748, + "learning_rate": 4.807534543086002e-05, + "loss": 1.3123, + "step": 8405 + }, + { + "epoch": 2.52, + "grad_norm": 2.6605470180511475, + "learning_rate": 4.807308420763389e-05, + "loss": 1.3997, + "step": 8410 + }, + { + "epoch": 2.52, + "grad_norm": 0.9557091593742371, + "learning_rate": 4.807082171010531e-05, + "loss": 1.3075, + "step": 8415 + }, + { + "epoch": 2.52, + "grad_norm": 0.9519081115722656, + "learning_rate": 4.8068557938399225e-05, + "loss": 1.3437, + "step": 8420 + }, + { + "epoch": 2.52, + "grad_norm": 1.2350136041641235, + "learning_rate": 4.8066292892640666e-05, + "loss": 1.3418, + "step": 8425 + }, + { + "epoch": 2.52, + "grad_norm": 1.178934931755066, + "learning_rate": 4.8064026572954726e-05, + "loss": 1.2916, + "step": 8430 + }, + { + "epoch": 2.52, + "grad_norm": 0.9107993841171265, + "learning_rate": 4.806175897946657e-05, + "loss": 1.1943, + "step": 8435 + }, + { + "epoch": 2.53, + "grad_norm": 1.4889490604400635, + "learning_rate": 4.805949011230144e-05, + "loss": 1.3919, + "step": 8440 + }, + { + "epoch": 2.53, + "grad_norm": 0.8420235514640808, + "learning_rate": 4.805721997158463e-05, + "loss": 1.323, + "step": 8445 + }, + { + "epoch": 2.53, + "grad_norm": 1.4694463014602661, + "learning_rate": 4.8054948557441535e-05, + "loss": 1.2482, + "step": 8450 + }, + { + "epoch": 2.53, + "grad_norm": 1.4326045513153076, + "learning_rate": 4.8052675869997596e-05, + "loss": 1.2654, + "step": 8455 + }, + { + "epoch": 2.53, + "grad_norm": 1.6901538372039795, + "learning_rate": 4.805040190937833e-05, + "loss": 1.2435, + "step": 8460 + }, + { + "epoch": 2.53, + "grad_norm": 1.1050915718078613, + "learning_rate": 4.804812667570933e-05, + "loss": 1.2554, + "step": 8465 + }, + { + "epoch": 2.53, + "grad_norm": 1.468774676322937, + "learning_rate": 4.8045850169116244e-05, + "loss": 1.319, + "step": 8470 + }, + { + "epoch": 2.54, + "grad_norm": 1.1394233703613281, + "learning_rate": 4.804357238972482e-05, + "loss": 1.3279, + "step": 8475 + }, + { + "epoch": 2.54, + "grad_norm": 1.781924843788147, + "learning_rate": 4.804129333766083e-05, + "loss": 1.3171, + "step": 8480 + }, + { + "epoch": 2.54, + "grad_norm": 1.5067170858383179, + "learning_rate": 4.803901301305017e-05, + "loss": 1.3916, + "step": 8485 + }, + { + "epoch": 2.54, + "grad_norm": 1.2595572471618652, + "learning_rate": 4.803673141601877e-05, + "loss": 1.3188, + "step": 8490 + }, + { + "epoch": 2.54, + "grad_norm": 1.1094878911972046, + "learning_rate": 4.803444854669262e-05, + "loss": 1.346, + "step": 8495 + }, + { + "epoch": 2.54, + "grad_norm": 1.4350841045379639, + "learning_rate": 4.803216440519784e-05, + "loss": 1.3852, + "step": 8500 + }, + { + "epoch": 2.54, + "grad_norm": 1.89030122756958, + "learning_rate": 4.8029878991660556e-05, + "loss": 1.4023, + "step": 8505 + }, + { + "epoch": 2.55, + "grad_norm": 0.8968062996864319, + "learning_rate": 4.802759230620699e-05, + "loss": 1.329, + "step": 8510 + }, + { + "epoch": 2.55, + "grad_norm": 1.008400559425354, + "learning_rate": 4.802530434896344e-05, + "loss": 1.321, + "step": 8515 + }, + { + "epoch": 2.55, + "grad_norm": 1.945785641670227, + "learning_rate": 4.802301512005626e-05, + "loss": 1.4198, + "step": 8520 + }, + { + "epoch": 2.55, + "grad_norm": 1.0313044786453247, + "learning_rate": 4.802072461961189e-05, + "loss": 1.1841, + "step": 8525 + }, + { + "epoch": 2.55, + "grad_norm": 0.9936201572418213, + "learning_rate": 4.8018432847756823e-05, + "loss": 1.4208, + "step": 8530 + }, + { + "epoch": 2.55, + "grad_norm": 1.7192137241363525, + "learning_rate": 4.8016139804617646e-05, + "loss": 1.4593, + "step": 8535 + }, + { + "epoch": 2.56, + "grad_norm": 1.4999003410339355, + "learning_rate": 4.801384549032099e-05, + "loss": 1.388, + "step": 8540 + }, + { + "epoch": 2.56, + "grad_norm": 1.0595996379852295, + "learning_rate": 4.8011549904993555e-05, + "loss": 1.3635, + "step": 8545 + }, + { + "epoch": 2.56, + "grad_norm": 2.0259523391723633, + "learning_rate": 4.800925304876215e-05, + "loss": 1.4075, + "step": 8550 + }, + { + "epoch": 2.56, + "grad_norm": 1.3080471754074097, + "learning_rate": 4.800695492175361e-05, + "loss": 1.4913, + "step": 8555 + }, + { + "epoch": 2.56, + "grad_norm": 1.0340911149978638, + "learning_rate": 4.800465552409487e-05, + "loss": 1.3514, + "step": 8560 + }, + { + "epoch": 2.56, + "grad_norm": 0.8910084366798401, + "learning_rate": 4.800235485591291e-05, + "loss": 1.3424, + "step": 8565 + }, + { + "epoch": 2.56, + "grad_norm": 1.5599783658981323, + "learning_rate": 4.8000052917334815e-05, + "loss": 1.4032, + "step": 8570 + }, + { + "epoch": 2.57, + "grad_norm": 1.667799949645996, + "learning_rate": 4.7997749708487695e-05, + "loss": 1.4332, + "step": 8575 + }, + { + "epoch": 2.57, + "grad_norm": 2.000908374786377, + "learning_rate": 4.799544522949876e-05, + "loss": 1.4121, + "step": 8580 + }, + { + "epoch": 2.57, + "grad_norm": 0.7706380486488342, + "learning_rate": 4.799313948049529e-05, + "loss": 1.1469, + "step": 8585 + }, + { + "epoch": 2.57, + "grad_norm": 1.065397024154663, + "learning_rate": 4.799083246160463e-05, + "loss": 1.4589, + "step": 8590 + }, + { + "epoch": 2.57, + "grad_norm": 1.4384722709655762, + "learning_rate": 4.798852417295418e-05, + "loss": 1.3352, + "step": 8595 + }, + { + "epoch": 2.57, + "grad_norm": 1.4065508842468262, + "learning_rate": 4.798621461467146e-05, + "loss": 1.4188, + "step": 8600 + }, + { + "epoch": 2.57, + "grad_norm": 1.0686558485031128, + "learning_rate": 4.798390378688398e-05, + "loss": 1.5157, + "step": 8605 + }, + { + "epoch": 2.58, + "grad_norm": 1.01014244556427, + "learning_rate": 4.798159168971938e-05, + "loss": 1.305, + "step": 8610 + }, + { + "epoch": 2.58, + "grad_norm": 1.5456489324569702, + "learning_rate": 4.7979278323305364e-05, + "loss": 1.4341, + "step": 8615 + }, + { + "epoch": 2.58, + "grad_norm": 1.8445463180541992, + "learning_rate": 4.7976963687769696e-05, + "loss": 1.3751, + "step": 8620 + }, + { + "epoch": 2.58, + "grad_norm": 1.879667043685913, + "learning_rate": 4.797464778324021e-05, + "loss": 1.3944, + "step": 8625 + }, + { + "epoch": 2.58, + "grad_norm": 0.7700638175010681, + "learning_rate": 4.79723306098448e-05, + "loss": 1.2801, + "step": 8630 + }, + { + "epoch": 2.58, + "grad_norm": 1.3582429885864258, + "learning_rate": 4.7970012167711456e-05, + "loss": 1.4009, + "step": 8635 + }, + { + "epoch": 2.58, + "grad_norm": 1.5631026029586792, + "learning_rate": 4.7967692456968207e-05, + "loss": 1.3642, + "step": 8640 + }, + { + "epoch": 2.59, + "grad_norm": 1.840862512588501, + "learning_rate": 4.7965371477743185e-05, + "loss": 1.1852, + "step": 8645 + }, + { + "epoch": 2.59, + "grad_norm": 1.0810298919677734, + "learning_rate": 4.7963049230164556e-05, + "loss": 1.3303, + "step": 8650 + }, + { + "epoch": 2.59, + "grad_norm": 1.0809688568115234, + "learning_rate": 4.7960725714360596e-05, + "loss": 1.4119, + "step": 8655 + }, + { + "epoch": 2.59, + "grad_norm": 1.8407636880874634, + "learning_rate": 4.7958400930459626e-05, + "loss": 1.2693, + "step": 8660 + }, + { + "epoch": 2.59, + "grad_norm": 2.3845536708831787, + "learning_rate": 4.795607487859003e-05, + "loss": 1.3055, + "step": 8665 + }, + { + "epoch": 2.59, + "grad_norm": 1.7638427019119263, + "learning_rate": 4.795374755888028e-05, + "loss": 1.2628, + "step": 8670 + }, + { + "epoch": 2.6, + "grad_norm": 1.1101248264312744, + "learning_rate": 4.7951418971458915e-05, + "loss": 1.3121, + "step": 8675 + }, + { + "epoch": 2.6, + "grad_norm": 1.5876396894454956, + "learning_rate": 4.794908911645453e-05, + "loss": 1.3242, + "step": 8680 + }, + { + "epoch": 2.6, + "grad_norm": 4.759340763092041, + "learning_rate": 4.7946757993995815e-05, + "loss": 1.2821, + "step": 8685 + }, + { + "epoch": 2.6, + "grad_norm": 1.5042836666107178, + "learning_rate": 4.794442560421151e-05, + "loss": 1.423, + "step": 8690 + }, + { + "epoch": 2.6, + "grad_norm": 1.0872600078582764, + "learning_rate": 4.794209194723042e-05, + "loss": 1.2986, + "step": 8695 + }, + { + "epoch": 2.6, + "grad_norm": 0.748469352722168, + "learning_rate": 4.7939757023181435e-05, + "loss": 1.364, + "step": 8700 + }, + { + "epoch": 2.6, + "grad_norm": 0.6730393171310425, + "learning_rate": 4.793742083219353e-05, + "loss": 1.3123, + "step": 8705 + }, + { + "epoch": 2.61, + "grad_norm": 1.2897133827209473, + "learning_rate": 4.793508337439569e-05, + "loss": 1.3758, + "step": 8710 + }, + { + "epoch": 2.61, + "grad_norm": 1.8004686832427979, + "learning_rate": 4.793274464991706e-05, + "loss": 1.3204, + "step": 8715 + }, + { + "epoch": 2.61, + "grad_norm": 3.192774772644043, + "learning_rate": 4.7930404658886766e-05, + "loss": 1.25, + "step": 8720 + }, + { + "epoch": 2.61, + "grad_norm": 1.638178825378418, + "learning_rate": 4.7928063401434065e-05, + "loss": 1.4043, + "step": 8725 + }, + { + "epoch": 2.61, + "grad_norm": 1.2886130809783936, + "learning_rate": 4.792572087768825e-05, + "loss": 1.4547, + "step": 8730 + }, + { + "epoch": 2.61, + "grad_norm": 1.1815823316574097, + "learning_rate": 4.7923377087778695e-05, + "loss": 1.4566, + "step": 8735 + }, + { + "epoch": 2.61, + "grad_norm": 1.7591919898986816, + "learning_rate": 4.7921032031834864e-05, + "loss": 1.3245, + "step": 8740 + }, + { + "epoch": 2.62, + "grad_norm": 1.386547565460205, + "learning_rate": 4.7918685709986254e-05, + "loss": 1.3942, + "step": 8745 + }, + { + "epoch": 2.62, + "grad_norm": 1.2307301759719849, + "learning_rate": 4.791633812236245e-05, + "loss": 1.2166, + "step": 8750 + }, + { + "epoch": 2.62, + "grad_norm": 1.1974438428878784, + "learning_rate": 4.791398926909312e-05, + "loss": 1.361, + "step": 8755 + }, + { + "epoch": 2.62, + "grad_norm": 1.449912190437317, + "learning_rate": 4.791163915030797e-05, + "loss": 1.4183, + "step": 8760 + }, + { + "epoch": 2.62, + "grad_norm": 1.0868198871612549, + "learning_rate": 4.790928776613682e-05, + "loss": 1.4563, + "step": 8765 + }, + { + "epoch": 2.62, + "grad_norm": 1.4005632400512695, + "learning_rate": 4.790693511670951e-05, + "loss": 1.189, + "step": 8770 + }, + { + "epoch": 2.63, + "grad_norm": 1.2266290187835693, + "learning_rate": 4.7904581202155983e-05, + "loss": 1.435, + "step": 8775 + }, + { + "epoch": 2.63, + "grad_norm": 0.5857153534889221, + "learning_rate": 4.790222602260625e-05, + "loss": 1.3531, + "step": 8780 + }, + { + "epoch": 2.63, + "grad_norm": 3.5321526527404785, + "learning_rate": 4.789986957819037e-05, + "loss": 1.361, + "step": 8785 + }, + { + "epoch": 2.63, + "grad_norm": 1.288081407546997, + "learning_rate": 4.78975118690385e-05, + "loss": 1.3323, + "step": 8790 + }, + { + "epoch": 2.63, + "grad_norm": 1.5145295858383179, + "learning_rate": 4.7895152895280856e-05, + "loss": 1.3989, + "step": 8795 + }, + { + "epoch": 2.63, + "grad_norm": 1.247381329536438, + "learning_rate": 4.7892792657047714e-05, + "loss": 1.2564, + "step": 8800 + }, + { + "epoch": 2.63, + "grad_norm": 0.8097814917564392, + "learning_rate": 4.7890903556126435e-05, + "loss": 1.44, + "step": 8805 + }, + { + "epoch": 2.64, + "grad_norm": 2.263486385345459, + "learning_rate": 4.7888541042165937e-05, + "loss": 1.4437, + "step": 8810 + }, + { + "epoch": 2.64, + "grad_norm": 1.0362577438354492, + "learning_rate": 4.78861772640951e-05, + "loss": 1.3168, + "step": 8815 + }, + { + "epoch": 2.64, + "grad_norm": 1.632360816001892, + "learning_rate": 4.7883812222044486e-05, + "loss": 1.287, + "step": 8820 + }, + { + "epoch": 2.64, + "grad_norm": 2.122438669204712, + "learning_rate": 4.788144591614472e-05, + "loss": 1.3109, + "step": 8825 + }, + { + "epoch": 2.64, + "grad_norm": 1.8862602710723877, + "learning_rate": 4.7879078346526464e-05, + "loss": 1.3944, + "step": 8830 + }, + { + "epoch": 2.64, + "grad_norm": 1.2232091426849365, + "learning_rate": 4.7876709513320506e-05, + "loss": 1.4264, + "step": 8835 + }, + { + "epoch": 2.64, + "grad_norm": 1.3420339822769165, + "learning_rate": 4.787433941665765e-05, + "loss": 1.2224, + "step": 8840 + }, + { + "epoch": 2.65, + "grad_norm": 1.9875779151916504, + "learning_rate": 4.787196805666881e-05, + "loss": 1.1951, + "step": 8845 + }, + { + "epoch": 2.65, + "grad_norm": 1.106386661529541, + "learning_rate": 4.7869595433484946e-05, + "loss": 1.4364, + "step": 8850 + }, + { + "epoch": 2.65, + "grad_norm": 1.7247858047485352, + "learning_rate": 4.78672215472371e-05, + "loss": 1.2084, + "step": 8855 + }, + { + "epoch": 2.65, + "grad_norm": 1.5715019702911377, + "learning_rate": 4.786484639805637e-05, + "loss": 1.4345, + "step": 8860 + }, + { + "epoch": 2.65, + "grad_norm": 1.0408717393875122, + "learning_rate": 4.7862469986073954e-05, + "loss": 1.4077, + "step": 8865 + }, + { + "epoch": 2.65, + "grad_norm": 1.5695548057556152, + "learning_rate": 4.786009231142108e-05, + "loss": 1.2914, + "step": 8870 + }, + { + "epoch": 2.66, + "grad_norm": 1.7669672966003418, + "learning_rate": 4.7857713374229066e-05, + "loss": 1.2795, + "step": 8875 + }, + { + "epoch": 2.66, + "grad_norm": 1.5824629068374634, + "learning_rate": 4.78553331746293e-05, + "loss": 1.2562, + "step": 8880 + }, + { + "epoch": 2.66, + "grad_norm": 0.8411540389060974, + "learning_rate": 4.7852951712753244e-05, + "loss": 1.4505, + "step": 8885 + }, + { + "epoch": 2.66, + "grad_norm": 1.6968480348587036, + "learning_rate": 4.7850568988732416e-05, + "loss": 1.3871, + "step": 8890 + }, + { + "epoch": 2.66, + "grad_norm": 1.4560775756835938, + "learning_rate": 4.7848185002698416e-05, + "loss": 1.4416, + "step": 8895 + }, + { + "epoch": 2.66, + "grad_norm": 1.1410988569259644, + "learning_rate": 4.7845799754782907e-05, + "loss": 1.3336, + "step": 8900 + }, + { + "epoch": 2.66, + "grad_norm": 1.136461615562439, + "learning_rate": 4.784341324511762e-05, + "loss": 1.3175, + "step": 8905 + }, + { + "epoch": 2.67, + "grad_norm": 1.0604544878005981, + "learning_rate": 4.784102547383437e-05, + "loss": 1.3411, + "step": 8910 + }, + { + "epoch": 2.67, + "grad_norm": 0.7411584258079529, + "learning_rate": 4.783863644106502e-05, + "loss": 1.1918, + "step": 8915 + }, + { + "epoch": 2.67, + "grad_norm": 1.8481019735336304, + "learning_rate": 4.783624614694153e-05, + "loss": 1.4247, + "step": 8920 + }, + { + "epoch": 2.67, + "grad_norm": 1.549062967300415, + "learning_rate": 4.7833854591595895e-05, + "loss": 1.2813, + "step": 8925 + }, + { + "epoch": 2.67, + "grad_norm": 1.4914495944976807, + "learning_rate": 4.78314617751602e-05, + "loss": 1.3358, + "step": 8930 + }, + { + "epoch": 2.67, + "grad_norm": 2.6367831230163574, + "learning_rate": 4.782906769776661e-05, + "loss": 1.2798, + "step": 8935 + }, + { + "epoch": 2.67, + "grad_norm": 1.661944031715393, + "learning_rate": 4.7826672359547343e-05, + "loss": 1.3176, + "step": 8940 + }, + { + "epoch": 2.68, + "grad_norm": 1.858271598815918, + "learning_rate": 4.782427576063468e-05, + "loss": 1.1787, + "step": 8945 + }, + { + "epoch": 2.68, + "grad_norm": 1.1787148714065552, + "learning_rate": 4.7821877901160996e-05, + "loss": 1.3318, + "step": 8950 + }, + { + "epoch": 2.68, + "grad_norm": 0.9741628170013428, + "learning_rate": 4.781947878125872e-05, + "loss": 1.3084, + "step": 8955 + }, + { + "epoch": 2.68, + "grad_norm": 1.8977104425430298, + "learning_rate": 4.781707840106034e-05, + "loss": 1.3595, + "step": 8960 + }, + { + "epoch": 2.68, + "grad_norm": 1.1290730237960815, + "learning_rate": 4.781467676069845e-05, + "loss": 1.3126, + "step": 8965 + }, + { + "epoch": 2.68, + "grad_norm": 1.0687497854232788, + "learning_rate": 4.7812273860305665e-05, + "loss": 1.2979, + "step": 8970 + }, + { + "epoch": 2.69, + "grad_norm": 1.463879108428955, + "learning_rate": 4.780986970001472e-05, + "loss": 1.3988, + "step": 8975 + }, + { + "epoch": 2.69, + "grad_norm": 0.7218008637428284, + "learning_rate": 4.780746427995837e-05, + "loss": 1.3782, + "step": 8980 + }, + { + "epoch": 2.69, + "grad_norm": 0.7819427847862244, + "learning_rate": 4.7805057600269485e-05, + "loss": 1.2902, + "step": 8985 + }, + { + "epoch": 2.69, + "grad_norm": 0.7912634015083313, + "learning_rate": 4.780264966108097e-05, + "loss": 1.2542, + "step": 8990 + }, + { + "epoch": 2.69, + "grad_norm": 1.160401701927185, + "learning_rate": 4.780024046252581e-05, + "loss": 1.2758, + "step": 8995 + }, + { + "epoch": 2.69, + "grad_norm": 0.7142820954322815, + "learning_rate": 4.779783000473707e-05, + "loss": 1.278, + "step": 9000 + }, + { + "epoch": 2.69, + "grad_norm": 1.7503546476364136, + "learning_rate": 4.779541828784788e-05, + "loss": 1.3523, + "step": 9005 + }, + { + "epoch": 2.7, + "grad_norm": 1.522162914276123, + "learning_rate": 4.779300531199143e-05, + "loss": 1.3737, + "step": 9010 + }, + { + "epoch": 2.7, + "grad_norm": 1.4219088554382324, + "learning_rate": 4.779059107730099e-05, + "loss": 1.4303, + "step": 9015 + }, + { + "epoch": 2.7, + "grad_norm": 1.6250290870666504, + "learning_rate": 4.778817558390989e-05, + "loss": 1.366, + "step": 9020 + }, + { + "epoch": 2.7, + "grad_norm": 1.2301737070083618, + "learning_rate": 4.7785758831951543e-05, + "loss": 1.3006, + "step": 9025 + }, + { + "epoch": 2.7, + "grad_norm": 1.753127932548523, + "learning_rate": 4.778334082155942e-05, + "loss": 1.4303, + "step": 9030 + }, + { + "epoch": 2.7, + "grad_norm": 1.423354148864746, + "learning_rate": 4.778092155286707e-05, + "loss": 1.2672, + "step": 9035 + }, + { + "epoch": 2.7, + "grad_norm": 2.4239237308502197, + "learning_rate": 4.777850102600809e-05, + "loss": 1.2822, + "step": 9040 + }, + { + "epoch": 2.71, + "grad_norm": 1.1485992670059204, + "learning_rate": 4.777607924111619e-05, + "loss": 1.3411, + "step": 9045 + }, + { + "epoch": 2.71, + "grad_norm": 0.7235462665557861, + "learning_rate": 4.77736561983251e-05, + "loss": 1.3456, + "step": 9050 + }, + { + "epoch": 2.71, + "grad_norm": 1.1545169353485107, + "learning_rate": 4.7771231897768655e-05, + "loss": 1.4223, + "step": 9055 + }, + { + "epoch": 2.71, + "grad_norm": 2.123680353164673, + "learning_rate": 4.776880633958073e-05, + "loss": 1.4271, + "step": 9060 + }, + { + "epoch": 2.71, + "grad_norm": 1.6497604846954346, + "learning_rate": 4.776637952389531e-05, + "loss": 1.4639, + "step": 9065 + }, + { + "epoch": 2.71, + "grad_norm": 1.2287499904632568, + "learning_rate": 4.776395145084641e-05, + "loss": 1.2522, + "step": 9070 + }, + { + "epoch": 2.72, + "grad_norm": 1.5195343494415283, + "learning_rate": 4.7761522120568134e-05, + "loss": 1.4156, + "step": 9075 + }, + { + "epoch": 2.72, + "grad_norm": 1.1889725923538208, + "learning_rate": 4.775909153319465e-05, + "loss": 1.2898, + "step": 9080 + }, + { + "epoch": 2.72, + "grad_norm": 1.7549879550933838, + "learning_rate": 4.775665968886019e-05, + "loss": 1.2376, + "step": 9085 + }, + { + "epoch": 2.72, + "grad_norm": 1.9946390390396118, + "learning_rate": 4.775422658769908e-05, + "loss": 1.4544, + "step": 9090 + }, + { + "epoch": 2.72, + "grad_norm": 1.594741940498352, + "learning_rate": 4.775179222984568e-05, + "loss": 1.2683, + "step": 9095 + }, + { + "epoch": 2.72, + "grad_norm": 1.0693196058273315, + "learning_rate": 4.774935661543445e-05, + "loss": 1.3477, + "step": 9100 + }, + { + "epoch": 2.72, + "grad_norm": 2.610431671142578, + "learning_rate": 4.774691974459989e-05, + "loss": 1.321, + "step": 9105 + }, + { + "epoch": 2.73, + "grad_norm": 1.1639457941055298, + "learning_rate": 4.774448161747661e-05, + "loss": 1.3614, + "step": 9110 + }, + { + "epoch": 2.73, + "grad_norm": 2.8405585289001465, + "learning_rate": 4.774204223419925e-05, + "loss": 1.2398, + "step": 9115 + }, + { + "epoch": 2.73, + "grad_norm": 2.058262586593628, + "learning_rate": 4.773960159490253e-05, + "loss": 1.2883, + "step": 9120 + }, + { + "epoch": 2.73, + "grad_norm": 6.126988410949707, + "learning_rate": 4.773715969972125e-05, + "loss": 1.4032, + "step": 9125 + }, + { + "epoch": 2.73, + "grad_norm": 1.726212739944458, + "learning_rate": 4.7734716548790274e-05, + "loss": 1.362, + "step": 9130 + }, + { + "epoch": 2.73, + "grad_norm": 2.678607940673828, + "learning_rate": 4.773227214224454e-05, + "loss": 1.3634, + "step": 9135 + }, + { + "epoch": 2.73, + "grad_norm": 0.9776396155357361, + "learning_rate": 4.7729826480219044e-05, + "loss": 1.274, + "step": 9140 + }, + { + "epoch": 2.74, + "grad_norm": 1.055379033088684, + "learning_rate": 4.772737956284885e-05, + "loss": 1.219, + "step": 9145 + }, + { + "epoch": 2.74, + "grad_norm": 1.7453423738479614, + "learning_rate": 4.7724931390269115e-05, + "loss": 1.1836, + "step": 9150 + }, + { + "epoch": 2.74, + "grad_norm": 1.390795350074768, + "learning_rate": 4.772248196261504e-05, + "loss": 1.4451, + "step": 9155 + }, + { + "epoch": 2.74, + "grad_norm": 1.7880897521972656, + "learning_rate": 4.7720031280021905e-05, + "loss": 1.3696, + "step": 9160 + }, + { + "epoch": 2.74, + "grad_norm": 1.2243832349777222, + "learning_rate": 4.771757934262505e-05, + "loss": 1.2609, + "step": 9165 + }, + { + "epoch": 2.74, + "grad_norm": 2.5606729984283447, + "learning_rate": 4.771512615055991e-05, + "loss": 1.4425, + "step": 9170 + }, + { + "epoch": 2.75, + "grad_norm": 1.7605032920837402, + "learning_rate": 4.771267170396196e-05, + "loss": 1.4839, + "step": 9175 + }, + { + "epoch": 2.75, + "grad_norm": 2.1422829627990723, + "learning_rate": 4.771021600296676e-05, + "loss": 1.4384, + "step": 9180 + }, + { + "epoch": 2.75, + "grad_norm": 1.3212814331054688, + "learning_rate": 4.770775904770994e-05, + "loss": 1.4812, + "step": 9185 + }, + { + "epoch": 2.75, + "grad_norm": 0.6888114213943481, + "learning_rate": 4.770530083832719e-05, + "loss": 1.3274, + "step": 9190 + }, + { + "epoch": 2.75, + "grad_norm": 1.3165194988250732, + "learning_rate": 4.770284137495428e-05, + "loss": 1.3288, + "step": 9195 + }, + { + "epoch": 2.75, + "grad_norm": 0.9350600242614746, + "learning_rate": 4.7700380657727027e-05, + "loss": 1.459, + "step": 9200 + }, + { + "epoch": 2.75, + "grad_norm": 1.657118797302246, + "learning_rate": 4.769791868678135e-05, + "loss": 1.277, + "step": 9205 + }, + { + "epoch": 2.76, + "grad_norm": 0.7231934666633606, + "learning_rate": 4.769545546225322e-05, + "loss": 1.3761, + "step": 9210 + }, + { + "epoch": 2.76, + "grad_norm": 1.0625771284103394, + "learning_rate": 4.7692990984278676e-05, + "loss": 1.294, + "step": 9215 + }, + { + "epoch": 2.76, + "grad_norm": 1.9042518138885498, + "learning_rate": 4.769052525299383e-05, + "loss": 1.4183, + "step": 9220 + }, + { + "epoch": 2.76, + "grad_norm": 2.654555082321167, + "learning_rate": 4.7688058268534855e-05, + "loss": 1.365, + "step": 9225 + }, + { + "epoch": 2.76, + "grad_norm": 0.9852314591407776, + "learning_rate": 4.768559003103801e-05, + "loss": 1.4423, + "step": 9230 + }, + { + "epoch": 2.76, + "grad_norm": 1.300337791442871, + "learning_rate": 4.76831205406396e-05, + "loss": 1.2752, + "step": 9235 + }, + { + "epoch": 2.76, + "grad_norm": 1.255034327507019, + "learning_rate": 4.768064979747603e-05, + "loss": 1.3097, + "step": 9240 + }, + { + "epoch": 2.77, + "grad_norm": 1.3751298189163208, + "learning_rate": 4.767817780168374e-05, + "loss": 1.2666, + "step": 9245 + }, + { + "epoch": 2.77, + "grad_norm": 0.7448214888572693, + "learning_rate": 4.7675704553399265e-05, + "loss": 1.4518, + "step": 9250 + }, + { + "epoch": 2.77, + "grad_norm": 2.0621886253356934, + "learning_rate": 4.76732300527592e-05, + "loss": 1.4063, + "step": 9255 + }, + { + "epoch": 2.77, + "grad_norm": 1.179983139038086, + "learning_rate": 4.76707542999002e-05, + "loss": 1.425, + "step": 9260 + }, + { + "epoch": 2.77, + "grad_norm": 1.047968864440918, + "learning_rate": 4.7668277294959006e-05, + "loss": 1.3343, + "step": 9265 + }, + { + "epoch": 2.77, + "grad_norm": 1.8152706623077393, + "learning_rate": 4.766579903807242e-05, + "loss": 1.3582, + "step": 9270 + }, + { + "epoch": 2.77, + "grad_norm": 1.3376197814941406, + "learning_rate": 4.7663319529377323e-05, + "loss": 1.4053, + "step": 9275 + }, + { + "epoch": 2.78, + "grad_norm": 1.8099796772003174, + "learning_rate": 4.7660838769010635e-05, + "loss": 1.5009, + "step": 9280 + }, + { + "epoch": 2.78, + "grad_norm": 1.4460123777389526, + "learning_rate": 4.765835675710938e-05, + "loss": 1.5084, + "step": 9285 + }, + { + "epoch": 2.78, + "grad_norm": 2.1671342849731445, + "learning_rate": 4.765587349381063e-05, + "loss": 1.3576, + "step": 9290 + }, + { + "epoch": 2.78, + "grad_norm": 0.9347168803215027, + "learning_rate": 4.765338897925154e-05, + "loss": 1.4602, + "step": 9295 + }, + { + "epoch": 2.78, + "grad_norm": 1.7557331323623657, + "learning_rate": 4.765090321356932e-05, + "loss": 1.4104, + "step": 9300 + }, + { + "epoch": 2.78, + "grad_norm": 0.8540169596672058, + "learning_rate": 4.764841619690127e-05, + "loss": 1.3433, + "step": 9305 + }, + { + "epoch": 2.79, + "grad_norm": 2.3423655033111572, + "learning_rate": 4.764592792938473e-05, + "loss": 1.3206, + "step": 9310 + }, + { + "epoch": 2.79, + "grad_norm": 1.7521955966949463, + "learning_rate": 4.764343841115712e-05, + "loss": 1.2059, + "step": 9315 + }, + { + "epoch": 2.79, + "grad_norm": 2.0334832668304443, + "learning_rate": 4.764094764235595e-05, + "loss": 1.3869, + "step": 9320 + }, + { + "epoch": 2.79, + "grad_norm": 6.126003265380859, + "learning_rate": 4.763845562311877e-05, + "loss": 1.4294, + "step": 9325 + }, + { + "epoch": 2.79, + "grad_norm": 0.9254896640777588, + "learning_rate": 4.763596235358323e-05, + "loss": 1.3488, + "step": 9330 + }, + { + "epoch": 2.79, + "grad_norm": 1.4646904468536377, + "learning_rate": 4.7633467833887017e-05, + "loss": 1.397, + "step": 9335 + }, + { + "epoch": 2.79, + "grad_norm": 2.3618175983428955, + "learning_rate": 4.763097206416789e-05, + "loss": 1.4709, + "step": 9340 + }, + { + "epoch": 2.8, + "grad_norm": 0.8631429672241211, + "learning_rate": 4.76284750445637e-05, + "loss": 1.4577, + "step": 9345 + }, + { + "epoch": 2.8, + "grad_norm": 1.02370285987854, + "learning_rate": 4.762597677521237e-05, + "loss": 1.4129, + "step": 9350 + }, + { + "epoch": 2.8, + "grad_norm": 0.7775421142578125, + "learning_rate": 4.762347725625184e-05, + "loss": 1.3969, + "step": 9355 + }, + { + "epoch": 2.8, + "grad_norm": 1.6427661180496216, + "learning_rate": 4.762097648782019e-05, + "loss": 1.4851, + "step": 9360 + }, + { + "epoch": 2.8, + "grad_norm": 1.482051968574524, + "learning_rate": 4.761847447005552e-05, + "loss": 1.4408, + "step": 9365 + }, + { + "epoch": 2.8, + "grad_norm": 2.8775594234466553, + "learning_rate": 4.761597120309602e-05, + "loss": 1.338, + "step": 9370 + }, + { + "epoch": 2.8, + "grad_norm": 2.230642795562744, + "learning_rate": 4.7613466687079924e-05, + "loss": 1.3657, + "step": 9375 + }, + { + "epoch": 2.81, + "grad_norm": 1.0118967294692993, + "learning_rate": 4.7610960922145585e-05, + "loss": 1.3863, + "step": 9380 + }, + { + "epoch": 2.81, + "grad_norm": 1.517298936843872, + "learning_rate": 4.7608453908431365e-05, + "loss": 1.337, + "step": 9385 + }, + { + "epoch": 2.81, + "grad_norm": 1.0825563669204712, + "learning_rate": 4.760594564607574e-05, + "loss": 1.1625, + "step": 9390 + }, + { + "epoch": 2.81, + "grad_norm": 1.539588212966919, + "learning_rate": 4.760343613521724e-05, + "loss": 1.3975, + "step": 9395 + }, + { + "epoch": 2.81, + "grad_norm": 0.7479655742645264, + "learning_rate": 4.760092537599445e-05, + "loss": 1.3449, + "step": 9400 + }, + { + "epoch": 2.81, + "grad_norm": 1.136042594909668, + "learning_rate": 4.7598413368546045e-05, + "loss": 1.3229, + "step": 9405 + }, + { + "epoch": 2.82, + "grad_norm": 1.4666575193405151, + "learning_rate": 4.759590011301076e-05, + "loss": 1.382, + "step": 9410 + }, + { + "epoch": 2.82, + "grad_norm": 1.5006098747253418, + "learning_rate": 4.7593385609527406e-05, + "loss": 1.1975, + "step": 9415 + }, + { + "epoch": 2.82, + "grad_norm": 1.135817527770996, + "learning_rate": 4.7590869858234837e-05, + "loss": 1.3296, + "step": 9420 + }, + { + "epoch": 2.82, + "grad_norm": 1.2089482545852661, + "learning_rate": 4.758835285927201e-05, + "loss": 1.195, + "step": 9425 + }, + { + "epoch": 2.82, + "grad_norm": 1.0782078504562378, + "learning_rate": 4.758583461277794e-05, + "loss": 1.295, + "step": 9430 + }, + { + "epoch": 2.82, + "grad_norm": 0.9145467877388, + "learning_rate": 4.75833151188917e-05, + "loss": 1.2709, + "step": 9435 + }, + { + "epoch": 2.82, + "grad_norm": 1.9970406293869019, + "learning_rate": 4.7580794377752436e-05, + "loss": 1.1399, + "step": 9440 + }, + { + "epoch": 2.83, + "grad_norm": 0.8484485149383545, + "learning_rate": 4.7578272389499375e-05, + "loss": 1.3806, + "step": 9445 + }, + { + "epoch": 2.83, + "grad_norm": 1.205751895904541, + "learning_rate": 4.75757491542718e-05, + "loss": 1.4711, + "step": 9450 + }, + { + "epoch": 2.83, + "grad_norm": 2.451490640640259, + "learning_rate": 4.757322467220905e-05, + "loss": 1.5658, + "step": 9455 + }, + { + "epoch": 2.83, + "grad_norm": 1.7217803001403809, + "learning_rate": 4.757069894345058e-05, + "loss": 1.5043, + "step": 9460 + }, + { + "epoch": 2.83, + "grad_norm": 1.288859248161316, + "learning_rate": 4.756817196813587e-05, + "loss": 1.4451, + "step": 9465 + }, + { + "epoch": 2.83, + "grad_norm": 2.3524889945983887, + "learning_rate": 4.756564374640447e-05, + "loss": 1.1978, + "step": 9470 + }, + { + "epoch": 2.83, + "grad_norm": 2.2573511600494385, + "learning_rate": 4.756311427839602e-05, + "loss": 1.4399, + "step": 9475 + }, + { + "epoch": 2.84, + "grad_norm": 0.9026219844818115, + "learning_rate": 4.756058356425024e-05, + "loss": 1.262, + "step": 9480 + }, + { + "epoch": 2.84, + "grad_norm": 1.9231290817260742, + "learning_rate": 4.755805160410686e-05, + "loss": 1.2867, + "step": 9485 + }, + { + "epoch": 2.84, + "grad_norm": 2.3469276428222656, + "learning_rate": 4.755551839810575e-05, + "loss": 1.425, + "step": 9490 + }, + { + "epoch": 2.84, + "grad_norm": 1.1398067474365234, + "learning_rate": 4.75529839463868e-05, + "loss": 1.3015, + "step": 9495 + }, + { + "epoch": 2.84, + "grad_norm": 1.265406847000122, + "learning_rate": 4.755044824908998e-05, + "loss": 1.4007, + "step": 9500 + }, + { + "epoch": 2.84, + "grad_norm": 1.610261082649231, + "learning_rate": 4.754791130635537e-05, + "loss": 1.4184, + "step": 9505 + }, + { + "epoch": 2.85, + "grad_norm": 1.8379100561141968, + "learning_rate": 4.754537311832303e-05, + "loss": 1.3009, + "step": 9510 + }, + { + "epoch": 2.85, + "grad_norm": 2.690635919570923, + "learning_rate": 4.754283368513317e-05, + "loss": 1.3863, + "step": 9515 + }, + { + "epoch": 2.85, + "grad_norm": 1.1724915504455566, + "learning_rate": 4.754029300692604e-05, + "loss": 1.236, + "step": 9520 + }, + { + "epoch": 2.85, + "grad_norm": 1.0479933023452759, + "learning_rate": 4.753775108384196e-05, + "loss": 1.2245, + "step": 9525 + }, + { + "epoch": 2.85, + "grad_norm": 2.2428202629089355, + "learning_rate": 4.753520791602132e-05, + "loss": 1.3393, + "step": 9530 + }, + { + "epoch": 2.85, + "grad_norm": 1.5128904581069946, + "learning_rate": 4.753266350360456e-05, + "loss": 1.3574, + "step": 9535 + }, + { + "epoch": 2.85, + "grad_norm": 0.9393275380134583, + "learning_rate": 4.7530117846732224e-05, + "loss": 1.4423, + "step": 9540 + }, + { + "epoch": 2.86, + "grad_norm": 1.992976427078247, + "learning_rate": 4.75275709455449e-05, + "loss": 1.4572, + "step": 9545 + }, + { + "epoch": 2.86, + "grad_norm": 1.182762622833252, + "learning_rate": 4.752502280018324e-05, + "loss": 1.3492, + "step": 9550 + }, + { + "epoch": 2.86, + "grad_norm": 1.1004512310028076, + "learning_rate": 4.7522473410787985e-05, + "loss": 1.3265, + "step": 9555 + }, + { + "epoch": 2.86, + "grad_norm": 0.9491164684295654, + "learning_rate": 4.751992277749994e-05, + "loss": 1.2525, + "step": 9560 + }, + { + "epoch": 2.86, + "grad_norm": 1.315471887588501, + "learning_rate": 4.751737090045996e-05, + "loss": 1.3748, + "step": 9565 + }, + { + "epoch": 2.86, + "grad_norm": 1.0210347175598145, + "learning_rate": 4.7514817779809e-05, + "loss": 1.3718, + "step": 9570 + }, + { + "epoch": 2.86, + "grad_norm": 1.1683263778686523, + "learning_rate": 4.7512263415688054e-05, + "loss": 1.3103, + "step": 9575 + }, + { + "epoch": 2.87, + "grad_norm": 0.9375793933868408, + "learning_rate": 4.75097078082382e-05, + "loss": 1.4674, + "step": 9580 + }, + { + "epoch": 2.87, + "grad_norm": 1.5116485357284546, + "learning_rate": 4.750715095760058e-05, + "loss": 1.3561, + "step": 9585 + }, + { + "epoch": 2.87, + "grad_norm": 1.965453028678894, + "learning_rate": 4.7504592863916405e-05, + "loss": 1.4002, + "step": 9590 + }, + { + "epoch": 2.87, + "grad_norm": 1.2619178295135498, + "learning_rate": 4.750203352732696e-05, + "loss": 1.2696, + "step": 9595 + }, + { + "epoch": 2.87, + "grad_norm": 2.385756731033325, + "learning_rate": 4.749947294797359e-05, + "loss": 1.4395, + "step": 9600 + }, + { + "epoch": 2.87, + "grad_norm": 1.714622139930725, + "learning_rate": 4.749691112599772e-05, + "loss": 1.1337, + "step": 9605 + }, + { + "epoch": 2.88, + "grad_norm": 0.6344813704490662, + "learning_rate": 4.7494348061540835e-05, + "loss": 1.3205, + "step": 9610 + }, + { + "epoch": 2.88, + "grad_norm": 2.1575703620910645, + "learning_rate": 4.749178375474448e-05, + "loss": 1.325, + "step": 9615 + }, + { + "epoch": 2.88, + "grad_norm": 1.1499351263046265, + "learning_rate": 4.7489218205750295e-05, + "loss": 1.3551, + "step": 9620 + }, + { + "epoch": 2.88, + "grad_norm": 2.3958709239959717, + "learning_rate": 4.7486651414699965e-05, + "loss": 1.2264, + "step": 9625 + }, + { + "epoch": 2.88, + "grad_norm": 1.359483242034912, + "learning_rate": 4.748408338173525e-05, + "loss": 1.3342, + "step": 9630 + }, + { + "epoch": 2.88, + "grad_norm": 1.6914485692977905, + "learning_rate": 4.7481514106997975e-05, + "loss": 1.4204, + "step": 9635 + }, + { + "epoch": 2.88, + "grad_norm": 3.1051025390625, + "learning_rate": 4.747894359063005e-05, + "loss": 1.273, + "step": 9640 + }, + { + "epoch": 2.89, + "grad_norm": 1.179664134979248, + "learning_rate": 4.747637183277343e-05, + "loss": 1.1155, + "step": 9645 + }, + { + "epoch": 2.89, + "grad_norm": 1.4367886781692505, + "learning_rate": 4.747379883357016e-05, + "loss": 1.2716, + "step": 9650 + }, + { + "epoch": 2.89, + "grad_norm": 0.8933409452438354, + "learning_rate": 4.7471224593162346e-05, + "loss": 1.2237, + "step": 9655 + }, + { + "epoch": 2.89, + "grad_norm": 1.1432609558105469, + "learning_rate": 4.7468649111692145e-05, + "loss": 1.4141, + "step": 9660 + }, + { + "epoch": 2.89, + "grad_norm": 1.6368004083633423, + "learning_rate": 4.746607238930182e-05, + "loss": 1.3289, + "step": 9665 + }, + { + "epoch": 2.89, + "grad_norm": 1.7630443572998047, + "learning_rate": 4.746349442613366e-05, + "loss": 1.2229, + "step": 9670 + }, + { + "epoch": 2.89, + "grad_norm": 1.9641880989074707, + "learning_rate": 4.7460915222330054e-05, + "loss": 1.2851, + "step": 9675 + }, + { + "epoch": 2.9, + "grad_norm": 1.504604458808899, + "learning_rate": 4.7458334778033446e-05, + "loss": 1.2702, + "step": 9680 + }, + { + "epoch": 2.9, + "grad_norm": 1.346604585647583, + "learning_rate": 4.745575309338636e-05, + "loss": 1.3684, + "step": 9685 + }, + { + "epoch": 2.9, + "grad_norm": 1.1776371002197266, + "learning_rate": 4.745317016853137e-05, + "loss": 1.2718, + "step": 9690 + }, + { + "epoch": 2.9, + "grad_norm": 0.8132845163345337, + "learning_rate": 4.745058600361112e-05, + "loss": 1.5028, + "step": 9695 + }, + { + "epoch": 2.9, + "grad_norm": 3.2717533111572266, + "learning_rate": 4.7448000598768346e-05, + "loss": 1.2721, + "step": 9700 + }, + { + "epoch": 2.9, + "grad_norm": 1.1330522298812866, + "learning_rate": 4.7445413954145834e-05, + "loss": 1.1869, + "step": 9705 + }, + { + "epoch": 2.91, + "grad_norm": 1.326873779296875, + "learning_rate": 4.744282606988645e-05, + "loss": 1.2496, + "step": 9710 + }, + { + "epoch": 2.91, + "grad_norm": 0.6587218642234802, + "learning_rate": 4.74402369461331e-05, + "loss": 1.2123, + "step": 9715 + }, + { + "epoch": 2.91, + "grad_norm": 1.4744004011154175, + "learning_rate": 4.7437646583028784e-05, + "loss": 1.4327, + "step": 9720 + }, + { + "epoch": 2.91, + "grad_norm": 2.3345212936401367, + "learning_rate": 4.7435054980716576e-05, + "loss": 1.2863, + "step": 9725 + }, + { + "epoch": 2.91, + "grad_norm": 2.557159185409546, + "learning_rate": 4.74324621393396e-05, + "loss": 1.4434, + "step": 9730 + }, + { + "epoch": 2.91, + "grad_norm": 1.1000850200653076, + "learning_rate": 4.742986805904106e-05, + "loss": 1.4368, + "step": 9735 + }, + { + "epoch": 2.91, + "grad_norm": 1.9850451946258545, + "learning_rate": 4.742727273996422e-05, + "loss": 1.3663, + "step": 9740 + }, + { + "epoch": 2.92, + "grad_norm": 1.2970614433288574, + "learning_rate": 4.7424676182252414e-05, + "loss": 1.1515, + "step": 9745 + }, + { + "epoch": 2.92, + "grad_norm": 1.1286096572875977, + "learning_rate": 4.742207838604906e-05, + "loss": 1.4357, + "step": 9750 + }, + { + "epoch": 2.92, + "grad_norm": 2.1564157009124756, + "learning_rate": 4.741947935149762e-05, + "loss": 1.29, + "step": 9755 + }, + { + "epoch": 2.92, + "grad_norm": 1.436423897743225, + "learning_rate": 4.741687907874164e-05, + "loss": 1.3875, + "step": 9760 + }, + { + "epoch": 2.92, + "grad_norm": 2.328244924545288, + "learning_rate": 4.7414277567924723e-05, + "loss": 1.2351, + "step": 9765 + }, + { + "epoch": 2.92, + "grad_norm": 1.7096843719482422, + "learning_rate": 4.741167481919056e-05, + "loss": 1.3325, + "step": 9770 + }, + { + "epoch": 2.92, + "grad_norm": 1.5687915086746216, + "learning_rate": 4.740907083268289e-05, + "loss": 1.3996, + "step": 9775 + }, + { + "epoch": 2.93, + "grad_norm": 0.8579280376434326, + "learning_rate": 4.7406465608545534e-05, + "loss": 1.4976, + "step": 9780 + }, + { + "epoch": 2.93, + "grad_norm": 1.4446935653686523, + "learning_rate": 4.740385914692237e-05, + "loss": 1.2704, + "step": 9785 + }, + { + "epoch": 2.93, + "grad_norm": 0.7776928544044495, + "learning_rate": 4.7401251447957354e-05, + "loss": 1.4133, + "step": 9790 + }, + { + "epoch": 2.93, + "grad_norm": 0.9897927641868591, + "learning_rate": 4.73986425117945e-05, + "loss": 1.3766, + "step": 9795 + }, + { + "epoch": 2.93, + "grad_norm": 1.700108528137207, + "learning_rate": 4.739603233857791e-05, + "loss": 1.451, + "step": 9800 + }, + { + "epoch": 2.93, + "grad_norm": 0.8862264156341553, + "learning_rate": 4.7393420928451733e-05, + "loss": 1.3687, + "step": 9805 + }, + { + "epoch": 2.94, + "grad_norm": 1.4384057521820068, + "learning_rate": 4.739080828156019e-05, + "loss": 1.3752, + "step": 9810 + }, + { + "epoch": 2.94, + "grad_norm": 0.7352772951126099, + "learning_rate": 4.738819439804758e-05, + "loss": 1.2511, + "step": 9815 + }, + { + "epoch": 2.94, + "grad_norm": 0.937332272529602, + "learning_rate": 4.738557927805827e-05, + "loss": 1.3718, + "step": 9820 + }, + { + "epoch": 2.94, + "grad_norm": 1.5250344276428223, + "learning_rate": 4.738296292173668e-05, + "loss": 1.3754, + "step": 9825 + }, + { + "epoch": 2.94, + "grad_norm": 1.4641368389129639, + "learning_rate": 4.7380345329227315e-05, + "loss": 1.4706, + "step": 9830 + }, + { + "epoch": 2.94, + "grad_norm": 2.3265063762664795, + "learning_rate": 4.737772650067474e-05, + "loss": 1.3016, + "step": 9835 + }, + { + "epoch": 2.94, + "grad_norm": 1.2815556526184082, + "learning_rate": 4.737510643622359e-05, + "loss": 1.4134, + "step": 9840 + }, + { + "epoch": 2.95, + "grad_norm": 1.8234738111495972, + "learning_rate": 4.7372485136018577e-05, + "loss": 1.2904, + "step": 9845 + }, + { + "epoch": 2.95, + "grad_norm": 1.526788353919983, + "learning_rate": 4.736986260020445e-05, + "loss": 1.222, + "step": 9850 + }, + { + "epoch": 2.95, + "grad_norm": 2.026932716369629, + "learning_rate": 4.736723882892607e-05, + "loss": 1.3834, + "step": 9855 + }, + { + "epoch": 2.95, + "grad_norm": 1.5664438009262085, + "learning_rate": 4.736461382232835e-05, + "loss": 1.4893, + "step": 9860 + }, + { + "epoch": 2.95, + "grad_norm": 0.8656167387962341, + "learning_rate": 4.736198758055624e-05, + "loss": 1.2126, + "step": 9865 + }, + { + "epoch": 2.95, + "grad_norm": 2.0590291023254395, + "learning_rate": 4.735936010375481e-05, + "loss": 1.504, + "step": 9870 + }, + { + "epoch": 2.95, + "grad_norm": 1.066401481628418, + "learning_rate": 4.7356731392069154e-05, + "loss": 1.3136, + "step": 9875 + }, + { + "epoch": 2.96, + "grad_norm": 1.326509952545166, + "learning_rate": 4.7354101445644475e-05, + "loss": 1.2444, + "step": 9880 + }, + { + "epoch": 2.96, + "grad_norm": 1.4632080793380737, + "learning_rate": 4.7351470264625995e-05, + "loss": 1.4441, + "step": 9885 + }, + { + "epoch": 2.96, + "grad_norm": 3.273226261138916, + "learning_rate": 4.734883784915905e-05, + "loss": 1.2485, + "step": 9890 + }, + { + "epoch": 2.96, + "grad_norm": 1.4411187171936035, + "learning_rate": 4.734620419938902e-05, + "loss": 1.2447, + "step": 9895 + }, + { + "epoch": 2.96, + "grad_norm": 1.597489833831787, + "learning_rate": 4.734356931546137e-05, + "loss": 1.3136, + "step": 9900 + }, + { + "epoch": 2.96, + "grad_norm": 1.6306302547454834, + "learning_rate": 4.7340933197521595e-05, + "loss": 1.4123, + "step": 9905 + }, + { + "epoch": 2.96, + "grad_norm": 1.817524790763855, + "learning_rate": 4.7338295845715316e-05, + "loss": 1.4538, + "step": 9910 + }, + { + "epoch": 2.97, + "grad_norm": 1.2677677869796753, + "learning_rate": 4.733565726018816e-05, + "loss": 1.2595, + "step": 9915 + }, + { + "epoch": 2.97, + "grad_norm": 1.2904586791992188, + "learning_rate": 4.7333017441085884e-05, + "loss": 1.3257, + "step": 9920 + }, + { + "epoch": 2.97, + "grad_norm": 1.0129778385162354, + "learning_rate": 4.733037638855427e-05, + "loss": 1.2028, + "step": 9925 + }, + { + "epoch": 2.97, + "grad_norm": 0.7746032476425171, + "learning_rate": 4.732773410273917e-05, + "loss": 1.174, + "step": 9930 + }, + { + "epoch": 2.97, + "grad_norm": 2.065194845199585, + "learning_rate": 4.732509058378653e-05, + "loss": 1.2716, + "step": 9935 + }, + { + "epoch": 2.97, + "grad_norm": 1.3352677822113037, + "learning_rate": 4.732244583184234e-05, + "loss": 1.4513, + "step": 9940 + }, + { + "epoch": 2.98, + "grad_norm": 1.7781025171279907, + "learning_rate": 4.731979984705267e-05, + "loss": 1.3069, + "step": 9945 + }, + { + "epoch": 2.98, + "grad_norm": 1.0501564741134644, + "learning_rate": 4.731715262956365e-05, + "loss": 1.3894, + "step": 9950 + }, + { + "epoch": 2.98, + "grad_norm": 2.136993885040283, + "learning_rate": 4.7314504179521505e-05, + "loss": 1.3747, + "step": 9955 + }, + { + "epoch": 2.98, + "grad_norm": 1.8982741832733154, + "learning_rate": 4.7311854497072474e-05, + "loss": 1.4344, + "step": 9960 + }, + { + "epoch": 2.98, + "grad_norm": 2.735962390899658, + "learning_rate": 4.730920358236291e-05, + "loss": 1.3851, + "step": 9965 + }, + { + "epoch": 2.98, + "grad_norm": 1.7903727293014526, + "learning_rate": 4.730655143553922e-05, + "loss": 1.3806, + "step": 9970 + }, + { + "epoch": 2.98, + "grad_norm": 1.4778308868408203, + "learning_rate": 4.7303898056747895e-05, + "loss": 1.312, + "step": 9975 + }, + { + "epoch": 2.99, + "grad_norm": 4.839424133300781, + "learning_rate": 4.730124344613545e-05, + "loss": 1.3264, + "step": 9980 + }, + { + "epoch": 2.99, + "grad_norm": 1.6125905513763428, + "learning_rate": 4.729858760384851e-05, + "loss": 1.3275, + "step": 9985 + }, + { + "epoch": 2.99, + "grad_norm": 1.407729148864746, + "learning_rate": 4.7295930530033765e-05, + "loss": 1.401, + "step": 9990 + }, + { + "epoch": 2.99, + "grad_norm": 0.9895592331886292, + "learning_rate": 4.729327222483795e-05, + "loss": 1.3163, + "step": 9995 + }, + { + "epoch": 2.99, + "grad_norm": 1.6113483905792236, + "learning_rate": 4.729061268840788e-05, + "loss": 1.3118, + "step": 10000 + }, + { + "epoch": 2.99, + "grad_norm": 2.3144493103027344, + "learning_rate": 4.728795192089044e-05, + "loss": 1.4342, + "step": 10005 + }, + { + "epoch": 2.99, + "grad_norm": 2.2775607109069824, + "learning_rate": 4.728528992243258e-05, + "loss": 1.3856, + "step": 10010 + }, + { + "epoch": 3.0, + "grad_norm": 0.822791576385498, + "learning_rate": 4.728262669318132e-05, + "loss": 1.3132, + "step": 10015 + }, + { + "epoch": 3.0, + "grad_norm": 0.9759201407432556, + "learning_rate": 4.727996223328376e-05, + "loss": 1.3979, + "step": 10020 + }, + { + "epoch": 3.0, + "grad_norm": 1.3269437551498413, + "learning_rate": 4.727729654288704e-05, + "loss": 1.3228, + "step": 10025 + }, + { + "epoch": 3.0, + "grad_norm": 0.9397942423820496, + "learning_rate": 4.7274629622138384e-05, + "loss": 1.2653, + "step": 10030 + }, + { + "epoch": 3.0, + "grad_norm": 0.8297321200370789, + "learning_rate": 4.7271961471185086e-05, + "loss": 1.2575, + "step": 10035 + }, + { + "epoch": 3.0, + "grad_norm": 1.417336106300354, + "learning_rate": 4.7269292090174514e-05, + "loss": 1.2264, + "step": 10040 + }, + { + "epoch": 3.01, + "grad_norm": 2.3243861198425293, + "learning_rate": 4.7266621479254084e-05, + "loss": 1.2769, + "step": 10045 + }, + { + "epoch": 3.01, + "grad_norm": 0.8861342072486877, + "learning_rate": 4.726394963857129e-05, + "loss": 1.137, + "step": 10050 + }, + { + "epoch": 3.01, + "grad_norm": 1.0285983085632324, + "learning_rate": 4.72612765682737e-05, + "loss": 1.3641, + "step": 10055 + }, + { + "epoch": 3.01, + "grad_norm": 1.3051286935806274, + "learning_rate": 4.725860226850896e-05, + "loss": 1.2869, + "step": 10060 + }, + { + "epoch": 3.01, + "grad_norm": 1.4701013565063477, + "learning_rate": 4.725592673942473e-05, + "loss": 1.3181, + "step": 10065 + }, + { + "epoch": 3.01, + "grad_norm": 1.0764509439468384, + "learning_rate": 4.725324998116881e-05, + "loss": 1.1691, + "step": 10070 + }, + { + "epoch": 3.01, + "grad_norm": 0.9032073616981506, + "learning_rate": 4.7250571993889025e-05, + "loss": 1.181, + "step": 10075 + }, + { + "epoch": 3.02, + "grad_norm": 1.8329408168792725, + "learning_rate": 4.724789277773328e-05, + "loss": 1.2904, + "step": 10080 + }, + { + "epoch": 3.02, + "grad_norm": 1.2701138257980347, + "learning_rate": 4.7245212332849544e-05, + "loss": 1.3036, + "step": 10085 + }, + { + "epoch": 3.02, + "grad_norm": 1.293056607246399, + "learning_rate": 4.7242530659385845e-05, + "loss": 1.4167, + "step": 10090 + }, + { + "epoch": 3.02, + "grad_norm": 0.7726661562919617, + "learning_rate": 4.72398477574903e-05, + "loss": 1.3607, + "step": 10095 + }, + { + "epoch": 3.02, + "grad_norm": 1.5324546098709106, + "learning_rate": 4.7237163627311084e-05, + "loss": 1.475, + "step": 10100 + }, + { + "epoch": 3.02, + "grad_norm": 1.288621187210083, + "learning_rate": 4.723447826899644e-05, + "loss": 1.3092, + "step": 10105 + }, + { + "epoch": 3.02, + "grad_norm": 1.084633469581604, + "learning_rate": 4.723179168269466e-05, + "loss": 1.2224, + "step": 10110 + }, + { + "epoch": 3.03, + "grad_norm": 1.096497654914856, + "learning_rate": 4.7229103868554136e-05, + "loss": 1.2714, + "step": 10115 + }, + { + "epoch": 3.03, + "grad_norm": 1.8230210542678833, + "learning_rate": 4.722641482672332e-05, + "loss": 1.3204, + "step": 10120 + }, + { + "epoch": 3.03, + "grad_norm": 2.226288080215454, + "learning_rate": 4.7223724557350714e-05, + "loss": 1.3604, + "step": 10125 + }, + { + "epoch": 3.03, + "grad_norm": 0.8786224722862244, + "learning_rate": 4.7221033060584897e-05, + "loss": 1.3312, + "step": 10130 + }, + { + "epoch": 3.03, + "grad_norm": 0.790179431438446, + "learning_rate": 4.7218340336574516e-05, + "loss": 1.2433, + "step": 10135 + }, + { + "epoch": 3.03, + "grad_norm": 0.7193413376808167, + "learning_rate": 4.7215646385468304e-05, + "loss": 1.1751, + "step": 10140 + }, + { + "epoch": 3.04, + "grad_norm": 1.1704049110412598, + "learning_rate": 4.721295120741503e-05, + "loss": 1.2481, + "step": 10145 + }, + { + "epoch": 3.04, + "grad_norm": 1.1936416625976562, + "learning_rate": 4.7210254802563547e-05, + "loss": 1.382, + "step": 10150 + }, + { + "epoch": 3.04, + "grad_norm": 1.2714142799377441, + "learning_rate": 4.7207557171062784e-05, + "loss": 1.3151, + "step": 10155 + }, + { + "epoch": 3.04, + "grad_norm": 1.973885178565979, + "learning_rate": 4.7204858313061715e-05, + "loss": 1.3476, + "step": 10160 + }, + { + "epoch": 3.04, + "grad_norm": 1.704947829246521, + "learning_rate": 4.7202158228709404e-05, + "loss": 1.2753, + "step": 10165 + }, + { + "epoch": 3.04, + "grad_norm": 1.6987488269805908, + "learning_rate": 4.719945691815498e-05, + "loss": 1.3038, + "step": 10170 + }, + { + "epoch": 3.04, + "grad_norm": 1.8484985828399658, + "learning_rate": 4.719675438154761e-05, + "loss": 1.2721, + "step": 10175 + }, + { + "epoch": 3.05, + "grad_norm": 1.5991469621658325, + "learning_rate": 4.719405061903658e-05, + "loss": 1.2864, + "step": 10180 + }, + { + "epoch": 3.05, + "grad_norm": 0.9577586054801941, + "learning_rate": 4.71913456307712e-05, + "loss": 1.2446, + "step": 10185 + }, + { + "epoch": 3.05, + "grad_norm": 1.2344039678573608, + "learning_rate": 4.718863941690087e-05, + "loss": 1.3046, + "step": 10190 + }, + { + "epoch": 3.05, + "grad_norm": 1.5823438167572021, + "learning_rate": 4.7185931977575046e-05, + "loss": 1.2316, + "step": 10195 + }, + { + "epoch": 3.05, + "grad_norm": 1.7910969257354736, + "learning_rate": 4.7183223312943257e-05, + "loss": 1.3293, + "step": 10200 + }, + { + "epoch": 3.05, + "grad_norm": 1.5367566347122192, + "learning_rate": 4.7180513423155105e-05, + "loss": 1.2248, + "step": 10205 + }, + { + "epoch": 3.05, + "grad_norm": 1.397963047027588, + "learning_rate": 4.717780230836025e-05, + "loss": 1.3568, + "step": 10210 + }, + { + "epoch": 3.06, + "grad_norm": 1.2412103414535522, + "learning_rate": 4.717508996870843e-05, + "loss": 1.2943, + "step": 10215 + }, + { + "epoch": 3.06, + "grad_norm": 2.0877139568328857, + "learning_rate": 4.7172376404349436e-05, + "loss": 1.415, + "step": 10220 + }, + { + "epoch": 3.06, + "grad_norm": 1.540336012840271, + "learning_rate": 4.716966161543315e-05, + "loss": 1.1981, + "step": 10225 + }, + { + "epoch": 3.06, + "grad_norm": 1.3535550832748413, + "learning_rate": 4.716694560210949e-05, + "loss": 1.2493, + "step": 10230 + }, + { + "epoch": 3.06, + "grad_norm": 1.6786630153656006, + "learning_rate": 4.7164228364528464e-05, + "loss": 1.2774, + "step": 10235 + }, + { + "epoch": 3.06, + "grad_norm": 1.0593065023422241, + "learning_rate": 4.716150990284015e-05, + "loss": 1.1643, + "step": 10240 + }, + { + "epoch": 3.07, + "grad_norm": 1.656699299812317, + "learning_rate": 4.715879021719467e-05, + "loss": 1.2421, + "step": 10245 + }, + { + "epoch": 3.07, + "grad_norm": 1.0437387228012085, + "learning_rate": 4.7156069307742244e-05, + "loss": 1.2925, + "step": 10250 + }, + { + "epoch": 3.07, + "grad_norm": 1.7811616659164429, + "learning_rate": 4.7153347174633145e-05, + "loss": 1.3529, + "step": 10255 + }, + { + "epoch": 3.07, + "grad_norm": 1.5187066793441772, + "learning_rate": 4.71506238180177e-05, + "loss": 1.5044, + "step": 10260 + }, + { + "epoch": 3.07, + "grad_norm": 2.582772731781006, + "learning_rate": 4.714789923804633e-05, + "loss": 1.4396, + "step": 10265 + }, + { + "epoch": 3.07, + "grad_norm": 1.7767497301101685, + "learning_rate": 4.714517343486951e-05, + "loss": 1.2826, + "step": 10270 + }, + { + "epoch": 3.07, + "grad_norm": 1.2945784330368042, + "learning_rate": 4.7142446408637774e-05, + "loss": 1.2212, + "step": 10275 + }, + { + "epoch": 3.08, + "grad_norm": 0.8587417006492615, + "learning_rate": 4.7139718159501747e-05, + "loss": 1.2171, + "step": 10280 + }, + { + "epoch": 3.08, + "grad_norm": 3.030939817428589, + "learning_rate": 4.71369886876121e-05, + "loss": 1.3301, + "step": 10285 + }, + { + "epoch": 3.08, + "grad_norm": 1.777191162109375, + "learning_rate": 4.7134257993119564e-05, + "loss": 1.2228, + "step": 10290 + }, + { + "epoch": 3.08, + "grad_norm": 1.1202077865600586, + "learning_rate": 4.713152607617497e-05, + "loss": 1.2215, + "step": 10295 + }, + { + "epoch": 3.08, + "grad_norm": 0.6653339862823486, + "learning_rate": 4.71287929369292e-05, + "loss": 1.4115, + "step": 10300 + }, + { + "epoch": 3.08, + "grad_norm": 1.3566327095031738, + "learning_rate": 4.712605857553319e-05, + "loss": 1.332, + "step": 10305 + }, + { + "epoch": 3.08, + "grad_norm": 1.673338532447815, + "learning_rate": 4.7123322992137975e-05, + "loss": 1.4227, + "step": 10310 + }, + { + "epoch": 3.09, + "grad_norm": 0.8734028339385986, + "learning_rate": 4.712058618689462e-05, + "loss": 1.2329, + "step": 10315 + }, + { + "epoch": 3.09, + "grad_norm": 1.6159021854400635, + "learning_rate": 4.7117848159954294e-05, + "loss": 1.1735, + "step": 10320 + }, + { + "epoch": 3.09, + "grad_norm": 1.456439733505249, + "learning_rate": 4.7115108911468194e-05, + "loss": 1.3611, + "step": 10325 + }, + { + "epoch": 3.09, + "grad_norm": 1.3023260831832886, + "learning_rate": 4.7112368441587615e-05, + "loss": 1.3157, + "step": 10330 + }, + { + "epoch": 3.09, + "grad_norm": 0.9196366667747498, + "learning_rate": 4.710962675046391e-05, + "loss": 1.3092, + "step": 10335 + }, + { + "epoch": 3.09, + "grad_norm": 0.8758362531661987, + "learning_rate": 4.7106883838248505e-05, + "loss": 1.4856, + "step": 10340 + }, + { + "epoch": 3.1, + "grad_norm": 1.404257893562317, + "learning_rate": 4.710413970509289e-05, + "loss": 1.3546, + "step": 10345 + }, + { + "epoch": 3.1, + "grad_norm": 2.7018842697143555, + "learning_rate": 4.710139435114861e-05, + "loss": 1.196, + "step": 10350 + }, + { + "epoch": 3.1, + "grad_norm": 1.2335234880447388, + "learning_rate": 4.709864777656729e-05, + "loss": 1.145, + "step": 10355 + }, + { + "epoch": 3.1, + "grad_norm": 2.3632590770721436, + "learning_rate": 4.709589998150063e-05, + "loss": 1.4927, + "step": 10360 + }, + { + "epoch": 3.1, + "grad_norm": 0.9068330526351929, + "learning_rate": 4.709315096610038e-05, + "loss": 1.2923, + "step": 10365 + }, + { + "epoch": 3.1, + "grad_norm": 1.093493103981018, + "learning_rate": 4.709040073051837e-05, + "loss": 1.2462, + "step": 10370 + }, + { + "epoch": 3.1, + "grad_norm": 0.9117778539657593, + "learning_rate": 4.7087649274906475e-05, + "loss": 1.2301, + "step": 10375 + }, + { + "epoch": 3.11, + "grad_norm": 1.3600761890411377, + "learning_rate": 4.7084896599416685e-05, + "loss": 1.3659, + "step": 10380 + }, + { + "epoch": 3.11, + "grad_norm": 2.149157762527466, + "learning_rate": 4.7082142704200996e-05, + "loss": 1.2298, + "step": 10385 + }, + { + "epoch": 3.11, + "grad_norm": 1.4319911003112793, + "learning_rate": 4.707938758941153e-05, + "loss": 1.2099, + "step": 10390 + }, + { + "epoch": 3.11, + "grad_norm": 0.8007309436798096, + "learning_rate": 4.7076631255200436e-05, + "loss": 1.208, + "step": 10395 + }, + { + "epoch": 3.11, + "grad_norm": 2.5758299827575684, + "learning_rate": 4.707387370171995e-05, + "loss": 1.457, + "step": 10400 + }, + { + "epoch": 3.11, + "grad_norm": 1.3891574144363403, + "learning_rate": 4.707111492912235e-05, + "loss": 1.1751, + "step": 10405 + }, + { + "epoch": 3.11, + "grad_norm": 1.3793649673461914, + "learning_rate": 4.7068354937560026e-05, + "loss": 1.2248, + "step": 10410 + }, + { + "epoch": 3.12, + "grad_norm": 1.7429753541946411, + "learning_rate": 4.7065593727185395e-05, + "loss": 1.3831, + "step": 10415 + }, + { + "epoch": 3.12, + "grad_norm": 0.9534806609153748, + "learning_rate": 4.706283129815095e-05, + "loss": 1.3393, + "step": 10420 + }, + { + "epoch": 3.12, + "grad_norm": 0.9170734286308289, + "learning_rate": 4.706006765060928e-05, + "loss": 1.2613, + "step": 10425 + }, + { + "epoch": 3.12, + "grad_norm": 1.9768500328063965, + "learning_rate": 4.7057302784713e-05, + "loss": 1.3073, + "step": 10430 + }, + { + "epoch": 3.12, + "grad_norm": 1.7326947450637817, + "learning_rate": 4.705453670061481e-05, + "loss": 1.2648, + "step": 10435 + }, + { + "epoch": 3.12, + "grad_norm": 4.0265069007873535, + "learning_rate": 4.7051769398467484e-05, + "loss": 1.3263, + "step": 10440 + }, + { + "epoch": 3.13, + "grad_norm": 1.0905771255493164, + "learning_rate": 4.7049000878423856e-05, + "loss": 1.1149, + "step": 10445 + }, + { + "epoch": 3.13, + "grad_norm": 2.596532106399536, + "learning_rate": 4.7046231140636826e-05, + "loss": 1.3235, + "step": 10450 + }, + { + "epoch": 3.13, + "grad_norm": 1.497429370880127, + "learning_rate": 4.704346018525937e-05, + "loss": 1.3615, + "step": 10455 + }, + { + "epoch": 3.13, + "grad_norm": 0.9473041892051697, + "learning_rate": 4.704068801244452e-05, + "loss": 1.2733, + "step": 10460 + }, + { + "epoch": 3.13, + "grad_norm": 3.628065824508667, + "learning_rate": 4.703791462234537e-05, + "loss": 1.3351, + "step": 10465 + }, + { + "epoch": 3.13, + "grad_norm": 0.9832010865211487, + "learning_rate": 4.703514001511512e-05, + "loss": 1.3246, + "step": 10470 + }, + { + "epoch": 3.13, + "grad_norm": 1.0642294883728027, + "learning_rate": 4.7032364190906985e-05, + "loss": 1.309, + "step": 10475 + }, + { + "epoch": 3.14, + "grad_norm": 2.5547728538513184, + "learning_rate": 4.702958714987427e-05, + "loss": 1.2542, + "step": 10480 + }, + { + "epoch": 3.14, + "grad_norm": 2.006378173828125, + "learning_rate": 4.702680889217036e-05, + "loss": 1.2813, + "step": 10485 + }, + { + "epoch": 3.14, + "grad_norm": 0.6523733735084534, + "learning_rate": 4.702402941794869e-05, + "loss": 1.2206, + "step": 10490 + }, + { + "epoch": 3.14, + "grad_norm": 1.642492651939392, + "learning_rate": 4.702124872736277e-05, + "loss": 1.3215, + "step": 10495 + }, + { + "epoch": 3.14, + "grad_norm": 3.4225122928619385, + "learning_rate": 4.7018466820566174e-05, + "loss": 1.2342, + "step": 10500 + }, + { + "epoch": 3.14, + "grad_norm": 1.2977136373519897, + "learning_rate": 4.701568369771254e-05, + "loss": 1.3561, + "step": 10505 + }, + { + "epoch": 3.14, + "grad_norm": 2.203577995300293, + "learning_rate": 4.701289935895558e-05, + "loss": 1.2129, + "step": 10510 + }, + { + "epoch": 3.15, + "grad_norm": 1.4925588369369507, + "learning_rate": 4.701011380444907e-05, + "loss": 1.2579, + "step": 10515 + }, + { + "epoch": 3.15, + "grad_norm": 2.624371290206909, + "learning_rate": 4.700732703434685e-05, + "loss": 1.4046, + "step": 10520 + }, + { + "epoch": 3.15, + "grad_norm": 1.6034959554672241, + "learning_rate": 4.7004539048802834e-05, + "loss": 1.2614, + "step": 10525 + }, + { + "epoch": 3.15, + "grad_norm": 1.3191356658935547, + "learning_rate": 4.7001749847971e-05, + "loss": 1.2469, + "step": 10530 + }, + { + "epoch": 3.15, + "grad_norm": 1.2245357036590576, + "learning_rate": 4.6998959432005393e-05, + "loss": 1.3599, + "step": 10535 + }, + { + "epoch": 3.15, + "grad_norm": 0.8731427192687988, + "learning_rate": 4.699616780106012e-05, + "loss": 1.2735, + "step": 10540 + }, + { + "epoch": 3.15, + "grad_norm": 2.4979584217071533, + "learning_rate": 4.699337495528937e-05, + "loss": 1.3179, + "step": 10545 + }, + { + "epoch": 3.16, + "grad_norm": 0.922391414642334, + "learning_rate": 4.699058089484737e-05, + "loss": 1.3218, + "step": 10550 + }, + { + "epoch": 3.16, + "grad_norm": 1.5169897079467773, + "learning_rate": 4.6987785619888455e-05, + "loss": 1.3695, + "step": 10555 + }, + { + "epoch": 3.16, + "grad_norm": 1.7197904586791992, + "learning_rate": 4.698498913056699e-05, + "loss": 1.3514, + "step": 10560 + }, + { + "epoch": 3.16, + "grad_norm": 1.886755108833313, + "learning_rate": 4.698219142703743e-05, + "loss": 1.3594, + "step": 10565 + }, + { + "epoch": 3.16, + "grad_norm": 2.066739082336426, + "learning_rate": 4.6979392509454286e-05, + "loss": 1.2485, + "step": 10570 + }, + { + "epoch": 3.16, + "grad_norm": 1.8264986276626587, + "learning_rate": 4.697659237797214e-05, + "loss": 1.2989, + "step": 10575 + }, + { + "epoch": 3.17, + "grad_norm": 1.5807194709777832, + "learning_rate": 4.697379103274564e-05, + "loss": 1.1532, + "step": 10580 + }, + { + "epoch": 3.17, + "grad_norm": 2.379920244216919, + "learning_rate": 4.69709884739295e-05, + "loss": 1.1664, + "step": 10585 + }, + { + "epoch": 3.17, + "grad_norm": 2.331644296646118, + "learning_rate": 4.696818470167851e-05, + "loss": 1.2713, + "step": 10590 + }, + { + "epoch": 3.17, + "grad_norm": 2.402665376663208, + "learning_rate": 4.6965379716147504e-05, + "loss": 1.3309, + "step": 10595 + }, + { + "epoch": 3.17, + "grad_norm": 3.28230881690979, + "learning_rate": 4.6962573517491414e-05, + "loss": 1.2569, + "step": 10600 + }, + { + "epoch": 3.17, + "grad_norm": 1.6851478815078735, + "learning_rate": 4.6959766105865225e-05, + "loss": 1.4249, + "step": 10605 + }, + { + "epoch": 3.17, + "grad_norm": 3.4568557739257812, + "learning_rate": 4.695695748142397e-05, + "loss": 1.4145, + "step": 10610 + }, + { + "epoch": 3.18, + "grad_norm": 3.6388516426086426, + "learning_rate": 4.695414764432278e-05, + "loss": 1.3806, + "step": 10615 + }, + { + "epoch": 3.18, + "grad_norm": 2.0879287719726562, + "learning_rate": 4.695133659471683e-05, + "loss": 1.2937, + "step": 10620 + }, + { + "epoch": 3.18, + "grad_norm": 1.9023692607879639, + "learning_rate": 4.694852433276138e-05, + "loss": 1.3197, + "step": 10625 + }, + { + "epoch": 3.18, + "grad_norm": 3.421626329421997, + "learning_rate": 4.6945710858611746e-05, + "loss": 1.2769, + "step": 10630 + }, + { + "epoch": 3.18, + "grad_norm": 1.5215307474136353, + "learning_rate": 4.694289617242331e-05, + "loss": 1.3148, + "step": 10635 + }, + { + "epoch": 3.18, + "grad_norm": 1.3588393926620483, + "learning_rate": 4.694008027435154e-05, + "loss": 1.2765, + "step": 10640 + }, + { + "epoch": 3.18, + "grad_norm": 1.517864465713501, + "learning_rate": 4.6937263164551926e-05, + "loss": 1.1829, + "step": 10645 + }, + { + "epoch": 3.19, + "grad_norm": 1.18218994140625, + "learning_rate": 4.693444484318008e-05, + "loss": 1.3342, + "step": 10650 + }, + { + "epoch": 3.19, + "grad_norm": 0.9616156816482544, + "learning_rate": 4.693162531039164e-05, + "loss": 1.4308, + "step": 10655 + }, + { + "epoch": 3.19, + "grad_norm": 1.3494954109191895, + "learning_rate": 4.692880456634233e-05, + "loss": 1.2483, + "step": 10660 + }, + { + "epoch": 3.19, + "grad_norm": 1.6697038412094116, + "learning_rate": 4.6925982611187934e-05, + "loss": 1.2745, + "step": 10665 + }, + { + "epoch": 3.19, + "grad_norm": 1.6610711812973022, + "learning_rate": 4.692315944508432e-05, + "loss": 1.3191, + "step": 10670 + }, + { + "epoch": 3.19, + "grad_norm": 2.5130321979522705, + "learning_rate": 4.692033506818739e-05, + "loss": 1.0889, + "step": 10675 + }, + { + "epoch": 3.2, + "grad_norm": 1.1333353519439697, + "learning_rate": 4.6917509480653146e-05, + "loss": 1.1744, + "step": 10680 + }, + { + "epoch": 3.2, + "grad_norm": 0.9024247527122498, + "learning_rate": 4.6914682682637626e-05, + "loss": 1.2681, + "step": 10685 + }, + { + "epoch": 3.2, + "grad_norm": 1.6913889646530151, + "learning_rate": 4.6911854674296964e-05, + "loss": 1.246, + "step": 10690 + }, + { + "epoch": 3.2, + "grad_norm": 2.867292881011963, + "learning_rate": 4.690902545578735e-05, + "loss": 1.3205, + "step": 10695 + }, + { + "epoch": 3.2, + "grad_norm": 1.3072094917297363, + "learning_rate": 4.690619502726502e-05, + "loss": 1.3059, + "step": 10700 + }, + { + "epoch": 3.2, + "grad_norm": 1.9709635972976685, + "learning_rate": 4.6903363388886325e-05, + "loss": 1.321, + "step": 10705 + }, + { + "epoch": 3.2, + "grad_norm": 1.7038654088974, + "learning_rate": 4.6900530540807624e-05, + "loss": 1.3357, + "step": 10710 + }, + { + "epoch": 3.21, + "grad_norm": 0.885414183139801, + "learning_rate": 4.68976964831854e-05, + "loss": 1.2921, + "step": 10715 + }, + { + "epoch": 3.21, + "grad_norm": 1.5345251560211182, + "learning_rate": 4.689486121617615e-05, + "loss": 1.4398, + "step": 10720 + }, + { + "epoch": 3.21, + "grad_norm": 1.8840970993041992, + "learning_rate": 4.689202473993647e-05, + "loss": 1.1967, + "step": 10725 + }, + { + "epoch": 3.21, + "grad_norm": 1.861816167831421, + "learning_rate": 4.6889187054623016e-05, + "loss": 1.2309, + "step": 10730 + }, + { + "epoch": 3.21, + "grad_norm": 1.6276249885559082, + "learning_rate": 4.688634816039253e-05, + "loss": 1.2227, + "step": 10735 + }, + { + "epoch": 3.21, + "grad_norm": 1.3406356573104858, + "learning_rate": 4.688350805740178e-05, + "loss": 1.3609, + "step": 10740 + }, + { + "epoch": 3.21, + "grad_norm": 2.407961130142212, + "learning_rate": 4.6880666745807625e-05, + "loss": 1.0896, + "step": 10745 + }, + { + "epoch": 3.22, + "grad_norm": 1.9864040613174438, + "learning_rate": 4.687782422576698e-05, + "loss": 1.2063, + "step": 10750 + }, + { + "epoch": 3.22, + "grad_norm": 1.8912273645401, + "learning_rate": 4.687498049743685e-05, + "loss": 1.2833, + "step": 10755 + }, + { + "epoch": 3.22, + "grad_norm": 1.490443468093872, + "learning_rate": 4.6872135560974285e-05, + "loss": 1.3106, + "step": 10760 + }, + { + "epoch": 3.22, + "grad_norm": 2.050848960876465, + "learning_rate": 4.686928941653641e-05, + "loss": 1.2577, + "step": 10765 + }, + { + "epoch": 3.22, + "grad_norm": 1.508426308631897, + "learning_rate": 4.686644206428041e-05, + "loss": 1.2863, + "step": 10770 + }, + { + "epoch": 3.22, + "grad_norm": 1.549778699874878, + "learning_rate": 4.686359350436354e-05, + "loss": 1.1178, + "step": 10775 + }, + { + "epoch": 3.23, + "grad_norm": 1.004459023475647, + "learning_rate": 4.6860743736943134e-05, + "loss": 1.3404, + "step": 10780 + }, + { + "epoch": 3.23, + "grad_norm": 3.65075945854187, + "learning_rate": 4.685789276217658e-05, + "loss": 1.449, + "step": 10785 + }, + { + "epoch": 3.23, + "grad_norm": 0.863338053226471, + "learning_rate": 4.6855040580221323e-05, + "loss": 1.1967, + "step": 10790 + }, + { + "epoch": 3.23, + "grad_norm": 1.1816414594650269, + "learning_rate": 4.685218719123489e-05, + "loss": 1.2813, + "step": 10795 + }, + { + "epoch": 3.23, + "grad_norm": 1.7856791019439697, + "learning_rate": 4.6849332595374864e-05, + "loss": 1.4305, + "step": 10800 + }, + { + "epoch": 3.23, + "grad_norm": 1.4339046478271484, + "learning_rate": 4.684647679279892e-05, + "loss": 1.3854, + "step": 10805 + }, + { + "epoch": 3.23, + "grad_norm": 0.8699505925178528, + "learning_rate": 4.684361978366477e-05, + "loss": 1.2915, + "step": 10810 + }, + { + "epoch": 3.24, + "grad_norm": 1.3646979331970215, + "learning_rate": 4.6840761568130206e-05, + "loss": 1.3623, + "step": 10815 + }, + { + "epoch": 3.24, + "grad_norm": 2.1360511779785156, + "learning_rate": 4.6837902146353076e-05, + "loss": 1.2835, + "step": 10820 + }, + { + "epoch": 3.24, + "grad_norm": 1.1500669717788696, + "learning_rate": 4.683504151849132e-05, + "loss": 1.2727, + "step": 10825 + }, + { + "epoch": 3.24, + "grad_norm": 4.0960187911987305, + "learning_rate": 4.68321796847029e-05, + "loss": 1.2047, + "step": 10830 + }, + { + "epoch": 3.24, + "grad_norm": 0.9620679020881653, + "learning_rate": 4.68293166451459e-05, + "loss": 1.2046, + "step": 10835 + }, + { + "epoch": 3.24, + "grad_norm": 1.1444592475891113, + "learning_rate": 4.6826452399978436e-05, + "loss": 1.391, + "step": 10840 + }, + { + "epoch": 3.24, + "grad_norm": 2.369436740875244, + "learning_rate": 4.6823586949358686e-05, + "loss": 1.2937, + "step": 10845 + }, + { + "epoch": 3.25, + "grad_norm": 1.327525019645691, + "learning_rate": 4.682072029344492e-05, + "loss": 1.4105, + "step": 10850 + }, + { + "epoch": 3.25, + "grad_norm": 1.3739736080169678, + "learning_rate": 4.681785243239545e-05, + "loss": 1.4381, + "step": 10855 + }, + { + "epoch": 3.25, + "grad_norm": 1.522316813468933, + "learning_rate": 4.681498336636867e-05, + "loss": 1.3176, + "step": 10860 + }, + { + "epoch": 3.25, + "grad_norm": 1.123241901397705, + "learning_rate": 4.681211309552304e-05, + "loss": 1.2045, + "step": 10865 + }, + { + "epoch": 3.25, + "grad_norm": 0.9612917900085449, + "learning_rate": 4.680924162001706e-05, + "loss": 1.3388, + "step": 10870 + }, + { + "epoch": 3.25, + "grad_norm": 1.3160079717636108, + "learning_rate": 4.680636894000935e-05, + "loss": 1.338, + "step": 10875 + }, + { + "epoch": 3.26, + "grad_norm": 0.9297557473182678, + "learning_rate": 4.680349505565854e-05, + "loss": 1.2748, + "step": 10880 + }, + { + "epoch": 3.26, + "grad_norm": 0.926292359828949, + "learning_rate": 4.6800619967123373e-05, + "loss": 1.3359, + "step": 10885 + }, + { + "epoch": 3.26, + "grad_norm": 1.0277527570724487, + "learning_rate": 4.679774367456261e-05, + "loss": 1.2692, + "step": 10890 + }, + { + "epoch": 3.26, + "grad_norm": 1.3001703023910522, + "learning_rate": 4.679486617813513e-05, + "loss": 1.2465, + "step": 10895 + }, + { + "epoch": 3.26, + "grad_norm": 1.934599757194519, + "learning_rate": 4.679198747799984e-05, + "loss": 1.318, + "step": 10900 + }, + { + "epoch": 3.26, + "grad_norm": 1.1883105039596558, + "learning_rate": 4.678910757431574e-05, + "loss": 1.2774, + "step": 10905 + }, + { + "epoch": 3.26, + "grad_norm": 2.791804075241089, + "learning_rate": 4.678622646724188e-05, + "loss": 1.3288, + "step": 10910 + }, + { + "epoch": 3.27, + "grad_norm": 1.5142138004302979, + "learning_rate": 4.678334415693737e-05, + "loss": 1.444, + "step": 10915 + }, + { + "epoch": 3.27, + "grad_norm": 1.1314914226531982, + "learning_rate": 4.678046064356141e-05, + "loss": 1.3735, + "step": 10920 + }, + { + "epoch": 3.27, + "grad_norm": 2.2148430347442627, + "learning_rate": 4.677757592727325e-05, + "loss": 1.2055, + "step": 10925 + }, + { + "epoch": 3.27, + "grad_norm": 1.7565327882766724, + "learning_rate": 4.67746900082322e-05, + "loss": 1.2262, + "step": 10930 + }, + { + "epoch": 3.27, + "grad_norm": 4.066123962402344, + "learning_rate": 4.677180288659766e-05, + "loss": 1.3309, + "step": 10935 + }, + { + "epoch": 3.27, + "grad_norm": 2.2226760387420654, + "learning_rate": 4.676891456252908e-05, + "loss": 1.3966, + "step": 10940 + }, + { + "epoch": 3.27, + "grad_norm": 2.1616628170013428, + "learning_rate": 4.676602503618597e-05, + "loss": 1.1928, + "step": 10945 + }, + { + "epoch": 3.28, + "grad_norm": 1.3349906206130981, + "learning_rate": 4.6763134307727916e-05, + "loss": 1.2268, + "step": 10950 + }, + { + "epoch": 3.28, + "grad_norm": 1.6452038288116455, + "learning_rate": 4.676024237731459e-05, + "loss": 1.1698, + "step": 10955 + }, + { + "epoch": 3.28, + "grad_norm": 1.8451074361801147, + "learning_rate": 4.675734924510569e-05, + "loss": 1.2763, + "step": 10960 + }, + { + "epoch": 3.28, + "grad_norm": 1.9175277948379517, + "learning_rate": 4.6754454911261005e-05, + "loss": 1.3425, + "step": 10965 + }, + { + "epoch": 3.28, + "grad_norm": 1.0430094003677368, + "learning_rate": 4.6751559375940384e-05, + "loss": 1.324, + "step": 10970 + }, + { + "epoch": 3.28, + "grad_norm": 4.597285747528076, + "learning_rate": 4.674866263930375e-05, + "loss": 1.2731, + "step": 10975 + }, + { + "epoch": 3.29, + "grad_norm": 1.3398327827453613, + "learning_rate": 4.674576470151109e-05, + "loss": 1.3967, + "step": 10980 + }, + { + "epoch": 3.29, + "grad_norm": 1.1260203123092651, + "learning_rate": 4.674344548655217e-05, + "loss": 1.4702, + "step": 10985 + }, + { + "epoch": 3.29, + "grad_norm": 1.8859392404556274, + "learning_rate": 4.674054538708202e-05, + "loss": 1.3604, + "step": 10990 + }, + { + "epoch": 3.29, + "grad_norm": 1.0552682876586914, + "learning_rate": 4.6737644086904156e-05, + "loss": 1.2871, + "step": 10995 + }, + { + "epoch": 3.29, + "grad_norm": 1.1921944618225098, + "learning_rate": 4.6734741586178794e-05, + "loss": 1.411, + "step": 11000 + }, + { + "epoch": 3.29, + "grad_norm": 1.534670352935791, + "learning_rate": 4.673183788506625e-05, + "loss": 1.1321, + "step": 11005 + }, + { + "epoch": 3.29, + "grad_norm": 0.8128979802131653, + "learning_rate": 4.672893298372688e-05, + "loss": 1.1416, + "step": 11010 + }, + { + "epoch": 3.3, + "grad_norm": 1.2440390586853027, + "learning_rate": 4.672602688232114e-05, + "loss": 1.3895, + "step": 11015 + }, + { + "epoch": 3.3, + "grad_norm": 0.7895771861076355, + "learning_rate": 4.672311958100951e-05, + "loss": 1.2575, + "step": 11020 + }, + { + "epoch": 3.3, + "grad_norm": 8.737719535827637, + "learning_rate": 4.672021107995257e-05, + "loss": 1.2813, + "step": 11025 + }, + { + "epoch": 3.3, + "grad_norm": 1.1101980209350586, + "learning_rate": 4.671730137931095e-05, + "loss": 1.3107, + "step": 11030 + }, + { + "epoch": 3.3, + "grad_norm": 1.0142308473587036, + "learning_rate": 4.671439047924535e-05, + "loss": 1.2947, + "step": 11035 + }, + { + "epoch": 3.3, + "grad_norm": 1.9950822591781616, + "learning_rate": 4.671147837991653e-05, + "loss": 1.4065, + "step": 11040 + }, + { + "epoch": 3.3, + "grad_norm": 1.4179012775421143, + "learning_rate": 4.670856508148532e-05, + "loss": 1.3455, + "step": 11045 + }, + { + "epoch": 3.31, + "grad_norm": 1.6330708265304565, + "learning_rate": 4.670565058411264e-05, + "loss": 1.3063, + "step": 11050 + }, + { + "epoch": 3.31, + "grad_norm": 1.7123674154281616, + "learning_rate": 4.6702734887959434e-05, + "loss": 1.2452, + "step": 11055 + }, + { + "epoch": 3.31, + "grad_norm": 1.1619246006011963, + "learning_rate": 4.669981799318674e-05, + "loss": 1.361, + "step": 11060 + }, + { + "epoch": 3.31, + "grad_norm": 2.159376382827759, + "learning_rate": 4.669689989995565e-05, + "loss": 1.2923, + "step": 11065 + }, + { + "epoch": 3.31, + "grad_norm": 1.8677573204040527, + "learning_rate": 4.6693980608427326e-05, + "loss": 1.2485, + "step": 11070 + }, + { + "epoch": 3.31, + "grad_norm": 1.9316105842590332, + "learning_rate": 4.669106011876301e-05, + "loss": 1.2627, + "step": 11075 + }, + { + "epoch": 3.32, + "grad_norm": 1.5687687397003174, + "learning_rate": 4.6688138431124e-05, + "loss": 1.5126, + "step": 11080 + }, + { + "epoch": 3.32, + "grad_norm": 1.8287721872329712, + "learning_rate": 4.6685215545671634e-05, + "loss": 1.1865, + "step": 11085 + }, + { + "epoch": 3.32, + "grad_norm": 1.737056851387024, + "learning_rate": 4.668229146256735e-05, + "loss": 1.2676, + "step": 11090 + }, + { + "epoch": 3.32, + "grad_norm": 1.2314867973327637, + "learning_rate": 4.667936618197266e-05, + "loss": 1.2986, + "step": 11095 + }, + { + "epoch": 3.32, + "grad_norm": 2.5323379039764404, + "learning_rate": 4.66764397040491e-05, + "loss": 1.2406, + "step": 11100 + }, + { + "epoch": 3.32, + "grad_norm": 2.0463790893554688, + "learning_rate": 4.667351202895831e-05, + "loss": 1.3175, + "step": 11105 + }, + { + "epoch": 3.32, + "grad_norm": 1.8490899801254272, + "learning_rate": 4.6670583156861984e-05, + "loss": 1.273, + "step": 11110 + }, + { + "epoch": 3.33, + "grad_norm": 2.06270694732666, + "learning_rate": 4.666765308792187e-05, + "loss": 1.2421, + "step": 11115 + }, + { + "epoch": 3.33, + "grad_norm": 2.100942611694336, + "learning_rate": 4.66647218222998e-05, + "loss": 1.3169, + "step": 11120 + }, + { + "epoch": 3.33, + "grad_norm": 2.400592565536499, + "learning_rate": 4.666178936015767e-05, + "loss": 1.4987, + "step": 11125 + }, + { + "epoch": 3.33, + "grad_norm": 1.5925202369689941, + "learning_rate": 4.665885570165742e-05, + "loss": 1.2487, + "step": 11130 + }, + { + "epoch": 3.33, + "grad_norm": 0.8102724552154541, + "learning_rate": 4.665592084696108e-05, + "loss": 1.2773, + "step": 11135 + }, + { + "epoch": 3.33, + "grad_norm": 0.9131859540939331, + "learning_rate": 4.665298479623075e-05, + "loss": 1.0861, + "step": 11140 + }, + { + "epoch": 3.33, + "grad_norm": 2.433790445327759, + "learning_rate": 4.665004754962857e-05, + "loss": 1.1546, + "step": 11145 + }, + { + "epoch": 3.34, + "grad_norm": 1.100435495376587, + "learning_rate": 4.664710910731677e-05, + "loss": 1.2528, + "step": 11150 + }, + { + "epoch": 3.34, + "grad_norm": 3.4649596214294434, + "learning_rate": 4.6644169469457635e-05, + "loss": 1.3513, + "step": 11155 + }, + { + "epoch": 3.34, + "grad_norm": 1.005937933921814, + "learning_rate": 4.664122863621352e-05, + "loss": 1.2872, + "step": 11160 + }, + { + "epoch": 3.34, + "grad_norm": 1.2074639797210693, + "learning_rate": 4.663828660774684e-05, + "loss": 1.341, + "step": 11165 + }, + { + "epoch": 3.34, + "grad_norm": 2.826364040374756, + "learning_rate": 4.663534338422009e-05, + "loss": 1.2848, + "step": 11170 + }, + { + "epoch": 3.34, + "grad_norm": 2.759699821472168, + "learning_rate": 4.663239896579581e-05, + "loss": 1.4064, + "step": 11175 + }, + { + "epoch": 3.34, + "grad_norm": 2.0414791107177734, + "learning_rate": 4.662945335263662e-05, + "loss": 1.3405, + "step": 11180 + }, + { + "epoch": 3.35, + "grad_norm": 1.7230271100997925, + "learning_rate": 4.6626506544905194e-05, + "loss": 1.3063, + "step": 11185 + }, + { + "epoch": 3.35, + "grad_norm": 0.979017972946167, + "learning_rate": 4.66235585427643e-05, + "loss": 1.2698, + "step": 11190 + }, + { + "epoch": 3.35, + "grad_norm": 1.1741644144058228, + "learning_rate": 4.662060934637674e-05, + "loss": 1.1668, + "step": 11195 + }, + { + "epoch": 3.35, + "grad_norm": 1.050943374633789, + "learning_rate": 4.661765895590541e-05, + "loss": 1.3493, + "step": 11200 + }, + { + "epoch": 3.35, + "grad_norm": 3.3978419303894043, + "learning_rate": 4.661470737151323e-05, + "loss": 1.2133, + "step": 11205 + }, + { + "epoch": 3.35, + "grad_norm": 1.509317398071289, + "learning_rate": 4.661175459336324e-05, + "loss": 1.3368, + "step": 11210 + }, + { + "epoch": 3.36, + "grad_norm": 2.1096768379211426, + "learning_rate": 4.6608800621618506e-05, + "loss": 1.3254, + "step": 11215 + }, + { + "epoch": 3.36, + "grad_norm": 1.4031864404678345, + "learning_rate": 4.660584545644218e-05, + "loss": 1.4675, + "step": 11220 + }, + { + "epoch": 3.36, + "grad_norm": 1.8638592958450317, + "learning_rate": 4.660288909799746e-05, + "loss": 1.4674, + "step": 11225 + }, + { + "epoch": 3.36, + "grad_norm": 1.681207299232483, + "learning_rate": 4.659993154644763e-05, + "loss": 1.1908, + "step": 11230 + }, + { + "epoch": 3.36, + "grad_norm": 1.484300136566162, + "learning_rate": 4.659697280195604e-05, + "loss": 1.1835, + "step": 11235 + }, + { + "epoch": 3.36, + "grad_norm": 1.0395090579986572, + "learning_rate": 4.6594012864686084e-05, + "loss": 1.2898, + "step": 11240 + }, + { + "epoch": 3.36, + "grad_norm": 2.5014450550079346, + "learning_rate": 4.6591051734801246e-05, + "loss": 1.2174, + "step": 11245 + }, + { + "epoch": 3.37, + "grad_norm": 2.0165107250213623, + "learning_rate": 4.658808941246506e-05, + "loss": 1.2818, + "step": 11250 + }, + { + "epoch": 3.37, + "grad_norm": 1.2676451206207275, + "learning_rate": 4.658512589784114e-05, + "loss": 1.3866, + "step": 11255 + }, + { + "epoch": 3.37, + "grad_norm": 1.7123560905456543, + "learning_rate": 4.658216119109315e-05, + "loss": 1.4062, + "step": 11260 + }, + { + "epoch": 3.37, + "grad_norm": 2.4512739181518555, + "learning_rate": 4.6579195292384825e-05, + "loss": 1.2707, + "step": 11265 + }, + { + "epoch": 3.37, + "grad_norm": 1.6923884153366089, + "learning_rate": 4.657622820187998e-05, + "loss": 1.3526, + "step": 11270 + }, + { + "epoch": 3.37, + "grad_norm": 2.5449416637420654, + "learning_rate": 4.6573259919742484e-05, + "loss": 1.1957, + "step": 11275 + }, + { + "epoch": 3.37, + "grad_norm": 1.0603396892547607, + "learning_rate": 4.657029044613626e-05, + "loss": 1.4085, + "step": 11280 + }, + { + "epoch": 3.38, + "grad_norm": 2.8096063137054443, + "learning_rate": 4.6567319781225313e-05, + "loss": 1.4052, + "step": 11285 + }, + { + "epoch": 3.38, + "grad_norm": 2.0202414989471436, + "learning_rate": 4.656434792517372e-05, + "loss": 1.3356, + "step": 11290 + }, + { + "epoch": 3.38, + "grad_norm": 1.3925175666809082, + "learning_rate": 4.6561374878145606e-05, + "loss": 1.2098, + "step": 11295 + }, + { + "epoch": 3.38, + "grad_norm": 1.2433990240097046, + "learning_rate": 4.655840064030517e-05, + "loss": 1.3263, + "step": 11300 + }, + { + "epoch": 3.38, + "grad_norm": 1.0137583017349243, + "learning_rate": 4.6555425211816675e-05, + "loss": 1.2833, + "step": 11305 + }, + { + "epoch": 3.38, + "grad_norm": 2.3041789531707764, + "learning_rate": 4.655244859284444e-05, + "loss": 1.4461, + "step": 11310 + }, + { + "epoch": 3.39, + "grad_norm": 1.4320485591888428, + "learning_rate": 4.6549470783552886e-05, + "loss": 1.3514, + "step": 11315 + }, + { + "epoch": 3.39, + "grad_norm": 1.6517833471298218, + "learning_rate": 4.654649178410645e-05, + "loss": 1.3607, + "step": 11320 + }, + { + "epoch": 3.39, + "grad_norm": 2.3063910007476807, + "learning_rate": 4.6543511594669675e-05, + "loss": 1.2331, + "step": 11325 + }, + { + "epoch": 3.39, + "grad_norm": 1.5468136072158813, + "learning_rate": 4.654053021540714e-05, + "loss": 1.207, + "step": 11330 + }, + { + "epoch": 3.39, + "grad_norm": 0.7770300507545471, + "learning_rate": 4.653754764648352e-05, + "loss": 1.2422, + "step": 11335 + }, + { + "epoch": 3.39, + "grad_norm": 1.037409782409668, + "learning_rate": 4.6534563888063534e-05, + "loss": 1.1782, + "step": 11340 + }, + { + "epoch": 3.39, + "grad_norm": 1.4780035018920898, + "learning_rate": 4.653157894031196e-05, + "loss": 1.281, + "step": 11345 + }, + { + "epoch": 3.4, + "grad_norm": 1.644164800643921, + "learning_rate": 4.652859280339366e-05, + "loss": 1.2121, + "step": 11350 + }, + { + "epoch": 3.4, + "grad_norm": 1.3956807851791382, + "learning_rate": 4.6525605477473564e-05, + "loss": 1.2194, + "step": 11355 + }, + { + "epoch": 3.4, + "grad_norm": 1.0469894409179688, + "learning_rate": 4.6522616962716646e-05, + "loss": 1.3257, + "step": 11360 + }, + { + "epoch": 3.4, + "grad_norm": 1.9847218990325928, + "learning_rate": 4.651962725928797e-05, + "loss": 1.4013, + "step": 11365 + }, + { + "epoch": 3.4, + "grad_norm": 1.7343077659606934, + "learning_rate": 4.6516636367352646e-05, + "loss": 1.3517, + "step": 11370 + }, + { + "epoch": 3.4, + "grad_norm": 2.5026791095733643, + "learning_rate": 4.6513644287075866e-05, + "loss": 1.4136, + "step": 11375 + }, + { + "epoch": 3.4, + "grad_norm": 2.037338972091675, + "learning_rate": 4.651065101862286e-05, + "loss": 1.3706, + "step": 11380 + }, + { + "epoch": 3.41, + "grad_norm": 2.3901917934417725, + "learning_rate": 4.650765656215898e-05, + "loss": 1.2601, + "step": 11385 + }, + { + "epoch": 3.41, + "grad_norm": 1.2326328754425049, + "learning_rate": 4.650466091784956e-05, + "loss": 1.4596, + "step": 11390 + }, + { + "epoch": 3.41, + "grad_norm": 1.7991396188735962, + "learning_rate": 4.650166408586009e-05, + "loss": 1.2718, + "step": 11395 + }, + { + "epoch": 3.41, + "grad_norm": 1.9020781517028809, + "learning_rate": 4.649866606635605e-05, + "loss": 1.2791, + "step": 11400 + }, + { + "epoch": 3.41, + "grad_norm": 1.2564764022827148, + "learning_rate": 4.649566685950304e-05, + "loss": 1.2263, + "step": 11405 + }, + { + "epoch": 3.41, + "grad_norm": 1.6249325275421143, + "learning_rate": 4.649266646546668e-05, + "loss": 1.3176, + "step": 11410 + }, + { + "epoch": 3.42, + "grad_norm": 1.1263751983642578, + "learning_rate": 4.64896648844127e-05, + "loss": 1.3471, + "step": 11415 + }, + { + "epoch": 3.42, + "grad_norm": 1.0438363552093506, + "learning_rate": 4.648666211650686e-05, + "loss": 1.3099, + "step": 11420 + }, + { + "epoch": 3.42, + "grad_norm": 1.0559061765670776, + "learning_rate": 4.648365816191501e-05, + "loss": 1.2612, + "step": 11425 + }, + { + "epoch": 3.42, + "grad_norm": 0.788719654083252, + "learning_rate": 4.648065302080305e-05, + "loss": 1.2446, + "step": 11430 + }, + { + "epoch": 3.42, + "grad_norm": 2.290250539779663, + "learning_rate": 4.647764669333695e-05, + "loss": 1.2215, + "step": 11435 + }, + { + "epoch": 3.42, + "grad_norm": 1.0487291812896729, + "learning_rate": 4.647463917968275e-05, + "loss": 1.1949, + "step": 11440 + }, + { + "epoch": 3.42, + "grad_norm": 1.4382762908935547, + "learning_rate": 4.647163048000655e-05, + "loss": 1.2458, + "step": 11445 + }, + { + "epoch": 3.43, + "grad_norm": 2.330712080001831, + "learning_rate": 4.6468620594474515e-05, + "loss": 1.2711, + "step": 11450 + }, + { + "epoch": 3.43, + "grad_norm": 3.3765647411346436, + "learning_rate": 4.6465609523252884e-05, + "loss": 1.359, + "step": 11455 + }, + { + "epoch": 3.43, + "grad_norm": 1.7210214138031006, + "learning_rate": 4.646259726650794e-05, + "loss": 1.2642, + "step": 11460 + }, + { + "epoch": 3.43, + "grad_norm": 1.9674427509307861, + "learning_rate": 4.645958382440607e-05, + "loss": 1.2704, + "step": 11465 + }, + { + "epoch": 3.43, + "grad_norm": 1.1404820680618286, + "learning_rate": 4.645656919711369e-05, + "loss": 1.3052, + "step": 11470 + }, + { + "epoch": 3.43, + "grad_norm": 1.92625892162323, + "learning_rate": 4.645355338479729e-05, + "loss": 1.1419, + "step": 11475 + }, + { + "epoch": 3.43, + "grad_norm": 2.301802396774292, + "learning_rate": 4.645053638762344e-05, + "loss": 1.2296, + "step": 11480 + }, + { + "epoch": 3.44, + "grad_norm": 2.9896774291992188, + "learning_rate": 4.6447518205758765e-05, + "loss": 1.3242, + "step": 11485 + }, + { + "epoch": 3.44, + "grad_norm": 1.1062313318252563, + "learning_rate": 4.6444498839369956e-05, + "loss": 1.3635, + "step": 11490 + }, + { + "epoch": 3.44, + "grad_norm": 1.2494499683380127, + "learning_rate": 4.644147828862375e-05, + "loss": 1.3554, + "step": 11495 + }, + { + "epoch": 3.44, + "grad_norm": 4.519656658172607, + "learning_rate": 4.6438456553687005e-05, + "loss": 1.3204, + "step": 11500 + }, + { + "epoch": 3.44, + "grad_norm": 1.678928256034851, + "learning_rate": 4.6435433634726575e-05, + "loss": 1.2169, + "step": 11505 + }, + { + "epoch": 3.44, + "grad_norm": 1.1864182949066162, + "learning_rate": 4.6432409531909434e-05, + "loss": 1.2554, + "step": 11510 + }, + { + "epoch": 3.45, + "grad_norm": 1.6995583772659302, + "learning_rate": 4.6429384245402585e-05, + "loss": 1.3414, + "step": 11515 + }, + { + "epoch": 3.45, + "grad_norm": 1.1473424434661865, + "learning_rate": 4.642635777537312e-05, + "loss": 1.1982, + "step": 11520 + }, + { + "epoch": 3.45, + "grad_norm": 1.1320407390594482, + "learning_rate": 4.6423330121988196e-05, + "loss": 1.3277, + "step": 11525 + }, + { + "epoch": 3.45, + "grad_norm": 0.7268291115760803, + "learning_rate": 4.6420301285415005e-05, + "loss": 1.4298, + "step": 11530 + }, + { + "epoch": 3.45, + "grad_norm": 1.2748498916625977, + "learning_rate": 4.641727126582085e-05, + "loss": 1.3528, + "step": 11535 + }, + { + "epoch": 3.45, + "grad_norm": 1.8894537687301636, + "learning_rate": 4.641424006337306e-05, + "loss": 1.3222, + "step": 11540 + }, + { + "epoch": 3.45, + "grad_norm": 0.8295863270759583, + "learning_rate": 4.641120767823905e-05, + "loss": 1.2138, + "step": 11545 + }, + { + "epoch": 3.46, + "grad_norm": 2.017294406890869, + "learning_rate": 4.640817411058629e-05, + "loss": 1.3272, + "step": 11550 + }, + { + "epoch": 3.46, + "grad_norm": 2.1451973915100098, + "learning_rate": 4.640513936058233e-05, + "loss": 1.2706, + "step": 11555 + }, + { + "epoch": 3.46, + "grad_norm": 1.6634198427200317, + "learning_rate": 4.640210342839478e-05, + "loss": 1.2066, + "step": 11560 + }, + { + "epoch": 3.46, + "grad_norm": 2.191540002822876, + "learning_rate": 4.6399066314191294e-05, + "loss": 1.2551, + "step": 11565 + }, + { + "epoch": 3.46, + "grad_norm": 1.4875872135162354, + "learning_rate": 4.639602801813963e-05, + "loss": 1.4758, + "step": 11570 + }, + { + "epoch": 3.46, + "grad_norm": 1.266520619392395, + "learning_rate": 4.6392988540407564e-05, + "loss": 1.2998, + "step": 11575 + }, + { + "epoch": 3.46, + "grad_norm": 1.3954042196273804, + "learning_rate": 4.6389947881162984e-05, + "loss": 1.2636, + "step": 11580 + }, + { + "epoch": 3.47, + "grad_norm": 1.5587852001190186, + "learning_rate": 4.6386906040573825e-05, + "loss": 1.2853, + "step": 11585 + }, + { + "epoch": 3.47, + "grad_norm": 2.421710729598999, + "learning_rate": 4.638386301880807e-05, + "loss": 1.3729, + "step": 11590 + }, + { + "epoch": 3.47, + "grad_norm": 1.5814123153686523, + "learning_rate": 4.638081881603378e-05, + "loss": 1.4307, + "step": 11595 + }, + { + "epoch": 3.47, + "grad_norm": 0.9030851125717163, + "learning_rate": 4.63777734324191e-05, + "loss": 1.3591, + "step": 11600 + }, + { + "epoch": 3.47, + "grad_norm": 2.414091110229492, + "learning_rate": 4.637472686813221e-05, + "loss": 1.2515, + "step": 11605 + }, + { + "epoch": 3.47, + "grad_norm": 1.2490521669387817, + "learning_rate": 4.637167912334138e-05, + "loss": 1.3923, + "step": 11610 + }, + { + "epoch": 3.48, + "grad_norm": 1.478395700454712, + "learning_rate": 4.6368630198214916e-05, + "loss": 1.3068, + "step": 11615 + }, + { + "epoch": 3.48, + "grad_norm": 1.7249629497528076, + "learning_rate": 4.636558009292122e-05, + "loss": 1.4022, + "step": 11620 + }, + { + "epoch": 3.48, + "grad_norm": 1.7434816360473633, + "learning_rate": 4.636252880762875e-05, + "loss": 1.2509, + "step": 11625 + }, + { + "epoch": 3.48, + "grad_norm": 1.605729341506958, + "learning_rate": 4.6359476342506015e-05, + "loss": 1.2654, + "step": 11630 + }, + { + "epoch": 3.48, + "grad_norm": 1.8081423044204712, + "learning_rate": 4.63564226977216e-05, + "loss": 1.2407, + "step": 11635 + }, + { + "epoch": 3.48, + "grad_norm": 1.519515037536621, + "learning_rate": 4.6353367873444165e-05, + "loss": 1.427, + "step": 11640 + }, + { + "epoch": 3.48, + "grad_norm": 1.4996379613876343, + "learning_rate": 4.635031186984241e-05, + "loss": 1.2309, + "step": 11645 + }, + { + "epoch": 3.49, + "grad_norm": 1.2170788049697876, + "learning_rate": 4.634725468708513e-05, + "loss": 1.2613, + "step": 11650 + }, + { + "epoch": 3.49, + "grad_norm": 1.4180010557174683, + "learning_rate": 4.634419632534116e-05, + "loss": 1.3331, + "step": 11655 + }, + { + "epoch": 3.49, + "grad_norm": 0.9083313345909119, + "learning_rate": 4.6341136784779415e-05, + "loss": 1.3276, + "step": 11660 + }, + { + "epoch": 3.49, + "grad_norm": 2.1086177825927734, + "learning_rate": 4.633807606556887e-05, + "loss": 1.2769, + "step": 11665 + }, + { + "epoch": 3.49, + "grad_norm": 1.2872865200042725, + "learning_rate": 4.6335014167878557e-05, + "loss": 1.1635, + "step": 11670 + }, + { + "epoch": 3.49, + "grad_norm": 0.8815200924873352, + "learning_rate": 4.633195109187759e-05, + "loss": 1.2593, + "step": 11675 + }, + { + "epoch": 3.49, + "grad_norm": 2.032335042953491, + "learning_rate": 4.632888683773514e-05, + "loss": 1.3319, + "step": 11680 + }, + { + "epoch": 3.5, + "grad_norm": 2.3916192054748535, + "learning_rate": 4.632582140562044e-05, + "loss": 1.2105, + "step": 11685 + }, + { + "epoch": 3.5, + "grad_norm": 1.9669456481933594, + "learning_rate": 4.6322754795702795e-05, + "loss": 1.2198, + "step": 11690 + }, + { + "epoch": 3.5, + "grad_norm": 1.4312375783920288, + "learning_rate": 4.6319687008151555e-05, + "loss": 1.2218, + "step": 11695 + }, + { + "epoch": 3.5, + "grad_norm": 1.3405520915985107, + "learning_rate": 4.6316618043136175e-05, + "loss": 1.1403, + "step": 11700 + }, + { + "epoch": 3.5, + "grad_norm": 1.2970833778381348, + "learning_rate": 4.6313547900826124e-05, + "loss": 1.4566, + "step": 11705 + }, + { + "epoch": 3.5, + "grad_norm": 1.1170778274536133, + "learning_rate": 4.6310476581390985e-05, + "loss": 1.3428, + "step": 11710 + }, + { + "epoch": 3.5, + "grad_norm": 2.171121835708618, + "learning_rate": 4.6307404085000374e-05, + "loss": 1.2992, + "step": 11715 + }, + { + "epoch": 3.51, + "grad_norm": 1.2334961891174316, + "learning_rate": 4.630433041182398e-05, + "loss": 1.1957, + "step": 11720 + }, + { + "epoch": 3.51, + "grad_norm": 2.142889976501465, + "learning_rate": 4.630125556203156e-05, + "loss": 1.3488, + "step": 11725 + }, + { + "epoch": 3.51, + "grad_norm": 1.060781478881836, + "learning_rate": 4.629817953579295e-05, + "loss": 1.0665, + "step": 11730 + }, + { + "epoch": 3.51, + "grad_norm": 1.4236716032028198, + "learning_rate": 4.629510233327802e-05, + "loss": 1.2998, + "step": 11735 + }, + { + "epoch": 3.51, + "grad_norm": 1.210963249206543, + "learning_rate": 4.6292023954656716e-05, + "loss": 1.3734, + "step": 11740 + }, + { + "epoch": 3.51, + "grad_norm": 1.2984216213226318, + "learning_rate": 4.628894440009906e-05, + "loss": 1.3029, + "step": 11745 + }, + { + "epoch": 3.52, + "grad_norm": 1.2461925745010376, + "learning_rate": 4.628586366977513e-05, + "loss": 1.2483, + "step": 11750 + }, + { + "epoch": 3.52, + "grad_norm": 1.2298219203948975, + "learning_rate": 4.628278176385509e-05, + "loss": 1.1885, + "step": 11755 + }, + { + "epoch": 3.52, + "grad_norm": 2.885982036590576, + "learning_rate": 4.627969868250912e-05, + "loss": 1.2902, + "step": 11760 + }, + { + "epoch": 3.52, + "grad_norm": 1.220379114151001, + "learning_rate": 4.6276614425907514e-05, + "loss": 1.2562, + "step": 11765 + }, + { + "epoch": 3.52, + "grad_norm": 1.2900983095169067, + "learning_rate": 4.6273528994220616e-05, + "loss": 1.2722, + "step": 11770 + }, + { + "epoch": 3.52, + "grad_norm": 1.6189454793930054, + "learning_rate": 4.627044238761882e-05, + "loss": 1.2755, + "step": 11775 + }, + { + "epoch": 3.52, + "grad_norm": 2.1183040142059326, + "learning_rate": 4.62673546062726e-05, + "loss": 1.3765, + "step": 11780 + }, + { + "epoch": 3.53, + "grad_norm": 1.5245985984802246, + "learning_rate": 4.6264265650352494e-05, + "loss": 1.3768, + "step": 11785 + }, + { + "epoch": 3.53, + "grad_norm": 1.0661191940307617, + "learning_rate": 4.62611755200291e-05, + "loss": 1.2853, + "step": 11790 + }, + { + "epoch": 3.53, + "grad_norm": 2.844651699066162, + "learning_rate": 4.625808421547307e-05, + "loss": 1.3109, + "step": 11795 + }, + { + "epoch": 3.53, + "grad_norm": 1.5837798118591309, + "learning_rate": 4.6254991736855156e-05, + "loss": 1.192, + "step": 11800 + }, + { + "epoch": 3.53, + "grad_norm": 1.2192292213439941, + "learning_rate": 4.625189808434614e-05, + "loss": 1.207, + "step": 11805 + }, + { + "epoch": 3.53, + "grad_norm": 2.5890302658081055, + "learning_rate": 4.624880325811689e-05, + "loss": 1.3307, + "step": 11810 + }, + { + "epoch": 3.53, + "grad_norm": 1.7649799585342407, + "learning_rate": 4.624570725833831e-05, + "loss": 1.3292, + "step": 11815 + }, + { + "epoch": 3.54, + "grad_norm": 2.073655605316162, + "learning_rate": 4.624261008518141e-05, + "loss": 1.2353, + "step": 11820 + }, + { + "epoch": 3.54, + "grad_norm": 1.5090842247009277, + "learning_rate": 4.623951173881723e-05, + "loss": 1.2454, + "step": 11825 + }, + { + "epoch": 3.54, + "grad_norm": 1.7265774011611938, + "learning_rate": 4.62364122194169e-05, + "loss": 1.2835, + "step": 11830 + }, + { + "epoch": 3.54, + "grad_norm": 2.3008291721343994, + "learning_rate": 4.62333115271516e-05, + "loss": 1.3273, + "step": 11835 + }, + { + "epoch": 3.54, + "grad_norm": 1.107966661453247, + "learning_rate": 4.6230209662192565e-05, + "loss": 1.2891, + "step": 11840 + }, + { + "epoch": 3.54, + "grad_norm": 1.2190146446228027, + "learning_rate": 4.622710662471112e-05, + "loss": 1.4084, + "step": 11845 + }, + { + "epoch": 3.55, + "grad_norm": 1.5428764820098877, + "learning_rate": 4.6224002414878644e-05, + "loss": 1.1754, + "step": 11850 + }, + { + "epoch": 3.55, + "grad_norm": 1.452290654182434, + "learning_rate": 4.6220897032866574e-05, + "loss": 1.2888, + "step": 11855 + }, + { + "epoch": 3.55, + "grad_norm": 1.2167850732803345, + "learning_rate": 4.621779047884642e-05, + "loss": 1.3761, + "step": 11860 + }, + { + "epoch": 3.55, + "grad_norm": 0.8653399348258972, + "learning_rate": 4.6214682752989746e-05, + "loss": 1.3106, + "step": 11865 + }, + { + "epoch": 3.55, + "grad_norm": 1.1357594728469849, + "learning_rate": 4.6211573855468205e-05, + "loss": 1.3966, + "step": 11870 + }, + { + "epoch": 3.55, + "grad_norm": 2.4457480907440186, + "learning_rate": 4.6208463786453485e-05, + "loss": 1.3881, + "step": 11875 + }, + { + "epoch": 3.55, + "grad_norm": 2.0536937713623047, + "learning_rate": 4.6205352546117356e-05, + "loss": 1.3554, + "step": 11880 + }, + { + "epoch": 3.56, + "grad_norm": 1.843001127243042, + "learning_rate": 4.6202240134631644e-05, + "loss": 1.3486, + "step": 11885 + }, + { + "epoch": 3.56, + "grad_norm": 1.340734839439392, + "learning_rate": 4.619912655216825e-05, + "loss": 1.2276, + "step": 11890 + }, + { + "epoch": 3.56, + "grad_norm": 0.9751700162887573, + "learning_rate": 4.619601179889913e-05, + "loss": 1.1469, + "step": 11895 + }, + { + "epoch": 3.56, + "grad_norm": 1.659359335899353, + "learning_rate": 4.619289587499631e-05, + "loss": 1.271, + "step": 11900 + }, + { + "epoch": 3.56, + "grad_norm": 1.9473052024841309, + "learning_rate": 4.618977878063188e-05, + "loss": 1.2939, + "step": 11905 + }, + { + "epoch": 3.56, + "grad_norm": 1.0142465829849243, + "learning_rate": 4.6186660515978e-05, + "loss": 1.2489, + "step": 11910 + }, + { + "epoch": 3.56, + "grad_norm": 1.311125636100769, + "learning_rate": 4.618354108120687e-05, + "loss": 1.2565, + "step": 11915 + }, + { + "epoch": 3.57, + "grad_norm": 1.6890449523925781, + "learning_rate": 4.618042047649079e-05, + "loss": 1.2294, + "step": 11920 + }, + { + "epoch": 3.57, + "grad_norm": 2.259535789489746, + "learning_rate": 4.6177298702002106e-05, + "loss": 1.3299, + "step": 11925 + }, + { + "epoch": 3.57, + "grad_norm": 1.944066047668457, + "learning_rate": 4.6174800440290745e-05, + "loss": 1.4415, + "step": 11930 + }, + { + "epoch": 3.57, + "grad_norm": 1.1252658367156982, + "learning_rate": 4.617167656064589e-05, + "loss": 1.271, + "step": 11935 + }, + { + "epoch": 3.57, + "grad_norm": 1.6652323007583618, + "learning_rate": 4.616855151171134e-05, + "loss": 1.2467, + "step": 11940 + }, + { + "epoch": 3.57, + "grad_norm": 1.1521717309951782, + "learning_rate": 4.61654252936597e-05, + "loss": 1.3624, + "step": 11945 + }, + { + "epoch": 3.58, + "grad_norm": 1.9085320234298706, + "learning_rate": 4.616229790666362e-05, + "loss": 1.2027, + "step": 11950 + }, + { + "epoch": 3.58, + "grad_norm": 1.8030284643173218, + "learning_rate": 4.6159169350895825e-05, + "loss": 1.2746, + "step": 11955 + }, + { + "epoch": 3.58, + "grad_norm": 1.5365478992462158, + "learning_rate": 4.61560396265291e-05, + "loss": 1.4625, + "step": 11960 + }, + { + "epoch": 3.58, + "grad_norm": 2.0654516220092773, + "learning_rate": 4.615290873373629e-05, + "loss": 1.2609, + "step": 11965 + }, + { + "epoch": 3.58, + "grad_norm": 2.4061896800994873, + "learning_rate": 4.614977667269033e-05, + "loss": 1.2647, + "step": 11970 + }, + { + "epoch": 3.58, + "grad_norm": 3.210092544555664, + "learning_rate": 4.614664344356417e-05, + "loss": 1.2957, + "step": 11975 + }, + { + "epoch": 3.58, + "grad_norm": 2.317610025405884, + "learning_rate": 4.614350904653089e-05, + "loss": 1.2691, + "step": 11980 + }, + { + "epoch": 3.59, + "grad_norm": 1.450660228729248, + "learning_rate": 4.614037348176358e-05, + "loss": 1.2757, + "step": 11985 + }, + { + "epoch": 3.59, + "grad_norm": 1.0978591442108154, + "learning_rate": 4.6137236749435413e-05, + "loss": 1.2778, + "step": 11990 + }, + { + "epoch": 3.59, + "grad_norm": 1.411629319190979, + "learning_rate": 4.613409884971963e-05, + "loss": 1.5244, + "step": 11995 + }, + { + "epoch": 3.59, + "grad_norm": 4.375972270965576, + "learning_rate": 4.613095978278954e-05, + "loss": 1.3383, + "step": 12000 + }, + { + "epoch": 3.59, + "grad_norm": 1.4129486083984375, + "learning_rate": 4.6127819548818506e-05, + "loss": 1.4209, + "step": 12005 + }, + { + "epoch": 3.59, + "grad_norm": 1.0286214351654053, + "learning_rate": 4.612467814797996e-05, + "loss": 1.3274, + "step": 12010 + }, + { + "epoch": 3.59, + "grad_norm": 1.4175000190734863, + "learning_rate": 4.61215355804474e-05, + "loss": 1.261, + "step": 12015 + }, + { + "epoch": 3.6, + "grad_norm": 1.794857144355774, + "learning_rate": 4.611839184639437e-05, + "loss": 1.2601, + "step": 12020 + }, + { + "epoch": 3.6, + "grad_norm": 2.3006887435913086, + "learning_rate": 4.611524694599452e-05, + "loss": 1.1114, + "step": 12025 + }, + { + "epoch": 3.6, + "grad_norm": 2.3993430137634277, + "learning_rate": 4.6112100879421524e-05, + "loss": 1.3558, + "step": 12030 + }, + { + "epoch": 3.6, + "grad_norm": 1.9399762153625488, + "learning_rate": 4.610895364684915e-05, + "loss": 1.3497, + "step": 12035 + }, + { + "epoch": 3.6, + "grad_norm": 1.2342357635498047, + "learning_rate": 4.61058052484512e-05, + "loss": 1.3263, + "step": 12040 + }, + { + "epoch": 3.6, + "grad_norm": 2.044203042984009, + "learning_rate": 4.610265568440157e-05, + "loss": 1.3344, + "step": 12045 + }, + { + "epoch": 3.61, + "grad_norm": 0.6386117935180664, + "learning_rate": 4.609950495487419e-05, + "loss": 1.143, + "step": 12050 + }, + { + "epoch": 3.61, + "grad_norm": 1.735022783279419, + "learning_rate": 4.60963530600431e-05, + "loss": 1.4065, + "step": 12055 + }, + { + "epoch": 3.61, + "grad_norm": 1.7873737812042236, + "learning_rate": 4.6093200000082346e-05, + "loss": 1.3379, + "step": 12060 + }, + { + "epoch": 3.61, + "grad_norm": 1.9546738862991333, + "learning_rate": 4.609004577516609e-05, + "loss": 1.1978, + "step": 12065 + }, + { + "epoch": 3.61, + "grad_norm": 1.1246274709701538, + "learning_rate": 4.6086890385468526e-05, + "loss": 1.2626, + "step": 12070 + }, + { + "epoch": 3.61, + "grad_norm": 1.5824791193008423, + "learning_rate": 4.6083733831163925e-05, + "loss": 1.3883, + "step": 12075 + }, + { + "epoch": 3.61, + "grad_norm": 1.4226622581481934, + "learning_rate": 4.608057611242662e-05, + "loss": 1.2575, + "step": 12080 + }, + { + "epoch": 3.62, + "grad_norm": 1.6612082719802856, + "learning_rate": 4.6077417229431e-05, + "loss": 1.107, + "step": 12085 + }, + { + "epoch": 3.62, + "grad_norm": 1.722304105758667, + "learning_rate": 4.6074257182351546e-05, + "loss": 1.1742, + "step": 12090 + }, + { + "epoch": 3.62, + "grad_norm": 2.5315911769866943, + "learning_rate": 4.607109597136277e-05, + "loss": 1.2671, + "step": 12095 + }, + { + "epoch": 3.62, + "grad_norm": 1.4968048334121704, + "learning_rate": 4.606793359663926e-05, + "loss": 1.2445, + "step": 12100 + }, + { + "epoch": 3.62, + "grad_norm": 0.9814133644104004, + "learning_rate": 4.6064770058355675e-05, + "loss": 1.2989, + "step": 12105 + }, + { + "epoch": 3.62, + "grad_norm": 1.6419323682785034, + "learning_rate": 4.6061605356686746e-05, + "loss": 1.4119, + "step": 12110 + }, + { + "epoch": 3.62, + "grad_norm": 1.662709355354309, + "learning_rate": 4.605843949180724e-05, + "loss": 1.4737, + "step": 12115 + }, + { + "epoch": 3.63, + "grad_norm": 1.9931641817092896, + "learning_rate": 4.605527246389201e-05, + "loss": 1.3292, + "step": 12120 + }, + { + "epoch": 3.63, + "grad_norm": 2.7940378189086914, + "learning_rate": 4.605210427311596e-05, + "loss": 1.3846, + "step": 12125 + }, + { + "epoch": 3.63, + "grad_norm": 1.8893239498138428, + "learning_rate": 4.604893491965409e-05, + "loss": 1.2893, + "step": 12130 + }, + { + "epoch": 3.63, + "grad_norm": 2.463541030883789, + "learning_rate": 4.6045764403681415e-05, + "loss": 1.2914, + "step": 12135 + }, + { + "epoch": 3.63, + "grad_norm": 2.0132181644439697, + "learning_rate": 4.604259272537304e-05, + "loss": 1.4236, + "step": 12140 + }, + { + "epoch": 3.63, + "grad_norm": 1.1866495609283447, + "learning_rate": 4.603941988490415e-05, + "loss": 1.3439, + "step": 12145 + }, + { + "epoch": 3.64, + "grad_norm": 1.9764355421066284, + "learning_rate": 4.603624588244997e-05, + "loss": 1.3349, + "step": 12150 + }, + { + "epoch": 3.64, + "grad_norm": 1.9496948719024658, + "learning_rate": 4.603307071818579e-05, + "loss": 1.2836, + "step": 12155 + }, + { + "epoch": 3.64, + "grad_norm": 3.0015974044799805, + "learning_rate": 4.602989439228698e-05, + "loss": 1.367, + "step": 12160 + }, + { + "epoch": 3.64, + "grad_norm": 1.3181309700012207, + "learning_rate": 4.6026716904928965e-05, + "loss": 1.4144, + "step": 12165 + }, + { + "epoch": 3.64, + "grad_norm": 1.3011974096298218, + "learning_rate": 4.602353825628722e-05, + "loss": 1.2887, + "step": 12170 + }, + { + "epoch": 3.64, + "grad_norm": 2.6620357036590576, + "learning_rate": 4.602035844653733e-05, + "loss": 1.3088, + "step": 12175 + }, + { + "epoch": 3.64, + "grad_norm": 1.3964091539382935, + "learning_rate": 4.601717747585488e-05, + "loss": 1.2074, + "step": 12180 + }, + { + "epoch": 3.65, + "grad_norm": 2.161696434020996, + "learning_rate": 4.601399534441556e-05, + "loss": 1.3595, + "step": 12185 + }, + { + "epoch": 3.65, + "grad_norm": 1.0250306129455566, + "learning_rate": 4.601081205239512e-05, + "loss": 1.3982, + "step": 12190 + }, + { + "epoch": 3.65, + "grad_norm": 1.1172339916229248, + "learning_rate": 4.6007627599969385e-05, + "loss": 1.1501, + "step": 12195 + }, + { + "epoch": 3.65, + "grad_norm": 1.6155879497528076, + "learning_rate": 4.60044419873142e-05, + "loss": 1.3539, + "step": 12200 + }, + { + "epoch": 3.65, + "grad_norm": 1.6924985647201538, + "learning_rate": 4.600125521460552e-05, + "loss": 1.2307, + "step": 12205 + }, + { + "epoch": 3.65, + "grad_norm": 1.349064826965332, + "learning_rate": 4.599806728201935e-05, + "loss": 1.1824, + "step": 12210 + }, + { + "epoch": 3.65, + "grad_norm": 2.545496940612793, + "learning_rate": 4.599487818973174e-05, + "loss": 1.3419, + "step": 12215 + }, + { + "epoch": 3.66, + "grad_norm": 2.050647258758545, + "learning_rate": 4.599168793791884e-05, + "loss": 1.21, + "step": 12220 + }, + { + "epoch": 3.66, + "grad_norm": 2.1457366943359375, + "learning_rate": 4.598849652675683e-05, + "loss": 1.3711, + "step": 12225 + }, + { + "epoch": 3.66, + "grad_norm": 2.319671630859375, + "learning_rate": 4.598530395642197e-05, + "loss": 1.2425, + "step": 12230 + }, + { + "epoch": 3.66, + "grad_norm": 1.073062777519226, + "learning_rate": 4.598211022709059e-05, + "loss": 1.3835, + "step": 12235 + }, + { + "epoch": 3.66, + "grad_norm": 1.0914404392242432, + "learning_rate": 4.597891533893908e-05, + "loss": 1.5263, + "step": 12240 + }, + { + "epoch": 3.66, + "grad_norm": 0.8850699663162231, + "learning_rate": 4.5975719292143865e-05, + "loss": 1.299, + "step": 12245 + }, + { + "epoch": 3.67, + "grad_norm": 1.4798288345336914, + "learning_rate": 4.5972522086881485e-05, + "loss": 1.3877, + "step": 12250 + }, + { + "epoch": 3.67, + "grad_norm": 0.8555247783660889, + "learning_rate": 4.5969323723328505e-05, + "loss": 1.4655, + "step": 12255 + }, + { + "epoch": 3.67, + "grad_norm": 1.9914946556091309, + "learning_rate": 4.596612420166158e-05, + "loss": 1.2558, + "step": 12260 + }, + { + "epoch": 3.67, + "grad_norm": 1.4527971744537354, + "learning_rate": 4.596292352205741e-05, + "loss": 1.2612, + "step": 12265 + }, + { + "epoch": 3.67, + "grad_norm": 1.6714884042739868, + "learning_rate": 4.595972168469276e-05, + "loss": 1.2867, + "step": 12270 + }, + { + "epoch": 3.67, + "grad_norm": 1.638249397277832, + "learning_rate": 4.595651868974447e-05, + "loss": 1.3167, + "step": 12275 + }, + { + "epoch": 3.67, + "grad_norm": 1.3472903966903687, + "learning_rate": 4.5953314537389426e-05, + "loss": 1.2534, + "step": 12280 + }, + { + "epoch": 3.68, + "grad_norm": 1.781898856163025, + "learning_rate": 4.59501092278046e-05, + "loss": 1.2795, + "step": 12285 + }, + { + "epoch": 3.68, + "grad_norm": 2.2303428649902344, + "learning_rate": 4.594690276116703e-05, + "loss": 1.3091, + "step": 12290 + }, + { + "epoch": 3.68, + "grad_norm": 1.383138656616211, + "learning_rate": 4.594369513765379e-05, + "loss": 1.2663, + "step": 12295 + }, + { + "epoch": 3.68, + "grad_norm": 1.707878828048706, + "learning_rate": 4.594048635744203e-05, + "loss": 1.2528, + "step": 12300 + }, + { + "epoch": 3.68, + "grad_norm": 0.8633110523223877, + "learning_rate": 4.5937276420708985e-05, + "loss": 1.3231, + "step": 12305 + }, + { + "epoch": 3.68, + "grad_norm": 1.4325439929962158, + "learning_rate": 4.593406532763192e-05, + "loss": 1.21, + "step": 12310 + }, + { + "epoch": 3.68, + "grad_norm": 1.455572247505188, + "learning_rate": 4.5930853078388185e-05, + "loss": 1.2322, + "step": 12315 + }, + { + "epoch": 3.69, + "grad_norm": 1.189207911491394, + "learning_rate": 4.59276396731552e-05, + "loss": 1.3804, + "step": 12320 + }, + { + "epoch": 3.69, + "grad_norm": 1.2645961046218872, + "learning_rate": 4.592442511211042e-05, + "loss": 1.3339, + "step": 12325 + }, + { + "epoch": 3.69, + "grad_norm": 1.2472920417785645, + "learning_rate": 4.59212093954314e-05, + "loss": 1.2354, + "step": 12330 + }, + { + "epoch": 3.69, + "grad_norm": 1.2860960960388184, + "learning_rate": 4.5917992523295716e-05, + "loss": 1.2905, + "step": 12335 + }, + { + "epoch": 3.69, + "grad_norm": 1.902571439743042, + "learning_rate": 4.591477449588106e-05, + "loss": 1.2967, + "step": 12340 + }, + { + "epoch": 3.69, + "grad_norm": 2.72182559967041, + "learning_rate": 4.591155531336514e-05, + "loss": 1.3203, + "step": 12345 + }, + { + "epoch": 3.69, + "grad_norm": 4.221126079559326, + "learning_rate": 4.590833497592576e-05, + "loss": 1.3342, + "step": 12350 + }, + { + "epoch": 3.7, + "grad_norm": 2.888122320175171, + "learning_rate": 4.590511348374078e-05, + "loss": 1.2249, + "step": 12355 + }, + { + "epoch": 3.7, + "grad_norm": 1.8218414783477783, + "learning_rate": 4.5901890836988107e-05, + "loss": 1.3318, + "step": 12360 + }, + { + "epoch": 3.7, + "grad_norm": 1.3401216268539429, + "learning_rate": 4.589866703584573e-05, + "loss": 1.355, + "step": 12365 + }, + { + "epoch": 3.7, + "grad_norm": 1.530899167060852, + "learning_rate": 4.5895442080491694e-05, + "loss": 1.1148, + "step": 12370 + }, + { + "epoch": 3.7, + "grad_norm": 1.1697745323181152, + "learning_rate": 4.589221597110411e-05, + "loss": 1.3208, + "step": 12375 + }, + { + "epoch": 3.7, + "grad_norm": 1.9336832761764526, + "learning_rate": 4.588898870786116e-05, + "loss": 1.3834, + "step": 12380 + }, + { + "epoch": 3.71, + "grad_norm": 1.2124789953231812, + "learning_rate": 4.588576029094106e-05, + "loss": 1.2128, + "step": 12385 + }, + { + "epoch": 3.71, + "grad_norm": 1.4189821481704712, + "learning_rate": 4.588253072052214e-05, + "loss": 1.3005, + "step": 12390 + }, + { + "epoch": 3.71, + "grad_norm": 1.8805235624313354, + "learning_rate": 4.5879299996782765e-05, + "loss": 1.3208, + "step": 12395 + }, + { + "epoch": 3.71, + "grad_norm": 2.4041588306427, + "learning_rate": 4.587606811990134e-05, + "loss": 1.3952, + "step": 12400 + }, + { + "epoch": 3.71, + "grad_norm": 1.0374820232391357, + "learning_rate": 4.5872835090056375e-05, + "loss": 1.3175, + "step": 12405 + }, + { + "epoch": 3.71, + "grad_norm": 2.124577760696411, + "learning_rate": 4.586960090742643e-05, + "loss": 1.2484, + "step": 12410 + }, + { + "epoch": 3.71, + "grad_norm": 4.781686305999756, + "learning_rate": 4.586636557219011e-05, + "loss": 1.2985, + "step": 12415 + }, + { + "epoch": 3.72, + "grad_norm": 1.7843397855758667, + "learning_rate": 4.586312908452612e-05, + "loss": 1.2425, + "step": 12420 + }, + { + "epoch": 3.72, + "grad_norm": 3.2184629440307617, + "learning_rate": 4.585989144461319e-05, + "loss": 1.3105, + "step": 12425 + }, + { + "epoch": 3.72, + "grad_norm": 1.4963960647583008, + "learning_rate": 4.585665265263014e-05, + "loss": 1.2972, + "step": 12430 + }, + { + "epoch": 3.72, + "grad_norm": 3.5858564376831055, + "learning_rate": 4.585341270875584e-05, + "loss": 1.3527, + "step": 12435 + }, + { + "epoch": 3.72, + "grad_norm": 2.781733989715576, + "learning_rate": 4.5850171613169235e-05, + "loss": 1.3733, + "step": 12440 + }, + { + "epoch": 3.72, + "grad_norm": 0.917702317237854, + "learning_rate": 4.5846929366049316e-05, + "loss": 1.2823, + "step": 12445 + }, + { + "epoch": 3.72, + "grad_norm": 1.9473000764846802, + "learning_rate": 4.584368596757517e-05, + "loss": 1.2604, + "step": 12450 + }, + { + "epoch": 3.73, + "grad_norm": 1.2440733909606934, + "learning_rate": 4.584044141792591e-05, + "loss": 1.3558, + "step": 12455 + }, + { + "epoch": 3.73, + "grad_norm": 1.291435956954956, + "learning_rate": 4.5837195717280736e-05, + "loss": 1.2116, + "step": 12460 + }, + { + "epoch": 3.73, + "grad_norm": 1.013763427734375, + "learning_rate": 4.583394886581889e-05, + "loss": 1.3011, + "step": 12465 + }, + { + "epoch": 3.73, + "grad_norm": 1.0628958940505981, + "learning_rate": 4.583070086371971e-05, + "loss": 1.4285, + "step": 12470 + }, + { + "epoch": 3.73, + "grad_norm": 1.822394847869873, + "learning_rate": 4.5827451711162575e-05, + "loss": 1.2755, + "step": 12475 + }, + { + "epoch": 3.73, + "grad_norm": 1.7311921119689941, + "learning_rate": 4.5824201408326934e-05, + "loss": 1.4643, + "step": 12480 + }, + { + "epoch": 3.74, + "grad_norm": 1.9452160596847534, + "learning_rate": 4.582094995539229e-05, + "loss": 1.2745, + "step": 12485 + }, + { + "epoch": 3.74, + "grad_norm": 4.922383785247803, + "learning_rate": 4.581769735253822e-05, + "loss": 1.3349, + "step": 12490 + }, + { + "epoch": 3.74, + "grad_norm": 2.8117499351501465, + "learning_rate": 4.581444359994437e-05, + "loss": 1.4012, + "step": 12495 + }, + { + "epoch": 3.74, + "grad_norm": 1.1426706314086914, + "learning_rate": 4.581118869779043e-05, + "loss": 1.2974, + "step": 12500 + }, + { + "epoch": 3.74, + "grad_norm": 1.857204556465149, + "learning_rate": 4.580793264625618e-05, + "loss": 1.2929, + "step": 12505 + }, + { + "epoch": 3.74, + "grad_norm": 1.1504136323928833, + "learning_rate": 4.580467544552143e-05, + "loss": 1.3019, + "step": 12510 + }, + { + "epoch": 3.74, + "grad_norm": 3.0354089736938477, + "learning_rate": 4.580141709576608e-05, + "loss": 1.3482, + "step": 12515 + }, + { + "epoch": 3.75, + "grad_norm": 1.1074090003967285, + "learning_rate": 4.579815759717009e-05, + "loss": 1.4458, + "step": 12520 + }, + { + "epoch": 3.75, + "grad_norm": 1.3502519130706787, + "learning_rate": 4.579489694991347e-05, + "loss": 1.1843, + "step": 12525 + }, + { + "epoch": 3.75, + "grad_norm": 0.9303249716758728, + "learning_rate": 4.579163515417631e-05, + "loss": 1.3647, + "step": 12530 + }, + { + "epoch": 3.75, + "grad_norm": 0.86869877576828, + "learning_rate": 4.578837221013875e-05, + "loss": 1.2498, + "step": 12535 + }, + { + "epoch": 3.75, + "grad_norm": 1.8906731605529785, + "learning_rate": 4.5785108117981e-05, + "loss": 1.3277, + "step": 12540 + }, + { + "epoch": 3.75, + "grad_norm": 1.9639962911605835, + "learning_rate": 4.578184287788334e-05, + "loss": 1.4021, + "step": 12545 + }, + { + "epoch": 3.75, + "grad_norm": 2.0274109840393066, + "learning_rate": 4.5778576490026094e-05, + "loss": 1.2441, + "step": 12550 + }, + { + "epoch": 3.76, + "grad_norm": 2.2819442749023438, + "learning_rate": 4.577530895458967e-05, + "loss": 1.4074, + "step": 12555 + }, + { + "epoch": 3.76, + "grad_norm": 1.8426662683486938, + "learning_rate": 4.577204027175453e-05, + "loss": 1.2962, + "step": 12560 + }, + { + "epoch": 3.76, + "grad_norm": 1.2728874683380127, + "learning_rate": 4.576877044170118e-05, + "loss": 1.2356, + "step": 12565 + }, + { + "epoch": 3.76, + "grad_norm": 1.636061668395996, + "learning_rate": 4.576549946461024e-05, + "loss": 1.4087, + "step": 12570 + }, + { + "epoch": 3.76, + "grad_norm": 1.0893962383270264, + "learning_rate": 4.576222734066235e-05, + "loss": 1.1445, + "step": 12575 + }, + { + "epoch": 3.76, + "grad_norm": 1.898051381111145, + "learning_rate": 4.575895407003822e-05, + "loss": 1.3225, + "step": 12580 + }, + { + "epoch": 3.77, + "grad_norm": 1.7080368995666504, + "learning_rate": 4.575567965291864e-05, + "loss": 1.3176, + "step": 12585 + }, + { + "epoch": 3.77, + "grad_norm": 2.100494861602783, + "learning_rate": 4.575240408948443e-05, + "loss": 1.4143, + "step": 12590 + }, + { + "epoch": 3.77, + "grad_norm": 1.711904764175415, + "learning_rate": 4.5749127379916536e-05, + "loss": 1.4185, + "step": 12595 + }, + { + "epoch": 3.77, + "grad_norm": 3.1518118381500244, + "learning_rate": 4.57458495243959e-05, + "loss": 1.3182, + "step": 12600 + }, + { + "epoch": 3.77, + "grad_norm": 2.016057252883911, + "learning_rate": 4.574257052310355e-05, + "loss": 1.3566, + "step": 12605 + }, + { + "epoch": 3.77, + "grad_norm": 1.0968761444091797, + "learning_rate": 4.573929037622059e-05, + "loss": 1.2916, + "step": 12610 + }, + { + "epoch": 3.77, + "grad_norm": 1.1773560047149658, + "learning_rate": 4.573600908392819e-05, + "loss": 1.2659, + "step": 12615 + }, + { + "epoch": 3.78, + "grad_norm": 1.2752783298492432, + "learning_rate": 4.573272664640755e-05, + "loss": 1.1999, + "step": 12620 + }, + { + "epoch": 3.78, + "grad_norm": 1.5268641710281372, + "learning_rate": 4.5729443063839986e-05, + "loss": 1.4445, + "step": 12625 + }, + { + "epoch": 3.78, + "grad_norm": 1.8288689851760864, + "learning_rate": 4.572615833640681e-05, + "loss": 1.2565, + "step": 12630 + }, + { + "epoch": 3.78, + "grad_norm": 2.030205726623535, + "learning_rate": 4.572287246428946e-05, + "loss": 1.2579, + "step": 12635 + }, + { + "epoch": 3.78, + "grad_norm": 2.9597010612487793, + "learning_rate": 4.57195854476694e-05, + "loss": 1.1755, + "step": 12640 + }, + { + "epoch": 3.78, + "grad_norm": 1.5973179340362549, + "learning_rate": 4.571629728672818e-05, + "loss": 1.1414, + "step": 12645 + }, + { + "epoch": 3.78, + "grad_norm": 1.202883005142212, + "learning_rate": 4.5713007981647394e-05, + "loss": 1.3216, + "step": 12650 + }, + { + "epoch": 3.79, + "grad_norm": 1.9232937097549438, + "learning_rate": 4.57097175326087e-05, + "loss": 1.306, + "step": 12655 + }, + { + "epoch": 3.79, + "grad_norm": 1.9437748193740845, + "learning_rate": 4.570642593979384e-05, + "loss": 1.2824, + "step": 12660 + }, + { + "epoch": 3.79, + "grad_norm": 1.9514751434326172, + "learning_rate": 4.5703133203384594e-05, + "loss": 1.404, + "step": 12665 + }, + { + "epoch": 3.79, + "grad_norm": 1.337649941444397, + "learning_rate": 4.5699839323562824e-05, + "loss": 1.4712, + "step": 12670 + }, + { + "epoch": 3.79, + "grad_norm": 0.9567248225212097, + "learning_rate": 4.569654430051045e-05, + "loss": 1.2779, + "step": 12675 + }, + { + "epoch": 3.79, + "grad_norm": 1.1744178533554077, + "learning_rate": 4.5693248134409434e-05, + "loss": 1.3132, + "step": 12680 + }, + { + "epoch": 3.8, + "grad_norm": 2.404231548309326, + "learning_rate": 4.568995082544184e-05, + "loss": 1.3972, + "step": 12685 + }, + { + "epoch": 3.8, + "grad_norm": 1.4296953678131104, + "learning_rate": 4.5686652373789764e-05, + "loss": 1.3201, + "step": 12690 + }, + { + "epoch": 3.8, + "grad_norm": 0.6986697912216187, + "learning_rate": 4.568335277963538e-05, + "loss": 1.2344, + "step": 12695 + }, + { + "epoch": 3.8, + "grad_norm": 1.2852948904037476, + "learning_rate": 4.568005204316093e-05, + "loss": 1.3389, + "step": 12700 + }, + { + "epoch": 3.8, + "grad_norm": 1.1272681951522827, + "learning_rate": 4.567675016454869e-05, + "loss": 1.3613, + "step": 12705 + }, + { + "epoch": 3.8, + "grad_norm": 3.3429181575775146, + "learning_rate": 4.567344714398104e-05, + "loss": 1.3005, + "step": 12710 + }, + { + "epoch": 3.8, + "grad_norm": 1.5739295482635498, + "learning_rate": 4.5670142981640384e-05, + "loss": 1.0753, + "step": 12715 + }, + { + "epoch": 3.81, + "grad_norm": 1.3423209190368652, + "learning_rate": 4.566683767770923e-05, + "loss": 1.1526, + "step": 12720 + }, + { + "epoch": 3.81, + "grad_norm": 1.0714255571365356, + "learning_rate": 4.5663531232370105e-05, + "loss": 1.4233, + "step": 12725 + }, + { + "epoch": 3.81, + "grad_norm": 0.9389391541481018, + "learning_rate": 4.5660223645805624e-05, + "loss": 1.3871, + "step": 12730 + }, + { + "epoch": 3.81, + "grad_norm": 1.9636523723602295, + "learning_rate": 4.5656914918198465e-05, + "loss": 1.2896, + "step": 12735 + }, + { + "epoch": 3.81, + "grad_norm": 2.509476661682129, + "learning_rate": 4.565360504973138e-05, + "loss": 1.2616, + "step": 12740 + }, + { + "epoch": 3.81, + "grad_norm": 0.8651650547981262, + "learning_rate": 4.565029404058715e-05, + "loss": 1.4291, + "step": 12745 + }, + { + "epoch": 3.81, + "grad_norm": 1.635857105255127, + "learning_rate": 4.564698189094864e-05, + "loss": 1.2209, + "step": 12750 + }, + { + "epoch": 3.82, + "grad_norm": 2.29955792427063, + "learning_rate": 4.564366860099879e-05, + "loss": 1.3041, + "step": 12755 + }, + { + "epoch": 3.82, + "grad_norm": 1.347915768623352, + "learning_rate": 4.5640354170920575e-05, + "loss": 1.2319, + "step": 12760 + }, + { + "epoch": 3.82, + "grad_norm": 3.2308144569396973, + "learning_rate": 4.563703860089705e-05, + "loss": 1.3922, + "step": 12765 + }, + { + "epoch": 3.82, + "grad_norm": 1.212275505065918, + "learning_rate": 4.5633721891111336e-05, + "loss": 1.4356, + "step": 12770 + }, + { + "epoch": 3.82, + "grad_norm": 1.5803014039993286, + "learning_rate": 4.563040404174662e-05, + "loss": 1.2631, + "step": 12775 + }, + { + "epoch": 3.82, + "grad_norm": 2.851174831390381, + "learning_rate": 4.562708505298612e-05, + "loss": 1.3305, + "step": 12780 + }, + { + "epoch": 3.83, + "grad_norm": 1.705056071281433, + "learning_rate": 4.5623764925013155e-05, + "loss": 1.1758, + "step": 12785 + }, + { + "epoch": 3.83, + "grad_norm": 1.0315181016921997, + "learning_rate": 4.56204436580111e-05, + "loss": 1.1893, + "step": 12790 + }, + { + "epoch": 3.83, + "grad_norm": 1.8870512247085571, + "learning_rate": 4.561712125216337e-05, + "loss": 1.4365, + "step": 12795 + }, + { + "epoch": 3.83, + "grad_norm": 2.1169257164001465, + "learning_rate": 4.561379770765346e-05, + "loss": 1.3076, + "step": 12800 + }, + { + "epoch": 3.83, + "grad_norm": 2.4839279651641846, + "learning_rate": 4.5610473024664935e-05, + "loss": 1.2516, + "step": 12805 + }, + { + "epoch": 3.83, + "grad_norm": 1.1137679815292358, + "learning_rate": 4.560714720338141e-05, + "loss": 1.3052, + "step": 12810 + }, + { + "epoch": 3.83, + "grad_norm": 1.6442543268203735, + "learning_rate": 4.560382024398655e-05, + "loss": 1.3419, + "step": 12815 + }, + { + "epoch": 3.84, + "grad_norm": 1.8083311319351196, + "learning_rate": 4.560049214666413e-05, + "loss": 1.304, + "step": 12820 + }, + { + "epoch": 3.84, + "grad_norm": 1.2691807746887207, + "learning_rate": 4.559716291159793e-05, + "loss": 1.2481, + "step": 12825 + }, + { + "epoch": 3.84, + "grad_norm": 1.8940715789794922, + "learning_rate": 4.5593832538971846e-05, + "loss": 1.2207, + "step": 12830 + }, + { + "epoch": 3.84, + "grad_norm": 1.9223408699035645, + "learning_rate": 4.5590501028969787e-05, + "loss": 1.313, + "step": 12835 + }, + { + "epoch": 3.84, + "grad_norm": 1.92839777469635, + "learning_rate": 4.5587168381775756e-05, + "loss": 1.2413, + "step": 12840 + }, + { + "epoch": 3.84, + "grad_norm": 1.7354521751403809, + "learning_rate": 4.5583834597573825e-05, + "loss": 1.2551, + "step": 12845 + }, + { + "epoch": 3.84, + "grad_norm": 1.491490125656128, + "learning_rate": 4.55804996765481e-05, + "loss": 1.3245, + "step": 12850 + }, + { + "epoch": 3.85, + "grad_norm": 0.8341639041900635, + "learning_rate": 4.5577163618882766e-05, + "loss": 1.3094, + "step": 12855 + }, + { + "epoch": 3.85, + "grad_norm": 0.9003182053565979, + "learning_rate": 4.557382642476208e-05, + "loss": 1.3996, + "step": 12860 + }, + { + "epoch": 3.85, + "grad_norm": 2.012786388397217, + "learning_rate": 4.557048809437035e-05, + "loss": 1.4447, + "step": 12865 + }, + { + "epoch": 3.85, + "grad_norm": 1.0975584983825684, + "learning_rate": 4.556714862789193e-05, + "loss": 1.0958, + "step": 12870 + }, + { + "epoch": 3.85, + "grad_norm": 1.1538339853286743, + "learning_rate": 4.556380802551128e-05, + "loss": 1.268, + "step": 12875 + }, + { + "epoch": 3.85, + "grad_norm": 2.0543277263641357, + "learning_rate": 4.556046628741288e-05, + "loss": 1.4274, + "step": 12880 + }, + { + "epoch": 3.86, + "grad_norm": 1.767110824584961, + "learning_rate": 4.555712341378131e-05, + "loss": 1.3229, + "step": 12885 + }, + { + "epoch": 3.86, + "grad_norm": 1.8329064846038818, + "learning_rate": 4.555377940480118e-05, + "loss": 1.2361, + "step": 12890 + }, + { + "epoch": 3.86, + "grad_norm": 0.9706895351409912, + "learning_rate": 4.5550434260657174e-05, + "loss": 1.0816, + "step": 12895 + }, + { + "epoch": 3.86, + "grad_norm": 1.0541021823883057, + "learning_rate": 4.554708798153404e-05, + "loss": 1.1388, + "step": 12900 + }, + { + "epoch": 3.86, + "grad_norm": 1.4227367639541626, + "learning_rate": 4.55437405676166e-05, + "loss": 1.3312, + "step": 12905 + }, + { + "epoch": 3.86, + "grad_norm": 1.4533671140670776, + "learning_rate": 4.554039201908972e-05, + "loss": 1.2984, + "step": 12910 + }, + { + "epoch": 3.86, + "grad_norm": 1.0998986959457397, + "learning_rate": 4.553704233613833e-05, + "loss": 1.2591, + "step": 12915 + }, + { + "epoch": 3.87, + "grad_norm": 1.8056262731552124, + "learning_rate": 4.553369151894746e-05, + "loss": 1.113, + "step": 12920 + }, + { + "epoch": 3.87, + "grad_norm": 0.9125102162361145, + "learning_rate": 4.553033956770214e-05, + "loss": 1.3231, + "step": 12925 + }, + { + "epoch": 3.87, + "grad_norm": 1.3653115034103394, + "learning_rate": 4.55269864825875e-05, + "loss": 1.4139, + "step": 12930 + }, + { + "epoch": 3.87, + "grad_norm": 1.144234299659729, + "learning_rate": 4.552363226378874e-05, + "loss": 1.2501, + "step": 12935 + }, + { + "epoch": 3.87, + "grad_norm": 2.3261189460754395, + "learning_rate": 4.5520276911491104e-05, + "loss": 1.2052, + "step": 12940 + }, + { + "epoch": 3.87, + "grad_norm": 1.1465058326721191, + "learning_rate": 4.5516920425879905e-05, + "loss": 1.3006, + "step": 12945 + }, + { + "epoch": 3.87, + "grad_norm": 1.5299936532974243, + "learning_rate": 4.5513562807140515e-05, + "loss": 1.3346, + "step": 12950 + }, + { + "epoch": 3.88, + "grad_norm": 1.8486464023590088, + "learning_rate": 4.551020405545837e-05, + "loss": 1.3022, + "step": 12955 + }, + { + "epoch": 3.88, + "grad_norm": 1.537140130996704, + "learning_rate": 4.550684417101898e-05, + "loss": 1.3352, + "step": 12960 + }, + { + "epoch": 3.88, + "grad_norm": 1.5041508674621582, + "learning_rate": 4.55034831540079e-05, + "loss": 1.3945, + "step": 12965 + }, + { + "epoch": 3.88, + "grad_norm": 2.660659074783325, + "learning_rate": 4.550012100461075e-05, + "loss": 1.3366, + "step": 12970 + }, + { + "epoch": 3.88, + "grad_norm": 1.6128523349761963, + "learning_rate": 4.549675772301323e-05, + "loss": 1.3376, + "step": 12975 + }, + { + "epoch": 3.88, + "grad_norm": 1.45650315284729, + "learning_rate": 4.549339330940109e-05, + "loss": 1.5226, + "step": 12980 + }, + { + "epoch": 3.88, + "grad_norm": 1.1668511629104614, + "learning_rate": 4.5490027763960144e-05, + "loss": 1.2782, + "step": 12985 + }, + { + "epoch": 3.89, + "grad_norm": 1.2220723628997803, + "learning_rate": 4.548666108687625e-05, + "loss": 1.3738, + "step": 12990 + }, + { + "epoch": 3.89, + "grad_norm": 2.1042776107788086, + "learning_rate": 4.548329327833537e-05, + "loss": 1.3442, + "step": 12995 + }, + { + "epoch": 3.89, + "grad_norm": 1.7469139099121094, + "learning_rate": 4.547992433852349e-05, + "loss": 1.2097, + "step": 13000 + }, + { + "epoch": 3.89, + "grad_norm": 0.8937451839447021, + "learning_rate": 4.547655426762668e-05, + "loss": 1.3553, + "step": 13005 + }, + { + "epoch": 3.89, + "grad_norm": 2.8568267822265625, + "learning_rate": 4.5473183065831045e-05, + "loss": 1.4715, + "step": 13010 + }, + { + "epoch": 3.89, + "grad_norm": 1.3825722932815552, + "learning_rate": 4.546981073332281e-05, + "loss": 1.2806, + "step": 13015 + }, + { + "epoch": 3.9, + "grad_norm": 1.2610100507736206, + "learning_rate": 4.54664372702882e-05, + "loss": 1.1896, + "step": 13020 + }, + { + "epoch": 3.9, + "grad_norm": 1.5070395469665527, + "learning_rate": 4.5463062676913527e-05, + "loss": 1.3712, + "step": 13025 + }, + { + "epoch": 3.9, + "grad_norm": 2.2432565689086914, + "learning_rate": 4.545968695338518e-05, + "loss": 1.3497, + "step": 13030 + }, + { + "epoch": 3.9, + "grad_norm": 1.4944096803665161, + "learning_rate": 4.545631009988958e-05, + "loss": 1.2663, + "step": 13035 + }, + { + "epoch": 3.9, + "grad_norm": 2.186475992202759, + "learning_rate": 4.545293211661324e-05, + "loss": 1.3296, + "step": 13040 + }, + { + "epoch": 3.9, + "grad_norm": 1.5511248111724854, + "learning_rate": 4.544955300374273e-05, + "loss": 1.1667, + "step": 13045 + }, + { + "epoch": 3.9, + "grad_norm": 2.8364615440368652, + "learning_rate": 4.544617276146465e-05, + "loss": 1.2472, + "step": 13050 + }, + { + "epoch": 3.91, + "grad_norm": 0.9596519470214844, + "learning_rate": 4.544279138996571e-05, + "loss": 1.3599, + "step": 13055 + }, + { + "epoch": 3.91, + "grad_norm": 1.2000133991241455, + "learning_rate": 4.543940888943264e-05, + "loss": 1.229, + "step": 13060 + }, + { + "epoch": 3.91, + "grad_norm": 1.129855751991272, + "learning_rate": 4.543602526005227e-05, + "loss": 1.2954, + "step": 13065 + }, + { + "epoch": 3.91, + "grad_norm": 1.3284913301467896, + "learning_rate": 4.543264050201146e-05, + "loss": 1.3602, + "step": 13070 + }, + { + "epoch": 3.91, + "grad_norm": 1.7732700109481812, + "learning_rate": 4.5429254615497165e-05, + "loss": 1.3526, + "step": 13075 + }, + { + "epoch": 3.91, + "grad_norm": 1.5674593448638916, + "learning_rate": 4.542586760069637e-05, + "loss": 1.4001, + "step": 13080 + }, + { + "epoch": 3.91, + "grad_norm": 1.350285291671753, + "learning_rate": 4.542247945779613e-05, + "loss": 1.3551, + "step": 13085 + }, + { + "epoch": 3.92, + "grad_norm": 1.2830190658569336, + "learning_rate": 4.5419090186983587e-05, + "loss": 1.3441, + "step": 13090 + }, + { + "epoch": 3.92, + "grad_norm": 1.5413031578063965, + "learning_rate": 4.541569978844591e-05, + "loss": 1.2436, + "step": 13095 + }, + { + "epoch": 3.92, + "grad_norm": 1.2088831663131714, + "learning_rate": 4.541230826237036e-05, + "loss": 1.2817, + "step": 13100 + }, + { + "epoch": 3.92, + "grad_norm": 1.644108533859253, + "learning_rate": 4.540891560894424e-05, + "loss": 1.3467, + "step": 13105 + }, + { + "epoch": 3.92, + "grad_norm": 1.8455935716629028, + "learning_rate": 4.5405521828354924e-05, + "loss": 1.2956, + "step": 13110 + }, + { + "epoch": 3.92, + "grad_norm": 1.0476539134979248, + "learning_rate": 4.5402126920789854e-05, + "loss": 1.2251, + "step": 13115 + }, + { + "epoch": 3.93, + "grad_norm": 1.2740333080291748, + "learning_rate": 4.539873088643651e-05, + "loss": 1.4182, + "step": 13120 + }, + { + "epoch": 3.93, + "grad_norm": 1.2473523616790771, + "learning_rate": 4.539533372548247e-05, + "loss": 1.4605, + "step": 13125 + }, + { + "epoch": 3.93, + "grad_norm": 1.2522354125976562, + "learning_rate": 4.5391935438115355e-05, + "loss": 1.4085, + "step": 13130 + }, + { + "epoch": 3.93, + "grad_norm": 0.8400856852531433, + "learning_rate": 4.538853602452283e-05, + "loss": 1.4076, + "step": 13135 + }, + { + "epoch": 3.93, + "grad_norm": 3.1586833000183105, + "learning_rate": 4.538513548489265e-05, + "loss": 1.2214, + "step": 13140 + }, + { + "epoch": 3.93, + "grad_norm": 1.3957751989364624, + "learning_rate": 4.538173381941264e-05, + "loss": 1.3048, + "step": 13145 + }, + { + "epoch": 3.93, + "grad_norm": 1.6071856021881104, + "learning_rate": 4.537833102827065e-05, + "loss": 1.2303, + "step": 13150 + }, + { + "epoch": 3.94, + "grad_norm": 1.384070873260498, + "learning_rate": 4.537492711165462e-05, + "loss": 1.3229, + "step": 13155 + }, + { + "epoch": 3.94, + "grad_norm": 2.3858485221862793, + "learning_rate": 4.5371522069752544e-05, + "loss": 1.3097, + "step": 13160 + }, + { + "epoch": 3.94, + "grad_norm": 1.255236029624939, + "learning_rate": 4.536811590275247e-05, + "loss": 1.3769, + "step": 13165 + }, + { + "epoch": 3.94, + "grad_norm": 1.0633465051651, + "learning_rate": 4.5364708610842545e-05, + "loss": 1.2269, + "step": 13170 + }, + { + "epoch": 3.94, + "grad_norm": 1.6456542015075684, + "learning_rate": 4.536130019421092e-05, + "loss": 1.4208, + "step": 13175 + }, + { + "epoch": 3.94, + "grad_norm": 1.0649845600128174, + "learning_rate": 4.535789065304585e-05, + "loss": 1.2724, + "step": 13180 + }, + { + "epoch": 3.94, + "grad_norm": 2.226824998855591, + "learning_rate": 4.535447998753564e-05, + "loss": 1.2443, + "step": 13185 + }, + { + "epoch": 3.95, + "grad_norm": 1.2029057741165161, + "learning_rate": 4.535106819786866e-05, + "loss": 1.3256, + "step": 13190 + }, + { + "epoch": 3.95, + "grad_norm": 0.9885445237159729, + "learning_rate": 4.534765528423333e-05, + "loss": 1.3866, + "step": 13195 + }, + { + "epoch": 3.95, + "grad_norm": 4.255756855010986, + "learning_rate": 4.5344241246818154e-05, + "loss": 1.1524, + "step": 13200 + }, + { + "epoch": 3.95, + "grad_norm": 1.9906721115112305, + "learning_rate": 4.5340826085811684e-05, + "loss": 1.3169, + "step": 13205 + }, + { + "epoch": 3.95, + "grad_norm": 1.9341821670532227, + "learning_rate": 4.533740980140253e-05, + "loss": 1.4293, + "step": 13210 + }, + { + "epoch": 3.95, + "grad_norm": 2.2988078594207764, + "learning_rate": 4.533399239377937e-05, + "loss": 1.2559, + "step": 13215 + }, + { + "epoch": 3.96, + "grad_norm": 1.1931335926055908, + "learning_rate": 4.5330573863130946e-05, + "loss": 1.2239, + "step": 13220 + }, + { + "epoch": 3.96, + "grad_norm": 1.9085408449172974, + "learning_rate": 4.5327154209646065e-05, + "loss": 1.3006, + "step": 13225 + }, + { + "epoch": 3.96, + "grad_norm": 1.42606782913208, + "learning_rate": 4.532373343351358e-05, + "loss": 1.1926, + "step": 13230 + }, + { + "epoch": 3.96, + "grad_norm": 1.9960684776306152, + "learning_rate": 4.5320311534922425e-05, + "loss": 1.4581, + "step": 13235 + }, + { + "epoch": 3.96, + "grad_norm": 1.3302810192108154, + "learning_rate": 4.531688851406159e-05, + "loss": 1.2497, + "step": 13240 + }, + { + "epoch": 3.96, + "grad_norm": 1.0140787363052368, + "learning_rate": 4.531346437112012e-05, + "loss": 1.2587, + "step": 13245 + }, + { + "epoch": 3.96, + "grad_norm": 2.0420432090759277, + "learning_rate": 4.5310039106287115e-05, + "loss": 1.2299, + "step": 13250 + }, + { + "epoch": 3.97, + "grad_norm": 0.9707965850830078, + "learning_rate": 4.530661271975177e-05, + "loss": 1.2625, + "step": 13255 + }, + { + "epoch": 3.97, + "grad_norm": 0.9889094829559326, + "learning_rate": 4.530318521170332e-05, + "loss": 1.445, + "step": 13260 + }, + { + "epoch": 3.97, + "grad_norm": 1.5554451942443848, + "learning_rate": 4.529975658233104e-05, + "loss": 1.1236, + "step": 13265 + }, + { + "epoch": 3.97, + "grad_norm": 1.6578177213668823, + "learning_rate": 4.529632683182432e-05, + "loss": 1.1677, + "step": 13270 + }, + { + "epoch": 3.97, + "grad_norm": 2.4614758491516113, + "learning_rate": 4.529289596037256e-05, + "loss": 1.2574, + "step": 13275 + }, + { + "epoch": 3.97, + "grad_norm": 1.8018752336502075, + "learning_rate": 4.528946396816524e-05, + "loss": 1.3027, + "step": 13280 + }, + { + "epoch": 3.97, + "grad_norm": 2.3157856464385986, + "learning_rate": 4.5286030855391924e-05, + "loss": 1.439, + "step": 13285 + }, + { + "epoch": 3.98, + "grad_norm": 1.605214238166809, + "learning_rate": 4.528259662224221e-05, + "loss": 1.2793, + "step": 13290 + }, + { + "epoch": 3.98, + "grad_norm": 3.1263720989227295, + "learning_rate": 4.527916126890576e-05, + "loss": 1.386, + "step": 13295 + }, + { + "epoch": 3.98, + "grad_norm": 0.8434714078903198, + "learning_rate": 4.527572479557232e-05, + "loss": 1.3058, + "step": 13300 + }, + { + "epoch": 3.98, + "grad_norm": 1.2560151815414429, + "learning_rate": 4.5272287202431674e-05, + "loss": 1.2894, + "step": 13305 + }, + { + "epoch": 3.98, + "grad_norm": 2.4032607078552246, + "learning_rate": 4.526884848967368e-05, + "loss": 1.2046, + "step": 13310 + }, + { + "epoch": 3.98, + "grad_norm": 0.8934091329574585, + "learning_rate": 4.526540865748824e-05, + "loss": 1.3464, + "step": 13315 + }, + { + "epoch": 3.99, + "grad_norm": 1.3520596027374268, + "learning_rate": 4.526196770606536e-05, + "loss": 1.3311, + "step": 13320 + }, + { + "epoch": 3.99, + "grad_norm": 1.0915952920913696, + "learning_rate": 4.5258525635595054e-05, + "loss": 1.3354, + "step": 13325 + }, + { + "epoch": 3.99, + "grad_norm": 1.1796505451202393, + "learning_rate": 4.525508244626743e-05, + "loss": 1.3232, + "step": 13330 + }, + { + "epoch": 3.99, + "grad_norm": 1.1852450370788574, + "learning_rate": 4.525163813827267e-05, + "loss": 1.1876, + "step": 13335 + }, + { + "epoch": 3.99, + "grad_norm": 3.0009610652923584, + "learning_rate": 4.524819271180098e-05, + "loss": 1.1891, + "step": 13340 + }, + { + "epoch": 3.99, + "grad_norm": 0.9333651065826416, + "learning_rate": 4.524474616704265e-05, + "loss": 1.3117, + "step": 13345 + }, + { + "epoch": 3.99, + "grad_norm": 1.0289852619171143, + "learning_rate": 4.524129850418803e-05, + "loss": 1.3284, + "step": 13350 + }, + { + "epoch": 4.0, + "grad_norm": 1.6752545833587646, + "learning_rate": 4.523784972342755e-05, + "loss": 1.4082, + "step": 13355 + }, + { + "epoch": 4.0, + "grad_norm": 1.345482349395752, + "learning_rate": 4.523439982495166e-05, + "loss": 1.2208, + "step": 13360 + }, + { + "epoch": 4.0, + "grad_norm": 1.1788907051086426, + "learning_rate": 4.5230948808950894e-05, + "loss": 1.2674, + "step": 13365 + }, + { + "epoch": 4.0, + "grad_norm": 0.9993130564689636, + "learning_rate": 4.522749667561586e-05, + "loss": 1.1215, + "step": 13370 + }, + { + "epoch": 4.0, + "grad_norm": 1.5533525943756104, + "learning_rate": 4.52240434251372e-05, + "loss": 1.409, + "step": 13375 + }, + { + "epoch": 4.0, + "grad_norm": 0.6221991181373596, + "learning_rate": 4.522058905770564e-05, + "loss": 1.2527, + "step": 13380 + }, + { + "epoch": 4.0, + "grad_norm": 1.9359819889068604, + "learning_rate": 4.521713357351198e-05, + "loss": 1.1879, + "step": 13385 + }, + { + "epoch": 4.01, + "grad_norm": 1.0392394065856934, + "learning_rate": 4.521367697274704e-05, + "loss": 1.1468, + "step": 13390 + }, + { + "epoch": 4.01, + "grad_norm": 1.0424681901931763, + "learning_rate": 4.5210219255601734e-05, + "loss": 1.2535, + "step": 13395 + }, + { + "epoch": 4.01, + "grad_norm": 1.4294004440307617, + "learning_rate": 4.5206760422267025e-05, + "loss": 1.3226, + "step": 13400 + }, + { + "epoch": 4.01, + "grad_norm": 1.275223731994629, + "learning_rate": 4.520330047293394e-05, + "loss": 1.3615, + "step": 13405 + }, + { + "epoch": 4.01, + "grad_norm": 1.919305443763733, + "learning_rate": 4.519983940779357e-05, + "loss": 1.1126, + "step": 13410 + }, + { + "epoch": 4.01, + "grad_norm": 0.8619056344032288, + "learning_rate": 4.519637722703707e-05, + "loss": 1.2955, + "step": 13415 + }, + { + "epoch": 4.02, + "grad_norm": 3.1386544704437256, + "learning_rate": 4.519291393085564e-05, + "loss": 1.0872, + "step": 13420 + }, + { + "epoch": 4.02, + "grad_norm": 1.826619029045105, + "learning_rate": 4.5189449519440575e-05, + "loss": 1.3617, + "step": 13425 + }, + { + "epoch": 4.02, + "grad_norm": 0.8781746029853821, + "learning_rate": 4.5185983992983186e-05, + "loss": 1.1874, + "step": 13430 + }, + { + "epoch": 4.02, + "grad_norm": 1.6987659931182861, + "learning_rate": 4.518251735167489e-05, + "loss": 1.5476, + "step": 13435 + }, + { + "epoch": 4.02, + "grad_norm": 1.178309679031372, + "learning_rate": 4.517904959570714e-05, + "loss": 1.1798, + "step": 13440 + }, + { + "epoch": 4.02, + "grad_norm": 1.655390977859497, + "learning_rate": 4.517558072527146e-05, + "loss": 1.3168, + "step": 13445 + }, + { + "epoch": 4.02, + "grad_norm": 2.087698459625244, + "learning_rate": 4.517211074055942e-05, + "loss": 1.1618, + "step": 13450 + }, + { + "epoch": 4.03, + "grad_norm": 1.096436858177185, + "learning_rate": 4.516863964176268e-05, + "loss": 1.1491, + "step": 13455 + }, + { + "epoch": 4.03, + "grad_norm": 1.1264992952346802, + "learning_rate": 4.5165167429072924e-05, + "loss": 1.2237, + "step": 13460 + }, + { + "epoch": 4.03, + "grad_norm": 4.63450813293457, + "learning_rate": 4.516169410268194e-05, + "loss": 1.2954, + "step": 13465 + }, + { + "epoch": 4.03, + "grad_norm": 1.207798719406128, + "learning_rate": 4.5158219662781546e-05, + "loss": 1.3565, + "step": 13470 + }, + { + "epoch": 4.03, + "grad_norm": 1.3179618120193481, + "learning_rate": 4.515474410956363e-05, + "loss": 1.358, + "step": 13475 + }, + { + "epoch": 4.03, + "grad_norm": 2.3414742946624756, + "learning_rate": 4.5151267443220146e-05, + "loss": 1.2395, + "step": 13480 + }, + { + "epoch": 4.03, + "grad_norm": 1.8621050119400024, + "learning_rate": 4.514778966394312e-05, + "loss": 1.318, + "step": 13485 + }, + { + "epoch": 4.04, + "grad_norm": 1.7734013795852661, + "learning_rate": 4.5144310771924606e-05, + "loss": 1.261, + "step": 13490 + }, + { + "epoch": 4.04, + "grad_norm": 1.8414225578308105, + "learning_rate": 4.514083076735674e-05, + "loss": 1.1795, + "step": 13495 + }, + { + "epoch": 4.04, + "grad_norm": 1.5032846927642822, + "learning_rate": 4.5137349650431735e-05, + "loss": 1.2926, + "step": 13500 + }, + { + "epoch": 4.04, + "grad_norm": 2.254154920578003, + "learning_rate": 4.513386742134183e-05, + "loss": 1.227, + "step": 13505 + }, + { + "epoch": 4.04, + "grad_norm": 1.056298017501831, + "learning_rate": 4.5130384080279364e-05, + "loss": 1.1288, + "step": 13510 + }, + { + "epoch": 4.04, + "grad_norm": 1.4214680194854736, + "learning_rate": 4.512689962743671e-05, + "loss": 1.317, + "step": 13515 + }, + { + "epoch": 4.05, + "grad_norm": 2.0844547748565674, + "learning_rate": 4.512341406300631e-05, + "loss": 1.152, + "step": 13520 + }, + { + "epoch": 4.05, + "grad_norm": 1.6370080709457397, + "learning_rate": 4.511992738718066e-05, + "loss": 1.2748, + "step": 13525 + }, + { + "epoch": 4.05, + "grad_norm": 1.4384336471557617, + "learning_rate": 4.511643960015234e-05, + "loss": 1.3395, + "step": 13530 + }, + { + "epoch": 4.05, + "grad_norm": 1.0424606800079346, + "learning_rate": 4.5112950702113975e-05, + "loss": 1.2358, + "step": 13535 + }, + { + "epoch": 4.05, + "grad_norm": 1.1179556846618652, + "learning_rate": 4.5109460693258245e-05, + "loss": 1.1542, + "step": 13540 + }, + { + "epoch": 4.05, + "grad_norm": 1.2349655628204346, + "learning_rate": 4.51059695737779e-05, + "loss": 1.2682, + "step": 13545 + }, + { + "epoch": 4.05, + "grad_norm": 2.0068607330322266, + "learning_rate": 4.510247734386576e-05, + "loss": 1.2837, + "step": 13550 + }, + { + "epoch": 4.06, + "grad_norm": 3.1415653228759766, + "learning_rate": 4.5098984003714686e-05, + "loss": 1.2172, + "step": 13555 + }, + { + "epoch": 4.06, + "grad_norm": 2.988642454147339, + "learning_rate": 4.5095489553517625e-05, + "loss": 1.1815, + "step": 13560 + }, + { + "epoch": 4.06, + "grad_norm": 2.215256690979004, + "learning_rate": 4.509199399346756e-05, + "loss": 1.2083, + "step": 13565 + }, + { + "epoch": 4.06, + "grad_norm": 2.271375894546509, + "learning_rate": 4.508849732375755e-05, + "loss": 1.4811, + "step": 13570 + }, + { + "epoch": 4.06, + "grad_norm": 2.4488816261291504, + "learning_rate": 4.5084999544580714e-05, + "loss": 1.3544, + "step": 13575 + }, + { + "epoch": 4.06, + "grad_norm": 1.7777202129364014, + "learning_rate": 4.5081500656130225e-05, + "loss": 1.0847, + "step": 13580 + }, + { + "epoch": 4.06, + "grad_norm": 1.3816187381744385, + "learning_rate": 4.507800065859934e-05, + "loss": 1.2128, + "step": 13585 + }, + { + "epoch": 4.07, + "grad_norm": 1.5490256547927856, + "learning_rate": 4.507449955218134e-05, + "loss": 1.2547, + "step": 13590 + }, + { + "epoch": 4.07, + "grad_norm": 1.968616247177124, + "learning_rate": 4.50709973370696e-05, + "loss": 1.2077, + "step": 13595 + }, + { + "epoch": 4.07, + "grad_norm": 1.046992540359497, + "learning_rate": 4.506749401345754e-05, + "loss": 1.3679, + "step": 13600 + }, + { + "epoch": 4.07, + "grad_norm": 2.8304526805877686, + "learning_rate": 4.506398958153864e-05, + "loss": 1.2516, + "step": 13605 + }, + { + "epoch": 4.07, + "grad_norm": 2.89141845703125, + "learning_rate": 4.5060484041506454e-05, + "loss": 1.2331, + "step": 13610 + }, + { + "epoch": 4.07, + "grad_norm": 1.2688446044921875, + "learning_rate": 4.505697739355459e-05, + "loss": 1.2566, + "step": 13615 + }, + { + "epoch": 4.07, + "grad_norm": 2.1087491512298584, + "learning_rate": 4.505346963787671e-05, + "loss": 1.0886, + "step": 13620 + }, + { + "epoch": 4.08, + "grad_norm": 2.119562864303589, + "learning_rate": 4.5049960774666546e-05, + "loss": 1.1783, + "step": 13625 + }, + { + "epoch": 4.08, + "grad_norm": 1.7251992225646973, + "learning_rate": 4.504645080411789e-05, + "loss": 1.1867, + "step": 13630 + }, + { + "epoch": 4.08, + "grad_norm": 2.285336494445801, + "learning_rate": 4.5042939726424585e-05, + "loss": 1.223, + "step": 13635 + }, + { + "epoch": 4.08, + "grad_norm": 1.3887274265289307, + "learning_rate": 4.503942754178056e-05, + "loss": 1.1539, + "step": 13640 + }, + { + "epoch": 4.08, + "grad_norm": 2.003840923309326, + "learning_rate": 4.503591425037978e-05, + "loss": 1.3083, + "step": 13645 + }, + { + "epoch": 4.08, + "grad_norm": 1.603243112564087, + "learning_rate": 4.5032399852416285e-05, + "loss": 1.3067, + "step": 13650 + }, + { + "epoch": 4.09, + "grad_norm": 1.1768264770507812, + "learning_rate": 4.5028884348084166e-05, + "loss": 1.4105, + "step": 13655 + }, + { + "epoch": 4.09, + "grad_norm": 1.2902339696884155, + "learning_rate": 4.502536773757758e-05, + "loss": 1.306, + "step": 13660 + }, + { + "epoch": 4.09, + "grad_norm": 1.6430679559707642, + "learning_rate": 4.502185002109075e-05, + "loss": 1.3452, + "step": 13665 + }, + { + "epoch": 4.09, + "grad_norm": 1.537097454071045, + "learning_rate": 4.5018331198817966e-05, + "loss": 1.2132, + "step": 13670 + }, + { + "epoch": 4.09, + "grad_norm": 0.9888067245483398, + "learning_rate": 4.5014811270953546e-05, + "loss": 1.1079, + "step": 13675 + }, + { + "epoch": 4.09, + "grad_norm": 2.0669586658477783, + "learning_rate": 4.5011290237691905e-05, + "loss": 1.3401, + "step": 13680 + }, + { + "epoch": 4.09, + "grad_norm": 1.0341452360153198, + "learning_rate": 4.500776809922751e-05, + "loss": 1.2518, + "step": 13685 + }, + { + "epoch": 4.1, + "grad_norm": 1.6001540422439575, + "learning_rate": 4.500424485575487e-05, + "loss": 1.2202, + "step": 13690 + }, + { + "epoch": 4.1, + "grad_norm": 1.255566120147705, + "learning_rate": 4.500072050746859e-05, + "loss": 1.187, + "step": 13695 + }, + { + "epoch": 4.1, + "grad_norm": 1.929617166519165, + "learning_rate": 4.4997195054563304e-05, + "loss": 1.1811, + "step": 13700 + }, + { + "epoch": 4.1, + "grad_norm": 5.457527160644531, + "learning_rate": 4.499366849723372e-05, + "loss": 1.296, + "step": 13705 + }, + { + "epoch": 4.1, + "grad_norm": 1.2115625143051147, + "learning_rate": 4.4990140835674606e-05, + "loss": 1.3369, + "step": 13710 + }, + { + "epoch": 4.1, + "grad_norm": 0.7389108538627625, + "learning_rate": 4.498661207008079e-05, + "loss": 1.2963, + "step": 13715 + }, + { + "epoch": 4.1, + "grad_norm": 1.6749463081359863, + "learning_rate": 4.498308220064717e-05, + "loss": 1.1794, + "step": 13720 + }, + { + "epoch": 4.11, + "grad_norm": 2.0370562076568604, + "learning_rate": 4.497955122756868e-05, + "loss": 1.4314, + "step": 13725 + }, + { + "epoch": 4.11, + "grad_norm": 3.5120744705200195, + "learning_rate": 4.4976019151040364e-05, + "loss": 1.1369, + "step": 13730 + }, + { + "epoch": 4.11, + "grad_norm": 2.3618757724761963, + "learning_rate": 4.497248597125726e-05, + "loss": 1.3716, + "step": 13735 + }, + { + "epoch": 4.11, + "grad_norm": 1.495405673980713, + "learning_rate": 4.496895168841452e-05, + "loss": 1.3534, + "step": 13740 + }, + { + "epoch": 4.11, + "grad_norm": 3.024486541748047, + "learning_rate": 4.496541630270733e-05, + "loss": 1.2207, + "step": 13745 + }, + { + "epoch": 4.11, + "grad_norm": 2.2674665451049805, + "learning_rate": 4.496187981433095e-05, + "loss": 1.4337, + "step": 13750 + }, + { + "epoch": 4.12, + "grad_norm": 2.4405548572540283, + "learning_rate": 4.49583422234807e-05, + "loss": 1.318, + "step": 13755 + }, + { + "epoch": 4.12, + "grad_norm": 0.9450554847717285, + "learning_rate": 4.495480353035196e-05, + "loss": 1.3367, + "step": 13760 + }, + { + "epoch": 4.12, + "grad_norm": 2.9744069576263428, + "learning_rate": 4.4951263735140156e-05, + "loss": 1.2266, + "step": 13765 + }, + { + "epoch": 4.12, + "grad_norm": 2.1863605976104736, + "learning_rate": 4.4947722838040795e-05, + "loss": 1.2261, + "step": 13770 + }, + { + "epoch": 4.12, + "grad_norm": 1.7596246004104614, + "learning_rate": 4.494418083924944e-05, + "loss": 1.187, + "step": 13775 + }, + { + "epoch": 4.12, + "grad_norm": 1.8887165784835815, + "learning_rate": 4.4940637738961697e-05, + "loss": 1.2505, + "step": 13780 + }, + { + "epoch": 4.12, + "grad_norm": 1.4990458488464355, + "learning_rate": 4.493709353737327e-05, + "loss": 1.2451, + "step": 13785 + }, + { + "epoch": 4.13, + "grad_norm": 1.5014214515686035, + "learning_rate": 4.493354823467989e-05, + "loss": 1.2606, + "step": 13790 + }, + { + "epoch": 4.13, + "grad_norm": 2.0150039196014404, + "learning_rate": 4.4930001831077355e-05, + "loss": 1.4081, + "step": 13795 + }, + { + "epoch": 4.13, + "grad_norm": 2.609929323196411, + "learning_rate": 4.492645432676154e-05, + "loss": 1.2381, + "step": 13800 + }, + { + "epoch": 4.13, + "grad_norm": 1.365830659866333, + "learning_rate": 4.4922905721928366e-05, + "loss": 1.2332, + "step": 13805 + }, + { + "epoch": 4.13, + "grad_norm": 1.4366599321365356, + "learning_rate": 4.491935601677381e-05, + "loss": 1.1557, + "step": 13810 + }, + { + "epoch": 4.13, + "grad_norm": 3.553328037261963, + "learning_rate": 4.491580521149393e-05, + "loss": 1.2962, + "step": 13815 + }, + { + "epoch": 4.13, + "grad_norm": 1.386233925819397, + "learning_rate": 4.4912253306284835e-05, + "loss": 1.1249, + "step": 13820 + }, + { + "epoch": 4.14, + "grad_norm": 2.232959747314453, + "learning_rate": 4.490870030134268e-05, + "loss": 1.0412, + "step": 13825 + }, + { + "epoch": 4.14, + "grad_norm": 1.5061743259429932, + "learning_rate": 4.49051461968637e-05, + "loss": 1.2668, + "step": 13830 + }, + { + "epoch": 4.14, + "grad_norm": 2.83976674079895, + "learning_rate": 4.49015909930442e-05, + "loss": 1.3338, + "step": 13835 + }, + { + "epoch": 4.14, + "grad_norm": 1.8282452821731567, + "learning_rate": 4.48980346900805e-05, + "loss": 1.2672, + "step": 13840 + }, + { + "epoch": 4.14, + "grad_norm": 1.090014100074768, + "learning_rate": 4.489447728816904e-05, + "loss": 1.1899, + "step": 13845 + }, + { + "epoch": 4.14, + "grad_norm": 2.0513787269592285, + "learning_rate": 4.489091878750627e-05, + "loss": 1.2211, + "step": 13850 + }, + { + "epoch": 4.15, + "grad_norm": 1.3292232751846313, + "learning_rate": 4.4887359188288724e-05, + "loss": 1.199, + "step": 13855 + }, + { + "epoch": 4.15, + "grad_norm": 2.754366397857666, + "learning_rate": 4.4883798490713014e-05, + "loss": 1.2695, + "step": 13860 + }, + { + "epoch": 4.15, + "grad_norm": 1.0987385511398315, + "learning_rate": 4.488023669497578e-05, + "loss": 1.327, + "step": 13865 + }, + { + "epoch": 4.15, + "grad_norm": 0.9801754951477051, + "learning_rate": 4.487667380127373e-05, + "loss": 1.291, + "step": 13870 + }, + { + "epoch": 4.15, + "grad_norm": 1.8694130182266235, + "learning_rate": 4.4873109809803654e-05, + "loss": 1.3247, + "step": 13875 + }, + { + "epoch": 4.15, + "grad_norm": 1.1474727392196655, + "learning_rate": 4.486954472076238e-05, + "loss": 1.2209, + "step": 13880 + }, + { + "epoch": 4.15, + "grad_norm": 1.853026032447815, + "learning_rate": 4.48659785343468e-05, + "loss": 1.1524, + "step": 13885 + }, + { + "epoch": 4.16, + "grad_norm": 1.3957430124282837, + "learning_rate": 4.4862411250753875e-05, + "loss": 1.3743, + "step": 13890 + }, + { + "epoch": 4.16, + "grad_norm": 1.5522494316101074, + "learning_rate": 4.485884287018063e-05, + "loss": 1.2599, + "step": 13895 + }, + { + "epoch": 4.16, + "grad_norm": 2.7096736431121826, + "learning_rate": 4.485527339282412e-05, + "loss": 1.3198, + "step": 13900 + }, + { + "epoch": 4.16, + "grad_norm": 1.6966811418533325, + "learning_rate": 4.485170281888151e-05, + "loss": 1.2198, + "step": 13905 + }, + { + "epoch": 4.16, + "grad_norm": 1.197799563407898, + "learning_rate": 4.484813114854999e-05, + "loss": 1.3353, + "step": 13910 + }, + { + "epoch": 4.16, + "grad_norm": 1.5476326942443848, + "learning_rate": 4.4844558382026814e-05, + "loss": 1.1109, + "step": 13915 + }, + { + "epoch": 4.16, + "grad_norm": 2.144993543624878, + "learning_rate": 4.484098451950931e-05, + "loss": 1.2156, + "step": 13920 + }, + { + "epoch": 4.17, + "grad_norm": 1.5577327013015747, + "learning_rate": 4.483740956119485e-05, + "loss": 1.3845, + "step": 13925 + }, + { + "epoch": 4.17, + "grad_norm": 1.9885210990905762, + "learning_rate": 4.4833833507280884e-05, + "loss": 1.2396, + "step": 13930 + }, + { + "epoch": 4.17, + "grad_norm": 1.833871841430664, + "learning_rate": 4.483025635796491e-05, + "loss": 1.3424, + "step": 13935 + }, + { + "epoch": 4.17, + "grad_norm": 2.1424660682678223, + "learning_rate": 4.482667811344448e-05, + "loss": 1.2628, + "step": 13940 + }, + { + "epoch": 4.17, + "grad_norm": 2.6087942123413086, + "learning_rate": 4.4823098773917235e-05, + "loss": 1.1262, + "step": 13945 + }, + { + "epoch": 4.17, + "grad_norm": 1.847556710243225, + "learning_rate": 4.4819518339580844e-05, + "loss": 1.2522, + "step": 13950 + }, + { + "epoch": 4.18, + "grad_norm": 1.6152390241622925, + "learning_rate": 4.4815936810633066e-05, + "loss": 1.4579, + "step": 13955 + }, + { + "epoch": 4.18, + "grad_norm": 3.4430344104766846, + "learning_rate": 4.4812354187271686e-05, + "loss": 1.2838, + "step": 13960 + }, + { + "epoch": 4.18, + "grad_norm": 1.8315435647964478, + "learning_rate": 4.4808770469694584e-05, + "loss": 1.322, + "step": 13965 + }, + { + "epoch": 4.18, + "grad_norm": 1.6161450147628784, + "learning_rate": 4.480518565809967e-05, + "loss": 1.4559, + "step": 13970 + }, + { + "epoch": 4.18, + "grad_norm": 1.8581215143203735, + "learning_rate": 4.480159975268494e-05, + "loss": 1.2889, + "step": 13975 + }, + { + "epoch": 4.18, + "grad_norm": 1.6320557594299316, + "learning_rate": 4.479801275364845e-05, + "loss": 1.3009, + "step": 13980 + }, + { + "epoch": 4.18, + "grad_norm": 1.0718029737472534, + "learning_rate": 4.4794424661188286e-05, + "loss": 1.3986, + "step": 13985 + }, + { + "epoch": 4.19, + "grad_norm": 2.1297664642333984, + "learning_rate": 4.479083547550263e-05, + "loss": 1.1725, + "step": 13990 + }, + { + "epoch": 4.19, + "grad_norm": 2.0025558471679688, + "learning_rate": 4.478724519678969e-05, + "loss": 1.1128, + "step": 13995 + }, + { + "epoch": 4.19, + "grad_norm": 0.8468442559242249, + "learning_rate": 4.4783653825247776e-05, + "loss": 1.1415, + "step": 14000 + }, + { + "epoch": 4.19, + "grad_norm": 1.221956491470337, + "learning_rate": 4.478006136107522e-05, + "loss": 1.4145, + "step": 14005 + }, + { + "epoch": 4.19, + "grad_norm": 2.629194974899292, + "learning_rate": 4.477646780447043e-05, + "loss": 1.4677, + "step": 14010 + }, + { + "epoch": 4.19, + "grad_norm": 1.3865878582000732, + "learning_rate": 4.477287315563189e-05, + "loss": 1.1557, + "step": 14015 + }, + { + "epoch": 4.19, + "grad_norm": 1.6378535032272339, + "learning_rate": 4.4769277414758115e-05, + "loss": 1.1965, + "step": 14020 + }, + { + "epoch": 4.2, + "grad_norm": 7.970203876495361, + "learning_rate": 4.47656805820477e-05, + "loss": 1.2789, + "step": 14025 + }, + { + "epoch": 4.2, + "grad_norm": 2.070845365524292, + "learning_rate": 4.476208265769929e-05, + "loss": 1.2761, + "step": 14030 + }, + { + "epoch": 4.2, + "grad_norm": 4.676011085510254, + "learning_rate": 4.475848364191159e-05, + "loss": 1.2767, + "step": 14035 + }, + { + "epoch": 4.2, + "grad_norm": 2.338252305984497, + "learning_rate": 4.4754883534883384e-05, + "loss": 1.3701, + "step": 14040 + }, + { + "epoch": 4.2, + "grad_norm": 1.2467647790908813, + "learning_rate": 4.4751282336813494e-05, + "loss": 1.1699, + "step": 14045 + }, + { + "epoch": 4.2, + "grad_norm": 1.1198375225067139, + "learning_rate": 4.474768004790081e-05, + "loss": 1.3193, + "step": 14050 + }, + { + "epoch": 4.21, + "grad_norm": 2.231294870376587, + "learning_rate": 4.474407666834428e-05, + "loss": 1.3017, + "step": 14055 + }, + { + "epoch": 4.21, + "grad_norm": 1.3994890451431274, + "learning_rate": 4.474047219834292e-05, + "loss": 1.181, + "step": 14060 + }, + { + "epoch": 4.21, + "grad_norm": 1.440882921218872, + "learning_rate": 4.47368666380958e-05, + "loss": 1.3528, + "step": 14065 + }, + { + "epoch": 4.21, + "grad_norm": 2.0190446376800537, + "learning_rate": 4.4733259987802046e-05, + "loss": 1.3357, + "step": 14070 + }, + { + "epoch": 4.21, + "grad_norm": 1.6590640544891357, + "learning_rate": 4.4729652247660855e-05, + "loss": 1.2267, + "step": 14075 + }, + { + "epoch": 4.21, + "grad_norm": 1.370227336883545, + "learning_rate": 4.472604341787149e-05, + "loss": 1.2786, + "step": 14080 + }, + { + "epoch": 4.21, + "grad_norm": 1.0957473516464233, + "learning_rate": 4.472243349863324e-05, + "loss": 1.11, + "step": 14085 + }, + { + "epoch": 4.22, + "grad_norm": 2.918076992034912, + "learning_rate": 4.47188224901455e-05, + "loss": 1.2905, + "step": 14090 + }, + { + "epoch": 4.22, + "grad_norm": 1.5842289924621582, + "learning_rate": 4.4715210392607675e-05, + "loss": 1.1318, + "step": 14095 + }, + { + "epoch": 4.22, + "grad_norm": 0.9747616052627563, + "learning_rate": 4.471159720621928e-05, + "loss": 1.3338, + "step": 14100 + }, + { + "epoch": 4.22, + "grad_norm": 1.7682857513427734, + "learning_rate": 4.470798293117986e-05, + "loss": 1.0593, + "step": 14105 + }, + { + "epoch": 4.22, + "grad_norm": 1.311083436012268, + "learning_rate": 4.470436756768903e-05, + "loss": 1.1753, + "step": 14110 + }, + { + "epoch": 4.22, + "grad_norm": 1.279892086982727, + "learning_rate": 4.470075111594646e-05, + "loss": 1.2065, + "step": 14115 + }, + { + "epoch": 4.22, + "grad_norm": 1.927426815032959, + "learning_rate": 4.4697133576151885e-05, + "loss": 1.3022, + "step": 14120 + }, + { + "epoch": 4.23, + "grad_norm": 2.021284580230713, + "learning_rate": 4.4693514948505095e-05, + "loss": 1.1597, + "step": 14125 + }, + { + "epoch": 4.23, + "grad_norm": 1.2196049690246582, + "learning_rate": 4.4689895233205945e-05, + "loss": 1.3091, + "step": 14130 + }, + { + "epoch": 4.23, + "grad_norm": 2.0658938884735107, + "learning_rate": 4.4686274430454346e-05, + "loss": 1.2713, + "step": 14135 + }, + { + "epoch": 4.23, + "grad_norm": 1.2542707920074463, + "learning_rate": 4.4682652540450274e-05, + "loss": 1.2778, + "step": 14140 + }, + { + "epoch": 4.23, + "grad_norm": 1.4691362380981445, + "learning_rate": 4.4679029563393756e-05, + "loss": 1.073, + "step": 14145 + }, + { + "epoch": 4.23, + "grad_norm": 2.216343641281128, + "learning_rate": 4.46754054994849e-05, + "loss": 1.3336, + "step": 14150 + }, + { + "epoch": 4.24, + "grad_norm": 3.187505006790161, + "learning_rate": 4.467178034892384e-05, + "loss": 1.2517, + "step": 14155 + }, + { + "epoch": 4.24, + "grad_norm": 1.3655002117156982, + "learning_rate": 4.466815411191081e-05, + "loss": 1.3117, + "step": 14160 + }, + { + "epoch": 4.24, + "grad_norm": 1.2147845029830933, + "learning_rate": 4.466452678864607e-05, + "loss": 1.4121, + "step": 14165 + }, + { + "epoch": 4.24, + "grad_norm": 0.8723526000976562, + "learning_rate": 4.466089837932995e-05, + "loss": 1.2157, + "step": 14170 + }, + { + "epoch": 4.24, + "grad_norm": 2.1111645698547363, + "learning_rate": 4.465726888416285e-05, + "loss": 1.3665, + "step": 14175 + }, + { + "epoch": 4.24, + "grad_norm": 3.004091739654541, + "learning_rate": 4.4653638303345225e-05, + "loss": 1.4633, + "step": 14180 + }, + { + "epoch": 4.24, + "grad_norm": 2.4839401245117188, + "learning_rate": 4.465000663707758e-05, + "loss": 1.3408, + "step": 14185 + }, + { + "epoch": 4.25, + "grad_norm": 1.8565268516540527, + "learning_rate": 4.46463738855605e-05, + "loss": 1.2731, + "step": 14190 + }, + { + "epoch": 4.25, + "grad_norm": 2.492748975753784, + "learning_rate": 4.464274004899461e-05, + "loss": 1.3723, + "step": 14195 + }, + { + "epoch": 4.25, + "grad_norm": 2.4697351455688477, + "learning_rate": 4.46391051275806e-05, + "loss": 1.1248, + "step": 14200 + }, + { + "epoch": 4.25, + "grad_norm": 2.914813280105591, + "learning_rate": 4.4635469121519234e-05, + "loss": 1.0966, + "step": 14205 + }, + { + "epoch": 4.25, + "grad_norm": 1.3312268257141113, + "learning_rate": 4.463183203101131e-05, + "loss": 1.218, + "step": 14210 + }, + { + "epoch": 4.25, + "grad_norm": 3.274707317352295, + "learning_rate": 4.4628193856257714e-05, + "loss": 1.1181, + "step": 14215 + }, + { + "epoch": 4.25, + "grad_norm": 1.0050827264785767, + "learning_rate": 4.462455459745938e-05, + "loss": 1.28, + "step": 14220 + }, + { + "epoch": 4.26, + "grad_norm": 1.2786221504211426, + "learning_rate": 4.462091425481728e-05, + "loss": 1.1789, + "step": 14225 + }, + { + "epoch": 4.26, + "grad_norm": 1.9664305448532104, + "learning_rate": 4.4617272828532495e-05, + "loss": 1.2938, + "step": 14230 + }, + { + "epoch": 4.26, + "grad_norm": 1.6089929342269897, + "learning_rate": 4.461363031880611e-05, + "loss": 1.2964, + "step": 14235 + }, + { + "epoch": 4.26, + "grad_norm": 2.8070948123931885, + "learning_rate": 4.460998672583933e-05, + "loss": 1.3383, + "step": 14240 + }, + { + "epoch": 4.26, + "grad_norm": 1.145674705505371, + "learning_rate": 4.460634204983334e-05, + "loss": 1.3326, + "step": 14245 + }, + { + "epoch": 4.26, + "grad_norm": 2.9494822025299072, + "learning_rate": 4.4602696290989477e-05, + "loss": 1.2622, + "step": 14250 + }, + { + "epoch": 4.26, + "grad_norm": 2.005913734436035, + "learning_rate": 4.459904944950907e-05, + "loss": 1.267, + "step": 14255 + }, + { + "epoch": 4.27, + "grad_norm": 1.1085389852523804, + "learning_rate": 4.4595401525593525e-05, + "loss": 1.2181, + "step": 14260 + }, + { + "epoch": 4.27, + "grad_norm": 3.622462272644043, + "learning_rate": 4.459175251944433e-05, + "loss": 1.3522, + "step": 14265 + }, + { + "epoch": 4.27, + "grad_norm": 2.914292573928833, + "learning_rate": 4.458810243126301e-05, + "loss": 1.3473, + "step": 14270 + }, + { + "epoch": 4.27, + "grad_norm": 2.464303493499756, + "learning_rate": 4.458445126125115e-05, + "loss": 1.1967, + "step": 14275 + }, + { + "epoch": 4.27, + "grad_norm": 1.2096035480499268, + "learning_rate": 4.45807990096104e-05, + "loss": 1.1164, + "step": 14280 + }, + { + "epoch": 4.27, + "grad_norm": 2.403764247894287, + "learning_rate": 4.4577145676542474e-05, + "loss": 1.1992, + "step": 14285 + }, + { + "epoch": 4.28, + "grad_norm": 1.3285304307937622, + "learning_rate": 4.4573491262249144e-05, + "loss": 1.2245, + "step": 14290 + }, + { + "epoch": 4.28, + "grad_norm": 1.1528428792953491, + "learning_rate": 4.4569835766932244e-05, + "loss": 1.3815, + "step": 14295 + }, + { + "epoch": 4.28, + "grad_norm": 1.0769155025482178, + "learning_rate": 4.4566179190793646e-05, + "loss": 1.1992, + "step": 14300 + }, + { + "epoch": 4.28, + "grad_norm": 1.2317618131637573, + "learning_rate": 4.4562521534035316e-05, + "loss": 1.2237, + "step": 14305 + }, + { + "epoch": 4.28, + "grad_norm": 1.0465807914733887, + "learning_rate": 4.455886279685925e-05, + "loss": 1.3325, + "step": 14310 + }, + { + "epoch": 4.28, + "grad_norm": 2.072091579437256, + "learning_rate": 4.4555202979467526e-05, + "loss": 1.3239, + "step": 14315 + }, + { + "epoch": 4.28, + "grad_norm": 1.5427582263946533, + "learning_rate": 4.455154208206227e-05, + "loss": 1.284, + "step": 14320 + }, + { + "epoch": 4.29, + "grad_norm": 1.5451370477676392, + "learning_rate": 4.454788010484566e-05, + "loss": 1.2576, + "step": 14325 + }, + { + "epoch": 4.29, + "grad_norm": 1.7191996574401855, + "learning_rate": 4.454421704801996e-05, + "loss": 1.3133, + "step": 14330 + }, + { + "epoch": 4.29, + "grad_norm": 1.3877907991409302, + "learning_rate": 4.454055291178746e-05, + "loss": 1.1534, + "step": 14335 + }, + { + "epoch": 4.29, + "grad_norm": 1.9861388206481934, + "learning_rate": 4.453688769635054e-05, + "loss": 1.2559, + "step": 14340 + }, + { + "epoch": 4.29, + "grad_norm": 2.1039087772369385, + "learning_rate": 4.453322140191162e-05, + "loss": 1.3874, + "step": 14345 + }, + { + "epoch": 4.29, + "grad_norm": 1.7028883695602417, + "learning_rate": 4.452955402867318e-05, + "loss": 1.3288, + "step": 14350 + }, + { + "epoch": 4.29, + "grad_norm": 2.711984634399414, + "learning_rate": 4.452588557683777e-05, + "loss": 1.2802, + "step": 14355 + }, + { + "epoch": 4.3, + "grad_norm": 1.9355015754699707, + "learning_rate": 4.4522216046608004e-05, + "loss": 1.2044, + "step": 14360 + }, + { + "epoch": 4.3, + "grad_norm": 1.0914154052734375, + "learning_rate": 4.451854543818653e-05, + "loss": 1.0817, + "step": 14365 + }, + { + "epoch": 4.3, + "grad_norm": 1.7667779922485352, + "learning_rate": 4.451487375177608e-05, + "loss": 1.2535, + "step": 14370 + }, + { + "epoch": 4.3, + "grad_norm": 2.6625638008117676, + "learning_rate": 4.4511200987579445e-05, + "loss": 1.3501, + "step": 14375 + }, + { + "epoch": 4.3, + "grad_norm": 2.3096730709075928, + "learning_rate": 4.450752714579946e-05, + "loss": 1.2074, + "step": 14380 + }, + { + "epoch": 4.3, + "grad_norm": 1.5009289979934692, + "learning_rate": 4.450385222663902e-05, + "loss": 1.2544, + "step": 14385 + }, + { + "epoch": 4.31, + "grad_norm": 2.635723352432251, + "learning_rate": 4.4500176230301095e-05, + "loss": 1.2226, + "step": 14390 + }, + { + "epoch": 4.31, + "grad_norm": 1.7658405303955078, + "learning_rate": 4.449649915698871e-05, + "loss": 1.2582, + "step": 14395 + }, + { + "epoch": 4.31, + "grad_norm": 3.9313101768493652, + "learning_rate": 4.449282100690494e-05, + "loss": 1.2111, + "step": 14400 + }, + { + "epoch": 4.31, + "grad_norm": 2.178057909011841, + "learning_rate": 4.448914178025293e-05, + "loss": 1.4299, + "step": 14405 + }, + { + "epoch": 4.31, + "grad_norm": 1.1012213230133057, + "learning_rate": 4.4485461477235865e-05, + "loss": 1.0573, + "step": 14410 + }, + { + "epoch": 4.31, + "grad_norm": 1.9754661321640015, + "learning_rate": 4.448178009805704e-05, + "loss": 1.2845, + "step": 14415 + }, + { + "epoch": 4.31, + "grad_norm": 1.2951463460922241, + "learning_rate": 4.4478097642919734e-05, + "loss": 1.2188, + "step": 14420 + }, + { + "epoch": 4.32, + "grad_norm": 1.210935354232788, + "learning_rate": 4.447441411202734e-05, + "loss": 1.3247, + "step": 14425 + }, + { + "epoch": 4.32, + "grad_norm": 1.0859736204147339, + "learning_rate": 4.447072950558331e-05, + "loss": 1.2225, + "step": 14430 + }, + { + "epoch": 4.32, + "grad_norm": 0.7958767414093018, + "learning_rate": 4.446704382379111e-05, + "loss": 1.3079, + "step": 14435 + }, + { + "epoch": 4.32, + "grad_norm": 1.6416263580322266, + "learning_rate": 4.4463357066854326e-05, + "loss": 1.1547, + "step": 14440 + }, + { + "epoch": 4.32, + "grad_norm": 1.793069839477539, + "learning_rate": 4.445966923497656e-05, + "loss": 1.388, + "step": 14445 + }, + { + "epoch": 4.32, + "grad_norm": 2.6959428787231445, + "learning_rate": 4.44559803283615e-05, + "loss": 1.2552, + "step": 14450 + }, + { + "epoch": 4.32, + "grad_norm": 3.44150710105896, + "learning_rate": 4.445229034721285e-05, + "loss": 1.2653, + "step": 14455 + }, + { + "epoch": 4.33, + "grad_norm": 2.6374642848968506, + "learning_rate": 4.444859929173444e-05, + "loss": 1.204, + "step": 14460 + }, + { + "epoch": 4.33, + "grad_norm": 1.654412031173706, + "learning_rate": 4.44449071621301e-05, + "loss": 1.3707, + "step": 14465 + }, + { + "epoch": 4.33, + "grad_norm": 1.6137704849243164, + "learning_rate": 4.444121395860375e-05, + "loss": 1.29, + "step": 14470 + }, + { + "epoch": 4.33, + "grad_norm": 1.6746236085891724, + "learning_rate": 4.443751968135936e-05, + "loss": 1.4198, + "step": 14475 + }, + { + "epoch": 4.33, + "grad_norm": 1.7420883178710938, + "learning_rate": 4.4433824330600964e-05, + "loss": 1.245, + "step": 14480 + }, + { + "epoch": 4.33, + "grad_norm": 5.652534484863281, + "learning_rate": 4.443012790653265e-05, + "loss": 1.2699, + "step": 14485 + }, + { + "epoch": 4.34, + "grad_norm": 2.45516037940979, + "learning_rate": 4.442643040935856e-05, + "loss": 1.1824, + "step": 14490 + }, + { + "epoch": 4.34, + "grad_norm": 2.3738396167755127, + "learning_rate": 4.442273183928293e-05, + "loss": 1.1621, + "step": 14495 + }, + { + "epoch": 4.34, + "grad_norm": 1.6174094676971436, + "learning_rate": 4.4419032196509994e-05, + "loss": 1.3374, + "step": 14500 + }, + { + "epoch": 4.34, + "grad_norm": 2.4241867065429688, + "learning_rate": 4.44153314812441e-05, + "loss": 1.1839, + "step": 14505 + }, + { + "epoch": 4.34, + "grad_norm": 2.8648736476898193, + "learning_rate": 4.4411629693689626e-05, + "loss": 1.2719, + "step": 14510 + }, + { + "epoch": 4.34, + "grad_norm": 2.2779765129089355, + "learning_rate": 4.4407926834051025e-05, + "loss": 1.2144, + "step": 14515 + }, + { + "epoch": 4.34, + "grad_norm": 3.223737955093384, + "learning_rate": 4.44042229025328e-05, + "loss": 1.112, + "step": 14520 + }, + { + "epoch": 4.35, + "grad_norm": 1.5524216890335083, + "learning_rate": 4.440051789933951e-05, + "loss": 1.2005, + "step": 14525 + }, + { + "epoch": 4.35, + "grad_norm": 2.5354983806610107, + "learning_rate": 4.439681182467579e-05, + "loss": 1.0579, + "step": 14530 + }, + { + "epoch": 4.35, + "grad_norm": 1.221150279045105, + "learning_rate": 4.43931046787463e-05, + "loss": 1.3449, + "step": 14535 + }, + { + "epoch": 4.35, + "grad_norm": 1.6341768503189087, + "learning_rate": 4.4389396461755804e-05, + "loss": 1.2786, + "step": 14540 + }, + { + "epoch": 4.35, + "grad_norm": 1.0187619924545288, + "learning_rate": 4.4385687173909093e-05, + "loss": 1.303, + "step": 14545 + }, + { + "epoch": 4.35, + "grad_norm": 1.5471796989440918, + "learning_rate": 4.4381976815411036e-05, + "loss": 1.1909, + "step": 14550 + }, + { + "epoch": 4.35, + "grad_norm": 1.6268233060836792, + "learning_rate": 4.437826538646655e-05, + "loss": 1.2544, + "step": 14555 + }, + { + "epoch": 4.36, + "grad_norm": 1.3081377744674683, + "learning_rate": 4.4374552887280594e-05, + "loss": 1.3514, + "step": 14560 + }, + { + "epoch": 4.36, + "grad_norm": 1.7368911504745483, + "learning_rate": 4.4370839318058235e-05, + "loss": 1.3513, + "step": 14565 + }, + { + "epoch": 4.36, + "grad_norm": 0.9481498003005981, + "learning_rate": 4.4367124679004545e-05, + "loss": 1.2849, + "step": 14570 + }, + { + "epoch": 4.36, + "grad_norm": 1.1011961698532104, + "learning_rate": 4.4363408970324696e-05, + "loss": 1.3394, + "step": 14575 + }, + { + "epoch": 4.36, + "grad_norm": 0.9509512782096863, + "learning_rate": 4.435969219222389e-05, + "loss": 1.1828, + "step": 14580 + }, + { + "epoch": 4.36, + "grad_norm": 0.9883051514625549, + "learning_rate": 4.4355974344907414e-05, + "loss": 1.1986, + "step": 14585 + }, + { + "epoch": 4.37, + "grad_norm": 1.3182764053344727, + "learning_rate": 4.4352255428580595e-05, + "loss": 1.4436, + "step": 14590 + }, + { + "epoch": 4.37, + "grad_norm": 1.5108343362808228, + "learning_rate": 4.434853544344882e-05, + "loss": 1.2572, + "step": 14595 + }, + { + "epoch": 4.37, + "grad_norm": 1.2329615354537964, + "learning_rate": 4.434481438971754e-05, + "loss": 1.16, + "step": 14600 + }, + { + "epoch": 4.37, + "grad_norm": 2.406236171722412, + "learning_rate": 4.4341092267592276e-05, + "loss": 1.2099, + "step": 14605 + }, + { + "epoch": 4.37, + "grad_norm": 2.0363478660583496, + "learning_rate": 4.433736907727859e-05, + "loss": 1.2245, + "step": 14610 + }, + { + "epoch": 4.37, + "grad_norm": 1.3022775650024414, + "learning_rate": 4.4333644818982115e-05, + "loss": 1.2434, + "step": 14615 + }, + { + "epoch": 4.37, + "grad_norm": 0.9420449733734131, + "learning_rate": 4.4329919492908526e-05, + "loss": 1.2814, + "step": 14620 + }, + { + "epoch": 4.38, + "grad_norm": 4.690912246704102, + "learning_rate": 4.432619309926357e-05, + "loss": 1.2965, + "step": 14625 + }, + { + "epoch": 4.38, + "grad_norm": 2.9743857383728027, + "learning_rate": 4.432246563825306e-05, + "loss": 1.1934, + "step": 14630 + }, + { + "epoch": 4.38, + "grad_norm": 1.6823005676269531, + "learning_rate": 4.431873711008286e-05, + "loss": 1.3078, + "step": 14635 + }, + { + "epoch": 4.38, + "grad_norm": 2.596446990966797, + "learning_rate": 4.4315007514958896e-05, + "loss": 1.1404, + "step": 14640 + }, + { + "epoch": 4.38, + "grad_norm": 2.1992132663726807, + "learning_rate": 4.4311276853087144e-05, + "loss": 1.0836, + "step": 14645 + }, + { + "epoch": 4.38, + "grad_norm": 1.4632948637008667, + "learning_rate": 4.430754512467364e-05, + "loss": 1.2743, + "step": 14650 + }, + { + "epoch": 4.38, + "grad_norm": 1.9432785511016846, + "learning_rate": 4.430381232992449e-05, + "loss": 1.1317, + "step": 14655 + }, + { + "epoch": 4.39, + "grad_norm": 2.028427839279175, + "learning_rate": 4.430007846904585e-05, + "loss": 1.135, + "step": 14660 + }, + { + "epoch": 4.39, + "grad_norm": 1.5426450967788696, + "learning_rate": 4.4296343542243926e-05, + "loss": 1.3249, + "step": 14665 + }, + { + "epoch": 4.39, + "grad_norm": 1.5564109086990356, + "learning_rate": 4.4292607549725016e-05, + "loss": 1.269, + "step": 14670 + }, + { + "epoch": 4.39, + "grad_norm": 1.5921878814697266, + "learning_rate": 4.428887049169544e-05, + "loss": 1.3068, + "step": 14675 + }, + { + "epoch": 4.39, + "grad_norm": 1.6243040561676025, + "learning_rate": 4.4285132368361606e-05, + "loss": 1.0717, + "step": 14680 + }, + { + "epoch": 4.39, + "grad_norm": 2.8081164360046387, + "learning_rate": 4.428139317992995e-05, + "loss": 1.1523, + "step": 14685 + }, + { + "epoch": 4.4, + "grad_norm": 3.132068157196045, + "learning_rate": 4.4277652926607e-05, + "loss": 1.3471, + "step": 14690 + }, + { + "epoch": 4.4, + "grad_norm": 2.9057159423828125, + "learning_rate": 4.42739116085993e-05, + "loss": 1.2094, + "step": 14695 + }, + { + "epoch": 4.4, + "grad_norm": 1.643255591392517, + "learning_rate": 4.42701692261135e-05, + "loss": 1.5053, + "step": 14700 + }, + { + "epoch": 4.4, + "grad_norm": 1.549208164215088, + "learning_rate": 4.426642577935629e-05, + "loss": 1.1451, + "step": 14705 + }, + { + "epoch": 4.4, + "grad_norm": 1.8293100595474243, + "learning_rate": 4.426268126853441e-05, + "loss": 1.2295, + "step": 14710 + }, + { + "epoch": 4.4, + "grad_norm": 2.3333680629730225, + "learning_rate": 4.425893569385466e-05, + "loss": 1.1442, + "step": 14715 + }, + { + "epoch": 4.4, + "grad_norm": 1.7189589738845825, + "learning_rate": 4.425518905552392e-05, + "loss": 1.3381, + "step": 14720 + }, + { + "epoch": 4.41, + "grad_norm": 3.104936361312866, + "learning_rate": 4.42514413537491e-05, + "loss": 1.2112, + "step": 14725 + }, + { + "epoch": 4.41, + "grad_norm": 1.1761854887008667, + "learning_rate": 4.424769258873718e-05, + "loss": 1.2502, + "step": 14730 + }, + { + "epoch": 4.41, + "grad_norm": 1.4067808389663696, + "learning_rate": 4.424394276069521e-05, + "loss": 1.3833, + "step": 14735 + }, + { + "epoch": 4.41, + "grad_norm": 1.7447682619094849, + "learning_rate": 4.424019186983028e-05, + "loss": 1.203, + "step": 14740 + }, + { + "epoch": 4.41, + "grad_norm": 1.739342212677002, + "learning_rate": 4.423643991634956e-05, + "loss": 1.2031, + "step": 14745 + }, + { + "epoch": 4.41, + "grad_norm": 2.5015289783477783, + "learning_rate": 4.423268690046025e-05, + "loss": 1.4271, + "step": 14750 + }, + { + "epoch": 4.41, + "grad_norm": 1.010996699333191, + "learning_rate": 4.422893282236963e-05, + "loss": 1.2328, + "step": 14755 + }, + { + "epoch": 4.42, + "grad_norm": 1.5030901432037354, + "learning_rate": 4.422517768228505e-05, + "loss": 1.3878, + "step": 14760 + }, + { + "epoch": 4.42, + "grad_norm": 1.2070742845535278, + "learning_rate": 4.422142148041388e-05, + "loss": 1.3728, + "step": 14765 + }, + { + "epoch": 4.42, + "grad_norm": 1.405918002128601, + "learning_rate": 4.421766421696358e-05, + "loss": 1.3906, + "step": 14770 + }, + { + "epoch": 4.42, + "grad_norm": 2.8007607460021973, + "learning_rate": 4.4213905892141674e-05, + "loss": 1.3337, + "step": 14775 + }, + { + "epoch": 4.42, + "grad_norm": 1.6845779418945312, + "learning_rate": 4.421014650615571e-05, + "loss": 1.3817, + "step": 14780 + }, + { + "epoch": 4.42, + "grad_norm": 2.3008716106414795, + "learning_rate": 4.420638605921332e-05, + "loss": 1.2791, + "step": 14785 + }, + { + "epoch": 4.42, + "grad_norm": 2.7502546310424805, + "learning_rate": 4.4202624551522195e-05, + "loss": 1.1827, + "step": 14790 + }, + { + "epoch": 4.43, + "grad_norm": 1.4934040307998657, + "learning_rate": 4.419886198329008e-05, + "loss": 1.4211, + "step": 14795 + }, + { + "epoch": 4.43, + "grad_norm": 2.8900766372680664, + "learning_rate": 4.419509835472476e-05, + "loss": 1.3247, + "step": 14800 + }, + { + "epoch": 4.43, + "grad_norm": 1.89850652217865, + "learning_rate": 4.4191333666034124e-05, + "loss": 1.3627, + "step": 14805 + }, + { + "epoch": 4.43, + "grad_norm": 2.504714250564575, + "learning_rate": 4.4187567917426074e-05, + "loss": 1.2384, + "step": 14810 + }, + { + "epoch": 4.43, + "grad_norm": 2.1975607872009277, + "learning_rate": 4.418380110910859e-05, + "loss": 1.1535, + "step": 14815 + }, + { + "epoch": 4.43, + "grad_norm": 2.044182062149048, + "learning_rate": 4.4180033241289706e-05, + "loss": 1.2568, + "step": 14820 + }, + { + "epoch": 4.44, + "grad_norm": 1.6117463111877441, + "learning_rate": 4.417626431417753e-05, + "loss": 1.0122, + "step": 14825 + }, + { + "epoch": 4.44, + "grad_norm": 1.8464300632476807, + "learning_rate": 4.4172494327980205e-05, + "loss": 1.19, + "step": 14830 + }, + { + "epoch": 4.44, + "grad_norm": 2.2936418056488037, + "learning_rate": 4.4168723282905954e-05, + "loss": 1.0822, + "step": 14835 + }, + { + "epoch": 4.44, + "grad_norm": 1.4737282991409302, + "learning_rate": 4.416495117916304e-05, + "loss": 1.4572, + "step": 14840 + }, + { + "epoch": 4.44, + "grad_norm": 1.4652832746505737, + "learning_rate": 4.416117801695979e-05, + "loss": 1.219, + "step": 14845 + }, + { + "epoch": 4.44, + "grad_norm": 1.4503779411315918, + "learning_rate": 4.415740379650459e-05, + "loss": 1.3568, + "step": 14850 + }, + { + "epoch": 4.44, + "grad_norm": 1.4976866245269775, + "learning_rate": 4.41536285180059e-05, + "loss": 1.1892, + "step": 14855 + }, + { + "epoch": 4.45, + "grad_norm": 5.945674896240234, + "learning_rate": 4.414985218167221e-05, + "loss": 1.146, + "step": 14860 + }, + { + "epoch": 4.45, + "grad_norm": 1.585028886795044, + "learning_rate": 4.414607478771209e-05, + "loss": 1.1714, + "step": 14865 + }, + { + "epoch": 4.45, + "grad_norm": 1.4117640256881714, + "learning_rate": 4.4142296336334166e-05, + "loss": 1.2991, + "step": 14870 + }, + { + "epoch": 4.45, + "grad_norm": 2.092742443084717, + "learning_rate": 4.41385168277471e-05, + "loss": 1.1138, + "step": 14875 + }, + { + "epoch": 4.45, + "grad_norm": 2.3509175777435303, + "learning_rate": 4.413473626215965e-05, + "loss": 1.3, + "step": 14880 + }, + { + "epoch": 4.45, + "grad_norm": 2.7636947631835938, + "learning_rate": 4.4130954639780616e-05, + "loss": 1.2863, + "step": 14885 + }, + { + "epoch": 4.45, + "grad_norm": 1.4940133094787598, + "learning_rate": 4.412717196081883e-05, + "loss": 1.2997, + "step": 14890 + }, + { + "epoch": 4.46, + "grad_norm": 2.2368600368499756, + "learning_rate": 4.412338822548322e-05, + "loss": 1.2766, + "step": 14895 + }, + { + "epoch": 4.46, + "grad_norm": 1.0473464727401733, + "learning_rate": 4.411960343398276e-05, + "loss": 1.2436, + "step": 14900 + }, + { + "epoch": 4.46, + "grad_norm": 1.2610361576080322, + "learning_rate": 4.4115817586526475e-05, + "loss": 1.2788, + "step": 14905 + }, + { + "epoch": 4.46, + "grad_norm": 1.074568271636963, + "learning_rate": 4.411203068332345e-05, + "loss": 1.2546, + "step": 14910 + }, + { + "epoch": 4.46, + "grad_norm": 1.4182261228561401, + "learning_rate": 4.4108242724582836e-05, + "loss": 1.0938, + "step": 14915 + }, + { + "epoch": 4.46, + "grad_norm": 5.393111705780029, + "learning_rate": 4.410445371051385e-05, + "loss": 1.2659, + "step": 14920 + }, + { + "epoch": 4.47, + "grad_norm": 1.6223890781402588, + "learning_rate": 4.410066364132573e-05, + "loss": 1.3116, + "step": 14925 + }, + { + "epoch": 4.47, + "grad_norm": 2.0401763916015625, + "learning_rate": 4.409687251722782e-05, + "loss": 1.2728, + "step": 14930 + }, + { + "epoch": 4.47, + "grad_norm": 2.3179221153259277, + "learning_rate": 4.4093080338429485e-05, + "loss": 0.9557, + "step": 14935 + }, + { + "epoch": 4.47, + "grad_norm": 0.9907518029212952, + "learning_rate": 4.4089287105140176e-05, + "loss": 1.2843, + "step": 14940 + }, + { + "epoch": 4.47, + "grad_norm": 3.440904378890991, + "learning_rate": 4.4085492817569375e-05, + "loss": 1.2877, + "step": 14945 + }, + { + "epoch": 4.47, + "grad_norm": 1.7463809251785278, + "learning_rate": 4.408169747592665e-05, + "loss": 1.2269, + "step": 14950 + }, + { + "epoch": 4.47, + "grad_norm": 1.2096577882766724, + "learning_rate": 4.40779010804216e-05, + "loss": 1.318, + "step": 14955 + }, + { + "epoch": 4.48, + "grad_norm": 1.5482676029205322, + "learning_rate": 4.407410363126391e-05, + "loss": 1.2836, + "step": 14960 + }, + { + "epoch": 4.48, + "grad_norm": 0.8990564346313477, + "learning_rate": 4.4070305128663306e-05, + "loss": 1.175, + "step": 14965 + }, + { + "epoch": 4.48, + "grad_norm": 1.5660568475723267, + "learning_rate": 4.406650557282957e-05, + "loss": 1.0142, + "step": 14970 + }, + { + "epoch": 4.48, + "grad_norm": 1.8954380750656128, + "learning_rate": 4.406270496397254e-05, + "loss": 1.1563, + "step": 14975 + }, + { + "epoch": 4.48, + "grad_norm": 3.845919609069824, + "learning_rate": 4.4058903302302136e-05, + "loss": 1.1624, + "step": 14980 + }, + { + "epoch": 4.48, + "grad_norm": 2.766300678253174, + "learning_rate": 4.4055100588028315e-05, + "loss": 1.2274, + "step": 14985 + }, + { + "epoch": 4.48, + "grad_norm": 1.7678008079528809, + "learning_rate": 4.405129682136109e-05, + "loss": 1.2156, + "step": 14990 + }, + { + "epoch": 4.49, + "grad_norm": 1.1421842575073242, + "learning_rate": 4.404749200251055e-05, + "loss": 1.2917, + "step": 14995 + }, + { + "epoch": 4.49, + "grad_norm": 2.411649227142334, + "learning_rate": 4.4043686131686825e-05, + "loss": 1.177, + "step": 15000 + }, + { + "epoch": 4.49, + "grad_norm": 0.9833216667175293, + "learning_rate": 4.403987920910011e-05, + "loss": 1.4059, + "step": 15005 + }, + { + "epoch": 4.49, + "grad_norm": 1.068702220916748, + "learning_rate": 4.403607123496065e-05, + "loss": 1.0853, + "step": 15010 + }, + { + "epoch": 4.49, + "grad_norm": 2.074406385421753, + "learning_rate": 4.4032262209478774e-05, + "loss": 1.4351, + "step": 15015 + }, + { + "epoch": 4.49, + "grad_norm": 2.2396271228790283, + "learning_rate": 4.402845213286483e-05, + "loss": 1.3586, + "step": 15020 + }, + { + "epoch": 4.5, + "grad_norm": 2.4395458698272705, + "learning_rate": 4.402464100532926e-05, + "loss": 1.2868, + "step": 15025 + }, + { + "epoch": 4.5, + "grad_norm": 2.2216920852661133, + "learning_rate": 4.402082882708254e-05, + "loss": 1.3321, + "step": 15030 + }, + { + "epoch": 4.5, + "grad_norm": 1.8366425037384033, + "learning_rate": 4.401701559833521e-05, + "loss": 1.3224, + "step": 15035 + }, + { + "epoch": 4.5, + "grad_norm": 2.4641847610473633, + "learning_rate": 4.401320131929788e-05, + "loss": 1.0611, + "step": 15040 + }, + { + "epoch": 4.5, + "grad_norm": 1.1396476030349731, + "learning_rate": 4.400938599018121e-05, + "loss": 1.3745, + "step": 15045 + }, + { + "epoch": 4.5, + "grad_norm": 2.272923469543457, + "learning_rate": 4.40055696111959e-05, + "loss": 1.2228, + "step": 15050 + }, + { + "epoch": 4.5, + "grad_norm": 3.0234322547912598, + "learning_rate": 4.400175218255274e-05, + "loss": 1.2259, + "step": 15055 + }, + { + "epoch": 4.51, + "grad_norm": 4.0134501457214355, + "learning_rate": 4.3997933704462555e-05, + "loss": 1.1956, + "step": 15060 + }, + { + "epoch": 4.51, + "grad_norm": 1.733201503753662, + "learning_rate": 4.399411417713625e-05, + "loss": 1.3304, + "step": 15065 + }, + { + "epoch": 4.51, + "grad_norm": 1.0404208898544312, + "learning_rate": 4.3990293600784754e-05, + "loss": 1.2417, + "step": 15070 + }, + { + "epoch": 4.51, + "grad_norm": 1.6445467472076416, + "learning_rate": 4.398647197561908e-05, + "loss": 1.4209, + "step": 15075 + }, + { + "epoch": 4.51, + "grad_norm": 2.307591676712036, + "learning_rate": 4.3982649301850296e-05, + "loss": 1.4352, + "step": 15080 + }, + { + "epoch": 4.51, + "grad_norm": 1.3652942180633545, + "learning_rate": 4.397882557968952e-05, + "loss": 1.4074, + "step": 15085 + }, + { + "epoch": 4.51, + "grad_norm": 1.15928053855896, + "learning_rate": 4.397500080934794e-05, + "loss": 1.175, + "step": 15090 + }, + { + "epoch": 4.52, + "grad_norm": 1.9573633670806885, + "learning_rate": 4.397117499103679e-05, + "loss": 1.3634, + "step": 15095 + }, + { + "epoch": 4.52, + "grad_norm": 2.113678455352783, + "learning_rate": 4.3967348124967365e-05, + "loss": 1.1655, + "step": 15100 + }, + { + "epoch": 4.52, + "grad_norm": 2.7751240730285645, + "learning_rate": 4.396352021135101e-05, + "loss": 1.1952, + "step": 15105 + }, + { + "epoch": 4.52, + "grad_norm": 3.2183516025543213, + "learning_rate": 4.395969125039915e-05, + "loss": 1.4013, + "step": 15110 + }, + { + "epoch": 4.52, + "grad_norm": 3.216507911682129, + "learning_rate": 4.395586124232325e-05, + "loss": 1.2403, + "step": 15115 + }, + { + "epoch": 4.52, + "grad_norm": 1.8810077905654907, + "learning_rate": 4.395203018733484e-05, + "loss": 1.3741, + "step": 15120 + }, + { + "epoch": 4.53, + "grad_norm": 1.4608815908432007, + "learning_rate": 4.3948198085645495e-05, + "loss": 1.2175, + "step": 15125 + }, + { + "epoch": 4.53, + "grad_norm": 2.5550155639648438, + "learning_rate": 4.394436493746687e-05, + "loss": 1.3009, + "step": 15130 + }, + { + "epoch": 4.53, + "grad_norm": 1.188637375831604, + "learning_rate": 4.394053074301066e-05, + "loss": 1.3576, + "step": 15135 + }, + { + "epoch": 4.53, + "grad_norm": 1.8632690906524658, + "learning_rate": 4.3936695502488623e-05, + "loss": 1.2394, + "step": 15140 + }, + { + "epoch": 4.53, + "grad_norm": 2.738532304763794, + "learning_rate": 4.3932859216112584e-05, + "loss": 1.2289, + "step": 15145 + }, + { + "epoch": 4.53, + "grad_norm": 1.352799415588379, + "learning_rate": 4.39290218840944e-05, + "loss": 1.2893, + "step": 15150 + }, + { + "epoch": 4.53, + "grad_norm": 1.2402245998382568, + "learning_rate": 4.392518350664602e-05, + "loss": 1.2977, + "step": 15155 + }, + { + "epoch": 4.54, + "grad_norm": 1.9382325410842896, + "learning_rate": 4.392134408397942e-05, + "loss": 1.197, + "step": 15160 + }, + { + "epoch": 4.54, + "grad_norm": 3.8150017261505127, + "learning_rate": 4.391750361630666e-05, + "loss": 1.3319, + "step": 15165 + }, + { + "epoch": 4.54, + "grad_norm": 1.3974863290786743, + "learning_rate": 4.391366210383984e-05, + "loss": 1.2548, + "step": 15170 + }, + { + "epoch": 4.54, + "grad_norm": 1.6200858354568481, + "learning_rate": 4.390981954679112e-05, + "loss": 1.3474, + "step": 15175 + }, + { + "epoch": 4.54, + "grad_norm": 2.464508056640625, + "learning_rate": 4.390597594537272e-05, + "loss": 1.2525, + "step": 15180 + }, + { + "epoch": 4.54, + "grad_norm": 2.1834940910339355, + "learning_rate": 4.390213129979692e-05, + "loss": 1.2651, + "step": 15185 + }, + { + "epoch": 4.54, + "grad_norm": 1.6160869598388672, + "learning_rate": 4.3898285610276056e-05, + "loss": 1.3427, + "step": 15190 + }, + { + "epoch": 4.55, + "grad_norm": 2.14302921295166, + "learning_rate": 4.389443887702252e-05, + "loss": 1.3329, + "step": 15195 + }, + { + "epoch": 4.55, + "grad_norm": 1.1593176126480103, + "learning_rate": 4.3890591100248773e-05, + "loss": 1.2283, + "step": 15200 + }, + { + "epoch": 4.55, + "grad_norm": 1.1334714889526367, + "learning_rate": 4.388674228016731e-05, + "loss": 1.2044, + "step": 15205 + }, + { + "epoch": 4.55, + "grad_norm": 1.1369993686676025, + "learning_rate": 4.38828924169907e-05, + "loss": 1.1575, + "step": 15210 + }, + { + "epoch": 4.55, + "grad_norm": 1.315269112586975, + "learning_rate": 4.387904151093157e-05, + "loss": 1.3368, + "step": 15215 + }, + { + "epoch": 4.55, + "grad_norm": 1.833358883857727, + "learning_rate": 4.38751895622026e-05, + "loss": 1.2118, + "step": 15220 + }, + { + "epoch": 4.56, + "grad_norm": 2.8720498085021973, + "learning_rate": 4.387133657101654e-05, + "loss": 1.2441, + "step": 15225 + }, + { + "epoch": 4.56, + "grad_norm": 2.276634931564331, + "learning_rate": 4.386748253758617e-05, + "loss": 1.1128, + "step": 15230 + }, + { + "epoch": 4.56, + "grad_norm": 1.893906831741333, + "learning_rate": 4.386362746212435e-05, + "loss": 1.2714, + "step": 15235 + }, + { + "epoch": 4.56, + "grad_norm": 2.4002392292022705, + "learning_rate": 4.385977134484399e-05, + "loss": 1.2763, + "step": 15240 + }, + { + "epoch": 4.56, + "grad_norm": 1.7395859956741333, + "learning_rate": 4.385591418595807e-05, + "loss": 1.2394, + "step": 15245 + }, + { + "epoch": 4.56, + "grad_norm": 2.103240966796875, + "learning_rate": 4.38520559856796e-05, + "loss": 1.2259, + "step": 15250 + }, + { + "epoch": 4.56, + "grad_norm": 1.817021369934082, + "learning_rate": 4.384819674422168e-05, + "loss": 1.4099, + "step": 15255 + }, + { + "epoch": 4.57, + "grad_norm": 3.440264940261841, + "learning_rate": 4.3844336461797445e-05, + "loss": 1.1517, + "step": 15260 + }, + { + "epoch": 4.57, + "grad_norm": 2.334442615509033, + "learning_rate": 4.384047513862009e-05, + "loss": 1.1874, + "step": 15265 + }, + { + "epoch": 4.57, + "grad_norm": 1.8950352668762207, + "learning_rate": 4.383661277490289e-05, + "loss": 1.2903, + "step": 15270 + }, + { + "epoch": 4.57, + "grad_norm": 1.6918821334838867, + "learning_rate": 4.3832749370859124e-05, + "loss": 1.1923, + "step": 15275 + }, + { + "epoch": 4.57, + "grad_norm": 2.603799819946289, + "learning_rate": 4.38288849267022e-05, + "loss": 1.3905, + "step": 15280 + }, + { + "epoch": 4.57, + "grad_norm": 1.2318238019943237, + "learning_rate": 4.3825019442645534e-05, + "loss": 1.2528, + "step": 15285 + }, + { + "epoch": 4.57, + "grad_norm": 2.4402077198028564, + "learning_rate": 4.382115291890261e-05, + "loss": 1.2203, + "step": 15290 + }, + { + "epoch": 4.58, + "grad_norm": 1.929358720779419, + "learning_rate": 4.3817285355686973e-05, + "loss": 1.2942, + "step": 15295 + }, + { + "epoch": 4.58, + "grad_norm": 2.6186788082122803, + "learning_rate": 4.3813416753212224e-05, + "loss": 1.1942, + "step": 15300 + }, + { + "epoch": 4.58, + "grad_norm": 3.274463653564453, + "learning_rate": 4.3809547111692024e-05, + "loss": 1.3078, + "step": 15305 + }, + { + "epoch": 4.58, + "grad_norm": 3.3684141635894775, + "learning_rate": 4.380567643134009e-05, + "loss": 1.2054, + "step": 15310 + }, + { + "epoch": 4.58, + "grad_norm": 0.9591112732887268, + "learning_rate": 4.380180471237019e-05, + "loss": 1.1912, + "step": 15315 + }, + { + "epoch": 4.58, + "grad_norm": 3.8308017253875732, + "learning_rate": 4.379793195499616e-05, + "loss": 1.2198, + "step": 15320 + }, + { + "epoch": 4.59, + "grad_norm": 1.5418450832366943, + "learning_rate": 4.3794058159431895e-05, + "loss": 1.3401, + "step": 15325 + }, + { + "epoch": 4.59, + "grad_norm": 1.2798864841461182, + "learning_rate": 4.379018332589132e-05, + "loss": 1.0988, + "step": 15330 + }, + { + "epoch": 4.59, + "grad_norm": 1.1847316026687622, + "learning_rate": 4.378630745458846e-05, + "loss": 1.378, + "step": 15335 + }, + { + "epoch": 4.59, + "grad_norm": 2.026315450668335, + "learning_rate": 4.3782430545737377e-05, + "loss": 1.267, + "step": 15340 + }, + { + "epoch": 4.59, + "grad_norm": 1.8548667430877686, + "learning_rate": 4.3778552599552156e-05, + "loss": 1.2185, + "step": 15345 + }, + { + "epoch": 4.59, + "grad_norm": 1.9220659732818604, + "learning_rate": 4.377467361624701e-05, + "loss": 1.2383, + "step": 15350 + }, + { + "epoch": 4.59, + "grad_norm": 1.5379008054733276, + "learning_rate": 4.3770793596036145e-05, + "loss": 1.2225, + "step": 15355 + }, + { + "epoch": 4.6, + "grad_norm": 1.2834293842315674, + "learning_rate": 4.3766912539133864e-05, + "loss": 1.1713, + "step": 15360 + }, + { + "epoch": 4.6, + "grad_norm": 1.6671650409698486, + "learning_rate": 4.3763030445754514e-05, + "loss": 1.3282, + "step": 15365 + }, + { + "epoch": 4.6, + "grad_norm": 1.0815520286560059, + "learning_rate": 4.37591473161125e-05, + "loss": 1.3631, + "step": 15370 + }, + { + "epoch": 4.6, + "grad_norm": 2.721876859664917, + "learning_rate": 4.375526315042227e-05, + "loss": 1.1959, + "step": 15375 + }, + { + "epoch": 4.6, + "grad_norm": 1.370292067527771, + "learning_rate": 4.3751377948898356e-05, + "loss": 1.2007, + "step": 15380 + }, + { + "epoch": 4.6, + "grad_norm": 2.0268495082855225, + "learning_rate": 4.374749171175533e-05, + "loss": 1.243, + "step": 15385 + }, + { + "epoch": 4.6, + "grad_norm": 1.4112547636032104, + "learning_rate": 4.3743604439207817e-05, + "loss": 1.1799, + "step": 15390 + }, + { + "epoch": 4.61, + "grad_norm": 1.6107220649719238, + "learning_rate": 4.373971613147051e-05, + "loss": 1.3369, + "step": 15395 + }, + { + "epoch": 4.61, + "grad_norm": 2.6075730323791504, + "learning_rate": 4.373582678875817e-05, + "loss": 1.2638, + "step": 15400 + }, + { + "epoch": 4.61, + "grad_norm": 1.0856952667236328, + "learning_rate": 4.373193641128559e-05, + "loss": 1.2009, + "step": 15405 + }, + { + "epoch": 4.61, + "grad_norm": 2.2727081775665283, + "learning_rate": 4.372804499926762e-05, + "loss": 1.2948, + "step": 15410 + }, + { + "epoch": 4.61, + "grad_norm": 1.7927814722061157, + "learning_rate": 4.3724152552919214e-05, + "loss": 1.1401, + "step": 15415 + }, + { + "epoch": 4.61, + "grad_norm": 2.271836996078491, + "learning_rate": 4.372025907245532e-05, + "loss": 1.2022, + "step": 15420 + }, + { + "epoch": 4.61, + "grad_norm": 2.37410306930542, + "learning_rate": 4.371636455809096e-05, + "loss": 1.2243, + "step": 15425 + }, + { + "epoch": 4.62, + "grad_norm": 1.7014080286026, + "learning_rate": 4.371246901004125e-05, + "loss": 1.2851, + "step": 15430 + }, + { + "epoch": 4.62, + "grad_norm": 2.186640977859497, + "learning_rate": 4.3708572428521334e-05, + "loss": 1.4657, + "step": 15435 + }, + { + "epoch": 4.62, + "grad_norm": 2.204148769378662, + "learning_rate": 4.370467481374639e-05, + "loss": 1.1876, + "step": 15440 + }, + { + "epoch": 4.62, + "grad_norm": 1.9742987155914307, + "learning_rate": 4.3700776165931716e-05, + "loss": 1.2819, + "step": 15445 + }, + { + "epoch": 4.62, + "grad_norm": 1.3668123483657837, + "learning_rate": 4.369687648529261e-05, + "loss": 1.2395, + "step": 15450 + }, + { + "epoch": 4.62, + "grad_norm": 3.831627368927002, + "learning_rate": 4.3692975772044444e-05, + "loss": 1.1898, + "step": 15455 + }, + { + "epoch": 4.63, + "grad_norm": 1.8598977327346802, + "learning_rate": 4.3689074026402665e-05, + "loss": 1.2271, + "step": 15460 + }, + { + "epoch": 4.63, + "grad_norm": 1.0298603773117065, + "learning_rate": 4.3685171248582744e-05, + "loss": 1.2812, + "step": 15465 + }, + { + "epoch": 4.63, + "grad_norm": 2.9451205730438232, + "learning_rate": 4.368126743880024e-05, + "loss": 1.2084, + "step": 15470 + }, + { + "epoch": 4.63, + "grad_norm": 1.8287848234176636, + "learning_rate": 4.367736259727076e-05, + "loss": 1.2462, + "step": 15475 + }, + { + "epoch": 4.63, + "grad_norm": 1.9537088871002197, + "learning_rate": 4.367345672420995e-05, + "loss": 1.2031, + "step": 15480 + }, + { + "epoch": 4.63, + "grad_norm": 2.9025750160217285, + "learning_rate": 4.366954981983354e-05, + "loss": 1.4563, + "step": 15485 + }, + { + "epoch": 4.63, + "grad_norm": 1.587408185005188, + "learning_rate": 4.3665641884357294e-05, + "loss": 1.1538, + "step": 15490 + }, + { + "epoch": 4.64, + "grad_norm": 1.1759958267211914, + "learning_rate": 4.366173291799705e-05, + "loss": 1.2645, + "step": 15495 + }, + { + "epoch": 4.64, + "grad_norm": 4.82040548324585, + "learning_rate": 4.3657822920968706e-05, + "loss": 1.1618, + "step": 15500 + }, + { + "epoch": 4.64, + "grad_norm": 0.7620186805725098, + "learning_rate": 4.365391189348818e-05, + "loss": 1.3226, + "step": 15505 + }, + { + "epoch": 4.64, + "grad_norm": 1.329471230506897, + "learning_rate": 4.3649999835771496e-05, + "loss": 1.2517, + "step": 15510 + }, + { + "epoch": 4.64, + "grad_norm": 1.040663480758667, + "learning_rate": 4.364608674803471e-05, + "loss": 1.2898, + "step": 15515 + }, + { + "epoch": 4.64, + "grad_norm": 2.1228902339935303, + "learning_rate": 4.3642172630493925e-05, + "loss": 1.2581, + "step": 15520 + }, + { + "epoch": 4.64, + "grad_norm": 1.2862385511398315, + "learning_rate": 4.3638257483365336e-05, + "loss": 1.2414, + "step": 15525 + }, + { + "epoch": 4.65, + "grad_norm": 1.5629061460494995, + "learning_rate": 4.363434130686515e-05, + "loss": 1.201, + "step": 15530 + }, + { + "epoch": 4.65, + "grad_norm": 1.894118309020996, + "learning_rate": 4.3630424101209663e-05, + "loss": 1.2576, + "step": 15535 + }, + { + "epoch": 4.65, + "grad_norm": 4.182260513305664, + "learning_rate": 4.3626505866615224e-05, + "loss": 1.1643, + "step": 15540 + }, + { + "epoch": 4.65, + "grad_norm": 2.6900877952575684, + "learning_rate": 4.362258660329822e-05, + "loss": 1.1906, + "step": 15545 + }, + { + "epoch": 4.65, + "grad_norm": 9.078900337219238, + "learning_rate": 4.361866631147512e-05, + "loss": 1.2331, + "step": 15550 + }, + { + "epoch": 4.65, + "grad_norm": 3.6981396675109863, + "learning_rate": 4.361474499136243e-05, + "loss": 1.3355, + "step": 15555 + }, + { + "epoch": 4.66, + "grad_norm": 1.7250741720199585, + "learning_rate": 4.361082264317673e-05, + "loss": 1.3161, + "step": 15560 + }, + { + "epoch": 4.66, + "grad_norm": 2.3266186714172363, + "learning_rate": 4.360689926713464e-05, + "loss": 1.3587, + "step": 15565 + }, + { + "epoch": 4.66, + "grad_norm": 1.8503769636154175, + "learning_rate": 4.3602974863452835e-05, + "loss": 1.3439, + "step": 15570 + }, + { + "epoch": 4.66, + "grad_norm": 2.670262098312378, + "learning_rate": 4.3599049432348074e-05, + "loss": 1.2923, + "step": 15575 + }, + { + "epoch": 4.66, + "grad_norm": 1.2793365716934204, + "learning_rate": 4.359512297403714e-05, + "loss": 1.4055, + "step": 15580 + }, + { + "epoch": 4.66, + "grad_norm": 1.302943468093872, + "learning_rate": 4.359119548873689e-05, + "loss": 1.2158, + "step": 15585 + }, + { + "epoch": 4.66, + "grad_norm": 1.2726792097091675, + "learning_rate": 4.358726697666424e-05, + "loss": 1.2166, + "step": 15590 + }, + { + "epoch": 4.67, + "grad_norm": 2.570844888687134, + "learning_rate": 4.358333743803616e-05, + "loss": 1.1484, + "step": 15595 + }, + { + "epoch": 4.67, + "grad_norm": 2.5889105796813965, + "learning_rate": 4.3579406873069664e-05, + "loss": 1.31, + "step": 15600 + }, + { + "epoch": 4.67, + "grad_norm": 2.399143695831299, + "learning_rate": 4.3575475281981844e-05, + "loss": 1.2022, + "step": 15605 + }, + { + "epoch": 4.67, + "grad_norm": 0.9160411357879639, + "learning_rate": 4.357154266498983e-05, + "loss": 1.2351, + "step": 15610 + }, + { + "epoch": 4.67, + "grad_norm": 2.3706166744232178, + "learning_rate": 4.356760902231082e-05, + "loss": 1.3131, + "step": 15615 + }, + { + "epoch": 4.67, + "grad_norm": 3.5267927646636963, + "learning_rate": 4.356367435416205e-05, + "loss": 1.4429, + "step": 15620 + }, + { + "epoch": 4.67, + "grad_norm": 2.2916717529296875, + "learning_rate": 4.3559738660760854e-05, + "loss": 1.4017, + "step": 15625 + }, + { + "epoch": 4.68, + "grad_norm": 1.031050205230713, + "learning_rate": 4.3555801942324584e-05, + "loss": 1.4196, + "step": 15630 + }, + { + "epoch": 4.68, + "grad_norm": 1.4319988489151, + "learning_rate": 4.355186419907066e-05, + "loss": 1.1985, + "step": 15635 + }, + { + "epoch": 4.68, + "grad_norm": 5.716891765594482, + "learning_rate": 4.354792543121656e-05, + "loss": 1.2971, + "step": 15640 + }, + { + "epoch": 4.68, + "grad_norm": 2.955399751663208, + "learning_rate": 4.3543985638979816e-05, + "loss": 1.3152, + "step": 15645 + }, + { + "epoch": 4.68, + "grad_norm": 3.042884349822998, + "learning_rate": 4.354004482257802e-05, + "loss": 1.2771, + "step": 15650 + }, + { + "epoch": 4.68, + "grad_norm": 1.6653623580932617, + "learning_rate": 4.353610298222882e-05, + "loss": 1.3533, + "step": 15655 + }, + { + "epoch": 4.69, + "grad_norm": 1.1561282873153687, + "learning_rate": 4.353216011814992e-05, + "loss": 1.2868, + "step": 15660 + }, + { + "epoch": 4.69, + "grad_norm": 1.7067124843597412, + "learning_rate": 4.352821623055908e-05, + "loss": 1.2304, + "step": 15665 + }, + { + "epoch": 4.69, + "grad_norm": 1.450724720954895, + "learning_rate": 4.352427131967412e-05, + "loss": 1.1762, + "step": 15670 + }, + { + "epoch": 4.69, + "grad_norm": 2.304478406906128, + "learning_rate": 4.3520325385712904e-05, + "loss": 1.1929, + "step": 15675 + }, + { + "epoch": 4.69, + "grad_norm": 1.6809667348861694, + "learning_rate": 4.3516378428893366e-05, + "loss": 1.238, + "step": 15680 + }, + { + "epoch": 4.69, + "grad_norm": 1.2087801694869995, + "learning_rate": 4.35124304494335e-05, + "loss": 1.2465, + "step": 15685 + }, + { + "epoch": 4.69, + "grad_norm": 1.7735761404037476, + "learning_rate": 4.350848144755134e-05, + "loss": 1.2547, + "step": 15690 + }, + { + "epoch": 4.7, + "grad_norm": 2.593214988708496, + "learning_rate": 4.3504531423464996e-05, + "loss": 1.2919, + "step": 15695 + }, + { + "epoch": 4.7, + "grad_norm": 2.0241143703460693, + "learning_rate": 4.350058037739261e-05, + "loss": 1.3598, + "step": 15700 + }, + { + "epoch": 4.7, + "grad_norm": 1.4365257024765015, + "learning_rate": 4.3496628309552395e-05, + "loss": 1.2709, + "step": 15705 + }, + { + "epoch": 4.7, + "grad_norm": 1.55208420753479, + "learning_rate": 4.349267522016263e-05, + "loss": 1.2687, + "step": 15710 + }, + { + "epoch": 4.7, + "grad_norm": 1.5196239948272705, + "learning_rate": 4.348872110944163e-05, + "loss": 1.3805, + "step": 15715 + }, + { + "epoch": 4.7, + "grad_norm": 2.587491750717163, + "learning_rate": 4.348476597760779e-05, + "loss": 1.1504, + "step": 15720 + }, + { + "epoch": 4.7, + "grad_norm": 1.250502586364746, + "learning_rate": 4.348080982487953e-05, + "loss": 1.351, + "step": 15725 + }, + { + "epoch": 4.71, + "grad_norm": 1.627832055091858, + "learning_rate": 4.347685265147536e-05, + "loss": 1.2934, + "step": 15730 + }, + { + "epoch": 4.71, + "grad_norm": 1.428882360458374, + "learning_rate": 4.347289445761382e-05, + "loss": 1.3566, + "step": 15735 + }, + { + "epoch": 4.71, + "grad_norm": 1.8175216913223267, + "learning_rate": 4.346893524351352e-05, + "loss": 1.3705, + "step": 15740 + }, + { + "epoch": 4.71, + "grad_norm": 2.059349536895752, + "learning_rate": 4.3464975009393124e-05, + "loss": 1.2061, + "step": 15745 + }, + { + "epoch": 4.71, + "grad_norm": 2.046034336090088, + "learning_rate": 4.3461013755471354e-05, + "loss": 1.0747, + "step": 15750 + }, + { + "epoch": 4.71, + "grad_norm": 1.8597091436386108, + "learning_rate": 4.345705148196698e-05, + "loss": 1.2378, + "step": 15755 + }, + { + "epoch": 4.72, + "grad_norm": 1.176646113395691, + "learning_rate": 4.345308818909884e-05, + "loss": 1.4726, + "step": 15760 + }, + { + "epoch": 4.72, + "grad_norm": 3.254624605178833, + "learning_rate": 4.344912387708582e-05, + "loss": 1.2737, + "step": 15765 + }, + { + "epoch": 4.72, + "grad_norm": 1.0571980476379395, + "learning_rate": 4.344515854614686e-05, + "loss": 1.1995, + "step": 15770 + }, + { + "epoch": 4.72, + "grad_norm": 2.313420057296753, + "learning_rate": 4.3441192196500976e-05, + "loss": 1.1803, + "step": 15775 + }, + { + "epoch": 4.72, + "grad_norm": 1.746813416481018, + "learning_rate": 4.3437224828367205e-05, + "loss": 1.3353, + "step": 15780 + }, + { + "epoch": 4.72, + "grad_norm": 1.8490636348724365, + "learning_rate": 4.343325644196468e-05, + "loss": 1.1052, + "step": 15785 + }, + { + "epoch": 4.72, + "grad_norm": 1.9969300031661987, + "learning_rate": 4.342928703751256e-05, + "loss": 1.1735, + "step": 15790 + }, + { + "epoch": 4.73, + "grad_norm": 1.4050122499465942, + "learning_rate": 4.342531661523007e-05, + "loss": 1.1179, + "step": 15795 + }, + { + "epoch": 4.73, + "grad_norm": 1.8106729984283447, + "learning_rate": 4.34213451753365e-05, + "loss": 1.4247, + "step": 15800 + }, + { + "epoch": 4.73, + "grad_norm": 1.9320409297943115, + "learning_rate": 4.341737271805118e-05, + "loss": 1.2531, + "step": 15805 + }, + { + "epoch": 4.73, + "grad_norm": 2.287996292114258, + "learning_rate": 4.3413399243593513e-05, + "loss": 1.3139, + "step": 15810 + }, + { + "epoch": 4.73, + "grad_norm": 0.8202207088470459, + "learning_rate": 4.3409424752182934e-05, + "loss": 1.1417, + "step": 15815 + }, + { + "epoch": 4.73, + "grad_norm": 1.7704706192016602, + "learning_rate": 4.340544924403898e-05, + "loss": 1.307, + "step": 15820 + }, + { + "epoch": 4.73, + "grad_norm": 2.2683053016662598, + "learning_rate": 4.340147271938118e-05, + "loss": 1.3663, + "step": 15825 + }, + { + "epoch": 4.74, + "grad_norm": 2.761477470397949, + "learning_rate": 4.339749517842917e-05, + "loss": 1.3196, + "step": 15830 + }, + { + "epoch": 4.74, + "grad_norm": 2.3123843669891357, + "learning_rate": 4.339351662140263e-05, + "loss": 1.1369, + "step": 15835 + }, + { + "epoch": 4.74, + "grad_norm": 3.907461166381836, + "learning_rate": 4.338953704852128e-05, + "loss": 1.2576, + "step": 15840 + }, + { + "epoch": 4.74, + "grad_norm": 1.5930169820785522, + "learning_rate": 4.3385556460004925e-05, + "loss": 1.2911, + "step": 15845 + }, + { + "epoch": 4.74, + "grad_norm": 3.523927688598633, + "learning_rate": 4.338157485607339e-05, + "loss": 1.1452, + "step": 15850 + }, + { + "epoch": 4.74, + "grad_norm": 1.7907129526138306, + "learning_rate": 4.3377592236946585e-05, + "loss": 1.3091, + "step": 15855 + }, + { + "epoch": 4.75, + "grad_norm": 1.3438159227371216, + "learning_rate": 4.337360860284446e-05, + "loss": 1.2201, + "step": 15860 + }, + { + "epoch": 4.75, + "grad_norm": 3.469675064086914, + "learning_rate": 4.336962395398703e-05, + "loss": 1.147, + "step": 15865 + }, + { + "epoch": 4.75, + "grad_norm": 1.553625464439392, + "learning_rate": 4.3365638290594366e-05, + "loss": 1.4353, + "step": 15870 + }, + { + "epoch": 4.75, + "grad_norm": 2.0750131607055664, + "learning_rate": 4.336165161288659e-05, + "loss": 1.2967, + "step": 15875 + }, + { + "epoch": 4.75, + "grad_norm": 2.1773085594177246, + "learning_rate": 4.335766392108387e-05, + "loss": 1.2372, + "step": 15880 + }, + { + "epoch": 4.75, + "grad_norm": 2.4969215393066406, + "learning_rate": 4.3353675215406465e-05, + "loss": 1.2196, + "step": 15885 + }, + { + "epoch": 4.75, + "grad_norm": 1.2031995058059692, + "learning_rate": 4.334968549607465e-05, + "loss": 1.1849, + "step": 15890 + }, + { + "epoch": 4.76, + "grad_norm": 2.3058178424835205, + "learning_rate": 4.3345694763308786e-05, + "loss": 1.2714, + "step": 15895 + }, + { + "epoch": 4.76, + "grad_norm": 1.7053008079528809, + "learning_rate": 4.334170301732926e-05, + "loss": 1.2352, + "step": 15900 + }, + { + "epoch": 4.76, + "grad_norm": 2.5460364818573, + "learning_rate": 4.333771025835655e-05, + "loss": 1.1808, + "step": 15905 + }, + { + "epoch": 4.76, + "grad_norm": 1.2482750415802002, + "learning_rate": 4.333371648661116e-05, + "loss": 1.2871, + "step": 15910 + }, + { + "epoch": 4.76, + "grad_norm": 2.138806104660034, + "learning_rate": 4.3329721702313665e-05, + "loss": 1.3245, + "step": 15915 + }, + { + "epoch": 4.76, + "grad_norm": 1.7979081869125366, + "learning_rate": 4.332572590568469e-05, + "loss": 1.2463, + "step": 15920 + }, + { + "epoch": 4.76, + "grad_norm": 0.921890914440155, + "learning_rate": 4.332172909694493e-05, + "loss": 1.34, + "step": 15925 + }, + { + "epoch": 4.77, + "grad_norm": 2.6406264305114746, + "learning_rate": 4.331773127631511e-05, + "loss": 1.195, + "step": 15930 + }, + { + "epoch": 4.77, + "grad_norm": 2.4825634956359863, + "learning_rate": 4.331373244401603e-05, + "loss": 1.144, + "step": 15935 + }, + { + "epoch": 4.77, + "grad_norm": 1.4051223993301392, + "learning_rate": 4.330973260026855e-05, + "loss": 1.3728, + "step": 15940 + }, + { + "epoch": 4.77, + "grad_norm": 1.1359541416168213, + "learning_rate": 4.330573174529356e-05, + "loss": 1.3258, + "step": 15945 + }, + { + "epoch": 4.77, + "grad_norm": 1.348656177520752, + "learning_rate": 4.330172987931205e-05, + "loss": 1.251, + "step": 15950 + }, + { + "epoch": 4.77, + "grad_norm": 2.296485185623169, + "learning_rate": 4.3297727002545006e-05, + "loss": 1.1964, + "step": 15955 + }, + { + "epoch": 4.78, + "grad_norm": 1.3783894777297974, + "learning_rate": 4.329372311521353e-05, + "loss": 1.261, + "step": 15960 + }, + { + "epoch": 4.78, + "grad_norm": 3.1686336994171143, + "learning_rate": 4.328971821753873e-05, + "loss": 1.31, + "step": 15965 + }, + { + "epoch": 4.78, + "grad_norm": 2.1778533458709717, + "learning_rate": 4.328571230974181e-05, + "loss": 1.1521, + "step": 15970 + }, + { + "epoch": 4.78, + "grad_norm": 1.734237790107727, + "learning_rate": 4.328170539204401e-05, + "loss": 1.392, + "step": 15975 + }, + { + "epoch": 4.78, + "grad_norm": 2.2675347328186035, + "learning_rate": 4.327769746466662e-05, + "loss": 1.3108, + "step": 15980 + }, + { + "epoch": 4.78, + "grad_norm": 1.0224782228469849, + "learning_rate": 4.3273688527831e-05, + "loss": 1.2305, + "step": 15985 + }, + { + "epoch": 4.78, + "grad_norm": 2.3099098205566406, + "learning_rate": 4.3269678581758556e-05, + "loss": 1.1456, + "step": 15990 + }, + { + "epoch": 4.79, + "grad_norm": 3.697842836380005, + "learning_rate": 4.326566762667075e-05, + "loss": 1.161, + "step": 15995 + }, + { + "epoch": 4.79, + "grad_norm": 1.7592895030975342, + "learning_rate": 4.3261655662789115e-05, + "loss": 1.3616, + "step": 16000 + }, + { + "epoch": 4.79, + "grad_norm": 2.622274160385132, + "learning_rate": 4.325764269033521e-05, + "loss": 1.2472, + "step": 16005 + }, + { + "epoch": 4.79, + "grad_norm": 1.370152473449707, + "learning_rate": 4.325362870953069e-05, + "loss": 1.2645, + "step": 16010 + }, + { + "epoch": 4.79, + "grad_norm": 1.3675168752670288, + "learning_rate": 4.324961372059722e-05, + "loss": 1.2626, + "step": 16015 + }, + { + "epoch": 4.79, + "grad_norm": 2.024503231048584, + "learning_rate": 4.324559772375656e-05, + "loss": 1.3285, + "step": 16020 + }, + { + "epoch": 4.79, + "grad_norm": 1.5145565271377563, + "learning_rate": 4.32415807192305e-05, + "loss": 1.287, + "step": 16025 + }, + { + "epoch": 4.8, + "grad_norm": 3.959343671798706, + "learning_rate": 4.3237562707240897e-05, + "loss": 1.2395, + "step": 16030 + }, + { + "epoch": 4.8, + "grad_norm": 1.6277161836624146, + "learning_rate": 4.3233543688009657e-05, + "loss": 1.1997, + "step": 16035 + }, + { + "epoch": 4.8, + "grad_norm": 1.912680983543396, + "learning_rate": 4.322952366175876e-05, + "loss": 1.2068, + "step": 16040 + }, + { + "epoch": 4.8, + "grad_norm": 3.4466264247894287, + "learning_rate": 4.322550262871021e-05, + "loss": 1.2512, + "step": 16045 + }, + { + "epoch": 4.8, + "grad_norm": 1.7129842042922974, + "learning_rate": 4.3221480589086104e-05, + "loss": 1.1405, + "step": 16050 + }, + { + "epoch": 4.8, + "grad_norm": 1.4409843683242798, + "learning_rate": 4.321745754310856e-05, + "loss": 1.3464, + "step": 16055 + }, + { + "epoch": 4.8, + "grad_norm": 1.8235528469085693, + "learning_rate": 4.321343349099978e-05, + "loss": 1.2919, + "step": 16060 + }, + { + "epoch": 4.81, + "grad_norm": 1.156563639640808, + "learning_rate": 4.3210213525047607e-05, + "loss": 1.2472, + "step": 16065 + }, + { + "epoch": 4.81, + "grad_norm": 2.0325233936309814, + "learning_rate": 4.320618766246267e-05, + "loss": 1.3308, + "step": 16070 + }, + { + "epoch": 4.81, + "grad_norm": 1.3709142208099365, + "learning_rate": 4.320216079436892e-05, + "loss": 1.3726, + "step": 16075 + }, + { + "epoch": 4.81, + "grad_norm": 2.0729141235351562, + "learning_rate": 4.3198132920988746e-05, + "loss": 1.2768, + "step": 16080 + }, + { + "epoch": 4.81, + "grad_norm": 2.903198480606079, + "learning_rate": 4.3194104042544606e-05, + "loss": 1.1628, + "step": 16085 + }, + { + "epoch": 4.81, + "grad_norm": 2.467470645904541, + "learning_rate": 4.319007415925901e-05, + "loss": 1.2066, + "step": 16090 + }, + { + "epoch": 4.82, + "grad_norm": 1.2951477766036987, + "learning_rate": 4.3186043271354534e-05, + "loss": 1.3537, + "step": 16095 + }, + { + "epoch": 4.82, + "grad_norm": 1.79331374168396, + "learning_rate": 4.318201137905379e-05, + "loss": 1.1477, + "step": 16100 + }, + { + "epoch": 4.82, + "grad_norm": 2.5787532329559326, + "learning_rate": 4.317797848257945e-05, + "loss": 1.2367, + "step": 16105 + }, + { + "epoch": 4.82, + "grad_norm": 2.574690580368042, + "learning_rate": 4.3173944582154256e-05, + "loss": 1.2734, + "step": 16110 + }, + { + "epoch": 4.82, + "grad_norm": 3.0416769981384277, + "learning_rate": 4.316990967800099e-05, + "loss": 1.2573, + "step": 16115 + }, + { + "epoch": 4.82, + "grad_norm": 1.79559326171875, + "learning_rate": 4.31658737703425e-05, + "loss": 1.1909, + "step": 16120 + }, + { + "epoch": 4.82, + "grad_norm": 3.641195774078369, + "learning_rate": 4.31618368594017e-05, + "loss": 1.2127, + "step": 16125 + }, + { + "epoch": 4.83, + "grad_norm": 2.858097553253174, + "learning_rate": 4.315779894540151e-05, + "loss": 1.1407, + "step": 16130 + }, + { + "epoch": 4.83, + "grad_norm": 2.4541454315185547, + "learning_rate": 4.315376002856496e-05, + "loss": 1.2637, + "step": 16135 + }, + { + "epoch": 4.83, + "grad_norm": 4.378109455108643, + "learning_rate": 4.3149720109115116e-05, + "loss": 1.3395, + "step": 16140 + }, + { + "epoch": 4.83, + "grad_norm": 4.454669952392578, + "learning_rate": 4.31456791872751e-05, + "loss": 1.2609, + "step": 16145 + }, + { + "epoch": 4.83, + "grad_norm": 0.9385771751403809, + "learning_rate": 4.3141637263268074e-05, + "loss": 1.4461, + "step": 16150 + }, + { + "epoch": 4.83, + "grad_norm": 4.431612014770508, + "learning_rate": 4.313759433731729e-05, + "loss": 1.2072, + "step": 16155 + }, + { + "epoch": 4.83, + "grad_norm": 3.2460215091705322, + "learning_rate": 4.313355040964602e-05, + "loss": 1.3532, + "step": 16160 + }, + { + "epoch": 4.84, + "grad_norm": 1.6554932594299316, + "learning_rate": 4.312950548047761e-05, + "loss": 1.4051, + "step": 16165 + }, + { + "epoch": 4.84, + "grad_norm": 1.5239330530166626, + "learning_rate": 4.3125459550035454e-05, + "loss": 1.2129, + "step": 16170 + }, + { + "epoch": 4.84, + "grad_norm": 1.9607256650924683, + "learning_rate": 4.3121412618543014e-05, + "loss": 1.1648, + "step": 16175 + }, + { + "epoch": 4.84, + "grad_norm": 1.0055090188980103, + "learning_rate": 4.311736468622378e-05, + "loss": 1.398, + "step": 16180 + }, + { + "epoch": 4.84, + "grad_norm": 1.6140061616897583, + "learning_rate": 4.311331575330134e-05, + "loss": 1.3406, + "step": 16185 + }, + { + "epoch": 4.84, + "grad_norm": 8.580751419067383, + "learning_rate": 4.310926581999929e-05, + "loss": 1.285, + "step": 16190 + }, + { + "epoch": 4.85, + "grad_norm": 2.2242555618286133, + "learning_rate": 4.3105214886541315e-05, + "loss": 1.1491, + "step": 16195 + }, + { + "epoch": 4.85, + "grad_norm": 4.345371246337891, + "learning_rate": 4.310116295315114e-05, + "loss": 1.1643, + "step": 16200 + }, + { + "epoch": 4.85, + "grad_norm": 3.590310573577881, + "learning_rate": 4.309711002005255e-05, + "loss": 1.2797, + "step": 16205 + }, + { + "epoch": 4.85, + "grad_norm": 1.9311707019805908, + "learning_rate": 4.309305608746939e-05, + "loss": 1.287, + "step": 16210 + }, + { + "epoch": 4.85, + "grad_norm": 2.0158779621124268, + "learning_rate": 4.308900115562554e-05, + "loss": 1.222, + "step": 16215 + }, + { + "epoch": 4.85, + "grad_norm": 0.8616341352462769, + "learning_rate": 4.308494522474496e-05, + "loss": 1.2951, + "step": 16220 + }, + { + "epoch": 4.85, + "grad_norm": 1.497802495956421, + "learning_rate": 4.308088829505166e-05, + "loss": 1.1485, + "step": 16225 + }, + { + "epoch": 4.86, + "grad_norm": 1.7404403686523438, + "learning_rate": 4.307683036676968e-05, + "loss": 1.2864, + "step": 16230 + }, + { + "epoch": 4.86, + "grad_norm": 3.3228366374969482, + "learning_rate": 4.3072771440123164e-05, + "loss": 1.2396, + "step": 16235 + }, + { + "epoch": 4.86, + "grad_norm": 1.6669496297836304, + "learning_rate": 4.3068711515336254e-05, + "loss": 1.1937, + "step": 16240 + }, + { + "epoch": 4.86, + "grad_norm": 2.0878639221191406, + "learning_rate": 4.3064650592633206e-05, + "loss": 1.2871, + "step": 16245 + }, + { + "epoch": 4.86, + "grad_norm": 1.9474278688430786, + "learning_rate": 4.3060588672238266e-05, + "loss": 1.264, + "step": 16250 + }, + { + "epoch": 4.86, + "grad_norm": 1.4628163576126099, + "learning_rate": 4.30565257543758e-05, + "loss": 1.2895, + "step": 16255 + }, + { + "epoch": 4.86, + "grad_norm": 2.5341408252716064, + "learning_rate": 4.3052461839270176e-05, + "loss": 1.2429, + "step": 16260 + }, + { + "epoch": 4.87, + "grad_norm": 2.7245945930480957, + "learning_rate": 4.3048396927145854e-05, + "loss": 1.3481, + "step": 16265 + }, + { + "epoch": 4.87, + "grad_norm": 1.005662202835083, + "learning_rate": 4.3044331018227324e-05, + "loss": 1.1808, + "step": 16270 + }, + { + "epoch": 4.87, + "grad_norm": 0.959071695804596, + "learning_rate": 4.304026411273915e-05, + "loss": 1.2639, + "step": 16275 + }, + { + "epoch": 4.87, + "grad_norm": 1.850095272064209, + "learning_rate": 4.303619621090594e-05, + "loss": 1.319, + "step": 16280 + }, + { + "epoch": 4.87, + "grad_norm": 2.577181816101074, + "learning_rate": 4.3032127312952367e-05, + "loss": 1.2413, + "step": 16285 + }, + { + "epoch": 4.87, + "grad_norm": 2.0503063201904297, + "learning_rate": 4.302805741910314e-05, + "loss": 1.1287, + "step": 16290 + }, + { + "epoch": 4.88, + "grad_norm": 2.769632339477539, + "learning_rate": 4.3023986529583046e-05, + "loss": 1.2283, + "step": 16295 + }, + { + "epoch": 4.88, + "grad_norm": 2.0074775218963623, + "learning_rate": 4.3019914644616904e-05, + "loss": 1.1908, + "step": 16300 + }, + { + "epoch": 4.88, + "grad_norm": 1.1852775812149048, + "learning_rate": 4.301584176442961e-05, + "loss": 1.314, + "step": 16305 + }, + { + "epoch": 4.88, + "grad_norm": 2.0226988792419434, + "learning_rate": 4.3011767889246105e-05, + "loss": 1.2051, + "step": 16310 + }, + { + "epoch": 4.88, + "grad_norm": 2.0487165451049805, + "learning_rate": 4.300769301929138e-05, + "loss": 1.1422, + "step": 16315 + }, + { + "epoch": 4.88, + "grad_norm": 2.2297685146331787, + "learning_rate": 4.300361715479049e-05, + "loss": 1.3844, + "step": 16320 + }, + { + "epoch": 4.88, + "grad_norm": 1.5562113523483276, + "learning_rate": 4.2999540295968535e-05, + "loss": 1.2164, + "step": 16325 + }, + { + "epoch": 4.89, + "grad_norm": 1.00557541847229, + "learning_rate": 4.2995462443050674e-05, + "loss": 1.1486, + "step": 16330 + }, + { + "epoch": 4.89, + "grad_norm": 1.7114874124526978, + "learning_rate": 4.299138359626213e-05, + "loss": 1.3049, + "step": 16335 + }, + { + "epoch": 4.89, + "grad_norm": 1.610742449760437, + "learning_rate": 4.2987303755828176e-05, + "loss": 1.1456, + "step": 16340 + }, + { + "epoch": 4.89, + "grad_norm": 2.69520902633667, + "learning_rate": 4.298322292197413e-05, + "loss": 1.2903, + "step": 16345 + }, + { + "epoch": 4.89, + "grad_norm": 1.4834461212158203, + "learning_rate": 4.2979141094925365e-05, + "loss": 1.1353, + "step": 16350 + }, + { + "epoch": 4.89, + "grad_norm": 2.4924049377441406, + "learning_rate": 4.297505827490734e-05, + "loss": 1.2812, + "step": 16355 + }, + { + "epoch": 4.89, + "grad_norm": 1.1296648979187012, + "learning_rate": 4.297097446214553e-05, + "loss": 1.0921, + "step": 16360 + }, + { + "epoch": 4.9, + "grad_norm": 0.9421242475509644, + "learning_rate": 4.296688965686547e-05, + "loss": 1.1506, + "step": 16365 + }, + { + "epoch": 4.9, + "grad_norm": 1.1938999891281128, + "learning_rate": 4.2962803859292776e-05, + "loss": 1.243, + "step": 16370 + }, + { + "epoch": 4.9, + "grad_norm": 2.3676552772521973, + "learning_rate": 4.29587170696531e-05, + "loss": 1.2123, + "step": 16375 + }, + { + "epoch": 4.9, + "grad_norm": 1.4618945121765137, + "learning_rate": 4.295462928817214e-05, + "loss": 1.4101, + "step": 16380 + }, + { + "epoch": 4.9, + "grad_norm": 1.811956763267517, + "learning_rate": 4.2950540515075664e-05, + "loss": 1.2196, + "step": 16385 + }, + { + "epoch": 4.9, + "grad_norm": 2.578814744949341, + "learning_rate": 4.294645075058951e-05, + "loss": 1.3067, + "step": 16390 + }, + { + "epoch": 4.91, + "grad_norm": 2.0412240028381348, + "learning_rate": 4.294235999493952e-05, + "loss": 1.123, + "step": 16395 + }, + { + "epoch": 4.91, + "grad_norm": 1.7934796810150146, + "learning_rate": 4.293826824835164e-05, + "loss": 1.1079, + "step": 16400 + }, + { + "epoch": 4.91, + "grad_norm": 1.1003563404083252, + "learning_rate": 4.293417551105186e-05, + "loss": 1.2645, + "step": 16405 + }, + { + "epoch": 4.91, + "grad_norm": 2.166717767715454, + "learning_rate": 4.29300817832662e-05, + "loss": 1.1873, + "step": 16410 + }, + { + "epoch": 4.91, + "grad_norm": 2.064626455307007, + "learning_rate": 4.292598706522075e-05, + "loss": 1.3719, + "step": 16415 + }, + { + "epoch": 4.91, + "grad_norm": 2.102949857711792, + "learning_rate": 4.2921891357141686e-05, + "loss": 1.3074, + "step": 16420 + }, + { + "epoch": 4.91, + "grad_norm": 2.0334112644195557, + "learning_rate": 4.2917794659255183e-05, + "loss": 1.3698, + "step": 16425 + }, + { + "epoch": 4.92, + "grad_norm": 1.0431818962097168, + "learning_rate": 4.29136969717875e-05, + "loss": 1.2163, + "step": 16430 + }, + { + "epoch": 4.92, + "grad_norm": 1.496396541595459, + "learning_rate": 4.290959829496497e-05, + "loss": 1.3081, + "step": 16435 + }, + { + "epoch": 4.92, + "grad_norm": 1.6219655275344849, + "learning_rate": 4.290549862901393e-05, + "loss": 1.2083, + "step": 16440 + }, + { + "epoch": 4.92, + "grad_norm": 1.4473518133163452, + "learning_rate": 4.290139797416081e-05, + "loss": 1.1601, + "step": 16445 + }, + { + "epoch": 4.92, + "grad_norm": 1.5219734907150269, + "learning_rate": 4.28972963306321e-05, + "loss": 1.0509, + "step": 16450 + }, + { + "epoch": 4.92, + "grad_norm": 3.6221630573272705, + "learning_rate": 4.28931936986543e-05, + "loss": 1.2632, + "step": 16455 + }, + { + "epoch": 4.92, + "grad_norm": 1.292728066444397, + "learning_rate": 4.2889090078454016e-05, + "loss": 1.3043, + "step": 16460 + }, + { + "epoch": 4.93, + "grad_norm": 2.52746844291687, + "learning_rate": 4.2884985470257885e-05, + "loss": 1.2606, + "step": 16465 + }, + { + "epoch": 4.93, + "grad_norm": 2.0615930557250977, + "learning_rate": 4.288087987429261e-05, + "loss": 1.4306, + "step": 16470 + }, + { + "epoch": 4.93, + "grad_norm": 2.1873435974121094, + "learning_rate": 4.287677329078491e-05, + "loss": 1.3791, + "step": 16475 + }, + { + "epoch": 4.93, + "grad_norm": 0.9649629592895508, + "learning_rate": 4.2872665719961605e-05, + "loss": 1.2359, + "step": 16480 + }, + { + "epoch": 4.93, + "grad_norm": 2.3767576217651367, + "learning_rate": 4.2868557162049546e-05, + "loss": 1.3599, + "step": 16485 + }, + { + "epoch": 4.93, + "grad_norm": 4.670454025268555, + "learning_rate": 4.286444761727566e-05, + "loss": 1.1474, + "step": 16490 + }, + { + "epoch": 4.94, + "grad_norm": 3.1629886627197266, + "learning_rate": 4.286033708586689e-05, + "loss": 1.2123, + "step": 16495 + }, + { + "epoch": 4.94, + "grad_norm": 1.4645919799804688, + "learning_rate": 4.2856225568050266e-05, + "loss": 1.4218, + "step": 16500 + }, + { + "epoch": 4.94, + "grad_norm": 1.6503212451934814, + "learning_rate": 4.2852113064052874e-05, + "loss": 1.2461, + "step": 16505 + }, + { + "epoch": 4.94, + "grad_norm": 1.6822096109390259, + "learning_rate": 4.2847999574101826e-05, + "loss": 1.127, + "step": 16510 + }, + { + "epoch": 4.94, + "grad_norm": 1.8511792421340942, + "learning_rate": 4.284388509842432e-05, + "loss": 1.0953, + "step": 16515 + }, + { + "epoch": 4.94, + "grad_norm": 3.1232521533966064, + "learning_rate": 4.2839769637247584e-05, + "loss": 1.1931, + "step": 16520 + }, + { + "epoch": 4.94, + "grad_norm": 1.0340989828109741, + "learning_rate": 4.283565319079892e-05, + "loss": 1.2855, + "step": 16525 + }, + { + "epoch": 4.95, + "grad_norm": 1.5845731496810913, + "learning_rate": 4.2831535759305664e-05, + "loss": 1.2968, + "step": 16530 + }, + { + "epoch": 4.95, + "grad_norm": 1.5348531007766724, + "learning_rate": 4.282741734299523e-05, + "loss": 1.3528, + "step": 16535 + }, + { + "epoch": 4.95, + "grad_norm": 1.6758315563201904, + "learning_rate": 4.2823297942095054e-05, + "loss": 1.3494, + "step": 16540 + }, + { + "epoch": 4.95, + "grad_norm": 2.661583185195923, + "learning_rate": 4.2819177556832665e-05, + "loss": 1.3549, + "step": 16545 + }, + { + "epoch": 4.95, + "grad_norm": 1.3939776420593262, + "learning_rate": 4.281505618743562e-05, + "loss": 1.25, + "step": 16550 + }, + { + "epoch": 4.95, + "grad_norm": 4.4011616706848145, + "learning_rate": 4.281093383413154e-05, + "loss": 1.1899, + "step": 16555 + }, + { + "epoch": 4.95, + "grad_norm": 2.488006830215454, + "learning_rate": 4.28068104971481e-05, + "loss": 1.153, + "step": 16560 + }, + { + "epoch": 4.96, + "grad_norm": 2.225281000137329, + "learning_rate": 4.2802686176713026e-05, + "loss": 1.2394, + "step": 16565 + }, + { + "epoch": 4.96, + "grad_norm": 0.9629842042922974, + "learning_rate": 4.27985608730541e-05, + "loss": 1.1729, + "step": 16570 + }, + { + "epoch": 4.96, + "grad_norm": 2.1733222007751465, + "learning_rate": 4.279443458639916e-05, + "loss": 1.0447, + "step": 16575 + }, + { + "epoch": 4.96, + "grad_norm": 2.7707877159118652, + "learning_rate": 4.279030731697609e-05, + "loss": 1.3554, + "step": 16580 + }, + { + "epoch": 4.96, + "grad_norm": 1.399963617324829, + "learning_rate": 4.278617906501283e-05, + "loss": 1.3338, + "step": 16585 + }, + { + "epoch": 4.96, + "grad_norm": 1.5772522687911987, + "learning_rate": 4.2782049830737394e-05, + "loss": 1.4114, + "step": 16590 + }, + { + "epoch": 4.97, + "grad_norm": 2.678196907043457, + "learning_rate": 4.277791961437784e-05, + "loss": 1.2212, + "step": 16595 + }, + { + "epoch": 4.97, + "grad_norm": 0.8356932997703552, + "learning_rate": 4.277378841616225e-05, + "loss": 1.1251, + "step": 16600 + }, + { + "epoch": 4.97, + "grad_norm": 2.3631153106689453, + "learning_rate": 4.276965623631881e-05, + "loss": 1.0944, + "step": 16605 + }, + { + "epoch": 4.97, + "grad_norm": 1.5095417499542236, + "learning_rate": 4.276552307507572e-05, + "loss": 1.2006, + "step": 16610 + }, + { + "epoch": 4.97, + "grad_norm": 1.5276120901107788, + "learning_rate": 4.2761388932661264e-05, + "loss": 1.2489, + "step": 16615 + }, + { + "epoch": 4.97, + "grad_norm": 2.6862080097198486, + "learning_rate": 4.275725380930375e-05, + "loss": 1.2177, + "step": 16620 + }, + { + "epoch": 4.97, + "grad_norm": 3.0933616161346436, + "learning_rate": 4.275311770523157e-05, + "loss": 1.3473, + "step": 16625 + }, + { + "epoch": 4.98, + "grad_norm": 2.2336065769195557, + "learning_rate": 4.2748980620673155e-05, + "loss": 1.4323, + "step": 16630 + }, + { + "epoch": 4.98, + "grad_norm": 1.266717791557312, + "learning_rate": 4.274484255585699e-05, + "loss": 1.2058, + "step": 16635 + }, + { + "epoch": 4.98, + "grad_norm": 3.835813522338867, + "learning_rate": 4.274070351101161e-05, + "loss": 1.2881, + "step": 16640 + }, + { + "epoch": 4.98, + "grad_norm": 1.388725757598877, + "learning_rate": 4.273656348636562e-05, + "loss": 1.1769, + "step": 16645 + }, + { + "epoch": 4.98, + "grad_norm": 1.5018970966339111, + "learning_rate": 4.273242248214767e-05, + "loss": 1.2342, + "step": 16650 + }, + { + "epoch": 4.98, + "grad_norm": 1.3790775537490845, + "learning_rate": 4.272828049858645e-05, + "loss": 1.2862, + "step": 16655 + }, + { + "epoch": 4.98, + "grad_norm": 1.3459662199020386, + "learning_rate": 4.2724137535910727e-05, + "loss": 1.2824, + "step": 16660 + }, + { + "epoch": 4.99, + "grad_norm": 2.3956856727600098, + "learning_rate": 4.2719993594349316e-05, + "loss": 1.2141, + "step": 16665 + }, + { + "epoch": 4.99, + "grad_norm": 4.897758960723877, + "learning_rate": 4.271584867413107e-05, + "loss": 1.2121, + "step": 16670 + }, + { + "epoch": 4.99, + "grad_norm": 1.78498113155365, + "learning_rate": 4.271170277548492e-05, + "loss": 1.4079, + "step": 16675 + }, + { + "epoch": 4.99, + "grad_norm": 2.1399378776550293, + "learning_rate": 4.270755589863983e-05, + "loss": 1.205, + "step": 16680 + }, + { + "epoch": 4.99, + "grad_norm": 2.0815482139587402, + "learning_rate": 4.2703408043824845e-05, + "loss": 1.2547, + "step": 16685 + }, + { + "epoch": 4.99, + "grad_norm": 2.381340980529785, + "learning_rate": 4.2699259211269025e-05, + "loss": 1.3422, + "step": 16690 + }, + { + "epoch": 4.99, + "grad_norm": 2.151851177215576, + "learning_rate": 4.269510940120152e-05, + "loss": 1.2764, + "step": 16695 + }, + { + "epoch": 5.0, + "grad_norm": 1.3695425987243652, + "learning_rate": 4.269095861385152e-05, + "loss": 1.2128, + "step": 16700 + }, + { + "epoch": 5.0, + "grad_norm": 2.25834321975708, + "learning_rate": 4.268680684944826e-05, + "loss": 1.5246, + "step": 16705 + }, + { + "epoch": 5.0, + "grad_norm": 1.995069980621338, + "learning_rate": 4.268265410822104e-05, + "loss": 1.3022, + "step": 16710 + }, + { + "epoch": 5.0, + "grad_norm": 1.8739442825317383, + "learning_rate": 4.267850039039922e-05, + "loss": 1.1678, + "step": 16715 + }, + { + "epoch": 5.0, + "grad_norm": 1.2266640663146973, + "learning_rate": 4.26743456962122e-05, + "loss": 1.1569, + "step": 16720 + }, + { + "epoch": 5.0, + "grad_norm": 2.092092990875244, + "learning_rate": 4.2670190025889436e-05, + "loss": 1.3194, + "step": 16725 + }, + { + "epoch": 5.01, + "grad_norm": 2.1528186798095703, + "learning_rate": 4.266603337966044e-05, + "loss": 1.2965, + "step": 16730 + }, + { + "epoch": 5.01, + "grad_norm": 1.078891396522522, + "learning_rate": 4.266187575775479e-05, + "loss": 1.1757, + "step": 16735 + }, + { + "epoch": 5.01, + "grad_norm": 2.155930995941162, + "learning_rate": 4.265771716040209e-05, + "loss": 1.1875, + "step": 16740 + }, + { + "epoch": 5.01, + "grad_norm": 1.9117164611816406, + "learning_rate": 4.265355758783203e-05, + "loss": 1.1627, + "step": 16745 + }, + { + "epoch": 5.01, + "grad_norm": 2.450892210006714, + "learning_rate": 4.264939704027434e-05, + "loss": 1.3303, + "step": 16750 + }, + { + "epoch": 5.01, + "grad_norm": 1.585326910018921, + "learning_rate": 4.2645235517958796e-05, + "loss": 0.9849, + "step": 16755 + }, + { + "epoch": 5.01, + "grad_norm": 1.436105728149414, + "learning_rate": 4.264107302111523e-05, + "loss": 1.2115, + "step": 16760 + }, + { + "epoch": 5.02, + "grad_norm": 3.469156503677368, + "learning_rate": 4.2636909549973535e-05, + "loss": 1.2543, + "step": 16765 + }, + { + "epoch": 5.02, + "grad_norm": 1.467289924621582, + "learning_rate": 4.263274510476366e-05, + "loss": 1.2511, + "step": 16770 + }, + { + "epoch": 5.02, + "grad_norm": 1.1534559726715088, + "learning_rate": 4.262857968571561e-05, + "loss": 1.1398, + "step": 16775 + }, + { + "epoch": 5.02, + "grad_norm": 2.7166788578033447, + "learning_rate": 4.262441329305942e-05, + "loss": 1.213, + "step": 16780 + }, + { + "epoch": 5.02, + "grad_norm": 3.0506703853607178, + "learning_rate": 4.262024592702521e-05, + "loss": 1.2207, + "step": 16785 + }, + { + "epoch": 5.02, + "grad_norm": 2.2028605937957764, + "learning_rate": 4.2616077587843126e-05, + "loss": 1.2165, + "step": 16790 + }, + { + "epoch": 5.02, + "grad_norm": 1.4191491603851318, + "learning_rate": 4.26119082757434e-05, + "loss": 1.2871, + "step": 16795 + }, + { + "epoch": 5.03, + "grad_norm": 0.948824942111969, + "learning_rate": 4.2607737990956276e-05, + "loss": 1.1885, + "step": 16800 + }, + { + "epoch": 5.03, + "grad_norm": 1.8765445947647095, + "learning_rate": 4.260356673371208e-05, + "loss": 1.228, + "step": 16805 + }, + { + "epoch": 5.03, + "grad_norm": 1.0872840881347656, + "learning_rate": 4.2599394504241195e-05, + "loss": 1.2076, + "step": 16810 + }, + { + "epoch": 5.03, + "grad_norm": 1.3634966611862183, + "learning_rate": 4.259522130277405e-05, + "loss": 1.2841, + "step": 16815 + }, + { + "epoch": 5.03, + "grad_norm": 2.5034995079040527, + "learning_rate": 4.259104712954112e-05, + "loss": 1.2005, + "step": 16820 + }, + { + "epoch": 5.03, + "grad_norm": 1.3833112716674805, + "learning_rate": 4.2586871984772943e-05, + "loss": 1.0861, + "step": 16825 + }, + { + "epoch": 5.04, + "grad_norm": 2.4615163803100586, + "learning_rate": 4.25826958687001e-05, + "loss": 1.2638, + "step": 16830 + }, + { + "epoch": 5.04, + "grad_norm": 3.113060474395752, + "learning_rate": 4.257851878155324e-05, + "loss": 1.3006, + "step": 16835 + }, + { + "epoch": 5.04, + "grad_norm": 2.4361579418182373, + "learning_rate": 4.257434072356307e-05, + "loss": 1.2466, + "step": 16840 + }, + { + "epoch": 5.04, + "grad_norm": 2.1952314376831055, + "learning_rate": 4.257016169496032e-05, + "loss": 1.2439, + "step": 16845 + }, + { + "epoch": 5.04, + "grad_norm": 1.1961103677749634, + "learning_rate": 4.256598169597581e-05, + "loss": 1.306, + "step": 16850 + }, + { + "epoch": 5.04, + "grad_norm": 1.3207539319992065, + "learning_rate": 4.2561800726840385e-05, + "loss": 1.2848, + "step": 16855 + }, + { + "epoch": 5.04, + "grad_norm": 1.850471019744873, + "learning_rate": 4.255761878778496e-05, + "loss": 1.3269, + "step": 16860 + }, + { + "epoch": 5.05, + "grad_norm": 1.6024876832962036, + "learning_rate": 4.255343587904051e-05, + "loss": 1.2962, + "step": 16865 + }, + { + "epoch": 5.05, + "grad_norm": 2.1519381999969482, + "learning_rate": 4.2549252000838025e-05, + "loss": 1.3354, + "step": 16870 + }, + { + "epoch": 5.05, + "grad_norm": 2.02545166015625, + "learning_rate": 4.25450671534086e-05, + "loss": 1.2417, + "step": 16875 + }, + { + "epoch": 5.05, + "grad_norm": 1.6265103816986084, + "learning_rate": 4.2540881336983357e-05, + "loss": 1.2181, + "step": 16880 + }, + { + "epoch": 5.05, + "grad_norm": 2.541177988052368, + "learning_rate": 4.253669455179347e-05, + "loss": 1.2521, + "step": 16885 + }, + { + "epoch": 5.05, + "grad_norm": 1.0835845470428467, + "learning_rate": 4.253250679807017e-05, + "loss": 1.138, + "step": 16890 + }, + { + "epoch": 5.05, + "grad_norm": 1.4941011667251587, + "learning_rate": 4.252831807604475e-05, + "loss": 1.2487, + "step": 16895 + }, + { + "epoch": 5.06, + "grad_norm": 1.4814062118530273, + "learning_rate": 4.252412838594853e-05, + "loss": 1.217, + "step": 16900 + }, + { + "epoch": 5.06, + "grad_norm": 1.4170145988464355, + "learning_rate": 4.251993772801292e-05, + "loss": 1.2874, + "step": 16905 + }, + { + "epoch": 5.06, + "grad_norm": 1.7767305374145508, + "learning_rate": 4.251574610246935e-05, + "loss": 1.2459, + "step": 16910 + }, + { + "epoch": 5.06, + "grad_norm": 1.7504329681396484, + "learning_rate": 4.251155350954935e-05, + "loss": 1.1763, + "step": 16915 + }, + { + "epoch": 5.06, + "grad_norm": 2.1863224506378174, + "learning_rate": 4.250735994948444e-05, + "loss": 1.1726, + "step": 16920 + }, + { + "epoch": 5.06, + "grad_norm": 2.441636085510254, + "learning_rate": 4.250316542250624e-05, + "loss": 1.2343, + "step": 16925 + }, + { + "epoch": 5.07, + "grad_norm": 2.9591567516326904, + "learning_rate": 4.249896992884641e-05, + "loss": 1.2953, + "step": 16930 + }, + { + "epoch": 5.07, + "grad_norm": 1.866086721420288, + "learning_rate": 4.2494773468736654e-05, + "loss": 1.1143, + "step": 16935 + }, + { + "epoch": 5.07, + "grad_norm": 2.113767147064209, + "learning_rate": 4.249057604240875e-05, + "loss": 1.2088, + "step": 16940 + }, + { + "epoch": 5.07, + "grad_norm": 1.1835196018218994, + "learning_rate": 4.2486377650094505e-05, + "loss": 1.1204, + "step": 16945 + }, + { + "epoch": 5.07, + "grad_norm": 2.831669807434082, + "learning_rate": 4.24821782920258e-05, + "loss": 1.0658, + "step": 16950 + }, + { + "epoch": 5.07, + "grad_norm": 2.2646048069000244, + "learning_rate": 4.247797796843456e-05, + "loss": 1.3129, + "step": 16955 + }, + { + "epoch": 5.07, + "grad_norm": 1.5404542684555054, + "learning_rate": 4.247377667955277e-05, + "loss": 1.1941, + "step": 16960 + }, + { + "epoch": 5.08, + "grad_norm": 1.8242815732955933, + "learning_rate": 4.246957442561245e-05, + "loss": 1.194, + "step": 16965 + }, + { + "epoch": 5.08, + "grad_norm": 1.6594616174697876, + "learning_rate": 4.246537120684569e-05, + "loss": 1.217, + "step": 16970 + }, + { + "epoch": 5.08, + "grad_norm": 2.1032214164733887, + "learning_rate": 4.2462007937313256e-05, + "loss": 1.1512, + "step": 16975 + }, + { + "epoch": 5.08, + "grad_norm": 1.7993156909942627, + "learning_rate": 4.245780298244394e-05, + "loss": 1.3622, + "step": 16980 + }, + { + "epoch": 5.08, + "grad_norm": 2.3220622539520264, + "learning_rate": 4.245359706339832e-05, + "loss": 1.1386, + "step": 16985 + }, + { + "epoch": 5.08, + "grad_norm": 3.0997767448425293, + "learning_rate": 4.2449390180408666e-05, + "loss": 1.0817, + "step": 16990 + }, + { + "epoch": 5.08, + "grad_norm": 1.8498963117599487, + "learning_rate": 4.2445182333707334e-05, + "loss": 1.1428, + "step": 16995 + }, + { + "epoch": 5.09, + "grad_norm": 2.013101816177368, + "learning_rate": 4.244097352352672e-05, + "loss": 1.0904, + "step": 17000 + }, + { + "epoch": 5.09, + "grad_norm": 1.1816306114196777, + "learning_rate": 4.243676375009926e-05, + "loss": 1.1456, + "step": 17005 + }, + { + "epoch": 5.09, + "grad_norm": 2.447376012802124, + "learning_rate": 4.243255301365746e-05, + "loss": 1.2639, + "step": 17010 + }, + { + "epoch": 5.09, + "grad_norm": 7.374262809753418, + "learning_rate": 4.2428341314433884e-05, + "loss": 1.3451, + "step": 17015 + }, + { + "epoch": 5.09, + "grad_norm": 1.150455117225647, + "learning_rate": 4.2424128652661135e-05, + "loss": 1.4079, + "step": 17020 + }, + { + "epoch": 5.09, + "grad_norm": 3.0257742404937744, + "learning_rate": 4.241991502857187e-05, + "loss": 1.0875, + "step": 17025 + }, + { + "epoch": 5.1, + "grad_norm": 2.3345866203308105, + "learning_rate": 4.24157004423988e-05, + "loss": 1.0676, + "step": 17030 + }, + { + "epoch": 5.1, + "grad_norm": 2.740541696548462, + "learning_rate": 4.24114848943747e-05, + "loss": 1.2396, + "step": 17035 + }, + { + "epoch": 5.1, + "grad_norm": 1.1719865798950195, + "learning_rate": 4.240726838473239e-05, + "loss": 1.2915, + "step": 17040 + }, + { + "epoch": 5.1, + "grad_norm": 2.3873939514160156, + "learning_rate": 4.240305091370473e-05, + "loss": 1.0754, + "step": 17045 + }, + { + "epoch": 5.1, + "grad_norm": 2.359999179840088, + "learning_rate": 4.239883248152467e-05, + "loss": 1.0515, + "step": 17050 + }, + { + "epoch": 5.1, + "grad_norm": 2.249455451965332, + "learning_rate": 4.2394613088425176e-05, + "loss": 1.1345, + "step": 17055 + }, + { + "epoch": 5.1, + "grad_norm": 3.461733818054199, + "learning_rate": 4.239039273463927e-05, + "loss": 1.3695, + "step": 17060 + }, + { + "epoch": 5.11, + "grad_norm": 1.5373154878616333, + "learning_rate": 4.238617142040007e-05, + "loss": 1.3273, + "step": 17065 + }, + { + "epoch": 5.11, + "grad_norm": 6.110920429229736, + "learning_rate": 4.238194914594068e-05, + "loss": 1.3379, + "step": 17070 + }, + { + "epoch": 5.11, + "grad_norm": 1.5171079635620117, + "learning_rate": 4.237772591149431e-05, + "loss": 1.2593, + "step": 17075 + }, + { + "epoch": 5.11, + "grad_norm": 1.9108515977859497, + "learning_rate": 4.23735017172942e-05, + "loss": 1.4072, + "step": 17080 + }, + { + "epoch": 5.11, + "grad_norm": 3.4119062423706055, + "learning_rate": 4.2369276563573645e-05, + "loss": 1.3117, + "step": 17085 + }, + { + "epoch": 5.11, + "grad_norm": 1.446298599243164, + "learning_rate": 4.2365050450566e-05, + "loss": 1.2207, + "step": 17090 + }, + { + "epoch": 5.11, + "grad_norm": 3.0925724506378174, + "learning_rate": 4.2360823378504674e-05, + "loss": 1.1398, + "step": 17095 + }, + { + "epoch": 5.12, + "grad_norm": 3.060494899749756, + "learning_rate": 4.235659534762312e-05, + "loss": 1.2885, + "step": 17100 + }, + { + "epoch": 5.12, + "grad_norm": 2.6073570251464844, + "learning_rate": 4.235236635815484e-05, + "loss": 1.2522, + "step": 17105 + }, + { + "epoch": 5.12, + "grad_norm": 0.9218636751174927, + "learning_rate": 4.23481364103334e-05, + "loss": 1.253, + "step": 17110 + }, + { + "epoch": 5.12, + "grad_norm": 0.8496900200843811, + "learning_rate": 4.234390550439243e-05, + "loss": 1.3157, + "step": 17115 + }, + { + "epoch": 5.12, + "grad_norm": 2.110830783843994, + "learning_rate": 4.233967364056558e-05, + "loss": 1.2034, + "step": 17120 + }, + { + "epoch": 5.12, + "grad_norm": 1.333644986152649, + "learning_rate": 4.233544081908658e-05, + "loss": 1.0453, + "step": 17125 + }, + { + "epoch": 5.13, + "grad_norm": 3.8100926876068115, + "learning_rate": 4.233120704018921e-05, + "loss": 1.1498, + "step": 17130 + }, + { + "epoch": 5.13, + "grad_norm": 1.855100154876709, + "learning_rate": 4.2326972304107284e-05, + "loss": 1.1742, + "step": 17135 + }, + { + "epoch": 5.13, + "grad_norm": 1.8866065740585327, + "learning_rate": 4.232273661107468e-05, + "loss": 1.2759, + "step": 17140 + }, + { + "epoch": 5.13, + "grad_norm": 1.6143709421157837, + "learning_rate": 4.231849996132535e-05, + "loss": 1.1766, + "step": 17145 + }, + { + "epoch": 5.13, + "grad_norm": 1.1877189874649048, + "learning_rate": 4.231426235509326e-05, + "loss": 1.3377, + "step": 17150 + }, + { + "epoch": 5.13, + "grad_norm": 3.203878879547119, + "learning_rate": 4.2310023792612466e-05, + "loss": 1.2728, + "step": 17155 + }, + { + "epoch": 5.13, + "grad_norm": 2.035184860229492, + "learning_rate": 4.230578427411705e-05, + "loss": 1.2227, + "step": 17160 + }, + { + "epoch": 5.14, + "grad_norm": 2.1178765296936035, + "learning_rate": 4.230154379984115e-05, + "loss": 1.0745, + "step": 17165 + }, + { + "epoch": 5.14, + "grad_norm": 1.0364670753479004, + "learning_rate": 4.229730237001897e-05, + "loss": 1.1975, + "step": 17170 + }, + { + "epoch": 5.14, + "grad_norm": 1.6296310424804688, + "learning_rate": 4.2293059984884765e-05, + "loss": 1.301, + "step": 17175 + }, + { + "epoch": 5.14, + "grad_norm": 2.1096887588500977, + "learning_rate": 4.228881664467282e-05, + "loss": 1.2428, + "step": 17180 + }, + { + "epoch": 5.14, + "grad_norm": 3.0210041999816895, + "learning_rate": 4.228457234961752e-05, + "loss": 1.2328, + "step": 17185 + }, + { + "epoch": 5.14, + "grad_norm": 1.9511592388153076, + "learning_rate": 4.2280327099953245e-05, + "loss": 1.084, + "step": 17190 + }, + { + "epoch": 5.14, + "grad_norm": 2.9204232692718506, + "learning_rate": 4.227608089591447e-05, + "loss": 1.2676, + "step": 17195 + }, + { + "epoch": 5.15, + "grad_norm": 1.7769725322723389, + "learning_rate": 4.22718337377357e-05, + "loss": 1.2958, + "step": 17200 + }, + { + "epoch": 5.15, + "grad_norm": 1.5279228687286377, + "learning_rate": 4.2267585625651516e-05, + "loss": 1.2537, + "step": 17205 + }, + { + "epoch": 5.15, + "grad_norm": 2.0030903816223145, + "learning_rate": 4.2263336559896514e-05, + "loss": 1.1446, + "step": 17210 + }, + { + "epoch": 5.15, + "grad_norm": 1.8052384853363037, + "learning_rate": 4.2259086540705384e-05, + "loss": 1.3123, + "step": 17215 + }, + { + "epoch": 5.15, + "grad_norm": 1.5273395776748657, + "learning_rate": 4.225483556831284e-05, + "loss": 1.3197, + "step": 17220 + }, + { + "epoch": 5.15, + "grad_norm": 7.197212219238281, + "learning_rate": 4.225058364295367e-05, + "loss": 1.2388, + "step": 17225 + }, + { + "epoch": 5.16, + "grad_norm": 2.869080066680908, + "learning_rate": 4.22463307648627e-05, + "loss": 1.1988, + "step": 17230 + }, + { + "epoch": 5.16, + "grad_norm": 1.3211283683776855, + "learning_rate": 4.22420769342748e-05, + "loss": 1.3071, + "step": 17235 + }, + { + "epoch": 5.16, + "grad_norm": 0.8859264850616455, + "learning_rate": 4.223782215142491e-05, + "loss": 1.2127, + "step": 17240 + }, + { + "epoch": 5.16, + "grad_norm": 2.70227313041687, + "learning_rate": 4.2233566416548024e-05, + "loss": 1.2588, + "step": 17245 + }, + { + "epoch": 5.16, + "grad_norm": 1.2390819787979126, + "learning_rate": 4.2229309729879174e-05, + "loss": 1.3169, + "step": 17250 + }, + { + "epoch": 5.16, + "grad_norm": 1.9545568227767944, + "learning_rate": 4.222505209165346e-05, + "loss": 1.1279, + "step": 17255 + }, + { + "epoch": 5.16, + "grad_norm": 2.1074938774108887, + "learning_rate": 4.2220793502106035e-05, + "loss": 1.093, + "step": 17260 + }, + { + "epoch": 5.17, + "grad_norm": 2.4729721546173096, + "learning_rate": 4.2216533961472074e-05, + "loss": 1.2353, + "step": 17265 + }, + { + "epoch": 5.17, + "grad_norm": 2.2002952098846436, + "learning_rate": 4.221227346998684e-05, + "loss": 1.3197, + "step": 17270 + }, + { + "epoch": 5.17, + "grad_norm": 2.468754768371582, + "learning_rate": 4.220801202788563e-05, + "loss": 1.2085, + "step": 17275 + }, + { + "epoch": 5.17, + "grad_norm": 1.546784520149231, + "learning_rate": 4.22037496354038e-05, + "loss": 1.0504, + "step": 17280 + }, + { + "epoch": 5.17, + "grad_norm": 2.2986133098602295, + "learning_rate": 4.219948629277677e-05, + "loss": 1.0963, + "step": 17285 + }, + { + "epoch": 5.17, + "grad_norm": 1.0062131881713867, + "learning_rate": 4.219522200023999e-05, + "loss": 1.1797, + "step": 17290 + }, + { + "epoch": 5.17, + "grad_norm": 1.0225803852081299, + "learning_rate": 4.219095675802896e-05, + "loss": 1.1858, + "step": 17295 + }, + { + "epoch": 5.18, + "grad_norm": 1.347981572151184, + "learning_rate": 4.218669056637926e-05, + "loss": 1.2715, + "step": 17300 + }, + { + "epoch": 5.18, + "grad_norm": 2.4075846672058105, + "learning_rate": 4.218242342552651e-05, + "loss": 1.0566, + "step": 17305 + }, + { + "epoch": 5.18, + "grad_norm": 4.251238822937012, + "learning_rate": 4.217815533570637e-05, + "loss": 1.1106, + "step": 17310 + }, + { + "epoch": 5.18, + "grad_norm": 2.3025827407836914, + "learning_rate": 4.2173886297154575e-05, + "loss": 1.2947, + "step": 17315 + }, + { + "epoch": 5.18, + "grad_norm": 2.7054014205932617, + "learning_rate": 4.216961631010689e-05, + "loss": 1.1512, + "step": 17320 + }, + { + "epoch": 5.18, + "grad_norm": 2.6047892570495605, + "learning_rate": 4.2165345374799134e-05, + "loss": 1.1665, + "step": 17325 + }, + { + "epoch": 5.18, + "grad_norm": 2.1984410285949707, + "learning_rate": 4.2161073491467196e-05, + "loss": 1.2841, + "step": 17330 + }, + { + "epoch": 5.19, + "grad_norm": 1.6015806198120117, + "learning_rate": 4.215680066034701e-05, + "loss": 1.3405, + "step": 17335 + }, + { + "epoch": 5.19, + "grad_norm": 5.569428443908691, + "learning_rate": 4.215252688167456e-05, + "loss": 1.0902, + "step": 17340 + }, + { + "epoch": 5.19, + "grad_norm": 2.5024139881134033, + "learning_rate": 4.2148252155685875e-05, + "loss": 1.174, + "step": 17345 + }, + { + "epoch": 5.19, + "grad_norm": 1.9789233207702637, + "learning_rate": 4.2143976482617055e-05, + "loss": 1.2465, + "step": 17350 + }, + { + "epoch": 5.19, + "grad_norm": 2.1717369556427, + "learning_rate": 4.213969986270423e-05, + "loss": 1.1812, + "step": 17355 + }, + { + "epoch": 5.19, + "grad_norm": 3.2507376670837402, + "learning_rate": 4.21354222961836e-05, + "loss": 1.2907, + "step": 17360 + }, + { + "epoch": 5.2, + "grad_norm": 1.7060433626174927, + "learning_rate": 4.21311437832914e-05, + "loss": 1.2651, + "step": 17365 + }, + { + "epoch": 5.2, + "grad_norm": 5.012210369110107, + "learning_rate": 4.212686432426394e-05, + "loss": 1.1516, + "step": 17370 + }, + { + "epoch": 5.2, + "grad_norm": 5.409265995025635, + "learning_rate": 4.2122583919337566e-05, + "loss": 1.1225, + "step": 17375 + }, + { + "epoch": 5.2, + "grad_norm": 2.1262056827545166, + "learning_rate": 4.211830256874868e-05, + "loss": 1.2868, + "step": 17380 + }, + { + "epoch": 5.2, + "grad_norm": 1.5993378162384033, + "learning_rate": 4.211402027273373e-05, + "loss": 1.333, + "step": 17385 + }, + { + "epoch": 5.2, + "grad_norm": 1.085625171661377, + "learning_rate": 4.2109737031529245e-05, + "loss": 1.2228, + "step": 17390 + }, + { + "epoch": 5.2, + "grad_norm": 1.2686785459518433, + "learning_rate": 4.2105452845371754e-05, + "loss": 1.3888, + "step": 17395 + }, + { + "epoch": 5.21, + "grad_norm": 1.7684992551803589, + "learning_rate": 4.210116771449789e-05, + "loss": 1.1147, + "step": 17400 + }, + { + "epoch": 5.21, + "grad_norm": 4.321854114532471, + "learning_rate": 4.209688163914431e-05, + "loss": 1.2644, + "step": 17405 + }, + { + "epoch": 5.21, + "grad_norm": 1.0918437242507935, + "learning_rate": 4.209259461954772e-05, + "loss": 0.9742, + "step": 17410 + }, + { + "epoch": 5.21, + "grad_norm": 1.0674161911010742, + "learning_rate": 4.20883066559449e-05, + "loss": 1.3029, + "step": 17415 + }, + { + "epoch": 5.21, + "grad_norm": 10.988029479980469, + "learning_rate": 4.208401774857267e-05, + "loss": 1.4338, + "step": 17420 + }, + { + "epoch": 5.21, + "grad_norm": 1.3051105737686157, + "learning_rate": 4.2079727897667896e-05, + "loss": 1.1686, + "step": 17425 + }, + { + "epoch": 5.21, + "grad_norm": 1.0172728300094604, + "learning_rate": 4.2075437103467495e-05, + "loss": 1.1133, + "step": 17430 + }, + { + "epoch": 5.22, + "grad_norm": 1.7416491508483887, + "learning_rate": 4.207114536620846e-05, + "loss": 1.1215, + "step": 17435 + }, + { + "epoch": 5.22, + "grad_norm": 1.3212878704071045, + "learning_rate": 4.206685268612781e-05, + "loss": 1.1689, + "step": 17440 + }, + { + "epoch": 5.22, + "grad_norm": 3.1066105365753174, + "learning_rate": 4.206255906346262e-05, + "loss": 1.0547, + "step": 17445 + }, + { + "epoch": 5.22, + "grad_norm": 4.053175449371338, + "learning_rate": 4.205826449845005e-05, + "loss": 1.2893, + "step": 17450 + }, + { + "epoch": 5.22, + "grad_norm": 1.900181531906128, + "learning_rate": 4.205396899132724e-05, + "loss": 1.2453, + "step": 17455 + }, + { + "epoch": 5.22, + "grad_norm": 0.7705186009407043, + "learning_rate": 4.2049672542331454e-05, + "loss": 1.2095, + "step": 17460 + }, + { + "epoch": 5.23, + "grad_norm": 1.5405861139297485, + "learning_rate": 4.2045375151699976e-05, + "loss": 1.2622, + "step": 17465 + }, + { + "epoch": 5.23, + "grad_norm": 3.1085848808288574, + "learning_rate": 4.204107681967015e-05, + "loss": 1.1181, + "step": 17470 + }, + { + "epoch": 5.23, + "grad_norm": 1.0741022825241089, + "learning_rate": 4.203677754647936e-05, + "loss": 1.1253, + "step": 17475 + }, + { + "epoch": 5.23, + "grad_norm": 1.2874560356140137, + "learning_rate": 4.2032477332365054e-05, + "loss": 1.1252, + "step": 17480 + }, + { + "epoch": 5.23, + "grad_norm": 2.286153554916382, + "learning_rate": 4.2028176177564736e-05, + "loss": 1.1286, + "step": 17485 + }, + { + "epoch": 5.23, + "grad_norm": 1.706329107284546, + "learning_rate": 4.202387408231595e-05, + "loss": 1.1123, + "step": 17490 + }, + { + "epoch": 5.23, + "grad_norm": 2.1467063426971436, + "learning_rate": 4.2019571046856284e-05, + "loss": 1.2309, + "step": 17495 + }, + { + "epoch": 5.24, + "grad_norm": 2.3436472415924072, + "learning_rate": 4.2015267071423404e-05, + "loss": 1.1822, + "step": 17500 + }, + { + "epoch": 5.24, + "grad_norm": 1.5204914808273315, + "learning_rate": 4.2010962156255004e-05, + "loss": 1.1763, + "step": 17505 + }, + { + "epoch": 5.24, + "grad_norm": 1.3198552131652832, + "learning_rate": 4.200665630158885e-05, + "loss": 1.174, + "step": 17510 + }, + { + "epoch": 5.24, + "grad_norm": 1.4208104610443115, + "learning_rate": 4.200234950766275e-05, + "loss": 1.1142, + "step": 17515 + }, + { + "epoch": 5.24, + "grad_norm": 2.4655168056488037, + "learning_rate": 4.199804177471456e-05, + "loss": 0.9846, + "step": 17520 + }, + { + "epoch": 5.24, + "grad_norm": 1.046281099319458, + "learning_rate": 4.199373310298219e-05, + "loss": 1.2307, + "step": 17525 + }, + { + "epoch": 5.24, + "grad_norm": 1.329183578491211, + "learning_rate": 4.19894234927036e-05, + "loss": 1.1682, + "step": 17530 + }, + { + "epoch": 5.25, + "grad_norm": 1.4482097625732422, + "learning_rate": 4.198511294411681e-05, + "loss": 1.2648, + "step": 17535 + }, + { + "epoch": 5.25, + "grad_norm": 1.9892488718032837, + "learning_rate": 4.1980801457459895e-05, + "loss": 1.0417, + "step": 17540 + }, + { + "epoch": 5.25, + "grad_norm": 2.176584243774414, + "learning_rate": 4.197648903297096e-05, + "loss": 1.2264, + "step": 17545 + }, + { + "epoch": 5.25, + "grad_norm": 1.82011878490448, + "learning_rate": 4.197217567088818e-05, + "loss": 1.1264, + "step": 17550 + }, + { + "epoch": 5.25, + "grad_norm": 1.9624288082122803, + "learning_rate": 4.196786137144979e-05, + "loss": 1.2737, + "step": 17555 + }, + { + "epoch": 5.25, + "grad_norm": 1.3855583667755127, + "learning_rate": 4.196354613489404e-05, + "loss": 1.3331, + "step": 17560 + }, + { + "epoch": 5.26, + "grad_norm": 1.9667317867279053, + "learning_rate": 4.195922996145928e-05, + "loss": 1.1719, + "step": 17565 + }, + { + "epoch": 5.26, + "grad_norm": 1.6437666416168213, + "learning_rate": 4.1954912851383864e-05, + "loss": 1.1407, + "step": 17570 + }, + { + "epoch": 5.26, + "grad_norm": 1.1012728214263916, + "learning_rate": 4.1950594804906246e-05, + "loss": 1.3084, + "step": 17575 + }, + { + "epoch": 5.26, + "grad_norm": 1.9337821006774902, + "learning_rate": 4.1946275822264904e-05, + "loss": 1.1738, + "step": 17580 + }, + { + "epoch": 5.26, + "grad_norm": 3.0107462406158447, + "learning_rate": 4.194195590369835e-05, + "loss": 1.2292, + "step": 17585 + }, + { + "epoch": 5.26, + "grad_norm": 1.6357334852218628, + "learning_rate": 4.193763504944518e-05, + "loss": 1.2465, + "step": 17590 + }, + { + "epoch": 5.26, + "grad_norm": 3.3010284900665283, + "learning_rate": 4.193331325974403e-05, + "loss": 1.2547, + "step": 17595 + }, + { + "epoch": 5.27, + "grad_norm": 1.5490888357162476, + "learning_rate": 4.19289905348336e-05, + "loss": 1.2406, + "step": 17600 + }, + { + "epoch": 5.27, + "grad_norm": 1.6628612279891968, + "learning_rate": 4.192466687495262e-05, + "loss": 1.0981, + "step": 17605 + }, + { + "epoch": 5.27, + "grad_norm": 1.1924035549163818, + "learning_rate": 4.192034228033987e-05, + "loss": 1.1726, + "step": 17610 + }, + { + "epoch": 5.27, + "grad_norm": 2.363252639770508, + "learning_rate": 4.191601675123422e-05, + "loss": 1.2418, + "step": 17615 + }, + { + "epoch": 5.27, + "grad_norm": 2.983738899230957, + "learning_rate": 4.1911690287874535e-05, + "loss": 1.1598, + "step": 17620 + }, + { + "epoch": 5.27, + "grad_norm": 2.698246955871582, + "learning_rate": 4.190736289049977e-05, + "loss": 1.256, + "step": 17625 + }, + { + "epoch": 5.27, + "grad_norm": 2.4981305599212646, + "learning_rate": 4.190303455934894e-05, + "loss": 1.2023, + "step": 17630 + }, + { + "epoch": 5.28, + "grad_norm": 1.513262152671814, + "learning_rate": 4.189870529466107e-05, + "loss": 1.1823, + "step": 17635 + }, + { + "epoch": 5.28, + "grad_norm": 2.1456382274627686, + "learning_rate": 4.1894375096675274e-05, + "loss": 1.007, + "step": 17640 + }, + { + "epoch": 5.28, + "grad_norm": 2.4811346530914307, + "learning_rate": 4.189004396563071e-05, + "loss": 1.2597, + "step": 17645 + }, + { + "epoch": 5.28, + "grad_norm": 2.1642322540283203, + "learning_rate": 4.1885711901766564e-05, + "loss": 1.2648, + "step": 17650 + }, + { + "epoch": 5.28, + "grad_norm": 2.7846784591674805, + "learning_rate": 4.188137890532211e-05, + "loss": 1.0887, + "step": 17655 + }, + { + "epoch": 5.28, + "grad_norm": 2.358762502670288, + "learning_rate": 4.187704497653665e-05, + "loss": 1.2641, + "step": 17660 + }, + { + "epoch": 5.29, + "grad_norm": 1.4334055185317993, + "learning_rate": 4.1872710115649525e-05, + "loss": 1.2008, + "step": 17665 + }, + { + "epoch": 5.29, + "grad_norm": 1.1440168619155884, + "learning_rate": 4.1868374322900163e-05, + "loss": 1.2595, + "step": 17670 + }, + { + "epoch": 5.29, + "grad_norm": 1.5361130237579346, + "learning_rate": 4.186403759852802e-05, + "loss": 1.2388, + "step": 17675 + }, + { + "epoch": 5.29, + "grad_norm": 2.4247677326202393, + "learning_rate": 4.185969994277262e-05, + "loss": 1.0073, + "step": 17680 + }, + { + "epoch": 5.29, + "grad_norm": 1.3010989427566528, + "learning_rate": 4.1855361355873506e-05, + "loss": 1.3015, + "step": 17685 + }, + { + "epoch": 5.29, + "grad_norm": 2.8107826709747314, + "learning_rate": 4.185102183807031e-05, + "loss": 1.167, + "step": 17690 + }, + { + "epoch": 5.29, + "grad_norm": 1.6690555810928345, + "learning_rate": 4.1846681389602686e-05, + "loss": 1.2343, + "step": 17695 + }, + { + "epoch": 5.3, + "grad_norm": 1.7709966897964478, + "learning_rate": 4.1842340010710366e-05, + "loss": 1.3241, + "step": 17700 + }, + { + "epoch": 5.3, + "grad_norm": 4.506913661956787, + "learning_rate": 4.1837997701633115e-05, + "loss": 1.3076, + "step": 17705 + }, + { + "epoch": 5.3, + "grad_norm": 2.404618501663208, + "learning_rate": 4.183365446261075e-05, + "loss": 1.1784, + "step": 17710 + }, + { + "epoch": 5.3, + "grad_norm": 1.8350248336791992, + "learning_rate": 4.182931029388315e-05, + "loss": 1.2567, + "step": 17715 + }, + { + "epoch": 5.3, + "grad_norm": 1.5399670600891113, + "learning_rate": 4.182496519569023e-05, + "loss": 1.1838, + "step": 17720 + }, + { + "epoch": 5.3, + "grad_norm": 0.7773048281669617, + "learning_rate": 4.1820619168271975e-05, + "loss": 1.2516, + "step": 17725 + }, + { + "epoch": 5.3, + "grad_norm": 2.276472330093384, + "learning_rate": 4.181627221186841e-05, + "loss": 1.37, + "step": 17730 + }, + { + "epoch": 5.31, + "grad_norm": 2.218336582183838, + "learning_rate": 4.18119243267196e-05, + "loss": 1.3398, + "step": 17735 + }, + { + "epoch": 5.31, + "grad_norm": 1.3375170230865479, + "learning_rate": 4.180757551306569e-05, + "loss": 1.3, + "step": 17740 + }, + { + "epoch": 5.31, + "grad_norm": 1.2494932413101196, + "learning_rate": 4.180322577114686e-05, + "loss": 1.2727, + "step": 17745 + }, + { + "epoch": 5.31, + "grad_norm": 1.8490458726882935, + "learning_rate": 4.1798875101203336e-05, + "loss": 1.3535, + "step": 17750 + }, + { + "epoch": 5.31, + "grad_norm": 3.2048110961914062, + "learning_rate": 4.179452350347539e-05, + "loss": 1.1024, + "step": 17755 + }, + { + "epoch": 5.31, + "grad_norm": 2.3998045921325684, + "learning_rate": 4.179017097820338e-05, + "loss": 1.1105, + "step": 17760 + }, + { + "epoch": 5.32, + "grad_norm": 3.1060426235198975, + "learning_rate": 4.178581752562767e-05, + "loss": 1.223, + "step": 17765 + }, + { + "epoch": 5.32, + "grad_norm": 2.371044874191284, + "learning_rate": 4.178146314598872e-05, + "loss": 1.2038, + "step": 17770 + }, + { + "epoch": 5.32, + "grad_norm": 1.7323418855667114, + "learning_rate": 4.177710783952699e-05, + "loss": 1.2121, + "step": 17775 + }, + { + "epoch": 5.32, + "grad_norm": 2.284435272216797, + "learning_rate": 4.177275160648304e-05, + "loss": 1.3172, + "step": 17780 + }, + { + "epoch": 5.32, + "grad_norm": 3.9668426513671875, + "learning_rate": 4.1768394447097446e-05, + "loss": 1.2761, + "step": 17785 + }, + { + "epoch": 5.32, + "grad_norm": 0.9461215734481812, + "learning_rate": 4.176403636161086e-05, + "loss": 1.2843, + "step": 17790 + }, + { + "epoch": 5.32, + "grad_norm": 2.7739803791046143, + "learning_rate": 4.1759677350263976e-05, + "loss": 1.4001, + "step": 17795 + }, + { + "epoch": 5.33, + "grad_norm": 1.284010648727417, + "learning_rate": 4.1755317413297526e-05, + "loss": 1.2278, + "step": 17800 + }, + { + "epoch": 5.33, + "grad_norm": 2.6691951751708984, + "learning_rate": 4.175095655095232e-05, + "loss": 1.1456, + "step": 17805 + }, + { + "epoch": 5.33, + "grad_norm": 2.771559476852417, + "learning_rate": 4.174659476346919e-05, + "loss": 1.1955, + "step": 17810 + }, + { + "epoch": 5.33, + "grad_norm": 1.4969909191131592, + "learning_rate": 4.174223205108904e-05, + "loss": 1.2585, + "step": 17815 + }, + { + "epoch": 5.33, + "grad_norm": 1.8227806091308594, + "learning_rate": 4.1737868414052817e-05, + "loss": 1.3033, + "step": 17820 + }, + { + "epoch": 5.33, + "grad_norm": 1.4669899940490723, + "learning_rate": 4.1733503852601516e-05, + "loss": 1.2654, + "step": 17825 + }, + { + "epoch": 5.33, + "grad_norm": 3.621481418609619, + "learning_rate": 4.172913836697619e-05, + "loss": 1.2247, + "step": 17830 + }, + { + "epoch": 5.34, + "grad_norm": 1.8669898509979248, + "learning_rate": 4.172477195741794e-05, + "loss": 1.1747, + "step": 17835 + }, + { + "epoch": 5.34, + "grad_norm": 1.145015001296997, + "learning_rate": 4.1720404624167925e-05, + "loss": 1.3372, + "step": 17840 + }, + { + "epoch": 5.34, + "grad_norm": 1.5588964223861694, + "learning_rate": 4.171603636746734e-05, + "loss": 1.1368, + "step": 17845 + }, + { + "epoch": 5.34, + "grad_norm": 3.020059823989868, + "learning_rate": 4.171166718755744e-05, + "loss": 1.1614, + "step": 17850 + }, + { + "epoch": 5.34, + "grad_norm": 2.658262252807617, + "learning_rate": 4.170729708467953e-05, + "loss": 1.1811, + "step": 17855 + }, + { + "epoch": 5.34, + "grad_norm": 1.9605822563171387, + "learning_rate": 4.170292605907498e-05, + "loss": 1.1067, + "step": 17860 + }, + { + "epoch": 5.35, + "grad_norm": 2.259817600250244, + "learning_rate": 4.169855411098517e-05, + "loss": 1.3053, + "step": 17865 + }, + { + "epoch": 5.35, + "grad_norm": 1.7595691680908203, + "learning_rate": 4.169418124065159e-05, + "loss": 1.2888, + "step": 17870 + }, + { + "epoch": 5.35, + "grad_norm": 1.319011926651001, + "learning_rate": 4.168980744831572e-05, + "loss": 1.0001, + "step": 17875 + }, + { + "epoch": 5.35, + "grad_norm": 1.3110008239746094, + "learning_rate": 4.168543273421913e-05, + "loss": 1.3487, + "step": 17880 + }, + { + "epoch": 5.35, + "grad_norm": 2.2493362426757812, + "learning_rate": 4.168105709860344e-05, + "loss": 1.2849, + "step": 17885 + }, + { + "epoch": 5.35, + "grad_norm": 1.7619404792785645, + "learning_rate": 4.167668054171031e-05, + "loss": 1.237, + "step": 17890 + }, + { + "epoch": 5.35, + "grad_norm": 1.684557318687439, + "learning_rate": 4.167230306378144e-05, + "loss": 1.2893, + "step": 17895 + }, + { + "epoch": 5.36, + "grad_norm": 3.6283462047576904, + "learning_rate": 4.1667924665058605e-05, + "loss": 1.3225, + "step": 17900 + }, + { + "epoch": 5.36, + "grad_norm": 1.8916960954666138, + "learning_rate": 4.166354534578362e-05, + "loss": 1.2069, + "step": 17905 + }, + { + "epoch": 5.36, + "grad_norm": 1.7808947563171387, + "learning_rate": 4.165916510619834e-05, + "loss": 1.2129, + "step": 17910 + }, + { + "epoch": 5.36, + "grad_norm": 2.4525959491729736, + "learning_rate": 4.1654783946544695e-05, + "loss": 1.2451, + "step": 17915 + }, + { + "epoch": 5.36, + "grad_norm": 1.7354241609573364, + "learning_rate": 4.165040186706464e-05, + "loss": 1.2769, + "step": 17920 + }, + { + "epoch": 5.36, + "grad_norm": 2.6670055389404297, + "learning_rate": 4.1646018868000194e-05, + "loss": 1.3005, + "step": 17925 + }, + { + "epoch": 5.36, + "grad_norm": 2.470824956893921, + "learning_rate": 4.164163494959342e-05, + "loss": 1.1619, + "step": 17930 + }, + { + "epoch": 5.37, + "grad_norm": 2.5797417163848877, + "learning_rate": 4.1637250112086466e-05, + "loss": 1.3173, + "step": 17935 + }, + { + "epoch": 5.37, + "grad_norm": 2.280491828918457, + "learning_rate": 4.163286435572147e-05, + "loss": 1.323, + "step": 17940 + }, + { + "epoch": 5.37, + "grad_norm": 1.8656688928604126, + "learning_rate": 4.162847768074067e-05, + "loss": 1.3305, + "step": 17945 + }, + { + "epoch": 5.37, + "grad_norm": 1.572505235671997, + "learning_rate": 4.162409008738632e-05, + "loss": 1.1925, + "step": 17950 + }, + { + "epoch": 5.37, + "grad_norm": 1.0871065855026245, + "learning_rate": 4.161970157590077e-05, + "loss": 1.2474, + "step": 17955 + }, + { + "epoch": 5.37, + "grad_norm": 1.81941819190979, + "learning_rate": 4.161531214652637e-05, + "loss": 1.1525, + "step": 17960 + }, + { + "epoch": 5.37, + "grad_norm": 1.6600680351257324, + "learning_rate": 4.161092179950555e-05, + "loss": 1.1595, + "step": 17965 + }, + { + "epoch": 5.38, + "grad_norm": 1.9169402122497559, + "learning_rate": 4.160653053508079e-05, + "loss": 1.2836, + "step": 17970 + }, + { + "epoch": 5.38, + "grad_norm": 2.0961391925811768, + "learning_rate": 4.16021383534946e-05, + "loss": 1.1986, + "step": 17975 + }, + { + "epoch": 5.38, + "grad_norm": 1.4828705787658691, + "learning_rate": 4.159774525498957e-05, + "loss": 1.215, + "step": 17980 + }, + { + "epoch": 5.38, + "grad_norm": 1.643730640411377, + "learning_rate": 4.159335123980833e-05, + "loss": 1.1822, + "step": 17985 + }, + { + "epoch": 5.38, + "grad_norm": 2.4399468898773193, + "learning_rate": 4.158895630819354e-05, + "loss": 1.3621, + "step": 17990 + }, + { + "epoch": 5.38, + "grad_norm": 1.5563558340072632, + "learning_rate": 4.158456046038794e-05, + "loss": 1.1871, + "step": 17995 + }, + { + "epoch": 5.39, + "grad_norm": 2.194761037826538, + "learning_rate": 4.15801636966343e-05, + "loss": 1.269, + "step": 18000 + }, + { + "epoch": 5.39, + "grad_norm": 3.4397332668304443, + "learning_rate": 4.157576601717546e-05, + "loss": 1.2749, + "step": 18005 + }, + { + "epoch": 5.39, + "grad_norm": 1.9305585622787476, + "learning_rate": 4.1571367422254296e-05, + "loss": 1.2515, + "step": 18010 + }, + { + "epoch": 5.39, + "grad_norm": 2.5230295658111572, + "learning_rate": 4.156696791211372e-05, + "loss": 1.3452, + "step": 18015 + }, + { + "epoch": 5.39, + "grad_norm": 4.581267833709717, + "learning_rate": 4.156256748699673e-05, + "loss": 1.0429, + "step": 18020 + }, + { + "epoch": 5.39, + "grad_norm": 2.436643123626709, + "learning_rate": 4.155816614714636e-05, + "loss": 1.2162, + "step": 18025 + }, + { + "epoch": 5.39, + "grad_norm": 3.484376907348633, + "learning_rate": 4.155376389280569e-05, + "loss": 1.1535, + "step": 18030 + }, + { + "epoch": 5.4, + "grad_norm": 0.869539201259613, + "learning_rate": 4.1549360724217835e-05, + "loss": 1.238, + "step": 18035 + }, + { + "epoch": 5.4, + "grad_norm": 2.1661620140075684, + "learning_rate": 4.1544956641625996e-05, + "loss": 1.3041, + "step": 18040 + }, + { + "epoch": 5.4, + "grad_norm": 2.650646209716797, + "learning_rate": 4.15405516452734e-05, + "loss": 1.1902, + "step": 18045 + }, + { + "epoch": 5.4, + "grad_norm": 3.9446349143981934, + "learning_rate": 4.153614573540332e-05, + "loss": 1.337, + "step": 18050 + }, + { + "epoch": 5.4, + "grad_norm": 1.5067466497421265, + "learning_rate": 4.153173891225911e-05, + "loss": 1.2288, + "step": 18055 + }, + { + "epoch": 5.4, + "grad_norm": 3.6524410247802734, + "learning_rate": 4.152733117608413e-05, + "loss": 1.2194, + "step": 18060 + }, + { + "epoch": 5.4, + "grad_norm": 1.016907811164856, + "learning_rate": 4.1522922527121846e-05, + "loss": 1.1394, + "step": 18065 + }, + { + "epoch": 5.41, + "grad_norm": 2.3441882133483887, + "learning_rate": 4.151851296561572e-05, + "loss": 1.1889, + "step": 18070 + }, + { + "epoch": 5.41, + "grad_norm": 2.799960136413574, + "learning_rate": 4.1514102491809286e-05, + "loss": 1.1934, + "step": 18075 + }, + { + "epoch": 5.41, + "grad_norm": 3.4592952728271484, + "learning_rate": 4.1509691105946145e-05, + "loss": 1.2256, + "step": 18080 + }, + { + "epoch": 5.41, + "grad_norm": 1.311344027519226, + "learning_rate": 4.150527880826992e-05, + "loss": 1.1273, + "step": 18085 + }, + { + "epoch": 5.41, + "grad_norm": 2.329275131225586, + "learning_rate": 4.15008655990243e-05, + "loss": 1.3712, + "step": 18090 + }, + { + "epoch": 5.41, + "grad_norm": 4.58236026763916, + "learning_rate": 4.149645147845303e-05, + "loss": 1.141, + "step": 18095 + }, + { + "epoch": 5.42, + "grad_norm": 1.4890109300613403, + "learning_rate": 4.149203644679989e-05, + "loss": 1.1462, + "step": 18100 + }, + { + "epoch": 5.42, + "grad_norm": 3.4665701389312744, + "learning_rate": 4.148762050430872e-05, + "loss": 1.2364, + "step": 18105 + }, + { + "epoch": 5.42, + "grad_norm": 2.3981378078460693, + "learning_rate": 4.148320365122341e-05, + "loss": 1.2632, + "step": 18110 + }, + { + "epoch": 5.42, + "grad_norm": 2.2879130840301514, + "learning_rate": 4.147878588778789e-05, + "loss": 1.1986, + "step": 18115 + }, + { + "epoch": 5.42, + "grad_norm": 1.7750098705291748, + "learning_rate": 4.1474367214246156e-05, + "loss": 1.168, + "step": 18120 + }, + { + "epoch": 5.42, + "grad_norm": 1.1014642715454102, + "learning_rate": 4.146994763084225e-05, + "loss": 1.2932, + "step": 18125 + }, + { + "epoch": 5.42, + "grad_norm": 2.6048409938812256, + "learning_rate": 4.1465527137820255e-05, + "loss": 1.2311, + "step": 18130 + }, + { + "epoch": 5.43, + "grad_norm": 1.7655155658721924, + "learning_rate": 4.146110573542431e-05, + "loss": 1.1891, + "step": 18135 + }, + { + "epoch": 5.43, + "grad_norm": 2.196241617202759, + "learning_rate": 4.14566834238986e-05, + "loss": 1.1351, + "step": 18140 + }, + { + "epoch": 5.43, + "grad_norm": 1.649719476699829, + "learning_rate": 4.145226020348737e-05, + "loss": 1.2341, + "step": 18145 + }, + { + "epoch": 5.43, + "grad_norm": 1.7216572761535645, + "learning_rate": 4.1447836074434916e-05, + "loss": 1.2227, + "step": 18150 + }, + { + "epoch": 5.43, + "grad_norm": 2.0533690452575684, + "learning_rate": 4.144341103698557e-05, + "loss": 1.2416, + "step": 18155 + }, + { + "epoch": 5.43, + "grad_norm": 3.26836895942688, + "learning_rate": 4.143898509138373e-05, + "loss": 1.1773, + "step": 18160 + }, + { + "epoch": 5.43, + "grad_norm": 1.7304611206054688, + "learning_rate": 4.1434558237873824e-05, + "loss": 1.3271, + "step": 18165 + }, + { + "epoch": 5.44, + "grad_norm": 1.9267404079437256, + "learning_rate": 4.143013047670035e-05, + "loss": 1.2153, + "step": 18170 + }, + { + "epoch": 5.44, + "grad_norm": 1.807952642440796, + "learning_rate": 4.1425701808107855e-05, + "loss": 1.2352, + "step": 18175 + }, + { + "epoch": 5.44, + "grad_norm": 1.5293511152267456, + "learning_rate": 4.142127223234091e-05, + "loss": 1.2766, + "step": 18180 + }, + { + "epoch": 5.44, + "grad_norm": 2.029411554336548, + "learning_rate": 4.1416841749644174e-05, + "loss": 1.0328, + "step": 18185 + }, + { + "epoch": 5.44, + "grad_norm": 1.7925702333450317, + "learning_rate": 4.1412410360262334e-05, + "loss": 1.366, + "step": 18190 + }, + { + "epoch": 5.44, + "grad_norm": 0.8310097455978394, + "learning_rate": 4.140797806444013e-05, + "loss": 1.212, + "step": 18195 + }, + { + "epoch": 5.45, + "grad_norm": 3.394538402557373, + "learning_rate": 4.140354486242235e-05, + "loss": 1.3675, + "step": 18200 + }, + { + "epoch": 5.45, + "grad_norm": 4.078660488128662, + "learning_rate": 4.139911075445384e-05, + "loss": 1.1619, + "step": 18205 + }, + { + "epoch": 5.45, + "grad_norm": 1.416306495666504, + "learning_rate": 4.1394675740779485e-05, + "loss": 1.0888, + "step": 18210 + }, + { + "epoch": 5.45, + "grad_norm": 3.147977828979492, + "learning_rate": 4.139023982164424e-05, + "loss": 1.1083, + "step": 18215 + }, + { + "epoch": 5.45, + "grad_norm": 3.413358688354492, + "learning_rate": 4.138580299729308e-05, + "loss": 1.1791, + "step": 18220 + }, + { + "epoch": 5.45, + "grad_norm": 1.7212944030761719, + "learning_rate": 4.138136526797105e-05, + "loss": 1.2017, + "step": 18225 + }, + { + "epoch": 5.45, + "grad_norm": 2.027756690979004, + "learning_rate": 4.137692663392325e-05, + "loss": 1.1783, + "step": 18230 + }, + { + "epoch": 5.46, + "grad_norm": 0.9645988345146179, + "learning_rate": 4.137248709539481e-05, + "loss": 1.1465, + "step": 18235 + }, + { + "epoch": 5.46, + "grad_norm": 2.282986879348755, + "learning_rate": 4.1368046652630924e-05, + "loss": 1.2574, + "step": 18240 + }, + { + "epoch": 5.46, + "grad_norm": 2.9821066856384277, + "learning_rate": 4.136360530587684e-05, + "loss": 1.2142, + "step": 18245 + }, + { + "epoch": 5.46, + "grad_norm": 2.920485258102417, + "learning_rate": 4.135916305537784e-05, + "loss": 1.1583, + "step": 18250 + }, + { + "epoch": 5.46, + "grad_norm": 2.7918076515197754, + "learning_rate": 4.135471990137927e-05, + "loss": 1.1705, + "step": 18255 + }, + { + "epoch": 5.46, + "grad_norm": 1.458203911781311, + "learning_rate": 4.135027584412653e-05, + "loss": 1.2112, + "step": 18260 + }, + { + "epoch": 5.46, + "grad_norm": 3.7946949005126953, + "learning_rate": 4.134583088386504e-05, + "loss": 1.2094, + "step": 18265 + }, + { + "epoch": 5.47, + "grad_norm": 2.0881757736206055, + "learning_rate": 4.134138502084029e-05, + "loss": 1.3534, + "step": 18270 + }, + { + "epoch": 5.47, + "grad_norm": 3.244400978088379, + "learning_rate": 4.133693825529785e-05, + "loss": 1.3165, + "step": 18275 + }, + { + "epoch": 5.47, + "grad_norm": 1.4288554191589355, + "learning_rate": 4.1332490587483286e-05, + "loss": 1.2475, + "step": 18280 + }, + { + "epoch": 5.47, + "grad_norm": 3.9546115398406982, + "learning_rate": 4.132804201764224e-05, + "loss": 1.2295, + "step": 18285 + }, + { + "epoch": 5.47, + "grad_norm": 2.304332971572876, + "learning_rate": 4.132448251247545e-05, + "loss": 1.3196, + "step": 18290 + }, + { + "epoch": 5.47, + "grad_norm": 2.4342777729034424, + "learning_rate": 4.132003231960591e-05, + "loss": 1.1165, + "step": 18295 + }, + { + "epoch": 5.48, + "grad_norm": 2.890040397644043, + "learning_rate": 4.131558122539796e-05, + "loss": 1.1342, + "step": 18300 + }, + { + "epoch": 5.48, + "grad_norm": 1.576117992401123, + "learning_rate": 4.131112923009741e-05, + "loss": 1.2944, + "step": 18305 + }, + { + "epoch": 5.48, + "grad_norm": 2.7480485439300537, + "learning_rate": 4.130667633395015e-05, + "loss": 1.3669, + "step": 18310 + }, + { + "epoch": 5.48, + "grad_norm": 2.0813324451446533, + "learning_rate": 4.1302222537202104e-05, + "loss": 1.1589, + "step": 18315 + }, + { + "epoch": 5.48, + "grad_norm": 5.030991077423096, + "learning_rate": 4.129776784009926e-05, + "loss": 1.3765, + "step": 18320 + }, + { + "epoch": 5.48, + "grad_norm": 4.91030740737915, + "learning_rate": 4.129331224288763e-05, + "loss": 1.0896, + "step": 18325 + }, + { + "epoch": 5.48, + "grad_norm": 6.223639965057373, + "learning_rate": 4.1288855745813303e-05, + "loss": 1.2247, + "step": 18330 + }, + { + "epoch": 5.49, + "grad_norm": 3.3995022773742676, + "learning_rate": 4.128439834912241e-05, + "loss": 1.4152, + "step": 18335 + }, + { + "epoch": 5.49, + "grad_norm": 1.390557885169983, + "learning_rate": 4.127994005306112e-05, + "loss": 1.321, + "step": 18340 + }, + { + "epoch": 5.49, + "grad_norm": 1.9957035779953003, + "learning_rate": 4.127548085787566e-05, + "loss": 1.2822, + "step": 18345 + }, + { + "epoch": 5.49, + "grad_norm": 3.40853214263916, + "learning_rate": 4.127102076381231e-05, + "loss": 1.159, + "step": 18350 + }, + { + "epoch": 5.49, + "grad_norm": 1.8385875225067139, + "learning_rate": 4.12665597711174e-05, + "loss": 1.2222, + "step": 18355 + }, + { + "epoch": 5.49, + "grad_norm": 2.742072582244873, + "learning_rate": 4.12620978800373e-05, + "loss": 1.1475, + "step": 18360 + }, + { + "epoch": 5.49, + "grad_norm": 2.955695629119873, + "learning_rate": 4.125763509081844e-05, + "loss": 1.2988, + "step": 18365 + }, + { + "epoch": 5.5, + "grad_norm": 2.316948652267456, + "learning_rate": 4.125317140370729e-05, + "loss": 1.1109, + "step": 18370 + }, + { + "epoch": 5.5, + "grad_norm": 1.7137174606323242, + "learning_rate": 4.1248706818950376e-05, + "loss": 1.2146, + "step": 18375 + }, + { + "epoch": 5.5, + "grad_norm": 3.3981897830963135, + "learning_rate": 4.124424133679428e-05, + "loss": 1.291, + "step": 18380 + }, + { + "epoch": 5.5, + "grad_norm": 1.657471776008606, + "learning_rate": 4.123977495748561e-05, + "loss": 1.1274, + "step": 18385 + }, + { + "epoch": 5.5, + "grad_norm": 1.293222427368164, + "learning_rate": 4.123530768127105e-05, + "loss": 1.0961, + "step": 18390 + }, + { + "epoch": 5.5, + "grad_norm": 2.9801487922668457, + "learning_rate": 4.123083950839733e-05, + "loss": 1.1632, + "step": 18395 + }, + { + "epoch": 5.51, + "grad_norm": 2.470231294631958, + "learning_rate": 4.122637043911122e-05, + "loss": 1.0898, + "step": 18400 + }, + { + "epoch": 5.51, + "grad_norm": 1.0047528743743896, + "learning_rate": 4.122190047365952e-05, + "loss": 1.174, + "step": 18405 + }, + { + "epoch": 5.51, + "grad_norm": 5.736430644989014, + "learning_rate": 4.121742961228913e-05, + "loss": 1.1775, + "step": 18410 + }, + { + "epoch": 5.51, + "grad_norm": 1.3789207935333252, + "learning_rate": 4.121295785524696e-05, + "loss": 1.2466, + "step": 18415 + }, + { + "epoch": 5.51, + "grad_norm": 1.9205785989761353, + "learning_rate": 4.120848520277998e-05, + "loss": 1.0881, + "step": 18420 + }, + { + "epoch": 5.51, + "grad_norm": 2.249664783477783, + "learning_rate": 4.12040116551352e-05, + "loss": 1.3297, + "step": 18425 + }, + { + "epoch": 5.51, + "grad_norm": 1.7185654640197754, + "learning_rate": 4.1199537212559705e-05, + "loss": 1.1238, + "step": 18430 + }, + { + "epoch": 5.52, + "grad_norm": 3.920592784881592, + "learning_rate": 4.119506187530061e-05, + "loss": 1.1606, + "step": 18435 + }, + { + "epoch": 5.52, + "grad_norm": 1.8103728294372559, + "learning_rate": 4.119058564360509e-05, + "loss": 1.3541, + "step": 18440 + }, + { + "epoch": 5.52, + "grad_norm": 2.7962160110473633, + "learning_rate": 4.1186108517720344e-05, + "loss": 1.1774, + "step": 18445 + }, + { + "epoch": 5.52, + "grad_norm": 4.4177350997924805, + "learning_rate": 4.1181630497893645e-05, + "loss": 1.1272, + "step": 18450 + }, + { + "epoch": 5.52, + "grad_norm": 1.938101887702942, + "learning_rate": 4.117715158437232e-05, + "loss": 1.1472, + "step": 18455 + }, + { + "epoch": 5.52, + "grad_norm": 1.0241434574127197, + "learning_rate": 4.117267177740373e-05, + "loss": 1.2322, + "step": 18460 + }, + { + "epoch": 5.52, + "grad_norm": 1.9051687717437744, + "learning_rate": 4.116819107723529e-05, + "loss": 1.1203, + "step": 18465 + }, + { + "epoch": 5.53, + "grad_norm": 0.9832600951194763, + "learning_rate": 4.1163709484114456e-05, + "loss": 1.1336, + "step": 18470 + }, + { + "epoch": 5.53, + "grad_norm": 1.4782296419143677, + "learning_rate": 4.1159226998288754e-05, + "loss": 1.3825, + "step": 18475 + }, + { + "epoch": 5.53, + "grad_norm": 2.1868507862091064, + "learning_rate": 4.1154743620005734e-05, + "loss": 1.3936, + "step": 18480 + }, + { + "epoch": 5.53, + "grad_norm": 1.0652060508728027, + "learning_rate": 4.115025934951302e-05, + "loss": 1.0603, + "step": 18485 + }, + { + "epoch": 5.53, + "grad_norm": 1.29878568649292, + "learning_rate": 4.1145774187058265e-05, + "loss": 1.1407, + "step": 18490 + }, + { + "epoch": 5.53, + "grad_norm": 1.3626949787139893, + "learning_rate": 4.114128813288919e-05, + "loss": 1.1889, + "step": 18495 + }, + { + "epoch": 5.53, + "grad_norm": 1.5440853834152222, + "learning_rate": 4.113680118725355e-05, + "loss": 1.333, + "step": 18500 + }, + { + "epoch": 5.54, + "grad_norm": 4.131174564361572, + "learning_rate": 4.1132313350399155e-05, + "loss": 1.0847, + "step": 18505 + }, + { + "epoch": 5.54, + "grad_norm": 2.0806546211242676, + "learning_rate": 4.112782462257386e-05, + "loss": 1.3398, + "step": 18510 + }, + { + "epoch": 5.54, + "grad_norm": 2.146475076675415, + "learning_rate": 4.112333500402558e-05, + "loss": 1.1576, + "step": 18515 + }, + { + "epoch": 5.54, + "grad_norm": 2.3793232440948486, + "learning_rate": 4.111884449500225e-05, + "loss": 1.2433, + "step": 18520 + }, + { + "epoch": 5.54, + "grad_norm": 1.008089303970337, + "learning_rate": 4.11143530957519e-05, + "loss": 1.2024, + "step": 18525 + }, + { + "epoch": 5.54, + "grad_norm": 2.3187623023986816, + "learning_rate": 4.110986080652259e-05, + "loss": 1.1646, + "step": 18530 + }, + { + "epoch": 5.55, + "grad_norm": 2.2866110801696777, + "learning_rate": 4.1105367627562405e-05, + "loss": 1.2074, + "step": 18535 + }, + { + "epoch": 5.55, + "grad_norm": 3.7783994674682617, + "learning_rate": 4.110087355911951e-05, + "loss": 1.1768, + "step": 18540 + }, + { + "epoch": 5.55, + "grad_norm": 3.170124053955078, + "learning_rate": 4.1096378601442095e-05, + "loss": 1.0867, + "step": 18545 + }, + { + "epoch": 5.55, + "grad_norm": 1.4972915649414062, + "learning_rate": 4.109188275477843e-05, + "loss": 1.0619, + "step": 18550 + }, + { + "epoch": 5.55, + "grad_norm": 3.528488874435425, + "learning_rate": 4.1087386019376804e-05, + "loss": 1.3686, + "step": 18555 + }, + { + "epoch": 5.55, + "grad_norm": 2.188445806503296, + "learning_rate": 4.108288839548557e-05, + "loss": 1.2527, + "step": 18560 + }, + { + "epoch": 5.55, + "grad_norm": 1.9165037870407104, + "learning_rate": 4.107838988335313e-05, + "loss": 1.3966, + "step": 18565 + }, + { + "epoch": 5.56, + "grad_norm": 3.255675792694092, + "learning_rate": 4.1073890483227925e-05, + "loss": 1.1216, + "step": 18570 + }, + { + "epoch": 5.56, + "grad_norm": 1.5940176248550415, + "learning_rate": 4.106939019535846e-05, + "loss": 1.2242, + "step": 18575 + }, + { + "epoch": 5.56, + "grad_norm": 0.7701283097267151, + "learning_rate": 4.106488901999328e-05, + "loss": 1.3358, + "step": 18580 + }, + { + "epoch": 5.56, + "grad_norm": 1.4922525882720947, + "learning_rate": 4.106038695738097e-05, + "loss": 1.1585, + "step": 18585 + }, + { + "epoch": 5.56, + "grad_norm": 3.2928543090820312, + "learning_rate": 4.105588400777018e-05, + "loss": 1.1905, + "step": 18590 + }, + { + "epoch": 5.56, + "grad_norm": 2.7126290798187256, + "learning_rate": 4.1051380171409616e-05, + "loss": 1.2003, + "step": 18595 + }, + { + "epoch": 5.56, + "grad_norm": 3.020399332046509, + "learning_rate": 4.104687544854801e-05, + "loss": 1.1377, + "step": 18600 + }, + { + "epoch": 5.57, + "grad_norm": 0.8808854222297668, + "learning_rate": 4.104236983943415e-05, + "loss": 1.2103, + "step": 18605 + }, + { + "epoch": 5.57, + "grad_norm": 4.727932929992676, + "learning_rate": 4.1037863344316875e-05, + "loss": 1.1696, + "step": 18610 + }, + { + "epoch": 5.57, + "grad_norm": 6.011281967163086, + "learning_rate": 4.103335596344508e-05, + "loss": 1.199, + "step": 18615 + }, + { + "epoch": 5.57, + "grad_norm": 1.4836233854293823, + "learning_rate": 4.10288476970677e-05, + "loss": 1.1668, + "step": 18620 + }, + { + "epoch": 5.57, + "grad_norm": 1.927235722541809, + "learning_rate": 4.1024338545433724e-05, + "loss": 1.2543, + "step": 18625 + }, + { + "epoch": 5.57, + "grad_norm": 3.635470390319824, + "learning_rate": 4.101982850879218e-05, + "loss": 1.0165, + "step": 18630 + }, + { + "epoch": 5.58, + "grad_norm": 2.8242850303649902, + "learning_rate": 4.101531758739217e-05, + "loss": 1.3089, + "step": 18635 + }, + { + "epoch": 5.58, + "grad_norm": 2.512054920196533, + "learning_rate": 4.101080578148281e-05, + "loss": 1.1262, + "step": 18640 + }, + { + "epoch": 5.58, + "grad_norm": 2.9997546672821045, + "learning_rate": 4.10062930913133e-05, + "loss": 1.2634, + "step": 18645 + }, + { + "epoch": 5.58, + "grad_norm": 2.239625930786133, + "learning_rate": 4.1001779517132846e-05, + "loss": 1.1789, + "step": 18650 + }, + { + "epoch": 5.58, + "grad_norm": 1.2925533056259155, + "learning_rate": 4.099726505919075e-05, + "loss": 1.3636, + "step": 18655 + }, + { + "epoch": 5.58, + "grad_norm": 2.2485687732696533, + "learning_rate": 4.099274971773632e-05, + "loss": 1.1971, + "step": 18660 + }, + { + "epoch": 5.58, + "grad_norm": 1.4676896333694458, + "learning_rate": 4.098823349301896e-05, + "loss": 1.0614, + "step": 18665 + }, + { + "epoch": 5.59, + "grad_norm": 1.4249932765960693, + "learning_rate": 4.0983716385288083e-05, + "loss": 0.9559, + "step": 18670 + }, + { + "epoch": 5.59, + "grad_norm": 1.0708461999893188, + "learning_rate": 4.097919839479316e-05, + "loss": 1.2269, + "step": 18675 + }, + { + "epoch": 5.59, + "grad_norm": 2.775709390640259, + "learning_rate": 4.097467952178372e-05, + "loss": 1.1256, + "step": 18680 + }, + { + "epoch": 5.59, + "grad_norm": 3.5211985111236572, + "learning_rate": 4.097015976650934e-05, + "loss": 1.3445, + "step": 18685 + }, + { + "epoch": 5.59, + "grad_norm": 4.323126316070557, + "learning_rate": 4.0965639129219626e-05, + "loss": 1.3169, + "step": 18690 + }, + { + "epoch": 5.59, + "grad_norm": 1.8097007274627686, + "learning_rate": 4.096111761016426e-05, + "loss": 1.219, + "step": 18695 + }, + { + "epoch": 5.59, + "grad_norm": 1.6146286725997925, + "learning_rate": 4.095659520959297e-05, + "loss": 1.3955, + "step": 18700 + }, + { + "epoch": 5.6, + "grad_norm": 2.256901741027832, + "learning_rate": 4.09520719277555e-05, + "loss": 0.9899, + "step": 18705 + }, + { + "epoch": 5.6, + "grad_norm": 2.670027494430542, + "learning_rate": 4.094754776490168e-05, + "loss": 1.2336, + "step": 18710 + }, + { + "epoch": 5.6, + "grad_norm": 1.1742836236953735, + "learning_rate": 4.094302272128138e-05, + "loss": 1.3383, + "step": 18715 + }, + { + "epoch": 5.6, + "grad_norm": 1.9772650003433228, + "learning_rate": 4.0938496797144506e-05, + "loss": 1.2239, + "step": 18720 + }, + { + "epoch": 5.6, + "grad_norm": 1.8488539457321167, + "learning_rate": 4.093396999274102e-05, + "loss": 1.0871, + "step": 18725 + }, + { + "epoch": 5.6, + "grad_norm": 3.4919984340667725, + "learning_rate": 4.092944230832093e-05, + "loss": 1.2676, + "step": 18730 + }, + { + "epoch": 5.61, + "grad_norm": 1.2979923486709595, + "learning_rate": 4.092491374413431e-05, + "loss": 1.2437, + "step": 18735 + }, + { + "epoch": 5.61, + "grad_norm": 1.2796027660369873, + "learning_rate": 4.092038430043125e-05, + "loss": 1.1508, + "step": 18740 + }, + { + "epoch": 5.61, + "grad_norm": 1.175561547279358, + "learning_rate": 4.09158539774619e-05, + "loss": 1.1637, + "step": 18745 + }, + { + "epoch": 5.61, + "grad_norm": 3.259338140487671, + "learning_rate": 4.0911322775476494e-05, + "loss": 1.3101, + "step": 18750 + }, + { + "epoch": 5.61, + "grad_norm": 1.183301568031311, + "learning_rate": 4.0906790694725275e-05, + "loss": 1.1849, + "step": 18755 + }, + { + "epoch": 5.61, + "grad_norm": 3.8314900398254395, + "learning_rate": 4.090225773545853e-05, + "loss": 1.2224, + "step": 18760 + }, + { + "epoch": 5.61, + "grad_norm": 1.5235013961791992, + "learning_rate": 4.089772389792662e-05, + "loss": 1.3048, + "step": 18765 + }, + { + "epoch": 5.62, + "grad_norm": 2.4287803173065186, + "learning_rate": 4.089318918237994e-05, + "loss": 1.1388, + "step": 18770 + }, + { + "epoch": 5.62, + "grad_norm": 1.4137362241744995, + "learning_rate": 4.0888653589068946e-05, + "loss": 1.2584, + "step": 18775 + }, + { + "epoch": 5.62, + "grad_norm": 1.061883807182312, + "learning_rate": 4.0884117118244136e-05, + "loss": 1.3847, + "step": 18780 + }, + { + "epoch": 5.62, + "grad_norm": 3.9546823501586914, + "learning_rate": 4.087957977015604e-05, + "loss": 1.2478, + "step": 18785 + }, + { + "epoch": 5.62, + "grad_norm": 1.4414864778518677, + "learning_rate": 4.087504154505526e-05, + "loss": 1.3395, + "step": 18790 + }, + { + "epoch": 5.62, + "grad_norm": 1.690242052078247, + "learning_rate": 4.0870502443192446e-05, + "loss": 1.0451, + "step": 18795 + }, + { + "epoch": 5.62, + "grad_norm": 2.187337636947632, + "learning_rate": 4.086596246481826e-05, + "loss": 1.1884, + "step": 18800 + }, + { + "epoch": 5.63, + "grad_norm": 2.2674190998077393, + "learning_rate": 4.086142161018347e-05, + "loss": 1.323, + "step": 18805 + }, + { + "epoch": 5.63, + "grad_norm": 2.5989344120025635, + "learning_rate": 4.0856879879538854e-05, + "loss": 1.1205, + "step": 18810 + }, + { + "epoch": 5.63, + "grad_norm": 1.1521856784820557, + "learning_rate": 4.085233727313524e-05, + "loss": 1.2544, + "step": 18815 + }, + { + "epoch": 5.63, + "grad_norm": 3.605713367462158, + "learning_rate": 4.084779379122352e-05, + "loss": 1.2099, + "step": 18820 + }, + { + "epoch": 5.63, + "grad_norm": 1.6035972833633423, + "learning_rate": 4.0843249434054624e-05, + "loss": 1.3516, + "step": 18825 + }, + { + "epoch": 5.63, + "grad_norm": 1.7690247297286987, + "learning_rate": 4.083870420187953e-05, + "loss": 1.2547, + "step": 18830 + }, + { + "epoch": 5.64, + "grad_norm": 1.6774495840072632, + "learning_rate": 4.083415809494926e-05, + "loss": 1.1669, + "step": 18835 + }, + { + "epoch": 5.64, + "grad_norm": 2.1360936164855957, + "learning_rate": 4.08296111135149e-05, + "loss": 1.1833, + "step": 18840 + }, + { + "epoch": 5.64, + "grad_norm": 3.100994110107422, + "learning_rate": 4.082506325782757e-05, + "loss": 1.1586, + "step": 18845 + }, + { + "epoch": 5.64, + "grad_norm": 2.5224337577819824, + "learning_rate": 4.0820514528138444e-05, + "loss": 1.2535, + "step": 18850 + }, + { + "epoch": 5.64, + "grad_norm": 1.2927287817001343, + "learning_rate": 4.0815964924698745e-05, + "loss": 1.1492, + "step": 18855 + }, + { + "epoch": 5.64, + "grad_norm": 1.2006562948226929, + "learning_rate": 4.0811414447759755e-05, + "loss": 1.1727, + "step": 18860 + }, + { + "epoch": 5.64, + "grad_norm": 2.254915475845337, + "learning_rate": 4.080686309757277e-05, + "loss": 1.2347, + "step": 18865 + }, + { + "epoch": 5.65, + "grad_norm": 1.6122604608535767, + "learning_rate": 4.080231087438916e-05, + "loss": 1.0974, + "step": 18870 + }, + { + "epoch": 5.65, + "grad_norm": 2.5616037845611572, + "learning_rate": 4.079775777846036e-05, + "loss": 1.1608, + "step": 18875 + }, + { + "epoch": 5.65, + "grad_norm": 4.505390644073486, + "learning_rate": 4.07932038100378e-05, + "loss": 1.1997, + "step": 18880 + }, + { + "epoch": 5.65, + "grad_norm": 3.1718831062316895, + "learning_rate": 4.0788648969373035e-05, + "loss": 1.2127, + "step": 18885 + }, + { + "epoch": 5.65, + "grad_norm": 2.7245662212371826, + "learning_rate": 4.078409325671758e-05, + "loss": 1.2764, + "step": 18890 + }, + { + "epoch": 5.65, + "grad_norm": 0.9645751118659973, + "learning_rate": 4.0779536672323074e-05, + "loss": 1.2709, + "step": 18895 + }, + { + "epoch": 5.65, + "grad_norm": 2.0981924533843994, + "learning_rate": 4.077497921644115e-05, + "loss": 1.2451, + "step": 18900 + }, + { + "epoch": 5.66, + "grad_norm": 1.6078838109970093, + "learning_rate": 4.077042088932352e-05, + "loss": 1.1987, + "step": 18905 + }, + { + "epoch": 5.66, + "grad_norm": 3.138634443283081, + "learning_rate": 4.0765861691221943e-05, + "loss": 1.2224, + "step": 18910 + }, + { + "epoch": 5.66, + "grad_norm": 1.9491277933120728, + "learning_rate": 4.0761301622388204e-05, + "loss": 1.0712, + "step": 18915 + }, + { + "epoch": 5.66, + "grad_norm": 1.6134378910064697, + "learning_rate": 4.075674068307417e-05, + "loss": 1.2271, + "step": 18920 + }, + { + "epoch": 5.66, + "grad_norm": 4.156822681427002, + "learning_rate": 4.075217887353172e-05, + "loss": 1.2066, + "step": 18925 + }, + { + "epoch": 5.66, + "grad_norm": 2.2299163341522217, + "learning_rate": 4.074761619401281e-05, + "loss": 1.2968, + "step": 18930 + }, + { + "epoch": 5.67, + "grad_norm": 1.9773372411727905, + "learning_rate": 4.0743052644769416e-05, + "loss": 1.1542, + "step": 18935 + }, + { + "epoch": 5.67, + "grad_norm": 1.7787487506866455, + "learning_rate": 4.0738488226053595e-05, + "loss": 1.2944, + "step": 18940 + }, + { + "epoch": 5.67, + "grad_norm": 1.2517333030700684, + "learning_rate": 4.073392293811742e-05, + "loss": 1.2636, + "step": 18945 + }, + { + "epoch": 5.67, + "grad_norm": 2.801917314529419, + "learning_rate": 4.072935678121305e-05, + "loss": 1.2396, + "step": 18950 + }, + { + "epoch": 5.67, + "grad_norm": 2.199772357940674, + "learning_rate": 4.0724789755592654e-05, + "loss": 1.3366, + "step": 18955 + }, + { + "epoch": 5.67, + "grad_norm": 1.5385900735855103, + "learning_rate": 4.072022186150846e-05, + "loss": 1.1959, + "step": 18960 + }, + { + "epoch": 5.67, + "grad_norm": 1.959558367729187, + "learning_rate": 4.0715653099212744e-05, + "loss": 1.1745, + "step": 18965 + }, + { + "epoch": 5.68, + "grad_norm": 4.36882209777832, + "learning_rate": 4.071108346895786e-05, + "loss": 1.1226, + "step": 18970 + }, + { + "epoch": 5.68, + "grad_norm": 3.006443738937378, + "learning_rate": 4.0706512970996145e-05, + "loss": 1.1088, + "step": 18975 + }, + { + "epoch": 5.68, + "grad_norm": 1.4092053174972534, + "learning_rate": 4.070194160558006e-05, + "loss": 1.2145, + "step": 18980 + }, + { + "epoch": 5.68, + "grad_norm": 2.4782750606536865, + "learning_rate": 4.069736937296206e-05, + "loss": 1.2773, + "step": 18985 + }, + { + "epoch": 5.68, + "grad_norm": 2.681567430496216, + "learning_rate": 4.069279627339466e-05, + "loss": 1.2458, + "step": 18990 + }, + { + "epoch": 5.68, + "grad_norm": 2.1898365020751953, + "learning_rate": 4.068822230713044e-05, + "loss": 1.2287, + "step": 18995 + }, + { + "epoch": 5.68, + "grad_norm": 1.1778751611709595, + "learning_rate": 4.068364747442201e-05, + "loss": 1.3985, + "step": 19000 + }, + { + "epoch": 5.69, + "grad_norm": 2.775921106338501, + "learning_rate": 4.0679071775522024e-05, + "loss": 1.3502, + "step": 19005 + }, + { + "epoch": 5.69, + "grad_norm": 0.9329620003700256, + "learning_rate": 4.0674495210683214e-05, + "loss": 1.0845, + "step": 19010 + }, + { + "epoch": 5.69, + "grad_norm": 2.163052797317505, + "learning_rate": 4.0669917780158315e-05, + "loss": 1.1712, + "step": 19015 + }, + { + "epoch": 5.69, + "grad_norm": 1.6568050384521484, + "learning_rate": 4.066533948420015e-05, + "loss": 1.3567, + "step": 19020 + }, + { + "epoch": 5.69, + "grad_norm": 1.2599117755889893, + "learning_rate": 4.0660760323061564e-05, + "loss": 1.4662, + "step": 19025 + }, + { + "epoch": 5.69, + "grad_norm": 1.5358624458312988, + "learning_rate": 4.065618029699547e-05, + "loss": 1.372, + "step": 19030 + }, + { + "epoch": 5.7, + "grad_norm": 2.070814609527588, + "learning_rate": 4.06515994062548e-05, + "loss": 1.2393, + "step": 19035 + }, + { + "epoch": 5.7, + "grad_norm": 1.2835952043533325, + "learning_rate": 4.0647017651092575e-05, + "loss": 1.1488, + "step": 19040 + }, + { + "epoch": 5.7, + "grad_norm": 3.1609346866607666, + "learning_rate": 4.064243503176183e-05, + "loss": 1.2701, + "step": 19045 + }, + { + "epoch": 5.7, + "grad_norm": 4.8114471435546875, + "learning_rate": 4.063785154851565e-05, + "loss": 1.1842, + "step": 19050 + }, + { + "epoch": 5.7, + "grad_norm": 2.764596462249756, + "learning_rate": 4.0633267201607197e-05, + "loss": 1.1671, + "step": 19055 + }, + { + "epoch": 5.7, + "grad_norm": 1.5256291627883911, + "learning_rate": 4.062868199128964e-05, + "loss": 1.2943, + "step": 19060 + }, + { + "epoch": 5.7, + "grad_norm": 3.720947504043579, + "learning_rate": 4.062409591781622e-05, + "loss": 1.2156, + "step": 19065 + }, + { + "epoch": 5.71, + "grad_norm": 1.2810076475143433, + "learning_rate": 4.061950898144021e-05, + "loss": 1.1736, + "step": 19070 + }, + { + "epoch": 5.71, + "grad_norm": 1.708155632019043, + "learning_rate": 4.061492118241497e-05, + "loss": 1.2937, + "step": 19075 + }, + { + "epoch": 5.71, + "grad_norm": 2.4375946521759033, + "learning_rate": 4.0610332520993866e-05, + "loss": 1.2401, + "step": 19080 + }, + { + "epoch": 5.71, + "grad_norm": 2.5409374237060547, + "learning_rate": 4.060574299743032e-05, + "loss": 1.1797, + "step": 19085 + }, + { + "epoch": 5.71, + "grad_norm": 1.2663114070892334, + "learning_rate": 4.0601152611977797e-05, + "loss": 1.3379, + "step": 19090 + }, + { + "epoch": 5.71, + "grad_norm": 1.4234334230422974, + "learning_rate": 4.059656136488985e-05, + "loss": 1.2776, + "step": 19095 + }, + { + "epoch": 5.71, + "grad_norm": 2.477080821990967, + "learning_rate": 4.059196925642002e-05, + "loss": 1.1275, + "step": 19100 + }, + { + "epoch": 5.72, + "grad_norm": 2.8818910121917725, + "learning_rate": 4.058737628682193e-05, + "loss": 1.2485, + "step": 19105 + }, + { + "epoch": 5.72, + "grad_norm": 1.8685482740402222, + "learning_rate": 4.0582782456349264e-05, + "loss": 1.1152, + "step": 19110 + }, + { + "epoch": 5.72, + "grad_norm": 2.1116809844970703, + "learning_rate": 4.057818776525571e-05, + "loss": 1.2397, + "step": 19115 + }, + { + "epoch": 5.72, + "grad_norm": 3.2058467864990234, + "learning_rate": 4.0573592213795045e-05, + "loss": 1.3506, + "step": 19120 + }, + { + "epoch": 5.72, + "grad_norm": 2.4281482696533203, + "learning_rate": 4.0568995802221066e-05, + "loss": 1.0978, + "step": 19125 + }, + { + "epoch": 5.72, + "grad_norm": 3.1970787048339844, + "learning_rate": 4.056439853078763e-05, + "loss": 1.1549, + "step": 19130 + }, + { + "epoch": 5.72, + "grad_norm": 4.877747535705566, + "learning_rate": 4.0559800399748645e-05, + "loss": 1.186, + "step": 19135 + }, + { + "epoch": 5.73, + "grad_norm": 1.5900225639343262, + "learning_rate": 4.055520140935806e-05, + "loss": 1.2458, + "step": 19140 + }, + { + "epoch": 5.73, + "grad_norm": 2.55224871635437, + "learning_rate": 4.055060155986986e-05, + "loss": 1.2647, + "step": 19145 + }, + { + "epoch": 5.73, + "grad_norm": 1.5223407745361328, + "learning_rate": 4.054600085153811e-05, + "loss": 1.2311, + "step": 19150 + }, + { + "epoch": 5.73, + "grad_norm": 1.986936330795288, + "learning_rate": 4.054139928461689e-05, + "loss": 1.1517, + "step": 19155 + }, + { + "epoch": 5.73, + "grad_norm": 2.5802133083343506, + "learning_rate": 4.0536796859360336e-05, + "loss": 1.2503, + "step": 19160 + }, + { + "epoch": 5.73, + "grad_norm": 1.5326321125030518, + "learning_rate": 4.053219357602265e-05, + "loss": 1.2379, + "step": 19165 + }, + { + "epoch": 5.74, + "grad_norm": 2.684556484222412, + "learning_rate": 4.0527589434858046e-05, + "loss": 1.2704, + "step": 19170 + }, + { + "epoch": 5.74, + "grad_norm": 1.2292155027389526, + "learning_rate": 4.0522984436120826e-05, + "loss": 1.4196, + "step": 19175 + }, + { + "epoch": 5.74, + "grad_norm": 3.3510990142822266, + "learning_rate": 4.051837858006531e-05, + "loss": 1.0128, + "step": 19180 + }, + { + "epoch": 5.74, + "grad_norm": 2.779120922088623, + "learning_rate": 4.051377186694588e-05, + "loss": 1.3174, + "step": 19185 + }, + { + "epoch": 5.74, + "grad_norm": 2.3590750694274902, + "learning_rate": 4.0509164297016944e-05, + "loss": 1.2303, + "step": 19190 + }, + { + "epoch": 5.74, + "grad_norm": 1.3253639936447144, + "learning_rate": 4.050455587053299e-05, + "loss": 1.3627, + "step": 19195 + }, + { + "epoch": 5.74, + "grad_norm": 2.0595991611480713, + "learning_rate": 4.049994658774853e-05, + "loss": 1.2418, + "step": 19200 + }, + { + "epoch": 5.75, + "grad_norm": 1.3671808242797852, + "learning_rate": 4.0495336448918135e-05, + "loss": 1.2585, + "step": 19205 + }, + { + "epoch": 5.75, + "grad_norm": 1.1783967018127441, + "learning_rate": 4.0490725454296414e-05, + "loss": 1.1087, + "step": 19210 + }, + { + "epoch": 5.75, + "grad_norm": 1.43436598777771, + "learning_rate": 4.048611360413803e-05, + "loss": 1.0994, + "step": 19215 + }, + { + "epoch": 5.75, + "grad_norm": 2.8289105892181396, + "learning_rate": 4.048150089869768e-05, + "loss": 1.2857, + "step": 19220 + }, + { + "epoch": 5.75, + "grad_norm": 2.906419277191162, + "learning_rate": 4.047688733823013e-05, + "loss": 1.3255, + "step": 19225 + }, + { + "epoch": 5.75, + "grad_norm": 1.00028395652771, + "learning_rate": 4.0472272922990185e-05, + "loss": 1.3454, + "step": 19230 + }, + { + "epoch": 5.75, + "grad_norm": 1.3520787954330444, + "learning_rate": 4.046765765323269e-05, + "loss": 1.2744, + "step": 19235 + }, + { + "epoch": 5.76, + "grad_norm": 3.6939780712127686, + "learning_rate": 4.046304152921253e-05, + "loss": 1.277, + "step": 19240 + }, + { + "epoch": 5.76, + "grad_norm": 1.3129464387893677, + "learning_rate": 4.045842455118467e-05, + "loss": 1.2569, + "step": 19245 + }, + { + "epoch": 5.76, + "grad_norm": 1.3802201747894287, + "learning_rate": 4.045380671940409e-05, + "loss": 1.3112, + "step": 19250 + }, + { + "epoch": 5.76, + "grad_norm": 2.282783031463623, + "learning_rate": 4.0449188034125825e-05, + "loss": 1.1128, + "step": 19255 + }, + { + "epoch": 5.76, + "grad_norm": 2.5896713733673096, + "learning_rate": 4.044456849560496e-05, + "loss": 1.2365, + "step": 19260 + }, + { + "epoch": 5.76, + "grad_norm": 1.469196081161499, + "learning_rate": 4.043994810409664e-05, + "loss": 1.2037, + "step": 19265 + }, + { + "epoch": 5.77, + "grad_norm": 2.223722457885742, + "learning_rate": 4.043532685985602e-05, + "loss": 1.377, + "step": 19270 + }, + { + "epoch": 5.77, + "grad_norm": 1.6373085975646973, + "learning_rate": 4.043070476313835e-05, + "loss": 1.1027, + "step": 19275 + }, + { + "epoch": 5.77, + "grad_norm": 1.401114821434021, + "learning_rate": 4.0426081814198905e-05, + "loss": 1.148, + "step": 19280 + }, + { + "epoch": 5.77, + "grad_norm": 1.4653375148773193, + "learning_rate": 4.042145801329298e-05, + "loss": 1.3143, + "step": 19285 + }, + { + "epoch": 5.77, + "grad_norm": 1.659494161605835, + "learning_rate": 4.0416833360675966e-05, + "loss": 1.2026, + "step": 19290 + }, + { + "epoch": 5.77, + "grad_norm": 2.820892810821533, + "learning_rate": 4.0412207856603266e-05, + "loss": 1.1208, + "step": 19295 + }, + { + "epoch": 5.77, + "grad_norm": 2.835115909576416, + "learning_rate": 4.040758150133035e-05, + "loss": 1.2334, + "step": 19300 + }, + { + "epoch": 5.78, + "grad_norm": 1.002402901649475, + "learning_rate": 4.040295429511273e-05, + "loss": 1.2697, + "step": 19305 + }, + { + "epoch": 5.78, + "grad_norm": 1.9748258590698242, + "learning_rate": 4.0398326238205946e-05, + "loss": 1.3538, + "step": 19310 + }, + { + "epoch": 5.78, + "grad_norm": 1.5948344469070435, + "learning_rate": 4.039369733086561e-05, + "loss": 1.2056, + "step": 19315 + }, + { + "epoch": 5.78, + "grad_norm": 5.312704563140869, + "learning_rate": 4.038906757334737e-05, + "loss": 1.2161, + "step": 19320 + }, + { + "epoch": 5.78, + "grad_norm": 2.936213970184326, + "learning_rate": 4.0384436965906924e-05, + "loss": 1.1106, + "step": 19325 + }, + { + "epoch": 5.78, + "grad_norm": 2.379530429840088, + "learning_rate": 4.037980550880002e-05, + "loss": 1.2153, + "step": 19330 + }, + { + "epoch": 5.78, + "grad_norm": 3.4053995609283447, + "learning_rate": 4.0375173202282444e-05, + "loss": 1.1903, + "step": 19335 + }, + { + "epoch": 5.79, + "grad_norm": 2.1918978691101074, + "learning_rate": 4.0370540046610026e-05, + "loss": 1.1617, + "step": 19340 + }, + { + "epoch": 5.79, + "grad_norm": 1.1753573417663574, + "learning_rate": 4.036590604203867e-05, + "loss": 1.141, + "step": 19345 + }, + { + "epoch": 5.79, + "grad_norm": 2.638122081756592, + "learning_rate": 4.036127118882429e-05, + "loss": 1.0728, + "step": 19350 + }, + { + "epoch": 5.79, + "grad_norm": 1.511003017425537, + "learning_rate": 4.035663548722287e-05, + "loss": 1.3012, + "step": 19355 + }, + { + "epoch": 5.79, + "grad_norm": 1.878172516822815, + "learning_rate": 4.035199893749043e-05, + "loss": 1.2585, + "step": 19360 + }, + { + "epoch": 5.79, + "grad_norm": 1.5874683856964111, + "learning_rate": 4.0347361539883045e-05, + "loss": 1.1257, + "step": 19365 + }, + { + "epoch": 5.8, + "grad_norm": 2.5368497371673584, + "learning_rate": 4.034272329465684e-05, + "loss": 1.277, + "step": 19370 + }, + { + "epoch": 5.8, + "grad_norm": 1.9644641876220703, + "learning_rate": 4.033808420206798e-05, + "loss": 1.267, + "step": 19375 + }, + { + "epoch": 5.8, + "grad_norm": 3.0246572494506836, + "learning_rate": 4.0333444262372666e-05, + "loss": 1.1498, + "step": 19380 + }, + { + "epoch": 5.8, + "grad_norm": 2.288479804992676, + "learning_rate": 4.032880347582716e-05, + "loss": 1.2349, + "step": 19385 + }, + { + "epoch": 5.8, + "grad_norm": 1.7660423517227173, + "learning_rate": 4.032416184268778e-05, + "loss": 1.2463, + "step": 19390 + }, + { + "epoch": 5.8, + "grad_norm": 2.469409465789795, + "learning_rate": 4.031951936321086e-05, + "loss": 1.2732, + "step": 19395 + }, + { + "epoch": 5.8, + "grad_norm": 2.5984413623809814, + "learning_rate": 4.0314876037652814e-05, + "loss": 1.1642, + "step": 19400 + }, + { + "epoch": 5.81, + "grad_norm": 4.0700225830078125, + "learning_rate": 4.0310231866270086e-05, + "loss": 1.1763, + "step": 19405 + }, + { + "epoch": 5.81, + "grad_norm": 1.4601423740386963, + "learning_rate": 4.030558684931917e-05, + "loss": 1.1787, + "step": 19410 + }, + { + "epoch": 5.81, + "grad_norm": 2.451631546020508, + "learning_rate": 4.0300940987056596e-05, + "loss": 1.0628, + "step": 19415 + }, + { + "epoch": 5.81, + "grad_norm": 2.339339017868042, + "learning_rate": 4.029629427973895e-05, + "loss": 1.1614, + "step": 19420 + }, + { + "epoch": 5.81, + "grad_norm": 2.694387674331665, + "learning_rate": 4.0291646727622875e-05, + "loss": 1.1025, + "step": 19425 + }, + { + "epoch": 5.81, + "grad_norm": 3.171658754348755, + "learning_rate": 4.0286998330965056e-05, + "loss": 1.2235, + "step": 19430 + }, + { + "epoch": 5.81, + "grad_norm": 3.675020694732666, + "learning_rate": 4.02823490900222e-05, + "loss": 1.2227, + "step": 19435 + }, + { + "epoch": 5.82, + "grad_norm": 1.9126026630401611, + "learning_rate": 4.027769900505109e-05, + "loss": 1.0964, + "step": 19440 + }, + { + "epoch": 5.82, + "grad_norm": 2.5049381256103516, + "learning_rate": 4.027304807630854e-05, + "loss": 1.3127, + "step": 19445 + }, + { + "epoch": 5.82, + "grad_norm": 1.2870092391967773, + "learning_rate": 4.0268396304051426e-05, + "loss": 1.3275, + "step": 19450 + }, + { + "epoch": 5.82, + "grad_norm": 2.4361467361450195, + "learning_rate": 4.026374368853665e-05, + "loss": 1.0307, + "step": 19455 + }, + { + "epoch": 5.82, + "grad_norm": 2.4936602115631104, + "learning_rate": 4.025909023002118e-05, + "loss": 1.176, + "step": 19460 + }, + { + "epoch": 5.82, + "grad_norm": 3.3425686359405518, + "learning_rate": 4.025443592876201e-05, + "loss": 1.0798, + "step": 19465 + }, + { + "epoch": 5.83, + "grad_norm": 2.0376148223876953, + "learning_rate": 4.024978078501621e-05, + "loss": 1.1973, + "step": 19470 + }, + { + "epoch": 5.83, + "grad_norm": 4.597989559173584, + "learning_rate": 4.0245124799040864e-05, + "loss": 1.3608, + "step": 19475 + }, + { + "epoch": 5.83, + "grad_norm": 1.8406076431274414, + "learning_rate": 4.024046797109312e-05, + "loss": 1.2414, + "step": 19480 + }, + { + "epoch": 5.83, + "grad_norm": 1.4122562408447266, + "learning_rate": 4.023581030143018e-05, + "loss": 1.2824, + "step": 19485 + }, + { + "epoch": 5.83, + "grad_norm": 1.3209871053695679, + "learning_rate": 4.023115179030926e-05, + "loss": 1.2982, + "step": 19490 + }, + { + "epoch": 5.83, + "grad_norm": 2.4503726959228516, + "learning_rate": 4.0226492437987676e-05, + "loss": 1.1107, + "step": 19495 + }, + { + "epoch": 5.83, + "grad_norm": 1.1615526676177979, + "learning_rate": 4.022183224472272e-05, + "loss": 1.1436, + "step": 19500 + }, + { + "epoch": 5.84, + "grad_norm": 2.6597816944122314, + "learning_rate": 4.021717121077181e-05, + "loss": 1.189, + "step": 19505 + }, + { + "epoch": 5.84, + "grad_norm": 2.0554018020629883, + "learning_rate": 4.0212509336392345e-05, + "loss": 1.1992, + "step": 19510 + }, + { + "epoch": 5.84, + "grad_norm": 2.7470314502716064, + "learning_rate": 4.02078466218418e-05, + "loss": 1.378, + "step": 19515 + }, + { + "epoch": 5.84, + "grad_norm": 1.204832673072815, + "learning_rate": 4.020318306737769e-05, + "loss": 1.1576, + "step": 19520 + }, + { + "epoch": 5.84, + "grad_norm": 1.8212392330169678, + "learning_rate": 4.019851867325759e-05, + "loss": 1.3113, + "step": 19525 + }, + { + "epoch": 5.84, + "grad_norm": 2.319030523300171, + "learning_rate": 4.01938534397391e-05, + "loss": 1.313, + "step": 19530 + }, + { + "epoch": 5.84, + "grad_norm": 1.1338316202163696, + "learning_rate": 4.018918736707988e-05, + "loss": 1.1278, + "step": 19535 + }, + { + "epoch": 5.85, + "grad_norm": 1.2906646728515625, + "learning_rate": 4.018452045553762e-05, + "loss": 1.2246, + "step": 19540 + }, + { + "epoch": 5.85, + "grad_norm": 3.1656301021575928, + "learning_rate": 4.017985270537009e-05, + "loss": 1.2851, + "step": 19545 + }, + { + "epoch": 5.85, + "grad_norm": 1.0056490898132324, + "learning_rate": 4.017518411683507e-05, + "loss": 1.1831, + "step": 19550 + }, + { + "epoch": 5.85, + "grad_norm": 1.4432793855667114, + "learning_rate": 4.01705146901904e-05, + "loss": 1.2026, + "step": 19555 + }, + { + "epoch": 5.85, + "grad_norm": 2.5495402812957764, + "learning_rate": 4.016584442569398e-05, + "loss": 1.1733, + "step": 19560 + }, + { + "epoch": 5.85, + "grad_norm": 4.6060099601745605, + "learning_rate": 4.016117332360373e-05, + "loss": 1.1664, + "step": 19565 + }, + { + "epoch": 5.86, + "grad_norm": 1.2806930541992188, + "learning_rate": 4.015650138417764e-05, + "loss": 1.2543, + "step": 19570 + }, + { + "epoch": 5.86, + "grad_norm": 1.6852878332138062, + "learning_rate": 4.015182860767373e-05, + "loss": 1.1064, + "step": 19575 + }, + { + "epoch": 5.86, + "grad_norm": 1.5390417575836182, + "learning_rate": 4.014715499435008e-05, + "loss": 1.2544, + "step": 19580 + }, + { + "epoch": 5.86, + "grad_norm": 2.724890947341919, + "learning_rate": 4.01424805444648e-05, + "loss": 1.1592, + "step": 19585 + }, + { + "epoch": 5.86, + "grad_norm": 5.490366458892822, + "learning_rate": 4.013780525827606e-05, + "loss": 1.151, + "step": 19590 + }, + { + "epoch": 5.86, + "grad_norm": 2.0615406036376953, + "learning_rate": 4.0133129136042066e-05, + "loss": 1.2411, + "step": 19595 + }, + { + "epoch": 5.86, + "grad_norm": 5.847554683685303, + "learning_rate": 4.012845217802109e-05, + "loss": 1.2015, + "step": 19600 + }, + { + "epoch": 5.87, + "grad_norm": 1.873935341835022, + "learning_rate": 4.0123774384471425e-05, + "loss": 1.2761, + "step": 19605 + }, + { + "epoch": 5.87, + "grad_norm": 2.046846389770508, + "learning_rate": 4.0119095755651414e-05, + "loss": 1.2653, + "step": 19610 + }, + { + "epoch": 5.87, + "grad_norm": 2.1344308853149414, + "learning_rate": 4.011441629181946e-05, + "loss": 1.3688, + "step": 19615 + }, + { + "epoch": 5.87, + "grad_norm": 1.3133376836776733, + "learning_rate": 4.010973599323401e-05, + "loss": 1.3191, + "step": 19620 + }, + { + "epoch": 5.87, + "grad_norm": 1.7745246887207031, + "learning_rate": 4.010505486015354e-05, + "loss": 1.1906, + "step": 19625 + }, + { + "epoch": 5.87, + "grad_norm": 2.6860477924346924, + "learning_rate": 4.010037289283659e-05, + "loss": 1.1099, + "step": 19630 + }, + { + "epoch": 5.87, + "grad_norm": 2.37652850151062, + "learning_rate": 4.009569009154175e-05, + "loss": 1.0423, + "step": 19635 + }, + { + "epoch": 5.88, + "grad_norm": 1.3366721868515015, + "learning_rate": 4.0091006456527634e-05, + "loss": 1.3428, + "step": 19640 + }, + { + "epoch": 5.88, + "grad_norm": 5.54878568649292, + "learning_rate": 4.008632198805292e-05, + "loss": 1.1666, + "step": 19645 + }, + { + "epoch": 5.88, + "grad_norm": 0.9139274954795837, + "learning_rate": 4.008163668637632e-05, + "loss": 1.0742, + "step": 19650 + }, + { + "epoch": 5.88, + "grad_norm": 1.8782241344451904, + "learning_rate": 4.00769505517566e-05, + "loss": 1.2777, + "step": 19655 + }, + { + "epoch": 5.88, + "grad_norm": 1.8892041444778442, + "learning_rate": 4.0072263584452576e-05, + "loss": 1.3023, + "step": 19660 + }, + { + "epoch": 5.88, + "grad_norm": 1.7364705801010132, + "learning_rate": 4.0067575784723104e-05, + "loss": 1.2377, + "step": 19665 + }, + { + "epoch": 5.89, + "grad_norm": 0.6901882886886597, + "learning_rate": 4.0062887152827075e-05, + "loss": 1.2162, + "step": 19670 + }, + { + "epoch": 5.89, + "grad_norm": 1.6840230226516724, + "learning_rate": 4.005819768902346e-05, + "loss": 1.3165, + "step": 19675 + }, + { + "epoch": 5.89, + "grad_norm": 1.284379243850708, + "learning_rate": 4.005350739357122e-05, + "loss": 1.2768, + "step": 19680 + }, + { + "epoch": 5.89, + "grad_norm": 1.735634446144104, + "learning_rate": 4.004881626672943e-05, + "loss": 1.1276, + "step": 19685 + }, + { + "epoch": 5.89, + "grad_norm": 1.7231804132461548, + "learning_rate": 4.004412430875715e-05, + "loss": 1.2146, + "step": 19690 + }, + { + "epoch": 5.89, + "grad_norm": 3.919288396835327, + "learning_rate": 4.0039431519913525e-05, + "loss": 1.1341, + "step": 19695 + }, + { + "epoch": 5.89, + "grad_norm": 1.3154077529907227, + "learning_rate": 4.0034737900457734e-05, + "loss": 1.209, + "step": 19700 + }, + { + "epoch": 5.9, + "grad_norm": 2.288764715194702, + "learning_rate": 4.003004345064899e-05, + "loss": 1.2333, + "step": 19705 + }, + { + "epoch": 5.9, + "grad_norm": 1.616042137145996, + "learning_rate": 4.002534817074657e-05, + "loss": 1.2067, + "step": 19710 + }, + { + "epoch": 5.9, + "grad_norm": 2.097806692123413, + "learning_rate": 4.002065206100979e-05, + "loss": 1.2761, + "step": 19715 + }, + { + "epoch": 5.9, + "grad_norm": 4.625036716461182, + "learning_rate": 4.001595512169801e-05, + "loss": 1.1133, + "step": 19720 + }, + { + "epoch": 5.9, + "grad_norm": 1.4965091943740845, + "learning_rate": 4.001125735307063e-05, + "loss": 1.2998, + "step": 19725 + }, + { + "epoch": 5.9, + "grad_norm": 1.4583796262741089, + "learning_rate": 4.000655875538712e-05, + "loss": 1.1086, + "step": 19730 + }, + { + "epoch": 5.9, + "grad_norm": 2.9411721229553223, + "learning_rate": 4.000185932890697e-05, + "loss": 1.2033, + "step": 19735 + }, + { + "epoch": 5.91, + "grad_norm": 1.419301986694336, + "learning_rate": 3.999715907388971e-05, + "loss": 1.2032, + "step": 19740 + }, + { + "epoch": 5.91, + "grad_norm": 1.0543030500411987, + "learning_rate": 3.999245799059496e-05, + "loss": 1.3008, + "step": 19745 + }, + { + "epoch": 5.91, + "grad_norm": 4.684767723083496, + "learning_rate": 3.998775607928232e-05, + "loss": 1.3266, + "step": 19750 + }, + { + "epoch": 5.91, + "grad_norm": 3.58650541305542, + "learning_rate": 3.99830533402115e-05, + "loss": 1.2931, + "step": 19755 + }, + { + "epoch": 5.91, + "grad_norm": 1.2763997316360474, + "learning_rate": 3.997834977364222e-05, + "loss": 1.0974, + "step": 19760 + }, + { + "epoch": 5.91, + "grad_norm": 2.0618629455566406, + "learning_rate": 3.9973645379834255e-05, + "loss": 1.1197, + "step": 19765 + }, + { + "epoch": 5.91, + "grad_norm": 1.2482560873031616, + "learning_rate": 3.9968940159047416e-05, + "loss": 1.1536, + "step": 19770 + }, + { + "epoch": 5.92, + "grad_norm": 1.5299739837646484, + "learning_rate": 3.9964234111541567e-05, + "loss": 1.2666, + "step": 19775 + }, + { + "epoch": 5.92, + "grad_norm": 6.283170223236084, + "learning_rate": 3.9959527237576624e-05, + "loss": 1.2071, + "step": 19780 + }, + { + "epoch": 5.92, + "grad_norm": 1.4811429977416992, + "learning_rate": 3.995481953741254e-05, + "loss": 1.1695, + "step": 19785 + }, + { + "epoch": 5.92, + "grad_norm": 2.082146406173706, + "learning_rate": 3.995011101130932e-05, + "loss": 1.2389, + "step": 19790 + }, + { + "epoch": 5.92, + "grad_norm": 1.7142033576965332, + "learning_rate": 3.994540165952701e-05, + "loss": 1.3164, + "step": 19795 + }, + { + "epoch": 5.92, + "grad_norm": 1.3246126174926758, + "learning_rate": 3.99406914823257e-05, + "loss": 1.33, + "step": 19800 + }, + { + "epoch": 5.93, + "grad_norm": 3.4477603435516357, + "learning_rate": 3.993598047996553e-05, + "loss": 1.1553, + "step": 19805 + }, + { + "epoch": 5.93, + "grad_norm": 1.8627071380615234, + "learning_rate": 3.9931268652706676e-05, + "loss": 1.1348, + "step": 19810 + }, + { + "epoch": 5.93, + "grad_norm": 3.652338981628418, + "learning_rate": 3.992655600080938e-05, + "loss": 1.03, + "step": 19815 + }, + { + "epoch": 5.93, + "grad_norm": 2.0452940464019775, + "learning_rate": 3.992184252453392e-05, + "loss": 1.0227, + "step": 19820 + }, + { + "epoch": 5.93, + "grad_norm": 2.3490209579467773, + "learning_rate": 3.99171282241406e-05, + "loss": 1.2441, + "step": 19825 + }, + { + "epoch": 5.93, + "grad_norm": 1.5200634002685547, + "learning_rate": 3.991241309988979e-05, + "loss": 1.2541, + "step": 19830 + }, + { + "epoch": 5.93, + "grad_norm": 2.570288896560669, + "learning_rate": 3.9907697152041915e-05, + "loss": 1.3228, + "step": 19835 + }, + { + "epoch": 5.94, + "grad_norm": 1.5330473184585571, + "learning_rate": 3.9902980380857414e-05, + "loss": 1.2349, + "step": 19840 + }, + { + "epoch": 5.94, + "grad_norm": 1.0376076698303223, + "learning_rate": 3.9898262786596794e-05, + "loss": 1.1803, + "step": 19845 + }, + { + "epoch": 5.94, + "grad_norm": 1.4914852380752563, + "learning_rate": 3.989354436952061e-05, + "loss": 1.1023, + "step": 19850 + }, + { + "epoch": 5.94, + "grad_norm": 1.438751220703125, + "learning_rate": 3.988882512988945e-05, + "loss": 1.2842, + "step": 19855 + }, + { + "epoch": 5.94, + "grad_norm": 2.238565683364868, + "learning_rate": 3.988410506796396e-05, + "loss": 1.2185, + "step": 19860 + }, + { + "epoch": 5.94, + "grad_norm": 5.389926910400391, + "learning_rate": 3.987938418400482e-05, + "loss": 1.1369, + "step": 19865 + }, + { + "epoch": 5.94, + "grad_norm": 4.195040702819824, + "learning_rate": 3.987466247827275e-05, + "loss": 1.1121, + "step": 19870 + }, + { + "epoch": 5.95, + "grad_norm": 1.3536615371704102, + "learning_rate": 3.986993995102853e-05, + "loss": 1.3349, + "step": 19875 + }, + { + "epoch": 5.95, + "grad_norm": 1.4799764156341553, + "learning_rate": 3.9865216602532994e-05, + "loss": 1.2105, + "step": 19880 + }, + { + "epoch": 5.95, + "grad_norm": 1.5953404903411865, + "learning_rate": 3.986049243304699e-05, + "loss": 1.1709, + "step": 19885 + }, + { + "epoch": 5.95, + "grad_norm": 3.760248899459839, + "learning_rate": 3.9855767442831436e-05, + "loss": 1.1154, + "step": 19890 + }, + { + "epoch": 5.95, + "grad_norm": 4.010507106781006, + "learning_rate": 3.985104163214729e-05, + "loss": 1.3683, + "step": 19895 + }, + { + "epoch": 5.95, + "grad_norm": 2.708240032196045, + "learning_rate": 3.984631500125555e-05, + "loss": 1.233, + "step": 19900 + }, + { + "epoch": 5.96, + "grad_norm": 1.024338960647583, + "learning_rate": 3.984158755041726e-05, + "loss": 1.1541, + "step": 19905 + }, + { + "epoch": 5.96, + "grad_norm": 3.15925931930542, + "learning_rate": 3.9836859279893526e-05, + "loss": 1.2759, + "step": 19910 + }, + { + "epoch": 5.96, + "grad_norm": 3.913562536239624, + "learning_rate": 3.9832130189945475e-05, + "loss": 1.1862, + "step": 19915 + }, + { + "epoch": 5.96, + "grad_norm": 2.1706655025482178, + "learning_rate": 3.982740028083428e-05, + "loss": 1.252, + "step": 19920 + }, + { + "epoch": 5.96, + "grad_norm": 1.2419073581695557, + "learning_rate": 3.982266955282119e-05, + "loss": 1.3196, + "step": 19925 + }, + { + "epoch": 5.96, + "grad_norm": 1.833250880241394, + "learning_rate": 3.9817938006167465e-05, + "loss": 1.1608, + "step": 19930 + }, + { + "epoch": 5.96, + "grad_norm": 1.1308797597885132, + "learning_rate": 3.9813205641134424e-05, + "loss": 1.2362, + "step": 19935 + }, + { + "epoch": 5.97, + "grad_norm": 1.6005351543426514, + "learning_rate": 3.980847245798344e-05, + "loss": 1.2911, + "step": 19940 + }, + { + "epoch": 5.97, + "grad_norm": 1.2938203811645508, + "learning_rate": 3.9803738456975905e-05, + "loss": 1.342, + "step": 19945 + }, + { + "epoch": 5.97, + "grad_norm": 5.012538433074951, + "learning_rate": 3.9799003638373283e-05, + "loss": 1.1956, + "step": 19950 + }, + { + "epoch": 5.97, + "grad_norm": 2.233893394470215, + "learning_rate": 3.979426800243708e-05, + "loss": 1.2192, + "step": 19955 + }, + { + "epoch": 5.97, + "grad_norm": 4.2855448722839355, + "learning_rate": 3.978953154942883e-05, + "loss": 1.2098, + "step": 19960 + }, + { + "epoch": 5.97, + "grad_norm": 2.717545747756958, + "learning_rate": 3.978479427961012e-05, + "loss": 1.265, + "step": 19965 + }, + { + "epoch": 5.97, + "grad_norm": 2.452308177947998, + "learning_rate": 3.97800561932426e-05, + "loss": 1.2003, + "step": 19970 + }, + { + "epoch": 5.98, + "grad_norm": 3.6436851024627686, + "learning_rate": 3.977531729058793e-05, + "loss": 1.3046, + "step": 19975 + }, + { + "epoch": 5.98, + "grad_norm": 2.9817821979522705, + "learning_rate": 3.977057757190785e-05, + "loss": 1.0906, + "step": 19980 + }, + { + "epoch": 5.98, + "grad_norm": 2.8185782432556152, + "learning_rate": 3.9765837037464124e-05, + "loss": 1.3706, + "step": 19985 + }, + { + "epoch": 5.98, + "grad_norm": 3.414907217025757, + "learning_rate": 3.9761095687518565e-05, + "loss": 1.1195, + "step": 19990 + }, + { + "epoch": 5.98, + "grad_norm": 2.366452217102051, + "learning_rate": 3.9756353522333034e-05, + "loss": 1.3436, + "step": 19995 + }, + { + "epoch": 5.98, + "grad_norm": 2.4633522033691406, + "learning_rate": 3.975161054216944e-05, + "loss": 1.2036, + "step": 20000 + }, + { + "epoch": 5.99, + "grad_norm": 2.6752207279205322, + "learning_rate": 3.9746866747289726e-05, + "loss": 1.3225, + "step": 20005 + }, + { + "epoch": 5.99, + "grad_norm": 1.7009152173995972, + "learning_rate": 3.9742122137955884e-05, + "loss": 1.2566, + "step": 20010 + }, + { + "epoch": 5.99, + "grad_norm": 2.860818386077881, + "learning_rate": 3.9737376714429964e-05, + "loss": 1.2588, + "step": 20015 + }, + { + "epoch": 5.99, + "grad_norm": 4.960500717163086, + "learning_rate": 3.973263047697405e-05, + "loss": 1.1748, + "step": 20020 + }, + { + "epoch": 5.99, + "grad_norm": 1.3426363468170166, + "learning_rate": 3.972788342585027e-05, + "loss": 1.2358, + "step": 20025 + }, + { + "epoch": 5.99, + "grad_norm": 5.134890556335449, + "learning_rate": 3.972313556132079e-05, + "loss": 1.2629, + "step": 20030 + }, + { + "epoch": 5.99, + "grad_norm": 3.40563702583313, + "learning_rate": 3.971838688364784e-05, + "loss": 1.186, + "step": 20035 + }, + { + "epoch": 6.0, + "grad_norm": 2.111565113067627, + "learning_rate": 3.9713637393093686e-05, + "loss": 1.2938, + "step": 20040 + }, + { + "epoch": 6.0, + "grad_norm": 1.0406635999679565, + "learning_rate": 3.970888708992063e-05, + "loss": 1.1778, + "step": 20045 + }, + { + "epoch": 6.0, + "grad_norm": 2.193850517272949, + "learning_rate": 3.9704135974391026e-05, + "loss": 1.206, + "step": 20050 + }, + { + "epoch": 6.0, + "grad_norm": 0.8618187308311462, + "learning_rate": 3.969938404676728e-05, + "loss": 1.2366, + "step": 20055 + }, + { + "epoch": 6.0, + "grad_norm": 1.8126907348632812, + "learning_rate": 3.969463130731183e-05, + "loss": 0.9741, + "step": 20060 + }, + { + "epoch": 6.0, + "grad_norm": 1.3693639039993286, + "learning_rate": 3.968987775628717e-05, + "loss": 1.0962, + "step": 20065 + }, + { + "epoch": 6.0, + "grad_norm": 1.7154806852340698, + "learning_rate": 3.9685123393955824e-05, + "loss": 1.4039, + "step": 20070 + }, + { + "epoch": 6.01, + "grad_norm": 1.4863255023956299, + "learning_rate": 3.968036822058038e-05, + "loss": 1.1968, + "step": 20075 + }, + { + "epoch": 6.01, + "grad_norm": 9.683221817016602, + "learning_rate": 3.9675612236423466e-05, + "loss": 1.1326, + "step": 20080 + }, + { + "epoch": 6.01, + "grad_norm": 2.053426504135132, + "learning_rate": 3.9670855441747737e-05, + "loss": 1.2913, + "step": 20085 + }, + { + "epoch": 6.01, + "grad_norm": 1.7866415977478027, + "learning_rate": 3.9666097836815915e-05, + "loss": 1.2354, + "step": 20090 + }, + { + "epoch": 6.01, + "grad_norm": 1.6911275386810303, + "learning_rate": 3.9661339421890746e-05, + "loss": 1.0844, + "step": 20095 + }, + { + "epoch": 6.01, + "grad_norm": 1.2909938097000122, + "learning_rate": 3.965658019723505e-05, + "loss": 1.2505, + "step": 20100 + }, + { + "epoch": 6.02, + "grad_norm": 2.46484375, + "learning_rate": 3.965182016311165e-05, + "loss": 0.985, + "step": 20105 + }, + { + "epoch": 6.02, + "grad_norm": 1.2409467697143555, + "learning_rate": 3.964705931978346e-05, + "loss": 1.314, + "step": 20110 + }, + { + "epoch": 6.02, + "grad_norm": 1.6112093925476074, + "learning_rate": 3.964229766751342e-05, + "loss": 1.1812, + "step": 20115 + }, + { + "epoch": 6.02, + "grad_norm": 2.3558475971221924, + "learning_rate": 3.9637535206564485e-05, + "loss": 1.2568, + "step": 20120 + }, + { + "epoch": 6.02, + "grad_norm": 2.557868242263794, + "learning_rate": 3.96327719371997e-05, + "loss": 1.0185, + "step": 20125 + }, + { + "epoch": 6.02, + "grad_norm": 1.7992500066757202, + "learning_rate": 3.962800785968213e-05, + "loss": 1.115, + "step": 20130 + }, + { + "epoch": 6.02, + "grad_norm": 5.036953926086426, + "learning_rate": 3.962324297427489e-05, + "loss": 1.0766, + "step": 20135 + }, + { + "epoch": 6.03, + "grad_norm": 1.705336332321167, + "learning_rate": 3.9618477281241146e-05, + "loss": 1.1394, + "step": 20140 + }, + { + "epoch": 6.03, + "grad_norm": 1.967340111732483, + "learning_rate": 3.9613710780844096e-05, + "loss": 1.2392, + "step": 20145 + }, + { + "epoch": 6.03, + "grad_norm": 2.6726629734039307, + "learning_rate": 3.9608943473346985e-05, + "loss": 1.2141, + "step": 20150 + }, + { + "epoch": 6.03, + "grad_norm": 1.7967979907989502, + "learning_rate": 3.960417535901311e-05, + "loss": 1.0865, + "step": 20155 + }, + { + "epoch": 6.03, + "grad_norm": 2.194570779800415, + "learning_rate": 3.959940643810581e-05, + "loss": 1.0641, + "step": 20160 + }, + { + "epoch": 6.03, + "grad_norm": 3.644632339477539, + "learning_rate": 3.9594636710888475e-05, + "loss": 1.2237, + "step": 20165 + }, + { + "epoch": 6.03, + "grad_norm": 1.7848505973815918, + "learning_rate": 3.9589866177624515e-05, + "loss": 1.1259, + "step": 20170 + }, + { + "epoch": 6.04, + "grad_norm": 1.647910714149475, + "learning_rate": 3.958509483857742e-05, + "loss": 1.3501, + "step": 20175 + }, + { + "epoch": 6.04, + "grad_norm": 2.3628077507019043, + "learning_rate": 3.95803226940107e-05, + "loss": 1.2317, + "step": 20180 + }, + { + "epoch": 6.04, + "grad_norm": 1.816735863685608, + "learning_rate": 3.95755497441879e-05, + "loss": 0.9811, + "step": 20185 + }, + { + "epoch": 6.04, + "grad_norm": 1.5639019012451172, + "learning_rate": 3.957077598937264e-05, + "loss": 1.1717, + "step": 20190 + }, + { + "epoch": 6.04, + "grad_norm": 1.4008920192718506, + "learning_rate": 3.956600142982858e-05, + "loss": 1.1988, + "step": 20195 + }, + { + "epoch": 6.04, + "grad_norm": 3.0697457790374756, + "learning_rate": 3.956122606581939e-05, + "loss": 1.0945, + "step": 20200 + }, + { + "epoch": 6.05, + "grad_norm": 1.7359205484390259, + "learning_rate": 3.9556449897608824e-05, + "loss": 1.1508, + "step": 20205 + }, + { + "epoch": 6.05, + "grad_norm": 2.08030104637146, + "learning_rate": 3.955167292546066e-05, + "loss": 1.0043, + "step": 20210 + }, + { + "epoch": 6.05, + "grad_norm": 2.151505470275879, + "learning_rate": 3.9546895149638737e-05, + "loss": 1.3255, + "step": 20215 + }, + { + "epoch": 6.05, + "grad_norm": 2.6911120414733887, + "learning_rate": 3.954211657040691e-05, + "loss": 1.0752, + "step": 20220 + }, + { + "epoch": 6.05, + "grad_norm": 3.3755829334259033, + "learning_rate": 3.953733718802909e-05, + "loss": 1.3089, + "step": 20225 + }, + { + "epoch": 6.05, + "grad_norm": 1.3556479215621948, + "learning_rate": 3.953255700276925e-05, + "loss": 1.1838, + "step": 20230 + }, + { + "epoch": 6.05, + "grad_norm": 1.6404752731323242, + "learning_rate": 3.95277760148914e-05, + "loss": 1.1368, + "step": 20235 + }, + { + "epoch": 6.06, + "grad_norm": 1.5696030855178833, + "learning_rate": 3.9522994224659576e-05, + "loss": 1.0713, + "step": 20240 + }, + { + "epoch": 6.06, + "grad_norm": 0.9167082905769348, + "learning_rate": 3.951821163233788e-05, + "loss": 1.0272, + "step": 20245 + }, + { + "epoch": 6.06, + "grad_norm": 2.055103063583374, + "learning_rate": 3.951342823819044e-05, + "loss": 1.1271, + "step": 20250 + }, + { + "epoch": 6.06, + "grad_norm": 3.4541079998016357, + "learning_rate": 3.950864404248145e-05, + "loss": 1.3, + "step": 20255 + }, + { + "epoch": 6.06, + "grad_norm": 3.939716100692749, + "learning_rate": 3.950385904547513e-05, + "loss": 1.2012, + "step": 20260 + }, + { + "epoch": 6.06, + "grad_norm": 1.5123727321624756, + "learning_rate": 3.9499073247435755e-05, + "loss": 1.2772, + "step": 20265 + }, + { + "epoch": 6.06, + "grad_norm": 1.4127657413482666, + "learning_rate": 3.949428664862762e-05, + "loss": 1.3369, + "step": 20270 + }, + { + "epoch": 6.07, + "grad_norm": 2.0082712173461914, + "learning_rate": 3.948949924931512e-05, + "loss": 1.2377, + "step": 20275 + }, + { + "epoch": 6.07, + "grad_norm": 1.3078731298446655, + "learning_rate": 3.9484711049762625e-05, + "loss": 1.097, + "step": 20280 + }, + { + "epoch": 6.07, + "grad_norm": 1.3073629140853882, + "learning_rate": 3.94799220502346e-05, + "loss": 1.0726, + "step": 20285 + }, + { + "epoch": 6.07, + "grad_norm": 1.1648201942443848, + "learning_rate": 3.9475132250995525e-05, + "loss": 1.2021, + "step": 20290 + }, + { + "epoch": 6.07, + "grad_norm": 4.134536266326904, + "learning_rate": 3.947034165230995e-05, + "loss": 1.1795, + "step": 20295 + }, + { + "epoch": 6.07, + "grad_norm": 1.0476043224334717, + "learning_rate": 3.946555025444244e-05, + "loss": 1.1613, + "step": 20300 + }, + { + "epoch": 6.08, + "grad_norm": 2.554640769958496, + "learning_rate": 3.9460758057657626e-05, + "loss": 1.1222, + "step": 20305 + }, + { + "epoch": 6.08, + "grad_norm": 3.514789342880249, + "learning_rate": 3.945596506222018e-05, + "loss": 1.0743, + "step": 20310 + }, + { + "epoch": 6.08, + "grad_norm": 1.703673243522644, + "learning_rate": 3.945117126839481e-05, + "loss": 1.1762, + "step": 20315 + }, + { + "epoch": 6.08, + "grad_norm": 2.6589813232421875, + "learning_rate": 3.944637667644627e-05, + "loss": 1.1546, + "step": 20320 + }, + { + "epoch": 6.08, + "grad_norm": 3.949831008911133, + "learning_rate": 3.9441581286639365e-05, + "loss": 1.2165, + "step": 20325 + }, + { + "epoch": 6.08, + "grad_norm": 2.2509963512420654, + "learning_rate": 3.943678509923893e-05, + "loss": 1.2908, + "step": 20330 + }, + { + "epoch": 6.08, + "grad_norm": 2.6351144313812256, + "learning_rate": 3.9431988114509854e-05, + "loss": 1.2144, + "step": 20335 + }, + { + "epoch": 6.09, + "grad_norm": 1.2921767234802246, + "learning_rate": 3.942719033271709e-05, + "loss": 1.2182, + "step": 20340 + }, + { + "epoch": 6.09, + "grad_norm": 1.6154694557189941, + "learning_rate": 3.9422391754125596e-05, + "loss": 1.1248, + "step": 20345 + }, + { + "epoch": 6.09, + "grad_norm": 1.5173944234848022, + "learning_rate": 3.94175923790004e-05, + "loss": 1.1687, + "step": 20350 + }, + { + "epoch": 6.09, + "grad_norm": 1.5915690660476685, + "learning_rate": 3.941279220760655e-05, + "loss": 1.3048, + "step": 20355 + }, + { + "epoch": 6.09, + "grad_norm": 2.2365562915802, + "learning_rate": 3.940799124020918e-05, + "loss": 1.1774, + "step": 20360 + }, + { + "epoch": 6.09, + "grad_norm": 2.3418896198272705, + "learning_rate": 3.9403189477073424e-05, + "loss": 1.387, + "step": 20365 + }, + { + "epoch": 6.09, + "grad_norm": 1.1092318296432495, + "learning_rate": 3.939838691846449e-05, + "loss": 1.2785, + "step": 20370 + }, + { + "epoch": 6.1, + "grad_norm": 1.0265756845474243, + "learning_rate": 3.939358356464761e-05, + "loss": 1.139, + "step": 20375 + }, + { + "epoch": 6.1, + "grad_norm": 1.068686842918396, + "learning_rate": 3.9388779415888075e-05, + "loss": 1.1121, + "step": 20380 + }, + { + "epoch": 6.1, + "grad_norm": 1.821524977684021, + "learning_rate": 3.93839744724512e-05, + "loss": 1.008, + "step": 20385 + }, + { + "epoch": 6.1, + "grad_norm": 2.26947021484375, + "learning_rate": 3.937916873460237e-05, + "loss": 1.2466, + "step": 20390 + }, + { + "epoch": 6.1, + "grad_norm": 2.6822071075439453, + "learning_rate": 3.9374362202607e-05, + "loss": 1.0863, + "step": 20395 + }, + { + "epoch": 6.1, + "grad_norm": 2.3898754119873047, + "learning_rate": 3.936955487673054e-05, + "loss": 1.1818, + "step": 20400 + }, + { + "epoch": 6.1, + "grad_norm": 2.3284709453582764, + "learning_rate": 3.9364746757238515e-05, + "loss": 1.2534, + "step": 20405 + }, + { + "epoch": 6.11, + "grad_norm": 2.5323164463043213, + "learning_rate": 3.935993784439644e-05, + "loss": 1.1993, + "step": 20410 + }, + { + "epoch": 6.11, + "grad_norm": 1.2077038288116455, + "learning_rate": 3.935512813846994e-05, + "loss": 1.0831, + "step": 20415 + }, + { + "epoch": 6.11, + "grad_norm": 1.9154731035232544, + "learning_rate": 3.935031763972462e-05, + "loss": 1.1491, + "step": 20420 + }, + { + "epoch": 6.11, + "grad_norm": 1.4190819263458252, + "learning_rate": 3.9345506348426184e-05, + "loss": 1.0991, + "step": 20425 + }, + { + "epoch": 6.11, + "grad_norm": 1.8110177516937256, + "learning_rate": 3.934069426484034e-05, + "loss": 1.3591, + "step": 20430 + }, + { + "epoch": 6.11, + "grad_norm": 2.452357530593872, + "learning_rate": 3.9335881389232854e-05, + "loss": 1.1955, + "step": 20435 + }, + { + "epoch": 6.12, + "grad_norm": 3.6811835765838623, + "learning_rate": 3.9331067721869555e-05, + "loss": 1.042, + "step": 20440 + }, + { + "epoch": 6.12, + "grad_norm": 1.9267451763153076, + "learning_rate": 3.932625326301627e-05, + "loss": 1.0402, + "step": 20445 + }, + { + "epoch": 6.12, + "grad_norm": 1.685020089149475, + "learning_rate": 3.9321438012938906e-05, + "loss": 1.1954, + "step": 20450 + }, + { + "epoch": 6.12, + "grad_norm": 2.5964527130126953, + "learning_rate": 3.931662197190341e-05, + "loss": 1.069, + "step": 20455 + }, + { + "epoch": 6.12, + "grad_norm": 6.929611682891846, + "learning_rate": 3.931180514017576e-05, + "loss": 1.1414, + "step": 20460 + }, + { + "epoch": 6.12, + "grad_norm": 1.4876823425292969, + "learning_rate": 3.9306987518022e-05, + "loss": 1.293, + "step": 20465 + }, + { + "epoch": 6.12, + "grad_norm": 1.7589577436447144, + "learning_rate": 3.930216910570818e-05, + "loss": 1.0805, + "step": 20470 + }, + { + "epoch": 6.13, + "grad_norm": 3.688732862472534, + "learning_rate": 3.929734990350043e-05, + "loss": 1.2528, + "step": 20475 + }, + { + "epoch": 6.13, + "grad_norm": 1.360373854637146, + "learning_rate": 3.929252991166491e-05, + "loss": 1.3237, + "step": 20480 + }, + { + "epoch": 6.13, + "grad_norm": 6.932022571563721, + "learning_rate": 3.928770913046781e-05, + "loss": 1.1989, + "step": 20485 + }, + { + "epoch": 6.13, + "grad_norm": 2.913423538208008, + "learning_rate": 3.928288756017539e-05, + "loss": 1.1123, + "step": 20490 + }, + { + "epoch": 6.13, + "grad_norm": 1.7784669399261475, + "learning_rate": 3.927806520105394e-05, + "loss": 1.1983, + "step": 20495 + }, + { + "epoch": 6.13, + "grad_norm": 1.5603896379470825, + "learning_rate": 3.927324205336979e-05, + "loss": 1.2451, + "step": 20500 + }, + { + "epoch": 6.13, + "grad_norm": 3.020375967025757, + "learning_rate": 3.9268418117389314e-05, + "loss": 1.1557, + "step": 20505 + }, + { + "epoch": 6.14, + "grad_norm": 2.243635416030884, + "learning_rate": 3.926359339337894e-05, + "loss": 1.1213, + "step": 20510 + }, + { + "epoch": 6.14, + "grad_norm": 3.9490649700164795, + "learning_rate": 3.9258767881605126e-05, + "loss": 1.1739, + "step": 20515 + }, + { + "epoch": 6.14, + "grad_norm": 2.237060308456421, + "learning_rate": 3.925394158233438e-05, + "loss": 1.2768, + "step": 20520 + }, + { + "epoch": 6.14, + "grad_norm": 1.9132599830627441, + "learning_rate": 3.924911449583326e-05, + "loss": 1.0558, + "step": 20525 + }, + { + "epoch": 6.14, + "grad_norm": 1.5452789068222046, + "learning_rate": 3.924428662236836e-05, + "loss": 1.3715, + "step": 20530 + }, + { + "epoch": 6.14, + "grad_norm": 4.26951265335083, + "learning_rate": 3.923945796220632e-05, + "loss": 1.2972, + "step": 20535 + }, + { + "epoch": 6.15, + "grad_norm": 8.05988883972168, + "learning_rate": 3.9234628515613806e-05, + "loss": 1.1404, + "step": 20540 + }, + { + "epoch": 6.15, + "grad_norm": 2.191908359527588, + "learning_rate": 3.922979828285757e-05, + "loss": 1.1919, + "step": 20545 + }, + { + "epoch": 6.15, + "grad_norm": 1.6250762939453125, + "learning_rate": 3.922496726420435e-05, + "loss": 1.1798, + "step": 20550 + }, + { + "epoch": 6.15, + "grad_norm": 2.0135927200317383, + "learning_rate": 3.9220135459920984e-05, + "loss": 1.1462, + "step": 20555 + }, + { + "epoch": 6.15, + "grad_norm": 2.279970407485962, + "learning_rate": 3.921530287027431e-05, + "loss": 1.2313, + "step": 20560 + }, + { + "epoch": 6.15, + "grad_norm": 1.4251229763031006, + "learning_rate": 3.921046949553124e-05, + "loss": 1.2208, + "step": 20565 + }, + { + "epoch": 6.15, + "grad_norm": 2.408446788787842, + "learning_rate": 3.920563533595871e-05, + "loss": 0.9747, + "step": 20570 + }, + { + "epoch": 6.16, + "grad_norm": 4.922577381134033, + "learning_rate": 3.9200800391823705e-05, + "loss": 1.1785, + "step": 20575 + }, + { + "epoch": 6.16, + "grad_norm": 1.8073464632034302, + "learning_rate": 3.919596466339326e-05, + "loss": 1.0418, + "step": 20580 + }, + { + "epoch": 6.16, + "grad_norm": 1.2471339702606201, + "learning_rate": 3.9191128150934435e-05, + "loss": 1.1297, + "step": 20585 + }, + { + "epoch": 6.16, + "grad_norm": 3.711719036102295, + "learning_rate": 3.918629085471436e-05, + "loss": 1.1975, + "step": 20590 + }, + { + "epoch": 6.16, + "grad_norm": 4.0079498291015625, + "learning_rate": 3.918145277500018e-05, + "loss": 1.2807, + "step": 20595 + }, + { + "epoch": 6.16, + "grad_norm": 1.5881859064102173, + "learning_rate": 3.9176613912059114e-05, + "loss": 1.1369, + "step": 20600 + }, + { + "epoch": 6.16, + "grad_norm": 1.388296127319336, + "learning_rate": 3.917177426615839e-05, + "loss": 1.1525, + "step": 20605 + }, + { + "epoch": 6.17, + "grad_norm": 1.6135921478271484, + "learning_rate": 3.916790198588648e-05, + "loss": 1.2703, + "step": 20610 + }, + { + "epoch": 6.17, + "grad_norm": 1.2012652158737183, + "learning_rate": 3.916306093133198e-05, + "loss": 1.2268, + "step": 20615 + }, + { + "epoch": 6.17, + "grad_norm": 1.4433081150054932, + "learning_rate": 3.915821909456635e-05, + "loss": 1.2495, + "step": 20620 + }, + { + "epoch": 6.17, + "grad_norm": 1.3392014503479004, + "learning_rate": 3.9153376475856995e-05, + "loss": 1.2917, + "step": 20625 + }, + { + "epoch": 6.17, + "grad_norm": 1.8551833629608154, + "learning_rate": 3.9148533075471364e-05, + "loss": 1.1313, + "step": 20630 + }, + { + "epoch": 6.17, + "grad_norm": 2.474085569381714, + "learning_rate": 3.914368889367697e-05, + "loss": 1.123, + "step": 20635 + }, + { + "epoch": 6.18, + "grad_norm": 1.6884278059005737, + "learning_rate": 3.9138843930741334e-05, + "loss": 1.0056, + "step": 20640 + }, + { + "epoch": 6.18, + "grad_norm": 9.735162734985352, + "learning_rate": 3.9133998186932036e-05, + "loss": 1.1052, + "step": 20645 + }, + { + "epoch": 6.18, + "grad_norm": 3.7487850189208984, + "learning_rate": 3.912915166251672e-05, + "loss": 1.2539, + "step": 20650 + }, + { + "epoch": 6.18, + "grad_norm": 2.0177783966064453, + "learning_rate": 3.912430435776304e-05, + "loss": 1.0794, + "step": 20655 + }, + { + "epoch": 6.18, + "grad_norm": 1.6745036840438843, + "learning_rate": 3.911945627293871e-05, + "loss": 1.1716, + "step": 20660 + }, + { + "epoch": 6.18, + "grad_norm": 3.979721784591675, + "learning_rate": 3.9114607408311486e-05, + "loss": 1.1044, + "step": 20665 + }, + { + "epoch": 6.18, + "grad_norm": 3.073629140853882, + "learning_rate": 3.9109757764149166e-05, + "loss": 1.0468, + "step": 20670 + }, + { + "epoch": 6.19, + "grad_norm": 1.9026330709457397, + "learning_rate": 3.91049073407196e-05, + "loss": 1.1539, + "step": 20675 + }, + { + "epoch": 6.19, + "grad_norm": 2.6179862022399902, + "learning_rate": 3.910005613829065e-05, + "loss": 1.142, + "step": 20680 + }, + { + "epoch": 6.19, + "grad_norm": 1.8899461030960083, + "learning_rate": 3.909520415713027e-05, + "loss": 1.2276, + "step": 20685 + }, + { + "epoch": 6.19, + "grad_norm": 5.7648444175720215, + "learning_rate": 3.909035139750641e-05, + "loss": 1.166, + "step": 20690 + }, + { + "epoch": 6.19, + "grad_norm": 6.039294719696045, + "learning_rate": 3.908549785968708e-05, + "loss": 1.1015, + "step": 20695 + }, + { + "epoch": 6.19, + "grad_norm": 1.34919273853302, + "learning_rate": 3.908064354394035e-05, + "loss": 1.215, + "step": 20700 + }, + { + "epoch": 6.19, + "grad_norm": 2.3863883018493652, + "learning_rate": 3.907578845053432e-05, + "loss": 1.3305, + "step": 20705 + }, + { + "epoch": 6.2, + "grad_norm": 1.9854397773742676, + "learning_rate": 3.907093257973712e-05, + "loss": 1.2554, + "step": 20710 + }, + { + "epoch": 6.2, + "grad_norm": 2.0534327030181885, + "learning_rate": 3.9066075931816934e-05, + "loss": 1.132, + "step": 20715 + }, + { + "epoch": 6.2, + "grad_norm": 4.463151454925537, + "learning_rate": 3.9061218507042e-05, + "loss": 1.1111, + "step": 20720 + }, + { + "epoch": 6.2, + "grad_norm": 1.3392293453216553, + "learning_rate": 3.905636030568058e-05, + "loss": 1.0829, + "step": 20725 + }, + { + "epoch": 6.2, + "grad_norm": 2.5353002548217773, + "learning_rate": 3.905150132800099e-05, + "loss": 1.1461, + "step": 20730 + }, + { + "epoch": 6.2, + "grad_norm": 4.353736400604248, + "learning_rate": 3.90466415742716e-05, + "loss": 1.187, + "step": 20735 + }, + { + "epoch": 6.21, + "grad_norm": 4.582551956176758, + "learning_rate": 3.904178104476078e-05, + "loss": 1.132, + "step": 20740 + }, + { + "epoch": 6.21, + "grad_norm": 1.9312151670455933, + "learning_rate": 3.9036919739737e-05, + "loss": 1.1476, + "step": 20745 + }, + { + "epoch": 6.21, + "grad_norm": 2.270596981048584, + "learning_rate": 3.9032057659468734e-05, + "loss": 1.0766, + "step": 20750 + }, + { + "epoch": 6.21, + "grad_norm": 1.6759518384933472, + "learning_rate": 3.902719480422451e-05, + "loss": 1.0496, + "step": 20755 + }, + { + "epoch": 6.21, + "grad_norm": 1.878208875656128, + "learning_rate": 3.902233117427289e-05, + "loss": 1.3187, + "step": 20760 + }, + { + "epoch": 6.21, + "grad_norm": 1.0941290855407715, + "learning_rate": 3.9017466769882494e-05, + "loss": 1.1848, + "step": 20765 + }, + { + "epoch": 6.21, + "grad_norm": 4.294778347015381, + "learning_rate": 3.901260159132198e-05, + "loss": 1.2239, + "step": 20770 + }, + { + "epoch": 6.22, + "grad_norm": 3.6383235454559326, + "learning_rate": 3.900773563886004e-05, + "loss": 1.1626, + "step": 20775 + }, + { + "epoch": 6.22, + "grad_norm": 2.137620449066162, + "learning_rate": 3.900286891276543e-05, + "loss": 1.273, + "step": 20780 + }, + { + "epoch": 6.22, + "grad_norm": 3.62593150138855, + "learning_rate": 3.8998001413306926e-05, + "loss": 1.1256, + "step": 20785 + }, + { + "epoch": 6.22, + "grad_norm": 3.164325475692749, + "learning_rate": 3.899313314075335e-05, + "loss": 1.1586, + "step": 20790 + }, + { + "epoch": 6.22, + "grad_norm": 3.6429853439331055, + "learning_rate": 3.898826409537358e-05, + "loss": 1.1242, + "step": 20795 + }, + { + "epoch": 6.22, + "grad_norm": 2.646456241607666, + "learning_rate": 3.898339427743652e-05, + "loss": 1.2348, + "step": 20800 + }, + { + "epoch": 6.22, + "grad_norm": 2.920931100845337, + "learning_rate": 3.897852368721113e-05, + "loss": 1.1977, + "step": 20805 + }, + { + "epoch": 6.23, + "grad_norm": 3.684735059738159, + "learning_rate": 3.8973652324966404e-05, + "loss": 1.2288, + "step": 20810 + }, + { + "epoch": 6.23, + "grad_norm": 2.1889755725860596, + "learning_rate": 3.896878019097139e-05, + "loss": 1.1471, + "step": 20815 + }, + { + "epoch": 6.23, + "grad_norm": 3.232788324356079, + "learning_rate": 3.896390728549516e-05, + "loss": 1.1925, + "step": 20820 + }, + { + "epoch": 6.23, + "grad_norm": 1.4268032312393188, + "learning_rate": 3.895903360880685e-05, + "loss": 1.0222, + "step": 20825 + }, + { + "epoch": 6.23, + "grad_norm": 2.271416187286377, + "learning_rate": 3.895415916117562e-05, + "loss": 1.0918, + "step": 20830 + }, + { + "epoch": 6.23, + "grad_norm": 4.371371269226074, + "learning_rate": 3.894928394287068e-05, + "loss": 1.2376, + "step": 20835 + }, + { + "epoch": 6.24, + "grad_norm": 1.4696122407913208, + "learning_rate": 3.89444079541613e-05, + "loss": 1.3061, + "step": 20840 + }, + { + "epoch": 6.24, + "grad_norm": 2.775634527206421, + "learning_rate": 3.893953119531676e-05, + "loss": 1.069, + "step": 20845 + }, + { + "epoch": 6.24, + "grad_norm": 1.066505789756775, + "learning_rate": 3.893465366660639e-05, + "loss": 1.2153, + "step": 20850 + }, + { + "epoch": 6.24, + "grad_norm": 1.7460033893585205, + "learning_rate": 3.8929775368299595e-05, + "loss": 1.0575, + "step": 20855 + }, + { + "epoch": 6.24, + "grad_norm": 1.4495829343795776, + "learning_rate": 3.892489630066578e-05, + "loss": 1.1096, + "step": 20860 + }, + { + "epoch": 6.24, + "grad_norm": 2.495727062225342, + "learning_rate": 3.892001646397441e-05, + "loss": 1.1207, + "step": 20865 + }, + { + "epoch": 6.24, + "grad_norm": 2.2231380939483643, + "learning_rate": 3.891513585849501e-05, + "loss": 1.1967, + "step": 20870 + }, + { + "epoch": 6.25, + "grad_norm": 7.131553649902344, + "learning_rate": 3.891025448449711e-05, + "loss": 1.1515, + "step": 20875 + }, + { + "epoch": 6.25, + "grad_norm": 1.6254703998565674, + "learning_rate": 3.890537234225033e-05, + "loss": 1.2101, + "step": 20880 + }, + { + "epoch": 6.25, + "grad_norm": 2.034390687942505, + "learning_rate": 3.890048943202428e-05, + "loss": 1.0613, + "step": 20885 + }, + { + "epoch": 6.25, + "grad_norm": 2.165940999984741, + "learning_rate": 3.8895605754088646e-05, + "loss": 1.0818, + "step": 20890 + }, + { + "epoch": 6.25, + "grad_norm": 1.1313718557357788, + "learning_rate": 3.889072130871315e-05, + "loss": 1.0127, + "step": 20895 + }, + { + "epoch": 6.25, + "grad_norm": 3.206895589828491, + "learning_rate": 3.888583609616755e-05, + "loss": 1.2237, + "step": 20900 + }, + { + "epoch": 6.25, + "grad_norm": 4.657969951629639, + "learning_rate": 3.888095011672167e-05, + "loss": 1.1528, + "step": 20905 + }, + { + "epoch": 6.26, + "grad_norm": 1.6976109743118286, + "learning_rate": 3.887606337064534e-05, + "loss": 1.1705, + "step": 20910 + }, + { + "epoch": 6.26, + "grad_norm": 1.7403982877731323, + "learning_rate": 3.887117585820844e-05, + "loss": 1.1797, + "step": 20915 + }, + { + "epoch": 6.26, + "grad_norm": 3.2801010608673096, + "learning_rate": 3.8866287579680925e-05, + "loss": 1.1335, + "step": 20920 + }, + { + "epoch": 6.26, + "grad_norm": 2.5484225749969482, + "learning_rate": 3.886139853533276e-05, + "loss": 1.2226, + "step": 20925 + }, + { + "epoch": 6.26, + "grad_norm": 2.7119576930999756, + "learning_rate": 3.8856508725433966e-05, + "loss": 1.3315, + "step": 20930 + }, + { + "epoch": 6.26, + "grad_norm": 4.890354156494141, + "learning_rate": 3.8851618150254594e-05, + "loss": 1.1151, + "step": 20935 + }, + { + "epoch": 6.27, + "grad_norm": 2.2012619972229004, + "learning_rate": 3.884672681006475e-05, + "loss": 1.1209, + "step": 20940 + }, + { + "epoch": 6.27, + "grad_norm": 2.108077049255371, + "learning_rate": 3.884183470513457e-05, + "loss": 1.2317, + "step": 20945 + }, + { + "epoch": 6.27, + "grad_norm": 2.5535576343536377, + "learning_rate": 3.883694183573426e-05, + "loss": 1.253, + "step": 20950 + }, + { + "epoch": 6.27, + "grad_norm": 3.921041965484619, + "learning_rate": 3.883204820213403e-05, + "loss": 1.2734, + "step": 20955 + }, + { + "epoch": 6.27, + "grad_norm": 2.2731990814208984, + "learning_rate": 3.882715380460416e-05, + "loss": 1.1832, + "step": 20960 + }, + { + "epoch": 6.27, + "grad_norm": 2.7740683555603027, + "learning_rate": 3.882225864341494e-05, + "loss": 1.2319, + "step": 20965 + }, + { + "epoch": 6.27, + "grad_norm": 1.0402427911758423, + "learning_rate": 3.8817362718836755e-05, + "loss": 1.3091, + "step": 20970 + }, + { + "epoch": 6.28, + "grad_norm": 3.553283214569092, + "learning_rate": 3.8812466031139996e-05, + "loss": 1.0222, + "step": 20975 + }, + { + "epoch": 6.28, + "grad_norm": 2.163384437561035, + "learning_rate": 3.8807568580595085e-05, + "loss": 1.1629, + "step": 20980 + }, + { + "epoch": 6.28, + "grad_norm": 5.054582118988037, + "learning_rate": 3.8802670367472517e-05, + "loss": 1.1109, + "step": 20985 + }, + { + "epoch": 6.28, + "grad_norm": 3.024733543395996, + "learning_rate": 3.879777139204281e-05, + "loss": 1.1245, + "step": 20990 + }, + { + "epoch": 6.28, + "grad_norm": 2.468595027923584, + "learning_rate": 3.879287165457654e-05, + "loss": 1.3344, + "step": 20995 + }, + { + "epoch": 6.28, + "grad_norm": 5.894863605499268, + "learning_rate": 3.878797115534429e-05, + "loss": 1.3523, + "step": 21000 + }, + { + "epoch": 6.28, + "grad_norm": 1.7448843717575073, + "learning_rate": 3.878306989461673e-05, + "loss": 1.1512, + "step": 21005 + }, + { + "epoch": 6.29, + "grad_norm": 2.0726964473724365, + "learning_rate": 3.8778167872664554e-05, + "loss": 1.1512, + "step": 21010 + }, + { + "epoch": 6.29, + "grad_norm": 2.647890090942383, + "learning_rate": 3.8773265089758483e-05, + "loss": 1.1089, + "step": 21015 + }, + { + "epoch": 6.29, + "grad_norm": 2.005052328109741, + "learning_rate": 3.87683615461693e-05, + "loss": 1.2923, + "step": 21020 + }, + { + "epoch": 6.29, + "grad_norm": 2.600606918334961, + "learning_rate": 3.8763457242167816e-05, + "loss": 1.1266, + "step": 21025 + }, + { + "epoch": 6.29, + "grad_norm": 1.3668138980865479, + "learning_rate": 3.87585521780249e-05, + "loss": 1.1652, + "step": 21030 + }, + { + "epoch": 6.29, + "grad_norm": 2.5960161685943604, + "learning_rate": 3.8753646354011444e-05, + "loss": 1.1451, + "step": 21035 + }, + { + "epoch": 6.29, + "grad_norm": 2.091399669647217, + "learning_rate": 3.87487397703984e-05, + "loss": 1.0908, + "step": 21040 + }, + { + "epoch": 6.3, + "grad_norm": 1.7568382024765015, + "learning_rate": 3.8743832427456736e-05, + "loss": 1.1526, + "step": 21045 + }, + { + "epoch": 6.3, + "grad_norm": 1.715632677078247, + "learning_rate": 3.873892432545751e-05, + "loss": 1.2262, + "step": 21050 + }, + { + "epoch": 6.3, + "grad_norm": 2.9848005771636963, + "learning_rate": 3.873401546467177e-05, + "loss": 1.2044, + "step": 21055 + }, + { + "epoch": 6.3, + "grad_norm": 1.247778058052063, + "learning_rate": 3.872910584537063e-05, + "loss": 1.1523, + "step": 21060 + }, + { + "epoch": 6.3, + "grad_norm": 3.150620460510254, + "learning_rate": 3.872419546782524e-05, + "loss": 1.0884, + "step": 21065 + }, + { + "epoch": 6.3, + "grad_norm": 1.8623440265655518, + "learning_rate": 3.8719284332306804e-05, + "loss": 1.2578, + "step": 21070 + }, + { + "epoch": 6.31, + "grad_norm": 1.1926594972610474, + "learning_rate": 3.871437243908655e-05, + "loss": 1.1856, + "step": 21075 + }, + { + "epoch": 6.31, + "grad_norm": 2.9586501121520996, + "learning_rate": 3.870945978843578e-05, + "loss": 1.0101, + "step": 21080 + }, + { + "epoch": 6.31, + "grad_norm": 1.617187261581421, + "learning_rate": 3.8704546380625776e-05, + "loss": 1.1834, + "step": 21085 + }, + { + "epoch": 6.31, + "grad_norm": 1.3432159423828125, + "learning_rate": 3.869963221592793e-05, + "loss": 1.2137, + "step": 21090 + }, + { + "epoch": 6.31, + "grad_norm": 1.8698570728302002, + "learning_rate": 3.8694717294613625e-05, + "loss": 1.2116, + "step": 21095 + }, + { + "epoch": 6.31, + "grad_norm": 2.4080424308776855, + "learning_rate": 3.868980161695433e-05, + "loss": 1.2621, + "step": 21100 + }, + { + "epoch": 6.31, + "grad_norm": 4.020505905151367, + "learning_rate": 3.868488518322152e-05, + "loss": 1.106, + "step": 21105 + }, + { + "epoch": 6.32, + "grad_norm": 1.8761104345321655, + "learning_rate": 3.8679967993686726e-05, + "loss": 1.1593, + "step": 21110 + }, + { + "epoch": 6.32, + "grad_norm": 1.0828648805618286, + "learning_rate": 3.867505004862152e-05, + "loss": 1.1695, + "step": 21115 + }, + { + "epoch": 6.32, + "grad_norm": 2.3236336708068848, + "learning_rate": 3.8670131348297514e-05, + "loss": 1.2771, + "step": 21120 + }, + { + "epoch": 6.32, + "grad_norm": 2.3681468963623047, + "learning_rate": 3.8665211892986355e-05, + "loss": 1.3653, + "step": 21125 + }, + { + "epoch": 6.32, + "grad_norm": 3.011075973510742, + "learning_rate": 3.8660291682959753e-05, + "loss": 1.1936, + "step": 21130 + }, + { + "epoch": 6.32, + "grad_norm": 1.638193964958191, + "learning_rate": 3.865537071848944e-05, + "loss": 1.2263, + "step": 21135 + }, + { + "epoch": 6.32, + "grad_norm": 0.8209080696105957, + "learning_rate": 3.86504489998472e-05, + "loss": 1.1663, + "step": 21140 + }, + { + "epoch": 6.33, + "grad_norm": 2.627833843231201, + "learning_rate": 3.864552652730485e-05, + "loss": 1.0747, + "step": 21145 + }, + { + "epoch": 6.33, + "grad_norm": 1.2440749406814575, + "learning_rate": 3.8640603301134245e-05, + "loss": 1.211, + "step": 21150 + }, + { + "epoch": 6.33, + "grad_norm": 1.7105375528335571, + "learning_rate": 3.863567932160731e-05, + "loss": 1.177, + "step": 21155 + }, + { + "epoch": 6.33, + "grad_norm": 1.2131013870239258, + "learning_rate": 3.863075458899598e-05, + "loss": 1.0843, + "step": 21160 + }, + { + "epoch": 6.33, + "grad_norm": 5.162259101867676, + "learning_rate": 3.862582910357223e-05, + "loss": 1.2929, + "step": 21165 + }, + { + "epoch": 6.33, + "grad_norm": 3.677018165588379, + "learning_rate": 3.862090286560811e-05, + "loss": 1.0206, + "step": 21170 + }, + { + "epoch": 6.34, + "grad_norm": 1.5079269409179688, + "learning_rate": 3.861597587537568e-05, + "loss": 1.2481, + "step": 21175 + }, + { + "epoch": 6.34, + "grad_norm": 2.0663700103759766, + "learning_rate": 3.861104813314705e-05, + "loss": 1.0672, + "step": 21180 + }, + { + "epoch": 6.34, + "grad_norm": 2.495488166809082, + "learning_rate": 3.8606119639194394e-05, + "loss": 1.1044, + "step": 21185 + }, + { + "epoch": 6.34, + "grad_norm": 2.68648099899292, + "learning_rate": 3.8601190393789885e-05, + "loss": 1.1525, + "step": 21190 + }, + { + "epoch": 6.34, + "grad_norm": 1.7816789150238037, + "learning_rate": 3.8596260397205766e-05, + "loss": 1.2182, + "step": 21195 + }, + { + "epoch": 6.34, + "grad_norm": 1.921111822128296, + "learning_rate": 3.859132964971432e-05, + "loss": 1.1855, + "step": 21200 + }, + { + "epoch": 6.34, + "grad_norm": 0.9423299431800842, + "learning_rate": 3.8586398151587864e-05, + "loss": 1.2339, + "step": 21205 + }, + { + "epoch": 6.35, + "grad_norm": 1.9230526685714722, + "learning_rate": 3.858146590309877e-05, + "loss": 1.1414, + "step": 21210 + }, + { + "epoch": 6.35, + "grad_norm": 1.7740641832351685, + "learning_rate": 3.857653290451941e-05, + "loss": 1.3103, + "step": 21215 + }, + { + "epoch": 6.35, + "grad_norm": 2.815051555633545, + "learning_rate": 3.857159915612227e-05, + "loss": 1.3497, + "step": 21220 + }, + { + "epoch": 6.35, + "grad_norm": 2.79278302192688, + "learning_rate": 3.856666465817981e-05, + "loss": 1.129, + "step": 21225 + }, + { + "epoch": 6.35, + "grad_norm": 1.6344680786132812, + "learning_rate": 3.8561729410964556e-05, + "loss": 1.1296, + "step": 21230 + }, + { + "epoch": 6.35, + "grad_norm": 2.4964382648468018, + "learning_rate": 3.855679341474909e-05, + "loss": 1.2656, + "step": 21235 + }, + { + "epoch": 6.35, + "grad_norm": 2.5263328552246094, + "learning_rate": 3.855185666980602e-05, + "loss": 1.3019, + "step": 21240 + }, + { + "epoch": 6.36, + "grad_norm": 3.0418660640716553, + "learning_rate": 3.854691917640798e-05, + "loss": 1.2104, + "step": 21245 + }, + { + "epoch": 6.36, + "grad_norm": 2.407768726348877, + "learning_rate": 3.854198093482768e-05, + "loss": 1.0718, + "step": 21250 + }, + { + "epoch": 6.36, + "grad_norm": 3.624297857284546, + "learning_rate": 3.853704194533785e-05, + "loss": 1.2319, + "step": 21255 + }, + { + "epoch": 6.36, + "grad_norm": 3.435147285461426, + "learning_rate": 3.8532102208211265e-05, + "loss": 1.1739, + "step": 21260 + }, + { + "epoch": 6.36, + "grad_norm": 2.9471611976623535, + "learning_rate": 3.852716172372074e-05, + "loss": 1.1199, + "step": 21265 + }, + { + "epoch": 6.36, + "grad_norm": 3.1769185066223145, + "learning_rate": 3.8522220492139136e-05, + "loss": 1.1471, + "step": 21270 + }, + { + "epoch": 6.37, + "grad_norm": 3.634047269821167, + "learning_rate": 3.8517278513739345e-05, + "loss": 1.1882, + "step": 21275 + }, + { + "epoch": 6.37, + "grad_norm": 3.191857099533081, + "learning_rate": 3.851233578879432e-05, + "loss": 1.1523, + "step": 21280 + }, + { + "epoch": 6.37, + "grad_norm": 3.0446479320526123, + "learning_rate": 3.850739231757702e-05, + "loss": 1.1287, + "step": 21285 + }, + { + "epoch": 6.37, + "grad_norm": 1.9125072956085205, + "learning_rate": 3.850244810036049e-05, + "loss": 1.0729, + "step": 21290 + }, + { + "epoch": 6.37, + "grad_norm": 6.34651517868042, + "learning_rate": 3.849750313741779e-05, + "loss": 1.1506, + "step": 21295 + }, + { + "epoch": 6.37, + "grad_norm": 1.6267149448394775, + "learning_rate": 3.8492557429022026e-05, + "loss": 1.0437, + "step": 21300 + }, + { + "epoch": 6.37, + "grad_norm": 2.244823694229126, + "learning_rate": 3.848761097544633e-05, + "loss": 1.1806, + "step": 21305 + }, + { + "epoch": 6.38, + "grad_norm": 1.7018314599990845, + "learning_rate": 3.8482663776963904e-05, + "loss": 1.1546, + "step": 21310 + }, + { + "epoch": 6.38, + "grad_norm": 1.9691760540008545, + "learning_rate": 3.847771583384797e-05, + "loss": 1.089, + "step": 21315 + }, + { + "epoch": 6.38, + "grad_norm": 2.011826992034912, + "learning_rate": 3.8472767146371805e-05, + "loss": 1.0998, + "step": 21320 + }, + { + "epoch": 6.38, + "grad_norm": 1.590854525566101, + "learning_rate": 3.846781771480871e-05, + "loss": 1.2689, + "step": 21325 + }, + { + "epoch": 6.38, + "grad_norm": 3.3987576961517334, + "learning_rate": 3.846286753943205e-05, + "loss": 1.1582, + "step": 21330 + }, + { + "epoch": 6.38, + "grad_norm": 1.704407811164856, + "learning_rate": 3.84579166205152e-05, + "loss": 1.234, + "step": 21335 + }, + { + "epoch": 6.38, + "grad_norm": 1.0222856998443604, + "learning_rate": 3.845296495833161e-05, + "loss": 1.2021, + "step": 21340 + }, + { + "epoch": 6.39, + "grad_norm": 2.390643835067749, + "learning_rate": 3.844801255315474e-05, + "loss": 1.3623, + "step": 21345 + }, + { + "epoch": 6.39, + "grad_norm": 1.4073227643966675, + "learning_rate": 3.844305940525812e-05, + "loss": 1.2204, + "step": 21350 + }, + { + "epoch": 6.39, + "grad_norm": 3.146881341934204, + "learning_rate": 3.84381055149153e-05, + "loss": 1.3272, + "step": 21355 + }, + { + "epoch": 6.39, + "grad_norm": 1.6763017177581787, + "learning_rate": 3.843315088239988e-05, + "loss": 1.1796, + "step": 21360 + }, + { + "epoch": 6.39, + "grad_norm": 2.7298102378845215, + "learning_rate": 3.8428195507985505e-05, + "loss": 1.1095, + "step": 21365 + }, + { + "epoch": 6.39, + "grad_norm": 2.3021061420440674, + "learning_rate": 3.842323939194584e-05, + "loss": 1.0336, + "step": 21370 + }, + { + "epoch": 6.4, + "grad_norm": 2.4729456901550293, + "learning_rate": 3.841828253455463e-05, + "loss": 1.3022, + "step": 21375 + }, + { + "epoch": 6.4, + "grad_norm": 2.751861572265625, + "learning_rate": 3.841332493608561e-05, + "loss": 1.3114, + "step": 21380 + }, + { + "epoch": 6.4, + "grad_norm": 3.295260190963745, + "learning_rate": 3.840836659681261e-05, + "loss": 1.1114, + "step": 21385 + }, + { + "epoch": 6.4, + "grad_norm": 4.862136363983154, + "learning_rate": 3.840340751700945e-05, + "loss": 1.1214, + "step": 21390 + }, + { + "epoch": 6.4, + "grad_norm": 1.8735803365707397, + "learning_rate": 3.8398447696950036e-05, + "loss": 1.1672, + "step": 21395 + }, + { + "epoch": 6.4, + "grad_norm": 1.558856725692749, + "learning_rate": 3.839348713690827e-05, + "loss": 1.2898, + "step": 21400 + }, + { + "epoch": 6.4, + "grad_norm": 7.87240743637085, + "learning_rate": 3.838852583715814e-05, + "loss": 1.1136, + "step": 21405 + }, + { + "epoch": 6.41, + "grad_norm": 3.504246473312378, + "learning_rate": 3.8383563797973634e-05, + "loss": 1.0798, + "step": 21410 + }, + { + "epoch": 6.41, + "grad_norm": 1.1940808296203613, + "learning_rate": 3.837860101962882e-05, + "loss": 1.1356, + "step": 21415 + }, + { + "epoch": 6.41, + "grad_norm": 1.6067959070205688, + "learning_rate": 3.8373637502397775e-05, + "loss": 1.1774, + "step": 21420 + }, + { + "epoch": 6.41, + "grad_norm": 5.702824115753174, + "learning_rate": 3.836867324655463e-05, + "loss": 1.1839, + "step": 21425 + }, + { + "epoch": 6.41, + "grad_norm": 1.5713484287261963, + "learning_rate": 3.8363708252373563e-05, + "loss": 1.1561, + "step": 21430 + }, + { + "epoch": 6.41, + "grad_norm": 2.0264546871185303, + "learning_rate": 3.835874252012878e-05, + "loss": 1.2943, + "step": 21435 + }, + { + "epoch": 6.41, + "grad_norm": 1.9369525909423828, + "learning_rate": 3.8353776050094524e-05, + "loss": 1.0951, + "step": 21440 + }, + { + "epoch": 6.42, + "grad_norm": 1.6942447423934937, + "learning_rate": 3.8348808842545106e-05, + "loss": 1.0946, + "step": 21445 + }, + { + "epoch": 6.42, + "grad_norm": 1.3976869583129883, + "learning_rate": 3.8343840897754845e-05, + "loss": 1.2452, + "step": 21450 + }, + { + "epoch": 6.42, + "grad_norm": 3.386245012283325, + "learning_rate": 3.833887221599812e-05, + "loss": 1.3216, + "step": 21455 + }, + { + "epoch": 6.42, + "grad_norm": 1.7361987829208374, + "learning_rate": 3.833390279754935e-05, + "loss": 1.2033, + "step": 21460 + }, + { + "epoch": 6.42, + "grad_norm": 4.732967853546143, + "learning_rate": 3.832893264268299e-05, + "loss": 1.202, + "step": 21465 + }, + { + "epoch": 6.42, + "grad_norm": 11.390531539916992, + "learning_rate": 3.8323961751673545e-05, + "loss": 1.1177, + "step": 21470 + }, + { + "epoch": 6.43, + "grad_norm": 1.5871869325637817, + "learning_rate": 3.831899012479553e-05, + "loss": 1.1747, + "step": 21475 + }, + { + "epoch": 6.43, + "grad_norm": 2.522251605987549, + "learning_rate": 3.8314017762323526e-05, + "loss": 1.0981, + "step": 21480 + }, + { + "epoch": 6.43, + "grad_norm": 2.3659377098083496, + "learning_rate": 3.830904466453218e-05, + "loss": 1.1489, + "step": 21485 + }, + { + "epoch": 6.43, + "grad_norm": 2.8032987117767334, + "learning_rate": 3.830407083169612e-05, + "loss": 1.1038, + "step": 21490 + }, + { + "epoch": 6.43, + "grad_norm": 3.808497428894043, + "learning_rate": 3.829909626409006e-05, + "loss": 1.1027, + "step": 21495 + }, + { + "epoch": 6.43, + "grad_norm": 2.006352663040161, + "learning_rate": 3.829412096198873e-05, + "loss": 1.3041, + "step": 21500 + }, + { + "epoch": 6.43, + "grad_norm": 2.3574745655059814, + "learning_rate": 3.828914492566693e-05, + "loss": 1.1786, + "step": 21505 + }, + { + "epoch": 6.44, + "grad_norm": 2.2047548294067383, + "learning_rate": 3.828416815539946e-05, + "loss": 1.2032, + "step": 21510 + }, + { + "epoch": 6.44, + "grad_norm": 3.6130099296569824, + "learning_rate": 3.8279190651461195e-05, + "loss": 1.0937, + "step": 21515 + }, + { + "epoch": 6.44, + "grad_norm": 2.250490427017212, + "learning_rate": 3.827421241412703e-05, + "loss": 1.3764, + "step": 21520 + }, + { + "epoch": 6.44, + "grad_norm": 2.148017168045044, + "learning_rate": 3.8269233443671914e-05, + "loss": 1.2094, + "step": 21525 + }, + { + "epoch": 6.44, + "grad_norm": 3.4997782707214355, + "learning_rate": 3.826425374037083e-05, + "loss": 1.1462, + "step": 21530 + }, + { + "epoch": 6.44, + "grad_norm": 1.9769136905670166, + "learning_rate": 3.825927330449879e-05, + "loss": 1.0678, + "step": 21535 + }, + { + "epoch": 6.44, + "grad_norm": 1.7285351753234863, + "learning_rate": 3.825429213633087e-05, + "loss": 1.3424, + "step": 21540 + }, + { + "epoch": 6.45, + "grad_norm": 2.1386027336120605, + "learning_rate": 3.8249310236142175e-05, + "loss": 1.0702, + "step": 21545 + }, + { + "epoch": 6.45, + "grad_norm": 4.701780319213867, + "learning_rate": 3.8244327604207856e-05, + "loss": 1.2914, + "step": 21550 + }, + { + "epoch": 6.45, + "grad_norm": 0.979941725730896, + "learning_rate": 3.8239344240803077e-05, + "loss": 1.313, + "step": 21555 + }, + { + "epoch": 6.45, + "grad_norm": 2.4116852283477783, + "learning_rate": 3.823436014620308e-05, + "loss": 1.1429, + "step": 21560 + }, + { + "epoch": 6.45, + "grad_norm": 2.062809705734253, + "learning_rate": 3.822937532068314e-05, + "loss": 1.064, + "step": 21565 + }, + { + "epoch": 6.45, + "grad_norm": 0.9960473775863647, + "learning_rate": 3.822438976451854e-05, + "loss": 1.3403, + "step": 21570 + }, + { + "epoch": 6.45, + "grad_norm": 0.9557723999023438, + "learning_rate": 3.821940347798464e-05, + "loss": 1.2886, + "step": 21575 + }, + { + "epoch": 6.46, + "grad_norm": 1.1672139167785645, + "learning_rate": 3.8214416461356825e-05, + "loss": 1.0831, + "step": 21580 + }, + { + "epoch": 6.46, + "grad_norm": 1.0356839895248413, + "learning_rate": 3.8209428714910536e-05, + "loss": 1.2411, + "step": 21585 + }, + { + "epoch": 6.46, + "grad_norm": 2.4075944423675537, + "learning_rate": 3.820444023892122e-05, + "loss": 1.3309, + "step": 21590 + }, + { + "epoch": 6.46, + "grad_norm": 3.5480706691741943, + "learning_rate": 3.8199451033664395e-05, + "loss": 1.214, + "step": 21595 + }, + { + "epoch": 6.46, + "grad_norm": 14.751591682434082, + "learning_rate": 3.8194461099415615e-05, + "loss": 1.1567, + "step": 21600 + }, + { + "epoch": 6.46, + "grad_norm": 2.6197705268859863, + "learning_rate": 3.818947043645046e-05, + "loss": 1.3019, + "step": 21605 + }, + { + "epoch": 6.47, + "grad_norm": 3.7465360164642334, + "learning_rate": 3.818447904504456e-05, + "loss": 1.0903, + "step": 21610 + }, + { + "epoch": 6.47, + "grad_norm": 1.5686691999435425, + "learning_rate": 3.817948692547358e-05, + "loss": 1.1012, + "step": 21615 + }, + { + "epoch": 6.47, + "grad_norm": 1.792001485824585, + "learning_rate": 3.8174494078013254e-05, + "loss": 1.2688, + "step": 21620 + }, + { + "epoch": 6.47, + "grad_norm": 1.9300975799560547, + "learning_rate": 3.8169500502939305e-05, + "loss": 1.0274, + "step": 21625 + }, + { + "epoch": 6.47, + "grad_norm": 2.6718950271606445, + "learning_rate": 3.816450620052754e-05, + "loss": 1.2466, + "step": 21630 + }, + { + "epoch": 6.47, + "grad_norm": 1.7628939151763916, + "learning_rate": 3.8159511171053767e-05, + "loss": 1.1826, + "step": 21635 + }, + { + "epoch": 6.47, + "grad_norm": 3.9780194759368896, + "learning_rate": 3.8154515414793874e-05, + "loss": 1.0958, + "step": 21640 + }, + { + "epoch": 6.48, + "grad_norm": 1.881233811378479, + "learning_rate": 3.814951893202377e-05, + "loss": 1.0608, + "step": 21645 + }, + { + "epoch": 6.48, + "grad_norm": 2.578082323074341, + "learning_rate": 3.814452172301941e-05, + "loss": 1.1508, + "step": 21650 + }, + { + "epoch": 6.48, + "grad_norm": 4.1634721755981445, + "learning_rate": 3.813952378805677e-05, + "loss": 1.1515, + "step": 21655 + }, + { + "epoch": 6.48, + "grad_norm": 2.9157748222351074, + "learning_rate": 3.8134525127411896e-05, + "loss": 1.2368, + "step": 21660 + }, + { + "epoch": 6.48, + "grad_norm": 4.483683109283447, + "learning_rate": 3.812952574136085e-05, + "loss": 1.0416, + "step": 21665 + }, + { + "epoch": 6.48, + "grad_norm": 2.9984755516052246, + "learning_rate": 3.812452563017974e-05, + "loss": 1.0572, + "step": 21670 + }, + { + "epoch": 6.48, + "grad_norm": 1.295201301574707, + "learning_rate": 3.8119524794144724e-05, + "loss": 1.0067, + "step": 21675 + }, + { + "epoch": 6.49, + "grad_norm": 2.1043081283569336, + "learning_rate": 3.811452323353199e-05, + "loss": 1.2859, + "step": 21680 + }, + { + "epoch": 6.49, + "grad_norm": 1.3787553310394287, + "learning_rate": 3.810952094861777e-05, + "loss": 1.1761, + "step": 21685 + }, + { + "epoch": 6.49, + "grad_norm": 1.4187425374984741, + "learning_rate": 3.810451793967834e-05, + "loss": 1.2269, + "step": 21690 + }, + { + "epoch": 6.49, + "grad_norm": 4.034739017486572, + "learning_rate": 3.809951420699e-05, + "loss": 1.0818, + "step": 21695 + }, + { + "epoch": 6.49, + "grad_norm": 1.0711634159088135, + "learning_rate": 3.809450975082911e-05, + "loss": 1.1358, + "step": 21700 + }, + { + "epoch": 6.49, + "grad_norm": 7.229355335235596, + "learning_rate": 3.808950457147205e-05, + "loss": 1.3021, + "step": 21705 + }, + { + "epoch": 6.5, + "grad_norm": 1.897261619567871, + "learning_rate": 3.808449866919527e-05, + "loss": 1.2492, + "step": 21710 + }, + { + "epoch": 6.5, + "grad_norm": 2.2863614559173584, + "learning_rate": 3.807949204427522e-05, + "loss": 1.0721, + "step": 21715 + }, + { + "epoch": 6.5, + "grad_norm": 2.1453750133514404, + "learning_rate": 3.807448469698842e-05, + "loss": 1.1794, + "step": 21720 + }, + { + "epoch": 6.5, + "grad_norm": 0.794268012046814, + "learning_rate": 3.806947662761142e-05, + "loss": 1.3931, + "step": 21725 + }, + { + "epoch": 6.5, + "grad_norm": 1.1213440895080566, + "learning_rate": 3.8064467836420815e-05, + "loss": 1.2736, + "step": 21730 + }, + { + "epoch": 6.5, + "grad_norm": 4.008541584014893, + "learning_rate": 3.805945832369322e-05, + "loss": 1.247, + "step": 21735 + }, + { + "epoch": 6.5, + "grad_norm": 1.5506445169448853, + "learning_rate": 3.805444808970533e-05, + "loss": 1.024, + "step": 21740 + }, + { + "epoch": 6.51, + "grad_norm": 1.3730429410934448, + "learning_rate": 3.8049437134733834e-05, + "loss": 1.2982, + "step": 21745 + }, + { + "epoch": 6.51, + "grad_norm": 1.3284574747085571, + "learning_rate": 3.80444254590555e-05, + "loss": 1.2488, + "step": 21750 + }, + { + "epoch": 6.51, + "grad_norm": 3.1834280490875244, + "learning_rate": 3.80394130629471e-05, + "loss": 1.1852, + "step": 21755 + }, + { + "epoch": 6.51, + "grad_norm": 1.5427137613296509, + "learning_rate": 3.8034399946685466e-05, + "loss": 1.2332, + "step": 21760 + }, + { + "epoch": 6.51, + "grad_norm": 2.425147533416748, + "learning_rate": 3.802938611054747e-05, + "loss": 1.1036, + "step": 21765 + }, + { + "epoch": 6.51, + "grad_norm": 16.511322021484375, + "learning_rate": 3.802437155481003e-05, + "loss": 1.0699, + "step": 21770 + }, + { + "epoch": 6.51, + "grad_norm": 1.3702104091644287, + "learning_rate": 3.801935627975008e-05, + "loss": 1.2717, + "step": 21775 + }, + { + "epoch": 6.52, + "grad_norm": 2.96932315826416, + "learning_rate": 3.8014340285644625e-05, + "loss": 1.2624, + "step": 21780 + }, + { + "epoch": 6.52, + "grad_norm": 1.8278169631958008, + "learning_rate": 3.8009323572770684e-05, + "loss": 1.1947, + "step": 21785 + }, + { + "epoch": 6.52, + "grad_norm": 1.3895366191864014, + "learning_rate": 3.8004306141405314e-05, + "loss": 1.1731, + "step": 21790 + }, + { + "epoch": 6.52, + "grad_norm": 2.925675868988037, + "learning_rate": 3.799928799182564e-05, + "loss": 1.22, + "step": 21795 + }, + { + "epoch": 6.52, + "grad_norm": 1.1851224899291992, + "learning_rate": 3.79942691243088e-05, + "loss": 1.0118, + "step": 21800 + }, + { + "epoch": 6.52, + "grad_norm": 2.040398120880127, + "learning_rate": 3.7989249539131984e-05, + "loss": 1.0473, + "step": 21805 + }, + { + "epoch": 6.53, + "grad_norm": 3.938803195953369, + "learning_rate": 3.798422923657241e-05, + "loss": 1.0397, + "step": 21810 + }, + { + "epoch": 6.53, + "grad_norm": 2.930795907974243, + "learning_rate": 3.797920821690736e-05, + "loss": 1.1349, + "step": 21815 + }, + { + "epoch": 6.53, + "grad_norm": 2.4224655628204346, + "learning_rate": 3.797418648041413e-05, + "loss": 1.2474, + "step": 21820 + }, + { + "epoch": 6.53, + "grad_norm": 2.1420652866363525, + "learning_rate": 3.796916402737007e-05, + "loss": 1.1601, + "step": 21825 + }, + { + "epoch": 6.53, + "grad_norm": 2.40444016456604, + "learning_rate": 3.7964140858052555e-05, + "loss": 1.252, + "step": 21830 + }, + { + "epoch": 6.53, + "grad_norm": 3.1540682315826416, + "learning_rate": 3.795911697273902e-05, + "loss": 1.2526, + "step": 21835 + }, + { + "epoch": 6.53, + "grad_norm": 1.9476728439331055, + "learning_rate": 3.7954092371706925e-05, + "loss": 1.0338, + "step": 21840 + }, + { + "epoch": 6.54, + "grad_norm": 1.7248507738113403, + "learning_rate": 3.7949067055233774e-05, + "loss": 1.1222, + "step": 21845 + }, + { + "epoch": 6.54, + "grad_norm": 3.3292136192321777, + "learning_rate": 3.7944041023597105e-05, + "loss": 1.1172, + "step": 21850 + }, + { + "epoch": 6.54, + "grad_norm": 2.2748050689697266, + "learning_rate": 3.793901427707451e-05, + "loss": 1.1926, + "step": 21855 + }, + { + "epoch": 6.54, + "grad_norm": 1.3561511039733887, + "learning_rate": 3.79339868159436e-05, + "loss": 1.2057, + "step": 21860 + }, + { + "epoch": 6.54, + "grad_norm": 4.775590896606445, + "learning_rate": 3.7928958640482046e-05, + "loss": 1.112, + "step": 21865 + }, + { + "epoch": 6.54, + "grad_norm": 1.4836394786834717, + "learning_rate": 3.792392975096754e-05, + "loss": 1.3004, + "step": 21870 + }, + { + "epoch": 6.54, + "grad_norm": 1.8242735862731934, + "learning_rate": 3.791890014767783e-05, + "loss": 1.3185, + "step": 21875 + }, + { + "epoch": 6.55, + "grad_norm": 3.001962661743164, + "learning_rate": 3.79138698308907e-05, + "loss": 1.082, + "step": 21880 + }, + { + "epoch": 6.55, + "grad_norm": 1.9447215795516968, + "learning_rate": 3.790883880088396e-05, + "loss": 1.3185, + "step": 21885 + }, + { + "epoch": 6.55, + "grad_norm": 2.3919167518615723, + "learning_rate": 3.790380705793547e-05, + "loss": 1.2078, + "step": 21890 + }, + { + "epoch": 6.55, + "grad_norm": 6.295918941497803, + "learning_rate": 3.789877460232313e-05, + "loss": 0.922, + "step": 21895 + }, + { + "epoch": 6.55, + "grad_norm": 2.696068525314331, + "learning_rate": 3.789374143432487e-05, + "loss": 1.3105, + "step": 21900 + }, + { + "epoch": 6.55, + "grad_norm": 3.568495035171509, + "learning_rate": 3.788870755421867e-05, + "loss": 1.128, + "step": 21905 + }, + { + "epoch": 6.56, + "grad_norm": 1.2198554277420044, + "learning_rate": 3.7883672962282555e-05, + "loss": 1.2149, + "step": 21910 + }, + { + "epoch": 6.56, + "grad_norm": 5.560957908630371, + "learning_rate": 3.787863765879458e-05, + "loss": 1.1389, + "step": 21915 + }, + { + "epoch": 6.56, + "grad_norm": 3.2426624298095703, + "learning_rate": 3.787360164403283e-05, + "loss": 1.0907, + "step": 21920 + }, + { + "epoch": 6.56, + "grad_norm": 2.8648648262023926, + "learning_rate": 3.786856491827544e-05, + "loss": 0.966, + "step": 21925 + }, + { + "epoch": 6.56, + "grad_norm": 1.9964818954467773, + "learning_rate": 3.786352748180059e-05, + "loss": 1.2, + "step": 21930 + }, + { + "epoch": 6.56, + "grad_norm": 1.6703797578811646, + "learning_rate": 3.7858489334886477e-05, + "loss": 1.2223, + "step": 21935 + }, + { + "epoch": 6.56, + "grad_norm": 2.2925643920898438, + "learning_rate": 3.785345047781137e-05, + "loss": 1.3513, + "step": 21940 + }, + { + "epoch": 6.57, + "grad_norm": 1.0569705963134766, + "learning_rate": 3.784841091085356e-05, + "loss": 1.1399, + "step": 21945 + }, + { + "epoch": 6.57, + "grad_norm": 3.3244879245758057, + "learning_rate": 3.784337063429136e-05, + "loss": 1.3942, + "step": 21950 + }, + { + "epoch": 6.57, + "grad_norm": 3.7458391189575195, + "learning_rate": 3.783832964840316e-05, + "loss": 1.1377, + "step": 21955 + }, + { + "epoch": 6.57, + "grad_norm": 1.8675755262374878, + "learning_rate": 3.7833287953467354e-05, + "loss": 1.249, + "step": 21960 + }, + { + "epoch": 6.57, + "grad_norm": 2.4439284801483154, + "learning_rate": 3.782824554976239e-05, + "loss": 1.2981, + "step": 21965 + }, + { + "epoch": 6.57, + "grad_norm": 5.8578877449035645, + "learning_rate": 3.7823202437566764e-05, + "loss": 1.2135, + "step": 21970 + }, + { + "epoch": 6.57, + "grad_norm": 1.4064592123031616, + "learning_rate": 3.7818158617158995e-05, + "loss": 1.3478, + "step": 21975 + }, + { + "epoch": 6.58, + "grad_norm": 2.7772834300994873, + "learning_rate": 3.781311408881765e-05, + "loss": 1.2668, + "step": 21980 + }, + { + "epoch": 6.58, + "grad_norm": 2.9502522945404053, + "learning_rate": 3.7808068852821334e-05, + "loss": 1.1482, + "step": 21985 + }, + { + "epoch": 6.58, + "grad_norm": 4.143762588500977, + "learning_rate": 3.780302290944868e-05, + "loss": 1.1882, + "step": 21990 + }, + { + "epoch": 6.58, + "grad_norm": 5.523019313812256, + "learning_rate": 3.7797976258978386e-05, + "loss": 1.3205, + "step": 21995 + }, + { + "epoch": 6.58, + "grad_norm": 1.1239092350006104, + "learning_rate": 3.779292890168916e-05, + "loss": 1.3228, + "step": 22000 + }, + { + "epoch": 6.58, + "grad_norm": 2.243865966796875, + "learning_rate": 3.7787880837859767e-05, + "loss": 1.3075, + "step": 22005 + }, + { + "epoch": 6.59, + "grad_norm": 2.5616250038146973, + "learning_rate": 3.778283206776902e-05, + "loss": 1.2133, + "step": 22010 + }, + { + "epoch": 6.59, + "grad_norm": 3.7556142807006836, + "learning_rate": 3.777778259169574e-05, + "loss": 1.1166, + "step": 22015 + }, + { + "epoch": 6.59, + "grad_norm": 2.411454439163208, + "learning_rate": 3.77727324099188e-05, + "loss": 1.2067, + "step": 22020 + }, + { + "epoch": 6.59, + "grad_norm": 7.664334297180176, + "learning_rate": 3.776768152271713e-05, + "loss": 1.0729, + "step": 22025 + }, + { + "epoch": 6.59, + "grad_norm": 1.7373452186584473, + "learning_rate": 3.776262993036969e-05, + "loss": 1.1381, + "step": 22030 + }, + { + "epoch": 6.59, + "grad_norm": 1.6284407377243042, + "learning_rate": 3.7757577633155464e-05, + "loss": 1.1592, + "step": 22035 + }, + { + "epoch": 6.59, + "grad_norm": 1.5111100673675537, + "learning_rate": 3.7752524631353484e-05, + "loss": 1.1819, + "step": 22040 + }, + { + "epoch": 6.6, + "grad_norm": 2.4430501461029053, + "learning_rate": 3.7747470925242815e-05, + "loss": 1.3295, + "step": 22045 + }, + { + "epoch": 6.6, + "grad_norm": 1.4049407243728638, + "learning_rate": 3.774241651510258e-05, + "loss": 1.1802, + "step": 22050 + }, + { + "epoch": 6.6, + "grad_norm": 1.4053436517715454, + "learning_rate": 3.773736140121193e-05, + "loss": 1.2387, + "step": 22055 + }, + { + "epoch": 6.6, + "grad_norm": 1.9086112976074219, + "learning_rate": 3.7732305583850044e-05, + "loss": 1.1988, + "step": 22060 + }, + { + "epoch": 6.6, + "grad_norm": 2.53025221824646, + "learning_rate": 3.772724906329616e-05, + "loss": 1.1728, + "step": 22065 + }, + { + "epoch": 6.6, + "grad_norm": 2.8648312091827393, + "learning_rate": 3.772219183982953e-05, + "loss": 1.0513, + "step": 22070 + }, + { + "epoch": 6.6, + "grad_norm": 1.3704460859298706, + "learning_rate": 3.771713391372949e-05, + "loss": 1.2856, + "step": 22075 + }, + { + "epoch": 6.61, + "grad_norm": 5.734248161315918, + "learning_rate": 3.771207528527534e-05, + "loss": 1.1455, + "step": 22080 + }, + { + "epoch": 6.61, + "grad_norm": 2.4370834827423096, + "learning_rate": 3.77070159547465e-05, + "loss": 1.1426, + "step": 22085 + }, + { + "epoch": 6.61, + "grad_norm": 13.942612648010254, + "learning_rate": 3.770195592242237e-05, + "loss": 1.2256, + "step": 22090 + }, + { + "epoch": 6.61, + "grad_norm": 2.1054158210754395, + "learning_rate": 3.769689518858242e-05, + "loss": 1.2302, + "step": 22095 + }, + { + "epoch": 6.61, + "grad_norm": 2.0124964714050293, + "learning_rate": 3.7691833753506145e-05, + "loss": 1.2412, + "step": 22100 + }, + { + "epoch": 6.61, + "grad_norm": 2.5039613246917725, + "learning_rate": 3.7686771617473094e-05, + "loss": 1.0837, + "step": 22105 + }, + { + "epoch": 6.62, + "grad_norm": 1.1294031143188477, + "learning_rate": 3.768170878076283e-05, + "loss": 1.2652, + "step": 22110 + }, + { + "epoch": 6.62, + "grad_norm": 1.8122518062591553, + "learning_rate": 3.767664524365496e-05, + "loss": 1.3259, + "step": 22115 + }, + { + "epoch": 6.62, + "grad_norm": 2.7276618480682373, + "learning_rate": 3.767158100642916e-05, + "loss": 1.0632, + "step": 22120 + }, + { + "epoch": 6.62, + "grad_norm": 6.643666744232178, + "learning_rate": 3.766651606936511e-05, + "loss": 1.1819, + "step": 22125 + }, + { + "epoch": 6.62, + "grad_norm": 1.3447577953338623, + "learning_rate": 3.7661450432742534e-05, + "loss": 1.3445, + "step": 22130 + }, + { + "epoch": 6.62, + "grad_norm": 4.023565769195557, + "learning_rate": 3.7656384096841226e-05, + "loss": 1.1085, + "step": 22135 + }, + { + "epoch": 6.62, + "grad_norm": 2.0256614685058594, + "learning_rate": 3.7651317061940975e-05, + "loss": 1.2472, + "step": 22140 + }, + { + "epoch": 6.63, + "grad_norm": 1.7763872146606445, + "learning_rate": 3.764624932832163e-05, + "loss": 1.1751, + "step": 22145 + }, + { + "epoch": 6.63, + "grad_norm": 2.526588201522827, + "learning_rate": 3.7641180896263085e-05, + "loss": 1.0481, + "step": 22150 + }, + { + "epoch": 6.63, + "grad_norm": 2.3050858974456787, + "learning_rate": 3.7636111766045254e-05, + "loss": 1.1158, + "step": 22155 + }, + { + "epoch": 6.63, + "grad_norm": 3.152416944503784, + "learning_rate": 3.76310419379481e-05, + "loss": 1.188, + "step": 22160 + }, + { + "epoch": 6.63, + "grad_norm": 2.187168598175049, + "learning_rate": 3.7625971412251636e-05, + "loss": 1.2462, + "step": 22165 + }, + { + "epoch": 6.63, + "grad_norm": 1.528957486152649, + "learning_rate": 3.76209001892359e-05, + "loss": 1.0216, + "step": 22170 + }, + { + "epoch": 6.63, + "grad_norm": 2.985142230987549, + "learning_rate": 3.7615828269180955e-05, + "loss": 1.1851, + "step": 22175 + }, + { + "epoch": 6.64, + "grad_norm": 1.8914645910263062, + "learning_rate": 3.7610755652366933e-05, + "loss": 1.1384, + "step": 22180 + }, + { + "epoch": 6.64, + "grad_norm": 1.485687017440796, + "learning_rate": 3.7605682339073986e-05, + "loss": 1.1725, + "step": 22185 + }, + { + "epoch": 6.64, + "grad_norm": 2.7884113788604736, + "learning_rate": 3.76006083295823e-05, + "loss": 1.0369, + "step": 22190 + }, + { + "epoch": 6.64, + "grad_norm": 2.6795337200164795, + "learning_rate": 3.759553362417212e-05, + "loss": 1.2506, + "step": 22195 + }, + { + "epoch": 6.64, + "grad_norm": 3.732194662094116, + "learning_rate": 3.759045822312371e-05, + "loss": 1.2024, + "step": 22200 + }, + { + "epoch": 6.64, + "grad_norm": 2.955843210220337, + "learning_rate": 3.758538212671738e-05, + "loss": 1.092, + "step": 22205 + }, + { + "epoch": 6.64, + "grad_norm": 2.543309450149536, + "learning_rate": 3.7580305335233466e-05, + "loss": 1.1417, + "step": 22210 + }, + { + "epoch": 6.65, + "grad_norm": 1.1416964530944824, + "learning_rate": 3.757522784895238e-05, + "loss": 1.2911, + "step": 22215 + }, + { + "epoch": 6.65, + "grad_norm": 2.8476202487945557, + "learning_rate": 3.757014966815452e-05, + "loss": 1.1666, + "step": 22220 + }, + { + "epoch": 6.65, + "grad_norm": 3.7398064136505127, + "learning_rate": 3.7565070793120366e-05, + "loss": 1.2501, + "step": 22225 + }, + { + "epoch": 6.65, + "grad_norm": 1.3023401498794556, + "learning_rate": 3.755999122413041e-05, + "loss": 1.0223, + "step": 22230 + }, + { + "epoch": 6.65, + "grad_norm": 3.140843391418457, + "learning_rate": 3.7554910961465204e-05, + "loss": 1.1367, + "step": 22235 + }, + { + "epoch": 6.65, + "grad_norm": 1.410656452178955, + "learning_rate": 3.75498300054053e-05, + "loss": 1.227, + "step": 22240 + }, + { + "epoch": 6.66, + "grad_norm": 3.442096710205078, + "learning_rate": 3.754474835623134e-05, + "loss": 1.0639, + "step": 22245 + }, + { + "epoch": 6.66, + "grad_norm": 2.1821417808532715, + "learning_rate": 3.753966601422396e-05, + "loss": 1.2744, + "step": 22250 + }, + { + "epoch": 6.66, + "grad_norm": 1.2946699857711792, + "learning_rate": 3.753458297966387e-05, + "loss": 1.1973, + "step": 22255 + }, + { + "epoch": 6.66, + "grad_norm": 1.561705231666565, + "learning_rate": 3.752949925283178e-05, + "loss": 1.1897, + "step": 22260 + }, + { + "epoch": 6.66, + "grad_norm": 1.594902515411377, + "learning_rate": 3.7524414834008475e-05, + "loss": 1.1862, + "step": 22265 + }, + { + "epoch": 6.66, + "grad_norm": 2.135448932647705, + "learning_rate": 3.751932972347476e-05, + "loss": 1.2566, + "step": 22270 + }, + { + "epoch": 6.66, + "grad_norm": 4.568878650665283, + "learning_rate": 3.751424392151147e-05, + "loss": 0.9779, + "step": 22275 + }, + { + "epoch": 6.67, + "grad_norm": 1.9273889064788818, + "learning_rate": 3.75091574283995e-05, + "loss": 1.2662, + "step": 22280 + }, + { + "epoch": 6.67, + "grad_norm": 2.1159157752990723, + "learning_rate": 3.750407024441977e-05, + "loss": 1.1524, + "step": 22285 + }, + { + "epoch": 6.67, + "grad_norm": 2.8787682056427, + "learning_rate": 3.7498982369853235e-05, + "loss": 1.1924, + "step": 22290 + }, + { + "epoch": 6.67, + "grad_norm": 3.1426780223846436, + "learning_rate": 3.7493893804980897e-05, + "loss": 0.9211, + "step": 22295 + }, + { + "epoch": 6.67, + "grad_norm": 2.8271982669830322, + "learning_rate": 3.748880455008379e-05, + "loss": 1.3036, + "step": 22300 + }, + { + "epoch": 6.67, + "grad_norm": 3.4877443313598633, + "learning_rate": 3.7483714605442996e-05, + "loss": 1.1612, + "step": 22305 + }, + { + "epoch": 6.67, + "grad_norm": 1.2759697437286377, + "learning_rate": 3.747862397133961e-05, + "loss": 1.2321, + "step": 22310 + }, + { + "epoch": 6.68, + "grad_norm": 2.5716593265533447, + "learning_rate": 3.74735326480548e-05, + "loss": 1.0509, + "step": 22315 + }, + { + "epoch": 6.68, + "grad_norm": 1.8768802881240845, + "learning_rate": 3.7468440635869746e-05, + "loss": 1.2344, + "step": 22320 + }, + { + "epoch": 6.68, + "grad_norm": 3.4969723224639893, + "learning_rate": 3.746334793506567e-05, + "loss": 1.1697, + "step": 22325 + }, + { + "epoch": 6.68, + "grad_norm": 2.126424551010132, + "learning_rate": 3.7458254545923855e-05, + "loss": 1.2722, + "step": 22330 + }, + { + "epoch": 6.68, + "grad_norm": 2.6972811222076416, + "learning_rate": 3.745316046872558e-05, + "loss": 1.3685, + "step": 22335 + }, + { + "epoch": 6.68, + "grad_norm": 1.37725830078125, + "learning_rate": 3.74480657037522e-05, + "loss": 1.2091, + "step": 22340 + }, + { + "epoch": 6.69, + "grad_norm": 1.036205768585205, + "learning_rate": 3.7442970251285095e-05, + "loss": 1.3278, + "step": 22345 + }, + { + "epoch": 6.69, + "grad_norm": 1.2511407136917114, + "learning_rate": 3.743787411160567e-05, + "loss": 1.1185, + "step": 22350 + }, + { + "epoch": 6.69, + "grad_norm": 2.403078079223633, + "learning_rate": 3.7432777284995384e-05, + "loss": 1.187, + "step": 22355 + }, + { + "epoch": 6.69, + "grad_norm": 2.58774471282959, + "learning_rate": 3.742767977173573e-05, + "loss": 1.2784, + "step": 22360 + }, + { + "epoch": 6.69, + "grad_norm": 1.8022263050079346, + "learning_rate": 3.742258157210825e-05, + "loss": 1.1472, + "step": 22365 + }, + { + "epoch": 6.69, + "grad_norm": 1.9385359287261963, + "learning_rate": 3.74174826863945e-05, + "loss": 1.0361, + "step": 22370 + }, + { + "epoch": 6.69, + "grad_norm": 1.3857041597366333, + "learning_rate": 3.741238311487609e-05, + "loss": 1.2892, + "step": 22375 + }, + { + "epoch": 6.7, + "grad_norm": 1.456871747970581, + "learning_rate": 3.740728285783466e-05, + "loss": 1.251, + "step": 22380 + }, + { + "epoch": 6.7, + "grad_norm": 1.2048922777175903, + "learning_rate": 3.740218191555189e-05, + "loss": 1.2085, + "step": 22385 + }, + { + "epoch": 6.7, + "grad_norm": 1.9001728296279907, + "learning_rate": 3.7397080288309504e-05, + "loss": 1.1827, + "step": 22390 + }, + { + "epoch": 6.7, + "grad_norm": 1.7842788696289062, + "learning_rate": 3.739197797638927e-05, + "loss": 1.1712, + "step": 22395 + }, + { + "epoch": 6.7, + "grad_norm": 2.8742237091064453, + "learning_rate": 3.7386874980072965e-05, + "loss": 1.346, + "step": 22400 + }, + { + "epoch": 6.7, + "grad_norm": 0.9042024612426758, + "learning_rate": 3.7381771299642434e-05, + "loss": 1.1113, + "step": 22405 + }, + { + "epoch": 6.7, + "grad_norm": 2.389392852783203, + "learning_rate": 3.737666693537954e-05, + "loss": 1.2051, + "step": 22410 + }, + { + "epoch": 6.71, + "grad_norm": 1.8585515022277832, + "learning_rate": 3.73715618875662e-05, + "loss": 1.0961, + "step": 22415 + }, + { + "epoch": 6.71, + "grad_norm": 1.445894479751587, + "learning_rate": 3.736645615648435e-05, + "loss": 1.205, + "step": 22420 + }, + { + "epoch": 6.71, + "grad_norm": 1.4859192371368408, + "learning_rate": 3.736134974241599e-05, + "loss": 1.2583, + "step": 22425 + }, + { + "epoch": 6.71, + "grad_norm": 1.2996731996536255, + "learning_rate": 3.735624264564313e-05, + "loss": 1.1439, + "step": 22430 + }, + { + "epoch": 6.71, + "grad_norm": 2.0021395683288574, + "learning_rate": 3.735113486644783e-05, + "loss": 1.153, + "step": 22435 + }, + { + "epoch": 6.71, + "grad_norm": 1.7902204990386963, + "learning_rate": 3.734602640511219e-05, + "loss": 1.0953, + "step": 22440 + }, + { + "epoch": 6.72, + "grad_norm": 2.417541027069092, + "learning_rate": 3.734091726191834e-05, + "loss": 1.2148, + "step": 22445 + }, + { + "epoch": 6.72, + "grad_norm": 3.7760798931121826, + "learning_rate": 3.7335807437148454e-05, + "loss": 1.1157, + "step": 22450 + }, + { + "epoch": 6.72, + "grad_norm": 1.5918861627578735, + "learning_rate": 3.733069693108476e-05, + "loss": 1.2129, + "step": 22455 + }, + { + "epoch": 6.72, + "grad_norm": 0.9340783953666687, + "learning_rate": 3.732558574400948e-05, + "loss": 1.2252, + "step": 22460 + }, + { + "epoch": 6.72, + "grad_norm": 1.9744924306869507, + "learning_rate": 3.732047387620492e-05, + "loss": 1.1734, + "step": 22465 + }, + { + "epoch": 6.72, + "grad_norm": 1.9062458276748657, + "learning_rate": 3.731536132795339e-05, + "loss": 1.1843, + "step": 22470 + }, + { + "epoch": 6.72, + "grad_norm": 2.320115566253662, + "learning_rate": 3.731024809953726e-05, + "loss": 1.0732, + "step": 22475 + }, + { + "epoch": 6.73, + "grad_norm": 2.1954009532928467, + "learning_rate": 3.7305134191238914e-05, + "loss": 1.195, + "step": 22480 + }, + { + "epoch": 6.73, + "grad_norm": 2.1128578186035156, + "learning_rate": 3.73000196033408e-05, + "loss": 1.1004, + "step": 22485 + }, + { + "epoch": 6.73, + "grad_norm": 2.868645191192627, + "learning_rate": 3.729490433612539e-05, + "loss": 1.1917, + "step": 22490 + }, + { + "epoch": 6.73, + "grad_norm": 4.787483215332031, + "learning_rate": 3.7289788389875195e-05, + "loss": 1.1271, + "step": 22495 + }, + { + "epoch": 6.73, + "grad_norm": 1.8350549936294556, + "learning_rate": 3.7284671764872764e-05, + "loss": 1.1971, + "step": 22500 + }, + { + "epoch": 6.73, + "grad_norm": 1.9952346086502075, + "learning_rate": 3.7279554461400675e-05, + "loss": 1.2058, + "step": 22505 + }, + { + "epoch": 6.73, + "grad_norm": 2.3155946731567383, + "learning_rate": 3.727443647974156e-05, + "loss": 1.0397, + "step": 22510 + }, + { + "epoch": 6.74, + "grad_norm": 2.454383611679077, + "learning_rate": 3.726931782017807e-05, + "loss": 1.2128, + "step": 22515 + }, + { + "epoch": 6.74, + "grad_norm": 1.4196258783340454, + "learning_rate": 3.726419848299292e-05, + "loss": 1.3271, + "step": 22520 + }, + { + "epoch": 6.74, + "grad_norm": 4.044848918914795, + "learning_rate": 3.725907846846883e-05, + "loss": 1.1593, + "step": 22525 + }, + { + "epoch": 6.74, + "grad_norm": 1.5444008111953735, + "learning_rate": 3.725395777688858e-05, + "loss": 1.165, + "step": 22530 + }, + { + "epoch": 6.74, + "grad_norm": 2.297614097595215, + "learning_rate": 3.7248836408534975e-05, + "loss": 1.259, + "step": 22535 + }, + { + "epoch": 6.74, + "grad_norm": 1.7064695358276367, + "learning_rate": 3.724371436369087e-05, + "loss": 1.2259, + "step": 22540 + }, + { + "epoch": 6.75, + "grad_norm": 1.426712155342102, + "learning_rate": 3.7238591642639155e-05, + "loss": 1.1047, + "step": 22545 + }, + { + "epoch": 6.75, + "grad_norm": 2.8516054153442383, + "learning_rate": 3.723346824566273e-05, + "loss": 1.1642, + "step": 22550 + }, + { + "epoch": 6.75, + "grad_norm": 1.5631628036499023, + "learning_rate": 3.722834417304459e-05, + "loss": 1.1772, + "step": 22555 + }, + { + "epoch": 6.75, + "grad_norm": 1.1760963201522827, + "learning_rate": 3.7223219425067694e-05, + "loss": 1.2157, + "step": 22560 + }, + { + "epoch": 6.75, + "grad_norm": 1.5245758295059204, + "learning_rate": 3.7218094002015104e-05, + "loss": 1.0718, + "step": 22565 + }, + { + "epoch": 6.75, + "grad_norm": 5.258598327636719, + "learning_rate": 3.7212967904169874e-05, + "loss": 1.0714, + "step": 22570 + }, + { + "epoch": 6.75, + "grad_norm": 0.8288945555686951, + "learning_rate": 3.720784113181513e-05, + "loss": 1.1171, + "step": 22575 + }, + { + "epoch": 6.76, + "grad_norm": 2.472649097442627, + "learning_rate": 3.7202713685234006e-05, + "loss": 1.2317, + "step": 22580 + }, + { + "epoch": 6.76, + "grad_norm": 6.6396565437316895, + "learning_rate": 3.719758556470969e-05, + "loss": 1.1746, + "step": 22585 + }, + { + "epoch": 6.76, + "grad_norm": 1.3930727243423462, + "learning_rate": 3.71924567705254e-05, + "loss": 1.2441, + "step": 22590 + }, + { + "epoch": 6.76, + "grad_norm": 1.635603666305542, + "learning_rate": 3.71873273029644e-05, + "loss": 1.4045, + "step": 22595 + }, + { + "epoch": 6.76, + "grad_norm": 2.988915205001831, + "learning_rate": 3.718219716230998e-05, + "loss": 1.2706, + "step": 22600 + }, + { + "epoch": 6.76, + "grad_norm": 1.915685772895813, + "learning_rate": 3.717706634884547e-05, + "loss": 1.3597, + "step": 22605 + }, + { + "epoch": 6.76, + "grad_norm": 2.3813059329986572, + "learning_rate": 3.7171934862854244e-05, + "loss": 1.2544, + "step": 22610 + }, + { + "epoch": 6.77, + "grad_norm": 3.367213010787964, + "learning_rate": 3.716680270461971e-05, + "loss": 1.2843, + "step": 22615 + }, + { + "epoch": 6.77, + "grad_norm": 0.9341633319854736, + "learning_rate": 3.7161669874425304e-05, + "loss": 1.1621, + "step": 22620 + }, + { + "epoch": 6.77, + "grad_norm": 2.1656148433685303, + "learning_rate": 3.715653637255452e-05, + "loss": 1.0864, + "step": 22625 + }, + { + "epoch": 6.77, + "grad_norm": 2.8741602897644043, + "learning_rate": 3.715140219929086e-05, + "loss": 1.331, + "step": 22630 + }, + { + "epoch": 6.77, + "grad_norm": 3.68282151222229, + "learning_rate": 3.714626735491789e-05, + "loss": 1.2079, + "step": 22635 + }, + { + "epoch": 6.77, + "grad_norm": 1.4136219024658203, + "learning_rate": 3.71411318397192e-05, + "loss": 1.168, + "step": 22640 + }, + { + "epoch": 6.78, + "grad_norm": 2.178769588470459, + "learning_rate": 3.7135995653978415e-05, + "loss": 1.2211, + "step": 22645 + }, + { + "epoch": 6.78, + "grad_norm": 2.099884033203125, + "learning_rate": 3.713085879797921e-05, + "loss": 1.2584, + "step": 22650 + }, + { + "epoch": 6.78, + "grad_norm": 2.237651824951172, + "learning_rate": 3.7125721272005285e-05, + "loss": 1.1906, + "step": 22655 + }, + { + "epoch": 6.78, + "grad_norm": 5.40058708190918, + "learning_rate": 3.7120583076340374e-05, + "loss": 1.177, + "step": 22660 + }, + { + "epoch": 6.78, + "grad_norm": 3.936767578125, + "learning_rate": 3.711544421126826e-05, + "loss": 1.1745, + "step": 22665 + }, + { + "epoch": 6.78, + "grad_norm": 9.406877517700195, + "learning_rate": 3.711030467707275e-05, + "loss": 1.1282, + "step": 22670 + }, + { + "epoch": 6.78, + "grad_norm": 4.18208122253418, + "learning_rate": 3.7105164474037706e-05, + "loss": 1.3561, + "step": 22675 + }, + { + "epoch": 6.79, + "grad_norm": 1.8916975259780884, + "learning_rate": 3.710002360244701e-05, + "loss": 1.2032, + "step": 22680 + }, + { + "epoch": 6.79, + "grad_norm": 1.6227880716323853, + "learning_rate": 3.709488206258459e-05, + "loss": 1.1442, + "step": 22685 + }, + { + "epoch": 6.79, + "grad_norm": 2.2588720321655273, + "learning_rate": 3.7089739854734406e-05, + "loss": 1.1536, + "step": 22690 + }, + { + "epoch": 6.79, + "grad_norm": 2.82357120513916, + "learning_rate": 3.708459697918045e-05, + "loss": 1.1142, + "step": 22695 + }, + { + "epoch": 6.79, + "grad_norm": 2.484433650970459, + "learning_rate": 3.707945343620677e-05, + "loss": 1.2712, + "step": 22700 + }, + { + "epoch": 6.79, + "grad_norm": 3.071495771408081, + "learning_rate": 3.7074309226097424e-05, + "loss": 1.0004, + "step": 22705 + }, + { + "epoch": 6.79, + "grad_norm": 1.225174069404602, + "learning_rate": 3.706916434913654e-05, + "loss": 0.9823, + "step": 22710 + }, + { + "epoch": 6.8, + "grad_norm": 2.9054837226867676, + "learning_rate": 3.7064018805608244e-05, + "loss": 1.1946, + "step": 22715 + }, + { + "epoch": 6.8, + "grad_norm": 2.1783711910247803, + "learning_rate": 3.705887259579674e-05, + "loss": 1.1034, + "step": 22720 + }, + { + "epoch": 6.8, + "grad_norm": 7.1069016456604, + "learning_rate": 3.7053725719986234e-05, + "loss": 1.2409, + "step": 22725 + }, + { + "epoch": 6.8, + "grad_norm": 1.7012306451797485, + "learning_rate": 3.704960774000956e-05, + "loss": 1.0688, + "step": 22730 + }, + { + "epoch": 6.8, + "grad_norm": 2.2616653442382812, + "learning_rate": 3.704445966611721e-05, + "loss": 1.2547, + "step": 22735 + }, + { + "epoch": 6.8, + "grad_norm": 0.8807179927825928, + "learning_rate": 3.703931092702188e-05, + "loss": 1.0968, + "step": 22740 + }, + { + "epoch": 6.81, + "grad_norm": 3.9191973209381104, + "learning_rate": 3.7034161523007905e-05, + "loss": 1.008, + "step": 22745 + }, + { + "epoch": 6.81, + "grad_norm": 2.1995856761932373, + "learning_rate": 3.70290114543597e-05, + "loss": 1.1851, + "step": 22750 + }, + { + "epoch": 6.81, + "grad_norm": 2.6970908641815186, + "learning_rate": 3.70238607213617e-05, + "loss": 1.1741, + "step": 22755 + }, + { + "epoch": 6.81, + "grad_norm": 1.7191367149353027, + "learning_rate": 3.7018709324298364e-05, + "loss": 1.3658, + "step": 22760 + }, + { + "epoch": 6.81, + "grad_norm": 2.820333242416382, + "learning_rate": 3.701355726345421e-05, + "loss": 0.9673, + "step": 22765 + }, + { + "epoch": 6.81, + "grad_norm": 3.5425102710723877, + "learning_rate": 3.700840453911378e-05, + "loss": 1.2862, + "step": 22770 + }, + { + "epoch": 6.81, + "grad_norm": 2.4167778491973877, + "learning_rate": 3.700325115156165e-05, + "loss": 1.1087, + "step": 22775 + }, + { + "epoch": 6.82, + "grad_norm": 1.7471559047698975, + "learning_rate": 3.6998097101082424e-05, + "loss": 1.2492, + "step": 22780 + }, + { + "epoch": 6.82, + "grad_norm": 4.646815776824951, + "learning_rate": 3.6992942387960766e-05, + "loss": 1.2538, + "step": 22785 + }, + { + "epoch": 6.82, + "grad_norm": 1.7695302963256836, + "learning_rate": 3.698778701248137e-05, + "loss": 1.0982, + "step": 22790 + }, + { + "epoch": 6.82, + "grad_norm": 3.9096667766571045, + "learning_rate": 3.698263097492896e-05, + "loss": 1.1196, + "step": 22795 + }, + { + "epoch": 6.82, + "grad_norm": 2.3695907592773438, + "learning_rate": 3.69774742755883e-05, + "loss": 0.9765, + "step": 22800 + }, + { + "epoch": 6.82, + "grad_norm": 2.3105177879333496, + "learning_rate": 3.697231691474419e-05, + "loss": 1.1265, + "step": 22805 + }, + { + "epoch": 6.82, + "grad_norm": 8.283111572265625, + "learning_rate": 3.6967158892681456e-05, + "loss": 1.1517, + "step": 22810 + }, + { + "epoch": 6.83, + "grad_norm": 1.0593783855438232, + "learning_rate": 3.696200020968497e-05, + "loss": 1.2822, + "step": 22815 + }, + { + "epoch": 6.83, + "grad_norm": 2.067911386489868, + "learning_rate": 3.695684086603964e-05, + "loss": 1.1125, + "step": 22820 + }, + { + "epoch": 6.83, + "grad_norm": 7.895202159881592, + "learning_rate": 3.695168086203044e-05, + "loss": 1.1565, + "step": 22825 + }, + { + "epoch": 6.83, + "grad_norm": 2.0913262367248535, + "learning_rate": 3.694652019794231e-05, + "loss": 1.159, + "step": 22830 + }, + { + "epoch": 6.83, + "grad_norm": 2.38482403755188, + "learning_rate": 3.6941358874060295e-05, + "loss": 1.2542, + "step": 22835 + }, + { + "epoch": 6.83, + "grad_norm": 3.0547502040863037, + "learning_rate": 3.693619689066944e-05, + "loss": 1.3171, + "step": 22840 + }, + { + "epoch": 6.83, + "grad_norm": 4.048938751220703, + "learning_rate": 3.6931034248054834e-05, + "loss": 1.0786, + "step": 22845 + }, + { + "epoch": 6.84, + "grad_norm": 4.187254428863525, + "learning_rate": 3.6925870946501615e-05, + "loss": 1.1805, + "step": 22850 + }, + { + "epoch": 6.84, + "grad_norm": 2.219910144805908, + "learning_rate": 3.692070698629493e-05, + "loss": 1.2842, + "step": 22855 + }, + { + "epoch": 6.84, + "grad_norm": 3.7236175537109375, + "learning_rate": 3.6915542367720005e-05, + "loss": 1.2186, + "step": 22860 + }, + { + "epoch": 6.84, + "grad_norm": 3.3912012577056885, + "learning_rate": 3.691037709106205e-05, + "loss": 1.0233, + "step": 22865 + }, + { + "epoch": 6.84, + "grad_norm": 3.296980857849121, + "learning_rate": 3.6905211156606347e-05, + "loss": 1.2314, + "step": 22870 + }, + { + "epoch": 6.84, + "grad_norm": 2.4683995246887207, + "learning_rate": 3.690004456463821e-05, + "loss": 1.3122, + "step": 22875 + }, + { + "epoch": 6.85, + "grad_norm": 2.9964542388916016, + "learning_rate": 3.689487731544298e-05, + "loss": 1.2645, + "step": 22880 + }, + { + "epoch": 6.85, + "grad_norm": 2.661977529525757, + "learning_rate": 3.688970940930604e-05, + "loss": 0.9723, + "step": 22885 + }, + { + "epoch": 6.85, + "grad_norm": 1.609717607498169, + "learning_rate": 3.6884540846512806e-05, + "loss": 1.0939, + "step": 22890 + }, + { + "epoch": 6.85, + "grad_norm": 2.814666271209717, + "learning_rate": 3.687937162734874e-05, + "loss": 1.2832, + "step": 22895 + }, + { + "epoch": 6.85, + "grad_norm": 6.085357666015625, + "learning_rate": 3.687420175209933e-05, + "loss": 1.3523, + "step": 22900 + }, + { + "epoch": 6.85, + "grad_norm": 2.151413679122925, + "learning_rate": 3.686903122105009e-05, + "loss": 1.0956, + "step": 22905 + }, + { + "epoch": 6.85, + "grad_norm": 1.2115709781646729, + "learning_rate": 3.686386003448659e-05, + "loss": 0.9451, + "step": 22910 + }, + { + "epoch": 6.86, + "grad_norm": 1.7795747518539429, + "learning_rate": 3.685868819269444e-05, + "loss": 1.272, + "step": 22915 + }, + { + "epoch": 6.86, + "grad_norm": 1.4842565059661865, + "learning_rate": 3.6853515695959264e-05, + "loss": 1.3025, + "step": 22920 + }, + { + "epoch": 6.86, + "grad_norm": 1.450868010520935, + "learning_rate": 3.684834254456674e-05, + "loss": 1.2952, + "step": 22925 + }, + { + "epoch": 6.86, + "grad_norm": 2.6749181747436523, + "learning_rate": 3.684316873880257e-05, + "loss": 1.2248, + "step": 22930 + }, + { + "epoch": 6.86, + "grad_norm": 1.4082355499267578, + "learning_rate": 3.6837994278952506e-05, + "loss": 1.2363, + "step": 22935 + }, + { + "epoch": 6.86, + "grad_norm": 1.7880902290344238, + "learning_rate": 3.6832819165302325e-05, + "loss": 1.1894, + "step": 22940 + }, + { + "epoch": 6.86, + "grad_norm": 2.6306281089782715, + "learning_rate": 3.682764339813783e-05, + "loss": 1.1545, + "step": 22945 + }, + { + "epoch": 6.87, + "grad_norm": 4.578315734863281, + "learning_rate": 3.682246697774489e-05, + "loss": 1.0853, + "step": 22950 + }, + { + "epoch": 6.87, + "grad_norm": 1.6041889190673828, + "learning_rate": 3.6817289904409394e-05, + "loss": 1.2719, + "step": 22955 + }, + { + "epoch": 6.87, + "grad_norm": 4.389896869659424, + "learning_rate": 3.6812112178417244e-05, + "loss": 1.12, + "step": 22960 + }, + { + "epoch": 6.87, + "grad_norm": 3.254216432571411, + "learning_rate": 3.6806933800054425e-05, + "loss": 1.2594, + "step": 22965 + }, + { + "epoch": 6.87, + "grad_norm": 1.4171181917190552, + "learning_rate": 3.680175476960693e-05, + "loss": 1.2488, + "step": 22970 + }, + { + "epoch": 6.87, + "grad_norm": 8.962324142456055, + "learning_rate": 3.679657508736078e-05, + "loss": 1.1462, + "step": 22975 + }, + { + "epoch": 6.88, + "grad_norm": 4.2052321434021, + "learning_rate": 3.6791394753602054e-05, + "loss": 1.241, + "step": 22980 + }, + { + "epoch": 6.88, + "grad_norm": 1.6290329694747925, + "learning_rate": 3.678621376861685e-05, + "loss": 1.2826, + "step": 22985 + }, + { + "epoch": 6.88, + "grad_norm": 2.770118236541748, + "learning_rate": 3.678103213269131e-05, + "loss": 1.3042, + "step": 22990 + }, + { + "epoch": 6.88, + "grad_norm": 2.3370397090911865, + "learning_rate": 3.6775849846111614e-05, + "loss": 1.1647, + "step": 22995 + }, + { + "epoch": 6.88, + "grad_norm": 2.9405107498168945, + "learning_rate": 3.677066690916397e-05, + "loss": 1.231, + "step": 23000 + }, + { + "epoch": 6.88, + "grad_norm": 3.535926580429077, + "learning_rate": 3.6765483322134625e-05, + "loss": 1.1303, + "step": 23005 + }, + { + "epoch": 6.88, + "grad_norm": 1.9250695705413818, + "learning_rate": 3.676029908530987e-05, + "loss": 1.2256, + "step": 23010 + }, + { + "epoch": 6.89, + "grad_norm": 1.7612509727478027, + "learning_rate": 3.6755114198976016e-05, + "loss": 1.061, + "step": 23015 + }, + { + "epoch": 6.89, + "grad_norm": 2.3679986000061035, + "learning_rate": 3.674992866341943e-05, + "loss": 1.1068, + "step": 23020 + }, + { + "epoch": 6.89, + "grad_norm": 1.4026302099227905, + "learning_rate": 3.6744742478926485e-05, + "loss": 1.1352, + "step": 23025 + }, + { + "epoch": 6.89, + "grad_norm": 2.6781442165374756, + "learning_rate": 3.673955564578364e-05, + "loss": 1.3133, + "step": 23030 + }, + { + "epoch": 6.89, + "grad_norm": 3.6922409534454346, + "learning_rate": 3.673436816427732e-05, + "loss": 1.1815, + "step": 23035 + }, + { + "epoch": 6.89, + "grad_norm": 1.3594402074813843, + "learning_rate": 3.6729180034694055e-05, + "loss": 1.2121, + "step": 23040 + }, + { + "epoch": 6.89, + "grad_norm": 2.578176736831665, + "learning_rate": 3.672399125732036e-05, + "loss": 1.2374, + "step": 23045 + }, + { + "epoch": 6.9, + "grad_norm": 2.196401357650757, + "learning_rate": 3.671880183244282e-05, + "loss": 1.2618, + "step": 23050 + }, + { + "epoch": 6.9, + "grad_norm": 1.1504014730453491, + "learning_rate": 3.671361176034803e-05, + "loss": 1.2077, + "step": 23055 + }, + { + "epoch": 6.9, + "grad_norm": 1.4980058670043945, + "learning_rate": 3.670842104132265e-05, + "loss": 1.1627, + "step": 23060 + }, + { + "epoch": 6.9, + "grad_norm": 2.5145270824432373, + "learning_rate": 3.670322967565333e-05, + "loss": 1.2164, + "step": 23065 + }, + { + "epoch": 6.9, + "grad_norm": 1.6475614309310913, + "learning_rate": 3.6698037663626815e-05, + "loss": 1.1775, + "step": 23070 + }, + { + "epoch": 6.9, + "grad_norm": 1.1627910137176514, + "learning_rate": 3.669284500552983e-05, + "loss": 1.2284, + "step": 23075 + }, + { + "epoch": 6.91, + "grad_norm": 2.22105073928833, + "learning_rate": 3.668765170164917e-05, + "loss": 1.1692, + "step": 23080 + }, + { + "epoch": 6.91, + "grad_norm": 1.5115783214569092, + "learning_rate": 3.668245775227166e-05, + "loss": 1.0566, + "step": 23085 + }, + { + "epoch": 6.91, + "grad_norm": 1.735473394393921, + "learning_rate": 3.667726315768415e-05, + "loss": 1.0803, + "step": 23090 + }, + { + "epoch": 6.91, + "grad_norm": 2.8493192195892334, + "learning_rate": 3.667206791817354e-05, + "loss": 1.2563, + "step": 23095 + }, + { + "epoch": 6.91, + "grad_norm": 2.09541916847229, + "learning_rate": 3.6666872034026746e-05, + "loss": 1.2383, + "step": 23100 + }, + { + "epoch": 6.91, + "grad_norm": 2.5542640686035156, + "learning_rate": 3.666167550553073e-05, + "loss": 1.1892, + "step": 23105 + }, + { + "epoch": 6.91, + "grad_norm": 3.2478058338165283, + "learning_rate": 3.66564783329725e-05, + "loss": 1.3445, + "step": 23110 + }, + { + "epoch": 6.92, + "grad_norm": 3.8739407062530518, + "learning_rate": 3.665128051663909e-05, + "loss": 1.2336, + "step": 23115 + }, + { + "epoch": 6.92, + "grad_norm": 1.4774131774902344, + "learning_rate": 3.664608205681757e-05, + "loss": 1.0818, + "step": 23120 + }, + { + "epoch": 6.92, + "grad_norm": 2.790187120437622, + "learning_rate": 3.664088295379505e-05, + "loss": 1.1317, + "step": 23125 + }, + { + "epoch": 6.92, + "grad_norm": 5.5581278800964355, + "learning_rate": 3.663568320785865e-05, + "loss": 1.2463, + "step": 23130 + }, + { + "epoch": 6.92, + "grad_norm": 1.6298890113830566, + "learning_rate": 3.6630482819295564e-05, + "loss": 1.1815, + "step": 23135 + }, + { + "epoch": 6.92, + "grad_norm": 2.767247200012207, + "learning_rate": 3.662528178839301e-05, + "loss": 1.1912, + "step": 23140 + }, + { + "epoch": 6.92, + "grad_norm": 2.707919120788574, + "learning_rate": 3.6620080115438214e-05, + "loss": 1.1293, + "step": 23145 + }, + { + "epoch": 6.93, + "grad_norm": 4.199367046356201, + "learning_rate": 3.661487780071848e-05, + "loss": 1.0369, + "step": 23150 + }, + { + "epoch": 6.93, + "grad_norm": 0.9766875505447388, + "learning_rate": 3.6609674844521113e-05, + "loss": 1.2359, + "step": 23155 + }, + { + "epoch": 6.93, + "grad_norm": 2.218275308609009, + "learning_rate": 3.6604471247133476e-05, + "loss": 1.314, + "step": 23160 + }, + { + "epoch": 6.93, + "grad_norm": 3.2757742404937744, + "learning_rate": 3.659926700884296e-05, + "loss": 1.1953, + "step": 23165 + }, + { + "epoch": 6.93, + "grad_norm": 1.7651225328445435, + "learning_rate": 3.6594062129936974e-05, + "loss": 1.2148, + "step": 23170 + }, + { + "epoch": 6.93, + "grad_norm": 6.852691650390625, + "learning_rate": 3.6588856610702994e-05, + "loss": 1.1669, + "step": 23175 + }, + { + "epoch": 6.94, + "grad_norm": 4.03468132019043, + "learning_rate": 3.65836504514285e-05, + "loss": 1.3234, + "step": 23180 + }, + { + "epoch": 6.94, + "grad_norm": 1.6668856143951416, + "learning_rate": 3.657844365240104e-05, + "loss": 1.2555, + "step": 23185 + }, + { + "epoch": 6.94, + "grad_norm": 2.5867419242858887, + "learning_rate": 3.657323621390818e-05, + "loss": 1.2328, + "step": 23190 + }, + { + "epoch": 6.94, + "grad_norm": 4.226202011108398, + "learning_rate": 3.65680281362375e-05, + "loss": 1.2422, + "step": 23195 + }, + { + "epoch": 6.94, + "grad_norm": 1.6796958446502686, + "learning_rate": 3.656281941967665e-05, + "loss": 1.3767, + "step": 23200 + }, + { + "epoch": 6.94, + "grad_norm": 2.898170232772827, + "learning_rate": 3.6557610064513314e-05, + "loss": 1.2212, + "step": 23205 + }, + { + "epoch": 6.94, + "grad_norm": 2.079902172088623, + "learning_rate": 3.6552400071035184e-05, + "loss": 1.1753, + "step": 23210 + }, + { + "epoch": 6.95, + "grad_norm": 2.3818612098693848, + "learning_rate": 3.654718943953e-05, + "loss": 1.2634, + "step": 23215 + }, + { + "epoch": 6.95, + "grad_norm": 2.0267412662506104, + "learning_rate": 3.654197817028555e-05, + "loss": 1.294, + "step": 23220 + }, + { + "epoch": 6.95, + "grad_norm": 2.046330213546753, + "learning_rate": 3.6536766263589646e-05, + "loss": 1.2195, + "step": 23225 + }, + { + "epoch": 6.95, + "grad_norm": 1.5229865312576294, + "learning_rate": 3.653155371973012e-05, + "loss": 1.2128, + "step": 23230 + }, + { + "epoch": 6.95, + "grad_norm": 1.3038949966430664, + "learning_rate": 3.652634053899487e-05, + "loss": 1.1954, + "step": 23235 + }, + { + "epoch": 6.95, + "grad_norm": 2.71051025390625, + "learning_rate": 3.652112672167183e-05, + "loss": 1.1658, + "step": 23240 + }, + { + "epoch": 6.95, + "grad_norm": 2.3642067909240723, + "learning_rate": 3.651591226804892e-05, + "loss": 1.2206, + "step": 23245 + }, + { + "epoch": 6.96, + "grad_norm": 1.8010873794555664, + "learning_rate": 3.6510697178414156e-05, + "loss": 1.2646, + "step": 23250 + }, + { + "epoch": 6.96, + "grad_norm": 2.7175979614257812, + "learning_rate": 3.6505481453055554e-05, + "loss": 1.097, + "step": 23255 + }, + { + "epoch": 6.96, + "grad_norm": 7.800134181976318, + "learning_rate": 3.6500265092261164e-05, + "loss": 0.961, + "step": 23260 + }, + { + "epoch": 6.96, + "grad_norm": 3.0702438354492188, + "learning_rate": 3.6495048096319084e-05, + "loss": 1.1067, + "step": 23265 + }, + { + "epoch": 6.96, + "grad_norm": 2.858368158340454, + "learning_rate": 3.648983046551745e-05, + "loss": 1.1169, + "step": 23270 + }, + { + "epoch": 6.96, + "grad_norm": 4.364511966705322, + "learning_rate": 3.6484612200144425e-05, + "loss": 1.057, + "step": 23275 + }, + { + "epoch": 6.97, + "grad_norm": 1.3396683931350708, + "learning_rate": 3.64793933004882e-05, + "loss": 1.1468, + "step": 23280 + }, + { + "epoch": 6.97, + "grad_norm": 2.03324294090271, + "learning_rate": 3.647417376683702e-05, + "loss": 1.126, + "step": 23285 + }, + { + "epoch": 6.97, + "grad_norm": 2.8686647415161133, + "learning_rate": 3.646895359947915e-05, + "loss": 1.1806, + "step": 23290 + }, + { + "epoch": 6.97, + "grad_norm": 1.482798457145691, + "learning_rate": 3.646373279870289e-05, + "loss": 1.2207, + "step": 23295 + }, + { + "epoch": 6.97, + "grad_norm": 9.18463134765625, + "learning_rate": 3.6458511364796585e-05, + "loss": 1.2255, + "step": 23300 + }, + { + "epoch": 6.97, + "grad_norm": 2.2677526473999023, + "learning_rate": 3.645328929804861e-05, + "loss": 1.3509, + "step": 23305 + }, + { + "epoch": 6.97, + "grad_norm": 1.5335679054260254, + "learning_rate": 3.6448066598747365e-05, + "loss": 1.2393, + "step": 23310 + }, + { + "epoch": 6.98, + "grad_norm": 3.603954792022705, + "learning_rate": 3.644284326718131e-05, + "loss": 1.1396, + "step": 23315 + }, + { + "epoch": 6.98, + "grad_norm": 2.735201835632324, + "learning_rate": 3.6437619303638906e-05, + "loss": 1.0535, + "step": 23320 + }, + { + "epoch": 6.98, + "grad_norm": 2.017690420150757, + "learning_rate": 3.6432394708408684e-05, + "loss": 1.1505, + "step": 23325 + }, + { + "epoch": 6.98, + "grad_norm": 2.649900436401367, + "learning_rate": 3.6427169481779185e-05, + "loss": 1.1615, + "step": 23330 + }, + { + "epoch": 6.98, + "grad_norm": 3.7284982204437256, + "learning_rate": 3.642194362403899e-05, + "loss": 1.0333, + "step": 23335 + }, + { + "epoch": 6.98, + "grad_norm": 1.5677194595336914, + "learning_rate": 3.6416717135476726e-05, + "loss": 1.1199, + "step": 23340 + }, + { + "epoch": 6.98, + "grad_norm": 1.4740290641784668, + "learning_rate": 3.6411490016381036e-05, + "loss": 1.2606, + "step": 23345 + }, + { + "epoch": 6.99, + "grad_norm": 2.8093674182891846, + "learning_rate": 3.640626226704063e-05, + "loss": 1.0608, + "step": 23350 + }, + { + "epoch": 6.99, + "grad_norm": 2.619047164916992, + "learning_rate": 3.640103388774419e-05, + "loss": 1.2281, + "step": 23355 + }, + { + "epoch": 6.99, + "grad_norm": 3.5901520252227783, + "learning_rate": 3.6395804878780514e-05, + "loss": 1.0943, + "step": 23360 + }, + { + "epoch": 6.99, + "grad_norm": 2.7254862785339355, + "learning_rate": 3.639057524043838e-05, + "loss": 1.1382, + "step": 23365 + }, + { + "epoch": 6.99, + "grad_norm": 2.1018872261047363, + "learning_rate": 3.638534497300661e-05, + "loss": 1.1479, + "step": 23370 + }, + { + "epoch": 6.99, + "grad_norm": 3.5326809883117676, + "learning_rate": 3.6380114076774076e-05, + "loss": 1.2146, + "step": 23375 + }, + { + "epoch": 7.0, + "grad_norm": 3.996938467025757, + "learning_rate": 3.637488255202967e-05, + "loss": 1.1037, + "step": 23380 + }, + { + "epoch": 7.0, + "grad_norm": 5.027413845062256, + "learning_rate": 3.636965039906232e-05, + "loss": 1.1256, + "step": 23385 + }, + { + "epoch": 7.0, + "grad_norm": 2.1053760051727295, + "learning_rate": 3.6364417618161e-05, + "loss": 1.1905, + "step": 23390 + }, + { + "epoch": 7.0, + "grad_norm": 2.8059771060943604, + "learning_rate": 3.635918420961471e-05, + "loss": 1.2107, + "step": 23395 + }, + { + "epoch": 7.0, + "grad_norm": 3.2289247512817383, + "learning_rate": 3.635395017371248e-05, + "loss": 1.0534, + "step": 23400 + }, + { + "epoch": 7.0, + "grad_norm": 2.3690922260284424, + "learning_rate": 3.634871551074338e-05, + "loss": 1.0033, + "step": 23405 + }, + { + "epoch": 7.0, + "grad_norm": 1.940168857574463, + "learning_rate": 3.634348022099652e-05, + "loss": 1.1098, + "step": 23410 + }, + { + "epoch": 7.01, + "grad_norm": 2.454968214035034, + "learning_rate": 3.633824430476104e-05, + "loss": 1.0398, + "step": 23415 + }, + { + "epoch": 7.01, + "grad_norm": 2.5789737701416016, + "learning_rate": 3.6333007762326114e-05, + "loss": 1.1326, + "step": 23420 + }, + { + "epoch": 7.01, + "grad_norm": 2.819851875305176, + "learning_rate": 3.6327770593980946e-05, + "loss": 0.9739, + "step": 23425 + }, + { + "epoch": 7.01, + "grad_norm": Infinity, + "learning_rate": 3.6323580408843814e-05, + "loss": 1.1279, + "step": 23430 + }, + { + "epoch": 7.01, + "grad_norm": 2.4087419509887695, + "learning_rate": 3.631834211458914e-05, + "loss": 1.0759, + "step": 23435 + }, + { + "epoch": 7.01, + "grad_norm": 1.9813141822814941, + "learning_rate": 3.6313103195234194e-05, + "loss": 1.2067, + "step": 23440 + }, + { + "epoch": 7.01, + "grad_norm": 1.0360766649246216, + "learning_rate": 3.6307863651068315e-05, + "loss": 1.1645, + "step": 23445 + }, + { + "epoch": 7.02, + "grad_norm": 2.9632136821746826, + "learning_rate": 3.6302623482380876e-05, + "loss": 1.1632, + "step": 23450 + }, + { + "epoch": 7.02, + "grad_norm": 2.059884786605835, + "learning_rate": 3.62973826894613e-05, + "loss": 1.0065, + "step": 23455 + }, + { + "epoch": 7.02, + "grad_norm": 1.9883657693862915, + "learning_rate": 3.629214127259901e-05, + "loss": 1.1301, + "step": 23460 + }, + { + "epoch": 7.02, + "grad_norm": 1.7367918491363525, + "learning_rate": 3.628689923208351e-05, + "loss": 1.1835, + "step": 23465 + }, + { + "epoch": 7.02, + "grad_norm": 1.1696290969848633, + "learning_rate": 3.6281656568204285e-05, + "loss": 1.241, + "step": 23470 + }, + { + "epoch": 7.02, + "grad_norm": 2.884432315826416, + "learning_rate": 3.62764132812509e-05, + "loss": 1.0894, + "step": 23475 + }, + { + "epoch": 7.02, + "grad_norm": 1.4623018503189087, + "learning_rate": 3.627116937151293e-05, + "loss": 1.1655, + "step": 23480 + }, + { + "epoch": 7.03, + "grad_norm": 4.936588287353516, + "learning_rate": 3.626592483927999e-05, + "loss": 1.0457, + "step": 23485 + }, + { + "epoch": 7.03, + "grad_norm": 2.926048517227173, + "learning_rate": 3.626067968484172e-05, + "loss": 1.1325, + "step": 23490 + }, + { + "epoch": 7.03, + "grad_norm": 2.242962598800659, + "learning_rate": 3.625543390848783e-05, + "loss": 1.0459, + "step": 23495 + }, + { + "epoch": 7.03, + "grad_norm": 1.152270793914795, + "learning_rate": 3.625018751050803e-05, + "loss": 0.974, + "step": 23500 + }, + { + "epoch": 7.03, + "grad_norm": 2.016425371170044, + "learning_rate": 3.624494049119205e-05, + "loss": 1.1342, + "step": 23505 + }, + { + "epoch": 7.03, + "grad_norm": 2.164924144744873, + "learning_rate": 3.623969285082971e-05, + "loss": 1.1836, + "step": 23510 + }, + { + "epoch": 7.04, + "grad_norm": 2.6692936420440674, + "learning_rate": 3.623444458971081e-05, + "loss": 1.2556, + "step": 23515 + }, + { + "epoch": 7.04, + "grad_norm": 1.107765793800354, + "learning_rate": 3.622919570812521e-05, + "loss": 1.2507, + "step": 23520 + }, + { + "epoch": 7.04, + "grad_norm": 1.9478957653045654, + "learning_rate": 3.622394620636281e-05, + "loss": 1.0116, + "step": 23525 + }, + { + "epoch": 7.04, + "grad_norm": 2.816403865814209, + "learning_rate": 3.621869608471352e-05, + "loss": 0.9359, + "step": 23530 + }, + { + "epoch": 7.04, + "grad_norm": 2.90093994140625, + "learning_rate": 3.621344534346732e-05, + "loss": 1.109, + "step": 23535 + }, + { + "epoch": 7.04, + "grad_norm": 2.196246385574341, + "learning_rate": 3.620819398291418e-05, + "loss": 1.0691, + "step": 23540 + }, + { + "epoch": 7.04, + "grad_norm": 2.3277361392974854, + "learning_rate": 3.6202942003344134e-05, + "loss": 1.0831, + "step": 23545 + }, + { + "epoch": 7.05, + "grad_norm": 3.5473828315734863, + "learning_rate": 3.619768940504725e-05, + "loss": 1.0406, + "step": 23550 + }, + { + "epoch": 7.05, + "grad_norm": 3.1161794662475586, + "learning_rate": 3.619243618831362e-05, + "loss": 1.145, + "step": 23555 + }, + { + "epoch": 7.05, + "grad_norm": 1.5470253229141235, + "learning_rate": 3.618718235343337e-05, + "loss": 1.0475, + "step": 23560 + }, + { + "epoch": 7.05, + "grad_norm": 1.8705344200134277, + "learning_rate": 3.618192790069668e-05, + "loss": 1.1717, + "step": 23565 + }, + { + "epoch": 7.05, + "grad_norm": 2.805480480194092, + "learning_rate": 3.617667283039372e-05, + "loss": 1.1476, + "step": 23570 + }, + { + "epoch": 7.05, + "grad_norm": 1.1502296924591064, + "learning_rate": 3.617141714281473e-05, + "loss": 0.9713, + "step": 23575 + }, + { + "epoch": 7.05, + "grad_norm": 2.4745311737060547, + "learning_rate": 3.616616083825e-05, + "loss": 1.1653, + "step": 23580 + }, + { + "epoch": 7.06, + "grad_norm": 1.45720636844635, + "learning_rate": 3.616090391698981e-05, + "loss": 1.2729, + "step": 23585 + }, + { + "epoch": 7.06, + "grad_norm": 1.0925184488296509, + "learning_rate": 3.615564637932449e-05, + "loss": 0.9977, + "step": 23590 + }, + { + "epoch": 7.06, + "grad_norm": 2.9500515460968018, + "learning_rate": 3.615038822554442e-05, + "loss": 1.0472, + "step": 23595 + }, + { + "epoch": 7.06, + "grad_norm": 1.715450644493103, + "learning_rate": 3.614512945594001e-05, + "loss": 1.2471, + "step": 23600 + }, + { + "epoch": 7.06, + "grad_norm": 1.3677220344543457, + "learning_rate": 3.6139870070801675e-05, + "loss": 1.0056, + "step": 23605 + }, + { + "epoch": 7.06, + "grad_norm": 1.0849230289459229, + "learning_rate": 3.613461007041989e-05, + "loss": 1.0685, + "step": 23610 + }, + { + "epoch": 7.07, + "grad_norm": 1.6670022010803223, + "learning_rate": 3.612934945508517e-05, + "loss": 1.1105, + "step": 23615 + }, + { + "epoch": 7.07, + "grad_norm": 1.0212535858154297, + "learning_rate": 3.612408822508805e-05, + "loss": 1.1527, + "step": 23620 + }, + { + "epoch": 7.07, + "grad_norm": 3.0256199836730957, + "learning_rate": 3.611882638071909e-05, + "loss": 1.1113, + "step": 23625 + }, + { + "epoch": 7.07, + "grad_norm": 7.463032245635986, + "learning_rate": 3.6113563922268926e-05, + "loss": 1.104, + "step": 23630 + }, + { + "epoch": 7.07, + "grad_norm": 2.4572513103485107, + "learning_rate": 3.6108300850028165e-05, + "loss": 1.2712, + "step": 23635 + }, + { + "epoch": 7.07, + "grad_norm": 2.344938278198242, + "learning_rate": 3.61030371642875e-05, + "loss": 0.9881, + "step": 23640 + }, + { + "epoch": 7.07, + "grad_norm": 2.6021649837493896, + "learning_rate": 3.609777286533763e-05, + "loss": 1.2082, + "step": 23645 + }, + { + "epoch": 7.08, + "grad_norm": 2.0514180660247803, + "learning_rate": 3.6092507953469305e-05, + "loss": 0.9766, + "step": 23650 + }, + { + "epoch": 7.08, + "grad_norm": 4.5105881690979, + "learning_rate": 3.608724242897328e-05, + "loss": 0.937, + "step": 23655 + }, + { + "epoch": 7.08, + "grad_norm": 3.2438158988952637, + "learning_rate": 3.6081976292140395e-05, + "loss": 1.138, + "step": 23660 + }, + { + "epoch": 7.08, + "grad_norm": 3.015331745147705, + "learning_rate": 3.607670954326147e-05, + "loss": 1.1613, + "step": 23665 + }, + { + "epoch": 7.08, + "grad_norm": 1.9946476221084595, + "learning_rate": 3.6071442182627395e-05, + "loss": 1.2047, + "step": 23670 + }, + { + "epoch": 7.08, + "grad_norm": 3.2494962215423584, + "learning_rate": 3.606617421052908e-05, + "loss": 1.0119, + "step": 23675 + }, + { + "epoch": 7.08, + "grad_norm": 2.6945230960845947, + "learning_rate": 3.6060905627257455e-05, + "loss": 1.0004, + "step": 23680 + }, + { + "epoch": 7.09, + "grad_norm": 2.1307501792907715, + "learning_rate": 3.6055636433103524e-05, + "loss": 0.9985, + "step": 23685 + }, + { + "epoch": 7.09, + "grad_norm": 1.7671560049057007, + "learning_rate": 3.6050366628358276e-05, + "loss": 1.1638, + "step": 23690 + }, + { + "epoch": 7.09, + "grad_norm": 4.131464004516602, + "learning_rate": 3.6045096213312766e-05, + "loss": 1.0299, + "step": 23695 + }, + { + "epoch": 7.09, + "grad_norm": 1.8803554773330688, + "learning_rate": 3.6039825188258075e-05, + "loss": 1.1465, + "step": 23700 + }, + { + "epoch": 7.09, + "grad_norm": 4.313520908355713, + "learning_rate": 3.603455355348531e-05, + "loss": 1.1605, + "step": 23705 + }, + { + "epoch": 7.09, + "grad_norm": 1.4319663047790527, + "learning_rate": 3.602928130928563e-05, + "loss": 1.13, + "step": 23710 + }, + { + "epoch": 7.1, + "grad_norm": 1.6968344449996948, + "learning_rate": 3.60240084559502e-05, + "loss": 1.0394, + "step": 23715 + }, + { + "epoch": 7.1, + "grad_norm": 2.2089247703552246, + "learning_rate": 3.601873499377024e-05, + "loss": 1.0584, + "step": 23720 + }, + { + "epoch": 7.1, + "grad_norm": 2.618340253829956, + "learning_rate": 3.601346092303701e-05, + "loss": 1.1993, + "step": 23725 + }, + { + "epoch": 7.1, + "grad_norm": 1.327785849571228, + "learning_rate": 3.600818624404177e-05, + "loss": 1.0758, + "step": 23730 + }, + { + "epoch": 7.1, + "grad_norm": 3.1127471923828125, + "learning_rate": 3.600291095707585e-05, + "loss": 1.1543, + "step": 23735 + }, + { + "epoch": 7.1, + "grad_norm": 1.783038854598999, + "learning_rate": 3.59976350624306e-05, + "loss": 1.0839, + "step": 23740 + }, + { + "epoch": 7.1, + "grad_norm": 1.7736161947250366, + "learning_rate": 3.5992358560397394e-05, + "loss": 1.0779, + "step": 23745 + }, + { + "epoch": 7.11, + "grad_norm": 2.9156086444854736, + "learning_rate": 3.5987081451267646e-05, + "loss": 1.1455, + "step": 23750 + }, + { + "epoch": 7.11, + "grad_norm": 3.304860830307007, + "learning_rate": 3.598180373533281e-05, + "loss": 1.0505, + "step": 23755 + }, + { + "epoch": 7.11, + "grad_norm": 1.9659478664398193, + "learning_rate": 3.597652541288438e-05, + "loss": 1.1696, + "step": 23760 + }, + { + "epoch": 7.11, + "grad_norm": 3.989715814590454, + "learning_rate": 3.597124648421384e-05, + "loss": 1.1278, + "step": 23765 + }, + { + "epoch": 7.11, + "grad_norm": 2.289445400238037, + "learning_rate": 3.596596694961278e-05, + "loss": 1.2564, + "step": 23770 + }, + { + "epoch": 7.11, + "grad_norm": 1.6638239622116089, + "learning_rate": 3.5960686809372756e-05, + "loss": 1.2167, + "step": 23775 + }, + { + "epoch": 7.11, + "grad_norm": 2.3342809677124023, + "learning_rate": 3.595540606378539e-05, + "loss": 1.1625, + "step": 23780 + }, + { + "epoch": 7.12, + "grad_norm": 2.215494155883789, + "learning_rate": 3.595012471314234e-05, + "loss": 0.9338, + "step": 23785 + }, + { + "epoch": 7.12, + "grad_norm": 2.7711009979248047, + "learning_rate": 3.594484275773529e-05, + "loss": 1.0951, + "step": 23790 + }, + { + "epoch": 7.12, + "grad_norm": 2.727311611175537, + "learning_rate": 3.593956019785594e-05, + "loss": 1.1065, + "step": 23795 + }, + { + "epoch": 7.12, + "grad_norm": 1.7198123931884766, + "learning_rate": 3.5934277033796055e-05, + "loss": 1.102, + "step": 23800 + }, + { + "epoch": 7.12, + "grad_norm": 1.2289972305297852, + "learning_rate": 3.592899326584741e-05, + "loss": 1.1591, + "step": 23805 + }, + { + "epoch": 7.12, + "grad_norm": 9.438529968261719, + "learning_rate": 3.5923708894301836e-05, + "loss": 1.2098, + "step": 23810 + }, + { + "epoch": 7.13, + "grad_norm": 2.6810221672058105, + "learning_rate": 3.591842391945117e-05, + "loss": 1.1817, + "step": 23815 + }, + { + "epoch": 7.13, + "grad_norm": 3.3912224769592285, + "learning_rate": 3.591313834158729e-05, + "loss": 1.1386, + "step": 23820 + }, + { + "epoch": 7.13, + "grad_norm": 3.810546398162842, + "learning_rate": 3.590785216100214e-05, + "loss": 1.1549, + "step": 23825 + }, + { + "epoch": 7.13, + "grad_norm": 2.7697300910949707, + "learning_rate": 3.590256537798765e-05, + "loss": 1.2004, + "step": 23830 + }, + { + "epoch": 7.13, + "grad_norm": 1.2456960678100586, + "learning_rate": 3.58972779928358e-05, + "loss": 1.1418, + "step": 23835 + }, + { + "epoch": 7.13, + "grad_norm": 2.144900321960449, + "learning_rate": 3.589199000583862e-05, + "loss": 1.199, + "step": 23840 + }, + { + "epoch": 7.13, + "grad_norm": 3.3847482204437256, + "learning_rate": 3.588670141728815e-05, + "loss": 1.1028, + "step": 23845 + }, + { + "epoch": 7.14, + "grad_norm": 2.134644031524658, + "learning_rate": 3.5881412227476476e-05, + "loss": 1.0711, + "step": 23850 + }, + { + "epoch": 7.14, + "grad_norm": 1.6338741779327393, + "learning_rate": 3.587612243669571e-05, + "loss": 1.1405, + "step": 23855 + }, + { + "epoch": 7.14, + "grad_norm": 4.937558650970459, + "learning_rate": 3.5870832045238013e-05, + "loss": 1.1193, + "step": 23860 + }, + { + "epoch": 7.14, + "grad_norm": 2.3749191761016846, + "learning_rate": 3.5865541053395564e-05, + "loss": 1.0552, + "step": 23865 + }, + { + "epoch": 7.14, + "grad_norm": 1.3177196979522705, + "learning_rate": 3.586024946146057e-05, + "loss": 1.2673, + "step": 23870 + }, + { + "epoch": 7.14, + "grad_norm": 5.730068206787109, + "learning_rate": 3.585495726972529e-05, + "loss": 1.181, + "step": 23875 + }, + { + "epoch": 7.14, + "grad_norm": 2.8973639011383057, + "learning_rate": 3.584966447848201e-05, + "loss": 1.1447, + "step": 23880 + }, + { + "epoch": 7.15, + "grad_norm": 2.603912830352783, + "learning_rate": 3.584437108802303e-05, + "loss": 1.1617, + "step": 23885 + }, + { + "epoch": 7.15, + "grad_norm": 2.1435694694519043, + "learning_rate": 3.583907709864072e-05, + "loss": 1.1438, + "step": 23890 + }, + { + "epoch": 7.15, + "grad_norm": 3.071255922317505, + "learning_rate": 3.5833782510627436e-05, + "loss": 1.1503, + "step": 23895 + }, + { + "epoch": 7.15, + "grad_norm": 2.3412327766418457, + "learning_rate": 3.582848732427561e-05, + "loss": 1.2069, + "step": 23900 + }, + { + "epoch": 7.15, + "grad_norm": 2.986461639404297, + "learning_rate": 3.5823191539877674e-05, + "loss": 1.0559, + "step": 23905 + }, + { + "epoch": 7.15, + "grad_norm": 1.189259648323059, + "learning_rate": 3.581789515772613e-05, + "loss": 1.2836, + "step": 23910 + }, + { + "epoch": 7.16, + "grad_norm": 2.6002511978149414, + "learning_rate": 3.5812598178113476e-05, + "loss": 1.2934, + "step": 23915 + }, + { + "epoch": 7.16, + "grad_norm": 1.7060097455978394, + "learning_rate": 3.580730060133227e-05, + "loss": 1.1425, + "step": 23920 + }, + { + "epoch": 7.16, + "grad_norm": 2.59647536277771, + "learning_rate": 3.580200242767508e-05, + "loss": 1.0499, + "step": 23925 + }, + { + "epoch": 7.16, + "grad_norm": 1.8929245471954346, + "learning_rate": 3.5796703657434526e-05, + "loss": 1.2425, + "step": 23930 + }, + { + "epoch": 7.16, + "grad_norm": 1.3510212898254395, + "learning_rate": 3.579140429090325e-05, + "loss": 1.0355, + "step": 23935 + }, + { + "epoch": 7.16, + "grad_norm": 1.923653483390808, + "learning_rate": 3.578610432837393e-05, + "loss": 1.1344, + "step": 23940 + }, + { + "epoch": 7.16, + "grad_norm": 2.9542462825775146, + "learning_rate": 3.578080377013928e-05, + "loss": 1.2874, + "step": 23945 + }, + { + "epoch": 7.17, + "grad_norm": 3.119513988494873, + "learning_rate": 3.577550261649204e-05, + "loss": 1.1657, + "step": 23950 + }, + { + "epoch": 7.17, + "grad_norm": 2.2563698291778564, + "learning_rate": 3.5770200867725e-05, + "loss": 1.3417, + "step": 23955 + }, + { + "epoch": 7.17, + "grad_norm": 3.3380630016326904, + "learning_rate": 3.5764898524130965e-05, + "loss": 1.2012, + "step": 23960 + }, + { + "epoch": 7.17, + "grad_norm": 3.131051540374756, + "learning_rate": 3.5759595586002765e-05, + "loss": 1.0301, + "step": 23965 + }, + { + "epoch": 7.17, + "grad_norm": 4.349595069885254, + "learning_rate": 3.575429205363329e-05, + "loss": 1.1835, + "step": 23970 + }, + { + "epoch": 7.17, + "grad_norm": 1.5668723583221436, + "learning_rate": 3.574898792731544e-05, + "loss": 1.2774, + "step": 23975 + }, + { + "epoch": 7.17, + "grad_norm": 6.6469855308532715, + "learning_rate": 3.5743683207342154e-05, + "loss": 1.2421, + "step": 23980 + }, + { + "epoch": 7.18, + "grad_norm": 1.9514319896697998, + "learning_rate": 3.573837789400643e-05, + "loss": 1.1226, + "step": 23985 + }, + { + "epoch": 7.18, + "grad_norm": 1.3996671438217163, + "learning_rate": 3.5733071987601235e-05, + "loss": 1.195, + "step": 23990 + }, + { + "epoch": 7.18, + "grad_norm": 0.9684573411941528, + "learning_rate": 3.572776548841964e-05, + "loss": 1.1764, + "step": 23995 + }, + { + "epoch": 7.18, + "grad_norm": 1.6927798986434937, + "learning_rate": 3.572245839675471e-05, + "loss": 1.094, + "step": 24000 + }, + { + "epoch": 7.18, + "grad_norm": 3.316277265548706, + "learning_rate": 3.571715071289954e-05, + "loss": 1.0117, + "step": 24005 + }, + { + "epoch": 7.18, + "grad_norm": 3.6141722202301025, + "learning_rate": 3.571184243714729e-05, + "loss": 1.0562, + "step": 24010 + }, + { + "epoch": 7.19, + "grad_norm": 3.6086862087249756, + "learning_rate": 3.570653356979111e-05, + "loss": 1.1708, + "step": 24015 + }, + { + "epoch": 7.19, + "grad_norm": 1.5085479021072388, + "learning_rate": 3.5701224111124206e-05, + "loss": 1.1285, + "step": 24020 + }, + { + "epoch": 7.19, + "grad_norm": 3.2829747200012207, + "learning_rate": 3.5695914061439816e-05, + "loss": 1.227, + "step": 24025 + }, + { + "epoch": 7.19, + "grad_norm": 3.2367653846740723, + "learning_rate": 3.569060342103121e-05, + "loss": 1.0892, + "step": 24030 + }, + { + "epoch": 7.19, + "grad_norm": 1.7712103128433228, + "learning_rate": 3.56852921901917e-05, + "loss": 1.0643, + "step": 24035 + }, + { + "epoch": 7.19, + "grad_norm": 3.2912991046905518, + "learning_rate": 3.56799803692146e-05, + "loss": 1.1242, + "step": 24040 + }, + { + "epoch": 7.19, + "grad_norm": 2.6889803409576416, + "learning_rate": 3.5674667958393286e-05, + "loss": 1.1923, + "step": 24045 + }, + { + "epoch": 7.2, + "grad_norm": 3.47308349609375, + "learning_rate": 3.566935495802117e-05, + "loss": 1.2753, + "step": 24050 + }, + { + "epoch": 7.2, + "grad_norm": 2.2733964920043945, + "learning_rate": 3.566404136839165e-05, + "loss": 1.2105, + "step": 24055 + }, + { + "epoch": 7.2, + "grad_norm": 1.1582354307174683, + "learning_rate": 3.565872718979822e-05, + "loss": 0.9991, + "step": 24060 + }, + { + "epoch": 7.2, + "grad_norm": 2.0912468433380127, + "learning_rate": 3.565341242253437e-05, + "loss": 1.1606, + "step": 24065 + }, + { + "epoch": 7.2, + "grad_norm": 4.56016731262207, + "learning_rate": 3.5648097066893614e-05, + "loss": 1.2603, + "step": 24070 + }, + { + "epoch": 7.2, + "grad_norm": 2.5408365726470947, + "learning_rate": 3.564278112316953e-05, + "loss": 1.1349, + "step": 24075 + }, + { + "epoch": 7.2, + "grad_norm": 2.359050989151001, + "learning_rate": 3.563746459165571e-05, + "loss": 1.1326, + "step": 24080 + }, + { + "epoch": 7.21, + "grad_norm": 2.704974889755249, + "learning_rate": 3.563214747264578e-05, + "loss": 1.2259, + "step": 24085 + }, + { + "epoch": 7.21, + "grad_norm": 3.2640304565429688, + "learning_rate": 3.562682976643339e-05, + "loss": 1.0601, + "step": 24090 + }, + { + "epoch": 7.21, + "grad_norm": 2.6645710468292236, + "learning_rate": 3.562151147331224e-05, + "loss": 1.2108, + "step": 24095 + }, + { + "epoch": 7.21, + "grad_norm": 3.9869096279144287, + "learning_rate": 3.5616192593576055e-05, + "loss": 1.144, + "step": 24100 + }, + { + "epoch": 7.21, + "grad_norm": 1.9926941394805908, + "learning_rate": 3.561087312751858e-05, + "loss": 1.0182, + "step": 24105 + }, + { + "epoch": 7.21, + "grad_norm": 1.0935914516448975, + "learning_rate": 3.560555307543362e-05, + "loss": 1.3015, + "step": 24110 + }, + { + "epoch": 7.21, + "grad_norm": 3.214768409729004, + "learning_rate": 3.5600232437614984e-05, + "loss": 1.3117, + "step": 24115 + }, + { + "epoch": 7.22, + "grad_norm": 3.5183026790618896, + "learning_rate": 3.559491121435653e-05, + "loss": 1.1814, + "step": 24120 + }, + { + "epoch": 7.22, + "grad_norm": 0.981540858745575, + "learning_rate": 3.558958940595214e-05, + "loss": 1.2303, + "step": 24125 + }, + { + "epoch": 7.22, + "grad_norm": 2.553018569946289, + "learning_rate": 3.558426701269574e-05, + "loss": 1.1696, + "step": 24130 + }, + { + "epoch": 7.22, + "grad_norm": 2.0565733909606934, + "learning_rate": 3.557894403488127e-05, + "loss": 1.2089, + "step": 24135 + }, + { + "epoch": 7.22, + "grad_norm": 1.6102100610733032, + "learning_rate": 3.5573620472802714e-05, + "loss": 1.1943, + "step": 24140 + }, + { + "epoch": 7.22, + "grad_norm": 1.952919363975525, + "learning_rate": 3.55682963267541e-05, + "loss": 1.0149, + "step": 24145 + }, + { + "epoch": 7.23, + "grad_norm": 2.255753517150879, + "learning_rate": 3.556297159702946e-05, + "loss": 1.1736, + "step": 24150 + }, + { + "epoch": 7.23, + "grad_norm": 2.6610512733459473, + "learning_rate": 3.5557646283922875e-05, + "loss": 1.2155, + "step": 24155 + }, + { + "epoch": 7.23, + "grad_norm": 3.2673702239990234, + "learning_rate": 3.555232038772846e-05, + "loss": 0.9963, + "step": 24160 + }, + { + "epoch": 7.23, + "grad_norm": 3.3183624744415283, + "learning_rate": 3.554699390874036e-05, + "loss": 1.101, + "step": 24165 + }, + { + "epoch": 7.23, + "grad_norm": 1.4142159223556519, + "learning_rate": 3.554166684725275e-05, + "loss": 1.0708, + "step": 24170 + }, + { + "epoch": 7.23, + "grad_norm": 2.9164345264434814, + "learning_rate": 3.553633920355983e-05, + "loss": 1.1302, + "step": 24175 + }, + { + "epoch": 7.23, + "grad_norm": 2.0309815406799316, + "learning_rate": 3.5531010977955865e-05, + "loss": 1.0906, + "step": 24180 + }, + { + "epoch": 7.24, + "grad_norm": 2.8416526317596436, + "learning_rate": 3.55256821707351e-05, + "loss": 1.0474, + "step": 24185 + }, + { + "epoch": 7.24, + "grad_norm": 0.9157441854476929, + "learning_rate": 3.5520352782191845e-05, + "loss": 1.2212, + "step": 24190 + }, + { + "epoch": 7.24, + "grad_norm": 6.699018478393555, + "learning_rate": 3.5515022812620446e-05, + "loss": 1.1924, + "step": 24195 + }, + { + "epoch": 7.24, + "grad_norm": 4.399092674255371, + "learning_rate": 3.550969226231527e-05, + "loss": 1.047, + "step": 24200 + }, + { + "epoch": 7.24, + "grad_norm": 2.4295222759246826, + "learning_rate": 3.550436113157071e-05, + "loss": 1.2108, + "step": 24205 + }, + { + "epoch": 7.24, + "grad_norm": 3.0615289211273193, + "learning_rate": 3.54990294206812e-05, + "loss": 1.0198, + "step": 24210 + }, + { + "epoch": 7.24, + "grad_norm": 5.101573467254639, + "learning_rate": 3.549369712994122e-05, + "loss": 1.0861, + "step": 24215 + }, + { + "epoch": 7.25, + "grad_norm": 2.2024030685424805, + "learning_rate": 3.548836425964524e-05, + "loss": 1.0598, + "step": 24220 + }, + { + "epoch": 7.25, + "grad_norm": 3.093062400817871, + "learning_rate": 3.548303081008781e-05, + "loss": 1.1864, + "step": 24225 + }, + { + "epoch": 7.25, + "grad_norm": 1.283988118171692, + "learning_rate": 3.547769678156349e-05, + "loss": 1.2008, + "step": 24230 + }, + { + "epoch": 7.25, + "grad_norm": 1.7267249822616577, + "learning_rate": 3.547236217436686e-05, + "loss": 1.1673, + "step": 24235 + }, + { + "epoch": 7.25, + "grad_norm": 2.7360587120056152, + "learning_rate": 3.546702698879256e-05, + "loss": 1.0917, + "step": 24240 + }, + { + "epoch": 7.25, + "grad_norm": 3.3460683822631836, + "learning_rate": 3.546169122513524e-05, + "loss": 1.1798, + "step": 24245 + }, + { + "epoch": 7.26, + "grad_norm": 3.3018956184387207, + "learning_rate": 3.545635488368959e-05, + "loss": 1.1226, + "step": 24250 + }, + { + "epoch": 7.26, + "grad_norm": 1.762122392654419, + "learning_rate": 3.5451017964750316e-05, + "loss": 1.1001, + "step": 24255 + }, + { + "epoch": 7.26, + "grad_norm": 2.407038688659668, + "learning_rate": 3.544568046861219e-05, + "loss": 1.1731, + "step": 24260 + }, + { + "epoch": 7.26, + "grad_norm": 2.4787652492523193, + "learning_rate": 3.544034239556999e-05, + "loss": 1.0903, + "step": 24265 + }, + { + "epoch": 7.26, + "grad_norm": 3.5315353870391846, + "learning_rate": 3.543500374591853e-05, + "loss": 1.2661, + "step": 24270 + }, + { + "epoch": 7.26, + "grad_norm": 1.81735360622406, + "learning_rate": 3.5429664519952664e-05, + "loss": 1.0972, + "step": 24275 + }, + { + "epoch": 7.26, + "grad_norm": 3.0193166732788086, + "learning_rate": 3.542432471796726e-05, + "loss": 1.227, + "step": 24280 + }, + { + "epoch": 7.27, + "grad_norm": 2.1791579723358154, + "learning_rate": 3.5418984340257245e-05, + "loss": 1.1118, + "step": 24285 + }, + { + "epoch": 7.27, + "grad_norm": 2.2652621269226074, + "learning_rate": 3.541364338711755e-05, + "loss": 1.2193, + "step": 24290 + }, + { + "epoch": 7.27, + "grad_norm": 3.2821550369262695, + "learning_rate": 3.5408301858843155e-05, + "loss": 1.1751, + "step": 24295 + }, + { + "epoch": 7.27, + "grad_norm": 1.8815666437149048, + "learning_rate": 3.540295975572907e-05, + "loss": 1.0835, + "step": 24300 + }, + { + "epoch": 7.27, + "grad_norm": 6.063453674316406, + "learning_rate": 3.539761707807032e-05, + "loss": 1.081, + "step": 24305 + }, + { + "epoch": 7.27, + "grad_norm": 1.7175376415252686, + "learning_rate": 3.5392273826162004e-05, + "loss": 1.1278, + "step": 24310 + }, + { + "epoch": 7.27, + "grad_norm": 1.4363011121749878, + "learning_rate": 3.538693000029919e-05, + "loss": 1.0402, + "step": 24315 + }, + { + "epoch": 7.28, + "grad_norm": 2.4211199283599854, + "learning_rate": 3.538158560077704e-05, + "loss": 1.3065, + "step": 24320 + }, + { + "epoch": 7.28, + "grad_norm": 4.585165023803711, + "learning_rate": 3.537624062789071e-05, + "loss": 1.0919, + "step": 24325 + }, + { + "epoch": 7.28, + "grad_norm": 2.929621934890747, + "learning_rate": 3.537089508193539e-05, + "loss": 1.2163, + "step": 24330 + }, + { + "epoch": 7.28, + "grad_norm": 3.6469857692718506, + "learning_rate": 3.536554896320632e-05, + "loss": 1.3069, + "step": 24335 + }, + { + "epoch": 7.28, + "grad_norm": 2.2444517612457275, + "learning_rate": 3.536020227199875e-05, + "loss": 1.2386, + "step": 24340 + }, + { + "epoch": 7.28, + "grad_norm": 1.8699992895126343, + "learning_rate": 3.535485500860798e-05, + "loss": 1.1221, + "step": 24345 + }, + { + "epoch": 7.29, + "grad_norm": 2.0010950565338135, + "learning_rate": 3.5349507173329324e-05, + "loss": 1.157, + "step": 24350 + }, + { + "epoch": 7.29, + "grad_norm": 1.865695595741272, + "learning_rate": 3.534415876645815e-05, + "loss": 0.9495, + "step": 24355 + }, + { + "epoch": 7.29, + "grad_norm": 1.9279874563217163, + "learning_rate": 3.533880978828984e-05, + "loss": 1.2705, + "step": 24360 + }, + { + "epoch": 7.29, + "grad_norm": 2.339104175567627, + "learning_rate": 3.5333460239119814e-05, + "loss": 1.081, + "step": 24365 + }, + { + "epoch": 7.29, + "grad_norm": 2.058978319168091, + "learning_rate": 3.5328110119243515e-05, + "loss": 0.9748, + "step": 24370 + }, + { + "epoch": 7.29, + "grad_norm": 5.336772441864014, + "learning_rate": 3.532275942895644e-05, + "loss": 1.1164, + "step": 24375 + }, + { + "epoch": 7.29, + "grad_norm": 13.560040473937988, + "learning_rate": 3.531740816855408e-05, + "loss": 1.2611, + "step": 24380 + }, + { + "epoch": 7.3, + "grad_norm": 3.361783742904663, + "learning_rate": 3.5312056338331986e-05, + "loss": 1.0729, + "step": 24385 + }, + { + "epoch": 7.3, + "grad_norm": 1.754549503326416, + "learning_rate": 3.530670393858575e-05, + "loss": 1.1315, + "step": 24390 + }, + { + "epoch": 7.3, + "grad_norm": 1.895077109336853, + "learning_rate": 3.530135096961097e-05, + "loss": 1.1235, + "step": 24395 + }, + { + "epoch": 7.3, + "grad_norm": 2.7250521183013916, + "learning_rate": 3.529599743170328e-05, + "loss": 0.9917, + "step": 24400 + }, + { + "epoch": 7.3, + "grad_norm": 3.637592077255249, + "learning_rate": 3.529064332515836e-05, + "loss": 1.015, + "step": 24405 + }, + { + "epoch": 7.3, + "grad_norm": 2.967895984649658, + "learning_rate": 3.5285288650271896e-05, + "loss": 1.285, + "step": 24410 + }, + { + "epoch": 7.3, + "grad_norm": 3.107884407043457, + "learning_rate": 3.527993340733964e-05, + "loss": 1.0319, + "step": 24415 + }, + { + "epoch": 7.31, + "grad_norm": 1.9659065008163452, + "learning_rate": 3.527457759665734e-05, + "loss": 1.1129, + "step": 24420 + }, + { + "epoch": 7.31, + "grad_norm": 3.5118746757507324, + "learning_rate": 3.52692212185208e-05, + "loss": 1.1486, + "step": 24425 + }, + { + "epoch": 7.31, + "grad_norm": 3.2912468910217285, + "learning_rate": 3.526386427322585e-05, + "loss": 1.1864, + "step": 24430 + }, + { + "epoch": 7.31, + "grad_norm": 3.768181562423706, + "learning_rate": 3.5258506761068344e-05, + "loss": 1.2152, + "step": 24435 + }, + { + "epoch": 7.31, + "grad_norm": 4.644777774810791, + "learning_rate": 3.525314868234417e-05, + "loss": 1.145, + "step": 24440 + }, + { + "epoch": 7.31, + "grad_norm": 1.425614595413208, + "learning_rate": 3.524779003734925e-05, + "loss": 1.1543, + "step": 24445 + }, + { + "epoch": 7.32, + "grad_norm": 2.4408340454101562, + "learning_rate": 3.524243082637954e-05, + "loss": 1.1951, + "step": 24450 + }, + { + "epoch": 7.32, + "grad_norm": 1.1493414640426636, + "learning_rate": 3.523707104973102e-05, + "loss": 1.0654, + "step": 24455 + }, + { + "epoch": 7.32, + "grad_norm": 2.574376106262207, + "learning_rate": 3.523171070769972e-05, + "loss": 1.0558, + "step": 24460 + }, + { + "epoch": 7.32, + "grad_norm": 2.255390167236328, + "learning_rate": 3.522634980058166e-05, + "loss": 1.2315, + "step": 24465 + }, + { + "epoch": 7.32, + "grad_norm": 1.4033783674240112, + "learning_rate": 3.5220988328672935e-05, + "loss": 1.0303, + "step": 24470 + }, + { + "epoch": 7.32, + "grad_norm": 2.0258255004882812, + "learning_rate": 3.521562629226965e-05, + "loss": 1.1492, + "step": 24475 + }, + { + "epoch": 7.32, + "grad_norm": 2.6093392372131348, + "learning_rate": 3.521026369166793e-05, + "loss": 1.1797, + "step": 24480 + }, + { + "epoch": 7.33, + "grad_norm": 4.753375053405762, + "learning_rate": 3.520490052716397e-05, + "loss": 1.1525, + "step": 24485 + }, + { + "epoch": 7.33, + "grad_norm": 2.713651657104492, + "learning_rate": 3.5199536799053965e-05, + "loss": 1.0279, + "step": 24490 + }, + { + "epoch": 7.33, + "grad_norm": 2.4132895469665527, + "learning_rate": 3.5194172507634136e-05, + "loss": 1.0203, + "step": 24495 + }, + { + "epoch": 7.33, + "grad_norm": 2.83127760887146, + "learning_rate": 3.5188807653200764e-05, + "loss": 1.2186, + "step": 24500 + }, + { + "epoch": 7.33, + "grad_norm": 2.688488245010376, + "learning_rate": 3.5183442236050126e-05, + "loss": 1.2564, + "step": 24505 + }, + { + "epoch": 7.33, + "grad_norm": 1.0452677011489868, + "learning_rate": 3.517807625647857e-05, + "loss": 1.1982, + "step": 24510 + }, + { + "epoch": 7.33, + "grad_norm": 3.4765284061431885, + "learning_rate": 3.5172709714782435e-05, + "loss": 1.1172, + "step": 24515 + }, + { + "epoch": 7.34, + "grad_norm": 4.563774108886719, + "learning_rate": 3.5167342611258114e-05, + "loss": 1.2248, + "step": 24520 + }, + { + "epoch": 7.34, + "grad_norm": 3.0111424922943115, + "learning_rate": 3.5161974946202035e-05, + "loss": 0.9888, + "step": 24525 + }, + { + "epoch": 7.34, + "grad_norm": 1.582155466079712, + "learning_rate": 3.515660671991064e-05, + "loss": 1.0793, + "step": 24530 + }, + { + "epoch": 7.34, + "grad_norm": 2.5320088863372803, + "learning_rate": 3.515123793268042e-05, + "loss": 1.2144, + "step": 24535 + }, + { + "epoch": 7.34, + "grad_norm": 1.5951532125473022, + "learning_rate": 3.5145868584807875e-05, + "loss": 1.0229, + "step": 24540 + }, + { + "epoch": 7.34, + "grad_norm": 1.4666327238082886, + "learning_rate": 3.514049867658955e-05, + "loss": 1.273, + "step": 24545 + }, + { + "epoch": 7.35, + "grad_norm": 3.8786826133728027, + "learning_rate": 3.513512820832202e-05, + "loss": 1.1415, + "step": 24550 + }, + { + "epoch": 7.35, + "grad_norm": 1.4749125242233276, + "learning_rate": 3.5129757180301906e-05, + "loss": 1.2209, + "step": 24555 + }, + { + "epoch": 7.35, + "grad_norm": 6.20326566696167, + "learning_rate": 3.512438559282583e-05, + "loss": 0.953, + "step": 24560 + }, + { + "epoch": 7.35, + "grad_norm": 1.2090144157409668, + "learning_rate": 3.511901344619045e-05, + "loss": 1.195, + "step": 24565 + }, + { + "epoch": 7.35, + "grad_norm": 2.7025344371795654, + "learning_rate": 3.5113640740692485e-05, + "loss": 1.1066, + "step": 24570 + }, + { + "epoch": 7.35, + "grad_norm": 2.9294729232788086, + "learning_rate": 3.510826747662865e-05, + "loss": 1.1074, + "step": 24575 + }, + { + "epoch": 7.35, + "grad_norm": 3.470428943634033, + "learning_rate": 3.510289365429571e-05, + "loss": 1.166, + "step": 24580 + }, + { + "epoch": 7.36, + "grad_norm": 10.227189064025879, + "learning_rate": 3.509751927399046e-05, + "loss": 1.1535, + "step": 24585 + }, + { + "epoch": 7.36, + "grad_norm": 1.4875662326812744, + "learning_rate": 3.509214433600971e-05, + "loss": 1.2838, + "step": 24590 + }, + { + "epoch": 7.36, + "grad_norm": 2.2764317989349365, + "learning_rate": 3.508676884065032e-05, + "loss": 1.1669, + "step": 24595 + }, + { + "epoch": 7.36, + "grad_norm": 3.8440630435943604, + "learning_rate": 3.5081392788209176e-05, + "loss": 1.0723, + "step": 24600 + }, + { + "epoch": 7.36, + "grad_norm": 3.417236089706421, + "learning_rate": 3.507601617898319e-05, + "loss": 1.193, + "step": 24605 + }, + { + "epoch": 7.36, + "grad_norm": 2.8095319271087646, + "learning_rate": 3.5070639013269296e-05, + "loss": 1.1633, + "step": 24610 + }, + { + "epoch": 7.36, + "grad_norm": 4.380091190338135, + "learning_rate": 3.5065261291364485e-05, + "loss": 1.1677, + "step": 24615 + }, + { + "epoch": 7.37, + "grad_norm": 1.5369083881378174, + "learning_rate": 3.505988301356574e-05, + "loss": 1.2476, + "step": 24620 + }, + { + "epoch": 7.37, + "grad_norm": 1.350716233253479, + "learning_rate": 3.505450418017012e-05, + "loss": 1.0492, + "step": 24625 + }, + { + "epoch": 7.37, + "grad_norm": 1.859533429145813, + "learning_rate": 3.5049124791474696e-05, + "loss": 0.9475, + "step": 24630 + }, + { + "epoch": 7.37, + "grad_norm": 3.090188503265381, + "learning_rate": 3.504374484777655e-05, + "loss": 1.0635, + "step": 24635 + }, + { + "epoch": 7.37, + "grad_norm": 4.572514533996582, + "learning_rate": 3.503836434937281e-05, + "loss": 1.0462, + "step": 24640 + }, + { + "epoch": 7.37, + "grad_norm": 1.258280634880066, + "learning_rate": 3.503298329656064e-05, + "loss": 1.0908, + "step": 24645 + }, + { + "epoch": 7.37, + "grad_norm": 3.248828887939453, + "learning_rate": 3.5027601689637244e-05, + "loss": 1.1565, + "step": 24650 + }, + { + "epoch": 7.38, + "grad_norm": 3.4819695949554443, + "learning_rate": 3.502221952889981e-05, + "loss": 1.2237, + "step": 24655 + }, + { + "epoch": 7.38, + "grad_norm": 2.005113363265991, + "learning_rate": 3.5016836814645624e-05, + "loss": 1.2283, + "step": 24660 + }, + { + "epoch": 7.38, + "grad_norm": 4.542233943939209, + "learning_rate": 3.5011453547171954e-05, + "loss": 1.0564, + "step": 24665 + }, + { + "epoch": 7.38, + "grad_norm": 1.675430178642273, + "learning_rate": 3.50060697267761e-05, + "loss": 1.0971, + "step": 24670 + }, + { + "epoch": 7.38, + "grad_norm": 3.372847318649292, + "learning_rate": 3.500068535375543e-05, + "loss": 1.0592, + "step": 24675 + }, + { + "epoch": 7.38, + "grad_norm": 2.804337978363037, + "learning_rate": 3.499530042840728e-05, + "loss": 0.9945, + "step": 24680 + }, + { + "epoch": 7.39, + "grad_norm": 2.0974247455596924, + "learning_rate": 3.49899149510291e-05, + "loss": 1.0258, + "step": 24685 + }, + { + "epoch": 7.39, + "grad_norm": 1.919437289237976, + "learning_rate": 3.498452892191829e-05, + "loss": 1.028, + "step": 24690 + }, + { + "epoch": 7.39, + "grad_norm": 3.193256139755249, + "learning_rate": 3.4979142341372337e-05, + "loss": 1.1782, + "step": 24695 + }, + { + "epoch": 7.39, + "grad_norm": 2.895226240158081, + "learning_rate": 3.4973755209688716e-05, + "loss": 1.1318, + "step": 24700 + }, + { + "epoch": 7.39, + "grad_norm": 1.8412282466888428, + "learning_rate": 3.496836752716496e-05, + "loss": 1.335, + "step": 24705 + }, + { + "epoch": 7.39, + "grad_norm": 4.568256378173828, + "learning_rate": 3.496297929409863e-05, + "loss": 0.9776, + "step": 24710 + }, + { + "epoch": 7.39, + "grad_norm": 2.8621883392333984, + "learning_rate": 3.4957590510787306e-05, + "loss": 0.984, + "step": 24715 + }, + { + "epoch": 7.4, + "grad_norm": 3.2574169635772705, + "learning_rate": 3.495220117752861e-05, + "loss": 1.0085, + "step": 24720 + }, + { + "epoch": 7.4, + "grad_norm": 4.439530849456787, + "learning_rate": 3.4946811294620196e-05, + "loss": 1.0522, + "step": 24725 + }, + { + "epoch": 7.4, + "grad_norm": 3.148686170578003, + "learning_rate": 3.494142086235972e-05, + "loss": 0.8726, + "step": 24730 + }, + { + "epoch": 7.4, + "grad_norm": 8.771424293518066, + "learning_rate": 3.4936029881044917e-05, + "loss": 1.1329, + "step": 24735 + }, + { + "epoch": 7.4, + "grad_norm": 2.2926177978515625, + "learning_rate": 3.493063835097351e-05, + "loss": 1.1628, + "step": 24740 + }, + { + "epoch": 7.4, + "grad_norm": 1.3365390300750732, + "learning_rate": 3.4925246272443256e-05, + "loss": 1.2449, + "step": 24745 + }, + { + "epoch": 7.4, + "grad_norm": 2.2112948894500732, + "learning_rate": 3.491985364575197e-05, + "loss": 1.0303, + "step": 24750 + }, + { + "epoch": 7.41, + "grad_norm": 1.917383074760437, + "learning_rate": 3.4914460471197486e-05, + "loss": 1.3089, + "step": 24755 + }, + { + "epoch": 7.41, + "grad_norm": 3.297748565673828, + "learning_rate": 3.4909066749077654e-05, + "loss": 1.222, + "step": 24760 + }, + { + "epoch": 7.41, + "grad_norm": 2.1157360076904297, + "learning_rate": 3.490367247969036e-05, + "loss": 1.1927, + "step": 24765 + }, + { + "epoch": 7.41, + "grad_norm": 3.120800733566284, + "learning_rate": 3.489827766333353e-05, + "loss": 1.2292, + "step": 24770 + }, + { + "epoch": 7.41, + "grad_norm": 1.5992114543914795, + "learning_rate": 3.4892882300305127e-05, + "loss": 1.2151, + "step": 24775 + }, + { + "epoch": 7.41, + "grad_norm": 1.0426512956619263, + "learning_rate": 3.48874863909031e-05, + "loss": 1.1554, + "step": 24780 + }, + { + "epoch": 7.42, + "grad_norm": 2.195253372192383, + "learning_rate": 3.488208993542549e-05, + "loss": 1.042, + "step": 24785 + }, + { + "epoch": 7.42, + "grad_norm": 2.7746365070343018, + "learning_rate": 3.487669293417032e-05, + "loss": 1.104, + "step": 24790 + }, + { + "epoch": 7.42, + "grad_norm": 3.3738327026367188, + "learning_rate": 3.487129538743567e-05, + "loss": 1.0831, + "step": 24795 + }, + { + "epoch": 7.42, + "grad_norm": 2.653986930847168, + "learning_rate": 3.4865897295519624e-05, + "loss": 1.1528, + "step": 24800 + }, + { + "epoch": 7.42, + "grad_norm": 3.24540638923645, + "learning_rate": 3.486049865872033e-05, + "loss": 1.2233, + "step": 24805 + }, + { + "epoch": 7.42, + "grad_norm": 3.4608511924743652, + "learning_rate": 3.485509947733595e-05, + "loss": 1.2021, + "step": 24810 + }, + { + "epoch": 7.42, + "grad_norm": 1.495871663093567, + "learning_rate": 3.4849699751664664e-05, + "loss": 1.2353, + "step": 24815 + }, + { + "epoch": 7.43, + "grad_norm": 1.2478872537612915, + "learning_rate": 3.484429948200471e-05, + "loss": 1.2628, + "step": 24820 + }, + { + "epoch": 7.43, + "grad_norm": 4.952610015869141, + "learning_rate": 3.483889866865432e-05, + "loss": 1.1575, + "step": 24825 + }, + { + "epoch": 7.43, + "grad_norm": 1.0654171705245972, + "learning_rate": 3.483349731191178e-05, + "loss": 1.265, + "step": 24830 + }, + { + "epoch": 7.43, + "grad_norm": 5.432766437530518, + "learning_rate": 3.48280954120754e-05, + "loss": 1.0648, + "step": 24835 + }, + { + "epoch": 7.43, + "grad_norm": 6.357416152954102, + "learning_rate": 3.482269296944354e-05, + "loss": 1.0119, + "step": 24840 + }, + { + "epoch": 7.43, + "grad_norm": 2.8604278564453125, + "learning_rate": 3.481728998431455e-05, + "loss": 1.2266, + "step": 24845 + }, + { + "epoch": 7.43, + "grad_norm": 3.6308391094207764, + "learning_rate": 3.481188645698684e-05, + "loss": 1.2901, + "step": 24850 + }, + { + "epoch": 7.44, + "grad_norm": 2.2123422622680664, + "learning_rate": 3.4806482387758846e-05, + "loss": 1.2228, + "step": 24855 + }, + { + "epoch": 7.44, + "grad_norm": 1.6754069328308105, + "learning_rate": 3.4801077776929016e-05, + "loss": 1.1403, + "step": 24860 + }, + { + "epoch": 7.44, + "grad_norm": 1.3092021942138672, + "learning_rate": 3.479567262479584e-05, + "loss": 1.1213, + "step": 24865 + }, + { + "epoch": 7.44, + "grad_norm": 3.2713966369628906, + "learning_rate": 3.479026693165786e-05, + "loss": 1.0768, + "step": 24870 + }, + { + "epoch": 7.44, + "grad_norm": 2.951756000518799, + "learning_rate": 3.478486069781361e-05, + "loss": 1.1956, + "step": 24875 + }, + { + "epoch": 7.44, + "grad_norm": 2.3628857135772705, + "learning_rate": 3.4779453923561675e-05, + "loss": 1.2273, + "step": 24880 + }, + { + "epoch": 7.45, + "grad_norm": 2.3509888648986816, + "learning_rate": 3.477404660920066e-05, + "loss": 1.1566, + "step": 24885 + }, + { + "epoch": 7.45, + "grad_norm": 1.5141558647155762, + "learning_rate": 3.4768638755029226e-05, + "loss": 1.1827, + "step": 24890 + }, + { + "epoch": 7.45, + "grad_norm": 1.5687611103057861, + "learning_rate": 3.476323036134601e-05, + "loss": 1.1956, + "step": 24895 + }, + { + "epoch": 7.45, + "grad_norm": 1.7349860668182373, + "learning_rate": 3.475782142844974e-05, + "loss": 1.1523, + "step": 24900 + }, + { + "epoch": 7.45, + "grad_norm": 2.322187900543213, + "learning_rate": 3.475241195663913e-05, + "loss": 1.2337, + "step": 24905 + }, + { + "epoch": 7.45, + "grad_norm": 1.1443284749984741, + "learning_rate": 3.4747001946212944e-05, + "loss": 1.0199, + "step": 24910 + }, + { + "epoch": 7.45, + "grad_norm": 1.456677794456482, + "learning_rate": 3.4741591397469975e-05, + "loss": 1.0837, + "step": 24915 + }, + { + "epoch": 7.46, + "grad_norm": 1.2312963008880615, + "learning_rate": 3.473618031070903e-05, + "loss": 1.1786, + "step": 24920 + }, + { + "epoch": 7.46, + "grad_norm": 1.1006726026535034, + "learning_rate": 3.4730768686228976e-05, + "loss": 1.1402, + "step": 24925 + }, + { + "epoch": 7.46, + "grad_norm": 3.3980886936187744, + "learning_rate": 3.4725356524328686e-05, + "loss": 1.0626, + "step": 24930 + }, + { + "epoch": 7.46, + "grad_norm": 4.119764804840088, + "learning_rate": 3.471994382530706e-05, + "loss": 1.1438, + "step": 24935 + }, + { + "epoch": 7.46, + "grad_norm": 1.62360417842865, + "learning_rate": 3.471453058946303e-05, + "loss": 1.1758, + "step": 24940 + }, + { + "epoch": 7.46, + "grad_norm": 2.4606423377990723, + "learning_rate": 3.4709116817095584e-05, + "loss": 0.9549, + "step": 24945 + }, + { + "epoch": 7.46, + "grad_norm": 2.0009765625, + "learning_rate": 3.4703702508503704e-05, + "loss": 1.2969, + "step": 24950 + }, + { + "epoch": 7.47, + "grad_norm": 2.6999094486236572, + "learning_rate": 3.469828766398643e-05, + "loss": 1.1032, + "step": 24955 + }, + { + "epoch": 7.47, + "grad_norm": 4.238891124725342, + "learning_rate": 3.46928722838428e-05, + "loss": 1.186, + "step": 24960 + }, + { + "epoch": 7.47, + "grad_norm": 3.6732418537139893, + "learning_rate": 3.468745636837191e-05, + "loss": 1.1267, + "step": 24965 + }, + { + "epoch": 7.47, + "grad_norm": 4.03202486038208, + "learning_rate": 3.468203991787287e-05, + "loss": 1.08, + "step": 24970 + }, + { + "epoch": 7.47, + "grad_norm": 2.4040608406066895, + "learning_rate": 3.467662293264484e-05, + "loss": 0.9691, + "step": 24975 + }, + { + "epoch": 7.47, + "grad_norm": 2.929931640625, + "learning_rate": 3.4671205412986975e-05, + "loss": 1.0961, + "step": 24980 + }, + { + "epoch": 7.48, + "grad_norm": 3.1068196296691895, + "learning_rate": 3.466578735919849e-05, + "loss": 1.2342, + "step": 24985 + }, + { + "epoch": 7.48, + "grad_norm": 3.1283774375915527, + "learning_rate": 3.466036877157862e-05, + "loss": 1.1537, + "step": 24990 + }, + { + "epoch": 7.48, + "grad_norm": 2.7555837631225586, + "learning_rate": 3.465494965042662e-05, + "loss": 1.1126, + "step": 24995 + }, + { + "epoch": 7.48, + "grad_norm": 1.7763209342956543, + "learning_rate": 3.4649529996041784e-05, + "loss": 1.1552, + "step": 25000 + }, + { + "epoch": 7.48, + "grad_norm": 2.4423739910125732, + "learning_rate": 3.464410980872344e-05, + "loss": 0.9622, + "step": 25005 + }, + { + "epoch": 7.48, + "grad_norm": 2.395561933517456, + "learning_rate": 3.463868908877094e-05, + "loss": 1.0734, + "step": 25010 + }, + { + "epoch": 7.48, + "grad_norm": 1.395330548286438, + "learning_rate": 3.463326783648365e-05, + "loss": 0.914, + "step": 25015 + }, + { + "epoch": 7.49, + "grad_norm": 2.5400102138519287, + "learning_rate": 3.4627846052161e-05, + "loss": 1.1574, + "step": 25020 + }, + { + "epoch": 7.49, + "grad_norm": 1.9044983386993408, + "learning_rate": 3.4622423736102414e-05, + "loss": 1.2176, + "step": 25025 + }, + { + "epoch": 7.49, + "grad_norm": 2.5244264602661133, + "learning_rate": 3.461700088860737e-05, + "loss": 1.0935, + "step": 25030 + }, + { + "epoch": 7.49, + "grad_norm": 1.8527662754058838, + "learning_rate": 3.4611577509975366e-05, + "loss": 1.1516, + "step": 25035 + }, + { + "epoch": 7.49, + "grad_norm": 2.225349187850952, + "learning_rate": 3.460615360050592e-05, + "loss": 1.0199, + "step": 25040 + }, + { + "epoch": 7.49, + "grad_norm": 1.780076026916504, + "learning_rate": 3.46007291604986e-05, + "loss": 1.2104, + "step": 25045 + }, + { + "epoch": 7.49, + "grad_norm": 4.2586669921875, + "learning_rate": 3.4595304190252993e-05, + "loss": 1.1166, + "step": 25050 + }, + { + "epoch": 7.5, + "grad_norm": 2.9571564197540283, + "learning_rate": 3.45898786900687e-05, + "loss": 1.1133, + "step": 25055 + }, + { + "epoch": 7.5, + "grad_norm": 1.0347540378570557, + "learning_rate": 3.458445266024538e-05, + "loss": 1.0487, + "step": 25060 + }, + { + "epoch": 7.5, + "grad_norm": 1.9431984424591064, + "learning_rate": 3.45790261010827e-05, + "loss": 1.0142, + "step": 25065 + }, + { + "epoch": 7.5, + "grad_norm": 4.437554359436035, + "learning_rate": 3.4573599012880364e-05, + "loss": 1.1976, + "step": 25070 + }, + { + "epoch": 7.5, + "grad_norm": 1.2235515117645264, + "learning_rate": 3.456817139593811e-05, + "loss": 1.1345, + "step": 25075 + }, + { + "epoch": 7.5, + "grad_norm": 3.0580387115478516, + "learning_rate": 3.45627432505557e-05, + "loss": 1.2483, + "step": 25080 + }, + { + "epoch": 7.51, + "grad_norm": 5.822119235992432, + "learning_rate": 3.4557314577032915e-05, + "loss": 1.0357, + "step": 25085 + }, + { + "epoch": 7.51, + "grad_norm": 4.656231880187988, + "learning_rate": 3.455188537566957e-05, + "loss": 1.2377, + "step": 25090 + }, + { + "epoch": 7.51, + "grad_norm": 1.6126227378845215, + "learning_rate": 3.4546455646765535e-05, + "loss": 1.3903, + "step": 25095 + }, + { + "epoch": 7.51, + "grad_norm": 2.4167838096618652, + "learning_rate": 3.454102539062068e-05, + "loss": 1.1542, + "step": 25100 + }, + { + "epoch": 7.51, + "grad_norm": 1.3699878454208374, + "learning_rate": 3.45355946075349e-05, + "loss": 1.1183, + "step": 25105 + }, + { + "epoch": 7.51, + "grad_norm": 0.8811430335044861, + "learning_rate": 3.453016329780815e-05, + "loss": 1.1796, + "step": 25110 + }, + { + "epoch": 7.51, + "grad_norm": 2.27093768119812, + "learning_rate": 3.452473146174038e-05, + "loss": 1.0819, + "step": 25115 + }, + { + "epoch": 7.52, + "grad_norm": 2.351771116256714, + "learning_rate": 3.45192990996316e-05, + "loss": 1.2461, + "step": 25120 + }, + { + "epoch": 7.52, + "grad_norm": 2.03128719329834, + "learning_rate": 3.451386621178182e-05, + "loss": 1.2742, + "step": 25125 + }, + { + "epoch": 7.52, + "grad_norm": 6.1879448890686035, + "learning_rate": 3.45084327984911e-05, + "loss": 1.2229, + "step": 25130 + }, + { + "epoch": 7.52, + "grad_norm": 2.5790390968322754, + "learning_rate": 3.4502998860059514e-05, + "loss": 1.0641, + "step": 25135 + }, + { + "epoch": 7.52, + "grad_norm": 4.369030475616455, + "learning_rate": 3.4497564396787185e-05, + "loss": 1.3368, + "step": 25140 + }, + { + "epoch": 7.52, + "grad_norm": 1.9594241380691528, + "learning_rate": 3.449212940897425e-05, + "loss": 1.1571, + "step": 25145 + }, + { + "epoch": 7.52, + "grad_norm": 3.453566312789917, + "learning_rate": 3.4486693896920874e-05, + "loss": 1.1184, + "step": 25150 + }, + { + "epoch": 7.53, + "grad_norm": 19.127910614013672, + "learning_rate": 3.448125786092725e-05, + "loss": 1.102, + "step": 25155 + }, + { + "epoch": 7.53, + "grad_norm": 4.914632320404053, + "learning_rate": 3.447582130129361e-05, + "loss": 1.1391, + "step": 25160 + }, + { + "epoch": 7.53, + "grad_norm": 3.1397294998168945, + "learning_rate": 3.4470384218320205e-05, + "loss": 1.0924, + "step": 25165 + }, + { + "epoch": 7.53, + "grad_norm": 3.5343799591064453, + "learning_rate": 3.446494661230733e-05, + "loss": 1.0208, + "step": 25170 + }, + { + "epoch": 7.53, + "grad_norm": 1.5835540294647217, + "learning_rate": 3.445950848355529e-05, + "loss": 1.0526, + "step": 25175 + }, + { + "epoch": 7.53, + "grad_norm": 2.000260591506958, + "learning_rate": 3.445406983236443e-05, + "loss": 1.0166, + "step": 25180 + }, + { + "epoch": 7.54, + "grad_norm": 2.638507843017578, + "learning_rate": 3.4448630659035126e-05, + "loss": 1.2353, + "step": 25185 + }, + { + "epoch": 7.54, + "grad_norm": 3.0085575580596924, + "learning_rate": 3.444319096386777e-05, + "loss": 1.2625, + "step": 25190 + }, + { + "epoch": 7.54, + "grad_norm": 3.4091453552246094, + "learning_rate": 3.4437750747162776e-05, + "loss": 0.9272, + "step": 25195 + }, + { + "epoch": 7.54, + "grad_norm": 2.23293137550354, + "learning_rate": 3.443231000922063e-05, + "loss": 1.2352, + "step": 25200 + }, + { + "epoch": 7.54, + "grad_norm": 1.904063105583191, + "learning_rate": 3.4426868750341805e-05, + "loss": 1.1851, + "step": 25205 + }, + { + "epoch": 7.54, + "grad_norm": 3.55661678314209, + "learning_rate": 3.4421426970826826e-05, + "loss": 1.0767, + "step": 25210 + }, + { + "epoch": 7.54, + "grad_norm": 2.7817044258117676, + "learning_rate": 3.441598467097622e-05, + "loss": 1.2299, + "step": 25215 + }, + { + "epoch": 7.55, + "grad_norm": 1.7373440265655518, + "learning_rate": 3.441054185109057e-05, + "loss": 1.1791, + "step": 25220 + }, + { + "epoch": 7.55, + "grad_norm": 3.398428440093994, + "learning_rate": 3.440509851147047e-05, + "loss": 1.0867, + "step": 25225 + }, + { + "epoch": 7.55, + "grad_norm": 2.2971742153167725, + "learning_rate": 3.4399654652416566e-05, + "loss": 1.109, + "step": 25230 + }, + { + "epoch": 7.55, + "grad_norm": 3.605184555053711, + "learning_rate": 3.4394210274229496e-05, + "loss": 1.1545, + "step": 25235 + }, + { + "epoch": 7.55, + "grad_norm": 2.362635374069214, + "learning_rate": 3.4388765377209964e-05, + "loss": 1.1122, + "step": 25240 + }, + { + "epoch": 7.55, + "grad_norm": 1.9791773557662964, + "learning_rate": 3.438331996165868e-05, + "loss": 1.2522, + "step": 25245 + }, + { + "epoch": 7.55, + "grad_norm": 3.294543743133545, + "learning_rate": 3.437787402787639e-05, + "loss": 1.2164, + "step": 25250 + }, + { + "epoch": 7.56, + "grad_norm": 3.8431739807128906, + "learning_rate": 3.4372427576163856e-05, + "loss": 1.108, + "step": 25255 + }, + { + "epoch": 7.56, + "grad_norm": 4.042465686798096, + "learning_rate": 3.4366980606821895e-05, + "loss": 1.1426, + "step": 25260 + }, + { + "epoch": 7.56, + "grad_norm": 1.2647730112075806, + "learning_rate": 3.436153312015133e-05, + "loss": 1.0345, + "step": 25265 + }, + { + "epoch": 7.56, + "grad_norm": 1.1158740520477295, + "learning_rate": 3.435608511645302e-05, + "loss": 1.1481, + "step": 25270 + }, + { + "epoch": 7.56, + "grad_norm": 1.05342698097229, + "learning_rate": 3.4350636596027853e-05, + "loss": 1.0948, + "step": 25275 + }, + { + "epoch": 7.56, + "grad_norm": 2.2656328678131104, + "learning_rate": 3.434518755917675e-05, + "loss": 1.2436, + "step": 25280 + }, + { + "epoch": 7.56, + "grad_norm": 4.6103901863098145, + "learning_rate": 3.433973800620065e-05, + "loss": 1.0903, + "step": 25285 + }, + { + "epoch": 7.57, + "grad_norm": 2.1559901237487793, + "learning_rate": 3.4334287937400526e-05, + "loss": 1.0622, + "step": 25290 + }, + { + "epoch": 7.57, + "grad_norm": 2.366748332977295, + "learning_rate": 3.432883735307739e-05, + "loss": 1.1494, + "step": 25295 + }, + { + "epoch": 7.57, + "grad_norm": 2.5366220474243164, + "learning_rate": 3.4323386253532254e-05, + "loss": 1.0923, + "step": 25300 + }, + { + "epoch": 7.57, + "grad_norm": 3.0016655921936035, + "learning_rate": 3.431793463906619e-05, + "loss": 1.0665, + "step": 25305 + }, + { + "epoch": 7.57, + "grad_norm": 2.379429340362549, + "learning_rate": 3.431248250998028e-05, + "loss": 1.2129, + "step": 25310 + }, + { + "epoch": 7.57, + "grad_norm": 3.5074679851531982, + "learning_rate": 3.4307029866575645e-05, + "loss": 1.1391, + "step": 25315 + }, + { + "epoch": 7.58, + "grad_norm": 2.7369179725646973, + "learning_rate": 3.4301576709153424e-05, + "loss": 1.2312, + "step": 25320 + }, + { + "epoch": 7.58, + "grad_norm": 2.9599127769470215, + "learning_rate": 3.4296123038014786e-05, + "loss": 1.218, + "step": 25325 + }, + { + "epoch": 7.58, + "grad_norm": 1.0387095212936401, + "learning_rate": 3.429066885346094e-05, + "loss": 1.2556, + "step": 25330 + }, + { + "epoch": 7.58, + "grad_norm": 2.1688525676727295, + "learning_rate": 3.42852141557931e-05, + "loss": 1.2108, + "step": 25335 + }, + { + "epoch": 7.58, + "grad_norm": 2.080822229385376, + "learning_rate": 3.427975894531255e-05, + "loss": 1.175, + "step": 25340 + }, + { + "epoch": 7.58, + "grad_norm": 1.2066692113876343, + "learning_rate": 3.427430322232055e-05, + "loss": 1.2416, + "step": 25345 + }, + { + "epoch": 7.58, + "grad_norm": 1.1532701253890991, + "learning_rate": 3.4268846987118426e-05, + "loss": 1.171, + "step": 25350 + }, + { + "epoch": 7.59, + "grad_norm": 1.9684667587280273, + "learning_rate": 3.426339024000751e-05, + "loss": 1.1804, + "step": 25355 + }, + { + "epoch": 7.59, + "grad_norm": 1.9980390071868896, + "learning_rate": 3.4257932981289184e-05, + "loss": 1.1734, + "step": 25360 + }, + { + "epoch": 7.59, + "grad_norm": 1.5516365766525269, + "learning_rate": 3.4252475211264846e-05, + "loss": 1.2121, + "step": 25365 + }, + { + "epoch": 7.59, + "grad_norm": 1.7034841775894165, + "learning_rate": 3.424701693023591e-05, + "loss": 1.1121, + "step": 25370 + }, + { + "epoch": 7.59, + "grad_norm": 3.420742988586426, + "learning_rate": 3.424155813850385e-05, + "loss": 1.07, + "step": 25375 + }, + { + "epoch": 7.59, + "grad_norm": 2.169203519821167, + "learning_rate": 3.423609883637014e-05, + "loss": 1.1631, + "step": 25380 + }, + { + "epoch": 7.59, + "grad_norm": 4.135627746582031, + "learning_rate": 3.423063902413629e-05, + "loss": 1.2078, + "step": 25385 + }, + { + "epoch": 7.6, + "grad_norm": 3.5442159175872803, + "learning_rate": 3.422517870210384e-05, + "loss": 1.1549, + "step": 25390 + }, + { + "epoch": 7.6, + "grad_norm": 5.373948574066162, + "learning_rate": 3.421971787057436e-05, + "loss": 0.976, + "step": 25395 + }, + { + "epoch": 7.6, + "grad_norm": 2.204765558242798, + "learning_rate": 3.421425652984944e-05, + "loss": 1.2897, + "step": 25400 + }, + { + "epoch": 7.6, + "grad_norm": 4.4537811279296875, + "learning_rate": 3.420879468023072e-05, + "loss": 1.0213, + "step": 25405 + }, + { + "epoch": 7.6, + "grad_norm": 2.6348259449005127, + "learning_rate": 3.4203332322019835e-05, + "loss": 1.1508, + "step": 25410 + }, + { + "epoch": 7.6, + "grad_norm": 1.4462038278579712, + "learning_rate": 3.419786945551848e-05, + "loss": 1.2851, + "step": 25415 + }, + { + "epoch": 7.61, + "grad_norm": 4.519046783447266, + "learning_rate": 3.419240608102834e-05, + "loss": 1.2382, + "step": 25420 + }, + { + "epoch": 7.61, + "grad_norm": 2.593632936477661, + "learning_rate": 3.418694219885118e-05, + "loss": 1.1794, + "step": 25425 + }, + { + "epoch": 7.61, + "grad_norm": 2.167607545852661, + "learning_rate": 3.418147780928875e-05, + "loss": 1.2187, + "step": 25430 + }, + { + "epoch": 7.61, + "grad_norm": 3.661607027053833, + "learning_rate": 3.4176012912642844e-05, + "loss": 1.0878, + "step": 25435 + }, + { + "epoch": 7.61, + "grad_norm": 2.0092809200286865, + "learning_rate": 3.4170547509215286e-05, + "loss": 1.134, + "step": 25440 + }, + { + "epoch": 7.61, + "grad_norm": 7.421758651733398, + "learning_rate": 3.416508159930791e-05, + "loss": 0.9357, + "step": 25445 + }, + { + "epoch": 7.61, + "grad_norm": 4.737497329711914, + "learning_rate": 3.415961518322262e-05, + "loss": 1.0698, + "step": 25450 + }, + { + "epoch": 7.62, + "grad_norm": 2.214827299118042, + "learning_rate": 3.4154148261261285e-05, + "loss": 1.051, + "step": 25455 + }, + { + "epoch": 7.62, + "grad_norm": 1.7568135261535645, + "learning_rate": 3.414868083372587e-05, + "loss": 1.2068, + "step": 25460 + }, + { + "epoch": 7.62, + "grad_norm": 1.6030669212341309, + "learning_rate": 3.414321290091831e-05, + "loss": 1.1984, + "step": 25465 + }, + { + "epoch": 7.62, + "grad_norm": 1.4295237064361572, + "learning_rate": 3.413774446314062e-05, + "loss": 1.1206, + "step": 25470 + }, + { + "epoch": 7.62, + "grad_norm": 5.009536266326904, + "learning_rate": 3.41322755206948e-05, + "loss": 1.0359, + "step": 25475 + }, + { + "epoch": 7.62, + "grad_norm": 1.4227309226989746, + "learning_rate": 3.4126806073882886e-05, + "loss": 1.1036, + "step": 25480 + }, + { + "epoch": 7.62, + "grad_norm": 1.6048041582107544, + "learning_rate": 3.4121336123006965e-05, + "loss": 1.3853, + "step": 25485 + }, + { + "epoch": 7.63, + "grad_norm": 1.398371696472168, + "learning_rate": 3.411586566836913e-05, + "loss": 1.076, + "step": 25490 + }, + { + "epoch": 7.63, + "grad_norm": 1.4785361289978027, + "learning_rate": 3.4110394710271504e-05, + "loss": 1.1772, + "step": 25495 + }, + { + "epoch": 7.63, + "grad_norm": 2.355851411819458, + "learning_rate": 3.410492324901626e-05, + "loss": 1.1964, + "step": 25500 + }, + { + "epoch": 7.63, + "grad_norm": 1.825348973274231, + "learning_rate": 3.4099451284905556e-05, + "loss": 1.2284, + "step": 25505 + }, + { + "epoch": 7.63, + "grad_norm": 1.8260172605514526, + "learning_rate": 3.409397881824163e-05, + "loss": 1.2915, + "step": 25510 + }, + { + "epoch": 7.63, + "grad_norm": 2.6344926357269287, + "learning_rate": 3.4088505849326697e-05, + "loss": 1.1483, + "step": 25515 + }, + { + "epoch": 7.64, + "grad_norm": 1.9603842496871948, + "learning_rate": 3.408303237846303e-05, + "loss": 1.1863, + "step": 25520 + }, + { + "epoch": 7.64, + "grad_norm": 1.3052462339401245, + "learning_rate": 3.407755840595294e-05, + "loss": 1.2192, + "step": 25525 + }, + { + "epoch": 7.64, + "grad_norm": 3.303022623062134, + "learning_rate": 3.407208393209872e-05, + "loss": 1.2268, + "step": 25530 + }, + { + "epoch": 7.64, + "grad_norm": 1.622067928314209, + "learning_rate": 3.406660895720275e-05, + "loss": 1.2071, + "step": 25535 + }, + { + "epoch": 7.64, + "grad_norm": 3.006840467453003, + "learning_rate": 3.406113348156738e-05, + "loss": 1.1447, + "step": 25540 + }, + { + "epoch": 7.64, + "grad_norm": 2.42177152633667, + "learning_rate": 3.405565750549502e-05, + "loss": 1.1148, + "step": 25545 + }, + { + "epoch": 7.64, + "grad_norm": 2.3825762271881104, + "learning_rate": 3.405018102928812e-05, + "loss": 1.2127, + "step": 25550 + }, + { + "epoch": 7.65, + "grad_norm": 2.4480338096618652, + "learning_rate": 3.404470405324912e-05, + "loss": 1.0913, + "step": 25555 + }, + { + "epoch": 7.65, + "grad_norm": 1.3634631633758545, + "learning_rate": 3.403922657768052e-05, + "loss": 1.2583, + "step": 25560 + }, + { + "epoch": 7.65, + "grad_norm": 3.5564489364624023, + "learning_rate": 3.403374860288484e-05, + "loss": 1.14, + "step": 25565 + }, + { + "epoch": 7.65, + "grad_norm": 2.949016571044922, + "learning_rate": 3.402827012916461e-05, + "loss": 1.3249, + "step": 25570 + }, + { + "epoch": 7.65, + "grad_norm": 1.4162577390670776, + "learning_rate": 3.4022791156822395e-05, + "loss": 1.2632, + "step": 25575 + }, + { + "epoch": 7.65, + "grad_norm": 1.9798351526260376, + "learning_rate": 3.401731168616081e-05, + "loss": 1.3101, + "step": 25580 + }, + { + "epoch": 7.65, + "grad_norm": 1.9318190813064575, + "learning_rate": 3.401183171748248e-05, + "loss": 1.1082, + "step": 25585 + }, + { + "epoch": 7.66, + "grad_norm": 2.191795825958252, + "learning_rate": 3.400635125109005e-05, + "loss": 1.2841, + "step": 25590 + }, + { + "epoch": 7.66, + "grad_norm": 3.378875732421875, + "learning_rate": 3.40008702872862e-05, + "loss": 0.9573, + "step": 25595 + }, + { + "epoch": 7.66, + "grad_norm": 3.725365400314331, + "learning_rate": 3.399538882637364e-05, + "loss": 1.1023, + "step": 25600 + }, + { + "epoch": 7.66, + "grad_norm": 2.4383790493011475, + "learning_rate": 3.3989906868655104e-05, + "loss": 1.1758, + "step": 25605 + }, + { + "epoch": 7.66, + "grad_norm": 3.5799477100372314, + "learning_rate": 3.398442441443336e-05, + "loss": 1.1651, + "step": 25610 + }, + { + "epoch": 7.66, + "grad_norm": 3.4080264568328857, + "learning_rate": 3.397894146401118e-05, + "loss": 1.0039, + "step": 25615 + }, + { + "epoch": 7.67, + "grad_norm": 1.794622540473938, + "learning_rate": 3.397345801769141e-05, + "loss": 1.1809, + "step": 25620 + }, + { + "epoch": 7.67, + "grad_norm": 2.9816510677337646, + "learning_rate": 3.3967974075776875e-05, + "loss": 1.263, + "step": 25625 + }, + { + "epoch": 7.67, + "grad_norm": 2.4955384731292725, + "learning_rate": 3.3962489638570464e-05, + "loss": 1.151, + "step": 25630 + }, + { + "epoch": 7.67, + "grad_norm": 2.3042807579040527, + "learning_rate": 3.395700470637506e-05, + "loss": 1.0265, + "step": 25635 + }, + { + "epoch": 7.67, + "grad_norm": 1.0877710580825806, + "learning_rate": 3.3951519279493585e-05, + "loss": 1.2262, + "step": 25640 + }, + { + "epoch": 7.67, + "grad_norm": 2.408581018447876, + "learning_rate": 3.394603335822902e-05, + "loss": 1.1331, + "step": 25645 + }, + { + "epoch": 7.67, + "grad_norm": 1.5194898843765259, + "learning_rate": 3.3940546942884324e-05, + "loss": 1.1822, + "step": 25650 + }, + { + "epoch": 7.68, + "grad_norm": 1.858493447303772, + "learning_rate": 3.393506003376251e-05, + "loss": 1.0194, + "step": 25655 + }, + { + "epoch": 7.68, + "grad_norm": 1.8844908475875854, + "learning_rate": 3.392957263116663e-05, + "loss": 1.1915, + "step": 25660 + }, + { + "epoch": 7.68, + "grad_norm": 2.1716675758361816, + "learning_rate": 3.392408473539973e-05, + "loss": 1.1354, + "step": 25665 + }, + { + "epoch": 7.68, + "grad_norm": 5.164560317993164, + "learning_rate": 3.391859634676491e-05, + "loss": 1.1757, + "step": 25670 + }, + { + "epoch": 7.68, + "grad_norm": 3.698439359664917, + "learning_rate": 3.3913107465565274e-05, + "loss": 1.0446, + "step": 25675 + }, + { + "epoch": 7.68, + "grad_norm": 7.462309837341309, + "learning_rate": 3.390761809210398e-05, + "loss": 1.1909, + "step": 25680 + }, + { + "epoch": 7.68, + "grad_norm": 3.320486545562744, + "learning_rate": 3.39021282266842e-05, + "loss": 1.1427, + "step": 25685 + }, + { + "epoch": 7.69, + "grad_norm": 1.506848931312561, + "learning_rate": 3.389663786960913e-05, + "loss": 1.2685, + "step": 25690 + }, + { + "epoch": 7.69, + "grad_norm": 0.9463053345680237, + "learning_rate": 3.3891147021182004e-05, + "loss": 1.2191, + "step": 25695 + }, + { + "epoch": 7.69, + "grad_norm": 1.4177323579788208, + "learning_rate": 3.388565568170607e-05, + "loss": 1.1461, + "step": 25700 + }, + { + "epoch": 7.69, + "grad_norm": 1.9486898183822632, + "learning_rate": 3.38801638514846e-05, + "loss": 1.064, + "step": 25705 + }, + { + "epoch": 7.69, + "grad_norm": 3.4581215381622314, + "learning_rate": 3.3874671530820915e-05, + "loss": 1.1031, + "step": 25710 + }, + { + "epoch": 7.69, + "grad_norm": 2.35516095161438, + "learning_rate": 3.386917872001835e-05, + "loss": 1.1787, + "step": 25715 + }, + { + "epoch": 7.7, + "grad_norm": 5.054357528686523, + "learning_rate": 3.3863685419380254e-05, + "loss": 1.2588, + "step": 25720 + }, + { + "epoch": 7.7, + "grad_norm": 2.7185957431793213, + "learning_rate": 3.385819162921003e-05, + "loss": 0.9498, + "step": 25725 + }, + { + "epoch": 7.7, + "grad_norm": 1.8890604972839355, + "learning_rate": 3.385269734981109e-05, + "loss": 1.2021, + "step": 25730 + }, + { + "epoch": 7.7, + "grad_norm": 2.1678149700164795, + "learning_rate": 3.384720258148688e-05, + "loss": 1.1627, + "step": 25735 + }, + { + "epoch": 7.7, + "grad_norm": 2.4753189086914062, + "learning_rate": 3.384170732454087e-05, + "loss": 1.2426, + "step": 25740 + }, + { + "epoch": 7.7, + "grad_norm": 1.5026155710220337, + "learning_rate": 3.3836211579276546e-05, + "loss": 1.1725, + "step": 25745 + }, + { + "epoch": 7.7, + "grad_norm": 2.9787325859069824, + "learning_rate": 3.3830715345997444e-05, + "loss": 1.2786, + "step": 25750 + }, + { + "epoch": 7.71, + "grad_norm": 1.6327831745147705, + "learning_rate": 3.382521862500712e-05, + "loss": 1.1652, + "step": 25755 + }, + { + "epoch": 7.71, + "grad_norm": 3.5435101985931396, + "learning_rate": 3.381972141660914e-05, + "loss": 1.08, + "step": 25760 + }, + { + "epoch": 7.71, + "grad_norm": 2.974608898162842, + "learning_rate": 3.381422372110711e-05, + "loss": 1.2794, + "step": 25765 + }, + { + "epoch": 7.71, + "grad_norm": 1.1757948398590088, + "learning_rate": 3.3808725538804667e-05, + "loss": 1.1428, + "step": 25770 + }, + { + "epoch": 7.71, + "grad_norm": 2.5623114109039307, + "learning_rate": 3.380322687000547e-05, + "loss": 1.1777, + "step": 25775 + }, + { + "epoch": 7.71, + "grad_norm": 1.8030344247817993, + "learning_rate": 3.37977277150132e-05, + "loss": 1.3782, + "step": 25780 + }, + { + "epoch": 7.71, + "grad_norm": 2.135564088821411, + "learning_rate": 3.379222807413158e-05, + "loss": 1.2467, + "step": 25785 + }, + { + "epoch": 7.72, + "grad_norm": 6.063063144683838, + "learning_rate": 3.3786727947664344e-05, + "loss": 1.1731, + "step": 25790 + }, + { + "epoch": 7.72, + "grad_norm": 2.186962842941284, + "learning_rate": 3.378122733591525e-05, + "loss": 1.2181, + "step": 25795 + }, + { + "epoch": 7.72, + "grad_norm": 1.757891058921814, + "learning_rate": 3.37757262391881e-05, + "loss": 1.151, + "step": 25800 + }, + { + "epoch": 7.72, + "grad_norm": 17.243751525878906, + "learning_rate": 3.377022465778671e-05, + "loss": 1.1469, + "step": 25805 + }, + { + "epoch": 7.72, + "grad_norm": 3.9934499263763428, + "learning_rate": 3.376472259201493e-05, + "loss": 1.055, + "step": 25810 + }, + { + "epoch": 7.72, + "grad_norm": 1.4664825201034546, + "learning_rate": 3.375922004217663e-05, + "loss": 1.2219, + "step": 25815 + }, + { + "epoch": 7.73, + "grad_norm": 2.3855884075164795, + "learning_rate": 3.3753717008575716e-05, + "loss": 1.2283, + "step": 25820 + }, + { + "epoch": 7.73, + "grad_norm": 3.64817476272583, + "learning_rate": 3.374821349151611e-05, + "loss": 1.0902, + "step": 25825 + }, + { + "epoch": 7.73, + "grad_norm": 5.835208415985107, + "learning_rate": 3.374270949130176e-05, + "loss": 1.1913, + "step": 25830 + }, + { + "epoch": 7.73, + "grad_norm": 8.944743156433105, + "learning_rate": 3.373720500823666e-05, + "loss": 1.1375, + "step": 25835 + }, + { + "epoch": 7.73, + "grad_norm": 1.3471863269805908, + "learning_rate": 3.37317000426248e-05, + "loss": 1.1213, + "step": 25840 + }, + { + "epoch": 7.73, + "grad_norm": 1.5816328525543213, + "learning_rate": 3.3726194594770224e-05, + "loss": 1.3163, + "step": 25845 + }, + { + "epoch": 7.73, + "grad_norm": 3.4675426483154297, + "learning_rate": 3.3720688664976996e-05, + "loss": 1.1634, + "step": 25850 + }, + { + "epoch": 7.74, + "grad_norm": 1.961869478225708, + "learning_rate": 3.3715182253549205e-05, + "loss": 1.1437, + "step": 25855 + }, + { + "epoch": 7.74, + "grad_norm": 3.2622334957122803, + "learning_rate": 3.3709675360790945e-05, + "loss": 1.1406, + "step": 25860 + }, + { + "epoch": 7.74, + "grad_norm": 2.293381690979004, + "learning_rate": 3.370416798700637e-05, + "loss": 1.314, + "step": 25865 + }, + { + "epoch": 7.74, + "grad_norm": 3.0880744457244873, + "learning_rate": 3.369866013249965e-05, + "loss": 1.2104, + "step": 25870 + }, + { + "epoch": 7.74, + "grad_norm": 0.9818011522293091, + "learning_rate": 3.369315179757496e-05, + "loss": 1.205, + "step": 25875 + }, + { + "epoch": 7.74, + "grad_norm": 1.3982211351394653, + "learning_rate": 3.368764298253654e-05, + "loss": 1.1116, + "step": 25880 + }, + { + "epoch": 7.74, + "grad_norm": 2.2384934425354004, + "learning_rate": 3.368213368768863e-05, + "loss": 1.1532, + "step": 25885 + }, + { + "epoch": 7.75, + "grad_norm": 2.110853433609009, + "learning_rate": 3.3676623913335507e-05, + "loss": 1.3154, + "step": 25890 + }, + { + "epoch": 7.75, + "grad_norm": 5.248104095458984, + "learning_rate": 3.367111365978146e-05, + "loss": 1.187, + "step": 25895 + }, + { + "epoch": 7.75, + "grad_norm": 2.6548986434936523, + "learning_rate": 3.3665602927330814e-05, + "loss": 1.0928, + "step": 25900 + }, + { + "epoch": 7.75, + "grad_norm": 2.44582200050354, + "learning_rate": 3.3660091716287925e-05, + "loss": 1.2958, + "step": 25905 + }, + { + "epoch": 7.75, + "grad_norm": 2.185173273086548, + "learning_rate": 3.3654580026957176e-05, + "loss": 1.0973, + "step": 25910 + }, + { + "epoch": 7.75, + "grad_norm": 4.19801664352417, + "learning_rate": 3.364906785964297e-05, + "loss": 1.2329, + "step": 25915 + }, + { + "epoch": 7.75, + "grad_norm": 2.9229116439819336, + "learning_rate": 3.364355521464974e-05, + "loss": 1.0089, + "step": 25920 + }, + { + "epoch": 7.76, + "grad_norm": 3.5953376293182373, + "learning_rate": 3.363804209228192e-05, + "loss": 1.1683, + "step": 25925 + }, + { + "epoch": 7.76, + "grad_norm": 3.6482677459716797, + "learning_rate": 3.363252849284404e-05, + "loss": 1.1713, + "step": 25930 + }, + { + "epoch": 7.76, + "grad_norm": 3.2520852088928223, + "learning_rate": 3.3627014416640565e-05, + "loss": 1.4132, + "step": 25935 + }, + { + "epoch": 7.76, + "grad_norm": 3.2956299781799316, + "learning_rate": 3.362149986397606e-05, + "loss": 1.3033, + "step": 25940 + }, + { + "epoch": 7.76, + "grad_norm": 2.4566397666931152, + "learning_rate": 3.361598483515507e-05, + "loss": 1.1382, + "step": 25945 + }, + { + "epoch": 7.76, + "grad_norm": 2.1512653827667236, + "learning_rate": 3.3610469330482205e-05, + "loss": 1.0305, + "step": 25950 + }, + { + "epoch": 7.77, + "grad_norm": 2.5325844287872314, + "learning_rate": 3.360495335026207e-05, + "loss": 1.0878, + "step": 25955 + }, + { + "epoch": 7.77, + "grad_norm": 10.988487243652344, + "learning_rate": 3.35994368947993e-05, + "loss": 1.0437, + "step": 25960 + }, + { + "epoch": 7.77, + "grad_norm": 1.2049709558486938, + "learning_rate": 3.359391996439857e-05, + "loss": 1.2115, + "step": 25965 + }, + { + "epoch": 7.77, + "grad_norm": 4.1528143882751465, + "learning_rate": 3.358840255936457e-05, + "loss": 1.2731, + "step": 25970 + }, + { + "epoch": 7.77, + "grad_norm": 1.840437889099121, + "learning_rate": 3.3582884680002024e-05, + "loss": 1.1867, + "step": 25975 + }, + { + "epoch": 7.77, + "grad_norm": 1.5152828693389893, + "learning_rate": 3.3577366326615676e-05, + "loss": 1.297, + "step": 25980 + }, + { + "epoch": 7.77, + "grad_norm": 2.739409923553467, + "learning_rate": 3.357184749951031e-05, + "loss": 1.205, + "step": 25985 + }, + { + "epoch": 7.78, + "grad_norm": 2.3803489208221436, + "learning_rate": 3.3566328198990713e-05, + "loss": 0.934, + "step": 25990 + }, + { + "epoch": 7.78, + "grad_norm": 1.749653697013855, + "learning_rate": 3.356080842536171e-05, + "loss": 1.246, + "step": 25995 + }, + { + "epoch": 7.78, + "grad_norm": 1.9984244108200073, + "learning_rate": 3.355528817892816e-05, + "loss": 1.162, + "step": 26000 + }, + { + "epoch": 7.78, + "grad_norm": 1.7096799612045288, + "learning_rate": 3.354976745999494e-05, + "loss": 1.1473, + "step": 26005 + }, + { + "epoch": 7.78, + "grad_norm": 2.5412979125976562, + "learning_rate": 3.354424626886694e-05, + "loss": 1.2047, + "step": 26010 + }, + { + "epoch": 7.78, + "grad_norm": 1.5412076711654663, + "learning_rate": 3.3538724605849115e-05, + "loss": 1.2091, + "step": 26015 + }, + { + "epoch": 7.78, + "grad_norm": 1.5192663669586182, + "learning_rate": 3.353320247124639e-05, + "loss": 1.2168, + "step": 26020 + }, + { + "epoch": 7.79, + "grad_norm": 1.3354765176773071, + "learning_rate": 3.352767986536377e-05, + "loss": 1.177, + "step": 26025 + }, + { + "epoch": 7.79, + "grad_norm": 2.89026141166687, + "learning_rate": 3.352215678850625e-05, + "loss": 1.0474, + "step": 26030 + }, + { + "epoch": 7.79, + "grad_norm": 1.4622420072555542, + "learning_rate": 3.351663324097888e-05, + "loss": 1.0757, + "step": 26035 + }, + { + "epoch": 7.79, + "grad_norm": 3.365771532058716, + "learning_rate": 3.35111092230867e-05, + "loss": 1.1737, + "step": 26040 + }, + { + "epoch": 7.79, + "grad_norm": 8.960780143737793, + "learning_rate": 3.35055847351348e-05, + "loss": 1.1499, + "step": 26045 + }, + { + "epoch": 7.79, + "grad_norm": 3.380896806716919, + "learning_rate": 3.350005977742831e-05, + "loss": 1.1441, + "step": 26050 + }, + { + "epoch": 7.8, + "grad_norm": 2.153420925140381, + "learning_rate": 3.3494534350272344e-05, + "loss": 1.1652, + "step": 26055 + }, + { + "epoch": 7.8, + "grad_norm": 3.0268092155456543, + "learning_rate": 3.348900845397208e-05, + "loss": 1.3158, + "step": 26060 + }, + { + "epoch": 7.8, + "grad_norm": 5.089029788970947, + "learning_rate": 3.34834820888327e-05, + "loss": 1.093, + "step": 26065 + }, + { + "epoch": 7.8, + "grad_norm": 1.8281073570251465, + "learning_rate": 3.347795525515942e-05, + "loss": 1.2282, + "step": 26070 + }, + { + "epoch": 7.8, + "grad_norm": 2.1014137268066406, + "learning_rate": 3.347242795325749e-05, + "loss": 1.1495, + "step": 26075 + }, + { + "epoch": 7.8, + "grad_norm": 2.044297933578491, + "learning_rate": 3.3466900183432164e-05, + "loss": 1.0403, + "step": 26080 + }, + { + "epoch": 7.8, + "grad_norm": 2.4856858253479004, + "learning_rate": 3.346137194598874e-05, + "loss": 1.1332, + "step": 26085 + }, + { + "epoch": 7.81, + "grad_norm": 2.4128828048706055, + "learning_rate": 3.345584324123254e-05, + "loss": 1.0393, + "step": 26090 + }, + { + "epoch": 7.81, + "grad_norm": 1.2683279514312744, + "learning_rate": 3.345031406946891e-05, + "loss": 1.2443, + "step": 26095 + }, + { + "epoch": 7.81, + "grad_norm": 2.806605577468872, + "learning_rate": 3.344478443100322e-05, + "loss": 1.1122, + "step": 26100 + }, + { + "epoch": 7.81, + "grad_norm": 2.0906083583831787, + "learning_rate": 3.343925432614086e-05, + "loss": 1.0038, + "step": 26105 + }, + { + "epoch": 7.81, + "grad_norm": 2.3186562061309814, + "learning_rate": 3.3433723755187255e-05, + "loss": 1.1656, + "step": 26110 + }, + { + "epoch": 7.81, + "grad_norm": 2.0462429523468018, + "learning_rate": 3.342819271844787e-05, + "loss": 1.1485, + "step": 26115 + }, + { + "epoch": 7.81, + "grad_norm": 1.9167912006378174, + "learning_rate": 3.342266121622814e-05, + "loss": 1.2493, + "step": 26120 + }, + { + "epoch": 7.82, + "grad_norm": 3.755826473236084, + "learning_rate": 3.3417129248833596e-05, + "loss": 1.1989, + "step": 26125 + }, + { + "epoch": 7.82, + "grad_norm": 3.0041463375091553, + "learning_rate": 3.3411596816569746e-05, + "loss": 1.1634, + "step": 26130 + }, + { + "epoch": 7.82, + "grad_norm": 3.452164649963379, + "learning_rate": 3.340606391974215e-05, + "loss": 1.2504, + "step": 26135 + }, + { + "epoch": 7.82, + "grad_norm": 6.089219093322754, + "learning_rate": 3.340053055865639e-05, + "loss": 1.0353, + "step": 26140 + }, + { + "epoch": 7.82, + "grad_norm": 0.934475839138031, + "learning_rate": 3.339499673361805e-05, + "loss": 1.2358, + "step": 26145 + }, + { + "epoch": 7.82, + "grad_norm": 7.745322227478027, + "learning_rate": 3.3389462444932765e-05, + "loss": 1.1976, + "step": 26150 + }, + { + "epoch": 7.83, + "grad_norm": 1.4742351770401, + "learning_rate": 3.338392769290619e-05, + "loss": 1.2129, + "step": 26155 + }, + { + "epoch": 7.83, + "grad_norm": 1.67327880859375, + "learning_rate": 3.337839247784401e-05, + "loss": 1.1415, + "step": 26160 + }, + { + "epoch": 7.83, + "grad_norm": 4.258670806884766, + "learning_rate": 3.337285680005192e-05, + "loss": 1.0752, + "step": 26165 + }, + { + "epoch": 7.83, + "grad_norm": 2.024472236633301, + "learning_rate": 3.336732065983565e-05, + "loss": 1.0806, + "step": 26170 + }, + { + "epoch": 7.83, + "grad_norm": 1.1518867015838623, + "learning_rate": 3.336178405750095e-05, + "loss": 1.2075, + "step": 26175 + }, + { + "epoch": 7.83, + "grad_norm": 3.1105754375457764, + "learning_rate": 3.3356246993353617e-05, + "loss": 1.1183, + "step": 26180 + }, + { + "epoch": 7.83, + "grad_norm": 1.1292957067489624, + "learning_rate": 3.335070946769945e-05, + "loss": 1.381, + "step": 26185 + }, + { + "epoch": 7.84, + "grad_norm": 1.872825264930725, + "learning_rate": 3.3345171480844275e-05, + "loss": 1.0752, + "step": 26190 + }, + { + "epoch": 7.84, + "grad_norm": 1.4173269271850586, + "learning_rate": 3.3339633033093955e-05, + "loss": 1.0798, + "step": 26195 + }, + { + "epoch": 7.84, + "grad_norm": 3.544238567352295, + "learning_rate": 3.333409412475437e-05, + "loss": 1.2383, + "step": 26200 + }, + { + "epoch": 7.84, + "grad_norm": 2.445939064025879, + "learning_rate": 3.3328554756131423e-05, + "loss": 1.1121, + "step": 26205 + }, + { + "epoch": 7.84, + "grad_norm": 2.592607021331787, + "learning_rate": 3.332301492753107e-05, + "loss": 1.0819, + "step": 26210 + }, + { + "epoch": 7.84, + "grad_norm": 2.417942523956299, + "learning_rate": 3.3317474639259245e-05, + "loss": 1.128, + "step": 26215 + }, + { + "epoch": 7.84, + "grad_norm": 1.7295610904693604, + "learning_rate": 3.331193389162194e-05, + "loss": 1.18, + "step": 26220 + }, + { + "epoch": 7.85, + "grad_norm": 7.857126235961914, + "learning_rate": 3.330639268492517e-05, + "loss": 0.9577, + "step": 26225 + }, + { + "epoch": 7.85, + "grad_norm": 1.7152934074401855, + "learning_rate": 3.330085101947496e-05, + "loss": 1.2462, + "step": 26230 + }, + { + "epoch": 7.85, + "grad_norm": 1.7840052843093872, + "learning_rate": 3.3295308895577376e-05, + "loss": 0.9696, + "step": 26235 + }, + { + "epoch": 7.85, + "grad_norm": 1.5356841087341309, + "learning_rate": 3.32897663135385e-05, + "loss": 1.1286, + "step": 26240 + }, + { + "epoch": 7.85, + "grad_norm": 1.8642561435699463, + "learning_rate": 3.3284223273664465e-05, + "loss": 0.9585, + "step": 26245 + }, + { + "epoch": 7.85, + "grad_norm": 1.54176664352417, + "learning_rate": 3.327867977626138e-05, + "loss": 1.177, + "step": 26250 + }, + { + "epoch": 7.86, + "grad_norm": 3.919142961502075, + "learning_rate": 3.327313582163542e-05, + "loss": 1.1158, + "step": 26255 + }, + { + "epoch": 7.86, + "grad_norm": 2.54856276512146, + "learning_rate": 3.326759141009276e-05, + "loss": 1.1494, + "step": 26260 + }, + { + "epoch": 7.86, + "grad_norm": 2.491251230239868, + "learning_rate": 3.326204654193962e-05, + "loss": 1.2343, + "step": 26265 + }, + { + "epoch": 7.86, + "grad_norm": 4.712347030639648, + "learning_rate": 3.325650121748225e-05, + "loss": 1.1057, + "step": 26270 + }, + { + "epoch": 7.86, + "grad_norm": 1.5245498418807983, + "learning_rate": 3.325095543702688e-05, + "loss": 1.1575, + "step": 26275 + }, + { + "epoch": 7.86, + "grad_norm": 3.232236385345459, + "learning_rate": 3.324540920087983e-05, + "loss": 1.1603, + "step": 26280 + }, + { + "epoch": 7.86, + "grad_norm": 1.0489842891693115, + "learning_rate": 3.3239862509347396e-05, + "loss": 1.2751, + "step": 26285 + }, + { + "epoch": 7.87, + "grad_norm": 1.4679367542266846, + "learning_rate": 3.3234315362735926e-05, + "loss": 1.0614, + "step": 26290 + }, + { + "epoch": 7.87, + "grad_norm": 4.200760364532471, + "learning_rate": 3.3228767761351776e-05, + "loss": 1.251, + "step": 26295 + }, + { + "epoch": 7.87, + "grad_norm": 3.265259265899658, + "learning_rate": 3.3223219705501334e-05, + "loss": 1.0798, + "step": 26300 + }, + { + "epoch": 7.87, + "grad_norm": 5.21251106262207, + "learning_rate": 3.3217671195491016e-05, + "loss": 1.0074, + "step": 26305 + }, + { + "epoch": 7.87, + "grad_norm": 2.8614084720611572, + "learning_rate": 3.3212122231627265e-05, + "loss": 0.9399, + "step": 26310 + }, + { + "epoch": 7.87, + "grad_norm": 2.9637887477874756, + "learning_rate": 3.320657281421653e-05, + "loss": 1.1851, + "step": 26315 + }, + { + "epoch": 7.87, + "grad_norm": 2.9170193672180176, + "learning_rate": 3.320102294356531e-05, + "loss": 1.3154, + "step": 26320 + }, + { + "epoch": 7.88, + "grad_norm": 4.612870216369629, + "learning_rate": 3.319547261998012e-05, + "loss": 1.0731, + "step": 26325 + }, + { + "epoch": 7.88, + "grad_norm": 3.057469129562378, + "learning_rate": 3.31899218437675e-05, + "loss": 1.0565, + "step": 26330 + }, + { + "epoch": 7.88, + "grad_norm": 3.9015371799468994, + "learning_rate": 3.3184370615234004e-05, + "loss": 1.0798, + "step": 26335 + }, + { + "epoch": 7.88, + "grad_norm": 2.964965343475342, + "learning_rate": 3.317881893468623e-05, + "loss": 1.0914, + "step": 26340 + }, + { + "epoch": 7.88, + "grad_norm": 7.357722282409668, + "learning_rate": 3.3173266802430784e-05, + "loss": 1.3035, + "step": 26345 + }, + { + "epoch": 7.88, + "grad_norm": 1.9388129711151123, + "learning_rate": 3.3167714218774315e-05, + "loss": 1.1861, + "step": 26350 + }, + { + "epoch": 7.89, + "grad_norm": 1.838700532913208, + "learning_rate": 3.316216118402347e-05, + "loss": 1.0244, + "step": 26355 + }, + { + "epoch": 7.89, + "grad_norm": 2.831838846206665, + "learning_rate": 3.315660769848495e-05, + "loss": 1.0736, + "step": 26360 + }, + { + "epoch": 7.89, + "grad_norm": 4.200833320617676, + "learning_rate": 3.315105376246547e-05, + "loss": 1.2928, + "step": 26365 + }, + { + "epoch": 7.89, + "grad_norm": 6.066209316253662, + "learning_rate": 3.3145499376271754e-05, + "loss": 1.0764, + "step": 26370 + }, + { + "epoch": 7.89, + "grad_norm": 1.2264248132705688, + "learning_rate": 3.3139944540210574e-05, + "loss": 1.2454, + "step": 26375 + }, + { + "epoch": 7.89, + "grad_norm": 1.508028268814087, + "learning_rate": 3.3134389254588724e-05, + "loss": 1.0258, + "step": 26380 + }, + { + "epoch": 7.89, + "grad_norm": 2.4264774322509766, + "learning_rate": 3.312883351971301e-05, + "loss": 1.0888, + "step": 26385 + }, + { + "epoch": 7.9, + "grad_norm": 3.008584499359131, + "learning_rate": 3.312327733589027e-05, + "loss": 1.2136, + "step": 26390 + }, + { + "epoch": 7.9, + "grad_norm": 2.624093532562256, + "learning_rate": 3.3117720703427365e-05, + "loss": 1.0683, + "step": 26395 + }, + { + "epoch": 7.9, + "grad_norm": 2.020382881164551, + "learning_rate": 3.311216362263119e-05, + "loss": 1.283, + "step": 26400 + }, + { + "epoch": 7.9, + "grad_norm": 1.6679294109344482, + "learning_rate": 3.310660609380865e-05, + "loss": 1.193, + "step": 26405 + }, + { + "epoch": 7.9, + "grad_norm": 1.7250815629959106, + "learning_rate": 3.3101048117266675e-05, + "loss": 1.1785, + "step": 26410 + }, + { + "epoch": 7.9, + "grad_norm": 3.259265661239624, + "learning_rate": 3.3095489693312234e-05, + "loss": 1.1396, + "step": 26415 + }, + { + "epoch": 7.9, + "grad_norm": 6.142773151397705, + "learning_rate": 3.308993082225231e-05, + "loss": 1.0735, + "step": 26420 + }, + { + "epoch": 7.91, + "grad_norm": 2.9731175899505615, + "learning_rate": 3.308437150439392e-05, + "loss": 1.1468, + "step": 26425 + }, + { + "epoch": 7.91, + "grad_norm": 2.0407536029815674, + "learning_rate": 3.3078811740044096e-05, + "loss": 1.3368, + "step": 26430 + }, + { + "epoch": 7.91, + "grad_norm": 2.396974563598633, + "learning_rate": 3.30732515295099e-05, + "loss": 1.0735, + "step": 26435 + }, + { + "epoch": 7.91, + "grad_norm": 1.7831975221633911, + "learning_rate": 3.306769087309841e-05, + "loss": 1.212, + "step": 26440 + }, + { + "epoch": 7.91, + "grad_norm": 2.968552827835083, + "learning_rate": 3.3062129771116734e-05, + "loss": 1.0222, + "step": 26445 + }, + { + "epoch": 7.91, + "grad_norm": 2.5638179779052734, + "learning_rate": 3.305656822387201e-05, + "loss": 1.1809, + "step": 26450 + }, + { + "epoch": 7.92, + "grad_norm": 11.847745895385742, + "learning_rate": 3.30510062316714e-05, + "loss": 1.2684, + "step": 26455 + }, + { + "epoch": 7.92, + "grad_norm": 2.3828535079956055, + "learning_rate": 3.304544379482209e-05, + "loss": 1.2027, + "step": 26460 + }, + { + "epoch": 7.92, + "grad_norm": 3.3371334075927734, + "learning_rate": 3.303988091363128e-05, + "loss": 1.2209, + "step": 26465 + }, + { + "epoch": 7.92, + "grad_norm": 1.5403908491134644, + "learning_rate": 3.3034317588406205e-05, + "loss": 1.0056, + "step": 26470 + }, + { + "epoch": 7.92, + "grad_norm": 3.629544973373413, + "learning_rate": 3.302875381945412e-05, + "loss": 1.2427, + "step": 26475 + }, + { + "epoch": 7.92, + "grad_norm": 1.5579273700714111, + "learning_rate": 3.30231896070823e-05, + "loss": 1.2536, + "step": 26480 + }, + { + "epoch": 7.92, + "grad_norm": 3.202047824859619, + "learning_rate": 3.3017624951598066e-05, + "loss": 1.0117, + "step": 26485 + }, + { + "epoch": 7.93, + "grad_norm": 4.338261127471924, + "learning_rate": 3.301205985330873e-05, + "loss": 0.9417, + "step": 26490 + }, + { + "epoch": 7.93, + "grad_norm": 0.7901287078857422, + "learning_rate": 3.300649431252166e-05, + "loss": 1.005, + "step": 26495 + }, + { + "epoch": 7.93, + "grad_norm": 2.2430081367492676, + "learning_rate": 3.300092832954425e-05, + "loss": 1.2206, + "step": 26500 + }, + { + "epoch": 7.93, + "grad_norm": 2.6997382640838623, + "learning_rate": 3.2995361904683866e-05, + "loss": 1.3153, + "step": 26505 + }, + { + "epoch": 7.93, + "grad_norm": 4.912943363189697, + "learning_rate": 3.2989795038247956e-05, + "loss": 1.1331, + "step": 26510 + }, + { + "epoch": 7.93, + "grad_norm": 1.721016526222229, + "learning_rate": 3.298422773054397e-05, + "loss": 1.3072, + "step": 26515 + }, + { + "epoch": 7.93, + "grad_norm": 0.9054770469665527, + "learning_rate": 3.297865998187939e-05, + "loss": 1.2066, + "step": 26520 + }, + { + "epoch": 7.94, + "grad_norm": 1.856542706489563, + "learning_rate": 3.297309179256171e-05, + "loss": 1.2847, + "step": 26525 + }, + { + "epoch": 7.94, + "grad_norm": 2.9849486351013184, + "learning_rate": 3.2967523162898465e-05, + "loss": 1.2153, + "step": 26530 + }, + { + "epoch": 7.94, + "grad_norm": 2.4759957790374756, + "learning_rate": 3.296195409319719e-05, + "loss": 1.1859, + "step": 26535 + }, + { + "epoch": 7.94, + "grad_norm": 1.966263771057129, + "learning_rate": 3.295638458376546e-05, + "loss": 1.2078, + "step": 26540 + }, + { + "epoch": 7.94, + "grad_norm": 3.476435422897339, + "learning_rate": 3.295081463491089e-05, + "loss": 1.1706, + "step": 26545 + }, + { + "epoch": 7.94, + "grad_norm": 3.4568331241607666, + "learning_rate": 3.294524424694109e-05, + "loss": 1.2634, + "step": 26550 + }, + { + "epoch": 7.94, + "grad_norm": 1.4559932947158813, + "learning_rate": 3.2939673420163706e-05, + "loss": 1.0625, + "step": 26555 + }, + { + "epoch": 7.95, + "grad_norm": 1.8391364812850952, + "learning_rate": 3.293410215488642e-05, + "loss": 1.1624, + "step": 26560 + }, + { + "epoch": 7.95, + "grad_norm": 2.4696767330169678, + "learning_rate": 3.292853045141691e-05, + "loss": 1.2677, + "step": 26565 + }, + { + "epoch": 7.95, + "grad_norm": 1.480440378189087, + "learning_rate": 3.2922958310062904e-05, + "loss": 1.0199, + "step": 26570 + }, + { + "epoch": 7.95, + "grad_norm": 1.3950989246368408, + "learning_rate": 3.291738573113216e-05, + "loss": 1.2905, + "step": 26575 + }, + { + "epoch": 7.95, + "grad_norm": 2.3112564086914062, + "learning_rate": 3.291181271493242e-05, + "loss": 1.1195, + "step": 26580 + }, + { + "epoch": 7.95, + "grad_norm": 5.1455302238464355, + "learning_rate": 3.290623926177148e-05, + "loss": 1.1371, + "step": 26585 + }, + { + "epoch": 7.96, + "grad_norm": 7.398214340209961, + "learning_rate": 3.290066537195717e-05, + "loss": 1.1394, + "step": 26590 + }, + { + "epoch": 7.96, + "grad_norm": 3.4421448707580566, + "learning_rate": 3.2895091045797335e-05, + "loss": 1.372, + "step": 26595 + }, + { + "epoch": 7.96, + "grad_norm": 5.3825764656066895, + "learning_rate": 3.288951628359982e-05, + "loss": 1.3115, + "step": 26600 + }, + { + "epoch": 7.96, + "grad_norm": 2.211780548095703, + "learning_rate": 3.288394108567252e-05, + "loss": 1.1863, + "step": 26605 + }, + { + "epoch": 7.96, + "grad_norm": 4.291854381561279, + "learning_rate": 3.287836545232335e-05, + "loss": 1.1253, + "step": 26610 + }, + { + "epoch": 7.96, + "grad_norm": 2.0238168239593506, + "learning_rate": 3.2872789383860246e-05, + "loss": 1.34, + "step": 26615 + }, + { + "epoch": 7.96, + "grad_norm": 2.0490705966949463, + "learning_rate": 3.286721288059116e-05, + "loss": 1.0287, + "step": 26620 + }, + { + "epoch": 7.97, + "grad_norm": 2.1137607097625732, + "learning_rate": 3.28616359428241e-05, + "loss": 0.9508, + "step": 26625 + }, + { + "epoch": 7.97, + "grad_norm": 3.983616352081299, + "learning_rate": 3.285605857086704e-05, + "loss": 1.3123, + "step": 26630 + }, + { + "epoch": 7.97, + "grad_norm": 2.677525520324707, + "learning_rate": 3.285048076502805e-05, + "loss": 0.9677, + "step": 26635 + }, + { + "epoch": 7.97, + "grad_norm": 3.7401962280273438, + "learning_rate": 3.284490252561515e-05, + "loss": 1.2097, + "step": 26640 + }, + { + "epoch": 7.97, + "grad_norm": 2.0874180793762207, + "learning_rate": 3.283932385293644e-05, + "loss": 0.9952, + "step": 26645 + }, + { + "epoch": 7.97, + "grad_norm": 2.5544826984405518, + "learning_rate": 3.283374474730003e-05, + "loss": 1.1315, + "step": 26650 + }, + { + "epoch": 7.97, + "grad_norm": 1.7675901651382446, + "learning_rate": 3.2828165209014036e-05, + "loss": 1.1583, + "step": 26655 + }, + { + "epoch": 7.98, + "grad_norm": 3.3144235610961914, + "learning_rate": 3.282258523838663e-05, + "loss": 1.1255, + "step": 26660 + }, + { + "epoch": 7.98, + "grad_norm": 3.3691461086273193, + "learning_rate": 3.281700483572595e-05, + "loss": 1.2332, + "step": 26665 + }, + { + "epoch": 7.98, + "grad_norm": 3.544421911239624, + "learning_rate": 3.281142400134023e-05, + "loss": 1.0824, + "step": 26670 + }, + { + "epoch": 7.98, + "grad_norm": 1.068796157836914, + "learning_rate": 3.280584273553768e-05, + "loss": 1.14, + "step": 26675 + }, + { + "epoch": 7.98, + "grad_norm": 2.5210955142974854, + "learning_rate": 3.2800261038626544e-05, + "loss": 1.1527, + "step": 26680 + }, + { + "epoch": 7.98, + "grad_norm": 2.1437249183654785, + "learning_rate": 3.279467891091511e-05, + "loss": 0.9096, + "step": 26685 + }, + { + "epoch": 7.99, + "grad_norm": 1.478752613067627, + "learning_rate": 3.278909635271165e-05, + "loss": 1.0211, + "step": 26690 + }, + { + "epoch": 7.99, + "grad_norm": 4.072501182556152, + "learning_rate": 3.27835133643245e-05, + "loss": 1.1708, + "step": 26695 + }, + { + "epoch": 7.99, + "grad_norm": 3.866422653198242, + "learning_rate": 3.2777929946062005e-05, + "loss": 1.1402, + "step": 26700 + }, + { + "epoch": 7.99, + "grad_norm": 2.2160439491271973, + "learning_rate": 3.277234609823251e-05, + "loss": 1.2121, + "step": 26705 + }, + { + "epoch": 7.99, + "grad_norm": 3.4925968647003174, + "learning_rate": 3.276676182114443e-05, + "loss": 1.188, + "step": 26710 + }, + { + "epoch": 7.99, + "grad_norm": 1.9253541231155396, + "learning_rate": 3.276117711510616e-05, + "loss": 1.1559, + "step": 26715 + }, + { + "epoch": 7.99, + "grad_norm": 1.081268072128296, + "learning_rate": 3.2755591980426146e-05, + "loss": 1.2158, + "step": 26720 + }, + { + "epoch": 8.0, + "grad_norm": 1.8544811010360718, + "learning_rate": 3.275000641741285e-05, + "loss": 1.1147, + "step": 26725 + }, + { + "epoch": 8.0, + "grad_norm": 1.938436508178711, + "learning_rate": 3.2744420426374755e-05, + "loss": 1.2673, + "step": 26730 + }, + { + "epoch": 8.0, + "grad_norm": 2.7037627696990967, + "learning_rate": 3.273883400762037e-05, + "loss": 1.2175, + "step": 26735 + }, + { + "epoch": 8.0, + "grad_norm": 2.8835654258728027, + "learning_rate": 3.2733247161458224e-05, + "loss": 1.1878, + "step": 26740 + }, + { + "epoch": 8.0, + "grad_norm": 3.2548410892486572, + "learning_rate": 3.272765988819688e-05, + "loss": 1.2278, + "step": 26745 + }, + { + "epoch": 8.0, + "grad_norm": 2.562904119491577, + "learning_rate": 3.2722072188144916e-05, + "loss": 1.1261, + "step": 26750 + }, + { + "epoch": 8.0, + "grad_norm": 1.355686902999878, + "learning_rate": 3.271648406161092e-05, + "loss": 0.9932, + "step": 26755 + }, + { + "epoch": 8.01, + "grad_norm": 2.448178768157959, + "learning_rate": 3.2710895508903546e-05, + "loss": 1.1223, + "step": 26760 + }, + { + "epoch": 8.01, + "grad_norm": 1.8387428522109985, + "learning_rate": 3.270530653033142e-05, + "loss": 1.2345, + "step": 26765 + }, + { + "epoch": 8.01, + "grad_norm": 2.6244566440582275, + "learning_rate": 3.269971712620322e-05, + "loss": 1.2351, + "step": 26770 + }, + { + "epoch": 8.01, + "grad_norm": 1.3992868661880493, + "learning_rate": 3.269412729682765e-05, + "loss": 1.3125, + "step": 26775 + }, + { + "epoch": 8.01, + "grad_norm": 1.4152156114578247, + "learning_rate": 3.268853704251342e-05, + "loss": 1.1555, + "step": 26780 + }, + { + "epoch": 8.01, + "grad_norm": 3.2571237087249756, + "learning_rate": 3.2682946363569286e-05, + "loss": 0.986, + "step": 26785 + }, + { + "epoch": 8.02, + "grad_norm": 1.7282525300979614, + "learning_rate": 3.267735526030402e-05, + "loss": 1.1626, + "step": 26790 + }, + { + "epoch": 8.02, + "grad_norm": 1.6741749048233032, + "learning_rate": 3.267176373302639e-05, + "loss": 1.1052, + "step": 26795 + }, + { + "epoch": 8.02, + "grad_norm": 2.290086030960083, + "learning_rate": 3.266617178204523e-05, + "loss": 0.9702, + "step": 26800 + }, + { + "epoch": 8.02, + "grad_norm": 1.1538450717926025, + "learning_rate": 3.2660579407669374e-05, + "loss": 1.2276, + "step": 26805 + }, + { + "epoch": 8.02, + "grad_norm": 5.698193073272705, + "learning_rate": 3.265498661020767e-05, + "loss": 0.9967, + "step": 26810 + }, + { + "epoch": 8.02, + "grad_norm": 3.099355459213257, + "learning_rate": 3.2649393389969016e-05, + "loss": 0.9655, + "step": 26815 + }, + { + "epoch": 8.02, + "grad_norm": 2.5500590801239014, + "learning_rate": 3.264379974726232e-05, + "loss": 1.1817, + "step": 26820 + }, + { + "epoch": 8.03, + "grad_norm": 2.5358612537384033, + "learning_rate": 3.2638205682396504e-05, + "loss": 1.1314, + "step": 26825 + }, + { + "epoch": 8.03, + "grad_norm": 1.88272225856781, + "learning_rate": 3.2632611195680535e-05, + "loss": 1.0866, + "step": 26830 + }, + { + "epoch": 8.03, + "grad_norm": 2.205331325531006, + "learning_rate": 3.262701628742338e-05, + "loss": 1.1186, + "step": 26835 + }, + { + "epoch": 8.03, + "grad_norm": 3.861008644104004, + "learning_rate": 3.262142095793404e-05, + "loss": 0.9824, + "step": 26840 + }, + { + "epoch": 8.03, + "grad_norm": 1.2620456218719482, + "learning_rate": 3.261582520752154e-05, + "loss": 1.1634, + "step": 26845 + }, + { + "epoch": 8.03, + "grad_norm": 2.01745343208313, + "learning_rate": 3.261022903649494e-05, + "loss": 1.1332, + "step": 26850 + }, + { + "epoch": 8.03, + "grad_norm": 2.3341500759124756, + "learning_rate": 3.26046324451633e-05, + "loss": 1.1015, + "step": 26855 + }, + { + "epoch": 8.04, + "grad_norm": 2.3520326614379883, + "learning_rate": 3.2599035433835706e-05, + "loss": 0.9539, + "step": 26860 + }, + { + "epoch": 8.04, + "grad_norm": 2.2860143184661865, + "learning_rate": 3.2593438002821286e-05, + "loss": 1.098, + "step": 26865 + }, + { + "epoch": 8.04, + "grad_norm": 4.713287830352783, + "learning_rate": 3.2587840152429186e-05, + "loss": 1.0839, + "step": 26870 + }, + { + "epoch": 8.04, + "grad_norm": 1.481998324394226, + "learning_rate": 3.258224188296855e-05, + "loss": 1.096, + "step": 26875 + }, + { + "epoch": 8.04, + "grad_norm": 2.8222692012786865, + "learning_rate": 3.257664319474858e-05, + "loss": 1.1853, + "step": 26880 + }, + { + "epoch": 8.04, + "grad_norm": 2.920170307159424, + "learning_rate": 3.257104408807848e-05, + "loss": 1.0507, + "step": 26885 + }, + { + "epoch": 8.05, + "grad_norm": 3.3776135444641113, + "learning_rate": 3.25654445632675e-05, + "loss": 1.0174, + "step": 26890 + }, + { + "epoch": 8.05, + "grad_norm": 2.022411346435547, + "learning_rate": 3.255984462062487e-05, + "loss": 1.1012, + "step": 26895 + }, + { + "epoch": 8.05, + "grad_norm": 4.32869291305542, + "learning_rate": 3.255424426045987e-05, + "loss": 1.0786, + "step": 26900 + }, + { + "epoch": 8.05, + "grad_norm": 1.3376225233078003, + "learning_rate": 3.254864348308182e-05, + "loss": 1.1797, + "step": 26905 + }, + { + "epoch": 8.05, + "grad_norm": 1.1576924324035645, + "learning_rate": 3.2543042288800035e-05, + "loss": 1.0853, + "step": 26910 + }, + { + "epoch": 8.05, + "grad_norm": 1.6126631498336792, + "learning_rate": 3.2537440677923864e-05, + "loss": 1.0122, + "step": 26915 + }, + { + "epoch": 8.05, + "grad_norm": 2.130544900894165, + "learning_rate": 3.253183865076269e-05, + "loss": 1.1658, + "step": 26920 + }, + { + "epoch": 8.06, + "grad_norm": 3.746469259262085, + "learning_rate": 3.252623620762589e-05, + "loss": 1.1788, + "step": 26925 + }, + { + "epoch": 8.06, + "grad_norm": 2.503143548965454, + "learning_rate": 3.2520633348822884e-05, + "loss": 1.1909, + "step": 26930 + }, + { + "epoch": 8.06, + "grad_norm": 2.250931978225708, + "learning_rate": 3.251503007466311e-05, + "loss": 1.1565, + "step": 26935 + }, + { + "epoch": 8.06, + "grad_norm": 1.9409987926483154, + "learning_rate": 3.2509426385456046e-05, + "loss": 1.1417, + "step": 26940 + }, + { + "epoch": 8.06, + "grad_norm": 2.2973732948303223, + "learning_rate": 3.250382228151116e-05, + "loss": 1.1895, + "step": 26945 + }, + { + "epoch": 8.06, + "grad_norm": 3.5512242317199707, + "learning_rate": 3.249821776313798e-05, + "loss": 1.1509, + "step": 26950 + }, + { + "epoch": 8.06, + "grad_norm": 3.031540870666504, + "learning_rate": 3.2492612830646025e-05, + "loss": 1.1068, + "step": 26955 + }, + { + "epoch": 8.07, + "grad_norm": 2.2181246280670166, + "learning_rate": 3.248700748434485e-05, + "loss": 1.1758, + "step": 26960 + }, + { + "epoch": 8.07, + "grad_norm": 1.102636456489563, + "learning_rate": 3.248140172454403e-05, + "loss": 1.0737, + "step": 26965 + }, + { + "epoch": 8.07, + "grad_norm": 1.951006293296814, + "learning_rate": 3.2475795551553166e-05, + "loss": 1.1618, + "step": 26970 + }, + { + "epoch": 8.07, + "grad_norm": 1.9825594425201416, + "learning_rate": 3.2470188965681894e-05, + "loss": 1.0879, + "step": 26975 + }, + { + "epoch": 8.07, + "grad_norm": 4.088083267211914, + "learning_rate": 3.246458196723985e-05, + "loss": 1.1725, + "step": 26980 + }, + { + "epoch": 8.07, + "grad_norm": 6.844773292541504, + "learning_rate": 3.2458974556536694e-05, + "loss": 1.0785, + "step": 26985 + }, + { + "epoch": 8.08, + "grad_norm": 1.5896713733673096, + "learning_rate": 3.245336673388213e-05, + "loss": 1.0559, + "step": 26990 + }, + { + "epoch": 8.08, + "grad_norm": 1.1929066181182861, + "learning_rate": 3.244775849958587e-05, + "loss": 1.0536, + "step": 26995 + }, + { + "epoch": 8.08, + "grad_norm": 1.1825475692749023, + "learning_rate": 3.244214985395765e-05, + "loss": 1.2364, + "step": 27000 + }, + { + "epoch": 8.08, + "grad_norm": 3.0042402744293213, + "learning_rate": 3.2436540797307224e-05, + "loss": 1.1558, + "step": 27005 + }, + { + "epoch": 8.08, + "grad_norm": 1.305233359336853, + "learning_rate": 3.2430931329944384e-05, + "loss": 1.1471, + "step": 27010 + }, + { + "epoch": 8.08, + "grad_norm": 2.5870091915130615, + "learning_rate": 3.242532145217894e-05, + "loss": 1.2085, + "step": 27015 + }, + { + "epoch": 8.08, + "grad_norm": 1.0035154819488525, + "learning_rate": 3.24197111643207e-05, + "loss": 0.9627, + "step": 27020 + }, + { + "epoch": 8.09, + "grad_norm": 2.0611767768859863, + "learning_rate": 3.241410046667952e-05, + "loss": 1.0554, + "step": 27025 + }, + { + "epoch": 8.09, + "grad_norm": 1.6869128942489624, + "learning_rate": 3.2408489359565286e-05, + "loss": 1.0618, + "step": 27030 + }, + { + "epoch": 8.09, + "grad_norm": 1.8879674673080444, + "learning_rate": 3.240287784328789e-05, + "loss": 1.1033, + "step": 27035 + }, + { + "epoch": 8.09, + "grad_norm": 3.194080114364624, + "learning_rate": 3.239726591815724e-05, + "loss": 0.9735, + "step": 27040 + }, + { + "epoch": 8.09, + "grad_norm": 2.4983654022216797, + "learning_rate": 3.239165358448327e-05, + "loss": 1.1598, + "step": 27045 + }, + { + "epoch": 8.09, + "grad_norm": 1.8231905698776245, + "learning_rate": 3.2386040842575976e-05, + "loss": 1.1378, + "step": 27050 + }, + { + "epoch": 8.09, + "grad_norm": 4.191860198974609, + "learning_rate": 3.238042769274531e-05, + "loss": 0.9522, + "step": 27055 + }, + { + "epoch": 8.1, + "grad_norm": 3.744335412979126, + "learning_rate": 3.23748141353013e-05, + "loss": 1.3162, + "step": 27060 + }, + { + "epoch": 8.1, + "grad_norm": 2.127502202987671, + "learning_rate": 3.236920017055397e-05, + "loss": 0.9964, + "step": 27065 + }, + { + "epoch": 8.1, + "grad_norm": 3.770505666732788, + "learning_rate": 3.2363585798813376e-05, + "loss": 1.2214, + "step": 27070 + }, + { + "epoch": 8.1, + "grad_norm": 3.2522053718566895, + "learning_rate": 3.2357971020389586e-05, + "loss": 1.1507, + "step": 27075 + }, + { + "epoch": 8.1, + "grad_norm": 1.4464082717895508, + "learning_rate": 3.235235583559271e-05, + "loss": 1.0405, + "step": 27080 + }, + { + "epoch": 8.1, + "grad_norm": 2.175246238708496, + "learning_rate": 3.2346740244732866e-05, + "loss": 1.284, + "step": 27085 + }, + { + "epoch": 8.11, + "grad_norm": 3.6082305908203125, + "learning_rate": 3.234112424812019e-05, + "loss": 1.0077, + "step": 27090 + }, + { + "epoch": 8.11, + "grad_norm": 2.128209114074707, + "learning_rate": 3.233550784606486e-05, + "loss": 1.1639, + "step": 27095 + }, + { + "epoch": 8.11, + "grad_norm": 1.130769968032837, + "learning_rate": 3.232989103887704e-05, + "loss": 1.2106, + "step": 27100 + }, + { + "epoch": 8.11, + "grad_norm": 1.5438563823699951, + "learning_rate": 3.232427382686697e-05, + "loss": 1.1424, + "step": 27105 + }, + { + "epoch": 8.11, + "grad_norm": 5.550532817840576, + "learning_rate": 3.231865621034486e-05, + "loss": 1.1006, + "step": 27110 + }, + { + "epoch": 8.11, + "grad_norm": 20.100221633911133, + "learning_rate": 3.2313038189620995e-05, + "loss": 0.8935, + "step": 27115 + }, + { + "epoch": 8.11, + "grad_norm": 2.989924430847168, + "learning_rate": 3.230741976500562e-05, + "loss": 1.3188, + "step": 27120 + }, + { + "epoch": 8.12, + "grad_norm": 2.9077560901641846, + "learning_rate": 3.2301800936809044e-05, + "loss": 1.1537, + "step": 27125 + }, + { + "epoch": 8.12, + "grad_norm": 1.805272102355957, + "learning_rate": 3.229618170534159e-05, + "loss": 1.1026, + "step": 27130 + }, + { + "epoch": 8.12, + "grad_norm": 3.8493740558624268, + "learning_rate": 3.2290562070913613e-05, + "loss": 1.1274, + "step": 27135 + }, + { + "epoch": 8.12, + "grad_norm": 3.3296542167663574, + "learning_rate": 3.2284942033835464e-05, + "loss": 1.0671, + "step": 27140 + }, + { + "epoch": 8.12, + "grad_norm": 1.8140013217926025, + "learning_rate": 3.2279321594417546e-05, + "loss": 1.0956, + "step": 27145 + }, + { + "epoch": 8.12, + "grad_norm": 2.7062671184539795, + "learning_rate": 3.227370075297026e-05, + "loss": 0.9826, + "step": 27150 + }, + { + "epoch": 8.12, + "grad_norm": 1.546039342880249, + "learning_rate": 3.226807950980404e-05, + "loss": 1.2878, + "step": 27155 + }, + { + "epoch": 8.13, + "grad_norm": 1.861967921257019, + "learning_rate": 3.2262457865229337e-05, + "loss": 0.9757, + "step": 27160 + }, + { + "epoch": 8.13, + "grad_norm": 2.4952914714813232, + "learning_rate": 3.2256835819556643e-05, + "loss": 1.0902, + "step": 27165 + }, + { + "epoch": 8.13, + "grad_norm": 4.320837497711182, + "learning_rate": 3.225121337309645e-05, + "loss": 1.2187, + "step": 27170 + }, + { + "epoch": 8.13, + "grad_norm": 3.9226810932159424, + "learning_rate": 3.224559052615928e-05, + "loss": 1.06, + "step": 27175 + }, + { + "epoch": 8.13, + "grad_norm": 1.381098985671997, + "learning_rate": 3.2239967279055675e-05, + "loss": 1.1461, + "step": 27180 + }, + { + "epoch": 8.13, + "grad_norm": 2.233602285385132, + "learning_rate": 3.22343436320962e-05, + "loss": 1.2388, + "step": 27185 + }, + { + "epoch": 8.13, + "grad_norm": 1.6678953170776367, + "learning_rate": 3.222871958559144e-05, + "loss": 1.2351, + "step": 27190 + }, + { + "epoch": 8.14, + "grad_norm": 2.178392171859741, + "learning_rate": 3.2223095139852024e-05, + "loss": 1.0681, + "step": 27195 + }, + { + "epoch": 8.14, + "grad_norm": 1.1121859550476074, + "learning_rate": 3.221747029518857e-05, + "loss": 1.3043, + "step": 27200 + }, + { + "epoch": 8.14, + "grad_norm": 1.504831075668335, + "learning_rate": 3.221184505191173e-05, + "loss": 1.164, + "step": 27205 + }, + { + "epoch": 8.14, + "grad_norm": 1.5870822668075562, + "learning_rate": 3.2206219410332184e-05, + "loss": 1.1253, + "step": 27210 + }, + { + "epoch": 8.14, + "grad_norm": 2.400480270385742, + "learning_rate": 3.220059337076063e-05, + "loss": 1.065, + "step": 27215 + }, + { + "epoch": 8.14, + "grad_norm": 6.77662992477417, + "learning_rate": 3.2194966933507794e-05, + "loss": 1.043, + "step": 27220 + }, + { + "epoch": 8.15, + "grad_norm": 2.4106528759002686, + "learning_rate": 3.2189340098884405e-05, + "loss": 1.33, + "step": 27225 + }, + { + "epoch": 8.15, + "grad_norm": 6.033102989196777, + "learning_rate": 3.2183712867201236e-05, + "loss": 1.1468, + "step": 27230 + }, + { + "epoch": 8.15, + "grad_norm": 6.416871547698975, + "learning_rate": 3.2178085238769076e-05, + "loss": 1.0036, + "step": 27235 + }, + { + "epoch": 8.15, + "grad_norm": 3.3531298637390137, + "learning_rate": 3.217245721389873e-05, + "loss": 1.2439, + "step": 27240 + }, + { + "epoch": 8.15, + "grad_norm": 1.8699911832809448, + "learning_rate": 3.2166828792901025e-05, + "loss": 1.0194, + "step": 27245 + }, + { + "epoch": 8.15, + "grad_norm": 2.013090133666992, + "learning_rate": 3.216119997608682e-05, + "loss": 1.013, + "step": 27250 + }, + { + "epoch": 8.15, + "grad_norm": 1.574511170387268, + "learning_rate": 3.215557076376698e-05, + "loss": 1.1492, + "step": 27255 + }, + { + "epoch": 8.16, + "grad_norm": 1.5347402095794678, + "learning_rate": 3.2149941156252406e-05, + "loss": 1.0025, + "step": 27260 + }, + { + "epoch": 8.16, + "grad_norm": 2.3538808822631836, + "learning_rate": 3.214431115385401e-05, + "loss": 1.0706, + "step": 27265 + }, + { + "epoch": 8.16, + "grad_norm": 2.6728079319000244, + "learning_rate": 3.213868075688273e-05, + "loss": 1.0859, + "step": 27270 + }, + { + "epoch": 8.16, + "grad_norm": 2.588715076446533, + "learning_rate": 3.213304996564955e-05, + "loss": 1.058, + "step": 27275 + }, + { + "epoch": 8.16, + "grad_norm": 2.8418338298797607, + "learning_rate": 3.2127418780465423e-05, + "loss": 1.2622, + "step": 27280 + }, + { + "epoch": 8.16, + "grad_norm": 3.8776187896728516, + "learning_rate": 3.212178720164136e-05, + "loss": 1.0961, + "step": 27285 + }, + { + "epoch": 8.16, + "grad_norm": 1.6147956848144531, + "learning_rate": 3.2116155229488404e-05, + "loss": 1.0547, + "step": 27290 + }, + { + "epoch": 8.17, + "grad_norm": 1.412688136100769, + "learning_rate": 3.211052286431759e-05, + "loss": 1.2792, + "step": 27295 + }, + { + "epoch": 8.17, + "grad_norm": 4.9000654220581055, + "learning_rate": 3.210489010643998e-05, + "loss": 1.2015, + "step": 27300 + }, + { + "epoch": 8.17, + "grad_norm": 0.983841061592102, + "learning_rate": 3.2099256956166684e-05, + "loss": 1.022, + "step": 27305 + }, + { + "epoch": 8.17, + "grad_norm": 2.0872743129730225, + "learning_rate": 3.20936234138088e-05, + "loss": 0.9733, + "step": 27310 + }, + { + "epoch": 8.17, + "grad_norm": 2.8705453872680664, + "learning_rate": 3.208798947967748e-05, + "loss": 1.1142, + "step": 27315 + }, + { + "epoch": 8.17, + "grad_norm": 2.3172919750213623, + "learning_rate": 3.208235515408385e-05, + "loss": 1.217, + "step": 27320 + }, + { + "epoch": 8.18, + "grad_norm": 2.136756658554077, + "learning_rate": 3.207672043733912e-05, + "loss": 1.1509, + "step": 27325 + }, + { + "epoch": 8.18, + "grad_norm": 1.0697053670883179, + "learning_rate": 3.207108532975447e-05, + "loss": 0.9292, + "step": 27330 + }, + { + "epoch": 8.18, + "grad_norm": 2.2047574520111084, + "learning_rate": 3.206544983164113e-05, + "loss": 1.1085, + "step": 27335 + }, + { + "epoch": 8.18, + "grad_norm": 1.8211325407028198, + "learning_rate": 3.205981394331035e-05, + "loss": 1.1218, + "step": 27340 + }, + { + "epoch": 8.18, + "grad_norm": 10.550779342651367, + "learning_rate": 3.205417766507336e-05, + "loss": 1.009, + "step": 27345 + }, + { + "epoch": 8.18, + "grad_norm": 1.852550745010376, + "learning_rate": 3.204854099724148e-05, + "loss": 1.0607, + "step": 27350 + }, + { + "epoch": 8.18, + "grad_norm": 2.7020022869110107, + "learning_rate": 3.2042903940126015e-05, + "loss": 1.0989, + "step": 27355 + }, + { + "epoch": 8.19, + "grad_norm": 1.7700706720352173, + "learning_rate": 3.203726649403828e-05, + "loss": 1.0878, + "step": 27360 + }, + { + "epoch": 8.19, + "grad_norm": 6.152558326721191, + "learning_rate": 3.203162865928963e-05, + "loss": 1.1536, + "step": 27365 + }, + { + "epoch": 8.19, + "grad_norm": 1.028457522392273, + "learning_rate": 3.202599043619145e-05, + "loss": 1.3372, + "step": 27370 + }, + { + "epoch": 8.19, + "grad_norm": 2.56754994392395, + "learning_rate": 3.2020351825055114e-05, + "loss": 1.1325, + "step": 27375 + }, + { + "epoch": 8.19, + "grad_norm": 8.181985855102539, + "learning_rate": 3.201471282619204e-05, + "loss": 1.1545, + "step": 27380 + }, + { + "epoch": 8.19, + "grad_norm": 0.9413318037986755, + "learning_rate": 3.200907343991367e-05, + "loss": 1.1858, + "step": 27385 + }, + { + "epoch": 8.19, + "grad_norm": 2.4057810306549072, + "learning_rate": 3.2003433666531456e-05, + "loss": 1.0307, + "step": 27390 + }, + { + "epoch": 8.2, + "grad_norm": 1.1477302312850952, + "learning_rate": 3.199779350635688e-05, + "loss": 1.1957, + "step": 27395 + }, + { + "epoch": 8.2, + "grad_norm": 2.2809982299804688, + "learning_rate": 3.199215295970145e-05, + "loss": 1.2828, + "step": 27400 + }, + { + "epoch": 8.2, + "grad_norm": 10.860541343688965, + "learning_rate": 3.198651202687668e-05, + "loss": 1.0256, + "step": 27405 + }, + { + "epoch": 8.2, + "grad_norm": 2.4457149505615234, + "learning_rate": 3.198087070819411e-05, + "loss": 1.1488, + "step": 27410 + }, + { + "epoch": 8.2, + "grad_norm": 2.509361743927002, + "learning_rate": 3.1975229003965305e-05, + "loss": 0.9908, + "step": 27415 + }, + { + "epoch": 8.2, + "grad_norm": 2.225059747695923, + "learning_rate": 3.1969586914501854e-05, + "loss": 1.1636, + "step": 27420 + }, + { + "epoch": 8.21, + "grad_norm": 4.2079386711120605, + "learning_rate": 3.196394444011536e-05, + "loss": 1.0119, + "step": 27425 + }, + { + "epoch": 8.21, + "grad_norm": 3.070852041244507, + "learning_rate": 3.1958301581117455e-05, + "loss": 1.0048, + "step": 27430 + }, + { + "epoch": 8.21, + "grad_norm": 1.48086416721344, + "learning_rate": 3.195265833781979e-05, + "loss": 0.8762, + "step": 27435 + }, + { + "epoch": 8.21, + "grad_norm": 2.2743024826049805, + "learning_rate": 3.1947014710534024e-05, + "loss": 1.0763, + "step": 27440 + }, + { + "epoch": 8.21, + "grad_norm": 1.9955936670303345, + "learning_rate": 3.194137069957186e-05, + "loss": 1.1179, + "step": 27445 + }, + { + "epoch": 8.21, + "grad_norm": 1.0980212688446045, + "learning_rate": 3.1935726305245e-05, + "loss": 1.0278, + "step": 27450 + }, + { + "epoch": 8.21, + "grad_norm": 2.3399531841278076, + "learning_rate": 3.19300815278652e-05, + "loss": 1.0881, + "step": 27455 + }, + { + "epoch": 8.22, + "grad_norm": 2.5593791007995605, + "learning_rate": 3.192443636774419e-05, + "loss": 1.003, + "step": 27460 + }, + { + "epoch": 8.22, + "grad_norm": 2.6024117469787598, + "learning_rate": 3.1918790825193764e-05, + "loss": 0.9777, + "step": 27465 + }, + { + "epoch": 8.22, + "grad_norm": 3.8112480640411377, + "learning_rate": 3.191314490052572e-05, + "loss": 1.1839, + "step": 27470 + }, + { + "epoch": 8.22, + "grad_norm": 3.738812208175659, + "learning_rate": 3.190749859405185e-05, + "loss": 0.9292, + "step": 27475 + }, + { + "epoch": 8.22, + "grad_norm": 1.4232354164123535, + "learning_rate": 3.1901851906084025e-05, + "loss": 1.1799, + "step": 27480 + }, + { + "epoch": 8.22, + "grad_norm": 2.850757122039795, + "learning_rate": 3.189620483693409e-05, + "loss": 1.1459, + "step": 27485 + }, + { + "epoch": 8.22, + "grad_norm": 9.836990356445312, + "learning_rate": 3.189055738691393e-05, + "loss": 1.2023, + "step": 27490 + }, + { + "epoch": 8.23, + "grad_norm": 2.8823812007904053, + "learning_rate": 3.188490955633545e-05, + "loss": 1.1401, + "step": 27495 + }, + { + "epoch": 8.23, + "grad_norm": 4.259209632873535, + "learning_rate": 3.187926134551057e-05, + "loss": 1.0947, + "step": 27500 + }, + { + "epoch": 8.23, + "grad_norm": 1.9353116750717163, + "learning_rate": 3.1873612754751234e-05, + "loss": 0.9909, + "step": 27505 + }, + { + "epoch": 8.23, + "grad_norm": 1.2285420894622803, + "learning_rate": 3.1867963784369415e-05, + "loss": 1.1093, + "step": 27510 + }, + { + "epoch": 8.23, + "grad_norm": 2.983264446258545, + "learning_rate": 3.186231443467709e-05, + "loss": 0.963, + "step": 27515 + }, + { + "epoch": 8.23, + "grad_norm": 2.1744792461395264, + "learning_rate": 3.185666470598627e-05, + "loss": 1.1857, + "step": 27520 + }, + { + "epoch": 8.24, + "grad_norm": 6.445066928863525, + "learning_rate": 3.1851014598608994e-05, + "loss": 1.0607, + "step": 27525 + }, + { + "epoch": 8.24, + "grad_norm": 3.2869045734405518, + "learning_rate": 3.1845364112857294e-05, + "loss": 1.0861, + "step": 27530 + }, + { + "epoch": 8.24, + "grad_norm": 2.0690512657165527, + "learning_rate": 3.183971324904325e-05, + "loss": 1.1103, + "step": 27535 + }, + { + "epoch": 8.24, + "grad_norm": 3.5820834636688232, + "learning_rate": 3.183406200747896e-05, + "loss": 1.1151, + "step": 27540 + }, + { + "epoch": 8.24, + "grad_norm": 3.357940435409546, + "learning_rate": 3.1828410388476526e-05, + "loss": 0.9717, + "step": 27545 + }, + { + "epoch": 8.24, + "grad_norm": 1.1148250102996826, + "learning_rate": 3.182275839234808e-05, + "loss": 1.188, + "step": 27550 + }, + { + "epoch": 8.24, + "grad_norm": 2.7538514137268066, + "learning_rate": 3.181710601940578e-05, + "loss": 1.0378, + "step": 27555 + }, + { + "epoch": 8.25, + "grad_norm": 1.7496304512023926, + "learning_rate": 3.1811453269961804e-05, + "loss": 1.0702, + "step": 27560 + }, + { + "epoch": 8.25, + "grad_norm": 1.491807460784912, + "learning_rate": 3.180580014432835e-05, + "loss": 1.1796, + "step": 27565 + }, + { + "epoch": 8.25, + "grad_norm": 1.2797476053237915, + "learning_rate": 3.180014664281762e-05, + "loss": 0.8992, + "step": 27570 + }, + { + "epoch": 8.25, + "grad_norm": 2.5790648460388184, + "learning_rate": 3.179449276574186e-05, + "loss": 1.1928, + "step": 27575 + }, + { + "epoch": 8.25, + "grad_norm": 3.154162883758545, + "learning_rate": 3.178883851341333e-05, + "loss": 1.176, + "step": 27580 + }, + { + "epoch": 8.25, + "grad_norm": 1.5420721769332886, + "learning_rate": 3.1783183886144305e-05, + "loss": 1.162, + "step": 27585 + }, + { + "epoch": 8.25, + "grad_norm": 2.776522159576416, + "learning_rate": 3.177752888424708e-05, + "loss": 1.0938, + "step": 27590 + }, + { + "epoch": 8.26, + "grad_norm": 2.355360507965088, + "learning_rate": 3.177187350803398e-05, + "loss": 0.9884, + "step": 27595 + }, + { + "epoch": 8.26, + "grad_norm": 1.3630084991455078, + "learning_rate": 3.176621775781736e-05, + "loss": 1.1724, + "step": 27600 + }, + { + "epoch": 8.26, + "grad_norm": 2.4475739002227783, + "learning_rate": 3.1760561633909546e-05, + "loss": 1.1558, + "step": 27605 + }, + { + "epoch": 8.26, + "grad_norm": 3.5432145595550537, + "learning_rate": 3.175490513662295e-05, + "loss": 0.9509, + "step": 27610 + }, + { + "epoch": 8.26, + "grad_norm": 2.6473562717437744, + "learning_rate": 3.1749248266269966e-05, + "loss": 1.0777, + "step": 27615 + }, + { + "epoch": 8.26, + "grad_norm": 2.7413153648376465, + "learning_rate": 3.174359102316301e-05, + "loss": 1.2854, + "step": 27620 + }, + { + "epoch": 8.27, + "grad_norm": 2.960883378982544, + "learning_rate": 3.173793340761453e-05, + "loss": 1.1436, + "step": 27625 + }, + { + "epoch": 8.27, + "grad_norm": 3.2864279747009277, + "learning_rate": 3.1732275419937e-05, + "loss": 1.1531, + "step": 27630 + }, + { + "epoch": 8.27, + "grad_norm": 2.264131546020508, + "learning_rate": 3.1726617060442884e-05, + "loss": 1.1501, + "step": 27635 + }, + { + "epoch": 8.27, + "grad_norm": 4.641735553741455, + "learning_rate": 3.172095832944472e-05, + "loss": 1.1033, + "step": 27640 + }, + { + "epoch": 8.27, + "grad_norm": 2.3437931537628174, + "learning_rate": 3.1715299227255e-05, + "loss": 1.1294, + "step": 27645 + }, + { + "epoch": 8.27, + "grad_norm": 4.831884860992432, + "learning_rate": 3.170963975418628e-05, + "loss": 1.0589, + "step": 27650 + }, + { + "epoch": 8.27, + "grad_norm": 5.432398796081543, + "learning_rate": 3.170397991055114e-05, + "loss": 0.97, + "step": 27655 + }, + { + "epoch": 8.28, + "grad_norm": 2.6022696495056152, + "learning_rate": 3.1698319696662156e-05, + "loss": 1.1821, + "step": 27660 + }, + { + "epoch": 8.28, + "grad_norm": 2.880967378616333, + "learning_rate": 3.1692659112831934e-05, + "loss": 0.9731, + "step": 27665 + }, + { + "epoch": 8.28, + "grad_norm": 1.824782371520996, + "learning_rate": 3.16869981593731e-05, + "loss": 1.1546, + "step": 27670 + }, + { + "epoch": 8.28, + "grad_norm": 1.6104397773742676, + "learning_rate": 3.168133683659832e-05, + "loss": 1.0998, + "step": 27675 + }, + { + "epoch": 8.28, + "grad_norm": 3.1559090614318848, + "learning_rate": 3.167567514482025e-05, + "loss": 1.1066, + "step": 27680 + }, + { + "epoch": 8.28, + "grad_norm": 1.1295137405395508, + "learning_rate": 3.1670013084351575e-05, + "loss": 1.2197, + "step": 27685 + }, + { + "epoch": 8.28, + "grad_norm": 3.0015058517456055, + "learning_rate": 3.166435065550501e-05, + "loss": 1.1269, + "step": 27690 + }, + { + "epoch": 8.29, + "grad_norm": 4.1721391677856445, + "learning_rate": 3.1658687858593294e-05, + "loss": 0.992, + "step": 27695 + }, + { + "epoch": 8.29, + "grad_norm": 2.583388328552246, + "learning_rate": 3.165302469392917e-05, + "loss": 1.0925, + "step": 27700 + }, + { + "epoch": 8.29, + "grad_norm": 8.460959434509277, + "learning_rate": 3.16473611618254e-05, + "loss": 1.0993, + "step": 27705 + }, + { + "epoch": 8.29, + "grad_norm": 5.1278767585754395, + "learning_rate": 3.164169726259477e-05, + "loss": 1.0187, + "step": 27710 + }, + { + "epoch": 8.29, + "grad_norm": 2.619326591491699, + "learning_rate": 3.163603299655012e-05, + "loss": 0.9933, + "step": 27715 + }, + { + "epoch": 8.29, + "grad_norm": 2.9472780227661133, + "learning_rate": 3.1630368364004264e-05, + "loss": 1.1644, + "step": 27720 + }, + { + "epoch": 8.29, + "grad_norm": 1.8340462446212769, + "learning_rate": 3.162470336527006e-05, + "loss": 1.2497, + "step": 27725 + }, + { + "epoch": 8.3, + "grad_norm": 4.899162292480469, + "learning_rate": 3.161903800066037e-05, + "loss": 1.0712, + "step": 27730 + }, + { + "epoch": 8.3, + "grad_norm": 3.0266971588134766, + "learning_rate": 3.161337227048809e-05, + "loss": 1.1594, + "step": 27735 + }, + { + "epoch": 8.3, + "grad_norm": 1.212259292602539, + "learning_rate": 3.1607706175066134e-05, + "loss": 1.0818, + "step": 27740 + }, + { + "epoch": 8.3, + "grad_norm": 2.2834479808807373, + "learning_rate": 3.1602039714707434e-05, + "loss": 1.0276, + "step": 27745 + }, + { + "epoch": 8.3, + "grad_norm": 1.508610486984253, + "learning_rate": 3.159637288972494e-05, + "loss": 1.1272, + "step": 27750 + }, + { + "epoch": 8.3, + "grad_norm": 1.2361581325531006, + "learning_rate": 3.159070570043163e-05, + "loss": 1.0414, + "step": 27755 + }, + { + "epoch": 8.31, + "grad_norm": 2.1319968700408936, + "learning_rate": 3.15850381471405e-05, + "loss": 1.052, + "step": 27760 + }, + { + "epoch": 8.31, + "grad_norm": 3.3206193447113037, + "learning_rate": 3.157937023016456e-05, + "loss": 1.0506, + "step": 27765 + }, + { + "epoch": 8.31, + "grad_norm": 3.340884208679199, + "learning_rate": 3.157370194981683e-05, + "loss": 1.1607, + "step": 27770 + }, + { + "epoch": 8.31, + "grad_norm": 12.616667747497559, + "learning_rate": 3.156803330641038e-05, + "loss": 1.2882, + "step": 27775 + }, + { + "epoch": 8.31, + "grad_norm": 3.9304230213165283, + "learning_rate": 3.1562364300258275e-05, + "loss": 1.302, + "step": 27780 + }, + { + "epoch": 8.31, + "grad_norm": 3.3284659385681152, + "learning_rate": 3.1556694931673615e-05, + "loss": 1.0514, + "step": 27785 + }, + { + "epoch": 8.31, + "grad_norm": 2.4775948524475098, + "learning_rate": 3.155102520096951e-05, + "loss": 1.0072, + "step": 27790 + }, + { + "epoch": 8.32, + "grad_norm": 2.661914110183716, + "learning_rate": 3.154535510845909e-05, + "loss": 1.1323, + "step": 27795 + }, + { + "epoch": 8.32, + "grad_norm": 3.137718677520752, + "learning_rate": 3.15396846544555e-05, + "loss": 1.248, + "step": 27800 + }, + { + "epoch": 8.32, + "grad_norm": 1.339923620223999, + "learning_rate": 3.153401383927194e-05, + "loss": 1.0493, + "step": 27805 + }, + { + "epoch": 8.32, + "grad_norm": 1.203647494316101, + "learning_rate": 3.1528342663221576e-05, + "loss": 0.9536, + "step": 27810 + }, + { + "epoch": 8.32, + "grad_norm": 1.4011954069137573, + "learning_rate": 3.152267112661764e-05, + "loss": 1.257, + "step": 27815 + }, + { + "epoch": 8.32, + "grad_norm": 3.3855531215667725, + "learning_rate": 3.151699922977336e-05, + "loss": 1.1184, + "step": 27820 + }, + { + "epoch": 8.32, + "grad_norm": 2.566340208053589, + "learning_rate": 3.151132697300199e-05, + "loss": 1.0047, + "step": 27825 + }, + { + "epoch": 8.33, + "grad_norm": 2.9394350051879883, + "learning_rate": 3.150565435661679e-05, + "loss": 1.244, + "step": 27830 + }, + { + "epoch": 8.33, + "grad_norm": 3.349531412124634, + "learning_rate": 3.149998138093107e-05, + "loss": 1.1824, + "step": 27835 + }, + { + "epoch": 8.33, + "grad_norm": 1.7213515043258667, + "learning_rate": 3.149430804625812e-05, + "loss": 1.1984, + "step": 27840 + }, + { + "epoch": 8.33, + "grad_norm": 1.7168335914611816, + "learning_rate": 3.148863435291129e-05, + "loss": 1.1489, + "step": 27845 + }, + { + "epoch": 8.33, + "grad_norm": 2.286252975463867, + "learning_rate": 3.148296030120394e-05, + "loss": 1.2274, + "step": 27850 + }, + { + "epoch": 8.33, + "grad_norm": 1.7521454095840454, + "learning_rate": 3.1477285891449434e-05, + "loss": 1.1893, + "step": 27855 + }, + { + "epoch": 8.34, + "grad_norm": 3.142904281616211, + "learning_rate": 3.147161112396115e-05, + "loss": 1.0052, + "step": 27860 + }, + { + "epoch": 8.34, + "grad_norm": 2.172449827194214, + "learning_rate": 3.1465935999052514e-05, + "loss": 1.0957, + "step": 27865 + }, + { + "epoch": 8.34, + "grad_norm": 2.116454601287842, + "learning_rate": 3.146026051703695e-05, + "loss": 1.0053, + "step": 27870 + }, + { + "epoch": 8.34, + "grad_norm": 0.8631601333618164, + "learning_rate": 3.145458467822792e-05, + "loss": 0.9305, + "step": 27875 + }, + { + "epoch": 8.34, + "grad_norm": 2.0894715785980225, + "learning_rate": 3.144890848293889e-05, + "loss": 1.1072, + "step": 27880 + }, + { + "epoch": 8.34, + "grad_norm": 4.275918006896973, + "learning_rate": 3.1443231931483334e-05, + "loss": 1.0385, + "step": 27885 + }, + { + "epoch": 8.34, + "grad_norm": 2.0376479625701904, + "learning_rate": 3.143755502417478e-05, + "loss": 1.3225, + "step": 27890 + }, + { + "epoch": 8.35, + "grad_norm": 2.2488062381744385, + "learning_rate": 3.143187776132676e-05, + "loss": 1.1994, + "step": 27895 + }, + { + "epoch": 8.35, + "grad_norm": 2.3104982376098633, + "learning_rate": 3.1426200143252815e-05, + "loss": 1.0693, + "step": 27900 + }, + { + "epoch": 8.35, + "grad_norm": 2.0975565910339355, + "learning_rate": 3.142052217026651e-05, + "loss": 1.0613, + "step": 27905 + }, + { + "epoch": 8.35, + "grad_norm": 3.1652896404266357, + "learning_rate": 3.1414843842681455e-05, + "loss": 1.1656, + "step": 27910 + }, + { + "epoch": 8.35, + "grad_norm": 2.0645837783813477, + "learning_rate": 3.1409165160811226e-05, + "loss": 1.1103, + "step": 27915 + }, + { + "epoch": 8.35, + "grad_norm": 2.3577146530151367, + "learning_rate": 3.140348612496947e-05, + "loss": 1.0475, + "step": 27920 + }, + { + "epoch": 8.35, + "grad_norm": 3.1144940853118896, + "learning_rate": 3.1398942641647336e-05, + "loss": 1.3145, + "step": 27925 + }, + { + "epoch": 8.36, + "grad_norm": 2.6503536701202393, + "learning_rate": 3.139326296944723e-05, + "loss": 1.0987, + "step": 27930 + }, + { + "epoch": 8.36, + "grad_norm": 1.5339775085449219, + "learning_rate": 3.138758294415386e-05, + "loss": 1.0772, + "step": 27935 + }, + { + "epoch": 8.36, + "grad_norm": 2.08730149269104, + "learning_rate": 3.138190256608093e-05, + "loss": 1.1344, + "step": 27940 + }, + { + "epoch": 8.36, + "grad_norm": 1.825244426727295, + "learning_rate": 3.137622183554215e-05, + "loss": 1.2571, + "step": 27945 + }, + { + "epoch": 8.36, + "grad_norm": 2.94596004486084, + "learning_rate": 3.137054075285126e-05, + "loss": 1.1669, + "step": 27950 + }, + { + "epoch": 8.36, + "grad_norm": 1.1492950916290283, + "learning_rate": 3.1364859318322025e-05, + "loss": 1.2209, + "step": 27955 + }, + { + "epoch": 8.37, + "grad_norm": 5.7294769287109375, + "learning_rate": 3.135917753226823e-05, + "loss": 1.0503, + "step": 27960 + }, + { + "epoch": 8.37, + "grad_norm": 1.8514894247055054, + "learning_rate": 3.1353495395003675e-05, + "loss": 1.0441, + "step": 27965 + }, + { + "epoch": 8.37, + "grad_norm": 1.6761661767959595, + "learning_rate": 3.134781290684216e-05, + "loss": 0.9575, + "step": 27970 + }, + { + "epoch": 8.37, + "grad_norm": 2.4914565086364746, + "learning_rate": 3.134213006809755e-05, + "loss": 1.2053, + "step": 27975 + }, + { + "epoch": 8.37, + "grad_norm": 1.1112005710601807, + "learning_rate": 3.133644687908368e-05, + "loss": 1.1066, + "step": 27980 + }, + { + "epoch": 8.37, + "grad_norm": 5.55382776260376, + "learning_rate": 3.133076334011443e-05, + "loss": 0.8715, + "step": 27985 + }, + { + "epoch": 8.37, + "grad_norm": 3.3123645782470703, + "learning_rate": 3.1325079451503715e-05, + "loss": 1.0493, + "step": 27990 + }, + { + "epoch": 8.38, + "grad_norm": 29.16041374206543, + "learning_rate": 3.131939521356543e-05, + "loss": 1.0889, + "step": 27995 + }, + { + "epoch": 8.38, + "grad_norm": 2.6477432250976562, + "learning_rate": 3.131371062661351e-05, + "loss": 0.995, + "step": 28000 + }, + { + "epoch": 8.38, + "grad_norm": 1.398253321647644, + "learning_rate": 3.130802569096194e-05, + "loss": 1.0378, + "step": 28005 + }, + { + "epoch": 8.38, + "grad_norm": 1.7362818717956543, + "learning_rate": 3.130234040692464e-05, + "loss": 1.0509, + "step": 28010 + }, + { + "epoch": 8.38, + "grad_norm": 2.3737027645111084, + "learning_rate": 3.129665477481564e-05, + "loss": 1.1996, + "step": 28015 + }, + { + "epoch": 8.38, + "grad_norm": 2.6281936168670654, + "learning_rate": 3.129096879494894e-05, + "loss": 0.9499, + "step": 28020 + }, + { + "epoch": 8.38, + "grad_norm": 5.763638019561768, + "learning_rate": 3.1285282467638577e-05, + "loss": 1.1019, + "step": 28025 + }, + { + "epoch": 8.39, + "grad_norm": 3.136121988296509, + "learning_rate": 3.1279595793198593e-05, + "loss": 1.084, + "step": 28030 + }, + { + "epoch": 8.39, + "grad_norm": 1.8044530153274536, + "learning_rate": 3.1273908771943064e-05, + "loss": 1.2044, + "step": 28035 + }, + { + "epoch": 8.39, + "grad_norm": 4.135902404785156, + "learning_rate": 3.126822140418607e-05, + "loss": 1.086, + "step": 28040 + }, + { + "epoch": 8.39, + "grad_norm": 2.257347822189331, + "learning_rate": 3.1262533690241726e-05, + "loss": 1.2456, + "step": 28045 + }, + { + "epoch": 8.39, + "grad_norm": 3.2046022415161133, + "learning_rate": 3.1256845630424144e-05, + "loss": 1.214, + "step": 28050 + }, + { + "epoch": 8.39, + "grad_norm": 3.7938222885131836, + "learning_rate": 3.125115722504749e-05, + "loss": 0.989, + "step": 28055 + }, + { + "epoch": 8.4, + "grad_norm": 3.6777851581573486, + "learning_rate": 3.124546847442593e-05, + "loss": 1.0389, + "step": 28060 + }, + { + "epoch": 8.4, + "grad_norm": 2.9035661220550537, + "learning_rate": 3.123977937887363e-05, + "loss": 0.9988, + "step": 28065 + }, + { + "epoch": 8.4, + "grad_norm": 1.5526783466339111, + "learning_rate": 3.1234089938704805e-05, + "loss": 1.1668, + "step": 28070 + }, + { + "epoch": 8.4, + "grad_norm": 3.05960750579834, + "learning_rate": 3.122840015423367e-05, + "loss": 1.1137, + "step": 28075 + }, + { + "epoch": 8.4, + "grad_norm": 2.577622413635254, + "learning_rate": 3.122271002577446e-05, + "loss": 1.0632, + "step": 28080 + }, + { + "epoch": 8.4, + "grad_norm": 2.9090895652770996, + "learning_rate": 3.121701955364146e-05, + "loss": 1.1106, + "step": 28085 + }, + { + "epoch": 8.4, + "grad_norm": 1.0737097263336182, + "learning_rate": 3.121132873814892e-05, + "loss": 1.1976, + "step": 28090 + }, + { + "epoch": 8.41, + "grad_norm": 5.77609920501709, + "learning_rate": 3.1205637579611154e-05, + "loss": 1.2236, + "step": 28095 + }, + { + "epoch": 8.41, + "grad_norm": 6.517035961151123, + "learning_rate": 3.119994607834248e-05, + "loss": 0.9589, + "step": 28100 + }, + { + "epoch": 8.41, + "grad_norm": 3.637007236480713, + "learning_rate": 3.1194254234657225e-05, + "loss": 1.1054, + "step": 28105 + }, + { + "epoch": 8.41, + "grad_norm": 5.6374359130859375, + "learning_rate": 3.118856204886974e-05, + "loss": 1.1144, + "step": 28110 + }, + { + "epoch": 8.41, + "grad_norm": 1.1041793823242188, + "learning_rate": 3.118286952129441e-05, + "loss": 1.0831, + "step": 28115 + }, + { + "epoch": 8.41, + "grad_norm": 4.296421051025391, + "learning_rate": 3.117717665224562e-05, + "loss": 1.1175, + "step": 28120 + }, + { + "epoch": 8.41, + "grad_norm": 1.8130741119384766, + "learning_rate": 3.117148344203779e-05, + "loss": 1.14, + "step": 28125 + }, + { + "epoch": 8.42, + "grad_norm": 5.4478864669799805, + "learning_rate": 3.116578989098534e-05, + "loss": 1.1562, + "step": 28130 + }, + { + "epoch": 8.42, + "grad_norm": 3.172430992126465, + "learning_rate": 3.116009599940273e-05, + "loss": 1.2238, + "step": 28135 + }, + { + "epoch": 8.42, + "grad_norm": 1.4552370309829712, + "learning_rate": 3.1154401767604415e-05, + "loss": 1.2188, + "step": 28140 + }, + { + "epoch": 8.42, + "grad_norm": 2.109438419342041, + "learning_rate": 3.114870719590489e-05, + "loss": 1.0155, + "step": 28145 + }, + { + "epoch": 8.42, + "grad_norm": 1.051721215248108, + "learning_rate": 3.114301228461866e-05, + "loss": 1.0963, + "step": 28150 + }, + { + "epoch": 8.42, + "grad_norm": 1.4635205268859863, + "learning_rate": 3.1137317034060236e-05, + "loss": 1.0117, + "step": 28155 + }, + { + "epoch": 8.43, + "grad_norm": 2.354686975479126, + "learning_rate": 3.113162144454418e-05, + "loss": 1.2232, + "step": 28160 + }, + { + "epoch": 8.43, + "grad_norm": 2.211465358734131, + "learning_rate": 3.112592551638505e-05, + "loss": 1.1567, + "step": 28165 + }, + { + "epoch": 8.43, + "grad_norm": 2.628493070602417, + "learning_rate": 3.112022924989741e-05, + "loss": 1.0378, + "step": 28170 + }, + { + "epoch": 8.43, + "grad_norm": 11.486804008483887, + "learning_rate": 3.111453264539588e-05, + "loss": 1.0214, + "step": 28175 + }, + { + "epoch": 8.43, + "grad_norm": 2.4132378101348877, + "learning_rate": 3.110883570319507e-05, + "loss": 1.265, + "step": 28180 + }, + { + "epoch": 8.43, + "grad_norm": 3.6286280155181885, + "learning_rate": 3.11031384236096e-05, + "loss": 1.031, + "step": 28185 + }, + { + "epoch": 8.43, + "grad_norm": 2.399932622909546, + "learning_rate": 3.109744080695415e-05, + "loss": 1.0519, + "step": 28190 + }, + { + "epoch": 8.44, + "grad_norm": 2.9167540073394775, + "learning_rate": 3.109174285354338e-05, + "loss": 1.1285, + "step": 28195 + }, + { + "epoch": 8.44, + "grad_norm": 2.6910743713378906, + "learning_rate": 3.1086044563691984e-05, + "loss": 1.2128, + "step": 28200 + }, + { + "epoch": 8.44, + "grad_norm": 2.2151172161102295, + "learning_rate": 3.108034593771467e-05, + "loss": 1.1706, + "step": 28205 + }, + { + "epoch": 8.44, + "grad_norm": 4.152978897094727, + "learning_rate": 3.1074646975926176e-05, + "loss": 1.1396, + "step": 28210 + }, + { + "epoch": 8.44, + "grad_norm": 4.019749164581299, + "learning_rate": 3.106894767864124e-05, + "loss": 1.1841, + "step": 28215 + }, + { + "epoch": 8.44, + "grad_norm": 2.265693187713623, + "learning_rate": 3.106324804617463e-05, + "loss": 1.1932, + "step": 28220 + }, + { + "epoch": 8.44, + "grad_norm": 2.9576539993286133, + "learning_rate": 3.105754807884113e-05, + "loss": 1.1587, + "step": 28225 + }, + { + "epoch": 8.45, + "grad_norm": 3.3422529697418213, + "learning_rate": 3.105184777695555e-05, + "loss": 1.1099, + "step": 28230 + }, + { + "epoch": 8.45, + "grad_norm": 3.2703285217285156, + "learning_rate": 3.104614714083271e-05, + "loss": 1.0236, + "step": 28235 + }, + { + "epoch": 8.45, + "grad_norm": 2.098249912261963, + "learning_rate": 3.1040446170787444e-05, + "loss": 1.0275, + "step": 28240 + }, + { + "epoch": 8.45, + "grad_norm": 3.4206290245056152, + "learning_rate": 3.103474486713462e-05, + "loss": 0.9629, + "step": 28245 + }, + { + "epoch": 8.45, + "grad_norm": 3.716006278991699, + "learning_rate": 3.1029043230189106e-05, + "loss": 1.0353, + "step": 28250 + }, + { + "epoch": 8.45, + "grad_norm": 1.844235897064209, + "learning_rate": 3.10233412602658e-05, + "loss": 1.1328, + "step": 28255 + }, + { + "epoch": 8.46, + "grad_norm": 3.6148314476013184, + "learning_rate": 3.101763895767962e-05, + "loss": 1.0627, + "step": 28260 + }, + { + "epoch": 8.46, + "grad_norm": 2.58854603767395, + "learning_rate": 3.101193632274549e-05, + "loss": 1.0033, + "step": 28265 + }, + { + "epoch": 8.46, + "grad_norm": 1.9010263681411743, + "learning_rate": 3.100623335577837e-05, + "loss": 1.1728, + "step": 28270 + }, + { + "epoch": 8.46, + "grad_norm": 3.03183913230896, + "learning_rate": 3.100053005709323e-05, + "loss": 1.133, + "step": 28275 + }, + { + "epoch": 8.46, + "grad_norm": 2.4932944774627686, + "learning_rate": 3.0994826427005044e-05, + "loss": 1.1185, + "step": 28280 + }, + { + "epoch": 8.46, + "grad_norm": 3.512427568435669, + "learning_rate": 3.098912246582884e-05, + "loss": 1.0649, + "step": 28285 + }, + { + "epoch": 8.46, + "grad_norm": 1.8340367078781128, + "learning_rate": 3.098341817387961e-05, + "loss": 1.048, + "step": 28290 + }, + { + "epoch": 8.47, + "grad_norm": 2.6581850051879883, + "learning_rate": 3.0977713551472424e-05, + "loss": 1.1243, + "step": 28295 + }, + { + "epoch": 8.47, + "grad_norm": 2.2674968242645264, + "learning_rate": 3.097200859892232e-05, + "loss": 1.0713, + "step": 28300 + }, + { + "epoch": 8.47, + "grad_norm": 2.0417089462280273, + "learning_rate": 3.09663033165444e-05, + "loss": 1.142, + "step": 28305 + }, + { + "epoch": 8.47, + "grad_norm": 3.8348848819732666, + "learning_rate": 3.096059770465375e-05, + "loss": 1.0195, + "step": 28310 + }, + { + "epoch": 8.47, + "grad_norm": 1.9417195320129395, + "learning_rate": 3.095489176356548e-05, + "loss": 1.1107, + "step": 28315 + }, + { + "epoch": 8.47, + "grad_norm": 3.2145209312438965, + "learning_rate": 3.094918549359473e-05, + "loss": 1.1599, + "step": 28320 + }, + { + "epoch": 8.47, + "grad_norm": 1.5097711086273193, + "learning_rate": 3.0943478895056645e-05, + "loss": 1.1153, + "step": 28325 + }, + { + "epoch": 8.48, + "grad_norm": 1.1623693704605103, + "learning_rate": 3.09377719682664e-05, + "loss": 1.0584, + "step": 28330 + }, + { + "epoch": 8.48, + "grad_norm": 3.9560139179229736, + "learning_rate": 3.093206471353918e-05, + "loss": 1.0023, + "step": 28335 + }, + { + "epoch": 8.48, + "grad_norm": 2.495631217956543, + "learning_rate": 3.0926357131190196e-05, + "loss": 0.9913, + "step": 28340 + }, + { + "epoch": 8.48, + "grad_norm": 1.4161936044692993, + "learning_rate": 3.092064922153466e-05, + "loss": 1.1667, + "step": 28345 + }, + { + "epoch": 8.48, + "grad_norm": 2.1678714752197266, + "learning_rate": 3.091494098488783e-05, + "loss": 1.1517, + "step": 28350 + }, + { + "epoch": 8.48, + "grad_norm": 1.4280831813812256, + "learning_rate": 3.090923242156496e-05, + "loss": 1.1162, + "step": 28355 + }, + { + "epoch": 8.48, + "grad_norm": 5.203249931335449, + "learning_rate": 3.0903523531881325e-05, + "loss": 1.0205, + "step": 28360 + }, + { + "epoch": 8.49, + "grad_norm": 2.362901449203491, + "learning_rate": 3.0897814316152214e-05, + "loss": 1.095, + "step": 28365 + }, + { + "epoch": 8.49, + "grad_norm": 1.590612769126892, + "learning_rate": 3.089210477469295e-05, + "loss": 1.0452, + "step": 28370 + }, + { + "epoch": 8.49, + "grad_norm": 1.0684711933135986, + "learning_rate": 3.0886394907818864e-05, + "loss": 1.0676, + "step": 28375 + }, + { + "epoch": 8.49, + "grad_norm": 2.432785749435425, + "learning_rate": 3.088068471584531e-05, + "loss": 1.2619, + "step": 28380 + }, + { + "epoch": 8.49, + "grad_norm": 4.381847381591797, + "learning_rate": 3.0874974199087654e-05, + "loss": 0.9273, + "step": 28385 + }, + { + "epoch": 8.49, + "grad_norm": 2.7747817039489746, + "learning_rate": 3.086926335786128e-05, + "loss": 1.1676, + "step": 28390 + }, + { + "epoch": 8.5, + "grad_norm": 2.8812899589538574, + "learning_rate": 3.086355219248158e-05, + "loss": 1.0638, + "step": 28395 + }, + { + "epoch": 8.5, + "grad_norm": 3.0816097259521484, + "learning_rate": 3.0857840703263996e-05, + "loss": 0.9701, + "step": 28400 + }, + { + "epoch": 8.5, + "grad_norm": 3.1254990100860596, + "learning_rate": 3.0852128890523954e-05, + "loss": 1.2058, + "step": 28405 + }, + { + "epoch": 8.5, + "grad_norm": 3.639291286468506, + "learning_rate": 3.084641675457692e-05, + "loss": 1.1903, + "step": 28410 + }, + { + "epoch": 8.5, + "grad_norm": 12.763455390930176, + "learning_rate": 3.0840704295738364e-05, + "loss": 0.9558, + "step": 28415 + }, + { + "epoch": 8.5, + "grad_norm": 2.450136423110962, + "learning_rate": 3.083499151432378e-05, + "loss": 1.1158, + "step": 28420 + }, + { + "epoch": 8.5, + "grad_norm": 2.2202651500701904, + "learning_rate": 3.082927841064869e-05, + "loss": 1.1919, + "step": 28425 + }, + { + "epoch": 8.51, + "grad_norm": 5.485016822814941, + "learning_rate": 3.0823564985028596e-05, + "loss": 1.1658, + "step": 28430 + }, + { + "epoch": 8.51, + "grad_norm": 3.856153964996338, + "learning_rate": 3.081785123777907e-05, + "loss": 0.9867, + "step": 28435 + }, + { + "epoch": 8.51, + "grad_norm": 2.2953991889953613, + "learning_rate": 3.081213716921567e-05, + "loss": 1.1779, + "step": 28440 + }, + { + "epoch": 8.51, + "grad_norm": 2.756497859954834, + "learning_rate": 3.0806422779653974e-05, + "loss": 1.1818, + "step": 28445 + }, + { + "epoch": 8.51, + "grad_norm": 2.887840986251831, + "learning_rate": 3.080070806940958e-05, + "loss": 1.0941, + "step": 28450 + }, + { + "epoch": 8.51, + "grad_norm": 1.7637460231781006, + "learning_rate": 3.0794993038798114e-05, + "loss": 1.1327, + "step": 28455 + }, + { + "epoch": 8.51, + "grad_norm": 2.2904860973358154, + "learning_rate": 3.07892776881352e-05, + "loss": 1.2831, + "step": 28460 + }, + { + "epoch": 8.52, + "grad_norm": 1.0153388977050781, + "learning_rate": 3.07835620177365e-05, + "loss": 1.0142, + "step": 28465 + }, + { + "epoch": 8.52, + "grad_norm": 1.7611110210418701, + "learning_rate": 3.077784602791768e-05, + "loss": 1.2258, + "step": 28470 + }, + { + "epoch": 8.52, + "grad_norm": 2.403919219970703, + "learning_rate": 3.077212971899443e-05, + "loss": 0.9231, + "step": 28475 + }, + { + "epoch": 8.52, + "grad_norm": 4.131344318389893, + "learning_rate": 3.076641309128245e-05, + "loss": 1.0783, + "step": 28480 + }, + { + "epoch": 8.52, + "grad_norm": 3.618623971939087, + "learning_rate": 3.0760696145097477e-05, + "loss": 1.2069, + "step": 28485 + }, + { + "epoch": 8.52, + "grad_norm": 2.4561777114868164, + "learning_rate": 3.0754978880755246e-05, + "loss": 1.0412, + "step": 28490 + }, + { + "epoch": 8.53, + "grad_norm": 2.350400924682617, + "learning_rate": 3.074926129857151e-05, + "loss": 1.1583, + "step": 28495 + }, + { + "epoch": 8.53, + "grad_norm": 1.5594244003295898, + "learning_rate": 3.074354339886204e-05, + "loss": 1.2653, + "step": 28500 + }, + { + "epoch": 8.53, + "grad_norm": 1.28977370262146, + "learning_rate": 3.073782518194265e-05, + "loss": 1.1303, + "step": 28505 + }, + { + "epoch": 8.53, + "grad_norm": 1.747999668121338, + "learning_rate": 3.073210664812913e-05, + "loss": 1.0107, + "step": 28510 + }, + { + "epoch": 8.53, + "grad_norm": 1.435805082321167, + "learning_rate": 3.072638779773732e-05, + "loss": 0.9404, + "step": 28515 + }, + { + "epoch": 8.53, + "grad_norm": 2.7023794651031494, + "learning_rate": 3.0720668631083074e-05, + "loss": 1.1618, + "step": 28520 + }, + { + "epoch": 8.53, + "grad_norm": 2.3820207118988037, + "learning_rate": 3.071494914848224e-05, + "loss": 1.2369, + "step": 28525 + }, + { + "epoch": 8.54, + "grad_norm": 2.2789406776428223, + "learning_rate": 3.07092293502507e-05, + "loss": 1.0973, + "step": 28530 + }, + { + "epoch": 8.54, + "grad_norm": 1.514122724533081, + "learning_rate": 3.0703509236704366e-05, + "loss": 1.0835, + "step": 28535 + }, + { + "epoch": 8.54, + "grad_norm": 2.0251221656799316, + "learning_rate": 3.069778880815914e-05, + "loss": 1.1254, + "step": 28540 + }, + { + "epoch": 8.54, + "grad_norm": 4.081529140472412, + "learning_rate": 3.069206806493095e-05, + "loss": 1.2861, + "step": 28545 + }, + { + "epoch": 8.54, + "grad_norm": 1.8973231315612793, + "learning_rate": 3.068634700733577e-05, + "loss": 1.122, + "step": 28550 + }, + { + "epoch": 8.54, + "grad_norm": 1.6018325090408325, + "learning_rate": 3.068062563568956e-05, + "loss": 1.0091, + "step": 28555 + }, + { + "epoch": 8.54, + "grad_norm": 2.3443856239318848, + "learning_rate": 3.0674903950308295e-05, + "loss": 1.1682, + "step": 28560 + }, + { + "epoch": 8.55, + "grad_norm": 2.2536654472351074, + "learning_rate": 3.0669181951507986e-05, + "loss": 1.0887, + "step": 28565 + }, + { + "epoch": 8.55, + "grad_norm": 2.6237266063690186, + "learning_rate": 3.0663459639604645e-05, + "loss": 1.0353, + "step": 28570 + }, + { + "epoch": 8.55, + "grad_norm": 2.24894118309021, + "learning_rate": 3.065773701491432e-05, + "loss": 1.2106, + "step": 28575 + }, + { + "epoch": 8.55, + "grad_norm": 2.0882694721221924, + "learning_rate": 3.065201407775306e-05, + "loss": 1.1984, + "step": 28580 + }, + { + "epoch": 8.55, + "grad_norm": 2.8938817977905273, + "learning_rate": 3.064629082843693e-05, + "loss": 1.0492, + "step": 28585 + }, + { + "epoch": 8.55, + "grad_norm": 2.2271273136138916, + "learning_rate": 3.064056726728204e-05, + "loss": 1.165, + "step": 28590 + }, + { + "epoch": 8.56, + "grad_norm": 5.680057525634766, + "learning_rate": 3.063484339460447e-05, + "loss": 1.1335, + "step": 28595 + }, + { + "epoch": 8.56, + "grad_norm": 4.835309982299805, + "learning_rate": 3.0629119210720364e-05, + "loss": 1.0781, + "step": 28600 + }, + { + "epoch": 8.56, + "grad_norm": 1.8358601331710815, + "learning_rate": 3.062339471594585e-05, + "loss": 1.0802, + "step": 28605 + }, + { + "epoch": 8.56, + "grad_norm": 2.384139060974121, + "learning_rate": 3.061766991059709e-05, + "loss": 1.0434, + "step": 28610 + }, + { + "epoch": 8.56, + "grad_norm": 1.7726037502288818, + "learning_rate": 3.0611944794990265e-05, + "loss": 0.9551, + "step": 28615 + }, + { + "epoch": 8.56, + "grad_norm": 3.578003168106079, + "learning_rate": 3.060621936944157e-05, + "loss": 0.9481, + "step": 28620 + }, + { + "epoch": 8.56, + "grad_norm": 2.8826417922973633, + "learning_rate": 3.0600493634267196e-05, + "loss": 1.1564, + "step": 28625 + }, + { + "epoch": 8.57, + "grad_norm": 2.272235870361328, + "learning_rate": 3.059476758978338e-05, + "loss": 0.8952, + "step": 28630 + }, + { + "epoch": 8.57, + "grad_norm": 2.82199764251709, + "learning_rate": 3.058904123630636e-05, + "loss": 1.1897, + "step": 28635 + }, + { + "epoch": 8.57, + "grad_norm": 1.5297337770462036, + "learning_rate": 3.0583314574152414e-05, + "loss": 1.1364, + "step": 28640 + }, + { + "epoch": 8.57, + "grad_norm": 2.4493231773376465, + "learning_rate": 3.05775876036378e-05, + "loss": 1.0275, + "step": 28645 + }, + { + "epoch": 8.57, + "grad_norm": 4.320399761199951, + "learning_rate": 3.057186032507883e-05, + "loss": 1.205, + "step": 28650 + }, + { + "epoch": 8.57, + "grad_norm": 4.2380595207214355, + "learning_rate": 3.05661327387918e-05, + "loss": 1.0884, + "step": 28655 + }, + { + "epoch": 8.57, + "grad_norm": 3.2652807235717773, + "learning_rate": 3.056040484509304e-05, + "loss": 0.9676, + "step": 28660 + }, + { + "epoch": 8.58, + "grad_norm": 2.164764404296875, + "learning_rate": 3.0554676644298906e-05, + "loss": 1.1527, + "step": 28665 + }, + { + "epoch": 8.58, + "grad_norm": 3.5430350303649902, + "learning_rate": 3.0548948136725754e-05, + "loss": 1.285, + "step": 28670 + }, + { + "epoch": 8.58, + "grad_norm": 2.4227631092071533, + "learning_rate": 3.0543219322689955e-05, + "loss": 1.0471, + "step": 28675 + }, + { + "epoch": 8.58, + "grad_norm": 2.0381109714508057, + "learning_rate": 3.053749020250792e-05, + "loss": 1.0022, + "step": 28680 + }, + { + "epoch": 8.58, + "grad_norm": 1.9876576662063599, + "learning_rate": 3.0531760776496064e-05, + "loss": 1.0601, + "step": 28685 + }, + { + "epoch": 8.58, + "grad_norm": 2.803956985473633, + "learning_rate": 3.0526031044970806e-05, + "loss": 1.1354, + "step": 28690 + }, + { + "epoch": 8.59, + "grad_norm": 1.6594746112823486, + "learning_rate": 3.05203010082486e-05, + "loss": 1.1666, + "step": 28695 + }, + { + "epoch": 8.59, + "grad_norm": 2.4495763778686523, + "learning_rate": 3.0514570666645896e-05, + "loss": 1.2394, + "step": 28700 + }, + { + "epoch": 8.59, + "grad_norm": 0.8862543106079102, + "learning_rate": 3.0508840020479194e-05, + "loss": 1.1907, + "step": 28705 + }, + { + "epoch": 8.59, + "grad_norm": 3.8315329551696777, + "learning_rate": 3.0503109070064984e-05, + "loss": 1.026, + "step": 28710 + }, + { + "epoch": 8.59, + "grad_norm": 6.697932243347168, + "learning_rate": 3.0497377815719787e-05, + "loss": 1.143, + "step": 28715 + }, + { + "epoch": 8.59, + "grad_norm": 2.7949328422546387, + "learning_rate": 3.049164625776012e-05, + "loss": 1.0389, + "step": 28720 + }, + { + "epoch": 8.59, + "grad_norm": 3.456163167953491, + "learning_rate": 3.048591439650254e-05, + "loss": 0.9912, + "step": 28725 + }, + { + "epoch": 8.6, + "grad_norm": 4.755772113800049, + "learning_rate": 3.048018223226361e-05, + "loss": 1.0972, + "step": 28730 + }, + { + "epoch": 8.6, + "grad_norm": 1.6794426441192627, + "learning_rate": 3.0474449765359908e-05, + "loss": 1.0897, + "step": 28735 + }, + { + "epoch": 8.6, + "grad_norm": 1.9840363264083862, + "learning_rate": 3.0468716996108038e-05, + "loss": 1.0486, + "step": 28740 + }, + { + "epoch": 8.6, + "grad_norm": 5.026776313781738, + "learning_rate": 3.046298392482462e-05, + "loss": 1.0958, + "step": 28745 + }, + { + "epoch": 8.6, + "grad_norm": 3.4160451889038086, + "learning_rate": 3.0457250551826272e-05, + "loss": 1.1688, + "step": 28750 + }, + { + "epoch": 8.6, + "grad_norm": 5.9285054206848145, + "learning_rate": 3.0451516877429648e-05, + "loss": 1.1551, + "step": 28755 + }, + { + "epoch": 8.6, + "grad_norm": 2.421475410461426, + "learning_rate": 3.044578290195141e-05, + "loss": 1.3426, + "step": 28760 + }, + { + "epoch": 8.61, + "grad_norm": 5.284264087677002, + "learning_rate": 3.0440048625708244e-05, + "loss": 1.22, + "step": 28765 + }, + { + "epoch": 8.61, + "grad_norm": 1.7585577964782715, + "learning_rate": 3.0434314049016854e-05, + "loss": 1.3052, + "step": 28770 + }, + { + "epoch": 8.61, + "grad_norm": 2.2892251014709473, + "learning_rate": 3.042857917219394e-05, + "loss": 1.195, + "step": 28775 + }, + { + "epoch": 8.61, + "grad_norm": 2.7813527584075928, + "learning_rate": 3.0422843995556245e-05, + "loss": 1.1471, + "step": 28780 + }, + { + "epoch": 8.61, + "grad_norm": 4.371838569641113, + "learning_rate": 3.041710851942051e-05, + "loss": 1.1609, + "step": 28785 + }, + { + "epoch": 8.61, + "grad_norm": 5.511404991149902, + "learning_rate": 3.0411372744103504e-05, + "loss": 0.9399, + "step": 28790 + }, + { + "epoch": 8.62, + "grad_norm": 6.196451663970947, + "learning_rate": 3.0405636669922004e-05, + "loss": 0.9446, + "step": 28795 + }, + { + "epoch": 8.62, + "grad_norm": 1.4433752298355103, + "learning_rate": 3.0399900297192812e-05, + "loss": 1.1955, + "step": 28800 + }, + { + "epoch": 8.62, + "grad_norm": 2.80629563331604, + "learning_rate": 3.0394163626232742e-05, + "loss": 1.2015, + "step": 28805 + }, + { + "epoch": 8.62, + "grad_norm": 3.159404754638672, + "learning_rate": 3.0388426657358628e-05, + "loss": 1.0804, + "step": 28810 + }, + { + "epoch": 8.62, + "grad_norm": 1.2445366382598877, + "learning_rate": 3.0382689390887297e-05, + "loss": 1.2395, + "step": 28815 + }, + { + "epoch": 8.62, + "grad_norm": 3.2405431270599365, + "learning_rate": 3.0376951827135632e-05, + "loss": 1.0396, + "step": 28820 + }, + { + "epoch": 8.62, + "grad_norm": 3.0723726749420166, + "learning_rate": 3.0371213966420503e-05, + "loss": 1.2086, + "step": 28825 + }, + { + "epoch": 8.63, + "grad_norm": 2.3256523609161377, + "learning_rate": 3.0365475809058814e-05, + "loss": 1.2707, + "step": 28830 + }, + { + "epoch": 8.63, + "grad_norm": 2.9459950923919678, + "learning_rate": 3.0359737355367467e-05, + "loss": 1.1848, + "step": 28835 + }, + { + "epoch": 8.63, + "grad_norm": 2.410904884338379, + "learning_rate": 3.0353998605663403e-05, + "loss": 1.0366, + "step": 28840 + }, + { + "epoch": 8.63, + "grad_norm": 4.633788585662842, + "learning_rate": 3.0348259560263563e-05, + "loss": 1.0301, + "step": 28845 + }, + { + "epoch": 8.63, + "grad_norm": 2.1768550872802734, + "learning_rate": 3.0342520219484903e-05, + "loss": 1.2337, + "step": 28850 + }, + { + "epoch": 8.63, + "grad_norm": 1.6011451482772827, + "learning_rate": 3.033678058364441e-05, + "loss": 1.1315, + "step": 28855 + }, + { + "epoch": 8.63, + "grad_norm": 1.7358301877975464, + "learning_rate": 3.0331040653059063e-05, + "loss": 1.2123, + "step": 28860 + }, + { + "epoch": 8.64, + "grad_norm": 2.091358184814453, + "learning_rate": 3.0325300428045883e-05, + "loss": 1.07, + "step": 28865 + }, + { + "epoch": 8.64, + "grad_norm": 2.0123684406280518, + "learning_rate": 3.0319559908921895e-05, + "loss": 1.1838, + "step": 28870 + }, + { + "epoch": 8.64, + "grad_norm": 2.712315082550049, + "learning_rate": 3.0313819096004154e-05, + "loss": 1.2163, + "step": 28875 + }, + { + "epoch": 8.64, + "grad_norm": 2.8222601413726807, + "learning_rate": 3.03080779896097e-05, + "loss": 1.1945, + "step": 28880 + }, + { + "epoch": 8.64, + "grad_norm": 4.944241046905518, + "learning_rate": 3.0302336590055617e-05, + "loss": 0.891, + "step": 28885 + }, + { + "epoch": 8.64, + "grad_norm": 2.032339572906494, + "learning_rate": 3.0296594897658993e-05, + "loss": 1.2057, + "step": 28890 + }, + { + "epoch": 8.65, + "grad_norm": 2.003133773803711, + "learning_rate": 3.0290852912736944e-05, + "loss": 1.052, + "step": 28895 + }, + { + "epoch": 8.65, + "grad_norm": 2.7884342670440674, + "learning_rate": 3.0285110635606585e-05, + "loss": 1.1927, + "step": 28900 + }, + { + "epoch": 8.65, + "grad_norm": 2.4504990577697754, + "learning_rate": 3.0279368066585056e-05, + "loss": 1.0378, + "step": 28905 + }, + { + "epoch": 8.65, + "grad_norm": 6.284214496612549, + "learning_rate": 3.0273625205989525e-05, + "loss": 1.1471, + "step": 28910 + }, + { + "epoch": 8.65, + "grad_norm": 12.456482887268066, + "learning_rate": 3.0267882054137148e-05, + "loss": 1.0668, + "step": 28915 + }, + { + "epoch": 8.65, + "grad_norm": 2.481710195541382, + "learning_rate": 3.026213861134512e-05, + "loss": 1.2337, + "step": 28920 + }, + { + "epoch": 8.65, + "grad_norm": 7.710996150970459, + "learning_rate": 3.025639487793065e-05, + "loss": 0.9881, + "step": 28925 + }, + { + "epoch": 8.66, + "grad_norm": 2.2037603855133057, + "learning_rate": 3.0250650854210953e-05, + "loss": 1.1093, + "step": 28930 + }, + { + "epoch": 8.66, + "grad_norm": 2.117426872253418, + "learning_rate": 3.0244906540503266e-05, + "loss": 1.1326, + "step": 28935 + }, + { + "epoch": 8.66, + "grad_norm": 3.677184581756592, + "learning_rate": 3.023916193712485e-05, + "loss": 1.1629, + "step": 28940 + }, + { + "epoch": 8.66, + "grad_norm": 3.842292070388794, + "learning_rate": 3.0233417044392953e-05, + "loss": 1.0908, + "step": 28945 + }, + { + "epoch": 8.66, + "grad_norm": 2.213963508605957, + "learning_rate": 3.0227671862624878e-05, + "loss": 1.166, + "step": 28950 + }, + { + "epoch": 8.66, + "grad_norm": 0.9696227312088013, + "learning_rate": 3.0221926392137922e-05, + "loss": 1.1106, + "step": 28955 + }, + { + "epoch": 8.66, + "grad_norm": 1.315886378288269, + "learning_rate": 3.0216180633249396e-05, + "loss": 1.1739, + "step": 28960 + }, + { + "epoch": 8.67, + "grad_norm": 4.016354560852051, + "learning_rate": 3.0210434586276637e-05, + "loss": 0.9315, + "step": 28965 + }, + { + "epoch": 8.67, + "grad_norm": 4.51848030090332, + "learning_rate": 3.0204688251536994e-05, + "loss": 0.9756, + "step": 28970 + }, + { + "epoch": 8.67, + "grad_norm": 3.8415751457214355, + "learning_rate": 3.0198941629347833e-05, + "loss": 1.0718, + "step": 28975 + }, + { + "epoch": 8.67, + "grad_norm": 3.0675783157348633, + "learning_rate": 3.0193194720026524e-05, + "loss": 1.2526, + "step": 28980 + }, + { + "epoch": 8.67, + "grad_norm": 1.8814668655395508, + "learning_rate": 3.0187447523890468e-05, + "loss": 1.1403, + "step": 28985 + }, + { + "epoch": 8.67, + "grad_norm": 3.3150155544281006, + "learning_rate": 3.0181700041257077e-05, + "loss": 1.0186, + "step": 28990 + }, + { + "epoch": 8.67, + "grad_norm": 3.6737823486328125, + "learning_rate": 3.017595227244378e-05, + "loss": 1.0857, + "step": 28995 + }, + { + "epoch": 8.68, + "grad_norm": 1.9824943542480469, + "learning_rate": 3.0170204217768023e-05, + "loss": 1.2406, + "step": 29000 + }, + { + "epoch": 8.68, + "grad_norm": 1.4903932809829712, + "learning_rate": 3.016445587754726e-05, + "loss": 1.3973, + "step": 29005 + }, + { + "epoch": 8.68, + "grad_norm": 3.9678359031677246, + "learning_rate": 3.0158707252098966e-05, + "loss": 1.1738, + "step": 29010 + }, + { + "epoch": 8.68, + "grad_norm": 3.8614068031311035, + "learning_rate": 3.015295834174063e-05, + "loss": 1.123, + "step": 29015 + }, + { + "epoch": 8.68, + "grad_norm": 4.1375555992126465, + "learning_rate": 3.0147209146789762e-05, + "loss": 0.9881, + "step": 29020 + }, + { + "epoch": 8.68, + "grad_norm": 3.8305399417877197, + "learning_rate": 3.014145966756388e-05, + "loss": 0.8714, + "step": 29025 + }, + { + "epoch": 8.69, + "grad_norm": 2.49973464012146, + "learning_rate": 3.013570990438053e-05, + "loss": 1.0092, + "step": 29030 + }, + { + "epoch": 8.69, + "grad_norm": 6.754157543182373, + "learning_rate": 3.0129959857557256e-05, + "loss": 1.1094, + "step": 29035 + }, + { + "epoch": 8.69, + "grad_norm": 3.1229982376098633, + "learning_rate": 3.0124209527411634e-05, + "loss": 1.0827, + "step": 29040 + }, + { + "epoch": 8.69, + "grad_norm": 5.443647384643555, + "learning_rate": 3.0118458914261242e-05, + "loss": 1.0179, + "step": 29045 + }, + { + "epoch": 8.69, + "grad_norm": 3.6279354095458984, + "learning_rate": 3.011270801842369e-05, + "loss": 1.0837, + "step": 29050 + }, + { + "epoch": 8.69, + "grad_norm": 1.7841243743896484, + "learning_rate": 3.0106956840216586e-05, + "loss": 1.2112, + "step": 29055 + }, + { + "epoch": 8.69, + "grad_norm": 1.7607803344726562, + "learning_rate": 3.0101205379957563e-05, + "loss": 1.2424, + "step": 29060 + }, + { + "epoch": 8.7, + "grad_norm": 1.5843974351882935, + "learning_rate": 3.0095453637964272e-05, + "loss": 1.0776, + "step": 29065 + }, + { + "epoch": 8.7, + "grad_norm": 12.485400199890137, + "learning_rate": 3.0089701614554377e-05, + "loss": 1.0256, + "step": 29070 + }, + { + "epoch": 8.7, + "grad_norm": 1.710533618927002, + "learning_rate": 3.008394931004555e-05, + "loss": 1.1107, + "step": 29075 + }, + { + "epoch": 8.7, + "grad_norm": 1.6686811447143555, + "learning_rate": 3.007819672475548e-05, + "loss": 1.1631, + "step": 29080 + }, + { + "epoch": 8.7, + "grad_norm": 4.799095630645752, + "learning_rate": 3.0072443859001893e-05, + "loss": 1.1524, + "step": 29085 + }, + { + "epoch": 8.7, + "grad_norm": 3.90974497795105, + "learning_rate": 3.0066690713102497e-05, + "loss": 1.0848, + "step": 29090 + }, + { + "epoch": 8.7, + "grad_norm": 2.303661346435547, + "learning_rate": 3.006093728737504e-05, + "loss": 1.0947, + "step": 29095 + }, + { + "epoch": 8.71, + "grad_norm": 1.7615519762039185, + "learning_rate": 3.005518358213728e-05, + "loss": 0.9884, + "step": 29100 + }, + { + "epoch": 8.71, + "grad_norm": 3.437652826309204, + "learning_rate": 3.0049429597706987e-05, + "loss": 0.9993, + "step": 29105 + }, + { + "epoch": 8.71, + "grad_norm": 1.8464926481246948, + "learning_rate": 3.0043675334401943e-05, + "loss": 1.3269, + "step": 29110 + }, + { + "epoch": 8.71, + "grad_norm": 2.6147725582122803, + "learning_rate": 3.0037920792539954e-05, + "loss": 1.1523, + "step": 29115 + }, + { + "epoch": 8.71, + "grad_norm": 1.6204800605773926, + "learning_rate": 3.003216597243883e-05, + "loss": 1.0221, + "step": 29120 + }, + { + "epoch": 8.71, + "grad_norm": 2.307574510574341, + "learning_rate": 3.0026410874416416e-05, + "loss": 1.1662, + "step": 29125 + }, + { + "epoch": 8.72, + "grad_norm": 1.2338721752166748, + "learning_rate": 3.0020655498790552e-05, + "loss": 1.1687, + "step": 29130 + }, + { + "epoch": 8.72, + "grad_norm": 1.9817173480987549, + "learning_rate": 3.001489984587911e-05, + "loss": 1.1231, + "step": 29135 + }, + { + "epoch": 8.72, + "grad_norm": 2.5606281757354736, + "learning_rate": 3.000914391599995e-05, + "loss": 1.1578, + "step": 29140 + }, + { + "epoch": 8.72, + "grad_norm": 2.77673602104187, + "learning_rate": 3.0003387709470988e-05, + "loss": 1.1036, + "step": 29145 + }, + { + "epoch": 8.72, + "grad_norm": 1.4961824417114258, + "learning_rate": 2.9997631226610116e-05, + "loss": 1.197, + "step": 29150 + }, + { + "epoch": 8.72, + "grad_norm": 4.214254379272461, + "learning_rate": 2.9991874467735272e-05, + "loss": 1.1423, + "step": 29155 + }, + { + "epoch": 8.72, + "grad_norm": 4.497615337371826, + "learning_rate": 2.998611743316439e-05, + "loss": 1.1546, + "step": 29160 + }, + { + "epoch": 8.73, + "grad_norm": 1.615196943283081, + "learning_rate": 2.998036012321543e-05, + "loss": 1.2521, + "step": 29165 + }, + { + "epoch": 8.73, + "grad_norm": 1.5576528310775757, + "learning_rate": 2.997460253820635e-05, + "loss": 1.2029, + "step": 29170 + }, + { + "epoch": 8.73, + "grad_norm": 1.751317024230957, + "learning_rate": 2.996884467845514e-05, + "loss": 1.2419, + "step": 29175 + }, + { + "epoch": 8.73, + "grad_norm": 4.62571907043457, + "learning_rate": 2.9963086544279807e-05, + "loss": 1.3243, + "step": 29180 + }, + { + "epoch": 8.73, + "grad_norm": 3.067814826965332, + "learning_rate": 2.9957328135998365e-05, + "loss": 1.1107, + "step": 29185 + }, + { + "epoch": 8.73, + "grad_norm": 5.224938869476318, + "learning_rate": 2.9951569453928834e-05, + "loss": 1.2911, + "step": 29190 + }, + { + "epoch": 8.73, + "grad_norm": 3.1084907054901123, + "learning_rate": 2.9945810498389275e-05, + "loss": 1.0711, + "step": 29195 + }, + { + "epoch": 8.74, + "grad_norm": 1.8605875968933105, + "learning_rate": 2.994005126969775e-05, + "loss": 1.1007, + "step": 29200 + }, + { + "epoch": 8.74, + "grad_norm": 1.8477222919464111, + "learning_rate": 2.9934291768172324e-05, + "loss": 1.1518, + "step": 29205 + }, + { + "epoch": 8.74, + "grad_norm": 2.743089199066162, + "learning_rate": 2.9928531994131086e-05, + "loss": 1.1243, + "step": 29210 + }, + { + "epoch": 8.74, + "grad_norm": 3.1125733852386475, + "learning_rate": 2.9922771947892154e-05, + "loss": 1.2135, + "step": 29215 + }, + { + "epoch": 8.74, + "grad_norm": 2.2629575729370117, + "learning_rate": 2.9917011629773643e-05, + "loss": 1.1818, + "step": 29220 + }, + { + "epoch": 8.74, + "grad_norm": 3.8765952587127686, + "learning_rate": 2.991125104009369e-05, + "loss": 1.2061, + "step": 29225 + }, + { + "epoch": 8.75, + "grad_norm": 2.9777157306671143, + "learning_rate": 2.9905490179170446e-05, + "loss": 1.2431, + "step": 29230 + }, + { + "epoch": 8.75, + "grad_norm": 1.9893121719360352, + "learning_rate": 2.9899729047322085e-05, + "loss": 1.066, + "step": 29235 + }, + { + "epoch": 8.75, + "grad_norm": 2.7022969722747803, + "learning_rate": 2.989396764486677e-05, + "loss": 1.1172, + "step": 29240 + }, + { + "epoch": 8.75, + "grad_norm": 3.0575449466705322, + "learning_rate": 2.988820597212272e-05, + "loss": 1.0959, + "step": 29245 + }, + { + "epoch": 8.75, + "grad_norm": 1.885754942893982, + "learning_rate": 2.988244402940813e-05, + "loss": 1.1691, + "step": 29250 + }, + { + "epoch": 8.75, + "grad_norm": 1.6529167890548706, + "learning_rate": 2.9876681817041235e-05, + "loss": 1.2623, + "step": 29255 + }, + { + "epoch": 8.75, + "grad_norm": 1.5923422574996948, + "learning_rate": 2.987091933534027e-05, + "loss": 1.2028, + "step": 29260 + }, + { + "epoch": 8.76, + "grad_norm": 1.1636615991592407, + "learning_rate": 2.9865156584623495e-05, + "loss": 0.9298, + "step": 29265 + }, + { + "epoch": 8.76, + "grad_norm": 2.3403260707855225, + "learning_rate": 2.9859393565209177e-05, + "loss": 1.1567, + "step": 29270 + }, + { + "epoch": 8.76, + "grad_norm": 3.294311761856079, + "learning_rate": 2.98536302774156e-05, + "loss": 0.9932, + "step": 29275 + }, + { + "epoch": 8.76, + "grad_norm": 1.9269850254058838, + "learning_rate": 2.9847866721561075e-05, + "loss": 1.2663, + "step": 29280 + }, + { + "epoch": 8.76, + "grad_norm": 4.520174026489258, + "learning_rate": 2.9842102897963902e-05, + "loss": 1.0276, + "step": 29285 + }, + { + "epoch": 8.76, + "grad_norm": 1.383296012878418, + "learning_rate": 2.9836338806942425e-05, + "loss": 1.1108, + "step": 29290 + }, + { + "epoch": 8.76, + "grad_norm": 2.5761561393737793, + "learning_rate": 2.9830574448814984e-05, + "loss": 1.1263, + "step": 29295 + }, + { + "epoch": 8.77, + "grad_norm": 2.923858404159546, + "learning_rate": 2.9824809823899936e-05, + "loss": 1.1611, + "step": 29300 + }, + { + "epoch": 8.77, + "grad_norm": 1.9109852313995361, + "learning_rate": 2.9819044932515655e-05, + "loss": 1.0511, + "step": 29305 + }, + { + "epoch": 8.77, + "grad_norm": 2.4346346855163574, + "learning_rate": 2.9813279774980524e-05, + "loss": 1.1955, + "step": 29310 + }, + { + "epoch": 8.77, + "grad_norm": 3.335416078567505, + "learning_rate": 2.9807514351612965e-05, + "loss": 1.086, + "step": 29315 + }, + { + "epoch": 8.77, + "grad_norm": 3.1424591541290283, + "learning_rate": 2.9801748662731376e-05, + "loss": 1.2607, + "step": 29320 + }, + { + "epoch": 8.77, + "grad_norm": 3.5147266387939453, + "learning_rate": 2.9795982708654206e-05, + "loss": 1.0876, + "step": 29325 + }, + { + "epoch": 8.78, + "grad_norm": 3.8521475791931152, + "learning_rate": 2.97902164896999e-05, + "loss": 1.1177, + "step": 29330 + }, + { + "epoch": 8.78, + "grad_norm": 4.234015464782715, + "learning_rate": 2.9784450006186914e-05, + "loss": 1.0803, + "step": 29335 + }, + { + "epoch": 8.78, + "grad_norm": 1.5537238121032715, + "learning_rate": 2.9778683258433727e-05, + "loss": 1.1356, + "step": 29340 + }, + { + "epoch": 8.78, + "grad_norm": 3.8383750915527344, + "learning_rate": 2.9772916246758825e-05, + "loss": 1.0342, + "step": 29345 + }, + { + "epoch": 8.78, + "grad_norm": 2.1243529319763184, + "learning_rate": 2.9767148971480725e-05, + "loss": 1.0873, + "step": 29350 + }, + { + "epoch": 8.78, + "grad_norm": 4.195858478546143, + "learning_rate": 2.976138143291794e-05, + "loss": 1.1394, + "step": 29355 + }, + { + "epoch": 8.78, + "grad_norm": 2.7369132041931152, + "learning_rate": 2.9755613631389017e-05, + "loss": 1.0517, + "step": 29360 + }, + { + "epoch": 8.79, + "grad_norm": 1.731953740119934, + "learning_rate": 2.9749845567212487e-05, + "loss": 1.1122, + "step": 29365 + }, + { + "epoch": 8.79, + "grad_norm": 2.9949727058410645, + "learning_rate": 2.9744077240706925e-05, + "loss": 1.192, + "step": 29370 + }, + { + "epoch": 8.79, + "grad_norm": 3.624964714050293, + "learning_rate": 2.9738308652190905e-05, + "loss": 1.1176, + "step": 29375 + }, + { + "epoch": 8.79, + "grad_norm": 1.6444040536880493, + "learning_rate": 2.973253980198303e-05, + "loss": 1.1123, + "step": 29380 + }, + { + "epoch": 8.79, + "grad_norm": 3.1505863666534424, + "learning_rate": 2.9726770690401894e-05, + "loss": 1.1554, + "step": 29385 + }, + { + "epoch": 8.79, + "grad_norm": 2.775284767150879, + "learning_rate": 2.972100131776613e-05, + "loss": 1.0491, + "step": 29390 + }, + { + "epoch": 8.79, + "grad_norm": 2.9408364295959473, + "learning_rate": 2.971523168439437e-05, + "loss": 1.0462, + "step": 29395 + }, + { + "epoch": 8.8, + "grad_norm": 4.636882305145264, + "learning_rate": 2.9709461790605263e-05, + "loss": 1.1156, + "step": 29400 + }, + { + "epoch": 8.8, + "grad_norm": 4.47889518737793, + "learning_rate": 2.970369163671748e-05, + "loss": 1.1295, + "step": 29405 + }, + { + "epoch": 8.8, + "grad_norm": 5.22827672958374, + "learning_rate": 2.9697921223049697e-05, + "loss": 1.0539, + "step": 29410 + }, + { + "epoch": 8.8, + "grad_norm": 2.5013325214385986, + "learning_rate": 2.9692150549920606e-05, + "loss": 1.1867, + "step": 29415 + }, + { + "epoch": 8.8, + "grad_norm": 1.6474865674972534, + "learning_rate": 2.9686379617648917e-05, + "loss": 1.0142, + "step": 29420 + }, + { + "epoch": 8.8, + "grad_norm": 2.7693967819213867, + "learning_rate": 2.9680608426553358e-05, + "loss": 1.0996, + "step": 29425 + }, + { + "epoch": 8.81, + "grad_norm": 3.4689788818359375, + "learning_rate": 2.9674836976952657e-05, + "loss": 1.3735, + "step": 29430 + }, + { + "epoch": 8.81, + "grad_norm": 1.6215572357177734, + "learning_rate": 2.966906526916557e-05, + "loss": 1.1179, + "step": 29435 + }, + { + "epoch": 8.81, + "grad_norm": 2.069563627243042, + "learning_rate": 2.9663293303510857e-05, + "loss": 1.0961, + "step": 29440 + }, + { + "epoch": 8.81, + "grad_norm": 1.665561318397522, + "learning_rate": 2.9657521080307305e-05, + "loss": 1.1638, + "step": 29445 + }, + { + "epoch": 8.81, + "grad_norm": 3.0357062816619873, + "learning_rate": 2.9651748599873708e-05, + "loss": 1.2004, + "step": 29450 + }, + { + "epoch": 8.81, + "grad_norm": 7.12582540512085, + "learning_rate": 2.9645975862528868e-05, + "loss": 1.1028, + "step": 29455 + }, + { + "epoch": 8.81, + "grad_norm": 4.670501232147217, + "learning_rate": 2.9640202868591616e-05, + "loss": 0.9468, + "step": 29460 + }, + { + "epoch": 8.82, + "grad_norm": 4.492547035217285, + "learning_rate": 2.9634429618380775e-05, + "loss": 1.161, + "step": 29465 + }, + { + "epoch": 8.82, + "grad_norm": 4.036949634552002, + "learning_rate": 2.9628656112215202e-05, + "loss": 1.0556, + "step": 29470 + }, + { + "epoch": 8.82, + "grad_norm": 6.104241847991943, + "learning_rate": 2.962288235041377e-05, + "loss": 1.2953, + "step": 29475 + }, + { + "epoch": 8.82, + "grad_norm": 2.0466535091400146, + "learning_rate": 2.9617108333295345e-05, + "loss": 1.2064, + "step": 29480 + }, + { + "epoch": 8.82, + "grad_norm": 3.5240540504455566, + "learning_rate": 2.9612488935986672e-05, + "loss": 1.0582, + "step": 29485 + }, + { + "epoch": 8.82, + "grad_norm": 2.5299673080444336, + "learning_rate": 2.960671446010129e-05, + "loss": 1.1523, + "step": 29490 + }, + { + "epoch": 8.82, + "grad_norm": 2.4263341426849365, + "learning_rate": 2.9600939729791864e-05, + "loss": 1.0886, + "step": 29495 + }, + { + "epoch": 8.83, + "grad_norm": 5.209130764007568, + "learning_rate": 2.9595164745377314e-05, + "loss": 1.0808, + "step": 29500 + }, + { + "epoch": 8.83, + "grad_norm": 2.5315446853637695, + "learning_rate": 2.958938950717659e-05, + "loss": 1.264, + "step": 29505 + }, + { + "epoch": 8.83, + "grad_norm": 2.663262367248535, + "learning_rate": 2.9583614015508666e-05, + "loss": 1.201, + "step": 29510 + }, + { + "epoch": 8.83, + "grad_norm": 1.5915833711624146, + "learning_rate": 2.9577838270692493e-05, + "loss": 1.0883, + "step": 29515 + }, + { + "epoch": 8.83, + "grad_norm": 6.19256067276001, + "learning_rate": 2.9572062273047075e-05, + "loss": 1.1954, + "step": 29520 + }, + { + "epoch": 8.83, + "grad_norm": 2.0903115272521973, + "learning_rate": 2.9566286022891404e-05, + "loss": 1.1074, + "step": 29525 + }, + { + "epoch": 8.84, + "grad_norm": 7.435312271118164, + "learning_rate": 2.9560509520544505e-05, + "loss": 0.967, + "step": 29530 + }, + { + "epoch": 8.84, + "grad_norm": 1.675291657447815, + "learning_rate": 2.9554732766325406e-05, + "loss": 1.0977, + "step": 29535 + }, + { + "epoch": 8.84, + "grad_norm": 2.438756227493286, + "learning_rate": 2.9548955760553155e-05, + "loss": 1.1914, + "step": 29540 + }, + { + "epoch": 8.84, + "grad_norm": 2.381863832473755, + "learning_rate": 2.9543178503546805e-05, + "loss": 1.0097, + "step": 29545 + }, + { + "epoch": 8.84, + "grad_norm": 2.3754689693450928, + "learning_rate": 2.9537400995625426e-05, + "loss": 1.2332, + "step": 29550 + }, + { + "epoch": 8.84, + "grad_norm": 1.7114568948745728, + "learning_rate": 2.9531623237108103e-05, + "loss": 1.1569, + "step": 29555 + }, + { + "epoch": 8.84, + "grad_norm": 2.6389689445495605, + "learning_rate": 2.952584522831394e-05, + "loss": 1.2255, + "step": 29560 + }, + { + "epoch": 8.85, + "grad_norm": 1.7149155139923096, + "learning_rate": 2.9520066969562056e-05, + "loss": 1.0352, + "step": 29565 + }, + { + "epoch": 8.85, + "grad_norm": 4.823055744171143, + "learning_rate": 2.9514288461171557e-05, + "loss": 1.2276, + "step": 29570 + }, + { + "epoch": 8.85, + "grad_norm": 1.9236773252487183, + "learning_rate": 2.9508509703461613e-05, + "loss": 1.3419, + "step": 29575 + }, + { + "epoch": 8.85, + "grad_norm": 0.9075140357017517, + "learning_rate": 2.9502730696751362e-05, + "loss": 0.9838, + "step": 29580 + }, + { + "epoch": 8.85, + "grad_norm": 1.5596219301223755, + "learning_rate": 2.949695144135997e-05, + "loss": 1.1306, + "step": 29585 + }, + { + "epoch": 8.85, + "grad_norm": 4.231356620788574, + "learning_rate": 2.9491171937606628e-05, + "loss": 1.13, + "step": 29590 + }, + { + "epoch": 8.85, + "grad_norm": 1.3872261047363281, + "learning_rate": 2.948539218581053e-05, + "loss": 1.1116, + "step": 29595 + }, + { + "epoch": 8.86, + "grad_norm": 2.4153921604156494, + "learning_rate": 2.9479612186290878e-05, + "loss": 1.1196, + "step": 29600 + }, + { + "epoch": 8.86, + "grad_norm": 2.1678645610809326, + "learning_rate": 2.9473831939366908e-05, + "loss": 1.0154, + "step": 29605 + }, + { + "epoch": 8.86, + "grad_norm": 5.16516637802124, + "learning_rate": 2.9468051445357848e-05, + "loss": 0.9722, + "step": 29610 + }, + { + "epoch": 8.86, + "grad_norm": 1.94868004322052, + "learning_rate": 2.946227070458295e-05, + "loss": 1.1472, + "step": 29615 + }, + { + "epoch": 8.86, + "grad_norm": 1.3372802734375, + "learning_rate": 2.9456489717361478e-05, + "loss": 1.1834, + "step": 29620 + }, + { + "epoch": 8.86, + "grad_norm": 1.819388508796692, + "learning_rate": 2.9450708484012713e-05, + "loss": 1.0856, + "step": 29625 + }, + { + "epoch": 8.86, + "grad_norm": 2.233203649520874, + "learning_rate": 2.9444927004855942e-05, + "loss": 1.2613, + "step": 29630 + }, + { + "epoch": 8.87, + "grad_norm": 1.7675302028656006, + "learning_rate": 2.9439145280210473e-05, + "loss": 1.1527, + "step": 29635 + }, + { + "epoch": 8.87, + "grad_norm": 3.0035910606384277, + "learning_rate": 2.9433363310395634e-05, + "loss": 0.9933, + "step": 29640 + }, + { + "epoch": 8.87, + "grad_norm": 1.98517906665802, + "learning_rate": 2.942758109573074e-05, + "loss": 1.2819, + "step": 29645 + }, + { + "epoch": 8.87, + "grad_norm": 2.4812839031219482, + "learning_rate": 2.9421798636535137e-05, + "loss": 0.9064, + "step": 29650 + }, + { + "epoch": 8.87, + "grad_norm": 3.8419299125671387, + "learning_rate": 2.9416015933128194e-05, + "loss": 1.1614, + "step": 29655 + }, + { + "epoch": 8.87, + "grad_norm": 8.991368293762207, + "learning_rate": 2.941023298582929e-05, + "loss": 1.0835, + "step": 29660 + }, + { + "epoch": 8.88, + "grad_norm": 1.9285551309585571, + "learning_rate": 2.9404449794957795e-05, + "loss": 1.1749, + "step": 29665 + }, + { + "epoch": 8.88, + "grad_norm": 4.323269844055176, + "learning_rate": 2.9398666360833126e-05, + "loss": 1.0456, + "step": 29670 + }, + { + "epoch": 8.88, + "grad_norm": 3.1791858673095703, + "learning_rate": 2.9392882683774674e-05, + "loss": 1.0455, + "step": 29675 + }, + { + "epoch": 8.88, + "grad_norm": 1.171151041984558, + "learning_rate": 2.9387098764101882e-05, + "loss": 1.1266, + "step": 29680 + }, + { + "epoch": 8.88, + "grad_norm": 3.656491994857788, + "learning_rate": 2.938131460213419e-05, + "loss": 1.1155, + "step": 29685 + }, + { + "epoch": 8.88, + "grad_norm": 3.314228057861328, + "learning_rate": 2.937553019819104e-05, + "loss": 1.0785, + "step": 29690 + }, + { + "epoch": 8.88, + "grad_norm": 2.453927516937256, + "learning_rate": 2.936974555259191e-05, + "loss": 1.2502, + "step": 29695 + }, + { + "epoch": 8.89, + "grad_norm": 1.7844200134277344, + "learning_rate": 2.936396066565627e-05, + "loss": 1.2183, + "step": 29700 + }, + { + "epoch": 8.89, + "grad_norm": 3.409106731414795, + "learning_rate": 2.9358175537703624e-05, + "loss": 1.1222, + "step": 29705 + }, + { + "epoch": 8.89, + "grad_norm": 7.029350757598877, + "learning_rate": 2.9352390169053475e-05, + "loss": 1.1221, + "step": 29710 + }, + { + "epoch": 8.89, + "grad_norm": 5.586660861968994, + "learning_rate": 2.9346604560025336e-05, + "loss": 1.1623, + "step": 29715 + }, + { + "epoch": 8.89, + "grad_norm": 2.2339181900024414, + "learning_rate": 2.9340818710938745e-05, + "loss": 1.2503, + "step": 29720 + }, + { + "epoch": 8.89, + "grad_norm": 2.694247245788574, + "learning_rate": 2.9335032622113252e-05, + "loss": 1.1939, + "step": 29725 + }, + { + "epoch": 8.89, + "grad_norm": 1.8695008754730225, + "learning_rate": 2.9329246293868407e-05, + "loss": 1.1074, + "step": 29730 + }, + { + "epoch": 8.9, + "grad_norm": 3.689002275466919, + "learning_rate": 2.9323459726523794e-05, + "loss": 0.9559, + "step": 29735 + }, + { + "epoch": 8.9, + "grad_norm": 2.7362306118011475, + "learning_rate": 2.9317672920398993e-05, + "loss": 1.0817, + "step": 29740 + }, + { + "epoch": 8.9, + "grad_norm": 1.8381929397583008, + "learning_rate": 2.9311885875813604e-05, + "loss": 1.1532, + "step": 29745 + }, + { + "epoch": 8.9, + "grad_norm": 2.022266149520874, + "learning_rate": 2.930609859308724e-05, + "loss": 0.9327, + "step": 29750 + }, + { + "epoch": 8.9, + "grad_norm": 3.2312119007110596, + "learning_rate": 2.9300311072539527e-05, + "loss": 0.9719, + "step": 29755 + }, + { + "epoch": 8.9, + "grad_norm": 7.237270355224609, + "learning_rate": 2.92945233144901e-05, + "loss": 1.172, + "step": 29760 + }, + { + "epoch": 8.91, + "grad_norm": 2.9999465942382812, + "learning_rate": 2.928873531925862e-05, + "loss": 1.0583, + "step": 29765 + }, + { + "epoch": 8.91, + "grad_norm": 3.0304183959960938, + "learning_rate": 2.928294708716475e-05, + "loss": 1.0234, + "step": 29770 + }, + { + "epoch": 8.91, + "grad_norm": 1.2405683994293213, + "learning_rate": 2.927715861852816e-05, + "loss": 0.9056, + "step": 29775 + }, + { + "epoch": 8.91, + "grad_norm": 4.732754230499268, + "learning_rate": 2.9271369913668546e-05, + "loss": 1.2299, + "step": 29780 + }, + { + "epoch": 8.91, + "grad_norm": 2.121570348739624, + "learning_rate": 2.9265580972905603e-05, + "loss": 1.0564, + "step": 29785 + }, + { + "epoch": 8.91, + "grad_norm": 3.0892438888549805, + "learning_rate": 2.9259791796559066e-05, + "loss": 1.0464, + "step": 29790 + }, + { + "epoch": 8.91, + "grad_norm": 2.665210247039795, + "learning_rate": 2.9254002384948655e-05, + "loss": 1.1263, + "step": 29795 + }, + { + "epoch": 8.92, + "grad_norm": 2.6491611003875732, + "learning_rate": 2.9248212738394116e-05, + "loss": 1.0433, + "step": 29800 + }, + { + "epoch": 8.92, + "grad_norm": 4.1141815185546875, + "learning_rate": 2.9242422857215195e-05, + "loss": 1.134, + "step": 29805 + }, + { + "epoch": 8.92, + "grad_norm": 6.5142951011657715, + "learning_rate": 2.9236632741731673e-05, + "loss": 1.2787, + "step": 29810 + }, + { + "epoch": 8.92, + "grad_norm": 2.864144802093506, + "learning_rate": 2.923084239226333e-05, + "loss": 1.0214, + "step": 29815 + }, + { + "epoch": 8.92, + "grad_norm": 6.211786270141602, + "learning_rate": 2.922505180912996e-05, + "loss": 1.0869, + "step": 29820 + }, + { + "epoch": 8.92, + "grad_norm": 4.338846206665039, + "learning_rate": 2.921926099265137e-05, + "loss": 1.1458, + "step": 29825 + }, + { + "epoch": 8.92, + "grad_norm": 4.332965850830078, + "learning_rate": 2.9213469943147374e-05, + "loss": 1.2077, + "step": 29830 + }, + { + "epoch": 8.93, + "grad_norm": 2.471970319747925, + "learning_rate": 2.920767866093782e-05, + "loss": 1.1077, + "step": 29835 + }, + { + "epoch": 8.93, + "grad_norm": 1.8260835409164429, + "learning_rate": 2.920188714634255e-05, + "loss": 1.0752, + "step": 29840 + }, + { + "epoch": 8.93, + "grad_norm": 8.06219482421875, + "learning_rate": 2.919609539968141e-05, + "loss": 1.1127, + "step": 29845 + }, + { + "epoch": 8.93, + "grad_norm": 3.8866000175476074, + "learning_rate": 2.9190303421274288e-05, + "loss": 1.065, + "step": 29850 + }, + { + "epoch": 8.93, + "grad_norm": 3.3143255710601807, + "learning_rate": 2.918451121144107e-05, + "loss": 1.0448, + "step": 29855 + }, + { + "epoch": 8.93, + "grad_norm": 6.668207168579102, + "learning_rate": 2.9178718770501638e-05, + "loss": 1.0846, + "step": 29860 + }, + { + "epoch": 8.94, + "grad_norm": 2.2254040241241455, + "learning_rate": 2.917292609877592e-05, + "loss": 1.1679, + "step": 29865 + }, + { + "epoch": 8.94, + "grad_norm": 2.8223040103912354, + "learning_rate": 2.916713319658383e-05, + "loss": 0.9145, + "step": 29870 + }, + { + "epoch": 8.94, + "grad_norm": 2.584289312362671, + "learning_rate": 2.9161340064245302e-05, + "loss": 1.0576, + "step": 29875 + }, + { + "epoch": 8.94, + "grad_norm": 2.3711581230163574, + "learning_rate": 2.9155546702080282e-05, + "loss": 1.0433, + "step": 29880 + }, + { + "epoch": 8.94, + "grad_norm": 3.263002634048462, + "learning_rate": 2.9149753110408744e-05, + "loss": 1.0901, + "step": 29885 + }, + { + "epoch": 8.94, + "grad_norm": 3.8075244426727295, + "learning_rate": 2.9143959289550653e-05, + "loss": 0.8897, + "step": 29890 + }, + { + "epoch": 8.94, + "grad_norm": 4.947206497192383, + "learning_rate": 2.913816523982601e-05, + "loss": 1.147, + "step": 29895 + }, + { + "epoch": 8.95, + "grad_norm": 1.8365360498428345, + "learning_rate": 2.913237096155479e-05, + "loss": 1.0571, + "step": 29900 + }, + { + "epoch": 8.95, + "grad_norm": 2.5439870357513428, + "learning_rate": 2.9126576455057014e-05, + "loss": 1.062, + "step": 29905 + }, + { + "epoch": 8.95, + "grad_norm": 3.2245960235595703, + "learning_rate": 2.9120781720652713e-05, + "loss": 1.2575, + "step": 29910 + }, + { + "epoch": 8.95, + "grad_norm": 6.086019039154053, + "learning_rate": 2.9114986758661922e-05, + "loss": 1.1212, + "step": 29915 + }, + { + "epoch": 8.95, + "grad_norm": 2.142395496368408, + "learning_rate": 2.9109191569404693e-05, + "loss": 1.0457, + "step": 29920 + }, + { + "epoch": 8.95, + "grad_norm": 2.1010286808013916, + "learning_rate": 2.9104555254582145e-05, + "loss": 1.0763, + "step": 29925 + }, + { + "epoch": 8.95, + "grad_norm": 2.635202407836914, + "learning_rate": 2.909875965705188e-05, + "loss": 1.2506, + "step": 29930 + }, + { + "epoch": 8.96, + "grad_norm": 2.7703280448913574, + "learning_rate": 2.9092963833151388e-05, + "loss": 1.1693, + "step": 29935 + }, + { + "epoch": 8.96, + "grad_norm": 5.17828369140625, + "learning_rate": 2.9087167783200752e-05, + "loss": 1.1341, + "step": 29940 + }, + { + "epoch": 8.96, + "grad_norm": 3.351463556289673, + "learning_rate": 2.908137150752008e-05, + "loss": 1.1165, + "step": 29945 + }, + { + "epoch": 8.96, + "grad_norm": 2.4802517890930176, + "learning_rate": 2.9075575006429524e-05, + "loss": 1.1043, + "step": 29950 + }, + { + "epoch": 8.96, + "grad_norm": 1.3321456909179688, + "learning_rate": 2.9069778280249183e-05, + "loss": 1.0579, + "step": 29955 + }, + { + "epoch": 8.96, + "grad_norm": 5.725347518920898, + "learning_rate": 2.9063981329299216e-05, + "loss": 1.0917, + "step": 29960 + }, + { + "epoch": 8.97, + "grad_norm": 1.3497544527053833, + "learning_rate": 2.905818415389978e-05, + "loss": 1.1703, + "step": 29965 + }, + { + "epoch": 8.97, + "grad_norm": 2.001232385635376, + "learning_rate": 2.9052386754371065e-05, + "loss": 0.9471, + "step": 29970 + }, + { + "epoch": 8.97, + "grad_norm": 2.2034993171691895, + "learning_rate": 2.904658913103323e-05, + "loss": 1.2491, + "step": 29975 + }, + { + "epoch": 8.97, + "grad_norm": 7.609285354614258, + "learning_rate": 2.9040791284206493e-05, + "loss": 1.062, + "step": 29980 + }, + { + "epoch": 8.97, + "grad_norm": 3.246325731277466, + "learning_rate": 2.9034993214211048e-05, + "loss": 1.1693, + "step": 29985 + }, + { + "epoch": 8.97, + "grad_norm": 1.247209906578064, + "learning_rate": 2.902919492136712e-05, + "loss": 1.2194, + "step": 29990 + }, + { + "epoch": 8.97, + "grad_norm": 3.488403081893921, + "learning_rate": 2.9023396405994946e-05, + "loss": 1.2346, + "step": 29995 + }, + { + "epoch": 8.98, + "grad_norm": 1.7326892614364624, + "learning_rate": 2.901759766841477e-05, + "loss": 1.0722, + "step": 30000 + }, + { + "epoch": 8.98, + "grad_norm": 4.2910356521606445, + "learning_rate": 2.901179870894685e-05, + "loss": 1.0043, + "step": 30005 + }, + { + "epoch": 8.98, + "grad_norm": 2.8054089546203613, + "learning_rate": 2.900599952791146e-05, + "loss": 1.1417, + "step": 30010 + }, + { + "epoch": 8.98, + "grad_norm": 4.5647687911987305, + "learning_rate": 2.9000200125628885e-05, + "loss": 1.1773, + "step": 30015 + }, + { + "epoch": 8.98, + "grad_norm": 1.9864246845245361, + "learning_rate": 2.899440050241941e-05, + "loss": 1.212, + "step": 30020 + }, + { + "epoch": 8.98, + "grad_norm": 3.7864115238189697, + "learning_rate": 2.898860065860335e-05, + "loss": 0.9945, + "step": 30025 + }, + { + "epoch": 8.98, + "grad_norm": 2.867279529571533, + "learning_rate": 2.8982800594501014e-05, + "loss": 1.1722, + "step": 30030 + }, + { + "epoch": 8.99, + "grad_norm": 1.8707783222198486, + "learning_rate": 2.8977000310432744e-05, + "loss": 1.2435, + "step": 30035 + }, + { + "epoch": 8.99, + "grad_norm": 3.260164260864258, + "learning_rate": 2.8971199806718884e-05, + "loss": 1.2162, + "step": 30040 + }, + { + "epoch": 8.99, + "grad_norm": 3.576340436935425, + "learning_rate": 2.896539908367979e-05, + "loss": 1.0317, + "step": 30045 + }, + { + "epoch": 8.99, + "grad_norm": 2.4378347396850586, + "learning_rate": 2.8959598141635826e-05, + "loss": 1.0759, + "step": 30050 + }, + { + "epoch": 8.99, + "grad_norm": 3.9936773777008057, + "learning_rate": 2.8953796980907365e-05, + "loss": 1.1189, + "step": 30055 + }, + { + "epoch": 8.99, + "grad_norm": 2.5489132404327393, + "learning_rate": 2.894799560181481e-05, + "loss": 1.0809, + "step": 30060 + }, + { + "epoch": 9.0, + "grad_norm": 2.193693161010742, + "learning_rate": 2.894219400467856e-05, + "loss": 1.1111, + "step": 30065 + }, + { + "epoch": 9.0, + "grad_norm": 2.2259438037872314, + "learning_rate": 2.8936392189819034e-05, + "loss": 1.2037, + "step": 30070 + }, + { + "epoch": 9.0, + "grad_norm": 6.183416843414307, + "learning_rate": 2.893059015755666e-05, + "loss": 1.2801, + "step": 30075 + }, + { + "epoch": 9.0, + "grad_norm": 2.10697865486145, + "learning_rate": 2.892478790821187e-05, + "loss": 1.089, + "step": 30080 + }, + { + "epoch": 9.0, + "grad_norm": 2.224777936935425, + "learning_rate": 2.8918985442105128e-05, + "loss": 1.1623, + "step": 30085 + }, + { + "epoch": 9.0, + "grad_norm": 2.0744781494140625, + "learning_rate": 2.8913182759556894e-05, + "loss": 0.9921, + "step": 30090 + }, + { + "epoch": 9.0, + "grad_norm": 3.06063175201416, + "learning_rate": 2.8907379860887645e-05, + "loss": 1.0767, + "step": 30095 + }, + { + "epoch": 9.01, + "grad_norm": 1.988356590270996, + "learning_rate": 2.890157674641787e-05, + "loss": 1.0859, + "step": 30100 + }, + { + "epoch": 9.01, + "grad_norm": 1.3868892192840576, + "learning_rate": 2.8895773416468063e-05, + "loss": 1.118, + "step": 30105 + }, + { + "epoch": 9.01, + "grad_norm": 2.165102481842041, + "learning_rate": 2.8889969871358746e-05, + "loss": 1.0599, + "step": 30110 + }, + { + "epoch": 9.01, + "grad_norm": 1.76089346408844, + "learning_rate": 2.888416611141043e-05, + "loss": 1.1494, + "step": 30115 + }, + { + "epoch": 9.01, + "grad_norm": 3.6625218391418457, + "learning_rate": 2.887836213694366e-05, + "loss": 1.0324, + "step": 30120 + }, + { + "epoch": 9.01, + "grad_norm": 2.0290017127990723, + "learning_rate": 2.8872557948278976e-05, + "loss": 1.1136, + "step": 30125 + }, + { + "epoch": 9.01, + "grad_norm": 1.7238839864730835, + "learning_rate": 2.8866753545736946e-05, + "loss": 1.0594, + "step": 30130 + }, + { + "epoch": 9.02, + "grad_norm": 3.4980666637420654, + "learning_rate": 2.8860948929638136e-05, + "loss": 0.9758, + "step": 30135 + }, + { + "epoch": 9.02, + "grad_norm": 4.032216548919678, + "learning_rate": 2.885514410030313e-05, + "loss": 0.999, + "step": 30140 + }, + { + "epoch": 9.02, + "grad_norm": 1.803316593170166, + "learning_rate": 2.8849339058052526e-05, + "loss": 1.188, + "step": 30145 + }, + { + "epoch": 9.02, + "grad_norm": 1.5701510906219482, + "learning_rate": 2.8843533803206923e-05, + "loss": 1.0158, + "step": 30150 + }, + { + "epoch": 9.02, + "grad_norm": 1.275262713432312, + "learning_rate": 2.8837728336086946e-05, + "loss": 0.9069, + "step": 30155 + }, + { + "epoch": 9.02, + "grad_norm": 2.408705949783325, + "learning_rate": 2.8831922657013216e-05, + "loss": 1.0335, + "step": 30160 + }, + { + "epoch": 9.03, + "grad_norm": 2.678744316101074, + "learning_rate": 2.8826116766306383e-05, + "loss": 1.0944, + "step": 30165 + }, + { + "epoch": 9.03, + "grad_norm": 3.0019614696502686, + "learning_rate": 2.8820310664287096e-05, + "loss": 0.9107, + "step": 30170 + }, + { + "epoch": 9.03, + "grad_norm": 1.4615614414215088, + "learning_rate": 2.881450435127603e-05, + "loss": 1.0898, + "step": 30175 + }, + { + "epoch": 9.03, + "grad_norm": 1.9362547397613525, + "learning_rate": 2.8808697827593845e-05, + "loss": 1.1276, + "step": 30180 + }, + { + "epoch": 9.03, + "grad_norm": 2.6547698974609375, + "learning_rate": 2.880289109356124e-05, + "loss": 1.2548, + "step": 30185 + }, + { + "epoch": 9.03, + "grad_norm": 7.905858516693115, + "learning_rate": 2.8797084149498915e-05, + "loss": 0.9404, + "step": 30190 + }, + { + "epoch": 9.03, + "grad_norm": 1.2215291261672974, + "learning_rate": 2.879127699572758e-05, + "loss": 1.003, + "step": 30195 + }, + { + "epoch": 9.04, + "grad_norm": 1.8280051946640015, + "learning_rate": 2.878546963256795e-05, + "loss": 1.0391, + "step": 30200 + }, + { + "epoch": 9.04, + "grad_norm": 1.2926268577575684, + "learning_rate": 2.8779662060340778e-05, + "loss": 1.2109, + "step": 30205 + }, + { + "epoch": 9.04, + "grad_norm": 1.7577592134475708, + "learning_rate": 2.8773854279366797e-05, + "loss": 1.1695, + "step": 30210 + }, + { + "epoch": 9.04, + "grad_norm": 2.365588903427124, + "learning_rate": 2.8768046289966766e-05, + "loss": 1.1072, + "step": 30215 + }, + { + "epoch": 9.04, + "grad_norm": 2.1536929607391357, + "learning_rate": 2.8762238092461447e-05, + "loss": 1.1535, + "step": 30220 + }, + { + "epoch": 9.04, + "grad_norm": 2.988088369369507, + "learning_rate": 2.8756429687171637e-05, + "loss": 0.9861, + "step": 30225 + }, + { + "epoch": 9.04, + "grad_norm": 3.4814767837524414, + "learning_rate": 2.8750621074418115e-05, + "loss": 1.3146, + "step": 30230 + }, + { + "epoch": 9.05, + "grad_norm": 1.8867098093032837, + "learning_rate": 2.874481225452169e-05, + "loss": 1.037, + "step": 30235 + }, + { + "epoch": 9.05, + "grad_norm": 1.5579580068588257, + "learning_rate": 2.8739003227803184e-05, + "loss": 1.1022, + "step": 30240 + }, + { + "epoch": 9.05, + "grad_norm": 3.373805284500122, + "learning_rate": 2.8733193994583412e-05, + "loss": 1.1459, + "step": 30245 + }, + { + "epoch": 9.05, + "grad_norm": 2.02644419670105, + "learning_rate": 2.8727384555183217e-05, + "loss": 1.1493, + "step": 30250 + }, + { + "epoch": 9.05, + "grad_norm": 2.24518084526062, + "learning_rate": 2.8721574909923445e-05, + "loss": 1.0157, + "step": 30255 + }, + { + "epoch": 9.05, + "grad_norm": 1.2897074222564697, + "learning_rate": 2.871576505912496e-05, + "loss": 1.128, + "step": 30260 + }, + { + "epoch": 9.05, + "grad_norm": 1.510237693786621, + "learning_rate": 2.8709955003108636e-05, + "loss": 1.1292, + "step": 30265 + }, + { + "epoch": 9.06, + "grad_norm": 1.2689833641052246, + "learning_rate": 2.870414474219535e-05, + "loss": 1.0674, + "step": 30270 + }, + { + "epoch": 9.06, + "grad_norm": 1.3254342079162598, + "learning_rate": 2.8698334276705995e-05, + "loss": 1.1342, + "step": 30275 + }, + { + "epoch": 9.06, + "grad_norm": 2.8477869033813477, + "learning_rate": 2.8692523606961492e-05, + "loss": 1.1701, + "step": 30280 + }, + { + "epoch": 9.06, + "grad_norm": 1.7730213403701782, + "learning_rate": 2.868671273328274e-05, + "loss": 1.1741, + "step": 30285 + }, + { + "epoch": 9.06, + "grad_norm": 1.311159610748291, + "learning_rate": 2.8680901655990678e-05, + "loss": 1.0136, + "step": 30290 + }, + { + "epoch": 9.06, + "grad_norm": 1.5597997903823853, + "learning_rate": 2.867509037540625e-05, + "loss": 1.07, + "step": 30295 + }, + { + "epoch": 9.07, + "grad_norm": 3.055157423019409, + "learning_rate": 2.8669278891850392e-05, + "loss": 1.2124, + "step": 30300 + }, + { + "epoch": 9.07, + "grad_norm": 2.4278862476348877, + "learning_rate": 2.8663467205644086e-05, + "loss": 0.9972, + "step": 30305 + }, + { + "epoch": 9.07, + "grad_norm": 2.5763089656829834, + "learning_rate": 2.8657655317108284e-05, + "loss": 0.9838, + "step": 30310 + }, + { + "epoch": 9.07, + "grad_norm": 2.0975847244262695, + "learning_rate": 2.8651843226563983e-05, + "loss": 1.1683, + "step": 30315 + }, + { + "epoch": 9.07, + "grad_norm": 1.7405827045440674, + "learning_rate": 2.864603093433218e-05, + "loss": 1.0669, + "step": 30320 + }, + { + "epoch": 9.07, + "grad_norm": 7.420092582702637, + "learning_rate": 2.8640218440733875e-05, + "loss": 1.015, + "step": 30325 + }, + { + "epoch": 9.07, + "grad_norm": 3.352482795715332, + "learning_rate": 2.8634405746090088e-05, + "loss": 1.0018, + "step": 30330 + }, + { + "epoch": 9.08, + "grad_norm": 4.274402141571045, + "learning_rate": 2.8628592850721857e-05, + "loss": 1.1142, + "step": 30335 + }, + { + "epoch": 9.08, + "grad_norm": 1.6124602556228638, + "learning_rate": 2.862277975495021e-05, + "loss": 1.1566, + "step": 30340 + }, + { + "epoch": 9.08, + "grad_norm": 1.762075424194336, + "learning_rate": 2.8616966459096202e-05, + "loss": 1.037, + "step": 30345 + }, + { + "epoch": 9.08, + "grad_norm": 3.1598358154296875, + "learning_rate": 2.8611152963480892e-05, + "loss": 1.0827, + "step": 30350 + }, + { + "epoch": 9.08, + "grad_norm": 4.015747547149658, + "learning_rate": 2.8605339268425363e-05, + "loss": 0.9845, + "step": 30355 + }, + { + "epoch": 9.08, + "grad_norm": 6.202319145202637, + "learning_rate": 2.8599525374250684e-05, + "loss": 1.1009, + "step": 30360 + }, + { + "epoch": 9.08, + "grad_norm": 3.212571620941162, + "learning_rate": 2.859371128127797e-05, + "loss": 1.1507, + "step": 30365 + }, + { + "epoch": 9.09, + "grad_norm": 2.252204656600952, + "learning_rate": 2.8587896989828323e-05, + "loss": 1.1523, + "step": 30370 + }, + { + "epoch": 9.09, + "grad_norm": 2.1097633838653564, + "learning_rate": 2.8582082500222845e-05, + "loss": 1.0826, + "step": 30375 + }, + { + "epoch": 9.09, + "grad_norm": 2.7466039657592773, + "learning_rate": 2.8576267812782675e-05, + "loss": 1.1409, + "step": 30380 + }, + { + "epoch": 9.09, + "grad_norm": 8.023296356201172, + "learning_rate": 2.857045292782895e-05, + "loss": 0.9465, + "step": 30385 + }, + { + "epoch": 9.09, + "grad_norm": 3.5571467876434326, + "learning_rate": 2.8564637845682823e-05, + "loss": 1.1629, + "step": 30390 + }, + { + "epoch": 9.09, + "grad_norm": 3.21684193611145, + "learning_rate": 2.8558822566665454e-05, + "loss": 1.0213, + "step": 30395 + }, + { + "epoch": 9.1, + "grad_norm": 3.065678596496582, + "learning_rate": 2.8553007091098016e-05, + "loss": 0.9455, + "step": 30400 + }, + { + "epoch": 9.1, + "grad_norm": 3.9140865802764893, + "learning_rate": 2.8547191419301687e-05, + "loss": 0.974, + "step": 30405 + }, + { + "epoch": 9.1, + "grad_norm": 4.4788641929626465, + "learning_rate": 2.854137555159766e-05, + "loss": 0.9984, + "step": 30410 + }, + { + "epoch": 9.1, + "grad_norm": 1.5970557928085327, + "learning_rate": 2.8535559488307145e-05, + "loss": 1.1595, + "step": 30415 + }, + { + "epoch": 9.1, + "grad_norm": 2.536341905593872, + "learning_rate": 2.8529743229751354e-05, + "loss": 1.1607, + "step": 30420 + }, + { + "epoch": 9.1, + "grad_norm": 2.384742259979248, + "learning_rate": 2.8523926776251514e-05, + "loss": 1.0816, + "step": 30425 + }, + { + "epoch": 9.1, + "grad_norm": 4.2098388671875, + "learning_rate": 2.8518110128128863e-05, + "loss": 1.0073, + "step": 30430 + }, + { + "epoch": 9.11, + "grad_norm": 1.5726318359375, + "learning_rate": 2.851229328570465e-05, + "loss": 1.0309, + "step": 30435 + }, + { + "epoch": 9.11, + "grad_norm": 2.4862496852874756, + "learning_rate": 2.850647624930012e-05, + "loss": 1.1971, + "step": 30440 + }, + { + "epoch": 9.11, + "grad_norm": 3.5220296382904053, + "learning_rate": 2.8500659019236553e-05, + "loss": 1.0254, + "step": 30445 + }, + { + "epoch": 9.11, + "grad_norm": 5.456885814666748, + "learning_rate": 2.8494841595835226e-05, + "loss": 0.7803, + "step": 30450 + }, + { + "epoch": 9.11, + "grad_norm": 3.647815704345703, + "learning_rate": 2.8489023979417435e-05, + "loss": 1.0921, + "step": 30455 + }, + { + "epoch": 9.11, + "grad_norm": 3.8451616764068604, + "learning_rate": 2.8483206170304473e-05, + "loss": 1.048, + "step": 30460 + }, + { + "epoch": 9.11, + "grad_norm": 1.4922055006027222, + "learning_rate": 2.8477388168817664e-05, + "loss": 1.0033, + "step": 30465 + }, + { + "epoch": 9.12, + "grad_norm": 3.4989609718322754, + "learning_rate": 2.847156997527831e-05, + "loss": 1.2043, + "step": 30470 + }, + { + "epoch": 9.12, + "grad_norm": 2.6432862281799316, + "learning_rate": 2.8465751590007762e-05, + "loss": 1.0925, + "step": 30475 + }, + { + "epoch": 9.12, + "grad_norm": 2.137115478515625, + "learning_rate": 2.845993301332735e-05, + "loss": 1.1623, + "step": 30480 + }, + { + "epoch": 9.12, + "grad_norm": 3.7192418575286865, + "learning_rate": 2.845411424555844e-05, + "loss": 1.0968, + "step": 30485 + }, + { + "epoch": 9.12, + "grad_norm": 2.331989049911499, + "learning_rate": 2.8448295287022386e-05, + "loss": 1.066, + "step": 30490 + }, + { + "epoch": 9.12, + "grad_norm": 2.4555537700653076, + "learning_rate": 2.8442476138040568e-05, + "loss": 1.0104, + "step": 30495 + }, + { + "epoch": 9.13, + "grad_norm": 4.285811901092529, + "learning_rate": 2.8436656798934376e-05, + "loss": 1.0691, + "step": 30500 + }, + { + "epoch": 9.13, + "grad_norm": 1.4364317655563354, + "learning_rate": 2.8430837270025196e-05, + "loss": 1.1077, + "step": 30505 + }, + { + "epoch": 9.13, + "grad_norm": 1.1822254657745361, + "learning_rate": 2.842501755163444e-05, + "loss": 1.0285, + "step": 30510 + }, + { + "epoch": 9.13, + "grad_norm": 7.019848823547363, + "learning_rate": 2.8419197644083527e-05, + "loss": 1.1324, + "step": 30515 + }, + { + "epoch": 9.13, + "grad_norm": 1.9157739877700806, + "learning_rate": 2.841337754769388e-05, + "loss": 1.1563, + "step": 30520 + }, + { + "epoch": 9.13, + "grad_norm": 2.709386110305786, + "learning_rate": 2.8407557262786945e-05, + "loss": 1.1656, + "step": 30525 + }, + { + "epoch": 9.13, + "grad_norm": 4.894626140594482, + "learning_rate": 2.8401736789684153e-05, + "loss": 1.1094, + "step": 30530 + }, + { + "epoch": 9.14, + "grad_norm": 2.685741662979126, + "learning_rate": 2.839591612870698e-05, + "loss": 0.9865, + "step": 30535 + }, + { + "epoch": 9.14, + "grad_norm": 2.1641218662261963, + "learning_rate": 2.8390095280176894e-05, + "loss": 1.1466, + "step": 30540 + }, + { + "epoch": 9.14, + "grad_norm": 3.6387031078338623, + "learning_rate": 2.838427424441536e-05, + "loss": 1.0634, + "step": 30545 + }, + { + "epoch": 9.14, + "grad_norm": 2.7952628135681152, + "learning_rate": 2.8378453021743882e-05, + "loss": 1.104, + "step": 30550 + }, + { + "epoch": 9.14, + "grad_norm": 2.7595205307006836, + "learning_rate": 2.837263161248396e-05, + "loss": 1.1497, + "step": 30555 + }, + { + "epoch": 9.14, + "grad_norm": 2.8632593154907227, + "learning_rate": 2.8366810016957096e-05, + "loss": 0.8454, + "step": 30560 + }, + { + "epoch": 9.14, + "grad_norm": 2.603904962539673, + "learning_rate": 2.8360988235484814e-05, + "loss": 1.0677, + "step": 30565 + }, + { + "epoch": 9.15, + "grad_norm": 3.4272186756134033, + "learning_rate": 2.8355166268388643e-05, + "loss": 1.0451, + "step": 30570 + }, + { + "epoch": 9.15, + "grad_norm": 2.422112226486206, + "learning_rate": 2.8349344115990127e-05, + "loss": 1.1719, + "step": 30575 + }, + { + "epoch": 9.15, + "grad_norm": 4.006277561187744, + "learning_rate": 2.834352177861082e-05, + "loss": 1.0388, + "step": 30580 + }, + { + "epoch": 9.15, + "grad_norm": 3.0967395305633545, + "learning_rate": 2.8337699256572282e-05, + "loss": 0.8718, + "step": 30585 + }, + { + "epoch": 9.15, + "grad_norm": 3.621616840362549, + "learning_rate": 2.833187655019608e-05, + "loss": 1.2271, + "step": 30590 + }, + { + "epoch": 9.15, + "grad_norm": 1.8164085149765015, + "learning_rate": 2.8326053659803803e-05, + "loss": 1.0748, + "step": 30595 + }, + { + "epoch": 9.16, + "grad_norm": 5.264411926269531, + "learning_rate": 2.8320230585717034e-05, + "loss": 1.0605, + "step": 30600 + }, + { + "epoch": 9.16, + "grad_norm": 2.898695707321167, + "learning_rate": 2.831440732825738e-05, + "loss": 1.0845, + "step": 30605 + }, + { + "epoch": 9.16, + "grad_norm": 1.863917350769043, + "learning_rate": 2.830858388774646e-05, + "loss": 1.1017, + "step": 30610 + }, + { + "epoch": 9.16, + "grad_norm": 2.258936643600464, + "learning_rate": 2.830276026450589e-05, + "loss": 1.2552, + "step": 30615 + }, + { + "epoch": 9.16, + "grad_norm": 1.4957672357559204, + "learning_rate": 2.82969364588573e-05, + "loss": 1.1517, + "step": 30620 + }, + { + "epoch": 9.16, + "grad_norm": 4.283958911895752, + "learning_rate": 2.8291112471122338e-05, + "loss": 1.2211, + "step": 30625 + }, + { + "epoch": 9.16, + "grad_norm": 2.875387191772461, + "learning_rate": 2.8285288301622658e-05, + "loss": 1.1582, + "step": 30630 + }, + { + "epoch": 9.17, + "grad_norm": 3.575859546661377, + "learning_rate": 2.827946395067991e-05, + "loss": 1.2838, + "step": 30635 + }, + { + "epoch": 9.17, + "grad_norm": 3.4343674182891846, + "learning_rate": 2.8273639418615788e-05, + "loss": 1.0276, + "step": 30640 + }, + { + "epoch": 9.17, + "grad_norm": 2.2512946128845215, + "learning_rate": 2.826781470575196e-05, + "loss": 0.8439, + "step": 30645 + }, + { + "epoch": 9.17, + "grad_norm": 3.449051856994629, + "learning_rate": 2.826198981241012e-05, + "loss": 1.1702, + "step": 30650 + }, + { + "epoch": 9.17, + "grad_norm": 3.092623233795166, + "learning_rate": 2.8256164738911977e-05, + "loss": 1.0548, + "step": 30655 + }, + { + "epoch": 9.17, + "grad_norm": 2.316596031188965, + "learning_rate": 2.8250339485579248e-05, + "loss": 1.1654, + "step": 30660 + }, + { + "epoch": 9.17, + "grad_norm": 5.095229148864746, + "learning_rate": 2.824451405273364e-05, + "loss": 1.1143, + "step": 30665 + }, + { + "epoch": 9.18, + "grad_norm": 2.3152172565460205, + "learning_rate": 2.8238688440696885e-05, + "loss": 1.0509, + "step": 30670 + }, + { + "epoch": 9.18, + "grad_norm": 2.434166669845581, + "learning_rate": 2.8232862649790742e-05, + "loss": 0.9805, + "step": 30675 + }, + { + "epoch": 9.18, + "grad_norm": 2.4533190727233887, + "learning_rate": 2.8227036680336956e-05, + "loss": 0.964, + "step": 30680 + }, + { + "epoch": 9.18, + "grad_norm": 3.624160051345825, + "learning_rate": 2.8221210532657283e-05, + "loss": 1.0408, + "step": 30685 + }, + { + "epoch": 9.18, + "grad_norm": 4.148279666900635, + "learning_rate": 2.8215384207073515e-05, + "loss": 1.1267, + "step": 30690 + }, + { + "epoch": 9.18, + "grad_norm": 3.3725688457489014, + "learning_rate": 2.820955770390741e-05, + "loss": 1.0548, + "step": 30695 + }, + { + "epoch": 9.19, + "grad_norm": 1.4353331327438354, + "learning_rate": 2.820373102348076e-05, + "loss": 0.8967, + "step": 30700 + }, + { + "epoch": 9.19, + "grad_norm": 1.5979543924331665, + "learning_rate": 2.8197904166115386e-05, + "loss": 1.2111, + "step": 30705 + }, + { + "epoch": 9.19, + "grad_norm": 4.696241855621338, + "learning_rate": 2.819207713213309e-05, + "loss": 1.0566, + "step": 30710 + }, + { + "epoch": 9.19, + "grad_norm": 2.242954730987549, + "learning_rate": 2.8186249921855683e-05, + "loss": 1.1283, + "step": 30715 + }, + { + "epoch": 9.19, + "grad_norm": 1.5376769304275513, + "learning_rate": 2.818042253560501e-05, + "loss": 1.1125, + "step": 30720 + }, + { + "epoch": 9.19, + "grad_norm": 3.4272782802581787, + "learning_rate": 2.8174594973702906e-05, + "loss": 1.1207, + "step": 30725 + }, + { + "epoch": 9.19, + "grad_norm": 1.447213053703308, + "learning_rate": 2.8168767236471223e-05, + "loss": 1.2412, + "step": 30730 + }, + { + "epoch": 9.2, + "grad_norm": 1.9952369928359985, + "learning_rate": 2.816293932423182e-05, + "loss": 1.0659, + "step": 30735 + }, + { + "epoch": 9.2, + "grad_norm": 3.7000386714935303, + "learning_rate": 2.8157111237306562e-05, + "loss": 1.0848, + "step": 30740 + }, + { + "epoch": 9.2, + "grad_norm": 0.9547794461250305, + "learning_rate": 2.8151282976017336e-05, + "loss": 1.0252, + "step": 30745 + }, + { + "epoch": 9.2, + "grad_norm": 2.513737440109253, + "learning_rate": 2.8145454540686024e-05, + "loss": 0.9642, + "step": 30750 + }, + { + "epoch": 9.2, + "grad_norm": 5.190265655517578, + "learning_rate": 2.813962593163453e-05, + "loss": 1.0277, + "step": 30755 + }, + { + "epoch": 9.2, + "grad_norm": 2.4562807083129883, + "learning_rate": 2.8133797149184755e-05, + "loss": 1.1477, + "step": 30760 + }, + { + "epoch": 9.2, + "grad_norm": 1.605533242225647, + "learning_rate": 2.812796819365862e-05, + "loss": 1.0803, + "step": 30765 + }, + { + "epoch": 9.21, + "grad_norm": 1.3287237882614136, + "learning_rate": 2.812213906537806e-05, + "loss": 1.1856, + "step": 30770 + }, + { + "epoch": 9.21, + "grad_norm": 2.1841320991516113, + "learning_rate": 2.8116309764665e-05, + "loss": 1.272, + "step": 30775 + }, + { + "epoch": 9.21, + "grad_norm": 2.0276877880096436, + "learning_rate": 2.811048029184139e-05, + "loss": 1.0535, + "step": 30780 + }, + { + "epoch": 9.21, + "grad_norm": 2.9414737224578857, + "learning_rate": 2.810465064722919e-05, + "loss": 1.0341, + "step": 30785 + }, + { + "epoch": 9.21, + "grad_norm": 1.2586784362792969, + "learning_rate": 2.8098820831150362e-05, + "loss": 1.1424, + "step": 30790 + }, + { + "epoch": 9.21, + "grad_norm": 4.782675743103027, + "learning_rate": 2.809299084392688e-05, + "loss": 1.0249, + "step": 30795 + }, + { + "epoch": 9.22, + "grad_norm": 1.6296465396881104, + "learning_rate": 2.8087160685880726e-05, + "loss": 1.0696, + "step": 30800 + }, + { + "epoch": 9.22, + "grad_norm": 1.8938437700271606, + "learning_rate": 2.808133035733389e-05, + "loss": 1.0324, + "step": 30805 + }, + { + "epoch": 9.22, + "grad_norm": 3.3237104415893555, + "learning_rate": 2.807549985860839e-05, + "loss": 1.0725, + "step": 30810 + }, + { + "epoch": 9.22, + "grad_norm": 1.7440217733383179, + "learning_rate": 2.8069669190026233e-05, + "loss": 1.0946, + "step": 30815 + }, + { + "epoch": 9.22, + "grad_norm": 1.855573058128357, + "learning_rate": 2.8063838351909434e-05, + "loss": 1.1026, + "step": 30820 + }, + { + "epoch": 9.22, + "grad_norm": 1.5629770755767822, + "learning_rate": 2.8058007344580027e-05, + "loss": 1.3132, + "step": 30825 + }, + { + "epoch": 9.22, + "grad_norm": 4.092102527618408, + "learning_rate": 2.805217616836005e-05, + "loss": 1.1355, + "step": 30830 + }, + { + "epoch": 9.23, + "grad_norm": 2.19754695892334, + "learning_rate": 2.8046344823571557e-05, + "loss": 1.0562, + "step": 30835 + }, + { + "epoch": 9.23, + "grad_norm": 2.126832962036133, + "learning_rate": 2.804051331053661e-05, + "loss": 1.169, + "step": 30840 + }, + { + "epoch": 9.23, + "grad_norm": 3.9059386253356934, + "learning_rate": 2.803468162957727e-05, + "loss": 1.0877, + "step": 30845 + }, + { + "epoch": 9.23, + "grad_norm": 3.600858449935913, + "learning_rate": 2.8028849781015615e-05, + "loss": 1.0115, + "step": 30850 + }, + { + "epoch": 9.23, + "grad_norm": 3.020725965499878, + "learning_rate": 2.802301776517374e-05, + "loss": 0.9752, + "step": 30855 + }, + { + "epoch": 9.23, + "grad_norm": 2.0996792316436768, + "learning_rate": 2.801718558237374e-05, + "loss": 1.253, + "step": 30860 + }, + { + "epoch": 9.23, + "grad_norm": 1.7010042667388916, + "learning_rate": 2.801135323293771e-05, + "loss": 1.0858, + "step": 30865 + }, + { + "epoch": 9.24, + "grad_norm": 2.4270615577697754, + "learning_rate": 2.8005520717187773e-05, + "loss": 0.7829, + "step": 30870 + }, + { + "epoch": 9.24, + "grad_norm": 3.7250173091888428, + "learning_rate": 2.7999688035446048e-05, + "loss": 1.1342, + "step": 30875 + }, + { + "epoch": 9.24, + "grad_norm": 2.1018354892730713, + "learning_rate": 2.7993855188034674e-05, + "loss": 1.086, + "step": 30880 + }, + { + "epoch": 9.24, + "grad_norm": 1.520485758781433, + "learning_rate": 2.7988022175275786e-05, + "loss": 1.0099, + "step": 30885 + }, + { + "epoch": 9.24, + "grad_norm": 2.133230447769165, + "learning_rate": 2.7982188997491544e-05, + "loss": 1.1996, + "step": 30890 + }, + { + "epoch": 9.24, + "grad_norm": 3.0092363357543945, + "learning_rate": 2.7976355655004096e-05, + "loss": 1.1812, + "step": 30895 + }, + { + "epoch": 9.24, + "grad_norm": 4.08493185043335, + "learning_rate": 2.7970522148135626e-05, + "loss": 1.1021, + "step": 30900 + }, + { + "epoch": 9.25, + "grad_norm": 2.9970130920410156, + "learning_rate": 2.79646884772083e-05, + "loss": 1.0735, + "step": 30905 + }, + { + "epoch": 9.25, + "grad_norm": 3.3271689414978027, + "learning_rate": 2.795885464254431e-05, + "loss": 1.053, + "step": 30910 + }, + { + "epoch": 9.25, + "grad_norm": 1.4670406579971313, + "learning_rate": 2.7953020644465865e-05, + "loss": 1.0696, + "step": 30915 + }, + { + "epoch": 9.25, + "grad_norm": 5.897749900817871, + "learning_rate": 2.7947186483295157e-05, + "loss": 1.0656, + "step": 30920 + }, + { + "epoch": 9.25, + "grad_norm": 2.2055165767669678, + "learning_rate": 2.7941352159354394e-05, + "loss": 0.9848, + "step": 30925 + }, + { + "epoch": 9.25, + "grad_norm": 2.664703845977783, + "learning_rate": 2.7935517672965817e-05, + "loss": 1.071, + "step": 30930 + }, + { + "epoch": 9.26, + "grad_norm": 5.1785454750061035, + "learning_rate": 2.792968302445164e-05, + "loss": 1.1283, + "step": 30935 + }, + { + "epoch": 9.26, + "grad_norm": 4.427746772766113, + "learning_rate": 2.7923848214134123e-05, + "loss": 1.0444, + "step": 30940 + }, + { + "epoch": 9.26, + "grad_norm": 2.530972957611084, + "learning_rate": 2.7918013242335504e-05, + "loss": 1.0145, + "step": 30945 + }, + { + "epoch": 9.26, + "grad_norm": 16.054180145263672, + "learning_rate": 2.7912178109378056e-05, + "loss": 1.2892, + "step": 30950 + }, + { + "epoch": 9.26, + "grad_norm": 2.498807430267334, + "learning_rate": 2.790634281558403e-05, + "loss": 0.9976, + "step": 30955 + }, + { + "epoch": 9.26, + "grad_norm": 1.9145476818084717, + "learning_rate": 2.7900507361275714e-05, + "loss": 0.938, + "step": 30960 + }, + { + "epoch": 9.26, + "grad_norm": 1.1175459623336792, + "learning_rate": 2.7894671746775386e-05, + "loss": 1.1884, + "step": 30965 + }, + { + "epoch": 9.27, + "grad_norm": 4.308173656463623, + "learning_rate": 2.7888835972405352e-05, + "loss": 1.1149, + "step": 30970 + }, + { + "epoch": 9.27, + "grad_norm": 2.5797321796417236, + "learning_rate": 2.7883000038487904e-05, + "loss": 1.0525, + "step": 30975 + }, + { + "epoch": 9.27, + "grad_norm": 2.3308804035186768, + "learning_rate": 2.7877163945345368e-05, + "loss": 1.168, + "step": 30980 + }, + { + "epoch": 9.27, + "grad_norm": 2.186554193496704, + "learning_rate": 2.7871327693300054e-05, + "loss": 1.0804, + "step": 30985 + }, + { + "epoch": 9.27, + "grad_norm": 7.420773029327393, + "learning_rate": 2.7865491282674293e-05, + "loss": 1.021, + "step": 30990 + }, + { + "epoch": 9.27, + "grad_norm": 3.671011447906494, + "learning_rate": 2.7859654713790434e-05, + "loss": 1.0881, + "step": 30995 + }, + { + "epoch": 9.27, + "grad_norm": 1.4102942943572998, + "learning_rate": 2.7853817986970814e-05, + "loss": 1.1006, + "step": 31000 + }, + { + "epoch": 9.28, + "grad_norm": 4.502505779266357, + "learning_rate": 2.784798110253779e-05, + "loss": 1.0186, + "step": 31005 + }, + { + "epoch": 9.28, + "grad_norm": 2.1751863956451416, + "learning_rate": 2.7842144060813736e-05, + "loss": 0.9848, + "step": 31010 + }, + { + "epoch": 9.28, + "grad_norm": 2.5720980167388916, + "learning_rate": 2.783630686212102e-05, + "loss": 1.0289, + "step": 31015 + }, + { + "epoch": 9.28, + "grad_norm": 1.7621455192565918, + "learning_rate": 2.783046950678202e-05, + "loss": 0.89, + "step": 31020 + }, + { + "epoch": 9.28, + "grad_norm": 1.859261155128479, + "learning_rate": 2.7824631995119134e-05, + "loss": 1.1063, + "step": 31025 + }, + { + "epoch": 9.28, + "grad_norm": 2.331822156906128, + "learning_rate": 2.7818794327454757e-05, + "loss": 1.0846, + "step": 31030 + }, + { + "epoch": 9.29, + "grad_norm": 5.336060523986816, + "learning_rate": 2.78129565041113e-05, + "loss": 0.9446, + "step": 31035 + }, + { + "epoch": 9.29, + "grad_norm": 1.7737152576446533, + "learning_rate": 2.7807118525411176e-05, + "loss": 1.0767, + "step": 31040 + }, + { + "epoch": 9.29, + "grad_norm": 10.145687103271484, + "learning_rate": 2.780128039167682e-05, + "loss": 1.118, + "step": 31045 + }, + { + "epoch": 9.29, + "grad_norm": 2.768529176712036, + "learning_rate": 2.7795442103230656e-05, + "loss": 1.0003, + "step": 31050 + }, + { + "epoch": 9.29, + "grad_norm": 2.1712610721588135, + "learning_rate": 2.778960366039513e-05, + "loss": 1.0129, + "step": 31055 + }, + { + "epoch": 9.29, + "grad_norm": 2.940528154373169, + "learning_rate": 2.7783765063492696e-05, + "loss": 1.077, + "step": 31060 + }, + { + "epoch": 9.29, + "grad_norm": 2.5205068588256836, + "learning_rate": 2.777792631284581e-05, + "loss": 1.2037, + "step": 31065 + }, + { + "epoch": 9.3, + "grad_norm": 1.8787987232208252, + "learning_rate": 2.7772087408776937e-05, + "loss": 1.1164, + "step": 31070 + }, + { + "epoch": 9.3, + "grad_norm": 1.8113431930541992, + "learning_rate": 2.776624835160856e-05, + "loss": 1.1057, + "step": 31075 + }, + { + "epoch": 9.3, + "grad_norm": 1.184775710105896, + "learning_rate": 2.7760409141663164e-05, + "loss": 1.1547, + "step": 31080 + }, + { + "epoch": 9.3, + "grad_norm": 2.488152265548706, + "learning_rate": 2.775456977926324e-05, + "loss": 1.1285, + "step": 31085 + }, + { + "epoch": 9.3, + "grad_norm": 1.7316298484802246, + "learning_rate": 2.7748730264731287e-05, + "loss": 1.0315, + "step": 31090 + }, + { + "epoch": 9.3, + "grad_norm": 1.8663023710250854, + "learning_rate": 2.774289059838982e-05, + "loss": 1.0133, + "step": 31095 + }, + { + "epoch": 9.3, + "grad_norm": 1.8005400896072388, + "learning_rate": 2.7737050780561358e-05, + "loss": 1.0123, + "step": 31100 + }, + { + "epoch": 9.31, + "grad_norm": 2.0711119174957275, + "learning_rate": 2.7731210811568427e-05, + "loss": 1.0181, + "step": 31105 + }, + { + "epoch": 9.31, + "grad_norm": 4.488975524902344, + "learning_rate": 2.7725370691733565e-05, + "loss": 0.9204, + "step": 31110 + }, + { + "epoch": 9.31, + "grad_norm": 1.9340286254882812, + "learning_rate": 2.7719530421379308e-05, + "loss": 0.9195, + "step": 31115 + }, + { + "epoch": 9.31, + "grad_norm": 1.3298678398132324, + "learning_rate": 2.7713690000828213e-05, + "loss": 1.09, + "step": 31120 + }, + { + "epoch": 9.31, + "grad_norm": 1.4798712730407715, + "learning_rate": 2.7707849430402838e-05, + "loss": 1.0828, + "step": 31125 + }, + { + "epoch": 9.31, + "grad_norm": 2.92453670501709, + "learning_rate": 2.770200871042576e-05, + "loss": 1.1759, + "step": 31130 + }, + { + "epoch": 9.32, + "grad_norm": 2.3763322830200195, + "learning_rate": 2.769616784121954e-05, + "loss": 1.04, + "step": 31135 + }, + { + "epoch": 9.32, + "grad_norm": 1.7251625061035156, + "learning_rate": 2.7690326823106776e-05, + "loss": 1.0022, + "step": 31140 + }, + { + "epoch": 9.32, + "grad_norm": 2.365755796432495, + "learning_rate": 2.768448565641007e-05, + "loss": 1.0852, + "step": 31145 + }, + { + "epoch": 9.32, + "grad_norm": 2.628757953643799, + "learning_rate": 2.7678644341451998e-05, + "loss": 1.0231, + "step": 31150 + }, + { + "epoch": 9.32, + "grad_norm": 2.307605743408203, + "learning_rate": 2.7672802878555186e-05, + "loss": 1.1832, + "step": 31155 + }, + { + "epoch": 9.32, + "grad_norm": 4.4019951820373535, + "learning_rate": 2.7666961268042253e-05, + "loss": 1.0704, + "step": 31160 + }, + { + "epoch": 9.32, + "grad_norm": 1.9276536703109741, + "learning_rate": 2.7661119510235816e-05, + "loss": 1.1429, + "step": 31165 + }, + { + "epoch": 9.33, + "grad_norm": 1.0725014209747314, + "learning_rate": 2.7655277605458507e-05, + "loss": 0.9747, + "step": 31170 + }, + { + "epoch": 9.33, + "grad_norm": 2.2577459812164307, + "learning_rate": 2.7649435554032994e-05, + "loss": 1.1215, + "step": 31175 + }, + { + "epoch": 9.33, + "grad_norm": 1.3800833225250244, + "learning_rate": 2.76435933562819e-05, + "loss": 1.2958, + "step": 31180 + }, + { + "epoch": 9.33, + "grad_norm": 1.0564745664596558, + "learning_rate": 2.763775101252789e-05, + "loss": 1.0229, + "step": 31185 + }, + { + "epoch": 9.33, + "grad_norm": 1.7594860792160034, + "learning_rate": 2.763190852309364e-05, + "loss": 1.041, + "step": 31190 + }, + { + "epoch": 9.33, + "grad_norm": 1.3283685445785522, + "learning_rate": 2.7626065888301816e-05, + "loss": 1.1612, + "step": 31195 + }, + { + "epoch": 9.33, + "grad_norm": 2.163010597229004, + "learning_rate": 2.76202231084751e-05, + "loss": 1.0172, + "step": 31200 + }, + { + "epoch": 9.34, + "grad_norm": 0.8803406953811646, + "learning_rate": 2.761438018393619e-05, + "loss": 1.0381, + "step": 31205 + }, + { + "epoch": 9.34, + "grad_norm": 2.269392728805542, + "learning_rate": 2.7608537115007775e-05, + "loss": 1.0224, + "step": 31210 + }, + { + "epoch": 9.34, + "grad_norm": 3.6767170429229736, + "learning_rate": 2.7602693902012572e-05, + "loss": 1.1205, + "step": 31215 + }, + { + "epoch": 9.34, + "grad_norm": 2.087529182434082, + "learning_rate": 2.7596850545273286e-05, + "loss": 1.0549, + "step": 31220 + }, + { + "epoch": 9.34, + "grad_norm": 2.73809552192688, + "learning_rate": 2.7591007045112642e-05, + "loss": 1.0792, + "step": 31225 + }, + { + "epoch": 9.34, + "grad_norm": 4.561300277709961, + "learning_rate": 2.7585163401853376e-05, + "loss": 1.1238, + "step": 31230 + }, + { + "epoch": 9.35, + "grad_norm": 3.266984462738037, + "learning_rate": 2.7579319615818215e-05, + "loss": 1.0098, + "step": 31235 + }, + { + "epoch": 9.35, + "grad_norm": 2.625706434249878, + "learning_rate": 2.757347568732993e-05, + "loss": 0.9742, + "step": 31240 + }, + { + "epoch": 9.35, + "grad_norm": 1.4844180345535278, + "learning_rate": 2.7567631616711243e-05, + "loss": 1.1356, + "step": 31245 + }, + { + "epoch": 9.35, + "grad_norm": 1.707905888557434, + "learning_rate": 2.7561787404284928e-05, + "loss": 0.9465, + "step": 31250 + }, + { + "epoch": 9.35, + "grad_norm": 1.6472175121307373, + "learning_rate": 2.755594305037376e-05, + "loss": 1.1298, + "step": 31255 + }, + { + "epoch": 9.35, + "grad_norm": 4.289102554321289, + "learning_rate": 2.755009855530052e-05, + "loss": 1.0665, + "step": 31260 + }, + { + "epoch": 9.35, + "grad_norm": 1.6025971174240112, + "learning_rate": 2.7544253919387987e-05, + "loss": 0.968, + "step": 31265 + }, + { + "epoch": 9.36, + "grad_norm": 3.026954412460327, + "learning_rate": 2.7538409142958953e-05, + "loss": 0.775, + "step": 31270 + }, + { + "epoch": 9.36, + "grad_norm": 3.0979301929473877, + "learning_rate": 2.7532564226336222e-05, + "loss": 1.1642, + "step": 31275 + }, + { + "epoch": 9.36, + "grad_norm": 3.6659834384918213, + "learning_rate": 2.7526719169842602e-05, + "loss": 1.1043, + "step": 31280 + }, + { + "epoch": 9.36, + "grad_norm": 2.309248208999634, + "learning_rate": 2.7520873973800903e-05, + "loss": 1.0931, + "step": 31285 + }, + { + "epoch": 9.36, + "grad_norm": 2.78249192237854, + "learning_rate": 2.7515028638533958e-05, + "loss": 1.0622, + "step": 31290 + }, + { + "epoch": 9.36, + "grad_norm": 1.3780916929244995, + "learning_rate": 2.7509183164364593e-05, + "loss": 1.0036, + "step": 31295 + }, + { + "epoch": 9.36, + "grad_norm": 2.205402135848999, + "learning_rate": 2.7503337551615654e-05, + "loss": 1.3497, + "step": 31300 + }, + { + "epoch": 9.37, + "grad_norm": 3.5160036087036133, + "learning_rate": 2.7497491800609986e-05, + "loss": 1.2502, + "step": 31305 + }, + { + "epoch": 9.37, + "grad_norm": 1.4284930229187012, + "learning_rate": 2.7491645911670437e-05, + "loss": 0.9721, + "step": 31310 + }, + { + "epoch": 9.37, + "grad_norm": 5.789677143096924, + "learning_rate": 2.748579988511988e-05, + "loss": 1.1206, + "step": 31315 + }, + { + "epoch": 9.37, + "grad_norm": 3.2626028060913086, + "learning_rate": 2.747995372128117e-05, + "loss": 1.0979, + "step": 31320 + }, + { + "epoch": 9.37, + "grad_norm": 3.9523472785949707, + "learning_rate": 2.7474107420477195e-05, + "loss": 1.0172, + "step": 31325 + }, + { + "epoch": 9.37, + "grad_norm": 3.8621017932891846, + "learning_rate": 2.7468260983030846e-05, + "loss": 1.1448, + "step": 31330 + }, + { + "epoch": 9.38, + "grad_norm": 2.3181259632110596, + "learning_rate": 2.7462414409265003e-05, + "loss": 1.0566, + "step": 31335 + }, + { + "epoch": 9.38, + "grad_norm": 1.018276572227478, + "learning_rate": 2.7456567699502573e-05, + "loss": 1.1421, + "step": 31340 + }, + { + "epoch": 9.38, + "grad_norm": 2.900003671646118, + "learning_rate": 2.7450720854066464e-05, + "loss": 1.0946, + "step": 31345 + }, + { + "epoch": 9.38, + "grad_norm": 1.6421140432357788, + "learning_rate": 2.7444873873279586e-05, + "loss": 1.1454, + "step": 31350 + }, + { + "epoch": 9.38, + "grad_norm": 3.2035090923309326, + "learning_rate": 2.743902675746487e-05, + "loss": 1.1674, + "step": 31355 + }, + { + "epoch": 9.38, + "grad_norm": 2.1225266456604004, + "learning_rate": 2.743317950694524e-05, + "loss": 1.0314, + "step": 31360 + }, + { + "epoch": 9.38, + "grad_norm": 1.5846216678619385, + "learning_rate": 2.742733212204363e-05, + "loss": 0.9834, + "step": 31365 + }, + { + "epoch": 9.39, + "grad_norm": 2.3945157527923584, + "learning_rate": 2.7421484603083004e-05, + "loss": 1.1503, + "step": 31370 + }, + { + "epoch": 9.39, + "grad_norm": 1.944319725036621, + "learning_rate": 2.7415636950386285e-05, + "loss": 1.1397, + "step": 31375 + }, + { + "epoch": 9.39, + "grad_norm": 3.456681728363037, + "learning_rate": 2.7409789164276456e-05, + "loss": 1.0885, + "step": 31380 + }, + { + "epoch": 9.39, + "grad_norm": 2.284849166870117, + "learning_rate": 2.740394124507647e-05, + "loss": 1.0706, + "step": 31385 + }, + { + "epoch": 9.39, + "grad_norm": 3.1083555221557617, + "learning_rate": 2.7398093193109314e-05, + "loss": 1.2338, + "step": 31390 + }, + { + "epoch": 9.39, + "grad_norm": 2.024843454360962, + "learning_rate": 2.739224500869796e-05, + "loss": 1.2683, + "step": 31395 + }, + { + "epoch": 9.39, + "grad_norm": 3.8661715984344482, + "learning_rate": 2.7386396692165406e-05, + "loss": 1.213, + "step": 31400 + }, + { + "epoch": 9.4, + "grad_norm": 2.7120206356048584, + "learning_rate": 2.738054824383464e-05, + "loss": 1.0801, + "step": 31405 + }, + { + "epoch": 9.4, + "grad_norm": 1.732399582862854, + "learning_rate": 2.7374699664028668e-05, + "loss": 1.2241, + "step": 31410 + }, + { + "epoch": 9.4, + "grad_norm": 2.247511386871338, + "learning_rate": 2.7368850953070503e-05, + "loss": 1.0742, + "step": 31415 + }, + { + "epoch": 9.4, + "grad_norm": 1.6583189964294434, + "learning_rate": 2.736300211128316e-05, + "loss": 1.0932, + "step": 31420 + }, + { + "epoch": 9.4, + "grad_norm": 3.329725742340088, + "learning_rate": 2.7357153138989668e-05, + "loss": 1.0847, + "step": 31425 + }, + { + "epoch": 9.4, + "grad_norm": 4.0160956382751465, + "learning_rate": 2.735130403651306e-05, + "loss": 1.026, + "step": 31430 + }, + { + "epoch": 9.4, + "grad_norm": 4.307696342468262, + "learning_rate": 2.734545480417637e-05, + "loss": 0.9154, + "step": 31435 + }, + { + "epoch": 9.41, + "grad_norm": 1.9788671731948853, + "learning_rate": 2.733960544230265e-05, + "loss": 1.0141, + "step": 31440 + }, + { + "epoch": 9.41, + "grad_norm": 10.733818054199219, + "learning_rate": 2.7333755951214957e-05, + "loss": 0.9602, + "step": 31445 + }, + { + "epoch": 9.41, + "grad_norm": 4.034037113189697, + "learning_rate": 2.732790633123634e-05, + "loss": 0.9853, + "step": 31450 + }, + { + "epoch": 9.41, + "grad_norm": 4.451757431030273, + "learning_rate": 2.7322056582689885e-05, + "loss": 1.1026, + "step": 31455 + }, + { + "epoch": 9.41, + "grad_norm": 5.899003028869629, + "learning_rate": 2.7316206705898655e-05, + "loss": 1.2884, + "step": 31460 + }, + { + "epoch": 9.41, + "grad_norm": 2.728759765625, + "learning_rate": 2.7310356701185747e-05, + "loss": 1.0355, + "step": 31465 + }, + { + "epoch": 9.42, + "grad_norm": 1.9716694355010986, + "learning_rate": 2.7304506568874228e-05, + "loss": 1.1145, + "step": 31470 + }, + { + "epoch": 9.42, + "grad_norm": 2.966343641281128, + "learning_rate": 2.7298656309287206e-05, + "loss": 1.2393, + "step": 31475 + }, + { + "epoch": 9.42, + "grad_norm": 3.9974021911621094, + "learning_rate": 2.7292805922747787e-05, + "loss": 1.08, + "step": 31480 + }, + { + "epoch": 9.42, + "grad_norm": 2.1800546646118164, + "learning_rate": 2.7286955409579084e-05, + "loss": 1.0767, + "step": 31485 + }, + { + "epoch": 9.42, + "grad_norm": 7.327594757080078, + "learning_rate": 2.7281104770104205e-05, + "loss": 1.0415, + "step": 31490 + }, + { + "epoch": 9.42, + "grad_norm": 1.3344082832336426, + "learning_rate": 2.7275254004646284e-05, + "loss": 1.059, + "step": 31495 + }, + { + "epoch": 9.42, + "grad_norm": 3.478764772415161, + "learning_rate": 2.726940311352845e-05, + "loss": 1.1981, + "step": 31500 + }, + { + "epoch": 9.43, + "grad_norm": 0.9754401445388794, + "learning_rate": 2.726355209707384e-05, + "loss": 1.0865, + "step": 31505 + }, + { + "epoch": 9.43, + "grad_norm": 2.373671531677246, + "learning_rate": 2.72577009556056e-05, + "loss": 0.9644, + "step": 31510 + }, + { + "epoch": 9.43, + "grad_norm": 2.6133816242218018, + "learning_rate": 2.7251849689446885e-05, + "loss": 1.0806, + "step": 31515 + }, + { + "epoch": 9.43, + "grad_norm": 3.878574848175049, + "learning_rate": 2.724599829892085e-05, + "loss": 1.1225, + "step": 31520 + }, + { + "epoch": 9.43, + "grad_norm": 3.734332799911499, + "learning_rate": 2.724014678435067e-05, + "loss": 1.121, + "step": 31525 + }, + { + "epoch": 9.43, + "grad_norm": 8.514619827270508, + "learning_rate": 2.7234295146059503e-05, + "loss": 1.0401, + "step": 31530 + }, + { + "epoch": 9.43, + "grad_norm": 1.05905282497406, + "learning_rate": 2.722844338437054e-05, + "loss": 0.9848, + "step": 31535 + }, + { + "epoch": 9.44, + "grad_norm": 1.8391199111938477, + "learning_rate": 2.7222591499606966e-05, + "loss": 1.0086, + "step": 31540 + }, + { + "epoch": 9.44, + "grad_norm": 2.663299560546875, + "learning_rate": 2.721673949209197e-05, + "loss": 1.134, + "step": 31545 + }, + { + "epoch": 9.44, + "grad_norm": 2.3702192306518555, + "learning_rate": 2.7210887362148755e-05, + "loss": 1.1833, + "step": 31550 + }, + { + "epoch": 9.44, + "grad_norm": 3.5295441150665283, + "learning_rate": 2.7205035110100534e-05, + "loss": 1.0104, + "step": 31555 + }, + { + "epoch": 9.44, + "grad_norm": 6.249225616455078, + "learning_rate": 2.7199182736270524e-05, + "loss": 1.0269, + "step": 31560 + }, + { + "epoch": 9.44, + "grad_norm": 3.3529887199401855, + "learning_rate": 2.719333024098193e-05, + "loss": 1.0667, + "step": 31565 + }, + { + "epoch": 9.45, + "grad_norm": 2.5261456966400146, + "learning_rate": 2.7187477624557982e-05, + "loss": 0.9635, + "step": 31570 + }, + { + "epoch": 9.45, + "grad_norm": 3.518795967102051, + "learning_rate": 2.718162488732192e-05, + "loss": 1.1624, + "step": 31575 + }, + { + "epoch": 9.45, + "grad_norm": 8.467171669006348, + "learning_rate": 2.7175772029596986e-05, + "loss": 1.1869, + "step": 31580 + }, + { + "epoch": 9.45, + "grad_norm": 3.2780685424804688, + "learning_rate": 2.7169919051706422e-05, + "loss": 1.0429, + "step": 31585 + }, + { + "epoch": 9.45, + "grad_norm": 4.071305751800537, + "learning_rate": 2.716406595397349e-05, + "loss": 1.252, + "step": 31590 + }, + { + "epoch": 9.45, + "grad_norm": 2.072431802749634, + "learning_rate": 2.7158212736721444e-05, + "loss": 1.0693, + "step": 31595 + }, + { + "epoch": 9.45, + "grad_norm": 6.514179229736328, + "learning_rate": 2.7152359400273546e-05, + "loss": 1.147, + "step": 31600 + }, + { + "epoch": 9.46, + "grad_norm": 3.2777328491210938, + "learning_rate": 2.7146505944953075e-05, + "loss": 0.9809, + "step": 31605 + }, + { + "epoch": 9.46, + "grad_norm": 2.1688232421875, + "learning_rate": 2.714065237108332e-05, + "loss": 1.2513, + "step": 31610 + }, + { + "epoch": 9.46, + "grad_norm": 3.6384012699127197, + "learning_rate": 2.7134798678987544e-05, + "loss": 1.1978, + "step": 31615 + }, + { + "epoch": 9.46, + "grad_norm": 1.3136318922042847, + "learning_rate": 2.712894486898907e-05, + "loss": 1.0106, + "step": 31620 + }, + { + "epoch": 9.46, + "grad_norm": 1.3264063596725464, + "learning_rate": 2.7123090941411185e-05, + "loss": 1.1233, + "step": 31625 + }, + { + "epoch": 9.46, + "grad_norm": 1.84585440158844, + "learning_rate": 2.7117236896577185e-05, + "loss": 1.0608, + "step": 31630 + }, + { + "epoch": 9.46, + "grad_norm": 2.1590003967285156, + "learning_rate": 2.71113827348104e-05, + "loss": 1.1022, + "step": 31635 + }, + { + "epoch": 9.47, + "grad_norm": 5.31462287902832, + "learning_rate": 2.7105528456434136e-05, + "loss": 1.0598, + "step": 31640 + }, + { + "epoch": 9.47, + "grad_norm": 2.746979236602783, + "learning_rate": 2.7099674061771724e-05, + "loss": 1.116, + "step": 31645 + }, + { + "epoch": 9.47, + "grad_norm": 2.339494466781616, + "learning_rate": 2.70938195511465e-05, + "loss": 1.0503, + "step": 31650 + }, + { + "epoch": 9.47, + "grad_norm": 10.379657745361328, + "learning_rate": 2.708796492488179e-05, + "loss": 1.2657, + "step": 31655 + }, + { + "epoch": 9.47, + "grad_norm": 1.8636327981948853, + "learning_rate": 2.7082110183300962e-05, + "loss": 1.0569, + "step": 31660 + }, + { + "epoch": 9.47, + "grad_norm": 3.62514328956604, + "learning_rate": 2.7076255326727344e-05, + "loss": 1.1566, + "step": 31665 + }, + { + "epoch": 9.48, + "grad_norm": 1.6103851795196533, + "learning_rate": 2.7070400355484306e-05, + "loss": 1.2291, + "step": 31670 + }, + { + "epoch": 9.48, + "grad_norm": 38.40741729736328, + "learning_rate": 2.70645452698952e-05, + "loss": 1.0219, + "step": 31675 + }, + { + "epoch": 9.48, + "grad_norm": 1.3236831426620483, + "learning_rate": 2.705869007028341e-05, + "loss": 1.1879, + "step": 31680 + }, + { + "epoch": 9.48, + "grad_norm": 2.328878402709961, + "learning_rate": 2.7052834756972307e-05, + "loss": 0.949, + "step": 31685 + }, + { + "epoch": 9.48, + "grad_norm": 2.4937305450439453, + "learning_rate": 2.704697933028528e-05, + "loss": 1.2623, + "step": 31690 + }, + { + "epoch": 9.48, + "grad_norm": 1.3484976291656494, + "learning_rate": 2.7041123790545703e-05, + "loss": 0.9774, + "step": 31695 + }, + { + "epoch": 9.48, + "grad_norm": 2.5267484188079834, + "learning_rate": 2.703526813807698e-05, + "loss": 1.0935, + "step": 31700 + }, + { + "epoch": 9.49, + "grad_norm": 1.9129496812820435, + "learning_rate": 2.702941237320252e-05, + "loss": 1.0674, + "step": 31705 + }, + { + "epoch": 9.49, + "grad_norm": 2.022209882736206, + "learning_rate": 2.702355649624572e-05, + "loss": 1.1119, + "step": 31710 + }, + { + "epoch": 9.49, + "grad_norm": 5.069640159606934, + "learning_rate": 2.7017700507529996e-05, + "loss": 1.1296, + "step": 31715 + }, + { + "epoch": 9.49, + "grad_norm": 3.870335102081299, + "learning_rate": 2.7011844407378776e-05, + "loss": 1.2017, + "step": 31720 + }, + { + "epoch": 9.49, + "grad_norm": 1.1030346155166626, + "learning_rate": 2.7005988196115482e-05, + "loss": 1.1276, + "step": 31725 + }, + { + "epoch": 9.49, + "grad_norm": 1.269315481185913, + "learning_rate": 2.7000131874063545e-05, + "loss": 1.1485, + "step": 31730 + }, + { + "epoch": 9.49, + "grad_norm": 6.841867923736572, + "learning_rate": 2.69942754415464e-05, + "loss": 0.9942, + "step": 31735 + }, + { + "epoch": 9.5, + "grad_norm": 2.453371524810791, + "learning_rate": 2.6988418898887498e-05, + "loss": 1.0962, + "step": 31740 + }, + { + "epoch": 9.5, + "grad_norm": 4.395910263061523, + "learning_rate": 2.698256224641028e-05, + "loss": 1.0983, + "step": 31745 + }, + { + "epoch": 9.5, + "grad_norm": 3.848184108734131, + "learning_rate": 2.6976705484438213e-05, + "loss": 1.0488, + "step": 31750 + }, + { + "epoch": 9.5, + "grad_norm": 3.335904121398926, + "learning_rate": 2.6970848613294765e-05, + "loss": 1.2326, + "step": 31755 + }, + { + "epoch": 9.5, + "grad_norm": 6.4267706871032715, + "learning_rate": 2.696499163330339e-05, + "loss": 1.0299, + "step": 31760 + }, + { + "epoch": 9.5, + "grad_norm": 2.3225696086883545, + "learning_rate": 2.6959134544787567e-05, + "loss": 1.1295, + "step": 31765 + }, + { + "epoch": 9.51, + "grad_norm": 4.162220478057861, + "learning_rate": 2.6953277348070783e-05, + "loss": 1.0728, + "step": 31770 + }, + { + "epoch": 9.51, + "grad_norm": 5.359671592712402, + "learning_rate": 2.6947420043476524e-05, + "loss": 1.186, + "step": 31775 + }, + { + "epoch": 9.51, + "grad_norm": 2.6894781589508057, + "learning_rate": 2.6941562631328278e-05, + "loss": 0.9445, + "step": 31780 + }, + { + "epoch": 9.51, + "grad_norm": 30.432933807373047, + "learning_rate": 2.693570511194954e-05, + "loss": 1.1887, + "step": 31785 + }, + { + "epoch": 9.51, + "grad_norm": 4.67871618270874, + "learning_rate": 2.6929847485663823e-05, + "loss": 0.9568, + "step": 31790 + }, + { + "epoch": 9.51, + "grad_norm": 5.078667163848877, + "learning_rate": 2.6923989752794638e-05, + "loss": 1.0335, + "step": 31795 + }, + { + "epoch": 9.51, + "grad_norm": 3.381072521209717, + "learning_rate": 2.69181319136655e-05, + "loss": 1.1649, + "step": 31800 + }, + { + "epoch": 9.52, + "grad_norm": 3.0394537448883057, + "learning_rate": 2.6912273968599928e-05, + "loss": 1.1632, + "step": 31805 + }, + { + "epoch": 9.52, + "grad_norm": 3.1796021461486816, + "learning_rate": 2.690641591792145e-05, + "loss": 1.2188, + "step": 31810 + }, + { + "epoch": 9.52, + "grad_norm": 6.784097671508789, + "learning_rate": 2.690055776195361e-05, + "loss": 1.0647, + "step": 31815 + }, + { + "epoch": 9.52, + "grad_norm": 5.862364768981934, + "learning_rate": 2.6894699501019937e-05, + "loss": 1.034, + "step": 31820 + }, + { + "epoch": 9.52, + "grad_norm": 1.8071324825286865, + "learning_rate": 2.688884113544398e-05, + "loss": 1.0983, + "step": 31825 + }, + { + "epoch": 9.52, + "grad_norm": 3.1565282344818115, + "learning_rate": 2.6882982665549288e-05, + "loss": 1.1485, + "step": 31830 + }, + { + "epoch": 9.52, + "grad_norm": 2.5325186252593994, + "learning_rate": 2.6877124091659424e-05, + "loss": 1.1328, + "step": 31835 + }, + { + "epoch": 9.53, + "grad_norm": 1.6840946674346924, + "learning_rate": 2.6871265414097947e-05, + "loss": 0.9854, + "step": 31840 + }, + { + "epoch": 9.53, + "grad_norm": 1.4422705173492432, + "learning_rate": 2.6865406633188423e-05, + "loss": 1.0145, + "step": 31845 + }, + { + "epoch": 9.53, + "grad_norm": 2.4371087551116943, + "learning_rate": 2.6859547749254433e-05, + "loss": 1.1726, + "step": 31850 + }, + { + "epoch": 9.53, + "grad_norm": 3.1019036769866943, + "learning_rate": 2.6853688762619555e-05, + "loss": 1.1436, + "step": 31855 + }, + { + "epoch": 9.53, + "grad_norm": 3.6679959297180176, + "learning_rate": 2.6847829673607373e-05, + "loss": 1.0577, + "step": 31860 + }, + { + "epoch": 9.53, + "grad_norm": 3.406115770339966, + "learning_rate": 2.6841970482541473e-05, + "loss": 1.1801, + "step": 31865 + }, + { + "epoch": 9.54, + "grad_norm": 2.184192180633545, + "learning_rate": 2.6836111189745462e-05, + "loss": 1.1725, + "step": 31870 + }, + { + "epoch": 9.54, + "grad_norm": 2.7702200412750244, + "learning_rate": 2.6830251795542938e-05, + "loss": 1.061, + "step": 31875 + }, + { + "epoch": 9.54, + "grad_norm": 1.0884158611297607, + "learning_rate": 2.6824392300257505e-05, + "loss": 0.953, + "step": 31880 + }, + { + "epoch": 9.54, + "grad_norm": 1.4687933921813965, + "learning_rate": 2.681853270421279e-05, + "loss": 1.1745, + "step": 31885 + }, + { + "epoch": 9.54, + "grad_norm": 1.536842703819275, + "learning_rate": 2.68126730077324e-05, + "loss": 0.9724, + "step": 31890 + }, + { + "epoch": 9.54, + "grad_norm": 3.3644862174987793, + "learning_rate": 2.6806813211139965e-05, + "loss": 1.1599, + "step": 31895 + }, + { + "epoch": 9.54, + "grad_norm": 2.4303081035614014, + "learning_rate": 2.6800953314759108e-05, + "loss": 1.2175, + "step": 31900 + }, + { + "epoch": 9.55, + "grad_norm": 4.207813262939453, + "learning_rate": 2.679509331891348e-05, + "loss": 1.1489, + "step": 31905 + }, + { + "epoch": 9.55, + "grad_norm": 4.013348579406738, + "learning_rate": 2.6789233223926713e-05, + "loss": 1.1652, + "step": 31910 + }, + { + "epoch": 9.55, + "grad_norm": 4.057195663452148, + "learning_rate": 2.6783373030122455e-05, + "loss": 1.1245, + "step": 31915 + }, + { + "epoch": 9.55, + "grad_norm": 1.9660584926605225, + "learning_rate": 2.6777512737824358e-05, + "loss": 1.2141, + "step": 31920 + }, + { + "epoch": 9.55, + "grad_norm": 1.8502095937728882, + "learning_rate": 2.6771652347356074e-05, + "loss": 0.9919, + "step": 31925 + }, + { + "epoch": 9.55, + "grad_norm": 1.4505739212036133, + "learning_rate": 2.6765791859041278e-05, + "loss": 1.0553, + "step": 31930 + }, + { + "epoch": 9.55, + "grad_norm": 3.0813705921173096, + "learning_rate": 2.6759931273203632e-05, + "loss": 0.9613, + "step": 31935 + }, + { + "epoch": 9.56, + "grad_norm": 1.6187719106674194, + "learning_rate": 2.6754070590166808e-05, + "loss": 1.1784, + "step": 31940 + }, + { + "epoch": 9.56, + "grad_norm": 5.102146148681641, + "learning_rate": 2.6748209810254493e-05, + "loss": 1.1304, + "step": 31945 + }, + { + "epoch": 9.56, + "grad_norm": 2.7340807914733887, + "learning_rate": 2.674234893379037e-05, + "loss": 1.0532, + "step": 31950 + }, + { + "epoch": 9.56, + "grad_norm": 3.3203606605529785, + "learning_rate": 2.6736487961098122e-05, + "loss": 1.0749, + "step": 31955 + }, + { + "epoch": 9.56, + "grad_norm": 2.275956153869629, + "learning_rate": 2.6730626892501448e-05, + "loss": 1.0954, + "step": 31960 + }, + { + "epoch": 9.56, + "grad_norm": 1.4483486413955688, + "learning_rate": 2.6724765728324054e-05, + "loss": 1.1644, + "step": 31965 + }, + { + "epoch": 9.57, + "grad_norm": 2.504141330718994, + "learning_rate": 2.6718904468889633e-05, + "loss": 1.299, + "step": 31970 + }, + { + "epoch": 9.57, + "grad_norm": 2.5018177032470703, + "learning_rate": 2.671304311452191e-05, + "loss": 1.1193, + "step": 31975 + }, + { + "epoch": 9.57, + "grad_norm": 3.87642502784729, + "learning_rate": 2.6707181665544594e-05, + "loss": 1.0883, + "step": 31980 + }, + { + "epoch": 9.57, + "grad_norm": 4.660168647766113, + "learning_rate": 2.6701320122281416e-05, + "loss": 1.0395, + "step": 31985 + }, + { + "epoch": 9.57, + "grad_norm": 2.28615665435791, + "learning_rate": 2.669545848505609e-05, + "loss": 1.1664, + "step": 31990 + }, + { + "epoch": 9.57, + "grad_norm": 4.775680065155029, + "learning_rate": 2.6689596754192355e-05, + "loss": 1.2104, + "step": 31995 + }, + { + "epoch": 9.57, + "grad_norm": 1.285249948501587, + "learning_rate": 2.668373493001395e-05, + "loss": 1.1832, + "step": 32000 + }, + { + "epoch": 9.58, + "grad_norm": 4.368350982666016, + "learning_rate": 2.667787301284461e-05, + "loss": 1.1155, + "step": 32005 + }, + { + "epoch": 9.58, + "grad_norm": 1.7169698476791382, + "learning_rate": 2.667201100300809e-05, + "loss": 0.9742, + "step": 32010 + }, + { + "epoch": 9.58, + "grad_norm": 2.3918094635009766, + "learning_rate": 2.666614890082815e-05, + "loss": 1.067, + "step": 32015 + }, + { + "epoch": 9.58, + "grad_norm": 3.565277576446533, + "learning_rate": 2.666028670662853e-05, + "loss": 1.0911, + "step": 32020 + }, + { + "epoch": 9.58, + "grad_norm": 4.275758743286133, + "learning_rate": 2.6654424420732997e-05, + "loss": 0.998, + "step": 32025 + }, + { + "epoch": 9.58, + "grad_norm": 2.591794490814209, + "learning_rate": 2.6648562043465323e-05, + "loss": 1.111, + "step": 32030 + }, + { + "epoch": 9.58, + "grad_norm": 5.0382561683654785, + "learning_rate": 2.664269957514929e-05, + "loss": 1.1469, + "step": 32035 + }, + { + "epoch": 9.59, + "grad_norm": 2.2757861614227295, + "learning_rate": 2.6636837016108656e-05, + "loss": 1.0449, + "step": 32040 + }, + { + "epoch": 9.59, + "grad_norm": 2.4247963428497314, + "learning_rate": 2.6630974366667226e-05, + "loss": 1.166, + "step": 32045 + }, + { + "epoch": 9.59, + "grad_norm": 1.7559020519256592, + "learning_rate": 2.6625111627148768e-05, + "loss": 1.1379, + "step": 32050 + }, + { + "epoch": 9.59, + "grad_norm": 2.4389190673828125, + "learning_rate": 2.661924879787709e-05, + "loss": 1.0711, + "step": 32055 + }, + { + "epoch": 9.59, + "grad_norm": 2.227290630340576, + "learning_rate": 2.661338587917597e-05, + "loss": 1.0021, + "step": 32060 + }, + { + "epoch": 9.59, + "grad_norm": 1.5972774028778076, + "learning_rate": 2.660752287136924e-05, + "loss": 1.0343, + "step": 32065 + }, + { + "epoch": 9.59, + "grad_norm": 7.264094829559326, + "learning_rate": 2.6601659774780692e-05, + "loss": 1.0336, + "step": 32070 + }, + { + "epoch": 9.6, + "grad_norm": 1.8394643068313599, + "learning_rate": 2.6595796589734136e-05, + "loss": 1.0561, + "step": 32075 + }, + { + "epoch": 9.6, + "grad_norm": 2.3896703720092773, + "learning_rate": 2.6589933316553395e-05, + "loss": 1.2167, + "step": 32080 + }, + { + "epoch": 9.6, + "grad_norm": 1.8738449811935425, + "learning_rate": 2.6584069955562286e-05, + "loss": 1.1215, + "step": 32085 + }, + { + "epoch": 9.6, + "grad_norm": 1.6844857931137085, + "learning_rate": 2.657820650708464e-05, + "loss": 1.1444, + "step": 32090 + }, + { + "epoch": 9.6, + "grad_norm": 6.457211494445801, + "learning_rate": 2.657234297144429e-05, + "loss": 0.9628, + "step": 32095 + }, + { + "epoch": 9.6, + "grad_norm": 3.831761121749878, + "learning_rate": 2.656647934896508e-05, + "loss": 0.8621, + "step": 32100 + }, + { + "epoch": 9.61, + "grad_norm": 2.6082820892333984, + "learning_rate": 2.6560615639970833e-05, + "loss": 1.1312, + "step": 32105 + }, + { + "epoch": 9.61, + "grad_norm": 1.152781367301941, + "learning_rate": 2.6554751844785414e-05, + "loss": 1.1762, + "step": 32110 + }, + { + "epoch": 9.61, + "grad_norm": 1.7581003904342651, + "learning_rate": 2.654888796373266e-05, + "loss": 0.9553, + "step": 32115 + }, + { + "epoch": 9.61, + "grad_norm": 3.2171480655670166, + "learning_rate": 2.654302399713644e-05, + "loss": 1.0941, + "step": 32120 + }, + { + "epoch": 9.61, + "grad_norm": 1.9642105102539062, + "learning_rate": 2.6537159945320606e-05, + "loss": 1.1688, + "step": 32125 + }, + { + "epoch": 9.61, + "grad_norm": 9.561652183532715, + "learning_rate": 2.6531295808609023e-05, + "loss": 0.9819, + "step": 32130 + }, + { + "epoch": 9.61, + "grad_norm": 7.757037162780762, + "learning_rate": 2.6525431587325568e-05, + "loss": 1.0823, + "step": 32135 + }, + { + "epoch": 9.62, + "grad_norm": 4.364965438842773, + "learning_rate": 2.6519567281794105e-05, + "loss": 1.008, + "step": 32140 + }, + { + "epoch": 9.62, + "grad_norm": 5.253482818603516, + "learning_rate": 2.6513702892338526e-05, + "loss": 0.9406, + "step": 32145 + }, + { + "epoch": 9.62, + "grad_norm": 3.2114205360412598, + "learning_rate": 2.650783841928271e-05, + "loss": 0.9781, + "step": 32150 + }, + { + "epoch": 9.62, + "grad_norm": 5.03977108001709, + "learning_rate": 2.6501973862950547e-05, + "loss": 1.1507, + "step": 32155 + }, + { + "epoch": 9.62, + "grad_norm": 2.5563600063323975, + "learning_rate": 2.6496109223665928e-05, + "loss": 1.0864, + "step": 32160 + }, + { + "epoch": 9.62, + "grad_norm": 3.900869369506836, + "learning_rate": 2.649024450175275e-05, + "loss": 1.0983, + "step": 32165 + }, + { + "epoch": 9.62, + "grad_norm": 1.9279170036315918, + "learning_rate": 2.648437969753491e-05, + "loss": 1.0866, + "step": 32170 + }, + { + "epoch": 9.63, + "grad_norm": 5.001946926116943, + "learning_rate": 2.6478514811336342e-05, + "loss": 1.0995, + "step": 32175 + }, + { + "epoch": 9.63, + "grad_norm": 3.2745273113250732, + "learning_rate": 2.6472649843480923e-05, + "loss": 1.1328, + "step": 32180 + }, + { + "epoch": 9.63, + "grad_norm": 5.206939697265625, + "learning_rate": 2.6466784794292588e-05, + "loss": 1.1132, + "step": 32185 + }, + { + "epoch": 9.63, + "grad_norm": 2.813488483428955, + "learning_rate": 2.6460919664095245e-05, + "loss": 1.0479, + "step": 32190 + }, + { + "epoch": 9.63, + "grad_norm": 3.6982460021972656, + "learning_rate": 2.6455054453212837e-05, + "loss": 1.0327, + "step": 32195 + }, + { + "epoch": 9.63, + "grad_norm": 1.1332085132598877, + "learning_rate": 2.644918916196928e-05, + "loss": 1.0878, + "step": 32200 + }, + { + "epoch": 9.64, + "grad_norm": 2.1422863006591797, + "learning_rate": 2.6443323790688517e-05, + "loss": 1.0685, + "step": 32205 + }, + { + "epoch": 9.64, + "grad_norm": 4.674763202667236, + "learning_rate": 2.6437458339694483e-05, + "loss": 0.8133, + "step": 32210 + }, + { + "epoch": 9.64, + "grad_norm": 1.626625657081604, + "learning_rate": 2.6431592809311112e-05, + "loss": 1.2528, + "step": 32215 + }, + { + "epoch": 9.64, + "grad_norm": 1.6559752225875854, + "learning_rate": 2.642572719986236e-05, + "loss": 1.0107, + "step": 32220 + }, + { + "epoch": 9.64, + "grad_norm": 3.3808319568634033, + "learning_rate": 2.6419861511672174e-05, + "loss": 1.2791, + "step": 32225 + }, + { + "epoch": 9.64, + "grad_norm": 2.20304274559021, + "learning_rate": 2.6413995745064513e-05, + "loss": 1.0505, + "step": 32230 + }, + { + "epoch": 9.64, + "grad_norm": 3.014941692352295, + "learning_rate": 2.6408129900363342e-05, + "loss": 1.0684, + "step": 32235 + }, + { + "epoch": 9.65, + "grad_norm": 2.0386364459991455, + "learning_rate": 2.6402263977892617e-05, + "loss": 0.9374, + "step": 32240 + }, + { + "epoch": 9.65, + "grad_norm": 2.927957057952881, + "learning_rate": 2.6396397977976305e-05, + "loss": 0.95, + "step": 32245 + }, + { + "epoch": 9.65, + "grad_norm": 2.662858724594116, + "learning_rate": 2.639053190093839e-05, + "loss": 1.0665, + "step": 32250 + }, + { + "epoch": 9.65, + "grad_norm": 2.204014539718628, + "learning_rate": 2.6384665747102842e-05, + "loss": 1.0675, + "step": 32255 + }, + { + "epoch": 9.65, + "grad_norm": 3.048424005508423, + "learning_rate": 2.6378799516793645e-05, + "loss": 1.1372, + "step": 32260 + }, + { + "epoch": 9.65, + "grad_norm": 2.034694194793701, + "learning_rate": 2.6372933210334788e-05, + "loss": 1.1936, + "step": 32265 + }, + { + "epoch": 9.65, + "grad_norm": 1.9877517223358154, + "learning_rate": 2.636706682805026e-05, + "loss": 1.129, + "step": 32270 + }, + { + "epoch": 9.66, + "grad_norm": 1.1743559837341309, + "learning_rate": 2.636120037026404e-05, + "loss": 1.1884, + "step": 32275 + }, + { + "epoch": 9.66, + "grad_norm": 3.967900514602661, + "learning_rate": 2.6355333837300144e-05, + "loss": 1.1177, + "step": 32280 + }, + { + "epoch": 9.66, + "grad_norm": 2.5938291549682617, + "learning_rate": 2.6349467229482566e-05, + "loss": 1.1809, + "step": 32285 + }, + { + "epoch": 9.66, + "grad_norm": 3.2662594318389893, + "learning_rate": 2.6343600547135318e-05, + "loss": 1.0683, + "step": 32290 + }, + { + "epoch": 9.66, + "grad_norm": 2.3414182662963867, + "learning_rate": 2.633773379058241e-05, + "loss": 1.2112, + "step": 32295 + }, + { + "epoch": 9.66, + "grad_norm": 2.2585291862487793, + "learning_rate": 2.633186696014785e-05, + "loss": 1.1298, + "step": 32300 + }, + { + "epoch": 9.67, + "grad_norm": 1.8855080604553223, + "learning_rate": 2.632600005615567e-05, + "loss": 1.0465, + "step": 32305 + }, + { + "epoch": 9.67, + "grad_norm": 1.7714307308197021, + "learning_rate": 2.632013307892988e-05, + "loss": 1.0813, + "step": 32310 + }, + { + "epoch": 9.67, + "grad_norm": 1.7894978523254395, + "learning_rate": 2.6314266028794516e-05, + "loss": 1.0873, + "step": 32315 + }, + { + "epoch": 9.67, + "grad_norm": 2.007699489593506, + "learning_rate": 2.63083989060736e-05, + "loss": 1.0634, + "step": 32320 + }, + { + "epoch": 9.67, + "grad_norm": 4.4481892585754395, + "learning_rate": 2.630253171109118e-05, + "loss": 1.1221, + "step": 32325 + }, + { + "epoch": 9.67, + "grad_norm": 2.6404707431793213, + "learning_rate": 2.6296664444171277e-05, + "loss": 1.0629, + "step": 32330 + }, + { + "epoch": 9.67, + "grad_norm": 4.706933975219727, + "learning_rate": 2.629079710563795e-05, + "loss": 0.9293, + "step": 32335 + }, + { + "epoch": 9.68, + "grad_norm": 3.423985242843628, + "learning_rate": 2.628492969581524e-05, + "loss": 1.1948, + "step": 32340 + }, + { + "epoch": 9.68, + "grad_norm": 3.1933882236480713, + "learning_rate": 2.62790622150272e-05, + "loss": 0.9721, + "step": 32345 + }, + { + "epoch": 9.68, + "grad_norm": 2.481731414794922, + "learning_rate": 2.6273194663597877e-05, + "loss": 1.0354, + "step": 32350 + }, + { + "epoch": 9.68, + "grad_norm": 2.2168052196502686, + "learning_rate": 2.626732704185134e-05, + "loss": 1.0921, + "step": 32355 + }, + { + "epoch": 9.68, + "grad_norm": 1.8711326122283936, + "learning_rate": 2.626145935011165e-05, + "loss": 1.2218, + "step": 32360 + }, + { + "epoch": 9.68, + "grad_norm": 5.039557456970215, + "learning_rate": 2.6255591588702872e-05, + "loss": 1.0336, + "step": 32365 + }, + { + "epoch": 9.68, + "grad_norm": 2.2609169483184814, + "learning_rate": 2.624972375794908e-05, + "loss": 1.2382, + "step": 32370 + }, + { + "epoch": 9.69, + "grad_norm": 3.6978182792663574, + "learning_rate": 2.624385585817434e-05, + "loss": 1.16, + "step": 32375 + }, + { + "epoch": 9.69, + "grad_norm": 1.6256968975067139, + "learning_rate": 2.623798788970273e-05, + "loss": 1.1525, + "step": 32380 + }, + { + "epoch": 9.69, + "grad_norm": 2.7560393810272217, + "learning_rate": 2.623211985285834e-05, + "loss": 1.0104, + "step": 32385 + }, + { + "epoch": 9.69, + "grad_norm": 2.977550506591797, + "learning_rate": 2.6226251747965247e-05, + "loss": 1.093, + "step": 32390 + }, + { + "epoch": 9.69, + "grad_norm": 2.2501907348632812, + "learning_rate": 2.6220383575347547e-05, + "loss": 1.1158, + "step": 32395 + }, + { + "epoch": 9.69, + "grad_norm": 1.943388819694519, + "learning_rate": 2.621451533532933e-05, + "loss": 1.1113, + "step": 32400 + }, + { + "epoch": 9.7, + "grad_norm": 3.2862977981567383, + "learning_rate": 2.6208647028234695e-05, + "loss": 1.1408, + "step": 32405 + }, + { + "epoch": 9.7, + "grad_norm": 2.2214648723602295, + "learning_rate": 2.6202778654387737e-05, + "loss": 0.9565, + "step": 32410 + }, + { + "epoch": 9.7, + "grad_norm": 2.63506817817688, + "learning_rate": 2.619691021411257e-05, + "loss": 1.1946, + "step": 32415 + }, + { + "epoch": 9.7, + "grad_norm": 2.0027542114257812, + "learning_rate": 2.6191041707733293e-05, + "loss": 0.9843, + "step": 32420 + }, + { + "epoch": 9.7, + "grad_norm": 3.6025943756103516, + "learning_rate": 2.618517313557402e-05, + "loss": 0.9494, + "step": 32425 + }, + { + "epoch": 9.7, + "grad_norm": 3.030153751373291, + "learning_rate": 2.6179304497958855e-05, + "loss": 1.059, + "step": 32430 + }, + { + "epoch": 9.7, + "grad_norm": 1.995133399963379, + "learning_rate": 2.6173435795211947e-05, + "loss": 0.9847, + "step": 32435 + }, + { + "epoch": 9.71, + "grad_norm": 2.5811808109283447, + "learning_rate": 2.6167567027657397e-05, + "loss": 1.0383, + "step": 32440 + }, + { + "epoch": 9.71, + "grad_norm": 3.842437982559204, + "learning_rate": 2.6161698195619327e-05, + "loss": 1.0719, + "step": 32445 + }, + { + "epoch": 9.71, + "grad_norm": 5.087784290313721, + "learning_rate": 2.6155829299421875e-05, + "loss": 1.1697, + "step": 32450 + }, + { + "epoch": 9.71, + "grad_norm": 3.2515745162963867, + "learning_rate": 2.6149960339389174e-05, + "loss": 1.0113, + "step": 32455 + }, + { + "epoch": 9.71, + "grad_norm": 2.410245656967163, + "learning_rate": 2.6144091315845353e-05, + "loss": 1.1748, + "step": 32460 + }, + { + "epoch": 9.71, + "grad_norm": 2.392496347427368, + "learning_rate": 2.6138222229114572e-05, + "loss": 1.1732, + "step": 32465 + }, + { + "epoch": 9.71, + "grad_norm": 2.2038094997406006, + "learning_rate": 2.613235307952095e-05, + "loss": 0.9646, + "step": 32470 + }, + { + "epoch": 9.72, + "grad_norm": 1.9154731035232544, + "learning_rate": 2.6126483867388645e-05, + "loss": 1.2406, + "step": 32475 + }, + { + "epoch": 9.72, + "grad_norm": 1.628433108329773, + "learning_rate": 2.612061459304181e-05, + "loss": 1.1421, + "step": 32480 + }, + { + "epoch": 9.72, + "grad_norm": 1.9438600540161133, + "learning_rate": 2.611474525680459e-05, + "loss": 1.0932, + "step": 32485 + }, + { + "epoch": 9.72, + "grad_norm": 5.289341926574707, + "learning_rate": 2.6108875859001152e-05, + "loss": 0.9074, + "step": 32490 + }, + { + "epoch": 9.72, + "grad_norm": 1.4697092771530151, + "learning_rate": 2.610300639995565e-05, + "loss": 1.178, + "step": 32495 + }, + { + "epoch": 9.72, + "grad_norm": 2.8310253620147705, + "learning_rate": 2.6097136879992256e-05, + "loss": 1.0798, + "step": 32500 + }, + { + "epoch": 9.73, + "grad_norm": 4.534608840942383, + "learning_rate": 2.609126729943513e-05, + "loss": 1.1111, + "step": 32505 + }, + { + "epoch": 9.73, + "grad_norm": 4.378732681274414, + "learning_rate": 2.608539765860844e-05, + "loss": 1.1557, + "step": 32510 + }, + { + "epoch": 9.73, + "grad_norm": 2.2998828887939453, + "learning_rate": 2.607952795783637e-05, + "loss": 1.0735, + "step": 32515 + }, + { + "epoch": 9.73, + "grad_norm": 2.0673623085021973, + "learning_rate": 2.6073658197443095e-05, + "loss": 1.0948, + "step": 32520 + }, + { + "epoch": 9.73, + "grad_norm": 2.536200761795044, + "learning_rate": 2.6067788377752793e-05, + "loss": 1.3438, + "step": 32525 + }, + { + "epoch": 9.73, + "grad_norm": 1.4531900882720947, + "learning_rate": 2.6061918499089656e-05, + "loss": 0.9371, + "step": 32530 + }, + { + "epoch": 9.73, + "grad_norm": 4.1081342697143555, + "learning_rate": 2.6056048561777852e-05, + "loss": 1.1155, + "step": 32535 + }, + { + "epoch": 9.74, + "grad_norm": 8.0404691696167, + "learning_rate": 2.6050178566141585e-05, + "loss": 1.1525, + "step": 32540 + }, + { + "epoch": 9.74, + "grad_norm": 2.6150429248809814, + "learning_rate": 2.6044308512505056e-05, + "loss": 1.1611, + "step": 32545 + }, + { + "epoch": 9.74, + "grad_norm": 6.387349605560303, + "learning_rate": 2.6038438401192444e-05, + "loss": 1.0697, + "step": 32550 + }, + { + "epoch": 9.74, + "grad_norm": 3.0312044620513916, + "learning_rate": 2.6032568232527964e-05, + "loss": 1.0854, + "step": 32555 + }, + { + "epoch": 9.74, + "grad_norm": 1.371320366859436, + "learning_rate": 2.6026698006835814e-05, + "loss": 1.0813, + "step": 32560 + }, + { + "epoch": 9.74, + "grad_norm": 2.4062769412994385, + "learning_rate": 2.60208277244402e-05, + "loss": 1.0397, + "step": 32565 + }, + { + "epoch": 9.74, + "grad_norm": 2.3554258346557617, + "learning_rate": 2.601495738566533e-05, + "loss": 1.1501, + "step": 32570 + }, + { + "epoch": 9.75, + "grad_norm": 2.7629899978637695, + "learning_rate": 2.6009086990835418e-05, + "loss": 1.039, + "step": 32575 + }, + { + "epoch": 9.75, + "grad_norm": 1.9334179162979126, + "learning_rate": 2.6003216540274682e-05, + "loss": 1.1206, + "step": 32580 + }, + { + "epoch": 9.75, + "grad_norm": 2.248354434967041, + "learning_rate": 2.5997346034307337e-05, + "loss": 1.0189, + "step": 32585 + }, + { + "epoch": 9.75, + "grad_norm": 2.1622071266174316, + "learning_rate": 2.5991475473257608e-05, + "loss": 1.0698, + "step": 32590 + }, + { + "epoch": 9.75, + "grad_norm": 1.745422601699829, + "learning_rate": 2.598560485744972e-05, + "loss": 0.8944, + "step": 32595 + }, + { + "epoch": 9.75, + "grad_norm": 2.3082125186920166, + "learning_rate": 2.59797341872079e-05, + "loss": 1.0338, + "step": 32600 + }, + { + "epoch": 9.76, + "grad_norm": 3.817018985748291, + "learning_rate": 2.5973863462856378e-05, + "loss": 0.965, + "step": 32605 + }, + { + "epoch": 9.76, + "grad_norm": 2.986459493637085, + "learning_rate": 2.596799268471939e-05, + "loss": 1.1374, + "step": 32610 + }, + { + "epoch": 9.76, + "grad_norm": 4.016047477722168, + "learning_rate": 2.5962121853121174e-05, + "loss": 1.047, + "step": 32615 + }, + { + "epoch": 9.76, + "grad_norm": 3.042452812194824, + "learning_rate": 2.5956250968385966e-05, + "loss": 0.9784, + "step": 32620 + }, + { + "epoch": 9.76, + "grad_norm": 2.1322600841522217, + "learning_rate": 2.5950380030838017e-05, + "loss": 1.0133, + "step": 32625 + }, + { + "epoch": 9.76, + "grad_norm": 2.257575511932373, + "learning_rate": 2.5944509040801564e-05, + "loss": 1.0372, + "step": 32630 + }, + { + "epoch": 9.76, + "grad_norm": 2.1339566707611084, + "learning_rate": 2.593863799860085e-05, + "loss": 1.0806, + "step": 32635 + }, + { + "epoch": 9.77, + "grad_norm": 2.67110538482666, + "learning_rate": 2.593276690456014e-05, + "loss": 1.0397, + "step": 32640 + }, + { + "epoch": 9.77, + "grad_norm": 4.476059913635254, + "learning_rate": 2.592689575900369e-05, + "loss": 1.0341, + "step": 32645 + }, + { + "epoch": 9.77, + "grad_norm": 3.7582499980926514, + "learning_rate": 2.592102456225574e-05, + "loss": 0.9378, + "step": 32650 + }, + { + "epoch": 9.77, + "grad_norm": 2.036689043045044, + "learning_rate": 2.5915153314640566e-05, + "loss": 1.0847, + "step": 32655 + }, + { + "epoch": 9.77, + "grad_norm": 8.208381652832031, + "learning_rate": 2.5909282016482435e-05, + "loss": 0.9298, + "step": 32660 + }, + { + "epoch": 9.77, + "grad_norm": 2.9294674396514893, + "learning_rate": 2.5903410668105586e-05, + "loss": 1.1432, + "step": 32665 + }, + { + "epoch": 9.77, + "grad_norm": 1.4900659322738647, + "learning_rate": 2.5897539269834313e-05, + "loss": 1.0694, + "step": 32670 + }, + { + "epoch": 9.78, + "grad_norm": 7.3784990310668945, + "learning_rate": 2.5891667821992883e-05, + "loss": 1.0586, + "step": 32675 + }, + { + "epoch": 9.78, + "grad_norm": 1.4126672744750977, + "learning_rate": 2.588579632490556e-05, + "loss": 1.0634, + "step": 32680 + }, + { + "epoch": 9.78, + "grad_norm": 2.6837716102600098, + "learning_rate": 2.587992477889663e-05, + "loss": 1.0635, + "step": 32685 + }, + { + "epoch": 9.78, + "grad_norm": 1.9375698566436768, + "learning_rate": 2.5874053184290366e-05, + "loss": 1.1508, + "step": 32690 + }, + { + "epoch": 9.78, + "grad_norm": 2.615001678466797, + "learning_rate": 2.586818154141105e-05, + "loss": 1.2275, + "step": 32695 + }, + { + "epoch": 9.78, + "grad_norm": 2.7543222904205322, + "learning_rate": 2.5862309850582977e-05, + "loss": 1.072, + "step": 32700 + }, + { + "epoch": 9.78, + "grad_norm": 1.6145623922348022, + "learning_rate": 2.5856438112130427e-05, + "loss": 0.9575, + "step": 32705 + }, + { + "epoch": 9.79, + "grad_norm": 2.7645764350891113, + "learning_rate": 2.585056632637769e-05, + "loss": 1.1105, + "step": 32710 + }, + { + "epoch": 9.79, + "grad_norm": 1.5867153406143188, + "learning_rate": 2.5844694493649054e-05, + "loss": 0.9333, + "step": 32715 + }, + { + "epoch": 9.79, + "grad_norm": 1.9193617105484009, + "learning_rate": 2.583882261426882e-05, + "loss": 1.2603, + "step": 32720 + }, + { + "epoch": 9.79, + "grad_norm": 2.898785352706909, + "learning_rate": 2.5832950688561297e-05, + "loss": 1.1845, + "step": 32725 + }, + { + "epoch": 9.79, + "grad_norm": 1.4677989482879639, + "learning_rate": 2.582707871685076e-05, + "loss": 1.1262, + "step": 32730 + }, + { + "epoch": 9.79, + "grad_norm": 2.376750946044922, + "learning_rate": 2.582120669946153e-05, + "loss": 1.183, + "step": 32735 + }, + { + "epoch": 9.8, + "grad_norm": 2.5935792922973633, + "learning_rate": 2.5815334636717902e-05, + "loss": 1.0188, + "step": 32740 + }, + { + "epoch": 9.8, + "grad_norm": 4.154141902923584, + "learning_rate": 2.5809462528944195e-05, + "loss": 1.1518, + "step": 32745 + }, + { + "epoch": 9.8, + "grad_norm": 4.1396260261535645, + "learning_rate": 2.5803590376464716e-05, + "loss": 1.1648, + "step": 32750 + }, + { + "epoch": 9.8, + "grad_norm": 2.666245698928833, + "learning_rate": 2.5797718179603776e-05, + "loss": 1.0906, + "step": 32755 + }, + { + "epoch": 9.8, + "grad_norm": 2.568533420562744, + "learning_rate": 2.579184593868569e-05, + "loss": 0.9812, + "step": 32760 + }, + { + "epoch": 9.8, + "grad_norm": 3.8902790546417236, + "learning_rate": 2.578597365403477e-05, + "loss": 0.9988, + "step": 32765 + }, + { + "epoch": 9.8, + "grad_norm": 1.01784086227417, + "learning_rate": 2.578010132597534e-05, + "loss": 1.2201, + "step": 32770 + }, + { + "epoch": 9.81, + "grad_norm": 23.18547248840332, + "learning_rate": 2.577422895483173e-05, + "loss": 0.8956, + "step": 32775 + }, + { + "epoch": 9.81, + "grad_norm": 2.158190965652466, + "learning_rate": 2.5768356540928256e-05, + "loss": 1.0248, + "step": 32780 + }, + { + "epoch": 9.81, + "grad_norm": 6.015243053436279, + "learning_rate": 2.5762484084589256e-05, + "loss": 1.1377, + "step": 32785 + }, + { + "epoch": 9.81, + "grad_norm": 2.5766751766204834, + "learning_rate": 2.5756611586139044e-05, + "loss": 0.8034, + "step": 32790 + }, + { + "epoch": 9.81, + "grad_norm": 3.290252208709717, + "learning_rate": 2.5750739045901966e-05, + "loss": 1.3023, + "step": 32795 + }, + { + "epoch": 9.81, + "grad_norm": 2.5434985160827637, + "learning_rate": 2.574486646420235e-05, + "loss": 1.1181, + "step": 32800 + }, + { + "epoch": 9.81, + "grad_norm": 8.717535018920898, + "learning_rate": 2.5738993841364535e-05, + "loss": 1.056, + "step": 32805 + }, + { + "epoch": 9.82, + "grad_norm": 2.3016560077667236, + "learning_rate": 2.5733121177712856e-05, + "loss": 1.2113, + "step": 32810 + }, + { + "epoch": 9.82, + "grad_norm": 1.7972419261932373, + "learning_rate": 2.5727248473571653e-05, + "loss": 1.1958, + "step": 32815 + }, + { + "epoch": 9.82, + "grad_norm": 3.321375846862793, + "learning_rate": 2.5721375729265283e-05, + "loss": 0.9634, + "step": 32820 + }, + { + "epoch": 9.82, + "grad_norm": 3.124709129333496, + "learning_rate": 2.5715502945118075e-05, + "loss": 1.0188, + "step": 32825 + }, + { + "epoch": 9.82, + "grad_norm": 1.5069762468338013, + "learning_rate": 2.570963012145438e-05, + "loss": 0.8482, + "step": 32830 + }, + { + "epoch": 9.82, + "grad_norm": 2.640984535217285, + "learning_rate": 2.5703757258598554e-05, + "loss": 1.1124, + "step": 32835 + }, + { + "epoch": 9.83, + "grad_norm": 4.078088283538818, + "learning_rate": 2.5697884356874947e-05, + "loss": 1.0427, + "step": 32840 + }, + { + "epoch": 9.83, + "grad_norm": 3.2041678428649902, + "learning_rate": 2.5692011416607908e-05, + "loss": 0.9205, + "step": 32845 + }, + { + "epoch": 9.83, + "grad_norm": 2.750579595565796, + "learning_rate": 2.5686138438121804e-05, + "loss": 1.2344, + "step": 32850 + }, + { + "epoch": 9.83, + "grad_norm": 2.770933151245117, + "learning_rate": 2.568026542174099e-05, + "loss": 1.056, + "step": 32855 + }, + { + "epoch": 9.83, + "grad_norm": 2.695629596710205, + "learning_rate": 2.5674392367789818e-05, + "loss": 0.8705, + "step": 32860 + }, + { + "epoch": 9.83, + "grad_norm": 1.1691042184829712, + "learning_rate": 2.5668519276592658e-05, + "loss": 1.0706, + "step": 32865 + }, + { + "epoch": 9.83, + "grad_norm": 4.2656097412109375, + "learning_rate": 2.5662646148473867e-05, + "loss": 0.9537, + "step": 32870 + }, + { + "epoch": 9.84, + "grad_norm": 1.754385232925415, + "learning_rate": 2.5656772983757822e-05, + "loss": 1.1298, + "step": 32875 + }, + { + "epoch": 9.84, + "grad_norm": 1.9311494827270508, + "learning_rate": 2.5650899782768882e-05, + "loss": 1.2659, + "step": 32880 + }, + { + "epoch": 9.84, + "grad_norm": 3.109220504760742, + "learning_rate": 2.564502654583144e-05, + "loss": 1.4295, + "step": 32885 + }, + { + "epoch": 9.84, + "grad_norm": 2.71708607673645, + "learning_rate": 2.563915327326984e-05, + "loss": 1.1224, + "step": 32890 + }, + { + "epoch": 9.84, + "grad_norm": 2.3169498443603516, + "learning_rate": 2.5633279965408475e-05, + "loss": 0.8078, + "step": 32895 + }, + { + "epoch": 9.84, + "grad_norm": 1.5977541208267212, + "learning_rate": 2.5627406622571708e-05, + "loss": 1.19, + "step": 32900 + }, + { + "epoch": 9.84, + "grad_norm": 3.2006161212921143, + "learning_rate": 2.5621533245083934e-05, + "loss": 1.1088, + "step": 32905 + }, + { + "epoch": 9.85, + "grad_norm": 1.4427859783172607, + "learning_rate": 2.5615659833269516e-05, + "loss": 1.0645, + "step": 32910 + }, + { + "epoch": 9.85, + "grad_norm": 3.558783769607544, + "learning_rate": 2.5609786387452855e-05, + "loss": 1.1666, + "step": 32915 + }, + { + "epoch": 9.85, + "grad_norm": 1.977691888809204, + "learning_rate": 2.5603912907958323e-05, + "loss": 1.0334, + "step": 32920 + }, + { + "epoch": 9.85, + "grad_norm": 2.796060562133789, + "learning_rate": 2.5598039395110307e-05, + "loss": 0.9684, + "step": 32925 + }, + { + "epoch": 9.85, + "grad_norm": 4.863877773284912, + "learning_rate": 2.5592165849233196e-05, + "loss": 1.1839, + "step": 32930 + }, + { + "epoch": 9.85, + "grad_norm": 2.7297861576080322, + "learning_rate": 2.558629227065138e-05, + "loss": 1.1317, + "step": 32935 + }, + { + "epoch": 9.86, + "grad_norm": 3.373911142349243, + "learning_rate": 2.5580418659689255e-05, + "loss": 1.1497, + "step": 32940 + }, + { + "epoch": 9.86, + "grad_norm": 3.8792035579681396, + "learning_rate": 2.5574545016671204e-05, + "loss": 1.1907, + "step": 32945 + }, + { + "epoch": 9.86, + "grad_norm": 3.2369890213012695, + "learning_rate": 2.556867134192164e-05, + "loss": 1.1728, + "step": 32950 + }, + { + "epoch": 9.86, + "grad_norm": 4.953618049621582, + "learning_rate": 2.5562797635764936e-05, + "loss": 0.9538, + "step": 32955 + }, + { + "epoch": 9.86, + "grad_norm": 3.96720290184021, + "learning_rate": 2.555692389852551e-05, + "loss": 1.0353, + "step": 32960 + }, + { + "epoch": 9.86, + "grad_norm": 1.2469520568847656, + "learning_rate": 2.5551050130527753e-05, + "loss": 1.0343, + "step": 32965 + }, + { + "epoch": 9.86, + "grad_norm": 3.0623819828033447, + "learning_rate": 2.554517633209607e-05, + "loss": 0.9463, + "step": 32970 + }, + { + "epoch": 9.87, + "grad_norm": 2.923931837081909, + "learning_rate": 2.553930250355487e-05, + "loss": 1.115, + "step": 32975 + }, + { + "epoch": 9.87, + "grad_norm": 3.7751688957214355, + "learning_rate": 2.553342864522856e-05, + "loss": 1.0373, + "step": 32980 + }, + { + "epoch": 9.87, + "grad_norm": 3.002441167831421, + "learning_rate": 2.552755475744153e-05, + "loss": 1.2383, + "step": 32985 + }, + { + "epoch": 9.87, + "grad_norm": 2.0195484161376953, + "learning_rate": 2.55216808405182e-05, + "loss": 1.033, + "step": 32990 + }, + { + "epoch": 9.87, + "grad_norm": 3.1848971843719482, + "learning_rate": 2.551580689478298e-05, + "loss": 1.0186, + "step": 32995 + }, + { + "epoch": 9.87, + "grad_norm": 1.623724341392517, + "learning_rate": 2.5509932920560274e-05, + "loss": 1.0359, + "step": 33000 + }, + { + "epoch": 9.87, + "grad_norm": 3.2824020385742188, + "learning_rate": 2.5504058918174513e-05, + "loss": 1.2735, + "step": 33005 + }, + { + "epoch": 9.88, + "grad_norm": 2.9799790382385254, + "learning_rate": 2.54981848879501e-05, + "loss": 1.0281, + "step": 33010 + }, + { + "epoch": 9.88, + "grad_norm": 3.8928287029266357, + "learning_rate": 2.5492310830211456e-05, + "loss": 0.9883, + "step": 33015 + }, + { + "epoch": 9.88, + "grad_norm": 4.279067516326904, + "learning_rate": 2.5486436745282995e-05, + "loss": 1.197, + "step": 33020 + }, + { + "epoch": 9.88, + "grad_norm": 3.4032154083251953, + "learning_rate": 2.5480562633489135e-05, + "loss": 1.2078, + "step": 33025 + }, + { + "epoch": 9.88, + "grad_norm": 3.3113880157470703, + "learning_rate": 2.5474688495154298e-05, + "loss": 1.2098, + "step": 33030 + }, + { + "epoch": 9.88, + "grad_norm": 1.2559151649475098, + "learning_rate": 2.5468814330602913e-05, + "loss": 0.9934, + "step": 33035 + }, + { + "epoch": 9.89, + "grad_norm": 1.3851603269577026, + "learning_rate": 2.5462940140159398e-05, + "loss": 1.1007, + "step": 33040 + }, + { + "epoch": 9.89, + "grad_norm": 4.625222206115723, + "learning_rate": 2.5457065924148184e-05, + "loss": 1.074, + "step": 33045 + }, + { + "epoch": 9.89, + "grad_norm": 2.140420913696289, + "learning_rate": 2.545119168289369e-05, + "loss": 1.1087, + "step": 33050 + }, + { + "epoch": 9.89, + "grad_norm": 2.4748363494873047, + "learning_rate": 2.544531741672035e-05, + "loss": 1.0391, + "step": 33055 + }, + { + "epoch": 9.89, + "grad_norm": 1.9943400621414185, + "learning_rate": 2.543944312595259e-05, + "loss": 0.932, + "step": 33060 + }, + { + "epoch": 9.89, + "grad_norm": 4.314105987548828, + "learning_rate": 2.543356881091484e-05, + "loss": 1.0815, + "step": 33065 + }, + { + "epoch": 9.89, + "grad_norm": 3.688622236251831, + "learning_rate": 2.5427694471931546e-05, + "loss": 1.1081, + "step": 33070 + }, + { + "epoch": 9.9, + "grad_norm": 5.567243576049805, + "learning_rate": 2.542182010932712e-05, + "loss": 1.0279, + "step": 33075 + }, + { + "epoch": 9.9, + "grad_norm": 1.8669523000717163, + "learning_rate": 2.541594572342602e-05, + "loss": 1.0404, + "step": 33080 + }, + { + "epoch": 9.9, + "grad_norm": 4.838998317718506, + "learning_rate": 2.5410071314552664e-05, + "loss": 1.1254, + "step": 33085 + }, + { + "epoch": 9.9, + "grad_norm": 4.472231388092041, + "learning_rate": 2.540419688303149e-05, + "loss": 1.0976, + "step": 33090 + }, + { + "epoch": 9.9, + "grad_norm": 2.0864551067352295, + "learning_rate": 2.539832242918695e-05, + "loss": 1.0639, + "step": 33095 + }, + { + "epoch": 9.9, + "grad_norm": 3.6057910919189453, + "learning_rate": 2.539244795334347e-05, + "loss": 1.0898, + "step": 33100 + }, + { + "epoch": 9.9, + "grad_norm": 2.7485694885253906, + "learning_rate": 2.5386573455825503e-05, + "loss": 1.0214, + "step": 33105 + }, + { + "epoch": 9.91, + "grad_norm": 2.115241050720215, + "learning_rate": 2.5380698936957486e-05, + "loss": 1.1543, + "step": 33110 + }, + { + "epoch": 9.91, + "grad_norm": 3.4498496055603027, + "learning_rate": 2.5374824397063857e-05, + "loss": 1.2445, + "step": 33115 + }, + { + "epoch": 9.91, + "grad_norm": 1.5659513473510742, + "learning_rate": 2.536894983646907e-05, + "loss": 1.1021, + "step": 33120 + }, + { + "epoch": 9.91, + "grad_norm": 8.8037748336792, + "learning_rate": 2.5363075255497564e-05, + "loss": 0.9424, + "step": 33125 + }, + { + "epoch": 9.91, + "grad_norm": 4.802347183227539, + "learning_rate": 2.5357200654473788e-05, + "loss": 1.0744, + "step": 33130 + }, + { + "epoch": 9.91, + "grad_norm": 2.3840949535369873, + "learning_rate": 2.5351326033722194e-05, + "loss": 1.1233, + "step": 33135 + }, + { + "epoch": 9.92, + "grad_norm": 2.0759215354919434, + "learning_rate": 2.534545139356723e-05, + "loss": 0.9863, + "step": 33140 + }, + { + "epoch": 9.92, + "grad_norm": 1.2032520771026611, + "learning_rate": 2.533957673433334e-05, + "loss": 1.169, + "step": 33145 + }, + { + "epoch": 9.92, + "grad_norm": 4.248573303222656, + "learning_rate": 2.5333702056344984e-05, + "loss": 0.9997, + "step": 33150 + }, + { + "epoch": 9.92, + "grad_norm": 2.4295151233673096, + "learning_rate": 2.532782735992661e-05, + "loss": 1.0271, + "step": 33155 + }, + { + "epoch": 9.92, + "grad_norm": 2.852827310562134, + "learning_rate": 2.5321952645402668e-05, + "loss": 1.0106, + "step": 33160 + }, + { + "epoch": 9.92, + "grad_norm": 4.686607837677002, + "learning_rate": 2.5316077913097618e-05, + "loss": 1.045, + "step": 33165 + }, + { + "epoch": 9.92, + "grad_norm": 1.6220539808273315, + "learning_rate": 2.5310203163335916e-05, + "loss": 1.2063, + "step": 33170 + }, + { + "epoch": 9.93, + "grad_norm": 1.336083173751831, + "learning_rate": 2.530432839644202e-05, + "loss": 1.3131, + "step": 33175 + }, + { + "epoch": 9.93, + "grad_norm": 1.3632646799087524, + "learning_rate": 2.5298453612740375e-05, + "loss": 1.073, + "step": 33180 + }, + { + "epoch": 9.93, + "grad_norm": 1.0934840440750122, + "learning_rate": 2.529257881255545e-05, + "loss": 0.9939, + "step": 33185 + }, + { + "epoch": 9.93, + "grad_norm": 1.7837378978729248, + "learning_rate": 2.5286703996211697e-05, + "loss": 1.3673, + "step": 33190 + }, + { + "epoch": 9.93, + "grad_norm": 1.5650957822799683, + "learning_rate": 2.5280829164033588e-05, + "loss": 1.2912, + "step": 33195 + }, + { + "epoch": 9.93, + "grad_norm": 1.1075351238250732, + "learning_rate": 2.5274954316345568e-05, + "loss": 1.0113, + "step": 33200 + }, + { + "epoch": 9.93, + "grad_norm": 3.3911895751953125, + "learning_rate": 2.5269079453472117e-05, + "loss": 0.9913, + "step": 33205 + }, + { + "epoch": 9.94, + "grad_norm": 4.012703895568848, + "learning_rate": 2.5263204575737683e-05, + "loss": 1.0538, + "step": 33210 + }, + { + "epoch": 9.94, + "grad_norm": 3.5184414386749268, + "learning_rate": 2.525732968346673e-05, + "loss": 1.291, + "step": 33215 + }, + { + "epoch": 9.94, + "grad_norm": 3.051833152770996, + "learning_rate": 2.5251454776983724e-05, + "loss": 1.0442, + "step": 33220 + }, + { + "epoch": 9.94, + "grad_norm": 3.1589720249176025, + "learning_rate": 2.5245579856613137e-05, + "loss": 0.8625, + "step": 33225 + }, + { + "epoch": 9.94, + "grad_norm": 2.489769458770752, + "learning_rate": 2.523970492267943e-05, + "loss": 1.0129, + "step": 33230 + }, + { + "epoch": 9.94, + "grad_norm": 3.1582303047180176, + "learning_rate": 2.5233829975507068e-05, + "loss": 1.0015, + "step": 33235 + }, + { + "epoch": 9.95, + "grad_norm": 2.759131908416748, + "learning_rate": 2.522795501542052e-05, + "loss": 1.1334, + "step": 33240 + }, + { + "epoch": 9.95, + "grad_norm": 4.12851619720459, + "learning_rate": 2.522208004274425e-05, + "loss": 1.1993, + "step": 33245 + }, + { + "epoch": 9.95, + "grad_norm": 1.7142823934555054, + "learning_rate": 2.5216205057802732e-05, + "loss": 1.2478, + "step": 33250 + }, + { + "epoch": 9.95, + "grad_norm": 2.8924169540405273, + "learning_rate": 2.521033006092044e-05, + "loss": 1.0277, + "step": 33255 + }, + { + "epoch": 9.95, + "grad_norm": 5.144822597503662, + "learning_rate": 2.5204455052421828e-05, + "loss": 1.3498, + "step": 33260 + }, + { + "epoch": 9.95, + "grad_norm": 2.7403883934020996, + "learning_rate": 2.519858003263138e-05, + "loss": 0.9622, + "step": 33265 + }, + { + "epoch": 9.95, + "grad_norm": 3.224583864212036, + "learning_rate": 2.5192705001873566e-05, + "loss": 1.2568, + "step": 33270 + }, + { + "epoch": 9.96, + "grad_norm": 2.203730821609497, + "learning_rate": 2.518682996047285e-05, + "loss": 1.2091, + "step": 33275 + }, + { + "epoch": 9.96, + "grad_norm": 2.4827523231506348, + "learning_rate": 2.5180954908753716e-05, + "loss": 1.0987, + "step": 33280 + }, + { + "epoch": 9.96, + "grad_norm": 3.099918842315674, + "learning_rate": 2.5175079847040626e-05, + "loss": 1.0014, + "step": 33285 + }, + { + "epoch": 9.96, + "grad_norm": 1.661093831062317, + "learning_rate": 2.5169204775658055e-05, + "loss": 1.0222, + "step": 33290 + }, + { + "epoch": 9.96, + "grad_norm": 5.493398189544678, + "learning_rate": 2.5163329694930488e-05, + "loss": 1.0784, + "step": 33295 + }, + { + "epoch": 9.96, + "grad_norm": 1.5556894540786743, + "learning_rate": 2.5157454605182386e-05, + "loss": 1.1672, + "step": 33300 + }, + { + "epoch": 9.96, + "grad_norm": 3.597127676010132, + "learning_rate": 2.515157950673823e-05, + "loss": 1.0432, + "step": 33305 + }, + { + "epoch": 9.97, + "grad_norm": 1.7815759181976318, + "learning_rate": 2.51457043999225e-05, + "loss": 1.0406, + "step": 33310 + }, + { + "epoch": 9.97, + "grad_norm": 4.965347766876221, + "learning_rate": 2.5139829285059664e-05, + "loss": 1.0041, + "step": 33315 + }, + { + "epoch": 9.97, + "grad_norm": 1.7741583585739136, + "learning_rate": 2.5133954162474195e-05, + "loss": 1.0557, + "step": 33320 + }, + { + "epoch": 9.97, + "grad_norm": 2.6704812049865723, + "learning_rate": 2.512807903249058e-05, + "loss": 1.1279, + "step": 33325 + }, + { + "epoch": 9.97, + "grad_norm": 1.5926697254180908, + "learning_rate": 2.51222038954333e-05, + "loss": 1.0901, + "step": 33330 + }, + { + "epoch": 9.97, + "grad_norm": 1.6942665576934814, + "learning_rate": 2.5116328751626827e-05, + "loss": 1.034, + "step": 33335 + }, + { + "epoch": 9.97, + "grad_norm": 3.1640069484710693, + "learning_rate": 2.5110453601395633e-05, + "loss": 1.0341, + "step": 33340 + }, + { + "epoch": 9.98, + "grad_norm": 2.211376905441284, + "learning_rate": 2.5104578445064202e-05, + "loss": 1.0243, + "step": 33345 + }, + { + "epoch": 9.98, + "grad_norm": 2.296888589859009, + "learning_rate": 2.5098703282957013e-05, + "loss": 1.0835, + "step": 33350 + }, + { + "epoch": 9.98, + "grad_norm": 2.575103998184204, + "learning_rate": 2.5092828115398544e-05, + "loss": 0.963, + "step": 33355 + }, + { + "epoch": 9.98, + "grad_norm": 11.325212478637695, + "learning_rate": 2.508695294271327e-05, + "loss": 0.9614, + "step": 33360 + }, + { + "epoch": 9.98, + "grad_norm": 2.817223310470581, + "learning_rate": 2.5081077765225682e-05, + "loss": 1.0673, + "step": 33365 + }, + { + "epoch": 9.98, + "grad_norm": 2.326043128967285, + "learning_rate": 2.5075202583260256e-05, + "loss": 1.145, + "step": 33370 + }, + { + "epoch": 9.99, + "grad_norm": 2.059147357940674, + "learning_rate": 2.506932739714146e-05, + "loss": 1.3188, + "step": 33375 + }, + { + "epoch": 9.99, + "grad_norm": 1.470616102218628, + "learning_rate": 2.506345220719379e-05, + "loss": 1.1262, + "step": 33380 + }, + { + "epoch": 9.99, + "grad_norm": 2.974729537963867, + "learning_rate": 2.5057577013741716e-05, + "loss": 1.207, + "step": 33385 + }, + { + "epoch": 9.99, + "grad_norm": 1.9753620624542236, + "learning_rate": 2.505170181710973e-05, + "loss": 1.1227, + "step": 33390 + }, + { + "epoch": 9.99, + "grad_norm": 2.3012120723724365, + "learning_rate": 2.5045826617622298e-05, + "loss": 1.1889, + "step": 33395 + }, + { + "epoch": 9.99, + "grad_norm": 1.2801812887191772, + "learning_rate": 2.503995141560392e-05, + "loss": 1.1202, + "step": 33400 + }, + { + "epoch": 9.99, + "grad_norm": 4.384020805358887, + "learning_rate": 2.5034076211379053e-05, + "loss": 0.9629, + "step": 33405 + }, + { + "epoch": 10.0, + "grad_norm": 2.4980461597442627, + "learning_rate": 2.5028201005272206e-05, + "loss": 1.1078, + "step": 33410 + }, + { + "epoch": 10.0, + "grad_norm": 1.83892023563385, + "learning_rate": 2.5022325797607837e-05, + "loss": 1.1261, + "step": 33415 + }, + { + "epoch": 10.0, + "grad_norm": 2.998216390609741, + "learning_rate": 2.5016450588710443e-05, + "loss": 1.0531, + "step": 33420 + }, + { + "epoch": 10.0, + "grad_norm": 3.9695932865142822, + "learning_rate": 2.50105753789045e-05, + "loss": 0.8376, + "step": 33425 + }, + { + "epoch": 10.0, + "grad_norm": 2.823977470397949, + "learning_rate": 2.5004700168514482e-05, + "loss": 0.9155, + "step": 33430 + }, + { + "epoch": 10.0, + "grad_norm": 1.843510627746582, + "learning_rate": 2.4998824957864885e-05, + "loss": 1.1598, + "step": 33435 + }, + { + "epoch": 10.0, + "grad_norm": 1.946144461631775, + "learning_rate": 2.499294974728019e-05, + "loss": 0.9176, + "step": 33440 + }, + { + "epoch": 10.01, + "grad_norm": 2.370626211166382, + "learning_rate": 2.498707453708486e-05, + "loss": 0.9641, + "step": 33445 + }, + { + "epoch": 10.01, + "grad_norm": 1.2042300701141357, + "learning_rate": 2.4981199327603404e-05, + "loss": 1.1646, + "step": 33450 + }, + { + "epoch": 10.01, + "grad_norm": 2.661841869354248, + "learning_rate": 2.497532411916029e-05, + "loss": 1.1475, + "step": 33455 + }, + { + "epoch": 10.01, + "grad_norm": 2.6765615940093994, + "learning_rate": 2.4969448912079985e-05, + "loss": 0.9705, + "step": 33460 + }, + { + "epoch": 10.01, + "grad_norm": 2.9539201259613037, + "learning_rate": 2.4963573706686997e-05, + "loss": 0.7655, + "step": 33465 + }, + { + "epoch": 10.01, + "grad_norm": 6.744268894195557, + "learning_rate": 2.4957698503305786e-05, + "loss": 0.8524, + "step": 33470 + }, + { + "epoch": 10.02, + "grad_norm": 2.093243360519409, + "learning_rate": 2.495182330226085e-05, + "loss": 1.0871, + "step": 33475 + }, + { + "epoch": 10.02, + "grad_norm": 2.485497236251831, + "learning_rate": 2.494594810387666e-05, + "loss": 0.8595, + "step": 33480 + }, + { + "epoch": 10.02, + "grad_norm": 3.0233352184295654, + "learning_rate": 2.49400729084777e-05, + "loss": 0.8909, + "step": 33485 + }, + { + "epoch": 10.02, + "grad_norm": 2.09739351272583, + "learning_rate": 2.493419771638845e-05, + "loss": 1.0453, + "step": 33490 + }, + { + "epoch": 10.02, + "grad_norm": 2.0949864387512207, + "learning_rate": 2.4928322527933393e-05, + "loss": 0.9512, + "step": 33495 + }, + { + "epoch": 10.02, + "grad_norm": 2.8882389068603516, + "learning_rate": 2.4922447343437005e-05, + "loss": 1.0104, + "step": 33500 + }, + { + "epoch": 10.02, + "grad_norm": 1.7648423910140991, + "learning_rate": 2.4916572163223772e-05, + "loss": 0.9308, + "step": 33505 + }, + { + "epoch": 10.03, + "grad_norm": 16.295032501220703, + "learning_rate": 2.4910696987618174e-05, + "loss": 1.057, + "step": 33510 + }, + { + "epoch": 10.03, + "grad_norm": 4.960565090179443, + "learning_rate": 2.490482181694469e-05, + "loss": 1.0269, + "step": 33515 + }, + { + "epoch": 10.03, + "grad_norm": 1.606244683265686, + "learning_rate": 2.489894665152779e-05, + "loss": 1.1142, + "step": 33520 + }, + { + "epoch": 10.03, + "grad_norm": 2.571674108505249, + "learning_rate": 2.489307149169197e-05, + "loss": 1.1242, + "step": 33525 + }, + { + "epoch": 10.03, + "grad_norm": 1.8234943151474, + "learning_rate": 2.4887196337761693e-05, + "loss": 1.0145, + "step": 33530 + }, + { + "epoch": 10.03, + "grad_norm": 1.6958742141723633, + "learning_rate": 2.4881321190061453e-05, + "loss": 1.0754, + "step": 33535 + }, + { + "epoch": 10.03, + "grad_norm": 1.0637584924697876, + "learning_rate": 2.487544604891571e-05, + "loss": 1.028, + "step": 33540 + }, + { + "epoch": 10.04, + "grad_norm": 1.169573426246643, + "learning_rate": 2.4869570914648963e-05, + "loss": 1.0321, + "step": 33545 + }, + { + "epoch": 10.04, + "grad_norm": 1.4608122110366821, + "learning_rate": 2.4863695787585678e-05, + "loss": 1.1961, + "step": 33550 + }, + { + "epoch": 10.04, + "grad_norm": 2.158911943435669, + "learning_rate": 2.4857820668050324e-05, + "loss": 1.1969, + "step": 33555 + }, + { + "epoch": 10.04, + "grad_norm": 9.060139656066895, + "learning_rate": 2.4851945556367396e-05, + "loss": 0.7947, + "step": 33560 + }, + { + "epoch": 10.04, + "grad_norm": 4.058126449584961, + "learning_rate": 2.484607045286135e-05, + "loss": 0.8664, + "step": 33565 + }, + { + "epoch": 10.04, + "grad_norm": 3.4800760746002197, + "learning_rate": 2.4840195357856685e-05, + "loss": 1.0867, + "step": 33570 + }, + { + "epoch": 10.05, + "grad_norm": 2.5772833824157715, + "learning_rate": 2.483432027167785e-05, + "loss": 0.9432, + "step": 33575 + }, + { + "epoch": 10.05, + "grad_norm": 1.0311754941940308, + "learning_rate": 2.482844519464935e-05, + "loss": 1.0534, + "step": 33580 + }, + { + "epoch": 10.05, + "grad_norm": 3.5726497173309326, + "learning_rate": 2.4822570127095637e-05, + "loss": 0.9547, + "step": 33585 + }, + { + "epoch": 10.05, + "grad_norm": 3.7769217491149902, + "learning_rate": 2.4816695069341192e-05, + "loss": 1.01, + "step": 33590 + }, + { + "epoch": 10.05, + "grad_norm": 1.1994178295135498, + "learning_rate": 2.4810820021710486e-05, + "loss": 1.1168, + "step": 33595 + }, + { + "epoch": 10.05, + "grad_norm": 3.4825947284698486, + "learning_rate": 2.4804944984527995e-05, + "loss": 1.0242, + "step": 33600 + }, + { + "epoch": 10.05, + "grad_norm": 1.254137396812439, + "learning_rate": 2.4799069958118187e-05, + "loss": 0.9139, + "step": 33605 + }, + { + "epoch": 10.06, + "grad_norm": 5.904828071594238, + "learning_rate": 2.4793194942805545e-05, + "loss": 1.0005, + "step": 33610 + }, + { + "epoch": 10.06, + "grad_norm": 2.302828073501587, + "learning_rate": 2.478731993891452e-05, + "loss": 1.0739, + "step": 33615 + }, + { + "epoch": 10.06, + "grad_norm": 2.0300052165985107, + "learning_rate": 2.4781444946769603e-05, + "loss": 0.9196, + "step": 33620 + }, + { + "epoch": 10.06, + "grad_norm": 2.183711051940918, + "learning_rate": 2.4775569966695242e-05, + "loss": 1.069, + "step": 33625 + }, + { + "epoch": 10.06, + "grad_norm": 6.40975284576416, + "learning_rate": 2.476969499901593e-05, + "loss": 1.0779, + "step": 33630 + }, + { + "epoch": 10.06, + "grad_norm": 2.763566017150879, + "learning_rate": 2.4763820044056114e-05, + "loss": 1.191, + "step": 33635 + }, + { + "epoch": 10.06, + "grad_norm": 6.577224254608154, + "learning_rate": 2.4757945102140287e-05, + "loss": 0.9616, + "step": 33640 + }, + { + "epoch": 10.07, + "grad_norm": 2.686187267303467, + "learning_rate": 2.4752070173592895e-05, + "loss": 1.21, + "step": 33645 + }, + { + "epoch": 10.07, + "grad_norm": 1.8847886323928833, + "learning_rate": 2.47461952587384e-05, + "loss": 1.0332, + "step": 33650 + }, + { + "epoch": 10.07, + "grad_norm": 4.264788627624512, + "learning_rate": 2.4740320357901286e-05, + "loss": 0.9612, + "step": 33655 + }, + { + "epoch": 10.07, + "grad_norm": 3.13706111907959, + "learning_rate": 2.4734445471406e-05, + "loss": 1.1428, + "step": 33660 + }, + { + "epoch": 10.07, + "grad_norm": 3.5950639247894287, + "learning_rate": 2.472857059957703e-05, + "loss": 1.0232, + "step": 33665 + }, + { + "epoch": 10.07, + "grad_norm": 4.524314880371094, + "learning_rate": 2.4722695742738806e-05, + "loss": 0.8956, + "step": 33670 + }, + { + "epoch": 10.08, + "grad_norm": 2.5336546897888184, + "learning_rate": 2.471682090121582e-05, + "loss": 1.1582, + "step": 33675 + }, + { + "epoch": 10.08, + "grad_norm": 2.4728264808654785, + "learning_rate": 2.4710946075332515e-05, + "loss": 1.1319, + "step": 33680 + }, + { + "epoch": 10.08, + "grad_norm": 2.4756505489349365, + "learning_rate": 2.4705071265413355e-05, + "loss": 1.0484, + "step": 33685 + }, + { + "epoch": 10.08, + "grad_norm": 4.324848651885986, + "learning_rate": 2.469919647178281e-05, + "loss": 0.9865, + "step": 33690 + }, + { + "epoch": 10.08, + "grad_norm": 1.772365927696228, + "learning_rate": 2.4693321694765324e-05, + "loss": 1.1322, + "step": 33695 + }, + { + "epoch": 10.08, + "grad_norm": 4.808743476867676, + "learning_rate": 2.468744693468537e-05, + "loss": 0.9088, + "step": 33700 + }, + { + "epoch": 10.08, + "grad_norm": 2.5702743530273438, + "learning_rate": 2.46815721918674e-05, + "loss": 1.1035, + "step": 33705 + }, + { + "epoch": 10.09, + "grad_norm": 2.589160680770874, + "learning_rate": 2.4675697466635855e-05, + "loss": 1.2078, + "step": 33710 + }, + { + "epoch": 10.09, + "grad_norm": 1.3858650922775269, + "learning_rate": 2.466982275931521e-05, + "loss": 0.9329, + "step": 33715 + }, + { + "epoch": 10.09, + "grad_norm": 2.2143914699554443, + "learning_rate": 2.4663948070229905e-05, + "loss": 1.0403, + "step": 33720 + }, + { + "epoch": 10.09, + "grad_norm": 1.3419291973114014, + "learning_rate": 2.4658073399704405e-05, + "loss": 1.1443, + "step": 33725 + }, + { + "epoch": 10.09, + "grad_norm": 3.204826593399048, + "learning_rate": 2.4652198748063146e-05, + "loss": 1.0266, + "step": 33730 + }, + { + "epoch": 10.09, + "grad_norm": 2.098097085952759, + "learning_rate": 2.46463241156306e-05, + "loss": 1.1408, + "step": 33735 + }, + { + "epoch": 10.09, + "grad_norm": 3.6927952766418457, + "learning_rate": 2.4640449502731204e-05, + "loss": 1.0207, + "step": 33740 + }, + { + "epoch": 10.1, + "grad_norm": 4.6298828125, + "learning_rate": 2.46345749096894e-05, + "loss": 1.11, + "step": 33745 + }, + { + "epoch": 10.1, + "grad_norm": 1.5225332975387573, + "learning_rate": 2.4628700336829655e-05, + "loss": 1.0937, + "step": 33750 + }, + { + "epoch": 10.1, + "grad_norm": 2.734102964401245, + "learning_rate": 2.4622825784476392e-05, + "loss": 0.9719, + "step": 33755 + }, + { + "epoch": 10.1, + "grad_norm": 25.489742279052734, + "learning_rate": 2.4616951252954078e-05, + "loss": 1.1438, + "step": 33760 + }, + { + "epoch": 10.1, + "grad_norm": 1.8963426351547241, + "learning_rate": 2.4611076742587137e-05, + "loss": 1.106, + "step": 33765 + }, + { + "epoch": 10.1, + "grad_norm": 2.171485662460327, + "learning_rate": 2.4605202253700034e-05, + "loss": 1.0517, + "step": 33770 + }, + { + "epoch": 10.11, + "grad_norm": 3.205883502960205, + "learning_rate": 2.45993277866172e-05, + "loss": 1.1039, + "step": 33775 + }, + { + "epoch": 10.11, + "grad_norm": 3.306411027908325, + "learning_rate": 2.459345334166307e-05, + "loss": 1.0611, + "step": 33780 + }, + { + "epoch": 10.11, + "grad_norm": 2.3397672176361084, + "learning_rate": 2.4587578919162097e-05, + "loss": 1.1455, + "step": 33785 + }, + { + "epoch": 10.11, + "grad_norm": 3.6171796321868896, + "learning_rate": 2.45817045194387e-05, + "loss": 0.9492, + "step": 33790 + }, + { + "epoch": 10.11, + "grad_norm": 1.5914018154144287, + "learning_rate": 2.4575830142817342e-05, + "loss": 0.9019, + "step": 33795 + }, + { + "epoch": 10.11, + "grad_norm": 1.1069579124450684, + "learning_rate": 2.456995578962243e-05, + "loss": 1.1982, + "step": 33800 + }, + { + "epoch": 10.11, + "grad_norm": 4.241572856903076, + "learning_rate": 2.4564081460178427e-05, + "loss": 1.0414, + "step": 33805 + }, + { + "epoch": 10.12, + "grad_norm": 2.3619353771209717, + "learning_rate": 2.455820715480975e-05, + "loss": 1.16, + "step": 33810 + }, + { + "epoch": 10.12, + "grad_norm": 1.428951621055603, + "learning_rate": 2.4552332873840818e-05, + "loss": 1.1485, + "step": 33815 + }, + { + "epoch": 10.12, + "grad_norm": 1.82322359085083, + "learning_rate": 2.454645861759609e-05, + "loss": 1.0625, + "step": 33820 + }, + { + "epoch": 10.12, + "grad_norm": 2.700174570083618, + "learning_rate": 2.4540584386399974e-05, + "loss": 0.8932, + "step": 33825 + }, + { + "epoch": 10.12, + "grad_norm": 3.158874988555908, + "learning_rate": 2.4534710180576912e-05, + "loss": 1.1615, + "step": 33830 + }, + { + "epoch": 10.12, + "grad_norm": 6.1408820152282715, + "learning_rate": 2.4528836000451323e-05, + "loss": 1.0226, + "step": 33835 + }, + { + "epoch": 10.12, + "grad_norm": 1.1300007104873657, + "learning_rate": 2.452296184634762e-05, + "loss": 0.9967, + "step": 33840 + }, + { + "epoch": 10.13, + "grad_norm": 8.636886596679688, + "learning_rate": 2.4517087718590244e-05, + "loss": 0.9646, + "step": 33845 + }, + { + "epoch": 10.13, + "grad_norm": 4.390270233154297, + "learning_rate": 2.4511213617503616e-05, + "loss": 1.131, + "step": 33850 + }, + { + "epoch": 10.13, + "grad_norm": 4.6113409996032715, + "learning_rate": 2.4505339543412148e-05, + "loss": 1.1715, + "step": 33855 + }, + { + "epoch": 10.13, + "grad_norm": 2.2898902893066406, + "learning_rate": 2.449946549664026e-05, + "loss": 1.0189, + "step": 33860 + }, + { + "epoch": 10.13, + "grad_norm": 0.8719561100006104, + "learning_rate": 2.449359147751238e-05, + "loss": 1.1226, + "step": 33865 + }, + { + "epoch": 10.13, + "grad_norm": 3.1541287899017334, + "learning_rate": 2.4487717486352914e-05, + "loss": 0.9081, + "step": 33870 + }, + { + "epoch": 10.14, + "grad_norm": 4.781918048858643, + "learning_rate": 2.448184352348627e-05, + "loss": 1.2464, + "step": 33875 + }, + { + "epoch": 10.14, + "grad_norm": 3.1645240783691406, + "learning_rate": 2.4475969589236887e-05, + "loss": 0.847, + "step": 33880 + }, + { + "epoch": 10.14, + "grad_norm": 2.1200978755950928, + "learning_rate": 2.447009568392914e-05, + "loss": 1.1901, + "step": 33885 + }, + { + "epoch": 10.14, + "grad_norm": 3.1781980991363525, + "learning_rate": 2.446422180788747e-05, + "loss": 0.9578, + "step": 33890 + }, + { + "epoch": 10.14, + "grad_norm": 4.06983757019043, + "learning_rate": 2.4458347961436264e-05, + "loss": 0.9594, + "step": 33895 + }, + { + "epoch": 10.14, + "grad_norm": 13.415980339050293, + "learning_rate": 2.4452474144899947e-05, + "loss": 1.0795, + "step": 33900 + }, + { + "epoch": 10.14, + "grad_norm": 2.0610928535461426, + "learning_rate": 2.4446600358602915e-05, + "loss": 1.0463, + "step": 33905 + }, + { + "epoch": 10.15, + "grad_norm": 1.7884243726730347, + "learning_rate": 2.4440726602869557e-05, + "loss": 1.1359, + "step": 33910 + }, + { + "epoch": 10.15, + "grad_norm": 2.057011604309082, + "learning_rate": 2.44348528780243e-05, + "loss": 1.0827, + "step": 33915 + }, + { + "epoch": 10.15, + "grad_norm": 3.1838924884796143, + "learning_rate": 2.442897918439152e-05, + "loss": 1.107, + "step": 33920 + }, + { + "epoch": 10.15, + "grad_norm": 2.055171251296997, + "learning_rate": 2.4423105522295633e-05, + "loss": 0.9141, + "step": 33925 + }, + { + "epoch": 10.15, + "grad_norm": 7.081864833831787, + "learning_rate": 2.441723189206102e-05, + "loss": 0.9043, + "step": 33930 + }, + { + "epoch": 10.15, + "grad_norm": 1.6306078433990479, + "learning_rate": 2.44113582940121e-05, + "loss": 1.1316, + "step": 33935 + }, + { + "epoch": 10.15, + "grad_norm": 3.7581050395965576, + "learning_rate": 2.440548472847324e-05, + "loss": 1.058, + "step": 33940 + }, + { + "epoch": 10.16, + "grad_norm": 2.7319552898406982, + "learning_rate": 2.4399611195768836e-05, + "loss": 1.2098, + "step": 33945 + }, + { + "epoch": 10.16, + "grad_norm": 4.5412068367004395, + "learning_rate": 2.4393737696223286e-05, + "loss": 1.0749, + "step": 33950 + }, + { + "epoch": 10.16, + "grad_norm": 3.6076760292053223, + "learning_rate": 2.4387864230160972e-05, + "loss": 1.0315, + "step": 33955 + }, + { + "epoch": 10.16, + "grad_norm": 1.9118307828903198, + "learning_rate": 2.4381990797906284e-05, + "loss": 1.0247, + "step": 33960 + }, + { + "epoch": 10.16, + "grad_norm": 2.3119821548461914, + "learning_rate": 2.4376117399783605e-05, + "loss": 1.0456, + "step": 33965 + }, + { + "epoch": 10.16, + "grad_norm": 1.257812261581421, + "learning_rate": 2.43702440361173e-05, + "loss": 1.1916, + "step": 33970 + }, + { + "epoch": 10.16, + "grad_norm": 3.263524055480957, + "learning_rate": 2.4364370707231777e-05, + "loss": 1.1353, + "step": 33975 + }, + { + "epoch": 10.17, + "grad_norm": 2.8173470497131348, + "learning_rate": 2.4358497413451383e-05, + "loss": 1.1754, + "step": 33980 + }, + { + "epoch": 10.17, + "grad_norm": 1.8419231176376343, + "learning_rate": 2.4352624155100525e-05, + "loss": 0.9914, + "step": 33985 + }, + { + "epoch": 10.17, + "grad_norm": 4.419125080108643, + "learning_rate": 2.434675093250355e-05, + "loss": 1.0154, + "step": 33990 + }, + { + "epoch": 10.17, + "grad_norm": 4.144635200500488, + "learning_rate": 2.4340877745984854e-05, + "loss": 1.1304, + "step": 33995 + }, + { + "epoch": 10.17, + "grad_norm": 2.172372341156006, + "learning_rate": 2.4335004595868794e-05, + "loss": 1.0345, + "step": 34000 + }, + { + "epoch": 10.17, + "grad_norm": 3.2084264755249023, + "learning_rate": 2.4329131482479727e-05, + "loss": 1.1348, + "step": 34005 + }, + { + "epoch": 10.18, + "grad_norm": 1.3004094362258911, + "learning_rate": 2.4323258406142042e-05, + "loss": 1.0589, + "step": 34010 + }, + { + "epoch": 10.18, + "grad_norm": 2.1280858516693115, + "learning_rate": 2.4317385367180083e-05, + "loss": 1.16, + "step": 34015 + }, + { + "epoch": 10.18, + "grad_norm": 3.7424776554107666, + "learning_rate": 2.4311512365918232e-05, + "loss": 1.0512, + "step": 34020 + }, + { + "epoch": 10.18, + "grad_norm": 2.389583110809326, + "learning_rate": 2.4305639402680825e-05, + "loss": 1.2194, + "step": 34025 + }, + { + "epoch": 10.18, + "grad_norm": 9.110445022583008, + "learning_rate": 2.4299766477792242e-05, + "loss": 1.1583, + "step": 34030 + }, + { + "epoch": 10.18, + "grad_norm": 2.022211790084839, + "learning_rate": 2.4293893591576825e-05, + "loss": 1.0861, + "step": 34035 + }, + { + "epoch": 10.18, + "grad_norm": 3.275707483291626, + "learning_rate": 2.428802074435893e-05, + "loss": 0.866, + "step": 34040 + }, + { + "epoch": 10.19, + "grad_norm": 1.1770823001861572, + "learning_rate": 2.428214793646291e-05, + "loss": 1.2091, + "step": 34045 + }, + { + "epoch": 10.19, + "grad_norm": 7.510256767272949, + "learning_rate": 2.4276275168213105e-05, + "loss": 1.1663, + "step": 34050 + }, + { + "epoch": 10.19, + "grad_norm": 2.775092840194702, + "learning_rate": 2.4270402439933886e-05, + "loss": 1.151, + "step": 34055 + }, + { + "epoch": 10.19, + "grad_norm": 2.5497207641601562, + "learning_rate": 2.4264529751949576e-05, + "loss": 1.1476, + "step": 34060 + }, + { + "epoch": 10.19, + "grad_norm": 1.3202245235443115, + "learning_rate": 2.4258657104584518e-05, + "loss": 1.0267, + "step": 34065 + }, + { + "epoch": 10.19, + "grad_norm": 1.2881629467010498, + "learning_rate": 2.4252784498163064e-05, + "loss": 1.0565, + "step": 34070 + }, + { + "epoch": 10.19, + "grad_norm": 2.9040749073028564, + "learning_rate": 2.4246911933009536e-05, + "loss": 1.1041, + "step": 34075 + }, + { + "epoch": 10.2, + "grad_norm": 1.2760438919067383, + "learning_rate": 2.424103940944829e-05, + "loss": 1.2469, + "step": 34080 + }, + { + "epoch": 10.2, + "grad_norm": 1.911608099937439, + "learning_rate": 2.423516692780364e-05, + "loss": 0.9625, + "step": 34085 + }, + { + "epoch": 10.2, + "grad_norm": 1.7194567918777466, + "learning_rate": 2.4229294488399935e-05, + "loss": 1.1776, + "step": 34090 + }, + { + "epoch": 10.2, + "grad_norm": 4.4640960693359375, + "learning_rate": 2.4223422091561493e-05, + "loss": 0.9402, + "step": 34095 + }, + { + "epoch": 10.2, + "grad_norm": 5.374719142913818, + "learning_rate": 2.4217549737612632e-05, + "loss": 1.0412, + "step": 34100 + }, + { + "epoch": 10.2, + "grad_norm": 5.335212230682373, + "learning_rate": 2.4211677426877698e-05, + "loss": 0.9229, + "step": 34105 + }, + { + "epoch": 10.21, + "grad_norm": 1.18242609500885, + "learning_rate": 2.4205805159680986e-05, + "loss": 0.9371, + "step": 34110 + }, + { + "epoch": 10.21, + "grad_norm": 2.225745916366577, + "learning_rate": 2.419993293634684e-05, + "loss": 1.1263, + "step": 34115 + }, + { + "epoch": 10.21, + "grad_norm": 4.692055702209473, + "learning_rate": 2.4194060757199557e-05, + "loss": 1.1487, + "step": 34120 + }, + { + "epoch": 10.21, + "grad_norm": 3.8176944255828857, + "learning_rate": 2.4188188622563474e-05, + "loss": 1.1546, + "step": 34125 + }, + { + "epoch": 10.21, + "grad_norm": 2.7893073558807373, + "learning_rate": 2.4182316532762884e-05, + "loss": 0.9161, + "step": 34130 + }, + { + "epoch": 10.21, + "grad_norm": 5.134355545043945, + "learning_rate": 2.417644448812209e-05, + "loss": 1.1189, + "step": 34135 + }, + { + "epoch": 10.21, + "grad_norm": 1.823405385017395, + "learning_rate": 2.4170572488965427e-05, + "loss": 1.0416, + "step": 34140 + }, + { + "epoch": 10.22, + "grad_norm": 2.36574125289917, + "learning_rate": 2.4164700535617174e-05, + "loss": 0.9979, + "step": 34145 + }, + { + "epoch": 10.22, + "grad_norm": 1.0665093660354614, + "learning_rate": 2.415882862840165e-05, + "loss": 0.937, + "step": 34150 + }, + { + "epoch": 10.22, + "grad_norm": 3.2020561695098877, + "learning_rate": 2.4152956767643138e-05, + "loss": 1.0047, + "step": 34155 + }, + { + "epoch": 10.22, + "grad_norm": 3.9763476848602295, + "learning_rate": 2.4147084953665953e-05, + "loss": 0.9047, + "step": 34160 + }, + { + "epoch": 10.22, + "grad_norm": 5.038437843322754, + "learning_rate": 2.4141213186794378e-05, + "loss": 0.9305, + "step": 34165 + }, + { + "epoch": 10.22, + "grad_norm": 2.591571092605591, + "learning_rate": 2.4135341467352697e-05, + "loss": 1.1079, + "step": 34170 + }, + { + "epoch": 10.22, + "grad_norm": 2.8258914947509766, + "learning_rate": 2.4129469795665215e-05, + "loss": 0.9622, + "step": 34175 + }, + { + "epoch": 10.23, + "grad_norm": 1.6246439218521118, + "learning_rate": 2.4123598172056205e-05, + "loss": 1.0708, + "step": 34180 + }, + { + "epoch": 10.23, + "grad_norm": 3.791229724884033, + "learning_rate": 2.4117726596849964e-05, + "loss": 1.185, + "step": 34185 + }, + { + "epoch": 10.23, + "grad_norm": 2.0742027759552, + "learning_rate": 2.411185507037077e-05, + "loss": 1.1087, + "step": 34190 + }, + { + "epoch": 10.23, + "grad_norm": 2.5888705253601074, + "learning_rate": 2.4105983592942886e-05, + "loss": 1.1562, + "step": 34195 + }, + { + "epoch": 10.23, + "grad_norm": 1.4645191431045532, + "learning_rate": 2.410011216489061e-05, + "loss": 1.0233, + "step": 34200 + }, + { + "epoch": 10.23, + "grad_norm": 5.13542366027832, + "learning_rate": 2.409424078653819e-05, + "loss": 1.0835, + "step": 34205 + }, + { + "epoch": 10.24, + "grad_norm": 1.8826115131378174, + "learning_rate": 2.4088369458209916e-05, + "loss": 1.0035, + "step": 34210 + }, + { + "epoch": 10.24, + "grad_norm": 2.7958590984344482, + "learning_rate": 2.408249818023005e-05, + "loss": 1.0768, + "step": 34215 + }, + { + "epoch": 10.24, + "grad_norm": 1.63248872756958, + "learning_rate": 2.4076626952922857e-05, + "loss": 1.0479, + "step": 34220 + }, + { + "epoch": 10.24, + "grad_norm": 3.1579835414886475, + "learning_rate": 2.4070755776612604e-05, + "loss": 1.1089, + "step": 34225 + }, + { + "epoch": 10.24, + "grad_norm": 1.6841520071029663, + "learning_rate": 2.4064884651623527e-05, + "loss": 1.3328, + "step": 34230 + }, + { + "epoch": 10.24, + "grad_norm": 1.0290015935897827, + "learning_rate": 2.4059013578279917e-05, + "loss": 1.1065, + "step": 34235 + }, + { + "epoch": 10.24, + "grad_norm": 2.263608932495117, + "learning_rate": 2.4053142556905992e-05, + "loss": 1.0712, + "step": 34240 + }, + { + "epoch": 10.25, + "grad_norm": 3.4659013748168945, + "learning_rate": 2.4047271587826032e-05, + "loss": 0.9181, + "step": 34245 + }, + { + "epoch": 10.25, + "grad_norm": 3.7479326725006104, + "learning_rate": 2.404140067136426e-05, + "loss": 0.7427, + "step": 34250 + }, + { + "epoch": 10.25, + "grad_norm": 1.718619704246521, + "learning_rate": 2.4035529807844944e-05, + "loss": 1.0278, + "step": 34255 + }, + { + "epoch": 10.25, + "grad_norm": 1.7770065069198608, + "learning_rate": 2.4029658997592315e-05, + "loss": 1.1875, + "step": 34260 + }, + { + "epoch": 10.25, + "grad_norm": 5.215729236602783, + "learning_rate": 2.4023788240930603e-05, + "loss": 1.0645, + "step": 34265 + }, + { + "epoch": 10.25, + "grad_norm": 3.1575255393981934, + "learning_rate": 2.401791753818406e-05, + "loss": 1.1738, + "step": 34270 + }, + { + "epoch": 10.25, + "grad_norm": 1.1542024612426758, + "learning_rate": 2.4012046889676898e-05, + "loss": 1.002, + "step": 34275 + }, + { + "epoch": 10.26, + "grad_norm": 2.203456163406372, + "learning_rate": 2.400617629573337e-05, + "loss": 0.8584, + "step": 34280 + }, + { + "epoch": 10.26, + "grad_norm": 3.4563515186309814, + "learning_rate": 2.4000305756677685e-05, + "loss": 1.1988, + "step": 34285 + }, + { + "epoch": 10.26, + "grad_norm": 5.644460678100586, + "learning_rate": 2.399443527283408e-05, + "loss": 1.0035, + "step": 34290 + }, + { + "epoch": 10.26, + "grad_norm": 3.3462517261505127, + "learning_rate": 2.398856484452677e-05, + "loss": 1.2118, + "step": 34295 + }, + { + "epoch": 10.26, + "grad_norm": 2.617570161819458, + "learning_rate": 2.398269447207997e-05, + "loss": 1.1375, + "step": 34300 + }, + { + "epoch": 10.26, + "grad_norm": 3.606656789779663, + "learning_rate": 2.3976824155817894e-05, + "loss": 1.0468, + "step": 34305 + }, + { + "epoch": 10.27, + "grad_norm": 2.8307979106903076, + "learning_rate": 2.397095389606476e-05, + "loss": 1.089, + "step": 34310 + }, + { + "epoch": 10.27, + "grad_norm": 1.1224695444107056, + "learning_rate": 2.3965083693144773e-05, + "loss": 1.0908, + "step": 34315 + }, + { + "epoch": 10.27, + "grad_norm": 3.3736870288848877, + "learning_rate": 2.3959213547382138e-05, + "loss": 1.0837, + "step": 34320 + }, + { + "epoch": 10.27, + "grad_norm": 1.475428581237793, + "learning_rate": 2.395334345910105e-05, + "loss": 1.1278, + "step": 34325 + }, + { + "epoch": 10.27, + "grad_norm": 2.2437570095062256, + "learning_rate": 2.394747342862573e-05, + "loss": 1.1207, + "step": 34330 + }, + { + "epoch": 10.27, + "grad_norm": 1.757214903831482, + "learning_rate": 2.394160345628034e-05, + "loss": 1.0901, + "step": 34335 + }, + { + "epoch": 10.27, + "grad_norm": 1.8947856426239014, + "learning_rate": 2.3935733542389103e-05, + "loss": 1.164, + "step": 34340 + }, + { + "epoch": 10.28, + "grad_norm": 2.8285508155822754, + "learning_rate": 2.392986368727619e-05, + "loss": 0.9759, + "step": 34345 + }, + { + "epoch": 10.28, + "grad_norm": 1.154876470565796, + "learning_rate": 2.39239938912658e-05, + "loss": 1.1223, + "step": 34350 + }, + { + "epoch": 10.28, + "grad_norm": 2.4530434608459473, + "learning_rate": 2.391812415468211e-05, + "loss": 1.0409, + "step": 34355 + }, + { + "epoch": 10.28, + "grad_norm": 1.6551940441131592, + "learning_rate": 2.3912254477849286e-05, + "loss": 1.067, + "step": 34360 + }, + { + "epoch": 10.28, + "grad_norm": 2.834213972091675, + "learning_rate": 2.390638486109153e-05, + "loss": 0.9237, + "step": 34365 + }, + { + "epoch": 10.28, + "grad_norm": 3.8012025356292725, + "learning_rate": 2.390051530473299e-05, + "loss": 1.1377, + "step": 34370 + }, + { + "epoch": 10.28, + "grad_norm": 2.4281139373779297, + "learning_rate": 2.3894645809097858e-05, + "loss": 0.9931, + "step": 34375 + }, + { + "epoch": 10.29, + "grad_norm": 2.0580687522888184, + "learning_rate": 2.3888776374510273e-05, + "loss": 1.0118, + "step": 34380 + }, + { + "epoch": 10.29, + "grad_norm": 1.2598625421524048, + "learning_rate": 2.3882907001294433e-05, + "loss": 0.9545, + "step": 34385 + }, + { + "epoch": 10.29, + "grad_norm": 3.062026023864746, + "learning_rate": 2.3877037689774467e-05, + "loss": 1.0679, + "step": 34390 + }, + { + "epoch": 10.29, + "grad_norm": 3.9732203483581543, + "learning_rate": 2.3871168440274545e-05, + "loss": 1.1264, + "step": 34395 + }, + { + "epoch": 10.29, + "grad_norm": 8.932393074035645, + "learning_rate": 2.3865299253118816e-05, + "loss": 0.9523, + "step": 34400 + }, + { + "epoch": 10.29, + "grad_norm": 2.8433444499969482, + "learning_rate": 2.385943012863143e-05, + "loss": 0.9684, + "step": 34405 + }, + { + "epoch": 10.3, + "grad_norm": 3.345172643661499, + "learning_rate": 2.385356106713653e-05, + "loss": 0.9367, + "step": 34410 + }, + { + "epoch": 10.3, + "grad_norm": 16.16388702392578, + "learning_rate": 2.384769206895827e-05, + "loss": 0.9319, + "step": 34415 + }, + { + "epoch": 10.3, + "grad_norm": 1.3967313766479492, + "learning_rate": 2.3841823134420767e-05, + "loss": 1.1586, + "step": 34420 + }, + { + "epoch": 10.3, + "grad_norm": 3.0620460510253906, + "learning_rate": 2.3835954263848176e-05, + "loss": 1.1258, + "step": 34425 + }, + { + "epoch": 10.3, + "grad_norm": 2.4952540397644043, + "learning_rate": 2.3830085457564613e-05, + "loss": 1.0679, + "step": 34430 + }, + { + "epoch": 10.3, + "grad_norm": 2.0433554649353027, + "learning_rate": 2.3824216715894224e-05, + "loss": 1.2555, + "step": 34435 + }, + { + "epoch": 10.3, + "grad_norm": 3.4012880325317383, + "learning_rate": 2.3818348039161115e-05, + "loss": 0.8585, + "step": 34440 + }, + { + "epoch": 10.31, + "grad_norm": 1.1092032194137573, + "learning_rate": 2.3812479427689424e-05, + "loss": 1.1172, + "step": 34445 + }, + { + "epoch": 10.31, + "grad_norm": 1.4429662227630615, + "learning_rate": 2.380661088180326e-05, + "loss": 1.2767, + "step": 34450 + }, + { + "epoch": 10.31, + "grad_norm": 8.600884437561035, + "learning_rate": 2.3800742401826727e-05, + "loss": 1.0851, + "step": 34455 + }, + { + "epoch": 10.31, + "grad_norm": 2.5165319442749023, + "learning_rate": 2.3794873988083954e-05, + "loss": 0.9119, + "step": 34460 + }, + { + "epoch": 10.31, + "grad_norm": 4.288184642791748, + "learning_rate": 2.378900564089903e-05, + "loss": 1.1302, + "step": 34465 + }, + { + "epoch": 10.31, + "grad_norm": 2.89193058013916, + "learning_rate": 2.3783137360596075e-05, + "loss": 1.1722, + "step": 34470 + }, + { + "epoch": 10.31, + "grad_norm": 4.590530872344971, + "learning_rate": 2.3777269147499165e-05, + "loss": 0.8362, + "step": 34475 + }, + { + "epoch": 10.32, + "grad_norm": 4.36327600479126, + "learning_rate": 2.377140100193242e-05, + "loss": 1.0983, + "step": 34480 + }, + { + "epoch": 10.32, + "grad_norm": 2.2678892612457275, + "learning_rate": 2.376553292421992e-05, + "loss": 1.1464, + "step": 34485 + }, + { + "epoch": 10.32, + "grad_norm": 3.2285358905792236, + "learning_rate": 2.3759664914685754e-05, + "loss": 1.1098, + "step": 34490 + }, + { + "epoch": 10.32, + "grad_norm": 4.26018762588501, + "learning_rate": 2.3753796973654e-05, + "loss": 1.0358, + "step": 34495 + }, + { + "epoch": 10.32, + "grad_norm": 1.6850954294204712, + "learning_rate": 2.374792910144874e-05, + "loss": 1.0134, + "step": 34500 + }, + { + "epoch": 10.32, + "grad_norm": 1.3236955404281616, + "learning_rate": 2.3742061298394065e-05, + "loss": 1.0113, + "step": 34505 + }, + { + "epoch": 10.32, + "grad_norm": 5.23323917388916, + "learning_rate": 2.373619356481403e-05, + "loss": 1.097, + "step": 34510 + }, + { + "epoch": 10.33, + "grad_norm": 1.9136748313903809, + "learning_rate": 2.3730325901032718e-05, + "loss": 1.0261, + "step": 34515 + }, + { + "epoch": 10.33, + "grad_norm": 4.667322635650635, + "learning_rate": 2.3724458307374187e-05, + "loss": 1.0971, + "step": 34520 + }, + { + "epoch": 10.33, + "grad_norm": 1.810530185699463, + "learning_rate": 2.3718590784162485e-05, + "loss": 1.0513, + "step": 34525 + }, + { + "epoch": 10.33, + "grad_norm": 5.116344451904297, + "learning_rate": 2.3712723331721698e-05, + "loss": 1.0967, + "step": 34530 + }, + { + "epoch": 10.33, + "grad_norm": 2.3734960556030273, + "learning_rate": 2.370685595037585e-05, + "loss": 1.1215, + "step": 34535 + }, + { + "epoch": 10.33, + "grad_norm": 1.920328974723816, + "learning_rate": 2.3700988640449014e-05, + "loss": 1.1294, + "step": 34540 + }, + { + "epoch": 10.34, + "grad_norm": 3.0604352951049805, + "learning_rate": 2.3695121402265224e-05, + "loss": 1.0668, + "step": 34545 + }, + { + "epoch": 10.34, + "grad_norm": 3.079564094543457, + "learning_rate": 2.3689254236148514e-05, + "loss": 0.9018, + "step": 34550 + }, + { + "epoch": 10.34, + "grad_norm": 2.8750357627868652, + "learning_rate": 2.368338714242294e-05, + "loss": 1.0537, + "step": 34555 + }, + { + "epoch": 10.34, + "grad_norm": 4.040964603424072, + "learning_rate": 2.3677520121412516e-05, + "loss": 0.8928, + "step": 34560 + }, + { + "epoch": 10.34, + "grad_norm": 4.141225814819336, + "learning_rate": 2.3671653173441292e-05, + "loss": 1.1755, + "step": 34565 + }, + { + "epoch": 10.34, + "grad_norm": 4.6155195236206055, + "learning_rate": 2.3665786298833266e-05, + "loss": 1.0279, + "step": 34570 + }, + { + "epoch": 10.34, + "grad_norm": 4.095025539398193, + "learning_rate": 2.3659919497912488e-05, + "loss": 1.0721, + "step": 34575 + }, + { + "epoch": 10.35, + "grad_norm": 2.4246129989624023, + "learning_rate": 2.3654052771002965e-05, + "loss": 1.0305, + "step": 34580 + }, + { + "epoch": 10.35, + "grad_norm": 1.700454592704773, + "learning_rate": 2.36481861184287e-05, + "loss": 1.3431, + "step": 34585 + }, + { + "epoch": 10.35, + "grad_norm": 2.0215277671813965, + "learning_rate": 2.364231954051372e-05, + "loss": 1.0706, + "step": 34590 + }, + { + "epoch": 10.35, + "grad_norm": 3.362088203430176, + "learning_rate": 2.363645303758201e-05, + "loss": 1.0435, + "step": 34595 + }, + { + "epoch": 10.35, + "grad_norm": 1.618765115737915, + "learning_rate": 2.3630586609957592e-05, + "loss": 1.0702, + "step": 34600 + }, + { + "epoch": 10.35, + "grad_norm": 2.0355236530303955, + "learning_rate": 2.3624720257964442e-05, + "loss": 1.0517, + "step": 34605 + }, + { + "epoch": 10.35, + "grad_norm": 4.993839263916016, + "learning_rate": 2.3618853981926573e-05, + "loss": 0.96, + "step": 34610 + }, + { + "epoch": 10.36, + "grad_norm": 2.558202028274536, + "learning_rate": 2.3612987782167964e-05, + "loss": 1.053, + "step": 34615 + }, + { + "epoch": 10.36, + "grad_norm": 1.7264127731323242, + "learning_rate": 2.3607121659012586e-05, + "loss": 1.0745, + "step": 34620 + }, + { + "epoch": 10.36, + "grad_norm": 6.849349021911621, + "learning_rate": 2.360125561278444e-05, + "loss": 1.0707, + "step": 34625 + }, + { + "epoch": 10.36, + "grad_norm": 2.509521245956421, + "learning_rate": 2.3595389643807488e-05, + "loss": 1.0879, + "step": 34630 + }, + { + "epoch": 10.36, + "grad_norm": 1.6693092584609985, + "learning_rate": 2.358952375240571e-05, + "loss": 0.9827, + "step": 34635 + }, + { + "epoch": 10.36, + "grad_norm": 10.258462905883789, + "learning_rate": 2.3583657938903057e-05, + "loss": 1.1256, + "step": 34640 + }, + { + "epoch": 10.37, + "grad_norm": 3.046783685684204, + "learning_rate": 2.3577792203623523e-05, + "loss": 1.0576, + "step": 34645 + }, + { + "epoch": 10.37, + "grad_norm": 4.163492202758789, + "learning_rate": 2.3571926546891042e-05, + "loss": 0.9902, + "step": 34650 + }, + { + "epoch": 10.37, + "grad_norm": 1.5861027240753174, + "learning_rate": 2.3566060969029563e-05, + "loss": 1.195, + "step": 34655 + }, + { + "epoch": 10.37, + "grad_norm": 2.5667498111724854, + "learning_rate": 2.356019547036305e-05, + "loss": 1.0898, + "step": 34660 + }, + { + "epoch": 10.37, + "grad_norm": 1.8456099033355713, + "learning_rate": 2.355433005121545e-05, + "loss": 0.9962, + "step": 34665 + }, + { + "epoch": 10.37, + "grad_norm": 3.5785045623779297, + "learning_rate": 2.3548464711910692e-05, + "loss": 1.1786, + "step": 34670 + }, + { + "epoch": 10.37, + "grad_norm": 2.0253660678863525, + "learning_rate": 2.354259945277273e-05, + "loss": 1.1772, + "step": 34675 + }, + { + "epoch": 10.38, + "grad_norm": 1.426957368850708, + "learning_rate": 2.353673427412547e-05, + "loss": 0.9349, + "step": 34680 + }, + { + "epoch": 10.38, + "grad_norm": 2.7187154293060303, + "learning_rate": 2.353086917629287e-05, + "loss": 1.0479, + "step": 34685 + }, + { + "epoch": 10.38, + "grad_norm": 7.200928211212158, + "learning_rate": 2.3525004159598822e-05, + "loss": 0.9289, + "step": 34690 + }, + { + "epoch": 10.38, + "grad_norm": 4.092981338500977, + "learning_rate": 2.3519139224367278e-05, + "loss": 0.9506, + "step": 34695 + }, + { + "epoch": 10.38, + "grad_norm": 2.0632410049438477, + "learning_rate": 2.351327437092212e-05, + "loss": 1.054, + "step": 34700 + }, + { + "epoch": 10.38, + "grad_norm": 2.467764377593994, + "learning_rate": 2.3507409599587287e-05, + "loss": 1.2235, + "step": 34705 + }, + { + "epoch": 10.38, + "grad_norm": 2.497389793395996, + "learning_rate": 2.350154491068667e-05, + "loss": 0.9353, + "step": 34710 + }, + { + "epoch": 10.39, + "grad_norm": 1.8514834642410278, + "learning_rate": 2.349568030454416e-05, + "loss": 1.1288, + "step": 34715 + }, + { + "epoch": 10.39, + "grad_norm": 1.1936990022659302, + "learning_rate": 2.348981578148367e-05, + "loss": 1.1242, + "step": 34720 + }, + { + "epoch": 10.39, + "grad_norm": 2.6927096843719482, + "learning_rate": 2.3483951341829077e-05, + "loss": 1.1503, + "step": 34725 + }, + { + "epoch": 10.39, + "grad_norm": 5.187263011932373, + "learning_rate": 2.3478086985904287e-05, + "loss": 1.0652, + "step": 34730 + }, + { + "epoch": 10.39, + "grad_norm": 3.8386361598968506, + "learning_rate": 2.3472222714033157e-05, + "loss": 1.0557, + "step": 34735 + }, + { + "epoch": 10.39, + "grad_norm": 1.5286484956741333, + "learning_rate": 2.346635852653959e-05, + "loss": 1.0264, + "step": 34740 + }, + { + "epoch": 10.4, + "grad_norm": 2.502507209777832, + "learning_rate": 2.3460494423747443e-05, + "loss": 1.0286, + "step": 34745 + }, + { + "epoch": 10.4, + "grad_norm": 2.476358652114868, + "learning_rate": 2.345463040598059e-05, + "loss": 0.9376, + "step": 34750 + }, + { + "epoch": 10.4, + "grad_norm": 2.4512181282043457, + "learning_rate": 2.3448766473562892e-05, + "loss": 1.0785, + "step": 34755 + }, + { + "epoch": 10.4, + "grad_norm": 3.614104986190796, + "learning_rate": 2.344290262681821e-05, + "loss": 1.0775, + "step": 34760 + }, + { + "epoch": 10.4, + "grad_norm": 3.8133108615875244, + "learning_rate": 2.3437038866070396e-05, + "loss": 0.9405, + "step": 34765 + }, + { + "epoch": 10.4, + "grad_norm": 3.1824207305908203, + "learning_rate": 2.3431175191643307e-05, + "loss": 0.924, + "step": 34770 + }, + { + "epoch": 10.4, + "grad_norm": 4.146546840667725, + "learning_rate": 2.342531160386077e-05, + "loss": 1.0573, + "step": 34775 + }, + { + "epoch": 10.41, + "grad_norm": 3.6027698516845703, + "learning_rate": 2.341944810304665e-05, + "loss": 1.087, + "step": 34780 + }, + { + "epoch": 10.41, + "grad_norm": 4.211222171783447, + "learning_rate": 2.3413584689524753e-05, + "loss": 1.0654, + "step": 34785 + }, + { + "epoch": 10.41, + "grad_norm": 3.0907680988311768, + "learning_rate": 2.3407721363618935e-05, + "loss": 1.1095, + "step": 34790 + }, + { + "epoch": 10.41, + "grad_norm": 3.797797679901123, + "learning_rate": 2.3401858125653006e-05, + "loss": 0.9867, + "step": 34795 + }, + { + "epoch": 10.41, + "grad_norm": 3.5584876537323, + "learning_rate": 2.33959949759508e-05, + "loss": 1.0036, + "step": 34800 + }, + { + "epoch": 10.41, + "grad_norm": 2.268932580947876, + "learning_rate": 2.3390131914836122e-05, + "loss": 1.1396, + "step": 34805 + }, + { + "epoch": 10.41, + "grad_norm": 1.9871071577072144, + "learning_rate": 2.3384268942632777e-05, + "loss": 1.049, + "step": 34810 + }, + { + "epoch": 10.42, + "grad_norm": 5.160571098327637, + "learning_rate": 2.3378406059664587e-05, + "loss": 0.8987, + "step": 34815 + }, + { + "epoch": 10.42, + "grad_norm": 3.7017822265625, + "learning_rate": 2.3372543266255335e-05, + "loss": 1.0438, + "step": 34820 + }, + { + "epoch": 10.42, + "grad_norm": 2.0446062088012695, + "learning_rate": 2.336668056272884e-05, + "loss": 1.1254, + "step": 34825 + }, + { + "epoch": 10.42, + "grad_norm": 3.492748260498047, + "learning_rate": 2.3360817949408864e-05, + "loss": 0.9717, + "step": 34830 + }, + { + "epoch": 10.42, + "grad_norm": 4.558749198913574, + "learning_rate": 2.3354955426619222e-05, + "loss": 1.0145, + "step": 34835 + }, + { + "epoch": 10.42, + "grad_norm": 2.7081844806671143, + "learning_rate": 2.3349092994683676e-05, + "loss": 1.0516, + "step": 34840 + }, + { + "epoch": 10.43, + "grad_norm": NaN, + "learning_rate": 2.334440311476777e-05, + "loss": 0.975, + "step": 34845 + }, + { + "epoch": 10.43, + "grad_norm": 3.3851234912872314, + "learning_rate": 2.3338540847185518e-05, + "loss": 1.0871, + "step": 34850 + }, + { + "epoch": 10.43, + "grad_norm": 1.13603675365448, + "learning_rate": 2.3332678671363932e-05, + "loss": 1.0211, + "step": 34855 + }, + { + "epoch": 10.43, + "grad_norm": 1.5273590087890625, + "learning_rate": 2.3326816587626775e-05, + "loss": 1.0605, + "step": 34860 + }, + { + "epoch": 10.43, + "grad_norm": 2.7484490871429443, + "learning_rate": 2.3320954596297788e-05, + "loss": 1.193, + "step": 34865 + }, + { + "epoch": 10.43, + "grad_norm": 6.367726802825928, + "learning_rate": 2.3315092697700742e-05, + "loss": 1.1871, + "step": 34870 + }, + { + "epoch": 10.43, + "grad_norm": 2.484698534011841, + "learning_rate": 2.3309230892159365e-05, + "loss": 1.1412, + "step": 34875 + }, + { + "epoch": 10.44, + "grad_norm": 2.0732154846191406, + "learning_rate": 2.3303369179997418e-05, + "loss": 1.114, + "step": 34880 + }, + { + "epoch": 10.44, + "grad_norm": 2.8750739097595215, + "learning_rate": 2.329750756153862e-05, + "loss": 1.0455, + "step": 34885 + }, + { + "epoch": 10.44, + "grad_norm": 1.7856959104537964, + "learning_rate": 2.329164603710672e-05, + "loss": 1.0768, + "step": 34890 + }, + { + "epoch": 10.44, + "grad_norm": 8.524049758911133, + "learning_rate": 2.328578460702543e-05, + "loss": 1.1066, + "step": 34895 + }, + { + "epoch": 10.44, + "grad_norm": 3.2621653079986572, + "learning_rate": 2.3279923271618465e-05, + "loss": 1.3073, + "step": 34900 + }, + { + "epoch": 10.44, + "grad_norm": 1.9567291736602783, + "learning_rate": 2.3274062031209563e-05, + "loss": 1.0333, + "step": 34905 + }, + { + "epoch": 10.44, + "grad_norm": 3.069105386734009, + "learning_rate": 2.326820088612241e-05, + "loss": 1.0295, + "step": 34910 + }, + { + "epoch": 10.45, + "grad_norm": 1.8950694799423218, + "learning_rate": 2.3262339836680727e-05, + "loss": 1.0614, + "step": 34915 + }, + { + "epoch": 10.45, + "grad_norm": 3.2250144481658936, + "learning_rate": 2.3256478883208206e-05, + "loss": 1.0405, + "step": 34920 + }, + { + "epoch": 10.45, + "grad_norm": 4.422776699066162, + "learning_rate": 2.3250618026028543e-05, + "loss": 1.1056, + "step": 34925 + }, + { + "epoch": 10.45, + "grad_norm": 1.4227927923202515, + "learning_rate": 2.3244757265465435e-05, + "loss": 1.0868, + "step": 34930 + }, + { + "epoch": 10.45, + "grad_norm": 2.4411497116088867, + "learning_rate": 2.3238896601842548e-05, + "loss": 0.9554, + "step": 34935 + }, + { + "epoch": 10.45, + "grad_norm": 2.067897081375122, + "learning_rate": 2.3233036035483587e-05, + "loss": 1.1577, + "step": 34940 + }, + { + "epoch": 10.46, + "grad_norm": 2.7568440437316895, + "learning_rate": 2.322717556671219e-05, + "loss": 1.0572, + "step": 34945 + }, + { + "epoch": 10.46, + "grad_norm": 4.667929649353027, + "learning_rate": 2.3221315195852058e-05, + "loss": 0.9588, + "step": 34950 + }, + { + "epoch": 10.46, + "grad_norm": 2.4807698726654053, + "learning_rate": 2.321545492322684e-05, + "loss": 1.0147, + "step": 34955 + }, + { + "epoch": 10.46, + "grad_norm": 15.608848571777344, + "learning_rate": 2.320959474916018e-05, + "loss": 1.2589, + "step": 34960 + }, + { + "epoch": 10.46, + "grad_norm": 2.8121330738067627, + "learning_rate": 2.3203734673975753e-05, + "loss": 1.0937, + "step": 34965 + }, + { + "epoch": 10.46, + "grad_norm": 2.748537540435791, + "learning_rate": 2.319787469799718e-05, + "loss": 0.9264, + "step": 34970 + }, + { + "epoch": 10.46, + "grad_norm": 3.233227014541626, + "learning_rate": 2.3192014821548127e-05, + "loss": 1.2414, + "step": 34975 + }, + { + "epoch": 10.47, + "grad_norm": 1.753865361213684, + "learning_rate": 2.3186155044952203e-05, + "loss": 1.0152, + "step": 34980 + }, + { + "epoch": 10.47, + "grad_norm": 3.4801855087280273, + "learning_rate": 2.3180295368533063e-05, + "loss": 0.9391, + "step": 34985 + }, + { + "epoch": 10.47, + "grad_norm": 2.416815757751465, + "learning_rate": 2.3174435792614318e-05, + "loss": 1.2012, + "step": 34990 + }, + { + "epoch": 10.47, + "grad_norm": 8.600541114807129, + "learning_rate": 2.316857631751958e-05, + "loss": 1.1342, + "step": 34995 + }, + { + "epoch": 10.47, + "grad_norm": 3.0248448848724365, + "learning_rate": 2.3162716943572465e-05, + "loss": 1.0246, + "step": 35000 + }, + { + "epoch": 10.47, + "grad_norm": 2.2905187606811523, + "learning_rate": 2.3156857671096592e-05, + "loss": 0.9875, + "step": 35005 + }, + { + "epoch": 10.47, + "grad_norm": 5.242259979248047, + "learning_rate": 2.3150998500415546e-05, + "loss": 0.9466, + "step": 35010 + }, + { + "epoch": 10.48, + "grad_norm": 2.770148992538452, + "learning_rate": 2.3145139431852934e-05, + "loss": 1.0718, + "step": 35015 + }, + { + "epoch": 10.48, + "grad_norm": 5.146824836730957, + "learning_rate": 2.3139280465732348e-05, + "loss": 1.1159, + "step": 35020 + }, + { + "epoch": 10.48, + "grad_norm": 1.9731031656265259, + "learning_rate": 2.3133421602377366e-05, + "loss": 0.8097, + "step": 35025 + }, + { + "epoch": 10.48, + "grad_norm": 1.8398514986038208, + "learning_rate": 2.3127562842111565e-05, + "loss": 1.1045, + "step": 35030 + }, + { + "epoch": 10.48, + "grad_norm": 3.5004403591156006, + "learning_rate": 2.3121704185258527e-05, + "loss": 1.0801, + "step": 35035 + }, + { + "epoch": 10.48, + "grad_norm": 3.4678022861480713, + "learning_rate": 2.3115845632141806e-05, + "loss": 1.1383, + "step": 35040 + }, + { + "epoch": 10.49, + "grad_norm": 2.484443426132202, + "learning_rate": 2.310998718308498e-05, + "loss": 1.0761, + "step": 35045 + }, + { + "epoch": 10.49, + "grad_norm": 3.0486910343170166, + "learning_rate": 2.31041288384116e-05, + "loss": 1.2576, + "step": 35050 + }, + { + "epoch": 10.49, + "grad_norm": 2.7550971508026123, + "learning_rate": 2.3098270598445203e-05, + "loss": 0.8973, + "step": 35055 + }, + { + "epoch": 10.49, + "grad_norm": 1.8515464067459106, + "learning_rate": 2.3092412463509357e-05, + "loss": 1.0277, + "step": 35060 + }, + { + "epoch": 10.49, + "grad_norm": 1.7236946821212769, + "learning_rate": 2.3086554433927573e-05, + "loss": 1.1393, + "step": 35065 + }, + { + "epoch": 10.49, + "grad_norm": 3.1678953170776367, + "learning_rate": 2.308069651002341e-05, + "loss": 1.0378, + "step": 35070 + }, + { + "epoch": 10.49, + "grad_norm": 1.7023390531539917, + "learning_rate": 2.3074838692120378e-05, + "loss": 1.0801, + "step": 35075 + }, + { + "epoch": 10.5, + "grad_norm": 3.274125814437866, + "learning_rate": 2.306898098054201e-05, + "loss": 1.027, + "step": 35080 + }, + { + "epoch": 10.5, + "grad_norm": 4.002904415130615, + "learning_rate": 2.306312337561181e-05, + "loss": 0.9389, + "step": 35085 + }, + { + "epoch": 10.5, + "grad_norm": 3.656179904937744, + "learning_rate": 2.305726587765329e-05, + "loss": 1.0733, + "step": 35090 + }, + { + "epoch": 10.5, + "grad_norm": 5.379166126251221, + "learning_rate": 2.305140848698996e-05, + "loss": 1.0789, + "step": 35095 + }, + { + "epoch": 10.5, + "grad_norm": 1.2176295518875122, + "learning_rate": 2.3045551203945314e-05, + "loss": 1.0208, + "step": 35100 + }, + { + "epoch": 10.5, + "grad_norm": 2.7093119621276855, + "learning_rate": 2.3039694028842847e-05, + "loss": 0.9681, + "step": 35105 + }, + { + "epoch": 10.5, + "grad_norm": 9.889016151428223, + "learning_rate": 2.3033836962006033e-05, + "loss": 0.8443, + "step": 35110 + }, + { + "epoch": 10.51, + "grad_norm": 4.702127456665039, + "learning_rate": 2.3027980003758366e-05, + "loss": 0.9901, + "step": 35115 + }, + { + "epoch": 10.51, + "grad_norm": 5.113590240478516, + "learning_rate": 2.3022123154423316e-05, + "loss": 1.1291, + "step": 35120 + }, + { + "epoch": 10.51, + "grad_norm": 1.5303608179092407, + "learning_rate": 2.301626641432434e-05, + "loss": 1.1448, + "step": 35125 + }, + { + "epoch": 10.51, + "grad_norm": 2.7038862705230713, + "learning_rate": 2.3010409783784913e-05, + "loss": 1.049, + "step": 35130 + }, + { + "epoch": 10.51, + "grad_norm": 3.5487146377563477, + "learning_rate": 2.3004553263128483e-05, + "loss": 1.0263, + "step": 35135 + }, + { + "epoch": 10.51, + "grad_norm": 1.9222893714904785, + "learning_rate": 2.299869685267851e-05, + "loss": 1.0709, + "step": 35140 + }, + { + "epoch": 10.51, + "grad_norm": 3.621490716934204, + "learning_rate": 2.2992840552758428e-05, + "loss": 1.1768, + "step": 35145 + }, + { + "epoch": 10.52, + "grad_norm": 1.7279472351074219, + "learning_rate": 2.2986984363691663e-05, + "loss": 1.0183, + "step": 35150 + }, + { + "epoch": 10.52, + "grad_norm": 1.8752796649932861, + "learning_rate": 2.2981128285801672e-05, + "loss": 1.0267, + "step": 35155 + }, + { + "epoch": 10.52, + "grad_norm": 2.7973175048828125, + "learning_rate": 2.297527231941186e-05, + "loss": 1.1971, + "step": 35160 + }, + { + "epoch": 10.52, + "grad_norm": 1.7248725891113281, + "learning_rate": 2.296941646484566e-05, + "loss": 0.9569, + "step": 35165 + }, + { + "epoch": 10.52, + "grad_norm": 2.4975392818450928, + "learning_rate": 2.2963560722426468e-05, + "loss": 1.111, + "step": 35170 + }, + { + "epoch": 10.52, + "grad_norm": 3.2511773109436035, + "learning_rate": 2.295770509247771e-05, + "loss": 1.066, + "step": 35175 + }, + { + "epoch": 10.53, + "grad_norm": 2.9692437648773193, + "learning_rate": 2.2951849575322772e-05, + "loss": 0.99, + "step": 35180 + }, + { + "epoch": 10.53, + "grad_norm": 2.788581371307373, + "learning_rate": 2.2945994171285058e-05, + "loss": 1.1557, + "step": 35185 + }, + { + "epoch": 10.53, + "grad_norm": 3.816265106201172, + "learning_rate": 2.2940138880687946e-05, + "loss": 1.0066, + "step": 35190 + }, + { + "epoch": 10.53, + "grad_norm": 5.253087520599365, + "learning_rate": 2.2934283703854823e-05, + "loss": 0.9963, + "step": 35195 + }, + { + "epoch": 10.53, + "grad_norm": 2.2299044132232666, + "learning_rate": 2.2928428641109065e-05, + "loss": 1.1243, + "step": 35200 + }, + { + "epoch": 10.53, + "grad_norm": 4.349503993988037, + "learning_rate": 2.2922573692774034e-05, + "loss": 1.1135, + "step": 35205 + }, + { + "epoch": 10.53, + "grad_norm": 1.9277504682540894, + "learning_rate": 2.2916718859173108e-05, + "loss": 1.0192, + "step": 35210 + }, + { + "epoch": 10.54, + "grad_norm": 2.2060022354125977, + "learning_rate": 2.2910864140629634e-05, + "loss": 1.1672, + "step": 35215 + }, + { + "epoch": 10.54, + "grad_norm": 3.025515556335449, + "learning_rate": 2.2905009537466955e-05, + "loss": 0.9311, + "step": 35220 + }, + { + "epoch": 10.54, + "grad_norm": 3.500619649887085, + "learning_rate": 2.2899155050008428e-05, + "loss": 1.1652, + "step": 35225 + }, + { + "epoch": 10.54, + "grad_norm": 8.94033145904541, + "learning_rate": 2.2893300678577376e-05, + "loss": 1.251, + "step": 35230 + }, + { + "epoch": 10.54, + "grad_norm": 3.2193968296051025, + "learning_rate": 2.2887446423497147e-05, + "loss": 0.9508, + "step": 35235 + }, + { + "epoch": 10.54, + "grad_norm": 3.0045528411865234, + "learning_rate": 2.2881592285091045e-05, + "loss": 0.9492, + "step": 35240 + }, + { + "epoch": 10.54, + "grad_norm": 3.3813045024871826, + "learning_rate": 2.2875738263682413e-05, + "loss": 0.9394, + "step": 35245 + }, + { + "epoch": 10.55, + "grad_norm": 1.6937832832336426, + "learning_rate": 2.2869884359594545e-05, + "loss": 1.1169, + "step": 35250 + }, + { + "epoch": 10.55, + "grad_norm": 1.0461424589157104, + "learning_rate": 2.2864030573150738e-05, + "loss": 1.2219, + "step": 35255 + }, + { + "epoch": 10.55, + "grad_norm": 2.934732675552368, + "learning_rate": 2.2858176904674317e-05, + "loss": 1.14, + "step": 35260 + }, + { + "epoch": 10.55, + "grad_norm": 4.22146463394165, + "learning_rate": 2.2852323354488548e-05, + "loss": 1.103, + "step": 35265 + }, + { + "epoch": 10.55, + "grad_norm": 3.387679100036621, + "learning_rate": 2.2846469922916736e-05, + "loss": 1.0281, + "step": 35270 + }, + { + "epoch": 10.55, + "grad_norm": 2.5129520893096924, + "learning_rate": 2.284061661028215e-05, + "loss": 0.9338, + "step": 35275 + }, + { + "epoch": 10.56, + "grad_norm": 1.3689956665039062, + "learning_rate": 2.2834763416908057e-05, + "loss": 1.1322, + "step": 35280 + }, + { + "epoch": 10.56, + "grad_norm": 1.68153977394104, + "learning_rate": 2.2828910343117734e-05, + "loss": 1.1103, + "step": 35285 + }, + { + "epoch": 10.56, + "grad_norm": 3.7177228927612305, + "learning_rate": 2.2823057389234432e-05, + "loss": 1.1346, + "step": 35290 + }, + { + "epoch": 10.56, + "grad_norm": 1.6719677448272705, + "learning_rate": 2.281720455558142e-05, + "loss": 1.0287, + "step": 35295 + }, + { + "epoch": 10.56, + "grad_norm": 2.740884780883789, + "learning_rate": 2.2811351842481916e-05, + "loss": 1.0737, + "step": 35300 + }, + { + "epoch": 10.56, + "grad_norm": 3.8541574478149414, + "learning_rate": 2.280549925025919e-05, + "loss": 1.045, + "step": 35305 + }, + { + "epoch": 10.56, + "grad_norm": 3.7161831855773926, + "learning_rate": 2.2799646779236454e-05, + "loss": 1.1965, + "step": 35310 + }, + { + "epoch": 10.57, + "grad_norm": 4.742947578430176, + "learning_rate": 2.2793794429736933e-05, + "loss": 1.0853, + "step": 35315 + }, + { + "epoch": 10.57, + "grad_norm": 1.507947564125061, + "learning_rate": 2.278794220208386e-05, + "loss": 0.9959, + "step": 35320 + }, + { + "epoch": 10.57, + "grad_norm": 1.6198567152023315, + "learning_rate": 2.2782090096600434e-05, + "loss": 1.0109, + "step": 35325 + }, + { + "epoch": 10.57, + "grad_norm": 1.7230312824249268, + "learning_rate": 2.277623811360987e-05, + "loss": 1.1351, + "step": 35330 + }, + { + "epoch": 10.57, + "grad_norm": 4.703332901000977, + "learning_rate": 2.2770386253435358e-05, + "loss": 1.0449, + "step": 35335 + }, + { + "epoch": 10.57, + "grad_norm": 2.2942113876342773, + "learning_rate": 2.2764534516400106e-05, + "loss": 1.1787, + "step": 35340 + }, + { + "epoch": 10.57, + "grad_norm": 4.3071513175964355, + "learning_rate": 2.275868290282729e-05, + "loss": 1.092, + "step": 35345 + }, + { + "epoch": 10.58, + "grad_norm": 2.7554285526275635, + "learning_rate": 2.2752831413040074e-05, + "loss": 1.1289, + "step": 35350 + }, + { + "epoch": 10.58, + "grad_norm": 2.5249757766723633, + "learning_rate": 2.2746980047361654e-05, + "loss": 1.0906, + "step": 35355 + }, + { + "epoch": 10.58, + "grad_norm": 2.719829559326172, + "learning_rate": 2.2741128806115176e-05, + "loss": 0.9871, + "step": 35360 + }, + { + "epoch": 10.58, + "grad_norm": 2.8020482063293457, + "learning_rate": 2.273527768962381e-05, + "loss": 1.1126, + "step": 35365 + }, + { + "epoch": 10.58, + "grad_norm": 2.6041548252105713, + "learning_rate": 2.2729426698210703e-05, + "loss": 1.1024, + "step": 35370 + }, + { + "epoch": 10.58, + "grad_norm": 3.7162394523620605, + "learning_rate": 2.2723575832198997e-05, + "loss": 0.9589, + "step": 35375 + }, + { + "epoch": 10.59, + "grad_norm": 25.685956954956055, + "learning_rate": 2.2717725091911843e-05, + "loss": 0.9937, + "step": 35380 + }, + { + "epoch": 10.59, + "grad_norm": 4.040263652801514, + "learning_rate": 2.2711874477672342e-05, + "loss": 0.9, + "step": 35385 + }, + { + "epoch": 10.59, + "grad_norm": 2.358093023300171, + "learning_rate": 2.2706023989803653e-05, + "loss": 1.068, + "step": 35390 + }, + { + "epoch": 10.59, + "grad_norm": 3.6727354526519775, + "learning_rate": 2.270017362862886e-05, + "loss": 0.9494, + "step": 35395 + }, + { + "epoch": 10.59, + "grad_norm": 2.7660393714904785, + "learning_rate": 2.2694323394471097e-05, + "loss": 0.9925, + "step": 35400 + }, + { + "epoch": 10.59, + "grad_norm": 1.4063373804092407, + "learning_rate": 2.2688473287653457e-05, + "loss": 1.2022, + "step": 35405 + }, + { + "epoch": 10.59, + "grad_norm": 2.7881920337677, + "learning_rate": 2.2682623308499023e-05, + "loss": 0.9778, + "step": 35410 + }, + { + "epoch": 10.6, + "grad_norm": 2.657111644744873, + "learning_rate": 2.267677345733091e-05, + "loss": 1.0118, + "step": 35415 + }, + { + "epoch": 10.6, + "grad_norm": 2.8737306594848633, + "learning_rate": 2.267092373447217e-05, + "loss": 0.9643, + "step": 35420 + }, + { + "epoch": 10.6, + "grad_norm": 3.328007459640503, + "learning_rate": 2.26650741402459e-05, + "loss": 1.0657, + "step": 35425 + }, + { + "epoch": 10.6, + "grad_norm": 4.150506019592285, + "learning_rate": 2.265922467497515e-05, + "loss": 1.0437, + "step": 35430 + }, + { + "epoch": 10.6, + "grad_norm": 1.4019544124603271, + "learning_rate": 2.2653375338983e-05, + "loss": 1.1049, + "step": 35435 + }, + { + "epoch": 10.6, + "grad_norm": 2.7139785289764404, + "learning_rate": 2.264752613259249e-05, + "loss": 1.0097, + "step": 35440 + }, + { + "epoch": 10.6, + "grad_norm": 1.9748849868774414, + "learning_rate": 2.2641677056126654e-05, + "loss": 1.1295, + "step": 35445 + }, + { + "epoch": 10.61, + "grad_norm": 3.615483283996582, + "learning_rate": 2.263582810990855e-05, + "loss": 1.0361, + "step": 35450 + }, + { + "epoch": 10.61, + "grad_norm": 2.8796679973602295, + "learning_rate": 2.26299792942612e-05, + "loss": 1.1339, + "step": 35455 + }, + { + "epoch": 10.61, + "grad_norm": 1.1939923763275146, + "learning_rate": 2.262413060950763e-05, + "loss": 1.0356, + "step": 35460 + }, + { + "epoch": 10.61, + "grad_norm": 7.1389994621276855, + "learning_rate": 2.261828205597086e-05, + "loss": 1.0418, + "step": 35465 + }, + { + "epoch": 10.61, + "grad_norm": 1.4644900560379028, + "learning_rate": 2.2612433633973896e-05, + "loss": 1.0545, + "step": 35470 + }, + { + "epoch": 10.61, + "grad_norm": 1.3660115003585815, + "learning_rate": 2.2606585343839744e-05, + "loss": 1.0775, + "step": 35475 + }, + { + "epoch": 10.62, + "grad_norm": 2.621532440185547, + "learning_rate": 2.2600737185891385e-05, + "loss": 1.1644, + "step": 35480 + }, + { + "epoch": 10.62, + "grad_norm": 6.380326271057129, + "learning_rate": 2.2594889160451828e-05, + "loss": 1.1348, + "step": 35485 + }, + { + "epoch": 10.62, + "grad_norm": 2.7322096824645996, + "learning_rate": 2.2589041267844034e-05, + "loss": 1.097, + "step": 35490 + }, + { + "epoch": 10.62, + "grad_norm": 3.6032140254974365, + "learning_rate": 2.2583193508390993e-05, + "loss": 1.0526, + "step": 35495 + }, + { + "epoch": 10.62, + "grad_norm": 2.900298833847046, + "learning_rate": 2.2577345882415663e-05, + "loss": 0.9409, + "step": 35500 + }, + { + "epoch": 10.62, + "grad_norm": 1.8747291564941406, + "learning_rate": 2.257149839024099e-05, + "loss": 1.1623, + "step": 35505 + }, + { + "epoch": 10.62, + "grad_norm": 2.2065136432647705, + "learning_rate": 2.2565651032189948e-05, + "loss": 0.9874, + "step": 35510 + }, + { + "epoch": 10.63, + "grad_norm": 2.0098252296447754, + "learning_rate": 2.255980380858546e-05, + "loss": 1.1739, + "step": 35515 + }, + { + "epoch": 10.63, + "grad_norm": 3.0822856426239014, + "learning_rate": 2.2553956719750483e-05, + "loss": 1.1192, + "step": 35520 + }, + { + "epoch": 10.63, + "grad_norm": 2.1216416358947754, + "learning_rate": 2.254810976600792e-05, + "loss": 0.9649, + "step": 35525 + }, + { + "epoch": 10.63, + "grad_norm": 1.3982808589935303, + "learning_rate": 2.254226294768072e-05, + "loss": 1.0809, + "step": 35530 + }, + { + "epoch": 10.63, + "grad_norm": 1.7782407999038696, + "learning_rate": 2.2536416265091775e-05, + "loss": 1.1739, + "step": 35535 + }, + { + "epoch": 10.63, + "grad_norm": 3.666782855987549, + "learning_rate": 2.2530569718563998e-05, + "loss": 1.1696, + "step": 35540 + }, + { + "epoch": 10.63, + "grad_norm": 2.729365587234497, + "learning_rate": 2.252472330842029e-05, + "loss": 1.0824, + "step": 35545 + }, + { + "epoch": 10.64, + "grad_norm": 2.964341163635254, + "learning_rate": 2.251887703498354e-05, + "loss": 1.2195, + "step": 35550 + }, + { + "epoch": 10.64, + "grad_norm": 3.209306240081787, + "learning_rate": 2.2513030898576635e-05, + "loss": 1.1367, + "step": 35555 + }, + { + "epoch": 10.64, + "grad_norm": 3.5836918354034424, + "learning_rate": 2.2507184899522447e-05, + "loss": 1.142, + "step": 35560 + }, + { + "epoch": 10.64, + "grad_norm": 1.1011484861373901, + "learning_rate": 2.2501339038143843e-05, + "loss": 1.11, + "step": 35565 + }, + { + "epoch": 10.64, + "grad_norm": 1.9480361938476562, + "learning_rate": 2.2495493314763697e-05, + "loss": 1.0384, + "step": 35570 + }, + { + "epoch": 10.64, + "grad_norm": 1.9894143342971802, + "learning_rate": 2.2489647729704838e-05, + "loss": 1.0765, + "step": 35575 + }, + { + "epoch": 10.65, + "grad_norm": 2.3497085571289062, + "learning_rate": 2.2483802283290137e-05, + "loss": 1.0708, + "step": 35580 + }, + { + "epoch": 10.65, + "grad_norm": 2.3375086784362793, + "learning_rate": 2.2477956975842407e-05, + "loss": 1.041, + "step": 35585 + }, + { + "epoch": 10.65, + "grad_norm": 4.431266784667969, + "learning_rate": 2.2472111807684507e-05, + "loss": 1.0047, + "step": 35590 + }, + { + "epoch": 10.65, + "grad_norm": 6.045180797576904, + "learning_rate": 2.246626677913923e-05, + "loss": 1.0029, + "step": 35595 + }, + { + "epoch": 10.65, + "grad_norm": 2.7329964637756348, + "learning_rate": 2.2460421890529417e-05, + "loss": 1.0755, + "step": 35600 + }, + { + "epoch": 10.65, + "grad_norm": 3.588158130645752, + "learning_rate": 2.2454577142177865e-05, + "loss": 0.8856, + "step": 35605 + }, + { + "epoch": 10.65, + "grad_norm": 2.5146617889404297, + "learning_rate": 2.244873253440736e-05, + "loss": 1.2, + "step": 35610 + }, + { + "epoch": 10.66, + "grad_norm": 5.354367733001709, + "learning_rate": 2.2442888067540715e-05, + "loss": 1.0349, + "step": 35615 + }, + { + "epoch": 10.66, + "grad_norm": 1.597602367401123, + "learning_rate": 2.243704374190069e-05, + "loss": 1.1549, + "step": 35620 + }, + { + "epoch": 10.66, + "grad_norm": 1.7936272621154785, + "learning_rate": 2.2431199557810092e-05, + "loss": 1.0453, + "step": 35625 + }, + { + "epoch": 10.66, + "grad_norm": 1.9876726865768433, + "learning_rate": 2.2425355515591666e-05, + "loss": 1.0696, + "step": 35630 + }, + { + "epoch": 10.66, + "grad_norm": 3.0657265186309814, + "learning_rate": 2.241951161556818e-05, + "loss": 0.9361, + "step": 35635 + }, + { + "epoch": 10.66, + "grad_norm": 4.501842021942139, + "learning_rate": 2.241366785806238e-05, + "loss": 1.1468, + "step": 35640 + }, + { + "epoch": 10.66, + "grad_norm": 3.099733829498291, + "learning_rate": 2.240782424339702e-05, + "loss": 0.9717, + "step": 35645 + }, + { + "epoch": 10.67, + "grad_norm": 4.878641605377197, + "learning_rate": 2.2401980771894828e-05, + "loss": 0.9771, + "step": 35650 + }, + { + "epoch": 10.67, + "grad_norm": 3.61327862739563, + "learning_rate": 2.2396137443878534e-05, + "loss": 1.0749, + "step": 35655 + }, + { + "epoch": 10.67, + "grad_norm": 5.230576038360596, + "learning_rate": 2.2390294259670877e-05, + "loss": 1.1493, + "step": 35660 + }, + { + "epoch": 10.67, + "grad_norm": 1.4460947513580322, + "learning_rate": 2.238445121959455e-05, + "loss": 1.2674, + "step": 35665 + }, + { + "epoch": 10.67, + "grad_norm": 2.8116650581359863, + "learning_rate": 2.2378608323972255e-05, + "loss": 1.0535, + "step": 35670 + }, + { + "epoch": 10.67, + "grad_norm": 3.138775587081909, + "learning_rate": 2.2372765573126712e-05, + "loss": 0.8282, + "step": 35675 + }, + { + "epoch": 10.68, + "grad_norm": 2.2588982582092285, + "learning_rate": 2.236692296738058e-05, + "loss": 1.1188, + "step": 35680 + }, + { + "epoch": 10.68, + "grad_norm": 2.0676910877227783, + "learning_rate": 2.2361080507056565e-05, + "loss": 1.1425, + "step": 35685 + }, + { + "epoch": 10.68, + "grad_norm": 7.246330738067627, + "learning_rate": 2.2355238192477324e-05, + "loss": 0.9489, + "step": 35690 + }, + { + "epoch": 10.68, + "grad_norm": 3.171942710876465, + "learning_rate": 2.234939602396554e-05, + "loss": 1.1224, + "step": 35695 + }, + { + "epoch": 10.68, + "grad_norm": 2.2110137939453125, + "learning_rate": 2.2343554001843857e-05, + "loss": 1.1013, + "step": 35700 + }, + { + "epoch": 10.68, + "grad_norm": 2.53434157371521, + "learning_rate": 2.233771212643491e-05, + "loss": 0.9745, + "step": 35705 + }, + { + "epoch": 10.68, + "grad_norm": 2.9125607013702393, + "learning_rate": 2.2331870398061372e-05, + "loss": 1.3203, + "step": 35710 + }, + { + "epoch": 10.69, + "grad_norm": 3.6007089614868164, + "learning_rate": 2.2326028817045842e-05, + "loss": 1.0354, + "step": 35715 + }, + { + "epoch": 10.69, + "grad_norm": 4.191182613372803, + "learning_rate": 2.2320187383710978e-05, + "loss": 1.0592, + "step": 35720 + }, + { + "epoch": 10.69, + "grad_norm": 6.260267734527588, + "learning_rate": 2.2314346098379367e-05, + "loss": 1.1301, + "step": 35725 + }, + { + "epoch": 10.69, + "grad_norm": 2.3227128982543945, + "learning_rate": 2.230850496137363e-05, + "loss": 0.998, + "step": 35730 + }, + { + "epoch": 10.69, + "grad_norm": 2.7904467582702637, + "learning_rate": 2.2302663973016374e-05, + "loss": 1.0603, + "step": 35735 + }, + { + "epoch": 10.69, + "grad_norm": 5.104443550109863, + "learning_rate": 2.2296823133630174e-05, + "loss": 0.9873, + "step": 35740 + }, + { + "epoch": 10.69, + "grad_norm": 5.6274094581604, + "learning_rate": 2.2290982443537633e-05, + "loss": 0.9872, + "step": 35745 + }, + { + "epoch": 10.7, + "grad_norm": 1.1891549825668335, + "learning_rate": 2.2285141903061304e-05, + "loss": 1.0641, + "step": 35750 + }, + { + "epoch": 10.7, + "grad_norm": 2.298877000808716, + "learning_rate": 2.2279301512523778e-05, + "loss": 1.1714, + "step": 35755 + }, + { + "epoch": 10.7, + "grad_norm": 2.9934897422790527, + "learning_rate": 2.22734612722476e-05, + "loss": 0.8837, + "step": 35760 + }, + { + "epoch": 10.7, + "grad_norm": 3.266824722290039, + "learning_rate": 2.2267621182555313e-05, + "loss": 0.9443, + "step": 35765 + }, + { + "epoch": 10.7, + "grad_norm": 6.283466815948486, + "learning_rate": 2.2261781243769478e-05, + "loss": 1.1074, + "step": 35770 + }, + { + "epoch": 10.7, + "grad_norm": 1.5854196548461914, + "learning_rate": 2.2255941456212606e-05, + "loss": 0.9736, + "step": 35775 + }, + { + "epoch": 10.7, + "grad_norm": 4.49138069152832, + "learning_rate": 2.2250101820207246e-05, + "loss": 1.1243, + "step": 35780 + }, + { + "epoch": 10.71, + "grad_norm": 3.611423969268799, + "learning_rate": 2.2244262336075896e-05, + "loss": 1.1206, + "step": 35785 + }, + { + "epoch": 10.71, + "grad_norm": 2.668792247772217, + "learning_rate": 2.2238423004141083e-05, + "loss": 1.3554, + "step": 35790 + }, + { + "epoch": 10.71, + "grad_norm": 1.8896316289901733, + "learning_rate": 2.2232583824725296e-05, + "loss": 1.036, + "step": 35795 + }, + { + "epoch": 10.71, + "grad_norm": 2.4683837890625, + "learning_rate": 2.2226744798151017e-05, + "loss": 1.2378, + "step": 35800 + }, + { + "epoch": 10.71, + "grad_norm": 19.00712776184082, + "learning_rate": 2.2220905924740752e-05, + "loss": 0.9128, + "step": 35805 + }, + { + "epoch": 10.71, + "grad_norm": 3.103806972503662, + "learning_rate": 2.221506720481695e-05, + "loss": 1.1218, + "step": 35810 + }, + { + "epoch": 10.72, + "grad_norm": 5.2492756843566895, + "learning_rate": 2.2209228638702098e-05, + "loss": 1.0806, + "step": 35815 + }, + { + "epoch": 10.72, + "grad_norm": 2.3563156127929688, + "learning_rate": 2.2203390226718652e-05, + "loss": 1.1438, + "step": 35820 + }, + { + "epoch": 10.72, + "grad_norm": 1.5582448244094849, + "learning_rate": 2.2197551969189052e-05, + "loss": 1.194, + "step": 35825 + }, + { + "epoch": 10.72, + "grad_norm": 3.394693613052368, + "learning_rate": 2.219171386643575e-05, + "loss": 1.1377, + "step": 35830 + }, + { + "epoch": 10.72, + "grad_norm": 2.7966670989990234, + "learning_rate": 2.2185875918781163e-05, + "loss": 1.0573, + "step": 35835 + }, + { + "epoch": 10.72, + "grad_norm": 12.529913902282715, + "learning_rate": 2.218003812654773e-05, + "loss": 0.8754, + "step": 35840 + }, + { + "epoch": 10.72, + "grad_norm": 4.540731430053711, + "learning_rate": 2.2174200490057856e-05, + "loss": 0.9227, + "step": 35845 + }, + { + "epoch": 10.73, + "grad_norm": 2.8158795833587646, + "learning_rate": 2.2168363009633958e-05, + "loss": 0.9375, + "step": 35850 + }, + { + "epoch": 10.73, + "grad_norm": 3.6659469604492188, + "learning_rate": 2.216252568559843e-05, + "loss": 1.2302, + "step": 35855 + }, + { + "epoch": 10.73, + "grad_norm": 2.3665411472320557, + "learning_rate": 2.2156688518273648e-05, + "loss": 0.997, + "step": 35860 + }, + { + "epoch": 10.73, + "grad_norm": 3.4320127964019775, + "learning_rate": 2.2150851507982014e-05, + "loss": 1.1298, + "step": 35865 + }, + { + "epoch": 10.73, + "grad_norm": 2.856066942214966, + "learning_rate": 2.2145014655045876e-05, + "loss": 1.2628, + "step": 35870 + }, + { + "epoch": 10.73, + "grad_norm": 2.302837610244751, + "learning_rate": 2.2139177959787625e-05, + "loss": 0.997, + "step": 35875 + }, + { + "epoch": 10.73, + "grad_norm": 3.4538989067077637, + "learning_rate": 2.213334142252959e-05, + "loss": 0.9814, + "step": 35880 + }, + { + "epoch": 10.74, + "grad_norm": 1.6373672485351562, + "learning_rate": 2.212750504359414e-05, + "loss": 1.069, + "step": 35885 + }, + { + "epoch": 10.74, + "grad_norm": 1.9250200986862183, + "learning_rate": 2.2121668823303595e-05, + "loss": 1.0603, + "step": 35890 + }, + { + "epoch": 10.74, + "grad_norm": 2.419745445251465, + "learning_rate": 2.211583276198029e-05, + "loss": 1.1043, + "step": 35895 + }, + { + "epoch": 10.74, + "grad_norm": 1.965515375137329, + "learning_rate": 2.210999685994654e-05, + "loss": 0.9786, + "step": 35900 + }, + { + "epoch": 10.74, + "grad_norm": 2.6983680725097656, + "learning_rate": 2.2104161117524664e-05, + "loss": 0.9368, + "step": 35905 + }, + { + "epoch": 10.74, + "grad_norm": 2.2785370349884033, + "learning_rate": 2.2098325535036957e-05, + "loss": 1.1098, + "step": 35910 + }, + { + "epoch": 10.75, + "grad_norm": 2.5998971462249756, + "learning_rate": 2.2092490112805715e-05, + "loss": 0.9755, + "step": 35915 + }, + { + "epoch": 10.75, + "grad_norm": 2.66782546043396, + "learning_rate": 2.208665485115322e-05, + "loss": 1.2246, + "step": 35920 + }, + { + "epoch": 10.75, + "grad_norm": 2.6380605697631836, + "learning_rate": 2.2080819750401757e-05, + "loss": 1.1043, + "step": 35925 + }, + { + "epoch": 10.75, + "grad_norm": 1.2431594133377075, + "learning_rate": 2.2074984810873572e-05, + "loss": 0.9766, + "step": 35930 + }, + { + "epoch": 10.75, + "grad_norm": 1.7657440900802612, + "learning_rate": 2.2069150032890952e-05, + "loss": 1.0666, + "step": 35935 + }, + { + "epoch": 10.75, + "grad_norm": 1.1442104578018188, + "learning_rate": 2.2064482327034192e-05, + "loss": 0.958, + "step": 35940 + }, + { + "epoch": 10.75, + "grad_norm": 4.242776393890381, + "learning_rate": 2.2058647840645615e-05, + "loss": 1.0115, + "step": 35945 + }, + { + "epoch": 10.76, + "grad_norm": 2.7217624187469482, + "learning_rate": 2.2052813516704852e-05, + "loss": 1.0799, + "step": 35950 + }, + { + "epoch": 10.76, + "grad_norm": 2.8261914253234863, + "learning_rate": 2.204697935553414e-05, + "loss": 1.1698, + "step": 35955 + }, + { + "epoch": 10.76, + "grad_norm": 1.2735272645950317, + "learning_rate": 2.2041145357455684e-05, + "loss": 0.9582, + "step": 35960 + }, + { + "epoch": 10.76, + "grad_norm": 2.095346212387085, + "learning_rate": 2.2035311522791705e-05, + "loss": 1.0745, + "step": 35965 + }, + { + "epoch": 10.76, + "grad_norm": 3.746208906173706, + "learning_rate": 2.2029477851864376e-05, + "loss": 0.9504, + "step": 35970 + }, + { + "epoch": 10.76, + "grad_norm": 32.70151138305664, + "learning_rate": 2.2023644344995907e-05, + "loss": 1.1023, + "step": 35975 + }, + { + "epoch": 10.76, + "grad_norm": 2.321796417236328, + "learning_rate": 2.2017811002508466e-05, + "loss": 1.1198, + "step": 35980 + }, + { + "epoch": 10.77, + "grad_norm": 2.214085102081299, + "learning_rate": 2.201197782472422e-05, + "loss": 1.0783, + "step": 35985 + }, + { + "epoch": 10.77, + "grad_norm": 2.9954833984375, + "learning_rate": 2.2006144811965336e-05, + "loss": 1.059, + "step": 35990 + }, + { + "epoch": 10.77, + "grad_norm": 1.0794235467910767, + "learning_rate": 2.2000311964553954e-05, + "loss": 1.1247, + "step": 35995 + }, + { + "epoch": 10.77, + "grad_norm": 5.492161750793457, + "learning_rate": 2.1994479282812236e-05, + "loss": 0.9969, + "step": 36000 + }, + { + "epoch": 10.77, + "grad_norm": 1.0977524518966675, + "learning_rate": 2.19886467670623e-05, + "loss": 0.9352, + "step": 36005 + }, + { + "epoch": 10.77, + "grad_norm": 2.7051050662994385, + "learning_rate": 2.1982814417626265e-05, + "loss": 0.7758, + "step": 36010 + }, + { + "epoch": 10.78, + "grad_norm": 2.0435142517089844, + "learning_rate": 2.1976982234826265e-05, + "loss": 1.0474, + "step": 36015 + }, + { + "epoch": 10.78, + "grad_norm": 3.4940507411956787, + "learning_rate": 2.197115021898438e-05, + "loss": 1.0514, + "step": 36020 + }, + { + "epoch": 10.78, + "grad_norm": 2.082819700241089, + "learning_rate": 2.1965318370422735e-05, + "loss": 1.0285, + "step": 36025 + }, + { + "epoch": 10.78, + "grad_norm": 2.880784511566162, + "learning_rate": 2.195948668946339e-05, + "loss": 1.2136, + "step": 36030 + }, + { + "epoch": 10.78, + "grad_norm": 3.054262161254883, + "learning_rate": 2.1953655176428445e-05, + "loss": 1.0397, + "step": 36035 + }, + { + "epoch": 10.78, + "grad_norm": 3.0019352436065674, + "learning_rate": 2.194782383163996e-05, + "loss": 1.11, + "step": 36040 + }, + { + "epoch": 10.78, + "grad_norm": 0.9828662276268005, + "learning_rate": 2.194199265541998e-05, + "loss": 1.0822, + "step": 36045 + }, + { + "epoch": 10.79, + "grad_norm": 3.027097702026367, + "learning_rate": 2.1936161648090575e-05, + "loss": 0.9884, + "step": 36050 + }, + { + "epoch": 10.79, + "grad_norm": 6.978221893310547, + "learning_rate": 2.193033080997377e-05, + "loss": 1.0059, + "step": 36055 + }, + { + "epoch": 10.79, + "grad_norm": 3.2999072074890137, + "learning_rate": 2.192450014139161e-05, + "loss": 1.2139, + "step": 36060 + }, + { + "epoch": 10.79, + "grad_norm": 2.107088804244995, + "learning_rate": 2.1918669642666106e-05, + "loss": 0.9479, + "step": 36065 + }, + { + "epoch": 10.79, + "grad_norm": 1.4788484573364258, + "learning_rate": 2.1912839314119276e-05, + "loss": 1.0025, + "step": 36070 + }, + { + "epoch": 10.79, + "grad_norm": 1.6675289869308472, + "learning_rate": 2.190700915607313e-05, + "loss": 1.1289, + "step": 36075 + }, + { + "epoch": 10.79, + "grad_norm": 4.08076286315918, + "learning_rate": 2.190117916884964e-05, + "loss": 1.1822, + "step": 36080 + }, + { + "epoch": 10.8, + "grad_norm": 9.089180946350098, + "learning_rate": 2.1895349352770816e-05, + "loss": 1.0968, + "step": 36085 + }, + { + "epoch": 10.8, + "grad_norm": 4.459921360015869, + "learning_rate": 2.188951970815861e-05, + "loss": 0.8567, + "step": 36090 + }, + { + "epoch": 10.8, + "grad_norm": 3.4034626483917236, + "learning_rate": 2.1883690235335004e-05, + "loss": 1.05, + "step": 36095 + }, + { + "epoch": 10.8, + "grad_norm": 4.486004829406738, + "learning_rate": 2.187786093462194e-05, + "loss": 1.04, + "step": 36100 + }, + { + "epoch": 10.8, + "grad_norm": 2.139437675476074, + "learning_rate": 2.187203180634138e-05, + "loss": 0.9213, + "step": 36105 + }, + { + "epoch": 10.8, + "grad_norm": 2.6758251190185547, + "learning_rate": 2.1866202850815254e-05, + "loss": 1.0313, + "step": 36110 + }, + { + "epoch": 10.81, + "grad_norm": 2.4901866912841797, + "learning_rate": 2.1860374068365473e-05, + "loss": 1.0219, + "step": 36115 + }, + { + "epoch": 10.81, + "grad_norm": 4.980169296264648, + "learning_rate": 2.1854545459313985e-05, + "loss": 1.1223, + "step": 36120 + }, + { + "epoch": 10.81, + "grad_norm": 2.8188419342041016, + "learning_rate": 2.1848717023982667e-05, + "loss": 1.0865, + "step": 36125 + }, + { + "epoch": 10.81, + "grad_norm": 6.227662086486816, + "learning_rate": 2.1842888762693444e-05, + "loss": 1.1196, + "step": 36130 + }, + { + "epoch": 10.81, + "grad_norm": 2.4889774322509766, + "learning_rate": 2.1837060675768196e-05, + "loss": 1.0085, + "step": 36135 + }, + { + "epoch": 10.81, + "grad_norm": 2.1352288722991943, + "learning_rate": 2.1831232763528783e-05, + "loss": 1.0865, + "step": 36140 + }, + { + "epoch": 10.81, + "grad_norm": 1.6987781524658203, + "learning_rate": 2.1825405026297103e-05, + "loss": 1.1085, + "step": 36145 + }, + { + "epoch": 10.82, + "grad_norm": 2.239659309387207, + "learning_rate": 2.1819577464394992e-05, + "loss": 1.2226, + "step": 36150 + }, + { + "epoch": 10.82, + "grad_norm": 2.6719276905059814, + "learning_rate": 2.181375007814432e-05, + "loss": 1.2125, + "step": 36155 + }, + { + "epoch": 10.82, + "grad_norm": 1.225275993347168, + "learning_rate": 2.180792286786692e-05, + "loss": 1.0033, + "step": 36160 + }, + { + "epoch": 10.82, + "grad_norm": 4.794998645782471, + "learning_rate": 2.180209583388462e-05, + "loss": 1.0673, + "step": 36165 + }, + { + "epoch": 10.82, + "grad_norm": 2.080230951309204, + "learning_rate": 2.179626897651925e-05, + "loss": 1.067, + "step": 36170 + }, + { + "epoch": 10.82, + "grad_norm": 1.528211236000061, + "learning_rate": 2.17904422960926e-05, + "loss": 1.1119, + "step": 36175 + }, + { + "epoch": 10.82, + "grad_norm": 4.195923328399658, + "learning_rate": 2.17846157929265e-05, + "loss": 1.2904, + "step": 36180 + }, + { + "epoch": 10.83, + "grad_norm": 3.0591914653778076, + "learning_rate": 2.1778789467342713e-05, + "loss": 1.1747, + "step": 36185 + }, + { + "epoch": 10.83, + "grad_norm": 9.022075653076172, + "learning_rate": 2.177296331966305e-05, + "loss": 1.0482, + "step": 36190 + }, + { + "epoch": 10.83, + "grad_norm": 3.723809003829956, + "learning_rate": 2.1767137350209253e-05, + "loss": 1.0859, + "step": 36195 + }, + { + "epoch": 10.83, + "grad_norm": 2.5620508193969727, + "learning_rate": 2.1761311559303117e-05, + "loss": 0.9736, + "step": 36200 + }, + { + "epoch": 10.83, + "grad_norm": 3.8153581619262695, + "learning_rate": 2.1755485947266375e-05, + "loss": 1.2724, + "step": 36205 + }, + { + "epoch": 10.83, + "grad_norm": 1.3778871297836304, + "learning_rate": 2.174966051442076e-05, + "loss": 1.1058, + "step": 36210 + }, + { + "epoch": 10.84, + "grad_norm": 3.073227643966675, + "learning_rate": 2.174383526108803e-05, + "loss": 1.0846, + "step": 36215 + }, + { + "epoch": 10.84, + "grad_norm": 1.9298650026321411, + "learning_rate": 2.1738010187589878e-05, + "loss": 0.9319, + "step": 36220 + }, + { + "epoch": 10.84, + "grad_norm": 1.1409422159194946, + "learning_rate": 2.1732185294248045e-05, + "loss": 1.1355, + "step": 36225 + }, + { + "epoch": 10.84, + "grad_norm": 2.380774974822998, + "learning_rate": 2.1726360581384218e-05, + "loss": 1.1526, + "step": 36230 + }, + { + "epoch": 10.84, + "grad_norm": 1.3581106662750244, + "learning_rate": 2.172053604932009e-05, + "loss": 1.0837, + "step": 36235 + }, + { + "epoch": 10.84, + "grad_norm": 4.142109394073486, + "learning_rate": 2.1714711698377348e-05, + "loss": 1.0323, + "step": 36240 + }, + { + "epoch": 10.84, + "grad_norm": 2.672729015350342, + "learning_rate": 2.1708887528877668e-05, + "loss": 1.0175, + "step": 36245 + }, + { + "epoch": 10.85, + "grad_norm": 1.8875662088394165, + "learning_rate": 2.1703063541142703e-05, + "loss": 1.191, + "step": 36250 + }, + { + "epoch": 10.85, + "grad_norm": 1.7792450189590454, + "learning_rate": 2.1697239735494117e-05, + "loss": 0.9895, + "step": 36255 + }, + { + "epoch": 10.85, + "grad_norm": 2.7931838035583496, + "learning_rate": 2.1691416112253547e-05, + "loss": 1.0206, + "step": 36260 + }, + { + "epoch": 10.85, + "grad_norm": 3.638197660446167, + "learning_rate": 2.1685592671742626e-05, + "loss": 1.1491, + "step": 36265 + }, + { + "epoch": 10.85, + "grad_norm": 1.3871898651123047, + "learning_rate": 2.167976941428297e-05, + "loss": 1.109, + "step": 36270 + }, + { + "epoch": 10.85, + "grad_norm": 2.8166041374206543, + "learning_rate": 2.167394634019621e-05, + "loss": 1.0765, + "step": 36275 + }, + { + "epoch": 10.85, + "grad_norm": 2.0707552433013916, + "learning_rate": 2.1668123449803924e-05, + "loss": 1.0585, + "step": 36280 + }, + { + "epoch": 10.86, + "grad_norm": 3.0661659240722656, + "learning_rate": 2.1662300743427727e-05, + "loss": 1.0604, + "step": 36285 + }, + { + "epoch": 10.86, + "grad_norm": 2.1380584239959717, + "learning_rate": 2.1656478221389183e-05, + "loss": 1.0168, + "step": 36290 + }, + { + "epoch": 10.86, + "grad_norm": 2.518028974533081, + "learning_rate": 2.1650655884009875e-05, + "loss": 1.2193, + "step": 36295 + }, + { + "epoch": 10.86, + "grad_norm": 1.8819745779037476, + "learning_rate": 2.1644833731611366e-05, + "loss": 0.9329, + "step": 36300 + }, + { + "epoch": 10.86, + "grad_norm": 3.2668797969818115, + "learning_rate": 2.163901176451519e-05, + "loss": 0.9986, + "step": 36305 + }, + { + "epoch": 10.86, + "grad_norm": 2.280186653137207, + "learning_rate": 2.1633189983042916e-05, + "loss": 0.8617, + "step": 36310 + }, + { + "epoch": 10.87, + "grad_norm": 1.871981143951416, + "learning_rate": 2.1627368387516043e-05, + "loss": 0.9601, + "step": 36315 + }, + { + "epoch": 10.87, + "grad_norm": 2.4408321380615234, + "learning_rate": 2.162154697825612e-05, + "loss": 1.0367, + "step": 36320 + }, + { + "epoch": 10.87, + "grad_norm": 8.324357986450195, + "learning_rate": 2.1615725755584636e-05, + "loss": 1.1349, + "step": 36325 + }, + { + "epoch": 10.87, + "grad_norm": 3.0855600833892822, + "learning_rate": 2.1609904719823116e-05, + "loss": 1.0233, + "step": 36330 + }, + { + "epoch": 10.87, + "grad_norm": 1.6718279123306274, + "learning_rate": 2.1604083871293023e-05, + "loss": 1.1736, + "step": 36335 + }, + { + "epoch": 10.87, + "grad_norm": 2.869089365005493, + "learning_rate": 2.159826321031585e-05, + "loss": 1.0822, + "step": 36340 + }, + { + "epoch": 10.87, + "grad_norm": 6.643579483032227, + "learning_rate": 2.1592442737213064e-05, + "loss": 0.9564, + "step": 36345 + }, + { + "epoch": 10.88, + "grad_norm": 1.4432473182678223, + "learning_rate": 2.1586622452306125e-05, + "loss": 1.0499, + "step": 36350 + }, + { + "epoch": 10.88, + "grad_norm": 3.8422656059265137, + "learning_rate": 2.158080235591648e-05, + "loss": 0.7816, + "step": 36355 + }, + { + "epoch": 10.88, + "grad_norm": 2.3019320964813232, + "learning_rate": 2.1574982448365568e-05, + "loss": 1.1791, + "step": 36360 + }, + { + "epoch": 10.88, + "grad_norm": 2.5562939643859863, + "learning_rate": 2.1569162729974806e-05, + "loss": 1.1605, + "step": 36365 + }, + { + "epoch": 10.88, + "grad_norm": 6.145001411437988, + "learning_rate": 2.1563343201065633e-05, + "loss": 1.1362, + "step": 36370 + }, + { + "epoch": 10.88, + "grad_norm": 3.0975069999694824, + "learning_rate": 2.1557523861959434e-05, + "loss": 0.8223, + "step": 36375 + }, + { + "epoch": 10.88, + "grad_norm": 3.5439906120300293, + "learning_rate": 2.1551704712977623e-05, + "loss": 1.0676, + "step": 36380 + }, + { + "epoch": 10.89, + "grad_norm": 1.0710362195968628, + "learning_rate": 2.1545885754441562e-05, + "loss": 1.1736, + "step": 36385 + }, + { + "epoch": 10.89, + "grad_norm": 8.471335411071777, + "learning_rate": 2.1540066986672655e-05, + "loss": 0.8974, + "step": 36390 + }, + { + "epoch": 10.89, + "grad_norm": 3.0269336700439453, + "learning_rate": 2.153424840999225e-05, + "loss": 1.0499, + "step": 36395 + }, + { + "epoch": 10.89, + "grad_norm": 3.6303861141204834, + "learning_rate": 2.1528430024721694e-05, + "loss": 1.0851, + "step": 36400 + }, + { + "epoch": 10.89, + "grad_norm": 4.8882293701171875, + "learning_rate": 2.1522611831182348e-05, + "loss": 0.8858, + "step": 36405 + }, + { + "epoch": 10.89, + "grad_norm": 2.3631794452667236, + "learning_rate": 2.1516793829695526e-05, + "loss": 1.1782, + "step": 36410 + }, + { + "epoch": 10.89, + "grad_norm": 3.9514851570129395, + "learning_rate": 2.151097602058257e-05, + "loss": 1.195, + "step": 36415 + }, + { + "epoch": 10.9, + "grad_norm": 2.633768320083618, + "learning_rate": 2.1505158404164773e-05, + "loss": 1.1729, + "step": 36420 + }, + { + "epoch": 10.9, + "grad_norm": 6.990433692932129, + "learning_rate": 2.1499340980763456e-05, + "loss": 0.9894, + "step": 36425 + }, + { + "epoch": 10.9, + "grad_norm": 2.7082440853118896, + "learning_rate": 2.149352375069989e-05, + "loss": 1.0238, + "step": 36430 + }, + { + "epoch": 10.9, + "grad_norm": 3.5102274417877197, + "learning_rate": 2.1487706714295357e-05, + "loss": 0.9571, + "step": 36435 + }, + { + "epoch": 10.9, + "grad_norm": 2.7834293842315674, + "learning_rate": 2.1481889871871143e-05, + "loss": 1.1934, + "step": 36440 + }, + { + "epoch": 10.9, + "grad_norm": 1.7707253694534302, + "learning_rate": 2.1476073223748485e-05, + "loss": 1.0634, + "step": 36445 + }, + { + "epoch": 10.91, + "grad_norm": 1.9651144742965698, + "learning_rate": 2.147025677024865e-05, + "loss": 1.3354, + "step": 36450 + }, + { + "epoch": 10.91, + "grad_norm": 4.304943084716797, + "learning_rate": 2.146444051169285e-05, + "loss": 0.9292, + "step": 36455 + }, + { + "epoch": 10.91, + "grad_norm": 3.199054479598999, + "learning_rate": 2.1458624448402343e-05, + "loss": 0.9946, + "step": 36460 + }, + { + "epoch": 10.91, + "grad_norm": 1.4956828355789185, + "learning_rate": 2.1452808580698325e-05, + "loss": 0.9661, + "step": 36465 + }, + { + "epoch": 10.91, + "grad_norm": 1.1398528814315796, + "learning_rate": 2.144699290890199e-05, + "loss": 1.0639, + "step": 36470 + }, + { + "epoch": 10.91, + "grad_norm": 7.341798782348633, + "learning_rate": 2.1441177433334552e-05, + "loss": 0.8947, + "step": 36475 + }, + { + "epoch": 10.91, + "grad_norm": 3.4453506469726562, + "learning_rate": 2.1435362154317176e-05, + "loss": 1.0649, + "step": 36480 + }, + { + "epoch": 10.92, + "grad_norm": 0.8624611496925354, + "learning_rate": 2.1429547072171057e-05, + "loss": 1.1476, + "step": 36485 + }, + { + "epoch": 10.92, + "grad_norm": 5.505244255065918, + "learning_rate": 2.1423732187217338e-05, + "loss": 0.9781, + "step": 36490 + }, + { + "epoch": 10.92, + "grad_norm": 3.857640504837036, + "learning_rate": 2.1417917499777164e-05, + "loss": 1.0794, + "step": 36495 + }, + { + "epoch": 10.92, + "grad_norm": 3.2706289291381836, + "learning_rate": 2.1412103010171693e-05, + "loss": 1.0661, + "step": 36500 + }, + { + "epoch": 10.92, + "grad_norm": 2.5430166721343994, + "learning_rate": 2.140628871872203e-05, + "loss": 1.0644, + "step": 36505 + }, + { + "epoch": 10.92, + "grad_norm": 3.8892323970794678, + "learning_rate": 2.140047462574932e-05, + "loss": 0.9569, + "step": 36510 + }, + { + "epoch": 10.92, + "grad_norm": 5.289989948272705, + "learning_rate": 2.1394660731574643e-05, + "loss": 1.04, + "step": 36515 + }, + { + "epoch": 10.93, + "grad_norm": 1.7710543870925903, + "learning_rate": 2.138884703651911e-05, + "loss": 1.024, + "step": 36520 + }, + { + "epoch": 10.93, + "grad_norm": 7.825911998748779, + "learning_rate": 2.138303354090381e-05, + "loss": 0.9066, + "step": 36525 + }, + { + "epoch": 10.93, + "grad_norm": 1.9190192222595215, + "learning_rate": 2.1377220245049793e-05, + "loss": 0.8708, + "step": 36530 + }, + { + "epoch": 10.93, + "grad_norm": 2.441214084625244, + "learning_rate": 2.1371407149278152e-05, + "loss": 0.9889, + "step": 36535 + }, + { + "epoch": 10.93, + "grad_norm": 5.316892623901367, + "learning_rate": 2.136559425390991e-05, + "loss": 1.0597, + "step": 36540 + }, + { + "epoch": 10.93, + "grad_norm": 5.2226667404174805, + "learning_rate": 2.135978155926613e-05, + "loss": 1.0606, + "step": 36545 + }, + { + "epoch": 10.94, + "grad_norm": 3.5756514072418213, + "learning_rate": 2.135396906566782e-05, + "loss": 1.1334, + "step": 36550 + }, + { + "epoch": 10.94, + "grad_norm": 1.187839150428772, + "learning_rate": 2.134815677343602e-05, + "loss": 1.2043, + "step": 36555 + }, + { + "epoch": 10.94, + "grad_norm": 5.102145671844482, + "learning_rate": 2.1342344682891722e-05, + "loss": 0.7688, + "step": 36560 + }, + { + "epoch": 10.94, + "grad_norm": 4.2323527336120605, + "learning_rate": 2.133653279435592e-05, + "loss": 1.158, + "step": 36565 + }, + { + "epoch": 10.94, + "grad_norm": 2.2554049491882324, + "learning_rate": 2.1330721108149614e-05, + "loss": 0.8822, + "step": 36570 + }, + { + "epoch": 10.94, + "grad_norm": 2.766228437423706, + "learning_rate": 2.132490962459375e-05, + "loss": 1.0252, + "step": 36575 + }, + { + "epoch": 10.94, + "grad_norm": 5.992697238922119, + "learning_rate": 2.1319098344009325e-05, + "loss": 1.0864, + "step": 36580 + }, + { + "epoch": 10.95, + "grad_norm": 1.6674883365631104, + "learning_rate": 2.1313287266717267e-05, + "loss": 0.9115, + "step": 36585 + }, + { + "epoch": 10.95, + "grad_norm": 1.9288809299468994, + "learning_rate": 2.1307476393038517e-05, + "loss": 1.0394, + "step": 36590 + }, + { + "epoch": 10.95, + "grad_norm": 1.3914613723754883, + "learning_rate": 2.1301665723294008e-05, + "loss": 0.9237, + "step": 36595 + }, + { + "epoch": 10.95, + "grad_norm": 2.7078752517700195, + "learning_rate": 2.129585525780466e-05, + "loss": 0.9573, + "step": 36600 + }, + { + "epoch": 10.95, + "grad_norm": 1.410092830657959, + "learning_rate": 2.1290044996891373e-05, + "loss": 0.9738, + "step": 36605 + }, + { + "epoch": 10.95, + "grad_norm": 2.085670232772827, + "learning_rate": 2.1284234940875045e-05, + "loss": 1.0609, + "step": 36610 + }, + { + "epoch": 10.95, + "grad_norm": 3.979780435562134, + "learning_rate": 2.1278425090076558e-05, + "loss": 1.1589, + "step": 36615 + }, + { + "epoch": 10.96, + "grad_norm": 4.352084636688232, + "learning_rate": 2.1272615444816792e-05, + "loss": 1.0897, + "step": 36620 + }, + { + "epoch": 10.96, + "grad_norm": 3.6647934913635254, + "learning_rate": 2.126680600541659e-05, + "loss": 1.0315, + "step": 36625 + }, + { + "epoch": 10.96, + "grad_norm": 4.659989833831787, + "learning_rate": 2.126099677219682e-05, + "loss": 0.9488, + "step": 36630 + }, + { + "epoch": 10.96, + "grad_norm": 4.262448310852051, + "learning_rate": 2.1255187745478305e-05, + "loss": 1.18, + "step": 36635 + }, + { + "epoch": 10.96, + "grad_norm": 7.49601936340332, + "learning_rate": 2.124937892558189e-05, + "loss": 1.1229, + "step": 36640 + }, + { + "epoch": 10.96, + "grad_norm": 1.145183801651001, + "learning_rate": 2.1243570312828365e-05, + "loss": 0.9284, + "step": 36645 + }, + { + "epoch": 10.97, + "grad_norm": 2.557342290878296, + "learning_rate": 2.1237761907538556e-05, + "loss": 0.982, + "step": 36650 + }, + { + "epoch": 10.97, + "grad_norm": 2.6082074642181396, + "learning_rate": 2.1231953710033246e-05, + "loss": 1.0863, + "step": 36655 + }, + { + "epoch": 10.97, + "grad_norm": 3.32584810256958, + "learning_rate": 2.122614572063321e-05, + "loss": 1.0423, + "step": 36660 + }, + { + "epoch": 10.97, + "grad_norm": 2.8699169158935547, + "learning_rate": 2.122033793965923e-05, + "loss": 1.0971, + "step": 36665 + }, + { + "epoch": 10.97, + "grad_norm": 3.991990566253662, + "learning_rate": 2.1214530367432047e-05, + "loss": 1.1467, + "step": 36670 + }, + { + "epoch": 10.97, + "grad_norm": 2.663698673248291, + "learning_rate": 2.120872300427243e-05, + "loss": 0.9862, + "step": 36675 + }, + { + "epoch": 10.97, + "grad_norm": 3.4537861347198486, + "learning_rate": 2.1202915850501087e-05, + "loss": 1.1518, + "step": 36680 + }, + { + "epoch": 10.98, + "grad_norm": 3.4938299655914307, + "learning_rate": 2.1197108906438765e-05, + "loss": 1.1402, + "step": 36685 + }, + { + "epoch": 10.98, + "grad_norm": 2.6186256408691406, + "learning_rate": 2.119130217240616e-05, + "loss": 1.2347, + "step": 36690 + }, + { + "epoch": 10.98, + "grad_norm": 2.723661422729492, + "learning_rate": 2.118549564872398e-05, + "loss": 1.0232, + "step": 36695 + }, + { + "epoch": 10.98, + "grad_norm": 4.313886642456055, + "learning_rate": 2.1179689335712906e-05, + "loss": 1.2257, + "step": 36700 + }, + { + "epoch": 10.98, + "grad_norm": 1.8701528310775757, + "learning_rate": 2.1173883233693623e-05, + "loss": 1.207, + "step": 36705 + }, + { + "epoch": 10.98, + "grad_norm": 1.5927581787109375, + "learning_rate": 2.1168077342986793e-05, + "loss": 1.0767, + "step": 36710 + }, + { + "epoch": 10.98, + "grad_norm": 5.454744338989258, + "learning_rate": 2.116227166391307e-05, + "loss": 0.9561, + "step": 36715 + }, + { + "epoch": 10.99, + "grad_norm": 0.9375532269477844, + "learning_rate": 2.1156466196793086e-05, + "loss": 1.1743, + "step": 36720 + }, + { + "epoch": 10.99, + "grad_norm": 5.345798015594482, + "learning_rate": 2.1150660941947486e-05, + "loss": 1.0879, + "step": 36725 + }, + { + "epoch": 10.99, + "grad_norm": 2.450878381729126, + "learning_rate": 2.1144855899696873e-05, + "loss": 0.8149, + "step": 36730 + }, + { + "epoch": 10.99, + "grad_norm": 2.603236436843872, + "learning_rate": 2.113905107036187e-05, + "loss": 1.1418, + "step": 36735 + }, + { + "epoch": 10.99, + "grad_norm": 3.163844347000122, + "learning_rate": 2.1133246454263053e-05, + "loss": 1.0855, + "step": 36740 + }, + { + "epoch": 10.99, + "grad_norm": 2.926191568374634, + "learning_rate": 2.1127442051721026e-05, + "loss": 1.1282, + "step": 36745 + }, + { + "epoch": 11.0, + "grad_norm": 4.042135238647461, + "learning_rate": 2.112163786305635e-05, + "loss": 1.0874, + "step": 36750 + }, + { + "epoch": 11.0, + "grad_norm": 4.720858097076416, + "learning_rate": 2.1115833888589575e-05, + "loss": 1.306, + "step": 36755 + }, + { + "epoch": 11.0, + "grad_norm": 5.201600074768066, + "learning_rate": 2.1110030128641264e-05, + "loss": 1.0244, + "step": 36760 + }, + { + "epoch": 11.0, + "grad_norm": 4.97416353225708, + "learning_rate": 2.1104226583531936e-05, + "loss": 1.1876, + "step": 36765 + }, + { + "epoch": 11.0, + "grad_norm": 1.4578512907028198, + "learning_rate": 2.1098423253582136e-05, + "loss": 1.1209, + "step": 36770 + }, + { + "epoch": 11.0, + "grad_norm": 2.909250259399414, + "learning_rate": 2.109262013911235e-05, + "loss": 0.8941, + "step": 36775 + }, + { + "epoch": 11.0, + "grad_norm": 1.240099549293518, + "learning_rate": 2.108681724044311e-05, + "loss": 0.9776, + "step": 36780 + }, + { + "epoch": 11.01, + "grad_norm": 3.794100284576416, + "learning_rate": 2.1081014557894875e-05, + "loss": 1.0358, + "step": 36785 + }, + { + "epoch": 11.01, + "grad_norm": 1.7050538063049316, + "learning_rate": 2.1075212091788133e-05, + "loss": 0.8003, + "step": 36790 + }, + { + "epoch": 11.01, + "grad_norm": 1.4368036985397339, + "learning_rate": 2.106940984244335e-05, + "loss": 1.1668, + "step": 36795 + }, + { + "epoch": 11.01, + "grad_norm": 4.024328708648682, + "learning_rate": 2.1063607810180975e-05, + "loss": 1.0022, + "step": 36800 + }, + { + "epoch": 11.01, + "grad_norm": 2.9807326793670654, + "learning_rate": 2.1057805995321447e-05, + "loss": 0.9635, + "step": 36805 + }, + { + "epoch": 11.01, + "grad_norm": 2.2452774047851562, + "learning_rate": 2.1052004398185193e-05, + "loss": 1.0438, + "step": 36810 + }, + { + "epoch": 11.01, + "grad_norm": 2.678302764892578, + "learning_rate": 2.104620301909264e-05, + "loss": 0.9814, + "step": 36815 + }, + { + "epoch": 11.02, + "grad_norm": 1.3954975605010986, + "learning_rate": 2.104040185836419e-05, + "loss": 1.003, + "step": 36820 + }, + { + "epoch": 11.02, + "grad_norm": 1.7821259498596191, + "learning_rate": 2.1034600916320212e-05, + "loss": 1.1241, + "step": 36825 + }, + { + "epoch": 11.02, + "grad_norm": 2.254903793334961, + "learning_rate": 2.102880019328112e-05, + "loss": 1.0537, + "step": 36830 + }, + { + "epoch": 11.02, + "grad_norm": 3.2131004333496094, + "learning_rate": 2.102299968956725e-05, + "loss": 1.0781, + "step": 36835 + }, + { + "epoch": 11.02, + "grad_norm": 2.1577985286712646, + "learning_rate": 2.101719940549899e-05, + "loss": 0.9388, + "step": 36840 + }, + { + "epoch": 11.02, + "grad_norm": 3.0737853050231934, + "learning_rate": 2.1011399341396664e-05, + "loss": 0.9643, + "step": 36845 + }, + { + "epoch": 11.03, + "grad_norm": 3.187286853790283, + "learning_rate": 2.1005599497580596e-05, + "loss": 1.0258, + "step": 36850 + }, + { + "epoch": 11.03, + "grad_norm": 1.8270454406738281, + "learning_rate": 2.0999799874371124e-05, + "loss": 1.0375, + "step": 36855 + }, + { + "epoch": 11.03, + "grad_norm": 3.7559213638305664, + "learning_rate": 2.0994000472088537e-05, + "loss": 1.117, + "step": 36860 + }, + { + "epoch": 11.03, + "grad_norm": 2.0919082164764404, + "learning_rate": 2.0988201291053154e-05, + "loss": 1.0887, + "step": 36865 + }, + { + "epoch": 11.03, + "grad_norm": 2.3934216499328613, + "learning_rate": 2.098240233158523e-05, + "loss": 1.1399, + "step": 36870 + }, + { + "epoch": 11.03, + "grad_norm": 1.183286190032959, + "learning_rate": 2.0976603594005063e-05, + "loss": 1.0758, + "step": 36875 + }, + { + "epoch": 11.03, + "grad_norm": 2.7067835330963135, + "learning_rate": 2.0970805078632887e-05, + "loss": 1.2898, + "step": 36880 + }, + { + "epoch": 11.04, + "grad_norm": 2.4200856685638428, + "learning_rate": 2.0965006785788958e-05, + "loss": 0.9024, + "step": 36885 + }, + { + "epoch": 11.04, + "grad_norm": 1.7485568523406982, + "learning_rate": 2.0959208715793516e-05, + "loss": 1.0622, + "step": 36890 + }, + { + "epoch": 11.04, + "grad_norm": 1.9122002124786377, + "learning_rate": 2.095341086896677e-05, + "loss": 0.8777, + "step": 36895 + }, + { + "epoch": 11.04, + "grad_norm": 2.849827289581299, + "learning_rate": 2.0947613245628944e-05, + "loss": 0.9607, + "step": 36900 + }, + { + "epoch": 11.04, + "grad_norm": 2.4549102783203125, + "learning_rate": 2.0941815846100216e-05, + "loss": 0.8974, + "step": 36905 + }, + { + "epoch": 11.04, + "grad_norm": 6.144309997558594, + "learning_rate": 2.093601867070079e-05, + "loss": 1.0058, + "step": 36910 + }, + { + "epoch": 11.04, + "grad_norm": 2.7253663539886475, + "learning_rate": 2.093022171975083e-05, + "loss": 1.0684, + "step": 36915 + }, + { + "epoch": 11.05, + "grad_norm": 7.483120918273926, + "learning_rate": 2.0924424993570485e-05, + "loss": 0.8457, + "step": 36920 + }, + { + "epoch": 11.05, + "grad_norm": 2.1432487964630127, + "learning_rate": 2.091862849247992e-05, + "loss": 0.9498, + "step": 36925 + }, + { + "epoch": 11.05, + "grad_norm": 1.383862853050232, + "learning_rate": 2.091283221679925e-05, + "loss": 1.0247, + "step": 36930 + }, + { + "epoch": 11.05, + "grad_norm": 2.251617431640625, + "learning_rate": 2.090703616684862e-05, + "loss": 1.0731, + "step": 36935 + }, + { + "epoch": 11.05, + "grad_norm": 2.736708879470825, + "learning_rate": 2.0901240342948128e-05, + "loss": 1.0303, + "step": 36940 + }, + { + "epoch": 11.05, + "grad_norm": 8.472616195678711, + "learning_rate": 2.089544474541786e-05, + "loss": 1.1477, + "step": 36945 + }, + { + "epoch": 11.06, + "grad_norm": 2.7550952434539795, + "learning_rate": 2.0889649374577923e-05, + "loss": 0.7887, + "step": 36950 + }, + { + "epoch": 11.06, + "grad_norm": 4.178884506225586, + "learning_rate": 2.088385423074837e-05, + "loss": 1.0598, + "step": 36955 + }, + { + "epoch": 11.06, + "grad_norm": 2.964043140411377, + "learning_rate": 2.087805931424927e-05, + "loss": 1.1823, + "step": 36960 + }, + { + "epoch": 11.06, + "grad_norm": 1.5841584205627441, + "learning_rate": 2.0872264625400673e-05, + "loss": 0.8966, + "step": 36965 + }, + { + "epoch": 11.06, + "grad_norm": 3.2998929023742676, + "learning_rate": 2.086647016452261e-05, + "loss": 1.2444, + "step": 36970 + }, + { + "epoch": 11.06, + "grad_norm": 9.047067642211914, + "learning_rate": 2.086067593193511e-05, + "loss": 1.1063, + "step": 36975 + }, + { + "epoch": 11.06, + "grad_norm": 4.006898403167725, + "learning_rate": 2.085488192795816e-05, + "loss": 1.0788, + "step": 36980 + }, + { + "epoch": 11.07, + "grad_norm": 1.3867641687393188, + "learning_rate": 2.084908815291179e-05, + "loss": 1.0392, + "step": 36985 + }, + { + "epoch": 11.07, + "grad_norm": 2.389448404312134, + "learning_rate": 2.084329460711595e-05, + "loss": 1.169, + "step": 36990 + }, + { + "epoch": 11.07, + "grad_norm": 6.467934608459473, + "learning_rate": 2.0837501290890644e-05, + "loss": 1.1412, + "step": 36995 + }, + { + "epoch": 11.07, + "grad_norm": 1.3220041990280151, + "learning_rate": 2.08317082045558e-05, + "loss": 1.0507, + "step": 37000 + }, + { + "epoch": 11.07, + "grad_norm": 2.8691465854644775, + "learning_rate": 2.082591534843139e-05, + "loss": 1.0901, + "step": 37005 + }, + { + "epoch": 11.07, + "grad_norm": 1.4073899984359741, + "learning_rate": 2.082012272283734e-05, + "loss": 1.1085, + "step": 37010 + }, + { + "epoch": 11.07, + "grad_norm": 1.7630438804626465, + "learning_rate": 2.0814330328093557e-05, + "loss": 1.072, + "step": 37015 + }, + { + "epoch": 11.08, + "grad_norm": 3.4774584770202637, + "learning_rate": 2.0808538164519965e-05, + "loss": 0.9849, + "step": 37020 + }, + { + "epoch": 11.08, + "grad_norm": 1.4681512117385864, + "learning_rate": 2.0802746232436445e-05, + "loss": 1.0581, + "step": 37025 + }, + { + "epoch": 11.08, + "grad_norm": 1.4620872735977173, + "learning_rate": 2.0796954532162898e-05, + "loss": 1.1402, + "step": 37030 + }, + { + "epoch": 11.08, + "grad_norm": 4.764907360076904, + "learning_rate": 2.0791163064019174e-05, + "loss": 0.9688, + "step": 37035 + }, + { + "epoch": 11.08, + "grad_norm": 1.992462396621704, + "learning_rate": 2.078537182832515e-05, + "loss": 0.8545, + "step": 37040 + }, + { + "epoch": 11.08, + "grad_norm": 2.524420976638794, + "learning_rate": 2.0779580825400653e-05, + "loss": 0.9803, + "step": 37045 + }, + { + "epoch": 11.08, + "grad_norm": 4.6814351081848145, + "learning_rate": 2.077379005556552e-05, + "loss": 1.0973, + "step": 37050 + }, + { + "epoch": 11.09, + "grad_norm": 1.9848724603652954, + "learning_rate": 2.076799951913957e-05, + "loss": 1.0625, + "step": 37055 + }, + { + "epoch": 11.09, + "grad_norm": 5.070384502410889, + "learning_rate": 2.0762209216442607e-05, + "loss": 1.1657, + "step": 37060 + }, + { + "epoch": 11.09, + "grad_norm": 3.4245998859405518, + "learning_rate": 2.0756419147794427e-05, + "loss": 0.7992, + "step": 37065 + }, + { + "epoch": 11.09, + "grad_norm": 3.5566506385803223, + "learning_rate": 2.075062931351481e-05, + "loss": 0.98, + "step": 37070 + }, + { + "epoch": 11.09, + "grad_norm": 1.4207732677459717, + "learning_rate": 2.0744839713923503e-05, + "loss": 1.1456, + "step": 37075 + }, + { + "epoch": 11.09, + "grad_norm": 1.3465211391448975, + "learning_rate": 2.073905034934029e-05, + "loss": 1.0347, + "step": 37080 + }, + { + "epoch": 11.1, + "grad_norm": 1.9427791833877563, + "learning_rate": 2.0733261220084886e-05, + "loss": 1.0611, + "step": 37085 + }, + { + "epoch": 11.1, + "grad_norm": 1.6937408447265625, + "learning_rate": 2.0727472326477043e-05, + "loss": 1.0662, + "step": 37090 + }, + { + "epoch": 11.1, + "grad_norm": 3.2311325073242188, + "learning_rate": 2.072168366883645e-05, + "loss": 1.085, + "step": 37095 + }, + { + "epoch": 11.1, + "grad_norm": 1.9753808975219727, + "learning_rate": 2.0715895247482833e-05, + "loss": 0.9925, + "step": 37100 + }, + { + "epoch": 11.1, + "grad_norm": 1.8392963409423828, + "learning_rate": 2.0710107062735867e-05, + "loss": 0.9266, + "step": 37105 + }, + { + "epoch": 11.1, + "grad_norm": 1.352810025215149, + "learning_rate": 2.0704319114915218e-05, + "loss": 1.0309, + "step": 37110 + }, + { + "epoch": 11.1, + "grad_norm": 2.322559118270874, + "learning_rate": 2.0698531404340573e-05, + "loss": 1.0988, + "step": 37115 + }, + { + "epoch": 11.11, + "grad_norm": 6.6522650718688965, + "learning_rate": 2.0692743931331554e-05, + "loss": 0.7917, + "step": 37120 + }, + { + "epoch": 11.11, + "grad_norm": 5.853671550750732, + "learning_rate": 2.068695669620782e-05, + "loss": 1.034, + "step": 37125 + }, + { + "epoch": 11.11, + "grad_norm": 2.2540881633758545, + "learning_rate": 2.0681169699288974e-05, + "loss": 1.0728, + "step": 37130 + }, + { + "epoch": 11.11, + "grad_norm": 3.5193233489990234, + "learning_rate": 2.067538294089465e-05, + "loss": 1.0295, + "step": 37135 + }, + { + "epoch": 11.11, + "grad_norm": 3.8422839641571045, + "learning_rate": 2.0669596421344422e-05, + "loss": 1.1198, + "step": 37140 + }, + { + "epoch": 11.11, + "grad_norm": 2.048434019088745, + "learning_rate": 2.0663810140957884e-05, + "loss": 1.0309, + "step": 37145 + }, + { + "epoch": 11.11, + "grad_norm": 1.556140661239624, + "learning_rate": 2.0658024100054608e-05, + "loss": 0.9682, + "step": 37150 + }, + { + "epoch": 11.12, + "grad_norm": 2.3426733016967773, + "learning_rate": 2.0652238298954142e-05, + "loss": 1.0461, + "step": 37155 + }, + { + "epoch": 11.12, + "grad_norm": 2.597688913345337, + "learning_rate": 2.0646452737976037e-05, + "loss": 1.1395, + "step": 37160 + }, + { + "epoch": 11.12, + "grad_norm": 2.909790277481079, + "learning_rate": 2.0640667417439826e-05, + "loss": 1.0784, + "step": 37165 + }, + { + "epoch": 11.12, + "grad_norm": 4.795456409454346, + "learning_rate": 2.0634882337665018e-05, + "loss": 0.9231, + "step": 37170 + }, + { + "epoch": 11.12, + "grad_norm": 4.892722129821777, + "learning_rate": 2.0629097498971128e-05, + "loss": 1.0418, + "step": 37175 + }, + { + "epoch": 11.12, + "grad_norm": 2.8806307315826416, + "learning_rate": 2.062331290167763e-05, + "loss": 1.1341, + "step": 37180 + }, + { + "epoch": 11.13, + "grad_norm": 2.0100202560424805, + "learning_rate": 2.061752854610402e-05, + "loss": 1.0645, + "step": 37185 + }, + { + "epoch": 11.13, + "grad_norm": 3.8369128704071045, + "learning_rate": 2.0611744432569743e-05, + "loss": 0.8733, + "step": 37190 + }, + { + "epoch": 11.13, + "grad_norm": 1.7996447086334229, + "learning_rate": 2.060596056139427e-05, + "loss": 1.0846, + "step": 37195 + }, + { + "epoch": 11.13, + "grad_norm": 2.4281742572784424, + "learning_rate": 2.060017693289703e-05, + "loss": 0.9427, + "step": 37200 + }, + { + "epoch": 11.13, + "grad_norm": 3.646941900253296, + "learning_rate": 2.0594393547397432e-05, + "loss": 0.9663, + "step": 37205 + }, + { + "epoch": 11.13, + "grad_norm": 2.000669240951538, + "learning_rate": 2.058861040521491e-05, + "loss": 1.1134, + "step": 37210 + }, + { + "epoch": 11.13, + "grad_norm": 3.394282579421997, + "learning_rate": 2.058282750666884e-05, + "loss": 1.0419, + "step": 37215 + }, + { + "epoch": 11.14, + "grad_norm": 1.8753750324249268, + "learning_rate": 2.0577044852078624e-05, + "loss": 0.9423, + "step": 37220 + }, + { + "epoch": 11.14, + "grad_norm": 1.0014708042144775, + "learning_rate": 2.0571262441763613e-05, + "loss": 1.1235, + "step": 37225 + }, + { + "epoch": 11.14, + "grad_norm": 6.966102600097656, + "learning_rate": 2.0565480276043186e-05, + "loss": 1.0036, + "step": 37230 + }, + { + "epoch": 11.14, + "grad_norm": 3.9602415561676025, + "learning_rate": 2.055969835523667e-05, + "loss": 1.0308, + "step": 37235 + }, + { + "epoch": 11.14, + "grad_norm": 13.549233436584473, + "learning_rate": 2.0553916679663394e-05, + "loss": 1.1082, + "step": 37240 + }, + { + "epoch": 11.14, + "grad_norm": 3.726292610168457, + "learning_rate": 2.0548135249642683e-05, + "loss": 1.1484, + "step": 37245 + }, + { + "epoch": 11.14, + "grad_norm": 2.3705708980560303, + "learning_rate": 2.054235406549383e-05, + "loss": 1.0761, + "step": 37250 + }, + { + "epoch": 11.15, + "grad_norm": 2.891065835952759, + "learning_rate": 2.0536573127536133e-05, + "loss": 1.0185, + "step": 37255 + }, + { + "epoch": 11.15, + "grad_norm": 1.9550832509994507, + "learning_rate": 2.053079243608886e-05, + "loss": 1.1753, + "step": 37260 + }, + { + "epoch": 11.15, + "grad_norm": 4.056787490844727, + "learning_rate": 2.052501199147128e-05, + "loss": 0.965, + "step": 37265 + }, + { + "epoch": 11.15, + "grad_norm": 4.4339494705200195, + "learning_rate": 2.0519231794002637e-05, + "loss": 1.0662, + "step": 37270 + }, + { + "epoch": 11.15, + "grad_norm": 1.7741667032241821, + "learning_rate": 2.0513451844002154e-05, + "loss": 1.0956, + "step": 37275 + }, + { + "epoch": 11.15, + "grad_norm": 7.996791839599609, + "learning_rate": 2.0507672141789074e-05, + "loss": 0.7328, + "step": 37280 + }, + { + "epoch": 11.16, + "grad_norm": 1.697172999382019, + "learning_rate": 2.050189268768258e-05, + "loss": 1.0675, + "step": 37285 + }, + { + "epoch": 11.16, + "grad_norm": 1.4075742959976196, + "learning_rate": 2.049611348200189e-05, + "loss": 1.0553, + "step": 37290 + }, + { + "epoch": 11.16, + "grad_norm": 2.2331416606903076, + "learning_rate": 2.0490334525066172e-05, + "loss": 0.9881, + "step": 37295 + }, + { + "epoch": 11.16, + "grad_norm": 1.8239549398422241, + "learning_rate": 2.0484555817194576e-05, + "loss": 1.1081, + "step": 37300 + }, + { + "epoch": 11.16, + "grad_norm": 3.33955454826355, + "learning_rate": 2.047877735870628e-05, + "loss": 1.0621, + "step": 37305 + }, + { + "epoch": 11.16, + "grad_norm": 4.439455509185791, + "learning_rate": 2.0472999149920403e-05, + "loss": 0.8967, + "step": 37310 + }, + { + "epoch": 11.16, + "grad_norm": 1.3203307390213013, + "learning_rate": 2.0467221191156085e-05, + "loss": 1.1234, + "step": 37315 + }, + { + "epoch": 11.17, + "grad_norm": 1.8145140409469604, + "learning_rate": 2.046144348273242e-05, + "loss": 0.8947, + "step": 37320 + }, + { + "epoch": 11.17, + "grad_norm": 1.8895070552825928, + "learning_rate": 2.0455666024968527e-05, + "loss": 0.8826, + "step": 37325 + }, + { + "epoch": 11.17, + "grad_norm": 2.4296302795410156, + "learning_rate": 2.0449888818183465e-05, + "loss": 1.1869, + "step": 37330 + }, + { + "epoch": 11.17, + "grad_norm": 6.826413154602051, + "learning_rate": 2.0444111862696314e-05, + "loss": 0.9804, + "step": 37335 + }, + { + "epoch": 11.17, + "grad_norm": 3.0793893337249756, + "learning_rate": 2.0438335158826134e-05, + "loss": 1.1888, + "step": 37340 + }, + { + "epoch": 11.17, + "grad_norm": 5.991076469421387, + "learning_rate": 2.043255870689196e-05, + "loss": 1.0324, + "step": 37345 + }, + { + "epoch": 11.17, + "grad_norm": 4.768267631530762, + "learning_rate": 2.0426782507212822e-05, + "loss": 0.9743, + "step": 37350 + }, + { + "epoch": 11.18, + "grad_norm": 2.971740245819092, + "learning_rate": 2.0421006560107726e-05, + "loss": 1.0149, + "step": 37355 + }, + { + "epoch": 11.18, + "grad_norm": 4.107237339019775, + "learning_rate": 2.041523086589569e-05, + "loss": 1.147, + "step": 37360 + }, + { + "epoch": 11.18, + "grad_norm": 2.612445116043091, + "learning_rate": 2.0409455424895686e-05, + "loss": 0.9139, + "step": 37365 + }, + { + "epoch": 11.18, + "grad_norm": 1.6917109489440918, + "learning_rate": 2.0403680237426677e-05, + "loss": 1.003, + "step": 37370 + }, + { + "epoch": 11.18, + "grad_norm": 24.12595558166504, + "learning_rate": 2.0397905303807642e-05, + "loss": 0.9988, + "step": 37375 + }, + { + "epoch": 11.18, + "grad_norm": 3.371530532836914, + "learning_rate": 2.0392130624357502e-05, + "loss": 0.9194, + "step": 37380 + }, + { + "epoch": 11.19, + "grad_norm": 1.867092490196228, + "learning_rate": 2.038635619939521e-05, + "loss": 1.015, + "step": 37385 + }, + { + "epoch": 11.19, + "grad_norm": 1.5033526420593262, + "learning_rate": 2.0380582029239655e-05, + "loss": 1.0523, + "step": 37390 + }, + { + "epoch": 11.19, + "grad_norm": 1.303634524345398, + "learning_rate": 2.0374808114209767e-05, + "loss": 0.9923, + "step": 37395 + }, + { + "epoch": 11.19, + "grad_norm": 0.9598073363304138, + "learning_rate": 2.036903445462442e-05, + "loss": 0.8324, + "step": 37400 + }, + { + "epoch": 11.19, + "grad_norm": 2.9755938053131104, + "learning_rate": 2.0363261050802473e-05, + "loss": 1.0322, + "step": 37405 + }, + { + "epoch": 11.19, + "grad_norm": 2.0213468074798584, + "learning_rate": 2.0357487903062805e-05, + "loss": 1.1045, + "step": 37410 + }, + { + "epoch": 11.19, + "grad_norm": 2.2053709030151367, + "learning_rate": 2.0351715011724255e-05, + "loss": 1.0586, + "step": 37415 + }, + { + "epoch": 11.2, + "grad_norm": 2.988892078399658, + "learning_rate": 2.0345942377105654e-05, + "loss": 0.9125, + "step": 37420 + }, + { + "epoch": 11.2, + "grad_norm": 1.9951380491256714, + "learning_rate": 2.034016999952582e-05, + "loss": 1.0415, + "step": 37425 + }, + { + "epoch": 11.2, + "grad_norm": 2.1068739891052246, + "learning_rate": 2.0334397879303545e-05, + "loss": 1.0635, + "step": 37430 + }, + { + "epoch": 11.2, + "grad_norm": 1.8847640752792358, + "learning_rate": 2.032862601675764e-05, + "loss": 1.1524, + "step": 37435 + }, + { + "epoch": 11.2, + "grad_norm": 2.446730375289917, + "learning_rate": 2.0322854412206848e-05, + "loss": 1.1736, + "step": 37440 + }, + { + "epoch": 11.2, + "grad_norm": 3.1144163608551025, + "learning_rate": 2.0317083065969957e-05, + "loss": 1.0448, + "step": 37445 + }, + { + "epoch": 11.2, + "grad_norm": 1.5927165746688843, + "learning_rate": 2.0311311978365694e-05, + "loss": 0.8874, + "step": 37450 + }, + { + "epoch": 11.21, + "grad_norm": 2.248284101486206, + "learning_rate": 2.0305541149712802e-05, + "loss": 1.134, + "step": 37455 + }, + { + "epoch": 11.21, + "grad_norm": 5.157928943634033, + "learning_rate": 2.0299770580329997e-05, + "loss": 1.1917, + "step": 37460 + }, + { + "epoch": 11.21, + "grad_norm": 2.8743419647216797, + "learning_rate": 2.0294000270535963e-05, + "loss": 1.0266, + "step": 37465 + }, + { + "epoch": 11.21, + "grad_norm": 2.73984956741333, + "learning_rate": 2.0288230220649415e-05, + "loss": 0.9329, + "step": 37470 + }, + { + "epoch": 11.21, + "grad_norm": 1.6899389028549194, + "learning_rate": 2.0282460430989003e-05, + "loss": 0.9871, + "step": 37475 + }, + { + "epoch": 11.21, + "grad_norm": 3.168191432952881, + "learning_rate": 2.027669090187341e-05, + "loss": 1.0343, + "step": 37480 + }, + { + "epoch": 11.22, + "grad_norm": 4.179646968841553, + "learning_rate": 2.0270921633621255e-05, + "loss": 0.9998, + "step": 37485 + }, + { + "epoch": 11.22, + "grad_norm": 2.1786577701568604, + "learning_rate": 2.0265152626551195e-05, + "loss": 1.2523, + "step": 37490 + }, + { + "epoch": 11.22, + "grad_norm": 2.7645535469055176, + "learning_rate": 2.025938388098183e-05, + "loss": 1.078, + "step": 37495 + }, + { + "epoch": 11.22, + "grad_norm": 4.356101036071777, + "learning_rate": 2.0253615397231764e-05, + "loss": 1.0382, + "step": 37500 + }, + { + "epoch": 11.22, + "grad_norm": 1.3754960298538208, + "learning_rate": 2.0247847175619584e-05, + "loss": 1.1546, + "step": 37505 + }, + { + "epoch": 11.22, + "grad_norm": 1.7943824529647827, + "learning_rate": 2.024207921646387e-05, + "loss": 1.0802, + "step": 37510 + }, + { + "epoch": 11.22, + "grad_norm": 2.5919864177703857, + "learning_rate": 2.023631152008317e-05, + "loss": 1.0181, + "step": 37515 + }, + { + "epoch": 11.23, + "grad_norm": 3.6458067893981934, + "learning_rate": 2.0230544086796045e-05, + "loss": 0.9573, + "step": 37520 + }, + { + "epoch": 11.23, + "grad_norm": 3.077378273010254, + "learning_rate": 2.0224776916920996e-05, + "loss": 1.2146, + "step": 37525 + }, + { + "epoch": 11.23, + "grad_norm": 3.0590150356292725, + "learning_rate": 2.0219010010776568e-05, + "loss": 0.8816, + "step": 37530 + }, + { + "epoch": 11.23, + "grad_norm": 2.068225145339966, + "learning_rate": 2.0213243368681234e-05, + "loss": 0.929, + "step": 37535 + }, + { + "epoch": 11.23, + "grad_norm": 3.8140151500701904, + "learning_rate": 2.0207476990953505e-05, + "loss": 0.8558, + "step": 37540 + }, + { + "epoch": 11.23, + "grad_norm": 2.6358039379119873, + "learning_rate": 2.0201710877911832e-05, + "loss": 1.1409, + "step": 37545 + }, + { + "epoch": 11.23, + "grad_norm": 3.132976531982422, + "learning_rate": 2.0195945029874687e-05, + "loss": 0.9641, + "step": 37550 + }, + { + "epoch": 11.24, + "grad_norm": 5.017673015594482, + "learning_rate": 2.0190179447160505e-05, + "loss": 1.0201, + "step": 37555 + }, + { + "epoch": 11.24, + "grad_norm": 2.018449306488037, + "learning_rate": 2.0184414130087704e-05, + "loss": 1.1206, + "step": 37560 + }, + { + "epoch": 11.24, + "grad_norm": 5.14389705657959, + "learning_rate": 2.0178649078974714e-05, + "loss": 1.0431, + "step": 37565 + }, + { + "epoch": 11.24, + "grad_norm": 1.725806713104248, + "learning_rate": 2.0172884294139917e-05, + "loss": 0.9678, + "step": 37570 + }, + { + "epoch": 11.24, + "grad_norm": 1.3185760974884033, + "learning_rate": 2.0167119775901706e-05, + "loss": 0.8791, + "step": 37575 + }, + { + "epoch": 11.24, + "grad_norm": 2.4314846992492676, + "learning_rate": 2.016135552457844e-05, + "loss": 0.9986, + "step": 37580 + }, + { + "epoch": 11.24, + "grad_norm": 3.166346311569214, + "learning_rate": 2.015559154048849e-05, + "loss": 1.1143, + "step": 37585 + }, + { + "epoch": 11.25, + "grad_norm": 2.213935136795044, + "learning_rate": 2.014982782395018e-05, + "loss": 0.9987, + "step": 37590 + }, + { + "epoch": 11.25, + "grad_norm": 1.993200659751892, + "learning_rate": 2.0144064375281834e-05, + "loss": 1.0445, + "step": 37595 + }, + { + "epoch": 11.25, + "grad_norm": 3.238537073135376, + "learning_rate": 2.0138301194801768e-05, + "loss": 1.1212, + "step": 37600 + }, + { + "epoch": 11.25, + "grad_norm": 5.660292625427246, + "learning_rate": 2.0132538282828273e-05, + "loss": 1.1843, + "step": 37605 + }, + { + "epoch": 11.25, + "grad_norm": 2.940760374069214, + "learning_rate": 2.012677563967963e-05, + "loss": 1.1591, + "step": 37610 + }, + { + "epoch": 11.25, + "grad_norm": 2.0081498622894287, + "learning_rate": 2.0121013265674095e-05, + "loss": 1.0123, + "step": 37615 + }, + { + "epoch": 11.26, + "grad_norm": 2.4187111854553223, + "learning_rate": 2.011525116112994e-05, + "loss": 1.019, + "step": 37620 + }, + { + "epoch": 11.26, + "grad_norm": 2.3297410011291504, + "learning_rate": 2.0109489326365384e-05, + "loss": 1.0562, + "step": 37625 + }, + { + "epoch": 11.26, + "grad_norm": 4.753705024719238, + "learning_rate": 2.0103727761698636e-05, + "loss": 1.0245, + "step": 37630 + }, + { + "epoch": 11.26, + "grad_norm": 2.0567519664764404, + "learning_rate": 2.0097966467447926e-05, + "loss": 1.1675, + "step": 37635 + }, + { + "epoch": 11.26, + "grad_norm": 4.220091342926025, + "learning_rate": 2.0092205443931422e-05, + "loss": 0.7716, + "step": 37640 + }, + { + "epoch": 11.26, + "grad_norm": 4.092503547668457, + "learning_rate": 2.0086444691467325e-05, + "loss": 1.083, + "step": 37645 + }, + { + "epoch": 11.26, + "grad_norm": 2.5490052700042725, + "learning_rate": 2.0080684210373777e-05, + "loss": 1.3257, + "step": 37650 + }, + { + "epoch": 11.27, + "grad_norm": 2.8553218841552734, + "learning_rate": 2.0074924000968913e-05, + "loss": 0.8709, + "step": 37655 + }, + { + "epoch": 11.27, + "grad_norm": 12.186079978942871, + "learning_rate": 2.0069164063570896e-05, + "loss": 1.0308, + "step": 37660 + }, + { + "epoch": 11.27, + "grad_norm": 2.453186511993408, + "learning_rate": 2.0063404398497803e-05, + "loss": 1.2019, + "step": 37665 + }, + { + "epoch": 11.27, + "grad_norm": 1.8696728944778442, + "learning_rate": 2.0057645006067767e-05, + "loss": 0.9861, + "step": 37670 + }, + { + "epoch": 11.27, + "grad_norm": 2.764322519302368, + "learning_rate": 2.0051885886598855e-05, + "loss": 1.0721, + "step": 37675 + }, + { + "epoch": 11.27, + "grad_norm": 3.057974338531494, + "learning_rate": 2.0046127040409145e-05, + "loss": 1.0609, + "step": 37680 + }, + { + "epoch": 11.27, + "grad_norm": 1.6020811796188354, + "learning_rate": 2.0040368467816688e-05, + "loss": 0.8825, + "step": 37685 + }, + { + "epoch": 11.28, + "grad_norm": 1.326548457145691, + "learning_rate": 2.0034610169139527e-05, + "loss": 1.1104, + "step": 37690 + }, + { + "epoch": 11.28, + "grad_norm": 3.1496689319610596, + "learning_rate": 2.002885214469568e-05, + "loss": 1.106, + "step": 37695 + }, + { + "epoch": 11.28, + "grad_norm": 2.7148725986480713, + "learning_rate": 2.002309439480316e-05, + "loss": 0.9303, + "step": 37700 + }, + { + "epoch": 11.28, + "grad_norm": 2.3015408515930176, + "learning_rate": 2.0017336919779973e-05, + "loss": 0.9726, + "step": 37705 + }, + { + "epoch": 11.28, + "grad_norm": 6.830209732055664, + "learning_rate": 2.001157971994408e-05, + "loss": 1.008, + "step": 37710 + }, + { + "epoch": 11.28, + "grad_norm": 6.348620891571045, + "learning_rate": 2.000582279561346e-05, + "loss": 0.8794, + "step": 37715 + }, + { + "epoch": 11.29, + "grad_norm": 2.9782297611236572, + "learning_rate": 2.000006614710606e-05, + "loss": 0.96, + "step": 37720 + }, + { + "epoch": 11.29, + "grad_norm": 3.9869275093078613, + "learning_rate": 1.9994309774739797e-05, + "loss": 1.0065, + "step": 37725 + }, + { + "epoch": 11.29, + "grad_norm": 9.005064010620117, + "learning_rate": 1.9988553678832612e-05, + "loss": 1.1601, + "step": 37730 + }, + { + "epoch": 11.29, + "grad_norm": 3.4525146484375, + "learning_rate": 1.998279785970239e-05, + "loss": 1.2198, + "step": 37735 + }, + { + "epoch": 11.29, + "grad_norm": 3.2368991374969482, + "learning_rate": 1.997704231766704e-05, + "loss": 1.0094, + "step": 37740 + }, + { + "epoch": 11.29, + "grad_norm": 2.183854341506958, + "learning_rate": 1.9971287053044406e-05, + "loss": 1.1691, + "step": 37745 + }, + { + "epoch": 11.29, + "grad_norm": 1.5113462209701538, + "learning_rate": 1.9965532066152373e-05, + "loss": 1.0431, + "step": 37750 + }, + { + "epoch": 11.3, + "grad_norm": 3.4166901111602783, + "learning_rate": 1.9959777357308772e-05, + "loss": 0.9979, + "step": 37755 + }, + { + "epoch": 11.3, + "grad_norm": 1.2922513484954834, + "learning_rate": 1.9954022926831416e-05, + "loss": 1.0215, + "step": 37760 + }, + { + "epoch": 11.3, + "grad_norm": 3.349341630935669, + "learning_rate": 1.994826877503814e-05, + "loss": 0.9885, + "step": 37765 + }, + { + "epoch": 11.3, + "grad_norm": 9.634007453918457, + "learning_rate": 1.994251490224672e-05, + "loss": 0.9638, + "step": 37770 + }, + { + "epoch": 11.3, + "grad_norm": 2.8767809867858887, + "learning_rate": 1.993676130877495e-05, + "loss": 1.0608, + "step": 37775 + }, + { + "epoch": 11.3, + "grad_norm": 1.4088268280029297, + "learning_rate": 1.9931007994940592e-05, + "loss": 0.9073, + "step": 37780 + }, + { + "epoch": 11.3, + "grad_norm": 3.421081066131592, + "learning_rate": 1.9925254961061385e-05, + "loss": 1.1258, + "step": 37785 + }, + { + "epoch": 11.31, + "grad_norm": 3.029031991958618, + "learning_rate": 1.9919502207455083e-05, + "loss": 1.053, + "step": 37790 + }, + { + "epoch": 11.31, + "grad_norm": 3.345547676086426, + "learning_rate": 1.991374973443938e-05, + "loss": 1.0845, + "step": 37795 + }, + { + "epoch": 11.31, + "grad_norm": 3.7649970054626465, + "learning_rate": 1.9907997542332006e-05, + "loss": 0.9521, + "step": 37800 + }, + { + "epoch": 11.31, + "grad_norm": 5.937236785888672, + "learning_rate": 1.990224563145062e-05, + "loss": 1.0654, + "step": 37805 + }, + { + "epoch": 11.31, + "grad_norm": 3.4234955310821533, + "learning_rate": 1.9896494002112926e-05, + "loss": 0.981, + "step": 37810 + }, + { + "epoch": 11.31, + "grad_norm": 1.7007039785385132, + "learning_rate": 1.989074265463656e-05, + "loss": 0.8819, + "step": 37815 + }, + { + "epoch": 11.32, + "grad_norm": 1.6560670137405396, + "learning_rate": 1.9884991589339157e-05, + "loss": 0.943, + "step": 37820 + }, + { + "epoch": 11.32, + "grad_norm": 5.352659225463867, + "learning_rate": 1.987924080653836e-05, + "loss": 0.8836, + "step": 37825 + }, + { + "epoch": 11.32, + "grad_norm": 3.438709259033203, + "learning_rate": 1.9873490306551762e-05, + "loss": 1.086, + "step": 37830 + }, + { + "epoch": 11.32, + "grad_norm": 1.2173386812210083, + "learning_rate": 1.9867740089696976e-05, + "loss": 1.1148, + "step": 37835 + }, + { + "epoch": 11.32, + "grad_norm": 1.3468738794326782, + "learning_rate": 1.986199015629156e-05, + "loss": 0.9861, + "step": 37840 + }, + { + "epoch": 11.32, + "grad_norm": 3.865586280822754, + "learning_rate": 1.9856240506653097e-05, + "loss": 0.9005, + "step": 37845 + }, + { + "epoch": 11.32, + "grad_norm": 3.671863079071045, + "learning_rate": 1.9850491141099122e-05, + "loss": 1.0731, + "step": 37850 + }, + { + "epoch": 11.33, + "grad_norm": 3.499868154525757, + "learning_rate": 1.984474205994716e-05, + "loss": 1.0681, + "step": 37855 + }, + { + "epoch": 11.33, + "grad_norm": 3.519331216812134, + "learning_rate": 1.983899326351474e-05, + "loss": 1.1132, + "step": 37860 + }, + { + "epoch": 11.33, + "grad_norm": 1.4864459037780762, + "learning_rate": 1.983324475211936e-05, + "loss": 1.0481, + "step": 37865 + }, + { + "epoch": 11.33, + "grad_norm": 3.8415820598602295, + "learning_rate": 1.9827496526078498e-05, + "loss": 0.9927, + "step": 37870 + }, + { + "epoch": 11.33, + "grad_norm": 3.5681967735290527, + "learning_rate": 1.9821748585709634e-05, + "loss": 0.9556, + "step": 37875 + }, + { + "epoch": 11.33, + "grad_norm": 2.1792047023773193, + "learning_rate": 1.9816000931330203e-05, + "loss": 0.9622, + "step": 37880 + }, + { + "epoch": 11.33, + "grad_norm": 1.3530371189117432, + "learning_rate": 1.9810253563257662e-05, + "loss": 1.1555, + "step": 37885 + }, + { + "epoch": 11.34, + "grad_norm": 1.9766597747802734, + "learning_rate": 1.980450648180941e-05, + "loss": 1.093, + "step": 37890 + }, + { + "epoch": 11.34, + "grad_norm": 3.305300712585449, + "learning_rate": 1.9798759687302874e-05, + "loss": 0.9854, + "step": 37895 + }, + { + "epoch": 11.34, + "grad_norm": 1.5727300643920898, + "learning_rate": 1.9793013180055427e-05, + "loss": 0.9959, + "step": 37900 + }, + { + "epoch": 11.34, + "grad_norm": 2.8745787143707275, + "learning_rate": 1.9787266960384458e-05, + "loss": 1.2233, + "step": 37905 + }, + { + "epoch": 11.34, + "grad_norm": 1.7028470039367676, + "learning_rate": 1.978152102860732e-05, + "loss": 1.04, + "step": 37910 + }, + { + "epoch": 11.34, + "grad_norm": 7.1427717208862305, + "learning_rate": 1.977577538504134e-05, + "loss": 0.9223, + "step": 37915 + }, + { + "epoch": 11.35, + "grad_norm": 2.86995530128479, + "learning_rate": 1.9770030030003863e-05, + "loss": 1.1699, + "step": 37920 + }, + { + "epoch": 11.35, + "grad_norm": 1.6554863452911377, + "learning_rate": 1.9764284963812183e-05, + "loss": 1.1282, + "step": 37925 + }, + { + "epoch": 11.35, + "grad_norm": 2.9643630981445312, + "learning_rate": 1.9758540186783613e-05, + "loss": 0.883, + "step": 37930 + }, + { + "epoch": 11.35, + "grad_norm": 2.1681020259857178, + "learning_rate": 1.9752795699235406e-05, + "loss": 1.1171, + "step": 37935 + }, + { + "epoch": 11.35, + "grad_norm": 3.2414727210998535, + "learning_rate": 1.9747051501484852e-05, + "loss": 1.1585, + "step": 37940 + }, + { + "epoch": 11.35, + "grad_norm": 1.7487002611160278, + "learning_rate": 1.9741307593849178e-05, + "loss": 1.011, + "step": 37945 + }, + { + "epoch": 11.35, + "grad_norm": 1.7904525995254517, + "learning_rate": 1.9735563976645617e-05, + "loss": 1.0142, + "step": 37950 + }, + { + "epoch": 11.36, + "grad_norm": 3.4145519733428955, + "learning_rate": 1.9729820650191388e-05, + "loss": 1.0456, + "step": 37955 + }, + { + "epoch": 11.36, + "grad_norm": 1.48880934715271, + "learning_rate": 1.9724077614803686e-05, + "loss": 1.0409, + "step": 37960 + }, + { + "epoch": 11.36, + "grad_norm": 2.0806167125701904, + "learning_rate": 1.9718334870799693e-05, + "loss": 1.024, + "step": 37965 + }, + { + "epoch": 11.36, + "grad_norm": 3.4798760414123535, + "learning_rate": 1.9712592418496577e-05, + "loss": 1.042, + "step": 37970 + }, + { + "epoch": 11.36, + "grad_norm": 2.2736198902130127, + "learning_rate": 1.9706850258211485e-05, + "loss": 1.0028, + "step": 37975 + }, + { + "epoch": 11.36, + "grad_norm": 4.118747711181641, + "learning_rate": 1.9701108390261556e-05, + "loss": 1.0363, + "step": 37980 + }, + { + "epoch": 11.36, + "grad_norm": 26.541406631469727, + "learning_rate": 1.969536681496389e-05, + "loss": 1.1795, + "step": 37985 + }, + { + "epoch": 11.37, + "grad_norm": 2.652695417404175, + "learning_rate": 1.9689625532635615e-05, + "loss": 1.1951, + "step": 37990 + }, + { + "epoch": 11.37, + "grad_norm": 1.8068537712097168, + "learning_rate": 1.9683884543593788e-05, + "loss": 0.9577, + "step": 37995 + }, + { + "epoch": 11.37, + "grad_norm": 1.585398554801941, + "learning_rate": 1.9678143848155505e-05, + "loss": 1.0057, + "step": 38000 + }, + { + "epoch": 11.37, + "grad_norm": 4.244786739349365, + "learning_rate": 1.9672403446637806e-05, + "loss": 1.1152, + "step": 38005 + }, + { + "epoch": 11.37, + "grad_norm": 4.891877174377441, + "learning_rate": 1.9666663339357714e-05, + "loss": 1.1257, + "step": 38010 + }, + { + "epoch": 11.37, + "grad_norm": 3.488956928253174, + "learning_rate": 1.9660923526632275e-05, + "loss": 0.9834, + "step": 38015 + }, + { + "epoch": 11.38, + "grad_norm": 1.9256796836853027, + "learning_rate": 1.9655184008778467e-05, + "loss": 0.9027, + "step": 38020 + }, + { + "epoch": 11.38, + "grad_norm": 2.239374876022339, + "learning_rate": 1.9649444786113303e-05, + "loss": 1.1056, + "step": 38025 + }, + { + "epoch": 11.38, + "grad_norm": 2.374089241027832, + "learning_rate": 1.964370585895373e-05, + "loss": 0.9732, + "step": 38030 + }, + { + "epoch": 11.38, + "grad_norm": 1.6094075441360474, + "learning_rate": 1.9637967227616723e-05, + "loss": 1.1684, + "step": 38035 + }, + { + "epoch": 11.38, + "grad_norm": 2.5619702339172363, + "learning_rate": 1.9632228892419214e-05, + "loss": 1.1079, + "step": 38040 + }, + { + "epoch": 11.38, + "grad_norm": 2.8484513759613037, + "learning_rate": 1.962649085367812e-05, + "loss": 0.9438, + "step": 38045 + }, + { + "epoch": 11.38, + "grad_norm": 2.0517704486846924, + "learning_rate": 1.962075311171035e-05, + "loss": 1.1127, + "step": 38050 + }, + { + "epoch": 11.39, + "grad_norm": 4.042993545532227, + "learning_rate": 1.9615015666832795e-05, + "loss": 0.776, + "step": 38055 + }, + { + "epoch": 11.39, + "grad_norm": 3.7668004035949707, + "learning_rate": 1.9609278519362326e-05, + "loss": 1.0291, + "step": 38060 + }, + { + "epoch": 11.39, + "grad_norm": 4.582650184631348, + "learning_rate": 1.9603541669615796e-05, + "loss": 0.9049, + "step": 38065 + }, + { + "epoch": 11.39, + "grad_norm": 2.7174317836761475, + "learning_rate": 1.959780511791006e-05, + "loss": 1.0892, + "step": 38070 + }, + { + "epoch": 11.39, + "grad_norm": 3.4516286849975586, + "learning_rate": 1.9592068864561936e-05, + "loss": 1.0218, + "step": 38075 + }, + { + "epoch": 11.39, + "grad_norm": 1.96125328540802, + "learning_rate": 1.9586332909888216e-05, + "loss": 1.028, + "step": 38080 + }, + { + "epoch": 11.39, + "grad_norm": 1.8096983432769775, + "learning_rate": 1.9580597254205713e-05, + "loss": 1.0322, + "step": 38085 + }, + { + "epoch": 11.4, + "grad_norm": 5.962859153747559, + "learning_rate": 1.9574861897831175e-05, + "loss": 0.9794, + "step": 38090 + }, + { + "epoch": 11.4, + "grad_norm": 1.6979426145553589, + "learning_rate": 1.9569126841081392e-05, + "loss": 1.1257, + "step": 38095 + }, + { + "epoch": 11.4, + "grad_norm": 3.6008379459381104, + "learning_rate": 1.9563392084273074e-05, + "loss": 0.9345, + "step": 38100 + }, + { + "epoch": 11.4, + "grad_norm": 1.6322747468948364, + "learning_rate": 1.955765762772297e-05, + "loss": 0.9787, + "step": 38105 + }, + { + "epoch": 11.4, + "grad_norm": 3.6460819244384766, + "learning_rate": 1.955192347174778e-05, + "loss": 1.0202, + "step": 38110 + }, + { + "epoch": 11.4, + "grad_norm": 6.339641094207764, + "learning_rate": 1.9546189616664183e-05, + "loss": 1.0028, + "step": 38115 + }, + { + "epoch": 11.41, + "grad_norm": 5.438929557800293, + "learning_rate": 1.954045606278888e-05, + "loss": 1.1827, + "step": 38120 + }, + { + "epoch": 11.41, + "grad_norm": 1.7382398843765259, + "learning_rate": 1.9534722810438496e-05, + "loss": 1.0428, + "step": 38125 + }, + { + "epoch": 11.41, + "grad_norm": 3.361985206604004, + "learning_rate": 1.9528989859929704e-05, + "loss": 1.1343, + "step": 38130 + }, + { + "epoch": 11.41, + "grad_norm": 1.6024760007858276, + "learning_rate": 1.952325721157911e-05, + "loss": 1.0125, + "step": 38135 + }, + { + "epoch": 11.41, + "grad_norm": 5.525008201599121, + "learning_rate": 1.9517524865703322e-05, + "loss": 1.1382, + "step": 38140 + }, + { + "epoch": 11.41, + "grad_norm": 2.9428493976593018, + "learning_rate": 1.9511792822618947e-05, + "loss": 1.017, + "step": 38145 + }, + { + "epoch": 11.41, + "grad_norm": 4.371325492858887, + "learning_rate": 1.9506061082642538e-05, + "loss": 1.1891, + "step": 38150 + }, + { + "epoch": 11.42, + "grad_norm": 2.9674203395843506, + "learning_rate": 1.9500329646090677e-05, + "loss": 1.0249, + "step": 38155 + }, + { + "epoch": 11.42, + "grad_norm": 2.463282346725464, + "learning_rate": 1.949459851327988e-05, + "loss": 1.0911, + "step": 38160 + }, + { + "epoch": 11.42, + "grad_norm": 3.0271716117858887, + "learning_rate": 1.94888676845267e-05, + "loss": 1.041, + "step": 38165 + }, + { + "epoch": 11.42, + "grad_norm": 2.7793784141540527, + "learning_rate": 1.9483137160147626e-05, + "loss": 1.0276, + "step": 38170 + }, + { + "epoch": 11.42, + "grad_norm": 1.6293712854385376, + "learning_rate": 1.947740694045914e-05, + "loss": 0.9052, + "step": 38175 + }, + { + "epoch": 11.42, + "grad_norm": 2.220062017440796, + "learning_rate": 1.9471677025777743e-05, + "loss": 1.1091, + "step": 38180 + }, + { + "epoch": 11.42, + "grad_norm": 2.7983016967773438, + "learning_rate": 1.9465947416419867e-05, + "loss": 1.1761, + "step": 38185 + }, + { + "epoch": 11.43, + "grad_norm": 2.7694313526153564, + "learning_rate": 1.946021811270197e-05, + "loss": 0.9134, + "step": 38190 + }, + { + "epoch": 11.43, + "grad_norm": 4.779193878173828, + "learning_rate": 1.9454489114940458e-05, + "loss": 1.0319, + "step": 38195 + }, + { + "epoch": 11.43, + "grad_norm": 2.7941768169403076, + "learning_rate": 1.944876042345176e-05, + "loss": 1.1433, + "step": 38200 + }, + { + "epoch": 11.43, + "grad_norm": 2.305264949798584, + "learning_rate": 1.944303203855225e-05, + "loss": 1.2263, + "step": 38205 + }, + { + "epoch": 11.43, + "grad_norm": 8.440693855285645, + "learning_rate": 1.94373039605583e-05, + "loss": 1.0192, + "step": 38210 + }, + { + "epoch": 11.43, + "grad_norm": 3.348942995071411, + "learning_rate": 1.9431576189786276e-05, + "loss": 1.0054, + "step": 38215 + }, + { + "epoch": 11.43, + "grad_norm": 3.2682738304138184, + "learning_rate": 1.9425848726552503e-05, + "loss": 1.0656, + "step": 38220 + }, + { + "epoch": 11.44, + "grad_norm": 1.6970369815826416, + "learning_rate": 1.9420121571173315e-05, + "loss": 0.943, + "step": 38225 + }, + { + "epoch": 11.44, + "grad_norm": 3.24284029006958, + "learning_rate": 1.941439472396502e-05, + "loss": 1.1863, + "step": 38230 + }, + { + "epoch": 11.44, + "grad_norm": 4.399949550628662, + "learning_rate": 1.9408668185243885e-05, + "loss": 1.0284, + "step": 38235 + }, + { + "epoch": 11.44, + "grad_norm": 1.228689193725586, + "learning_rate": 1.9402941955326207e-05, + "loss": 1.1199, + "step": 38240 + }, + { + "epoch": 11.44, + "grad_norm": 2.4281301498413086, + "learning_rate": 1.9397216034528216e-05, + "loss": 1.0223, + "step": 38245 + }, + { + "epoch": 11.44, + "grad_norm": 6.723038673400879, + "learning_rate": 1.939149042316617e-05, + "loss": 1.0873, + "step": 38250 + }, + { + "epoch": 11.45, + "grad_norm": 2.267920970916748, + "learning_rate": 1.9385765121556266e-05, + "loss": 1.1129, + "step": 38255 + }, + { + "epoch": 11.45, + "grad_norm": 1.6286221742630005, + "learning_rate": 1.9380040130014733e-05, + "loss": 0.951, + "step": 38260 + }, + { + "epoch": 11.45, + "grad_norm": 5.079461574554443, + "learning_rate": 1.9374315448857737e-05, + "loss": 1.0332, + "step": 38265 + }, + { + "epoch": 11.45, + "grad_norm": 1.5790833234786987, + "learning_rate": 1.9368591078401442e-05, + "loss": 0.983, + "step": 38270 + }, + { + "epoch": 11.45, + "grad_norm": 4.2222394943237305, + "learning_rate": 1.9362867018962022e-05, + "loss": 1.0405, + "step": 38275 + }, + { + "epoch": 11.45, + "grad_norm": 1.2576323747634888, + "learning_rate": 1.9357143270855586e-05, + "loss": 1.0631, + "step": 38280 + }, + { + "epoch": 11.45, + "grad_norm": 3.4669594764709473, + "learning_rate": 1.935141983439827e-05, + "loss": 1.0043, + "step": 38285 + }, + { + "epoch": 11.46, + "grad_norm": 3.4095098972320557, + "learning_rate": 1.9345696709906152e-05, + "loss": 1.146, + "step": 38290 + }, + { + "epoch": 11.46, + "grad_norm": 2.086794137954712, + "learning_rate": 1.9339973897695344e-05, + "loss": 1.0618, + "step": 38295 + }, + { + "epoch": 11.46, + "grad_norm": 2.766907215118408, + "learning_rate": 1.933425139808189e-05, + "loss": 1.1128, + "step": 38300 + }, + { + "epoch": 11.46, + "grad_norm": 5.271542072296143, + "learning_rate": 1.9328529211381835e-05, + "loss": 1.0, + "step": 38305 + }, + { + "epoch": 11.46, + "grad_norm": 2.415320634841919, + "learning_rate": 1.932280733791122e-05, + "loss": 1.0116, + "step": 38310 + }, + { + "epoch": 11.46, + "grad_norm": 1.7332686185836792, + "learning_rate": 1.9317085777986054e-05, + "loss": 1.2115, + "step": 38315 + }, + { + "epoch": 11.46, + "grad_norm": 4.1479058265686035, + "learning_rate": 1.9311364531922333e-05, + "loss": 1.1115, + "step": 38320 + }, + { + "epoch": 11.47, + "grad_norm": 1.9687248468399048, + "learning_rate": 1.9305643600036037e-05, + "loss": 1.0981, + "step": 38325 + }, + { + "epoch": 11.47, + "grad_norm": 2.3544981479644775, + "learning_rate": 1.9299922982643127e-05, + "loss": 1.0191, + "step": 38330 + }, + { + "epoch": 11.47, + "grad_norm": 3.2068850994110107, + "learning_rate": 1.929420268005955e-05, + "loss": 0.9429, + "step": 38335 + }, + { + "epoch": 11.47, + "grad_norm": 2.9129559993743896, + "learning_rate": 1.9288482692601217e-05, + "loss": 1.004, + "step": 38340 + }, + { + "epoch": 11.47, + "grad_norm": 1.2691713571548462, + "learning_rate": 1.928276302058406e-05, + "loss": 1.013, + "step": 38345 + }, + { + "epoch": 11.47, + "grad_norm": 2.6996827125549316, + "learning_rate": 1.9277043664323945e-05, + "loss": 1.0327, + "step": 38350 + }, + { + "epoch": 11.48, + "grad_norm": 3.7437167167663574, + "learning_rate": 1.9271324624136775e-05, + "loss": 0.9692, + "step": 38355 + }, + { + "epoch": 11.48, + "grad_norm": 1.4719038009643555, + "learning_rate": 1.926560590033839e-05, + "loss": 1.1914, + "step": 38360 + }, + { + "epoch": 11.48, + "grad_norm": 2.6418561935424805, + "learning_rate": 1.9259887493244615e-05, + "loss": 1.0561, + "step": 38365 + }, + { + "epoch": 11.48, + "grad_norm": 1.7341514825820923, + "learning_rate": 1.9254169403171303e-05, + "loss": 1.1022, + "step": 38370 + }, + { + "epoch": 11.48, + "grad_norm": 2.533081531524658, + "learning_rate": 1.9248451630434232e-05, + "loss": 1.0766, + "step": 38375 + }, + { + "epoch": 11.48, + "grad_norm": 3.991834878921509, + "learning_rate": 1.9242734175349208e-05, + "loss": 1.2385, + "step": 38380 + }, + { + "epoch": 11.48, + "grad_norm": 1.5860527753829956, + "learning_rate": 1.923701703823198e-05, + "loss": 1.1093, + "step": 38385 + }, + { + "epoch": 11.49, + "grad_norm": 3.969327688217163, + "learning_rate": 1.923130021939832e-05, + "loss": 0.9571, + "step": 38390 + }, + { + "epoch": 11.49, + "grad_norm": 3.739284038543701, + "learning_rate": 1.922558371916395e-05, + "loss": 1.0152, + "step": 38395 + }, + { + "epoch": 11.49, + "grad_norm": 10.35611343383789, + "learning_rate": 1.9219867537844588e-05, + "loss": 1.0696, + "step": 38400 + }, + { + "epoch": 11.49, + "grad_norm": 2.626101016998291, + "learning_rate": 1.9214151675755935e-05, + "loss": 1.0215, + "step": 38405 + }, + { + "epoch": 11.49, + "grad_norm": 3.1883928775787354, + "learning_rate": 1.920843613321367e-05, + "loss": 1.1055, + "step": 38410 + }, + { + "epoch": 11.49, + "grad_norm": 3.719695568084717, + "learning_rate": 1.920272091053346e-05, + "loss": 0.9234, + "step": 38415 + }, + { + "epoch": 11.49, + "grad_norm": 1.9183346033096313, + "learning_rate": 1.9197006008030942e-05, + "loss": 1.0771, + "step": 38420 + }, + { + "epoch": 11.5, + "grad_norm": 10.849863052368164, + "learning_rate": 1.9191291426021756e-05, + "loss": 1.1378, + "step": 38425 + }, + { + "epoch": 11.5, + "grad_norm": 2.5107712745666504, + "learning_rate": 1.9185577164821507e-05, + "loss": 0.9376, + "step": 38430 + }, + { + "epoch": 11.5, + "grad_norm": 3.5658061504364014, + "learning_rate": 1.917986322474578e-05, + "loss": 0.8868, + "step": 38435 + }, + { + "epoch": 11.5, + "grad_norm": 4.931859493255615, + "learning_rate": 1.917414960611017e-05, + "loss": 0.9333, + "step": 38440 + }, + { + "epoch": 11.5, + "grad_norm": 3.9199514389038086, + "learning_rate": 1.916843630923021e-05, + "loss": 1.1173, + "step": 38445 + }, + { + "epoch": 11.5, + "grad_norm": 1.2305980920791626, + "learning_rate": 1.916272333442146e-05, + "loss": 1.0331, + "step": 38450 + }, + { + "epoch": 11.51, + "grad_norm": 1.6451231241226196, + "learning_rate": 1.915701068199942e-05, + "loss": 0.9607, + "step": 38455 + }, + { + "epoch": 11.51, + "grad_norm": 1.1699979305267334, + "learning_rate": 1.915129835227962e-05, + "loss": 0.914, + "step": 38460 + }, + { + "epoch": 11.51, + "grad_norm": 2.1757733821868896, + "learning_rate": 1.9145586345577533e-05, + "loss": 1.1439, + "step": 38465 + }, + { + "epoch": 11.51, + "grad_norm": 1.6203327178955078, + "learning_rate": 1.9139874662208615e-05, + "loss": 1.2016, + "step": 38470 + }, + { + "epoch": 11.51, + "grad_norm": 3.481635093688965, + "learning_rate": 1.913416330248834e-05, + "loss": 0.7547, + "step": 38475 + }, + { + "epoch": 11.51, + "grad_norm": 2.200890064239502, + "learning_rate": 1.9128452266732115e-05, + "loss": 1.0624, + "step": 38480 + }, + { + "epoch": 11.51, + "grad_norm": 1.81036376953125, + "learning_rate": 1.9122741555255384e-05, + "loss": 0.9228, + "step": 38485 + }, + { + "epoch": 11.52, + "grad_norm": 4.309178829193115, + "learning_rate": 1.911703116837352e-05, + "loss": 1.0101, + "step": 38490 + }, + { + "epoch": 11.52, + "grad_norm": 1.9071286916732788, + "learning_rate": 1.911132110640191e-05, + "loss": 1.0724, + "step": 38495 + }, + { + "epoch": 11.52, + "grad_norm": 2.7584476470947266, + "learning_rate": 1.9105611369655914e-05, + "loss": 1.0121, + "step": 38500 + }, + { + "epoch": 11.52, + "grad_norm": 3.3853213787078857, + "learning_rate": 1.9099901958450868e-05, + "loss": 0.9683, + "step": 38505 + }, + { + "epoch": 11.52, + "grad_norm": 2.5267248153686523, + "learning_rate": 1.9094192873102117e-05, + "loss": 1.0502, + "step": 38510 + }, + { + "epoch": 11.52, + "grad_norm": 3.773902416229248, + "learning_rate": 1.9088484113924944e-05, + "loss": 1.029, + "step": 38515 + }, + { + "epoch": 11.52, + "grad_norm": 2.5307250022888184, + "learning_rate": 1.908277568123466e-05, + "loss": 0.9967, + "step": 38520 + }, + { + "epoch": 11.53, + "grad_norm": 4.490280628204346, + "learning_rate": 1.907706757534652e-05, + "loss": 1.0186, + "step": 38525 + }, + { + "epoch": 11.53, + "grad_norm": 1.3024266958236694, + "learning_rate": 1.9071359796575774e-05, + "loss": 1.1027, + "step": 38530 + }, + { + "epoch": 11.53, + "grad_norm": 5.981440544128418, + "learning_rate": 1.906565234523767e-05, + "loss": 1.1221, + "step": 38535 + }, + { + "epoch": 11.53, + "grad_norm": 2.8567864894866943, + "learning_rate": 1.9059945221647414e-05, + "loss": 1.1096, + "step": 38540 + }, + { + "epoch": 11.53, + "grad_norm": 2.297405481338501, + "learning_rate": 1.9054238426120218e-05, + "loss": 1.0762, + "step": 38545 + }, + { + "epoch": 11.53, + "grad_norm": 1.9504579305648804, + "learning_rate": 1.9048531958971245e-05, + "loss": 1.1548, + "step": 38550 + }, + { + "epoch": 11.54, + "grad_norm": 4.491879463195801, + "learning_rate": 1.9042825820515675e-05, + "loss": 0.8179, + "step": 38555 + }, + { + "epoch": 11.54, + "grad_norm": 3.1393933296203613, + "learning_rate": 1.903712001106864e-05, + "loss": 0.8162, + "step": 38560 + }, + { + "epoch": 11.54, + "grad_norm": 1.3722354173660278, + "learning_rate": 1.9031414530945263e-05, + "loss": 1.096, + "step": 38565 + }, + { + "epoch": 11.54, + "grad_norm": 3.8262624740600586, + "learning_rate": 1.9025709380460662e-05, + "loss": 0.9238, + "step": 38570 + }, + { + "epoch": 11.54, + "grad_norm": 3.3715012073516846, + "learning_rate": 1.9020004559929915e-05, + "loss": 1.0101, + "step": 38575 + }, + { + "epoch": 11.54, + "grad_norm": 4.317600250244141, + "learning_rate": 1.9014300069668113e-05, + "loss": 0.9422, + "step": 38580 + }, + { + "epoch": 11.54, + "grad_norm": 3.4299378395080566, + "learning_rate": 1.9008595909990286e-05, + "loss": 1.0151, + "step": 38585 + }, + { + "epoch": 11.55, + "grad_norm": 5.137393474578857, + "learning_rate": 1.9002892081211478e-05, + "loss": 1.1151, + "step": 38590 + }, + { + "epoch": 11.55, + "grad_norm": 2.469620704650879, + "learning_rate": 1.8997188583646713e-05, + "loss": 1.0459, + "step": 38595 + }, + { + "epoch": 11.55, + "grad_norm": 1.945117473602295, + "learning_rate": 1.899148541761098e-05, + "loss": 1.037, + "step": 38600 + }, + { + "epoch": 11.55, + "grad_norm": 2.0627458095550537, + "learning_rate": 1.8985782583419266e-05, + "loss": 1.0576, + "step": 38605 + }, + { + "epoch": 11.55, + "grad_norm": 4.658207416534424, + "learning_rate": 1.8980080081386517e-05, + "loss": 0.9801, + "step": 38610 + }, + { + "epoch": 11.55, + "grad_norm": 3.8801674842834473, + "learning_rate": 1.89743779118277e-05, + "loss": 0.9549, + "step": 38615 + }, + { + "epoch": 11.55, + "grad_norm": 2.336965322494507, + "learning_rate": 1.896867607505773e-05, + "loss": 1.236, + "step": 38620 + }, + { + "epoch": 11.56, + "grad_norm": 2.502366304397583, + "learning_rate": 1.89629745713915e-05, + "loss": 0.8492, + "step": 38625 + }, + { + "epoch": 11.56, + "grad_norm": 3.108949661254883, + "learning_rate": 1.895727340114392e-05, + "loss": 0.9725, + "step": 38630 + }, + { + "epoch": 11.56, + "grad_norm": 4.816553115844727, + "learning_rate": 1.8951572564629835e-05, + "loss": 1.0565, + "step": 38635 + }, + { + "epoch": 11.56, + "grad_norm": 2.322685956954956, + "learning_rate": 1.8945872062164122e-05, + "loss": 0.9627, + "step": 38640 + }, + { + "epoch": 11.56, + "grad_norm": 2.1832380294799805, + "learning_rate": 1.894017189406159e-05, + "loss": 1.0276, + "step": 38645 + }, + { + "epoch": 11.56, + "grad_norm": 2.5433905124664307, + "learning_rate": 1.8934472060637083e-05, + "loss": 1.0334, + "step": 38650 + }, + { + "epoch": 11.57, + "grad_norm": 1.171791672706604, + "learning_rate": 1.8928772562205373e-05, + "loss": 1.0299, + "step": 38655 + }, + { + "epoch": 11.57, + "grad_norm": 12.511678695678711, + "learning_rate": 1.8923073399081238e-05, + "loss": 0.8337, + "step": 38660 + }, + { + "epoch": 11.57, + "grad_norm": 2.81550669670105, + "learning_rate": 1.8917374571579444e-05, + "loss": 1.1925, + "step": 38665 + }, + { + "epoch": 11.57, + "grad_norm": 1.520538568496704, + "learning_rate": 1.891167608001473e-05, + "loss": 0.9966, + "step": 38670 + }, + { + "epoch": 11.57, + "grad_norm": 6.8623456954956055, + "learning_rate": 1.890597792470182e-05, + "loss": 1.1197, + "step": 38675 + }, + { + "epoch": 11.57, + "grad_norm": 2.9777519702911377, + "learning_rate": 1.890028010595541e-05, + "loss": 1.0691, + "step": 38680 + }, + { + "epoch": 11.57, + "grad_norm": 5.088373184204102, + "learning_rate": 1.8894582624090194e-05, + "loss": 0.8929, + "step": 38685 + }, + { + "epoch": 11.58, + "grad_norm": 3.7132937908172607, + "learning_rate": 1.8888885479420838e-05, + "loss": 1.0176, + "step": 38690 + }, + { + "epoch": 11.58, + "grad_norm": 3.316521167755127, + "learning_rate": 1.8883188672261972e-05, + "loss": 1.0659, + "step": 38695 + }, + { + "epoch": 11.58, + "grad_norm": 2.1407995223999023, + "learning_rate": 1.8877492202928252e-05, + "loss": 1.0775, + "step": 38700 + }, + { + "epoch": 11.58, + "grad_norm": 2.758033037185669, + "learning_rate": 1.887179607173426e-05, + "loss": 1.0208, + "step": 38705 + }, + { + "epoch": 11.58, + "grad_norm": 3.8762590885162354, + "learning_rate": 1.8866100278994613e-05, + "loss": 0.9107, + "step": 38710 + }, + { + "epoch": 11.58, + "grad_norm": 2.9407236576080322, + "learning_rate": 1.886040482502387e-05, + "loss": 1.0697, + "step": 38715 + }, + { + "epoch": 11.58, + "grad_norm": 2.600522994995117, + "learning_rate": 1.885470971013658e-05, + "loss": 1.0183, + "step": 38720 + }, + { + "epoch": 11.59, + "grad_norm": 3.5335729122161865, + "learning_rate": 1.884901493464729e-05, + "loss": 0.9398, + "step": 38725 + }, + { + "epoch": 11.59, + "grad_norm": 1.7185178995132446, + "learning_rate": 1.8843320498870504e-05, + "loss": 0.8392, + "step": 38730 + }, + { + "epoch": 11.59, + "grad_norm": 2.37362003326416, + "learning_rate": 1.883762640312074e-05, + "loss": 1.0308, + "step": 38735 + }, + { + "epoch": 11.59, + "grad_norm": 5.628195285797119, + "learning_rate": 1.883193264771245e-05, + "loss": 0.9699, + "step": 38740 + }, + { + "epoch": 11.59, + "grad_norm": 2.0216867923736572, + "learning_rate": 1.882623923296012e-05, + "loss": 1.0402, + "step": 38745 + }, + { + "epoch": 11.59, + "grad_norm": 4.347649097442627, + "learning_rate": 1.8820546159178175e-05, + "loss": 0.9695, + "step": 38750 + }, + { + "epoch": 11.6, + "grad_norm": 5.039327621459961, + "learning_rate": 1.8814853426681046e-05, + "loss": 0.9951, + "step": 38755 + }, + { + "epoch": 11.6, + "grad_norm": 3.6531269550323486, + "learning_rate": 1.880916103578313e-05, + "loss": 1.0957, + "step": 38760 + }, + { + "epoch": 11.6, + "grad_norm": 2.0845232009887695, + "learning_rate": 1.8803468986798814e-05, + "loss": 1.05, + "step": 38765 + }, + { + "epoch": 11.6, + "grad_norm": 4.7361979484558105, + "learning_rate": 1.8797777280042467e-05, + "loss": 0.9722, + "step": 38770 + }, + { + "epoch": 11.6, + "grad_norm": 4.363923072814941, + "learning_rate": 1.879208591582843e-05, + "loss": 0.9497, + "step": 38775 + }, + { + "epoch": 11.6, + "grad_norm": 3.8049824237823486, + "learning_rate": 1.8786394894471038e-05, + "loss": 1.1666, + "step": 38780 + }, + { + "epoch": 11.6, + "grad_norm": 4.981505870819092, + "learning_rate": 1.8780704216284604e-05, + "loss": 1.0055, + "step": 38785 + }, + { + "epoch": 11.61, + "grad_norm": 2.457186222076416, + "learning_rate": 1.87750138815834e-05, + "loss": 1.1097, + "step": 38790 + }, + { + "epoch": 11.61, + "grad_norm": 2.6897389888763428, + "learning_rate": 1.8769323890681717e-05, + "loss": 1.2053, + "step": 38795 + }, + { + "epoch": 11.61, + "grad_norm": 2.4741759300231934, + "learning_rate": 1.876363424389379e-05, + "loss": 1.0683, + "step": 38800 + }, + { + "epoch": 11.61, + "grad_norm": 3.079468011856079, + "learning_rate": 1.875794494153387e-05, + "loss": 0.8789, + "step": 38805 + }, + { + "epoch": 11.61, + "grad_norm": 4.948914051055908, + "learning_rate": 1.875225598391616e-05, + "loss": 0.9235, + "step": 38810 + }, + { + "epoch": 11.61, + "grad_norm": 9.49067211151123, + "learning_rate": 1.874656737135486e-05, + "loss": 0.7728, + "step": 38815 + }, + { + "epoch": 11.61, + "grad_norm": 4.153932094573975, + "learning_rate": 1.874087910416415e-05, + "loss": 0.8923, + "step": 38820 + }, + { + "epoch": 11.62, + "grad_norm": 5.080899715423584, + "learning_rate": 1.8735191182658164e-05, + "loss": 1.1195, + "step": 38825 + }, + { + "epoch": 11.62, + "grad_norm": 14.455235481262207, + "learning_rate": 1.872950360715107e-05, + "loss": 1.0142, + "step": 38830 + }, + { + "epoch": 11.62, + "grad_norm": 2.518566131591797, + "learning_rate": 1.8723816377956966e-05, + "loss": 1.0349, + "step": 38835 + }, + { + "epoch": 11.62, + "grad_norm": 1.6086268424987793, + "learning_rate": 1.871812949538997e-05, + "loss": 1.0412, + "step": 38840 + }, + { + "epoch": 11.62, + "grad_norm": 3.0783514976501465, + "learning_rate": 1.8712442959764144e-05, + "loss": 1.0691, + "step": 38845 + }, + { + "epoch": 11.62, + "grad_norm": 2.9750258922576904, + "learning_rate": 1.870675677139356e-05, + "loss": 0.8981, + "step": 38850 + }, + { + "epoch": 11.62, + "grad_norm": 1.0562318563461304, + "learning_rate": 1.8701070930592257e-05, + "loss": 1.0965, + "step": 38855 + }, + { + "epoch": 11.63, + "grad_norm": 3.368023157119751, + "learning_rate": 1.8695385437674263e-05, + "loss": 1.0257, + "step": 38860 + }, + { + "epoch": 11.63, + "grad_norm": 2.06657338142395, + "learning_rate": 1.868970029295357e-05, + "loss": 1.0595, + "step": 38865 + }, + { + "epoch": 11.63, + "grad_norm": 3.3948874473571777, + "learning_rate": 1.8684015496744173e-05, + "loss": 1.0824, + "step": 38870 + }, + { + "epoch": 11.63, + "grad_norm": 3.1580018997192383, + "learning_rate": 1.8678331049360044e-05, + "loss": 0.994, + "step": 38875 + }, + { + "epoch": 11.63, + "grad_norm": 2.4036195278167725, + "learning_rate": 1.8672646951115116e-05, + "loss": 1.1658, + "step": 38880 + }, + { + "epoch": 11.63, + "grad_norm": 1.6055760383605957, + "learning_rate": 1.866696320232331e-05, + "loss": 1.0514, + "step": 38885 + }, + { + "epoch": 11.64, + "grad_norm": 2.9293153285980225, + "learning_rate": 1.866127980329856e-05, + "loss": 1.0034, + "step": 38890 + }, + { + "epoch": 11.64, + "grad_norm": 6.062731742858887, + "learning_rate": 1.865559675435472e-05, + "loss": 0.9402, + "step": 38895 + }, + { + "epoch": 11.64, + "grad_norm": 2.678724527359009, + "learning_rate": 1.8649914055805696e-05, + "loss": 0.8018, + "step": 38900 + }, + { + "epoch": 11.64, + "grad_norm": 1.7814666032791138, + "learning_rate": 1.8644231707965297e-05, + "loss": 1.0356, + "step": 38905 + }, + { + "epoch": 11.64, + "grad_norm": 2.179852247238159, + "learning_rate": 1.863854971114739e-05, + "loss": 1.0481, + "step": 38910 + }, + { + "epoch": 11.64, + "grad_norm": 3.8820948600769043, + "learning_rate": 1.863286806566577e-05, + "loss": 1.0233, + "step": 38915 + }, + { + "epoch": 11.64, + "grad_norm": 4.843132019042969, + "learning_rate": 1.862718677183422e-05, + "loss": 1.1038, + "step": 38920 + }, + { + "epoch": 11.65, + "grad_norm": 2.708319902420044, + "learning_rate": 1.862150582996653e-05, + "loss": 1.1368, + "step": 38925 + }, + { + "epoch": 11.65, + "grad_norm": 2.447589159011841, + "learning_rate": 1.861582524037643e-05, + "loss": 1.096, + "step": 38930 + }, + { + "epoch": 11.65, + "grad_norm": 2.1640899181365967, + "learning_rate": 1.8610145003377676e-05, + "loss": 0.9314, + "step": 38935 + }, + { + "epoch": 11.65, + "grad_norm": 6.040550708770752, + "learning_rate": 1.860446511928397e-05, + "loss": 1.1117, + "step": 38940 + }, + { + "epoch": 11.65, + "grad_norm": 18.708223342895508, + "learning_rate": 1.8598785588409005e-05, + "loss": 1.0053, + "step": 38945 + }, + { + "epoch": 11.65, + "grad_norm": 2.9335033893585205, + "learning_rate": 1.859310641106646e-05, + "loss": 1.2107, + "step": 38950 + }, + { + "epoch": 11.65, + "grad_norm": 3.3120362758636475, + "learning_rate": 1.8587427587569982e-05, + "loss": 1.193, + "step": 38955 + }, + { + "epoch": 11.66, + "grad_norm": 1.8769471645355225, + "learning_rate": 1.8581749118233225e-05, + "loss": 1.1355, + "step": 38960 + }, + { + "epoch": 11.66, + "grad_norm": 3.2708353996276855, + "learning_rate": 1.857607100336978e-05, + "loss": 0.9209, + "step": 38965 + }, + { + "epoch": 11.66, + "grad_norm": 2.209517478942871, + "learning_rate": 1.857039324329327e-05, + "loss": 1.1848, + "step": 38970 + }, + { + "epoch": 11.66, + "grad_norm": 2.2054500579833984, + "learning_rate": 1.8564715838317252e-05, + "loss": 1.0466, + "step": 38975 + }, + { + "epoch": 11.66, + "grad_norm": 1.8774044513702393, + "learning_rate": 1.8559038788755285e-05, + "loss": 0.9097, + "step": 38980 + }, + { + "epoch": 11.66, + "grad_norm": 2.7075273990631104, + "learning_rate": 1.8553362094920918e-05, + "loss": 1.1306, + "step": 38985 + }, + { + "epoch": 11.67, + "grad_norm": 3.6890761852264404, + "learning_rate": 1.854768575712765e-05, + "loss": 1.0088, + "step": 38990 + }, + { + "epoch": 11.67, + "grad_norm": 4.63300085067749, + "learning_rate": 1.8542009775689003e-05, + "loss": 1.1951, + "step": 38995 + }, + { + "epoch": 11.67, + "grad_norm": 2.1743202209472656, + "learning_rate": 1.8536334150918432e-05, + "loss": 1.0896, + "step": 39000 + }, + { + "epoch": 11.67, + "grad_norm": 2.1397712230682373, + "learning_rate": 1.853065888312942e-05, + "loss": 0.9551, + "step": 39005 + }, + { + "epoch": 11.67, + "grad_norm": 3.1289427280426025, + "learning_rate": 1.852498397263539e-05, + "loss": 1.0159, + "step": 39010 + }, + { + "epoch": 11.67, + "grad_norm": 2.917625904083252, + "learning_rate": 1.8519309419749757e-05, + "loss": 1.0579, + "step": 39015 + }, + { + "epoch": 11.67, + "grad_norm": 1.9072588682174683, + "learning_rate": 1.851363522478594e-05, + "loss": 1.0325, + "step": 39020 + }, + { + "epoch": 11.68, + "grad_norm": 1.4245704412460327, + "learning_rate": 1.8507961388057292e-05, + "loss": 1.1429, + "step": 39025 + }, + { + "epoch": 11.68, + "grad_norm": 5.188929080963135, + "learning_rate": 1.8502287909877197e-05, + "loss": 1.1144, + "step": 39030 + }, + { + "epoch": 11.68, + "grad_norm": 6.824649333953857, + "learning_rate": 1.8496614790558987e-05, + "loss": 0.9599, + "step": 39035 + }, + { + "epoch": 11.68, + "grad_norm": 5.712022304534912, + "learning_rate": 1.8490942030415982e-05, + "loss": 1.105, + "step": 39040 + }, + { + "epoch": 11.68, + "grad_norm": 5.067729949951172, + "learning_rate": 1.8485269629761487e-05, + "loss": 1.0556, + "step": 39045 + }, + { + "epoch": 11.68, + "grad_norm": 1.9118393659591675, + "learning_rate": 1.847959758890877e-05, + "loss": 1.129, + "step": 39050 + }, + { + "epoch": 11.68, + "grad_norm": 2.5474183559417725, + "learning_rate": 1.847392590817111e-05, + "loss": 0.9264, + "step": 39055 + }, + { + "epoch": 11.69, + "grad_norm": 3.004509925842285, + "learning_rate": 1.8468254587861728e-05, + "loss": 0.9561, + "step": 39060 + }, + { + "epoch": 11.69, + "grad_norm": 2.7860445976257324, + "learning_rate": 1.8462583628293867e-05, + "loss": 1.0989, + "step": 39065 + }, + { + "epoch": 11.69, + "grad_norm": 3.1194570064544678, + "learning_rate": 1.8456913029780714e-05, + "loss": 0.9351, + "step": 39070 + }, + { + "epoch": 11.69, + "grad_norm": 2.584099054336548, + "learning_rate": 1.8451242792635447e-05, + "loss": 1.0934, + "step": 39075 + }, + { + "epoch": 11.69, + "grad_norm": 4.399397850036621, + "learning_rate": 1.8445572917171238e-05, + "loss": 1.0019, + "step": 39080 + }, + { + "epoch": 11.69, + "grad_norm": 1.7044438123703003, + "learning_rate": 1.843990340370122e-05, + "loss": 1.1474, + "step": 39085 + }, + { + "epoch": 11.7, + "grad_norm": 1.9364503622055054, + "learning_rate": 1.8434234252538523e-05, + "loss": 1.0992, + "step": 39090 + }, + { + "epoch": 11.7, + "grad_norm": 2.9224693775177, + "learning_rate": 1.842856546399624e-05, + "loss": 0.8731, + "step": 39095 + }, + { + "epoch": 11.7, + "grad_norm": 2.569241523742676, + "learning_rate": 1.842289703838746e-05, + "loss": 1.0233, + "step": 39100 + }, + { + "epoch": 11.7, + "grad_norm": 6.116538047790527, + "learning_rate": 1.841722897602524e-05, + "loss": 0.9919, + "step": 39105 + }, + { + "epoch": 11.7, + "grad_norm": 5.045174598693848, + "learning_rate": 1.8411561277222616e-05, + "loss": 0.9874, + "step": 39110 + }, + { + "epoch": 11.7, + "grad_norm": 4.61328125, + "learning_rate": 1.8405893942292617e-05, + "loss": 0.913, + "step": 39115 + }, + { + "epoch": 11.7, + "grad_norm": 3.24153470993042, + "learning_rate": 1.840022697154824e-05, + "loss": 0.908, + "step": 39120 + }, + { + "epoch": 11.71, + "grad_norm": 1.127431869506836, + "learning_rate": 1.839456036530247e-05, + "loss": 1.0368, + "step": 39125 + }, + { + "epoch": 11.71, + "grad_norm": 2.7838566303253174, + "learning_rate": 1.838889412386826e-05, + "loss": 0.9142, + "step": 39130 + }, + { + "epoch": 11.71, + "grad_norm": 2.489769458770752, + "learning_rate": 1.838322824755856e-05, + "loss": 1.0572, + "step": 39135 + }, + { + "epoch": 11.71, + "grad_norm": 2.236867904663086, + "learning_rate": 1.837756273668629e-05, + "loss": 1.0315, + "step": 39140 + }, + { + "epoch": 11.71, + "grad_norm": 1.7803845405578613, + "learning_rate": 1.8371897591564335e-05, + "loss": 0.9635, + "step": 39145 + }, + { + "epoch": 11.71, + "grad_norm": 19.32113265991211, + "learning_rate": 1.8366232812505602e-05, + "loss": 1.2016, + "step": 39150 + }, + { + "epoch": 11.71, + "grad_norm": 3.2473385334014893, + "learning_rate": 1.836056839982292e-05, + "loss": 0.95, + "step": 39155 + }, + { + "epoch": 11.72, + "grad_norm": 3.1559784412384033, + "learning_rate": 1.8354904353829156e-05, + "loss": 0.9813, + "step": 39160 + }, + { + "epoch": 11.72, + "grad_norm": 2.750032663345337, + "learning_rate": 1.8349240674837105e-05, + "loss": 0.8542, + "step": 39165 + }, + { + "epoch": 11.72, + "grad_norm": 5.297883033752441, + "learning_rate": 1.834357736315959e-05, + "loss": 0.9693, + "step": 39170 + }, + { + "epoch": 11.72, + "grad_norm": 3.035358190536499, + "learning_rate": 1.8337914419109383e-05, + "loss": 1.0519, + "step": 39175 + }, + { + "epoch": 11.72, + "grad_norm": 5.685046195983887, + "learning_rate": 1.833225184299922e-05, + "loss": 0.9673, + "step": 39180 + }, + { + "epoch": 11.72, + "grad_norm": 4.552941799163818, + "learning_rate": 1.8326589635141874e-05, + "loss": 1.1577, + "step": 39185 + }, + { + "epoch": 11.73, + "grad_norm": 11.873933792114258, + "learning_rate": 1.832092779585003e-05, + "loss": 1.0136, + "step": 39190 + }, + { + "epoch": 11.73, + "grad_norm": 1.8182165622711182, + "learning_rate": 1.8315266325436412e-05, + "loss": 1.2264, + "step": 39195 + }, + { + "epoch": 11.73, + "grad_norm": 4.9064621925354, + "learning_rate": 1.8309605224213682e-05, + "loss": 1.0442, + "step": 39200 + }, + { + "epoch": 11.73, + "grad_norm": 4.886104106903076, + "learning_rate": 1.83039444924945e-05, + "loss": 0.9959, + "step": 39205 + }, + { + "epoch": 11.73, + "grad_norm": 2.1193058490753174, + "learning_rate": 1.8298284130591508e-05, + "loss": 0.9811, + "step": 39210 + }, + { + "epoch": 11.73, + "grad_norm": 2.320329427719116, + "learning_rate": 1.8292624138817317e-05, + "loss": 1.1301, + "step": 39215 + }, + { + "epoch": 11.73, + "grad_norm": 8.177576065063477, + "learning_rate": 1.8286964517484518e-05, + "loss": 1.0834, + "step": 39220 + }, + { + "epoch": 11.74, + "grad_norm": 3.5780060291290283, + "learning_rate": 1.8281305266905697e-05, + "loss": 0.9501, + "step": 39225 + }, + { + "epoch": 11.74, + "grad_norm": 4.063993453979492, + "learning_rate": 1.8275646387393396e-05, + "loss": 1.252, + "step": 39230 + }, + { + "epoch": 11.74, + "grad_norm": 1.9857643842697144, + "learning_rate": 1.8269987879260163e-05, + "loss": 1.1019, + "step": 39235 + }, + { + "epoch": 11.74, + "grad_norm": 1.7458826303482056, + "learning_rate": 1.8264329742818496e-05, + "loss": 1.0769, + "step": 39240 + }, + { + "epoch": 11.74, + "grad_norm": 1.2692326307296753, + "learning_rate": 1.8258671978380902e-05, + "loss": 1.1112, + "step": 39245 + }, + { + "epoch": 11.74, + "grad_norm": 1.531681776046753, + "learning_rate": 1.8253014586259842e-05, + "loss": 1.0832, + "step": 39250 + }, + { + "epoch": 11.74, + "grad_norm": 3.519178628921509, + "learning_rate": 1.824735756676778e-05, + "loss": 1.0214, + "step": 39255 + }, + { + "epoch": 11.75, + "grad_norm": 1.5379389524459839, + "learning_rate": 1.824170092021713e-05, + "loss": 0.942, + "step": 39260 + }, + { + "epoch": 11.75, + "grad_norm": 1.4689420461654663, + "learning_rate": 1.823604464692033e-05, + "loss": 1.0427, + "step": 39265 + }, + { + "epoch": 11.75, + "grad_norm": 1.6258187294006348, + "learning_rate": 1.8230388747189746e-05, + "loss": 1.0556, + "step": 39270 + }, + { + "epoch": 11.75, + "grad_norm": 1.4137862920761108, + "learning_rate": 1.8224733221337748e-05, + "loss": 1.2594, + "step": 39275 + }, + { + "epoch": 11.75, + "grad_norm": 3.4396555423736572, + "learning_rate": 1.82190780696767e-05, + "loss": 1.1517, + "step": 39280 + }, + { + "epoch": 11.75, + "grad_norm": 2.3807373046875, + "learning_rate": 1.8213423292518912e-05, + "loss": 0.9228, + "step": 39285 + }, + { + "epoch": 11.76, + "grad_norm": 3.9610815048217773, + "learning_rate": 1.8207768890176714e-05, + "loss": 0.975, + "step": 39290 + }, + { + "epoch": 11.76, + "grad_norm": 2.3501439094543457, + "learning_rate": 1.8202114862962376e-05, + "loss": 1.0386, + "step": 39295 + }, + { + "epoch": 11.76, + "grad_norm": 2.7657525539398193, + "learning_rate": 1.8196461211188164e-05, + "loss": 1.0143, + "step": 39300 + }, + { + "epoch": 11.76, + "grad_norm": 1.5906193256378174, + "learning_rate": 1.8190807935166327e-05, + "loss": 1.1036, + "step": 39305 + }, + { + "epoch": 11.76, + "grad_norm": 26.443702697753906, + "learning_rate": 1.8185155035209096e-05, + "loss": 0.9669, + "step": 39310 + }, + { + "epoch": 11.76, + "grad_norm": 4.151071548461914, + "learning_rate": 1.817950251162866e-05, + "loss": 1.0132, + "step": 39315 + }, + { + "epoch": 11.76, + "grad_norm": 2.691179037094116, + "learning_rate": 1.817385036473721e-05, + "loss": 1.0296, + "step": 39320 + }, + { + "epoch": 11.77, + "grad_norm": 3.2042014598846436, + "learning_rate": 1.816819859484692e-05, + "loss": 1.0784, + "step": 39325 + }, + { + "epoch": 11.77, + "grad_norm": 2.0353968143463135, + "learning_rate": 1.816254720226992e-05, + "loss": 0.9752, + "step": 39330 + }, + { + "epoch": 11.77, + "grad_norm": 1.9836816787719727, + "learning_rate": 1.8156896187318322e-05, + "loss": 1.1108, + "step": 39335 + }, + { + "epoch": 11.77, + "grad_norm": 1.6825382709503174, + "learning_rate": 1.815124555030424e-05, + "loss": 1.0884, + "step": 39340 + }, + { + "epoch": 11.77, + "grad_norm": 3.492959976196289, + "learning_rate": 1.8145595291539742e-05, + "loss": 1.1597, + "step": 39345 + }, + { + "epoch": 11.77, + "grad_norm": 3.515138864517212, + "learning_rate": 1.81399454113369e-05, + "loss": 1.0825, + "step": 39350 + }, + { + "epoch": 11.77, + "grad_norm": 2.3991005420684814, + "learning_rate": 1.8134295910007733e-05, + "loss": 1.1757, + "step": 39355 + }, + { + "epoch": 11.78, + "grad_norm": 4.532209396362305, + "learning_rate": 1.8128646787864274e-05, + "loss": 0.9928, + "step": 39360 + }, + { + "epoch": 11.78, + "grad_norm": 1.8209837675094604, + "learning_rate": 1.8122998045218515e-05, + "loss": 0.9525, + "step": 39365 + }, + { + "epoch": 11.78, + "grad_norm": 1.6788138151168823, + "learning_rate": 1.811734968238241e-05, + "loss": 1.1068, + "step": 39370 + }, + { + "epoch": 11.78, + "grad_norm": 1.9423623085021973, + "learning_rate": 1.8111701699667945e-05, + "loss": 0.9662, + "step": 39375 + }, + { + "epoch": 11.78, + "grad_norm": 1.7735339403152466, + "learning_rate": 1.8106054097387017e-05, + "loss": 1.1168, + "step": 39380 + }, + { + "epoch": 11.78, + "grad_norm": 4.587143898010254, + "learning_rate": 1.810040687585157e-05, + "loss": 1.0909, + "step": 39385 + }, + { + "epoch": 11.79, + "grad_norm": 2.695783853530884, + "learning_rate": 1.809476003537347e-05, + "loss": 1.0518, + "step": 39390 + }, + { + "epoch": 11.79, + "grad_norm": 1.4317811727523804, + "learning_rate": 1.80891135762646e-05, + "loss": 1.0841, + "step": 39395 + }, + { + "epoch": 11.79, + "grad_norm": 2.6233603954315186, + "learning_rate": 1.8083467498836805e-05, + "loss": 0.8933, + "step": 39400 + }, + { + "epoch": 11.79, + "grad_norm": 1.8944311141967773, + "learning_rate": 1.8077821803401903e-05, + "loss": 1.0322, + "step": 39405 + }, + { + "epoch": 11.79, + "grad_norm": 4.844879627227783, + "learning_rate": 1.807217649027172e-05, + "loss": 1.0349, + "step": 39410 + }, + { + "epoch": 11.79, + "grad_norm": 3.5124640464782715, + "learning_rate": 1.8066531559758015e-05, + "loss": 0.9921, + "step": 39415 + }, + { + "epoch": 11.79, + "grad_norm": 4.222341060638428, + "learning_rate": 1.8060887012172578e-05, + "loss": 1.0685, + "step": 39420 + }, + { + "epoch": 11.8, + "grad_norm": 2.961749792098999, + "learning_rate": 1.8055242847827137e-05, + "loss": 1.0769, + "step": 39425 + }, + { + "epoch": 11.8, + "grad_norm": 6.683197498321533, + "learning_rate": 1.8049599067033406e-05, + "loss": 1.1211, + "step": 39430 + }, + { + "epoch": 11.8, + "grad_norm": 2.078709840774536, + "learning_rate": 1.80439556701031e-05, + "loss": 1.2193, + "step": 39435 + }, + { + "epoch": 11.8, + "grad_norm": 3.6731183528900146, + "learning_rate": 1.803831265734789e-05, + "loss": 1.0561, + "step": 39440 + }, + { + "epoch": 11.8, + "grad_norm": 2.6240220069885254, + "learning_rate": 1.803267002907944e-05, + "loss": 0.9361, + "step": 39445 + }, + { + "epoch": 11.8, + "grad_norm": 5.170044898986816, + "learning_rate": 1.802702778560937e-05, + "loss": 1.033, + "step": 39450 + }, + { + "epoch": 11.8, + "grad_norm": 2.432077407836914, + "learning_rate": 1.802138592724932e-05, + "loss": 1.1533, + "step": 39455 + }, + { + "epoch": 11.81, + "grad_norm": 1.541792869567871, + "learning_rate": 1.8015744454310873e-05, + "loss": 1.0976, + "step": 39460 + }, + { + "epoch": 11.81, + "grad_norm": 2.662036180496216, + "learning_rate": 1.8010103367105588e-05, + "loss": 0.9965, + "step": 39465 + }, + { + "epoch": 11.81, + "grad_norm": 3.3465163707733154, + "learning_rate": 1.800446266594504e-05, + "loss": 1.0623, + "step": 39470 + }, + { + "epoch": 11.81, + "grad_norm": 2.513610601425171, + "learning_rate": 1.799882235114074e-05, + "loss": 0.9073, + "step": 39475 + }, + { + "epoch": 11.81, + "grad_norm": 5.069911956787109, + "learning_rate": 1.79931824230042e-05, + "loss": 1.2199, + "step": 39480 + }, + { + "epoch": 11.81, + "grad_norm": 2.737182378768921, + "learning_rate": 1.7987542881846918e-05, + "loss": 1.194, + "step": 39485 + }, + { + "epoch": 11.81, + "grad_norm": 4.385614395141602, + "learning_rate": 1.7981903727980355e-05, + "loss": 1.0531, + "step": 39490 + }, + { + "epoch": 11.82, + "grad_norm": 1.9035724401474, + "learning_rate": 1.7976264961715954e-05, + "loss": 1.0845, + "step": 39495 + }, + { + "epoch": 11.82, + "grad_norm": 2.0744688510894775, + "learning_rate": 1.7970626583365135e-05, + "loss": 0.9908, + "step": 39500 + }, + { + "epoch": 11.82, + "grad_norm": 1.704471468925476, + "learning_rate": 1.796498859323931e-05, + "loss": 1.0739, + "step": 39505 + }, + { + "epoch": 11.82, + "grad_norm": 4.32279634475708, + "learning_rate": 1.7959350991649843e-05, + "loss": 0.9117, + "step": 39510 + }, + { + "epoch": 11.82, + "grad_norm": 3.1391208171844482, + "learning_rate": 1.795371377890811e-05, + "loss": 0.9438, + "step": 39515 + }, + { + "epoch": 11.82, + "grad_norm": 6.020325183868408, + "learning_rate": 1.7948076955325445e-05, + "loss": 0.9606, + "step": 39520 + }, + { + "epoch": 11.83, + "grad_norm": 2.498509645462036, + "learning_rate": 1.7942440521213144e-05, + "loss": 1.0956, + "step": 39525 + }, + { + "epoch": 11.83, + "grad_norm": 2.654123544692993, + "learning_rate": 1.793680447688253e-05, + "loss": 0.9559, + "step": 39530 + }, + { + "epoch": 11.83, + "grad_norm": 3.5224146842956543, + "learning_rate": 1.7931168822644857e-05, + "loss": 1.0071, + "step": 39535 + }, + { + "epoch": 11.83, + "grad_norm": 2.877861499786377, + "learning_rate": 1.7925533558811386e-05, + "loss": 1.0302, + "step": 39540 + }, + { + "epoch": 11.83, + "grad_norm": 2.9893412590026855, + "learning_rate": 1.7919898685693336e-05, + "loss": 0.9003, + "step": 39545 + }, + { + "epoch": 11.83, + "grad_norm": 3.915611505508423, + "learning_rate": 1.7914264203601933e-05, + "loss": 1.0966, + "step": 39550 + }, + { + "epoch": 11.83, + "grad_norm": 4.270025730133057, + "learning_rate": 1.7908630112848345e-05, + "loss": 1.0488, + "step": 39555 + }, + { + "epoch": 11.84, + "grad_norm": 1.905087947845459, + "learning_rate": 1.7902996413743743e-05, + "loss": 1.0429, + "step": 39560 + }, + { + "epoch": 11.84, + "grad_norm": 1.7518643140792847, + "learning_rate": 1.789736310659928e-05, + "loss": 0.9564, + "step": 39565 + }, + { + "epoch": 11.84, + "grad_norm": 2.0850372314453125, + "learning_rate": 1.7891730191726062e-05, + "loss": 1.0715, + "step": 39570 + }, + { + "epoch": 11.84, + "grad_norm": 4.21657133102417, + "learning_rate": 1.7886097669435203e-05, + "loss": 1.169, + "step": 39575 + }, + { + "epoch": 11.84, + "grad_norm": 1.5663069486618042, + "learning_rate": 1.7880465540037772e-05, + "loss": 0.9148, + "step": 39580 + }, + { + "epoch": 11.84, + "grad_norm": 1.8661555051803589, + "learning_rate": 1.787483380384483e-05, + "loss": 0.9692, + "step": 39585 + }, + { + "epoch": 11.84, + "grad_norm": 2.8097341060638428, + "learning_rate": 1.7869202461167414e-05, + "loss": 1.1088, + "step": 39590 + }, + { + "epoch": 11.85, + "grad_norm": 10.885028839111328, + "learning_rate": 1.7863571512316525e-05, + "loss": 1.0241, + "step": 39595 + }, + { + "epoch": 11.85, + "grad_norm": 3.2606093883514404, + "learning_rate": 1.7857940957603175e-05, + "loss": 1.1048, + "step": 39600 + }, + { + "epoch": 11.85, + "grad_norm": 2.2225308418273926, + "learning_rate": 1.785231079733831e-05, + "loss": 1.0134, + "step": 39605 + }, + { + "epoch": 11.85, + "grad_norm": 3.4681246280670166, + "learning_rate": 1.7846681031832902e-05, + "loss": 1.014, + "step": 39610 + }, + { + "epoch": 11.85, + "grad_norm": 2.1326732635498047, + "learning_rate": 1.784105166139785e-05, + "loss": 1.0633, + "step": 39615 + }, + { + "epoch": 11.85, + "grad_norm": 3.486208200454712, + "learning_rate": 1.7835422686344088e-05, + "loss": 1.0031, + "step": 39620 + }, + { + "epoch": 11.86, + "grad_norm": 2.6152260303497314, + "learning_rate": 1.7829794106982487e-05, + "loss": 1.1336, + "step": 39625 + }, + { + "epoch": 11.86, + "grad_norm": 3.0266525745391846, + "learning_rate": 1.782416592362389e-05, + "loss": 1.2247, + "step": 39630 + }, + { + "epoch": 11.86, + "grad_norm": 1.3934903144836426, + "learning_rate": 1.781853813657916e-05, + "loss": 1.1949, + "step": 39635 + }, + { + "epoch": 11.86, + "grad_norm": 1.248276948928833, + "learning_rate": 1.7812910746159096e-05, + "loss": 1.0491, + "step": 39640 + }, + { + "epoch": 11.86, + "grad_norm": 3.2080233097076416, + "learning_rate": 1.780728375267451e-05, + "loss": 1.0669, + "step": 39645 + }, + { + "epoch": 11.86, + "grad_norm": 6.742861747741699, + "learning_rate": 1.7801657156436162e-05, + "loss": 1.0856, + "step": 39650 + }, + { + "epoch": 11.86, + "grad_norm": 2.66377592086792, + "learning_rate": 1.7796030957754806e-05, + "loss": 1.2409, + "step": 39655 + }, + { + "epoch": 11.87, + "grad_norm": 2.5184435844421387, + "learning_rate": 1.779040515694117e-05, + "loss": 1.0872, + "step": 39660 + }, + { + "epoch": 11.87, + "grad_norm": 2.7967371940612793, + "learning_rate": 1.7784779754305962e-05, + "loss": 1.0393, + "step": 39665 + }, + { + "epoch": 11.87, + "grad_norm": 2.9046332836151123, + "learning_rate": 1.7779154750159874e-05, + "loss": 1.0531, + "step": 39670 + }, + { + "epoch": 11.87, + "grad_norm": 1.2060468196868896, + "learning_rate": 1.777353014481356e-05, + "loss": 1.0541, + "step": 39675 + }, + { + "epoch": 11.87, + "grad_norm": 5.9001359939575195, + "learning_rate": 1.7767905938577666e-05, + "loss": 1.0902, + "step": 39680 + }, + { + "epoch": 11.87, + "grad_norm": 3.407264471054077, + "learning_rate": 1.7762282131762814e-05, + "loss": 1.0468, + "step": 39685 + }, + { + "epoch": 11.87, + "grad_norm": 2.291767120361328, + "learning_rate": 1.7756658724679588e-05, + "loss": 0.9659, + "step": 39690 + }, + { + "epoch": 11.88, + "grad_norm": 1.3889952898025513, + "learning_rate": 1.775103571763858e-05, + "loss": 1.0298, + "step": 39695 + }, + { + "epoch": 11.88, + "grad_norm": 2.6598963737487793, + "learning_rate": 1.7745413110950325e-05, + "loss": 1.1888, + "step": 39700 + }, + { + "epoch": 11.88, + "grad_norm": 5.673483371734619, + "learning_rate": 1.7739790904925374e-05, + "loss": 1.0024, + "step": 39705 + }, + { + "epoch": 11.88, + "grad_norm": 4.94851016998291, + "learning_rate": 1.7734169099874216e-05, + "loss": 1.051, + "step": 39710 + }, + { + "epoch": 11.88, + "grad_norm": 10.22668743133545, + "learning_rate": 1.772854769610735e-05, + "loss": 1.0193, + "step": 39715 + }, + { + "epoch": 11.88, + "grad_norm": 3.5006158351898193, + "learning_rate": 1.7722926693935242e-05, + "loss": 1.0393, + "step": 39720 + }, + { + "epoch": 11.89, + "grad_norm": 2.18442964553833, + "learning_rate": 1.771730609366832e-05, + "loss": 0.9004, + "step": 39725 + }, + { + "epoch": 11.89, + "grad_norm": 2.1316640377044678, + "learning_rate": 1.771168589561702e-05, + "loss": 1.0415, + "step": 39730 + }, + { + "epoch": 11.89, + "grad_norm": 7.090075492858887, + "learning_rate": 1.770606610009172e-05, + "loss": 1.0966, + "step": 39735 + }, + { + "epoch": 11.89, + "grad_norm": 4.611660480499268, + "learning_rate": 1.770044670740282e-05, + "loss": 0.9932, + "step": 39740 + }, + { + "epoch": 11.89, + "grad_norm": 2.1602795124053955, + "learning_rate": 1.769482771786065e-05, + "loss": 1.2849, + "step": 39745 + }, + { + "epoch": 11.89, + "grad_norm": 5.2523274421691895, + "learning_rate": 1.7689209131775563e-05, + "loss": 1.0831, + "step": 39750 + }, + { + "epoch": 11.89, + "grad_norm": 1.25748610496521, + "learning_rate": 1.7683590949457852e-05, + "loss": 1.0985, + "step": 39755 + }, + { + "epoch": 11.9, + "grad_norm": 1.3775397539138794, + "learning_rate": 1.7677973171217805e-05, + "loss": 1.0786, + "step": 39760 + }, + { + "epoch": 11.9, + "grad_norm": 1.7527154684066772, + "learning_rate": 1.767235579736569e-05, + "loss": 1.0301, + "step": 39765 + }, + { + "epoch": 11.9, + "grad_norm": 3.6977381706237793, + "learning_rate": 1.7666738828211742e-05, + "loss": 1.2242, + "step": 39770 + }, + { + "epoch": 11.9, + "grad_norm": 2.5999135971069336, + "learning_rate": 1.76611222640662e-05, + "loss": 1.0193, + "step": 39775 + }, + { + "epoch": 11.9, + "grad_norm": 5.089832782745361, + "learning_rate": 1.7655506105239243e-05, + "loss": 1.0719, + "step": 39780 + }, + { + "epoch": 11.9, + "grad_norm": 3.162248373031616, + "learning_rate": 1.764989035204104e-05, + "loss": 0.87, + "step": 39785 + }, + { + "epoch": 11.9, + "grad_norm": 3.2377114295959473, + "learning_rate": 1.7644275004781763e-05, + "loss": 1.0549, + "step": 39790 + }, + { + "epoch": 11.91, + "grad_norm": 1.3557695150375366, + "learning_rate": 1.7638660063771523e-05, + "loss": 1.0341, + "step": 39795 + }, + { + "epoch": 11.91, + "grad_norm": 2.794524669647217, + "learning_rate": 1.7633045529320445e-05, + "loss": 1.0422, + "step": 39800 + }, + { + "epoch": 11.91, + "grad_norm": 5.456986427307129, + "learning_rate": 1.7627431401738597e-05, + "loss": 0.9706, + "step": 39805 + }, + { + "epoch": 11.91, + "grad_norm": 2.9997246265411377, + "learning_rate": 1.7621817681336055e-05, + "loss": 1.0415, + "step": 39810 + }, + { + "epoch": 11.91, + "grad_norm": 3.4345109462738037, + "learning_rate": 1.761620436842286e-05, + "loss": 0.884, + "step": 39815 + }, + { + "epoch": 11.91, + "grad_norm": 3.6423003673553467, + "learning_rate": 1.7610591463309007e-05, + "loss": 1.1453, + "step": 39820 + }, + { + "epoch": 11.92, + "grad_norm": 3.724964141845703, + "learning_rate": 1.760497896630452e-05, + "loss": 1.1898, + "step": 39825 + }, + { + "epoch": 11.92, + "grad_norm": 3.579566478729248, + "learning_rate": 1.759936687771934e-05, + "loss": 1.0795, + "step": 39830 + }, + { + "epoch": 11.92, + "grad_norm": 6.6637492179870605, + "learning_rate": 1.7593755197863455e-05, + "loss": 1.0689, + "step": 39835 + }, + { + "epoch": 11.92, + "grad_norm": 3.4842472076416016, + "learning_rate": 1.758814392704676e-05, + "loss": 1.0492, + "step": 39840 + }, + { + "epoch": 11.92, + "grad_norm": 4.1526079177856445, + "learning_rate": 1.758253306557917e-05, + "loss": 0.9039, + "step": 39845 + }, + { + "epoch": 11.92, + "grad_norm": 2.869919538497925, + "learning_rate": 1.757692261377058e-05, + "loss": 1.0529, + "step": 39850 + }, + { + "epoch": 11.92, + "grad_norm": 1.4743778705596924, + "learning_rate": 1.7571312571930827e-05, + "loss": 0.9677, + "step": 39855 + }, + { + "epoch": 11.93, + "grad_norm": 5.439683437347412, + "learning_rate": 1.756570294036977e-05, + "loss": 1.0239, + "step": 39860 + }, + { + "epoch": 11.93, + "grad_norm": 1.5706872940063477, + "learning_rate": 1.7560093719397204e-05, + "loss": 1.0556, + "step": 39865 + }, + { + "epoch": 11.93, + "grad_norm": 4.678708553314209, + "learning_rate": 1.7554484909322933e-05, + "loss": 1.1842, + "step": 39870 + }, + { + "epoch": 11.93, + "grad_norm": 3.4976863861083984, + "learning_rate": 1.7548876510456723e-05, + "loss": 0.873, + "step": 39875 + }, + { + "epoch": 11.93, + "grad_norm": 3.4963254928588867, + "learning_rate": 1.7543268523108308e-05, + "loss": 0.9826, + "step": 39880 + }, + { + "epoch": 11.93, + "grad_norm": 4.180238246917725, + "learning_rate": 1.7537660947587434e-05, + "loss": 0.9779, + "step": 39885 + }, + { + "epoch": 11.93, + "grad_norm": 0.9406687617301941, + "learning_rate": 1.753205378420378e-05, + "loss": 1.0133, + "step": 39890 + }, + { + "epoch": 11.94, + "grad_norm": 2.034942865371704, + "learning_rate": 1.7526447033267037e-05, + "loss": 1.203, + "step": 39895 + }, + { + "epoch": 11.94, + "grad_norm": 2.6159539222717285, + "learning_rate": 1.7520840695086847e-05, + "loss": 1.1254, + "step": 39900 + }, + { + "epoch": 11.94, + "grad_norm": 4.698697566986084, + "learning_rate": 1.7515234769972865e-05, + "loss": 0.823, + "step": 39905 + }, + { + "epoch": 11.94, + "grad_norm": 3.892012119293213, + "learning_rate": 1.750962925823469e-05, + "loss": 0.9369, + "step": 39910 + }, + { + "epoch": 11.94, + "grad_norm": 1.37578284740448, + "learning_rate": 1.750402416018189e-05, + "loss": 1.1761, + "step": 39915 + }, + { + "epoch": 11.94, + "grad_norm": 3.057701349258423, + "learning_rate": 1.749841947612405e-05, + "loss": 0.9876, + "step": 39920 + }, + { + "epoch": 11.95, + "grad_norm": 4.663394451141357, + "learning_rate": 1.7492815206370705e-05, + "loss": 0.8152, + "step": 39925 + }, + { + "epoch": 11.95, + "grad_norm": 2.727992296218872, + "learning_rate": 1.748721135123137e-05, + "loss": 1.1597, + "step": 39930 + }, + { + "epoch": 11.95, + "grad_norm": 1.3559057712554932, + "learning_rate": 1.7481607911015547e-05, + "loss": 1.0632, + "step": 39935 + }, + { + "epoch": 11.95, + "grad_norm": 2.3270246982574463, + "learning_rate": 1.7476004886032703e-05, + "loss": 0.8817, + "step": 39940 + }, + { + "epoch": 11.95, + "grad_norm": 3.695523977279663, + "learning_rate": 1.7470402276592294e-05, + "loss": 0.9562, + "step": 39945 + }, + { + "epoch": 11.95, + "grad_norm": 2.7991878986358643, + "learning_rate": 1.746480008300373e-05, + "loss": 1.0675, + "step": 39950 + }, + { + "epoch": 11.95, + "grad_norm": 10.5833740234375, + "learning_rate": 1.7459198305576434e-05, + "loss": 1.0423, + "step": 39955 + }, + { + "epoch": 11.96, + "grad_norm": 1.6265363693237305, + "learning_rate": 1.745359694461977e-05, + "loss": 0.9417, + "step": 39960 + }, + { + "epoch": 11.96, + "grad_norm": 2.9523098468780518, + "learning_rate": 1.744799600044311e-05, + "loss": 1.2033, + "step": 39965 + }, + { + "epoch": 11.96, + "grad_norm": 3.3546805381774902, + "learning_rate": 1.744239547335577e-05, + "loss": 1.0307, + "step": 39970 + }, + { + "epoch": 11.96, + "grad_norm": 4.862351894378662, + "learning_rate": 1.7436795363667086e-05, + "loss": 0.9044, + "step": 39975 + }, + { + "epoch": 11.96, + "grad_norm": 3.834397792816162, + "learning_rate": 1.743119567168633e-05, + "loss": 0.9981, + "step": 39980 + }, + { + "epoch": 11.96, + "grad_norm": 5.2290472984313965, + "learning_rate": 1.742559639772276e-05, + "loss": 1.0151, + "step": 39985 + }, + { + "epoch": 11.96, + "grad_norm": 1.2269209623336792, + "learning_rate": 1.741999754208564e-05, + "loss": 0.915, + "step": 39990 + }, + { + "epoch": 11.97, + "grad_norm": 9.868966102600098, + "learning_rate": 1.7414399105084166e-05, + "loss": 0.9631, + "step": 39995 + }, + { + "epoch": 11.97, + "grad_norm": 4.046831130981445, + "learning_rate": 1.7408801087027554e-05, + "loss": 1.0331, + "step": 40000 + }, + { + "epoch": 11.97, + "grad_norm": 7.04620361328125, + "learning_rate": 1.740320348822496e-05, + "loss": 1.0635, + "step": 40005 + }, + { + "epoch": 11.97, + "grad_norm": 4.793135166168213, + "learning_rate": 1.739760630898554e-05, + "loss": 1.0377, + "step": 40010 + }, + { + "epoch": 11.97, + "grad_norm": 1.5962762832641602, + "learning_rate": 1.7392009549618426e-05, + "loss": 1.3054, + "step": 40015 + }, + { + "epoch": 11.97, + "grad_norm": 3.573862314224243, + "learning_rate": 1.7386413210432717e-05, + "loss": 1.1205, + "step": 40020 + }, + { + "epoch": 11.98, + "grad_norm": 2.5612030029296875, + "learning_rate": 1.7380817291737488e-05, + "loss": 0.9382, + "step": 40025 + }, + { + "epoch": 11.98, + "grad_norm": 4.043525218963623, + "learning_rate": 1.7375221793841806e-05, + "loss": 1.083, + "step": 40030 + }, + { + "epoch": 11.98, + "grad_norm": 3.7928805351257324, + "learning_rate": 1.7369626717054693e-05, + "loss": 1.1117, + "step": 40035 + }, + { + "epoch": 11.98, + "grad_norm": 2.3192451000213623, + "learning_rate": 1.7364032061685174e-05, + "loss": 1.0247, + "step": 40040 + }, + { + "epoch": 11.98, + "grad_norm": 3.9967002868652344, + "learning_rate": 1.7358437828042215e-05, + "loss": 0.9724, + "step": 40045 + }, + { + "epoch": 11.98, + "grad_norm": 4.295874118804932, + "learning_rate": 1.7352844016434802e-05, + "loss": 1.17, + "step": 40050 + }, + { + "epoch": 11.98, + "grad_norm": 2.709665536880493, + "learning_rate": 1.7347250627171857e-05, + "loss": 1.0567, + "step": 40055 + }, + { + "epoch": 11.99, + "grad_norm": 1.6827901601791382, + "learning_rate": 1.7341657660562313e-05, + "loss": 1.0409, + "step": 40060 + }, + { + "epoch": 11.99, + "grad_norm": 1.5551859140396118, + "learning_rate": 1.7336065116915045e-05, + "loss": 0.9164, + "step": 40065 + }, + { + "epoch": 11.99, + "grad_norm": 4.49882698059082, + "learning_rate": 1.7330472996538948e-05, + "loss": 0.9778, + "step": 40070 + }, + { + "epoch": 11.99, + "grad_norm": 8.409674644470215, + "learning_rate": 1.7324881299742855e-05, + "loss": 1.1369, + "step": 40075 + }, + { + "epoch": 11.99, + "grad_norm": 3.090298891067505, + "learning_rate": 1.7319290026835577e-05, + "loss": 1.0897, + "step": 40080 + }, + { + "epoch": 11.99, + "grad_norm": 4.631008625030518, + "learning_rate": 1.731369917812594e-05, + "loss": 1.0815, + "step": 40085 + }, + { + "epoch": 11.99, + "grad_norm": 2.0959417819976807, + "learning_rate": 1.7308108753922698e-05, + "loss": 0.944, + "step": 40090 + }, + { + "epoch": 12.0, + "grad_norm": 2.209991931915283, + "learning_rate": 1.730251875453462e-05, + "loss": 1.1075, + "step": 40095 + }, + { + "epoch": 12.0, + "grad_norm": 1.7672827243804932, + "learning_rate": 1.7296929180270424e-05, + "loss": 1.0148, + "step": 40100 + }, + { + "epoch": 12.0, + "grad_norm": 2.572972536087036, + "learning_rate": 1.7291340031438828e-05, + "loss": 0.9899, + "step": 40105 + }, + { + "epoch": 12.0, + "grad_norm": 3.605733871459961, + "learning_rate": 1.728575130834851e-05, + "loss": 1.1367, + "step": 40110 + }, + { + "epoch": 12.0, + "grad_norm": 0.9864242672920227, + "learning_rate": 1.7280163011308127e-05, + "loss": 1.0373, + "step": 40115 + }, + { + "epoch": 12.0, + "grad_norm": 2.625638484954834, + "learning_rate": 1.7274575140626318e-05, + "loss": 0.9886, + "step": 40120 + }, + { + "epoch": 12.0, + "grad_norm": 1.6979814767837524, + "learning_rate": 1.726898769661169e-05, + "loss": 0.9172, + "step": 40125 + }, + { + "epoch": 12.01, + "grad_norm": 3.1777591705322266, + "learning_rate": 1.7263400679572838e-05, + "loss": 1.1741, + "step": 40130 + }, + { + "epoch": 12.01, + "grad_norm": 1.9709137678146362, + "learning_rate": 1.725781408981833e-05, + "loss": 0.9846, + "step": 40135 + }, + { + "epoch": 12.01, + "grad_norm": 2.2092223167419434, + "learning_rate": 1.7252227927656692e-05, + "loss": 1.1, + "step": 40140 + }, + { + "epoch": 12.01, + "grad_norm": 3.6297123432159424, + "learning_rate": 1.7246642193396463e-05, + "loss": 1.1311, + "step": 40145 + }, + { + "epoch": 12.01, + "grad_norm": 2.822312831878662, + "learning_rate": 1.7241056887346115e-05, + "loss": 1.0686, + "step": 40150 + }, + { + "epoch": 12.01, + "grad_norm": 2.791294574737549, + "learning_rate": 1.7235472009814142e-05, + "loss": 1.0233, + "step": 40155 + }, + { + "epoch": 12.02, + "grad_norm": 2.6758475303649902, + "learning_rate": 1.7229887561108967e-05, + "loss": 0.8289, + "step": 40160 + }, + { + "epoch": 12.02, + "grad_norm": 1.5019391775131226, + "learning_rate": 1.7224303541539034e-05, + "loss": 1.0052, + "step": 40165 + }, + { + "epoch": 12.02, + "grad_norm": 2.995958089828491, + "learning_rate": 1.7218719951412736e-05, + "loss": 1.0393, + "step": 40170 + }, + { + "epoch": 12.02, + "grad_norm": 2.246119976043701, + "learning_rate": 1.7213136791038436e-05, + "loss": 1.0853, + "step": 40175 + }, + { + "epoch": 12.02, + "grad_norm": 5.3217315673828125, + "learning_rate": 1.7207554060724505e-05, + "loss": 1.0277, + "step": 40180 + }, + { + "epoch": 12.02, + "grad_norm": 1.9030908346176147, + "learning_rate": 1.720197176077926e-05, + "loss": 1.0401, + "step": 40185 + }, + { + "epoch": 12.02, + "grad_norm": 3.2368953227996826, + "learning_rate": 1.7196389891511017e-05, + "loss": 0.9992, + "step": 40190 + }, + { + "epoch": 12.03, + "grad_norm": 2.3075368404388428, + "learning_rate": 1.7190808453228035e-05, + "loss": 1.1156, + "step": 40195 + }, + { + "epoch": 12.03, + "grad_norm": 3.3374037742614746, + "learning_rate": 1.7185227446238597e-05, + "loss": 1.2066, + "step": 40200 + }, + { + "epoch": 12.03, + "grad_norm": 2.5891878604888916, + "learning_rate": 1.7179646870850917e-05, + "loss": 0.8088, + "step": 40205 + }, + { + "epoch": 12.03, + "grad_norm": 2.94810152053833, + "learning_rate": 1.7174066727373212e-05, + "loss": 0.9585, + "step": 40210 + }, + { + "epoch": 12.03, + "grad_norm": 1.5498286485671997, + "learning_rate": 1.7168487016113675e-05, + "loss": 1.1713, + "step": 40215 + }, + { + "epoch": 12.03, + "grad_norm": 2.1500444412231445, + "learning_rate": 1.7162907737380447e-05, + "loss": 1.0594, + "step": 40220 + }, + { + "epoch": 12.03, + "grad_norm": 2.072866439819336, + "learning_rate": 1.7157328891481688e-05, + "loss": 1.0718, + "step": 40225 + }, + { + "epoch": 12.04, + "grad_norm": 2.1537182331085205, + "learning_rate": 1.7151750478725506e-05, + "loss": 0.953, + "step": 40230 + }, + { + "epoch": 12.04, + "grad_norm": 3.077263116836548, + "learning_rate": 1.7146172499419976e-05, + "loss": 1.0168, + "step": 40235 + }, + { + "epoch": 12.04, + "grad_norm": 3.462986469268799, + "learning_rate": 1.7140594953873184e-05, + "loss": 0.8017, + "step": 40240 + }, + { + "epoch": 12.04, + "grad_norm": 2.716017723083496, + "learning_rate": 1.7135017842393156e-05, + "loss": 1.1061, + "step": 40245 + }, + { + "epoch": 12.04, + "grad_norm": 2.1912074089050293, + "learning_rate": 1.7129441165287923e-05, + "loss": 0.9462, + "step": 40250 + }, + { + "epoch": 12.04, + "grad_norm": 1.6843229532241821, + "learning_rate": 1.7123864922865468e-05, + "loss": 0.9333, + "step": 40255 + }, + { + "epoch": 12.05, + "grad_norm": 1.5522648096084595, + "learning_rate": 1.7118289115433774e-05, + "loss": 0.9716, + "step": 40260 + }, + { + "epoch": 12.05, + "grad_norm": 3.79538631439209, + "learning_rate": 1.7112713743300778e-05, + "loss": 0.9679, + "step": 40265 + }, + { + "epoch": 12.05, + "grad_norm": 2.5258116722106934, + "learning_rate": 1.7107138806774398e-05, + "loss": 1.0447, + "step": 40270 + }, + { + "epoch": 12.05, + "grad_norm": 1.7278646230697632, + "learning_rate": 1.7101564306162546e-05, + "loss": 1.0542, + "step": 40275 + }, + { + "epoch": 12.05, + "grad_norm": 3.5281360149383545, + "learning_rate": 1.7095990241773076e-05, + "loss": 0.9029, + "step": 40280 + }, + { + "epoch": 12.05, + "grad_norm": 2.533038854598999, + "learning_rate": 1.7090416613913863e-05, + "loss": 1.1102, + "step": 40285 + }, + { + "epoch": 12.05, + "grad_norm": 2.665400505065918, + "learning_rate": 1.7084843422892705e-05, + "loss": 0.9122, + "step": 40290 + }, + { + "epoch": 12.06, + "grad_norm": 3.1313700675964355, + "learning_rate": 1.7079270669017422e-05, + "loss": 1.0415, + "step": 40295 + }, + { + "epoch": 12.06, + "grad_norm": 1.9384442567825317, + "learning_rate": 1.707369835259579e-05, + "loss": 0.8205, + "step": 40300 + }, + { + "epoch": 12.06, + "grad_norm": 3.2962498664855957, + "learning_rate": 1.7068126473935552e-05, + "loss": 1.1216, + "step": 40305 + }, + { + "epoch": 12.06, + "grad_norm": 4.275677680969238, + "learning_rate": 1.7062555033344457e-05, + "loss": 0.9877, + "step": 40310 + }, + { + "epoch": 12.06, + "grad_norm": 2.7179315090179443, + "learning_rate": 1.705698403113018e-05, + "loss": 0.8647, + "step": 40315 + }, + { + "epoch": 12.06, + "grad_norm": 2.34228515625, + "learning_rate": 1.7051413467600435e-05, + "loss": 0.9374, + "step": 40320 + }, + { + "epoch": 12.06, + "grad_norm": 3.802971124649048, + "learning_rate": 1.704584334306285e-05, + "loss": 0.9489, + "step": 40325 + }, + { + "epoch": 12.07, + "grad_norm": 3.335583448410034, + "learning_rate": 1.704027365782508e-05, + "loss": 1.0729, + "step": 40330 + }, + { + "epoch": 12.07, + "grad_norm": 2.546466827392578, + "learning_rate": 1.7034704412194722e-05, + "loss": 0.991, + "step": 40335 + }, + { + "epoch": 12.07, + "grad_norm": 3.204293727874756, + "learning_rate": 1.7029135606479346e-05, + "loss": 1.137, + "step": 40340 + }, + { + "epoch": 12.07, + "grad_norm": 4.195864677429199, + "learning_rate": 1.702356724098654e-05, + "loss": 0.9883, + "step": 40345 + }, + { + "epoch": 12.07, + "grad_norm": 2.6011598110198975, + "learning_rate": 1.7017999316023814e-05, + "loss": 1.0274, + "step": 40350 + }, + { + "epoch": 12.07, + "grad_norm": 1.9285248517990112, + "learning_rate": 1.7012431831898696e-05, + "loss": 0.9519, + "step": 40355 + }, + { + "epoch": 12.08, + "grad_norm": 1.2027796506881714, + "learning_rate": 1.700686478891867e-05, + "loss": 1.2077, + "step": 40360 + }, + { + "epoch": 12.08, + "grad_norm": 13.604466438293457, + "learning_rate": 1.700129818739118e-05, + "loss": 0.802, + "step": 40365 + }, + { + "epoch": 12.08, + "grad_norm": 2.9307985305786133, + "learning_rate": 1.6995732027623677e-05, + "loss": 1.0621, + "step": 40370 + }, + { + "epoch": 12.08, + "grad_norm": 3.506657838821411, + "learning_rate": 1.6990166309923584e-05, + "loss": 1.0037, + "step": 40375 + }, + { + "epoch": 12.08, + "grad_norm": 3.7252209186553955, + "learning_rate": 1.6984601034598273e-05, + "loss": 1.0522, + "step": 40380 + }, + { + "epoch": 12.08, + "grad_norm": 3.6333189010620117, + "learning_rate": 1.6979036201955118e-05, + "loss": 1.1053, + "step": 40385 + }, + { + "epoch": 12.08, + "grad_norm": 1.9753819704055786, + "learning_rate": 1.6973471812301456e-05, + "loss": 0.9674, + "step": 40390 + }, + { + "epoch": 12.09, + "grad_norm": 1.905187964439392, + "learning_rate": 1.6967907865944608e-05, + "loss": 1.0107, + "step": 40395 + }, + { + "epoch": 12.09, + "grad_norm": 4.743467807769775, + "learning_rate": 1.6962344363191846e-05, + "loss": 0.8817, + "step": 40400 + }, + { + "epoch": 12.09, + "grad_norm": 1.3228265047073364, + "learning_rate": 1.6956781304350466e-05, + "loss": 0.9523, + "step": 40405 + }, + { + "epoch": 12.09, + "grad_norm": 2.1767237186431885, + "learning_rate": 1.695121868972768e-05, + "loss": 1.177, + "step": 40410 + }, + { + "epoch": 12.09, + "grad_norm": 2.85136342048645, + "learning_rate": 1.694565651963073e-05, + "loss": 1.2009, + "step": 40415 + }, + { + "epoch": 12.09, + "grad_norm": 1.4177536964416504, + "learning_rate": 1.694009479436679e-05, + "loss": 1.0331, + "step": 40420 + }, + { + "epoch": 12.09, + "grad_norm": 3.8098344802856445, + "learning_rate": 1.6934533514243046e-05, + "loss": 1.0316, + "step": 40425 + }, + { + "epoch": 12.1, + "grad_norm": 2.5896317958831787, + "learning_rate": 1.6928972679566633e-05, + "loss": 1.1835, + "step": 40430 + }, + { + "epoch": 12.1, + "grad_norm": 1.0999228954315186, + "learning_rate": 1.692341229064466e-05, + "loss": 1.1264, + "step": 40435 + }, + { + "epoch": 12.1, + "grad_norm": 3.237612009048462, + "learning_rate": 1.691785234778424e-05, + "loss": 0.9797, + "step": 40440 + }, + { + "epoch": 12.1, + "grad_norm": 2.606142520904541, + "learning_rate": 1.691229285129242e-05, + "loss": 0.9886, + "step": 40445 + }, + { + "epoch": 12.1, + "grad_norm": 5.350770950317383, + "learning_rate": 1.6906733801476275e-05, + "loss": 0.9763, + "step": 40450 + }, + { + "epoch": 12.1, + "grad_norm": 1.0530993938446045, + "learning_rate": 1.69011751986428e-05, + "loss": 0.9137, + "step": 40455 + }, + { + "epoch": 12.11, + "grad_norm": 5.592156887054443, + "learning_rate": 1.6895617043099006e-05, + "loss": 1.0207, + "step": 40460 + }, + { + "epoch": 12.11, + "grad_norm": 2.4852864742279053, + "learning_rate": 1.6890059335151856e-05, + "loss": 1.008, + "step": 40465 + }, + { + "epoch": 12.11, + "grad_norm": 2.4040777683258057, + "learning_rate": 1.6884502075108298e-05, + "loss": 1.0699, + "step": 40470 + }, + { + "epoch": 12.11, + "grad_norm": 2.1579723358154297, + "learning_rate": 1.687894526327526e-05, + "loss": 0.9733, + "step": 40475 + }, + { + "epoch": 12.11, + "grad_norm": 3.1468398571014404, + "learning_rate": 1.6873388899959625e-05, + "loss": 1.0083, + "step": 40480 + }, + { + "epoch": 12.11, + "grad_norm": 2.4095633029937744, + "learning_rate": 1.686783298546828e-05, + "loss": 1.0388, + "step": 40485 + }, + { + "epoch": 12.11, + "grad_norm": 3.516322612762451, + "learning_rate": 1.686227752010807e-05, + "loss": 1.0025, + "step": 40490 + }, + { + "epoch": 12.12, + "grad_norm": 4.202288627624512, + "learning_rate": 1.6856722504185802e-05, + "loss": 1.0996, + "step": 40495 + }, + { + "epoch": 12.12, + "grad_norm": 3.198098659515381, + "learning_rate": 1.68511679380083e-05, + "loss": 0.8971, + "step": 40500 + }, + { + "epoch": 12.12, + "grad_norm": 6.3249993324279785, + "learning_rate": 1.684561382188231e-05, + "loss": 0.9876, + "step": 40505 + }, + { + "epoch": 12.12, + "grad_norm": 1.2453722953796387, + "learning_rate": 1.68400601561146e-05, + "loss": 0.9331, + "step": 40510 + }, + { + "epoch": 12.12, + "grad_norm": 3.7218070030212402, + "learning_rate": 1.683450694101188e-05, + "loss": 0.9366, + "step": 40515 + }, + { + "epoch": 12.12, + "grad_norm": 2.971968412399292, + "learning_rate": 1.682895417688086e-05, + "loss": 1.0184, + "step": 40520 + }, + { + "epoch": 12.12, + "grad_norm": 2.1106045246124268, + "learning_rate": 1.682340186402821e-05, + "loss": 0.9095, + "step": 40525 + }, + { + "epoch": 12.13, + "grad_norm": 1.3278517723083496, + "learning_rate": 1.6817850002760565e-05, + "loss": 1.1384, + "step": 40530 + }, + { + "epoch": 12.13, + "grad_norm": 1.8929804563522339, + "learning_rate": 1.6812298593384574e-05, + "loss": 1.029, + "step": 40535 + }, + { + "epoch": 12.13, + "grad_norm": 1.8360497951507568, + "learning_rate": 1.6806747636206804e-05, + "loss": 0.9871, + "step": 40540 + }, + { + "epoch": 12.13, + "grad_norm": 1.8714280128479004, + "learning_rate": 1.680119713153386e-05, + "loss": 1.0098, + "step": 40545 + }, + { + "epoch": 12.13, + "grad_norm": 2.2284882068634033, + "learning_rate": 1.6795647079672262e-05, + "loss": 0.9422, + "step": 40550 + }, + { + "epoch": 12.13, + "grad_norm": 2.5915369987487793, + "learning_rate": 1.6790097480928562e-05, + "loss": 0.9804, + "step": 40555 + }, + { + "epoch": 12.14, + "grad_norm": 1.8112045526504517, + "learning_rate": 1.678454833560924e-05, + "loss": 0.8877, + "step": 40560 + }, + { + "epoch": 12.14, + "grad_norm": 2.2570388317108154, + "learning_rate": 1.677899964402077e-05, + "loss": 0.8855, + "step": 40565 + }, + { + "epoch": 12.14, + "grad_norm": 2.4070281982421875, + "learning_rate": 1.6773451406469607e-05, + "loss": 1.2104, + "step": 40570 + }, + { + "epoch": 12.14, + "grad_norm": 1.3471062183380127, + "learning_rate": 1.6767903623262168e-05, + "loss": 1.1283, + "step": 40575 + }, + { + "epoch": 12.14, + "grad_norm": 1.2536911964416504, + "learning_rate": 1.6762356294704863e-05, + "loss": 1.0132, + "step": 40580 + }, + { + "epoch": 12.14, + "grad_norm": 4.913021564483643, + "learning_rate": 1.675680942110406e-05, + "loss": 1.1661, + "step": 40585 + }, + { + "epoch": 12.14, + "grad_norm": 3.420980215072632, + "learning_rate": 1.675126300276609e-05, + "loss": 0.7853, + "step": 40590 + }, + { + "epoch": 12.15, + "grad_norm": 4.242785930633545, + "learning_rate": 1.6745717039997303e-05, + "loss": 0.8756, + "step": 40595 + }, + { + "epoch": 12.15, + "grad_norm": 4.858880519866943, + "learning_rate": 1.6740171533103974e-05, + "loss": 0.894, + "step": 40600 + }, + { + "epoch": 12.15, + "grad_norm": 1.5555437803268433, + "learning_rate": 1.6734626482392397e-05, + "loss": 0.9653, + "step": 40605 + }, + { + "epoch": 12.15, + "grad_norm": 2.0157172679901123, + "learning_rate": 1.6729081888168794e-05, + "loss": 1.0733, + "step": 40610 + }, + { + "epoch": 12.15, + "grad_norm": 5.87971830368042, + "learning_rate": 1.6723537750739414e-05, + "loss": 1.0843, + "step": 40615 + }, + { + "epoch": 12.15, + "grad_norm": 3.524643659591675, + "learning_rate": 1.6717994070410442e-05, + "loss": 1.0036, + "step": 40620 + }, + { + "epoch": 12.15, + "grad_norm": 3.6206631660461426, + "learning_rate": 1.6712450847488037e-05, + "loss": 1.038, + "step": 40625 + }, + { + "epoch": 12.16, + "grad_norm": 3.202336072921753, + "learning_rate": 1.6706908082278368e-05, + "loss": 1.021, + "step": 40630 + }, + { + "epoch": 12.16, + "grad_norm": 1.4924439191818237, + "learning_rate": 1.6701365775087534e-05, + "loss": 1.123, + "step": 40635 + }, + { + "epoch": 12.16, + "grad_norm": 2.8223538398742676, + "learning_rate": 1.669582392622165e-05, + "loss": 1.1115, + "step": 40640 + }, + { + "epoch": 12.16, + "grad_norm": 4.758267879486084, + "learning_rate": 1.6690282535986775e-05, + "loss": 1.0646, + "step": 40645 + }, + { + "epoch": 12.16, + "grad_norm": 3.9418559074401855, + "learning_rate": 1.6684741604688962e-05, + "loss": 1.0262, + "step": 40650 + }, + { + "epoch": 12.16, + "grad_norm": 2.5239322185516357, + "learning_rate": 1.6679201132634227e-05, + "loss": 0.9649, + "step": 40655 + }, + { + "epoch": 12.17, + "grad_norm": 2.7712440490722656, + "learning_rate": 1.667366112012856e-05, + "loss": 0.9846, + "step": 40660 + }, + { + "epoch": 12.17, + "grad_norm": 1.613471269607544, + "learning_rate": 1.666812156747794e-05, + "loss": 0.9812, + "step": 40665 + }, + { + "epoch": 12.17, + "grad_norm": 3.2598836421966553, + "learning_rate": 1.6662582474988297e-05, + "loss": 1.1001, + "step": 40670 + }, + { + "epoch": 12.17, + "grad_norm": 4.2924346923828125, + "learning_rate": 1.665704384296557e-05, + "loss": 1.0433, + "step": 40675 + }, + { + "epoch": 12.17, + "grad_norm": 3.4734883308410645, + "learning_rate": 1.6651505671715628e-05, + "loss": 1.083, + "step": 40680 + }, + { + "epoch": 12.17, + "grad_norm": 3.842501401901245, + "learning_rate": 1.6645967961544357e-05, + "loss": 0.8934, + "step": 40685 + }, + { + "epoch": 12.17, + "grad_norm": 3.126307725906372, + "learning_rate": 1.6640430712757594e-05, + "loss": 0.8845, + "step": 40690 + }, + { + "epoch": 12.18, + "grad_norm": 3.3595333099365234, + "learning_rate": 1.6634893925661142e-05, + "loss": 1.0184, + "step": 40695 + }, + { + "epoch": 12.18, + "grad_norm": 2.7751407623291016, + "learning_rate": 1.662935760056082e-05, + "loss": 0.8099, + "step": 40700 + }, + { + "epoch": 12.18, + "grad_norm": 4.374361038208008, + "learning_rate": 1.6623821737762362e-05, + "loss": 0.89, + "step": 40705 + }, + { + "epoch": 12.18, + "grad_norm": 3.8457541465759277, + "learning_rate": 1.6618286337571532e-05, + "loss": 1.0311, + "step": 40710 + }, + { + "epoch": 12.18, + "grad_norm": 4.070710182189941, + "learning_rate": 1.661275140029404e-05, + "loss": 1.0774, + "step": 40715 + }, + { + "epoch": 12.18, + "grad_norm": 1.868833303451538, + "learning_rate": 1.6607216926235552e-05, + "loss": 1.1075, + "step": 40720 + }, + { + "epoch": 12.18, + "grad_norm": 2.962254524230957, + "learning_rate": 1.6601682915701767e-05, + "loss": 0.7483, + "step": 40725 + }, + { + "epoch": 12.19, + "grad_norm": 1.097212553024292, + "learning_rate": 1.659614936899829e-05, + "loss": 0.9267, + "step": 40730 + }, + { + "epoch": 12.19, + "grad_norm": 1.6510802507400513, + "learning_rate": 1.6590616286430754e-05, + "loss": 1.0181, + "step": 40735 + }, + { + "epoch": 12.19, + "grad_norm": 2.0439059734344482, + "learning_rate": 1.658508366830474e-05, + "loss": 1.1101, + "step": 40740 + }, + { + "epoch": 12.19, + "grad_norm": 3.3008670806884766, + "learning_rate": 1.6579551514925812e-05, + "loss": 1.0038, + "step": 40745 + }, + { + "epoch": 12.19, + "grad_norm": 1.4634921550750732, + "learning_rate": 1.65740198265995e-05, + "loss": 1.0533, + "step": 40750 + }, + { + "epoch": 12.19, + "grad_norm": 2.8518662452697754, + "learning_rate": 1.656848860363131e-05, + "loss": 1.0689, + "step": 40755 + }, + { + "epoch": 12.19, + "grad_norm": 2.9406023025512695, + "learning_rate": 1.6562957846326738e-05, + "loss": 1.1724, + "step": 40760 + }, + { + "epoch": 12.2, + "grad_norm": 4.297204971313477, + "learning_rate": 1.6557427554991222e-05, + "loss": 1.0288, + "step": 40765 + }, + { + "epoch": 12.2, + "grad_norm": 1.472741723060608, + "learning_rate": 1.655189772993022e-05, + "loss": 1.1269, + "step": 40770 + }, + { + "epoch": 12.2, + "grad_norm": 2.547549247741699, + "learning_rate": 1.6546368371449115e-05, + "loss": 1.1671, + "step": 40775 + }, + { + "epoch": 12.2, + "grad_norm": 2.158825159072876, + "learning_rate": 1.6540839479853305e-05, + "loss": 1.0053, + "step": 40780 + }, + { + "epoch": 12.2, + "grad_norm": 2.1777453422546387, + "learning_rate": 1.653531105544814e-05, + "loss": 1.0576, + "step": 40785 + }, + { + "epoch": 12.2, + "grad_norm": 3.722043991088867, + "learning_rate": 1.6529783098538937e-05, + "loss": 1.0552, + "step": 40790 + }, + { + "epoch": 12.21, + "grad_norm": 9.3829984664917, + "learning_rate": 1.6524255609431018e-05, + "loss": 0.8906, + "step": 40795 + }, + { + "epoch": 12.21, + "grad_norm": 1.8850293159484863, + "learning_rate": 1.6518728588429643e-05, + "loss": 0.9639, + "step": 40800 + }, + { + "epoch": 12.21, + "grad_norm": 4.943428993225098, + "learning_rate": 1.651320203584008e-05, + "loss": 0.8564, + "step": 40805 + }, + { + "epoch": 12.21, + "grad_norm": 1.9969452619552612, + "learning_rate": 1.650767595196754e-05, + "loss": 0.9259, + "step": 40810 + }, + { + "epoch": 12.21, + "grad_norm": 3.900078773498535, + "learning_rate": 1.650215033711724e-05, + "loss": 1.0503, + "step": 40815 + }, + { + "epoch": 12.21, + "grad_norm": 6.483584880828857, + "learning_rate": 1.6496625191594335e-05, + "loss": 1.1011, + "step": 40820 + }, + { + "epoch": 12.21, + "grad_norm": 3.2015042304992676, + "learning_rate": 1.6491100515703984e-05, + "loss": 0.9268, + "step": 40825 + }, + { + "epoch": 12.22, + "grad_norm": 1.9813512563705444, + "learning_rate": 1.6485576309751304e-05, + "loss": 0.7646, + "step": 40830 + }, + { + "epoch": 12.22, + "grad_norm": 2.6845803260803223, + "learning_rate": 1.6480052574041398e-05, + "loss": 1.1781, + "step": 40835 + }, + { + "epoch": 12.22, + "grad_norm": 5.457139492034912, + "learning_rate": 1.647452930887933e-05, + "loss": 1.0875, + "step": 40840 + }, + { + "epoch": 12.22, + "grad_norm": 2.0619680881500244, + "learning_rate": 1.6469006514570158e-05, + "loss": 1.136, + "step": 40845 + }, + { + "epoch": 12.22, + "grad_norm": 3.572672128677368, + "learning_rate": 1.646348419141887e-05, + "loss": 0.893, + "step": 40850 + }, + { + "epoch": 12.22, + "grad_norm": 3.076113700866699, + "learning_rate": 1.6457962339730492e-05, + "loss": 0.948, + "step": 40855 + }, + { + "epoch": 12.22, + "grad_norm": 2.0957534313201904, + "learning_rate": 1.6452440959809965e-05, + "loss": 0.9835, + "step": 40860 + }, + { + "epoch": 12.23, + "grad_norm": 5.54816198348999, + "learning_rate": 1.6446920051962247e-05, + "loss": 0.9175, + "step": 40865 + }, + { + "epoch": 12.23, + "grad_norm": 2.4465866088867188, + "learning_rate": 1.6441399616492238e-05, + "loss": 1.0556, + "step": 40870 + }, + { + "epoch": 12.23, + "grad_norm": 2.606534957885742, + "learning_rate": 1.6435879653704835e-05, + "loss": 1.0207, + "step": 40875 + }, + { + "epoch": 12.23, + "grad_norm": 2.173264980316162, + "learning_rate": 1.6430360163904902e-05, + "loss": 1.1046, + "step": 40880 + }, + { + "epoch": 12.23, + "grad_norm": 2.238013744354248, + "learning_rate": 1.6424841147397256e-05, + "loss": 1.0992, + "step": 40885 + }, + { + "epoch": 12.23, + "grad_norm": 1.3038547039031982, + "learning_rate": 1.6419322604486737e-05, + "loss": 0.9191, + "step": 40890 + }, + { + "epoch": 12.24, + "grad_norm": 1.2264683246612549, + "learning_rate": 1.6413804535478095e-05, + "loss": 1.0838, + "step": 40895 + }, + { + "epoch": 12.24, + "grad_norm": 4.56827974319458, + "learning_rate": 1.6408286940676114e-05, + "loss": 0.986, + "step": 40900 + }, + { + "epoch": 12.24, + "grad_norm": 1.0923278331756592, + "learning_rate": 1.6402769820385504e-05, + "loss": 1.0315, + "step": 40905 + }, + { + "epoch": 12.24, + "grad_norm": 1.2232002019882202, + "learning_rate": 1.6397253174910997e-05, + "loss": 1.0501, + "step": 40910 + }, + { + "epoch": 12.24, + "grad_norm": 1.501046895980835, + "learning_rate": 1.6391737004557246e-05, + "loss": 0.9132, + "step": 40915 + }, + { + "epoch": 12.24, + "grad_norm": 0.9044210910797119, + "learning_rate": 1.638622130962891e-05, + "loss": 1.017, + "step": 40920 + }, + { + "epoch": 12.24, + "grad_norm": 2.844947576522827, + "learning_rate": 1.638070609043062e-05, + "loss": 0.9969, + "step": 40925 + }, + { + "epoch": 12.25, + "grad_norm": 4.111929893493652, + "learning_rate": 1.637519134726697e-05, + "loss": 1.0534, + "step": 40930 + }, + { + "epoch": 12.25, + "grad_norm": 1.8732011318206787, + "learning_rate": 1.636967708044254e-05, + "loss": 0.888, + "step": 40935 + }, + { + "epoch": 12.25, + "grad_norm": 2.614795446395874, + "learning_rate": 1.636416329026188e-05, + "loss": 0.9893, + "step": 40940 + }, + { + "epoch": 12.25, + "grad_norm": 2.9741506576538086, + "learning_rate": 1.6358649977029493e-05, + "loss": 1.0112, + "step": 40945 + }, + { + "epoch": 12.25, + "grad_norm": 4.53338098526001, + "learning_rate": 1.63531371410499e-05, + "loss": 0.9482, + "step": 40950 + }, + { + "epoch": 12.25, + "grad_norm": 1.6804264783859253, + "learning_rate": 1.634762478262754e-05, + "loss": 0.9047, + "step": 40955 + }, + { + "epoch": 12.25, + "grad_norm": 5.280777931213379, + "learning_rate": 1.634211290206688e-05, + "loss": 0.8929, + "step": 40960 + }, + { + "epoch": 12.26, + "grad_norm": 2.2232463359832764, + "learning_rate": 1.6336601499672316e-05, + "loss": 1.0002, + "step": 40965 + }, + { + "epoch": 12.26, + "grad_norm": 3.6429049968719482, + "learning_rate": 1.633109057574826e-05, + "loss": 0.9099, + "step": 40970 + }, + { + "epoch": 12.26, + "grad_norm": 4.826415538787842, + "learning_rate": 1.6325580130599054e-05, + "loss": 0.9408, + "step": 40975 + }, + { + "epoch": 12.26, + "grad_norm": 1.3283343315124512, + "learning_rate": 1.6320070164529033e-05, + "loss": 0.9633, + "step": 40980 + }, + { + "epoch": 12.26, + "grad_norm": 4.262852191925049, + "learning_rate": 1.6314560677842526e-05, + "loss": 1.0573, + "step": 40985 + }, + { + "epoch": 12.26, + "grad_norm": 1.9915564060211182, + "learning_rate": 1.6309051670843794e-05, + "loss": 0.9936, + "step": 40990 + }, + { + "epoch": 12.27, + "grad_norm": 2.673431634902954, + "learning_rate": 1.6303543143837113e-05, + "loss": 0.9722, + "step": 40995 + }, + { + "epoch": 12.27, + "grad_norm": 2.525904417037964, + "learning_rate": 1.6298035097126698e-05, + "loss": 1.1012, + "step": 41000 + }, + { + "epoch": 12.27, + "grad_norm": 2.020827054977417, + "learning_rate": 1.629252753101677e-05, + "loss": 1.0946, + "step": 41005 + }, + { + "epoch": 12.27, + "grad_norm": 2.497849941253662, + "learning_rate": 1.6287020445811485e-05, + "loss": 1.1731, + "step": 41010 + }, + { + "epoch": 12.27, + "grad_norm": 2.420525312423706, + "learning_rate": 1.6281513841815006e-05, + "loss": 0.9534, + "step": 41015 + }, + { + "epoch": 12.27, + "grad_norm": 1.6529130935668945, + "learning_rate": 1.627600771933146e-05, + "loss": 1.0972, + "step": 41020 + }, + { + "epoch": 12.27, + "grad_norm": 2.2307538986206055, + "learning_rate": 1.6270502078664927e-05, + "loss": 0.9837, + "step": 41025 + }, + { + "epoch": 12.28, + "grad_norm": 4.9482269287109375, + "learning_rate": 1.6264996920119507e-05, + "loss": 0.7897, + "step": 41030 + }, + { + "epoch": 12.28, + "grad_norm": 1.9426223039627075, + "learning_rate": 1.6259492243999215e-05, + "loss": 1.088, + "step": 41035 + }, + { + "epoch": 12.28, + "grad_norm": 2.4098875522613525, + "learning_rate": 1.6253988050608092e-05, + "loss": 1.014, + "step": 41040 + }, + { + "epoch": 12.28, + "grad_norm": 3.1415224075317383, + "learning_rate": 1.6248484340250114e-05, + "loss": 1.0839, + "step": 41045 + }, + { + "epoch": 12.28, + "grad_norm": 3.377718687057495, + "learning_rate": 1.6242981113229245e-05, + "loss": 0.8829, + "step": 41050 + }, + { + "epoch": 12.28, + "grad_norm": 2.4279189109802246, + "learning_rate": 1.6237478369849433e-05, + "loss": 1.1367, + "step": 41055 + }, + { + "epoch": 12.28, + "grad_norm": 5.114649295806885, + "learning_rate": 1.6231976110414574e-05, + "loss": 1.0176, + "step": 41060 + }, + { + "epoch": 12.29, + "grad_norm": 3.2667107582092285, + "learning_rate": 1.622647433522857e-05, + "loss": 1.0747, + "step": 41065 + }, + { + "epoch": 12.29, + "grad_norm": 3.784885883331299, + "learning_rate": 1.6220973044595267e-05, + "loss": 1.0822, + "step": 41070 + }, + { + "epoch": 12.29, + "grad_norm": 2.1055243015289307, + "learning_rate": 1.621547223881849e-05, + "loss": 1.1663, + "step": 41075 + }, + { + "epoch": 12.29, + "grad_norm": 1.5692771673202515, + "learning_rate": 1.620997191820206e-05, + "loss": 0.9983, + "step": 41080 + }, + { + "epoch": 12.29, + "grad_norm": 2.18072772026062, + "learning_rate": 1.620447208304973e-05, + "loss": 0.805, + "step": 41085 + }, + { + "epoch": 12.29, + "grad_norm": 6.3626933097839355, + "learning_rate": 1.6198972733665284e-05, + "loss": 0.8722, + "step": 41090 + }, + { + "epoch": 12.3, + "grad_norm": 2.6465694904327393, + "learning_rate": 1.6193473870352408e-05, + "loss": 1.124, + "step": 41095 + }, + { + "epoch": 12.3, + "grad_norm": 1.850497841835022, + "learning_rate": 1.6187975493414825e-05, + "loss": 0.909, + "step": 41100 + }, + { + "epoch": 12.3, + "grad_norm": 3.717559576034546, + "learning_rate": 1.61824776031562e-05, + "loss": 0.9344, + "step": 41105 + }, + { + "epoch": 12.3, + "grad_norm": 1.314609408378601, + "learning_rate": 1.6176980199880158e-05, + "loss": 0.9273, + "step": 41110 + }, + { + "epoch": 12.3, + "grad_norm": 1.5568089485168457, + "learning_rate": 1.6171483283890342e-05, + "loss": 1.0734, + "step": 41115 + }, + { + "epoch": 12.3, + "grad_norm": 4.105639457702637, + "learning_rate": 1.6165986855490316e-05, + "loss": 0.9675, + "step": 41120 + }, + { + "epoch": 12.3, + "grad_norm": 4.530340194702148, + "learning_rate": 1.6160490914983667e-05, + "loss": 1.1482, + "step": 41125 + }, + { + "epoch": 12.31, + "grad_norm": Infinity, + "learning_rate": 1.6156094514065535e-05, + "loss": 1.1886, + "step": 41130 + }, + { + "epoch": 12.31, + "grad_norm": 6.879144191741943, + "learning_rate": 1.615059945253183e-05, + "loss": 1.0541, + "step": 41135 + }, + { + "epoch": 12.31, + "grad_norm": 5.159574508666992, + "learning_rate": 1.6145104879741307e-05, + "loss": 1.0869, + "step": 41140 + }, + { + "epoch": 12.31, + "grad_norm": 3.31752872467041, + "learning_rate": 1.6139610795997448e-05, + "loss": 0.8379, + "step": 41145 + }, + { + "epoch": 12.31, + "grad_norm": 1.4643511772155762, + "learning_rate": 1.6134117201603662e-05, + "loss": 1.0015, + "step": 41150 + }, + { + "epoch": 12.31, + "grad_norm": 1.95048189163208, + "learning_rate": 1.612862409686338e-05, + "loss": 1.1556, + "step": 41155 + }, + { + "epoch": 12.31, + "grad_norm": 3.2634193897247314, + "learning_rate": 1.6123131482079962e-05, + "loss": 1.2138, + "step": 41160 + }, + { + "epoch": 12.32, + "grad_norm": 2.3244948387145996, + "learning_rate": 1.6117639357556767e-05, + "loss": 1.0905, + "step": 41165 + }, + { + "epoch": 12.32, + "grad_norm": 5.133475303649902, + "learning_rate": 1.6112147723597116e-05, + "loss": 1.0629, + "step": 41170 + }, + { + "epoch": 12.32, + "grad_norm": 2.374720811843872, + "learning_rate": 1.610665658050431e-05, + "loss": 1.0795, + "step": 41175 + }, + { + "epoch": 12.32, + "grad_norm": 2.306103229522705, + "learning_rate": 1.6101165928581612e-05, + "loss": 1.022, + "step": 41180 + }, + { + "epoch": 12.32, + "grad_norm": 2.1258509159088135, + "learning_rate": 1.6095675768132273e-05, + "loss": 0.8965, + "step": 41185 + }, + { + "epoch": 12.32, + "grad_norm": 4.365717887878418, + "learning_rate": 1.6090186099459505e-05, + "loss": 1.1589, + "step": 41190 + }, + { + "epoch": 12.33, + "grad_norm": 3.533360481262207, + "learning_rate": 1.6084696922866504e-05, + "loss": 0.9771, + "step": 41195 + }, + { + "epoch": 12.33, + "grad_norm": 1.57905113697052, + "learning_rate": 1.6079208238656414e-05, + "loss": 1.1706, + "step": 41200 + }, + { + "epoch": 12.33, + "grad_norm": 3.772719144821167, + "learning_rate": 1.607372004713239e-05, + "loss": 0.9388, + "step": 41205 + }, + { + "epoch": 12.33, + "grad_norm": 2.236943006515503, + "learning_rate": 1.606823234859752e-05, + "loss": 1.0367, + "step": 41210 + }, + { + "epoch": 12.33, + "grad_norm": 1.4967836141586304, + "learning_rate": 1.6062745143354903e-05, + "loss": 0.9601, + "step": 41215 + }, + { + "epoch": 12.33, + "grad_norm": 1.7550181150436401, + "learning_rate": 1.6057258431707585e-05, + "loss": 1.1271, + "step": 41220 + }, + { + "epoch": 12.33, + "grad_norm": 1.6419475078582764, + "learning_rate": 1.6051772213958575e-05, + "loss": 1.1171, + "step": 41225 + }, + { + "epoch": 12.34, + "grad_norm": 3.1746506690979004, + "learning_rate": 1.6046286490410895e-05, + "loss": 0.9319, + "step": 41230 + }, + { + "epoch": 12.34, + "grad_norm": 4.4260993003845215, + "learning_rate": 1.6040801261367493e-05, + "loss": 0.9774, + "step": 41235 + }, + { + "epoch": 12.34, + "grad_norm": 1.5141162872314453, + "learning_rate": 1.603531652713134e-05, + "loss": 1.0768, + "step": 41240 + }, + { + "epoch": 12.34, + "grad_norm": 1.775514841079712, + "learning_rate": 1.602983228800532e-05, + "loss": 0.874, + "step": 41245 + }, + { + "epoch": 12.34, + "grad_norm": 5.238076210021973, + "learning_rate": 1.6024348544292357e-05, + "loss": 0.9991, + "step": 41250 + }, + { + "epoch": 12.34, + "grad_norm": 2.956698417663574, + "learning_rate": 1.6018865296295284e-05, + "loss": 1.137, + "step": 41255 + }, + { + "epoch": 12.34, + "grad_norm": 3.052151918411255, + "learning_rate": 1.6013382544316947e-05, + "loss": 1.0055, + "step": 41260 + }, + { + "epoch": 12.35, + "grad_norm": 7.823723316192627, + "learning_rate": 1.6007900288660148e-05, + "loss": 0.9589, + "step": 41265 + }, + { + "epoch": 12.35, + "grad_norm": 2.2807137966156006, + "learning_rate": 1.6002418529627673e-05, + "loss": 0.9707, + "step": 41270 + }, + { + "epoch": 12.35, + "grad_norm": 5.3980207443237305, + "learning_rate": 1.5996937267522265e-05, + "loss": 1.0007, + "step": 41275 + }, + { + "epoch": 12.35, + "grad_norm": 4.970637321472168, + "learning_rate": 1.5991456502646658e-05, + "loss": 0.8831, + "step": 41280 + }, + { + "epoch": 12.35, + "grad_norm": 2.0372869968414307, + "learning_rate": 1.5985976235303547e-05, + "loss": 1.0731, + "step": 41285 + }, + { + "epoch": 12.35, + "grad_norm": 1.5453438758850098, + "learning_rate": 1.59804964657956e-05, + "loss": 1.1497, + "step": 41290 + }, + { + "epoch": 12.35, + "grad_norm": 3.3459970951080322, + "learning_rate": 1.5975017194425448e-05, + "loss": 0.9553, + "step": 41295 + }, + { + "epoch": 12.36, + "grad_norm": 1.7910807132720947, + "learning_rate": 1.5969538421495728e-05, + "loss": 0.9187, + "step": 41300 + }, + { + "epoch": 12.36, + "grad_norm": 2.4004948139190674, + "learning_rate": 1.5964060147309e-05, + "loss": 0.8181, + "step": 41305 + }, + { + "epoch": 12.36, + "grad_norm": 3.5690810680389404, + "learning_rate": 1.5958582372167853e-05, + "loss": 1.1377, + "step": 41310 + }, + { + "epoch": 12.36, + "grad_norm": 7.011974334716797, + "learning_rate": 1.5953105096374797e-05, + "loss": 0.9167, + "step": 41315 + }, + { + "epoch": 12.36, + "grad_norm": 2.1069743633270264, + "learning_rate": 1.594762832023234e-05, + "loss": 1.0594, + "step": 41320 + }, + { + "epoch": 12.36, + "grad_norm": 1.5728492736816406, + "learning_rate": 1.594215204404297e-05, + "loss": 1.1337, + "step": 41325 + }, + { + "epoch": 12.37, + "grad_norm": 3.168928384780884, + "learning_rate": 1.5936676268109113e-05, + "loss": 0.933, + "step": 41330 + }, + { + "epoch": 12.37, + "grad_norm": 1.6599355936050415, + "learning_rate": 1.5931200992733217e-05, + "loss": 1.1395, + "step": 41335 + }, + { + "epoch": 12.37, + "grad_norm": 3.265467643737793, + "learning_rate": 1.5925726218217653e-05, + "loss": 1.0227, + "step": 41340 + }, + { + "epoch": 12.37, + "grad_norm": 3.9105184078216553, + "learning_rate": 1.5920251944864812e-05, + "loss": 1.0851, + "step": 41345 + }, + { + "epoch": 12.37, + "grad_norm": 7.27264928817749, + "learning_rate": 1.5914778172977008e-05, + "loss": 1.0477, + "step": 41350 + }, + { + "epoch": 12.37, + "grad_norm": 3.6267623901367188, + "learning_rate": 1.5909304902856563e-05, + "loss": 1.0303, + "step": 41355 + }, + { + "epoch": 12.37, + "grad_norm": 3.7175920009613037, + "learning_rate": 1.590383213480576e-05, + "loss": 1.0979, + "step": 41360 + }, + { + "epoch": 12.38, + "grad_norm": 1.885209321975708, + "learning_rate": 1.589835986912685e-05, + "loss": 1.1829, + "step": 41365 + }, + { + "epoch": 12.38, + "grad_norm": 4.020153045654297, + "learning_rate": 1.5892888106122073e-05, + "loss": 1.1045, + "step": 41370 + }, + { + "epoch": 12.38, + "grad_norm": 1.572926640510559, + "learning_rate": 1.5887416846093605e-05, + "loss": 0.992, + "step": 41375 + }, + { + "epoch": 12.38, + "grad_norm": 3.5849032402038574, + "learning_rate": 1.5881946089343646e-05, + "loss": 1.1051, + "step": 41380 + }, + { + "epoch": 12.38, + "grad_norm": 2.3222670555114746, + "learning_rate": 1.5876475836174328e-05, + "loss": 0.9915, + "step": 41385 + }, + { + "epoch": 12.38, + "grad_norm": 2.578094482421875, + "learning_rate": 1.5871006086887756e-05, + "loss": 0.9607, + "step": 41390 + }, + { + "epoch": 12.38, + "grad_norm": 4.564905166625977, + "learning_rate": 1.586553684178604e-05, + "loss": 1.0403, + "step": 41395 + }, + { + "epoch": 12.39, + "grad_norm": 4.259903907775879, + "learning_rate": 1.5860068101171214e-05, + "loss": 0.9894, + "step": 41400 + }, + { + "epoch": 12.39, + "grad_norm": 3.7057387828826904, + "learning_rate": 1.5854599865345342e-05, + "loss": 1.1087, + "step": 41405 + }, + { + "epoch": 12.39, + "grad_norm": 2.632147789001465, + "learning_rate": 1.58491321346104e-05, + "loss": 1.029, + "step": 41410 + }, + { + "epoch": 12.39, + "grad_norm": 1.1085238456726074, + "learning_rate": 1.5843664909268392e-05, + "loss": 1.0203, + "step": 41415 + }, + { + "epoch": 12.39, + "grad_norm": 3.5292742252349854, + "learning_rate": 1.583819818962125e-05, + "loss": 0.9154, + "step": 41420 + }, + { + "epoch": 12.39, + "grad_norm": 4.0045552253723145, + "learning_rate": 1.5832731975970887e-05, + "loss": 1.0444, + "step": 41425 + }, + { + "epoch": 12.4, + "grad_norm": 1.9779725074768066, + "learning_rate": 1.5827266268619224e-05, + "loss": 1.0704, + "step": 41430 + }, + { + "epoch": 12.4, + "grad_norm": 4.693383693695068, + "learning_rate": 1.5821801067868096e-05, + "loss": 0.9821, + "step": 41435 + }, + { + "epoch": 12.4, + "grad_norm": 1.2160636186599731, + "learning_rate": 1.5816336374019363e-05, + "loss": 0.8789, + "step": 41440 + }, + { + "epoch": 12.4, + "grad_norm": 1.0221775770187378, + "learning_rate": 1.581087218737483e-05, + "loss": 1.0346, + "step": 41445 + }, + { + "epoch": 12.4, + "grad_norm": 3.203967809677124, + "learning_rate": 1.5805408508236263e-05, + "loss": 0.8631, + "step": 41450 + }, + { + "epoch": 12.4, + "grad_norm": 10.920624732971191, + "learning_rate": 1.5799945336905438e-05, + "loss": 0.9483, + "step": 41455 + }, + { + "epoch": 12.4, + "grad_norm": 1.881516456604004, + "learning_rate": 1.5794482673684056e-05, + "loss": 1.0587, + "step": 41460 + }, + { + "epoch": 12.41, + "grad_norm": 3.7939016819000244, + "learning_rate": 1.5789020518873842e-05, + "loss": 1.1071, + "step": 41465 + }, + { + "epoch": 12.41, + "grad_norm": 2.4270174503326416, + "learning_rate": 1.5783558872776438e-05, + "loss": 1.1607, + "step": 41470 + }, + { + "epoch": 12.41, + "grad_norm": 3.0483105182647705, + "learning_rate": 1.5778097735693508e-05, + "loss": 0.9586, + "step": 41475 + }, + { + "epoch": 12.41, + "grad_norm": 3.0462000370025635, + "learning_rate": 1.5772637107926658e-05, + "loss": 1.072, + "step": 41480 + }, + { + "epoch": 12.41, + "grad_norm": 1.6828171014785767, + "learning_rate": 1.5767176989777455e-05, + "loss": 1.138, + "step": 41485 + }, + { + "epoch": 12.41, + "grad_norm": 3.1212778091430664, + "learning_rate": 1.576171738154748e-05, + "loss": 0.9104, + "step": 41490 + }, + { + "epoch": 12.41, + "grad_norm": 3.6774206161499023, + "learning_rate": 1.5756258283538243e-05, + "loss": 0.9859, + "step": 41495 + }, + { + "epoch": 12.42, + "grad_norm": 5.639192581176758, + "learning_rate": 1.575079969605126e-05, + "loss": 1.0641, + "step": 41500 + }, + { + "epoch": 12.42, + "grad_norm": 2.7606070041656494, + "learning_rate": 1.5745341619387986e-05, + "loss": 0.9769, + "step": 41505 + }, + { + "epoch": 12.42, + "grad_norm": 2.2884132862091064, + "learning_rate": 1.573988405384989e-05, + "loss": 1.0192, + "step": 41510 + }, + { + "epoch": 12.42, + "grad_norm": 2.4269514083862305, + "learning_rate": 1.573442699973837e-05, + "loss": 1.0783, + "step": 41515 + }, + { + "epoch": 12.42, + "grad_norm": 4.481752872467041, + "learning_rate": 1.5728970457354802e-05, + "loss": 1.0585, + "step": 41520 + }, + { + "epoch": 12.42, + "grad_norm": 2.6316633224487305, + "learning_rate": 1.572351442700057e-05, + "loss": 1.1228, + "step": 41525 + }, + { + "epoch": 12.43, + "grad_norm": 3.698345184326172, + "learning_rate": 1.5718058908976988e-05, + "loss": 1.0415, + "step": 41530 + }, + { + "epoch": 12.43, + "grad_norm": 3.938213348388672, + "learning_rate": 1.5712603903585367e-05, + "loss": 0.9013, + "step": 41535 + }, + { + "epoch": 12.43, + "grad_norm": 2.3214263916015625, + "learning_rate": 1.5707149411126975e-05, + "loss": 0.9264, + "step": 41540 + }, + { + "epoch": 12.43, + "grad_norm": 2.7372500896453857, + "learning_rate": 1.5701695431903068e-05, + "loss": 0.8975, + "step": 41545 + }, + { + "epoch": 12.43, + "grad_norm": 1.4826568365097046, + "learning_rate": 1.569624196621486e-05, + "loss": 1.1417, + "step": 41550 + }, + { + "epoch": 12.43, + "grad_norm": 2.281169891357422, + "learning_rate": 1.5690789014363526e-05, + "loss": 0.9454, + "step": 41555 + }, + { + "epoch": 12.43, + "grad_norm": 1.4119058847427368, + "learning_rate": 1.568533657665025e-05, + "loss": 1.0374, + "step": 41560 + }, + { + "epoch": 12.44, + "grad_norm": 14.846397399902344, + "learning_rate": 1.5679884653376138e-05, + "loss": 0.9577, + "step": 41565 + }, + { + "epoch": 12.44, + "grad_norm": 5.682839393615723, + "learning_rate": 1.5674433244842324e-05, + "loss": 0.8656, + "step": 41570 + }, + { + "epoch": 12.44, + "grad_norm": 6.977272033691406, + "learning_rate": 1.566898235134987e-05, + "loss": 1.0464, + "step": 41575 + }, + { + "epoch": 12.44, + "grad_norm": 3.747130870819092, + "learning_rate": 1.5663531973199807e-05, + "loss": 0.8694, + "step": 41580 + }, + { + "epoch": 12.44, + "grad_norm": 3.2453455924987793, + "learning_rate": 1.565808211069318e-05, + "loss": 1.191, + "step": 41585 + }, + { + "epoch": 12.44, + "grad_norm": 1.451064944267273, + "learning_rate": 1.565263276413096e-05, + "loss": 1.0963, + "step": 41590 + }, + { + "epoch": 12.44, + "grad_norm": 1.9707701206207275, + "learning_rate": 1.5647183933814124e-05, + "loss": 0.9704, + "step": 41595 + }, + { + "epoch": 12.45, + "grad_norm": 2.4262611865997314, + "learning_rate": 1.5641735620043586e-05, + "loss": 0.9327, + "step": 41600 + }, + { + "epoch": 12.45, + "grad_norm": 2.2412772178649902, + "learning_rate": 1.5636287823120278e-05, + "loss": 1.0346, + "step": 41605 + }, + { + "epoch": 12.45, + "grad_norm": 3.1475157737731934, + "learning_rate": 1.5630840543345048e-05, + "loss": 0.9991, + "step": 41610 + }, + { + "epoch": 12.45, + "grad_norm": 5.283227920532227, + "learning_rate": 1.562539378101876e-05, + "loss": 0.9999, + "step": 41615 + }, + { + "epoch": 12.45, + "grad_norm": 3.1468896865844727, + "learning_rate": 1.561994753644223e-05, + "loss": 1.0675, + "step": 41620 + }, + { + "epoch": 12.45, + "grad_norm": 6.32867431640625, + "learning_rate": 1.5614501809916245e-05, + "loss": 1.0708, + "step": 41625 + }, + { + "epoch": 12.46, + "grad_norm": 3.040778875350952, + "learning_rate": 1.5609056601741573e-05, + "loss": 1.26, + "step": 41630 + }, + { + "epoch": 12.46, + "grad_norm": 3.2033233642578125, + "learning_rate": 1.560361191221894e-05, + "loss": 1.1999, + "step": 41635 + }, + { + "epoch": 12.46, + "grad_norm": 1.406636118888855, + "learning_rate": 1.5598167741649054e-05, + "loss": 0.8328, + "step": 41640 + }, + { + "epoch": 12.46, + "grad_norm": 2.998223066329956, + "learning_rate": 1.55927240903326e-05, + "loss": 1.0981, + "step": 41645 + }, + { + "epoch": 12.46, + "grad_norm": 4.719427108764648, + "learning_rate": 1.5587280958570206e-05, + "loss": 0.8436, + "step": 41650 + }, + { + "epoch": 12.46, + "grad_norm": 5.524830341339111, + "learning_rate": 1.5581838346662506e-05, + "loss": 0.8711, + "step": 41655 + }, + { + "epoch": 12.46, + "grad_norm": 2.657658576965332, + "learning_rate": 1.5576396254910074e-05, + "loss": 0.9805, + "step": 41660 + }, + { + "epoch": 12.47, + "grad_norm": 3.9892799854278564, + "learning_rate": 1.5570954683613496e-05, + "loss": 1.0028, + "step": 41665 + }, + { + "epoch": 12.47, + "grad_norm": 3.137242555618286, + "learning_rate": 1.556551363307329e-05, + "loss": 1.0821, + "step": 41670 + }, + { + "epoch": 12.47, + "grad_norm": 3.95418119430542, + "learning_rate": 1.5560073103589947e-05, + "loss": 1.0106, + "step": 41675 + }, + { + "epoch": 12.47, + "grad_norm": 1.6314005851745605, + "learning_rate": 1.5554633095463966e-05, + "loss": 1.0254, + "step": 41680 + }, + { + "epoch": 12.47, + "grad_norm": 1.4221240282058716, + "learning_rate": 1.5549193608995772e-05, + "loss": 1.0632, + "step": 41685 + }, + { + "epoch": 12.47, + "grad_norm": 3.80000901222229, + "learning_rate": 1.5543754644485797e-05, + "loss": 1.1136, + "step": 41690 + }, + { + "epoch": 12.47, + "grad_norm": 1.8849966526031494, + "learning_rate": 1.5538316202234416e-05, + "loss": 1.1025, + "step": 41695 + }, + { + "epoch": 12.48, + "grad_norm": 1.476726770401001, + "learning_rate": 1.5532878282542007e-05, + "loss": 1.0641, + "step": 41700 + }, + { + "epoch": 12.48, + "grad_norm": 2.6277527809143066, + "learning_rate": 1.5527440885708884e-05, + "loss": 1.0265, + "step": 41705 + }, + { + "epoch": 12.48, + "grad_norm": 1.5605323314666748, + "learning_rate": 1.5522004012035358e-05, + "loss": 0.9976, + "step": 41710 + }, + { + "epoch": 12.48, + "grad_norm": 2.3000760078430176, + "learning_rate": 1.551656766182169e-05, + "loss": 0.9773, + "step": 41715 + }, + { + "epoch": 12.48, + "grad_norm": 1.9093542098999023, + "learning_rate": 1.551113183536814e-05, + "loss": 1.0161, + "step": 41720 + }, + { + "epoch": 12.48, + "grad_norm": 3.195117712020874, + "learning_rate": 1.5505696532974918e-05, + "loss": 1.1003, + "step": 41725 + }, + { + "epoch": 12.49, + "grad_norm": 1.47775399684906, + "learning_rate": 1.55002617549422e-05, + "loss": 1.0035, + "step": 41730 + }, + { + "epoch": 12.49, + "grad_norm": 4.95327615737915, + "learning_rate": 1.549482750157016e-05, + "loss": 1.1078, + "step": 41735 + }, + { + "epoch": 12.49, + "grad_norm": 1.3294042348861694, + "learning_rate": 1.548939377315891e-05, + "loss": 1.0844, + "step": 41740 + }, + { + "epoch": 12.49, + "grad_norm": 1.624887228012085, + "learning_rate": 1.5483960570008555e-05, + "loss": 1.076, + "step": 41745 + }, + { + "epoch": 12.49, + "grad_norm": 1.299483060836792, + "learning_rate": 1.5478527892419176e-05, + "loss": 1.0632, + "step": 41750 + }, + { + "epoch": 12.49, + "grad_norm": 1.710900068283081, + "learning_rate": 1.5473095740690792e-05, + "loss": 1.0242, + "step": 41755 + }, + { + "epoch": 12.49, + "grad_norm": 3.3313851356506348, + "learning_rate": 1.5467664115123435e-05, + "loss": 0.9548, + "step": 41760 + }, + { + "epoch": 12.5, + "grad_norm": 2.4123222827911377, + "learning_rate": 1.5462233016017074e-05, + "loss": 1.0284, + "step": 41765 + }, + { + "epoch": 12.5, + "grad_norm": 2.309690237045288, + "learning_rate": 1.545680244367168e-05, + "loss": 1.0553, + "step": 41770 + }, + { + "epoch": 12.5, + "grad_norm": 1.3467509746551514, + "learning_rate": 1.545137239838717e-05, + "loss": 1.1693, + "step": 41775 + }, + { + "epoch": 12.5, + "grad_norm": 1.2531408071517944, + "learning_rate": 1.5445942880463422e-05, + "loss": 1.0461, + "step": 41780 + }, + { + "epoch": 12.5, + "grad_norm": 3.7026236057281494, + "learning_rate": 1.5440513890200333e-05, + "loss": 1.0725, + "step": 41785 + }, + { + "epoch": 12.5, + "grad_norm": 1.7098792791366577, + "learning_rate": 1.543508542789771e-05, + "loss": 1.1343, + "step": 41790 + }, + { + "epoch": 12.5, + "grad_norm": 3.3441429138183594, + "learning_rate": 1.542965749385539e-05, + "loss": 1.07, + "step": 41795 + }, + { + "epoch": 12.51, + "grad_norm": 4.822731018066406, + "learning_rate": 1.5424230088373132e-05, + "loss": 0.8686, + "step": 41800 + }, + { + "epoch": 12.51, + "grad_norm": 3.751056671142578, + "learning_rate": 1.541880321175069e-05, + "loss": 1.0426, + "step": 41805 + }, + { + "epoch": 12.51, + "grad_norm": 1.4619433879852295, + "learning_rate": 1.5413376864287793e-05, + "loss": 0.9761, + "step": 41810 + }, + { + "epoch": 12.51, + "grad_norm": 4.5311737060546875, + "learning_rate": 1.5407951046284118e-05, + "loss": 0.9352, + "step": 41815 + }, + { + "epoch": 12.51, + "grad_norm": 1.865403175354004, + "learning_rate": 1.5402525758039348e-05, + "loss": 0.9368, + "step": 41820 + }, + { + "epoch": 12.51, + "grad_norm": 1.6462074518203735, + "learning_rate": 1.53971009998531e-05, + "loss": 0.9772, + "step": 41825 + }, + { + "epoch": 12.52, + "grad_norm": 1.4321706295013428, + "learning_rate": 1.5391676772024983e-05, + "loss": 0.9295, + "step": 41830 + }, + { + "epoch": 12.52, + "grad_norm": 3.5952343940734863, + "learning_rate": 1.5386253074854572e-05, + "loss": 1.0067, + "step": 41835 + }, + { + "epoch": 12.52, + "grad_norm": 3.0504446029663086, + "learning_rate": 1.5380829908641407e-05, + "loss": 0.9688, + "step": 41840 + }, + { + "epoch": 12.52, + "grad_norm": 2.3888778686523438, + "learning_rate": 1.537540727368501e-05, + "loss": 1.1385, + "step": 41845 + }, + { + "epoch": 12.52, + "grad_norm": 3.4064996242523193, + "learning_rate": 1.5369985170284864e-05, + "loss": 1.0932, + "step": 41850 + }, + { + "epoch": 12.52, + "grad_norm": 2.349851131439209, + "learning_rate": 1.536456359874043e-05, + "loss": 1.0279, + "step": 41855 + }, + { + "epoch": 12.52, + "grad_norm": 2.4067983627319336, + "learning_rate": 1.5359142559351124e-05, + "loss": 0.8575, + "step": 41860 + }, + { + "epoch": 12.53, + "grad_norm": 7.212216854095459, + "learning_rate": 1.5353722052416362e-05, + "loss": 0.9648, + "step": 41865 + }, + { + "epoch": 12.53, + "grad_norm": 2.0969083309173584, + "learning_rate": 1.534830207823551e-05, + "loss": 1.1525, + "step": 41870 + }, + { + "epoch": 12.53, + "grad_norm": 1.6566853523254395, + "learning_rate": 1.5342882637107885e-05, + "loss": 1.058, + "step": 41875 + }, + { + "epoch": 12.53, + "grad_norm": 4.8544921875, + "learning_rate": 1.533746372933283e-05, + "loss": 0.9298, + "step": 41880 + }, + { + "epoch": 12.53, + "grad_norm": 3.7266921997070312, + "learning_rate": 1.5332045355209597e-05, + "loss": 0.9846, + "step": 41885 + }, + { + "epoch": 12.53, + "grad_norm": 3.181262254714966, + "learning_rate": 1.5326627515037452e-05, + "loss": 0.8741, + "step": 41890 + }, + { + "epoch": 12.53, + "grad_norm": 2.2438769340515137, + "learning_rate": 1.532121020911562e-05, + "loss": 0.966, + "step": 41895 + }, + { + "epoch": 12.54, + "grad_norm": 4.2861409187316895, + "learning_rate": 1.5315793437743284e-05, + "loss": 1.0051, + "step": 41900 + }, + { + "epoch": 12.54, + "grad_norm": 6.4033026695251465, + "learning_rate": 1.5310377201219618e-05, + "loss": 0.8874, + "step": 41905 + }, + { + "epoch": 12.54, + "grad_norm": 3.8851475715637207, + "learning_rate": 1.5304961499843734e-05, + "loss": 0.9808, + "step": 41910 + }, + { + "epoch": 12.54, + "grad_norm": 2.0215234756469727, + "learning_rate": 1.529954633391476e-05, + "loss": 1.1129, + "step": 41915 + }, + { + "epoch": 12.54, + "grad_norm": 5.1186699867248535, + "learning_rate": 1.529413170373175e-05, + "loss": 0.908, + "step": 41920 + }, + { + "epoch": 12.54, + "grad_norm": 2.9314262866973877, + "learning_rate": 1.5288717609593764e-05, + "loss": 1.0398, + "step": 41925 + }, + { + "epoch": 12.54, + "grad_norm": 4.335308074951172, + "learning_rate": 1.5283304051799813e-05, + "loss": 1.0499, + "step": 41930 + }, + { + "epoch": 12.55, + "grad_norm": 2.7027440071105957, + "learning_rate": 1.5277891030648868e-05, + "loss": 1.075, + "step": 41935 + }, + { + "epoch": 12.55, + "grad_norm": 1.977004051208496, + "learning_rate": 1.5272478546439907e-05, + "loss": 1.1353, + "step": 41940 + }, + { + "epoch": 12.55, + "grad_norm": 4.210170745849609, + "learning_rate": 1.5267066599471836e-05, + "loss": 1.1134, + "step": 41945 + }, + { + "epoch": 12.55, + "grad_norm": 3.0245361328125, + "learning_rate": 1.5261655190043568e-05, + "loss": 0.8765, + "step": 41950 + }, + { + "epoch": 12.55, + "grad_norm": 2.2794244289398193, + "learning_rate": 1.525624431845395e-05, + "loss": 0.9708, + "step": 41955 + }, + { + "epoch": 12.55, + "grad_norm": 4.885735034942627, + "learning_rate": 1.5250833985001845e-05, + "loss": 1.0793, + "step": 41960 + }, + { + "epoch": 12.56, + "grad_norm": 4.991076946258545, + "learning_rate": 1.5245424189986035e-05, + "loss": 1.1405, + "step": 41965 + }, + { + "epoch": 12.56, + "grad_norm": 2.991661787033081, + "learning_rate": 1.524001493370531e-05, + "loss": 0.8436, + "step": 41970 + }, + { + "epoch": 12.56, + "grad_norm": 4.58644962310791, + "learning_rate": 1.5234606216458414e-05, + "loss": 0.955, + "step": 41975 + }, + { + "epoch": 12.56, + "grad_norm": 4.122666358947754, + "learning_rate": 1.5229198038544068e-05, + "loss": 0.9454, + "step": 41980 + }, + { + "epoch": 12.56, + "grad_norm": 2.8381059169769287, + "learning_rate": 1.5223790400260956e-05, + "loss": 1.2621, + "step": 41985 + }, + { + "epoch": 12.56, + "grad_norm": 2.797647476196289, + "learning_rate": 1.521838330190774e-05, + "loss": 1.1286, + "step": 41990 + }, + { + "epoch": 12.56, + "grad_norm": 2.7422313690185547, + "learning_rate": 1.5212976743783047e-05, + "loss": 1.0241, + "step": 41995 + }, + { + "epoch": 12.57, + "grad_norm": 1.3212940692901611, + "learning_rate": 1.520757072618548e-05, + "loss": 1.0562, + "step": 42000 + }, + { + "epoch": 12.57, + "grad_norm": 1.8071925640106201, + "learning_rate": 1.5202165249413592e-05, + "loss": 1.0923, + "step": 42005 + }, + { + "epoch": 12.57, + "grad_norm": 2.4288082122802734, + "learning_rate": 1.5196760313765946e-05, + "loss": 1.1725, + "step": 42010 + }, + { + "epoch": 12.57, + "grad_norm": 4.023059368133545, + "learning_rate": 1.5191355919541026e-05, + "loss": 1.1285, + "step": 42015 + }, + { + "epoch": 12.57, + "grad_norm": 3.815493583679199, + "learning_rate": 1.5185952067037335e-05, + "loss": 0.9849, + "step": 42020 + }, + { + "epoch": 12.57, + "grad_norm": 3.8521835803985596, + "learning_rate": 1.5180548756553306e-05, + "loss": 1.0811, + "step": 42025 + }, + { + "epoch": 12.57, + "grad_norm": 1.4135304689407349, + "learning_rate": 1.5175145988387353e-05, + "loss": 1.0513, + "step": 42030 + }, + { + "epoch": 12.58, + "grad_norm": 1.7829787731170654, + "learning_rate": 1.5169743762837887e-05, + "loss": 1.1091, + "step": 42035 + }, + { + "epoch": 12.58, + "grad_norm": 1.822656512260437, + "learning_rate": 1.5164342080203246e-05, + "loss": 1.2833, + "step": 42040 + }, + { + "epoch": 12.58, + "grad_norm": 4.21824312210083, + "learning_rate": 1.5158940940781777e-05, + "loss": 0.922, + "step": 42045 + }, + { + "epoch": 12.58, + "grad_norm": 3.6083362102508545, + "learning_rate": 1.5153540344871757e-05, + "loss": 1.0563, + "step": 42050 + }, + { + "epoch": 12.58, + "grad_norm": 2.541013479232788, + "learning_rate": 1.5148140292771484e-05, + "loss": 0.8514, + "step": 42055 + }, + { + "epoch": 12.58, + "grad_norm": 2.3995792865753174, + "learning_rate": 1.5142740784779174e-05, + "loss": 1.0454, + "step": 42060 + }, + { + "epoch": 12.59, + "grad_norm": 2.6729390621185303, + "learning_rate": 1.5137341821193045e-05, + "loss": 0.9045, + "step": 42065 + }, + { + "epoch": 12.59, + "grad_norm": 3.4425437450408936, + "learning_rate": 1.5131943402311279e-05, + "loss": 1.0334, + "step": 42070 + }, + { + "epoch": 12.59, + "grad_norm": 2.506922483444214, + "learning_rate": 1.5126545528432017e-05, + "loss": 1.0794, + "step": 42075 + }, + { + "epoch": 12.59, + "grad_norm": 1.7686995267868042, + "learning_rate": 1.5121148199853385e-05, + "loss": 1.046, + "step": 42080 + }, + { + "epoch": 12.59, + "grad_norm": 1.438901424407959, + "learning_rate": 1.5115751416873472e-05, + "loss": 1.118, + "step": 42085 + }, + { + "epoch": 12.59, + "grad_norm": 1.8655344247817993, + "learning_rate": 1.5110355179790331e-05, + "loss": 0.9908, + "step": 42090 + }, + { + "epoch": 12.59, + "grad_norm": 3.9504570960998535, + "learning_rate": 1.5104959488902e-05, + "loss": 0.9724, + "step": 42095 + }, + { + "epoch": 12.6, + "grad_norm": 2.1039040088653564, + "learning_rate": 1.5099564344506458e-05, + "loss": 1.0601, + "step": 42100 + }, + { + "epoch": 12.6, + "grad_norm": 3.9319233894348145, + "learning_rate": 1.5094169746901698e-05, + "loss": 0.9059, + "step": 42105 + }, + { + "epoch": 12.6, + "grad_norm": 1.384982943534851, + "learning_rate": 1.5088775696385637e-05, + "loss": 1.0329, + "step": 42110 + }, + { + "epoch": 12.6, + "grad_norm": 3.824261426925659, + "learning_rate": 1.50833821932562e-05, + "loss": 0.9907, + "step": 42115 + }, + { + "epoch": 12.6, + "grad_norm": 2.2953989505767822, + "learning_rate": 1.5077989237811253e-05, + "loss": 0.8469, + "step": 42120 + }, + { + "epoch": 12.6, + "grad_norm": 6.605740070343018, + "learning_rate": 1.507259683034865e-05, + "loss": 1.0258, + "step": 42125 + }, + { + "epoch": 12.6, + "grad_norm": 4.182356357574463, + "learning_rate": 1.506720497116621e-05, + "loss": 0.8448, + "step": 42130 + }, + { + "epoch": 12.61, + "grad_norm": 3.7092835903167725, + "learning_rate": 1.5061813660561703e-05, + "loss": 0.906, + "step": 42135 + }, + { + "epoch": 12.61, + "grad_norm": 3.1926708221435547, + "learning_rate": 1.5056422898832906e-05, + "loss": 0.8871, + "step": 42140 + }, + { + "epoch": 12.61, + "grad_norm": 3.0576462745666504, + "learning_rate": 1.5051032686277527e-05, + "loss": 1.0512, + "step": 42145 + }, + { + "epoch": 12.61, + "grad_norm": 5.683520793914795, + "learning_rate": 1.5045643023193281e-05, + "loss": 1.0064, + "step": 42150 + }, + { + "epoch": 12.61, + "grad_norm": 1.615805983543396, + "learning_rate": 1.5040253909877822e-05, + "loss": 0.8095, + "step": 42155 + }, + { + "epoch": 12.61, + "grad_norm": 1.38075852394104, + "learning_rate": 1.5034865346628782e-05, + "loss": 1.1425, + "step": 42160 + }, + { + "epoch": 12.62, + "grad_norm": 6.462873935699463, + "learning_rate": 1.5029477333743774e-05, + "loss": 1.0516, + "step": 42165 + }, + { + "epoch": 12.62, + "grad_norm": 2.051048755645752, + "learning_rate": 1.5024089871520369e-05, + "loss": 0.8301, + "step": 42170 + }, + { + "epoch": 12.62, + "grad_norm": 2.33416485786438, + "learning_rate": 1.5018702960256108e-05, + "loss": 1.1115, + "step": 42175 + }, + { + "epoch": 12.62, + "grad_norm": 1.324583888053894, + "learning_rate": 1.5013316600248507e-05, + "loss": 1.1135, + "step": 42180 + }, + { + "epoch": 12.62, + "grad_norm": 1.6215559244155884, + "learning_rate": 1.5007930791795055e-05, + "loss": 0.9414, + "step": 42185 + }, + { + "epoch": 12.62, + "grad_norm": 1.7824453115463257, + "learning_rate": 1.5002545535193203e-05, + "loss": 1.1032, + "step": 42190 + }, + { + "epoch": 12.62, + "grad_norm": 4.399259090423584, + "learning_rate": 1.4997160830740358e-05, + "loss": 0.9014, + "step": 42195 + }, + { + "epoch": 12.63, + "grad_norm": 3.2295663356781006, + "learning_rate": 1.4991776678733935e-05, + "loss": 0.7915, + "step": 42200 + }, + { + "epoch": 12.63, + "grad_norm": 6.502789497375488, + "learning_rate": 1.4986393079471269e-05, + "loss": 0.9355, + "step": 42205 + }, + { + "epoch": 12.63, + "grad_norm": 2.078927516937256, + "learning_rate": 1.4981010033249718e-05, + "loss": 0.9986, + "step": 42210 + }, + { + "epoch": 12.63, + "grad_norm": 3.382509231567383, + "learning_rate": 1.4975627540366558e-05, + "loss": 1.0191, + "step": 42215 + }, + { + "epoch": 12.63, + "grad_norm": 2.477400302886963, + "learning_rate": 1.4970245601119077e-05, + "loss": 0.9213, + "step": 42220 + }, + { + "epoch": 12.63, + "grad_norm": 2.6311304569244385, + "learning_rate": 1.4964864215804508e-05, + "loss": 1.083, + "step": 42225 + }, + { + "epoch": 12.63, + "grad_norm": 5.062621116638184, + "learning_rate": 1.4959483384720047e-05, + "loss": 0.922, + "step": 42230 + }, + { + "epoch": 12.64, + "grad_norm": 0.9835122227668762, + "learning_rate": 1.4954103108162892e-05, + "loss": 0.9226, + "step": 42235 + }, + { + "epoch": 12.64, + "grad_norm": 1.5001591444015503, + "learning_rate": 1.4948723386430172e-05, + "loss": 0.9836, + "step": 42240 + }, + { + "epoch": 12.64, + "grad_norm": 3.599999189376831, + "learning_rate": 1.494334421981902e-05, + "loss": 0.9332, + "step": 42245 + }, + { + "epoch": 12.64, + "grad_norm": 2.7632405757904053, + "learning_rate": 1.4937965608626503e-05, + "loss": 0.869, + "step": 42250 + }, + { + "epoch": 12.64, + "grad_norm": 2.2371535301208496, + "learning_rate": 1.493258755314969e-05, + "loss": 0.9492, + "step": 42255 + }, + { + "epoch": 12.64, + "grad_norm": 3.195711851119995, + "learning_rate": 1.4927210053685611e-05, + "loss": 1.1248, + "step": 42260 + }, + { + "epoch": 12.65, + "grad_norm": 3.760953903198242, + "learning_rate": 1.4921833110531239e-05, + "loss": 1.0165, + "step": 42265 + }, + { + "epoch": 12.65, + "grad_norm": 2.693413019180298, + "learning_rate": 1.4916456723983558e-05, + "loss": 0.8719, + "step": 42270 + }, + { + "epoch": 12.65, + "grad_norm": 1.946767807006836, + "learning_rate": 1.4911080894339482e-05, + "loss": 0.9631, + "step": 42275 + }, + { + "epoch": 12.65, + "grad_norm": 3.6406733989715576, + "learning_rate": 1.4905705621895932e-05, + "loss": 0.937, + "step": 42280 + }, + { + "epoch": 12.65, + "grad_norm": 2.393702745437622, + "learning_rate": 1.4900330906949767e-05, + "loss": 1.0413, + "step": 42285 + }, + { + "epoch": 12.65, + "grad_norm": 2.1976375579833984, + "learning_rate": 1.4894956749797817e-05, + "loss": 0.9671, + "step": 42290 + }, + { + "epoch": 12.65, + "grad_norm": 1.965586543083191, + "learning_rate": 1.4889583150736913e-05, + "loss": 0.9733, + "step": 42295 + }, + { + "epoch": 12.66, + "grad_norm": 2.103189468383789, + "learning_rate": 1.4884210110063811e-05, + "loss": 1.0441, + "step": 42300 + }, + { + "epoch": 12.66, + "grad_norm": 3.943366050720215, + "learning_rate": 1.4878837628075281e-05, + "loss": 0.8933, + "step": 42305 + }, + { + "epoch": 12.66, + "grad_norm": 2.3624773025512695, + "learning_rate": 1.4873465705068018e-05, + "loss": 0.9805, + "step": 42310 + }, + { + "epoch": 12.66, + "grad_norm": 4.398625373840332, + "learning_rate": 1.4868094341338729e-05, + "loss": 1.0953, + "step": 42315 + }, + { + "epoch": 12.66, + "grad_norm": 2.5565319061279297, + "learning_rate": 1.4862723537184059e-05, + "loss": 0.8852, + "step": 42320 + }, + { + "epoch": 12.66, + "grad_norm": 3.775041341781616, + "learning_rate": 1.4857353292900616e-05, + "loss": 0.9722, + "step": 42325 + }, + { + "epoch": 12.66, + "grad_norm": 5.716676235198975, + "learning_rate": 1.485198360878502e-05, + "loss": 0.9083, + "step": 42330 + }, + { + "epoch": 12.67, + "grad_norm": 2.962364435195923, + "learning_rate": 1.4846614485133814e-05, + "loss": 1.1341, + "step": 42335 + }, + { + "epoch": 12.67, + "grad_norm": 2.1964805126190186, + "learning_rate": 1.4841245922243541e-05, + "loss": 0.9692, + "step": 42340 + }, + { + "epoch": 12.67, + "grad_norm": 1.980453372001648, + "learning_rate": 1.4835877920410695e-05, + "loss": 1.13, + "step": 42345 + }, + { + "epoch": 12.67, + "grad_norm": 16.400785446166992, + "learning_rate": 1.4830510479931747e-05, + "loss": 1.0367, + "step": 42350 + }, + { + "epoch": 12.67, + "grad_norm": 1.031053066253662, + "learning_rate": 1.4825143601103137e-05, + "loss": 0.9072, + "step": 42355 + }, + { + "epoch": 12.67, + "grad_norm": 8.164375305175781, + "learning_rate": 1.4819777284221264e-05, + "loss": 0.9322, + "step": 42360 + }, + { + "epoch": 12.68, + "grad_norm": 2.8158113956451416, + "learning_rate": 1.4814411529582517e-05, + "loss": 0.9522, + "step": 42365 + }, + { + "epoch": 12.68, + "grad_norm": 2.234344244003296, + "learning_rate": 1.4809046337483223e-05, + "loss": 0.9949, + "step": 42370 + }, + { + "epoch": 12.68, + "grad_norm": 7.7483720779418945, + "learning_rate": 1.4803681708219719e-05, + "loss": 0.9843, + "step": 42375 + }, + { + "epoch": 12.68, + "grad_norm": 2.1835453510284424, + "learning_rate": 1.4798317642088274e-05, + "loss": 0.9065, + "step": 42380 + }, + { + "epoch": 12.68, + "grad_norm": 1.4759970903396606, + "learning_rate": 1.4792954139385134e-05, + "loss": 1.0872, + "step": 42385 + }, + { + "epoch": 12.68, + "grad_norm": 2.038754940032959, + "learning_rate": 1.4787591200406536e-05, + "loss": 1.1076, + "step": 42390 + }, + { + "epoch": 12.68, + "grad_norm": 2.0084173679351807, + "learning_rate": 1.4782228825448653e-05, + "loss": 1.0353, + "step": 42395 + }, + { + "epoch": 12.69, + "grad_norm": 3.7430591583251953, + "learning_rate": 1.4776867014807654e-05, + "loss": 1.1075, + "step": 42400 + }, + { + "epoch": 12.69, + "grad_norm": 3.881999969482422, + "learning_rate": 1.4771505768779659e-05, + "loss": 0.9953, + "step": 42405 + }, + { + "epoch": 12.69, + "grad_norm": 2.164416790008545, + "learning_rate": 1.4766145087660779e-05, + "loss": 1.1684, + "step": 42410 + }, + { + "epoch": 12.69, + "grad_norm": 3.810238838195801, + "learning_rate": 1.476078497174706e-05, + "loss": 1.067, + "step": 42415 + }, + { + "epoch": 12.69, + "grad_norm": 2.1892411708831787, + "learning_rate": 1.4755425421334548e-05, + "loss": 1.0355, + "step": 42420 + }, + { + "epoch": 12.69, + "grad_norm": 1.6944376230239868, + "learning_rate": 1.475006643671924e-05, + "loss": 1.0403, + "step": 42425 + }, + { + "epoch": 12.69, + "grad_norm": 1.0041096210479736, + "learning_rate": 1.4744708018197107e-05, + "loss": 1.1216, + "step": 42430 + }, + { + "epoch": 12.7, + "grad_norm": 1.6599242687225342, + "learning_rate": 1.4739350166064092e-05, + "loss": 1.0584, + "step": 42435 + }, + { + "epoch": 12.7, + "grad_norm": 1.151971459388733, + "learning_rate": 1.4733992880616099e-05, + "loss": 1.0802, + "step": 42440 + }, + { + "epoch": 12.7, + "grad_norm": 3.420637845993042, + "learning_rate": 1.4728636162149011e-05, + "loss": 1.0013, + "step": 42445 + }, + { + "epoch": 12.7, + "grad_norm": 3.075683832168579, + "learning_rate": 1.4723280010958676e-05, + "loss": 1.0202, + "step": 42450 + }, + { + "epoch": 12.7, + "grad_norm": 4.555771350860596, + "learning_rate": 1.4717924427340896e-05, + "loss": 1.2088, + "step": 42455 + }, + { + "epoch": 12.7, + "grad_norm": 1.3353490829467773, + "learning_rate": 1.4712569411591468e-05, + "loss": 1.031, + "step": 42460 + }, + { + "epoch": 12.71, + "grad_norm": 2.8824167251586914, + "learning_rate": 1.4707214964006131e-05, + "loss": 1.1876, + "step": 42465 + }, + { + "epoch": 12.71, + "grad_norm": 4.97543478012085, + "learning_rate": 1.470186108488062e-05, + "loss": 1.1018, + "step": 42470 + }, + { + "epoch": 12.71, + "grad_norm": 1.2788058519363403, + "learning_rate": 1.469650777451061e-05, + "loss": 1.0444, + "step": 42475 + }, + { + "epoch": 12.71, + "grad_norm": 4.054439067840576, + "learning_rate": 1.4691155033191773e-05, + "loss": 0.7675, + "step": 42480 + }, + { + "epoch": 12.71, + "grad_norm": 4.729781627655029, + "learning_rate": 1.4685802861219732e-05, + "loss": 1.1607, + "step": 42485 + }, + { + "epoch": 12.71, + "grad_norm": 3.289395570755005, + "learning_rate": 1.4680451258890066e-05, + "loss": 1.0532, + "step": 42490 + }, + { + "epoch": 12.71, + "grad_norm": 3.1528773307800293, + "learning_rate": 1.467510022649836e-05, + "loss": 1.1017, + "step": 42495 + }, + { + "epoch": 12.72, + "grad_norm": 1.4691784381866455, + "learning_rate": 1.4669749764340124e-05, + "loss": 0.9924, + "step": 42500 + }, + { + "epoch": 12.72, + "grad_norm": 4.324456691741943, + "learning_rate": 1.4664399872710885e-05, + "loss": 1.0135, + "step": 42505 + }, + { + "epoch": 12.72, + "grad_norm": 4.2510294914245605, + "learning_rate": 1.4659050551906089e-05, + "loss": 1.0921, + "step": 42510 + }, + { + "epoch": 12.72, + "grad_norm": 10.014864921569824, + "learning_rate": 1.4653701802221187e-05, + "loss": 0.9695, + "step": 42515 + }, + { + "epoch": 12.72, + "grad_norm": 6.1459455490112305, + "learning_rate": 1.4648353623951577e-05, + "loss": 0.9502, + "step": 42520 + }, + { + "epoch": 12.72, + "grad_norm": 2.0565860271453857, + "learning_rate": 1.4643006017392636e-05, + "loss": 0.827, + "step": 42525 + }, + { + "epoch": 12.72, + "grad_norm": 2.4255897998809814, + "learning_rate": 1.4637658982839709e-05, + "loss": 0.9423, + "step": 42530 + }, + { + "epoch": 12.73, + "grad_norm": 2.0831713676452637, + "learning_rate": 1.4632312520588104e-05, + "loss": 0.8922, + "step": 42535 + }, + { + "epoch": 12.73, + "grad_norm": 2.779167890548706, + "learning_rate": 1.46269666309331e-05, + "loss": 0.9137, + "step": 42540 + }, + { + "epoch": 12.73, + "grad_norm": 2.2856433391571045, + "learning_rate": 1.4621621314169953e-05, + "loss": 0.7789, + "step": 42545 + }, + { + "epoch": 12.73, + "grad_norm": 1.7285444736480713, + "learning_rate": 1.4616276570593865e-05, + "loss": 1.0144, + "step": 42550 + }, + { + "epoch": 12.73, + "grad_norm": 4.265864372253418, + "learning_rate": 1.4610932400500037e-05, + "loss": 1.0229, + "step": 42555 + }, + { + "epoch": 12.73, + "grad_norm": 3.695251941680908, + "learning_rate": 1.4605588804183607e-05, + "loss": 0.9748, + "step": 42560 + }, + { + "epoch": 12.73, + "grad_norm": 6.5001749992370605, + "learning_rate": 1.4600245781939708e-05, + "loss": 1.1294, + "step": 42565 + }, + { + "epoch": 12.74, + "grad_norm": 2.0487442016601562, + "learning_rate": 1.4594903334063423e-05, + "loss": 1.0273, + "step": 42570 + }, + { + "epoch": 12.74, + "grad_norm": 2.534780502319336, + "learning_rate": 1.458956146084981e-05, + "loss": 1.0041, + "step": 42575 + }, + { + "epoch": 12.74, + "grad_norm": 4.824891567230225, + "learning_rate": 1.4584220162593898e-05, + "loss": 1.0993, + "step": 42580 + }, + { + "epoch": 12.74, + "grad_norm": 1.7529278993606567, + "learning_rate": 1.4578879439590675e-05, + "loss": 1.1054, + "step": 42585 + }, + { + "epoch": 12.74, + "grad_norm": 1.254698634147644, + "learning_rate": 1.4573539292135113e-05, + "loss": 0.8739, + "step": 42590 + }, + { + "epoch": 12.74, + "grad_norm": 4.246768951416016, + "learning_rate": 1.4568199720522135e-05, + "loss": 1.1456, + "step": 42595 + }, + { + "epoch": 12.75, + "grad_norm": 4.16953182220459, + "learning_rate": 1.4562860725046648e-05, + "loss": 0.839, + "step": 42600 + }, + { + "epoch": 12.75, + "grad_norm": 1.6719205379486084, + "learning_rate": 1.4557522306003521e-05, + "loss": 0.9557, + "step": 42605 + }, + { + "epoch": 12.75, + "grad_norm": 2.465372085571289, + "learning_rate": 1.4552184463687562e-05, + "loss": 1.0845, + "step": 42610 + }, + { + "epoch": 12.75, + "grad_norm": 3.8194258213043213, + "learning_rate": 1.4546847198393618e-05, + "loss": 1.0781, + "step": 42615 + }, + { + "epoch": 12.75, + "grad_norm": 1.8574891090393066, + "learning_rate": 1.4541510510416418e-05, + "loss": 0.9392, + "step": 42620 + }, + { + "epoch": 12.75, + "grad_norm": 2.715480089187622, + "learning_rate": 1.4536174400050739e-05, + "loss": 1.1462, + "step": 42625 + }, + { + "epoch": 12.75, + "grad_norm": 1.606964349746704, + "learning_rate": 1.453083886759125e-05, + "loss": 0.9121, + "step": 42630 + }, + { + "epoch": 12.76, + "grad_norm": 3.806936740875244, + "learning_rate": 1.4525503913332672e-05, + "loss": 0.973, + "step": 42635 + }, + { + "epoch": 12.76, + "grad_norm": 3.890246629714966, + "learning_rate": 1.4520169537569615e-05, + "loss": 1.1849, + "step": 42640 + }, + { + "epoch": 12.76, + "grad_norm": 3.9114744663238525, + "learning_rate": 1.4514835740596699e-05, + "loss": 1.0194, + "step": 42645 + }, + { + "epoch": 12.76, + "grad_norm": 1.8596765995025635, + "learning_rate": 1.4509502522708507e-05, + "loss": 1.057, + "step": 42650 + }, + { + "epoch": 12.76, + "grad_norm": 3.2842726707458496, + "learning_rate": 1.4504169884199587e-05, + "loss": 0.9596, + "step": 42655 + }, + { + "epoch": 12.76, + "grad_norm": 2.9906461238861084, + "learning_rate": 1.4498837825364452e-05, + "loss": 0.8944, + "step": 42660 + }, + { + "epoch": 12.76, + "grad_norm": 1.7279856204986572, + "learning_rate": 1.4493506346497593e-05, + "loss": 1.0216, + "step": 42665 + }, + { + "epoch": 12.77, + "grad_norm": 1.639946460723877, + "learning_rate": 1.4488175447893455e-05, + "loss": 1.097, + "step": 42670 + }, + { + "epoch": 12.77, + "grad_norm": 9.285032272338867, + "learning_rate": 1.4482845129846459e-05, + "loss": 1.0578, + "step": 42675 + }, + { + "epoch": 12.77, + "grad_norm": 1.8126649856567383, + "learning_rate": 1.4477515392650997e-05, + "loss": 0.999, + "step": 42680 + }, + { + "epoch": 12.77, + "grad_norm": 2.5790581703186035, + "learning_rate": 1.447218623660142e-05, + "loss": 0.9432, + "step": 42685 + }, + { + "epoch": 12.77, + "grad_norm": 1.561050295829773, + "learning_rate": 1.4466857661992055e-05, + "loss": 0.9825, + "step": 42690 + }, + { + "epoch": 12.77, + "grad_norm": 3.258485794067383, + "learning_rate": 1.4461529669117197e-05, + "loss": 1.1113, + "step": 42695 + }, + { + "epoch": 12.78, + "grad_norm": 1.9323861598968506, + "learning_rate": 1.4456202258271096e-05, + "loss": 1.1198, + "step": 42700 + }, + { + "epoch": 12.78, + "grad_norm": 3.977754831314087, + "learning_rate": 1.4450875429747987e-05, + "loss": 1.0889, + "step": 42705 + }, + { + "epoch": 12.78, + "grad_norm": 2.0819084644317627, + "learning_rate": 1.4445549183842072e-05, + "loss": 1.2944, + "step": 42710 + }, + { + "epoch": 12.78, + "grad_norm": 2.5505735874176025, + "learning_rate": 1.4440223520847485e-05, + "loss": 0.9569, + "step": 42715 + }, + { + "epoch": 12.78, + "grad_norm": 1.7122985124588013, + "learning_rate": 1.4434898441058398e-05, + "loss": 1.0941, + "step": 42720 + }, + { + "epoch": 12.78, + "grad_norm": 2.2943074703216553, + "learning_rate": 1.4429573944768865e-05, + "loss": 1.0611, + "step": 42725 + }, + { + "epoch": 12.78, + "grad_norm": 5.206298828125, + "learning_rate": 1.4424250032272999e-05, + "loss": 1.1535, + "step": 42730 + }, + { + "epoch": 12.79, + "grad_norm": 1.9564958810806274, + "learning_rate": 1.4418926703864799e-05, + "loss": 1.0421, + "step": 42735 + }, + { + "epoch": 12.79, + "grad_norm": 1.8103880882263184, + "learning_rate": 1.4413603959838278e-05, + "loss": 0.8809, + "step": 42740 + }, + { + "epoch": 12.79, + "grad_norm": 2.9786670207977295, + "learning_rate": 1.4408281800487411e-05, + "loss": 1.0065, + "step": 42745 + }, + { + "epoch": 12.79, + "grad_norm": 5.209625720977783, + "learning_rate": 1.4402960226106128e-05, + "loss": 1.0411, + "step": 42750 + }, + { + "epoch": 12.79, + "grad_norm": 2.3398683071136475, + "learning_rate": 1.4397639236988334e-05, + "loss": 1.0483, + "step": 42755 + }, + { + "epoch": 12.79, + "grad_norm": 1.2769595384597778, + "learning_rate": 1.4392318833427896e-05, + "loss": 1.0679, + "step": 42760 + }, + { + "epoch": 12.79, + "grad_norm": 8.292903900146484, + "learning_rate": 1.4386999015718684e-05, + "loss": 1.0761, + "step": 42765 + }, + { + "epoch": 12.8, + "grad_norm": 2.5282647609710693, + "learning_rate": 1.4381679784154472e-05, + "loss": 1.041, + "step": 42770 + }, + { + "epoch": 12.8, + "grad_norm": 2.1609020233154297, + "learning_rate": 1.4376361139029052e-05, + "loss": 1.123, + "step": 42775 + }, + { + "epoch": 12.8, + "grad_norm": 1.4257582426071167, + "learning_rate": 1.4371043080636159e-05, + "loss": 1.1743, + "step": 42780 + }, + { + "epoch": 12.8, + "grad_norm": 3.5058562755584717, + "learning_rate": 1.4365725609269515e-05, + "loss": 1.0082, + "step": 42785 + }, + { + "epoch": 12.8, + "grad_norm": 2.25451397895813, + "learning_rate": 1.4360408725222785e-05, + "loss": 0.9842, + "step": 42790 + }, + { + "epoch": 12.8, + "grad_norm": 2.5415477752685547, + "learning_rate": 1.4355092428789627e-05, + "loss": 0.9281, + "step": 42795 + }, + { + "epoch": 12.81, + "grad_norm": 2.6299939155578613, + "learning_rate": 1.4349776720263647e-05, + "loss": 0.9657, + "step": 42800 + }, + { + "epoch": 12.81, + "grad_norm": 3.0297467708587646, + "learning_rate": 1.4344461599938439e-05, + "loss": 0.9558, + "step": 42805 + }, + { + "epoch": 12.81, + "grad_norm": 1.9421048164367676, + "learning_rate": 1.4339147068107522e-05, + "loss": 1.0252, + "step": 42810 + }, + { + "epoch": 12.81, + "grad_norm": 3.981034517288208, + "learning_rate": 1.433383312506445e-05, + "loss": 1.0214, + "step": 42815 + }, + { + "epoch": 12.81, + "grad_norm": 5.4559736251831055, + "learning_rate": 1.4328519771102666e-05, + "loss": 0.9897, + "step": 42820 + }, + { + "epoch": 12.81, + "grad_norm": 3.1577250957489014, + "learning_rate": 1.4323207006515665e-05, + "loss": 0.8926, + "step": 42825 + }, + { + "epoch": 12.81, + "grad_norm": 11.30273723602295, + "learning_rate": 1.4317894831596823e-05, + "loss": 0.9951, + "step": 42830 + }, + { + "epoch": 12.82, + "grad_norm": 1.4776180982589722, + "learning_rate": 1.4312583246639566e-05, + "loss": 1.1993, + "step": 42835 + }, + { + "epoch": 12.82, + "grad_norm": 1.8206647634506226, + "learning_rate": 1.4307272251937215e-05, + "loss": 1.1282, + "step": 42840 + }, + { + "epoch": 12.82, + "grad_norm": 4.272174835205078, + "learning_rate": 1.4301961847783096e-05, + "loss": 1.0822, + "step": 42845 + }, + { + "epoch": 12.82, + "grad_norm": 1.5834592580795288, + "learning_rate": 1.4296652034470523e-05, + "loss": 1.0474, + "step": 42850 + }, + { + "epoch": 12.82, + "grad_norm": 3.7897226810455322, + "learning_rate": 1.4291342812292712e-05, + "loss": 1.0012, + "step": 42855 + }, + { + "epoch": 12.82, + "grad_norm": 1.9825385808944702, + "learning_rate": 1.4286034181542928e-05, + "loss": 0.9521, + "step": 42860 + }, + { + "epoch": 12.82, + "grad_norm": 2.32985782623291, + "learning_rate": 1.4280726142514328e-05, + "loss": 1.189, + "step": 42865 + }, + { + "epoch": 12.83, + "grad_norm": 2.0926525592803955, + "learning_rate": 1.4275418695500081e-05, + "loss": 0.8125, + "step": 42870 + }, + { + "epoch": 12.83, + "grad_norm": 3.3241047859191895, + "learning_rate": 1.4270111840793316e-05, + "loss": 0.8291, + "step": 42875 + }, + { + "epoch": 12.83, + "grad_norm": 2.867534637451172, + "learning_rate": 1.4264805578687116e-05, + "loss": 0.9589, + "step": 42880 + }, + { + "epoch": 12.83, + "grad_norm": 1.9998924732208252, + "learning_rate": 1.4259499909474548e-05, + "loss": 1.176, + "step": 42885 + }, + { + "epoch": 12.83, + "grad_norm": 3.46891188621521, + "learning_rate": 1.4254194833448634e-05, + "loss": 1.0184, + "step": 42890 + }, + { + "epoch": 12.83, + "grad_norm": 2.473839044570923, + "learning_rate": 1.4248890350902371e-05, + "loss": 1.104, + "step": 42895 + }, + { + "epoch": 12.84, + "grad_norm": 1.8417174816131592, + "learning_rate": 1.4243586462128727e-05, + "loss": 1.0769, + "step": 42900 + }, + { + "epoch": 12.84, + "grad_norm": 3.319664716720581, + "learning_rate": 1.4238283167420601e-05, + "loss": 0.9912, + "step": 42905 + }, + { + "epoch": 12.84, + "grad_norm": 1.5683692693710327, + "learning_rate": 1.4232980467070933e-05, + "loss": 0.8247, + "step": 42910 + }, + { + "epoch": 12.84, + "grad_norm": 1.6574991941452026, + "learning_rate": 1.4227678361372544e-05, + "loss": 1.074, + "step": 42915 + }, + { + "epoch": 12.84, + "grad_norm": 4.681344032287598, + "learning_rate": 1.4222376850618286e-05, + "loss": 1.0578, + "step": 42920 + }, + { + "epoch": 12.84, + "grad_norm": 1.7636343240737915, + "learning_rate": 1.4217075935100955e-05, + "loss": 1.0217, + "step": 42925 + }, + { + "epoch": 12.84, + "grad_norm": 3.016009569168091, + "learning_rate": 1.4211775615113312e-05, + "loss": 1.0764, + "step": 42930 + }, + { + "epoch": 12.85, + "grad_norm": 2.3196792602539062, + "learning_rate": 1.4206475890948099e-05, + "loss": 0.9541, + "step": 42935 + }, + { + "epoch": 12.85, + "grad_norm": 1.9939143657684326, + "learning_rate": 1.4201176762897981e-05, + "loss": 1.1661, + "step": 42940 + }, + { + "epoch": 12.85, + "grad_norm": 2.861206531524658, + "learning_rate": 1.4195878231255665e-05, + "loss": 0.9937, + "step": 42945 + }, + { + "epoch": 12.85, + "grad_norm": 3.518277883529663, + "learning_rate": 1.4190580296313746e-05, + "loss": 0.9096, + "step": 42950 + }, + { + "epoch": 12.85, + "grad_norm": 5.832763671875, + "learning_rate": 1.4185282958364863e-05, + "loss": 0.982, + "step": 42955 + }, + { + "epoch": 12.85, + "grad_norm": 1.4618480205535889, + "learning_rate": 1.4179986217701547e-05, + "loss": 1.1446, + "step": 42960 + }, + { + "epoch": 12.85, + "grad_norm": 2.2054197788238525, + "learning_rate": 1.4174690074616348e-05, + "loss": 1.0362, + "step": 42965 + }, + { + "epoch": 12.86, + "grad_norm": 2.494736909866333, + "learning_rate": 1.4169394529401764e-05, + "loss": 1.151, + "step": 42970 + }, + { + "epoch": 12.86, + "grad_norm": 2.238530397415161, + "learning_rate": 1.416409958235026e-05, + "loss": 0.9931, + "step": 42975 + }, + { + "epoch": 12.86, + "grad_norm": 2.8167457580566406, + "learning_rate": 1.4158805233754274e-05, + "loss": 0.9729, + "step": 42980 + }, + { + "epoch": 12.86, + "grad_norm": 2.740593433380127, + "learning_rate": 1.4153511483906207e-05, + "loss": 0.894, + "step": 42985 + }, + { + "epoch": 12.86, + "grad_norm": 2.3016016483306885, + "learning_rate": 1.414821833309843e-05, + "loss": 1.0095, + "step": 42990 + }, + { + "epoch": 12.86, + "grad_norm": 4.366925239562988, + "learning_rate": 1.4142925781623279e-05, + "loss": 1.0067, + "step": 42995 + }, + { + "epoch": 12.87, + "grad_norm": 2.0582058429718018, + "learning_rate": 1.4137633829773032e-05, + "loss": 1.0384, + "step": 43000 + }, + { + "epoch": 12.87, + "grad_norm": 2.0724596977233887, + "learning_rate": 1.4132342477839988e-05, + "loss": 1.0962, + "step": 43005 + }, + { + "epoch": 12.87, + "grad_norm": 4.274909496307373, + "learning_rate": 1.4127051726116375e-05, + "loss": 0.9295, + "step": 43010 + }, + { + "epoch": 12.87, + "grad_norm": 4.048436164855957, + "learning_rate": 1.4121761574894393e-05, + "loss": 0.7961, + "step": 43015 + }, + { + "epoch": 12.87, + "grad_norm": 2.6834957599639893, + "learning_rate": 1.4116472024466209e-05, + "loss": 1.1797, + "step": 43020 + }, + { + "epoch": 12.87, + "grad_norm": 1.8873329162597656, + "learning_rate": 1.4111183075123965e-05, + "loss": 0.8737, + "step": 43025 + }, + { + "epoch": 12.87, + "grad_norm": 3.20086407661438, + "learning_rate": 1.410589472715977e-05, + "loss": 0.9586, + "step": 43030 + }, + { + "epoch": 12.88, + "grad_norm": 2.417081117630005, + "learning_rate": 1.4100606980865666e-05, + "loss": 1.0065, + "step": 43035 + }, + { + "epoch": 12.88, + "grad_norm": 7.702847003936768, + "learning_rate": 1.4095319836533732e-05, + "loss": 1.0683, + "step": 43040 + }, + { + "epoch": 12.88, + "grad_norm": 2.575244665145874, + "learning_rate": 1.4090033294455923e-05, + "loss": 0.743, + "step": 43045 + }, + { + "epoch": 12.88, + "grad_norm": 2.828678607940674, + "learning_rate": 1.408474735492426e-05, + "loss": 1.0292, + "step": 43050 + }, + { + "epoch": 12.88, + "grad_norm": 2.4835140705108643, + "learning_rate": 1.4079462018230633e-05, + "loss": 1.0267, + "step": 43055 + }, + { + "epoch": 12.88, + "grad_norm": 3.8808891773223877, + "learning_rate": 1.4074177284666986e-05, + "loss": 0.7873, + "step": 43060 + }, + { + "epoch": 12.88, + "grad_norm": 2.5032103061676025, + "learning_rate": 1.4068893154525165e-05, + "loss": 1.1472, + "step": 43065 + }, + { + "epoch": 12.89, + "grad_norm": 7.541705131530762, + "learning_rate": 1.4063609628097007e-05, + "loss": 1.0327, + "step": 43070 + }, + { + "epoch": 12.89, + "grad_norm": 1.81223726272583, + "learning_rate": 1.4058326705674324e-05, + "loss": 1.0865, + "step": 43075 + }, + { + "epoch": 12.89, + "grad_norm": 1.7020493745803833, + "learning_rate": 1.4053044387548886e-05, + "loss": 1.1599, + "step": 43080 + }, + { + "epoch": 12.89, + "grad_norm": 1.811834454536438, + "learning_rate": 1.4047762674012426e-05, + "loss": 1.1018, + "step": 43085 + }, + { + "epoch": 12.89, + "grad_norm": 2.5735673904418945, + "learning_rate": 1.404248156535665e-05, + "loss": 1.164, + "step": 43090 + }, + { + "epoch": 12.89, + "grad_norm": 5.103279113769531, + "learning_rate": 1.4037201061873225e-05, + "loss": 1.0598, + "step": 43095 + }, + { + "epoch": 12.9, + "grad_norm": 2.6463088989257812, + "learning_rate": 1.4031921163853791e-05, + "loss": 1.1851, + "step": 43100 + }, + { + "epoch": 12.9, + "grad_norm": 3.571685314178467, + "learning_rate": 1.4026641871589951e-05, + "loss": 1.1104, + "step": 43105 + }, + { + "epoch": 12.9, + "grad_norm": 2.5913071632385254, + "learning_rate": 1.4021363185373277e-05, + "loss": 1.11, + "step": 43110 + }, + { + "epoch": 12.9, + "grad_norm": 1.9859071969985962, + "learning_rate": 1.4016085105495299e-05, + "loss": 1.0636, + "step": 43115 + }, + { + "epoch": 12.9, + "grad_norm": 1.736292839050293, + "learning_rate": 1.4010807632247525e-05, + "loss": 1.1104, + "step": 43120 + }, + { + "epoch": 12.9, + "grad_norm": 4.8815836906433105, + "learning_rate": 1.4005530765921432e-05, + "loss": 0.8693, + "step": 43125 + }, + { + "epoch": 12.9, + "grad_norm": 3.071505069732666, + "learning_rate": 1.4000254506808428e-05, + "loss": 1.0174, + "step": 43130 + }, + { + "epoch": 12.91, + "grad_norm": 6.106081485748291, + "learning_rate": 1.3994978855199953e-05, + "loss": 1.0536, + "step": 43135 + }, + { + "epoch": 12.91, + "grad_norm": 3.1881868839263916, + "learning_rate": 1.3989703811387337e-05, + "loss": 1.0537, + "step": 43140 + }, + { + "epoch": 12.91, + "grad_norm": 9.79444694519043, + "learning_rate": 1.3984429375661955e-05, + "loss": 0.8779, + "step": 43145 + }, + { + "epoch": 12.91, + "grad_norm": 3.3935022354125977, + "learning_rate": 1.3979155548315067e-05, + "loss": 0.9611, + "step": 43150 + }, + { + "epoch": 12.91, + "grad_norm": 2.132736921310425, + "learning_rate": 1.3973882329637983e-05, + "loss": 0.9938, + "step": 43155 + }, + { + "epoch": 12.91, + "grad_norm": 3.71537446975708, + "learning_rate": 1.396860971992191e-05, + "loss": 0.8078, + "step": 43160 + }, + { + "epoch": 12.91, + "grad_norm": 1.3954871892929077, + "learning_rate": 1.3963337719458052e-05, + "loss": 0.8549, + "step": 43165 + }, + { + "epoch": 12.92, + "grad_norm": 4.286108016967773, + "learning_rate": 1.3958066328537583e-05, + "loss": 1.1669, + "step": 43170 + }, + { + "epoch": 12.92, + "grad_norm": 1.1374285221099854, + "learning_rate": 1.395279554745163e-05, + "loss": 1.1838, + "step": 43175 + }, + { + "epoch": 12.92, + "grad_norm": 4.613668441772461, + "learning_rate": 1.39475253764913e-05, + "loss": 0.9838, + "step": 43180 + }, + { + "epoch": 12.92, + "grad_norm": 4.321890830993652, + "learning_rate": 1.3942255815947652e-05, + "loss": 0.972, + "step": 43185 + }, + { + "epoch": 12.92, + "grad_norm": 2.9430012702941895, + "learning_rate": 1.393698686611172e-05, + "loss": 0.9042, + "step": 43190 + }, + { + "epoch": 12.92, + "grad_norm": 3.068352222442627, + "learning_rate": 1.3931718527274506e-05, + "loss": 1.0785, + "step": 43195 + }, + { + "epoch": 12.92, + "grad_norm": 1.4933991432189941, + "learning_rate": 1.3926450799726975e-05, + "loss": 0.9176, + "step": 43200 + }, + { + "epoch": 12.93, + "grad_norm": 3.601362943649292, + "learning_rate": 1.3921183683760054e-05, + "loss": 1.0315, + "step": 43205 + }, + { + "epoch": 12.93, + "grad_norm": 4.306023120880127, + "learning_rate": 1.391591717966464e-05, + "loss": 1.1656, + "step": 43210 + }, + { + "epoch": 12.93, + "grad_norm": 1.5193262100219727, + "learning_rate": 1.3910651287731601e-05, + "loss": 1.0546, + "step": 43215 + }, + { + "epoch": 12.93, + "grad_norm": 1.7641685009002686, + "learning_rate": 1.3905386008251775e-05, + "loss": 1.1402, + "step": 43220 + }, + { + "epoch": 12.93, + "grad_norm": 3.4775922298431396, + "learning_rate": 1.3900121341515929e-05, + "loss": 1.0022, + "step": 43225 + }, + { + "epoch": 12.93, + "grad_norm": 2.7105138301849365, + "learning_rate": 1.389485728781486e-05, + "loss": 1.0017, + "step": 43230 + }, + { + "epoch": 12.94, + "grad_norm": 4.041553497314453, + "learning_rate": 1.3889593847439265e-05, + "loss": 0.9267, + "step": 43235 + }, + { + "epoch": 12.94, + "grad_norm": 3.279675006866455, + "learning_rate": 1.3884331020679869e-05, + "loss": 0.9009, + "step": 43240 + }, + { + "epoch": 12.94, + "grad_norm": 1.1497808694839478, + "learning_rate": 1.3879068807827295e-05, + "loss": 1.0816, + "step": 43245 + }, + { + "epoch": 12.94, + "grad_norm": 3.0773322582244873, + "learning_rate": 1.3873807209172215e-05, + "loss": 1.0265, + "step": 43250 + }, + { + "epoch": 12.94, + "grad_norm": 3.1439836025238037, + "learning_rate": 1.3868546225005185e-05, + "loss": 1.0566, + "step": 43255 + }, + { + "epoch": 12.94, + "grad_norm": 4.710149765014648, + "learning_rate": 1.3863285855616781e-05, + "loss": 1.1018, + "step": 43260 + }, + { + "epoch": 12.94, + "grad_norm": 4.115421295166016, + "learning_rate": 1.385802610129752e-05, + "loss": 1.0303, + "step": 43265 + }, + { + "epoch": 12.95, + "grad_norm": 1.8717732429504395, + "learning_rate": 1.3852766962337897e-05, + "loss": 1.152, + "step": 43270 + }, + { + "epoch": 12.95, + "grad_norm": 1.6938856840133667, + "learning_rate": 1.3847508439028367e-05, + "loss": 1.1106, + "step": 43275 + }, + { + "epoch": 12.95, + "grad_norm": 2.244997024536133, + "learning_rate": 1.3842250531659351e-05, + "loss": 1.0663, + "step": 43280 + }, + { + "epoch": 12.95, + "grad_norm": 1.7450692653656006, + "learning_rate": 1.3836993240521245e-05, + "loss": 0.9527, + "step": 43285 + }, + { + "epoch": 12.95, + "grad_norm": 2.384140968322754, + "learning_rate": 1.3831736565904396e-05, + "loss": 0.8981, + "step": 43290 + }, + { + "epoch": 12.95, + "grad_norm": 3.3110265731811523, + "learning_rate": 1.382648050809913e-05, + "loss": 0.9716, + "step": 43295 + }, + { + "epoch": 12.95, + "grad_norm": 1.9519492387771606, + "learning_rate": 1.3821225067395729e-05, + "loss": 1.0456, + "step": 43300 + }, + { + "epoch": 12.96, + "grad_norm": 3.2571592330932617, + "learning_rate": 1.3815970244084447e-05, + "loss": 1.3004, + "step": 43305 + }, + { + "epoch": 12.96, + "grad_norm": 3.3342976570129395, + "learning_rate": 1.3810716038455506e-05, + "loss": 0.9739, + "step": 43310 + }, + { + "epoch": 12.96, + "grad_norm": 2.1756532192230225, + "learning_rate": 1.3805462450799098e-05, + "loss": 0.9842, + "step": 43315 + }, + { + "epoch": 12.96, + "grad_norm": 3.6889872550964355, + "learning_rate": 1.3800209481405341e-05, + "loss": 1.0174, + "step": 43320 + }, + { + "epoch": 12.96, + "grad_norm": 2.0239474773406982, + "learning_rate": 1.3794957130564393e-05, + "loss": 1.0647, + "step": 43325 + }, + { + "epoch": 12.96, + "grad_norm": 7.694988250732422, + "learning_rate": 1.3789705398566296e-05, + "loss": 0.9891, + "step": 43330 + }, + { + "epoch": 12.97, + "grad_norm": 1.3746068477630615, + "learning_rate": 1.3784454285701137e-05, + "loss": 1.0001, + "step": 43335 + }, + { + "epoch": 12.97, + "grad_norm": 3.215383291244507, + "learning_rate": 1.3779203792258891e-05, + "loss": 1.1611, + "step": 43340 + }, + { + "epoch": 12.97, + "grad_norm": 2.7879562377929688, + "learning_rate": 1.3773953918529576e-05, + "loss": 1.0483, + "step": 43345 + }, + { + "epoch": 12.97, + "grad_norm": 4.000429630279541, + "learning_rate": 1.3768704664803106e-05, + "loss": 1.0986, + "step": 43350 + }, + { + "epoch": 12.97, + "grad_norm": 3.5371413230895996, + "learning_rate": 1.3763456031369404e-05, + "loss": 1.0504, + "step": 43355 + }, + { + "epoch": 12.97, + "grad_norm": 2.885620594024658, + "learning_rate": 1.3758208018518346e-05, + "loss": 0.8533, + "step": 43360 + }, + { + "epoch": 12.97, + "grad_norm": 1.5964106321334839, + "learning_rate": 1.3752960626539774e-05, + "loss": 1.0267, + "step": 43365 + }, + { + "epoch": 12.98, + "grad_norm": 2.651956558227539, + "learning_rate": 1.3747713855723493e-05, + "loss": 1.0612, + "step": 43370 + }, + { + "epoch": 12.98, + "grad_norm": 1.7484973669052124, + "learning_rate": 1.3742467706359283e-05, + "loss": 1.1684, + "step": 43375 + }, + { + "epoch": 12.98, + "grad_norm": 2.3421952724456787, + "learning_rate": 1.373722217873688e-05, + "loss": 1.1648, + "step": 43380 + }, + { + "epoch": 12.98, + "grad_norm": 2.6561176776885986, + "learning_rate": 1.3731977273145984e-05, + "loss": 0.9651, + "step": 43385 + }, + { + "epoch": 12.98, + "grad_norm": 2.5251173973083496, + "learning_rate": 1.3726732989876278e-05, + "loss": 0.9813, + "step": 43390 + }, + { + "epoch": 12.98, + "grad_norm": 2.7231156826019287, + "learning_rate": 1.3721489329217385e-05, + "loss": 1.1203, + "step": 43395 + }, + { + "epoch": 12.98, + "grad_norm": 2.138787269592285, + "learning_rate": 1.3716246291458918e-05, + "loss": 1.0665, + "step": 43400 + }, + { + "epoch": 12.99, + "grad_norm": 7.916914939880371, + "learning_rate": 1.3711003876890438e-05, + "loss": 1.1231, + "step": 43405 + }, + { + "epoch": 12.99, + "grad_norm": 2.818751811981201, + "learning_rate": 1.3705762085801478e-05, + "loss": 1.101, + "step": 43410 + }, + { + "epoch": 12.99, + "grad_norm": 2.751744031906128, + "learning_rate": 1.370052091848154e-05, + "loss": 0.9642, + "step": 43415 + }, + { + "epoch": 12.99, + "grad_norm": 1.3935418128967285, + "learning_rate": 1.3695280375220093e-05, + "loss": 0.9676, + "step": 43420 + }, + { + "epoch": 12.99, + "grad_norm": 2.9495058059692383, + "learning_rate": 1.3690040456306546e-05, + "loss": 1.0966, + "step": 43425 + }, + { + "epoch": 12.99, + "grad_norm": 3.748474359512329, + "learning_rate": 1.3684801162030326e-05, + "loss": 1.0841, + "step": 43430 + }, + { + "epoch": 13.0, + "grad_norm": 1.8227459192276, + "learning_rate": 1.3679562492680754e-05, + "loss": 0.9915, + "step": 43435 + }, + { + "epoch": 13.0, + "grad_norm": 4.090573310852051, + "learning_rate": 1.3674324448547201e-05, + "loss": 0.8671, + "step": 43440 + }, + { + "epoch": 13.0, + "grad_norm": 3.219416856765747, + "learning_rate": 1.3669087029918925e-05, + "loss": 1.0274, + "step": 43445 + }, + { + "epoch": 13.0, + "grad_norm": 1.7496566772460938, + "learning_rate": 1.3663850237085196e-05, + "loss": 1.1035, + "step": 43450 + }, + { + "epoch": 13.0, + "grad_norm": 1.8964442014694214, + "learning_rate": 1.3658614070335236e-05, + "loss": 0.9436, + "step": 43455 + }, + { + "epoch": 13.0, + "grad_norm": 1.2473254203796387, + "learning_rate": 1.365337852995823e-05, + "loss": 0.9611, + "step": 43460 + }, + { + "epoch": 13.0, + "grad_norm": 2.2625179290771484, + "learning_rate": 1.3648143616243334e-05, + "loss": 0.8075, + "step": 43465 + }, + { + "epoch": 13.01, + "grad_norm": 1.278442144393921, + "learning_rate": 1.3642909329479666e-05, + "loss": 0.8595, + "step": 43470 + }, + { + "epoch": 13.01, + "grad_norm": 3.522331953048706, + "learning_rate": 1.3637675669956312e-05, + "loss": 1.0262, + "step": 43475 + }, + { + "epoch": 13.01, + "grad_norm": 2.990205764770508, + "learning_rate": 1.363244263796232e-05, + "loss": 1.0063, + "step": 43480 + }, + { + "epoch": 13.01, + "grad_norm": 1.9036306142807007, + "learning_rate": 1.3627210233786705e-05, + "loss": 0.8788, + "step": 43485 + }, + { + "epoch": 13.01, + "grad_norm": 3.0223844051361084, + "learning_rate": 1.3621978457718449e-05, + "loss": 0.9906, + "step": 43490 + }, + { + "epoch": 13.01, + "grad_norm": 2.851306676864624, + "learning_rate": 1.3616747310046493e-05, + "loss": 1.1554, + "step": 43495 + }, + { + "epoch": 13.01, + "grad_norm": 1.1327463388442993, + "learning_rate": 1.3611516791059754e-05, + "loss": 0.9168, + "step": 43500 + }, + { + "epoch": 13.02, + "grad_norm": 2.258399248123169, + "learning_rate": 1.3606286901047099e-05, + "loss": 0.9594, + "step": 43505 + }, + { + "epoch": 13.02, + "grad_norm": 2.515151262283325, + "learning_rate": 1.3601057640297382e-05, + "loss": 0.961, + "step": 43510 + }, + { + "epoch": 13.02, + "grad_norm": 5.121065616607666, + "learning_rate": 1.359582900909941e-05, + "loss": 0.8494, + "step": 43515 + }, + { + "epoch": 13.02, + "grad_norm": 2.3708105087280273, + "learning_rate": 1.3590601007741926e-05, + "loss": 0.9847, + "step": 43520 + }, + { + "epoch": 13.02, + "grad_norm": 1.6514267921447754, + "learning_rate": 1.3585373636513715e-05, + "loss": 1.1194, + "step": 43525 + }, + { + "epoch": 13.02, + "grad_norm": 4.888957977294922, + "learning_rate": 1.3580146895703428e-05, + "loss": 1.012, + "step": 43530 + }, + { + "epoch": 13.03, + "grad_norm": 3.775286912918091, + "learning_rate": 1.357492078559978e-05, + "loss": 1.0716, + "step": 43535 + }, + { + "epoch": 13.03, + "grad_norm": 2.8520710468292236, + "learning_rate": 1.3569695306491358e-05, + "loss": 0.9447, + "step": 43540 + }, + { + "epoch": 13.03, + "grad_norm": 1.2906252145767212, + "learning_rate": 1.3564470458666806e-05, + "loss": 0.7763, + "step": 43545 + }, + { + "epoch": 13.03, + "grad_norm": 3.1408872604370117, + "learning_rate": 1.3559246242414653e-05, + "loss": 0.9982, + "step": 43550 + }, + { + "epoch": 13.03, + "grad_norm": 3.5548839569091797, + "learning_rate": 1.3554022658023442e-05, + "loss": 1.1682, + "step": 43555 + }, + { + "epoch": 13.03, + "grad_norm": 2.9345786571502686, + "learning_rate": 1.3548799705781655e-05, + "loss": 1.0853, + "step": 43560 + }, + { + "epoch": 13.03, + "grad_norm": 1.7587555646896362, + "learning_rate": 1.3543577385977763e-05, + "loss": 0.9033, + "step": 43565 + }, + { + "epoch": 13.04, + "grad_norm": 2.7659850120544434, + "learning_rate": 1.3538355698900182e-05, + "loss": 0.9808, + "step": 43570 + }, + { + "epoch": 13.04, + "grad_norm": 3.0085887908935547, + "learning_rate": 1.3533134644837303e-05, + "loss": 1.2206, + "step": 43575 + }, + { + "epoch": 13.04, + "grad_norm": 2.224841594696045, + "learning_rate": 1.3527914224077475e-05, + "loss": 1.0777, + "step": 43580 + }, + { + "epoch": 13.04, + "grad_norm": 2.03397536277771, + "learning_rate": 1.3522694436909022e-05, + "loss": 0.9373, + "step": 43585 + }, + { + "epoch": 13.04, + "grad_norm": 1.9126800298690796, + "learning_rate": 1.3517475283620226e-05, + "loss": 0.9797, + "step": 43590 + }, + { + "epoch": 13.04, + "grad_norm": 1.9172559976577759, + "learning_rate": 1.3512256764499335e-05, + "loss": 1.051, + "step": 43595 + }, + { + "epoch": 13.04, + "grad_norm": 3.562412738800049, + "learning_rate": 1.3507038879834561e-05, + "loss": 0.9013, + "step": 43600 + }, + { + "epoch": 13.05, + "grad_norm": 3.7953248023986816, + "learning_rate": 1.3501821629914082e-05, + "loss": 0.9603, + "step": 43605 + }, + { + "epoch": 13.05, + "grad_norm": 5.259886741638184, + "learning_rate": 1.3496605015026054e-05, + "loss": 0.9906, + "step": 43610 + }, + { + "epoch": 13.05, + "grad_norm": 1.6214237213134766, + "learning_rate": 1.3491389035458557e-05, + "loss": 0.8656, + "step": 43615 + }, + { + "epoch": 13.05, + "grad_norm": 2.001491069793701, + "learning_rate": 1.3486173691499698e-05, + "loss": 0.8376, + "step": 43620 + }, + { + "epoch": 13.05, + "grad_norm": 2.1933164596557617, + "learning_rate": 1.3480958983437475e-05, + "loss": 1.0552, + "step": 43625 + }, + { + "epoch": 13.05, + "grad_norm": 1.1674703359603882, + "learning_rate": 1.347574491155994e-05, + "loss": 0.9067, + "step": 43630 + }, + { + "epoch": 13.06, + "grad_norm": 3.31339693069458, + "learning_rate": 1.347053147615501e-05, + "loss": 0.9645, + "step": 43635 + }, + { + "epoch": 13.06, + "grad_norm": 2.1307213306427, + "learning_rate": 1.3465318677510664e-05, + "loss": 1.1741, + "step": 43640 + }, + { + "epoch": 13.06, + "grad_norm": 1.4840748310089111, + "learning_rate": 1.346010651591477e-05, + "loss": 0.8708, + "step": 43645 + }, + { + "epoch": 13.06, + "grad_norm": 1.7846823930740356, + "learning_rate": 1.3454894991655196e-05, + "loss": 0.8823, + "step": 43650 + }, + { + "epoch": 13.06, + "grad_norm": 16.712650299072266, + "learning_rate": 1.3449684105019772e-05, + "loss": 1.0997, + "step": 43655 + }, + { + "epoch": 13.06, + "grad_norm": 3.8388795852661133, + "learning_rate": 1.3444473856296277e-05, + "loss": 0.992, + "step": 43660 + }, + { + "epoch": 13.06, + "grad_norm": 1.0792344808578491, + "learning_rate": 1.3439264245772502e-05, + "loss": 0.9544, + "step": 43665 + }, + { + "epoch": 13.07, + "grad_norm": 2.6744980812072754, + "learning_rate": 1.3434055273736135e-05, + "loss": 0.9327, + "step": 43670 + }, + { + "epoch": 13.07, + "grad_norm": 4.98563289642334, + "learning_rate": 1.3428846940474871e-05, + "loss": 1.0448, + "step": 43675 + }, + { + "epoch": 13.07, + "grad_norm": 1.9465887546539307, + "learning_rate": 1.3423639246276366e-05, + "loss": 1.1599, + "step": 43680 + }, + { + "epoch": 13.07, + "grad_norm": 3.1894278526306152, + "learning_rate": 1.3418432191428232e-05, + "loss": 1.129, + "step": 43685 + }, + { + "epoch": 13.07, + "grad_norm": 1.8298451900482178, + "learning_rate": 1.3413225776218048e-05, + "loss": 1.0085, + "step": 43690 + }, + { + "epoch": 13.07, + "grad_norm": 2.9106414318084717, + "learning_rate": 1.3408020000933363e-05, + "loss": 1.0084, + "step": 43695 + }, + { + "epoch": 13.07, + "grad_norm": 1.1771775484085083, + "learning_rate": 1.340281486586168e-05, + "loss": 0.889, + "step": 43700 + }, + { + "epoch": 13.08, + "grad_norm": 15.508944511413574, + "learning_rate": 1.3397610371290492e-05, + "loss": 0.9712, + "step": 43705 + }, + { + "epoch": 13.08, + "grad_norm": 3.3548052310943604, + "learning_rate": 1.33924065175072e-05, + "loss": 0.9766, + "step": 43710 + }, + { + "epoch": 13.08, + "grad_norm": 1.3726935386657715, + "learning_rate": 1.338720330479925e-05, + "loss": 1.0355, + "step": 43715 + }, + { + "epoch": 13.08, + "grad_norm": 4.749921798706055, + "learning_rate": 1.3382000733453976e-05, + "loss": 1.154, + "step": 43720 + }, + { + "epoch": 13.08, + "grad_norm": 3.290040969848633, + "learning_rate": 1.3376798803758742e-05, + "loss": 1.101, + "step": 43725 + }, + { + "epoch": 13.08, + "grad_norm": 2.6318535804748535, + "learning_rate": 1.3371597516000809e-05, + "loss": 1.0034, + "step": 43730 + }, + { + "epoch": 13.09, + "grad_norm": 2.853180170059204, + "learning_rate": 1.3366396870467468e-05, + "loss": 1.0768, + "step": 43735 + }, + { + "epoch": 13.09, + "grad_norm": 4.42519474029541, + "learning_rate": 1.336119686744594e-05, + "loss": 0.9629, + "step": 43740 + }, + { + "epoch": 13.09, + "grad_norm": 3.0336861610412598, + "learning_rate": 1.33559975072234e-05, + "loss": 0.9401, + "step": 43745 + }, + { + "epoch": 13.09, + "grad_norm": 3.247124433517456, + "learning_rate": 1.335079879008703e-05, + "loss": 0.874, + "step": 43750 + }, + { + "epoch": 13.09, + "grad_norm": 2.6699001789093018, + "learning_rate": 1.3345600716323913e-05, + "loss": 1.1099, + "step": 43755 + }, + { + "epoch": 13.09, + "grad_norm": 6.743714332580566, + "learning_rate": 1.3340403286221176e-05, + "loss": 1.1685, + "step": 43760 + }, + { + "epoch": 13.09, + "grad_norm": 1.998405933380127, + "learning_rate": 1.3335206500065828e-05, + "loss": 1.0679, + "step": 43765 + }, + { + "epoch": 13.1, + "grad_norm": 4.789541721343994, + "learning_rate": 1.3330010358144917e-05, + "loss": 0.8997, + "step": 43770 + }, + { + "epoch": 13.1, + "grad_norm": 1.4575897455215454, + "learning_rate": 1.3324814860745394e-05, + "loss": 1.0083, + "step": 43775 + }, + { + "epoch": 13.1, + "grad_norm": 3.014540672302246, + "learning_rate": 1.331962000815421e-05, + "loss": 0.9282, + "step": 43780 + }, + { + "epoch": 13.1, + "grad_norm": 2.8595590591430664, + "learning_rate": 1.3314425800658275e-05, + "loss": 1.037, + "step": 43785 + }, + { + "epoch": 13.1, + "grad_norm": 1.8672281503677368, + "learning_rate": 1.3309232238544457e-05, + "loss": 1.0224, + "step": 43790 + }, + { + "epoch": 13.1, + "grad_norm": 1.25929594039917, + "learning_rate": 1.3304039322099588e-05, + "loss": 0.9784, + "step": 43795 + }, + { + "epoch": 13.1, + "grad_norm": 2.0091841220855713, + "learning_rate": 1.329884705161048e-05, + "loss": 0.9481, + "step": 43800 + }, + { + "epoch": 13.11, + "grad_norm": 3.789120674133301, + "learning_rate": 1.3293655427363866e-05, + "loss": 0.8126, + "step": 43805 + }, + { + "epoch": 13.11, + "grad_norm": 2.5179672241210938, + "learning_rate": 1.3288464449646503e-05, + "loss": 1.1027, + "step": 43810 + }, + { + "epoch": 13.11, + "grad_norm": 3.7937817573547363, + "learning_rate": 1.3283274118745079e-05, + "loss": 1.04, + "step": 43815 + }, + { + "epoch": 13.11, + "grad_norm": 3.776716947555542, + "learning_rate": 1.3278084434946247e-05, + "loss": 1.1246, + "step": 43820 + }, + { + "epoch": 13.11, + "grad_norm": 3.676992654800415, + "learning_rate": 1.3272895398536623e-05, + "loss": 0.9363, + "step": 43825 + }, + { + "epoch": 13.11, + "grad_norm": 2.452056407928467, + "learning_rate": 1.32677070098028e-05, + "loss": 1.1244, + "step": 43830 + }, + { + "epoch": 13.11, + "grad_norm": 4.2442307472229, + "learning_rate": 1.3262519269031331e-05, + "loss": 1.0792, + "step": 43835 + }, + { + "epoch": 13.12, + "grad_norm": 4.072299957275391, + "learning_rate": 1.3257332176508708e-05, + "loss": 0.9724, + "step": 43840 + }, + { + "epoch": 13.12, + "grad_norm": 5.6725311279296875, + "learning_rate": 1.3252145732521438e-05, + "loss": 0.9115, + "step": 43845 + }, + { + "epoch": 13.12, + "grad_norm": 4.147802829742432, + "learning_rate": 1.324695993735593e-05, + "loss": 1.1873, + "step": 43850 + }, + { + "epoch": 13.12, + "grad_norm": 2.995326042175293, + "learning_rate": 1.3241774791298628e-05, + "loss": 1.0866, + "step": 43855 + }, + { + "epoch": 13.12, + "grad_norm": 1.8452882766723633, + "learning_rate": 1.323659029463586e-05, + "loss": 1.1149, + "step": 43860 + }, + { + "epoch": 13.12, + "grad_norm": 3.185358762741089, + "learning_rate": 1.3231406447654004e-05, + "loss": 1.03, + "step": 43865 + }, + { + "epoch": 13.13, + "grad_norm": 5.549739360809326, + "learning_rate": 1.3226223250639328e-05, + "loss": 0.9462, + "step": 43870 + }, + { + "epoch": 13.13, + "grad_norm": 1.393759846687317, + "learning_rate": 1.3221040703878101e-05, + "loss": 1.0435, + "step": 43875 + }, + { + "epoch": 13.13, + "grad_norm": 1.9850372076034546, + "learning_rate": 1.3215858807656556e-05, + "loss": 1.0667, + "step": 43880 + }, + { + "epoch": 13.13, + "grad_norm": 2.1288838386535645, + "learning_rate": 1.3210677562260878e-05, + "loss": 0.9594, + "step": 43885 + }, + { + "epoch": 13.13, + "grad_norm": 1.302878975868225, + "learning_rate": 1.3205496967977226e-05, + "loss": 0.9986, + "step": 43890 + }, + { + "epoch": 13.13, + "grad_norm": 2.470869302749634, + "learning_rate": 1.3200317025091712e-05, + "loss": 0.9677, + "step": 43895 + }, + { + "epoch": 13.13, + "grad_norm": 3.7706472873687744, + "learning_rate": 1.3195137733890428e-05, + "loss": 0.9866, + "step": 43900 + }, + { + "epoch": 13.14, + "grad_norm": 1.7888362407684326, + "learning_rate": 1.3189959094659415e-05, + "loss": 1.061, + "step": 43905 + }, + { + "epoch": 13.14, + "grad_norm": 2.0343103408813477, + "learning_rate": 1.3184781107684687e-05, + "loss": 1.1638, + "step": 43910 + }, + { + "epoch": 13.14, + "grad_norm": 3.1971328258514404, + "learning_rate": 1.3179603773252217e-05, + "loss": 1.0671, + "step": 43915 + }, + { + "epoch": 13.14, + "grad_norm": 3.4805822372436523, + "learning_rate": 1.3174427091647943e-05, + "loss": 1.0129, + "step": 43920 + }, + { + "epoch": 13.14, + "grad_norm": 1.6715178489685059, + "learning_rate": 1.3169251063157767e-05, + "loss": 0.8938, + "step": 43925 + }, + { + "epoch": 13.14, + "grad_norm": 3.162388324737549, + "learning_rate": 1.3164075688067567e-05, + "loss": 0.8423, + "step": 43930 + }, + { + "epoch": 13.14, + "grad_norm": 2.0230519771575928, + "learning_rate": 1.3158900966663148e-05, + "loss": 1.0128, + "step": 43935 + }, + { + "epoch": 13.15, + "grad_norm": 2.7525076866149902, + "learning_rate": 1.315372689923034e-05, + "loss": 1.0519, + "step": 43940 + }, + { + "epoch": 13.15, + "grad_norm": 2.027358055114746, + "learning_rate": 1.314855348605486e-05, + "loss": 0.9151, + "step": 43945 + }, + { + "epoch": 13.15, + "grad_norm": 1.6696336269378662, + "learning_rate": 1.3143380727422472e-05, + "loss": 0.9351, + "step": 43950 + }, + { + "epoch": 13.15, + "grad_norm": 3.3159263134002686, + "learning_rate": 1.3138208623618823e-05, + "loss": 1.0519, + "step": 43955 + }, + { + "epoch": 13.15, + "grad_norm": 12.850335121154785, + "learning_rate": 1.3133037174929602e-05, + "loss": 1.0565, + "step": 43960 + }, + { + "epoch": 13.15, + "grad_norm": 2.337259531021118, + "learning_rate": 1.3127866381640392e-05, + "loss": 1.0833, + "step": 43965 + }, + { + "epoch": 13.16, + "grad_norm": 2.980786085128784, + "learning_rate": 1.312269624403678e-05, + "loss": 1.0833, + "step": 43970 + }, + { + "epoch": 13.16, + "grad_norm": 3.550776958465576, + "learning_rate": 1.311752676240431e-05, + "loss": 0.792, + "step": 43975 + }, + { + "epoch": 13.16, + "grad_norm": 5.096564292907715, + "learning_rate": 1.3112357937028488e-05, + "loss": 0.8951, + "step": 43980 + }, + { + "epoch": 13.16, + "grad_norm": 3.9432437419891357, + "learning_rate": 1.3107189768194777e-05, + "loss": 1.0846, + "step": 43985 + }, + { + "epoch": 13.16, + "grad_norm": 3.766063928604126, + "learning_rate": 1.3102022256188618e-05, + "loss": 1.151, + "step": 43990 + }, + { + "epoch": 13.16, + "grad_norm": 1.3759208917617798, + "learning_rate": 1.30968554012954e-05, + "loss": 0.8187, + "step": 43995 + }, + { + "epoch": 13.16, + "grad_norm": 3.239544153213501, + "learning_rate": 1.3091689203800483e-05, + "loss": 0.8463, + "step": 44000 + }, + { + "epoch": 13.17, + "grad_norm": 4.143579959869385, + "learning_rate": 1.3086523663989197e-05, + "loss": 1.0706, + "step": 44005 + }, + { + "epoch": 13.17, + "grad_norm": 1.883518934249878, + "learning_rate": 1.3081358782146823e-05, + "loss": 0.9982, + "step": 44010 + }, + { + "epoch": 13.17, + "grad_norm": 1.332044243812561, + "learning_rate": 1.3076194558558617e-05, + "loss": 0.9293, + "step": 44015 + }, + { + "epoch": 13.17, + "grad_norm": 3.111307382583618, + "learning_rate": 1.3071030993509788e-05, + "loss": 0.9561, + "step": 44020 + }, + { + "epoch": 13.17, + "grad_norm": 2.8684184551239014, + "learning_rate": 1.3065868087285533e-05, + "loss": 0.9597, + "step": 44025 + }, + { + "epoch": 13.17, + "grad_norm": 2.7545359134674072, + "learning_rate": 1.3060705840170953e-05, + "loss": 0.7545, + "step": 44030 + }, + { + "epoch": 13.17, + "grad_norm": 2.23874831199646, + "learning_rate": 1.3055544252451202e-05, + "loss": 0.9002, + "step": 44035 + }, + { + "epoch": 13.18, + "grad_norm": 3.11095929145813, + "learning_rate": 1.3050383324411308e-05, + "loss": 0.9736, + "step": 44040 + }, + { + "epoch": 13.18, + "grad_norm": 2.461932897567749, + "learning_rate": 1.3045223056336339e-05, + "loss": 1.1343, + "step": 44045 + }, + { + "epoch": 13.18, + "grad_norm": 3.571146011352539, + "learning_rate": 1.3040063448511255e-05, + "loss": 0.9675, + "step": 44050 + }, + { + "epoch": 13.18, + "grad_norm": 3.9316980838775635, + "learning_rate": 1.3034904501221057e-05, + "loss": 0.9565, + "step": 44055 + }, + { + "epoch": 13.18, + "grad_norm": 5.21784782409668, + "learning_rate": 1.3029746214750636e-05, + "loss": 0.9655, + "step": 44060 + }, + { + "epoch": 13.18, + "grad_norm": 3.174137830734253, + "learning_rate": 1.3024588589384887e-05, + "loss": 1.0195, + "step": 44065 + }, + { + "epoch": 13.19, + "grad_norm": 3.539970636367798, + "learning_rate": 1.3019431625408663e-05, + "loss": 1.0647, + "step": 44070 + }, + { + "epoch": 13.19, + "grad_norm": 4.315314769744873, + "learning_rate": 1.3014275323106777e-05, + "loss": 1.1308, + "step": 44075 + }, + { + "epoch": 13.19, + "grad_norm": 2.9523892402648926, + "learning_rate": 1.300911968276401e-05, + "loss": 0.9636, + "step": 44080 + }, + { + "epoch": 13.19, + "grad_norm": 2.1555330753326416, + "learning_rate": 1.3003964704665096e-05, + "loss": 1.0895, + "step": 44085 + }, + { + "epoch": 13.19, + "grad_norm": 1.9076026678085327, + "learning_rate": 1.2998810389094743e-05, + "loss": 0.959, + "step": 44090 + }, + { + "epoch": 13.19, + "grad_norm": 2.8785440921783447, + "learning_rate": 1.2993656736337617e-05, + "loss": 0.9798, + "step": 44095 + }, + { + "epoch": 13.19, + "grad_norm": 7.201198101043701, + "learning_rate": 1.2988503746678348e-05, + "loss": 0.9449, + "step": 44100 + }, + { + "epoch": 13.2, + "grad_norm": 2.4313573837280273, + "learning_rate": 1.2983351420401535e-05, + "loss": 1.0374, + "step": 44105 + }, + { + "epoch": 13.2, + "grad_norm": 3.2586467266082764, + "learning_rate": 1.297819975779173e-05, + "loss": 0.9367, + "step": 44110 + }, + { + "epoch": 13.2, + "grad_norm": 2.9108598232269287, + "learning_rate": 1.297304875913346e-05, + "loss": 1.2829, + "step": 44115 + }, + { + "epoch": 13.2, + "grad_norm": 4.189149379730225, + "learning_rate": 1.2967898424711203e-05, + "loss": 0.8909, + "step": 44120 + }, + { + "epoch": 13.2, + "grad_norm": 2.829456090927124, + "learning_rate": 1.296274875480941e-05, + "loss": 1.0128, + "step": 44125 + }, + { + "epoch": 13.2, + "grad_norm": 1.993833303451538, + "learning_rate": 1.29575997497125e-05, + "loss": 0.9394, + "step": 44130 + }, + { + "epoch": 13.2, + "grad_norm": 1.8777223825454712, + "learning_rate": 1.2952451409704818e-05, + "loss": 0.9643, + "step": 44135 + }, + { + "epoch": 13.21, + "grad_norm": 2.4245407581329346, + "learning_rate": 1.2947303735070742e-05, + "loss": 0.9848, + "step": 44140 + }, + { + "epoch": 13.21, + "grad_norm": 3.382310628890991, + "learning_rate": 1.2942156726094534e-05, + "loss": 1.0159, + "step": 44145 + }, + { + "epoch": 13.21, + "grad_norm": 3.079493761062622, + "learning_rate": 1.29370103830605e-05, + "loss": 0.8951, + "step": 44150 + }, + { + "epoch": 13.21, + "grad_norm": 2.4276509284973145, + "learning_rate": 1.2931864706252828e-05, + "loss": 1.2272, + "step": 44155 + }, + { + "epoch": 13.21, + "grad_norm": 3.1140429973602295, + "learning_rate": 1.2926719695955727e-05, + "loss": 1.0597, + "step": 44160 + }, + { + "epoch": 13.21, + "grad_norm": 2.4898757934570312, + "learning_rate": 1.2921575352453347e-05, + "loss": 0.9542, + "step": 44165 + }, + { + "epoch": 13.22, + "grad_norm": 2.116917133331299, + "learning_rate": 1.2916431676029806e-05, + "loss": 0.9554, + "step": 44170 + }, + { + "epoch": 13.22, + "grad_norm": 3.3684239387512207, + "learning_rate": 1.2911288666969184e-05, + "loss": 1.057, + "step": 44175 + }, + { + "epoch": 13.22, + "grad_norm": 3.8344614505767822, + "learning_rate": 1.2906146325555522e-05, + "loss": 0.9452, + "step": 44180 + }, + { + "epoch": 13.22, + "grad_norm": 3.2199113368988037, + "learning_rate": 1.2901004652072826e-05, + "loss": 0.9491, + "step": 44185 + }, + { + "epoch": 13.22, + "grad_norm": 2.2717814445495605, + "learning_rate": 1.289586364680507e-05, + "loss": 1.0688, + "step": 44190 + }, + { + "epoch": 13.22, + "grad_norm": 2.989772319793701, + "learning_rate": 1.2890723310036181e-05, + "loss": 1.018, + "step": 44195 + }, + { + "epoch": 13.22, + "grad_norm": 9.16136360168457, + "learning_rate": 1.2885583642050058e-05, + "loss": 1.0984, + "step": 44200 + }, + { + "epoch": 13.23, + "grad_norm": 1.604578971862793, + "learning_rate": 1.2880444643130556e-05, + "loss": 1.0616, + "step": 44205 + }, + { + "epoch": 13.23, + "grad_norm": 2.7663040161132812, + "learning_rate": 1.2875306313561502e-05, + "loss": 0.9586, + "step": 44210 + }, + { + "epoch": 13.23, + "grad_norm": 1.7422765493392944, + "learning_rate": 1.2870168653626677e-05, + "loss": 1.0868, + "step": 44215 + }, + { + "epoch": 13.23, + "grad_norm": 3.091707944869995, + "learning_rate": 1.2865031663609827e-05, + "loss": 1.1116, + "step": 44220 + }, + { + "epoch": 13.23, + "grad_norm": 2.8757541179656982, + "learning_rate": 1.2859895343794676e-05, + "loss": 1.0587, + "step": 44225 + }, + { + "epoch": 13.23, + "grad_norm": 2.7807388305664062, + "learning_rate": 1.2854759694464868e-05, + "loss": 1.175, + "step": 44230 + }, + { + "epoch": 13.23, + "grad_norm": 1.115530252456665, + "learning_rate": 1.2849624715904074e-05, + "loss": 0.9452, + "step": 44235 + }, + { + "epoch": 13.24, + "grad_norm": 4.6550822257995605, + "learning_rate": 1.284449040839586e-05, + "loss": 1.0934, + "step": 44240 + }, + { + "epoch": 13.24, + "grad_norm": 2.822744369506836, + "learning_rate": 1.2839356772223823e-05, + "loss": 1.1822, + "step": 44245 + }, + { + "epoch": 13.24, + "grad_norm": 2.8846404552459717, + "learning_rate": 1.2834223807671453e-05, + "loss": 1.1075, + "step": 44250 + }, + { + "epoch": 13.24, + "grad_norm": 2.568181037902832, + "learning_rate": 1.2829091515022279e-05, + "loss": 0.876, + "step": 44255 + }, + { + "epoch": 13.24, + "grad_norm": 5.2756123542785645, + "learning_rate": 1.2823959894559718e-05, + "loss": 1.0383, + "step": 44260 + }, + { + "epoch": 13.24, + "grad_norm": 1.8959022760391235, + "learning_rate": 1.2818828946567194e-05, + "loss": 0.9319, + "step": 44265 + }, + { + "epoch": 13.25, + "grad_norm": 2.6469838619232178, + "learning_rate": 1.2813698671328084e-05, + "loss": 1.0376, + "step": 44270 + }, + { + "epoch": 13.25, + "grad_norm": 2.5506365299224854, + "learning_rate": 1.2808569069125734e-05, + "loss": 0.944, + "step": 44275 + }, + { + "epoch": 13.25, + "grad_norm": 2.260101079940796, + "learning_rate": 1.280344014024344e-05, + "loss": 0.8573, + "step": 44280 + }, + { + "epoch": 13.25, + "grad_norm": 1.153891921043396, + "learning_rate": 1.279831188496447e-05, + "loss": 1.0468, + "step": 44285 + }, + { + "epoch": 13.25, + "grad_norm": 3.153810977935791, + "learning_rate": 1.279318430357205e-05, + "loss": 0.854, + "step": 44290 + }, + { + "epoch": 13.25, + "grad_norm": 2.335554838180542, + "learning_rate": 1.2788057396349374e-05, + "loss": 0.9501, + "step": 44295 + }, + { + "epoch": 13.25, + "grad_norm": 1.3886281251907349, + "learning_rate": 1.2782931163579593e-05, + "loss": 1.0305, + "step": 44300 + }, + { + "epoch": 13.26, + "grad_norm": 1.6027575731277466, + "learning_rate": 1.2777805605545828e-05, + "loss": 0.9369, + "step": 44305 + }, + { + "epoch": 13.26, + "grad_norm": 2.810401201248169, + "learning_rate": 1.2772680722531153e-05, + "loss": 0.8592, + "step": 44310 + }, + { + "epoch": 13.26, + "grad_norm": 4.480473518371582, + "learning_rate": 1.2767556514818613e-05, + "loss": 0.9118, + "step": 44315 + }, + { + "epoch": 13.26, + "grad_norm": 5.290129661560059, + "learning_rate": 1.2762432982691219e-05, + "loss": 0.8724, + "step": 44320 + }, + { + "epoch": 13.26, + "grad_norm": 3.318096160888672, + "learning_rate": 1.2757310126431915e-05, + "loss": 0.8357, + "step": 44325 + }, + { + "epoch": 13.26, + "grad_norm": 2.25472354888916, + "learning_rate": 1.2752187946323663e-05, + "loss": 0.9452, + "step": 44330 + }, + { + "epoch": 13.26, + "grad_norm": 3.4436185359954834, + "learning_rate": 1.2747066442649324e-05, + "loss": 0.9833, + "step": 44335 + }, + { + "epoch": 13.27, + "grad_norm": 5.141889572143555, + "learning_rate": 1.2741945615691785e-05, + "loss": 1.0254, + "step": 44340 + }, + { + "epoch": 13.27, + "grad_norm": 2.958003520965576, + "learning_rate": 1.2736825465733832e-05, + "loss": 0.8983, + "step": 44345 + }, + { + "epoch": 13.27, + "grad_norm": 1.62663996219635, + "learning_rate": 1.2731705993058279e-05, + "loss": 0.9047, + "step": 44350 + }, + { + "epoch": 13.27, + "grad_norm": 1.6435695886611938, + "learning_rate": 1.2726587197947843e-05, + "loss": 0.9762, + "step": 44355 + }, + { + "epoch": 13.27, + "grad_norm": 2.0215165615081787, + "learning_rate": 1.2721469080685238e-05, + "loss": 1.058, + "step": 44360 + }, + { + "epoch": 13.27, + "grad_norm": 7.587679386138916, + "learning_rate": 1.2716351641553137e-05, + "loss": 0.797, + "step": 44365 + }, + { + "epoch": 13.27, + "grad_norm": 4.353124141693115, + "learning_rate": 1.271123488083416e-05, + "loss": 0.9351, + "step": 44370 + }, + { + "epoch": 13.28, + "grad_norm": 1.8056886196136475, + "learning_rate": 1.2706118798810912e-05, + "loss": 1.0511, + "step": 44375 + }, + { + "epoch": 13.28, + "grad_norm": 2.1814777851104736, + "learning_rate": 1.2701003395765942e-05, + "loss": 0.9481, + "step": 44380 + }, + { + "epoch": 13.28, + "grad_norm": 2.314023017883301, + "learning_rate": 1.2695888671981771e-05, + "loss": 1.0634, + "step": 44385 + }, + { + "epoch": 13.28, + "grad_norm": 2.8826498985290527, + "learning_rate": 1.2690774627740875e-05, + "loss": 0.9873, + "step": 44390 + }, + { + "epoch": 13.28, + "grad_norm": 2.8844268321990967, + "learning_rate": 1.2685661263325707e-05, + "loss": 0.8023, + "step": 44395 + }, + { + "epoch": 13.28, + "grad_norm": 2.0241379737854004, + "learning_rate": 1.2680548579018666e-05, + "loss": 0.9862, + "step": 44400 + }, + { + "epoch": 13.29, + "grad_norm": 2.5573320388793945, + "learning_rate": 1.2676458921440643e-05, + "loss": 0.8519, + "step": 44405 + }, + { + "epoch": 13.29, + "grad_norm": 2.713150978088379, + "learning_rate": 1.2671347462039773e-05, + "loss": 1.0685, + "step": 44410 + }, + { + "epoch": 13.29, + "grad_norm": 2.36755633354187, + "learning_rate": 1.2666236683537571e-05, + "loss": 0.8845, + "step": 44415 + }, + { + "epoch": 13.29, + "grad_norm": 3.59240984916687, + "learning_rate": 1.26611265862163e-05, + "loss": 1.0012, + "step": 44420 + }, + { + "epoch": 13.29, + "grad_norm": 1.345497965812683, + "learning_rate": 1.2656017170358175e-05, + "loss": 1.0763, + "step": 44425 + }, + { + "epoch": 13.29, + "grad_norm": 2.690333604812622, + "learning_rate": 1.2650908436245395e-05, + "loss": 1.0471, + "step": 44430 + }, + { + "epoch": 13.29, + "grad_norm": 3.9363012313842773, + "learning_rate": 1.2645800384160103e-05, + "loss": 1.0104, + "step": 44435 + }, + { + "epoch": 13.3, + "grad_norm": 3.2774622440338135, + "learning_rate": 1.2640693014384417e-05, + "loss": 1.052, + "step": 44440 + }, + { + "epoch": 13.3, + "grad_norm": 5.962934494018555, + "learning_rate": 1.2635586327200408e-05, + "loss": 1.044, + "step": 44445 + }, + { + "epoch": 13.3, + "grad_norm": 1.6138075590133667, + "learning_rate": 1.2630480322890114e-05, + "loss": 1.0042, + "step": 44450 + }, + { + "epoch": 13.3, + "grad_norm": 4.094041347503662, + "learning_rate": 1.2625375001735534e-05, + "loss": 0.9887, + "step": 44455 + }, + { + "epoch": 13.3, + "grad_norm": 10.620294570922852, + "learning_rate": 1.2620270364018633e-05, + "loss": 0.9852, + "step": 44460 + }, + { + "epoch": 13.3, + "grad_norm": 1.635864496231079, + "learning_rate": 1.2615166410021329e-05, + "loss": 1.0061, + "step": 44465 + }, + { + "epoch": 13.3, + "grad_norm": 5.53233528137207, + "learning_rate": 1.2610063140025519e-05, + "loss": 1.0371, + "step": 44470 + }, + { + "epoch": 13.31, + "grad_norm": 3.4139328002929688, + "learning_rate": 1.2604960554313027e-05, + "loss": 1.0421, + "step": 44475 + }, + { + "epoch": 13.31, + "grad_norm": 4.036562919616699, + "learning_rate": 1.2599858653165698e-05, + "loss": 0.8257, + "step": 44480 + }, + { + "epoch": 13.31, + "grad_norm": 5.966806411743164, + "learning_rate": 1.2594757436865265e-05, + "loss": 1.1459, + "step": 44485 + }, + { + "epoch": 13.31, + "grad_norm": 2.0404038429260254, + "learning_rate": 1.2589656905693503e-05, + "loss": 1.0175, + "step": 44490 + }, + { + "epoch": 13.31, + "grad_norm": 2.6337130069732666, + "learning_rate": 1.2584557059932076e-05, + "loss": 1.0467, + "step": 44495 + }, + { + "epoch": 13.31, + "grad_norm": 5.764756679534912, + "learning_rate": 1.2579457899862673e-05, + "loss": 1.0118, + "step": 44500 + }, + { + "epoch": 13.32, + "grad_norm": 8.07419490814209, + "learning_rate": 1.257435942576689e-05, + "loss": 1.1198, + "step": 44505 + }, + { + "epoch": 13.32, + "grad_norm": 2.342231273651123, + "learning_rate": 1.2569261637926322e-05, + "loss": 1.1913, + "step": 44510 + }, + { + "epoch": 13.32, + "grad_norm": 2.863327980041504, + "learning_rate": 1.2564164536622513e-05, + "loss": 0.9799, + "step": 44515 + }, + { + "epoch": 13.32, + "grad_norm": 2.152618885040283, + "learning_rate": 1.255906812213697e-05, + "loss": 0.9915, + "step": 44520 + }, + { + "epoch": 13.32, + "grad_norm": 1.8867714405059814, + "learning_rate": 1.2553972394751162e-05, + "loss": 0.9104, + "step": 44525 + }, + { + "epoch": 13.32, + "grad_norm": 4.3180460929870605, + "learning_rate": 1.2548877354746519e-05, + "loss": 0.8839, + "step": 44530 + }, + { + "epoch": 13.32, + "grad_norm": 3.7813620567321777, + "learning_rate": 1.254378300240444e-05, + "loss": 1.0885, + "step": 44535 + }, + { + "epoch": 13.33, + "grad_norm": 2.9510982036590576, + "learning_rate": 1.2538689338006282e-05, + "loss": 1.0409, + "step": 44540 + }, + { + "epoch": 13.33, + "grad_norm": 2.6716909408569336, + "learning_rate": 1.2533596361833355e-05, + "loss": 1.0121, + "step": 44545 + }, + { + "epoch": 13.33, + "grad_norm": 3.946298837661743, + "learning_rate": 1.2528504074166941e-05, + "loss": 0.9055, + "step": 44550 + }, + { + "epoch": 13.33, + "grad_norm": 2.191622018814087, + "learning_rate": 1.2523412475288288e-05, + "loss": 1.0652, + "step": 44555 + }, + { + "epoch": 13.33, + "grad_norm": 4.9477858543396, + "learning_rate": 1.2518321565478593e-05, + "loss": 0.8954, + "step": 44560 + }, + { + "epoch": 13.33, + "grad_norm": 1.2523143291473389, + "learning_rate": 1.2513231345019032e-05, + "loss": 1.0451, + "step": 44565 + }, + { + "epoch": 13.33, + "grad_norm": 2.840963840484619, + "learning_rate": 1.250814181419071e-05, + "loss": 0.8473, + "step": 44570 + }, + { + "epoch": 13.34, + "grad_norm": 2.3389129638671875, + "learning_rate": 1.2503052973274749e-05, + "loss": 1.0362, + "step": 44575 + }, + { + "epoch": 13.34, + "grad_norm": 2.131584882736206, + "learning_rate": 1.2497964822552161e-05, + "loss": 1.0984, + "step": 44580 + }, + { + "epoch": 13.34, + "grad_norm": 6.629162311553955, + "learning_rate": 1.2492877362304e-05, + "loss": 1.0674, + "step": 44585 + }, + { + "epoch": 13.34, + "grad_norm": 4.698531627655029, + "learning_rate": 1.2487790592811202e-05, + "loss": 0.8879, + "step": 44590 + }, + { + "epoch": 13.34, + "grad_norm": 3.542203187942505, + "learning_rate": 1.2482704514354746e-05, + "loss": 0.916, + "step": 44595 + }, + { + "epoch": 13.34, + "grad_norm": 6.20295524597168, + "learning_rate": 1.2477619127215498e-05, + "loss": 1.0514, + "step": 44600 + }, + { + "epoch": 13.35, + "grad_norm": 7.1144022941589355, + "learning_rate": 1.2472534431674327e-05, + "loss": 0.9958, + "step": 44605 + }, + { + "epoch": 13.35, + "grad_norm": 1.586234211921692, + "learning_rate": 1.2467450428012059e-05, + "loss": 0.9424, + "step": 44610 + }, + { + "epoch": 13.35, + "grad_norm": 3.302457332611084, + "learning_rate": 1.246236711650948e-05, + "loss": 0.9997, + "step": 44615 + }, + { + "epoch": 13.35, + "grad_norm": 3.5869815349578857, + "learning_rate": 1.245728449744733e-05, + "loss": 1.0039, + "step": 44620 + }, + { + "epoch": 13.35, + "grad_norm": 4.133788108825684, + "learning_rate": 1.245220257110632e-05, + "loss": 0.8164, + "step": 44625 + }, + { + "epoch": 13.35, + "grad_norm": 1.5097733736038208, + "learning_rate": 1.2447121337767121e-05, + "loss": 0.7921, + "step": 44630 + }, + { + "epoch": 13.35, + "grad_norm": 1.8934462070465088, + "learning_rate": 1.2442040797710358e-05, + "loss": 0.9432, + "step": 44635 + }, + { + "epoch": 13.36, + "grad_norm": 7.073252201080322, + "learning_rate": 1.2436960951216633e-05, + "loss": 0.9477, + "step": 44640 + }, + { + "epoch": 13.36, + "grad_norm": 2.9802677631378174, + "learning_rate": 1.2431881798566495e-05, + "loss": 0.9598, + "step": 44645 + }, + { + "epoch": 13.36, + "grad_norm": 1.559698224067688, + "learning_rate": 1.2426803340040461e-05, + "loss": 0.9934, + "step": 44650 + }, + { + "epoch": 13.36, + "grad_norm": 1.7870979309082031, + "learning_rate": 1.242172557591901e-05, + "loss": 1.0398, + "step": 44655 + }, + { + "epoch": 13.36, + "grad_norm": 4.0721755027771, + "learning_rate": 1.2416648506482589e-05, + "loss": 1.0168, + "step": 44660 + }, + { + "epoch": 13.36, + "grad_norm": 4.374423980712891, + "learning_rate": 1.2411572132011572e-05, + "loss": 0.9884, + "step": 44665 + }, + { + "epoch": 13.36, + "grad_norm": 3.6331615447998047, + "learning_rate": 1.2406496452786364e-05, + "loss": 0.9356, + "step": 44670 + }, + { + "epoch": 13.37, + "grad_norm": 1.985280156135559, + "learning_rate": 1.2401421469087246e-05, + "loss": 1.0527, + "step": 44675 + }, + { + "epoch": 13.37, + "grad_norm": 3.937107563018799, + "learning_rate": 1.2396347181194542e-05, + "loss": 1.0093, + "step": 44680 + }, + { + "epoch": 13.37, + "grad_norm": 1.2847137451171875, + "learning_rate": 1.2391273589388463e-05, + "loss": 1.065, + "step": 44685 + }, + { + "epoch": 13.37, + "grad_norm": 2.778695583343506, + "learning_rate": 1.238620069394926e-05, + "loss": 1.0256, + "step": 44690 + }, + { + "epoch": 13.37, + "grad_norm": 2.4905941486358643, + "learning_rate": 1.2381128495157071e-05, + "loss": 0.9014, + "step": 44695 + }, + { + "epoch": 13.37, + "grad_norm": 3.061601400375366, + "learning_rate": 1.2376056993292037e-05, + "loss": 0.9241, + "step": 44700 + }, + { + "epoch": 13.38, + "grad_norm": 5.156485557556152, + "learning_rate": 1.2370986188634253e-05, + "loss": 1.0604, + "step": 44705 + }, + { + "epoch": 13.38, + "grad_norm": 2.8037447929382324, + "learning_rate": 1.2365916081463777e-05, + "loss": 0.852, + "step": 44710 + }, + { + "epoch": 13.38, + "grad_norm": 2.281531572341919, + "learning_rate": 1.2360846672060622e-05, + "loss": 1.1074, + "step": 44715 + }, + { + "epoch": 13.38, + "grad_norm": 3.54512357711792, + "learning_rate": 1.2355777960704767e-05, + "loss": 0.9447, + "step": 44720 + }, + { + "epoch": 13.38, + "grad_norm": 2.5671660900115967, + "learning_rate": 1.2350709947676153e-05, + "loss": 1.0479, + "step": 44725 + }, + { + "epoch": 13.38, + "grad_norm": 1.9921631813049316, + "learning_rate": 1.2345642633254682e-05, + "loss": 0.9497, + "step": 44730 + }, + { + "epoch": 13.38, + "grad_norm": 1.9253196716308594, + "learning_rate": 1.2340576017720212e-05, + "loss": 1.0235, + "step": 44735 + }, + { + "epoch": 13.39, + "grad_norm": 5.337843418121338, + "learning_rate": 1.233551010135257e-05, + "loss": 1.1129, + "step": 44740 + }, + { + "epoch": 13.39, + "grad_norm": 3.605689764022827, + "learning_rate": 1.2330444884431544e-05, + "loss": 1.0846, + "step": 44745 + }, + { + "epoch": 13.39, + "grad_norm": 2.8609673976898193, + "learning_rate": 1.2325380367236875e-05, + "loss": 1.0635, + "step": 44750 + }, + { + "epoch": 13.39, + "grad_norm": 2.364123582839966, + "learning_rate": 1.2320316550048287e-05, + "loss": 0.951, + "step": 44755 + }, + { + "epoch": 13.39, + "grad_norm": 7.171827793121338, + "learning_rate": 1.2315253433145412e-05, + "loss": 0.8352, + "step": 44760 + }, + { + "epoch": 13.39, + "grad_norm": 3.1451687812805176, + "learning_rate": 1.2310191016807925e-05, + "loss": 0.9133, + "step": 44765 + }, + { + "epoch": 13.39, + "grad_norm": 2.174330711364746, + "learning_rate": 1.2305129301315382e-05, + "loss": 1.0346, + "step": 44770 + }, + { + "epoch": 13.4, + "grad_norm": 2.454254150390625, + "learning_rate": 1.230006828694737e-05, + "loss": 1.0938, + "step": 44775 + }, + { + "epoch": 13.4, + "grad_norm": 2.6113619804382324, + "learning_rate": 1.2295007973983366e-05, + "loss": 0.9711, + "step": 44780 + }, + { + "epoch": 13.4, + "grad_norm": 2.4465274810791016, + "learning_rate": 1.2289948362702883e-05, + "loss": 1.0361, + "step": 44785 + }, + { + "epoch": 13.4, + "grad_norm": 2.1108946800231934, + "learning_rate": 1.2284889453385334e-05, + "loss": 1.0811, + "step": 44790 + }, + { + "epoch": 13.4, + "grad_norm": 3.172182083129883, + "learning_rate": 1.2279831246310122e-05, + "loss": 1.0407, + "step": 44795 + }, + { + "epoch": 13.4, + "grad_norm": 1.988529086112976, + "learning_rate": 1.2274773741756607e-05, + "loss": 1.0839, + "step": 44800 + }, + { + "epoch": 13.41, + "grad_norm": 2.424704074859619, + "learning_rate": 1.2269716940004103e-05, + "loss": 1.0702, + "step": 44805 + }, + { + "epoch": 13.41, + "grad_norm": 1.8845196962356567, + "learning_rate": 1.2264660841331921e-05, + "loss": 0.9947, + "step": 44810 + }, + { + "epoch": 13.41, + "grad_norm": 1.3340157270431519, + "learning_rate": 1.2259605446019261e-05, + "loss": 1.1926, + "step": 44815 + }, + { + "epoch": 13.41, + "grad_norm": 1.2041699886322021, + "learning_rate": 1.225455075434537e-05, + "loss": 1.128, + "step": 44820 + }, + { + "epoch": 13.41, + "grad_norm": 2.0719640254974365, + "learning_rate": 1.2249496766589382e-05, + "loss": 1.0323, + "step": 44825 + }, + { + "epoch": 13.41, + "grad_norm": 2.7494218349456787, + "learning_rate": 1.2244443483030438e-05, + "loss": 1.1221, + "step": 44830 + }, + { + "epoch": 13.41, + "grad_norm": 1.4055849313735962, + "learning_rate": 1.2239390903947618e-05, + "loss": 1.0946, + "step": 44835 + }, + { + "epoch": 13.42, + "grad_norm": 3.2642478942871094, + "learning_rate": 1.2234339029619974e-05, + "loss": 1.0766, + "step": 44840 + }, + { + "epoch": 13.42, + "grad_norm": 1.8189183473587036, + "learning_rate": 1.2229287860326519e-05, + "loss": 0.8681, + "step": 44845 + }, + { + "epoch": 13.42, + "grad_norm": 2.4354348182678223, + "learning_rate": 1.222423739634622e-05, + "loss": 0.722, + "step": 44850 + }, + { + "epoch": 13.42, + "grad_norm": 1.813585638999939, + "learning_rate": 1.2219187637958007e-05, + "loss": 1.1683, + "step": 44855 + }, + { + "epoch": 13.42, + "grad_norm": 3.655863046646118, + "learning_rate": 1.221413858544079e-05, + "loss": 1.067, + "step": 44860 + }, + { + "epoch": 13.42, + "grad_norm": 8.485624313354492, + "learning_rate": 1.2209090239073386e-05, + "loss": 1.0451, + "step": 44865 + }, + { + "epoch": 13.42, + "grad_norm": 4.247042655944824, + "learning_rate": 1.2204042599134654e-05, + "loss": 1.1318, + "step": 44870 + }, + { + "epoch": 13.43, + "grad_norm": 3.260227680206299, + "learning_rate": 1.219899566590333e-05, + "loss": 0.9493, + "step": 44875 + }, + { + "epoch": 13.43, + "grad_norm": 2.963711977005005, + "learning_rate": 1.2193949439658187e-05, + "loss": 0.919, + "step": 44880 + }, + { + "epoch": 13.43, + "grad_norm": 3.6867597103118896, + "learning_rate": 1.2188903920677896e-05, + "loss": 1.0995, + "step": 44885 + }, + { + "epoch": 13.43, + "grad_norm": 2.1366159915924072, + "learning_rate": 1.2183859109241116e-05, + "loss": 0.9898, + "step": 44890 + }, + { + "epoch": 13.43, + "grad_norm": 4.271877288818359, + "learning_rate": 1.2178815005626493e-05, + "loss": 1.0278, + "step": 44895 + }, + { + "epoch": 13.43, + "grad_norm": 7.151296615600586, + "learning_rate": 1.2173771610112574e-05, + "loss": 1.0398, + "step": 44900 + }, + { + "epoch": 13.44, + "grad_norm": 2.853313684463501, + "learning_rate": 1.2168728922977934e-05, + "loss": 1.0188, + "step": 44905 + }, + { + "epoch": 13.44, + "grad_norm": 1.6056421995162964, + "learning_rate": 1.216368694450104e-05, + "loss": 0.8771, + "step": 44910 + }, + { + "epoch": 13.44, + "grad_norm": 1.6957907676696777, + "learning_rate": 1.2158645674960392e-05, + "loss": 0.9153, + "step": 44915 + }, + { + "epoch": 13.44, + "grad_norm": 13.22221851348877, + "learning_rate": 1.2153605114634387e-05, + "loss": 1.1427, + "step": 44920 + }, + { + "epoch": 13.44, + "grad_norm": 4.49719762802124, + "learning_rate": 1.2148565263801417e-05, + "loss": 1.0046, + "step": 44925 + }, + { + "epoch": 13.44, + "grad_norm": 8.903334617614746, + "learning_rate": 1.2143526122739832e-05, + "loss": 0.9307, + "step": 44930 + }, + { + "epoch": 13.44, + "grad_norm": 2.33970046043396, + "learning_rate": 1.2138487691727932e-05, + "loss": 1.0706, + "step": 44935 + }, + { + "epoch": 13.45, + "grad_norm": 2.2916619777679443, + "learning_rate": 1.2133449971043992e-05, + "loss": 1.002, + "step": 44940 + }, + { + "epoch": 13.45, + "grad_norm": 3.4537885189056396, + "learning_rate": 1.2128412960966231e-05, + "loss": 1.0616, + "step": 44945 + }, + { + "epoch": 13.45, + "grad_norm": 2.979152202606201, + "learning_rate": 1.2123376661772848e-05, + "loss": 0.9152, + "step": 44950 + }, + { + "epoch": 13.45, + "grad_norm": 1.5841301679611206, + "learning_rate": 1.2118341073741993e-05, + "loss": 0.9443, + "step": 44955 + }, + { + "epoch": 13.45, + "grad_norm": 8.155377388000488, + "learning_rate": 1.2113306197151754e-05, + "loss": 0.9218, + "step": 44960 + }, + { + "epoch": 13.45, + "grad_norm": 3.3958740234375, + "learning_rate": 1.2108272032280226e-05, + "loss": 0.9962, + "step": 44965 + }, + { + "epoch": 13.45, + "grad_norm": 3.104916572570801, + "learning_rate": 1.2103238579405437e-05, + "loss": 1.0308, + "step": 44970 + }, + { + "epoch": 13.46, + "grad_norm": 1.5764812231063843, + "learning_rate": 1.2098205838805373e-05, + "loss": 0.9052, + "step": 44975 + }, + { + "epoch": 13.46, + "grad_norm": 1.8205498456954956, + "learning_rate": 1.2093173810757991e-05, + "loss": 1.1331, + "step": 44980 + }, + { + "epoch": 13.46, + "grad_norm": 3.6816673278808594, + "learning_rate": 1.2088142495541205e-05, + "loss": 0.9301, + "step": 44985 + }, + { + "epoch": 13.46, + "grad_norm": 3.829503059387207, + "learning_rate": 1.2083111893432893e-05, + "loss": 1.0146, + "step": 44990 + }, + { + "epoch": 13.46, + "grad_norm": 2.2091660499572754, + "learning_rate": 1.207808200471087e-05, + "loss": 0.9556, + "step": 44995 + }, + { + "epoch": 13.46, + "grad_norm": 4.129636287689209, + "learning_rate": 1.2074058607558082e-05, + "loss": 1.0866, + "step": 45000 + }, + { + "epoch": 13.46, + "grad_norm": 3.674562454223633, + "learning_rate": 1.206903000363143e-05, + "loss": 1.1933, + "step": 45005 + }, + { + "epoch": 13.47, + "grad_norm": 3.4031946659088135, + "learning_rate": 1.2064002113868822e-05, + "loss": 1.0472, + "step": 45010 + }, + { + "epoch": 13.47, + "grad_norm": 4.102842330932617, + "learning_rate": 1.2058974938547913e-05, + "loss": 1.0977, + "step": 45015 + }, + { + "epoch": 13.47, + "grad_norm": 1.0470472574234009, + "learning_rate": 1.205394847794638e-05, + "loss": 1.0491, + "step": 45020 + }, + { + "epoch": 13.47, + "grad_norm": 4.4615702629089355, + "learning_rate": 1.2048922732341802e-05, + "loss": 1.0362, + "step": 45025 + }, + { + "epoch": 13.47, + "grad_norm": 1.8083804845809937, + "learning_rate": 1.2043897702011777e-05, + "loss": 0.9582, + "step": 45030 + }, + { + "epoch": 13.47, + "grad_norm": 1.3587656021118164, + "learning_rate": 1.2038873387233806e-05, + "loss": 1.0904, + "step": 45035 + }, + { + "epoch": 13.48, + "grad_norm": 9.736397743225098, + "learning_rate": 1.2033849788285387e-05, + "loss": 1.0481, + "step": 45040 + }, + { + "epoch": 13.48, + "grad_norm": 1.1890517473220825, + "learning_rate": 1.2028826905443965e-05, + "loss": 1.0538, + "step": 45045 + }, + { + "epoch": 13.48, + "grad_norm": 1.4753776788711548, + "learning_rate": 1.2023804738986948e-05, + "loss": 1.0234, + "step": 45050 + }, + { + "epoch": 13.48, + "grad_norm": 2.6727852821350098, + "learning_rate": 1.2018783289191713e-05, + "loss": 1.0864, + "step": 45055 + }, + { + "epoch": 13.48, + "grad_norm": 3.8187108039855957, + "learning_rate": 1.2013762556335578e-05, + "loss": 1.0876, + "step": 45060 + }, + { + "epoch": 13.48, + "grad_norm": 2.2639684677124023, + "learning_rate": 1.2008742540695842e-05, + "loss": 1.1417, + "step": 45065 + }, + { + "epoch": 13.48, + "grad_norm": 1.7074878215789795, + "learning_rate": 1.2003723242549749e-05, + "loss": 0.9508, + "step": 45070 + }, + { + "epoch": 13.49, + "grad_norm": 3.0342020988464355, + "learning_rate": 1.1998704662174515e-05, + "loss": 0.8973, + "step": 45075 + }, + { + "epoch": 13.49, + "grad_norm": 2.464416980743408, + "learning_rate": 1.1993686799847307e-05, + "loss": 1.093, + "step": 45080 + }, + { + "epoch": 13.49, + "grad_norm": 2.7546801567077637, + "learning_rate": 1.1988669655845259e-05, + "loss": 1.1162, + "step": 45085 + }, + { + "epoch": 13.49, + "grad_norm": 7.984701156616211, + "learning_rate": 1.198365323044546e-05, + "loss": 0.9271, + "step": 45090 + }, + { + "epoch": 13.49, + "grad_norm": 2.5185933113098145, + "learning_rate": 1.1978637523924962e-05, + "loss": 0.9088, + "step": 45095 + }, + { + "epoch": 13.49, + "grad_norm": 5.400019645690918, + "learning_rate": 1.1973622536560783e-05, + "loss": 1.0487, + "step": 45100 + }, + { + "epoch": 13.49, + "grad_norm": 1.2262287139892578, + "learning_rate": 1.1968608268629897e-05, + "loss": 0.9563, + "step": 45105 + }, + { + "epoch": 13.5, + "grad_norm": 1.9510408639907837, + "learning_rate": 1.1963594720409213e-05, + "loss": 0.9829, + "step": 45110 + }, + { + "epoch": 13.5, + "grad_norm": 3.9547598361968994, + "learning_rate": 1.1958581892175664e-05, + "loss": 1.0245, + "step": 45115 + }, + { + "epoch": 13.5, + "grad_norm": 2.1796655654907227, + "learning_rate": 1.1953569784206056e-05, + "loss": 0.8948, + "step": 45120 + }, + { + "epoch": 13.5, + "grad_norm": 1.8676890134811401, + "learning_rate": 1.194855839677725e-05, + "loss": 0.9237, + "step": 45125 + }, + { + "epoch": 13.5, + "grad_norm": 2.9657278060913086, + "learning_rate": 1.194354773016599e-05, + "loss": 0.9703, + "step": 45130 + }, + { + "epoch": 13.5, + "grad_norm": 3.7803890705108643, + "learning_rate": 1.1938537784649015e-05, + "loss": 0.941, + "step": 45135 + }, + { + "epoch": 13.51, + "grad_norm": 4.787195205688477, + "learning_rate": 1.1933528560503021e-05, + "loss": 0.854, + "step": 45140 + }, + { + "epoch": 13.51, + "grad_norm": 2.3021395206451416, + "learning_rate": 1.1928520058004666e-05, + "loss": 0.9937, + "step": 45145 + }, + { + "epoch": 13.51, + "grad_norm": 3.430438995361328, + "learning_rate": 1.192351227743056e-05, + "loss": 1.0839, + "step": 45150 + }, + { + "epoch": 13.51, + "grad_norm": 3.5404064655303955, + "learning_rate": 1.1918505219057265e-05, + "loss": 1.0443, + "step": 45155 + }, + { + "epoch": 13.51, + "grad_norm": 1.529388427734375, + "learning_rate": 1.1913498883161354e-05, + "loss": 1.0485, + "step": 45160 + }, + { + "epoch": 13.51, + "grad_norm": 3.046905755996704, + "learning_rate": 1.1908493270019283e-05, + "loss": 1.1611, + "step": 45165 + }, + { + "epoch": 13.51, + "grad_norm": 2.4746837615966797, + "learning_rate": 1.1903488379907524e-05, + "loss": 0.9898, + "step": 45170 + }, + { + "epoch": 13.52, + "grad_norm": 3.2525274753570557, + "learning_rate": 1.1898484213102487e-05, + "loss": 1.036, + "step": 45175 + }, + { + "epoch": 13.52, + "grad_norm": 2.007542371749878, + "learning_rate": 1.1893480769880549e-05, + "loss": 1.0232, + "step": 45180 + }, + { + "epoch": 13.52, + "grad_norm": 3.283017873764038, + "learning_rate": 1.1888478050518046e-05, + "loss": 1.0205, + "step": 45185 + }, + { + "epoch": 13.52, + "grad_norm": 3.2330360412597656, + "learning_rate": 1.188347605529127e-05, + "loss": 0.9873, + "step": 45190 + }, + { + "epoch": 13.52, + "grad_norm": 3.1978647708892822, + "learning_rate": 1.1878474784476478e-05, + "loss": 1.0942, + "step": 45195 + }, + { + "epoch": 13.52, + "grad_norm": 3.234929084777832, + "learning_rate": 1.1873474238349894e-05, + "loss": 1.181, + "step": 45200 + }, + { + "epoch": 13.52, + "grad_norm": 3.867767333984375, + "learning_rate": 1.1868474417187666e-05, + "loss": 0.9687, + "step": 45205 + }, + { + "epoch": 13.53, + "grad_norm": 2.2236828804016113, + "learning_rate": 1.1863475321265962e-05, + "loss": 1.0287, + "step": 45210 + }, + { + "epoch": 13.53, + "grad_norm": 3.1465940475463867, + "learning_rate": 1.1858476950860848e-05, + "loss": 0.9651, + "step": 45215 + }, + { + "epoch": 13.53, + "grad_norm": 2.4534783363342285, + "learning_rate": 1.1853479306248404e-05, + "loss": 1.0121, + "step": 45220 + }, + { + "epoch": 13.53, + "grad_norm": 6.127681732177734, + "learning_rate": 1.1848482387704621e-05, + "loss": 1.1017, + "step": 45225 + }, + { + "epoch": 13.53, + "grad_norm": 2.688446283340454, + "learning_rate": 1.184348619550549e-05, + "loss": 1.0992, + "step": 45230 + }, + { + "epoch": 13.53, + "grad_norm": 2.3378570079803467, + "learning_rate": 1.1838490729926952e-05, + "loss": 1.0143, + "step": 45235 + }, + { + "epoch": 13.54, + "grad_norm": 2.69551944732666, + "learning_rate": 1.1833495991244873e-05, + "loss": 0.999, + "step": 45240 + }, + { + "epoch": 13.54, + "grad_norm": 3.029266357421875, + "learning_rate": 1.182850197973514e-05, + "loss": 1.0098, + "step": 45245 + }, + { + "epoch": 13.54, + "grad_norm": 8.008578300476074, + "learning_rate": 1.1823508695673535e-05, + "loss": 1.0435, + "step": 45250 + }, + { + "epoch": 13.54, + "grad_norm": 1.313415765762329, + "learning_rate": 1.1818516139335867e-05, + "loss": 1.2993, + "step": 45255 + }, + { + "epoch": 13.54, + "grad_norm": 4.881174087524414, + "learning_rate": 1.1813524310997842e-05, + "loss": 0.9305, + "step": 45260 + }, + { + "epoch": 13.54, + "grad_norm": 2.0278892517089844, + "learning_rate": 1.1808533210935164e-05, + "loss": 0.9142, + "step": 45265 + }, + { + "epoch": 13.54, + "grad_norm": 5.330128192901611, + "learning_rate": 1.180354283942348e-05, + "loss": 1.1023, + "step": 45270 + }, + { + "epoch": 13.55, + "grad_norm": 1.4603615999221802, + "learning_rate": 1.1798553196738415e-05, + "loss": 1.04, + "step": 45275 + }, + { + "epoch": 13.55, + "grad_norm": 2.043243408203125, + "learning_rate": 1.1793564283155532e-05, + "loss": 0.9853, + "step": 45280 + }, + { + "epoch": 13.55, + "grad_norm": 3.6421613693237305, + "learning_rate": 1.1788576098950365e-05, + "loss": 0.9557, + "step": 45285 + }, + { + "epoch": 13.55, + "grad_norm": 2.091878890991211, + "learning_rate": 1.1783588644398408e-05, + "loss": 1.1519, + "step": 45290 + }, + { + "epoch": 13.55, + "grad_norm": 1.9971786737442017, + "learning_rate": 1.1778601919775122e-05, + "loss": 1.0033, + "step": 45295 + }, + { + "epoch": 13.55, + "grad_norm": 0.9933714270591736, + "learning_rate": 1.1773615925355891e-05, + "loss": 0.9982, + "step": 45300 + }, + { + "epoch": 13.55, + "grad_norm": 5.131785869598389, + "learning_rate": 1.1768630661416114e-05, + "loss": 1.0143, + "step": 45305 + }, + { + "epoch": 13.56, + "grad_norm": 2.784761905670166, + "learning_rate": 1.176364612823111e-05, + "loss": 0.8749, + "step": 45310 + }, + { + "epoch": 13.56, + "grad_norm": 1.629541039466858, + "learning_rate": 1.1758662326076176e-05, + "loss": 1.1158, + "step": 45315 + }, + { + "epoch": 13.56, + "grad_norm": 3.978738784790039, + "learning_rate": 1.1753679255226555e-05, + "loss": 0.7877, + "step": 45320 + }, + { + "epoch": 13.56, + "grad_norm": 2.3555569648742676, + "learning_rate": 1.174869691595746e-05, + "loss": 1.0907, + "step": 45325 + }, + { + "epoch": 13.56, + "grad_norm": 2.2074787616729736, + "learning_rate": 1.174371530854407e-05, + "loss": 0.9258, + "step": 45330 + }, + { + "epoch": 13.56, + "grad_norm": 3.912524461746216, + "learning_rate": 1.1738734433261486e-05, + "loss": 0.9214, + "step": 45335 + }, + { + "epoch": 13.57, + "grad_norm": 3.4181976318359375, + "learning_rate": 1.1733754290384833e-05, + "loss": 0.9704, + "step": 45340 + }, + { + "epoch": 13.57, + "grad_norm": 2.531336545944214, + "learning_rate": 1.1728774880189123e-05, + "loss": 1.087, + "step": 45345 + }, + { + "epoch": 13.57, + "grad_norm": 3.3904941082000732, + "learning_rate": 1.17237962029494e-05, + "loss": 0.9672, + "step": 45350 + }, + { + "epoch": 13.57, + "grad_norm": 4.165085792541504, + "learning_rate": 1.1718818258940596e-05, + "loss": 0.9494, + "step": 45355 + }, + { + "epoch": 13.57, + "grad_norm": 3.3338541984558105, + "learning_rate": 1.171384104843767e-05, + "loss": 1.0561, + "step": 45360 + }, + { + "epoch": 13.57, + "grad_norm": 2.241774797439575, + "learning_rate": 1.1708864571715486e-05, + "loss": 1.0043, + "step": 45365 + }, + { + "epoch": 13.57, + "grad_norm": 3.3441967964172363, + "learning_rate": 1.1703888829048895e-05, + "loss": 1.0327, + "step": 45370 + }, + { + "epoch": 13.58, + "grad_norm": 3.085231065750122, + "learning_rate": 1.1698913820712704e-05, + "loss": 0.7949, + "step": 45375 + }, + { + "epoch": 13.58, + "grad_norm": 3.700368881225586, + "learning_rate": 1.1693939546981678e-05, + "loss": 1.0952, + "step": 45380 + }, + { + "epoch": 13.58, + "grad_norm": 3.532536745071411, + "learning_rate": 1.1688966008130539e-05, + "loss": 0.9898, + "step": 45385 + }, + { + "epoch": 13.58, + "grad_norm": 3.1939263343811035, + "learning_rate": 1.1683993204433971e-05, + "loss": 1.0335, + "step": 45390 + }, + { + "epoch": 13.58, + "grad_norm": 2.641188144683838, + "learning_rate": 1.1679021136166618e-05, + "loss": 0.9966, + "step": 45395 + }, + { + "epoch": 13.58, + "grad_norm": 12.577829360961914, + "learning_rate": 1.1674049803603079e-05, + "loss": 0.9728, + "step": 45400 + }, + { + "epoch": 13.58, + "grad_norm": 5.408492565155029, + "learning_rate": 1.166907920701792e-05, + "loss": 1.183, + "step": 45405 + }, + { + "epoch": 13.59, + "grad_norm": 15.173827171325684, + "learning_rate": 1.166410934668566e-05, + "loss": 1.0365, + "step": 45410 + }, + { + "epoch": 13.59, + "grad_norm": 1.8098371028900146, + "learning_rate": 1.1659140222880777e-05, + "loss": 1.0531, + "step": 45415 + }, + { + "epoch": 13.59, + "grad_norm": 3.362779378890991, + "learning_rate": 1.1654171835877715e-05, + "loss": 1.0122, + "step": 45420 + }, + { + "epoch": 13.59, + "grad_norm": 4.117199420928955, + "learning_rate": 1.1649204185950877e-05, + "loss": 0.7663, + "step": 45425 + }, + { + "epoch": 13.59, + "grad_norm": 4.3105621337890625, + "learning_rate": 1.1644237273374595e-05, + "loss": 1.0172, + "step": 45430 + }, + { + "epoch": 13.59, + "grad_norm": 2.2465403079986572, + "learning_rate": 1.1639271098423227e-05, + "loss": 0.9232, + "step": 45435 + }, + { + "epoch": 13.6, + "grad_norm": 1.8414210081100464, + "learning_rate": 1.163430566137101e-05, + "loss": 0.915, + "step": 45440 + }, + { + "epoch": 13.6, + "grad_norm": 2.6317269802093506, + "learning_rate": 1.1629340962492217e-05, + "loss": 0.8813, + "step": 45445 + }, + { + "epoch": 13.6, + "grad_norm": 2.4604814052581787, + "learning_rate": 1.1624377002061004e-05, + "loss": 1.111, + "step": 45450 + }, + { + "epoch": 13.6, + "grad_norm": 2.286057710647583, + "learning_rate": 1.1619413780351566e-05, + "loss": 1.0006, + "step": 45455 + }, + { + "epoch": 13.6, + "grad_norm": 3.36887264251709, + "learning_rate": 1.1614451297637985e-05, + "loss": 1.1177, + "step": 45460 + }, + { + "epoch": 13.6, + "grad_norm": 3.5638434886932373, + "learning_rate": 1.1609489554194348e-05, + "loss": 0.9992, + "step": 45465 + }, + { + "epoch": 13.6, + "grad_norm": 1.2167627811431885, + "learning_rate": 1.1604528550294685e-05, + "loss": 1.1973, + "step": 45470 + }, + { + "epoch": 13.61, + "grad_norm": 2.9576528072357178, + "learning_rate": 1.1599568286212984e-05, + "loss": 0.9278, + "step": 45475 + }, + { + "epoch": 13.61, + "grad_norm": 3.1728708744049072, + "learning_rate": 1.15946087622232e-05, + "loss": 1.1047, + "step": 45480 + }, + { + "epoch": 13.61, + "grad_norm": 5.425692558288574, + "learning_rate": 1.1589649978599237e-05, + "loss": 0.9143, + "step": 45485 + }, + { + "epoch": 13.61, + "grad_norm": 2.459667921066284, + "learning_rate": 1.158469193561497e-05, + "loss": 0.8807, + "step": 45490 + }, + { + "epoch": 13.61, + "grad_norm": 2.123094320297241, + "learning_rate": 1.157973463354422e-05, + "loss": 0.945, + "step": 45495 + }, + { + "epoch": 13.61, + "grad_norm": 2.2343719005584717, + "learning_rate": 1.1574778072660778e-05, + "loss": 1.0458, + "step": 45500 + }, + { + "epoch": 13.61, + "grad_norm": 6.470086097717285, + "learning_rate": 1.1569822253238385e-05, + "loss": 1.0621, + "step": 45505 + }, + { + "epoch": 13.62, + "grad_norm": 2.044508934020996, + "learning_rate": 1.1564867175550753e-05, + "loss": 1.0052, + "step": 45510 + }, + { + "epoch": 13.62, + "grad_norm": 2.4018311500549316, + "learning_rate": 1.1559912839871542e-05, + "loss": 1.0759, + "step": 45515 + }, + { + "epoch": 13.62, + "grad_norm": 3.800114154815674, + "learning_rate": 1.1554959246474382e-05, + "loss": 0.9216, + "step": 45520 + }, + { + "epoch": 13.62, + "grad_norm": 2.723524808883667, + "learning_rate": 1.155000639563283e-05, + "loss": 0.9357, + "step": 45525 + }, + { + "epoch": 13.62, + "grad_norm": 4.594392776489258, + "learning_rate": 1.154505428762046e-05, + "loss": 1.1448, + "step": 45530 + }, + { + "epoch": 13.62, + "grad_norm": 2.8049912452697754, + "learning_rate": 1.154010292271074e-05, + "loss": 1.1228, + "step": 45535 + }, + { + "epoch": 13.63, + "grad_norm": 2.90828013420105, + "learning_rate": 1.1535152301177165e-05, + "loss": 1.131, + "step": 45540 + }, + { + "epoch": 13.63, + "grad_norm": 2.163752317428589, + "learning_rate": 1.1530202423293113e-05, + "loss": 0.987, + "step": 45545 + }, + { + "epoch": 13.63, + "grad_norm": 2.9470038414001465, + "learning_rate": 1.1525253289331997e-05, + "loss": 0.8447, + "step": 45550 + }, + { + "epoch": 13.63, + "grad_norm": 5.131521224975586, + "learning_rate": 1.1520304899567128e-05, + "loss": 1.112, + "step": 45555 + }, + { + "epoch": 13.63, + "grad_norm": 2.858603000640869, + "learning_rate": 1.1515357254271805e-05, + "loss": 1.0796, + "step": 45560 + }, + { + "epoch": 13.63, + "grad_norm": 5.054248809814453, + "learning_rate": 1.1510410353719287e-05, + "loss": 0.8444, + "step": 45565 + }, + { + "epoch": 13.63, + "grad_norm": 1.9235126972198486, + "learning_rate": 1.1505464198182784e-05, + "loss": 1.1076, + "step": 45570 + }, + { + "epoch": 13.64, + "grad_norm": 3.045146942138672, + "learning_rate": 1.1500518787935466e-05, + "loss": 0.9475, + "step": 45575 + }, + { + "epoch": 13.64, + "grad_norm": 4.526063442230225, + "learning_rate": 1.1495574123250462e-05, + "loss": 0.8281, + "step": 45580 + }, + { + "epoch": 13.64, + "grad_norm": 2.535439968109131, + "learning_rate": 1.1490630204400863e-05, + "loss": 1.0387, + "step": 45585 + }, + { + "epoch": 13.64, + "grad_norm": 3.0743353366851807, + "learning_rate": 1.1485687031659717e-05, + "loss": 1.0645, + "step": 45590 + }, + { + "epoch": 13.64, + "grad_norm": 3.6751158237457275, + "learning_rate": 1.1480744605300026e-05, + "loss": 0.9101, + "step": 45595 + }, + { + "epoch": 13.64, + "grad_norm": 3.3130288124084473, + "learning_rate": 1.1475802925594762e-05, + "loss": 0.8764, + "step": 45600 + }, + { + "epoch": 13.64, + "grad_norm": 8.780059814453125, + "learning_rate": 1.1470861992816839e-05, + "loss": 1.0427, + "step": 45605 + }, + { + "epoch": 13.65, + "grad_norm": 3.3624203205108643, + "learning_rate": 1.146592180723915e-05, + "loss": 0.9371, + "step": 45610 + }, + { + "epoch": 13.65, + "grad_norm": 2.7508459091186523, + "learning_rate": 1.1460982369134541e-05, + "loss": 0.8532, + "step": 45615 + }, + { + "epoch": 13.65, + "grad_norm": 1.148845911026001, + "learning_rate": 1.1456043678775783e-05, + "loss": 0.992, + "step": 45620 + }, + { + "epoch": 13.65, + "grad_norm": 3.8979814052581787, + "learning_rate": 1.1451105736435672e-05, + "loss": 1.1124, + "step": 45625 + }, + { + "epoch": 13.65, + "grad_norm": 6.8672990798950195, + "learning_rate": 1.144616854238689e-05, + "loss": 0.8867, + "step": 45630 + }, + { + "epoch": 13.65, + "grad_norm": 3.3490543365478516, + "learning_rate": 1.1441232096902149e-05, + "loss": 1.044, + "step": 45635 + }, + { + "epoch": 13.65, + "grad_norm": 2.1857025623321533, + "learning_rate": 1.1436296400254049e-05, + "loss": 0.8137, + "step": 45640 + }, + { + "epoch": 13.66, + "grad_norm": 3.0062873363494873, + "learning_rate": 1.1431361452715218e-05, + "loss": 0.9847, + "step": 45645 + }, + { + "epoch": 13.66, + "grad_norm": 3.707160472869873, + "learning_rate": 1.1426427254558182e-05, + "loss": 1.1328, + "step": 45650 + }, + { + "epoch": 13.66, + "grad_norm": 3.175178289413452, + "learning_rate": 1.1421493806055455e-05, + "loss": 1.0651, + "step": 45655 + }, + { + "epoch": 13.66, + "grad_norm": 4.4922776222229, + "learning_rate": 1.1416561107479514e-05, + "loss": 1.1197, + "step": 45660 + }, + { + "epoch": 13.66, + "grad_norm": 2.236589193344116, + "learning_rate": 1.1411629159102785e-05, + "loss": 1.0603, + "step": 45665 + }, + { + "epoch": 13.66, + "grad_norm": 1.2973885536193848, + "learning_rate": 1.1406697961197654e-05, + "loss": 0.9378, + "step": 45670 + }, + { + "epoch": 13.67, + "grad_norm": 2.1828157901763916, + "learning_rate": 1.1401767514036463e-05, + "loss": 0.94, + "step": 45675 + }, + { + "epoch": 13.67, + "grad_norm": 4.644625186920166, + "learning_rate": 1.1396837817891518e-05, + "loss": 0.9499, + "step": 45680 + }, + { + "epoch": 13.67, + "grad_norm": 2.0859460830688477, + "learning_rate": 1.1391908873035082e-05, + "loss": 0.9002, + "step": 45685 + }, + { + "epoch": 13.67, + "grad_norm": 7.057641983032227, + "learning_rate": 1.1386980679739373e-05, + "loss": 0.917, + "step": 45690 + }, + { + "epoch": 13.67, + "grad_norm": 3.8702352046966553, + "learning_rate": 1.1382053238276569e-05, + "loss": 0.9209, + "step": 45695 + }, + { + "epoch": 13.67, + "grad_norm": 1.8291438817977905, + "learning_rate": 1.1377126548918811e-05, + "loss": 1.02, + "step": 45700 + }, + { + "epoch": 13.67, + "grad_norm": 6.240175247192383, + "learning_rate": 1.1372200611938196e-05, + "loss": 0.9408, + "step": 45705 + }, + { + "epoch": 13.68, + "grad_norm": 8.977832794189453, + "learning_rate": 1.1367275427606774e-05, + "loss": 0.8573, + "step": 45710 + }, + { + "epoch": 13.68, + "grad_norm": 2.6074182987213135, + "learning_rate": 1.1362350996196559e-05, + "loss": 0.9426, + "step": 45715 + }, + { + "epoch": 13.68, + "grad_norm": 3.5321543216705322, + "learning_rate": 1.1357427317979532e-05, + "loss": 1.0316, + "step": 45720 + }, + { + "epoch": 13.68, + "grad_norm": 1.716353178024292, + "learning_rate": 1.1352504393227598e-05, + "loss": 1.0807, + "step": 45725 + }, + { + "epoch": 13.68, + "grad_norm": 4.071742534637451, + "learning_rate": 1.1347582222212677e-05, + "loss": 0.951, + "step": 45730 + }, + { + "epoch": 13.68, + "grad_norm": 2.598299741744995, + "learning_rate": 1.1342660805206578e-05, + "loss": 0.9232, + "step": 45735 + }, + { + "epoch": 13.68, + "grad_norm": 3.115583658218384, + "learning_rate": 1.1337740142481148e-05, + "loss": 1.1328, + "step": 45740 + }, + { + "epoch": 13.69, + "grad_norm": 4.597363471984863, + "learning_rate": 1.1332820234308123e-05, + "loss": 0.9854, + "step": 45745 + }, + { + "epoch": 13.69, + "grad_norm": 4.575268745422363, + "learning_rate": 1.1327901080959224e-05, + "loss": 0.9848, + "step": 45750 + }, + { + "epoch": 13.69, + "grad_norm": 4.154250144958496, + "learning_rate": 1.132298268270614e-05, + "loss": 0.877, + "step": 45755 + }, + { + "epoch": 13.69, + "grad_norm": 2.12250018119812, + "learning_rate": 1.1318065039820505e-05, + "loss": 0.8998, + "step": 45760 + }, + { + "epoch": 13.69, + "grad_norm": 1.1914949417114258, + "learning_rate": 1.1313148152573915e-05, + "loss": 0.975, + "step": 45765 + }, + { + "epoch": 13.69, + "grad_norm": 2.145120143890381, + "learning_rate": 1.130823202123793e-05, + "loss": 1.1313, + "step": 45770 + }, + { + "epoch": 13.7, + "grad_norm": 6.92742395401001, + "learning_rate": 1.1303316646084055e-05, + "loss": 1.184, + "step": 45775 + }, + { + "epoch": 13.7, + "grad_norm": 4.461109638214111, + "learning_rate": 1.1298402027383768e-05, + "loss": 1.0285, + "step": 45780 + }, + { + "epoch": 13.7, + "grad_norm": 3.108293294906616, + "learning_rate": 1.1293488165408492e-05, + "loss": 1.0309, + "step": 45785 + }, + { + "epoch": 13.7, + "grad_norm": 2.175156593322754, + "learning_rate": 1.1288575060429618e-05, + "loss": 1.0424, + "step": 45790 + }, + { + "epoch": 13.7, + "grad_norm": 2.99977707862854, + "learning_rate": 1.1283662712718493e-05, + "loss": 0.8261, + "step": 45795 + }, + { + "epoch": 13.7, + "grad_norm": 3.166217565536499, + "learning_rate": 1.1278751122546417e-05, + "loss": 0.9703, + "step": 45800 + }, + { + "epoch": 13.7, + "grad_norm": 4.1235032081604, + "learning_rate": 1.1273840290184653e-05, + "loss": 1.0018, + "step": 45805 + }, + { + "epoch": 13.71, + "grad_norm": 2.862684726715088, + "learning_rate": 1.1268930215904424e-05, + "loss": 1.0977, + "step": 45810 + }, + { + "epoch": 13.71, + "grad_norm": 3.851482629776001, + "learning_rate": 1.1264020899976918e-05, + "loss": 0.9958, + "step": 45815 + }, + { + "epoch": 13.71, + "grad_norm": 4.1922197341918945, + "learning_rate": 1.125911234267324e-05, + "loss": 1.0227, + "step": 45820 + }, + { + "epoch": 13.71, + "grad_norm": 2.539138078689575, + "learning_rate": 1.1254204544264521e-05, + "loss": 1.1639, + "step": 45825 + }, + { + "epoch": 13.71, + "grad_norm": 5.336416721343994, + "learning_rate": 1.1249297505021778e-05, + "loss": 0.9809, + "step": 45830 + }, + { + "epoch": 13.71, + "grad_norm": 2.3755998611450195, + "learning_rate": 1.1244391225216061e-05, + "loss": 0.9196, + "step": 45835 + }, + { + "epoch": 13.71, + "grad_norm": 4.774353504180908, + "learning_rate": 1.123948570511831e-05, + "loss": 1.012, + "step": 45840 + }, + { + "epoch": 13.72, + "grad_norm": 4.135644435882568, + "learning_rate": 1.123458094499946e-05, + "loss": 1.0807, + "step": 45845 + }, + { + "epoch": 13.72, + "grad_norm": 2.5520973205566406, + "learning_rate": 1.1229676945130394e-05, + "loss": 1.0183, + "step": 45850 + }, + { + "epoch": 13.72, + "grad_norm": 4.4872870445251465, + "learning_rate": 1.122477370578196e-05, + "loss": 0.9953, + "step": 45855 + }, + { + "epoch": 13.72, + "grad_norm": 1.6755377054214478, + "learning_rate": 1.1219871227224953e-05, + "loss": 0.929, + "step": 45860 + }, + { + "epoch": 13.72, + "grad_norm": 3.4664084911346436, + "learning_rate": 1.1214969509730135e-05, + "loss": 0.9963, + "step": 45865 + }, + { + "epoch": 13.72, + "grad_norm": 4.792949199676514, + "learning_rate": 1.1210068553568224e-05, + "loss": 0.7811, + "step": 45870 + }, + { + "epoch": 13.73, + "grad_norm": 4.7333550453186035, + "learning_rate": 1.120516835900989e-05, + "loss": 0.9869, + "step": 45875 + }, + { + "epoch": 13.73, + "grad_norm": 2.1963367462158203, + "learning_rate": 1.1200268926325771e-05, + "loss": 1.0632, + "step": 45880 + }, + { + "epoch": 13.73, + "grad_norm": 4.561999797821045, + "learning_rate": 1.1195370255786455e-05, + "loss": 0.9137, + "step": 45885 + }, + { + "epoch": 13.73, + "grad_norm": 2.747818946838379, + "learning_rate": 1.119047234766249e-05, + "loss": 0.9463, + "step": 45890 + }, + { + "epoch": 13.73, + "grad_norm": 3.7482149600982666, + "learning_rate": 1.1185575202224383e-05, + "loss": 0.8418, + "step": 45895 + }, + { + "epoch": 13.73, + "grad_norm": 2.163506507873535, + "learning_rate": 1.1180678819742599e-05, + "loss": 1.2076, + "step": 45900 + }, + { + "epoch": 13.73, + "grad_norm": 1.5832871198654175, + "learning_rate": 1.1175783200487558e-05, + "loss": 1.1003, + "step": 45905 + }, + { + "epoch": 13.74, + "grad_norm": 4.2854413986206055, + "learning_rate": 1.1170888344729652e-05, + "loss": 1.0152, + "step": 45910 + }, + { + "epoch": 13.74, + "grad_norm": 2.954716920852661, + "learning_rate": 1.116599425273919e-05, + "loss": 1.0792, + "step": 45915 + }, + { + "epoch": 13.74, + "grad_norm": 2.61503005027771, + "learning_rate": 1.1161100924786502e-05, + "loss": 1.0473, + "step": 45920 + }, + { + "epoch": 13.74, + "grad_norm": 4.037346839904785, + "learning_rate": 1.115620836114181e-05, + "loss": 0.996, + "step": 45925 + }, + { + "epoch": 13.74, + "grad_norm": 1.7600798606872559, + "learning_rate": 1.1151316562075356e-05, + "loss": 0.942, + "step": 45930 + }, + { + "epoch": 13.74, + "grad_norm": 2.2943618297576904, + "learning_rate": 1.1146425527857274e-05, + "loss": 0.9601, + "step": 45935 + }, + { + "epoch": 13.74, + "grad_norm": 2.6054422855377197, + "learning_rate": 1.1141535258757733e-05, + "loss": 1.0357, + "step": 45940 + }, + { + "epoch": 13.75, + "grad_norm": 2.9605374336242676, + "learning_rate": 1.1136645755046781e-05, + "loss": 0.9024, + "step": 45945 + }, + { + "epoch": 13.75, + "grad_norm": 6.355667591094971, + "learning_rate": 1.1131757016994476e-05, + "loss": 1.1976, + "step": 45950 + }, + { + "epoch": 13.75, + "grad_norm": 2.857518434524536, + "learning_rate": 1.1126869044870817e-05, + "loss": 1.0472, + "step": 45955 + }, + { + "epoch": 13.75, + "grad_norm": 3.006471633911133, + "learning_rate": 1.1121981838945747e-05, + "loss": 1.1969, + "step": 45960 + }, + { + "epoch": 13.75, + "grad_norm": 2.723449945449829, + "learning_rate": 1.1117095399489215e-05, + "loss": 0.8746, + "step": 45965 + }, + { + "epoch": 13.75, + "grad_norm": 2.74454927444458, + "learning_rate": 1.1112209726771067e-05, + "loss": 1.1539, + "step": 45970 + }, + { + "epoch": 13.76, + "grad_norm": 3.3110272884368896, + "learning_rate": 1.1107324821061139e-05, + "loss": 0.9179, + "step": 45975 + }, + { + "epoch": 13.76, + "grad_norm": 1.4932845830917358, + "learning_rate": 1.1102440682629217e-05, + "loss": 1.0535, + "step": 45980 + }, + { + "epoch": 13.76, + "grad_norm": 2.4937548637390137, + "learning_rate": 1.1097557311745055e-05, + "loss": 1.027, + "step": 45985 + }, + { + "epoch": 13.76, + "grad_norm": 2.2536213397979736, + "learning_rate": 1.1092674708678349e-05, + "loss": 1.0212, + "step": 45990 + }, + { + "epoch": 13.76, + "grad_norm": 1.4273040294647217, + "learning_rate": 1.1087792873698763e-05, + "loss": 1.0554, + "step": 45995 + }, + { + "epoch": 13.76, + "grad_norm": 3.2895405292510986, + "learning_rate": 1.1082911807075917e-05, + "loss": 0.9926, + "step": 46000 + }, + { + "epoch": 13.76, + "grad_norm": 2.659669876098633, + "learning_rate": 1.1078031509079394e-05, + "loss": 1.0621, + "step": 46005 + }, + { + "epoch": 13.77, + "grad_norm": 6.099573135375977, + "learning_rate": 1.1073151979978702e-05, + "loss": 0.9202, + "step": 46010 + }, + { + "epoch": 13.77, + "grad_norm": 3.4382877349853516, + "learning_rate": 1.1068273220043367e-05, + "loss": 1.0081, + "step": 46015 + }, + { + "epoch": 13.77, + "grad_norm": 2.076596975326538, + "learning_rate": 1.1063395229542806e-05, + "loss": 1.0796, + "step": 46020 + }, + { + "epoch": 13.77, + "grad_norm": 3.9843549728393555, + "learning_rate": 1.1058518008746454e-05, + "loss": 0.8787, + "step": 46025 + }, + { + "epoch": 13.77, + "grad_norm": 4.303983688354492, + "learning_rate": 1.1053641557923647e-05, + "loss": 1.0372, + "step": 46030 + }, + { + "epoch": 13.77, + "grad_norm": 2.91055965423584, + "learning_rate": 1.1048765877343736e-05, + "loss": 0.9737, + "step": 46035 + }, + { + "epoch": 13.77, + "grad_norm": 3.4289169311523438, + "learning_rate": 1.1043890967275978e-05, + "loss": 1.1013, + "step": 46040 + }, + { + "epoch": 13.78, + "grad_norm": 18.518823623657227, + "learning_rate": 1.1039016827989601e-05, + "loss": 1.1429, + "step": 46045 + }, + { + "epoch": 13.78, + "grad_norm": 3.946131706237793, + "learning_rate": 1.1034143459753835e-05, + "loss": 0.9853, + "step": 46050 + }, + { + "epoch": 13.78, + "grad_norm": 1.882799506187439, + "learning_rate": 1.102927086283779e-05, + "loss": 0.9411, + "step": 46055 + }, + { + "epoch": 13.78, + "grad_norm": 4.6659393310546875, + "learning_rate": 1.102439903751061e-05, + "loss": 0.8427, + "step": 46060 + }, + { + "epoch": 13.78, + "grad_norm": 4.819268703460693, + "learning_rate": 1.1019527984041328e-05, + "loss": 1.0051, + "step": 46065 + }, + { + "epoch": 13.78, + "grad_norm": 1.3024616241455078, + "learning_rate": 1.1014657702699003e-05, + "loss": 1.1975, + "step": 46070 + }, + { + "epoch": 13.79, + "grad_norm": 5.274040222167969, + "learning_rate": 1.1009788193752584e-05, + "loss": 1.0944, + "step": 46075 + }, + { + "epoch": 13.79, + "grad_norm": 1.9087549448013306, + "learning_rate": 1.1004919457471022e-05, + "loss": 1.1598, + "step": 46080 + }, + { + "epoch": 13.79, + "grad_norm": 5.655836582183838, + "learning_rate": 1.1000051494123211e-05, + "loss": 0.8013, + "step": 46085 + }, + { + "epoch": 13.79, + "grad_norm": 4.382424831390381, + "learning_rate": 1.0995184303978004e-05, + "loss": 0.9066, + "step": 46090 + }, + { + "epoch": 13.79, + "grad_norm": 4.181612968444824, + "learning_rate": 1.0990317887304214e-05, + "loss": 0.8979, + "step": 46095 + }, + { + "epoch": 13.79, + "grad_norm": 3.6581287384033203, + "learning_rate": 1.0985452244370608e-05, + "loss": 0.9312, + "step": 46100 + }, + { + "epoch": 13.79, + "grad_norm": 3.3705477714538574, + "learning_rate": 1.0980587375445894e-05, + "loss": 0.9881, + "step": 46105 + }, + { + "epoch": 13.8, + "grad_norm": 1.2490730285644531, + "learning_rate": 1.0975723280798783e-05, + "loss": 0.9072, + "step": 46110 + }, + { + "epoch": 13.8, + "grad_norm": 2.7945122718811035, + "learning_rate": 1.0970859960697879e-05, + "loss": 0.9368, + "step": 46115 + }, + { + "epoch": 13.8, + "grad_norm": 1.166204810142517, + "learning_rate": 1.0965997415411808e-05, + "loss": 1.0581, + "step": 46120 + }, + { + "epoch": 13.8, + "grad_norm": 1.6771597862243652, + "learning_rate": 1.096113564520911e-05, + "loss": 0.8542, + "step": 46125 + }, + { + "epoch": 13.8, + "grad_norm": 4.801357269287109, + "learning_rate": 1.09562746503583e-05, + "loss": 1.0804, + "step": 46130 + }, + { + "epoch": 13.8, + "grad_norm": 3.126763105392456, + "learning_rate": 1.0951414431127852e-05, + "loss": 0.8767, + "step": 46135 + }, + { + "epoch": 13.8, + "grad_norm": 3.6435258388519287, + "learning_rate": 1.0946554987786162e-05, + "loss": 0.9254, + "step": 46140 + }, + { + "epoch": 13.81, + "grad_norm": 3.6946301460266113, + "learning_rate": 1.0941696320601652e-05, + "loss": 0.8884, + "step": 46145 + }, + { + "epoch": 13.81, + "grad_norm": 1.621283769607544, + "learning_rate": 1.0936838429842621e-05, + "loss": 1.1543, + "step": 46150 + }, + { + "epoch": 13.81, + "grad_norm": 2.5034942626953125, + "learning_rate": 1.0931981315777406e-05, + "loss": 1.0512, + "step": 46155 + }, + { + "epoch": 13.81, + "grad_norm": 2.1045899391174316, + "learning_rate": 1.0927124978674225e-05, + "loss": 1.0269, + "step": 46160 + }, + { + "epoch": 13.81, + "grad_norm": 1.1668440103530884, + "learning_rate": 1.0922269418801318e-05, + "loss": 0.8998, + "step": 46165 + }, + { + "epoch": 13.81, + "grad_norm": 1.8777698278427124, + "learning_rate": 1.091741463642683e-05, + "loss": 1.0598, + "step": 46170 + }, + { + "epoch": 13.82, + "grad_norm": 2.1455204486846924, + "learning_rate": 1.091256063181889e-05, + "loss": 1.0727, + "step": 46175 + }, + { + "epoch": 13.82, + "grad_norm": 3.745995283126831, + "learning_rate": 1.0907707405245588e-05, + "loss": 0.9028, + "step": 46180 + }, + { + "epoch": 13.82, + "grad_norm": 4.1767096519470215, + "learning_rate": 1.0902854956974957e-05, + "loss": 1.0149, + "step": 46185 + }, + { + "epoch": 13.82, + "grad_norm": 1.699058175086975, + "learning_rate": 1.0898003287274991e-05, + "loss": 0.9797, + "step": 46190 + }, + { + "epoch": 13.82, + "grad_norm": 2.886626958847046, + "learning_rate": 1.0893152396413655e-05, + "loss": 1.0911, + "step": 46195 + }, + { + "epoch": 13.82, + "grad_norm": 6.132671356201172, + "learning_rate": 1.0888302284658833e-05, + "loss": 0.9886, + "step": 46200 + }, + { + "epoch": 13.82, + "grad_norm": 1.1873294115066528, + "learning_rate": 1.0883452952278416e-05, + "loss": 1.1952, + "step": 46205 + }, + { + "epoch": 13.83, + "grad_norm": 2.7054784297943115, + "learning_rate": 1.0878604399540219e-05, + "loss": 1.0763, + "step": 46210 + }, + { + "epoch": 13.83, + "grad_norm": 2.0939934253692627, + "learning_rate": 1.087375662671202e-05, + "loss": 0.959, + "step": 46215 + }, + { + "epoch": 13.83, + "grad_norm": 2.734236717224121, + "learning_rate": 1.0868909634061561e-05, + "loss": 0.9568, + "step": 46220 + }, + { + "epoch": 13.83, + "grad_norm": 2.2614846229553223, + "learning_rate": 1.0864063421856535e-05, + "loss": 1.0809, + "step": 46225 + }, + { + "epoch": 13.83, + "grad_norm": 2.1451010704040527, + "learning_rate": 1.08592179903646e-05, + "loss": 1.1251, + "step": 46230 + }, + { + "epoch": 13.83, + "grad_norm": 1.8617221117019653, + "learning_rate": 1.085437333985334e-05, + "loss": 0.9847, + "step": 46235 + }, + { + "epoch": 13.83, + "grad_norm": 6.787776947021484, + "learning_rate": 1.0849529470590358e-05, + "loss": 0.777, + "step": 46240 + }, + { + "epoch": 13.84, + "grad_norm": 5.022008895874023, + "learning_rate": 1.0844686382843134e-05, + "loss": 1.2188, + "step": 46245 + }, + { + "epoch": 13.84, + "grad_norm": 3.551828622817993, + "learning_rate": 1.0839844076879185e-05, + "loss": 0.998, + "step": 46250 + }, + { + "epoch": 13.84, + "grad_norm": 2.8129985332489014, + "learning_rate": 1.0835002552965911e-05, + "loss": 1.0112, + "step": 46255 + }, + { + "epoch": 13.84, + "grad_norm": 5.713034629821777, + "learning_rate": 1.083016181137074e-05, + "loss": 0.8538, + "step": 46260 + }, + { + "epoch": 13.84, + "grad_norm": 7.516810417175293, + "learning_rate": 1.0825321852360995e-05, + "loss": 0.889, + "step": 46265 + }, + { + "epoch": 13.84, + "grad_norm": 1.1386287212371826, + "learning_rate": 1.0820482676203991e-05, + "loss": 1.0063, + "step": 46270 + }, + { + "epoch": 13.84, + "grad_norm": 1.6292932033538818, + "learning_rate": 1.081564428316699e-05, + "loss": 0.9629, + "step": 46275 + }, + { + "epoch": 13.85, + "grad_norm": 2.4698173999786377, + "learning_rate": 1.081080667351721e-05, + "loss": 0.9526, + "step": 46280 + }, + { + "epoch": 13.85, + "grad_norm": 5.473732948303223, + "learning_rate": 1.0805969847521829e-05, + "loss": 0.8786, + "step": 46285 + }, + { + "epoch": 13.85, + "grad_norm": 2.3145463466644287, + "learning_rate": 1.0801133805447979e-05, + "loss": 0.8604, + "step": 46290 + }, + { + "epoch": 13.85, + "grad_norm": 4.11014461517334, + "learning_rate": 1.0796298547562753e-05, + "loss": 1.0444, + "step": 46295 + }, + { + "epoch": 13.85, + "grad_norm": 2.2047159671783447, + "learning_rate": 1.0791464074133189e-05, + "loss": 1.0339, + "step": 46300 + }, + { + "epoch": 13.85, + "grad_norm": 5.002912521362305, + "learning_rate": 1.07866303854263e-05, + "loss": 1.0871, + "step": 46305 + }, + { + "epoch": 13.86, + "grad_norm": 2.067976713180542, + "learning_rate": 1.0781797481709039e-05, + "loss": 1.0389, + "step": 46310 + }, + { + "epoch": 13.86, + "grad_norm": 3.528301477432251, + "learning_rate": 1.0776965363248326e-05, + "loss": 1.061, + "step": 46315 + }, + { + "epoch": 13.86, + "grad_norm": 2.523347854614258, + "learning_rate": 1.077213403031103e-05, + "loss": 0.9864, + "step": 46320 + }, + { + "epoch": 13.86, + "grad_norm": 4.887777328491211, + "learning_rate": 1.0767303483163991e-05, + "loss": 1.0392, + "step": 46325 + }, + { + "epoch": 13.86, + "grad_norm": 4.290348052978516, + "learning_rate": 1.0762473722073968e-05, + "loss": 1.0014, + "step": 46330 + }, + { + "epoch": 13.86, + "grad_norm": 5.4099931716918945, + "learning_rate": 1.0757644747307744e-05, + "loss": 0.898, + "step": 46335 + }, + { + "epoch": 13.86, + "grad_norm": 3.631016254425049, + "learning_rate": 1.0752816559131976e-05, + "loss": 0.9292, + "step": 46340 + }, + { + "epoch": 13.87, + "grad_norm": 2.76509428024292, + "learning_rate": 1.0747989157813357e-05, + "loss": 1.0867, + "step": 46345 + }, + { + "epoch": 13.87, + "grad_norm": 1.9425288438796997, + "learning_rate": 1.0743162543618465e-05, + "loss": 1.0359, + "step": 46350 + }, + { + "epoch": 13.87, + "grad_norm": 1.2764666080474854, + "learning_rate": 1.0738336716813907e-05, + "loss": 1.122, + "step": 46355 + }, + { + "epoch": 13.87, + "grad_norm": 5.330173492431641, + "learning_rate": 1.0733511677666178e-05, + "loss": 1.1068, + "step": 46360 + }, + { + "epoch": 13.87, + "grad_norm": 4.102222442626953, + "learning_rate": 1.0728687426441769e-05, + "loss": 1.0624, + "step": 46365 + }, + { + "epoch": 13.87, + "grad_norm": 2.589555025100708, + "learning_rate": 1.0723863963407119e-05, + "loss": 0.9326, + "step": 46370 + }, + { + "epoch": 13.87, + "grad_norm": 2.6629140377044678, + "learning_rate": 1.0719041288828624e-05, + "loss": 0.8154, + "step": 46375 + }, + { + "epoch": 13.88, + "grad_norm": 3.4112234115600586, + "learning_rate": 1.0714219402972633e-05, + "loss": 1.1058, + "step": 46380 + }, + { + "epoch": 13.88, + "grad_norm": 2.342379093170166, + "learning_rate": 1.0709398306105456e-05, + "loss": 1.0584, + "step": 46385 + }, + { + "epoch": 13.88, + "grad_norm": 1.7157783508300781, + "learning_rate": 1.070457799849336e-05, + "loss": 1.0045, + "step": 46390 + }, + { + "epoch": 13.88, + "grad_norm": 1.9355769157409668, + "learning_rate": 1.0699758480402555e-05, + "loss": 0.8319, + "step": 46395 + }, + { + "epoch": 13.88, + "grad_norm": 2.4929144382476807, + "learning_rate": 1.0694939752099229e-05, + "loss": 0.9453, + "step": 46400 + }, + { + "epoch": 13.88, + "grad_norm": 3.4646151065826416, + "learning_rate": 1.069012181384951e-05, + "loss": 1.0055, + "step": 46405 + }, + { + "epoch": 13.89, + "grad_norm": 3.360816240310669, + "learning_rate": 1.068530466591949e-05, + "loss": 1.0103, + "step": 46410 + }, + { + "epoch": 13.89, + "grad_norm": 7.58369255065918, + "learning_rate": 1.0680488308575215e-05, + "loss": 1.0717, + "step": 46415 + }, + { + "epoch": 13.89, + "grad_norm": 3.2002921104431152, + "learning_rate": 1.0675672742082692e-05, + "loss": 0.9674, + "step": 46420 + }, + { + "epoch": 13.89, + "grad_norm": 3.446225643157959, + "learning_rate": 1.067085796670786e-05, + "loss": 1.1751, + "step": 46425 + }, + { + "epoch": 13.89, + "grad_norm": 1.5883591175079346, + "learning_rate": 1.0666043982716662e-05, + "loss": 1.177, + "step": 46430 + }, + { + "epoch": 13.89, + "grad_norm": 5.534438133239746, + "learning_rate": 1.066123079037494e-05, + "loss": 0.8887, + "step": 46435 + }, + { + "epoch": 13.89, + "grad_norm": 1.9987788200378418, + "learning_rate": 1.0656418389948556e-05, + "loss": 0.9892, + "step": 46440 + }, + { + "epoch": 13.9, + "grad_norm": 1.9698108434677124, + "learning_rate": 1.0651606781703256e-05, + "loss": 1.0445, + "step": 46445 + }, + { + "epoch": 13.9, + "grad_norm": 1.448635458946228, + "learning_rate": 1.0646795965904815e-05, + "loss": 1.0926, + "step": 46450 + }, + { + "epoch": 13.9, + "grad_norm": 1.830094337463379, + "learning_rate": 1.0641985942818907e-05, + "loss": 1.0397, + "step": 46455 + }, + { + "epoch": 13.9, + "grad_norm": 3.425884485244751, + "learning_rate": 1.0637176712711192e-05, + "loss": 1.1553, + "step": 46460 + }, + { + "epoch": 13.9, + "grad_norm": 3.424363136291504, + "learning_rate": 1.0632368275847276e-05, + "loss": 1.083, + "step": 46465 + }, + { + "epoch": 13.9, + "grad_norm": 4.42012357711792, + "learning_rate": 1.0627560632492726e-05, + "loss": 0.9418, + "step": 46470 + }, + { + "epoch": 13.9, + "grad_norm": 1.4648548364639282, + "learning_rate": 1.0622753782913062e-05, + "loss": 0.844, + "step": 46475 + }, + { + "epoch": 13.91, + "grad_norm": 2.15419340133667, + "learning_rate": 1.0617947727373764e-05, + "loss": 1.0659, + "step": 46480 + }, + { + "epoch": 13.91, + "grad_norm": 1.1197208166122437, + "learning_rate": 1.0613142466140264e-05, + "loss": 1.0441, + "step": 46485 + }, + { + "epoch": 13.91, + "grad_norm": 1.7040044069290161, + "learning_rate": 1.0608337999477949e-05, + "loss": 1.058, + "step": 46490 + }, + { + "epoch": 13.91, + "grad_norm": 3.230975389480591, + "learning_rate": 1.0603534327652167e-05, + "loss": 0.8304, + "step": 46495 + }, + { + "epoch": 13.91, + "grad_norm": 3.9677131175994873, + "learning_rate": 1.0598731450928224e-05, + "loss": 1.0955, + "step": 46500 + }, + { + "epoch": 13.91, + "grad_norm": 4.740390777587891, + "learning_rate": 1.059392936957137e-05, + "loss": 0.8264, + "step": 46505 + }, + { + "epoch": 13.92, + "grad_norm": 3.7869038581848145, + "learning_rate": 1.0589128083846822e-05, + "loss": 0.9756, + "step": 46510 + }, + { + "epoch": 13.92, + "grad_norm": 3.1534292697906494, + "learning_rate": 1.0584327594019752e-05, + "loss": 1.0928, + "step": 46515 + }, + { + "epoch": 13.92, + "grad_norm": 3.072333812713623, + "learning_rate": 1.0579527900355286e-05, + "loss": 0.8572, + "step": 46520 + }, + { + "epoch": 13.92, + "grad_norm": 1.7602797746658325, + "learning_rate": 1.0574729003118511e-05, + "loss": 0.8316, + "step": 46525 + }, + { + "epoch": 13.92, + "grad_norm": 2.0324742794036865, + "learning_rate": 1.0569930902574443e-05, + "loss": 0.9476, + "step": 46530 + }, + { + "epoch": 13.92, + "grad_norm": 1.305268406867981, + "learning_rate": 1.056513359898811e-05, + "loss": 1.0128, + "step": 46535 + }, + { + "epoch": 13.92, + "grad_norm": 1.4846941232681274, + "learning_rate": 1.0560337092624426e-05, + "loss": 1.087, + "step": 46540 + }, + { + "epoch": 13.93, + "grad_norm": 1.1681172847747803, + "learning_rate": 1.0555541383748332e-05, + "loss": 0.988, + "step": 46545 + }, + { + "epoch": 13.93, + "grad_norm": 4.223578929901123, + "learning_rate": 1.0550746472624665e-05, + "loss": 1.1653, + "step": 46550 + }, + { + "epoch": 13.93, + "grad_norm": 1.3550422191619873, + "learning_rate": 1.0545952359518251e-05, + "loss": 1.0737, + "step": 46555 + }, + { + "epoch": 13.93, + "grad_norm": 3.005021095275879, + "learning_rate": 1.0541159044693866e-05, + "loss": 0.8508, + "step": 46560 + }, + { + "epoch": 13.93, + "grad_norm": 3.721085786819458, + "learning_rate": 1.0536366528416236e-05, + "loss": 1.0313, + "step": 46565 + }, + { + "epoch": 13.93, + "grad_norm": 1.5910168886184692, + "learning_rate": 1.0531574810950048e-05, + "loss": 1.0548, + "step": 46570 + }, + { + "epoch": 13.93, + "grad_norm": 2.5519471168518066, + "learning_rate": 1.0526783892559944e-05, + "loss": 1.1611, + "step": 46575 + }, + { + "epoch": 13.94, + "grad_norm": 1.598804235458374, + "learning_rate": 1.0521993773510522e-05, + "loss": 1.0835, + "step": 46580 + }, + { + "epoch": 13.94, + "grad_norm": 4.5623273849487305, + "learning_rate": 1.0517204454066337e-05, + "loss": 1.0025, + "step": 46585 + }, + { + "epoch": 13.94, + "grad_norm": 2.7830114364624023, + "learning_rate": 1.0512415934491893e-05, + "loss": 0.9183, + "step": 46590 + }, + { + "epoch": 13.94, + "grad_norm": 2.62373685836792, + "learning_rate": 1.050762821505166e-05, + "loss": 0.8919, + "step": 46595 + }, + { + "epoch": 13.94, + "grad_norm": 8.385918617248535, + "learning_rate": 1.0502841296010055e-05, + "loss": 1.0263, + "step": 46600 + }, + { + "epoch": 13.94, + "grad_norm": 1.7394742965698242, + "learning_rate": 1.0498055177631458e-05, + "loss": 1.0657, + "step": 46605 + }, + { + "epoch": 13.95, + "grad_norm": 3.228177070617676, + "learning_rate": 1.0493269860180196e-05, + "loss": 0.856, + "step": 46610 + }, + { + "epoch": 13.95, + "grad_norm": 3.1336793899536133, + "learning_rate": 1.0488485343920564e-05, + "loss": 0.968, + "step": 46615 + }, + { + "epoch": 13.95, + "grad_norm": 3.7546615600585938, + "learning_rate": 1.0483701629116813e-05, + "loss": 0.9154, + "step": 46620 + }, + { + "epoch": 13.95, + "grad_norm": 4.517045497894287, + "learning_rate": 1.047891871603311e-05, + "loss": 1.031, + "step": 46625 + }, + { + "epoch": 13.95, + "grad_norm": 4.163939952850342, + "learning_rate": 1.0474136604933654e-05, + "loss": 0.9468, + "step": 46630 + }, + { + "epoch": 13.95, + "grad_norm": 1.9266438484191895, + "learning_rate": 1.0469355296082513e-05, + "loss": 1.1067, + "step": 46635 + }, + { + "epoch": 13.95, + "grad_norm": 1.395596981048584, + "learning_rate": 1.0464574789743798e-05, + "loss": 0.9645, + "step": 46640 + }, + { + "epoch": 13.96, + "grad_norm": 3.1779944896698, + "learning_rate": 1.0459795086181487e-05, + "loss": 0.9891, + "step": 46645 + }, + { + "epoch": 13.96, + "grad_norm": 7.933053493499756, + "learning_rate": 1.04550161856596e-05, + "loss": 1.0198, + "step": 46650 + }, + { + "epoch": 13.96, + "grad_norm": 3.6656312942504883, + "learning_rate": 1.0450238088442038e-05, + "loss": 0.8881, + "step": 46655 + }, + { + "epoch": 13.96, + "grad_norm": 3.141336679458618, + "learning_rate": 1.0445460794792706e-05, + "loss": 1.0471, + "step": 46660 + }, + { + "epoch": 13.96, + "grad_norm": 1.635010004043579, + "learning_rate": 1.0440684304975442e-05, + "loss": 0.8864, + "step": 46665 + }, + { + "epoch": 13.96, + "grad_norm": 3.4402453899383545, + "learning_rate": 1.0435908619254053e-05, + "loss": 1.0659, + "step": 46670 + }, + { + "epoch": 13.96, + "grad_norm": 2.2001841068267822, + "learning_rate": 1.0431133737892292e-05, + "loss": 0.9821, + "step": 46675 + }, + { + "epoch": 13.97, + "grad_norm": 2.5706429481506348, + "learning_rate": 1.0426359661153873e-05, + "loss": 0.8815, + "step": 46680 + }, + { + "epoch": 13.97, + "grad_norm": 5.090077877044678, + "learning_rate": 1.042158638930246e-05, + "loss": 0.9084, + "step": 46685 + }, + { + "epoch": 13.97, + "grad_norm": 4.350427150726318, + "learning_rate": 1.0416813922601677e-05, + "loss": 1.0049, + "step": 46690 + }, + { + "epoch": 13.97, + "grad_norm": 3.4876809120178223, + "learning_rate": 1.0412042261315103e-05, + "loss": 1.1069, + "step": 46695 + }, + { + "epoch": 13.97, + "grad_norm": 3.4440534114837646, + "learning_rate": 1.0407271405706271e-05, + "loss": 1.1091, + "step": 46700 + }, + { + "epoch": 13.97, + "grad_norm": 2.864044427871704, + "learning_rate": 1.0402501356038671e-05, + "loss": 0.8457, + "step": 46705 + }, + { + "epoch": 13.98, + "grad_norm": 1.3529998064041138, + "learning_rate": 1.0397732112575747e-05, + "loss": 1.0893, + "step": 46710 + }, + { + "epoch": 13.98, + "grad_norm": 1.4070992469787598, + "learning_rate": 1.0392963675580911e-05, + "loss": 1.1165, + "step": 46715 + }, + { + "epoch": 13.98, + "grad_norm": 3.6802985668182373, + "learning_rate": 1.0388196045317488e-05, + "loss": 1.1286, + "step": 46720 + }, + { + "epoch": 13.98, + "grad_norm": 1.862669587135315, + "learning_rate": 1.0383429222048829e-05, + "loss": 0.9559, + "step": 46725 + }, + { + "epoch": 13.98, + "grad_norm": 2.9887638092041016, + "learning_rate": 1.0378663206038164e-05, + "loss": 0.9677, + "step": 46730 + }, + { + "epoch": 13.98, + "grad_norm": 3.377013921737671, + "learning_rate": 1.0373897997548748e-05, + "loss": 0.9216, + "step": 46735 + }, + { + "epoch": 13.98, + "grad_norm": 3.8938591480255127, + "learning_rate": 1.0369133596843727e-05, + "loss": 1.0537, + "step": 46740 + }, + { + "epoch": 13.99, + "grad_norm": 3.0918357372283936, + "learning_rate": 1.0364370004186267e-05, + "loss": 1.0356, + "step": 46745 + }, + { + "epoch": 13.99, + "grad_norm": 4.587892055511475, + "learning_rate": 1.0359607219839427e-05, + "loss": 0.9786, + "step": 46750 + }, + { + "epoch": 13.99, + "grad_norm": 1.8486536741256714, + "learning_rate": 1.0354845244066263e-05, + "loss": 0.9638, + "step": 46755 + }, + { + "epoch": 13.99, + "grad_norm": 1.7416905164718628, + "learning_rate": 1.0350084077129776e-05, + "loss": 1.0862, + "step": 46760 + }, + { + "epoch": 13.99, + "grad_norm": 2.3283634185791016, + "learning_rate": 1.0345323719292915e-05, + "loss": 1.0598, + "step": 46765 + }, + { + "epoch": 13.99, + "grad_norm": 1.997711181640625, + "learning_rate": 1.0340564170818595e-05, + "loss": 1.2272, + "step": 46770 + }, + { + "epoch": 13.99, + "grad_norm": 2.0410749912261963, + "learning_rate": 1.0335805431969675e-05, + "loss": 1.1836, + "step": 46775 + }, + { + "epoch": 14.0, + "grad_norm": 2.7848238945007324, + "learning_rate": 1.0331047503008981e-05, + "loss": 0.9318, + "step": 46780 + }, + { + "epoch": 14.0, + "grad_norm": 1.9022867679595947, + "learning_rate": 1.0326290384199285e-05, + "loss": 1.0139, + "step": 46785 + }, + { + "epoch": 14.0, + "grad_norm": 1.689530849456787, + "learning_rate": 1.0321534075803319e-05, + "loss": 1.0845, + "step": 46790 + }, + { + "epoch": 14.0, + "grad_norm": 4.382483005523682, + "learning_rate": 1.0316778578083768e-05, + "loss": 0.8879, + "step": 46795 + }, + { + "epoch": 14.0, + "grad_norm": 2.087588310241699, + "learning_rate": 1.0312023891303273e-05, + "loss": 0.9106, + "step": 46800 + }, + { + "epoch": 14.0, + "grad_norm": 1.368632197380066, + "learning_rate": 1.0307270015724433e-05, + "loss": 1.122, + "step": 46805 + }, + { + "epoch": 14.01, + "grad_norm": 2.369845390319824, + "learning_rate": 1.0302516951609806e-05, + "loss": 0.9383, + "step": 46810 + }, + { + "epoch": 14.01, + "grad_norm": 1.6115524768829346, + "learning_rate": 1.0297764699221874e-05, + "loss": 0.9586, + "step": 46815 + }, + { + "epoch": 14.01, + "grad_norm": 4.875194072723389, + "learning_rate": 1.0293013258823131e-05, + "loss": 0.9979, + "step": 46820 + }, + { + "epoch": 14.01, + "grad_norm": 2.5877294540405273, + "learning_rate": 1.028826263067596e-05, + "loss": 0.8455, + "step": 46825 + }, + { + "epoch": 14.01, + "grad_norm": 2.146116018295288, + "learning_rate": 1.0283512815042773e-05, + "loss": 0.8959, + "step": 46830 + }, + { + "epoch": 14.01, + "grad_norm": 3.2791755199432373, + "learning_rate": 1.0278763812185857e-05, + "loss": 0.9684, + "step": 46835 + }, + { + "epoch": 14.01, + "grad_norm": 2.8382163047790527, + "learning_rate": 1.027401562236753e-05, + "loss": 1.033, + "step": 46840 + }, + { + "epoch": 14.02, + "grad_norm": 2.077665090560913, + "learning_rate": 1.0269268245850006e-05, + "loss": 0.9306, + "step": 46845 + }, + { + "epoch": 14.02, + "grad_norm": 3.7365541458129883, + "learning_rate": 1.0264521682895478e-05, + "loss": 0.9208, + "step": 46850 + }, + { + "epoch": 14.02, + "grad_norm": 3.4815821647644043, + "learning_rate": 1.0259775933766117e-05, + "loss": 0.9793, + "step": 46855 + }, + { + "epoch": 14.02, + "grad_norm": 3.6899566650390625, + "learning_rate": 1.0255030998723991e-05, + "loss": 0.9788, + "step": 46860 + }, + { + "epoch": 14.02, + "grad_norm": 2.341599702835083, + "learning_rate": 1.0250286878031193e-05, + "loss": 1.0963, + "step": 46865 + }, + { + "epoch": 14.02, + "grad_norm": 4.204500675201416, + "learning_rate": 1.0245543571949704e-05, + "loss": 0.9897, + "step": 46870 + }, + { + "epoch": 14.02, + "grad_norm": 2.09965443611145, + "learning_rate": 1.0240801080741525e-05, + "loss": 1.0211, + "step": 46875 + }, + { + "epoch": 14.03, + "grad_norm": 2.6598119735717773, + "learning_rate": 1.0236059404668549e-05, + "loss": 0.9235, + "step": 46880 + }, + { + "epoch": 14.03, + "grad_norm": 1.9532800912857056, + "learning_rate": 1.0231318543992669e-05, + "loss": 1.044, + "step": 46885 + }, + { + "epoch": 14.03, + "grad_norm": 2.8368186950683594, + "learning_rate": 1.0226578498975714e-05, + "loss": 0.9055, + "step": 46890 + }, + { + "epoch": 14.03, + "grad_norm": 1.4692111015319824, + "learning_rate": 1.0221839269879472e-05, + "loss": 1.0778, + "step": 46895 + }, + { + "epoch": 14.03, + "grad_norm": 2.8631324768066406, + "learning_rate": 1.0217100856965687e-05, + "loss": 0.9229, + "step": 46900 + }, + { + "epoch": 14.03, + "grad_norm": 1.9299060106277466, + "learning_rate": 1.0212363260496063e-05, + "loss": 1.0288, + "step": 46905 + }, + { + "epoch": 14.03, + "grad_norm": 2.9853992462158203, + "learning_rate": 1.0207626480732228e-05, + "loss": 0.952, + "step": 46910 + }, + { + "epoch": 14.04, + "grad_norm": 2.1383092403411865, + "learning_rate": 1.0202890517935824e-05, + "loss": 1.1462, + "step": 46915 + }, + { + "epoch": 14.04, + "grad_norm": 6.036135196685791, + "learning_rate": 1.019815537236838e-05, + "loss": 0.9865, + "step": 46920 + }, + { + "epoch": 14.04, + "grad_norm": 4.347772598266602, + "learning_rate": 1.0193421044291446e-05, + "loss": 0.9413, + "step": 46925 + }, + { + "epoch": 14.04, + "grad_norm": 5.3200860023498535, + "learning_rate": 1.0188687533966457e-05, + "loss": 0.9559, + "step": 46930 + }, + { + "epoch": 14.04, + "grad_norm": 1.9751383066177368, + "learning_rate": 1.0183954841654873e-05, + "loss": 1.0025, + "step": 46935 + }, + { + "epoch": 14.04, + "grad_norm": 2.211293935775757, + "learning_rate": 1.0179222967618068e-05, + "loss": 1.0879, + "step": 46940 + }, + { + "epoch": 14.05, + "grad_norm": 2.816315174102783, + "learning_rate": 1.0174491912117357e-05, + "loss": 1.1058, + "step": 46945 + }, + { + "epoch": 14.05, + "grad_norm": 3.3474440574645996, + "learning_rate": 1.0169761675414064e-05, + "loss": 0.8382, + "step": 46950 + }, + { + "epoch": 14.05, + "grad_norm": 1.9791414737701416, + "learning_rate": 1.0165032257769403e-05, + "loss": 1.1009, + "step": 46955 + }, + { + "epoch": 14.05, + "grad_norm": 3.0999069213867188, + "learning_rate": 1.0160303659444606e-05, + "loss": 0.9814, + "step": 46960 + }, + { + "epoch": 14.05, + "grad_norm": 1.7832539081573486, + "learning_rate": 1.0155575880700796e-05, + "loss": 0.9725, + "step": 46965 + }, + { + "epoch": 14.05, + "grad_norm": 2.467935800552368, + "learning_rate": 1.015084892179912e-05, + "loss": 0.9384, + "step": 46970 + }, + { + "epoch": 14.05, + "grad_norm": 1.9563651084899902, + "learning_rate": 1.0146122783000612e-05, + "loss": 1.039, + "step": 46975 + }, + { + "epoch": 14.06, + "grad_norm": 3.385742425918579, + "learning_rate": 1.0141397464566302e-05, + "loss": 1.0829, + "step": 46980 + }, + { + "epoch": 14.06, + "grad_norm": 6.85827112197876, + "learning_rate": 1.0136672966757166e-05, + "loss": 0.9445, + "step": 46985 + }, + { + "epoch": 14.06, + "grad_norm": 3.0008556842803955, + "learning_rate": 1.0131949289834133e-05, + "loss": 1.0382, + "step": 46990 + }, + { + "epoch": 14.06, + "grad_norm": 1.650694727897644, + "learning_rate": 1.0127226434058088e-05, + "loss": 0.9398, + "step": 46995 + }, + { + "epoch": 14.06, + "grad_norm": 2.1701951026916504, + "learning_rate": 1.0122504399689863e-05, + "loss": 1.0166, + "step": 47000 + }, + { + "epoch": 14.06, + "grad_norm": 3.96262526512146, + "learning_rate": 1.011778318699026e-05, + "loss": 0.8386, + "step": 47005 + }, + { + "epoch": 14.06, + "grad_norm": 3.1463615894317627, + "learning_rate": 1.0113062796220021e-05, + "loss": 1.0431, + "step": 47010 + }, + { + "epoch": 14.07, + "grad_norm": 3.198836326599121, + "learning_rate": 1.010834322763985e-05, + "loss": 1.0575, + "step": 47015 + }, + { + "epoch": 14.07, + "grad_norm": 2.9448750019073486, + "learning_rate": 1.0103624481510404e-05, + "loss": 0.9553, + "step": 47020 + }, + { + "epoch": 14.07, + "grad_norm": 1.3542437553405762, + "learning_rate": 1.0098906558092291e-05, + "loss": 1.0053, + "step": 47025 + }, + { + "epoch": 14.07, + "grad_norm": 3.5193848609924316, + "learning_rate": 1.0094189457646083e-05, + "loss": 0.9204, + "step": 47030 + }, + { + "epoch": 14.07, + "grad_norm": 1.792724609375, + "learning_rate": 1.0089473180432307e-05, + "loss": 1.0258, + "step": 47035 + }, + { + "epoch": 14.07, + "grad_norm": 1.6008864641189575, + "learning_rate": 1.0084757726711408e-05, + "loss": 0.9024, + "step": 47040 + }, + { + "epoch": 14.08, + "grad_norm": 3.162278890609741, + "learning_rate": 1.0080043096743857e-05, + "loss": 0.9124, + "step": 47045 + }, + { + "epoch": 14.08, + "grad_norm": 2.5094027519226074, + "learning_rate": 1.0075329290789998e-05, + "loss": 0.9806, + "step": 47050 + }, + { + "epoch": 14.08, + "grad_norm": 3.3782403469085693, + "learning_rate": 1.0070616309110207e-05, + "loss": 1.0407, + "step": 47055 + }, + { + "epoch": 14.08, + "grad_norm": 4.14473819732666, + "learning_rate": 1.0065904151964738e-05, + "loss": 0.8774, + "step": 47060 + }, + { + "epoch": 14.08, + "grad_norm": 9.009183883666992, + "learning_rate": 1.0061192819613882e-05, + "loss": 1.0632, + "step": 47065 + }, + { + "epoch": 14.08, + "grad_norm": 1.744873285293579, + "learning_rate": 1.0056482312317809e-05, + "loss": 1.0087, + "step": 47070 + }, + { + "epoch": 14.08, + "grad_norm": 1.8300987482070923, + "learning_rate": 1.0051772630336684e-05, + "loss": 1.0234, + "step": 47075 + }, + { + "epoch": 14.09, + "grad_norm": 2.92419171333313, + "learning_rate": 1.0047063773930617e-05, + "loss": 0.8112, + "step": 47080 + }, + { + "epoch": 14.09, + "grad_norm": 2.440823793411255, + "learning_rate": 1.0042355743359676e-05, + "loss": 0.7468, + "step": 47085 + }, + { + "epoch": 14.09, + "grad_norm": 8.091914176940918, + "learning_rate": 1.0037648538883882e-05, + "loss": 0.9097, + "step": 47090 + }, + { + "epoch": 14.09, + "grad_norm": 5.099801063537598, + "learning_rate": 1.0032942160763206e-05, + "loss": 0.7116, + "step": 47095 + }, + { + "epoch": 14.09, + "grad_norm": 1.9223390817642212, + "learning_rate": 1.0028236609257574e-05, + "loss": 1.0824, + "step": 47100 + }, + { + "epoch": 14.09, + "grad_norm": 2.9645304679870605, + "learning_rate": 1.0023531884626875e-05, + "loss": 1.1287, + "step": 47105 + }, + { + "epoch": 14.09, + "grad_norm": 4.622707366943359, + "learning_rate": 1.0018827987130942e-05, + "loss": 0.9873, + "step": 47110 + }, + { + "epoch": 14.1, + "grad_norm": 2.609149217605591, + "learning_rate": 1.0014124917029567e-05, + "loss": 1.0403, + "step": 47115 + }, + { + "epoch": 14.1, + "grad_norm": 1.8755382299423218, + "learning_rate": 1.0009422674582497e-05, + "loss": 1.0787, + "step": 47120 + }, + { + "epoch": 14.1, + "grad_norm": 3.716013193130493, + "learning_rate": 1.000472126004943e-05, + "loss": 1.0361, + "step": 47125 + }, + { + "epoch": 14.1, + "grad_norm": 4.679911136627197, + "learning_rate": 1.0000020673690027e-05, + "loss": 0.9316, + "step": 47130 + }, + { + "epoch": 14.1, + "grad_norm": 4.787873268127441, + "learning_rate": 9.995320915763878e-06, + "loss": 0.9913, + "step": 47135 + }, + { + "epoch": 14.1, + "grad_norm": 4.154457092285156, + "learning_rate": 9.990621986530574e-06, + "loss": 0.8499, + "step": 47140 + }, + { + "epoch": 14.11, + "grad_norm": 2.3199286460876465, + "learning_rate": 9.985923886249599e-06, + "loss": 1.1649, + "step": 47145 + }, + { + "epoch": 14.11, + "grad_norm": 2.379757881164551, + "learning_rate": 9.981226615180456e-06, + "loss": 0.9088, + "step": 47150 + }, + { + "epoch": 14.11, + "grad_norm": 3.3448526859283447, + "learning_rate": 9.97653017358254e-06, + "loss": 1.0832, + "step": 47155 + }, + { + "epoch": 14.11, + "grad_norm": 1.5010708570480347, + "learning_rate": 9.971834561715265e-06, + "loss": 1.0648, + "step": 47160 + }, + { + "epoch": 14.11, + "grad_norm": 1.6477797031402588, + "learning_rate": 9.967139779837939e-06, + "loss": 0.9349, + "step": 47165 + }, + { + "epoch": 14.11, + "grad_norm": 4.819705009460449, + "learning_rate": 9.962445828209854e-06, + "loss": 0.9833, + "step": 47170 + }, + { + "epoch": 14.11, + "grad_norm": 6.116825580596924, + "learning_rate": 9.957752707090257e-06, + "loss": 0.9763, + "step": 47175 + }, + { + "epoch": 14.12, + "grad_norm": 2.0187926292419434, + "learning_rate": 9.953060416738343e-06, + "loss": 1.0569, + "step": 47180 + }, + { + "epoch": 14.12, + "grad_norm": 2.0881597995758057, + "learning_rate": 9.94836895741326e-06, + "loss": 0.8785, + "step": 47185 + }, + { + "epoch": 14.12, + "grad_norm": 4.898426055908203, + "learning_rate": 9.943678329374116e-06, + "loss": 0.7823, + "step": 47190 + }, + { + "epoch": 14.12, + "grad_norm": 3.7305986881256104, + "learning_rate": 9.938988532879967e-06, + "loss": 0.8913, + "step": 47195 + }, + { + "epoch": 14.12, + "grad_norm": 1.356601357460022, + "learning_rate": 9.934299568189826e-06, + "loss": 1.0196, + "step": 47200 + }, + { + "epoch": 14.12, + "grad_norm": 3.387441873550415, + "learning_rate": 9.929611435562658e-06, + "loss": 0.8262, + "step": 47205 + }, + { + "epoch": 14.12, + "grad_norm": 2.2645153999328613, + "learning_rate": 9.924924135257388e-06, + "loss": 0.9682, + "step": 47210 + }, + { + "epoch": 14.13, + "grad_norm": 3.1974477767944336, + "learning_rate": 9.92023766753289e-06, + "loss": 1.0503, + "step": 47215 + }, + { + "epoch": 14.13, + "grad_norm": 1.60183584690094, + "learning_rate": 9.915552032647988e-06, + "loss": 0.9126, + "step": 47220 + }, + { + "epoch": 14.13, + "grad_norm": 2.7414391040802, + "learning_rate": 9.910867230861467e-06, + "loss": 1.0307, + "step": 47225 + }, + { + "epoch": 14.13, + "grad_norm": 3.494734287261963, + "learning_rate": 9.906183262432068e-06, + "loss": 0.9636, + "step": 47230 + }, + { + "epoch": 14.13, + "grad_norm": 1.4926824569702148, + "learning_rate": 9.901500127618485e-06, + "loss": 0.8568, + "step": 47235 + }, + { + "epoch": 14.13, + "grad_norm": 1.8363795280456543, + "learning_rate": 9.896817826679338e-06, + "loss": 0.9843, + "step": 47240 + }, + { + "epoch": 14.14, + "grad_norm": 2.363992214202881, + "learning_rate": 9.892136359873264e-06, + "loss": 1.0686, + "step": 47245 + }, + { + "epoch": 14.14, + "grad_norm": 1.888505458831787, + "learning_rate": 9.887455727458775e-06, + "loss": 1.0122, + "step": 47250 + }, + { + "epoch": 14.14, + "grad_norm": 1.452127456665039, + "learning_rate": 9.882775929694416e-06, + "loss": 0.9163, + "step": 47255 + }, + { + "epoch": 14.14, + "grad_norm": 7.370832443237305, + "learning_rate": 9.878096966838617e-06, + "loss": 0.8539, + "step": 47260 + }, + { + "epoch": 14.14, + "grad_norm": 3.398411750793457, + "learning_rate": 9.873418839149808e-06, + "loss": 1.0393, + "step": 47265 + }, + { + "epoch": 14.14, + "grad_norm": 2.2340104579925537, + "learning_rate": 9.868741546886353e-06, + "loss": 0.9541, + "step": 47270 + }, + { + "epoch": 14.14, + "grad_norm": 2.5456645488739014, + "learning_rate": 9.864065090306574e-06, + "loss": 1.0563, + "step": 47275 + }, + { + "epoch": 14.15, + "grad_norm": 1.726277470588684, + "learning_rate": 9.859389469668745e-06, + "loss": 1.0858, + "step": 47280 + }, + { + "epoch": 14.15, + "grad_norm": 1.36215078830719, + "learning_rate": 9.8547146852311e-06, + "loss": 1.061, + "step": 47285 + }, + { + "epoch": 14.15, + "grad_norm": 3.9773330688476562, + "learning_rate": 9.85004073725182e-06, + "loss": 0.7896, + "step": 47290 + }, + { + "epoch": 14.15, + "grad_norm": 1.2395633459091187, + "learning_rate": 9.845367625989044e-06, + "loss": 1.0116, + "step": 47295 + }, + { + "epoch": 14.15, + "grad_norm": 2.955395460128784, + "learning_rate": 9.84069535170086e-06, + "loss": 1.0423, + "step": 47300 + }, + { + "epoch": 14.15, + "grad_norm": 2.663750171661377, + "learning_rate": 9.836023914645313e-06, + "loss": 0.9401, + "step": 47305 + }, + { + "epoch": 14.15, + "grad_norm": 3.750603437423706, + "learning_rate": 9.831353315080405e-06, + "loss": 0.9206, + "step": 47310 + }, + { + "epoch": 14.16, + "grad_norm": 4.748321056365967, + "learning_rate": 9.826683553264085e-06, + "loss": 1.0016, + "step": 47315 + }, + { + "epoch": 14.16, + "grad_norm": 4.10500955581665, + "learning_rate": 9.822014629454263e-06, + "loss": 0.9215, + "step": 47320 + }, + { + "epoch": 14.16, + "grad_norm": 5.371630668640137, + "learning_rate": 9.817346543908796e-06, + "loss": 0.9788, + "step": 47325 + }, + { + "epoch": 14.16, + "grad_norm": 1.5055346488952637, + "learning_rate": 9.812679296885505e-06, + "loss": 0.9854, + "step": 47330 + }, + { + "epoch": 14.16, + "grad_norm": 1.5663864612579346, + "learning_rate": 9.808012888642132e-06, + "loss": 0.9251, + "step": 47335 + }, + { + "epoch": 14.16, + "grad_norm": 2.7338407039642334, + "learning_rate": 9.803347319436435e-06, + "loss": 0.9418, + "step": 47340 + }, + { + "epoch": 14.17, + "grad_norm": 2.1328275203704834, + "learning_rate": 9.798682589526052e-06, + "loss": 0.8952, + "step": 47345 + }, + { + "epoch": 14.17, + "grad_norm": 3.0970237255096436, + "learning_rate": 9.794018699168644e-06, + "loss": 0.8826, + "step": 47350 + }, + { + "epoch": 14.17, + "grad_norm": 1.936923623085022, + "learning_rate": 9.789355648621764e-06, + "loss": 0.9455, + "step": 47355 + }, + { + "epoch": 14.17, + "grad_norm": 2.612597703933716, + "learning_rate": 9.784693438142975e-06, + "loss": 0.9279, + "step": 47360 + }, + { + "epoch": 14.17, + "grad_norm": 3.304236888885498, + "learning_rate": 9.780032067989744e-06, + "loss": 0.929, + "step": 47365 + }, + { + "epoch": 14.17, + "grad_norm": 1.589285135269165, + "learning_rate": 9.775371538419522e-06, + "loss": 1.154, + "step": 47370 + }, + { + "epoch": 14.17, + "grad_norm": 1.9944769144058228, + "learning_rate": 9.770711849689706e-06, + "loss": 0.8109, + "step": 47375 + }, + { + "epoch": 14.18, + "grad_norm": 3.4619359970092773, + "learning_rate": 9.766053002057646e-06, + "loss": 0.9818, + "step": 47380 + }, + { + "epoch": 14.18, + "grad_norm": 3.2043612003326416, + "learning_rate": 9.76139499578064e-06, + "loss": 1.0544, + "step": 47385 + }, + { + "epoch": 14.18, + "grad_norm": 7.88495397567749, + "learning_rate": 9.75673783111595e-06, + "loss": 0.891, + "step": 47390 + }, + { + "epoch": 14.18, + "grad_norm": 2.1868255138397217, + "learning_rate": 9.752081508320785e-06, + "loss": 0.9812, + "step": 47395 + }, + { + "epoch": 14.18, + "grad_norm": 2.6226940155029297, + "learning_rate": 9.747426027652309e-06, + "loss": 0.9334, + "step": 47400 + }, + { + "epoch": 14.18, + "grad_norm": 3.620140552520752, + "learning_rate": 9.74277138936764e-06, + "loss": 0.9, + "step": 47405 + }, + { + "epoch": 14.18, + "grad_norm": 7.102359771728516, + "learning_rate": 9.738117593723847e-06, + "loss": 0.8913, + "step": 47410 + }, + { + "epoch": 14.19, + "grad_norm": 3.3569910526275635, + "learning_rate": 9.733464640977957e-06, + "loss": 1.0191, + "step": 47415 + }, + { + "epoch": 14.19, + "grad_norm": 3.8863701820373535, + "learning_rate": 9.728812531386946e-06, + "loss": 1.025, + "step": 47420 + }, + { + "epoch": 14.19, + "grad_norm": 2.6565213203430176, + "learning_rate": 9.724161265207755e-06, + "loss": 1.0252, + "step": 47425 + }, + { + "epoch": 14.19, + "grad_norm": 3.354027032852173, + "learning_rate": 9.719510842697238e-06, + "loss": 0.8642, + "step": 47430 + }, + { + "epoch": 14.19, + "grad_norm": 3.7532005310058594, + "learning_rate": 9.714861264112273e-06, + "loss": 0.9739, + "step": 47435 + }, + { + "epoch": 14.19, + "grad_norm": 7.380504608154297, + "learning_rate": 9.710212529709618e-06, + "loss": 1.0136, + "step": 47440 + }, + { + "epoch": 14.19, + "grad_norm": 3.0353171825408936, + "learning_rate": 9.705564639746045e-06, + "loss": 1.1409, + "step": 47445 + }, + { + "epoch": 14.2, + "grad_norm": 4.069791793823242, + "learning_rate": 9.700917594478223e-06, + "loss": 0.8443, + "step": 47450 + }, + { + "epoch": 14.2, + "grad_norm": 2.922313928604126, + "learning_rate": 9.69627139416284e-06, + "loss": 0.9226, + "step": 47455 + }, + { + "epoch": 14.2, + "grad_norm": 2.804293632507324, + "learning_rate": 9.69162603905647e-06, + "loss": 1.0484, + "step": 47460 + }, + { + "epoch": 14.2, + "grad_norm": 1.4043070077896118, + "learning_rate": 9.68698152941568e-06, + "loss": 1.0361, + "step": 47465 + }, + { + "epoch": 14.2, + "grad_norm": 2.1712961196899414, + "learning_rate": 9.682337865496985e-06, + "loss": 1.0993, + "step": 47470 + }, + { + "epoch": 14.2, + "grad_norm": 3.498368740081787, + "learning_rate": 9.677695047556848e-06, + "loss": 0.9285, + "step": 47475 + }, + { + "epoch": 14.21, + "grad_norm": 2.01509428024292, + "learning_rate": 9.673053075851687e-06, + "loss": 1.0765, + "step": 47480 + }, + { + "epoch": 14.21, + "grad_norm": 1.8657152652740479, + "learning_rate": 9.668411950637874e-06, + "loss": 1.0225, + "step": 47485 + }, + { + "epoch": 14.21, + "grad_norm": 3.4774529933929443, + "learning_rate": 9.66377167217173e-06, + "loss": 0.9132, + "step": 47490 + }, + { + "epoch": 14.21, + "grad_norm": 3.322037696838379, + "learning_rate": 9.659132240709536e-06, + "loss": 0.9264, + "step": 47495 + }, + { + "epoch": 14.21, + "grad_norm": 3.845637321472168, + "learning_rate": 9.654493656507524e-06, + "loss": 0.883, + "step": 47500 + }, + { + "epoch": 14.21, + "grad_norm": 3.871750593185425, + "learning_rate": 9.649855919821874e-06, + "loss": 0.9087, + "step": 47505 + }, + { + "epoch": 14.21, + "grad_norm": 6.507814407348633, + "learning_rate": 9.645219030908728e-06, + "loss": 0.8509, + "step": 47510 + }, + { + "epoch": 14.22, + "grad_norm": 3.187875270843506, + "learning_rate": 9.640582990024175e-06, + "loss": 0.7997, + "step": 47515 + }, + { + "epoch": 14.22, + "grad_norm": 2.1788666248321533, + "learning_rate": 9.635947797424266e-06, + "loss": 0.8929, + "step": 47520 + }, + { + "epoch": 14.22, + "grad_norm": 1.6329278945922852, + "learning_rate": 9.63131345336497e-06, + "loss": 0.9739, + "step": 47525 + }, + { + "epoch": 14.22, + "grad_norm": 4.640608787536621, + "learning_rate": 9.62667995810228e-06, + "loss": 0.8712, + "step": 47530 + }, + { + "epoch": 14.22, + "grad_norm": 2.9405198097229004, + "learning_rate": 9.622047311892055e-06, + "loss": 0.9474, + "step": 47535 + }, + { + "epoch": 14.22, + "grad_norm": 1.338927149772644, + "learning_rate": 9.61741551499019e-06, + "loss": 1.0767, + "step": 47540 + }, + { + "epoch": 14.22, + "grad_norm": 1.2258890867233276, + "learning_rate": 9.61278456765246e-06, + "loss": 1.1003, + "step": 47545 + }, + { + "epoch": 14.23, + "grad_norm": 1.6150643825531006, + "learning_rate": 9.608154470134665e-06, + "loss": 1.1114, + "step": 47550 + }, + { + "epoch": 14.23, + "grad_norm": 4.458968639373779, + "learning_rate": 9.603525222692486e-06, + "loss": 0.9945, + "step": 47555 + }, + { + "epoch": 14.23, + "grad_norm": 4.157359600067139, + "learning_rate": 9.598896825581607e-06, + "loss": 0.9888, + "step": 47560 + }, + { + "epoch": 14.23, + "grad_norm": 3.2777342796325684, + "learning_rate": 9.59426927905765e-06, + "loss": 0.7707, + "step": 47565 + }, + { + "epoch": 14.23, + "grad_norm": 3.0390613079071045, + "learning_rate": 9.589642583376186e-06, + "loss": 0.9935, + "step": 47570 + }, + { + "epoch": 14.23, + "grad_norm": 3.542185068130493, + "learning_rate": 9.58501673879274e-06, + "loss": 0.8352, + "step": 47575 + }, + { + "epoch": 14.24, + "grad_norm": 3.9005398750305176, + "learning_rate": 9.580391745562802e-06, + "loss": 1.0942, + "step": 47580 + }, + { + "epoch": 14.24, + "grad_norm": 3.451244592666626, + "learning_rate": 9.575767603941798e-06, + "loss": 0.9602, + "step": 47585 + }, + { + "epoch": 14.24, + "grad_norm": 1.035033941268921, + "learning_rate": 9.571144314185116e-06, + "loss": 1.1651, + "step": 47590 + }, + { + "epoch": 14.24, + "grad_norm": 3.617086410522461, + "learning_rate": 9.566521876548096e-06, + "loss": 0.9759, + "step": 47595 + }, + { + "epoch": 14.24, + "grad_norm": 3.698251485824585, + "learning_rate": 9.561900291286032e-06, + "loss": 0.9557, + "step": 47600 + }, + { + "epoch": 14.24, + "grad_norm": 0.981471061706543, + "learning_rate": 9.557279558654167e-06, + "loss": 0.8647, + "step": 47605 + }, + { + "epoch": 14.24, + "grad_norm": 1.0644017457962036, + "learning_rate": 9.552659678907699e-06, + "loss": 0.8904, + "step": 47610 + }, + { + "epoch": 14.25, + "grad_norm": 2.2468714714050293, + "learning_rate": 9.548040652301792e-06, + "loss": 1.0655, + "step": 47615 + }, + { + "epoch": 14.25, + "grad_norm": 3.204252004623413, + "learning_rate": 9.543422479091518e-06, + "loss": 1.083, + "step": 47620 + }, + { + "epoch": 14.25, + "grad_norm": 3.293745279312134, + "learning_rate": 9.538805159531974e-06, + "loss": 1.0888, + "step": 47625 + }, + { + "epoch": 14.25, + "grad_norm": 2.6639957427978516, + "learning_rate": 9.534188693878131e-06, + "loss": 1.0, + "step": 47630 + }, + { + "epoch": 14.25, + "grad_norm": 3.4556894302368164, + "learning_rate": 9.529573082384988e-06, + "loss": 1.0495, + "step": 47635 + }, + { + "epoch": 14.25, + "grad_norm": 2.608086347579956, + "learning_rate": 9.524958325307426e-06, + "loss": 0.9045, + "step": 47640 + }, + { + "epoch": 14.25, + "grad_norm": 3.2534093856811523, + "learning_rate": 9.520344422900348e-06, + "loss": 1.1092, + "step": 47645 + }, + { + "epoch": 14.26, + "grad_norm": 1.683605670928955, + "learning_rate": 9.515731375418549e-06, + "loss": 0.9562, + "step": 47650 + }, + { + "epoch": 14.26, + "grad_norm": 4.434023857116699, + "learning_rate": 9.511119183116812e-06, + "loss": 0.8168, + "step": 47655 + }, + { + "epoch": 14.26, + "grad_norm": 2.663111448287964, + "learning_rate": 9.506507846249859e-06, + "loss": 0.95, + "step": 47660 + }, + { + "epoch": 14.26, + "grad_norm": 3.199991226196289, + "learning_rate": 9.501897365072367e-06, + "loss": 0.9858, + "step": 47665 + }, + { + "epoch": 14.26, + "grad_norm": 2.4415783882141113, + "learning_rate": 9.497287739838992e-06, + "loss": 1.1561, + "step": 47670 + }, + { + "epoch": 14.26, + "grad_norm": 4.541195392608643, + "learning_rate": 9.492678970804283e-06, + "loss": 0.9719, + "step": 47675 + }, + { + "epoch": 14.27, + "grad_norm": 3.981882095336914, + "learning_rate": 9.488071058222814e-06, + "loss": 0.984, + "step": 47680 + }, + { + "epoch": 14.27, + "grad_norm": 7.917405128479004, + "learning_rate": 9.483464002349046e-06, + "loss": 1.0035, + "step": 47685 + }, + { + "epoch": 14.27, + "grad_norm": 2.864999771118164, + "learning_rate": 9.478857803437433e-06, + "loss": 0.9655, + "step": 47690 + }, + { + "epoch": 14.27, + "grad_norm": 3.0915513038635254, + "learning_rate": 9.474252461742373e-06, + "loss": 1.0438, + "step": 47695 + }, + { + "epoch": 14.27, + "grad_norm": 2.601701021194458, + "learning_rate": 9.469647977518207e-06, + "loss": 0.9761, + "step": 47700 + }, + { + "epoch": 14.27, + "grad_norm": 4.57741641998291, + "learning_rate": 9.465044351019243e-06, + "loss": 1.025, + "step": 47705 + }, + { + "epoch": 14.27, + "grad_norm": 4.853657245635986, + "learning_rate": 9.460441582499729e-06, + "loss": 1.1026, + "step": 47710 + }, + { + "epoch": 14.28, + "grad_norm": 3.169487476348877, + "learning_rate": 9.455839672213878e-06, + "loss": 0.9469, + "step": 47715 + }, + { + "epoch": 14.28, + "grad_norm": 2.6972315311431885, + "learning_rate": 9.451238620415851e-06, + "loss": 1.1723, + "step": 47720 + }, + { + "epoch": 14.28, + "grad_norm": 2.2301697731018066, + "learning_rate": 9.446638427359736e-06, + "loss": 0.8984, + "step": 47725 + }, + { + "epoch": 14.28, + "grad_norm": 2.9678409099578857, + "learning_rate": 9.442039093299631e-06, + "loss": 0.9494, + "step": 47730 + }, + { + "epoch": 14.28, + "grad_norm": 1.5792745351791382, + "learning_rate": 9.437440618489518e-06, + "loss": 1.1286, + "step": 47735 + }, + { + "epoch": 14.28, + "grad_norm": 2.416245222091675, + "learning_rate": 9.432843003183392e-06, + "loss": 0.9913, + "step": 47740 + }, + { + "epoch": 14.28, + "grad_norm": 2.8561947345733643, + "learning_rate": 9.428246247635176e-06, + "loss": 0.8804, + "step": 47745 + }, + { + "epoch": 14.29, + "grad_norm": 3.78583025932312, + "learning_rate": 9.423650352098712e-06, + "loss": 0.9608, + "step": 47750 + }, + { + "epoch": 14.29, + "grad_norm": 2.2284066677093506, + "learning_rate": 9.419055316827869e-06, + "loss": 0.9273, + "step": 47755 + }, + { + "epoch": 14.29, + "grad_norm": 2.1891860961914062, + "learning_rate": 9.41446114207639e-06, + "loss": 0.8688, + "step": 47760 + }, + { + "epoch": 14.29, + "grad_norm": 7.929460048675537, + "learning_rate": 9.409867828098035e-06, + "loss": 0.9384, + "step": 47765 + }, + { + "epoch": 14.29, + "grad_norm": 1.693267822265625, + "learning_rate": 9.405275375146458e-06, + "loss": 1.0242, + "step": 47770 + }, + { + "epoch": 14.29, + "grad_norm": 4.085503578186035, + "learning_rate": 9.40068378347533e-06, + "loss": 1.039, + "step": 47775 + }, + { + "epoch": 14.3, + "grad_norm": 2.429783821105957, + "learning_rate": 9.396093053338218e-06, + "loss": 1.0925, + "step": 47780 + }, + { + "epoch": 14.3, + "grad_norm": 1.4568767547607422, + "learning_rate": 9.391503184988661e-06, + "loss": 0.906, + "step": 47785 + }, + { + "epoch": 14.3, + "grad_norm": 2.0767431259155273, + "learning_rate": 9.386914178680162e-06, + "loss": 0.9838, + "step": 47790 + }, + { + "epoch": 14.3, + "grad_norm": 1.0575764179229736, + "learning_rate": 9.382326034666164e-06, + "loss": 0.983, + "step": 47795 + }, + { + "epoch": 14.3, + "grad_norm": 1.7569162845611572, + "learning_rate": 9.377738753200066e-06, + "loss": 0.9052, + "step": 47800 + }, + { + "epoch": 14.3, + "grad_norm": 1.5973174571990967, + "learning_rate": 9.373152334535216e-06, + "loss": 1.0071, + "step": 47805 + }, + { + "epoch": 14.3, + "grad_norm": 2.8834640979766846, + "learning_rate": 9.36856677892492e-06, + "loss": 0.9311, + "step": 47810 + }, + { + "epoch": 14.31, + "grad_norm": 1.4674243927001953, + "learning_rate": 9.363982086622442e-06, + "loss": 0.9815, + "step": 47815 + }, + { + "epoch": 14.31, + "grad_norm": 1.9711565971374512, + "learning_rate": 9.359398257880963e-06, + "loss": 0.9914, + "step": 47820 + }, + { + "epoch": 14.31, + "grad_norm": 3.0938401222229004, + "learning_rate": 9.35481529295367e-06, + "loss": 1.0849, + "step": 47825 + }, + { + "epoch": 14.31, + "grad_norm": 3.689774513244629, + "learning_rate": 9.350233192093667e-06, + "loss": 1.2623, + "step": 47830 + }, + { + "epoch": 14.31, + "grad_norm": 11.061504364013672, + "learning_rate": 9.345651955554016e-06, + "loss": 0.9138, + "step": 47835 + }, + { + "epoch": 14.31, + "grad_norm": 2.238264560699463, + "learning_rate": 9.341071583587745e-06, + "loss": 1.0129, + "step": 47840 + }, + { + "epoch": 14.31, + "grad_norm": 3.2770984172821045, + "learning_rate": 9.336492076447794e-06, + "loss": 0.8907, + "step": 47845 + }, + { + "epoch": 14.32, + "grad_norm": 2.24216890335083, + "learning_rate": 9.331913434387127e-06, + "loss": 1.0423, + "step": 47850 + }, + { + "epoch": 14.32, + "grad_norm": 1.8534948825836182, + "learning_rate": 9.327335657658573e-06, + "loss": 1.002, + "step": 47855 + }, + { + "epoch": 14.32, + "grad_norm": 3.518404006958008, + "learning_rate": 9.322758746515e-06, + "loss": 1.0224, + "step": 47860 + }, + { + "epoch": 14.32, + "grad_norm": 2.3978283405303955, + "learning_rate": 9.318182701209147e-06, + "loss": 1.194, + "step": 47865 + }, + { + "epoch": 14.32, + "grad_norm": 2.3987321853637695, + "learning_rate": 9.31360752199378e-06, + "loss": 0.983, + "step": 47870 + }, + { + "epoch": 14.32, + "grad_norm": 4.193480491638184, + "learning_rate": 9.309033209121556e-06, + "loss": 0.8785, + "step": 47875 + }, + { + "epoch": 14.33, + "grad_norm": 2.2980704307556152, + "learning_rate": 9.30445976284512e-06, + "loss": 1.0265, + "step": 47880 + }, + { + "epoch": 14.33, + "grad_norm": 2.2900497913360596, + "learning_rate": 9.299887183417055e-06, + "loss": 0.9558, + "step": 47885 + }, + { + "epoch": 14.33, + "grad_norm": 1.8453694581985474, + "learning_rate": 9.295315471089903e-06, + "loss": 1.0211, + "step": 47890 + }, + { + "epoch": 14.33, + "grad_norm": 4.543096542358398, + "learning_rate": 9.290744626116152e-06, + "loss": 1.0024, + "step": 47895 + }, + { + "epoch": 14.33, + "grad_norm": 4.047824859619141, + "learning_rate": 9.286174648748247e-06, + "loss": 1.0203, + "step": 47900 + }, + { + "epoch": 14.33, + "grad_norm": 2.266964912414551, + "learning_rate": 9.28160553923858e-06, + "loss": 0.9312, + "step": 47905 + }, + { + "epoch": 14.33, + "grad_norm": 3.8564565181732178, + "learning_rate": 9.277037297839506e-06, + "loss": 0.998, + "step": 47910 + }, + { + "epoch": 14.34, + "grad_norm": 4.338145732879639, + "learning_rate": 9.272469924803315e-06, + "loss": 0.8935, + "step": 47915 + }, + { + "epoch": 14.34, + "grad_norm": 2.9112906455993652, + "learning_rate": 9.267903420382262e-06, + "loss": 0.9426, + "step": 47920 + }, + { + "epoch": 14.34, + "grad_norm": 2.4785072803497314, + "learning_rate": 9.263337784828551e-06, + "loss": 1.1364, + "step": 47925 + }, + { + "epoch": 14.34, + "grad_norm": 2.290468692779541, + "learning_rate": 9.258773018394337e-06, + "loss": 0.8601, + "step": 47930 + }, + { + "epoch": 14.34, + "grad_norm": 1.839494228363037, + "learning_rate": 9.254209121331728e-06, + "loss": 1.0367, + "step": 47935 + }, + { + "epoch": 14.34, + "grad_norm": 2.684028387069702, + "learning_rate": 9.249646093892783e-06, + "loss": 1.0927, + "step": 47940 + }, + { + "epoch": 14.34, + "grad_norm": 13.804609298706055, + "learning_rate": 9.245083936329521e-06, + "loss": 1.0277, + "step": 47945 + }, + { + "epoch": 14.35, + "grad_norm": 1.9250423908233643, + "learning_rate": 9.24052264889388e-06, + "loss": 0.9594, + "step": 47950 + }, + { + "epoch": 14.35, + "grad_norm": 1.3630380630493164, + "learning_rate": 9.235962231837805e-06, + "loss": 0.9599, + "step": 47955 + }, + { + "epoch": 14.35, + "grad_norm": 3.4717135429382324, + "learning_rate": 9.231402685413137e-06, + "loss": 1.0512, + "step": 47960 + }, + { + "epoch": 14.35, + "grad_norm": 2.14388370513916, + "learning_rate": 9.226844009871723e-06, + "loss": 0.8996, + "step": 47965 + }, + { + "epoch": 14.35, + "grad_norm": 4.260863304138184, + "learning_rate": 9.222286205465314e-06, + "loss": 0.9731, + "step": 47970 + }, + { + "epoch": 14.35, + "grad_norm": 1.8017550706863403, + "learning_rate": 9.217729272445636e-06, + "loss": 0.8765, + "step": 47975 + }, + { + "epoch": 14.36, + "grad_norm": 2.0987396240234375, + "learning_rate": 9.213173211064364e-06, + "loss": 1.0251, + "step": 47980 + }, + { + "epoch": 14.36, + "grad_norm": 2.1268539428710938, + "learning_rate": 9.208618021573126e-06, + "loss": 0.9544, + "step": 47985 + }, + { + "epoch": 14.36, + "grad_norm": 4.74144172668457, + "learning_rate": 9.2040637042235e-06, + "loss": 0.9862, + "step": 47990 + }, + { + "epoch": 14.36, + "grad_norm": 2.23129940032959, + "learning_rate": 9.199510259267018e-06, + "loss": 1.0009, + "step": 47995 + }, + { + "epoch": 14.36, + "grad_norm": 2.542621612548828, + "learning_rate": 9.194957686955164e-06, + "loss": 0.9754, + "step": 48000 + }, + { + "epoch": 14.36, + "grad_norm": 2.1914637088775635, + "learning_rate": 9.190405987539363e-06, + "loss": 1.049, + "step": 48005 + }, + { + "epoch": 14.36, + "grad_norm": 5.300290107727051, + "learning_rate": 9.185855161271009e-06, + "loss": 1.0649, + "step": 48010 + }, + { + "epoch": 14.37, + "grad_norm": 3.934770107269287, + "learning_rate": 9.181305208401437e-06, + "loss": 0.9544, + "step": 48015 + }, + { + "epoch": 14.37, + "grad_norm": 2.419064521789551, + "learning_rate": 9.176756129181935e-06, + "loss": 0.9372, + "step": 48020 + }, + { + "epoch": 14.37, + "grad_norm": 2.2804276943206787, + "learning_rate": 9.172207923863746e-06, + "loss": 0.92, + "step": 48025 + }, + { + "epoch": 14.37, + "grad_norm": 4.2205400466918945, + "learning_rate": 9.167660592698058e-06, + "loss": 1.0041, + "step": 48030 + }, + { + "epoch": 14.37, + "grad_norm": 5.472626209259033, + "learning_rate": 9.163114135936022e-06, + "loss": 1.0433, + "step": 48035 + }, + { + "epoch": 14.37, + "grad_norm": 3.6936001777648926, + "learning_rate": 9.158568553828737e-06, + "loss": 1.1153, + "step": 48040 + }, + { + "epoch": 14.37, + "grad_norm": 2.1582984924316406, + "learning_rate": 9.154023846627227e-06, + "loss": 0.9568, + "step": 48045 + }, + { + "epoch": 14.38, + "grad_norm": 3.0322723388671875, + "learning_rate": 9.149480014582529e-06, + "loss": 1.1763, + "step": 48050 + }, + { + "epoch": 14.38, + "grad_norm": 1.92725670337677, + "learning_rate": 9.144937057945555e-06, + "loss": 0.9095, + "step": 48055 + }, + { + "epoch": 14.38, + "grad_norm": 1.9671467542648315, + "learning_rate": 9.140394976967246e-06, + "loss": 1.1279, + "step": 48060 + }, + { + "epoch": 14.38, + "grad_norm": 1.5574039220809937, + "learning_rate": 9.135853771898418e-06, + "loss": 1.1226, + "step": 48065 + }, + { + "epoch": 14.38, + "grad_norm": 2.8291425704956055, + "learning_rate": 9.131313442989914e-06, + "loss": 0.9447, + "step": 48070 + }, + { + "epoch": 14.38, + "grad_norm": 3.1782844066619873, + "learning_rate": 9.126773990492466e-06, + "loss": 0.9002, + "step": 48075 + }, + { + "epoch": 14.38, + "grad_norm": 1.8452153205871582, + "learning_rate": 9.122235414656788e-06, + "loss": 1.1123, + "step": 48080 + }, + { + "epoch": 14.39, + "grad_norm": 6.0874834060668945, + "learning_rate": 9.117697715733548e-06, + "loss": 0.9248, + "step": 48085 + }, + { + "epoch": 14.39, + "grad_norm": 2.1895172595977783, + "learning_rate": 9.113160893973352e-06, + "loss": 1.0326, + "step": 48090 + }, + { + "epoch": 14.39, + "grad_norm": 1.827024221420288, + "learning_rate": 9.108624949626767e-06, + "loss": 0.9664, + "step": 48095 + }, + { + "epoch": 14.39, + "grad_norm": 1.7563681602478027, + "learning_rate": 9.104089882944308e-06, + "loss": 0.9242, + "step": 48100 + }, + { + "epoch": 14.39, + "grad_norm": 4.570804119110107, + "learning_rate": 9.099555694176443e-06, + "loss": 1.0313, + "step": 48105 + }, + { + "epoch": 14.39, + "grad_norm": 3.724151849746704, + "learning_rate": 9.095022383573587e-06, + "loss": 1.0457, + "step": 48110 + }, + { + "epoch": 14.4, + "grad_norm": 1.7167346477508545, + "learning_rate": 9.090489951386114e-06, + "loss": 1.0725, + "step": 48115 + }, + { + "epoch": 14.4, + "grad_norm": 3.227980136871338, + "learning_rate": 9.085958397864344e-06, + "loss": 1.0361, + "step": 48120 + }, + { + "epoch": 14.4, + "grad_norm": 3.609764337539673, + "learning_rate": 9.081427723258552e-06, + "loss": 1.0083, + "step": 48125 + }, + { + "epoch": 14.4, + "grad_norm": 2.5227441787719727, + "learning_rate": 9.076897927818956e-06, + "loss": 1.0384, + "step": 48130 + }, + { + "epoch": 14.4, + "grad_norm": 2.815845251083374, + "learning_rate": 9.072369011795747e-06, + "loss": 1.0781, + "step": 48135 + }, + { + "epoch": 14.4, + "grad_norm": 2.0394959449768066, + "learning_rate": 9.067840975439026e-06, + "loss": 1.0609, + "step": 48140 + }, + { + "epoch": 14.4, + "grad_norm": 2.39359712600708, + "learning_rate": 9.063313818998903e-06, + "loss": 0.9759, + "step": 48145 + }, + { + "epoch": 14.41, + "grad_norm": 2.754600763320923, + "learning_rate": 9.058787542725375e-06, + "loss": 0.7847, + "step": 48150 + }, + { + "epoch": 14.41, + "grad_norm": 2.4412641525268555, + "learning_rate": 9.054262146868459e-06, + "loss": 1.2569, + "step": 48155 + }, + { + "epoch": 14.41, + "grad_norm": 3.2604317665100098, + "learning_rate": 9.049737631678055e-06, + "loss": 1.1841, + "step": 48160 + }, + { + "epoch": 14.41, + "grad_norm": 1.3231927156448364, + "learning_rate": 9.045213997404075e-06, + "loss": 0.9842, + "step": 48165 + }, + { + "epoch": 14.41, + "grad_norm": 1.9147082567214966, + "learning_rate": 9.040691244296335e-06, + "loss": 0.9722, + "step": 48170 + }, + { + "epoch": 14.41, + "grad_norm": 2.9659924507141113, + "learning_rate": 9.036169372604627e-06, + "loss": 1.0222, + "step": 48175 + }, + { + "epoch": 14.41, + "grad_norm": 2.9592783451080322, + "learning_rate": 9.031648382578695e-06, + "loss": 0.9292, + "step": 48180 + }, + { + "epoch": 14.42, + "grad_norm": 1.969634771347046, + "learning_rate": 9.027128274468221e-06, + "loss": 1.1377, + "step": 48185 + }, + { + "epoch": 14.42, + "grad_norm": 1.3826353549957275, + "learning_rate": 9.022609048522854e-06, + "loss": 1.149, + "step": 48190 + }, + { + "epoch": 14.42, + "grad_norm": 3.4229960441589355, + "learning_rate": 9.018090704992177e-06, + "loss": 1.0666, + "step": 48195 + }, + { + "epoch": 14.42, + "grad_norm": 9.103164672851562, + "learning_rate": 9.013573244125742e-06, + "loss": 0.9819, + "step": 48200 + }, + { + "epoch": 14.42, + "grad_norm": 1.6387892961502075, + "learning_rate": 9.009056666173039e-06, + "loss": 1.0427, + "step": 48205 + }, + { + "epoch": 14.42, + "grad_norm": 5.4994425773620605, + "learning_rate": 9.00454097138351e-06, + "loss": 0.9706, + "step": 48210 + }, + { + "epoch": 14.43, + "grad_norm": 1.8383941650390625, + "learning_rate": 9.000026160006561e-06, + "loss": 1.0697, + "step": 48215 + }, + { + "epoch": 14.43, + "grad_norm": 4.727527618408203, + "learning_rate": 8.995512232291537e-06, + "loss": 1.0467, + "step": 48220 + }, + { + "epoch": 14.43, + "grad_norm": 3.8419857025146484, + "learning_rate": 8.990999188487734e-06, + "loss": 0.7724, + "step": 48225 + }, + { + "epoch": 14.43, + "grad_norm": 1.8518542051315308, + "learning_rate": 8.986487028844415e-06, + "loss": 1.0587, + "step": 48230 + }, + { + "epoch": 14.43, + "grad_norm": 2.0114905834198, + "learning_rate": 8.981975753610756e-06, + "loss": 1.127, + "step": 48235 + }, + { + "epoch": 14.43, + "grad_norm": 1.4375, + "learning_rate": 8.977465363035942e-06, + "loss": 1.2177, + "step": 48240 + }, + { + "epoch": 14.43, + "grad_norm": 4.793522834777832, + "learning_rate": 8.972955857369045e-06, + "loss": 0.9406, + "step": 48245 + }, + { + "epoch": 14.44, + "grad_norm": 3.620276927947998, + "learning_rate": 8.968447236859154e-06, + "loss": 0.8755, + "step": 48250 + }, + { + "epoch": 14.44, + "grad_norm": 4.639894962310791, + "learning_rate": 8.963939501755242e-06, + "loss": 1.0752, + "step": 48255 + }, + { + "epoch": 14.44, + "grad_norm": 1.244881510734558, + "learning_rate": 8.959432652306299e-06, + "loss": 0.8917, + "step": 48260 + }, + { + "epoch": 14.44, + "grad_norm": 3.2123985290527344, + "learning_rate": 8.954926688761208e-06, + "loss": 1.0363, + "step": 48265 + }, + { + "epoch": 14.44, + "grad_norm": 2.0880956649780273, + "learning_rate": 8.950421611368836e-06, + "loss": 0.9985, + "step": 48270 + }, + { + "epoch": 14.44, + "grad_norm": 5.486839771270752, + "learning_rate": 8.945917420378e-06, + "loss": 1.0174, + "step": 48275 + }, + { + "epoch": 14.44, + "grad_norm": 2.638864040374756, + "learning_rate": 8.941414116037458e-06, + "loss": 0.9216, + "step": 48280 + }, + { + "epoch": 14.45, + "grad_norm": 3.484980344772339, + "learning_rate": 8.936911698595921e-06, + "loss": 0.9612, + "step": 48285 + }, + { + "epoch": 14.45, + "grad_norm": 4.663930416107178, + "learning_rate": 8.932410168302052e-06, + "loss": 1.0834, + "step": 48290 + }, + { + "epoch": 14.45, + "grad_norm": 3.5646653175354004, + "learning_rate": 8.92790952540447e-06, + "loss": 0.9851, + "step": 48295 + }, + { + "epoch": 14.45, + "grad_norm": 3.214585304260254, + "learning_rate": 8.923409770151739e-06, + "loss": 0.9698, + "step": 48300 + }, + { + "epoch": 14.45, + "grad_norm": 1.7530304193496704, + "learning_rate": 8.918910902792377e-06, + "loss": 0.9638, + "step": 48305 + }, + { + "epoch": 14.45, + "grad_norm": 3.360421895980835, + "learning_rate": 8.914412923574848e-06, + "loss": 0.979, + "step": 48310 + }, + { + "epoch": 14.46, + "grad_norm": 2.919114589691162, + "learning_rate": 8.909915832747573e-06, + "loss": 0.9093, + "step": 48315 + }, + { + "epoch": 14.46, + "grad_norm": 1.921119213104248, + "learning_rate": 8.905419630558922e-06, + "loss": 0.9799, + "step": 48320 + }, + { + "epoch": 14.46, + "grad_norm": 3.1176345348358154, + "learning_rate": 8.900924317257226e-06, + "loss": 1.0288, + "step": 48325 + }, + { + "epoch": 14.46, + "grad_norm": 2.6858246326446533, + "learning_rate": 8.896429893090729e-06, + "loss": 1.0427, + "step": 48330 + }, + { + "epoch": 14.46, + "grad_norm": 30.713518142700195, + "learning_rate": 8.891936358307685e-06, + "loss": 1.0085, + "step": 48335 + }, + { + "epoch": 14.46, + "grad_norm": 4.43396520614624, + "learning_rate": 8.887443713156237e-06, + "loss": 0.8242, + "step": 48340 + }, + { + "epoch": 14.46, + "grad_norm": 3.269070625305176, + "learning_rate": 8.882951957884541e-06, + "loss": 1.0081, + "step": 48345 + }, + { + "epoch": 14.47, + "grad_norm": 3.0732762813568115, + "learning_rate": 8.87846109274064e-06, + "loss": 1.0627, + "step": 48350 + }, + { + "epoch": 14.47, + "grad_norm": 1.8434253931045532, + "learning_rate": 8.87397111797259e-06, + "loss": 0.9998, + "step": 48355 + }, + { + "epoch": 14.47, + "grad_norm": 1.8734230995178223, + "learning_rate": 8.869482033828347e-06, + "loss": 1.0262, + "step": 48360 + }, + { + "epoch": 14.47, + "grad_norm": 2.2515382766723633, + "learning_rate": 8.864993840555844e-06, + "loss": 1.006, + "step": 48365 + }, + { + "epoch": 14.47, + "grad_norm": 1.439144253730774, + "learning_rate": 8.86050653840296e-06, + "loss": 0.9979, + "step": 48370 + }, + { + "epoch": 14.47, + "grad_norm": 3.4755709171295166, + "learning_rate": 8.856020127617524e-06, + "loss": 0.8835, + "step": 48375 + }, + { + "epoch": 14.47, + "grad_norm": 2.903697967529297, + "learning_rate": 8.851534608447311e-06, + "loss": 1.0311, + "step": 48380 + }, + { + "epoch": 14.48, + "grad_norm": 2.571676731109619, + "learning_rate": 8.847049981140063e-06, + "loss": 0.9793, + "step": 48385 + }, + { + "epoch": 14.48, + "grad_norm": 2.435349702835083, + "learning_rate": 8.84256624594345e-06, + "loss": 0.858, + "step": 48390 + }, + { + "epoch": 14.48, + "grad_norm": 1.679379940032959, + "learning_rate": 8.83808340310511e-06, + "loss": 1.1234, + "step": 48395 + }, + { + "epoch": 14.48, + "grad_norm": 4.435678005218506, + "learning_rate": 8.833601452872625e-06, + "loss": 0.9629, + "step": 48400 + }, + { + "epoch": 14.48, + "grad_norm": 3.5596141815185547, + "learning_rate": 8.829120395493527e-06, + "loss": 1.2037, + "step": 48405 + }, + { + "epoch": 14.48, + "grad_norm": 4.797013759613037, + "learning_rate": 8.8246402312153e-06, + "loss": 1.1643, + "step": 48410 + }, + { + "epoch": 14.49, + "grad_norm": 2.486677885055542, + "learning_rate": 8.820160960285382e-06, + "loss": 1.0333, + "step": 48415 + }, + { + "epoch": 14.49, + "grad_norm": 1.6059032678604126, + "learning_rate": 8.815682582951163e-06, + "loss": 1.1628, + "step": 48420 + }, + { + "epoch": 14.49, + "grad_norm": 2.2335846424102783, + "learning_rate": 8.811205099459954e-06, + "loss": 0.9198, + "step": 48425 + }, + { + "epoch": 14.49, + "grad_norm": 2.2356197834014893, + "learning_rate": 8.806728510059078e-06, + "loss": 1.0291, + "step": 48430 + }, + { + "epoch": 14.49, + "grad_norm": 4.154965400695801, + "learning_rate": 8.802252814995738e-06, + "loss": 1.0695, + "step": 48435 + }, + { + "epoch": 14.49, + "grad_norm": 1.8840675354003906, + "learning_rate": 8.797778014517156e-06, + "loss": 1.07, + "step": 48440 + }, + { + "epoch": 14.49, + "grad_norm": 7.801070213317871, + "learning_rate": 8.793304108870434e-06, + "loss": 0.9854, + "step": 48445 + }, + { + "epoch": 14.5, + "grad_norm": 2.220947027206421, + "learning_rate": 8.7888310983027e-06, + "loss": 1.0613, + "step": 48450 + }, + { + "epoch": 14.5, + "grad_norm": 2.6094961166381836, + "learning_rate": 8.784358983060964e-06, + "loss": 0.9946, + "step": 48455 + }, + { + "epoch": 14.5, + "grad_norm": 2.7676682472229004, + "learning_rate": 8.779887763392225e-06, + "loss": 1.0854, + "step": 48460 + }, + { + "epoch": 14.5, + "grad_norm": 4.184953212738037, + "learning_rate": 8.775417439543427e-06, + "loss": 0.7902, + "step": 48465 + }, + { + "epoch": 14.5, + "grad_norm": 2.03581166267395, + "learning_rate": 8.770948011761456e-06, + "loss": 1.0086, + "step": 48470 + }, + { + "epoch": 14.5, + "grad_norm": 3.659592866897583, + "learning_rate": 8.76647948029316e-06, + "loss": 0.8873, + "step": 48475 + }, + { + "epoch": 14.5, + "grad_norm": 4.103082656860352, + "learning_rate": 8.762011845385321e-06, + "loss": 1.1404, + "step": 48480 + }, + { + "epoch": 14.51, + "grad_norm": 2.476278066635132, + "learning_rate": 8.757545107284704e-06, + "loss": 1.0754, + "step": 48485 + }, + { + "epoch": 14.51, + "grad_norm": 3.953524112701416, + "learning_rate": 8.753079266237978e-06, + "loss": 0.9528, + "step": 48490 + }, + { + "epoch": 14.51, + "grad_norm": 1.5628539323806763, + "learning_rate": 8.748614322491799e-06, + "loss": 0.9617, + "step": 48495 + }, + { + "epoch": 14.51, + "grad_norm": 2.0729362964630127, + "learning_rate": 8.744150276292759e-06, + "loss": 1.0597, + "step": 48500 + }, + { + "epoch": 14.51, + "grad_norm": 3.8265740871429443, + "learning_rate": 8.739687127887399e-06, + "loss": 1.1671, + "step": 48505 + }, + { + "epoch": 14.51, + "grad_norm": 2.81380558013916, + "learning_rate": 8.73522487752222e-06, + "loss": 0.9367, + "step": 48510 + }, + { + "epoch": 14.52, + "grad_norm": 3.2788760662078857, + "learning_rate": 8.730763525443663e-06, + "loss": 1.0131, + "step": 48515 + }, + { + "epoch": 14.52, + "grad_norm": 1.7596465349197388, + "learning_rate": 8.726303071898128e-06, + "loss": 0.9415, + "step": 48520 + }, + { + "epoch": 14.52, + "grad_norm": 3.689640760421753, + "learning_rate": 8.721843517131967e-06, + "loss": 1.0165, + "step": 48525 + }, + { + "epoch": 14.52, + "grad_norm": 3.1340339183807373, + "learning_rate": 8.717384861391448e-06, + "loss": 0.9638, + "step": 48530 + }, + { + "epoch": 14.52, + "grad_norm": 3.0869131088256836, + "learning_rate": 8.712927104922859e-06, + "loss": 1.1088, + "step": 48535 + }, + { + "epoch": 14.52, + "grad_norm": 2.496973991394043, + "learning_rate": 8.708470247972359e-06, + "loss": 1.0464, + "step": 48540 + }, + { + "epoch": 14.52, + "grad_norm": 3.2096173763275146, + "learning_rate": 8.704014290786128e-06, + "loss": 0.9417, + "step": 48545 + }, + { + "epoch": 14.53, + "grad_norm": 3.4847817420959473, + "learning_rate": 8.699559233610242e-06, + "loss": 0.8683, + "step": 48550 + }, + { + "epoch": 14.53, + "grad_norm": 2.737156867980957, + "learning_rate": 8.695105076690745e-06, + "loss": 0.8797, + "step": 48555 + }, + { + "epoch": 14.53, + "grad_norm": 1.7218374013900757, + "learning_rate": 8.690651820273668e-06, + "loss": 1.0199, + "step": 48560 + }, + { + "epoch": 14.53, + "grad_norm": 2.990959882736206, + "learning_rate": 8.68619946460492e-06, + "loss": 1.1195, + "step": 48565 + }, + { + "epoch": 14.53, + "grad_norm": 2.1475510597229004, + "learning_rate": 8.681748009930433e-06, + "loss": 1.0958, + "step": 48570 + }, + { + "epoch": 14.53, + "grad_norm": 2.3086507320404053, + "learning_rate": 8.677297456496026e-06, + "loss": 0.9051, + "step": 48575 + }, + { + "epoch": 14.53, + "grad_norm": 1.2065564393997192, + "learning_rate": 8.672847804547533e-06, + "loss": 1.0571, + "step": 48580 + }, + { + "epoch": 14.54, + "grad_norm": 1.4163763523101807, + "learning_rate": 8.668399054330672e-06, + "loss": 1.0684, + "step": 48585 + }, + { + "epoch": 14.54, + "grad_norm": 3.3399434089660645, + "learning_rate": 8.66395120609116e-06, + "loss": 1.1536, + "step": 48590 + }, + { + "epoch": 14.54, + "grad_norm": 1.9370921850204468, + "learning_rate": 8.659504260074638e-06, + "loss": 1.0895, + "step": 48595 + }, + { + "epoch": 14.54, + "grad_norm": 2.0222585201263428, + "learning_rate": 8.655058216526712e-06, + "loss": 0.9891, + "step": 48600 + }, + { + "epoch": 14.54, + "grad_norm": 2.3010804653167725, + "learning_rate": 8.650613075692931e-06, + "loss": 0.9449, + "step": 48605 + }, + { + "epoch": 14.54, + "grad_norm": 2.6222808361053467, + "learning_rate": 8.646168837818797e-06, + "loss": 1.08, + "step": 48610 + }, + { + "epoch": 14.55, + "grad_norm": 1.6273561716079712, + "learning_rate": 8.641725503149762e-06, + "loss": 1.0818, + "step": 48615 + }, + { + "epoch": 14.55, + "grad_norm": 2.0088725090026855, + "learning_rate": 8.637283071931227e-06, + "loss": 0.9697, + "step": 48620 + }, + { + "epoch": 14.55, + "grad_norm": 1.362270474433899, + "learning_rate": 8.632841544408526e-06, + "loss": 1.1185, + "step": 48625 + }, + { + "epoch": 14.55, + "grad_norm": 2.453993797302246, + "learning_rate": 8.62840092082699e-06, + "loss": 1.0598, + "step": 48630 + }, + { + "epoch": 14.55, + "grad_norm": 2.08078932762146, + "learning_rate": 8.623961201431835e-06, + "loss": 0.938, + "step": 48635 + }, + { + "epoch": 14.55, + "grad_norm": 2.452207565307617, + "learning_rate": 8.619522386468292e-06, + "loss": 0.896, + "step": 48640 + }, + { + "epoch": 14.55, + "grad_norm": 8.122790336608887, + "learning_rate": 8.6150844761815e-06, + "loss": 1.0177, + "step": 48645 + }, + { + "epoch": 14.56, + "grad_norm": 2.5997066497802734, + "learning_rate": 8.61064747081656e-06, + "loss": 1.0295, + "step": 48650 + }, + { + "epoch": 14.56, + "grad_norm": 3.2175190448760986, + "learning_rate": 8.606211370618537e-06, + "loss": 1.0199, + "step": 48655 + }, + { + "epoch": 14.56, + "grad_norm": 2.0975887775421143, + "learning_rate": 8.601776175832399e-06, + "loss": 0.9438, + "step": 48660 + }, + { + "epoch": 14.56, + "grad_norm": 2.3262147903442383, + "learning_rate": 8.597341886703134e-06, + "loss": 1.0345, + "step": 48665 + }, + { + "epoch": 14.56, + "grad_norm": 1.7775872945785522, + "learning_rate": 8.592908503475614e-06, + "loss": 1.1137, + "step": 48670 + }, + { + "epoch": 14.56, + "grad_norm": 3.6084024906158447, + "learning_rate": 8.588476026394716e-06, + "loss": 1.0825, + "step": 48675 + }, + { + "epoch": 14.56, + "grad_norm": 3.329972982406616, + "learning_rate": 8.584044455705223e-06, + "loss": 1.1109, + "step": 48680 + }, + { + "epoch": 14.57, + "grad_norm": 1.461459994316101, + "learning_rate": 8.579613791651889e-06, + "loss": 0.7492, + "step": 48685 + }, + { + "epoch": 14.57, + "grad_norm": 3.0383195877075195, + "learning_rate": 8.575184034479416e-06, + "loss": 1.0271, + "step": 48690 + }, + { + "epoch": 14.57, + "grad_norm": 1.4644750356674194, + "learning_rate": 8.57075518443246e-06, + "loss": 1.0969, + "step": 48695 + }, + { + "epoch": 14.57, + "grad_norm": 2.943840503692627, + "learning_rate": 8.566327241755617e-06, + "loss": 1.1267, + "step": 48700 + }, + { + "epoch": 14.57, + "grad_norm": 4.192392349243164, + "learning_rate": 8.561900206693437e-06, + "loss": 0.8901, + "step": 48705 + }, + { + "epoch": 14.57, + "grad_norm": 1.5619021654129028, + "learning_rate": 8.557474079490421e-06, + "loss": 1.1158, + "step": 48710 + }, + { + "epoch": 14.57, + "grad_norm": 5.234612941741943, + "learning_rate": 8.553048860391025e-06, + "loss": 0.8096, + "step": 48715 + }, + { + "epoch": 14.58, + "grad_norm": 4.29495906829834, + "learning_rate": 8.548624549639641e-06, + "loss": 0.9903, + "step": 48720 + }, + { + "epoch": 14.58, + "grad_norm": 3.1692752838134766, + "learning_rate": 8.544201147480625e-06, + "loss": 1.1217, + "step": 48725 + }, + { + "epoch": 14.58, + "grad_norm": 1.6975233554840088, + "learning_rate": 8.539778654158274e-06, + "loss": 1.0455, + "step": 48730 + }, + { + "epoch": 14.58, + "grad_norm": 3.1467435359954834, + "learning_rate": 8.535357069916839e-06, + "loss": 0.7778, + "step": 48735 + }, + { + "epoch": 14.58, + "grad_norm": 2.9374170303344727, + "learning_rate": 8.530936395000517e-06, + "loss": 0.939, + "step": 48740 + }, + { + "epoch": 14.58, + "grad_norm": 3.6052603721618652, + "learning_rate": 8.52651662965346e-06, + "loss": 0.9546, + "step": 48745 + }, + { + "epoch": 14.59, + "grad_norm": 2.1928467750549316, + "learning_rate": 8.522097774119775e-06, + "loss": 1.1193, + "step": 48750 + }, + { + "epoch": 14.59, + "grad_norm": 3.1543407440185547, + "learning_rate": 8.517679828643485e-06, + "loss": 0.8595, + "step": 48755 + }, + { + "epoch": 14.59, + "grad_norm": 2.9056456089019775, + "learning_rate": 8.513262793468623e-06, + "loss": 0.9613, + "step": 48760 + }, + { + "epoch": 14.59, + "grad_norm": 2.9297358989715576, + "learning_rate": 8.508846668839104e-06, + "loss": 0.976, + "step": 48765 + }, + { + "epoch": 14.59, + "grad_norm": 2.0978548526763916, + "learning_rate": 8.504431454998856e-06, + "loss": 0.9262, + "step": 48770 + }, + { + "epoch": 14.59, + "grad_norm": 1.7425466775894165, + "learning_rate": 8.500017152191708e-06, + "loss": 1.0549, + "step": 48775 + }, + { + "epoch": 14.59, + "grad_norm": 3.82523250579834, + "learning_rate": 8.495603760661459e-06, + "loss": 1.0423, + "step": 48780 + }, + { + "epoch": 14.6, + "grad_norm": 3.274580717086792, + "learning_rate": 8.491191280651861e-06, + "loss": 0.8973, + "step": 48785 + }, + { + "epoch": 14.6, + "grad_norm": 8.596813201904297, + "learning_rate": 8.486779712406605e-06, + "loss": 0.9724, + "step": 48790 + }, + { + "epoch": 14.6, + "grad_norm": 2.4201295375823975, + "learning_rate": 8.482369056169345e-06, + "loss": 0.9512, + "step": 48795 + }, + { + "epoch": 14.6, + "grad_norm": 4.427967071533203, + "learning_rate": 8.47795931218367e-06, + "loss": 0.9629, + "step": 48800 + }, + { + "epoch": 14.6, + "grad_norm": 3.9436044692993164, + "learning_rate": 8.47355048069313e-06, + "loss": 0.9762, + "step": 48805 + }, + { + "epoch": 14.6, + "grad_norm": 4.806298732757568, + "learning_rate": 8.469142561941218e-06, + "loss": 1.0321, + "step": 48810 + }, + { + "epoch": 14.6, + "grad_norm": 1.625897765159607, + "learning_rate": 8.46473555617138e-06, + "loss": 1.068, + "step": 48815 + }, + { + "epoch": 14.61, + "grad_norm": 2.3483619689941406, + "learning_rate": 8.46032946362701e-06, + "loss": 1.077, + "step": 48820 + }, + { + "epoch": 14.61, + "grad_norm": 3.5805463790893555, + "learning_rate": 8.455924284551453e-06, + "loss": 1.0664, + "step": 48825 + }, + { + "epoch": 14.61, + "grad_norm": 1.450373649597168, + "learning_rate": 8.451520019187997e-06, + "loss": 1.1001, + "step": 48830 + }, + { + "epoch": 14.61, + "grad_norm": 2.2781779766082764, + "learning_rate": 8.447116667779892e-06, + "loss": 0.9847, + "step": 48835 + }, + { + "epoch": 14.61, + "grad_norm": 3.301300048828125, + "learning_rate": 8.442714230570328e-06, + "loss": 0.8831, + "step": 48840 + }, + { + "epoch": 14.61, + "grad_norm": 2.685093879699707, + "learning_rate": 8.438312707802453e-06, + "loss": 0.8498, + "step": 48845 + }, + { + "epoch": 14.62, + "grad_norm": 4.1189069747924805, + "learning_rate": 8.433912099719337e-06, + "loss": 0.9296, + "step": 48850 + }, + { + "epoch": 14.62, + "grad_norm": 2.040025234222412, + "learning_rate": 8.429512406564055e-06, + "loss": 1.0176, + "step": 48855 + }, + { + "epoch": 14.62, + "grad_norm": 2.4712417125701904, + "learning_rate": 8.42511362857956e-06, + "loss": 0.9175, + "step": 48860 + }, + { + "epoch": 14.62, + "grad_norm": 2.281320810317993, + "learning_rate": 8.420715766008826e-06, + "loss": 0.8373, + "step": 48865 + }, + { + "epoch": 14.62, + "grad_norm": 2.7027089595794678, + "learning_rate": 8.416318819094713e-06, + "loss": 0.9954, + "step": 48870 + }, + { + "epoch": 14.62, + "grad_norm": 2.729827880859375, + "learning_rate": 8.411922788080088e-06, + "loss": 0.8774, + "step": 48875 + }, + { + "epoch": 14.62, + "grad_norm": 1.878912329673767, + "learning_rate": 8.407527673207718e-06, + "loss": 0.9414, + "step": 48880 + }, + { + "epoch": 14.63, + "grad_norm": 2.9538416862487793, + "learning_rate": 8.403133474720348e-06, + "loss": 0.9427, + "step": 48885 + }, + { + "epoch": 14.63, + "grad_norm": 1.521445393562317, + "learning_rate": 8.398740192860663e-06, + "loss": 0.9906, + "step": 48890 + }, + { + "epoch": 14.63, + "grad_norm": 2.062879800796509, + "learning_rate": 8.394347827871302e-06, + "loss": 0.8437, + "step": 48895 + }, + { + "epoch": 14.63, + "grad_norm": 3.722719669342041, + "learning_rate": 8.38995637999485e-06, + "loss": 1.0068, + "step": 48900 + }, + { + "epoch": 14.63, + "grad_norm": 3.8463211059570312, + "learning_rate": 8.385565849473843e-06, + "loss": 0.9874, + "step": 48905 + }, + { + "epoch": 14.63, + "grad_norm": 2.462646484375, + "learning_rate": 8.381176236550764e-06, + "loss": 1.0429, + "step": 48910 + }, + { + "epoch": 14.63, + "grad_norm": 2.3265976905822754, + "learning_rate": 8.376787541468045e-06, + "loss": 1.1793, + "step": 48915 + }, + { + "epoch": 14.64, + "grad_norm": 4.777642726898193, + "learning_rate": 8.372399764468076e-06, + "loss": 0.9058, + "step": 48920 + }, + { + "epoch": 14.64, + "grad_norm": 2.799137592315674, + "learning_rate": 8.36801290579318e-06, + "loss": 0.9833, + "step": 48925 + }, + { + "epoch": 14.64, + "grad_norm": 1.032945156097412, + "learning_rate": 8.363626965685645e-06, + "loss": 1.1707, + "step": 48930 + }, + { + "epoch": 14.64, + "grad_norm": 2.207265853881836, + "learning_rate": 8.359241944387699e-06, + "loss": 0.9743, + "step": 48935 + }, + { + "epoch": 14.64, + "grad_norm": 2.7506961822509766, + "learning_rate": 8.354857842141533e-06, + "loss": 0.9348, + "step": 48940 + }, + { + "epoch": 14.64, + "grad_norm": 1.6580191850662231, + "learning_rate": 8.35047465918925e-06, + "loss": 1.0558, + "step": 48945 + }, + { + "epoch": 14.65, + "grad_norm": 4.469571590423584, + "learning_rate": 8.34609239577296e-06, + "loss": 1.0138, + "step": 48950 + }, + { + "epoch": 14.65, + "grad_norm": 2.21535325050354, + "learning_rate": 8.341711052134663e-06, + "loss": 1.0124, + "step": 48955 + }, + { + "epoch": 14.65, + "grad_norm": 5.841526031494141, + "learning_rate": 8.337330628516363e-06, + "loss": 1.0061, + "step": 48960 + }, + { + "epoch": 14.65, + "grad_norm": 2.309290647506714, + "learning_rate": 8.332951125159957e-06, + "loss": 0.9943, + "step": 48965 + }, + { + "epoch": 14.65, + "grad_norm": 5.338931560516357, + "learning_rate": 8.328572542307353e-06, + "loss": 0.9796, + "step": 48970 + }, + { + "epoch": 14.65, + "grad_norm": 4.652040958404541, + "learning_rate": 8.324194880200348e-06, + "loss": 0.7701, + "step": 48975 + }, + { + "epoch": 14.65, + "grad_norm": 4.739422798156738, + "learning_rate": 8.319818139080728e-06, + "loss": 1.0848, + "step": 48980 + }, + { + "epoch": 14.66, + "grad_norm": 1.5917638540267944, + "learning_rate": 8.315442319190214e-06, + "loss": 0.9494, + "step": 48985 + }, + { + "epoch": 14.66, + "grad_norm": 1.3649824857711792, + "learning_rate": 8.31106742077048e-06, + "loss": 1.1455, + "step": 48990 + }, + { + "epoch": 14.66, + "grad_norm": 1.3612852096557617, + "learning_rate": 8.30669344406314e-06, + "loss": 0.9826, + "step": 48995 + }, + { + "epoch": 14.66, + "grad_norm": 4.043242454528809, + "learning_rate": 8.302320389309776e-06, + "loss": 0.8944, + "step": 49000 + }, + { + "epoch": 14.66, + "grad_norm": 4.382472991943359, + "learning_rate": 8.297948256751897e-06, + "loss": 0.9066, + "step": 49005 + }, + { + "epoch": 14.66, + "grad_norm": 1.3883436918258667, + "learning_rate": 8.293577046630973e-06, + "loss": 1.0056, + "step": 49010 + }, + { + "epoch": 14.66, + "grad_norm": 2.2432875633239746, + "learning_rate": 8.289206759188426e-06, + "loss": 1.1799, + "step": 49015 + }, + { + "epoch": 14.67, + "grad_norm": 2.0689566135406494, + "learning_rate": 8.28483739466562e-06, + "loss": 0.9735, + "step": 49020 + }, + { + "epoch": 14.67, + "grad_norm": 2.059446096420288, + "learning_rate": 8.280468953303868e-06, + "loss": 0.9371, + "step": 49025 + }, + { + "epoch": 14.67, + "grad_norm": 2.9364001750946045, + "learning_rate": 8.276101435344441e-06, + "loss": 1.0071, + "step": 49030 + }, + { + "epoch": 14.67, + "grad_norm": 2.926053524017334, + "learning_rate": 8.271734841028553e-06, + "loss": 0.8205, + "step": 49035 + }, + { + "epoch": 14.67, + "grad_norm": 1.4417191743850708, + "learning_rate": 8.267369170597345e-06, + "loss": 0.9981, + "step": 49040 + }, + { + "epoch": 14.67, + "grad_norm": 6.5991973876953125, + "learning_rate": 8.263004424291962e-06, + "loss": 0.7934, + "step": 49045 + }, + { + "epoch": 14.68, + "grad_norm": 3.2373931407928467, + "learning_rate": 8.258640602353432e-06, + "loss": 0.9667, + "step": 49050 + }, + { + "epoch": 14.68, + "grad_norm": 2.341061592102051, + "learning_rate": 8.254277705022795e-06, + "loss": 1.0543, + "step": 49055 + }, + { + "epoch": 14.68, + "grad_norm": 2.1643905639648438, + "learning_rate": 8.24991573254098e-06, + "loss": 0.9737, + "step": 49060 + }, + { + "epoch": 14.68, + "grad_norm": 4.566475868225098, + "learning_rate": 8.245554685148924e-06, + "loss": 0.9937, + "step": 49065 + }, + { + "epoch": 14.68, + "grad_norm": 3.2137701511383057, + "learning_rate": 8.241194563087456e-06, + "loss": 1.0407, + "step": 49070 + }, + { + "epoch": 14.68, + "grad_norm": 1.9922786951065063, + "learning_rate": 8.236835366597397e-06, + "loss": 1.1503, + "step": 49075 + }, + { + "epoch": 14.68, + "grad_norm": 6.491326808929443, + "learning_rate": 8.232477095919495e-06, + "loss": 0.8255, + "step": 49080 + }, + { + "epoch": 14.69, + "grad_norm": 3.8574929237365723, + "learning_rate": 8.228119751294452e-06, + "loss": 0.8979, + "step": 49085 + }, + { + "epoch": 14.69, + "grad_norm": 1.933752417564392, + "learning_rate": 8.223763332962925e-06, + "loss": 0.992, + "step": 49090 + }, + { + "epoch": 14.69, + "grad_norm": 1.47145676612854, + "learning_rate": 8.219407841165508e-06, + "loss": 1.1588, + "step": 49095 + }, + { + "epoch": 14.69, + "grad_norm": 1.7857855558395386, + "learning_rate": 8.215053276142756e-06, + "loss": 0.9984, + "step": 49100 + }, + { + "epoch": 14.69, + "grad_norm": 3.7101316452026367, + "learning_rate": 8.210699638135164e-06, + "loss": 0.933, + "step": 49105 + }, + { + "epoch": 14.69, + "grad_norm": 3.045780658721924, + "learning_rate": 8.20634692738318e-06, + "loss": 0.8924, + "step": 49110 + }, + { + "epoch": 14.69, + "grad_norm": 4.745621204376221, + "learning_rate": 8.201995144127198e-06, + "loss": 0.9093, + "step": 49115 + }, + { + "epoch": 14.7, + "grad_norm": 1.8852388858795166, + "learning_rate": 8.197644288607562e-06, + "loss": 1.1749, + "step": 49120 + }, + { + "epoch": 14.7, + "grad_norm": 2.5533406734466553, + "learning_rate": 8.193294361064569e-06, + "loss": 1.1027, + "step": 49125 + }, + { + "epoch": 14.7, + "grad_norm": 1.9613593816757202, + "learning_rate": 8.188945361738468e-06, + "loss": 0.9726, + "step": 49130 + }, + { + "epoch": 14.7, + "grad_norm": 2.30924916267395, + "learning_rate": 8.184597290869423e-06, + "loss": 1.0676, + "step": 49135 + }, + { + "epoch": 14.7, + "grad_norm": 7.164217472076416, + "learning_rate": 8.180250148697605e-06, + "loss": 0.9149, + "step": 49140 + }, + { + "epoch": 14.7, + "grad_norm": 1.656010627746582, + "learning_rate": 8.175903935463076e-06, + "loss": 1.0562, + "step": 49145 + }, + { + "epoch": 14.71, + "grad_norm": 2.542595863342285, + "learning_rate": 8.171558651405897e-06, + "loss": 1.0277, + "step": 49150 + }, + { + "epoch": 14.71, + "grad_norm": 2.8253142833709717, + "learning_rate": 8.16721429676603e-06, + "loss": 1.0583, + "step": 49155 + }, + { + "epoch": 14.71, + "grad_norm": 4.2701945304870605, + "learning_rate": 8.162870871783435e-06, + "loss": 0.9444, + "step": 49160 + }, + { + "epoch": 14.71, + "grad_norm": 3.9655089378356934, + "learning_rate": 8.15852837669797e-06, + "loss": 0.9712, + "step": 49165 + }, + { + "epoch": 14.71, + "grad_norm": 3.8803188800811768, + "learning_rate": 8.154186811749479e-06, + "loss": 0.8939, + "step": 49170 + }, + { + "epoch": 14.71, + "grad_norm": 3.9758951663970947, + "learning_rate": 8.14984617717774e-06, + "loss": 0.9559, + "step": 49175 + }, + { + "epoch": 14.71, + "grad_norm": 1.9809141159057617, + "learning_rate": 8.145506473222481e-06, + "loss": 1.0336, + "step": 49180 + }, + { + "epoch": 14.72, + "grad_norm": 3.15104079246521, + "learning_rate": 8.14116770012338e-06, + "loss": 1.1105, + "step": 49185 + }, + { + "epoch": 14.72, + "grad_norm": 3.349479913711548, + "learning_rate": 8.136829858120066e-06, + "loss": 1.0803, + "step": 49190 + }, + { + "epoch": 14.72, + "grad_norm": 3.6429026126861572, + "learning_rate": 8.132492947452108e-06, + "loss": 0.9848, + "step": 49195 + }, + { + "epoch": 14.72, + "grad_norm": 2.910015106201172, + "learning_rate": 8.128156968359035e-06, + "loss": 0.9665, + "step": 49200 + }, + { + "epoch": 14.72, + "grad_norm": 6.490506172180176, + "learning_rate": 8.123821921080313e-06, + "loss": 0.8624, + "step": 49205 + }, + { + "epoch": 14.72, + "grad_norm": 1.679452896118164, + "learning_rate": 8.119487805855364e-06, + "loss": 0.9264, + "step": 49210 + }, + { + "epoch": 14.72, + "grad_norm": 5.132246971130371, + "learning_rate": 8.115154622923556e-06, + "loss": 0.8382, + "step": 49215 + }, + { + "epoch": 14.73, + "grad_norm": 1.5223355293273926, + "learning_rate": 8.11082237252421e-06, + "loss": 1.04, + "step": 49220 + }, + { + "epoch": 14.73, + "grad_norm": 2.475334644317627, + "learning_rate": 8.10649105489659e-06, + "loss": 1.0027, + "step": 49225 + }, + { + "epoch": 14.73, + "grad_norm": 1.9048575162887573, + "learning_rate": 8.102160670279906e-06, + "loss": 1.0384, + "step": 49230 + }, + { + "epoch": 14.73, + "grad_norm": 2.135935068130493, + "learning_rate": 8.097831218913333e-06, + "loss": 1.0354, + "step": 49235 + }, + { + "epoch": 14.73, + "grad_norm": 1.7497014999389648, + "learning_rate": 8.093502701035957e-06, + "loss": 0.8898, + "step": 49240 + }, + { + "epoch": 14.73, + "grad_norm": 2.735699415206909, + "learning_rate": 8.08917511688687e-06, + "loss": 0.9877, + "step": 49245 + }, + { + "epoch": 14.74, + "grad_norm": 1.427288293838501, + "learning_rate": 8.084848466705048e-06, + "loss": 0.8119, + "step": 49250 + }, + { + "epoch": 14.74, + "grad_norm": 3.9717910289764404, + "learning_rate": 8.080522750729477e-06, + "loss": 0.9547, + "step": 49255 + }, + { + "epoch": 14.74, + "grad_norm": 4.33355188369751, + "learning_rate": 8.07619796919904e-06, + "loss": 1.0372, + "step": 49260 + }, + { + "epoch": 14.74, + "grad_norm": 4.518667697906494, + "learning_rate": 8.071874122352598e-06, + "loss": 0.9114, + "step": 49265 + }, + { + "epoch": 14.74, + "grad_norm": 3.398543119430542, + "learning_rate": 8.067551210428952e-06, + "loss": 0.9405, + "step": 49270 + }, + { + "epoch": 14.74, + "grad_norm": 4.380087852478027, + "learning_rate": 8.063229233666854e-06, + "loss": 0.9915, + "step": 49275 + }, + { + "epoch": 14.74, + "grad_norm": 1.5817692279815674, + "learning_rate": 8.058908192305e-06, + "loss": 1.1618, + "step": 49280 + }, + { + "epoch": 14.75, + "grad_norm": 2.916731834411621, + "learning_rate": 8.054588086582029e-06, + "loss": 0.9583, + "step": 49285 + }, + { + "epoch": 14.75, + "grad_norm": 1.46005117893219, + "learning_rate": 8.050268916736561e-06, + "loss": 0.9385, + "step": 49290 + }, + { + "epoch": 14.75, + "grad_norm": 4.392984390258789, + "learning_rate": 8.045950683007115e-06, + "loss": 0.9728, + "step": 49295 + }, + { + "epoch": 14.75, + "grad_norm": 1.823644995689392, + "learning_rate": 8.041633385632186e-06, + "loss": 1.1487, + "step": 49300 + }, + { + "epoch": 14.75, + "grad_norm": 5.119616508483887, + "learning_rate": 8.037317024850222e-06, + "loss": 1.0758, + "step": 49305 + }, + { + "epoch": 14.75, + "grad_norm": 5.690764904022217, + "learning_rate": 8.033001600899606e-06, + "loss": 0.8963, + "step": 49310 + }, + { + "epoch": 14.75, + "grad_norm": 3.6248810291290283, + "learning_rate": 8.028687114018674e-06, + "loss": 0.8885, + "step": 49315 + }, + { + "epoch": 14.76, + "grad_norm": 2.2621960639953613, + "learning_rate": 8.024373564445714e-06, + "loss": 1.0696, + "step": 49320 + }, + { + "epoch": 14.76, + "grad_norm": 4.6661601066589355, + "learning_rate": 8.020060952418956e-06, + "loss": 1.1138, + "step": 49325 + }, + { + "epoch": 14.76, + "grad_norm": 2.248114824295044, + "learning_rate": 8.01574927817659e-06, + "loss": 1.1691, + "step": 49330 + }, + { + "epoch": 14.76, + "grad_norm": 1.4611231088638306, + "learning_rate": 8.01143854195672e-06, + "loss": 1.1069, + "step": 49335 + }, + { + "epoch": 14.76, + "grad_norm": 1.7469176054000854, + "learning_rate": 8.007128743997457e-06, + "loss": 1.0006, + "step": 49340 + }, + { + "epoch": 14.76, + "grad_norm": 1.546489953994751, + "learning_rate": 8.002819884536797e-06, + "loss": 0.9871, + "step": 49345 + }, + { + "epoch": 14.76, + "grad_norm": 3.91237473487854, + "learning_rate": 7.998511963812741e-06, + "loss": 0.9547, + "step": 49350 + }, + { + "epoch": 14.77, + "grad_norm": 2.6893093585968018, + "learning_rate": 7.99420498206318e-06, + "loss": 0.81, + "step": 49355 + }, + { + "epoch": 14.77, + "grad_norm": 3.268592119216919, + "learning_rate": 7.989898939526018e-06, + "loss": 1.152, + "step": 49360 + }, + { + "epoch": 14.77, + "grad_norm": 3.8002939224243164, + "learning_rate": 7.985593836439051e-06, + "loss": 1.1162, + "step": 49365 + }, + { + "epoch": 14.77, + "grad_norm": 2.8874077796936035, + "learning_rate": 7.981289673040041e-06, + "loss": 0.97, + "step": 49370 + }, + { + "epoch": 14.77, + "grad_norm": 3.9415621757507324, + "learning_rate": 7.976986449566728e-06, + "loss": 1.2578, + "step": 49375 + }, + { + "epoch": 14.77, + "grad_norm": 1.1208429336547852, + "learning_rate": 7.972684166256744e-06, + "loss": 0.9146, + "step": 49380 + }, + { + "epoch": 14.78, + "grad_norm": 4.455554962158203, + "learning_rate": 7.968382823347731e-06, + "loss": 0.9225, + "step": 49385 + }, + { + "epoch": 14.78, + "grad_norm": 3.3006980419158936, + "learning_rate": 7.964082421077223e-06, + "loss": 0.92, + "step": 49390 + }, + { + "epoch": 14.78, + "grad_norm": 1.741047739982605, + "learning_rate": 7.959782959682732e-06, + "loss": 0.919, + "step": 49395 + }, + { + "epoch": 14.78, + "grad_norm": 4.698997497558594, + "learning_rate": 7.95548443940172e-06, + "loss": 1.0252, + "step": 49400 + }, + { + "epoch": 14.78, + "grad_norm": 2.376089334487915, + "learning_rate": 7.951186860471582e-06, + "loss": 0.995, + "step": 49405 + }, + { + "epoch": 14.78, + "grad_norm": 3.1121981143951416, + "learning_rate": 7.946890223129677e-06, + "loss": 0.9083, + "step": 49410 + }, + { + "epoch": 14.78, + "grad_norm": 3.4917523860931396, + "learning_rate": 7.942594527613295e-06, + "loss": 0.8519, + "step": 49415 + }, + { + "epoch": 14.79, + "grad_norm": 6.6114420890808105, + "learning_rate": 7.938299774159691e-06, + "loss": 0.8482, + "step": 49420 + }, + { + "epoch": 14.79, + "grad_norm": 1.4963136911392212, + "learning_rate": 7.934005963006062e-06, + "loss": 0.9212, + "step": 49425 + }, + { + "epoch": 14.79, + "grad_norm": 3.31788969039917, + "learning_rate": 7.929713094389527e-06, + "loss": 1.1518, + "step": 49430 + }, + { + "epoch": 14.79, + "grad_norm": 2.473154067993164, + "learning_rate": 7.92542116854721e-06, + "loss": 0.7843, + "step": 49435 + }, + { + "epoch": 14.79, + "grad_norm": 2.462738513946533, + "learning_rate": 7.921130185716119e-06, + "loss": 0.8878, + "step": 49440 + }, + { + "epoch": 14.79, + "grad_norm": 3.9160032272338867, + "learning_rate": 7.916840146133264e-06, + "loss": 1.0054, + "step": 49445 + }, + { + "epoch": 14.79, + "grad_norm": 1.9631022214889526, + "learning_rate": 7.912551050035572e-06, + "loss": 1.0096, + "step": 49450 + }, + { + "epoch": 14.8, + "grad_norm": 3.000981330871582, + "learning_rate": 7.908262897659921e-06, + "loss": 1.051, + "step": 49455 + }, + { + "epoch": 14.8, + "grad_norm": 3.195512056350708, + "learning_rate": 7.903975689243155e-06, + "loss": 1.085, + "step": 49460 + }, + { + "epoch": 14.8, + "grad_norm": 1.471376657485962, + "learning_rate": 7.899689425022022e-06, + "loss": 1.0268, + "step": 49465 + }, + { + "epoch": 14.8, + "grad_norm": 2.2439353466033936, + "learning_rate": 7.895404105233286e-06, + "loss": 0.9531, + "step": 49470 + }, + { + "epoch": 14.8, + "grad_norm": 3.2079246044158936, + "learning_rate": 7.891119730113586e-06, + "loss": 0.9474, + "step": 49475 + }, + { + "epoch": 14.8, + "grad_norm": 2.1932594776153564, + "learning_rate": 7.886836299899574e-06, + "loss": 0.8452, + "step": 49480 + }, + { + "epoch": 14.81, + "grad_norm": 2.9864261150360107, + "learning_rate": 7.882553814827797e-06, + "loss": 1.0548, + "step": 49485 + }, + { + "epoch": 14.81, + "grad_norm": 1.2952245473861694, + "learning_rate": 7.878272275134782e-06, + "loss": 1.072, + "step": 49490 + }, + { + "epoch": 14.81, + "grad_norm": 2.7455410957336426, + "learning_rate": 7.873991681056992e-06, + "loss": 1.242, + "step": 49495 + }, + { + "epoch": 14.81, + "grad_norm": 2.4569225311279297, + "learning_rate": 7.86971203283084e-06, + "loss": 0.9538, + "step": 49500 + }, + { + "epoch": 14.81, + "grad_norm": 6.187790870666504, + "learning_rate": 7.865433330692684e-06, + "loss": 0.872, + "step": 49505 + }, + { + "epoch": 14.81, + "grad_norm": 2.7457752227783203, + "learning_rate": 7.861155574878838e-06, + "loss": 1.0486, + "step": 49510 + }, + { + "epoch": 14.81, + "grad_norm": 3.150146245956421, + "learning_rate": 7.856878765625553e-06, + "loss": 0.9142, + "step": 49515 + }, + { + "epoch": 14.82, + "grad_norm": 25.611284255981445, + "learning_rate": 7.852602903169043e-06, + "loss": 0.9913, + "step": 49520 + }, + { + "epoch": 14.82, + "grad_norm": 5.1431145668029785, + "learning_rate": 7.848327987745433e-06, + "loss": 0.7851, + "step": 49525 + }, + { + "epoch": 14.82, + "grad_norm": 5.0381388664245605, + "learning_rate": 7.844054019590851e-06, + "loss": 0.9707, + "step": 49530 + }, + { + "epoch": 14.82, + "grad_norm": 2.5118184089660645, + "learning_rate": 7.839780998941331e-06, + "loss": 0.9747, + "step": 49535 + }, + { + "epoch": 14.82, + "grad_norm": 3.1850192546844482, + "learning_rate": 7.83550892603287e-06, + "loss": 0.9729, + "step": 49540 + }, + { + "epoch": 14.82, + "grad_norm": 2.785214424133301, + "learning_rate": 7.83123780110141e-06, + "loss": 0.9847, + "step": 49545 + }, + { + "epoch": 14.82, + "grad_norm": 2.283158540725708, + "learning_rate": 7.826967624382839e-06, + "loss": 1.0142, + "step": 49550 + }, + { + "epoch": 14.83, + "grad_norm": 2.654142379760742, + "learning_rate": 7.822698396113005e-06, + "loss": 1.1584, + "step": 49555 + }, + { + "epoch": 14.83, + "grad_norm": 1.418227195739746, + "learning_rate": 7.818430116527668e-06, + "loss": 0.7142, + "step": 49560 + }, + { + "epoch": 14.83, + "grad_norm": 3.0038633346557617, + "learning_rate": 7.814162785862591e-06, + "loss": 1.0606, + "step": 49565 + }, + { + "epoch": 14.83, + "grad_norm": 3.01194429397583, + "learning_rate": 7.809896404353426e-06, + "loss": 0.9423, + "step": 49570 + }, + { + "epoch": 14.83, + "grad_norm": 3.334562063217163, + "learning_rate": 7.805630972235827e-06, + "loss": 0.9911, + "step": 49575 + }, + { + "epoch": 14.83, + "grad_norm": 4.44184684753418, + "learning_rate": 7.801366489745343e-06, + "loss": 0.8377, + "step": 49580 + }, + { + "epoch": 14.84, + "grad_norm": 2.8854176998138428, + "learning_rate": 7.797102957117527e-06, + "loss": 1.0517, + "step": 49585 + }, + { + "epoch": 14.84, + "grad_norm": 1.7378528118133545, + "learning_rate": 7.792840374587826e-06, + "loss": 0.9528, + "step": 49590 + }, + { + "epoch": 14.84, + "grad_norm": 4.007412910461426, + "learning_rate": 7.788578742391664e-06, + "loss": 0.9821, + "step": 49595 + }, + { + "epoch": 14.84, + "grad_norm": 5.08059549331665, + "learning_rate": 7.784318060764406e-06, + "loss": 0.9414, + "step": 49600 + }, + { + "epoch": 14.84, + "grad_norm": 1.6905008554458618, + "learning_rate": 7.78005832994137e-06, + "loss": 0.9635, + "step": 49605 + }, + { + "epoch": 14.84, + "grad_norm": 1.3702781200408936, + "learning_rate": 7.775799550157811e-06, + "loss": 1.0004, + "step": 49610 + }, + { + "epoch": 14.84, + "grad_norm": 2.688145875930786, + "learning_rate": 7.771541721648943e-06, + "loss": 1.0286, + "step": 49615 + }, + { + "epoch": 14.85, + "grad_norm": 2.3341612815856934, + "learning_rate": 7.767284844649914e-06, + "loss": 1.0534, + "step": 49620 + }, + { + "epoch": 14.85, + "grad_norm": 3.92333984375, + "learning_rate": 7.763028919395832e-06, + "loss": 1.0172, + "step": 49625 + }, + { + "epoch": 14.85, + "grad_norm": 3.7512214183807373, + "learning_rate": 7.758773946121745e-06, + "loss": 1.1164, + "step": 49630 + }, + { + "epoch": 14.85, + "grad_norm": 2.6205503940582275, + "learning_rate": 7.754519925062651e-06, + "loss": 0.9167, + "step": 49635 + }, + { + "epoch": 14.85, + "grad_norm": 4.808284282684326, + "learning_rate": 7.750266856453498e-06, + "loss": 1.1432, + "step": 49640 + }, + { + "epoch": 14.85, + "grad_norm": 3.2909045219421387, + "learning_rate": 7.746014740529176e-06, + "loss": 1.1521, + "step": 49645 + }, + { + "epoch": 14.85, + "grad_norm": 4.219728946685791, + "learning_rate": 7.741763577524532e-06, + "loss": 0.9124, + "step": 49650 + }, + { + "epoch": 14.86, + "grad_norm": 3.6862242221832275, + "learning_rate": 7.737513367674331e-06, + "loss": 1.0414, + "step": 49655 + }, + { + "epoch": 14.86, + "grad_norm": 2.3704283237457275, + "learning_rate": 7.73326411121334e-06, + "loss": 1.0735, + "step": 49660 + }, + { + "epoch": 14.86, + "grad_norm": 2.672393321990967, + "learning_rate": 7.729015808376208e-06, + "loss": 0.9972, + "step": 49665 + }, + { + "epoch": 14.86, + "grad_norm": 3.06726336479187, + "learning_rate": 7.724768459397597e-06, + "loss": 0.9701, + "step": 49670 + }, + { + "epoch": 14.86, + "grad_norm": 1.9261653423309326, + "learning_rate": 7.720522064512054e-06, + "loss": 1.0339, + "step": 49675 + }, + { + "epoch": 14.86, + "grad_norm": 4.520108222961426, + "learning_rate": 7.716276623954127e-06, + "loss": 0.8752, + "step": 49680 + }, + { + "epoch": 14.87, + "grad_norm": 6.763810157775879, + "learning_rate": 7.712032137958273e-06, + "loss": 1.0836, + "step": 49685 + }, + { + "epoch": 14.87, + "grad_norm": 2.1053125858306885, + "learning_rate": 7.707788606758912e-06, + "loss": 0.8817, + "step": 49690 + }, + { + "epoch": 14.87, + "grad_norm": 1.749535083770752, + "learning_rate": 7.703546030590414e-06, + "loss": 0.8303, + "step": 49695 + }, + { + "epoch": 14.87, + "grad_norm": 2.739657163619995, + "learning_rate": 7.699304409687089e-06, + "loss": 0.9729, + "step": 49700 + }, + { + "epoch": 14.87, + "grad_norm": 3.52274489402771, + "learning_rate": 7.695063744283196e-06, + "loss": 0.9307, + "step": 49705 + }, + { + "epoch": 14.87, + "grad_norm": 2.7961769104003906, + "learning_rate": 7.690824034612948e-06, + "loss": 0.991, + "step": 49710 + }, + { + "epoch": 14.87, + "grad_norm": 2.9371488094329834, + "learning_rate": 7.686585280910497e-06, + "loss": 0.999, + "step": 49715 + }, + { + "epoch": 14.88, + "grad_norm": 2.072927236557007, + "learning_rate": 7.68234748340994e-06, + "loss": 0.9193, + "step": 49720 + }, + { + "epoch": 14.88, + "grad_norm": 1.750504732131958, + "learning_rate": 7.678110642345334e-06, + "loss": 0.8694, + "step": 49725 + }, + { + "epoch": 14.88, + "grad_norm": 8.674440383911133, + "learning_rate": 7.673874757950675e-06, + "loss": 0.8633, + "step": 49730 + }, + { + "epoch": 14.88, + "grad_norm": 2.6912498474121094, + "learning_rate": 7.6696398304599e-06, + "loss": 1.0709, + "step": 49735 + }, + { + "epoch": 14.88, + "grad_norm": 4.30122184753418, + "learning_rate": 7.665405860106902e-06, + "loss": 0.8949, + "step": 49740 + }, + { + "epoch": 14.88, + "grad_norm": 2.0600149631500244, + "learning_rate": 7.66117284712553e-06, + "loss": 0.9048, + "step": 49745 + }, + { + "epoch": 14.88, + "grad_norm": 3.0497703552246094, + "learning_rate": 7.656940791749545e-06, + "loss": 0.933, + "step": 49750 + }, + { + "epoch": 14.89, + "grad_norm": 4.7076311111450195, + "learning_rate": 7.652709694212706e-06, + "loss": 0.8802, + "step": 49755 + }, + { + "epoch": 14.89, + "grad_norm": 2.6163299083709717, + "learning_rate": 7.648479554748666e-06, + "loss": 1.0973, + "step": 49760 + }, + { + "epoch": 14.89, + "grad_norm": 6.505290508270264, + "learning_rate": 7.644250373591078e-06, + "loss": 0.9932, + "step": 49765 + }, + { + "epoch": 14.89, + "grad_norm": 3.616623878479004, + "learning_rate": 7.640022150973485e-06, + "loss": 0.8964, + "step": 49770 + }, + { + "epoch": 14.89, + "grad_norm": 3.170760154724121, + "learning_rate": 7.635794887129441e-06, + "loss": 1.1569, + "step": 49775 + }, + { + "epoch": 14.89, + "grad_norm": 5.005249500274658, + "learning_rate": 7.631568582292389e-06, + "loss": 0.9821, + "step": 49780 + }, + { + "epoch": 14.9, + "grad_norm": 3.093559503555298, + "learning_rate": 7.62734323669575e-06, + "loss": 1.0313, + "step": 49785 + }, + { + "epoch": 14.9, + "grad_norm": 5.463796138763428, + "learning_rate": 7.623118850572886e-06, + "loss": 1.0058, + "step": 49790 + }, + { + "epoch": 14.9, + "grad_norm": 2.4942896366119385, + "learning_rate": 7.618895424157105e-06, + "loss": 0.9905, + "step": 49795 + }, + { + "epoch": 14.9, + "grad_norm": 3.5765671730041504, + "learning_rate": 7.614672957681665e-06, + "loss": 0.9716, + "step": 49800 + }, + { + "epoch": 14.9, + "grad_norm": 3.6634316444396973, + "learning_rate": 7.610451451379763e-06, + "loss": 1.0055, + "step": 49805 + }, + { + "epoch": 14.9, + "grad_norm": 1.4103604555130005, + "learning_rate": 7.606230905484557e-06, + "loss": 1.0418, + "step": 49810 + }, + { + "epoch": 14.9, + "grad_norm": 5.684398651123047, + "learning_rate": 7.602011320229132e-06, + "loss": 1.0942, + "step": 49815 + }, + { + "epoch": 14.91, + "grad_norm": 1.4336367845535278, + "learning_rate": 7.597792695846542e-06, + "loss": 1.0598, + "step": 49820 + }, + { + "epoch": 14.91, + "grad_norm": 3.313039779663086, + "learning_rate": 7.593575032569772e-06, + "loss": 1.0155, + "step": 49825 + }, + { + "epoch": 14.91, + "grad_norm": 4.043692111968994, + "learning_rate": 7.58935833063176e-06, + "loss": 0.8216, + "step": 49830 + }, + { + "epoch": 14.91, + "grad_norm": 1.758624792098999, + "learning_rate": 7.58514259026539e-06, + "loss": 1.0355, + "step": 49835 + }, + { + "epoch": 14.91, + "grad_norm": 2.282357931137085, + "learning_rate": 7.5809278117035006e-06, + "loss": 1.1196, + "step": 49840 + }, + { + "epoch": 14.91, + "grad_norm": 2.790658950805664, + "learning_rate": 7.576713995178847e-06, + "loss": 1.0328, + "step": 49845 + }, + { + "epoch": 14.91, + "grad_norm": 7.928509712219238, + "learning_rate": 7.572501140924184e-06, + "loss": 0.8045, + "step": 49850 + }, + { + "epoch": 14.92, + "grad_norm": 1.6898220777511597, + "learning_rate": 7.568289249172153e-06, + "loss": 0.9176, + "step": 49855 + }, + { + "epoch": 14.92, + "grad_norm": 1.0324536561965942, + "learning_rate": 7.5640783201554046e-06, + "loss": 1.0659, + "step": 49860 + }, + { + "epoch": 14.92, + "grad_norm": 2.4962313175201416, + "learning_rate": 7.5598683541064665e-06, + "loss": 1.0566, + "step": 49865 + }, + { + "epoch": 14.92, + "grad_norm": 4.477339744567871, + "learning_rate": 7.55565935125789e-06, + "loss": 0.9315, + "step": 49870 + }, + { + "epoch": 14.92, + "grad_norm": 1.8302847146987915, + "learning_rate": 7.551451311842109e-06, + "loss": 0.9124, + "step": 49875 + }, + { + "epoch": 14.92, + "grad_norm": 4.733458995819092, + "learning_rate": 7.547244236091533e-06, + "loss": 1.0675, + "step": 49880 + }, + { + "epoch": 14.93, + "grad_norm": 3.320866584777832, + "learning_rate": 7.543038124238516e-06, + "loss": 1.038, + "step": 49885 + }, + { + "epoch": 14.93, + "grad_norm": 1.8176350593566895, + "learning_rate": 7.5388329765153585e-06, + "loss": 1.1704, + "step": 49890 + }, + { + "epoch": 14.93, + "grad_norm": 3.144420623779297, + "learning_rate": 7.534628793154308e-06, + "loss": 0.948, + "step": 49895 + }, + { + "epoch": 14.93, + "grad_norm": 2.1764090061187744, + "learning_rate": 7.530425574387554e-06, + "loss": 0.9405, + "step": 49900 + }, + { + "epoch": 14.93, + "grad_norm": 2.065609931945801, + "learning_rate": 7.526223320447234e-06, + "loss": 1.005, + "step": 49905 + }, + { + "epoch": 14.93, + "grad_norm": 2.02253794670105, + "learning_rate": 7.52202203156544e-06, + "loss": 0.9092, + "step": 49910 + }, + { + "epoch": 14.93, + "grad_norm": 2.1637492179870605, + "learning_rate": 7.517821707974202e-06, + "loss": 0.8773, + "step": 49915 + }, + { + "epoch": 14.94, + "grad_norm": 10.103838920593262, + "learning_rate": 7.513622349905497e-06, + "loss": 0.9454, + "step": 49920 + }, + { + "epoch": 14.94, + "grad_norm": 2.852343797683716, + "learning_rate": 7.509423957591255e-06, + "loss": 1.1144, + "step": 49925 + }, + { + "epoch": 14.94, + "grad_norm": 3.75535249710083, + "learning_rate": 7.505226531263349e-06, + "loss": 1.0971, + "step": 49930 + }, + { + "epoch": 14.94, + "grad_norm": 2.4124093055725098, + "learning_rate": 7.501030071153594e-06, + "loss": 0.9609, + "step": 49935 + }, + { + "epoch": 14.94, + "grad_norm": 3.048701286315918, + "learning_rate": 7.496834577493761e-06, + "loss": 0.9755, + "step": 49940 + }, + { + "epoch": 14.94, + "grad_norm": 2.259594202041626, + "learning_rate": 7.492640050515567e-06, + "loss": 1.0239, + "step": 49945 + }, + { + "epoch": 14.94, + "grad_norm": 2.8681416511535645, + "learning_rate": 7.488446490450651e-06, + "loss": 0.9469, + "step": 49950 + }, + { + "epoch": 14.95, + "grad_norm": 3.7870285511016846, + "learning_rate": 7.484253897530649e-06, + "loss": 1.0347, + "step": 49955 + }, + { + "epoch": 14.95, + "grad_norm": 1.4063019752502441, + "learning_rate": 7.480900519694567e-06, + "loss": 0.9334, + "step": 49960 + }, + { + "epoch": 14.95, + "grad_norm": 2.873671293258667, + "learning_rate": 7.476709668218851e-06, + "loss": 1.0212, + "step": 49965 + }, + { + "epoch": 14.95, + "grad_norm": 4.571763515472412, + "learning_rate": 7.472519784536242e-06, + "loss": 0.9162, + "step": 49970 + }, + { + "epoch": 14.95, + "grad_norm": 2.2711031436920166, + "learning_rate": 7.468330868878149e-06, + "loss": 0.9683, + "step": 49975 + }, + { + "epoch": 14.95, + "grad_norm": 2.531252384185791, + "learning_rate": 7.464142921475919e-06, + "loss": 1.0777, + "step": 49980 + }, + { + "epoch": 14.95, + "grad_norm": 2.323441982269287, + "learning_rate": 7.459955942560848e-06, + "loss": 1.1148, + "step": 49985 + }, + { + "epoch": 14.96, + "grad_norm": 3.1378073692321777, + "learning_rate": 7.455769932364185e-06, + "loss": 1.0343, + "step": 49990 + }, + { + "epoch": 14.96, + "grad_norm": 3.8024823665618896, + "learning_rate": 7.4515848911170975e-06, + "loss": 0.9482, + "step": 49995 + }, + { + "epoch": 14.96, + "grad_norm": 1.5496140718460083, + "learning_rate": 7.447400819050751e-06, + "loss": 0.9271, + "step": 50000 + }, + { + "epoch": 14.96, + "grad_norm": 3.12982177734375, + "learning_rate": 7.443217716396198e-06, + "loss": 0.9457, + "step": 50005 + }, + { + "epoch": 14.96, + "grad_norm": 1.8738044500350952, + "learning_rate": 7.439035583384496e-06, + "loss": 1.1598, + "step": 50010 + }, + { + "epoch": 14.96, + "grad_norm": 5.965039253234863, + "learning_rate": 7.43485442024659e-06, + "loss": 0.8773, + "step": 50015 + }, + { + "epoch": 14.97, + "grad_norm": 4.203342914581299, + "learning_rate": 7.43067422721343e-06, + "loss": 0.7135, + "step": 50020 + }, + { + "epoch": 14.97, + "grad_norm": 3.4606893062591553, + "learning_rate": 7.426495004515865e-06, + "loss": 0.9092, + "step": 50025 + }, + { + "epoch": 14.97, + "grad_norm": 6.534817218780518, + "learning_rate": 7.422316752384711e-06, + "loss": 1.0587, + "step": 50030 + }, + { + "epoch": 14.97, + "grad_norm": 2.5901870727539062, + "learning_rate": 7.418139471050736e-06, + "loss": 1.0401, + "step": 50035 + }, + { + "epoch": 14.97, + "grad_norm": 3.344170331954956, + "learning_rate": 7.413963160744642e-06, + "loss": 1.0651, + "step": 50040 + }, + { + "epoch": 14.97, + "grad_norm": 4.951648712158203, + "learning_rate": 7.409787821697078e-06, + "loss": 1.0275, + "step": 50045 + }, + { + "epoch": 14.97, + "grad_norm": 2.5820136070251465, + "learning_rate": 7.405613454138655e-06, + "loss": 1.0403, + "step": 50050 + }, + { + "epoch": 14.98, + "grad_norm": 4.226273059844971, + "learning_rate": 7.4014400582999075e-06, + "loss": 0.7765, + "step": 50055 + }, + { + "epoch": 14.98, + "grad_norm": 2.259936571121216, + "learning_rate": 7.3972676344113366e-06, + "loss": 0.9865, + "step": 50060 + }, + { + "epoch": 14.98, + "grad_norm": 5.1020917892456055, + "learning_rate": 7.3930961827033765e-06, + "loss": 0.9311, + "step": 50065 + }, + { + "epoch": 14.98, + "grad_norm": 3.1059951782226562, + "learning_rate": 7.388925703406413e-06, + "loss": 1.0568, + "step": 50070 + }, + { + "epoch": 14.98, + "grad_norm": 2.5398685932159424, + "learning_rate": 7.384756196750775e-06, + "loss": 0.8391, + "step": 50075 + }, + { + "epoch": 14.98, + "grad_norm": 6.540744304656982, + "learning_rate": 7.380587662966743e-06, + "loss": 0.8211, + "step": 50080 + }, + { + "epoch": 14.98, + "grad_norm": 6.105128765106201, + "learning_rate": 7.376420102284543e-06, + "loss": 0.8739, + "step": 50085 + }, + { + "epoch": 14.99, + "grad_norm": 4.216371059417725, + "learning_rate": 7.372253514934338e-06, + "loss": 1.1114, + "step": 50090 + }, + { + "epoch": 14.99, + "grad_norm": 3.220215082168579, + "learning_rate": 7.368087901146259e-06, + "loss": 1.1085, + "step": 50095 + }, + { + "epoch": 14.99, + "grad_norm": 2.48602557182312, + "learning_rate": 7.36392326115034e-06, + "loss": 0.9271, + "step": 50100 + }, + { + "epoch": 14.99, + "grad_norm": 2.3491077423095703, + "learning_rate": 7.3597595951766245e-06, + "loss": 0.994, + "step": 50105 + }, + { + "epoch": 14.99, + "grad_norm": 2.6628293991088867, + "learning_rate": 7.3555969034550335e-06, + "loss": 1.0288, + "step": 50110 + }, + { + "epoch": 14.99, + "grad_norm": 3.2842299938201904, + "learning_rate": 7.351435186215502e-06, + "loss": 0.766, + "step": 50115 + }, + { + "epoch": 15.0, + "grad_norm": 8.026897430419922, + "learning_rate": 7.347274443687854e-06, + "loss": 0.8578, + "step": 50120 + }, + { + "epoch": 15.0, + "grad_norm": 1.960700273513794, + "learning_rate": 7.3431146761018866e-06, + "loss": 1.0008, + "step": 50125 + }, + { + "epoch": 15.0, + "grad_norm": 1.605670690536499, + "learning_rate": 7.338955883687346e-06, + "loss": 0.9886, + "step": 50130 + }, + { + "epoch": 15.0, + "grad_norm": 4.049495697021484, + "learning_rate": 7.334798066673912e-06, + "loss": 1.1398, + "step": 50135 + }, + { + "epoch": 15.0, + "grad_norm": 1.0559757947921753, + "learning_rate": 7.330641225291218e-06, + "loss": 1.0428, + "step": 50140 + }, + { + "epoch": 15.0, + "grad_norm": 3.241338014602661, + "learning_rate": 7.326485359768845e-06, + "loss": 0.7697, + "step": 50145 + }, + { + "epoch": 15.0, + "grad_norm": 3.934131145477295, + "learning_rate": 7.3223304703363135e-06, + "loss": 0.8568, + "step": 50150 + }, + { + "epoch": 15.01, + "grad_norm": 1.8612024784088135, + "learning_rate": 7.318176557223097e-06, + "loss": 1.0974, + "step": 50155 + }, + { + "epoch": 15.01, + "grad_norm": 4.100109100341797, + "learning_rate": 7.314023620658608e-06, + "loss": 1.0554, + "step": 50160 + }, + { + "epoch": 15.01, + "grad_norm": 5.153068542480469, + "learning_rate": 7.309871660872211e-06, + "loss": 1.0952, + "step": 50165 + }, + { + "epoch": 15.01, + "grad_norm": 3.603163003921509, + "learning_rate": 7.305720678093214e-06, + "loss": 0.8898, + "step": 50170 + }, + { + "epoch": 15.01, + "grad_norm": 2.5232720375061035, + "learning_rate": 7.301570672550873e-06, + "loss": 0.909, + "step": 50175 + }, + { + "epoch": 15.01, + "grad_norm": 1.8282480239868164, + "learning_rate": 7.297421644474389e-06, + "loss": 0.8844, + "step": 50180 + }, + { + "epoch": 15.01, + "grad_norm": 3.7669854164123535, + "learning_rate": 7.293273594092903e-06, + "loss": 0.8167, + "step": 50185 + }, + { + "epoch": 15.02, + "grad_norm": 3.2310571670532227, + "learning_rate": 7.289126521635522e-06, + "loss": 0.8983, + "step": 50190 + }, + { + "epoch": 15.02, + "grad_norm": 3.063481569290161, + "learning_rate": 7.284980427331256e-06, + "loss": 0.9985, + "step": 50195 + }, + { + "epoch": 15.02, + "grad_norm": 4.190944671630859, + "learning_rate": 7.280835311409123e-06, + "loss": 1.027, + "step": 50200 + }, + { + "epoch": 15.02, + "grad_norm": 3.723928213119507, + "learning_rate": 7.276691174098024e-06, + "loss": 1.0427, + "step": 50205 + }, + { + "epoch": 15.02, + "grad_norm": 1.9666212797164917, + "learning_rate": 7.272548015626865e-06, + "loss": 1.149, + "step": 50210 + }, + { + "epoch": 15.02, + "grad_norm": 2.085639238357544, + "learning_rate": 7.268405836224443e-06, + "loss": 0.9174, + "step": 50215 + }, + { + "epoch": 15.03, + "grad_norm": 2.700467586517334, + "learning_rate": 7.264264636119536e-06, + "loss": 1.0293, + "step": 50220 + }, + { + "epoch": 15.03, + "grad_norm": 2.4150545597076416, + "learning_rate": 7.260124415540859e-06, + "loss": 0.9693, + "step": 50225 + }, + { + "epoch": 15.03, + "grad_norm": 1.4733829498291016, + "learning_rate": 7.255985174717067e-06, + "loss": 0.9674, + "step": 50230 + }, + { + "epoch": 15.03, + "grad_norm": 3.151942253112793, + "learning_rate": 7.251846913876772e-06, + "loss": 0.8926, + "step": 50235 + }, + { + "epoch": 15.03, + "grad_norm": 3.5010616779327393, + "learning_rate": 7.247709633248526e-06, + "loss": 0.8898, + "step": 50240 + }, + { + "epoch": 15.03, + "grad_norm": 3.5921566486358643, + "learning_rate": 7.243573333060824e-06, + "loss": 0.9497, + "step": 50245 + }, + { + "epoch": 15.03, + "grad_norm": 2.436879873275757, + "learning_rate": 7.239438013542107e-06, + "loss": 1.0677, + "step": 50250 + }, + { + "epoch": 15.04, + "grad_norm": 2.137514352798462, + "learning_rate": 7.235303674920771e-06, + "loss": 1.1743, + "step": 50255 + }, + { + "epoch": 15.04, + "grad_norm": 2.3740313053131104, + "learning_rate": 7.2311703174251454e-06, + "loss": 0.9629, + "step": 50260 + }, + { + "epoch": 15.04, + "grad_norm": 3.162233591079712, + "learning_rate": 7.227037941283515e-06, + "loss": 1.0236, + "step": 50265 + }, + { + "epoch": 15.04, + "grad_norm": 3.699193000793457, + "learning_rate": 7.222906546724104e-06, + "loss": 0.955, + "step": 50270 + }, + { + "epoch": 15.04, + "grad_norm": 2.988489866256714, + "learning_rate": 7.218776133975086e-06, + "loss": 1.1703, + "step": 50275 + }, + { + "epoch": 15.04, + "grad_norm": 3.407639980316162, + "learning_rate": 7.21464670326458e-06, + "loss": 0.8461, + "step": 50280 + }, + { + "epoch": 15.04, + "grad_norm": 1.7939776182174683, + "learning_rate": 7.210518254820658e-06, + "loss": 1.1161, + "step": 50285 + }, + { + "epoch": 15.05, + "grad_norm": 4.75887393951416, + "learning_rate": 7.206390788871306e-06, + "loss": 0.9254, + "step": 50290 + }, + { + "epoch": 15.05, + "grad_norm": 2.7789371013641357, + "learning_rate": 7.20226430564451e-06, + "loss": 0.9432, + "step": 50295 + }, + { + "epoch": 15.05, + "grad_norm": 5.088046073913574, + "learning_rate": 7.198138805368143e-06, + "loss": 1.1504, + "step": 50300 + }, + { + "epoch": 15.05, + "grad_norm": 3.722242593765259, + "learning_rate": 7.194014288270079e-06, + "loss": 1.0487, + "step": 50305 + }, + { + "epoch": 15.05, + "grad_norm": 1.9623161554336548, + "learning_rate": 7.189890754578083e-06, + "loss": 0.9153, + "step": 50310 + }, + { + "epoch": 15.05, + "grad_norm": 1.74498450756073, + "learning_rate": 7.185768204519924e-06, + "loss": 0.976, + "step": 50315 + }, + { + "epoch": 15.06, + "grad_norm": 1.8223540782928467, + "learning_rate": 7.181646638323261e-06, + "loss": 0.9415, + "step": 50320 + }, + { + "epoch": 15.06, + "grad_norm": 2.9013192653656006, + "learning_rate": 7.177526056215733e-06, + "loss": 0.9538, + "step": 50325 + }, + { + "epoch": 15.06, + "grad_norm": 2.3087453842163086, + "learning_rate": 7.173406458424917e-06, + "loss": 1.1462, + "step": 50330 + }, + { + "epoch": 15.06, + "grad_norm": 1.529740333557129, + "learning_rate": 7.169287845178335e-06, + "loss": 0.9129, + "step": 50335 + }, + { + "epoch": 15.06, + "grad_norm": 3.4274179935455322, + "learning_rate": 7.165170216703446e-06, + "loss": 1.035, + "step": 50340 + }, + { + "epoch": 15.06, + "grad_norm": 2.6179728507995605, + "learning_rate": 7.161053573227671e-06, + "loss": 1.0247, + "step": 50345 + }, + { + "epoch": 15.06, + "grad_norm": 3.914541721343994, + "learning_rate": 7.156937914978365e-06, + "loss": 0.888, + "step": 50350 + }, + { + "epoch": 15.07, + "grad_norm": 2.483247995376587, + "learning_rate": 7.152823242182829e-06, + "loss": 1.1135, + "step": 50355 + }, + { + "epoch": 15.07, + "grad_norm": 2.098078727722168, + "learning_rate": 7.148709555068314e-06, + "loss": 1.1288, + "step": 50360 + }, + { + "epoch": 15.07, + "grad_norm": 1.533263921737671, + "learning_rate": 7.144596853862015e-06, + "loss": 1.0257, + "step": 50365 + }, + { + "epoch": 15.07, + "grad_norm": 2.030672550201416, + "learning_rate": 7.140485138791075e-06, + "loss": 0.9171, + "step": 50370 + }, + { + "epoch": 15.07, + "grad_norm": 2.625800371170044, + "learning_rate": 7.136374410082575e-06, + "loss": 1.0061, + "step": 50375 + }, + { + "epoch": 15.07, + "grad_norm": 2.7783334255218506, + "learning_rate": 7.132264667963556e-06, + "loss": 0.9497, + "step": 50380 + }, + { + "epoch": 15.07, + "grad_norm": 1.379747748374939, + "learning_rate": 7.128155912660972e-06, + "loss": 0.8702, + "step": 50385 + }, + { + "epoch": 15.08, + "grad_norm": 2.8253347873687744, + "learning_rate": 7.124048144401774e-06, + "loss": 0.8353, + "step": 50390 + }, + { + "epoch": 15.08, + "grad_norm": 3.046046733856201, + "learning_rate": 7.119941363412802e-06, + "loss": 0.8835, + "step": 50395 + }, + { + "epoch": 15.08, + "grad_norm": 2.3068737983703613, + "learning_rate": 7.115835569920898e-06, + "loss": 1.0548, + "step": 50400 + }, + { + "epoch": 15.08, + "grad_norm": 2.7872204780578613, + "learning_rate": 7.111730764152791e-06, + "loss": 1.0162, + "step": 50405 + }, + { + "epoch": 15.08, + "grad_norm": 1.9851144552230835, + "learning_rate": 7.107626946335214e-06, + "loss": 0.8412, + "step": 50410 + }, + { + "epoch": 15.08, + "grad_norm": 2.1134822368621826, + "learning_rate": 7.103524116694795e-06, + "loss": 1.049, + "step": 50415 + }, + { + "epoch": 15.09, + "grad_norm": 1.651341199874878, + "learning_rate": 7.0994222754581395e-06, + "loss": 1.052, + "step": 50420 + }, + { + "epoch": 15.09, + "grad_norm": 1.9419851303100586, + "learning_rate": 7.095321422851784e-06, + "loss": 1.0539, + "step": 50425 + }, + { + "epoch": 15.09, + "grad_norm": 2.2368361949920654, + "learning_rate": 7.091221559102212e-06, + "loss": 0.9218, + "step": 50430 + }, + { + "epoch": 15.09, + "grad_norm": 1.586853265762329, + "learning_rate": 7.087122684435862e-06, + "loss": 0.9266, + "step": 50435 + }, + { + "epoch": 15.09, + "grad_norm": 2.1675949096679688, + "learning_rate": 7.083024799079099e-06, + "loss": 0.9255, + "step": 50440 + }, + { + "epoch": 15.09, + "grad_norm": 2.4687063694000244, + "learning_rate": 7.078927903258267e-06, + "loss": 0.9003, + "step": 50445 + }, + { + "epoch": 15.09, + "grad_norm": 3.6023154258728027, + "learning_rate": 7.0748319971996104e-06, + "loss": 1.0, + "step": 50450 + }, + { + "epoch": 15.1, + "grad_norm": 3.7124223709106445, + "learning_rate": 7.070737081129353e-06, + "loss": 0.8707, + "step": 50455 + }, + { + "epoch": 15.1, + "grad_norm": 2.767544746398926, + "learning_rate": 7.066643155273647e-06, + "loss": 1.0032, + "step": 50460 + }, + { + "epoch": 15.1, + "grad_norm": 3.66680645942688, + "learning_rate": 7.062550219858602e-06, + "loss": 1.023, + "step": 50465 + }, + { + "epoch": 15.1, + "grad_norm": 4.994945049285889, + "learning_rate": 7.058458275110261e-06, + "loss": 1.0194, + "step": 50470 + }, + { + "epoch": 15.1, + "grad_norm": 4.854514122009277, + "learning_rate": 7.054367321254629e-06, + "loss": 0.9221, + "step": 50475 + }, + { + "epoch": 15.1, + "grad_norm": 3.2869067192077637, + "learning_rate": 7.050277358517618e-06, + "loss": 0.8274, + "step": 50480 + }, + { + "epoch": 15.1, + "grad_norm": 3.371748685836792, + "learning_rate": 7.046188387125149e-06, + "loss": 1.0187, + "step": 50485 + }, + { + "epoch": 15.11, + "grad_norm": 2.176785945892334, + "learning_rate": 7.042100407303018e-06, + "loss": 0.9843, + "step": 50490 + }, + { + "epoch": 15.11, + "grad_norm": 2.4034008979797363, + "learning_rate": 7.038013419277034e-06, + "loss": 1.0416, + "step": 50495 + }, + { + "epoch": 15.11, + "grad_norm": 3.674947500228882, + "learning_rate": 7.033927423272879e-06, + "loss": 0.9973, + "step": 50500 + }, + { + "epoch": 15.11, + "grad_norm": 2.3554413318634033, + "learning_rate": 7.029842419516253e-06, + "loss": 1.0056, + "step": 50505 + }, + { + "epoch": 15.11, + "grad_norm": 4.33696985244751, + "learning_rate": 7.025758408232744e-06, + "loss": 0.8275, + "step": 50510 + }, + { + "epoch": 15.11, + "grad_norm": 2.8192827701568604, + "learning_rate": 7.021675389647916e-06, + "loss": 0.9531, + "step": 50515 + }, + { + "epoch": 15.12, + "grad_norm": 1.3056793212890625, + "learning_rate": 7.017593363987268e-06, + "loss": 0.9896, + "step": 50520 + }, + { + "epoch": 15.12, + "grad_norm": 4.758703708648682, + "learning_rate": 7.013512331476238e-06, + "loss": 1.0297, + "step": 50525 + }, + { + "epoch": 15.12, + "grad_norm": 2.7597901821136475, + "learning_rate": 7.009432292340243e-06, + "loss": 0.9807, + "step": 50530 + }, + { + "epoch": 15.12, + "grad_norm": 2.2206053733825684, + "learning_rate": 7.0053532468045855e-06, + "loss": 0.8794, + "step": 50535 + }, + { + "epoch": 15.12, + "grad_norm": 1.7539916038513184, + "learning_rate": 7.001275195094581e-06, + "loss": 1.0552, + "step": 50540 + }, + { + "epoch": 15.12, + "grad_norm": 1.2006642818450928, + "learning_rate": 6.997198137435432e-06, + "loss": 0.8482, + "step": 50545 + }, + { + "epoch": 15.12, + "grad_norm": 2.1931300163269043, + "learning_rate": 6.993122074052314e-06, + "loss": 0.8871, + "step": 50550 + }, + { + "epoch": 15.13, + "grad_norm": 1.5462733507156372, + "learning_rate": 6.989047005170349e-06, + "loss": 1.0243, + "step": 50555 + }, + { + "epoch": 15.13, + "grad_norm": 2.5775654315948486, + "learning_rate": 6.984972931014597e-06, + "loss": 1.0924, + "step": 50560 + }, + { + "epoch": 15.13, + "grad_norm": 1.4666215181350708, + "learning_rate": 6.980899851810061e-06, + "loss": 1.0157, + "step": 50565 + }, + { + "epoch": 15.13, + "grad_norm": 1.5045851469039917, + "learning_rate": 6.9768277677817075e-06, + "loss": 1.0205, + "step": 50570 + }, + { + "epoch": 15.13, + "grad_norm": 3.089066505432129, + "learning_rate": 6.972756679154404e-06, + "loss": 1.0415, + "step": 50575 + }, + { + "epoch": 15.13, + "grad_norm": 1.6471816301345825, + "learning_rate": 6.9686865861530285e-06, + "loss": 1.0791, + "step": 50580 + }, + { + "epoch": 15.13, + "grad_norm": 1.9261685609817505, + "learning_rate": 6.964617489002334e-06, + "loss": 0.8684, + "step": 50585 + }, + { + "epoch": 15.14, + "grad_norm": 14.480093002319336, + "learning_rate": 6.960549387927087e-06, + "loss": 0.8348, + "step": 50590 + }, + { + "epoch": 15.14, + "grad_norm": 3.1349940299987793, + "learning_rate": 6.956482283151927e-06, + "loss": 1.0708, + "step": 50595 + }, + { + "epoch": 15.14, + "grad_norm": 3.3612797260284424, + "learning_rate": 6.952416174901505e-06, + "loss": 1.0016, + "step": 50600 + }, + { + "epoch": 15.14, + "grad_norm": 1.3571832180023193, + "learning_rate": 6.948351063400385e-06, + "loss": 1.0697, + "step": 50605 + }, + { + "epoch": 15.14, + "grad_norm": 1.7172069549560547, + "learning_rate": 6.9442869488730574e-06, + "loss": 0.9281, + "step": 50610 + }, + { + "epoch": 15.14, + "grad_norm": 1.9647376537322998, + "learning_rate": 6.94022383154401e-06, + "loss": 1.0363, + "step": 50615 + }, + { + "epoch": 15.14, + "grad_norm": 2.1266095638275146, + "learning_rate": 6.936161711637612e-06, + "loss": 0.8694, + "step": 50620 + }, + { + "epoch": 15.15, + "grad_norm": 2.7892374992370605, + "learning_rate": 6.9321005893782404e-06, + "loss": 1.0771, + "step": 50625 + }, + { + "epoch": 15.15, + "grad_norm": 3.818957805633545, + "learning_rate": 6.92804046499016e-06, + "loss": 1.0741, + "step": 50630 + }, + { + "epoch": 15.15, + "grad_norm": 3.792024850845337, + "learning_rate": 6.923981338697632e-06, + "loss": 1.1138, + "step": 50635 + }, + { + "epoch": 15.15, + "grad_norm": 9.00833511352539, + "learning_rate": 6.919923210724821e-06, + "loss": 0.8402, + "step": 50640 + }, + { + "epoch": 15.15, + "grad_norm": 3.8722808361053467, + "learning_rate": 6.915866081295858e-06, + "loss": 0.9153, + "step": 50645 + }, + { + "epoch": 15.15, + "grad_norm": 10.343985557556152, + "learning_rate": 6.911809950634813e-06, + "loss": 0.9226, + "step": 50650 + }, + { + "epoch": 15.16, + "grad_norm": 4.360511779785156, + "learning_rate": 6.9077548189657e-06, + "loss": 0.8344, + "step": 50655 + }, + { + "epoch": 15.16, + "grad_norm": 3.0080530643463135, + "learning_rate": 6.903700686512488e-06, + "loss": 0.9623, + "step": 50660 + }, + { + "epoch": 15.16, + "grad_norm": 3.3220059871673584, + "learning_rate": 6.899647553499073e-06, + "loss": 0.8129, + "step": 50665 + }, + { + "epoch": 15.16, + "grad_norm": 1.465591549873352, + "learning_rate": 6.895595420149309e-06, + "loss": 0.9545, + "step": 50670 + }, + { + "epoch": 15.16, + "grad_norm": 2.015324115753174, + "learning_rate": 6.891544286687002e-06, + "loss": 1.1375, + "step": 50675 + }, + { + "epoch": 15.16, + "grad_norm": 2.517958164215088, + "learning_rate": 6.887494153335863e-06, + "loss": 1.1182, + "step": 50680 + }, + { + "epoch": 15.16, + "grad_norm": 4.011493682861328, + "learning_rate": 6.883445020319604e-06, + "loss": 1.1003, + "step": 50685 + }, + { + "epoch": 15.17, + "grad_norm": 3.0748703479766846, + "learning_rate": 6.879396887861847e-06, + "loss": 1.0012, + "step": 50690 + }, + { + "epoch": 15.17, + "grad_norm": 2.5038511753082275, + "learning_rate": 6.8753497561861615e-06, + "loss": 0.8941, + "step": 50695 + }, + { + "epoch": 15.17, + "grad_norm": 3.3688976764678955, + "learning_rate": 6.871303625516079e-06, + "loss": 0.9082, + "step": 50700 + }, + { + "epoch": 15.17, + "grad_norm": 1.502925992012024, + "learning_rate": 6.867258496075038e-06, + "loss": 1.0297, + "step": 50705 + }, + { + "epoch": 15.17, + "grad_norm": 2.218376636505127, + "learning_rate": 6.863214368086479e-06, + "loss": 0.7532, + "step": 50710 + }, + { + "epoch": 15.17, + "grad_norm": 3.5436737537384033, + "learning_rate": 6.859171241773721e-06, + "loss": 1.0066, + "step": 50715 + }, + { + "epoch": 15.17, + "grad_norm": 3.2597169876098633, + "learning_rate": 6.855129117360096e-06, + "loss": 0.9328, + "step": 50720 + }, + { + "epoch": 15.18, + "grad_norm": 3.4346742630004883, + "learning_rate": 6.851087995068811e-06, + "loss": 0.7659, + "step": 50725 + }, + { + "epoch": 15.18, + "grad_norm": 1.6817134618759155, + "learning_rate": 6.847047875123089e-06, + "loss": 0.9972, + "step": 50730 + }, + { + "epoch": 15.18, + "grad_norm": 3.8097851276397705, + "learning_rate": 6.843008757746036e-06, + "loss": 1.0971, + "step": 50735 + }, + { + "epoch": 15.18, + "grad_norm": 4.3598551750183105, + "learning_rate": 6.838970643160736e-06, + "loss": 0.9112, + "step": 50740 + }, + { + "epoch": 15.18, + "grad_norm": 7.945572853088379, + "learning_rate": 6.834933531590209e-06, + "loss": 1.0161, + "step": 50745 + }, + { + "epoch": 15.18, + "grad_norm": 2.397972345352173, + "learning_rate": 6.830897423257423e-06, + "loss": 1.1107, + "step": 50750 + }, + { + "epoch": 15.19, + "grad_norm": 3.211143732070923, + "learning_rate": 6.826862318385285e-06, + "loss": 1.0359, + "step": 50755 + }, + { + "epoch": 15.19, + "grad_norm": 1.8211517333984375, + "learning_rate": 6.822828217196656e-06, + "loss": 0.9641, + "step": 50760 + }, + { + "epoch": 15.19, + "grad_norm": 3.2200350761413574, + "learning_rate": 6.818795119914326e-06, + "loss": 1.0015, + "step": 50765 + }, + { + "epoch": 15.19, + "grad_norm": 1.9324058294296265, + "learning_rate": 6.8147630267610454e-06, + "loss": 1.0077, + "step": 50770 + }, + { + "epoch": 15.19, + "grad_norm": 2.95210337638855, + "learning_rate": 6.810731937959497e-06, + "loss": 1.1641, + "step": 50775 + }, + { + "epoch": 15.19, + "grad_norm": 1.5571585893630981, + "learning_rate": 6.8067018537323195e-06, + "loss": 1.0368, + "step": 50780 + }, + { + "epoch": 15.19, + "grad_norm": 12.76740550994873, + "learning_rate": 6.802672774302088e-06, + "loss": 0.9348, + "step": 50785 + }, + { + "epoch": 15.2, + "grad_norm": 1.9661290645599365, + "learning_rate": 6.798644699891321e-06, + "loss": 1.102, + "step": 50790 + }, + { + "epoch": 15.2, + "grad_norm": 1.8433258533477783, + "learning_rate": 6.794617630722491e-06, + "loss": 1.1175, + "step": 50795 + }, + { + "epoch": 15.2, + "grad_norm": 5.543290138244629, + "learning_rate": 6.7905915670180065e-06, + "loss": 0.9451, + "step": 50800 + }, + { + "epoch": 15.2, + "grad_norm": 4.0879998207092285, + "learning_rate": 6.786566509000228e-06, + "loss": 1.0948, + "step": 50805 + }, + { + "epoch": 15.2, + "grad_norm": 3.9433319568634033, + "learning_rate": 6.782542456891433e-06, + "loss": 1.0932, + "step": 50810 + }, + { + "epoch": 15.2, + "grad_norm": 1.188195824623108, + "learning_rate": 6.7785194109139e-06, + "loss": 1.0968, + "step": 50815 + }, + { + "epoch": 15.2, + "grad_norm": 1.7227709293365479, + "learning_rate": 6.774497371289784e-06, + "loss": 0.9657, + "step": 50820 + }, + { + "epoch": 15.21, + "grad_norm": 0.8999642133712769, + "learning_rate": 6.770476338241247e-06, + "loss": 0.9252, + "step": 50825 + }, + { + "epoch": 15.21, + "grad_norm": 2.9426021575927734, + "learning_rate": 6.766456311990347e-06, + "loss": 0.9599, + "step": 50830 + }, + { + "epoch": 15.21, + "grad_norm": 1.2726308107376099, + "learning_rate": 6.7624372927591135e-06, + "loss": 0.953, + "step": 50835 + }, + { + "epoch": 15.21, + "grad_norm": 1.873698353767395, + "learning_rate": 6.75841928076951e-06, + "loss": 0.8177, + "step": 50840 + }, + { + "epoch": 15.21, + "grad_norm": 3.4419801235198975, + "learning_rate": 6.7544022762434486e-06, + "loss": 0.9171, + "step": 50845 + }, + { + "epoch": 15.21, + "grad_norm": 2.00152325630188, + "learning_rate": 6.750386279402784e-06, + "loss": 0.981, + "step": 50850 + }, + { + "epoch": 15.22, + "grad_norm": 1.6282256841659546, + "learning_rate": 6.7463712904693165e-06, + "loss": 0.9112, + "step": 50855 + }, + { + "epoch": 15.22, + "grad_norm": 3.2624893188476562, + "learning_rate": 6.742357309664787e-06, + "loss": 0.9756, + "step": 50860 + }, + { + "epoch": 15.22, + "grad_norm": 3.1753547191619873, + "learning_rate": 6.738344337210889e-06, + "loss": 1.0622, + "step": 50865 + }, + { + "epoch": 15.22, + "grad_norm": 4.24165678024292, + "learning_rate": 6.73433237332925e-06, + "loss": 0.9718, + "step": 50870 + }, + { + "epoch": 15.22, + "grad_norm": 3.261056661605835, + "learning_rate": 6.730321418241448e-06, + "loss": 0.9434, + "step": 50875 + }, + { + "epoch": 15.22, + "grad_norm": 3.543851852416992, + "learning_rate": 6.726311472169003e-06, + "loss": 1.0265, + "step": 50880 + }, + { + "epoch": 15.22, + "grad_norm": 2.614902973175049, + "learning_rate": 6.72230253533338e-06, + "loss": 0.8983, + "step": 50885 + }, + { + "epoch": 15.23, + "grad_norm": 2.52410626411438, + "learning_rate": 6.718294607955991e-06, + "loss": 0.9063, + "step": 50890 + }, + { + "epoch": 15.23, + "grad_norm": 1.27847421169281, + "learning_rate": 6.7142876902581895e-06, + "loss": 0.9997, + "step": 50895 + }, + { + "epoch": 15.23, + "grad_norm": 1.5926611423492432, + "learning_rate": 6.710281782461275e-06, + "loss": 0.9342, + "step": 50900 + }, + { + "epoch": 15.23, + "grad_norm": 2.9667863845825195, + "learning_rate": 6.706276884786475e-06, + "loss": 1.1293, + "step": 50905 + }, + { + "epoch": 15.23, + "grad_norm": 2.5114152431488037, + "learning_rate": 6.702272997455e-06, + "loss": 1.1009, + "step": 50910 + }, + { + "epoch": 15.23, + "grad_norm": 3.5501632690429688, + "learning_rate": 6.698270120687955e-06, + "loss": 0.9759, + "step": 50915 + }, + { + "epoch": 15.23, + "grad_norm": 5.344282627105713, + "learning_rate": 6.694268254706443e-06, + "loss": 0.8718, + "step": 50920 + }, + { + "epoch": 15.24, + "grad_norm": 4.164098262786865, + "learning_rate": 6.690267399731459e-06, + "loss": 0.976, + "step": 50925 + }, + { + "epoch": 15.24, + "grad_norm": 2.1116974353790283, + "learning_rate": 6.6862675559839744e-06, + "loss": 0.8628, + "step": 50930 + }, + { + "epoch": 15.24, + "grad_norm": 1.2934943437576294, + "learning_rate": 6.682268723684898e-06, + "loss": 0.9742, + "step": 50935 + }, + { + "epoch": 15.24, + "grad_norm": 1.8546779155731201, + "learning_rate": 6.678270903055078e-06, + "loss": 0.9964, + "step": 50940 + }, + { + "epoch": 15.24, + "grad_norm": 1.931983470916748, + "learning_rate": 6.674274094315311e-06, + "loss": 0.8238, + "step": 50945 + }, + { + "epoch": 15.24, + "grad_norm": 1.8833105564117432, + "learning_rate": 6.670278297686341e-06, + "loss": 0.9469, + "step": 50950 + }, + { + "epoch": 15.25, + "grad_norm": 2.2532496452331543, + "learning_rate": 6.666283513388844e-06, + "loss": 0.9581, + "step": 50955 + }, + { + "epoch": 15.25, + "grad_norm": 3.5239064693450928, + "learning_rate": 6.662289741643454e-06, + "loss": 1.1331, + "step": 50960 + }, + { + "epoch": 15.25, + "grad_norm": 4.513704299926758, + "learning_rate": 6.658296982670739e-06, + "loss": 0.9937, + "step": 50965 + }, + { + "epoch": 15.25, + "grad_norm": 3.6698179244995117, + "learning_rate": 6.654305236691219e-06, + "loss": 0.9723, + "step": 50970 + }, + { + "epoch": 15.25, + "grad_norm": 2.5509257316589355, + "learning_rate": 6.650314503925348e-06, + "loss": 1.0257, + "step": 50975 + }, + { + "epoch": 15.25, + "grad_norm": 4.316878795623779, + "learning_rate": 6.646324784593536e-06, + "loss": 0.8953, + "step": 50980 + }, + { + "epoch": 15.25, + "grad_norm": 2.966710090637207, + "learning_rate": 6.6423360789161285e-06, + "loss": 1.0324, + "step": 50985 + }, + { + "epoch": 15.26, + "grad_norm": 3.1171488761901855, + "learning_rate": 6.638348387113416e-06, + "loss": 0.92, + "step": 50990 + }, + { + "epoch": 15.26, + "grad_norm": 4.053133487701416, + "learning_rate": 6.634361709405645e-06, + "loss": 0.9734, + "step": 50995 + }, + { + "epoch": 15.26, + "grad_norm": 2.0579705238342285, + "learning_rate": 6.630376046012973e-06, + "loss": 0.9514, + "step": 51000 + }, + { + "epoch": 15.26, + "grad_norm": 1.2330838441848755, + "learning_rate": 6.6263913971555515e-06, + "loss": 1.0869, + "step": 51005 + }, + { + "epoch": 15.26, + "grad_norm": 4.011294841766357, + "learning_rate": 6.6224077630534205e-06, + "loss": 1.148, + "step": 51010 + }, + { + "epoch": 15.26, + "grad_norm": 1.641052007675171, + "learning_rate": 6.618425143926618e-06, + "loss": 1.0063, + "step": 51015 + }, + { + "epoch": 15.26, + "grad_norm": 1.6458262205123901, + "learning_rate": 6.614443539995074e-06, + "loss": 0.9671, + "step": 51020 + }, + { + "epoch": 15.27, + "grad_norm": 4.2398529052734375, + "learning_rate": 6.6104629514787215e-06, + "loss": 0.902, + "step": 51025 + }, + { + "epoch": 15.27, + "grad_norm": 2.740884780883789, + "learning_rate": 6.606483378597375e-06, + "loss": 0.9838, + "step": 51030 + }, + { + "epoch": 15.27, + "grad_norm": 1.063081979751587, + "learning_rate": 6.6025048215708345e-06, + "loss": 0.7767, + "step": 51035 + }, + { + "epoch": 15.27, + "grad_norm": 1.965700387954712, + "learning_rate": 6.598527280618827e-06, + "loss": 1.1383, + "step": 51040 + }, + { + "epoch": 15.27, + "grad_norm": 5.353956699371338, + "learning_rate": 6.594550755961032e-06, + "loss": 0.9201, + "step": 51045 + }, + { + "epoch": 15.27, + "grad_norm": 3.3293633460998535, + "learning_rate": 6.590575247817069e-06, + "loss": 1.0393, + "step": 51050 + }, + { + "epoch": 15.28, + "grad_norm": 3.897473096847534, + "learning_rate": 6.586600756406494e-06, + "loss": 0.8148, + "step": 51055 + }, + { + "epoch": 15.28, + "grad_norm": 2.9738924503326416, + "learning_rate": 6.582627281948825e-06, + "loss": 1.107, + "step": 51060 + }, + { + "epoch": 15.28, + "grad_norm": 1.3063501119613647, + "learning_rate": 6.578654824663505e-06, + "loss": 0.9212, + "step": 51065 + }, + { + "epoch": 15.28, + "grad_norm": 2.1776340007781982, + "learning_rate": 6.574683384769933e-06, + "loss": 1.1221, + "step": 51070 + }, + { + "epoch": 15.28, + "grad_norm": 2.1266050338745117, + "learning_rate": 6.570712962487444e-06, + "loss": 0.9498, + "step": 51075 + }, + { + "epoch": 15.28, + "grad_norm": 1.2182793617248535, + "learning_rate": 6.566743558035324e-06, + "loss": 0.9176, + "step": 51080 + }, + { + "epoch": 15.28, + "grad_norm": 11.197833061218262, + "learning_rate": 6.562775171632793e-06, + "loss": 0.8579, + "step": 51085 + }, + { + "epoch": 15.29, + "grad_norm": 1.7195172309875488, + "learning_rate": 6.558807803499034e-06, + "loss": 1.1437, + "step": 51090 + }, + { + "epoch": 15.29, + "grad_norm": 2.056217670440674, + "learning_rate": 6.554841453853139e-06, + "loss": 1.1385, + "step": 51095 + }, + { + "epoch": 15.29, + "grad_norm": 5.98038387298584, + "learning_rate": 6.55087612291419e-06, + "loss": 0.7964, + "step": 51100 + }, + { + "epoch": 15.29, + "grad_norm": 5.708714962005615, + "learning_rate": 6.54691181090116e-06, + "loss": 0.9003, + "step": 51105 + }, + { + "epoch": 15.29, + "grad_norm": 1.2579269409179688, + "learning_rate": 6.542948518033029e-06, + "loss": 1.0455, + "step": 51110 + }, + { + "epoch": 15.29, + "grad_norm": 1.0137869119644165, + "learning_rate": 6.538986244528647e-06, + "loss": 0.8792, + "step": 51115 + }, + { + "epoch": 15.29, + "grad_norm": 1.5064419507980347, + "learning_rate": 6.535024990606883e-06, + "loss": 0.8194, + "step": 51120 + }, + { + "epoch": 15.3, + "grad_norm": 2.8707802295684814, + "learning_rate": 6.531064756486488e-06, + "loss": 1.0377, + "step": 51125 + }, + { + "epoch": 15.3, + "grad_norm": 2.328263521194458, + "learning_rate": 6.527105542386189e-06, + "loss": 0.9897, + "step": 51130 + }, + { + "epoch": 15.3, + "grad_norm": 5.99672269821167, + "learning_rate": 6.523147348524649e-06, + "loss": 1.0021, + "step": 51135 + }, + { + "epoch": 15.3, + "grad_norm": 1.5641214847564697, + "learning_rate": 6.519190175120473e-06, + "loss": 1.1588, + "step": 51140 + }, + { + "epoch": 15.3, + "grad_norm": 6.234478950500488, + "learning_rate": 6.515234022392217e-06, + "loss": 0.9205, + "step": 51145 + }, + { + "epoch": 15.3, + "grad_norm": 1.717020869255066, + "learning_rate": 6.511278890558373e-06, + "loss": 1.0511, + "step": 51150 + }, + { + "epoch": 15.3, + "grad_norm": 3.1675846576690674, + "learning_rate": 6.507324779837376e-06, + "loss": 0.8302, + "step": 51155 + }, + { + "epoch": 15.31, + "grad_norm": 2.2740492820739746, + "learning_rate": 6.503371690447607e-06, + "loss": 0.8826, + "step": 51160 + }, + { + "epoch": 15.31, + "grad_norm": 3.2151083946228027, + "learning_rate": 6.499419622607397e-06, + "loss": 0.9839, + "step": 51165 + }, + { + "epoch": 15.31, + "grad_norm": 5.2867560386657715, + "learning_rate": 6.49546857653501e-06, + "loss": 0.9079, + "step": 51170 + }, + { + "epoch": 15.31, + "grad_norm": 2.4737038612365723, + "learning_rate": 6.491518552448658e-06, + "loss": 0.989, + "step": 51175 + }, + { + "epoch": 15.31, + "grad_norm": 2.2063138484954834, + "learning_rate": 6.487569550566499e-06, + "loss": 0.8596, + "step": 51180 + }, + { + "epoch": 15.31, + "grad_norm": 3.6992015838623047, + "learning_rate": 6.48362157110664e-06, + "loss": 1.1441, + "step": 51185 + }, + { + "epoch": 15.32, + "grad_norm": 4.015367031097412, + "learning_rate": 6.479674614287098e-06, + "loss": 1.054, + "step": 51190 + }, + { + "epoch": 15.32, + "grad_norm": 2.584434747695923, + "learning_rate": 6.47572868032589e-06, + "loss": 1.0146, + "step": 51195 + }, + { + "epoch": 15.32, + "grad_norm": 2.181087017059326, + "learning_rate": 6.47178376944092e-06, + "loss": 0.9372, + "step": 51200 + }, + { + "epoch": 15.32, + "grad_norm": 4.231384754180908, + "learning_rate": 6.467839881850085e-06, + "loss": 1.0406, + "step": 51205 + }, + { + "epoch": 15.32, + "grad_norm": 3.6681032180786133, + "learning_rate": 6.463897017771178e-06, + "loss": 0.9692, + "step": 51210 + }, + { + "epoch": 15.32, + "grad_norm": 1.9487011432647705, + "learning_rate": 6.459955177421986e-06, + "loss": 1.0082, + "step": 51215 + }, + { + "epoch": 15.32, + "grad_norm": 2.813549518585205, + "learning_rate": 6.45601436102019e-06, + "loss": 1.1155, + "step": 51220 + }, + { + "epoch": 15.33, + "grad_norm": 1.6334689855575562, + "learning_rate": 6.4520745687834476e-06, + "loss": 0.9634, + "step": 51225 + }, + { + "epoch": 15.33, + "grad_norm": 3.823737382888794, + "learning_rate": 6.448135800929347e-06, + "loss": 1.1307, + "step": 51230 + }, + { + "epoch": 15.33, + "grad_norm": 1.4004770517349243, + "learning_rate": 6.444198057675418e-06, + "loss": 1.0298, + "step": 51235 + }, + { + "epoch": 15.33, + "grad_norm": 6.257365703582764, + "learning_rate": 6.440261339239148e-06, + "loss": 0.97, + "step": 51240 + }, + { + "epoch": 15.33, + "grad_norm": 6.089005947113037, + "learning_rate": 6.436325645837951e-06, + "loss": 0.9796, + "step": 51245 + }, + { + "epoch": 15.33, + "grad_norm": 4.073474884033203, + "learning_rate": 6.432390977689193e-06, + "loss": 0.9491, + "step": 51250 + }, + { + "epoch": 15.33, + "grad_norm": 1.1611909866333008, + "learning_rate": 6.4284573350101765e-06, + "loss": 1.0043, + "step": 51255 + }, + { + "epoch": 15.34, + "grad_norm": 1.883181095123291, + "learning_rate": 6.424524718018163e-06, + "loss": 0.8922, + "step": 51260 + }, + { + "epoch": 15.34, + "grad_norm": 12.2318754196167, + "learning_rate": 6.4205931269303385e-06, + "loss": 1.024, + "step": 51265 + }, + { + "epoch": 15.34, + "grad_norm": 2.295652151107788, + "learning_rate": 6.416662561963846e-06, + "loss": 0.8952, + "step": 51270 + }, + { + "epoch": 15.34, + "grad_norm": 6.346905708312988, + "learning_rate": 6.412733023335763e-06, + "loss": 1.072, + "step": 51275 + }, + { + "epoch": 15.34, + "grad_norm": 3.8380446434020996, + "learning_rate": 6.408804511263119e-06, + "loss": 0.9072, + "step": 51280 + }, + { + "epoch": 15.34, + "grad_norm": 2.404541492462158, + "learning_rate": 6.404877025962866e-06, + "loss": 1.0707, + "step": 51285 + }, + { + "epoch": 15.35, + "grad_norm": 2.2299270629882812, + "learning_rate": 6.400950567651939e-06, + "loss": 0.9094, + "step": 51290 + }, + { + "epoch": 15.35, + "grad_norm": 4.42881965637207, + "learning_rate": 6.397025136547166e-06, + "loss": 1.04, + "step": 51295 + }, + { + "epoch": 15.35, + "grad_norm": 1.782752513885498, + "learning_rate": 6.393100732865373e-06, + "loss": 0.9773, + "step": 51300 + }, + { + "epoch": 15.35, + "grad_norm": 3.4827709197998047, + "learning_rate": 6.38917735682327e-06, + "loss": 0.8695, + "step": 51305 + }, + { + "epoch": 15.35, + "grad_norm": 2.491976022720337, + "learning_rate": 6.385255008637573e-06, + "loss": 0.8696, + "step": 51310 + }, + { + "epoch": 15.35, + "grad_norm": 3.1498947143554688, + "learning_rate": 6.381333688524885e-06, + "loss": 0.849, + "step": 51315 + }, + { + "epoch": 15.35, + "grad_norm": 2.436060667037964, + "learning_rate": 6.377413396701781e-06, + "loss": 0.9044, + "step": 51320 + }, + { + "epoch": 15.36, + "grad_norm": 2.632699966430664, + "learning_rate": 6.373494133384783e-06, + "loss": 1.0249, + "step": 51325 + }, + { + "epoch": 15.36, + "grad_norm": 10.201811790466309, + "learning_rate": 6.369575898790334e-06, + "loss": 0.9744, + "step": 51330 + }, + { + "epoch": 15.36, + "grad_norm": 2.500356435775757, + "learning_rate": 6.365658693134857e-06, + "loss": 1.0102, + "step": 51335 + }, + { + "epoch": 15.36, + "grad_norm": 2.9083356857299805, + "learning_rate": 6.361742516634664e-06, + "loss": 1.012, + "step": 51340 + }, + { + "epoch": 15.36, + "grad_norm": 4.8380022048950195, + "learning_rate": 6.357827369506075e-06, + "loss": 0.9587, + "step": 51345 + }, + { + "epoch": 15.36, + "grad_norm": 4.04415225982666, + "learning_rate": 6.353913251965296e-06, + "loss": 0.7099, + "step": 51350 + }, + { + "epoch": 15.36, + "grad_norm": 3.18339204788208, + "learning_rate": 6.350000164228509e-06, + "loss": 1.0013, + "step": 51355 + }, + { + "epoch": 15.37, + "grad_norm": 1.7995153665542603, + "learning_rate": 6.346088106511821e-06, + "loss": 0.8609, + "step": 51360 + }, + { + "epoch": 15.37, + "grad_norm": 5.062925338745117, + "learning_rate": 6.3421770790313025e-06, + "loss": 0.9407, + "step": 51365 + }, + { + "epoch": 15.37, + "grad_norm": 2.892843723297119, + "learning_rate": 6.338267082002949e-06, + "loss": 1.0163, + "step": 51370 + }, + { + "epoch": 15.37, + "grad_norm": 2.197019577026367, + "learning_rate": 6.334358115642708e-06, + "loss": 0.9733, + "step": 51375 + }, + { + "epoch": 15.37, + "grad_norm": 3.871853828430176, + "learning_rate": 6.330450180166464e-06, + "loss": 1.108, + "step": 51380 + }, + { + "epoch": 15.37, + "grad_norm": 1.472909927368164, + "learning_rate": 6.326543275790059e-06, + "loss": 1.0104, + "step": 51385 + }, + { + "epoch": 15.38, + "grad_norm": 3.942326068878174, + "learning_rate": 6.322637402729242e-06, + "loss": 1.0015, + "step": 51390 + }, + { + "epoch": 15.38, + "grad_norm": 7.206786155700684, + "learning_rate": 6.318732561199767e-06, + "loss": 1.0726, + "step": 51395 + }, + { + "epoch": 15.38, + "grad_norm": 1.5072174072265625, + "learning_rate": 6.314828751417257e-06, + "loss": 1.0457, + "step": 51400 + }, + { + "epoch": 15.38, + "grad_norm": 2.0470387935638428, + "learning_rate": 6.3109259735973455e-06, + "loss": 1.1581, + "step": 51405 + }, + { + "epoch": 15.38, + "grad_norm": 5.198572635650635, + "learning_rate": 6.3070242279555606e-06, + "loss": 1.0023, + "step": 51410 + }, + { + "epoch": 15.38, + "grad_norm": 3.871081590652466, + "learning_rate": 6.303123514707393e-06, + "loss": 0.9422, + "step": 51415 + }, + { + "epoch": 15.38, + "grad_norm": 4.593938827514648, + "learning_rate": 6.299223834068288e-06, + "loss": 1.1043, + "step": 51420 + }, + { + "epoch": 15.39, + "grad_norm": 3.9437222480773926, + "learning_rate": 6.2953251862536036e-06, + "loss": 0.77, + "step": 51425 + }, + { + "epoch": 15.39, + "grad_norm": 1.9378674030303955, + "learning_rate": 6.291427571478678e-06, + "loss": 1.0061, + "step": 51430 + }, + { + "epoch": 15.39, + "grad_norm": 1.423578143119812, + "learning_rate": 6.287530989958748e-06, + "loss": 0.8501, + "step": 51435 + }, + { + "epoch": 15.39, + "grad_norm": 7.725799560546875, + "learning_rate": 6.283635441909045e-06, + "loss": 0.8871, + "step": 51440 + }, + { + "epoch": 15.39, + "grad_norm": 1.6603666543960571, + "learning_rate": 6.2797409275446914e-06, + "loss": 0.7305, + "step": 51445 + }, + { + "epoch": 15.39, + "grad_norm": 1.9601699113845825, + "learning_rate": 6.275847447080793e-06, + "loss": 1.0657, + "step": 51450 + }, + { + "epoch": 15.39, + "grad_norm": 3.121793746948242, + "learning_rate": 6.271955000732374e-06, + "loss": 1.1533, + "step": 51455 + }, + { + "epoch": 15.4, + "grad_norm": 1.4244256019592285, + "learning_rate": 6.268063588714415e-06, + "loss": 0.9971, + "step": 51460 + }, + { + "epoch": 15.4, + "grad_norm": 2.398967981338501, + "learning_rate": 6.264173211241833e-06, + "loss": 0.9865, + "step": 51465 + }, + { + "epoch": 15.4, + "grad_norm": 1.3114820718765259, + "learning_rate": 6.26028386852949e-06, + "loss": 0.9019, + "step": 51470 + }, + { + "epoch": 15.4, + "grad_norm": 5.757796764373779, + "learning_rate": 6.256395560792188e-06, + "loss": 1.0239, + "step": 51475 + }, + { + "epoch": 15.4, + "grad_norm": 1.7129333019256592, + "learning_rate": 6.252508288244688e-06, + "loss": 1.1315, + "step": 51480 + }, + { + "epoch": 15.4, + "grad_norm": 3.2258822917938232, + "learning_rate": 6.248622051101649e-06, + "loss": 1.0204, + "step": 51485 + }, + { + "epoch": 15.41, + "grad_norm": 2.3875536918640137, + "learning_rate": 6.244736849577734e-06, + "loss": 0.9069, + "step": 51490 + }, + { + "epoch": 15.41, + "grad_norm": 2.4159421920776367, + "learning_rate": 6.240852683887508e-06, + "loss": 0.9534, + "step": 51495 + }, + { + "epoch": 15.41, + "grad_norm": 3.4338598251342773, + "learning_rate": 6.236969554245486e-06, + "loss": 1.0511, + "step": 51500 + }, + { + "epoch": 15.41, + "grad_norm": 1.4087140560150146, + "learning_rate": 6.233087460866141e-06, + "loss": 0.987, + "step": 51505 + }, + { + "epoch": 15.41, + "grad_norm": 2.022752285003662, + "learning_rate": 6.229206403963852e-06, + "loss": 0.9237, + "step": 51510 + }, + { + "epoch": 15.41, + "grad_norm": 2.3156323432922363, + "learning_rate": 6.225326383753e-06, + "loss": 0.8778, + "step": 51515 + }, + { + "epoch": 15.41, + "grad_norm": 2.3037240505218506, + "learning_rate": 6.221447400447841e-06, + "loss": 1.0857, + "step": 51520 + }, + { + "epoch": 15.42, + "grad_norm": 3.583475112915039, + "learning_rate": 6.2175694542626394e-06, + "loss": 1.0723, + "step": 51525 + }, + { + "epoch": 15.42, + "grad_norm": 4.166855335235596, + "learning_rate": 6.213692545411537e-06, + "loss": 1.1171, + "step": 51530 + }, + { + "epoch": 15.42, + "grad_norm": 1.957580327987671, + "learning_rate": 6.2098166741086815e-06, + "loss": 0.9268, + "step": 51535 + }, + { + "epoch": 15.42, + "grad_norm": 2.228228807449341, + "learning_rate": 6.205941840568114e-06, + "loss": 0.9613, + "step": 51540 + }, + { + "epoch": 15.42, + "grad_norm": 3.6298060417175293, + "learning_rate": 6.2020680450038425e-06, + "loss": 1.1636, + "step": 51545 + }, + { + "epoch": 15.42, + "grad_norm": 3.282169818878174, + "learning_rate": 6.198195287629813e-06, + "loss": 0.898, + "step": 51550 + }, + { + "epoch": 15.42, + "grad_norm": 15.333877563476562, + "learning_rate": 6.194323568659916e-06, + "loss": 0.9714, + "step": 51555 + }, + { + "epoch": 15.43, + "grad_norm": 4.079663276672363, + "learning_rate": 6.190452888307979e-06, + "loss": 1.148, + "step": 51560 + }, + { + "epoch": 15.43, + "grad_norm": 3.9293205738067627, + "learning_rate": 6.18658324678778e-06, + "loss": 0.8797, + "step": 51565 + }, + { + "epoch": 15.43, + "grad_norm": 3.552079916000366, + "learning_rate": 6.1827146443130315e-06, + "loss": 1.0032, + "step": 51570 + }, + { + "epoch": 15.43, + "grad_norm": 1.169809341430664, + "learning_rate": 6.1788470810973944e-06, + "loss": 0.9592, + "step": 51575 + }, + { + "epoch": 15.43, + "grad_norm": 1.6891237497329712, + "learning_rate": 6.174980557354468e-06, + "loss": 1.0002, + "step": 51580 + }, + { + "epoch": 15.43, + "grad_norm": 1.8492525815963745, + "learning_rate": 6.171115073297801e-06, + "loss": 0.9641, + "step": 51585 + }, + { + "epoch": 15.44, + "grad_norm": 5.5648579597473145, + "learning_rate": 6.167250629140872e-06, + "loss": 0.9558, + "step": 51590 + }, + { + "epoch": 15.44, + "grad_norm": 2.914482831954956, + "learning_rate": 6.16338722509712e-06, + "loss": 1.0153, + "step": 51595 + }, + { + "epoch": 15.44, + "grad_norm": 2.55684757232666, + "learning_rate": 6.159524861379909e-06, + "loss": 1.0406, + "step": 51600 + }, + { + "epoch": 15.44, + "grad_norm": 2.4347622394561768, + "learning_rate": 6.155663538202558e-06, + "loss": 0.9547, + "step": 51605 + }, + { + "epoch": 15.44, + "grad_norm": 5.680244445800781, + "learning_rate": 6.1518032557783265e-06, + "loss": 1.0837, + "step": 51610 + }, + { + "epoch": 15.44, + "grad_norm": 4.704986572265625, + "learning_rate": 6.147944014320398e-06, + "loss": 1.177, + "step": 51615 + }, + { + "epoch": 15.44, + "grad_norm": 4.459755897521973, + "learning_rate": 6.1440858140419414e-06, + "loss": 1.2561, + "step": 51620 + }, + { + "epoch": 15.45, + "grad_norm": 2.4121649265289307, + "learning_rate": 6.140228655156008e-06, + "loss": 1.0119, + "step": 51625 + }, + { + "epoch": 15.45, + "grad_norm": 1.6860464811325073, + "learning_rate": 6.136372537875659e-06, + "loss": 0.9792, + "step": 51630 + }, + { + "epoch": 15.45, + "grad_norm": 2.846508741378784, + "learning_rate": 6.132517462413839e-06, + "loss": 0.8266, + "step": 51635 + }, + { + "epoch": 15.45, + "grad_norm": 3.4774718284606934, + "learning_rate": 6.128663428983469e-06, + "loss": 0.9554, + "step": 51640 + }, + { + "epoch": 15.45, + "grad_norm": 2.086501121520996, + "learning_rate": 6.124810437797402e-06, + "loss": 0.9001, + "step": 51645 + }, + { + "epoch": 15.45, + "grad_norm": 2.1175014972686768, + "learning_rate": 6.120958489068434e-06, + "loss": 0.8511, + "step": 51650 + }, + { + "epoch": 15.45, + "grad_norm": 2.3501827716827393, + "learning_rate": 6.117107583009304e-06, + "loss": 1.0259, + "step": 51655 + }, + { + "epoch": 15.46, + "grad_norm": 2.106353998184204, + "learning_rate": 6.113257719832696e-06, + "loss": 0.799, + "step": 51660 + }, + { + "epoch": 15.46, + "grad_norm": 1.536186695098877, + "learning_rate": 6.1094088997512315e-06, + "loss": 0.9458, + "step": 51665 + }, + { + "epoch": 15.46, + "grad_norm": 3.473801612854004, + "learning_rate": 6.105561122977479e-06, + "loss": 0.9084, + "step": 51670 + }, + { + "epoch": 15.46, + "grad_norm": 2.5647025108337402, + "learning_rate": 6.101714389723945e-06, + "loss": 0.9893, + "step": 51675 + }, + { + "epoch": 15.46, + "grad_norm": 5.690295696258545, + "learning_rate": 6.097868700203083e-06, + "loss": 1.0979, + "step": 51680 + }, + { + "epoch": 15.46, + "grad_norm": 3.3370258808135986, + "learning_rate": 6.094024054627281e-06, + "loss": 0.9205, + "step": 51685 + }, + { + "epoch": 15.47, + "grad_norm": 5.14525032043457, + "learning_rate": 6.0901804532088824e-06, + "loss": 0.9944, + "step": 51690 + }, + { + "epoch": 15.47, + "grad_norm": 12.52432632446289, + "learning_rate": 6.086337896160163e-06, + "loss": 0.9203, + "step": 51695 + }, + { + "epoch": 15.47, + "grad_norm": 3.637500047683716, + "learning_rate": 6.0824963836933396e-06, + "loss": 0.9591, + "step": 51700 + }, + { + "epoch": 15.47, + "grad_norm": 2.3651740550994873, + "learning_rate": 6.078655916020584e-06, + "loss": 0.8642, + "step": 51705 + }, + { + "epoch": 15.47, + "grad_norm": 4.2891645431518555, + "learning_rate": 6.0748164933539796e-06, + "loss": 1.094, + "step": 51710 + }, + { + "epoch": 15.47, + "grad_norm": 1.5036563873291016, + "learning_rate": 6.0709781159056064e-06, + "loss": 1.0111, + "step": 51715 + }, + { + "epoch": 15.47, + "grad_norm": 4.821185111999512, + "learning_rate": 6.0671407838874205e-06, + "loss": 0.9488, + "step": 51720 + }, + { + "epoch": 15.48, + "grad_norm": 3.7103047370910645, + "learning_rate": 6.063304497511382e-06, + "loss": 0.9017, + "step": 51725 + }, + { + "epoch": 15.48, + "grad_norm": 3.0226051807403564, + "learning_rate": 6.059469256989339e-06, + "loss": 0.8582, + "step": 51730 + }, + { + "epoch": 15.48, + "grad_norm": 2.9177422523498535, + "learning_rate": 6.055635062533138e-06, + "loss": 0.9762, + "step": 51735 + }, + { + "epoch": 15.48, + "grad_norm": 3.6491503715515137, + "learning_rate": 6.051801914354511e-06, + "loss": 0.9779, + "step": 51740 + }, + { + "epoch": 15.48, + "grad_norm": 2.9927515983581543, + "learning_rate": 6.047969812665169e-06, + "loss": 0.9971, + "step": 51745 + }, + { + "epoch": 15.48, + "grad_norm": 2.7734007835388184, + "learning_rate": 6.044138757676757e-06, + "loss": 1.0923, + "step": 51750 + }, + { + "epoch": 15.48, + "grad_norm": 1.9728281497955322, + "learning_rate": 6.040308749600854e-06, + "loss": 1.0897, + "step": 51755 + }, + { + "epoch": 15.49, + "grad_norm": 3.1228339672088623, + "learning_rate": 6.036479788648994e-06, + "loss": 0.9912, + "step": 51760 + }, + { + "epoch": 15.49, + "grad_norm": 2.159838914871216, + "learning_rate": 6.032651875032644e-06, + "loss": 0.957, + "step": 51765 + }, + { + "epoch": 15.49, + "grad_norm": 4.584501266479492, + "learning_rate": 6.028825008963215e-06, + "loss": 0.8692, + "step": 51770 + }, + { + "epoch": 15.49, + "grad_norm": 2.946192979812622, + "learning_rate": 6.024999190652058e-06, + "loss": 0.9615, + "step": 51775 + }, + { + "epoch": 15.49, + "grad_norm": 3.8030338287353516, + "learning_rate": 6.0211744203104765e-06, + "loss": 0.8554, + "step": 51780 + }, + { + "epoch": 15.49, + "grad_norm": 2.243009090423584, + "learning_rate": 6.0173506981497045e-06, + "loss": 0.9453, + "step": 51785 + }, + { + "epoch": 15.49, + "grad_norm": 4.18550968170166, + "learning_rate": 6.013528024380921e-06, + "loss": 1.0779, + "step": 51790 + }, + { + "epoch": 15.5, + "grad_norm": 1.270992636680603, + "learning_rate": 6.009706399215248e-06, + "loss": 0.882, + "step": 51795 + }, + { + "epoch": 15.5, + "grad_norm": 5.413167953491211, + "learning_rate": 6.00588582286376e-06, + "loss": 0.9145, + "step": 51800 + }, + { + "epoch": 15.5, + "grad_norm": 4.414466381072998, + "learning_rate": 6.00206629553744e-06, + "loss": 0.9951, + "step": 51805 + }, + { + "epoch": 15.5, + "grad_norm": 1.6211533546447754, + "learning_rate": 5.998247817447264e-06, + "loss": 0.8533, + "step": 51810 + }, + { + "epoch": 15.5, + "grad_norm": 2.9947216510772705, + "learning_rate": 5.994430388804098e-06, + "loss": 1.0242, + "step": 51815 + }, + { + "epoch": 15.5, + "grad_norm": 1.102436900138855, + "learning_rate": 5.9906140098188015e-06, + "loss": 0.892, + "step": 51820 + }, + { + "epoch": 15.51, + "grad_norm": 2.9922842979431152, + "learning_rate": 5.986798680702116e-06, + "loss": 0.941, + "step": 51825 + }, + { + "epoch": 15.51, + "grad_norm": 7.5577569007873535, + "learning_rate": 5.982984401664793e-06, + "loss": 1.0939, + "step": 51830 + }, + { + "epoch": 15.51, + "grad_norm": 2.929849863052368, + "learning_rate": 5.979171172917467e-06, + "loss": 0.9213, + "step": 51835 + }, + { + "epoch": 15.51, + "grad_norm": 3.348611831665039, + "learning_rate": 5.9753589946707464e-06, + "loss": 1.044, + "step": 51840 + }, + { + "epoch": 15.51, + "grad_norm": 1.479557752609253, + "learning_rate": 5.971547867135174e-06, + "loss": 0.8942, + "step": 51845 + }, + { + "epoch": 15.51, + "grad_norm": 3.3817899227142334, + "learning_rate": 5.9677377905212324e-06, + "loss": 1.2193, + "step": 51850 + }, + { + "epoch": 15.51, + "grad_norm": 2.0309934616088867, + "learning_rate": 5.9639287650393504e-06, + "loss": 1.1913, + "step": 51855 + }, + { + "epoch": 15.52, + "grad_norm": 1.8293267488479614, + "learning_rate": 5.9601207908998956e-06, + "loss": 0.7932, + "step": 51860 + }, + { + "epoch": 15.52, + "grad_norm": 3.6836533546447754, + "learning_rate": 5.956313868313179e-06, + "loss": 0.978, + "step": 51865 + }, + { + "epoch": 15.52, + "grad_norm": 4.970479488372803, + "learning_rate": 5.952507997489451e-06, + "loss": 1.0691, + "step": 51870 + }, + { + "epoch": 15.52, + "grad_norm": 5.052027702331543, + "learning_rate": 5.948703178638909e-06, + "loss": 1.1021, + "step": 51875 + }, + { + "epoch": 15.52, + "grad_norm": 2.7056381702423096, + "learning_rate": 5.944899411971688e-06, + "loss": 0.8781, + "step": 51880 + }, + { + "epoch": 15.52, + "grad_norm": 3.194413900375366, + "learning_rate": 5.941096697697865e-06, + "loss": 0.9483, + "step": 51885 + }, + { + "epoch": 15.52, + "grad_norm": 3.6588056087493896, + "learning_rate": 5.937295036027463e-06, + "loss": 0.8384, + "step": 51890 + }, + { + "epoch": 15.53, + "grad_norm": 1.5710773468017578, + "learning_rate": 5.933494427170447e-06, + "loss": 0.9579, + "step": 51895 + }, + { + "epoch": 15.53, + "grad_norm": 1.679604411125183, + "learning_rate": 5.9296948713367e-06, + "loss": 1.0501, + "step": 51900 + }, + { + "epoch": 15.53, + "grad_norm": 5.178948879241943, + "learning_rate": 5.925896368736098e-06, + "loss": 0.8546, + "step": 51905 + }, + { + "epoch": 15.53, + "grad_norm": 2.5556845664978027, + "learning_rate": 5.922098919578398e-06, + "loss": 1.0649, + "step": 51910 + }, + { + "epoch": 15.53, + "grad_norm": 1.5957000255584717, + "learning_rate": 5.918302524073361e-06, + "loss": 0.964, + "step": 51915 + }, + { + "epoch": 15.53, + "grad_norm": 2.147024631500244, + "learning_rate": 5.9145071824306255e-06, + "loss": 1.1415, + "step": 51920 + }, + { + "epoch": 15.54, + "grad_norm": 2.6177618503570557, + "learning_rate": 5.910712894859832e-06, + "loss": 0.9363, + "step": 51925 + }, + { + "epoch": 15.54, + "grad_norm": 2.6957054138183594, + "learning_rate": 5.90691966157052e-06, + "loss": 0.8722, + "step": 51930 + }, + { + "epoch": 15.54, + "grad_norm": 3.155423402786255, + "learning_rate": 5.903127482772186e-06, + "loss": 0.974, + "step": 51935 + }, + { + "epoch": 15.54, + "grad_norm": 1.4433481693267822, + "learning_rate": 5.899336358674273e-06, + "loss": 1.0228, + "step": 51940 + }, + { + "epoch": 15.54, + "grad_norm": 2.683156967163086, + "learning_rate": 5.895546289486159e-06, + "loss": 1.0334, + "step": 51945 + }, + { + "epoch": 15.54, + "grad_norm": 1.733872890472412, + "learning_rate": 5.891757275417165e-06, + "loss": 0.8581, + "step": 51950 + }, + { + "epoch": 15.54, + "grad_norm": 1.4323784112930298, + "learning_rate": 5.8879693166765535e-06, + "loss": 0.9999, + "step": 51955 + }, + { + "epoch": 15.55, + "grad_norm": 3.402912139892578, + "learning_rate": 5.8841824134735295e-06, + "loss": 0.8503, + "step": 51960 + }, + { + "epoch": 15.55, + "grad_norm": 2.6373584270477295, + "learning_rate": 5.880396566017246e-06, + "loss": 0.8351, + "step": 51965 + }, + { + "epoch": 15.55, + "grad_norm": 1.3156791925430298, + "learning_rate": 5.876611774516783e-06, + "loss": 0.9677, + "step": 51970 + }, + { + "epoch": 15.55, + "grad_norm": 2.961254835128784, + "learning_rate": 5.872828039181175e-06, + "loss": 0.8483, + "step": 51975 + }, + { + "epoch": 15.55, + "grad_norm": 1.9350943565368652, + "learning_rate": 5.869045360219391e-06, + "loss": 0.9348, + "step": 51980 + }, + { + "epoch": 15.55, + "grad_norm": 3.028542995452881, + "learning_rate": 5.865263737840349e-06, + "loss": 0.9053, + "step": 51985 + }, + { + "epoch": 15.55, + "grad_norm": 2.8941564559936523, + "learning_rate": 5.861483172252907e-06, + "loss": 1.0792, + "step": 51990 + }, + { + "epoch": 15.56, + "grad_norm": 2.4049479961395264, + "learning_rate": 5.857703663665839e-06, + "loss": 0.8029, + "step": 51995 + }, + { + "epoch": 15.56, + "grad_norm": 2.2416865825653076, + "learning_rate": 5.85392521228792e-06, + "loss": 0.8356, + "step": 52000 + }, + { + "epoch": 15.56, + "grad_norm": 1.7893521785736084, + "learning_rate": 5.850147818327792e-06, + "loss": 1.0072, + "step": 52005 + }, + { + "epoch": 15.56, + "grad_norm": 4.932260513305664, + "learning_rate": 5.8463714819941116e-06, + "loss": 0.962, + "step": 52010 + }, + { + "epoch": 15.56, + "grad_norm": 1.802653193473816, + "learning_rate": 5.84259620349541e-06, + "loss": 1.0114, + "step": 52015 + }, + { + "epoch": 15.56, + "grad_norm": 4.934607982635498, + "learning_rate": 5.838821983040221e-06, + "loss": 0.9888, + "step": 52020 + }, + { + "epoch": 15.57, + "grad_norm": 3.02289080619812, + "learning_rate": 5.835048820836969e-06, + "loss": 0.9526, + "step": 52025 + }, + { + "epoch": 15.57, + "grad_norm": 2.7780637741088867, + "learning_rate": 5.83127671709405e-06, + "loss": 0.9735, + "step": 52030 + }, + { + "epoch": 15.57, + "grad_norm": 3.7103610038757324, + "learning_rate": 5.827505672019795e-06, + "loss": 0.8817, + "step": 52035 + }, + { + "epoch": 15.57, + "grad_norm": 1.6637479066848755, + "learning_rate": 5.82373568582247e-06, + "loss": 0.8395, + "step": 52040 + }, + { + "epoch": 15.57, + "grad_norm": 4.275225639343262, + "learning_rate": 5.819966758710293e-06, + "loss": 1.0206, + "step": 52045 + }, + { + "epoch": 15.57, + "grad_norm": 3.6473262310028076, + "learning_rate": 5.816198890891417e-06, + "loss": 0.9276, + "step": 52050 + }, + { + "epoch": 15.57, + "grad_norm": 3.159449815750122, + "learning_rate": 5.812432082573932e-06, + "loss": 1.243, + "step": 52055 + }, + { + "epoch": 15.58, + "grad_norm": 3.919372320175171, + "learning_rate": 5.8086663339658815e-06, + "loss": 1.1114, + "step": 52060 + }, + { + "epoch": 15.58, + "grad_norm": 3.567432403564453, + "learning_rate": 5.80490164527524e-06, + "loss": 0.8347, + "step": 52065 + }, + { + "epoch": 15.58, + "grad_norm": 4.436063289642334, + "learning_rate": 5.801138016709928e-06, + "loss": 1.0756, + "step": 52070 + }, + { + "epoch": 15.58, + "grad_norm": 3.9391560554504395, + "learning_rate": 5.797375448477807e-06, + "loss": 0.925, + "step": 52075 + }, + { + "epoch": 15.58, + "grad_norm": 4.531538963317871, + "learning_rate": 5.793613940786679e-06, + "loss": 0.9541, + "step": 52080 + }, + { + "epoch": 15.58, + "grad_norm": 2.420389413833618, + "learning_rate": 5.789853493844294e-06, + "loss": 0.8691, + "step": 52085 + }, + { + "epoch": 15.58, + "grad_norm": 2.241720676422119, + "learning_rate": 5.7860941078583284e-06, + "loss": 1.0427, + "step": 52090 + }, + { + "epoch": 15.59, + "grad_norm": 1.725215196609497, + "learning_rate": 5.782335783036422e-06, + "loss": 1.0402, + "step": 52095 + }, + { + "epoch": 15.59, + "grad_norm": 2.595808744430542, + "learning_rate": 5.778578519586119e-06, + "loss": 1.0218, + "step": 52100 + }, + { + "epoch": 15.59, + "grad_norm": 2.5729830265045166, + "learning_rate": 5.774822317714959e-06, + "loss": 0.8591, + "step": 52105 + }, + { + "epoch": 15.59, + "grad_norm": 3.0964512825012207, + "learning_rate": 5.771067177630368e-06, + "loss": 0.9244, + "step": 52110 + }, + { + "epoch": 15.59, + "grad_norm": 2.0227348804473877, + "learning_rate": 5.767313099539762e-06, + "loss": 0.7325, + "step": 52115 + }, + { + "epoch": 15.59, + "grad_norm": 1.4698666334152222, + "learning_rate": 5.763560083650451e-06, + "loss": 1.0007, + "step": 52120 + }, + { + "epoch": 15.6, + "grad_norm": 4.7408857345581055, + "learning_rate": 5.7598081301697276e-06, + "loss": 1.1846, + "step": 52125 + }, + { + "epoch": 15.6, + "grad_norm": 1.241419792175293, + "learning_rate": 5.756057239304799e-06, + "loss": 0.9077, + "step": 52130 + }, + { + "epoch": 15.6, + "grad_norm": 6.334906578063965, + "learning_rate": 5.752307411262825e-06, + "loss": 0.9065, + "step": 52135 + }, + { + "epoch": 15.6, + "grad_norm": 1.5211411714553833, + "learning_rate": 5.748558646250909e-06, + "loss": 0.9217, + "step": 52140 + }, + { + "epoch": 15.6, + "grad_norm": 2.1848652362823486, + "learning_rate": 5.744810944476079e-06, + "loss": 0.9956, + "step": 52145 + }, + { + "epoch": 15.6, + "grad_norm": 1.960239291191101, + "learning_rate": 5.741064306145341e-06, + "loss": 1.0003, + "step": 52150 + }, + { + "epoch": 15.6, + "grad_norm": 2.8279387950897217, + "learning_rate": 5.737318731465593e-06, + "loss": 1.0655, + "step": 52155 + }, + { + "epoch": 15.61, + "grad_norm": 1.6879123449325562, + "learning_rate": 5.733574220643712e-06, + "loss": 0.8975, + "step": 52160 + }, + { + "epoch": 15.61, + "grad_norm": 1.8145525455474854, + "learning_rate": 5.729830773886502e-06, + "loss": 0.9761, + "step": 52165 + }, + { + "epoch": 15.61, + "grad_norm": 2.2954049110412598, + "learning_rate": 5.726088391400705e-06, + "loss": 1.0937, + "step": 52170 + }, + { + "epoch": 15.61, + "grad_norm": 3.0802268981933594, + "learning_rate": 5.722347073393012e-06, + "loss": 1.1665, + "step": 52175 + }, + { + "epoch": 15.61, + "grad_norm": 1.6516430377960205, + "learning_rate": 5.718606820070055e-06, + "loss": 0.9886, + "step": 52180 + }, + { + "epoch": 15.61, + "grad_norm": 2.8192756175994873, + "learning_rate": 5.714867631638399e-06, + "loss": 0.9826, + "step": 52185 + }, + { + "epoch": 15.61, + "grad_norm": 4.979455471038818, + "learning_rate": 5.7111295083045636e-06, + "loss": 0.9313, + "step": 52190 + }, + { + "epoch": 15.62, + "grad_norm": 2.0236334800720215, + "learning_rate": 5.707392450274984e-06, + "loss": 0.9639, + "step": 52195 + }, + { + "epoch": 15.62, + "grad_norm": 1.1548880338668823, + "learning_rate": 5.7036564577560794e-06, + "loss": 1.1111, + "step": 52200 + }, + { + "epoch": 15.62, + "grad_norm": 3.311762571334839, + "learning_rate": 5.699921530954158e-06, + "loss": 1.0053, + "step": 52205 + }, + { + "epoch": 15.62, + "grad_norm": 2.1778554916381836, + "learning_rate": 5.696187670075523e-06, + "loss": 1.0448, + "step": 52210 + }, + { + "epoch": 15.62, + "grad_norm": 6.210904121398926, + "learning_rate": 5.6924548753263694e-06, + "loss": 0.9885, + "step": 52215 + }, + { + "epoch": 15.62, + "grad_norm": 3.4226996898651123, + "learning_rate": 5.688723146912859e-06, + "loss": 1.1471, + "step": 52220 + }, + { + "epoch": 15.63, + "grad_norm": 3.190202474594116, + "learning_rate": 5.684992485041108e-06, + "loss": 0.8414, + "step": 52225 + }, + { + "epoch": 15.63, + "grad_norm": 3.1030070781707764, + "learning_rate": 5.681262889917133e-06, + "loss": 1.0518, + "step": 52230 + }, + { + "epoch": 15.63, + "grad_norm": 1.8778613805770874, + "learning_rate": 5.677534361746939e-06, + "loss": 1.0345, + "step": 52235 + }, + { + "epoch": 15.63, + "grad_norm": 2.847775459289551, + "learning_rate": 5.6738069007364255e-06, + "loss": 1.0059, + "step": 52240 + }, + { + "epoch": 15.63, + "grad_norm": 1.932309627532959, + "learning_rate": 5.670080507091483e-06, + "loss": 0.9203, + "step": 52245 + }, + { + "epoch": 15.63, + "grad_norm": 13.216365814208984, + "learning_rate": 5.666355181017893e-06, + "loss": 0.9181, + "step": 52250 + }, + { + "epoch": 15.63, + "grad_norm": 1.8943175077438354, + "learning_rate": 5.662630922721413e-06, + "loss": 0.9876, + "step": 52255 + }, + { + "epoch": 15.64, + "grad_norm": 1.8065849542617798, + "learning_rate": 5.658907732407726e-06, + "loss": 0.9735, + "step": 52260 + }, + { + "epoch": 15.64, + "grad_norm": 1.6062184572219849, + "learning_rate": 5.65518561028246e-06, + "loss": 0.8603, + "step": 52265 + }, + { + "epoch": 15.64, + "grad_norm": 1.6171048879623413, + "learning_rate": 5.651464556551186e-06, + "loss": 1.1217, + "step": 52270 + }, + { + "epoch": 15.64, + "grad_norm": 2.9715936183929443, + "learning_rate": 5.647744571419411e-06, + "loss": 1.195, + "step": 52275 + }, + { + "epoch": 15.64, + "grad_norm": 3.835232734680176, + "learning_rate": 5.644025655092591e-06, + "loss": 1.1408, + "step": 52280 + }, + { + "epoch": 15.64, + "grad_norm": 3.526592969894409, + "learning_rate": 5.6403078077761186e-06, + "loss": 0.8728, + "step": 52285 + }, + { + "epoch": 15.64, + "grad_norm": 4.8948187828063965, + "learning_rate": 5.636591029675311e-06, + "loss": 0.8828, + "step": 52290 + }, + { + "epoch": 15.65, + "grad_norm": 1.3252886533737183, + "learning_rate": 5.632875320995468e-06, + "loss": 0.9628, + "step": 52295 + }, + { + "epoch": 15.65, + "grad_norm": 3.719871759414673, + "learning_rate": 5.629160681941772e-06, + "loss": 0.8964, + "step": 52300 + }, + { + "epoch": 15.65, + "grad_norm": 2.8799922466278076, + "learning_rate": 5.6254471127194064e-06, + "loss": 1.073, + "step": 52305 + }, + { + "epoch": 15.65, + "grad_norm": 3.387589454650879, + "learning_rate": 5.621734613533458e-06, + "loss": 0.7555, + "step": 52310 + }, + { + "epoch": 15.65, + "grad_norm": 3.153660535812378, + "learning_rate": 5.618023184588964e-06, + "loss": 1.031, + "step": 52315 + }, + { + "epoch": 15.65, + "grad_norm": 2.53116774559021, + "learning_rate": 5.6143128260909115e-06, + "loss": 0.8774, + "step": 52320 + }, + { + "epoch": 15.66, + "grad_norm": 2.402709722518921, + "learning_rate": 5.610603538244197e-06, + "loss": 0.9746, + "step": 52325 + }, + { + "epoch": 15.66, + "grad_norm": 3.300175666809082, + "learning_rate": 5.606895321253705e-06, + "loss": 0.9147, + "step": 52330 + }, + { + "epoch": 15.66, + "grad_norm": 2.9742469787597656, + "learning_rate": 5.603188175324217e-06, + "loss": 1.1199, + "step": 52335 + }, + { + "epoch": 15.66, + "grad_norm": 2.81020450592041, + "learning_rate": 5.599482100660497e-06, + "loss": 0.961, + "step": 52340 + }, + { + "epoch": 15.66, + "grad_norm": 2.7677392959594727, + "learning_rate": 5.595777097467206e-06, + "loss": 1.1416, + "step": 52345 + }, + { + "epoch": 15.66, + "grad_norm": 2.42520809173584, + "learning_rate": 5.59207316594898e-06, + "loss": 1.1443, + "step": 52350 + }, + { + "epoch": 15.66, + "grad_norm": 2.994718313217163, + "learning_rate": 5.588370306310375e-06, + "loss": 1.0766, + "step": 52355 + }, + { + "epoch": 15.67, + "grad_norm": 2.0844037532806396, + "learning_rate": 5.584668518755904e-06, + "loss": 0.9827, + "step": 52360 + }, + { + "epoch": 15.67, + "grad_norm": 2.0058093070983887, + "learning_rate": 5.580967803490008e-06, + "loss": 0.925, + "step": 52365 + }, + { + "epoch": 15.67, + "grad_norm": 2.180823564529419, + "learning_rate": 5.577268160717078e-06, + "loss": 0.9218, + "step": 52370 + }, + { + "epoch": 15.67, + "grad_norm": 4.118743419647217, + "learning_rate": 5.573569590641436e-06, + "loss": 0.9062, + "step": 52375 + }, + { + "epoch": 15.67, + "grad_norm": 2.850475788116455, + "learning_rate": 5.569872093467354e-06, + "loss": 1.0542, + "step": 52380 + }, + { + "epoch": 15.67, + "grad_norm": 4.366810321807861, + "learning_rate": 5.566175669399037e-06, + "loss": 0.9499, + "step": 52385 + }, + { + "epoch": 15.67, + "grad_norm": 1.8252012729644775, + "learning_rate": 5.562480318640642e-06, + "loss": 1.0941, + "step": 52390 + }, + { + "epoch": 15.68, + "grad_norm": 6.694037437438965, + "learning_rate": 5.558786041396252e-06, + "loss": 1.0343, + "step": 52395 + }, + { + "epoch": 15.68, + "grad_norm": 3.3669612407684326, + "learning_rate": 5.555092837869902e-06, + "loss": 1.0908, + "step": 52400 + }, + { + "epoch": 15.68, + "grad_norm": 2.897421360015869, + "learning_rate": 5.551400708265561e-06, + "loss": 1.0238, + "step": 52405 + }, + { + "epoch": 15.68, + "grad_norm": 2.594632387161255, + "learning_rate": 5.5477096527871445e-06, + "loss": 0.8819, + "step": 52410 + }, + { + "epoch": 15.68, + "grad_norm": 2.2491109371185303, + "learning_rate": 5.544019671638512e-06, + "loss": 0.9708, + "step": 52415 + }, + { + "epoch": 15.68, + "grad_norm": 2.392650604248047, + "learning_rate": 5.5403307650234355e-06, + "loss": 0.9749, + "step": 52420 + }, + { + "epoch": 15.68, + "grad_norm": 2.6764276027679443, + "learning_rate": 5.536642933145677e-06, + "loss": 1.1067, + "step": 52425 + }, + { + "epoch": 15.69, + "grad_norm": 1.6986552476882935, + "learning_rate": 5.532956176208884e-06, + "loss": 0.8994, + "step": 52430 + }, + { + "epoch": 15.69, + "grad_norm": 3.7357470989227295, + "learning_rate": 5.5292704944167026e-06, + "loss": 0.9956, + "step": 52435 + }, + { + "epoch": 15.69, + "grad_norm": 2.2799248695373535, + "learning_rate": 5.525585887972657e-06, + "loss": 1.203, + "step": 52440 + }, + { + "epoch": 15.69, + "grad_norm": 3.3140718936920166, + "learning_rate": 5.521902357080275e-06, + "loss": 0.8632, + "step": 52445 + }, + { + "epoch": 15.69, + "grad_norm": 2.275482416152954, + "learning_rate": 5.518219901942972e-06, + "loss": 0.9642, + "step": 52450 + }, + { + "epoch": 15.69, + "grad_norm": 2.353581428527832, + "learning_rate": 5.514538522764135e-06, + "loss": 1.0428, + "step": 52455 + }, + { + "epoch": 15.7, + "grad_norm": 3.2124972343444824, + "learning_rate": 5.510858219747078e-06, + "loss": 0.9647, + "step": 52460 + }, + { + "epoch": 15.7, + "grad_norm": 2.8147921562194824, + "learning_rate": 5.507178993095067e-06, + "loss": 1.0069, + "step": 52465 + }, + { + "epoch": 15.7, + "grad_norm": 2.2177577018737793, + "learning_rate": 5.503500843011297e-06, + "loss": 1.0085, + "step": 52470 + }, + { + "epoch": 15.7, + "grad_norm": 2.825120210647583, + "learning_rate": 5.4998237696989085e-06, + "loss": 1.0708, + "step": 52475 + }, + { + "epoch": 15.7, + "grad_norm": 3.487179756164551, + "learning_rate": 5.496147773360988e-06, + "loss": 1.0576, + "step": 52480 + }, + { + "epoch": 15.7, + "grad_norm": 1.7475980520248413, + "learning_rate": 5.492472854200551e-06, + "loss": 1.0345, + "step": 52485 + }, + { + "epoch": 15.7, + "grad_norm": 1.6811771392822266, + "learning_rate": 5.488799012420559e-06, + "loss": 1.0522, + "step": 52490 + }, + { + "epoch": 15.71, + "grad_norm": 3.03035044670105, + "learning_rate": 5.4851262482239205e-06, + "loss": 1.0216, + "step": 52495 + }, + { + "epoch": 15.71, + "grad_norm": 1.8432042598724365, + "learning_rate": 5.481454561813473e-06, + "loss": 1.1813, + "step": 52500 + }, + { + "epoch": 15.71, + "grad_norm": 3.2820653915405273, + "learning_rate": 5.477783953392002e-06, + "loss": 0.9292, + "step": 52505 + }, + { + "epoch": 15.71, + "grad_norm": 1.9322882890701294, + "learning_rate": 5.474114423162236e-06, + "loss": 0.9244, + "step": 52510 + }, + { + "epoch": 15.71, + "grad_norm": 4.189065933227539, + "learning_rate": 5.470445971326821e-06, + "loss": 1.1351, + "step": 52515 + }, + { + "epoch": 15.71, + "grad_norm": 1.8628734350204468, + "learning_rate": 5.4667785980883925e-06, + "loss": 0.9412, + "step": 52520 + }, + { + "epoch": 15.71, + "grad_norm": 3.525163412094116, + "learning_rate": 5.463112303649462e-06, + "loss": 0.8848, + "step": 52525 + }, + { + "epoch": 15.72, + "grad_norm": 3.689204692840576, + "learning_rate": 5.459447088212544e-06, + "loss": 0.938, + "step": 52530 + }, + { + "epoch": 15.72, + "grad_norm": 7.576735019683838, + "learning_rate": 5.45578295198004e-06, + "loss": 0.6826, + "step": 52535 + }, + { + "epoch": 15.72, + "grad_norm": 3.6514053344726562, + "learning_rate": 5.452119895154343e-06, + "loss": 0.8254, + "step": 52540 + }, + { + "epoch": 15.72, + "grad_norm": 2.51778507232666, + "learning_rate": 5.4484579179377385e-06, + "loss": 0.9759, + "step": 52545 + }, + { + "epoch": 15.72, + "grad_norm": 3.0608720779418945, + "learning_rate": 5.444797020532477e-06, + "loss": 1.0327, + "step": 52550 + }, + { + "epoch": 15.72, + "grad_norm": 3.0184848308563232, + "learning_rate": 5.441137203140753e-06, + "loss": 0.9639, + "step": 52555 + }, + { + "epoch": 15.73, + "grad_norm": 3.508617401123047, + "learning_rate": 5.43747846596469e-06, + "loss": 0.8989, + "step": 52560 + }, + { + "epoch": 15.73, + "grad_norm": 2.4190101623535156, + "learning_rate": 5.433820809206358e-06, + "loss": 1.0506, + "step": 52565 + }, + { + "epoch": 15.73, + "grad_norm": 3.336304187774658, + "learning_rate": 5.430164233067764e-06, + "loss": 1.0166, + "step": 52570 + }, + { + "epoch": 15.73, + "grad_norm": 3.363173246383667, + "learning_rate": 5.426508737750857e-06, + "loss": 1.0955, + "step": 52575 + }, + { + "epoch": 15.73, + "grad_norm": 2.3108577728271484, + "learning_rate": 5.422854323457527e-06, + "loss": 1.0257, + "step": 52580 + }, + { + "epoch": 15.73, + "grad_norm": 2.772678852081299, + "learning_rate": 5.419200990389603e-06, + "loss": 0.9683, + "step": 52585 + }, + { + "epoch": 15.73, + "grad_norm": 2.349119186401367, + "learning_rate": 5.415548738748857e-06, + "loss": 0.9251, + "step": 52590 + }, + { + "epoch": 15.74, + "grad_norm": 2.551915407180786, + "learning_rate": 5.411897568736995e-06, + "loss": 1.0174, + "step": 52595 + }, + { + "epoch": 15.74, + "grad_norm": 2.7730255126953125, + "learning_rate": 5.408247480555673e-06, + "loss": 0.9942, + "step": 52600 + }, + { + "epoch": 15.74, + "grad_norm": 4.1052141189575195, + "learning_rate": 5.4045984744064836e-06, + "loss": 0.9015, + "step": 52605 + }, + { + "epoch": 15.74, + "grad_norm": 2.427316665649414, + "learning_rate": 5.400950550490935e-06, + "loss": 1.129, + "step": 52610 + }, + { + "epoch": 15.74, + "grad_norm": 2.9681787490844727, + "learning_rate": 5.397303709010534e-06, + "loss": 1.1242, + "step": 52615 + }, + { + "epoch": 15.74, + "grad_norm": 2.8961920738220215, + "learning_rate": 5.393657950166659e-06, + "loss": 0.9106, + "step": 52620 + }, + { + "epoch": 15.74, + "grad_norm": 3.401855945587158, + "learning_rate": 5.390013274160685e-06, + "loss": 1.0304, + "step": 52625 + }, + { + "epoch": 15.75, + "grad_norm": 5.901130676269531, + "learning_rate": 5.386369681193884e-06, + "loss": 0.8025, + "step": 52630 + }, + { + "epoch": 15.75, + "grad_norm": 2.0417873859405518, + "learning_rate": 5.382727171467514e-06, + "loss": 1.0043, + "step": 52635 + }, + { + "epoch": 15.75, + "grad_norm": 17.817447662353516, + "learning_rate": 5.379085745182721e-06, + "loss": 0.8185, + "step": 52640 + }, + { + "epoch": 15.75, + "grad_norm": 1.411085844039917, + "learning_rate": 5.37544540254063e-06, + "loss": 1.1456, + "step": 52645 + }, + { + "epoch": 15.75, + "grad_norm": 3.324824810028076, + "learning_rate": 5.371806143742289e-06, + "loss": 0.9423, + "step": 52650 + }, + { + "epoch": 15.75, + "grad_norm": 2.4922614097595215, + "learning_rate": 5.368167968988691e-06, + "loss": 0.9954, + "step": 52655 + }, + { + "epoch": 15.76, + "grad_norm": 2.6019651889801025, + "learning_rate": 5.364530878480772e-06, + "loss": 0.9052, + "step": 52660 + }, + { + "epoch": 15.76, + "grad_norm": 5.232183456420898, + "learning_rate": 5.360894872419403e-06, + "loss": 0.7985, + "step": 52665 + }, + { + "epoch": 15.76, + "grad_norm": 4.952908515930176, + "learning_rate": 5.357259951005397e-06, + "loss": 1.1789, + "step": 52670 + }, + { + "epoch": 15.76, + "grad_norm": 2.208289861679077, + "learning_rate": 5.3536261144395054e-06, + "loss": 0.8794, + "step": 52675 + }, + { + "epoch": 15.76, + "grad_norm": 2.049973487854004, + "learning_rate": 5.3499933629224204e-06, + "loss": 0.7753, + "step": 52680 + }, + { + "epoch": 15.76, + "grad_norm": 2.3829286098480225, + "learning_rate": 5.346361696654781e-06, + "loss": 0.9712, + "step": 52685 + }, + { + "epoch": 15.76, + "grad_norm": 33.99540328979492, + "learning_rate": 5.3427311158371515e-06, + "loss": 1.0222, + "step": 52690 + }, + { + "epoch": 15.77, + "grad_norm": 1.5256500244140625, + "learning_rate": 5.339101620670054e-06, + "loss": 1.0975, + "step": 52695 + }, + { + "epoch": 15.77, + "grad_norm": 3.328197717666626, + "learning_rate": 5.3354732113539415e-06, + "loss": 0.872, + "step": 52700 + }, + { + "epoch": 15.77, + "grad_norm": 1.9230583906173706, + "learning_rate": 5.33184588808919e-06, + "loss": 0.9616, + "step": 52705 + }, + { + "epoch": 15.77, + "grad_norm": 4.239948749542236, + "learning_rate": 5.328219651076163e-06, + "loss": 0.9001, + "step": 52710 + }, + { + "epoch": 15.77, + "grad_norm": 3.32629656791687, + "learning_rate": 5.324594500515101e-06, + "loss": 1.0787, + "step": 52715 + }, + { + "epoch": 15.77, + "grad_norm": 1.680390477180481, + "learning_rate": 5.3209704366062455e-06, + "loss": 1.0742, + "step": 52720 + }, + { + "epoch": 15.77, + "grad_norm": 3.082519769668579, + "learning_rate": 5.317347459549726e-06, + "loss": 0.8436, + "step": 52725 + }, + { + "epoch": 15.78, + "grad_norm": 5.2578654289245605, + "learning_rate": 5.313725569545661e-06, + "loss": 1.0528, + "step": 52730 + }, + { + "epoch": 15.78, + "grad_norm": 3.4256768226623535, + "learning_rate": 5.310104766794064e-06, + "loss": 1.0051, + "step": 52735 + }, + { + "epoch": 15.78, + "grad_norm": 1.8970190286636353, + "learning_rate": 5.306485051494911e-06, + "loss": 1.1178, + "step": 52740 + }, + { + "epoch": 15.78, + "grad_norm": 6.223118305206299, + "learning_rate": 5.302866423848121e-06, + "loss": 1.0483, + "step": 52745 + }, + { + "epoch": 15.78, + "grad_norm": 3.551870822906494, + "learning_rate": 5.2992488840535425e-06, + "loss": 0.7908, + "step": 52750 + }, + { + "epoch": 15.78, + "grad_norm": 2.2895681858062744, + "learning_rate": 5.295632432310973e-06, + "loss": 0.9522, + "step": 52755 + }, + { + "epoch": 15.79, + "grad_norm": 3.403578281402588, + "learning_rate": 5.2920170688201405e-06, + "loss": 0.9253, + "step": 52760 + }, + { + "epoch": 15.79, + "grad_norm": 4.709850788116455, + "learning_rate": 5.2884027937807225e-06, + "loss": 0.8115, + "step": 52765 + }, + { + "epoch": 15.79, + "grad_norm": 2.8941822052001953, + "learning_rate": 5.284789607392326e-06, + "loss": 1.031, + "step": 52770 + }, + { + "epoch": 15.79, + "grad_norm": 3.633200168609619, + "learning_rate": 5.28117750985451e-06, + "loss": 1.0308, + "step": 52775 + }, + { + "epoch": 15.79, + "grad_norm": 2.2992610931396484, + "learning_rate": 5.27756650136676e-06, + "loss": 0.7602, + "step": 52780 + }, + { + "epoch": 15.79, + "grad_norm": 5.884945392608643, + "learning_rate": 5.2739565821285145e-06, + "loss": 0.8777, + "step": 52785 + }, + { + "epoch": 15.79, + "grad_norm": 2.312242031097412, + "learning_rate": 5.270347752339142e-06, + "loss": 0.8445, + "step": 52790 + }, + { + "epoch": 15.8, + "grad_norm": 3.9331436157226562, + "learning_rate": 5.266740012197954e-06, + "loss": 1.1452, + "step": 52795 + }, + { + "epoch": 15.8, + "grad_norm": 1.6924293041229248, + "learning_rate": 5.263133361904204e-06, + "loss": 0.9707, + "step": 52800 + }, + { + "epoch": 15.8, + "grad_norm": 1.7212278842926025, + "learning_rate": 5.259527801657091e-06, + "loss": 1.1103, + "step": 52805 + }, + { + "epoch": 15.8, + "grad_norm": 2.418543815612793, + "learning_rate": 5.255923331655724e-06, + "loss": 1.0522, + "step": 52810 + }, + { + "epoch": 15.8, + "grad_norm": 1.562758445739746, + "learning_rate": 5.252319952099202e-06, + "loss": 0.9675, + "step": 52815 + }, + { + "epoch": 15.8, + "grad_norm": 3.524245023727417, + "learning_rate": 5.248717663186511e-06, + "loss": 1.0859, + "step": 52820 + }, + { + "epoch": 15.8, + "grad_norm": 3.3297927379608154, + "learning_rate": 5.245116465116625e-06, + "loss": 0.9522, + "step": 52825 + }, + { + "epoch": 15.81, + "grad_norm": 4.018186092376709, + "learning_rate": 5.241516358088417e-06, + "loss": 1.0692, + "step": 52830 + }, + { + "epoch": 15.81, + "grad_norm": 1.4995861053466797, + "learning_rate": 5.237917342300719e-06, + "loss": 1.0195, + "step": 52835 + }, + { + "epoch": 15.81, + "grad_norm": 2.2528059482574463, + "learning_rate": 5.234319417952308e-06, + "loss": 1.0619, + "step": 52840 + }, + { + "epoch": 15.81, + "grad_norm": 3.694645881652832, + "learning_rate": 5.23072258524189e-06, + "loss": 1.0082, + "step": 52845 + }, + { + "epoch": 15.81, + "grad_norm": 2.391193151473999, + "learning_rate": 5.227126844368113e-06, + "loss": 1.032, + "step": 52850 + }, + { + "epoch": 15.81, + "grad_norm": 2.8515758514404297, + "learning_rate": 5.223532195529571e-06, + "loss": 0.9227, + "step": 52855 + }, + { + "epoch": 15.82, + "grad_norm": 1.8766295909881592, + "learning_rate": 5.2199386389247876e-06, + "loss": 0.9314, + "step": 52860 + }, + { + "epoch": 15.82, + "grad_norm": 1.6634045839309692, + "learning_rate": 5.216346174752232e-06, + "loss": 0.9464, + "step": 52865 + }, + { + "epoch": 15.82, + "grad_norm": 2.1022915840148926, + "learning_rate": 5.212754803210312e-06, + "loss": 0.9514, + "step": 52870 + }, + { + "epoch": 15.82, + "grad_norm": 2.1778430938720703, + "learning_rate": 5.209164524497378e-06, + "loss": 0.9898, + "step": 52875 + }, + { + "epoch": 15.82, + "grad_norm": 1.7852226495742798, + "learning_rate": 5.205575338811719e-06, + "loss": 0.9918, + "step": 52880 + }, + { + "epoch": 15.82, + "grad_norm": 2.882033586502075, + "learning_rate": 5.201987246351556e-06, + "loss": 0.8887, + "step": 52885 + }, + { + "epoch": 15.82, + "grad_norm": 5.0355939865112305, + "learning_rate": 5.198400247315058e-06, + "loss": 1.0556, + "step": 52890 + }, + { + "epoch": 15.83, + "grad_norm": 1.298420786857605, + "learning_rate": 5.194814341900331e-06, + "loss": 1.1109, + "step": 52895 + }, + { + "epoch": 15.83, + "grad_norm": 1.944771409034729, + "learning_rate": 5.1912295303054305e-06, + "loss": 0.9752, + "step": 52900 + }, + { + "epoch": 15.83, + "grad_norm": 2.584871768951416, + "learning_rate": 5.187645812728317e-06, + "loss": 0.9854, + "step": 52905 + }, + { + "epoch": 15.83, + "grad_norm": 2.4717025756835938, + "learning_rate": 5.184063189366948e-06, + "loss": 1.0902, + "step": 52910 + }, + { + "epoch": 15.83, + "grad_norm": 1.5633115768432617, + "learning_rate": 5.1804816604191565e-06, + "loss": 0.9654, + "step": 52915 + }, + { + "epoch": 15.83, + "grad_norm": 2.15081787109375, + "learning_rate": 5.176901226082776e-06, + "loss": 0.9748, + "step": 52920 + }, + { + "epoch": 15.83, + "grad_norm": 1.1734721660614014, + "learning_rate": 5.173321886555527e-06, + "loss": 1.0343, + "step": 52925 + }, + { + "epoch": 15.84, + "grad_norm": 3.0947458744049072, + "learning_rate": 5.169743642035102e-06, + "loss": 0.9797, + "step": 52930 + }, + { + "epoch": 15.84, + "grad_norm": 5.485124588012695, + "learning_rate": 5.166166492719124e-06, + "loss": 0.874, + "step": 52935 + }, + { + "epoch": 15.84, + "grad_norm": 2.063061475753784, + "learning_rate": 5.162590438805156e-06, + "loss": 0.9525, + "step": 52940 + }, + { + "epoch": 15.84, + "grad_norm": 1.3920997381210327, + "learning_rate": 5.159015480490695e-06, + "loss": 1.0252, + "step": 52945 + }, + { + "epoch": 15.84, + "grad_norm": 1.3368240594863892, + "learning_rate": 5.155441617973189e-06, + "loss": 1.0572, + "step": 52950 + }, + { + "epoch": 15.84, + "grad_norm": 3.9613707065582275, + "learning_rate": 5.151868851450012e-06, + "loss": 0.9808, + "step": 52955 + }, + { + "epoch": 15.85, + "grad_norm": 3.1625585556030273, + "learning_rate": 5.148297181118489e-06, + "loss": 0.9818, + "step": 52960 + }, + { + "epoch": 15.85, + "grad_norm": 3.069246292114258, + "learning_rate": 5.1447266071758785e-06, + "loss": 0.9461, + "step": 52965 + }, + { + "epoch": 15.85, + "grad_norm": 3.266118288040161, + "learning_rate": 5.14115712981938e-06, + "loss": 0.958, + "step": 52970 + }, + { + "epoch": 15.85, + "grad_norm": 2.903211832046509, + "learning_rate": 5.137588749246128e-06, + "loss": 0.9459, + "step": 52975 + }, + { + "epoch": 15.85, + "grad_norm": 4.388901710510254, + "learning_rate": 5.134021465653205e-06, + "loss": 0.8478, + "step": 52980 + }, + { + "epoch": 15.85, + "grad_norm": 3.907933473587036, + "learning_rate": 5.130455279237625e-06, + "loss": 1.1191, + "step": 52985 + }, + { + "epoch": 15.85, + "grad_norm": 1.9974335432052612, + "learning_rate": 5.126890190196348e-06, + "loss": 1.1045, + "step": 52990 + }, + { + "epoch": 15.86, + "grad_norm": 4.463841438293457, + "learning_rate": 5.1233261987262775e-06, + "loss": 0.9843, + "step": 52995 + }, + { + "epoch": 15.86, + "grad_norm": 4.327840805053711, + "learning_rate": 5.119763305024225e-06, + "loss": 0.9954, + "step": 53000 + }, + { + "epoch": 15.86, + "grad_norm": 1.5290764570236206, + "learning_rate": 5.116201509286997e-06, + "loss": 1.2544, + "step": 53005 + }, + { + "epoch": 15.86, + "grad_norm": 2.252065420150757, + "learning_rate": 5.1126408117112755e-06, + "loss": 0.9876, + "step": 53010 + }, + { + "epoch": 15.86, + "grad_norm": 3.484567880630493, + "learning_rate": 5.109081212493744e-06, + "loss": 0.8204, + "step": 53015 + }, + { + "epoch": 15.86, + "grad_norm": 4.542111873626709, + "learning_rate": 5.105522711830968e-06, + "loss": 0.9088, + "step": 53020 + }, + { + "epoch": 15.86, + "grad_norm": 2.7944681644439697, + "learning_rate": 5.101965309919507e-06, + "loss": 0.7633, + "step": 53025 + }, + { + "epoch": 15.87, + "grad_norm": 3.740417718887329, + "learning_rate": 5.098409006955812e-06, + "loss": 0.8264, + "step": 53030 + }, + { + "epoch": 15.87, + "grad_norm": 2.1556479930877686, + "learning_rate": 5.094853803136296e-06, + "loss": 1.0365, + "step": 53035 + }, + { + "epoch": 15.87, + "grad_norm": 3.4103243350982666, + "learning_rate": 5.091299698657326e-06, + "loss": 1.164, + "step": 53040 + }, + { + "epoch": 15.87, + "grad_norm": 2.4072625637054443, + "learning_rate": 5.087746693715167e-06, + "loss": 0.7972, + "step": 53045 + }, + { + "epoch": 15.87, + "grad_norm": 2.5609822273254395, + "learning_rate": 5.084194788506075e-06, + "loss": 1.0668, + "step": 53050 + }, + { + "epoch": 15.87, + "grad_norm": 1.8932149410247803, + "learning_rate": 5.0806439832261935e-06, + "loss": 0.9741, + "step": 53055 + }, + { + "epoch": 15.87, + "grad_norm": 2.655104160308838, + "learning_rate": 5.077094278071643e-06, + "loss": 0.927, + "step": 53060 + }, + { + "epoch": 15.88, + "grad_norm": 2.491745948791504, + "learning_rate": 5.073545673238467e-06, + "loss": 0.8797, + "step": 53065 + }, + { + "epoch": 15.88, + "grad_norm": 2.7281858921051025, + "learning_rate": 5.069998168922649e-06, + "loss": 0.9462, + "step": 53070 + }, + { + "epoch": 15.88, + "grad_norm": 3.0326929092407227, + "learning_rate": 5.066451765320116e-06, + "loss": 0.997, + "step": 53075 + }, + { + "epoch": 15.88, + "grad_norm": 2.410046339035034, + "learning_rate": 5.062906462626732e-06, + "loss": 1.0823, + "step": 53080 + }, + { + "epoch": 15.88, + "grad_norm": 4.804044246673584, + "learning_rate": 5.059362261038303e-06, + "loss": 1.0974, + "step": 53085 + }, + { + "epoch": 15.88, + "grad_norm": 22.172578811645508, + "learning_rate": 5.0558191607505745e-06, + "loss": 0.944, + "step": 53090 + }, + { + "epoch": 15.89, + "grad_norm": 4.430202007293701, + "learning_rate": 5.052277161959209e-06, + "loss": 0.9268, + "step": 53095 + }, + { + "epoch": 15.89, + "grad_norm": 3.1318132877349854, + "learning_rate": 5.048736264859854e-06, + "loss": 0.8596, + "step": 53100 + }, + { + "epoch": 15.89, + "grad_norm": 2.395545244216919, + "learning_rate": 5.045196469648045e-06, + "loss": 0.8246, + "step": 53105 + }, + { + "epoch": 15.89, + "grad_norm": 1.1723501682281494, + "learning_rate": 5.041657776519307e-06, + "loss": 1.017, + "step": 53110 + }, + { + "epoch": 15.89, + "grad_norm": 3.7153756618499756, + "learning_rate": 5.038120185669051e-06, + "loss": 0.8919, + "step": 53115 + }, + { + "epoch": 15.89, + "grad_norm": 1.7875972986221313, + "learning_rate": 5.034583697292675e-06, + "loss": 0.9199, + "step": 53120 + }, + { + "epoch": 15.89, + "grad_norm": 3.715594530105591, + "learning_rate": 5.031048311585493e-06, + "loss": 1.0215, + "step": 53125 + }, + { + "epoch": 15.9, + "grad_norm": 3.1977784633636475, + "learning_rate": 5.027514028742744e-06, + "loss": 1.1078, + "step": 53130 + }, + { + "epoch": 15.9, + "grad_norm": 2.141249895095825, + "learning_rate": 5.023980848959647e-06, + "loss": 1.1468, + "step": 53135 + }, + { + "epoch": 15.9, + "grad_norm": 1.349517583847046, + "learning_rate": 5.020448772431313e-06, + "loss": 0.9661, + "step": 53140 + }, + { + "epoch": 15.9, + "grad_norm": 3.260608673095703, + "learning_rate": 5.016917799352838e-06, + "loss": 0.877, + "step": 53145 + }, + { + "epoch": 15.9, + "grad_norm": 3.2229676246643066, + "learning_rate": 5.0133879299192085e-06, + "loss": 0.9666, + "step": 53150 + }, + { + "epoch": 15.9, + "grad_norm": 4.101037979125977, + "learning_rate": 5.0098591643254e-06, + "loss": 1.0229, + "step": 53155 + }, + { + "epoch": 15.9, + "grad_norm": 2.1944096088409424, + "learning_rate": 5.006331502766287e-06, + "loss": 1.0606, + "step": 53160 + }, + { + "epoch": 15.91, + "grad_norm": 7.716952800750732, + "learning_rate": 5.002804945436701e-06, + "loss": 0.9529, + "step": 53165 + }, + { + "epoch": 15.91, + "grad_norm": 17.72467803955078, + "learning_rate": 4.999279492531414e-06, + "loss": 0.7256, + "step": 53170 + }, + { + "epoch": 15.91, + "grad_norm": 3.899148464202881, + "learning_rate": 4.995755144245129e-06, + "loss": 1.0014, + "step": 53175 + }, + { + "epoch": 15.91, + "grad_norm": 5.196731090545654, + "learning_rate": 4.992231900772495e-06, + "loss": 1.0583, + "step": 53180 + }, + { + "epoch": 15.91, + "grad_norm": 4.577493190765381, + "learning_rate": 4.9887097623081055e-06, + "loss": 0.94, + "step": 53185 + }, + { + "epoch": 15.91, + "grad_norm": 3.10968017578125, + "learning_rate": 4.985188729046458e-06, + "loss": 1.0676, + "step": 53190 + }, + { + "epoch": 15.92, + "grad_norm": 1.4285688400268555, + "learning_rate": 4.981668801182041e-06, + "loss": 1.0163, + "step": 53195 + }, + { + "epoch": 15.92, + "grad_norm": 2.724886894226074, + "learning_rate": 4.97814997890925e-06, + "loss": 0.9586, + "step": 53200 + }, + { + "epoch": 15.92, + "grad_norm": 2.3468189239501953, + "learning_rate": 4.974632262422421e-06, + "loss": 1.0864, + "step": 53205 + }, + { + "epoch": 15.92, + "grad_norm": 5.247642517089844, + "learning_rate": 4.971115651915839e-06, + "loss": 1.1944, + "step": 53210 + }, + { + "epoch": 15.92, + "grad_norm": 4.25636100769043, + "learning_rate": 4.967600147583718e-06, + "loss": 0.9301, + "step": 53215 + }, + { + "epoch": 15.92, + "grad_norm": 3.097277879714966, + "learning_rate": 4.964085749620228e-06, + "loss": 0.9153, + "step": 53220 + }, + { + "epoch": 15.92, + "grad_norm": 2.7797505855560303, + "learning_rate": 4.960572458219443e-06, + "loss": 1.0695, + "step": 53225 + }, + { + "epoch": 15.93, + "grad_norm": 3.9096181392669678, + "learning_rate": 4.957060273575423e-06, + "loss": 0.9084, + "step": 53230 + }, + { + "epoch": 15.93, + "grad_norm": 3.273726463317871, + "learning_rate": 4.953549195882115e-06, + "loss": 0.9885, + "step": 53235 + }, + { + "epoch": 15.93, + "grad_norm": 2.705861806869507, + "learning_rate": 4.950039225333464e-06, + "loss": 0.9194, + "step": 53240 + }, + { + "epoch": 15.93, + "grad_norm": 4.52183198928833, + "learning_rate": 4.946530362123291e-06, + "loss": 0.8444, + "step": 53245 + }, + { + "epoch": 15.93, + "grad_norm": 4.226823329925537, + "learning_rate": 4.943022606445416e-06, + "loss": 0.965, + "step": 53250 + }, + { + "epoch": 15.93, + "grad_norm": 2.7318599224090576, + "learning_rate": 4.939515958493549e-06, + "loss": 0.9715, + "step": 53255 + }, + { + "epoch": 15.93, + "grad_norm": 3.3146235942840576, + "learning_rate": 4.936010418461362e-06, + "loss": 0.8742, + "step": 53260 + }, + { + "epoch": 15.94, + "grad_norm": 9.107588768005371, + "learning_rate": 4.932505986542465e-06, + "loss": 0.8569, + "step": 53265 + }, + { + "epoch": 15.94, + "grad_norm": 4.029197692871094, + "learning_rate": 4.929002662930404e-06, + "loss": 0.9713, + "step": 53270 + }, + { + "epoch": 15.94, + "grad_norm": 3.5045926570892334, + "learning_rate": 4.9255004478186626e-06, + "loss": 0.9804, + "step": 53275 + }, + { + "epoch": 15.94, + "grad_norm": 2.6087844371795654, + "learning_rate": 4.921999341400666e-06, + "loss": 1.0268, + "step": 53280 + }, + { + "epoch": 15.94, + "grad_norm": 1.471195936203003, + "learning_rate": 4.918499343869773e-06, + "loss": 0.8482, + "step": 53285 + }, + { + "epoch": 15.94, + "grad_norm": 2.6033759117126465, + "learning_rate": 4.915000455419289e-06, + "loss": 1.0013, + "step": 53290 + }, + { + "epoch": 15.95, + "grad_norm": 2.508314609527588, + "learning_rate": 4.911502676242455e-06, + "loss": 0.9304, + "step": 53295 + }, + { + "epoch": 15.95, + "grad_norm": 3.0417144298553467, + "learning_rate": 4.908006006532445e-06, + "loss": 0.8015, + "step": 53300 + }, + { + "epoch": 15.95, + "grad_norm": 2.4495913982391357, + "learning_rate": 4.9045104464823795e-06, + "loss": 1.0227, + "step": 53305 + }, + { + "epoch": 15.95, + "grad_norm": 5.05440616607666, + "learning_rate": 4.901015996285313e-06, + "loss": 1.0307, + "step": 53310 + }, + { + "epoch": 15.95, + "grad_norm": 1.3113963603973389, + "learning_rate": 4.897522656134249e-06, + "loss": 1.0566, + "step": 53315 + }, + { + "epoch": 15.95, + "grad_norm": 2.597869634628296, + "learning_rate": 4.8940304262221015e-06, + "loss": 0.8975, + "step": 53320 + }, + { + "epoch": 15.95, + "grad_norm": 3.1354362964630127, + "learning_rate": 4.890539306741765e-06, + "loss": 0.8747, + "step": 53325 + }, + { + "epoch": 15.96, + "grad_norm": 3.2860591411590576, + "learning_rate": 4.887049297886029e-06, + "loss": 0.9559, + "step": 53330 + }, + { + "epoch": 15.96, + "grad_norm": 4.5994062423706055, + "learning_rate": 4.883560399847664e-06, + "loss": 0.9507, + "step": 53335 + }, + { + "epoch": 15.96, + "grad_norm": 4.300266742706299, + "learning_rate": 4.880072612819336e-06, + "loss": 0.9513, + "step": 53340 + }, + { + "epoch": 15.96, + "grad_norm": 5.296845436096191, + "learning_rate": 4.876585936993699e-06, + "loss": 1.0441, + "step": 53345 + }, + { + "epoch": 15.96, + "grad_norm": 2.162454843521118, + "learning_rate": 4.8731003725632965e-06, + "loss": 1.031, + "step": 53350 + }, + { + "epoch": 15.96, + "grad_norm": 3.4292469024658203, + "learning_rate": 4.869615919720641e-06, + "loss": 0.9578, + "step": 53355 + }, + { + "epoch": 15.96, + "grad_norm": 1.5867362022399902, + "learning_rate": 4.866132578658173e-06, + "loss": 1.0536, + "step": 53360 + }, + { + "epoch": 15.97, + "grad_norm": 2.8988234996795654, + "learning_rate": 4.862650349568274e-06, + "loss": 1.0964, + "step": 53365 + }, + { + "epoch": 15.97, + "grad_norm": 1.7928255796432495, + "learning_rate": 4.859169232643265e-06, + "loss": 0.9603, + "step": 53370 + }, + { + "epoch": 15.97, + "grad_norm": 2.7317888736724854, + "learning_rate": 4.855689228075402e-06, + "loss": 0.9083, + "step": 53375 + }, + { + "epoch": 15.97, + "grad_norm": 6.080751419067383, + "learning_rate": 4.852210336056887e-06, + "loss": 0.9522, + "step": 53380 + }, + { + "epoch": 15.97, + "grad_norm": 6.8323073387146, + "learning_rate": 4.848732556779853e-06, + "loss": 1.0334, + "step": 53385 + }, + { + "epoch": 15.97, + "grad_norm": 2.109210968017578, + "learning_rate": 4.84525589043637e-06, + "loss": 0.8857, + "step": 53390 + }, + { + "epoch": 15.98, + "grad_norm": 1.5655369758605957, + "learning_rate": 4.84178033721846e-06, + "loss": 0.8181, + "step": 53395 + }, + { + "epoch": 15.98, + "grad_norm": 2.1813764572143555, + "learning_rate": 4.838305897318063e-06, + "loss": 0.9369, + "step": 53400 + }, + { + "epoch": 15.98, + "grad_norm": 6.134399890899658, + "learning_rate": 4.8348325709270785e-06, + "loss": 0.9357, + "step": 53405 + }, + { + "epoch": 15.98, + "grad_norm": 2.2405998706817627, + "learning_rate": 4.831360358237336e-06, + "loss": 0.9124, + "step": 53410 + }, + { + "epoch": 15.98, + "grad_norm": 3.413001775741577, + "learning_rate": 4.8278892594405826e-06, + "loss": 1.0046, + "step": 53415 + }, + { + "epoch": 15.98, + "grad_norm": 2.7362916469573975, + "learning_rate": 4.82441927472855e-06, + "loss": 0.9897, + "step": 53420 + }, + { + "epoch": 15.98, + "grad_norm": 4.3867340087890625, + "learning_rate": 4.820950404292859e-06, + "loss": 1.0994, + "step": 53425 + }, + { + "epoch": 15.99, + "grad_norm": 7.300239086151123, + "learning_rate": 4.817482648325114e-06, + "loss": 0.9382, + "step": 53430 + }, + { + "epoch": 15.99, + "grad_norm": 5.3722124099731445, + "learning_rate": 4.814016007016811e-06, + "loss": 1.0042, + "step": 53435 + }, + { + "epoch": 15.99, + "grad_norm": 2.9729552268981934, + "learning_rate": 4.810550480559434e-06, + "loss": 0.8934, + "step": 53440 + }, + { + "epoch": 15.99, + "grad_norm": 2.5826375484466553, + "learning_rate": 4.807086069144364e-06, + "loss": 1.0011, + "step": 53445 + }, + { + "epoch": 15.99, + "grad_norm": 3.0726816654205322, + "learning_rate": 4.80362277296294e-06, + "loss": 0.9917, + "step": 53450 + }, + { + "epoch": 15.99, + "grad_norm": 2.546142339706421, + "learning_rate": 4.800160592206435e-06, + "loss": 0.9182, + "step": 53455 + }, + { + "epoch": 15.99, + "grad_norm": 1.8220473527908325, + "learning_rate": 4.796699527066065e-06, + "loss": 0.9448, + "step": 53460 + }, + { + "epoch": 16.0, + "grad_norm": 2.17934250831604, + "learning_rate": 4.79323957773298e-06, + "loss": 1.1478, + "step": 53465 + }, + { + "epoch": 16.0, + "grad_norm": 2.7372896671295166, + "learning_rate": 4.789780744398273e-06, + "loss": 0.8963, + "step": 53470 + }, + { + "epoch": 16.0, + "grad_norm": 3.5111589431762695, + "learning_rate": 4.786323027252965e-06, + "loss": 0.9019, + "step": 53475 + }, + { + "epoch": 16.0, + "grad_norm": 1.6906156539916992, + "learning_rate": 4.782866426488025e-06, + "loss": 0.9621, + "step": 53480 + }, + { + "epoch": 16.0, + "grad_norm": 2.1876065731048584, + "learning_rate": 4.779410942294357e-06, + "loss": 1.0843, + "step": 53485 + }, + { + "epoch": 16.0, + "grad_norm": 2.4555323123931885, + "learning_rate": 4.7759565748628055e-06, + "loss": 0.9055, + "step": 53490 + }, + { + "epoch": 16.01, + "grad_norm": 3.3051624298095703, + "learning_rate": 4.772503324384151e-06, + "loss": 0.7355, + "step": 53495 + }, + { + "epoch": 16.01, + "grad_norm": 1.634621500968933, + "learning_rate": 4.769051191049112e-06, + "loss": 1.1518, + "step": 53500 + }, + { + "epoch": 16.01, + "grad_norm": 5.108380317687988, + "learning_rate": 4.765600175048355e-06, + "loss": 0.7752, + "step": 53505 + }, + { + "epoch": 16.01, + "grad_norm": 3.3601999282836914, + "learning_rate": 4.762150276572452e-06, + "loss": 1.0098, + "step": 53510 + }, + { + "epoch": 16.01, + "grad_norm": 1.3551602363586426, + "learning_rate": 4.758701495811968e-06, + "loss": 0.9517, + "step": 53515 + }, + { + "epoch": 16.01, + "grad_norm": 3.776623249053955, + "learning_rate": 4.755253832957349e-06, + "loss": 0.9535, + "step": 53520 + }, + { + "epoch": 16.01, + "grad_norm": 1.849770188331604, + "learning_rate": 4.751807288199028e-06, + "loss": 1.0225, + "step": 53525 + }, + { + "epoch": 16.02, + "grad_norm": 2.972109794616699, + "learning_rate": 4.7483618617273325e-06, + "loss": 1.0399, + "step": 53530 + }, + { + "epoch": 16.02, + "grad_norm": 1.7321369647979736, + "learning_rate": 4.744917553732572e-06, + "loss": 1.0185, + "step": 53535 + }, + { + "epoch": 16.02, + "grad_norm": 2.0730857849121094, + "learning_rate": 4.741474364404955e-06, + "loss": 1.0239, + "step": 53540 + }, + { + "epoch": 16.02, + "grad_norm": 3.4221105575561523, + "learning_rate": 4.738032293934652e-06, + "loss": 0.8171, + "step": 53545 + }, + { + "epoch": 16.02, + "grad_norm": 1.468760371208191, + "learning_rate": 4.734591342511765e-06, + "loss": 0.9047, + "step": 53550 + }, + { + "epoch": 16.02, + "grad_norm": 2.599126100540161, + "learning_rate": 4.7311515103263315e-06, + "loss": 0.9684, + "step": 53555 + }, + { + "epoch": 16.02, + "grad_norm": 2.2176599502563477, + "learning_rate": 4.727712797568335e-06, + "loss": 0.9353, + "step": 53560 + }, + { + "epoch": 16.03, + "grad_norm": 5.144350528717041, + "learning_rate": 4.7242752044276865e-06, + "loss": 0.9837, + "step": 53565 + }, + { + "epoch": 16.03, + "grad_norm": 1.4376980066299438, + "learning_rate": 4.720838731094243e-06, + "loss": 1.0823, + "step": 53570 + }, + { + "epoch": 16.03, + "grad_norm": 3.1391711235046387, + "learning_rate": 4.717403377757798e-06, + "loss": 0.7762, + "step": 53575 + }, + { + "epoch": 16.03, + "grad_norm": 3.4208807945251465, + "learning_rate": 4.71396914460808e-06, + "loss": 0.8843, + "step": 53580 + }, + { + "epoch": 16.03, + "grad_norm": 3.4522769451141357, + "learning_rate": 4.710536031834761e-06, + "loss": 1.1328, + "step": 53585 + }, + { + "epoch": 16.03, + "grad_norm": 1.8125996589660645, + "learning_rate": 4.707104039627447e-06, + "loss": 0.9399, + "step": 53590 + }, + { + "epoch": 16.04, + "grad_norm": 4.709026336669922, + "learning_rate": 4.703673168175684e-06, + "loss": 0.9658, + "step": 53595 + }, + { + "epoch": 16.04, + "grad_norm": 1.6594618558883667, + "learning_rate": 4.700243417668957e-06, + "loss": 1.03, + "step": 53600 + }, + { + "epoch": 16.04, + "grad_norm": 2.7412757873535156, + "learning_rate": 4.696814788296683e-06, + "loss": 0.945, + "step": 53605 + }, + { + "epoch": 16.04, + "grad_norm": 4.177542686462402, + "learning_rate": 4.693387280248232e-06, + "loss": 0.9507, + "step": 53610 + }, + { + "epoch": 16.04, + "grad_norm": 1.866605520248413, + "learning_rate": 4.68996089371288e-06, + "loss": 0.8753, + "step": 53615 + }, + { + "epoch": 16.04, + "grad_norm": 2.461459159851074, + "learning_rate": 4.686535628879893e-06, + "loss": 0.8427, + "step": 53620 + }, + { + "epoch": 16.04, + "grad_norm": 2.5235543251037598, + "learning_rate": 4.68311148593841e-06, + "loss": 1.0127, + "step": 53625 + }, + { + "epoch": 16.05, + "grad_norm": 3.0015721321105957, + "learning_rate": 4.679688465077578e-06, + "loss": 1.0528, + "step": 53630 + }, + { + "epoch": 16.05, + "grad_norm": 1.9430887699127197, + "learning_rate": 4.676266566486423e-06, + "loss": 0.9704, + "step": 53635 + }, + { + "epoch": 16.05, + "grad_norm": 3.479346752166748, + "learning_rate": 4.672845790353941e-06, + "loss": 0.8397, + "step": 53640 + }, + { + "epoch": 16.05, + "grad_norm": 5.18800163269043, + "learning_rate": 4.669426136869056e-06, + "loss": 0.8035, + "step": 53645 + }, + { + "epoch": 16.05, + "grad_norm": 4.256208419799805, + "learning_rate": 4.666007606220635e-06, + "loss": 0.922, + "step": 53650 + }, + { + "epoch": 16.05, + "grad_norm": 3.6765284538269043, + "learning_rate": 4.662590198597477e-06, + "loss": 0.9406, + "step": 53655 + }, + { + "epoch": 16.05, + "grad_norm": 1.5843440294265747, + "learning_rate": 4.659173914188322e-06, + "loss": 1.2731, + "step": 53660 + }, + { + "epoch": 16.06, + "grad_norm": 2.01247501373291, + "learning_rate": 4.655758753181846e-06, + "loss": 0.95, + "step": 53665 + }, + { + "epoch": 16.06, + "grad_norm": 2.1680808067321777, + "learning_rate": 4.652344715766671e-06, + "loss": 1.0874, + "step": 53670 + }, + { + "epoch": 16.06, + "grad_norm": 1.926619291305542, + "learning_rate": 4.648931802131348e-06, + "loss": 0.935, + "step": 53675 + }, + { + "epoch": 16.06, + "grad_norm": 4.823452472686768, + "learning_rate": 4.645520012464366e-06, + "loss": 0.7357, + "step": 53680 + }, + { + "epoch": 16.06, + "grad_norm": 3.968709945678711, + "learning_rate": 4.6421093469541545e-06, + "loss": 1.1243, + "step": 53685 + }, + { + "epoch": 16.06, + "grad_norm": 2.4009251594543457, + "learning_rate": 4.638699805789088e-06, + "loss": 0.9297, + "step": 53690 + }, + { + "epoch": 16.06, + "grad_norm": 2.19075345993042, + "learning_rate": 4.635291389157462e-06, + "loss": 0.8262, + "step": 53695 + }, + { + "epoch": 16.07, + "grad_norm": 5.621512413024902, + "learning_rate": 4.631884097247527e-06, + "loss": 1.0455, + "step": 53700 + }, + { + "epoch": 16.07, + "grad_norm": 1.1728134155273438, + "learning_rate": 4.628477930247466e-06, + "loss": 0.7797, + "step": 53705 + }, + { + "epoch": 16.07, + "grad_norm": 2.9061529636383057, + "learning_rate": 4.6250728883453835e-06, + "loss": 0.9716, + "step": 53710 + }, + { + "epoch": 16.07, + "grad_norm": 2.0948145389556885, + "learning_rate": 4.621668971729359e-06, + "loss": 0.9648, + "step": 53715 + }, + { + "epoch": 16.07, + "grad_norm": 1.2624660730361938, + "learning_rate": 4.618266180587363e-06, + "loss": 1.1212, + "step": 53720 + }, + { + "epoch": 16.07, + "grad_norm": 1.8535141944885254, + "learning_rate": 4.61486451510735e-06, + "loss": 0.9889, + "step": 53725 + }, + { + "epoch": 16.08, + "grad_norm": 2.1382455825805664, + "learning_rate": 4.61146397547717e-06, + "loss": 0.8759, + "step": 53730 + }, + { + "epoch": 16.08, + "grad_norm": 2.7965381145477295, + "learning_rate": 4.608064561884656e-06, + "loss": 0.8569, + "step": 53735 + }, + { + "epoch": 16.08, + "grad_norm": 3.074979066848755, + "learning_rate": 4.6046662745175325e-06, + "loss": 1.1403, + "step": 53740 + }, + { + "epoch": 16.08, + "grad_norm": 2.496306896209717, + "learning_rate": 4.601269113563492e-06, + "loss": 0.8768, + "step": 53745 + }, + { + "epoch": 16.08, + "grad_norm": 1.5231736898422241, + "learning_rate": 4.597873079210152e-06, + "loss": 1.005, + "step": 53750 + }, + { + "epoch": 16.08, + "grad_norm": 3.4601387977600098, + "learning_rate": 4.594478171645078e-06, + "loss": 1.0404, + "step": 53755 + }, + { + "epoch": 16.08, + "grad_norm": 2.981828451156616, + "learning_rate": 4.591084391055761e-06, + "loss": 1.1838, + "step": 53760 + }, + { + "epoch": 16.09, + "grad_norm": 1.5750380754470825, + "learning_rate": 4.587691737629643e-06, + "loss": 1.0259, + "step": 53765 + }, + { + "epoch": 16.09, + "grad_norm": 1.4507620334625244, + "learning_rate": 4.58430021155409e-06, + "loss": 0.9411, + "step": 53770 + }, + { + "epoch": 16.09, + "grad_norm": 1.5799695253372192, + "learning_rate": 4.580909813016418e-06, + "loss": 1.0206, + "step": 53775 + }, + { + "epoch": 16.09, + "grad_norm": 2.9777636528015137, + "learning_rate": 4.57752054220387e-06, + "loss": 0.9011, + "step": 53780 + }, + { + "epoch": 16.09, + "grad_norm": 2.887299060821533, + "learning_rate": 4.574132399303638e-06, + "loss": 0.9952, + "step": 53785 + }, + { + "epoch": 16.09, + "grad_norm": 2.431195020675659, + "learning_rate": 4.5707453845028365e-06, + "loss": 1.0887, + "step": 53790 + }, + { + "epoch": 16.09, + "grad_norm": 5.010396480560303, + "learning_rate": 4.567359497988538e-06, + "loss": 1.1466, + "step": 53795 + }, + { + "epoch": 16.1, + "grad_norm": 4.0482611656188965, + "learning_rate": 4.563974739947738e-06, + "loss": 0.7066, + "step": 53800 + }, + { + "epoch": 16.1, + "grad_norm": 1.8453367948532104, + "learning_rate": 4.560591110567361e-06, + "loss": 0.9911, + "step": 53805 + }, + { + "epoch": 16.1, + "grad_norm": 4.799133777618408, + "learning_rate": 4.557208610034302e-06, + "loss": 0.9197, + "step": 53810 + }, + { + "epoch": 16.1, + "grad_norm": 2.3460564613342285, + "learning_rate": 4.553827238535352e-06, + "loss": 1.1286, + "step": 53815 + }, + { + "epoch": 16.1, + "grad_norm": 2.8893086910247803, + "learning_rate": 4.550446996257282e-06, + "loss": 0.8731, + "step": 53820 + }, + { + "epoch": 16.1, + "grad_norm": 3.3876943588256836, + "learning_rate": 4.5470678833867575e-06, + "loss": 1.0628, + "step": 53825 + }, + { + "epoch": 16.11, + "grad_norm": 3.1112098693847656, + "learning_rate": 4.543689900110423e-06, + "loss": 0.7802, + "step": 53830 + }, + { + "epoch": 16.11, + "grad_norm": 3.280080795288086, + "learning_rate": 4.54031304661483e-06, + "loss": 0.9001, + "step": 53835 + }, + { + "epoch": 16.11, + "grad_norm": 2.949982166290283, + "learning_rate": 4.536937323086479e-06, + "loss": 0.8499, + "step": 53840 + }, + { + "epoch": 16.11, + "grad_norm": 1.0892285108566284, + "learning_rate": 4.53356272971181e-06, + "loss": 1.1195, + "step": 53845 + }, + { + "epoch": 16.11, + "grad_norm": 1.7821409702301025, + "learning_rate": 4.53018926667719e-06, + "loss": 1.0122, + "step": 53850 + }, + { + "epoch": 16.11, + "grad_norm": 3.348202705383301, + "learning_rate": 4.526816934168954e-06, + "loss": 0.9711, + "step": 53855 + }, + { + "epoch": 16.11, + "grad_norm": 3.279318332672119, + "learning_rate": 4.52344573237333e-06, + "loss": 0.9613, + "step": 53860 + }, + { + "epoch": 16.12, + "grad_norm": 1.0132023096084595, + "learning_rate": 4.520075661476517e-06, + "loss": 0.9816, + "step": 53865 + }, + { + "epoch": 16.12, + "grad_norm": 2.115264892578125, + "learning_rate": 4.516706721664635e-06, + "loss": 1.0883, + "step": 53870 + }, + { + "epoch": 16.12, + "grad_norm": 4.187863826751709, + "learning_rate": 4.5133389131237495e-06, + "loss": 1.0543, + "step": 53875 + }, + { + "epoch": 16.12, + "grad_norm": 3.177401065826416, + "learning_rate": 4.509972236039861e-06, + "loss": 0.8548, + "step": 53880 + }, + { + "epoch": 16.12, + "grad_norm": 3.7143056392669678, + "learning_rate": 4.50660669059891e-06, + "loss": 1.0591, + "step": 53885 + }, + { + "epoch": 16.12, + "grad_norm": 2.0598580837249756, + "learning_rate": 4.503242276986769e-06, + "loss": 1.0228, + "step": 53890 + }, + { + "epoch": 16.12, + "grad_norm": 2.2336246967315674, + "learning_rate": 4.4998789953892566e-06, + "loss": 0.8827, + "step": 53895 + }, + { + "epoch": 16.13, + "grad_norm": 2.2673490047454834, + "learning_rate": 4.496516845992108e-06, + "loss": 1.0858, + "step": 53900 + }, + { + "epoch": 16.13, + "grad_norm": 1.8698546886444092, + "learning_rate": 4.493155828981033e-06, + "loss": 1.0087, + "step": 53905 + }, + { + "epoch": 16.13, + "grad_norm": 1.9772906303405762, + "learning_rate": 4.489795944541633e-06, + "loss": 0.9655, + "step": 53910 + }, + { + "epoch": 16.13, + "grad_norm": 1.3239150047302246, + "learning_rate": 4.486437192859496e-06, + "loss": 0.8522, + "step": 53915 + }, + { + "epoch": 16.13, + "grad_norm": 2.097325086593628, + "learning_rate": 4.483079574120097e-06, + "loss": 1.1089, + "step": 53920 + }, + { + "epoch": 16.13, + "grad_norm": 2.80251407623291, + "learning_rate": 4.479723088508897e-06, + "loss": 0.9511, + "step": 53925 + }, + { + "epoch": 16.14, + "grad_norm": 2.2428815364837646, + "learning_rate": 4.476367736211265e-06, + "loss": 0.7491, + "step": 53930 + }, + { + "epoch": 16.14, + "grad_norm": 4.145894527435303, + "learning_rate": 4.4730135174124974e-06, + "loss": 1.0072, + "step": 53935 + }, + { + "epoch": 16.14, + "grad_norm": 1.747147560119629, + "learning_rate": 4.469660432297868e-06, + "loss": 0.9186, + "step": 53940 + }, + { + "epoch": 16.14, + "grad_norm": 2.3156204223632812, + "learning_rate": 4.466308481052542e-06, + "loss": 1.0476, + "step": 53945 + }, + { + "epoch": 16.14, + "grad_norm": 1.5770155191421509, + "learning_rate": 4.4636277365666055e-06, + "loss": 1.0194, + "step": 53950 + }, + { + "epoch": 16.14, + "grad_norm": 5.032956123352051, + "learning_rate": 4.460277826752518e-06, + "loss": 0.9442, + "step": 53955 + }, + { + "epoch": 16.14, + "grad_norm": 2.544370651245117, + "learning_rate": 4.456929051325948e-06, + "loss": 0.9561, + "step": 53960 + }, + { + "epoch": 16.15, + "grad_norm": 2.7542896270751953, + "learning_rate": 4.453581410471821e-06, + "loss": 0.9285, + "step": 53965 + }, + { + "epoch": 16.15, + "grad_norm": 4.545336723327637, + "learning_rate": 4.450234904375047e-06, + "loss": 0.8249, + "step": 53970 + }, + { + "epoch": 16.15, + "grad_norm": 6.846313953399658, + "learning_rate": 4.446889533220427e-06, + "loss": 0.9482, + "step": 53975 + }, + { + "epoch": 16.15, + "grad_norm": 2.253056764602661, + "learning_rate": 4.443545297192747e-06, + "loss": 1.0561, + "step": 53980 + }, + { + "epoch": 16.15, + "grad_norm": 3.439204216003418, + "learning_rate": 4.440202196476687e-06, + "loss": 0.8451, + "step": 53985 + }, + { + "epoch": 16.15, + "grad_norm": 2.593958854675293, + "learning_rate": 4.43686023125689e-06, + "loss": 0.8443, + "step": 53990 + }, + { + "epoch": 16.15, + "grad_norm": 3.0037314891815186, + "learning_rate": 4.433519401717925e-06, + "loss": 0.8752, + "step": 53995 + }, + { + "epoch": 16.16, + "grad_norm": 1.4025530815124512, + "learning_rate": 4.43017970804431e-06, + "loss": 0.8974, + "step": 54000 + }, + { + "epoch": 16.16, + "grad_norm": 3.922529935836792, + "learning_rate": 4.426841150420485e-06, + "loss": 1.1543, + "step": 54005 + }, + { + "epoch": 16.16, + "grad_norm": 2.439948797225952, + "learning_rate": 4.4235037290308425e-06, + "loss": 0.8726, + "step": 54010 + }, + { + "epoch": 16.16, + "grad_norm": 2.4692089557647705, + "learning_rate": 4.420167444059698e-06, + "loss": 0.8118, + "step": 54015 + }, + { + "epoch": 16.16, + "grad_norm": 2.5722415447235107, + "learning_rate": 4.416832295691314e-06, + "loss": 1.0299, + "step": 54020 + }, + { + "epoch": 16.16, + "grad_norm": 2.7503256797790527, + "learning_rate": 4.413498284109888e-06, + "loss": 1.172, + "step": 54025 + }, + { + "epoch": 16.17, + "grad_norm": 3.138927459716797, + "learning_rate": 4.410165409499553e-06, + "loss": 0.941, + "step": 54030 + }, + { + "epoch": 16.17, + "grad_norm": 2.402808666229248, + "learning_rate": 4.40683367204438e-06, + "loss": 0.9587, + "step": 54035 + }, + { + "epoch": 16.17, + "grad_norm": 3.9878361225128174, + "learning_rate": 4.403503071928378e-06, + "loss": 0.8567, + "step": 54040 + }, + { + "epoch": 16.17, + "grad_norm": 1.6606813669204712, + "learning_rate": 4.4001736093355005e-06, + "loss": 1.0092, + "step": 54045 + }, + { + "epoch": 16.17, + "grad_norm": 3.9329001903533936, + "learning_rate": 4.396845284449608e-06, + "loss": 1.008, + "step": 54050 + }, + { + "epoch": 16.17, + "grad_norm": 5.513614654541016, + "learning_rate": 4.393518097454546e-06, + "loss": 1.0288, + "step": 54055 + }, + { + "epoch": 16.17, + "grad_norm": 1.570499062538147, + "learning_rate": 4.39019204853405e-06, + "loss": 1.0363, + "step": 54060 + }, + { + "epoch": 16.18, + "grad_norm": 2.9202938079833984, + "learning_rate": 4.386867137871839e-06, + "loss": 0.8739, + "step": 54065 + }, + { + "epoch": 16.18, + "grad_norm": 1.5912072658538818, + "learning_rate": 4.383543365651513e-06, + "loss": 0.9977, + "step": 54070 + }, + { + "epoch": 16.18, + "grad_norm": 1.2433182001113892, + "learning_rate": 4.380220732056673e-06, + "loss": 0.9558, + "step": 54075 + }, + { + "epoch": 16.18, + "grad_norm": 3.465158700942993, + "learning_rate": 4.376899237270798e-06, + "loss": 0.9604, + "step": 54080 + }, + { + "epoch": 16.18, + "grad_norm": 4.610307216644287, + "learning_rate": 4.373578881477347e-06, + "loss": 0.9055, + "step": 54085 + }, + { + "epoch": 16.18, + "grad_norm": 1.9522101879119873, + "learning_rate": 4.370259664859691e-06, + "loss": 0.9228, + "step": 54090 + }, + { + "epoch": 16.18, + "grad_norm": 1.3069242238998413, + "learning_rate": 4.3669415876011535e-06, + "loss": 1.0042, + "step": 54095 + }, + { + "epoch": 16.19, + "grad_norm": 2.4760279655456543, + "learning_rate": 4.363624649884982e-06, + "loss": 1.0021, + "step": 54100 + }, + { + "epoch": 16.19, + "grad_norm": 1.8461346626281738, + "learning_rate": 4.360308851894366e-06, + "loss": 1.0572, + "step": 54105 + }, + { + "epoch": 16.19, + "grad_norm": 3.235544443130493, + "learning_rate": 4.35699419381245e-06, + "loss": 0.9269, + "step": 54110 + }, + { + "epoch": 16.19, + "grad_norm": 2.0938470363616943, + "learning_rate": 4.353680675822281e-06, + "loss": 0.9334, + "step": 54115 + }, + { + "epoch": 16.19, + "grad_norm": 1.0838735103607178, + "learning_rate": 4.350368298106869e-06, + "loss": 1.0252, + "step": 54120 + }, + { + "epoch": 16.19, + "grad_norm": 4.055896282196045, + "learning_rate": 4.34705706084915e-06, + "loss": 0.9531, + "step": 54125 + }, + { + "epoch": 16.2, + "grad_norm": 1.868022084236145, + "learning_rate": 4.343746964232004e-06, + "loss": 1.0052, + "step": 54130 + }, + { + "epoch": 16.2, + "grad_norm": 2.870447874069214, + "learning_rate": 4.340438008438241e-06, + "loss": 1.0222, + "step": 54135 + }, + { + "epoch": 16.2, + "grad_norm": 3.2282488346099854, + "learning_rate": 4.3371301936506205e-06, + "loss": 1.0202, + "step": 54140 + }, + { + "epoch": 16.2, + "grad_norm": 2.0114264488220215, + "learning_rate": 4.333823520051805e-06, + "loss": 1.038, + "step": 54145 + }, + { + "epoch": 16.2, + "grad_norm": 1.6153552532196045, + "learning_rate": 4.330517987824451e-06, + "loss": 1.0861, + "step": 54150 + }, + { + "epoch": 16.2, + "grad_norm": 4.407323360443115, + "learning_rate": 4.327213597151092e-06, + "loss": 1.1782, + "step": 54155 + }, + { + "epoch": 16.2, + "grad_norm": 2.3100712299346924, + "learning_rate": 4.323910348214247e-06, + "loss": 0.7662, + "step": 54160 + }, + { + "epoch": 16.21, + "grad_norm": 2.331082582473755, + "learning_rate": 4.320608241196333e-06, + "loss": 0.9865, + "step": 54165 + }, + { + "epoch": 16.21, + "grad_norm": 2.015458822250366, + "learning_rate": 4.317307276279742e-06, + "loss": 1.0186, + "step": 54170 + }, + { + "epoch": 16.21, + "grad_norm": 2.7874059677124023, + "learning_rate": 4.3140074536467625e-06, + "loss": 1.0223, + "step": 54175 + }, + { + "epoch": 16.21, + "grad_norm": 1.4887959957122803, + "learning_rate": 4.310708773479652e-06, + "loss": 1.0826, + "step": 54180 + }, + { + "epoch": 16.21, + "grad_norm": 3.373190402984619, + "learning_rate": 4.307411235960593e-06, + "loss": 1.1612, + "step": 54185 + }, + { + "epoch": 16.21, + "grad_norm": 5.217267990112305, + "learning_rate": 4.304114841271692e-06, + "loss": 1.0709, + "step": 54190 + }, + { + "epoch": 16.21, + "grad_norm": 1.9477370977401733, + "learning_rate": 4.3008195895950315e-06, + "loss": 0.9157, + "step": 54195 + }, + { + "epoch": 16.22, + "grad_norm": 3.2760956287384033, + "learning_rate": 4.297525481112577e-06, + "loss": 0.9145, + "step": 54200 + }, + { + "epoch": 16.22, + "grad_norm": 4.605449199676514, + "learning_rate": 4.2942325160062826e-06, + "loss": 0.9861, + "step": 54205 + }, + { + "epoch": 16.22, + "grad_norm": 1.7385135889053345, + "learning_rate": 4.290940694457995e-06, + "loss": 1.0439, + "step": 54210 + }, + { + "epoch": 16.22, + "grad_norm": 2.1554768085479736, + "learning_rate": 4.287650016649533e-06, + "loss": 1.2434, + "step": 54215 + }, + { + "epoch": 16.22, + "grad_norm": 3.2271509170532227, + "learning_rate": 4.284360482762626e-06, + "loss": 0.9879, + "step": 54220 + }, + { + "epoch": 16.22, + "grad_norm": 2.956493616104126, + "learning_rate": 4.28107209297896e-06, + "loss": 1.2336, + "step": 54225 + }, + { + "epoch": 16.22, + "grad_norm": 2.8822336196899414, + "learning_rate": 4.277784847480145e-06, + "loss": 0.9283, + "step": 54230 + }, + { + "epoch": 16.23, + "grad_norm": 3.9838624000549316, + "learning_rate": 4.274498746447739e-06, + "loss": 0.999, + "step": 54235 + }, + { + "epoch": 16.23, + "grad_norm": 1.797528862953186, + "learning_rate": 4.271213790063213e-06, + "loss": 0.9987, + "step": 54240 + }, + { + "epoch": 16.23, + "grad_norm": 8.245445251464844, + "learning_rate": 4.267929978508014e-06, + "loss": 0.8547, + "step": 54245 + }, + { + "epoch": 16.23, + "grad_norm": 1.7248831987380981, + "learning_rate": 4.264647311963482e-06, + "loss": 1.2987, + "step": 54250 + }, + { + "epoch": 16.23, + "grad_norm": 1.602921485900879, + "learning_rate": 4.261365790610935e-06, + "loss": 1.0701, + "step": 54255 + }, + { + "epoch": 16.23, + "grad_norm": 1.774390459060669, + "learning_rate": 4.258085414631591e-06, + "loss": 0.9237, + "step": 54260 + }, + { + "epoch": 16.24, + "grad_norm": 2.390256881713867, + "learning_rate": 4.254806184206634e-06, + "loss": 1.0289, + "step": 54265 + }, + { + "epoch": 16.24, + "grad_norm": 2.603825330734253, + "learning_rate": 4.2515280995171715e-06, + "loss": 0.9717, + "step": 54270 + }, + { + "epoch": 16.24, + "grad_norm": 1.769459843635559, + "learning_rate": 4.248251160744235e-06, + "loss": 1.237, + "step": 54275 + }, + { + "epoch": 16.24, + "grad_norm": 2.9963274002075195, + "learning_rate": 4.2449753680688295e-06, + "loss": 0.7686, + "step": 54280 + }, + { + "epoch": 16.24, + "grad_norm": 1.8369311094284058, + "learning_rate": 4.241700721671849e-06, + "loss": 1.0841, + "step": 54285 + }, + { + "epoch": 16.24, + "grad_norm": 1.0869977474212646, + "learning_rate": 4.23842722173417e-06, + "loss": 0.7985, + "step": 54290 + }, + { + "epoch": 16.24, + "grad_norm": 2.742431402206421, + "learning_rate": 4.235154868436564e-06, + "loss": 1.0202, + "step": 54295 + }, + { + "epoch": 16.25, + "grad_norm": 2.4266834259033203, + "learning_rate": 4.231883661959785e-06, + "loss": 0.8518, + "step": 54300 + }, + { + "epoch": 16.25, + "grad_norm": 3.0461745262145996, + "learning_rate": 4.228613602484477e-06, + "loss": 0.8847, + "step": 54305 + }, + { + "epoch": 16.25, + "grad_norm": 2.476036787033081, + "learning_rate": 4.225344690191247e-06, + "loss": 0.8996, + "step": 54310 + }, + { + "epoch": 16.25, + "grad_norm": 3.7024502754211426, + "learning_rate": 4.222076925260637e-06, + "loss": 0.9061, + "step": 54315 + }, + { + "epoch": 16.25, + "grad_norm": 3.387453556060791, + "learning_rate": 4.218810307873122e-06, + "loss": 1.0476, + "step": 54320 + }, + { + "epoch": 16.25, + "grad_norm": 4.558333396911621, + "learning_rate": 4.215544838209112e-06, + "loss": 0.797, + "step": 54325 + }, + { + "epoch": 16.25, + "grad_norm": 3.4511423110961914, + "learning_rate": 4.2122805164489575e-06, + "loss": 0.7751, + "step": 54330 + }, + { + "epoch": 16.26, + "grad_norm": 2.4084949493408203, + "learning_rate": 4.209017342772939e-06, + "loss": 0.8006, + "step": 54335 + }, + { + "epoch": 16.26, + "grad_norm": 3.032461643218994, + "learning_rate": 4.205755317361293e-06, + "loss": 0.9037, + "step": 54340 + }, + { + "epoch": 16.26, + "grad_norm": 1.9977115392684937, + "learning_rate": 4.20249444039415e-06, + "loss": 0.9293, + "step": 54345 + }, + { + "epoch": 16.26, + "grad_norm": 1.7754746675491333, + "learning_rate": 4.199234712051628e-06, + "loss": 1.0857, + "step": 54350 + }, + { + "epoch": 16.26, + "grad_norm": 3.022590398788452, + "learning_rate": 4.195976132513754e-06, + "loss": 0.8392, + "step": 54355 + }, + { + "epoch": 16.26, + "grad_norm": 3.174025535583496, + "learning_rate": 4.192718701960491e-06, + "loss": 0.9006, + "step": 54360 + }, + { + "epoch": 16.27, + "grad_norm": 12.422464370727539, + "learning_rate": 4.189462420571752e-06, + "loss": 1.0187, + "step": 54365 + }, + { + "epoch": 16.27, + "grad_norm": 1.5121108293533325, + "learning_rate": 4.186207288527361e-06, + "loss": 0.9843, + "step": 54370 + }, + { + "epoch": 16.27, + "grad_norm": 2.849782705307007, + "learning_rate": 4.18295330600712e-06, + "loss": 1.0467, + "step": 54375 + }, + { + "epoch": 16.27, + "grad_norm": 3.5961849689483643, + "learning_rate": 4.179700473190717e-06, + "loss": 0.9365, + "step": 54380 + }, + { + "epoch": 16.27, + "grad_norm": 2.3234174251556396, + "learning_rate": 4.176448790257828e-06, + "loss": 0.8933, + "step": 54385 + }, + { + "epoch": 16.27, + "grad_norm": 1.903601050376892, + "learning_rate": 4.173198257388014e-06, + "loss": 0.8867, + "step": 54390 + }, + { + "epoch": 16.27, + "grad_norm": 5.895204067230225, + "learning_rate": 4.169948874760823e-06, + "loss": 1.0914, + "step": 54395 + }, + { + "epoch": 16.28, + "grad_norm": 2.9874160289764404, + "learning_rate": 4.166700642555702e-06, + "loss": 1.0449, + "step": 54400 + }, + { + "epoch": 16.28, + "grad_norm": 2.241297721862793, + "learning_rate": 4.1634535609520465e-06, + "loss": 1.0472, + "step": 54405 + }, + { + "epoch": 16.28, + "grad_norm": 3.7421207427978516, + "learning_rate": 4.160207630129192e-06, + "loss": 1.1745, + "step": 54410 + }, + { + "epoch": 16.28, + "grad_norm": 4.566219329833984, + "learning_rate": 4.15696285026641e-06, + "loss": 1.0118, + "step": 54415 + }, + { + "epoch": 16.28, + "grad_norm": 1.1942859888076782, + "learning_rate": 4.153719221542904e-06, + "loss": 1.0707, + "step": 54420 + }, + { + "epoch": 16.28, + "grad_norm": 3.381263494491577, + "learning_rate": 4.150476744137818e-06, + "loss": 0.9244, + "step": 54425 + }, + { + "epoch": 16.28, + "grad_norm": 2.5632987022399902, + "learning_rate": 4.147235418230227e-06, + "loss": 0.9353, + "step": 54430 + }, + { + "epoch": 16.29, + "grad_norm": 4.676177978515625, + "learning_rate": 4.143995243999152e-06, + "loss": 1.0719, + "step": 54435 + }, + { + "epoch": 16.29, + "grad_norm": 3.268036365509033, + "learning_rate": 4.140756221623537e-06, + "loss": 0.7861, + "step": 54440 + }, + { + "epoch": 16.29, + "grad_norm": 1.5229823589324951, + "learning_rate": 4.137518351282277e-06, + "loss": 0.874, + "step": 54445 + }, + { + "epoch": 16.29, + "grad_norm": 4.277475833892822, + "learning_rate": 4.1342816331541915e-06, + "loss": 0.9358, + "step": 54450 + }, + { + "epoch": 16.29, + "grad_norm": 3.253185510635376, + "learning_rate": 4.131046067418043e-06, + "loss": 1.0229, + "step": 54455 + }, + { + "epoch": 16.29, + "grad_norm": 1.8826074600219727, + "learning_rate": 4.127811654252531e-06, + "loss": 0.8511, + "step": 54460 + }, + { + "epoch": 16.3, + "grad_norm": 4.963164806365967, + "learning_rate": 4.124578393836282e-06, + "loss": 1.0294, + "step": 54465 + }, + { + "epoch": 16.3, + "grad_norm": 4.152764320373535, + "learning_rate": 4.121346286347877e-06, + "loss": 1.0621, + "step": 54470 + }, + { + "epoch": 16.3, + "grad_norm": 4.597795009613037, + "learning_rate": 4.1181153319658025e-06, + "loss": 0.8713, + "step": 54475 + }, + { + "epoch": 16.3, + "grad_norm": 2.7429773807525635, + "learning_rate": 4.1148855308685256e-06, + "loss": 0.9384, + "step": 54480 + }, + { + "epoch": 16.3, + "grad_norm": 3.036653518676758, + "learning_rate": 4.111656883234396e-06, + "loss": 1.026, + "step": 54485 + }, + { + "epoch": 16.3, + "grad_norm": 3.560821533203125, + "learning_rate": 4.108429389241761e-06, + "loss": 0.9886, + "step": 54490 + }, + { + "epoch": 16.3, + "grad_norm": 3.0521786212921143, + "learning_rate": 4.105203049068848e-06, + "loss": 1.0105, + "step": 54495 + }, + { + "epoch": 16.31, + "grad_norm": 2.9421958923339844, + "learning_rate": 4.101977862893852e-06, + "loss": 0.8541, + "step": 54500 + }, + { + "epoch": 16.31, + "grad_norm": 2.5392541885375977, + "learning_rate": 4.098753830894894e-06, + "loss": 1.1314, + "step": 54505 + }, + { + "epoch": 16.31, + "grad_norm": 2.7192037105560303, + "learning_rate": 4.09553095325004e-06, + "loss": 1.0789, + "step": 54510 + }, + { + "epoch": 16.31, + "grad_norm": 1.7262064218521118, + "learning_rate": 4.092309230137282e-06, + "loss": 0.901, + "step": 54515 + }, + { + "epoch": 16.31, + "grad_norm": 6.73134708404541, + "learning_rate": 4.08908866173455e-06, + "loss": 0.8987, + "step": 54520 + }, + { + "epoch": 16.31, + "grad_norm": 2.3271663188934326, + "learning_rate": 4.085869248219715e-06, + "loss": 1.0085, + "step": 54525 + }, + { + "epoch": 16.31, + "grad_norm": 4.21439266204834, + "learning_rate": 4.082650989770584e-06, + "loss": 0.9719, + "step": 54530 + }, + { + "epoch": 16.32, + "grad_norm": 3.639162540435791, + "learning_rate": 4.079433886564899e-06, + "loss": 0.9036, + "step": 54535 + }, + { + "epoch": 16.32, + "grad_norm": 2.1096203327178955, + "learning_rate": 4.0762179387803305e-06, + "loss": 1.0381, + "step": 54540 + }, + { + "epoch": 16.32, + "grad_norm": 4.696827411651611, + "learning_rate": 4.073003146594498e-06, + "loss": 0.8693, + "step": 54545 + }, + { + "epoch": 16.32, + "grad_norm": 4.135863304138184, + "learning_rate": 4.069789510184951e-06, + "loss": 0.7749, + "step": 54550 + }, + { + "epoch": 16.32, + "grad_norm": 2.406681537628174, + "learning_rate": 4.0665770297291714e-06, + "loss": 1.0704, + "step": 54555 + }, + { + "epoch": 16.32, + "grad_norm": 2.2193562984466553, + "learning_rate": 4.063365705404584e-06, + "loss": 1.1637, + "step": 54560 + }, + { + "epoch": 16.33, + "grad_norm": 2.6856563091278076, + "learning_rate": 4.060155537388552e-06, + "loss": 1.0761, + "step": 54565 + }, + { + "epoch": 16.33, + "grad_norm": 5.369014263153076, + "learning_rate": 4.056946525858352e-06, + "loss": 1.0351, + "step": 54570 + }, + { + "epoch": 16.33, + "grad_norm": 2.9913430213928223, + "learning_rate": 4.053738670991239e-06, + "loss": 1.006, + "step": 54575 + }, + { + "epoch": 16.33, + "grad_norm": 1.8716338872909546, + "learning_rate": 4.0505319729643554e-06, + "loss": 0.9395, + "step": 54580 + }, + { + "epoch": 16.33, + "grad_norm": 1.9372947216033936, + "learning_rate": 4.0473264319548256e-06, + "loss": 0.8675, + "step": 54585 + }, + { + "epoch": 16.33, + "grad_norm": 2.8180017471313477, + "learning_rate": 4.044122048139673e-06, + "loss": 0.9045, + "step": 54590 + }, + { + "epoch": 16.33, + "grad_norm": 2.123150110244751, + "learning_rate": 4.0409188216958775e-06, + "loss": 0.8954, + "step": 54595 + }, + { + "epoch": 16.34, + "grad_norm": 1.7818152904510498, + "learning_rate": 4.037716752800347e-06, + "loss": 1.0991, + "step": 54600 + }, + { + "epoch": 16.34, + "grad_norm": 1.8431493043899536, + "learning_rate": 4.034515841629932e-06, + "loss": 1.1051, + "step": 54605 + }, + { + "epoch": 16.34, + "grad_norm": 1.9420349597930908, + "learning_rate": 4.031316088361417e-06, + "loss": 0.7161, + "step": 54610 + }, + { + "epoch": 16.34, + "grad_norm": 2.2753307819366455, + "learning_rate": 4.0281174931715185e-06, + "loss": 0.8588, + "step": 54615 + }, + { + "epoch": 16.34, + "grad_norm": 1.6886123418807983, + "learning_rate": 4.024920056236889e-06, + "loss": 0.9475, + "step": 54620 + }, + { + "epoch": 16.34, + "grad_norm": 2.501573324203491, + "learning_rate": 4.021723777734124e-06, + "loss": 0.935, + "step": 54625 + }, + { + "epoch": 16.34, + "grad_norm": 2.7062265872955322, + "learning_rate": 4.018528657839752e-06, + "loss": 1.1356, + "step": 54630 + }, + { + "epoch": 16.35, + "grad_norm": 1.7512439489364624, + "learning_rate": 4.015334696730227e-06, + "loss": 1.0477, + "step": 54635 + }, + { + "epoch": 16.35, + "grad_norm": 3.424471139907837, + "learning_rate": 4.012141894581958e-06, + "loss": 0.9916, + "step": 54640 + }, + { + "epoch": 16.35, + "grad_norm": 2.5888991355895996, + "learning_rate": 4.008950251571278e-06, + "loss": 0.8972, + "step": 54645 + }, + { + "epoch": 16.35, + "grad_norm": 2.482107639312744, + "learning_rate": 4.005759767874453e-06, + "loss": 0.9857, + "step": 54650 + }, + { + "epoch": 16.35, + "grad_norm": 4.3171796798706055, + "learning_rate": 4.002570443667697e-06, + "loss": 1.0612, + "step": 54655 + }, + { + "epoch": 16.35, + "grad_norm": 2.3370325565338135, + "learning_rate": 3.999382279127153e-06, + "loss": 1.032, + "step": 54660 + }, + { + "epoch": 16.36, + "grad_norm": 2.1718616485595703, + "learning_rate": 3.996195274428888e-06, + "loss": 1.2058, + "step": 54665 + }, + { + "epoch": 16.36, + "grad_norm": 4.225613594055176, + "learning_rate": 3.993009429748937e-06, + "loss": 0.8797, + "step": 54670 + }, + { + "epoch": 16.36, + "grad_norm": 3.9194295406341553, + "learning_rate": 3.989824745263224e-06, + "loss": 0.8541, + "step": 54675 + }, + { + "epoch": 16.36, + "grad_norm": 3.645948886871338, + "learning_rate": 3.986641221147666e-06, + "loss": 1.0261, + "step": 54680 + }, + { + "epoch": 16.36, + "grad_norm": 1.8858450651168823, + "learning_rate": 3.9834588575780595e-06, + "loss": 1.023, + "step": 54685 + }, + { + "epoch": 16.36, + "grad_norm": 2.733396291732788, + "learning_rate": 3.9802776547301884e-06, + "loss": 1.2168, + "step": 54690 + }, + { + "epoch": 16.36, + "grad_norm": 1.9093108177185059, + "learning_rate": 3.977097612779723e-06, + "loss": 0.9631, + "step": 54695 + }, + { + "epoch": 16.37, + "grad_norm": 3.2848117351531982, + "learning_rate": 3.9739187319023066e-06, + "loss": 0.972, + "step": 54700 + }, + { + "epoch": 16.37, + "grad_norm": 1.4223105907440186, + "learning_rate": 3.970741012273507e-06, + "loss": 0.9396, + "step": 54705 + }, + { + "epoch": 16.37, + "grad_norm": 1.8752068281173706, + "learning_rate": 3.967564454068817e-06, + "loss": 0.8828, + "step": 54710 + }, + { + "epoch": 16.37, + "grad_norm": 2.8616607189178467, + "learning_rate": 3.964389057463683e-06, + "loss": 1.0098, + "step": 54715 + }, + { + "epoch": 16.37, + "grad_norm": 2.8053674697875977, + "learning_rate": 3.961214822633477e-06, + "loss": 0.9985, + "step": 54720 + }, + { + "epoch": 16.37, + "grad_norm": 3.857592821121216, + "learning_rate": 3.958041749753505e-06, + "loss": 0.9801, + "step": 54725 + }, + { + "epoch": 16.37, + "grad_norm": 1.627759337425232, + "learning_rate": 3.954869838999017e-06, + "loss": 0.9103, + "step": 54730 + }, + { + "epoch": 16.38, + "grad_norm": 3.5982964038848877, + "learning_rate": 3.951699090545194e-06, + "loss": 0.7888, + "step": 54735 + }, + { + "epoch": 16.38, + "grad_norm": 3.797572135925293, + "learning_rate": 3.948529504567147e-06, + "loss": 0.9846, + "step": 54740 + }, + { + "epoch": 16.38, + "grad_norm": 2.0291848182678223, + "learning_rate": 3.9453610812399375e-06, + "loss": 0.8557, + "step": 54745 + }, + { + "epoch": 16.38, + "grad_norm": 1.8980909585952759, + "learning_rate": 3.9421938207385485e-06, + "loss": 1.0815, + "step": 54750 + }, + { + "epoch": 16.38, + "grad_norm": 3.711595058441162, + "learning_rate": 3.939027723237914e-06, + "loss": 0.9014, + "step": 54755 + }, + { + "epoch": 16.38, + "grad_norm": 1.4966007471084595, + "learning_rate": 3.935862788912875e-06, + "loss": 0.9629, + "step": 54760 + }, + { + "epoch": 16.39, + "grad_norm": 4.108902931213379, + "learning_rate": 3.932699017938251e-06, + "loss": 1.071, + "step": 54765 + }, + { + "epoch": 16.39, + "grad_norm": 2.2687480449676514, + "learning_rate": 3.929536410488749e-06, + "loss": 0.8898, + "step": 54770 + }, + { + "epoch": 16.39, + "grad_norm": 1.9844911098480225, + "learning_rate": 3.926374966739063e-06, + "loss": 0.9253, + "step": 54775 + }, + { + "epoch": 16.39, + "grad_norm": 1.7941007614135742, + "learning_rate": 3.923214686863769e-06, + "loss": 0.9916, + "step": 54780 + }, + { + "epoch": 16.39, + "grad_norm": 3.715179681777954, + "learning_rate": 3.920055571037431e-06, + "loss": 0.9925, + "step": 54785 + }, + { + "epoch": 16.39, + "grad_norm": 3.1290266513824463, + "learning_rate": 3.916897619434509e-06, + "loss": 0.9698, + "step": 54790 + }, + { + "epoch": 16.39, + "grad_norm": 2.829307794570923, + "learning_rate": 3.913740832229415e-06, + "loss": 0.9611, + "step": 54795 + }, + { + "epoch": 16.4, + "grad_norm": 4.010441780090332, + "learning_rate": 3.910585209596499e-06, + "loss": 0.8892, + "step": 54800 + }, + { + "epoch": 16.4, + "grad_norm": 5.990212917327881, + "learning_rate": 3.907430751710039e-06, + "loss": 0.9162, + "step": 54805 + }, + { + "epoch": 16.4, + "grad_norm": 5.119846820831299, + "learning_rate": 3.904277458744254e-06, + "loss": 1.0017, + "step": 54810 + }, + { + "epoch": 16.4, + "grad_norm": 2.6883440017700195, + "learning_rate": 3.901125330873298e-06, + "loss": 0.9282, + "step": 54815 + }, + { + "epoch": 16.4, + "grad_norm": 4.281408309936523, + "learning_rate": 3.897974368271256e-06, + "loss": 0.9491, + "step": 54820 + }, + { + "epoch": 16.4, + "grad_norm": 1.7802320718765259, + "learning_rate": 3.89482457111216e-06, + "loss": 1.0643, + "step": 54825 + }, + { + "epoch": 16.4, + "grad_norm": 1.374393343925476, + "learning_rate": 3.891675939569961e-06, + "loss": 1.0454, + "step": 54830 + }, + { + "epoch": 16.41, + "grad_norm": 1.9680904150009155, + "learning_rate": 3.888528473818562e-06, + "loss": 0.8646, + "step": 54835 + }, + { + "epoch": 16.41, + "grad_norm": 3.9535000324249268, + "learning_rate": 3.88538217403179e-06, + "loss": 0.914, + "step": 54840 + }, + { + "epoch": 16.41, + "grad_norm": 4.569047451019287, + "learning_rate": 3.882237040383413e-06, + "loss": 0.8554, + "step": 54845 + }, + { + "epoch": 16.41, + "grad_norm": 2.8582353591918945, + "learning_rate": 3.879093073047141e-06, + "loss": 1.1247, + "step": 54850 + }, + { + "epoch": 16.41, + "grad_norm": 3.682445764541626, + "learning_rate": 3.87595027219659e-06, + "loss": 0.9073, + "step": 54855 + }, + { + "epoch": 16.41, + "grad_norm": 2.520266056060791, + "learning_rate": 3.872808638005362e-06, + "loss": 0.8065, + "step": 54860 + }, + { + "epoch": 16.41, + "grad_norm": 4.994065284729004, + "learning_rate": 3.8696681706469425e-06, + "loss": 0.9488, + "step": 54865 + }, + { + "epoch": 16.42, + "grad_norm": 2.4194717407226562, + "learning_rate": 3.866528870294795e-06, + "loss": 0.8722, + "step": 54870 + }, + { + "epoch": 16.42, + "grad_norm": 3.143922805786133, + "learning_rate": 3.86339073712228e-06, + "loss": 1.0721, + "step": 54875 + }, + { + "epoch": 16.42, + "grad_norm": 3.6272706985473633, + "learning_rate": 3.860253771302736e-06, + "loss": 1.1331, + "step": 54880 + }, + { + "epoch": 16.42, + "grad_norm": 2.595782518386841, + "learning_rate": 3.857117973009397e-06, + "loss": 0.7396, + "step": 54885 + }, + { + "epoch": 16.42, + "grad_norm": 2.1412410736083984, + "learning_rate": 3.853983342415454e-06, + "loss": 0.8936, + "step": 54890 + }, + { + "epoch": 16.42, + "grad_norm": 2.1500375270843506, + "learning_rate": 3.850849879694032e-06, + "loss": 0.9859, + "step": 54895 + }, + { + "epoch": 16.43, + "grad_norm": 2.9890248775482178, + "learning_rate": 3.84771758501819e-06, + "loss": 1.1094, + "step": 54900 + }, + { + "epoch": 16.43, + "grad_norm": 3.5275800228118896, + "learning_rate": 3.844586458560917e-06, + "loss": 1.0972, + "step": 54905 + }, + { + "epoch": 16.43, + "grad_norm": 3.5730512142181396, + "learning_rate": 3.841456500495144e-06, + "loss": 0.9291, + "step": 54910 + }, + { + "epoch": 16.43, + "grad_norm": 2.170400619506836, + "learning_rate": 3.838327710993736e-06, + "loss": 0.9088, + "step": 54915 + }, + { + "epoch": 16.43, + "grad_norm": 3.1143369674682617, + "learning_rate": 3.835200090229493e-06, + "loss": 0.9509, + "step": 54920 + }, + { + "epoch": 16.43, + "grad_norm": 2.1064987182617188, + "learning_rate": 3.832073638375147e-06, + "loss": 1.1217, + "step": 54925 + }, + { + "epoch": 16.43, + "grad_norm": 1.3867061138153076, + "learning_rate": 3.828948355603374e-06, + "loss": 0.9994, + "step": 54930 + }, + { + "epoch": 16.44, + "grad_norm": 3.5733249187469482, + "learning_rate": 3.8258242420867755e-06, + "loss": 0.9443, + "step": 54935 + }, + { + "epoch": 16.44, + "grad_norm": 4.986148357391357, + "learning_rate": 3.8227012979978935e-06, + "loss": 1.0024, + "step": 54940 + }, + { + "epoch": 16.44, + "grad_norm": 3.2679810523986816, + "learning_rate": 3.819579523509214e-06, + "loss": 0.8478, + "step": 54945 + }, + { + "epoch": 16.44, + "grad_norm": 1.716213345527649, + "learning_rate": 3.816458918793131e-06, + "loss": 0.7901, + "step": 54950 + }, + { + "epoch": 16.44, + "grad_norm": 2.6446051597595215, + "learning_rate": 3.813339484022013e-06, + "loss": 0.9754, + "step": 54955 + }, + { + "epoch": 16.44, + "grad_norm": 3.2618935108184814, + "learning_rate": 3.810221219368121e-06, + "loss": 1.1121, + "step": 54960 + }, + { + "epoch": 16.44, + "grad_norm": 2.8837664127349854, + "learning_rate": 3.8071041250036964e-06, + "loss": 0.9444, + "step": 54965 + }, + { + "epoch": 16.45, + "grad_norm": 3.107837438583374, + "learning_rate": 3.803988201100872e-06, + "loss": 1.2149, + "step": 54970 + }, + { + "epoch": 16.45, + "grad_norm": 1.7038583755493164, + "learning_rate": 3.80087344783176e-06, + "loss": 0.9996, + "step": 54975 + }, + { + "epoch": 16.45, + "grad_norm": 4.2032365798950195, + "learning_rate": 3.797759865368364e-06, + "loss": 0.9182, + "step": 54980 + }, + { + "epoch": 16.45, + "grad_norm": 3.1628682613372803, + "learning_rate": 3.794647453882652e-06, + "loss": 0.9363, + "step": 54985 + }, + { + "epoch": 16.45, + "grad_norm": 2.4282500743865967, + "learning_rate": 3.7915362135465205e-06, + "loss": 1.1558, + "step": 54990 + }, + { + "epoch": 16.45, + "grad_norm": 3.435473918914795, + "learning_rate": 3.7884261445318007e-06, + "loss": 0.959, + "step": 54995 + }, + { + "epoch": 16.46, + "grad_norm": 3.692786455154419, + "learning_rate": 3.7853172470102533e-06, + "loss": 0.9526, + "step": 55000 + }, + { + "epoch": 16.46, + "grad_norm": 1.5530604124069214, + "learning_rate": 3.782209521153579e-06, + "loss": 0.9287, + "step": 55005 + }, + { + "epoch": 16.46, + "grad_norm": 2.799910306930542, + "learning_rate": 3.77910296713343e-06, + "loss": 1.0503, + "step": 55010 + }, + { + "epoch": 16.46, + "grad_norm": 1.5998826026916504, + "learning_rate": 3.7759975851213603e-06, + "loss": 0.8919, + "step": 55015 + }, + { + "epoch": 16.46, + "grad_norm": 1.571890115737915, + "learning_rate": 3.772893375288883e-06, + "loss": 0.9862, + "step": 55020 + }, + { + "epoch": 16.46, + "grad_norm": 4.059834003448486, + "learning_rate": 3.7697903378074412e-06, + "loss": 1.0683, + "step": 55025 + }, + { + "epoch": 16.46, + "grad_norm": 1.3430641889572144, + "learning_rate": 3.7666884728484094e-06, + "loss": 1.0935, + "step": 55030 + }, + { + "epoch": 16.47, + "grad_norm": 3.6715023517608643, + "learning_rate": 3.7635877805831026e-06, + "loss": 0.9086, + "step": 55035 + }, + { + "epoch": 16.47, + "grad_norm": 2.4204914569854736, + "learning_rate": 3.760488261182768e-06, + "loss": 1.0692, + "step": 55040 + }, + { + "epoch": 16.47, + "grad_norm": 6.810561180114746, + "learning_rate": 3.7573899148185932e-06, + "loss": 1.2002, + "step": 55045 + }, + { + "epoch": 16.47, + "grad_norm": 3.171234607696533, + "learning_rate": 3.754292741661694e-06, + "loss": 1.0991, + "step": 55050 + }, + { + "epoch": 16.47, + "grad_norm": 3.974795341491699, + "learning_rate": 3.7511967418831145e-06, + "loss": 1.072, + "step": 55055 + }, + { + "epoch": 16.47, + "grad_norm": 1.6691629886627197, + "learning_rate": 3.748101915653862e-06, + "loss": 0.9038, + "step": 55060 + }, + { + "epoch": 16.47, + "grad_norm": 2.60886287689209, + "learning_rate": 3.745008263144842e-06, + "loss": 0.9588, + "step": 55065 + }, + { + "epoch": 16.48, + "grad_norm": 1.6360840797424316, + "learning_rate": 3.741915784526931e-06, + "loss": 0.9272, + "step": 55070 + }, + { + "epoch": 16.48, + "grad_norm": 1.671827793121338, + "learning_rate": 3.7388244799709095e-06, + "loss": 1.0059, + "step": 55075 + }, + { + "epoch": 16.48, + "grad_norm": 1.6585466861724854, + "learning_rate": 3.735734349647507e-06, + "loss": 1.0757, + "step": 55080 + }, + { + "epoch": 16.48, + "grad_norm": 2.521653890609741, + "learning_rate": 3.7326453937274047e-06, + "loss": 1.0236, + "step": 55085 + }, + { + "epoch": 16.48, + "grad_norm": 2.377997636795044, + "learning_rate": 3.7295576123811795e-06, + "loss": 0.9756, + "step": 55090 + }, + { + "epoch": 16.48, + "grad_norm": 2.9460082054138184, + "learning_rate": 3.726471005779389e-06, + "loss": 1.1004, + "step": 55095 + }, + { + "epoch": 16.49, + "grad_norm": 3.480149507522583, + "learning_rate": 3.7233855740924805e-06, + "loss": 0.8099, + "step": 55100 + }, + { + "epoch": 16.49, + "grad_norm": 1.956493616104126, + "learning_rate": 3.7203013174908816e-06, + "loss": 0.8011, + "step": 55105 + }, + { + "epoch": 16.49, + "grad_norm": 3.8703737258911133, + "learning_rate": 3.7172182361449174e-06, + "loss": 1.013, + "step": 55110 + }, + { + "epoch": 16.49, + "grad_norm": 1.4395601749420166, + "learning_rate": 3.714136330224868e-06, + "loss": 0.9137, + "step": 55115 + }, + { + "epoch": 16.49, + "grad_norm": 0.9838583469390869, + "learning_rate": 3.711055599900945e-06, + "loss": 0.941, + "step": 55120 + }, + { + "epoch": 16.49, + "grad_norm": 2.752509355545044, + "learning_rate": 3.7079760453432895e-06, + "loss": 0.9234, + "step": 55125 + }, + { + "epoch": 16.49, + "grad_norm": 3.263653516769409, + "learning_rate": 3.7048976667219877e-06, + "loss": 0.974, + "step": 55130 + }, + { + "epoch": 16.5, + "grad_norm": 3.148515462875366, + "learning_rate": 3.701820464207054e-06, + "loss": 0.8743, + "step": 55135 + }, + { + "epoch": 16.5, + "grad_norm": 4.454446315765381, + "learning_rate": 3.6987444379684348e-06, + "loss": 1.0195, + "step": 55140 + }, + { + "epoch": 16.5, + "grad_norm": 3.798888921737671, + "learning_rate": 3.695669588176026e-06, + "loss": 0.9871, + "step": 55145 + }, + { + "epoch": 16.5, + "grad_norm": 1.551338791847229, + "learning_rate": 3.6925959149996274e-06, + "loss": 0.9701, + "step": 55150 + }, + { + "epoch": 16.5, + "grad_norm": 4.081212997436523, + "learning_rate": 3.689523418609017e-06, + "loss": 1.0122, + "step": 55155 + }, + { + "epoch": 16.5, + "grad_norm": 2.6100282669067383, + "learning_rate": 3.686452099173876e-06, + "loss": 0.8688, + "step": 55160 + }, + { + "epoch": 16.5, + "grad_norm": 3.0267417430877686, + "learning_rate": 3.683381956863832e-06, + "loss": 0.9878, + "step": 55165 + }, + { + "epoch": 16.51, + "grad_norm": 3.891383409500122, + "learning_rate": 3.680312991848445e-06, + "loss": 1.0131, + "step": 55170 + }, + { + "epoch": 16.51, + "grad_norm": 2.7012784481048584, + "learning_rate": 3.677245204297211e-06, + "loss": 1.1283, + "step": 55175 + }, + { + "epoch": 16.51, + "grad_norm": 1.8227554559707642, + "learning_rate": 3.674178594379565e-06, + "loss": 1.0563, + "step": 55180 + }, + { + "epoch": 16.51, + "grad_norm": 3.7222025394439697, + "learning_rate": 3.671113162264861e-06, + "loss": 0.9301, + "step": 55185 + }, + { + "epoch": 16.51, + "grad_norm": 2.2444727420806885, + "learning_rate": 3.668048908122415e-06, + "loss": 0.954, + "step": 55190 + }, + { + "epoch": 16.51, + "grad_norm": 3.4596662521362305, + "learning_rate": 3.6649858321214424e-06, + "loss": 0.9375, + "step": 55195 + }, + { + "epoch": 16.52, + "grad_norm": 4.218637466430664, + "learning_rate": 3.6619239344311383e-06, + "loss": 1.0127, + "step": 55200 + }, + { + "epoch": 16.52, + "grad_norm": 2.6548755168914795, + "learning_rate": 3.6588632152205897e-06, + "loss": 0.9714, + "step": 55205 + }, + { + "epoch": 16.52, + "grad_norm": 2.7811079025268555, + "learning_rate": 3.655803674658842e-06, + "loss": 0.8688, + "step": 55210 + }, + { + "epoch": 16.52, + "grad_norm": 1.8681191205978394, + "learning_rate": 3.6527453129148738e-06, + "loss": 1.0711, + "step": 55215 + }, + { + "epoch": 16.52, + "grad_norm": 1.8448801040649414, + "learning_rate": 3.6496881301575887e-06, + "loss": 0.9248, + "step": 55220 + }, + { + "epoch": 16.52, + "grad_norm": 3.9367079734802246, + "learning_rate": 3.6466321265558406e-06, + "loss": 0.994, + "step": 55225 + }, + { + "epoch": 16.52, + "grad_norm": 3.2302794456481934, + "learning_rate": 3.6435773022784004e-06, + "loss": 0.8619, + "step": 55230 + }, + { + "epoch": 16.53, + "grad_norm": 4.870388984680176, + "learning_rate": 3.640523657493991e-06, + "loss": 0.9849, + "step": 55235 + }, + { + "epoch": 16.53, + "grad_norm": 1.335530400276184, + "learning_rate": 3.637471192371256e-06, + "loss": 1.0656, + "step": 55240 + }, + { + "epoch": 16.53, + "grad_norm": 7.603557586669922, + "learning_rate": 3.6344199070787815e-06, + "loss": 1.1671, + "step": 55245 + }, + { + "epoch": 16.53, + "grad_norm": 2.9642961025238037, + "learning_rate": 3.6313698017850866e-06, + "loss": 1.0257, + "step": 55250 + }, + { + "epoch": 16.53, + "grad_norm": 2.462388277053833, + "learning_rate": 3.628320876658628e-06, + "loss": 0.8544, + "step": 55255 + }, + { + "epoch": 16.53, + "grad_norm": 1.824250340461731, + "learning_rate": 3.6252731318677902e-06, + "loss": 1.0885, + "step": 55260 + }, + { + "epoch": 16.53, + "grad_norm": 6.232416152954102, + "learning_rate": 3.6222265675809025e-06, + "loss": 0.9456, + "step": 55265 + }, + { + "epoch": 16.54, + "grad_norm": 3.9859626293182373, + "learning_rate": 3.6191811839662194e-06, + "loss": 0.7835, + "step": 55270 + }, + { + "epoch": 16.54, + "grad_norm": 3.1337802410125732, + "learning_rate": 3.616136981191939e-06, + "loss": 1.0781, + "step": 55275 + }, + { + "epoch": 16.54, + "grad_norm": 2.0644679069519043, + "learning_rate": 3.6130939594261783e-06, + "loss": 0.9863, + "step": 55280 + }, + { + "epoch": 16.54, + "grad_norm": 4.25192403793335, + "learning_rate": 3.610052118837015e-06, + "loss": 0.8807, + "step": 55285 + }, + { + "epoch": 16.54, + "grad_norm": 1.9776525497436523, + "learning_rate": 3.6070114595924327e-06, + "loss": 0.9195, + "step": 55290 + }, + { + "epoch": 16.54, + "grad_norm": 1.4639649391174316, + "learning_rate": 3.6039719818603797e-06, + "loss": 1.0648, + "step": 55295 + }, + { + "epoch": 16.55, + "grad_norm": 1.7178945541381836, + "learning_rate": 3.600933685808708e-06, + "loss": 0.921, + "step": 55300 + }, + { + "epoch": 16.55, + "grad_norm": 2.7617597579956055, + "learning_rate": 3.5978965716052248e-06, + "loss": 0.9255, + "step": 55305 + }, + { + "epoch": 16.55, + "grad_norm": 1.6579577922821045, + "learning_rate": 3.5948606394176687e-06, + "loss": 0.8896, + "step": 55310 + }, + { + "epoch": 16.55, + "grad_norm": 5.555087089538574, + "learning_rate": 3.5918258894137107e-06, + "loss": 0.7572, + "step": 55315 + }, + { + "epoch": 16.55, + "grad_norm": 3.7583398818969727, + "learning_rate": 3.5887923217609586e-06, + "loss": 0.8979, + "step": 55320 + }, + { + "epoch": 16.55, + "grad_norm": 2.396233081817627, + "learning_rate": 3.585759936626948e-06, + "loss": 0.9145, + "step": 55325 + }, + { + "epoch": 16.55, + "grad_norm": 2.1706974506378174, + "learning_rate": 3.5827287341791583e-06, + "loss": 0.9833, + "step": 55330 + }, + { + "epoch": 16.56, + "grad_norm": 21.9448184967041, + "learning_rate": 3.5796987145849974e-06, + "loss": 0.9713, + "step": 55335 + }, + { + "epoch": 16.56, + "grad_norm": 1.5266534090042114, + "learning_rate": 3.5766698780118125e-06, + "loss": 0.9322, + "step": 55340 + }, + { + "epoch": 16.56, + "grad_norm": 3.1922240257263184, + "learning_rate": 3.5736422246268804e-06, + "loss": 0.9115, + "step": 55345 + }, + { + "epoch": 16.56, + "grad_norm": 1.6911910772323608, + "learning_rate": 3.5706157545974174e-06, + "loss": 1.0724, + "step": 55350 + }, + { + "epoch": 16.56, + "grad_norm": 1.6051517724990845, + "learning_rate": 3.5675904680905704e-06, + "loss": 0.9671, + "step": 55355 + }, + { + "epoch": 16.56, + "grad_norm": 3.0182387828826904, + "learning_rate": 3.5645663652734277e-06, + "loss": 0.9052, + "step": 55360 + }, + { + "epoch": 16.56, + "grad_norm": 2.883270740509033, + "learning_rate": 3.561543446313001e-06, + "loss": 1.1312, + "step": 55365 + }, + { + "epoch": 16.57, + "grad_norm": 5.493057727813721, + "learning_rate": 3.5585217113762533e-06, + "loss": 0.889, + "step": 55370 + }, + { + "epoch": 16.57, + "grad_norm": 2.2365972995758057, + "learning_rate": 3.5555011606300507e-06, + "loss": 0.9428, + "step": 55375 + }, + { + "epoch": 16.57, + "grad_norm": 2.719250440597534, + "learning_rate": 3.5524817942412414e-06, + "loss": 0.9899, + "step": 55380 + }, + { + "epoch": 16.57, + "grad_norm": 1.566593050956726, + "learning_rate": 3.549463612376558e-06, + "loss": 0.8925, + "step": 55385 + }, + { + "epoch": 16.57, + "grad_norm": 3.2841618061065674, + "learning_rate": 3.5464466152027113e-06, + "loss": 1.0915, + "step": 55390 + }, + { + "epoch": 16.57, + "grad_norm": 3.8649485111236572, + "learning_rate": 3.5434308028863107e-06, + "loss": 0.9006, + "step": 55395 + }, + { + "epoch": 16.58, + "grad_norm": 3.333514928817749, + "learning_rate": 3.540416175593933e-06, + "loss": 1.0511, + "step": 55400 + }, + { + "epoch": 16.58, + "grad_norm": 3.3573429584503174, + "learning_rate": 3.5374027334920595e-06, + "loss": 1.0593, + "step": 55405 + }, + { + "epoch": 16.58, + "grad_norm": 3.672663450241089, + "learning_rate": 3.5343904767471235e-06, + "loss": 0.8411, + "step": 55410 + }, + { + "epoch": 16.58, + "grad_norm": 3.1080899238586426, + "learning_rate": 3.5313794055254894e-06, + "loss": 0.9778, + "step": 55415 + }, + { + "epoch": 16.58, + "grad_norm": 3.76132869720459, + "learning_rate": 3.5283695199934545e-06, + "loss": 1.039, + "step": 55420 + }, + { + "epoch": 16.58, + "grad_norm": 1.2220946550369263, + "learning_rate": 3.5253608203172527e-06, + "loss": 0.9653, + "step": 55425 + }, + { + "epoch": 16.58, + "grad_norm": 2.680553674697876, + "learning_rate": 3.5223533066630538e-06, + "loss": 0.9479, + "step": 55430 + }, + { + "epoch": 16.59, + "grad_norm": 2.850705862045288, + "learning_rate": 3.5193469791969523e-06, + "loss": 0.9687, + "step": 55435 + }, + { + "epoch": 16.59, + "grad_norm": 6.794698715209961, + "learning_rate": 3.516341838084994e-06, + "loss": 0.9051, + "step": 55440 + }, + { + "epoch": 16.59, + "grad_norm": 1.4583183526992798, + "learning_rate": 3.5133378834931424e-06, + "loss": 0.9477, + "step": 55445 + }, + { + "epoch": 16.59, + "grad_norm": 1.7493852376937866, + "learning_rate": 3.5103351155873044e-06, + "loss": 1.0363, + "step": 55450 + }, + { + "epoch": 16.59, + "grad_norm": 2.9901397228240967, + "learning_rate": 3.5073335345333247e-06, + "loss": 1.0936, + "step": 55455 + }, + { + "epoch": 16.59, + "grad_norm": 2.2844088077545166, + "learning_rate": 3.504333140496968e-06, + "loss": 0.9662, + "step": 55460 + }, + { + "epoch": 16.59, + "grad_norm": 1.7796694040298462, + "learning_rate": 3.5013339336439576e-06, + "loss": 0.9429, + "step": 55465 + }, + { + "epoch": 16.6, + "grad_norm": 1.1826192140579224, + "learning_rate": 3.4983359141399164e-06, + "loss": 1.0261, + "step": 55470 + }, + { + "epoch": 16.6, + "grad_norm": 2.0418243408203125, + "learning_rate": 3.4953390821504423e-06, + "loss": 0.9578, + "step": 55475 + }, + { + "epoch": 16.6, + "grad_norm": 6.461709022521973, + "learning_rate": 3.492343437841028e-06, + "loss": 0.9947, + "step": 55480 + }, + { + "epoch": 16.6, + "grad_norm": 3.240283489227295, + "learning_rate": 3.4893489813771413e-06, + "loss": 0.8213, + "step": 55485 + }, + { + "epoch": 16.6, + "grad_norm": 2.8579697608947754, + "learning_rate": 3.486355712924139e-06, + "loss": 0.912, + "step": 55490 + }, + { + "epoch": 16.6, + "grad_norm": 2.917512893676758, + "learning_rate": 3.48336363264736e-06, + "loss": 0.9478, + "step": 55495 + }, + { + "epoch": 16.6, + "grad_norm": 2.837340831756592, + "learning_rate": 3.4803727407120375e-06, + "loss": 1.032, + "step": 55500 + }, + { + "epoch": 16.61, + "grad_norm": 3.3056883811950684, + "learning_rate": 3.4773830372833576e-06, + "loss": 1.1411, + "step": 55505 + }, + { + "epoch": 16.61, + "grad_norm": 4.646909713745117, + "learning_rate": 3.474394522526442e-06, + "loss": 0.9487, + "step": 55510 + }, + { + "epoch": 16.61, + "grad_norm": 5.3782124519348145, + "learning_rate": 3.4714071966063436e-06, + "loss": 0.9774, + "step": 55515 + }, + { + "epoch": 16.61, + "grad_norm": 1.5361804962158203, + "learning_rate": 3.4684210596880486e-06, + "loss": 1.0604, + "step": 55520 + }, + { + "epoch": 16.61, + "grad_norm": 2.3509507179260254, + "learning_rate": 3.4654361119364764e-06, + "loss": 0.9274, + "step": 55525 + }, + { + "epoch": 16.61, + "grad_norm": 4.507537841796875, + "learning_rate": 3.4624523535164823e-06, + "loss": 0.8698, + "step": 55530 + }, + { + "epoch": 16.62, + "grad_norm": 2.660539388656616, + "learning_rate": 3.459469784592859e-06, + "loss": 0.9137, + "step": 55535 + }, + { + "epoch": 16.62, + "grad_norm": 5.054265022277832, + "learning_rate": 3.4564884053303298e-06, + "loss": 0.9331, + "step": 55540 + }, + { + "epoch": 16.62, + "grad_norm": 1.3174716234207153, + "learning_rate": 3.4535082158935527e-06, + "loss": 1.135, + "step": 55545 + }, + { + "epoch": 16.62, + "grad_norm": 2.0269131660461426, + "learning_rate": 3.450529216447121e-06, + "loss": 1.0076, + "step": 55550 + }, + { + "epoch": 16.62, + "grad_norm": 5.588700771331787, + "learning_rate": 3.447551407155561e-06, + "loss": 0.9909, + "step": 55555 + }, + { + "epoch": 16.62, + "grad_norm": 2.787294864654541, + "learning_rate": 3.4445747881833386e-06, + "loss": 1.0891, + "step": 55560 + }, + { + "epoch": 16.62, + "grad_norm": 2.2700955867767334, + "learning_rate": 3.441599359694836e-06, + "loss": 0.9488, + "step": 55565 + }, + { + "epoch": 16.63, + "grad_norm": 1.9791069030761719, + "learning_rate": 3.4386251218543996e-06, + "loss": 1.0003, + "step": 55570 + }, + { + "epoch": 16.63, + "grad_norm": 2.2911648750305176, + "learning_rate": 3.435652074826279e-06, + "loss": 1.0334, + "step": 55575 + }, + { + "epoch": 16.63, + "grad_norm": 5.44551420211792, + "learning_rate": 3.4326802187746864e-06, + "loss": 0.9583, + "step": 55580 + }, + { + "epoch": 16.63, + "grad_norm": 1.6152199506759644, + "learning_rate": 3.4297095538637388e-06, + "loss": 1.0084, + "step": 55585 + }, + { + "epoch": 16.63, + "grad_norm": 3.9968864917755127, + "learning_rate": 3.4267400802575233e-06, + "loss": 0.888, + "step": 55590 + }, + { + "epoch": 16.63, + "grad_norm": 1.834970474243164, + "learning_rate": 3.423771798120021e-06, + "loss": 0.8411, + "step": 55595 + }, + { + "epoch": 16.63, + "grad_norm": 1.1965527534484863, + "learning_rate": 3.4208047076151773e-06, + "loss": 0.9757, + "step": 55600 + }, + { + "epoch": 16.64, + "grad_norm": 2.324232578277588, + "learning_rate": 3.417838808906859e-06, + "loss": 0.9249, + "step": 55605 + }, + { + "epoch": 16.64, + "grad_norm": 1.6368885040283203, + "learning_rate": 3.4148741021588686e-06, + "loss": 1.0691, + "step": 55610 + }, + { + "epoch": 16.64, + "grad_norm": 2.840993881225586, + "learning_rate": 3.4119105875349444e-06, + "loss": 0.998, + "step": 55615 + }, + { + "epoch": 16.64, + "grad_norm": 4.377618789672852, + "learning_rate": 3.4089482651987607e-06, + "loss": 1.0532, + "step": 55620 + }, + { + "epoch": 16.64, + "grad_norm": 3.657803535461426, + "learning_rate": 3.4059871353139207e-06, + "loss": 0.8328, + "step": 55625 + }, + { + "epoch": 16.64, + "grad_norm": 2.706447124481201, + "learning_rate": 3.403027198043968e-06, + "loss": 0.8507, + "step": 55630 + }, + { + "epoch": 16.65, + "grad_norm": 2.529573917388916, + "learning_rate": 3.4000684535523714e-06, + "loss": 0.9334, + "step": 55635 + }, + { + "epoch": 16.65, + "grad_norm": 3.0248026847839355, + "learning_rate": 3.397110902002543e-06, + "loss": 0.9053, + "step": 55640 + }, + { + "epoch": 16.65, + "grad_norm": 1.9483550786972046, + "learning_rate": 3.394154543557826e-06, + "loss": 0.8746, + "step": 55645 + }, + { + "epoch": 16.65, + "grad_norm": 6.750313758850098, + "learning_rate": 3.3911993783814934e-06, + "loss": 0.9609, + "step": 55650 + }, + { + "epoch": 16.65, + "grad_norm": 4.349567890167236, + "learning_rate": 3.388245406636764e-06, + "loss": 0.9785, + "step": 55655 + }, + { + "epoch": 16.65, + "grad_norm": 4.089972019195557, + "learning_rate": 3.3852926284867665e-06, + "loss": 1.1758, + "step": 55660 + }, + { + "epoch": 16.65, + "grad_norm": 1.6726969480514526, + "learning_rate": 3.3823410440946e-06, + "loss": 0.8968, + "step": 55665 + }, + { + "epoch": 16.66, + "grad_norm": 12.856719970703125, + "learning_rate": 3.379390653623257e-06, + "loss": 0.9887, + "step": 55670 + }, + { + "epoch": 16.66, + "grad_norm": 4.076183795928955, + "learning_rate": 3.3764414572357046e-06, + "loss": 0.8843, + "step": 55675 + }, + { + "epoch": 16.66, + "grad_norm": 3.290557384490967, + "learning_rate": 3.3734934550948034e-06, + "loss": 0.9424, + "step": 55680 + }, + { + "epoch": 16.66, + "grad_norm": 1.8041436672210693, + "learning_rate": 3.37054664736339e-06, + "loss": 1.054, + "step": 55685 + }, + { + "epoch": 16.66, + "grad_norm": 2.115232467651367, + "learning_rate": 3.3676010342041988e-06, + "loss": 1.0038, + "step": 55690 + }, + { + "epoch": 16.66, + "grad_norm": 2.6543099880218506, + "learning_rate": 3.364656615779918e-06, + "loss": 0.9418, + "step": 55695 + }, + { + "epoch": 16.66, + "grad_norm": 1.230863332748413, + "learning_rate": 3.36171339225316e-06, + "loss": 0.9834, + "step": 55700 + }, + { + "epoch": 16.67, + "grad_norm": 4.0604047775268555, + "learning_rate": 3.358771363786481e-06, + "loss": 0.9245, + "step": 55705 + }, + { + "epoch": 16.67, + "grad_norm": 15.559052467346191, + "learning_rate": 3.355830530542367e-06, + "loss": 0.9499, + "step": 55710 + }, + { + "epoch": 16.67, + "grad_norm": 7.960972309112549, + "learning_rate": 3.3528908926832304e-06, + "loss": 0.9759, + "step": 55715 + }, + { + "epoch": 16.67, + "grad_norm": 3.3675858974456787, + "learning_rate": 3.3499524503714326e-06, + "loss": 0.9966, + "step": 55720 + }, + { + "epoch": 16.67, + "grad_norm": 2.5648207664489746, + "learning_rate": 3.3470152037692547e-06, + "loss": 1.1776, + "step": 55725 + }, + { + "epoch": 16.67, + "grad_norm": 3.301900625228882, + "learning_rate": 3.3440791530389232e-06, + "loss": 1.0791, + "step": 55730 + }, + { + "epoch": 16.68, + "grad_norm": 1.4749406576156616, + "learning_rate": 3.3411442983425885e-06, + "loss": 1.0181, + "step": 55735 + }, + { + "epoch": 16.68, + "grad_norm": 1.6755149364471436, + "learning_rate": 3.33821063984234e-06, + "loss": 1.0547, + "step": 55740 + }, + { + "epoch": 16.68, + "grad_norm": 3.6287951469421387, + "learning_rate": 3.335278177700202e-06, + "loss": 0.9052, + "step": 55745 + }, + { + "epoch": 16.68, + "grad_norm": 2.7663354873657227, + "learning_rate": 3.3323469120781326e-06, + "loss": 0.8988, + "step": 55750 + }, + { + "epoch": 16.68, + "grad_norm": 2.6993236541748047, + "learning_rate": 3.329416843138017e-06, + "loss": 0.983, + "step": 55755 + }, + { + "epoch": 16.68, + "grad_norm": 6.778196811676025, + "learning_rate": 3.3264879710416917e-06, + "loss": 1.0198, + "step": 55760 + }, + { + "epoch": 16.68, + "grad_norm": 1.4923733472824097, + "learning_rate": 3.3235602959508977e-06, + "loss": 0.9077, + "step": 55765 + }, + { + "epoch": 16.69, + "grad_norm": 2.7575523853302, + "learning_rate": 3.3206338180273465e-06, + "loss": 0.8404, + "step": 55770 + }, + { + "epoch": 16.69, + "grad_norm": 4.008522033691406, + "learning_rate": 3.3177085374326453e-06, + "loss": 0.7967, + "step": 55775 + }, + { + "epoch": 16.69, + "grad_norm": 3.0780680179595947, + "learning_rate": 3.314784454328376e-06, + "loss": 0.9908, + "step": 55780 + }, + { + "epoch": 16.69, + "grad_norm": 3.2288169860839844, + "learning_rate": 3.3118615688760124e-06, + "loss": 0.9111, + "step": 55785 + }, + { + "epoch": 16.69, + "grad_norm": 2.5853915214538574, + "learning_rate": 3.3089398812369942e-06, + "loss": 1.0616, + "step": 55790 + }, + { + "epoch": 16.69, + "grad_norm": 2.06427264213562, + "learning_rate": 3.3060193915726763e-06, + "loss": 0.997, + "step": 55795 + }, + { + "epoch": 16.69, + "grad_norm": 9.891775131225586, + "learning_rate": 3.3031001000443597e-06, + "loss": 1.0209, + "step": 55800 + }, + { + "epoch": 16.7, + "grad_norm": 2.132046937942505, + "learning_rate": 3.300182006813271e-06, + "loss": 0.9908, + "step": 55805 + }, + { + "epoch": 16.7, + "grad_norm": 2.466611862182617, + "learning_rate": 3.297265112040568e-06, + "loss": 1.0584, + "step": 55810 + }, + { + "epoch": 16.7, + "grad_norm": 1.6372896432876587, + "learning_rate": 3.2943494158873684e-06, + "loss": 0.8243, + "step": 55815 + }, + { + "epoch": 16.7, + "grad_norm": 2.080674886703491, + "learning_rate": 3.29143491851468e-06, + "loss": 0.9725, + "step": 55820 + }, + { + "epoch": 16.7, + "grad_norm": 2.6015894412994385, + "learning_rate": 3.288521620083479e-06, + "loss": 1.0789, + "step": 55825 + }, + { + "epoch": 16.7, + "grad_norm": 2.357999086380005, + "learning_rate": 3.2856095207546595e-06, + "loss": 0.9111, + "step": 55830 + }, + { + "epoch": 16.71, + "grad_norm": 2.4748330116271973, + "learning_rate": 3.2826986206890537e-06, + "loss": 1.0326, + "step": 55835 + }, + { + "epoch": 16.71, + "grad_norm": 3.481816291809082, + "learning_rate": 3.279788920047433e-06, + "loss": 0.9831, + "step": 55840 + }, + { + "epoch": 16.71, + "grad_norm": 2.6234841346740723, + "learning_rate": 3.2768804189904913e-06, + "loss": 1.0191, + "step": 55845 + }, + { + "epoch": 16.71, + "grad_norm": 1.8331263065338135, + "learning_rate": 3.2739731176788636e-06, + "loss": 0.9873, + "step": 55850 + }, + { + "epoch": 16.71, + "grad_norm": 1.4677926301956177, + "learning_rate": 3.271067016273124e-06, + "loss": 0.9661, + "step": 55855 + }, + { + "epoch": 16.71, + "grad_norm": 2.3680524826049805, + "learning_rate": 3.2681621149337538e-06, + "loss": 0.862, + "step": 55860 + }, + { + "epoch": 16.71, + "grad_norm": 1.7516663074493408, + "learning_rate": 3.265258413821215e-06, + "loss": 0.9544, + "step": 55865 + }, + { + "epoch": 16.72, + "grad_norm": 1.8315908908843994, + "learning_rate": 3.262355913095852e-06, + "loss": 0.9942, + "step": 55870 + }, + { + "epoch": 16.72, + "grad_norm": 2.081782341003418, + "learning_rate": 3.259454612917984e-06, + "loss": 1.2045, + "step": 55875 + }, + { + "epoch": 16.72, + "grad_norm": 2.6945362091064453, + "learning_rate": 3.256554513447832e-06, + "loss": 1.0241, + "step": 55880 + }, + { + "epoch": 16.72, + "grad_norm": 3.111546277999878, + "learning_rate": 3.253655614845583e-06, + "loss": 0.9431, + "step": 55885 + }, + { + "epoch": 16.72, + "grad_norm": 3.409693717956543, + "learning_rate": 3.250757917271324e-06, + "loss": 1.1223, + "step": 55890 + }, + { + "epoch": 16.72, + "grad_norm": 3.733633279800415, + "learning_rate": 3.2478614208850928e-06, + "loss": 1.0297, + "step": 55895 + }, + { + "epoch": 16.72, + "grad_norm": 2.1389636993408203, + "learning_rate": 3.2449661258468768e-06, + "loss": 0.9982, + "step": 55900 + }, + { + "epoch": 16.73, + "grad_norm": 2.501753568649292, + "learning_rate": 3.2420720323165566e-06, + "loss": 0.9889, + "step": 55905 + }, + { + "epoch": 16.73, + "grad_norm": 1.5202560424804688, + "learning_rate": 3.239179140453996e-06, + "loss": 0.8041, + "step": 55910 + }, + { + "epoch": 16.73, + "grad_norm": 4.239963054656982, + "learning_rate": 3.2362874504189428e-06, + "loss": 0.9234, + "step": 55915 + }, + { + "epoch": 16.73, + "grad_norm": 3.403322458267212, + "learning_rate": 3.2333969623711157e-06, + "loss": 0.935, + "step": 55920 + }, + { + "epoch": 16.73, + "grad_norm": 1.6759166717529297, + "learning_rate": 3.2305076764701458e-06, + "loss": 0.895, + "step": 55925 + }, + { + "epoch": 16.73, + "grad_norm": 3.4392189979553223, + "learning_rate": 3.2276195928756107e-06, + "loss": 0.9788, + "step": 55930 + }, + { + "epoch": 16.74, + "grad_norm": 2.1375179290771484, + "learning_rate": 3.224732711747014e-06, + "loss": 0.8672, + "step": 55935 + }, + { + "epoch": 16.74, + "grad_norm": 1.1759334802627563, + "learning_rate": 3.221847033243794e-06, + "loss": 1.0013, + "step": 55940 + }, + { + "epoch": 16.74, + "grad_norm": 2.3137335777282715, + "learning_rate": 3.2189625575253265e-06, + "loss": 0.8176, + "step": 55945 + }, + { + "epoch": 16.74, + "grad_norm": 1.971378207206726, + "learning_rate": 3.2160792847509225e-06, + "loss": 0.8394, + "step": 55950 + }, + { + "epoch": 16.74, + "grad_norm": 1.7172585725784302, + "learning_rate": 3.213197215079805e-06, + "loss": 0.9224, + "step": 55955 + }, + { + "epoch": 16.74, + "grad_norm": 6.083642482757568, + "learning_rate": 3.210316348671169e-06, + "loss": 1.0735, + "step": 55960 + }, + { + "epoch": 16.74, + "grad_norm": 1.318853497505188, + "learning_rate": 3.2074366856841036e-06, + "loss": 1.0313, + "step": 55965 + }, + { + "epoch": 16.75, + "grad_norm": 2.593428134918213, + "learning_rate": 3.2045582262776626e-06, + "loss": 0.9487, + "step": 55970 + }, + { + "epoch": 16.75, + "grad_norm": 3.650310516357422, + "learning_rate": 3.2016809706108153e-06, + "loss": 0.9554, + "step": 55975 + }, + { + "epoch": 16.75, + "grad_norm": 1.8704103231430054, + "learning_rate": 3.1988049188424683e-06, + "loss": 1.0365, + "step": 55980 + }, + { + "epoch": 16.75, + "grad_norm": 1.763230562210083, + "learning_rate": 3.1959300711314727e-06, + "loss": 1.0271, + "step": 55985 + }, + { + "epoch": 16.75, + "grad_norm": 2.3225176334381104, + "learning_rate": 3.1930564276365842e-06, + "loss": 0.9846, + "step": 55990 + }, + { + "epoch": 16.75, + "grad_norm": 3.4815707206726074, + "learning_rate": 3.190183988516532e-06, + "loss": 0.9911, + "step": 55995 + }, + { + "epoch": 16.75, + "grad_norm": 2.6258528232574463, + "learning_rate": 3.187312753929936e-06, + "loss": 1.1604, + "step": 56000 + }, + { + "epoch": 16.76, + "grad_norm": 1.8864493370056152, + "learning_rate": 3.184442724035397e-06, + "loss": 1.023, + "step": 56005 + }, + { + "epoch": 16.76, + "grad_norm": 3.0884039402008057, + "learning_rate": 3.1815738989914022e-06, + "loss": 0.9337, + "step": 56010 + }, + { + "epoch": 16.76, + "grad_norm": 2.619140625, + "learning_rate": 3.178706278956403e-06, + "loss": 0.9569, + "step": 56015 + }, + { + "epoch": 16.76, + "grad_norm": 2.4161531925201416, + "learning_rate": 3.1758398640887753e-06, + "loss": 1.0311, + "step": 56020 + }, + { + "epoch": 16.76, + "grad_norm": 2.404909133911133, + "learning_rate": 3.172974654546826e-06, + "loss": 0.8572, + "step": 56025 + }, + { + "epoch": 16.76, + "grad_norm": 3.482473850250244, + "learning_rate": 3.1701106504888e-06, + "loss": 0.9448, + "step": 56030 + }, + { + "epoch": 16.77, + "grad_norm": 1.7155938148498535, + "learning_rate": 3.167247852072869e-06, + "loss": 0.8405, + "step": 56035 + }, + { + "epoch": 16.77, + "grad_norm": 5.21242094039917, + "learning_rate": 3.164386259457147e-06, + "loss": 0.999, + "step": 56040 + }, + { + "epoch": 16.77, + "grad_norm": 0.9593510627746582, + "learning_rate": 3.1615258727996812e-06, + "loss": 0.9095, + "step": 56045 + }, + { + "epoch": 16.77, + "grad_norm": 3.08491849899292, + "learning_rate": 3.15866669225843e-06, + "loss": 0.9706, + "step": 56050 + }, + { + "epoch": 16.77, + "grad_norm": 4.684256076812744, + "learning_rate": 3.1558087179913183e-06, + "loss": 0.9068, + "step": 56055 + }, + { + "epoch": 16.77, + "grad_norm": 1.6469522714614868, + "learning_rate": 3.1529519501561883e-06, + "loss": 0.8806, + "step": 56060 + }, + { + "epoch": 16.77, + "grad_norm": 2.585136651992798, + "learning_rate": 3.1500963889108113e-06, + "loss": 0.986, + "step": 56065 + }, + { + "epoch": 16.78, + "grad_norm": 1.8152649402618408, + "learning_rate": 3.1472420344129005e-06, + "loss": 0.9496, + "step": 56070 + }, + { + "epoch": 16.78, + "grad_norm": 4.420323371887207, + "learning_rate": 3.144388886820096e-06, + "loss": 1.0925, + "step": 56075 + }, + { + "epoch": 16.78, + "grad_norm": 1.4763715267181396, + "learning_rate": 3.1415369462899803e-06, + "loss": 1.0199, + "step": 56080 + }, + { + "epoch": 16.78, + "grad_norm": 5.959677219390869, + "learning_rate": 3.1386862129800494e-06, + "loss": 0.9485, + "step": 56085 + }, + { + "epoch": 16.78, + "grad_norm": 3.459981918334961, + "learning_rate": 3.135836687047766e-06, + "loss": 0.8902, + "step": 56090 + }, + { + "epoch": 16.78, + "grad_norm": 3.747342348098755, + "learning_rate": 3.132988368650486e-06, + "loss": 0.9345, + "step": 56095 + }, + { + "epoch": 16.78, + "grad_norm": 2.7175967693328857, + "learning_rate": 3.1301412579455373e-06, + "loss": 0.9902, + "step": 56100 + }, + { + "epoch": 16.79, + "grad_norm": 2.1608591079711914, + "learning_rate": 3.127295355090146e-06, + "loss": 0.9676, + "step": 56105 + }, + { + "epoch": 16.79, + "grad_norm": 4.2502336502075195, + "learning_rate": 3.124450660241507e-06, + "loss": 1.024, + "step": 56110 + }, + { + "epoch": 16.79, + "grad_norm": 3.1099443435668945, + "learning_rate": 3.121607173556712e-06, + "loss": 0.8469, + "step": 56115 + }, + { + "epoch": 16.79, + "grad_norm": 2.466075897216797, + "learning_rate": 3.1187648951928134e-06, + "loss": 0.8405, + "step": 56120 + }, + { + "epoch": 16.79, + "grad_norm": 1.5878918170928955, + "learning_rate": 3.115923825306785e-06, + "loss": 1.0979, + "step": 56125 + }, + { + "epoch": 16.79, + "grad_norm": 6.672008037567139, + "learning_rate": 3.113083964055538e-06, + "loss": 1.0864, + "step": 56130 + }, + { + "epoch": 16.79, + "grad_norm": 3.1133804321289062, + "learning_rate": 3.1102453115959106e-06, + "loss": 0.8562, + "step": 56135 + }, + { + "epoch": 16.8, + "grad_norm": 2.506171226501465, + "learning_rate": 3.1074078680846858e-06, + "loss": 0.8097, + "step": 56140 + }, + { + "epoch": 16.8, + "grad_norm": 3.1147594451904297, + "learning_rate": 3.104571633678563e-06, + "loss": 0.8618, + "step": 56145 + }, + { + "epoch": 16.8, + "grad_norm": 2.9119558334350586, + "learning_rate": 3.101736608534195e-06, + "loss": 0.9775, + "step": 56150 + }, + { + "epoch": 16.8, + "grad_norm": 2.8269081115722656, + "learning_rate": 3.098902792808148e-06, + "loss": 0.8513, + "step": 56155 + }, + { + "epoch": 16.8, + "grad_norm": 3.424445867538452, + "learning_rate": 3.096070186656938e-06, + "loss": 0.9278, + "step": 56160 + }, + { + "epoch": 16.8, + "grad_norm": 2.689589500427246, + "learning_rate": 3.0932387902370017e-06, + "loss": 0.9281, + "step": 56165 + }, + { + "epoch": 16.81, + "grad_norm": 2.5684525966644287, + "learning_rate": 3.090408603704717e-06, + "loss": 1.0208, + "step": 56170 + }, + { + "epoch": 16.81, + "grad_norm": 1.9679588079452515, + "learning_rate": 3.0875796272163972e-06, + "loss": 0.9015, + "step": 56175 + }, + { + "epoch": 16.81, + "grad_norm": 1.7874964475631714, + "learning_rate": 3.084751860928264e-06, + "loss": 1.0641, + "step": 56180 + }, + { + "epoch": 16.81, + "grad_norm": 3.177513599395752, + "learning_rate": 3.081925304996522e-06, + "loss": 1.0432, + "step": 56185 + }, + { + "epoch": 16.81, + "grad_norm": 3.6404199600219727, + "learning_rate": 3.0790999595772473e-06, + "loss": 0.9549, + "step": 56190 + }, + { + "epoch": 16.81, + "grad_norm": 11.20604419708252, + "learning_rate": 3.0762758248265107e-06, + "loss": 1.0167, + "step": 56195 + }, + { + "epoch": 16.81, + "grad_norm": 2.8811817169189453, + "learning_rate": 3.073452900900259e-06, + "loss": 0.8696, + "step": 56200 + }, + { + "epoch": 16.82, + "grad_norm": 3.1575052738189697, + "learning_rate": 3.070631187954423e-06, + "loss": 0.7686, + "step": 56205 + }, + { + "epoch": 16.82, + "grad_norm": 1.944756269454956, + "learning_rate": 3.067810686144826e-06, + "loss": 1.0258, + "step": 56210 + }, + { + "epoch": 16.82, + "grad_norm": 3.3357532024383545, + "learning_rate": 3.064991395627248e-06, + "loss": 0.8491, + "step": 56215 + }, + { + "epoch": 16.82, + "grad_norm": 4.347214221954346, + "learning_rate": 3.062173316557396e-06, + "loss": 0.9459, + "step": 56220 + }, + { + "epoch": 16.82, + "grad_norm": 2.0537803173065186, + "learning_rate": 3.0593564490909084e-06, + "loss": 0.9565, + "step": 56225 + }, + { + "epoch": 16.82, + "grad_norm": 2.428102493286133, + "learning_rate": 3.0565407933833586e-06, + "loss": 0.7407, + "step": 56230 + }, + { + "epoch": 16.82, + "grad_norm": 4.625239372253418, + "learning_rate": 3.0537263495902522e-06, + "loss": 0.9934, + "step": 56235 + }, + { + "epoch": 16.83, + "grad_norm": 1.8983107805252075, + "learning_rate": 3.050913117867027e-06, + "loss": 1.0148, + "step": 56240 + }, + { + "epoch": 16.83, + "grad_norm": 1.6342809200286865, + "learning_rate": 3.048101098369055e-06, + "loss": 1.0129, + "step": 56245 + }, + { + "epoch": 16.83, + "grad_norm": 1.8513437509536743, + "learning_rate": 3.0452902912516403e-06, + "loss": 0.855, + "step": 56250 + }, + { + "epoch": 16.83, + "grad_norm": 3.056840658187866, + "learning_rate": 3.0424806966700246e-06, + "loss": 0.9762, + "step": 56255 + }, + { + "epoch": 16.83, + "grad_norm": 9.707741737365723, + "learning_rate": 3.0396723147793737e-06, + "loss": 0.9429, + "step": 56260 + }, + { + "epoch": 16.83, + "grad_norm": 8.954275131225586, + "learning_rate": 3.036865145734796e-06, + "loss": 1.0287, + "step": 56265 + }, + { + "epoch": 16.84, + "grad_norm": 2.4749701023101807, + "learning_rate": 3.034059189691335e-06, + "loss": 1.0522, + "step": 56270 + }, + { + "epoch": 16.84, + "grad_norm": 3.8181324005126953, + "learning_rate": 3.0312544468039383e-06, + "loss": 0.9914, + "step": 56275 + }, + { + "epoch": 16.84, + "grad_norm": 6.021519660949707, + "learning_rate": 3.028450917227535e-06, + "loss": 0.7978, + "step": 56280 + }, + { + "epoch": 16.84, + "grad_norm": 2.9637115001678467, + "learning_rate": 3.02564860111694e-06, + "loss": 1.0615, + "step": 56285 + }, + { + "epoch": 16.84, + "grad_norm": 4.300103187561035, + "learning_rate": 3.0228474986269406e-06, + "loss": 0.7859, + "step": 56290 + }, + { + "epoch": 16.84, + "grad_norm": 1.1417111158370972, + "learning_rate": 3.0200476099122186e-06, + "loss": 1.0462, + "step": 56295 + }, + { + "epoch": 16.84, + "grad_norm": 1.8585431575775146, + "learning_rate": 3.0172489351274313e-06, + "loss": 1.1015, + "step": 56300 + }, + { + "epoch": 16.85, + "grad_norm": 1.4056024551391602, + "learning_rate": 3.0144514744271325e-06, + "loss": 1.066, + "step": 56305 + }, + { + "epoch": 16.85, + "grad_norm": 2.6178314685821533, + "learning_rate": 3.011655227965823e-06, + "loss": 1.0539, + "step": 56310 + }, + { + "epoch": 16.85, + "grad_norm": 4.641908645629883, + "learning_rate": 3.0088601958979416e-06, + "loss": 0.7809, + "step": 56315 + }, + { + "epoch": 16.85, + "grad_norm": 8.58937931060791, + "learning_rate": 3.006066378377853e-06, + "loss": 1.0699, + "step": 56320 + }, + { + "epoch": 16.85, + "grad_norm": 1.9301339387893677, + "learning_rate": 3.003273775559856e-06, + "loss": 0.9594, + "step": 56325 + }, + { + "epoch": 16.85, + "grad_norm": 3.1416869163513184, + "learning_rate": 3.0004823875981857e-06, + "loss": 0.8427, + "step": 56330 + }, + { + "epoch": 16.85, + "grad_norm": 1.7465052604675293, + "learning_rate": 2.997692214647005e-06, + "loss": 1.0006, + "step": 56335 + }, + { + "epoch": 16.86, + "grad_norm": 1.5967586040496826, + "learning_rate": 2.9949032568604125e-06, + "loss": 1.0248, + "step": 56340 + }, + { + "epoch": 16.86, + "grad_norm": 2.2583372592926025, + "learning_rate": 2.992115514392441e-06, + "loss": 1.0231, + "step": 56345 + }, + { + "epoch": 16.86, + "grad_norm": 4.796255111694336, + "learning_rate": 2.9893289873970528e-06, + "loss": 0.9619, + "step": 56350 + }, + { + "epoch": 16.86, + "grad_norm": 6.1636152267456055, + "learning_rate": 2.9865436760281445e-06, + "loss": 0.9783, + "step": 56355 + }, + { + "epoch": 16.86, + "grad_norm": 2.931490421295166, + "learning_rate": 2.983759580439549e-06, + "loss": 0.9008, + "step": 56360 + }, + { + "epoch": 16.86, + "grad_norm": 2.367924213409424, + "learning_rate": 2.980976700785035e-06, + "loss": 1.0938, + "step": 56365 + }, + { + "epoch": 16.87, + "grad_norm": 1.7075245380401611, + "learning_rate": 2.978195037218276e-06, + "loss": 1.027, + "step": 56370 + }, + { + "epoch": 16.87, + "grad_norm": 2.3319573402404785, + "learning_rate": 2.9754145898929275e-06, + "loss": 0.9178, + "step": 56375 + }, + { + "epoch": 16.87, + "grad_norm": 2.598543643951416, + "learning_rate": 2.972635358962525e-06, + "loss": 0.8888, + "step": 56380 + }, + { + "epoch": 16.87, + "grad_norm": 1.452913522720337, + "learning_rate": 2.9698573445805873e-06, + "loss": 1.0465, + "step": 56385 + }, + { + "epoch": 16.87, + "grad_norm": 6.2622833251953125, + "learning_rate": 2.967080546900519e-06, + "loss": 0.9591, + "step": 56390 + }, + { + "epoch": 16.87, + "grad_norm": 2.436224937438965, + "learning_rate": 2.964304966075701e-06, + "loss": 0.935, + "step": 56395 + }, + { + "epoch": 16.87, + "grad_norm": 2.5119998455047607, + "learning_rate": 2.9615306022594104e-06, + "loss": 0.9041, + "step": 56400 + }, + { + "epoch": 16.88, + "grad_norm": 2.9698565006256104, + "learning_rate": 2.958757455604874e-06, + "loss": 1.0233, + "step": 56405 + }, + { + "epoch": 16.88, + "grad_norm": 1.585296630859375, + "learning_rate": 2.9559855262652564e-06, + "loss": 1.0708, + "step": 56410 + }, + { + "epoch": 16.88, + "grad_norm": 2.456385850906372, + "learning_rate": 2.9532148143936426e-06, + "loss": 0.8779, + "step": 56415 + }, + { + "epoch": 16.88, + "grad_norm": 2.774512767791748, + "learning_rate": 2.950445320143058e-06, + "loss": 1.0221, + "step": 56420 + }, + { + "epoch": 16.88, + "grad_norm": 1.0109182596206665, + "learning_rate": 2.9476770436664582e-06, + "loss": 0.9762, + "step": 56425 + }, + { + "epoch": 16.88, + "grad_norm": 1.9075700044631958, + "learning_rate": 2.9449099851167365e-06, + "loss": 0.8601, + "step": 56430 + }, + { + "epoch": 16.88, + "grad_norm": 1.4174439907073975, + "learning_rate": 2.9421441446467078e-06, + "loss": 1.0847, + "step": 56435 + }, + { + "epoch": 16.89, + "grad_norm": 2.0944745540618896, + "learning_rate": 2.9393795224091332e-06, + "loss": 0.7858, + "step": 56440 + }, + { + "epoch": 16.89, + "grad_norm": 3.1148488521575928, + "learning_rate": 2.9366161185566952e-06, + "loss": 1.08, + "step": 56445 + }, + { + "epoch": 16.89, + "grad_norm": 2.154017448425293, + "learning_rate": 2.9338539332420143e-06, + "loss": 1.025, + "step": 56450 + }, + { + "epoch": 16.89, + "grad_norm": 1.8690617084503174, + "learning_rate": 2.931092966617646e-06, + "loss": 1.0529, + "step": 56455 + }, + { + "epoch": 16.89, + "grad_norm": 1.549531102180481, + "learning_rate": 2.928333218836074e-06, + "loss": 0.8698, + "step": 56460 + }, + { + "epoch": 16.89, + "grad_norm": 4.329528331756592, + "learning_rate": 2.9255746900497148e-06, + "loss": 0.8416, + "step": 56465 + }, + { + "epoch": 16.9, + "grad_norm": 4.210263729095459, + "learning_rate": 2.9228173804109276e-06, + "loss": 1.1474, + "step": 56470 + }, + { + "epoch": 16.9, + "grad_norm": 2.099457263946533, + "learning_rate": 2.9200612900719764e-06, + "loss": 1.1059, + "step": 56475 + }, + { + "epoch": 16.9, + "grad_norm": 4.597415924072266, + "learning_rate": 2.917306419185098e-06, + "loss": 0.8395, + "step": 56480 + }, + { + "epoch": 16.9, + "grad_norm": 2.6783969402313232, + "learning_rate": 2.914552767902426e-06, + "loss": 1.0578, + "step": 56485 + }, + { + "epoch": 16.9, + "grad_norm": 1.4600439071655273, + "learning_rate": 2.9118003363760553e-06, + "loss": 0.8978, + "step": 56490 + }, + { + "epoch": 16.9, + "grad_norm": 3.4358808994293213, + "learning_rate": 2.90904912475799e-06, + "loss": 0.9432, + "step": 56495 + }, + { + "epoch": 16.9, + "grad_norm": 2.708552360534668, + "learning_rate": 2.906299133200177e-06, + "loss": 0.9166, + "step": 56500 + }, + { + "epoch": 16.91, + "grad_norm": 2.577993869781494, + "learning_rate": 2.9035503618544986e-06, + "loss": 1.0563, + "step": 56505 + }, + { + "epoch": 16.91, + "grad_norm": 2.4056396484375, + "learning_rate": 2.900802810872766e-06, + "loss": 1.03, + "step": 56510 + }, + { + "epoch": 16.91, + "grad_norm": 4.068645000457764, + "learning_rate": 2.898056480406722e-06, + "loss": 0.8931, + "step": 56515 + }, + { + "epoch": 16.91, + "grad_norm": 2.8915958404541016, + "learning_rate": 2.8953113706080477e-06, + "loss": 1.1591, + "step": 56520 + }, + { + "epoch": 16.91, + "grad_norm": 1.79038667678833, + "learning_rate": 2.892567481628347e-06, + "loss": 1.0253, + "step": 56525 + }, + { + "epoch": 16.91, + "grad_norm": 2.4151062965393066, + "learning_rate": 2.889824813619166e-06, + "loss": 0.8595, + "step": 56530 + }, + { + "epoch": 16.91, + "grad_norm": 1.7418419122695923, + "learning_rate": 2.8870833667319795e-06, + "loss": 0.8801, + "step": 56535 + }, + { + "epoch": 16.92, + "grad_norm": 4.192162036895752, + "learning_rate": 2.8843431411181926e-06, + "loss": 1.1202, + "step": 56540 + }, + { + "epoch": 16.92, + "grad_norm": 4.1534881591796875, + "learning_rate": 2.8816041369291446e-06, + "loss": 0.8828, + "step": 56545 + }, + { + "epoch": 16.92, + "grad_norm": 2.742586374282837, + "learning_rate": 2.878866354316112e-06, + "loss": 0.9694, + "step": 56550 + }, + { + "epoch": 16.92, + "grad_norm": 3.240999460220337, + "learning_rate": 2.8761297934302934e-06, + "loss": 0.8857, + "step": 56555 + }, + { + "epoch": 16.92, + "grad_norm": 4.671759128570557, + "learning_rate": 2.873394454422834e-06, + "loss": 0.9096, + "step": 56560 + }, + { + "epoch": 16.92, + "grad_norm": 2.9793546199798584, + "learning_rate": 2.8706603374448027e-06, + "loss": 1.1287, + "step": 56565 + }, + { + "epoch": 16.93, + "grad_norm": 1.7608052492141724, + "learning_rate": 2.8679274426471864e-06, + "loss": 0.9546, + "step": 56570 + }, + { + "epoch": 16.93, + "grad_norm": 3.5491206645965576, + "learning_rate": 2.865195770180945e-06, + "loss": 0.9436, + "step": 56575 + }, + { + "epoch": 16.93, + "grad_norm": 4.213696002960205, + "learning_rate": 2.8624653201969247e-06, + "loss": 0.8552, + "step": 56580 + }, + { + "epoch": 16.93, + "grad_norm": 2.7237231731414795, + "learning_rate": 2.859736092845941e-06, + "loss": 1.0671, + "step": 56585 + }, + { + "epoch": 16.93, + "grad_norm": 2.797983407974243, + "learning_rate": 2.8570080882787175e-06, + "loss": 1.0807, + "step": 56590 + }, + { + "epoch": 16.93, + "grad_norm": 4.629161834716797, + "learning_rate": 2.8542813066459173e-06, + "loss": 0.9689, + "step": 56595 + }, + { + "epoch": 16.93, + "grad_norm": 4.058006763458252, + "learning_rate": 2.8515557480981448e-06, + "loss": 1.0349, + "step": 56600 + }, + { + "epoch": 16.94, + "grad_norm": 2.309382677078247, + "learning_rate": 2.848831412785924e-06, + "loss": 0.9884, + "step": 56605 + }, + { + "epoch": 16.94, + "grad_norm": 4.170457363128662, + "learning_rate": 2.8461083008597204e-06, + "loss": 0.8189, + "step": 56610 + }, + { + "epoch": 16.94, + "grad_norm": 2.4919357299804688, + "learning_rate": 2.8433864124699284e-06, + "loss": 0.9594, + "step": 56615 + }, + { + "epoch": 16.94, + "grad_norm": 4.131077289581299, + "learning_rate": 2.840665747766874e-06, + "loss": 0.9989, + "step": 56620 + }, + { + "epoch": 16.94, + "grad_norm": 4.024702072143555, + "learning_rate": 2.8379463069008153e-06, + "loss": 0.9219, + "step": 56625 + }, + { + "epoch": 16.94, + "grad_norm": 2.887530565261841, + "learning_rate": 2.8352280900219462e-06, + "loss": 0.8609, + "step": 56630 + }, + { + "epoch": 16.94, + "grad_norm": 3.2231643199920654, + "learning_rate": 2.8325110972803936e-06, + "loss": 1.0832, + "step": 56635 + }, + { + "epoch": 16.95, + "grad_norm": 2.5507965087890625, + "learning_rate": 2.829795328826207e-06, + "loss": 1.023, + "step": 56640 + }, + { + "epoch": 16.95, + "grad_norm": 1.9942423105239868, + "learning_rate": 2.827080784809383e-06, + "loss": 1.0267, + "step": 56645 + }, + { + "epoch": 16.95, + "grad_norm": 3.263000011444092, + "learning_rate": 2.8243674653798376e-06, + "loss": 0.9424, + "step": 56650 + }, + { + "epoch": 16.95, + "grad_norm": 2.1129324436187744, + "learning_rate": 2.8216553706874288e-06, + "loss": 1.119, + "step": 56655 + }, + { + "epoch": 16.95, + "grad_norm": 1.207990050315857, + "learning_rate": 2.8189445008819454e-06, + "loss": 0.762, + "step": 56660 + }, + { + "epoch": 16.95, + "grad_norm": 2.738921880722046, + "learning_rate": 2.8162348561130895e-06, + "loss": 1.012, + "step": 56665 + }, + { + "epoch": 16.96, + "grad_norm": 4.169528007507324, + "learning_rate": 2.8135264365305366e-06, + "loss": 1.0023, + "step": 56670 + }, + { + "epoch": 16.96, + "grad_norm": 3.962857723236084, + "learning_rate": 2.8108192422838437e-06, + "loss": 0.901, + "step": 56675 + }, + { + "epoch": 16.96, + "grad_norm": 2.816699743270874, + "learning_rate": 2.808113273522553e-06, + "loss": 0.969, + "step": 56680 + }, + { + "epoch": 16.96, + "grad_norm": 4.528411388397217, + "learning_rate": 2.8054085303960863e-06, + "loss": 1.0662, + "step": 56685 + }, + { + "epoch": 16.96, + "grad_norm": 1.3069534301757812, + "learning_rate": 2.8027050130538494e-06, + "loss": 1.0393, + "step": 56690 + }, + { + "epoch": 16.96, + "grad_norm": 5.835165977478027, + "learning_rate": 2.800002721645134e-06, + "loss": 1.1026, + "step": 56695 + }, + { + "epoch": 16.96, + "grad_norm": 2.2448999881744385, + "learning_rate": 2.79730165631919e-06, + "loss": 0.8728, + "step": 56700 + }, + { + "epoch": 16.97, + "grad_norm": 3.5195298194885254, + "learning_rate": 2.7946018172252066e-06, + "loss": 1.0797, + "step": 56705 + }, + { + "epoch": 16.97, + "grad_norm": 1.8925422430038452, + "learning_rate": 2.7919032045122725e-06, + "loss": 0.9078, + "step": 56710 + }, + { + "epoch": 16.97, + "grad_norm": 1.31459379196167, + "learning_rate": 2.789205818329449e-06, + "loss": 0.9306, + "step": 56715 + }, + { + "epoch": 16.97, + "grad_norm": 3.2655656337738037, + "learning_rate": 2.786509658825698e-06, + "loss": 1.1181, + "step": 56720 + }, + { + "epoch": 16.97, + "grad_norm": 1.9709197282791138, + "learning_rate": 2.7838147261499305e-06, + "loss": 0.8847, + "step": 56725 + }, + { + "epoch": 16.97, + "grad_norm": 2.601952314376831, + "learning_rate": 2.78112102045098e-06, + "loss": 0.911, + "step": 56730 + }, + { + "epoch": 16.97, + "grad_norm": 1.5735349655151367, + "learning_rate": 2.7784285418776227e-06, + "loss": 0.8787, + "step": 56735 + }, + { + "epoch": 16.98, + "grad_norm": 1.383877158164978, + "learning_rate": 2.775737290578559e-06, + "loss": 0.8761, + "step": 56740 + }, + { + "epoch": 16.98, + "grad_norm": 2.030069351196289, + "learning_rate": 2.773047266702422e-06, + "loss": 1.0923, + "step": 56745 + }, + { + "epoch": 16.98, + "grad_norm": 2.9204518795013428, + "learning_rate": 2.7703584703977824e-06, + "loss": 0.9418, + "step": 56750 + }, + { + "epoch": 16.98, + "grad_norm": 2.140024423599243, + "learning_rate": 2.7676709018131436e-06, + "loss": 0.9529, + "step": 56755 + }, + { + "epoch": 16.98, + "grad_norm": 1.8030258417129517, + "learning_rate": 2.76498456109692e-06, + "loss": 1.0081, + "step": 56760 + }, + { + "epoch": 16.98, + "grad_norm": 4.3402509689331055, + "learning_rate": 2.7622994483974985e-06, + "loss": 0.8433, + "step": 56765 + }, + { + "epoch": 16.98, + "grad_norm": 2.835216999053955, + "learning_rate": 2.759615563863152e-06, + "loss": 1.0517, + "step": 56770 + }, + { + "epoch": 16.99, + "grad_norm": 2.009197473526001, + "learning_rate": 2.7569329076421317e-06, + "loss": 0.985, + "step": 56775 + }, + { + "epoch": 16.99, + "grad_norm": 1.9505784511566162, + "learning_rate": 2.7542514798825773e-06, + "loss": 0.8046, + "step": 56780 + }, + { + "epoch": 16.99, + "grad_norm": 1.5446016788482666, + "learning_rate": 2.7515712807325955e-06, + "loss": 0.9255, + "step": 56785 + }, + { + "epoch": 16.99, + "grad_norm": 1.5326980352401733, + "learning_rate": 2.7488923103402094e-06, + "loss": 1.1662, + "step": 56790 + }, + { + "epoch": 16.99, + "grad_norm": 2.3964767456054688, + "learning_rate": 2.7462145688533615e-06, + "loss": 0.9944, + "step": 56795 + }, + { + "epoch": 16.99, + "grad_norm": 5.0350189208984375, + "learning_rate": 2.743538056419964e-06, + "loss": 0.9068, + "step": 56800 + }, + { + "epoch": 17.0, + "grad_norm": 2.6177234649658203, + "learning_rate": 2.7408627731878133e-06, + "loss": 0.9081, + "step": 56805 + }, + { + "epoch": 17.0, + "grad_norm": 2.825147867202759, + "learning_rate": 2.738188719304685e-06, + "loss": 0.9299, + "step": 56810 + }, + { + "epoch": 17.0, + "grad_norm": 3.711740732192993, + "learning_rate": 2.735515894918245e-06, + "loss": 0.9777, + "step": 56815 + }, + { + "epoch": 17.0, + "grad_norm": 2.3973779678344727, + "learning_rate": 2.7328443001761266e-06, + "loss": 1.0681, + "step": 56820 + }, + { + "epoch": 17.0, + "grad_norm": 3.3198635578155518, + "learning_rate": 2.7301739352258716e-06, + "loss": 0.9094, + "step": 56825 + }, + { + "epoch": 17.0, + "grad_norm": 2.2214438915252686, + "learning_rate": 2.7275048002149585e-06, + "loss": 0.9812, + "step": 56830 + }, + { + "epoch": 17.0, + "grad_norm": 6.447070598602295, + "learning_rate": 2.7248368952908053e-06, + "loss": 0.76, + "step": 56835 + }, + { + "epoch": 17.01, + "grad_norm": 4.4163713455200195, + "learning_rate": 2.722170220600756e-06, + "loss": 0.8238, + "step": 56840 + }, + { + "epoch": 17.01, + "grad_norm": 2.432135820388794, + "learning_rate": 2.7195047762920895e-06, + "loss": 0.8192, + "step": 56845 + }, + { + "epoch": 17.01, + "grad_norm": 1.874724268913269, + "learning_rate": 2.716840562512021e-06, + "loss": 0.916, + "step": 56850 + }, + { + "epoch": 17.01, + "grad_norm": 3.546983003616333, + "learning_rate": 2.714177579407673e-06, + "loss": 0.858, + "step": 56855 + }, + { + "epoch": 17.01, + "grad_norm": 3.305762529373169, + "learning_rate": 2.7115158271261403e-06, + "loss": 0.9272, + "step": 56860 + }, + { + "epoch": 17.01, + "grad_norm": 4.167823791503906, + "learning_rate": 2.70885530581442e-06, + "loss": 0.9225, + "step": 56865 + }, + { + "epoch": 17.01, + "grad_norm": 2.542015790939331, + "learning_rate": 2.7061960156194526e-06, + "loss": 0.972, + "step": 56870 + }, + { + "epoch": 17.02, + "grad_norm": 2.959282398223877, + "learning_rate": 2.7035379566881042e-06, + "loss": 1.1611, + "step": 56875 + }, + { + "epoch": 17.02, + "grad_norm": 2.539849281311035, + "learning_rate": 2.700881129167179e-06, + "loss": 0.8913, + "step": 56880 + }, + { + "epoch": 17.02, + "grad_norm": 3.6170997619628906, + "learning_rate": 2.6982255332034133e-06, + "loss": 1.0476, + "step": 56885 + }, + { + "epoch": 17.02, + "grad_norm": 4.399633407592773, + "learning_rate": 2.6955711689434614e-06, + "loss": 0.7257, + "step": 56890 + }, + { + "epoch": 17.02, + "grad_norm": 2.9377009868621826, + "learning_rate": 2.6929180365339423e-06, + "loss": 1.2028, + "step": 56895 + }, + { + "epoch": 17.02, + "grad_norm": 3.030989408493042, + "learning_rate": 2.690266136121361e-06, + "loss": 1.0568, + "step": 56900 + }, + { + "epoch": 17.03, + "grad_norm": 2.5369529724121094, + "learning_rate": 2.6876154678522005e-06, + "loss": 1.0388, + "step": 56905 + }, + { + "epoch": 17.03, + "grad_norm": 2.8671159744262695, + "learning_rate": 2.6849660318728376e-06, + "loss": 0.9034, + "step": 56910 + }, + { + "epoch": 17.03, + "grad_norm": 1.3632619380950928, + "learning_rate": 2.682317828329614e-06, + "loss": 0.735, + "step": 56915 + }, + { + "epoch": 17.03, + "grad_norm": 2.6106884479522705, + "learning_rate": 2.679670857368774e-06, + "loss": 0.9862, + "step": 56920 + }, + { + "epoch": 17.03, + "grad_norm": 2.3712689876556396, + "learning_rate": 2.6770251191365113e-06, + "loss": 1.0196, + "step": 56925 + }, + { + "epoch": 17.03, + "grad_norm": 5.395451545715332, + "learning_rate": 2.6743806137789485e-06, + "loss": 1.0626, + "step": 56930 + }, + { + "epoch": 17.03, + "grad_norm": 4.0020856857299805, + "learning_rate": 2.6717373414421377e-06, + "loss": 0.9506, + "step": 56935 + }, + { + "epoch": 17.04, + "grad_norm": 1.977827548980713, + "learning_rate": 2.669095302272065e-06, + "loss": 0.7509, + "step": 56940 + }, + { + "epoch": 17.04, + "grad_norm": 4.007442474365234, + "learning_rate": 2.666454496414647e-06, + "loss": 0.9768, + "step": 56945 + }, + { + "epoch": 17.04, + "grad_norm": 2.781235933303833, + "learning_rate": 2.6638149240157313e-06, + "loss": 0.9895, + "step": 56950 + }, + { + "epoch": 17.04, + "grad_norm": 3.7630717754364014, + "learning_rate": 2.6611765852211037e-06, + "loss": 1.0876, + "step": 56955 + }, + { + "epoch": 17.04, + "grad_norm": 6.746951580047607, + "learning_rate": 2.6585394801764697e-06, + "loss": 0.9619, + "step": 56960 + }, + { + "epoch": 17.04, + "grad_norm": 1.971835732460022, + "learning_rate": 2.6559036090274796e-06, + "loss": 1.147, + "step": 56965 + }, + { + "epoch": 17.04, + "grad_norm": 1.7732189893722534, + "learning_rate": 2.653268971919706e-06, + "loss": 1.0708, + "step": 56970 + }, + { + "epoch": 17.05, + "grad_norm": 2.1703782081604004, + "learning_rate": 2.6506355689986607e-06, + "loss": 0.9244, + "step": 56975 + }, + { + "epoch": 17.05, + "grad_norm": 3.1191794872283936, + "learning_rate": 2.6480034004097877e-06, + "loss": 0.9953, + "step": 56980 + }, + { + "epoch": 17.05, + "grad_norm": 3.8292319774627686, + "learning_rate": 2.645372466298443e-06, + "loss": 0.9508, + "step": 56985 + }, + { + "epoch": 17.05, + "grad_norm": 2.0366878509521484, + "learning_rate": 2.64274276680995e-06, + "loss": 0.9986, + "step": 56990 + }, + { + "epoch": 17.05, + "grad_norm": 2.463479518890381, + "learning_rate": 2.6401143020895245e-06, + "loss": 1.0819, + "step": 56995 + }, + { + "epoch": 17.05, + "grad_norm": 1.6114779710769653, + "learning_rate": 2.637487072282355e-06, + "loss": 1.0996, + "step": 57000 + }, + { + "epoch": 17.06, + "grad_norm": 1.567771315574646, + "learning_rate": 2.6348610775335182e-06, + "loss": 1.001, + "step": 57005 + }, + { + "epoch": 17.06, + "grad_norm": 1.417826533317566, + "learning_rate": 2.632236317988068e-06, + "loss": 0.9204, + "step": 57010 + }, + { + "epoch": 17.06, + "grad_norm": 1.9809144735336304, + "learning_rate": 2.6296127937909504e-06, + "loss": 0.928, + "step": 57015 + }, + { + "epoch": 17.06, + "grad_norm": 2.4139678478240967, + "learning_rate": 2.626990505087068e-06, + "loss": 0.8989, + "step": 57020 + }, + { + "epoch": 17.06, + "grad_norm": 2.6247403621673584, + "learning_rate": 2.6243694520212414e-06, + "loss": 0.9612, + "step": 57025 + }, + { + "epoch": 17.06, + "grad_norm": 4.078159332275391, + "learning_rate": 2.6217496347382325e-06, + "loss": 1.0525, + "step": 57030 + }, + { + "epoch": 17.06, + "grad_norm": 3.896151065826416, + "learning_rate": 2.6191310533827313e-06, + "loss": 1.1528, + "step": 57035 + }, + { + "epoch": 17.07, + "grad_norm": 2.571683883666992, + "learning_rate": 2.616513708099358e-06, + "loss": 1.0413, + "step": 57040 + }, + { + "epoch": 17.07, + "grad_norm": 3.916944980621338, + "learning_rate": 2.613897599032666e-06, + "loss": 0.8858, + "step": 57045 + }, + { + "epoch": 17.07, + "grad_norm": 3.300248622894287, + "learning_rate": 2.611282726327141e-06, + "loss": 1.1224, + "step": 57050 + }, + { + "epoch": 17.07, + "grad_norm": 1.99065363407135, + "learning_rate": 2.6086690901272e-06, + "loss": 1.035, + "step": 57055 + }, + { + "epoch": 17.07, + "grad_norm": 2.4363794326782227, + "learning_rate": 2.606056690577191e-06, + "loss": 0.926, + "step": 57060 + }, + { + "epoch": 17.07, + "grad_norm": 1.8297630548477173, + "learning_rate": 2.603445527821391e-06, + "loss": 1.1107, + "step": 57065 + }, + { + "epoch": 17.07, + "grad_norm": 4.487589359283447, + "learning_rate": 2.6008356020040174e-06, + "loss": 1.0452, + "step": 57070 + }, + { + "epoch": 17.08, + "grad_norm": 4.226923942565918, + "learning_rate": 2.598226913269214e-06, + "loss": 0.8234, + "step": 57075 + }, + { + "epoch": 17.08, + "grad_norm": 2.6841580867767334, + "learning_rate": 2.595619461761045e-06, + "loss": 0.735, + "step": 57080 + }, + { + "epoch": 17.08, + "grad_norm": 4.622152805328369, + "learning_rate": 2.5930132476235326e-06, + "loss": 0.9079, + "step": 57085 + }, + { + "epoch": 17.08, + "grad_norm": 2.2728214263916016, + "learning_rate": 2.5904082710005995e-06, + "loss": 1.0287, + "step": 57090 + }, + { + "epoch": 17.08, + "grad_norm": 3.7428131103515625, + "learning_rate": 2.587804532036134e-06, + "loss": 0.8489, + "step": 57095 + }, + { + "epoch": 17.08, + "grad_norm": 5.1959099769592285, + "learning_rate": 2.585202030873915e-06, + "loss": 0.9871, + "step": 57100 + }, + { + "epoch": 17.09, + "grad_norm": 9.105627059936523, + "learning_rate": 2.582600767657703e-06, + "loss": 0.9979, + "step": 57105 + }, + { + "epoch": 17.09, + "grad_norm": 3.824483633041382, + "learning_rate": 2.580000742531141e-06, + "loss": 0.9031, + "step": 57110 + }, + { + "epoch": 17.09, + "grad_norm": 3.2056384086608887, + "learning_rate": 2.577401955637837e-06, + "loss": 0.9565, + "step": 57115 + }, + { + "epoch": 17.09, + "grad_norm": 1.918878197669983, + "learning_rate": 2.574804407121312e-06, + "loss": 1.0012, + "step": 57120 + }, + { + "epoch": 17.09, + "grad_norm": 2.389338731765747, + "learning_rate": 2.572208097125034e-06, + "loss": 1.0327, + "step": 57125 + }, + { + "epoch": 17.09, + "grad_norm": 1.8829810619354248, + "learning_rate": 2.5696130257923863e-06, + "loss": 0.9748, + "step": 57130 + }, + { + "epoch": 17.09, + "grad_norm": 2.50826358795166, + "learning_rate": 2.5670191932666983e-06, + "loss": 0.901, + "step": 57135 + }, + { + "epoch": 17.1, + "grad_norm": 3.3760509490966797, + "learning_rate": 2.5644265996912246e-06, + "loss": 1.0282, + "step": 57140 + }, + { + "epoch": 17.1, + "grad_norm": 2.7886831760406494, + "learning_rate": 2.561835245209146e-06, + "loss": 0.8485, + "step": 57145 + }, + { + "epoch": 17.1, + "grad_norm": 1.582168698310852, + "learning_rate": 2.559245129963586e-06, + "loss": 0.9798, + "step": 57150 + }, + { + "epoch": 17.1, + "grad_norm": 2.845203161239624, + "learning_rate": 2.556656254097589e-06, + "loss": 1.2068, + "step": 57155 + }, + { + "epoch": 17.1, + "grad_norm": 2.8991782665252686, + "learning_rate": 2.5540686177541408e-06, + "loss": 1.0069, + "step": 57160 + }, + { + "epoch": 17.1, + "grad_norm": 2.5524137020111084, + "learning_rate": 2.5514822210761485e-06, + "loss": 0.7097, + "step": 57165 + }, + { + "epoch": 17.1, + "grad_norm": 5.034450531005859, + "learning_rate": 2.5488970642064624e-06, + "loss": 0.9645, + "step": 57170 + }, + { + "epoch": 17.11, + "grad_norm": 3.3670802116394043, + "learning_rate": 2.5463131472878544e-06, + "loss": 1.0891, + "step": 57175 + }, + { + "epoch": 17.11, + "grad_norm": 5.955173015594482, + "learning_rate": 2.543730470463035e-06, + "loss": 0.9105, + "step": 57180 + }, + { + "epoch": 17.11, + "grad_norm": 2.0601420402526855, + "learning_rate": 2.541149033874632e-06, + "loss": 1.0384, + "step": 57185 + }, + { + "epoch": 17.11, + "grad_norm": 1.1754088401794434, + "learning_rate": 2.538568837665231e-06, + "loss": 0.8901, + "step": 57190 + }, + { + "epoch": 17.11, + "grad_norm": 6.220560550689697, + "learning_rate": 2.535989881977319e-06, + "loss": 0.9757, + "step": 57195 + }, + { + "epoch": 17.11, + "grad_norm": 2.9648501873016357, + "learning_rate": 2.5334121669533416e-06, + "loss": 0.9412, + "step": 57200 + }, + { + "epoch": 17.12, + "grad_norm": 2.4804067611694336, + "learning_rate": 2.5308356927356556e-06, + "loss": 1.1147, + "step": 57205 + }, + { + "epoch": 17.12, + "grad_norm": 2.2306246757507324, + "learning_rate": 2.5282604594665583e-06, + "loss": 0.9197, + "step": 57210 + }, + { + "epoch": 17.12, + "grad_norm": 1.3986696004867554, + "learning_rate": 2.525686467288277e-06, + "loss": 0.9156, + "step": 57215 + }, + { + "epoch": 17.12, + "grad_norm": 2.3900442123413086, + "learning_rate": 2.523113716342973e-06, + "loss": 0.9889, + "step": 57220 + }, + { + "epoch": 17.12, + "grad_norm": 6.6008100509643555, + "learning_rate": 2.520542206772733e-06, + "loss": 0.9103, + "step": 57225 + }, + { + "epoch": 17.12, + "grad_norm": 2.302074909210205, + "learning_rate": 2.517971938719582e-06, + "loss": 0.7901, + "step": 57230 + }, + { + "epoch": 17.12, + "grad_norm": 2.978907585144043, + "learning_rate": 2.5154029123254708e-06, + "loss": 1.1318, + "step": 57235 + }, + { + "epoch": 17.13, + "grad_norm": 3.265174388885498, + "learning_rate": 2.5128351277322853e-06, + "loss": 0.8781, + "step": 57240 + }, + { + "epoch": 17.13, + "grad_norm": 2.929891347885132, + "learning_rate": 2.510268585081843e-06, + "loss": 0.8406, + "step": 57245 + }, + { + "epoch": 17.13, + "grad_norm": 2.2917799949645996, + "learning_rate": 2.5077032845158886e-06, + "loss": 0.9925, + "step": 57250 + }, + { + "epoch": 17.13, + "grad_norm": 6.078389644622803, + "learning_rate": 2.505139226176104e-06, + "loss": 0.8077, + "step": 57255 + }, + { + "epoch": 17.13, + "grad_norm": 3.7742414474487305, + "learning_rate": 2.5025764102040966e-06, + "loss": 0.6981, + "step": 57260 + }, + { + "epoch": 17.13, + "grad_norm": 1.999995231628418, + "learning_rate": 2.50001483674141e-06, + "loss": 1.1849, + "step": 57265 + }, + { + "epoch": 17.13, + "grad_norm": 7.203548431396484, + "learning_rate": 2.497454505929517e-06, + "loss": 0.8368, + "step": 57270 + }, + { + "epoch": 17.14, + "grad_norm": 1.5824021100997925, + "learning_rate": 2.494895417909826e-06, + "loss": 1.0648, + "step": 57275 + }, + { + "epoch": 17.14, + "grad_norm": 2.918457269668579, + "learning_rate": 2.492337572823658e-06, + "loss": 0.8636, + "step": 57280 + }, + { + "epoch": 17.14, + "grad_norm": 4.666050434112549, + "learning_rate": 2.4897809708123e-06, + "loss": 0.8613, + "step": 57285 + }, + { + "epoch": 17.14, + "grad_norm": 3.866513967514038, + "learning_rate": 2.487225612016933e-06, + "loss": 1.0003, + "step": 57290 + }, + { + "epoch": 17.14, + "grad_norm": 3.5122618675231934, + "learning_rate": 2.4846714965787027e-06, + "loss": 1.0402, + "step": 57295 + }, + { + "epoch": 17.14, + "grad_norm": 3.2854020595550537, + "learning_rate": 2.4821186246386575e-06, + "loss": 0.8922, + "step": 57300 + }, + { + "epoch": 17.14, + "grad_norm": 3.5705018043518066, + "learning_rate": 2.4795669963377955e-06, + "loss": 1.0352, + "step": 57305 + }, + { + "epoch": 17.15, + "grad_norm": 2.31522798538208, + "learning_rate": 2.4770166118170374e-06, + "loss": 0.9692, + "step": 57310 + }, + { + "epoch": 17.15, + "grad_norm": 4.260332107543945, + "learning_rate": 2.474467471217243e-06, + "loss": 1.0586, + "step": 57315 + }, + { + "epoch": 17.15, + "grad_norm": 3.6906752586364746, + "learning_rate": 2.4719195746791964e-06, + "loss": 0.8967, + "step": 57320 + }, + { + "epoch": 17.15, + "grad_norm": 2.0712099075317383, + "learning_rate": 2.469372922343613e-06, + "loss": 1.1484, + "step": 57325 + }, + { + "epoch": 17.15, + "grad_norm": 2.5663881301879883, + "learning_rate": 2.466827514351144e-06, + "loss": 0.9263, + "step": 57330 + }, + { + "epoch": 17.15, + "grad_norm": 2.5558412075042725, + "learning_rate": 2.4642833508423724e-06, + "loss": 1.1596, + "step": 57335 + }, + { + "epoch": 17.16, + "grad_norm": 4.297339916229248, + "learning_rate": 2.461740431957804e-06, + "loss": 0.8887, + "step": 57340 + }, + { + "epoch": 17.16, + "grad_norm": 1.2254616022109985, + "learning_rate": 2.4591987578378856e-06, + "loss": 1.0521, + "step": 57345 + }, + { + "epoch": 17.16, + "grad_norm": 3.5310375690460205, + "learning_rate": 2.456658328622988e-06, + "loss": 0.7953, + "step": 57350 + }, + { + "epoch": 17.16, + "grad_norm": 2.0364389419555664, + "learning_rate": 2.4541191444534205e-06, + "loss": 0.9538, + "step": 57355 + }, + { + "epoch": 17.16, + "grad_norm": 2.6163721084594727, + "learning_rate": 2.4515812054694166e-06, + "loss": 0.9859, + "step": 57360 + }, + { + "epoch": 17.16, + "grad_norm": 4.850313663482666, + "learning_rate": 2.4490445118111437e-06, + "loss": 0.8275, + "step": 57365 + }, + { + "epoch": 17.16, + "grad_norm": 2.779263734817505, + "learning_rate": 2.44650906361871e-06, + "loss": 1.0238, + "step": 57370 + }, + { + "epoch": 17.17, + "grad_norm": 1.5706207752227783, + "learning_rate": 2.443974861032125e-06, + "loss": 1.1682, + "step": 57375 + }, + { + "epoch": 17.17, + "grad_norm": 2.980870246887207, + "learning_rate": 2.4414419041913744e-06, + "loss": 1.0511, + "step": 57380 + }, + { + "epoch": 17.17, + "grad_norm": 1.8307000398635864, + "learning_rate": 2.438910193236327e-06, + "loss": 1.1374, + "step": 57385 + }, + { + "epoch": 17.17, + "grad_norm": 4.98880672454834, + "learning_rate": 2.436379728306831e-06, + "loss": 0.8951, + "step": 57390 + }, + { + "epoch": 17.17, + "grad_norm": 3.5696189403533936, + "learning_rate": 2.433850509542618e-06, + "loss": 0.9665, + "step": 57395 + }, + { + "epoch": 17.17, + "grad_norm": 2.4421303272247314, + "learning_rate": 2.431322537083394e-06, + "loss": 1.0267, + "step": 57400 + }, + { + "epoch": 17.17, + "grad_norm": 4.078541278839111, + "learning_rate": 2.428795811068765e-06, + "loss": 0.8258, + "step": 57405 + }, + { + "epoch": 17.18, + "grad_norm": 2.0753166675567627, + "learning_rate": 2.426270331638281e-06, + "loss": 0.8415, + "step": 57410 + }, + { + "epoch": 17.18, + "grad_norm": 2.376988410949707, + "learning_rate": 2.423746098931423e-06, + "loss": 1.0177, + "step": 57415 + }, + { + "epoch": 17.18, + "grad_norm": 4.53126335144043, + "learning_rate": 2.4212231130876007e-06, + "loss": 0.9601, + "step": 57420 + }, + { + "epoch": 17.18, + "grad_norm": 2.623030662536621, + "learning_rate": 2.4187013742461545e-06, + "loss": 0.8718, + "step": 57425 + }, + { + "epoch": 17.18, + "grad_norm": 2.2002532482147217, + "learning_rate": 2.4161808825463622e-06, + "loss": 1.0355, + "step": 57430 + }, + { + "epoch": 17.18, + "grad_norm": 2.742001533508301, + "learning_rate": 2.4136616381274237e-06, + "loss": 0.8977, + "step": 57435 + }, + { + "epoch": 17.19, + "grad_norm": 2.284527540206909, + "learning_rate": 2.411143641128477e-06, + "loss": 0.9512, + "step": 57440 + }, + { + "epoch": 17.19, + "grad_norm": 4.969058036804199, + "learning_rate": 2.408626891688587e-06, + "loss": 0.8995, + "step": 57445 + }, + { + "epoch": 17.19, + "grad_norm": 2.935258150100708, + "learning_rate": 2.4061113899467497e-06, + "loss": 0.9235, + "step": 57450 + }, + { + "epoch": 17.19, + "grad_norm": 4.259613037109375, + "learning_rate": 2.4035971360418963e-06, + "loss": 1.0788, + "step": 57455 + }, + { + "epoch": 17.19, + "grad_norm": 2.900808095932007, + "learning_rate": 2.401084130112885e-06, + "loss": 1.1105, + "step": 57460 + }, + { + "epoch": 17.19, + "grad_norm": 2.809903860092163, + "learning_rate": 2.398572372298513e-06, + "loss": 0.986, + "step": 57465 + }, + { + "epoch": 17.19, + "grad_norm": 2.9427716732025146, + "learning_rate": 2.396061862737484e-06, + "loss": 1.0418, + "step": 57470 + }, + { + "epoch": 17.2, + "grad_norm": 4.519219398498535, + "learning_rate": 2.3935526015684745e-06, + "loss": 1.0045, + "step": 57475 + }, + { + "epoch": 17.2, + "grad_norm": 2.9605634212493896, + "learning_rate": 2.391044588930047e-06, + "loss": 0.9415, + "step": 57480 + }, + { + "epoch": 17.2, + "grad_norm": 2.0215342044830322, + "learning_rate": 2.388537824960735e-06, + "loss": 0.9759, + "step": 57485 + }, + { + "epoch": 17.2, + "grad_norm": 1.2563127279281616, + "learning_rate": 2.3860323097989663e-06, + "loss": 0.956, + "step": 57490 + }, + { + "epoch": 17.2, + "grad_norm": 1.9748398065567017, + "learning_rate": 2.3835280435831385e-06, + "loss": 0.9808, + "step": 57495 + }, + { + "epoch": 17.2, + "grad_norm": 2.0095794200897217, + "learning_rate": 2.3810250264515415e-06, + "loss": 1.0952, + "step": 57500 + }, + { + "epoch": 17.2, + "grad_norm": 2.950597047805786, + "learning_rate": 2.3785232585424226e-06, + "loss": 1.0353, + "step": 57505 + }, + { + "epoch": 17.21, + "grad_norm": 1.762516736984253, + "learning_rate": 2.3760227399939493e-06, + "loss": 1.081, + "step": 57510 + }, + { + "epoch": 17.21, + "grad_norm": 4.237101078033447, + "learning_rate": 2.3735234709442194e-06, + "loss": 1.1056, + "step": 57515 + }, + { + "epoch": 17.21, + "grad_norm": 2.0214059352874756, + "learning_rate": 2.371025451531278e-06, + "loss": 0.933, + "step": 57520 + }, + { + "epoch": 17.21, + "grad_norm": 2.3543801307678223, + "learning_rate": 2.36852868189307e-06, + "loss": 0.9907, + "step": 57525 + }, + { + "epoch": 17.21, + "grad_norm": 3.8169326782226562, + "learning_rate": 2.366033162167508e-06, + "loss": 1.045, + "step": 57530 + }, + { + "epoch": 17.21, + "grad_norm": 1.6182808876037598, + "learning_rate": 2.363538892492406e-06, + "loss": 0.9037, + "step": 57535 + }, + { + "epoch": 17.22, + "grad_norm": 2.3788492679595947, + "learning_rate": 2.3610458730055186e-06, + "loss": 1.0592, + "step": 57540 + }, + { + "epoch": 17.22, + "grad_norm": 1.563767910003662, + "learning_rate": 2.3585541038445374e-06, + "loss": 0.8683, + "step": 57545 + }, + { + "epoch": 17.22, + "grad_norm": 3.2709128856658936, + "learning_rate": 2.356063585147078e-06, + "loss": 0.8699, + "step": 57550 + }, + { + "epoch": 17.22, + "grad_norm": 2.178466320037842, + "learning_rate": 2.3535743170506887e-06, + "loss": 0.8278, + "step": 57555 + }, + { + "epoch": 17.22, + "grad_norm": 4.895992755889893, + "learning_rate": 2.351086299692856e-06, + "loss": 1.0167, + "step": 57560 + }, + { + "epoch": 17.22, + "grad_norm": 3.7449183464050293, + "learning_rate": 2.348599533210974e-06, + "loss": 1.1186, + "step": 57565 + }, + { + "epoch": 17.22, + "grad_norm": 4.224990367889404, + "learning_rate": 2.346114017742407e-06, + "loss": 0.9867, + "step": 57570 + }, + { + "epoch": 17.23, + "grad_norm": 1.4723498821258545, + "learning_rate": 2.3436297534244035e-06, + "loss": 1.0563, + "step": 57575 + }, + { + "epoch": 17.23, + "grad_norm": 2.0138280391693115, + "learning_rate": 2.34114674039419e-06, + "loss": 0.8924, + "step": 57580 + }, + { + "epoch": 17.23, + "grad_norm": 5.9495391845703125, + "learning_rate": 2.3386649787888794e-06, + "loss": 1.0926, + "step": 57585 + }, + { + "epoch": 17.23, + "grad_norm": 3.3847897052764893, + "learning_rate": 2.33618446874555e-06, + "loss": 0.9745, + "step": 57590 + }, + { + "epoch": 17.23, + "grad_norm": 8.030804634094238, + "learning_rate": 2.333705210401202e-06, + "loss": 0.9503, + "step": 57595 + }, + { + "epoch": 17.23, + "grad_norm": 1.3774622678756714, + "learning_rate": 2.3312272038927443e-06, + "loss": 0.8589, + "step": 57600 + }, + { + "epoch": 17.23, + "grad_norm": 4.3188652992248535, + "learning_rate": 2.3287504493570534e-06, + "loss": 0.8434, + "step": 57605 + }, + { + "epoch": 17.24, + "grad_norm": 2.444995880126953, + "learning_rate": 2.3262749469309007e-06, + "loss": 1.1008, + "step": 57610 + }, + { + "epoch": 17.24, + "grad_norm": 4.819691181182861, + "learning_rate": 2.3238006967510273e-06, + "loss": 0.9677, + "step": 57615 + }, + { + "epoch": 17.24, + "grad_norm": 3.0465807914733887, + "learning_rate": 2.321327698954057e-06, + "loss": 0.7771, + "step": 57620 + }, + { + "epoch": 17.24, + "grad_norm": 1.9639545679092407, + "learning_rate": 2.3188559536765965e-06, + "loss": 0.7874, + "step": 57625 + }, + { + "epoch": 17.24, + "grad_norm": 4.145541191101074, + "learning_rate": 2.3163854610551427e-06, + "loss": 0.9116, + "step": 57630 + }, + { + "epoch": 17.24, + "grad_norm": 5.50032377243042, + "learning_rate": 2.3139162212261394e-06, + "loss": 0.828, + "step": 57635 + }, + { + "epoch": 17.25, + "grad_norm": 3.153430223464966, + "learning_rate": 2.311448234325961e-06, + "loss": 1.1071, + "step": 57640 + }, + { + "epoch": 17.25, + "grad_norm": 4.366771697998047, + "learning_rate": 2.3089815004909145e-06, + "loss": 1.0254, + "step": 57645 + }, + { + "epoch": 17.25, + "grad_norm": 3.364361047744751, + "learning_rate": 2.3065160198572355e-06, + "loss": 0.8954, + "step": 57650 + }, + { + "epoch": 17.25, + "grad_norm": 2.316021680831909, + "learning_rate": 2.3040517925610904e-06, + "loss": 0.8784, + "step": 57655 + }, + { + "epoch": 17.25, + "grad_norm": 3.245994806289673, + "learning_rate": 2.3015888187385674e-06, + "loss": 1.0405, + "step": 57660 + }, + { + "epoch": 17.25, + "grad_norm": 2.167280435562134, + "learning_rate": 2.299127098525705e-06, + "loss": 0.9786, + "step": 57665 + }, + { + "epoch": 17.25, + "grad_norm": 3.1760175228118896, + "learning_rate": 2.29666663205845e-06, + "loss": 0.9913, + "step": 57670 + }, + { + "epoch": 17.26, + "grad_norm": 1.8148313760757446, + "learning_rate": 2.294207419472705e-06, + "loss": 1.0802, + "step": 57675 + }, + { + "epoch": 17.26, + "grad_norm": 5.893067836761475, + "learning_rate": 2.29174946090428e-06, + "loss": 0.751, + "step": 57680 + }, + { + "epoch": 17.26, + "grad_norm": 2.0050816535949707, + "learning_rate": 2.2892927564889284e-06, + "loss": 0.9561, + "step": 57685 + }, + { + "epoch": 17.26, + "grad_norm": 2.132786512374878, + "learning_rate": 2.286837306362338e-06, + "loss": 0.838, + "step": 57690 + }, + { + "epoch": 17.26, + "grad_norm": 3.9043054580688477, + "learning_rate": 2.2843831106601017e-06, + "loss": 0.9945, + "step": 57695 + }, + { + "epoch": 17.26, + "grad_norm": 2.53236722946167, + "learning_rate": 2.2819301695177876e-06, + "loss": 0.8508, + "step": 57700 + }, + { + "epoch": 17.26, + "grad_norm": 2.5461416244506836, + "learning_rate": 2.2794784830708432e-06, + "loss": 0.7963, + "step": 57705 + }, + { + "epoch": 17.27, + "grad_norm": 3.189567804336548, + "learning_rate": 2.2770280514546964e-06, + "loss": 0.9692, + "step": 57710 + }, + { + "epoch": 17.27, + "grad_norm": 2.352689027786255, + "learning_rate": 2.2745788748046614e-06, + "loss": 0.9831, + "step": 57715 + }, + { + "epoch": 17.27, + "grad_norm": 2.441016674041748, + "learning_rate": 2.2721309532560244e-06, + "loss": 0.962, + "step": 57720 + }, + { + "epoch": 17.27, + "grad_norm": 3.2766942977905273, + "learning_rate": 2.2696842869439657e-06, + "loss": 0.9884, + "step": 57725 + }, + { + "epoch": 17.27, + "grad_norm": 4.255434989929199, + "learning_rate": 2.267238876003616e-06, + "loss": 1.06, + "step": 57730 + }, + { + "epoch": 17.27, + "grad_norm": 4.779931545257568, + "learning_rate": 2.2647947205700322e-06, + "loss": 1.0216, + "step": 57735 + }, + { + "epoch": 17.28, + "grad_norm": 2.0698294639587402, + "learning_rate": 2.262351820778205e-06, + "loss": 0.8473, + "step": 57740 + }, + { + "epoch": 17.28, + "grad_norm": 2.572493553161621, + "learning_rate": 2.2599101767630524e-06, + "loss": 1.0734, + "step": 57745 + }, + { + "epoch": 17.28, + "grad_norm": 3.1653802394866943, + "learning_rate": 2.257469788659425e-06, + "loss": 0.8459, + "step": 57750 + }, + { + "epoch": 17.28, + "grad_norm": 2.538551092147827, + "learning_rate": 2.255030656602103e-06, + "loss": 0.9257, + "step": 57755 + }, + { + "epoch": 17.28, + "grad_norm": 1.3721609115600586, + "learning_rate": 2.2525927807257928e-06, + "loss": 0.984, + "step": 57760 + }, + { + "epoch": 17.28, + "grad_norm": 1.590498447418213, + "learning_rate": 2.25015616116514e-06, + "loss": 0.8294, + "step": 57765 + }, + { + "epoch": 17.28, + "grad_norm": 3.673794746398926, + "learning_rate": 2.2477207980547145e-06, + "loss": 0.9454, + "step": 57770 + }, + { + "epoch": 17.29, + "grad_norm": 3.491346836090088, + "learning_rate": 2.2452866915290195e-06, + "loss": 1.0912, + "step": 57775 + }, + { + "epoch": 17.29, + "grad_norm": 6.499100208282471, + "learning_rate": 2.2428538417224894e-06, + "loss": 0.9398, + "step": 57780 + }, + { + "epoch": 17.29, + "grad_norm": 2.021291971206665, + "learning_rate": 2.2404222487694913e-06, + "loss": 0.9325, + "step": 57785 + }, + { + "epoch": 17.29, + "grad_norm": 2.0946333408355713, + "learning_rate": 2.2379919128043047e-06, + "loss": 0.9485, + "step": 57790 + }, + { + "epoch": 17.29, + "grad_norm": 1.8785912990570068, + "learning_rate": 2.2355628339611744e-06, + "loss": 0.9824, + "step": 57795 + }, + { + "epoch": 17.29, + "grad_norm": 3.972280979156494, + "learning_rate": 2.233135012374238e-06, + "loss": 0.9679, + "step": 57800 + }, + { + "epoch": 17.29, + "grad_norm": 6.458027362823486, + "learning_rate": 2.2307084481775985e-06, + "loss": 0.9765, + "step": 57805 + }, + { + "epoch": 17.3, + "grad_norm": 8.290522575378418, + "learning_rate": 2.228283141505255e-06, + "loss": 1.0319, + "step": 57810 + }, + { + "epoch": 17.3, + "grad_norm": 3.988783121109009, + "learning_rate": 2.2258590924911753e-06, + "loss": 0.9265, + "step": 57815 + }, + { + "epoch": 17.3, + "grad_norm": 1.954484224319458, + "learning_rate": 2.223436301269219e-06, + "loss": 1.0169, + "step": 57820 + }, + { + "epoch": 17.3, + "grad_norm": 4.165136814117432, + "learning_rate": 2.221014767973201e-06, + "loss": 1.0911, + "step": 57825 + }, + { + "epoch": 17.3, + "grad_norm": 3.616419553756714, + "learning_rate": 2.2185944927368587e-06, + "loss": 0.9891, + "step": 57830 + }, + { + "epoch": 17.3, + "grad_norm": 2.0609145164489746, + "learning_rate": 2.216175475693863e-06, + "loss": 1.0497, + "step": 57835 + }, + { + "epoch": 17.31, + "grad_norm": 3.2805593013763428, + "learning_rate": 2.2137577169778155e-06, + "loss": 0.9588, + "step": 57840 + }, + { + "epoch": 17.31, + "grad_norm": 1.7305797338485718, + "learning_rate": 2.211341216722243e-06, + "loss": 0.9302, + "step": 57845 + }, + { + "epoch": 17.31, + "grad_norm": 3.898714303970337, + "learning_rate": 2.2089259750606052e-06, + "loss": 1.1782, + "step": 57850 + }, + { + "epoch": 17.31, + "grad_norm": 1.8012914657592773, + "learning_rate": 2.206511992126298e-06, + "loss": 0.9423, + "step": 57855 + }, + { + "epoch": 17.31, + "grad_norm": 1.263310432434082, + "learning_rate": 2.2040992680526423e-06, + "loss": 0.9829, + "step": 57860 + }, + { + "epoch": 17.31, + "grad_norm": 1.296061396598816, + "learning_rate": 2.2016878029728877e-06, + "loss": 0.9826, + "step": 57865 + }, + { + "epoch": 17.31, + "grad_norm": 3.5253007411956787, + "learning_rate": 2.199277597020219e-06, + "loss": 0.9559, + "step": 57870 + }, + { + "epoch": 17.32, + "grad_norm": 1.947613000869751, + "learning_rate": 2.1968686503277464e-06, + "loss": 0.9857, + "step": 57875 + }, + { + "epoch": 17.32, + "grad_norm": 1.1814169883728027, + "learning_rate": 2.194460963028516e-06, + "loss": 0.9992, + "step": 57880 + }, + { + "epoch": 17.32, + "grad_norm": 3.306941509246826, + "learning_rate": 2.192054535255503e-06, + "loss": 0.9873, + "step": 57885 + }, + { + "epoch": 17.32, + "grad_norm": 2.9445595741271973, + "learning_rate": 2.189649367141616e-06, + "loss": 0.8051, + "step": 57890 + }, + { + "epoch": 17.32, + "grad_norm": 2.4458541870117188, + "learning_rate": 2.187245458819673e-06, + "loss": 1.121, + "step": 57895 + }, + { + "epoch": 17.32, + "grad_norm": 4.300705432891846, + "learning_rate": 2.1848428104224605e-06, + "loss": 0.9273, + "step": 57900 + }, + { + "epoch": 17.32, + "grad_norm": 1.9480119943618774, + "learning_rate": 2.1824414220826566e-06, + "loss": 0.9085, + "step": 57905 + }, + { + "epoch": 17.33, + "grad_norm": 5.82395601272583, + "learning_rate": 2.180041293932905e-06, + "loss": 1.0309, + "step": 57910 + }, + { + "epoch": 17.33, + "grad_norm": 2.66611909866333, + "learning_rate": 2.177642426105747e-06, + "loss": 1.1571, + "step": 57915 + }, + { + "epoch": 17.33, + "grad_norm": 1.7344080209732056, + "learning_rate": 2.1752448187336738e-06, + "loss": 0.8145, + "step": 57920 + }, + { + "epoch": 17.33, + "grad_norm": 4.083990573883057, + "learning_rate": 2.172848471949107e-06, + "loss": 0.8786, + "step": 57925 + }, + { + "epoch": 17.33, + "grad_norm": 3.4210381507873535, + "learning_rate": 2.1704533858843884e-06, + "loss": 0.9086, + "step": 57930 + }, + { + "epoch": 17.33, + "grad_norm": 2.1340394020080566, + "learning_rate": 2.1680595606718008e-06, + "loss": 0.9275, + "step": 57935 + }, + { + "epoch": 17.33, + "grad_norm": 2.192603588104248, + "learning_rate": 2.165666996443552e-06, + "loss": 1.0049, + "step": 57940 + }, + { + "epoch": 17.34, + "grad_norm": 1.9342150688171387, + "learning_rate": 2.1632756933317815e-06, + "loss": 0.9161, + "step": 57945 + }, + { + "epoch": 17.34, + "grad_norm": 2.2969846725463867, + "learning_rate": 2.160885651468553e-06, + "loss": 1.0195, + "step": 57950 + }, + { + "epoch": 17.34, + "grad_norm": 2.6062443256378174, + "learning_rate": 2.158496870985874e-06, + "loss": 1.0117, + "step": 57955 + }, + { + "epoch": 17.34, + "grad_norm": 1.2157763242721558, + "learning_rate": 2.156109352015667e-06, + "loss": 0.9364, + "step": 57960 + }, + { + "epoch": 17.34, + "grad_norm": 3.201415777206421, + "learning_rate": 2.153723094689797e-06, + "loss": 1.1783, + "step": 57965 + }, + { + "epoch": 17.34, + "grad_norm": 2.736159324645996, + "learning_rate": 2.1513380991400547e-06, + "loss": 0.8852, + "step": 57970 + }, + { + "epoch": 17.35, + "grad_norm": 2.2211790084838867, + "learning_rate": 2.148954365498157e-06, + "loss": 0.8922, + "step": 57975 + }, + { + "epoch": 17.35, + "grad_norm": 0.9812418818473816, + "learning_rate": 2.1465718938957576e-06, + "loss": 1.049, + "step": 57980 + }, + { + "epoch": 17.35, + "grad_norm": 2.103342294692993, + "learning_rate": 2.144190684464442e-06, + "loss": 1.1637, + "step": 57985 + }, + { + "epoch": 17.35, + "grad_norm": 2.9801440238952637, + "learning_rate": 2.1418107373357116e-06, + "loss": 0.8715, + "step": 57990 + }, + { + "epoch": 17.35, + "grad_norm": 3.3744688034057617, + "learning_rate": 2.139432052641019e-06, + "loss": 1.0241, + "step": 57995 + }, + { + "epoch": 17.35, + "grad_norm": 3.7058229446411133, + "learning_rate": 2.1370546305117254e-06, + "loss": 1.0102, + "step": 58000 + }, + { + "epoch": 17.35, + "grad_norm": 4.445005416870117, + "learning_rate": 2.134678471079149e-06, + "loss": 1.1774, + "step": 58005 + }, + { + "epoch": 17.36, + "grad_norm": 2.2905967235565186, + "learning_rate": 2.132303574474509e-06, + "loss": 0.8403, + "step": 58010 + }, + { + "epoch": 17.36, + "grad_norm": 1.867017388343811, + "learning_rate": 2.129929940828973e-06, + "loss": 0.987, + "step": 58015 + }, + { + "epoch": 17.36, + "grad_norm": 2.154637575149536, + "learning_rate": 2.1275575702736334e-06, + "loss": 0.9626, + "step": 58020 + }, + { + "epoch": 17.36, + "grad_norm": 1.9806933403015137, + "learning_rate": 2.1251864629395156e-06, + "loss": 1.1247, + "step": 58025 + }, + { + "epoch": 17.36, + "grad_norm": 2.581702947616577, + "learning_rate": 2.122816618957571e-06, + "loss": 1.0137, + "step": 58030 + }, + { + "epoch": 17.36, + "grad_norm": 2.4726662635803223, + "learning_rate": 2.120448038458686e-06, + "loss": 0.9178, + "step": 58035 + }, + { + "epoch": 17.36, + "grad_norm": 1.8047939538955688, + "learning_rate": 2.118080721573673e-06, + "loss": 1.1279, + "step": 58040 + }, + { + "epoch": 17.37, + "grad_norm": 1.1406919956207275, + "learning_rate": 2.1157146684332774e-06, + "loss": 0.9604, + "step": 58045 + }, + { + "epoch": 17.37, + "grad_norm": 2.2998058795928955, + "learning_rate": 2.113349879168175e-06, + "loss": 1.1191, + "step": 58050 + }, + { + "epoch": 17.37, + "grad_norm": 2.92384672164917, + "learning_rate": 2.11098635390897e-06, + "loss": 1.0299, + "step": 58055 + }, + { + "epoch": 17.37, + "grad_norm": 2.4828195571899414, + "learning_rate": 2.1086240927861933e-06, + "loss": 1.1108, + "step": 58060 + }, + { + "epoch": 17.37, + "grad_norm": 2.7545273303985596, + "learning_rate": 2.1062630959303163e-06, + "loss": 1.0005, + "step": 58065 + }, + { + "epoch": 17.37, + "grad_norm": 2.8774526119232178, + "learning_rate": 2.103903363471732e-06, + "loss": 1.0559, + "step": 58070 + }, + { + "epoch": 17.38, + "grad_norm": 1.6022573709487915, + "learning_rate": 2.1015448955407637e-06, + "loss": 0.8921, + "step": 58075 + }, + { + "epoch": 17.38, + "grad_norm": 3.47226619720459, + "learning_rate": 2.0991876922676764e-06, + "loss": 1.0754, + "step": 58080 + }, + { + "epoch": 17.38, + "grad_norm": 1.9637651443481445, + "learning_rate": 2.096831753782638e-06, + "loss": 0.9987, + "step": 58085 + }, + { + "epoch": 17.38, + "grad_norm": 1.8076112270355225, + "learning_rate": 2.094477080215787e-06, + "loss": 0.9186, + "step": 58090 + }, + { + "epoch": 17.38, + "grad_norm": 3.499573230743408, + "learning_rate": 2.0921236716971465e-06, + "loss": 1.0806, + "step": 58095 + }, + { + "epoch": 17.38, + "grad_norm": 5.197295188903809, + "learning_rate": 2.0897715283567126e-06, + "loss": 1.0594, + "step": 58100 + }, + { + "epoch": 17.38, + "grad_norm": 4.264853477478027, + "learning_rate": 2.0874206503243758e-06, + "loss": 0.8798, + "step": 58105 + }, + { + "epoch": 17.39, + "grad_norm": 4.624975681304932, + "learning_rate": 2.0850710377299883e-06, + "loss": 0.9827, + "step": 58110 + }, + { + "epoch": 17.39, + "grad_norm": 2.4760243892669678, + "learning_rate": 2.0827226907033037e-06, + "loss": 0.9113, + "step": 58115 + }, + { + "epoch": 17.39, + "grad_norm": 1.9899121522903442, + "learning_rate": 2.0803756093740247e-06, + "loss": 0.9547, + "step": 58120 + }, + { + "epoch": 17.39, + "grad_norm": 2.200958251953125, + "learning_rate": 2.0780297938717776e-06, + "loss": 0.9839, + "step": 58125 + }, + { + "epoch": 17.39, + "grad_norm": 2.856152296066284, + "learning_rate": 2.075685244326117e-06, + "loss": 0.8033, + "step": 58130 + }, + { + "epoch": 17.39, + "grad_norm": 2.164071559906006, + "learning_rate": 2.0733419608665345e-06, + "loss": 0.8925, + "step": 58135 + }, + { + "epoch": 17.39, + "grad_norm": 1.5249561071395874, + "learning_rate": 2.0709999436224426e-06, + "loss": 0.958, + "step": 58140 + }, + { + "epoch": 17.4, + "grad_norm": 3.109286069869995, + "learning_rate": 2.0686591927231908e-06, + "loss": 0.8569, + "step": 58145 + }, + { + "epoch": 17.4, + "grad_norm": 3.086642265319824, + "learning_rate": 2.066319708298056e-06, + "loss": 1.0587, + "step": 58150 + }, + { + "epoch": 17.4, + "grad_norm": 3.6070919036865234, + "learning_rate": 2.063981490476247e-06, + "loss": 1.0945, + "step": 58155 + }, + { + "epoch": 17.4, + "grad_norm": 3.0385048389434814, + "learning_rate": 2.0616445393868977e-06, + "loss": 1.0345, + "step": 58160 + }, + { + "epoch": 17.4, + "grad_norm": 3.2083237171173096, + "learning_rate": 2.0593088551590783e-06, + "loss": 0.9921, + "step": 58165 + }, + { + "epoch": 17.4, + "grad_norm": 2.1536431312561035, + "learning_rate": 2.056974437921785e-06, + "loss": 1.1214, + "step": 58170 + }, + { + "epoch": 17.41, + "grad_norm": 4.799535274505615, + "learning_rate": 2.0546412878039517e-06, + "loss": 0.9159, + "step": 58175 + }, + { + "epoch": 17.41, + "grad_norm": 3.742593765258789, + "learning_rate": 2.052309404934419e-06, + "loss": 0.9264, + "step": 58180 + }, + { + "epoch": 17.41, + "grad_norm": 2.696674346923828, + "learning_rate": 2.049978789441992e-06, + "loss": 1.0157, + "step": 58185 + }, + { + "epoch": 17.41, + "grad_norm": 2.9800398349761963, + "learning_rate": 2.0476494414553765e-06, + "loss": 0.8998, + "step": 58190 + }, + { + "epoch": 17.41, + "grad_norm": 5.797310829162598, + "learning_rate": 2.045321361103231e-06, + "loss": 0.9081, + "step": 58195 + }, + { + "epoch": 17.41, + "grad_norm": 3.218557119369507, + "learning_rate": 2.0429945485141184e-06, + "loss": 0.8853, + "step": 58200 + }, + { + "epoch": 17.41, + "grad_norm": 4.598995208740234, + "learning_rate": 2.0411340113185823e-06, + "loss": 0.9268, + "step": 58205 + }, + { + "epoch": 17.42, + "grad_norm": 10.203455924987793, + "learning_rate": 2.0388094810267407e-06, + "loss": 0.9933, + "step": 58210 + }, + { + "epoch": 17.42, + "grad_norm": 1.8637256622314453, + "learning_rate": 2.0364862188575833e-06, + "loss": 1.0012, + "step": 58215 + }, + { + "epoch": 17.42, + "grad_norm": 4.718301296234131, + "learning_rate": 2.034164224939425e-06, + "loss": 1.0701, + "step": 58220 + }, + { + "epoch": 17.42, + "grad_norm": 3.8083252906799316, + "learning_rate": 2.0318434994005014e-06, + "loss": 0.8359, + "step": 58225 + }, + { + "epoch": 17.42, + "grad_norm": 3.1337733268737793, + "learning_rate": 2.0295240423689938e-06, + "loss": 0.968, + "step": 58230 + }, + { + "epoch": 17.42, + "grad_norm": 2.8172826766967773, + "learning_rate": 2.0272058539729882e-06, + "loss": 0.9344, + "step": 58235 + }, + { + "epoch": 17.42, + "grad_norm": 1.8639297485351562, + "learning_rate": 2.0248889343405324e-06, + "loss": 1.1076, + "step": 58240 + }, + { + "epoch": 17.43, + "grad_norm": 1.549034595489502, + "learning_rate": 2.0225732835995715e-06, + "loss": 1.0001, + "step": 58245 + }, + { + "epoch": 17.43, + "grad_norm": 1.5290988683700562, + "learning_rate": 2.020258901878014e-06, + "loss": 1.0315, + "step": 58250 + }, + { + "epoch": 17.43, + "grad_norm": 3.2922134399414062, + "learning_rate": 2.01794578930366e-06, + "loss": 0.976, + "step": 58255 + }, + { + "epoch": 17.43, + "grad_norm": 2.2940144538879395, + "learning_rate": 2.0156339460042804e-06, + "loss": 0.9909, + "step": 58260 + }, + { + "epoch": 17.43, + "grad_norm": 2.8492350578308105, + "learning_rate": 2.013323372107545e-06, + "loss": 1.0184, + "step": 58265 + }, + { + "epoch": 17.43, + "grad_norm": 7.312509059906006, + "learning_rate": 2.0110140677410655e-06, + "loss": 0.8777, + "step": 58270 + }, + { + "epoch": 17.44, + "grad_norm": 3.636141777038574, + "learning_rate": 2.0087060330323813e-06, + "loss": 0.8916, + "step": 58275 + }, + { + "epoch": 17.44, + "grad_norm": 4.8693366050720215, + "learning_rate": 2.006399268108969e-06, + "loss": 0.9538, + "step": 58280 + }, + { + "epoch": 17.44, + "grad_norm": 4.98232364654541, + "learning_rate": 2.00409377309822e-06, + "loss": 0.8613, + "step": 58285 + }, + { + "epoch": 17.44, + "grad_norm": 1.1212947368621826, + "learning_rate": 2.0017895481274724e-06, + "loss": 0.8985, + "step": 58290 + }, + { + "epoch": 17.44, + "grad_norm": 4.1995673179626465, + "learning_rate": 1.999486593323982e-06, + "loss": 0.9499, + "step": 58295 + }, + { + "epoch": 17.44, + "grad_norm": 5.486680507659912, + "learning_rate": 1.9971849088149393e-06, + "loss": 0.9938, + "step": 58300 + }, + { + "epoch": 17.44, + "grad_norm": 2.455383062362671, + "learning_rate": 1.9948844947274616e-06, + "loss": 1.2352, + "step": 58305 + }, + { + "epoch": 17.45, + "grad_norm": 2.302922010421753, + "learning_rate": 1.9925853511886026e-06, + "loss": 1.0595, + "step": 58310 + }, + { + "epoch": 17.45, + "grad_norm": 1.5419032573699951, + "learning_rate": 1.9902874783253393e-06, + "loss": 1.0541, + "step": 58315 + }, + { + "epoch": 17.45, + "grad_norm": 3.4702677726745605, + "learning_rate": 1.9879908762645837e-06, + "loss": 0.9092, + "step": 58320 + }, + { + "epoch": 17.45, + "grad_norm": 2.8185274600982666, + "learning_rate": 1.985695545133173e-06, + "loss": 0.9399, + "step": 58325 + }, + { + "epoch": 17.45, + "grad_norm": 2.852463960647583, + "learning_rate": 1.9834014850578703e-06, + "loss": 0.9442, + "step": 58330 + }, + { + "epoch": 17.45, + "grad_norm": 1.91469407081604, + "learning_rate": 1.9811086961653846e-06, + "loss": 1.1047, + "step": 58335 + }, + { + "epoch": 17.45, + "grad_norm": 3.1850225925445557, + "learning_rate": 1.9788171785823316e-06, + "loss": 0.9461, + "step": 58340 + }, + { + "epoch": 17.46, + "grad_norm": 2.731886386871338, + "learning_rate": 1.9765269324352855e-06, + "loss": 0.8916, + "step": 58345 + }, + { + "epoch": 17.46, + "grad_norm": 2.1498260498046875, + "learning_rate": 1.9742379578507163e-06, + "loss": 0.9886, + "step": 58350 + }, + { + "epoch": 17.46, + "grad_norm": 3.7887089252471924, + "learning_rate": 1.97195025495506e-06, + "loss": 0.9632, + "step": 58355 + }, + { + "epoch": 17.46, + "grad_norm": 2.147568702697754, + "learning_rate": 1.969663823874651e-06, + "loss": 0.911, + "step": 58360 + }, + { + "epoch": 17.46, + "grad_norm": 3.2035396099090576, + "learning_rate": 1.9673786647357693e-06, + "loss": 0.8811, + "step": 58365 + }, + { + "epoch": 17.46, + "grad_norm": 3.055304765701294, + "learning_rate": 1.9650947776646223e-06, + "loss": 1.1041, + "step": 58370 + }, + { + "epoch": 17.47, + "grad_norm": 3.3420493602752686, + "learning_rate": 1.9628121627873476e-06, + "loss": 0.9011, + "step": 58375 + }, + { + "epoch": 17.47, + "grad_norm": 1.865210771560669, + "learning_rate": 1.9605308202300136e-06, + "loss": 0.9144, + "step": 58380 + }, + { + "epoch": 17.47, + "grad_norm": 3.021851062774658, + "learning_rate": 1.958250750118612e-06, + "loss": 1.0783, + "step": 58385 + }, + { + "epoch": 17.47, + "grad_norm": 2.6499688625335693, + "learning_rate": 1.9559719525790693e-06, + "loss": 0.8827, + "step": 58390 + }, + { + "epoch": 17.47, + "grad_norm": 2.7535033226013184, + "learning_rate": 1.953694427737246e-06, + "loss": 0.9414, + "step": 58395 + }, + { + "epoch": 17.47, + "grad_norm": 1.7163090705871582, + "learning_rate": 1.9514181757189222e-06, + "loss": 0.8349, + "step": 58400 + }, + { + "epoch": 17.47, + "grad_norm": 1.809923529624939, + "learning_rate": 1.9491431966498162e-06, + "loss": 0.997, + "step": 58405 + }, + { + "epoch": 17.48, + "grad_norm": 2.4826242923736572, + "learning_rate": 1.94686949065557e-06, + "loss": 0.8481, + "step": 58410 + }, + { + "epoch": 17.48, + "grad_norm": 2.3572394847869873, + "learning_rate": 1.9445970578617605e-06, + "loss": 1.0958, + "step": 58415 + }, + { + "epoch": 17.48, + "grad_norm": 2.6370866298675537, + "learning_rate": 1.942325898393896e-06, + "loss": 0.8277, + "step": 58420 + }, + { + "epoch": 17.48, + "grad_norm": 2.8419244289398193, + "learning_rate": 1.9400560123773946e-06, + "loss": 0.9423, + "step": 58425 + }, + { + "epoch": 17.48, + "grad_norm": 0.7702686786651611, + "learning_rate": 1.937787399937638e-06, + "loss": 0.935, + "step": 58430 + }, + { + "epoch": 17.48, + "grad_norm": 2.7786147594451904, + "learning_rate": 1.9355200611999055e-06, + "loss": 0.778, + "step": 58435 + }, + { + "epoch": 17.48, + "grad_norm": 4.474719047546387, + "learning_rate": 1.933253996289433e-06, + "loss": 0.9768, + "step": 58440 + }, + { + "epoch": 17.49, + "grad_norm": 2.249441146850586, + "learning_rate": 1.930989205331357e-06, + "loss": 0.725, + "step": 58445 + }, + { + "epoch": 17.49, + "grad_norm": 3.9831950664520264, + "learning_rate": 1.9287256884507747e-06, + "loss": 0.8339, + "step": 58450 + }, + { + "epoch": 17.49, + "grad_norm": 3.666069269180298, + "learning_rate": 1.9264634457726914e-06, + "loss": 0.9266, + "step": 58455 + }, + { + "epoch": 17.49, + "grad_norm": 3.5630712509155273, + "learning_rate": 1.924202477422046e-06, + "loss": 0.8352, + "step": 58460 + }, + { + "epoch": 17.49, + "grad_norm": 2.659298896789551, + "learning_rate": 1.921942783523711e-06, + "loss": 0.942, + "step": 58465 + }, + { + "epoch": 17.49, + "grad_norm": 4.289942741394043, + "learning_rate": 1.9196843642024896e-06, + "loss": 0.9033, + "step": 58470 + }, + { + "epoch": 17.5, + "grad_norm": 5.508975505828857, + "learning_rate": 1.9174272195831096e-06, + "loss": 1.015, + "step": 58475 + }, + { + "epoch": 17.5, + "grad_norm": 3.3930578231811523, + "learning_rate": 1.915171349790232e-06, + "loss": 0.9336, + "step": 58480 + }, + { + "epoch": 17.5, + "grad_norm": 2.193418264389038, + "learning_rate": 1.9129167549484435e-06, + "loss": 0.9792, + "step": 58485 + }, + { + "epoch": 17.5, + "grad_norm": 1.5776050090789795, + "learning_rate": 1.9106634351822665e-06, + "loss": 1.0228, + "step": 58490 + }, + { + "epoch": 17.5, + "grad_norm": 1.6216871738433838, + "learning_rate": 1.9084113906161464e-06, + "loss": 0.9904, + "step": 58495 + }, + { + "epoch": 17.5, + "grad_norm": 2.6556360721588135, + "learning_rate": 1.9061606213744642e-06, + "loss": 1.0049, + "step": 58500 + }, + { + "epoch": 17.5, + "grad_norm": 3.8837389945983887, + "learning_rate": 1.9039111275815257e-06, + "loss": 0.8822, + "step": 58505 + }, + { + "epoch": 17.51, + "grad_norm": 2.5499298572540283, + "learning_rate": 1.9016629093615706e-06, + "loss": 1.0682, + "step": 58510 + }, + { + "epoch": 17.51, + "grad_norm": 4.589527606964111, + "learning_rate": 1.8994159668387662e-06, + "loss": 0.8225, + "step": 58515 + }, + { + "epoch": 17.51, + "grad_norm": 2.39145827293396, + "learning_rate": 1.8971703001371965e-06, + "loss": 0.7831, + "step": 58520 + }, + { + "epoch": 17.51, + "grad_norm": 2.863334894180298, + "learning_rate": 1.8949259093809068e-06, + "loss": 0.8574, + "step": 58525 + }, + { + "epoch": 17.51, + "grad_norm": 1.6067497730255127, + "learning_rate": 1.892682794693834e-06, + "loss": 0.9947, + "step": 58530 + }, + { + "epoch": 17.51, + "grad_norm": 2.9973134994506836, + "learning_rate": 1.8904409561998793e-06, + "loss": 0.8852, + "step": 58535 + }, + { + "epoch": 17.51, + "grad_norm": 12.899645805358887, + "learning_rate": 1.8882003940228431e-06, + "loss": 1.015, + "step": 58540 + }, + { + "epoch": 17.52, + "grad_norm": 4.770505428314209, + "learning_rate": 1.8859611082864826e-06, + "loss": 1.1247, + "step": 58545 + }, + { + "epoch": 17.52, + "grad_norm": 1.5795246362686157, + "learning_rate": 1.8837230991144622e-06, + "loss": 1.0028, + "step": 58550 + }, + { + "epoch": 17.52, + "grad_norm": 3.8196096420288086, + "learning_rate": 1.8814863666303833e-06, + "loss": 0.9865, + "step": 58555 + }, + { + "epoch": 17.52, + "grad_norm": 1.8591629266738892, + "learning_rate": 1.879250910957786e-06, + "loss": 0.8811, + "step": 58560 + }, + { + "epoch": 17.52, + "grad_norm": 2.9709808826446533, + "learning_rate": 1.877016732220127e-06, + "loss": 0.9346, + "step": 58565 + }, + { + "epoch": 17.52, + "grad_norm": 4.240139961242676, + "learning_rate": 1.8747838305407967e-06, + "loss": 1.0878, + "step": 58570 + }, + { + "epoch": 17.52, + "grad_norm": 3.321091890335083, + "learning_rate": 1.872552206043121e-06, + "loss": 0.9013, + "step": 58575 + }, + { + "epoch": 17.53, + "grad_norm": 2.4288010597229004, + "learning_rate": 1.8703218588503463e-06, + "loss": 0.8679, + "step": 58580 + }, + { + "epoch": 17.53, + "grad_norm": 3.094264507293701, + "learning_rate": 1.8680927890856515e-06, + "loss": 1.1084, + "step": 58585 + }, + { + "epoch": 17.53, + "grad_norm": 2.697632312774658, + "learning_rate": 1.8658649968721492e-06, + "loss": 1.087, + "step": 58590 + }, + { + "epoch": 17.53, + "grad_norm": 1.5430232286453247, + "learning_rate": 1.8636384823328773e-06, + "loss": 0.7268, + "step": 58595 + }, + { + "epoch": 17.53, + "grad_norm": 3.243159532546997, + "learning_rate": 1.861413245590804e-06, + "loss": 0.9764, + "step": 58600 + }, + { + "epoch": 17.53, + "grad_norm": 2.758650541305542, + "learning_rate": 1.8591892867688226e-06, + "loss": 1.088, + "step": 58605 + }, + { + "epoch": 17.54, + "grad_norm": 3.265185594558716, + "learning_rate": 1.8569666059897655e-06, + "loss": 0.9399, + "step": 58610 + }, + { + "epoch": 17.54, + "grad_norm": 2.0844991207122803, + "learning_rate": 1.8547452033763873e-06, + "loss": 1.0002, + "step": 58615 + }, + { + "epoch": 17.54, + "grad_norm": 3.4934003353118896, + "learning_rate": 1.852525079051376e-06, + "loss": 0.8137, + "step": 58620 + }, + { + "epoch": 17.54, + "grad_norm": 3.0585551261901855, + "learning_rate": 1.8503062331373362e-06, + "loss": 0.9706, + "step": 58625 + }, + { + "epoch": 17.54, + "grad_norm": 2.177954912185669, + "learning_rate": 1.8480886657568307e-06, + "loss": 0.9228, + "step": 58630 + }, + { + "epoch": 17.54, + "grad_norm": 8.818868637084961, + "learning_rate": 1.8458723770323122e-06, + "loss": 0.8919, + "step": 58635 + }, + { + "epoch": 17.54, + "grad_norm": 4.750818252563477, + "learning_rate": 1.8436573670862045e-06, + "loss": 0.9735, + "step": 58640 + }, + { + "epoch": 17.55, + "grad_norm": 4.335287094116211, + "learning_rate": 1.8414436360408265e-06, + "loss": 0.9834, + "step": 58645 + }, + { + "epoch": 17.55, + "grad_norm": 3.9974465370178223, + "learning_rate": 1.8392311840184445e-06, + "loss": 0.9222, + "step": 58650 + }, + { + "epoch": 17.55, + "grad_norm": 2.57463002204895, + "learning_rate": 1.8370200111412495e-06, + "loss": 1.0656, + "step": 58655 + }, + { + "epoch": 17.55, + "grad_norm": 3.5472350120544434, + "learning_rate": 1.8348101175313632e-06, + "loss": 0.8107, + "step": 58660 + }, + { + "epoch": 17.55, + "grad_norm": 2.0845704078674316, + "learning_rate": 1.8326015033108351e-06, + "loss": 0.9723, + "step": 58665 + }, + { + "epoch": 17.55, + "grad_norm": 8.588332176208496, + "learning_rate": 1.8303941686016402e-06, + "loss": 1.0629, + "step": 58670 + }, + { + "epoch": 17.55, + "grad_norm": 2.690653085708618, + "learning_rate": 1.8281881135257e-06, + "loss": 0.9891, + "step": 58675 + }, + { + "epoch": 17.56, + "grad_norm": 1.5229490995407104, + "learning_rate": 1.8259833382048425e-06, + "loss": 0.9973, + "step": 58680 + }, + { + "epoch": 17.56, + "grad_norm": 1.4649319648742676, + "learning_rate": 1.8237798427608337e-06, + "loss": 0.9495, + "step": 58685 + }, + { + "epoch": 17.56, + "grad_norm": 4.790569305419922, + "learning_rate": 1.8215776273153767e-06, + "loss": 0.8056, + "step": 58690 + }, + { + "epoch": 17.56, + "grad_norm": 5.313488960266113, + "learning_rate": 1.8193766919900934e-06, + "loss": 0.9817, + "step": 58695 + }, + { + "epoch": 17.56, + "grad_norm": 1.8241521120071411, + "learning_rate": 1.8171770369065422e-06, + "loss": 0.9368, + "step": 58700 + }, + { + "epoch": 17.56, + "grad_norm": 2.6710360050201416, + "learning_rate": 1.8149786621862036e-06, + "loss": 0.9306, + "step": 58705 + }, + { + "epoch": 17.57, + "grad_norm": 3.22537899017334, + "learning_rate": 1.8127815679504972e-06, + "loss": 0.9387, + "step": 58710 + }, + { + "epoch": 17.57, + "grad_norm": 1.9393733739852905, + "learning_rate": 1.8105857543207649e-06, + "loss": 1.0874, + "step": 58715 + }, + { + "epoch": 17.57, + "grad_norm": 4.174792289733887, + "learning_rate": 1.808391221418268e-06, + "loss": 1.0662, + "step": 58720 + }, + { + "epoch": 17.57, + "grad_norm": 4.3584513664245605, + "learning_rate": 1.8061979693642262e-06, + "loss": 0.9845, + "step": 58725 + }, + { + "epoch": 17.57, + "grad_norm": 2.3744728565216064, + "learning_rate": 1.8040059982797563e-06, + "loss": 0.9638, + "step": 58730 + }, + { + "epoch": 17.57, + "grad_norm": 3.663601875305176, + "learning_rate": 1.8018153082859313e-06, + "loss": 0.9749, + "step": 58735 + }, + { + "epoch": 17.57, + "grad_norm": 2.476334810256958, + "learning_rate": 1.7996258995037291e-06, + "loss": 0.8683, + "step": 58740 + }, + { + "epoch": 17.58, + "grad_norm": 2.907877206802368, + "learning_rate": 1.797437772054067e-06, + "loss": 0.8443, + "step": 58745 + }, + { + "epoch": 17.58, + "grad_norm": 2.9625954627990723, + "learning_rate": 1.7952509260578092e-06, + "loss": 0.942, + "step": 58750 + }, + { + "epoch": 17.58, + "grad_norm": 3.8954741954803467, + "learning_rate": 1.7930653616357152e-06, + "loss": 0.956, + "step": 58755 + }, + { + "epoch": 17.58, + "grad_norm": 3.0243756771087646, + "learning_rate": 1.7908810789085073e-06, + "loss": 1.2546, + "step": 58760 + }, + { + "epoch": 17.58, + "grad_norm": 1.2879854440689087, + "learning_rate": 1.7886980779968032e-06, + "loss": 0.9019, + "step": 58765 + }, + { + "epoch": 17.58, + "grad_norm": 5.267662048339844, + "learning_rate": 1.786516359021187e-06, + "loss": 0.8835, + "step": 58770 + }, + { + "epoch": 17.58, + "grad_norm": 1.7943377494812012, + "learning_rate": 1.7843359221021372e-06, + "loss": 0.9029, + "step": 58775 + }, + { + "epoch": 17.59, + "grad_norm": 5.033475399017334, + "learning_rate": 1.7821567673600854e-06, + "loss": 0.8688, + "step": 58780 + }, + { + "epoch": 17.59, + "grad_norm": 3.698159694671631, + "learning_rate": 1.7799788949153796e-06, + "loss": 1.0448, + "step": 58785 + }, + { + "epoch": 17.59, + "grad_norm": 2.3559699058532715, + "learning_rate": 1.777802304888304e-06, + "loss": 0.7721, + "step": 58790 + }, + { + "epoch": 17.59, + "grad_norm": 3.055173873901367, + "learning_rate": 1.7756269973990713e-06, + "loss": 0.8932, + "step": 58795 + }, + { + "epoch": 17.59, + "grad_norm": 2.6839029788970947, + "learning_rate": 1.7734529725678184e-06, + "loss": 1.1173, + "step": 58800 + }, + { + "epoch": 17.59, + "grad_norm": 3.986969470977783, + "learning_rate": 1.7712802305146131e-06, + "loss": 0.8831, + "step": 58805 + }, + { + "epoch": 17.6, + "grad_norm": 3.205456018447876, + "learning_rate": 1.7691087713594595e-06, + "loss": 0.91, + "step": 58810 + }, + { + "epoch": 17.6, + "grad_norm": 3.0267038345336914, + "learning_rate": 1.7669385952222755e-06, + "loss": 0.9393, + "step": 58815 + }, + { + "epoch": 17.6, + "grad_norm": 3.005690336227417, + "learning_rate": 1.7647697022229321e-06, + "loss": 0.8653, + "step": 58820 + }, + { + "epoch": 17.6, + "grad_norm": 3.8134701251983643, + "learning_rate": 1.762602092481197e-06, + "loss": 1.0195, + "step": 58825 + }, + { + "epoch": 17.6, + "grad_norm": 5.009188175201416, + "learning_rate": 1.760435766116797e-06, + "loss": 0.9841, + "step": 58830 + }, + { + "epoch": 17.6, + "grad_norm": 2.0699079036712646, + "learning_rate": 1.758270723249375e-06, + "loss": 0.953, + "step": 58835 + }, + { + "epoch": 17.6, + "grad_norm": 2.2950937747955322, + "learning_rate": 1.7561069639985023e-06, + "loss": 1.0804, + "step": 58840 + }, + { + "epoch": 17.61, + "grad_norm": 1.1012648344039917, + "learning_rate": 1.753944488483686e-06, + "loss": 0.9962, + "step": 58845 + }, + { + "epoch": 17.61, + "grad_norm": 3.6749930381774902, + "learning_rate": 1.7517832968243442e-06, + "loss": 1.0799, + "step": 58850 + }, + { + "epoch": 17.61, + "grad_norm": 3.0792107582092285, + "learning_rate": 1.749623389139851e-06, + "loss": 0.8246, + "step": 58855 + }, + { + "epoch": 17.61, + "grad_norm": 5.0220046043396, + "learning_rate": 1.7474647655494863e-06, + "loss": 0.8415, + "step": 58860 + }, + { + "epoch": 17.61, + "grad_norm": 3.235442876815796, + "learning_rate": 1.7453074261724795e-06, + "loss": 0.944, + "step": 58865 + }, + { + "epoch": 17.61, + "grad_norm": 1.411855936050415, + "learning_rate": 1.7431513711279685e-06, + "loss": 1.0868, + "step": 58870 + }, + { + "epoch": 17.61, + "grad_norm": 1.5485836267471313, + "learning_rate": 1.7409966005350331e-06, + "loss": 0.8386, + "step": 58875 + }, + { + "epoch": 17.62, + "grad_norm": 2.2840311527252197, + "learning_rate": 1.7388431145126782e-06, + "loss": 1.0442, + "step": 58880 + }, + { + "epoch": 17.62, + "grad_norm": 2.2418808937072754, + "learning_rate": 1.7366909131798393e-06, + "loss": 0.9909, + "step": 58885 + }, + { + "epoch": 17.62, + "grad_norm": 2.9602952003479004, + "learning_rate": 1.7345399966553821e-06, + "loss": 1.0754, + "step": 58890 + }, + { + "epoch": 17.62, + "grad_norm": 3.5335211753845215, + "learning_rate": 1.7323903650580953e-06, + "loss": 0.9721, + "step": 58895 + }, + { + "epoch": 17.62, + "grad_norm": 3.265909194946289, + "learning_rate": 1.7302420185067058e-06, + "loss": 0.7851, + "step": 58900 + }, + { + "epoch": 17.62, + "grad_norm": 1.4548457860946655, + "learning_rate": 1.7280949571198607e-06, + "loss": 1.0308, + "step": 58905 + }, + { + "epoch": 17.63, + "grad_norm": 2.820035457611084, + "learning_rate": 1.7259491810161399e-06, + "loss": 1.0166, + "step": 58910 + }, + { + "epoch": 17.63, + "grad_norm": 3.4491255283355713, + "learning_rate": 1.7238046903140542e-06, + "loss": 0.9413, + "step": 58915 + }, + { + "epoch": 17.63, + "grad_norm": 1.849202275276184, + "learning_rate": 1.7216614851320451e-06, + "loss": 1.1075, + "step": 58920 + }, + { + "epoch": 17.63, + "grad_norm": 2.7424187660217285, + "learning_rate": 1.7195195655884706e-06, + "loss": 1.1669, + "step": 58925 + }, + { + "epoch": 17.63, + "grad_norm": 2.260991096496582, + "learning_rate": 1.717378931801636e-06, + "loss": 1.0069, + "step": 58930 + }, + { + "epoch": 17.63, + "grad_norm": 3.9417827129364014, + "learning_rate": 1.7152395838897605e-06, + "loss": 0.9019, + "step": 58935 + }, + { + "epoch": 17.63, + "grad_norm": 3.6205639839172363, + "learning_rate": 1.7131015219710029e-06, + "loss": 0.974, + "step": 58940 + }, + { + "epoch": 17.64, + "grad_norm": 2.234180450439453, + "learning_rate": 1.7109647461634348e-06, + "loss": 0.9879, + "step": 58945 + }, + { + "epoch": 17.64, + "grad_norm": 3.515759229660034, + "learning_rate": 1.7088292565850845e-06, + "loss": 1.0616, + "step": 58950 + }, + { + "epoch": 17.64, + "grad_norm": 1.790371060371399, + "learning_rate": 1.7066950533538767e-06, + "loss": 0.8907, + "step": 58955 + }, + { + "epoch": 17.64, + "grad_norm": 1.755277395248413, + "learning_rate": 1.7045621365876952e-06, + "loss": 0.9207, + "step": 58960 + }, + { + "epoch": 17.64, + "grad_norm": 1.3836179971694946, + "learning_rate": 1.702430506404329e-06, + "loss": 1.1258, + "step": 58965 + }, + { + "epoch": 17.64, + "grad_norm": 2.935575246810913, + "learning_rate": 1.7003001629215143e-06, + "loss": 0.9072, + "step": 58970 + }, + { + "epoch": 17.64, + "grad_norm": 2.9841513633728027, + "learning_rate": 1.6981711062568989e-06, + "loss": 0.9736, + "step": 58975 + }, + { + "epoch": 17.65, + "grad_norm": 8.933850288391113, + "learning_rate": 1.696043336528072e-06, + "loss": 0.9844, + "step": 58980 + }, + { + "epoch": 17.65, + "grad_norm": 1.3362852334976196, + "learning_rate": 1.6939168538525508e-06, + "loss": 1.0783, + "step": 58985 + }, + { + "epoch": 17.65, + "grad_norm": 3.199964761734009, + "learning_rate": 1.6917916583477744e-06, + "loss": 0.8654, + "step": 58990 + }, + { + "epoch": 17.65, + "grad_norm": 2.5024354457855225, + "learning_rate": 1.6896677501311186e-06, + "loss": 0.9169, + "step": 58995 + }, + { + "epoch": 17.65, + "grad_norm": 3.7208096981048584, + "learning_rate": 1.6875451293198812e-06, + "loss": 0.945, + "step": 59000 + }, + { + "epoch": 17.65, + "grad_norm": 6.412079811096191, + "learning_rate": 1.6854237960312963e-06, + "loss": 1.0159, + "step": 59005 + }, + { + "epoch": 17.66, + "grad_norm": 6.456012725830078, + "learning_rate": 1.68330375038252e-06, + "loss": 0.9998, + "step": 59010 + }, + { + "epoch": 17.66, + "grad_norm": 1.0914913415908813, + "learning_rate": 1.681184992490642e-06, + "loss": 0.9272, + "step": 59015 + }, + { + "epoch": 17.66, + "grad_norm": 1.7044358253479004, + "learning_rate": 1.6790675224726798e-06, + "loss": 1.0046, + "step": 59020 + }, + { + "epoch": 17.66, + "grad_norm": 3.6310768127441406, + "learning_rate": 1.6769513404455733e-06, + "loss": 1.0404, + "step": 59025 + }, + { + "epoch": 17.66, + "grad_norm": 1.748637080192566, + "learning_rate": 1.6748364465262039e-06, + "loss": 0.8425, + "step": 59030 + }, + { + "epoch": 17.66, + "grad_norm": 3.776822090148926, + "learning_rate": 1.6727228408313782e-06, + "loss": 1.1857, + "step": 59035 + }, + { + "epoch": 17.66, + "grad_norm": 2.349539279937744, + "learning_rate": 1.6706105234778114e-06, + "loss": 0.987, + "step": 59040 + }, + { + "epoch": 17.67, + "grad_norm": 8.603898048400879, + "learning_rate": 1.6684994945821847e-06, + "loss": 0.8815, + "step": 59045 + }, + { + "epoch": 17.67, + "grad_norm": 3.434723377227783, + "learning_rate": 1.6663897542610718e-06, + "loss": 0.9244, + "step": 59050 + }, + { + "epoch": 17.67, + "grad_norm": 3.597698211669922, + "learning_rate": 1.6642813026310072e-06, + "loss": 0.7415, + "step": 59055 + }, + { + "epoch": 17.67, + "grad_norm": 4.349695205688477, + "learning_rate": 1.6621741398084201e-06, + "loss": 0.9339, + "step": 59060 + }, + { + "epoch": 17.67, + "grad_norm": 1.6020125150680542, + "learning_rate": 1.6600682659097062e-06, + "loss": 0.9233, + "step": 59065 + }, + { + "epoch": 17.67, + "grad_norm": 3.7563843727111816, + "learning_rate": 1.6579636810511584e-06, + "loss": 1.053, + "step": 59070 + }, + { + "epoch": 17.67, + "grad_norm": 1.2841134071350098, + "learning_rate": 1.6558603853490145e-06, + "loss": 0.9631, + "step": 59075 + }, + { + "epoch": 17.68, + "grad_norm": 2.884526014328003, + "learning_rate": 1.6537583789194345e-06, + "loss": 0.7626, + "step": 59080 + }, + { + "epoch": 17.68, + "grad_norm": 1.4702849388122559, + "learning_rate": 1.651657661878514e-06, + "loss": 0.8606, + "step": 59085 + }, + { + "epoch": 17.68, + "grad_norm": 3.296031951904297, + "learning_rate": 1.6495582343422688e-06, + "loss": 0.9722, + "step": 59090 + }, + { + "epoch": 17.68, + "grad_norm": 2.626152753829956, + "learning_rate": 1.6474600964266534e-06, + "loss": 0.8891, + "step": 59095 + }, + { + "epoch": 17.68, + "grad_norm": 6.174269676208496, + "learning_rate": 1.6453632482475418e-06, + "loss": 1.2104, + "step": 59100 + }, + { + "epoch": 17.68, + "grad_norm": 2.552041530609131, + "learning_rate": 1.6432676899207439e-06, + "loss": 1.1366, + "step": 59105 + }, + { + "epoch": 17.69, + "grad_norm": 2.2161214351654053, + "learning_rate": 1.6411734215619923e-06, + "loss": 0.8592, + "step": 59110 + }, + { + "epoch": 17.69, + "grad_norm": 3.290187120437622, + "learning_rate": 1.6390804432869527e-06, + "loss": 1.1547, + "step": 59115 + }, + { + "epoch": 17.69, + "grad_norm": 4.858699798583984, + "learning_rate": 1.6369887552112163e-06, + "loss": 0.7281, + "step": 59120 + }, + { + "epoch": 17.69, + "grad_norm": 3.530651569366455, + "learning_rate": 1.6348983574503097e-06, + "loss": 0.983, + "step": 59125 + }, + { + "epoch": 17.69, + "grad_norm": 3.629481077194214, + "learning_rate": 1.6328092501196823e-06, + "loss": 0.8718, + "step": 59130 + }, + { + "epoch": 17.69, + "grad_norm": 1.404110074043274, + "learning_rate": 1.6307214333347032e-06, + "loss": 0.9193, + "step": 59135 + }, + { + "epoch": 17.69, + "grad_norm": 2.185800313949585, + "learning_rate": 1.6286349072106965e-06, + "loss": 0.8672, + "step": 59140 + }, + { + "epoch": 17.7, + "grad_norm": 3.7186684608459473, + "learning_rate": 1.6265496718628815e-06, + "loss": 0.9765, + "step": 59145 + }, + { + "epoch": 17.7, + "grad_norm": 4.324354648590088, + "learning_rate": 1.6244657274064407e-06, + "loss": 0.9801, + "step": 59150 + }, + { + "epoch": 17.7, + "grad_norm": 3.0012760162353516, + "learning_rate": 1.6223830739564544e-06, + "loss": 0.9174, + "step": 59155 + }, + { + "epoch": 17.7, + "grad_norm": 2.157304286956787, + "learning_rate": 1.6203017116279584e-06, + "loss": 0.9751, + "step": 59160 + }, + { + "epoch": 17.7, + "grad_norm": 1.090956449508667, + "learning_rate": 1.618221640535894e-06, + "loss": 0.891, + "step": 59165 + }, + { + "epoch": 17.7, + "grad_norm": 3.6914825439453125, + "learning_rate": 1.6161428607951417e-06, + "loss": 1.0343, + "step": 59170 + }, + { + "epoch": 17.7, + "grad_norm": 3.7854952812194824, + "learning_rate": 1.6140653725205152e-06, + "loss": 1.0016, + "step": 59175 + }, + { + "epoch": 17.71, + "grad_norm": 4.192695617675781, + "learning_rate": 1.6119891758267503e-06, + "loss": 1.0018, + "step": 59180 + }, + { + "epoch": 17.71, + "grad_norm": 2.511425256729126, + "learning_rate": 1.6099142708285108e-06, + "loss": 0.9797, + "step": 59185 + }, + { + "epoch": 17.71, + "grad_norm": 2.2904865741729736, + "learning_rate": 1.6078406576403943e-06, + "loss": 0.8575, + "step": 59190 + }, + { + "epoch": 17.71, + "grad_norm": 2.1356563568115234, + "learning_rate": 1.6057683363769227e-06, + "loss": 1.042, + "step": 59195 + }, + { + "epoch": 17.71, + "grad_norm": 2.6813127994537354, + "learning_rate": 1.6036973071525518e-06, + "loss": 0.6984, + "step": 59200 + }, + { + "epoch": 17.71, + "grad_norm": 2.049669027328491, + "learning_rate": 1.6016275700816568e-06, + "loss": 1.1138, + "step": 59205 + }, + { + "epoch": 17.71, + "grad_norm": 1.8621686697006226, + "learning_rate": 1.599559125278549e-06, + "loss": 1.0503, + "step": 59210 + }, + { + "epoch": 17.72, + "grad_norm": 2.028589963912964, + "learning_rate": 1.5974919728574706e-06, + "loss": 0.7726, + "step": 59215 + }, + { + "epoch": 17.72, + "grad_norm": 6.469555377960205, + "learning_rate": 1.5954261129325825e-06, + "loss": 1.0135, + "step": 59220 + }, + { + "epoch": 17.72, + "grad_norm": 3.396925449371338, + "learning_rate": 1.5933615456179885e-06, + "loss": 1.0272, + "step": 59225 + }, + { + "epoch": 17.72, + "grad_norm": 3.9737088680267334, + "learning_rate": 1.5912982710276968e-06, + "loss": 0.975, + "step": 59230 + }, + { + "epoch": 17.72, + "grad_norm": 3.608037233352661, + "learning_rate": 1.5892362892756779e-06, + "loss": 0.9554, + "step": 59235 + }, + { + "epoch": 17.72, + "grad_norm": 1.3548368215560913, + "learning_rate": 1.587175600475796e-06, + "loss": 0.9917, + "step": 59240 + }, + { + "epoch": 17.73, + "grad_norm": 5.358870983123779, + "learning_rate": 1.5851162047418793e-06, + "loss": 1.0518, + "step": 59245 + }, + { + "epoch": 17.73, + "grad_norm": 1.2182296514511108, + "learning_rate": 1.5830581021876455e-06, + "loss": 1.0005, + "step": 59250 + }, + { + "epoch": 17.73, + "grad_norm": 2.458919048309326, + "learning_rate": 1.5810012929267815e-06, + "loss": 1.116, + "step": 59255 + }, + { + "epoch": 17.73, + "grad_norm": 2.0615477561950684, + "learning_rate": 1.578945777072871e-06, + "loss": 0.8852, + "step": 59260 + }, + { + "epoch": 17.73, + "grad_norm": 3.5121617317199707, + "learning_rate": 1.5768915547394375e-06, + "loss": 1.0029, + "step": 59265 + }, + { + "epoch": 17.73, + "grad_norm": 2.439354658126831, + "learning_rate": 1.57483862603994e-06, + "loss": 1.0474, + "step": 59270 + }, + { + "epoch": 17.73, + "grad_norm": 1.8715204000473022, + "learning_rate": 1.5727869910877547e-06, + "loss": 1.1325, + "step": 59275 + }, + { + "epoch": 17.74, + "grad_norm": 1.4641121625900269, + "learning_rate": 1.5707366499961907e-06, + "loss": 0.8829, + "step": 59280 + }, + { + "epoch": 17.74, + "grad_norm": 2.544196367263794, + "learning_rate": 1.5686876028784907e-06, + "loss": 1.0373, + "step": 59285 + }, + { + "epoch": 17.74, + "grad_norm": 2.298362970352173, + "learning_rate": 1.5666398498478175e-06, + "loss": 1.0759, + "step": 59290 + }, + { + "epoch": 17.74, + "grad_norm": 3.0841598510742188, + "learning_rate": 1.564593391017266e-06, + "loss": 0.9647, + "step": 59295 + }, + { + "epoch": 17.74, + "grad_norm": 4.1196722984313965, + "learning_rate": 1.5625482264998658e-06, + "loss": 0.8123, + "step": 59300 + }, + { + "epoch": 17.74, + "grad_norm": 1.699238657951355, + "learning_rate": 1.5605043564085626e-06, + "loss": 1.1025, + "step": 59305 + }, + { + "epoch": 17.74, + "grad_norm": 3.654175043106079, + "learning_rate": 1.5584617808562407e-06, + "loss": 0.9208, + "step": 59310 + }, + { + "epoch": 17.75, + "grad_norm": 1.3798152208328247, + "learning_rate": 1.5564204999557075e-06, + "loss": 0.9044, + "step": 59315 + }, + { + "epoch": 17.75, + "grad_norm": 3.55618953704834, + "learning_rate": 1.554380513819706e-06, + "loss": 1.0186, + "step": 59320 + }, + { + "epoch": 17.75, + "grad_norm": 3.153846263885498, + "learning_rate": 1.5523418225608904e-06, + "loss": 0.9396, + "step": 59325 + }, + { + "epoch": 17.75, + "grad_norm": 2.9710476398468018, + "learning_rate": 1.5503044262918738e-06, + "loss": 0.8835, + "step": 59330 + }, + { + "epoch": 17.75, + "grad_norm": 3.084639072418213, + "learning_rate": 1.5482683251251572e-06, + "loss": 0.8919, + "step": 59335 + }, + { + "epoch": 17.75, + "grad_norm": 2.0667994022369385, + "learning_rate": 1.5462335191732153e-06, + "loss": 1.0935, + "step": 59340 + }, + { + "epoch": 17.76, + "grad_norm": 2.048433303833008, + "learning_rate": 1.5442000085484077e-06, + "loss": 1.0158, + "step": 59345 + }, + { + "epoch": 17.76, + "grad_norm": 3.5328614711761475, + "learning_rate": 1.5421677933630612e-06, + "loss": 0.923, + "step": 59350 + }, + { + "epoch": 17.76, + "grad_norm": 1.7786338329315186, + "learning_rate": 1.5401368737294003e-06, + "loss": 0.9076, + "step": 59355 + }, + { + "epoch": 17.76, + "grad_norm": 2.726869583129883, + "learning_rate": 1.5381072497595961e-06, + "loss": 0.9152, + "step": 59360 + }, + { + "epoch": 17.76, + "grad_norm": 3.708962917327881, + "learning_rate": 1.5360789215657395e-06, + "loss": 0.9758, + "step": 59365 + }, + { + "epoch": 17.76, + "grad_norm": 4.803073406219482, + "learning_rate": 1.5340518892598548e-06, + "loss": 1.1745, + "step": 59370 + }, + { + "epoch": 17.76, + "grad_norm": 2.451441764831543, + "learning_rate": 1.5320261529538943e-06, + "loss": 0.9447, + "step": 59375 + }, + { + "epoch": 17.77, + "grad_norm": 6.812480449676514, + "learning_rate": 1.530001712759735e-06, + "loss": 0.9028, + "step": 59380 + }, + { + "epoch": 17.77, + "grad_norm": 6.00867223739624, + "learning_rate": 1.5279785687891846e-06, + "loss": 0.8863, + "step": 59385 + }, + { + "epoch": 17.77, + "grad_norm": 3.013056516647339, + "learning_rate": 1.5259567211539788e-06, + "loss": 0.9852, + "step": 59390 + }, + { + "epoch": 17.77, + "grad_norm": 1.796088457107544, + "learning_rate": 1.5239361699657867e-06, + "loss": 0.9819, + "step": 59395 + }, + { + "epoch": 17.77, + "grad_norm": 4.895589351654053, + "learning_rate": 1.5219169153361967e-06, + "loss": 0.9859, + "step": 59400 + }, + { + "epoch": 17.77, + "grad_norm": 1.5124635696411133, + "learning_rate": 1.5198989573767308e-06, + "loss": 0.9221, + "step": 59405 + }, + { + "epoch": 17.77, + "grad_norm": 2.5901122093200684, + "learning_rate": 1.5178822961988387e-06, + "loss": 0.8347, + "step": 59410 + }, + { + "epoch": 17.78, + "grad_norm": 5.9025397300720215, + "learning_rate": 1.5158669319139007e-06, + "loss": 0.9824, + "step": 59415 + }, + { + "epoch": 17.78, + "grad_norm": 2.827467918395996, + "learning_rate": 1.5138528646332195e-06, + "loss": 1.1154, + "step": 59420 + }, + { + "epoch": 17.78, + "grad_norm": 2.0498061180114746, + "learning_rate": 1.5118400944680366e-06, + "loss": 0.9927, + "step": 59425 + }, + { + "epoch": 17.78, + "grad_norm": 1.4182449579238892, + "learning_rate": 1.509828621529505e-06, + "loss": 0.8675, + "step": 59430 + }, + { + "epoch": 17.78, + "grad_norm": 4.468570232391357, + "learning_rate": 1.507818445928727e-06, + "loss": 0.9225, + "step": 59435 + }, + { + "epoch": 17.78, + "grad_norm": 2.804314374923706, + "learning_rate": 1.5058095677767113e-06, + "loss": 1.0115, + "step": 59440 + }, + { + "epoch": 17.79, + "grad_norm": 3.139026641845703, + "learning_rate": 1.5038019871844222e-06, + "loss": 0.9551, + "step": 59445 + }, + { + "epoch": 17.79, + "grad_norm": 2.992780923843384, + "learning_rate": 1.501795704262718e-06, + "loss": 1.0013, + "step": 59450 + }, + { + "epoch": 17.79, + "grad_norm": 4.901608943939209, + "learning_rate": 1.4997907191224152e-06, + "loss": 1.04, + "step": 59455 + }, + { + "epoch": 17.79, + "grad_norm": 2.3052425384521484, + "learning_rate": 1.4977870318742426e-06, + "loss": 0.9111, + "step": 59460 + }, + { + "epoch": 17.79, + "grad_norm": 3.878119707107544, + "learning_rate": 1.495784642628864e-06, + "loss": 1.0182, + "step": 59465 + }, + { + "epoch": 17.79, + "grad_norm": 3.6295268535614014, + "learning_rate": 1.4937835514968663e-06, + "loss": 0.9679, + "step": 59470 + }, + { + "epoch": 17.79, + "grad_norm": 3.5464370250701904, + "learning_rate": 1.491783758588769e-06, + "loss": 1.0276, + "step": 59475 + }, + { + "epoch": 17.8, + "grad_norm": 5.657863140106201, + "learning_rate": 1.4897852640150228e-06, + "loss": 1.0774, + "step": 59480 + }, + { + "epoch": 17.8, + "grad_norm": 1.0737026929855347, + "learning_rate": 1.487788067885998e-06, + "loss": 0.9233, + "step": 59485 + }, + { + "epoch": 17.8, + "grad_norm": 10.904875755310059, + "learning_rate": 1.4857921703119976e-06, + "loss": 0.7518, + "step": 59490 + }, + { + "epoch": 17.8, + "grad_norm": 2.129427909851074, + "learning_rate": 1.483797571403256e-06, + "loss": 0.9003, + "step": 59495 + }, + { + "epoch": 17.8, + "grad_norm": 2.353217601776123, + "learning_rate": 1.4818042712699294e-06, + "loss": 1.0227, + "step": 59500 + }, + { + "epoch": 17.8, + "grad_norm": 2.1998493671417236, + "learning_rate": 1.4798122700221074e-06, + "loss": 1.0736, + "step": 59505 + }, + { + "epoch": 17.8, + "grad_norm": 5.145125865936279, + "learning_rate": 1.4778215677698048e-06, + "loss": 0.916, + "step": 59510 + }, + { + "epoch": 17.81, + "grad_norm": 1.4517490863800049, + "learning_rate": 1.4758321646229672e-06, + "loss": 0.8587, + "step": 59515 + }, + { + "epoch": 17.81, + "grad_norm": 1.5610160827636719, + "learning_rate": 1.4738440606914706e-06, + "loss": 1.056, + "step": 59520 + }, + { + "epoch": 17.81, + "grad_norm": 7.770084857940674, + "learning_rate": 1.4718572560851073e-06, + "loss": 0.8253, + "step": 59525 + }, + { + "epoch": 17.81, + "grad_norm": 2.7066264152526855, + "learning_rate": 1.469871750913618e-06, + "loss": 1.0866, + "step": 59530 + }, + { + "epoch": 17.81, + "grad_norm": 4.087286949157715, + "learning_rate": 1.4678875452866448e-06, + "loss": 1.0194, + "step": 59535 + }, + { + "epoch": 17.81, + "grad_norm": 1.208440899848938, + "learning_rate": 1.4659046393137893e-06, + "loss": 0.9872, + "step": 59540 + }, + { + "epoch": 17.82, + "grad_norm": 3.085111141204834, + "learning_rate": 1.4639230331045528e-06, + "loss": 0.8561, + "step": 59545 + }, + { + "epoch": 17.82, + "grad_norm": 1.8525588512420654, + "learning_rate": 1.4619427267683894e-06, + "loss": 0.8675, + "step": 59550 + }, + { + "epoch": 17.82, + "grad_norm": 3.9756102561950684, + "learning_rate": 1.4599637204146587e-06, + "loss": 0.8729, + "step": 59555 + }, + { + "epoch": 17.82, + "grad_norm": 3.060441017150879, + "learning_rate": 1.4579860141526624e-06, + "loss": 0.9836, + "step": 59560 + }, + { + "epoch": 17.82, + "grad_norm": 2.611800193786621, + "learning_rate": 1.4560096080916325e-06, + "loss": 0.9945, + "step": 59565 + }, + { + "epoch": 17.82, + "grad_norm": 3.2961905002593994, + "learning_rate": 1.454034502340712e-06, + "loss": 0.9673, + "step": 59570 + }, + { + "epoch": 17.82, + "grad_norm": 3.6914517879486084, + "learning_rate": 1.452060697009e-06, + "loss": 0.9933, + "step": 59575 + }, + { + "epoch": 17.83, + "grad_norm": 1.3202792406082153, + "learning_rate": 1.4500881922054926e-06, + "loss": 1.0733, + "step": 59580 + }, + { + "epoch": 17.83, + "grad_norm": 3.6315410137176514, + "learning_rate": 1.4481169880391388e-06, + "loss": 1.0638, + "step": 59585 + }, + { + "epoch": 17.83, + "grad_norm": 3.1422181129455566, + "learning_rate": 1.4461470846188013e-06, + "loss": 1.0285, + "step": 59590 + }, + { + "epoch": 17.83, + "grad_norm": 2.8239080905914307, + "learning_rate": 1.4441784820532766e-06, + "loss": 0.9607, + "step": 59595 + }, + { + "epoch": 17.83, + "grad_norm": 2.704333782196045, + "learning_rate": 1.4422111804512916e-06, + "loss": 0.9963, + "step": 59600 + }, + { + "epoch": 17.83, + "grad_norm": 2.927246332168579, + "learning_rate": 1.4402451799214927e-06, + "loss": 1.0692, + "step": 59605 + }, + { + "epoch": 17.83, + "grad_norm": 3.1765265464782715, + "learning_rate": 1.438280480572468e-06, + "loss": 0.8938, + "step": 59610 + }, + { + "epoch": 17.84, + "grad_norm": 4.268390655517578, + "learning_rate": 1.4363170825127226e-06, + "loss": 1.0705, + "step": 59615 + }, + { + "epoch": 17.84, + "grad_norm": 16.1699275970459, + "learning_rate": 1.4343549858506833e-06, + "loss": 0.9742, + "step": 59620 + }, + { + "epoch": 17.84, + "grad_norm": 2.6375677585601807, + "learning_rate": 1.432394190694733e-06, + "loss": 0.9267, + "step": 59625 + }, + { + "epoch": 17.84, + "grad_norm": 1.9000338315963745, + "learning_rate": 1.4304346971531434e-06, + "loss": 0.994, + "step": 59630 + }, + { + "epoch": 17.84, + "grad_norm": 3.127405881881714, + "learning_rate": 1.4284765053341532e-06, + "loss": 1.0222, + "step": 59635 + }, + { + "epoch": 17.84, + "grad_norm": 2.3889312744140625, + "learning_rate": 1.4265196153459032e-06, + "loss": 0.7955, + "step": 59640 + }, + { + "epoch": 17.85, + "grad_norm": 2.0697555541992188, + "learning_rate": 1.424564027296471e-06, + "loss": 1.0505, + "step": 59645 + }, + { + "epoch": 17.85, + "grad_norm": 3.0797669887542725, + "learning_rate": 1.4226097412938677e-06, + "loss": 0.9022, + "step": 59650 + }, + { + "epoch": 17.85, + "grad_norm": 3.074493885040283, + "learning_rate": 1.4206567574460121e-06, + "loss": 0.9206, + "step": 59655 + }, + { + "epoch": 17.85, + "grad_norm": 2.3382043838500977, + "learning_rate": 1.4187050758607824e-06, + "loss": 1.0961, + "step": 59660 + }, + { + "epoch": 17.85, + "grad_norm": 4.023479461669922, + "learning_rate": 1.4167546966459527e-06, + "loss": 0.9317, + "step": 59665 + }, + { + "epoch": 17.85, + "grad_norm": 1.3876618146896362, + "learning_rate": 1.415195331053179e-06, + "loss": 1.0919, + "step": 59670 + }, + { + "epoch": 17.85, + "grad_norm": 2.543372869491577, + "learning_rate": 1.4132472963764826e-06, + "loss": 0.8302, + "step": 59675 + }, + { + "epoch": 17.86, + "grad_norm": 3.604238271713257, + "learning_rate": 1.411300564371626e-06, + "loss": 1.0208, + "step": 59680 + }, + { + "epoch": 17.86, + "grad_norm": 3.2941768169403076, + "learning_rate": 1.4093551351461149e-06, + "loss": 1.0286, + "step": 59685 + }, + { + "epoch": 17.86, + "grad_norm": 3.470616579055786, + "learning_rate": 1.407411008807405e-06, + "loss": 1.003, + "step": 59690 + }, + { + "epoch": 17.86, + "grad_norm": 4.124062538146973, + "learning_rate": 1.4054681854628548e-06, + "loss": 0.9581, + "step": 59695 + }, + { + "epoch": 17.86, + "grad_norm": 2.6687521934509277, + "learning_rate": 1.4035266652197786e-06, + "loss": 0.9555, + "step": 59700 + }, + { + "epoch": 17.86, + "grad_norm": 6.374357223510742, + "learning_rate": 1.4015864481853963e-06, + "loss": 0.9077, + "step": 59705 + }, + { + "epoch": 17.86, + "grad_norm": 5.034076690673828, + "learning_rate": 1.3996475344668636e-06, + "loss": 0.9172, + "step": 59710 + }, + { + "epoch": 17.87, + "grad_norm": 5.536380767822266, + "learning_rate": 1.3977099241712643e-06, + "loss": 1.0173, + "step": 59715 + }, + { + "epoch": 17.87, + "grad_norm": 2.3674838542938232, + "learning_rate": 1.3957736174056157e-06, + "loss": 1.0644, + "step": 59720 + }, + { + "epoch": 17.87, + "grad_norm": 6.831422805786133, + "learning_rate": 1.3938386142768517e-06, + "loss": 1.0125, + "step": 59725 + }, + { + "epoch": 17.87, + "grad_norm": 2.7988386154174805, + "learning_rate": 1.3919049148918478e-06, + "loss": 0.9512, + "step": 59730 + }, + { + "epoch": 17.87, + "grad_norm": 1.8349326848983765, + "learning_rate": 1.3899725193573937e-06, + "loss": 0.8805, + "step": 59735 + }, + { + "epoch": 17.87, + "grad_norm": 4.807056903839111, + "learning_rate": 1.3880414277802178e-06, + "loss": 0.9358, + "step": 59740 + }, + { + "epoch": 17.88, + "grad_norm": 1.473981261253357, + "learning_rate": 1.3861116402669683e-06, + "loss": 1.0964, + "step": 59745 + }, + { + "epoch": 17.88, + "grad_norm": 1.3169115781784058, + "learning_rate": 1.384183156924229e-06, + "loss": 1.0681, + "step": 59750 + }, + { + "epoch": 17.88, + "grad_norm": 3.194916009902954, + "learning_rate": 1.382255977858507e-06, + "loss": 1.1128, + "step": 59755 + }, + { + "epoch": 17.88, + "grad_norm": 3.641432285308838, + "learning_rate": 1.3803301031762362e-06, + "loss": 0.9769, + "step": 59760 + }, + { + "epoch": 17.88, + "grad_norm": 1.8899619579315186, + "learning_rate": 1.378405532983787e-06, + "loss": 1.1635, + "step": 59765 + }, + { + "epoch": 17.88, + "grad_norm": 2.5514962673187256, + "learning_rate": 1.3764822673874411e-06, + "loss": 1.0953, + "step": 59770 + }, + { + "epoch": 17.88, + "grad_norm": 2.5154950618743896, + "learning_rate": 1.3745603064934303e-06, + "loss": 0.8913, + "step": 59775 + }, + { + "epoch": 17.89, + "grad_norm": 2.303077220916748, + "learning_rate": 1.3726396504078892e-06, + "loss": 0.865, + "step": 59780 + }, + { + "epoch": 17.89, + "grad_norm": 4.337673664093018, + "learning_rate": 1.3707202992369078e-06, + "loss": 0.8839, + "step": 59785 + }, + { + "epoch": 17.89, + "grad_norm": 3.780333995819092, + "learning_rate": 1.3688022530864763e-06, + "loss": 0.8552, + "step": 59790 + }, + { + "epoch": 17.89, + "grad_norm": 1.5258413553237915, + "learning_rate": 1.3668855120625406e-06, + "loss": 0.8518, + "step": 59795 + }, + { + "epoch": 17.89, + "grad_norm": 2.358565330505371, + "learning_rate": 1.3649700762709495e-06, + "loss": 0.954, + "step": 59800 + }, + { + "epoch": 17.89, + "grad_norm": 1.6710665225982666, + "learning_rate": 1.363055945817493e-06, + "loss": 1.056, + "step": 59805 + }, + { + "epoch": 17.89, + "grad_norm": 2.9221675395965576, + "learning_rate": 1.3611431208078896e-06, + "loss": 1.0636, + "step": 59810 + }, + { + "epoch": 17.9, + "grad_norm": 2.2900736331939697, + "learning_rate": 1.359231601347777e-06, + "loss": 0.9176, + "step": 59815 + }, + { + "epoch": 17.9, + "grad_norm": 2.580756187438965, + "learning_rate": 1.3573213875427316e-06, + "loss": 0.9621, + "step": 59820 + }, + { + "epoch": 17.9, + "grad_norm": 3.829488515853882, + "learning_rate": 1.3554124794982498e-06, + "loss": 0.919, + "step": 59825 + }, + { + "epoch": 17.9, + "grad_norm": 2.805250883102417, + "learning_rate": 1.3535048773197611e-06, + "loss": 0.9749, + "step": 59830 + }, + { + "epoch": 17.9, + "grad_norm": 1.3262838125228882, + "learning_rate": 1.3515985811126174e-06, + "loss": 1.0604, + "step": 59835 + }, + { + "epoch": 17.9, + "grad_norm": 3.3138718605041504, + "learning_rate": 1.349693590982104e-06, + "loss": 1.0974, + "step": 59840 + }, + { + "epoch": 17.9, + "grad_norm": 1.5498145818710327, + "learning_rate": 1.347789907033431e-06, + "loss": 0.9018, + "step": 59845 + }, + { + "epoch": 17.91, + "grad_norm": 2.336205244064331, + "learning_rate": 1.3458875293717366e-06, + "loss": 1.0362, + "step": 59850 + }, + { + "epoch": 17.91, + "grad_norm": 2.3710501194000244, + "learning_rate": 1.343986458102084e-06, + "loss": 0.9496, + "step": 59855 + }, + { + "epoch": 17.91, + "grad_norm": 5.016022205352783, + "learning_rate": 1.3420866933294752e-06, + "loss": 1.1354, + "step": 59860 + }, + { + "epoch": 17.91, + "grad_norm": 5.362601280212402, + "learning_rate": 1.3401882351588207e-06, + "loss": 0.7814, + "step": 59865 + }, + { + "epoch": 17.91, + "grad_norm": 1.2273238897323608, + "learning_rate": 1.338291083694984e-06, + "loss": 1.0238, + "step": 59870 + }, + { + "epoch": 17.91, + "grad_norm": 2.9525585174560547, + "learning_rate": 1.3363952390427286e-06, + "loss": 0.8545, + "step": 59875 + }, + { + "epoch": 17.92, + "grad_norm": 2.0316078662872314, + "learning_rate": 1.3345007013067763e-06, + "loss": 1.1211, + "step": 59880 + }, + { + "epoch": 17.92, + "grad_norm": 3.018373727798462, + "learning_rate": 1.3326074705917401e-06, + "loss": 0.8393, + "step": 59885 + }, + { + "epoch": 17.92, + "grad_norm": 1.7973732948303223, + "learning_rate": 1.3307155470022038e-06, + "loss": 0.9589, + "step": 59890 + }, + { + "epoch": 17.92, + "grad_norm": 1.8492597341537476, + "learning_rate": 1.3288249306426387e-06, + "loss": 1.0118, + "step": 59895 + }, + { + "epoch": 17.92, + "grad_norm": 2.7371528148651123, + "learning_rate": 1.3269356216174645e-06, + "loss": 1.0925, + "step": 59900 + }, + { + "epoch": 17.92, + "grad_norm": 3.346501350402832, + "learning_rate": 1.3250476200310363e-06, + "loss": 1.0044, + "step": 59905 + }, + { + "epoch": 17.92, + "grad_norm": 2.5383148193359375, + "learning_rate": 1.3231609259876126e-06, + "loss": 0.7532, + "step": 59910 + }, + { + "epoch": 17.93, + "grad_norm": 2.1317331790924072, + "learning_rate": 1.3212755395914074e-06, + "loss": 1.1127, + "step": 59915 + }, + { + "epoch": 17.93, + "grad_norm": 1.8375227451324463, + "learning_rate": 1.319391460946534e-06, + "loss": 1.0262, + "step": 59920 + }, + { + "epoch": 17.93, + "grad_norm": 1.335551142692566, + "learning_rate": 1.3175086901570626e-06, + "loss": 0.9814, + "step": 59925 + }, + { + "epoch": 17.93, + "grad_norm": 1.4829717874526978, + "learning_rate": 1.3156272273269682e-06, + "loss": 0.8828, + "step": 59930 + }, + { + "epoch": 17.93, + "grad_norm": 1.677444577217102, + "learning_rate": 1.313747072560162e-06, + "loss": 0.8892, + "step": 59935 + }, + { + "epoch": 17.93, + "grad_norm": 5.412178993225098, + "learning_rate": 1.3118682259604832e-06, + "loss": 0.9235, + "step": 59940 + }, + { + "epoch": 17.93, + "grad_norm": 2.3385939598083496, + "learning_rate": 1.3099906876317014e-06, + "loss": 1.1063, + "step": 59945 + }, + { + "epoch": 17.94, + "grad_norm": 10.752957344055176, + "learning_rate": 1.3081144576775089e-06, + "loss": 0.9414, + "step": 59950 + }, + { + "epoch": 17.94, + "grad_norm": 5.151787281036377, + "learning_rate": 1.306239536201531e-06, + "loss": 0.8772, + "step": 59955 + }, + { + "epoch": 17.94, + "grad_norm": 1.5891977548599243, + "learning_rate": 1.3043659233073102e-06, + "loss": 0.951, + "step": 59960 + }, + { + "epoch": 17.94, + "grad_norm": 3.044320583343506, + "learning_rate": 1.3024936190983355e-06, + "loss": 1.0105, + "step": 59965 + }, + { + "epoch": 17.94, + "grad_norm": 4.976190090179443, + "learning_rate": 1.3006226236779996e-06, + "loss": 0.9063, + "step": 59970 + }, + { + "epoch": 17.94, + "grad_norm": 3.8189163208007812, + "learning_rate": 1.2987529371496498e-06, + "loss": 0.9465, + "step": 59975 + }, + { + "epoch": 17.95, + "grad_norm": 1.8560311794281006, + "learning_rate": 1.296884559616529e-06, + "loss": 1.015, + "step": 59980 + }, + { + "epoch": 17.95, + "grad_norm": 4.682796478271484, + "learning_rate": 1.2950174911818407e-06, + "loss": 0.8412, + "step": 59985 + }, + { + "epoch": 17.95, + "grad_norm": 1.3414361476898193, + "learning_rate": 1.2931517319487024e-06, + "loss": 0.9081, + "step": 59990 + }, + { + "epoch": 17.95, + "grad_norm": 3.5960850715637207, + "learning_rate": 1.2912872820201427e-06, + "loss": 1.1309, + "step": 59995 + }, + { + "epoch": 17.95, + "grad_norm": 2.8373448848724365, + "learning_rate": 1.2894241414991488e-06, + "loss": 0.9147, + "step": 60000 + }, + { + "epoch": 17.95, + "grad_norm": 2.71930193901062, + "learning_rate": 1.2875623104886104e-06, + "loss": 0.9289, + "step": 60005 + }, + { + "epoch": 17.95, + "grad_norm": 2.6316206455230713, + "learning_rate": 1.2857017890913619e-06, + "loss": 0.8153, + "step": 60010 + }, + { + "epoch": 17.96, + "grad_norm": 5.35076379776001, + "learning_rate": 1.2838425774101466e-06, + "loss": 0.8993, + "step": 60015 + }, + { + "epoch": 17.96, + "grad_norm": 2.4414889812469482, + "learning_rate": 1.2819846755476622e-06, + "loss": 0.993, + "step": 60020 + }, + { + "epoch": 17.96, + "grad_norm": 2.973344087600708, + "learning_rate": 1.2801280836065077e-06, + "loss": 1.1067, + "step": 60025 + }, + { + "epoch": 17.96, + "grad_norm": 3.7870774269104004, + "learning_rate": 1.2782728016892231e-06, + "loss": 1.0017, + "step": 60030 + }, + { + "epoch": 17.96, + "grad_norm": 1.356503963470459, + "learning_rate": 1.2764188298982738e-06, + "loss": 0.9037, + "step": 60035 + }, + { + "epoch": 17.96, + "grad_norm": 1.4454678297042847, + "learning_rate": 1.2745661683360554e-06, + "loss": 1.0378, + "step": 60040 + }, + { + "epoch": 17.96, + "grad_norm": 1.8113714456558228, + "learning_rate": 1.2727148171048836e-06, + "loss": 0.8887, + "step": 60045 + }, + { + "epoch": 17.97, + "grad_norm": 4.557557582855225, + "learning_rate": 1.2708647763070152e-06, + "loss": 0.8867, + "step": 60050 + }, + { + "epoch": 17.97, + "grad_norm": 2.7099125385284424, + "learning_rate": 1.2690160460446105e-06, + "loss": 0.9419, + "step": 60055 + }, + { + "epoch": 17.97, + "grad_norm": 4.08752965927124, + "learning_rate": 1.2671686264197874e-06, + "loss": 0.8125, + "step": 60060 + }, + { + "epoch": 17.97, + "grad_norm": 1.8687279224395752, + "learning_rate": 1.265322517534573e-06, + "loss": 0.7801, + "step": 60065 + }, + { + "epoch": 17.97, + "grad_norm": 4.718087673187256, + "learning_rate": 1.2634777194909242e-06, + "loss": 0.9194, + "step": 60070 + }, + { + "epoch": 17.97, + "grad_norm": 2.097789764404297, + "learning_rate": 1.2616342323907293e-06, + "loss": 0.9047, + "step": 60075 + }, + { + "epoch": 17.98, + "grad_norm": 2.0857861042022705, + "learning_rate": 1.2597920563357984e-06, + "loss": 0.9454, + "step": 60080 + }, + { + "epoch": 17.98, + "grad_norm": 1.1246609687805176, + "learning_rate": 1.2579511914278807e-06, + "loss": 1.0526, + "step": 60085 + }, + { + "epoch": 17.98, + "grad_norm": 3.467363119125366, + "learning_rate": 1.2561116377686338e-06, + "loss": 0.8471, + "step": 60090 + }, + { + "epoch": 17.98, + "grad_norm": 1.099082350730896, + "learning_rate": 1.254273395459668e-06, + "loss": 1.0249, + "step": 60095 + }, + { + "epoch": 17.98, + "grad_norm": 3.2694308757781982, + "learning_rate": 1.252436464602494e-06, + "loss": 1.0626, + "step": 60100 + }, + { + "epoch": 17.98, + "grad_norm": 5.15191125869751, + "learning_rate": 1.2506008452985746e-06, + "loss": 0.9525, + "step": 60105 + }, + { + "epoch": 17.98, + "grad_norm": 4.068530082702637, + "learning_rate": 1.248766537649279e-06, + "loss": 1.0582, + "step": 60110 + }, + { + "epoch": 17.99, + "grad_norm": 1.9388269186019897, + "learning_rate": 1.2469335417559236e-06, + "loss": 0.8859, + "step": 60115 + }, + { + "epoch": 17.99, + "grad_norm": 3.2138922214508057, + "learning_rate": 1.245101857719738e-06, + "loss": 0.856, + "step": 60120 + }, + { + "epoch": 17.99, + "grad_norm": 2.7871532440185547, + "learning_rate": 1.2432714856418837e-06, + "loss": 1.0063, + "step": 60125 + }, + { + "epoch": 17.99, + "grad_norm": 4.555139541625977, + "learning_rate": 1.241442425623454e-06, + "loss": 0.7705, + "step": 60130 + }, + { + "epoch": 17.99, + "grad_norm": 4.78867244720459, + "learning_rate": 1.2396146777654604e-06, + "loss": 1.1051, + "step": 60135 + }, + { + "epoch": 17.99, + "grad_norm": 2.8767669200897217, + "learning_rate": 1.2377882421688526e-06, + "loss": 0.8788, + "step": 60140 + }, + { + "epoch": 17.99, + "grad_norm": 5.313528060913086, + "learning_rate": 1.2359631189344994e-06, + "loss": 1.0437, + "step": 60145 + }, + { + "epoch": 18.0, + "grad_norm": 3.7219114303588867, + "learning_rate": 1.234139308163204e-06, + "loss": 0.8678, + "step": 60150 + }, + { + "epoch": 18.0, + "grad_norm": 3.0899860858917236, + "learning_rate": 1.2323168099556886e-06, + "loss": 0.7888, + "step": 60155 + }, + { + "epoch": 18.0, + "grad_norm": 1.9980682134628296, + "learning_rate": 1.2304956244126115e-06, + "loss": 1.1034, + "step": 60160 + }, + { + "epoch": 18.0, + "grad_norm": 2.616227626800537, + "learning_rate": 1.228675751634556e-06, + "loss": 1.0001, + "step": 60165 + }, + { + "epoch": 18.0, + "grad_norm": 8.774314880371094, + "learning_rate": 1.226857191722028e-06, + "loss": 0.9797, + "step": 60170 + }, + { + "epoch": 18.0, + "grad_norm": 1.6300761699676514, + "learning_rate": 1.2250399447754663e-06, + "loss": 1.0554, + "step": 60175 + }, + { + "epoch": 18.01, + "grad_norm": 2.3184926509857178, + "learning_rate": 1.223224010895238e-06, + "loss": 0.9453, + "step": 60180 + }, + { + "epoch": 18.01, + "grad_norm": 5.169106483459473, + "learning_rate": 1.2214093901816297e-06, + "loss": 0.7603, + "step": 60185 + }, + { + "epoch": 18.01, + "grad_norm": 2.690866231918335, + "learning_rate": 1.2195960827348696e-06, + "loss": 0.9451, + "step": 60190 + }, + { + "epoch": 18.01, + "grad_norm": 3.5127060413360596, + "learning_rate": 1.2177840886550911e-06, + "loss": 1.0737, + "step": 60195 + }, + { + "epoch": 18.01, + "grad_norm": 2.8311686515808105, + "learning_rate": 1.215973408042384e-06, + "loss": 0.7565, + "step": 60200 + }, + { + "epoch": 18.01, + "grad_norm": 3.0393755435943604, + "learning_rate": 1.2141640409967403e-06, + "loss": 0.9511, + "step": 60205 + }, + { + "epoch": 18.01, + "grad_norm": 1.882336139678955, + "learning_rate": 1.2123559876180968e-06, + "loss": 1.1201, + "step": 60210 + }, + { + "epoch": 18.02, + "grad_norm": 3.4361493587493896, + "learning_rate": 1.2105492480063013e-06, + "loss": 0.5961, + "step": 60215 + }, + { + "epoch": 18.02, + "grad_norm": 2.833055019378662, + "learning_rate": 1.2087438222611463e-06, + "loss": 1.1693, + "step": 60220 + }, + { + "epoch": 18.02, + "grad_norm": 3.044955015182495, + "learning_rate": 1.2069397104823381e-06, + "loss": 1.0318, + "step": 60225 + }, + { + "epoch": 18.02, + "grad_norm": 3.623894691467285, + "learning_rate": 1.2051369127695195e-06, + "loss": 1.0285, + "step": 60230 + }, + { + "epoch": 18.02, + "grad_norm": 1.35172700881958, + "learning_rate": 1.2033354292222547e-06, + "loss": 0.9498, + "step": 60235 + }, + { + "epoch": 18.02, + "grad_norm": 1.692931056022644, + "learning_rate": 1.2015352599400399e-06, + "loss": 0.8844, + "step": 60240 + }, + { + "epoch": 18.02, + "grad_norm": 2.4801394939422607, + "learning_rate": 1.1997364050222947e-06, + "loss": 0.8478, + "step": 60245 + }, + { + "epoch": 18.03, + "grad_norm": 2.9214184284210205, + "learning_rate": 1.1979388645683682e-06, + "loss": 0.8282, + "step": 60250 + }, + { + "epoch": 18.03, + "grad_norm": 1.6741719245910645, + "learning_rate": 1.1961426386775386e-06, + "loss": 1.0689, + "step": 60255 + }, + { + "epoch": 18.03, + "grad_norm": 2.074756383895874, + "learning_rate": 1.1943477274490079e-06, + "loss": 0.9789, + "step": 60260 + }, + { + "epoch": 18.03, + "grad_norm": 4.593778610229492, + "learning_rate": 1.1925541309819071e-06, + "loss": 1.0066, + "step": 60265 + }, + { + "epoch": 18.03, + "grad_norm": 4.825080394744873, + "learning_rate": 1.1907618493752965e-06, + "loss": 1.0119, + "step": 60270 + }, + { + "epoch": 18.03, + "grad_norm": 2.745469093322754, + "learning_rate": 1.1889708827281604e-06, + "loss": 1.0391, + "step": 60275 + }, + { + "epoch": 18.04, + "grad_norm": 3.292245864868164, + "learning_rate": 1.1871812311394115e-06, + "loss": 0.8295, + "step": 60280 + }, + { + "epoch": 18.04, + "grad_norm": 2.7830917835235596, + "learning_rate": 1.1853928947078957e-06, + "loss": 1.0809, + "step": 60285 + }, + { + "epoch": 18.04, + "grad_norm": 3.622986316680908, + "learning_rate": 1.1836058735323674e-06, + "loss": 1.0924, + "step": 60290 + }, + { + "epoch": 18.04, + "grad_norm": 2.329106569290161, + "learning_rate": 1.1818201677115393e-06, + "loss": 0.9369, + "step": 60295 + }, + { + "epoch": 18.04, + "grad_norm": 2.0072081089019775, + "learning_rate": 1.1800357773440212e-06, + "loss": 0.8325, + "step": 60300 + }, + { + "epoch": 18.04, + "grad_norm": 2.3265275955200195, + "learning_rate": 1.1782527025283706e-06, + "loss": 0.827, + "step": 60305 + }, + { + "epoch": 18.04, + "grad_norm": 4.844279766082764, + "learning_rate": 1.1764709433630589e-06, + "loss": 0.88, + "step": 60310 + }, + { + "epoch": 18.05, + "grad_norm": 2.5498387813568115, + "learning_rate": 1.174690499946496e-06, + "loss": 0.9118, + "step": 60315 + }, + { + "epoch": 18.05, + "grad_norm": 2.94402813911438, + "learning_rate": 1.1729113723770114e-06, + "loss": 1.0359, + "step": 60320 + }, + { + "epoch": 18.05, + "grad_norm": 8.016414642333984, + "learning_rate": 1.1711335607528629e-06, + "loss": 0.8906, + "step": 60325 + }, + { + "epoch": 18.05, + "grad_norm": 1.8241318464279175, + "learning_rate": 1.1693570651722414e-06, + "loss": 0.9315, + "step": 60330 + }, + { + "epoch": 18.05, + "grad_norm": 3.5544328689575195, + "learning_rate": 1.167581885733257e-06, + "loss": 0.8926, + "step": 60335 + }, + { + "epoch": 18.05, + "grad_norm": 4.292209148406982, + "learning_rate": 1.1658080225339541e-06, + "loss": 0.9165, + "step": 60340 + }, + { + "epoch": 18.05, + "grad_norm": 2.8831968307495117, + "learning_rate": 1.1640354756722981e-06, + "loss": 1.1162, + "step": 60345 + }, + { + "epoch": 18.06, + "grad_norm": 3.58367657661438, + "learning_rate": 1.1622642452461862e-06, + "loss": 0.8821, + "step": 60350 + }, + { + "epoch": 18.06, + "grad_norm": 2.5088016986846924, + "learning_rate": 1.1604943313534455e-06, + "loss": 1.0714, + "step": 60355 + }, + { + "epoch": 18.06, + "grad_norm": 4.4307780265808105, + "learning_rate": 1.15872573409182e-06, + "loss": 1.0852, + "step": 60360 + }, + { + "epoch": 18.06, + "grad_norm": 3.551413059234619, + "learning_rate": 1.1569584535589929e-06, + "loss": 0.9419, + "step": 60365 + }, + { + "epoch": 18.06, + "grad_norm": 3.166933059692383, + "learning_rate": 1.1551924898525634e-06, + "loss": 0.9873, + "step": 60370 + }, + { + "epoch": 18.06, + "grad_norm": 4.157015800476074, + "learning_rate": 1.1534278430700707e-06, + "loss": 0.9274, + "step": 60375 + }, + { + "epoch": 18.06, + "grad_norm": 3.1233348846435547, + "learning_rate": 1.1516645133089727e-06, + "loss": 0.9412, + "step": 60380 + }, + { + "epoch": 18.07, + "grad_norm": 1.6911544799804688, + "learning_rate": 1.1499025006666498e-06, + "loss": 0.8977, + "step": 60385 + }, + { + "epoch": 18.07, + "grad_norm": 2.505157470703125, + "learning_rate": 1.148141805240427e-06, + "loss": 1.0199, + "step": 60390 + }, + { + "epoch": 18.07, + "grad_norm": 3.155808925628662, + "learning_rate": 1.1463824271275319e-06, + "loss": 1.0877, + "step": 60395 + }, + { + "epoch": 18.07, + "grad_norm": 2.7817635536193848, + "learning_rate": 1.1446243664251482e-06, + "loss": 0.8983, + "step": 60400 + }, + { + "epoch": 18.07, + "grad_norm": 3.2476534843444824, + "learning_rate": 1.1428676232303592e-06, + "loss": 1.1019, + "step": 60405 + }, + { + "epoch": 18.07, + "grad_norm": 2.456259250640869, + "learning_rate": 1.1411121976401956e-06, + "loss": 1.1568, + "step": 60410 + }, + { + "epoch": 18.08, + "grad_norm": 3.9711551666259766, + "learning_rate": 1.139358089751605e-06, + "loss": 0.9174, + "step": 60415 + }, + { + "epoch": 18.08, + "grad_norm": 4.814849376678467, + "learning_rate": 1.137605299661465e-06, + "loss": 0.8198, + "step": 60420 + }, + { + "epoch": 18.08, + "grad_norm": 2.4193475246429443, + "learning_rate": 1.135853827466582e-06, + "loss": 0.9368, + "step": 60425 + }, + { + "epoch": 18.08, + "grad_norm": 4.294716835021973, + "learning_rate": 1.1341036732636868e-06, + "loss": 0.932, + "step": 60430 + }, + { + "epoch": 18.08, + "grad_norm": 2.2720484733581543, + "learning_rate": 1.1323548371494352e-06, + "loss": 1.0362, + "step": 60435 + }, + { + "epoch": 18.08, + "grad_norm": 2.023895740509033, + "learning_rate": 1.1306073192204198e-06, + "loss": 0.9637, + "step": 60440 + }, + { + "epoch": 18.08, + "grad_norm": 1.7245546579360962, + "learning_rate": 1.1288611195731518e-06, + "loss": 1.1067, + "step": 60445 + }, + { + "epoch": 18.09, + "grad_norm": 3.7294325828552246, + "learning_rate": 1.1271162383040685e-06, + "loss": 0.9301, + "step": 60450 + }, + { + "epoch": 18.09, + "grad_norm": 3.503115177154541, + "learning_rate": 1.125372675509545e-06, + "loss": 1.0133, + "step": 60455 + }, + { + "epoch": 18.09, + "grad_norm": 3.752464532852173, + "learning_rate": 1.1236304312858686e-06, + "loss": 1.0687, + "step": 60460 + }, + { + "epoch": 18.09, + "grad_norm": 1.6993839740753174, + "learning_rate": 1.1218895057292678e-06, + "loss": 0.8668, + "step": 60465 + }, + { + "epoch": 18.09, + "grad_norm": 1.8143806457519531, + "learning_rate": 1.1201498989358878e-06, + "loss": 0.925, + "step": 60470 + }, + { + "epoch": 18.09, + "grad_norm": 1.6880947351455688, + "learning_rate": 1.118411611001813e-06, + "loss": 1.0283, + "step": 60475 + }, + { + "epoch": 18.09, + "grad_norm": 3.8792147636413574, + "learning_rate": 1.116674642023033e-06, + "loss": 0.9818, + "step": 60480 + }, + { + "epoch": 18.1, + "grad_norm": 3.6388609409332275, + "learning_rate": 1.1149389920954933e-06, + "loss": 0.9869, + "step": 60485 + }, + { + "epoch": 18.1, + "grad_norm": 2.7923548221588135, + "learning_rate": 1.1132046613150398e-06, + "loss": 1.0708, + "step": 60490 + }, + { + "epoch": 18.1, + "grad_norm": 3.55141282081604, + "learning_rate": 1.1114716497774708e-06, + "loss": 0.918, + "step": 60495 + }, + { + "epoch": 18.1, + "grad_norm": 5.645336151123047, + "learning_rate": 1.1097399575784845e-06, + "loss": 0.8854, + "step": 60500 + }, + { + "epoch": 18.1, + "grad_norm": 4.349913120269775, + "learning_rate": 1.108009584813735e-06, + "loss": 0.9067, + "step": 60505 + }, + { + "epoch": 18.1, + "grad_norm": 3.4893198013305664, + "learning_rate": 1.1062805315787794e-06, + "loss": 0.8483, + "step": 60510 + }, + { + "epoch": 18.11, + "grad_norm": 4.366122245788574, + "learning_rate": 1.1045527979691134e-06, + "loss": 0.9011, + "step": 60515 + }, + { + "epoch": 18.11, + "grad_norm": 4.734853267669678, + "learning_rate": 1.1028263840801577e-06, + "loss": 0.9712, + "step": 60520 + }, + { + "epoch": 18.11, + "grad_norm": 3.5593693256378174, + "learning_rate": 1.1011012900072616e-06, + "loss": 0.9062, + "step": 60525 + }, + { + "epoch": 18.11, + "grad_norm": 3.4199602603912354, + "learning_rate": 1.099377515845698e-06, + "loss": 0.8207, + "step": 60530 + }, + { + "epoch": 18.11, + "grad_norm": 1.6151660680770874, + "learning_rate": 1.097655061690675e-06, + "loss": 0.8984, + "step": 60535 + }, + { + "epoch": 18.11, + "grad_norm": 8.06431770324707, + "learning_rate": 1.0959339276373132e-06, + "loss": 1.0251, + "step": 60540 + }, + { + "epoch": 18.11, + "grad_norm": 1.1797876358032227, + "learning_rate": 1.0942141137806782e-06, + "loss": 1.0986, + "step": 60545 + }, + { + "epoch": 18.12, + "grad_norm": 2.656918525695801, + "learning_rate": 1.0924956202157443e-06, + "loss": 1.0583, + "step": 60550 + }, + { + "epoch": 18.12, + "grad_norm": 4.957900524139404, + "learning_rate": 1.09077844703743e-06, + "loss": 0.981, + "step": 60555 + }, + { + "epoch": 18.12, + "grad_norm": 2.982581853866577, + "learning_rate": 1.0890625943405703e-06, + "loss": 1.0173, + "step": 60560 + }, + { + "epoch": 18.12, + "grad_norm": 3.549740791320801, + "learning_rate": 1.0873480622199283e-06, + "loss": 0.8502, + "step": 60565 + }, + { + "epoch": 18.12, + "grad_norm": 2.2221314907073975, + "learning_rate": 1.085634850770198e-06, + "loss": 1.0024, + "step": 60570 + }, + { + "epoch": 18.12, + "grad_norm": 4.6512675285339355, + "learning_rate": 1.0839229600859952e-06, + "loss": 1.1475, + "step": 60575 + }, + { + "epoch": 18.12, + "grad_norm": 1.9820324182510376, + "learning_rate": 1.082212390261872e-06, + "loss": 1.0201, + "step": 60580 + }, + { + "epoch": 18.13, + "grad_norm": 1.954908013343811, + "learning_rate": 1.0805031413922888e-06, + "loss": 0.9867, + "step": 60585 + }, + { + "epoch": 18.13, + "grad_norm": 2.869485855102539, + "learning_rate": 1.078795213571665e-06, + "loss": 1.0383, + "step": 60590 + }, + { + "epoch": 18.13, + "grad_norm": 5.065496444702148, + "learning_rate": 1.0770886068943082e-06, + "loss": 0.8963, + "step": 60595 + }, + { + "epoch": 18.13, + "grad_norm": 2.9921464920043945, + "learning_rate": 1.0753833214544872e-06, + "loss": 0.9474, + "step": 60600 + }, + { + "epoch": 18.13, + "grad_norm": 1.8409907817840576, + "learning_rate": 1.0736793573463744e-06, + "loss": 1.0217, + "step": 60605 + }, + { + "epoch": 18.13, + "grad_norm": 4.104241371154785, + "learning_rate": 1.0719767146640803e-06, + "loss": 0.884, + "step": 60610 + }, + { + "epoch": 18.14, + "grad_norm": 3.4805054664611816, + "learning_rate": 1.0702753935016408e-06, + "loss": 0.8448, + "step": 60615 + }, + { + "epoch": 18.14, + "grad_norm": 1.8355894088745117, + "learning_rate": 1.068575393953017e-06, + "loss": 0.9421, + "step": 60620 + }, + { + "epoch": 18.14, + "grad_norm": 2.950094699859619, + "learning_rate": 1.0668767161121e-06, + "loss": 0.9723, + "step": 60625 + }, + { + "epoch": 18.14, + "grad_norm": 4.175784587860107, + "learning_rate": 1.0651793600727018e-06, + "loss": 0.7554, + "step": 60630 + }, + { + "epoch": 18.14, + "grad_norm": 1.5484530925750732, + "learning_rate": 1.0634833259285743e-06, + "loss": 0.9147, + "step": 60635 + }, + { + "epoch": 18.14, + "grad_norm": 2.7090961933135986, + "learning_rate": 1.061788613773379e-06, + "loss": 0.9327, + "step": 60640 + }, + { + "epoch": 18.14, + "grad_norm": 2.208608627319336, + "learning_rate": 1.0600952237007162e-06, + "loss": 1.1507, + "step": 60645 + }, + { + "epoch": 18.15, + "grad_norm": 3.7526814937591553, + "learning_rate": 1.0584031558041108e-06, + "loss": 1.0072, + "step": 60650 + }, + { + "epoch": 18.15, + "grad_norm": 4.413444519042969, + "learning_rate": 1.0567124101770103e-06, + "loss": 0.9917, + "step": 60655 + }, + { + "epoch": 18.15, + "grad_norm": 5.183726787567139, + "learning_rate": 1.0550229869127987e-06, + "loss": 1.0578, + "step": 60660 + }, + { + "epoch": 18.15, + "grad_norm": 1.6198534965515137, + "learning_rate": 1.0533348861047814e-06, + "loss": 0.9569, + "step": 60665 + }, + { + "epoch": 18.15, + "grad_norm": 3.9471218585968018, + "learning_rate": 1.0516481078461786e-06, + "loss": 0.9391, + "step": 60670 + }, + { + "epoch": 18.15, + "grad_norm": 2.755523204803467, + "learning_rate": 1.049962652230166e-06, + "loss": 0.8292, + "step": 60675 + }, + { + "epoch": 18.15, + "grad_norm": 3.8161582946777344, + "learning_rate": 1.0482785193498156e-06, + "loss": 0.8619, + "step": 60680 + }, + { + "epoch": 18.16, + "grad_norm": 2.392561435699463, + "learning_rate": 1.046595709298151e-06, + "loss": 1.0699, + "step": 60685 + }, + { + "epoch": 18.16, + "grad_norm": 2.5720813274383545, + "learning_rate": 1.044914222168103e-06, + "loss": 0.9893, + "step": 60690 + }, + { + "epoch": 18.16, + "grad_norm": 2.1463463306427, + "learning_rate": 1.0432340580525473e-06, + "loss": 0.9201, + "step": 60695 + }, + { + "epoch": 18.16, + "grad_norm": 2.0431580543518066, + "learning_rate": 1.0415552170442682e-06, + "loss": 0.8918, + "step": 60700 + }, + { + "epoch": 18.16, + "grad_norm": 3.739319086074829, + "learning_rate": 1.0398776992359943e-06, + "loss": 0.9097, + "step": 60705 + }, + { + "epoch": 18.16, + "grad_norm": 3.897839069366455, + "learning_rate": 1.038201504720368e-06, + "loss": 0.8448, + "step": 60710 + }, + { + "epoch": 18.17, + "grad_norm": 2.4379286766052246, + "learning_rate": 1.0365266335899626e-06, + "loss": 1.0746, + "step": 60715 + }, + { + "epoch": 18.17, + "grad_norm": 5.488431453704834, + "learning_rate": 1.0348530859372902e-06, + "loss": 0.9621, + "step": 60720 + }, + { + "epoch": 18.17, + "grad_norm": 2.122537136077881, + "learning_rate": 1.0331808618547628e-06, + "loss": 1.044, + "step": 60725 + }, + { + "epoch": 18.17, + "grad_norm": 1.5338541269302368, + "learning_rate": 1.0315099614347511e-06, + "loss": 0.8884, + "step": 60730 + }, + { + "epoch": 18.17, + "grad_norm": 2.4747703075408936, + "learning_rate": 1.0298403847695286e-06, + "loss": 0.818, + "step": 60735 + }, + { + "epoch": 18.17, + "grad_norm": 5.456902980804443, + "learning_rate": 1.028172131951305e-06, + "loss": 0.985, + "step": 60740 + }, + { + "epoch": 18.17, + "grad_norm": 3.0986452102661133, + "learning_rate": 1.0265052030722171e-06, + "loss": 1.0442, + "step": 60745 + }, + { + "epoch": 18.18, + "grad_norm": 8.805809020996094, + "learning_rate": 1.024839598224331e-06, + "loss": 0.7961, + "step": 60750 + }, + { + "epoch": 18.18, + "grad_norm": 3.979262351989746, + "learning_rate": 1.0231753174996278e-06, + "loss": 1.0433, + "step": 60755 + }, + { + "epoch": 18.18, + "grad_norm": 3.9714319705963135, + "learning_rate": 1.0215123609900373e-06, + "loss": 1.0825, + "step": 60760 + }, + { + "epoch": 18.18, + "grad_norm": 2.5382189750671387, + "learning_rate": 1.019850728787386e-06, + "loss": 0.6957, + "step": 60765 + }, + { + "epoch": 18.18, + "grad_norm": 2.689236640930176, + "learning_rate": 1.0181904209834586e-06, + "loss": 0.9993, + "step": 60770 + }, + { + "epoch": 18.18, + "grad_norm": 2.2523913383483887, + "learning_rate": 1.016531437669943e-06, + "loss": 1.0457, + "step": 60775 + }, + { + "epoch": 18.18, + "grad_norm": 2.1695034503936768, + "learning_rate": 1.0148737789384689e-06, + "loss": 0.945, + "step": 60780 + }, + { + "epoch": 18.19, + "grad_norm": 4.191242694854736, + "learning_rate": 1.013217444880582e-06, + "loss": 1.0195, + "step": 60785 + }, + { + "epoch": 18.19, + "grad_norm": 2.4551188945770264, + "learning_rate": 1.011562435587765e-06, + "loss": 1.0352, + "step": 60790 + }, + { + "epoch": 18.19, + "grad_norm": 5.271094799041748, + "learning_rate": 1.0099087511514227e-06, + "loss": 1.0489, + "step": 60795 + }, + { + "epoch": 18.19, + "grad_norm": 3.048891067504883, + "learning_rate": 1.008256391662879e-06, + "loss": 1.0531, + "step": 60800 + }, + { + "epoch": 18.19, + "grad_norm": 3.0242154598236084, + "learning_rate": 1.0066053572133999e-06, + "loss": 1.1194, + "step": 60805 + }, + { + "epoch": 18.19, + "grad_norm": 3.903019905090332, + "learning_rate": 1.004955647894165e-06, + "loss": 0.9434, + "step": 60810 + }, + { + "epoch": 18.2, + "grad_norm": 2.762580394744873, + "learning_rate": 1.003307263796291e-06, + "loss": 0.8196, + "step": 60815 + }, + { + "epoch": 18.2, + "grad_norm": 22.09214210510254, + "learning_rate": 1.0016602050108099e-06, + "loss": 0.8785, + "step": 60820 + }, + { + "epoch": 18.2, + "grad_norm": 4.526973247528076, + "learning_rate": 1.0000144716286935e-06, + "loss": 0.7305, + "step": 60825 + }, + { + "epoch": 18.2, + "grad_norm": 4.591505527496338, + "learning_rate": 9.983700637408305e-07, + "loss": 1.1613, + "step": 60830 + }, + { + "epoch": 18.2, + "grad_norm": 6.078658580780029, + "learning_rate": 9.967269814380425e-07, + "loss": 0.9575, + "step": 60835 + }, + { + "epoch": 18.2, + "grad_norm": 3.495609998703003, + "learning_rate": 9.950852248110709e-07, + "loss": 1.0076, + "step": 60840 + }, + { + "epoch": 18.2, + "grad_norm": 1.9880722761154175, + "learning_rate": 9.934447939505904e-07, + "loss": 0.89, + "step": 60845 + }, + { + "epoch": 18.21, + "grad_norm": 4.335904121398926, + "learning_rate": 9.918056889472006e-07, + "loss": 1.0215, + "step": 60850 + }, + { + "epoch": 18.21, + "grad_norm": 1.3856970071792603, + "learning_rate": 9.901679098914291e-07, + "loss": 1.0306, + "step": 60855 + }, + { + "epoch": 18.21, + "grad_norm": 2.3149681091308594, + "learning_rate": 9.885314568737258e-07, + "loss": 0.9417, + "step": 60860 + }, + { + "epoch": 18.21, + "grad_norm": 2.511589527130127, + "learning_rate": 9.868963299844742e-07, + "loss": 0.9716, + "step": 60865 + }, + { + "epoch": 18.21, + "grad_norm": 1.1993108987808228, + "learning_rate": 9.85262529313974e-07, + "loss": 1.0695, + "step": 60870 + }, + { + "epoch": 18.21, + "grad_norm": 3.9712471961975098, + "learning_rate": 9.836300549524642e-07, + "loss": 1.0194, + "step": 60875 + }, + { + "epoch": 18.21, + "grad_norm": 1.5070483684539795, + "learning_rate": 9.81998906990103e-07, + "loss": 0.9899, + "step": 60880 + }, + { + "epoch": 18.22, + "grad_norm": 1.7017158269882202, + "learning_rate": 9.803690855169772e-07, + "loss": 1.0185, + "step": 60885 + }, + { + "epoch": 18.22, + "grad_norm": 2.1888837814331055, + "learning_rate": 9.787405906231006e-07, + "loss": 1.0552, + "step": 60890 + }, + { + "epoch": 18.22, + "grad_norm": 2.574962615966797, + "learning_rate": 9.771134223984097e-07, + "loss": 0.8768, + "step": 60895 + }, + { + "epoch": 18.22, + "grad_norm": 2.792041301727295, + "learning_rate": 9.754875809327769e-07, + "loss": 0.8991, + "step": 60900 + }, + { + "epoch": 18.22, + "grad_norm": 2.1841979026794434, + "learning_rate": 9.738630663159886e-07, + "loss": 1.1429, + "step": 60905 + }, + { + "epoch": 18.22, + "grad_norm": 3.5888936519622803, + "learning_rate": 9.722398786377762e-07, + "loss": 0.8153, + "step": 60910 + }, + { + "epoch": 18.23, + "grad_norm": 5.765038967132568, + "learning_rate": 9.706180179877733e-07, + "loss": 0.9749, + "step": 60915 + }, + { + "epoch": 18.23, + "grad_norm": 4.348241329193115, + "learning_rate": 9.68997484455564e-07, + "loss": 1.0241, + "step": 60920 + }, + { + "epoch": 18.23, + "grad_norm": 1.9215142726898193, + "learning_rate": 9.673782781306428e-07, + "loss": 1.1308, + "step": 60925 + }, + { + "epoch": 18.23, + "grad_norm": 2.1733293533325195, + "learning_rate": 9.657603991024416e-07, + "loss": 0.8968, + "step": 60930 + }, + { + "epoch": 18.23, + "grad_norm": 3.1698646545410156, + "learning_rate": 9.641438474603081e-07, + "loss": 0.9575, + "step": 60935 + }, + { + "epoch": 18.23, + "grad_norm": 4.437852382659912, + "learning_rate": 9.625286232935266e-07, + "loss": 1.0785, + "step": 60940 + }, + { + "epoch": 18.23, + "grad_norm": 2.4240994453430176, + "learning_rate": 9.60914726691306e-07, + "loss": 0.9774, + "step": 60945 + }, + { + "epoch": 18.24, + "grad_norm": 4.216778755187988, + "learning_rate": 9.593021577427752e-07, + "loss": 1.0754, + "step": 60950 + }, + { + "epoch": 18.24, + "grad_norm": 5.071732044219971, + "learning_rate": 9.57690916536999e-07, + "loss": 1.0031, + "step": 60955 + }, + { + "epoch": 18.24, + "grad_norm": 5.060432434082031, + "learning_rate": 9.56081003162962e-07, + "loss": 0.9962, + "step": 60960 + }, + { + "epoch": 18.24, + "grad_norm": 4.500984191894531, + "learning_rate": 9.544724177095787e-07, + "loss": 0.9923, + "step": 60965 + }, + { + "epoch": 18.24, + "grad_norm": 3.332535743713379, + "learning_rate": 9.528651602656896e-07, + "loss": 1.1458, + "step": 60970 + }, + { + "epoch": 18.24, + "grad_norm": 2.8214833736419678, + "learning_rate": 9.512592309200625e-07, + "loss": 1.0929, + "step": 60975 + }, + { + "epoch": 18.24, + "grad_norm": 3.44423770904541, + "learning_rate": 9.496546297613901e-07, + "loss": 1.0643, + "step": 60980 + }, + { + "epoch": 18.25, + "grad_norm": 4.527592182159424, + "learning_rate": 9.480513568782962e-07, + "loss": 0.9452, + "step": 60985 + }, + { + "epoch": 18.25, + "grad_norm": 1.5139596462249756, + "learning_rate": 9.46449412359321e-07, + "loss": 0.951, + "step": 60990 + }, + { + "epoch": 18.25, + "grad_norm": 3.112560749053955, + "learning_rate": 9.448487962929492e-07, + "loss": 1.0462, + "step": 60995 + }, + { + "epoch": 18.25, + "grad_norm": 6.812036991119385, + "learning_rate": 9.432495087675657e-07, + "loss": 1.0087, + "step": 61000 + }, + { + "epoch": 18.25, + "grad_norm": 1.6199243068695068, + "learning_rate": 9.416515498715139e-07, + "loss": 0.8079, + "step": 61005 + }, + { + "epoch": 18.25, + "grad_norm": 2.11490797996521, + "learning_rate": 9.400549196930342e-07, + "loss": 0.9901, + "step": 61010 + }, + { + "epoch": 18.25, + "grad_norm": 1.8027985095977783, + "learning_rate": 9.38459618320317e-07, + "loss": 1.1343, + "step": 61015 + }, + { + "epoch": 18.26, + "grad_norm": 2.305330514907837, + "learning_rate": 9.368656458414643e-07, + "loss": 1.1905, + "step": 61020 + }, + { + "epoch": 18.26, + "grad_norm": 1.3872077465057373, + "learning_rate": 9.352730023445055e-07, + "loss": 0.9287, + "step": 61025 + }, + { + "epoch": 18.26, + "grad_norm": 3.1503524780273438, + "learning_rate": 9.336816879174093e-07, + "loss": 0.9731, + "step": 61030 + }, + { + "epoch": 18.26, + "grad_norm": 1.6931301355361938, + "learning_rate": 9.320917026480553e-07, + "loss": 0.9644, + "step": 61035 + }, + { + "epoch": 18.26, + "grad_norm": 1.539621353149414, + "learning_rate": 9.305030466242592e-07, + "loss": 1.0219, + "step": 61040 + }, + { + "epoch": 18.26, + "grad_norm": 2.573394298553467, + "learning_rate": 9.289157199337622e-07, + "loss": 1.0027, + "step": 61045 + }, + { + "epoch": 18.27, + "grad_norm": 2.412062168121338, + "learning_rate": 9.273297226642275e-07, + "loss": 1.0636, + "step": 61050 + }, + { + "epoch": 18.27, + "grad_norm": 1.9419618844985962, + "learning_rate": 9.257450549032515e-07, + "loss": 0.9127, + "step": 61055 + }, + { + "epoch": 18.27, + "grad_norm": 1.3534716367721558, + "learning_rate": 9.241617167383531e-07, + "loss": 1.0985, + "step": 61060 + }, + { + "epoch": 18.27, + "grad_norm": 3.1224892139434814, + "learning_rate": 9.225797082569765e-07, + "loss": 1.0405, + "step": 61065 + }, + { + "epoch": 18.27, + "grad_norm": 2.6927273273468018, + "learning_rate": 9.209990295464959e-07, + "loss": 0.8734, + "step": 61070 + }, + { + "epoch": 18.27, + "grad_norm": 1.9963269233703613, + "learning_rate": 9.194196806942112e-07, + "loss": 0.8057, + "step": 61075 + }, + { + "epoch": 18.27, + "grad_norm": 3.602238178253174, + "learning_rate": 9.178416617873442e-07, + "loss": 0.9009, + "step": 61080 + }, + { + "epoch": 18.28, + "grad_norm": 1.87447988986969, + "learning_rate": 9.162649729130529e-07, + "loss": 0.8498, + "step": 61085 + }, + { + "epoch": 18.28, + "grad_norm": 2.5276732444763184, + "learning_rate": 9.146896141584149e-07, + "loss": 0.9557, + "step": 61090 + }, + { + "epoch": 18.28, + "grad_norm": 2.9176580905914307, + "learning_rate": 9.131155856104301e-07, + "loss": 0.9328, + "step": 61095 + }, + { + "epoch": 18.28, + "grad_norm": 1.1620190143585205, + "learning_rate": 9.115428873560372e-07, + "loss": 1.0674, + "step": 61100 + }, + { + "epoch": 18.28, + "grad_norm": 3.642578601837158, + "learning_rate": 9.09971519482089e-07, + "loss": 0.8607, + "step": 61105 + }, + { + "epoch": 18.28, + "grad_norm": 3.2962098121643066, + "learning_rate": 9.0840148207538e-07, + "loss": 0.8209, + "step": 61110 + }, + { + "epoch": 18.28, + "grad_norm": 2.739952325820923, + "learning_rate": 9.068327752226102e-07, + "loss": 0.9166, + "step": 61115 + }, + { + "epoch": 18.29, + "grad_norm": 2.815523862838745, + "learning_rate": 9.052653990104243e-07, + "loss": 1.0532, + "step": 61120 + }, + { + "epoch": 18.29, + "grad_norm": 1.6121025085449219, + "learning_rate": 9.036993535253863e-07, + "loss": 1.0011, + "step": 61125 + }, + { + "epoch": 18.29, + "grad_norm": 1.8355814218521118, + "learning_rate": 9.021346388539825e-07, + "loss": 0.9553, + "step": 61130 + }, + { + "epoch": 18.29, + "grad_norm": 2.7154858112335205, + "learning_rate": 9.005712550826384e-07, + "loss": 1.089, + "step": 61135 + }, + { + "epoch": 18.29, + "grad_norm": 2.364108085632324, + "learning_rate": 8.990092022976904e-07, + "loss": 0.9518, + "step": 61140 + }, + { + "epoch": 18.29, + "grad_norm": 1.0809826850891113, + "learning_rate": 8.974484805854166e-07, + "loss": 0.9845, + "step": 61145 + }, + { + "epoch": 18.3, + "grad_norm": 1.4934520721435547, + "learning_rate": 8.958890900320066e-07, + "loss": 0.9068, + "step": 61150 + }, + { + "epoch": 18.3, + "grad_norm": 2.0490729808807373, + "learning_rate": 8.943310307235886e-07, + "loss": 1.0463, + "step": 61155 + }, + { + "epoch": 18.3, + "grad_norm": 2.349944591522217, + "learning_rate": 8.927743027462104e-07, + "loss": 1.065, + "step": 61160 + }, + { + "epoch": 18.3, + "grad_norm": 2.097916841506958, + "learning_rate": 8.912189061858506e-07, + "loss": 0.9555, + "step": 61165 + }, + { + "epoch": 18.3, + "grad_norm": 1.0531599521636963, + "learning_rate": 8.896648411284097e-07, + "loss": 0.9135, + "step": 61170 + }, + { + "epoch": 18.3, + "grad_norm": 3.910612106323242, + "learning_rate": 8.881121076597193e-07, + "loss": 0.7968, + "step": 61175 + }, + { + "epoch": 18.3, + "grad_norm": 2.345278739929199, + "learning_rate": 8.865607058655356e-07, + "loss": 0.9632, + "step": 61180 + }, + { + "epoch": 18.31, + "grad_norm": 2.56709885597229, + "learning_rate": 8.850106358315402e-07, + "loss": 1.1304, + "step": 61185 + }, + { + "epoch": 18.31, + "grad_norm": 3.583867073059082, + "learning_rate": 8.834618976433367e-07, + "loss": 0.7337, + "step": 61190 + }, + { + "epoch": 18.31, + "grad_norm": 2.285977840423584, + "learning_rate": 8.819144913864708e-07, + "loss": 1.0113, + "step": 61195 + }, + { + "epoch": 18.31, + "grad_norm": 3.424396514892578, + "learning_rate": 8.803684171463905e-07, + "loss": 0.91, + "step": 61200 + }, + { + "epoch": 18.31, + "grad_norm": 3.49349045753479, + "learning_rate": 8.788236750084999e-07, + "loss": 1.0395, + "step": 61205 + }, + { + "epoch": 18.31, + "grad_norm": 1.7467879056930542, + "learning_rate": 8.772802650580975e-07, + "loss": 0.8866, + "step": 61210 + }, + { + "epoch": 18.31, + "grad_norm": 5.124495506286621, + "learning_rate": 8.757381873804371e-07, + "loss": 0.9363, + "step": 61215 + }, + { + "epoch": 18.32, + "grad_norm": 1.684017539024353, + "learning_rate": 8.741974420606813e-07, + "loss": 0.8885, + "step": 61220 + }, + { + "epoch": 18.32, + "grad_norm": 3.0965070724487305, + "learning_rate": 8.726580291839203e-07, + "loss": 0.9159, + "step": 61225 + }, + { + "epoch": 18.32, + "grad_norm": 4.807112693786621, + "learning_rate": 8.711199488351779e-07, + "loss": 0.9592, + "step": 61230 + }, + { + "epoch": 18.32, + "grad_norm": 2.7470273971557617, + "learning_rate": 8.695832010993998e-07, + "loss": 0.8318, + "step": 61235 + }, + { + "epoch": 18.32, + "grad_norm": 5.211430072784424, + "learning_rate": 8.6804778606146e-07, + "loss": 0.9829, + "step": 61240 + }, + { + "epoch": 18.32, + "grad_norm": 2.6941945552825928, + "learning_rate": 8.665137038061572e-07, + "loss": 0.9167, + "step": 61245 + }, + { + "epoch": 18.33, + "grad_norm": 2.2288553714752197, + "learning_rate": 8.64980954418218e-07, + "loss": 0.9119, + "step": 61250 + }, + { + "epoch": 18.33, + "grad_norm": 2.5863308906555176, + "learning_rate": 8.634495379822943e-07, + "loss": 1.03, + "step": 61255 + }, + { + "epoch": 18.33, + "grad_norm": 1.7381858825683594, + "learning_rate": 8.619194545829628e-07, + "loss": 1.1305, + "step": 61260 + }, + { + "epoch": 18.33, + "grad_norm": 2.788691997528076, + "learning_rate": 8.603907043047283e-07, + "loss": 1.1239, + "step": 61265 + }, + { + "epoch": 18.33, + "grad_norm": 3.6779868602752686, + "learning_rate": 8.588632872320257e-07, + "loss": 1.1608, + "step": 61270 + }, + { + "epoch": 18.33, + "grad_norm": 3.579209327697754, + "learning_rate": 8.573372034492099e-07, + "loss": 0.9267, + "step": 61275 + }, + { + "epoch": 18.33, + "grad_norm": 4.529468059539795, + "learning_rate": 8.558124530405664e-07, + "loss": 0.9345, + "step": 61280 + }, + { + "epoch": 18.34, + "grad_norm": 1.8296302556991577, + "learning_rate": 8.542890360903e-07, + "loss": 0.796, + "step": 61285 + }, + { + "epoch": 18.34, + "grad_norm": 2.3215250968933105, + "learning_rate": 8.527669526825599e-07, + "loss": 1.0763, + "step": 61290 + }, + { + "epoch": 18.34, + "grad_norm": 3.3448548316955566, + "learning_rate": 8.512462029013929e-07, + "loss": 0.9275, + "step": 61295 + }, + { + "epoch": 18.34, + "grad_norm": 2.267402172088623, + "learning_rate": 8.49726786830804e-07, + "loss": 1.107, + "step": 61300 + }, + { + "epoch": 18.34, + "grad_norm": 3.513765335083008, + "learning_rate": 8.482087045546955e-07, + "loss": 1.0165, + "step": 61305 + }, + { + "epoch": 18.34, + "grad_norm": 2.1769518852233887, + "learning_rate": 8.466919561569225e-07, + "loss": 0.9577, + "step": 61310 + }, + { + "epoch": 18.34, + "grad_norm": 2.1250414848327637, + "learning_rate": 8.451765417212432e-07, + "loss": 0.8942, + "step": 61315 + }, + { + "epoch": 18.35, + "grad_norm": 1.5039304494857788, + "learning_rate": 8.43662461331357e-07, + "loss": 0.9996, + "step": 61320 + }, + { + "epoch": 18.35, + "grad_norm": 1.338394045829773, + "learning_rate": 8.421497150708835e-07, + "loss": 0.8906, + "step": 61325 + }, + { + "epoch": 18.35, + "grad_norm": 2.3154618740081787, + "learning_rate": 8.406383030233694e-07, + "loss": 1.0178, + "step": 61330 + }, + { + "epoch": 18.35, + "grad_norm": 1.515351414680481, + "learning_rate": 8.391282252722898e-07, + "loss": 0.9011, + "step": 61335 + }, + { + "epoch": 18.35, + "grad_norm": 4.342297077178955, + "learning_rate": 8.376194819010446e-07, + "loss": 0.9005, + "step": 61340 + }, + { + "epoch": 18.35, + "grad_norm": 3.609494209289551, + "learning_rate": 8.361120729929617e-07, + "loss": 0.9243, + "step": 61345 + }, + { + "epoch": 18.36, + "grad_norm": 2.8200643062591553, + "learning_rate": 8.34605998631291e-07, + "loss": 1.0519, + "step": 61350 + }, + { + "epoch": 18.36, + "grad_norm": 2.881962537765503, + "learning_rate": 8.331012588992132e-07, + "loss": 0.8966, + "step": 61355 + }, + { + "epoch": 18.36, + "grad_norm": 2.507383108139038, + "learning_rate": 8.315978538798314e-07, + "loss": 0.8928, + "step": 61360 + }, + { + "epoch": 18.36, + "grad_norm": 1.8377445936203003, + "learning_rate": 8.30095783656179e-07, + "loss": 0.9829, + "step": 61365 + }, + { + "epoch": 18.36, + "grad_norm": 1.3998005390167236, + "learning_rate": 8.28595048311212e-07, + "loss": 0.9379, + "step": 61370 + }, + { + "epoch": 18.36, + "grad_norm": 1.7559727430343628, + "learning_rate": 8.270956479278196e-07, + "loss": 1.1797, + "step": 61375 + }, + { + "epoch": 18.36, + "grad_norm": 1.639439582824707, + "learning_rate": 8.255975825888024e-07, + "loss": 0.8938, + "step": 61380 + }, + { + "epoch": 18.37, + "grad_norm": 2.903548002243042, + "learning_rate": 8.241008523769106e-07, + "loss": 0.8785, + "step": 61385 + }, + { + "epoch": 18.37, + "grad_norm": 2.10689377784729, + "learning_rate": 8.226054573747894e-07, + "loss": 1.0115, + "step": 61390 + }, + { + "epoch": 18.37, + "grad_norm": 3.6648929119110107, + "learning_rate": 8.211113976650475e-07, + "loss": 0.9137, + "step": 61395 + }, + { + "epoch": 18.37, + "grad_norm": 3.99898624420166, + "learning_rate": 8.19618673330183e-07, + "loss": 1.1167, + "step": 61400 + }, + { + "epoch": 18.37, + "grad_norm": 3.173650026321411, + "learning_rate": 8.181272844526495e-07, + "loss": 1.1092, + "step": 61405 + }, + { + "epoch": 18.37, + "grad_norm": 3.0180552005767822, + "learning_rate": 8.166372311148113e-07, + "loss": 0.8644, + "step": 61410 + }, + { + "epoch": 18.37, + "grad_norm": 3.1037330627441406, + "learning_rate": 8.151485133989584e-07, + "loss": 0.8741, + "step": 61415 + }, + { + "epoch": 18.38, + "grad_norm": 3.032813310623169, + "learning_rate": 8.136611313873166e-07, + "loss": 0.9209, + "step": 61420 + }, + { + "epoch": 18.38, + "grad_norm": 2.8166263103485107, + "learning_rate": 8.121750851620286e-07, + "loss": 0.8434, + "step": 61425 + }, + { + "epoch": 18.38, + "grad_norm": 2.1553432941436768, + "learning_rate": 8.106903748051675e-07, + "loss": 0.8497, + "step": 61430 + }, + { + "epoch": 18.38, + "grad_norm": 2.0251331329345703, + "learning_rate": 8.092070003987373e-07, + "loss": 1.1232, + "step": 61435 + }, + { + "epoch": 18.38, + "grad_norm": 4.4784345626831055, + "learning_rate": 8.077249620246558e-07, + "loss": 0.9472, + "step": 61440 + }, + { + "epoch": 18.38, + "grad_norm": 3.576678514480591, + "learning_rate": 8.062442597647795e-07, + "loss": 0.8144, + "step": 61445 + }, + { + "epoch": 18.39, + "grad_norm": 1.705344796180725, + "learning_rate": 8.047648937008851e-07, + "loss": 1.1875, + "step": 61450 + }, + { + "epoch": 18.39, + "grad_norm": 1.4665272235870361, + "learning_rate": 8.032868639146762e-07, + "loss": 1.0611, + "step": 61455 + }, + { + "epoch": 18.39, + "grad_norm": 2.712339162826538, + "learning_rate": 8.018101704877823e-07, + "loss": 1.0206, + "step": 61460 + }, + { + "epoch": 18.39, + "grad_norm": 3.855246067047119, + "learning_rate": 8.003348135017603e-07, + "loss": 1.0525, + "step": 61465 + }, + { + "epoch": 18.39, + "grad_norm": 2.246304988861084, + "learning_rate": 7.988607930380948e-07, + "loss": 0.8379, + "step": 61470 + }, + { + "epoch": 18.39, + "grad_norm": 3.892084836959839, + "learning_rate": 7.973881091781876e-07, + "loss": 1.0106, + "step": 61475 + }, + { + "epoch": 18.39, + "grad_norm": 1.8931845426559448, + "learning_rate": 7.959167620033847e-07, + "loss": 0.9843, + "step": 61480 + }, + { + "epoch": 18.4, + "grad_norm": 2.768260955810547, + "learning_rate": 7.944467515949322e-07, + "loss": 0.8572, + "step": 61485 + }, + { + "epoch": 18.4, + "grad_norm": 11.066926002502441, + "learning_rate": 7.929780780340318e-07, + "loss": 0.8194, + "step": 61490 + }, + { + "epoch": 18.4, + "grad_norm": 4.7817912101745605, + "learning_rate": 7.915107414017825e-07, + "loss": 1.0095, + "step": 61495 + }, + { + "epoch": 18.4, + "grad_norm": 1.809881329536438, + "learning_rate": 7.900447417792389e-07, + "loss": 0.8781, + "step": 61500 + }, + { + "epoch": 18.4, + "grad_norm": 3.047032356262207, + "learning_rate": 7.885800792473585e-07, + "loss": 1.0429, + "step": 61505 + }, + { + "epoch": 18.4, + "grad_norm": 1.490786075592041, + "learning_rate": 7.871167538870322e-07, + "loss": 0.8668, + "step": 61510 + }, + { + "epoch": 18.4, + "grad_norm": 1.9980204105377197, + "learning_rate": 7.856547657790786e-07, + "loss": 1.0007, + "step": 61515 + }, + { + "epoch": 18.41, + "grad_norm": 3.4392709732055664, + "learning_rate": 7.841941150042415e-07, + "loss": 0.9901, + "step": 61520 + }, + { + "epoch": 18.41, + "grad_norm": 4.489561557769775, + "learning_rate": 7.827348016431979e-07, + "loss": 1.0117, + "step": 61525 + }, + { + "epoch": 18.41, + "grad_norm": 4.4032979011535645, + "learning_rate": 7.812768257765335e-07, + "loss": 1.0882, + "step": 61530 + }, + { + "epoch": 18.41, + "grad_norm": 3.137603282928467, + "learning_rate": 7.79820187484781e-07, + "loss": 0.921, + "step": 61535 + }, + { + "epoch": 18.41, + "grad_norm": 4.174556732177734, + "learning_rate": 7.783648868483817e-07, + "loss": 0.982, + "step": 61540 + }, + { + "epoch": 18.41, + "grad_norm": 2.2925820350646973, + "learning_rate": 7.769109239477129e-07, + "loss": 0.9188, + "step": 61545 + }, + { + "epoch": 18.42, + "grad_norm": 2.8582160472869873, + "learning_rate": 7.754582988630743e-07, + "loss": 1.0092, + "step": 61550 + }, + { + "epoch": 18.42, + "grad_norm": 3.7304418087005615, + "learning_rate": 7.740070116746961e-07, + "loss": 0.7871, + "step": 61555 + }, + { + "epoch": 18.42, + "grad_norm": 2.1194798946380615, + "learning_rate": 7.725570624627282e-07, + "loss": 0.9669, + "step": 61560 + }, + { + "epoch": 18.42, + "grad_norm": 2.3362677097320557, + "learning_rate": 7.71108451307248e-07, + "loss": 0.9318, + "step": 61565 + }, + { + "epoch": 18.42, + "grad_norm": 1.7674001455307007, + "learning_rate": 7.696611782882668e-07, + "loss": 0.9901, + "step": 61570 + }, + { + "epoch": 18.42, + "grad_norm": 1.8596383333206177, + "learning_rate": 7.682152434857149e-07, + "loss": 0.8932, + "step": 61575 + }, + { + "epoch": 18.42, + "grad_norm": 3.049741744995117, + "learning_rate": 7.667706469794395e-07, + "loss": 0.8584, + "step": 61580 + }, + { + "epoch": 18.43, + "grad_norm": 2.637298107147217, + "learning_rate": 7.653273888492407e-07, + "loss": 1.1926, + "step": 61585 + }, + { + "epoch": 18.43, + "grad_norm": 2.7286012172698975, + "learning_rate": 7.638854691748132e-07, + "loss": 1.0714, + "step": 61590 + }, + { + "epoch": 18.43, + "grad_norm": 4.056117057800293, + "learning_rate": 7.624448880358043e-07, + "loss": 0.9455, + "step": 61595 + }, + { + "epoch": 18.43, + "grad_norm": 2.3087191581726074, + "learning_rate": 7.61005645511767e-07, + "loss": 1.0218, + "step": 61600 + }, + { + "epoch": 18.43, + "grad_norm": 2.0572636127471924, + "learning_rate": 7.595677416821933e-07, + "loss": 0.9051, + "step": 61605 + }, + { + "epoch": 18.43, + "grad_norm": 6.119582176208496, + "learning_rate": 7.581311766265003e-07, + "loss": 0.8552, + "step": 61610 + }, + { + "epoch": 18.43, + "grad_norm": 2.6109657287597656, + "learning_rate": 7.56695950424019e-07, + "loss": 0.9766, + "step": 61615 + }, + { + "epoch": 18.44, + "grad_norm": 3.126173734664917, + "learning_rate": 7.552620631540247e-07, + "loss": 0.9172, + "step": 61620 + }, + { + "epoch": 18.44, + "grad_norm": 3.1183481216430664, + "learning_rate": 7.538295148957015e-07, + "loss": 0.7652, + "step": 61625 + }, + { + "epoch": 18.44, + "grad_norm": 1.6673765182495117, + "learning_rate": 7.523983057281775e-07, + "loss": 0.884, + "step": 61630 + }, + { + "epoch": 18.44, + "grad_norm": 3.5469305515289307, + "learning_rate": 7.509684357304897e-07, + "loss": 0.9741, + "step": 61635 + }, + { + "epoch": 18.44, + "grad_norm": 2.485867977142334, + "learning_rate": 7.495399049816082e-07, + "loss": 0.7726, + "step": 61640 + }, + { + "epoch": 18.44, + "grad_norm": 1.444970965385437, + "learning_rate": 7.48112713560431e-07, + "loss": 0.9309, + "step": 61645 + }, + { + "epoch": 18.44, + "grad_norm": 3.333101987838745, + "learning_rate": 7.466868615457783e-07, + "loss": 1.1428, + "step": 61650 + }, + { + "epoch": 18.45, + "grad_norm": 2.304952383041382, + "learning_rate": 7.45262349016404e-07, + "loss": 0.9161, + "step": 61655 + }, + { + "epoch": 18.45, + "grad_norm": 2.9560463428497314, + "learning_rate": 7.438391760509755e-07, + "loss": 0.9267, + "step": 61660 + }, + { + "epoch": 18.45, + "grad_norm": 4.144384384155273, + "learning_rate": 7.427016022174993e-07, + "loss": 1.1577, + "step": 61665 + }, + { + "epoch": 18.45, + "grad_norm": 5.686766624450684, + "learning_rate": 7.412808406652038e-07, + "loss": 0.9974, + "step": 61670 + }, + { + "epoch": 18.45, + "grad_norm": 3.3661046028137207, + "learning_rate": 7.398614188967534e-07, + "loss": 0.8438, + "step": 61675 + }, + { + "epoch": 18.45, + "grad_norm": 5.448729515075684, + "learning_rate": 7.38443336990538e-07, + "loss": 0.8848, + "step": 61680 + }, + { + "epoch": 18.46, + "grad_norm": 2.479353189468384, + "learning_rate": 7.370265950248783e-07, + "loss": 1.0998, + "step": 61685 + }, + { + "epoch": 18.46, + "grad_norm": 2.4149727821350098, + "learning_rate": 7.356111930780201e-07, + "loss": 0.9238, + "step": 61690 + }, + { + "epoch": 18.46, + "grad_norm": 2.909254312515259, + "learning_rate": 7.341971312281343e-07, + "loss": 0.9805, + "step": 61695 + }, + { + "epoch": 18.46, + "grad_norm": 2.9561984539031982, + "learning_rate": 7.327844095533193e-07, + "loss": 1.1177, + "step": 61700 + }, + { + "epoch": 18.46, + "grad_norm": 1.3195778131484985, + "learning_rate": 7.313730281315961e-07, + "loss": 1.0986, + "step": 61705 + }, + { + "epoch": 18.46, + "grad_norm": 1.9130079746246338, + "learning_rate": 7.299629870409136e-07, + "loss": 0.8861, + "step": 61710 + }, + { + "epoch": 18.46, + "grad_norm": 4.564031600952148, + "learning_rate": 7.285542863591482e-07, + "loss": 0.8437, + "step": 61715 + }, + { + "epoch": 18.47, + "grad_norm": 2.09362530708313, + "learning_rate": 7.271469261641017e-07, + "loss": 0.8984, + "step": 61720 + }, + { + "epoch": 18.47, + "grad_norm": 3.5240318775177, + "learning_rate": 7.257409065335035e-07, + "loss": 0.7457, + "step": 61725 + }, + { + "epoch": 18.47, + "grad_norm": 4.574277400970459, + "learning_rate": 7.243362275449972e-07, + "loss": 0.9621, + "step": 61730 + }, + { + "epoch": 18.47, + "grad_norm": 4.2122578620910645, + "learning_rate": 7.229328892761733e-07, + "loss": 0.9186, + "step": 61735 + }, + { + "epoch": 18.47, + "grad_norm": 2.056427001953125, + "learning_rate": 7.215308918045255e-07, + "loss": 1.0321, + "step": 61740 + }, + { + "epoch": 18.47, + "grad_norm": 3.035954236984253, + "learning_rate": 7.201302352074973e-07, + "loss": 0.9903, + "step": 61745 + }, + { + "epoch": 18.47, + "grad_norm": 3.6890830993652344, + "learning_rate": 7.187309195624353e-07, + "loss": 0.9409, + "step": 61750 + }, + { + "epoch": 18.48, + "grad_norm": 2.126034736633301, + "learning_rate": 7.173329449466248e-07, + "loss": 1.0002, + "step": 61755 + }, + { + "epoch": 18.48, + "grad_norm": 12.646512031555176, + "learning_rate": 7.159363114372763e-07, + "loss": 0.8951, + "step": 61760 + }, + { + "epoch": 18.48, + "grad_norm": 4.24096155166626, + "learning_rate": 7.145410191115226e-07, + "loss": 0.8652, + "step": 61765 + }, + { + "epoch": 18.48, + "grad_norm": 1.79692804813385, + "learning_rate": 7.13147068046427e-07, + "loss": 1.0179, + "step": 61770 + }, + { + "epoch": 18.48, + "grad_norm": 2.1023857593536377, + "learning_rate": 7.117544583189723e-07, + "loss": 1.0399, + "step": 61775 + }, + { + "epoch": 18.48, + "grad_norm": 3.325761079788208, + "learning_rate": 7.10363190006072e-07, + "loss": 0.9114, + "step": 61780 + }, + { + "epoch": 18.49, + "grad_norm": 1.3993957042694092, + "learning_rate": 7.089732631845674e-07, + "loss": 0.9271, + "step": 61785 + }, + { + "epoch": 18.49, + "grad_norm": 2.963554620742798, + "learning_rate": 7.075846779312196e-07, + "loss": 1.0317, + "step": 61790 + }, + { + "epoch": 18.49, + "grad_norm": 4.754477500915527, + "learning_rate": 7.061974343227168e-07, + "loss": 0.87, + "step": 61795 + }, + { + "epoch": 18.49, + "grad_norm": 2.58296799659729, + "learning_rate": 7.048115324356814e-07, + "loss": 1.0729, + "step": 61800 + }, + { + "epoch": 18.49, + "grad_norm": 1.766918659210205, + "learning_rate": 7.034269723466491e-07, + "loss": 1.0143, + "step": 61805 + }, + { + "epoch": 18.49, + "grad_norm": 6.538331985473633, + "learning_rate": 7.020437541320924e-07, + "loss": 0.9332, + "step": 61810 + }, + { + "epoch": 18.49, + "grad_norm": 1.1120764017105103, + "learning_rate": 7.006618778683999e-07, + "loss": 1.0328, + "step": 61815 + }, + { + "epoch": 18.5, + "grad_norm": 3.360107898712158, + "learning_rate": 6.992813436318996e-07, + "loss": 0.9667, + "step": 61820 + }, + { + "epoch": 18.5, + "grad_norm": 1.8557356595993042, + "learning_rate": 6.979021514988249e-07, + "loss": 1.0017, + "step": 61825 + }, + { + "epoch": 18.5, + "grad_norm": 2.17095947265625, + "learning_rate": 6.965243015453593e-07, + "loss": 0.9728, + "step": 61830 + }, + { + "epoch": 18.5, + "grad_norm": 2.826488971710205, + "learning_rate": 6.951477938475892e-07, + "loss": 0.9713, + "step": 61835 + }, + { + "epoch": 18.5, + "grad_norm": 2.857255220413208, + "learning_rate": 6.937726284815482e-07, + "loss": 0.9573, + "step": 61840 + }, + { + "epoch": 18.5, + "grad_norm": 6.233930587768555, + "learning_rate": 6.923988055231784e-07, + "loss": 1.2165, + "step": 61845 + }, + { + "epoch": 18.5, + "grad_norm": 4.246452808380127, + "learning_rate": 6.910263250483551e-07, + "loss": 1.1228, + "step": 61850 + }, + { + "epoch": 18.51, + "grad_norm": 2.3864681720733643, + "learning_rate": 6.896551871328788e-07, + "loss": 0.9279, + "step": 61855 + }, + { + "epoch": 18.51, + "grad_norm": 3.7490153312683105, + "learning_rate": 6.882853918524779e-07, + "loss": 0.9047, + "step": 61860 + }, + { + "epoch": 18.51, + "grad_norm": 3.5015833377838135, + "learning_rate": 6.869169392828056e-07, + "loss": 1.0886, + "step": 61865 + }, + { + "epoch": 18.51, + "grad_norm": 4.527100563049316, + "learning_rate": 6.85549829499435e-07, + "loss": 0.8575, + "step": 61870 + }, + { + "epoch": 18.51, + "grad_norm": 4.221701145172119, + "learning_rate": 6.841840625778805e-07, + "loss": 0.9333, + "step": 61875 + }, + { + "epoch": 18.51, + "grad_norm": 2.010347843170166, + "learning_rate": 6.828196385935626e-07, + "loss": 0.8162, + "step": 61880 + }, + { + "epoch": 18.52, + "grad_norm": 2.520059585571289, + "learning_rate": 6.814565576218374e-07, + "loss": 1.0076, + "step": 61885 + }, + { + "epoch": 18.52, + "grad_norm": 2.8664329051971436, + "learning_rate": 6.80094819737992e-07, + "loss": 1.0359, + "step": 61890 + }, + { + "epoch": 18.52, + "grad_norm": 3.0318400859832764, + "learning_rate": 6.787344250172273e-07, + "loss": 0.9725, + "step": 61895 + }, + { + "epoch": 18.52, + "grad_norm": 4.079439640045166, + "learning_rate": 6.773753735346805e-07, + "loss": 0.9406, + "step": 61900 + }, + { + "epoch": 18.52, + "grad_norm": 2.608128309249878, + "learning_rate": 6.76017665365411e-07, + "loss": 0.8831, + "step": 61905 + }, + { + "epoch": 18.52, + "grad_norm": 2.588148355484009, + "learning_rate": 6.746613005844033e-07, + "loss": 1.0509, + "step": 61910 + }, + { + "epoch": 18.52, + "grad_norm": 1.884019374847412, + "learning_rate": 6.733062792665695e-07, + "loss": 1.0, + "step": 61915 + }, + { + "epoch": 18.53, + "grad_norm": 3.63097882270813, + "learning_rate": 6.719526014867361e-07, + "loss": 1.114, + "step": 61920 + }, + { + "epoch": 18.53, + "grad_norm": 1.6369963884353638, + "learning_rate": 6.706002673196793e-07, + "loss": 1.0093, + "step": 61925 + }, + { + "epoch": 18.53, + "grad_norm": 3.430427312850952, + "learning_rate": 6.692492768400782e-07, + "loss": 1.1051, + "step": 61930 + }, + { + "epoch": 18.53, + "grad_norm": 1.9971526861190796, + "learning_rate": 6.67899630122551e-07, + "loss": 0.9439, + "step": 61935 + }, + { + "epoch": 18.53, + "grad_norm": 1.4598491191864014, + "learning_rate": 6.665513272416324e-07, + "loss": 0.9472, + "step": 61940 + }, + { + "epoch": 18.53, + "grad_norm": 2.7384274005889893, + "learning_rate": 6.652043682717907e-07, + "loss": 0.922, + "step": 61945 + }, + { + "epoch": 18.53, + "grad_norm": 2.7123560905456543, + "learning_rate": 6.638587532874219e-07, + "loss": 0.9753, + "step": 61950 + }, + { + "epoch": 18.54, + "grad_norm": 2.5400397777557373, + "learning_rate": 6.625144823628332e-07, + "loss": 0.7639, + "step": 61955 + }, + { + "epoch": 18.54, + "grad_norm": 4.62942361831665, + "learning_rate": 6.611715555722764e-07, + "loss": 1.0445, + "step": 61960 + }, + { + "epoch": 18.54, + "grad_norm": 2.880040168762207, + "learning_rate": 6.598299729899088e-07, + "loss": 0.9792, + "step": 61965 + }, + { + "epoch": 18.54, + "grad_norm": 2.0064916610717773, + "learning_rate": 6.584897346898405e-07, + "loss": 1.0655, + "step": 61970 + }, + { + "epoch": 18.54, + "grad_norm": 2.9998416900634766, + "learning_rate": 6.571508407460764e-07, + "loss": 0.9663, + "step": 61975 + }, + { + "epoch": 18.54, + "grad_norm": 2.280630350112915, + "learning_rate": 6.55813291232571e-07, + "loss": 1.0144, + "step": 61980 + }, + { + "epoch": 18.55, + "grad_norm": 2.611635208129883, + "learning_rate": 6.54477086223193e-07, + "loss": 1.2247, + "step": 61985 + }, + { + "epoch": 18.55, + "grad_norm": 5.889001369476318, + "learning_rate": 6.531422257917391e-07, + "loss": 1.0725, + "step": 61990 + }, + { + "epoch": 18.55, + "grad_norm": 3.5222485065460205, + "learning_rate": 6.518087100119335e-07, + "loss": 0.8588, + "step": 61995 + }, + { + "epoch": 18.55, + "grad_norm": 2.528952121734619, + "learning_rate": 6.50476538957423e-07, + "loss": 0.9481, + "step": 62000 + }, + { + "epoch": 18.55, + "grad_norm": 2.8010430335998535, + "learning_rate": 6.491457127017847e-07, + "loss": 0.8445, + "step": 62005 + }, + { + "epoch": 18.55, + "grad_norm": 1.7682600021362305, + "learning_rate": 6.478162313185182e-07, + "loss": 0.9884, + "step": 62010 + }, + { + "epoch": 18.55, + "grad_norm": 4.158107280731201, + "learning_rate": 6.464880948810453e-07, + "loss": 0.8942, + "step": 62015 + }, + { + "epoch": 18.56, + "grad_norm": 2.5172414779663086, + "learning_rate": 6.45161303462724e-07, + "loss": 0.9485, + "step": 62020 + }, + { + "epoch": 18.56, + "grad_norm": 1.1113386154174805, + "learning_rate": 6.438358571368263e-07, + "loss": 0.9776, + "step": 62025 + }, + { + "epoch": 18.56, + "grad_norm": 2.090869426727295, + "learning_rate": 6.425117559765576e-07, + "loss": 1.0483, + "step": 62030 + }, + { + "epoch": 18.56, + "grad_norm": 3.047197103500366, + "learning_rate": 6.41189000055048e-07, + "loss": 0.8976, + "step": 62035 + }, + { + "epoch": 18.56, + "grad_norm": 2.634687662124634, + "learning_rate": 6.398675894453477e-07, + "loss": 0.9284, + "step": 62040 + }, + { + "epoch": 18.56, + "grad_norm": 3.5582494735717773, + "learning_rate": 6.385475242204453e-07, + "loss": 0.8967, + "step": 62045 + }, + { + "epoch": 18.56, + "grad_norm": 5.443139553070068, + "learning_rate": 6.372288044532326e-07, + "loss": 0.9113, + "step": 62050 + }, + { + "epoch": 18.57, + "grad_norm": 1.4090752601623535, + "learning_rate": 6.35911430216557e-07, + "loss": 1.0798, + "step": 62055 + }, + { + "epoch": 18.57, + "grad_norm": 1.6593273878097534, + "learning_rate": 6.345954015831629e-07, + "loss": 0.6439, + "step": 62060 + }, + { + "epoch": 18.57, + "grad_norm": 1.541206955909729, + "learning_rate": 6.332807186257423e-07, + "loss": 0.9511, + "step": 62065 + }, + { + "epoch": 18.57, + "grad_norm": 4.009206295013428, + "learning_rate": 6.319673814168953e-07, + "loss": 0.8826, + "step": 62070 + }, + { + "epoch": 18.57, + "grad_norm": 2.005704879760742, + "learning_rate": 6.30655390029164e-07, + "loss": 0.9057, + "step": 62075 + }, + { + "epoch": 18.57, + "grad_norm": 2.7836337089538574, + "learning_rate": 6.293447445350043e-07, + "loss": 0.9222, + "step": 62080 + }, + { + "epoch": 18.58, + "grad_norm": 4.371108055114746, + "learning_rate": 6.280354450068026e-07, + "loss": 0.8937, + "step": 62085 + }, + { + "epoch": 18.58, + "grad_norm": 2.3152432441711426, + "learning_rate": 6.26727491516868e-07, + "loss": 1.1751, + "step": 62090 + }, + { + "epoch": 18.58, + "grad_norm": 2.359133720397949, + "learning_rate": 6.254208841374426e-07, + "loss": 0.9842, + "step": 62095 + }, + { + "epoch": 18.58, + "grad_norm": 2.8773648738861084, + "learning_rate": 6.241156229406825e-07, + "loss": 0.9049, + "step": 62100 + }, + { + "epoch": 18.58, + "grad_norm": 2.3904407024383545, + "learning_rate": 6.228117079986829e-07, + "loss": 0.9434, + "step": 62105 + }, + { + "epoch": 18.58, + "grad_norm": 2.5811879634857178, + "learning_rate": 6.215091393834499e-07, + "loss": 1.0067, + "step": 62110 + }, + { + "epoch": 18.58, + "grad_norm": 5.010559558868408, + "learning_rate": 6.202079171669289e-07, + "loss": 0.9329, + "step": 62115 + }, + { + "epoch": 18.59, + "grad_norm": 6.2016167640686035, + "learning_rate": 6.189080414209847e-07, + "loss": 0.8496, + "step": 62120 + }, + { + "epoch": 18.59, + "grad_norm": 1.5568153858184814, + "learning_rate": 6.176095122174041e-07, + "loss": 0.9277, + "step": 62125 + }, + { + "epoch": 18.59, + "grad_norm": 1.9093276262283325, + "learning_rate": 6.163123296279077e-07, + "loss": 0.9473, + "step": 62130 + }, + { + "epoch": 18.59, + "grad_norm": 7.955539703369141, + "learning_rate": 6.150164937241354e-07, + "loss": 0.876, + "step": 62135 + }, + { + "epoch": 18.59, + "grad_norm": 2.9364593029022217, + "learning_rate": 6.137220045776604e-07, + "loss": 0.8387, + "step": 62140 + }, + { + "epoch": 18.59, + "grad_norm": 3.2200205326080322, + "learning_rate": 6.124288622599617e-07, + "loss": 0.9952, + "step": 62145 + }, + { + "epoch": 18.59, + "grad_norm": 2.081777572631836, + "learning_rate": 6.111370668424765e-07, + "loss": 0.8266, + "step": 62150 + }, + { + "epoch": 18.6, + "grad_norm": 2.223055601119995, + "learning_rate": 6.098466183965313e-07, + "loss": 0.9303, + "step": 62155 + }, + { + "epoch": 18.6, + "grad_norm": 1.2533679008483887, + "learning_rate": 6.085575169934132e-07, + "loss": 0.949, + "step": 62160 + }, + { + "epoch": 18.6, + "grad_norm": 2.873736619949341, + "learning_rate": 6.072697627043045e-07, + "loss": 1.0112, + "step": 62165 + }, + { + "epoch": 18.6, + "grad_norm": 1.682570219039917, + "learning_rate": 6.059833556003341e-07, + "loss": 1.0301, + "step": 62170 + }, + { + "epoch": 18.6, + "grad_norm": 4.487571716308594, + "learning_rate": 6.046982957525482e-07, + "loss": 1.1361, + "step": 62175 + }, + { + "epoch": 18.6, + "grad_norm": 4.013219833374023, + "learning_rate": 6.03414583231915e-07, + "loss": 0.8556, + "step": 62180 + }, + { + "epoch": 18.61, + "grad_norm": 3.0844905376434326, + "learning_rate": 6.021322181093386e-07, + "loss": 0.9482, + "step": 62185 + }, + { + "epoch": 18.61, + "grad_norm": 3.159336805343628, + "learning_rate": 6.008512004556377e-07, + "loss": 0.9925, + "step": 62190 + }, + { + "epoch": 18.61, + "grad_norm": 2.8269448280334473, + "learning_rate": 5.995715303415639e-07, + "loss": 1.0819, + "step": 62195 + }, + { + "epoch": 18.61, + "grad_norm": 3.574028730392456, + "learning_rate": 5.982932078377912e-07, + "loss": 0.9489, + "step": 62200 + }, + { + "epoch": 18.61, + "grad_norm": 2.7778759002685547, + "learning_rate": 5.970162330149187e-07, + "loss": 0.9698, + "step": 62205 + }, + { + "epoch": 18.61, + "grad_norm": 2.205036163330078, + "learning_rate": 5.957406059434761e-07, + "loss": 0.9544, + "step": 62210 + }, + { + "epoch": 18.61, + "grad_norm": 2.8578882217407227, + "learning_rate": 5.944663266939127e-07, + "loss": 0.7805, + "step": 62215 + }, + { + "epoch": 18.62, + "grad_norm": 2.122692346572876, + "learning_rate": 5.931933953366081e-07, + "loss": 0.9865, + "step": 62220 + }, + { + "epoch": 18.62, + "grad_norm": 1.806129813194275, + "learning_rate": 5.919218119418591e-07, + "loss": 0.909, + "step": 62225 + }, + { + "epoch": 18.62, + "grad_norm": 3.1925947666168213, + "learning_rate": 5.906515765799009e-07, + "loss": 0.97, + "step": 62230 + }, + { + "epoch": 18.62, + "grad_norm": 2.054266929626465, + "learning_rate": 5.89382689320886e-07, + "loss": 0.9196, + "step": 62235 + }, + { + "epoch": 18.62, + "grad_norm": 3.7723171710968018, + "learning_rate": 5.881151502348859e-07, + "loss": 1.0202, + "step": 62240 + }, + { + "epoch": 18.62, + "grad_norm": 1.945148229598999, + "learning_rate": 5.868489593919141e-07, + "loss": 1.1337, + "step": 62245 + }, + { + "epoch": 18.62, + "grad_norm": 2.7918272018432617, + "learning_rate": 5.855841168618953e-07, + "loss": 0.9992, + "step": 62250 + }, + { + "epoch": 18.63, + "grad_norm": 4.473152160644531, + "learning_rate": 5.843206227146958e-07, + "loss": 0.9585, + "step": 62255 + }, + { + "epoch": 18.63, + "grad_norm": 3.2821218967437744, + "learning_rate": 5.83058477020082e-07, + "loss": 0.8887, + "step": 62260 + }, + { + "epoch": 18.63, + "grad_norm": 3.8282477855682373, + "learning_rate": 5.817976798477731e-07, + "loss": 0.7917, + "step": 62265 + }, + { + "epoch": 18.63, + "grad_norm": 3.6935458183288574, + "learning_rate": 5.805382312673941e-07, + "loss": 0.82, + "step": 62270 + }, + { + "epoch": 18.63, + "grad_norm": 1.9937669038772583, + "learning_rate": 5.792801313485086e-07, + "loss": 1.113, + "step": 62275 + }, + { + "epoch": 18.63, + "grad_norm": 1.7843409776687622, + "learning_rate": 5.780233801605944e-07, + "loss": 0.9841, + "step": 62280 + }, + { + "epoch": 18.63, + "grad_norm": 2.7850582599639893, + "learning_rate": 5.767679777730656e-07, + "loss": 1.0303, + "step": 62285 + }, + { + "epoch": 18.64, + "grad_norm": 3.4673726558685303, + "learning_rate": 5.755139242552554e-07, + "loss": 0.9193, + "step": 62290 + }, + { + "epoch": 18.64, + "grad_norm": 1.380431890487671, + "learning_rate": 5.742612196764224e-07, + "loss": 0.7912, + "step": 62295 + }, + { + "epoch": 18.64, + "grad_norm": 2.402841567993164, + "learning_rate": 5.73009864105753e-07, + "loss": 0.8712, + "step": 62300 + }, + { + "epoch": 18.64, + "grad_norm": 3.139500379562378, + "learning_rate": 5.717598576123584e-07, + "loss": 0.9437, + "step": 62305 + }, + { + "epoch": 18.64, + "grad_norm": 2.209512233734131, + "learning_rate": 5.705112002652752e-07, + "loss": 0.9299, + "step": 62310 + }, + { + "epoch": 18.64, + "grad_norm": 2.3451426029205322, + "learning_rate": 5.692638921334647e-07, + "loss": 0.9714, + "step": 62315 + }, + { + "epoch": 18.65, + "grad_norm": 3.0532517433166504, + "learning_rate": 5.680179332858165e-07, + "loss": 1.1268, + "step": 62320 + }, + { + "epoch": 18.65, + "grad_norm": 6.812263488769531, + "learning_rate": 5.667733237911421e-07, + "loss": 0.786, + "step": 62325 + }, + { + "epoch": 18.65, + "grad_norm": 2.6777634620666504, + "learning_rate": 5.655300637181809e-07, + "loss": 0.8496, + "step": 62330 + }, + { + "epoch": 18.65, + "grad_norm": 1.7131210565567017, + "learning_rate": 5.64288153135592e-07, + "loss": 1.0718, + "step": 62335 + }, + { + "epoch": 18.65, + "grad_norm": 3.845153570175171, + "learning_rate": 5.63047592111976e-07, + "loss": 0.7881, + "step": 62340 + }, + { + "epoch": 18.65, + "grad_norm": 2.2182199954986572, + "learning_rate": 5.618083807158337e-07, + "loss": 0.9474, + "step": 62345 + }, + { + "epoch": 18.65, + "grad_norm": 7.1650390625, + "learning_rate": 5.605705190156158e-07, + "loss": 1.0506, + "step": 62350 + }, + { + "epoch": 18.66, + "grad_norm": 4.3074259757995605, + "learning_rate": 5.593340070796843e-07, + "loss": 1.0749, + "step": 62355 + }, + { + "epoch": 18.66, + "grad_norm": 5.269190311431885, + "learning_rate": 5.580988449763319e-07, + "loss": 0.986, + "step": 62360 + }, + { + "epoch": 18.66, + "grad_norm": 1.8356404304504395, + "learning_rate": 5.568650327737734e-07, + "loss": 0.9098, + "step": 62365 + }, + { + "epoch": 18.66, + "grad_norm": 3.584327459335327, + "learning_rate": 5.556325705401517e-07, + "loss": 1.004, + "step": 62370 + }, + { + "epoch": 18.66, + "grad_norm": 2.812471628189087, + "learning_rate": 5.544014583435342e-07, + "loss": 0.9846, + "step": 62375 + }, + { + "epoch": 18.66, + "grad_norm": 1.4029104709625244, + "learning_rate": 5.53171696251914e-07, + "loss": 0.8815, + "step": 62380 + }, + { + "epoch": 18.66, + "grad_norm": 2.412611722946167, + "learning_rate": 5.519432843332117e-07, + "loss": 1.0807, + "step": 62385 + }, + { + "epoch": 18.67, + "grad_norm": 2.5085036754608154, + "learning_rate": 5.507162226552676e-07, + "loss": 1.0554, + "step": 62390 + }, + { + "epoch": 18.67, + "grad_norm": 2.622960090637207, + "learning_rate": 5.494905112858551e-07, + "loss": 1.0882, + "step": 62395 + }, + { + "epoch": 18.67, + "grad_norm": 7.016479015350342, + "learning_rate": 5.482661502926645e-07, + "loss": 0.971, + "step": 62400 + }, + { + "epoch": 18.67, + "grad_norm": 2.4067859649658203, + "learning_rate": 5.470431397433196e-07, + "loss": 0.943, + "step": 62405 + }, + { + "epoch": 18.67, + "grad_norm": 1.8276368379592896, + "learning_rate": 5.458214797053634e-07, + "loss": 0.8402, + "step": 62410 + }, + { + "epoch": 18.67, + "grad_norm": 3.209845781326294, + "learning_rate": 5.4460117024627e-07, + "loss": 1.0835, + "step": 62415 + }, + { + "epoch": 18.68, + "grad_norm": 3.5742673873901367, + "learning_rate": 5.433822114334325e-07, + "loss": 0.9627, + "step": 62420 + }, + { + "epoch": 18.68, + "grad_norm": 3.4893949031829834, + "learning_rate": 5.421646033341748e-07, + "loss": 1.057, + "step": 62425 + }, + { + "epoch": 18.68, + "grad_norm": 3.8481268882751465, + "learning_rate": 5.409483460157433e-07, + "loss": 1.0174, + "step": 62430 + }, + { + "epoch": 18.68, + "grad_norm": 1.257150650024414, + "learning_rate": 5.397334395453147e-07, + "loss": 0.8821, + "step": 62435 + }, + { + "epoch": 18.68, + "grad_norm": 2.7487339973449707, + "learning_rate": 5.385198839899769e-07, + "loss": 0.9345, + "step": 62440 + }, + { + "epoch": 18.68, + "grad_norm": 1.8233064413070679, + "learning_rate": 5.373076794167653e-07, + "loss": 0.8765, + "step": 62445 + }, + { + "epoch": 18.68, + "grad_norm": 3.476785898208618, + "learning_rate": 5.360968258926152e-07, + "loss": 0.978, + "step": 62450 + }, + { + "epoch": 18.69, + "grad_norm": 1.5684432983398438, + "learning_rate": 5.348873234844176e-07, + "loss": 0.8142, + "step": 62455 + }, + { + "epoch": 18.69, + "grad_norm": 4.222447395324707, + "learning_rate": 5.336791722589579e-07, + "loss": 0.9887, + "step": 62460 + }, + { + "epoch": 18.69, + "grad_norm": 3.2067711353302, + "learning_rate": 5.324723722829661e-07, + "loss": 1.0059, + "step": 62465 + }, + { + "epoch": 18.69, + "grad_norm": 1.711020827293396, + "learning_rate": 5.312669236230944e-07, + "loss": 1.1326, + "step": 62470 + }, + { + "epoch": 18.69, + "grad_norm": 5.746004581451416, + "learning_rate": 5.300628263459173e-07, + "loss": 0.8101, + "step": 62475 + }, + { + "epoch": 18.69, + "grad_norm": 3.5213847160339355, + "learning_rate": 5.288600805179345e-07, + "loss": 0.9395, + "step": 62480 + }, + { + "epoch": 18.69, + "grad_norm": 3.3688313961029053, + "learning_rate": 5.276586862055704e-07, + "loss": 0.8759, + "step": 62485 + }, + { + "epoch": 18.7, + "grad_norm": 1.9452404975891113, + "learning_rate": 5.264586434751833e-07, + "loss": 0.9038, + "step": 62490 + }, + { + "epoch": 18.7, + "grad_norm": 2.7685489654541016, + "learning_rate": 5.252599523930452e-07, + "loss": 0.8489, + "step": 62495 + }, + { + "epoch": 18.7, + "grad_norm": 2.2779135704040527, + "learning_rate": 5.240626130253585e-07, + "loss": 1.0238, + "step": 62500 + }, + { + "epoch": 18.7, + "grad_norm": 2.2392897605895996, + "learning_rate": 5.228666254382536e-07, + "loss": 1.1407, + "step": 62505 + }, + { + "epoch": 18.7, + "grad_norm": 4.152332782745361, + "learning_rate": 5.216719896977806e-07, + "loss": 0.7601, + "step": 62510 + }, + { + "epoch": 18.7, + "grad_norm": 2.4010398387908936, + "learning_rate": 5.204787058699228e-07, + "loss": 0.6884, + "step": 62515 + }, + { + "epoch": 18.71, + "grad_norm": 3.122080087661743, + "learning_rate": 5.192867740205775e-07, + "loss": 1.0505, + "step": 62520 + }, + { + "epoch": 18.71, + "grad_norm": 4.57648229598999, + "learning_rate": 5.18096194215581e-07, + "loss": 0.9048, + "step": 62525 + }, + { + "epoch": 18.71, + "grad_norm": 5.7991414070129395, + "learning_rate": 5.169069665206833e-07, + "loss": 0.8917, + "step": 62530 + }, + { + "epoch": 18.71, + "grad_norm": 1.6808985471725464, + "learning_rate": 5.157190910015625e-07, + "loss": 0.9275, + "step": 62535 + }, + { + "epoch": 18.71, + "grad_norm": 2.2939209938049316, + "learning_rate": 5.145325677238327e-07, + "loss": 1.0068, + "step": 62540 + }, + { + "epoch": 18.71, + "grad_norm": 2.1694259643554688, + "learning_rate": 5.133473967530111e-07, + "loss": 1.0079, + "step": 62545 + }, + { + "epoch": 18.71, + "grad_norm": 3.0563623905181885, + "learning_rate": 5.121635781545647e-07, + "loss": 0.9808, + "step": 62550 + }, + { + "epoch": 18.72, + "grad_norm": 2.7795627117156982, + "learning_rate": 5.109811119938663e-07, + "loss": 0.9278, + "step": 62555 + }, + { + "epoch": 18.72, + "grad_norm": 2.1813108921051025, + "learning_rate": 5.0979999833623e-07, + "loss": 0.973, + "step": 62560 + }, + { + "epoch": 18.72, + "grad_norm": 1.735756754875183, + "learning_rate": 5.086202372468818e-07, + "loss": 1.0072, + "step": 62565 + }, + { + "epoch": 18.72, + "grad_norm": 1.2512030601501465, + "learning_rate": 5.074418287909804e-07, + "loss": 1.0268, + "step": 62570 + }, + { + "epoch": 18.72, + "grad_norm": 6.556924343109131, + "learning_rate": 5.062647730336073e-07, + "loss": 0.9008, + "step": 62575 + }, + { + "epoch": 18.72, + "grad_norm": 2.471876382827759, + "learning_rate": 5.050890700397715e-07, + "loss": 0.7419, + "step": 62580 + }, + { + "epoch": 18.72, + "grad_norm": 2.7054262161254883, + "learning_rate": 5.039147198744071e-07, + "loss": 1.0026, + "step": 62585 + }, + { + "epoch": 18.73, + "grad_norm": 3.4873416423797607, + "learning_rate": 5.027417226023678e-07, + "loss": 1.0207, + "step": 62590 + }, + { + "epoch": 18.73, + "grad_norm": 2.8062655925750732, + "learning_rate": 5.015700782884408e-07, + "loss": 0.9565, + "step": 62595 + }, + { + "epoch": 18.73, + "grad_norm": 1.282105565071106, + "learning_rate": 5.003997869973353e-07, + "loss": 1.0715, + "step": 62600 + }, + { + "epoch": 18.73, + "grad_norm": 2.5329504013061523, + "learning_rate": 4.992308487936804e-07, + "loss": 1.0151, + "step": 62605 + }, + { + "epoch": 18.73, + "grad_norm": 4.997149467468262, + "learning_rate": 4.980632637420408e-07, + "loss": 0.9544, + "step": 62610 + }, + { + "epoch": 18.73, + "grad_norm": 2.9110753536224365, + "learning_rate": 4.968970319068983e-07, + "loss": 0.8164, + "step": 62615 + }, + { + "epoch": 18.74, + "grad_norm": 2.0976929664611816, + "learning_rate": 4.957321533526626e-07, + "loss": 0.878, + "step": 62620 + }, + { + "epoch": 18.74, + "grad_norm": 3.1105329990386963, + "learning_rate": 4.94568628143674e-07, + "loss": 1.0102, + "step": 62625 + }, + { + "epoch": 18.74, + "grad_norm": 2.1142258644104004, + "learning_rate": 4.93406456344181e-07, + "loss": 0.9267, + "step": 62630 + }, + { + "epoch": 18.74, + "grad_norm": 2.38909912109375, + "learning_rate": 4.922456380183821e-07, + "loss": 0.8664, + "step": 62635 + }, + { + "epoch": 18.74, + "grad_norm": 3.103832721710205, + "learning_rate": 4.910861732303762e-07, + "loss": 0.9541, + "step": 62640 + }, + { + "epoch": 18.74, + "grad_norm": 3.287440299987793, + "learning_rate": 4.899280620442121e-07, + "loss": 0.986, + "step": 62645 + }, + { + "epoch": 18.74, + "grad_norm": 3.8672590255737305, + "learning_rate": 4.887713045238385e-07, + "loss": 0.9699, + "step": 62650 + }, + { + "epoch": 18.75, + "grad_norm": 2.768521785736084, + "learning_rate": 4.876159007331516e-07, + "loss": 0.9764, + "step": 62655 + }, + { + "epoch": 18.75, + "grad_norm": 1.9124183654785156, + "learning_rate": 4.864618507359558e-07, + "loss": 0.925, + "step": 62660 + }, + { + "epoch": 18.75, + "grad_norm": 6.206687927246094, + "learning_rate": 4.853091545959948e-07, + "loss": 1.0686, + "step": 62665 + }, + { + "epoch": 18.75, + "grad_norm": 1.3544474840164185, + "learning_rate": 4.841578123769258e-07, + "loss": 0.8745, + "step": 62670 + }, + { + "epoch": 18.75, + "grad_norm": 2.0739264488220215, + "learning_rate": 4.830078241423369e-07, + "loss": 0.8589, + "step": 62675 + }, + { + "epoch": 18.75, + "grad_norm": 2.00089430809021, + "learning_rate": 4.818591899557467e-07, + "loss": 0.908, + "step": 62680 + }, + { + "epoch": 18.75, + "grad_norm": 2.7553093433380127, + "learning_rate": 4.807119098805851e-07, + "loss": 1.0617, + "step": 62685 + }, + { + "epoch": 18.76, + "grad_norm": 1.7284417152404785, + "learning_rate": 4.795659839802181e-07, + "loss": 0.952, + "step": 62690 + }, + { + "epoch": 18.76, + "grad_norm": 2.9970293045043945, + "learning_rate": 4.784214123179337e-07, + "loss": 0.878, + "step": 62695 + }, + { + "epoch": 18.76, + "grad_norm": 2.638425350189209, + "learning_rate": 4.772781949569455e-07, + "loss": 0.9725, + "step": 62700 + }, + { + "epoch": 18.76, + "grad_norm": 2.378786087036133, + "learning_rate": 4.7613633196039455e-07, + "loss": 0.912, + "step": 62705 + }, + { + "epoch": 18.76, + "grad_norm": 2.5562052726745605, + "learning_rate": 4.749958233913415e-07, + "loss": 0.9821, + "step": 62710 + }, + { + "epoch": 18.76, + "grad_norm": 4.997159957885742, + "learning_rate": 4.7385666931277774e-07, + "loss": 0.998, + "step": 62715 + }, + { + "epoch": 18.77, + "grad_norm": 2.2791659832000732, + "learning_rate": 4.727188697876195e-07, + "loss": 1.0484, + "step": 62720 + }, + { + "epoch": 18.77, + "grad_norm": 3.7600674629211426, + "learning_rate": 4.715824248786971e-07, + "loss": 0.9918, + "step": 62725 + }, + { + "epoch": 18.77, + "grad_norm": 5.7446417808532715, + "learning_rate": 4.704473346487881e-07, + "loss": 0.8191, + "step": 62730 + }, + { + "epoch": 18.77, + "grad_norm": 2.3552982807159424, + "learning_rate": 4.693135991605702e-07, + "loss": 1.0591, + "step": 62735 + }, + { + "epoch": 18.77, + "grad_norm": 2.88034987449646, + "learning_rate": 4.681812184766682e-07, + "loss": 1.1121, + "step": 62740 + }, + { + "epoch": 18.77, + "grad_norm": 1.8043874502182007, + "learning_rate": 4.670501926596127e-07, + "loss": 1.0824, + "step": 62745 + }, + { + "epoch": 18.77, + "grad_norm": 2.6473546028137207, + "learning_rate": 4.659205217718815e-07, + "loss": 0.866, + "step": 62750 + }, + { + "epoch": 18.78, + "grad_norm": 2.442850351333618, + "learning_rate": 4.647922058758525e-07, + "loss": 0.9233, + "step": 62755 + }, + { + "epoch": 18.78, + "grad_norm": 2.945636510848999, + "learning_rate": 4.63665245033848e-07, + "loss": 0.8995, + "step": 62760 + }, + { + "epoch": 18.78, + "grad_norm": 2.816267490386963, + "learning_rate": 4.6253963930810995e-07, + "loss": 0.9573, + "step": 62765 + }, + { + "epoch": 18.78, + "grad_norm": 3.510456085205078, + "learning_rate": 4.614153887607997e-07, + "loss": 1.0292, + "step": 62770 + }, + { + "epoch": 18.78, + "grad_norm": 1.8083337545394897, + "learning_rate": 4.6029249345401484e-07, + "loss": 0.8119, + "step": 62775 + }, + { + "epoch": 18.78, + "grad_norm": 3.959989309310913, + "learning_rate": 4.5917095344976403e-07, + "loss": 0.9996, + "step": 62780 + }, + { + "epoch": 18.78, + "grad_norm": 6.885543346405029, + "learning_rate": 4.58050768809995e-07, + "loss": 0.9633, + "step": 62785 + }, + { + "epoch": 18.79, + "grad_norm": 2.3350930213928223, + "learning_rate": 4.569319395965721e-07, + "loss": 1.0368, + "step": 62790 + }, + { + "epoch": 18.79, + "grad_norm": 4.399080276489258, + "learning_rate": 4.5581446587128763e-07, + "loss": 1.014, + "step": 62795 + }, + { + "epoch": 18.79, + "grad_norm": 1.9063431024551392, + "learning_rate": 4.5469834769585604e-07, + "loss": 1.0557, + "step": 62800 + }, + { + "epoch": 18.79, + "grad_norm": 1.7454651594161987, + "learning_rate": 4.5358358513192257e-07, + "loss": 1.1915, + "step": 62805 + }, + { + "epoch": 18.79, + "grad_norm": 2.7510390281677246, + "learning_rate": 4.5247017824105174e-07, + "loss": 0.9034, + "step": 62810 + }, + { + "epoch": 18.79, + "grad_norm": 4.9659600257873535, + "learning_rate": 4.513581270847389e-07, + "loss": 0.7884, + "step": 62815 + }, + { + "epoch": 18.8, + "grad_norm": 4.02251672744751, + "learning_rate": 4.5024743172439877e-07, + "loss": 0.8928, + "step": 62820 + }, + { + "epoch": 18.8, + "grad_norm": 4.709519863128662, + "learning_rate": 4.491380922213767e-07, + "loss": 0.9039, + "step": 62825 + }, + { + "epoch": 18.8, + "grad_norm": 1.9701640605926514, + "learning_rate": 4.4803010863693483e-07, + "loss": 0.937, + "step": 62830 + }, + { + "epoch": 18.8, + "grad_norm": 2.070518970489502, + "learning_rate": 4.4692348103227143e-07, + "loss": 0.9715, + "step": 62835 + }, + { + "epoch": 18.8, + "grad_norm": 1.8517335653305054, + "learning_rate": 4.4581820946850427e-07, + "loss": 0.9508, + "step": 62840 + }, + { + "epoch": 18.8, + "grad_norm": 1.6975722312927246, + "learning_rate": 4.4471429400667343e-07, + "loss": 1.0122, + "step": 62845 + }, + { + "epoch": 18.8, + "grad_norm": 2.018352746963501, + "learning_rate": 4.436117347077495e-07, + "loss": 1.058, + "step": 62850 + }, + { + "epoch": 18.81, + "grad_norm": 4.113342761993408, + "learning_rate": 4.425105316326228e-07, + "loss": 0.9022, + "step": 62855 + }, + { + "epoch": 18.81, + "grad_norm": 3.3095271587371826, + "learning_rate": 4.414106848421168e-07, + "loss": 1.0346, + "step": 62860 + }, + { + "epoch": 18.81, + "grad_norm": 3.63959002494812, + "learning_rate": 4.4031219439696625e-07, + "loss": 0.9892, + "step": 62865 + }, + { + "epoch": 18.81, + "grad_norm": 4.62831974029541, + "learning_rate": 4.3921506035784766e-07, + "loss": 0.9858, + "step": 62870 + }, + { + "epoch": 18.81, + "grad_norm": 3.9106874465942383, + "learning_rate": 4.381192827853514e-07, + "loss": 0.943, + "step": 62875 + }, + { + "epoch": 18.81, + "grad_norm": 3.417293071746826, + "learning_rate": 4.3702486173999856e-07, + "loss": 0.8616, + "step": 62880 + }, + { + "epoch": 18.81, + "grad_norm": 2.981015682220459, + "learning_rate": 4.359317972822269e-07, + "loss": 1.0777, + "step": 62885 + }, + { + "epoch": 18.82, + "grad_norm": 9.559314727783203, + "learning_rate": 4.3484008947241304e-07, + "loss": 0.9715, + "step": 62890 + }, + { + "epoch": 18.82, + "grad_norm": 2.853968858718872, + "learning_rate": 4.33749738370845e-07, + "loss": 1.0899, + "step": 62895 + }, + { + "epoch": 18.82, + "grad_norm": 2.9519448280334473, + "learning_rate": 4.326607440377439e-07, + "loss": 0.9593, + "step": 62900 + }, + { + "epoch": 18.82, + "grad_norm": 3.299821376800537, + "learning_rate": 4.3157310653325344e-07, + "loss": 0.9734, + "step": 62905 + }, + { + "epoch": 18.82, + "grad_norm": 2.5960991382598877, + "learning_rate": 4.3048682591744495e-07, + "loss": 1.0205, + "step": 62910 + }, + { + "epoch": 18.82, + "grad_norm": 1.4669679403305054, + "learning_rate": 4.294019022503037e-07, + "loss": 0.9927, + "step": 62915 + }, + { + "epoch": 18.82, + "grad_norm": 3.0329182147979736, + "learning_rate": 4.283183355917625e-07, + "loss": 1.1614, + "step": 62920 + }, + { + "epoch": 18.83, + "grad_norm": 3.468059778213501, + "learning_rate": 4.272361260016566e-07, + "loss": 0.7337, + "step": 62925 + }, + { + "epoch": 18.83, + "grad_norm": 8.143178939819336, + "learning_rate": 4.2615527353975783e-07, + "loss": 0.833, + "step": 62930 + }, + { + "epoch": 18.83, + "grad_norm": 1.7140194177627563, + "learning_rate": 4.2507577826575995e-07, + "loss": 0.9572, + "step": 62935 + }, + { + "epoch": 18.83, + "grad_norm": 2.0888423919677734, + "learning_rate": 4.23997640239282e-07, + "loss": 1.0221, + "step": 62940 + }, + { + "epoch": 18.83, + "grad_norm": 2.820460319519043, + "learning_rate": 4.2292085951987084e-07, + "loss": 0.8878, + "step": 62945 + }, + { + "epoch": 18.83, + "grad_norm": 4.351651668548584, + "learning_rate": 4.218454361669899e-07, + "loss": 0.8856, + "step": 62950 + }, + { + "epoch": 18.84, + "grad_norm": 1.5819222927093506, + "learning_rate": 4.2077137024004455e-07, + "loss": 1.04, + "step": 62955 + }, + { + "epoch": 18.84, + "grad_norm": 3.579989194869995, + "learning_rate": 4.1969866179834284e-07, + "loss": 1.0235, + "step": 62960 + }, + { + "epoch": 18.84, + "grad_norm": 2.6302943229675293, + "learning_rate": 4.1862731090113736e-07, + "loss": 0.9574, + "step": 62965 + }, + { + "epoch": 18.84, + "grad_norm": 4.829039573669434, + "learning_rate": 4.175573176075892e-07, + "loss": 1.0367, + "step": 62970 + }, + { + "epoch": 18.84, + "grad_norm": 3.0657598972320557, + "learning_rate": 4.164886819768038e-07, + "loss": 0.855, + "step": 62975 + }, + { + "epoch": 18.84, + "grad_norm": 3.936453342437744, + "learning_rate": 4.1542140406779504e-07, + "loss": 1.1114, + "step": 62980 + }, + { + "epoch": 18.84, + "grad_norm": 3.4842371940612793, + "learning_rate": 4.1435548393950474e-07, + "loss": 0.9725, + "step": 62985 + }, + { + "epoch": 18.85, + "grad_norm": 2.537079334259033, + "learning_rate": 4.1329092165080794e-07, + "loss": 0.9701, + "step": 62990 + }, + { + "epoch": 18.85, + "grad_norm": 2.407586097717285, + "learning_rate": 4.122277172604966e-07, + "loss": 0.8825, + "step": 62995 + }, + { + "epoch": 18.85, + "grad_norm": 1.6151124238967896, + "learning_rate": 4.111658708272903e-07, + "loss": 1.1248, + "step": 63000 + }, + { + "epoch": 18.85, + "grad_norm": 3.798203706741333, + "learning_rate": 4.1010538240983667e-07, + "loss": 1.0407, + "step": 63005 + }, + { + "epoch": 18.85, + "grad_norm": 3.2758753299713135, + "learning_rate": 4.090462520666999e-07, + "loss": 1.033, + "step": 63010 + }, + { + "epoch": 18.85, + "grad_norm": 3.0684893131256104, + "learning_rate": 4.0798847985637766e-07, + "loss": 0.8769, + "step": 63015 + }, + { + "epoch": 18.85, + "grad_norm": 2.2949230670928955, + "learning_rate": 4.0693206583729273e-07, + "loss": 1.0484, + "step": 63020 + }, + { + "epoch": 18.86, + "grad_norm": 3.716773271560669, + "learning_rate": 4.0587701006778443e-07, + "loss": 1.0033, + "step": 63025 + }, + { + "epoch": 18.86, + "grad_norm": 1.4224724769592285, + "learning_rate": 4.048233126061257e-07, + "loss": 0.9085, + "step": 63030 + }, + { + "epoch": 18.86, + "grad_norm": 1.496864676475525, + "learning_rate": 4.037709735105089e-07, + "loss": 0.8798, + "step": 63035 + }, + { + "epoch": 18.86, + "grad_norm": 3.2750120162963867, + "learning_rate": 4.0271999283905683e-07, + "loss": 0.954, + "step": 63040 + }, + { + "epoch": 18.86, + "grad_norm": 3.3160552978515625, + "learning_rate": 4.0167037064980925e-07, + "loss": 0.9068, + "step": 63045 + }, + { + "epoch": 18.86, + "grad_norm": 4.469260215759277, + "learning_rate": 4.006221070007421e-07, + "loss": 0.8265, + "step": 63050 + }, + { + "epoch": 18.87, + "grad_norm": 2.6699278354644775, + "learning_rate": 3.9957520194974505e-07, + "loss": 1.0278, + "step": 63055 + }, + { + "epoch": 18.87, + "grad_norm": 2.068089485168457, + "learning_rate": 3.9852965555463863e-07, + "loss": 0.9476, + "step": 63060 + }, + { + "epoch": 18.87, + "grad_norm": 1.7394936084747314, + "learning_rate": 3.974854678731654e-07, + "loss": 0.9202, + "step": 63065 + }, + { + "epoch": 18.87, + "grad_norm": 5.004978656768799, + "learning_rate": 3.964426389630016e-07, + "loss": 1.0277, + "step": 63070 + }, + { + "epoch": 18.87, + "grad_norm": 2.368013620376587, + "learning_rate": 3.954011688817344e-07, + "loss": 0.9778, + "step": 63075 + }, + { + "epoch": 18.87, + "grad_norm": 1.3884702920913696, + "learning_rate": 3.943610576868845e-07, + "loss": 0.8678, + "step": 63080 + }, + { + "epoch": 18.87, + "grad_norm": 1.4735065698623657, + "learning_rate": 3.9332230543589753e-07, + "loss": 1.0603, + "step": 63085 + }, + { + "epoch": 18.88, + "grad_norm": 2.9235053062438965, + "learning_rate": 3.922849121861416e-07, + "loss": 0.9802, + "step": 63090 + }, + { + "epoch": 18.88, + "grad_norm": 1.8659436702728271, + "learning_rate": 3.912488779949153e-07, + "loss": 1.0448, + "step": 63095 + }, + { + "epoch": 18.88, + "grad_norm": 3.8090038299560547, + "learning_rate": 3.902142029194311e-07, + "loss": 0.9592, + "step": 63100 + }, + { + "epoch": 18.88, + "grad_norm": 2.0518362522125244, + "learning_rate": 3.891808870168351e-07, + "loss": 0.8467, + "step": 63105 + }, + { + "epoch": 18.88, + "grad_norm": 2.6214382648468018, + "learning_rate": 3.8814893034420097e-07, + "loss": 0.9282, + "step": 63110 + }, + { + "epoch": 18.88, + "grad_norm": 4.084232807159424, + "learning_rate": 3.8711833295851654e-07, + "loss": 0.9546, + "step": 63115 + }, + { + "epoch": 18.88, + "grad_norm": 1.4593278169631958, + "learning_rate": 3.8608909491670287e-07, + "loss": 0.803, + "step": 63120 + }, + { + "epoch": 18.89, + "grad_norm": 3.1608002185821533, + "learning_rate": 3.8506121627560345e-07, + "loss": 0.9568, + "step": 63125 + }, + { + "epoch": 18.89, + "grad_norm": 2.8863463401794434, + "learning_rate": 3.840346970919867e-07, + "loss": 1.0497, + "step": 63130 + }, + { + "epoch": 18.89, + "grad_norm": 1.954045057296753, + "learning_rate": 3.8300953742254895e-07, + "loss": 0.9635, + "step": 63135 + }, + { + "epoch": 18.89, + "grad_norm": 2.1377789974212646, + "learning_rate": 3.819857373239033e-07, + "loss": 0.7992, + "step": 63140 + }, + { + "epoch": 18.89, + "grad_norm": 2.493187665939331, + "learning_rate": 3.809632968526017e-07, + "loss": 1.0234, + "step": 63145 + }, + { + "epoch": 18.89, + "grad_norm": 2.3102526664733887, + "learning_rate": 3.799422160651017e-07, + "loss": 0.8951, + "step": 63150 + }, + { + "epoch": 18.9, + "grad_norm": 2.773439645767212, + "learning_rate": 3.7892249501780554e-07, + "loss": 0.8641, + "step": 63155 + }, + { + "epoch": 18.9, + "grad_norm": 2.822577953338623, + "learning_rate": 3.779041337670236e-07, + "loss": 0.9386, + "step": 63160 + }, + { + "epoch": 18.9, + "grad_norm": 3.764552116394043, + "learning_rate": 3.7688713236900816e-07, + "loss": 0.9324, + "step": 63165 + }, + { + "epoch": 18.9, + "grad_norm": 2.5591769218444824, + "learning_rate": 3.758714908799199e-07, + "loss": 0.925, + "step": 63170 + }, + { + "epoch": 18.9, + "grad_norm": 1.3700811862945557, + "learning_rate": 3.7485720935585553e-07, + "loss": 0.8529, + "step": 63175 + }, + { + "epoch": 18.9, + "grad_norm": 2.099714517593384, + "learning_rate": 3.7384428785282867e-07, + "loss": 0.9949, + "step": 63180 + }, + { + "epoch": 18.9, + "grad_norm": 3.3494229316711426, + "learning_rate": 3.7283272642678335e-07, + "loss": 0.7835, + "step": 63185 + }, + { + "epoch": 18.91, + "grad_norm": 2.3654308319091797, + "learning_rate": 3.718225251335916e-07, + "loss": 0.9557, + "step": 63190 + }, + { + "epoch": 18.91, + "grad_norm": 3.252896547317505, + "learning_rate": 3.7081368402903935e-07, + "loss": 1.0649, + "step": 63195 + }, + { + "epoch": 18.91, + "grad_norm": 2.0601794719696045, + "learning_rate": 3.6980620316884876e-07, + "loss": 0.9137, + "step": 63200 + }, + { + "epoch": 18.91, + "grad_norm": 3.0151379108428955, + "learning_rate": 3.688000826086585e-07, + "loss": 1.0903, + "step": 63205 + }, + { + "epoch": 18.91, + "grad_norm": 2.6133904457092285, + "learning_rate": 3.6779532240403537e-07, + "loss": 1.0124, + "step": 63210 + }, + { + "epoch": 18.91, + "grad_norm": 5.491546154022217, + "learning_rate": 3.6679192261047655e-07, + "loss": 0.9035, + "step": 63215 + }, + { + "epoch": 18.91, + "grad_norm": 2.6490890979766846, + "learning_rate": 3.6578988328339334e-07, + "loss": 1.0075, + "step": 63220 + }, + { + "epoch": 18.92, + "grad_norm": 2.415147542953491, + "learning_rate": 3.647892044781276e-07, + "loss": 1.0301, + "step": 63225 + }, + { + "epoch": 18.92, + "grad_norm": 6.828542709350586, + "learning_rate": 3.637898862499489e-07, + "loss": 1.1634, + "step": 63230 + }, + { + "epoch": 18.92, + "grad_norm": 1.5809046030044556, + "learning_rate": 3.6279192865404654e-07, + "loss": 0.9714, + "step": 63235 + }, + { + "epoch": 18.92, + "grad_norm": 2.416311264038086, + "learning_rate": 3.6179533174553746e-07, + "loss": 1.1117, + "step": 63240 + }, + { + "epoch": 18.92, + "grad_norm": 3.4020678997039795, + "learning_rate": 3.6080009557946104e-07, + "loss": 0.7682, + "step": 63245 + }, + { + "epoch": 18.92, + "grad_norm": 1.7761116027832031, + "learning_rate": 3.5980622021078716e-07, + "loss": 0.959, + "step": 63250 + }, + { + "epoch": 18.93, + "grad_norm": 1.5695528984069824, + "learning_rate": 3.5881370569439964e-07, + "loss": 0.9388, + "step": 63255 + }, + { + "epoch": 18.93, + "grad_norm": 1.8310515880584717, + "learning_rate": 3.5782255208512136e-07, + "loss": 0.9259, + "step": 63260 + }, + { + "epoch": 18.93, + "grad_norm": 2.843670129776001, + "learning_rate": 3.5683275943768914e-07, + "loss": 1.0009, + "step": 63265 + }, + { + "epoch": 18.93, + "grad_norm": 3.0472795963287354, + "learning_rate": 3.5584432780676745e-07, + "loss": 0.8153, + "step": 63270 + }, + { + "epoch": 18.93, + "grad_norm": 2.187629461288452, + "learning_rate": 3.548572572469461e-07, + "loss": 0.9146, + "step": 63275 + }, + { + "epoch": 18.93, + "grad_norm": 2.447866201400757, + "learning_rate": 3.5387154781274244e-07, + "loss": 0.929, + "step": 63280 + }, + { + "epoch": 18.93, + "grad_norm": 2.634216785430908, + "learning_rate": 3.528871995585964e-07, + "loss": 0.9642, + "step": 63285 + }, + { + "epoch": 18.94, + "grad_norm": 1.427452802658081, + "learning_rate": 3.5190421253886717e-07, + "loss": 1.1127, + "step": 63290 + }, + { + "epoch": 18.94, + "grad_norm": 2.087641716003418, + "learning_rate": 3.5092258680785305e-07, + "loss": 1.0386, + "step": 63295 + }, + { + "epoch": 18.94, + "grad_norm": 1.5409181118011475, + "learning_rate": 3.4994232241976065e-07, + "loss": 0.9281, + "step": 63300 + }, + { + "epoch": 18.94, + "grad_norm": 2.0599822998046875, + "learning_rate": 3.4896341942873e-07, + "loss": 0.9306, + "step": 63305 + }, + { + "epoch": 18.94, + "grad_norm": 3.1682193279266357, + "learning_rate": 3.4798587788882895e-07, + "loss": 1.0272, + "step": 63310 + }, + { + "epoch": 18.94, + "grad_norm": 2.4055306911468506, + "learning_rate": 3.4700969785404214e-07, + "loss": 0.9128, + "step": 63315 + }, + { + "epoch": 18.94, + "grad_norm": 2.880706548690796, + "learning_rate": 3.460348793782847e-07, + "loss": 0.8306, + "step": 63320 + }, + { + "epoch": 18.95, + "grad_norm": 1.1103742122650146, + "learning_rate": 3.450614225153942e-07, + "loss": 1.0068, + "step": 63325 + }, + { + "epoch": 18.95, + "grad_norm": 3.7383155822753906, + "learning_rate": 3.440893273191331e-07, + "loss": 0.9928, + "step": 63330 + }, + { + "epoch": 18.95, + "grad_norm": 3.4909322261810303, + "learning_rate": 3.431185938431919e-07, + "loss": 1.0227, + "step": 63335 + }, + { + "epoch": 18.95, + "grad_norm": 2.2891860008239746, + "learning_rate": 3.421492221411804e-07, + "loss": 0.9881, + "step": 63340 + }, + { + "epoch": 18.95, + "grad_norm": 1.8985334634780884, + "learning_rate": 3.4118121226663913e-07, + "loss": 0.9726, + "step": 63345 + }, + { + "epoch": 18.95, + "grad_norm": 2.307614803314209, + "learning_rate": 3.402145642730226e-07, + "loss": 1.0593, + "step": 63350 + }, + { + "epoch": 18.96, + "grad_norm": 1.9816555976867676, + "learning_rate": 3.3924927821372965e-07, + "loss": 0.8479, + "step": 63355 + }, + { + "epoch": 18.96, + "grad_norm": 3.931323289871216, + "learning_rate": 3.3828535414206217e-07, + "loss": 0.8215, + "step": 63360 + }, + { + "epoch": 18.96, + "grad_norm": 2.6754205226898193, + "learning_rate": 3.373227921112609e-07, + "loss": 1.1578, + "step": 63365 + }, + { + "epoch": 18.96, + "grad_norm": 3.3269999027252197, + "learning_rate": 3.3636159217448617e-07, + "loss": 0.9295, + "step": 63370 + }, + { + "epoch": 18.96, + "grad_norm": 3.251188278198242, + "learning_rate": 3.354017543848259e-07, + "loss": 0.9049, + "step": 63375 + }, + { + "epoch": 18.96, + "grad_norm": 1.9303770065307617, + "learning_rate": 3.344432787952878e-07, + "loss": 0.9681, + "step": 63380 + }, + { + "epoch": 18.96, + "grad_norm": 3.30509614944458, + "learning_rate": 3.3348616545880727e-07, + "loss": 0.9311, + "step": 63385 + }, + { + "epoch": 18.97, + "grad_norm": 3.3238556385040283, + "learning_rate": 3.325304144282476e-07, + "loss": 0.9782, + "step": 63390 + }, + { + "epoch": 18.97, + "grad_norm": 1.8988010883331299, + "learning_rate": 3.315760257563943e-07, + "loss": 0.8005, + "step": 63395 + }, + { + "epoch": 18.97, + "grad_norm": 2.3984742164611816, + "learning_rate": 3.306229994959553e-07, + "loss": 0.8219, + "step": 63400 + }, + { + "epoch": 18.97, + "grad_norm": 1.133448839187622, + "learning_rate": 3.2967133569956344e-07, + "loss": 1.0835, + "step": 63405 + }, + { + "epoch": 18.97, + "grad_norm": 2.583820343017578, + "learning_rate": 3.287210344197822e-07, + "loss": 0.8689, + "step": 63410 + }, + { + "epoch": 18.97, + "grad_norm": 2.5700812339782715, + "learning_rate": 3.2777209570909464e-07, + "loss": 0.8236, + "step": 63415 + }, + { + "epoch": 18.97, + "grad_norm": 3.197221279144287, + "learning_rate": 3.2682451961990603e-07, + "loss": 1.0251, + "step": 63420 + }, + { + "epoch": 18.98, + "grad_norm": 2.2375690937042236, + "learning_rate": 3.2587830620455507e-07, + "loss": 0.8646, + "step": 63425 + }, + { + "epoch": 18.98, + "grad_norm": 2.5171115398406982, + "learning_rate": 3.249334555153e-07, + "loss": 1.1793, + "step": 63430 + }, + { + "epoch": 18.98, + "grad_norm": 2.9206202030181885, + "learning_rate": 3.239899676043184e-07, + "loss": 1.1318, + "step": 63435 + }, + { + "epoch": 18.98, + "grad_norm": 2.8934311866760254, + "learning_rate": 3.230478425237243e-07, + "loss": 1.0891, + "step": 63440 + }, + { + "epoch": 18.98, + "grad_norm": 2.022392988204956, + "learning_rate": 3.221070803255427e-07, + "loss": 1.0109, + "step": 63445 + }, + { + "epoch": 18.98, + "grad_norm": 2.2447142601013184, + "learning_rate": 3.2116768106174035e-07, + "loss": 1.0619, + "step": 63450 + }, + { + "epoch": 18.99, + "grad_norm": 3.3541789054870605, + "learning_rate": 3.2022964478419235e-07, + "loss": 1.1107, + "step": 63455 + }, + { + "epoch": 18.99, + "grad_norm": 2.8497142791748047, + "learning_rate": 3.192929715447102e-07, + "loss": 0.9282, + "step": 63460 + }, + { + "epoch": 18.99, + "grad_norm": 2.122657537460327, + "learning_rate": 3.1835766139502174e-07, + "loss": 0.87, + "step": 63465 + }, + { + "epoch": 18.99, + "grad_norm": 2.5856778621673584, + "learning_rate": 3.1742371438678586e-07, + "loss": 1.0344, + "step": 63470 + }, + { + "epoch": 18.99, + "grad_norm": 3.2936534881591797, + "learning_rate": 3.1649113057158066e-07, + "loss": 0.9697, + "step": 63475 + }, + { + "epoch": 18.99, + "grad_norm": 1.6257001161575317, + "learning_rate": 3.1555991000091214e-07, + "loss": 0.9946, + "step": 63480 + }, + { + "epoch": 18.99, + "grad_norm": 3.2520930767059326, + "learning_rate": 3.1463005272621416e-07, + "loss": 0.8589, + "step": 63485 + }, + { + "epoch": 19.0, + "grad_norm": 3.0452685356140137, + "learning_rate": 3.1370155879883735e-07, + "loss": 0.9231, + "step": 63490 + }, + { + "epoch": 19.0, + "grad_norm": 1.2502514123916626, + "learning_rate": 3.1277442827006564e-07, + "loss": 0.9966, + "step": 63495 + }, + { + "epoch": 19.0, + "grad_norm": 3.0072884559631348, + "learning_rate": 3.118486611910998e-07, + "loss": 0.9674, + "step": 63500 + }, + { + "epoch": 19.0, + "grad_norm": 1.7312711477279663, + "learning_rate": 3.109242576130711e-07, + "loss": 0.857, + "step": 63505 + }, + { + "epoch": 19.0, + "grad_norm": 1.629128336906433, + "learning_rate": 3.100012175870304e-07, + "loss": 0.9265, + "step": 63510 + }, + { + "epoch": 19.0, + "grad_norm": 1.9464771747589111, + "learning_rate": 3.0907954116396185e-07, + "loss": 0.9412, + "step": 63515 + }, + { + "epoch": 19.0, + "grad_norm": 1.9227700233459473, + "learning_rate": 3.0815922839476376e-07, + "loss": 0.6606, + "step": 63520 + }, + { + "epoch": 19.01, + "grad_norm": 3.476877450942993, + "learning_rate": 3.0724027933026757e-07, + "loss": 1.0086, + "step": 63525 + }, + { + "epoch": 19.01, + "grad_norm": 3.3601372241973877, + "learning_rate": 3.063226940212216e-07, + "loss": 0.9579, + "step": 63530 + }, + { + "epoch": 19.01, + "grad_norm": 2.9032952785491943, + "learning_rate": 3.0540647251830755e-07, + "loss": 1.0735, + "step": 63535 + }, + { + "epoch": 19.01, + "grad_norm": 1.6308649778366089, + "learning_rate": 3.0449161487212384e-07, + "loss": 1.1542, + "step": 63540 + }, + { + "epoch": 19.01, + "grad_norm": 2.7309157848358154, + "learning_rate": 3.035781211331995e-07, + "loss": 0.8411, + "step": 63545 + }, + { + "epoch": 19.01, + "grad_norm": 9.992263793945312, + "learning_rate": 3.026659913519858e-07, + "loss": 0.9904, + "step": 63550 + }, + { + "epoch": 19.01, + "grad_norm": 1.4755282402038574, + "learning_rate": 3.017552255788564e-07, + "loss": 1.1822, + "step": 63555 + }, + { + "epoch": 19.02, + "grad_norm": 3.669390916824341, + "learning_rate": 3.008458238641154e-07, + "loss": 1.1826, + "step": 63560 + }, + { + "epoch": 19.02, + "grad_norm": 2.0326449871063232, + "learning_rate": 2.9993778625798383e-07, + "loss": 0.9136, + "step": 63565 + }, + { + "epoch": 19.02, + "grad_norm": 2.4299309253692627, + "learning_rate": 2.9903111281061604e-07, + "loss": 0.891, + "step": 63570 + }, + { + "epoch": 19.02, + "grad_norm": 1.997877836227417, + "learning_rate": 2.98125803572083e-07, + "loss": 0.8897, + "step": 63575 + }, + { + "epoch": 19.02, + "grad_norm": 2.001340866088867, + "learning_rate": 2.972218585923864e-07, + "loss": 0.9927, + "step": 63580 + }, + { + "epoch": 19.02, + "grad_norm": 1.1060036420822144, + "learning_rate": 2.963192779214502e-07, + "loss": 1.1492, + "step": 63585 + }, + { + "epoch": 19.03, + "grad_norm": 2.4950671195983887, + "learning_rate": 2.9541806160912346e-07, + "loss": 0.8886, + "step": 63590 + }, + { + "epoch": 19.03, + "grad_norm": 1.5963852405548096, + "learning_rate": 2.9451820970517466e-07, + "loss": 0.9515, + "step": 63595 + }, + { + "epoch": 19.03, + "grad_norm": 2.513756036758423, + "learning_rate": 2.9361972225930845e-07, + "loss": 0.9299, + "step": 63600 + }, + { + "epoch": 19.03, + "grad_norm": 2.3485164642333984, + "learning_rate": 2.927225993211408e-07, + "loss": 0.7861, + "step": 63605 + }, + { + "epoch": 19.03, + "grad_norm": 4.739649772644043, + "learning_rate": 2.9182684094022363e-07, + "loss": 1.0509, + "step": 63610 + }, + { + "epoch": 19.03, + "grad_norm": 2.6401422023773193, + "learning_rate": 2.909324471660285e-07, + "loss": 0.9599, + "step": 63615 + }, + { + "epoch": 19.03, + "grad_norm": 3.8304197788238525, + "learning_rate": 2.900394180479521e-07, + "loss": 0.9171, + "step": 63620 + }, + { + "epoch": 19.04, + "grad_norm": 1.489724040031433, + "learning_rate": 2.891477536353104e-07, + "loss": 1.1215, + "step": 63625 + }, + { + "epoch": 19.04, + "grad_norm": 3.6274325847625732, + "learning_rate": 2.8825745397735584e-07, + "loss": 0.8733, + "step": 63630 + }, + { + "epoch": 19.04, + "grad_norm": 2.480381965637207, + "learning_rate": 2.873685191232517e-07, + "loss": 0.9707, + "step": 63635 + }, + { + "epoch": 19.04, + "grad_norm": 3.279183864593506, + "learning_rate": 2.8648094912210044e-07, + "loss": 0.9859, + "step": 63640 + }, + { + "epoch": 19.04, + "grad_norm": 6.223374843597412, + "learning_rate": 2.8559474402291564e-07, + "loss": 0.9295, + "step": 63645 + }, + { + "epoch": 19.04, + "grad_norm": 2.4016520977020264, + "learning_rate": 2.8470990387464424e-07, + "loss": 0.8226, + "step": 63650 + }, + { + "epoch": 19.04, + "grad_norm": 4.2338666915893555, + "learning_rate": 2.8382642872615826e-07, + "loss": 0.955, + "step": 63655 + }, + { + "epoch": 19.05, + "grad_norm": 3.539830446243286, + "learning_rate": 2.829443186262437e-07, + "loss": 1.1181, + "step": 63660 + }, + { + "epoch": 19.05, + "grad_norm": 3.2982068061828613, + "learning_rate": 2.8206357362362543e-07, + "loss": 1.0266, + "step": 63665 + }, + { + "epoch": 19.05, + "grad_norm": 1.85117769241333, + "learning_rate": 2.811841937669396e-07, + "loss": 1.1086, + "step": 63670 + }, + { + "epoch": 19.05, + "grad_norm": 4.956904888153076, + "learning_rate": 2.8030617910475845e-07, + "loss": 0.9842, + "step": 63675 + }, + { + "epoch": 19.05, + "grad_norm": 2.6763927936553955, + "learning_rate": 2.794295296855709e-07, + "loss": 1.1752, + "step": 63680 + }, + { + "epoch": 19.05, + "grad_norm": 2.8290627002716064, + "learning_rate": 2.785542455577994e-07, + "loss": 0.9809, + "step": 63685 + }, + { + "epoch": 19.06, + "grad_norm": 2.090080499649048, + "learning_rate": 2.776803267697775e-07, + "loss": 1.065, + "step": 63690 + }, + { + "epoch": 19.06, + "grad_norm": 1.995445728302002, + "learning_rate": 2.768077733697749e-07, + "loss": 0.9295, + "step": 63695 + }, + { + "epoch": 19.06, + "grad_norm": 3.277020215988159, + "learning_rate": 2.759365854059781e-07, + "loss": 0.9197, + "step": 63700 + }, + { + "epoch": 19.06, + "grad_norm": 2.4329395294189453, + "learning_rate": 2.750667629265069e-07, + "loss": 1.0294, + "step": 63705 + }, + { + "epoch": 19.06, + "grad_norm": 3.564478874206543, + "learning_rate": 2.741983059793979e-07, + "loss": 1.0323, + "step": 63710 + }, + { + "epoch": 19.06, + "grad_norm": 1.9448779821395874, + "learning_rate": 2.7333121461261545e-07, + "loss": 0.8142, + "step": 63715 + }, + { + "epoch": 19.06, + "grad_norm": 2.6165874004364014, + "learning_rate": 2.724654888740463e-07, + "loss": 0.8353, + "step": 63720 + }, + { + "epoch": 19.07, + "grad_norm": 3.1223433017730713, + "learning_rate": 2.716011288115078e-07, + "loss": 0.9984, + "step": 63725 + }, + { + "epoch": 19.07, + "grad_norm": 3.356553554534912, + "learning_rate": 2.7073813447273386e-07, + "loss": 0.9196, + "step": 63730 + }, + { + "epoch": 19.07, + "grad_norm": 1.8804484605789185, + "learning_rate": 2.698765059053865e-07, + "loss": 0.9712, + "step": 63735 + }, + { + "epoch": 19.07, + "grad_norm": 2.733215570449829, + "learning_rate": 2.690162431570553e-07, + "loss": 0.9342, + "step": 63740 + }, + { + "epoch": 19.07, + "grad_norm": 2.573293685913086, + "learning_rate": 2.6815734627525235e-07, + "loss": 0.8668, + "step": 63745 + }, + { + "epoch": 19.07, + "grad_norm": 2.0941619873046875, + "learning_rate": 2.672998153074119e-07, + "loss": 0.8658, + "step": 63750 + }, + { + "epoch": 19.07, + "grad_norm": 4.143740653991699, + "learning_rate": 2.6644365030089046e-07, + "loss": 0.9723, + "step": 63755 + }, + { + "epoch": 19.08, + "grad_norm": 2.708742141723633, + "learning_rate": 2.6558885130298363e-07, + "loss": 0.8001, + "step": 63760 + }, + { + "epoch": 19.08, + "grad_norm": 1.1027653217315674, + "learning_rate": 2.647354183608869e-07, + "loss": 0.8632, + "step": 63765 + }, + { + "epoch": 19.08, + "grad_norm": 5.099153995513916, + "learning_rate": 2.638833515217487e-07, + "loss": 0.9992, + "step": 63770 + }, + { + "epoch": 19.08, + "grad_norm": 2.2473297119140625, + "learning_rate": 2.6303265083261753e-07, + "loss": 1.179, + "step": 63775 + }, + { + "epoch": 19.08, + "grad_norm": 2.4921000003814697, + "learning_rate": 2.621833163404835e-07, + "loss": 0.8985, + "step": 63780 + }, + { + "epoch": 19.08, + "grad_norm": 2.435410261154175, + "learning_rate": 2.6133534809224813e-07, + "loss": 1.155, + "step": 63785 + }, + { + "epoch": 19.09, + "grad_norm": 1.8930751085281372, + "learning_rate": 2.6048874613474884e-07, + "loss": 0.9032, + "step": 63790 + }, + { + "epoch": 19.09, + "grad_norm": 2.620929718017578, + "learning_rate": 2.596435105147399e-07, + "loss": 1.0091, + "step": 63795 + }, + { + "epoch": 19.09, + "grad_norm": 3.0546348094940186, + "learning_rate": 2.587996412789034e-07, + "loss": 0.9915, + "step": 63800 + }, + { + "epoch": 19.09, + "grad_norm": 2.1018009185791016, + "learning_rate": 2.579571384738466e-07, + "loss": 0.9352, + "step": 63805 + }, + { + "epoch": 19.09, + "grad_norm": 3.1756997108459473, + "learning_rate": 2.5711600214609885e-07, + "loss": 1.0105, + "step": 63810 + }, + { + "epoch": 19.09, + "grad_norm": 4.981276512145996, + "learning_rate": 2.562762323421147e-07, + "loss": 1.0216, + "step": 63815 + }, + { + "epoch": 19.09, + "grad_norm": 3.7036938667297363, + "learning_rate": 2.554378291082765e-07, + "loss": 1.0003, + "step": 63820 + }, + { + "epoch": 19.1, + "grad_norm": 2.4953665733337402, + "learning_rate": 2.5460079249088606e-07, + "loss": 0.7268, + "step": 63825 + }, + { + "epoch": 19.1, + "grad_norm": 2.5604419708251953, + "learning_rate": 2.5376512253617034e-07, + "loss": 1.0928, + "step": 63830 + }, + { + "epoch": 19.1, + "grad_norm": 3.0103671550750732, + "learning_rate": 2.5293081929028686e-07, + "loss": 1.0442, + "step": 63835 + }, + { + "epoch": 19.1, + "grad_norm": 4.777401447296143, + "learning_rate": 2.5209788279930977e-07, + "loss": 0.9435, + "step": 63840 + }, + { + "epoch": 19.1, + "grad_norm": 2.0667479038238525, + "learning_rate": 2.512663131092441e-07, + "loss": 1.15, + "step": 63845 + }, + { + "epoch": 19.1, + "grad_norm": 1.9874635934829712, + "learning_rate": 2.504361102660141e-07, + "loss": 0.983, + "step": 63850 + }, + { + "epoch": 19.1, + "grad_norm": 4.316436290740967, + "learning_rate": 2.4960727431547206e-07, + "loss": 0.9662, + "step": 63855 + }, + { + "epoch": 19.11, + "grad_norm": 2.294708013534546, + "learning_rate": 2.487798053033924e-07, + "loss": 0.7309, + "step": 63860 + }, + { + "epoch": 19.11, + "grad_norm": 3.665914535522461, + "learning_rate": 2.479537032754803e-07, + "loss": 0.9579, + "step": 63865 + }, + { + "epoch": 19.11, + "grad_norm": 2.2551567554473877, + "learning_rate": 2.4712896827735197e-07, + "loss": 1.0002, + "step": 63870 + }, + { + "epoch": 19.11, + "grad_norm": 3.8116965293884277, + "learning_rate": 2.463056003545655e-07, + "loss": 0.8443, + "step": 63875 + }, + { + "epoch": 19.11, + "grad_norm": 2.817349433898926, + "learning_rate": 2.4548359955259003e-07, + "loss": 1.0352, + "step": 63880 + }, + { + "epoch": 19.11, + "grad_norm": 1.7628093957901, + "learning_rate": 2.4466296591682256e-07, + "loss": 0.9593, + "step": 63885 + }, + { + "epoch": 19.12, + "grad_norm": 4.2066755294799805, + "learning_rate": 2.43843699492588e-07, + "loss": 1.024, + "step": 63890 + }, + { + "epoch": 19.12, + "grad_norm": 2.6177399158477783, + "learning_rate": 2.430258003251334e-07, + "loss": 1.1829, + "step": 63895 + }, + { + "epoch": 19.12, + "grad_norm": 4.167474746704102, + "learning_rate": 2.422092684596311e-07, + "loss": 0.765, + "step": 63900 + }, + { + "epoch": 19.12, + "grad_norm": 2.174982786178589, + "learning_rate": 2.4139410394117825e-07, + "loss": 0.7503, + "step": 63905 + }, + { + "epoch": 19.12, + "grad_norm": 4.269941329956055, + "learning_rate": 2.405803068147916e-07, + "loss": 0.8784, + "step": 63910 + }, + { + "epoch": 19.12, + "grad_norm": 1.9396120309829712, + "learning_rate": 2.397678771254186e-07, + "loss": 1.0015, + "step": 63915 + }, + { + "epoch": 19.12, + "grad_norm": 5.77046537399292, + "learning_rate": 2.3895681491792886e-07, + "loss": 1.1133, + "step": 63920 + }, + { + "epoch": 19.13, + "grad_norm": 2.1925339698791504, + "learning_rate": 2.3814712023711995e-07, + "loss": 1.1291, + "step": 63925 + }, + { + "epoch": 19.13, + "grad_norm": 1.8961554765701294, + "learning_rate": 2.3733879312770324e-07, + "loss": 0.8816, + "step": 63930 + }, + { + "epoch": 19.13, + "grad_norm": 1.7458471059799194, + "learning_rate": 2.365318336343264e-07, + "loss": 1.0096, + "step": 63935 + }, + { + "epoch": 19.13, + "grad_norm": 1.646514892578125, + "learning_rate": 2.3572624180155655e-07, + "loss": 1.1617, + "step": 63940 + }, + { + "epoch": 19.13, + "grad_norm": 2.5368010997772217, + "learning_rate": 2.3492201767388589e-07, + "loss": 0.8202, + "step": 63945 + }, + { + "epoch": 19.13, + "grad_norm": 1.8663653135299683, + "learning_rate": 2.3411916129573164e-07, + "loss": 0.961, + "step": 63950 + }, + { + "epoch": 19.13, + "grad_norm": 3.437864065170288, + "learning_rate": 2.3331767271143057e-07, + "loss": 1.0659, + "step": 63955 + }, + { + "epoch": 19.14, + "grad_norm": 2.4234211444854736, + "learning_rate": 2.325175519652528e-07, + "loss": 0.881, + "step": 63960 + }, + { + "epoch": 19.14, + "grad_norm": 2.6147170066833496, + "learning_rate": 2.317187991013825e-07, + "loss": 0.8791, + "step": 63965 + }, + { + "epoch": 19.14, + "grad_norm": 3.2492587566375732, + "learning_rate": 2.3092141416394264e-07, + "loss": 0.9265, + "step": 63970 + }, + { + "epoch": 19.14, + "grad_norm": 2.781677722930908, + "learning_rate": 2.301253971969647e-07, + "loss": 0.8303, + "step": 63975 + }, + { + "epoch": 19.14, + "grad_norm": 2.981367826461792, + "learning_rate": 2.2933074824441624e-07, + "loss": 0.8374, + "step": 63980 + }, + { + "epoch": 19.14, + "grad_norm": 2.767667055130005, + "learning_rate": 2.2853746735017888e-07, + "loss": 1.0309, + "step": 63985 + }, + { + "epoch": 19.15, + "grad_norm": 2.9164721965789795, + "learning_rate": 2.2774555455807311e-07, + "loss": 0.9418, + "step": 63990 + }, + { + "epoch": 19.15, + "grad_norm": 2.0985426902770996, + "learning_rate": 2.2695500991182783e-07, + "loss": 1.0022, + "step": 63995 + }, + { + "epoch": 19.15, + "grad_norm": 2.751217842102051, + "learning_rate": 2.2616583345510812e-07, + "loss": 0.7166, + "step": 64000 + }, + { + "epoch": 19.15, + "grad_norm": 2.930095911026001, + "learning_rate": 2.2537802523149853e-07, + "loss": 0.921, + "step": 64005 + }, + { + "epoch": 19.15, + "grad_norm": 3.8547110557556152, + "learning_rate": 2.2459158528450874e-07, + "loss": 0.9999, + "step": 64010 + }, + { + "epoch": 19.15, + "grad_norm": 1.5911672115325928, + "learning_rate": 2.2380651365757343e-07, + "loss": 1.0612, + "step": 64015 + }, + { + "epoch": 19.15, + "grad_norm": 4.613253593444824, + "learning_rate": 2.2302281039405238e-07, + "loss": 1.0091, + "step": 64020 + }, + { + "epoch": 19.16, + "grad_norm": 2.566091299057007, + "learning_rate": 2.2224047553722484e-07, + "loss": 1.0461, + "step": 64025 + }, + { + "epoch": 19.16, + "grad_norm": 2.502011299133301, + "learning_rate": 2.2145950913030066e-07, + "loss": 1.0877, + "step": 64030 + }, + { + "epoch": 19.16, + "grad_norm": 3.6345396041870117, + "learning_rate": 2.2067991121641484e-07, + "loss": 0.9155, + "step": 64035 + }, + { + "epoch": 19.16, + "grad_norm": 1.3162267208099365, + "learning_rate": 2.1990168183861904e-07, + "loss": 0.8879, + "step": 64040 + }, + { + "epoch": 19.16, + "grad_norm": 2.207672595977783, + "learning_rate": 2.191248210398955e-07, + "loss": 0.9934, + "step": 64045 + }, + { + "epoch": 19.16, + "grad_norm": 3.507061719894409, + "learning_rate": 2.1834932886314885e-07, + "loss": 0.9936, + "step": 64050 + }, + { + "epoch": 19.16, + "grad_norm": 3.270716428756714, + "learning_rate": 2.1757520535121423e-07, + "loss": 0.9819, + "step": 64055 + }, + { + "epoch": 19.17, + "grad_norm": 2.573598623275757, + "learning_rate": 2.1680245054683524e-07, + "loss": 1.0214, + "step": 64060 + }, + { + "epoch": 19.17, + "grad_norm": 3.2568507194519043, + "learning_rate": 2.1603106449269993e-07, + "loss": 0.9246, + "step": 64065 + }, + { + "epoch": 19.17, + "grad_norm": 1.5595399141311646, + "learning_rate": 2.1526104723140484e-07, + "loss": 0.9696, + "step": 64070 + }, + { + "epoch": 19.17, + "grad_norm": 3.1589651107788086, + "learning_rate": 2.1449239880548254e-07, + "loss": 0.9985, + "step": 64075 + }, + { + "epoch": 19.17, + "grad_norm": 2.8172848224639893, + "learning_rate": 2.137251192573797e-07, + "loss": 0.7963, + "step": 64080 + }, + { + "epoch": 19.17, + "grad_norm": 2.691014528274536, + "learning_rate": 2.1295920862947628e-07, + "loss": 0.9184, + "step": 64085 + }, + { + "epoch": 19.17, + "grad_norm": 3.006046772003174, + "learning_rate": 2.1219466696407176e-07, + "loss": 0.8609, + "step": 64090 + }, + { + "epoch": 19.18, + "grad_norm": 2.1390583515167236, + "learning_rate": 2.1143149430338793e-07, + "loss": 1.1105, + "step": 64095 + }, + { + "epoch": 19.18, + "grad_norm": 1.9489213228225708, + "learning_rate": 2.106696906895772e-07, + "loss": 0.9752, + "step": 64100 + }, + { + "epoch": 19.18, + "grad_norm": 1.616949439048767, + "learning_rate": 2.0990925616471424e-07, + "loss": 0.9205, + "step": 64105 + }, + { + "epoch": 19.18, + "grad_norm": 1.416013240814209, + "learning_rate": 2.0915019077079322e-07, + "loss": 0.9529, + "step": 64110 + }, + { + "epoch": 19.18, + "grad_norm": 1.7403104305267334, + "learning_rate": 2.0839249454973896e-07, + "loss": 0.9774, + "step": 64115 + }, + { + "epoch": 19.18, + "grad_norm": 3.7459676265716553, + "learning_rate": 2.0763616754339577e-07, + "loss": 0.9201, + "step": 64120 + }, + { + "epoch": 19.19, + "grad_norm": 3.3942503929138184, + "learning_rate": 2.0688120979353853e-07, + "loss": 1.0433, + "step": 64125 + }, + { + "epoch": 19.19, + "grad_norm": 2.123073101043701, + "learning_rate": 2.061276213418617e-07, + "loss": 0.8314, + "step": 64130 + }, + { + "epoch": 19.19, + "grad_norm": 1.3835378885269165, + "learning_rate": 2.0537540222998474e-07, + "loss": 0.7707, + "step": 64135 + }, + { + "epoch": 19.19, + "grad_norm": 3.362372398376465, + "learning_rate": 2.0462455249945222e-07, + "loss": 0.9923, + "step": 64140 + }, + { + "epoch": 19.19, + "grad_norm": 3.691894769668579, + "learning_rate": 2.0387507219173098e-07, + "loss": 0.9268, + "step": 64145 + }, + { + "epoch": 19.19, + "grad_norm": 3.2239859104156494, + "learning_rate": 2.0312696134821562e-07, + "loss": 1.0184, + "step": 64150 + }, + { + "epoch": 19.19, + "grad_norm": 3.0610954761505127, + "learning_rate": 2.0238022001022315e-07, + "loss": 0.8632, + "step": 64155 + }, + { + "epoch": 19.2, + "grad_norm": 1.395918846130371, + "learning_rate": 2.0163484821899557e-07, + "loss": 0.8581, + "step": 64160 + }, + { + "epoch": 19.2, + "grad_norm": 4.282837390899658, + "learning_rate": 2.0089084601569718e-07, + "loss": 0.931, + "step": 64165 + }, + { + "epoch": 19.2, + "grad_norm": 1.898026704788208, + "learning_rate": 2.0014821344142286e-07, + "loss": 0.866, + "step": 64170 + }, + { + "epoch": 19.2, + "grad_norm": 2.9928948879241943, + "learning_rate": 1.9940695053718428e-07, + "loss": 1.0112, + "step": 64175 + }, + { + "epoch": 19.2, + "grad_norm": 2.1379966735839844, + "learning_rate": 1.9866705734392088e-07, + "loss": 0.868, + "step": 64180 + }, + { + "epoch": 19.2, + "grad_norm": 3.2108120918273926, + "learning_rate": 1.9792853390249444e-07, + "loss": 0.8312, + "step": 64185 + }, + { + "epoch": 19.2, + "grad_norm": 6.297685146331787, + "learning_rate": 1.971913802536973e-07, + "loss": 0.9006, + "step": 64190 + }, + { + "epoch": 19.21, + "grad_norm": 3.2357826232910156, + "learning_rate": 1.9645559643823863e-07, + "loss": 0.9766, + "step": 64195 + }, + { + "epoch": 19.21, + "grad_norm": 2.2781829833984375, + "learning_rate": 1.9572118249675532e-07, + "loss": 1.1408, + "step": 64200 + }, + { + "epoch": 19.21, + "grad_norm": 3.3626670837402344, + "learning_rate": 1.9498813846980658e-07, + "loss": 0.9252, + "step": 64205 + }, + { + "epoch": 19.21, + "grad_norm": 6.3693108558654785, + "learning_rate": 1.9425646439788224e-07, + "loss": 1.0737, + "step": 64210 + }, + { + "epoch": 19.21, + "grad_norm": 3.2453861236572266, + "learning_rate": 1.9352616032138614e-07, + "loss": 0.8096, + "step": 64215 + }, + { + "epoch": 19.21, + "grad_norm": 2.069312572479248, + "learning_rate": 1.9279722628065823e-07, + "loss": 0.8934, + "step": 64220 + }, + { + "epoch": 19.22, + "grad_norm": 3.1376097202301025, + "learning_rate": 1.9206966231595236e-07, + "loss": 0.9529, + "step": 64225 + }, + { + "epoch": 19.22, + "grad_norm": 1.8707741498947144, + "learning_rate": 1.913434684674531e-07, + "loss": 1.1012, + "step": 64230 + }, + { + "epoch": 19.22, + "grad_norm": 4.028426170349121, + "learning_rate": 1.9061864477527004e-07, + "loss": 0.9865, + "step": 64235 + }, + { + "epoch": 19.22, + "grad_norm": 2.0778110027313232, + "learning_rate": 1.8989519127942667e-07, + "loss": 0.8687, + "step": 64240 + }, + { + "epoch": 19.22, + "grad_norm": 2.294110059738159, + "learning_rate": 1.8917310801988552e-07, + "loss": 1.0524, + "step": 64245 + }, + { + "epoch": 19.22, + "grad_norm": 2.6258926391601562, + "learning_rate": 1.88452395036523e-07, + "loss": 0.7745, + "step": 64250 + }, + { + "epoch": 19.22, + "grad_norm": 2.8713653087615967, + "learning_rate": 1.8773305236914618e-07, + "loss": 1.0878, + "step": 64255 + }, + { + "epoch": 19.23, + "grad_norm": 2.246710777282715, + "learning_rate": 1.8701508005747882e-07, + "loss": 1.0726, + "step": 64260 + }, + { + "epoch": 19.23, + "grad_norm": 3.4345037937164307, + "learning_rate": 1.8629847814118084e-07, + "loss": 0.826, + "step": 64265 + }, + { + "epoch": 19.23, + "grad_norm": 3.921339273452759, + "learning_rate": 1.8558324665982341e-07, + "loss": 0.9873, + "step": 64270 + }, + { + "epoch": 19.23, + "grad_norm": 3.5189566612243652, + "learning_rate": 1.8486938565290822e-07, + "loss": 1.0737, + "step": 64275 + }, + { + "epoch": 19.23, + "grad_norm": 3.8996822834014893, + "learning_rate": 1.8415689515986488e-07, + "loss": 0.9291, + "step": 64280 + }, + { + "epoch": 19.23, + "grad_norm": 1.6882737874984741, + "learning_rate": 1.8344577522004248e-07, + "loss": 0.8509, + "step": 64285 + }, + { + "epoch": 19.23, + "grad_norm": 1.2842025756835938, + "learning_rate": 1.827360258727151e-07, + "loss": 0.9767, + "step": 64290 + }, + { + "epoch": 19.24, + "grad_norm": 3.280601739883423, + "learning_rate": 1.820276471570792e-07, + "loss": 0.843, + "step": 64295 + }, + { + "epoch": 19.24, + "grad_norm": 1.3017467260360718, + "learning_rate": 1.8132063911225905e-07, + "loss": 1.0419, + "step": 64300 + }, + { + "epoch": 19.24, + "grad_norm": 2.4863226413726807, + "learning_rate": 1.8061500177730396e-07, + "loss": 0.8428, + "step": 64305 + }, + { + "epoch": 19.24, + "grad_norm": 3.148401975631714, + "learning_rate": 1.7991073519118274e-07, + "loss": 0.9154, + "step": 64310 + }, + { + "epoch": 19.24, + "grad_norm": 3.4491004943847656, + "learning_rate": 1.7920783939279207e-07, + "loss": 1.0461, + "step": 64315 + }, + { + "epoch": 19.24, + "grad_norm": 1.5797641277313232, + "learning_rate": 1.7850631442095367e-07, + "loss": 1.1385, + "step": 64320 + }, + { + "epoch": 19.25, + "grad_norm": 2.7437055110931396, + "learning_rate": 1.7780616031441156e-07, + "loss": 1.0647, + "step": 64325 + }, + { + "epoch": 19.25, + "grad_norm": 3.9407639503479004, + "learning_rate": 1.7710737711183478e-07, + "loss": 0.6554, + "step": 64330 + }, + { + "epoch": 19.25, + "grad_norm": 1.4198654890060425, + "learning_rate": 1.7640996485181472e-07, + "loss": 1.0993, + "step": 64335 + }, + { + "epoch": 19.25, + "grad_norm": 1.583427906036377, + "learning_rate": 1.7571392357287053e-07, + "loss": 1.1437, + "step": 64340 + }, + { + "epoch": 19.25, + "grad_norm": 6.862940311431885, + "learning_rate": 1.7501925331343817e-07, + "loss": 0.953, + "step": 64345 + }, + { + "epoch": 19.25, + "grad_norm": 4.871342182159424, + "learning_rate": 1.7432595411189524e-07, + "loss": 0.8867, + "step": 64350 + }, + { + "epoch": 19.25, + "grad_norm": 2.5354645252227783, + "learning_rate": 1.7363402600651945e-07, + "loss": 1.0432, + "step": 64355 + }, + { + "epoch": 19.26, + "grad_norm": 2.395785331726074, + "learning_rate": 1.7294346903553305e-07, + "loss": 0.8861, + "step": 64360 + }, + { + "epoch": 19.26, + "grad_norm": 2.0268208980560303, + "learning_rate": 1.7225428323707494e-07, + "loss": 0.8826, + "step": 64365 + }, + { + "epoch": 19.26, + "grad_norm": 1.8643012046813965, + "learning_rate": 1.7156646864920077e-07, + "loss": 1.0994, + "step": 64370 + }, + { + "epoch": 19.26, + "grad_norm": 1.6344417333602905, + "learning_rate": 1.708800253099052e-07, + "loss": 0.9345, + "step": 64375 + }, + { + "epoch": 19.26, + "grad_norm": 2.7925565242767334, + "learning_rate": 1.7019495325709677e-07, + "loss": 1.0531, + "step": 64380 + }, + { + "epoch": 19.26, + "grad_norm": 3.3944101333618164, + "learning_rate": 1.6951125252861466e-07, + "loss": 0.9223, + "step": 64385 + }, + { + "epoch": 19.26, + "grad_norm": 3.003763437271118, + "learning_rate": 1.68828923162212e-07, + "loss": 0.8319, + "step": 64390 + }, + { + "epoch": 19.27, + "grad_norm": 2.5167884826660156, + "learning_rate": 1.6814796519558084e-07, + "loss": 1.1937, + "step": 64395 + }, + { + "epoch": 19.27, + "grad_norm": 1.8801358938217163, + "learning_rate": 1.6746837866632725e-07, + "loss": 1.0366, + "step": 64400 + }, + { + "epoch": 19.27, + "grad_norm": 2.1538734436035156, + "learning_rate": 1.6679016361197951e-07, + "loss": 1.0713, + "step": 64405 + }, + { + "epoch": 19.27, + "grad_norm": 3.144021987915039, + "learning_rate": 1.6611332006999935e-07, + "loss": 0.9265, + "step": 64410 + }, + { + "epoch": 19.27, + "grad_norm": 2.700958490371704, + "learning_rate": 1.6543784807776795e-07, + "loss": 0.8548, + "step": 64415 + }, + { + "epoch": 19.27, + "grad_norm": 2.5718274116516113, + "learning_rate": 1.6476374767258883e-07, + "loss": 0.9646, + "step": 64420 + }, + { + "epoch": 19.28, + "grad_norm": 3.631052255630493, + "learning_rate": 1.640910188916961e-07, + "loss": 1.0448, + "step": 64425 + }, + { + "epoch": 19.28, + "grad_norm": 2.219627618789673, + "learning_rate": 1.6341966177223777e-07, + "loss": 1.1337, + "step": 64430 + }, + { + "epoch": 19.28, + "grad_norm": 1.4927635192871094, + "learning_rate": 1.6274967635129811e-07, + "loss": 0.9727, + "step": 64435 + }, + { + "epoch": 19.28, + "grad_norm": 3.119546890258789, + "learning_rate": 1.6208106266587253e-07, + "loss": 0.9235, + "step": 64440 + }, + { + "epoch": 19.28, + "grad_norm": 1.84986412525177, + "learning_rate": 1.6141382075289813e-07, + "loss": 0.8477, + "step": 64445 + }, + { + "epoch": 19.28, + "grad_norm": 2.341019868850708, + "learning_rate": 1.607479506492149e-07, + "loss": 1.1153, + "step": 64450 + }, + { + "epoch": 19.28, + "grad_norm": 2.070584535598755, + "learning_rate": 1.600834523916045e-07, + "loss": 0.8931, + "step": 64455 + }, + { + "epoch": 19.29, + "grad_norm": 4.45391845703125, + "learning_rate": 1.5942032601676815e-07, + "loss": 0.9916, + "step": 64460 + }, + { + "epoch": 19.29, + "grad_norm": 3.5736074447631836, + "learning_rate": 1.587585715613238e-07, + "loss": 1.1195, + "step": 64465 + }, + { + "epoch": 19.29, + "grad_norm": 2.6194748878479004, + "learning_rate": 1.580981890618255e-07, + "loss": 1.0239, + "step": 64470 + }, + { + "epoch": 19.29, + "grad_norm": 2.58370304107666, + "learning_rate": 1.5743917855473855e-07, + "loss": 1.0816, + "step": 64475 + }, + { + "epoch": 19.29, + "grad_norm": 2.7932465076446533, + "learning_rate": 1.5678154007646718e-07, + "loss": 1.0808, + "step": 64480 + }, + { + "epoch": 19.29, + "grad_norm": 5.6625471115112305, + "learning_rate": 1.5612527366332674e-07, + "loss": 0.9235, + "step": 64485 + }, + { + "epoch": 19.29, + "grad_norm": 3.3300294876098633, + "learning_rate": 1.5547037935156327e-07, + "loss": 0.8833, + "step": 64490 + }, + { + "epoch": 19.3, + "grad_norm": 2.539818525314331, + "learning_rate": 1.5481685717734783e-07, + "loss": 0.9428, + "step": 64495 + }, + { + "epoch": 19.3, + "grad_norm": 3.0235023498535156, + "learning_rate": 1.54164707176771e-07, + "loss": 1.1307, + "step": 64500 + }, + { + "epoch": 19.3, + "grad_norm": 2.7196106910705566, + "learning_rate": 1.5351392938585118e-07, + "loss": 0.946, + "step": 64505 + }, + { + "epoch": 19.3, + "grad_norm": 2.6924359798431396, + "learning_rate": 1.5286452384053184e-07, + "loss": 0.7011, + "step": 64510 + }, + { + "epoch": 19.3, + "grad_norm": 1.7030616998672485, + "learning_rate": 1.52216490576676e-07, + "loss": 1.0243, + "step": 64515 + }, + { + "epoch": 19.3, + "grad_norm": 2.473074197769165, + "learning_rate": 1.5156982963007715e-07, + "loss": 1.0579, + "step": 64520 + }, + { + "epoch": 19.31, + "grad_norm": 1.5015933513641357, + "learning_rate": 1.5092454103644572e-07, + "loss": 0.9168, + "step": 64525 + }, + { + "epoch": 19.31, + "grad_norm": 2.3585140705108643, + "learning_rate": 1.5028062483142537e-07, + "loss": 0.8898, + "step": 64530 + }, + { + "epoch": 19.31, + "grad_norm": 2.2519643306732178, + "learning_rate": 1.4963808105057653e-07, + "loss": 0.9104, + "step": 64535 + }, + { + "epoch": 19.31, + "grad_norm": 3.287801504135132, + "learning_rate": 1.4899690972938474e-07, + "loss": 1.0621, + "step": 64540 + }, + { + "epoch": 19.31, + "grad_norm": 2.565260648727417, + "learning_rate": 1.4835711090326054e-07, + "loss": 0.8879, + "step": 64545 + }, + { + "epoch": 19.31, + "grad_norm": 1.9417716264724731, + "learning_rate": 1.4771868460754234e-07, + "loss": 0.9862, + "step": 64550 + }, + { + "epoch": 19.31, + "grad_norm": 1.4697206020355225, + "learning_rate": 1.4708163087749082e-07, + "loss": 0.9105, + "step": 64555 + }, + { + "epoch": 19.32, + "grad_norm": 6.190129280090332, + "learning_rate": 1.4644594974828618e-07, + "loss": 0.9255, + "step": 64560 + }, + { + "epoch": 19.32, + "grad_norm": 2.575528383255005, + "learning_rate": 1.4581164125503922e-07, + "loss": 1.0289, + "step": 64565 + }, + { + "epoch": 19.32, + "grad_norm": 2.064091205596924, + "learning_rate": 1.4517870543277745e-07, + "loss": 1.0171, + "step": 64570 + }, + { + "epoch": 19.32, + "grad_norm": 1.4593415260314941, + "learning_rate": 1.445471423164646e-07, + "loss": 0.8089, + "step": 64575 + }, + { + "epoch": 19.32, + "grad_norm": 4.081628799438477, + "learning_rate": 1.4391695194097553e-07, + "loss": 0.9634, + "step": 64580 + }, + { + "epoch": 19.32, + "grad_norm": 1.9155333042144775, + "learning_rate": 1.4328813434111577e-07, + "loss": 0.8419, + "step": 64585 + }, + { + "epoch": 19.32, + "grad_norm": 1.9294395446777344, + "learning_rate": 1.426606895516186e-07, + "loss": 0.8925, + "step": 64590 + }, + { + "epoch": 19.33, + "grad_norm": 4.205466270446777, + "learning_rate": 1.4203461760713133e-07, + "loss": 0.9417, + "step": 64595 + }, + { + "epoch": 19.33, + "grad_norm": 2.388813018798828, + "learning_rate": 1.4140991854223183e-07, + "loss": 1.0479, + "step": 64600 + }, + { + "epoch": 19.33, + "grad_norm": 2.137241840362549, + "learning_rate": 1.407865923914259e-07, + "loss": 1.0903, + "step": 64605 + }, + { + "epoch": 19.33, + "grad_norm": 3.590604543685913, + "learning_rate": 1.4016463918913592e-07, + "loss": 0.8402, + "step": 64610 + }, + { + "epoch": 19.33, + "grad_norm": 3.1465625762939453, + "learning_rate": 1.3954405896971224e-07, + "loss": 1.1066, + "step": 64615 + }, + { + "epoch": 19.33, + "grad_norm": 4.3162922859191895, + "learning_rate": 1.389248517674302e-07, + "loss": 1.0114, + "step": 64620 + }, + { + "epoch": 19.34, + "grad_norm": 2.18872332572937, + "learning_rate": 1.3830701761648744e-07, + "loss": 0.954, + "step": 64625 + }, + { + "epoch": 19.34, + "grad_norm": 3.4795877933502197, + "learning_rate": 1.3769055655100395e-07, + "loss": 0.7991, + "step": 64630 + }, + { + "epoch": 19.34, + "grad_norm": 3.6978468894958496, + "learning_rate": 1.3707546860503019e-07, + "loss": 0.9708, + "step": 64635 + }, + { + "epoch": 19.34, + "grad_norm": 3.991750478744507, + "learning_rate": 1.3646175381253345e-07, + "loss": 0.9624, + "step": 64640 + }, + { + "epoch": 19.34, + "grad_norm": 3.2379486560821533, + "learning_rate": 1.3584941220740888e-07, + "loss": 1.0254, + "step": 64645 + }, + { + "epoch": 19.34, + "grad_norm": 3.404052257537842, + "learning_rate": 1.352384438234794e-07, + "loss": 0.8464, + "step": 64650 + }, + { + "epoch": 19.34, + "grad_norm": 2.1188278198242188, + "learning_rate": 1.346288486944819e-07, + "loss": 0.8731, + "step": 64655 + }, + { + "epoch": 19.35, + "grad_norm": 5.067503452301025, + "learning_rate": 1.3402062685408945e-07, + "loss": 0.8417, + "step": 64660 + }, + { + "epoch": 19.35, + "grad_norm": 3.299839973449707, + "learning_rate": 1.3341377833588908e-07, + "loss": 1.0166, + "step": 64665 + }, + { + "epoch": 19.35, + "grad_norm": 3.843200922012329, + "learning_rate": 1.3280830317340122e-07, + "loss": 0.9164, + "step": 64670 + }, + { + "epoch": 19.35, + "grad_norm": 3.3282971382141113, + "learning_rate": 1.3220420140006017e-07, + "loss": 1.0903, + "step": 64675 + }, + { + "epoch": 19.35, + "grad_norm": 3.1337714195251465, + "learning_rate": 1.3160147304923375e-07, + "loss": 0.953, + "step": 64680 + }, + { + "epoch": 19.35, + "grad_norm": 4.363522529602051, + "learning_rate": 1.3100011815420642e-07, + "loss": 0.7601, + "step": 64685 + }, + { + "epoch": 19.35, + "grad_norm": 3.619309902191162, + "learning_rate": 1.3040013674819605e-07, + "loss": 0.8317, + "step": 64690 + }, + { + "epoch": 19.36, + "grad_norm": 4.793463230133057, + "learning_rate": 1.298015288643345e-07, + "loss": 1.1246, + "step": 64695 + }, + { + "epoch": 19.36, + "grad_norm": 4.220412731170654, + "learning_rate": 1.292042945356814e-07, + "loss": 1.2013, + "step": 64700 + }, + { + "epoch": 19.36, + "grad_norm": 1.8279774188995361, + "learning_rate": 1.2860843379522425e-07, + "loss": 0.9528, + "step": 64705 + }, + { + "epoch": 19.36, + "grad_norm": 3.403494119644165, + "learning_rate": 1.2801394667587285e-07, + "loss": 0.9246, + "step": 64710 + }, + { + "epoch": 19.36, + "grad_norm": 3.680722713470459, + "learning_rate": 1.274208332104565e-07, + "loss": 0.9582, + "step": 64715 + }, + { + "epoch": 19.36, + "grad_norm": 1.425324559211731, + "learning_rate": 1.268290934317351e-07, + "loss": 1.0058, + "step": 64720 + }, + { + "epoch": 19.36, + "grad_norm": 2.55713152885437, + "learning_rate": 1.262387273723853e-07, + "loss": 1.0049, + "step": 64725 + }, + { + "epoch": 19.37, + "grad_norm": 3.1014976501464844, + "learning_rate": 1.2564973506501987e-07, + "loss": 0.9631, + "step": 64730 + }, + { + "epoch": 19.37, + "grad_norm": 2.44319224357605, + "learning_rate": 1.2506211654216282e-07, + "loss": 0.9343, + "step": 64735 + }, + { + "epoch": 19.37, + "grad_norm": 1.407696008682251, + "learning_rate": 1.2447587183626874e-07, + "loss": 0.9746, + "step": 64740 + }, + { + "epoch": 19.37, + "grad_norm": 3.8315956592559814, + "learning_rate": 1.238910009797145e-07, + "loss": 0.9999, + "step": 64745 + }, + { + "epoch": 19.37, + "grad_norm": 2.9506800174713135, + "learning_rate": 1.2330750400480483e-07, + "loss": 0.9509, + "step": 64750 + }, + { + "epoch": 19.37, + "grad_norm": 1.5093969106674194, + "learning_rate": 1.2272538094376396e-07, + "loss": 1.0732, + "step": 64755 + }, + { + "epoch": 19.38, + "grad_norm": 3.4581491947174072, + "learning_rate": 1.2214463182873836e-07, + "loss": 0.9692, + "step": 64760 + }, + { + "epoch": 19.38, + "grad_norm": 6.015048027038574, + "learning_rate": 1.2156525669181074e-07, + "loss": 0.8817, + "step": 64765 + }, + { + "epoch": 19.38, + "grad_norm": 1.234391450881958, + "learning_rate": 1.2098725556496938e-07, + "loss": 0.9878, + "step": 64770 + }, + { + "epoch": 19.38, + "grad_norm": 2.1239829063415527, + "learning_rate": 1.2041062848014428e-07, + "loss": 1.0328, + "step": 64775 + }, + { + "epoch": 19.38, + "grad_norm": 2.3981587886810303, + "learning_rate": 1.1983537546917944e-07, + "loss": 0.9892, + "step": 64780 + }, + { + "epoch": 19.38, + "grad_norm": 4.929365158081055, + "learning_rate": 1.1926149656384667e-07, + "loss": 1.0319, + "step": 64785 + }, + { + "epoch": 19.38, + "grad_norm": 2.424349308013916, + "learning_rate": 1.1868899179583726e-07, + "loss": 1.0278, + "step": 64790 + }, + { + "epoch": 19.39, + "grad_norm": 3.70564866065979, + "learning_rate": 1.1811786119677037e-07, + "loss": 0.8346, + "step": 64795 + }, + { + "epoch": 19.39, + "grad_norm": 1.7846856117248535, + "learning_rate": 1.1754810479819301e-07, + "loss": 0.8833, + "step": 64800 + }, + { + "epoch": 19.39, + "grad_norm": 2.44340181350708, + "learning_rate": 1.1697972263157164e-07, + "loss": 0.951, + "step": 64805 + }, + { + "epoch": 19.39, + "grad_norm": 8.20602798461914, + "learning_rate": 1.1641271472829229e-07, + "loss": 0.9087, + "step": 64810 + }, + { + "epoch": 19.39, + "grad_norm": 5.193169593811035, + "learning_rate": 1.1584708111967435e-07, + "loss": 1.0248, + "step": 64815 + }, + { + "epoch": 19.39, + "grad_norm": 5.007645130157471, + "learning_rate": 1.1528282183695949e-07, + "loss": 0.9085, + "step": 64820 + }, + { + "epoch": 19.39, + "grad_norm": 2.5000011920928955, + "learning_rate": 1.1471993691130612e-07, + "loss": 1.1319, + "step": 64825 + }, + { + "epoch": 19.4, + "grad_norm": 1.7281321287155151, + "learning_rate": 1.1415842637380326e-07, + "loss": 0.9048, + "step": 64830 + }, + { + "epoch": 19.4, + "grad_norm": 3.866112470626831, + "learning_rate": 1.135982902554622e-07, + "loss": 0.9548, + "step": 64835 + }, + { + "epoch": 19.4, + "grad_norm": 3.7432994842529297, + "learning_rate": 1.1303952858722211e-07, + "loss": 0.977, + "step": 64840 + }, + { + "epoch": 19.4, + "grad_norm": 2.450726270675659, + "learning_rate": 1.1248214139993884e-07, + "loss": 1.0289, + "step": 64845 + }, + { + "epoch": 19.4, + "grad_norm": 4.173949718475342, + "learning_rate": 1.1192612872439889e-07, + "loss": 1.0554, + "step": 64850 + }, + { + "epoch": 19.4, + "grad_norm": 2.5136771202087402, + "learning_rate": 1.1137149059130825e-07, + "loss": 0.8978, + "step": 64855 + }, + { + "epoch": 19.41, + "grad_norm": 3.791132688522339, + "learning_rate": 1.1081822703130074e-07, + "loss": 1.076, + "step": 64860 + }, + { + "epoch": 19.41, + "grad_norm": 2.5353622436523438, + "learning_rate": 1.102663380749297e-07, + "loss": 1.0079, + "step": 64865 + }, + { + "epoch": 19.41, + "grad_norm": 1.9366538524627686, + "learning_rate": 1.0971582375267908e-07, + "loss": 1.0107, + "step": 64870 + }, + { + "epoch": 19.41, + "grad_norm": 2.1806907653808594, + "learning_rate": 1.0916668409494957e-07, + "loss": 0.959, + "step": 64875 + }, + { + "epoch": 19.41, + "grad_norm": 1.3208969831466675, + "learning_rate": 1.0861891913207523e-07, + "loss": 0.7762, + "step": 64880 + }, + { + "epoch": 19.41, + "grad_norm": 1.4078150987625122, + "learning_rate": 1.080725288943013e-07, + "loss": 0.9522, + "step": 64885 + }, + { + "epoch": 19.41, + "grad_norm": 3.181238889694214, + "learning_rate": 1.0752751341180922e-07, + "loss": 0.9171, + "step": 64890 + }, + { + "epoch": 19.42, + "grad_norm": 2.8408191204071045, + "learning_rate": 1.0698387271469712e-07, + "loss": 0.9459, + "step": 64895 + }, + { + "epoch": 19.42, + "grad_norm": 2.00119686126709, + "learning_rate": 1.0644160683299375e-07, + "loss": 1.0576, + "step": 64900 + }, + { + "epoch": 19.42, + "grad_norm": 1.823187232017517, + "learning_rate": 1.0590071579664185e-07, + "loss": 1.192, + "step": 64905 + }, + { + "epoch": 19.42, + "grad_norm": 2.4590156078338623, + "learning_rate": 1.0536119963552026e-07, + "loss": 0.9914, + "step": 64910 + }, + { + "epoch": 19.42, + "grad_norm": 3.0153346061706543, + "learning_rate": 1.0482305837942185e-07, + "loss": 0.9732, + "step": 64915 + }, + { + "epoch": 19.42, + "grad_norm": 1.55227530002594, + "learning_rate": 1.0428629205806728e-07, + "loss": 0.9852, + "step": 64920 + }, + { + "epoch": 19.42, + "grad_norm": 1.4424388408660889, + "learning_rate": 1.0375090070110505e-07, + "loss": 1.0208, + "step": 64925 + }, + { + "epoch": 19.43, + "grad_norm": 1.9990373849868774, + "learning_rate": 1.032168843381004e-07, + "loss": 0.9683, + "step": 64930 + }, + { + "epoch": 19.43, + "grad_norm": 4.354613304138184, + "learning_rate": 1.0268424299855195e-07, + "loss": 0.9116, + "step": 64935 + }, + { + "epoch": 19.43, + "grad_norm": 4.074637413024902, + "learning_rate": 1.0215297671186952e-07, + "loss": 0.846, + "step": 64940 + }, + { + "epoch": 19.43, + "grad_norm": 2.377812385559082, + "learning_rate": 1.0162308550740185e-07, + "loss": 1.0472, + "step": 64945 + }, + { + "epoch": 19.43, + "grad_norm": 1.2441447973251343, + "learning_rate": 1.0109456941440886e-07, + "loss": 1.0943, + "step": 64950 + }, + { + "epoch": 19.43, + "grad_norm": 35.32939910888672, + "learning_rate": 1.0056742846208389e-07, + "loss": 0.8993, + "step": 64955 + }, + { + "epoch": 19.44, + "grad_norm": 4.968936443328857, + "learning_rate": 1.0004166267953419e-07, + "loss": 0.976, + "step": 64960 + }, + { + "epoch": 19.44, + "grad_norm": 6.838953971862793, + "learning_rate": 9.951727209580598e-08, + "loss": 0.9147, + "step": 64965 + }, + { + "epoch": 19.44, + "grad_norm": 1.3050497770309448, + "learning_rate": 9.899425673985107e-08, + "loss": 1.0613, + "step": 64970 + }, + { + "epoch": 19.44, + "grad_norm": 3.5567336082458496, + "learning_rate": 9.847261664056584e-08, + "loss": 0.9914, + "step": 64975 + }, + { + "epoch": 19.44, + "grad_norm": 3.1688625812530518, + "learning_rate": 9.795235182674944e-08, + "loss": 1.0523, + "step": 64980 + }, + { + "epoch": 19.44, + "grad_norm": 1.9151177406311035, + "learning_rate": 9.743346232714279e-08, + "loss": 1.0164, + "step": 64985 + }, + { + "epoch": 19.44, + "grad_norm": 3.5026047229766846, + "learning_rate": 9.691594817040073e-08, + "loss": 0.9442, + "step": 64990 + }, + { + "epoch": 19.45, + "grad_norm": 2.427326202392578, + "learning_rate": 9.639980938510595e-08, + "loss": 0.9183, + "step": 64995 + }, + { + "epoch": 19.45, + "grad_norm": 1.6219308376312256, + "learning_rate": 9.588504599976345e-08, + "loss": 1.0548, + "step": 65000 + }, + { + "epoch": 19.45, + "grad_norm": 2.265070676803589, + "learning_rate": 9.537165804280324e-08, + "loss": 0.9385, + "step": 65005 + }, + { + "epoch": 19.45, + "grad_norm": 4.394448280334473, + "learning_rate": 9.485964554258043e-08, + "loss": 0.9525, + "step": 65010 + }, + { + "epoch": 19.45, + "grad_norm": 3.270338296890259, + "learning_rate": 9.434900852737238e-08, + "loss": 0.9407, + "step": 65015 + }, + { + "epoch": 19.45, + "grad_norm": 2.3003993034362793, + "learning_rate": 9.383974702537878e-08, + "loss": 0.8104, + "step": 65020 + }, + { + "epoch": 19.45, + "grad_norm": 2.146404981613159, + "learning_rate": 9.33318610647299e-08, + "loss": 0.9752, + "step": 65025 + }, + { + "epoch": 19.46, + "grad_norm": 1.9496128559112549, + "learning_rate": 9.282535067346998e-08, + "loss": 0.8462, + "step": 65030 + }, + { + "epoch": 19.46, + "grad_norm": 1.7979656457901, + "learning_rate": 9.23202158795794e-08, + "loss": 0.8072, + "step": 65035 + }, + { + "epoch": 19.46, + "grad_norm": 2.433094024658203, + "learning_rate": 9.181645671095252e-08, + "loss": 0.8688, + "step": 65040 + }, + { + "epoch": 19.46, + "grad_norm": 3.4137656688690186, + "learning_rate": 9.131407319541153e-08, + "loss": 1.0364, + "step": 65045 + }, + { + "epoch": 19.46, + "grad_norm": 1.888330340385437, + "learning_rate": 9.081306536070366e-08, + "loss": 0.8318, + "step": 65050 + }, + { + "epoch": 19.46, + "grad_norm": 3.29191255569458, + "learning_rate": 9.03134332344957e-08, + "loss": 0.7606, + "step": 65055 + }, + { + "epoch": 19.47, + "grad_norm": 1.3745579719543457, + "learning_rate": 8.981517684438779e-08, + "loss": 0.9285, + "step": 65060 + }, + { + "epoch": 19.47, + "grad_norm": 2.466064929962158, + "learning_rate": 8.9318296217894e-08, + "loss": 0.9789, + "step": 65065 + }, + { + "epoch": 19.47, + "grad_norm": 2.370903730392456, + "learning_rate": 8.882279138245908e-08, + "loss": 0.8135, + "step": 65070 + }, + { + "epoch": 19.47, + "grad_norm": 3.7410812377929688, + "learning_rate": 8.832866236544446e-08, + "loss": 0.9207, + "step": 65075 + }, + { + "epoch": 19.47, + "grad_norm": 2.961038589477539, + "learning_rate": 8.783590919414497e-08, + "loss": 1.0046, + "step": 65080 + }, + { + "epoch": 19.47, + "grad_norm": 2.3268706798553467, + "learning_rate": 8.734453189577497e-08, + "loss": 0.9985, + "step": 65085 + }, + { + "epoch": 19.47, + "grad_norm": 2.1825544834136963, + "learning_rate": 8.685453049747105e-08, + "loss": 0.9957, + "step": 65090 + }, + { + "epoch": 19.48, + "grad_norm": 3.99306321144104, + "learning_rate": 8.636590502629494e-08, + "loss": 0.9198, + "step": 65095 + }, + { + "epoch": 19.48, + "grad_norm": 4.728055000305176, + "learning_rate": 8.587865550923057e-08, + "loss": 1.1148, + "step": 65100 + }, + { + "epoch": 19.48, + "grad_norm": 1.5131349563598633, + "learning_rate": 8.539278197319533e-08, + "loss": 0.8645, + "step": 65105 + }, + { + "epoch": 19.48, + "grad_norm": 4.779293537139893, + "learning_rate": 8.490828444501775e-08, + "loss": 0.9724, + "step": 65110 + }, + { + "epoch": 19.48, + "grad_norm": 2.5396671295166016, + "learning_rate": 8.442516295145974e-08, + "loss": 0.9741, + "step": 65115 + }, + { + "epoch": 19.48, + "grad_norm": 2.4737155437469482, + "learning_rate": 8.394341751919998e-08, + "loss": 1.1152, + "step": 65120 + }, + { + "epoch": 19.48, + "grad_norm": 3.827990770339966, + "learning_rate": 8.346304817484496e-08, + "loss": 0.9559, + "step": 65125 + }, + { + "epoch": 19.49, + "grad_norm": 2.1786043643951416, + "learning_rate": 8.307974350048597e-08, + "loss": 0.8171, + "step": 65130 + }, + { + "epoch": 19.49, + "grad_norm": 2.547623634338379, + "learning_rate": 8.260185118116881e-08, + "loss": 0.9985, + "step": 65135 + }, + { + "epoch": 19.49, + "grad_norm": 2.0980887413024902, + "learning_rate": 8.212533502385267e-08, + "loss": 0.9029, + "step": 65140 + }, + { + "epoch": 19.49, + "grad_norm": 3.6388769149780273, + "learning_rate": 8.165019505485261e-08, + "loss": 1.0007, + "step": 65145 + }, + { + "epoch": 19.49, + "grad_norm": 3.2435462474823, + "learning_rate": 8.117643130041152e-08, + "loss": 1.0546, + "step": 65150 + }, + { + "epoch": 19.49, + "grad_norm": 2.3990752696990967, + "learning_rate": 8.070404378669461e-08, + "loss": 0.9811, + "step": 65155 + }, + { + "epoch": 19.5, + "grad_norm": 2.219163179397583, + "learning_rate": 8.02330325397893e-08, + "loss": 1.179, + "step": 65160 + }, + { + "epoch": 19.5, + "grad_norm": 1.6309661865234375, + "learning_rate": 7.976339758571094e-08, + "loss": 0.8849, + "step": 65165 + }, + { + "epoch": 19.5, + "grad_norm": 1.6608452796936035, + "learning_rate": 7.929513895039987e-08, + "loss": 0.8809, + "step": 65170 + }, + { + "epoch": 19.5, + "grad_norm": 3.3001632690429688, + "learning_rate": 7.882825665971316e-08, + "loss": 0.9709, + "step": 65175 + }, + { + "epoch": 19.5, + "grad_norm": 3.3454103469848633, + "learning_rate": 7.836275073943577e-08, + "loss": 0.9296, + "step": 65180 + }, + { + "epoch": 19.5, + "grad_norm": 3.6163206100463867, + "learning_rate": 7.789862121528325e-08, + "loss": 1.0389, + "step": 65185 + }, + { + "epoch": 19.5, + "grad_norm": 2.1792218685150146, + "learning_rate": 7.743586811288228e-08, + "loss": 0.8764, + "step": 65190 + }, + { + "epoch": 19.51, + "grad_norm": 36.940330505371094, + "learning_rate": 7.6974491457793e-08, + "loss": 1.1318, + "step": 65195 + }, + { + "epoch": 19.51, + "grad_norm": 1.098024845123291, + "learning_rate": 7.651449127549503e-08, + "loss": 0.9647, + "step": 65200 + }, + { + "epoch": 19.51, + "grad_norm": 2.7333242893218994, + "learning_rate": 7.605586759139582e-08, + "loss": 1.0234, + "step": 65205 + }, + { + "epoch": 19.51, + "grad_norm": 3.17170786857605, + "learning_rate": 7.559862043082511e-08, + "loss": 1.0791, + "step": 65210 + }, + { + "epoch": 19.51, + "grad_norm": 3.0869386196136475, + "learning_rate": 7.514274981903491e-08, + "loss": 0.8448, + "step": 65215 + }, + { + "epoch": 19.51, + "grad_norm": 1.4254027605056763, + "learning_rate": 7.468825578120508e-08, + "loss": 1.0125, + "step": 65220 + }, + { + "epoch": 19.51, + "grad_norm": 4.486727237701416, + "learning_rate": 7.423513834242945e-08, + "loss": 0.7673, + "step": 65225 + }, + { + "epoch": 19.52, + "grad_norm": 2.695744514465332, + "learning_rate": 7.378339752774077e-08, + "loss": 0.9713, + "step": 65230 + }, + { + "epoch": 19.52, + "grad_norm": 1.8701138496398926, + "learning_rate": 7.333303336208574e-08, + "loss": 1.0323, + "step": 65235 + }, + { + "epoch": 19.52, + "grad_norm": 2.85798978805542, + "learning_rate": 7.288404587033892e-08, + "loss": 0.9666, + "step": 65240 + }, + { + "epoch": 19.52, + "grad_norm": 2.4904799461364746, + "learning_rate": 7.243643507729437e-08, + "loss": 1.0193, + "step": 65245 + }, + { + "epoch": 19.52, + "grad_norm": 1.5817186832427979, + "learning_rate": 7.199020100767395e-08, + "loss": 0.9865, + "step": 65250 + }, + { + "epoch": 19.52, + "grad_norm": 4.257795333862305, + "learning_rate": 7.154534368612465e-08, + "loss": 0.7875, + "step": 65255 + }, + { + "epoch": 19.53, + "grad_norm": 4.011660099029541, + "learning_rate": 7.11018631372129e-08, + "loss": 0.87, + "step": 65260 + }, + { + "epoch": 19.53, + "grad_norm": 4.863183975219727, + "learning_rate": 7.065975938543579e-08, + "loss": 0.9798, + "step": 65265 + }, + { + "epoch": 19.53, + "grad_norm": 2.2304656505584717, + "learning_rate": 7.021903245520433e-08, + "loss": 0.9956, + "step": 65270 + }, + { + "epoch": 19.53, + "grad_norm": 2.4296457767486572, + "learning_rate": 6.977968237086574e-08, + "loss": 0.9869, + "step": 65275 + }, + { + "epoch": 19.53, + "grad_norm": 1.2431954145431519, + "learning_rate": 6.934170915668114e-08, + "loss": 1.1323, + "step": 65280 + }, + { + "epoch": 19.53, + "grad_norm": 2.272568464279175, + "learning_rate": 6.890511283683953e-08, + "loss": 1.0307, + "step": 65285 + }, + { + "epoch": 19.53, + "grad_norm": 3.0335166454315186, + "learning_rate": 6.846989343545219e-08, + "loss": 1.0689, + "step": 65290 + }, + { + "epoch": 19.54, + "grad_norm": 2.1746132373809814, + "learning_rate": 6.803605097656096e-08, + "loss": 1.0758, + "step": 65295 + }, + { + "epoch": 19.54, + "grad_norm": 1.310981035232544, + "learning_rate": 6.760358548411893e-08, + "loss": 1.0002, + "step": 65300 + }, + { + "epoch": 19.54, + "grad_norm": 3.3575997352600098, + "learning_rate": 6.717249698202088e-08, + "loss": 0.8939, + "step": 65305 + }, + { + "epoch": 19.54, + "grad_norm": 1.3703774213790894, + "learning_rate": 6.674278549406444e-08, + "loss": 0.9257, + "step": 65310 + }, + { + "epoch": 19.54, + "grad_norm": 1.6967878341674805, + "learning_rate": 6.63144510439917e-08, + "loss": 1.0022, + "step": 65315 + }, + { + "epoch": 19.54, + "grad_norm": 3.0534799098968506, + "learning_rate": 6.588749365545044e-08, + "loss": 1.0344, + "step": 65320 + }, + { + "epoch": 19.54, + "grad_norm": 2.0059475898742676, + "learning_rate": 6.546191335202734e-08, + "loss": 0.8697, + "step": 65325 + }, + { + "epoch": 19.55, + "grad_norm": 2.44685697555542, + "learning_rate": 6.503771015722582e-08, + "loss": 0.9656, + "step": 65330 + }, + { + "epoch": 19.55, + "grad_norm": 2.028308868408203, + "learning_rate": 6.461488409447159e-08, + "loss": 0.7818, + "step": 65335 + }, + { + "epoch": 19.55, + "grad_norm": 1.6491647958755493, + "learning_rate": 6.419343518711818e-08, + "loss": 0.822, + "step": 65340 + }, + { + "epoch": 19.55, + "grad_norm": 2.1462082862854004, + "learning_rate": 6.37733634584442e-08, + "loss": 1.0347, + "step": 65345 + }, + { + "epoch": 19.55, + "grad_norm": 2.555720329284668, + "learning_rate": 6.335466893164499e-08, + "loss": 0.9114, + "step": 65350 + }, + { + "epoch": 19.55, + "grad_norm": 1.5168747901916504, + "learning_rate": 6.293735162984926e-08, + "loss": 0.8913, + "step": 65355 + }, + { + "epoch": 19.55, + "grad_norm": 3.1510086059570312, + "learning_rate": 6.252141157610247e-08, + "loss": 1.0805, + "step": 65360 + }, + { + "epoch": 19.56, + "grad_norm": 1.3030726909637451, + "learning_rate": 6.210684879337514e-08, + "loss": 0.9486, + "step": 65365 + }, + { + "epoch": 19.56, + "grad_norm": 1.6003532409667969, + "learning_rate": 6.16936633045656e-08, + "loss": 1.0441, + "step": 65370 + }, + { + "epoch": 19.56, + "grad_norm": 4.755357265472412, + "learning_rate": 6.128185513249452e-08, + "loss": 0.9853, + "step": 65375 + }, + { + "epoch": 19.56, + "grad_norm": 2.193000316619873, + "learning_rate": 6.087142429990478e-08, + "loss": 1.1333, + "step": 65380 + }, + { + "epoch": 19.56, + "grad_norm": 2.2681541442871094, + "learning_rate": 6.04623708294616e-08, + "loss": 1.1406, + "step": 65385 + }, + { + "epoch": 19.56, + "grad_norm": 3.417823314666748, + "learning_rate": 6.00546947437608e-08, + "loss": 1.0069, + "step": 65390 + }, + { + "epoch": 19.57, + "grad_norm": 2.2752387523651123, + "learning_rate": 5.964839606531214e-08, + "loss": 1.0824, + "step": 65395 + }, + { + "epoch": 19.57, + "grad_norm": 8.419718742370605, + "learning_rate": 5.924347481656156e-08, + "loss": 0.9543, + "step": 65400 + }, + { + "epoch": 19.57, + "grad_norm": 6.15936279296875, + "learning_rate": 5.8839931019868955e-08, + "loss": 0.9819, + "step": 65405 + }, + { + "epoch": 19.57, + "grad_norm": 3.1836822032928467, + "learning_rate": 5.8437764697522044e-08, + "loss": 1.0345, + "step": 65410 + }, + { + "epoch": 19.57, + "grad_norm": 2.2897796630859375, + "learning_rate": 5.803697587173085e-08, + "loss": 0.9194, + "step": 65415 + }, + { + "epoch": 19.57, + "grad_norm": 2.0201714038848877, + "learning_rate": 5.7637564564633205e-08, + "loss": 1.1888, + "step": 65420 + }, + { + "epoch": 19.57, + "grad_norm": 3.7068679332733154, + "learning_rate": 5.7239530798283704e-08, + "loss": 0.9841, + "step": 65425 + }, + { + "epoch": 19.58, + "grad_norm": 1.6118470430374146, + "learning_rate": 5.684287459467308e-08, + "loss": 0.9835, + "step": 65430 + }, + { + "epoch": 19.58, + "grad_norm": 2.5250370502471924, + "learning_rate": 5.6447595975700485e-08, + "loss": 0.8138, + "step": 65435 + }, + { + "epoch": 19.58, + "grad_norm": 2.585517406463623, + "learning_rate": 5.6053694963198456e-08, + "loss": 0.9517, + "step": 65440 + }, + { + "epoch": 19.58, + "grad_norm": 1.3797327280044556, + "learning_rate": 5.566117157892459e-08, + "loss": 1.0316, + "step": 65445 + }, + { + "epoch": 19.58, + "grad_norm": 2.215745449066162, + "learning_rate": 5.527002584455876e-08, + "loss": 0.7985, + "step": 65450 + }, + { + "epoch": 19.58, + "grad_norm": 4.586230754852295, + "learning_rate": 5.488025778169759e-08, + "loss": 0.8436, + "step": 65455 + }, + { + "epoch": 19.58, + "grad_norm": 3.00388503074646, + "learning_rate": 5.4491867411871064e-08, + "loss": 1.0267, + "step": 65460 + }, + { + "epoch": 19.59, + "grad_norm": 2.888880729675293, + "learning_rate": 5.410485475652871e-08, + "loss": 0.9659, + "step": 65465 + }, + { + "epoch": 19.59, + "grad_norm": 1.5345345735549927, + "learning_rate": 5.3719219837047865e-08, + "loss": 1.161, + "step": 65470 + }, + { + "epoch": 19.59, + "grad_norm": 3.530027151107788, + "learning_rate": 5.33349626747226e-08, + "loss": 0.9582, + "step": 65475 + }, + { + "epoch": 19.59, + "grad_norm": 1.8898435831069946, + "learning_rate": 5.295208329077761e-08, + "loss": 1.1305, + "step": 65480 + }, + { + "epoch": 19.59, + "grad_norm": 1.663489580154419, + "learning_rate": 5.257058170635709e-08, + "loss": 0.8846, + "step": 65485 + }, + { + "epoch": 19.59, + "grad_norm": 2.8383398056030273, + "learning_rate": 5.2190457942533076e-08, + "loss": 0.8722, + "step": 65490 + }, + { + "epoch": 19.6, + "grad_norm": 2.9115426540374756, + "learning_rate": 5.181171202029711e-08, + "loss": 0.824, + "step": 65495 + }, + { + "epoch": 19.6, + "grad_norm": 7.482906341552734, + "learning_rate": 5.1434343960568565e-08, + "loss": 1.0373, + "step": 65500 + }, + { + "epoch": 19.6, + "grad_norm": 3.7145473957061768, + "learning_rate": 5.105835378418911e-08, + "loss": 0.8548, + "step": 65505 + }, + { + "epoch": 19.6, + "grad_norm": 2.6154332160949707, + "learning_rate": 5.068374151192268e-08, + "loss": 1.0704, + "step": 65510 + }, + { + "epoch": 19.6, + "grad_norm": 8.783279418945312, + "learning_rate": 5.031050716446106e-08, + "loss": 0.8265, + "step": 65515 + }, + { + "epoch": 19.6, + "grad_norm": 2.184469699859619, + "learning_rate": 4.993865076241555e-08, + "loss": 1.0476, + "step": 65520 + }, + { + "epoch": 19.6, + "grad_norm": 2.483076810836792, + "learning_rate": 4.9568172326325266e-08, + "loss": 0.8417, + "step": 65525 + }, + { + "epoch": 19.61, + "grad_norm": 2.2957425117492676, + "learning_rate": 4.919907187664885e-08, + "loss": 0.9391, + "step": 65530 + }, + { + "epoch": 19.61, + "grad_norm": 2.0007917881011963, + "learning_rate": 4.883134943377276e-08, + "loss": 0.8605, + "step": 65535 + }, + { + "epoch": 19.61, + "grad_norm": 2.37748384475708, + "learning_rate": 4.846500501800577e-08, + "loss": 1.2328, + "step": 65540 + }, + { + "epoch": 19.61, + "grad_norm": 8.686102867126465, + "learning_rate": 4.810003864958168e-08, + "loss": 0.8948, + "step": 65545 + }, + { + "epoch": 19.61, + "grad_norm": 2.3307371139526367, + "learning_rate": 4.773645034865659e-08, + "loss": 1.0579, + "step": 65550 + }, + { + "epoch": 19.61, + "grad_norm": 4.153103351593018, + "learning_rate": 4.737424013531166e-08, + "loss": 0.7621, + "step": 65555 + }, + { + "epoch": 19.61, + "grad_norm": 4.211757659912109, + "learning_rate": 4.701340802955034e-08, + "loss": 1.0984, + "step": 65560 + }, + { + "epoch": 19.62, + "grad_norm": 4.062090873718262, + "learning_rate": 4.6653954051298354e-08, + "loss": 0.9231, + "step": 65565 + }, + { + "epoch": 19.62, + "grad_norm": 1.6518398523330688, + "learning_rate": 4.6295878220414815e-08, + "loss": 0.8956, + "step": 65570 + }, + { + "epoch": 19.62, + "grad_norm": 2.662663698196411, + "learning_rate": 4.5939180556670016e-08, + "loss": 0.9944, + "step": 65575 + }, + { + "epoch": 19.62, + "grad_norm": 3.212043523788452, + "learning_rate": 4.5583861079767645e-08, + "loss": 0.8612, + "step": 65580 + }, + { + "epoch": 19.62, + "grad_norm": 2.235893964767456, + "learning_rate": 4.5229919809328115e-08, + "loss": 0.9491, + "step": 65585 + }, + { + "epoch": 19.62, + "grad_norm": 2.6770949363708496, + "learning_rate": 4.4877356764902455e-08, + "loss": 0.8584, + "step": 65590 + }, + { + "epoch": 19.63, + "grad_norm": 6.8069939613342285, + "learning_rate": 4.452617196595843e-08, + "loss": 0.895, + "step": 65595 + }, + { + "epoch": 19.63, + "grad_norm": 2.035522937774658, + "learning_rate": 4.417636543189718e-08, + "loss": 0.8317, + "step": 65600 + }, + { + "epoch": 19.63, + "grad_norm": 2.070621967315674, + "learning_rate": 4.3827937182033816e-08, + "loss": 0.9145, + "step": 65605 + }, + { + "epoch": 19.63, + "grad_norm": 1.5345319509506226, + "learning_rate": 4.348088723561128e-08, + "loss": 0.9873, + "step": 65610 + }, + { + "epoch": 19.63, + "grad_norm": 1.464187741279602, + "learning_rate": 4.313521561180034e-08, + "loss": 0.9745, + "step": 65615 + }, + { + "epoch": 19.63, + "grad_norm": 5.471027851104736, + "learning_rate": 4.2790922329691287e-08, + "loss": 0.9186, + "step": 65620 + }, + { + "epoch": 19.63, + "grad_norm": 1.9127711057662964, + "learning_rate": 4.2448007408296684e-08, + "loss": 0.9168, + "step": 65625 + }, + { + "epoch": 19.64, + "grad_norm": 2.524143934249878, + "learning_rate": 4.210647086655695e-08, + "loss": 1.0501, + "step": 65630 + }, + { + "epoch": 19.64, + "grad_norm": 3.043079137802124, + "learning_rate": 4.1766312723334755e-08, + "loss": 0.7822, + "step": 65635 + }, + { + "epoch": 19.64, + "grad_norm": 2.911064624786377, + "learning_rate": 4.1427532997415086e-08, + "loss": 1.012, + "step": 65640 + }, + { + "epoch": 19.64, + "grad_norm": 5.481101036071777, + "learning_rate": 4.109013170751075e-08, + "loss": 1.0, + "step": 65645 + }, + { + "epoch": 19.64, + "grad_norm": 2.473296880722046, + "learning_rate": 4.075410887225684e-08, + "loss": 0.8792, + "step": 65650 + }, + { + "epoch": 19.64, + "grad_norm": 2.742133855819702, + "learning_rate": 4.041946451020795e-08, + "loss": 0.9363, + "step": 65655 + }, + { + "epoch": 19.64, + "grad_norm": 1.7099344730377197, + "learning_rate": 4.008619863984653e-08, + "loss": 1.0857, + "step": 65660 + }, + { + "epoch": 19.65, + "grad_norm": 4.01974630355835, + "learning_rate": 3.975431127958285e-08, + "loss": 1.0116, + "step": 65665 + }, + { + "epoch": 19.65, + "grad_norm": 6.58184814453125, + "learning_rate": 3.9423802447743905e-08, + "loss": 0.7649, + "step": 65670 + }, + { + "epoch": 19.65, + "grad_norm": 3.7717185020446777, + "learning_rate": 3.909467216258178e-08, + "loss": 0.8852, + "step": 65675 + }, + { + "epoch": 19.65, + "grad_norm": 1.591399908065796, + "learning_rate": 3.876692044227359e-08, + "loss": 0.932, + "step": 65680 + }, + { + "epoch": 19.65, + "grad_norm": 1.8852194547653198, + "learning_rate": 3.8440547304927074e-08, + "loss": 0.9933, + "step": 65685 + }, + { + "epoch": 19.65, + "grad_norm": 5.670814037322998, + "learning_rate": 3.811555276855838e-08, + "loss": 1.0301, + "step": 65690 + }, + { + "epoch": 19.66, + "grad_norm": 1.9637709856033325, + "learning_rate": 3.7791936851125365e-08, + "loss": 0.9422, + "step": 65695 + }, + { + "epoch": 19.66, + "grad_norm": 2.995957612991333, + "learning_rate": 3.746969957049429e-08, + "loss": 0.8587, + "step": 65700 + }, + { + "epoch": 19.66, + "grad_norm": 2.3271963596343994, + "learning_rate": 3.7148840944464804e-08, + "loss": 0.8903, + "step": 65705 + }, + { + "epoch": 19.66, + "grad_norm": 2.014490842819214, + "learning_rate": 3.682936099075884e-08, + "loss": 1.0424, + "step": 65710 + }, + { + "epoch": 19.66, + "grad_norm": 3.5324313640594482, + "learning_rate": 3.6511259727017856e-08, + "loss": 0.7962, + "step": 65715 + }, + { + "epoch": 19.66, + "grad_norm": 2.035303831100464, + "learning_rate": 3.619453717081389e-08, + "loss": 0.9449, + "step": 65720 + }, + { + "epoch": 19.66, + "grad_norm": 12.83745288848877, + "learning_rate": 3.587919333963574e-08, + "loss": 1.0862, + "step": 65725 + }, + { + "epoch": 19.67, + "grad_norm": 1.7152767181396484, + "learning_rate": 3.556522825090281e-08, + "loss": 0.8594, + "step": 65730 + }, + { + "epoch": 19.67, + "grad_norm": 2.850773334503174, + "learning_rate": 3.525264192195121e-08, + "loss": 1.0139, + "step": 65735 + }, + { + "epoch": 19.67, + "grad_norm": 4.40149450302124, + "learning_rate": 3.494143437004771e-08, + "loss": 0.8235, + "step": 65740 + }, + { + "epoch": 19.67, + "grad_norm": 2.7569260597229004, + "learning_rate": 3.463160561237855e-08, + "loss": 0.8739, + "step": 65745 + }, + { + "epoch": 19.67, + "grad_norm": 2.746654510498047, + "learning_rate": 3.432315566605504e-08, + "loss": 0.9751, + "step": 65750 + }, + { + "epoch": 19.67, + "grad_norm": 3.1638407707214355, + "learning_rate": 3.4016084548116336e-08, + "loss": 0.9332, + "step": 65755 + }, + { + "epoch": 19.67, + "grad_norm": 2.6680147647857666, + "learning_rate": 3.371039227551553e-08, + "loss": 0.8459, + "step": 65760 + }, + { + "epoch": 19.68, + "grad_norm": 3.2998719215393066, + "learning_rate": 3.3406078865139137e-08, + "loss": 0.9761, + "step": 65765 + }, + { + "epoch": 19.68, + "grad_norm": 1.581969141960144, + "learning_rate": 3.3103144333793134e-08, + "loss": 1.1189, + "step": 65770 + }, + { + "epoch": 19.68, + "grad_norm": 5.630129337310791, + "learning_rate": 3.280158869821137e-08, + "loss": 0.8332, + "step": 65775 + }, + { + "epoch": 19.68, + "grad_norm": 2.6767990589141846, + "learning_rate": 3.250141197504442e-08, + "loss": 1.0059, + "step": 65780 + }, + { + "epoch": 19.68, + "grad_norm": 3.5217807292938232, + "learning_rate": 3.2202614180870674e-08, + "loss": 0.9764, + "step": 65785 + }, + { + "epoch": 19.68, + "grad_norm": 2.578888177871704, + "learning_rate": 3.190519533219638e-08, + "loss": 0.7636, + "step": 65790 + }, + { + "epoch": 19.69, + "grad_norm": 3.0376648902893066, + "learning_rate": 3.160915544544452e-08, + "loss": 0.9932, + "step": 65795 + }, + { + "epoch": 19.69, + "grad_norm": 4.299635887145996, + "learning_rate": 3.1314494536965886e-08, + "loss": 0.6652, + "step": 65800 + }, + { + "epoch": 19.69, + "grad_norm": 2.366783857345581, + "learning_rate": 3.1021212623033594e-08, + "loss": 1.089, + "step": 65805 + }, + { + "epoch": 19.69, + "grad_norm": 5.171496391296387, + "learning_rate": 3.0729309719845775e-08, + "loss": 0.9039, + "step": 65810 + }, + { + "epoch": 19.69, + "grad_norm": 3.2826364040374756, + "learning_rate": 3.043878584352566e-08, + "loss": 0.9896, + "step": 65815 + }, + { + "epoch": 19.69, + "grad_norm": 2.9492897987365723, + "learning_rate": 3.014964101011597e-08, + "loss": 1.0328, + "step": 65820 + }, + { + "epoch": 19.69, + "grad_norm": 3.130568265914917, + "learning_rate": 2.9861875235587255e-08, + "loss": 1.2715, + "step": 65825 + }, + { + "epoch": 19.7, + "grad_norm": 2.2713990211486816, + "learning_rate": 2.9575488535829587e-08, + "loss": 1.0102, + "step": 65830 + }, + { + "epoch": 19.7, + "grad_norm": 2.3380908966064453, + "learning_rate": 2.929048092666642e-08, + "loss": 0.9455, + "step": 65835 + }, + { + "epoch": 19.7, + "grad_norm": 4.085785865783691, + "learning_rate": 2.9006852423832386e-08, + "loss": 0.8658, + "step": 65840 + }, + { + "epoch": 19.7, + "grad_norm": 3.3306400775909424, + "learning_rate": 2.8724603042992735e-08, + "loss": 0.8673, + "step": 65845 + }, + { + "epoch": 19.7, + "grad_norm": 3.320685863494873, + "learning_rate": 2.8443732799740554e-08, + "loss": 1.0188, + "step": 65850 + }, + { + "epoch": 19.7, + "grad_norm": 3.164665460586548, + "learning_rate": 2.8164241709582873e-08, + "loss": 1.046, + "step": 65855 + }, + { + "epoch": 19.7, + "grad_norm": 3.0726613998413086, + "learning_rate": 2.7886129787954573e-08, + "loss": 0.8932, + "step": 65860 + }, + { + "epoch": 19.71, + "grad_norm": 2.211113452911377, + "learning_rate": 2.7609397050221143e-08, + "loss": 0.8676, + "step": 65865 + }, + { + "epoch": 19.71, + "grad_norm": 3.0560343265533447, + "learning_rate": 2.7334043511662023e-08, + "loss": 0.9694, + "step": 65870 + }, + { + "epoch": 19.71, + "grad_norm": 1.7159016132354736, + "learning_rate": 2.706006918748727e-08, + "loss": 0.8506, + "step": 65875 + }, + { + "epoch": 19.71, + "grad_norm": 2.9933714866638184, + "learning_rate": 2.678747409282645e-08, + "loss": 0.9774, + "step": 65880 + }, + { + "epoch": 19.71, + "grad_norm": 2.8719935417175293, + "learning_rate": 2.6516258242736958e-08, + "loss": 0.8631, + "step": 65885 + }, + { + "epoch": 19.71, + "grad_norm": 1.9159245491027832, + "learning_rate": 2.624642165219293e-08, + "loss": 0.7401, + "step": 65890 + }, + { + "epoch": 19.72, + "grad_norm": 5.0512590408325195, + "learning_rate": 2.597796433610189e-08, + "loss": 0.8452, + "step": 65895 + }, + { + "epoch": 19.72, + "grad_norm": 2.150611639022827, + "learning_rate": 2.571088630929086e-08, + "loss": 0.9875, + "step": 65900 + }, + { + "epoch": 19.72, + "grad_norm": 5.610554218292236, + "learning_rate": 2.5445187586503606e-08, + "loss": 0.9885, + "step": 65905 + }, + { + "epoch": 19.72, + "grad_norm": 2.4917097091674805, + "learning_rate": 2.5180868182422822e-08, + "loss": 1.0434, + "step": 65910 + }, + { + "epoch": 19.72, + "grad_norm": 2.9542877674102783, + "learning_rate": 2.4917928111639623e-08, + "loss": 1.0089, + "step": 65915 + }, + { + "epoch": 19.72, + "grad_norm": 2.203429937362671, + "learning_rate": 2.4656367388681268e-08, + "loss": 0.981, + "step": 65920 + }, + { + "epoch": 19.72, + "grad_norm": 3.659770965576172, + "learning_rate": 2.4396186027991764e-08, + "loss": 1.1026, + "step": 65925 + }, + { + "epoch": 19.73, + "grad_norm": 2.265361785888672, + "learning_rate": 2.413738404394017e-08, + "loss": 0.9166, + "step": 65930 + }, + { + "epoch": 19.73, + "grad_norm": 2.5812058448791504, + "learning_rate": 2.3879961450817835e-08, + "loss": 1.1483, + "step": 65935 + }, + { + "epoch": 19.73, + "grad_norm": 2.302971363067627, + "learning_rate": 2.3623918262846712e-08, + "loss": 0.8114, + "step": 65940 + }, + { + "epoch": 19.73, + "grad_norm": 2.793581008911133, + "learning_rate": 2.3369254494162718e-08, + "loss": 1.0621, + "step": 65945 + }, + { + "epoch": 19.73, + "grad_norm": 5.097170829772949, + "learning_rate": 2.3115970158835155e-08, + "loss": 0.9018, + "step": 65950 + }, + { + "epoch": 19.73, + "grad_norm": 3.7390031814575195, + "learning_rate": 2.2864065270850054e-08, + "loss": 1.0263, + "step": 65955 + }, + { + "epoch": 19.73, + "grad_norm": 2.8096506595611572, + "learning_rate": 2.2613539844118514e-08, + "loss": 1.0402, + "step": 65960 + }, + { + "epoch": 19.74, + "grad_norm": 1.8856858015060425, + "learning_rate": 2.236439389247946e-08, + "loss": 0.9717, + "step": 65965 + }, + { + "epoch": 19.74, + "grad_norm": 3.062253713607788, + "learning_rate": 2.2116627429694116e-08, + "loss": 0.8636, + "step": 65970 + }, + { + "epoch": 19.74, + "grad_norm": 2.116095781326294, + "learning_rate": 2.1870240469440417e-08, + "loss": 0.8674, + "step": 65975 + }, + { + "epoch": 19.74, + "grad_norm": 4.265397071838379, + "learning_rate": 2.162523302533248e-08, + "loss": 0.9501, + "step": 65980 + }, + { + "epoch": 19.74, + "grad_norm": 3.8397390842437744, + "learning_rate": 2.138160511090115e-08, + "loss": 0.9542, + "step": 65985 + }, + { + "epoch": 19.74, + "grad_norm": 5.092801570892334, + "learning_rate": 2.1139356739596773e-08, + "loss": 0.9933, + "step": 65990 + }, + { + "epoch": 19.74, + "grad_norm": 3.441525459289551, + "learning_rate": 2.0898487924803088e-08, + "loss": 0.8262, + "step": 65995 + }, + { + "epoch": 19.75, + "grad_norm": 2.899848461151123, + "learning_rate": 2.0658998679820573e-08, + "loss": 1.0899, + "step": 66000 + }, + { + "epoch": 19.75, + "grad_norm": 2.0572032928466797, + "learning_rate": 2.0420889017880305e-08, + "loss": 0.7595, + "step": 66005 + }, + { + "epoch": 19.75, + "grad_norm": 4.190646171569824, + "learning_rate": 2.0184158952124555e-08, + "loss": 0.8722, + "step": 66010 + }, + { + "epoch": 19.75, + "grad_norm": 2.0695345401763916, + "learning_rate": 1.9948808495637293e-08, + "loss": 1.0758, + "step": 66015 + }, + { + "epoch": 19.75, + "grad_norm": 3.038285732269287, + "learning_rate": 1.9714837661408138e-08, + "loss": 0.9091, + "step": 66020 + }, + { + "epoch": 19.75, + "grad_norm": 1.420379400253296, + "learning_rate": 1.9482246462365628e-08, + "loss": 0.9363, + "step": 66025 + }, + { + "epoch": 19.76, + "grad_norm": 2.4541139602661133, + "learning_rate": 1.9251034911352274e-08, + "loss": 0.9865, + "step": 66030 + }, + { + "epoch": 19.76, + "grad_norm": 1.3549059629440308, + "learning_rate": 1.9021203021135637e-08, + "loss": 1.0879, + "step": 66035 + }, + { + "epoch": 19.76, + "grad_norm": 3.1724984645843506, + "learning_rate": 1.8792750804413894e-08, + "loss": 0.9949, + "step": 66040 + }, + { + "epoch": 19.76, + "grad_norm": 2.643465995788574, + "learning_rate": 1.8565678273801956e-08, + "loss": 1.0872, + "step": 66045 + }, + { + "epoch": 19.76, + "grad_norm": 2.5308868885040283, + "learning_rate": 1.8339985441842567e-08, + "loss": 0.9338, + "step": 66050 + }, + { + "epoch": 19.76, + "grad_norm": 3.499067783355713, + "learning_rate": 1.8115672320995202e-08, + "loss": 1.0337, + "step": 66055 + }, + { + "epoch": 19.76, + "grad_norm": 3.2034714221954346, + "learning_rate": 1.7892738923655504e-08, + "loss": 0.859, + "step": 66060 + }, + { + "epoch": 19.77, + "grad_norm": 4.103240489959717, + "learning_rate": 1.7671185262130296e-08, + "loss": 0.8122, + "step": 66065 + }, + { + "epoch": 19.77, + "grad_norm": 2.9544129371643066, + "learning_rate": 1.745101134865701e-08, + "loss": 0.9386, + "step": 66070 + }, + { + "epoch": 19.77, + "grad_norm": 1.3058322668075562, + "learning_rate": 1.7232217195398138e-08, + "loss": 0.9956, + "step": 66075 + }, + { + "epoch": 19.77, + "grad_norm": 4.783046245574951, + "learning_rate": 1.7014802814435682e-08, + "loss": 0.8943, + "step": 66080 + }, + { + "epoch": 19.77, + "grad_norm": 1.3274574279785156, + "learning_rate": 1.6798768217776706e-08, + "loss": 1.0416, + "step": 66085 + }, + { + "epoch": 19.77, + "grad_norm": 1.9618406295776367, + "learning_rate": 1.658411341735333e-08, + "loss": 0.8388, + "step": 66090 + }, + { + "epoch": 19.77, + "grad_norm": 1.6042386293411255, + "learning_rate": 1.637083842502274e-08, + "loss": 1.0276, + "step": 66095 + }, + { + "epoch": 19.78, + "grad_norm": 1.6663395166397095, + "learning_rate": 1.6158943252558845e-08, + "loss": 0.885, + "step": 66100 + }, + { + "epoch": 19.78, + "grad_norm": 10.641562461853027, + "learning_rate": 1.594842791166895e-08, + "loss": 0.7403, + "step": 66105 + }, + { + "epoch": 19.78, + "grad_norm": 2.9796385765075684, + "learning_rate": 1.5739292413977093e-08, + "loss": 1.0609, + "step": 66110 + }, + { + "epoch": 19.78, + "grad_norm": 13.236249923706055, + "learning_rate": 1.553153677103236e-08, + "loss": 0.9377, + "step": 66115 + }, + { + "epoch": 19.78, + "grad_norm": 8.828291893005371, + "learning_rate": 1.5325160994314468e-08, + "loss": 1.1329, + "step": 66120 + }, + { + "epoch": 19.78, + "grad_norm": 1.7481231689453125, + "learning_rate": 1.5120165095217077e-08, + "loss": 0.9678, + "step": 66125 + }, + { + "epoch": 19.79, + "grad_norm": 5.550840854644775, + "learning_rate": 1.491654908506168e-08, + "loss": 1.0408, + "step": 66130 + }, + { + "epoch": 19.79, + "grad_norm": 1.4550460577011108, + "learning_rate": 1.4714312975094847e-08, + "loss": 0.9088, + "step": 66135 + }, + { + "epoch": 19.79, + "grad_norm": 3.302130937576294, + "learning_rate": 1.4513456776485413e-08, + "loss": 0.9667, + "step": 66140 + }, + { + "epoch": 19.79, + "grad_norm": 4.979493141174316, + "learning_rate": 1.4313980500327284e-08, + "loss": 1.0144, + "step": 66145 + }, + { + "epoch": 19.79, + "grad_norm": 3.0126538276672363, + "learning_rate": 1.411588415763665e-08, + "loss": 0.9664, + "step": 66150 + }, + { + "epoch": 19.79, + "grad_norm": 3.2353785037994385, + "learning_rate": 1.3919167759354756e-08, + "loss": 1.0019, + "step": 66155 + }, + { + "epoch": 19.79, + "grad_norm": 2.159238815307617, + "learning_rate": 1.3723831316345137e-08, + "loss": 0.8879, + "step": 66160 + }, + { + "epoch": 19.8, + "grad_norm": 2.4272637367248535, + "learning_rate": 1.3529874839396383e-08, + "loss": 0.984, + "step": 66165 + }, + { + "epoch": 19.8, + "grad_norm": 2.7232370376586914, + "learning_rate": 1.3337298339219372e-08, + "loss": 0.9735, + "step": 66170 + }, + { + "epoch": 19.8, + "grad_norm": 3.3963069915771484, + "learning_rate": 1.3146101826452818e-08, + "loss": 0.8962, + "step": 66175 + }, + { + "epoch": 19.8, + "grad_norm": 4.68864631652832, + "learning_rate": 1.2956285311654937e-08, + "loss": 0.9397, + "step": 66180 + }, + { + "epoch": 19.8, + "grad_norm": 1.9277212619781494, + "learning_rate": 1.2767848805309013e-08, + "loss": 0.9896, + "step": 66185 + }, + { + "epoch": 19.8, + "grad_norm": 1.3516850471496582, + "learning_rate": 1.258079231782061e-08, + "loss": 1.0026, + "step": 66190 + }, + { + "epoch": 19.8, + "grad_norm": 1.6840862035751343, + "learning_rate": 1.2395115859525907e-08, + "loss": 0.9797, + "step": 66195 + }, + { + "epoch": 19.81, + "grad_norm": 2.9334299564361572, + "learning_rate": 1.221081944067226e-08, + "loss": 1.1275, + "step": 66200 + }, + { + "epoch": 19.81, + "grad_norm": 16.37929916381836, + "learning_rate": 1.2027903071440416e-08, + "loss": 0.8693, + "step": 66205 + }, + { + "epoch": 19.81, + "grad_norm": 2.172717571258545, + "learning_rate": 1.1846366761936179e-08, + "loss": 0.8475, + "step": 66210 + }, + { + "epoch": 19.81, + "grad_norm": 2.417130947113037, + "learning_rate": 1.1666210522184861e-08, + "loss": 1.1559, + "step": 66215 + }, + { + "epoch": 19.81, + "grad_norm": 1.5092058181762695, + "learning_rate": 1.1487434362131288e-08, + "loss": 0.8431, + "step": 66220 + }, + { + "epoch": 19.81, + "grad_norm": 2.016624927520752, + "learning_rate": 1.1310038291656443e-08, + "loss": 1.1364, + "step": 66225 + }, + { + "epoch": 19.82, + "grad_norm": 4.214022159576416, + "learning_rate": 1.113402232054972e-08, + "loss": 0.9275, + "step": 66230 + }, + { + "epoch": 19.82, + "grad_norm": 1.9043920040130615, + "learning_rate": 1.0959386458539444e-08, + "loss": 0.9576, + "step": 66235 + }, + { + "epoch": 19.82, + "grad_norm": 2.4653420448303223, + "learning_rate": 1.0786130715267906e-08, + "loss": 0.9825, + "step": 66240 + }, + { + "epoch": 19.82, + "grad_norm": 1.7439837455749512, + "learning_rate": 1.0614255100299675e-08, + "loss": 0.9889, + "step": 66245 + }, + { + "epoch": 19.82, + "grad_norm": 2.955815076828003, + "learning_rate": 1.0443759623135484e-08, + "loss": 0.8389, + "step": 66250 + }, + { + "epoch": 19.82, + "grad_norm": 2.9133384227752686, + "learning_rate": 1.0274644293184477e-08, + "loss": 0.797, + "step": 66255 + }, + { + "epoch": 19.82, + "grad_norm": 2.106208562850952, + "learning_rate": 1.0106909119791952e-08, + "loss": 0.9157, + "step": 66260 + }, + { + "epoch": 19.83, + "grad_norm": 3.142747640609741, + "learning_rate": 9.940554112217171e-09, + "loss": 1.0129, + "step": 66265 + }, + { + "epoch": 19.83, + "grad_norm": 4.4805121421813965, + "learning_rate": 9.775579279650005e-09, + "loss": 0.9918, + "step": 66270 + }, + { + "epoch": 19.83, + "grad_norm": 1.981816053390503, + "learning_rate": 9.611984631202608e-09, + "loss": 0.9091, + "step": 66275 + }, + { + "epoch": 19.83, + "grad_norm": 2.474738359451294, + "learning_rate": 9.449770175909423e-09, + "loss": 0.9336, + "step": 66280 + }, + { + "epoch": 19.83, + "grad_norm": 3.66359281539917, + "learning_rate": 9.288935922727172e-09, + "loss": 0.9899, + "step": 66285 + }, + { + "epoch": 19.83, + "grad_norm": 3.463992118835449, + "learning_rate": 9.12948188054319e-09, + "loss": 0.6621, + "step": 66290 + }, + { + "epoch": 19.83, + "grad_norm": 2.546389102935791, + "learning_rate": 8.97140805815877e-09, + "loss": 0.9394, + "step": 66295 + }, + { + "epoch": 19.84, + "grad_norm": 2.605010509490967, + "learning_rate": 8.814714464308594e-09, + "loss": 0.9425, + "step": 66300 + }, + { + "epoch": 19.84, + "grad_norm": 1.4219180345535278, + "learning_rate": 8.659401107644072e-09, + "loss": 0.9586, + "step": 66305 + }, + { + "epoch": 19.84, + "grad_norm": 3.061012029647827, + "learning_rate": 8.505467996744455e-09, + "loss": 0.8155, + "step": 66310 + }, + { + "epoch": 19.84, + "grad_norm": 4.381716728210449, + "learning_rate": 8.352915140111273e-09, + "loss": 0.8649, + "step": 66315 + }, + { + "epoch": 19.84, + "grad_norm": 1.0354138612747192, + "learning_rate": 8.201742546168345e-09, + "loss": 0.9967, + "step": 66320 + }, + { + "epoch": 19.84, + "grad_norm": 3.057412624359131, + "learning_rate": 8.051950223267324e-09, + "loss": 1.1387, + "step": 66325 + }, + { + "epoch": 19.85, + "grad_norm": 2.2272489070892334, + "learning_rate": 7.903538179676595e-09, + "loss": 0.9963, + "step": 66330 + }, + { + "epoch": 19.85, + "grad_norm": 2.390744209289551, + "learning_rate": 7.756506423597932e-09, + "loss": 1.0075, + "step": 66335 + }, + { + "epoch": 19.85, + "grad_norm": 2.477290153503418, + "learning_rate": 7.610854963147063e-09, + "loss": 0.9847, + "step": 66340 + }, + { + "epoch": 19.85, + "grad_norm": 2.855509042739868, + "learning_rate": 7.466583806370331e-09, + "loss": 0.908, + "step": 66345 + }, + { + "epoch": 19.85, + "grad_norm": 2.9786510467529297, + "learning_rate": 7.323692961239137e-09, + "loss": 1.0602, + "step": 66350 + }, + { + "epoch": 19.85, + "grad_norm": 2.911388397216797, + "learning_rate": 7.18218243563884e-09, + "loss": 0.8163, + "step": 66355 + }, + { + "epoch": 19.85, + "grad_norm": 3.1563682556152344, + "learning_rate": 7.042052237390961e-09, + "loss": 0.9476, + "step": 66360 + }, + { + "epoch": 19.86, + "grad_norm": 3.7669315338134766, + "learning_rate": 6.903302374228204e-09, + "loss": 0.8065, + "step": 66365 + }, + { + "epoch": 19.86, + "grad_norm": 2.0955512523651123, + "learning_rate": 6.765932853819435e-09, + "loss": 1.1436, + "step": 66370 + }, + { + "epoch": 19.86, + "grad_norm": 2.7320590019226074, + "learning_rate": 6.62994368375025e-09, + "loss": 0.9004, + "step": 66375 + }, + { + "epoch": 19.86, + "grad_norm": 4.438745021820068, + "learning_rate": 6.495334871528536e-09, + "loss": 1.0057, + "step": 66380 + }, + { + "epoch": 19.86, + "grad_norm": 1.9494245052337646, + "learning_rate": 6.3621064245900085e-09, + "loss": 0.8972, + "step": 66385 + }, + { + "epoch": 19.86, + "grad_norm": 1.6904481649398804, + "learning_rate": 6.230258350292673e-09, + "loss": 0.9847, + "step": 66390 + }, + { + "epoch": 19.86, + "grad_norm": 2.8563525676727295, + "learning_rate": 6.09979065591959e-09, + "loss": 0.8644, + "step": 66395 + }, + { + "epoch": 19.87, + "grad_norm": 3.8716516494750977, + "learning_rate": 5.9707033486733345e-09, + "loss": 0.8536, + "step": 66400 + }, + { + "epoch": 19.87, + "grad_norm": 4.005788803100586, + "learning_rate": 5.842996435687087e-09, + "loss": 1.0165, + "step": 66405 + }, + { + "epoch": 19.87, + "grad_norm": 5.931815147399902, + "learning_rate": 5.71666992401354e-09, + "loss": 1.0706, + "step": 66410 + }, + { + "epoch": 19.87, + "grad_norm": 7.53218412399292, + "learning_rate": 5.591723820624894e-09, + "loss": 0.8242, + "step": 66415 + }, + { + "epoch": 19.87, + "grad_norm": 3.1983590126037598, + "learning_rate": 5.468158132426737e-09, + "loss": 0.9099, + "step": 66420 + }, + { + "epoch": 19.87, + "grad_norm": 4.075475692749023, + "learning_rate": 5.3459728662413886e-09, + "loss": 0.9736, + "step": 66425 + }, + { + "epoch": 19.88, + "grad_norm": 1.4461299180984497, + "learning_rate": 5.225168028819005e-09, + "loss": 0.944, + "step": 66430 + }, + { + "epoch": 19.88, + "grad_norm": 3.7994940280914307, + "learning_rate": 5.105743626829251e-09, + "loss": 0.9979, + "step": 66435 + }, + { + "epoch": 19.88, + "grad_norm": 2.046570301055908, + "learning_rate": 4.987699666869627e-09, + "loss": 1.1112, + "step": 66440 + }, + { + "epoch": 19.88, + "grad_norm": 3.123607873916626, + "learning_rate": 4.871036155454367e-09, + "loss": 0.9295, + "step": 66445 + }, + { + "epoch": 19.88, + "grad_norm": 3.029066801071167, + "learning_rate": 4.755753099033866e-09, + "loss": 0.9579, + "step": 66450 + }, + { + "epoch": 19.88, + "grad_norm": 2.6288695335388184, + "learning_rate": 4.641850503972478e-09, + "loss": 1.0604, + "step": 66455 + }, + { + "epoch": 19.88, + "grad_norm": 2.6345958709716797, + "learning_rate": 4.529328376559616e-09, + "loss": 1.0179, + "step": 66460 + }, + { + "epoch": 19.89, + "grad_norm": 2.6742000579833984, + "learning_rate": 4.418186723009754e-09, + "loss": 1.1658, + "step": 66465 + }, + { + "epoch": 19.89, + "grad_norm": 8.863636016845703, + "learning_rate": 4.3084255494652e-09, + "loss": 1.0605, + "step": 66470 + }, + { + "epoch": 19.89, + "grad_norm": 1.21725594997406, + "learning_rate": 4.200044861982222e-09, + "loss": 0.8926, + "step": 66475 + }, + { + "epoch": 19.89, + "grad_norm": 9.21455192565918, + "learning_rate": 4.093044666547696e-09, + "loss": 1.0839, + "step": 66480 + }, + { + "epoch": 19.89, + "grad_norm": 3.9148738384246826, + "learning_rate": 3.987424969073561e-09, + "loss": 1.0616, + "step": 66485 + }, + { + "epoch": 19.89, + "grad_norm": 3.932304859161377, + "learning_rate": 3.883185775394038e-09, + "loss": 0.8605, + "step": 66490 + }, + { + "epoch": 19.89, + "grad_norm": 2.094050168991089, + "learning_rate": 3.780327091262858e-09, + "loss": 1.0609, + "step": 66495 + }, + { + "epoch": 19.9, + "grad_norm": 2.5073344707489014, + "learning_rate": 3.6788489223615884e-09, + "loss": 0.935, + "step": 66500 + }, + { + "epoch": 19.9, + "grad_norm": 3.42728853225708, + "learning_rate": 3.5787512742940785e-09, + "loss": 0.9495, + "step": 66505 + }, + { + "epoch": 19.9, + "grad_norm": 2.900682210922241, + "learning_rate": 3.4800341525920156e-09, + "loss": 1.0571, + "step": 66510 + }, + { + "epoch": 19.9, + "grad_norm": 2.075742483139038, + "learning_rate": 3.3826975627038183e-09, + "loss": 0.8483, + "step": 66515 + }, + { + "epoch": 19.9, + "grad_norm": 1.8850741386413574, + "learning_rate": 3.2867415100085176e-09, + "loss": 0.9677, + "step": 66520 + }, + { + "epoch": 19.9, + "grad_norm": 5.589338779449463, + "learning_rate": 3.192165999801877e-09, + "loss": 1.1917, + "step": 66525 + }, + { + "epoch": 19.91, + "grad_norm": 2.845837116241455, + "learning_rate": 3.098971037310272e-09, + "loss": 0.9501, + "step": 66530 + }, + { + "epoch": 19.91, + "grad_norm": 2.0691795349121094, + "learning_rate": 3.0071566276795858e-09, + "loss": 0.9732, + "step": 66535 + }, + { + "epoch": 19.91, + "grad_norm": 2.152705430984497, + "learning_rate": 2.9167227759807623e-09, + "loss": 1.0665, + "step": 66540 + }, + { + "epoch": 19.91, + "grad_norm": 1.9462311267852783, + "learning_rate": 2.8276694872098053e-09, + "loss": 1.1731, + "step": 66545 + }, + { + "epoch": 19.91, + "grad_norm": 1.606532335281372, + "learning_rate": 2.739996766279451e-09, + "loss": 0.7424, + "step": 66550 + }, + { + "epoch": 19.91, + "grad_norm": 2.7857766151428223, + "learning_rate": 2.6537046180385992e-09, + "loss": 0.8624, + "step": 66555 + }, + { + "epoch": 19.91, + "grad_norm": 1.5321699380874634, + "learning_rate": 2.5687930472501066e-09, + "loss": 0.8455, + "step": 66560 + }, + { + "epoch": 19.92, + "grad_norm": 3.194620370864868, + "learning_rate": 2.485262058604665e-09, + "loss": 1.0387, + "step": 66565 + }, + { + "epoch": 19.92, + "grad_norm": 1.5754766464233398, + "learning_rate": 2.4031116567124756e-09, + "loss": 1.1122, + "step": 66570 + }, + { + "epoch": 19.92, + "grad_norm": 2.2193338871002197, + "learning_rate": 2.3223418461143507e-09, + "loss": 1.1581, + "step": 66575 + }, + { + "epoch": 19.92, + "grad_norm": 2.662733316421509, + "learning_rate": 2.242952631270612e-09, + "loss": 0.8239, + "step": 66580 + }, + { + "epoch": 19.92, + "grad_norm": 2.0737709999084473, + "learning_rate": 2.1649440165638635e-09, + "loss": 1.1056, + "step": 66585 + }, + { + "epoch": 19.92, + "grad_norm": 1.9437065124511719, + "learning_rate": 2.0883160063017713e-09, + "loss": 1.0208, + "step": 66590 + }, + { + "epoch": 19.92, + "grad_norm": 2.8126699924468994, + "learning_rate": 2.0130686047198367e-09, + "loss": 0.8979, + "step": 66595 + }, + { + "epoch": 19.93, + "grad_norm": 5.540438175201416, + "learning_rate": 1.9392018159730684e-09, + "loss": 0.9252, + "step": 66600 + }, + { + "epoch": 19.93, + "grad_norm": 2.1464717388153076, + "learning_rate": 1.8667156441387613e-09, + "loss": 1.1339, + "step": 66605 + }, + { + "epoch": 19.93, + "grad_norm": 1.9437625408172607, + "learning_rate": 1.7956100932220444e-09, + "loss": 0.9708, + "step": 66610 + }, + { + "epoch": 19.93, + "grad_norm": 3.8403539657592773, + "learning_rate": 1.7258851671503317e-09, + "loss": 0.9926, + "step": 66615 + }, + { + "epoch": 19.93, + "grad_norm": 2.2192623615264893, + "learning_rate": 1.657540869773322e-09, + "loss": 1.0566, + "step": 66620 + }, + { + "epoch": 19.93, + "grad_norm": 4.014240741729736, + "learning_rate": 1.5905772048629973e-09, + "loss": 0.8297, + "step": 66625 + }, + { + "epoch": 19.93, + "grad_norm": 4.3666181564331055, + "learning_rate": 1.5249941761247276e-09, + "loss": 0.9347, + "step": 66630 + }, + { + "epoch": 19.94, + "grad_norm": 1.6916534900665283, + "learning_rate": 1.4607917871750643e-09, + "loss": 0.9515, + "step": 66635 + }, + { + "epoch": 19.94, + "grad_norm": 1.9983826875686646, + "learning_rate": 1.3979700415611697e-09, + "loss": 0.9741, + "step": 66640 + }, + { + "epoch": 19.94, + "grad_norm": 2.9387166500091553, + "learning_rate": 1.3365289427524908e-09, + "loss": 0.8877, + "step": 66645 + }, + { + "epoch": 19.94, + "grad_norm": 2.9667611122131348, + "learning_rate": 1.2764684941435346e-09, + "loss": 0.8683, + "step": 66650 + }, + { + "epoch": 19.94, + "grad_norm": 1.2292882204055786, + "learning_rate": 1.2177886990510923e-09, + "loss": 0.8254, + "step": 66655 + }, + { + "epoch": 19.94, + "grad_norm": 1.751691222190857, + "learning_rate": 1.1604895607142397e-09, + "loss": 0.9371, + "step": 66660 + }, + { + "epoch": 19.95, + "grad_norm": 2.478400945663452, + "learning_rate": 1.1045710822971123e-09, + "loss": 0.981, + "step": 66665 + }, + { + "epoch": 19.95, + "grad_norm": 5.2517781257629395, + "learning_rate": 1.0500332668916813e-09, + "loss": 0.8857, + "step": 66670 + }, + { + "epoch": 19.95, + "grad_norm": 3.200549840927124, + "learning_rate": 9.968761175094267e-10, + "loss": 1.0439, + "step": 66675 + }, + { + "epoch": 19.95, + "grad_norm": 3.636068105697632, + "learning_rate": 9.45099637081337e-10, + "loss": 0.9732, + "step": 66680 + }, + { + "epoch": 19.95, + "grad_norm": 2.145059823989868, + "learning_rate": 8.947038284717879e-10, + "loss": 0.7454, + "step": 66685 + }, + { + "epoch": 19.95, + "grad_norm": 1.3678562641143799, + "learning_rate": 8.456886944646636e-10, + "loss": 1.0683, + "step": 66690 + }, + { + "epoch": 19.95, + "grad_norm": 2.720142364501953, + "learning_rate": 7.980542377633571e-10, + "loss": 0.8569, + "step": 66695 + }, + { + "epoch": 19.96, + "grad_norm": 5.969086647033691, + "learning_rate": 7.51800460999097e-10, + "loss": 0.9017, + "step": 66700 + }, + { + "epoch": 19.96, + "grad_norm": 1.7213473320007324, + "learning_rate": 7.069273667281717e-10, + "loss": 0.8733, + "step": 66705 + }, + { + "epoch": 19.96, + "grad_norm": 1.9708019495010376, + "learning_rate": 6.634349574291543e-10, + "loss": 0.978, + "step": 66710 + }, + { + "epoch": 19.96, + "grad_norm": 1.8042583465576172, + "learning_rate": 6.213232355029019e-10, + "loss": 1.2455, + "step": 66715 + }, + { + "epoch": 19.96, + "grad_norm": 1.776975393295288, + "learning_rate": 5.805922032753319e-10, + "loss": 0.7985, + "step": 66720 + }, + { + "epoch": 19.96, + "grad_norm": 2.806536912918091, + "learning_rate": 5.412418629946459e-10, + "loss": 0.9752, + "step": 66725 + }, + { + "epoch": 19.96, + "grad_norm": 1.0485270023345947, + "learning_rate": 5.032722168396564e-10, + "loss": 0.8385, + "step": 66730 + }, + { + "epoch": 19.97, + "grad_norm": 2.4013590812683105, + "learning_rate": 4.666832669003585e-10, + "loss": 0.8629, + "step": 66735 + }, + { + "epoch": 19.97, + "grad_norm": 3.498737335205078, + "learning_rate": 4.3147501520013347e-10, + "loss": 1.2686, + "step": 66740 + }, + { + "epoch": 19.97, + "grad_norm": 2.2071971893310547, + "learning_rate": 3.9764746368742277e-10, + "loss": 1.0284, + "step": 66745 + }, + { + "epoch": 19.97, + "grad_norm": 2.6617162227630615, + "learning_rate": 3.652006142246256e-10, + "loss": 0.9895, + "step": 66750 + }, + { + "epoch": 19.97, + "grad_norm": 2.3580636978149414, + "learning_rate": 3.3413446860475204e-10, + "loss": 1.0622, + "step": 66755 + }, + { + "epoch": 19.97, + "grad_norm": 2.603350877761841, + "learning_rate": 3.044490285458723e-10, + "loss": 0.9365, + "step": 66760 + }, + { + "epoch": 19.98, + "grad_norm": 2.9707422256469727, + "learning_rate": 2.7614429568834087e-10, + "loss": 0.9106, + "step": 66765 + }, + { + "epoch": 19.98, + "grad_norm": 1.205654501914978, + "learning_rate": 2.4922027159202113e-10, + "loss": 0.9287, + "step": 66770 + }, + { + "epoch": 19.98, + "grad_norm": 1.2524653673171997, + "learning_rate": 2.2367695774461185e-10, + "loss": 1.0583, + "step": 66775 + }, + { + "epoch": 19.98, + "grad_norm": 1.9343750476837158, + "learning_rate": 1.9951435555887187e-10, + "loss": 1.0717, + "step": 66780 + }, + { + "epoch": 19.98, + "grad_norm": 2.118802309036255, + "learning_rate": 1.7673246636984443e-10, + "loss": 0.962, + "step": 66785 + }, + { + "epoch": 19.98, + "grad_norm": 2.374420404434204, + "learning_rate": 1.5533129143208146e-10, + "loss": 0.9052, + "step": 66790 + }, + { + "epoch": 19.98, + "grad_norm": 3.1811416149139404, + "learning_rate": 1.3531083193074612e-10, + "loss": 0.8696, + "step": 66795 + }, + { + "epoch": 19.99, + "grad_norm": 1.0755863189697266, + "learning_rate": 1.166710889677347e-10, + "loss": 0.7908, + "step": 66800 + }, + { + "epoch": 19.99, + "grad_norm": 4.653294086456299, + "learning_rate": 9.941206357555467e-11, + "loss": 0.968, + "step": 66805 + }, + { + "epoch": 19.99, + "grad_norm": 1.384509563446045, + "learning_rate": 8.353375670899777e-11, + "loss": 1.0534, + "step": 66810 + }, + { + "epoch": 19.99, + "grad_norm": 6.829671859741211, + "learning_rate": 6.903616924236466e-11, + "loss": 0.934, + "step": 66815 + }, + { + "epoch": 19.99, + "grad_norm": 2.6626029014587402, + "learning_rate": 5.591930197501594e-11, + "loss": 0.9137, + "step": 66820 + }, + { + "epoch": 19.99, + "grad_norm": 3.337085723876953, + "learning_rate": 4.418315563414765e-11, + "loss": 0.9638, + "step": 66825 + }, + { + "epoch": 19.99, + "grad_norm": 2.169858694076538, + "learning_rate": 3.3827730869240294e-11, + "loss": 0.9035, + "step": 66830 + }, + { + "epoch": 20.0, + "grad_norm": 3.450448513031006, + "learning_rate": 2.4853028246507594e-11, + "loss": 0.9036, + "step": 66835 + }, + { + "epoch": 20.0, + "grad_norm": 2.451824188232422, + "learning_rate": 1.7259048268325474e-11, + "loss": 0.8429, + "step": 66840 + }, + { + "epoch": 20.0, + "step": 66840, + "total_flos": 3.489559903035851e+18, + "train_loss": 1.1193655486238139, + "train_runtime": 133767.8538, + "train_samples_per_second": 3.998, + "train_steps_per_second": 0.5 + } + ], + "logging_steps": 5, + "max_steps": 66840, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 100, + "total_flos": 3.489559903035851e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}