diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20350 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 14505, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017238407171177384, + "grad_norm": 39.78016284529353, + "learning_rate": 9.174311926605506e-08, + "loss": 1.9587, + "step": 5 + }, + { + "epoch": 0.0034476814342354768, + "grad_norm": 54.45887630252972, + "learning_rate": 2.064220183486239e-07, + "loss": 1.9468, + "step": 10 + }, + { + "epoch": 0.005171522151353215, + "grad_norm": 49.12766068536532, + "learning_rate": 3.211009174311927e-07, + "loss": 1.8937, + "step": 15 + }, + { + "epoch": 0.0068953628684709535, + "grad_norm": 41.25405363658383, + "learning_rate": 4.357798165137615e-07, + "loss": 1.8044, + "step": 20 + }, + { + "epoch": 0.008619203585588691, + "grad_norm": 45.654320048769435, + "learning_rate": 5.504587155963304e-07, + "loss": 1.4952, + "step": 25 + }, + { + "epoch": 0.01034304430270643, + "grad_norm": 17.90673392907594, + "learning_rate": 6.651376146788992e-07, + "loss": 1.1202, + "step": 30 + }, + { + "epoch": 0.012066885019824168, + "grad_norm": 17.266787541205158, + "learning_rate": 7.79816513761468e-07, + "loss": 1.0723, + "step": 35 + }, + { + "epoch": 0.013790725736941907, + "grad_norm": 10.826211444168319, + "learning_rate": 8.944954128440368e-07, + "loss": 0.9813, + "step": 40 + }, + { + "epoch": 0.015514566454059645, + "grad_norm": 6.139222549734279, + "learning_rate": 1.0091743119266057e-06, + "loss": 0.8871, + "step": 45 + }, + { + "epoch": 0.017238407171177382, + "grad_norm": 6.354966515955482, + "learning_rate": 1.1238532110091744e-06, + "loss": 0.8239, + "step": 50 + }, + { + "epoch": 0.018962247888295123, + "grad_norm": 4.4768413900168476, + "learning_rate": 1.2385321100917433e-06, + "loss": 0.8044, + "step": 55 + }, + { + "epoch": 0.02068608860541286, + "grad_norm": 3.8016061359265825, + "learning_rate": 1.353211009174312e-06, + "loss": 0.7791, + "step": 60 + }, + { + "epoch": 0.022409929322530598, + "grad_norm": 4.668162150858383, + "learning_rate": 1.467889908256881e-06, + "loss": 0.7884, + "step": 65 + }, + { + "epoch": 0.024133770039648336, + "grad_norm": 4.288983799389968, + "learning_rate": 1.5825688073394496e-06, + "loss": 0.7471, + "step": 70 + }, + { + "epoch": 0.025857610756766077, + "grad_norm": 3.0974564753828004, + "learning_rate": 1.6972477064220186e-06, + "loss": 0.7531, + "step": 75 + }, + { + "epoch": 0.027581451473883814, + "grad_norm": 3.1397534126911686, + "learning_rate": 1.8119266055045873e-06, + "loss": 0.7049, + "step": 80 + }, + { + "epoch": 0.02930529219100155, + "grad_norm": 2.628798019332378, + "learning_rate": 1.9266055045871564e-06, + "loss": 0.6718, + "step": 85 + }, + { + "epoch": 0.03102913290811929, + "grad_norm": 3.8325816308276, + "learning_rate": 2.041284403669725e-06, + "loss": 0.6207, + "step": 90 + }, + { + "epoch": 0.03275297362523703, + "grad_norm": 3.4309823032403393, + "learning_rate": 2.155963302752294e-06, + "loss": 0.626, + "step": 95 + }, + { + "epoch": 0.034476814342354764, + "grad_norm": 2.9722717648273616, + "learning_rate": 2.2706422018348624e-06, + "loss": 0.6616, + "step": 100 + }, + { + "epoch": 0.0362006550594725, + "grad_norm": 3.63653575765165, + "learning_rate": 2.3853211009174317e-06, + "loss": 0.624, + "step": 105 + }, + { + "epoch": 0.037924495776590246, + "grad_norm": 8.600676962941835, + "learning_rate": 2.5e-06, + "loss": 0.5929, + "step": 110 + }, + { + "epoch": 0.039648336493707984, + "grad_norm": 2.987695328695734, + "learning_rate": 2.6146788990825687e-06, + "loss": 0.56, + "step": 115 + }, + { + "epoch": 0.04137217721082572, + "grad_norm": 2.809016432052269, + "learning_rate": 2.7293577981651376e-06, + "loss": 0.5462, + "step": 120 + }, + { + "epoch": 0.04309601792794346, + "grad_norm": 4.083569914949518, + "learning_rate": 2.844036697247707e-06, + "loss": 0.4953, + "step": 125 + }, + { + "epoch": 0.044819858645061196, + "grad_norm": 2.657791948245135, + "learning_rate": 2.9587155963302755e-06, + "loss": 0.5255, + "step": 130 + }, + { + "epoch": 0.046543699362178934, + "grad_norm": 2.4534732093259484, + "learning_rate": 3.073394495412844e-06, + "loss": 0.5137, + "step": 135 + }, + { + "epoch": 0.04826754007929667, + "grad_norm": 2.5288108338905744, + "learning_rate": 3.188073394495413e-06, + "loss": 0.4995, + "step": 140 + }, + { + "epoch": 0.04999138079641441, + "grad_norm": 3.050354376645196, + "learning_rate": 3.3027522935779823e-06, + "loss": 0.4656, + "step": 145 + }, + { + "epoch": 0.05171522151353215, + "grad_norm": 3.8301326296872036, + "learning_rate": 3.4174311926605508e-06, + "loss": 0.4727, + "step": 150 + }, + { + "epoch": 0.05343906223064989, + "grad_norm": 2.7466041439996354, + "learning_rate": 3.5321100917431193e-06, + "loss": 0.4448, + "step": 155 + }, + { + "epoch": 0.05516290294776763, + "grad_norm": 2.7421197824426136, + "learning_rate": 3.646788990825688e-06, + "loss": 0.4521, + "step": 160 + }, + { + "epoch": 0.056886743664885366, + "grad_norm": 2.1029134850923596, + "learning_rate": 3.7614678899082575e-06, + "loss": 0.4127, + "step": 165 + }, + { + "epoch": 0.0586105843820031, + "grad_norm": 2.404231668222291, + "learning_rate": 3.876146788990826e-06, + "loss": 0.4382, + "step": 170 + }, + { + "epoch": 0.06033442509912084, + "grad_norm": 2.920030378869354, + "learning_rate": 3.9908256880733945e-06, + "loss": 0.4046, + "step": 175 + }, + { + "epoch": 0.06205826581623858, + "grad_norm": 2.1025930464202363, + "learning_rate": 4.105504587155963e-06, + "loss": 0.4054, + "step": 180 + }, + { + "epoch": 0.06378210653335632, + "grad_norm": 2.366928935303905, + "learning_rate": 4.220183486238532e-06, + "loss": 0.4045, + "step": 185 + }, + { + "epoch": 0.06550594725047405, + "grad_norm": 3.390075203841566, + "learning_rate": 4.334862385321102e-06, + "loss": 0.4073, + "step": 190 + }, + { + "epoch": 0.0672297879675918, + "grad_norm": 2.891098407909226, + "learning_rate": 4.44954128440367e-06, + "loss": 0.3926, + "step": 195 + }, + { + "epoch": 0.06895362868470953, + "grad_norm": 4.152303170049208, + "learning_rate": 4.564220183486239e-06, + "loss": 0.4134, + "step": 200 + }, + { + "epoch": 0.07067746940182727, + "grad_norm": 4.021346589174643, + "learning_rate": 4.678899082568808e-06, + "loss": 0.4287, + "step": 205 + }, + { + "epoch": 0.072401310118945, + "grad_norm": 2.285811756508236, + "learning_rate": 4.793577981651377e-06, + "loss": 0.4012, + "step": 210 + }, + { + "epoch": 0.07412515083606275, + "grad_norm": 3.1588080152416946, + "learning_rate": 4.908256880733945e-06, + "loss": 0.3995, + "step": 215 + }, + { + "epoch": 0.07584899155318049, + "grad_norm": 3.130168900049288, + "learning_rate": 5.0229357798165144e-06, + "loss": 0.4176, + "step": 220 + }, + { + "epoch": 0.07757283227029822, + "grad_norm": 3.038295689456518, + "learning_rate": 5.137614678899083e-06, + "loss": 0.4017, + "step": 225 + }, + { + "epoch": 0.07929667298741597, + "grad_norm": 3.103669836904637, + "learning_rate": 5.252293577981652e-06, + "loss": 0.4209, + "step": 230 + }, + { + "epoch": 0.0810205137045337, + "grad_norm": 6.68545870580622, + "learning_rate": 5.366972477064221e-06, + "loss": 0.3958, + "step": 235 + }, + { + "epoch": 0.08274435442165144, + "grad_norm": 2.322241896800378, + "learning_rate": 5.481651376146789e-06, + "loss": 0.4179, + "step": 240 + }, + { + "epoch": 0.08446819513876917, + "grad_norm": 2.507028021110862, + "learning_rate": 5.596330275229358e-06, + "loss": 0.3977, + "step": 245 + }, + { + "epoch": 0.08619203585588692, + "grad_norm": 2.753656567325975, + "learning_rate": 5.711009174311926e-06, + "loss": 0.4112, + "step": 250 + }, + { + "epoch": 0.08791587657300465, + "grad_norm": 2.2863472704603147, + "learning_rate": 5.825688073394496e-06, + "loss": 0.3881, + "step": 255 + }, + { + "epoch": 0.08963971729012239, + "grad_norm": 3.813935905396218, + "learning_rate": 5.940366972477065e-06, + "loss": 0.3893, + "step": 260 + }, + { + "epoch": 0.09136355800724014, + "grad_norm": 2.402554331365669, + "learning_rate": 6.0550458715596335e-06, + "loss": 0.3785, + "step": 265 + }, + { + "epoch": 0.09308739872435787, + "grad_norm": 2.1116348449486297, + "learning_rate": 6.169724770642203e-06, + "loss": 0.3971, + "step": 270 + }, + { + "epoch": 0.09481123944147561, + "grad_norm": 2.5491224473263796, + "learning_rate": 6.284403669724771e-06, + "loss": 0.3926, + "step": 275 + }, + { + "epoch": 0.09653508015859334, + "grad_norm": 2.355395092053141, + "learning_rate": 6.39908256880734e-06, + "loss": 0.3998, + "step": 280 + }, + { + "epoch": 0.09825892087571109, + "grad_norm": 2.2613087445602833, + "learning_rate": 6.513761467889908e-06, + "loss": 0.3889, + "step": 285 + }, + { + "epoch": 0.09998276159282882, + "grad_norm": 2.355674812454849, + "learning_rate": 6.628440366972477e-06, + "loss": 0.4069, + "step": 290 + }, + { + "epoch": 0.10170660230994656, + "grad_norm": 4.185598154500696, + "learning_rate": 6.743119266055046e-06, + "loss": 0.404, + "step": 295 + }, + { + "epoch": 0.1034304430270643, + "grad_norm": 2.527006058053336, + "learning_rate": 6.8577981651376156e-06, + "loss": 0.3947, + "step": 300 + }, + { + "epoch": 0.10515428374418204, + "grad_norm": 3.3133527713511843, + "learning_rate": 6.972477064220184e-06, + "loss": 0.3852, + "step": 305 + }, + { + "epoch": 0.10687812446129978, + "grad_norm": 3.540122413545443, + "learning_rate": 7.087155963302753e-06, + "loss": 0.3928, + "step": 310 + }, + { + "epoch": 0.10860196517841751, + "grad_norm": 3.0843381032577333, + "learning_rate": 7.201834862385322e-06, + "loss": 0.4073, + "step": 315 + }, + { + "epoch": 0.11032580589553526, + "grad_norm": 2.040518561707585, + "learning_rate": 7.31651376146789e-06, + "loss": 0.4004, + "step": 320 + }, + { + "epoch": 0.11204964661265299, + "grad_norm": 1.7648309553670658, + "learning_rate": 7.431192660550459e-06, + "loss": 0.4205, + "step": 325 + }, + { + "epoch": 0.11377348732977073, + "grad_norm": 2.243096280244523, + "learning_rate": 7.545871559633028e-06, + "loss": 0.3736, + "step": 330 + }, + { + "epoch": 0.11549732804688846, + "grad_norm": 2.0265523548779254, + "learning_rate": 7.660550458715596e-06, + "loss": 0.3619, + "step": 335 + }, + { + "epoch": 0.1172211687640062, + "grad_norm": 2.0501398174879952, + "learning_rate": 7.775229357798164e-06, + "loss": 0.4015, + "step": 340 + }, + { + "epoch": 0.11894500948112395, + "grad_norm": 2.2353934598909158, + "learning_rate": 7.889908256880735e-06, + "loss": 0.3731, + "step": 345 + }, + { + "epoch": 0.12066885019824168, + "grad_norm": 1.838277567752222, + "learning_rate": 8.004587155963303e-06, + "loss": 0.3908, + "step": 350 + }, + { + "epoch": 0.12239269091535943, + "grad_norm": 2.398312584687634, + "learning_rate": 8.119266055045872e-06, + "loss": 0.384, + "step": 355 + }, + { + "epoch": 0.12411653163247716, + "grad_norm": 2.313315321651735, + "learning_rate": 8.233944954128442e-06, + "loss": 0.3829, + "step": 360 + }, + { + "epoch": 0.1258403723495949, + "grad_norm": 4.6373392398998705, + "learning_rate": 8.34862385321101e-06, + "loss": 0.369, + "step": 365 + }, + { + "epoch": 0.12756421306671265, + "grad_norm": 3.7183533750191495, + "learning_rate": 8.463302752293579e-06, + "loss": 0.4126, + "step": 370 + }, + { + "epoch": 0.12928805378383038, + "grad_norm": 2.406767268368641, + "learning_rate": 8.577981651376147e-06, + "loss": 0.3562, + "step": 375 + }, + { + "epoch": 0.1310118945009481, + "grad_norm": 2.2747093864314074, + "learning_rate": 8.692660550458716e-06, + "loss": 0.4103, + "step": 380 + }, + { + "epoch": 0.13273573521806586, + "grad_norm": 1.908977382401557, + "learning_rate": 8.807339449541286e-06, + "loss": 0.4133, + "step": 385 + }, + { + "epoch": 0.1344595759351836, + "grad_norm": 1.9376598412867219, + "learning_rate": 8.922018348623855e-06, + "loss": 0.3801, + "step": 390 + }, + { + "epoch": 0.13618341665230133, + "grad_norm": 2.2551638318770935, + "learning_rate": 9.036697247706423e-06, + "loss": 0.3888, + "step": 395 + }, + { + "epoch": 0.13790725736941906, + "grad_norm": 2.104590761231902, + "learning_rate": 9.151376146788992e-06, + "loss": 0.4118, + "step": 400 + }, + { + "epoch": 0.13963109808653681, + "grad_norm": 1.8231769972122067, + "learning_rate": 9.26605504587156e-06, + "loss": 0.3984, + "step": 405 + }, + { + "epoch": 0.14135493880365455, + "grad_norm": 5.751866218247112, + "learning_rate": 9.380733944954129e-06, + "loss": 0.3847, + "step": 410 + }, + { + "epoch": 0.14307877952077228, + "grad_norm": 1.4715301900214872, + "learning_rate": 9.495412844036697e-06, + "loss": 0.3673, + "step": 415 + }, + { + "epoch": 0.14480262023789, + "grad_norm": 2.4838714998797067, + "learning_rate": 9.610091743119267e-06, + "loss": 0.3812, + "step": 420 + }, + { + "epoch": 0.14652646095500776, + "grad_norm": 1.6705137293047283, + "learning_rate": 9.724770642201836e-06, + "loss": 0.3379, + "step": 425 + }, + { + "epoch": 0.1482503016721255, + "grad_norm": 2.1731769348150065, + "learning_rate": 9.839449541284404e-06, + "loss": 0.398, + "step": 430 + }, + { + "epoch": 0.14997414238924323, + "grad_norm": 3.8419027612384946, + "learning_rate": 9.954128440366973e-06, + "loss": 0.398, + "step": 435 + }, + { + "epoch": 0.15169798310636098, + "grad_norm": 2.110073518173239, + "learning_rate": 9.999998878095765e-06, + "loss": 0.4154, + "step": 440 + }, + { + "epoch": 0.15342182382347871, + "grad_norm": 1.5126990806154013, + "learning_rate": 9.999992022016144e-06, + "loss": 0.3414, + "step": 445 + }, + { + "epoch": 0.15514566454059645, + "grad_norm": 1.43575853742032, + "learning_rate": 9.999978933145567e-06, + "loss": 0.3764, + "step": 450 + }, + { + "epoch": 0.15686950525771418, + "grad_norm": 1.717920752962547, + "learning_rate": 9.999959611500351e-06, + "loss": 0.4114, + "step": 455 + }, + { + "epoch": 0.15859334597483193, + "grad_norm": 1.540164844779612, + "learning_rate": 9.999934057104585e-06, + "loss": 0.3547, + "step": 460 + }, + { + "epoch": 0.16031718669194966, + "grad_norm": 2.4286004694238694, + "learning_rate": 9.99990226999012e-06, + "loss": 0.3737, + "step": 465 + }, + { + "epoch": 0.1620410274090674, + "grad_norm": 1.6364737615086615, + "learning_rate": 9.999864250196582e-06, + "loss": 0.3712, + "step": 470 + }, + { + "epoch": 0.16376486812618515, + "grad_norm": 1.9489915262665032, + "learning_rate": 9.999819997771365e-06, + "loss": 0.3664, + "step": 475 + }, + { + "epoch": 0.16548870884330288, + "grad_norm": 2.058773467524577, + "learning_rate": 9.999769512769632e-06, + "loss": 0.3838, + "step": 480 + }, + { + "epoch": 0.16721254956042061, + "grad_norm": 19.681311592477975, + "learning_rate": 9.999712795254318e-06, + "loss": 0.3805, + "step": 485 + }, + { + "epoch": 0.16893639027753835, + "grad_norm": 2.093532171948987, + "learning_rate": 9.99964984529612e-06, + "loss": 0.3848, + "step": 490 + }, + { + "epoch": 0.1706602309946561, + "grad_norm": 1.6329941265934056, + "learning_rate": 9.999580662973511e-06, + "loss": 0.3926, + "step": 495 + }, + { + "epoch": 0.17238407171177383, + "grad_norm": 1.5584880166519641, + "learning_rate": 9.999505248372734e-06, + "loss": 0.359, + "step": 500 + }, + { + "epoch": 0.17410791242889156, + "grad_norm": 1.4956353781231042, + "learning_rate": 9.999423601587794e-06, + "loss": 0.3884, + "step": 505 + }, + { + "epoch": 0.1758317531460093, + "grad_norm": 1.4756155207429988, + "learning_rate": 9.999335722720471e-06, + "loss": 0.3885, + "step": 510 + }, + { + "epoch": 0.17755559386312705, + "grad_norm": 1.5953193620214319, + "learning_rate": 9.999241611880309e-06, + "loss": 0.4194, + "step": 515 + }, + { + "epoch": 0.17927943458024478, + "grad_norm": 2.085260102903812, + "learning_rate": 9.999141269184624e-06, + "loss": 0.3763, + "step": 520 + }, + { + "epoch": 0.18100327529736251, + "grad_norm": 1.71472471804224, + "learning_rate": 9.9990346947585e-06, + "loss": 0.3802, + "step": 525 + }, + { + "epoch": 0.18272711601448027, + "grad_norm": 1.895808542216389, + "learning_rate": 9.998921888734787e-06, + "loss": 0.3648, + "step": 530 + }, + { + "epoch": 0.184450956731598, + "grad_norm": 1.8454078954270925, + "learning_rate": 9.998802851254106e-06, + "loss": 0.3967, + "step": 535 + }, + { + "epoch": 0.18617479744871573, + "grad_norm": 2.2793271469320375, + "learning_rate": 9.998677582464842e-06, + "loss": 0.3778, + "step": 540 + }, + { + "epoch": 0.18789863816583346, + "grad_norm": 1.4775312147085078, + "learning_rate": 9.998546082523154e-06, + "loss": 0.4067, + "step": 545 + }, + { + "epoch": 0.18962247888295122, + "grad_norm": 1.8126102204167893, + "learning_rate": 9.99840835159296e-06, + "loss": 0.3969, + "step": 550 + }, + { + "epoch": 0.19134631960006895, + "grad_norm": 3.1302854559662494, + "learning_rate": 9.998264389845954e-06, + "loss": 0.3476, + "step": 555 + }, + { + "epoch": 0.19307016031718668, + "grad_norm": 1.9485324674828295, + "learning_rate": 9.99811419746159e-06, + "loss": 0.3734, + "step": 560 + }, + { + "epoch": 0.19479400103430444, + "grad_norm": 3.484220474881742, + "learning_rate": 9.997957774627094e-06, + "loss": 0.3812, + "step": 565 + }, + { + "epoch": 0.19651784175142217, + "grad_norm": 1.5462072274250418, + "learning_rate": 9.997795121537455e-06, + "loss": 0.4151, + "step": 570 + }, + { + "epoch": 0.1982416824685399, + "grad_norm": 2.3841039117699503, + "learning_rate": 9.997626238395431e-06, + "loss": 0.375, + "step": 575 + }, + { + "epoch": 0.19996552318565763, + "grad_norm": 1.734462124188258, + "learning_rate": 9.997451125411542e-06, + "loss": 0.3534, + "step": 580 + }, + { + "epoch": 0.2016893639027754, + "grad_norm": 1.542198455947852, + "learning_rate": 9.99726978280408e-06, + "loss": 0.3417, + "step": 585 + }, + { + "epoch": 0.20341320461989312, + "grad_norm": 1.9091830991742968, + "learning_rate": 9.997082210799101e-06, + "loss": 0.3659, + "step": 590 + }, + { + "epoch": 0.20513704533701085, + "grad_norm": 1.8345560789276327, + "learning_rate": 9.99688840963042e-06, + "loss": 0.393, + "step": 595 + }, + { + "epoch": 0.2068608860541286, + "grad_norm": 2.1062288478262206, + "learning_rate": 9.996688379539625e-06, + "loss": 0.3836, + "step": 600 + }, + { + "epoch": 0.20858472677124634, + "grad_norm": 2.339131539367282, + "learning_rate": 9.996482120776065e-06, + "loss": 0.3611, + "step": 605 + }, + { + "epoch": 0.21030856748836407, + "grad_norm": 1.6370806618729674, + "learning_rate": 9.996269633596853e-06, + "loss": 0.3501, + "step": 610 + }, + { + "epoch": 0.2120324082054818, + "grad_norm": 1.5630726784490585, + "learning_rate": 9.99605091826687e-06, + "loss": 0.3613, + "step": 615 + }, + { + "epoch": 0.21375624892259956, + "grad_norm": 1.6159611373776788, + "learning_rate": 9.995825975058754e-06, + "loss": 0.3874, + "step": 620 + }, + { + "epoch": 0.2154800896397173, + "grad_norm": 1.5359715084462024, + "learning_rate": 9.995594804252913e-06, + "loss": 0.3766, + "step": 625 + }, + { + "epoch": 0.21720393035683502, + "grad_norm": 1.644239217826752, + "learning_rate": 9.995357406137512e-06, + "loss": 0.3973, + "step": 630 + }, + { + "epoch": 0.21892777107395275, + "grad_norm": 1.8749655181666027, + "learning_rate": 9.995113781008485e-06, + "loss": 0.3724, + "step": 635 + }, + { + "epoch": 0.2206516117910705, + "grad_norm": 1.5084495107797091, + "learning_rate": 9.994863929169526e-06, + "loss": 0.4027, + "step": 640 + }, + { + "epoch": 0.22237545250818824, + "grad_norm": 1.6397265813828814, + "learning_rate": 9.994607850932089e-06, + "loss": 0.3854, + "step": 645 + }, + { + "epoch": 0.22409929322530597, + "grad_norm": 1.6228257707157137, + "learning_rate": 9.994345546615389e-06, + "loss": 0.3774, + "step": 650 + }, + { + "epoch": 0.22582313394242373, + "grad_norm": 1.6150673694783573, + "learning_rate": 9.99407701654641e-06, + "loss": 0.3736, + "step": 655 + }, + { + "epoch": 0.22754697465954146, + "grad_norm": 1.5201668210444677, + "learning_rate": 9.993802261059882e-06, + "loss": 0.3509, + "step": 660 + }, + { + "epoch": 0.2292708153766592, + "grad_norm": 1.5639526727728108, + "learning_rate": 9.993521280498312e-06, + "loss": 0.3457, + "step": 665 + }, + { + "epoch": 0.23099465609377692, + "grad_norm": 2.4286137351423687, + "learning_rate": 9.993234075211954e-06, + "loss": 0.3823, + "step": 670 + }, + { + "epoch": 0.23271849681089468, + "grad_norm": 3.171671081725359, + "learning_rate": 9.992940645558832e-06, + "loss": 0.3594, + "step": 675 + }, + { + "epoch": 0.2344423375280124, + "grad_norm": 1.4223660407457652, + "learning_rate": 9.99264099190472e-06, + "loss": 0.3483, + "step": 680 + }, + { + "epoch": 0.23616617824513014, + "grad_norm": 1.491078285294274, + "learning_rate": 9.992335114623155e-06, + "loss": 0.3649, + "step": 685 + }, + { + "epoch": 0.2378900189622479, + "grad_norm": 1.5010894475480492, + "learning_rate": 9.992023014095431e-06, + "loss": 0.3857, + "step": 690 + }, + { + "epoch": 0.23961385967936563, + "grad_norm": 1.9744878308027272, + "learning_rate": 9.991704690710602e-06, + "loss": 0.3799, + "step": 695 + }, + { + "epoch": 0.24133770039648336, + "grad_norm": 1.6326915073540642, + "learning_rate": 9.991380144865474e-06, + "loss": 0.362, + "step": 700 + }, + { + "epoch": 0.2430615411136011, + "grad_norm": 1.636689293283894, + "learning_rate": 9.991049376964614e-06, + "loss": 0.3521, + "step": 705 + }, + { + "epoch": 0.24478538183071885, + "grad_norm": 1.9068915222690204, + "learning_rate": 9.990712387420348e-06, + "loss": 0.381, + "step": 710 + }, + { + "epoch": 0.24650922254783658, + "grad_norm": 1.6023130627615407, + "learning_rate": 9.990369176652748e-06, + "loss": 0.3934, + "step": 715 + }, + { + "epoch": 0.2482330632649543, + "grad_norm": 1.7323114108675037, + "learning_rate": 9.99001974508965e-06, + "loss": 0.3858, + "step": 720 + }, + { + "epoch": 0.24995690398207204, + "grad_norm": 2.7566884308975435, + "learning_rate": 9.989664093166641e-06, + "loss": 0.4039, + "step": 725 + }, + { + "epoch": 0.2516807446991898, + "grad_norm": 1.5382959451268354, + "learning_rate": 9.989302221327065e-06, + "loss": 0.3834, + "step": 730 + }, + { + "epoch": 0.25340458541630756, + "grad_norm": 1.4581586166654468, + "learning_rate": 9.988934130022012e-06, + "loss": 0.3834, + "step": 735 + }, + { + "epoch": 0.2551284261334253, + "grad_norm": 2.5874414821417817, + "learning_rate": 9.988559819710333e-06, + "loss": 0.3807, + "step": 740 + }, + { + "epoch": 0.256852266850543, + "grad_norm": 1.4101523870158132, + "learning_rate": 9.988179290858628e-06, + "loss": 0.3856, + "step": 745 + }, + { + "epoch": 0.25857610756766075, + "grad_norm": 1.5004119745253264, + "learning_rate": 9.987792543941248e-06, + "loss": 0.4238, + "step": 750 + }, + { + "epoch": 0.2602999482847785, + "grad_norm": 1.5840716199539697, + "learning_rate": 9.987399579440298e-06, + "loss": 0.3352, + "step": 755 + }, + { + "epoch": 0.2620237890018962, + "grad_norm": 1.950713807715483, + "learning_rate": 9.987000397845632e-06, + "loss": 0.365, + "step": 760 + }, + { + "epoch": 0.26374762971901394, + "grad_norm": 3.5692407156235655, + "learning_rate": 9.986594999654853e-06, + "loss": 0.4, + "step": 765 + }, + { + "epoch": 0.26547147043613173, + "grad_norm": 1.4602853381322065, + "learning_rate": 9.986183385373314e-06, + "loss": 0.3554, + "step": 770 + }, + { + "epoch": 0.26719531115324946, + "grad_norm": 2.0408178483352413, + "learning_rate": 9.985765555514115e-06, + "loss": 0.3825, + "step": 775 + }, + { + "epoch": 0.2689191518703672, + "grad_norm": 1.766355386032275, + "learning_rate": 9.985341510598111e-06, + "loss": 0.4305, + "step": 780 + }, + { + "epoch": 0.2706429925874849, + "grad_norm": 1.8735701650719805, + "learning_rate": 9.984911251153897e-06, + "loss": 0.3854, + "step": 785 + }, + { + "epoch": 0.27236683330460265, + "grad_norm": 1.8498771495958393, + "learning_rate": 9.984474777717815e-06, + "loss": 0.3598, + "step": 790 + }, + { + "epoch": 0.2740906740217204, + "grad_norm": 2.031747943114794, + "learning_rate": 9.984032090833959e-06, + "loss": 0.3838, + "step": 795 + }, + { + "epoch": 0.2758145147388381, + "grad_norm": 1.5129690322023275, + "learning_rate": 9.983583191054162e-06, + "loss": 0.371, + "step": 800 + }, + { + "epoch": 0.27753835545595584, + "grad_norm": 1.5488707743982602, + "learning_rate": 9.983128078938007e-06, + "loss": 0.3921, + "step": 805 + }, + { + "epoch": 0.27926219617307363, + "grad_norm": 1.6432895517230897, + "learning_rate": 9.982666755052818e-06, + "loss": 0.3606, + "step": 810 + }, + { + "epoch": 0.28098603689019136, + "grad_norm": 1.620912544021688, + "learning_rate": 9.982199219973662e-06, + "loss": 0.3952, + "step": 815 + }, + { + "epoch": 0.2827098776073091, + "grad_norm": 2.018270453251334, + "learning_rate": 9.98172547428335e-06, + "loss": 0.3806, + "step": 820 + }, + { + "epoch": 0.2844337183244268, + "grad_norm": 2.0495975917720193, + "learning_rate": 9.981245518572434e-06, + "loss": 0.4053, + "step": 825 + }, + { + "epoch": 0.28615755904154455, + "grad_norm": 1.3074616768100098, + "learning_rate": 9.98075935343921e-06, + "loss": 0.3738, + "step": 830 + }, + { + "epoch": 0.2878813997586623, + "grad_norm": 1.451187504669899, + "learning_rate": 9.98026697948971e-06, + "loss": 0.3454, + "step": 835 + }, + { + "epoch": 0.28960524047578, + "grad_norm": 1.6684435894984186, + "learning_rate": 9.979768397337707e-06, + "loss": 0.361, + "step": 840 + }, + { + "epoch": 0.2913290811928978, + "grad_norm": 2.1691383415826038, + "learning_rate": 9.979263607604717e-06, + "loss": 0.3293, + "step": 845 + }, + { + "epoch": 0.29305292191001553, + "grad_norm": 1.7361296307334113, + "learning_rate": 9.978752610919986e-06, + "loss": 0.3682, + "step": 850 + }, + { + "epoch": 0.29477676262713326, + "grad_norm": 2.632542549563109, + "learning_rate": 9.978235407920506e-06, + "loss": 0.3699, + "step": 855 + }, + { + "epoch": 0.296500603344251, + "grad_norm": 1.5991509266182393, + "learning_rate": 9.977711999251001e-06, + "loss": 0.3775, + "step": 860 + }, + { + "epoch": 0.2982244440613687, + "grad_norm": 1.419117347034277, + "learning_rate": 9.97718238556393e-06, + "loss": 0.3873, + "step": 865 + }, + { + "epoch": 0.29994828477848645, + "grad_norm": 1.7818350177376596, + "learning_rate": 9.97664656751949e-06, + "loss": 0.3247, + "step": 870 + }, + { + "epoch": 0.3016721254956042, + "grad_norm": 1.6102128287351594, + "learning_rate": 9.97610454578561e-06, + "loss": 0.3663, + "step": 875 + }, + { + "epoch": 0.30339596621272197, + "grad_norm": 1.4259541767846493, + "learning_rate": 9.975556321037951e-06, + "loss": 0.37, + "step": 880 + }, + { + "epoch": 0.3051198069298397, + "grad_norm": 1.394367272133933, + "learning_rate": 9.975001893959912e-06, + "loss": 0.4106, + "step": 885 + }, + { + "epoch": 0.30684364764695743, + "grad_norm": 1.550783532169553, + "learning_rate": 9.974441265242614e-06, + "loss": 0.397, + "step": 890 + }, + { + "epoch": 0.30856748836407516, + "grad_norm": 1.61618942570839, + "learning_rate": 9.97387443558492e-06, + "loss": 0.402, + "step": 895 + }, + { + "epoch": 0.3102913290811929, + "grad_norm": 2.158601560774135, + "learning_rate": 9.973301405693414e-06, + "loss": 0.4115, + "step": 900 + }, + { + "epoch": 0.3120151697983106, + "grad_norm": 1.6864322676938845, + "learning_rate": 9.972722176282412e-06, + "loss": 0.3863, + "step": 905 + }, + { + "epoch": 0.31373901051542835, + "grad_norm": 1.499513360028198, + "learning_rate": 9.972136748073962e-06, + "loss": 0.3898, + "step": 910 + }, + { + "epoch": 0.31546285123254614, + "grad_norm": 2.9945860401099353, + "learning_rate": 9.97154512179783e-06, + "loss": 0.3368, + "step": 915 + }, + { + "epoch": 0.31718669194966387, + "grad_norm": 2.2619940301196007, + "learning_rate": 9.970947298191518e-06, + "loss": 0.3681, + "step": 920 + }, + { + "epoch": 0.3189105326667816, + "grad_norm": 1.9961612634247545, + "learning_rate": 9.970343278000248e-06, + "loss": 0.3656, + "step": 925 + }, + { + "epoch": 0.32063437338389933, + "grad_norm": 1.2979879218516086, + "learning_rate": 9.969733061976968e-06, + "loss": 0.3587, + "step": 930 + }, + { + "epoch": 0.32235821410101706, + "grad_norm": 1.6959871926114105, + "learning_rate": 9.969116650882347e-06, + "loss": 0.3741, + "step": 935 + }, + { + "epoch": 0.3240820548181348, + "grad_norm": 1.646394829591119, + "learning_rate": 9.968494045484781e-06, + "loss": 0.3775, + "step": 940 + }, + { + "epoch": 0.3258058955352525, + "grad_norm": 1.8953700126107131, + "learning_rate": 9.967865246560384e-06, + "loss": 0.3658, + "step": 945 + }, + { + "epoch": 0.3275297362523703, + "grad_norm": 1.4124007971281993, + "learning_rate": 9.96723025489299e-06, + "loss": 0.3941, + "step": 950 + }, + { + "epoch": 0.32925357696948804, + "grad_norm": 1.8263518324450987, + "learning_rate": 9.966589071274157e-06, + "loss": 0.3948, + "step": 955 + }, + { + "epoch": 0.33097741768660577, + "grad_norm": 1.2845869518907578, + "learning_rate": 9.965941696503159e-06, + "loss": 0.3724, + "step": 960 + }, + { + "epoch": 0.3327012584037235, + "grad_norm": 1.3653877556655445, + "learning_rate": 9.965288131386985e-06, + "loss": 0.3701, + "step": 965 + }, + { + "epoch": 0.33442509912084123, + "grad_norm": 1.992962251816929, + "learning_rate": 9.964628376740346e-06, + "loss": 0.341, + "step": 970 + }, + { + "epoch": 0.33614893983795896, + "grad_norm": 1.132570804670226, + "learning_rate": 9.963962433385664e-06, + "loss": 0.3861, + "step": 975 + }, + { + "epoch": 0.3378727805550767, + "grad_norm": 1.4106486997441041, + "learning_rate": 9.963290302153079e-06, + "loss": 0.3537, + "step": 980 + }, + { + "epoch": 0.3395966212721945, + "grad_norm": 1.7058732858287857, + "learning_rate": 9.962611983880441e-06, + "loss": 0.3609, + "step": 985 + }, + { + "epoch": 0.3413204619893122, + "grad_norm": 1.425582117664892, + "learning_rate": 9.961927479413315e-06, + "loss": 0.3591, + "step": 990 + }, + { + "epoch": 0.34304430270642994, + "grad_norm": 2.034087724933577, + "learning_rate": 9.96123678960498e-06, + "loss": 0.3526, + "step": 995 + }, + { + "epoch": 0.34476814342354767, + "grad_norm": 1.4069646155605997, + "learning_rate": 9.960539915316419e-06, + "loss": 0.3729, + "step": 1000 + }, + { + "epoch": 0.3464919841406654, + "grad_norm": 1.5670770963105607, + "learning_rate": 9.95983685741633e-06, + "loss": 0.3983, + "step": 1005 + }, + { + "epoch": 0.34821582485778313, + "grad_norm": 1.670410438433764, + "learning_rate": 9.959127616781115e-06, + "loss": 0.3702, + "step": 1010 + }, + { + "epoch": 0.34993966557490086, + "grad_norm": 1.2811693543664437, + "learning_rate": 9.958412194294885e-06, + "loss": 0.365, + "step": 1015 + }, + { + "epoch": 0.3516635062920186, + "grad_norm": 1.3881901167938773, + "learning_rate": 9.95769059084946e-06, + "loss": 0.3763, + "step": 1020 + }, + { + "epoch": 0.3533873470091364, + "grad_norm": 1.1822513717312828, + "learning_rate": 9.956962807344359e-06, + "loss": 0.3588, + "step": 1025 + }, + { + "epoch": 0.3551111877262541, + "grad_norm": 1.5015058214845947, + "learning_rate": 9.956228844686808e-06, + "loss": 0.3719, + "step": 1030 + }, + { + "epoch": 0.35683502844337184, + "grad_norm": 1.2850863986904908, + "learning_rate": 9.95548870379174e-06, + "loss": 0.3703, + "step": 1035 + }, + { + "epoch": 0.35855886916048957, + "grad_norm": 1.9595751672101223, + "learning_rate": 9.954742385581779e-06, + "loss": 0.406, + "step": 1040 + }, + { + "epoch": 0.3602827098776073, + "grad_norm": 1.189969196375302, + "learning_rate": 9.95398989098726e-06, + "loss": 0.3806, + "step": 1045 + }, + { + "epoch": 0.36200655059472503, + "grad_norm": 3.2765302636646867, + "learning_rate": 9.953231220946213e-06, + "loss": 0.3804, + "step": 1050 + }, + { + "epoch": 0.36373039131184276, + "grad_norm": 1.4117818326643434, + "learning_rate": 9.95246637640436e-06, + "loss": 0.3636, + "step": 1055 + }, + { + "epoch": 0.36545423202896055, + "grad_norm": 1.1362293837316353, + "learning_rate": 9.951695358315135e-06, + "loss": 0.3604, + "step": 1060 + }, + { + "epoch": 0.3671780727460783, + "grad_norm": 2.375998856783418, + "learning_rate": 9.95091816763965e-06, + "loss": 0.3598, + "step": 1065 + }, + { + "epoch": 0.368901913463196, + "grad_norm": 1.4402987169718324, + "learning_rate": 9.950134805346727e-06, + "loss": 0.3543, + "step": 1070 + }, + { + "epoch": 0.37062575418031374, + "grad_norm": 2.336170329724286, + "learning_rate": 9.949345272412866e-06, + "loss": 0.3649, + "step": 1075 + }, + { + "epoch": 0.37234959489743147, + "grad_norm": 2.1173847250304143, + "learning_rate": 9.948549569822276e-06, + "loss": 0.3773, + "step": 1080 + }, + { + "epoch": 0.3740734356145492, + "grad_norm": 5.115336943456691, + "learning_rate": 9.947747698566842e-06, + "loss": 0.3852, + "step": 1085 + }, + { + "epoch": 0.37579727633166693, + "grad_norm": 1.7062503153846607, + "learning_rate": 9.946939659646147e-06, + "loss": 0.3939, + "step": 1090 + }, + { + "epoch": 0.3775211170487847, + "grad_norm": 1.32448946576323, + "learning_rate": 9.94612545406746e-06, + "loss": 0.3812, + "step": 1095 + }, + { + "epoch": 0.37924495776590245, + "grad_norm": 1.8972128591502049, + "learning_rate": 9.945305082845738e-06, + "loss": 0.3802, + "step": 1100 + }, + { + "epoch": 0.3809687984830202, + "grad_norm": 1.2829640579295978, + "learning_rate": 9.944478547003622e-06, + "loss": 0.3634, + "step": 1105 + }, + { + "epoch": 0.3826926392001379, + "grad_norm": 1.4149606943731012, + "learning_rate": 9.943645847571439e-06, + "loss": 0.3383, + "step": 1110 + }, + { + "epoch": 0.38441647991725564, + "grad_norm": 1.2510105520686647, + "learning_rate": 9.942806985587199e-06, + "loss": 0.3848, + "step": 1115 + }, + { + "epoch": 0.38614032063437337, + "grad_norm": 1.3127699282429592, + "learning_rate": 9.941961962096595e-06, + "loss": 0.3719, + "step": 1120 + }, + { + "epoch": 0.3878641613514911, + "grad_norm": 1.2255416667018941, + "learning_rate": 9.941110778152997e-06, + "loss": 0.3662, + "step": 1125 + }, + { + "epoch": 0.3895880020686089, + "grad_norm": 1.5072255633942293, + "learning_rate": 9.94025343481746e-06, + "loss": 0.3509, + "step": 1130 + }, + { + "epoch": 0.3913118427857266, + "grad_norm": 1.5051483096888227, + "learning_rate": 9.939389933158712e-06, + "loss": 0.3689, + "step": 1135 + }, + { + "epoch": 0.39303568350284435, + "grad_norm": 1.4244272154222093, + "learning_rate": 9.93852027425316e-06, + "loss": 0.3463, + "step": 1140 + }, + { + "epoch": 0.3947595242199621, + "grad_norm": 1.547483122253386, + "learning_rate": 9.937644459184887e-06, + "loss": 0.3493, + "step": 1145 + }, + { + "epoch": 0.3964833649370798, + "grad_norm": 1.332252429344222, + "learning_rate": 9.936762489045648e-06, + "loss": 0.356, + "step": 1150 + }, + { + "epoch": 0.39820720565419754, + "grad_norm": 1.3004669243523812, + "learning_rate": 9.935874364934875e-06, + "loss": 0.3912, + "step": 1155 + }, + { + "epoch": 0.39993104637131527, + "grad_norm": 1.6257996779485293, + "learning_rate": 9.934980087959663e-06, + "loss": 0.3479, + "step": 1160 + }, + { + "epoch": 0.40165488708843305, + "grad_norm": 1.3921654223877478, + "learning_rate": 9.934079659234787e-06, + "loss": 0.3813, + "step": 1165 + }, + { + "epoch": 0.4033787278055508, + "grad_norm": 1.8477133683088989, + "learning_rate": 9.933173079882682e-06, + "loss": 0.3463, + "step": 1170 + }, + { + "epoch": 0.4051025685226685, + "grad_norm": 1.3347077947689194, + "learning_rate": 9.932260351033456e-06, + "loss": 0.356, + "step": 1175 + }, + { + "epoch": 0.40682640923978625, + "grad_norm": 1.4338643697652664, + "learning_rate": 9.931341473824879e-06, + "loss": 0.3997, + "step": 1180 + }, + { + "epoch": 0.408550249956904, + "grad_norm": 1.7317367152204977, + "learning_rate": 9.930416449402388e-06, + "loss": 0.3686, + "step": 1185 + }, + { + "epoch": 0.4102740906740217, + "grad_norm": 1.507979353997919, + "learning_rate": 9.92948527891908e-06, + "loss": 0.3637, + "step": 1190 + }, + { + "epoch": 0.41199793139113944, + "grad_norm": 1.403884182741799, + "learning_rate": 9.928547963535717e-06, + "loss": 0.3529, + "step": 1195 + }, + { + "epoch": 0.4137217721082572, + "grad_norm": 1.3106169255528577, + "learning_rate": 9.927604504420718e-06, + "loss": 0.3494, + "step": 1200 + }, + { + "epoch": 0.41544561282537495, + "grad_norm": 1.3396927515002877, + "learning_rate": 9.926654902750163e-06, + "loss": 0.3533, + "step": 1205 + }, + { + "epoch": 0.4171694535424927, + "grad_norm": 1.183714203613263, + "learning_rate": 9.925699159707784e-06, + "loss": 0.3734, + "step": 1210 + }, + { + "epoch": 0.4188932942596104, + "grad_norm": 1.389685074932271, + "learning_rate": 9.924737276484974e-06, + "loss": 0.3433, + "step": 1215 + }, + { + "epoch": 0.42061713497672815, + "grad_norm": 1.1942490877101626, + "learning_rate": 9.923769254280781e-06, + "loss": 0.3503, + "step": 1220 + }, + { + "epoch": 0.4223409756938459, + "grad_norm": 1.5953115471921129, + "learning_rate": 9.9227950943019e-06, + "loss": 0.3797, + "step": 1225 + }, + { + "epoch": 0.4240648164109636, + "grad_norm": 1.3659510775769799, + "learning_rate": 9.921814797762681e-06, + "loss": 0.3805, + "step": 1230 + }, + { + "epoch": 0.42578865712808134, + "grad_norm": 2.4060415220718827, + "learning_rate": 9.920828365885121e-06, + "loss": 0.3642, + "step": 1235 + }, + { + "epoch": 0.4275124978451991, + "grad_norm": 1.1890557400582953, + "learning_rate": 9.919835799898869e-06, + "loss": 0.3565, + "step": 1240 + }, + { + "epoch": 0.42923633856231685, + "grad_norm": 1.423047533670317, + "learning_rate": 9.918837101041217e-06, + "loss": 0.3504, + "step": 1245 + }, + { + "epoch": 0.4309601792794346, + "grad_norm": 1.546104084922549, + "learning_rate": 9.917832270557103e-06, + "loss": 0.362, + "step": 1250 + }, + { + "epoch": 0.4326840199965523, + "grad_norm": 1.30108724662532, + "learning_rate": 9.916821309699112e-06, + "loss": 0.4035, + "step": 1255 + }, + { + "epoch": 0.43440786071367005, + "grad_norm": 1.6968915457556, + "learning_rate": 9.915804219727463e-06, + "loss": 0.3664, + "step": 1260 + }, + { + "epoch": 0.4361317014307878, + "grad_norm": 1.7027916902035114, + "learning_rate": 9.91478100191002e-06, + "loss": 0.392, + "step": 1265 + }, + { + "epoch": 0.4378555421479055, + "grad_norm": 1.4545975619385854, + "learning_rate": 9.91375165752229e-06, + "loss": 0.393, + "step": 1270 + }, + { + "epoch": 0.4395793828650233, + "grad_norm": 1.6130421167609756, + "learning_rate": 9.91271618784741e-06, + "loss": 0.3748, + "step": 1275 + }, + { + "epoch": 0.441303223582141, + "grad_norm": 1.3023988922113487, + "learning_rate": 9.911674594176153e-06, + "loss": 0.3361, + "step": 1280 + }, + { + "epoch": 0.44302706429925875, + "grad_norm": 1.5374173378155058, + "learning_rate": 9.91062687780693e-06, + "loss": 0.3703, + "step": 1285 + }, + { + "epoch": 0.4447509050163765, + "grad_norm": 1.5199725349390463, + "learning_rate": 9.909573040045785e-06, + "loss": 0.3607, + "step": 1290 + }, + { + "epoch": 0.4464747457334942, + "grad_norm": 1.3413231991377519, + "learning_rate": 9.908513082206386e-06, + "loss": 0.347, + "step": 1295 + }, + { + "epoch": 0.44819858645061195, + "grad_norm": 1.2462578869188095, + "learning_rate": 9.907447005610038e-06, + "loss": 0.3484, + "step": 1300 + }, + { + "epoch": 0.4499224271677297, + "grad_norm": 1.4037202302710707, + "learning_rate": 9.906374811585668e-06, + "loss": 0.3712, + "step": 1305 + }, + { + "epoch": 0.45164626788484746, + "grad_norm": 1.7753102784448989, + "learning_rate": 9.90529650146983e-06, + "loss": 0.3829, + "step": 1310 + }, + { + "epoch": 0.4533701086019652, + "grad_norm": 1.2899182482456184, + "learning_rate": 9.904212076606704e-06, + "loss": 0.3747, + "step": 1315 + }, + { + "epoch": 0.4550939493190829, + "grad_norm": 1.407583745226016, + "learning_rate": 9.903121538348086e-06, + "loss": 0.352, + "step": 1320 + }, + { + "epoch": 0.45681779003620065, + "grad_norm": 1.577726964068593, + "learning_rate": 9.902024888053404e-06, + "loss": 0.3517, + "step": 1325 + }, + { + "epoch": 0.4585416307533184, + "grad_norm": 1.2565158325938188, + "learning_rate": 9.900922127089696e-06, + "loss": 0.3717, + "step": 1330 + }, + { + "epoch": 0.4602654714704361, + "grad_norm": 2.294699756680478, + "learning_rate": 9.899813256831618e-06, + "loss": 0.3658, + "step": 1335 + }, + { + "epoch": 0.46198931218755385, + "grad_norm": 1.3078489945396463, + "learning_rate": 9.898698278661448e-06, + "loss": 0.3726, + "step": 1340 + }, + { + "epoch": 0.46371315290467163, + "grad_norm": 1.1945961138468528, + "learning_rate": 9.897577193969068e-06, + "loss": 0.3856, + "step": 1345 + }, + { + "epoch": 0.46543699362178936, + "grad_norm": 1.3912208144004374, + "learning_rate": 9.89645000415198e-06, + "loss": 0.3853, + "step": 1350 + }, + { + "epoch": 0.4671608343389071, + "grad_norm": 1.4416719025269795, + "learning_rate": 9.895316710615296e-06, + "loss": 0.3748, + "step": 1355 + }, + { + "epoch": 0.4688846750560248, + "grad_norm": 1.3417231467145718, + "learning_rate": 9.89417731477173e-06, + "loss": 0.3971, + "step": 1360 + }, + { + "epoch": 0.47060851577314256, + "grad_norm": 1.2924588730347766, + "learning_rate": 9.893031818041615e-06, + "loss": 0.3857, + "step": 1365 + }, + { + "epoch": 0.4723323564902603, + "grad_norm": 1.5799532568189445, + "learning_rate": 9.891880221852872e-06, + "loss": 0.3612, + "step": 1370 + }, + { + "epoch": 0.474056197207378, + "grad_norm": 1.6455110984272108, + "learning_rate": 9.890722527641041e-06, + "loss": 0.3469, + "step": 1375 + }, + { + "epoch": 0.4757800379244958, + "grad_norm": 1.2408173241480154, + "learning_rate": 9.889558736849258e-06, + "loss": 0.3341, + "step": 1380 + }, + { + "epoch": 0.47750387864161353, + "grad_norm": 1.3054985860026322, + "learning_rate": 9.888388850928254e-06, + "loss": 0.3706, + "step": 1385 + }, + { + "epoch": 0.47922771935873126, + "grad_norm": 1.6467829330469361, + "learning_rate": 9.887212871336368e-06, + "loss": 0.3367, + "step": 1390 + }, + { + "epoch": 0.480951560075849, + "grad_norm": 1.2749999426414975, + "learning_rate": 9.886030799539522e-06, + "loss": 0.3429, + "step": 1395 + }, + { + "epoch": 0.4826754007929667, + "grad_norm": 1.8119587682624123, + "learning_rate": 9.884842637011245e-06, + "loss": 0.4, + "step": 1400 + }, + { + "epoch": 0.48439924151008446, + "grad_norm": 1.7911955620757787, + "learning_rate": 9.883648385232654e-06, + "loss": 0.3809, + "step": 1405 + }, + { + "epoch": 0.4861230822272022, + "grad_norm": 1.310684798244541, + "learning_rate": 9.88244804569245e-06, + "loss": 0.3425, + "step": 1410 + }, + { + "epoch": 0.48784692294431997, + "grad_norm": 1.5197330820828343, + "learning_rate": 9.881241619886934e-06, + "loss": 0.3677, + "step": 1415 + }, + { + "epoch": 0.4895707636614377, + "grad_norm": 1.38254878981102, + "learning_rate": 9.880029109319986e-06, + "loss": 0.3774, + "step": 1420 + }, + { + "epoch": 0.49129460437855543, + "grad_norm": 1.1583178223739745, + "learning_rate": 9.878810515503074e-06, + "loss": 0.3335, + "step": 1425 + }, + { + "epoch": 0.49301844509567316, + "grad_norm": 1.2847768495296337, + "learning_rate": 9.877585839955247e-06, + "loss": 0.3414, + "step": 1430 + }, + { + "epoch": 0.4947422858127909, + "grad_norm": 1.2512268003206457, + "learning_rate": 9.87635508420314e-06, + "loss": 0.3351, + "step": 1435 + }, + { + "epoch": 0.4964661265299086, + "grad_norm": 1.6571534427005084, + "learning_rate": 9.87511824978096e-06, + "loss": 0.3747, + "step": 1440 + }, + { + "epoch": 0.49818996724702636, + "grad_norm": 1.5612786668980918, + "learning_rate": 9.873875338230499e-06, + "loss": 0.3671, + "step": 1445 + }, + { + "epoch": 0.4999138079641441, + "grad_norm": 1.3248757883806321, + "learning_rate": 9.87262635110112e-06, + "loss": 0.357, + "step": 1450 + }, + { + "epoch": 0.5016376486812618, + "grad_norm": 1.1197006492173756, + "learning_rate": 9.871371289949758e-06, + "loss": 0.3599, + "step": 1455 + }, + { + "epoch": 0.5033614893983795, + "grad_norm": 1.3032377507199844, + "learning_rate": 9.870110156340928e-06, + "loss": 0.3153, + "step": 1460 + }, + { + "epoch": 0.5050853301154973, + "grad_norm": 1.2771120579191166, + "learning_rate": 9.868842951846703e-06, + "loss": 0.3573, + "step": 1465 + }, + { + "epoch": 0.5068091708326151, + "grad_norm": 1.949617037013246, + "learning_rate": 9.867569678046734e-06, + "loss": 0.3853, + "step": 1470 + }, + { + "epoch": 0.5085330115497328, + "grad_norm": 1.4482220126271506, + "learning_rate": 9.86629033652823e-06, + "loss": 0.3182, + "step": 1475 + }, + { + "epoch": 0.5102568522668506, + "grad_norm": 1.341005156684862, + "learning_rate": 9.865004928885968e-06, + "loss": 0.3647, + "step": 1480 + }, + { + "epoch": 0.5119806929839683, + "grad_norm": 1.4542069069853112, + "learning_rate": 9.863713456722289e-06, + "loss": 0.3657, + "step": 1485 + }, + { + "epoch": 0.513704533701086, + "grad_norm": 5.6703414667784, + "learning_rate": 9.862415921647087e-06, + "loss": 0.3897, + "step": 1490 + }, + { + "epoch": 0.5154283744182038, + "grad_norm": 3.2139034968460254, + "learning_rate": 9.86111232527782e-06, + "loss": 0.3229, + "step": 1495 + }, + { + "epoch": 0.5171522151353215, + "grad_norm": 1.3612184719987817, + "learning_rate": 9.859802669239497e-06, + "loss": 0.3539, + "step": 1500 + }, + { + "epoch": 0.5188760558524392, + "grad_norm": 1.7907244849071648, + "learning_rate": 9.858486955164686e-06, + "loss": 0.3847, + "step": 1505 + }, + { + "epoch": 0.520599896569557, + "grad_norm": 1.387236181448054, + "learning_rate": 9.857165184693502e-06, + "loss": 0.3404, + "step": 1510 + }, + { + "epoch": 0.5223237372866747, + "grad_norm": 2.7369191694718435, + "learning_rate": 9.855837359473611e-06, + "loss": 0.3377, + "step": 1515 + }, + { + "epoch": 0.5240475780037924, + "grad_norm": 1.7846678078527498, + "learning_rate": 9.854503481160229e-06, + "loss": 0.3397, + "step": 1520 + }, + { + "epoch": 0.5257714187209102, + "grad_norm": 1.1775736461224253, + "learning_rate": 9.853163551416112e-06, + "loss": 0.3181, + "step": 1525 + }, + { + "epoch": 0.5274952594380279, + "grad_norm": 5.217423554831368, + "learning_rate": 9.851817571911568e-06, + "loss": 0.3786, + "step": 1530 + }, + { + "epoch": 0.5292191001551456, + "grad_norm": 1.300451482508374, + "learning_rate": 9.850465544324437e-06, + "loss": 0.3424, + "step": 1535 + }, + { + "epoch": 0.5309429408722635, + "grad_norm": 1.264550442244784, + "learning_rate": 9.849107470340105e-06, + "loss": 0.3489, + "step": 1540 + }, + { + "epoch": 0.5326667815893812, + "grad_norm": 1.8979472196923746, + "learning_rate": 9.847743351651493e-06, + "loss": 0.4058, + "step": 1545 + }, + { + "epoch": 0.5343906223064989, + "grad_norm": 1.3971555375184224, + "learning_rate": 9.846373189959057e-06, + "loss": 0.3803, + "step": 1550 + }, + { + "epoch": 0.5361144630236166, + "grad_norm": 1.1945667113234897, + "learning_rate": 9.844996986970785e-06, + "loss": 0.351, + "step": 1555 + }, + { + "epoch": 0.5378383037407344, + "grad_norm": 1.4265596933503348, + "learning_rate": 9.843614744402199e-06, + "loss": 0.3618, + "step": 1560 + }, + { + "epoch": 0.5395621444578521, + "grad_norm": 1.6254590780571174, + "learning_rate": 9.842226463976344e-06, + "loss": 0.3651, + "step": 1565 + }, + { + "epoch": 0.5412859851749698, + "grad_norm": 1.6247245781236594, + "learning_rate": 9.840832147423797e-06, + "loss": 0.3306, + "step": 1570 + }, + { + "epoch": 0.5430098258920876, + "grad_norm": 1.220406406153, + "learning_rate": 9.839431796482657e-06, + "loss": 0.3223, + "step": 1575 + }, + { + "epoch": 0.5447336666092053, + "grad_norm": 1.8630798820145742, + "learning_rate": 9.83802541289855e-06, + "loss": 0.3233, + "step": 1580 + }, + { + "epoch": 0.546457507326323, + "grad_norm": 1.654861293681641, + "learning_rate": 9.836612998424609e-06, + "loss": 0.3422, + "step": 1585 + }, + { + "epoch": 0.5481813480434408, + "grad_norm": 1.2926137649787477, + "learning_rate": 9.8351945548215e-06, + "loss": 0.3438, + "step": 1590 + }, + { + "epoch": 0.5499051887605585, + "grad_norm": 1.4070463848515484, + "learning_rate": 9.833770083857399e-06, + "loss": 0.3429, + "step": 1595 + }, + { + "epoch": 0.5516290294776762, + "grad_norm": 1.6643200904657351, + "learning_rate": 9.832339587307993e-06, + "loss": 0.3872, + "step": 1600 + }, + { + "epoch": 0.553352870194794, + "grad_norm": 1.196628821556852, + "learning_rate": 9.830903066956482e-06, + "loss": 0.3584, + "step": 1605 + }, + { + "epoch": 0.5550767109119117, + "grad_norm": 1.4221976338929014, + "learning_rate": 9.829460524593573e-06, + "loss": 0.3501, + "step": 1610 + }, + { + "epoch": 0.5568005516290295, + "grad_norm": 1.231712612757513, + "learning_rate": 9.828011962017483e-06, + "loss": 0.3615, + "step": 1615 + }, + { + "epoch": 0.5585243923461473, + "grad_norm": 3.782491737531238, + "learning_rate": 9.826557381033935e-06, + "loss": 0.3595, + "step": 1620 + }, + { + "epoch": 0.560248233063265, + "grad_norm": 1.8192046148733767, + "learning_rate": 9.82509678345615e-06, + "loss": 0.3889, + "step": 1625 + }, + { + "epoch": 0.5619720737803827, + "grad_norm": 1.1784062261575894, + "learning_rate": 9.82363017110485e-06, + "loss": 0.358, + "step": 1630 + }, + { + "epoch": 0.5636959144975004, + "grad_norm": 1.1120097999467748, + "learning_rate": 9.822157545808258e-06, + "loss": 0.344, + "step": 1635 + }, + { + "epoch": 0.5654197552146182, + "grad_norm": 1.231049378045983, + "learning_rate": 9.820678909402086e-06, + "loss": 0.3622, + "step": 1640 + }, + { + "epoch": 0.5671435959317359, + "grad_norm": 1.4583871524324172, + "learning_rate": 9.819194263729545e-06, + "loss": 0.3324, + "step": 1645 + }, + { + "epoch": 0.5688674366488536, + "grad_norm": 2.1448552999596324, + "learning_rate": 9.817703610641338e-06, + "loss": 0.3605, + "step": 1650 + }, + { + "epoch": 0.5705912773659714, + "grad_norm": 1.4216658533014481, + "learning_rate": 9.816206951995651e-06, + "loss": 0.3583, + "step": 1655 + }, + { + "epoch": 0.5723151180830891, + "grad_norm": 1.2683657570498639, + "learning_rate": 9.81470428965816e-06, + "loss": 0.3266, + "step": 1660 + }, + { + "epoch": 0.5740389588002068, + "grad_norm": 1.429925418521831, + "learning_rate": 9.813195625502023e-06, + "loss": 0.3625, + "step": 1665 + }, + { + "epoch": 0.5757627995173246, + "grad_norm": 1.3109369869244831, + "learning_rate": 9.81168096140788e-06, + "loss": 0.3496, + "step": 1670 + }, + { + "epoch": 0.5774866402344423, + "grad_norm": 1.4097525415123802, + "learning_rate": 9.810160299263854e-06, + "loss": 0.3222, + "step": 1675 + }, + { + "epoch": 0.57921048095156, + "grad_norm": 1.6167685051895064, + "learning_rate": 9.808633640965538e-06, + "loss": 0.3562, + "step": 1680 + }, + { + "epoch": 0.5809343216686779, + "grad_norm": 1.4068806492951937, + "learning_rate": 9.80710098841601e-06, + "loss": 0.3241, + "step": 1685 + }, + { + "epoch": 0.5826581623857956, + "grad_norm": 1.179635646405619, + "learning_rate": 9.805562343525805e-06, + "loss": 0.3857, + "step": 1690 + }, + { + "epoch": 0.5843820031029133, + "grad_norm": 1.2173491024659624, + "learning_rate": 9.804017708212942e-06, + "loss": 0.3371, + "step": 1695 + }, + { + "epoch": 0.5861058438200311, + "grad_norm": 1.3173546555259819, + "learning_rate": 9.8024670844029e-06, + "loss": 0.343, + "step": 1700 + }, + { + "epoch": 0.5878296845371488, + "grad_norm": 1.4538435102753489, + "learning_rate": 9.800910474028626e-06, + "loss": 0.3582, + "step": 1705 + }, + { + "epoch": 0.5895535252542665, + "grad_norm": 1.490191931421755, + "learning_rate": 9.79934787903053e-06, + "loss": 0.3407, + "step": 1710 + }, + { + "epoch": 0.5912773659713843, + "grad_norm": 1.3278697952759766, + "learning_rate": 9.797779301356476e-06, + "loss": 0.3645, + "step": 1715 + }, + { + "epoch": 0.593001206688502, + "grad_norm": 1.9231811456670627, + "learning_rate": 9.796204742961794e-06, + "loss": 0.3498, + "step": 1720 + }, + { + "epoch": 0.5947250474056197, + "grad_norm": 3.4354823474910896, + "learning_rate": 9.794624205809265e-06, + "loss": 0.3389, + "step": 1725 + }, + { + "epoch": 0.5964488881227374, + "grad_norm": 1.2929264142519374, + "learning_rate": 9.793037691869122e-06, + "loss": 0.3386, + "step": 1730 + }, + { + "epoch": 0.5981727288398552, + "grad_norm": 1.3268190419109211, + "learning_rate": 9.791445203119054e-06, + "loss": 0.3607, + "step": 1735 + }, + { + "epoch": 0.5998965695569729, + "grad_norm": 2.4156815305069195, + "learning_rate": 9.789846741544189e-06, + "loss": 0.3519, + "step": 1740 + }, + { + "epoch": 0.6016204102740906, + "grad_norm": 2.105253860773333, + "learning_rate": 9.78824230913711e-06, + "loss": 0.3482, + "step": 1745 + }, + { + "epoch": 0.6033442509912084, + "grad_norm": 1.4553672573498604, + "learning_rate": 9.786631907897837e-06, + "loss": 0.3444, + "step": 1750 + }, + { + "epoch": 0.6050680917083262, + "grad_norm": 1.1683158328286432, + "learning_rate": 9.785015539833833e-06, + "loss": 0.328, + "step": 1755 + }, + { + "epoch": 0.6067919324254439, + "grad_norm": 1.3640121639678129, + "learning_rate": 9.783393206959994e-06, + "loss": 0.3329, + "step": 1760 + }, + { + "epoch": 0.6085157731425617, + "grad_norm": 1.3717937958436441, + "learning_rate": 9.781764911298662e-06, + "loss": 0.3608, + "step": 1765 + }, + { + "epoch": 0.6102396138596794, + "grad_norm": 1.4079094810940234, + "learning_rate": 9.780130654879598e-06, + "loss": 0.352, + "step": 1770 + }, + { + "epoch": 0.6119634545767971, + "grad_norm": 1.2300630026877328, + "learning_rate": 9.778490439740008e-06, + "loss": 0.3695, + "step": 1775 + }, + { + "epoch": 0.6136872952939149, + "grad_norm": 1.4261147274189423, + "learning_rate": 9.776844267924515e-06, + "loss": 0.3089, + "step": 1780 + }, + { + "epoch": 0.6154111360110326, + "grad_norm": 17.970470975419595, + "learning_rate": 9.775192141485172e-06, + "loss": 0.3452, + "step": 1785 + }, + { + "epoch": 0.6171349767281503, + "grad_norm": 2.3325003922313328, + "learning_rate": 9.773534062481455e-06, + "loss": 0.3341, + "step": 1790 + }, + { + "epoch": 0.618858817445268, + "grad_norm": 1.2644218427270757, + "learning_rate": 9.771870032980258e-06, + "loss": 0.3372, + "step": 1795 + }, + { + "epoch": 0.6205826581623858, + "grad_norm": 2.267288591389249, + "learning_rate": 9.770200055055895e-06, + "loss": 0.368, + "step": 1800 + }, + { + "epoch": 0.6223064988795035, + "grad_norm": 1.511457144800206, + "learning_rate": 9.768524130790092e-06, + "loss": 0.352, + "step": 1805 + }, + { + "epoch": 0.6240303395966212, + "grad_norm": 1.2748023215607918, + "learning_rate": 9.766842262271991e-06, + "loss": 0.3449, + "step": 1810 + }, + { + "epoch": 0.625754180313739, + "grad_norm": 1.6346116653311675, + "learning_rate": 9.765154451598142e-06, + "loss": 0.3413, + "step": 1815 + }, + { + "epoch": 0.6274780210308567, + "grad_norm": 1.2129741803165814, + "learning_rate": 9.763460700872504e-06, + "loss": 0.3323, + "step": 1820 + }, + { + "epoch": 0.6292018617479744, + "grad_norm": 1.5775136340309934, + "learning_rate": 9.761761012206436e-06, + "loss": 0.3464, + "step": 1825 + }, + { + "epoch": 0.6309257024650923, + "grad_norm": 1.1506286707450157, + "learning_rate": 9.760055387718705e-06, + "loss": 0.3733, + "step": 1830 + }, + { + "epoch": 0.63264954318221, + "grad_norm": 3.1483872499981613, + "learning_rate": 9.758343829535475e-06, + "loss": 0.3357, + "step": 1835 + }, + { + "epoch": 0.6343733838993277, + "grad_norm": 6.66861345094595, + "learning_rate": 9.756626339790304e-06, + "loss": 0.35, + "step": 1840 + }, + { + "epoch": 0.6360972246164455, + "grad_norm": 1.3407152188605196, + "learning_rate": 9.754902920624148e-06, + "loss": 0.362, + "step": 1845 + }, + { + "epoch": 0.6378210653335632, + "grad_norm": 1.2053825635726498, + "learning_rate": 9.75317357418535e-06, + "loss": 0.3518, + "step": 1850 + }, + { + "epoch": 0.6395449060506809, + "grad_norm": 1.1351690387521203, + "learning_rate": 9.751438302629648e-06, + "loss": 0.3375, + "step": 1855 + }, + { + "epoch": 0.6412687467677987, + "grad_norm": 1.2654786468781802, + "learning_rate": 9.74969710812016e-06, + "loss": 0.3577, + "step": 1860 + }, + { + "epoch": 0.6429925874849164, + "grad_norm": 1.320102261820997, + "learning_rate": 9.74794999282739e-06, + "loss": 0.3702, + "step": 1865 + }, + { + "epoch": 0.6447164282020341, + "grad_norm": 1.2184163373499772, + "learning_rate": 9.746196958929224e-06, + "loss": 0.3565, + "step": 1870 + }, + { + "epoch": 0.6464402689191519, + "grad_norm": 1.3086320859106735, + "learning_rate": 9.744438008610923e-06, + "loss": 0.3415, + "step": 1875 + }, + { + "epoch": 0.6481641096362696, + "grad_norm": 1.260451122577548, + "learning_rate": 9.742673144065124e-06, + "loss": 0.3591, + "step": 1880 + }, + { + "epoch": 0.6498879503533873, + "grad_norm": 2.2130311744776687, + "learning_rate": 9.740902367491838e-06, + "loss": 0.3321, + "step": 1885 + }, + { + "epoch": 0.651611791070505, + "grad_norm": 1.3269825421431771, + "learning_rate": 9.739125681098445e-06, + "loss": 0.3881, + "step": 1890 + }, + { + "epoch": 0.6533356317876228, + "grad_norm": 1.4558089848240794, + "learning_rate": 9.737343087099688e-06, + "loss": 0.3234, + "step": 1895 + }, + { + "epoch": 0.6550594725047406, + "grad_norm": 1.2647154181590219, + "learning_rate": 9.735554587717683e-06, + "loss": 0.3452, + "step": 1900 + }, + { + "epoch": 0.6567833132218583, + "grad_norm": 1.177397083432282, + "learning_rate": 9.733760185181898e-06, + "loss": 0.3363, + "step": 1905 + }, + { + "epoch": 0.6585071539389761, + "grad_norm": 1.5200540976797097, + "learning_rate": 9.731959881729166e-06, + "loss": 0.3406, + "step": 1910 + }, + { + "epoch": 0.6602309946560938, + "grad_norm": 1.0930012809935852, + "learning_rate": 9.730153679603672e-06, + "loss": 0.337, + "step": 1915 + }, + { + "epoch": 0.6619548353732115, + "grad_norm": 1.4255344620994401, + "learning_rate": 9.728341581056955e-06, + "loss": 0.3449, + "step": 1920 + }, + { + "epoch": 0.6636786760903293, + "grad_norm": 1.2281310053060395, + "learning_rate": 9.726523588347906e-06, + "loss": 0.3604, + "step": 1925 + }, + { + "epoch": 0.665402516807447, + "grad_norm": 1.387743088432441, + "learning_rate": 9.724699703742763e-06, + "loss": 0.3445, + "step": 1930 + }, + { + "epoch": 0.6671263575245647, + "grad_norm": 1.2543456914925757, + "learning_rate": 9.72286992951511e-06, + "loss": 0.3875, + "step": 1935 + }, + { + "epoch": 0.6688501982416825, + "grad_norm": 1.3470232445654011, + "learning_rate": 9.721034267945866e-06, + "loss": 0.3692, + "step": 1940 + }, + { + "epoch": 0.6705740389588002, + "grad_norm": 1.0384993936345281, + "learning_rate": 9.719192721323297e-06, + "loss": 0.3316, + "step": 1945 + }, + { + "epoch": 0.6722978796759179, + "grad_norm": 2.3474011989080723, + "learning_rate": 9.717345291943e-06, + "loss": 0.376, + "step": 1950 + }, + { + "epoch": 0.6740217203930357, + "grad_norm": 1.2785175572492598, + "learning_rate": 9.715491982107905e-06, + "loss": 0.3354, + "step": 1955 + }, + { + "epoch": 0.6757455611101534, + "grad_norm": 1.4315614912377117, + "learning_rate": 9.71363279412828e-06, + "loss": 0.3648, + "step": 1960 + }, + { + "epoch": 0.6774694018272711, + "grad_norm": 1.5159264140521431, + "learning_rate": 9.71176773032171e-06, + "loss": 0.3632, + "step": 1965 + }, + { + "epoch": 0.679193242544389, + "grad_norm": 1.936391805248331, + "learning_rate": 9.70989679301311e-06, + "loss": 0.3251, + "step": 1970 + }, + { + "epoch": 0.6809170832615067, + "grad_norm": 1.1908396682983318, + "learning_rate": 9.708019984534717e-06, + "loss": 0.3385, + "step": 1975 + }, + { + "epoch": 0.6826409239786244, + "grad_norm": 1.194331525773947, + "learning_rate": 9.706137307226085e-06, + "loss": 0.3766, + "step": 1980 + }, + { + "epoch": 0.6843647646957421, + "grad_norm": 1.384193293040237, + "learning_rate": 9.704248763434086e-06, + "loss": 0.3091, + "step": 1985 + }, + { + "epoch": 0.6860886054128599, + "grad_norm": 1.3014083426984635, + "learning_rate": 9.702354355512899e-06, + "loss": 0.3361, + "step": 1990 + }, + { + "epoch": 0.6878124461299776, + "grad_norm": 2.130779427145665, + "learning_rate": 9.700454085824025e-06, + "loss": 0.3382, + "step": 1995 + }, + { + "epoch": 0.6895362868470953, + "grad_norm": 1.3918957544735562, + "learning_rate": 9.698547956736257e-06, + "loss": 0.3356, + "step": 2000 + }, + { + "epoch": 0.6912601275642131, + "grad_norm": 1.1048646806456652, + "learning_rate": 9.696635970625705e-06, + "loss": 0.3796, + "step": 2005 + }, + { + "epoch": 0.6929839682813308, + "grad_norm": 1.255839215498237, + "learning_rate": 9.694718129875772e-06, + "loss": 0.3704, + "step": 2010 + }, + { + "epoch": 0.6947078089984485, + "grad_norm": 1.155480250834214, + "learning_rate": 9.692794436877161e-06, + "loss": 0.3115, + "step": 2015 + }, + { + "epoch": 0.6964316497155663, + "grad_norm": 1.2630790059617156, + "learning_rate": 9.690864894027876e-06, + "loss": 0.3611, + "step": 2020 + }, + { + "epoch": 0.698155490432684, + "grad_norm": 1.2491606966680813, + "learning_rate": 9.688929503733202e-06, + "loss": 0.3678, + "step": 2025 + }, + { + "epoch": 0.6998793311498017, + "grad_norm": 1.317226937620061, + "learning_rate": 9.686988268405725e-06, + "loss": 0.3434, + "step": 2030 + }, + { + "epoch": 0.7016031718669195, + "grad_norm": 2.9635268070050635, + "learning_rate": 9.685041190465306e-06, + "loss": 0.3567, + "step": 2035 + }, + { + "epoch": 0.7033270125840372, + "grad_norm": 1.1111167588783066, + "learning_rate": 9.683088272339098e-06, + "loss": 0.3311, + "step": 2040 + }, + { + "epoch": 0.705050853301155, + "grad_norm": 1.1727225408316133, + "learning_rate": 9.681129516461533e-06, + "loss": 0.3644, + "step": 2045 + }, + { + "epoch": 0.7067746940182728, + "grad_norm": 1.2849192048716935, + "learning_rate": 9.679164925274316e-06, + "loss": 0.3292, + "step": 2050 + }, + { + "epoch": 0.7084985347353905, + "grad_norm": 1.856294010427372, + "learning_rate": 9.677194501226427e-06, + "loss": 0.3598, + "step": 2055 + }, + { + "epoch": 0.7102223754525082, + "grad_norm": 1.4344730481492045, + "learning_rate": 9.675218246774119e-06, + "loss": 0.4037, + "step": 2060 + }, + { + "epoch": 0.711946216169626, + "grad_norm": 1.2596112394096157, + "learning_rate": 9.673236164380912e-06, + "loss": 0.3594, + "step": 2065 + }, + { + "epoch": 0.7136700568867437, + "grad_norm": 1.0430720211348907, + "learning_rate": 9.671248256517593e-06, + "loss": 0.3473, + "step": 2070 + }, + { + "epoch": 0.7153938976038614, + "grad_norm": 2.2414599858134174, + "learning_rate": 9.669254525662206e-06, + "loss": 0.3449, + "step": 2075 + }, + { + "epoch": 0.7171177383209791, + "grad_norm": 1.2595526917390543, + "learning_rate": 9.667254974300058e-06, + "loss": 0.3339, + "step": 2080 + }, + { + "epoch": 0.7188415790380969, + "grad_norm": 1.5145459258516831, + "learning_rate": 9.66524960492371e-06, + "loss": 0.3652, + "step": 2085 + }, + { + "epoch": 0.7205654197552146, + "grad_norm": 1.3622039449929448, + "learning_rate": 9.663238420032974e-06, + "loss": 0.3482, + "step": 2090 + }, + { + "epoch": 0.7222892604723323, + "grad_norm": 4.393090680903247, + "learning_rate": 9.661221422134916e-06, + "loss": 0.3369, + "step": 2095 + }, + { + "epoch": 0.7240131011894501, + "grad_norm": 1.3436475260291876, + "learning_rate": 9.659198613743843e-06, + "loss": 0.346, + "step": 2100 + }, + { + "epoch": 0.7257369419065678, + "grad_norm": 1.1735302491637403, + "learning_rate": 9.657169997381309e-06, + "loss": 0.3287, + "step": 2105 + }, + { + "epoch": 0.7274607826236855, + "grad_norm": 1.6624901063970146, + "learning_rate": 9.655135575576104e-06, + "loss": 0.3296, + "step": 2110 + }, + { + "epoch": 0.7291846233408034, + "grad_norm": 1.1642361211785006, + "learning_rate": 9.653095350864258e-06, + "loss": 0.3646, + "step": 2115 + }, + { + "epoch": 0.7309084640579211, + "grad_norm": 1.2379510076499574, + "learning_rate": 9.651049325789035e-06, + "loss": 0.3384, + "step": 2120 + }, + { + "epoch": 0.7326323047750388, + "grad_norm": 2.350647701258199, + "learning_rate": 9.648997502900927e-06, + "loss": 0.339, + "step": 2125 + }, + { + "epoch": 0.7343561454921566, + "grad_norm": 1.2552644816504759, + "learning_rate": 9.646939884757658e-06, + "loss": 0.3339, + "step": 2130 + }, + { + "epoch": 0.7360799862092743, + "grad_norm": 1.3038930976731238, + "learning_rate": 9.644876473924169e-06, + "loss": 0.3607, + "step": 2135 + }, + { + "epoch": 0.737803826926392, + "grad_norm": 1.1710383911554159, + "learning_rate": 9.642807272972628e-06, + "loss": 0.3267, + "step": 2140 + }, + { + "epoch": 0.7395276676435097, + "grad_norm": 1.1416737908534371, + "learning_rate": 9.640732284482415e-06, + "loss": 0.3512, + "step": 2145 + }, + { + "epoch": 0.7412515083606275, + "grad_norm": 1.1621384764475449, + "learning_rate": 9.638651511040133e-06, + "loss": 0.3516, + "step": 2150 + }, + { + "epoch": 0.7429753490777452, + "grad_norm": 1.1264664231349113, + "learning_rate": 9.636564955239589e-06, + "loss": 0.357, + "step": 2155 + }, + { + "epoch": 0.7446991897948629, + "grad_norm": 1.4580010683693982, + "learning_rate": 9.6344726196818e-06, + "loss": 0.3437, + "step": 2160 + }, + { + "epoch": 0.7464230305119807, + "grad_norm": 1.444505088947196, + "learning_rate": 9.632374506974989e-06, + "loss": 0.3521, + "step": 2165 + }, + { + "epoch": 0.7481468712290984, + "grad_norm": 1.3267725643785786, + "learning_rate": 9.63027061973458e-06, + "loss": 0.3367, + "step": 2170 + }, + { + "epoch": 0.7498707119462161, + "grad_norm": 1.1604147870475694, + "learning_rate": 9.628160960583193e-06, + "loss": 0.3767, + "step": 2175 + }, + { + "epoch": 0.7515945526633339, + "grad_norm": 2.337394597573418, + "learning_rate": 9.626045532150645e-06, + "loss": 0.3725, + "step": 2180 + }, + { + "epoch": 0.7533183933804517, + "grad_norm": 1.2134445156430822, + "learning_rate": 9.62392433707395e-06, + "loss": 0.3569, + "step": 2185 + }, + { + "epoch": 0.7550422340975694, + "grad_norm": 1.6532910321870786, + "learning_rate": 9.6217973779973e-06, + "loss": 0.3532, + "step": 2190 + }, + { + "epoch": 0.7567660748146872, + "grad_norm": 1.3308694918317898, + "learning_rate": 9.619664657572077e-06, + "loss": 0.3364, + "step": 2195 + }, + { + "epoch": 0.7584899155318049, + "grad_norm": 1.4402124304669235, + "learning_rate": 9.61752617845685e-06, + "loss": 0.3678, + "step": 2200 + }, + { + "epoch": 0.7602137562489226, + "grad_norm": 1.407378477530841, + "learning_rate": 9.615381943317358e-06, + "loss": 0.3388, + "step": 2205 + }, + { + "epoch": 0.7619375969660404, + "grad_norm": 1.3100326233019954, + "learning_rate": 9.613231954826522e-06, + "loss": 0.3434, + "step": 2210 + }, + { + "epoch": 0.7636614376831581, + "grad_norm": 1.3152288801879068, + "learning_rate": 9.61107621566443e-06, + "loss": 0.3761, + "step": 2215 + }, + { + "epoch": 0.7653852784002758, + "grad_norm": 1.8018198787615154, + "learning_rate": 9.608914728518342e-06, + "loss": 0.3421, + "step": 2220 + }, + { + "epoch": 0.7671091191173935, + "grad_norm": 1.3715552770372919, + "learning_rate": 9.60674749608268e-06, + "loss": 0.3842, + "step": 2225 + }, + { + "epoch": 0.7688329598345113, + "grad_norm": 1.2502019396132484, + "learning_rate": 9.604574521059031e-06, + "loss": 0.3527, + "step": 2230 + }, + { + "epoch": 0.770556800551629, + "grad_norm": 1.292033578517676, + "learning_rate": 9.602395806156138e-06, + "loss": 0.3373, + "step": 2235 + }, + { + "epoch": 0.7722806412687467, + "grad_norm": 1.2513714620754817, + "learning_rate": 9.600211354089903e-06, + "loss": 0.3696, + "step": 2240 + }, + { + "epoch": 0.7740044819858645, + "grad_norm": 1.2447118649170084, + "learning_rate": 9.598021167583374e-06, + "loss": 0.339, + "step": 2245 + }, + { + "epoch": 0.7757283227029822, + "grad_norm": 1.4852008354533497, + "learning_rate": 9.595825249366751e-06, + "loss": 0.3278, + "step": 2250 + }, + { + "epoch": 0.7774521634200999, + "grad_norm": 1.3741570480902097, + "learning_rate": 9.593623602177378e-06, + "loss": 0.3537, + "step": 2255 + }, + { + "epoch": 0.7791760041372178, + "grad_norm": 1.7110318822891042, + "learning_rate": 9.59141622875974e-06, + "loss": 0.3133, + "step": 2260 + }, + { + "epoch": 0.7808998448543355, + "grad_norm": 1.0798645745221636, + "learning_rate": 9.589203131865464e-06, + "loss": 0.3188, + "step": 2265 + }, + { + "epoch": 0.7826236855714532, + "grad_norm": 1.4075076569973053, + "learning_rate": 9.586984314253307e-06, + "loss": 0.3378, + "step": 2270 + }, + { + "epoch": 0.784347526288571, + "grad_norm": 1.2503754359181298, + "learning_rate": 9.584759778689157e-06, + "loss": 0.3364, + "step": 2275 + }, + { + "epoch": 0.7860713670056887, + "grad_norm": 1.1529449498678912, + "learning_rate": 9.582529527946032e-06, + "loss": 0.333, + "step": 2280 + }, + { + "epoch": 0.7877952077228064, + "grad_norm": 1.333286467678276, + "learning_rate": 9.580293564804074e-06, + "loss": 0.3512, + "step": 2285 + }, + { + "epoch": 0.7895190484399242, + "grad_norm": 3.0938545533335153, + "learning_rate": 9.578051892050548e-06, + "loss": 0.3487, + "step": 2290 + }, + { + "epoch": 0.7912428891570419, + "grad_norm": 1.232161292795314, + "learning_rate": 9.57580451247983e-06, + "loss": 0.3503, + "step": 2295 + }, + { + "epoch": 0.7929667298741596, + "grad_norm": 1.2968956904022089, + "learning_rate": 9.573551428893419e-06, + "loss": 0.3635, + "step": 2300 + }, + { + "epoch": 0.7946905705912773, + "grad_norm": 1.2207877754566046, + "learning_rate": 9.571292644099914e-06, + "loss": 0.3435, + "step": 2305 + }, + { + "epoch": 0.7964144113083951, + "grad_norm": 1.485339530667337, + "learning_rate": 9.569028160915028e-06, + "loss": 0.3545, + "step": 2310 + }, + { + "epoch": 0.7981382520255128, + "grad_norm": 1.5574049401540915, + "learning_rate": 9.566757982161576e-06, + "loss": 0.3382, + "step": 2315 + }, + { + "epoch": 0.7998620927426305, + "grad_norm": 1.1584070857731443, + "learning_rate": 9.564482110669473e-06, + "loss": 0.3396, + "step": 2320 + }, + { + "epoch": 0.8015859334597483, + "grad_norm": 1.0932314469914213, + "learning_rate": 9.56220054927573e-06, + "loss": 0.3654, + "step": 2325 + }, + { + "epoch": 0.8033097741768661, + "grad_norm": 1.7106605016330145, + "learning_rate": 9.559913300824448e-06, + "loss": 0.3235, + "step": 2330 + }, + { + "epoch": 0.8050336148939838, + "grad_norm": 1.332995683769837, + "learning_rate": 9.55762036816682e-06, + "loss": 0.3302, + "step": 2335 + }, + { + "epoch": 0.8067574556111016, + "grad_norm": 2.651225259868728, + "learning_rate": 9.555321754161128e-06, + "loss": 0.333, + "step": 2340 + }, + { + "epoch": 0.8084812963282193, + "grad_norm": 1.0688478928234517, + "learning_rate": 9.553017461672731e-06, + "loss": 0.3239, + "step": 2345 + }, + { + "epoch": 0.810205137045337, + "grad_norm": 1.8617172914803979, + "learning_rate": 9.550707493574068e-06, + "loss": 0.3424, + "step": 2350 + }, + { + "epoch": 0.8119289777624548, + "grad_norm": 1.1028777782836379, + "learning_rate": 9.548391852744653e-06, + "loss": 0.3154, + "step": 2355 + }, + { + "epoch": 0.8136528184795725, + "grad_norm": 1.3623541183125842, + "learning_rate": 9.546070542071072e-06, + "loss": 0.3488, + "step": 2360 + }, + { + "epoch": 0.8153766591966902, + "grad_norm": 1.2406464456818898, + "learning_rate": 9.543743564446978e-06, + "loss": 0.3329, + "step": 2365 + }, + { + "epoch": 0.817100499913808, + "grad_norm": 1.536245772157553, + "learning_rate": 9.541410922773089e-06, + "loss": 0.3423, + "step": 2370 + }, + { + "epoch": 0.8188243406309257, + "grad_norm": 1.5397276731246494, + "learning_rate": 9.539072619957183e-06, + "loss": 0.3247, + "step": 2375 + }, + { + "epoch": 0.8205481813480434, + "grad_norm": 1.8060539523506391, + "learning_rate": 9.536728658914097e-06, + "loss": 0.3568, + "step": 2380 + }, + { + "epoch": 0.8222720220651611, + "grad_norm": 1.4592439115566356, + "learning_rate": 9.534379042565717e-06, + "loss": 0.3336, + "step": 2385 + }, + { + "epoch": 0.8239958627822789, + "grad_norm": 1.3185535134406328, + "learning_rate": 9.532023773840982e-06, + "loss": 0.3392, + "step": 2390 + }, + { + "epoch": 0.8257197034993966, + "grad_norm": 1.3342775023387434, + "learning_rate": 9.529662855675876e-06, + "loss": 0.315, + "step": 2395 + }, + { + "epoch": 0.8274435442165144, + "grad_norm": 1.3718867632280016, + "learning_rate": 9.527296291013426e-06, + "loss": 0.3243, + "step": 2400 + }, + { + "epoch": 0.8291673849336322, + "grad_norm": 1.3647005703897526, + "learning_rate": 9.524924082803698e-06, + "loss": 0.3333, + "step": 2405 + }, + { + "epoch": 0.8308912256507499, + "grad_norm": 1.5042258411470462, + "learning_rate": 9.522546234003788e-06, + "loss": 0.336, + "step": 2410 + }, + { + "epoch": 0.8326150663678676, + "grad_norm": 1.3461680568437342, + "learning_rate": 9.520162747577835e-06, + "loss": 0.3365, + "step": 2415 + }, + { + "epoch": 0.8343389070849854, + "grad_norm": 1.1933386344717056, + "learning_rate": 9.517773626496993e-06, + "loss": 0.3504, + "step": 2420 + }, + { + "epoch": 0.8360627478021031, + "grad_norm": 1.1962471023938015, + "learning_rate": 9.515378873739446e-06, + "loss": 0.3557, + "step": 2425 + }, + { + "epoch": 0.8377865885192208, + "grad_norm": 1.2334697182629366, + "learning_rate": 9.512978492290399e-06, + "loss": 0.3505, + "step": 2430 + }, + { + "epoch": 0.8395104292363386, + "grad_norm": 1.2404195025607767, + "learning_rate": 9.51057248514207e-06, + "loss": 0.3286, + "step": 2435 + }, + { + "epoch": 0.8412342699534563, + "grad_norm": 1.4456394535074233, + "learning_rate": 9.508160855293692e-06, + "loss": 0.3703, + "step": 2440 + }, + { + "epoch": 0.842958110670574, + "grad_norm": 1.211630844825725, + "learning_rate": 9.505743605751508e-06, + "loss": 0.3327, + "step": 2445 + }, + { + "epoch": 0.8446819513876918, + "grad_norm": 1.2508066585995028, + "learning_rate": 9.503320739528765e-06, + "loss": 0.3198, + "step": 2450 + }, + { + "epoch": 0.8464057921048095, + "grad_norm": 1.5023037643225363, + "learning_rate": 9.500892259645711e-06, + "loss": 0.3555, + "step": 2455 + }, + { + "epoch": 0.8481296328219272, + "grad_norm": 1.3347230510275558, + "learning_rate": 9.498458169129592e-06, + "loss": 0.353, + "step": 2460 + }, + { + "epoch": 0.849853473539045, + "grad_norm": 2.7517703869349965, + "learning_rate": 9.496018471014647e-06, + "loss": 0.3576, + "step": 2465 + }, + { + "epoch": 0.8515773142561627, + "grad_norm": 1.4814688915816552, + "learning_rate": 9.493573168342109e-06, + "loss": 0.3301, + "step": 2470 + }, + { + "epoch": 0.8533011549732805, + "grad_norm": 1.2196033889361055, + "learning_rate": 9.491122264160196e-06, + "loss": 0.297, + "step": 2475 + }, + { + "epoch": 0.8550249956903982, + "grad_norm": 1.275401994882182, + "learning_rate": 9.488665761524103e-06, + "loss": 0.3234, + "step": 2480 + }, + { + "epoch": 0.856748836407516, + "grad_norm": 3.4667876017122747, + "learning_rate": 9.486203663496013e-06, + "loss": 0.354, + "step": 2485 + }, + { + "epoch": 0.8584726771246337, + "grad_norm": 1.2188819142562994, + "learning_rate": 9.483735973145073e-06, + "loss": 0.3304, + "step": 2490 + }, + { + "epoch": 0.8601965178417514, + "grad_norm": 1.2053209562676213, + "learning_rate": 9.481262693547416e-06, + "loss": 0.3272, + "step": 2495 + }, + { + "epoch": 0.8619203585588692, + "grad_norm": 1.157798675578704, + "learning_rate": 9.47878382778613e-06, + "loss": 0.3283, + "step": 2500 + }, + { + "epoch": 0.8636441992759869, + "grad_norm": 1.277783662247749, + "learning_rate": 9.476299378951267e-06, + "loss": 0.3269, + "step": 2505 + }, + { + "epoch": 0.8653680399931046, + "grad_norm": 1.2785783526501449, + "learning_rate": 9.473809350139846e-06, + "loss": 0.3482, + "step": 2510 + }, + { + "epoch": 0.8670918807102224, + "grad_norm": 1.36486848600861, + "learning_rate": 9.471313744455839e-06, + "loss": 0.363, + "step": 2515 + }, + { + "epoch": 0.8688157214273401, + "grad_norm": 1.1617192231119264, + "learning_rate": 9.468812565010164e-06, + "loss": 0.3517, + "step": 2520 + }, + { + "epoch": 0.8705395621444578, + "grad_norm": 1.5440363899586036, + "learning_rate": 9.466305814920695e-06, + "loss": 0.3313, + "step": 2525 + }, + { + "epoch": 0.8722634028615756, + "grad_norm": 1.0159466150889511, + "learning_rate": 9.463793497312246e-06, + "loss": 0.3326, + "step": 2530 + }, + { + "epoch": 0.8739872435786933, + "grad_norm": 1.2885891218483585, + "learning_rate": 9.461275615316571e-06, + "loss": 0.3579, + "step": 2535 + }, + { + "epoch": 0.875711084295811, + "grad_norm": 2.4592342977104957, + "learning_rate": 9.458752172072363e-06, + "loss": 0.3556, + "step": 2540 + }, + { + "epoch": 0.8774349250129289, + "grad_norm": 1.472629523629724, + "learning_rate": 9.456223170725244e-06, + "loss": 0.3517, + "step": 2545 + }, + { + "epoch": 0.8791587657300466, + "grad_norm": 1.4734918044829217, + "learning_rate": 9.453688614427772e-06, + "loss": 0.3058, + "step": 2550 + }, + { + "epoch": 0.8808826064471643, + "grad_norm": 1.4448743076562125, + "learning_rate": 9.451148506339416e-06, + "loss": 0.357, + "step": 2555 + }, + { + "epoch": 0.882606447164282, + "grad_norm": 4.491847049166671, + "learning_rate": 9.44860284962658e-06, + "loss": 0.3279, + "step": 2560 + }, + { + "epoch": 0.8843302878813998, + "grad_norm": 1.3416304743321867, + "learning_rate": 9.446051647462573e-06, + "loss": 0.3478, + "step": 2565 + }, + { + "epoch": 0.8860541285985175, + "grad_norm": 1.1288996950397197, + "learning_rate": 9.443494903027626e-06, + "loss": 0.3319, + "step": 2570 + }, + { + "epoch": 0.8877779693156352, + "grad_norm": 0.901498432790985, + "learning_rate": 9.440932619508873e-06, + "loss": 0.3426, + "step": 2575 + }, + { + "epoch": 0.889501810032753, + "grad_norm": 1.1035942636790148, + "learning_rate": 9.438364800100355e-06, + "loss": 0.3306, + "step": 2580 + }, + { + "epoch": 0.8912256507498707, + "grad_norm": 1.0523924262373272, + "learning_rate": 9.435791448003013e-06, + "loss": 0.3198, + "step": 2585 + }, + { + "epoch": 0.8929494914669884, + "grad_norm": 1.4009671647525208, + "learning_rate": 9.433212566424687e-06, + "loss": 0.3303, + "step": 2590 + }, + { + "epoch": 0.8946733321841062, + "grad_norm": 1.1826714745257199, + "learning_rate": 9.430628158580106e-06, + "loss": 0.3471, + "step": 2595 + }, + { + "epoch": 0.8963971729012239, + "grad_norm": 1.4810150161689448, + "learning_rate": 9.42803822769089e-06, + "loss": 0.3574, + "step": 2600 + }, + { + "epoch": 0.8981210136183416, + "grad_norm": 1.441077514610335, + "learning_rate": 9.425442776985545e-06, + "loss": 0.3435, + "step": 2605 + }, + { + "epoch": 0.8998448543354594, + "grad_norm": 1.1357452889666666, + "learning_rate": 9.422841809699456e-06, + "loss": 0.3241, + "step": 2610 + }, + { + "epoch": 0.9015686950525772, + "grad_norm": 2.0085711068152, + "learning_rate": 9.420235329074884e-06, + "loss": 0.3234, + "step": 2615 + }, + { + "epoch": 0.9032925357696949, + "grad_norm": 0.9489424603080546, + "learning_rate": 9.417623338360969e-06, + "loss": 0.3154, + "step": 2620 + }, + { + "epoch": 0.9050163764868127, + "grad_norm": 1.4232674265899794, + "learning_rate": 9.415005840813707e-06, + "loss": 0.3303, + "step": 2625 + }, + { + "epoch": 0.9067402172039304, + "grad_norm": 1.3382898651573043, + "learning_rate": 9.41238283969597e-06, + "loss": 0.3301, + "step": 2630 + }, + { + "epoch": 0.9084640579210481, + "grad_norm": 1.1231685188301026, + "learning_rate": 9.409754338277488e-06, + "loss": 0.3085, + "step": 2635 + }, + { + "epoch": 0.9101878986381658, + "grad_norm": 1.5592137300364184, + "learning_rate": 9.407120339834844e-06, + "loss": 0.3581, + "step": 2640 + }, + { + "epoch": 0.9119117393552836, + "grad_norm": 1.4910951835642579, + "learning_rate": 9.404480847651478e-06, + "loss": 0.335, + "step": 2645 + }, + { + "epoch": 0.9136355800724013, + "grad_norm": 1.3912407376667717, + "learning_rate": 9.401835865017672e-06, + "loss": 0.3016, + "step": 2650 + }, + { + "epoch": 0.915359420789519, + "grad_norm": 1.253916795107278, + "learning_rate": 9.399185395230561e-06, + "loss": 0.3157, + "step": 2655 + }, + { + "epoch": 0.9170832615066368, + "grad_norm": 2.2440199096218425, + "learning_rate": 9.396529441594108e-06, + "loss": 0.3496, + "step": 2660 + }, + { + "epoch": 0.9188071022237545, + "grad_norm": 1.419891140198786, + "learning_rate": 9.393868007419128e-06, + "loss": 0.3507, + "step": 2665 + }, + { + "epoch": 0.9205309429408722, + "grad_norm": 1.2073706223295508, + "learning_rate": 9.391201096023253e-06, + "loss": 0.3083, + "step": 2670 + }, + { + "epoch": 0.92225478365799, + "grad_norm": 1.2615155330650807, + "learning_rate": 9.388528710730948e-06, + "loss": 0.321, + "step": 2675 + }, + { + "epoch": 0.9239786243751077, + "grad_norm": 1.339076863408327, + "learning_rate": 9.385850854873507e-06, + "loss": 0.3353, + "step": 2680 + }, + { + "epoch": 0.9257024650922254, + "grad_norm": 1.2825836435340867, + "learning_rate": 9.383167531789034e-06, + "loss": 0.3511, + "step": 2685 + }, + { + "epoch": 0.9274263058093433, + "grad_norm": 1.7019430585409971, + "learning_rate": 9.380478744822455e-06, + "loss": 0.3074, + "step": 2690 + }, + { + "epoch": 0.929150146526461, + "grad_norm": 1.233264044715385, + "learning_rate": 9.377784497325501e-06, + "loss": 0.3282, + "step": 2695 + }, + { + "epoch": 0.9308739872435787, + "grad_norm": 1.2831370149893337, + "learning_rate": 9.37508479265672e-06, + "loss": 0.317, + "step": 2700 + }, + { + "epoch": 0.9325978279606965, + "grad_norm": 1.2661466151020366, + "learning_rate": 9.372379634181451e-06, + "loss": 0.3164, + "step": 2705 + }, + { + "epoch": 0.9343216686778142, + "grad_norm": 1.3461679472854156, + "learning_rate": 9.36966902527184e-06, + "loss": 0.3859, + "step": 2710 + }, + { + "epoch": 0.9360455093949319, + "grad_norm": 1.3258623040461295, + "learning_rate": 9.366952969306821e-06, + "loss": 0.343, + "step": 2715 + }, + { + "epoch": 0.9377693501120496, + "grad_norm": 1.2837719220100288, + "learning_rate": 9.364231469672125e-06, + "loss": 0.31, + "step": 2720 + }, + { + "epoch": 0.9394931908291674, + "grad_norm": 1.1383293285293314, + "learning_rate": 9.361504529760261e-06, + "loss": 0.3466, + "step": 2725 + }, + { + "epoch": 0.9412170315462851, + "grad_norm": 1.2941441350697112, + "learning_rate": 9.358772152970528e-06, + "loss": 0.3444, + "step": 2730 + }, + { + "epoch": 0.9429408722634028, + "grad_norm": 1.69131432027105, + "learning_rate": 9.356034342708995e-06, + "loss": 0.3672, + "step": 2735 + }, + { + "epoch": 0.9446647129805206, + "grad_norm": 0.9926487053723504, + "learning_rate": 9.353291102388509e-06, + "loss": 0.3131, + "step": 2740 + }, + { + "epoch": 0.9463885536976383, + "grad_norm": 1.3410566345399482, + "learning_rate": 9.350542435428682e-06, + "loss": 0.3183, + "step": 2745 + }, + { + "epoch": 0.948112394414756, + "grad_norm": 1.3214435277868297, + "learning_rate": 9.347788345255895e-06, + "loss": 0.3237, + "step": 2750 + }, + { + "epoch": 0.9498362351318738, + "grad_norm": 1.0277835304235838, + "learning_rate": 9.345028835303287e-06, + "loss": 0.3319, + "step": 2755 + }, + { + "epoch": 0.9515600758489916, + "grad_norm": 4.14593407556766, + "learning_rate": 9.342263909010752e-06, + "loss": 0.3116, + "step": 2760 + }, + { + "epoch": 0.9532839165661093, + "grad_norm": 1.6385378541366884, + "learning_rate": 9.339493569824937e-06, + "loss": 0.3329, + "step": 2765 + }, + { + "epoch": 0.9550077572832271, + "grad_norm": 1.3821378989058908, + "learning_rate": 9.336717821199237e-06, + "loss": 0.3046, + "step": 2770 + }, + { + "epoch": 0.9567315980003448, + "grad_norm": 1.4381237457331928, + "learning_rate": 9.33393666659379e-06, + "loss": 0.3285, + "step": 2775 + }, + { + "epoch": 0.9584554387174625, + "grad_norm": 1.216809366937311, + "learning_rate": 9.331150109475473e-06, + "loss": 0.3125, + "step": 2780 + }, + { + "epoch": 0.9601792794345803, + "grad_norm": 1.3041047185014534, + "learning_rate": 9.328358153317895e-06, + "loss": 0.3826, + "step": 2785 + }, + { + "epoch": 0.961903120151698, + "grad_norm": 2.2222305774162647, + "learning_rate": 9.3255608016014e-06, + "loss": 0.3297, + "step": 2790 + }, + { + "epoch": 0.9636269608688157, + "grad_norm": 1.1654097857044674, + "learning_rate": 9.322758057813053e-06, + "loss": 0.3307, + "step": 2795 + }, + { + "epoch": 0.9653508015859334, + "grad_norm": 1.6721298722949953, + "learning_rate": 9.319949925446646e-06, + "loss": 0.3361, + "step": 2800 + }, + { + "epoch": 0.9670746423030512, + "grad_norm": 1.752818697182659, + "learning_rate": 9.31713640800268e-06, + "loss": 0.3317, + "step": 2805 + }, + { + "epoch": 0.9687984830201689, + "grad_norm": 1.5385925523268744, + "learning_rate": 9.31431750898838e-06, + "loss": 0.353, + "step": 2810 + }, + { + "epoch": 0.9705223237372866, + "grad_norm": 2.029079104356329, + "learning_rate": 9.311493231917668e-06, + "loss": 0.3441, + "step": 2815 + }, + { + "epoch": 0.9722461644544044, + "grad_norm": 2.3846356000705025, + "learning_rate": 9.308663580311176e-06, + "loss": 0.3238, + "step": 2820 + }, + { + "epoch": 0.9739700051715221, + "grad_norm": 1.1577758429063203, + "learning_rate": 9.30582855769624e-06, + "loss": 0.3658, + "step": 2825 + }, + { + "epoch": 0.9756938458886399, + "grad_norm": 2.1131532793323284, + "learning_rate": 9.30298816760688e-06, + "loss": 0.289, + "step": 2830 + }, + { + "epoch": 0.9774176866057577, + "grad_norm": 1.3175010394166544, + "learning_rate": 9.300142413583815e-06, + "loss": 0.3084, + "step": 2835 + }, + { + "epoch": 0.9791415273228754, + "grad_norm": 1.3865265635249566, + "learning_rate": 9.297291299174451e-06, + "loss": 0.3108, + "step": 2840 + }, + { + "epoch": 0.9808653680399931, + "grad_norm": 1.2854712886862314, + "learning_rate": 9.294434827932873e-06, + "loss": 0.3238, + "step": 2845 + }, + { + "epoch": 0.9825892087571109, + "grad_norm": 1.3196575952522713, + "learning_rate": 9.29157300341984e-06, + "loss": 0.3476, + "step": 2850 + }, + { + "epoch": 0.9843130494742286, + "grad_norm": 1.166391706467773, + "learning_rate": 9.288705829202795e-06, + "loss": 0.3467, + "step": 2855 + }, + { + "epoch": 0.9860368901913463, + "grad_norm": 2.1517267968283766, + "learning_rate": 9.28583330885584e-06, + "loss": 0.3206, + "step": 2860 + }, + { + "epoch": 0.9877607309084641, + "grad_norm": 1.4302470939931653, + "learning_rate": 9.282955445959742e-06, + "loss": 0.2991, + "step": 2865 + }, + { + "epoch": 0.9894845716255818, + "grad_norm": 1.2982846886282287, + "learning_rate": 9.280072244101935e-06, + "loss": 0.3439, + "step": 2870 + }, + { + "epoch": 0.9912084123426995, + "grad_norm": 1.4171862473277814, + "learning_rate": 9.277183706876503e-06, + "loss": 0.33, + "step": 2875 + }, + { + "epoch": 0.9929322530598172, + "grad_norm": 2.0529464439224743, + "learning_rate": 9.274289837884177e-06, + "loss": 0.3273, + "step": 2880 + }, + { + "epoch": 0.994656093776935, + "grad_norm": 1.4179302230128747, + "learning_rate": 9.271390640732344e-06, + "loss": 0.3249, + "step": 2885 + }, + { + "epoch": 0.9963799344940527, + "grad_norm": 1.1838232344069644, + "learning_rate": 9.268486119035024e-06, + "loss": 0.3261, + "step": 2890 + }, + { + "epoch": 0.9981037752111704, + "grad_norm": 1.0809177117913324, + "learning_rate": 9.26557627641288e-06, + "loss": 0.3367, + "step": 2895 + }, + { + "epoch": 0.9998276159282882, + "grad_norm": 1.3718377055896473, + "learning_rate": 9.262661116493206e-06, + "loss": 0.3793, + "step": 2900 + }, + { + "epoch": 1.0013790725736942, + "grad_norm": 1.4469424397242794, + "learning_rate": 9.259740642909925e-06, + "loss": 0.3396, + "step": 2905 + }, + { + "epoch": 1.003102913290812, + "grad_norm": 1.2088296961564604, + "learning_rate": 9.25681485930358e-06, + "loss": 0.2943, + "step": 2910 + }, + { + "epoch": 1.0048267540079296, + "grad_norm": 1.0212872692451622, + "learning_rate": 9.253883769321338e-06, + "loss": 0.3255, + "step": 2915 + }, + { + "epoch": 1.0065505947250475, + "grad_norm": 1.2957301045173608, + "learning_rate": 9.250947376616981e-06, + "loss": 0.3406, + "step": 2920 + }, + { + "epoch": 1.008274435442165, + "grad_norm": 1.200707887021714, + "learning_rate": 9.248005684850899e-06, + "loss": 0.3247, + "step": 2925 + }, + { + "epoch": 1.009998276159283, + "grad_norm": 1.9336721553938447, + "learning_rate": 9.245058697690082e-06, + "loss": 0.33, + "step": 2930 + }, + { + "epoch": 1.0117221168764006, + "grad_norm": 1.2363403152433825, + "learning_rate": 9.242106418808135e-06, + "loss": 0.2935, + "step": 2935 + }, + { + "epoch": 1.0134459575935184, + "grad_norm": 1.0602086781702837, + "learning_rate": 9.239148851885246e-06, + "loss": 0.333, + "step": 2940 + }, + { + "epoch": 1.015169798310636, + "grad_norm": 1.0500617420268046, + "learning_rate": 9.236186000608202e-06, + "loss": 0.2984, + "step": 2945 + }, + { + "epoch": 1.0168936390277539, + "grad_norm": 1.4653937545800964, + "learning_rate": 9.233217868670375e-06, + "loss": 0.3551, + "step": 2950 + }, + { + "epoch": 1.0186174797448715, + "grad_norm": 1.1579878357723785, + "learning_rate": 9.23024445977172e-06, + "loss": 0.323, + "step": 2955 + }, + { + "epoch": 1.0203413204619893, + "grad_norm": 1.3960967621772689, + "learning_rate": 9.22726577761877e-06, + "loss": 0.328, + "step": 2960 + }, + { + "epoch": 1.022065161179107, + "grad_norm": 1.6683684640391248, + "learning_rate": 9.224281825924633e-06, + "loss": 0.3151, + "step": 2965 + }, + { + "epoch": 1.0237890018962248, + "grad_norm": 1.3594375481318157, + "learning_rate": 9.221292608408981e-06, + "loss": 0.3328, + "step": 2970 + }, + { + "epoch": 1.0255128426133426, + "grad_norm": 1.1961076479551558, + "learning_rate": 9.218298128798057e-06, + "loss": 0.2866, + "step": 2975 + }, + { + "epoch": 1.0272366833304603, + "grad_norm": 1.3264006343525812, + "learning_rate": 9.21529839082466e-06, + "loss": 0.3634, + "step": 2980 + }, + { + "epoch": 1.028960524047578, + "grad_norm": 1.4380114758343852, + "learning_rate": 9.212293398228143e-06, + "loss": 0.326, + "step": 2985 + }, + { + "epoch": 1.0306843647646957, + "grad_norm": 1.2633287466496788, + "learning_rate": 9.209283154754407e-06, + "loss": 0.3243, + "step": 2990 + }, + { + "epoch": 1.0324082054818136, + "grad_norm": 1.2068503164627076, + "learning_rate": 9.206267664155906e-06, + "loss": 0.3284, + "step": 2995 + }, + { + "epoch": 1.0341320461989312, + "grad_norm": 0.9972841581038544, + "learning_rate": 9.20324693019163e-06, + "loss": 0.3141, + "step": 3000 + }, + { + "epoch": 1.035855886916049, + "grad_norm": 1.3456009009348888, + "learning_rate": 9.200220956627103e-06, + "loss": 0.3067, + "step": 3005 + }, + { + "epoch": 1.0375797276331666, + "grad_norm": 1.4124353691852705, + "learning_rate": 9.197189747234386e-06, + "loss": 0.3139, + "step": 3010 + }, + { + "epoch": 1.0393035683502845, + "grad_norm": 1.2134329536513566, + "learning_rate": 9.194153305792063e-06, + "loss": 0.303, + "step": 3015 + }, + { + "epoch": 1.041027409067402, + "grad_norm": 1.225486462924102, + "learning_rate": 9.191111636085239e-06, + "loss": 0.3456, + "step": 3020 + }, + { + "epoch": 1.04275124978452, + "grad_norm": 1.200538272769869, + "learning_rate": 9.188064741905541e-06, + "loss": 0.3398, + "step": 3025 + }, + { + "epoch": 1.0444750905016376, + "grad_norm": 1.1788390122132568, + "learning_rate": 9.185012627051104e-06, + "loss": 0.308, + "step": 3030 + }, + { + "epoch": 1.0461989312187554, + "grad_norm": 1.1923896675301378, + "learning_rate": 9.181955295326577e-06, + "loss": 0.2828, + "step": 3035 + }, + { + "epoch": 1.047922771935873, + "grad_norm": 1.2818716741324634, + "learning_rate": 9.178892750543102e-06, + "loss": 0.3064, + "step": 3040 + }, + { + "epoch": 1.0496466126529909, + "grad_norm": 1.754963405197714, + "learning_rate": 9.175824996518328e-06, + "loss": 0.317, + "step": 3045 + }, + { + "epoch": 1.0513704533701087, + "grad_norm": 1.4595740948374027, + "learning_rate": 9.172752037076397e-06, + "loss": 0.3331, + "step": 3050 + }, + { + "epoch": 1.0530942940872263, + "grad_norm": 1.4914539720002655, + "learning_rate": 9.169673876047935e-06, + "loss": 0.3124, + "step": 3055 + }, + { + "epoch": 1.0548181348043442, + "grad_norm": 1.3450930772698597, + "learning_rate": 9.166590517270057e-06, + "loss": 0.3342, + "step": 3060 + }, + { + "epoch": 1.0565419755214618, + "grad_norm": 1.5353690769757897, + "learning_rate": 9.163501964586352e-06, + "loss": 0.3258, + "step": 3065 + }, + { + "epoch": 1.0582658162385796, + "grad_norm": 1.31240648943605, + "learning_rate": 9.160408221846892e-06, + "loss": 0.279, + "step": 3070 + }, + { + "epoch": 1.0599896569556972, + "grad_norm": 1.3020899417469012, + "learning_rate": 9.157309292908209e-06, + "loss": 0.3284, + "step": 3075 + }, + { + "epoch": 1.061713497672815, + "grad_norm": 1.123033745362766, + "learning_rate": 9.154205181633307e-06, + "loss": 0.3202, + "step": 3080 + }, + { + "epoch": 1.0634373383899327, + "grad_norm": 1.3700619980016504, + "learning_rate": 9.151095891891645e-06, + "loss": 0.3505, + "step": 3085 + }, + { + "epoch": 1.0651611791070505, + "grad_norm": 1.2780386112530913, + "learning_rate": 9.147981427559143e-06, + "loss": 0.2824, + "step": 3090 + }, + { + "epoch": 1.0668850198241682, + "grad_norm": 1.101535842869913, + "learning_rate": 9.144861792518165e-06, + "loss": 0.3002, + "step": 3095 + }, + { + "epoch": 1.068608860541286, + "grad_norm": 2.1390900051858357, + "learning_rate": 9.141736990657525e-06, + "loss": 0.3183, + "step": 3100 + }, + { + "epoch": 1.0703327012584036, + "grad_norm": 1.7802300522958283, + "learning_rate": 9.138607025872479e-06, + "loss": 0.319, + "step": 3105 + }, + { + "epoch": 1.0720565419755215, + "grad_norm": 2.541754822505671, + "learning_rate": 9.135471902064715e-06, + "loss": 0.3219, + "step": 3110 + }, + { + "epoch": 1.073780382692639, + "grad_norm": 1.1246883363907023, + "learning_rate": 9.13233162314235e-06, + "loss": 0.2884, + "step": 3115 + }, + { + "epoch": 1.075504223409757, + "grad_norm": 1.1851281288816107, + "learning_rate": 9.129186193019936e-06, + "loss": 0.3292, + "step": 3120 + }, + { + "epoch": 1.0772280641268748, + "grad_norm": 1.317805691787828, + "learning_rate": 9.126035615618436e-06, + "loss": 0.3393, + "step": 3125 + }, + { + "epoch": 1.0789519048439924, + "grad_norm": 0.9821849032061132, + "learning_rate": 9.12287989486524e-06, + "loss": 0.3064, + "step": 3130 + }, + { + "epoch": 1.0806757455611102, + "grad_norm": 3.1902851908124754, + "learning_rate": 9.119719034694138e-06, + "loss": 0.3136, + "step": 3135 + }, + { + "epoch": 1.0823995862782279, + "grad_norm": 1.057752385226104, + "learning_rate": 9.116553039045335e-06, + "loss": 0.2912, + "step": 3140 + }, + { + "epoch": 1.0841234269953457, + "grad_norm": 1.082838286388527, + "learning_rate": 9.113381911865438e-06, + "loss": 0.3005, + "step": 3145 + }, + { + "epoch": 1.0858472677124633, + "grad_norm": 1.207253429285479, + "learning_rate": 9.110205657107442e-06, + "loss": 0.3317, + "step": 3150 + }, + { + "epoch": 1.0875711084295812, + "grad_norm": 2.1789368895253, + "learning_rate": 9.107024278730745e-06, + "loss": 0.3206, + "step": 3155 + }, + { + "epoch": 1.0892949491466988, + "grad_norm": 1.0505887404596654, + "learning_rate": 9.103837780701123e-06, + "loss": 0.3246, + "step": 3160 + }, + { + "epoch": 1.0910187898638166, + "grad_norm": 1.0985766251187308, + "learning_rate": 9.10064616699074e-06, + "loss": 0.318, + "step": 3165 + }, + { + "epoch": 1.0927426305809342, + "grad_norm": 1.6052042628818102, + "learning_rate": 9.097449441578133e-06, + "loss": 0.3143, + "step": 3170 + }, + { + "epoch": 1.094466471298052, + "grad_norm": 1.1837835031734694, + "learning_rate": 9.094247608448212e-06, + "loss": 0.3072, + "step": 3175 + }, + { + "epoch": 1.0961903120151697, + "grad_norm": 1.2079701069646844, + "learning_rate": 9.091040671592255e-06, + "loss": 0.3203, + "step": 3180 + }, + { + "epoch": 1.0979141527322875, + "grad_norm": 1.2820392494380455, + "learning_rate": 9.087828635007905e-06, + "loss": 0.3057, + "step": 3185 + }, + { + "epoch": 1.0996379934494054, + "grad_norm": 1.0910305561507039, + "learning_rate": 9.084611502699156e-06, + "loss": 0.2925, + "step": 3190 + }, + { + "epoch": 1.101361834166523, + "grad_norm": 1.1750103231464422, + "learning_rate": 9.081389278676356e-06, + "loss": 0.3023, + "step": 3195 + }, + { + "epoch": 1.1030856748836408, + "grad_norm": 1.0523181897701774, + "learning_rate": 9.078161966956205e-06, + "loss": 0.3073, + "step": 3200 + }, + { + "epoch": 1.1048095156007585, + "grad_norm": 1.1144835013330965, + "learning_rate": 9.074929571561737e-06, + "loss": 0.3103, + "step": 3205 + }, + { + "epoch": 1.1065333563178763, + "grad_norm": 1.2826801652393287, + "learning_rate": 9.071692096522331e-06, + "loss": 0.3234, + "step": 3210 + }, + { + "epoch": 1.108257197034994, + "grad_norm": 1.2110538700727251, + "learning_rate": 9.068449545873692e-06, + "loss": 0.3247, + "step": 3215 + }, + { + "epoch": 1.1099810377521118, + "grad_norm": 1.1195375320247638, + "learning_rate": 9.065201923657854e-06, + "loss": 0.3239, + "step": 3220 + }, + { + "epoch": 1.1117048784692294, + "grad_norm": 1.1149731135135896, + "learning_rate": 9.061949233923176e-06, + "loss": 0.3068, + "step": 3225 + }, + { + "epoch": 1.1134287191863472, + "grad_norm": 1.095240329352778, + "learning_rate": 9.058691480724329e-06, + "loss": 0.3092, + "step": 3230 + }, + { + "epoch": 1.1151525599034648, + "grad_norm": 1.2147303093492796, + "learning_rate": 9.055428668122302e-06, + "loss": 0.3157, + "step": 3235 + }, + { + "epoch": 1.1168764006205827, + "grad_norm": 1.299461369702117, + "learning_rate": 9.052160800184383e-06, + "loss": 0.319, + "step": 3240 + }, + { + "epoch": 1.1186002413377003, + "grad_norm": 1.0488502649200553, + "learning_rate": 9.04888788098417e-06, + "loss": 0.2859, + "step": 3245 + }, + { + "epoch": 1.1203240820548181, + "grad_norm": 1.6973882557434135, + "learning_rate": 9.04560991460155e-06, + "loss": 0.314, + "step": 3250 + }, + { + "epoch": 1.122047922771936, + "grad_norm": 1.4241709959155497, + "learning_rate": 9.042326905122708e-06, + "loss": 0.3169, + "step": 3255 + }, + { + "epoch": 1.1237717634890536, + "grad_norm": 1.3947509943465393, + "learning_rate": 9.039038856640112e-06, + "loss": 0.3052, + "step": 3260 + }, + { + "epoch": 1.1254956042061712, + "grad_norm": 5.946235156364782, + "learning_rate": 9.035745773252512e-06, + "loss": 0.3199, + "step": 3265 + }, + { + "epoch": 1.127219444923289, + "grad_norm": 1.2064611015080702, + "learning_rate": 9.032447659064936e-06, + "loss": 0.287, + "step": 3270 + }, + { + "epoch": 1.128943285640407, + "grad_norm": 1.1525822962827261, + "learning_rate": 9.029144518188679e-06, + "loss": 0.3187, + "step": 3275 + }, + { + "epoch": 1.1306671263575245, + "grad_norm": 1.0183538734942175, + "learning_rate": 9.02583635474131e-06, + "loss": 0.3084, + "step": 3280 + }, + { + "epoch": 1.1323909670746424, + "grad_norm": 1.0409103846604317, + "learning_rate": 9.022523172846646e-06, + "loss": 0.3147, + "step": 3285 + }, + { + "epoch": 1.13411480779176, + "grad_norm": 1.1631051294259775, + "learning_rate": 9.019204976634774e-06, + "loss": 0.3399, + "step": 3290 + }, + { + "epoch": 1.1358386485088778, + "grad_norm": 1.3342095603729827, + "learning_rate": 9.015881770242024e-06, + "loss": 0.3262, + "step": 3295 + }, + { + "epoch": 1.1375624892259955, + "grad_norm": 1.0923642457198397, + "learning_rate": 9.012553557810973e-06, + "loss": 0.3059, + "step": 3300 + }, + { + "epoch": 1.1392863299431133, + "grad_norm": 1.135252373838748, + "learning_rate": 9.009220343490435e-06, + "loss": 0.3011, + "step": 3305 + }, + { + "epoch": 1.141010170660231, + "grad_norm": 1.3095446317894224, + "learning_rate": 9.005882131435465e-06, + "loss": 0.2911, + "step": 3310 + }, + { + "epoch": 1.1427340113773488, + "grad_norm": 2.2004334678146464, + "learning_rate": 9.002538925807345e-06, + "loss": 0.3129, + "step": 3315 + }, + { + "epoch": 1.1444578520944664, + "grad_norm": 1.1777030293123387, + "learning_rate": 8.999190730773582e-06, + "loss": 0.3194, + "step": 3320 + }, + { + "epoch": 1.1461816928115842, + "grad_norm": 1.955546480394541, + "learning_rate": 8.995837550507903e-06, + "loss": 0.2928, + "step": 3325 + }, + { + "epoch": 1.1479055335287018, + "grad_norm": 1.108216299144341, + "learning_rate": 8.992479389190247e-06, + "loss": 0.3273, + "step": 3330 + }, + { + "epoch": 1.1496293742458197, + "grad_norm": 1.5101398201775058, + "learning_rate": 8.989116251006766e-06, + "loss": 0.2962, + "step": 3335 + }, + { + "epoch": 1.1513532149629375, + "grad_norm": 0.9838854320360326, + "learning_rate": 8.985748140149813e-06, + "loss": 0.3044, + "step": 3340 + }, + { + "epoch": 1.1530770556800551, + "grad_norm": 1.7167124539287208, + "learning_rate": 8.982375060817942e-06, + "loss": 0.318, + "step": 3345 + }, + { + "epoch": 1.154800896397173, + "grad_norm": 1.1990845752020232, + "learning_rate": 8.978997017215897e-06, + "loss": 0.2834, + "step": 3350 + }, + { + "epoch": 1.1565247371142906, + "grad_norm": 1.2563787199271776, + "learning_rate": 8.975614013554619e-06, + "loss": 0.3079, + "step": 3355 + }, + { + "epoch": 1.1582485778314084, + "grad_norm": 1.2228714968733068, + "learning_rate": 8.972226054051217e-06, + "loss": 0.3214, + "step": 3360 + }, + { + "epoch": 1.159972418548526, + "grad_norm": 1.1188032594839905, + "learning_rate": 8.968833142928992e-06, + "loss": 0.3212, + "step": 3365 + }, + { + "epoch": 1.161696259265644, + "grad_norm": 1.1579918395516766, + "learning_rate": 8.96543528441741e-06, + "loss": 0.3457, + "step": 3370 + }, + { + "epoch": 1.1634200999827615, + "grad_norm": 1.270614333694507, + "learning_rate": 8.962032482752107e-06, + "loss": 0.3016, + "step": 3375 + }, + { + "epoch": 1.1651439406998794, + "grad_norm": 1.029638728499931, + "learning_rate": 8.958624742174881e-06, + "loss": 0.3206, + "step": 3380 + }, + { + "epoch": 1.166867781416997, + "grad_norm": 1.0422434350804568, + "learning_rate": 8.955212066933683e-06, + "loss": 0.3261, + "step": 3385 + }, + { + "epoch": 1.1685916221341148, + "grad_norm": 1.166885288746339, + "learning_rate": 8.95179446128262e-06, + "loss": 0.3082, + "step": 3390 + }, + { + "epoch": 1.1703154628512324, + "grad_norm": 1.1527357707503587, + "learning_rate": 8.948371929481941e-06, + "loss": 0.2944, + "step": 3395 + }, + { + "epoch": 1.1720393035683503, + "grad_norm": 1.3451173935652678, + "learning_rate": 8.94494447579804e-06, + "loss": 0.3336, + "step": 3400 + }, + { + "epoch": 1.1737631442854681, + "grad_norm": 1.248928659098544, + "learning_rate": 8.941512104503444e-06, + "loss": 0.3211, + "step": 3405 + }, + { + "epoch": 1.1754869850025857, + "grad_norm": 1.3621113086681973, + "learning_rate": 8.938074819876809e-06, + "loss": 0.3385, + "step": 3410 + }, + { + "epoch": 1.1772108257197036, + "grad_norm": 1.3886706920077845, + "learning_rate": 8.934632626202922e-06, + "loss": 0.3017, + "step": 3415 + }, + { + "epoch": 1.1789346664368212, + "grad_norm": 1.4960290800079372, + "learning_rate": 8.931185527772676e-06, + "loss": 0.2949, + "step": 3420 + }, + { + "epoch": 1.180658507153939, + "grad_norm": 2.142670664410332, + "learning_rate": 8.927733528883094e-06, + "loss": 0.3264, + "step": 3425 + }, + { + "epoch": 1.1823823478710567, + "grad_norm": 1.168035816519859, + "learning_rate": 8.924276633837297e-06, + "loss": 0.2848, + "step": 3430 + }, + { + "epoch": 1.1841061885881745, + "grad_norm": 1.078767182728306, + "learning_rate": 8.920814846944513e-06, + "loss": 0.3182, + "step": 3435 + }, + { + "epoch": 1.1858300293052921, + "grad_norm": 1.1575427200338022, + "learning_rate": 8.917348172520069e-06, + "loss": 0.3453, + "step": 3440 + }, + { + "epoch": 1.18755387002241, + "grad_norm": 1.1751267417332532, + "learning_rate": 8.913876614885381e-06, + "loss": 0.3262, + "step": 3445 + }, + { + "epoch": 1.1892777107395276, + "grad_norm": 2.714565615270745, + "learning_rate": 8.910400178367958e-06, + "loss": 0.3157, + "step": 3450 + }, + { + "epoch": 1.1910015514566454, + "grad_norm": 1.1841621039542385, + "learning_rate": 8.906918867301384e-06, + "loss": 0.2714, + "step": 3455 + }, + { + "epoch": 1.192725392173763, + "grad_norm": 1.2687659930527437, + "learning_rate": 8.903432686025326e-06, + "loss": 0.3344, + "step": 3460 + }, + { + "epoch": 1.194449232890881, + "grad_norm": 1.177974001863437, + "learning_rate": 8.899941638885513e-06, + "loss": 0.3098, + "step": 3465 + }, + { + "epoch": 1.1961730736079987, + "grad_norm": 1.1834203084489565, + "learning_rate": 8.896445730233753e-06, + "loss": 0.2964, + "step": 3470 + }, + { + "epoch": 1.1978969143251164, + "grad_norm": 1.120645544236912, + "learning_rate": 8.892944964427902e-06, + "loss": 0.306, + "step": 3475 + }, + { + "epoch": 1.199620755042234, + "grad_norm": 1.5043334332333145, + "learning_rate": 8.889439345831873e-06, + "loss": 0.3282, + "step": 3480 + }, + { + "epoch": 1.2013445957593518, + "grad_norm": 1.2627887443563284, + "learning_rate": 8.885928878815635e-06, + "loss": 0.3183, + "step": 3485 + }, + { + "epoch": 1.2030684364764697, + "grad_norm": 1.3312654827479626, + "learning_rate": 8.882413567755196e-06, + "loss": 0.3118, + "step": 3490 + }, + { + "epoch": 1.2047922771935873, + "grad_norm": 1.1999457191173653, + "learning_rate": 8.8788934170326e-06, + "loss": 0.3009, + "step": 3495 + }, + { + "epoch": 1.2065161179107051, + "grad_norm": 1.2149297405117525, + "learning_rate": 8.87536843103593e-06, + "loss": 0.302, + "step": 3500 + }, + { + "epoch": 1.2082399586278227, + "grad_norm": 1.5737309802337405, + "learning_rate": 8.87183861415929e-06, + "loss": 0.3134, + "step": 3505 + }, + { + "epoch": 1.2099637993449406, + "grad_norm": 1.2583340120837145, + "learning_rate": 8.868303970802812e-06, + "loss": 0.2971, + "step": 3510 + }, + { + "epoch": 1.2116876400620582, + "grad_norm": 1.2160266975293088, + "learning_rate": 8.864764505372638e-06, + "loss": 0.336, + "step": 3515 + }, + { + "epoch": 1.213411480779176, + "grad_norm": 1.1282174036075983, + "learning_rate": 8.86122022228093e-06, + "loss": 0.3225, + "step": 3520 + }, + { + "epoch": 1.2151353214962937, + "grad_norm": 1.0651429482619807, + "learning_rate": 8.857671125945846e-06, + "loss": 0.3159, + "step": 3525 + }, + { + "epoch": 1.2168591622134115, + "grad_norm": 1.1616952116761117, + "learning_rate": 8.854117220791549e-06, + "loss": 0.3183, + "step": 3530 + }, + { + "epoch": 1.2185830029305291, + "grad_norm": 1.1444499355129203, + "learning_rate": 8.850558511248195e-06, + "loss": 0.2943, + "step": 3535 + }, + { + "epoch": 1.220306843647647, + "grad_norm": 2.159298770877469, + "learning_rate": 8.846995001751932e-06, + "loss": 0.2877, + "step": 3540 + }, + { + "epoch": 1.2220306843647646, + "grad_norm": 1.1699216183514227, + "learning_rate": 8.843426696744888e-06, + "loss": 0.3311, + "step": 3545 + }, + { + "epoch": 1.2237545250818824, + "grad_norm": 1.5762911225905711, + "learning_rate": 8.83985360067517e-06, + "loss": 0.2807, + "step": 3550 + }, + { + "epoch": 1.2254783657990003, + "grad_norm": 1.1775995024929828, + "learning_rate": 8.836275717996853e-06, + "loss": 0.3127, + "step": 3555 + }, + { + "epoch": 1.2272022065161179, + "grad_norm": 1.2924795578245347, + "learning_rate": 8.832693053169991e-06, + "loss": 0.2866, + "step": 3560 + }, + { + "epoch": 1.2289260472332357, + "grad_norm": 1.0826637187224928, + "learning_rate": 8.829105610660587e-06, + "loss": 0.334, + "step": 3565 + }, + { + "epoch": 1.2306498879503533, + "grad_norm": 1.0878241010781369, + "learning_rate": 8.825513394940604e-06, + "loss": 0.3189, + "step": 3570 + }, + { + "epoch": 1.2323737286674712, + "grad_norm": 1.6919986779201839, + "learning_rate": 8.821916410487955e-06, + "loss": 0.2935, + "step": 3575 + }, + { + "epoch": 1.2340975693845888, + "grad_norm": 1.196708144055184, + "learning_rate": 8.818314661786496e-06, + "loss": 0.3207, + "step": 3580 + }, + { + "epoch": 1.2358214101017067, + "grad_norm": 1.2977363127019375, + "learning_rate": 8.814708153326025e-06, + "loss": 0.3441, + "step": 3585 + }, + { + "epoch": 1.2375452508188243, + "grad_norm": 5.186676555016642, + "learning_rate": 8.811096889602275e-06, + "loss": 0.3031, + "step": 3590 + }, + { + "epoch": 1.2392690915359421, + "grad_norm": 1.273320277381821, + "learning_rate": 8.807480875116901e-06, + "loss": 0.3225, + "step": 3595 + }, + { + "epoch": 1.2409929322530597, + "grad_norm": 1.457400171271567, + "learning_rate": 8.80386011437748e-06, + "loss": 0.3329, + "step": 3600 + }, + { + "epoch": 1.2427167729701776, + "grad_norm": 1.1685070126812422, + "learning_rate": 8.800234611897513e-06, + "loss": 0.3049, + "step": 3605 + }, + { + "epoch": 1.2444406136872952, + "grad_norm": 1.222717746578087, + "learning_rate": 8.796604372196401e-06, + "loss": 0.3019, + "step": 3610 + }, + { + "epoch": 1.246164454404413, + "grad_norm": 1.187029207324435, + "learning_rate": 8.792969399799464e-06, + "loss": 0.2978, + "step": 3615 + }, + { + "epoch": 1.2478882951215309, + "grad_norm": 2.264623865148118, + "learning_rate": 8.789329699237907e-06, + "loss": 0.3225, + "step": 3620 + }, + { + "epoch": 1.2496121358386485, + "grad_norm": 1.3352759557893117, + "learning_rate": 8.78568527504884e-06, + "loss": 0.3437, + "step": 3625 + }, + { + "epoch": 1.2513359765557661, + "grad_norm": 1.1603793017125967, + "learning_rate": 8.782036131775255e-06, + "loss": 0.3106, + "step": 3630 + }, + { + "epoch": 1.253059817272884, + "grad_norm": 1.0931930826137783, + "learning_rate": 8.77838227396603e-06, + "loss": 0.3369, + "step": 3635 + }, + { + "epoch": 1.2547836579900018, + "grad_norm": 1.101995328424067, + "learning_rate": 8.774723706175919e-06, + "loss": 0.3173, + "step": 3640 + }, + { + "epoch": 1.2565074987071194, + "grad_norm": 1.301541638576734, + "learning_rate": 8.771060432965543e-06, + "loss": 0.2696, + "step": 3645 + }, + { + "epoch": 1.2582313394242373, + "grad_norm": 1.7329598603808272, + "learning_rate": 8.767392458901395e-06, + "loss": 0.3086, + "step": 3650 + }, + { + "epoch": 1.2599551801413549, + "grad_norm": 1.143101022541705, + "learning_rate": 8.76371978855583e-06, + "loss": 0.2676, + "step": 3655 + }, + { + "epoch": 1.2616790208584727, + "grad_norm": 1.358724583745211, + "learning_rate": 8.760042426507044e-06, + "loss": 0.2685, + "step": 3660 + }, + { + "epoch": 1.2634028615755903, + "grad_norm": 1.2314537967737824, + "learning_rate": 8.756360377339097e-06, + "loss": 0.2943, + "step": 3665 + }, + { + "epoch": 1.2651267022927082, + "grad_norm": 1.2763524736651122, + "learning_rate": 8.752673645641882e-06, + "loss": 0.2538, + "step": 3670 + }, + { + "epoch": 1.2668505430098258, + "grad_norm": 1.3379171604640265, + "learning_rate": 8.748982236011132e-06, + "loss": 0.319, + "step": 3675 + }, + { + "epoch": 1.2685743837269436, + "grad_norm": 1.1703348158657632, + "learning_rate": 8.74528615304841e-06, + "loss": 0.33, + "step": 3680 + }, + { + "epoch": 1.2702982244440615, + "grad_norm": 2.208958053875747, + "learning_rate": 8.74158540136111e-06, + "loss": 0.3147, + "step": 3685 + }, + { + "epoch": 1.272022065161179, + "grad_norm": 1.0576540493714783, + "learning_rate": 8.737879985562437e-06, + "loss": 0.3166, + "step": 3690 + }, + { + "epoch": 1.2737459058782967, + "grad_norm": 1.3831866611618326, + "learning_rate": 8.734169910271418e-06, + "loss": 0.3221, + "step": 3695 + }, + { + "epoch": 1.2754697465954146, + "grad_norm": 1.266831655506686, + "learning_rate": 8.730455180112885e-06, + "loss": 0.311, + "step": 3700 + }, + { + "epoch": 1.2771935873125324, + "grad_norm": 1.1691148138632521, + "learning_rate": 8.72673579971747e-06, + "loss": 0.3119, + "step": 3705 + }, + { + "epoch": 1.27891742802965, + "grad_norm": 1.1632580575334266, + "learning_rate": 8.723011773721606e-06, + "loss": 0.3021, + "step": 3710 + }, + { + "epoch": 1.2806412687467679, + "grad_norm": 1.3312894813926992, + "learning_rate": 8.719283106767515e-06, + "loss": 0.3322, + "step": 3715 + }, + { + "epoch": 1.2823651094638855, + "grad_norm": 1.2394797838694025, + "learning_rate": 8.715549803503206e-06, + "loss": 0.3199, + "step": 3720 + }, + { + "epoch": 1.2840889501810033, + "grad_norm": 1.1460264439012757, + "learning_rate": 8.711811868582469e-06, + "loss": 0.3094, + "step": 3725 + }, + { + "epoch": 1.285812790898121, + "grad_norm": 1.8253868322485098, + "learning_rate": 8.708069306664857e-06, + "loss": 0.3091, + "step": 3730 + }, + { + "epoch": 1.2875366316152388, + "grad_norm": 1.174995699726055, + "learning_rate": 8.704322122415705e-06, + "loss": 0.3072, + "step": 3735 + }, + { + "epoch": 1.2892604723323564, + "grad_norm": 1.2405692201112009, + "learning_rate": 8.7005703205061e-06, + "loss": 0.2784, + "step": 3740 + }, + { + "epoch": 1.2909843130494743, + "grad_norm": 1.2139381271740364, + "learning_rate": 8.696813905612894e-06, + "loss": 0.315, + "step": 3745 + }, + { + "epoch": 1.292708153766592, + "grad_norm": 1.1218970511407682, + "learning_rate": 8.693052882418679e-06, + "loss": 0.275, + "step": 3750 + }, + { + "epoch": 1.2944319944837097, + "grad_norm": 1.315075351816208, + "learning_rate": 8.689287255611798e-06, + "loss": 0.2777, + "step": 3755 + }, + { + "epoch": 1.2961558352008273, + "grad_norm": 1.2052547476295588, + "learning_rate": 8.685517029886333e-06, + "loss": 0.3089, + "step": 3760 + }, + { + "epoch": 1.2978796759179452, + "grad_norm": 1.1711089575310607, + "learning_rate": 8.681742209942097e-06, + "loss": 0.3102, + "step": 3765 + }, + { + "epoch": 1.299603516635063, + "grad_norm": 2.5907011576274526, + "learning_rate": 8.677962800484628e-06, + "loss": 0.3102, + "step": 3770 + }, + { + "epoch": 1.3013273573521806, + "grad_norm": 1.2269836411875583, + "learning_rate": 8.674178806225189e-06, + "loss": 0.311, + "step": 3775 + }, + { + "epoch": 1.3030511980692985, + "grad_norm": 10.195710693149598, + "learning_rate": 8.670390231880757e-06, + "loss": 0.2927, + "step": 3780 + }, + { + "epoch": 1.304775038786416, + "grad_norm": 4.387608525503374, + "learning_rate": 8.666597082174018e-06, + "loss": 0.3526, + "step": 3785 + }, + { + "epoch": 1.306498879503534, + "grad_norm": 2.4775231725757676, + "learning_rate": 8.662799361833358e-06, + "loss": 0.3559, + "step": 3790 + }, + { + "epoch": 1.3082227202206516, + "grad_norm": 1.7873721070217716, + "learning_rate": 8.65899707559287e-06, + "loss": 0.297, + "step": 3795 + }, + { + "epoch": 1.3099465609377694, + "grad_norm": 1.209675958623188, + "learning_rate": 8.655190228192327e-06, + "loss": 0.2945, + "step": 3800 + }, + { + "epoch": 1.311670401654887, + "grad_norm": 1.2066551121091627, + "learning_rate": 8.651378824377197e-06, + "loss": 0.3098, + "step": 3805 + }, + { + "epoch": 1.3133942423720049, + "grad_norm": 1.1850448243433895, + "learning_rate": 8.647562868898623e-06, + "loss": 0.2962, + "step": 3810 + }, + { + "epoch": 1.3151180830891227, + "grad_norm": 1.1464089305962142, + "learning_rate": 8.643742366513421e-06, + "loss": 0.3074, + "step": 3815 + }, + { + "epoch": 1.3168419238062403, + "grad_norm": 1.3818831810491352, + "learning_rate": 8.639917321984081e-06, + "loss": 0.3102, + "step": 3820 + }, + { + "epoch": 1.318565764523358, + "grad_norm": 1.6127204863145952, + "learning_rate": 8.636087740078749e-06, + "loss": 0.3085, + "step": 3825 + }, + { + "epoch": 1.3202896052404758, + "grad_norm": 1.1506016824908816, + "learning_rate": 8.63225362557123e-06, + "loss": 0.3119, + "step": 3830 + }, + { + "epoch": 1.3220134459575936, + "grad_norm": 1.1695078929991647, + "learning_rate": 8.628414983240978e-06, + "loss": 0.2929, + "step": 3835 + }, + { + "epoch": 1.3237372866747112, + "grad_norm": 1.3850274658923796, + "learning_rate": 8.62457181787309e-06, + "loss": 0.2963, + "step": 3840 + }, + { + "epoch": 1.3254611273918289, + "grad_norm": 1.4119811784754794, + "learning_rate": 8.620724134258308e-06, + "loss": 0.2986, + "step": 3845 + }, + { + "epoch": 1.3271849681089467, + "grad_norm": 1.1063803465154691, + "learning_rate": 8.616871937192995e-06, + "loss": 0.3162, + "step": 3850 + }, + { + "epoch": 1.3289088088260645, + "grad_norm": 1.247495661891309, + "learning_rate": 8.61301523147915e-06, + "loss": 0.2842, + "step": 3855 + }, + { + "epoch": 1.3306326495431822, + "grad_norm": 1.1480440865690702, + "learning_rate": 8.60915402192439e-06, + "loss": 0.2823, + "step": 3860 + }, + { + "epoch": 1.3323564902603, + "grad_norm": 1.2707825018347805, + "learning_rate": 8.605288313341942e-06, + "loss": 0.286, + "step": 3865 + }, + { + "epoch": 1.3340803309774176, + "grad_norm": 1.2614854491642877, + "learning_rate": 8.601418110550645e-06, + "loss": 0.3255, + "step": 3870 + }, + { + "epoch": 1.3358041716945355, + "grad_norm": 1.591165196055561, + "learning_rate": 8.597543418374943e-06, + "loss": 0.3394, + "step": 3875 + }, + { + "epoch": 1.337528012411653, + "grad_norm": 1.103027484329325, + "learning_rate": 8.593664241644868e-06, + "loss": 0.3165, + "step": 3880 + }, + { + "epoch": 1.339251853128771, + "grad_norm": 1.20833888216885, + "learning_rate": 8.58978058519605e-06, + "loss": 0.2591, + "step": 3885 + }, + { + "epoch": 1.3409756938458886, + "grad_norm": 1.3884345182667164, + "learning_rate": 8.5858924538697e-06, + "loss": 0.2952, + "step": 3890 + }, + { + "epoch": 1.3426995345630064, + "grad_norm": 1.1228948582398766, + "learning_rate": 8.581999852512606e-06, + "loss": 0.3159, + "step": 3895 + }, + { + "epoch": 1.3444233752801242, + "grad_norm": 1.4477570266687043, + "learning_rate": 8.578102785977134e-06, + "loss": 0.3326, + "step": 3900 + }, + { + "epoch": 1.3461472159972419, + "grad_norm": 1.1223799822191507, + "learning_rate": 8.574201259121208e-06, + "loss": 0.2989, + "step": 3905 + }, + { + "epoch": 1.3478710567143595, + "grad_norm": 1.2260172584010405, + "learning_rate": 8.570295276808319e-06, + "loss": 0.313, + "step": 3910 + }, + { + "epoch": 1.3495948974314773, + "grad_norm": 1.1391685796338593, + "learning_rate": 8.566384843907505e-06, + "loss": 0.2791, + "step": 3915 + }, + { + "epoch": 1.3513187381485952, + "grad_norm": 1.137208901102679, + "learning_rate": 8.562469965293361e-06, + "loss": 0.2956, + "step": 3920 + }, + { + "epoch": 1.3530425788657128, + "grad_norm": 1.0734438310141086, + "learning_rate": 8.558550645846015e-06, + "loss": 0.2922, + "step": 3925 + }, + { + "epoch": 1.3547664195828306, + "grad_norm": 1.5072972449316966, + "learning_rate": 8.554626890451137e-06, + "loss": 0.2889, + "step": 3930 + }, + { + "epoch": 1.3564902602999482, + "grad_norm": 1.111309813125269, + "learning_rate": 8.550698703999922e-06, + "loss": 0.2855, + "step": 3935 + }, + { + "epoch": 1.358214101017066, + "grad_norm": 1.3056420462285931, + "learning_rate": 8.546766091389091e-06, + "loss": 0.3283, + "step": 3940 + }, + { + "epoch": 1.3599379417341837, + "grad_norm": 1.199834987424094, + "learning_rate": 8.542829057520884e-06, + "loss": 0.28, + "step": 3945 + }, + { + "epoch": 1.3616617824513015, + "grad_norm": 1.3191225232525428, + "learning_rate": 8.538887607303052e-06, + "loss": 0.274, + "step": 3950 + }, + { + "epoch": 1.3633856231684192, + "grad_norm": 1.1068722563144409, + "learning_rate": 8.534941745648845e-06, + "loss": 0.275, + "step": 3955 + }, + { + "epoch": 1.365109463885537, + "grad_norm": 1.078920667366664, + "learning_rate": 8.53099147747702e-06, + "loss": 0.3191, + "step": 3960 + }, + { + "epoch": 1.3668333046026548, + "grad_norm": 1.1455257685642934, + "learning_rate": 8.527036807711825e-06, + "loss": 0.295, + "step": 3965 + }, + { + "epoch": 1.3685571453197725, + "grad_norm": 1.305899035365651, + "learning_rate": 8.523077741282991e-06, + "loss": 0.2896, + "step": 3970 + }, + { + "epoch": 1.37028098603689, + "grad_norm": 1.1545883100319863, + "learning_rate": 8.519114283125736e-06, + "loss": 0.3105, + "step": 3975 + }, + { + "epoch": 1.372004826754008, + "grad_norm": 1.2535401547804015, + "learning_rate": 8.515146438180745e-06, + "loss": 0.3319, + "step": 3980 + }, + { + "epoch": 1.3737286674711258, + "grad_norm": 1.186121490544396, + "learning_rate": 8.511174211394178e-06, + "loss": 0.3168, + "step": 3985 + }, + { + "epoch": 1.3754525081882434, + "grad_norm": 1.036125970260004, + "learning_rate": 8.507197607717656e-06, + "loss": 0.3019, + "step": 3990 + }, + { + "epoch": 1.3771763489053612, + "grad_norm": 1.8719458982246997, + "learning_rate": 8.503216632108253e-06, + "loss": 0.3021, + "step": 3995 + }, + { + "epoch": 1.3789001896224788, + "grad_norm": 1.1309458795923768, + "learning_rate": 8.499231289528495e-06, + "loss": 0.2791, + "step": 4000 + }, + { + "epoch": 1.3806240303395967, + "grad_norm": 1.1247825532724618, + "learning_rate": 8.49524158494635e-06, + "loss": 0.3131, + "step": 4005 + }, + { + "epoch": 1.3823478710567143, + "grad_norm": 1.1592309305579953, + "learning_rate": 8.491247523335227e-06, + "loss": 0.2321, + "step": 4010 + }, + { + "epoch": 1.3840717117738321, + "grad_norm": 1.0672390832017877, + "learning_rate": 8.487249109673963e-06, + "loss": 0.308, + "step": 4015 + }, + { + "epoch": 1.3857955524909498, + "grad_norm": 1.020286154089578, + "learning_rate": 8.483246348946823e-06, + "loss": 0.3069, + "step": 4020 + }, + { + "epoch": 1.3875193932080676, + "grad_norm": 1.5815639886180672, + "learning_rate": 8.479239246143487e-06, + "loss": 0.3255, + "step": 4025 + }, + { + "epoch": 1.3892432339251855, + "grad_norm": 1.224480348243422, + "learning_rate": 8.47522780625905e-06, + "loss": 0.279, + "step": 4030 + }, + { + "epoch": 1.390967074642303, + "grad_norm": 1.0510017728348102, + "learning_rate": 8.471212034294013e-06, + "loss": 0.3068, + "step": 4035 + }, + { + "epoch": 1.3926909153594207, + "grad_norm": 1.1979579443327188, + "learning_rate": 8.46719193525428e-06, + "loss": 0.295, + "step": 4040 + }, + { + "epoch": 1.3944147560765385, + "grad_norm": 1.1436237072016693, + "learning_rate": 8.463167514151142e-06, + "loss": 0.2952, + "step": 4045 + }, + { + "epoch": 1.3961385967936564, + "grad_norm": 2.372645789701561, + "learning_rate": 8.459138776001287e-06, + "loss": 0.2759, + "step": 4050 + }, + { + "epoch": 1.397862437510774, + "grad_norm": 1.5358038033861106, + "learning_rate": 8.455105725826776e-06, + "loss": 0.3102, + "step": 4055 + }, + { + "epoch": 1.3995862782278916, + "grad_norm": 1.0942800078744155, + "learning_rate": 8.451068368655051e-06, + "loss": 0.298, + "step": 4060 + }, + { + "epoch": 1.4013101189450095, + "grad_norm": 1.2498191988881162, + "learning_rate": 8.447026709518917e-06, + "loss": 0.2881, + "step": 4065 + }, + { + "epoch": 1.4030339596621273, + "grad_norm": 1.103501423611643, + "learning_rate": 8.44298075345655e-06, + "loss": 0.2708, + "step": 4070 + }, + { + "epoch": 1.404757800379245, + "grad_norm": 1.140581080023868, + "learning_rate": 8.438930505511476e-06, + "loss": 0.3046, + "step": 4075 + }, + { + "epoch": 1.4064816410963628, + "grad_norm": 1.112135439548782, + "learning_rate": 8.434875970732573e-06, + "loss": 0.2974, + "step": 4080 + }, + { + "epoch": 1.4082054818134804, + "grad_norm": 1.2195968452445054, + "learning_rate": 8.430817154174061e-06, + "loss": 0.2945, + "step": 4085 + }, + { + "epoch": 1.4099293225305982, + "grad_norm": 1.1857667376896035, + "learning_rate": 8.426754060895499e-06, + "loss": 0.3017, + "step": 4090 + }, + { + "epoch": 1.4116531632477158, + "grad_norm": 1.224210577295516, + "learning_rate": 8.42268669596178e-06, + "loss": 0.2883, + "step": 4095 + }, + { + "epoch": 1.4133770039648337, + "grad_norm": 1.3165556809012575, + "learning_rate": 8.418615064443116e-06, + "loss": 0.2662, + "step": 4100 + }, + { + "epoch": 1.4151008446819513, + "grad_norm": 1.239487565939614, + "learning_rate": 8.414539171415044e-06, + "loss": 0.2883, + "step": 4105 + }, + { + "epoch": 1.4168246853990691, + "grad_norm": 1.1353859309595264, + "learning_rate": 8.410459021958407e-06, + "loss": 0.2592, + "step": 4110 + }, + { + "epoch": 1.418548526116187, + "grad_norm": 1.2239330978333636, + "learning_rate": 8.40637462115936e-06, + "loss": 0.2978, + "step": 4115 + }, + { + "epoch": 1.4202723668333046, + "grad_norm": 1.1931010142867433, + "learning_rate": 8.402285974109351e-06, + "loss": 0.2961, + "step": 4120 + }, + { + "epoch": 1.4219962075504222, + "grad_norm": 1.1543151925906574, + "learning_rate": 8.398193085905129e-06, + "loss": 0.2951, + "step": 4125 + }, + { + "epoch": 1.42372004826754, + "grad_norm": 1.0186883594497838, + "learning_rate": 8.394095961648719e-06, + "loss": 0.2943, + "step": 4130 + }, + { + "epoch": 1.425443888984658, + "grad_norm": 1.6396849506994637, + "learning_rate": 8.389994606447438e-06, + "loss": 0.2922, + "step": 4135 + }, + { + "epoch": 1.4271677297017755, + "grad_norm": 1.055836727069913, + "learning_rate": 8.38588902541387e-06, + "loss": 0.2993, + "step": 4140 + }, + { + "epoch": 1.4288915704188934, + "grad_norm": 5.517045291697795, + "learning_rate": 8.381779223665871e-06, + "loss": 0.3013, + "step": 4145 + }, + { + "epoch": 1.430615411136011, + "grad_norm": 1.505632142957612, + "learning_rate": 8.377665206326554e-06, + "loss": 0.2799, + "step": 4150 + }, + { + "epoch": 1.4323392518531288, + "grad_norm": 1.1203779436713657, + "learning_rate": 8.373546978524288e-06, + "loss": 0.3005, + "step": 4155 + }, + { + "epoch": 1.4340630925702464, + "grad_norm": 1.2089066705710734, + "learning_rate": 8.369424545392694e-06, + "loss": 0.295, + "step": 4160 + }, + { + "epoch": 1.4357869332873643, + "grad_norm": 1.0271157769609536, + "learning_rate": 8.365297912070635e-06, + "loss": 0.2915, + "step": 4165 + }, + { + "epoch": 1.437510774004482, + "grad_norm": 1.9142291281162835, + "learning_rate": 8.361167083702204e-06, + "loss": 0.2905, + "step": 4170 + }, + { + "epoch": 1.4392346147215997, + "grad_norm": 1.2842701676122075, + "learning_rate": 8.357032065436728e-06, + "loss": 0.3133, + "step": 4175 + }, + { + "epoch": 1.4409584554387176, + "grad_norm": 1.168564641232761, + "learning_rate": 8.35289286242876e-06, + "loss": 0.2974, + "step": 4180 + }, + { + "epoch": 1.4426822961558352, + "grad_norm": 1.2654640122040623, + "learning_rate": 8.348749479838057e-06, + "loss": 0.2644, + "step": 4185 + }, + { + "epoch": 1.4444061368729528, + "grad_norm": 1.279875810813251, + "learning_rate": 8.344601922829603e-06, + "loss": 0.3063, + "step": 4190 + }, + { + "epoch": 1.4461299775900707, + "grad_norm": 1.1989896556917319, + "learning_rate": 8.340450196573574e-06, + "loss": 0.2793, + "step": 4195 + }, + { + "epoch": 1.4478538183071885, + "grad_norm": 1.1684699927310496, + "learning_rate": 8.336294306245347e-06, + "loss": 0.2764, + "step": 4200 + }, + { + "epoch": 1.4495776590243061, + "grad_norm": 1.671482671844399, + "learning_rate": 8.332134257025491e-06, + "loss": 0.2918, + "step": 4205 + }, + { + "epoch": 1.451301499741424, + "grad_norm": 0.9660048161905248, + "learning_rate": 8.327970054099754e-06, + "loss": 0.2989, + "step": 4210 + }, + { + "epoch": 1.4530253404585416, + "grad_norm": 1.220434293069677, + "learning_rate": 8.323801702659069e-06, + "loss": 0.2944, + "step": 4215 + }, + { + "epoch": 1.4547491811756594, + "grad_norm": 1.5616478369713984, + "learning_rate": 8.319629207899536e-06, + "loss": 0.2762, + "step": 4220 + }, + { + "epoch": 1.456473021892777, + "grad_norm": 1.0774039544402243, + "learning_rate": 8.315452575022418e-06, + "loss": 0.2769, + "step": 4225 + }, + { + "epoch": 1.458196862609895, + "grad_norm": 1.1145790915355014, + "learning_rate": 8.311271809234145e-06, + "loss": 0.3287, + "step": 4230 + }, + { + "epoch": 1.4599207033270125, + "grad_norm": 1.1681505383079978, + "learning_rate": 8.307086915746288e-06, + "loss": 0.2917, + "step": 4235 + }, + { + "epoch": 1.4616445440441304, + "grad_norm": 1.1253000125643369, + "learning_rate": 8.302897899775571e-06, + "loss": 0.2845, + "step": 4240 + }, + { + "epoch": 1.4633683847612482, + "grad_norm": 2.6969917963864947, + "learning_rate": 8.298704766543853e-06, + "loss": 0.3195, + "step": 4245 + }, + { + "epoch": 1.4650922254783658, + "grad_norm": 2.071204538351495, + "learning_rate": 8.294507521278127e-06, + "loss": 0.2963, + "step": 4250 + }, + { + "epoch": 1.4668160661954834, + "grad_norm": 1.1722313659463146, + "learning_rate": 8.290306169210516e-06, + "loss": 0.2927, + "step": 4255 + }, + { + "epoch": 1.4685399069126013, + "grad_norm": 1.1691652127028054, + "learning_rate": 8.286100715578254e-06, + "loss": 0.2743, + "step": 4260 + }, + { + "epoch": 1.4702637476297191, + "grad_norm": 1.1798942644502195, + "learning_rate": 8.281891165623693e-06, + "loss": 0.2761, + "step": 4265 + }, + { + "epoch": 1.4719875883468367, + "grad_norm": 1.1121187262116956, + "learning_rate": 8.277677524594288e-06, + "loss": 0.3109, + "step": 4270 + }, + { + "epoch": 1.4737114290639544, + "grad_norm": 1.4202418924078146, + "learning_rate": 8.2734597977426e-06, + "loss": 0.2891, + "step": 4275 + }, + { + "epoch": 1.4754352697810722, + "grad_norm": 1.112394836998521, + "learning_rate": 8.269237990326278e-06, + "loss": 0.3077, + "step": 4280 + }, + { + "epoch": 1.47715911049819, + "grad_norm": 1.0986260704553183, + "learning_rate": 8.265012107608057e-06, + "loss": 0.2732, + "step": 4285 + }, + { + "epoch": 1.4788829512153077, + "grad_norm": 1.1397736528472096, + "learning_rate": 8.260782154855757e-06, + "loss": 0.2938, + "step": 4290 + }, + { + "epoch": 1.4806067919324255, + "grad_norm": 3.0310989331718274, + "learning_rate": 8.256548137342268e-06, + "loss": 0.2997, + "step": 4295 + }, + { + "epoch": 1.4823306326495431, + "grad_norm": 1.414759567180519, + "learning_rate": 8.252310060345546e-06, + "loss": 0.302, + "step": 4300 + }, + { + "epoch": 1.484054473366661, + "grad_norm": 1.1273109488248374, + "learning_rate": 8.248067929148612e-06, + "loss": 0.2747, + "step": 4305 + }, + { + "epoch": 1.4857783140837786, + "grad_norm": 1.0138568921712012, + "learning_rate": 8.243821749039534e-06, + "loss": 0.2731, + "step": 4310 + }, + { + "epoch": 1.4875021548008964, + "grad_norm": 1.0829130740936297, + "learning_rate": 8.239571525311433e-06, + "loss": 0.2902, + "step": 4315 + }, + { + "epoch": 1.489225995518014, + "grad_norm": 1.1966939995575037, + "learning_rate": 8.23531726326247e-06, + "loss": 0.3393, + "step": 4320 + }, + { + "epoch": 1.4909498362351319, + "grad_norm": 0.9337724122434297, + "learning_rate": 8.231058968195838e-06, + "loss": 0.2982, + "step": 4325 + }, + { + "epoch": 1.4926736769522497, + "grad_norm": 1.0707804685307505, + "learning_rate": 8.226796645419758e-06, + "loss": 0.3136, + "step": 4330 + }, + { + "epoch": 1.4943975176693673, + "grad_norm": 1.141925415485157, + "learning_rate": 8.222530300247467e-06, + "loss": 0.3067, + "step": 4335 + }, + { + "epoch": 1.496121358386485, + "grad_norm": 1.2168700457279291, + "learning_rate": 8.218259937997228e-06, + "loss": 0.2901, + "step": 4340 + }, + { + "epoch": 1.4978451991036028, + "grad_norm": 1.0017932593769117, + "learning_rate": 8.213985563992302e-06, + "loss": 0.2752, + "step": 4345 + }, + { + "epoch": 1.4995690398207207, + "grad_norm": 1.1784743251912764, + "learning_rate": 8.209707183560953e-06, + "loss": 0.3186, + "step": 4350 + }, + { + "epoch": 1.5012928805378383, + "grad_norm": 1.072869339665256, + "learning_rate": 8.20542480203644e-06, + "loss": 0.2826, + "step": 4355 + }, + { + "epoch": 1.503016721254956, + "grad_norm": 1.1017133539730921, + "learning_rate": 8.201138424757008e-06, + "loss": 0.2905, + "step": 4360 + }, + { + "epoch": 1.5047405619720737, + "grad_norm": 1.0922456832653578, + "learning_rate": 8.196848057065887e-06, + "loss": 0.2839, + "step": 4365 + }, + { + "epoch": 1.5064644026891916, + "grad_norm": 1.1740104467786454, + "learning_rate": 8.192553704311277e-06, + "loss": 0.3002, + "step": 4370 + }, + { + "epoch": 1.5081882434063094, + "grad_norm": 4.204163727357052, + "learning_rate": 8.188255371846347e-06, + "loss": 0.3062, + "step": 4375 + }, + { + "epoch": 1.509912084123427, + "grad_norm": 1.2772967129642088, + "learning_rate": 8.183953065029226e-06, + "loss": 0.2975, + "step": 4380 + }, + { + "epoch": 1.5116359248405447, + "grad_norm": 1.5168363584089413, + "learning_rate": 8.179646789223e-06, + "loss": 0.3037, + "step": 4385 + }, + { + "epoch": 1.5133597655576625, + "grad_norm": 1.4788250289482368, + "learning_rate": 8.175336549795701e-06, + "loss": 0.3435, + "step": 4390 + }, + { + "epoch": 1.5150836062747803, + "grad_norm": 1.1003016752913906, + "learning_rate": 8.1710223521203e-06, + "loss": 0.281, + "step": 4395 + }, + { + "epoch": 1.516807446991898, + "grad_norm": 1.0711099126215438, + "learning_rate": 8.166704201574707e-06, + "loss": 0.2731, + "step": 4400 + }, + { + "epoch": 1.5185312877090156, + "grad_norm": 1.249983606211235, + "learning_rate": 8.162382103541755e-06, + "loss": 0.289, + "step": 4405 + }, + { + "epoch": 1.5202551284261334, + "grad_norm": 1.107332976953771, + "learning_rate": 8.158056063409198e-06, + "loss": 0.2864, + "step": 4410 + }, + { + "epoch": 1.5219789691432513, + "grad_norm": 1.2623515697126584, + "learning_rate": 8.153726086569707e-06, + "loss": 0.3027, + "step": 4415 + }, + { + "epoch": 1.5237028098603689, + "grad_norm": 1.1052823756210386, + "learning_rate": 8.149392178420858e-06, + "loss": 0.2944, + "step": 4420 + }, + { + "epoch": 1.5254266505774865, + "grad_norm": 1.0476709242070792, + "learning_rate": 8.14505434436513e-06, + "loss": 0.2987, + "step": 4425 + }, + { + "epoch": 1.5271504912946043, + "grad_norm": 1.1317688824371155, + "learning_rate": 8.140712589809891e-06, + "loss": 0.2663, + "step": 4430 + }, + { + "epoch": 1.5288743320117222, + "grad_norm": 1.1905840008315134, + "learning_rate": 8.136366920167403e-06, + "loss": 0.2643, + "step": 4435 + }, + { + "epoch": 1.5305981727288398, + "grad_norm": 1.137180132429372, + "learning_rate": 8.1320173408548e-06, + "loss": 0.2906, + "step": 4440 + }, + { + "epoch": 1.5323220134459576, + "grad_norm": 1.0753074044669657, + "learning_rate": 8.1276638572941e-06, + "loss": 0.3385, + "step": 4445 + }, + { + "epoch": 1.5340458541630753, + "grad_norm": 1.167513808635872, + "learning_rate": 8.123306474912178e-06, + "loss": 0.2944, + "step": 4450 + }, + { + "epoch": 1.535769694880193, + "grad_norm": 1.3897430885477142, + "learning_rate": 8.118945199140774e-06, + "loss": 0.2965, + "step": 4455 + }, + { + "epoch": 1.537493535597311, + "grad_norm": 1.0462935277782224, + "learning_rate": 8.114580035416484e-06, + "loss": 0.2813, + "step": 4460 + }, + { + "epoch": 1.5392173763144286, + "grad_norm": 0.9442234706910525, + "learning_rate": 8.110210989180742e-06, + "loss": 0.2856, + "step": 4465 + }, + { + "epoch": 1.5409412170315462, + "grad_norm": 1.2880096731566573, + "learning_rate": 8.105838065879832e-06, + "loss": 0.2972, + "step": 4470 + }, + { + "epoch": 1.542665057748664, + "grad_norm": 1.1817976972804822, + "learning_rate": 8.101461270964863e-06, + "loss": 0.3071, + "step": 4475 + }, + { + "epoch": 1.5443888984657819, + "grad_norm": 1.19871914468833, + "learning_rate": 8.097080609891775e-06, + "loss": 0.2986, + "step": 4480 + }, + { + "epoch": 1.5461127391828995, + "grad_norm": 1.1368965902954358, + "learning_rate": 8.092696088121324e-06, + "loss": 0.2795, + "step": 4485 + }, + { + "epoch": 1.547836579900017, + "grad_norm": 1.1499305851560142, + "learning_rate": 8.088307711119082e-06, + "loss": 0.2588, + "step": 4490 + }, + { + "epoch": 1.549560420617135, + "grad_norm": 1.3872810664498054, + "learning_rate": 8.083915484355423e-06, + "loss": 0.284, + "step": 4495 + }, + { + "epoch": 1.5512842613342528, + "grad_norm": 1.1268491020621787, + "learning_rate": 8.079519413305523e-06, + "loss": 0.2628, + "step": 4500 + }, + { + "epoch": 1.5530081020513704, + "grad_norm": 1.203114196866655, + "learning_rate": 8.075119503449352e-06, + "loss": 0.3032, + "step": 4505 + }, + { + "epoch": 1.554731942768488, + "grad_norm": 1.3394123876449031, + "learning_rate": 8.070715760271657e-06, + "loss": 0.2967, + "step": 4510 + }, + { + "epoch": 1.5564557834856059, + "grad_norm": 1.7377691152609942, + "learning_rate": 8.066308189261971e-06, + "loss": 0.264, + "step": 4515 + }, + { + "epoch": 1.5581796242027237, + "grad_norm": 0.9846889598064859, + "learning_rate": 8.0618967959146e-06, + "loss": 0.2927, + "step": 4520 + }, + { + "epoch": 1.5599034649198416, + "grad_norm": 1.216221746125267, + "learning_rate": 8.057481585728604e-06, + "loss": 0.2745, + "step": 4525 + }, + { + "epoch": 1.5616273056369592, + "grad_norm": 1.0672477634076372, + "learning_rate": 8.053062564207816e-06, + "loss": 0.273, + "step": 4530 + }, + { + "epoch": 1.5633511463540768, + "grad_norm": 1.0776198555958603, + "learning_rate": 8.048639736860808e-06, + "loss": 0.2815, + "step": 4535 + }, + { + "epoch": 1.5650749870711946, + "grad_norm": 1.5085017509837733, + "learning_rate": 8.044213109200901e-06, + "loss": 0.2756, + "step": 4540 + }, + { + "epoch": 1.5667988277883125, + "grad_norm": 1.3085717154879373, + "learning_rate": 8.039782686746153e-06, + "loss": 0.2987, + "step": 4545 + }, + { + "epoch": 1.56852266850543, + "grad_norm": 1.0162772612935684, + "learning_rate": 8.035348475019352e-06, + "loss": 0.2946, + "step": 4550 + }, + { + "epoch": 1.5702465092225477, + "grad_norm": 1.3775938951629474, + "learning_rate": 8.03091047954801e-06, + "loss": 0.2859, + "step": 4555 + }, + { + "epoch": 1.5719703499396656, + "grad_norm": 1.1994169217393993, + "learning_rate": 8.026468705864357e-06, + "loss": 0.2855, + "step": 4560 + }, + { + "epoch": 1.5736941906567834, + "grad_norm": 1.0925332218455293, + "learning_rate": 8.022023159505328e-06, + "loss": 0.2778, + "step": 4565 + }, + { + "epoch": 1.575418031373901, + "grad_norm": 2.577322170757021, + "learning_rate": 8.017573846012564e-06, + "loss": 0.2862, + "step": 4570 + }, + { + "epoch": 1.5771418720910186, + "grad_norm": 3.505143617383019, + "learning_rate": 8.013120770932406e-06, + "loss": 0.3156, + "step": 4575 + }, + { + "epoch": 1.5788657128081365, + "grad_norm": 2.818090342674019, + "learning_rate": 8.008663939815878e-06, + "loss": 0.3043, + "step": 4580 + }, + { + "epoch": 1.5805895535252543, + "grad_norm": 1.0380932701893224, + "learning_rate": 8.004203358218687e-06, + "loss": 0.301, + "step": 4585 + }, + { + "epoch": 1.5823133942423722, + "grad_norm": 1.343618018699047, + "learning_rate": 7.999739031701218e-06, + "loss": 0.3, + "step": 4590 + }, + { + "epoch": 1.5840372349594898, + "grad_norm": 1.3860425519912696, + "learning_rate": 7.995270965828523e-06, + "loss": 0.3106, + "step": 4595 + }, + { + "epoch": 1.5857610756766074, + "grad_norm": 1.2227838784955878, + "learning_rate": 7.990799166170312e-06, + "loss": 0.2824, + "step": 4600 + }, + { + "epoch": 1.5874849163937252, + "grad_norm": 1.2291192243757418, + "learning_rate": 7.986323638300957e-06, + "loss": 0.2647, + "step": 4605 + }, + { + "epoch": 1.589208757110843, + "grad_norm": 1.1924245691491968, + "learning_rate": 7.981844387799468e-06, + "loss": 0.2886, + "step": 4610 + }, + { + "epoch": 1.5909325978279607, + "grad_norm": 1.027590649560104, + "learning_rate": 7.977361420249504e-06, + "loss": 0.2658, + "step": 4615 + }, + { + "epoch": 1.5926564385450783, + "grad_norm": 1.1668511583869174, + "learning_rate": 7.972874741239352e-06, + "loss": 0.2715, + "step": 4620 + }, + { + "epoch": 1.5943802792621962, + "grad_norm": 1.1530835346302435, + "learning_rate": 7.968384356361927e-06, + "loss": 0.3023, + "step": 4625 + }, + { + "epoch": 1.596104119979314, + "grad_norm": 1.1219218944959122, + "learning_rate": 7.963890271214765e-06, + "loss": 0.2519, + "step": 4630 + }, + { + "epoch": 1.5978279606964316, + "grad_norm": 1.0597119841876788, + "learning_rate": 7.959392491400015e-06, + "loss": 0.2758, + "step": 4635 + }, + { + "epoch": 1.5995518014135492, + "grad_norm": 1.0706986098744253, + "learning_rate": 7.954891022524427e-06, + "loss": 0.2616, + "step": 4640 + }, + { + "epoch": 1.601275642130667, + "grad_norm": 1.0788317237951472, + "learning_rate": 7.950385870199356e-06, + "loss": 0.2695, + "step": 4645 + }, + { + "epoch": 1.602999482847785, + "grad_norm": 1.1209830452940586, + "learning_rate": 7.945877040040742e-06, + "loss": 0.2619, + "step": 4650 + }, + { + "epoch": 1.6047233235649025, + "grad_norm": 1.040694549076333, + "learning_rate": 7.941364537669117e-06, + "loss": 0.2951, + "step": 4655 + }, + { + "epoch": 1.6064471642820204, + "grad_norm": 1.11191039361645, + "learning_rate": 7.936848368709582e-06, + "loss": 0.3003, + "step": 4660 + }, + { + "epoch": 1.608171004999138, + "grad_norm": 1.2050444474037043, + "learning_rate": 7.932328538791818e-06, + "loss": 0.2871, + "step": 4665 + }, + { + "epoch": 1.6098948457162559, + "grad_norm": 0.9903198963124957, + "learning_rate": 7.927805053550064e-06, + "loss": 0.255, + "step": 4670 + }, + { + "epoch": 1.6116186864333737, + "grad_norm": 1.090871684646597, + "learning_rate": 7.923277918623116e-06, + "loss": 0.264, + "step": 4675 + }, + { + "epoch": 1.6133425271504913, + "grad_norm": 1.080006435375285, + "learning_rate": 7.918747139654318e-06, + "loss": 0.2793, + "step": 4680 + }, + { + "epoch": 1.615066367867609, + "grad_norm": 1.2223590352235285, + "learning_rate": 7.914212722291561e-06, + "loss": 0.3058, + "step": 4685 + }, + { + "epoch": 1.6167902085847268, + "grad_norm": 1.290101087149561, + "learning_rate": 7.909674672187268e-06, + "loss": 0.321, + "step": 4690 + }, + { + "epoch": 1.6185140493018446, + "grad_norm": 1.2857378909440365, + "learning_rate": 7.905132994998394e-06, + "loss": 0.3114, + "step": 4695 + }, + { + "epoch": 1.6202378900189622, + "grad_norm": 1.104863427511264, + "learning_rate": 7.900587696386413e-06, + "loss": 0.2589, + "step": 4700 + }, + { + "epoch": 1.6219617307360799, + "grad_norm": 1.0742409705734057, + "learning_rate": 7.896038782017308e-06, + "loss": 0.3179, + "step": 4705 + }, + { + "epoch": 1.6236855714531977, + "grad_norm": 1.2678367311665033, + "learning_rate": 7.89148625756158e-06, + "loss": 0.284, + "step": 4710 + }, + { + "epoch": 1.6254094121703155, + "grad_norm": 1.1759575583917148, + "learning_rate": 7.886930128694221e-06, + "loss": 0.2962, + "step": 4715 + }, + { + "epoch": 1.6271332528874332, + "grad_norm": 1.1014959512898448, + "learning_rate": 7.882370401094723e-06, + "loss": 0.2874, + "step": 4720 + }, + { + "epoch": 1.6288570936045508, + "grad_norm": 2.50511505608037, + "learning_rate": 7.877807080447058e-06, + "loss": 0.3029, + "step": 4725 + }, + { + "epoch": 1.6305809343216686, + "grad_norm": 1.185916183994825, + "learning_rate": 7.873240172439683e-06, + "loss": 0.2845, + "step": 4730 + }, + { + "epoch": 1.6323047750387865, + "grad_norm": 1.6176622608656361, + "learning_rate": 7.86866968276552e-06, + "loss": 0.2795, + "step": 4735 + }, + { + "epoch": 1.6340286157559043, + "grad_norm": 1.250930863184501, + "learning_rate": 7.86409561712196e-06, + "loss": 0.2915, + "step": 4740 + }, + { + "epoch": 1.635752456473022, + "grad_norm": 1.9008168568839663, + "learning_rate": 7.859517981210855e-06, + "loss": 0.2922, + "step": 4745 + }, + { + "epoch": 1.6374762971901395, + "grad_norm": 2.3733977130541852, + "learning_rate": 7.854936780738501e-06, + "loss": 0.2875, + "step": 4750 + }, + { + "epoch": 1.6392001379072574, + "grad_norm": 1.0197756084994443, + "learning_rate": 7.85035202141564e-06, + "loss": 0.2663, + "step": 4755 + }, + { + "epoch": 1.6409239786243752, + "grad_norm": 1.1569440849863617, + "learning_rate": 7.845763708957448e-06, + "loss": 0.3178, + "step": 4760 + }, + { + "epoch": 1.6426478193414928, + "grad_norm": 1.1135247757913942, + "learning_rate": 7.841171849083537e-06, + "loss": 0.2965, + "step": 4765 + }, + { + "epoch": 1.6443716600586105, + "grad_norm": 1.203979917867108, + "learning_rate": 7.836576447517935e-06, + "loss": 0.277, + "step": 4770 + }, + { + "epoch": 1.6460955007757283, + "grad_norm": 1.0915322798339258, + "learning_rate": 7.831977509989086e-06, + "loss": 0.3196, + "step": 4775 + }, + { + "epoch": 1.6478193414928461, + "grad_norm": 1.9043654726869557, + "learning_rate": 7.827375042229843e-06, + "loss": 0.2809, + "step": 4780 + }, + { + "epoch": 1.6495431822099638, + "grad_norm": 1.231698886575644, + "learning_rate": 7.822769049977459e-06, + "loss": 0.2559, + "step": 4785 + }, + { + "epoch": 1.6512670229270814, + "grad_norm": 0.9756875575310217, + "learning_rate": 7.81815953897358e-06, + "loss": 0.2604, + "step": 4790 + }, + { + "epoch": 1.6529908636441992, + "grad_norm": 1.2033951497698867, + "learning_rate": 7.81354651496424e-06, + "loss": 0.2562, + "step": 4795 + }, + { + "epoch": 1.654714704361317, + "grad_norm": 1.2585683573733146, + "learning_rate": 7.808929983699848e-06, + "loss": 0.2604, + "step": 4800 + }, + { + "epoch": 1.656438545078435, + "grad_norm": 1.0835966787970026, + "learning_rate": 7.804309950935191e-06, + "loss": 0.2807, + "step": 4805 + }, + { + "epoch": 1.6581623857955525, + "grad_norm": 1.0539996832826386, + "learning_rate": 7.799686422429418e-06, + "loss": 0.2641, + "step": 4810 + }, + { + "epoch": 1.6598862265126701, + "grad_norm": 1.1510984990439581, + "learning_rate": 7.795059403946034e-06, + "loss": 0.2862, + "step": 4815 + }, + { + "epoch": 1.661610067229788, + "grad_norm": 1.099104953394071, + "learning_rate": 7.790428901252897e-06, + "loss": 0.2852, + "step": 4820 + }, + { + "epoch": 1.6633339079469058, + "grad_norm": 1.1682885829402445, + "learning_rate": 7.785794920122207e-06, + "loss": 0.2859, + "step": 4825 + }, + { + "epoch": 1.6650577486640235, + "grad_norm": 1.1721610560414317, + "learning_rate": 7.7811574663305e-06, + "loss": 0.2921, + "step": 4830 + }, + { + "epoch": 1.666781589381141, + "grad_norm": 1.168726191344149, + "learning_rate": 7.776516545658641e-06, + "loss": 0.3008, + "step": 4835 + }, + { + "epoch": 1.668505430098259, + "grad_norm": 1.3478403912271937, + "learning_rate": 7.771872163891818e-06, + "loss": 0.2945, + "step": 4840 + }, + { + "epoch": 1.6702292708153768, + "grad_norm": 0.999876928881415, + "learning_rate": 7.767224326819533e-06, + "loss": 0.2512, + "step": 4845 + }, + { + "epoch": 1.6719531115324944, + "grad_norm": 1.0653778046015696, + "learning_rate": 7.762573040235592e-06, + "loss": 0.2673, + "step": 4850 + }, + { + "epoch": 1.673676952249612, + "grad_norm": 1.2411457041849874, + "learning_rate": 7.757918309938107e-06, + "loss": 0.2791, + "step": 4855 + }, + { + "epoch": 1.6754007929667298, + "grad_norm": 1.130982998169481, + "learning_rate": 7.753260141729474e-06, + "loss": 0.2985, + "step": 4860 + }, + { + "epoch": 1.6771246336838477, + "grad_norm": 1.172612356497052, + "learning_rate": 7.748598541416386e-06, + "loss": 0.2845, + "step": 4865 + }, + { + "epoch": 1.6788484744009653, + "grad_norm": 1.4309093735314082, + "learning_rate": 7.743933514809806e-06, + "loss": 0.2534, + "step": 4870 + }, + { + "epoch": 1.6805723151180831, + "grad_norm": 1.0583343878157196, + "learning_rate": 7.739265067724966e-06, + "loss": 0.268, + "step": 4875 + }, + { + "epoch": 1.6822961558352008, + "grad_norm": 1.064639332799556, + "learning_rate": 7.734593205981375e-06, + "loss": 0.2915, + "step": 4880 + }, + { + "epoch": 1.6840199965523186, + "grad_norm": 1.2481077971908499, + "learning_rate": 7.729917935402783e-06, + "loss": 0.3059, + "step": 4885 + }, + { + "epoch": 1.6857438372694364, + "grad_norm": 1.0723438589361638, + "learning_rate": 7.725239261817201e-06, + "loss": 0.292, + "step": 4890 + }, + { + "epoch": 1.687467677986554, + "grad_norm": 1.2069060312922704, + "learning_rate": 7.720557191056873e-06, + "loss": 0.245, + "step": 4895 + }, + { + "epoch": 1.6891915187036717, + "grad_norm": 1.1900799630831256, + "learning_rate": 7.715871728958285e-06, + "loss": 0.284, + "step": 4900 + }, + { + "epoch": 1.6909153594207895, + "grad_norm": 1.0808543445599839, + "learning_rate": 7.711182881362143e-06, + "loss": 0.2794, + "step": 4905 + }, + { + "epoch": 1.6926392001379074, + "grad_norm": 1.1770038792609376, + "learning_rate": 7.706490654113383e-06, + "loss": 0.2829, + "step": 4910 + }, + { + "epoch": 1.694363040855025, + "grad_norm": 1.3712361776514597, + "learning_rate": 7.701795053061145e-06, + "loss": 0.2916, + "step": 4915 + }, + { + "epoch": 1.6960868815721426, + "grad_norm": 1.1007601160863054, + "learning_rate": 7.697096084058781e-06, + "loss": 0.2601, + "step": 4920 + }, + { + "epoch": 1.6978107222892604, + "grad_norm": 1.2528337178686804, + "learning_rate": 7.692393752963837e-06, + "loss": 0.2879, + "step": 4925 + }, + { + "epoch": 1.6995345630063783, + "grad_norm": 1.2087277967671552, + "learning_rate": 7.687688065638052e-06, + "loss": 0.2805, + "step": 4930 + }, + { + "epoch": 1.701258403723496, + "grad_norm": 1.3431894201965882, + "learning_rate": 7.682979027947349e-06, + "loss": 0.3023, + "step": 4935 + }, + { + "epoch": 1.7029822444406135, + "grad_norm": 1.1536128646000634, + "learning_rate": 7.678266645761823e-06, + "loss": 0.2813, + "step": 4940 + }, + { + "epoch": 1.7047060851577314, + "grad_norm": 1.0822875992827867, + "learning_rate": 7.673550924955749e-06, + "loss": 0.2621, + "step": 4945 + }, + { + "epoch": 1.7064299258748492, + "grad_norm": 0.9758588784653546, + "learning_rate": 7.668831871407552e-06, + "loss": 0.2896, + "step": 4950 + }, + { + "epoch": 1.708153766591967, + "grad_norm": 5.060149915372297, + "learning_rate": 7.664109490999819e-06, + "loss": 0.2672, + "step": 4955 + }, + { + "epoch": 1.7098776073090847, + "grad_norm": 0.9785982692848444, + "learning_rate": 7.659383789619277e-06, + "loss": 0.2677, + "step": 4960 + }, + { + "epoch": 1.7116014480262023, + "grad_norm": 1.1456111445632373, + "learning_rate": 7.6546547731568e-06, + "loss": 0.296, + "step": 4965 + }, + { + "epoch": 1.7133252887433201, + "grad_norm": 1.2787557535098648, + "learning_rate": 7.649922447507392e-06, + "loss": 0.2738, + "step": 4970 + }, + { + "epoch": 1.715049129460438, + "grad_norm": 1.164003982929867, + "learning_rate": 7.645186818570183e-06, + "loss": 0.2813, + "step": 4975 + }, + { + "epoch": 1.7167729701775556, + "grad_norm": 1.2434044369597195, + "learning_rate": 7.640447892248416e-06, + "loss": 0.2503, + "step": 4980 + }, + { + "epoch": 1.7184968108946732, + "grad_norm": 1.0898227360866577, + "learning_rate": 7.635705674449448e-06, + "loss": 0.2924, + "step": 4985 + }, + { + "epoch": 1.720220651611791, + "grad_norm": 1.1109599536674057, + "learning_rate": 7.630960171084742e-06, + "loss": 0.2814, + "step": 4990 + }, + { + "epoch": 1.721944492328909, + "grad_norm": 0.9936293276920007, + "learning_rate": 7.626211388069853e-06, + "loss": 0.271, + "step": 4995 + }, + { + "epoch": 1.7236683330460265, + "grad_norm": 0.9991754561676714, + "learning_rate": 7.621459331324421e-06, + "loss": 0.2692, + "step": 5000 + }, + { + "epoch": 1.7253921737631441, + "grad_norm": 1.3655918924152755, + "learning_rate": 7.616704006772175e-06, + "loss": 0.2649, + "step": 5005 + }, + { + "epoch": 1.727116014480262, + "grad_norm": 1.5048623255301028, + "learning_rate": 7.611945420340913e-06, + "loss": 0.2336, + "step": 5010 + }, + { + "epoch": 1.7288398551973798, + "grad_norm": 1.1367584713498837, + "learning_rate": 7.607183577962496e-06, + "loss": 0.2633, + "step": 5015 + }, + { + "epoch": 1.7305636959144977, + "grad_norm": 1.4103184520900731, + "learning_rate": 7.602418485572849e-06, + "loss": 0.2665, + "step": 5020 + }, + { + "epoch": 1.7322875366316153, + "grad_norm": 1.458893334785239, + "learning_rate": 7.597650149111948e-06, + "loss": 0.2781, + "step": 5025 + }, + { + "epoch": 1.734011377348733, + "grad_norm": 1.0043127824753695, + "learning_rate": 7.592878574523809e-06, + "loss": 0.3137, + "step": 5030 + }, + { + "epoch": 1.7357352180658507, + "grad_norm": 1.0507992776405541, + "learning_rate": 7.5881037677564886e-06, + "loss": 0.2497, + "step": 5035 + }, + { + "epoch": 1.7374590587829686, + "grad_norm": 2.8059775995775946, + "learning_rate": 7.583325734762068e-06, + "loss": 0.2982, + "step": 5040 + }, + { + "epoch": 1.7391828995000862, + "grad_norm": 1.1436290478445026, + "learning_rate": 7.578544481496657e-06, + "loss": 0.2895, + "step": 5045 + }, + { + "epoch": 1.7409067402172038, + "grad_norm": 1.3224859651054273, + "learning_rate": 7.5737600139203715e-06, + "loss": 0.2535, + "step": 5050 + }, + { + "epoch": 1.7426305809343217, + "grad_norm": 1.1752876663370677, + "learning_rate": 7.5689723379973404e-06, + "loss": 0.2779, + "step": 5055 + }, + { + "epoch": 1.7443544216514395, + "grad_norm": 1.1909209268360834, + "learning_rate": 7.564181459695692e-06, + "loss": 0.2952, + "step": 5060 + }, + { + "epoch": 1.7460782623685571, + "grad_norm": 1.1057420527030353, + "learning_rate": 7.559387384987538e-06, + "loss": 0.2913, + "step": 5065 + }, + { + "epoch": 1.7478021030856747, + "grad_norm": 1.0788985188134605, + "learning_rate": 7.554590119848988e-06, + "loss": 0.2732, + "step": 5070 + }, + { + "epoch": 1.7495259438027926, + "grad_norm": 1.2506844184121586, + "learning_rate": 7.549789670260117e-06, + "loss": 0.2852, + "step": 5075 + }, + { + "epoch": 1.7512497845199104, + "grad_norm": 1.0751159126128347, + "learning_rate": 7.544986042204977e-06, + "loss": 0.2869, + "step": 5080 + }, + { + "epoch": 1.752973625237028, + "grad_norm": 1.0913872117333538, + "learning_rate": 7.540179241671578e-06, + "loss": 0.2878, + "step": 5085 + }, + { + "epoch": 1.7546974659541459, + "grad_norm": 1.1394484243614582, + "learning_rate": 7.535369274651887e-06, + "loss": 0.2816, + "step": 5090 + }, + { + "epoch": 1.7564213066712635, + "grad_norm": 1.0311513805472574, + "learning_rate": 7.530556147141817e-06, + "loss": 0.2654, + "step": 5095 + }, + { + "epoch": 1.7581451473883813, + "grad_norm": 0.9650362254355321, + "learning_rate": 7.525739865141221e-06, + "loss": 0.2816, + "step": 5100 + }, + { + "epoch": 1.7598689881054992, + "grad_norm": 1.146197497667083, + "learning_rate": 7.5209204346538845e-06, + "loss": 0.2876, + "step": 5105 + }, + { + "epoch": 1.7615928288226168, + "grad_norm": 1.2823570613469755, + "learning_rate": 7.516097861687517e-06, + "loss": 0.2549, + "step": 5110 + }, + { + "epoch": 1.7633166695397344, + "grad_norm": 1.0979690128514943, + "learning_rate": 7.511272152253746e-06, + "loss": 0.2931, + "step": 5115 + }, + { + "epoch": 1.7650405102568523, + "grad_norm": 1.0190614992710245, + "learning_rate": 7.506443312368111e-06, + "loss": 0.2693, + "step": 5120 + }, + { + "epoch": 1.76676435097397, + "grad_norm": 1.053828883724785, + "learning_rate": 7.5016113480500465e-06, + "loss": 0.2744, + "step": 5125 + }, + { + "epoch": 1.7684881916910877, + "grad_norm": 1.210477140134907, + "learning_rate": 7.496776265322893e-06, + "loss": 0.2855, + "step": 5130 + }, + { + "epoch": 1.7702120324082053, + "grad_norm": 1.6091285360924075, + "learning_rate": 7.491938070213868e-06, + "loss": 0.2852, + "step": 5135 + }, + { + "epoch": 1.7719358731253232, + "grad_norm": 1.1356765125738666, + "learning_rate": 7.4870967687540745e-06, + "loss": 0.2623, + "step": 5140 + }, + { + "epoch": 1.773659713842441, + "grad_norm": 1.3691503007676011, + "learning_rate": 7.482252366978484e-06, + "loss": 0.2929, + "step": 5145 + }, + { + "epoch": 1.7753835545595587, + "grad_norm": 1.2769107336632355, + "learning_rate": 7.477404870925937e-06, + "loss": 0.2703, + "step": 5150 + }, + { + "epoch": 1.7771073952766763, + "grad_norm": 1.185701202768985, + "learning_rate": 7.47255428663913e-06, + "loss": 0.2893, + "step": 5155 + }, + { + "epoch": 1.7788312359937941, + "grad_norm": 1.0168763853501037, + "learning_rate": 7.467700620164606e-06, + "loss": 0.2836, + "step": 5160 + }, + { + "epoch": 1.780555076710912, + "grad_norm": 1.056250760663425, + "learning_rate": 7.462843877552752e-06, + "loss": 0.2822, + "step": 5165 + }, + { + "epoch": 1.7822789174280298, + "grad_norm": 1.2340473863412658, + "learning_rate": 7.457984064857791e-06, + "loss": 0.2798, + "step": 5170 + }, + { + "epoch": 1.7840027581451474, + "grad_norm": 0.9952934398082832, + "learning_rate": 7.453121188137773e-06, + "loss": 0.2863, + "step": 5175 + }, + { + "epoch": 1.785726598862265, + "grad_norm": 1.096546664691232, + "learning_rate": 7.448255253454566e-06, + "loss": 0.267, + "step": 5180 + }, + { + "epoch": 1.7874504395793829, + "grad_norm": 1.1137598402828075, + "learning_rate": 7.443386266873849e-06, + "loss": 0.2696, + "step": 5185 + }, + { + "epoch": 1.7891742802965007, + "grad_norm": 1.265124395170217, + "learning_rate": 7.438514234465108e-06, + "loss": 0.2743, + "step": 5190 + }, + { + "epoch": 1.7908981210136183, + "grad_norm": 1.0639163200289241, + "learning_rate": 7.433639162301623e-06, + "loss": 0.2547, + "step": 5195 + }, + { + "epoch": 1.792621961730736, + "grad_norm": 1.1278138783766827, + "learning_rate": 7.4287610564604675e-06, + "loss": 0.266, + "step": 5200 + }, + { + "epoch": 1.7943458024478538, + "grad_norm": 3.6967756772489144, + "learning_rate": 7.4238799230224924e-06, + "loss": 0.2958, + "step": 5205 + }, + { + "epoch": 1.7960696431649716, + "grad_norm": 1.0996972379476142, + "learning_rate": 7.418995768072323e-06, + "loss": 0.2867, + "step": 5210 + }, + { + "epoch": 1.7977934838820893, + "grad_norm": 1.0706722278463459, + "learning_rate": 7.414108597698357e-06, + "loss": 0.2651, + "step": 5215 + }, + { + "epoch": 1.7995173245992069, + "grad_norm": 1.0382780131139366, + "learning_rate": 7.409218417992741e-06, + "loss": 0.2519, + "step": 5220 + }, + { + "epoch": 1.8012411653163247, + "grad_norm": 1.0992035780495972, + "learning_rate": 7.404325235051381e-06, + "loss": 0.252, + "step": 5225 + }, + { + "epoch": 1.8029650060334426, + "grad_norm": 1.126768049409712, + "learning_rate": 7.399429054973923e-06, + "loss": 0.278, + "step": 5230 + }, + { + "epoch": 1.8046888467505604, + "grad_norm": 1.185737350601416, + "learning_rate": 7.39452988386375e-06, + "loss": 0.3009, + "step": 5235 + }, + { + "epoch": 1.806412687467678, + "grad_norm": 1.0451308376522186, + "learning_rate": 7.389627727827977e-06, + "loss": 0.2598, + "step": 5240 + }, + { + "epoch": 1.8081365281847956, + "grad_norm": 1.2583174124221492, + "learning_rate": 7.3847225929774316e-06, + "loss": 0.2732, + "step": 5245 + }, + { + "epoch": 1.8098603689019135, + "grad_norm": 1.0331050969455584, + "learning_rate": 7.3798144854266615e-06, + "loss": 0.267, + "step": 5250 + }, + { + "epoch": 1.8115842096190313, + "grad_norm": 1.1062804250909486, + "learning_rate": 7.374903411293919e-06, + "loss": 0.262, + "step": 5255 + }, + { + "epoch": 1.813308050336149, + "grad_norm": 1.329738516827646, + "learning_rate": 7.369989376701153e-06, + "loss": 0.2819, + "step": 5260 + }, + { + "epoch": 1.8150318910532666, + "grad_norm": 1.2271682423367378, + "learning_rate": 7.365072387774004e-06, + "loss": 0.255, + "step": 5265 + }, + { + "epoch": 1.8167557317703844, + "grad_norm": 1.3129017608959763, + "learning_rate": 7.360152450641792e-06, + "loss": 0.2671, + "step": 5270 + }, + { + "epoch": 1.8184795724875022, + "grad_norm": 1.2955446038826546, + "learning_rate": 7.355229571437519e-06, + "loss": 0.2462, + "step": 5275 + }, + { + "epoch": 1.8202034132046199, + "grad_norm": 1.3071733693941237, + "learning_rate": 7.350303756297845e-06, + "loss": 0.2763, + "step": 5280 + }, + { + "epoch": 1.8219272539217375, + "grad_norm": 1.1198453515777207, + "learning_rate": 7.3453750113631e-06, + "loss": 0.2772, + "step": 5285 + }, + { + "epoch": 1.8236510946388553, + "grad_norm": 1.092114500883187, + "learning_rate": 7.340443342777258e-06, + "loss": 0.2451, + "step": 5290 + }, + { + "epoch": 1.8253749353559732, + "grad_norm": 1.0470019744280976, + "learning_rate": 7.335508756687941e-06, + "loss": 0.2614, + "step": 5295 + }, + { + "epoch": 1.8270987760730908, + "grad_norm": 1.0594618623401897, + "learning_rate": 7.330571259246411e-06, + "loss": 0.2746, + "step": 5300 + }, + { + "epoch": 1.8288226167902086, + "grad_norm": 1.0982499449870642, + "learning_rate": 7.32563085660755e-06, + "loss": 0.2631, + "step": 5305 + }, + { + "epoch": 1.8305464575073263, + "grad_norm": 0.9117916047220774, + "learning_rate": 7.320687554929871e-06, + "loss": 0.2713, + "step": 5310 + }, + { + "epoch": 1.832270298224444, + "grad_norm": 1.2399561251897282, + "learning_rate": 7.315741360375497e-06, + "loss": 0.2674, + "step": 5315 + }, + { + "epoch": 1.833994138941562, + "grad_norm": 1.08628856500942, + "learning_rate": 7.310792279110155e-06, + "loss": 0.2665, + "step": 5320 + }, + { + "epoch": 1.8357179796586796, + "grad_norm": 1.112817692082712, + "learning_rate": 7.305840317303174e-06, + "loss": 0.264, + "step": 5325 + }, + { + "epoch": 1.8374418203757972, + "grad_norm": 1.1554707394297785, + "learning_rate": 7.300885481127472e-06, + "loss": 0.2643, + "step": 5330 + }, + { + "epoch": 1.839165661092915, + "grad_norm": 1.1065351039547966, + "learning_rate": 7.295927776759551e-06, + "loss": 0.2626, + "step": 5335 + }, + { + "epoch": 1.8408895018100329, + "grad_norm": 1.1428672706404264, + "learning_rate": 7.290967210379489e-06, + "loss": 0.2947, + "step": 5340 + }, + { + "epoch": 1.8426133425271505, + "grad_norm": 0.9919106094816719, + "learning_rate": 7.286003788170928e-06, + "loss": 0.2694, + "step": 5345 + }, + { + "epoch": 1.844337183244268, + "grad_norm": 1.1143963164241666, + "learning_rate": 7.281037516321073e-06, + "loss": 0.2974, + "step": 5350 + }, + { + "epoch": 1.846061023961386, + "grad_norm": 1.204079287254847, + "learning_rate": 7.276068401020682e-06, + "loss": 0.2675, + "step": 5355 + }, + { + "epoch": 1.8477848646785038, + "grad_norm": 1.1051609652606675, + "learning_rate": 7.271096448464057e-06, + "loss": 0.2587, + "step": 5360 + }, + { + "epoch": 1.8495087053956214, + "grad_norm": 1.231084386734048, + "learning_rate": 7.266121664849033e-06, + "loss": 0.2924, + "step": 5365 + }, + { + "epoch": 1.851232546112739, + "grad_norm": 1.1424913542294668, + "learning_rate": 7.261144056376978e-06, + "loss": 0.2394, + "step": 5370 + }, + { + "epoch": 1.8529563868298569, + "grad_norm": 1.1819390379008323, + "learning_rate": 7.256163629252784e-06, + "loss": 0.2624, + "step": 5375 + }, + { + "epoch": 1.8546802275469747, + "grad_norm": 1.0898053611089158, + "learning_rate": 7.251180389684849e-06, + "loss": 0.2606, + "step": 5380 + }, + { + "epoch": 1.8564040682640925, + "grad_norm": 1.056902359024124, + "learning_rate": 7.246194343885082e-06, + "loss": 0.2613, + "step": 5385 + }, + { + "epoch": 1.8581279089812102, + "grad_norm": 1.0845746443904702, + "learning_rate": 7.2412054980688905e-06, + "loss": 0.2891, + "step": 5390 + }, + { + "epoch": 1.8598517496983278, + "grad_norm": 1.2084838861539404, + "learning_rate": 7.23621385845517e-06, + "loss": 0.2816, + "step": 5395 + }, + { + "epoch": 1.8615755904154456, + "grad_norm": 1.2217850486847586, + "learning_rate": 7.2312194312663e-06, + "loss": 0.2839, + "step": 5400 + }, + { + "epoch": 1.8632994311325635, + "grad_norm": 1.1310132370248838, + "learning_rate": 7.226222222728134e-06, + "loss": 0.2485, + "step": 5405 + }, + { + "epoch": 1.865023271849681, + "grad_norm": 1.1136582290738202, + "learning_rate": 7.221222239069994e-06, + "loss": 0.2721, + "step": 5410 + }, + { + "epoch": 1.8667471125667987, + "grad_norm": 1.1600893472584881, + "learning_rate": 7.216219486524659e-06, + "loss": 0.2496, + "step": 5415 + }, + { + "epoch": 1.8684709532839165, + "grad_norm": 1.0477615037842176, + "learning_rate": 7.211213971328364e-06, + "loss": 0.262, + "step": 5420 + }, + { + "epoch": 1.8701947940010344, + "grad_norm": 1.1123140931236568, + "learning_rate": 7.206205699720782e-06, + "loss": 0.2707, + "step": 5425 + }, + { + "epoch": 1.871918634718152, + "grad_norm": 1.2651961367995357, + "learning_rate": 7.201194677945027e-06, + "loss": 0.2932, + "step": 5430 + }, + { + "epoch": 1.8736424754352696, + "grad_norm": 1.0153308064883255, + "learning_rate": 7.196180912247637e-06, + "loss": 0.2912, + "step": 5435 + }, + { + "epoch": 1.8753663161523875, + "grad_norm": 1.0469023717406651, + "learning_rate": 7.191164408878575e-06, + "loss": 0.2661, + "step": 5440 + }, + { + "epoch": 1.8770901568695053, + "grad_norm": 1.15204943776014, + "learning_rate": 7.186145174091214e-06, + "loss": 0.2774, + "step": 5445 + }, + { + "epoch": 1.8788139975866232, + "grad_norm": 1.294374183716143, + "learning_rate": 7.181123214142331e-06, + "loss": 0.2909, + "step": 5450 + }, + { + "epoch": 1.8805378383037408, + "grad_norm": 1.2237541265828948, + "learning_rate": 7.176098535292101e-06, + "loss": 0.2701, + "step": 5455 + }, + { + "epoch": 1.8822616790208584, + "grad_norm": 1.1134729489128907, + "learning_rate": 7.171071143804089e-06, + "loss": 0.2591, + "step": 5460 + }, + { + "epoch": 1.8839855197379762, + "grad_norm": 0.9771169461106862, + "learning_rate": 7.166041045945242e-06, + "loss": 0.2776, + "step": 5465 + }, + { + "epoch": 1.885709360455094, + "grad_norm": 1.164004332509995, + "learning_rate": 7.161008247985881e-06, + "loss": 0.2524, + "step": 5470 + }, + { + "epoch": 1.8874332011722117, + "grad_norm": 1.036062945872068, + "learning_rate": 7.155972756199688e-06, + "loss": 0.2781, + "step": 5475 + }, + { + "epoch": 1.8891570418893293, + "grad_norm": 1.2103960421924118, + "learning_rate": 7.150934576863708e-06, + "loss": 0.2834, + "step": 5480 + }, + { + "epoch": 1.8908808826064472, + "grad_norm": 1.24293340973761, + "learning_rate": 7.145893716258335e-06, + "loss": 0.2778, + "step": 5485 + }, + { + "epoch": 1.892604723323565, + "grad_norm": 0.9832689197280934, + "learning_rate": 7.140850180667306e-06, + "loss": 0.2282, + "step": 5490 + }, + { + "epoch": 1.8943285640406826, + "grad_norm": 1.0331895067046448, + "learning_rate": 7.13580397637769e-06, + "loss": 0.2768, + "step": 5495 + }, + { + "epoch": 1.8960524047578002, + "grad_norm": 1.1923177256654112, + "learning_rate": 7.1307551096798855e-06, + "loss": 0.2557, + "step": 5500 + }, + { + "epoch": 1.897776245474918, + "grad_norm": 1.5997663160143192, + "learning_rate": 7.1257035868676085e-06, + "loss": 0.2711, + "step": 5505 + }, + { + "epoch": 1.899500086192036, + "grad_norm": 1.194254895138194, + "learning_rate": 7.120649414237885e-06, + "loss": 0.2847, + "step": 5510 + }, + { + "epoch": 1.9012239269091535, + "grad_norm": 1.0880952414876823, + "learning_rate": 7.115592598091046e-06, + "loss": 0.2535, + "step": 5515 + }, + { + "epoch": 1.9029477676262714, + "grad_norm": 1.0888244601473853, + "learning_rate": 7.110533144730718e-06, + "loss": 0.2422, + "step": 5520 + }, + { + "epoch": 1.904671608343389, + "grad_norm": 1.0598652883511703, + "learning_rate": 7.105471060463814e-06, + "loss": 0.2874, + "step": 5525 + }, + { + "epoch": 1.9063954490605068, + "grad_norm": 1.291465991800003, + "learning_rate": 7.1004063516005265e-06, + "loss": 0.2776, + "step": 5530 + }, + { + "epoch": 1.9081192897776247, + "grad_norm": 0.9859693890889243, + "learning_rate": 7.095339024454316e-06, + "loss": 0.2691, + "step": 5535 + }, + { + "epoch": 1.9098431304947423, + "grad_norm": 1.0642240673720773, + "learning_rate": 7.0902690853419185e-06, + "loss": 0.2611, + "step": 5540 + }, + { + "epoch": 1.91156697121186, + "grad_norm": 1.1138711581684404, + "learning_rate": 7.085196540583312e-06, + "loss": 0.259, + "step": 5545 + }, + { + "epoch": 1.9132908119289778, + "grad_norm": 1.4743476326248401, + "learning_rate": 7.080121396501733e-06, + "loss": 0.2917, + "step": 5550 + }, + { + "epoch": 1.9150146526460956, + "grad_norm": 1.7890008769900765, + "learning_rate": 7.075043659423648e-06, + "loss": 0.2713, + "step": 5555 + }, + { + "epoch": 1.9167384933632132, + "grad_norm": 1.1299933829421482, + "learning_rate": 7.069963335678767e-06, + "loss": 0.2611, + "step": 5560 + }, + { + "epoch": 1.9184623340803308, + "grad_norm": 1.168157969901466, + "learning_rate": 7.06488043160002e-06, + "loss": 0.2611, + "step": 5565 + }, + { + "epoch": 1.9201861747974487, + "grad_norm": 1.653223887377282, + "learning_rate": 7.059794953523549e-06, + "loss": 0.2576, + "step": 5570 + }, + { + "epoch": 1.9219100155145665, + "grad_norm": 1.0833739041565649, + "learning_rate": 7.054706907788711e-06, + "loss": 0.2644, + "step": 5575 + }, + { + "epoch": 1.9236338562316841, + "grad_norm": 1.059492405661376, + "learning_rate": 7.049616300738059e-06, + "loss": 0.261, + "step": 5580 + }, + { + "epoch": 1.9253576969488018, + "grad_norm": 1.5814448412443767, + "learning_rate": 7.044523138717344e-06, + "loss": 0.2698, + "step": 5585 + }, + { + "epoch": 1.9270815376659196, + "grad_norm": 1.1358794117729327, + "learning_rate": 7.0394274280754984e-06, + "loss": 0.2666, + "step": 5590 + }, + { + "epoch": 1.9288053783830374, + "grad_norm": 1.2958086531655535, + "learning_rate": 7.0343291751646295e-06, + "loss": 0.2941, + "step": 5595 + }, + { + "epoch": 1.9305292191001553, + "grad_norm": 1.1203931328301828, + "learning_rate": 7.029228386340017e-06, + "loss": 0.2757, + "step": 5600 + }, + { + "epoch": 1.932253059817273, + "grad_norm": 8.170377614516136, + "learning_rate": 7.024125067960104e-06, + "loss": 0.2606, + "step": 5605 + }, + { + "epoch": 1.9339769005343905, + "grad_norm": 1.3417798966601795, + "learning_rate": 7.019019226386482e-06, + "loss": 0.2777, + "step": 5610 + }, + { + "epoch": 1.9357007412515084, + "grad_norm": 1.3041374667829848, + "learning_rate": 7.0139108679838885e-06, + "loss": 0.2555, + "step": 5615 + }, + { + "epoch": 1.9374245819686262, + "grad_norm": 1.0226608905689267, + "learning_rate": 7.008799999120203e-06, + "loss": 0.2752, + "step": 5620 + }, + { + "epoch": 1.9391484226857438, + "grad_norm": 1.1926622366516715, + "learning_rate": 7.003686626166429e-06, + "loss": 0.2569, + "step": 5625 + }, + { + "epoch": 1.9408722634028615, + "grad_norm": 1.7781872868378437, + "learning_rate": 6.998570755496694e-06, + "loss": 0.2475, + "step": 5630 + }, + { + "epoch": 1.9425961041199793, + "grad_norm": 0.9988247278701624, + "learning_rate": 6.993452393488238e-06, + "loss": 0.2794, + "step": 5635 + }, + { + "epoch": 1.9443199448370971, + "grad_norm": 1.1284938143658345, + "learning_rate": 6.988331546521408e-06, + "loss": 0.2683, + "step": 5640 + }, + { + "epoch": 1.9460437855542148, + "grad_norm": 1.2099722282233312, + "learning_rate": 6.983208220979647e-06, + "loss": 0.2572, + "step": 5645 + }, + { + "epoch": 1.9477676262713324, + "grad_norm": 1.249136336362036, + "learning_rate": 6.978082423249491e-06, + "loss": 0.2646, + "step": 5650 + }, + { + "epoch": 1.9494914669884502, + "grad_norm": 1.1307235999444802, + "learning_rate": 6.972954159720552e-06, + "loss": 0.2664, + "step": 5655 + }, + { + "epoch": 1.951215307705568, + "grad_norm": 1.114170233199106, + "learning_rate": 6.967823436785521e-06, + "loss": 0.2636, + "step": 5660 + }, + { + "epoch": 1.952939148422686, + "grad_norm": 1.1602703323436812, + "learning_rate": 6.962690260840153e-06, + "loss": 0.2666, + "step": 5665 + }, + { + "epoch": 1.9546629891398035, + "grad_norm": 1.022057578742568, + "learning_rate": 6.9575546382832615e-06, + "loss": 0.2911, + "step": 5670 + }, + { + "epoch": 1.9563868298569211, + "grad_norm": 1.1162004610083973, + "learning_rate": 6.952416575516707e-06, + "loss": 0.2702, + "step": 5675 + }, + { + "epoch": 1.958110670574039, + "grad_norm": 1.1672309636467573, + "learning_rate": 6.947276078945393e-06, + "loss": 0.2787, + "step": 5680 + }, + { + "epoch": 1.9598345112911568, + "grad_norm": 1.083713088049749, + "learning_rate": 6.942133154977263e-06, + "loss": 0.25, + "step": 5685 + }, + { + "epoch": 1.9615583520082744, + "grad_norm": 1.034910496878224, + "learning_rate": 6.936987810023277e-06, + "loss": 0.2648, + "step": 5690 + }, + { + "epoch": 1.963282192725392, + "grad_norm": 1.047401038681741, + "learning_rate": 6.931840050497417e-06, + "loss": 0.2591, + "step": 5695 + }, + { + "epoch": 1.96500603344251, + "grad_norm": 1.0375177459093605, + "learning_rate": 6.9266898828166774e-06, + "loss": 0.2393, + "step": 5700 + }, + { + "epoch": 1.9667298741596277, + "grad_norm": 1.4160738858000166, + "learning_rate": 6.92153731340105e-06, + "loss": 0.2775, + "step": 5705 + }, + { + "epoch": 1.9684537148767454, + "grad_norm": 1.0380061179428337, + "learning_rate": 6.9163823486735245e-06, + "loss": 0.2672, + "step": 5710 + }, + { + "epoch": 1.970177555593863, + "grad_norm": 1.0168733689431326, + "learning_rate": 6.9112249950600726e-06, + "loss": 0.2774, + "step": 5715 + }, + { + "epoch": 1.9719013963109808, + "grad_norm": 1.2152116140898908, + "learning_rate": 6.9060652589896485e-06, + "loss": 0.243, + "step": 5720 + }, + { + "epoch": 1.9736252370280987, + "grad_norm": 1.6474829672831846, + "learning_rate": 6.900903146894171e-06, + "loss": 0.2531, + "step": 5725 + }, + { + "epoch": 1.9753490777452165, + "grad_norm": 1.2142546628291522, + "learning_rate": 6.895738665208526e-06, + "loss": 0.247, + "step": 5730 + }, + { + "epoch": 1.9770729184623341, + "grad_norm": 1.0326790077881896, + "learning_rate": 6.8905718203705485e-06, + "loss": 0.2627, + "step": 5735 + }, + { + "epoch": 1.9787967591794517, + "grad_norm": 1.4913206458086243, + "learning_rate": 6.885402618821022e-06, + "loss": 0.2749, + "step": 5740 + }, + { + "epoch": 1.9805205998965696, + "grad_norm": 1.392133050519287, + "learning_rate": 6.88023106700367e-06, + "loss": 0.2774, + "step": 5745 + }, + { + "epoch": 1.9822444406136874, + "grad_norm": 1.0796781573181105, + "learning_rate": 6.875057171365139e-06, + "loss": 0.2745, + "step": 5750 + }, + { + "epoch": 1.983968281330805, + "grad_norm": 1.1962868504050188, + "learning_rate": 6.869880938355004e-06, + "loss": 0.2701, + "step": 5755 + }, + { + "epoch": 1.9856921220479227, + "grad_norm": 1.2042347082839586, + "learning_rate": 6.864702374425749e-06, + "loss": 0.2794, + "step": 5760 + }, + { + "epoch": 1.9874159627650405, + "grad_norm": 1.3125219172293308, + "learning_rate": 6.859521486032768e-06, + "loss": 0.2599, + "step": 5765 + }, + { + "epoch": 1.9891398034821584, + "grad_norm": 1.1429032279231959, + "learning_rate": 6.854338279634349e-06, + "loss": 0.2537, + "step": 5770 + }, + { + "epoch": 1.990863644199276, + "grad_norm": 1.487277914324487, + "learning_rate": 6.849152761691671e-06, + "loss": 0.2572, + "step": 5775 + }, + { + "epoch": 1.9925874849163936, + "grad_norm": 1.0745202819769721, + "learning_rate": 6.843964938668792e-06, + "loss": 0.2939, + "step": 5780 + }, + { + "epoch": 1.9943113256335114, + "grad_norm": 1.0428930706243056, + "learning_rate": 6.838774817032648e-06, + "loss": 0.2883, + "step": 5785 + }, + { + "epoch": 1.9960351663506293, + "grad_norm": 1.6922282787293423, + "learning_rate": 6.833582403253038e-06, + "loss": 0.2332, + "step": 5790 + }, + { + "epoch": 1.997759007067747, + "grad_norm": 1.278542198963824, + "learning_rate": 6.8283877038026185e-06, + "loss": 0.2842, + "step": 5795 + }, + { + "epoch": 1.9994828477848645, + "grad_norm": 1.1741569666338092, + "learning_rate": 6.823190725156892e-06, + "loss": 0.234, + "step": 5800 + }, + { + "epoch": 2.0010343044302705, + "grad_norm": 1.069771793210654, + "learning_rate": 6.817991473794207e-06, + "loss": 0.2566, + "step": 5805 + }, + { + "epoch": 2.0027581451473884, + "grad_norm": 1.1022591623453037, + "learning_rate": 6.812789956195745e-06, + "loss": 0.2127, + "step": 5810 + }, + { + "epoch": 2.004481985864506, + "grad_norm": 1.2427733300346624, + "learning_rate": 6.807586178845509e-06, + "loss": 0.227, + "step": 5815 + }, + { + "epoch": 2.006205826581624, + "grad_norm": 1.1609130023882328, + "learning_rate": 6.80238014823032e-06, + "loss": 0.2329, + "step": 5820 + }, + { + "epoch": 2.0079296672987414, + "grad_norm": 1.2194015635674407, + "learning_rate": 6.797171870839809e-06, + "loss": 0.2338, + "step": 5825 + }, + { + "epoch": 2.0096535080158593, + "grad_norm": 1.1713733585642396, + "learning_rate": 6.791961353166408e-06, + "loss": 0.2127, + "step": 5830 + }, + { + "epoch": 2.011377348732977, + "grad_norm": 1.1607859835299417, + "learning_rate": 6.786748601705341e-06, + "loss": 0.2632, + "step": 5835 + }, + { + "epoch": 2.013101189450095, + "grad_norm": 1.144078788264028, + "learning_rate": 6.781533622954615e-06, + "loss": 0.2149, + "step": 5840 + }, + { + "epoch": 2.0148250301672124, + "grad_norm": 1.0973643308018433, + "learning_rate": 6.776316423415015e-06, + "loss": 0.2475, + "step": 5845 + }, + { + "epoch": 2.01654887088433, + "grad_norm": 1.13899084035407, + "learning_rate": 6.7710970095900956e-06, + "loss": 0.2337, + "step": 5850 + }, + { + "epoch": 2.018272711601448, + "grad_norm": 1.8061306377516608, + "learning_rate": 6.76587538798617e-06, + "loss": 0.2657, + "step": 5855 + }, + { + "epoch": 2.019996552318566, + "grad_norm": 1.0832860811873404, + "learning_rate": 6.7606515651123e-06, + "loss": 0.2378, + "step": 5860 + }, + { + "epoch": 2.0217203930356833, + "grad_norm": 1.0390851457035901, + "learning_rate": 6.755425547480301e-06, + "loss": 0.2438, + "step": 5865 + }, + { + "epoch": 2.023444233752801, + "grad_norm": 1.105889528965176, + "learning_rate": 6.750197341604714e-06, + "loss": 0.2285, + "step": 5870 + }, + { + "epoch": 2.025168074469919, + "grad_norm": 1.219541978578336, + "learning_rate": 6.744966954002816e-06, + "loss": 0.221, + "step": 5875 + }, + { + "epoch": 2.026891915187037, + "grad_norm": 1.3359271211676251, + "learning_rate": 6.7397343911945965e-06, + "loss": 0.2187, + "step": 5880 + }, + { + "epoch": 2.0286157559041547, + "grad_norm": 1.1947466098000317, + "learning_rate": 6.734499659702761e-06, + "loss": 0.2302, + "step": 5885 + }, + { + "epoch": 2.030339596621272, + "grad_norm": 1.1116329595278311, + "learning_rate": 6.72926276605272e-06, + "loss": 0.2407, + "step": 5890 + }, + { + "epoch": 2.03206343733839, + "grad_norm": 1.1124699826421875, + "learning_rate": 6.724023716772573e-06, + "loss": 0.2289, + "step": 5895 + }, + { + "epoch": 2.0337872780555077, + "grad_norm": 1.0757617090304594, + "learning_rate": 6.718782518393111e-06, + "loss": 0.2197, + "step": 5900 + }, + { + "epoch": 2.0355111187726256, + "grad_norm": 1.2394833449760483, + "learning_rate": 6.713539177447805e-06, + "loss": 0.2228, + "step": 5905 + }, + { + "epoch": 2.037234959489743, + "grad_norm": 1.0921680519718104, + "learning_rate": 6.708293700472792e-06, + "loss": 0.2539, + "step": 5910 + }, + { + "epoch": 2.038958800206861, + "grad_norm": 1.1967162319205045, + "learning_rate": 6.703046094006878e-06, + "loss": 0.2396, + "step": 5915 + }, + { + "epoch": 2.0406826409239787, + "grad_norm": 1.108450216113543, + "learning_rate": 6.697796364591517e-06, + "loss": 0.2349, + "step": 5920 + }, + { + "epoch": 2.0424064816410965, + "grad_norm": 1.0881400098795595, + "learning_rate": 6.692544518770816e-06, + "loss": 0.2092, + "step": 5925 + }, + { + "epoch": 2.044130322358214, + "grad_norm": 1.1642194846324272, + "learning_rate": 6.687290563091515e-06, + "loss": 0.2511, + "step": 5930 + }, + { + "epoch": 2.0458541630753317, + "grad_norm": 1.0733848179823602, + "learning_rate": 6.682034504102987e-06, + "loss": 0.2416, + "step": 5935 + }, + { + "epoch": 2.0475780037924496, + "grad_norm": 1.279626520702259, + "learning_rate": 6.676776348357224e-06, + "loss": 0.2035, + "step": 5940 + }, + { + "epoch": 2.0493018445095674, + "grad_norm": 1.1210734611683955, + "learning_rate": 6.671516102408833e-06, + "loss": 0.2464, + "step": 5945 + }, + { + "epoch": 2.0510256852266853, + "grad_norm": 1.0689961412139548, + "learning_rate": 6.66625377281503e-06, + "loss": 0.2009, + "step": 5950 + }, + { + "epoch": 2.0527495259438027, + "grad_norm": 1.1387837258799702, + "learning_rate": 6.660989366135624e-06, + "loss": 0.2417, + "step": 5955 + }, + { + "epoch": 2.0544733666609205, + "grad_norm": 1.272758419732411, + "learning_rate": 6.655722888933016e-06, + "loss": 0.2581, + "step": 5960 + }, + { + "epoch": 2.0561972073780383, + "grad_norm": 1.2140821035581335, + "learning_rate": 6.650454347772184e-06, + "loss": 0.2139, + "step": 5965 + }, + { + "epoch": 2.057921048095156, + "grad_norm": 1.084407306601656, + "learning_rate": 6.645183749220685e-06, + "loss": 0.2662, + "step": 5970 + }, + { + "epoch": 2.0596448888122736, + "grad_norm": 1.0838704341161596, + "learning_rate": 6.639911099848636e-06, + "loss": 0.24, + "step": 5975 + }, + { + "epoch": 2.0613687295293914, + "grad_norm": 1.1914678111301553, + "learning_rate": 6.634636406228711e-06, + "loss": 0.2549, + "step": 5980 + }, + { + "epoch": 2.0630925702465093, + "grad_norm": 1.1044635016861235, + "learning_rate": 6.629359674936132e-06, + "loss": 0.2451, + "step": 5985 + }, + { + "epoch": 2.064816410963627, + "grad_norm": 1.1536147284580867, + "learning_rate": 6.624080912548665e-06, + "loss": 0.2476, + "step": 5990 + }, + { + "epoch": 2.0665402516807445, + "grad_norm": 1.1421925526144936, + "learning_rate": 6.6188001256466025e-06, + "loss": 0.241, + "step": 5995 + }, + { + "epoch": 2.0682640923978624, + "grad_norm": 2.1379547217703285, + "learning_rate": 6.613517320812766e-06, + "loss": 0.2164, + "step": 6000 + }, + { + "epoch": 2.06998793311498, + "grad_norm": 1.1122295896891858, + "learning_rate": 6.608232504632486e-06, + "loss": 0.2376, + "step": 6005 + }, + { + "epoch": 2.071711773832098, + "grad_norm": 1.039917351999052, + "learning_rate": 6.602945683693605e-06, + "loss": 0.2443, + "step": 6010 + }, + { + "epoch": 2.073435614549216, + "grad_norm": 1.1600517769636751, + "learning_rate": 6.597656864586466e-06, + "loss": 0.2436, + "step": 6015 + }, + { + "epoch": 2.0751594552663333, + "grad_norm": 1.1065203329203004, + "learning_rate": 6.5923660539038995e-06, + "loss": 0.2369, + "step": 6020 + }, + { + "epoch": 2.076883295983451, + "grad_norm": 1.258710551388086, + "learning_rate": 6.587073258241215e-06, + "loss": 0.2524, + "step": 6025 + }, + { + "epoch": 2.078607136700569, + "grad_norm": 1.1089161354480546, + "learning_rate": 6.581778484196206e-06, + "loss": 0.216, + "step": 6030 + }, + { + "epoch": 2.080330977417687, + "grad_norm": 1.286041663946239, + "learning_rate": 6.576481738369126e-06, + "loss": 0.2483, + "step": 6035 + }, + { + "epoch": 2.082054818134804, + "grad_norm": 1.0615172291224826, + "learning_rate": 6.571183027362686e-06, + "loss": 0.2267, + "step": 6040 + }, + { + "epoch": 2.083778658851922, + "grad_norm": 1.144691418124422, + "learning_rate": 6.565882357782048e-06, + "loss": 0.2214, + "step": 6045 + }, + { + "epoch": 2.08550249956904, + "grad_norm": 1.1203900966572184, + "learning_rate": 6.5605797362348175e-06, + "loss": 0.2148, + "step": 6050 + }, + { + "epoch": 2.0872263402861577, + "grad_norm": 1.164264894518462, + "learning_rate": 6.555275169331031e-06, + "loss": 0.2353, + "step": 6055 + }, + { + "epoch": 2.088950181003275, + "grad_norm": 1.111396306734735, + "learning_rate": 6.5499686636831485e-06, + "loss": 0.2371, + "step": 6060 + }, + { + "epoch": 2.090674021720393, + "grad_norm": 1.484226472860949, + "learning_rate": 6.54466022590605e-06, + "loss": 0.2284, + "step": 6065 + }, + { + "epoch": 2.092397862437511, + "grad_norm": 1.2784600960385353, + "learning_rate": 6.539349862617023e-06, + "loss": 0.2637, + "step": 6070 + }, + { + "epoch": 2.0941217031546286, + "grad_norm": 2.6398567990968984, + "learning_rate": 6.534037580435753e-06, + "loss": 0.2326, + "step": 6075 + }, + { + "epoch": 2.095845543871746, + "grad_norm": 1.0054398119210661, + "learning_rate": 6.528723385984322e-06, + "loss": 0.2433, + "step": 6080 + }, + { + "epoch": 2.097569384588864, + "grad_norm": 1.177616826803118, + "learning_rate": 6.523407285887192e-06, + "loss": 0.2122, + "step": 6085 + }, + { + "epoch": 2.0992932253059817, + "grad_norm": 1.2851218049211262, + "learning_rate": 6.5180892867711996e-06, + "loss": 0.238, + "step": 6090 + }, + { + "epoch": 2.1010170660230996, + "grad_norm": 1.1495087647733981, + "learning_rate": 6.512769395265556e-06, + "loss": 0.2006, + "step": 6095 + }, + { + "epoch": 2.1027409067402174, + "grad_norm": 1.0225974867997085, + "learning_rate": 6.507447618001821e-06, + "loss": 0.2254, + "step": 6100 + }, + { + "epoch": 2.104464747457335, + "grad_norm": 1.137638390854424, + "learning_rate": 6.502123961613912e-06, + "loss": 0.2343, + "step": 6105 + }, + { + "epoch": 2.1061885881744526, + "grad_norm": 1.1809201545717292, + "learning_rate": 6.496798432738087e-06, + "loss": 0.2232, + "step": 6110 + }, + { + "epoch": 2.1079124288915705, + "grad_norm": 1.2408473497171375, + "learning_rate": 6.491471038012941e-06, + "loss": 0.2427, + "step": 6115 + }, + { + "epoch": 2.1096362696086883, + "grad_norm": 1.2093251008140802, + "learning_rate": 6.486141784079387e-06, + "loss": 0.2147, + "step": 6120 + }, + { + "epoch": 2.1113601103258057, + "grad_norm": 1.1284539751014209, + "learning_rate": 6.480810677580664e-06, + "loss": 0.2219, + "step": 6125 + }, + { + "epoch": 2.1130839510429236, + "grad_norm": 0.9947179313905126, + "learning_rate": 6.4754777251623166e-06, + "loss": 0.2403, + "step": 6130 + }, + { + "epoch": 2.1148077917600414, + "grad_norm": 1.207016487014047, + "learning_rate": 6.470142933472191e-06, + "loss": 0.2505, + "step": 6135 + }, + { + "epoch": 2.1165316324771593, + "grad_norm": 1.109965366005483, + "learning_rate": 6.464806309160427e-06, + "loss": 0.2289, + "step": 6140 + }, + { + "epoch": 2.1182554731942767, + "grad_norm": 1.2296801343295378, + "learning_rate": 6.4594678588794445e-06, + "loss": 0.249, + "step": 6145 + }, + { + "epoch": 2.1199793139113945, + "grad_norm": 1.1709946649721397, + "learning_rate": 6.454127589283945e-06, + "loss": 0.2609, + "step": 6150 + }, + { + "epoch": 2.1217031546285123, + "grad_norm": 1.0256537521065312, + "learning_rate": 6.448785507030898e-06, + "loss": 0.2485, + "step": 6155 + }, + { + "epoch": 2.12342699534563, + "grad_norm": 1.2294744683344365, + "learning_rate": 6.443441618779528e-06, + "loss": 0.2316, + "step": 6160 + }, + { + "epoch": 2.1251508360627476, + "grad_norm": 1.1751304786398409, + "learning_rate": 6.438095931191315e-06, + "loss": 0.2315, + "step": 6165 + }, + { + "epoch": 2.1268746767798654, + "grad_norm": 1.0694001684270091, + "learning_rate": 6.432748450929977e-06, + "loss": 0.256, + "step": 6170 + }, + { + "epoch": 2.1285985174969833, + "grad_norm": 1.5918972890666852, + "learning_rate": 6.4273991846614735e-06, + "loss": 0.234, + "step": 6175 + }, + { + "epoch": 2.130322358214101, + "grad_norm": 2.1061165243228013, + "learning_rate": 6.422048139053987e-06, + "loss": 0.2231, + "step": 6180 + }, + { + "epoch": 2.132046198931219, + "grad_norm": 1.2980001891437871, + "learning_rate": 6.416695320777915e-06, + "loss": 0.2355, + "step": 6185 + }, + { + "epoch": 2.1337700396483363, + "grad_norm": 1.2013658421140967, + "learning_rate": 6.411340736505869e-06, + "loss": 0.2312, + "step": 6190 + }, + { + "epoch": 2.135493880365454, + "grad_norm": 1.4480279744210514, + "learning_rate": 6.4059843929126605e-06, + "loss": 0.2721, + "step": 6195 + }, + { + "epoch": 2.137217721082572, + "grad_norm": 0.9910186749108644, + "learning_rate": 6.400626296675296e-06, + "loss": 0.2449, + "step": 6200 + }, + { + "epoch": 2.13894156179969, + "grad_norm": 1.4704006413199056, + "learning_rate": 6.395266454472963e-06, + "loss": 0.2501, + "step": 6205 + }, + { + "epoch": 2.1406654025168073, + "grad_norm": 1.210407783203279, + "learning_rate": 6.389904872987025e-06, + "loss": 0.235, + "step": 6210 + }, + { + "epoch": 2.142389243233925, + "grad_norm": 1.266712887450536, + "learning_rate": 6.384541558901021e-06, + "loss": 0.274, + "step": 6215 + }, + { + "epoch": 2.144113083951043, + "grad_norm": 1.0758792414175562, + "learning_rate": 6.37917651890064e-06, + "loss": 0.2246, + "step": 6220 + }, + { + "epoch": 2.145836924668161, + "grad_norm": 1.148378596934074, + "learning_rate": 6.373809759673733e-06, + "loss": 0.195, + "step": 6225 + }, + { + "epoch": 2.147560765385278, + "grad_norm": 1.2369163372033345, + "learning_rate": 6.368441287910281e-06, + "loss": 0.2368, + "step": 6230 + }, + { + "epoch": 2.149284606102396, + "grad_norm": 1.1605685315411505, + "learning_rate": 6.3630711103024125e-06, + "loss": 0.2299, + "step": 6235 + }, + { + "epoch": 2.151008446819514, + "grad_norm": 1.153738101931673, + "learning_rate": 6.3576992335443764e-06, + "loss": 0.229, + "step": 6240 + }, + { + "epoch": 2.1527322875366317, + "grad_norm": 1.061980878109314, + "learning_rate": 6.352325664332539e-06, + "loss": 0.2296, + "step": 6245 + }, + { + "epoch": 2.1544561282537495, + "grad_norm": 0.970850560174058, + "learning_rate": 6.346950409365377e-06, + "loss": 0.2237, + "step": 6250 + }, + { + "epoch": 2.156179968970867, + "grad_norm": 1.1016916099458374, + "learning_rate": 6.3415734753434736e-06, + "loss": 0.218, + "step": 6255 + }, + { + "epoch": 2.157903809687985, + "grad_norm": 1.193448165025695, + "learning_rate": 6.336194868969495e-06, + "loss": 0.2224, + "step": 6260 + }, + { + "epoch": 2.1596276504051026, + "grad_norm": 1.0219575670537784, + "learning_rate": 6.3308145969482005e-06, + "loss": 0.236, + "step": 6265 + }, + { + "epoch": 2.1613514911222205, + "grad_norm": 1.272655310419002, + "learning_rate": 6.325432665986423e-06, + "loss": 0.2487, + "step": 6270 + }, + { + "epoch": 2.163075331839338, + "grad_norm": 1.8152147982265001, + "learning_rate": 6.320049082793063e-06, + "loss": 0.2259, + "step": 6275 + }, + { + "epoch": 2.1647991725564557, + "grad_norm": 1.0786103760624521, + "learning_rate": 6.314663854079081e-06, + "loss": 0.2337, + "step": 6280 + }, + { + "epoch": 2.1665230132735736, + "grad_norm": 1.0904656874530174, + "learning_rate": 6.309276986557489e-06, + "loss": 0.23, + "step": 6285 + }, + { + "epoch": 2.1682468539906914, + "grad_norm": 1.131407405814106, + "learning_rate": 6.30388848694334e-06, + "loss": 0.2118, + "step": 6290 + }, + { + "epoch": 2.169970694707809, + "grad_norm": 2.220220145462081, + "learning_rate": 6.298498361953723e-06, + "loss": 0.2185, + "step": 6295 + }, + { + "epoch": 2.1716945354249266, + "grad_norm": 1.1171046668981954, + "learning_rate": 6.293106618307757e-06, + "loss": 0.2689, + "step": 6300 + }, + { + "epoch": 2.1734183761420445, + "grad_norm": 1.3104517671771276, + "learning_rate": 6.287713262726571e-06, + "loss": 0.2286, + "step": 6305 + }, + { + "epoch": 2.1751422168591623, + "grad_norm": 1.2071183923627506, + "learning_rate": 6.2823183019333085e-06, + "loss": 0.2403, + "step": 6310 + }, + { + "epoch": 2.17686605757628, + "grad_norm": 1.5815496517793666, + "learning_rate": 6.276921742653113e-06, + "loss": 0.2189, + "step": 6315 + }, + { + "epoch": 2.1785898982933976, + "grad_norm": 1.1528626189674833, + "learning_rate": 6.271523591613121e-06, + "loss": 0.2688, + "step": 6320 + }, + { + "epoch": 2.1803137390105154, + "grad_norm": 1.154833714772765, + "learning_rate": 6.266123855542452e-06, + "loss": 0.2511, + "step": 6325 + }, + { + "epoch": 2.1820375797276332, + "grad_norm": 1.1271706581188135, + "learning_rate": 6.2607225411722005e-06, + "loss": 0.2356, + "step": 6330 + }, + { + "epoch": 2.183761420444751, + "grad_norm": 1.465635437555139, + "learning_rate": 6.255319655235432e-06, + "loss": 0.2405, + "step": 6335 + }, + { + "epoch": 2.1854852611618685, + "grad_norm": 1.0110337846358128, + "learning_rate": 6.249915204467168e-06, + "loss": 0.2353, + "step": 6340 + }, + { + "epoch": 2.1872091018789863, + "grad_norm": 1.264188343176243, + "learning_rate": 6.244509195604383e-06, + "loss": 0.2179, + "step": 6345 + }, + { + "epoch": 2.188932942596104, + "grad_norm": 1.1855417116942413, + "learning_rate": 6.2391016353859914e-06, + "loss": 0.2331, + "step": 6350 + }, + { + "epoch": 2.190656783313222, + "grad_norm": 1.121771969018659, + "learning_rate": 6.23369253055284e-06, + "loss": 0.2319, + "step": 6355 + }, + { + "epoch": 2.1923806240303394, + "grad_norm": 2.979733543546792, + "learning_rate": 6.228281887847708e-06, + "loss": 0.2363, + "step": 6360 + }, + { + "epoch": 2.1941044647474572, + "grad_norm": 0.945998582169751, + "learning_rate": 6.222869714015284e-06, + "loss": 0.2006, + "step": 6365 + }, + { + "epoch": 2.195828305464575, + "grad_norm": 1.1385376045750768, + "learning_rate": 6.21745601580217e-06, + "loss": 0.2387, + "step": 6370 + }, + { + "epoch": 2.197552146181693, + "grad_norm": 1.1470900066181131, + "learning_rate": 6.212040799956865e-06, + "loss": 0.2217, + "step": 6375 + }, + { + "epoch": 2.1992759868988108, + "grad_norm": 1.0910786770760603, + "learning_rate": 6.206624073229763e-06, + "loss": 0.2384, + "step": 6380 + }, + { + "epoch": 2.200999827615928, + "grad_norm": 1.21836785295153, + "learning_rate": 6.201205842373139e-06, + "loss": 0.2408, + "step": 6385 + }, + { + "epoch": 2.202723668333046, + "grad_norm": 1.1550357487539422, + "learning_rate": 6.195786114141145e-06, + "loss": 0.2151, + "step": 6390 + }, + { + "epoch": 2.204447509050164, + "grad_norm": 2.0126094991818735, + "learning_rate": 6.190364895289796e-06, + "loss": 0.2258, + "step": 6395 + }, + { + "epoch": 2.2061713497672817, + "grad_norm": 1.1759885295782768, + "learning_rate": 6.18494219257697e-06, + "loss": 0.2282, + "step": 6400 + }, + { + "epoch": 2.207895190484399, + "grad_norm": 1.156765171706817, + "learning_rate": 6.179518012762391e-06, + "loss": 0.2221, + "step": 6405 + }, + { + "epoch": 2.209619031201517, + "grad_norm": 1.2152068175854134, + "learning_rate": 6.174092362607627e-06, + "loss": 0.2365, + "step": 6410 + }, + { + "epoch": 2.2113428719186348, + "grad_norm": 1.1065954471480959, + "learning_rate": 6.1686652488760735e-06, + "loss": 0.2422, + "step": 6415 + }, + { + "epoch": 2.2130667126357526, + "grad_norm": 1.1496019358592222, + "learning_rate": 6.163236678332959e-06, + "loss": 0.2206, + "step": 6420 + }, + { + "epoch": 2.21479055335287, + "grad_norm": 1.1821890228767273, + "learning_rate": 6.157806657745321e-06, + "loss": 0.2339, + "step": 6425 + }, + { + "epoch": 2.216514394069988, + "grad_norm": 1.2580278526002249, + "learning_rate": 6.1523751938820085e-06, + "loss": 0.2358, + "step": 6430 + }, + { + "epoch": 2.2182382347871057, + "grad_norm": 1.299244265008732, + "learning_rate": 6.146942293513665e-06, + "loss": 0.2529, + "step": 6435 + }, + { + "epoch": 2.2199620755042235, + "grad_norm": 1.2456429273924774, + "learning_rate": 6.141507963412732e-06, + "loss": 0.2391, + "step": 6440 + }, + { + "epoch": 2.2216859162213414, + "grad_norm": 1.0876578922412434, + "learning_rate": 6.1360722103534255e-06, + "loss": 0.2241, + "step": 6445 + }, + { + "epoch": 2.2234097569384588, + "grad_norm": 1.163992161362365, + "learning_rate": 6.130635041111741e-06, + "loss": 0.2337, + "step": 6450 + }, + { + "epoch": 2.2251335976555766, + "grad_norm": 1.305738602201025, + "learning_rate": 6.125196462465435e-06, + "loss": 0.2314, + "step": 6455 + }, + { + "epoch": 2.2268574383726945, + "grad_norm": 1.224758584645192, + "learning_rate": 6.119756481194025e-06, + "loss": 0.2251, + "step": 6460 + }, + { + "epoch": 2.2285812790898123, + "grad_norm": 1.2406266375311112, + "learning_rate": 6.1143151040787755e-06, + "loss": 0.2398, + "step": 6465 + }, + { + "epoch": 2.2303051198069297, + "grad_norm": 1.3967656523094172, + "learning_rate": 6.108872337902688e-06, + "loss": 0.2192, + "step": 6470 + }, + { + "epoch": 2.2320289605240475, + "grad_norm": 1.1804813959831304, + "learning_rate": 6.1034281894505e-06, + "loss": 0.2133, + "step": 6475 + }, + { + "epoch": 2.2337528012411654, + "grad_norm": 1.270472137454462, + "learning_rate": 6.0979826655086695e-06, + "loss": 0.2446, + "step": 6480 + }, + { + "epoch": 2.235476641958283, + "grad_norm": 1.0585835578808256, + "learning_rate": 6.09253577286537e-06, + "loss": 0.222, + "step": 6485 + }, + { + "epoch": 2.2372004826754006, + "grad_norm": 1.2609989086875364, + "learning_rate": 6.087087518310482e-06, + "loss": 0.2413, + "step": 6490 + }, + { + "epoch": 2.2389243233925185, + "grad_norm": 1.333218721228253, + "learning_rate": 6.081637908635581e-06, + "loss": 0.228, + "step": 6495 + }, + { + "epoch": 2.2406481641096363, + "grad_norm": 1.0670271069980255, + "learning_rate": 6.076186950633932e-06, + "loss": 0.2056, + "step": 6500 + }, + { + "epoch": 2.242372004826754, + "grad_norm": 1.455046424465367, + "learning_rate": 6.070734651100486e-06, + "loss": 0.2441, + "step": 6505 + }, + { + "epoch": 2.244095845543872, + "grad_norm": 1.2331558263115812, + "learning_rate": 6.065281016831861e-06, + "loss": 0.2075, + "step": 6510 + }, + { + "epoch": 2.2458196862609894, + "grad_norm": 1.2117871822689017, + "learning_rate": 6.059826054626338e-06, + "loss": 0.2464, + "step": 6515 + }, + { + "epoch": 2.247543526978107, + "grad_norm": 1.1156968508704466, + "learning_rate": 6.054369771283861e-06, + "loss": 0.2264, + "step": 6520 + }, + { + "epoch": 2.249267367695225, + "grad_norm": 1.4028013307117346, + "learning_rate": 6.04891217360601e-06, + "loss": 0.2165, + "step": 6525 + }, + { + "epoch": 2.2509912084123425, + "grad_norm": 1.0582071748324666, + "learning_rate": 6.0434532683960134e-06, + "loss": 0.2026, + "step": 6530 + }, + { + "epoch": 2.2527150491294603, + "grad_norm": 1.1495646956895642, + "learning_rate": 6.03799306245872e-06, + "loss": 0.2301, + "step": 6535 + }, + { + "epoch": 2.254438889846578, + "grad_norm": 1.1465172093749207, + "learning_rate": 6.03253156260061e-06, + "loss": 0.2042, + "step": 6540 + }, + { + "epoch": 2.256162730563696, + "grad_norm": 1.124781134057256, + "learning_rate": 6.027068775629768e-06, + "loss": 0.241, + "step": 6545 + }, + { + "epoch": 2.257886571280814, + "grad_norm": 1.3024720247117805, + "learning_rate": 6.02160470835589e-06, + "loss": 0.228, + "step": 6550 + }, + { + "epoch": 2.2596104119979312, + "grad_norm": 1.3853558749547408, + "learning_rate": 6.016139367590263e-06, + "loss": 0.2256, + "step": 6555 + }, + { + "epoch": 2.261334252715049, + "grad_norm": 1.2302090593193726, + "learning_rate": 6.010672760145762e-06, + "loss": 0.247, + "step": 6560 + }, + { + "epoch": 2.263058093432167, + "grad_norm": 1.2159809112138422, + "learning_rate": 6.005204892836843e-06, + "loss": 0.2096, + "step": 6565 + }, + { + "epoch": 2.2647819341492847, + "grad_norm": 1.1668358935339171, + "learning_rate": 5.9997357724795325e-06, + "loss": 0.2445, + "step": 6570 + }, + { + "epoch": 2.2665057748664026, + "grad_norm": 1.0030758336245051, + "learning_rate": 5.9942654058914184e-06, + "loss": 0.2268, + "step": 6575 + }, + { + "epoch": 2.26822961558352, + "grad_norm": 1.220413035318449, + "learning_rate": 5.988793799891639e-06, + "loss": 0.2362, + "step": 6580 + }, + { + "epoch": 2.269953456300638, + "grad_norm": 1.2409356225757247, + "learning_rate": 5.983320961300886e-06, + "loss": 0.2218, + "step": 6585 + }, + { + "epoch": 2.2716772970177557, + "grad_norm": 1.1814271142656403, + "learning_rate": 5.977846896941376e-06, + "loss": 0.2321, + "step": 6590 + }, + { + "epoch": 2.273401137734873, + "grad_norm": 1.1844509666036434, + "learning_rate": 5.972371613636863e-06, + "loss": 0.2197, + "step": 6595 + }, + { + "epoch": 2.275124978451991, + "grad_norm": 1.3058894691253469, + "learning_rate": 5.966895118212615e-06, + "loss": 0.2438, + "step": 6600 + }, + { + "epoch": 2.2768488191691088, + "grad_norm": 1.2269709989697146, + "learning_rate": 5.961417417495416e-06, + "loss": 0.236, + "step": 6605 + }, + { + "epoch": 2.2785726598862266, + "grad_norm": 1.2241053358319587, + "learning_rate": 5.955938518313549e-06, + "loss": 0.2181, + "step": 6610 + }, + { + "epoch": 2.2802965006033444, + "grad_norm": 0.9522467273279619, + "learning_rate": 5.950458427496789e-06, + "loss": 0.235, + "step": 6615 + }, + { + "epoch": 2.282020341320462, + "grad_norm": 1.266499686373801, + "learning_rate": 5.944977151876402e-06, + "loss": 0.2462, + "step": 6620 + }, + { + "epoch": 2.2837441820375797, + "grad_norm": 1.2230910063692286, + "learning_rate": 5.939494698285125e-06, + "loss": 0.2204, + "step": 6625 + }, + { + "epoch": 2.2854680227546975, + "grad_norm": 1.1978976267327752, + "learning_rate": 5.934011073557169e-06, + "loss": 0.2208, + "step": 6630 + }, + { + "epoch": 2.2871918634718154, + "grad_norm": 1.0298786417340702, + "learning_rate": 5.928526284528202e-06, + "loss": 0.215, + "step": 6635 + }, + { + "epoch": 2.2889157041889328, + "grad_norm": 1.1331157635471427, + "learning_rate": 5.923040338035339e-06, + "loss": 0.2241, + "step": 6640 + }, + { + "epoch": 2.2906395449060506, + "grad_norm": 1.0687985229605128, + "learning_rate": 5.917553240917151e-06, + "loss": 0.221, + "step": 6645 + }, + { + "epoch": 2.2923633856231684, + "grad_norm": 1.1676284370384098, + "learning_rate": 5.912065000013627e-06, + "loss": 0.2264, + "step": 6650 + }, + { + "epoch": 2.2940872263402863, + "grad_norm": 1.1272516773874, + "learning_rate": 5.906575622166193e-06, + "loss": 0.2151, + "step": 6655 + }, + { + "epoch": 2.2958110670574037, + "grad_norm": 1.2006669040848492, + "learning_rate": 5.9010851142176884e-06, + "loss": 0.1966, + "step": 6660 + }, + { + "epoch": 2.2975349077745215, + "grad_norm": 1.0535753392295237, + "learning_rate": 5.895593483012362e-06, + "loss": 0.1946, + "step": 6665 + }, + { + "epoch": 2.2992587484916394, + "grad_norm": 1.4714542006763698, + "learning_rate": 5.890100735395864e-06, + "loss": 0.2463, + "step": 6670 + }, + { + "epoch": 2.300982589208757, + "grad_norm": 1.0413354232245149, + "learning_rate": 5.884606878215231e-06, + "loss": 0.2246, + "step": 6675 + }, + { + "epoch": 2.302706429925875, + "grad_norm": 1.186952337907024, + "learning_rate": 5.87911191831889e-06, + "loss": 0.2285, + "step": 6680 + }, + { + "epoch": 2.3044302706429924, + "grad_norm": 1.1518388534893138, + "learning_rate": 5.873615862556636e-06, + "loss": 0.2093, + "step": 6685 + }, + { + "epoch": 2.3061541113601103, + "grad_norm": 0.925470444073306, + "learning_rate": 5.868118717779636e-06, + "loss": 0.183, + "step": 6690 + }, + { + "epoch": 2.307877952077228, + "grad_norm": 1.1658438303375276, + "learning_rate": 5.8626204908404125e-06, + "loss": 0.235, + "step": 6695 + }, + { + "epoch": 2.309601792794346, + "grad_norm": 1.2464114844329484, + "learning_rate": 5.857121188592834e-06, + "loss": 0.2476, + "step": 6700 + }, + { + "epoch": 2.3113256335114634, + "grad_norm": 1.2086437497264788, + "learning_rate": 5.851620817892112e-06, + "loss": 0.2385, + "step": 6705 + }, + { + "epoch": 2.313049474228581, + "grad_norm": 1.2879945468527676, + "learning_rate": 5.846119385594789e-06, + "loss": 0.2325, + "step": 6710 + }, + { + "epoch": 2.314773314945699, + "grad_norm": 1.2630188960941966, + "learning_rate": 5.840616898558734e-06, + "loss": 0.2393, + "step": 6715 + }, + { + "epoch": 2.316497155662817, + "grad_norm": 1.2771883079158053, + "learning_rate": 5.835113363643126e-06, + "loss": 0.2041, + "step": 6720 + }, + { + "epoch": 2.3182209963799343, + "grad_norm": 1.2023618651995445, + "learning_rate": 5.829608787708454e-06, + "loss": 0.2291, + "step": 6725 + }, + { + "epoch": 2.319944837097052, + "grad_norm": 1.0785269405748918, + "learning_rate": 5.8241031776165035e-06, + "loss": 0.253, + "step": 6730 + }, + { + "epoch": 2.32166867781417, + "grad_norm": 1.151900270329759, + "learning_rate": 5.818596540230346e-06, + "loss": 0.2173, + "step": 6735 + }, + { + "epoch": 2.323392518531288, + "grad_norm": 1.161962981954948, + "learning_rate": 5.8130888824143384e-06, + "loss": 0.2003, + "step": 6740 + }, + { + "epoch": 2.3251163592484057, + "grad_norm": 1.505530224772604, + "learning_rate": 5.807580211034106e-06, + "loss": 0.2142, + "step": 6745 + }, + { + "epoch": 2.326840199965523, + "grad_norm": 1.2105429985143288, + "learning_rate": 5.802070532956542e-06, + "loss": 0.2103, + "step": 6750 + }, + { + "epoch": 2.328564040682641, + "grad_norm": 1.306197584762026, + "learning_rate": 5.796559855049791e-06, + "loss": 0.2245, + "step": 6755 + }, + { + "epoch": 2.3302878813997587, + "grad_norm": 1.5574206245981117, + "learning_rate": 5.7910481841832424e-06, + "loss": 0.22, + "step": 6760 + }, + { + "epoch": 2.3320117221168766, + "grad_norm": 1.1763708628501528, + "learning_rate": 5.785535527227527e-06, + "loss": 0.2179, + "step": 6765 + }, + { + "epoch": 2.333735562833994, + "grad_norm": 1.1856196939443528, + "learning_rate": 5.780021891054504e-06, + "loss": 0.2186, + "step": 6770 + }, + { + "epoch": 2.335459403551112, + "grad_norm": 1.1408446330456214, + "learning_rate": 5.774507282537251e-06, + "loss": 0.2172, + "step": 6775 + }, + { + "epoch": 2.3371832442682297, + "grad_norm": 1.151214016085517, + "learning_rate": 5.7689917085500625e-06, + "loss": 0.2345, + "step": 6780 + }, + { + "epoch": 2.3389070849853475, + "grad_norm": 1.0477149333534155, + "learning_rate": 5.763475175968429e-06, + "loss": 0.2131, + "step": 6785 + }, + { + "epoch": 2.340630925702465, + "grad_norm": 1.0639308549219988, + "learning_rate": 5.7579576916690465e-06, + "loss": 0.2146, + "step": 6790 + }, + { + "epoch": 2.3423547664195827, + "grad_norm": 1.1565149305617368, + "learning_rate": 5.752439262529784e-06, + "loss": 0.1999, + "step": 6795 + }, + { + "epoch": 2.3440786071367006, + "grad_norm": 1.219503174438488, + "learning_rate": 5.7469198954297005e-06, + "loss": 0.2057, + "step": 6800 + }, + { + "epoch": 2.3458024478538184, + "grad_norm": 1.1980585499435352, + "learning_rate": 5.7413995972490174e-06, + "loss": 0.2265, + "step": 6805 + }, + { + "epoch": 2.3475262885709363, + "grad_norm": 1.3185695988317838, + "learning_rate": 5.7358783748691194e-06, + "loss": 0.2498, + "step": 6810 + }, + { + "epoch": 2.3492501292880537, + "grad_norm": 1.1976799909402038, + "learning_rate": 5.730356235172543e-06, + "loss": 0.2132, + "step": 6815 + }, + { + "epoch": 2.3509739700051715, + "grad_norm": 1.1704062573965308, + "learning_rate": 5.724833185042965e-06, + "loss": 0.2334, + "step": 6820 + }, + { + "epoch": 2.3526978107222893, + "grad_norm": 1.1114083975412845, + "learning_rate": 5.719309231365202e-06, + "loss": 0.2091, + "step": 6825 + }, + { + "epoch": 2.354421651439407, + "grad_norm": 1.1393640520800679, + "learning_rate": 5.713784381025194e-06, + "loss": 0.2236, + "step": 6830 + }, + { + "epoch": 2.3561454921565246, + "grad_norm": 1.134556435275592, + "learning_rate": 5.7082586409100005e-06, + "loss": 0.2056, + "step": 6835 + }, + { + "epoch": 2.3578693328736424, + "grad_norm": 1.087144351801372, + "learning_rate": 5.702732017907788e-06, + "loss": 0.2081, + "step": 6840 + }, + { + "epoch": 2.3595931735907603, + "grad_norm": 1.540543465229459, + "learning_rate": 5.697204518907823e-06, + "loss": 0.228, + "step": 6845 + }, + { + "epoch": 2.361317014307878, + "grad_norm": 1.14562385187818, + "learning_rate": 5.69167615080047e-06, + "loss": 0.2152, + "step": 6850 + }, + { + "epoch": 2.3630408550249955, + "grad_norm": 1.2752884151374904, + "learning_rate": 5.686146920477169e-06, + "loss": 0.1976, + "step": 6855 + }, + { + "epoch": 2.3647646957421133, + "grad_norm": 1.1263082918927592, + "learning_rate": 5.680616834830439e-06, + "loss": 0.226, + "step": 6860 + }, + { + "epoch": 2.366488536459231, + "grad_norm": 1.2588818960989305, + "learning_rate": 5.675085900753865e-06, + "loss": 0.2336, + "step": 6865 + }, + { + "epoch": 2.368212377176349, + "grad_norm": 1.0271877557731728, + "learning_rate": 5.669554125142089e-06, + "loss": 0.207, + "step": 6870 + }, + { + "epoch": 2.369936217893467, + "grad_norm": 1.9174249095640592, + "learning_rate": 5.664021514890804e-06, + "loss": 0.2111, + "step": 6875 + }, + { + "epoch": 2.3716600586105843, + "grad_norm": 1.150945943954569, + "learning_rate": 5.658488076896739e-06, + "loss": 0.2381, + "step": 6880 + }, + { + "epoch": 2.373383899327702, + "grad_norm": 1.1053485694021632, + "learning_rate": 5.6529538180576574e-06, + "loss": 0.2238, + "step": 6885 + }, + { + "epoch": 2.37510774004482, + "grad_norm": 1.114827238948087, + "learning_rate": 5.647418745272347e-06, + "loss": 0.2272, + "step": 6890 + }, + { + "epoch": 2.376831580761938, + "grad_norm": 1.211894614332284, + "learning_rate": 5.64188286544061e-06, + "loss": 0.2582, + "step": 6895 + }, + { + "epoch": 2.378555421479055, + "grad_norm": 1.0331736006740053, + "learning_rate": 5.636346185463254e-06, + "loss": 0.227, + "step": 6900 + }, + { + "epoch": 2.380279262196173, + "grad_norm": 1.2115995995308106, + "learning_rate": 5.630808712242081e-06, + "loss": 0.2308, + "step": 6905 + }, + { + "epoch": 2.382003102913291, + "grad_norm": 1.249508815570148, + "learning_rate": 5.6252704526798855e-06, + "loss": 0.2356, + "step": 6910 + }, + { + "epoch": 2.3837269436304087, + "grad_norm": 1.2668841587373507, + "learning_rate": 5.619731413680443e-06, + "loss": 0.2175, + "step": 6915 + }, + { + "epoch": 2.385450784347526, + "grad_norm": 1.004589133549024, + "learning_rate": 5.614191602148498e-06, + "loss": 0.193, + "step": 6920 + }, + { + "epoch": 2.387174625064644, + "grad_norm": 1.124115544035202, + "learning_rate": 5.6086510249897576e-06, + "loss": 0.2597, + "step": 6925 + }, + { + "epoch": 2.388898465781762, + "grad_norm": 1.0615474999083314, + "learning_rate": 5.603109689110887e-06, + "loss": 0.2226, + "step": 6930 + }, + { + "epoch": 2.3906223064988796, + "grad_norm": 1.2306413672173773, + "learning_rate": 5.597567601419496e-06, + "loss": 0.2213, + "step": 6935 + }, + { + "epoch": 2.3923461472159975, + "grad_norm": 1.173560341722914, + "learning_rate": 5.592024768824126e-06, + "loss": 0.225, + "step": 6940 + }, + { + "epoch": 2.394069987933115, + "grad_norm": 1.2633080823318978, + "learning_rate": 5.586481198234253e-06, + "loss": 0.2289, + "step": 6945 + }, + { + "epoch": 2.3957938286502327, + "grad_norm": 1.1173136748171157, + "learning_rate": 5.580936896560273e-06, + "loss": 0.2071, + "step": 6950 + }, + { + "epoch": 2.3975176693673506, + "grad_norm": 1.105475600922252, + "learning_rate": 5.57539187071349e-06, + "loss": 0.2146, + "step": 6955 + }, + { + "epoch": 2.399241510084468, + "grad_norm": 1.147403948431547, + "learning_rate": 5.569846127606115e-06, + "loss": 0.2115, + "step": 6960 + }, + { + "epoch": 2.400965350801586, + "grad_norm": 1.0777348019729607, + "learning_rate": 5.564299674151248e-06, + "loss": 0.1989, + "step": 6965 + }, + { + "epoch": 2.4026891915187036, + "grad_norm": 5.556592377891615, + "learning_rate": 5.558752517262877e-06, + "loss": 0.211, + "step": 6970 + }, + { + "epoch": 2.4044130322358215, + "grad_norm": 1.125771209945684, + "learning_rate": 5.553204663855868e-06, + "loss": 0.2231, + "step": 6975 + }, + { + "epoch": 2.4061368729529393, + "grad_norm": 1.1591474019986616, + "learning_rate": 5.547656120845953e-06, + "loss": 0.218, + "step": 6980 + }, + { + "epoch": 2.4078607136700567, + "grad_norm": 1.1585412151784338, + "learning_rate": 5.542106895149727e-06, + "loss": 0.2238, + "step": 6985 + }, + { + "epoch": 2.4095845543871746, + "grad_norm": 1.2253255427195415, + "learning_rate": 5.5365569936846294e-06, + "loss": 0.2375, + "step": 6990 + }, + { + "epoch": 2.4113083951042924, + "grad_norm": 1.1242386635647112, + "learning_rate": 5.531006423368953e-06, + "loss": 0.2164, + "step": 6995 + }, + { + "epoch": 2.4130322358214102, + "grad_norm": 1.3095471221364696, + "learning_rate": 5.5254551911218114e-06, + "loss": 0.2319, + "step": 7000 + }, + { + "epoch": 2.414756076538528, + "grad_norm": 1.0918409076710234, + "learning_rate": 5.519903303863153e-06, + "loss": 0.2326, + "step": 7005 + }, + { + "epoch": 2.4164799172556455, + "grad_norm": 1.3028504273451234, + "learning_rate": 5.514350768513738e-06, + "loss": 0.2142, + "step": 7010 + }, + { + "epoch": 2.4182037579727633, + "grad_norm": 1.0861765578487275, + "learning_rate": 5.5087975919951374e-06, + "loss": 0.1967, + "step": 7015 + }, + { + "epoch": 2.419927598689881, + "grad_norm": 1.1199745344635728, + "learning_rate": 5.503243781229719e-06, + "loss": 0.2099, + "step": 7020 + }, + { + "epoch": 2.4216514394069986, + "grad_norm": 1.2399659497777336, + "learning_rate": 5.497689343140642e-06, + "loss": 0.2293, + "step": 7025 + }, + { + "epoch": 2.4233752801241164, + "grad_norm": 1.1536479623138571, + "learning_rate": 5.4921342846518475e-06, + "loss": 0.2198, + "step": 7030 + }, + { + "epoch": 2.4250991208412342, + "grad_norm": 2.9483122443726, + "learning_rate": 5.486578612688051e-06, + "loss": 0.2122, + "step": 7035 + }, + { + "epoch": 2.426822961558352, + "grad_norm": 1.0502328215848433, + "learning_rate": 5.4810223341747315e-06, + "loss": 0.213, + "step": 7040 + }, + { + "epoch": 2.42854680227547, + "grad_norm": 1.2517019386272146, + "learning_rate": 5.4754654560381245e-06, + "loss": 0.2308, + "step": 7045 + }, + { + "epoch": 2.4302706429925873, + "grad_norm": 1.058335724534813, + "learning_rate": 5.469907985205212e-06, + "loss": 0.2048, + "step": 7050 + }, + { + "epoch": 2.431994483709705, + "grad_norm": 1.1595293528986441, + "learning_rate": 5.4643499286037195e-06, + "loss": 0.2176, + "step": 7055 + }, + { + "epoch": 2.433718324426823, + "grad_norm": 1.1693707538220317, + "learning_rate": 5.458791293162095e-06, + "loss": 0.208, + "step": 7060 + }, + { + "epoch": 2.435442165143941, + "grad_norm": 1.2031321093299394, + "learning_rate": 5.453232085809514e-06, + "loss": 0.2293, + "step": 7065 + }, + { + "epoch": 2.4371660058610582, + "grad_norm": 1.1133781109654826, + "learning_rate": 5.44767231347586e-06, + "loss": 0.2017, + "step": 7070 + }, + { + "epoch": 2.438889846578176, + "grad_norm": 1.0936315572392878, + "learning_rate": 5.442111983091729e-06, + "loss": 0.2327, + "step": 7075 + }, + { + "epoch": 2.440613687295294, + "grad_norm": 1.0291507351238356, + "learning_rate": 5.436551101588405e-06, + "loss": 0.2031, + "step": 7080 + }, + { + "epoch": 2.4423375280124118, + "grad_norm": 1.3108572256084867, + "learning_rate": 5.430989675897861e-06, + "loss": 0.2098, + "step": 7085 + }, + { + "epoch": 2.444061368729529, + "grad_norm": 1.1755260710981985, + "learning_rate": 5.425427712952748e-06, + "loss": 0.2159, + "step": 7090 + }, + { + "epoch": 2.445785209446647, + "grad_norm": 1.1144893583804825, + "learning_rate": 5.419865219686389e-06, + "loss": 0.2236, + "step": 7095 + }, + { + "epoch": 2.447509050163765, + "grad_norm": 1.1173215912035803, + "learning_rate": 5.414302203032766e-06, + "loss": 0.2341, + "step": 7100 + }, + { + "epoch": 2.4492328908808827, + "grad_norm": 1.287175299503757, + "learning_rate": 5.408738669926517e-06, + "loss": 0.1916, + "step": 7105 + }, + { + "epoch": 2.4509567315980005, + "grad_norm": 1.0611457253042027, + "learning_rate": 5.403174627302915e-06, + "loss": 0.2125, + "step": 7110 + }, + { + "epoch": 2.452680572315118, + "grad_norm": 1.1456019986733763, + "learning_rate": 5.397610082097879e-06, + "loss": 0.2121, + "step": 7115 + }, + { + "epoch": 2.4544044130322358, + "grad_norm": 1.236233787814864, + "learning_rate": 5.392045041247946e-06, + "loss": 0.2456, + "step": 7120 + }, + { + "epoch": 2.4561282537493536, + "grad_norm": 1.1703211485963458, + "learning_rate": 5.386479511690276e-06, + "loss": 0.2051, + "step": 7125 + }, + { + "epoch": 2.4578520944664715, + "grad_norm": 1.2366522779193592, + "learning_rate": 5.380913500362637e-06, + "loss": 0.2297, + "step": 7130 + }, + { + "epoch": 2.459575935183589, + "grad_norm": 1.2263911601430404, + "learning_rate": 5.375347014203395e-06, + "loss": 0.2327, + "step": 7135 + }, + { + "epoch": 2.4612997759007067, + "grad_norm": 2.226172866100081, + "learning_rate": 5.369780060151514e-06, + "loss": 0.2347, + "step": 7140 + }, + { + "epoch": 2.4630236166178245, + "grad_norm": 1.117299355633407, + "learning_rate": 5.364212645146533e-06, + "loss": 0.2153, + "step": 7145 + }, + { + "epoch": 2.4647474573349424, + "grad_norm": 1.214567637730859, + "learning_rate": 5.3586447761285724e-06, + "loss": 0.2327, + "step": 7150 + }, + { + "epoch": 2.46647129805206, + "grad_norm": 1.1954676598235126, + "learning_rate": 5.353076460038315e-06, + "loss": 0.2241, + "step": 7155 + }, + { + "epoch": 2.4681951387691776, + "grad_norm": 1.1572220939526712, + "learning_rate": 5.347507703817001e-06, + "loss": 0.2305, + "step": 7160 + }, + { + "epoch": 2.4699189794862955, + "grad_norm": 1.3110562065182316, + "learning_rate": 5.341938514406423e-06, + "loss": 0.2382, + "step": 7165 + }, + { + "epoch": 2.4716428202034133, + "grad_norm": 1.1170022862963467, + "learning_rate": 5.3363688987489075e-06, + "loss": 0.2342, + "step": 7170 + }, + { + "epoch": 2.473366660920531, + "grad_norm": 1.216312857430115, + "learning_rate": 5.330798863787318e-06, + "loss": 0.2215, + "step": 7175 + }, + { + "epoch": 2.4750905016376485, + "grad_norm": 1.207424663510382, + "learning_rate": 5.3252284164650355e-06, + "loss": 0.2248, + "step": 7180 + }, + { + "epoch": 2.4768143423547664, + "grad_norm": 1.2043950960105698, + "learning_rate": 5.319657563725962e-06, + "loss": 0.1857, + "step": 7185 + }, + { + "epoch": 2.4785381830718842, + "grad_norm": 1.2265575159938535, + "learning_rate": 5.314086312514498e-06, + "loss": 0.1999, + "step": 7190 + }, + { + "epoch": 2.480262023789002, + "grad_norm": 1.069220922010664, + "learning_rate": 5.3085146697755415e-06, + "loss": 0.1926, + "step": 7195 + }, + { + "epoch": 2.4819858645061195, + "grad_norm": 1.060777107033285, + "learning_rate": 5.3029426424544865e-06, + "loss": 0.191, + "step": 7200 + }, + { + "epoch": 2.4837097052232373, + "grad_norm": 1.3349880458159542, + "learning_rate": 5.297370237497194e-06, + "loss": 0.2219, + "step": 7205 + }, + { + "epoch": 2.485433545940355, + "grad_norm": 1.2248208490554113, + "learning_rate": 5.291797461850004e-06, + "loss": 0.2205, + "step": 7210 + }, + { + "epoch": 2.487157386657473, + "grad_norm": 1.2652552640765649, + "learning_rate": 5.28622432245972e-06, + "loss": 0.2382, + "step": 7215 + }, + { + "epoch": 2.4888812273745904, + "grad_norm": 0.9161267701940665, + "learning_rate": 5.280650826273591e-06, + "loss": 0.2138, + "step": 7220 + }, + { + "epoch": 2.4906050680917082, + "grad_norm": 1.0591088864957827, + "learning_rate": 5.2750769802393195e-06, + "loss": 0.1925, + "step": 7225 + }, + { + "epoch": 2.492328908808826, + "grad_norm": 0.9870998756199474, + "learning_rate": 5.269502791305037e-06, + "loss": 0.1954, + "step": 7230 + }, + { + "epoch": 2.494052749525944, + "grad_norm": 1.1463081762145328, + "learning_rate": 5.263928266419306e-06, + "loss": 0.206, + "step": 7235 + }, + { + "epoch": 2.4957765902430618, + "grad_norm": 1.2207867491896367, + "learning_rate": 5.258353412531109e-06, + "loss": 0.2104, + "step": 7240 + }, + { + "epoch": 2.497500430960179, + "grad_norm": 1.201890758904529, + "learning_rate": 5.252778236589834e-06, + "loss": 0.2071, + "step": 7245 + }, + { + "epoch": 2.499224271677297, + "grad_norm": 1.2846657038195175, + "learning_rate": 5.247202745545277e-06, + "loss": 0.1908, + "step": 7250 + }, + { + "epoch": 2.500948112394415, + "grad_norm": 1.229337153227972, + "learning_rate": 5.241626946347617e-06, + "loss": 0.2227, + "step": 7255 + }, + { + "epoch": 2.5026719531115322, + "grad_norm": 1.2120176108004994, + "learning_rate": 5.236050845947433e-06, + "loss": 0.1957, + "step": 7260 + }, + { + "epoch": 2.50439579382865, + "grad_norm": 1.0973018698702013, + "learning_rate": 5.230474451295659e-06, + "loss": 0.217, + "step": 7265 + }, + { + "epoch": 2.506119634545768, + "grad_norm": 1.140347075779373, + "learning_rate": 5.2248977693436154e-06, + "loss": 0.2328, + "step": 7270 + }, + { + "epoch": 2.5078434752628858, + "grad_norm": 1.1809774800075683, + "learning_rate": 5.219320807042965e-06, + "loss": 0.1994, + "step": 7275 + }, + { + "epoch": 2.5095673159800036, + "grad_norm": 1.2874965605523885, + "learning_rate": 5.21374357134573e-06, + "loss": 0.2145, + "step": 7280 + }, + { + "epoch": 2.511291156697121, + "grad_norm": 1.1577850174966156, + "learning_rate": 5.208166069204274e-06, + "loss": 0.2127, + "step": 7285 + }, + { + "epoch": 2.513014997414239, + "grad_norm": 1.7529340656558474, + "learning_rate": 5.202588307571282e-06, + "loss": 0.2123, + "step": 7290 + }, + { + "epoch": 2.5147388381313567, + "grad_norm": 1.4061356963194789, + "learning_rate": 5.197010293399774e-06, + "loss": 0.2089, + "step": 7295 + }, + { + "epoch": 2.5164626788484745, + "grad_norm": 1.1149404139701187, + "learning_rate": 5.191432033643078e-06, + "loss": 0.1989, + "step": 7300 + }, + { + "epoch": 2.5181865195655924, + "grad_norm": 1.2917225279499973, + "learning_rate": 5.185853535254832e-06, + "loss": 0.2468, + "step": 7305 + }, + { + "epoch": 2.5199103602827098, + "grad_norm": 1.1374273101044257, + "learning_rate": 5.1802748051889715e-06, + "loss": 0.1891, + "step": 7310 + }, + { + "epoch": 2.5216342009998276, + "grad_norm": 1.102091147672205, + "learning_rate": 5.1746958503997154e-06, + "loss": 0.2135, + "step": 7315 + }, + { + "epoch": 2.5233580417169454, + "grad_norm": 1.2681242359822413, + "learning_rate": 5.16911667784157e-06, + "loss": 0.2383, + "step": 7320 + }, + { + "epoch": 2.525081882434063, + "grad_norm": 1.171266354531671, + "learning_rate": 5.163537294469308e-06, + "loss": 0.1761, + "step": 7325 + }, + { + "epoch": 2.5268057231511807, + "grad_norm": 1.056180065049999, + "learning_rate": 5.1579577072379676e-06, + "loss": 0.2077, + "step": 7330 + }, + { + "epoch": 2.5285295638682985, + "grad_norm": 1.1330512349452793, + "learning_rate": 5.152377923102836e-06, + "loss": 0.218, + "step": 7335 + }, + { + "epoch": 2.5302534045854164, + "grad_norm": 1.1441240829036081, + "learning_rate": 5.146797949019455e-06, + "loss": 0.2062, + "step": 7340 + }, + { + "epoch": 2.531977245302534, + "grad_norm": 1.3887357240844742, + "learning_rate": 5.141217791943597e-06, + "loss": 0.2208, + "step": 7345 + }, + { + "epoch": 2.5337010860196516, + "grad_norm": 1.1307112493676725, + "learning_rate": 5.135637458831262e-06, + "loss": 0.2074, + "step": 7350 + }, + { + "epoch": 2.5354249267367694, + "grad_norm": 1.2282684352585873, + "learning_rate": 5.1300569566386725e-06, + "loss": 0.2079, + "step": 7355 + }, + { + "epoch": 2.5371487674538873, + "grad_norm": 1.1569630718124229, + "learning_rate": 5.124476292322259e-06, + "loss": 0.2168, + "step": 7360 + }, + { + "epoch": 2.538872608171005, + "grad_norm": 1.0466275191099756, + "learning_rate": 5.1188954728386565e-06, + "loss": 0.2302, + "step": 7365 + }, + { + "epoch": 2.540596448888123, + "grad_norm": 3.0032131582027364, + "learning_rate": 5.113314505144693e-06, + "loss": 0.23, + "step": 7370 + }, + { + "epoch": 2.5423202896052404, + "grad_norm": 0.9610460318468603, + "learning_rate": 5.107733396197379e-06, + "loss": 0.2173, + "step": 7375 + }, + { + "epoch": 2.544044130322358, + "grad_norm": 1.2169358707697866, + "learning_rate": 5.102152152953903e-06, + "loss": 0.2125, + "step": 7380 + }, + { + "epoch": 2.545767971039476, + "grad_norm": 1.2078229706325732, + "learning_rate": 5.09657078237162e-06, + "loss": 0.223, + "step": 7385 + }, + { + "epoch": 2.5474918117565934, + "grad_norm": 1.266319393380179, + "learning_rate": 5.090989291408047e-06, + "loss": 0.2116, + "step": 7390 + }, + { + "epoch": 2.5492156524737113, + "grad_norm": 1.1638609253026335, + "learning_rate": 5.0854076870208456e-06, + "loss": 0.1952, + "step": 7395 + }, + { + "epoch": 2.550939493190829, + "grad_norm": 1.1923136092049034, + "learning_rate": 5.079825976167821e-06, + "loss": 0.209, + "step": 7400 + }, + { + "epoch": 2.552663333907947, + "grad_norm": 1.2901463135572986, + "learning_rate": 5.074244165806915e-06, + "loss": 0.2333, + "step": 7405 + }, + { + "epoch": 2.554387174625065, + "grad_norm": 1.3062271974022313, + "learning_rate": 5.068662262896189e-06, + "loss": 0.2201, + "step": 7410 + }, + { + "epoch": 2.556111015342182, + "grad_norm": 1.1748899193790712, + "learning_rate": 5.063080274393818e-06, + "loss": 0.2201, + "step": 7415 + }, + { + "epoch": 2.5578348560593, + "grad_norm": 1.251431560253386, + "learning_rate": 5.05749820725809e-06, + "loss": 0.2183, + "step": 7420 + }, + { + "epoch": 2.559558696776418, + "grad_norm": 1.3622389217298134, + "learning_rate": 5.051916068447387e-06, + "loss": 0.2192, + "step": 7425 + }, + { + "epoch": 2.5612825374935357, + "grad_norm": 1.3864641540898042, + "learning_rate": 5.04633386492018e-06, + "loss": 0.2296, + "step": 7430 + }, + { + "epoch": 2.5630063782106536, + "grad_norm": 1.2952115427774917, + "learning_rate": 5.040751603635021e-06, + "loss": 0.2069, + "step": 7435 + }, + { + "epoch": 2.564730218927771, + "grad_norm": 1.3404304069860504, + "learning_rate": 5.035169291550537e-06, + "loss": 0.1924, + "step": 7440 + }, + { + "epoch": 2.566454059644889, + "grad_norm": 1.1326785798351073, + "learning_rate": 5.029586935625413e-06, + "loss": 0.1858, + "step": 7445 + }, + { + "epoch": 2.5681779003620067, + "grad_norm": 1.290488194981013, + "learning_rate": 5.024004542818396e-06, + "loss": 0.2089, + "step": 7450 + }, + { + "epoch": 2.569901741079124, + "grad_norm": 1.36755491149153, + "learning_rate": 5.01842212008827e-06, + "loss": 0.2172, + "step": 7455 + }, + { + "epoch": 2.571625581796242, + "grad_norm": 1.1642713513439564, + "learning_rate": 5.012839674393861e-06, + "loss": 0.2017, + "step": 7460 + }, + { + "epoch": 2.5733494225133597, + "grad_norm": 1.0664883925426987, + "learning_rate": 5.007257212694028e-06, + "loss": 0.2187, + "step": 7465 + }, + { + "epoch": 2.5750732632304776, + "grad_norm": 1.1524668732702383, + "learning_rate": 5.001674741947641e-06, + "loss": 0.2161, + "step": 7470 + }, + { + "epoch": 2.5767971039475954, + "grad_norm": 1.045590352929241, + "learning_rate": 4.996092269113589e-06, + "loss": 0.188, + "step": 7475 + }, + { + "epoch": 2.578520944664713, + "grad_norm": 1.353761575390307, + "learning_rate": 4.990509801150758e-06, + "loss": 0.2193, + "step": 7480 + }, + { + "epoch": 2.5802447853818307, + "grad_norm": 1.1343409334190961, + "learning_rate": 4.984927345018028e-06, + "loss": 0.1934, + "step": 7485 + }, + { + "epoch": 2.5819686260989485, + "grad_norm": 1.1630259837276047, + "learning_rate": 4.979344907674273e-06, + "loss": 0.2324, + "step": 7490 + }, + { + "epoch": 2.5836924668160663, + "grad_norm": 1.2007037888943761, + "learning_rate": 4.973762496078333e-06, + "loss": 0.2041, + "step": 7495 + }, + { + "epoch": 2.585416307533184, + "grad_norm": 1.1891610793015506, + "learning_rate": 4.9681801171890195e-06, + "loss": 0.2206, + "step": 7500 + }, + { + "epoch": 2.5871401482503016, + "grad_norm": 1.0931116205911549, + "learning_rate": 4.9625977779651055e-06, + "loss": 0.2195, + "step": 7505 + }, + { + "epoch": 2.5888639889674194, + "grad_norm": 1.2641333552254657, + "learning_rate": 4.957015485365314e-06, + "loss": 0.2576, + "step": 7510 + }, + { + "epoch": 2.5905878296845373, + "grad_norm": 1.5751221013193646, + "learning_rate": 4.951433246348304e-06, + "loss": 0.1911, + "step": 7515 + }, + { + "epoch": 2.5923116704016547, + "grad_norm": 1.2284005377309903, + "learning_rate": 4.945851067872677e-06, + "loss": 0.2138, + "step": 7520 + }, + { + "epoch": 2.5940355111187725, + "grad_norm": 1.2138876792673146, + "learning_rate": 4.9402689568969516e-06, + "loss": 0.222, + "step": 7525 + }, + { + "epoch": 2.5957593518358903, + "grad_norm": 1.2531347514382467, + "learning_rate": 4.934686920379567e-06, + "loss": 0.2277, + "step": 7530 + }, + { + "epoch": 2.597483192553008, + "grad_norm": 1.105439810964691, + "learning_rate": 4.9291049652788645e-06, + "loss": 0.2203, + "step": 7535 + }, + { + "epoch": 2.599207033270126, + "grad_norm": 1.0691720932694833, + "learning_rate": 4.923523098553091e-06, + "loss": 0.2161, + "step": 7540 + }, + { + "epoch": 2.6009308739872434, + "grad_norm": 1.1784358418650132, + "learning_rate": 4.917941327160377e-06, + "loss": 0.1953, + "step": 7545 + }, + { + "epoch": 2.6026547147043613, + "grad_norm": 1.2263614236990952, + "learning_rate": 4.912359658058736e-06, + "loss": 0.1932, + "step": 7550 + }, + { + "epoch": 2.604378555421479, + "grad_norm": 1.2296935329591632, + "learning_rate": 4.906778098206058e-06, + "loss": 0.2365, + "step": 7555 + }, + { + "epoch": 2.606102396138597, + "grad_norm": 1.4035538705631023, + "learning_rate": 4.901196654560088e-06, + "loss": 0.1871, + "step": 7560 + }, + { + "epoch": 2.607826236855715, + "grad_norm": 1.1686204347595222, + "learning_rate": 4.895615334078437e-06, + "loss": 0.2086, + "step": 7565 + }, + { + "epoch": 2.609550077572832, + "grad_norm": 0.9935612979912067, + "learning_rate": 4.89003414371855e-06, + "loss": 0.2008, + "step": 7570 + }, + { + "epoch": 2.61127391828995, + "grad_norm": 1.300964389093522, + "learning_rate": 4.884453090437725e-06, + "loss": 0.2027, + "step": 7575 + }, + { + "epoch": 2.612997759007068, + "grad_norm": 1.1480951233356864, + "learning_rate": 4.878872181193073e-06, + "loss": 0.2249, + "step": 7580 + }, + { + "epoch": 2.6147215997241853, + "grad_norm": 0.9923665704981476, + "learning_rate": 4.873291422941536e-06, + "loss": 0.1909, + "step": 7585 + }, + { + "epoch": 2.616445440441303, + "grad_norm": 2.214346676798211, + "learning_rate": 4.867710822639869e-06, + "loss": 0.2231, + "step": 7590 + }, + { + "epoch": 2.618169281158421, + "grad_norm": 1.2512831008438763, + "learning_rate": 4.862130387244622e-06, + "loss": 0.2164, + "step": 7595 + }, + { + "epoch": 2.619893121875539, + "grad_norm": 1.3200386659107375, + "learning_rate": 4.856550123712142e-06, + "loss": 0.2102, + "step": 7600 + }, + { + "epoch": 2.6216169625926566, + "grad_norm": 1.291299203564834, + "learning_rate": 4.850970038998567e-06, + "loss": 0.2006, + "step": 7605 + }, + { + "epoch": 2.623340803309774, + "grad_norm": 1.3428618013758173, + "learning_rate": 4.845390140059808e-06, + "loss": 0.219, + "step": 7610 + }, + { + "epoch": 2.625064644026892, + "grad_norm": 1.1621956627140024, + "learning_rate": 4.839810433851543e-06, + "loss": 0.1992, + "step": 7615 + }, + { + "epoch": 2.6267884847440097, + "grad_norm": 1.0179151578072643, + "learning_rate": 4.8342309273292115e-06, + "loss": 0.1711, + "step": 7620 + }, + { + "epoch": 2.628512325461127, + "grad_norm": 1.2793489697631724, + "learning_rate": 4.828651627448006e-06, + "loss": 0.2246, + "step": 7625 + }, + { + "epoch": 2.6302361661782454, + "grad_norm": 1.2069027265954009, + "learning_rate": 4.823072541162859e-06, + "loss": 0.1937, + "step": 7630 + }, + { + "epoch": 2.631960006895363, + "grad_norm": 1.1784427374902435, + "learning_rate": 4.817493675428434e-06, + "loss": 0.1983, + "step": 7635 + }, + { + "epoch": 2.6336838476124806, + "grad_norm": 1.3110717127620901, + "learning_rate": 4.81191503719913e-06, + "loss": 0.2159, + "step": 7640 + }, + { + "epoch": 2.6354076883295985, + "grad_norm": 1.1204792070203156, + "learning_rate": 4.806336633429049e-06, + "loss": 0.225, + "step": 7645 + }, + { + "epoch": 2.637131529046716, + "grad_norm": 1.1322464392337706, + "learning_rate": 4.800758471072009e-06, + "loss": 0.2303, + "step": 7650 + }, + { + "epoch": 2.6388553697638337, + "grad_norm": 1.418815982511749, + "learning_rate": 4.795180557081524e-06, + "loss": 0.1983, + "step": 7655 + }, + { + "epoch": 2.6405792104809516, + "grad_norm": 1.2350238653831294, + "learning_rate": 4.789602898410803e-06, + "loss": 0.2217, + "step": 7660 + }, + { + "epoch": 2.6423030511980694, + "grad_norm": 1.1627749751680085, + "learning_rate": 4.78402550201273e-06, + "loss": 0.1805, + "step": 7665 + }, + { + "epoch": 2.6440268919151872, + "grad_norm": 1.3143049772616533, + "learning_rate": 4.778448374839864e-06, + "loss": 0.1871, + "step": 7670 + }, + { + "epoch": 2.6457507326323046, + "grad_norm": 1.180059872412795, + "learning_rate": 4.772871523844435e-06, + "loss": 0.2064, + "step": 7675 + }, + { + "epoch": 2.6474745733494225, + "grad_norm": 1.281356440545652, + "learning_rate": 4.767294955978319e-06, + "loss": 0.2069, + "step": 7680 + }, + { + "epoch": 2.6491984140665403, + "grad_norm": 1.2451275420988035, + "learning_rate": 4.761718678193044e-06, + "loss": 0.2335, + "step": 7685 + }, + { + "epoch": 2.6509222547836577, + "grad_norm": 1.1724257983704, + "learning_rate": 4.756142697439775e-06, + "loss": 0.1996, + "step": 7690 + }, + { + "epoch": 2.6526460955007756, + "grad_norm": 1.0916232424849017, + "learning_rate": 4.750567020669312e-06, + "loss": 0.2075, + "step": 7695 + }, + { + "epoch": 2.6543699362178934, + "grad_norm": 1.2713824407428382, + "learning_rate": 4.744991654832067e-06, + "loss": 0.216, + "step": 7700 + }, + { + "epoch": 2.6560937769350113, + "grad_norm": 1.2605858193030177, + "learning_rate": 4.739416606878069e-06, + "loss": 0.1942, + "step": 7705 + }, + { + "epoch": 2.657817617652129, + "grad_norm": 1.2291735786863593, + "learning_rate": 4.733841883756954e-06, + "loss": 0.1926, + "step": 7710 + }, + { + "epoch": 2.6595414583692465, + "grad_norm": 1.2206656544882426, + "learning_rate": 4.728267492417949e-06, + "loss": 0.2106, + "step": 7715 + }, + { + "epoch": 2.6612652990863643, + "grad_norm": 1.1937226387173594, + "learning_rate": 4.722693439809866e-06, + "loss": 0.2049, + "step": 7720 + }, + { + "epoch": 2.662989139803482, + "grad_norm": 1.3494350997502214, + "learning_rate": 4.717119732881099e-06, + "loss": 0.2073, + "step": 7725 + }, + { + "epoch": 2.6647129805206, + "grad_norm": 1.293557692804613, + "learning_rate": 4.71154637857961e-06, + "loss": 0.2338, + "step": 7730 + }, + { + "epoch": 2.666436821237718, + "grad_norm": 1.1304063702876845, + "learning_rate": 4.705973383852919e-06, + "loss": 0.1987, + "step": 7735 + }, + { + "epoch": 2.6681606619548353, + "grad_norm": 1.1252780123618584, + "learning_rate": 4.700400755648098e-06, + "loss": 0.2137, + "step": 7740 + }, + { + "epoch": 2.669884502671953, + "grad_norm": 1.3044113305907792, + "learning_rate": 4.694828500911766e-06, + "loss": 0.1879, + "step": 7745 + }, + { + "epoch": 2.671608343389071, + "grad_norm": 1.2407585884032672, + "learning_rate": 4.689256626590073e-06, + "loss": 0.2128, + "step": 7750 + }, + { + "epoch": 2.6733321841061883, + "grad_norm": 1.1849482277456749, + "learning_rate": 4.683685139628693e-06, + "loss": 0.2129, + "step": 7755 + }, + { + "epoch": 2.675056024823306, + "grad_norm": 1.1959407941225024, + "learning_rate": 4.6781140469728255e-06, + "loss": 0.1978, + "step": 7760 + }, + { + "epoch": 2.676779865540424, + "grad_norm": 1.1666921527164538, + "learning_rate": 4.672543355567168e-06, + "loss": 0.2095, + "step": 7765 + }, + { + "epoch": 2.678503706257542, + "grad_norm": 1.1131245840833146, + "learning_rate": 4.666973072355925e-06, + "loss": 0.2072, + "step": 7770 + }, + { + "epoch": 2.6802275469746597, + "grad_norm": 1.207693667437074, + "learning_rate": 4.661403204282786e-06, + "loss": 0.2106, + "step": 7775 + }, + { + "epoch": 2.681951387691777, + "grad_norm": 1.246143342849133, + "learning_rate": 4.655833758290933e-06, + "loss": 0.1956, + "step": 7780 + }, + { + "epoch": 2.683675228408895, + "grad_norm": 1.1898634230084935, + "learning_rate": 4.650264741323011e-06, + "loss": 0.1918, + "step": 7785 + }, + { + "epoch": 2.685399069126013, + "grad_norm": 1.2151672915313287, + "learning_rate": 4.644696160321134e-06, + "loss": 0.1881, + "step": 7790 + }, + { + "epoch": 2.6871229098431306, + "grad_norm": 1.1383887903753034, + "learning_rate": 4.639128022226879e-06, + "loss": 0.2044, + "step": 7795 + }, + { + "epoch": 2.6888467505602485, + "grad_norm": 1.1989490163151273, + "learning_rate": 4.63356033398126e-06, + "loss": 0.1903, + "step": 7800 + }, + { + "epoch": 2.690570591277366, + "grad_norm": 1.2971416104279978, + "learning_rate": 4.627993102524736e-06, + "loss": 0.2024, + "step": 7805 + }, + { + "epoch": 2.6922944319944837, + "grad_norm": 1.2950861054118248, + "learning_rate": 4.622426334797196e-06, + "loss": 0.1931, + "step": 7810 + }, + { + "epoch": 2.6940182727116015, + "grad_norm": 1.4651119211485237, + "learning_rate": 4.616860037737955e-06, + "loss": 0.2175, + "step": 7815 + }, + { + "epoch": 2.695742113428719, + "grad_norm": 1.0050843856974014, + "learning_rate": 4.611294218285734e-06, + "loss": 0.1945, + "step": 7820 + }, + { + "epoch": 2.697465954145837, + "grad_norm": 1.2372108121867913, + "learning_rate": 4.60572888337866e-06, + "loss": 0.1995, + "step": 7825 + }, + { + "epoch": 2.6991897948629546, + "grad_norm": 1.498612857700723, + "learning_rate": 4.600164039954261e-06, + "loss": 0.1956, + "step": 7830 + }, + { + "epoch": 2.7009136355800725, + "grad_norm": 1.2330939883872856, + "learning_rate": 4.5945996949494485e-06, + "loss": 0.185, + "step": 7835 + }, + { + "epoch": 2.7026374762971903, + "grad_norm": 1.2385919948447828, + "learning_rate": 4.589035855300512e-06, + "loss": 0.2134, + "step": 7840 + }, + { + "epoch": 2.7043613170143077, + "grad_norm": 1.109916619555715, + "learning_rate": 4.5834725279431155e-06, + "loss": 0.2111, + "step": 7845 + }, + { + "epoch": 2.7060851577314256, + "grad_norm": 1.2428499396738826, + "learning_rate": 4.577909719812279e-06, + "loss": 0.2094, + "step": 7850 + }, + { + "epoch": 2.7078089984485434, + "grad_norm": 1.1963865432617258, + "learning_rate": 4.572347437842379e-06, + "loss": 0.1959, + "step": 7855 + }, + { + "epoch": 2.7095328391656612, + "grad_norm": 1.319182663807647, + "learning_rate": 4.566785688967131e-06, + "loss": 0.1931, + "step": 7860 + }, + { + "epoch": 2.711256679882779, + "grad_norm": 1.7260294326255932, + "learning_rate": 4.561224480119595e-06, + "loss": 0.1895, + "step": 7865 + }, + { + "epoch": 2.7129805205998965, + "grad_norm": 1.071017979385581, + "learning_rate": 4.555663818232149e-06, + "loss": 0.1919, + "step": 7870 + }, + { + "epoch": 2.7147043613170143, + "grad_norm": 1.021294790808218, + "learning_rate": 4.550103710236492e-06, + "loss": 0.1906, + "step": 7875 + }, + { + "epoch": 2.716428202034132, + "grad_norm": 1.324442795469158, + "learning_rate": 4.544544163063638e-06, + "loss": 0.2148, + "step": 7880 + }, + { + "epoch": 2.7181520427512496, + "grad_norm": 1.1920062038819825, + "learning_rate": 4.5389851836438935e-06, + "loss": 0.2109, + "step": 7885 + }, + { + "epoch": 2.7198758834683674, + "grad_norm": 1.5379124195735483, + "learning_rate": 4.533426778906861e-06, + "loss": 0.1736, + "step": 7890 + }, + { + "epoch": 2.7215997241854852, + "grad_norm": 1.2455543439708339, + "learning_rate": 4.527868955781424e-06, + "loss": 0.1904, + "step": 7895 + }, + { + "epoch": 2.723323564902603, + "grad_norm": 1.4861350187627516, + "learning_rate": 4.5223117211957505e-06, + "loss": 0.1822, + "step": 7900 + }, + { + "epoch": 2.725047405619721, + "grad_norm": 1.1171194416553463, + "learning_rate": 4.516755082077261e-06, + "loss": 0.1876, + "step": 7905 + }, + { + "epoch": 2.7267712463368383, + "grad_norm": 1.3116132145694126, + "learning_rate": 4.511199045352645e-06, + "loss": 0.1948, + "step": 7910 + }, + { + "epoch": 2.728495087053956, + "grad_norm": 1.2106137602824627, + "learning_rate": 4.505643617947834e-06, + "loss": 0.1872, + "step": 7915 + }, + { + "epoch": 2.730218927771074, + "grad_norm": 1.4372986204495664, + "learning_rate": 4.500088806788005e-06, + "loss": 0.2101, + "step": 7920 + }, + { + "epoch": 2.731942768488192, + "grad_norm": 1.1809474895261303, + "learning_rate": 4.494534618797561e-06, + "loss": 0.1994, + "step": 7925 + }, + { + "epoch": 2.7336666092053097, + "grad_norm": 1.1966511234707295, + "learning_rate": 4.4889810609001335e-06, + "loss": 0.1792, + "step": 7930 + }, + { + "epoch": 2.735390449922427, + "grad_norm": 1.1748529313376745, + "learning_rate": 4.483428140018569e-06, + "loss": 0.195, + "step": 7935 + }, + { + "epoch": 2.737114290639545, + "grad_norm": 1.2912063021415345, + "learning_rate": 4.477875863074914e-06, + "loss": 0.2001, + "step": 7940 + }, + { + "epoch": 2.7388381313566628, + "grad_norm": 1.0622864482871097, + "learning_rate": 4.472324236990416e-06, + "loss": 0.2066, + "step": 7945 + }, + { + "epoch": 2.74056197207378, + "grad_norm": 1.1875791261114388, + "learning_rate": 4.466773268685512e-06, + "loss": 0.1968, + "step": 7950 + }, + { + "epoch": 2.742285812790898, + "grad_norm": 1.2236668959815376, + "learning_rate": 4.46122296507982e-06, + "loss": 0.1754, + "step": 7955 + }, + { + "epoch": 2.744009653508016, + "grad_norm": 1.406285223793381, + "learning_rate": 4.455673333092123e-06, + "loss": 0.1912, + "step": 7960 + }, + { + "epoch": 2.7457334942251337, + "grad_norm": 1.1683515068454808, + "learning_rate": 4.450124379640377e-06, + "loss": 0.2101, + "step": 7965 + }, + { + "epoch": 2.7474573349422515, + "grad_norm": 1.170486381157583, + "learning_rate": 4.444576111641681e-06, + "loss": 0.1931, + "step": 7970 + }, + { + "epoch": 2.749181175659369, + "grad_norm": 1.2060903792039241, + "learning_rate": 4.439028536012288e-06, + "loss": 0.2111, + "step": 7975 + }, + { + "epoch": 2.7509050163764868, + "grad_norm": 1.2409100345817432, + "learning_rate": 4.433481659667583e-06, + "loss": 0.1856, + "step": 7980 + }, + { + "epoch": 2.7526288570936046, + "grad_norm": 1.1925647359048128, + "learning_rate": 4.427935489522084e-06, + "loss": 0.1781, + "step": 7985 + }, + { + "epoch": 2.7543526978107225, + "grad_norm": 1.1619717711225186, + "learning_rate": 4.422390032489423e-06, + "loss": 0.2078, + "step": 7990 + }, + { + "epoch": 2.7560765385278403, + "grad_norm": 1.0429201092240183, + "learning_rate": 4.416845295482346e-06, + "loss": 0.1796, + "step": 7995 + }, + { + "epoch": 2.7578003792449577, + "grad_norm": 1.3352573254648352, + "learning_rate": 4.411301285412703e-06, + "loss": 0.2017, + "step": 8000 + }, + { + "epoch": 2.7595242199620755, + "grad_norm": 1.1965571839421498, + "learning_rate": 4.405758009191438e-06, + "loss": 0.1895, + "step": 8005 + }, + { + "epoch": 2.7612480606791934, + "grad_norm": 1.3542719573902489, + "learning_rate": 4.400215473728573e-06, + "loss": 0.1938, + "step": 8010 + }, + { + "epoch": 2.7629719013963108, + "grad_norm": 1.112444871723316, + "learning_rate": 4.394673685933215e-06, + "loss": 0.1822, + "step": 8015 + }, + { + "epoch": 2.7646957421134286, + "grad_norm": 1.2391166863423548, + "learning_rate": 4.3891326527135375e-06, + "loss": 0.2051, + "step": 8020 + }, + { + "epoch": 2.7664195828305465, + "grad_norm": 1.2595252454074388, + "learning_rate": 4.38359238097677e-06, + "loss": 0.1984, + "step": 8025 + }, + { + "epoch": 2.7681434235476643, + "grad_norm": 1.0939898221166287, + "learning_rate": 4.3780528776291936e-06, + "loss": 0.199, + "step": 8030 + }, + { + "epoch": 2.769867264264782, + "grad_norm": 3.0232658688429344, + "learning_rate": 4.3725141495761345e-06, + "loss": 0.2269, + "step": 8035 + }, + { + "epoch": 2.7715911049818995, + "grad_norm": 1.3255994971097103, + "learning_rate": 4.366976203721952e-06, + "loss": 0.2197, + "step": 8040 + }, + { + "epoch": 2.7733149456990174, + "grad_norm": 1.466300530156325, + "learning_rate": 4.361439046970024e-06, + "loss": 0.2, + "step": 8045 + }, + { + "epoch": 2.775038786416135, + "grad_norm": 1.14521302211061, + "learning_rate": 4.3559026862227534e-06, + "loss": 0.1732, + "step": 8050 + }, + { + "epoch": 2.7767626271332526, + "grad_norm": 1.31163814279667, + "learning_rate": 4.350367128381547e-06, + "loss": 0.1872, + "step": 8055 + }, + { + "epoch": 2.778486467850371, + "grad_norm": 1.383108648090669, + "learning_rate": 4.3448323803468105e-06, + "loss": 0.19, + "step": 8060 + }, + { + "epoch": 2.7802103085674883, + "grad_norm": 1.2867511929277209, + "learning_rate": 4.339298449017937e-06, + "loss": 0.2079, + "step": 8065 + }, + { + "epoch": 2.781934149284606, + "grad_norm": 1.1623867399276215, + "learning_rate": 4.33376534129331e-06, + "loss": 0.1961, + "step": 8070 + }, + { + "epoch": 2.783657990001724, + "grad_norm": 1.2088762149212156, + "learning_rate": 4.328233064070278e-06, + "loss": 0.1903, + "step": 8075 + }, + { + "epoch": 2.7853818307188414, + "grad_norm": 1.223823741388553, + "learning_rate": 4.322701624245158e-06, + "loss": 0.1595, + "step": 8080 + }, + { + "epoch": 2.787105671435959, + "grad_norm": 1.1002836332974475, + "learning_rate": 4.317171028713225e-06, + "loss": 0.1929, + "step": 8085 + }, + { + "epoch": 2.788829512153077, + "grad_norm": 1.0505110228506256, + "learning_rate": 4.311641284368696e-06, + "loss": 0.1847, + "step": 8090 + }, + { + "epoch": 2.790553352870195, + "grad_norm": 1.034877504895325, + "learning_rate": 4.306112398104732e-06, + "loss": 0.1823, + "step": 8095 + }, + { + "epoch": 2.7922771935873127, + "grad_norm": 1.0807104967684351, + "learning_rate": 4.30058437681342e-06, + "loss": 0.1894, + "step": 8100 + }, + { + "epoch": 2.79400103430443, + "grad_norm": 1.139431942812383, + "learning_rate": 4.295057227385776e-06, + "loss": 0.2312, + "step": 8105 + }, + { + "epoch": 2.795724875021548, + "grad_norm": 1.1277278495674539, + "learning_rate": 4.28953095671172e-06, + "loss": 0.1955, + "step": 8110 + }, + { + "epoch": 2.797448715738666, + "grad_norm": 1.2303686781202476, + "learning_rate": 4.284005571680081e-06, + "loss": 0.2071, + "step": 8115 + }, + { + "epoch": 2.7991725564557832, + "grad_norm": 1.2432317012431924, + "learning_rate": 4.278481079178587e-06, + "loss": 0.1909, + "step": 8120 + }, + { + "epoch": 2.800896397172901, + "grad_norm": 1.2687097494661927, + "learning_rate": 4.2729574860938484e-06, + "loss": 0.2143, + "step": 8125 + }, + { + "epoch": 2.802620237890019, + "grad_norm": 1.1667186420107287, + "learning_rate": 4.267434799311357e-06, + "loss": 0.1938, + "step": 8130 + }, + { + "epoch": 2.8043440786071367, + "grad_norm": 1.4209068220987036, + "learning_rate": 4.2619130257154726e-06, + "loss": 0.1999, + "step": 8135 + }, + { + "epoch": 2.8060679193242546, + "grad_norm": 1.1970763612085271, + "learning_rate": 4.2563921721894216e-06, + "loss": 0.2143, + "step": 8140 + }, + { + "epoch": 2.807791760041372, + "grad_norm": 1.2960720448024432, + "learning_rate": 4.250872245615278e-06, + "loss": 0.2072, + "step": 8145 + }, + { + "epoch": 2.80951560075849, + "grad_norm": 1.134691339864308, + "learning_rate": 4.24535325287396e-06, + "loss": 0.1699, + "step": 8150 + }, + { + "epoch": 2.8112394414756077, + "grad_norm": 1.1327331828629736, + "learning_rate": 4.239835200845229e-06, + "loss": 0.1772, + "step": 8155 + }, + { + "epoch": 2.8129632821927255, + "grad_norm": 1.2964504069699636, + "learning_rate": 4.2343180964076675e-06, + "loss": 0.2143, + "step": 8160 + }, + { + "epoch": 2.8146871229098434, + "grad_norm": 1.3190192771614466, + "learning_rate": 4.228801946438675e-06, + "loss": 0.2055, + "step": 8165 + }, + { + "epoch": 2.8164109636269608, + "grad_norm": 0.9720368175650838, + "learning_rate": 4.22328675781447e-06, + "loss": 0.1756, + "step": 8170 + }, + { + "epoch": 2.8181348043440786, + "grad_norm": 1.2692431794174732, + "learning_rate": 4.217772537410061e-06, + "loss": 0.1987, + "step": 8175 + }, + { + "epoch": 2.8198586450611964, + "grad_norm": 1.3730188514086452, + "learning_rate": 4.212259292099261e-06, + "loss": 0.2077, + "step": 8180 + }, + { + "epoch": 2.821582485778314, + "grad_norm": 1.2171736835900917, + "learning_rate": 4.206747028754656e-06, + "loss": 0.1873, + "step": 8185 + }, + { + "epoch": 2.8233063264954317, + "grad_norm": 1.171133534075351, + "learning_rate": 4.201235754247621e-06, + "loss": 0.1938, + "step": 8190 + }, + { + "epoch": 2.8250301672125495, + "grad_norm": 1.2682037480479451, + "learning_rate": 4.195725475448287e-06, + "loss": 0.182, + "step": 8195 + }, + { + "epoch": 2.8267540079296674, + "grad_norm": 1.4120655758507916, + "learning_rate": 4.190216199225547e-06, + "loss": 0.204, + "step": 8200 + }, + { + "epoch": 2.828477848646785, + "grad_norm": 1.075863806220456, + "learning_rate": 4.18470793244705e-06, + "loss": 0.183, + "step": 8205 + }, + { + "epoch": 2.8302016893639026, + "grad_norm": 1.1319415182795967, + "learning_rate": 4.179200681979179e-06, + "loss": 0.2123, + "step": 8210 + }, + { + "epoch": 2.8319255300810204, + "grad_norm": 1.3054193049581715, + "learning_rate": 4.173694454687053e-06, + "loss": 0.206, + "step": 8215 + }, + { + "epoch": 2.8336493707981383, + "grad_norm": 1.1157542815944936, + "learning_rate": 4.168189257434515e-06, + "loss": 0.164, + "step": 8220 + }, + { + "epoch": 2.835373211515256, + "grad_norm": 1.1371507714261992, + "learning_rate": 4.162685097084127e-06, + "loss": 0.1931, + "step": 8225 + }, + { + "epoch": 2.837097052232374, + "grad_norm": 1.1676006693921204, + "learning_rate": 4.157181980497156e-06, + "loss": 0.1971, + "step": 8230 + }, + { + "epoch": 2.8388208929494914, + "grad_norm": 1.415367403210831, + "learning_rate": 4.151679914533565e-06, + "loss": 0.2069, + "step": 8235 + }, + { + "epoch": 2.840544733666609, + "grad_norm": 1.3266681302230978, + "learning_rate": 4.146178906052013e-06, + "loss": 0.178, + "step": 8240 + }, + { + "epoch": 2.842268574383727, + "grad_norm": 1.286262464013329, + "learning_rate": 4.140678961909838e-06, + "loss": 0.208, + "step": 8245 + }, + { + "epoch": 2.8439924151008444, + "grad_norm": 1.7348548171576503, + "learning_rate": 4.1351800889630515e-06, + "loss": 0.1719, + "step": 8250 + }, + { + "epoch": 2.8457162558179623, + "grad_norm": 1.185900473511365, + "learning_rate": 4.129682294066327e-06, + "loss": 0.1814, + "step": 8255 + }, + { + "epoch": 2.84744009653508, + "grad_norm": 1.262898454175873, + "learning_rate": 4.124185584072999e-06, + "loss": 0.1939, + "step": 8260 + }, + { + "epoch": 2.849163937252198, + "grad_norm": 1.270852337544755, + "learning_rate": 4.118689965835048e-06, + "loss": 0.2088, + "step": 8265 + }, + { + "epoch": 2.850887777969316, + "grad_norm": 1.222777687393257, + "learning_rate": 4.11319544620309e-06, + "loss": 0.1866, + "step": 8270 + }, + { + "epoch": 2.852611618686433, + "grad_norm": 1.250304059645102, + "learning_rate": 4.107702032026378e-06, + "loss": 0.1842, + "step": 8275 + }, + { + "epoch": 2.854335459403551, + "grad_norm": 1.276360860707752, + "learning_rate": 4.10220973015278e-06, + "loss": 0.1919, + "step": 8280 + }, + { + "epoch": 2.856059300120669, + "grad_norm": 1.21363515890293, + "learning_rate": 4.096718547428781e-06, + "loss": 0.1933, + "step": 8285 + }, + { + "epoch": 2.8577831408377867, + "grad_norm": 1.3488690642459205, + "learning_rate": 4.091228490699474e-06, + "loss": 0.1848, + "step": 8290 + }, + { + "epoch": 2.8595069815549046, + "grad_norm": 1.3131588721368845, + "learning_rate": 4.085739566808545e-06, + "loss": 0.1888, + "step": 8295 + }, + { + "epoch": 2.861230822272022, + "grad_norm": 1.9331377583476947, + "learning_rate": 4.080251782598263e-06, + "loss": 0.1741, + "step": 8300 + }, + { + "epoch": 2.86295466298914, + "grad_norm": 1.2358601570039363, + "learning_rate": 4.074765144909485e-06, + "loss": 0.1894, + "step": 8305 + }, + { + "epoch": 2.8646785037062577, + "grad_norm": 1.017128065952739, + "learning_rate": 4.069279660581635e-06, + "loss": 0.1964, + "step": 8310 + }, + { + "epoch": 2.866402344423375, + "grad_norm": 1.1091890388739514, + "learning_rate": 4.0637953364526984e-06, + "loss": 0.1839, + "step": 8315 + }, + { + "epoch": 2.868126185140493, + "grad_norm": 1.2830530709685048, + "learning_rate": 4.058312179359215e-06, + "loss": 0.2149, + "step": 8320 + }, + { + "epoch": 2.8698500258576107, + "grad_norm": 1.2324446018727235, + "learning_rate": 4.052830196136272e-06, + "loss": 0.1577, + "step": 8325 + }, + { + "epoch": 2.8715738665747286, + "grad_norm": 1.0157886508807732, + "learning_rate": 4.04734939361749e-06, + "loss": 0.1857, + "step": 8330 + }, + { + "epoch": 2.8732977072918464, + "grad_norm": 1.1557010837729673, + "learning_rate": 4.041869778635018e-06, + "loss": 0.2001, + "step": 8335 + }, + { + "epoch": 2.875021548008964, + "grad_norm": 1.0784480225030633, + "learning_rate": 4.036391358019526e-06, + "loss": 0.1778, + "step": 8340 + }, + { + "epoch": 2.8767453887260817, + "grad_norm": 1.2873100676424474, + "learning_rate": 4.030914138600199e-06, + "loss": 0.2083, + "step": 8345 + }, + { + "epoch": 2.8784692294431995, + "grad_norm": 1.2389284327803995, + "learning_rate": 4.025438127204717e-06, + "loss": 0.1901, + "step": 8350 + }, + { + "epoch": 2.8801930701603173, + "grad_norm": 1.2163933470381572, + "learning_rate": 4.019963330659257e-06, + "loss": 0.1967, + "step": 8355 + }, + { + "epoch": 2.881916910877435, + "grad_norm": 1.2515375249766985, + "learning_rate": 4.014489755788484e-06, + "loss": 0.1855, + "step": 8360 + }, + { + "epoch": 2.8836407515945526, + "grad_norm": 1.1211494136197395, + "learning_rate": 4.00901740941554e-06, + "loss": 0.1896, + "step": 8365 + }, + { + "epoch": 2.8853645923116704, + "grad_norm": 1.1465954337843312, + "learning_rate": 4.003546298362032e-06, + "loss": 0.1614, + "step": 8370 + }, + { + "epoch": 2.8870884330287883, + "grad_norm": 1.152720453143601, + "learning_rate": 3.998076429448028e-06, + "loss": 0.1992, + "step": 8375 + }, + { + "epoch": 2.8888122737459057, + "grad_norm": 1.1167651038566924, + "learning_rate": 3.992607809492051e-06, + "loss": 0.1835, + "step": 8380 + }, + { + "epoch": 2.8905361144630235, + "grad_norm": 1.189863677438235, + "learning_rate": 3.987140445311065e-06, + "loss": 0.1939, + "step": 8385 + }, + { + "epoch": 2.8922599551801413, + "grad_norm": 1.170508660773413, + "learning_rate": 3.981674343720466e-06, + "loss": 0.1777, + "step": 8390 + }, + { + "epoch": 2.893983795897259, + "grad_norm": 1.1915528504055501, + "learning_rate": 3.976209511534083e-06, + "loss": 0.196, + "step": 8395 + }, + { + "epoch": 2.895707636614377, + "grad_norm": 1.301325060283411, + "learning_rate": 3.9707459555641535e-06, + "loss": 0.2084, + "step": 8400 + }, + { + "epoch": 2.8974314773314944, + "grad_norm": 1.2661027616303373, + "learning_rate": 3.965283682621329e-06, + "loss": 0.1883, + "step": 8405 + }, + { + "epoch": 2.8991553180486123, + "grad_norm": 1.219275075233274, + "learning_rate": 3.959822699514667e-06, + "loss": 0.1773, + "step": 8410 + }, + { + "epoch": 2.90087915876573, + "grad_norm": 1.0979884941554234, + "learning_rate": 3.9543630130516065e-06, + "loss": 0.1853, + "step": 8415 + }, + { + "epoch": 2.902602999482848, + "grad_norm": 1.5017483872419184, + "learning_rate": 3.948904630037976e-06, + "loss": 0.1555, + "step": 8420 + }, + { + "epoch": 2.904326840199966, + "grad_norm": 1.2372063293423552, + "learning_rate": 3.943447557277978e-06, + "loss": 0.2129, + "step": 8425 + }, + { + "epoch": 2.906050680917083, + "grad_norm": 1.050827856052223, + "learning_rate": 3.937991801574185e-06, + "loss": 0.1852, + "step": 8430 + }, + { + "epoch": 2.907774521634201, + "grad_norm": 1.2096691108510387, + "learning_rate": 3.932537369727523e-06, + "loss": 0.177, + "step": 8435 + }, + { + "epoch": 2.909498362351319, + "grad_norm": 1.181342471659955, + "learning_rate": 3.927084268537266e-06, + "loss": 0.2017, + "step": 8440 + }, + { + "epoch": 2.9112222030684363, + "grad_norm": 1.1402574108423542, + "learning_rate": 3.92163250480104e-06, + "loss": 0.1905, + "step": 8445 + }, + { + "epoch": 2.912946043785554, + "grad_norm": 1.3165671771647727, + "learning_rate": 3.916182085314791e-06, + "loss": 0.2013, + "step": 8450 + }, + { + "epoch": 2.914669884502672, + "grad_norm": 1.029576640286608, + "learning_rate": 3.910733016872799e-06, + "loss": 0.2004, + "step": 8455 + }, + { + "epoch": 2.91639372521979, + "grad_norm": 1.0665272745829613, + "learning_rate": 3.90528530626765e-06, + "loss": 0.1671, + "step": 8460 + }, + { + "epoch": 2.9181175659369076, + "grad_norm": 1.0233155093361852, + "learning_rate": 3.899838960290248e-06, + "loss": 0.1871, + "step": 8465 + }, + { + "epoch": 2.919841406654025, + "grad_norm": 1.1827437485749752, + "learning_rate": 3.89439398572979e-06, + "loss": 0.1881, + "step": 8470 + }, + { + "epoch": 2.921565247371143, + "grad_norm": 1.1658387291535435, + "learning_rate": 3.8889503893737625e-06, + "loss": 0.1973, + "step": 8475 + }, + { + "epoch": 2.9232890880882607, + "grad_norm": 1.2430907347834113, + "learning_rate": 3.883508178007939e-06, + "loss": 0.1719, + "step": 8480 + }, + { + "epoch": 2.925012928805378, + "grad_norm": 1.1373376572837053, + "learning_rate": 3.878067358416361e-06, + "loss": 0.1787, + "step": 8485 + }, + { + "epoch": 2.9267367695224964, + "grad_norm": 1.3601094211432447, + "learning_rate": 3.872627937381338e-06, + "loss": 0.1932, + "step": 8490 + }, + { + "epoch": 2.928460610239614, + "grad_norm": 1.322640625687631, + "learning_rate": 3.867189921683439e-06, + "loss": 0.1981, + "step": 8495 + }, + { + "epoch": 2.9301844509567316, + "grad_norm": 1.2719584425802242, + "learning_rate": 3.861753318101473e-06, + "loss": 0.1904, + "step": 8500 + }, + { + "epoch": 2.9319082916738495, + "grad_norm": 1.4712530394725756, + "learning_rate": 3.856318133412495e-06, + "loss": 0.1706, + "step": 8505 + }, + { + "epoch": 2.933632132390967, + "grad_norm": 1.3682269449776316, + "learning_rate": 3.850884374391791e-06, + "loss": 0.1615, + "step": 8510 + }, + { + "epoch": 2.9353559731080847, + "grad_norm": 1.2792338243916719, + "learning_rate": 3.845452047812868e-06, + "loss": 0.1795, + "step": 8515 + }, + { + "epoch": 2.9370798138252026, + "grad_norm": 1.256542313984073, + "learning_rate": 3.840021160447448e-06, + "loss": 0.201, + "step": 8520 + }, + { + "epoch": 2.9388036545423204, + "grad_norm": 1.1712058086383184, + "learning_rate": 3.8345917190654585e-06, + "loss": 0.1713, + "step": 8525 + }, + { + "epoch": 2.9405274952594382, + "grad_norm": 1.2398605762854835, + "learning_rate": 3.829163730435025e-06, + "loss": 0.1767, + "step": 8530 + }, + { + "epoch": 2.9422513359765556, + "grad_norm": 1.1848463789189871, + "learning_rate": 3.823737201322465e-06, + "loss": 0.199, + "step": 8535 + }, + { + "epoch": 2.9439751766936735, + "grad_norm": 1.303223292226999, + "learning_rate": 3.818312138492268e-06, + "loss": 0.1825, + "step": 8540 + }, + { + "epoch": 2.9456990174107913, + "grad_norm": 1.5157044205233943, + "learning_rate": 3.812888548707104e-06, + "loss": 0.2029, + "step": 8545 + }, + { + "epoch": 2.9474228581279087, + "grad_norm": 1.131704794695496, + "learning_rate": 3.807466438727806e-06, + "loss": 0.2079, + "step": 8550 + }, + { + "epoch": 2.9491466988450266, + "grad_norm": 1.37739915064286, + "learning_rate": 3.8020458153133586e-06, + "loss": 0.1887, + "step": 8555 + }, + { + "epoch": 2.9508705395621444, + "grad_norm": 1.2024852771849421, + "learning_rate": 3.7966266852208934e-06, + "loss": 0.1844, + "step": 8560 + }, + { + "epoch": 2.9525943802792622, + "grad_norm": 1.2390752925242332, + "learning_rate": 3.7912090552056847e-06, + "loss": 0.1723, + "step": 8565 + }, + { + "epoch": 2.95431822099638, + "grad_norm": 1.1572358198879844, + "learning_rate": 3.7857929320211343e-06, + "loss": 0.2092, + "step": 8570 + }, + { + "epoch": 2.9560420617134975, + "grad_norm": 1.245744561744381, + "learning_rate": 3.7803783224187657e-06, + "loss": 0.1836, + "step": 8575 + }, + { + "epoch": 2.9577659024306153, + "grad_norm": 1.245313112244567, + "learning_rate": 3.7749652331482124e-06, + "loss": 0.1824, + "step": 8580 + }, + { + "epoch": 2.959489743147733, + "grad_norm": 1.2199876730603632, + "learning_rate": 3.7695536709572194e-06, + "loss": 0.1741, + "step": 8585 + }, + { + "epoch": 2.961213583864851, + "grad_norm": 1.1106009111986281, + "learning_rate": 3.764143642591625e-06, + "loss": 0.1868, + "step": 8590 + }, + { + "epoch": 2.962937424581969, + "grad_norm": 1.2264086772639844, + "learning_rate": 3.7587351547953516e-06, + "loss": 0.1709, + "step": 8595 + }, + { + "epoch": 2.9646612652990862, + "grad_norm": 1.1811997158930236, + "learning_rate": 3.753328214310409e-06, + "loss": 0.1935, + "step": 8600 + }, + { + "epoch": 2.966385106016204, + "grad_norm": 1.252239759339723, + "learning_rate": 3.74792282787687e-06, + "loss": 0.1679, + "step": 8605 + }, + { + "epoch": 2.968108946733322, + "grad_norm": 1.1660064316973588, + "learning_rate": 3.7425190022328763e-06, + "loss": 0.2019, + "step": 8610 + }, + { + "epoch": 2.9698327874504393, + "grad_norm": 1.1306830827070342, + "learning_rate": 3.737116744114622e-06, + "loss": 0.1773, + "step": 8615 + }, + { + "epoch": 2.971556628167557, + "grad_norm": 1.1179096381911475, + "learning_rate": 3.7317160602563473e-06, + "loss": 0.1662, + "step": 8620 + }, + { + "epoch": 2.973280468884675, + "grad_norm": 1.2131794282445627, + "learning_rate": 3.7263169573903274e-06, + "loss": 0.1724, + "step": 8625 + }, + { + "epoch": 2.975004309601793, + "grad_norm": 1.5168740661484055, + "learning_rate": 3.7209194422468684e-06, + "loss": 0.194, + "step": 8630 + }, + { + "epoch": 2.9767281503189107, + "grad_norm": 1.3771843753035093, + "learning_rate": 3.715523521554303e-06, + "loss": 0.198, + "step": 8635 + }, + { + "epoch": 2.978451991036028, + "grad_norm": 1.1799172525576889, + "learning_rate": 3.7101292020389666e-06, + "loss": 0.1722, + "step": 8640 + }, + { + "epoch": 2.980175831753146, + "grad_norm": 1.1540682340883583, + "learning_rate": 3.7047364904252024e-06, + "loss": 0.1728, + "step": 8645 + }, + { + "epoch": 2.9818996724702638, + "grad_norm": 1.22785878372636, + "learning_rate": 3.699345393435353e-06, + "loss": 0.1819, + "step": 8650 + }, + { + "epoch": 2.9836235131873816, + "grad_norm": 1.2329073959728665, + "learning_rate": 3.6939559177897445e-06, + "loss": 0.158, + "step": 8655 + }, + { + "epoch": 2.9853473539044995, + "grad_norm": 1.292106741036609, + "learning_rate": 3.688568070206682e-06, + "loss": 0.1821, + "step": 8660 + }, + { + "epoch": 2.987071194621617, + "grad_norm": 2.5148027142226557, + "learning_rate": 3.6831818574024405e-06, + "loss": 0.1739, + "step": 8665 + }, + { + "epoch": 2.9887950353387347, + "grad_norm": 1.3617487795094294, + "learning_rate": 3.6777972860912596e-06, + "loss": 0.2088, + "step": 8670 + }, + { + "epoch": 2.9905188760558525, + "grad_norm": 1.0917490198305588, + "learning_rate": 3.6724143629853335e-06, + "loss": 0.1868, + "step": 8675 + }, + { + "epoch": 2.99224271677297, + "grad_norm": 1.3553058987589877, + "learning_rate": 3.6670330947947953e-06, + "loss": 0.1861, + "step": 8680 + }, + { + "epoch": 2.9939665574900878, + "grad_norm": 1.3961356864483492, + "learning_rate": 3.6616534882277242e-06, + "loss": 0.2018, + "step": 8685 + }, + { + "epoch": 2.9956903982072056, + "grad_norm": 1.1404126956620368, + "learning_rate": 3.6562755499901207e-06, + "loss": 0.2002, + "step": 8690 + }, + { + "epoch": 2.9974142389243235, + "grad_norm": 1.3783271072832564, + "learning_rate": 3.6508992867859104e-06, + "loss": 0.1682, + "step": 8695 + }, + { + "epoch": 2.9991380796414413, + "grad_norm": 1.188646039032073, + "learning_rate": 3.645524705316926e-06, + "loss": 0.1904, + "step": 8700 + }, + { + "epoch": 3.0006895362868473, + "grad_norm": 1.0970171613103625, + "learning_rate": 3.6401518122829103e-06, + "loss": 0.1536, + "step": 8705 + }, + { + "epoch": 3.0024133770039647, + "grad_norm": 1.092466946917964, + "learning_rate": 3.6347806143814957e-06, + "loss": 0.1731, + "step": 8710 + }, + { + "epoch": 3.0041372177210826, + "grad_norm": 1.2313941069165162, + "learning_rate": 3.629411118308202e-06, + "loss": 0.1708, + "step": 8715 + }, + { + "epoch": 3.0058610584382004, + "grad_norm": 1.3960767229370925, + "learning_rate": 3.6240433307564337e-06, + "loss": 0.1493, + "step": 8720 + }, + { + "epoch": 3.0075848991553182, + "grad_norm": 1.3093850658962947, + "learning_rate": 3.6186772584174577e-06, + "loss": 0.1589, + "step": 8725 + }, + { + "epoch": 3.0093087398724356, + "grad_norm": 1.2456216474500965, + "learning_rate": 3.6133129079804064e-06, + "loss": 0.1647, + "step": 8730 + }, + { + "epoch": 3.0110325805895535, + "grad_norm": 1.4066648330317426, + "learning_rate": 3.607950286132266e-06, + "loss": 0.1783, + "step": 8735 + }, + { + "epoch": 3.0127564213066713, + "grad_norm": 1.189881531728998, + "learning_rate": 3.602589399557869e-06, + "loss": 0.1724, + "step": 8740 + }, + { + "epoch": 3.014480262023789, + "grad_norm": 1.3455052616634957, + "learning_rate": 3.5972302549398795e-06, + "loss": 0.1694, + "step": 8745 + }, + { + "epoch": 3.0162041027409066, + "grad_norm": 1.229601294528069, + "learning_rate": 3.591872858958796e-06, + "loss": 0.1655, + "step": 8750 + }, + { + "epoch": 3.0179279434580244, + "grad_norm": 1.3345614679194509, + "learning_rate": 3.586517218292935e-06, + "loss": 0.1806, + "step": 8755 + }, + { + "epoch": 3.0196517841751422, + "grad_norm": 1.2245161134824287, + "learning_rate": 3.5811633396184266e-06, + "loss": 0.166, + "step": 8760 + }, + { + "epoch": 3.02137562489226, + "grad_norm": 1.2149303448028828, + "learning_rate": 3.5758112296091972e-06, + "loss": 0.154, + "step": 8765 + }, + { + "epoch": 3.0230994656093775, + "grad_norm": 1.2547650395188585, + "learning_rate": 3.570460894936979e-06, + "loss": 0.1623, + "step": 8770 + }, + { + "epoch": 3.0248233063264953, + "grad_norm": 1.2832556765244343, + "learning_rate": 3.5651123422712865e-06, + "loss": 0.1753, + "step": 8775 + }, + { + "epoch": 3.026547147043613, + "grad_norm": 1.1774052352792375, + "learning_rate": 3.5597655782794096e-06, + "loss": 0.1416, + "step": 8780 + }, + { + "epoch": 3.028270987760731, + "grad_norm": 1.2228252426072648, + "learning_rate": 3.5544206096264113e-06, + "loss": 0.182, + "step": 8785 + }, + { + "epoch": 3.029994828477849, + "grad_norm": 1.4863199015956217, + "learning_rate": 3.5490774429751185e-06, + "loss": 0.157, + "step": 8790 + }, + { + "epoch": 3.0317186691949662, + "grad_norm": 1.1777303610532432, + "learning_rate": 3.5437360849861103e-06, + "loss": 0.1802, + "step": 8795 + }, + { + "epoch": 3.033442509912084, + "grad_norm": 1.254018043291895, + "learning_rate": 3.538396542317708e-06, + "loss": 0.1702, + "step": 8800 + }, + { + "epoch": 3.035166350629202, + "grad_norm": 1.122608652504421, + "learning_rate": 3.533058821625977e-06, + "loss": 0.1819, + "step": 8805 + }, + { + "epoch": 3.0368901913463198, + "grad_norm": 1.4025133008197308, + "learning_rate": 3.5277229295647043e-06, + "loss": 0.1493, + "step": 8810 + }, + { + "epoch": 3.038614032063437, + "grad_norm": 1.183473272076663, + "learning_rate": 3.5223888727854018e-06, + "loss": 0.1699, + "step": 8815 + }, + { + "epoch": 3.040337872780555, + "grad_norm": 1.2998775767633, + "learning_rate": 3.51705665793729e-06, + "loss": 0.1604, + "step": 8820 + }, + { + "epoch": 3.042061713497673, + "grad_norm": 1.136440256216976, + "learning_rate": 3.5117262916673e-06, + "loss": 0.1557, + "step": 8825 + }, + { + "epoch": 3.0437855542147907, + "grad_norm": 1.3040492358993647, + "learning_rate": 3.50639778062005e-06, + "loss": 0.157, + "step": 8830 + }, + { + "epoch": 3.045509394931908, + "grad_norm": 1.094331077347454, + "learning_rate": 3.5010711314378498e-06, + "loss": 0.1747, + "step": 8835 + }, + { + "epoch": 3.047233235649026, + "grad_norm": 1.167773248184639, + "learning_rate": 3.4957463507606924e-06, + "loss": 0.1513, + "step": 8840 + }, + { + "epoch": 3.0489570763661438, + "grad_norm": 1.243233315669377, + "learning_rate": 3.4904234452262348e-06, + "loss": 0.1457, + "step": 8845 + }, + { + "epoch": 3.0506809170832616, + "grad_norm": 1.02399958288721, + "learning_rate": 3.485102421469796e-06, + "loss": 0.1412, + "step": 8850 + }, + { + "epoch": 3.0524047578003795, + "grad_norm": 1.4348227706918033, + "learning_rate": 3.479783286124357e-06, + "loss": 0.1615, + "step": 8855 + }, + { + "epoch": 3.054128598517497, + "grad_norm": 1.2206811934506423, + "learning_rate": 3.4744660458205385e-06, + "loss": 0.1622, + "step": 8860 + }, + { + "epoch": 3.0558524392346147, + "grad_norm": 1.2443157593748708, + "learning_rate": 3.469150707186601e-06, + "loss": 0.1709, + "step": 8865 + }, + { + "epoch": 3.0575762799517325, + "grad_norm": 1.3006298528543088, + "learning_rate": 3.463837276848431e-06, + "loss": 0.1814, + "step": 8870 + }, + { + "epoch": 3.0593001206688504, + "grad_norm": 1.2221187416551949, + "learning_rate": 3.4585257614295424e-06, + "loss": 0.1586, + "step": 8875 + }, + { + "epoch": 3.0610239613859678, + "grad_norm": 1.1420431171276149, + "learning_rate": 3.453216167551059e-06, + "loss": 0.1402, + "step": 8880 + }, + { + "epoch": 3.0627478021030856, + "grad_norm": 1.282806457905716, + "learning_rate": 3.447908501831706e-06, + "loss": 0.1817, + "step": 8885 + }, + { + "epoch": 3.0644716428202035, + "grad_norm": 1.1825323383956727, + "learning_rate": 3.4426027708878125e-06, + "loss": 0.1568, + "step": 8890 + }, + { + "epoch": 3.0661954835373213, + "grad_norm": 1.234712663003976, + "learning_rate": 3.437298981333288e-06, + "loss": 0.1617, + "step": 8895 + }, + { + "epoch": 3.0679193242544387, + "grad_norm": 0.9940273551718051, + "learning_rate": 3.431997139779627e-06, + "loss": 0.1434, + "step": 8900 + }, + { + "epoch": 3.0696431649715565, + "grad_norm": 1.1911605035483424, + "learning_rate": 3.426697252835891e-06, + "loss": 0.1526, + "step": 8905 + }, + { + "epoch": 3.0713670056886744, + "grad_norm": 1.2512560419389025, + "learning_rate": 3.421399327108714e-06, + "loss": 0.164, + "step": 8910 + }, + { + "epoch": 3.073090846405792, + "grad_norm": 1.2924444069760832, + "learning_rate": 3.4161033692022736e-06, + "loss": 0.1451, + "step": 8915 + }, + { + "epoch": 3.0748146871229096, + "grad_norm": 1.4258413720044054, + "learning_rate": 3.410809385718301e-06, + "loss": 0.1731, + "step": 8920 + }, + { + "epoch": 3.0765385278400275, + "grad_norm": 1.2127107118447744, + "learning_rate": 3.4055173832560694e-06, + "loss": 0.1599, + "step": 8925 + }, + { + "epoch": 3.0782623685571453, + "grad_norm": 1.4795382920841373, + "learning_rate": 3.400227368412373e-06, + "loss": 0.1773, + "step": 8930 + }, + { + "epoch": 3.079986209274263, + "grad_norm": 1.4073976625245204, + "learning_rate": 3.3949393477815374e-06, + "loss": 0.1643, + "step": 8935 + }, + { + "epoch": 3.081710049991381, + "grad_norm": 1.2603618596811135, + "learning_rate": 3.3896533279553965e-06, + "loss": 0.1594, + "step": 8940 + }, + { + "epoch": 3.0834338907084984, + "grad_norm": 1.1700826310162284, + "learning_rate": 3.384369315523294e-06, + "loss": 0.1579, + "step": 8945 + }, + { + "epoch": 3.0851577314256162, + "grad_norm": 1.2331113214960312, + "learning_rate": 3.379087317072067e-06, + "loss": 0.1624, + "step": 8950 + }, + { + "epoch": 3.086881572142734, + "grad_norm": 1.1795130742565376, + "learning_rate": 3.3738073391860443e-06, + "loss": 0.1461, + "step": 8955 + }, + { + "epoch": 3.088605412859852, + "grad_norm": 1.3371068033276374, + "learning_rate": 3.3685293884470393e-06, + "loss": 0.1612, + "step": 8960 + }, + { + "epoch": 3.0903292535769693, + "grad_norm": 1.3044981159173494, + "learning_rate": 3.363253471434334e-06, + "loss": 0.1693, + "step": 8965 + }, + { + "epoch": 3.092053094294087, + "grad_norm": 1.7031418448770361, + "learning_rate": 3.357979594724673e-06, + "loss": 0.1657, + "step": 8970 + }, + { + "epoch": 3.093776935011205, + "grad_norm": 1.2422130712239046, + "learning_rate": 3.3527077648922657e-06, + "loss": 0.1548, + "step": 8975 + }, + { + "epoch": 3.095500775728323, + "grad_norm": 1.2153967115568292, + "learning_rate": 3.3474379885087636e-06, + "loss": 0.1586, + "step": 8980 + }, + { + "epoch": 3.0972246164454402, + "grad_norm": 1.2763439005942754, + "learning_rate": 3.3421702721432596e-06, + "loss": 0.1699, + "step": 8985 + }, + { + "epoch": 3.098948457162558, + "grad_norm": 2.0238039028924923, + "learning_rate": 3.336904622362278e-06, + "loss": 0.1485, + "step": 8990 + }, + { + "epoch": 3.100672297879676, + "grad_norm": 1.2516738258150344, + "learning_rate": 3.33164104572977e-06, + "loss": 0.1578, + "step": 8995 + }, + { + "epoch": 3.1023961385967938, + "grad_norm": 1.2289166101128928, + "learning_rate": 3.3263795488071017e-06, + "loss": 0.1729, + "step": 9000 + }, + { + "epoch": 3.1041199793139116, + "grad_norm": 1.3271495653163117, + "learning_rate": 3.3211201381530413e-06, + "loss": 0.1725, + "step": 9005 + }, + { + "epoch": 3.105843820031029, + "grad_norm": 1.2248850728907936, + "learning_rate": 3.3158628203237658e-06, + "loss": 0.1538, + "step": 9010 + }, + { + "epoch": 3.107567660748147, + "grad_norm": 1.2856247893709944, + "learning_rate": 3.310607601872835e-06, + "loss": 0.1614, + "step": 9015 + }, + { + "epoch": 3.1092915014652647, + "grad_norm": 1.1875485025104728, + "learning_rate": 3.305354489351197e-06, + "loss": 0.1572, + "step": 9020 + }, + { + "epoch": 3.1110153421823825, + "grad_norm": 1.3912249833335995, + "learning_rate": 3.300103489307169e-06, + "loss": 0.1606, + "step": 9025 + }, + { + "epoch": 3.1127391828995, + "grad_norm": 1.2759584216954767, + "learning_rate": 3.294854608286444e-06, + "loss": 0.1735, + "step": 9030 + }, + { + "epoch": 3.1144630236166178, + "grad_norm": 1.3218498639618694, + "learning_rate": 3.289607852832064e-06, + "loss": 0.1356, + "step": 9035 + }, + { + "epoch": 3.1161868643337356, + "grad_norm": 1.1182668375520324, + "learning_rate": 3.284363229484425e-06, + "loss": 0.1593, + "step": 9040 + }, + { + "epoch": 3.1179107050508534, + "grad_norm": 1.203530908352527, + "learning_rate": 3.27912074478127e-06, + "loss": 0.1523, + "step": 9045 + }, + { + "epoch": 3.119634545767971, + "grad_norm": 1.1908506583990222, + "learning_rate": 3.2738804052576683e-06, + "loss": 0.1619, + "step": 9050 + }, + { + "epoch": 3.1213583864850887, + "grad_norm": 1.3869705790615299, + "learning_rate": 3.2686422174460176e-06, + "loss": 0.1604, + "step": 9055 + }, + { + "epoch": 3.1230822272022065, + "grad_norm": 1.164863663922533, + "learning_rate": 3.2634061878760363e-06, + "loss": 0.1652, + "step": 9060 + }, + { + "epoch": 3.1248060679193244, + "grad_norm": 1.451389863606094, + "learning_rate": 3.2581723230747507e-06, + "loss": 0.1695, + "step": 9065 + }, + { + "epoch": 3.126529908636442, + "grad_norm": 1.2556715058830605, + "learning_rate": 3.2529406295664886e-06, + "loss": 0.1753, + "step": 9070 + }, + { + "epoch": 3.1282537493535596, + "grad_norm": 1.265263393868935, + "learning_rate": 3.247711113872866e-06, + "loss": 0.1754, + "step": 9075 + }, + { + "epoch": 3.1299775900706774, + "grad_norm": 1.276515319127673, + "learning_rate": 3.2424837825127943e-06, + "loss": 0.1583, + "step": 9080 + }, + { + "epoch": 3.1317014307877953, + "grad_norm": 1.4239641358944504, + "learning_rate": 3.237258642002456e-06, + "loss": 0.1487, + "step": 9085 + }, + { + "epoch": 3.133425271504913, + "grad_norm": 1.2309278287313907, + "learning_rate": 3.2320356988552994e-06, + "loss": 0.1503, + "step": 9090 + }, + { + "epoch": 3.1351491122220305, + "grad_norm": 1.1973779391393573, + "learning_rate": 3.2268149595820424e-06, + "loss": 0.1617, + "step": 9095 + }, + { + "epoch": 3.1368729529391484, + "grad_norm": 1.4077947592991213, + "learning_rate": 3.221596430690647e-06, + "loss": 0.1922, + "step": 9100 + }, + { + "epoch": 3.138596793656266, + "grad_norm": 1.0978739727298412, + "learning_rate": 3.2163801186863266e-06, + "loss": 0.1504, + "step": 9105 + }, + { + "epoch": 3.140320634373384, + "grad_norm": 1.031431957954541, + "learning_rate": 3.2111660300715235e-06, + "loss": 0.1533, + "step": 9110 + }, + { + "epoch": 3.1420444750905014, + "grad_norm": 1.4791723213322414, + "learning_rate": 3.205954171345918e-06, + "loss": 0.15, + "step": 9115 + }, + { + "epoch": 3.1437683158076193, + "grad_norm": 1.185407698657408, + "learning_rate": 3.2007445490064026e-06, + "loss": 0.1519, + "step": 9120 + }, + { + "epoch": 3.145492156524737, + "grad_norm": 1.8387350913492626, + "learning_rate": 3.1955371695470844e-06, + "loss": 0.1531, + "step": 9125 + }, + { + "epoch": 3.147215997241855, + "grad_norm": 1.3424838946762745, + "learning_rate": 3.1903320394592787e-06, + "loss": 0.1405, + "step": 9130 + }, + { + "epoch": 3.148939837958973, + "grad_norm": 1.4261443580634592, + "learning_rate": 3.1851291652314907e-06, + "loss": 0.1727, + "step": 9135 + }, + { + "epoch": 3.15066367867609, + "grad_norm": 1.5221580856953272, + "learning_rate": 3.179928553349418e-06, + "loss": 0.1703, + "step": 9140 + }, + { + "epoch": 3.152387519393208, + "grad_norm": 1.200721193330485, + "learning_rate": 3.1747302102959334e-06, + "loss": 0.1639, + "step": 9145 + }, + { + "epoch": 3.154111360110326, + "grad_norm": 1.2843819585587974, + "learning_rate": 3.169534142551087e-06, + "loss": 0.1659, + "step": 9150 + }, + { + "epoch": 3.1558352008274437, + "grad_norm": 1.3225029865479827, + "learning_rate": 3.1643403565920894e-06, + "loss": 0.1615, + "step": 9155 + }, + { + "epoch": 3.157559041544561, + "grad_norm": 1.154229045189436, + "learning_rate": 3.159148858893305e-06, + "loss": 0.154, + "step": 9160 + }, + { + "epoch": 3.159282882261679, + "grad_norm": 1.328864367745576, + "learning_rate": 3.153959655926253e-06, + "loss": 0.1594, + "step": 9165 + }, + { + "epoch": 3.161006722978797, + "grad_norm": 1.2216420251964037, + "learning_rate": 3.1487727541595847e-06, + "loss": 0.1498, + "step": 9170 + }, + { + "epoch": 3.1627305636959147, + "grad_norm": 1.2155244685570445, + "learning_rate": 3.1435881600590823e-06, + "loss": 0.1761, + "step": 9175 + }, + { + "epoch": 3.164454404413032, + "grad_norm": 1.1016787276536528, + "learning_rate": 3.138405880087658e-06, + "loss": 0.142, + "step": 9180 + }, + { + "epoch": 3.16617824513015, + "grad_norm": 1.285676065299186, + "learning_rate": 3.1332259207053357e-06, + "loss": 0.1531, + "step": 9185 + }, + { + "epoch": 3.1679020858472677, + "grad_norm": 1.3820370607901018, + "learning_rate": 3.128048288369245e-06, + "loss": 0.1477, + "step": 9190 + }, + { + "epoch": 3.1696259265643856, + "grad_norm": 1.276935464160731, + "learning_rate": 3.122872989533616e-06, + "loss": 0.1677, + "step": 9195 + }, + { + "epoch": 3.1713497672815034, + "grad_norm": 1.2197500412036166, + "learning_rate": 3.1177000306497705e-06, + "loss": 0.1938, + "step": 9200 + }, + { + "epoch": 3.173073607998621, + "grad_norm": 1.2295343099830522, + "learning_rate": 3.1125294181661155e-06, + "loss": 0.1633, + "step": 9205 + }, + { + "epoch": 3.1747974487157387, + "grad_norm": 1.2348206207636296, + "learning_rate": 3.107361158528126e-06, + "loss": 0.1488, + "step": 9210 + }, + { + "epoch": 3.1765212894328565, + "grad_norm": 1.2378880053981638, + "learning_rate": 3.102195258178353e-06, + "loss": 0.1393, + "step": 9215 + }, + { + "epoch": 3.1782451301499743, + "grad_norm": 1.2305434710767567, + "learning_rate": 3.0970317235563996e-06, + "loss": 0.1679, + "step": 9220 + }, + { + "epoch": 3.1799689708670917, + "grad_norm": 1.1849848594490677, + "learning_rate": 3.0918705610989235e-06, + "loss": 0.1467, + "step": 9225 + }, + { + "epoch": 3.1816928115842096, + "grad_norm": 1.8960401239668225, + "learning_rate": 3.0867117772396225e-06, + "loss": 0.1426, + "step": 9230 + }, + { + "epoch": 3.1834166523013274, + "grad_norm": 1.0708933155701055, + "learning_rate": 3.0815553784092346e-06, + "loss": 0.1622, + "step": 9235 + }, + { + "epoch": 3.1851404930184453, + "grad_norm": 1.0740171225605655, + "learning_rate": 3.0764013710355185e-06, + "loss": 0.1733, + "step": 9240 + }, + { + "epoch": 3.1868643337355627, + "grad_norm": 1.2588239522178575, + "learning_rate": 3.0712497615432542e-06, + "loss": 0.181, + "step": 9245 + }, + { + "epoch": 3.1885881744526805, + "grad_norm": 1.1380999782943795, + "learning_rate": 3.0661005563542356e-06, + "loss": 0.154, + "step": 9250 + }, + { + "epoch": 3.1903120151697983, + "grad_norm": 1.1114175392323502, + "learning_rate": 3.060953761887256e-06, + "loss": 0.1703, + "step": 9255 + }, + { + "epoch": 3.192035855886916, + "grad_norm": 1.32524618177933, + "learning_rate": 3.055809384558102e-06, + "loss": 0.1495, + "step": 9260 + }, + { + "epoch": 3.1937596966040336, + "grad_norm": 1.3807948928628861, + "learning_rate": 3.0506674307795516e-06, + "loss": 0.1738, + "step": 9265 + }, + { + "epoch": 3.1954835373211514, + "grad_norm": 1.0145175130409587, + "learning_rate": 3.0455279069613596e-06, + "loss": 0.1388, + "step": 9270 + }, + { + "epoch": 3.1972073780382693, + "grad_norm": 1.1246000437481154, + "learning_rate": 3.0403908195102526e-06, + "loss": 0.1545, + "step": 9275 + }, + { + "epoch": 3.198931218755387, + "grad_norm": 1.2497428737835505, + "learning_rate": 3.0352561748299157e-06, + "loss": 0.1575, + "step": 9280 + }, + { + "epoch": 3.200655059472505, + "grad_norm": 1.4730540337890046, + "learning_rate": 3.0301239793209965e-06, + "loss": 0.1576, + "step": 9285 + }, + { + "epoch": 3.2023789001896223, + "grad_norm": 1.2264899431695482, + "learning_rate": 3.0249942393810846e-06, + "loss": 0.1621, + "step": 9290 + }, + { + "epoch": 3.20410274090674, + "grad_norm": 1.339302228891098, + "learning_rate": 3.0198669614047064e-06, + "loss": 0.1443, + "step": 9295 + }, + { + "epoch": 3.205826581623858, + "grad_norm": 1.2431049928184017, + "learning_rate": 3.0147421517833274e-06, + "loss": 0.1596, + "step": 9300 + }, + { + "epoch": 3.207550422340976, + "grad_norm": 1.31662825026558, + "learning_rate": 3.009619816905328e-06, + "loss": 0.1514, + "step": 9305 + }, + { + "epoch": 3.2092742630580933, + "grad_norm": 1.1377991158718492, + "learning_rate": 3.0044999631560084e-06, + "loss": 0.1405, + "step": 9310 + }, + { + "epoch": 3.210998103775211, + "grad_norm": 1.2786348510357934, + "learning_rate": 2.9993825969175717e-06, + "loss": 0.1402, + "step": 9315 + }, + { + "epoch": 3.212721944492329, + "grad_norm": 1.355896912109576, + "learning_rate": 2.9942677245691266e-06, + "loss": 0.1655, + "step": 9320 + }, + { + "epoch": 3.214445785209447, + "grad_norm": 1.2767357811079234, + "learning_rate": 2.989155352486667e-06, + "loss": 0.1629, + "step": 9325 + }, + { + "epoch": 3.216169625926564, + "grad_norm": 1.1945654397950884, + "learning_rate": 2.9840454870430713e-06, + "loss": 0.1468, + "step": 9330 + }, + { + "epoch": 3.217893466643682, + "grad_norm": 1.2177112201203022, + "learning_rate": 2.9789381346080985e-06, + "loss": 0.1552, + "step": 9335 + }, + { + "epoch": 3.2196173073608, + "grad_norm": 1.06642046128774, + "learning_rate": 2.973833301548367e-06, + "loss": 0.1298, + "step": 9340 + }, + { + "epoch": 3.2213411480779177, + "grad_norm": 1.385445885995133, + "learning_rate": 2.9687309942273606e-06, + "loss": 0.1529, + "step": 9345 + }, + { + "epoch": 3.223064988795035, + "grad_norm": 1.1548857080268446, + "learning_rate": 2.9636312190054093e-06, + "loss": 0.1416, + "step": 9350 + }, + { + "epoch": 3.224788829512153, + "grad_norm": 1.2496366251161442, + "learning_rate": 2.958533982239694e-06, + "loss": 0.1603, + "step": 9355 + }, + { + "epoch": 3.226512670229271, + "grad_norm": 1.198113134559372, + "learning_rate": 2.953439290284224e-06, + "loss": 0.1626, + "step": 9360 + }, + { + "epoch": 3.2282365109463886, + "grad_norm": 1.3945918076744428, + "learning_rate": 2.9483471494898396e-06, + "loss": 0.1668, + "step": 9365 + }, + { + "epoch": 3.2299603516635065, + "grad_norm": 1.271661001106605, + "learning_rate": 2.943257566204203e-06, + "loss": 0.1607, + "step": 9370 + }, + { + "epoch": 3.231684192380624, + "grad_norm": 1.24702467300753, + "learning_rate": 2.938170546771785e-06, + "loss": 0.1297, + "step": 9375 + }, + { + "epoch": 3.2334080330977417, + "grad_norm": 1.2246876912963742, + "learning_rate": 2.9330860975338592e-06, + "loss": 0.1597, + "step": 9380 + }, + { + "epoch": 3.2351318738148596, + "grad_norm": 1.1898325900629065, + "learning_rate": 2.9280042248285e-06, + "loss": 0.1438, + "step": 9385 + }, + { + "epoch": 3.2368557145319774, + "grad_norm": 1.1258341844437951, + "learning_rate": 2.9229249349905686e-06, + "loss": 0.1535, + "step": 9390 + }, + { + "epoch": 3.238579555249095, + "grad_norm": 1.3075808292199873, + "learning_rate": 2.917848234351702e-06, + "loss": 0.1467, + "step": 9395 + }, + { + "epoch": 3.2403033959662126, + "grad_norm": 1.4888915252167954, + "learning_rate": 2.912774129240315e-06, + "loss": 0.1236, + "step": 9400 + }, + { + "epoch": 3.2420272366833305, + "grad_norm": 1.3130581874151135, + "learning_rate": 2.9077026259815865e-06, + "loss": 0.1512, + "step": 9405 + }, + { + "epoch": 3.2437510774004483, + "grad_norm": 1.2233651203860043, + "learning_rate": 2.9026337308974485e-06, + "loss": 0.1514, + "step": 9410 + }, + { + "epoch": 3.2454749181175657, + "grad_norm": 1.1621007724666295, + "learning_rate": 2.8975674503065826e-06, + "loss": 0.1422, + "step": 9415 + }, + { + "epoch": 3.2471987588346836, + "grad_norm": 1.2324134122107149, + "learning_rate": 2.8925037905244157e-06, + "loss": 0.1526, + "step": 9420 + }, + { + "epoch": 3.2489225995518014, + "grad_norm": 1.3375408126424149, + "learning_rate": 2.887442757863103e-06, + "loss": 0.161, + "step": 9425 + }, + { + "epoch": 3.2506464402689192, + "grad_norm": 1.3044008937263296, + "learning_rate": 2.8823843586315236e-06, + "loss": 0.16, + "step": 9430 + }, + { + "epoch": 3.252370280986037, + "grad_norm": 1.2832049055162995, + "learning_rate": 2.877328599135282e-06, + "loss": 0.1715, + "step": 9435 + }, + { + "epoch": 3.2540941217031545, + "grad_norm": 1.3544162858132127, + "learning_rate": 2.872275485676681e-06, + "loss": 0.15, + "step": 9440 + }, + { + "epoch": 3.2558179624202723, + "grad_norm": 1.2628491376935866, + "learning_rate": 2.867225024554735e-06, + "loss": 0.1595, + "step": 9445 + }, + { + "epoch": 3.25754180313739, + "grad_norm": 1.0156045919606889, + "learning_rate": 2.8621772220651445e-06, + "loss": 0.1424, + "step": 9450 + }, + { + "epoch": 3.259265643854508, + "grad_norm": 1.1795663228801603, + "learning_rate": 2.8571320845003026e-06, + "loss": 0.1398, + "step": 9455 + }, + { + "epoch": 3.2609894845716254, + "grad_norm": 1.291572373617909, + "learning_rate": 2.852089618149275e-06, + "loss": 0.1449, + "step": 9460 + }, + { + "epoch": 3.2627133252887432, + "grad_norm": 1.1917792856944078, + "learning_rate": 2.847049829297799e-06, + "loss": 0.175, + "step": 9465 + }, + { + "epoch": 3.264437166005861, + "grad_norm": 1.1902174190693788, + "learning_rate": 2.842012724228273e-06, + "loss": 0.1517, + "step": 9470 + }, + { + "epoch": 3.266161006722979, + "grad_norm": 1.1391828957317889, + "learning_rate": 2.836978309219754e-06, + "loss": 0.1561, + "step": 9475 + }, + { + "epoch": 3.2678848474400963, + "grad_norm": 1.1556784576811463, + "learning_rate": 2.831946590547945e-06, + "loss": 0.1406, + "step": 9480 + }, + { + "epoch": 3.269608688157214, + "grad_norm": 1.2708959197240675, + "learning_rate": 2.8269175744851817e-06, + "loss": 0.1441, + "step": 9485 + }, + { + "epoch": 3.271332528874332, + "grad_norm": 1.0924787427958744, + "learning_rate": 2.8218912673004394e-06, + "loss": 0.1312, + "step": 9490 + }, + { + "epoch": 3.27305636959145, + "grad_norm": 1.2769029353004335, + "learning_rate": 2.8168676752593118e-06, + "loss": 0.1594, + "step": 9495 + }, + { + "epoch": 3.2747802103085677, + "grad_norm": 1.3795662728959874, + "learning_rate": 2.8118468046240044e-06, + "loss": 0.1597, + "step": 9500 + }, + { + "epoch": 3.276504051025685, + "grad_norm": 1.2887238937193444, + "learning_rate": 2.8068286616533403e-06, + "loss": 0.1459, + "step": 9505 + }, + { + "epoch": 3.278227891742803, + "grad_norm": 1.1808402704797365, + "learning_rate": 2.801813252602734e-06, + "loss": 0.1487, + "step": 9510 + }, + { + "epoch": 3.2799517324599208, + "grad_norm": 1.5317936743745084, + "learning_rate": 2.7968005837241934e-06, + "loss": 0.1559, + "step": 9515 + }, + { + "epoch": 3.2816755731770386, + "grad_norm": 1.2944915064473905, + "learning_rate": 2.791790661266313e-06, + "loss": 0.1471, + "step": 9520 + }, + { + "epoch": 3.283399413894156, + "grad_norm": 1.4435572098572045, + "learning_rate": 2.7867834914742653e-06, + "loss": 0.1515, + "step": 9525 + }, + { + "epoch": 3.285123254611274, + "grad_norm": 1.2507729186644416, + "learning_rate": 2.781779080589787e-06, + "loss": 0.1337, + "step": 9530 + }, + { + "epoch": 3.2868470953283917, + "grad_norm": 1.2948469878334319, + "learning_rate": 2.7767774348511744e-06, + "loss": 0.1474, + "step": 9535 + }, + { + "epoch": 3.2885709360455095, + "grad_norm": 1.1271553054927308, + "learning_rate": 2.7717785604932845e-06, + "loss": 0.1553, + "step": 9540 + }, + { + "epoch": 3.290294776762627, + "grad_norm": 1.1780439193780539, + "learning_rate": 2.7667824637475137e-06, + "loss": 0.1308, + "step": 9545 + }, + { + "epoch": 3.292018617479745, + "grad_norm": 1.364462227019429, + "learning_rate": 2.761789150841796e-06, + "loss": 0.1678, + "step": 9550 + }, + { + "epoch": 3.2937424581968626, + "grad_norm": 1.2408534193952319, + "learning_rate": 2.7567986280005956e-06, + "loss": 0.1672, + "step": 9555 + }, + { + "epoch": 3.2954662989139805, + "grad_norm": 1.3574064237829038, + "learning_rate": 2.7518109014449004e-06, + "loss": 0.1358, + "step": 9560 + }, + { + "epoch": 3.2971901396310983, + "grad_norm": 1.2879046454538696, + "learning_rate": 2.746825977392214e-06, + "loss": 0.1617, + "step": 9565 + }, + { + "epoch": 3.2989139803482157, + "grad_norm": 1.21458954561244, + "learning_rate": 2.7418438620565405e-06, + "loss": 0.1643, + "step": 9570 + }, + { + "epoch": 3.3006378210653335, + "grad_norm": 1.1591378646019665, + "learning_rate": 2.736864561648391e-06, + "loss": 0.146, + "step": 9575 + }, + { + "epoch": 3.3023616617824514, + "grad_norm": 1.3420686126024113, + "learning_rate": 2.7318880823747606e-06, + "loss": 0.1621, + "step": 9580 + }, + { + "epoch": 3.3040855024995692, + "grad_norm": 5.195289363418977, + "learning_rate": 2.7269144304391304e-06, + "loss": 0.1648, + "step": 9585 + }, + { + "epoch": 3.3058093432166866, + "grad_norm": 1.4312003943267158, + "learning_rate": 2.7219436120414546e-06, + "loss": 0.1626, + "step": 9590 + }, + { + "epoch": 3.3075331839338045, + "grad_norm": 1.1891300449792883, + "learning_rate": 2.7169756333781613e-06, + "loss": 0.1469, + "step": 9595 + }, + { + "epoch": 3.3092570246509223, + "grad_norm": 1.2300564203284647, + "learning_rate": 2.712010500642131e-06, + "loss": 0.1627, + "step": 9600 + }, + { + "epoch": 3.31098086536804, + "grad_norm": 1.207605291762137, + "learning_rate": 2.7070482200227027e-06, + "loss": 0.1419, + "step": 9605 + }, + { + "epoch": 3.3127047060851575, + "grad_norm": 1.4119647051471251, + "learning_rate": 2.7020887977056596e-06, + "loss": 0.1466, + "step": 9610 + }, + { + "epoch": 3.3144285468022754, + "grad_norm": 1.3414050244450053, + "learning_rate": 2.697132239873218e-06, + "loss": 0.1531, + "step": 9615 + }, + { + "epoch": 3.3161523875193932, + "grad_norm": 1.2794188181412658, + "learning_rate": 2.6921785527040245e-06, + "loss": 0.1517, + "step": 9620 + }, + { + "epoch": 3.317876228236511, + "grad_norm": 1.1383487525517915, + "learning_rate": 2.687227742373151e-06, + "loss": 0.1339, + "step": 9625 + }, + { + "epoch": 3.319600068953629, + "grad_norm": 1.3706684361185986, + "learning_rate": 2.6822798150520784e-06, + "loss": 0.1458, + "step": 9630 + }, + { + "epoch": 3.3213239096707463, + "grad_norm": 1.0840779345482268, + "learning_rate": 2.6773347769086954e-06, + "loss": 0.138, + "step": 9635 + }, + { + "epoch": 3.323047750387864, + "grad_norm": 1.1299680926739593, + "learning_rate": 2.672392634107292e-06, + "loss": 0.1378, + "step": 9640 + }, + { + "epoch": 3.324771591104982, + "grad_norm": 1.2968179813302492, + "learning_rate": 2.667453392808543e-06, + "loss": 0.1536, + "step": 9645 + }, + { + "epoch": 3.3264954318220994, + "grad_norm": 1.113095759791961, + "learning_rate": 2.6625170591695147e-06, + "loss": 0.1537, + "step": 9650 + }, + { + "epoch": 3.3282192725392172, + "grad_norm": 1.4863009178344746, + "learning_rate": 2.6575836393436407e-06, + "loss": 0.1629, + "step": 9655 + }, + { + "epoch": 3.329943113256335, + "grad_norm": 1.1218714497049078, + "learning_rate": 2.652653139480727e-06, + "loss": 0.1381, + "step": 9660 + }, + { + "epoch": 3.331666953973453, + "grad_norm": 1.2767619153443948, + "learning_rate": 2.6477255657269385e-06, + "loss": 0.1469, + "step": 9665 + }, + { + "epoch": 3.3333907946905708, + "grad_norm": 1.2679028466844597, + "learning_rate": 2.6428009242247923e-06, + "loss": 0.1484, + "step": 9670 + }, + { + "epoch": 3.335114635407688, + "grad_norm": 1.4103582032061193, + "learning_rate": 2.637879221113147e-06, + "loss": 0.1363, + "step": 9675 + }, + { + "epoch": 3.336838476124806, + "grad_norm": 1.079104577316675, + "learning_rate": 2.6329604625272056e-06, + "loss": 0.1637, + "step": 9680 + }, + { + "epoch": 3.338562316841924, + "grad_norm": 1.3996568137256045, + "learning_rate": 2.628044654598497e-06, + "loss": 0.1494, + "step": 9685 + }, + { + "epoch": 3.3402861575590417, + "grad_norm": 1.2866884448424594, + "learning_rate": 2.623131803454869e-06, + "loss": 0.1424, + "step": 9690 + }, + { + "epoch": 3.3420099982761595, + "grad_norm": 1.3298720378968754, + "learning_rate": 2.6182219152204896e-06, + "loss": 0.1491, + "step": 9695 + }, + { + "epoch": 3.343733838993277, + "grad_norm": 1.1645690462893232, + "learning_rate": 2.613314996015828e-06, + "loss": 0.1449, + "step": 9700 + }, + { + "epoch": 3.3454576797103948, + "grad_norm": 1.3657935717403653, + "learning_rate": 2.6084110519576544e-06, + "loss": 0.1551, + "step": 9705 + }, + { + "epoch": 3.3471815204275126, + "grad_norm": 1.3394892472574793, + "learning_rate": 2.6035100891590277e-06, + "loss": 0.1373, + "step": 9710 + }, + { + "epoch": 3.34890536114463, + "grad_norm": 1.271435935332484, + "learning_rate": 2.5986121137292973e-06, + "loss": 0.1616, + "step": 9715 + }, + { + "epoch": 3.350629201861748, + "grad_norm": 1.2957387994542078, + "learning_rate": 2.5937171317740808e-06, + "loss": 0.1555, + "step": 9720 + }, + { + "epoch": 3.3523530425788657, + "grad_norm": 1.3265098912489344, + "learning_rate": 2.588825149395269e-06, + "loss": 0.1654, + "step": 9725 + }, + { + "epoch": 3.3540768832959835, + "grad_norm": 1.4816628313362459, + "learning_rate": 2.583936172691015e-06, + "loss": 0.1487, + "step": 9730 + }, + { + "epoch": 3.3558007240131014, + "grad_norm": 1.3025356011399114, + "learning_rate": 2.5790502077557193e-06, + "loss": 0.1458, + "step": 9735 + }, + { + "epoch": 3.3575245647302188, + "grad_norm": 3.289135084806184, + "learning_rate": 2.574167260680031e-06, + "loss": 0.1303, + "step": 9740 + }, + { + "epoch": 3.3592484054473366, + "grad_norm": 1.1993669977774641, + "learning_rate": 2.5692873375508397e-06, + "loss": 0.1335, + "step": 9745 + }, + { + "epoch": 3.3609722461644544, + "grad_norm": 1.4260518652936927, + "learning_rate": 2.564410444451263e-06, + "loss": 0.1516, + "step": 9750 + }, + { + "epoch": 3.3626960868815723, + "grad_norm": 1.1741266972894133, + "learning_rate": 2.5595365874606403e-06, + "loss": 0.1488, + "step": 9755 + }, + { + "epoch": 3.36441992759869, + "grad_norm": 1.5130574213876373, + "learning_rate": 2.5546657726545267e-06, + "loss": 0.1332, + "step": 9760 + }, + { + "epoch": 3.3661437683158075, + "grad_norm": 1.4131270060140593, + "learning_rate": 2.549798006104687e-06, + "loss": 0.1398, + "step": 9765 + }, + { + "epoch": 3.3678676090329254, + "grad_norm": 1.1117451529424736, + "learning_rate": 2.544933293879087e-06, + "loss": 0.1372, + "step": 9770 + }, + { + "epoch": 3.369591449750043, + "grad_norm": 1.2688281117458118, + "learning_rate": 2.540071642041881e-06, + "loss": 0.1487, + "step": 9775 + }, + { + "epoch": 3.3713152904671606, + "grad_norm": 1.2238693061663448, + "learning_rate": 2.535213056653412e-06, + "loss": 0.1385, + "step": 9780 + }, + { + "epoch": 3.3730391311842784, + "grad_norm": 1.312250632914668, + "learning_rate": 2.5303575437701992e-06, + "loss": 0.1478, + "step": 9785 + }, + { + "epoch": 3.3747629719013963, + "grad_norm": 1.5068816910492788, + "learning_rate": 2.525505109444931e-06, + "loss": 0.1309, + "step": 9790 + }, + { + "epoch": 3.376486812618514, + "grad_norm": 1.4449125973083312, + "learning_rate": 2.5206557597264565e-06, + "loss": 0.1649, + "step": 9795 + }, + { + "epoch": 3.378210653335632, + "grad_norm": 1.1630657360993657, + "learning_rate": 2.515809500659786e-06, + "loss": 0.1472, + "step": 9800 + }, + { + "epoch": 3.3799344940527494, + "grad_norm": 3.8103986592465118, + "learning_rate": 2.5109663382860695e-06, + "loss": 0.1365, + "step": 9805 + }, + { + "epoch": 3.381658334769867, + "grad_norm": 2.244838279442172, + "learning_rate": 2.506126278642602e-06, + "loss": 0.1393, + "step": 9810 + }, + { + "epoch": 3.383382175486985, + "grad_norm": 1.8591193422511196, + "learning_rate": 2.5012893277628104e-06, + "loss": 0.1556, + "step": 9815 + }, + { + "epoch": 3.385106016204103, + "grad_norm": 1.249234027724954, + "learning_rate": 2.4964554916762446e-06, + "loss": 0.1501, + "step": 9820 + }, + { + "epoch": 3.3868298569212203, + "grad_norm": 1.3286169728192727, + "learning_rate": 2.4916247764085694e-06, + "loss": 0.125, + "step": 9825 + }, + { + "epoch": 3.388553697638338, + "grad_norm": 1.2390694186350926, + "learning_rate": 2.4867971879815656e-06, + "loss": 0.1585, + "step": 9830 + }, + { + "epoch": 3.390277538355456, + "grad_norm": 1.322906955054297, + "learning_rate": 2.4819727324131114e-06, + "loss": 0.1502, + "step": 9835 + }, + { + "epoch": 3.392001379072574, + "grad_norm": 1.2656397505407209, + "learning_rate": 2.4771514157171796e-06, + "loss": 0.1615, + "step": 9840 + }, + { + "epoch": 3.393725219789691, + "grad_norm": 1.1797504014624975, + "learning_rate": 2.4723332439038337e-06, + "loss": 0.1475, + "step": 9845 + }, + { + "epoch": 3.395449060506809, + "grad_norm": 1.1838104403710576, + "learning_rate": 2.4675182229792128e-06, + "loss": 0.1226, + "step": 9850 + }, + { + "epoch": 3.397172901223927, + "grad_norm": 1.1493895536343182, + "learning_rate": 2.462706358945533e-06, + "loss": 0.1355, + "step": 9855 + }, + { + "epoch": 3.3988967419410447, + "grad_norm": 1.1756756014850063, + "learning_rate": 2.4578976578010688e-06, + "loss": 0.13, + "step": 9860 + }, + { + "epoch": 3.4006205826581626, + "grad_norm": 1.2006110410571582, + "learning_rate": 2.4530921255401597e-06, + "loss": 0.1438, + "step": 9865 + }, + { + "epoch": 3.40234442337528, + "grad_norm": 1.4174755741714786, + "learning_rate": 2.4482897681531885e-06, + "loss": 0.1672, + "step": 9870 + }, + { + "epoch": 3.404068264092398, + "grad_norm": 1.2397634742682426, + "learning_rate": 2.4434905916265827e-06, + "loss": 0.1466, + "step": 9875 + }, + { + "epoch": 3.4057921048095157, + "grad_norm": 1.408305791146792, + "learning_rate": 2.438694601942803e-06, + "loss": 0.1466, + "step": 9880 + }, + { + "epoch": 3.4075159455266335, + "grad_norm": 1.331945928683061, + "learning_rate": 2.4339018050803413e-06, + "loss": 0.1436, + "step": 9885 + }, + { + "epoch": 3.409239786243751, + "grad_norm": 1.0673855870985687, + "learning_rate": 2.429112207013709e-06, + "loss": 0.1278, + "step": 9890 + }, + { + "epoch": 3.4109636269608687, + "grad_norm": 1.3167135709994189, + "learning_rate": 2.4243258137134247e-06, + "loss": 0.1592, + "step": 9895 + }, + { + "epoch": 3.4126874676779866, + "grad_norm": 1.0382253330637887, + "learning_rate": 2.4195426311460184e-06, + "loss": 0.1369, + "step": 9900 + }, + { + "epoch": 3.4144113083951044, + "grad_norm": 1.347839718990197, + "learning_rate": 2.414762665274015e-06, + "loss": 0.1356, + "step": 9905 + }, + { + "epoch": 3.416135149112222, + "grad_norm": 1.7525700544410359, + "learning_rate": 2.4099859220559272e-06, + "loss": 0.1598, + "step": 9910 + }, + { + "epoch": 3.4178589898293397, + "grad_norm": 1.2228510478538737, + "learning_rate": 2.4052124074462535e-06, + "loss": 0.1464, + "step": 9915 + }, + { + "epoch": 3.4195828305464575, + "grad_norm": 1.3166200373784038, + "learning_rate": 2.40044212739547e-06, + "loss": 0.1492, + "step": 9920 + }, + { + "epoch": 3.4213066712635753, + "grad_norm": 1.2029074828603865, + "learning_rate": 2.395675087850013e-06, + "loss": 0.1423, + "step": 9925 + }, + { + "epoch": 3.423030511980693, + "grad_norm": 1.6016952521237071, + "learning_rate": 2.390911294752287e-06, + "loss": 0.1492, + "step": 9930 + }, + { + "epoch": 3.4247543526978106, + "grad_norm": 1.1537569382242616, + "learning_rate": 2.386150754040649e-06, + "loss": 0.1527, + "step": 9935 + }, + { + "epoch": 3.4264781934149284, + "grad_norm": 1.4708685951800293, + "learning_rate": 2.3813934716493976e-06, + "loss": 0.1547, + "step": 9940 + }, + { + "epoch": 3.4282020341320463, + "grad_norm": 1.2702368885272195, + "learning_rate": 2.3766394535087688e-06, + "loss": 0.1753, + "step": 9945 + }, + { + "epoch": 3.429925874849164, + "grad_norm": 1.186006739617199, + "learning_rate": 2.3718887055449362e-06, + "loss": 0.1192, + "step": 9950 + }, + { + "epoch": 3.4316497155662815, + "grad_norm": 1.1776757786872523, + "learning_rate": 2.367141233679992e-06, + "loss": 0.1351, + "step": 9955 + }, + { + "epoch": 3.4333735562833994, + "grad_norm": 1.2317207596419153, + "learning_rate": 2.3623970438319456e-06, + "loss": 0.1457, + "step": 9960 + }, + { + "epoch": 3.435097397000517, + "grad_norm": 1.2744775172337612, + "learning_rate": 2.357656141914712e-06, + "loss": 0.1249, + "step": 9965 + }, + { + "epoch": 3.436821237717635, + "grad_norm": 1.1027102832973195, + "learning_rate": 2.352918533838114e-06, + "loss": 0.1288, + "step": 9970 + }, + { + "epoch": 3.4385450784347524, + "grad_norm": 1.2735620938476777, + "learning_rate": 2.3481842255078662e-06, + "loss": 0.1238, + "step": 9975 + }, + { + "epoch": 3.4402689191518703, + "grad_norm": 1.3083824846662153, + "learning_rate": 2.3434532228255653e-06, + "loss": 0.123, + "step": 9980 + }, + { + "epoch": 3.441992759868988, + "grad_norm": 1.1261896661897102, + "learning_rate": 2.3387255316886947e-06, + "loss": 0.1287, + "step": 9985 + }, + { + "epoch": 3.443716600586106, + "grad_norm": 1.1577807763124193, + "learning_rate": 2.334001157990604e-06, + "loss": 0.153, + "step": 9990 + }, + { + "epoch": 3.445440441303224, + "grad_norm": 1.7469172819956151, + "learning_rate": 2.3292801076205095e-06, + "loss": 0.1164, + "step": 9995 + }, + { + "epoch": 3.447164282020341, + "grad_norm": 1.285500279625073, + "learning_rate": 2.3245623864634823e-06, + "loss": 0.153, + "step": 10000 + }, + { + "epoch": 3.448888122737459, + "grad_norm": 1.1373202633733297, + "learning_rate": 2.3198480004004503e-06, + "loss": 0.1359, + "step": 10005 + }, + { + "epoch": 3.450611963454577, + "grad_norm": 1.1995880423557541, + "learning_rate": 2.3151369553081747e-06, + "loss": 0.152, + "step": 10010 + }, + { + "epoch": 3.4523358041716947, + "grad_norm": 1.3190939820004475, + "learning_rate": 2.310429257059259e-06, + "loss": 0.1636, + "step": 10015 + }, + { + "epoch": 3.454059644888812, + "grad_norm": 1.4851426400529595, + "learning_rate": 2.305724911522134e-06, + "loss": 0.1408, + "step": 10020 + }, + { + "epoch": 3.45578348560593, + "grad_norm": 1.2107923919088128, + "learning_rate": 2.301023924561049e-06, + "loss": 0.1335, + "step": 10025 + }, + { + "epoch": 3.457507326323048, + "grad_norm": 1.4514363246494875, + "learning_rate": 2.296326302036065e-06, + "loss": 0.1515, + "step": 10030 + }, + { + "epoch": 3.4592311670401656, + "grad_norm": 1.1785208273644654, + "learning_rate": 2.2916320498030507e-06, + "loss": 0.132, + "step": 10035 + }, + { + "epoch": 3.460955007757283, + "grad_norm": 1.3728957597883333, + "learning_rate": 2.2869411737136776e-06, + "loss": 0.1191, + "step": 10040 + }, + { + "epoch": 3.462678848474401, + "grad_norm": 1.1398710835306203, + "learning_rate": 2.282253679615401e-06, + "loss": 0.128, + "step": 10045 + }, + { + "epoch": 3.4644026891915187, + "grad_norm": 1.2776111551003537, + "learning_rate": 2.277569573351468e-06, + "loss": 0.1157, + "step": 10050 + }, + { + "epoch": 3.4661265299086366, + "grad_norm": 1.4871118678155257, + "learning_rate": 2.272888860760896e-06, + "loss": 0.1489, + "step": 10055 + }, + { + "epoch": 3.4678503706257544, + "grad_norm": 1.1816851101719668, + "learning_rate": 2.268211547678478e-06, + "loss": 0.1361, + "step": 10060 + }, + { + "epoch": 3.469574211342872, + "grad_norm": 1.326715133116185, + "learning_rate": 2.2635376399347625e-06, + "loss": 0.1476, + "step": 10065 + }, + { + "epoch": 3.4712980520599896, + "grad_norm": 1.201080970912036, + "learning_rate": 2.2588671433560605e-06, + "loss": 0.1337, + "step": 10070 + }, + { + "epoch": 3.4730218927771075, + "grad_norm": 1.2095397506766368, + "learning_rate": 2.2542000637644255e-06, + "loss": 0.116, + "step": 10075 + }, + { + "epoch": 3.474745733494225, + "grad_norm": 1.2361475967277016, + "learning_rate": 2.249536406977653e-06, + "loss": 0.1415, + "step": 10080 + }, + { + "epoch": 3.4764695742113427, + "grad_norm": 1.1930575497042437, + "learning_rate": 2.2448761788092698e-06, + "loss": 0.1394, + "step": 10085 + }, + { + "epoch": 3.4781934149284606, + "grad_norm": 1.8463098668073976, + "learning_rate": 2.2402193850685327e-06, + "loss": 0.1404, + "step": 10090 + }, + { + "epoch": 3.4799172556455784, + "grad_norm": 1.5571786413481146, + "learning_rate": 2.2355660315604173e-06, + "loss": 0.1625, + "step": 10095 + }, + { + "epoch": 3.4816410963626963, + "grad_norm": 1.3547933147833346, + "learning_rate": 2.2309161240856047e-06, + "loss": 0.1415, + "step": 10100 + }, + { + "epoch": 3.4833649370798137, + "grad_norm": 1.3412873791216466, + "learning_rate": 2.2262696684404887e-06, + "loss": 0.1386, + "step": 10105 + }, + { + "epoch": 3.4850887777969315, + "grad_norm": 1.3022777262880085, + "learning_rate": 2.221626670417154e-06, + "loss": 0.1412, + "step": 10110 + }, + { + "epoch": 3.4868126185140493, + "grad_norm": 1.2544141695607791, + "learning_rate": 2.216987135803376e-06, + "loss": 0.1491, + "step": 10115 + }, + { + "epoch": 3.488536459231167, + "grad_norm": 1.3151231958344867, + "learning_rate": 2.2123510703826136e-06, + "loss": 0.1422, + "step": 10120 + }, + { + "epoch": 3.490260299948285, + "grad_norm": 1.2748072937124686, + "learning_rate": 2.2077184799340036e-06, + "loss": 0.1332, + "step": 10125 + }, + { + "epoch": 3.4919841406654024, + "grad_norm": 1.187476731348306, + "learning_rate": 2.2030893702323457e-06, + "loss": 0.1504, + "step": 10130 + }, + { + "epoch": 3.4937079813825203, + "grad_norm": 1.4262527187843115, + "learning_rate": 2.1984637470481056e-06, + "loss": 0.1299, + "step": 10135 + }, + { + "epoch": 3.495431822099638, + "grad_norm": 1.4159713330193895, + "learning_rate": 2.193841616147403e-06, + "loss": 0.1492, + "step": 10140 + }, + { + "epoch": 3.4971556628167555, + "grad_norm": 1.8420571283229705, + "learning_rate": 2.189222983292e-06, + "loss": 0.13, + "step": 10145 + }, + { + "epoch": 3.4988795035338733, + "grad_norm": 1.3015205686974982, + "learning_rate": 2.1846078542393005e-06, + "loss": 0.1557, + "step": 10150 + }, + { + "epoch": 3.500603344250991, + "grad_norm": 1.288144370705917, + "learning_rate": 2.179996234742339e-06, + "loss": 0.151, + "step": 10155 + }, + { + "epoch": 3.502327184968109, + "grad_norm": 1.2368914366983854, + "learning_rate": 2.1753881305497798e-06, + "loss": 0.1266, + "step": 10160 + }, + { + "epoch": 3.504051025685227, + "grad_norm": 1.2793429978132784, + "learning_rate": 2.170783547405901e-06, + "loss": 0.1727, + "step": 10165 + }, + { + "epoch": 3.5057748664023443, + "grad_norm": 1.0476834155793038, + "learning_rate": 2.16618249105059e-06, + "loss": 0.1294, + "step": 10170 + }, + { + "epoch": 3.507498707119462, + "grad_norm": 1.2685273294129786, + "learning_rate": 2.161584967219343e-06, + "loss": 0.1364, + "step": 10175 + }, + { + "epoch": 3.50922254783658, + "grad_norm": 1.2165289853454064, + "learning_rate": 2.1569909816432517e-06, + "loss": 0.1283, + "step": 10180 + }, + { + "epoch": 3.510946388553698, + "grad_norm": 1.2182163113863704, + "learning_rate": 2.1524005400489917e-06, + "loss": 0.1305, + "step": 10185 + }, + { + "epoch": 3.5126702292708156, + "grad_norm": 1.1567298789673996, + "learning_rate": 2.1478136481588284e-06, + "loss": 0.1425, + "step": 10190 + }, + { + "epoch": 3.514394069987933, + "grad_norm": 1.304619567517035, + "learning_rate": 2.1432303116905974e-06, + "loss": 0.1467, + "step": 10195 + }, + { + "epoch": 3.516117910705051, + "grad_norm": 1.1718439677464705, + "learning_rate": 2.1386505363577025e-06, + "loss": 0.1472, + "step": 10200 + }, + { + "epoch": 3.5178417514221687, + "grad_norm": 1.1920756528298804, + "learning_rate": 2.1340743278691077e-06, + "loss": 0.1417, + "step": 10205 + }, + { + "epoch": 3.519565592139286, + "grad_norm": 1.2508333889573606, + "learning_rate": 2.1295016919293366e-06, + "loss": 0.1375, + "step": 10210 + }, + { + "epoch": 3.521289432856404, + "grad_norm": 1.1786696781792316, + "learning_rate": 2.1249326342384506e-06, + "loss": 0.1428, + "step": 10215 + }, + { + "epoch": 3.523013273573522, + "grad_norm": 1.2954028572764982, + "learning_rate": 2.1203671604920575e-06, + "loss": 0.1422, + "step": 10220 + }, + { + "epoch": 3.5247371142906396, + "grad_norm": 1.2549155800354854, + "learning_rate": 2.1158052763812963e-06, + "loss": 0.1284, + "step": 10225 + }, + { + "epoch": 3.5264609550077575, + "grad_norm": 1.3034212695095693, + "learning_rate": 2.1112469875928287e-06, + "loss": 0.1234, + "step": 10230 + }, + { + "epoch": 3.528184795724875, + "grad_norm": 2.8605010507460733, + "learning_rate": 2.1066922998088358e-06, + "loss": 0.1447, + "step": 10235 + }, + { + "epoch": 3.5299086364419927, + "grad_norm": 1.1506103931619351, + "learning_rate": 2.1021412187070078e-06, + "loss": 0.1358, + "step": 10240 + }, + { + "epoch": 3.5316324771591106, + "grad_norm": 1.301886880815796, + "learning_rate": 2.097593749960546e-06, + "loss": 0.1393, + "step": 10245 + }, + { + "epoch": 3.5333563178762284, + "grad_norm": 1.2929314975727006, + "learning_rate": 2.0930498992381395e-06, + "loss": 0.1327, + "step": 10250 + }, + { + "epoch": 3.5350801585933462, + "grad_norm": 1.1327159869706134, + "learning_rate": 2.088509672203973e-06, + "loss": 0.1271, + "step": 10255 + }, + { + "epoch": 3.5368039993104636, + "grad_norm": 1.4404330210849468, + "learning_rate": 2.083973074517715e-06, + "loss": 0.1294, + "step": 10260 + }, + { + "epoch": 3.5385278400275815, + "grad_norm": 1.463516046286985, + "learning_rate": 2.0794401118345065e-06, + "loss": 0.1615, + "step": 10265 + }, + { + "epoch": 3.5402516807446993, + "grad_norm": 1.2009394418196053, + "learning_rate": 2.074910789804955e-06, + "loss": 0.1212, + "step": 10270 + }, + { + "epoch": 3.5419755214618167, + "grad_norm": 1.39931538405594, + "learning_rate": 2.0703851140751374e-06, + "loss": 0.1485, + "step": 10275 + }, + { + "epoch": 3.5436993621789346, + "grad_norm": 1.3285491672518441, + "learning_rate": 2.0658630902865793e-06, + "loss": 0.1406, + "step": 10280 + }, + { + "epoch": 3.5454232028960524, + "grad_norm": 1.27157866509823, + "learning_rate": 2.061344724076255e-06, + "loss": 0.1679, + "step": 10285 + }, + { + "epoch": 3.5471470436131702, + "grad_norm": 1.1340291313938298, + "learning_rate": 2.056830021076578e-06, + "loss": 0.1342, + "step": 10290 + }, + { + "epoch": 3.548870884330288, + "grad_norm": 1.4711914742437875, + "learning_rate": 2.0523189869154e-06, + "loss": 0.1367, + "step": 10295 + }, + { + "epoch": 3.5505947250474055, + "grad_norm": 1.1018443142071328, + "learning_rate": 2.047811627215997e-06, + "loss": 0.1274, + "step": 10300 + }, + { + "epoch": 3.5523185657645233, + "grad_norm": 1.276916706292044, + "learning_rate": 2.0433079475970614e-06, + "loss": 0.1509, + "step": 10305 + }, + { + "epoch": 3.554042406481641, + "grad_norm": 1.2631568006321003, + "learning_rate": 2.038807953672704e-06, + "loss": 0.1465, + "step": 10310 + }, + { + "epoch": 3.555766247198759, + "grad_norm": 1.2858206578244311, + "learning_rate": 2.034311651052437e-06, + "loss": 0.1237, + "step": 10315 + }, + { + "epoch": 3.557490087915877, + "grad_norm": 1.3166707370113864, + "learning_rate": 2.0298190453411713e-06, + "loss": 0.1459, + "step": 10320 + }, + { + "epoch": 3.5592139286329942, + "grad_norm": 1.329663428551299, + "learning_rate": 2.025330142139209e-06, + "loss": 0.1292, + "step": 10325 + }, + { + "epoch": 3.560937769350112, + "grad_norm": 1.3061670357384707, + "learning_rate": 2.020844947042242e-06, + "loss": 0.1572, + "step": 10330 + }, + { + "epoch": 3.56266161006723, + "grad_norm": 1.4902355455081342, + "learning_rate": 2.0163634656413316e-06, + "loss": 0.1309, + "step": 10335 + }, + { + "epoch": 3.5643854507843473, + "grad_norm": 1.2660175158905984, + "learning_rate": 2.0118857035229163e-06, + "loss": 0.1358, + "step": 10340 + }, + { + "epoch": 3.566109291501465, + "grad_norm": 1.2702700696118927, + "learning_rate": 2.0074116662687972e-06, + "loss": 0.1644, + "step": 10345 + }, + { + "epoch": 3.567833132218583, + "grad_norm": 1.2349927622288783, + "learning_rate": 2.0029413594561303e-06, + "loss": 0.1253, + "step": 10350 + }, + { + "epoch": 3.569556972935701, + "grad_norm": 1.2613607181969857, + "learning_rate": 1.998474788657421e-06, + "loss": 0.1391, + "step": 10355 + }, + { + "epoch": 3.5712808136528187, + "grad_norm": 1.172492727380651, + "learning_rate": 1.994011959440517e-06, + "loss": 0.1376, + "step": 10360 + }, + { + "epoch": 3.573004654369936, + "grad_norm": 1.3442806355321946, + "learning_rate": 1.989552877368608e-06, + "loss": 0.1342, + "step": 10365 + }, + { + "epoch": 3.574728495087054, + "grad_norm": 1.131544245484566, + "learning_rate": 1.9850975480002057e-06, + "loss": 0.1184, + "step": 10370 + }, + { + "epoch": 3.5764523358041718, + "grad_norm": 1.0926149570186006, + "learning_rate": 1.980645976889144e-06, + "loss": 0.1239, + "step": 10375 + }, + { + "epoch": 3.578176176521289, + "grad_norm": 1.3719965590893122, + "learning_rate": 1.9761981695845767e-06, + "loss": 0.1359, + "step": 10380 + }, + { + "epoch": 3.579900017238407, + "grad_norm": 1.1643149725825608, + "learning_rate": 1.9717541316309647e-06, + "loss": 0.1326, + "step": 10385 + }, + { + "epoch": 3.581623857955525, + "grad_norm": 1.8561466802008566, + "learning_rate": 1.9673138685680653e-06, + "loss": 0.1404, + "step": 10390 + }, + { + "epoch": 3.5833476986726427, + "grad_norm": 1.1709644797992391, + "learning_rate": 1.9628773859309374e-06, + "loss": 0.1219, + "step": 10395 + }, + { + "epoch": 3.5850715393897605, + "grad_norm": 1.373461957118018, + "learning_rate": 1.9584446892499213e-06, + "loss": 0.1459, + "step": 10400 + }, + { + "epoch": 3.586795380106878, + "grad_norm": 1.3315445881372823, + "learning_rate": 1.9540157840506406e-06, + "loss": 0.1585, + "step": 10405 + }, + { + "epoch": 3.5885192208239958, + "grad_norm": 1.3190982832334845, + "learning_rate": 1.9495906758539906e-06, + "loss": 0.1463, + "step": 10410 + }, + { + "epoch": 3.5902430615411136, + "grad_norm": 1.8810774817123894, + "learning_rate": 1.9451693701761376e-06, + "loss": 0.1467, + "step": 10415 + }, + { + "epoch": 3.5919669022582315, + "grad_norm": 1.0999186764013664, + "learning_rate": 1.9407518725285024e-06, + "loss": 0.1307, + "step": 10420 + }, + { + "epoch": 3.5936907429753493, + "grad_norm": 1.1333744249409976, + "learning_rate": 1.9363381884177635e-06, + "loss": 0.1188, + "step": 10425 + }, + { + "epoch": 3.5954145836924667, + "grad_norm": 1.2983964703441595, + "learning_rate": 1.9319283233458453e-06, + "loss": 0.1342, + "step": 10430 + }, + { + "epoch": 3.5971384244095845, + "grad_norm": 1.1977946441537601, + "learning_rate": 1.927522282809908e-06, + "loss": 0.1218, + "step": 10435 + }, + { + "epoch": 3.5988622651267024, + "grad_norm": 1.1241380011426576, + "learning_rate": 1.923120072302346e-06, + "loss": 0.1304, + "step": 10440 + }, + { + "epoch": 3.6005861058438198, + "grad_norm": 1.384808286108473, + "learning_rate": 1.918721697310779e-06, + "loss": 0.1665, + "step": 10445 + }, + { + "epoch": 3.6023099465609376, + "grad_norm": 1.271228113332117, + "learning_rate": 1.9143271633180494e-06, + "loss": 0.1258, + "step": 10450 + }, + { + "epoch": 3.6040337872780555, + "grad_norm": 1.1310942200204361, + "learning_rate": 1.9099364758022037e-06, + "loss": 0.1293, + "step": 10455 + }, + { + "epoch": 3.6057576279951733, + "grad_norm": 1.4577887134382665, + "learning_rate": 1.9055496402365004e-06, + "loss": 0.1268, + "step": 10460 + }, + { + "epoch": 3.607481468712291, + "grad_norm": 1.2354278117066597, + "learning_rate": 1.9011666620893966e-06, + "loss": 0.1341, + "step": 10465 + }, + { + "epoch": 3.6092053094294085, + "grad_norm": 1.3624270768772266, + "learning_rate": 1.8967875468245357e-06, + "loss": 0.1364, + "step": 10470 + }, + { + "epoch": 3.6109291501465264, + "grad_norm": 1.518599366796241, + "learning_rate": 1.8924122999007483e-06, + "loss": 0.1358, + "step": 10475 + }, + { + "epoch": 3.612652990863644, + "grad_norm": 1.241562848439346, + "learning_rate": 1.8880409267720417e-06, + "loss": 0.1292, + "step": 10480 + }, + { + "epoch": 3.614376831580762, + "grad_norm": 1.4017676664890422, + "learning_rate": 1.8836734328875989e-06, + "loss": 0.116, + "step": 10485 + }, + { + "epoch": 3.61610067229788, + "grad_norm": 1.1914278899598685, + "learning_rate": 1.8793098236917624e-06, + "loss": 0.1221, + "step": 10490 + }, + { + "epoch": 3.6178245130149973, + "grad_norm": 1.3611171604951202, + "learning_rate": 1.8749501046240309e-06, + "loss": 0.137, + "step": 10495 + }, + { + "epoch": 3.619548353732115, + "grad_norm": 1.1245813067761146, + "learning_rate": 1.8705942811190596e-06, + "loss": 0.1195, + "step": 10500 + }, + { + "epoch": 3.621272194449233, + "grad_norm": 1.4638796937027179, + "learning_rate": 1.8662423586066464e-06, + "loss": 0.1335, + "step": 10505 + }, + { + "epoch": 3.6229960351663504, + "grad_norm": 1.277229079172751, + "learning_rate": 1.8618943425117198e-06, + "loss": 0.1382, + "step": 10510 + }, + { + "epoch": 3.6247198758834682, + "grad_norm": 1.5670591940980676, + "learning_rate": 1.857550238254348e-06, + "loss": 0.133, + "step": 10515 + }, + { + "epoch": 3.626443716600586, + "grad_norm": 1.369205222016917, + "learning_rate": 1.853210051249717e-06, + "loss": 0.135, + "step": 10520 + }, + { + "epoch": 3.628167557317704, + "grad_norm": 1.3182500817127756, + "learning_rate": 1.8488737869081303e-06, + "loss": 0.136, + "step": 10525 + }, + { + "epoch": 3.6298913980348217, + "grad_norm": 1.385080097191345, + "learning_rate": 1.8445414506350002e-06, + "loss": 0.1309, + "step": 10530 + }, + { + "epoch": 3.631615238751939, + "grad_norm": 1.245724613530749, + "learning_rate": 1.8402130478308495e-06, + "loss": 0.1556, + "step": 10535 + }, + { + "epoch": 3.633339079469057, + "grad_norm": 1.335406621758715, + "learning_rate": 1.8358885838912881e-06, + "loss": 0.1389, + "step": 10540 + }, + { + "epoch": 3.635062920186175, + "grad_norm": 1.329232887143972, + "learning_rate": 1.8315680642070226e-06, + "loss": 0.1266, + "step": 10545 + }, + { + "epoch": 3.6367867609032927, + "grad_norm": 1.4605559129841814, + "learning_rate": 1.8272514941638431e-06, + "loss": 0.1386, + "step": 10550 + }, + { + "epoch": 3.6385106016204105, + "grad_norm": 1.4477857409364705, + "learning_rate": 1.8229388791426116e-06, + "loss": 0.1352, + "step": 10555 + }, + { + "epoch": 3.640234442337528, + "grad_norm": 1.3043594278556248, + "learning_rate": 1.818630224519262e-06, + "loss": 0.1503, + "step": 10560 + }, + { + "epoch": 3.6419582830546458, + "grad_norm": 1.3361476287320897, + "learning_rate": 1.8143255356647903e-06, + "loss": 0.1402, + "step": 10565 + }, + { + "epoch": 3.6436821237717636, + "grad_norm": 1.1482931785874433, + "learning_rate": 1.810024817945254e-06, + "loss": 0.1287, + "step": 10570 + }, + { + "epoch": 3.645405964488881, + "grad_norm": 1.1679709169601555, + "learning_rate": 1.8057280767217544e-06, + "loss": 0.1391, + "step": 10575 + }, + { + "epoch": 3.647129805205999, + "grad_norm": 1.2385544212852486, + "learning_rate": 1.8014353173504363e-06, + "loss": 0.1311, + "step": 10580 + }, + { + "epoch": 3.6488536459231167, + "grad_norm": 1.07210735004757, + "learning_rate": 1.7971465451824842e-06, + "loss": 0.114, + "step": 10585 + }, + { + "epoch": 3.6505774866402345, + "grad_norm": 1.350006746341956, + "learning_rate": 1.7928617655641122e-06, + "loss": 0.1336, + "step": 10590 + }, + { + "epoch": 3.6523013273573524, + "grad_norm": 1.3609570099878197, + "learning_rate": 1.7885809838365552e-06, + "loss": 0.134, + "step": 10595 + }, + { + "epoch": 3.6540251680744698, + "grad_norm": 1.4535370751201406, + "learning_rate": 1.7843042053360626e-06, + "loss": 0.1403, + "step": 10600 + }, + { + "epoch": 3.6557490087915876, + "grad_norm": 1.331424181308694, + "learning_rate": 1.7800314353939003e-06, + "loss": 0.1232, + "step": 10605 + }, + { + "epoch": 3.6574728495087054, + "grad_norm": 1.1209992275175193, + "learning_rate": 1.7757626793363308e-06, + "loss": 0.126, + "step": 10610 + }, + { + "epoch": 3.6591966902258233, + "grad_norm": 1.2137726012580736, + "learning_rate": 1.771497942484614e-06, + "loss": 0.1484, + "step": 10615 + }, + { + "epoch": 3.660920530942941, + "grad_norm": 1.3747442482775356, + "learning_rate": 1.7672372301550044e-06, + "loss": 0.1536, + "step": 10620 + }, + { + "epoch": 3.6626443716600585, + "grad_norm": 1.040662719999315, + "learning_rate": 1.762980547658733e-06, + "loss": 0.1338, + "step": 10625 + }, + { + "epoch": 3.6643682123771764, + "grad_norm": 1.3360808889696956, + "learning_rate": 1.7587279003020125e-06, + "loss": 0.121, + "step": 10630 + }, + { + "epoch": 3.666092053094294, + "grad_norm": 1.4687925238284372, + "learning_rate": 1.7544792933860256e-06, + "loss": 0.1348, + "step": 10635 + }, + { + "epoch": 3.6678158938114116, + "grad_norm": 1.2033196101927577, + "learning_rate": 1.750234732206914e-06, + "loss": 0.1346, + "step": 10640 + }, + { + "epoch": 3.6695397345285294, + "grad_norm": 1.2504807907262168, + "learning_rate": 1.7459942220557791e-06, + "loss": 0.1254, + "step": 10645 + }, + { + "epoch": 3.6712635752456473, + "grad_norm": 1.3642377624001467, + "learning_rate": 1.741757768218671e-06, + "loss": 0.1191, + "step": 10650 + }, + { + "epoch": 3.672987415962765, + "grad_norm": 1.3836460333016505, + "learning_rate": 1.7375253759765863e-06, + "loss": 0.1257, + "step": 10655 + }, + { + "epoch": 3.674711256679883, + "grad_norm": 1.5930833080042952, + "learning_rate": 1.7332970506054548e-06, + "loss": 0.1271, + "step": 10660 + }, + { + "epoch": 3.6764350973970004, + "grad_norm": 1.3318259266400878, + "learning_rate": 1.729072797376139e-06, + "loss": 0.1302, + "step": 10665 + }, + { + "epoch": 3.678158938114118, + "grad_norm": 1.4027360006651397, + "learning_rate": 1.724852621554427e-06, + "loss": 0.1401, + "step": 10670 + }, + { + "epoch": 3.679882778831236, + "grad_norm": 1.3567103655800863, + "learning_rate": 1.7206365284010206e-06, + "loss": 0.1336, + "step": 10675 + }, + { + "epoch": 3.681606619548354, + "grad_norm": 1.3227808689531386, + "learning_rate": 1.7164245231715325e-06, + "loss": 0.1394, + "step": 10680 + }, + { + "epoch": 3.6833304602654717, + "grad_norm": 1.2139258418406307, + "learning_rate": 1.7122166111164807e-06, + "loss": 0.132, + "step": 10685 + }, + { + "epoch": 3.685054300982589, + "grad_norm": 1.2801010660869936, + "learning_rate": 1.7080127974812828e-06, + "loss": 0.1184, + "step": 10690 + }, + { + "epoch": 3.686778141699707, + "grad_norm": 2.342104147156023, + "learning_rate": 1.7038130875062437e-06, + "loss": 0.1356, + "step": 10695 + }, + { + "epoch": 3.688501982416825, + "grad_norm": 1.1868197294481282, + "learning_rate": 1.699617486426554e-06, + "loss": 0.1399, + "step": 10700 + }, + { + "epoch": 3.690225823133942, + "grad_norm": 1.2895401450171413, + "learning_rate": 1.6954259994722838e-06, + "loss": 0.1419, + "step": 10705 + }, + { + "epoch": 3.69194966385106, + "grad_norm": 1.3094544008032438, + "learning_rate": 1.691238631868376e-06, + "loss": 0.1371, + "step": 10710 + }, + { + "epoch": 3.693673504568178, + "grad_norm": 1.24079050631587, + "learning_rate": 1.6870553888346325e-06, + "loss": 0.1063, + "step": 10715 + }, + { + "epoch": 3.6953973452852957, + "grad_norm": 1.2174923355322707, + "learning_rate": 1.6828762755857214e-06, + "loss": 0.1276, + "step": 10720 + }, + { + "epoch": 3.6971211860024136, + "grad_norm": 1.2499719819707977, + "learning_rate": 1.6787012973311567e-06, + "loss": 0.1374, + "step": 10725 + }, + { + "epoch": 3.698845026719531, + "grad_norm": 1.4334182789428982, + "learning_rate": 1.6745304592753004e-06, + "loss": 0.1354, + "step": 10730 + }, + { + "epoch": 3.700568867436649, + "grad_norm": 1.278202095404078, + "learning_rate": 1.670363766617351e-06, + "loss": 0.126, + "step": 10735 + }, + { + "epoch": 3.7022927081537667, + "grad_norm": 1.3399633301297333, + "learning_rate": 1.6662012245513454e-06, + "loss": 0.1526, + "step": 10740 + }, + { + "epoch": 3.7040165488708845, + "grad_norm": 1.3435297851993446, + "learning_rate": 1.6620428382661391e-06, + "loss": 0.1283, + "step": 10745 + }, + { + "epoch": 3.7057403895880023, + "grad_norm": 1.237787900089323, + "learning_rate": 1.657888612945413e-06, + "loss": 0.1291, + "step": 10750 + }, + { + "epoch": 3.7074642303051197, + "grad_norm": 1.231154186186958, + "learning_rate": 1.6537385537676604e-06, + "loss": 0.1247, + "step": 10755 + }, + { + "epoch": 3.7091880710222376, + "grad_norm": 1.2283581519236173, + "learning_rate": 1.6495926659061779e-06, + "loss": 0.139, + "step": 10760 + }, + { + "epoch": 3.7109119117393554, + "grad_norm": 1.3180432600272551, + "learning_rate": 1.6454509545290647e-06, + "loss": 0.1158, + "step": 10765 + }, + { + "epoch": 3.712635752456473, + "grad_norm": 1.4453948312356553, + "learning_rate": 1.6413134247992112e-06, + "loss": 0.1218, + "step": 10770 + }, + { + "epoch": 3.7143595931735907, + "grad_norm": 1.2105396668085264, + "learning_rate": 1.6371800818743004e-06, + "loss": 0.1239, + "step": 10775 + }, + { + "epoch": 3.7160834338907085, + "grad_norm": 1.3962426884992203, + "learning_rate": 1.6330509309067921e-06, + "loss": 0.1273, + "step": 10780 + }, + { + "epoch": 3.7178072746078263, + "grad_norm": 1.3272134789736372, + "learning_rate": 1.6289259770439192e-06, + "loss": 0.1283, + "step": 10785 + }, + { + "epoch": 3.719531115324944, + "grad_norm": 1.4582300277803126, + "learning_rate": 1.624805225427687e-06, + "loss": 0.1383, + "step": 10790 + }, + { + "epoch": 3.7212549560420616, + "grad_norm": 1.4787466443055255, + "learning_rate": 1.6206886811948613e-06, + "loss": 0.1236, + "step": 10795 + }, + { + "epoch": 3.7229787967591794, + "grad_norm": 1.2562573715672862, + "learning_rate": 1.616576349476961e-06, + "loss": 0.135, + "step": 10800 + }, + { + "epoch": 3.7247026374762973, + "grad_norm": 1.1323987446276198, + "learning_rate": 1.6124682354002534e-06, + "loss": 0.1406, + "step": 10805 + }, + { + "epoch": 3.7264264781934147, + "grad_norm": 1.2233775897652701, + "learning_rate": 1.6083643440857538e-06, + "loss": 0.125, + "step": 10810 + }, + { + "epoch": 3.7281503189105325, + "grad_norm": 1.4222384020217527, + "learning_rate": 1.6042646806492074e-06, + "loss": 0.1466, + "step": 10815 + }, + { + "epoch": 3.7298741596276503, + "grad_norm": 1.3658861942644898, + "learning_rate": 1.6001692502010896e-06, + "loss": 0.108, + "step": 10820 + }, + { + "epoch": 3.731598000344768, + "grad_norm": 1.3315481859182554, + "learning_rate": 1.5960780578466045e-06, + "loss": 0.1394, + "step": 10825 + }, + { + "epoch": 3.733321841061886, + "grad_norm": 1.215603151961821, + "learning_rate": 1.591991108685666e-06, + "loss": 0.1308, + "step": 10830 + }, + { + "epoch": 3.7350456817790034, + "grad_norm": 1.2194561850927335, + "learning_rate": 1.5879084078129043e-06, + "loss": 0.1257, + "step": 10835 + }, + { + "epoch": 3.7367695224961213, + "grad_norm": 1.4393103623707952, + "learning_rate": 1.5838299603176533e-06, + "loss": 0.1361, + "step": 10840 + }, + { + "epoch": 3.738493363213239, + "grad_norm": 1.2893582131665366, + "learning_rate": 1.5797557712839412e-06, + "loss": 0.1204, + "step": 10845 + }, + { + "epoch": 3.740217203930357, + "grad_norm": 1.220522667960852, + "learning_rate": 1.57568584579049e-06, + "loss": 0.1242, + "step": 10850 + }, + { + "epoch": 3.741941044647475, + "grad_norm": 1.1576652642506275, + "learning_rate": 1.5716201889107051e-06, + "loss": 0.1043, + "step": 10855 + }, + { + "epoch": 3.743664885364592, + "grad_norm": 1.4260677520633134, + "learning_rate": 1.5675588057126762e-06, + "loss": 0.1413, + "step": 10860 + }, + { + "epoch": 3.74538872608171, + "grad_norm": 1.361725118807402, + "learning_rate": 1.5635017012591585e-06, + "loss": 0.139, + "step": 10865 + }, + { + "epoch": 3.747112566798828, + "grad_norm": 1.296965961384951, + "learning_rate": 1.5594488806075775e-06, + "loss": 0.142, + "step": 10870 + }, + { + "epoch": 3.7488364075159453, + "grad_norm": 1.5503628680491506, + "learning_rate": 1.5554003488100205e-06, + "loss": 0.1538, + "step": 10875 + }, + { + "epoch": 3.750560248233063, + "grad_norm": 1.337619682985209, + "learning_rate": 1.5513561109132247e-06, + "loss": 0.132, + "step": 10880 + }, + { + "epoch": 3.752284088950181, + "grad_norm": 1.5941157201208773, + "learning_rate": 1.5473161719585754e-06, + "loss": 0.1238, + "step": 10885 + }, + { + "epoch": 3.754007929667299, + "grad_norm": 1.2320881707975944, + "learning_rate": 1.543280536982098e-06, + "loss": 0.1408, + "step": 10890 + }, + { + "epoch": 3.7557317703844166, + "grad_norm": 2.04090156034139, + "learning_rate": 1.539249211014458e-06, + "loss": 0.1227, + "step": 10895 + }, + { + "epoch": 3.757455611101534, + "grad_norm": 1.247734741558676, + "learning_rate": 1.535222199080944e-06, + "loss": 0.1437, + "step": 10900 + }, + { + "epoch": 3.759179451818652, + "grad_norm": 1.3943424707463135, + "learning_rate": 1.5311995062014674e-06, + "loss": 0.1182, + "step": 10905 + }, + { + "epoch": 3.7609032925357697, + "grad_norm": 1.3327693645392733, + "learning_rate": 1.5271811373905583e-06, + "loss": 0.1264, + "step": 10910 + }, + { + "epoch": 3.7626271332528876, + "grad_norm": 1.3495899789263786, + "learning_rate": 1.5231670976573565e-06, + "loss": 0.1232, + "step": 10915 + }, + { + "epoch": 3.7643509739700054, + "grad_norm": 1.2950239065978737, + "learning_rate": 1.5191573920056025e-06, + "loss": 0.1302, + "step": 10920 + }, + { + "epoch": 3.766074814687123, + "grad_norm": 1.2862976776110107, + "learning_rate": 1.515152025433635e-06, + "loss": 0.129, + "step": 10925 + }, + { + "epoch": 3.7677986554042406, + "grad_norm": 1.1501424533467042, + "learning_rate": 1.5111510029343868e-06, + "loss": 0.1337, + "step": 10930 + }, + { + "epoch": 3.7695224961213585, + "grad_norm": 1.2436287570238818, + "learning_rate": 1.5071543294953722e-06, + "loss": 0.1313, + "step": 10935 + }, + { + "epoch": 3.771246336838476, + "grad_norm": 1.453319868604849, + "learning_rate": 1.5031620100986833e-06, + "loss": 0.1316, + "step": 10940 + }, + { + "epoch": 3.7729701775555937, + "grad_norm": 1.2621068718110429, + "learning_rate": 1.4991740497209895e-06, + "loss": 0.1348, + "step": 10945 + }, + { + "epoch": 3.7746940182727116, + "grad_norm": 1.271308202827298, + "learning_rate": 1.4951904533335204e-06, + "loss": 0.1278, + "step": 10950 + }, + { + "epoch": 3.7764178589898294, + "grad_norm": 1.133861444720207, + "learning_rate": 1.4912112259020706e-06, + "loss": 0.1393, + "step": 10955 + }, + { + "epoch": 3.7781416997069472, + "grad_norm": 1.1465500843261776, + "learning_rate": 1.487236372386987e-06, + "loss": 0.123, + "step": 10960 + }, + { + "epoch": 3.7798655404240646, + "grad_norm": 1.4475200808592783, + "learning_rate": 1.4832658977431635e-06, + "loss": 0.1256, + "step": 10965 + }, + { + "epoch": 3.7815893811411825, + "grad_norm": 1.4875096236385752, + "learning_rate": 1.4792998069200348e-06, + "loss": 0.1357, + "step": 10970 + }, + { + "epoch": 3.7833132218583003, + "grad_norm": 1.1171697163963556, + "learning_rate": 1.4753381048615706e-06, + "loss": 0.119, + "step": 10975 + }, + { + "epoch": 3.785037062575418, + "grad_norm": 1.2259841311742092, + "learning_rate": 1.4713807965062744e-06, + "loss": 0.1285, + "step": 10980 + }, + { + "epoch": 3.786760903292536, + "grad_norm": 1.4564214641351185, + "learning_rate": 1.4674278867871666e-06, + "loss": 0.1171, + "step": 10985 + }, + { + "epoch": 3.7884847440096534, + "grad_norm": 1.3354716528623003, + "learning_rate": 1.463479380631786e-06, + "loss": 0.1214, + "step": 10990 + }, + { + "epoch": 3.7902085847267712, + "grad_norm": 1.1938806946647718, + "learning_rate": 1.4595352829621856e-06, + "loss": 0.1354, + "step": 10995 + }, + { + "epoch": 3.791932425443889, + "grad_norm": 1.3794445650650593, + "learning_rate": 1.4555955986949204e-06, + "loss": 0.1292, + "step": 11000 + }, + { + "epoch": 3.7936562661610065, + "grad_norm": 1.2104736072519433, + "learning_rate": 1.4516603327410438e-06, + "loss": 0.1294, + "step": 11005 + }, + { + "epoch": 3.7953801068781243, + "grad_norm": 1.2556223572321454, + "learning_rate": 1.4477294900060994e-06, + "loss": 0.1192, + "step": 11010 + }, + { + "epoch": 3.797103947595242, + "grad_norm": 4.240080770042343, + "learning_rate": 1.4438030753901223e-06, + "loss": 0.1392, + "step": 11015 + }, + { + "epoch": 3.79882778831236, + "grad_norm": 1.4255185379359292, + "learning_rate": 1.4398810937876234e-06, + "loss": 0.1357, + "step": 11020 + }, + { + "epoch": 3.800551629029478, + "grad_norm": 1.430969451647483, + "learning_rate": 1.4359635500875868e-06, + "loss": 0.1318, + "step": 11025 + }, + { + "epoch": 3.8022754697465952, + "grad_norm": 1.4035212866383076, + "learning_rate": 1.43205044917347e-06, + "loss": 0.1137, + "step": 11030 + }, + { + "epoch": 3.803999310463713, + "grad_norm": 1.268086122543932, + "learning_rate": 1.4281417959231853e-06, + "loss": 0.1076, + "step": 11035 + }, + { + "epoch": 3.805723151180831, + "grad_norm": 1.453769281043299, + "learning_rate": 1.424237595209108e-06, + "loss": 0.1187, + "step": 11040 + }, + { + "epoch": 3.8074469918979488, + "grad_norm": 1.2930421143796562, + "learning_rate": 1.4203378518980554e-06, + "loss": 0.1279, + "step": 11045 + }, + { + "epoch": 3.8091708326150666, + "grad_norm": 1.085263627593601, + "learning_rate": 1.4164425708512952e-06, + "loss": 0.1227, + "step": 11050 + }, + { + "epoch": 3.810894673332184, + "grad_norm": 1.144411584916602, + "learning_rate": 1.412551756924529e-06, + "loss": 0.1257, + "step": 11055 + }, + { + "epoch": 3.812618514049302, + "grad_norm": 1.195013482913271, + "learning_rate": 1.408665414967888e-06, + "loss": 0.1416, + "step": 11060 + }, + { + "epoch": 3.8143423547664197, + "grad_norm": 1.4447775880354259, + "learning_rate": 1.4047835498259349e-06, + "loss": 0.1242, + "step": 11065 + }, + { + "epoch": 3.816066195483537, + "grad_norm": 1.3492475896084266, + "learning_rate": 1.4009061663376455e-06, + "loss": 0.1358, + "step": 11070 + }, + { + "epoch": 3.817790036200655, + "grad_norm": 1.1707568635202883, + "learning_rate": 1.3970332693364125e-06, + "loss": 0.1343, + "step": 11075 + }, + { + "epoch": 3.8195138769177728, + "grad_norm": 1.2811601038550247, + "learning_rate": 1.3931648636500372e-06, + "loss": 0.1136, + "step": 11080 + }, + { + "epoch": 3.8212377176348906, + "grad_norm": 1.2746899216199157, + "learning_rate": 1.389300954100718e-06, + "loss": 0.1337, + "step": 11085 + }, + { + "epoch": 3.8229615583520085, + "grad_norm": 1.1457523483560896, + "learning_rate": 1.3854415455050507e-06, + "loss": 0.1301, + "step": 11090 + }, + { + "epoch": 3.824685399069126, + "grad_norm": 1.2964019904296888, + "learning_rate": 1.3815866426740193e-06, + "loss": 0.1216, + "step": 11095 + }, + { + "epoch": 3.8264092397862437, + "grad_norm": 1.457836130755616, + "learning_rate": 1.3777362504129948e-06, + "loss": 0.1291, + "step": 11100 + }, + { + "epoch": 3.8281330805033615, + "grad_norm": 1.276492445624028, + "learning_rate": 1.373890373521722e-06, + "loss": 0.1173, + "step": 11105 + }, + { + "epoch": 3.8298569212204794, + "grad_norm": 1.3377137684512943, + "learning_rate": 1.3700490167943153e-06, + "loss": 0.1303, + "step": 11110 + }, + { + "epoch": 3.8315807619375972, + "grad_norm": 0.9776925487844316, + "learning_rate": 1.3662121850192594e-06, + "loss": 0.1061, + "step": 11115 + }, + { + "epoch": 3.8333046026547146, + "grad_norm": 1.2442278933032103, + "learning_rate": 1.3623798829793972e-06, + "loss": 0.131, + "step": 11120 + }, + { + "epoch": 3.8350284433718325, + "grad_norm": 1.647143889860017, + "learning_rate": 1.3585521154519226e-06, + "loss": 0.119, + "step": 11125 + }, + { + "epoch": 3.8367522840889503, + "grad_norm": 1.1130395591433442, + "learning_rate": 1.3547288872083765e-06, + "loss": 0.1208, + "step": 11130 + }, + { + "epoch": 3.8384761248060677, + "grad_norm": 1.245456149628818, + "learning_rate": 1.350910203014646e-06, + "loss": 0.1423, + "step": 11135 + }, + { + "epoch": 3.8401999655231855, + "grad_norm": 1.3163255724349603, + "learning_rate": 1.3470960676309491e-06, + "loss": 0.1285, + "step": 11140 + }, + { + "epoch": 3.8419238062403034, + "grad_norm": 1.1961848803512916, + "learning_rate": 1.3432864858118333e-06, + "loss": 0.117, + "step": 11145 + }, + { + "epoch": 3.8436476469574212, + "grad_norm": 1.190522361714228, + "learning_rate": 1.3394814623061752e-06, + "loss": 0.1282, + "step": 11150 + }, + { + "epoch": 3.845371487674539, + "grad_norm": 1.463827119485804, + "learning_rate": 1.3356810018571626e-06, + "loss": 0.1211, + "step": 11155 + }, + { + "epoch": 3.8470953283916565, + "grad_norm": 1.3327547679233624, + "learning_rate": 1.3318851092022994e-06, + "loss": 0.1312, + "step": 11160 + }, + { + "epoch": 3.8488191691087743, + "grad_norm": 1.4066396304155349, + "learning_rate": 1.3280937890733959e-06, + "loss": 0.1289, + "step": 11165 + }, + { + "epoch": 3.850543009825892, + "grad_norm": 1.3354027398687491, + "learning_rate": 1.324307046196559e-06, + "loss": 0.1287, + "step": 11170 + }, + { + "epoch": 3.85226685054301, + "grad_norm": 1.37528379829199, + "learning_rate": 1.3205248852921915e-06, + "loss": 0.1087, + "step": 11175 + }, + { + "epoch": 3.853990691260128, + "grad_norm": 1.4046439494981537, + "learning_rate": 1.316747311074984e-06, + "loss": 0.1437, + "step": 11180 + }, + { + "epoch": 3.8557145319772452, + "grad_norm": 1.2903397860103851, + "learning_rate": 1.3129743282539121e-06, + "loss": 0.1283, + "step": 11185 + }, + { + "epoch": 3.857438372694363, + "grad_norm": 1.3505503586265961, + "learning_rate": 1.3092059415322244e-06, + "loss": 0.1195, + "step": 11190 + }, + { + "epoch": 3.859162213411481, + "grad_norm": 1.2530372314439693, + "learning_rate": 1.305442155607441e-06, + "loss": 0.1153, + "step": 11195 + }, + { + "epoch": 3.8608860541285983, + "grad_norm": 1.5124853496926782, + "learning_rate": 1.3016829751713483e-06, + "loss": 0.1182, + "step": 11200 + }, + { + "epoch": 3.862609894845716, + "grad_norm": 1.1688470516223795, + "learning_rate": 1.2979284049099933e-06, + "loss": 0.1139, + "step": 11205 + }, + { + "epoch": 3.864333735562834, + "grad_norm": 1.1531170510402515, + "learning_rate": 1.2941784495036713e-06, + "loss": 0.1203, + "step": 11210 + }, + { + "epoch": 3.866057576279952, + "grad_norm": 1.3078188811256894, + "learning_rate": 1.2904331136269267e-06, + "loss": 0.1086, + "step": 11215 + }, + { + "epoch": 3.8677814169970697, + "grad_norm": 1.511012410167427, + "learning_rate": 1.2866924019485488e-06, + "loss": 0.1303, + "step": 11220 + }, + { + "epoch": 3.869505257714187, + "grad_norm": 1.4950061574931324, + "learning_rate": 1.282956319131558e-06, + "loss": 0.1164, + "step": 11225 + }, + { + "epoch": 3.871229098431305, + "grad_norm": 1.2534647224767244, + "learning_rate": 1.279224869833205e-06, + "loss": 0.141, + "step": 11230 + }, + { + "epoch": 3.8729529391484228, + "grad_norm": 1.4769203165496234, + "learning_rate": 1.2754980587049693e-06, + "loss": 0.1193, + "step": 11235 + }, + { + "epoch": 3.87467677986554, + "grad_norm": 1.1552930698672923, + "learning_rate": 1.271775890392542e-06, + "loss": 0.1183, + "step": 11240 + }, + { + "epoch": 3.876400620582658, + "grad_norm": 1.1571956169173494, + "learning_rate": 1.2680583695358329e-06, + "loss": 0.1164, + "step": 11245 + }, + { + "epoch": 3.878124461299776, + "grad_norm": 1.3191622152880862, + "learning_rate": 1.2643455007689526e-06, + "loss": 0.1414, + "step": 11250 + }, + { + "epoch": 3.8798483020168937, + "grad_norm": 1.2903336535394538, + "learning_rate": 1.260637288720218e-06, + "loss": 0.1271, + "step": 11255 + }, + { + "epoch": 3.8815721427340115, + "grad_norm": 1.3086834300730585, + "learning_rate": 1.2569337380121371e-06, + "loss": 0.1132, + "step": 11260 + }, + { + "epoch": 3.883295983451129, + "grad_norm": 1.4192245593261936, + "learning_rate": 1.253234853261408e-06, + "loss": 0.1426, + "step": 11265 + }, + { + "epoch": 3.8850198241682468, + "grad_norm": 1.210821345352799, + "learning_rate": 1.2495406390789155e-06, + "loss": 0.1261, + "step": 11270 + }, + { + "epoch": 3.8867436648853646, + "grad_norm": 1.2492008174085303, + "learning_rate": 1.245851100069717e-06, + "loss": 0.1117, + "step": 11275 + }, + { + "epoch": 3.8884675056024824, + "grad_norm": 1.3417342393956402, + "learning_rate": 1.242166240833047e-06, + "loss": 0.0998, + "step": 11280 + }, + { + "epoch": 3.8901913463196003, + "grad_norm": 1.4186445506236978, + "learning_rate": 1.2384860659623044e-06, + "loss": 0.1347, + "step": 11285 + }, + { + "epoch": 3.8919151870367177, + "grad_norm": 1.2949279297232827, + "learning_rate": 1.2348105800450489e-06, + "loss": 0.1227, + "step": 11290 + }, + { + "epoch": 3.8936390277538355, + "grad_norm": 1.4243304940922359, + "learning_rate": 1.2311397876629932e-06, + "loss": 0.1318, + "step": 11295 + }, + { + "epoch": 3.8953628684709534, + "grad_norm": 1.2905485068936577, + "learning_rate": 1.2274736933920006e-06, + "loss": 0.1398, + "step": 11300 + }, + { + "epoch": 3.8970867091880708, + "grad_norm": 1.3236274887074178, + "learning_rate": 1.2238123018020808e-06, + "loss": 0.1081, + "step": 11305 + }, + { + "epoch": 3.8988105499051886, + "grad_norm": 1.2931686422207205, + "learning_rate": 1.2201556174573775e-06, + "loss": 0.1236, + "step": 11310 + }, + { + "epoch": 3.9005343906223064, + "grad_norm": 1.5918640685585563, + "learning_rate": 1.216503644916166e-06, + "loss": 0.1336, + "step": 11315 + }, + { + "epoch": 3.9022582313394243, + "grad_norm": 4.094605028147942, + "learning_rate": 1.2128563887308514e-06, + "loss": 0.1309, + "step": 11320 + }, + { + "epoch": 3.903982072056542, + "grad_norm": 1.3958336497684238, + "learning_rate": 1.2092138534479593e-06, + "loss": 0.1282, + "step": 11325 + }, + { + "epoch": 3.9057059127736595, + "grad_norm": 1.3143188624337778, + "learning_rate": 1.2055760436081281e-06, + "loss": 0.1139, + "step": 11330 + }, + { + "epoch": 3.9074297534907774, + "grad_norm": 1.0715275139956435, + "learning_rate": 1.201942963746105e-06, + "loss": 0.1108, + "step": 11335 + }, + { + "epoch": 3.909153594207895, + "grad_norm": 1.171293519452673, + "learning_rate": 1.1983146183907457e-06, + "loss": 0.1257, + "step": 11340 + }, + { + "epoch": 3.910877434925013, + "grad_norm": 1.3862634054133804, + "learning_rate": 1.1946910120649996e-06, + "loss": 0.1293, + "step": 11345 + }, + { + "epoch": 3.912601275642131, + "grad_norm": 1.2786235687218053, + "learning_rate": 1.1910721492859083e-06, + "loss": 0.1145, + "step": 11350 + }, + { + "epoch": 3.9143251163592483, + "grad_norm": 1.4781329272979127, + "learning_rate": 1.1874580345646054e-06, + "loss": 0.1165, + "step": 11355 + }, + { + "epoch": 3.916048957076366, + "grad_norm": 1.3180672481231375, + "learning_rate": 1.1838486724062992e-06, + "loss": 0.1303, + "step": 11360 + }, + { + "epoch": 3.917772797793484, + "grad_norm": 1.3869240003211991, + "learning_rate": 1.18024406731028e-06, + "loss": 0.1331, + "step": 11365 + }, + { + "epoch": 3.9194966385106014, + "grad_norm": 1.3405785174041558, + "learning_rate": 1.1766442237699016e-06, + "loss": 0.1226, + "step": 11370 + }, + { + "epoch": 3.921220479227719, + "grad_norm": 1.2456815469065652, + "learning_rate": 1.173049146272589e-06, + "loss": 0.1141, + "step": 11375 + }, + { + "epoch": 3.922944319944837, + "grad_norm": 1.314857528219527, + "learning_rate": 1.1694588392998207e-06, + "loss": 0.1337, + "step": 11380 + }, + { + "epoch": 3.924668160661955, + "grad_norm": 1.3116783961627352, + "learning_rate": 1.1658733073271294e-06, + "loss": 0.1178, + "step": 11385 + }, + { + "epoch": 3.9263920013790727, + "grad_norm": 1.4187715000423577, + "learning_rate": 1.1622925548240993e-06, + "loss": 0.1138, + "step": 11390 + }, + { + "epoch": 3.92811584209619, + "grad_norm": 1.3111507735045609, + "learning_rate": 1.158716586254352e-06, + "loss": 0.1331, + "step": 11395 + }, + { + "epoch": 3.929839682813308, + "grad_norm": 1.3881387781864114, + "learning_rate": 1.1551454060755468e-06, + "loss": 0.1257, + "step": 11400 + }, + { + "epoch": 3.931563523530426, + "grad_norm": 1.381550436277335, + "learning_rate": 1.1515790187393761e-06, + "loss": 0.1155, + "step": 11405 + }, + { + "epoch": 3.9332873642475437, + "grad_norm": 1.307666267073457, + "learning_rate": 1.1480174286915568e-06, + "loss": 0.1298, + "step": 11410 + }, + { + "epoch": 3.9350112049646615, + "grad_norm": 1.2572515169693843, + "learning_rate": 1.144460640371825e-06, + "loss": 0.1342, + "step": 11415 + }, + { + "epoch": 3.936735045681779, + "grad_norm": 1.3827825621819523, + "learning_rate": 1.140908658213929e-06, + "loss": 0.1108, + "step": 11420 + }, + { + "epoch": 3.9384588863988967, + "grad_norm": 1.4326374483997706, + "learning_rate": 1.1373614866456318e-06, + "loss": 0.115, + "step": 11425 + }, + { + "epoch": 3.9401827271160146, + "grad_norm": 1.2819893498360015, + "learning_rate": 1.1338191300886947e-06, + "loss": 0.1136, + "step": 11430 + }, + { + "epoch": 3.941906567833132, + "grad_norm": 1.3726546493538927, + "learning_rate": 1.1302815929588768e-06, + "loss": 0.1251, + "step": 11435 + }, + { + "epoch": 3.94363040855025, + "grad_norm": 1.3668066551952909, + "learning_rate": 1.1267488796659332e-06, + "loss": 0.118, + "step": 11440 + }, + { + "epoch": 3.9453542492673677, + "grad_norm": 1.2906622087235946, + "learning_rate": 1.123220994613602e-06, + "loss": 0.1125, + "step": 11445 + }, + { + "epoch": 3.9470780899844855, + "grad_norm": 1.4589184065164795, + "learning_rate": 1.119697942199607e-06, + "loss": 0.1252, + "step": 11450 + }, + { + "epoch": 3.9488019307016033, + "grad_norm": 1.2592519865439942, + "learning_rate": 1.116179726815641e-06, + "loss": 0.1202, + "step": 11455 + }, + { + "epoch": 3.9505257714187207, + "grad_norm": 1.1442893223089237, + "learning_rate": 1.1126663528473746e-06, + "loss": 0.1315, + "step": 11460 + }, + { + "epoch": 3.9522496121358386, + "grad_norm": 1.3414462655063484, + "learning_rate": 1.109157824674439e-06, + "loss": 0.1264, + "step": 11465 + }, + { + "epoch": 3.9539734528529564, + "grad_norm": 1.447593237017541, + "learning_rate": 1.105654146670424e-06, + "loss": 0.1184, + "step": 11470 + }, + { + "epoch": 3.9556972935700743, + "grad_norm": 1.318433941420654, + "learning_rate": 1.1021553232028776e-06, + "loss": 0.1258, + "step": 11475 + }, + { + "epoch": 3.957421134287192, + "grad_norm": 1.439497552831529, + "learning_rate": 1.0986613586332918e-06, + "loss": 0.1283, + "step": 11480 + }, + { + "epoch": 3.9591449750043095, + "grad_norm": 1.5088135652965549, + "learning_rate": 1.0951722573171054e-06, + "loss": 0.1243, + "step": 11485 + }, + { + "epoch": 3.9608688157214273, + "grad_norm": 1.4644340894445993, + "learning_rate": 1.091688023603691e-06, + "loss": 0.1356, + "step": 11490 + }, + { + "epoch": 3.962592656438545, + "grad_norm": 1.2528418995261748, + "learning_rate": 1.088208661836358e-06, + "loss": 0.1129, + "step": 11495 + }, + { + "epoch": 3.9643164971556626, + "grad_norm": 1.2066685547187292, + "learning_rate": 1.0847341763523395e-06, + "loss": 0.1194, + "step": 11500 + }, + { + "epoch": 3.9660403378727804, + "grad_norm": 1.4827307883548029, + "learning_rate": 1.0812645714827891e-06, + "loss": 0.1175, + "step": 11505 + }, + { + "epoch": 3.9677641785898983, + "grad_norm": 1.3134738206153762, + "learning_rate": 1.0777998515527803e-06, + "loss": 0.1124, + "step": 11510 + }, + { + "epoch": 3.969488019307016, + "grad_norm": 1.1625180293338828, + "learning_rate": 1.0743400208812943e-06, + "loss": 0.1115, + "step": 11515 + }, + { + "epoch": 3.971211860024134, + "grad_norm": 1.3896292456470711, + "learning_rate": 1.0708850837812168e-06, + "loss": 0.1218, + "step": 11520 + }, + { + "epoch": 3.9729357007412514, + "grad_norm": 1.3254372568094368, + "learning_rate": 1.0674350445593357e-06, + "loss": 0.1236, + "step": 11525 + }, + { + "epoch": 3.974659541458369, + "grad_norm": 1.293089181896352, + "learning_rate": 1.063989907516334e-06, + "loss": 0.1102, + "step": 11530 + }, + { + "epoch": 3.976383382175487, + "grad_norm": 1.4370992489184637, + "learning_rate": 1.0605496769467815e-06, + "loss": 0.1459, + "step": 11535 + }, + { + "epoch": 3.978107222892605, + "grad_norm": 1.2210879436739124, + "learning_rate": 1.0571143571391312e-06, + "loss": 0.13, + "step": 11540 + }, + { + "epoch": 3.9798310636097227, + "grad_norm": 1.46020719541028, + "learning_rate": 1.0536839523757182e-06, + "loss": 0.1285, + "step": 11545 + }, + { + "epoch": 3.98155490432684, + "grad_norm": 1.2939461867294593, + "learning_rate": 1.0502584669327476e-06, + "loss": 0.1078, + "step": 11550 + }, + { + "epoch": 3.983278745043958, + "grad_norm": 1.1821278440593554, + "learning_rate": 1.0468379050802914e-06, + "loss": 0.1198, + "step": 11555 + }, + { + "epoch": 3.985002585761076, + "grad_norm": 1.442776995049047, + "learning_rate": 1.0434222710822882e-06, + "loss": 0.1121, + "step": 11560 + }, + { + "epoch": 3.986726426478193, + "grad_norm": 1.3873685783529446, + "learning_rate": 1.0400115691965296e-06, + "loss": 0.1262, + "step": 11565 + }, + { + "epoch": 3.988450267195311, + "grad_norm": 1.3681284056301366, + "learning_rate": 1.036605803674663e-06, + "loss": 0.1364, + "step": 11570 + }, + { + "epoch": 3.990174107912429, + "grad_norm": 1.404052842629089, + "learning_rate": 1.0332049787621767e-06, + "loss": 0.1247, + "step": 11575 + }, + { + "epoch": 3.9918979486295467, + "grad_norm": 1.2907010539025423, + "learning_rate": 1.0298090986984077e-06, + "loss": 0.1322, + "step": 11580 + }, + { + "epoch": 3.9936217893466646, + "grad_norm": 1.2243692650404463, + "learning_rate": 1.0264181677165225e-06, + "loss": 0.1127, + "step": 11585 + }, + { + "epoch": 3.995345630063782, + "grad_norm": 1.2521298308335556, + "learning_rate": 1.0230321900435191e-06, + "loss": 0.1073, + "step": 11590 + }, + { + "epoch": 3.9970694707809, + "grad_norm": 1.3277773009406997, + "learning_rate": 1.019651169900226e-06, + "loss": 0.1065, + "step": 11595 + }, + { + "epoch": 3.9987933114980176, + "grad_norm": 1.9013747081077799, + "learning_rate": 1.0162751115012865e-06, + "loss": 0.113, + "step": 11600 + }, + { + "epoch": 4.000344768143424, + "grad_norm": 1.192967741884643, + "learning_rate": 1.0129040190551591e-06, + "loss": 0.1058, + "step": 11605 + }, + { + "epoch": 4.002068608860541, + "grad_norm": 1.1246415912642835, + "learning_rate": 1.009537896764115e-06, + "loss": 0.1085, + "step": 11610 + }, + { + "epoch": 4.003792449577659, + "grad_norm": 1.2283374992652234, + "learning_rate": 1.0061767488242297e-06, + "loss": 0.1203, + "step": 11615 + }, + { + "epoch": 4.005516290294777, + "grad_norm": 1.4975157666010093, + "learning_rate": 1.0028205794253748e-06, + "loss": 0.1015, + "step": 11620 + }, + { + "epoch": 4.007240131011894, + "grad_norm": 1.3421887596802118, + "learning_rate": 9.994693927512156e-07, + "loss": 0.0983, + "step": 11625 + }, + { + "epoch": 4.008963971729012, + "grad_norm": 1.111806480271007, + "learning_rate": 9.961231929792115e-07, + "loss": 0.0997, + "step": 11630 + }, + { + "epoch": 4.01068781244613, + "grad_norm": 1.3154458261628559, + "learning_rate": 9.927819842805997e-07, + "loss": 0.1111, + "step": 11635 + }, + { + "epoch": 4.012411653163248, + "grad_norm": 1.2893096837104474, + "learning_rate": 9.894457708203976e-07, + "loss": 0.0981, + "step": 11640 + }, + { + "epoch": 4.0141354938803655, + "grad_norm": 1.3579685588749366, + "learning_rate": 9.861145567573976e-07, + "loss": 0.1009, + "step": 11645 + }, + { + "epoch": 4.015859334597483, + "grad_norm": 1.7631269267504865, + "learning_rate": 9.827883462441568e-07, + "loss": 0.1221, + "step": 11650 + }, + { + "epoch": 4.017583175314601, + "grad_norm": 1.80720466145997, + "learning_rate": 9.794671434269987e-07, + "loss": 0.1041, + "step": 11655 + }, + { + "epoch": 4.019307016031719, + "grad_norm": 1.1128224846323527, + "learning_rate": 9.76150952446e-07, + "loss": 0.0921, + "step": 11660 + }, + { + "epoch": 4.021030856748836, + "grad_norm": 1.1635046904275557, + "learning_rate": 9.728397774349957e-07, + "loss": 0.0987, + "step": 11665 + }, + { + "epoch": 4.022754697465954, + "grad_norm": 1.4913858189939422, + "learning_rate": 9.695336225215624e-07, + "loss": 0.1244, + "step": 11670 + }, + { + "epoch": 4.024478538183072, + "grad_norm": 1.3770127707009217, + "learning_rate": 9.662324918270205e-07, + "loss": 0.1181, + "step": 11675 + }, + { + "epoch": 4.02620237890019, + "grad_norm": 1.1304720331491682, + "learning_rate": 9.6293638946643e-07, + "loss": 0.1049, + "step": 11680 + }, + { + "epoch": 4.027926219617307, + "grad_norm": 1.4254678750933454, + "learning_rate": 9.596453195485795e-07, + "loss": 0.0927, + "step": 11685 + }, + { + "epoch": 4.029650060334425, + "grad_norm": 1.3083179637487954, + "learning_rate": 9.563592861759867e-07, + "loss": 0.1054, + "step": 11690 + }, + { + "epoch": 4.031373901051543, + "grad_norm": 1.186237793678765, + "learning_rate": 9.53078293444889e-07, + "loss": 0.1036, + "step": 11695 + }, + { + "epoch": 4.03309774176866, + "grad_norm": 1.9498169472536708, + "learning_rate": 9.498023454452426e-07, + "loss": 0.0969, + "step": 11700 + }, + { + "epoch": 4.034821582485779, + "grad_norm": 1.3328954105824022, + "learning_rate": 9.465314462607128e-07, + "loss": 0.1089, + "step": 11705 + }, + { + "epoch": 4.036545423202896, + "grad_norm": 3.2855682666504435, + "learning_rate": 9.432655999686713e-07, + "loss": 0.0891, + "step": 11710 + }, + { + "epoch": 4.0382692639200135, + "grad_norm": 1.2580259239639295, + "learning_rate": 9.400048106401949e-07, + "loss": 0.0987, + "step": 11715 + }, + { + "epoch": 4.039993104637132, + "grad_norm": 1.1296615390957263, + "learning_rate": 9.367490823400516e-07, + "loss": 0.1112, + "step": 11720 + }, + { + "epoch": 4.041716945354249, + "grad_norm": 1.7251392205965974, + "learning_rate": 9.334984191267022e-07, + "loss": 0.1165, + "step": 11725 + }, + { + "epoch": 4.043440786071367, + "grad_norm": 1.3810324511738785, + "learning_rate": 9.302528250522946e-07, + "loss": 0.1138, + "step": 11730 + }, + { + "epoch": 4.045164626788485, + "grad_norm": 1.1635481284634455, + "learning_rate": 9.270123041626588e-07, + "loss": 0.0961, + "step": 11735 + }, + { + "epoch": 4.046888467505602, + "grad_norm": 1.390536186188031, + "learning_rate": 9.237768604972975e-07, + "loss": 0.1039, + "step": 11740 + }, + { + "epoch": 4.048612308222721, + "grad_norm": 1.5211267859585615, + "learning_rate": 9.205464980893852e-07, + "loss": 0.1034, + "step": 11745 + }, + { + "epoch": 4.050336148939838, + "grad_norm": 1.5343426141391991, + "learning_rate": 9.17321220965765e-07, + "loss": 0.1197, + "step": 11750 + }, + { + "epoch": 4.052059989656955, + "grad_norm": 1.5225286593304013, + "learning_rate": 9.141010331469385e-07, + "loss": 0.1111, + "step": 11755 + }, + { + "epoch": 4.053783830374074, + "grad_norm": 1.5468434745235593, + "learning_rate": 9.108859386470614e-07, + "loss": 0.0948, + "step": 11760 + }, + { + "epoch": 4.055507671091191, + "grad_norm": 1.1097092908953101, + "learning_rate": 9.076759414739455e-07, + "loss": 0.0932, + "step": 11765 + }, + { + "epoch": 4.057231511808309, + "grad_norm": 1.3743748526969284, + "learning_rate": 9.044710456290429e-07, + "loss": 0.1101, + "step": 11770 + }, + { + "epoch": 4.058955352525427, + "grad_norm": 1.6347840548319619, + "learning_rate": 9.012712551074515e-07, + "loss": 0.1118, + "step": 11775 + }, + { + "epoch": 4.060679193242544, + "grad_norm": 1.3727861526025247, + "learning_rate": 8.980765738979003e-07, + "loss": 0.1056, + "step": 11780 + }, + { + "epoch": 4.062403033959662, + "grad_norm": 1.3808153758538497, + "learning_rate": 8.948870059827547e-07, + "loss": 0.1137, + "step": 11785 + }, + { + "epoch": 4.06412687467678, + "grad_norm": 1.2235266519823085, + "learning_rate": 8.917025553380005e-07, + "loss": 0.1047, + "step": 11790 + }, + { + "epoch": 4.065850715393897, + "grad_norm": 1.4477415851054352, + "learning_rate": 8.885232259332472e-07, + "loss": 0.0935, + "step": 11795 + }, + { + "epoch": 4.0675745561110155, + "grad_norm": 1.4397584466110982, + "learning_rate": 8.853490217317223e-07, + "loss": 0.1004, + "step": 11800 + }, + { + "epoch": 4.069298396828133, + "grad_norm": 1.4921000596319516, + "learning_rate": 8.821799466902603e-07, + "loss": 0.102, + "step": 11805 + }, + { + "epoch": 4.071022237545251, + "grad_norm": 1.0393458179033408, + "learning_rate": 8.790160047593038e-07, + "loss": 0.1139, + "step": 11810 + }, + { + "epoch": 4.072746078262369, + "grad_norm": 1.037696317340174, + "learning_rate": 8.758571998828979e-07, + "loss": 0.1012, + "step": 11815 + }, + { + "epoch": 4.074469918979486, + "grad_norm": 1.7236463427496, + "learning_rate": 8.727035359986841e-07, + "loss": 0.1029, + "step": 11820 + }, + { + "epoch": 4.076193759696604, + "grad_norm": 1.358343552358338, + "learning_rate": 8.695550170378924e-07, + "loss": 0.1127, + "step": 11825 + }, + { + "epoch": 4.077917600413722, + "grad_norm": 1.1162062249181175, + "learning_rate": 8.664116469253403e-07, + "loss": 0.112, + "step": 11830 + }, + { + "epoch": 4.07964144113084, + "grad_norm": 1.1491529791012212, + "learning_rate": 8.632734295794309e-07, + "loss": 0.0937, + "step": 11835 + }, + { + "epoch": 4.081365281847957, + "grad_norm": 1.4900000940003695, + "learning_rate": 8.60140368912139e-07, + "loss": 0.1171, + "step": 11840 + }, + { + "epoch": 4.083089122565075, + "grad_norm": 1.3452224444177885, + "learning_rate": 8.570124688290121e-07, + "loss": 0.1068, + "step": 11845 + }, + { + "epoch": 4.084812963282193, + "grad_norm": 1.4799116085086237, + "learning_rate": 8.538897332291685e-07, + "loss": 0.1011, + "step": 11850 + }, + { + "epoch": 4.08653680399931, + "grad_norm": 1.1445816428457665, + "learning_rate": 8.507721660052837e-07, + "loss": 0.1177, + "step": 11855 + }, + { + "epoch": 4.088260644716428, + "grad_norm": 1.206107159018617, + "learning_rate": 8.476597710435952e-07, + "loss": 0.0927, + "step": 11860 + }, + { + "epoch": 4.089984485433546, + "grad_norm": 1.2100060217624893, + "learning_rate": 8.445525522238879e-07, + "loss": 0.106, + "step": 11865 + }, + { + "epoch": 4.0917083261506635, + "grad_norm": 1.4458123099958744, + "learning_rate": 8.414505134195e-07, + "loss": 0.1146, + "step": 11870 + }, + { + "epoch": 4.093432166867782, + "grad_norm": 1.3991243941204066, + "learning_rate": 8.383536584973084e-07, + "loss": 0.1121, + "step": 11875 + }, + { + "epoch": 4.095156007584899, + "grad_norm": 1.1932901120313424, + "learning_rate": 8.352619913177273e-07, + "loss": 0.1047, + "step": 11880 + }, + { + "epoch": 4.096879848302017, + "grad_norm": 1.5002107075033384, + "learning_rate": 8.321755157347089e-07, + "loss": 0.0987, + "step": 11885 + }, + { + "epoch": 4.098603689019135, + "grad_norm": 1.3690348433153468, + "learning_rate": 8.290942355957277e-07, + "loss": 0.1146, + "step": 11890 + }, + { + "epoch": 4.100327529736252, + "grad_norm": 1.078835584076048, + "learning_rate": 8.260181547417878e-07, + "loss": 0.1078, + "step": 11895 + }, + { + "epoch": 4.1020513704533705, + "grad_norm": 1.48676742848251, + "learning_rate": 8.229472770074065e-07, + "loss": 0.1132, + "step": 11900 + }, + { + "epoch": 4.103775211170488, + "grad_norm": 1.4042825662154081, + "learning_rate": 8.1988160622062e-07, + "loss": 0.0956, + "step": 11905 + }, + { + "epoch": 4.105499051887605, + "grad_norm": 1.387523325741912, + "learning_rate": 8.168211462029707e-07, + "loss": 0.1088, + "step": 11910 + }, + { + "epoch": 4.107222892604724, + "grad_norm": 1.312285694460311, + "learning_rate": 8.137659007695043e-07, + "loss": 0.112, + "step": 11915 + }, + { + "epoch": 4.108946733321841, + "grad_norm": 1.2004785098009025, + "learning_rate": 8.107158737287707e-07, + "loss": 0.1062, + "step": 11920 + }, + { + "epoch": 4.110670574038958, + "grad_norm": 1.242550070156778, + "learning_rate": 8.076710688828115e-07, + "loss": 0.0925, + "step": 11925 + }, + { + "epoch": 4.112394414756077, + "grad_norm": 1.4076478021887935, + "learning_rate": 8.046314900271573e-07, + "loss": 0.0982, + "step": 11930 + }, + { + "epoch": 4.114118255473194, + "grad_norm": 1.399082084580976, + "learning_rate": 8.015971409508277e-07, + "loss": 0.1022, + "step": 11935 + }, + { + "epoch": 4.115842096190312, + "grad_norm": 1.4885978958615196, + "learning_rate": 7.985680254363226e-07, + "loss": 0.1139, + "step": 11940 + }, + { + "epoch": 4.11756593690743, + "grad_norm": 1.2138767824937315, + "learning_rate": 7.955441472596154e-07, + "loss": 0.0982, + "step": 11945 + }, + { + "epoch": 4.119289777624547, + "grad_norm": 1.1093407827318338, + "learning_rate": 7.925255101901508e-07, + "loss": 0.0796, + "step": 11950 + }, + { + "epoch": 4.1210136183416655, + "grad_norm": 1.1675549711136834, + "learning_rate": 7.895121179908444e-07, + "loss": 0.1077, + "step": 11955 + }, + { + "epoch": 4.122737459058783, + "grad_norm": 1.4988544671297996, + "learning_rate": 7.865039744180691e-07, + "loss": 0.0982, + "step": 11960 + }, + { + "epoch": 4.1244612997759, + "grad_norm": 1.5360699817741974, + "learning_rate": 7.835010832216567e-07, + "loss": 0.1106, + "step": 11965 + }, + { + "epoch": 4.1261851404930185, + "grad_norm": 1.289086933199439, + "learning_rate": 7.805034481448937e-07, + "loss": 0.1015, + "step": 11970 + }, + { + "epoch": 4.127908981210136, + "grad_norm": 1.4227311446805668, + "learning_rate": 7.775110729245095e-07, + "loss": 0.0944, + "step": 11975 + }, + { + "epoch": 4.129632821927254, + "grad_norm": 1.231740254467238, + "learning_rate": 7.745239612906835e-07, + "loss": 0.1083, + "step": 11980 + }, + { + "epoch": 4.131356662644372, + "grad_norm": 1.3202500427322454, + "learning_rate": 7.715421169670273e-07, + "loss": 0.0985, + "step": 11985 + }, + { + "epoch": 4.133080503361489, + "grad_norm": 1.24611953602762, + "learning_rate": 7.685655436705913e-07, + "loss": 0.1113, + "step": 11990 + }, + { + "epoch": 4.134804344078607, + "grad_norm": 1.3017889243267873, + "learning_rate": 7.65594245111852e-07, + "loss": 0.1012, + "step": 11995 + }, + { + "epoch": 4.136528184795725, + "grad_norm": 1.2667733889442665, + "learning_rate": 7.626282249947115e-07, + "loss": 0.0988, + "step": 12000 + }, + { + "epoch": 4.138252025512843, + "grad_norm": 1.2624752669186925, + "learning_rate": 7.596674870164939e-07, + "loss": 0.1151, + "step": 12005 + }, + { + "epoch": 4.13997586622996, + "grad_norm": 1.2812322088667052, + "learning_rate": 7.567120348679369e-07, + "loss": 0.0921, + "step": 12010 + }, + { + "epoch": 4.141699706947078, + "grad_norm": 1.3834748467633617, + "learning_rate": 7.537618722331874e-07, + "loss": 0.1054, + "step": 12015 + }, + { + "epoch": 4.143423547664196, + "grad_norm": 1.2230869497311545, + "learning_rate": 7.50817002789802e-07, + "loss": 0.1001, + "step": 12020 + }, + { + "epoch": 4.1451473883813135, + "grad_norm": 1.6538512619424104, + "learning_rate": 7.478774302087394e-07, + "loss": 0.1172, + "step": 12025 + }, + { + "epoch": 4.146871229098432, + "grad_norm": 1.376358870710376, + "learning_rate": 7.449431581543526e-07, + "loss": 0.118, + "step": 12030 + }, + { + "epoch": 4.148595069815549, + "grad_norm": 1.212496811634599, + "learning_rate": 7.420141902843864e-07, + "loss": 0.1052, + "step": 12035 + }, + { + "epoch": 4.1503189105326665, + "grad_norm": 1.2727108505858757, + "learning_rate": 7.39090530249978e-07, + "loss": 0.0962, + "step": 12040 + }, + { + "epoch": 4.152042751249785, + "grad_norm": 1.3758968413519976, + "learning_rate": 7.361721816956447e-07, + "loss": 0.0987, + "step": 12045 + }, + { + "epoch": 4.153766591966902, + "grad_norm": 1.2697425996431435, + "learning_rate": 7.332591482592827e-07, + "loss": 0.1068, + "step": 12050 + }, + { + "epoch": 4.15549043268402, + "grad_norm": 1.2921233256220757, + "learning_rate": 7.303514335721651e-07, + "loss": 0.1126, + "step": 12055 + }, + { + "epoch": 4.157214273401138, + "grad_norm": 1.2587095157496573, + "learning_rate": 7.274490412589319e-07, + "loss": 0.0932, + "step": 12060 + }, + { + "epoch": 4.158938114118255, + "grad_norm": 1.1740100893674308, + "learning_rate": 7.245519749375907e-07, + "loss": 0.0908, + "step": 12065 + }, + { + "epoch": 4.160661954835374, + "grad_norm": 1.301879362854531, + "learning_rate": 7.216602382195081e-07, + "loss": 0.1091, + "step": 12070 + }, + { + "epoch": 4.162385795552491, + "grad_norm": 1.2075119152246632, + "learning_rate": 7.187738347094097e-07, + "loss": 0.1121, + "step": 12075 + }, + { + "epoch": 4.164109636269608, + "grad_norm": 1.5191436087557528, + "learning_rate": 7.158927680053696e-07, + "loss": 0.1132, + "step": 12080 + }, + { + "epoch": 4.165833476986727, + "grad_norm": 1.5201548964636065, + "learning_rate": 7.130170416988102e-07, + "loss": 0.1211, + "step": 12085 + }, + { + "epoch": 4.167557317703844, + "grad_norm": 1.2720001429559575, + "learning_rate": 7.101466593744999e-07, + "loss": 0.102, + "step": 12090 + }, + { + "epoch": 4.1692811584209615, + "grad_norm": 1.3372610104815759, + "learning_rate": 7.072816246105402e-07, + "loss": 0.0978, + "step": 12095 + }, + { + "epoch": 4.17100499913808, + "grad_norm": 1.3131973260898593, + "learning_rate": 7.044219409783715e-07, + "loss": 0.0862, + "step": 12100 + }, + { + "epoch": 4.172728839855197, + "grad_norm": 1.3462693147357956, + "learning_rate": 7.015676120427595e-07, + "loss": 0.1075, + "step": 12105 + }, + { + "epoch": 4.174452680572315, + "grad_norm": 1.4543187413051926, + "learning_rate": 6.987186413617997e-07, + "loss": 0.1045, + "step": 12110 + }, + { + "epoch": 4.176176521289433, + "grad_norm": 0.943878434492626, + "learning_rate": 6.95875032486904e-07, + "loss": 0.0831, + "step": 12115 + }, + { + "epoch": 4.17790036200655, + "grad_norm": 1.4191410311195434, + "learning_rate": 6.930367889628009e-07, + "loss": 0.0962, + "step": 12120 + }, + { + "epoch": 4.1796242027236685, + "grad_norm": 1.4224130732823812, + "learning_rate": 6.902039143275341e-07, + "loss": 0.1021, + "step": 12125 + }, + { + "epoch": 4.181348043440786, + "grad_norm": 1.6031111083930405, + "learning_rate": 6.87376412112451e-07, + "loss": 0.1133, + "step": 12130 + }, + { + "epoch": 4.183071884157904, + "grad_norm": 1.254260518232728, + "learning_rate": 6.845542858422016e-07, + "loss": 0.107, + "step": 12135 + }, + { + "epoch": 4.184795724875022, + "grad_norm": 1.3031521917920925, + "learning_rate": 6.817375390347386e-07, + "loss": 0.115, + "step": 12140 + }, + { + "epoch": 4.186519565592139, + "grad_norm": 1.3213404812386826, + "learning_rate": 6.789261752013065e-07, + "loss": 0.0972, + "step": 12145 + }, + { + "epoch": 4.188243406309257, + "grad_norm": 0.9890280381696214, + "learning_rate": 6.761201978464388e-07, + "loss": 0.0991, + "step": 12150 + }, + { + "epoch": 4.189967247026375, + "grad_norm": 1.2558941440702447, + "learning_rate": 6.73319610467954e-07, + "loss": 0.1115, + "step": 12155 + }, + { + "epoch": 4.191691087743492, + "grad_norm": 1.1800450862754848, + "learning_rate": 6.705244165569547e-07, + "loss": 0.0952, + "step": 12160 + }, + { + "epoch": 4.19341492846061, + "grad_norm": 1.3170812073211093, + "learning_rate": 6.677346195978179e-07, + "loss": 0.1067, + "step": 12165 + }, + { + "epoch": 4.195138769177728, + "grad_norm": 1.4500544743842672, + "learning_rate": 6.649502230681915e-07, + "loss": 0.1093, + "step": 12170 + }, + { + "epoch": 4.196862609894846, + "grad_norm": 1.8897829410967788, + "learning_rate": 6.62171230438996e-07, + "loss": 0.1159, + "step": 12175 + }, + { + "epoch": 4.1985864506119634, + "grad_norm": 1.3445827852411167, + "learning_rate": 6.593976451744106e-07, + "loss": 0.1198, + "step": 12180 + }, + { + "epoch": 4.200310291329081, + "grad_norm": 1.28542876479375, + "learning_rate": 6.566294707318782e-07, + "loss": 0.1103, + "step": 12185 + }, + { + "epoch": 4.202034132046199, + "grad_norm": 1.3030784104291313, + "learning_rate": 6.538667105620932e-07, + "loss": 0.0949, + "step": 12190 + }, + { + "epoch": 4.2037579727633165, + "grad_norm": 1.419157885553054, + "learning_rate": 6.511093681090047e-07, + "loss": 0.1117, + "step": 12195 + }, + { + "epoch": 4.205481813480435, + "grad_norm": 1.350194958418878, + "learning_rate": 6.483574468098042e-07, + "loss": 0.1124, + "step": 12200 + }, + { + "epoch": 4.207205654197552, + "grad_norm": 1.2871976907166678, + "learning_rate": 6.456109500949265e-07, + "loss": 0.1222, + "step": 12205 + }, + { + "epoch": 4.20892949491467, + "grad_norm": 1.3613161194122034, + "learning_rate": 6.428698813880469e-07, + "loss": 0.102, + "step": 12210 + }, + { + "epoch": 4.210653335631788, + "grad_norm": 1.4164947956837435, + "learning_rate": 6.401342441060721e-07, + "loss": 0.0938, + "step": 12215 + }, + { + "epoch": 4.212377176348905, + "grad_norm": 1.5150162861272307, + "learning_rate": 6.374040416591371e-07, + "loss": 0.1169, + "step": 12220 + }, + { + "epoch": 4.214101017066023, + "grad_norm": 1.0162984955966008, + "learning_rate": 6.346792774506044e-07, + "loss": 0.0915, + "step": 12225 + }, + { + "epoch": 4.215824857783141, + "grad_norm": 1.3178507104745893, + "learning_rate": 6.319599548770578e-07, + "loss": 0.1067, + "step": 12230 + }, + { + "epoch": 4.217548698500258, + "grad_norm": 1.2553539257056758, + "learning_rate": 6.29246077328296e-07, + "loss": 0.0923, + "step": 12235 + }, + { + "epoch": 4.219272539217377, + "grad_norm": 2.3583820774370325, + "learning_rate": 6.265376481873287e-07, + "loss": 0.1122, + "step": 12240 + }, + { + "epoch": 4.220996379934494, + "grad_norm": 1.2221506546192613, + "learning_rate": 6.238346708303783e-07, + "loss": 0.108, + "step": 12245 + }, + { + "epoch": 4.2227202206516115, + "grad_norm": 1.3815144195748101, + "learning_rate": 6.211371486268686e-07, + "loss": 0.1485, + "step": 12250 + }, + { + "epoch": 4.22444406136873, + "grad_norm": 1.3803058732470492, + "learning_rate": 6.184450849394208e-07, + "loss": 0.0963, + "step": 12255 + }, + { + "epoch": 4.226167902085847, + "grad_norm": 1.4336304501411858, + "learning_rate": 6.157584831238572e-07, + "loss": 0.1014, + "step": 12260 + }, + { + "epoch": 4.227891742802965, + "grad_norm": 1.455715308871864, + "learning_rate": 6.130773465291867e-07, + "loss": 0.1076, + "step": 12265 + }, + { + "epoch": 4.229615583520083, + "grad_norm": 1.5972679775048577, + "learning_rate": 6.104016784976092e-07, + "loss": 0.1009, + "step": 12270 + }, + { + "epoch": 4.2313394242372, + "grad_norm": 1.396676664172261, + "learning_rate": 6.077314823645037e-07, + "loss": 0.1071, + "step": 12275 + }, + { + "epoch": 4.2330632649543185, + "grad_norm": 1.3346924550314543, + "learning_rate": 6.050667614584327e-07, + "loss": 0.1065, + "step": 12280 + }, + { + "epoch": 4.234787105671436, + "grad_norm": 1.1729357327241883, + "learning_rate": 6.024075191011297e-07, + "loss": 0.1016, + "step": 12285 + }, + { + "epoch": 4.236510946388553, + "grad_norm": 1.364809418073652, + "learning_rate": 5.997537586075003e-07, + "loss": 0.1077, + "step": 12290 + }, + { + "epoch": 4.238234787105672, + "grad_norm": 1.340325804932455, + "learning_rate": 5.971054832856177e-07, + "loss": 0.1092, + "step": 12295 + }, + { + "epoch": 4.239958627822789, + "grad_norm": 1.3306328836841919, + "learning_rate": 5.94462696436715e-07, + "loss": 0.1097, + "step": 12300 + }, + { + "epoch": 4.241682468539907, + "grad_norm": 1.5652138591990172, + "learning_rate": 5.918254013551867e-07, + "loss": 0.1051, + "step": 12305 + }, + { + "epoch": 4.243406309257025, + "grad_norm": 1.4301464183226353, + "learning_rate": 5.891936013285781e-07, + "loss": 0.0971, + "step": 12310 + }, + { + "epoch": 4.245130149974142, + "grad_norm": 1.4010698047294279, + "learning_rate": 5.865672996375882e-07, + "loss": 0.118, + "step": 12315 + }, + { + "epoch": 4.24685399069126, + "grad_norm": 1.4346661372391631, + "learning_rate": 5.83946499556059e-07, + "loss": 0.0995, + "step": 12320 + }, + { + "epoch": 4.248577831408378, + "grad_norm": 1.5022228254855505, + "learning_rate": 5.81331204350975e-07, + "loss": 0.1176, + "step": 12325 + }, + { + "epoch": 4.250301672125495, + "grad_norm": 1.2678766067832812, + "learning_rate": 5.787214172824606e-07, + "loss": 0.1221, + "step": 12330 + }, + { + "epoch": 4.252025512842613, + "grad_norm": 1.3410254869010216, + "learning_rate": 5.761171416037714e-07, + "loss": 0.0934, + "step": 12335 + }, + { + "epoch": 4.253749353559731, + "grad_norm": 1.5464218900432887, + "learning_rate": 5.735183805612931e-07, + "loss": 0.1002, + "step": 12340 + }, + { + "epoch": 4.255473194276849, + "grad_norm": 1.3829402583270551, + "learning_rate": 5.709251373945379e-07, + "loss": 0.1054, + "step": 12345 + }, + { + "epoch": 4.2571970349939665, + "grad_norm": 1.2054391203469281, + "learning_rate": 5.683374153361421e-07, + "loss": 0.1029, + "step": 12350 + }, + { + "epoch": 4.258920875711084, + "grad_norm": 1.2472665271613321, + "learning_rate": 5.657552176118542e-07, + "loss": 0.0894, + "step": 12355 + }, + { + "epoch": 4.260644716428202, + "grad_norm": 1.4089057223846146, + "learning_rate": 5.631785474405394e-07, + "loss": 0.1114, + "step": 12360 + }, + { + "epoch": 4.26236855714532, + "grad_norm": 1.569769553643092, + "learning_rate": 5.606074080341734e-07, + "loss": 0.1094, + "step": 12365 + }, + { + "epoch": 4.264092397862438, + "grad_norm": 1.3772431096303317, + "learning_rate": 5.580418025978351e-07, + "loss": 0.0976, + "step": 12370 + }, + { + "epoch": 4.265816238579555, + "grad_norm": 1.3766000638773417, + "learning_rate": 5.554817343297064e-07, + "loss": 0.0983, + "step": 12375 + }, + { + "epoch": 4.267540079296673, + "grad_norm": 1.4859420970994945, + "learning_rate": 5.529272064210655e-07, + "loss": 0.1153, + "step": 12380 + }, + { + "epoch": 4.269263920013791, + "grad_norm": 1.3403472666691587, + "learning_rate": 5.503782220562859e-07, + "loss": 0.1054, + "step": 12385 + }, + { + "epoch": 4.270987760730908, + "grad_norm": 1.4706091562778003, + "learning_rate": 5.478347844128317e-07, + "loss": 0.1047, + "step": 12390 + }, + { + "epoch": 4.272711601448027, + "grad_norm": 1.4923912229247072, + "learning_rate": 5.452968966612482e-07, + "loss": 0.1016, + "step": 12395 + }, + { + "epoch": 4.274435442165144, + "grad_norm": 1.3326109737245777, + "learning_rate": 5.427645619651673e-07, + "loss": 0.0998, + "step": 12400 + }, + { + "epoch": 4.276159282882261, + "grad_norm": 1.396060359742583, + "learning_rate": 5.402377834812961e-07, + "loss": 0.0987, + "step": 12405 + }, + { + "epoch": 4.27788312359938, + "grad_norm": 1.4662761612938902, + "learning_rate": 5.377165643594145e-07, + "loss": 0.0964, + "step": 12410 + }, + { + "epoch": 4.279606964316497, + "grad_norm": 1.1628681323041505, + "learning_rate": 5.352009077423759e-07, + "loss": 0.09, + "step": 12415 + }, + { + "epoch": 4.2813308050336145, + "grad_norm": 1.2393800355859474, + "learning_rate": 5.326908167660971e-07, + "loss": 0.0997, + "step": 12420 + }, + { + "epoch": 4.283054645750733, + "grad_norm": 1.266289804169401, + "learning_rate": 5.301862945595565e-07, + "loss": 0.1003, + "step": 12425 + }, + { + "epoch": 4.28477848646785, + "grad_norm": 1.4621785644531997, + "learning_rate": 5.276873442447922e-07, + "loss": 0.1014, + "step": 12430 + }, + { + "epoch": 4.2865023271849685, + "grad_norm": 1.3353129648864126, + "learning_rate": 5.251939689368973e-07, + "loss": 0.0907, + "step": 12435 + }, + { + "epoch": 4.288226167902086, + "grad_norm": 2.04250269508673, + "learning_rate": 5.227061717440141e-07, + "loss": 0.1077, + "step": 12440 + }, + { + "epoch": 4.289950008619203, + "grad_norm": 1.2548661722067267, + "learning_rate": 5.202239557673295e-07, + "loss": 0.0802, + "step": 12445 + }, + { + "epoch": 4.291673849336322, + "grad_norm": 1.1900181280239783, + "learning_rate": 5.177473241010772e-07, + "loss": 0.0911, + "step": 12450 + }, + { + "epoch": 4.293397690053439, + "grad_norm": 1.3571581299873972, + "learning_rate": 5.152762798325267e-07, + "loss": 0.0942, + "step": 12455 + }, + { + "epoch": 4.295121530770556, + "grad_norm": 1.3288448591516742, + "learning_rate": 5.128108260419828e-07, + "loss": 0.1135, + "step": 12460 + }, + { + "epoch": 4.296845371487675, + "grad_norm": 1.2233712060586364, + "learning_rate": 5.103509658027828e-07, + "loss": 0.1004, + "step": 12465 + }, + { + "epoch": 4.298569212204792, + "grad_norm": 1.2173291940674655, + "learning_rate": 5.078967021812914e-07, + "loss": 0.101, + "step": 12470 + }, + { + "epoch": 4.30029305292191, + "grad_norm": 1.3655430164910245, + "learning_rate": 5.054480382368948e-07, + "loss": 0.1106, + "step": 12475 + }, + { + "epoch": 4.302016893639028, + "grad_norm": 1.3490310044295044, + "learning_rate": 5.030049770219991e-07, + "loss": 0.093, + "step": 12480 + }, + { + "epoch": 4.303740734356145, + "grad_norm": 2.7658729263934987, + "learning_rate": 5.005675215820294e-07, + "loss": 0.1039, + "step": 12485 + }, + { + "epoch": 4.305464575073263, + "grad_norm": 1.5291197745764706, + "learning_rate": 4.981356749554189e-07, + "loss": 0.1019, + "step": 12490 + }, + { + "epoch": 4.307188415790381, + "grad_norm": 1.3587881192478735, + "learning_rate": 4.957094401736101e-07, + "loss": 0.1035, + "step": 12495 + }, + { + "epoch": 4.308912256507499, + "grad_norm": 1.417384199119948, + "learning_rate": 4.932888202610531e-07, + "loss": 0.1072, + "step": 12500 + }, + { + "epoch": 4.3106360972246165, + "grad_norm": 1.2442251101795154, + "learning_rate": 4.908738182351941e-07, + "loss": 0.101, + "step": 12505 + }, + { + "epoch": 4.312359937941734, + "grad_norm": 1.43864313135744, + "learning_rate": 4.884644371064801e-07, + "loss": 0.1061, + "step": 12510 + }, + { + "epoch": 4.314083778658852, + "grad_norm": 1.3610182737377219, + "learning_rate": 4.860606798783479e-07, + "loss": 0.105, + "step": 12515 + }, + { + "epoch": 4.31580761937597, + "grad_norm": 1.362514085323076, + "learning_rate": 4.836625495472274e-07, + "loss": 0.0865, + "step": 12520 + }, + { + "epoch": 4.317531460093088, + "grad_norm": 1.3079875797638718, + "learning_rate": 4.812700491025318e-07, + "loss": 0.0939, + "step": 12525 + }, + { + "epoch": 4.319255300810205, + "grad_norm": 1.4818211035752797, + "learning_rate": 4.788831815266554e-07, + "loss": 0.1241, + "step": 12530 + }, + { + "epoch": 4.320979141527323, + "grad_norm": 1.3672898407379324, + "learning_rate": 4.7650194979497466e-07, + "loss": 0.1004, + "step": 12535 + }, + { + "epoch": 4.322702982244441, + "grad_norm": 1.4318534960326164, + "learning_rate": 4.74126356875837e-07, + "loss": 0.1111, + "step": 12540 + }, + { + "epoch": 4.324426822961558, + "grad_norm": 1.370126841495434, + "learning_rate": 4.717564057305607e-07, + "loss": 0.114, + "step": 12545 + }, + { + "epoch": 4.326150663678676, + "grad_norm": 1.5715046699340225, + "learning_rate": 4.693920993134343e-07, + "loss": 0.1066, + "step": 12550 + }, + { + "epoch": 4.327874504395794, + "grad_norm": 1.3465254964642521, + "learning_rate": 4.6703344057170807e-07, + "loss": 0.0941, + "step": 12555 + }, + { + "epoch": 4.329598345112911, + "grad_norm": 1.486207141933927, + "learning_rate": 4.6468043244559167e-07, + "loss": 0.1065, + "step": 12560 + }, + { + "epoch": 4.33132218583003, + "grad_norm": 1.4194290234443416, + "learning_rate": 4.6233307786825e-07, + "loss": 0.1035, + "step": 12565 + }, + { + "epoch": 4.333046026547147, + "grad_norm": 2.1555815012813704, + "learning_rate": 4.5999137976580456e-07, + "loss": 0.1046, + "step": 12570 + }, + { + "epoch": 4.3347698672642645, + "grad_norm": 1.4582126311186856, + "learning_rate": 4.576553410573209e-07, + "loss": 0.1065, + "step": 12575 + }, + { + "epoch": 4.336493707981383, + "grad_norm": 1.284759161569176, + "learning_rate": 4.5532496465481246e-07, + "loss": 0.1043, + "step": 12580 + }, + { + "epoch": 4.3382175486985, + "grad_norm": 1.3575841090237788, + "learning_rate": 4.5300025346323217e-07, + "loss": 0.1074, + "step": 12585 + }, + { + "epoch": 4.339941389415618, + "grad_norm": 1.2729313115551153, + "learning_rate": 4.506812103804742e-07, + "loss": 0.1014, + "step": 12590 + }, + { + "epoch": 4.341665230132736, + "grad_norm": 1.3921280110786385, + "learning_rate": 4.483678382973661e-07, + "loss": 0.1049, + "step": 12595 + }, + { + "epoch": 4.343389070849853, + "grad_norm": 1.4120792014341572, + "learning_rate": 4.460601400976633e-07, + "loss": 0.1082, + "step": 12600 + }, + { + "epoch": 4.3451129115669715, + "grad_norm": 1.2266977720156238, + "learning_rate": 4.4375811865805196e-07, + "loss": 0.087, + "step": 12605 + }, + { + "epoch": 4.346836752284089, + "grad_norm": 1.221441799906539, + "learning_rate": 4.4146177684813993e-07, + "loss": 0.102, + "step": 12610 + }, + { + "epoch": 4.348560593001206, + "grad_norm": 1.4977195702203998, + "learning_rate": 4.391711175304542e-07, + "loss": 0.0958, + "step": 12615 + }, + { + "epoch": 4.350284433718325, + "grad_norm": 1.3647702918679094, + "learning_rate": 4.3688614356044155e-07, + "loss": 0.1095, + "step": 12620 + }, + { + "epoch": 4.352008274435442, + "grad_norm": 1.2978731041023601, + "learning_rate": 4.3460685778645874e-07, + "loss": 0.0941, + "step": 12625 + }, + { + "epoch": 4.35373211515256, + "grad_norm": 1.3074376377403427, + "learning_rate": 4.3233326304977175e-07, + "loss": 0.083, + "step": 12630 + }, + { + "epoch": 4.355455955869678, + "grad_norm": 1.3636889328659345, + "learning_rate": 4.3006536218455355e-07, + "loss": 0.0957, + "step": 12635 + }, + { + "epoch": 4.357179796586795, + "grad_norm": 1.4090294409315518, + "learning_rate": 4.278031580178804e-07, + "loss": 0.0958, + "step": 12640 + }, + { + "epoch": 4.358903637303913, + "grad_norm": 1.2745871352097167, + "learning_rate": 4.2554665336972557e-07, + "loss": 0.0992, + "step": 12645 + }, + { + "epoch": 4.360627478021031, + "grad_norm": 1.3487303505526345, + "learning_rate": 4.232958510529561e-07, + "loss": 0.104, + "step": 12650 + }, + { + "epoch": 4.362351318738148, + "grad_norm": 1.057505664160843, + "learning_rate": 4.210507538733344e-07, + "loss": 0.1073, + "step": 12655 + }, + { + "epoch": 4.3640751594552665, + "grad_norm": 1.355665425463141, + "learning_rate": 4.188113646295089e-07, + "loss": 0.1096, + "step": 12660 + }, + { + "epoch": 4.365799000172384, + "grad_norm": 1.2972887628731364, + "learning_rate": 4.165776861130116e-07, + "loss": 0.0861, + "step": 12665 + }, + { + "epoch": 4.367522840889502, + "grad_norm": 1.3587682846022855, + "learning_rate": 4.1434972110825864e-07, + "loss": 0.0972, + "step": 12670 + }, + { + "epoch": 4.3692466816066196, + "grad_norm": 1.4116821509583934, + "learning_rate": 4.121274723925428e-07, + "loss": 0.0954, + "step": 12675 + }, + { + "epoch": 4.370970522323737, + "grad_norm": 1.4358790735628058, + "learning_rate": 4.0991094273603036e-07, + "loss": 0.0955, + "step": 12680 + }, + { + "epoch": 4.372694363040855, + "grad_norm": 1.7436771542495768, + "learning_rate": 4.077001349017579e-07, + "loss": 0.1041, + "step": 12685 + }, + { + "epoch": 4.374418203757973, + "grad_norm": 1.5809645807030175, + "learning_rate": 4.054950516456324e-07, + "loss": 0.1054, + "step": 12690 + }, + { + "epoch": 4.37614204447509, + "grad_norm": 1.6113582839177205, + "learning_rate": 4.0329569571642133e-07, + "loss": 0.1119, + "step": 12695 + }, + { + "epoch": 4.377865885192208, + "grad_norm": 1.3810798928916548, + "learning_rate": 4.0110206985575495e-07, + "loss": 0.1015, + "step": 12700 + }, + { + "epoch": 4.379589725909326, + "grad_norm": 1.1764847209935048, + "learning_rate": 3.989141767981186e-07, + "loss": 0.1045, + "step": 12705 + }, + { + "epoch": 4.381313566626444, + "grad_norm": 1.3729825902566304, + "learning_rate": 3.967320192708535e-07, + "loss": 0.1107, + "step": 12710 + }, + { + "epoch": 4.383037407343561, + "grad_norm": 1.4615264832490804, + "learning_rate": 3.945555999941514e-07, + "loss": 0.0888, + "step": 12715 + }, + { + "epoch": 4.384761248060679, + "grad_norm": 1.4273211790569875, + "learning_rate": 3.9238492168104825e-07, + "loss": 0.1132, + "step": 12720 + }, + { + "epoch": 4.386485088777797, + "grad_norm": 1.3636642689942804, + "learning_rate": 3.902199870374268e-07, + "loss": 0.1168, + "step": 12725 + }, + { + "epoch": 4.3882089294949145, + "grad_norm": 1.3843737512650844, + "learning_rate": 3.880607987620072e-07, + "loss": 0.0931, + "step": 12730 + }, + { + "epoch": 4.389932770212033, + "grad_norm": 1.4950554554819624, + "learning_rate": 3.8590735954634694e-07, + "loss": 0.0935, + "step": 12735 + }, + { + "epoch": 4.39165661092915, + "grad_norm": 1.4774430566556316, + "learning_rate": 3.837596720748399e-07, + "loss": 0.0926, + "step": 12740 + }, + { + "epoch": 4.393380451646268, + "grad_norm": 1.3825430194966124, + "learning_rate": 3.816177390247061e-07, + "loss": 0.0957, + "step": 12745 + }, + { + "epoch": 4.395104292363386, + "grad_norm": 1.3038503938953758, + "learning_rate": 3.794815630659937e-07, + "loss": 0.0958, + "step": 12750 + }, + { + "epoch": 4.396828133080503, + "grad_norm": 1.3365803641344325, + "learning_rate": 3.773511468615748e-07, + "loss": 0.1042, + "step": 12755 + }, + { + "epoch": 4.3985519737976215, + "grad_norm": 1.2221751365503517, + "learning_rate": 3.7522649306714233e-07, + "loss": 0.102, + "step": 12760 + }, + { + "epoch": 4.400275814514739, + "grad_norm": 1.4462721087322152, + "learning_rate": 3.731076043312054e-07, + "loss": 0.1001, + "step": 12765 + }, + { + "epoch": 4.401999655231856, + "grad_norm": 1.2748365997202482, + "learning_rate": 3.70994483295084e-07, + "loss": 0.0936, + "step": 12770 + }, + { + "epoch": 4.403723495948975, + "grad_norm": 1.7392654589861656, + "learning_rate": 3.688871325929128e-07, + "loss": 0.1098, + "step": 12775 + }, + { + "epoch": 4.405447336666092, + "grad_norm": 1.4703587444432689, + "learning_rate": 3.6678555485163137e-07, + "loss": 0.0943, + "step": 12780 + }, + { + "epoch": 4.407171177383209, + "grad_norm": 1.342522738440642, + "learning_rate": 3.646897526909815e-07, + "loss": 0.1063, + "step": 12785 + }, + { + "epoch": 4.408895018100328, + "grad_norm": 1.3022584845829768, + "learning_rate": 3.625997287235067e-07, + "loss": 0.1087, + "step": 12790 + }, + { + "epoch": 4.410618858817445, + "grad_norm": 1.206791508217318, + "learning_rate": 3.6051548555454785e-07, + "loss": 0.0956, + "step": 12795 + }, + { + "epoch": 4.412342699534563, + "grad_norm": 1.1279835274287382, + "learning_rate": 3.5843702578224115e-07, + "loss": 0.1004, + "step": 12800 + }, + { + "epoch": 4.414066540251681, + "grad_norm": 1.561160217448479, + "learning_rate": 3.563643519975091e-07, + "loss": 0.0871, + "step": 12805 + }, + { + "epoch": 4.415790380968798, + "grad_norm": 1.4484456113372957, + "learning_rate": 3.5429746678406707e-07, + "loss": 0.0924, + "step": 12810 + }, + { + "epoch": 4.4175142216859165, + "grad_norm": 1.22779791194021, + "learning_rate": 3.5223637271841026e-07, + "loss": 0.0952, + "step": 12815 + }, + { + "epoch": 4.419238062403034, + "grad_norm": 1.1526436057893366, + "learning_rate": 3.501810723698168e-07, + "loss": 0.0894, + "step": 12820 + }, + { + "epoch": 4.420961903120151, + "grad_norm": 1.082238673798404, + "learning_rate": 3.481315683003411e-07, + "loss": 0.0788, + "step": 12825 + }, + { + "epoch": 4.4226857438372695, + "grad_norm": 1.6270339011649144, + "learning_rate": 3.460878630648157e-07, + "loss": 0.1149, + "step": 12830 + }, + { + "epoch": 4.424409584554387, + "grad_norm": 1.437972544443947, + "learning_rate": 3.440499592108393e-07, + "loss": 0.0859, + "step": 12835 + }, + { + "epoch": 4.426133425271505, + "grad_norm": 1.4124833409613646, + "learning_rate": 3.4201785927878375e-07, + "loss": 0.1151, + "step": 12840 + }, + { + "epoch": 4.427857265988623, + "grad_norm": 1.348271869233934, + "learning_rate": 3.3999156580178384e-07, + "loss": 0.0875, + "step": 12845 + }, + { + "epoch": 4.42958110670574, + "grad_norm": 1.3166518724443603, + "learning_rate": 3.379710813057363e-07, + "loss": 0.1094, + "step": 12850 + }, + { + "epoch": 4.431304947422858, + "grad_norm": 1.3729400568280912, + "learning_rate": 3.3595640830929534e-07, + "loss": 0.1004, + "step": 12855 + }, + { + "epoch": 4.433028788139976, + "grad_norm": 1.526974997179749, + "learning_rate": 3.3394754932387363e-07, + "loss": 0.1012, + "step": 12860 + }, + { + "epoch": 4.434752628857094, + "grad_norm": 1.3652631357819194, + "learning_rate": 3.3194450685363364e-07, + "loss": 0.108, + "step": 12865 + }, + { + "epoch": 4.436476469574211, + "grad_norm": 1.4890953606733544, + "learning_rate": 3.2994728339548863e-07, + "loss": 0.0805, + "step": 12870 + }, + { + "epoch": 4.438200310291329, + "grad_norm": 1.2502987339362712, + "learning_rate": 3.279558814390982e-07, + "loss": 0.091, + "step": 12875 + }, + { + "epoch": 4.439924151008447, + "grad_norm": 1.3490773986660833, + "learning_rate": 3.2597030346686544e-07, + "loss": 0.1004, + "step": 12880 + }, + { + "epoch": 4.4416479917255645, + "grad_norm": 1.567697673737467, + "learning_rate": 3.239905519539316e-07, + "loss": 0.1107, + "step": 12885 + }, + { + "epoch": 4.443371832442683, + "grad_norm": 1.0639027461941597, + "learning_rate": 3.2201662936817533e-07, + "loss": 0.0951, + "step": 12890 + }, + { + "epoch": 4.4450956731598, + "grad_norm": 1.2286238065647186, + "learning_rate": 3.2004853817021233e-07, + "loss": 0.102, + "step": 12895 + }, + { + "epoch": 4.4468195138769175, + "grad_norm": 1.3921497125598463, + "learning_rate": 3.18086280813385e-07, + "loss": 0.1, + "step": 12900 + }, + { + "epoch": 4.448543354594036, + "grad_norm": 1.2175773126722704, + "learning_rate": 3.1612985974376563e-07, + "loss": 0.0847, + "step": 12905 + }, + { + "epoch": 4.450267195311153, + "grad_norm": 1.285169869770806, + "learning_rate": 3.1417927740015064e-07, + "loss": 0.103, + "step": 12910 + }, + { + "epoch": 4.451991036028271, + "grad_norm": 1.463906692471606, + "learning_rate": 3.1223453621405775e-07, + "loss": 0.1014, + "step": 12915 + }, + { + "epoch": 4.453714876745389, + "grad_norm": 1.2869230792060926, + "learning_rate": 3.102956386097256e-07, + "loss": 0.0885, + "step": 12920 + }, + { + "epoch": 4.455438717462506, + "grad_norm": 1.5530548797185606, + "learning_rate": 3.0836258700410515e-07, + "loss": 0.0848, + "step": 12925 + }, + { + "epoch": 4.457162558179625, + "grad_norm": 1.2862694034718802, + "learning_rate": 3.064353838068629e-07, + "loss": 0.1033, + "step": 12930 + }, + { + "epoch": 4.458886398896742, + "grad_norm": 1.4721761441567474, + "learning_rate": 3.0451403142037263e-07, + "loss": 0.0851, + "step": 12935 + }, + { + "epoch": 4.460610239613859, + "grad_norm": 1.4669459599969508, + "learning_rate": 3.0259853223971513e-07, + "loss": 0.1067, + "step": 12940 + }, + { + "epoch": 4.462334080330978, + "grad_norm": 1.37748309479494, + "learning_rate": 3.0068888865267707e-07, + "loss": 0.0942, + "step": 12945 + }, + { + "epoch": 4.464057921048095, + "grad_norm": 1.6375777308007735, + "learning_rate": 2.9878510303974375e-07, + "loss": 0.1137, + "step": 12950 + }, + { + "epoch": 4.4657817617652125, + "grad_norm": 1.1204002889094036, + "learning_rate": 2.968871777740967e-07, + "loss": 0.0971, + "step": 12955 + }, + { + "epoch": 4.467505602482331, + "grad_norm": 1.0897762511129812, + "learning_rate": 2.9499511522161516e-07, + "loss": 0.0856, + "step": 12960 + }, + { + "epoch": 4.469229443199448, + "grad_norm": 1.2686325796294473, + "learning_rate": 2.9310891774087023e-07, + "loss": 0.0963, + "step": 12965 + }, + { + "epoch": 4.470953283916566, + "grad_norm": 1.3901134913359297, + "learning_rate": 2.912285876831195e-07, + "loss": 0.111, + "step": 12970 + }, + { + "epoch": 4.472677124633684, + "grad_norm": 1.13042242981474, + "learning_rate": 2.893541273923067e-07, + "loss": 0.1015, + "step": 12975 + }, + { + "epoch": 4.474400965350801, + "grad_norm": 1.5101556464880321, + "learning_rate": 2.874855392050607e-07, + "loss": 0.1179, + "step": 12980 + }, + { + "epoch": 4.4761248060679195, + "grad_norm": 1.5865788619169257, + "learning_rate": 2.856228254506888e-07, + "loss": 0.1214, + "step": 12985 + }, + { + "epoch": 4.477848646785037, + "grad_norm": 1.321921555506973, + "learning_rate": 2.8376598845117566e-07, + "loss": 0.0976, + "step": 12990 + }, + { + "epoch": 4.479572487502155, + "grad_norm": 1.3855200764691942, + "learning_rate": 2.819150305211793e-07, + "loss": 0.105, + "step": 12995 + }, + { + "epoch": 4.481296328219273, + "grad_norm": 1.5457327572351867, + "learning_rate": 2.8006995396803127e-07, + "loss": 0.113, + "step": 13000 + }, + { + "epoch": 4.48302016893639, + "grad_norm": 0.8593743094516547, + "learning_rate": 2.782307610917312e-07, + "loss": 0.0854, + "step": 13005 + }, + { + "epoch": 4.484744009653508, + "grad_norm": 1.3952963557799567, + "learning_rate": 2.7639745418494233e-07, + "loss": 0.0969, + "step": 13010 + }, + { + "epoch": 4.486467850370626, + "grad_norm": 1.3731902241157263, + "learning_rate": 2.7457003553299275e-07, + "loss": 0.0804, + "step": 13015 + }, + { + "epoch": 4.488191691087744, + "grad_norm": 1.3022314660354217, + "learning_rate": 2.727485074138703e-07, + "loss": 0.1048, + "step": 13020 + }, + { + "epoch": 4.489915531804861, + "grad_norm": 1.227026767751595, + "learning_rate": 2.709328720982185e-07, + "loss": 0.1066, + "step": 13025 + }, + { + "epoch": 4.491639372521979, + "grad_norm": 1.1417604541849078, + "learning_rate": 2.691231318493354e-07, + "loss": 0.0859, + "step": 13030 + }, + { + "epoch": 4.493363213239097, + "grad_norm": 1.4900371489784046, + "learning_rate": 2.6731928892317295e-07, + "loss": 0.1037, + "step": 13035 + }, + { + "epoch": 4.495087053956214, + "grad_norm": 1.2151600207334885, + "learning_rate": 2.6552134556832863e-07, + "loss": 0.0986, + "step": 13040 + }, + { + "epoch": 4.496810894673332, + "grad_norm": 1.8363902524651001, + "learning_rate": 2.637293040260469e-07, + "loss": 0.0923, + "step": 13045 + }, + { + "epoch": 4.49853473539045, + "grad_norm": 1.4983308920248348, + "learning_rate": 2.6194316653021634e-07, + "loss": 0.1064, + "step": 13050 + }, + { + "epoch": 4.5002585761075675, + "grad_norm": 1.2872110404664545, + "learning_rate": 2.6016293530736483e-07, + "loss": 0.0919, + "step": 13055 + }, + { + "epoch": 4.501982416824685, + "grad_norm": 1.5124529378726344, + "learning_rate": 2.583886125766566e-07, + "loss": 0.1209, + "step": 13060 + }, + { + "epoch": 4.503706257541803, + "grad_norm": 1.287728451640388, + "learning_rate": 2.56620200549893e-07, + "loss": 0.0908, + "step": 13065 + }, + { + "epoch": 4.505430098258921, + "grad_norm": 1.4652171994556562, + "learning_rate": 2.548577014315051e-07, + "loss": 0.0917, + "step": 13070 + }, + { + "epoch": 4.507153938976039, + "grad_norm": 1.7160599080486028, + "learning_rate": 2.531011174185544e-07, + "loss": 0.0944, + "step": 13075 + }, + { + "epoch": 4.508877779693156, + "grad_norm": 1.738633885349048, + "learning_rate": 2.5135045070072805e-07, + "loss": 0.1083, + "step": 13080 + }, + { + "epoch": 4.510601620410274, + "grad_norm": 1.5220304294741687, + "learning_rate": 2.4960570346033885e-07, + "loss": 0.0982, + "step": 13085 + }, + { + "epoch": 4.512325461127392, + "grad_norm": 1.386549851427352, + "learning_rate": 2.478668778723181e-07, + "loss": 0.1111, + "step": 13090 + }, + { + "epoch": 4.514049301844509, + "grad_norm": 1.366751422364679, + "learning_rate": 2.4613397610421694e-07, + "loss": 0.0901, + "step": 13095 + }, + { + "epoch": 4.515773142561628, + "grad_norm": 1.2455467002386247, + "learning_rate": 2.444070003162019e-07, + "loss": 0.0997, + "step": 13100 + }, + { + "epoch": 4.517496983278745, + "grad_norm": 1.4699538249465196, + "learning_rate": 2.4268595266105145e-07, + "loss": 0.1123, + "step": 13105 + }, + { + "epoch": 4.5192208239958624, + "grad_norm": 1.1079905013317122, + "learning_rate": 2.409708352841561e-07, + "loss": 0.0929, + "step": 13110 + }, + { + "epoch": 4.520944664712981, + "grad_norm": 1.3514740009692399, + "learning_rate": 2.392616503235118e-07, + "loss": 0.1114, + "step": 13115 + }, + { + "epoch": 4.522668505430098, + "grad_norm": 1.2706570686597356, + "learning_rate": 2.3755839990972086e-07, + "loss": 0.102, + "step": 13120 + }, + { + "epoch": 4.524392346147216, + "grad_norm": 1.2544292470805234, + "learning_rate": 2.3586108616598825e-07, + "loss": 0.1045, + "step": 13125 + }, + { + "epoch": 4.526116186864334, + "grad_norm": 1.4872975760870342, + "learning_rate": 2.3416971120811594e-07, + "loss": 0.1217, + "step": 13130 + }, + { + "epoch": 4.527840027581451, + "grad_norm": 1.454057940553262, + "learning_rate": 2.3248427714450684e-07, + "loss": 0.087, + "step": 13135 + }, + { + "epoch": 4.5295638682985695, + "grad_norm": 1.563757832532968, + "learning_rate": 2.3080478607615475e-07, + "loss": 0.119, + "step": 13140 + }, + { + "epoch": 4.531287709015687, + "grad_norm": 1.374597331759327, + "learning_rate": 2.291312400966461e-07, + "loss": 0.0905, + "step": 13145 + }, + { + "epoch": 4.533011549732805, + "grad_norm": 1.4757684836095026, + "learning_rate": 2.274636412921566e-07, + "loss": 0.1004, + "step": 13150 + }, + { + "epoch": 4.534735390449923, + "grad_norm": 1.5931262829105004, + "learning_rate": 2.2580199174144946e-07, + "loss": 0.0983, + "step": 13155 + }, + { + "epoch": 4.53645923116704, + "grad_norm": 1.426226613506302, + "learning_rate": 2.2414629351586946e-07, + "loss": 0.0787, + "step": 13160 + }, + { + "epoch": 4.538183071884158, + "grad_norm": 1.4154589419608299, + "learning_rate": 2.224965486793451e-07, + "loss": 0.0979, + "step": 13165 + }, + { + "epoch": 4.539906912601276, + "grad_norm": 1.3557580241854177, + "learning_rate": 2.2085275928838245e-07, + "loss": 0.1036, + "step": 13170 + }, + { + "epoch": 4.541630753318393, + "grad_norm": 1.492145085562156, + "learning_rate": 2.1921492739206463e-07, + "loss": 0.0893, + "step": 13175 + }, + { + "epoch": 4.543354594035511, + "grad_norm": 1.2949241272414553, + "learning_rate": 2.1758305503204568e-07, + "loss": 0.0967, + "step": 13180 + }, + { + "epoch": 4.545078434752629, + "grad_norm": 1.2385526324923133, + "learning_rate": 2.1595714424255453e-07, + "loss": 0.0954, + "step": 13185 + }, + { + "epoch": 4.546802275469746, + "grad_norm": 1.690001400330076, + "learning_rate": 2.1433719705038602e-07, + "loss": 0.1107, + "step": 13190 + }, + { + "epoch": 4.548526116186864, + "grad_norm": 1.2457330913666358, + "learning_rate": 2.127232154749026e-07, + "loss": 0.0998, + "step": 13195 + }, + { + "epoch": 4.550249956903982, + "grad_norm": 1.3930447286730825, + "learning_rate": 2.1111520152802767e-07, + "loss": 0.085, + "step": 13200 + }, + { + "epoch": 4.5519737976211, + "grad_norm": 1.0526105841258329, + "learning_rate": 2.0951315721424893e-07, + "loss": 0.1005, + "step": 13205 + }, + { + "epoch": 4.5536976383382175, + "grad_norm": 1.5401083244814908, + "learning_rate": 2.0791708453061054e-07, + "loss": 0.1141, + "step": 13210 + }, + { + "epoch": 4.555421479055335, + "grad_norm": 1.3508795386173233, + "learning_rate": 2.0632698546671327e-07, + "loss": 0.0975, + "step": 13215 + }, + { + "epoch": 4.557145319772453, + "grad_norm": 1.4921763974196698, + "learning_rate": 2.0474286200471149e-07, + "loss": 0.1068, + "step": 13220 + }, + { + "epoch": 4.558869160489571, + "grad_norm": 1.4736816750777688, + "learning_rate": 2.0316471611931066e-07, + "loss": 0.1236, + "step": 13225 + }, + { + "epoch": 4.560593001206689, + "grad_norm": 1.6234184304688664, + "learning_rate": 2.0159254977776376e-07, + "loss": 0.0953, + "step": 13230 + }, + { + "epoch": 4.562316841923806, + "grad_norm": 1.3390965225224296, + "learning_rate": 2.0002636493987037e-07, + "loss": 0.0923, + "step": 13235 + }, + { + "epoch": 4.564040682640924, + "grad_norm": 1.2978219433611065, + "learning_rate": 1.98466163557976e-07, + "loss": 0.0985, + "step": 13240 + }, + { + "epoch": 4.565764523358042, + "grad_norm": 1.7456280452183062, + "learning_rate": 1.969119475769632e-07, + "loss": 0.094, + "step": 13245 + }, + { + "epoch": 4.567488364075159, + "grad_norm": 1.4884220943358613, + "learning_rate": 1.9536371893425776e-07, + "loss": 0.1006, + "step": 13250 + }, + { + "epoch": 4.569212204792278, + "grad_norm": 1.3322884018508931, + "learning_rate": 1.9382147955981923e-07, + "loss": 0.0946, + "step": 13255 + }, + { + "epoch": 4.570936045509395, + "grad_norm": 1.3238506807068622, + "learning_rate": 1.922852313761414e-07, + "loss": 0.1093, + "step": 13260 + }, + { + "epoch": 4.572659886226512, + "grad_norm": 1.2103606889951604, + "learning_rate": 1.907549762982508e-07, + "loss": 0.0954, + "step": 13265 + }, + { + "epoch": 4.574383726943631, + "grad_norm": 1.5514619596585275, + "learning_rate": 1.8923071623370093e-07, + "loss": 0.0957, + "step": 13270 + }, + { + "epoch": 4.576107567660748, + "grad_norm": 1.319009896706714, + "learning_rate": 1.877124530825758e-07, + "loss": 0.0891, + "step": 13275 + }, + { + "epoch": 4.5778314083778655, + "grad_norm": 1.12910226251704, + "learning_rate": 1.862001887374798e-07, + "loss": 0.0917, + "step": 13280 + }, + { + "epoch": 4.579555249094984, + "grad_norm": 1.2729579102654363, + "learning_rate": 1.8469392508354277e-07, + "loss": 0.1078, + "step": 13285 + }, + { + "epoch": 4.581279089812101, + "grad_norm": 1.3691912041824974, + "learning_rate": 1.8319366399841331e-07, + "loss": 0.1023, + "step": 13290 + }, + { + "epoch": 4.5830029305292195, + "grad_norm": 1.4430583312306229, + "learning_rate": 1.81699407352256e-07, + "loss": 0.1051, + "step": 13295 + }, + { + "epoch": 4.584726771246337, + "grad_norm": 1.4521508718047706, + "learning_rate": 1.8021115700775193e-07, + "loss": 0.104, + "step": 13300 + }, + { + "epoch": 4.586450611963454, + "grad_norm": 1.4938106721352669, + "learning_rate": 1.7872891482009546e-07, + "loss": 0.0914, + "step": 13305 + }, + { + "epoch": 4.588174452680573, + "grad_norm": 1.2759287382605968, + "learning_rate": 1.772526826369897e-07, + "loss": 0.0878, + "step": 13310 + }, + { + "epoch": 4.58989829339769, + "grad_norm": 1.2838268078011246, + "learning_rate": 1.7578246229864816e-07, + "loss": 0.0973, + "step": 13315 + }, + { + "epoch": 4.591622134114807, + "grad_norm": 1.4327722570452122, + "learning_rate": 1.7431825563778705e-07, + "loss": 0.0902, + "step": 13320 + }, + { + "epoch": 4.593345974831926, + "grad_norm": 1.603054932211892, + "learning_rate": 1.7286006447962912e-07, + "loss": 0.1157, + "step": 13325 + }, + { + "epoch": 4.595069815549043, + "grad_norm": 1.5177612887963574, + "learning_rate": 1.714078906418981e-07, + "loss": 0.0929, + "step": 13330 + }, + { + "epoch": 4.596793656266161, + "grad_norm": 1.6769181249663976, + "learning_rate": 1.6996173593481546e-07, + "loss": 0.1066, + "step": 13335 + }, + { + "epoch": 4.598517496983279, + "grad_norm": 1.1958153335576733, + "learning_rate": 1.6852160216110026e-07, + "loss": 0.0943, + "step": 13340 + }, + { + "epoch": 4.600241337700396, + "grad_norm": 1.3155997170309006, + "learning_rate": 1.6708749111596535e-07, + "loss": 0.093, + "step": 13345 + }, + { + "epoch": 4.601965178417514, + "grad_norm": 1.4886335786550116, + "learning_rate": 1.656594045871174e-07, + "loss": 0.104, + "step": 13350 + }, + { + "epoch": 4.603689019134632, + "grad_norm": 1.2695988845935569, + "learning_rate": 1.642373443547507e-07, + "loss": 0.0971, + "step": 13355 + }, + { + "epoch": 4.60541285985175, + "grad_norm": 1.5233611092597934, + "learning_rate": 1.6282131219155062e-07, + "loss": 0.0902, + "step": 13360 + }, + { + "epoch": 4.6071367005688675, + "grad_norm": 1.5912394263506853, + "learning_rate": 1.6141130986268516e-07, + "loss": 0.1179, + "step": 13365 + }, + { + "epoch": 4.608860541285985, + "grad_norm": 1.1955424578211602, + "learning_rate": 1.600073391258078e-07, + "loss": 0.0924, + "step": 13370 + }, + { + "epoch": 4.610584382003103, + "grad_norm": 1.5460175407099928, + "learning_rate": 1.5860940173105244e-07, + "loss": 0.1013, + "step": 13375 + }, + { + "epoch": 4.612308222720221, + "grad_norm": 1.2221212450599088, + "learning_rate": 1.5721749942103237e-07, + "loss": 0.0811, + "step": 13380 + }, + { + "epoch": 4.614032063437339, + "grad_norm": 1.2464206154528004, + "learning_rate": 1.5583163393083689e-07, + "loss": 0.0982, + "step": 13385 + }, + { + "epoch": 4.615755904154456, + "grad_norm": 1.4499475052958257, + "learning_rate": 1.544518069880313e-07, + "loss": 0.0837, + "step": 13390 + }, + { + "epoch": 4.617479744871574, + "grad_norm": 1.226797858106584, + "learning_rate": 1.5307802031265305e-07, + "loss": 0.0927, + "step": 13395 + }, + { + "epoch": 4.619203585588692, + "grad_norm": 1.4748541756460058, + "learning_rate": 1.5171027561720953e-07, + "loss": 0.1069, + "step": 13400 + }, + { + "epoch": 4.620927426305809, + "grad_norm": 1.3988443218251998, + "learning_rate": 1.503485746066763e-07, + "loss": 0.1, + "step": 13405 + }, + { + "epoch": 4.622651267022927, + "grad_norm": 1.2656574957575772, + "learning_rate": 1.4899291897849665e-07, + "loss": 0.1054, + "step": 13410 + }, + { + "epoch": 4.624375107740045, + "grad_norm": 1.449296239770174, + "learning_rate": 1.4764331042257662e-07, + "loss": 0.0973, + "step": 13415 + }, + { + "epoch": 4.626098948457162, + "grad_norm": 1.1166203335379397, + "learning_rate": 1.4629975062128432e-07, + "loss": 0.0951, + "step": 13420 + }, + { + "epoch": 4.62782278917428, + "grad_norm": 1.296216895911148, + "learning_rate": 1.449622412494478e-07, + "loss": 0.0983, + "step": 13425 + }, + { + "epoch": 4.629546629891398, + "grad_norm": 1.7368534817398664, + "learning_rate": 1.4363078397435336e-07, + "loss": 0.1077, + "step": 13430 + }, + { + "epoch": 4.6312704706085155, + "grad_norm": 1.1978076419655144, + "learning_rate": 1.4230538045574283e-07, + "loss": 0.0942, + "step": 13435 + }, + { + "epoch": 4.632994311325634, + "grad_norm": 1.243838839475416, + "learning_rate": 1.409860323458101e-07, + "loss": 0.0785, + "step": 13440 + }, + { + "epoch": 4.634718152042751, + "grad_norm": 1.3067558831150292, + "learning_rate": 1.396727412892035e-07, + "loss": 0.0794, + "step": 13445 + }, + { + "epoch": 4.636441992759869, + "grad_norm": 1.4041963387833243, + "learning_rate": 1.3836550892301792e-07, + "loss": 0.1103, + "step": 13450 + }, + { + "epoch": 4.638165833476987, + "grad_norm": 1.2034301664062652, + "learning_rate": 1.370643368767982e-07, + "loss": 0.0922, + "step": 13455 + }, + { + "epoch": 4.639889674194104, + "grad_norm": 1.5013055926060335, + "learning_rate": 1.3576922677253413e-07, + "loss": 0.1065, + "step": 13460 + }, + { + "epoch": 4.6416135149112225, + "grad_norm": 1.1145947340545772, + "learning_rate": 1.3448018022465758e-07, + "loss": 0.0966, + "step": 13465 + }, + { + "epoch": 4.64333735562834, + "grad_norm": 1.2193378482255657, + "learning_rate": 1.3319719884004268e-07, + "loss": 0.0977, + "step": 13470 + }, + { + "epoch": 4.645061196345457, + "grad_norm": 1.2820035565815853, + "learning_rate": 1.3192028421800286e-07, + "loss": 0.0826, + "step": 13475 + }, + { + "epoch": 4.646785037062576, + "grad_norm": 1.1676360477766345, + "learning_rate": 1.3064943795028927e-07, + "loss": 0.0865, + "step": 13480 + }, + { + "epoch": 4.648508877779693, + "grad_norm": 1.268140172438007, + "learning_rate": 1.2938466162108755e-07, + "loss": 0.0961, + "step": 13485 + }, + { + "epoch": 4.650232718496811, + "grad_norm": 1.2316793005853521, + "learning_rate": 1.2812595680701868e-07, + "loss": 0.0899, + "step": 13490 + }, + { + "epoch": 4.651956559213929, + "grad_norm": 1.3560309380214235, + "learning_rate": 1.2687332507713367e-07, + "loss": 0.0919, + "step": 13495 + }, + { + "epoch": 4.653680399931046, + "grad_norm": 1.3211197526343696, + "learning_rate": 1.2562676799291295e-07, + "loss": 0.0936, + "step": 13500 + }, + { + "epoch": 4.655404240648164, + "grad_norm": 1.7175327864120091, + "learning_rate": 1.2438628710826462e-07, + "loss": 0.0963, + "step": 13505 + }, + { + "epoch": 4.657128081365282, + "grad_norm": 1.3462874472570279, + "learning_rate": 1.2315188396952393e-07, + "loss": 0.0942, + "step": 13510 + }, + { + "epoch": 4.6588519220824, + "grad_norm": 1.3704760307183306, + "learning_rate": 1.219235601154478e-07, + "loss": 0.0994, + "step": 13515 + }, + { + "epoch": 4.6605757627995175, + "grad_norm": 1.527220264908924, + "learning_rate": 1.2070131707721645e-07, + "loss": 0.0887, + "step": 13520 + }, + { + "epoch": 4.662299603516635, + "grad_norm": 1.38286071365882, + "learning_rate": 1.1948515637842772e-07, + "loss": 0.109, + "step": 13525 + }, + { + "epoch": 4.664023444233753, + "grad_norm": 1.3332598116491754, + "learning_rate": 1.1827507953510065e-07, + "loss": 0.1045, + "step": 13530 + }, + { + "epoch": 4.6657472849508705, + "grad_norm": 1.4427499746222403, + "learning_rate": 1.1707108805566914e-07, + "loss": 0.1141, + "step": 13535 + }, + { + "epoch": 4.667471125667988, + "grad_norm": 1.137998617122497, + "learning_rate": 1.1587318344097987e-07, + "loss": 0.083, + "step": 13540 + }, + { + "epoch": 4.669194966385106, + "grad_norm": 1.600272132407465, + "learning_rate": 1.146813671842939e-07, + "loss": 0.0856, + "step": 13545 + }, + { + "epoch": 4.670918807102224, + "grad_norm": 1.6046131715497098, + "learning_rate": 1.1349564077128172e-07, + "loss": 0.1054, + "step": 13550 + }, + { + "epoch": 4.672642647819341, + "grad_norm": 1.1104256373747428, + "learning_rate": 1.1231600568002266e-07, + "loss": 0.0899, + "step": 13555 + }, + { + "epoch": 4.674366488536459, + "grad_norm": 1.352860699849487, + "learning_rate": 1.1114246338100209e-07, + "loss": 0.0748, + "step": 13560 + }, + { + "epoch": 4.676090329253577, + "grad_norm": 1.4976105797333303, + "learning_rate": 1.0997501533711263e-07, + "loss": 0.1103, + "step": 13565 + }, + { + "epoch": 4.677814169970695, + "grad_norm": 1.3136807231096304, + "learning_rate": 1.0881366300364681e-07, + "loss": 0.0818, + "step": 13570 + }, + { + "epoch": 4.679538010687812, + "grad_norm": 1.4188593207640947, + "learning_rate": 1.0765840782830106e-07, + "loss": 0.0924, + "step": 13575 + }, + { + "epoch": 4.68126185140493, + "grad_norm": 1.5397827779204702, + "learning_rate": 1.0650925125117062e-07, + "loss": 0.0937, + "step": 13580 + }, + { + "epoch": 4.682985692122048, + "grad_norm": 1.120202347782216, + "learning_rate": 1.0536619470474852e-07, + "loss": 0.0867, + "step": 13585 + }, + { + "epoch": 4.6847095328391655, + "grad_norm": 1.820853812092607, + "learning_rate": 1.0422923961392328e-07, + "loss": 0.0957, + "step": 13590 + }, + { + "epoch": 4.686433373556284, + "grad_norm": 1.4286969067738933, + "learning_rate": 1.0309838739597677e-07, + "loss": 0.1019, + "step": 13595 + }, + { + "epoch": 4.688157214273401, + "grad_norm": 1.2270942167491161, + "learning_rate": 1.0197363946058637e-07, + "loss": 0.0867, + "step": 13600 + }, + { + "epoch": 4.6898810549905185, + "grad_norm": 2.019887324948578, + "learning_rate": 1.0085499720981661e-07, + "loss": 0.1058, + "step": 13605 + }, + { + "epoch": 4.691604895707637, + "grad_norm": 1.4971597940304116, + "learning_rate": 9.97424620381221e-08, + "loss": 0.0966, + "step": 13610 + }, + { + "epoch": 4.693328736424754, + "grad_norm": 1.3518552066019132, + "learning_rate": 9.863603533234622e-08, + "loss": 0.1033, + "step": 13615 + }, + { + "epoch": 4.6950525771418725, + "grad_norm": 1.3860843666652813, + "learning_rate": 9.753571847171572e-08, + "loss": 0.0901, + "step": 13620 + }, + { + "epoch": 4.69677641785899, + "grad_norm": 1.3014880507475362, + "learning_rate": 9.644151282784119e-08, + "loss": 0.0965, + "step": 13625 + }, + { + "epoch": 4.698500258576107, + "grad_norm": 1.4008498586606288, + "learning_rate": 9.535341976471713e-08, + "loss": 0.0999, + "step": 13630 + }, + { + "epoch": 4.700224099293226, + "grad_norm": 1.322094982701535, + "learning_rate": 9.427144063871629e-08, + "loss": 0.0812, + "step": 13635 + }, + { + "epoch": 4.701947940010343, + "grad_norm": 1.3876031917007183, + "learning_rate": 9.319557679859093e-08, + "loss": 0.0933, + "step": 13640 + }, + { + "epoch": 4.70367178072746, + "grad_norm": 1.110067386556814, + "learning_rate": 9.212582958546989e-08, + "loss": 0.0985, + "step": 13645 + }, + { + "epoch": 4.705395621444579, + "grad_norm": 1.6536004853182418, + "learning_rate": 9.106220033285762e-08, + "loss": 0.0976, + "step": 13650 + }, + { + "epoch": 4.707119462161696, + "grad_norm": 1.3877762732500114, + "learning_rate": 9.000469036663128e-08, + "loss": 0.0922, + "step": 13655 + }, + { + "epoch": 4.708843302878814, + "grad_norm": 1.4471504297036866, + "learning_rate": 8.89533010050414e-08, + "loss": 0.0933, + "step": 13660 + }, + { + "epoch": 4.710567143595932, + "grad_norm": 1.345184410865251, + "learning_rate": 8.790803355870847e-08, + "loss": 0.1045, + "step": 13665 + }, + { + "epoch": 4.712290984313049, + "grad_norm": 1.2747865374695397, + "learning_rate": 8.686888933062076e-08, + "loss": 0.1097, + "step": 13670 + }, + { + "epoch": 4.714014825030167, + "grad_norm": 1.2778218298625545, + "learning_rate": 8.583586961613432e-08, + "loss": 0.1021, + "step": 13675 + }, + { + "epoch": 4.715738665747285, + "grad_norm": 1.2033123973842108, + "learning_rate": 8.480897570296964e-08, + "loss": 0.0995, + "step": 13680 + }, + { + "epoch": 4.717462506464402, + "grad_norm": 1.1922744353834456, + "learning_rate": 8.378820887121276e-08, + "loss": 0.0794, + "step": 13685 + }, + { + "epoch": 4.7191863471815205, + "grad_norm": 1.2173613841871695, + "learning_rate": 8.277357039330969e-08, + "loss": 0.094, + "step": 13690 + }, + { + "epoch": 4.720910187898638, + "grad_norm": 1.4153786129778525, + "learning_rate": 8.176506153406983e-08, + "loss": 0.0825, + "step": 13695 + }, + { + "epoch": 4.722634028615756, + "grad_norm": 1.2756006556141246, + "learning_rate": 8.07626835506592e-08, + "loss": 0.0915, + "step": 13700 + }, + { + "epoch": 4.724357869332874, + "grad_norm": 1.3386533645764536, + "learning_rate": 7.976643769260329e-08, + "loss": 0.0966, + "step": 13705 + }, + { + "epoch": 4.726081710049991, + "grad_norm": 1.2853753966838655, + "learning_rate": 7.877632520178146e-08, + "loss": 0.0975, + "step": 13710 + }, + { + "epoch": 4.727805550767109, + "grad_norm": 1.200683434493203, + "learning_rate": 7.779234731242869e-08, + "loss": 0.1097, + "step": 13715 + }, + { + "epoch": 4.729529391484227, + "grad_norm": 1.2438111752310064, + "learning_rate": 7.68145052511332e-08, + "loss": 0.0969, + "step": 13720 + }, + { + "epoch": 4.731253232201345, + "grad_norm": 1.4857904886894677, + "learning_rate": 7.584280023683333e-08, + "loss": 0.0981, + "step": 13725 + }, + { + "epoch": 4.732977072918462, + "grad_norm": 1.2569915355430077, + "learning_rate": 7.487723348081788e-08, + "loss": 0.0946, + "step": 13730 + }, + { + "epoch": 4.73470091363558, + "grad_norm": 1.3006579967061103, + "learning_rate": 7.391780618672461e-08, + "loss": 0.0979, + "step": 13735 + }, + { + "epoch": 4.736424754352698, + "grad_norm": 1.2060927022007693, + "learning_rate": 7.296451955053685e-08, + "loss": 0.0971, + "step": 13740 + }, + { + "epoch": 4.7381485950698154, + "grad_norm": 1.333980082942855, + "learning_rate": 7.201737476058346e-08, + "loss": 0.098, + "step": 13745 + }, + { + "epoch": 4.739872435786934, + "grad_norm": 1.528677809213225, + "learning_rate": 7.107637299753833e-08, + "loss": 0.0991, + "step": 13750 + }, + { + "epoch": 4.741596276504051, + "grad_norm": 1.5331933283271482, + "learning_rate": 7.01415154344165e-08, + "loss": 0.0996, + "step": 13755 + }, + { + "epoch": 4.7433201172211685, + "grad_norm": 1.5113460268661256, + "learning_rate": 6.921280323657354e-08, + "loss": 0.0921, + "step": 13760 + }, + { + "epoch": 4.745043957938287, + "grad_norm": 1.5194760667157945, + "learning_rate": 6.829023756170505e-08, + "loss": 0.101, + "step": 13765 + }, + { + "epoch": 4.746767798655404, + "grad_norm": 6.4406094901708775, + "learning_rate": 6.737381955984556e-08, + "loss": 0.0842, + "step": 13770 + }, + { + "epoch": 4.748491639372522, + "grad_norm": 1.3920559725715298, + "learning_rate": 6.646355037336461e-08, + "loss": 0.102, + "step": 13775 + }, + { + "epoch": 4.75021548008964, + "grad_norm": 1.5441176486271477, + "learning_rate": 6.555943113696783e-08, + "loss": 0.091, + "step": 13780 + }, + { + "epoch": 4.751939320806757, + "grad_norm": 1.6977755494926987, + "learning_rate": 6.466146297769427e-08, + "loss": 0.112, + "step": 13785 + }, + { + "epoch": 4.753663161523876, + "grad_norm": 1.1084874788367438, + "learning_rate": 6.376964701491518e-08, + "loss": 0.096, + "step": 13790 + }, + { + "epoch": 4.755387002240993, + "grad_norm": 1.3672625263770586, + "learning_rate": 6.288398436033294e-08, + "loss": 0.1067, + "step": 13795 + }, + { + "epoch": 4.75711084295811, + "grad_norm": 1.4085136982008044, + "learning_rate": 6.200447611797889e-08, + "loss": 0.0996, + "step": 13800 + }, + { + "epoch": 4.758834683675229, + "grad_norm": 1.315774980855411, + "learning_rate": 6.113112338421379e-08, + "loss": 0.0854, + "step": 13805 + }, + { + "epoch": 4.760558524392346, + "grad_norm": 1.3449961450823873, + "learning_rate": 6.026392724772346e-08, + "loss": 0.1049, + "step": 13810 + }, + { + "epoch": 4.7622823651094635, + "grad_norm": 1.3416353730437305, + "learning_rate": 5.9402888789520386e-08, + "loss": 0.1074, + "step": 13815 + }, + { + "epoch": 4.764006205826582, + "grad_norm": 1.6156281298912567, + "learning_rate": 5.8548009082941005e-08, + "loss": 0.1059, + "step": 13820 + }, + { + "epoch": 4.765730046543699, + "grad_norm": 1.906243084579866, + "learning_rate": 5.769928919364454e-08, + "loss": 0.1118, + "step": 13825 + }, + { + "epoch": 4.767453887260817, + "grad_norm": 1.2151040203494308, + "learning_rate": 5.68567301796108e-08, + "loss": 0.1082, + "step": 13830 + }, + { + "epoch": 4.769177727977935, + "grad_norm": 1.4233744317358104, + "learning_rate": 5.6020333091140743e-08, + "loss": 0.1141, + "step": 13835 + }, + { + "epoch": 4.770901568695052, + "grad_norm": 1.2188441261815888, + "learning_rate": 5.51900989708537e-08, + "loss": 0.0909, + "step": 13840 + }, + { + "epoch": 4.7726254094121705, + "grad_norm": 3.5705776344629556, + "learning_rate": 5.43660288536868e-08, + "loss": 0.0996, + "step": 13845 + }, + { + "epoch": 4.774349250129288, + "grad_norm": 1.5089928203842367, + "learning_rate": 5.3548123766891666e-08, + "loss": 0.0836, + "step": 13850 + }, + { + "epoch": 4.776073090846406, + "grad_norm": 1.4416933213557006, + "learning_rate": 5.2736384730037726e-08, + "loss": 0.0901, + "step": 13855 + }, + { + "epoch": 4.777796931563524, + "grad_norm": 1.4536660462965365, + "learning_rate": 5.1930812755005554e-08, + "loss": 0.1095, + "step": 13860 + }, + { + "epoch": 4.779520772280641, + "grad_norm": 1.394197227843644, + "learning_rate": 5.1131408845989106e-08, + "loss": 0.1024, + "step": 13865 + }, + { + "epoch": 4.781244612997759, + "grad_norm": 1.4539150414557434, + "learning_rate": 5.0338173999494586e-08, + "loss": 0.1045, + "step": 13870 + }, + { + "epoch": 4.782968453714877, + "grad_norm": 1.2009776217172086, + "learning_rate": 4.9551109204336034e-08, + "loss": 0.0786, + "step": 13875 + }, + { + "epoch": 4.784692294431995, + "grad_norm": 1.3029531931726013, + "learning_rate": 4.877021544163696e-08, + "loss": 0.1031, + "step": 13880 + }, + { + "epoch": 4.786416135149112, + "grad_norm": 1.4638350709115346, + "learning_rate": 4.79954936848287e-08, + "loss": 0.1285, + "step": 13885 + }, + { + "epoch": 4.78813997586623, + "grad_norm": 1.325904788671911, + "learning_rate": 4.7226944899649296e-08, + "loss": 0.0951, + "step": 13890 + }, + { + "epoch": 4.789863816583348, + "grad_norm": 1.5621696558731313, + "learning_rate": 4.646457004413962e-08, + "loss": 0.0955, + "step": 13895 + }, + { + "epoch": 4.791587657300465, + "grad_norm": 1.6548210222936395, + "learning_rate": 4.5708370068646144e-08, + "loss": 0.0971, + "step": 13900 + }, + { + "epoch": 4.793311498017583, + "grad_norm": 1.3102386537006776, + "learning_rate": 4.495834591581871e-08, + "loss": 0.0956, + "step": 13905 + }, + { + "epoch": 4.795035338734701, + "grad_norm": 1.1558147781651293, + "learning_rate": 4.4214498520607216e-08, + "loss": 0.0844, + "step": 13910 + }, + { + "epoch": 4.7967591794518185, + "grad_norm": 1.3034762756808058, + "learning_rate": 4.3476828810261054e-08, + "loss": 0.0976, + "step": 13915 + }, + { + "epoch": 4.798483020168936, + "grad_norm": 1.0778541595176203, + "learning_rate": 4.2745337704331316e-08, + "loss": 0.0869, + "step": 13920 + }, + { + "epoch": 4.800206860886054, + "grad_norm": 1.5648959055945622, + "learning_rate": 4.202002611466471e-08, + "loss": 0.1046, + "step": 13925 + }, + { + "epoch": 4.801930701603172, + "grad_norm": 1.4808155428582115, + "learning_rate": 4.130089494540635e-08, + "loss": 0.0988, + "step": 13930 + }, + { + "epoch": 4.80365454232029, + "grad_norm": 1.4236243594177849, + "learning_rate": 4.058794509299635e-08, + "loss": 0.0989, + "step": 13935 + }, + { + "epoch": 4.805378383037407, + "grad_norm": 1.2427124493604509, + "learning_rate": 3.9881177446169376e-08, + "loss": 0.1012, + "step": 13940 + }, + { + "epoch": 4.807102223754525, + "grad_norm": 1.9298724366113007, + "learning_rate": 3.918059288595399e-08, + "loss": 0.1038, + "step": 13945 + }, + { + "epoch": 4.808826064471643, + "grad_norm": 1.2998729388360812, + "learning_rate": 3.848619228567107e-08, + "loss": 0.1094, + "step": 13950 + }, + { + "epoch": 4.81054990518876, + "grad_norm": 1.3651422935501445, + "learning_rate": 3.7797976510933196e-08, + "loss": 0.0985, + "step": 13955 + }, + { + "epoch": 4.812273745905879, + "grad_norm": 1.3622378507771178, + "learning_rate": 3.711594641964189e-08, + "loss": 0.1002, + "step": 13960 + }, + { + "epoch": 4.813997586622996, + "grad_norm": 1.1948443111747744, + "learning_rate": 3.644010286198929e-08, + "loss": 0.0919, + "step": 13965 + }, + { + "epoch": 4.815721427340113, + "grad_norm": 1.3791338577576255, + "learning_rate": 3.577044668045482e-08, + "loss": 0.1104, + "step": 13970 + }, + { + "epoch": 4.817445268057232, + "grad_norm": 1.422126479577189, + "learning_rate": 3.5106978709805726e-08, + "loss": 0.1, + "step": 13975 + }, + { + "epoch": 4.819169108774349, + "grad_norm": 1.3106926505898486, + "learning_rate": 3.4449699777093226e-08, + "loss": 0.094, + "step": 13980 + }, + { + "epoch": 4.820892949491467, + "grad_norm": 1.1769287100489754, + "learning_rate": 3.3798610701656906e-08, + "loss": 0.0864, + "step": 13985 + }, + { + "epoch": 4.822616790208585, + "grad_norm": 1.2869442181883943, + "learning_rate": 3.315371229511754e-08, + "loss": 0.0943, + "step": 13990 + }, + { + "epoch": 4.824340630925702, + "grad_norm": 1.3513580116611834, + "learning_rate": 3.2515005361380415e-08, + "loss": 0.0961, + "step": 13995 + }, + { + "epoch": 4.8260644716428205, + "grad_norm": 1.4141085437286893, + "learning_rate": 3.1882490696631406e-08, + "loss": 0.1028, + "step": 14000 + }, + { + "epoch": 4.827788312359938, + "grad_norm": 1.2240986527661692, + "learning_rate": 3.125616908933815e-08, + "loss": 0.0889, + "step": 14005 + }, + { + "epoch": 4.829512153077056, + "grad_norm": 1.3401514969185462, + "learning_rate": 3.063604132024889e-08, + "loss": 0.0958, + "step": 14010 + }, + { + "epoch": 4.831235993794174, + "grad_norm": 1.236269890014706, + "learning_rate": 3.0022108162389706e-08, + "loss": 0.0974, + "step": 14015 + }, + { + "epoch": 4.832959834511291, + "grad_norm": 1.3310133105230573, + "learning_rate": 2.9414370381065095e-08, + "loss": 0.0998, + "step": 14020 + }, + { + "epoch": 4.834683675228409, + "grad_norm": 1.4760525563365918, + "learning_rate": 2.8812828733856825e-08, + "loss": 0.0988, + "step": 14025 + }, + { + "epoch": 4.836407515945527, + "grad_norm": 1.3013321767236874, + "learning_rate": 2.8217483970623404e-08, + "loss": 0.0972, + "step": 14030 + }, + { + "epoch": 4.838131356662644, + "grad_norm": 1.4514264775266061, + "learning_rate": 2.762833683349786e-08, + "loss": 0.0879, + "step": 14035 + }, + { + "epoch": 4.839855197379762, + "grad_norm": 1.3986986620265374, + "learning_rate": 2.7045388056886613e-08, + "loss": 0.0973, + "step": 14040 + }, + { + "epoch": 4.84157903809688, + "grad_norm": 1.3973586120737507, + "learning_rate": 2.6468638367471156e-08, + "loss": 0.1161, + "step": 14045 + }, + { + "epoch": 4.843302878813997, + "grad_norm": 1.4211471904477977, + "learning_rate": 2.5898088484204164e-08, + "loss": 0.1161, + "step": 14050 + }, + { + "epoch": 4.845026719531115, + "grad_norm": 1.1241958618848094, + "learning_rate": 2.5333739118310607e-08, + "loss": 0.0933, + "step": 14055 + }, + { + "epoch": 4.846750560248233, + "grad_norm": 1.093403189910431, + "learning_rate": 2.4775590973286634e-08, + "loss": 0.0972, + "step": 14060 + }, + { + "epoch": 4.848474400965351, + "grad_norm": 1.3846975251008296, + "learning_rate": 2.4223644744896247e-08, + "loss": 0.0936, + "step": 14065 + }, + { + "epoch": 4.8501982416824685, + "grad_norm": 1.1625662697774155, + "learning_rate": 2.3677901121174628e-08, + "loss": 0.1267, + "step": 14070 + }, + { + "epoch": 4.851922082399586, + "grad_norm": 1.6069938340444492, + "learning_rate": 2.3138360782423707e-08, + "loss": 0.1051, + "step": 14075 + }, + { + "epoch": 4.853645923116704, + "grad_norm": 1.3886022004222287, + "learning_rate": 2.2605024401212704e-08, + "loss": 0.0958, + "step": 14080 + }, + { + "epoch": 4.855369763833822, + "grad_norm": 1.256109373027148, + "learning_rate": 2.207789264237814e-08, + "loss": 0.0951, + "step": 14085 + }, + { + "epoch": 4.85709360455094, + "grad_norm": 1.4532026966920928, + "learning_rate": 2.1556966163021054e-08, + "loss": 0.0957, + "step": 14090 + }, + { + "epoch": 4.858817445268057, + "grad_norm": 1.3896721377892596, + "learning_rate": 2.1042245612507563e-08, + "loss": 0.1107, + "step": 14095 + }, + { + "epoch": 4.860541285985175, + "grad_norm": 1.161239952998629, + "learning_rate": 2.0533731632468302e-08, + "loss": 0.1014, + "step": 14100 + }, + { + "epoch": 4.862265126702293, + "grad_norm": 1.4405078393378496, + "learning_rate": 2.0031424856795656e-08, + "loss": 0.0964, + "step": 14105 + }, + { + "epoch": 4.86398896741941, + "grad_norm": 1.0542170598870872, + "learning_rate": 1.9535325911645974e-08, + "loss": 0.1002, + "step": 14110 + }, + { + "epoch": 4.865712808136529, + "grad_norm": 1.211915362473615, + "learning_rate": 1.9045435415436798e-08, + "loss": 0.0883, + "step": 14115 + }, + { + "epoch": 4.867436648853646, + "grad_norm": 1.4877414514791176, + "learning_rate": 1.856175397884519e-08, + "loss": 0.1021, + "step": 14120 + }, + { + "epoch": 4.869160489570763, + "grad_norm": 1.257286910642509, + "learning_rate": 1.80842822048094e-08, + "loss": 0.1031, + "step": 14125 + }, + { + "epoch": 4.870884330287882, + "grad_norm": 1.3420926910135274, + "learning_rate": 1.7613020688527215e-08, + "loss": 0.1156, + "step": 14130 + }, + { + "epoch": 4.872608171004999, + "grad_norm": 1.3855615335288138, + "learning_rate": 1.7147970017454275e-08, + "loss": 0.1131, + "step": 14135 + }, + { + "epoch": 4.8743320117221165, + "grad_norm": 1.3496689615543849, + "learning_rate": 1.6689130771304076e-08, + "loss": 0.0972, + "step": 14140 + }, + { + "epoch": 4.876055852439235, + "grad_norm": 1.2361116160781513, + "learning_rate": 1.6236503522046865e-08, + "loss": 0.0925, + "step": 14145 + }, + { + "epoch": 4.877779693156352, + "grad_norm": 1.2949134418825732, + "learning_rate": 1.5790088833910755e-08, + "loss": 0.0882, + "step": 14150 + }, + { + "epoch": 4.8795035338734705, + "grad_norm": 1.4423232706755098, + "learning_rate": 1.5349887263377826e-08, + "loss": 0.096, + "step": 14155 + }, + { + "epoch": 4.881227374590588, + "grad_norm": 1.5242583230047713, + "learning_rate": 1.491589935918636e-08, + "loss": 0.1077, + "step": 14160 + }, + { + "epoch": 4.882951215307705, + "grad_norm": 1.2823536392269483, + "learning_rate": 1.448812566232749e-08, + "loss": 0.0918, + "step": 14165 + }, + { + "epoch": 4.8846750560248235, + "grad_norm": 1.3272510700353644, + "learning_rate": 1.4066566706048001e-08, + "loss": 0.115, + "step": 14170 + }, + { + "epoch": 4.886398896741941, + "grad_norm": 1.566051023444431, + "learning_rate": 1.3651223015845871e-08, + "loss": 0.0994, + "step": 14175 + }, + { + "epoch": 4.888122737459058, + "grad_norm": 1.3707859433477998, + "learning_rate": 1.3242095109471942e-08, + "loss": 0.1098, + "step": 14180 + }, + { + "epoch": 4.889846578176177, + "grad_norm": 1.4204996960180363, + "learning_rate": 1.2839183496928808e-08, + "loss": 0.1047, + "step": 14185 + }, + { + "epoch": 4.891570418893294, + "grad_norm": 1.0365809080242143, + "learning_rate": 1.2442488680470266e-08, + "loss": 0.0917, + "step": 14190 + }, + { + "epoch": 4.893294259610412, + "grad_norm": 1.1823149883840962, + "learning_rate": 1.2052011154600197e-08, + "loss": 0.0911, + "step": 14195 + }, + { + "epoch": 4.89501810032753, + "grad_norm": 1.424281172971629, + "learning_rate": 1.1667751406072569e-08, + "loss": 0.0897, + "step": 14200 + }, + { + "epoch": 4.896741941044647, + "grad_norm": 1.3321713768345178, + "learning_rate": 1.128970991388978e-08, + "loss": 0.0951, + "step": 14205 + }, + { + "epoch": 4.898465781761765, + "grad_norm": 1.539849141369638, + "learning_rate": 1.0917887149303196e-08, + "loss": 0.0972, + "step": 14210 + }, + { + "epoch": 4.900189622478883, + "grad_norm": 1.3932031260084274, + "learning_rate": 1.0552283575813171e-08, + "loss": 0.0952, + "step": 14215 + }, + { + "epoch": 4.901913463196001, + "grad_norm": 1.453166419944204, + "learning_rate": 1.01928996491657e-08, + "loss": 0.1088, + "step": 14220 + }, + { + "epoch": 4.9036373039131185, + "grad_norm": 1.334424660734862, + "learning_rate": 9.8397358173552e-09, + "loss": 0.0879, + "step": 14225 + }, + { + "epoch": 4.905361144630236, + "grad_norm": 1.322735552987269, + "learning_rate": 9.492792520620631e-09, + "loss": 0.0917, + "step": 14230 + }, + { + "epoch": 4.907084985347354, + "grad_norm": 1.4973921778425843, + "learning_rate": 9.152070191448814e-09, + "loss": 0.0957, + "step": 14235 + }, + { + "epoch": 4.9088088260644716, + "grad_norm": 1.431169667547247, + "learning_rate": 8.817569254569447e-09, + "loss": 0.0979, + "step": 14240 + }, + { + "epoch": 4.91053266678159, + "grad_norm": 1.2710799194841491, + "learning_rate": 8.489290126959537e-09, + "loss": 0.0961, + "step": 14245 + }, + { + "epoch": 4.912256507498707, + "grad_norm": 1.3185640551155087, + "learning_rate": 8.1672332178373e-09, + "loss": 0.0795, + "step": 14250 + }, + { + "epoch": 4.913980348215825, + "grad_norm": 1.6115725732360635, + "learning_rate": 7.851398928667154e-09, + "loss": 0.105, + "step": 14255 + }, + { + "epoch": 4.915704188932943, + "grad_norm": 1.5133238027410296, + "learning_rate": 7.54178765315472e-09, + "loss": 0.0924, + "step": 14260 + }, + { + "epoch": 4.91742802965006, + "grad_norm": 1.1314040169361865, + "learning_rate": 7.238399777249605e-09, + "loss": 0.0992, + "step": 14265 + }, + { + "epoch": 4.919151870367178, + "grad_norm": 1.250245971151011, + "learning_rate": 6.941235679143177e-09, + "loss": 0.0979, + "step": 14270 + }, + { + "epoch": 4.920875711084296, + "grad_norm": 1.5807535933663104, + "learning_rate": 6.650295729268008e-09, + "loss": 0.0957, + "step": 14275 + }, + { + "epoch": 4.922599551801413, + "grad_norm": 1.2646164150834145, + "learning_rate": 6.3655802902984345e-09, + "loss": 0.0979, + "step": 14280 + }, + { + "epoch": 4.924323392518531, + "grad_norm": 1.1472513451831219, + "learning_rate": 6.087089717148887e-09, + "loss": 0.0745, + "step": 14285 + }, + { + "epoch": 4.926047233235649, + "grad_norm": 1.3787643736402495, + "learning_rate": 5.814824356975557e-09, + "loss": 0.092, + "step": 14290 + }, + { + "epoch": 4.9277710739527665, + "grad_norm": 1.2047071648300773, + "learning_rate": 5.54878454917307e-09, + "loss": 0.093, + "step": 14295 + }, + { + "epoch": 4.929494914669885, + "grad_norm": 1.48549431951532, + "learning_rate": 5.288970625376144e-09, + "loss": 0.1051, + "step": 14300 + }, + { + "epoch": 4.931218755387002, + "grad_norm": 1.3761400542258648, + "learning_rate": 5.035382909457931e-09, + "loss": 0.1197, + "step": 14305 + }, + { + "epoch": 4.93294259610412, + "grad_norm": 1.2031446727362487, + "learning_rate": 4.788021717531677e-09, + "loss": 0.1019, + "step": 14310 + }, + { + "epoch": 4.934666436821238, + "grad_norm": 1.0569127634349142, + "learning_rate": 4.546887357947394e-09, + "loss": 0.0869, + "step": 14315 + }, + { + "epoch": 4.936390277538355, + "grad_norm": 1.3663366213890815, + "learning_rate": 4.31198013129408e-09, + "loss": 0.0917, + "step": 14320 + }, + { + "epoch": 4.9381141182554735, + "grad_norm": 1.4437475336622627, + "learning_rate": 4.083300330396944e-09, + "loss": 0.097, + "step": 14325 + }, + { + "epoch": 4.939837958972591, + "grad_norm": 1.384616177252969, + "learning_rate": 3.8608482403196255e-09, + "loss": 0.0977, + "step": 14330 + }, + { + "epoch": 4.941561799689708, + "grad_norm": 1.4573513735461567, + "learning_rate": 3.644624138362529e-09, + "loss": 0.0998, + "step": 14335 + }, + { + "epoch": 4.943285640406827, + "grad_norm": 1.4257908131537387, + "learning_rate": 3.4346282940611596e-09, + "loss": 0.0843, + "step": 14340 + }, + { + "epoch": 4.945009481123944, + "grad_norm": 1.4472104168885904, + "learning_rate": 3.2308609691877878e-09, + "loss": 0.112, + "step": 14345 + }, + { + "epoch": 4.946733321841062, + "grad_norm": 1.3596620459225746, + "learning_rate": 3.033322417752005e-09, + "loss": 0.0944, + "step": 14350 + }, + { + "epoch": 4.94845716255818, + "grad_norm": 1.4080719556223817, + "learning_rate": 2.8420128859962813e-09, + "loss": 0.1058, + "step": 14355 + }, + { + "epoch": 4.950181003275297, + "grad_norm": 1.929399587520485, + "learning_rate": 2.656932612399854e-09, + "loss": 0.1022, + "step": 14360 + }, + { + "epoch": 4.951904843992415, + "grad_norm": 1.1843384274221604, + "learning_rate": 2.478081827676504e-09, + "loss": 0.1107, + "step": 14365 + }, + { + "epoch": 4.953628684709533, + "grad_norm": 1.2150546344422188, + "learning_rate": 2.305460754774003e-09, + "loss": 0.0929, + "step": 14370 + }, + { + "epoch": 4.955352525426651, + "grad_norm": 1.2852605711580363, + "learning_rate": 2.1390696088757766e-09, + "loss": 0.1072, + "step": 14375 + }, + { + "epoch": 4.9570763661437685, + "grad_norm": 1.5649879790510222, + "learning_rate": 1.9789085973975774e-09, + "loss": 0.1147, + "step": 14380 + }, + { + "epoch": 4.958800206860886, + "grad_norm": 1.4806792798022161, + "learning_rate": 1.824977919990256e-09, + "loss": 0.119, + "step": 14385 + }, + { + "epoch": 4.960524047578004, + "grad_norm": 1.1713177695747807, + "learning_rate": 1.677277768537544e-09, + "loss": 0.088, + "step": 14390 + }, + { + "epoch": 4.9622478882951215, + "grad_norm": 1.3199356091483336, + "learning_rate": 1.535808327156052e-09, + "loss": 0.0917, + "step": 14395 + }, + { + "epoch": 4.963971729012239, + "grad_norm": 1.240851690916721, + "learning_rate": 1.4005697721969357e-09, + "loss": 0.1025, + "step": 14400 + }, + { + "epoch": 4.965695569729357, + "grad_norm": 1.502781587155829, + "learning_rate": 1.2715622722425657e-09, + "loss": 0.1032, + "step": 14405 + }, + { + "epoch": 4.967419410446475, + "grad_norm": 1.1159646005854584, + "learning_rate": 1.1487859881087471e-09, + "loss": 0.094, + "step": 14410 + }, + { + "epoch": 4.969143251163592, + "grad_norm": 1.2331025159144084, + "learning_rate": 1.0322410728436095e-09, + "loss": 0.0909, + "step": 14415 + }, + { + "epoch": 4.97086709188071, + "grad_norm": 1.287020195683251, + "learning_rate": 9.21927671727052e-10, + "loss": 0.1006, + "step": 14420 + }, + { + "epoch": 4.972590932597828, + "grad_norm": 1.452627575615611, + "learning_rate": 8.178459222712986e-10, + "loss": 0.1066, + "step": 14425 + }, + { + "epoch": 4.974314773314946, + "grad_norm": 1.81159188089844, + "learning_rate": 7.199959542208979e-10, + "loss": 0.0979, + "step": 14430 + }, + { + "epoch": 4.976038614032063, + "grad_norm": 1.1596458413062876, + "learning_rate": 6.283778895516123e-10, + "loss": 0.0944, + "step": 14435 + }, + { + "epoch": 4.977762454749181, + "grad_norm": 2.465260462403574, + "learning_rate": 5.429918424709745e-10, + "loss": 0.0965, + "step": 14440 + }, + { + "epoch": 4.979486295466299, + "grad_norm": 1.3471705488906647, + "learning_rate": 4.6383791941773114e-10, + "loss": 0.103, + "step": 14445 + }, + { + "epoch": 4.9812101361834165, + "grad_norm": 1.3025984657839627, + "learning_rate": 3.909162190618432e-10, + "loss": 0.1021, + "step": 14450 + }, + { + "epoch": 4.982933976900535, + "grad_norm": 1.2119348971608377, + "learning_rate": 3.2422683230448617e-10, + "loss": 0.087, + "step": 14455 + }, + { + "epoch": 4.984657817617652, + "grad_norm": 1.245040472055958, + "learning_rate": 2.6376984227860504e-10, + "loss": 0.0952, + "step": 14460 + }, + { + "epoch": 4.9863816583347695, + "grad_norm": 1.3841399141396942, + "learning_rate": 2.0954532434669384e-10, + "loss": 0.0771, + "step": 14465 + }, + { + "epoch": 4.988105499051888, + "grad_norm": 1.8376152995847748, + "learning_rate": 1.6155334610357121e-10, + "loss": 0.0832, + "step": 14470 + }, + { + "epoch": 4.989829339769005, + "grad_norm": 1.352306386898328, + "learning_rate": 1.1979396737415993e-10, + "loss": 0.1128, + "step": 14475 + }, + { + "epoch": 4.9915531804861235, + "grad_norm": 1.7225670797640587, + "learning_rate": 8.426724021348697e-11, + "loss": 0.1111, + "step": 14480 + }, + { + "epoch": 4.993277021203241, + "grad_norm": 1.3740699473559133, + "learning_rate": 5.4973208907793676e-11, + "loss": 0.0885, + "step": 14485 + }, + { + "epoch": 4.995000861920358, + "grad_norm": 1.1852501848701082, + "learning_rate": 3.191190997398064e-11, + "loss": 0.0863, + "step": 14490 + }, + { + "epoch": 4.996724702637477, + "grad_norm": 1.4127853484068187, + "learning_rate": 1.5083372159607756e-11, + "loss": 0.0862, + "step": 14495 + }, + { + "epoch": 4.998448543354594, + "grad_norm": 1.4421464293245208, + "learning_rate": 4.487616442339082e-12, + "loss": 0.102, + "step": 14500 + }, + { + "epoch": 5.0, + "grad_norm": 2.4497902949004597, + "learning_rate": 1.2465603049793828e-13, + "loss": 0.1157, + "step": 14505 + }, + { + "epoch": 5.0, + "step": 14505, + "total_flos": 2.7534079795134464e+16, + "train_loss": 0.2264123897661796, + "train_runtime": 100851.2158, + "train_samples_per_second": 73.624, + "train_steps_per_second": 0.144 + } + ], + "logging_steps": 5, + "max_steps": 14505, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.7534079795134464e+16, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}