diff --git "a/checkpoint-12603/trainer_state.json" "b/checkpoint-12603/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-12603/trainer_state.json" @@ -0,0 +1,8877 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 12603, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00238038562247084, + "grad_norm": 29.197416305541992, + "learning_rate": 4.996032690629215e-05, + "loss": 5.8295, + "step": 10 + }, + { + "epoch": 0.00476077124494168, + "grad_norm": 2.8866491317749023, + "learning_rate": 4.99206538125843e-05, + "loss": 0.9476, + "step": 20 + }, + { + "epoch": 0.007141156867412521, + "grad_norm": 2.2606563568115234, + "learning_rate": 4.988098071887646e-05, + "loss": 0.1466, + "step": 30 + }, + { + "epoch": 0.00952154248988336, + "grad_norm": 2.5246834754943848, + "learning_rate": 4.984130762516862e-05, + "loss": 0.0596, + "step": 40 + }, + { + "epoch": 0.011901928112354201, + "grad_norm": 1.10219144821167, + "learning_rate": 4.980163453146077e-05, + "loss": 0.0351, + "step": 50 + }, + { + "epoch": 0.014282313734825042, + "grad_norm": 1.7988760471343994, + "learning_rate": 4.976196143775292e-05, + "loss": 0.0293, + "step": 60 + }, + { + "epoch": 0.016662699357295883, + "grad_norm": 0.2419203370809555, + "learning_rate": 4.972228834404507e-05, + "loss": 0.024, + "step": 70 + }, + { + "epoch": 0.01904308497976672, + "grad_norm": 0.992480993270874, + "learning_rate": 4.9682615250337225e-05, + "loss": 0.0191, + "step": 80 + }, + { + "epoch": 0.021423470602237562, + "grad_norm": 1.2107903957366943, + "learning_rate": 4.9642942156629376e-05, + "loss": 0.0147, + "step": 90 + }, + { + "epoch": 0.023803856224708403, + "grad_norm": 1.5667377710342407, + "learning_rate": 4.960326906292153e-05, + "loss": 0.0144, + "step": 100 + }, + { + "epoch": 0.026184241847179244, + "grad_norm": 1.7987982034683228, + "learning_rate": 4.956359596921368e-05, + "loss": 0.0121, + "step": 110 + }, + { + "epoch": 0.028564627469650085, + "grad_norm": 0.7142848968505859, + "learning_rate": 4.952392287550583e-05, + "loss": 0.0109, + "step": 120 + }, + { + "epoch": 0.030945013092120922, + "grad_norm": 0.9309341311454773, + "learning_rate": 4.9484249781797984e-05, + "loss": 0.0087, + "step": 130 + }, + { + "epoch": 0.03332539871459177, + "grad_norm": 0.2679256498813629, + "learning_rate": 4.944457668809014e-05, + "loss": 0.0065, + "step": 140 + }, + { + "epoch": 0.035705784337062604, + "grad_norm": 0.36588725447654724, + "learning_rate": 4.940490359438229e-05, + "loss": 0.0075, + "step": 150 + }, + { + "epoch": 0.03808616995953344, + "grad_norm": 0.6737563610076904, + "learning_rate": 4.936523050067445e-05, + "loss": 0.0092, + "step": 160 + }, + { + "epoch": 0.040466555582004286, + "grad_norm": 0.3371886610984802, + "learning_rate": 4.93255574069666e-05, + "loss": 0.0067, + "step": 170 + }, + { + "epoch": 0.042846941204475124, + "grad_norm": 1.0238951444625854, + "learning_rate": 4.928588431325875e-05, + "loss": 0.0084, + "step": 180 + }, + { + "epoch": 0.04522732682694597, + "grad_norm": 1.0350103378295898, + "learning_rate": 4.9246211219550906e-05, + "loss": 0.0073, + "step": 190 + }, + { + "epoch": 0.047607712449416806, + "grad_norm": 0.33256474137306213, + "learning_rate": 4.9206538125843056e-05, + "loss": 0.0082, + "step": 200 + }, + { + "epoch": 0.04998809807188764, + "grad_norm": 0.0693468451499939, + "learning_rate": 4.9166865032135206e-05, + "loss": 0.0044, + "step": 210 + }, + { + "epoch": 0.05236848369435849, + "grad_norm": 0.8809625506401062, + "learning_rate": 4.912719193842736e-05, + "loss": 0.0064, + "step": 220 + }, + { + "epoch": 0.054748869316829325, + "grad_norm": 0.36927270889282227, + "learning_rate": 4.9087518844719514e-05, + "loss": 0.0066, + "step": 230 + }, + { + "epoch": 0.05712925493930017, + "grad_norm": 0.8885632753372192, + "learning_rate": 4.9047845751011664e-05, + "loss": 0.0063, + "step": 240 + }, + { + "epoch": 0.05950964056177101, + "grad_norm": 0.5330325365066528, + "learning_rate": 4.900817265730382e-05, + "loss": 0.0059, + "step": 250 + }, + { + "epoch": 0.061890026184241845, + "grad_norm": 0.5747584700584412, + "learning_rate": 4.896849956359597e-05, + "loss": 0.0056, + "step": 260 + }, + { + "epoch": 0.06427041180671268, + "grad_norm": 0.10936570912599564, + "learning_rate": 4.892882646988812e-05, + "loss": 0.0038, + "step": 270 + }, + { + "epoch": 0.06665079742918353, + "grad_norm": 0.136638343334198, + "learning_rate": 4.888915337618027e-05, + "loss": 0.006, + "step": 280 + }, + { + "epoch": 0.06903118305165437, + "grad_norm": 0.25448599457740784, + "learning_rate": 4.884948028247243e-05, + "loss": 0.0052, + "step": 290 + }, + { + "epoch": 0.07141156867412521, + "grad_norm": 0.19224955141544342, + "learning_rate": 4.8809807188764586e-05, + "loss": 0.0041, + "step": 300 + }, + { + "epoch": 0.07379195429659605, + "grad_norm": 0.9061737060546875, + "learning_rate": 4.8770134095056736e-05, + "loss": 0.0051, + "step": 310 + }, + { + "epoch": 0.07617233991906688, + "grad_norm": 0.31071603298187256, + "learning_rate": 4.873046100134889e-05, + "loss": 0.0043, + "step": 320 + }, + { + "epoch": 0.07855272554153774, + "grad_norm": 0.054100409150123596, + "learning_rate": 4.869078790764104e-05, + "loss": 0.004, + "step": 330 + }, + { + "epoch": 0.08093311116400857, + "grad_norm": 0.11965326964855194, + "learning_rate": 4.865111481393319e-05, + "loss": 0.0039, + "step": 340 + }, + { + "epoch": 0.08331349678647941, + "grad_norm": 0.16056092083454132, + "learning_rate": 4.861144172022535e-05, + "loss": 0.0036, + "step": 350 + }, + { + "epoch": 0.08569388240895025, + "grad_norm": 0.08699148148298264, + "learning_rate": 4.85717686265175e-05, + "loss": 0.0032, + "step": 360 + }, + { + "epoch": 0.08807426803142109, + "grad_norm": 0.16824030876159668, + "learning_rate": 4.853209553280965e-05, + "loss": 0.0033, + "step": 370 + }, + { + "epoch": 0.09045465365389194, + "grad_norm": 0.07728957384824753, + "learning_rate": 4.84924224391018e-05, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.09283503927636277, + "grad_norm": 0.2950897514820099, + "learning_rate": 4.845274934539395e-05, + "loss": 0.0039, + "step": 390 + }, + { + "epoch": 0.09521542489883361, + "grad_norm": 0.6249143481254578, + "learning_rate": 4.841307625168611e-05, + "loss": 0.012, + "step": 400 + }, + { + "epoch": 0.09759581052130445, + "grad_norm": 0.06545058637857437, + "learning_rate": 4.837340315797826e-05, + "loss": 0.0022, + "step": 410 + }, + { + "epoch": 0.09997619614377529, + "grad_norm": 0.40417027473449707, + "learning_rate": 4.833373006427042e-05, + "loss": 0.003, + "step": 420 + }, + { + "epoch": 0.10235658176624614, + "grad_norm": 0.38520482182502747, + "learning_rate": 4.829405697056257e-05, + "loss": 0.0037, + "step": 430 + }, + { + "epoch": 0.10473696738871698, + "grad_norm": 0.9367744326591492, + "learning_rate": 4.825438387685472e-05, + "loss": 0.0029, + "step": 440 + }, + { + "epoch": 0.10711735301118781, + "grad_norm": 0.09369224309921265, + "learning_rate": 4.8214710783146875e-05, + "loss": 0.0021, + "step": 450 + }, + { + "epoch": 0.10949773863365865, + "grad_norm": 1.1114966869354248, + "learning_rate": 4.8175037689439025e-05, + "loss": 0.0024, + "step": 460 + }, + { + "epoch": 0.11187812425612949, + "grad_norm": 0.15539304912090302, + "learning_rate": 4.8135364595731175e-05, + "loss": 0.0026, + "step": 470 + }, + { + "epoch": 0.11425850987860034, + "grad_norm": 0.05451425537467003, + "learning_rate": 4.809569150202333e-05, + "loss": 0.0024, + "step": 480 + }, + { + "epoch": 0.11663889550107118, + "grad_norm": 0.08954957127571106, + "learning_rate": 4.805601840831548e-05, + "loss": 0.0032, + "step": 490 + }, + { + "epoch": 0.11901928112354201, + "grad_norm": 0.24188756942749023, + "learning_rate": 4.801634531460763e-05, + "loss": 0.0023, + "step": 500 + }, + { + "epoch": 0.12139966674601285, + "grad_norm": 0.062233567237854004, + "learning_rate": 4.797667222089979e-05, + "loss": 0.002, + "step": 510 + }, + { + "epoch": 0.12378005236848369, + "grad_norm": 0.605993926525116, + "learning_rate": 4.793699912719194e-05, + "loss": 0.0021, + "step": 520 + }, + { + "epoch": 0.12616043799095453, + "grad_norm": 1.5091257095336914, + "learning_rate": 4.789732603348409e-05, + "loss": 0.0026, + "step": 530 + }, + { + "epoch": 0.12854082361342536, + "grad_norm": 0.07300706952810287, + "learning_rate": 4.785765293977625e-05, + "loss": 0.0018, + "step": 540 + }, + { + "epoch": 0.1309212092358962, + "grad_norm": 0.07547351717948914, + "learning_rate": 4.78179798460684e-05, + "loss": 0.0022, + "step": 550 + }, + { + "epoch": 0.13330159485836707, + "grad_norm": 0.017345329746603966, + "learning_rate": 4.7778306752360555e-05, + "loss": 0.002, + "step": 560 + }, + { + "epoch": 0.1356819804808379, + "grad_norm": 0.048248808830976486, + "learning_rate": 4.7738633658652705e-05, + "loss": 0.0018, + "step": 570 + }, + { + "epoch": 0.13806236610330874, + "grad_norm": 0.04654766246676445, + "learning_rate": 4.7698960564944856e-05, + "loss": 0.0026, + "step": 580 + }, + { + "epoch": 0.14044275172577958, + "grad_norm": 0.7228689193725586, + "learning_rate": 4.7659287471237006e-05, + "loss": 0.0033, + "step": 590 + }, + { + "epoch": 0.14282313734825042, + "grad_norm": 0.01947982981801033, + "learning_rate": 4.761961437752916e-05, + "loss": 0.0024, + "step": 600 + }, + { + "epoch": 0.14520352297072125, + "grad_norm": 0.03398985415697098, + "learning_rate": 4.7579941283821314e-05, + "loss": 0.0019, + "step": 610 + }, + { + "epoch": 0.1475839085931921, + "grad_norm": 0.11993751674890518, + "learning_rate": 4.754026819011347e-05, + "loss": 0.0024, + "step": 620 + }, + { + "epoch": 0.14996429421566293, + "grad_norm": 0.02739240974187851, + "learning_rate": 4.750059509640562e-05, + "loss": 0.0019, + "step": 630 + }, + { + "epoch": 0.15234467983813377, + "grad_norm": 0.08998490869998932, + "learning_rate": 4.746092200269777e-05, + "loss": 0.0026, + "step": 640 + }, + { + "epoch": 0.1547250654606046, + "grad_norm": 0.06008267030119896, + "learning_rate": 4.742124890898992e-05, + "loss": 0.0019, + "step": 650 + }, + { + "epoch": 0.15710545108307547, + "grad_norm": 0.2969667911529541, + "learning_rate": 4.738157581528208e-05, + "loss": 0.0016, + "step": 660 + }, + { + "epoch": 0.1594858367055463, + "grad_norm": 0.056759823113679886, + "learning_rate": 4.7341902721574236e-05, + "loss": 0.0026, + "step": 670 + }, + { + "epoch": 0.16186622232801715, + "grad_norm": 0.36679673194885254, + "learning_rate": 4.7302229627866386e-05, + "loss": 0.0023, + "step": 680 + }, + { + "epoch": 0.16424660795048798, + "grad_norm": 0.29111284017562866, + "learning_rate": 4.7262556534158536e-05, + "loss": 0.0028, + "step": 690 + }, + { + "epoch": 0.16662699357295882, + "grad_norm": 0.48570939898490906, + "learning_rate": 4.722288344045069e-05, + "loss": 0.0015, + "step": 700 + }, + { + "epoch": 0.16900737919542966, + "grad_norm": 0.06863627582788467, + "learning_rate": 4.718321034674284e-05, + "loss": 0.0016, + "step": 710 + }, + { + "epoch": 0.1713877648179005, + "grad_norm": 0.18400460481643677, + "learning_rate": 4.7143537253034994e-05, + "loss": 0.0025, + "step": 720 + }, + { + "epoch": 0.17376815044037133, + "grad_norm": 0.02043345756828785, + "learning_rate": 4.710386415932715e-05, + "loss": 0.0014, + "step": 730 + }, + { + "epoch": 0.17614853606284217, + "grad_norm": 0.22026614844799042, + "learning_rate": 4.70641910656193e-05, + "loss": 0.0016, + "step": 740 + }, + { + "epoch": 0.178528921685313, + "grad_norm": 0.033756159245967865, + "learning_rate": 4.702451797191145e-05, + "loss": 0.0015, + "step": 750 + }, + { + "epoch": 0.18090930730778387, + "grad_norm": 0.03022690862417221, + "learning_rate": 4.69848448782036e-05, + "loss": 0.0016, + "step": 760 + }, + { + "epoch": 0.1832896929302547, + "grad_norm": 0.32997235655784607, + "learning_rate": 4.694517178449576e-05, + "loss": 0.0017, + "step": 770 + }, + { + "epoch": 0.18567007855272555, + "grad_norm": 0.6392120718955994, + "learning_rate": 4.690549869078791e-05, + "loss": 0.0015, + "step": 780 + }, + { + "epoch": 0.18805046417519639, + "grad_norm": 0.12279071658849716, + "learning_rate": 4.6865825597080066e-05, + "loss": 0.0016, + "step": 790 + }, + { + "epoch": 0.19043084979766722, + "grad_norm": 0.1228996068239212, + "learning_rate": 4.682615250337222e-05, + "loss": 0.0012, + "step": 800 + }, + { + "epoch": 0.19281123542013806, + "grad_norm": 0.23846402764320374, + "learning_rate": 4.678647940966437e-05, + "loss": 0.0011, + "step": 810 + }, + { + "epoch": 0.1951916210426089, + "grad_norm": 0.06786726415157318, + "learning_rate": 4.674680631595652e-05, + "loss": 0.0008, + "step": 820 + }, + { + "epoch": 0.19757200666507974, + "grad_norm": 0.062252361327409744, + "learning_rate": 4.6707133222248675e-05, + "loss": 0.0009, + "step": 830 + }, + { + "epoch": 0.19995239228755057, + "grad_norm": 0.10420612245798111, + "learning_rate": 4.6667460128540825e-05, + "loss": 0.0007, + "step": 840 + }, + { + "epoch": 0.2023327779100214, + "grad_norm": 0.024685313925147057, + "learning_rate": 4.6627787034832975e-05, + "loss": 0.0012, + "step": 850 + }, + { + "epoch": 0.20471316353249228, + "grad_norm": 0.07784374058246613, + "learning_rate": 4.658811394112513e-05, + "loss": 0.0011, + "step": 860 + }, + { + "epoch": 0.2070935491549631, + "grad_norm": 0.1463196724653244, + "learning_rate": 4.654844084741728e-05, + "loss": 0.0017, + "step": 870 + }, + { + "epoch": 0.20947393477743395, + "grad_norm": 0.04599474370479584, + "learning_rate": 4.650876775370944e-05, + "loss": 0.0013, + "step": 880 + }, + { + "epoch": 0.2118543203999048, + "grad_norm": 0.44877147674560547, + "learning_rate": 4.646909466000159e-05, + "loss": 0.0012, + "step": 890 + }, + { + "epoch": 0.21423470602237563, + "grad_norm": 1.3056105375289917, + "learning_rate": 4.642942156629374e-05, + "loss": 0.0018, + "step": 900 + }, + { + "epoch": 0.21661509164484646, + "grad_norm": 0.5220457911491394, + "learning_rate": 4.638974847258589e-05, + "loss": 0.0011, + "step": 910 + }, + { + "epoch": 0.2189954772673173, + "grad_norm": 0.5913621783256531, + "learning_rate": 4.635007537887805e-05, + "loss": 0.0013, + "step": 920 + }, + { + "epoch": 0.22137586288978814, + "grad_norm": 0.150216206908226, + "learning_rate": 4.63104022851702e-05, + "loss": 0.001, + "step": 930 + }, + { + "epoch": 0.22375624851225898, + "grad_norm": 0.022638270631432533, + "learning_rate": 4.6270729191462355e-05, + "loss": 0.0012, + "step": 940 + }, + { + "epoch": 0.2261366341347298, + "grad_norm": 0.017948875203728676, + "learning_rate": 4.6231056097754505e-05, + "loss": 0.0008, + "step": 950 + }, + { + "epoch": 0.22851701975720068, + "grad_norm": 0.25053608417510986, + "learning_rate": 4.6191383004046656e-05, + "loss": 0.0019, + "step": 960 + }, + { + "epoch": 0.23089740537967152, + "grad_norm": 0.12757046520709991, + "learning_rate": 4.6151709910338806e-05, + "loss": 0.0019, + "step": 970 + }, + { + "epoch": 0.23327779100214235, + "grad_norm": 0.185049369931221, + "learning_rate": 4.611203681663096e-05, + "loss": 0.0014, + "step": 980 + }, + { + "epoch": 0.2356581766246132, + "grad_norm": 0.37812331318855286, + "learning_rate": 4.607236372292312e-05, + "loss": 0.0014, + "step": 990 + }, + { + "epoch": 0.23803856224708403, + "grad_norm": 0.7450318336486816, + "learning_rate": 4.603269062921527e-05, + "loss": 0.0016, + "step": 1000 + }, + { + "epoch": 0.24041894786955487, + "grad_norm": 0.03629771247506142, + "learning_rate": 4.599301753550742e-05, + "loss": 0.0012, + "step": 1010 + }, + { + "epoch": 0.2427993334920257, + "grad_norm": 0.23223434388637543, + "learning_rate": 4.595334444179957e-05, + "loss": 0.0011, + "step": 1020 + }, + { + "epoch": 0.24517971911449654, + "grad_norm": 0.08511273562908173, + "learning_rate": 4.591367134809172e-05, + "loss": 0.0006, + "step": 1030 + }, + { + "epoch": 0.24756010473696738, + "grad_norm": 0.17114369571208954, + "learning_rate": 4.587399825438388e-05, + "loss": 0.001, + "step": 1040 + }, + { + "epoch": 0.24994049035943822, + "grad_norm": 0.04517650604248047, + "learning_rate": 4.5834325160676035e-05, + "loss": 0.0008, + "step": 1050 + }, + { + "epoch": 0.25232087598190905, + "grad_norm": 0.20234528183937073, + "learning_rate": 4.5794652066968186e-05, + "loss": 0.0014, + "step": 1060 + }, + { + "epoch": 0.2547012616043799, + "grad_norm": 0.007534442003816366, + "learning_rate": 4.5754978973260336e-05, + "loss": 0.0008, + "step": 1070 + }, + { + "epoch": 0.25708164722685073, + "grad_norm": 0.02520332857966423, + "learning_rate": 4.5715305879552486e-05, + "loss": 0.0008, + "step": 1080 + }, + { + "epoch": 0.25946203284932157, + "grad_norm": 0.02674415148794651, + "learning_rate": 4.5675632785844644e-05, + "loss": 0.005, + "step": 1090 + }, + { + "epoch": 0.2618424184717924, + "grad_norm": 0.0756726786494255, + "learning_rate": 4.5635959692136794e-05, + "loss": 0.0008, + "step": 1100 + }, + { + "epoch": 0.2642228040942633, + "grad_norm": 0.18692266941070557, + "learning_rate": 4.559628659842895e-05, + "loss": 0.0021, + "step": 1110 + }, + { + "epoch": 0.26660318971673413, + "grad_norm": 0.021881476044654846, + "learning_rate": 4.55566135047211e-05, + "loss": 0.0016, + "step": 1120 + }, + { + "epoch": 0.26898357533920497, + "grad_norm": 0.16764195263385773, + "learning_rate": 4.551694041101325e-05, + "loss": 0.001, + "step": 1130 + }, + { + "epoch": 0.2713639609616758, + "grad_norm": 0.6519142389297485, + "learning_rate": 4.547726731730541e-05, + "loss": 0.0015, + "step": 1140 + }, + { + "epoch": 0.27374434658414665, + "grad_norm": 0.07793217897415161, + "learning_rate": 4.543759422359756e-05, + "loss": 0.0005, + "step": 1150 + }, + { + "epoch": 0.2761247322066175, + "grad_norm": 0.04451458901166916, + "learning_rate": 4.539792112988971e-05, + "loss": 0.0009, + "step": 1160 + }, + { + "epoch": 0.2785051178290883, + "grad_norm": 0.02606957219541073, + "learning_rate": 4.5358248036181866e-05, + "loss": 0.0013, + "step": 1170 + }, + { + "epoch": 0.28088550345155916, + "grad_norm": 0.03642681613564491, + "learning_rate": 4.531857494247402e-05, + "loss": 0.0007, + "step": 1180 + }, + { + "epoch": 0.28326588907403, + "grad_norm": 0.27240046858787537, + "learning_rate": 4.527890184876617e-05, + "loss": 0.0007, + "step": 1190 + }, + { + "epoch": 0.28564627469650083, + "grad_norm": 0.01732662320137024, + "learning_rate": 4.5239228755058324e-05, + "loss": 0.0011, + "step": 1200 + }, + { + "epoch": 0.2880266603189717, + "grad_norm": 0.10321195423603058, + "learning_rate": 4.5199555661350474e-05, + "loss": 0.0007, + "step": 1210 + }, + { + "epoch": 0.2904070459414425, + "grad_norm": 0.060121580958366394, + "learning_rate": 4.5159882567642625e-05, + "loss": 0.0014, + "step": 1220 + }, + { + "epoch": 0.29278743156391335, + "grad_norm": 0.028955884277820587, + "learning_rate": 4.5120209473934775e-05, + "loss": 0.0007, + "step": 1230 + }, + { + "epoch": 0.2951678171863842, + "grad_norm": 0.0714436024427414, + "learning_rate": 4.508053638022693e-05, + "loss": 0.0007, + "step": 1240 + }, + { + "epoch": 0.297548202808855, + "grad_norm": 0.052230022847652435, + "learning_rate": 4.504086328651909e-05, + "loss": 0.0008, + "step": 1250 + }, + { + "epoch": 0.29992858843132586, + "grad_norm": 0.33476394414901733, + "learning_rate": 4.500119019281124e-05, + "loss": 0.0008, + "step": 1260 + }, + { + "epoch": 0.3023089740537967, + "grad_norm": 0.07732009142637253, + "learning_rate": 4.496151709910339e-05, + "loss": 0.0012, + "step": 1270 + }, + { + "epoch": 0.30468935967626753, + "grad_norm": 0.6843579411506653, + "learning_rate": 4.492184400539554e-05, + "loss": 0.0007, + "step": 1280 + }, + { + "epoch": 0.3070697452987384, + "grad_norm": 0.08292358368635178, + "learning_rate": 4.488217091168769e-05, + "loss": 0.0005, + "step": 1290 + }, + { + "epoch": 0.3094501309212092, + "grad_norm": 0.02598383277654648, + "learning_rate": 4.484249781797985e-05, + "loss": 0.001, + "step": 1300 + }, + { + "epoch": 0.3118305165436801, + "grad_norm": 0.7855332493782043, + "learning_rate": 4.4802824724272005e-05, + "loss": 0.0007, + "step": 1310 + }, + { + "epoch": 0.31421090216615094, + "grad_norm": 0.07066315412521362, + "learning_rate": 4.4763151630564155e-05, + "loss": 0.0005, + "step": 1320 + }, + { + "epoch": 0.3165912877886218, + "grad_norm": 0.012595695443451405, + "learning_rate": 4.4723478536856305e-05, + "loss": 0.0005, + "step": 1330 + }, + { + "epoch": 0.3189716734110926, + "grad_norm": 0.015364304184913635, + "learning_rate": 4.4683805443148455e-05, + "loss": 0.0005, + "step": 1340 + }, + { + "epoch": 0.32135205903356345, + "grad_norm": 0.0556706003844738, + "learning_rate": 4.464413234944061e-05, + "loss": 0.0011, + "step": 1350 + }, + { + "epoch": 0.3237324446560343, + "grad_norm": 0.22568030655384064, + "learning_rate": 4.460445925573277e-05, + "loss": 0.0023, + "step": 1360 + }, + { + "epoch": 0.32611283027850513, + "grad_norm": 0.048404548317193985, + "learning_rate": 4.456478616202492e-05, + "loss": 0.0016, + "step": 1370 + }, + { + "epoch": 0.32849321590097597, + "grad_norm": 0.0693359524011612, + "learning_rate": 4.452511306831707e-05, + "loss": 0.0038, + "step": 1380 + }, + { + "epoch": 0.3308736015234468, + "grad_norm": 0.16493481397628784, + "learning_rate": 4.448543997460922e-05, + "loss": 0.0006, + "step": 1390 + }, + { + "epoch": 0.33325398714591764, + "grad_norm": 1.200024962425232, + "learning_rate": 4.444576688090137e-05, + "loss": 0.0011, + "step": 1400 + }, + { + "epoch": 0.3356343727683885, + "grad_norm": 0.23021258413791656, + "learning_rate": 4.440609378719353e-05, + "loss": 0.0009, + "step": 1410 + }, + { + "epoch": 0.3380147583908593, + "grad_norm": 0.0196574367582798, + "learning_rate": 4.436642069348568e-05, + "loss": 0.0006, + "step": 1420 + }, + { + "epoch": 0.34039514401333015, + "grad_norm": 0.3254101574420929, + "learning_rate": 4.4326747599777835e-05, + "loss": 0.0015, + "step": 1430 + }, + { + "epoch": 0.342775529635801, + "grad_norm": 0.026332201436161995, + "learning_rate": 4.4287074506069986e-05, + "loss": 0.0017, + "step": 1440 + }, + { + "epoch": 0.34515591525827183, + "grad_norm": 0.2679558992385864, + "learning_rate": 4.4247401412362136e-05, + "loss": 0.0012, + "step": 1450 + }, + { + "epoch": 0.34753630088074267, + "grad_norm": 0.06991584599018097, + "learning_rate": 4.420772831865429e-05, + "loss": 0.0007, + "step": 1460 + }, + { + "epoch": 0.3499166865032135, + "grad_norm": 0.036999981850385666, + "learning_rate": 4.416805522494644e-05, + "loss": 0.001, + "step": 1470 + }, + { + "epoch": 0.35229707212568434, + "grad_norm": 0.042684607207775116, + "learning_rate": 4.4128382131238594e-05, + "loss": 0.0009, + "step": 1480 + }, + { + "epoch": 0.3546774577481552, + "grad_norm": 0.013829515315592289, + "learning_rate": 4.408870903753075e-05, + "loss": 0.0008, + "step": 1490 + }, + { + "epoch": 0.357057843370626, + "grad_norm": 0.0129277054220438, + "learning_rate": 4.40490359438229e-05, + "loss": 0.0007, + "step": 1500 + }, + { + "epoch": 0.3594382289930969, + "grad_norm": 0.03553192317485809, + "learning_rate": 4.400936285011505e-05, + "loss": 0.0008, + "step": 1510 + }, + { + "epoch": 0.36181861461556775, + "grad_norm": 0.01258548628538847, + "learning_rate": 4.396968975640721e-05, + "loss": 0.001, + "step": 1520 + }, + { + "epoch": 0.3641990002380386, + "grad_norm": 0.021352197974920273, + "learning_rate": 4.393001666269936e-05, + "loss": 0.001, + "step": 1530 + }, + { + "epoch": 0.3665793858605094, + "grad_norm": 0.035958483815193176, + "learning_rate": 4.389034356899151e-05, + "loss": 0.0007, + "step": 1540 + }, + { + "epoch": 0.36895977148298026, + "grad_norm": 0.013187541626393795, + "learning_rate": 4.3850670475283666e-05, + "loss": 0.0009, + "step": 1550 + }, + { + "epoch": 0.3713401571054511, + "grad_norm": 0.02294233813881874, + "learning_rate": 4.3810997381575816e-05, + "loss": 0.0008, + "step": 1560 + }, + { + "epoch": 0.37372054272792193, + "grad_norm": 0.14476238191127777, + "learning_rate": 4.3771324287867974e-05, + "loss": 0.0005, + "step": 1570 + }, + { + "epoch": 0.37610092835039277, + "grad_norm": 0.2275228053331375, + "learning_rate": 4.3731651194160124e-05, + "loss": 0.0006, + "step": 1580 + }, + { + "epoch": 0.3784813139728636, + "grad_norm": 0.020434999838471413, + "learning_rate": 4.3691978100452274e-05, + "loss": 0.0004, + "step": 1590 + }, + { + "epoch": 0.38086169959533445, + "grad_norm": 0.01040293462574482, + "learning_rate": 4.3652305006744424e-05, + "loss": 0.0003, + "step": 1600 + }, + { + "epoch": 0.3832420852178053, + "grad_norm": 0.0240499097853899, + "learning_rate": 4.3612631913036575e-05, + "loss": 0.0008, + "step": 1610 + }, + { + "epoch": 0.3856224708402761, + "grad_norm": 0.014826517552137375, + "learning_rate": 4.357295881932874e-05, + "loss": 0.0004, + "step": 1620 + }, + { + "epoch": 0.38800285646274696, + "grad_norm": 0.011841246858239174, + "learning_rate": 4.353328572562089e-05, + "loss": 0.0007, + "step": 1630 + }, + { + "epoch": 0.3903832420852178, + "grad_norm": 0.0156678706407547, + "learning_rate": 4.349361263191304e-05, + "loss": 0.0006, + "step": 1640 + }, + { + "epoch": 0.39276362770768863, + "grad_norm": 0.06124578043818474, + "learning_rate": 4.345393953820519e-05, + "loss": 0.0005, + "step": 1650 + }, + { + "epoch": 0.39514401333015947, + "grad_norm": 0.06753918528556824, + "learning_rate": 4.341426644449734e-05, + "loss": 0.0006, + "step": 1660 + }, + { + "epoch": 0.3975243989526303, + "grad_norm": 0.08766347169876099, + "learning_rate": 4.33745933507895e-05, + "loss": 0.0003, + "step": 1670 + }, + { + "epoch": 0.39990478457510115, + "grad_norm": 0.021080242469906807, + "learning_rate": 4.3334920257081654e-05, + "loss": 0.0008, + "step": 1680 + }, + { + "epoch": 0.402285170197572, + "grad_norm": 0.11970046162605286, + "learning_rate": 4.3295247163373804e-05, + "loss": 0.0005, + "step": 1690 + }, + { + "epoch": 0.4046655558200428, + "grad_norm": 0.027210582047700882, + "learning_rate": 4.3255574069665955e-05, + "loss": 0.0003, + "step": 1700 + }, + { + "epoch": 0.4070459414425137, + "grad_norm": 0.021168386563658714, + "learning_rate": 4.3215900975958105e-05, + "loss": 0.0005, + "step": 1710 + }, + { + "epoch": 0.40942632706498455, + "grad_norm": 0.012768070213496685, + "learning_rate": 4.3176227882250255e-05, + "loss": 0.0005, + "step": 1720 + }, + { + "epoch": 0.4118067126874554, + "grad_norm": 0.1276211142539978, + "learning_rate": 4.313655478854241e-05, + "loss": 0.0005, + "step": 1730 + }, + { + "epoch": 0.4141870983099262, + "grad_norm": 0.08978109806776047, + "learning_rate": 4.309688169483457e-05, + "loss": 0.0009, + "step": 1740 + }, + { + "epoch": 0.41656748393239706, + "grad_norm": 0.3068161606788635, + "learning_rate": 4.305720860112672e-05, + "loss": 0.0007, + "step": 1750 + }, + { + "epoch": 0.4189478695548679, + "grad_norm": 0.01211560145020485, + "learning_rate": 4.301753550741887e-05, + "loss": 0.0006, + "step": 1760 + }, + { + "epoch": 0.42132825517733874, + "grad_norm": 0.02517927996814251, + "learning_rate": 4.297786241371102e-05, + "loss": 0.0006, + "step": 1770 + }, + { + "epoch": 0.4237086407998096, + "grad_norm": 0.017450081184506416, + "learning_rate": 4.293818932000318e-05, + "loss": 0.0003, + "step": 1780 + }, + { + "epoch": 0.4260890264222804, + "grad_norm": 0.014250938780605793, + "learning_rate": 4.289851622629533e-05, + "loss": 0.0005, + "step": 1790 + }, + { + "epoch": 0.42846941204475125, + "grad_norm": 0.027526648715138435, + "learning_rate": 4.2858843132587485e-05, + "loss": 0.0005, + "step": 1800 + }, + { + "epoch": 0.4308497976672221, + "grad_norm": 0.0071271262131631374, + "learning_rate": 4.2819170038879635e-05, + "loss": 0.0007, + "step": 1810 + }, + { + "epoch": 0.4332301832896929, + "grad_norm": 0.11835234612226486, + "learning_rate": 4.2779496945171785e-05, + "loss": 0.0005, + "step": 1820 + }, + { + "epoch": 0.43561056891216376, + "grad_norm": 0.016718665137887, + "learning_rate": 4.273982385146394e-05, + "loss": 0.0007, + "step": 1830 + }, + { + "epoch": 0.4379909545346346, + "grad_norm": 0.04138866439461708, + "learning_rate": 4.270015075775609e-05, + "loss": 0.0005, + "step": 1840 + }, + { + "epoch": 0.44037134015710544, + "grad_norm": 0.5920994281768799, + "learning_rate": 4.266047766404824e-05, + "loss": 0.0009, + "step": 1850 + }, + { + "epoch": 0.4427517257795763, + "grad_norm": 0.010394711047410965, + "learning_rate": 4.2620804570340393e-05, + "loss": 0.0004, + "step": 1860 + }, + { + "epoch": 0.4451321114020471, + "grad_norm": 0.031543031334877014, + "learning_rate": 4.258113147663255e-05, + "loss": 0.0005, + "step": 1870 + }, + { + "epoch": 0.44751249702451795, + "grad_norm": 0.016665128991007805, + "learning_rate": 4.25414583829247e-05, + "loss": 0.0083, + "step": 1880 + }, + { + "epoch": 0.4498928826469888, + "grad_norm": 0.03811788931488991, + "learning_rate": 4.250178528921686e-05, + "loss": 0.0014, + "step": 1890 + }, + { + "epoch": 0.4522732682694596, + "grad_norm": 0.0656796246767044, + "learning_rate": 4.246211219550901e-05, + "loss": 0.0008, + "step": 1900 + }, + { + "epoch": 0.4546536538919305, + "grad_norm": 0.011904909275472164, + "learning_rate": 4.242243910180116e-05, + "loss": 0.0006, + "step": 1910 + }, + { + "epoch": 0.45703403951440136, + "grad_norm": 0.01850457303225994, + "learning_rate": 4.238276600809331e-05, + "loss": 0.0004, + "step": 1920 + }, + { + "epoch": 0.4594144251368722, + "grad_norm": 0.10309766978025436, + "learning_rate": 4.2343092914385466e-05, + "loss": 0.0005, + "step": 1930 + }, + { + "epoch": 0.46179481075934303, + "grad_norm": 0.13206863403320312, + "learning_rate": 4.230341982067762e-05, + "loss": 0.0004, + "step": 1940 + }, + { + "epoch": 0.46417519638181387, + "grad_norm": 0.010924161411821842, + "learning_rate": 4.226374672696977e-05, + "loss": 0.0003, + "step": 1950 + }, + { + "epoch": 0.4665555820042847, + "grad_norm": 0.013266120105981827, + "learning_rate": 4.2224073633261924e-05, + "loss": 0.0004, + "step": 1960 + }, + { + "epoch": 0.46893596762675555, + "grad_norm": 0.008552256040275097, + "learning_rate": 4.2184400539554074e-05, + "loss": 0.0003, + "step": 1970 + }, + { + "epoch": 0.4713163532492264, + "grad_norm": 0.0052538709715008736, + "learning_rate": 4.2144727445846224e-05, + "loss": 0.0005, + "step": 1980 + }, + { + "epoch": 0.4736967388716972, + "grad_norm": 0.0074672214686870575, + "learning_rate": 4.210505435213838e-05, + "loss": 0.0003, + "step": 1990 + }, + { + "epoch": 0.47607712449416806, + "grad_norm": 0.5743750929832458, + "learning_rate": 4.206538125843054e-05, + "loss": 0.0005, + "step": 2000 + }, + { + "epoch": 0.4784575101166389, + "grad_norm": 0.0076432847417891026, + "learning_rate": 4.202570816472269e-05, + "loss": 0.0005, + "step": 2010 + }, + { + "epoch": 0.48083789573910973, + "grad_norm": 0.09265641123056412, + "learning_rate": 4.198603507101484e-05, + "loss": 0.0003, + "step": 2020 + }, + { + "epoch": 0.48321828136158057, + "grad_norm": 0.01519245095551014, + "learning_rate": 4.194636197730699e-05, + "loss": 0.0002, + "step": 2030 + }, + { + "epoch": 0.4855986669840514, + "grad_norm": 0.04831220954656601, + "learning_rate": 4.1906688883599146e-05, + "loss": 0.0003, + "step": 2040 + }, + { + "epoch": 0.48797905260652225, + "grad_norm": 0.024797851219773293, + "learning_rate": 4.18670157898913e-05, + "loss": 0.0004, + "step": 2050 + }, + { + "epoch": 0.4903594382289931, + "grad_norm": 0.008994129486382008, + "learning_rate": 4.1827342696183454e-05, + "loss": 0.0002, + "step": 2060 + }, + { + "epoch": 0.4927398238514639, + "grad_norm": 0.00806290004402399, + "learning_rate": 4.1787669602475604e-05, + "loss": 0.0004, + "step": 2070 + }, + { + "epoch": 0.49512020947393476, + "grad_norm": 0.003900889540091157, + "learning_rate": 4.1747996508767754e-05, + "loss": 0.0002, + "step": 2080 + }, + { + "epoch": 0.4975005950964056, + "grad_norm": 0.00262014614418149, + "learning_rate": 4.1708323415059905e-05, + "loss": 0.0002, + "step": 2090 + }, + { + "epoch": 0.49988098071887643, + "grad_norm": 0.30837100744247437, + "learning_rate": 4.166865032135206e-05, + "loss": 0.0004, + "step": 2100 + }, + { + "epoch": 0.5022613663413473, + "grad_norm": 0.5304675102233887, + "learning_rate": 4.162897722764421e-05, + "loss": 0.0003, + "step": 2110 + }, + { + "epoch": 0.5046417519638181, + "grad_norm": 0.3627573847770691, + "learning_rate": 4.158930413393637e-05, + "loss": 0.0043, + "step": 2120 + }, + { + "epoch": 0.507022137586289, + "grad_norm": 0.011327610351145267, + "learning_rate": 4.154963104022852e-05, + "loss": 0.0005, + "step": 2130 + }, + { + "epoch": 0.5094025232087598, + "grad_norm": 0.055182114243507385, + "learning_rate": 4.150995794652067e-05, + "loss": 0.0005, + "step": 2140 + }, + { + "epoch": 0.5117829088312307, + "grad_norm": 0.009911212138831615, + "learning_rate": 4.147028485281283e-05, + "loss": 0.0004, + "step": 2150 + }, + { + "epoch": 0.5141632944537015, + "grad_norm": 0.028569847345352173, + "learning_rate": 4.143061175910498e-05, + "loss": 0.0003, + "step": 2160 + }, + { + "epoch": 0.5165436800761724, + "grad_norm": 0.0070992144756019115, + "learning_rate": 4.139093866539713e-05, + "loss": 0.0006, + "step": 2170 + }, + { + "epoch": 0.5189240656986431, + "grad_norm": 0.008213848806917667, + "learning_rate": 4.1351265571689285e-05, + "loss": 0.0002, + "step": 2180 + }, + { + "epoch": 0.521304451321114, + "grad_norm": 0.018964022397994995, + "learning_rate": 4.1311592477981435e-05, + "loss": 0.0003, + "step": 2190 + }, + { + "epoch": 0.5236848369435848, + "grad_norm": 0.004533541388809681, + "learning_rate": 4.1271919384273585e-05, + "loss": 0.0003, + "step": 2200 + }, + { + "epoch": 0.5260652225660557, + "grad_norm": 0.12422726303339005, + "learning_rate": 4.123224629056574e-05, + "loss": 0.0003, + "step": 2210 + }, + { + "epoch": 0.5284456081885266, + "grad_norm": 0.019521724432706833, + "learning_rate": 4.119257319685789e-05, + "loss": 0.0003, + "step": 2220 + }, + { + "epoch": 0.5308259938109974, + "grad_norm": 0.03547817841172218, + "learning_rate": 4.115290010315004e-05, + "loss": 0.0004, + "step": 2230 + }, + { + "epoch": 0.5332063794334683, + "grad_norm": 0.9750944375991821, + "learning_rate": 4.111322700944219e-05, + "loss": 0.0005, + "step": 2240 + }, + { + "epoch": 0.535586765055939, + "grad_norm": 0.09758254885673523, + "learning_rate": 4.107355391573435e-05, + "loss": 0.0004, + "step": 2250 + }, + { + "epoch": 0.5379671506784099, + "grad_norm": 0.20201332867145538, + "learning_rate": 4.103388082202651e-05, + "loss": 0.0008, + "step": 2260 + }, + { + "epoch": 0.5403475363008807, + "grad_norm": 0.2006085067987442, + "learning_rate": 4.099420772831866e-05, + "loss": 0.0008, + "step": 2270 + }, + { + "epoch": 0.5427279219233516, + "grad_norm": 0.0802696943283081, + "learning_rate": 4.095453463461081e-05, + "loss": 0.0007, + "step": 2280 + }, + { + "epoch": 0.5451083075458224, + "grad_norm": 0.4039531350135803, + "learning_rate": 4.091486154090296e-05, + "loss": 0.0024, + "step": 2290 + }, + { + "epoch": 0.5474886931682933, + "grad_norm": 0.006702470127493143, + "learning_rate": 4.087518844719511e-05, + "loss": 0.0007, + "step": 2300 + }, + { + "epoch": 0.5498690787907641, + "grad_norm": 0.1001976877450943, + "learning_rate": 4.083551535348727e-05, + "loss": 0.0003, + "step": 2310 + }, + { + "epoch": 0.552249464413235, + "grad_norm": 0.005626179743558168, + "learning_rate": 4.079584225977942e-05, + "loss": 0.0009, + "step": 2320 + }, + { + "epoch": 0.5546298500357058, + "grad_norm": 0.009593102149665356, + "learning_rate": 4.075616916607157e-05, + "loss": 0.0003, + "step": 2330 + }, + { + "epoch": 0.5570102356581766, + "grad_norm": 0.014003382995724678, + "learning_rate": 4.0716496072363723e-05, + "loss": 0.0003, + "step": 2340 + }, + { + "epoch": 0.5593906212806474, + "grad_norm": 0.012953966856002808, + "learning_rate": 4.0676822978655874e-05, + "loss": 0.0004, + "step": 2350 + }, + { + "epoch": 0.5617710069031183, + "grad_norm": 0.007770949974656105, + "learning_rate": 4.063714988494803e-05, + "loss": 0.0006, + "step": 2360 + }, + { + "epoch": 0.5641513925255891, + "grad_norm": 0.01227940246462822, + "learning_rate": 4.059747679124019e-05, + "loss": 0.0003, + "step": 2370 + }, + { + "epoch": 0.56653177814806, + "grad_norm": 0.2204684615135193, + "learning_rate": 4.055780369753234e-05, + "loss": 0.0003, + "step": 2380 + }, + { + "epoch": 0.5689121637705308, + "grad_norm": 0.03364790603518486, + "learning_rate": 4.051813060382449e-05, + "loss": 0.0003, + "step": 2390 + }, + { + "epoch": 0.5712925493930017, + "grad_norm": 0.049715492874383926, + "learning_rate": 4.047845751011664e-05, + "loss": 0.0003, + "step": 2400 + }, + { + "epoch": 0.5736729350154725, + "grad_norm": 0.028070533648133278, + "learning_rate": 4.0438784416408796e-05, + "loss": 0.0007, + "step": 2410 + }, + { + "epoch": 0.5760533206379433, + "grad_norm": 0.020421486347913742, + "learning_rate": 4.0399111322700946e-05, + "loss": 0.0009, + "step": 2420 + }, + { + "epoch": 0.5784337062604142, + "grad_norm": 0.010064806789159775, + "learning_rate": 4.0359438228993097e-05, + "loss": 0.0008, + "step": 2430 + }, + { + "epoch": 0.580814091882885, + "grad_norm": 0.3017018735408783, + "learning_rate": 4.0319765135285254e-05, + "loss": 0.001, + "step": 2440 + }, + { + "epoch": 0.5831944775053559, + "grad_norm": 0.20759595930576324, + "learning_rate": 4.0280092041577404e-05, + "loss": 0.0003, + "step": 2450 + }, + { + "epoch": 0.5855748631278267, + "grad_norm": 0.016160350292921066, + "learning_rate": 4.0240418947869554e-05, + "loss": 0.0006, + "step": 2460 + }, + { + "epoch": 0.5879552487502976, + "grad_norm": 0.5293152332305908, + "learning_rate": 4.020074585416171e-05, + "loss": 0.0011, + "step": 2470 + }, + { + "epoch": 0.5903356343727684, + "grad_norm": 0.007493559271097183, + "learning_rate": 4.016107276045386e-05, + "loss": 0.0004, + "step": 2480 + }, + { + "epoch": 0.5927160199952393, + "grad_norm": 0.018649157136678696, + "learning_rate": 4.012139966674601e-05, + "loss": 0.0005, + "step": 2490 + }, + { + "epoch": 0.59509640561771, + "grad_norm": 0.01135182660073042, + "learning_rate": 4.008172657303817e-05, + "loss": 0.0004, + "step": 2500 + }, + { + "epoch": 0.5974767912401809, + "grad_norm": 0.0733335018157959, + "learning_rate": 4.004205347933032e-05, + "loss": 0.0005, + "step": 2510 + }, + { + "epoch": 0.5998571768626517, + "grad_norm": 0.02785026654601097, + "learning_rate": 4.0002380385622476e-05, + "loss": 0.0003, + "step": 2520 + }, + { + "epoch": 0.6022375624851226, + "grad_norm": 0.005258665420114994, + "learning_rate": 3.996270729191463e-05, + "loss": 0.0002, + "step": 2530 + }, + { + "epoch": 0.6046179481075934, + "grad_norm": 0.006735061760991812, + "learning_rate": 3.992303419820678e-05, + "loss": 0.0003, + "step": 2540 + }, + { + "epoch": 0.6069983337300643, + "grad_norm": 0.008341578766703606, + "learning_rate": 3.988336110449893e-05, + "loss": 0.0003, + "step": 2550 + }, + { + "epoch": 0.6093787193525351, + "grad_norm": 0.0027205003425478935, + "learning_rate": 3.9843688010791084e-05, + "loss": 0.0003, + "step": 2560 + }, + { + "epoch": 0.611759104975006, + "grad_norm": 0.01718416064977646, + "learning_rate": 3.9804014917083235e-05, + "loss": 0.0005, + "step": 2570 + }, + { + "epoch": 0.6141394905974767, + "grad_norm": 0.06104213371872902, + "learning_rate": 3.976434182337539e-05, + "loss": 0.0002, + "step": 2580 + }, + { + "epoch": 0.6165198762199476, + "grad_norm": 0.008454731665551662, + "learning_rate": 3.972466872966754e-05, + "loss": 0.0001, + "step": 2590 + }, + { + "epoch": 0.6189002618424184, + "grad_norm": 0.006591182202100754, + "learning_rate": 3.968499563595969e-05, + "loss": 0.0002, + "step": 2600 + }, + { + "epoch": 0.6212806474648893, + "grad_norm": 0.009718428365886211, + "learning_rate": 3.964532254225184e-05, + "loss": 0.0019, + "step": 2610 + }, + { + "epoch": 0.6236610330873602, + "grad_norm": 0.0156183410435915, + "learning_rate": 3.9605649448544e-05, + "loss": 0.0002, + "step": 2620 + }, + { + "epoch": 0.626041418709831, + "grad_norm": 0.012816215865314007, + "learning_rate": 3.956597635483616e-05, + "loss": 0.0008, + "step": 2630 + }, + { + "epoch": 0.6284218043323019, + "grad_norm": 0.0211672130972147, + "learning_rate": 3.952630326112831e-05, + "loss": 0.0002, + "step": 2640 + }, + { + "epoch": 0.6308021899547727, + "grad_norm": 0.012701870873570442, + "learning_rate": 3.948663016742046e-05, + "loss": 0.0003, + "step": 2650 + }, + { + "epoch": 0.6331825755772436, + "grad_norm": 0.008668744005262852, + "learning_rate": 3.944695707371261e-05, + "loss": 0.0002, + "step": 2660 + }, + { + "epoch": 0.6355629611997143, + "grad_norm": 0.020911380648612976, + "learning_rate": 3.940728398000476e-05, + "loss": 0.0004, + "step": 2670 + }, + { + "epoch": 0.6379433468221852, + "grad_norm": 0.0015960232121869922, + "learning_rate": 3.9367610886296915e-05, + "loss": 0.0002, + "step": 2680 + }, + { + "epoch": 0.640323732444656, + "grad_norm": 0.01783674582839012, + "learning_rate": 3.932793779258907e-05, + "loss": 0.0001, + "step": 2690 + }, + { + "epoch": 0.6427041180671269, + "grad_norm": 0.006887937895953655, + "learning_rate": 3.928826469888122e-05, + "loss": 0.0002, + "step": 2700 + }, + { + "epoch": 0.6450845036895977, + "grad_norm": 0.004555295687168837, + "learning_rate": 3.924859160517337e-05, + "loss": 0.0002, + "step": 2710 + }, + { + "epoch": 0.6474648893120686, + "grad_norm": 0.00994735024869442, + "learning_rate": 3.920891851146552e-05, + "loss": 0.0003, + "step": 2720 + }, + { + "epoch": 0.6498452749345394, + "grad_norm": 0.03482622653245926, + "learning_rate": 3.916924541775768e-05, + "loss": 0.0002, + "step": 2730 + }, + { + "epoch": 0.6522256605570103, + "grad_norm": 0.06792888045310974, + "learning_rate": 3.912957232404983e-05, + "loss": 0.0002, + "step": 2740 + }, + { + "epoch": 0.654606046179481, + "grad_norm": 0.02015574462711811, + "learning_rate": 3.908989923034199e-05, + "loss": 0.0008, + "step": 2750 + }, + { + "epoch": 0.6569864318019519, + "grad_norm": 0.07359887659549713, + "learning_rate": 3.905022613663414e-05, + "loss": 0.0003, + "step": 2760 + }, + { + "epoch": 0.6593668174244227, + "grad_norm": 0.006248469930142164, + "learning_rate": 3.901055304292629e-05, + "loss": 0.0002, + "step": 2770 + }, + { + "epoch": 0.6617472030468936, + "grad_norm": 0.01739078015089035, + "learning_rate": 3.897087994921844e-05, + "loss": 0.0002, + "step": 2780 + }, + { + "epoch": 0.6641275886693644, + "grad_norm": 0.008228071965277195, + "learning_rate": 3.8931206855510596e-05, + "loss": 0.0005, + "step": 2790 + }, + { + "epoch": 0.6665079742918353, + "grad_norm": 0.012569721788167953, + "learning_rate": 3.8891533761802746e-05, + "loss": 0.0002, + "step": 2800 + }, + { + "epoch": 0.6688883599143061, + "grad_norm": 0.003245885483920574, + "learning_rate": 3.88518606680949e-05, + "loss": 0.0001, + "step": 2810 + }, + { + "epoch": 0.671268745536777, + "grad_norm": 0.010106906294822693, + "learning_rate": 3.8812187574387053e-05, + "loss": 0.0002, + "step": 2820 + }, + { + "epoch": 0.6736491311592478, + "grad_norm": 0.0013821216998621821, + "learning_rate": 3.8772514480679204e-05, + "loss": 0.0002, + "step": 2830 + }, + { + "epoch": 0.6760295167817186, + "grad_norm": 0.008525123819708824, + "learning_rate": 3.873284138697136e-05, + "loss": 0.0001, + "step": 2840 + }, + { + "epoch": 0.6784099024041895, + "grad_norm": 0.0045269266702234745, + "learning_rate": 3.869316829326351e-05, + "loss": 0.0001, + "step": 2850 + }, + { + "epoch": 0.6807902880266603, + "grad_norm": 0.005178367253392935, + "learning_rate": 3.865349519955566e-05, + "loss": 0.0002, + "step": 2860 + }, + { + "epoch": 0.6831706736491312, + "grad_norm": 0.015604405663907528, + "learning_rate": 3.861382210584781e-05, + "loss": 0.0001, + "step": 2870 + }, + { + "epoch": 0.685551059271602, + "grad_norm": 0.7911249399185181, + "learning_rate": 3.857414901213997e-05, + "loss": 0.0002, + "step": 2880 + }, + { + "epoch": 0.6879314448940729, + "grad_norm": 0.005056778434664011, + "learning_rate": 3.853447591843212e-05, + "loss": 0.0003, + "step": 2890 + }, + { + "epoch": 0.6903118305165437, + "grad_norm": 0.007354553788900375, + "learning_rate": 3.8494802824724276e-05, + "loss": 0.0002, + "step": 2900 + }, + { + "epoch": 0.6926922161390145, + "grad_norm": 0.10069092363119125, + "learning_rate": 3.8455129731016427e-05, + "loss": 0.0003, + "step": 2910 + }, + { + "epoch": 0.6950726017614853, + "grad_norm": 0.007913509383797646, + "learning_rate": 3.841545663730858e-05, + "loss": 0.0005, + "step": 2920 + }, + { + "epoch": 0.6974529873839562, + "grad_norm": 0.04653599485754967, + "learning_rate": 3.837578354360073e-05, + "loss": 0.0005, + "step": 2930 + }, + { + "epoch": 0.699833373006427, + "grad_norm": 0.007795447017997503, + "learning_rate": 3.8336110449892884e-05, + "loss": 0.0002, + "step": 2940 + }, + { + "epoch": 0.7022137586288979, + "grad_norm": 0.0843840092420578, + "learning_rate": 3.829643735618504e-05, + "loss": 0.0008, + "step": 2950 + }, + { + "epoch": 0.7045941442513687, + "grad_norm": 0.019790470600128174, + "learning_rate": 3.825676426247719e-05, + "loss": 0.0004, + "step": 2960 + }, + { + "epoch": 0.7069745298738396, + "grad_norm": 0.04970049858093262, + "learning_rate": 3.821709116876934e-05, + "loss": 0.0008, + "step": 2970 + }, + { + "epoch": 0.7093549154963104, + "grad_norm": 0.011334414593875408, + "learning_rate": 3.817741807506149e-05, + "loss": 0.0003, + "step": 2980 + }, + { + "epoch": 0.7117353011187812, + "grad_norm": 0.12627428770065308, + "learning_rate": 3.813774498135364e-05, + "loss": 0.0006, + "step": 2990 + }, + { + "epoch": 0.714115686741252, + "grad_norm": 0.03299270570278168, + "learning_rate": 3.8098071887645806e-05, + "loss": 0.0006, + "step": 3000 + }, + { + "epoch": 0.7164960723637229, + "grad_norm": 0.014470428228378296, + "learning_rate": 3.805839879393796e-05, + "loss": 0.0002, + "step": 3010 + }, + { + "epoch": 0.7188764579861938, + "grad_norm": 0.010081595741212368, + "learning_rate": 3.801872570023011e-05, + "loss": 0.0002, + "step": 3020 + }, + { + "epoch": 0.7212568436086646, + "grad_norm": 0.006527799181640148, + "learning_rate": 3.797905260652226e-05, + "loss": 0.0005, + "step": 3030 + }, + { + "epoch": 0.7236372292311355, + "grad_norm": 0.025967439636588097, + "learning_rate": 3.793937951281441e-05, + "loss": 0.0003, + "step": 3040 + }, + { + "epoch": 0.7260176148536063, + "grad_norm": 0.012788872234523296, + "learning_rate": 3.7899706419106565e-05, + "loss": 0.0006, + "step": 3050 + }, + { + "epoch": 0.7283980004760772, + "grad_norm": 0.05159073323011398, + "learning_rate": 3.7860033325398715e-05, + "loss": 0.0002, + "step": 3060 + }, + { + "epoch": 0.730778386098548, + "grad_norm": 0.09669562429189682, + "learning_rate": 3.782036023169087e-05, + "loss": 0.0003, + "step": 3070 + }, + { + "epoch": 0.7331587717210188, + "grad_norm": 0.0008232035324908793, + "learning_rate": 3.778068713798302e-05, + "loss": 0.0002, + "step": 3080 + }, + { + "epoch": 0.7355391573434896, + "grad_norm": 0.0026904919650405645, + "learning_rate": 3.774101404427517e-05, + "loss": 0.0008, + "step": 3090 + }, + { + "epoch": 0.7379195429659605, + "grad_norm": 0.22064454853534698, + "learning_rate": 3.770134095056733e-05, + "loss": 0.0001, + "step": 3100 + }, + { + "epoch": 0.7402999285884313, + "grad_norm": 0.0037417325656861067, + "learning_rate": 3.766166785685948e-05, + "loss": 0.0002, + "step": 3110 + }, + { + "epoch": 0.7426803142109022, + "grad_norm": 0.008903945796191692, + "learning_rate": 3.762199476315163e-05, + "loss": 0.0004, + "step": 3120 + }, + { + "epoch": 0.745060699833373, + "grad_norm": 0.01190115325152874, + "learning_rate": 3.758232166944379e-05, + "loss": 0.0003, + "step": 3130 + }, + { + "epoch": 0.7474410854558439, + "grad_norm": 0.005016674287617207, + "learning_rate": 3.754264857573594e-05, + "loss": 0.0002, + "step": 3140 + }, + { + "epoch": 0.7498214710783146, + "grad_norm": 0.009286819957196712, + "learning_rate": 3.750297548202809e-05, + "loss": 0.0002, + "step": 3150 + }, + { + "epoch": 0.7522018567007855, + "grad_norm": 0.06282204389572144, + "learning_rate": 3.7463302388320245e-05, + "loss": 0.0003, + "step": 3160 + }, + { + "epoch": 0.7545822423232563, + "grad_norm": 0.008628441952168941, + "learning_rate": 3.7423629294612396e-05, + "loss": 0.0003, + "step": 3170 + }, + { + "epoch": 0.7569626279457272, + "grad_norm": 0.03511732071638107, + "learning_rate": 3.7383956200904546e-05, + "loss": 0.0001, + "step": 3180 + }, + { + "epoch": 0.759343013568198, + "grad_norm": 0.003294560592621565, + "learning_rate": 3.73442831071967e-05, + "loss": 0.0003, + "step": 3190 + }, + { + "epoch": 0.7617233991906689, + "grad_norm": 0.032009340822696686, + "learning_rate": 3.730461001348885e-05, + "loss": 0.0002, + "step": 3200 + }, + { + "epoch": 0.7641037848131397, + "grad_norm": 0.022615088149905205, + "learning_rate": 3.726493691978101e-05, + "loss": 0.0002, + "step": 3210 + }, + { + "epoch": 0.7664841704356106, + "grad_norm": 0.0026582872960716486, + "learning_rate": 3.722526382607316e-05, + "loss": 0.0001, + "step": 3220 + }, + { + "epoch": 0.7688645560580815, + "grad_norm": 0.3148833215236664, + "learning_rate": 3.718559073236531e-05, + "loss": 0.0002, + "step": 3230 + }, + { + "epoch": 0.7712449416805522, + "grad_norm": 0.03451314941048622, + "learning_rate": 3.714591763865746e-05, + "loss": 0.0002, + "step": 3240 + }, + { + "epoch": 0.7736253273030231, + "grad_norm": 0.008008177392184734, + "learning_rate": 3.710624454494961e-05, + "loss": 0.0001, + "step": 3250 + }, + { + "epoch": 0.7760057129254939, + "grad_norm": 0.07701031118631363, + "learning_rate": 3.706657145124177e-05, + "loss": 0.0005, + "step": 3260 + }, + { + "epoch": 0.7783860985479648, + "grad_norm": 0.010465078055858612, + "learning_rate": 3.7026898357533926e-05, + "loss": 0.0002, + "step": 3270 + }, + { + "epoch": 0.7807664841704356, + "grad_norm": 0.00499736238270998, + "learning_rate": 3.6987225263826076e-05, + "loss": 0.0007, + "step": 3280 + }, + { + "epoch": 0.7831468697929065, + "grad_norm": 0.6453936696052551, + "learning_rate": 3.6947552170118226e-05, + "loss": 0.0003, + "step": 3290 + }, + { + "epoch": 0.7855272554153773, + "grad_norm": 0.016864465549588203, + "learning_rate": 3.690787907641038e-05, + "loss": 0.0003, + "step": 3300 + }, + { + "epoch": 0.7879076410378482, + "grad_norm": 0.05074018985033035, + "learning_rate": 3.6868205982702534e-05, + "loss": 0.0002, + "step": 3310 + }, + { + "epoch": 0.7902880266603189, + "grad_norm": 0.006529835984110832, + "learning_rate": 3.682853288899469e-05, + "loss": 0.0005, + "step": 3320 + }, + { + "epoch": 0.7926684122827898, + "grad_norm": 0.041339557617902756, + "learning_rate": 3.678885979528684e-05, + "loss": 0.0004, + "step": 3330 + }, + { + "epoch": 0.7950487979052606, + "grad_norm": 0.006891661789268255, + "learning_rate": 3.674918670157899e-05, + "loss": 0.0004, + "step": 3340 + }, + { + "epoch": 0.7974291835277315, + "grad_norm": 0.01043302658945322, + "learning_rate": 3.670951360787114e-05, + "loss": 0.0003, + "step": 3350 + }, + { + "epoch": 0.7998095691502023, + "grad_norm": 0.01914358325302601, + "learning_rate": 3.666984051416329e-05, + "loss": 0.0008, + "step": 3360 + }, + { + "epoch": 0.8021899547726732, + "grad_norm": 0.016266925260424614, + "learning_rate": 3.663016742045545e-05, + "loss": 0.0004, + "step": 3370 + }, + { + "epoch": 0.804570340395144, + "grad_norm": 0.005765034817159176, + "learning_rate": 3.6590494326747606e-05, + "loss": 0.0002, + "step": 3380 + }, + { + "epoch": 0.8069507260176149, + "grad_norm": 0.007664472330361605, + "learning_rate": 3.6550821233039757e-05, + "loss": 0.0002, + "step": 3390 + }, + { + "epoch": 0.8093311116400856, + "grad_norm": 0.00499699218198657, + "learning_rate": 3.651114813933191e-05, + "loss": 0.0001, + "step": 3400 + }, + { + "epoch": 0.8117114972625565, + "grad_norm": 0.012575655244290829, + "learning_rate": 3.647147504562406e-05, + "loss": 0.0002, + "step": 3410 + }, + { + "epoch": 0.8140918828850274, + "grad_norm": 0.010001065209507942, + "learning_rate": 3.6431801951916214e-05, + "loss": 0.0005, + "step": 3420 + }, + { + "epoch": 0.8164722685074982, + "grad_norm": 0.06131220981478691, + "learning_rate": 3.6392128858208365e-05, + "loss": 0.0002, + "step": 3430 + }, + { + "epoch": 0.8188526541299691, + "grad_norm": 0.037141721695661545, + "learning_rate": 3.6352455764500515e-05, + "loss": 0.0001, + "step": 3440 + }, + { + "epoch": 0.8212330397524399, + "grad_norm": 0.05955801159143448, + "learning_rate": 3.631278267079267e-05, + "loss": 0.0005, + "step": 3450 + }, + { + "epoch": 0.8236134253749108, + "grad_norm": 0.012499036267399788, + "learning_rate": 3.627310957708482e-05, + "loss": 0.0002, + "step": 3460 + }, + { + "epoch": 0.8259938109973816, + "grad_norm": 0.007782169617712498, + "learning_rate": 3.623343648337697e-05, + "loss": 0.0004, + "step": 3470 + }, + { + "epoch": 0.8283741966198525, + "grad_norm": 0.016740377992391586, + "learning_rate": 3.619376338966913e-05, + "loss": 0.0004, + "step": 3480 + }, + { + "epoch": 0.8307545822423232, + "grad_norm": 0.05157579854130745, + "learning_rate": 3.615409029596128e-05, + "loss": 0.0003, + "step": 3490 + }, + { + "epoch": 0.8331349678647941, + "grad_norm": 0.00816064327955246, + "learning_rate": 3.611441720225343e-05, + "loss": 0.0003, + "step": 3500 + }, + { + "epoch": 0.8355153534872649, + "grad_norm": 0.02470710128545761, + "learning_rate": 3.607474410854559e-05, + "loss": 0.0002, + "step": 3510 + }, + { + "epoch": 0.8378957391097358, + "grad_norm": 0.004836896900087595, + "learning_rate": 3.603507101483774e-05, + "loss": 0.0001, + "step": 3520 + }, + { + "epoch": 0.8402761247322066, + "grad_norm": 0.003796802368015051, + "learning_rate": 3.5995397921129895e-05, + "loss": 0.0002, + "step": 3530 + }, + { + "epoch": 0.8426565103546775, + "grad_norm": 0.006737705785781145, + "learning_rate": 3.5955724827422045e-05, + "loss": 0.0003, + "step": 3540 + }, + { + "epoch": 0.8450368959771483, + "grad_norm": 0.0021388079039752483, + "learning_rate": 3.5916051733714195e-05, + "loss": 0.0001, + "step": 3550 + }, + { + "epoch": 0.8474172815996192, + "grad_norm": 0.047663912177085876, + "learning_rate": 3.5876378640006346e-05, + "loss": 0.0001, + "step": 3560 + }, + { + "epoch": 0.8497976672220899, + "grad_norm": 0.015320863574743271, + "learning_rate": 3.58367055462985e-05, + "loss": 0.0002, + "step": 3570 + }, + { + "epoch": 0.8521780528445608, + "grad_norm": 0.008627827279269695, + "learning_rate": 3.579703245259065e-05, + "loss": 0.0003, + "step": 3580 + }, + { + "epoch": 0.8545584384670316, + "grad_norm": 0.0034904240164905787, + "learning_rate": 3.575735935888281e-05, + "loss": 0.0001, + "step": 3590 + }, + { + "epoch": 0.8569388240895025, + "grad_norm": 0.01078026182949543, + "learning_rate": 3.571768626517496e-05, + "loss": 0.0002, + "step": 3600 + }, + { + "epoch": 0.8593192097119733, + "grad_norm": 0.011285877786576748, + "learning_rate": 3.567801317146711e-05, + "loss": 0.0007, + "step": 3610 + }, + { + "epoch": 0.8616995953344442, + "grad_norm": 0.005885743070393801, + "learning_rate": 3.563834007775926e-05, + "loss": 0.0003, + "step": 3620 + }, + { + "epoch": 0.8640799809569151, + "grad_norm": 0.1011798158288002, + "learning_rate": 3.559866698405142e-05, + "loss": 0.0002, + "step": 3630 + }, + { + "epoch": 0.8664603665793859, + "grad_norm": 0.012861615046858788, + "learning_rate": 3.5558993890343575e-05, + "loss": 0.0002, + "step": 3640 + }, + { + "epoch": 0.8688407522018567, + "grad_norm": 0.009324765764176846, + "learning_rate": 3.5519320796635726e-05, + "loss": 0.0007, + "step": 3650 + }, + { + "epoch": 0.8712211378243275, + "grad_norm": 0.0035065708216279745, + "learning_rate": 3.5479647702927876e-05, + "loss": 0.0004, + "step": 3660 + }, + { + "epoch": 0.8736015234467984, + "grad_norm": 0.010472165420651436, + "learning_rate": 3.5439974609220026e-05, + "loss": 0.0001, + "step": 3670 + }, + { + "epoch": 0.8759819090692692, + "grad_norm": 0.009073158726096153, + "learning_rate": 3.5400301515512176e-05, + "loss": 0.0001, + "step": 3680 + }, + { + "epoch": 0.8783622946917401, + "grad_norm": 0.0028665116988122463, + "learning_rate": 3.5360628421804334e-05, + "loss": 0.0001, + "step": 3690 + }, + { + "epoch": 0.8807426803142109, + "grad_norm": 0.009178753942251205, + "learning_rate": 3.532095532809649e-05, + "loss": 0.0003, + "step": 3700 + }, + { + "epoch": 0.8831230659366818, + "grad_norm": 0.007954353466629982, + "learning_rate": 3.528128223438864e-05, + "loss": 0.0006, + "step": 3710 + }, + { + "epoch": 0.8855034515591526, + "grad_norm": 0.009399271570146084, + "learning_rate": 3.524160914068079e-05, + "loss": 0.0002, + "step": 3720 + }, + { + "epoch": 0.8878838371816234, + "grad_norm": 0.0035749957896769047, + "learning_rate": 3.520193604697294e-05, + "loss": 0.0001, + "step": 3730 + }, + { + "epoch": 0.8902642228040942, + "grad_norm": 0.007753758691251278, + "learning_rate": 3.51622629532651e-05, + "loss": 0.0001, + "step": 3740 + }, + { + "epoch": 0.8926446084265651, + "grad_norm": 0.007471214048564434, + "learning_rate": 3.512258985955725e-05, + "loss": 0.0003, + "step": 3750 + }, + { + "epoch": 0.8950249940490359, + "grad_norm": 0.016612932085990906, + "learning_rate": 3.5082916765849406e-05, + "loss": 0.0001, + "step": 3760 + }, + { + "epoch": 0.8974053796715068, + "grad_norm": 0.008320000022649765, + "learning_rate": 3.5043243672141556e-05, + "loss": 0.0001, + "step": 3770 + }, + { + "epoch": 0.8997857652939776, + "grad_norm": 0.010242090560495853, + "learning_rate": 3.500357057843371e-05, + "loss": 0.0001, + "step": 3780 + }, + { + "epoch": 0.9021661509164485, + "grad_norm": 0.0036350861191749573, + "learning_rate": 3.4963897484725864e-05, + "loss": 0.0001, + "step": 3790 + }, + { + "epoch": 0.9045465365389193, + "grad_norm": 0.002153201960027218, + "learning_rate": 3.4924224391018014e-05, + "loss": 0.0002, + "step": 3800 + }, + { + "epoch": 0.9069269221613901, + "grad_norm": 0.003587006125599146, + "learning_rate": 3.4884551297310164e-05, + "loss": 0.0002, + "step": 3810 + }, + { + "epoch": 0.909307307783861, + "grad_norm": 0.006511629093438387, + "learning_rate": 3.4844878203602315e-05, + "loss": 0.0002, + "step": 3820 + }, + { + "epoch": 0.9116876934063318, + "grad_norm": 0.008945467881858349, + "learning_rate": 3.480520510989447e-05, + "loss": 0.0001, + "step": 3830 + }, + { + "epoch": 0.9140680790288027, + "grad_norm": 0.006604051683098078, + "learning_rate": 3.476553201618662e-05, + "loss": 0.0001, + "step": 3840 + }, + { + "epoch": 0.9164484646512735, + "grad_norm": 0.0031156474724411964, + "learning_rate": 3.472585892247878e-05, + "loss": 0.0003, + "step": 3850 + }, + { + "epoch": 0.9188288502737444, + "grad_norm": 0.005195919424295425, + "learning_rate": 3.468618582877093e-05, + "loss": 0.0001, + "step": 3860 + }, + { + "epoch": 0.9212092358962152, + "grad_norm": 0.008878687396645546, + "learning_rate": 3.464651273506308e-05, + "loss": 0.0001, + "step": 3870 + }, + { + "epoch": 0.9235896215186861, + "grad_norm": 0.0020940713584423065, + "learning_rate": 3.460683964135523e-05, + "loss": 0.0001, + "step": 3880 + }, + { + "epoch": 0.9259700071411568, + "grad_norm": 0.0066345930099487305, + "learning_rate": 3.456716654764739e-05, + "loss": 0.0001, + "step": 3890 + }, + { + "epoch": 0.9283503927636277, + "grad_norm": 0.0018133444245904684, + "learning_rate": 3.4527493453939544e-05, + "loss": 0.0001, + "step": 3900 + }, + { + "epoch": 0.9307307783860985, + "grad_norm": 0.000830967677757144, + "learning_rate": 3.4487820360231695e-05, + "loss": 0.0001, + "step": 3910 + }, + { + "epoch": 0.9331111640085694, + "grad_norm": 0.0037288174498826265, + "learning_rate": 3.4448147266523845e-05, + "loss": 0.0001, + "step": 3920 + }, + { + "epoch": 0.9354915496310402, + "grad_norm": 0.005838675890117884, + "learning_rate": 3.4408474172815995e-05, + "loss": 0.0003, + "step": 3930 + }, + { + "epoch": 0.9378719352535111, + "grad_norm": 0.008044001646339893, + "learning_rate": 3.4368801079108145e-05, + "loss": 0.0002, + "step": 3940 + }, + { + "epoch": 0.9402523208759819, + "grad_norm": 0.07016938179731369, + "learning_rate": 3.43291279854003e-05, + "loss": 0.0005, + "step": 3950 + }, + { + "epoch": 0.9426327064984528, + "grad_norm": 0.11337173730134964, + "learning_rate": 3.428945489169246e-05, + "loss": 0.0002, + "step": 3960 + }, + { + "epoch": 0.9450130921209235, + "grad_norm": 0.0017598132835701108, + "learning_rate": 3.424978179798461e-05, + "loss": 0.0003, + "step": 3970 + }, + { + "epoch": 0.9473934777433944, + "grad_norm": 0.030149806290864944, + "learning_rate": 3.421010870427676e-05, + "loss": 0.0003, + "step": 3980 + }, + { + "epoch": 0.9497738633658652, + "grad_norm": 0.11280670762062073, + "learning_rate": 3.417043561056891e-05, + "loss": 0.0003, + "step": 3990 + }, + { + "epoch": 0.9521542489883361, + "grad_norm": 0.02797405980527401, + "learning_rate": 3.413076251686107e-05, + "loss": 0.0003, + "step": 4000 + }, + { + "epoch": 0.9545346346108069, + "grad_norm": 0.009325963445007801, + "learning_rate": 3.4091089423153225e-05, + "loss": 0.0002, + "step": 4010 + }, + { + "epoch": 0.9569150202332778, + "grad_norm": 0.015098505653440952, + "learning_rate": 3.4051416329445375e-05, + "loss": 0.0002, + "step": 4020 + }, + { + "epoch": 0.9592954058557487, + "grad_norm": 0.0010631170589476824, + "learning_rate": 3.4011743235737525e-05, + "loss": 0.0002, + "step": 4030 + }, + { + "epoch": 0.9616757914782195, + "grad_norm": 0.11537562310695648, + "learning_rate": 3.3972070142029676e-05, + "loss": 0.0004, + "step": 4040 + }, + { + "epoch": 0.9640561771006904, + "grad_norm": 0.055657465010881424, + "learning_rate": 3.3932397048321826e-05, + "loss": 0.0002, + "step": 4050 + }, + { + "epoch": 0.9664365627231611, + "grad_norm": 0.004681292921304703, + "learning_rate": 3.389272395461398e-05, + "loss": 0.0001, + "step": 4060 + }, + { + "epoch": 0.968816948345632, + "grad_norm": 0.0036875929217785597, + "learning_rate": 3.385305086090613e-05, + "loss": 0.0003, + "step": 4070 + }, + { + "epoch": 0.9711973339681028, + "grad_norm": 0.3181780278682709, + "learning_rate": 3.381337776719829e-05, + "loss": 0.0002, + "step": 4080 + }, + { + "epoch": 0.9735777195905737, + "grad_norm": 0.008175074122846127, + "learning_rate": 3.377370467349044e-05, + "loss": 0.0001, + "step": 4090 + }, + { + "epoch": 0.9759581052130445, + "grad_norm": 0.008897043764591217, + "learning_rate": 3.373403157978259e-05, + "loss": 0.0002, + "step": 4100 + }, + { + "epoch": 0.9783384908355154, + "grad_norm": 0.005149902775883675, + "learning_rate": 3.369435848607475e-05, + "loss": 0.0001, + "step": 4110 + }, + { + "epoch": 0.9807188764579862, + "grad_norm": 0.005102005321532488, + "learning_rate": 3.36546853923669e-05, + "loss": 0.0001, + "step": 4120 + }, + { + "epoch": 0.9830992620804571, + "grad_norm": 0.003907215781509876, + "learning_rate": 3.361501229865905e-05, + "loss": 0.0001, + "step": 4130 + }, + { + "epoch": 0.9854796477029278, + "grad_norm": 0.006176768336445093, + "learning_rate": 3.3575339204951206e-05, + "loss": 0.0001, + "step": 4140 + }, + { + "epoch": 0.9878600333253987, + "grad_norm": 0.007574237417429686, + "learning_rate": 3.3535666111243356e-05, + "loss": 0.0001, + "step": 4150 + }, + { + "epoch": 0.9902404189478695, + "grad_norm": 0.0036479670088738203, + "learning_rate": 3.3495993017535506e-05, + "loss": 0.0001, + "step": 4160 + }, + { + "epoch": 0.9926208045703404, + "grad_norm": 0.0031234126072376966, + "learning_rate": 3.3456319923827664e-05, + "loss": 0.0001, + "step": 4170 + }, + { + "epoch": 0.9950011901928112, + "grad_norm": 0.015276722609996796, + "learning_rate": 3.3416646830119814e-05, + "loss": 0.0004, + "step": 4180 + }, + { + "epoch": 0.9973815758152821, + "grad_norm": 0.015308289788663387, + "learning_rate": 3.3376973736411964e-05, + "loss": 0.0002, + "step": 4190 + }, + { + "epoch": 0.9997619614377529, + "grad_norm": 1.1039026975631714, + "learning_rate": 3.333730064270412e-05, + "loss": 0.0034, + "step": 4200 + }, + { + "epoch": 1.0, + "eval_loss": 6.8751428443647455e-06, + "eval_runtime": 52.0551, + "eval_samples_per_second": 35.885, + "eval_steps_per_second": 8.971, + "step": 4201 + }, + { + "epoch": 1.0021423470602238, + "grad_norm": 0.4035731852054596, + "learning_rate": 3.329762754899627e-05, + "loss": 0.001, + "step": 4210 + }, + { + "epoch": 1.0045227326826947, + "grad_norm": 0.058116745203733444, + "learning_rate": 3.325795445528843e-05, + "loss": 0.0006, + "step": 4220 + }, + { + "epoch": 1.0069031183051655, + "grad_norm": 0.030079133808612823, + "learning_rate": 3.321828136158058e-05, + "loss": 0.0005, + "step": 4230 + }, + { + "epoch": 1.0092835039276362, + "grad_norm": 0.03710814565420151, + "learning_rate": 3.317860826787273e-05, + "loss": 0.0008, + "step": 4240 + }, + { + "epoch": 1.011663889550107, + "grad_norm": 0.25699111819267273, + "learning_rate": 3.313893517416488e-05, + "loss": 0.0003, + "step": 4250 + }, + { + "epoch": 1.014044275172578, + "grad_norm": 0.01729218102991581, + "learning_rate": 3.309926208045703e-05, + "loss": 0.0004, + "step": 4260 + }, + { + "epoch": 1.016424660795049, + "grad_norm": 0.004348506219685078, + "learning_rate": 3.3059588986749194e-05, + "loss": 0.0003, + "step": 4270 + }, + { + "epoch": 1.0188050464175196, + "grad_norm": 0.008898822590708733, + "learning_rate": 3.3019915893041344e-05, + "loss": 0.0002, + "step": 4280 + }, + { + "epoch": 1.0211854320399905, + "grad_norm": 0.021421125158667564, + "learning_rate": 3.2980242799333494e-05, + "loss": 0.0002, + "step": 4290 + }, + { + "epoch": 1.0235658176624614, + "grad_norm": 0.09812607616186142, + "learning_rate": 3.2940569705625645e-05, + "loss": 0.0003, + "step": 4300 + }, + { + "epoch": 1.0259462032849322, + "grad_norm": 0.00921029131859541, + "learning_rate": 3.2900896611917795e-05, + "loss": 0.0003, + "step": 4310 + }, + { + "epoch": 1.028326588907403, + "grad_norm": 0.18005193769931793, + "learning_rate": 3.286122351820995e-05, + "loss": 0.0009, + "step": 4320 + }, + { + "epoch": 1.0307069745298738, + "grad_norm": 0.022728268057107925, + "learning_rate": 3.282155042450211e-05, + "loss": 0.001, + "step": 4330 + }, + { + "epoch": 1.0330873601523447, + "grad_norm": 0.002307797549292445, + "learning_rate": 3.278187733079426e-05, + "loss": 0.0001, + "step": 4340 + }, + { + "epoch": 1.0354677457748156, + "grad_norm": 0.09381233900785446, + "learning_rate": 3.274220423708641e-05, + "loss": 0.0001, + "step": 4350 + }, + { + "epoch": 1.0378481313972863, + "grad_norm": 0.30725282430648804, + "learning_rate": 3.270253114337856e-05, + "loss": 0.0006, + "step": 4360 + }, + { + "epoch": 1.0402285170197572, + "grad_norm": 0.0028942192438989878, + "learning_rate": 3.266285804967071e-05, + "loss": 0.0002, + "step": 4370 + }, + { + "epoch": 1.042608902642228, + "grad_norm": 0.07420436292886734, + "learning_rate": 3.262318495596287e-05, + "loss": 0.0001, + "step": 4380 + }, + { + "epoch": 1.044989288264699, + "grad_norm": 0.0038873206358402967, + "learning_rate": 3.2583511862255025e-05, + "loss": 0.0004, + "step": 4390 + }, + { + "epoch": 1.0473696738871696, + "grad_norm": 0.00487096281722188, + "learning_rate": 3.2543838768547175e-05, + "loss": 0.0001, + "step": 4400 + }, + { + "epoch": 1.0497500595096405, + "grad_norm": 0.00458755437284708, + "learning_rate": 3.2504165674839325e-05, + "loss": 0.0001, + "step": 4410 + }, + { + "epoch": 1.0521304451321114, + "grad_norm": 0.003127770032733679, + "learning_rate": 3.2464492581131475e-05, + "loss": 0.0001, + "step": 4420 + }, + { + "epoch": 1.0545108307545823, + "grad_norm": 0.0036109236534684896, + "learning_rate": 3.242481948742363e-05, + "loss": 0.0003, + "step": 4430 + }, + { + "epoch": 1.0568912163770532, + "grad_norm": 0.01696913130581379, + "learning_rate": 3.238514639371578e-05, + "loss": 0.0001, + "step": 4440 + }, + { + "epoch": 1.0592716019995239, + "grad_norm": 0.0007638796814717352, + "learning_rate": 3.234547330000793e-05, + "loss": 0.0003, + "step": 4450 + }, + { + "epoch": 1.0616519876219948, + "grad_norm": 0.005359685514122248, + "learning_rate": 3.230580020630009e-05, + "loss": 0.0001, + "step": 4460 + }, + { + "epoch": 1.0640323732444656, + "grad_norm": 0.008990432135760784, + "learning_rate": 3.226612711259224e-05, + "loss": 0.0002, + "step": 4470 + }, + { + "epoch": 1.0664127588669365, + "grad_norm": 0.004698805510997772, + "learning_rate": 3.22264540188844e-05, + "loss": 0.0001, + "step": 4480 + }, + { + "epoch": 1.0687931444894072, + "grad_norm": 0.07380379736423492, + "learning_rate": 3.218678092517655e-05, + "loss": 0.0005, + "step": 4490 + }, + { + "epoch": 1.071173530111878, + "grad_norm": 0.0072670914232730865, + "learning_rate": 3.21471078314687e-05, + "loss": 0.0001, + "step": 4500 + }, + { + "epoch": 1.073553915734349, + "grad_norm": 0.003431397257372737, + "learning_rate": 3.210743473776085e-05, + "loss": 0.0001, + "step": 4510 + }, + { + "epoch": 1.0759343013568199, + "grad_norm": 0.012710604816675186, + "learning_rate": 3.2067761644053006e-05, + "loss": 0.0001, + "step": 4520 + }, + { + "epoch": 1.0783146869792906, + "grad_norm": 0.0029263871256262064, + "learning_rate": 3.2028088550345156e-05, + "loss": 0.0001, + "step": 4530 + }, + { + "epoch": 1.0806950726017615, + "grad_norm": 0.0013361535966396332, + "learning_rate": 3.198841545663731e-05, + "loss": 0.0001, + "step": 4540 + }, + { + "epoch": 1.0830754582242323, + "grad_norm": 0.0027455012314021587, + "learning_rate": 3.194874236292946e-05, + "loss": 0.0001, + "step": 4550 + }, + { + "epoch": 1.0854558438467032, + "grad_norm": 0.0015189964324235916, + "learning_rate": 3.1909069269221614e-05, + "loss": 0.0002, + "step": 4560 + }, + { + "epoch": 1.087836229469174, + "grad_norm": 0.019486431032419205, + "learning_rate": 3.1869396175513764e-05, + "loss": 0.0001, + "step": 4570 + }, + { + "epoch": 1.0902166150916448, + "grad_norm": 0.009100046940147877, + "learning_rate": 3.182972308180592e-05, + "loss": 0.0002, + "step": 4580 + }, + { + "epoch": 1.0925970007141157, + "grad_norm": 0.6804227828979492, + "learning_rate": 3.179004998809808e-05, + "loss": 0.0008, + "step": 4590 + }, + { + "epoch": 1.0949773863365866, + "grad_norm": 0.004166084341704845, + "learning_rate": 3.175037689439023e-05, + "loss": 0.0004, + "step": 4600 + }, + { + "epoch": 1.0973577719590573, + "grad_norm": 0.0014277161099016666, + "learning_rate": 3.171070380068238e-05, + "loss": 0.0005, + "step": 4610 + }, + { + "epoch": 1.0997381575815282, + "grad_norm": 0.02292274497449398, + "learning_rate": 3.167103070697453e-05, + "loss": 0.0001, + "step": 4620 + }, + { + "epoch": 1.102118543203999, + "grad_norm": 0.006580695044249296, + "learning_rate": 3.163135761326668e-05, + "loss": 0.0002, + "step": 4630 + }, + { + "epoch": 1.10449892882647, + "grad_norm": 0.005075294058769941, + "learning_rate": 3.1591684519558836e-05, + "loss": 0.0002, + "step": 4640 + }, + { + "epoch": 1.1068793144489408, + "grad_norm": 0.0034661772660911083, + "learning_rate": 3.1552011425850994e-05, + "loss": 0.0003, + "step": 4650 + }, + { + "epoch": 1.1092597000714115, + "grad_norm": 0.0035978129599243402, + "learning_rate": 3.1512338332143144e-05, + "loss": 0.0002, + "step": 4660 + }, + { + "epoch": 1.1116400856938824, + "grad_norm": 0.01178679708391428, + "learning_rate": 3.1472665238435294e-05, + "loss": 0.0001, + "step": 4670 + }, + { + "epoch": 1.1140204713163533, + "grad_norm": 0.0021494280081242323, + "learning_rate": 3.1432992144727444e-05, + "loss": 0.0003, + "step": 4680 + }, + { + "epoch": 1.1164008569388242, + "grad_norm": 0.00325006153434515, + "learning_rate": 3.13933190510196e-05, + "loss": 0.0001, + "step": 4690 + }, + { + "epoch": 1.1187812425612949, + "grad_norm": 0.006323399022221565, + "learning_rate": 3.135364595731175e-05, + "loss": 0.0002, + "step": 4700 + }, + { + "epoch": 1.1211616281837657, + "grad_norm": 0.006911338306963444, + "learning_rate": 3.131397286360391e-05, + "loss": 0.0001, + "step": 4710 + }, + { + "epoch": 1.1235420138062366, + "grad_norm": 0.0032435038592666388, + "learning_rate": 3.127429976989606e-05, + "loss": 0.0001, + "step": 4720 + }, + { + "epoch": 1.1259223994287075, + "grad_norm": 0.18325313925743103, + "learning_rate": 3.123462667618821e-05, + "loss": 0.0002, + "step": 4730 + }, + { + "epoch": 1.1283027850511782, + "grad_norm": 0.12742838263511658, + "learning_rate": 3.119495358248036e-05, + "loss": 0.0004, + "step": 4740 + }, + { + "epoch": 1.130683170673649, + "grad_norm": 0.001981141045689583, + "learning_rate": 3.115528048877252e-05, + "loss": 0.0002, + "step": 4750 + }, + { + "epoch": 1.13306355629612, + "grad_norm": 0.0030578586738556623, + "learning_rate": 3.111560739506467e-05, + "loss": 0.0001, + "step": 4760 + }, + { + "epoch": 1.1354439419185909, + "grad_norm": 0.00284597952850163, + "learning_rate": 3.1075934301356824e-05, + "loss": 0.0001, + "step": 4770 + }, + { + "epoch": 1.1378243275410616, + "grad_norm": 0.023655202239751816, + "learning_rate": 3.1036261207648975e-05, + "loss": 0.0002, + "step": 4780 + }, + { + "epoch": 1.1402047131635324, + "grad_norm": 0.008493321016430855, + "learning_rate": 3.0996588113941125e-05, + "loss": 0.0002, + "step": 4790 + }, + { + "epoch": 1.1425850987860033, + "grad_norm": 0.0038551143370568752, + "learning_rate": 3.095691502023328e-05, + "loss": 0.0001, + "step": 4800 + }, + { + "epoch": 1.1449654844084742, + "grad_norm": 0.0014539804542437196, + "learning_rate": 3.091724192652543e-05, + "loss": 0.0001, + "step": 4810 + }, + { + "epoch": 1.1473458700309451, + "grad_norm": 0.0026364317163825035, + "learning_rate": 3.087756883281758e-05, + "loss": 0.0006, + "step": 4820 + }, + { + "epoch": 1.1497262556534158, + "grad_norm": 0.0010660483967512846, + "learning_rate": 3.083789573910973e-05, + "loss": 0.0001, + "step": 4830 + }, + { + "epoch": 1.1521066412758867, + "grad_norm": 0.005250291433185339, + "learning_rate": 3.079822264540189e-05, + "loss": 0.0013, + "step": 4840 + }, + { + "epoch": 1.1544870268983576, + "grad_norm": 0.0824214443564415, + "learning_rate": 3.075854955169404e-05, + "loss": 0.0002, + "step": 4850 + }, + { + "epoch": 1.1568674125208283, + "grad_norm": 0.003175609977915883, + "learning_rate": 3.07188764579862e-05, + "loss": 0.0006, + "step": 4860 + }, + { + "epoch": 1.1592477981432991, + "grad_norm": 0.0015882077859714627, + "learning_rate": 3.067920336427835e-05, + "loss": 0.0001, + "step": 4870 + }, + { + "epoch": 1.16162818376577, + "grad_norm": 0.003802343737334013, + "learning_rate": 3.06395302705705e-05, + "loss": 0.0001, + "step": 4880 + }, + { + "epoch": 1.164008569388241, + "grad_norm": 0.002745629521086812, + "learning_rate": 3.059985717686265e-05, + "loss": 0.0004, + "step": 4890 + }, + { + "epoch": 1.1663889550107118, + "grad_norm": 0.006173206493258476, + "learning_rate": 3.0560184083154805e-05, + "loss": 0.0001, + "step": 4900 + }, + { + "epoch": 1.1687693406331825, + "grad_norm": 0.006407946813851595, + "learning_rate": 3.052051098944696e-05, + "loss": 0.0001, + "step": 4910 + }, + { + "epoch": 1.1711497262556534, + "grad_norm": 0.017478201538324356, + "learning_rate": 3.0480837895739113e-05, + "loss": 0.0003, + "step": 4920 + }, + { + "epoch": 1.1735301118781243, + "grad_norm": 0.0035310271196067333, + "learning_rate": 3.0441164802031263e-05, + "loss": 0.0001, + "step": 4930 + }, + { + "epoch": 1.1759104975005952, + "grad_norm": 0.0057274349965155125, + "learning_rate": 3.0401491708323417e-05, + "loss": 0.0001, + "step": 4940 + }, + { + "epoch": 1.1782908831230658, + "grad_norm": 0.013580716215074062, + "learning_rate": 3.0361818614615567e-05, + "loss": 0.0001, + "step": 4950 + }, + { + "epoch": 1.1806712687455367, + "grad_norm": 0.005545695312321186, + "learning_rate": 3.0322145520907724e-05, + "loss": 0.0001, + "step": 4960 + }, + { + "epoch": 1.1830516543680076, + "grad_norm": 0.001243342412635684, + "learning_rate": 3.0282472427199875e-05, + "loss": 0.0001, + "step": 4970 + }, + { + "epoch": 1.1854320399904785, + "grad_norm": 0.004315751604735851, + "learning_rate": 3.0242799333492028e-05, + "loss": 0.0001, + "step": 4980 + }, + { + "epoch": 1.1878124256129494, + "grad_norm": 0.0020015877671539783, + "learning_rate": 3.020312623978418e-05, + "loss": 0.0001, + "step": 4990 + }, + { + "epoch": 1.19019281123542, + "grad_norm": 0.0013068486005067825, + "learning_rate": 3.0163453146076332e-05, + "loss": 0.0001, + "step": 5000 + }, + { + "epoch": 1.192573196857891, + "grad_norm": 0.0020259215962141752, + "learning_rate": 3.0123780052368483e-05, + "loss": 0.0001, + "step": 5010 + }, + { + "epoch": 1.1949535824803619, + "grad_norm": 0.00229440163820982, + "learning_rate": 3.008410695866064e-05, + "loss": 0.0001, + "step": 5020 + }, + { + "epoch": 1.1973339681028325, + "grad_norm": 0.006487131118774414, + "learning_rate": 3.0044433864952793e-05, + "loss": 0.0, + "step": 5030 + }, + { + "epoch": 1.1997143537253034, + "grad_norm": 0.0029580420814454556, + "learning_rate": 3.0004760771244944e-05, + "loss": 0.0001, + "step": 5040 + }, + { + "epoch": 1.2020947393477743, + "grad_norm": 0.004215626046061516, + "learning_rate": 2.9965087677537097e-05, + "loss": 0.0002, + "step": 5050 + }, + { + "epoch": 1.2044751249702452, + "grad_norm": 0.0045689307153224945, + "learning_rate": 2.9925414583829248e-05, + "loss": 0.0001, + "step": 5060 + }, + { + "epoch": 1.2068555105927161, + "grad_norm": 0.0018343930132687092, + "learning_rate": 2.9885741490121398e-05, + "loss": 0.0001, + "step": 5070 + }, + { + "epoch": 1.2092358962151868, + "grad_norm": 0.21103504300117493, + "learning_rate": 2.984606839641355e-05, + "loss": 0.0003, + "step": 5080 + }, + { + "epoch": 1.2116162818376577, + "grad_norm": 0.04271009564399719, + "learning_rate": 2.980639530270571e-05, + "loss": 0.0003, + "step": 5090 + }, + { + "epoch": 1.2139966674601286, + "grad_norm": 0.008761608973145485, + "learning_rate": 2.976672220899786e-05, + "loss": 0.0002, + "step": 5100 + }, + { + "epoch": 1.2163770530825995, + "grad_norm": 0.002944928128272295, + "learning_rate": 2.9727049115290013e-05, + "loss": 0.0004, + "step": 5110 + }, + { + "epoch": 1.2187574387050701, + "grad_norm": 0.0039098006673157215, + "learning_rate": 2.9687376021582163e-05, + "loss": 0.0001, + "step": 5120 + }, + { + "epoch": 1.221137824327541, + "grad_norm": 0.007188912481069565, + "learning_rate": 2.9647702927874317e-05, + "loss": 0.0003, + "step": 5130 + }, + { + "epoch": 1.223518209950012, + "grad_norm": 0.0020366155076771975, + "learning_rate": 2.9608029834166467e-05, + "loss": 0.0001, + "step": 5140 + }, + { + "epoch": 1.2258985955724828, + "grad_norm": 0.0052825105376541615, + "learning_rate": 2.9568356740458624e-05, + "loss": 0.0001, + "step": 5150 + }, + { + "epoch": 1.2282789811949535, + "grad_norm": 0.0322733074426651, + "learning_rate": 2.9528683646750778e-05, + "loss": 0.0002, + "step": 5160 + }, + { + "epoch": 1.2306593668174244, + "grad_norm": 0.0030191782861948013, + "learning_rate": 2.9489010553042928e-05, + "loss": 0.0001, + "step": 5170 + }, + { + "epoch": 1.2330397524398953, + "grad_norm": 0.0158090703189373, + "learning_rate": 2.944933745933508e-05, + "loss": 0.0001, + "step": 5180 + }, + { + "epoch": 1.2354201380623662, + "grad_norm": 0.0023131452035158873, + "learning_rate": 2.9409664365627232e-05, + "loss": 0.0002, + "step": 5190 + }, + { + "epoch": 1.2378005236848368, + "grad_norm": 0.0010957660852000117, + "learning_rate": 2.9369991271919382e-05, + "loss": 0.0001, + "step": 5200 + }, + { + "epoch": 1.2401809093073077, + "grad_norm": 0.006251092534512281, + "learning_rate": 2.933031817821154e-05, + "loss": 0.0001, + "step": 5210 + }, + { + "epoch": 1.2425612949297786, + "grad_norm": 0.002981637604534626, + "learning_rate": 2.9290645084503693e-05, + "loss": 0.0001, + "step": 5220 + }, + { + "epoch": 1.2449416805522495, + "grad_norm": 0.0044878036715090275, + "learning_rate": 2.9250971990795844e-05, + "loss": 0.0009, + "step": 5230 + }, + { + "epoch": 1.2473220661747204, + "grad_norm": 0.0026534402277320623, + "learning_rate": 2.9211298897087997e-05, + "loss": 0.0001, + "step": 5240 + }, + { + "epoch": 1.249702451797191, + "grad_norm": 0.0017549542244523764, + "learning_rate": 2.9171625803380148e-05, + "loss": 0.0001, + "step": 5250 + }, + { + "epoch": 1.252082837419662, + "grad_norm": 0.0030411062762141228, + "learning_rate": 2.91319527096723e-05, + "loss": 0.0001, + "step": 5260 + }, + { + "epoch": 1.2544632230421329, + "grad_norm": 0.006810466758906841, + "learning_rate": 2.909227961596445e-05, + "loss": 0.0001, + "step": 5270 + }, + { + "epoch": 1.2568436086646035, + "grad_norm": 0.008998183533549309, + "learning_rate": 2.905260652225661e-05, + "loss": 0.0001, + "step": 5280 + }, + { + "epoch": 1.2592239942870744, + "grad_norm": 0.0006000595167279243, + "learning_rate": 2.9012933428548762e-05, + "loss": 0.0001, + "step": 5290 + }, + { + "epoch": 1.2616043799095453, + "grad_norm": 0.0037659297231584787, + "learning_rate": 2.8973260334840913e-05, + "loss": 0.0001, + "step": 5300 + }, + { + "epoch": 1.2639847655320162, + "grad_norm": 0.003123963950201869, + "learning_rate": 2.8933587241133063e-05, + "loss": 0.0001, + "step": 5310 + }, + { + "epoch": 1.2663651511544871, + "grad_norm": 0.0024721056688576937, + "learning_rate": 2.8893914147425217e-05, + "loss": 0.0001, + "step": 5320 + }, + { + "epoch": 1.268745536776958, + "grad_norm": 0.04851701855659485, + "learning_rate": 2.8854241053717367e-05, + "loss": 0.0002, + "step": 5330 + }, + { + "epoch": 1.2711259223994287, + "grad_norm": 0.0003437872801441699, + "learning_rate": 2.8814567960009524e-05, + "loss": 0.0, + "step": 5340 + }, + { + "epoch": 1.2735063080218996, + "grad_norm": 0.36953097581863403, + "learning_rate": 2.8774894866301678e-05, + "loss": 0.0002, + "step": 5350 + }, + { + "epoch": 1.2758866936443705, + "grad_norm": 0.004762616939842701, + "learning_rate": 2.8735221772593828e-05, + "loss": 0.0, + "step": 5360 + }, + { + "epoch": 1.2782670792668411, + "grad_norm": 0.0032022674567997456, + "learning_rate": 2.8695548678885982e-05, + "loss": 0.0001, + "step": 5370 + }, + { + "epoch": 1.280647464889312, + "grad_norm": 0.112340047955513, + "learning_rate": 2.8655875585178132e-05, + "loss": 0.0001, + "step": 5380 + }, + { + "epoch": 1.283027850511783, + "grad_norm": 0.0022161102388054132, + "learning_rate": 2.8616202491470286e-05, + "loss": 0.0001, + "step": 5390 + }, + { + "epoch": 1.2854082361342538, + "grad_norm": 0.0012134364806115627, + "learning_rate": 2.8576529397762443e-05, + "loss": 0.0001, + "step": 5400 + }, + { + "epoch": 1.2877886217567247, + "grad_norm": 0.003832167712971568, + "learning_rate": 2.8536856304054593e-05, + "loss": 0.0001, + "step": 5410 + }, + { + "epoch": 1.2901690073791954, + "grad_norm": 0.001739076804369688, + "learning_rate": 2.8497183210346743e-05, + "loss": 0.0, + "step": 5420 + }, + { + "epoch": 1.2925493930016663, + "grad_norm": 0.000749527825973928, + "learning_rate": 2.8457510116638897e-05, + "loss": 0.0001, + "step": 5430 + }, + { + "epoch": 1.2949297786241372, + "grad_norm": 0.006486440543085337, + "learning_rate": 2.8417837022931047e-05, + "loss": 0.0001, + "step": 5440 + }, + { + "epoch": 1.2973101642466078, + "grad_norm": 0.002875624457374215, + "learning_rate": 2.83781639292232e-05, + "loss": 0.0003, + "step": 5450 + }, + { + "epoch": 1.2996905498690787, + "grad_norm": 0.011916677467525005, + "learning_rate": 2.833849083551535e-05, + "loss": 0.0002, + "step": 5460 + }, + { + "epoch": 1.3020709354915496, + "grad_norm": 0.014456122182309628, + "learning_rate": 2.829881774180751e-05, + "loss": 0.0001, + "step": 5470 + }, + { + "epoch": 1.3044513211140205, + "grad_norm": 0.00652431882917881, + "learning_rate": 2.8259144648099662e-05, + "loss": 0.0003, + "step": 5480 + }, + { + "epoch": 1.3068317067364914, + "grad_norm": 0.004612395539879799, + "learning_rate": 2.8219471554391813e-05, + "loss": 0.0001, + "step": 5490 + }, + { + "epoch": 1.309212092358962, + "grad_norm": 0.0016554853646084666, + "learning_rate": 2.8179798460683966e-05, + "loss": 0.0001, + "step": 5500 + }, + { + "epoch": 1.311592477981433, + "grad_norm": 0.00955954473465681, + "learning_rate": 2.8140125366976117e-05, + "loss": 0.0003, + "step": 5510 + }, + { + "epoch": 1.3139728636039039, + "grad_norm": 0.0014887260040268302, + "learning_rate": 2.8100452273268267e-05, + "loss": 0.0, + "step": 5520 + }, + { + "epoch": 1.3163532492263748, + "grad_norm": 0.004022569395601749, + "learning_rate": 2.8060779179560427e-05, + "loss": 0.0001, + "step": 5530 + }, + { + "epoch": 1.3187336348488454, + "grad_norm": 0.01300437469035387, + "learning_rate": 2.8021106085852578e-05, + "loss": 0.0001, + "step": 5540 + }, + { + "epoch": 1.3211140204713163, + "grad_norm": 0.0033303312957286835, + "learning_rate": 2.7981432992144728e-05, + "loss": 0.0001, + "step": 5550 + }, + { + "epoch": 1.3234944060937872, + "grad_norm": 0.00033377157524228096, + "learning_rate": 2.794175989843688e-05, + "loss": 0.0001, + "step": 5560 + }, + { + "epoch": 1.325874791716258, + "grad_norm": 0.001646155840717256, + "learning_rate": 2.7902086804729032e-05, + "loss": 0.0, + "step": 5570 + }, + { + "epoch": 1.328255177338729, + "grad_norm": 0.009458147920668125, + "learning_rate": 2.7862413711021186e-05, + "loss": 0.0, + "step": 5580 + }, + { + "epoch": 1.3306355629611997, + "grad_norm": 0.044097207486629486, + "learning_rate": 2.7822740617313343e-05, + "loss": 0.0001, + "step": 5590 + }, + { + "epoch": 1.3330159485836706, + "grad_norm": 0.3018762469291687, + "learning_rate": 2.7783067523605493e-05, + "loss": 0.0003, + "step": 5600 + }, + { + "epoch": 1.3353963342061415, + "grad_norm": 0.00142444740049541, + "learning_rate": 2.7743394429897647e-05, + "loss": 0.0, + "step": 5610 + }, + { + "epoch": 1.3377767198286121, + "grad_norm": 0.026065746322274208, + "learning_rate": 2.7703721336189797e-05, + "loss": 0.0001, + "step": 5620 + }, + { + "epoch": 1.340157105451083, + "grad_norm": 0.002285444876179099, + "learning_rate": 2.766404824248195e-05, + "loss": 0.0004, + "step": 5630 + }, + { + "epoch": 1.342537491073554, + "grad_norm": 0.0023544467985630035, + "learning_rate": 2.76243751487741e-05, + "loss": 0.0001, + "step": 5640 + }, + { + "epoch": 1.3449178766960248, + "grad_norm": 0.005093382205814123, + "learning_rate": 2.758470205506625e-05, + "loss": 0.0008, + "step": 5650 + }, + { + "epoch": 1.3472982623184957, + "grad_norm": 0.01395428366959095, + "learning_rate": 2.754502896135841e-05, + "loss": 0.0001, + "step": 5660 + }, + { + "epoch": 1.3496786479409664, + "grad_norm": 0.0021814145147800446, + "learning_rate": 2.7505355867650562e-05, + "loss": 0.0001, + "step": 5670 + }, + { + "epoch": 1.3520590335634373, + "grad_norm": 0.0020568270701915026, + "learning_rate": 2.7465682773942712e-05, + "loss": 0.0001, + "step": 5680 + }, + { + "epoch": 1.3544394191859082, + "grad_norm": 0.001564579550176859, + "learning_rate": 2.7426009680234866e-05, + "loss": 0.0002, + "step": 5690 + }, + { + "epoch": 1.3568198048083788, + "grad_norm": 0.0009057559072971344, + "learning_rate": 2.7386336586527016e-05, + "loss": 0.0001, + "step": 5700 + }, + { + "epoch": 1.3592001904308497, + "grad_norm": 0.005018309690058231, + "learning_rate": 2.734666349281917e-05, + "loss": 0.0001, + "step": 5710 + }, + { + "epoch": 1.3615805760533206, + "grad_norm": 0.0018629188416525722, + "learning_rate": 2.7306990399111327e-05, + "loss": 0.0003, + "step": 5720 + }, + { + "epoch": 1.3639609616757915, + "grad_norm": 0.001482214662246406, + "learning_rate": 2.7267317305403478e-05, + "loss": 0.0001, + "step": 5730 + }, + { + "epoch": 1.3663413472982624, + "grad_norm": 0.012405039742588997, + "learning_rate": 2.722764421169563e-05, + "loss": 0.0005, + "step": 5740 + }, + { + "epoch": 1.3687217329207333, + "grad_norm": 0.0018485913751646876, + "learning_rate": 2.718797111798778e-05, + "loss": 0.0, + "step": 5750 + }, + { + "epoch": 1.371102118543204, + "grad_norm": 0.0015681314980611205, + "learning_rate": 2.7148298024279932e-05, + "loss": 0.0001, + "step": 5760 + }, + { + "epoch": 1.3734825041656749, + "grad_norm": 0.017725007608532906, + "learning_rate": 2.7108624930572086e-05, + "loss": 0.0001, + "step": 5770 + }, + { + "epoch": 1.3758628897881457, + "grad_norm": 0.011187481693923473, + "learning_rate": 2.7068951836864243e-05, + "loss": 0.0001, + "step": 5780 + }, + { + "epoch": 1.3782432754106164, + "grad_norm": 0.003125675953924656, + "learning_rate": 2.7029278743156393e-05, + "loss": 0.0001, + "step": 5790 + }, + { + "epoch": 1.3806236610330873, + "grad_norm": 0.004620529245585203, + "learning_rate": 2.6989605649448547e-05, + "loss": 0.0001, + "step": 5800 + }, + { + "epoch": 1.3830040466555582, + "grad_norm": 0.004881042055785656, + "learning_rate": 2.6949932555740697e-05, + "loss": 0.0001, + "step": 5810 + }, + { + "epoch": 1.385384432278029, + "grad_norm": 0.015351341105997562, + "learning_rate": 2.691025946203285e-05, + "loss": 0.0001, + "step": 5820 + }, + { + "epoch": 1.3877648179005, + "grad_norm": 0.06165415793657303, + "learning_rate": 2.6870586368325e-05, + "loss": 0.0001, + "step": 5830 + }, + { + "epoch": 1.3901452035229707, + "grad_norm": 0.000691259338054806, + "learning_rate": 2.6830913274617155e-05, + "loss": 0.0001, + "step": 5840 + }, + { + "epoch": 1.3925255891454416, + "grad_norm": 0.006264138966798782, + "learning_rate": 2.6791240180909312e-05, + "loss": 0.0, + "step": 5850 + }, + { + "epoch": 1.3949059747679124, + "grad_norm": 0.0016265185549855232, + "learning_rate": 2.6751567087201462e-05, + "loss": 0.0001, + "step": 5860 + }, + { + "epoch": 1.3972863603903831, + "grad_norm": 0.0036318551283329725, + "learning_rate": 2.6711893993493616e-05, + "loss": 0.0, + "step": 5870 + }, + { + "epoch": 1.399666746012854, + "grad_norm": 0.0011168549535796046, + "learning_rate": 2.6672220899785766e-05, + "loss": 0.0001, + "step": 5880 + }, + { + "epoch": 1.402047131635325, + "grad_norm": 0.011570369824767113, + "learning_rate": 2.6632547806077916e-05, + "loss": 0.0001, + "step": 5890 + }, + { + "epoch": 1.4044275172577958, + "grad_norm": 0.004564432427287102, + "learning_rate": 2.659287471237007e-05, + "loss": 0.0001, + "step": 5900 + }, + { + "epoch": 1.4068079028802667, + "grad_norm": 0.003310930449515581, + "learning_rate": 2.6553201618662227e-05, + "loss": 0.0001, + "step": 5910 + }, + { + "epoch": 1.4091882885027374, + "grad_norm": 0.005474664270877838, + "learning_rate": 2.6513528524954377e-05, + "loss": 0.0, + "step": 5920 + }, + { + "epoch": 1.4115686741252083, + "grad_norm": 0.003840883495286107, + "learning_rate": 2.647385543124653e-05, + "loss": 0.0, + "step": 5930 + }, + { + "epoch": 1.4139490597476791, + "grad_norm": 0.0011354766320437193, + "learning_rate": 2.643418233753868e-05, + "loss": 0.0001, + "step": 5940 + }, + { + "epoch": 1.41632944537015, + "grad_norm": 0.0011250395327806473, + "learning_rate": 2.6394509243830835e-05, + "loss": 0.0001, + "step": 5950 + }, + { + "epoch": 1.4187098309926207, + "grad_norm": 0.0025986500550061464, + "learning_rate": 2.6354836150122985e-05, + "loss": 0.0, + "step": 5960 + }, + { + "epoch": 1.4210902166150916, + "grad_norm": 0.0018986169015988708, + "learning_rate": 2.6315163056415143e-05, + "loss": 0.0001, + "step": 5970 + }, + { + "epoch": 1.4234706022375625, + "grad_norm": 0.006072606425732374, + "learning_rate": 2.6275489962707296e-05, + "loss": 0.0001, + "step": 5980 + }, + { + "epoch": 1.4258509878600334, + "grad_norm": 0.005382834933698177, + "learning_rate": 2.6235816868999447e-05, + "loss": 0.0001, + "step": 5990 + }, + { + "epoch": 1.4282313734825043, + "grad_norm": 0.0069602313451468945, + "learning_rate": 2.6196143775291597e-05, + "loss": 0.0001, + "step": 6000 + }, + { + "epoch": 1.430611759104975, + "grad_norm": 0.00503483647480607, + "learning_rate": 2.615647068158375e-05, + "loss": 0.0001, + "step": 6010 + }, + { + "epoch": 1.4329921447274458, + "grad_norm": 0.009482208639383316, + "learning_rate": 2.61167975878759e-05, + "loss": 0.0001, + "step": 6020 + }, + { + "epoch": 1.4353725303499167, + "grad_norm": 0.003071409650146961, + "learning_rate": 2.6077124494168058e-05, + "loss": 0.0003, + "step": 6030 + }, + { + "epoch": 1.4377529159723874, + "grad_norm": 0.025201931595802307, + "learning_rate": 2.603745140046021e-05, + "loss": 0.0002, + "step": 6040 + }, + { + "epoch": 1.4401333015948583, + "grad_norm": 0.029845217242836952, + "learning_rate": 2.5997778306752362e-05, + "loss": 0.0001, + "step": 6050 + }, + { + "epoch": 1.4425136872173292, + "grad_norm": 0.002946893684566021, + "learning_rate": 2.5958105213044516e-05, + "loss": 0.0001, + "step": 6060 + }, + { + "epoch": 1.4448940728398, + "grad_norm": 0.002334748860448599, + "learning_rate": 2.5918432119336666e-05, + "loss": 0.0001, + "step": 6070 + }, + { + "epoch": 1.447274458462271, + "grad_norm": 0.0038676797412335873, + "learning_rate": 2.587875902562882e-05, + "loss": 0.0001, + "step": 6080 + }, + { + "epoch": 1.4496548440847417, + "grad_norm": 0.39916858077049255, + "learning_rate": 2.583908593192097e-05, + "loss": 0.0005, + "step": 6090 + }, + { + "epoch": 1.4520352297072125, + "grad_norm": 0.005464503075927496, + "learning_rate": 2.5799412838213127e-05, + "loss": 0.0, + "step": 6100 + }, + { + "epoch": 1.4544156153296834, + "grad_norm": 0.002350292168557644, + "learning_rate": 2.5759739744505277e-05, + "loss": 0.0001, + "step": 6110 + }, + { + "epoch": 1.4567960009521541, + "grad_norm": 0.02950800396502018, + "learning_rate": 2.572006665079743e-05, + "loss": 0.0001, + "step": 6120 + }, + { + "epoch": 1.459176386574625, + "grad_norm": 0.0020270231179893017, + "learning_rate": 2.568039355708958e-05, + "loss": 0.0001, + "step": 6130 + }, + { + "epoch": 1.461556772197096, + "grad_norm": 0.29163315892219543, + "learning_rate": 2.5640720463381735e-05, + "loss": 0.0004, + "step": 6140 + }, + { + "epoch": 1.4639371578195668, + "grad_norm": 0.0028463418129831553, + "learning_rate": 2.5601047369673885e-05, + "loss": 0.0001, + "step": 6150 + }, + { + "epoch": 1.4663175434420377, + "grad_norm": 0.007839919067919254, + "learning_rate": 2.5561374275966042e-05, + "loss": 0.0, + "step": 6160 + }, + { + "epoch": 1.4686979290645086, + "grad_norm": 0.0009790142066776752, + "learning_rate": 2.5521701182258196e-05, + "loss": 0.0001, + "step": 6170 + }, + { + "epoch": 1.4710783146869792, + "grad_norm": 0.019366919994354248, + "learning_rate": 2.5482028088550346e-05, + "loss": 0.0001, + "step": 6180 + }, + { + "epoch": 1.4734587003094501, + "grad_norm": 0.002335514174774289, + "learning_rate": 2.54423549948425e-05, + "loss": 0.0001, + "step": 6190 + }, + { + "epoch": 1.475839085931921, + "grad_norm": 0.004448035266250372, + "learning_rate": 2.540268190113465e-05, + "loss": 0.0, + "step": 6200 + }, + { + "epoch": 1.4782194715543917, + "grad_norm": 0.0020590273197740316, + "learning_rate": 2.53630088074268e-05, + "loss": 0.0, + "step": 6210 + }, + { + "epoch": 1.4805998571768626, + "grad_norm": 0.0015115641290321946, + "learning_rate": 2.532333571371896e-05, + "loss": 0.0001, + "step": 6220 + }, + { + "epoch": 1.4829802427993335, + "grad_norm": 0.0024076756089925766, + "learning_rate": 2.528366262001111e-05, + "loss": 0.0003, + "step": 6230 + }, + { + "epoch": 1.4853606284218044, + "grad_norm": 0.0048133935779333115, + "learning_rate": 2.5243989526303262e-05, + "loss": 0.0001, + "step": 6240 + }, + { + "epoch": 1.4877410140442753, + "grad_norm": 0.015479459427297115, + "learning_rate": 2.5204316432595416e-05, + "loss": 0.0001, + "step": 6250 + }, + { + "epoch": 1.490121399666746, + "grad_norm": 0.1010046973824501, + "learning_rate": 2.5164643338887566e-05, + "loss": 0.0001, + "step": 6260 + }, + { + "epoch": 1.4925017852892168, + "grad_norm": 0.0011843384709209204, + "learning_rate": 2.512497024517972e-05, + "loss": 0.0002, + "step": 6270 + }, + { + "epoch": 1.4948821709116877, + "grad_norm": 0.002041852567344904, + "learning_rate": 2.508529715147187e-05, + "loss": 0.0001, + "step": 6280 + }, + { + "epoch": 1.4972625565341584, + "grad_norm": 0.002975156530737877, + "learning_rate": 2.5045624057764027e-05, + "loss": 0.0001, + "step": 6290 + }, + { + "epoch": 1.4996429421566293, + "grad_norm": 0.005752989556640387, + "learning_rate": 2.500595096405618e-05, + "loss": 0.0001, + "step": 6300 + }, + { + "epoch": 1.5020233277791002, + "grad_norm": 0.002325852634385228, + "learning_rate": 2.496627787034833e-05, + "loss": 0.0, + "step": 6310 + }, + { + "epoch": 1.504403713401571, + "grad_norm": 0.006379146594554186, + "learning_rate": 2.4926604776640485e-05, + "loss": 0.0001, + "step": 6320 + }, + { + "epoch": 1.506784099024042, + "grad_norm": 0.0011644313344731927, + "learning_rate": 2.488693168293264e-05, + "loss": 0.0, + "step": 6330 + }, + { + "epoch": 1.5091644846465129, + "grad_norm": 0.06679144501686096, + "learning_rate": 2.484725858922479e-05, + "loss": 0.0001, + "step": 6340 + }, + { + "epoch": 1.5115448702689835, + "grad_norm": 0.010065040551126003, + "learning_rate": 2.4807585495516942e-05, + "loss": 0.0003, + "step": 6350 + }, + { + "epoch": 1.5139252558914544, + "grad_norm": 0.00404448714107275, + "learning_rate": 2.4767912401809093e-05, + "loss": 0.0001, + "step": 6360 + }, + { + "epoch": 1.516305641513925, + "grad_norm": 0.005027102772146463, + "learning_rate": 2.4728239308101246e-05, + "loss": 0.0001, + "step": 6370 + }, + { + "epoch": 1.518686027136396, + "grad_norm": 0.0007329948712140322, + "learning_rate": 2.46885662143934e-05, + "loss": 0.0001, + "step": 6380 + }, + { + "epoch": 1.521066412758867, + "grad_norm": 0.008010495454072952, + "learning_rate": 2.464889312068555e-05, + "loss": 0.0001, + "step": 6390 + }, + { + "epoch": 1.5234467983813378, + "grad_norm": 0.0004263845912646502, + "learning_rate": 2.4609220026977704e-05, + "loss": 0.0, + "step": 6400 + }, + { + "epoch": 1.5258271840038087, + "grad_norm": 0.0008505060104653239, + "learning_rate": 2.4569546933269858e-05, + "loss": 0.0001, + "step": 6410 + }, + { + "epoch": 1.5282075696262796, + "grad_norm": 0.005009577609598637, + "learning_rate": 2.4529873839562008e-05, + "loss": 0.0001, + "step": 6420 + }, + { + "epoch": 1.5305879552487502, + "grad_norm": 0.0055831428617239, + "learning_rate": 2.4490200745854165e-05, + "loss": 0.0, + "step": 6430 + }, + { + "epoch": 1.5329683408712211, + "grad_norm": 0.0025661292020231485, + "learning_rate": 2.4450527652146315e-05, + "loss": 0.0002, + "step": 6440 + }, + { + "epoch": 1.535348726493692, + "grad_norm": 0.002652715193107724, + "learning_rate": 2.4410854558438466e-05, + "loss": 0.0, + "step": 6450 + }, + { + "epoch": 1.5377291121161627, + "grad_norm": 0.0017773109721019864, + "learning_rate": 2.4371181464730623e-05, + "loss": 0.0001, + "step": 6460 + }, + { + "epoch": 1.5401094977386336, + "grad_norm": 0.023734472692012787, + "learning_rate": 2.4331508371022773e-05, + "loss": 0.0001, + "step": 6470 + }, + { + "epoch": 1.5424898833611045, + "grad_norm": 0.0018312609754502773, + "learning_rate": 2.4291835277314927e-05, + "loss": 0.0001, + "step": 6480 + }, + { + "epoch": 1.5448702689835754, + "grad_norm": 0.004327055066823959, + "learning_rate": 2.425216218360708e-05, + "loss": 0.0001, + "step": 6490 + }, + { + "epoch": 1.5472506546060463, + "grad_norm": 0.0021172019187361, + "learning_rate": 2.421248908989923e-05, + "loss": 0.0001, + "step": 6500 + }, + { + "epoch": 1.5496310402285172, + "grad_norm": 0.001905101933516562, + "learning_rate": 2.4172815996191385e-05, + "loss": 0.0, + "step": 6510 + }, + { + "epoch": 1.5520114258509878, + "grad_norm": 0.0016990803414955735, + "learning_rate": 2.4133142902483538e-05, + "loss": 0.0001, + "step": 6520 + }, + { + "epoch": 1.5543918114734587, + "grad_norm": 0.0022508346009999514, + "learning_rate": 2.409346980877569e-05, + "loss": 0.0001, + "step": 6530 + }, + { + "epoch": 1.5567721970959294, + "grad_norm": 0.0018837592797353864, + "learning_rate": 2.4053796715067842e-05, + "loss": 0.0001, + "step": 6540 + }, + { + "epoch": 1.5591525827184003, + "grad_norm": 0.001968635246157646, + "learning_rate": 2.4014123621359993e-05, + "loss": 0.0002, + "step": 6550 + }, + { + "epoch": 1.5615329683408712, + "grad_norm": 0.0019730927888303995, + "learning_rate": 2.397445052765215e-05, + "loss": 0.0001, + "step": 6560 + }, + { + "epoch": 1.563913353963342, + "grad_norm": 0.0006384404841810465, + "learning_rate": 2.39347774339443e-05, + "loss": 0.0, + "step": 6570 + }, + { + "epoch": 1.566293739585813, + "grad_norm": 0.05303851515054703, + "learning_rate": 2.389510434023645e-05, + "loss": 0.0002, + "step": 6580 + }, + { + "epoch": 1.5686741252082839, + "grad_norm": 0.009338784962892532, + "learning_rate": 2.3855431246528607e-05, + "loss": 0.0, + "step": 6590 + }, + { + "epoch": 1.5710545108307545, + "grad_norm": 0.001042340649291873, + "learning_rate": 2.3815758152820758e-05, + "loss": 0.0, + "step": 6600 + }, + { + "epoch": 1.5734348964532254, + "grad_norm": 0.008856063708662987, + "learning_rate": 2.377608505911291e-05, + "loss": 0.0001, + "step": 6610 + }, + { + "epoch": 1.575815282075696, + "grad_norm": 0.0010636444203555584, + "learning_rate": 2.3736411965405065e-05, + "loss": 0.0001, + "step": 6620 + }, + { + "epoch": 1.578195667698167, + "grad_norm": 0.044303007423877716, + "learning_rate": 2.3696738871697215e-05, + "loss": 0.0001, + "step": 6630 + }, + { + "epoch": 1.5805760533206379, + "grad_norm": 0.003368295030668378, + "learning_rate": 2.365706577798937e-05, + "loss": 0.0001, + "step": 6640 + }, + { + "epoch": 1.5829564389431088, + "grad_norm": 0.0010406200308352709, + "learning_rate": 2.3617392684281523e-05, + "loss": 0.0001, + "step": 6650 + }, + { + "epoch": 1.5853368245655797, + "grad_norm": 0.009850569069385529, + "learning_rate": 2.3577719590573673e-05, + "loss": 0.0, + "step": 6660 + }, + { + "epoch": 1.5877172101880506, + "grad_norm": 0.00514467665925622, + "learning_rate": 2.3538046496865827e-05, + "loss": 0.0001, + "step": 6670 + }, + { + "epoch": 1.5900975958105215, + "grad_norm": 0.00200643390417099, + "learning_rate": 2.349837340315798e-05, + "loss": 0.0002, + "step": 6680 + }, + { + "epoch": 1.5924779814329921, + "grad_norm": 0.01371715497225523, + "learning_rate": 2.345870030945013e-05, + "loss": 0.0002, + "step": 6690 + }, + { + "epoch": 1.594858367055463, + "grad_norm": 0.0005170275107957423, + "learning_rate": 2.3419027215742284e-05, + "loss": 0.0001, + "step": 6700 + }, + { + "epoch": 1.5972387526779337, + "grad_norm": 0.0018967930227518082, + "learning_rate": 2.3379354122034438e-05, + "loss": 0.0, + "step": 6710 + }, + { + "epoch": 1.5996191383004046, + "grad_norm": 0.002288557356223464, + "learning_rate": 2.3339681028326592e-05, + "loss": 0.0003, + "step": 6720 + }, + { + "epoch": 1.6019995239228755, + "grad_norm": 0.0017687254585325718, + "learning_rate": 2.3300007934618742e-05, + "loss": 0.0001, + "step": 6730 + }, + { + "epoch": 1.6043799095453464, + "grad_norm": 0.023880669847130775, + "learning_rate": 2.3260334840910893e-05, + "loss": 0.0001, + "step": 6740 + }, + { + "epoch": 1.6067602951678173, + "grad_norm": 0.004767647013068199, + "learning_rate": 2.322066174720305e-05, + "loss": 0.0001, + "step": 6750 + }, + { + "epoch": 1.6091406807902882, + "grad_norm": 0.0016061540227383375, + "learning_rate": 2.31809886534952e-05, + "loss": 0.0001, + "step": 6760 + }, + { + "epoch": 1.6115210664127588, + "grad_norm": 0.009586431086063385, + "learning_rate": 2.3141315559787354e-05, + "loss": 0.0001, + "step": 6770 + }, + { + "epoch": 1.6139014520352297, + "grad_norm": 0.003596968250349164, + "learning_rate": 2.3101642466079507e-05, + "loss": 0.0001, + "step": 6780 + }, + { + "epoch": 1.6162818376577004, + "grad_norm": 0.003184641245752573, + "learning_rate": 2.3061969372371658e-05, + "loss": 0.0001, + "step": 6790 + }, + { + "epoch": 1.6186622232801713, + "grad_norm": 0.02113034948706627, + "learning_rate": 2.302229627866381e-05, + "loss": 0.0, + "step": 6800 + }, + { + "epoch": 1.6210426089026422, + "grad_norm": 0.0022694601211696863, + "learning_rate": 2.2982623184955965e-05, + "loss": 0.0001, + "step": 6810 + }, + { + "epoch": 1.623422994525113, + "grad_norm": 0.0007104437099769711, + "learning_rate": 2.2942950091248115e-05, + "loss": 0.0, + "step": 6820 + }, + { + "epoch": 1.625803380147584, + "grad_norm": 0.004562158603221178, + "learning_rate": 2.290327699754027e-05, + "loss": 0.0001, + "step": 6830 + }, + { + "epoch": 1.6281837657700549, + "grad_norm": 0.0015846043825149536, + "learning_rate": 2.2863603903832423e-05, + "loss": 0.0001, + "step": 6840 + }, + { + "epoch": 1.6305641513925258, + "grad_norm": 0.012255080044269562, + "learning_rate": 2.2823930810124573e-05, + "loss": 0.0001, + "step": 6850 + }, + { + "epoch": 1.6329445370149964, + "grad_norm": 0.0012517154682427645, + "learning_rate": 2.2784257716416727e-05, + "loss": 0.0001, + "step": 6860 + }, + { + "epoch": 1.6353249226374673, + "grad_norm": 0.0006557099404744804, + "learning_rate": 2.274458462270888e-05, + "loss": 0.0001, + "step": 6870 + }, + { + "epoch": 1.637705308259938, + "grad_norm": 0.0007641498814336956, + "learning_rate": 2.2704911529001034e-05, + "loss": 0.0, + "step": 6880 + }, + { + "epoch": 1.6400856938824089, + "grad_norm": 0.005642781965434551, + "learning_rate": 2.2665238435293184e-05, + "loss": 0.0, + "step": 6890 + }, + { + "epoch": 1.6424660795048798, + "grad_norm": 0.0022149153519421816, + "learning_rate": 2.2625565341585338e-05, + "loss": 0.0004, + "step": 6900 + }, + { + "epoch": 1.6448464651273507, + "grad_norm": 0.8982350826263428, + "learning_rate": 2.2585892247877492e-05, + "loss": 0.0003, + "step": 6910 + }, + { + "epoch": 1.6472268507498216, + "grad_norm": 0.002032769611105323, + "learning_rate": 2.2546219154169642e-05, + "loss": 0.0001, + "step": 6920 + }, + { + "epoch": 1.6496072363722925, + "grad_norm": 0.0021233465522527695, + "learning_rate": 2.2506546060461796e-05, + "loss": 0.0001, + "step": 6930 + }, + { + "epoch": 1.6519876219947631, + "grad_norm": 0.019824443385004997, + "learning_rate": 2.246687296675395e-05, + "loss": 0.0001, + "step": 6940 + }, + { + "epoch": 1.654368007617234, + "grad_norm": 0.002160045551136136, + "learning_rate": 2.24271998730461e-05, + "loss": 0.0001, + "step": 6950 + }, + { + "epoch": 1.6567483932397047, + "grad_norm": 0.002742405980825424, + "learning_rate": 2.2387526779338254e-05, + "loss": 0.0003, + "step": 6960 + }, + { + "epoch": 1.6591287788621756, + "grad_norm": 0.04358428716659546, + "learning_rate": 2.2347853685630407e-05, + "loss": 0.0003, + "step": 6970 + }, + { + "epoch": 1.6615091644846465, + "grad_norm": 0.0023650035727769136, + "learning_rate": 2.2308180591922558e-05, + "loss": 0.0, + "step": 6980 + }, + { + "epoch": 1.6638895501071174, + "grad_norm": 0.0027010326739400625, + "learning_rate": 2.226850749821471e-05, + "loss": 0.0001, + "step": 6990 + }, + { + "epoch": 1.6662699357295883, + "grad_norm": 0.01885942928493023, + "learning_rate": 2.2228834404506865e-05, + "loss": 0.0001, + "step": 7000 + }, + { + "epoch": 1.6686503213520592, + "grad_norm": 0.013014287687838078, + "learning_rate": 2.218916131079902e-05, + "loss": 0.0004, + "step": 7010 + }, + { + "epoch": 1.6710307069745298, + "grad_norm": 0.0015542235923931003, + "learning_rate": 2.214948821709117e-05, + "loss": 0.0001, + "step": 7020 + }, + { + "epoch": 1.6734110925970007, + "grad_norm": 0.011335782706737518, + "learning_rate": 2.2109815123383323e-05, + "loss": 0.0, + "step": 7030 + }, + { + "epoch": 1.6757914782194716, + "grad_norm": 0.1068568155169487, + "learning_rate": 2.2070142029675476e-05, + "loss": 0.0001, + "step": 7040 + }, + { + "epoch": 1.6781718638419423, + "grad_norm": 0.004407468251883984, + "learning_rate": 2.2030468935967627e-05, + "loss": 0.0001, + "step": 7050 + }, + { + "epoch": 1.6805522494644132, + "grad_norm": 0.0026373250875622034, + "learning_rate": 2.199079584225978e-05, + "loss": 0.0, + "step": 7060 + }, + { + "epoch": 1.682932635086884, + "grad_norm": 0.020453903824090958, + "learning_rate": 2.1951122748551934e-05, + "loss": 0.0001, + "step": 7070 + }, + { + "epoch": 1.685313020709355, + "grad_norm": 0.009605340659618378, + "learning_rate": 2.1911449654844084e-05, + "loss": 0.0003, + "step": 7080 + }, + { + "epoch": 1.6876934063318259, + "grad_norm": 0.0008563417941331863, + "learning_rate": 2.1871776561136238e-05, + "loss": 0.0, + "step": 7090 + }, + { + "epoch": 1.6900737919542967, + "grad_norm": 0.0017095934599637985, + "learning_rate": 2.1832103467428392e-05, + "loss": 0.0001, + "step": 7100 + }, + { + "epoch": 1.6924541775767674, + "grad_norm": 0.0017231311649084091, + "learning_rate": 2.1792430373720542e-05, + "loss": 0.0001, + "step": 7110 + }, + { + "epoch": 1.6948345631992383, + "grad_norm": 0.0004322198801673949, + "learning_rate": 2.17527572800127e-05, + "loss": 0.0001, + "step": 7120 + }, + { + "epoch": 1.697214948821709, + "grad_norm": 0.06828305870294571, + "learning_rate": 2.171308418630485e-05, + "loss": 0.0003, + "step": 7130 + }, + { + "epoch": 1.6995953344441799, + "grad_norm": 0.012662236578762531, + "learning_rate": 2.1673411092597e-05, + "loss": 0.0, + "step": 7140 + }, + { + "epoch": 1.7019757200666508, + "grad_norm": 0.0004414702707435936, + "learning_rate": 2.1633737998889153e-05, + "loss": 0.0, + "step": 7150 + }, + { + "epoch": 1.7043561056891217, + "grad_norm": 0.0018225832609459758, + "learning_rate": 2.1594064905181307e-05, + "loss": 0.0001, + "step": 7160 + }, + { + "epoch": 1.7067364913115926, + "grad_norm": 0.10008008033037186, + "learning_rate": 2.155439181147346e-05, + "loss": 0.0001, + "step": 7170 + }, + { + "epoch": 1.7091168769340634, + "grad_norm": 0.0027361391112208366, + "learning_rate": 2.151471871776561e-05, + "loss": 0.0, + "step": 7180 + }, + { + "epoch": 1.7114972625565341, + "grad_norm": 0.0021505611948668957, + "learning_rate": 2.1475045624057765e-05, + "loss": 0.0, + "step": 7190 + }, + { + "epoch": 1.713877648179005, + "grad_norm": 0.00697895884513855, + "learning_rate": 2.143537253034992e-05, + "loss": 0.0, + "step": 7200 + }, + { + "epoch": 1.7162580338014757, + "grad_norm": 0.002057724166661501, + "learning_rate": 2.139569943664207e-05, + "loss": 0.0, + "step": 7210 + }, + { + "epoch": 1.7186384194239466, + "grad_norm": 0.002399923512712121, + "learning_rate": 2.1356026342934223e-05, + "loss": 0.0001, + "step": 7220 + }, + { + "epoch": 1.7210188050464175, + "grad_norm": 0.3061892092227936, + "learning_rate": 2.1316353249226376e-05, + "loss": 0.0002, + "step": 7230 + }, + { + "epoch": 1.7233991906688884, + "grad_norm": 0.004888875875622034, + "learning_rate": 2.1276680155518527e-05, + "loss": 0.0, + "step": 7240 + }, + { + "epoch": 1.7257795762913593, + "grad_norm": 0.04453931376338005, + "learning_rate": 2.1237007061810684e-05, + "loss": 0.0001, + "step": 7250 + }, + { + "epoch": 1.7281599619138301, + "grad_norm": 0.02463744953274727, + "learning_rate": 2.1197333968102834e-05, + "loss": 0.0001, + "step": 7260 + }, + { + "epoch": 1.730540347536301, + "grad_norm": 0.002113641705363989, + "learning_rate": 2.1157660874394984e-05, + "loss": 0.0001, + "step": 7270 + }, + { + "epoch": 1.7329207331587717, + "grad_norm": 0.0024889137130230665, + "learning_rate": 2.111798778068714e-05, + "loss": 0.0001, + "step": 7280 + }, + { + "epoch": 1.7353011187812426, + "grad_norm": 0.10477261245250702, + "learning_rate": 2.107831468697929e-05, + "loss": 0.0001, + "step": 7290 + }, + { + "epoch": 1.7376815044037133, + "grad_norm": 0.0008585329633206129, + "learning_rate": 2.1038641593271445e-05, + "loss": 0.0001, + "step": 7300 + }, + { + "epoch": 1.7400618900261842, + "grad_norm": 0.17968738079071045, + "learning_rate": 2.09989684995636e-05, + "loss": 0.0001, + "step": 7310 + }, + { + "epoch": 1.742442275648655, + "grad_norm": 0.0023223140742629766, + "learning_rate": 2.095929540585575e-05, + "loss": 0.0001, + "step": 7320 + }, + { + "epoch": 1.744822661271126, + "grad_norm": 0.0016741958679631352, + "learning_rate": 2.0919622312147903e-05, + "loss": 0.0002, + "step": 7330 + }, + { + "epoch": 1.7472030468935968, + "grad_norm": 0.009992700070142746, + "learning_rate": 2.0879949218440053e-05, + "loss": 0.0, + "step": 7340 + }, + { + "epoch": 1.7495834325160677, + "grad_norm": 0.002163327531889081, + "learning_rate": 2.0840276124732207e-05, + "loss": 0.0, + "step": 7350 + }, + { + "epoch": 1.7519638181385384, + "grad_norm": 0.15539680421352386, + "learning_rate": 2.080060303102436e-05, + "loss": 0.0003, + "step": 7360 + }, + { + "epoch": 1.7543442037610093, + "grad_norm": 0.002331450814381242, + "learning_rate": 2.076092993731651e-05, + "loss": 0.0, + "step": 7370 + }, + { + "epoch": 1.75672458938348, + "grad_norm": 0.0014541965210810304, + "learning_rate": 2.0721256843608665e-05, + "loss": 0.0, + "step": 7380 + }, + { + "epoch": 1.7591049750059509, + "grad_norm": 0.002874292666092515, + "learning_rate": 2.068158374990082e-05, + "loss": 0.0, + "step": 7390 + }, + { + "epoch": 1.7614853606284218, + "grad_norm": 0.046790674328804016, + "learning_rate": 2.064191065619297e-05, + "loss": 0.0001, + "step": 7400 + }, + { + "epoch": 1.7638657462508927, + "grad_norm": 0.012541896663606167, + "learning_rate": 2.0602237562485126e-05, + "loss": 0.0001, + "step": 7410 + }, + { + "epoch": 1.7662461318733635, + "grad_norm": 0.0005884987185709178, + "learning_rate": 2.0562564468777276e-05, + "loss": 0.0001, + "step": 7420 + }, + { + "epoch": 1.7686265174958344, + "grad_norm": 0.0090475520119071, + "learning_rate": 2.0522891375069426e-05, + "loss": 0.0001, + "step": 7430 + }, + { + "epoch": 1.7710069031183053, + "grad_norm": 0.04852410405874252, + "learning_rate": 2.0483218281361584e-05, + "loss": 0.0001, + "step": 7440 + }, + { + "epoch": 1.773387288740776, + "grad_norm": 0.003311296459287405, + "learning_rate": 2.0443545187653734e-05, + "loss": 0.0001, + "step": 7450 + }, + { + "epoch": 1.775767674363247, + "grad_norm": 0.03242022171616554, + "learning_rate": 2.0403872093945888e-05, + "loss": 0.0001, + "step": 7460 + }, + { + "epoch": 1.7781480599857176, + "grad_norm": 0.010833712294697762, + "learning_rate": 2.036419900023804e-05, + "loss": 0.0002, + "step": 7470 + }, + { + "epoch": 1.7805284456081885, + "grad_norm": 0.0031983698718249798, + "learning_rate": 2.032452590653019e-05, + "loss": 0.0001, + "step": 7480 + }, + { + "epoch": 1.7829088312306594, + "grad_norm": 0.021590987220406532, + "learning_rate": 2.0284852812822345e-05, + "loss": 0.0001, + "step": 7490 + }, + { + "epoch": 1.7852892168531302, + "grad_norm": 0.005147872492671013, + "learning_rate": 2.02451797191145e-05, + "loss": 0.0, + "step": 7500 + }, + { + "epoch": 1.7876696024756011, + "grad_norm": 0.0012411813950166106, + "learning_rate": 2.020550662540665e-05, + "loss": 0.0001, + "step": 7510 + }, + { + "epoch": 1.790049988098072, + "grad_norm": 0.0009874672396108508, + "learning_rate": 2.0165833531698803e-05, + "loss": 0.0001, + "step": 7520 + }, + { + "epoch": 1.7924303737205427, + "grad_norm": 0.002135714516043663, + "learning_rate": 2.0126160437990957e-05, + "loss": 0.0003, + "step": 7530 + }, + { + "epoch": 1.7948107593430136, + "grad_norm": 0.002928838599473238, + "learning_rate": 2.008648734428311e-05, + "loss": 0.0002, + "step": 7540 + }, + { + "epoch": 1.7971911449654843, + "grad_norm": 0.002418682212010026, + "learning_rate": 2.004681425057526e-05, + "loss": 0.0001, + "step": 7550 + }, + { + "epoch": 1.7995715305879552, + "grad_norm": 0.022359730675816536, + "learning_rate": 2.000714115686741e-05, + "loss": 0.0, + "step": 7560 + }, + { + "epoch": 1.801951916210426, + "grad_norm": 0.0013171250466257334, + "learning_rate": 1.9967468063159568e-05, + "loss": 0.0001, + "step": 7570 + }, + { + "epoch": 1.804332301832897, + "grad_norm": 0.005206149537116289, + "learning_rate": 1.992779496945172e-05, + "loss": 0.0001, + "step": 7580 + }, + { + "epoch": 1.8067126874553678, + "grad_norm": 0.5035125613212585, + "learning_rate": 1.988812187574387e-05, + "loss": 0.0004, + "step": 7590 + }, + { + "epoch": 1.8090930730778387, + "grad_norm": 0.0018090710509568453, + "learning_rate": 1.9848448782036026e-05, + "loss": 0.0, + "step": 7600 + }, + { + "epoch": 1.8114734587003094, + "grad_norm": 0.0020274862181395292, + "learning_rate": 1.9808775688328176e-05, + "loss": 0.0, + "step": 7610 + }, + { + "epoch": 1.8138538443227803, + "grad_norm": 0.008559592068195343, + "learning_rate": 1.976910259462033e-05, + "loss": 0.0001, + "step": 7620 + }, + { + "epoch": 1.816234229945251, + "grad_norm": 0.002766631543636322, + "learning_rate": 1.9729429500912483e-05, + "loss": 0.0, + "step": 7630 + }, + { + "epoch": 1.8186146155677219, + "grad_norm": 0.003933802247047424, + "learning_rate": 1.9689756407204634e-05, + "loss": 0.0001, + "step": 7640 + }, + { + "epoch": 1.8209950011901928, + "grad_norm": 0.0502641461789608, + "learning_rate": 1.9650083313496787e-05, + "loss": 0.0001, + "step": 7650 + }, + { + "epoch": 1.8233753868126636, + "grad_norm": 0.002705627353861928, + "learning_rate": 1.961041021978894e-05, + "loss": 0.0, + "step": 7660 + }, + { + "epoch": 1.8257557724351345, + "grad_norm": 0.015057703480124474, + "learning_rate": 1.957073712608109e-05, + "loss": 0.0001, + "step": 7670 + }, + { + "epoch": 1.8281361580576054, + "grad_norm": 0.0005775150493718684, + "learning_rate": 1.9531064032373245e-05, + "loss": 0.0001, + "step": 7680 + }, + { + "epoch": 1.8305165436800763, + "grad_norm": 0.006392305716872215, + "learning_rate": 1.94913909386654e-05, + "loss": 0.0001, + "step": 7690 + }, + { + "epoch": 1.832896929302547, + "grad_norm": 0.0014930195175111294, + "learning_rate": 1.9451717844957553e-05, + "loss": 0.0002, + "step": 7700 + }, + { + "epoch": 1.8352773149250179, + "grad_norm": 0.0161952693015337, + "learning_rate": 1.9412044751249703e-05, + "loss": 0.0001, + "step": 7710 + }, + { + "epoch": 1.8376577005474886, + "grad_norm": 0.0019109123386442661, + "learning_rate": 1.9372371657541857e-05, + "loss": 0.0001, + "step": 7720 + }, + { + "epoch": 1.8400380861699595, + "grad_norm": 0.0026801279745996, + "learning_rate": 1.933269856383401e-05, + "loss": 0.0, + "step": 7730 + }, + { + "epoch": 1.8424184717924303, + "grad_norm": 0.006187149789184332, + "learning_rate": 1.929302547012616e-05, + "loss": 0.0001, + "step": 7740 + }, + { + "epoch": 1.8447988574149012, + "grad_norm": 0.002990028355270624, + "learning_rate": 1.9253352376418314e-05, + "loss": 0.0001, + "step": 7750 + }, + { + "epoch": 1.8471792430373721, + "grad_norm": 0.0044268155470490456, + "learning_rate": 1.9213679282710468e-05, + "loss": 0.0, + "step": 7760 + }, + { + "epoch": 1.849559628659843, + "grad_norm": 0.005206019151955843, + "learning_rate": 1.9174006189002618e-05, + "loss": 0.0001, + "step": 7770 + }, + { + "epoch": 1.8519400142823137, + "grad_norm": 0.005415783729404211, + "learning_rate": 1.9134333095294772e-05, + "loss": 0.0001, + "step": 7780 + }, + { + "epoch": 1.8543203999047846, + "grad_norm": 0.0016888550017029047, + "learning_rate": 1.9094660001586926e-05, + "loss": 0.0, + "step": 7790 + }, + { + "epoch": 1.8567007855272553, + "grad_norm": 0.003122705966234207, + "learning_rate": 1.9054986907879076e-05, + "loss": 0.0, + "step": 7800 + }, + { + "epoch": 1.8590811711497262, + "grad_norm": 0.021525248885154724, + "learning_rate": 1.901531381417123e-05, + "loss": 0.0, + "step": 7810 + }, + { + "epoch": 1.861461556772197, + "grad_norm": 0.004836782813072205, + "learning_rate": 1.8975640720463383e-05, + "loss": 0.0, + "step": 7820 + }, + { + "epoch": 1.863841942394668, + "grad_norm": 0.003003711812198162, + "learning_rate": 1.8935967626755534e-05, + "loss": 0.0001, + "step": 7830 + }, + { + "epoch": 1.8662223280171388, + "grad_norm": 0.0034373151138424873, + "learning_rate": 1.8896294533047687e-05, + "loss": 0.0001, + "step": 7840 + }, + { + "epoch": 1.8686027136396097, + "grad_norm": 0.061307862401008606, + "learning_rate": 1.885662143933984e-05, + "loss": 0.0001, + "step": 7850 + }, + { + "epoch": 1.8709830992620806, + "grad_norm": 0.001207771128974855, + "learning_rate": 1.8816948345631995e-05, + "loss": 0.0, + "step": 7860 + }, + { + "epoch": 1.8733634848845513, + "grad_norm": 0.007686016149818897, + "learning_rate": 1.8777275251924145e-05, + "loss": 0.0001, + "step": 7870 + }, + { + "epoch": 1.8757438705070222, + "grad_norm": 0.0019049645634368062, + "learning_rate": 1.87376021582163e-05, + "loss": 0.0001, + "step": 7880 + }, + { + "epoch": 1.8781242561294929, + "grad_norm": 0.00202633673325181, + "learning_rate": 1.8697929064508452e-05, + "loss": 0.0, + "step": 7890 + }, + { + "epoch": 1.8805046417519637, + "grad_norm": 0.0011157892877236009, + "learning_rate": 1.8658255970800603e-05, + "loss": 0.0002, + "step": 7900 + }, + { + "epoch": 1.8828850273744346, + "grad_norm": 0.001622357638552785, + "learning_rate": 1.8618582877092756e-05, + "loss": 0.0006, + "step": 7910 + }, + { + "epoch": 1.8852654129969055, + "grad_norm": 0.04895901307463646, + "learning_rate": 1.857890978338491e-05, + "loss": 0.0002, + "step": 7920 + }, + { + "epoch": 1.8876457986193764, + "grad_norm": 0.0012425240129232407, + "learning_rate": 1.853923668967706e-05, + "loss": 0.0001, + "step": 7930 + }, + { + "epoch": 1.8900261842418473, + "grad_norm": 0.004690519999712706, + "learning_rate": 1.8499563595969214e-05, + "loss": 0.0001, + "step": 7940 + }, + { + "epoch": 1.892406569864318, + "grad_norm": 0.0015794184291735291, + "learning_rate": 1.8459890502261368e-05, + "loss": 0.0, + "step": 7950 + }, + { + "epoch": 1.8947869554867889, + "grad_norm": 0.01080586388707161, + "learning_rate": 1.8420217408553518e-05, + "loss": 0.0002, + "step": 7960 + }, + { + "epoch": 1.8971673411092596, + "grad_norm": 0.0018335338681936264, + "learning_rate": 1.8380544314845672e-05, + "loss": 0.0, + "step": 7970 + }, + { + "epoch": 1.8995477267317304, + "grad_norm": 0.003800921142101288, + "learning_rate": 1.8340871221137826e-05, + "loss": 0.0, + "step": 7980 + }, + { + "epoch": 1.9019281123542013, + "grad_norm": 0.0035681715235114098, + "learning_rate": 1.830119812742998e-05, + "loss": 0.0001, + "step": 7990 + }, + { + "epoch": 1.9043084979766722, + "grad_norm": 0.001115818158723414, + "learning_rate": 1.826152503372213e-05, + "loss": 0.0003, + "step": 8000 + }, + { + "epoch": 1.9066888835991431, + "grad_norm": 0.004726150073111057, + "learning_rate": 1.8221851940014283e-05, + "loss": 0.0001, + "step": 8010 + }, + { + "epoch": 1.909069269221614, + "grad_norm": 0.025985538959503174, + "learning_rate": 1.8182178846306437e-05, + "loss": 0.0, + "step": 8020 + }, + { + "epoch": 1.9114496548440847, + "grad_norm": 0.002658289624378085, + "learning_rate": 1.8142505752598587e-05, + "loss": 0.0001, + "step": 8030 + }, + { + "epoch": 1.9138300404665556, + "grad_norm": 0.010776730254292488, + "learning_rate": 1.810283265889074e-05, + "loss": 0.0001, + "step": 8040 + }, + { + "epoch": 1.9162104260890265, + "grad_norm": 0.004742765333503485, + "learning_rate": 1.8063159565182895e-05, + "loss": 0.0001, + "step": 8050 + }, + { + "epoch": 1.9185908117114971, + "grad_norm": 0.0017833469901233912, + "learning_rate": 1.8023486471475045e-05, + "loss": 0.0003, + "step": 8060 + }, + { + "epoch": 1.920971197333968, + "grad_norm": 0.0015226156683638692, + "learning_rate": 1.79838133777672e-05, + "loss": 0.0, + "step": 8070 + }, + { + "epoch": 1.923351582956439, + "grad_norm": 0.0021416472736746073, + "learning_rate": 1.7944140284059352e-05, + "loss": 0.0001, + "step": 8080 + }, + { + "epoch": 1.9257319685789098, + "grad_norm": 0.0021594560239464045, + "learning_rate": 1.7904467190351503e-05, + "loss": 0.0, + "step": 8090 + }, + { + "epoch": 1.9281123542013807, + "grad_norm": 0.0018359982641413808, + "learning_rate": 1.786479409664366e-05, + "loss": 0.0, + "step": 8100 + }, + { + "epoch": 1.9304927398238516, + "grad_norm": 0.0036185849457979202, + "learning_rate": 1.782512100293581e-05, + "loss": 0.0001, + "step": 8110 + }, + { + "epoch": 1.9328731254463223, + "grad_norm": 0.019637318328022957, + "learning_rate": 1.778544790922796e-05, + "loss": 0.0, + "step": 8120 + }, + { + "epoch": 1.9352535110687932, + "grad_norm": 0.002496182220056653, + "learning_rate": 1.7745774815520117e-05, + "loss": 0.0001, + "step": 8130 + }, + { + "epoch": 1.9376338966912638, + "grad_norm": 0.004374451469630003, + "learning_rate": 1.7706101721812268e-05, + "loss": 0.0005, + "step": 8140 + }, + { + "epoch": 1.9400142823137347, + "grad_norm": 0.0006196928443387151, + "learning_rate": 1.766642862810442e-05, + "loss": 0.0, + "step": 8150 + }, + { + "epoch": 1.9423946679362056, + "grad_norm": 0.0037022046744823456, + "learning_rate": 1.7626755534396572e-05, + "loss": 0.0, + "step": 8160 + }, + { + "epoch": 1.9447750535586765, + "grad_norm": 0.004300027620047331, + "learning_rate": 1.7587082440688725e-05, + "loss": 0.0, + "step": 8170 + }, + { + "epoch": 1.9471554391811474, + "grad_norm": 0.0019766122568398714, + "learning_rate": 1.754740934698088e-05, + "loss": 0.0001, + "step": 8180 + }, + { + "epoch": 1.9495358248036183, + "grad_norm": 0.0018594982102513313, + "learning_rate": 1.750773625327303e-05, + "loss": 0.0001, + "step": 8190 + }, + { + "epoch": 1.951916210426089, + "grad_norm": 0.0012102769687771797, + "learning_rate": 1.7468063159565183e-05, + "loss": 0.0001, + "step": 8200 + }, + { + "epoch": 1.9542965960485599, + "grad_norm": 0.0012130772229284048, + "learning_rate": 1.7428390065857337e-05, + "loss": 0.0, + "step": 8210 + }, + { + "epoch": 1.9566769816710305, + "grad_norm": 0.0006833472289144993, + "learning_rate": 1.7388716972149487e-05, + "loss": 0.0004, + "step": 8220 + }, + { + "epoch": 1.9590573672935014, + "grad_norm": 0.0017617164412513375, + "learning_rate": 1.7349043878441644e-05, + "loss": 0.0001, + "step": 8230 + }, + { + "epoch": 1.9614377529159723, + "grad_norm": 0.0013312195660546422, + "learning_rate": 1.7309370784733795e-05, + "loss": 0.0, + "step": 8240 + }, + { + "epoch": 1.9638181385384432, + "grad_norm": 0.0018878667615354061, + "learning_rate": 1.7269697691025945e-05, + "loss": 0.0, + "step": 8250 + }, + { + "epoch": 1.9661985241609141, + "grad_norm": 0.0019427284132689238, + "learning_rate": 1.7230024597318102e-05, + "loss": 0.0, + "step": 8260 + }, + { + "epoch": 1.968578909783385, + "grad_norm": 0.004271362908184528, + "learning_rate": 1.7190351503610252e-05, + "loss": 0.0001, + "step": 8270 + }, + { + "epoch": 1.970959295405856, + "grad_norm": 0.0027857243549078703, + "learning_rate": 1.7150678409902406e-05, + "loss": 0.0, + "step": 8280 + }, + { + "epoch": 1.9733396810283266, + "grad_norm": 0.0018286170670762658, + "learning_rate": 1.711100531619456e-05, + "loss": 0.0001, + "step": 8290 + }, + { + "epoch": 1.9757200666507975, + "grad_norm": 0.001666391035541892, + "learning_rate": 1.707133222248671e-05, + "loss": 0.0, + "step": 8300 + }, + { + "epoch": 1.9781004522732681, + "grad_norm": 0.021936526522040367, + "learning_rate": 1.7031659128778864e-05, + "loss": 0.0001, + "step": 8310 + }, + { + "epoch": 1.980480837895739, + "grad_norm": 0.00029301681206561625, + "learning_rate": 1.6991986035071017e-05, + "loss": 0.0, + "step": 8320 + }, + { + "epoch": 1.98286122351821, + "grad_norm": 0.0009200606727972627, + "learning_rate": 1.6952312941363168e-05, + "loss": 0.0, + "step": 8330 + }, + { + "epoch": 1.9852416091406808, + "grad_norm": 0.00579107366502285, + "learning_rate": 1.691263984765532e-05, + "loss": 0.0, + "step": 8340 + }, + { + "epoch": 1.9876219947631517, + "grad_norm": 0.000620057515334338, + "learning_rate": 1.687296675394747e-05, + "loss": 0.0001, + "step": 8350 + }, + { + "epoch": 1.9900023803856226, + "grad_norm": 0.0015694822650402784, + "learning_rate": 1.6833293660239625e-05, + "loss": 0.0, + "step": 8360 + }, + { + "epoch": 1.9923827660080933, + "grad_norm": 0.0013426202349364758, + "learning_rate": 1.679362056653178e-05, + "loss": 0.0, + "step": 8370 + }, + { + "epoch": 1.9947631516305642, + "grad_norm": 0.06455473601818085, + "learning_rate": 1.675394747282393e-05, + "loss": 0.0001, + "step": 8380 + }, + { + "epoch": 1.9971435372530348, + "grad_norm": 0.0007938113994896412, + "learning_rate": 1.6714274379116086e-05, + "loss": 0.0001, + "step": 8390 + }, + { + "epoch": 1.9995239228755057, + "grad_norm": 0.0030489168129861355, + "learning_rate": 1.6674601285408237e-05, + "loss": 0.0001, + "step": 8400 + }, + { + "epoch": 2.0, + "eval_loss": 7.416475114041532e-07, + "eval_runtime": 52.1219, + "eval_samples_per_second": 35.839, + "eval_steps_per_second": 8.96, + "step": 8402 + }, + { + "epoch": 2.0019043084979766, + "grad_norm": 0.00039361350354738533, + "learning_rate": 1.6634928191700387e-05, + "loss": 0.0001, + "step": 8410 + }, + { + "epoch": 2.0042846941204475, + "grad_norm": 0.007912525907158852, + "learning_rate": 1.6595255097992544e-05, + "loss": 0.0001, + "step": 8420 + }, + { + "epoch": 2.0066650797429184, + "grad_norm": 0.003857001895084977, + "learning_rate": 1.6555582004284694e-05, + "loss": 0.0001, + "step": 8430 + }, + { + "epoch": 2.0090454653653893, + "grad_norm": 0.002192788990214467, + "learning_rate": 1.6515908910576848e-05, + "loss": 0.0, + "step": 8440 + }, + { + "epoch": 2.01142585098786, + "grad_norm": 0.00107199524063617, + "learning_rate": 1.6476235816869002e-05, + "loss": 0.0002, + "step": 8450 + }, + { + "epoch": 2.013806236610331, + "grad_norm": 0.024036822840571404, + "learning_rate": 1.6436562723161152e-05, + "loss": 0.0001, + "step": 8460 + }, + { + "epoch": 2.0161866222328015, + "grad_norm": 0.000551603501662612, + "learning_rate": 1.6396889629453306e-05, + "loss": 0.0, + "step": 8470 + }, + { + "epoch": 2.0185670078552724, + "grad_norm": 0.001782495528459549, + "learning_rate": 1.635721653574546e-05, + "loss": 0.0001, + "step": 8480 + }, + { + "epoch": 2.0209473934777433, + "grad_norm": 0.030838970094919205, + "learning_rate": 1.631754344203761e-05, + "loss": 0.0001, + "step": 8490 + }, + { + "epoch": 2.023327779100214, + "grad_norm": 0.0005242625484243035, + "learning_rate": 1.6277870348329764e-05, + "loss": 0.0003, + "step": 8500 + }, + { + "epoch": 2.025708164722685, + "grad_norm": 0.001871236483566463, + "learning_rate": 1.6238197254621917e-05, + "loss": 0.0, + "step": 8510 + }, + { + "epoch": 2.028088550345156, + "grad_norm": 0.0005813137395307422, + "learning_rate": 1.6198524160914068e-05, + "loss": 0.0, + "step": 8520 + }, + { + "epoch": 2.030468935967627, + "grad_norm": 0.0007783659384585917, + "learning_rate": 1.615885106720622e-05, + "loss": 0.0, + "step": 8530 + }, + { + "epoch": 2.032849321590098, + "grad_norm": 0.002862844616174698, + "learning_rate": 1.6119177973498375e-05, + "loss": 0.0001, + "step": 8540 + }, + { + "epoch": 2.0352297072125682, + "grad_norm": 0.0016766699263826013, + "learning_rate": 1.607950487979053e-05, + "loss": 0.0, + "step": 8550 + }, + { + "epoch": 2.037610092835039, + "grad_norm": 0.06566356122493744, + "learning_rate": 1.603983178608268e-05, + "loss": 0.0002, + "step": 8560 + }, + { + "epoch": 2.03999047845751, + "grad_norm": 0.0013121259398758411, + "learning_rate": 1.600015869237483e-05, + "loss": 0.0, + "step": 8570 + }, + { + "epoch": 2.042370864079981, + "grad_norm": 0.0012001970317214727, + "learning_rate": 1.5960485598666986e-05, + "loss": 0.0001, + "step": 8580 + }, + { + "epoch": 2.044751249702452, + "grad_norm": 0.008261552080512047, + "learning_rate": 1.5920812504959137e-05, + "loss": 0.0, + "step": 8590 + }, + { + "epoch": 2.0471316353249227, + "grad_norm": 0.0006174147129058838, + "learning_rate": 1.588113941125129e-05, + "loss": 0.0, + "step": 8600 + }, + { + "epoch": 2.0495120209473936, + "grad_norm": 0.005130809266120195, + "learning_rate": 1.5841466317543444e-05, + "loss": 0.0001, + "step": 8610 + }, + { + "epoch": 2.0518924065698645, + "grad_norm": 0.0034670240711420774, + "learning_rate": 1.5801793223835594e-05, + "loss": 0.0004, + "step": 8620 + }, + { + "epoch": 2.054272792192335, + "grad_norm": 0.0055514005944132805, + "learning_rate": 1.5762120130127748e-05, + "loss": 0.0001, + "step": 8630 + }, + { + "epoch": 2.056653177814806, + "grad_norm": 0.0003135903971269727, + "learning_rate": 1.5722447036419902e-05, + "loss": 0.0, + "step": 8640 + }, + { + "epoch": 2.0590335634372767, + "grad_norm": 0.002474389737471938, + "learning_rate": 1.5682773942712052e-05, + "loss": 0.0001, + "step": 8650 + }, + { + "epoch": 2.0614139490597476, + "grad_norm": 0.004792024847120047, + "learning_rate": 1.5643100849004206e-05, + "loss": 0.0, + "step": 8660 + }, + { + "epoch": 2.0637943346822185, + "grad_norm": 0.0030985362827777863, + "learning_rate": 1.560342775529636e-05, + "loss": 0.0001, + "step": 8670 + }, + { + "epoch": 2.0661747203046894, + "grad_norm": 0.004058391321450472, + "learning_rate": 1.5563754661588513e-05, + "loss": 0.0, + "step": 8680 + }, + { + "epoch": 2.0685551059271603, + "grad_norm": 0.00150771695189178, + "learning_rate": 1.5524081567880663e-05, + "loss": 0.0, + "step": 8690 + }, + { + "epoch": 2.070935491549631, + "grad_norm": 0.001020533381961286, + "learning_rate": 1.5484408474172817e-05, + "loss": 0.0, + "step": 8700 + }, + { + "epoch": 2.073315877172102, + "grad_norm": 0.00616106390953064, + "learning_rate": 1.544473538046497e-05, + "loss": 0.0, + "step": 8710 + }, + { + "epoch": 2.0756962627945725, + "grad_norm": 0.0025589261204004288, + "learning_rate": 1.540506228675712e-05, + "loss": 0.0001, + "step": 8720 + }, + { + "epoch": 2.0780766484170434, + "grad_norm": 0.0006466865306720138, + "learning_rate": 1.5365389193049275e-05, + "loss": 0.0, + "step": 8730 + }, + { + "epoch": 2.0804570340395143, + "grad_norm": 0.002343350788578391, + "learning_rate": 1.532571609934143e-05, + "loss": 0.0001, + "step": 8740 + }, + { + "epoch": 2.082837419661985, + "grad_norm": 0.0006717872456647456, + "learning_rate": 1.528604300563358e-05, + "loss": 0.0001, + "step": 8750 + }, + { + "epoch": 2.085217805284456, + "grad_norm": 0.0009957224829122424, + "learning_rate": 1.524636991192573e-05, + "loss": 0.0002, + "step": 8760 + }, + { + "epoch": 2.087598190906927, + "grad_norm": 0.0014106009621173143, + "learning_rate": 1.5206696818217886e-05, + "loss": 0.0, + "step": 8770 + }, + { + "epoch": 2.089978576529398, + "grad_norm": 0.0011065505677834153, + "learning_rate": 1.5167023724510038e-05, + "loss": 0.0001, + "step": 8780 + }, + { + "epoch": 2.092358962151869, + "grad_norm": 0.0027844165451824665, + "learning_rate": 1.512735063080219e-05, + "loss": 0.0001, + "step": 8790 + }, + { + "epoch": 2.0947393477743392, + "grad_norm": 0.0006960778846405447, + "learning_rate": 1.5087677537094344e-05, + "loss": 0.0, + "step": 8800 + }, + { + "epoch": 2.09711973339681, + "grad_norm": 0.0003423156449571252, + "learning_rate": 1.5048004443386496e-05, + "loss": 0.0, + "step": 8810 + }, + { + "epoch": 2.099500119019281, + "grad_norm": 0.0011733579449355602, + "learning_rate": 1.5008331349678648e-05, + "loss": 0.0, + "step": 8820 + }, + { + "epoch": 2.101880504641752, + "grad_norm": 0.004115458112210035, + "learning_rate": 1.4968658255970802e-05, + "loss": 0.0, + "step": 8830 + }, + { + "epoch": 2.104260890264223, + "grad_norm": 0.072359099984169, + "learning_rate": 1.4928985162262954e-05, + "loss": 0.0, + "step": 8840 + }, + { + "epoch": 2.1066412758866937, + "grad_norm": 0.003922273404896259, + "learning_rate": 1.4889312068555106e-05, + "loss": 0.0, + "step": 8850 + }, + { + "epoch": 2.1090216615091646, + "grad_norm": 0.012736503966152668, + "learning_rate": 1.4849638974847261e-05, + "loss": 0.0, + "step": 8860 + }, + { + "epoch": 2.1114020471316355, + "grad_norm": 0.0019338323036208749, + "learning_rate": 1.4809965881139411e-05, + "loss": 0.0001, + "step": 8870 + }, + { + "epoch": 2.1137824327541064, + "grad_norm": 0.0015457593835890293, + "learning_rate": 1.4770292787431563e-05, + "loss": 0.0, + "step": 8880 + }, + { + "epoch": 2.116162818376577, + "grad_norm": 0.0016716497484594584, + "learning_rate": 1.4730619693723719e-05, + "loss": 0.0, + "step": 8890 + }, + { + "epoch": 2.1185432039990477, + "grad_norm": 0.001560089411213994, + "learning_rate": 1.469094660001587e-05, + "loss": 0.0, + "step": 8900 + }, + { + "epoch": 2.1209235896215186, + "grad_norm": 0.0031743065919727087, + "learning_rate": 1.4651273506308023e-05, + "loss": 0.0, + "step": 8910 + }, + { + "epoch": 2.1233039752439895, + "grad_norm": 0.0015614436706528068, + "learning_rate": 1.4611600412600176e-05, + "loss": 0.0, + "step": 8920 + }, + { + "epoch": 2.1256843608664604, + "grad_norm": 0.0005399516085162759, + "learning_rate": 1.4571927318892328e-05, + "loss": 0.0, + "step": 8930 + }, + { + "epoch": 2.1280647464889313, + "grad_norm": 0.0014794693561270833, + "learning_rate": 1.453225422518448e-05, + "loss": 0.0, + "step": 8940 + }, + { + "epoch": 2.130445132111402, + "grad_norm": 0.0024672893341630697, + "learning_rate": 1.4492581131476632e-05, + "loss": 0.0, + "step": 8950 + }, + { + "epoch": 2.132825517733873, + "grad_norm": 0.0013646584702655673, + "learning_rate": 1.4452908037768786e-05, + "loss": 0.0, + "step": 8960 + }, + { + "epoch": 2.1352059033563435, + "grad_norm": 0.07290241867303848, + "learning_rate": 1.4413234944060938e-05, + "loss": 0.0001, + "step": 8970 + }, + { + "epoch": 2.1375862889788144, + "grad_norm": 0.001859787036664784, + "learning_rate": 1.437356185035309e-05, + "loss": 0.0, + "step": 8980 + }, + { + "epoch": 2.1399666746012853, + "grad_norm": 0.001754750614054501, + "learning_rate": 1.4333888756645244e-05, + "loss": 0.0, + "step": 8990 + }, + { + "epoch": 2.142347060223756, + "grad_norm": 0.028476126492023468, + "learning_rate": 1.4294215662937396e-05, + "loss": 0.0, + "step": 9000 + }, + { + "epoch": 2.144727445846227, + "grad_norm": 0.0005994876846671104, + "learning_rate": 1.4254542569229548e-05, + "loss": 0.0, + "step": 9010 + }, + { + "epoch": 2.147107831468698, + "grad_norm": 0.0007879494805820286, + "learning_rate": 1.4214869475521703e-05, + "loss": 0.0, + "step": 9020 + }, + { + "epoch": 2.149488217091169, + "grad_norm": 0.0012654970632866025, + "learning_rate": 1.4175196381813855e-05, + "loss": 0.0, + "step": 9030 + }, + { + "epoch": 2.1518686027136398, + "grad_norm": 0.0018679037457332015, + "learning_rate": 1.4135523288106006e-05, + "loss": 0.0, + "step": 9040 + }, + { + "epoch": 2.1542489883361107, + "grad_norm": 0.0017861429369077086, + "learning_rate": 1.4095850194398161e-05, + "loss": 0.0, + "step": 9050 + }, + { + "epoch": 2.156629373958581, + "grad_norm": 0.006415149662643671, + "learning_rate": 1.4056177100690313e-05, + "loss": 0.0, + "step": 9060 + }, + { + "epoch": 2.159009759581052, + "grad_norm": 0.002842891961336136, + "learning_rate": 1.4016504006982465e-05, + "loss": 0.0, + "step": 9070 + }, + { + "epoch": 2.161390145203523, + "grad_norm": 0.0013869826216250658, + "learning_rate": 1.3976830913274619e-05, + "loss": 0.0, + "step": 9080 + }, + { + "epoch": 2.163770530825994, + "grad_norm": 0.018388478085398674, + "learning_rate": 1.393715781956677e-05, + "loss": 0.0001, + "step": 9090 + }, + { + "epoch": 2.1661509164484647, + "grad_norm": 0.0008245584322139621, + "learning_rate": 1.3897484725858923e-05, + "loss": 0.0, + "step": 9100 + }, + { + "epoch": 2.1685313020709356, + "grad_norm": 0.36837905645370483, + "learning_rate": 1.3857811632151076e-05, + "loss": 0.0001, + "step": 9110 + }, + { + "epoch": 2.1709116876934065, + "grad_norm": 0.002466343343257904, + "learning_rate": 1.3818138538443228e-05, + "loss": 0.0001, + "step": 9120 + }, + { + "epoch": 2.1732920733158774, + "grad_norm": 0.0035982499830424786, + "learning_rate": 1.377846544473538e-05, + "loss": 0.0, + "step": 9130 + }, + { + "epoch": 2.175672458938348, + "grad_norm": 0.13738982379436493, + "learning_rate": 1.3738792351027536e-05, + "loss": 0.0001, + "step": 9140 + }, + { + "epoch": 2.1780528445608187, + "grad_norm": 0.00042806967394426465, + "learning_rate": 1.3699119257319688e-05, + "loss": 0.0, + "step": 9150 + }, + { + "epoch": 2.1804332301832896, + "grad_norm": 0.002727969316765666, + "learning_rate": 1.3659446163611838e-05, + "loss": 0.0, + "step": 9160 + }, + { + "epoch": 2.1828136158057605, + "grad_norm": 0.0010691905627027154, + "learning_rate": 1.361977306990399e-05, + "loss": 0.0, + "step": 9170 + }, + { + "epoch": 2.1851940014282314, + "grad_norm": 0.020881984382867813, + "learning_rate": 1.3580099976196145e-05, + "loss": 0.0, + "step": 9180 + }, + { + "epoch": 2.1875743870507023, + "grad_norm": 0.0019363940227776766, + "learning_rate": 1.3540426882488297e-05, + "loss": 0.0, + "step": 9190 + }, + { + "epoch": 2.189954772673173, + "grad_norm": 0.001359110465273261, + "learning_rate": 1.350075378878045e-05, + "loss": 0.0, + "step": 9200 + }, + { + "epoch": 2.192335158295644, + "grad_norm": 0.0024417322129011154, + "learning_rate": 1.3461080695072603e-05, + "loss": 0.0, + "step": 9210 + }, + { + "epoch": 2.1947155439181145, + "grad_norm": 0.0006399775156751275, + "learning_rate": 1.3421407601364755e-05, + "loss": 0.0, + "step": 9220 + }, + { + "epoch": 2.1970959295405854, + "grad_norm": 0.001347382552921772, + "learning_rate": 1.3381734507656907e-05, + "loss": 0.0001, + "step": 9230 + }, + { + "epoch": 2.1994763151630563, + "grad_norm": 0.002276881132274866, + "learning_rate": 1.334206141394906e-05, + "loss": 0.0, + "step": 9240 + }, + { + "epoch": 2.201856700785527, + "grad_norm": 0.0005205354536883533, + "learning_rate": 1.3302388320241213e-05, + "loss": 0.0, + "step": 9250 + }, + { + "epoch": 2.204237086407998, + "grad_norm": 0.001351204700767994, + "learning_rate": 1.3262715226533365e-05, + "loss": 0.0, + "step": 9260 + }, + { + "epoch": 2.206617472030469, + "grad_norm": 0.00529600540176034, + "learning_rate": 1.322304213282552e-05, + "loss": 0.0002, + "step": 9270 + }, + { + "epoch": 2.20899785765294, + "grad_norm": 0.002000352367758751, + "learning_rate": 1.318336903911767e-05, + "loss": 0.0, + "step": 9280 + }, + { + "epoch": 2.2113782432754108, + "grad_norm": 0.0011036837240681052, + "learning_rate": 1.3143695945409823e-05, + "loss": 0.0, + "step": 9290 + }, + { + "epoch": 2.2137586288978817, + "grad_norm": 0.0023322845809161663, + "learning_rate": 1.3104022851701978e-05, + "loss": 0.0, + "step": 9300 + }, + { + "epoch": 2.216139014520352, + "grad_norm": 0.0029122158885002136, + "learning_rate": 1.306434975799413e-05, + "loss": 0.0, + "step": 9310 + }, + { + "epoch": 2.218519400142823, + "grad_norm": 0.00949085596948862, + "learning_rate": 1.302467666428628e-05, + "loss": 0.0, + "step": 9320 + }, + { + "epoch": 2.220899785765294, + "grad_norm": 0.0013391702668741345, + "learning_rate": 1.2985003570578436e-05, + "loss": 0.0, + "step": 9330 + }, + { + "epoch": 2.223280171387765, + "grad_norm": 0.00047678747796453536, + "learning_rate": 1.2945330476870588e-05, + "loss": 0.0, + "step": 9340 + }, + { + "epoch": 2.2256605570102357, + "grad_norm": 0.0031029602978378534, + "learning_rate": 1.290565738316274e-05, + "loss": 0.0, + "step": 9350 + }, + { + "epoch": 2.2280409426327066, + "grad_norm": 0.00046392931835725904, + "learning_rate": 1.2865984289454892e-05, + "loss": 0.0, + "step": 9360 + }, + { + "epoch": 2.2304213282551775, + "grad_norm": 0.0008917547529563308, + "learning_rate": 1.2826311195747045e-05, + "loss": 0.0, + "step": 9370 + }, + { + "epoch": 2.2328017138776484, + "grad_norm": 0.0039760940708220005, + "learning_rate": 1.2786638102039197e-05, + "loss": 0.0, + "step": 9380 + }, + { + "epoch": 2.235182099500119, + "grad_norm": 0.0009416754473932087, + "learning_rate": 1.274696500833135e-05, + "loss": 0.0, + "step": 9390 + }, + { + "epoch": 2.2375624851225897, + "grad_norm": 0.0008697324083186686, + "learning_rate": 1.2707291914623503e-05, + "loss": 0.0, + "step": 9400 + }, + { + "epoch": 2.2399428707450606, + "grad_norm": 0.00044792311382479966, + "learning_rate": 1.2667618820915655e-05, + "loss": 0.0, + "step": 9410 + }, + { + "epoch": 2.2423232563675315, + "grad_norm": 0.0014049585442990065, + "learning_rate": 1.2627945727207807e-05, + "loss": 0.0, + "step": 9420 + }, + { + "epoch": 2.2447036419900024, + "grad_norm": 0.00259969150647521, + "learning_rate": 1.2588272633499962e-05, + "loss": 0.0, + "step": 9430 + }, + { + "epoch": 2.2470840276124733, + "grad_norm": 0.0012579966569319367, + "learning_rate": 1.2548599539792113e-05, + "loss": 0.0, + "step": 9440 + }, + { + "epoch": 2.249464413234944, + "grad_norm": 0.008475791662931442, + "learning_rate": 1.2508926446084265e-05, + "loss": 0.0, + "step": 9450 + }, + { + "epoch": 2.251844798857415, + "grad_norm": 0.007055677939206362, + "learning_rate": 1.2469253352376418e-05, + "loss": 0.0, + "step": 9460 + }, + { + "epoch": 2.2542251844798855, + "grad_norm": 0.00043771168566308916, + "learning_rate": 1.2429580258668572e-05, + "loss": 0.0, + "step": 9470 + }, + { + "epoch": 2.2566055701023564, + "grad_norm": 0.0004315728147048503, + "learning_rate": 1.2389907164960724e-05, + "loss": 0.0, + "step": 9480 + }, + { + "epoch": 2.2589859557248273, + "grad_norm": 0.0006574731087312102, + "learning_rate": 1.2350234071252876e-05, + "loss": 0.0, + "step": 9490 + }, + { + "epoch": 2.261366341347298, + "grad_norm": 0.000502898299600929, + "learning_rate": 1.231056097754503e-05, + "loss": 0.0, + "step": 9500 + }, + { + "epoch": 2.263746726969769, + "grad_norm": 0.0014464023988693953, + "learning_rate": 1.2270887883837182e-05, + "loss": 0.0, + "step": 9510 + }, + { + "epoch": 2.26612711259224, + "grad_norm": 0.0007312349043786526, + "learning_rate": 1.2231214790129336e-05, + "loss": 0.0, + "step": 9520 + }, + { + "epoch": 2.268507498214711, + "grad_norm": 0.0012411205098032951, + "learning_rate": 1.2191541696421488e-05, + "loss": 0.0, + "step": 9530 + }, + { + "epoch": 2.2708878838371818, + "grad_norm": 0.003359739203006029, + "learning_rate": 1.215186860271364e-05, + "loss": 0.0, + "step": 9540 + }, + { + "epoch": 2.2732682694596527, + "grad_norm": 0.0025401897728443146, + "learning_rate": 1.2112195509005793e-05, + "loss": 0.0, + "step": 9550 + }, + { + "epoch": 2.275648655082123, + "grad_norm": 0.0009357984527014196, + "learning_rate": 1.2072522415297945e-05, + "loss": 0.0, + "step": 9560 + }, + { + "epoch": 2.278029040704594, + "grad_norm": 0.015569353476166725, + "learning_rate": 1.2032849321590097e-05, + "loss": 0.0, + "step": 9570 + }, + { + "epoch": 2.280409426327065, + "grad_norm": 0.0005228265072219074, + "learning_rate": 1.1993176227882251e-05, + "loss": 0.0, + "step": 9580 + }, + { + "epoch": 2.282789811949536, + "grad_norm": 0.0006133327260613441, + "learning_rate": 1.1953503134174405e-05, + "loss": 0.0, + "step": 9590 + }, + { + "epoch": 2.2851701975720067, + "grad_norm": 0.0006283469265326858, + "learning_rate": 1.1913830040466557e-05, + "loss": 0.0, + "step": 9600 + }, + { + "epoch": 2.2875505831944776, + "grad_norm": 0.0017937012016773224, + "learning_rate": 1.1874156946758709e-05, + "loss": 0.0, + "step": 9610 + }, + { + "epoch": 2.2899309688169485, + "grad_norm": 0.00227372907102108, + "learning_rate": 1.183448385305086e-05, + "loss": 0.0, + "step": 9620 + }, + { + "epoch": 2.2923113544394194, + "grad_norm": 0.0007874960429035127, + "learning_rate": 1.1794810759343014e-05, + "loss": 0.0, + "step": 9630 + }, + { + "epoch": 2.2946917400618903, + "grad_norm": 0.0012992926640436053, + "learning_rate": 1.1755137665635168e-05, + "loss": 0.0, + "step": 9640 + }, + { + "epoch": 2.2970721256843607, + "grad_norm": 0.0026856097392737865, + "learning_rate": 1.1715464571927318e-05, + "loss": 0.0001, + "step": 9650 + }, + { + "epoch": 2.2994525113068316, + "grad_norm": 0.027589144185185432, + "learning_rate": 1.1675791478219472e-05, + "loss": 0.0005, + "step": 9660 + }, + { + "epoch": 2.3018328969293025, + "grad_norm": 0.00021341729734558612, + "learning_rate": 1.1636118384511626e-05, + "loss": 0.0, + "step": 9670 + }, + { + "epoch": 2.3042132825517734, + "grad_norm": 0.0005525678861886263, + "learning_rate": 1.1596445290803778e-05, + "loss": 0.0, + "step": 9680 + }, + { + "epoch": 2.3065936681742443, + "grad_norm": 0.0006510653183795512, + "learning_rate": 1.155677219709593e-05, + "loss": 0.0, + "step": 9690 + }, + { + "epoch": 2.308974053796715, + "grad_norm": 0.0011141913710162044, + "learning_rate": 1.1517099103388082e-05, + "loss": 0.0, + "step": 9700 + }, + { + "epoch": 2.311354439419186, + "grad_norm": 0.001998309977352619, + "learning_rate": 1.1477426009680235e-05, + "loss": 0.0, + "step": 9710 + }, + { + "epoch": 2.3137348250416565, + "grad_norm": 0.008638182654976845, + "learning_rate": 1.1437752915972389e-05, + "loss": 0.0, + "step": 9720 + }, + { + "epoch": 2.3161152106641274, + "grad_norm": 0.0004837829037569463, + "learning_rate": 1.139807982226454e-05, + "loss": 0.0, + "step": 9730 + }, + { + "epoch": 2.3184955962865983, + "grad_norm": 0.008834806270897388, + "learning_rate": 1.1358406728556693e-05, + "loss": 0.0, + "step": 9740 + }, + { + "epoch": 2.320875981909069, + "grad_norm": 0.017421774566173553, + "learning_rate": 1.1318733634848847e-05, + "loss": 0.0, + "step": 9750 + }, + { + "epoch": 2.32325636753154, + "grad_norm": 0.0008695673895999789, + "learning_rate": 1.1279060541140999e-05, + "loss": 0.0, + "step": 9760 + }, + { + "epoch": 2.325636753154011, + "grad_norm": 0.007985567674040794, + "learning_rate": 1.1239387447433151e-05, + "loss": 0.0001, + "step": 9770 + }, + { + "epoch": 2.328017138776482, + "grad_norm": 0.0002991770743392408, + "learning_rate": 1.1199714353725305e-05, + "loss": 0.0, + "step": 9780 + }, + { + "epoch": 2.3303975243989528, + "grad_norm": 0.0018964770715683699, + "learning_rate": 1.1160041260017457e-05, + "loss": 0.0, + "step": 9790 + }, + { + "epoch": 2.3327779100214237, + "grad_norm": 0.0003782061976380646, + "learning_rate": 1.112036816630961e-05, + "loss": 0.0, + "step": 9800 + }, + { + "epoch": 2.335158295643894, + "grad_norm": 0.0005394426407292485, + "learning_rate": 1.108069507260176e-05, + "loss": 0.0, + "step": 9810 + }, + { + "epoch": 2.337538681266365, + "grad_norm": 0.0008728650282137096, + "learning_rate": 1.1041021978893914e-05, + "loss": 0.0, + "step": 9820 + }, + { + "epoch": 2.339919066888836, + "grad_norm": 1.026079773902893, + "learning_rate": 1.1001348885186068e-05, + "loss": 0.0001, + "step": 9830 + }, + { + "epoch": 2.342299452511307, + "grad_norm": 0.000987286795862019, + "learning_rate": 1.096167579147822e-05, + "loss": 0.0, + "step": 9840 + }, + { + "epoch": 2.3446798381337777, + "grad_norm": 0.0015003952430561185, + "learning_rate": 1.0922002697770372e-05, + "loss": 0.0, + "step": 9850 + }, + { + "epoch": 2.3470602237562486, + "grad_norm": 0.001296977628953755, + "learning_rate": 1.0882329604062526e-05, + "loss": 0.0, + "step": 9860 + }, + { + "epoch": 2.3494406093787195, + "grad_norm": 0.000640163547359407, + "learning_rate": 1.0842656510354678e-05, + "loss": 0.0, + "step": 9870 + }, + { + "epoch": 2.3518209950011904, + "grad_norm": 0.0009391361963935196, + "learning_rate": 1.0802983416646831e-05, + "loss": 0.0, + "step": 9880 + }, + { + "epoch": 2.3542013806236612, + "grad_norm": 0.0006612977595068514, + "learning_rate": 1.0763310322938983e-05, + "loss": 0.0001, + "step": 9890 + }, + { + "epoch": 2.3565817662461317, + "grad_norm": 0.0014715328579768538, + "learning_rate": 1.0723637229231135e-05, + "loss": 0.0, + "step": 9900 + }, + { + "epoch": 2.3589621518686026, + "grad_norm": 0.0004139976226724684, + "learning_rate": 1.0683964135523289e-05, + "loss": 0.0, + "step": 9910 + }, + { + "epoch": 2.3613425374910735, + "grad_norm": 0.001368595752865076, + "learning_rate": 1.0644291041815441e-05, + "loss": 0.0001, + "step": 9920 + }, + { + "epoch": 2.3637229231135444, + "grad_norm": 0.0010275020031258464, + "learning_rate": 1.0604617948107593e-05, + "loss": 0.0, + "step": 9930 + }, + { + "epoch": 2.3661033087360153, + "grad_norm": 0.0008476102957502007, + "learning_rate": 1.0564944854399747e-05, + "loss": 0.0, + "step": 9940 + }, + { + "epoch": 2.368483694358486, + "grad_norm": 0.019286731258034706, + "learning_rate": 1.0525271760691899e-05, + "loss": 0.0, + "step": 9950 + }, + { + "epoch": 2.370864079980957, + "grad_norm": 0.0007589785964228213, + "learning_rate": 1.0485598666984052e-05, + "loss": 0.0001, + "step": 9960 + }, + { + "epoch": 2.373244465603428, + "grad_norm": 0.0007659016991965473, + "learning_rate": 1.0445925573276204e-05, + "loss": 0.0, + "step": 9970 + }, + { + "epoch": 2.375624851225899, + "grad_norm": 0.0035345428623259068, + "learning_rate": 1.0406252479568356e-05, + "loss": 0.0, + "step": 9980 + }, + { + "epoch": 2.3780052368483693, + "grad_norm": 0.0021891535725444555, + "learning_rate": 1.036657938586051e-05, + "loss": 0.0, + "step": 9990 + }, + { + "epoch": 2.38038562247084, + "grad_norm": 0.2655426263809204, + "learning_rate": 1.0326906292152662e-05, + "loss": 0.001, + "step": 10000 + }, + { + "epoch": 2.382766008093311, + "grad_norm": 0.0008121923892758787, + "learning_rate": 1.0287233198444816e-05, + "loss": 0.0001, + "step": 10010 + }, + { + "epoch": 2.385146393715782, + "grad_norm": 0.006638567429035902, + "learning_rate": 1.0247560104736968e-05, + "loss": 0.0, + "step": 10020 + }, + { + "epoch": 2.387526779338253, + "grad_norm": 0.0033031317871063948, + "learning_rate": 1.020788701102912e-05, + "loss": 0.0, + "step": 10030 + }, + { + "epoch": 2.3899071649607238, + "grad_norm": 0.0004701575671788305, + "learning_rate": 1.0168213917321274e-05, + "loss": 0.0002, + "step": 10040 + }, + { + "epoch": 2.3922875505831946, + "grad_norm": 0.007627520710229874, + "learning_rate": 1.0128540823613426e-05, + "loss": 0.0, + "step": 10050 + }, + { + "epoch": 2.394667936205665, + "grad_norm": 0.0011233366094529629, + "learning_rate": 1.0088867729905578e-05, + "loss": 0.0, + "step": 10060 + }, + { + "epoch": 2.397048321828136, + "grad_norm": 0.0003728682058863342, + "learning_rate": 1.0049194636197731e-05, + "loss": 0.0001, + "step": 10070 + }, + { + "epoch": 2.399428707450607, + "grad_norm": 0.0018078387947753072, + "learning_rate": 1.0009521542489885e-05, + "loss": 0.0, + "step": 10080 + }, + { + "epoch": 2.4018090930730778, + "grad_norm": 0.004032574128359556, + "learning_rate": 9.969848448782037e-06, + "loss": 0.0, + "step": 10090 + }, + { + "epoch": 2.4041894786955487, + "grad_norm": 0.0010251044295728207, + "learning_rate": 9.930175355074189e-06, + "loss": 0.0, + "step": 10100 + }, + { + "epoch": 2.4065698643180196, + "grad_norm": 0.0012369100004434586, + "learning_rate": 9.890502261366341e-06, + "loss": 0.0001, + "step": 10110 + }, + { + "epoch": 2.4089502499404905, + "grad_norm": 0.0008841692470014095, + "learning_rate": 9.850829167658495e-06, + "loss": 0.0, + "step": 10120 + }, + { + "epoch": 2.4113306355629613, + "grad_norm": 0.05967468023300171, + "learning_rate": 9.811156073950648e-06, + "loss": 0.0001, + "step": 10130 + }, + { + "epoch": 2.4137110211854322, + "grad_norm": 0.002878790721297264, + "learning_rate": 9.771482980242799e-06, + "loss": 0.0, + "step": 10140 + }, + { + "epoch": 2.4160914068079027, + "grad_norm": 0.0005018101655878127, + "learning_rate": 9.731809886534952e-06, + "loss": 0.0, + "step": 10150 + }, + { + "epoch": 2.4184717924303736, + "grad_norm": 0.0015724776312708855, + "learning_rate": 9.692136792827106e-06, + "loss": 0.0003, + "step": 10160 + }, + { + "epoch": 2.4208521780528445, + "grad_norm": 0.004237225744873285, + "learning_rate": 9.652463699119258e-06, + "loss": 0.0, + "step": 10170 + }, + { + "epoch": 2.4232325636753154, + "grad_norm": 0.00131317344494164, + "learning_rate": 9.61279060541141e-06, + "loss": 0.0, + "step": 10180 + }, + { + "epoch": 2.4256129492977863, + "grad_norm": 0.002073557348921895, + "learning_rate": 9.573117511703564e-06, + "loss": 0.0, + "step": 10190 + }, + { + "epoch": 2.427993334920257, + "grad_norm": 0.0045993453823029995, + "learning_rate": 9.533444417995716e-06, + "loss": 0.0002, + "step": 10200 + }, + { + "epoch": 2.430373720542728, + "grad_norm": 0.001618819311261177, + "learning_rate": 9.49377132428787e-06, + "loss": 0.0001, + "step": 10210 + }, + { + "epoch": 2.432754106165199, + "grad_norm": 0.00304215750657022, + "learning_rate": 9.45409823058002e-06, + "loss": 0.0, + "step": 10220 + }, + { + "epoch": 2.43513449178767, + "grad_norm": 0.0007059932686388493, + "learning_rate": 9.414425136872173e-06, + "loss": 0.0, + "step": 10230 + }, + { + "epoch": 2.4375148774101403, + "grad_norm": 0.0031899004243314266, + "learning_rate": 9.374752043164327e-06, + "loss": 0.0, + "step": 10240 + }, + { + "epoch": 2.439895263032611, + "grad_norm": 0.002477418165653944, + "learning_rate": 9.33507894945648e-06, + "loss": 0.0, + "step": 10250 + }, + { + "epoch": 2.442275648655082, + "grad_norm": 0.00046585980453528464, + "learning_rate": 9.295405855748631e-06, + "loss": 0.0, + "step": 10260 + }, + { + "epoch": 2.444656034277553, + "grad_norm": 0.0005838835495524108, + "learning_rate": 9.255732762040785e-06, + "loss": 0.0, + "step": 10270 + }, + { + "epoch": 2.447036419900024, + "grad_norm": 0.001370543148368597, + "learning_rate": 9.216059668332937e-06, + "loss": 0.0, + "step": 10280 + }, + { + "epoch": 2.4494168055224947, + "grad_norm": 0.0016045079100877047, + "learning_rate": 9.17638657462509e-06, + "loss": 0.0, + "step": 10290 + }, + { + "epoch": 2.4517971911449656, + "grad_norm": 0.0020401678048074245, + "learning_rate": 9.136713480917243e-06, + "loss": 0.0001, + "step": 10300 + }, + { + "epoch": 2.454177576767436, + "grad_norm": 0.00043605471728369594, + "learning_rate": 9.097040387209395e-06, + "loss": 0.0, + "step": 10310 + }, + { + "epoch": 2.456557962389907, + "grad_norm": 0.0005910994368605316, + "learning_rate": 9.057367293501548e-06, + "loss": 0.0, + "step": 10320 + }, + { + "epoch": 2.458938348012378, + "grad_norm": 0.0005397904315032065, + "learning_rate": 9.0176941997937e-06, + "loss": 0.0, + "step": 10330 + }, + { + "epoch": 2.4613187336348488, + "grad_norm": 0.014002328738570213, + "learning_rate": 8.978021106085852e-06, + "loss": 0.0, + "step": 10340 + }, + { + "epoch": 2.4636991192573197, + "grad_norm": 0.0011001590173691511, + "learning_rate": 8.938348012378006e-06, + "loss": 0.0001, + "step": 10350 + }, + { + "epoch": 2.4660795048797906, + "grad_norm": 0.0029695210978388786, + "learning_rate": 8.898674918670158e-06, + "loss": 0.0, + "step": 10360 + }, + { + "epoch": 2.4684598905022614, + "grad_norm": 0.00410072086378932, + "learning_rate": 8.859001824962312e-06, + "loss": 0.0, + "step": 10370 + }, + { + "epoch": 2.4708402761247323, + "grad_norm": 0.0005128366756252944, + "learning_rate": 8.819328731254464e-06, + "loss": 0.0, + "step": 10380 + }, + { + "epoch": 2.4732206617472032, + "grad_norm": 0.0021037100814282894, + "learning_rate": 8.779655637546616e-06, + "loss": 0.0, + "step": 10390 + }, + { + "epoch": 2.4756010473696737, + "grad_norm": 0.0005958130932413042, + "learning_rate": 8.73998254383877e-06, + "loss": 0.0, + "step": 10400 + }, + { + "epoch": 2.4779814329921446, + "grad_norm": 0.0021961687598377466, + "learning_rate": 8.700309450130921e-06, + "loss": 0.0001, + "step": 10410 + }, + { + "epoch": 2.4803618186146155, + "grad_norm": 0.0011290331603959203, + "learning_rate": 8.660636356423073e-06, + "loss": 0.0, + "step": 10420 + }, + { + "epoch": 2.4827422042370864, + "grad_norm": 0.003101737704128027, + "learning_rate": 8.620963262715227e-06, + "loss": 0.0, + "step": 10430 + }, + { + "epoch": 2.4851225898595573, + "grad_norm": 0.010269707068800926, + "learning_rate": 8.581290169007379e-06, + "loss": 0.0, + "step": 10440 + }, + { + "epoch": 2.487502975482028, + "grad_norm": 0.0006016406114213169, + "learning_rate": 8.541617075299533e-06, + "loss": 0.0, + "step": 10450 + }, + { + "epoch": 2.489883361104499, + "grad_norm": 0.012370145879685879, + "learning_rate": 8.501943981591685e-06, + "loss": 0.0, + "step": 10460 + }, + { + "epoch": 2.49226374672697, + "grad_norm": 0.002209730911999941, + "learning_rate": 8.462270887883837e-06, + "loss": 0.0, + "step": 10470 + }, + { + "epoch": 2.494644132349441, + "grad_norm": 0.0002978077973239124, + "learning_rate": 8.42259779417599e-06, + "loss": 0.0, + "step": 10480 + }, + { + "epoch": 2.4970245179719113, + "grad_norm": 0.0006728899315930903, + "learning_rate": 8.382924700468144e-06, + "loss": 0.0, + "step": 10490 + }, + { + "epoch": 2.499404903594382, + "grad_norm": 0.0008764348458498716, + "learning_rate": 8.343251606760296e-06, + "loss": 0.0, + "step": 10500 + }, + { + "epoch": 2.501785289216853, + "grad_norm": 0.001580104581080377, + "learning_rate": 8.303578513052448e-06, + "loss": 0.0, + "step": 10510 + }, + { + "epoch": 2.504165674839324, + "grad_norm": 0.0003571589768398553, + "learning_rate": 8.2639054193446e-06, + "loss": 0.0, + "step": 10520 + }, + { + "epoch": 2.506546060461795, + "grad_norm": 0.004758020397275686, + "learning_rate": 8.224232325636754e-06, + "loss": 0.0, + "step": 10530 + }, + { + "epoch": 2.5089264460842657, + "grad_norm": 0.0013680767733603716, + "learning_rate": 8.184559231928906e-06, + "loss": 0.0, + "step": 10540 + }, + { + "epoch": 2.5113068317067366, + "grad_norm": 0.0010658970568329096, + "learning_rate": 8.144886138221058e-06, + "loss": 0.0, + "step": 10550 + }, + { + "epoch": 2.513687217329207, + "grad_norm": 0.0007452235440723598, + "learning_rate": 8.105213044513212e-06, + "loss": 0.0002, + "step": 10560 + }, + { + "epoch": 2.5160676029516784, + "grad_norm": 0.0006281470414251089, + "learning_rate": 8.065539950805365e-06, + "loss": 0.0, + "step": 10570 + }, + { + "epoch": 2.518447988574149, + "grad_norm": 0.0007866009837016463, + "learning_rate": 8.025866857097517e-06, + "loss": 0.0, + "step": 10580 + }, + { + "epoch": 2.5208283741966198, + "grad_norm": 0.00039683215436525643, + "learning_rate": 7.98619376338967e-06, + "loss": 0.0, + "step": 10590 + }, + { + "epoch": 2.5232087598190907, + "grad_norm": 0.0009177124593406916, + "learning_rate": 7.946520669681823e-06, + "loss": 0.0, + "step": 10600 + }, + { + "epoch": 2.5255891454415615, + "grad_norm": 0.00038271176163107157, + "learning_rate": 7.906847575973975e-06, + "loss": 0.0, + "step": 10610 + }, + { + "epoch": 2.5279695310640324, + "grad_norm": 0.00041592001798562706, + "learning_rate": 7.867174482266127e-06, + "loss": 0.0, + "step": 10620 + }, + { + "epoch": 2.5303499166865033, + "grad_norm": 0.0009455361287109554, + "learning_rate": 7.827501388558279e-06, + "loss": 0.0, + "step": 10630 + }, + { + "epoch": 2.5327303023089742, + "grad_norm": 0.0005674211424775422, + "learning_rate": 7.787828294850433e-06, + "loss": 0.0001, + "step": 10640 + }, + { + "epoch": 2.5351106879314447, + "grad_norm": 0.008180541917681694, + "learning_rate": 7.748155201142586e-06, + "loss": 0.0001, + "step": 10650 + }, + { + "epoch": 2.537491073553916, + "grad_norm": 0.006044210400432348, + "learning_rate": 7.708482107434738e-06, + "loss": 0.0, + "step": 10660 + }, + { + "epoch": 2.5398714591763865, + "grad_norm": 0.00039350485894829035, + "learning_rate": 7.66880901372689e-06, + "loss": 0.0, + "step": 10670 + }, + { + "epoch": 2.5422518447988574, + "grad_norm": 0.0007660723640583456, + "learning_rate": 7.629135920019044e-06, + "loss": 0.0, + "step": 10680 + }, + { + "epoch": 2.5446322304213282, + "grad_norm": 0.001309241633862257, + "learning_rate": 7.589462826311196e-06, + "loss": 0.0, + "step": 10690 + }, + { + "epoch": 2.547012616043799, + "grad_norm": 0.023756977170705795, + "learning_rate": 7.549789732603349e-06, + "loss": 0.0002, + "step": 10700 + }, + { + "epoch": 2.54939300166627, + "grad_norm": 0.002046087756752968, + "learning_rate": 7.510116638895501e-06, + "loss": 0.0, + "step": 10710 + }, + { + "epoch": 2.551773387288741, + "grad_norm": 0.0047508729621768, + "learning_rate": 7.470443545187654e-06, + "loss": 0.0, + "step": 10720 + }, + { + "epoch": 2.554153772911212, + "grad_norm": 0.0010949558345600963, + "learning_rate": 7.4307704514798075e-06, + "loss": 0.0, + "step": 10730 + }, + { + "epoch": 2.5565341585336823, + "grad_norm": 0.010589073412120342, + "learning_rate": 7.391097357771959e-06, + "loss": 0.0, + "step": 10740 + }, + { + "epoch": 2.558914544156153, + "grad_norm": 0.0006332534248940647, + "learning_rate": 7.351424264064112e-06, + "loss": 0.0, + "step": 10750 + }, + { + "epoch": 2.561294929778624, + "grad_norm": 0.00027181513723917305, + "learning_rate": 7.311751170356265e-06, + "loss": 0.0, + "step": 10760 + }, + { + "epoch": 2.563675315401095, + "grad_norm": 0.0036267938558012247, + "learning_rate": 7.272078076648417e-06, + "loss": 0.0, + "step": 10770 + }, + { + "epoch": 2.566055701023566, + "grad_norm": 0.002974023576825857, + "learning_rate": 7.23240498294057e-06, + "loss": 0.0, + "step": 10780 + }, + { + "epoch": 2.5684360866460367, + "grad_norm": 0.0005654848064295948, + "learning_rate": 7.192731889232724e-06, + "loss": 0.0001, + "step": 10790 + }, + { + "epoch": 2.5708164722685076, + "grad_norm": 0.001776995835825801, + "learning_rate": 7.153058795524875e-06, + "loss": 0.0, + "step": 10800 + }, + { + "epoch": 2.573196857890978, + "grad_norm": 0.0031643370166420937, + "learning_rate": 7.1133857018170286e-06, + "loss": 0.0, + "step": 10810 + }, + { + "epoch": 2.5755772435134494, + "grad_norm": 0.0006117381271906197, + "learning_rate": 7.07371260810918e-06, + "loss": 0.0, + "step": 10820 + }, + { + "epoch": 2.57795762913592, + "grad_norm": 0.00013082509394735098, + "learning_rate": 7.034039514401333e-06, + "loss": 0.0, + "step": 10830 + }, + { + "epoch": 2.5803380147583908, + "grad_norm": 0.009411906823515892, + "learning_rate": 6.994366420693486e-06, + "loss": 0.0, + "step": 10840 + }, + { + "epoch": 2.5827184003808616, + "grad_norm": 0.007766501512378454, + "learning_rate": 6.954693326985638e-06, + "loss": 0.0, + "step": 10850 + }, + { + "epoch": 2.5850987860033325, + "grad_norm": 0.001036152825690806, + "learning_rate": 6.915020233277791e-06, + "loss": 0.0, + "step": 10860 + }, + { + "epoch": 2.5874791716258034, + "grad_norm": 0.0007062302902340889, + "learning_rate": 6.875347139569945e-06, + "loss": 0.0, + "step": 10870 + }, + { + "epoch": 2.5898595572482743, + "grad_norm": 0.004976709373295307, + "learning_rate": 6.835674045862096e-06, + "loss": 0.0, + "step": 10880 + }, + { + "epoch": 2.592239942870745, + "grad_norm": 0.0005074761575087905, + "learning_rate": 6.79600095215425e-06, + "loss": 0.0, + "step": 10890 + }, + { + "epoch": 2.5946203284932157, + "grad_norm": 0.0028977631591260433, + "learning_rate": 6.7563278584464025e-06, + "loss": 0.0, + "step": 10900 + }, + { + "epoch": 2.597000714115687, + "grad_norm": 0.004557565785944462, + "learning_rate": 6.7166547647385545e-06, + "loss": 0.0, + "step": 10910 + }, + { + "epoch": 2.5993810997381575, + "grad_norm": 0.0018358832458034158, + "learning_rate": 6.676981671030707e-06, + "loss": 0.0, + "step": 10920 + }, + { + "epoch": 2.6017614853606283, + "grad_norm": 0.0014729060931131244, + "learning_rate": 6.637308577322859e-06, + "loss": 0.0, + "step": 10930 + }, + { + "epoch": 2.6041418709830992, + "grad_norm": 0.0004332439857535064, + "learning_rate": 6.597635483615012e-06, + "loss": 0.0, + "step": 10940 + }, + { + "epoch": 2.60652225660557, + "grad_norm": 0.0009114540298469365, + "learning_rate": 6.557962389907166e-06, + "loss": 0.0, + "step": 10950 + }, + { + "epoch": 2.608902642228041, + "grad_norm": 0.010355968959629536, + "learning_rate": 6.518289296199318e-06, + "loss": 0.0, + "step": 10960 + }, + { + "epoch": 2.611283027850512, + "grad_norm": 0.054084401577711105, + "learning_rate": 6.478616202491471e-06, + "loss": 0.0, + "step": 10970 + }, + { + "epoch": 2.613663413472983, + "grad_norm": 0.0009903626050800085, + "learning_rate": 6.438943108783624e-06, + "loss": 0.0, + "step": 10980 + }, + { + "epoch": 2.6160437990954533, + "grad_norm": 0.00019378839351702482, + "learning_rate": 6.399270015075776e-06, + "loss": 0.0, + "step": 10990 + }, + { + "epoch": 2.618424184717924, + "grad_norm": 0.0006563541246578097, + "learning_rate": 6.3595969213679285e-06, + "loss": 0.0, + "step": 11000 + }, + { + "epoch": 2.620804570340395, + "grad_norm": 0.0006744746351614594, + "learning_rate": 6.3199238276600805e-06, + "loss": 0.0, + "step": 11010 + }, + { + "epoch": 2.623184955962866, + "grad_norm": 0.0011966971214860678, + "learning_rate": 6.280250733952233e-06, + "loss": 0.0, + "step": 11020 + }, + { + "epoch": 2.625565341585337, + "grad_norm": 0.0017309453105553985, + "learning_rate": 6.240577640244387e-06, + "loss": 0.0, + "step": 11030 + }, + { + "epoch": 2.6279457272078077, + "grad_norm": 0.0008661380270496011, + "learning_rate": 6.200904546536539e-06, + "loss": 0.0005, + "step": 11040 + }, + { + "epoch": 2.6303261128302786, + "grad_norm": 0.0003683891554828733, + "learning_rate": 6.161231452828692e-06, + "loss": 0.0, + "step": 11050 + }, + { + "epoch": 2.6327064984527495, + "grad_norm": 0.0005742148496210575, + "learning_rate": 6.121558359120845e-06, + "loss": 0.0, + "step": 11060 + }, + { + "epoch": 2.6350868840752204, + "grad_norm": 0.0010009456891566515, + "learning_rate": 6.0818852654129976e-06, + "loss": 0.0, + "step": 11070 + }, + { + "epoch": 2.637467269697691, + "grad_norm": 0.0008674330892972648, + "learning_rate": 6.0422121717051496e-06, + "loss": 0.0001, + "step": 11080 + }, + { + "epoch": 2.6398476553201617, + "grad_norm": 0.00011453252227511257, + "learning_rate": 6.002539077997302e-06, + "loss": 0.0, + "step": 11090 + }, + { + "epoch": 2.6422280409426326, + "grad_norm": 0.0014997412217780948, + "learning_rate": 5.962865984289455e-06, + "loss": 0.0, + "step": 11100 + }, + { + "epoch": 2.6446084265651035, + "grad_norm": 0.0013535526813939214, + "learning_rate": 5.923192890581608e-06, + "loss": 0.0, + "step": 11110 + }, + { + "epoch": 2.6469888121875744, + "grad_norm": 0.0010607549920678139, + "learning_rate": 5.883519796873761e-06, + "loss": 0.0, + "step": 11120 + }, + { + "epoch": 2.6493691978100453, + "grad_norm": 0.001384345581755042, + "learning_rate": 5.843846703165913e-06, + "loss": 0.0, + "step": 11130 + }, + { + "epoch": 2.651749583432516, + "grad_norm": 0.009620246477425098, + "learning_rate": 5.804173609458066e-06, + "loss": 0.0, + "step": 11140 + }, + { + "epoch": 2.6541299690549867, + "grad_norm": 0.004576113075017929, + "learning_rate": 5.764500515750219e-06, + "loss": 0.0, + "step": 11150 + }, + { + "epoch": 2.656510354677458, + "grad_norm": 0.0007963149109855294, + "learning_rate": 5.7248274220423715e-06, + "loss": 0.0, + "step": 11160 + }, + { + "epoch": 2.6588907402999284, + "grad_norm": 0.0005275904550217092, + "learning_rate": 5.6851543283345235e-06, + "loss": 0.0, + "step": 11170 + }, + { + "epoch": 2.6612711259223993, + "grad_norm": 0.0007748051430098712, + "learning_rate": 5.645481234626677e-06, + "loss": 0.0, + "step": 11180 + }, + { + "epoch": 2.6636515115448702, + "grad_norm": 0.0005676033324562013, + "learning_rate": 5.605808140918829e-06, + "loss": 0.0, + "step": 11190 + }, + { + "epoch": 2.666031897167341, + "grad_norm": 0.0009870273061096668, + "learning_rate": 5.566135047210982e-06, + "loss": 0.0, + "step": 11200 + }, + { + "epoch": 2.668412282789812, + "grad_norm": 0.0004960622172802687, + "learning_rate": 5.526461953503134e-06, + "loss": 0.0, + "step": 11210 + }, + { + "epoch": 2.670792668412283, + "grad_norm": 0.2789072096347809, + "learning_rate": 5.486788859795288e-06, + "loss": 0.0001, + "step": 11220 + }, + { + "epoch": 2.673173054034754, + "grad_norm": 0.004494486376643181, + "learning_rate": 5.44711576608744e-06, + "loss": 0.0001, + "step": 11230 + }, + { + "epoch": 2.6755534396572243, + "grad_norm": 0.0009736506035551429, + "learning_rate": 5.407442672379593e-06, + "loss": 0.0, + "step": 11240 + }, + { + "epoch": 2.677933825279695, + "grad_norm": 0.0027844863943755627, + "learning_rate": 5.367769578671745e-06, + "loss": 0.0, + "step": 11250 + }, + { + "epoch": 2.680314210902166, + "grad_norm": 0.013426104560494423, + "learning_rate": 5.328096484963898e-06, + "loss": 0.0, + "step": 11260 + }, + { + "epoch": 2.682694596524637, + "grad_norm": 0.0002785604156088084, + "learning_rate": 5.28842339125605e-06, + "loss": 0.0, + "step": 11270 + }, + { + "epoch": 2.685074982147108, + "grad_norm": 0.0007079096976667643, + "learning_rate": 5.248750297548203e-06, + "loss": 0.0, + "step": 11280 + }, + { + "epoch": 2.6874553677695787, + "grad_norm": 0.0004877845640294254, + "learning_rate": 5.209077203840355e-06, + "loss": 0.0001, + "step": 11290 + }, + { + "epoch": 2.6898357533920496, + "grad_norm": 0.029308408498764038, + "learning_rate": 5.169404110132509e-06, + "loss": 0.0, + "step": 11300 + }, + { + "epoch": 2.6922161390145205, + "grad_norm": 0.0011891064932569861, + "learning_rate": 5.129731016424661e-06, + "loss": 0.0, + "step": 11310 + }, + { + "epoch": 2.6945965246369914, + "grad_norm": 0.009328281506896019, + "learning_rate": 5.090057922716814e-06, + "loss": 0.0, + "step": 11320 + }, + { + "epoch": 2.696976910259462, + "grad_norm": 0.0010127691784873605, + "learning_rate": 5.0503848290089666e-06, + "loss": 0.0, + "step": 11330 + }, + { + "epoch": 2.6993572958819327, + "grad_norm": 0.0006704577244818211, + "learning_rate": 5.010711735301119e-06, + "loss": 0.0, + "step": 11340 + }, + { + "epoch": 2.7017376815044036, + "grad_norm": 0.0015914466930553317, + "learning_rate": 4.971038641593271e-06, + "loss": 0.0, + "step": 11350 + }, + { + "epoch": 2.7041180671268745, + "grad_norm": 0.00046926282811909914, + "learning_rate": 4.931365547885424e-06, + "loss": 0.0, + "step": 11360 + }, + { + "epoch": 2.7064984527493454, + "grad_norm": 0.0008572743972763419, + "learning_rate": 4.891692454177577e-06, + "loss": 0.0, + "step": 11370 + }, + { + "epoch": 2.7088788383718163, + "grad_norm": 0.001012885244563222, + "learning_rate": 4.85201936046973e-06, + "loss": 0.0, + "step": 11380 + }, + { + "epoch": 2.711259223994287, + "grad_norm": 0.000291361880954355, + "learning_rate": 4.812346266761882e-06, + "loss": 0.0, + "step": 11390 + }, + { + "epoch": 2.7136396096167577, + "grad_norm": 0.001445894013158977, + "learning_rate": 4.772673173054035e-06, + "loss": 0.0, + "step": 11400 + }, + { + "epoch": 2.716019995239229, + "grad_norm": 0.0007329813088290393, + "learning_rate": 4.733000079346188e-06, + "loss": 0.0, + "step": 11410 + }, + { + "epoch": 2.7184003808616994, + "grad_norm": 0.02237352356314659, + "learning_rate": 4.6933269856383405e-06, + "loss": 0.0, + "step": 11420 + }, + { + "epoch": 2.7207807664841703, + "grad_norm": 0.0004787015204783529, + "learning_rate": 4.6536538919304925e-06, + "loss": 0.0, + "step": 11430 + }, + { + "epoch": 2.7231611521066412, + "grad_norm": 0.0011766423704102635, + "learning_rate": 4.613980798222645e-06, + "loss": 0.0, + "step": 11440 + }, + { + "epoch": 2.725541537729112, + "grad_norm": 0.0003720026579685509, + "learning_rate": 4.574307704514798e-06, + "loss": 0.0, + "step": 11450 + }, + { + "epoch": 2.727921923351583, + "grad_norm": 0.0004271367215551436, + "learning_rate": 4.534634610806951e-06, + "loss": 0.0, + "step": 11460 + }, + { + "epoch": 2.730302308974054, + "grad_norm": 0.001319264993071556, + "learning_rate": 4.494961517099103e-06, + "loss": 0.0, + "step": 11470 + }, + { + "epoch": 2.732682694596525, + "grad_norm": 0.0012237573973834515, + "learning_rate": 4.455288423391257e-06, + "loss": 0.0, + "step": 11480 + }, + { + "epoch": 2.7350630802189952, + "grad_norm": 0.00044418079778552055, + "learning_rate": 4.415615329683409e-06, + "loss": 0.0, + "step": 11490 + }, + { + "epoch": 2.7374434658414666, + "grad_norm": 0.0009368477039970458, + "learning_rate": 4.375942235975562e-06, + "loss": 0.0, + "step": 11500 + }, + { + "epoch": 2.739823851463937, + "grad_norm": 0.0015390801709145308, + "learning_rate": 4.336269142267714e-06, + "loss": 0.0001, + "step": 11510 + }, + { + "epoch": 2.742204237086408, + "grad_norm": 0.00022943236399441957, + "learning_rate": 4.296596048559867e-06, + "loss": 0.0, + "step": 11520 + }, + { + "epoch": 2.744584622708879, + "grad_norm": 0.0031924904324114323, + "learning_rate": 4.256922954852019e-06, + "loss": 0.0, + "step": 11530 + }, + { + "epoch": 2.7469650083313497, + "grad_norm": 0.0011005508713424206, + "learning_rate": 4.217249861144172e-06, + "loss": 0.0, + "step": 11540 + }, + { + "epoch": 2.7493453939538206, + "grad_norm": 0.00039162219036370516, + "learning_rate": 4.177576767436325e-06, + "loss": 0.0, + "step": 11550 + }, + { + "epoch": 2.7517257795762915, + "grad_norm": 0.0011376795591786504, + "learning_rate": 4.137903673728478e-06, + "loss": 0.0, + "step": 11560 + }, + { + "epoch": 2.7541061651987624, + "grad_norm": 0.0005944286240264773, + "learning_rate": 4.09823058002063e-06, + "loss": 0.0, + "step": 11570 + }, + { + "epoch": 2.756486550821233, + "grad_norm": 0.0007298539276234806, + "learning_rate": 4.058557486312783e-06, + "loss": 0.0, + "step": 11580 + }, + { + "epoch": 2.7588669364437037, + "grad_norm": 0.00018211067072115839, + "learning_rate": 4.0188843926049356e-06, + "loss": 0.0, + "step": 11590 + }, + { + "epoch": 2.7612473220661746, + "grad_norm": 0.0034182893577963114, + "learning_rate": 3.9792112988970884e-06, + "loss": 0.0, + "step": 11600 + }, + { + "epoch": 2.7636277076886455, + "grad_norm": 0.000364614010322839, + "learning_rate": 3.939538205189241e-06, + "loss": 0.0, + "step": 11610 + }, + { + "epoch": 2.7660080933111164, + "grad_norm": 0.0021814818028360605, + "learning_rate": 3.899865111481393e-06, + "loss": 0.0, + "step": 11620 + }, + { + "epoch": 2.7683884789335873, + "grad_norm": 0.0014812530716881156, + "learning_rate": 3.860192017773546e-06, + "loss": 0.0001, + "step": 11630 + }, + { + "epoch": 2.770768864556058, + "grad_norm": 0.0005358079797588289, + "learning_rate": 3.820518924065699e-06, + "loss": 0.0, + "step": 11640 + }, + { + "epoch": 2.7731492501785286, + "grad_norm": 0.00028996021137572825, + "learning_rate": 3.7808458303578514e-06, + "loss": 0.0, + "step": 11650 + }, + { + "epoch": 2.775529635801, + "grad_norm": 0.001182155217975378, + "learning_rate": 3.741172736650004e-06, + "loss": 0.0, + "step": 11660 + }, + { + "epoch": 2.7779100214234704, + "grad_norm": 0.00023413899180013686, + "learning_rate": 3.701499642942157e-06, + "loss": 0.0, + "step": 11670 + }, + { + "epoch": 2.7802904070459413, + "grad_norm": 0.0006019670399837196, + "learning_rate": 3.6618265492343095e-06, + "loss": 0.0, + "step": 11680 + }, + { + "epoch": 2.782670792668412, + "grad_norm": 0.0004944771062582731, + "learning_rate": 3.622153455526462e-06, + "loss": 0.0, + "step": 11690 + }, + { + "epoch": 2.785051178290883, + "grad_norm": 6.98843869031407e-05, + "learning_rate": 3.5824803618186144e-06, + "loss": 0.0, + "step": 11700 + }, + { + "epoch": 2.787431563913354, + "grad_norm": 0.0005101510905660689, + "learning_rate": 3.5428072681107677e-06, + "loss": 0.0, + "step": 11710 + }, + { + "epoch": 2.789811949535825, + "grad_norm": 0.00034247711300849915, + "learning_rate": 3.50313417440292e-06, + "loss": 0.0, + "step": 11720 + }, + { + "epoch": 2.792192335158296, + "grad_norm": 0.00044277720735408366, + "learning_rate": 3.4634610806950725e-06, + "loss": 0.0, + "step": 11730 + }, + { + "epoch": 2.7945727207807662, + "grad_norm": 0.0005088172620162368, + "learning_rate": 3.423787986987225e-06, + "loss": 0.0, + "step": 11740 + }, + { + "epoch": 2.7969531064032376, + "grad_norm": 0.00021512300008907914, + "learning_rate": 3.384114893279378e-06, + "loss": 0.0, + "step": 11750 + }, + { + "epoch": 2.799333492025708, + "grad_norm": 0.0007052098517306149, + "learning_rate": 3.3444417995715306e-06, + "loss": 0.0, + "step": 11760 + }, + { + "epoch": 2.801713877648179, + "grad_norm": 0.036882251501083374, + "learning_rate": 3.304768705863683e-06, + "loss": 0.0, + "step": 11770 + }, + { + "epoch": 2.80409426327065, + "grad_norm": 0.00013749166100751609, + "learning_rate": 3.2650956121558363e-06, + "loss": 0.0, + "step": 11780 + }, + { + "epoch": 2.8064746488931207, + "grad_norm": 0.0006571552366949618, + "learning_rate": 3.2254225184479888e-06, + "loss": 0.0, + "step": 11790 + }, + { + "epoch": 2.8088550345155916, + "grad_norm": 0.0008290376281365752, + "learning_rate": 3.185749424740141e-06, + "loss": 0.0, + "step": 11800 + }, + { + "epoch": 2.8112354201380625, + "grad_norm": 8.49374700919725e-05, + "learning_rate": 3.146076331032294e-06, + "loss": 0.0, + "step": 11810 + }, + { + "epoch": 2.8136158057605334, + "grad_norm": 0.00033748464193195105, + "learning_rate": 3.1064032373244465e-06, + "loss": 0.0, + "step": 11820 + }, + { + "epoch": 2.815996191383004, + "grad_norm": 0.0003914514381904155, + "learning_rate": 3.0667301436165993e-06, + "loss": 0.0, + "step": 11830 + }, + { + "epoch": 2.8183765770054747, + "grad_norm": 0.00029730124515481293, + "learning_rate": 3.0270570499087517e-06, + "loss": 0.0, + "step": 11840 + }, + { + "epoch": 2.8207569626279456, + "grad_norm": 0.00035526990541256964, + "learning_rate": 2.9873839562009046e-06, + "loss": 0.0, + "step": 11850 + }, + { + "epoch": 2.8231373482504165, + "grad_norm": 0.0007370146340690553, + "learning_rate": 2.9477108624930574e-06, + "loss": 0.0, + "step": 11860 + }, + { + "epoch": 2.8255177338728874, + "grad_norm": 8.048515883274376e-05, + "learning_rate": 2.90803776878521e-06, + "loss": 0.0, + "step": 11870 + }, + { + "epoch": 2.8278981194953583, + "grad_norm": 0.00022186528076417744, + "learning_rate": 2.8683646750773627e-06, + "loss": 0.0, + "step": 11880 + }, + { + "epoch": 2.830278505117829, + "grad_norm": 0.0004252239887136966, + "learning_rate": 2.8286915813695156e-06, + "loss": 0.0, + "step": 11890 + }, + { + "epoch": 2.8326588907403, + "grad_norm": 0.00027670618146657944, + "learning_rate": 2.789018487661668e-06, + "loss": 0.0, + "step": 11900 + }, + { + "epoch": 2.835039276362771, + "grad_norm": 0.0020431778393685818, + "learning_rate": 2.749345393953821e-06, + "loss": 0.0, + "step": 11910 + }, + { + "epoch": 2.8374196619852414, + "grad_norm": 0.001547365915030241, + "learning_rate": 2.7096723002459737e-06, + "loss": 0.0, + "step": 11920 + }, + { + "epoch": 2.8398000476077123, + "grad_norm": 0.0013964555691927671, + "learning_rate": 2.669999206538126e-06, + "loss": 0.0, + "step": 11930 + }, + { + "epoch": 2.842180433230183, + "grad_norm": 0.00027170139946974814, + "learning_rate": 2.630326112830279e-06, + "loss": 0.0, + "step": 11940 + }, + { + "epoch": 2.844560818852654, + "grad_norm": 0.0008765398524701595, + "learning_rate": 2.5906530191224314e-06, + "loss": 0.0, + "step": 11950 + }, + { + "epoch": 2.846941204475125, + "grad_norm": 0.00015922258899081498, + "learning_rate": 2.5509799254145842e-06, + "loss": 0.0, + "step": 11960 + }, + { + "epoch": 2.849321590097596, + "grad_norm": 0.00011323492071824148, + "learning_rate": 2.5113068317067367e-06, + "loss": 0.0, + "step": 11970 + }, + { + "epoch": 2.851701975720067, + "grad_norm": 0.0008671206305734813, + "learning_rate": 2.4716337379988895e-06, + "loss": 0.0001, + "step": 11980 + }, + { + "epoch": 2.8540823613425372, + "grad_norm": 0.00013449507241602987, + "learning_rate": 2.431960644291042e-06, + "loss": 0.0, + "step": 11990 + }, + { + "epoch": 2.8564627469650086, + "grad_norm": 0.0008318678010255098, + "learning_rate": 2.3922875505831948e-06, + "loss": 0.0, + "step": 12000 + }, + { + "epoch": 2.858843132587479, + "grad_norm": 0.0012901159934699535, + "learning_rate": 2.352614456875347e-06, + "loss": 0.0, + "step": 12010 + }, + { + "epoch": 2.86122351820995, + "grad_norm": 0.00032769294921308756, + "learning_rate": 2.3129413631675e-06, + "loss": 0.0, + "step": 12020 + }, + { + "epoch": 2.863603903832421, + "grad_norm": 0.0022394724655896425, + "learning_rate": 2.2732682694596525e-06, + "loss": 0.0, + "step": 12030 + }, + { + "epoch": 2.8659842894548917, + "grad_norm": 0.0001916442415677011, + "learning_rate": 2.2335951757518053e-06, + "loss": 0.0, + "step": 12040 + }, + { + "epoch": 2.8683646750773626, + "grad_norm": 0.0008263205527327955, + "learning_rate": 2.1939220820439578e-06, + "loss": 0.0, + "step": 12050 + }, + { + "epoch": 2.8707450606998335, + "grad_norm": 0.01558750867843628, + "learning_rate": 2.1542489883361106e-06, + "loss": 0.0, + "step": 12060 + }, + { + "epoch": 2.8731254463223044, + "grad_norm": 0.0005802076193504035, + "learning_rate": 2.1145758946282635e-06, + "loss": 0.0002, + "step": 12070 + }, + { + "epoch": 2.875505831944775, + "grad_norm": 0.0006769265746697783, + "learning_rate": 2.074902800920416e-06, + "loss": 0.0, + "step": 12080 + }, + { + "epoch": 2.877886217567246, + "grad_norm": 0.00040787094621919096, + "learning_rate": 2.0352297072125687e-06, + "loss": 0.0, + "step": 12090 + }, + { + "epoch": 2.8802666031897166, + "grad_norm": 0.00034027136280201375, + "learning_rate": 1.995556613504721e-06, + "loss": 0.0, + "step": 12100 + }, + { + "epoch": 2.8826469888121875, + "grad_norm": 0.008367573842406273, + "learning_rate": 1.955883519796874e-06, + "loss": 0.0, + "step": 12110 + }, + { + "epoch": 2.8850273744346584, + "grad_norm": 0.0002640595193952322, + "learning_rate": 1.9162104260890264e-06, + "loss": 0.0, + "step": 12120 + }, + { + "epoch": 2.8874077600571293, + "grad_norm": 0.0006561621557921171, + "learning_rate": 1.8765373323811793e-06, + "loss": 0.0, + "step": 12130 + }, + { + "epoch": 2.8897881456796, + "grad_norm": 0.0008464111597277224, + "learning_rate": 1.8368642386733317e-06, + "loss": 0.0, + "step": 12140 + }, + { + "epoch": 2.892168531302071, + "grad_norm": 0.0003002223384100944, + "learning_rate": 1.7971911449654846e-06, + "loss": 0.0, + "step": 12150 + }, + { + "epoch": 2.894548916924542, + "grad_norm": 0.0003043843025807291, + "learning_rate": 1.757518051257637e-06, + "loss": 0.0, + "step": 12160 + }, + { + "epoch": 2.8969293025470124, + "grad_norm": 0.00041168101597577333, + "learning_rate": 1.7178449575497898e-06, + "loss": 0.0, + "step": 12170 + }, + { + "epoch": 2.8993096881694833, + "grad_norm": 0.002103559672832489, + "learning_rate": 1.6781718638419423e-06, + "loss": 0.0, + "step": 12180 + }, + { + "epoch": 2.901690073791954, + "grad_norm": 0.00029975874349474907, + "learning_rate": 1.6384987701340951e-06, + "loss": 0.0, + "step": 12190 + }, + { + "epoch": 2.904070459414425, + "grad_norm": 0.004904668778181076, + "learning_rate": 1.5988256764262475e-06, + "loss": 0.0001, + "step": 12200 + }, + { + "epoch": 2.906450845036896, + "grad_norm": 0.0009001428843475878, + "learning_rate": 1.5591525827184004e-06, + "loss": 0.0, + "step": 12210 + }, + { + "epoch": 2.908831230659367, + "grad_norm": 0.0004976601339876652, + "learning_rate": 1.519479489010553e-06, + "loss": 0.0, + "step": 12220 + }, + { + "epoch": 2.9112116162818378, + "grad_norm": 0.0002044235880021006, + "learning_rate": 1.4798063953027057e-06, + "loss": 0.0, + "step": 12230 + }, + { + "epoch": 2.9135920019043082, + "grad_norm": 0.0003118833410553634, + "learning_rate": 1.4401333015948583e-06, + "loss": 0.0, + "step": 12240 + }, + { + "epoch": 2.9159723875267796, + "grad_norm": 0.00038868881529197097, + "learning_rate": 1.4004602078870111e-06, + "loss": 0.0, + "step": 12250 + }, + { + "epoch": 2.91835277314925, + "grad_norm": 0.0005747165414504707, + "learning_rate": 1.3607871141791638e-06, + "loss": 0.0, + "step": 12260 + }, + { + "epoch": 2.920733158771721, + "grad_norm": 0.0013731828657910228, + "learning_rate": 1.3211140204713164e-06, + "loss": 0.0, + "step": 12270 + }, + { + "epoch": 2.923113544394192, + "grad_norm": 0.000688336614985019, + "learning_rate": 1.281440926763469e-06, + "loss": 0.0, + "step": 12280 + }, + { + "epoch": 2.9254939300166627, + "grad_norm": 0.00041094853077083826, + "learning_rate": 1.241767833055622e-06, + "loss": 0.0, + "step": 12290 + }, + { + "epoch": 2.9278743156391336, + "grad_norm": 0.00040040462044999003, + "learning_rate": 1.2020947393477745e-06, + "loss": 0.0, + "step": 12300 + }, + { + "epoch": 2.9302547012616045, + "grad_norm": 0.0027486933395266533, + "learning_rate": 1.1624216456399272e-06, + "loss": 0.0, + "step": 12310 + }, + { + "epoch": 2.9326350868840754, + "grad_norm": 0.000705558864865452, + "learning_rate": 1.1227485519320798e-06, + "loss": 0.0, + "step": 12320 + }, + { + "epoch": 2.935015472506546, + "grad_norm": 0.0013841954059898853, + "learning_rate": 1.0830754582242325e-06, + "loss": 0.0001, + "step": 12330 + }, + { + "epoch": 2.937395858129017, + "grad_norm": 0.0013595300260931253, + "learning_rate": 1.043402364516385e-06, + "loss": 0.0, + "step": 12340 + }, + { + "epoch": 2.9397762437514876, + "grad_norm": 0.0011891273315995932, + "learning_rate": 1.0037292708085377e-06, + "loss": 0.0, + "step": 12350 + }, + { + "epoch": 2.9421566293739585, + "grad_norm": 0.0009695956250652671, + "learning_rate": 9.640561771006904e-07, + "loss": 0.0, + "step": 12360 + }, + { + "epoch": 2.9445370149964294, + "grad_norm": 0.00034754411899484694, + "learning_rate": 9.24383083392843e-07, + "loss": 0.0001, + "step": 12370 + }, + { + "epoch": 2.9469174006189003, + "grad_norm": 0.00020417921768967062, + "learning_rate": 8.847099896849956e-07, + "loss": 0.0, + "step": 12380 + }, + { + "epoch": 2.949297786241371, + "grad_norm": 0.0010077544720843434, + "learning_rate": 8.450368959771483e-07, + "loss": 0.0, + "step": 12390 + }, + { + "epoch": 2.951678171863842, + "grad_norm": 0.0006951851537451148, + "learning_rate": 8.053638022693009e-07, + "loss": 0.0, + "step": 12400 + }, + { + "epoch": 2.954058557486313, + "grad_norm": 0.0005225545028224587, + "learning_rate": 7.656907085614537e-07, + "loss": 0.0, + "step": 12410 + }, + { + "epoch": 2.9564389431087834, + "grad_norm": 0.0004363077168818563, + "learning_rate": 7.260176148536063e-07, + "loss": 0.0, + "step": 12420 + }, + { + "epoch": 2.9588193287312543, + "grad_norm": 0.00024609945830889046, + "learning_rate": 6.863445211457589e-07, + "loss": 0.0, + "step": 12430 + }, + { + "epoch": 2.961199714353725, + "grad_norm": 0.06491145491600037, + "learning_rate": 6.466714274379116e-07, + "loss": 0.0, + "step": 12440 + }, + { + "epoch": 2.963580099976196, + "grad_norm": 0.0004482944495975971, + "learning_rate": 6.069983337300642e-07, + "loss": 0.0, + "step": 12450 + }, + { + "epoch": 2.965960485598667, + "grad_norm": 0.001836300129070878, + "learning_rate": 5.67325240022217e-07, + "loss": 0.0, + "step": 12460 + }, + { + "epoch": 2.968340871221138, + "grad_norm": 0.0004112005408387631, + "learning_rate": 5.276521463143697e-07, + "loss": 0.0, + "step": 12470 + }, + { + "epoch": 2.9707212568436088, + "grad_norm": 0.0020831027068197727, + "learning_rate": 4.879790526065223e-07, + "loss": 0.0, + "step": 12480 + }, + { + "epoch": 2.9731016424660797, + "grad_norm": 0.0012763678096234798, + "learning_rate": 4.4830595889867493e-07, + "loss": 0.0, + "step": 12490 + }, + { + "epoch": 2.9754820280885506, + "grad_norm": 0.0011779662454500794, + "learning_rate": 4.086328651908276e-07, + "loss": 0.0, + "step": 12500 + }, + { + "epoch": 2.977862413711021, + "grad_norm": 0.0005871544708497822, + "learning_rate": 3.6895977148298026e-07, + "loss": 0.0, + "step": 12510 + }, + { + "epoch": 2.980242799333492, + "grad_norm": 0.002057824982330203, + "learning_rate": 3.2928667777513295e-07, + "loss": 0.0, + "step": 12520 + }, + { + "epoch": 2.982623184955963, + "grad_norm": 0.00029588877805508673, + "learning_rate": 2.896135840672856e-07, + "loss": 0.0, + "step": 12530 + }, + { + "epoch": 2.9850035705784337, + "grad_norm": 0.0004726073530036956, + "learning_rate": 2.499404903594382e-07, + "loss": 0.0, + "step": 12540 + }, + { + "epoch": 2.9873839562009046, + "grad_norm": 0.0014838631032034755, + "learning_rate": 2.102673966515909e-07, + "loss": 0.0, + "step": 12550 + }, + { + "epoch": 2.9897643418233755, + "grad_norm": 0.0010778923751786351, + "learning_rate": 1.7059430294374355e-07, + "loss": 0.0, + "step": 12560 + }, + { + "epoch": 2.9921447274458464, + "grad_norm": 0.0007851801346987486, + "learning_rate": 1.3092120923589622e-07, + "loss": 0.0, + "step": 12570 + }, + { + "epoch": 2.994525113068317, + "grad_norm": 0.00047710456419736147, + "learning_rate": 9.124811552804888e-08, + "loss": 0.0, + "step": 12580 + }, + { + "epoch": 2.996905498690788, + "grad_norm": 0.003749624127522111, + "learning_rate": 5.1575021820201544e-08, + "loss": 0.0, + "step": 12590 + }, + { + "epoch": 2.9992858843132586, + "grad_norm": 0.0007799621089361608, + "learning_rate": 1.1901928112354202e-08, + "loss": 0.0001, + "step": 12600 + }, + { + "epoch": 3.0, + "eval_loss": 2.340411811019294e-07, + "eval_runtime": 52.9973, + "eval_samples_per_second": 35.247, + "eval_steps_per_second": 8.812, + "step": 12603 + } + ], + "logging_steps": 10, + "max_steps": 12603, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6586245895421952.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}