{ "best_metric": 0.6777463460631777, "best_model_checkpoint": "videomae-base-finetuned-chickenbehaviour/checkpoint-19086", "epoch": 7.125, "eval_steps": 500, "global_step": 25448, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 10.394131660461426, "learning_rate": 1.9646365422396855e-07, "loss": 1.4018, "step": 10 }, { "epoch": 0.0, "grad_norm": 7.45326042175293, "learning_rate": 3.929273084479371e-07, "loss": 1.4698, "step": 20 }, { "epoch": 0.0, "grad_norm": 6.575077056884766, "learning_rate": 5.893909626719058e-07, "loss": 1.6503, "step": 30 }, { "epoch": 0.0, "grad_norm": 7.925477027893066, "learning_rate": 7.858546168958742e-07, "loss": 1.2539, "step": 40 }, { "epoch": 0.0, "grad_norm": 7.608184814453125, "learning_rate": 9.823182711198429e-07, "loss": 1.3469, "step": 50 }, { "epoch": 0.0, "grad_norm": 4.777962684631348, "learning_rate": 1.1787819253438115e-06, "loss": 1.2688, "step": 60 }, { "epoch": 0.0, "grad_norm": 4.392425537109375, "learning_rate": 1.37524557956778e-06, "loss": 1.5589, "step": 70 }, { "epoch": 0.0, "grad_norm": 4.155239105224609, "learning_rate": 1.5717092337917484e-06, "loss": 1.4452, "step": 80 }, { "epoch": 0.0, "grad_norm": 9.280621528625488, "learning_rate": 1.768172888015717e-06, "loss": 1.6565, "step": 90 }, { "epoch": 0.0, "grad_norm": 6.231608867645264, "learning_rate": 1.9646365422396858e-06, "loss": 1.4829, "step": 100 }, { "epoch": 0.0, "grad_norm": 12.79507827758789, "learning_rate": 2.161100196463654e-06, "loss": 1.956, "step": 110 }, { "epoch": 0.0, "grad_norm": 8.23023796081543, "learning_rate": 2.357563850687623e-06, "loss": 1.3583, "step": 120 }, { "epoch": 0.01, "grad_norm": 4.914371013641357, "learning_rate": 2.5540275049115915e-06, "loss": 1.7319, "step": 130 }, { "epoch": 0.01, "grad_norm": 7.463047981262207, "learning_rate": 2.75049115913556e-06, "loss": 1.9267, "step": 140 }, { "epoch": 0.01, "grad_norm": 9.884008407592773, "learning_rate": 2.946954813359529e-06, "loss": 1.5219, "step": 150 }, { "epoch": 0.01, "grad_norm": 5.130902290344238, "learning_rate": 3.143418467583497e-06, "loss": 1.4455, "step": 160 }, { "epoch": 0.01, "grad_norm": 7.394954204559326, "learning_rate": 3.3398821218074657e-06, "loss": 1.6936, "step": 170 }, { "epoch": 0.01, "grad_norm": 4.51138162612915, "learning_rate": 3.536345776031434e-06, "loss": 1.3487, "step": 180 }, { "epoch": 0.01, "grad_norm": 11.946001052856445, "learning_rate": 3.732809430255403e-06, "loss": 1.3861, "step": 190 }, { "epoch": 0.01, "grad_norm": 5.986423492431641, "learning_rate": 3.9292730844793715e-06, "loss": 1.7812, "step": 200 }, { "epoch": 0.01, "grad_norm": 7.473042964935303, "learning_rate": 4.12573673870334e-06, "loss": 1.3823, "step": 210 }, { "epoch": 0.01, "grad_norm": 7.890947341918945, "learning_rate": 4.322200392927308e-06, "loss": 1.3436, "step": 220 }, { "epoch": 0.01, "grad_norm": 9.291756629943848, "learning_rate": 4.518664047151278e-06, "loss": 1.5751, "step": 230 }, { "epoch": 0.01, "grad_norm": 6.813894748687744, "learning_rate": 4.715127701375246e-06, "loss": 1.303, "step": 240 }, { "epoch": 0.01, "grad_norm": 10.576889991760254, "learning_rate": 4.911591355599214e-06, "loss": 1.7911, "step": 250 }, { "epoch": 0.01, "grad_norm": 7.865218639373779, "learning_rate": 5.108055009823183e-06, "loss": 1.5955, "step": 260 }, { "epoch": 0.01, "grad_norm": 12.177133560180664, "learning_rate": 5.3045186640471515e-06, "loss": 1.9946, "step": 270 }, { "epoch": 0.01, "grad_norm": 10.074688911437988, "learning_rate": 5.50098231827112e-06, "loss": 1.4666, "step": 280 }, { "epoch": 0.01, "grad_norm": 6.612791538238525, "learning_rate": 5.697445972495088e-06, "loss": 1.1115, "step": 290 }, { "epoch": 0.01, "grad_norm": 12.787959098815918, "learning_rate": 5.893909626719058e-06, "loss": 1.847, "step": 300 }, { "epoch": 0.01, "grad_norm": 6.857900619506836, "learning_rate": 6.090373280943025e-06, "loss": 1.4993, "step": 310 }, { "epoch": 0.01, "grad_norm": 10.485871315002441, "learning_rate": 6.286836935166994e-06, "loss": 1.2466, "step": 320 }, { "epoch": 0.01, "grad_norm": 7.906979084014893, "learning_rate": 6.483300589390963e-06, "loss": 1.7436, "step": 330 }, { "epoch": 0.01, "grad_norm": 8.239996910095215, "learning_rate": 6.6797642436149315e-06, "loss": 1.8369, "step": 340 }, { "epoch": 0.01, "grad_norm": 7.042497158050537, "learning_rate": 6.876227897838901e-06, "loss": 1.3499, "step": 350 }, { "epoch": 0.01, "grad_norm": 6.0535430908203125, "learning_rate": 7.072691552062868e-06, "loss": 1.6774, "step": 360 }, { "epoch": 0.01, "grad_norm": 8.97998046875, "learning_rate": 7.269155206286838e-06, "loss": 1.1997, "step": 370 }, { "epoch": 0.01, "grad_norm": 9.170978546142578, "learning_rate": 7.465618860510806e-06, "loss": 1.5851, "step": 380 }, { "epoch": 0.02, "grad_norm": 9.31490421295166, "learning_rate": 7.662082514734775e-06, "loss": 1.678, "step": 390 }, { "epoch": 0.02, "grad_norm": 8.385000228881836, "learning_rate": 7.858546168958743e-06, "loss": 1.7227, "step": 400 }, { "epoch": 0.02, "grad_norm": 8.306479454040527, "learning_rate": 8.055009823182711e-06, "loss": 1.459, "step": 410 }, { "epoch": 0.02, "grad_norm": 6.981912136077881, "learning_rate": 8.25147347740668e-06, "loss": 1.4991, "step": 420 }, { "epoch": 0.02, "grad_norm": 20.09281349182129, "learning_rate": 8.447937131630648e-06, "loss": 1.666, "step": 430 }, { "epoch": 0.02, "grad_norm": 5.669954299926758, "learning_rate": 8.644400785854617e-06, "loss": 1.4479, "step": 440 }, { "epoch": 0.02, "grad_norm": 11.342742919921875, "learning_rate": 8.840864440078587e-06, "loss": 1.4535, "step": 450 }, { "epoch": 0.02, "grad_norm": 6.513933181762695, "learning_rate": 9.037328094302555e-06, "loss": 1.3667, "step": 460 }, { "epoch": 0.02, "grad_norm": 7.348504543304443, "learning_rate": 9.233791748526522e-06, "loss": 1.2527, "step": 470 }, { "epoch": 0.02, "grad_norm": 11.161468505859375, "learning_rate": 9.430255402750492e-06, "loss": 1.5684, "step": 480 }, { "epoch": 0.02, "grad_norm": 8.645318031311035, "learning_rate": 9.62671905697446e-06, "loss": 1.4732, "step": 490 }, { "epoch": 0.02, "grad_norm": 11.482588768005371, "learning_rate": 9.823182711198428e-06, "loss": 1.4329, "step": 500 }, { "epoch": 0.02, "grad_norm": 8.524100303649902, "learning_rate": 1.0019646365422398e-05, "loss": 1.2222, "step": 510 }, { "epoch": 0.02, "grad_norm": 5.331492900848389, "learning_rate": 1.0216110019646366e-05, "loss": 0.9751, "step": 520 }, { "epoch": 0.02, "grad_norm": 6.497613906860352, "learning_rate": 1.0412573673870335e-05, "loss": 1.1484, "step": 530 }, { "epoch": 0.02, "grad_norm": 10.038250923156738, "learning_rate": 1.0609037328094303e-05, "loss": 1.1031, "step": 540 }, { "epoch": 0.02, "grad_norm": 5.79709529876709, "learning_rate": 1.0805500982318271e-05, "loss": 1.1071, "step": 550 }, { "epoch": 0.02, "grad_norm": 4.789510250091553, "learning_rate": 1.100196463654224e-05, "loss": 1.07, "step": 560 }, { "epoch": 0.02, "grad_norm": 10.311077117919922, "learning_rate": 1.119842829076621e-05, "loss": 1.3162, "step": 570 }, { "epoch": 0.02, "grad_norm": 7.317972660064697, "learning_rate": 1.1394891944990177e-05, "loss": 1.2341, "step": 580 }, { "epoch": 0.02, "grad_norm": 9.392952919006348, "learning_rate": 1.1591355599214145e-05, "loss": 1.6377, "step": 590 }, { "epoch": 0.02, "grad_norm": 6.577554702758789, "learning_rate": 1.1787819253438115e-05, "loss": 1.4626, "step": 600 }, { "epoch": 0.02, "grad_norm": 11.501137733459473, "learning_rate": 1.1984282907662082e-05, "loss": 1.4852, "step": 610 }, { "epoch": 0.02, "grad_norm": 14.57140064239502, "learning_rate": 1.218074656188605e-05, "loss": 1.595, "step": 620 }, { "epoch": 0.02, "grad_norm": 10.156977653503418, "learning_rate": 1.237721021611002e-05, "loss": 1.3952, "step": 630 }, { "epoch": 0.03, "grad_norm": 14.087113380432129, "learning_rate": 1.2573673870333987e-05, "loss": 1.4008, "step": 640 }, { "epoch": 0.03, "grad_norm": 8.824213027954102, "learning_rate": 1.2770137524557958e-05, "loss": 1.4678, "step": 650 }, { "epoch": 0.03, "grad_norm": 15.29951000213623, "learning_rate": 1.2966601178781926e-05, "loss": 1.3995, "step": 660 }, { "epoch": 0.03, "grad_norm": 8.123235702514648, "learning_rate": 1.3163064833005895e-05, "loss": 1.5019, "step": 670 }, { "epoch": 0.03, "grad_norm": 10.05123519897461, "learning_rate": 1.3359528487229863e-05, "loss": 1.3279, "step": 680 }, { "epoch": 0.03, "grad_norm": 8.91348648071289, "learning_rate": 1.3555992141453833e-05, "loss": 1.8362, "step": 690 }, { "epoch": 0.03, "grad_norm": 13.258356094360352, "learning_rate": 1.3752455795677802e-05, "loss": 1.1796, "step": 700 }, { "epoch": 0.03, "grad_norm": 6.128164768218994, "learning_rate": 1.3948919449901768e-05, "loss": 1.1528, "step": 710 }, { "epoch": 0.03, "grad_norm": 9.367437362670898, "learning_rate": 1.4145383104125737e-05, "loss": 1.4948, "step": 720 }, { "epoch": 0.03, "grad_norm": 14.426176071166992, "learning_rate": 1.4341846758349705e-05, "loss": 1.6851, "step": 730 }, { "epoch": 0.03, "grad_norm": 19.73320960998535, "learning_rate": 1.4538310412573675e-05, "loss": 1.9248, "step": 740 }, { "epoch": 0.03, "grad_norm": 15.559652328491211, "learning_rate": 1.4734774066797644e-05, "loss": 1.3177, "step": 750 }, { "epoch": 0.03, "grad_norm": 11.005021095275879, "learning_rate": 1.4931237721021612e-05, "loss": 1.5996, "step": 760 }, { "epoch": 0.03, "grad_norm": 6.700186729431152, "learning_rate": 1.512770137524558e-05, "loss": 1.3295, "step": 770 }, { "epoch": 0.03, "grad_norm": 8.540508270263672, "learning_rate": 1.532416502946955e-05, "loss": 1.5227, "step": 780 }, { "epoch": 0.03, "grad_norm": 10.755002975463867, "learning_rate": 1.5520628683693518e-05, "loss": 1.1771, "step": 790 }, { "epoch": 0.03, "grad_norm": 5.741491794586182, "learning_rate": 1.5717092337917486e-05, "loss": 1.3991, "step": 800 }, { "epoch": 0.03, "grad_norm": 8.759336471557617, "learning_rate": 1.5913555992141455e-05, "loss": 1.2928, "step": 810 }, { "epoch": 0.03, "grad_norm": 5.328275203704834, "learning_rate": 1.6110019646365423e-05, "loss": 1.3228, "step": 820 }, { "epoch": 0.03, "grad_norm": 9.775882720947266, "learning_rate": 1.630648330058939e-05, "loss": 1.3438, "step": 830 }, { "epoch": 0.03, "grad_norm": 10.725419044494629, "learning_rate": 1.650294695481336e-05, "loss": 0.9901, "step": 840 }, { "epoch": 0.03, "grad_norm": 6.251297950744629, "learning_rate": 1.669941060903733e-05, "loss": 1.7383, "step": 850 }, { "epoch": 0.03, "grad_norm": 9.709663391113281, "learning_rate": 1.6895874263261297e-05, "loss": 1.2776, "step": 860 }, { "epoch": 0.03, "grad_norm": 12.714786529541016, "learning_rate": 1.7092337917485265e-05, "loss": 1.6162, "step": 870 }, { "epoch": 0.03, "grad_norm": 4.496620178222656, "learning_rate": 1.7288801571709234e-05, "loss": 1.1782, "step": 880 }, { "epoch": 0.03, "grad_norm": 9.746281623840332, "learning_rate": 1.7485265225933202e-05, "loss": 1.838, "step": 890 }, { "epoch": 0.04, "grad_norm": 7.110745906829834, "learning_rate": 1.7681728880157174e-05, "loss": 1.5393, "step": 900 }, { "epoch": 0.04, "grad_norm": 5.807432174682617, "learning_rate": 1.7878192534381142e-05, "loss": 1.5431, "step": 910 }, { "epoch": 0.04, "grad_norm": 6.808559417724609, "learning_rate": 1.807465618860511e-05, "loss": 1.3577, "step": 920 }, { "epoch": 0.04, "grad_norm": 8.64316177368164, "learning_rate": 1.8271119842829076e-05, "loss": 1.5992, "step": 930 }, { "epoch": 0.04, "grad_norm": 5.399013519287109, "learning_rate": 1.8467583497053044e-05, "loss": 1.48, "step": 940 }, { "epoch": 0.04, "grad_norm": 7.770280361175537, "learning_rate": 1.8664047151277013e-05, "loss": 1.4354, "step": 950 }, { "epoch": 0.04, "grad_norm": 14.282828330993652, "learning_rate": 1.8860510805500985e-05, "loss": 1.1923, "step": 960 }, { "epoch": 0.04, "grad_norm": 15.462182998657227, "learning_rate": 1.9056974459724953e-05, "loss": 1.1385, "step": 970 }, { "epoch": 0.04, "grad_norm": 8.012928009033203, "learning_rate": 1.925343811394892e-05, "loss": 1.4546, "step": 980 }, { "epoch": 0.04, "grad_norm": 6.5036516189575195, "learning_rate": 1.944990176817289e-05, "loss": 1.1244, "step": 990 }, { "epoch": 0.04, "grad_norm": 11.139033317565918, "learning_rate": 1.9646365422396855e-05, "loss": 1.5779, "step": 1000 }, { "epoch": 0.04, "grad_norm": 9.863429069519043, "learning_rate": 1.9842829076620827e-05, "loss": 1.1956, "step": 1010 }, { "epoch": 0.04, "grad_norm": 8.337393760681152, "learning_rate": 2.0039292730844795e-05, "loss": 1.4012, "step": 1020 }, { "epoch": 0.04, "grad_norm": 21.124107360839844, "learning_rate": 2.0235756385068764e-05, "loss": 1.4654, "step": 1030 }, { "epoch": 0.04, "grad_norm": 11.873066902160645, "learning_rate": 2.0432220039292732e-05, "loss": 1.0498, "step": 1040 }, { "epoch": 0.04, "grad_norm": 9.920634269714355, "learning_rate": 2.06286836935167e-05, "loss": 1.3642, "step": 1050 }, { "epoch": 0.04, "grad_norm": 5.280946731567383, "learning_rate": 2.082514734774067e-05, "loss": 1.582, "step": 1060 }, { "epoch": 0.04, "grad_norm": 7.705603122711182, "learning_rate": 2.1021611001964638e-05, "loss": 1.5856, "step": 1070 }, { "epoch": 0.04, "grad_norm": 9.099641799926758, "learning_rate": 2.1218074656188606e-05, "loss": 1.164, "step": 1080 }, { "epoch": 0.04, "grad_norm": 15.541876792907715, "learning_rate": 2.1414538310412574e-05, "loss": 1.8418, "step": 1090 }, { "epoch": 0.04, "grad_norm": 7.425509929656982, "learning_rate": 2.1611001964636543e-05, "loss": 1.568, "step": 1100 }, { "epoch": 0.04, "grad_norm": 5.149704933166504, "learning_rate": 2.180746561886051e-05, "loss": 1.0543, "step": 1110 }, { "epoch": 0.04, "grad_norm": 7.079004287719727, "learning_rate": 2.200392927308448e-05, "loss": 1.3883, "step": 1120 }, { "epoch": 0.04, "grad_norm": 5.580286026000977, "learning_rate": 2.2200392927308448e-05, "loss": 1.6113, "step": 1130 }, { "epoch": 0.04, "grad_norm": 4.750272274017334, "learning_rate": 2.239685658153242e-05, "loss": 1.1308, "step": 1140 }, { "epoch": 0.05, "grad_norm": 10.437239646911621, "learning_rate": 2.2593320235756385e-05, "loss": 1.4191, "step": 1150 }, { "epoch": 0.05, "grad_norm": 8.456888198852539, "learning_rate": 2.2789783889980354e-05, "loss": 1.4588, "step": 1160 }, { "epoch": 0.05, "grad_norm": 8.06169319152832, "learning_rate": 2.2986247544204322e-05, "loss": 1.0332, "step": 1170 }, { "epoch": 0.05, "grad_norm": 10.481605529785156, "learning_rate": 2.318271119842829e-05, "loss": 1.4272, "step": 1180 }, { "epoch": 0.05, "grad_norm": 6.591656684875488, "learning_rate": 2.3379174852652262e-05, "loss": 1.1557, "step": 1190 }, { "epoch": 0.05, "grad_norm": 6.696040153503418, "learning_rate": 2.357563850687623e-05, "loss": 0.9853, "step": 1200 }, { "epoch": 0.05, "grad_norm": 18.90186882019043, "learning_rate": 2.37721021611002e-05, "loss": 1.6339, "step": 1210 }, { "epoch": 0.05, "grad_norm": 5.797850131988525, "learning_rate": 2.3968565815324164e-05, "loss": 1.632, "step": 1220 }, { "epoch": 0.05, "grad_norm": 12.473474502563477, "learning_rate": 2.4165029469548133e-05, "loss": 1.4197, "step": 1230 }, { "epoch": 0.05, "grad_norm": 6.878892421722412, "learning_rate": 2.43614931237721e-05, "loss": 1.1182, "step": 1240 }, { "epoch": 0.05, "grad_norm": 8.978673934936523, "learning_rate": 2.4557956777996073e-05, "loss": 1.4259, "step": 1250 }, { "epoch": 0.05, "grad_norm": 13.112768173217773, "learning_rate": 2.475442043222004e-05, "loss": 1.2943, "step": 1260 }, { "epoch": 0.05, "grad_norm": 8.947503089904785, "learning_rate": 2.495088408644401e-05, "loss": 1.1939, "step": 1270 }, { "epoch": 0.05, "grad_norm": 16.912456512451172, "learning_rate": 2.5147347740667975e-05, "loss": 1.4931, "step": 1280 }, { "epoch": 0.05, "grad_norm": 7.998508930206299, "learning_rate": 2.5343811394891947e-05, "loss": 1.4884, "step": 1290 }, { "epoch": 0.05, "grad_norm": 6.81245231628418, "learning_rate": 2.5540275049115915e-05, "loss": 0.9738, "step": 1300 }, { "epoch": 0.05, "grad_norm": 13.682781219482422, "learning_rate": 2.5736738703339887e-05, "loss": 1.1422, "step": 1310 }, { "epoch": 0.05, "grad_norm": 8.065417289733887, "learning_rate": 2.5933202357563852e-05, "loss": 1.4115, "step": 1320 }, { "epoch": 0.05, "grad_norm": 4.6489033699035645, "learning_rate": 2.6129666011787817e-05, "loss": 1.4783, "step": 1330 }, { "epoch": 0.05, "grad_norm": 8.150613784790039, "learning_rate": 2.632612966601179e-05, "loss": 0.9543, "step": 1340 }, { "epoch": 0.05, "grad_norm": 5.638037204742432, "learning_rate": 2.6522593320235754e-05, "loss": 1.6097, "step": 1350 }, { "epoch": 0.05, "grad_norm": 8.103131294250488, "learning_rate": 2.6719056974459726e-05, "loss": 1.7673, "step": 1360 }, { "epoch": 0.05, "grad_norm": 13.944802284240723, "learning_rate": 2.6915520628683694e-05, "loss": 1.4748, "step": 1370 }, { "epoch": 0.05, "grad_norm": 21.044559478759766, "learning_rate": 2.7111984282907666e-05, "loss": 1.2909, "step": 1380 }, { "epoch": 0.05, "grad_norm": 16.559110641479492, "learning_rate": 2.730844793713163e-05, "loss": 1.4038, "step": 1390 }, { "epoch": 0.06, "grad_norm": 9.706820487976074, "learning_rate": 2.7504911591355603e-05, "loss": 1.4774, "step": 1400 }, { "epoch": 0.06, "grad_norm": 12.640449523925781, "learning_rate": 2.7701375245579568e-05, "loss": 1.6861, "step": 1410 }, { "epoch": 0.06, "grad_norm": 12.526803970336914, "learning_rate": 2.7897838899803537e-05, "loss": 1.288, "step": 1420 }, { "epoch": 0.06, "grad_norm": 8.123985290527344, "learning_rate": 2.809430255402751e-05, "loss": 1.4044, "step": 1430 }, { "epoch": 0.06, "grad_norm": 7.328338623046875, "learning_rate": 2.8290766208251474e-05, "loss": 1.4156, "step": 1440 }, { "epoch": 0.06, "grad_norm": 9.951528549194336, "learning_rate": 2.8487229862475445e-05, "loss": 1.7236, "step": 1450 }, { "epoch": 0.06, "grad_norm": 12.283649444580078, "learning_rate": 2.868369351669941e-05, "loss": 1.4234, "step": 1460 }, { "epoch": 0.06, "grad_norm": 6.4442667961120605, "learning_rate": 2.8880157170923382e-05, "loss": 1.5968, "step": 1470 }, { "epoch": 0.06, "grad_norm": 13.760937690734863, "learning_rate": 2.907662082514735e-05, "loss": 1.1626, "step": 1480 }, { "epoch": 0.06, "grad_norm": 13.588836669921875, "learning_rate": 2.9273084479371316e-05, "loss": 1.7874, "step": 1490 }, { "epoch": 0.06, "grad_norm": 7.21587610244751, "learning_rate": 2.9469548133595288e-05, "loss": 1.518, "step": 1500 }, { "epoch": 0.06, "grad_norm": 4.825687408447266, "learning_rate": 2.9666011787819253e-05, "loss": 1.6592, "step": 1510 }, { "epoch": 0.06, "grad_norm": 13.052379608154297, "learning_rate": 2.9862475442043225e-05, "loss": 1.6322, "step": 1520 }, { "epoch": 0.06, "grad_norm": 11.555798530578613, "learning_rate": 3.005893909626719e-05, "loss": 1.9354, "step": 1530 }, { "epoch": 0.06, "grad_norm": 7.818208694458008, "learning_rate": 3.025540275049116e-05, "loss": 1.3693, "step": 1540 }, { "epoch": 0.06, "grad_norm": 15.580031394958496, "learning_rate": 3.045186640471513e-05, "loss": 1.3851, "step": 1550 }, { "epoch": 0.06, "grad_norm": 9.129706382751465, "learning_rate": 3.06483300589391e-05, "loss": 1.0949, "step": 1560 }, { "epoch": 0.06, "grad_norm": 5.964504241943359, "learning_rate": 3.084479371316307e-05, "loss": 1.4683, "step": 1570 }, { "epoch": 0.06, "grad_norm": 15.801876068115234, "learning_rate": 3.1041257367387035e-05, "loss": 1.3759, "step": 1580 }, { "epoch": 0.06, "grad_norm": 16.279518127441406, "learning_rate": 3.123772102161101e-05, "loss": 1.1431, "step": 1590 }, { "epoch": 0.06, "grad_norm": 7.55146598815918, "learning_rate": 3.143418467583497e-05, "loss": 0.9607, "step": 1600 }, { "epoch": 0.06, "grad_norm": 6.134790420532227, "learning_rate": 3.1630648330058944e-05, "loss": 1.2067, "step": 1610 }, { "epoch": 0.06, "grad_norm": 5.770500659942627, "learning_rate": 3.182711198428291e-05, "loss": 1.2247, "step": 1620 }, { "epoch": 0.06, "grad_norm": 7.203014373779297, "learning_rate": 3.2023575638506874e-05, "loss": 1.4814, "step": 1630 }, { "epoch": 0.06, "grad_norm": 10.20174503326416, "learning_rate": 3.2220039292730846e-05, "loss": 1.0392, "step": 1640 }, { "epoch": 0.06, "grad_norm": 9.342395782470703, "learning_rate": 3.241650294695481e-05, "loss": 1.3264, "step": 1650 }, { "epoch": 0.07, "grad_norm": 23.120468139648438, "learning_rate": 3.261296660117878e-05, "loss": 1.4181, "step": 1660 }, { "epoch": 0.07, "grad_norm": 9.172660827636719, "learning_rate": 3.280943025540275e-05, "loss": 1.7663, "step": 1670 }, { "epoch": 0.07, "grad_norm": 8.3864164352417, "learning_rate": 3.300589390962672e-05, "loss": 1.4011, "step": 1680 }, { "epoch": 0.07, "grad_norm": 11.054300308227539, "learning_rate": 3.320235756385069e-05, "loss": 1.3251, "step": 1690 }, { "epoch": 0.07, "grad_norm": 10.472184181213379, "learning_rate": 3.339882121807466e-05, "loss": 1.6141, "step": 1700 }, { "epoch": 0.07, "grad_norm": 3.5828475952148438, "learning_rate": 3.359528487229863e-05, "loss": 0.9862, "step": 1710 }, { "epoch": 0.07, "grad_norm": 10.149588584899902, "learning_rate": 3.3791748526522594e-05, "loss": 1.4186, "step": 1720 }, { "epoch": 0.07, "grad_norm": 11.893289566040039, "learning_rate": 3.3988212180746565e-05, "loss": 1.1678, "step": 1730 }, { "epoch": 0.07, "grad_norm": 8.834487915039062, "learning_rate": 3.418467583497053e-05, "loss": 1.7693, "step": 1740 }, { "epoch": 0.07, "grad_norm": 3.0044820308685303, "learning_rate": 3.43811394891945e-05, "loss": 1.3349, "step": 1750 }, { "epoch": 0.07, "grad_norm": 11.232534408569336, "learning_rate": 3.457760314341847e-05, "loss": 1.0112, "step": 1760 }, { "epoch": 0.07, "grad_norm": 8.355599403381348, "learning_rate": 3.477406679764244e-05, "loss": 1.5229, "step": 1770 }, { "epoch": 0.07, "grad_norm": 9.612308502197266, "learning_rate": 3.4970530451866404e-05, "loss": 1.5551, "step": 1780 }, { "epoch": 0.07, "grad_norm": 5.819346904754639, "learning_rate": 3.5166994106090376e-05, "loss": 1.8126, "step": 1790 }, { "epoch": 0.07, "grad_norm": 7.747526168823242, "learning_rate": 3.536345776031435e-05, "loss": 1.2443, "step": 1800 }, { "epoch": 0.07, "grad_norm": 11.222460746765137, "learning_rate": 3.555992141453831e-05, "loss": 1.4851, "step": 1810 }, { "epoch": 0.07, "grad_norm": 8.2022123336792, "learning_rate": 3.5756385068762285e-05, "loss": 1.5147, "step": 1820 }, { "epoch": 0.07, "grad_norm": 7.194546699523926, "learning_rate": 3.595284872298625e-05, "loss": 1.768, "step": 1830 }, { "epoch": 0.07, "grad_norm": 3.923492193222046, "learning_rate": 3.614931237721022e-05, "loss": 1.3366, "step": 1840 }, { "epoch": 0.07, "grad_norm": 5.61436128616333, "learning_rate": 3.634577603143419e-05, "loss": 1.923, "step": 1850 }, { "epoch": 0.07, "grad_norm": 10.649151802062988, "learning_rate": 3.654223968565815e-05, "loss": 1.2197, "step": 1860 }, { "epoch": 0.07, "grad_norm": 4.888509750366211, "learning_rate": 3.6738703339882124e-05, "loss": 1.4047, "step": 1870 }, { "epoch": 0.07, "grad_norm": 5.455938816070557, "learning_rate": 3.693516699410609e-05, "loss": 1.1998, "step": 1880 }, { "epoch": 0.07, "grad_norm": 5.748868942260742, "learning_rate": 3.713163064833006e-05, "loss": 1.3259, "step": 1890 }, { "epoch": 0.07, "grad_norm": 5.507759094238281, "learning_rate": 3.7328094302554026e-05, "loss": 1.1685, "step": 1900 }, { "epoch": 0.08, "grad_norm": 8.361678123474121, "learning_rate": 3.7524557956778e-05, "loss": 1.6374, "step": 1910 }, { "epoch": 0.08, "grad_norm": 6.0592041015625, "learning_rate": 3.772102161100197e-05, "loss": 1.709, "step": 1920 }, { "epoch": 0.08, "grad_norm": 10.069732666015625, "learning_rate": 3.7917485265225934e-05, "loss": 1.6663, "step": 1930 }, { "epoch": 0.08, "grad_norm": 18.412240982055664, "learning_rate": 3.8113948919449906e-05, "loss": 1.2823, "step": 1940 }, { "epoch": 0.08, "grad_norm": 3.610499382019043, "learning_rate": 3.831041257367387e-05, "loss": 1.2483, "step": 1950 }, { "epoch": 0.08, "grad_norm": 8.533556938171387, "learning_rate": 3.850687622789784e-05, "loss": 1.31, "step": 1960 }, { "epoch": 0.08, "grad_norm": 7.135300636291504, "learning_rate": 3.870333988212181e-05, "loss": 1.2421, "step": 1970 }, { "epoch": 0.08, "grad_norm": 10.809922218322754, "learning_rate": 3.889980353634578e-05, "loss": 1.8349, "step": 1980 }, { "epoch": 0.08, "grad_norm": 6.778326034545898, "learning_rate": 3.9096267190569745e-05, "loss": 1.4278, "step": 1990 }, { "epoch": 0.08, "grad_norm": 11.11804485321045, "learning_rate": 3.929273084479371e-05, "loss": 1.155, "step": 2000 }, { "epoch": 0.08, "grad_norm": 9.65732479095459, "learning_rate": 3.948919449901768e-05, "loss": 1.5908, "step": 2010 }, { "epoch": 0.08, "grad_norm": 9.297612190246582, "learning_rate": 3.9685658153241654e-05, "loss": 1.3048, "step": 2020 }, { "epoch": 0.08, "grad_norm": 8.862041473388672, "learning_rate": 3.9882121807465626e-05, "loss": 1.5642, "step": 2030 }, { "epoch": 0.08, "grad_norm": 11.724441528320312, "learning_rate": 4.007858546168959e-05, "loss": 1.4594, "step": 2040 }, { "epoch": 0.08, "grad_norm": 13.767656326293945, "learning_rate": 4.027504911591356e-05, "loss": 1.5344, "step": 2050 }, { "epoch": 0.08, "grad_norm": 10.537979125976562, "learning_rate": 4.047151277013753e-05, "loss": 1.7519, "step": 2060 }, { "epoch": 0.08, "grad_norm": 9.79917049407959, "learning_rate": 4.066797642436149e-05, "loss": 1.5063, "step": 2070 }, { "epoch": 0.08, "grad_norm": 14.532011985778809, "learning_rate": 4.0864440078585464e-05, "loss": 1.4517, "step": 2080 }, { "epoch": 0.08, "grad_norm": 7.567566871643066, "learning_rate": 4.106090373280943e-05, "loss": 1.5242, "step": 2090 }, { "epoch": 0.08, "grad_norm": 11.735125541687012, "learning_rate": 4.12573673870334e-05, "loss": 1.0596, "step": 2100 }, { "epoch": 0.08, "grad_norm": 10.138215065002441, "learning_rate": 4.1453831041257366e-05, "loss": 1.5448, "step": 2110 }, { "epoch": 0.08, "grad_norm": 28.378204345703125, "learning_rate": 4.165029469548134e-05, "loss": 1.3888, "step": 2120 }, { "epoch": 0.08, "grad_norm": 11.644747734069824, "learning_rate": 4.18467583497053e-05, "loss": 1.5252, "step": 2130 }, { "epoch": 0.08, "grad_norm": 1.6212198734283447, "learning_rate": 4.2043222003929275e-05, "loss": 1.4971, "step": 2140 }, { "epoch": 0.08, "grad_norm": 8.66162109375, "learning_rate": 4.223968565815325e-05, "loss": 1.8845, "step": 2150 }, { "epoch": 0.08, "grad_norm": 13.313812255859375, "learning_rate": 4.243614931237721e-05, "loss": 1.5392, "step": 2160 }, { "epoch": 0.09, "grad_norm": 11.202216148376465, "learning_rate": 4.2632612966601184e-05, "loss": 1.7777, "step": 2170 }, { "epoch": 0.09, "grad_norm": 13.136141777038574, "learning_rate": 4.282907662082515e-05, "loss": 1.2742, "step": 2180 }, { "epoch": 0.09, "grad_norm": 10.07437515258789, "learning_rate": 4.302554027504912e-05, "loss": 1.409, "step": 2190 }, { "epoch": 0.09, "grad_norm": 8.2798433303833, "learning_rate": 4.3222003929273086e-05, "loss": 1.5731, "step": 2200 }, { "epoch": 0.09, "grad_norm": 10.272143363952637, "learning_rate": 4.341846758349706e-05, "loss": 1.6087, "step": 2210 }, { "epoch": 0.09, "grad_norm": 10.13696002960205, "learning_rate": 4.361493123772102e-05, "loss": 1.3903, "step": 2220 }, { "epoch": 0.09, "grad_norm": 11.432174682617188, "learning_rate": 4.381139489194499e-05, "loss": 1.4511, "step": 2230 }, { "epoch": 0.09, "grad_norm": 11.11430549621582, "learning_rate": 4.400785854616896e-05, "loss": 1.239, "step": 2240 }, { "epoch": 0.09, "grad_norm": 16.314895629882812, "learning_rate": 4.4204322200392925e-05, "loss": 1.3122, "step": 2250 }, { "epoch": 0.09, "grad_norm": 15.554749488830566, "learning_rate": 4.4400785854616897e-05, "loss": 1.154, "step": 2260 }, { "epoch": 0.09, "grad_norm": 7.058994770050049, "learning_rate": 4.459724950884087e-05, "loss": 1.2116, "step": 2270 }, { "epoch": 0.09, "grad_norm": 10.799016952514648, "learning_rate": 4.479371316306484e-05, "loss": 1.4949, "step": 2280 }, { "epoch": 0.09, "grad_norm": 5.718183517456055, "learning_rate": 4.4990176817288805e-05, "loss": 1.3982, "step": 2290 }, { "epoch": 0.09, "grad_norm": 7.06242561340332, "learning_rate": 4.518664047151277e-05, "loss": 1.452, "step": 2300 }, { "epoch": 0.09, "grad_norm": 8.024133682250977, "learning_rate": 4.538310412573674e-05, "loss": 1.4932, "step": 2310 }, { "epoch": 0.09, "grad_norm": 6.693330764770508, "learning_rate": 4.557956777996071e-05, "loss": 1.3727, "step": 2320 }, { "epoch": 0.09, "grad_norm": 8.972209930419922, "learning_rate": 4.577603143418468e-05, "loss": 1.4636, "step": 2330 }, { "epoch": 0.09, "grad_norm": 15.532783508300781, "learning_rate": 4.5972495088408644e-05, "loss": 1.3717, "step": 2340 }, { "epoch": 0.09, "grad_norm": 2.770662307739258, "learning_rate": 4.6168958742632616e-05, "loss": 1.6121, "step": 2350 }, { "epoch": 0.09, "grad_norm": 6.491889953613281, "learning_rate": 4.636542239685658e-05, "loss": 1.4383, "step": 2360 }, { "epoch": 0.09, "grad_norm": 4.818174362182617, "learning_rate": 4.656188605108055e-05, "loss": 1.261, "step": 2370 }, { "epoch": 0.09, "grad_norm": 10.724140167236328, "learning_rate": 4.6758349705304525e-05, "loss": 1.4259, "step": 2380 }, { "epoch": 0.09, "grad_norm": 5.87529993057251, "learning_rate": 4.695481335952849e-05, "loss": 1.4938, "step": 2390 }, { "epoch": 0.09, "grad_norm": 7.359882831573486, "learning_rate": 4.715127701375246e-05, "loss": 1.3255, "step": 2400 }, { "epoch": 0.09, "grad_norm": 5.665154933929443, "learning_rate": 4.734774066797643e-05, "loss": 1.7927, "step": 2410 }, { "epoch": 0.1, "grad_norm": 4.808079719543457, "learning_rate": 4.75442043222004e-05, "loss": 1.1887, "step": 2420 }, { "epoch": 0.1, "grad_norm": 7.385726451873779, "learning_rate": 4.7740667976424364e-05, "loss": 1.2943, "step": 2430 }, { "epoch": 0.1, "grad_norm": 9.11865520477295, "learning_rate": 4.793713163064833e-05, "loss": 1.5941, "step": 2440 }, { "epoch": 0.1, "grad_norm": 7.88557243347168, "learning_rate": 4.81335952848723e-05, "loss": 1.407, "step": 2450 }, { "epoch": 0.1, "grad_norm": 6.10056209564209, "learning_rate": 4.8330058939096265e-05, "loss": 1.5796, "step": 2460 }, { "epoch": 0.1, "grad_norm": 11.770026206970215, "learning_rate": 4.852652259332024e-05, "loss": 1.101, "step": 2470 }, { "epoch": 0.1, "grad_norm": 10.865485191345215, "learning_rate": 4.87229862475442e-05, "loss": 1.3848, "step": 2480 }, { "epoch": 0.1, "grad_norm": 13.065580368041992, "learning_rate": 4.8919449901768174e-05, "loss": 1.6731, "step": 2490 }, { "epoch": 0.1, "grad_norm": 9.575936317443848, "learning_rate": 4.9115913555992146e-05, "loss": 1.7237, "step": 2500 }, { "epoch": 0.1, "grad_norm": 9.547867774963379, "learning_rate": 4.931237721021611e-05, "loss": 1.8957, "step": 2510 }, { "epoch": 0.1, "grad_norm": 15.875798225402832, "learning_rate": 4.950884086444008e-05, "loss": 1.6723, "step": 2520 }, { "epoch": 0.1, "grad_norm": 5.348197937011719, "learning_rate": 4.970530451866405e-05, "loss": 1.2545, "step": 2530 }, { "epoch": 0.1, "grad_norm": 8.234753608703613, "learning_rate": 4.990176817288802e-05, "loss": 1.2349, "step": 2540 }, { "epoch": 0.1, "grad_norm": 8.788379669189453, "learning_rate": 4.998908439942366e-05, "loss": 1.5329, "step": 2550 }, { "epoch": 0.1, "grad_norm": 14.56135368347168, "learning_rate": 4.996725319827097e-05, "loss": 1.7621, "step": 2560 }, { "epoch": 0.1, "grad_norm": 7.16921329498291, "learning_rate": 4.9945421997118284e-05, "loss": 1.2855, "step": 2570 }, { "epoch": 0.1, "grad_norm": 7.210436820983887, "learning_rate": 4.99235907959656e-05, "loss": 1.4386, "step": 2580 }, { "epoch": 0.1, "grad_norm": 11.392064094543457, "learning_rate": 4.990175959481291e-05, "loss": 1.5655, "step": 2590 }, { "epoch": 0.1, "grad_norm": 6.797138214111328, "learning_rate": 4.987992839366022e-05, "loss": 1.5974, "step": 2600 }, { "epoch": 0.1, "grad_norm": 8.253582000732422, "learning_rate": 4.985809719250754e-05, "loss": 1.3211, "step": 2610 }, { "epoch": 0.1, "grad_norm": 7.643932819366455, "learning_rate": 4.9836265991354846e-05, "loss": 1.4549, "step": 2620 }, { "epoch": 0.1, "grad_norm": 8.747930526733398, "learning_rate": 4.981443479020216e-05, "loss": 1.5036, "step": 2630 }, { "epoch": 0.1, "grad_norm": 7.855422496795654, "learning_rate": 4.9792603589049476e-05, "loss": 1.3499, "step": 2640 }, { "epoch": 0.1, "grad_norm": 8.04935359954834, "learning_rate": 4.9770772387896785e-05, "loss": 1.3623, "step": 2650 }, { "epoch": 0.1, "grad_norm": 11.400749206542969, "learning_rate": 4.97489411867441e-05, "loss": 1.7174, "step": 2660 }, { "epoch": 0.1, "grad_norm": 5.667577743530273, "learning_rate": 4.972710998559141e-05, "loss": 0.9566, "step": 2670 }, { "epoch": 0.11, "grad_norm": 8.566537857055664, "learning_rate": 4.970527878443872e-05, "loss": 1.5167, "step": 2680 }, { "epoch": 0.11, "grad_norm": 11.930310249328613, "learning_rate": 4.968344758328604e-05, "loss": 1.5441, "step": 2690 }, { "epoch": 0.11, "grad_norm": 11.449090003967285, "learning_rate": 4.966161638213335e-05, "loss": 1.3872, "step": 2700 }, { "epoch": 0.11, "grad_norm": 1.7841280698776245, "learning_rate": 4.9639785180980655e-05, "loss": 1.3851, "step": 2710 }, { "epoch": 0.11, "grad_norm": 9.411259651184082, "learning_rate": 4.961795397982798e-05, "loss": 1.3658, "step": 2720 }, { "epoch": 0.11, "grad_norm": 12.333673477172852, "learning_rate": 4.9596122778675286e-05, "loss": 1.6543, "step": 2730 }, { "epoch": 0.11, "grad_norm": 5.220922946929932, "learning_rate": 4.9574291577522594e-05, "loss": 1.4727, "step": 2740 }, { "epoch": 0.11, "grad_norm": 5.5571441650390625, "learning_rate": 4.955246037636991e-05, "loss": 1.4131, "step": 2750 }, { "epoch": 0.11, "grad_norm": 11.638848304748535, "learning_rate": 4.9530629175217224e-05, "loss": 1.4148, "step": 2760 }, { "epoch": 0.11, "grad_norm": 8.170778274536133, "learning_rate": 4.950879797406453e-05, "loss": 1.3222, "step": 2770 }, { "epoch": 0.11, "grad_norm": 5.6434454917907715, "learning_rate": 4.948696677291185e-05, "loss": 1.2244, "step": 2780 }, { "epoch": 0.11, "grad_norm": 20.316694259643555, "learning_rate": 4.9465135571759156e-05, "loss": 1.6371, "step": 2790 }, { "epoch": 0.11, "grad_norm": 18.24306297302246, "learning_rate": 4.944330437060647e-05, "loss": 1.3749, "step": 2800 }, { "epoch": 0.11, "grad_norm": 7.818178653717041, "learning_rate": 4.942147316945379e-05, "loss": 1.3116, "step": 2810 }, { "epoch": 0.11, "grad_norm": 6.210562705993652, "learning_rate": 4.9399641968301095e-05, "loss": 1.4921, "step": 2820 }, { "epoch": 0.11, "grad_norm": 9.905407905578613, "learning_rate": 4.937781076714841e-05, "loss": 1.4672, "step": 2830 }, { "epoch": 0.11, "grad_norm": 11.01400375366211, "learning_rate": 4.9355979565995725e-05, "loss": 1.4191, "step": 2840 }, { "epoch": 0.11, "grad_norm": 3.254570960998535, "learning_rate": 4.9334148364843034e-05, "loss": 1.3801, "step": 2850 }, { "epoch": 0.11, "grad_norm": 12.207158088684082, "learning_rate": 4.931231716369035e-05, "loss": 1.6177, "step": 2860 }, { "epoch": 0.11, "grad_norm": 10.228629112243652, "learning_rate": 4.9290485962537664e-05, "loss": 1.3422, "step": 2870 }, { "epoch": 0.11, "grad_norm": 12.078383445739746, "learning_rate": 4.926865476138497e-05, "loss": 1.522, "step": 2880 }, { "epoch": 0.11, "grad_norm": 7.838730812072754, "learning_rate": 4.924682356023229e-05, "loss": 1.6886, "step": 2890 }, { "epoch": 0.11, "grad_norm": 11.273910522460938, "learning_rate": 4.9224992359079596e-05, "loss": 1.5464, "step": 2900 }, { "epoch": 0.11, "grad_norm": 5.147863864898682, "learning_rate": 4.920316115792691e-05, "loss": 1.5664, "step": 2910 }, { "epoch": 0.11, "grad_norm": 20.459049224853516, "learning_rate": 4.9181329956774226e-05, "loss": 1.4609, "step": 2920 }, { "epoch": 0.12, "grad_norm": 8.035314559936523, "learning_rate": 4.9159498755621535e-05, "loss": 1.3739, "step": 2930 }, { "epoch": 0.12, "grad_norm": 7.041208267211914, "learning_rate": 4.913766755446885e-05, "loss": 1.4629, "step": 2940 }, { "epoch": 0.12, "grad_norm": 1.790037989616394, "learning_rate": 4.9115836353316165e-05, "loss": 1.1904, "step": 2950 }, { "epoch": 0.12, "grad_norm": 9.08872127532959, "learning_rate": 4.9094005152163474e-05, "loss": 2.074, "step": 2960 }, { "epoch": 0.12, "grad_norm": 4.665253639221191, "learning_rate": 4.907217395101079e-05, "loss": 1.642, "step": 2970 }, { "epoch": 0.12, "grad_norm": 5.311834335327148, "learning_rate": 4.90503427498581e-05, "loss": 1.4805, "step": 2980 }, { "epoch": 0.12, "grad_norm": 8.634931564331055, "learning_rate": 4.902851154870541e-05, "loss": 1.7677, "step": 2990 }, { "epoch": 0.12, "grad_norm": 13.166306495666504, "learning_rate": 4.900668034755273e-05, "loss": 1.8442, "step": 3000 }, { "epoch": 0.12, "grad_norm": 14.040495872497559, "learning_rate": 4.8984849146400036e-05, "loss": 1.4396, "step": 3010 }, { "epoch": 0.12, "grad_norm": 6.09878396987915, "learning_rate": 4.896301794524735e-05, "loss": 1.1957, "step": 3020 }, { "epoch": 0.12, "grad_norm": 10.209972381591797, "learning_rate": 4.8941186744094666e-05, "loss": 1.6388, "step": 3030 }, { "epoch": 0.12, "grad_norm": 11.767516136169434, "learning_rate": 4.8919355542941975e-05, "loss": 1.4817, "step": 3040 }, { "epoch": 0.12, "grad_norm": 14.19233512878418, "learning_rate": 4.889752434178928e-05, "loss": 1.4872, "step": 3050 }, { "epoch": 0.12, "grad_norm": 4.903499126434326, "learning_rate": 4.8875693140636605e-05, "loss": 1.412, "step": 3060 }, { "epoch": 0.12, "grad_norm": 8.952275276184082, "learning_rate": 4.885386193948391e-05, "loss": 2.1105, "step": 3070 }, { "epoch": 0.12, "grad_norm": 8.626426696777344, "learning_rate": 4.883203073833122e-05, "loss": 1.3604, "step": 3080 }, { "epoch": 0.12, "grad_norm": 11.721304893493652, "learning_rate": 4.881019953717854e-05, "loss": 1.7466, "step": 3090 }, { "epoch": 0.12, "grad_norm": 7.356558322906494, "learning_rate": 4.878836833602585e-05, "loss": 1.8017, "step": 3100 }, { "epoch": 0.12, "grad_norm": 14.052141189575195, "learning_rate": 4.876653713487316e-05, "loss": 1.5261, "step": 3110 }, { "epoch": 0.12, "grad_norm": 7.929640769958496, "learning_rate": 4.8744705933720476e-05, "loss": 1.6713, "step": 3120 }, { "epoch": 0.12, "grad_norm": 8.090364456176758, "learning_rate": 4.8722874732567784e-05, "loss": 1.5643, "step": 3130 }, { "epoch": 0.12, "grad_norm": 8.23180103302002, "learning_rate": 4.87010435314151e-05, "loss": 1.2131, "step": 3140 }, { "epoch": 0.12, "grad_norm": 5.148247718811035, "learning_rate": 4.8679212330262414e-05, "loss": 1.3958, "step": 3150 }, { "epoch": 0.12, "grad_norm": 8.83860969543457, "learning_rate": 4.865738112910972e-05, "loss": 1.9858, "step": 3160 }, { "epoch": 0.12, "grad_norm": 5.009531021118164, "learning_rate": 4.863554992795704e-05, "loss": 1.2465, "step": 3170 }, { "epoch": 0.12, "grad_norm": 13.143977165222168, "learning_rate": 4.861371872680435e-05, "loss": 1.3916, "step": 3180 }, { "epoch": 0.12, "eval_accuracy": 0.5662423385195663, "eval_loss": 1.510087490081787, "eval_runtime": 468.2795, "eval_samples_per_second": 9.059, "eval_steps_per_second": 2.266, "step": 3181 }, { "epoch": 1.0, "grad_norm": 10.143963813781738, "learning_rate": 4.859188752565166e-05, "loss": 1.7761, "step": 3190 }, { "epoch": 1.0, "grad_norm": 9.97482681274414, "learning_rate": 4.857005632449898e-05, "loss": 0.9733, "step": 3200 }, { "epoch": 1.0, "grad_norm": 7.034632682800293, "learning_rate": 4.854822512334629e-05, "loss": 1.6076, "step": 3210 }, { "epoch": 1.0, "grad_norm": 5.688974857330322, "learning_rate": 4.85263939221936e-05, "loss": 1.2839, "step": 3220 }, { "epoch": 1.0, "grad_norm": 11.914328575134277, "learning_rate": 4.8504562721040915e-05, "loss": 1.2852, "step": 3230 }, { "epoch": 1.0, "grad_norm": 3.7191710472106934, "learning_rate": 4.8482731519888224e-05, "loss": 1.5728, "step": 3240 }, { "epoch": 1.0, "grad_norm": 5.6878485679626465, "learning_rate": 4.846090031873554e-05, "loss": 1.18, "step": 3250 }, { "epoch": 1.0, "grad_norm": 10.692712783813477, "learning_rate": 4.8439069117582854e-05, "loss": 1.6006, "step": 3260 }, { "epoch": 1.0, "grad_norm": 7.867443561553955, "learning_rate": 4.841723791643016e-05, "loss": 1.376, "step": 3270 }, { "epoch": 1.0, "grad_norm": 6.656571865081787, "learning_rate": 4.839540671527748e-05, "loss": 1.3951, "step": 3280 }, { "epoch": 1.0, "grad_norm": 10.624161720275879, "learning_rate": 4.837357551412479e-05, "loss": 1.45, "step": 3290 }, { "epoch": 1.0, "grad_norm": 8.931720733642578, "learning_rate": 4.83517443129721e-05, "loss": 1.3844, "step": 3300 }, { "epoch": 1.01, "grad_norm": 13.14348316192627, "learning_rate": 4.8329913111819416e-05, "loss": 1.7095, "step": 3310 }, { "epoch": 1.01, "grad_norm": 8.860713958740234, "learning_rate": 4.8308081910666725e-05, "loss": 1.2616, "step": 3320 }, { "epoch": 1.01, "grad_norm": 8.462882041931152, "learning_rate": 4.828625070951404e-05, "loss": 1.643, "step": 3330 }, { "epoch": 1.01, "grad_norm": 3.1630160808563232, "learning_rate": 4.8264419508361355e-05, "loss": 1.0135, "step": 3340 }, { "epoch": 1.01, "grad_norm": 13.14311408996582, "learning_rate": 4.8242588307208664e-05, "loss": 1.713, "step": 3350 }, { "epoch": 1.01, "grad_norm": 12.38389778137207, "learning_rate": 4.822075710605597e-05, "loss": 1.6263, "step": 3360 }, { "epoch": 1.01, "grad_norm": 7.065594673156738, "learning_rate": 4.8198925904903294e-05, "loss": 1.3884, "step": 3370 }, { "epoch": 1.01, "grad_norm": 9.292243003845215, "learning_rate": 4.81770947037506e-05, "loss": 1.5777, "step": 3380 }, { "epoch": 1.01, "grad_norm": 11.394031524658203, "learning_rate": 4.815526350259791e-05, "loss": 1.3106, "step": 3390 }, { "epoch": 1.01, "grad_norm": 9.732278823852539, "learning_rate": 4.813343230144523e-05, "loss": 1.5997, "step": 3400 }, { "epoch": 1.01, "grad_norm": 17.422178268432617, "learning_rate": 4.811160110029254e-05, "loss": 1.4081, "step": 3410 }, { "epoch": 1.01, "grad_norm": 3.7148585319519043, "learning_rate": 4.808976989913985e-05, "loss": 1.3729, "step": 3420 }, { "epoch": 1.01, "grad_norm": 8.881731986999512, "learning_rate": 4.8067938697987165e-05, "loss": 1.3277, "step": 3430 }, { "epoch": 1.01, "grad_norm": 7.502439498901367, "learning_rate": 4.804610749683448e-05, "loss": 1.3658, "step": 3440 }, { "epoch": 1.01, "grad_norm": 9.295893669128418, "learning_rate": 4.802427629568179e-05, "loss": 1.4908, "step": 3450 }, { "epoch": 1.01, "grad_norm": 3.233078718185425, "learning_rate": 4.80024450945291e-05, "loss": 1.6476, "step": 3460 }, { "epoch": 1.01, "grad_norm": 13.235282897949219, "learning_rate": 4.798061389337641e-05, "loss": 1.3534, "step": 3470 }, { "epoch": 1.01, "grad_norm": 11.9801025390625, "learning_rate": 4.7958782692223734e-05, "loss": 1.7628, "step": 3480 }, { "epoch": 1.01, "grad_norm": 9.94530200958252, "learning_rate": 4.793695149107104e-05, "loss": 1.3445, "step": 3490 }, { "epoch": 1.01, "grad_norm": 8.22075366973877, "learning_rate": 4.791512028991835e-05, "loss": 1.2736, "step": 3500 }, { "epoch": 1.01, "grad_norm": 8.88790512084961, "learning_rate": 4.7893289088765666e-05, "loss": 1.3422, "step": 3510 }, { "epoch": 1.01, "grad_norm": 15.338109970092773, "learning_rate": 4.787145788761298e-05, "loss": 1.2089, "step": 3520 }, { "epoch": 1.01, "grad_norm": 13.21195125579834, "learning_rate": 4.784962668646029e-05, "loss": 1.4632, "step": 3530 }, { "epoch": 1.01, "grad_norm": 8.5131254196167, "learning_rate": 4.7827795485307604e-05, "loss": 1.2106, "step": 3540 }, { "epoch": 1.01, "grad_norm": 9.516834259033203, "learning_rate": 4.780596428415491e-05, "loss": 1.2242, "step": 3550 }, { "epoch": 1.01, "grad_norm": 3.863199472427368, "learning_rate": 4.778413308300223e-05, "loss": 1.0017, "step": 3560 }, { "epoch": 1.02, "grad_norm": 11.73081111907959, "learning_rate": 4.776230188184954e-05, "loss": 1.5438, "step": 3570 }, { "epoch": 1.02, "grad_norm": 12.750449180603027, "learning_rate": 4.774047068069685e-05, "loss": 1.3419, "step": 3580 }, { "epoch": 1.02, "grad_norm": 10.799705505371094, "learning_rate": 4.771863947954417e-05, "loss": 0.9993, "step": 3590 }, { "epoch": 1.02, "grad_norm": 9.25054931640625, "learning_rate": 4.769680827839148e-05, "loss": 1.3782, "step": 3600 }, { "epoch": 1.02, "grad_norm": 13.411931991577148, "learning_rate": 4.767497707723879e-05, "loss": 1.6279, "step": 3610 }, { "epoch": 1.02, "grad_norm": 9.175803184509277, "learning_rate": 4.7653145876086105e-05, "loss": 1.6972, "step": 3620 }, { "epoch": 1.02, "grad_norm": 13.229693412780762, "learning_rate": 4.763131467493342e-05, "loss": 1.3942, "step": 3630 }, { "epoch": 1.02, "grad_norm": 8.508553504943848, "learning_rate": 4.760948347378073e-05, "loss": 1.5354, "step": 3640 }, { "epoch": 1.02, "grad_norm": 10.259597778320312, "learning_rate": 4.7587652272628044e-05, "loss": 1.5637, "step": 3650 }, { "epoch": 1.02, "grad_norm": 10.034700393676758, "learning_rate": 4.756582107147535e-05, "loss": 1.1312, "step": 3660 }, { "epoch": 1.02, "grad_norm": 11.961254119873047, "learning_rate": 4.754398987032267e-05, "loss": 1.5816, "step": 3670 }, { "epoch": 1.02, "grad_norm": 8.309999465942383, "learning_rate": 4.752215866916998e-05, "loss": 1.8619, "step": 3680 }, { "epoch": 1.02, "grad_norm": 8.207474708557129, "learning_rate": 4.750032746801729e-05, "loss": 1.4892, "step": 3690 }, { "epoch": 1.02, "grad_norm": 11.339587211608887, "learning_rate": 4.74784962668646e-05, "loss": 1.3351, "step": 3700 }, { "epoch": 1.02, "grad_norm": 5.284873962402344, "learning_rate": 4.745666506571192e-05, "loss": 1.0018, "step": 3710 }, { "epoch": 1.02, "grad_norm": 10.486954689025879, "learning_rate": 4.743483386455923e-05, "loss": 1.7009, "step": 3720 }, { "epoch": 1.02, "grad_norm": 7.536387920379639, "learning_rate": 4.741300266340654e-05, "loss": 1.6927, "step": 3730 }, { "epoch": 1.02, "grad_norm": 11.057391166687012, "learning_rate": 4.7391171462253854e-05, "loss": 1.1964, "step": 3740 }, { "epoch": 1.02, "grad_norm": 11.438668251037598, "learning_rate": 4.736934026110117e-05, "loss": 1.4732, "step": 3750 }, { "epoch": 1.02, "grad_norm": 5.016892910003662, "learning_rate": 4.734750905994848e-05, "loss": 1.6115, "step": 3760 }, { "epoch": 1.02, "grad_norm": 4.691535949707031, "learning_rate": 4.732567785879579e-05, "loss": 1.1526, "step": 3770 }, { "epoch": 1.02, "grad_norm": 10.773592948913574, "learning_rate": 4.730384665764311e-05, "loss": 1.5588, "step": 3780 }, { "epoch": 1.02, "grad_norm": 8.989503860473633, "learning_rate": 4.728201545649042e-05, "loss": 1.5357, "step": 3790 }, { "epoch": 1.02, "grad_norm": 6.076661109924316, "learning_rate": 4.726018425533773e-05, "loss": 1.4407, "step": 3800 }, { "epoch": 1.02, "grad_norm": 5.561712741851807, "learning_rate": 4.723835305418504e-05, "loss": 1.4429, "step": 3810 }, { "epoch": 1.03, "grad_norm": 5.164684772491455, "learning_rate": 4.721652185303236e-05, "loss": 1.6466, "step": 3820 }, { "epoch": 1.03, "grad_norm": 5.6980180740356445, "learning_rate": 4.719469065187967e-05, "loss": 1.402, "step": 3830 }, { "epoch": 1.03, "grad_norm": 18.30165672302246, "learning_rate": 4.717285945072698e-05, "loss": 1.3913, "step": 3840 }, { "epoch": 1.03, "grad_norm": 7.518755912780762, "learning_rate": 4.7151028249574293e-05, "loss": 1.6737, "step": 3850 }, { "epoch": 1.03, "grad_norm": 10.146422386169434, "learning_rate": 4.712919704842161e-05, "loss": 1.5405, "step": 3860 }, { "epoch": 1.03, "grad_norm": 8.751389503479004, "learning_rate": 4.710736584726892e-05, "loss": 1.4557, "step": 3870 }, { "epoch": 1.03, "grad_norm": 9.556578636169434, "learning_rate": 4.708553464611623e-05, "loss": 1.442, "step": 3880 }, { "epoch": 1.03, "grad_norm": 12.347620964050293, "learning_rate": 4.706370344496354e-05, "loss": 1.8028, "step": 3890 }, { "epoch": 1.03, "grad_norm": 6.167050838470459, "learning_rate": 4.7041872243810856e-05, "loss": 1.0748, "step": 3900 }, { "epoch": 1.03, "grad_norm": 5.359996318817139, "learning_rate": 4.702004104265817e-05, "loss": 1.4697, "step": 3910 }, { "epoch": 1.03, "grad_norm": 10.503448486328125, "learning_rate": 4.699820984150548e-05, "loss": 1.4239, "step": 3920 }, { "epoch": 1.03, "grad_norm": 7.868473529815674, "learning_rate": 4.6976378640352794e-05, "loss": 1.4664, "step": 3930 }, { "epoch": 1.03, "grad_norm": 8.645943641662598, "learning_rate": 4.695454743920011e-05, "loss": 1.5887, "step": 3940 }, { "epoch": 1.03, "grad_norm": 18.052316665649414, "learning_rate": 4.693271623804742e-05, "loss": 1.4634, "step": 3950 }, { "epoch": 1.03, "grad_norm": 9.096421241760254, "learning_rate": 4.691088503689473e-05, "loss": 1.008, "step": 3960 }, { "epoch": 1.03, "grad_norm": 10.256746292114258, "learning_rate": 4.688905383574205e-05, "loss": 1.5729, "step": 3970 }, { "epoch": 1.03, "grad_norm": 11.159574508666992, "learning_rate": 4.686722263458936e-05, "loss": 1.4614, "step": 3980 }, { "epoch": 1.03, "grad_norm": 8.913644790649414, "learning_rate": 4.684539143343667e-05, "loss": 1.485, "step": 3990 }, { "epoch": 1.03, "grad_norm": 11.335363388061523, "learning_rate": 4.682356023228398e-05, "loss": 1.4922, "step": 4000 }, { "epoch": 1.03, "grad_norm": 12.085495948791504, "learning_rate": 4.6801729031131295e-05, "loss": 1.7522, "step": 4010 }, { "epoch": 1.03, "grad_norm": 7.542959213256836, "learning_rate": 4.677989782997861e-05, "loss": 1.6858, "step": 4020 }, { "epoch": 1.03, "grad_norm": 9.111419677734375, "learning_rate": 4.675806662882592e-05, "loss": 1.4292, "step": 4030 }, { "epoch": 1.03, "grad_norm": 10.08993911743164, "learning_rate": 4.673623542767323e-05, "loss": 1.1923, "step": 4040 }, { "epoch": 1.03, "grad_norm": 4.950143814086914, "learning_rate": 4.671440422652055e-05, "loss": 1.4206, "step": 4050 }, { "epoch": 1.03, "grad_norm": 8.562198638916016, "learning_rate": 4.669257302536786e-05, "loss": 1.2957, "step": 4060 }, { "epoch": 1.03, "grad_norm": 6.133686542510986, "learning_rate": 4.6670741824215166e-05, "loss": 1.413, "step": 4070 }, { "epoch": 1.04, "grad_norm": 8.008171081542969, "learning_rate": 4.664891062306248e-05, "loss": 1.1781, "step": 4080 }, { "epoch": 1.04, "grad_norm": 6.107157230377197, "learning_rate": 4.6627079421909797e-05, "loss": 1.1212, "step": 4090 }, { "epoch": 1.04, "grad_norm": 10.730534553527832, "learning_rate": 4.660524822075711e-05, "loss": 1.5133, "step": 4100 }, { "epoch": 1.04, "grad_norm": 8.251222610473633, "learning_rate": 4.658341701960442e-05, "loss": 1.0907, "step": 4110 }, { "epoch": 1.04, "grad_norm": 7.12594747543335, "learning_rate": 4.6561585818451735e-05, "loss": 1.0155, "step": 4120 }, { "epoch": 1.04, "grad_norm": 3.9965178966522217, "learning_rate": 4.653975461729905e-05, "loss": 1.5169, "step": 4130 }, { "epoch": 1.04, "grad_norm": 7.871490955352783, "learning_rate": 4.651792341614636e-05, "loss": 1.2748, "step": 4140 }, { "epoch": 1.04, "grad_norm": 8.492125511169434, "learning_rate": 4.649609221499367e-05, "loss": 1.5087, "step": 4150 }, { "epoch": 1.04, "grad_norm": 8.849578857421875, "learning_rate": 4.647426101384099e-05, "loss": 1.3693, "step": 4160 }, { "epoch": 1.04, "grad_norm": 14.816685676574707, "learning_rate": 4.64524298126883e-05, "loss": 1.4137, "step": 4170 }, { "epoch": 1.04, "grad_norm": 10.553801536560059, "learning_rate": 4.6430598611535606e-05, "loss": 1.1409, "step": 4180 }, { "epoch": 1.04, "grad_norm": 6.09594202041626, "learning_rate": 4.640876741038292e-05, "loss": 1.1802, "step": 4190 }, { "epoch": 1.04, "grad_norm": 13.451135635375977, "learning_rate": 4.6386936209230236e-05, "loss": 1.257, "step": 4200 }, { "epoch": 1.04, "grad_norm": 7.469411849975586, "learning_rate": 4.6365105008077545e-05, "loss": 1.2316, "step": 4210 }, { "epoch": 1.04, "grad_norm": 7.031535625457764, "learning_rate": 4.634327380692486e-05, "loss": 1.1496, "step": 4220 }, { "epoch": 1.04, "grad_norm": 10.719830513000488, "learning_rate": 4.632144260577217e-05, "loss": 1.5249, "step": 4230 }, { "epoch": 1.04, "grad_norm": 7.979582786560059, "learning_rate": 4.6299611404619483e-05, "loss": 0.9378, "step": 4240 }, { "epoch": 1.04, "grad_norm": 8.162788391113281, "learning_rate": 4.62777802034668e-05, "loss": 1.1664, "step": 4250 }, { "epoch": 1.04, "grad_norm": 9.076647758483887, "learning_rate": 4.625594900231411e-05, "loss": 1.7126, "step": 4260 }, { "epoch": 1.04, "grad_norm": 11.276798248291016, "learning_rate": 4.623411780116142e-05, "loss": 1.3169, "step": 4270 }, { "epoch": 1.04, "grad_norm": 13.375223159790039, "learning_rate": 4.621228660000874e-05, "loss": 1.8935, "step": 4280 }, { "epoch": 1.04, "grad_norm": 9.408567428588867, "learning_rate": 4.6190455398856046e-05, "loss": 1.2777, "step": 4290 }, { "epoch": 1.04, "grad_norm": 21.38223648071289, "learning_rate": 4.616862419770336e-05, "loss": 1.89, "step": 4300 }, { "epoch": 1.04, "grad_norm": 6.090635299682617, "learning_rate": 4.6146792996550676e-05, "loss": 1.3209, "step": 4310 }, { "epoch": 1.04, "grad_norm": 10.609798431396484, "learning_rate": 4.6124961795397984e-05, "loss": 1.8142, "step": 4320 }, { "epoch": 1.05, "grad_norm": 12.922097206115723, "learning_rate": 4.61031305942453e-05, "loss": 1.2468, "step": 4330 }, { "epoch": 1.05, "grad_norm": 7.043770790100098, "learning_rate": 4.608129939309261e-05, "loss": 1.2018, "step": 4340 }, { "epoch": 1.05, "grad_norm": 9.51561164855957, "learning_rate": 4.605946819193992e-05, "loss": 1.0995, "step": 4350 }, { "epoch": 1.05, "grad_norm": 7.798288345336914, "learning_rate": 4.603763699078724e-05, "loss": 0.8999, "step": 4360 }, { "epoch": 1.05, "grad_norm": 9.319433212280273, "learning_rate": 4.601580578963455e-05, "loss": 1.885, "step": 4370 }, { "epoch": 1.05, "grad_norm": 7.906859397888184, "learning_rate": 4.5993974588481855e-05, "loss": 1.471, "step": 4380 }, { "epoch": 1.05, "grad_norm": 2.07707142829895, "learning_rate": 4.597214338732918e-05, "loss": 1.0106, "step": 4390 }, { "epoch": 1.05, "grad_norm": 9.248849868774414, "learning_rate": 4.5950312186176486e-05, "loss": 1.1346, "step": 4400 }, { "epoch": 1.05, "grad_norm": 6.911819934844971, "learning_rate": 4.59284809850238e-05, "loss": 1.3789, "step": 4410 }, { "epoch": 1.05, "grad_norm": 10.98066234588623, "learning_rate": 4.590664978387111e-05, "loss": 0.9063, "step": 4420 }, { "epoch": 1.05, "grad_norm": 8.73058795928955, "learning_rate": 4.5884818582718424e-05, "loss": 1.1106, "step": 4430 }, { "epoch": 1.05, "grad_norm": 2.6824753284454346, "learning_rate": 4.586298738156574e-05, "loss": 1.4832, "step": 4440 }, { "epoch": 1.05, "grad_norm": 10.074097633361816, "learning_rate": 4.584115618041305e-05, "loss": 1.5585, "step": 4450 }, { "epoch": 1.05, "grad_norm": 6.4544219970703125, "learning_rate": 4.5819324979260356e-05, "loss": 1.2475, "step": 4460 }, { "epoch": 1.05, "grad_norm": 7.890171527862549, "learning_rate": 4.579749377810768e-05, "loss": 1.9584, "step": 4470 }, { "epoch": 1.05, "grad_norm": 11.118244171142578, "learning_rate": 4.5775662576954987e-05, "loss": 1.7102, "step": 4480 }, { "epoch": 1.05, "grad_norm": 8.275115966796875, "learning_rate": 4.5753831375802295e-05, "loss": 1.0324, "step": 4490 }, { "epoch": 1.05, "grad_norm": 7.449193954467773, "learning_rate": 4.573200017464962e-05, "loss": 1.4796, "step": 4500 }, { "epoch": 1.05, "grad_norm": 18.889055252075195, "learning_rate": 4.5710168973496925e-05, "loss": 1.325, "step": 4510 }, { "epoch": 1.05, "grad_norm": 15.009246826171875, "learning_rate": 4.5688337772344234e-05, "loss": 1.5762, "step": 4520 }, { "epoch": 1.05, "grad_norm": 6.199570178985596, "learning_rate": 4.566650657119155e-05, "loss": 1.4229, "step": 4530 }, { "epoch": 1.05, "grad_norm": 9.907659530639648, "learning_rate": 4.5644675370038864e-05, "loss": 1.1091, "step": 4540 }, { "epoch": 1.05, "grad_norm": 6.617704391479492, "learning_rate": 4.562284416888617e-05, "loss": 1.2615, "step": 4550 }, { "epoch": 1.05, "grad_norm": 9.920687675476074, "learning_rate": 4.560101296773349e-05, "loss": 1.5242, "step": 4560 }, { "epoch": 1.05, "grad_norm": 11.011232376098633, "learning_rate": 4.5579181766580796e-05, "loss": 1.441, "step": 4570 }, { "epoch": 1.05, "grad_norm": 8.547407150268555, "learning_rate": 4.555735056542811e-05, "loss": 1.4599, "step": 4580 }, { "epoch": 1.06, "grad_norm": 10.177791595458984, "learning_rate": 4.5535519364275426e-05, "loss": 1.5294, "step": 4590 }, { "epoch": 1.06, "grad_norm": 9.815908432006836, "learning_rate": 4.5513688163122735e-05, "loss": 2.025, "step": 4600 }, { "epoch": 1.06, "grad_norm": 3.541128158569336, "learning_rate": 4.549185696197005e-05, "loss": 1.5358, "step": 4610 }, { "epoch": 1.06, "grad_norm": 5.384029865264893, "learning_rate": 4.5470025760817365e-05, "loss": 1.3514, "step": 4620 }, { "epoch": 1.06, "grad_norm": 6.140589714050293, "learning_rate": 4.5448194559664673e-05, "loss": 1.3597, "step": 4630 }, { "epoch": 1.06, "grad_norm": 17.420961380004883, "learning_rate": 4.542636335851199e-05, "loss": 1.5466, "step": 4640 }, { "epoch": 1.06, "grad_norm": 8.64077377319336, "learning_rate": 4.54045321573593e-05, "loss": 1.8637, "step": 4650 }, { "epoch": 1.06, "grad_norm": 12.511388778686523, "learning_rate": 4.538270095620661e-05, "loss": 1.0227, "step": 4660 }, { "epoch": 1.06, "grad_norm": 9.870147705078125, "learning_rate": 4.536086975505393e-05, "loss": 1.6874, "step": 4670 }, { "epoch": 1.06, "grad_norm": 5.387143611907959, "learning_rate": 4.5339038553901236e-05, "loss": 1.4033, "step": 4680 }, { "epoch": 1.06, "grad_norm": 6.215307235717773, "learning_rate": 4.531720735274855e-05, "loss": 1.3022, "step": 4690 }, { "epoch": 1.06, "grad_norm": 9.970330238342285, "learning_rate": 4.5295376151595866e-05, "loss": 1.5071, "step": 4700 }, { "epoch": 1.06, "grad_norm": 8.783123970031738, "learning_rate": 4.5273544950443175e-05, "loss": 1.3248, "step": 4710 }, { "epoch": 1.06, "grad_norm": 4.01648473739624, "learning_rate": 4.525171374929049e-05, "loss": 1.4289, "step": 4720 }, { "epoch": 1.06, "grad_norm": 6.31545352935791, "learning_rate": 4.5229882548137805e-05, "loss": 0.8816, "step": 4730 }, { "epoch": 1.06, "grad_norm": 8.44157600402832, "learning_rate": 4.520805134698511e-05, "loss": 1.4249, "step": 4740 }, { "epoch": 1.06, "grad_norm": 11.265628814697266, "learning_rate": 4.518622014583243e-05, "loss": 1.0749, "step": 4750 }, { "epoch": 1.06, "grad_norm": 14.69555377960205, "learning_rate": 4.516438894467974e-05, "loss": 0.9858, "step": 4760 }, { "epoch": 1.06, "grad_norm": 12.71445083618164, "learning_rate": 4.514255774352705e-05, "loss": 1.3262, "step": 4770 }, { "epoch": 1.06, "grad_norm": 12.089975357055664, "learning_rate": 4.512072654237437e-05, "loss": 1.4533, "step": 4780 }, { "epoch": 1.06, "grad_norm": 3.764937162399292, "learning_rate": 4.5098895341221676e-05, "loss": 1.535, "step": 4790 }, { "epoch": 1.06, "grad_norm": 12.15512752532959, "learning_rate": 4.5077064140068984e-05, "loss": 1.363, "step": 4800 }, { "epoch": 1.06, "grad_norm": 5.814800262451172, "learning_rate": 4.5055232938916306e-05, "loss": 1.5283, "step": 4810 }, { "epoch": 1.06, "grad_norm": 5.920871734619141, "learning_rate": 4.5033401737763614e-05, "loss": 1.2184, "step": 4820 }, { "epoch": 1.06, "grad_norm": 10.319404602050781, "learning_rate": 4.501157053661092e-05, "loss": 1.1123, "step": 4830 }, { "epoch": 1.07, "grad_norm": 10.613509178161621, "learning_rate": 4.498973933545824e-05, "loss": 1.2214, "step": 4840 }, { "epoch": 1.07, "grad_norm": 1.9326369762420654, "learning_rate": 4.496790813430555e-05, "loss": 1.1005, "step": 4850 }, { "epoch": 1.07, "grad_norm": 5.255860805511475, "learning_rate": 4.494607693315286e-05, "loss": 1.6839, "step": 4860 }, { "epoch": 1.07, "grad_norm": 9.374025344848633, "learning_rate": 4.4924245732000177e-05, "loss": 1.2228, "step": 4870 }, { "epoch": 1.07, "grad_norm": 4.1247944831848145, "learning_rate": 4.490241453084749e-05, "loss": 1.0696, "step": 4880 }, { "epoch": 1.07, "grad_norm": 6.179527759552002, "learning_rate": 4.48805833296948e-05, "loss": 1.3659, "step": 4890 }, { "epoch": 1.07, "grad_norm": 13.348698616027832, "learning_rate": 4.4858752128542115e-05, "loss": 1.6335, "step": 4900 }, { "epoch": 1.07, "grad_norm": 10.437296867370605, "learning_rate": 4.4836920927389424e-05, "loss": 1.8773, "step": 4910 }, { "epoch": 1.07, "grad_norm": 9.636833190917969, "learning_rate": 4.481508972623674e-05, "loss": 1.2797, "step": 4920 }, { "epoch": 1.07, "grad_norm": 13.312190055847168, "learning_rate": 4.4793258525084054e-05, "loss": 1.4097, "step": 4930 }, { "epoch": 1.07, "grad_norm": 11.044099807739258, "learning_rate": 4.477142732393136e-05, "loss": 1.5917, "step": 4940 }, { "epoch": 1.07, "grad_norm": 15.599660873413086, "learning_rate": 4.474959612277868e-05, "loss": 1.6672, "step": 4950 }, { "epoch": 1.07, "grad_norm": 9.421758651733398, "learning_rate": 4.472776492162599e-05, "loss": 1.4526, "step": 4960 }, { "epoch": 1.07, "grad_norm": 5.059420585632324, "learning_rate": 4.47059337204733e-05, "loss": 1.2382, "step": 4970 }, { "epoch": 1.07, "grad_norm": 3.259791851043701, "learning_rate": 4.4684102519320616e-05, "loss": 1.0361, "step": 4980 }, { "epoch": 1.07, "grad_norm": 7.2442426681518555, "learning_rate": 4.4662271318167925e-05, "loss": 1.186, "step": 4990 }, { "epoch": 1.07, "grad_norm": 14.159686088562012, "learning_rate": 4.464044011701524e-05, "loss": 1.0536, "step": 5000 }, { "epoch": 1.07, "grad_norm": 4.3618550300598145, "learning_rate": 4.4618608915862555e-05, "loss": 0.8875, "step": 5010 }, { "epoch": 1.07, "grad_norm": 6.963890552520752, "learning_rate": 4.4596777714709863e-05, "loss": 1.4126, "step": 5020 }, { "epoch": 1.07, "grad_norm": 9.353178977966309, "learning_rate": 4.457494651355718e-05, "loss": 1.4585, "step": 5030 }, { "epoch": 1.07, "grad_norm": 7.040259838104248, "learning_rate": 4.4553115312404494e-05, "loss": 1.375, "step": 5040 }, { "epoch": 1.07, "grad_norm": 12.671477317810059, "learning_rate": 4.45312841112518e-05, "loss": 1.0769, "step": 5050 }, { "epoch": 1.07, "grad_norm": 7.251068115234375, "learning_rate": 4.450945291009912e-05, "loss": 1.911, "step": 5060 }, { "epoch": 1.07, "grad_norm": 10.104564666748047, "learning_rate": 4.448762170894643e-05, "loss": 1.2554, "step": 5070 }, { "epoch": 1.07, "grad_norm": 9.69332218170166, "learning_rate": 4.446579050779374e-05, "loss": 1.0348, "step": 5080 }, { "epoch": 1.08, "grad_norm": 10.503662109375, "learning_rate": 4.4443959306641056e-05, "loss": 1.4293, "step": 5090 }, { "epoch": 1.08, "grad_norm": 25.976200103759766, "learning_rate": 4.4422128105488365e-05, "loss": 1.4591, "step": 5100 }, { "epoch": 1.08, "grad_norm": 8.001091003417969, "learning_rate": 4.440029690433568e-05, "loss": 1.3777, "step": 5110 }, { "epoch": 1.08, "grad_norm": 3.186648368835449, "learning_rate": 4.4378465703182995e-05, "loss": 1.3863, "step": 5120 }, { "epoch": 1.08, "grad_norm": 9.589128494262695, "learning_rate": 4.43566345020303e-05, "loss": 1.4746, "step": 5130 }, { "epoch": 1.08, "grad_norm": 10.536452293395996, "learning_rate": 4.433480330087761e-05, "loss": 1.3822, "step": 5140 }, { "epoch": 1.08, "grad_norm": 9.60023021697998, "learning_rate": 4.4312972099724934e-05, "loss": 0.9355, "step": 5150 }, { "epoch": 1.08, "grad_norm": 10.864623069763184, "learning_rate": 4.429114089857224e-05, "loss": 1.6614, "step": 5160 }, { "epoch": 1.08, "grad_norm": 3.658456325531006, "learning_rate": 4.426930969741955e-05, "loss": 1.7109, "step": 5170 }, { "epoch": 1.08, "grad_norm": 13.351730346679688, "learning_rate": 4.4247478496266866e-05, "loss": 1.6071, "step": 5180 }, { "epoch": 1.08, "grad_norm": 14.85404109954834, "learning_rate": 4.422564729511418e-05, "loss": 1.7418, "step": 5190 }, { "epoch": 1.08, "grad_norm": 7.094925403594971, "learning_rate": 4.420381609396149e-05, "loss": 1.3652, "step": 5200 }, { "epoch": 1.08, "grad_norm": 8.321815490722656, "learning_rate": 4.4181984892808804e-05, "loss": 1.5604, "step": 5210 }, { "epoch": 1.08, "grad_norm": 2.4179341793060303, "learning_rate": 4.416015369165611e-05, "loss": 1.3683, "step": 5220 }, { "epoch": 1.08, "grad_norm": 8.329386711120605, "learning_rate": 4.413832249050343e-05, "loss": 1.5238, "step": 5230 }, { "epoch": 1.08, "grad_norm": 5.000741958618164, "learning_rate": 4.411649128935074e-05, "loss": 0.9069, "step": 5240 }, { "epoch": 1.08, "grad_norm": 7.406224727630615, "learning_rate": 4.409466008819805e-05, "loss": 1.4594, "step": 5250 }, { "epoch": 1.08, "grad_norm": 7.657325744628906, "learning_rate": 4.407282888704537e-05, "loss": 1.4955, "step": 5260 }, { "epoch": 1.08, "grad_norm": 11.027125358581543, "learning_rate": 4.405099768589268e-05, "loss": 1.1406, "step": 5270 }, { "epoch": 1.08, "grad_norm": 8.906120300292969, "learning_rate": 4.402916648473999e-05, "loss": 1.3478, "step": 5280 }, { "epoch": 1.08, "grad_norm": 12.594207763671875, "learning_rate": 4.4007335283587305e-05, "loss": 1.9564, "step": 5290 }, { "epoch": 1.08, "grad_norm": 8.870455741882324, "learning_rate": 4.398550408243462e-05, "loss": 1.3615, "step": 5300 }, { "epoch": 1.08, "grad_norm": 10.202986717224121, "learning_rate": 4.396367288128193e-05, "loss": 1.3245, "step": 5310 }, { "epoch": 1.08, "grad_norm": 8.025837898254395, "learning_rate": 4.3941841680129244e-05, "loss": 1.095, "step": 5320 }, { "epoch": 1.08, "grad_norm": 7.455843448638916, "learning_rate": 4.392001047897655e-05, "loss": 1.518, "step": 5330 }, { "epoch": 1.08, "grad_norm": 9.10627555847168, "learning_rate": 4.389817927782387e-05, "loss": 1.7162, "step": 5340 }, { "epoch": 1.09, "grad_norm": 5.706864356994629, "learning_rate": 4.387634807667118e-05, "loss": 1.0257, "step": 5350 }, { "epoch": 1.09, "grad_norm": 12.348214149475098, "learning_rate": 4.385451687551849e-05, "loss": 1.3507, "step": 5360 }, { "epoch": 1.09, "grad_norm": 4.653382301330566, "learning_rate": 4.3832685674365806e-05, "loss": 1.2564, "step": 5370 }, { "epoch": 1.09, "grad_norm": 7.7382001876831055, "learning_rate": 4.381085447321312e-05, "loss": 1.2119, "step": 5380 }, { "epoch": 1.09, "grad_norm": 11.806276321411133, "learning_rate": 4.378902327206043e-05, "loss": 1.5198, "step": 5390 }, { "epoch": 1.09, "grad_norm": 5.764992713928223, "learning_rate": 4.3767192070907745e-05, "loss": 1.7527, "step": 5400 }, { "epoch": 1.09, "grad_norm": 10.073014259338379, "learning_rate": 4.374536086975506e-05, "loss": 1.7294, "step": 5410 }, { "epoch": 1.09, "grad_norm": 10.972870826721191, "learning_rate": 4.372352966860237e-05, "loss": 1.0573, "step": 5420 }, { "epoch": 1.09, "grad_norm": 7.774905204772949, "learning_rate": 4.3701698467449684e-05, "loss": 1.564, "step": 5430 }, { "epoch": 1.09, "grad_norm": 10.355585098266602, "learning_rate": 4.367986726629699e-05, "loss": 1.2876, "step": 5440 }, { "epoch": 1.09, "grad_norm": 7.519742965698242, "learning_rate": 4.365803606514431e-05, "loss": 1.172, "step": 5450 }, { "epoch": 1.09, "grad_norm": 5.678964138031006, "learning_rate": 4.363620486399162e-05, "loss": 1.1193, "step": 5460 }, { "epoch": 1.09, "grad_norm": 8.34531021118164, "learning_rate": 4.361437366283893e-05, "loss": 1.4139, "step": 5470 }, { "epoch": 1.09, "grad_norm": 10.988582611083984, "learning_rate": 4.359254246168624e-05, "loss": 1.2381, "step": 5480 }, { "epoch": 1.09, "grad_norm": 9.079237937927246, "learning_rate": 4.357071126053356e-05, "loss": 1.7306, "step": 5490 }, { "epoch": 1.09, "grad_norm": 12.234977722167969, "learning_rate": 4.354888005938087e-05, "loss": 1.6351, "step": 5500 }, { "epoch": 1.09, "grad_norm": 11.239243507385254, "learning_rate": 4.352704885822818e-05, "loss": 1.4795, "step": 5510 }, { "epoch": 1.09, "grad_norm": 4.135558605194092, "learning_rate": 4.350521765707549e-05, "loss": 1.4064, "step": 5520 }, { "epoch": 1.09, "grad_norm": 4.801987648010254, "learning_rate": 4.348338645592281e-05, "loss": 1.3574, "step": 5530 }, { "epoch": 1.09, "grad_norm": 22.40532112121582, "learning_rate": 4.346155525477012e-05, "loss": 1.3873, "step": 5540 }, { "epoch": 1.09, "grad_norm": 17.664045333862305, "learning_rate": 4.343972405361743e-05, "loss": 1.2345, "step": 5550 }, { "epoch": 1.09, "grad_norm": 9.555740356445312, "learning_rate": 4.341789285246474e-05, "loss": 1.1707, "step": 5560 }, { "epoch": 1.09, "grad_norm": 9.065106391906738, "learning_rate": 4.3396061651312056e-05, "loss": 1.411, "step": 5570 }, { "epoch": 1.09, "grad_norm": 10.068510055541992, "learning_rate": 4.337423045015937e-05, "loss": 1.2798, "step": 5580 }, { "epoch": 1.09, "grad_norm": 10.018562316894531, "learning_rate": 4.335239924900668e-05, "loss": 1.1519, "step": 5590 }, { "epoch": 1.1, "grad_norm": 10.77661418914795, "learning_rate": 4.3330568047853994e-05, "loss": 1.3274, "step": 5600 }, { "epoch": 1.1, "grad_norm": 5.27345609664917, "learning_rate": 4.330873684670131e-05, "loss": 1.0769, "step": 5610 }, { "epoch": 1.1, "grad_norm": 9.78793716430664, "learning_rate": 4.328690564554862e-05, "loss": 1.8263, "step": 5620 }, { "epoch": 1.1, "grad_norm": 6.7209272384643555, "learning_rate": 4.326507444439593e-05, "loss": 1.4534, "step": 5630 }, { "epoch": 1.1, "grad_norm": 16.386430740356445, "learning_rate": 4.324324324324325e-05, "loss": 1.2706, "step": 5640 }, { "epoch": 1.1, "grad_norm": 11.684378623962402, "learning_rate": 4.322141204209056e-05, "loss": 1.818, "step": 5650 }, { "epoch": 1.1, "grad_norm": 6.29071569442749, "learning_rate": 4.319958084093787e-05, "loss": 1.331, "step": 5660 }, { "epoch": 1.1, "grad_norm": 9.52759075164795, "learning_rate": 4.317774963978518e-05, "loss": 1.2682, "step": 5670 }, { "epoch": 1.1, "grad_norm": 9.667022705078125, "learning_rate": 4.3155918438632495e-05, "loss": 1.1853, "step": 5680 }, { "epoch": 1.1, "grad_norm": 3.4852025508880615, "learning_rate": 4.313408723747981e-05, "loss": 1.2407, "step": 5690 }, { "epoch": 1.1, "grad_norm": 7.062267303466797, "learning_rate": 4.311225603632712e-05, "loss": 1.1836, "step": 5700 }, { "epoch": 1.1, "grad_norm": 11.635501861572266, "learning_rate": 4.3090424835174434e-05, "loss": 1.6584, "step": 5710 }, { "epoch": 1.1, "grad_norm": 6.469181537628174, "learning_rate": 4.306859363402175e-05, "loss": 1.508, "step": 5720 }, { "epoch": 1.1, "grad_norm": 9.674896240234375, "learning_rate": 4.304676243286906e-05, "loss": 1.8292, "step": 5730 }, { "epoch": 1.1, "grad_norm": 7.221367359161377, "learning_rate": 4.302493123171637e-05, "loss": 1.4369, "step": 5740 }, { "epoch": 1.1, "grad_norm": 3.692150592803955, "learning_rate": 4.300310003056368e-05, "loss": 0.985, "step": 5750 }, { "epoch": 1.1, "grad_norm": 10.339776992797852, "learning_rate": 4.2981268829410996e-05, "loss": 1.8611, "step": 5760 }, { "epoch": 1.1, "grad_norm": 21.516637802124023, "learning_rate": 4.295943762825831e-05, "loss": 1.1063, "step": 5770 }, { "epoch": 1.1, "grad_norm": 10.89984130859375, "learning_rate": 4.293760642710562e-05, "loss": 1.4435, "step": 5780 }, { "epoch": 1.1, "grad_norm": 6.787333011627197, "learning_rate": 4.2915775225952935e-05, "loss": 1.4575, "step": 5790 }, { "epoch": 1.1, "grad_norm": 12.513680458068848, "learning_rate": 4.289394402480025e-05, "loss": 1.5727, "step": 5800 }, { "epoch": 1.1, "grad_norm": 8.494597434997559, "learning_rate": 4.287211282364756e-05, "loss": 1.0312, "step": 5810 }, { "epoch": 1.1, "grad_norm": 11.695953369140625, "learning_rate": 4.285028162249487e-05, "loss": 1.363, "step": 5820 }, { "epoch": 1.1, "grad_norm": 13.415657997131348, "learning_rate": 4.282845042134219e-05, "loss": 1.2698, "step": 5830 }, { "epoch": 1.1, "grad_norm": 11.020434379577637, "learning_rate": 4.28066192201895e-05, "loss": 1.5602, "step": 5840 }, { "epoch": 1.1, "grad_norm": 13.339118957519531, "learning_rate": 4.2784788019036806e-05, "loss": 1.4492, "step": 5850 }, { "epoch": 1.11, "grad_norm": 10.761958122253418, "learning_rate": 4.276295681788412e-05, "loss": 1.2013, "step": 5860 }, { "epoch": 1.11, "grad_norm": 16.963329315185547, "learning_rate": 4.2741125616731436e-05, "loss": 1.2333, "step": 5870 }, { "epoch": 1.11, "grad_norm": 11.41370964050293, "learning_rate": 4.2719294415578745e-05, "loss": 1.0943, "step": 5880 }, { "epoch": 1.11, "grad_norm": 13.932652473449707, "learning_rate": 4.269746321442606e-05, "loss": 1.3185, "step": 5890 }, { "epoch": 1.11, "grad_norm": 7.832998752593994, "learning_rate": 4.267563201327337e-05, "loss": 1.1908, "step": 5900 }, { "epoch": 1.11, "grad_norm": 3.6177937984466553, "learning_rate": 4.265380081212068e-05, "loss": 0.8071, "step": 5910 }, { "epoch": 1.11, "grad_norm": 9.798988342285156, "learning_rate": 4.2631969610968e-05, "loss": 1.388, "step": 5920 }, { "epoch": 1.11, "grad_norm": 10.035263061523438, "learning_rate": 4.261013840981531e-05, "loss": 1.6554, "step": 5930 }, { "epoch": 1.11, "grad_norm": 10.603642463684082, "learning_rate": 4.258830720866262e-05, "loss": 1.546, "step": 5940 }, { "epoch": 1.11, "grad_norm": 3.801497459411621, "learning_rate": 4.256647600750994e-05, "loss": 1.0172, "step": 5950 }, { "epoch": 1.11, "grad_norm": 14.851375579833984, "learning_rate": 4.2544644806357246e-05, "loss": 1.7496, "step": 5960 }, { "epoch": 1.11, "grad_norm": 10.721879959106445, "learning_rate": 4.252281360520456e-05, "loss": 1.3678, "step": 5970 }, { "epoch": 1.11, "grad_norm": 11.220300674438477, "learning_rate": 4.2500982404051876e-05, "loss": 1.4647, "step": 5980 }, { "epoch": 1.11, "grad_norm": 9.148963928222656, "learning_rate": 4.2479151202899184e-05, "loss": 1.7119, "step": 5990 }, { "epoch": 1.11, "grad_norm": 6.071213722229004, "learning_rate": 4.24573200017465e-05, "loss": 1.4969, "step": 6000 }, { "epoch": 1.11, "grad_norm": 6.835692882537842, "learning_rate": 4.243548880059381e-05, "loss": 1.275, "step": 6010 }, { "epoch": 1.11, "grad_norm": 12.33680534362793, "learning_rate": 4.241365759944112e-05, "loss": 1.1403, "step": 6020 }, { "epoch": 1.11, "grad_norm": 10.196438789367676, "learning_rate": 4.239182639828844e-05, "loss": 1.4146, "step": 6030 }, { "epoch": 1.11, "grad_norm": 6.318220138549805, "learning_rate": 4.236999519713575e-05, "loss": 1.4477, "step": 6040 }, { "epoch": 1.11, "grad_norm": 6.795490741729736, "learning_rate": 4.234816399598306e-05, "loss": 1.0804, "step": 6050 }, { "epoch": 1.11, "grad_norm": 11.984965324401855, "learning_rate": 4.232633279483038e-05, "loss": 1.4842, "step": 6060 }, { "epoch": 1.11, "grad_norm": 5.881481647491455, "learning_rate": 4.2304501593677685e-05, "loss": 1.3527, "step": 6070 }, { "epoch": 1.11, "grad_norm": 8.952033042907715, "learning_rate": 4.2282670392525e-05, "loss": 1.5106, "step": 6080 }, { "epoch": 1.11, "grad_norm": 9.092145919799805, "learning_rate": 4.226083919137231e-05, "loss": 1.5945, "step": 6090 }, { "epoch": 1.11, "grad_norm": 10.932160377502441, "learning_rate": 4.2239007990219624e-05, "loss": 1.4243, "step": 6100 }, { "epoch": 1.12, "grad_norm": 14.826157569885254, "learning_rate": 4.221717678906694e-05, "loss": 1.4937, "step": 6110 }, { "epoch": 1.12, "grad_norm": 9.662437438964844, "learning_rate": 4.219534558791425e-05, "loss": 1.4405, "step": 6120 }, { "epoch": 1.12, "grad_norm": 8.702345848083496, "learning_rate": 4.2173514386761556e-05, "loss": 1.1632, "step": 6130 }, { "epoch": 1.12, "grad_norm": 7.759054183959961, "learning_rate": 4.215168318560888e-05, "loss": 1.4207, "step": 6140 }, { "epoch": 1.12, "grad_norm": 6.042901039123535, "learning_rate": 4.2129851984456186e-05, "loss": 1.0835, "step": 6150 }, { "epoch": 1.12, "grad_norm": 12.703083992004395, "learning_rate": 4.2108020783303495e-05, "loss": 1.253, "step": 6160 }, { "epoch": 1.12, "grad_norm": 10.430384635925293, "learning_rate": 4.208618958215082e-05, "loss": 1.4686, "step": 6170 }, { "epoch": 1.12, "grad_norm": 5.644809722900391, "learning_rate": 4.2064358380998125e-05, "loss": 1.1186, "step": 6180 }, { "epoch": 1.12, "grad_norm": 2.9076974391937256, "learning_rate": 4.2042527179845434e-05, "loss": 1.3987, "step": 6190 }, { "epoch": 1.12, "grad_norm": 6.042174339294434, "learning_rate": 4.202069597869275e-05, "loss": 1.5718, "step": 6200 }, { "epoch": 1.12, "grad_norm": 8.924242973327637, "learning_rate": 4.1998864777540064e-05, "loss": 1.6369, "step": 6210 }, { "epoch": 1.12, "grad_norm": 7.816781044006348, "learning_rate": 4.197703357638737e-05, "loss": 1.4422, "step": 6220 }, { "epoch": 1.12, "grad_norm": 12.56850528717041, "learning_rate": 4.195520237523469e-05, "loss": 1.4442, "step": 6230 }, { "epoch": 1.12, "grad_norm": 7.811129093170166, "learning_rate": 4.1933371174081996e-05, "loss": 1.0775, "step": 6240 }, { "epoch": 1.12, "grad_norm": 8.069817543029785, "learning_rate": 4.191153997292932e-05, "loss": 1.037, "step": 6250 }, { "epoch": 1.12, "grad_norm": 11.316567420959473, "learning_rate": 4.1889708771776626e-05, "loss": 1.3522, "step": 6260 }, { "epoch": 1.12, "grad_norm": 8.277640342712402, "learning_rate": 4.1867877570623935e-05, "loss": 1.3446, "step": 6270 }, { "epoch": 1.12, "grad_norm": 5.92270040512085, "learning_rate": 4.184604636947125e-05, "loss": 0.919, "step": 6280 }, { "epoch": 1.12, "grad_norm": 11.871767044067383, "learning_rate": 4.1824215168318565e-05, "loss": 1.7094, "step": 6290 }, { "epoch": 1.12, "grad_norm": 4.560102939605713, "learning_rate": 4.180238396716587e-05, "loss": 1.0964, "step": 6300 }, { "epoch": 1.12, "grad_norm": 4.638527870178223, "learning_rate": 4.178055276601319e-05, "loss": 1.3042, "step": 6310 }, { "epoch": 1.12, "grad_norm": 7.609078407287598, "learning_rate": 4.17587215648605e-05, "loss": 1.364, "step": 6320 }, { "epoch": 1.12, "grad_norm": 8.808463096618652, "learning_rate": 4.173689036370781e-05, "loss": 1.3478, "step": 6330 }, { "epoch": 1.12, "grad_norm": 7.644648551940918, "learning_rate": 4.171505916255513e-05, "loss": 1.4559, "step": 6340 }, { "epoch": 1.12, "grad_norm": 5.955029487609863, "learning_rate": 4.1693227961402436e-05, "loss": 1.1947, "step": 6350 }, { "epoch": 1.12, "grad_norm": 11.162169456481934, "learning_rate": 4.167139676024975e-05, "loss": 1.566, "step": 6360 }, { "epoch": 1.12, "eval_accuracy": 0.6317774634606318, "eval_loss": 1.2703893184661865, "eval_runtime": 374.6858, "eval_samples_per_second": 11.321, "eval_steps_per_second": 2.832, "step": 6362 }, { "epoch": 2.0, "grad_norm": 20.70047950744629, "learning_rate": 4.1649565559097066e-05, "loss": 1.3217, "step": 6370 }, { "epoch": 2.0, "grad_norm": 8.594015121459961, "learning_rate": 4.1627734357944374e-05, "loss": 1.5316, "step": 6380 }, { "epoch": 2.0, "grad_norm": 11.462390899658203, "learning_rate": 4.160590315679169e-05, "loss": 1.4135, "step": 6390 }, { "epoch": 2.0, "grad_norm": 10.786730766296387, "learning_rate": 4.1584071955639005e-05, "loss": 1.2027, "step": 6400 }, { "epoch": 2.0, "grad_norm": 4.670394420623779, "learning_rate": 4.156224075448631e-05, "loss": 1.2845, "step": 6410 }, { "epoch": 2.0, "grad_norm": 4.784489631652832, "learning_rate": 4.154040955333363e-05, "loss": 1.2445, "step": 6420 }, { "epoch": 2.0, "grad_norm": 1.4236918687820435, "learning_rate": 4.151857835218094e-05, "loss": 0.706, "step": 6430 }, { "epoch": 2.0, "grad_norm": 1.4937036037445068, "learning_rate": 4.149674715102825e-05, "loss": 1.035, "step": 6440 }, { "epoch": 2.0, "grad_norm": 4.353930950164795, "learning_rate": 4.147491594987557e-05, "loss": 1.9907, "step": 6450 }, { "epoch": 2.0, "grad_norm": 7.945840835571289, "learning_rate": 4.1453084748722875e-05, "loss": 1.4635, "step": 6460 }, { "epoch": 2.0, "grad_norm": 8.93451976776123, "learning_rate": 4.1431253547570184e-05, "loss": 1.2701, "step": 6470 }, { "epoch": 2.0, "grad_norm": 20.85499382019043, "learning_rate": 4.1409422346417506e-05, "loss": 1.3616, "step": 6480 }, { "epoch": 2.01, "grad_norm": 7.519311428070068, "learning_rate": 4.1387591145264814e-05, "loss": 1.3379, "step": 6490 }, { "epoch": 2.01, "grad_norm": 9.72396183013916, "learning_rate": 4.136575994411212e-05, "loss": 0.9993, "step": 6500 }, { "epoch": 2.01, "grad_norm": 5.450109958648682, "learning_rate": 4.134392874295944e-05, "loss": 1.4177, "step": 6510 }, { "epoch": 2.01, "grad_norm": 12.376193046569824, "learning_rate": 4.132209754180675e-05, "loss": 1.2318, "step": 6520 }, { "epoch": 2.01, "grad_norm": 8.414650917053223, "learning_rate": 4.130026634065406e-05, "loss": 1.0834, "step": 6530 }, { "epoch": 2.01, "grad_norm": 8.144619941711426, "learning_rate": 4.1278435139501376e-05, "loss": 1.1219, "step": 6540 }, { "epoch": 2.01, "grad_norm": 9.703134536743164, "learning_rate": 4.125660393834869e-05, "loss": 1.3134, "step": 6550 }, { "epoch": 2.01, "grad_norm": 8.901161193847656, "learning_rate": 4.123477273719601e-05, "loss": 1.2709, "step": 6560 }, { "epoch": 2.01, "grad_norm": 12.2059326171875, "learning_rate": 4.1212941536043315e-05, "loss": 1.4683, "step": 6570 }, { "epoch": 2.01, "grad_norm": 13.865036964416504, "learning_rate": 4.1191110334890624e-05, "loss": 1.0404, "step": 6580 }, { "epoch": 2.01, "grad_norm": 6.864190578460693, "learning_rate": 4.1169279133737946e-05, "loss": 0.9905, "step": 6590 }, { "epoch": 2.01, "grad_norm": 5.386161804199219, "learning_rate": 4.1147447932585254e-05, "loss": 1.2465, "step": 6600 }, { "epoch": 2.01, "grad_norm": 13.625713348388672, "learning_rate": 4.112561673143256e-05, "loss": 1.0173, "step": 6610 }, { "epoch": 2.01, "grad_norm": 7.799843788146973, "learning_rate": 4.110378553027988e-05, "loss": 1.442, "step": 6620 }, { "epoch": 2.01, "grad_norm": 5.630618572235107, "learning_rate": 4.108195432912719e-05, "loss": 1.1965, "step": 6630 }, { "epoch": 2.01, "grad_norm": 11.694421768188477, "learning_rate": 4.10601231279745e-05, "loss": 1.5661, "step": 6640 }, { "epoch": 2.01, "grad_norm": 16.32294273376465, "learning_rate": 4.1038291926821816e-05, "loss": 1.3501, "step": 6650 }, { "epoch": 2.01, "grad_norm": 7.672978401184082, "learning_rate": 4.1016460725669125e-05, "loss": 1.0422, "step": 6660 }, { "epoch": 2.01, "grad_norm": 9.904041290283203, "learning_rate": 4.099462952451644e-05, "loss": 1.4158, "step": 6670 }, { "epoch": 2.01, "grad_norm": 6.419857978820801, "learning_rate": 4.0972798323363755e-05, "loss": 0.9251, "step": 6680 }, { "epoch": 2.01, "grad_norm": 3.4371767044067383, "learning_rate": 4.095096712221106e-05, "loss": 1.2682, "step": 6690 }, { "epoch": 2.01, "grad_norm": 11.325343132019043, "learning_rate": 4.092913592105838e-05, "loss": 1.702, "step": 6700 }, { "epoch": 2.01, "grad_norm": 8.532791137695312, "learning_rate": 4.0907304719905694e-05, "loss": 1.5305, "step": 6710 }, { "epoch": 2.01, "grad_norm": 15.458795547485352, "learning_rate": 4.0885473518753e-05, "loss": 2.0259, "step": 6720 }, { "epoch": 2.01, "grad_norm": 10.888657569885254, "learning_rate": 4.086364231760032e-05, "loss": 1.4615, "step": 6730 }, { "epoch": 2.01, "grad_norm": 11.061561584472656, "learning_rate": 4.084181111644763e-05, "loss": 1.0946, "step": 6740 }, { "epoch": 2.02, "grad_norm": 5.369321823120117, "learning_rate": 4.081997991529494e-05, "loss": 1.2325, "step": 6750 }, { "epoch": 2.02, "grad_norm": 15.661127090454102, "learning_rate": 4.0798148714142256e-05, "loss": 1.2741, "step": 6760 }, { "epoch": 2.02, "grad_norm": 7.850557804107666, "learning_rate": 4.0776317512989564e-05, "loss": 1.4497, "step": 6770 }, { "epoch": 2.02, "grad_norm": 14.3179349899292, "learning_rate": 4.075448631183688e-05, "loss": 1.0526, "step": 6780 }, { "epoch": 2.02, "grad_norm": 12.123126983642578, "learning_rate": 4.0732655110684195e-05, "loss": 1.4888, "step": 6790 }, { "epoch": 2.02, "grad_norm": 8.910035133361816, "learning_rate": 4.07108239095315e-05, "loss": 1.2083, "step": 6800 }, { "epoch": 2.02, "grad_norm": 6.023748874664307, "learning_rate": 4.068899270837881e-05, "loss": 1.4507, "step": 6810 }, { "epoch": 2.02, "grad_norm": 11.782119750976562, "learning_rate": 4.0667161507226134e-05, "loss": 1.6523, "step": 6820 }, { "epoch": 2.02, "grad_norm": 4.899501800537109, "learning_rate": 4.064533030607344e-05, "loss": 1.2879, "step": 6830 }, { "epoch": 2.02, "grad_norm": 6.889960289001465, "learning_rate": 4.062349910492075e-05, "loss": 1.6222, "step": 6840 }, { "epoch": 2.02, "grad_norm": 9.870460510253906, "learning_rate": 4.0601667903768065e-05, "loss": 1.4467, "step": 6850 }, { "epoch": 2.02, "grad_norm": 12.333667755126953, "learning_rate": 4.057983670261538e-05, "loss": 1.4007, "step": 6860 }, { "epoch": 2.02, "grad_norm": 13.394468307495117, "learning_rate": 4.0558005501462696e-05, "loss": 0.9144, "step": 6870 }, { "epoch": 2.02, "grad_norm": 7.482660293579102, "learning_rate": 4.0536174300310004e-05, "loss": 1.4531, "step": 6880 }, { "epoch": 2.02, "grad_norm": 19.7496337890625, "learning_rate": 4.051434309915732e-05, "loss": 1.4494, "step": 6890 }, { "epoch": 2.02, "grad_norm": 4.039620876312256, "learning_rate": 4.0492511898004635e-05, "loss": 1.4626, "step": 6900 }, { "epoch": 2.02, "grad_norm": 10.677824020385742, "learning_rate": 4.047068069685194e-05, "loss": 1.2848, "step": 6910 }, { "epoch": 2.02, "grad_norm": 16.838600158691406, "learning_rate": 4.044884949569925e-05, "loss": 1.4759, "step": 6920 }, { "epoch": 2.02, "grad_norm": 11.80189323425293, "learning_rate": 4.042701829454657e-05, "loss": 1.5892, "step": 6930 }, { "epoch": 2.02, "grad_norm": 10.882355690002441, "learning_rate": 4.040518709339388e-05, "loss": 1.1056, "step": 6940 }, { "epoch": 2.02, "grad_norm": 10.496129989624023, "learning_rate": 4.038335589224119e-05, "loss": 1.2585, "step": 6950 }, { "epoch": 2.02, "grad_norm": 5.9408369064331055, "learning_rate": 4.0361524691088505e-05, "loss": 0.9989, "step": 6960 }, { "epoch": 2.02, "grad_norm": 12.555964469909668, "learning_rate": 4.033969348993582e-05, "loss": 1.3959, "step": 6970 }, { "epoch": 2.02, "grad_norm": 5.582096576690674, "learning_rate": 4.031786228878313e-05, "loss": 1.2151, "step": 6980 }, { "epoch": 2.02, "grad_norm": 11.457969665527344, "learning_rate": 4.0296031087630444e-05, "loss": 1.2285, "step": 6990 }, { "epoch": 2.03, "grad_norm": 6.421438694000244, "learning_rate": 4.027419988647775e-05, "loss": 1.2213, "step": 7000 }, { "epoch": 2.03, "grad_norm": 7.338561058044434, "learning_rate": 4.025236868532507e-05, "loss": 1.2924, "step": 7010 }, { "epoch": 2.03, "grad_norm": 8.555256843566895, "learning_rate": 4.023053748417238e-05, "loss": 1.1914, "step": 7020 }, { "epoch": 2.03, "grad_norm": 9.488926887512207, "learning_rate": 4.020870628301969e-05, "loss": 1.0308, "step": 7030 }, { "epoch": 2.03, "grad_norm": 7.274324417114258, "learning_rate": 4.0186875081867006e-05, "loss": 1.6781, "step": 7040 }, { "epoch": 2.03, "grad_norm": 7.207624912261963, "learning_rate": 4.016504388071432e-05, "loss": 0.8851, "step": 7050 }, { "epoch": 2.03, "grad_norm": 11.121164321899414, "learning_rate": 4.014321267956163e-05, "loss": 1.554, "step": 7060 }, { "epoch": 2.03, "grad_norm": 10.630584716796875, "learning_rate": 4.0121381478408945e-05, "loss": 1.5054, "step": 7070 }, { "epoch": 2.03, "grad_norm": 15.908056259155273, "learning_rate": 4.009955027725626e-05, "loss": 1.4411, "step": 7080 }, { "epoch": 2.03, "grad_norm": 11.93024730682373, "learning_rate": 4.007771907610357e-05, "loss": 1.4389, "step": 7090 }, { "epoch": 2.03, "grad_norm": 8.511980056762695, "learning_rate": 4.0055887874950884e-05, "loss": 1.7509, "step": 7100 }, { "epoch": 2.03, "grad_norm": 13.465594291687012, "learning_rate": 4.003405667379819e-05, "loss": 1.1475, "step": 7110 }, { "epoch": 2.03, "grad_norm": 6.573078632354736, "learning_rate": 4.001222547264551e-05, "loss": 1.0862, "step": 7120 }, { "epoch": 2.03, "grad_norm": 6.822663307189941, "learning_rate": 3.999039427149282e-05, "loss": 1.0826, "step": 7130 }, { "epoch": 2.03, "grad_norm": 7.63055419921875, "learning_rate": 3.996856307034013e-05, "loss": 1.088, "step": 7140 }, { "epoch": 2.03, "grad_norm": 19.246923446655273, "learning_rate": 3.994673186918744e-05, "loss": 1.0296, "step": 7150 }, { "epoch": 2.03, "grad_norm": 5.085113525390625, "learning_rate": 3.992490066803476e-05, "loss": 1.3802, "step": 7160 }, { "epoch": 2.03, "grad_norm": 7.012568473815918, "learning_rate": 3.990306946688207e-05, "loss": 1.1053, "step": 7170 }, { "epoch": 2.03, "grad_norm": 18.401243209838867, "learning_rate": 3.9881238265729385e-05, "loss": 1.0645, "step": 7180 }, { "epoch": 2.03, "grad_norm": 15.425987243652344, "learning_rate": 3.985940706457669e-05, "loss": 1.096, "step": 7190 }, { "epoch": 2.03, "grad_norm": 4.270368576049805, "learning_rate": 3.983757586342401e-05, "loss": 1.1733, "step": 7200 }, { "epoch": 2.03, "grad_norm": 7.060866355895996, "learning_rate": 3.9815744662271324e-05, "loss": 1.3791, "step": 7210 }, { "epoch": 2.03, "grad_norm": 8.921182632446289, "learning_rate": 3.979391346111863e-05, "loss": 1.2632, "step": 7220 }, { "epoch": 2.03, "grad_norm": 9.632451057434082, "learning_rate": 3.977208225996594e-05, "loss": 1.0966, "step": 7230 }, { "epoch": 2.03, "grad_norm": 7.126803398132324, "learning_rate": 3.975025105881326e-05, "loss": 1.6604, "step": 7240 }, { "epoch": 2.03, "grad_norm": 8.455897331237793, "learning_rate": 3.972841985766057e-05, "loss": 1.2767, "step": 7250 }, { "epoch": 2.04, "grad_norm": 6.303521633148193, "learning_rate": 3.970658865650788e-05, "loss": 0.8061, "step": 7260 }, { "epoch": 2.04, "grad_norm": 7.39023494720459, "learning_rate": 3.96847574553552e-05, "loss": 1.1342, "step": 7270 }, { "epoch": 2.04, "grad_norm": 14.367632865905762, "learning_rate": 3.966292625420251e-05, "loss": 1.4532, "step": 7280 }, { "epoch": 2.04, "grad_norm": 10.540842056274414, "learning_rate": 3.964109505304982e-05, "loss": 1.1075, "step": 7290 }, { "epoch": 2.04, "grad_norm": 9.620889663696289, "learning_rate": 3.961926385189713e-05, "loss": 1.1672, "step": 7300 }, { "epoch": 2.04, "grad_norm": 10.473124504089355, "learning_rate": 3.959743265074445e-05, "loss": 1.3278, "step": 7310 }, { "epoch": 2.04, "grad_norm": 10.144049644470215, "learning_rate": 3.9575601449591757e-05, "loss": 1.3279, "step": 7320 }, { "epoch": 2.04, "grad_norm": 8.136828422546387, "learning_rate": 3.955377024843907e-05, "loss": 1.5342, "step": 7330 }, { "epoch": 2.04, "grad_norm": 8.58341121673584, "learning_rate": 3.953193904728638e-05, "loss": 1.0539, "step": 7340 }, { "epoch": 2.04, "grad_norm": 14.833088874816895, "learning_rate": 3.9510107846133695e-05, "loss": 1.4279, "step": 7350 }, { "epoch": 2.04, "grad_norm": 8.232218742370605, "learning_rate": 3.948827664498101e-05, "loss": 1.2912, "step": 7360 }, { "epoch": 2.04, "grad_norm": 5.843240261077881, "learning_rate": 3.946644544382832e-05, "loss": 1.354, "step": 7370 }, { "epoch": 2.04, "grad_norm": 7.985373497009277, "learning_rate": 3.9444614242675634e-05, "loss": 1.3933, "step": 7380 }, { "epoch": 2.04, "grad_norm": 17.41598892211914, "learning_rate": 3.942278304152295e-05, "loss": 1.2268, "step": 7390 }, { "epoch": 2.04, "grad_norm": 14.413007736206055, "learning_rate": 3.940095184037026e-05, "loss": 1.9318, "step": 7400 }, { "epoch": 2.04, "grad_norm": 9.484841346740723, "learning_rate": 3.937912063921757e-05, "loss": 1.6003, "step": 7410 }, { "epoch": 2.04, "grad_norm": 8.916605949401855, "learning_rate": 3.935728943806488e-05, "loss": 1.0038, "step": 7420 }, { "epoch": 2.04, "grad_norm": 16.42214584350586, "learning_rate": 3.9335458236912196e-05, "loss": 0.8855, "step": 7430 }, { "epoch": 2.04, "grad_norm": 9.259830474853516, "learning_rate": 3.931362703575951e-05, "loss": 1.2926, "step": 7440 }, { "epoch": 2.04, "grad_norm": 10.413721084594727, "learning_rate": 3.929179583460682e-05, "loss": 1.8284, "step": 7450 }, { "epoch": 2.04, "grad_norm": 6.733238697052002, "learning_rate": 3.9269964633454135e-05, "loss": 0.8318, "step": 7460 }, { "epoch": 2.04, "grad_norm": 8.42292594909668, "learning_rate": 3.924813343230145e-05, "loss": 0.8966, "step": 7470 }, { "epoch": 2.04, "grad_norm": 13.229130744934082, "learning_rate": 3.922630223114876e-05, "loss": 0.9965, "step": 7480 }, { "epoch": 2.04, "grad_norm": 8.147464752197266, "learning_rate": 3.9204471029996074e-05, "loss": 1.1644, "step": 7490 }, { "epoch": 2.04, "grad_norm": 10.038113594055176, "learning_rate": 3.918263982884339e-05, "loss": 1.4682, "step": 7500 }, { "epoch": 2.05, "grad_norm": 7.785242557525635, "learning_rate": 3.91608086276907e-05, "loss": 0.86, "step": 7510 }, { "epoch": 2.05, "grad_norm": 7.201658725738525, "learning_rate": 3.913897742653801e-05, "loss": 1.102, "step": 7520 }, { "epoch": 2.05, "grad_norm": 12.15894603729248, "learning_rate": 3.911714622538532e-05, "loss": 1.8251, "step": 7530 }, { "epoch": 2.05, "grad_norm": 7.207378387451172, "learning_rate": 3.9095315024232636e-05, "loss": 1.588, "step": 7540 }, { "epoch": 2.05, "grad_norm": 5.362427711486816, "learning_rate": 3.907348382307995e-05, "loss": 1.1987, "step": 7550 }, { "epoch": 2.05, "grad_norm": 1.8747584819793701, "learning_rate": 3.905165262192726e-05, "loss": 1.5552, "step": 7560 }, { "epoch": 2.05, "grad_norm": 7.360047340393066, "learning_rate": 3.902982142077457e-05, "loss": 1.7228, "step": 7570 }, { "epoch": 2.05, "grad_norm": 6.378320217132568, "learning_rate": 3.900799021962189e-05, "loss": 1.451, "step": 7580 }, { "epoch": 2.05, "grad_norm": 6.066529273986816, "learning_rate": 3.89861590184692e-05, "loss": 1.2918, "step": 7590 }, { "epoch": 2.05, "grad_norm": 18.33747673034668, "learning_rate": 3.896432781731651e-05, "loss": 0.9071, "step": 7600 }, { "epoch": 2.05, "grad_norm": 7.380865097045898, "learning_rate": 3.894249661616382e-05, "loss": 1.1914, "step": 7610 }, { "epoch": 2.05, "grad_norm": 11.81452751159668, "learning_rate": 3.892066541501114e-05, "loss": 0.8755, "step": 7620 }, { "epoch": 2.05, "grad_norm": 5.44549036026001, "learning_rate": 3.8898834213858446e-05, "loss": 1.3354, "step": 7630 }, { "epoch": 2.05, "grad_norm": 8.32117748260498, "learning_rate": 3.887700301270576e-05, "loss": 1.6619, "step": 7640 }, { "epoch": 2.05, "grad_norm": 5.560888290405273, "learning_rate": 3.8855171811553076e-05, "loss": 1.1239, "step": 7650 }, { "epoch": 2.05, "grad_norm": 4.413572311401367, "learning_rate": 3.8833340610400384e-05, "loss": 1.0453, "step": 7660 }, { "epoch": 2.05, "grad_norm": 9.715067863464355, "learning_rate": 3.88115094092477e-05, "loss": 1.8147, "step": 7670 }, { "epoch": 2.05, "grad_norm": 7.3825764656066895, "learning_rate": 3.878967820809501e-05, "loss": 1.2615, "step": 7680 }, { "epoch": 2.05, "grad_norm": 9.961684226989746, "learning_rate": 3.876784700694232e-05, "loss": 1.383, "step": 7690 }, { "epoch": 2.05, "grad_norm": 6.429656982421875, "learning_rate": 3.874601580578964e-05, "loss": 1.3407, "step": 7700 }, { "epoch": 2.05, "grad_norm": 6.283732891082764, "learning_rate": 3.8724184604636947e-05, "loss": 1.4077, "step": 7710 }, { "epoch": 2.05, "grad_norm": 8.831610679626465, "learning_rate": 3.870235340348426e-05, "loss": 1.7205, "step": 7720 }, { "epoch": 2.05, "grad_norm": 9.852862358093262, "learning_rate": 3.868052220233158e-05, "loss": 1.5552, "step": 7730 }, { "epoch": 2.05, "grad_norm": 14.114699363708496, "learning_rate": 3.8658691001178885e-05, "loss": 1.1707, "step": 7740 }, { "epoch": 2.05, "grad_norm": 9.571085929870605, "learning_rate": 3.86368598000262e-05, "loss": 0.6445, "step": 7750 }, { "epoch": 2.05, "grad_norm": 4.617367267608643, "learning_rate": 3.861502859887351e-05, "loss": 1.4049, "step": 7760 }, { "epoch": 2.06, "grad_norm": 1.5643693208694458, "learning_rate": 3.8593197397720824e-05, "loss": 1.0544, "step": 7770 }, { "epoch": 2.06, "grad_norm": 12.363773345947266, "learning_rate": 3.857136619656814e-05, "loss": 1.3698, "step": 7780 }, { "epoch": 2.06, "grad_norm": 6.082370758056641, "learning_rate": 3.854953499541545e-05, "loss": 1.5998, "step": 7790 }, { "epoch": 2.06, "grad_norm": 8.809284210205078, "learning_rate": 3.852770379426276e-05, "loss": 1.9381, "step": 7800 }, { "epoch": 2.06, "grad_norm": 6.093570232391357, "learning_rate": 3.850587259311008e-05, "loss": 1.3862, "step": 7810 }, { "epoch": 2.06, "grad_norm": 5.605815887451172, "learning_rate": 3.8484041391957386e-05, "loss": 1.4225, "step": 7820 }, { "epoch": 2.06, "grad_norm": 12.397106170654297, "learning_rate": 3.84622101908047e-05, "loss": 1.3421, "step": 7830 }, { "epoch": 2.06, "grad_norm": 13.268582344055176, "learning_rate": 3.844037898965202e-05, "loss": 1.7036, "step": 7840 }, { "epoch": 2.06, "grad_norm": 8.59499740600586, "learning_rate": 3.8418547788499325e-05, "loss": 1.1617, "step": 7850 }, { "epoch": 2.06, "grad_norm": 10.697739601135254, "learning_rate": 3.839671658734664e-05, "loss": 1.3551, "step": 7860 }, { "epoch": 2.06, "grad_norm": 5.254342555999756, "learning_rate": 3.837488538619395e-05, "loss": 1.1425, "step": 7870 }, { "epoch": 2.06, "grad_norm": 9.56226921081543, "learning_rate": 3.8353054185041264e-05, "loss": 0.9719, "step": 7880 }, { "epoch": 2.06, "grad_norm": 9.353610038757324, "learning_rate": 3.833122298388858e-05, "loss": 1.4326, "step": 7890 }, { "epoch": 2.06, "grad_norm": 11.42126750946045, "learning_rate": 3.830939178273589e-05, "loss": 1.2203, "step": 7900 }, { "epoch": 2.06, "grad_norm": 5.809771537780762, "learning_rate": 3.8287560581583196e-05, "loss": 1.2697, "step": 7910 }, { "epoch": 2.06, "grad_norm": 13.945771217346191, "learning_rate": 3.826572938043052e-05, "loss": 1.2922, "step": 7920 }, { "epoch": 2.06, "grad_norm": 7.608748435974121, "learning_rate": 3.8243898179277826e-05, "loss": 1.4886, "step": 7930 }, { "epoch": 2.06, "grad_norm": 15.62812328338623, "learning_rate": 3.8222066978125135e-05, "loss": 1.2474, "step": 7940 }, { "epoch": 2.06, "grad_norm": 8.133816719055176, "learning_rate": 3.820023577697245e-05, "loss": 1.2306, "step": 7950 }, { "epoch": 2.06, "grad_norm": 11.07603645324707, "learning_rate": 3.8178404575819765e-05, "loss": 1.0476, "step": 7960 }, { "epoch": 2.06, "grad_norm": 5.324217796325684, "learning_rate": 3.815657337466707e-05, "loss": 1.058, "step": 7970 }, { "epoch": 2.06, "grad_norm": 12.429716110229492, "learning_rate": 3.813474217351439e-05, "loss": 1.0418, "step": 7980 }, { "epoch": 2.06, "grad_norm": 9.462568283081055, "learning_rate": 3.8112910972361704e-05, "loss": 1.2944, "step": 7990 }, { "epoch": 2.06, "grad_norm": 8.181110382080078, "learning_rate": 3.809107977120901e-05, "loss": 1.2121, "step": 8000 }, { "epoch": 2.06, "grad_norm": 10.78189468383789, "learning_rate": 3.806924857005633e-05, "loss": 1.4074, "step": 8010 }, { "epoch": 2.07, "grad_norm": 17.104066848754883, "learning_rate": 3.8047417368903636e-05, "loss": 1.4157, "step": 8020 }, { "epoch": 2.07, "grad_norm": 6.317647457122803, "learning_rate": 3.802558616775095e-05, "loss": 1.4805, "step": 8030 }, { "epoch": 2.07, "grad_norm": 8.186767578125, "learning_rate": 3.8003754966598266e-05, "loss": 1.0648, "step": 8040 }, { "epoch": 2.07, "grad_norm": 5.564696788787842, "learning_rate": 3.7981923765445574e-05, "loss": 1.0217, "step": 8050 }, { "epoch": 2.07, "grad_norm": 7.463255882263184, "learning_rate": 3.796009256429289e-05, "loss": 1.4861, "step": 8060 }, { "epoch": 2.07, "grad_norm": 11.142265319824219, "learning_rate": 3.7938261363140205e-05, "loss": 1.5016, "step": 8070 }, { "epoch": 2.07, "grad_norm": 8.285846710205078, "learning_rate": 3.791643016198751e-05, "loss": 1.4516, "step": 8080 }, { "epoch": 2.07, "grad_norm": 9.068915367126465, "learning_rate": 3.789459896083483e-05, "loss": 1.3137, "step": 8090 }, { "epoch": 2.07, "grad_norm": 6.806003093719482, "learning_rate": 3.7872767759682137e-05, "loss": 1.1694, "step": 8100 }, { "epoch": 2.07, "grad_norm": 6.360131740570068, "learning_rate": 3.785093655852945e-05, "loss": 1.2451, "step": 8110 }, { "epoch": 2.07, "grad_norm": 13.665157318115234, "learning_rate": 3.782910535737677e-05, "loss": 1.5621, "step": 8120 }, { "epoch": 2.07, "grad_norm": 8.61514663696289, "learning_rate": 3.7807274156224075e-05, "loss": 1.3883, "step": 8130 }, { "epoch": 2.07, "grad_norm": 10.076803207397461, "learning_rate": 3.778544295507139e-05, "loss": 1.5984, "step": 8140 }, { "epoch": 2.07, "grad_norm": 11.513164520263672, "learning_rate": 3.7763611753918706e-05, "loss": 1.1568, "step": 8150 }, { "epoch": 2.07, "grad_norm": 10.96152114868164, "learning_rate": 3.7741780552766014e-05, "loss": 1.2679, "step": 8160 }, { "epoch": 2.07, "grad_norm": 11.095200538635254, "learning_rate": 3.771994935161333e-05, "loss": 1.3044, "step": 8170 }, { "epoch": 2.07, "grad_norm": 11.137669563293457, "learning_rate": 3.7698118150460644e-05, "loss": 1.6338, "step": 8180 }, { "epoch": 2.07, "grad_norm": 10.673565864562988, "learning_rate": 3.767628694930795e-05, "loss": 1.04, "step": 8190 }, { "epoch": 2.07, "grad_norm": 12.882896423339844, "learning_rate": 3.765445574815527e-05, "loss": 1.2069, "step": 8200 }, { "epoch": 2.07, "grad_norm": 12.909979820251465, "learning_rate": 3.7632624547002576e-05, "loss": 1.4069, "step": 8210 }, { "epoch": 2.07, "grad_norm": 9.905861854553223, "learning_rate": 3.761079334584989e-05, "loss": 1.0378, "step": 8220 }, { "epoch": 2.07, "grad_norm": 12.227041244506836, "learning_rate": 3.758896214469721e-05, "loss": 1.9366, "step": 8230 }, { "epoch": 2.07, "grad_norm": 16.074005126953125, "learning_rate": 3.7567130943544515e-05, "loss": 1.324, "step": 8240 }, { "epoch": 2.07, "grad_norm": 15.497346878051758, "learning_rate": 3.7545299742391823e-05, "loss": 1.3117, "step": 8250 }, { "epoch": 2.07, "grad_norm": 3.61712384223938, "learning_rate": 3.7523468541239145e-05, "loss": 0.9081, "step": 8260 }, { "epoch": 2.07, "grad_norm": 6.043015003204346, "learning_rate": 3.7501637340086454e-05, "loss": 1.3697, "step": 8270 }, { "epoch": 2.08, "grad_norm": 1.494061827659607, "learning_rate": 3.747980613893376e-05, "loss": 1.0113, "step": 8280 }, { "epoch": 2.08, "grad_norm": 10.294641494750977, "learning_rate": 3.745797493778108e-05, "loss": 1.1916, "step": 8290 }, { "epoch": 2.08, "grad_norm": 11.770963668823242, "learning_rate": 3.743614373662839e-05, "loss": 1.7289, "step": 8300 }, { "epoch": 2.08, "grad_norm": 11.508599281311035, "learning_rate": 3.74143125354757e-05, "loss": 1.532, "step": 8310 }, { "epoch": 2.08, "grad_norm": 5.5095906257629395, "learning_rate": 3.7392481334323016e-05, "loss": 1.5726, "step": 8320 }, { "epoch": 2.08, "grad_norm": 8.906387329101562, "learning_rate": 3.7370650133170325e-05, "loss": 1.872, "step": 8330 }, { "epoch": 2.08, "grad_norm": 13.537530899047852, "learning_rate": 3.734881893201764e-05, "loss": 1.4783, "step": 8340 }, { "epoch": 2.08, "grad_norm": 5.542272090911865, "learning_rate": 3.7326987730864955e-05, "loss": 0.9067, "step": 8350 }, { "epoch": 2.08, "grad_norm": 13.40847396850586, "learning_rate": 3.730515652971226e-05, "loss": 1.4513, "step": 8360 }, { "epoch": 2.08, "grad_norm": 6.104247570037842, "learning_rate": 3.728332532855958e-05, "loss": 1.4018, "step": 8370 }, { "epoch": 2.08, "grad_norm": 3.081935167312622, "learning_rate": 3.7261494127406894e-05, "loss": 1.3387, "step": 8380 }, { "epoch": 2.08, "grad_norm": 5.6001410484313965, "learning_rate": 3.72396629262542e-05, "loss": 1.1357, "step": 8390 }, { "epoch": 2.08, "grad_norm": 9.477627754211426, "learning_rate": 3.721783172510152e-05, "loss": 0.9248, "step": 8400 }, { "epoch": 2.08, "grad_norm": 10.714771270751953, "learning_rate": 3.719600052394883e-05, "loss": 1.0914, "step": 8410 }, { "epoch": 2.08, "grad_norm": 8.806195259094238, "learning_rate": 3.717416932279614e-05, "loss": 1.4307, "step": 8420 }, { "epoch": 2.08, "grad_norm": 7.881807327270508, "learning_rate": 3.7152338121643456e-05, "loss": 1.5862, "step": 8430 }, { "epoch": 2.08, "grad_norm": 9.756494522094727, "learning_rate": 3.7130506920490764e-05, "loss": 0.8593, "step": 8440 }, { "epoch": 2.08, "grad_norm": 11.778931617736816, "learning_rate": 3.710867571933808e-05, "loss": 1.0078, "step": 8450 }, { "epoch": 2.08, "grad_norm": 19.07478904724121, "learning_rate": 3.7086844518185395e-05, "loss": 1.4918, "step": 8460 }, { "epoch": 2.08, "grad_norm": 7.192957401275635, "learning_rate": 3.70650133170327e-05, "loss": 1.4567, "step": 8470 }, { "epoch": 2.08, "grad_norm": 12.366439819335938, "learning_rate": 3.704318211588002e-05, "loss": 1.7597, "step": 8480 }, { "epoch": 2.08, "grad_norm": 7.855134963989258, "learning_rate": 3.7021350914727333e-05, "loss": 1.3408, "step": 8490 }, { "epoch": 2.08, "grad_norm": 8.647002220153809, "learning_rate": 3.699951971357464e-05, "loss": 1.317, "step": 8500 }, { "epoch": 2.08, "grad_norm": 17.394466400146484, "learning_rate": 3.697768851242196e-05, "loss": 0.9509, "step": 8510 }, { "epoch": 2.08, "grad_norm": 5.812830924987793, "learning_rate": 3.6955857311269265e-05, "loss": 0.9451, "step": 8520 }, { "epoch": 2.09, "grad_norm": 3.620662212371826, "learning_rate": 3.693402611011658e-05, "loss": 0.9367, "step": 8530 }, { "epoch": 2.09, "grad_norm": 10.696394920349121, "learning_rate": 3.6912194908963896e-05, "loss": 1.1111, "step": 8540 }, { "epoch": 2.09, "grad_norm": 6.507318496704102, "learning_rate": 3.6890363707811204e-05, "loss": 1.5801, "step": 8550 }, { "epoch": 2.09, "grad_norm": 1.3577836751937866, "learning_rate": 3.686853250665852e-05, "loss": 0.9715, "step": 8560 }, { "epoch": 2.09, "grad_norm": 9.392326354980469, "learning_rate": 3.6846701305505834e-05, "loss": 1.3546, "step": 8570 }, { "epoch": 2.09, "grad_norm": 7.389379978179932, "learning_rate": 3.682487010435314e-05, "loss": 1.4263, "step": 8580 }, { "epoch": 2.09, "grad_norm": 12.445832252502441, "learning_rate": 3.680303890320045e-05, "loss": 1.2927, "step": 8590 }, { "epoch": 2.09, "grad_norm": 7.945863246917725, "learning_rate": 3.678120770204777e-05, "loss": 1.347, "step": 8600 }, { "epoch": 2.09, "grad_norm": 11.42392635345459, "learning_rate": 3.675937650089508e-05, "loss": 1.2931, "step": 8610 }, { "epoch": 2.09, "grad_norm": 8.835967063903809, "learning_rate": 3.673754529974239e-05, "loss": 1.7067, "step": 8620 }, { "epoch": 2.09, "grad_norm": 11.03829288482666, "learning_rate": 3.6715714098589705e-05, "loss": 1.1344, "step": 8630 }, { "epoch": 2.09, "grad_norm": 8.060691833496094, "learning_rate": 3.669388289743702e-05, "loss": 1.1783, "step": 8640 }, { "epoch": 2.09, "grad_norm": 8.403630256652832, "learning_rate": 3.667205169628433e-05, "loss": 1.2756, "step": 8650 }, { "epoch": 2.09, "grad_norm": 6.985191345214844, "learning_rate": 3.6650220495131644e-05, "loss": 1.3376, "step": 8660 }, { "epoch": 2.09, "grad_norm": 10.588379859924316, "learning_rate": 3.662838929397895e-05, "loss": 0.9488, "step": 8670 }, { "epoch": 2.09, "grad_norm": 9.86176586151123, "learning_rate": 3.660655809282627e-05, "loss": 1.4349, "step": 8680 }, { "epoch": 2.09, "grad_norm": 10.589521408081055, "learning_rate": 3.658472689167358e-05, "loss": 0.9541, "step": 8690 }, { "epoch": 2.09, "grad_norm": 4.302985191345215, "learning_rate": 3.656289569052089e-05, "loss": 1.0201, "step": 8700 }, { "epoch": 2.09, "grad_norm": 9.92156982421875, "learning_rate": 3.6541064489368206e-05, "loss": 1.3658, "step": 8710 }, { "epoch": 2.09, "grad_norm": 11.065665245056152, "learning_rate": 3.651923328821552e-05, "loss": 0.7676, "step": 8720 }, { "epoch": 2.09, "grad_norm": 11.673006057739258, "learning_rate": 3.649740208706283e-05, "loss": 1.0899, "step": 8730 }, { "epoch": 2.09, "grad_norm": 2.9989564418792725, "learning_rate": 3.6475570885910145e-05, "loss": 1.1931, "step": 8740 }, { "epoch": 2.09, "grad_norm": 12.47274112701416, "learning_rate": 3.645373968475746e-05, "loss": 1.3796, "step": 8750 }, { "epoch": 2.09, "grad_norm": 9.17037296295166, "learning_rate": 3.643190848360477e-05, "loss": 1.5027, "step": 8760 }, { "epoch": 2.09, "grad_norm": 6.050634384155273, "learning_rate": 3.6410077282452084e-05, "loss": 1.172, "step": 8770 }, { "epoch": 2.1, "grad_norm": 5.867905139923096, "learning_rate": 3.638824608129939e-05, "loss": 1.5851, "step": 8780 }, { "epoch": 2.1, "grad_norm": 7.001101493835449, "learning_rate": 3.636641488014671e-05, "loss": 0.9081, "step": 8790 }, { "epoch": 2.1, "grad_norm": 3.2297420501708984, "learning_rate": 3.634458367899402e-05, "loss": 1.105, "step": 8800 }, { "epoch": 2.1, "grad_norm": 15.62044620513916, "learning_rate": 3.632275247784133e-05, "loss": 1.0057, "step": 8810 }, { "epoch": 2.1, "grad_norm": 5.033325672149658, "learning_rate": 3.6300921276688646e-05, "loss": 0.7844, "step": 8820 }, { "epoch": 2.1, "grad_norm": 2.689194440841675, "learning_rate": 3.627909007553596e-05, "loss": 1.1229, "step": 8830 }, { "epoch": 2.1, "grad_norm": 2.9724693298339844, "learning_rate": 3.625725887438327e-05, "loss": 0.9214, "step": 8840 }, { "epoch": 2.1, "grad_norm": 11.626296043395996, "learning_rate": 3.6235427673230585e-05, "loss": 1.7377, "step": 8850 }, { "epoch": 2.1, "grad_norm": 6.832793235778809, "learning_rate": 3.621359647207789e-05, "loss": 1.0135, "step": 8860 }, { "epoch": 2.1, "grad_norm": 5.191517353057861, "learning_rate": 3.619176527092521e-05, "loss": 0.8074, "step": 8870 }, { "epoch": 2.1, "grad_norm": 7.160601615905762, "learning_rate": 3.6169934069772523e-05, "loss": 1.2109, "step": 8880 }, { "epoch": 2.1, "grad_norm": 8.562121391296387, "learning_rate": 3.614810286861983e-05, "loss": 1.4029, "step": 8890 }, { "epoch": 2.1, "grad_norm": 10.82218074798584, "learning_rate": 3.612627166746714e-05, "loss": 1.4321, "step": 8900 }, { "epoch": 2.1, "grad_norm": 10.444973945617676, "learning_rate": 3.610444046631446e-05, "loss": 1.2913, "step": 8910 }, { "epoch": 2.1, "grad_norm": 8.743501663208008, "learning_rate": 3.608260926516177e-05, "loss": 1.3876, "step": 8920 }, { "epoch": 2.1, "grad_norm": 9.77945327758789, "learning_rate": 3.606077806400908e-05, "loss": 1.2299, "step": 8930 }, { "epoch": 2.1, "grad_norm": 3.2734627723693848, "learning_rate": 3.60389468628564e-05, "loss": 1.1126, "step": 8940 }, { "epoch": 2.1, "grad_norm": 11.210502624511719, "learning_rate": 3.601711566170371e-05, "loss": 0.7819, "step": 8950 }, { "epoch": 2.1, "grad_norm": 11.881025314331055, "learning_rate": 3.599528446055102e-05, "loss": 1.4032, "step": 8960 }, { "epoch": 2.1, "grad_norm": 9.75598430633545, "learning_rate": 3.597345325939833e-05, "loss": 1.2859, "step": 8970 }, { "epoch": 2.1, "grad_norm": 6.536518096923828, "learning_rate": 3.595162205824565e-05, "loss": 1.2656, "step": 8980 }, { "epoch": 2.1, "grad_norm": 10.658870697021484, "learning_rate": 3.5929790857092956e-05, "loss": 1.2869, "step": 8990 }, { "epoch": 2.1, "grad_norm": 9.933408737182617, "learning_rate": 3.590795965594027e-05, "loss": 1.2214, "step": 9000 }, { "epoch": 2.1, "grad_norm": 6.969487190246582, "learning_rate": 3.588612845478758e-05, "loss": 1.3915, "step": 9010 }, { "epoch": 2.1, "grad_norm": 4.11505126953125, "learning_rate": 3.58642972536349e-05, "loss": 0.9495, "step": 9020 }, { "epoch": 2.1, "grad_norm": 10.214959144592285, "learning_rate": 3.584246605248221e-05, "loss": 1.11, "step": 9030 }, { "epoch": 2.11, "grad_norm": 15.322981834411621, "learning_rate": 3.582063485132952e-05, "loss": 1.5727, "step": 9040 }, { "epoch": 2.11, "grad_norm": 5.80056619644165, "learning_rate": 3.5798803650176834e-05, "loss": 1.7465, "step": 9050 }, { "epoch": 2.11, "grad_norm": 9.596943855285645, "learning_rate": 3.577697244902415e-05, "loss": 1.1546, "step": 9060 }, { "epoch": 2.11, "grad_norm": 7.293318748474121, "learning_rate": 3.575514124787146e-05, "loss": 1.061, "step": 9070 }, { "epoch": 2.11, "grad_norm": 6.565110683441162, "learning_rate": 3.573331004671877e-05, "loss": 1.0309, "step": 9080 }, { "epoch": 2.11, "grad_norm": 8.167417526245117, "learning_rate": 3.571147884556608e-05, "loss": 1.193, "step": 9090 }, { "epoch": 2.11, "grad_norm": 13.13125991821289, "learning_rate": 3.5689647644413396e-05, "loss": 1.2656, "step": 9100 }, { "epoch": 2.11, "grad_norm": 10.815147399902344, "learning_rate": 3.566781644326071e-05, "loss": 1.5991, "step": 9110 }, { "epoch": 2.11, "grad_norm": 7.445765018463135, "learning_rate": 3.564598524210802e-05, "loss": 1.2404, "step": 9120 }, { "epoch": 2.11, "grad_norm": 6.982111930847168, "learning_rate": 3.5624154040955335e-05, "loss": 1.1511, "step": 9130 }, { "epoch": 2.11, "grad_norm": 8.957866668701172, "learning_rate": 3.560232283980265e-05, "loss": 1.4067, "step": 9140 }, { "epoch": 2.11, "grad_norm": 7.584774494171143, "learning_rate": 3.558049163864996e-05, "loss": 0.8798, "step": 9150 }, { "epoch": 2.11, "grad_norm": 5.63563346862793, "learning_rate": 3.5558660437497274e-05, "loss": 0.8099, "step": 9160 }, { "epoch": 2.11, "grad_norm": 8.630626678466797, "learning_rate": 3.553682923634459e-05, "loss": 1.407, "step": 9170 }, { "epoch": 2.11, "grad_norm": 9.603389739990234, "learning_rate": 3.55149980351919e-05, "loss": 1.6283, "step": 9180 }, { "epoch": 2.11, "grad_norm": 6.0514349937438965, "learning_rate": 3.549316683403921e-05, "loss": 1.7242, "step": 9190 }, { "epoch": 2.11, "grad_norm": 5.8901047706604, "learning_rate": 3.547133563288652e-05, "loss": 1.3853, "step": 9200 }, { "epoch": 2.11, "grad_norm": 5.180474758148193, "learning_rate": 3.5449504431733836e-05, "loss": 0.9809, "step": 9210 }, { "epoch": 2.11, "grad_norm": 20.491348266601562, "learning_rate": 3.542767323058115e-05, "loss": 1.3426, "step": 9220 }, { "epoch": 2.11, "grad_norm": 20.78603172302246, "learning_rate": 3.540584202942846e-05, "loss": 1.3817, "step": 9230 }, { "epoch": 2.11, "grad_norm": 8.151640892028809, "learning_rate": 3.538401082827577e-05, "loss": 1.5403, "step": 9240 }, { "epoch": 2.11, "grad_norm": 12.207304000854492, "learning_rate": 3.536217962712309e-05, "loss": 1.1712, "step": 9250 }, { "epoch": 2.11, "grad_norm": 8.05256462097168, "learning_rate": 3.53403484259704e-05, "loss": 1.3194, "step": 9260 }, { "epoch": 2.11, "grad_norm": 10.615517616271973, "learning_rate": 3.531851722481771e-05, "loss": 1.1088, "step": 9270 }, { "epoch": 2.11, "grad_norm": 4.074679851531982, "learning_rate": 3.529668602366502e-05, "loss": 1.4418, "step": 9280 }, { "epoch": 2.12, "grad_norm": 9.39974594116211, "learning_rate": 3.527485482251234e-05, "loss": 0.9251, "step": 9290 }, { "epoch": 2.12, "grad_norm": 15.957358360290527, "learning_rate": 3.5253023621359645e-05, "loss": 1.0332, "step": 9300 }, { "epoch": 2.12, "grad_norm": 9.708263397216797, "learning_rate": 3.523119242020696e-05, "loss": 1.1119, "step": 9310 }, { "epoch": 2.12, "grad_norm": 10.333442687988281, "learning_rate": 3.5209361219054276e-05, "loss": 1.1308, "step": 9320 }, { "epoch": 2.12, "grad_norm": 9.877683639526367, "learning_rate": 3.518753001790159e-05, "loss": 1.6372, "step": 9330 }, { "epoch": 2.12, "grad_norm": 6.895038604736328, "learning_rate": 3.51656988167489e-05, "loss": 1.6491, "step": 9340 }, { "epoch": 2.12, "grad_norm": 9.609232902526855, "learning_rate": 3.514386761559621e-05, "loss": 1.1877, "step": 9350 }, { "epoch": 2.12, "grad_norm": 12.127094268798828, "learning_rate": 3.512203641444353e-05, "loss": 1.5809, "step": 9360 }, { "epoch": 2.12, "grad_norm": 10.841419219970703, "learning_rate": 3.510020521329084e-05, "loss": 1.3296, "step": 9370 }, { "epoch": 2.12, "grad_norm": 12.827550888061523, "learning_rate": 3.5078374012138146e-05, "loss": 1.346, "step": 9380 }, { "epoch": 2.12, "grad_norm": 8.358640670776367, "learning_rate": 3.505654281098546e-05, "loss": 1.64, "step": 9390 }, { "epoch": 2.12, "grad_norm": 9.967247009277344, "learning_rate": 3.503471160983278e-05, "loss": 1.3227, "step": 9400 }, { "epoch": 2.12, "grad_norm": 9.085989952087402, "learning_rate": 3.5012880408680085e-05, "loss": 1.2133, "step": 9410 }, { "epoch": 2.12, "grad_norm": 2.847717761993408, "learning_rate": 3.49910492075274e-05, "loss": 0.8088, "step": 9420 }, { "epoch": 2.12, "grad_norm": 11.476191520690918, "learning_rate": 3.496921800637471e-05, "loss": 1.4469, "step": 9430 }, { "epoch": 2.12, "grad_norm": 17.413597106933594, "learning_rate": 3.4947386805222024e-05, "loss": 1.0108, "step": 9440 }, { "epoch": 2.12, "grad_norm": 10.87855052947998, "learning_rate": 3.492555560406934e-05, "loss": 1.2837, "step": 9450 }, { "epoch": 2.12, "grad_norm": 4.182565689086914, "learning_rate": 3.490372440291665e-05, "loss": 1.2423, "step": 9460 }, { "epoch": 2.12, "grad_norm": 8.531558990478516, "learning_rate": 3.488189320176396e-05, "loss": 1.4975, "step": 9470 }, { "epoch": 2.12, "grad_norm": 12.245074272155762, "learning_rate": 3.486006200061128e-05, "loss": 1.0601, "step": 9480 }, { "epoch": 2.12, "grad_norm": 12.894485473632812, "learning_rate": 3.4838230799458586e-05, "loss": 1.1734, "step": 9490 }, { "epoch": 2.12, "grad_norm": 13.966607093811035, "learning_rate": 3.48163995983059e-05, "loss": 0.8893, "step": 9500 }, { "epoch": 2.12, "grad_norm": 16.224842071533203, "learning_rate": 3.4794568397153217e-05, "loss": 1.3193, "step": 9510 }, { "epoch": 2.12, "grad_norm": 4.688863754272461, "learning_rate": 3.4772737196000525e-05, "loss": 1.3676, "step": 9520 }, { "epoch": 2.12, "grad_norm": 6.886513710021973, "learning_rate": 3.475090599484784e-05, "loss": 1.1562, "step": 9530 }, { "epoch": 2.12, "grad_norm": 8.741244316101074, "learning_rate": 3.472907479369515e-05, "loss": 1.5383, "step": 9540 }, { "epoch": 2.12, "eval_accuracy": 0.6452145214521452, "eval_loss": 1.200659155845642, "eval_runtime": 391.3754, "eval_samples_per_second": 10.839, "eval_steps_per_second": 2.711, "step": 9543 }, { "epoch": 3.0, "grad_norm": 12.914654731750488, "learning_rate": 3.4707243592542464e-05, "loss": 1.4951, "step": 9550 }, { "epoch": 3.0, "grad_norm": 8.256881713867188, "learning_rate": 3.468541239138978e-05, "loss": 1.0147, "step": 9560 }, { "epoch": 3.0, "grad_norm": 8.062272071838379, "learning_rate": 3.466358119023709e-05, "loss": 0.9157, "step": 9570 }, { "epoch": 3.0, "grad_norm": 15.298491477966309, "learning_rate": 3.4641749989084396e-05, "loss": 1.4513, "step": 9580 }, { "epoch": 3.0, "grad_norm": 10.917853355407715, "learning_rate": 3.461991878793172e-05, "loss": 0.9637, "step": 9590 }, { "epoch": 3.0, "grad_norm": 7.624248504638672, "learning_rate": 3.4598087586779026e-05, "loss": 1.1946, "step": 9600 }, { "epoch": 3.0, "grad_norm": 10.209529876708984, "learning_rate": 3.4576256385626334e-05, "loss": 1.0378, "step": 9610 }, { "epoch": 3.0, "grad_norm": 9.969310760498047, "learning_rate": 3.455442518447365e-05, "loss": 0.8888, "step": 9620 }, { "epoch": 3.0, "grad_norm": 14.444302558898926, "learning_rate": 3.4532593983320965e-05, "loss": 1.2676, "step": 9630 }, { "epoch": 3.0, "grad_norm": 10.476544380187988, "learning_rate": 3.451076278216828e-05, "loss": 1.0983, "step": 9640 }, { "epoch": 3.0, "grad_norm": 5.764463424682617, "learning_rate": 3.448893158101559e-05, "loss": 0.9734, "step": 9650 }, { "epoch": 3.0, "grad_norm": 10.36607551574707, "learning_rate": 3.4467100379862903e-05, "loss": 1.0671, "step": 9660 }, { "epoch": 3.0, "grad_norm": 8.946820259094238, "learning_rate": 3.444526917871022e-05, "loss": 1.4005, "step": 9670 }, { "epoch": 3.01, "grad_norm": 13.740750312805176, "learning_rate": 3.442343797755753e-05, "loss": 0.9685, "step": 9680 }, { "epoch": 3.01, "grad_norm": 11.432165145874023, "learning_rate": 3.4401606776404835e-05, "loss": 1.7425, "step": 9690 }, { "epoch": 3.01, "grad_norm": 8.642751693725586, "learning_rate": 3.437977557525216e-05, "loss": 0.9433, "step": 9700 }, { "epoch": 3.01, "grad_norm": 9.338411331176758, "learning_rate": 3.4357944374099466e-05, "loss": 1.4972, "step": 9710 }, { "epoch": 3.01, "grad_norm": 8.963696479797363, "learning_rate": 3.4336113172946774e-05, "loss": 1.3106, "step": 9720 }, { "epoch": 3.01, "grad_norm": 8.116684913635254, "learning_rate": 3.431428197179409e-05, "loss": 1.4631, "step": 9730 }, { "epoch": 3.01, "grad_norm": 7.657939910888672, "learning_rate": 3.4292450770641405e-05, "loss": 1.109, "step": 9740 }, { "epoch": 3.01, "grad_norm": 8.110799789428711, "learning_rate": 3.427061956948871e-05, "loss": 0.9806, "step": 9750 }, { "epoch": 3.01, "grad_norm": 8.791415214538574, "learning_rate": 3.424878836833603e-05, "loss": 1.0944, "step": 9760 }, { "epoch": 3.01, "grad_norm": 10.378591537475586, "learning_rate": 3.4226957167183336e-05, "loss": 1.243, "step": 9770 }, { "epoch": 3.01, "grad_norm": 15.603302001953125, "learning_rate": 3.420512596603065e-05, "loss": 1.6138, "step": 9780 }, { "epoch": 3.01, "grad_norm": 15.08752155303955, "learning_rate": 3.418329476487797e-05, "loss": 1.0914, "step": 9790 }, { "epoch": 3.01, "grad_norm": 2.3594017028808594, "learning_rate": 3.4161463563725275e-05, "loss": 1.0651, "step": 9800 }, { "epoch": 3.01, "grad_norm": 3.262540340423584, "learning_rate": 3.413963236257259e-05, "loss": 0.7645, "step": 9810 }, { "epoch": 3.01, "grad_norm": 7.27140998840332, "learning_rate": 3.4117801161419906e-05, "loss": 1.1904, "step": 9820 }, { "epoch": 3.01, "grad_norm": 6.888250827789307, "learning_rate": 3.4095969960267214e-05, "loss": 1.0653, "step": 9830 }, { "epoch": 3.01, "grad_norm": 10.473215103149414, "learning_rate": 3.407413875911453e-05, "loss": 1.2414, "step": 9840 }, { "epoch": 3.01, "grad_norm": 8.186983108520508, "learning_rate": 3.4052307557961844e-05, "loss": 0.8326, "step": 9850 }, { "epoch": 3.01, "grad_norm": 6.812361240386963, "learning_rate": 3.403047635680915e-05, "loss": 1.3292, "step": 9860 }, { "epoch": 3.01, "grad_norm": 6.289360523223877, "learning_rate": 3.400864515565647e-05, "loss": 1.3329, "step": 9870 }, { "epoch": 3.01, "grad_norm": 7.619573593139648, "learning_rate": 3.3986813954503776e-05, "loss": 0.9479, "step": 9880 }, { "epoch": 3.01, "grad_norm": 7.525688648223877, "learning_rate": 3.396498275335109e-05, "loss": 1.2269, "step": 9890 }, { "epoch": 3.01, "grad_norm": 10.449602127075195, "learning_rate": 3.3943151552198407e-05, "loss": 1.0017, "step": 9900 }, { "epoch": 3.01, "grad_norm": 9.058977127075195, "learning_rate": 3.3921320351045715e-05, "loss": 1.1795, "step": 9910 }, { "epoch": 3.01, "grad_norm": 3.9881622791290283, "learning_rate": 3.389948914989302e-05, "loss": 1.1927, "step": 9920 }, { "epoch": 3.02, "grad_norm": 10.046173095703125, "learning_rate": 3.3877657948740345e-05, "loss": 1.1666, "step": 9930 }, { "epoch": 3.02, "grad_norm": 9.762126922607422, "learning_rate": 3.3855826747587654e-05, "loss": 1.012, "step": 9940 }, { "epoch": 3.02, "grad_norm": 11.69827938079834, "learning_rate": 3.383399554643497e-05, "loss": 1.0931, "step": 9950 }, { "epoch": 3.02, "grad_norm": 8.370316505432129, "learning_rate": 3.381216434528228e-05, "loss": 0.79, "step": 9960 }, { "epoch": 3.02, "grad_norm": 8.71602725982666, "learning_rate": 3.379033314412959e-05, "loss": 0.8558, "step": 9970 }, { "epoch": 3.02, "grad_norm": 9.7393159866333, "learning_rate": 3.376850194297691e-05, "loss": 1.0726, "step": 9980 }, { "epoch": 3.02, "grad_norm": 9.084054946899414, "learning_rate": 3.3746670741824216e-05, "loss": 1.2315, "step": 9990 }, { "epoch": 3.02, "grad_norm": 3.29392671585083, "learning_rate": 3.3724839540671524e-05, "loss": 0.9782, "step": 10000 }, { "epoch": 3.02, "grad_norm": 9.95692253112793, "learning_rate": 3.3703008339518846e-05, "loss": 1.1326, "step": 10010 }, { "epoch": 3.02, "grad_norm": 13.31050968170166, "learning_rate": 3.3681177138366155e-05, "loss": 1.2638, "step": 10020 }, { "epoch": 3.02, "grad_norm": 8.067925453186035, "learning_rate": 3.365934593721346e-05, "loss": 1.3101, "step": 10030 }, { "epoch": 3.02, "grad_norm": 1.994728922843933, "learning_rate": 3.3637514736060785e-05, "loss": 1.1431, "step": 10040 }, { "epoch": 3.02, "grad_norm": 9.318438529968262, "learning_rate": 3.3615683534908094e-05, "loss": 1.1549, "step": 10050 }, { "epoch": 3.02, "grad_norm": 5.434901237487793, "learning_rate": 3.35938523337554e-05, "loss": 1.1703, "step": 10060 }, { "epoch": 3.02, "grad_norm": 10.148241996765137, "learning_rate": 3.357202113260272e-05, "loss": 1.4215, "step": 10070 }, { "epoch": 3.02, "grad_norm": 8.392454147338867, "learning_rate": 3.355018993145003e-05, "loss": 1.0815, "step": 10080 }, { "epoch": 3.02, "grad_norm": 5.88727331161499, "learning_rate": 3.352835873029734e-05, "loss": 1.1391, "step": 10090 }, { "epoch": 3.02, "grad_norm": 11.211675643920898, "learning_rate": 3.3506527529144656e-05, "loss": 1.1807, "step": 10100 }, { "epoch": 3.02, "grad_norm": 7.479984760284424, "learning_rate": 3.3484696327991964e-05, "loss": 1.1585, "step": 10110 }, { "epoch": 3.02, "grad_norm": 28.300212860107422, "learning_rate": 3.346286512683928e-05, "loss": 1.0493, "step": 10120 }, { "epoch": 3.02, "grad_norm": 7.998159885406494, "learning_rate": 3.3441033925686595e-05, "loss": 1.5319, "step": 10130 }, { "epoch": 3.02, "grad_norm": 3.3382136821746826, "learning_rate": 3.34192027245339e-05, "loss": 0.9603, "step": 10140 }, { "epoch": 3.02, "grad_norm": 13.15101432800293, "learning_rate": 3.339737152338122e-05, "loss": 1.4557, "step": 10150 }, { "epoch": 3.02, "grad_norm": 7.072193145751953, "learning_rate": 3.337554032222853e-05, "loss": 1.0435, "step": 10160 }, { "epoch": 3.02, "grad_norm": 9.291816711425781, "learning_rate": 3.335370912107584e-05, "loss": 1.1473, "step": 10170 }, { "epoch": 3.03, "grad_norm": 9.348976135253906, "learning_rate": 3.333187791992316e-05, "loss": 1.2946, "step": 10180 }, { "epoch": 3.03, "grad_norm": 16.373863220214844, "learning_rate": 3.3310046718770465e-05, "loss": 1.5654, "step": 10190 }, { "epoch": 3.03, "grad_norm": 11.688248634338379, "learning_rate": 3.328821551761778e-05, "loss": 1.6156, "step": 10200 }, { "epoch": 3.03, "grad_norm": 8.236863136291504, "learning_rate": 3.3266384316465096e-05, "loss": 1.0757, "step": 10210 }, { "epoch": 3.03, "grad_norm": 19.430217742919922, "learning_rate": 3.3244553115312404e-05, "loss": 1.2994, "step": 10220 }, { "epoch": 3.03, "grad_norm": 8.403687477111816, "learning_rate": 3.322272191415972e-05, "loss": 1.1626, "step": 10230 }, { "epoch": 3.03, "grad_norm": 9.98920726776123, "learning_rate": 3.3200890713007034e-05, "loss": 0.9486, "step": 10240 }, { "epoch": 3.03, "grad_norm": 9.799349784851074, "learning_rate": 3.317905951185434e-05, "loss": 1.3295, "step": 10250 }, { "epoch": 3.03, "grad_norm": 9.48578929901123, "learning_rate": 3.315722831070166e-05, "loss": 1.4019, "step": 10260 }, { "epoch": 3.03, "grad_norm": 7.63637113571167, "learning_rate": 3.313539710954897e-05, "loss": 0.7789, "step": 10270 }, { "epoch": 3.03, "grad_norm": 8.749222755432129, "learning_rate": 3.311356590839628e-05, "loss": 1.3271, "step": 10280 }, { "epoch": 3.03, "grad_norm": 7.529891490936279, "learning_rate": 3.30917347072436e-05, "loss": 0.9948, "step": 10290 }, { "epoch": 3.03, "grad_norm": 9.81517219543457, "learning_rate": 3.3069903506090905e-05, "loss": 0.9846, "step": 10300 }, { "epoch": 3.03, "grad_norm": 6.512310981750488, "learning_rate": 3.304807230493822e-05, "loss": 1.0527, "step": 10310 }, { "epoch": 3.03, "grad_norm": 7.11715841293335, "learning_rate": 3.3026241103785535e-05, "loss": 0.9881, "step": 10320 }, { "epoch": 3.03, "grad_norm": 8.273052215576172, "learning_rate": 3.3004409902632844e-05, "loss": 1.3606, "step": 10330 }, { "epoch": 3.03, "grad_norm": 8.270402908325195, "learning_rate": 3.298257870148015e-05, "loss": 1.1437, "step": 10340 }, { "epoch": 3.03, "grad_norm": 3.199420928955078, "learning_rate": 3.2960747500327474e-05, "loss": 0.9835, "step": 10350 }, { "epoch": 3.03, "grad_norm": 6.723785400390625, "learning_rate": 3.293891629917478e-05, "loss": 1.0319, "step": 10360 }, { "epoch": 3.03, "grad_norm": 8.974822998046875, "learning_rate": 3.291708509802209e-05, "loss": 1.0206, "step": 10370 }, { "epoch": 3.03, "grad_norm": 13.581647872924805, "learning_rate": 3.2895253896869406e-05, "loss": 1.6742, "step": 10380 }, { "epoch": 3.03, "grad_norm": 10.690033912658691, "learning_rate": 3.287342269571672e-05, "loss": 1.1326, "step": 10390 }, { "epoch": 3.03, "grad_norm": 6.253451347351074, "learning_rate": 3.285159149456403e-05, "loss": 0.9193, "step": 10400 }, { "epoch": 3.03, "grad_norm": 10.628779411315918, "learning_rate": 3.2829760293411345e-05, "loss": 1.4522, "step": 10410 }, { "epoch": 3.03, "grad_norm": 6.158343315124512, "learning_rate": 3.280792909225866e-05, "loss": 1.3551, "step": 10420 }, { "epoch": 3.03, "grad_norm": 11.623557090759277, "learning_rate": 3.278609789110597e-05, "loss": 1.1779, "step": 10430 }, { "epoch": 3.04, "grad_norm": 6.426126003265381, "learning_rate": 3.2764266689953284e-05, "loss": 1.1344, "step": 10440 }, { "epoch": 3.04, "grad_norm": 16.0042781829834, "learning_rate": 3.274243548880059e-05, "loss": 1.4459, "step": 10450 }, { "epoch": 3.04, "grad_norm": 5.915554046630859, "learning_rate": 3.272060428764791e-05, "loss": 1.2526, "step": 10460 }, { "epoch": 3.04, "grad_norm": 7.384095191955566, "learning_rate": 3.269877308649522e-05, "loss": 0.8833, "step": 10470 }, { "epoch": 3.04, "grad_norm": 6.016345977783203, "learning_rate": 3.267694188534253e-05, "loss": 1.0851, "step": 10480 }, { "epoch": 3.04, "grad_norm": 8.065629959106445, "learning_rate": 3.2655110684189846e-05, "loss": 1.5879, "step": 10490 }, { "epoch": 3.04, "grad_norm": 24.0394344329834, "learning_rate": 3.263327948303716e-05, "loss": 1.4237, "step": 10500 }, { "epoch": 3.04, "grad_norm": 10.563517570495605, "learning_rate": 3.261144828188447e-05, "loss": 1.3147, "step": 10510 }, { "epoch": 3.04, "grad_norm": 11.242913246154785, "learning_rate": 3.2589617080731785e-05, "loss": 1.2347, "step": 10520 }, { "epoch": 3.04, "grad_norm": 7.010312557220459, "learning_rate": 3.256778587957909e-05, "loss": 1.4469, "step": 10530 }, { "epoch": 3.04, "grad_norm": 8.72696304321289, "learning_rate": 3.254595467842641e-05, "loss": 1.1676, "step": 10540 }, { "epoch": 3.04, "grad_norm": 15.148387908935547, "learning_rate": 3.252412347727372e-05, "loss": 1.2508, "step": 10550 }, { "epoch": 3.04, "grad_norm": 10.924818992614746, "learning_rate": 3.250229227612103e-05, "loss": 1.4615, "step": 10560 }, { "epoch": 3.04, "grad_norm": 9.585631370544434, "learning_rate": 3.248046107496835e-05, "loss": 1.3493, "step": 10570 }, { "epoch": 3.04, "grad_norm": 13.758076667785645, "learning_rate": 3.245862987381566e-05, "loss": 0.9924, "step": 10580 }, { "epoch": 3.04, "grad_norm": 11.401185035705566, "learning_rate": 3.243679867266297e-05, "loss": 1.5328, "step": 10590 }, { "epoch": 3.04, "grad_norm": 9.216519355773926, "learning_rate": 3.2414967471510286e-05, "loss": 0.7139, "step": 10600 }, { "epoch": 3.04, "grad_norm": 6.171424865722656, "learning_rate": 3.23931362703576e-05, "loss": 1.1364, "step": 10610 }, { "epoch": 3.04, "grad_norm": 12.21530818939209, "learning_rate": 3.237130506920491e-05, "loss": 1.245, "step": 10620 }, { "epoch": 3.04, "grad_norm": 9.306144714355469, "learning_rate": 3.2349473868052224e-05, "loss": 1.3813, "step": 10630 }, { "epoch": 3.04, "grad_norm": 7.814422130584717, "learning_rate": 3.232764266689953e-05, "loss": 1.0819, "step": 10640 }, { "epoch": 3.04, "grad_norm": 12.978373527526855, "learning_rate": 3.230581146574685e-05, "loss": 1.0463, "step": 10650 }, { "epoch": 3.04, "grad_norm": 11.514069557189941, "learning_rate": 3.228398026459416e-05, "loss": 1.4232, "step": 10660 }, { "epoch": 3.04, "grad_norm": 9.382047653198242, "learning_rate": 3.226214906344147e-05, "loss": 1.1928, "step": 10670 }, { "epoch": 3.04, "grad_norm": 7.7506585121154785, "learning_rate": 3.224031786228878e-05, "loss": 0.8882, "step": 10680 }, { "epoch": 3.05, "grad_norm": 10.884236335754395, "learning_rate": 3.22184866611361e-05, "loss": 1.2765, "step": 10690 }, { "epoch": 3.05, "grad_norm": 4.817571640014648, "learning_rate": 3.219665545998341e-05, "loss": 1.4092, "step": 10700 }, { "epoch": 3.05, "grad_norm": 9.602187156677246, "learning_rate": 3.217482425883072e-05, "loss": 1.7147, "step": 10710 }, { "epoch": 3.05, "grad_norm": 9.705577850341797, "learning_rate": 3.2152993057678034e-05, "loss": 1.4774, "step": 10720 }, { "epoch": 3.05, "grad_norm": 4.031256675720215, "learning_rate": 3.213116185652535e-05, "loss": 1.3663, "step": 10730 }, { "epoch": 3.05, "grad_norm": 10.3754243850708, "learning_rate": 3.210933065537266e-05, "loss": 1.2403, "step": 10740 }, { "epoch": 3.05, "grad_norm": 9.311732292175293, "learning_rate": 3.208749945421997e-05, "loss": 1.3426, "step": 10750 }, { "epoch": 3.05, "grad_norm": 2.550180673599243, "learning_rate": 3.206566825306729e-05, "loss": 1.2089, "step": 10760 }, { "epoch": 3.05, "grad_norm": 14.743963241577148, "learning_rate": 3.2043837051914596e-05, "loss": 1.6274, "step": 10770 }, { "epoch": 3.05, "grad_norm": 9.267411231994629, "learning_rate": 3.202200585076191e-05, "loss": 1.4035, "step": 10780 }, { "epoch": 3.05, "grad_norm": 8.47838306427002, "learning_rate": 3.200017464960922e-05, "loss": 1.1911, "step": 10790 }, { "epoch": 3.05, "grad_norm": 7.267144203186035, "learning_rate": 3.1978343448456535e-05, "loss": 0.9747, "step": 10800 }, { "epoch": 3.05, "grad_norm": 7.812573432922363, "learning_rate": 3.195651224730385e-05, "loss": 1.3411, "step": 10810 }, { "epoch": 3.05, "grad_norm": 2.3480443954467773, "learning_rate": 3.193468104615116e-05, "loss": 1.1569, "step": 10820 }, { "epoch": 3.05, "grad_norm": 6.847988128662109, "learning_rate": 3.1912849844998474e-05, "loss": 1.1875, "step": 10830 }, { "epoch": 3.05, "grad_norm": 10.475046157836914, "learning_rate": 3.189101864384579e-05, "loss": 0.7476, "step": 10840 }, { "epoch": 3.05, "grad_norm": 8.156534194946289, "learning_rate": 3.18691874426931e-05, "loss": 0.9395, "step": 10850 }, { "epoch": 3.05, "grad_norm": 16.633739471435547, "learning_rate": 3.184735624154041e-05, "loss": 1.6967, "step": 10860 }, { "epoch": 3.05, "grad_norm": 12.172830581665039, "learning_rate": 3.182552504038772e-05, "loss": 1.95, "step": 10870 }, { "epoch": 3.05, "grad_norm": 8.555020332336426, "learning_rate": 3.1803693839235036e-05, "loss": 1.3558, "step": 10880 }, { "epoch": 3.05, "grad_norm": 6.4003376960754395, "learning_rate": 3.178186263808235e-05, "loss": 1.6663, "step": 10890 }, { "epoch": 3.05, "grad_norm": 10.371899604797363, "learning_rate": 3.176003143692966e-05, "loss": 1.143, "step": 10900 }, { "epoch": 3.05, "grad_norm": 5.4215545654296875, "learning_rate": 3.1738200235776975e-05, "loss": 1.07, "step": 10910 }, { "epoch": 3.05, "grad_norm": 11.012886047363281, "learning_rate": 3.171636903462429e-05, "loss": 1.1161, "step": 10920 }, { "epoch": 3.05, "grad_norm": 7.175322532653809, "learning_rate": 3.16945378334716e-05, "loss": 0.9937, "step": 10930 }, { "epoch": 3.05, "grad_norm": 9.558371543884277, "learning_rate": 3.167270663231891e-05, "loss": 1.0471, "step": 10940 }, { "epoch": 3.06, "grad_norm": 7.509729385375977, "learning_rate": 3.165087543116623e-05, "loss": 0.9908, "step": 10950 }, { "epoch": 3.06, "grad_norm": 2.29264760017395, "learning_rate": 3.162904423001354e-05, "loss": 1.0029, "step": 10960 }, { "epoch": 3.06, "grad_norm": 17.51969337463379, "learning_rate": 3.160721302886085e-05, "loss": 1.2134, "step": 10970 }, { "epoch": 3.06, "grad_norm": 5.316622734069824, "learning_rate": 3.158538182770816e-05, "loss": 1.1858, "step": 10980 }, { "epoch": 3.06, "grad_norm": 15.948904037475586, "learning_rate": 3.1563550626555476e-05, "loss": 0.871, "step": 10990 }, { "epoch": 3.06, "grad_norm": 12.288128852844238, "learning_rate": 3.154171942540279e-05, "loss": 0.9975, "step": 11000 }, { "epoch": 3.06, "grad_norm": 4.6583709716796875, "learning_rate": 3.15198882242501e-05, "loss": 1.0058, "step": 11010 }, { "epoch": 3.06, "grad_norm": 7.930287837982178, "learning_rate": 3.149805702309741e-05, "loss": 1.0087, "step": 11020 }, { "epoch": 3.06, "grad_norm": 9.56474494934082, "learning_rate": 3.147622582194473e-05, "loss": 1.0976, "step": 11030 }, { "epoch": 3.06, "grad_norm": 9.481861114501953, "learning_rate": 3.145439462079204e-05, "loss": 1.1339, "step": 11040 }, { "epoch": 3.06, "grad_norm": 9.845037460327148, "learning_rate": 3.1432563419639346e-05, "loss": 0.9755, "step": 11050 }, { "epoch": 3.06, "grad_norm": 11.976770401000977, "learning_rate": 3.141073221848666e-05, "loss": 1.1185, "step": 11060 }, { "epoch": 3.06, "grad_norm": 10.453662872314453, "learning_rate": 3.138890101733398e-05, "loss": 1.0059, "step": 11070 }, { "epoch": 3.06, "grad_norm": 5.161324977874756, "learning_rate": 3.1367069816181285e-05, "loss": 0.9511, "step": 11080 }, { "epoch": 3.06, "grad_norm": 12.51476764678955, "learning_rate": 3.13452386150286e-05, "loss": 0.9743, "step": 11090 }, { "epoch": 3.06, "grad_norm": 14.148575782775879, "learning_rate": 3.132340741387591e-05, "loss": 1.3285, "step": 11100 }, { "epoch": 3.06, "grad_norm": 24.71393394470215, "learning_rate": 3.1301576212723224e-05, "loss": 1.2451, "step": 11110 }, { "epoch": 3.06, "grad_norm": 8.589850425720215, "learning_rate": 3.127974501157054e-05, "loss": 1.2511, "step": 11120 }, { "epoch": 3.06, "grad_norm": 14.497078895568848, "learning_rate": 3.125791381041785e-05, "loss": 1.476, "step": 11130 }, { "epoch": 3.06, "grad_norm": 12.509770393371582, "learning_rate": 3.123608260926517e-05, "loss": 1.4715, "step": 11140 }, { "epoch": 3.06, "grad_norm": 19.378536224365234, "learning_rate": 3.121425140811248e-05, "loss": 1.0658, "step": 11150 }, { "epoch": 3.06, "grad_norm": 11.65372085571289, "learning_rate": 3.1192420206959786e-05, "loss": 1.2546, "step": 11160 }, { "epoch": 3.06, "grad_norm": 11.705025672912598, "learning_rate": 3.11705890058071e-05, "loss": 1.2314, "step": 11170 }, { "epoch": 3.06, "grad_norm": 8.887195587158203, "learning_rate": 3.1148757804654416e-05, "loss": 1.2884, "step": 11180 }, { "epoch": 3.06, "grad_norm": 2.041499614715576, "learning_rate": 3.1126926603501725e-05, "loss": 0.9299, "step": 11190 }, { "epoch": 3.07, "grad_norm": 12.161649703979492, "learning_rate": 3.110509540234904e-05, "loss": 1.2834, "step": 11200 }, { "epoch": 3.07, "grad_norm": 2.216449499130249, "learning_rate": 3.108326420119635e-05, "loss": 1.1107, "step": 11210 }, { "epoch": 3.07, "grad_norm": 12.04345417022705, "learning_rate": 3.1061433000043664e-05, "loss": 1.2849, "step": 11220 }, { "epoch": 3.07, "grad_norm": 10.982839584350586, "learning_rate": 3.103960179889098e-05, "loss": 1.29, "step": 11230 }, { "epoch": 3.07, "grad_norm": 7.2620086669921875, "learning_rate": 3.101777059773829e-05, "loss": 1.2533, "step": 11240 }, { "epoch": 3.07, "grad_norm": 5.981380939483643, "learning_rate": 3.09959393965856e-05, "loss": 1.1331, "step": 11250 }, { "epoch": 3.07, "grad_norm": 12.066489219665527, "learning_rate": 3.097410819543292e-05, "loss": 1.0358, "step": 11260 }, { "epoch": 3.07, "grad_norm": 22.076650619506836, "learning_rate": 3.0952276994280226e-05, "loss": 1.3283, "step": 11270 }, { "epoch": 3.07, "grad_norm": 7.056905269622803, "learning_rate": 3.093044579312754e-05, "loss": 0.7736, "step": 11280 }, { "epoch": 3.07, "grad_norm": 4.653173446655273, "learning_rate": 3.090861459197485e-05, "loss": 1.3276, "step": 11290 }, { "epoch": 3.07, "grad_norm": 17.94127082824707, "learning_rate": 3.0886783390822165e-05, "loss": 1.2326, "step": 11300 }, { "epoch": 3.07, "grad_norm": 13.081766128540039, "learning_rate": 3.086495218966948e-05, "loss": 0.9709, "step": 11310 }, { "epoch": 3.07, "grad_norm": 8.861055374145508, "learning_rate": 3.084312098851679e-05, "loss": 1.0176, "step": 11320 }, { "epoch": 3.07, "grad_norm": 7.273895263671875, "learning_rate": 3.08212897873641e-05, "loss": 0.9611, "step": 11330 }, { "epoch": 3.07, "grad_norm": 6.208264350891113, "learning_rate": 3.079945858621142e-05, "loss": 0.9444, "step": 11340 }, { "epoch": 3.07, "grad_norm": 11.617364883422852, "learning_rate": 3.077762738505873e-05, "loss": 1.4588, "step": 11350 }, { "epoch": 3.07, "grad_norm": 2.946704864501953, "learning_rate": 3.0755796183906035e-05, "loss": 1.6045, "step": 11360 }, { "epoch": 3.07, "grad_norm": 9.546661376953125, "learning_rate": 3.073396498275336e-05, "loss": 1.2466, "step": 11370 }, { "epoch": 3.07, "grad_norm": 7.4591064453125, "learning_rate": 3.0712133781600666e-05, "loss": 1.0199, "step": 11380 }, { "epoch": 3.07, "grad_norm": 4.636168479919434, "learning_rate": 3.0690302580447974e-05, "loss": 1.0663, "step": 11390 }, { "epoch": 3.07, "grad_norm": 6.2078399658203125, "learning_rate": 3.066847137929529e-05, "loss": 1.228, "step": 11400 }, { "epoch": 3.07, "grad_norm": 5.180665969848633, "learning_rate": 3.0646640178142604e-05, "loss": 1.0104, "step": 11410 }, { "epoch": 3.07, "grad_norm": 23.514713287353516, "learning_rate": 3.062480897698991e-05, "loss": 1.486, "step": 11420 }, { "epoch": 3.07, "grad_norm": 9.88407039642334, "learning_rate": 3.060297777583723e-05, "loss": 1.14, "step": 11430 }, { "epoch": 3.07, "grad_norm": 4.529387474060059, "learning_rate": 3.0581146574684536e-05, "loss": 1.0019, "step": 11440 }, { "epoch": 3.07, "grad_norm": 3.4746382236480713, "learning_rate": 3.055931537353185e-05, "loss": 1.1055, "step": 11450 }, { "epoch": 3.08, "grad_norm": 8.909346580505371, "learning_rate": 3.053748417237917e-05, "loss": 1.1356, "step": 11460 }, { "epoch": 3.08, "grad_norm": 10.162489891052246, "learning_rate": 3.051565297122648e-05, "loss": 0.914, "step": 11470 }, { "epoch": 3.08, "grad_norm": 7.058337211608887, "learning_rate": 3.0493821770073787e-05, "loss": 0.7797, "step": 11480 }, { "epoch": 3.08, "grad_norm": 12.32120132446289, "learning_rate": 3.0471990568921105e-05, "loss": 1.0455, "step": 11490 }, { "epoch": 3.08, "grad_norm": 13.363324165344238, "learning_rate": 3.0450159367768417e-05, "loss": 1.4521, "step": 11500 }, { "epoch": 3.08, "grad_norm": 13.155016899108887, "learning_rate": 3.0428328166615726e-05, "loss": 1.112, "step": 11510 }, { "epoch": 3.08, "grad_norm": 4.535550117492676, "learning_rate": 3.0406496965463044e-05, "loss": 0.9661, "step": 11520 }, { "epoch": 3.08, "grad_norm": 8.993491172790527, "learning_rate": 3.0384665764310356e-05, "loss": 1.5635, "step": 11530 }, { "epoch": 3.08, "grad_norm": 1.9522517919540405, "learning_rate": 3.0362834563157668e-05, "loss": 0.8698, "step": 11540 }, { "epoch": 3.08, "grad_norm": 8.874967575073242, "learning_rate": 3.0341003362004976e-05, "loss": 1.1839, "step": 11550 }, { "epoch": 3.08, "grad_norm": 9.989808082580566, "learning_rate": 3.0319172160852295e-05, "loss": 1.0773, "step": 11560 }, { "epoch": 3.08, "grad_norm": 10.150749206542969, "learning_rate": 3.0297340959699606e-05, "loss": 1.2126, "step": 11570 }, { "epoch": 3.08, "grad_norm": 9.283119201660156, "learning_rate": 3.0275509758546915e-05, "loss": 1.2137, "step": 11580 }, { "epoch": 3.08, "grad_norm": 11.815513610839844, "learning_rate": 3.0253678557394227e-05, "loss": 0.9419, "step": 11590 }, { "epoch": 3.08, "grad_norm": 11.510446548461914, "learning_rate": 3.0231847356241545e-05, "loss": 1.1911, "step": 11600 }, { "epoch": 3.08, "grad_norm": 8.54575252532959, "learning_rate": 3.0210016155088854e-05, "loss": 0.8849, "step": 11610 }, { "epoch": 3.08, "grad_norm": 7.760162353515625, "learning_rate": 3.0188184953936165e-05, "loss": 1.4317, "step": 11620 }, { "epoch": 3.08, "grad_norm": 14.357382774353027, "learning_rate": 3.0166353752783477e-05, "loss": 0.9573, "step": 11630 }, { "epoch": 3.08, "grad_norm": 11.202889442443848, "learning_rate": 3.0144522551630792e-05, "loss": 1.3239, "step": 11640 }, { "epoch": 3.08, "grad_norm": 15.981091499328613, "learning_rate": 3.0122691350478104e-05, "loss": 1.5431, "step": 11650 }, { "epoch": 3.08, "grad_norm": 5.233941555023193, "learning_rate": 3.0100860149325416e-05, "loss": 1.3416, "step": 11660 }, { "epoch": 3.08, "grad_norm": 11.780496597290039, "learning_rate": 3.0079028948172728e-05, "loss": 1.3634, "step": 11670 }, { "epoch": 3.08, "grad_norm": 27.69906997680664, "learning_rate": 3.0057197747020043e-05, "loss": 1.7546, "step": 11680 }, { "epoch": 3.08, "grad_norm": 6.341325283050537, "learning_rate": 3.0035366545867355e-05, "loss": 1.2556, "step": 11690 }, { "epoch": 3.08, "grad_norm": 6.443033218383789, "learning_rate": 3.0013535344714666e-05, "loss": 1.228, "step": 11700 }, { "epoch": 3.09, "grad_norm": 2.231876850128174, "learning_rate": 2.999170414356198e-05, "loss": 0.806, "step": 11710 }, { "epoch": 3.09, "grad_norm": 13.271726608276367, "learning_rate": 2.9969872942409293e-05, "loss": 0.8769, "step": 11720 }, { "epoch": 3.09, "grad_norm": 6.617437839508057, "learning_rate": 2.9948041741256605e-05, "loss": 0.9776, "step": 11730 }, { "epoch": 3.09, "grad_norm": 12.114411354064941, "learning_rate": 2.9926210540103917e-05, "loss": 1.248, "step": 11740 }, { "epoch": 3.09, "grad_norm": 13.168693542480469, "learning_rate": 2.9904379338951232e-05, "loss": 1.4017, "step": 11750 }, { "epoch": 3.09, "grad_norm": 5.869525909423828, "learning_rate": 2.9882548137798544e-05, "loss": 0.8701, "step": 11760 }, { "epoch": 3.09, "grad_norm": 7.316917419433594, "learning_rate": 2.9860716936645856e-05, "loss": 1.0037, "step": 11770 }, { "epoch": 3.09, "grad_norm": 6.803651809692383, "learning_rate": 2.9838885735493167e-05, "loss": 0.7483, "step": 11780 }, { "epoch": 3.09, "grad_norm": 3.154744863510132, "learning_rate": 2.9817054534340483e-05, "loss": 1.2463, "step": 11790 }, { "epoch": 3.09, "grad_norm": 10.579174041748047, "learning_rate": 2.9795223333187794e-05, "loss": 1.7161, "step": 11800 }, { "epoch": 3.09, "grad_norm": 7.603121757507324, "learning_rate": 2.9773392132035106e-05, "loss": 1.5962, "step": 11810 }, { "epoch": 3.09, "grad_norm": 10.561923027038574, "learning_rate": 2.9751560930882415e-05, "loss": 1.1061, "step": 11820 }, { "epoch": 3.09, "grad_norm": 9.153263092041016, "learning_rate": 2.9729729729729733e-05, "loss": 1.1162, "step": 11830 }, { "epoch": 3.09, "grad_norm": 12.803223609924316, "learning_rate": 2.9707898528577045e-05, "loss": 1.1092, "step": 11840 }, { "epoch": 3.09, "grad_norm": 9.954139709472656, "learning_rate": 2.9686067327424353e-05, "loss": 0.8835, "step": 11850 }, { "epoch": 3.09, "grad_norm": 8.512142181396484, "learning_rate": 2.9664236126271665e-05, "loss": 1.2355, "step": 11860 }, { "epoch": 3.09, "grad_norm": 6.764357089996338, "learning_rate": 2.9642404925118984e-05, "loss": 1.4386, "step": 11870 }, { "epoch": 3.09, "grad_norm": 10.317313194274902, "learning_rate": 2.9620573723966295e-05, "loss": 0.9633, "step": 11880 }, { "epoch": 3.09, "grad_norm": 7.67103385925293, "learning_rate": 2.9598742522813604e-05, "loss": 1.4238, "step": 11890 }, { "epoch": 3.09, "grad_norm": 2.7083051204681396, "learning_rate": 2.9576911321660922e-05, "loss": 0.6811, "step": 11900 }, { "epoch": 3.09, "grad_norm": 7.054417610168457, "learning_rate": 2.9555080120508234e-05, "loss": 0.7134, "step": 11910 }, { "epoch": 3.09, "grad_norm": 7.829658031463623, "learning_rate": 2.9533248919355543e-05, "loss": 0.8884, "step": 11920 }, { "epoch": 3.09, "grad_norm": 16.902952194213867, "learning_rate": 2.9511417718202854e-05, "loss": 1.2334, "step": 11930 }, { "epoch": 3.09, "grad_norm": 15.651792526245117, "learning_rate": 2.9489586517050173e-05, "loss": 0.7682, "step": 11940 }, { "epoch": 3.09, "grad_norm": 8.101134300231934, "learning_rate": 2.946775531589748e-05, "loss": 1.0392, "step": 11950 }, { "epoch": 3.09, "grad_norm": 9.39798641204834, "learning_rate": 2.9445924114744793e-05, "loss": 1.354, "step": 11960 }, { "epoch": 3.1, "grad_norm": 9.823807716369629, "learning_rate": 2.9424092913592105e-05, "loss": 0.9289, "step": 11970 }, { "epoch": 3.1, "grad_norm": 2.006439208984375, "learning_rate": 2.940226171243942e-05, "loss": 1.2431, "step": 11980 }, { "epoch": 3.1, "grad_norm": 14.654336929321289, "learning_rate": 2.9380430511286732e-05, "loss": 1.7387, "step": 11990 }, { "epoch": 3.1, "grad_norm": 13.693473815917969, "learning_rate": 2.9358599310134044e-05, "loss": 1.0924, "step": 12000 }, { "epoch": 3.1, "grad_norm": 13.72071647644043, "learning_rate": 2.9336768108981355e-05, "loss": 1.244, "step": 12010 }, { "epoch": 3.1, "grad_norm": 7.943502902984619, "learning_rate": 2.931493690782867e-05, "loss": 0.9371, "step": 12020 }, { "epoch": 3.1, "grad_norm": 6.931057453155518, "learning_rate": 2.9293105706675982e-05, "loss": 1.5397, "step": 12030 }, { "epoch": 3.1, "grad_norm": 9.55163860321045, "learning_rate": 2.9271274505523294e-05, "loss": 1.4439, "step": 12040 }, { "epoch": 3.1, "grad_norm": 3.5247087478637695, "learning_rate": 2.924944330437061e-05, "loss": 1.3056, "step": 12050 }, { "epoch": 3.1, "grad_norm": 10.86681842803955, "learning_rate": 2.922761210321792e-05, "loss": 1.0746, "step": 12060 }, { "epoch": 3.1, "grad_norm": 5.180484294891357, "learning_rate": 2.9205780902065233e-05, "loss": 0.9483, "step": 12070 }, { "epoch": 3.1, "grad_norm": 9.507184982299805, "learning_rate": 2.9183949700912545e-05, "loss": 1.3, "step": 12080 }, { "epoch": 3.1, "grad_norm": 10.69293212890625, "learning_rate": 2.916211849975986e-05, "loss": 1.5727, "step": 12090 }, { "epoch": 3.1, "grad_norm": 4.30325984954834, "learning_rate": 2.914028729860717e-05, "loss": 0.731, "step": 12100 }, { "epoch": 3.1, "grad_norm": 11.065319061279297, "learning_rate": 2.9118456097454483e-05, "loss": 2.0348, "step": 12110 }, { "epoch": 3.1, "grad_norm": 9.229863166809082, "learning_rate": 2.9096624896301795e-05, "loss": 1.2605, "step": 12120 }, { "epoch": 3.1, "grad_norm": 9.275853157043457, "learning_rate": 2.907479369514911e-05, "loss": 1.489, "step": 12130 }, { "epoch": 3.1, "grad_norm": 6.916837692260742, "learning_rate": 2.9052962493996422e-05, "loss": 1.5641, "step": 12140 }, { "epoch": 3.1, "grad_norm": 7.4393181800842285, "learning_rate": 2.9031131292843734e-05, "loss": 1.5763, "step": 12150 }, { "epoch": 3.1, "grad_norm": 6.979135513305664, "learning_rate": 2.9009300091691042e-05, "loss": 1.3082, "step": 12160 }, { "epoch": 3.1, "grad_norm": 5.667503833770752, "learning_rate": 2.898746889053836e-05, "loss": 1.329, "step": 12170 }, { "epoch": 3.1, "grad_norm": 2.3347370624542236, "learning_rate": 2.8965637689385673e-05, "loss": 1.2681, "step": 12180 }, { "epoch": 3.1, "grad_norm": 12.032842636108398, "learning_rate": 2.8943806488232984e-05, "loss": 1.2273, "step": 12190 }, { "epoch": 3.1, "grad_norm": 12.323254585266113, "learning_rate": 2.8921975287080293e-05, "loss": 1.6201, "step": 12200 }, { "epoch": 3.1, "grad_norm": 10.17250919342041, "learning_rate": 2.890014408592761e-05, "loss": 1.4853, "step": 12210 }, { "epoch": 3.11, "grad_norm": 9.745219230651855, "learning_rate": 2.8878312884774923e-05, "loss": 1.2618, "step": 12220 }, { "epoch": 3.11, "grad_norm": 9.196125030517578, "learning_rate": 2.885648168362223e-05, "loss": 1.3129, "step": 12230 }, { "epoch": 3.11, "grad_norm": 10.650343894958496, "learning_rate": 2.883465048246955e-05, "loss": 1.0366, "step": 12240 }, { "epoch": 3.11, "grad_norm": 3.088823080062866, "learning_rate": 2.8812819281316862e-05, "loss": 1.3676, "step": 12250 }, { "epoch": 3.11, "grad_norm": 12.678071022033691, "learning_rate": 2.879098808016417e-05, "loss": 1.359, "step": 12260 }, { "epoch": 3.11, "grad_norm": 7.779239654541016, "learning_rate": 2.8769156879011482e-05, "loss": 1.6622, "step": 12270 }, { "epoch": 3.11, "grad_norm": 10.309340476989746, "learning_rate": 2.87473256778588e-05, "loss": 0.9899, "step": 12280 }, { "epoch": 3.11, "grad_norm": 8.22410774230957, "learning_rate": 2.872549447670611e-05, "loss": 1.0485, "step": 12290 }, { "epoch": 3.11, "grad_norm": 11.260028839111328, "learning_rate": 2.870366327555342e-05, "loss": 0.9084, "step": 12300 }, { "epoch": 3.11, "grad_norm": 8.30509090423584, "learning_rate": 2.8681832074400733e-05, "loss": 1.1053, "step": 12310 }, { "epoch": 3.11, "grad_norm": 7.860371112823486, "learning_rate": 2.866000087324805e-05, "loss": 1.0789, "step": 12320 }, { "epoch": 3.11, "grad_norm": 3.8172786235809326, "learning_rate": 2.863816967209536e-05, "loss": 1.1394, "step": 12330 }, { "epoch": 3.11, "grad_norm": 6.0185546875, "learning_rate": 2.861633847094267e-05, "loss": 1.0998, "step": 12340 }, { "epoch": 3.11, "grad_norm": 10.664830207824707, "learning_rate": 2.8594507269789983e-05, "loss": 1.4392, "step": 12350 }, { "epoch": 3.11, "grad_norm": 13.262534141540527, "learning_rate": 2.85726760686373e-05, "loss": 1.2106, "step": 12360 }, { "epoch": 3.11, "grad_norm": 5.346441268920898, "learning_rate": 2.855084486748461e-05, "loss": 1.2369, "step": 12370 }, { "epoch": 3.11, "grad_norm": 9.43067741394043, "learning_rate": 2.8529013666331922e-05, "loss": 1.0708, "step": 12380 }, { "epoch": 3.11, "grad_norm": 10.107817649841309, "learning_rate": 2.8507182465179234e-05, "loss": 1.132, "step": 12390 }, { "epoch": 3.11, "grad_norm": 9.317228317260742, "learning_rate": 2.848535126402655e-05, "loss": 1.1862, "step": 12400 }, { "epoch": 3.11, "grad_norm": 5.576760292053223, "learning_rate": 2.846352006287386e-05, "loss": 1.8185, "step": 12410 }, { "epoch": 3.11, "grad_norm": 5.2753095626831055, "learning_rate": 2.8441688861721172e-05, "loss": 1.1802, "step": 12420 }, { "epoch": 3.11, "grad_norm": 4.385393142700195, "learning_rate": 2.8419857660568488e-05, "loss": 1.2014, "step": 12430 }, { "epoch": 3.11, "grad_norm": 5.176544666290283, "learning_rate": 2.83980264594158e-05, "loss": 1.0198, "step": 12440 }, { "epoch": 3.11, "grad_norm": 14.364564895629883, "learning_rate": 2.837619525826311e-05, "loss": 1.4076, "step": 12450 }, { "epoch": 3.11, "grad_norm": 6.475866794586182, "learning_rate": 2.8354364057110423e-05, "loss": 1.0056, "step": 12460 }, { "epoch": 3.12, "grad_norm": 8.298385620117188, "learning_rate": 2.8332532855957738e-05, "loss": 1.2638, "step": 12470 }, { "epoch": 3.12, "grad_norm": 9.618647575378418, "learning_rate": 2.831070165480505e-05, "loss": 1.2134, "step": 12480 }, { "epoch": 3.12, "grad_norm": 9.428750038146973, "learning_rate": 2.828887045365236e-05, "loss": 1.4573, "step": 12490 }, { "epoch": 3.12, "grad_norm": 8.537643432617188, "learning_rate": 2.8267039252499673e-05, "loss": 1.4736, "step": 12500 }, { "epoch": 3.12, "grad_norm": 2.389181613922119, "learning_rate": 2.824520805134699e-05, "loss": 0.7278, "step": 12510 }, { "epoch": 3.12, "grad_norm": 8.930627822875977, "learning_rate": 2.82233768501943e-05, "loss": 1.0329, "step": 12520 }, { "epoch": 3.12, "grad_norm": 7.222578048706055, "learning_rate": 2.8201545649041612e-05, "loss": 0.8614, "step": 12530 }, { "epoch": 3.12, "grad_norm": 8.318516731262207, "learning_rate": 2.817971444788892e-05, "loss": 0.8867, "step": 12540 }, { "epoch": 3.12, "grad_norm": 11.308908462524414, "learning_rate": 2.815788324673624e-05, "loss": 1.1223, "step": 12550 }, { "epoch": 3.12, "grad_norm": 10.175277709960938, "learning_rate": 2.813605204558355e-05, "loss": 1.0336, "step": 12560 }, { "epoch": 3.12, "grad_norm": 17.536441802978516, "learning_rate": 2.811422084443086e-05, "loss": 1.4506, "step": 12570 }, { "epoch": 3.12, "grad_norm": 7.839116096496582, "learning_rate": 2.809238964327817e-05, "loss": 1.3656, "step": 12580 }, { "epoch": 3.12, "grad_norm": 5.562898635864258, "learning_rate": 2.807055844212549e-05, "loss": 1.2645, "step": 12590 }, { "epoch": 3.12, "grad_norm": 14.041899681091309, "learning_rate": 2.8048727240972798e-05, "loss": 1.2545, "step": 12600 }, { "epoch": 3.12, "grad_norm": 7.86154842376709, "learning_rate": 2.802689603982011e-05, "loss": 1.1184, "step": 12610 }, { "epoch": 3.12, "grad_norm": 6.332355976104736, "learning_rate": 2.800506483866743e-05, "loss": 1.3585, "step": 12620 }, { "epoch": 3.12, "grad_norm": 11.250005722045898, "learning_rate": 2.798323363751474e-05, "loss": 1.639, "step": 12630 }, { "epoch": 3.12, "grad_norm": 10.811439514160156, "learning_rate": 2.796140243636205e-05, "loss": 1.1521, "step": 12640 }, { "epoch": 3.12, "grad_norm": 19.984708786010742, "learning_rate": 2.793957123520936e-05, "loss": 1.3838, "step": 12650 }, { "epoch": 3.12, "grad_norm": 7.718048095703125, "learning_rate": 2.791774003405668e-05, "loss": 0.773, "step": 12660 }, { "epoch": 3.12, "grad_norm": 2.735309600830078, "learning_rate": 2.7895908832903987e-05, "loss": 1.0766, "step": 12670 }, { "epoch": 3.12, "grad_norm": 13.165864944458008, "learning_rate": 2.78740776317513e-05, "loss": 1.2647, "step": 12680 }, { "epoch": 3.12, "grad_norm": 11.21660041809082, "learning_rate": 2.785224643059861e-05, "loss": 1.5332, "step": 12690 }, { "epoch": 3.12, "grad_norm": 10.624415397644043, "learning_rate": 2.7830415229445926e-05, "loss": 1.1093, "step": 12700 }, { "epoch": 3.12, "grad_norm": 13.85206413269043, "learning_rate": 2.7808584028293238e-05, "loss": 1.1225, "step": 12710 }, { "epoch": 3.12, "grad_norm": 9.258134841918945, "learning_rate": 2.778675282714055e-05, "loss": 1.0339, "step": 12720 }, { "epoch": 3.12, "eval_accuracy": 0.6449787835926449, "eval_loss": 1.235801100730896, "eval_runtime": 395.6981, "eval_samples_per_second": 10.72, "eval_steps_per_second": 2.681, "step": 12724 }, { "epoch": 4.0, "grad_norm": 9.849520683288574, "learning_rate": 2.776492162598786e-05, "loss": 1.4241, "step": 12730 }, { "epoch": 4.0, "grad_norm": 6.570850849151611, "learning_rate": 2.7743090424835177e-05, "loss": 1.1123, "step": 12740 }, { "epoch": 4.0, "grad_norm": 12.022607803344727, "learning_rate": 2.772125922368249e-05, "loss": 1.1587, "step": 12750 }, { "epoch": 4.0, "grad_norm": 14.493890762329102, "learning_rate": 2.76994280225298e-05, "loss": 0.9799, "step": 12760 }, { "epoch": 4.0, "grad_norm": 10.324650764465332, "learning_rate": 2.7677596821377112e-05, "loss": 0.8191, "step": 12770 }, { "epoch": 4.0, "grad_norm": 13.755268096923828, "learning_rate": 2.7655765620224427e-05, "loss": 1.3153, "step": 12780 }, { "epoch": 4.0, "grad_norm": 15.701711654663086, "learning_rate": 2.763393441907174e-05, "loss": 1.2301, "step": 12790 }, { "epoch": 4.0, "grad_norm": 13.877677917480469, "learning_rate": 2.761210321791905e-05, "loss": 0.6759, "step": 12800 }, { "epoch": 4.0, "grad_norm": 14.830212593078613, "learning_rate": 2.7590272016766366e-05, "loss": 1.3668, "step": 12810 }, { "epoch": 4.0, "grad_norm": 21.4211483001709, "learning_rate": 2.7568440815613678e-05, "loss": 0.9864, "step": 12820 }, { "epoch": 4.0, "grad_norm": 1.2794026136398315, "learning_rate": 2.754660961446099e-05, "loss": 1.211, "step": 12830 }, { "epoch": 4.0, "grad_norm": 11.753496170043945, "learning_rate": 2.75247784133083e-05, "loss": 1.1536, "step": 12840 }, { "epoch": 4.0, "grad_norm": 8.25611686706543, "learning_rate": 2.7502947212155616e-05, "loss": 0.9888, "step": 12850 }, { "epoch": 4.01, "grad_norm": 11.044210433959961, "learning_rate": 2.7481116011002928e-05, "loss": 1.1196, "step": 12860 }, { "epoch": 4.01, "grad_norm": 13.520480155944824, "learning_rate": 2.745928480985024e-05, "loss": 1.2869, "step": 12870 }, { "epoch": 4.01, "grad_norm": 10.444195747375488, "learning_rate": 2.7437453608697548e-05, "loss": 1.083, "step": 12880 }, { "epoch": 4.01, "grad_norm": 13.263379096984863, "learning_rate": 2.7415622407544867e-05, "loss": 0.8396, "step": 12890 }, { "epoch": 4.01, "grad_norm": 10.427453994750977, "learning_rate": 2.739379120639218e-05, "loss": 1.1272, "step": 12900 }, { "epoch": 4.01, "grad_norm": 10.293091773986816, "learning_rate": 2.7371960005239487e-05, "loss": 1.057, "step": 12910 }, { "epoch": 4.01, "grad_norm": 7.65976095199585, "learning_rate": 2.73501288040868e-05, "loss": 1.245, "step": 12920 }, { "epoch": 4.01, "grad_norm": 9.03111457824707, "learning_rate": 2.7328297602934117e-05, "loss": 1.3535, "step": 12930 }, { "epoch": 4.01, "grad_norm": 13.248369216918945, "learning_rate": 2.730646640178143e-05, "loss": 1.1805, "step": 12940 }, { "epoch": 4.01, "grad_norm": 1.2417975664138794, "learning_rate": 2.7284635200628738e-05, "loss": 0.8926, "step": 12950 }, { "epoch": 4.01, "grad_norm": 9.869341850280762, "learning_rate": 2.726280399947605e-05, "loss": 1.1078, "step": 12960 }, { "epoch": 4.01, "grad_norm": 9.424198150634766, "learning_rate": 2.7240972798323368e-05, "loss": 1.0599, "step": 12970 }, { "epoch": 4.01, "grad_norm": 13.51994800567627, "learning_rate": 2.7219141597170676e-05, "loss": 1.5523, "step": 12980 }, { "epoch": 4.01, "grad_norm": 14.581307411193848, "learning_rate": 2.7197310396017988e-05, "loss": 1.1023, "step": 12990 }, { "epoch": 4.01, "grad_norm": 18.17917251586914, "learning_rate": 2.7175479194865307e-05, "loss": 1.2999, "step": 13000 }, { "epoch": 4.01, "grad_norm": 14.525318145751953, "learning_rate": 2.7153647993712615e-05, "loss": 1.078, "step": 13010 }, { "epoch": 4.01, "grad_norm": 13.585474967956543, "learning_rate": 2.7131816792559927e-05, "loss": 1.1142, "step": 13020 }, { "epoch": 4.01, "grad_norm": 12.350788116455078, "learning_rate": 2.710998559140724e-05, "loss": 1.0181, "step": 13030 }, { "epoch": 4.01, "grad_norm": 4.34970760345459, "learning_rate": 2.7088154390254554e-05, "loss": 1.0006, "step": 13040 }, { "epoch": 4.01, "grad_norm": 10.317882537841797, "learning_rate": 2.7066323189101866e-05, "loss": 1.2944, "step": 13050 }, { "epoch": 4.01, "grad_norm": 11.264039993286133, "learning_rate": 2.7044491987949177e-05, "loss": 0.8632, "step": 13060 }, { "epoch": 4.01, "grad_norm": 10.422079086303711, "learning_rate": 2.702266078679649e-05, "loss": 1.2094, "step": 13070 }, { "epoch": 4.01, "grad_norm": 14.387296676635742, "learning_rate": 2.7000829585643804e-05, "loss": 1.3005, "step": 13080 }, { "epoch": 4.01, "grad_norm": 16.9293212890625, "learning_rate": 2.6978998384491116e-05, "loss": 0.9649, "step": 13090 }, { "epoch": 4.01, "grad_norm": 11.597358703613281, "learning_rate": 2.6957167183338428e-05, "loss": 1.3514, "step": 13100 }, { "epoch": 4.02, "grad_norm": 15.312579154968262, "learning_rate": 2.693533598218574e-05, "loss": 1.6446, "step": 13110 }, { "epoch": 4.02, "grad_norm": 13.525115013122559, "learning_rate": 2.6913504781033055e-05, "loss": 1.0549, "step": 13120 }, { "epoch": 4.02, "grad_norm": 22.092679977416992, "learning_rate": 2.6891673579880367e-05, "loss": 1.3384, "step": 13130 }, { "epoch": 4.02, "grad_norm": 13.596874237060547, "learning_rate": 2.686984237872768e-05, "loss": 0.9413, "step": 13140 }, { "epoch": 4.02, "grad_norm": 8.80706787109375, "learning_rate": 2.684801117757499e-05, "loss": 1.3798, "step": 13150 }, { "epoch": 4.02, "grad_norm": 7.989864826202393, "learning_rate": 2.6826179976422305e-05, "loss": 1.35, "step": 13160 }, { "epoch": 4.02, "grad_norm": 6.565116882324219, "learning_rate": 2.6804348775269617e-05, "loss": 0.8361, "step": 13170 }, { "epoch": 4.02, "grad_norm": 7.901808738708496, "learning_rate": 2.678251757411693e-05, "loss": 0.9726, "step": 13180 }, { "epoch": 4.02, "grad_norm": 11.21402359008789, "learning_rate": 2.6760686372964244e-05, "loss": 0.9387, "step": 13190 }, { "epoch": 4.02, "grad_norm": 8.050765991210938, "learning_rate": 2.6738855171811556e-05, "loss": 0.8797, "step": 13200 }, { "epoch": 4.02, "grad_norm": 12.255356788635254, "learning_rate": 2.6717023970658868e-05, "loss": 1.4325, "step": 13210 }, { "epoch": 4.02, "grad_norm": 8.981575012207031, "learning_rate": 2.6695192769506176e-05, "loss": 1.2217, "step": 13220 }, { "epoch": 4.02, "grad_norm": 13.987320899963379, "learning_rate": 2.6673361568353495e-05, "loss": 1.8537, "step": 13230 }, { "epoch": 4.02, "grad_norm": 10.214347839355469, "learning_rate": 2.6651530367200806e-05, "loss": 1.0039, "step": 13240 }, { "epoch": 4.02, "grad_norm": 7.236724853515625, "learning_rate": 2.6629699166048118e-05, "loss": 0.7554, "step": 13250 }, { "epoch": 4.02, "grad_norm": 4.08578634262085, "learning_rate": 2.6607867964895427e-05, "loss": 0.9161, "step": 13260 }, { "epoch": 4.02, "grad_norm": 9.348335266113281, "learning_rate": 2.6586036763742745e-05, "loss": 0.9868, "step": 13270 }, { "epoch": 4.02, "grad_norm": 11.125860214233398, "learning_rate": 2.6564205562590057e-05, "loss": 1.0207, "step": 13280 }, { "epoch": 4.02, "grad_norm": 11.241005897521973, "learning_rate": 2.6542374361437365e-05, "loss": 1.1512, "step": 13290 }, { "epoch": 4.02, "grad_norm": 6.700471878051758, "learning_rate": 2.6520543160284677e-05, "loss": 0.8443, "step": 13300 }, { "epoch": 4.02, "grad_norm": 15.188118934631348, "learning_rate": 2.6498711959131996e-05, "loss": 1.3254, "step": 13310 }, { "epoch": 4.02, "grad_norm": 6.617821216583252, "learning_rate": 2.6476880757979304e-05, "loss": 1.2548, "step": 13320 }, { "epoch": 4.02, "grad_norm": 10.363046646118164, "learning_rate": 2.6455049556826616e-05, "loss": 1.4559, "step": 13330 }, { "epoch": 4.02, "grad_norm": 9.583874702453613, "learning_rate": 2.6433218355673928e-05, "loss": 0.8404, "step": 13340 }, { "epoch": 4.02, "grad_norm": 12.375665664672852, "learning_rate": 2.6411387154521243e-05, "loss": 0.8615, "step": 13350 }, { "epoch": 4.02, "grad_norm": 11.927233695983887, "learning_rate": 2.6389555953368555e-05, "loss": 1.386, "step": 13360 }, { "epoch": 4.03, "grad_norm": 12.068115234375, "learning_rate": 2.6367724752215866e-05, "loss": 1.2065, "step": 13370 }, { "epoch": 4.03, "grad_norm": 10.302019119262695, "learning_rate": 2.6345893551063185e-05, "loss": 1.3223, "step": 13380 }, { "epoch": 4.03, "grad_norm": 9.60232925415039, "learning_rate": 2.6324062349910493e-05, "loss": 0.6623, "step": 13390 }, { "epoch": 4.03, "grad_norm": 2.561321973800659, "learning_rate": 2.6302231148757805e-05, "loss": 1.0396, "step": 13400 }, { "epoch": 4.03, "grad_norm": 14.244486808776855, "learning_rate": 2.6280399947605117e-05, "loss": 1.1343, "step": 13410 }, { "epoch": 4.03, "grad_norm": 12.042131423950195, "learning_rate": 2.6258568746452432e-05, "loss": 1.6383, "step": 13420 }, { "epoch": 4.03, "grad_norm": 8.161518096923828, "learning_rate": 2.6236737545299744e-05, "loss": 1.1145, "step": 13430 }, { "epoch": 4.03, "grad_norm": 11.092758178710938, "learning_rate": 2.6214906344147056e-05, "loss": 1.0642, "step": 13440 }, { "epoch": 4.03, "grad_norm": 1.6890394687652588, "learning_rate": 2.6193075142994367e-05, "loss": 0.8629, "step": 13450 }, { "epoch": 4.03, "grad_norm": 12.74367618560791, "learning_rate": 2.6171243941841683e-05, "loss": 0.7256, "step": 13460 }, { "epoch": 4.03, "grad_norm": 9.57580852508545, "learning_rate": 2.6149412740688994e-05, "loss": 1.1888, "step": 13470 }, { "epoch": 4.03, "grad_norm": 10.403108596801758, "learning_rate": 2.6127581539536306e-05, "loss": 1.0024, "step": 13480 }, { "epoch": 4.03, "grad_norm": 12.243316650390625, "learning_rate": 2.6105750338383618e-05, "loss": 0.8442, "step": 13490 }, { "epoch": 4.03, "grad_norm": 10.62572956085205, "learning_rate": 2.6083919137230933e-05, "loss": 1.2976, "step": 13500 }, { "epoch": 4.03, "grad_norm": 10.540983200073242, "learning_rate": 2.6062087936078245e-05, "loss": 1.5247, "step": 13510 }, { "epoch": 4.03, "grad_norm": 29.210153579711914, "learning_rate": 2.6040256734925557e-05, "loss": 1.0384, "step": 13520 }, { "epoch": 4.03, "grad_norm": 4.831394195556641, "learning_rate": 2.6018425533772872e-05, "loss": 1.1008, "step": 13530 }, { "epoch": 4.03, "grad_norm": 15.153460502624512, "learning_rate": 2.5996594332620184e-05, "loss": 1.351, "step": 13540 }, { "epoch": 4.03, "grad_norm": 9.962759017944336, "learning_rate": 2.5974763131467495e-05, "loss": 1.248, "step": 13550 }, { "epoch": 4.03, "grad_norm": 12.451315879821777, "learning_rate": 2.5952931930314807e-05, "loss": 0.9991, "step": 13560 }, { "epoch": 4.03, "grad_norm": 9.538119316101074, "learning_rate": 2.5931100729162122e-05, "loss": 1.0672, "step": 13570 }, { "epoch": 4.03, "grad_norm": 6.621155261993408, "learning_rate": 2.5909269528009434e-05, "loss": 1.6283, "step": 13580 }, { "epoch": 4.03, "grad_norm": 22.773773193359375, "learning_rate": 2.5887438326856746e-05, "loss": 1.0594, "step": 13590 }, { "epoch": 4.03, "grad_norm": 7.35496187210083, "learning_rate": 2.5865607125704054e-05, "loss": 0.8314, "step": 13600 }, { "epoch": 4.03, "grad_norm": 9.718795776367188, "learning_rate": 2.5843775924551373e-05, "loss": 1.032, "step": 13610 }, { "epoch": 4.04, "grad_norm": 12.503546714782715, "learning_rate": 2.5821944723398685e-05, "loss": 1.3705, "step": 13620 }, { "epoch": 4.04, "grad_norm": 10.960770606994629, "learning_rate": 2.5800113522245993e-05, "loss": 1.2136, "step": 13630 }, { "epoch": 4.04, "grad_norm": 5.387396812438965, "learning_rate": 2.5778282321093305e-05, "loss": 1.0521, "step": 13640 }, { "epoch": 4.04, "grad_norm": 7.711702346801758, "learning_rate": 2.5756451119940623e-05, "loss": 1.1658, "step": 13650 }, { "epoch": 4.04, "grad_norm": 15.824799537658691, "learning_rate": 2.5734619918787932e-05, "loss": 1.1988, "step": 13660 }, { "epoch": 4.04, "grad_norm": 8.047748565673828, "learning_rate": 2.5712788717635244e-05, "loss": 1.148, "step": 13670 }, { "epoch": 4.04, "grad_norm": 7.71880578994751, "learning_rate": 2.5690957516482555e-05, "loss": 1.2351, "step": 13680 }, { "epoch": 4.04, "grad_norm": 3.572082757949829, "learning_rate": 2.5669126315329874e-05, "loss": 0.8479, "step": 13690 }, { "epoch": 4.04, "grad_norm": 12.815369606018066, "learning_rate": 2.5647295114177182e-05, "loss": 1.146, "step": 13700 }, { "epoch": 4.04, "grad_norm": 6.888113975524902, "learning_rate": 2.5625463913024494e-05, "loss": 1.5445, "step": 13710 }, { "epoch": 4.04, "grad_norm": 18.434946060180664, "learning_rate": 2.5603632711871813e-05, "loss": 1.0119, "step": 13720 }, { "epoch": 4.04, "grad_norm": 18.102935791015625, "learning_rate": 2.558180151071912e-05, "loss": 1.1144, "step": 13730 }, { "epoch": 4.04, "grad_norm": 11.363154411315918, "learning_rate": 2.5559970309566433e-05, "loss": 1.3157, "step": 13740 }, { "epoch": 4.04, "grad_norm": 12.518674850463867, "learning_rate": 2.5538139108413745e-05, "loss": 1.1991, "step": 13750 }, { "epoch": 4.04, "grad_norm": 11.948362350463867, "learning_rate": 2.551630790726106e-05, "loss": 0.8509, "step": 13760 }, { "epoch": 4.04, "grad_norm": 9.551739692687988, "learning_rate": 2.549447670610837e-05, "loss": 1.3175, "step": 13770 }, { "epoch": 4.04, "grad_norm": 23.302627563476562, "learning_rate": 2.5472645504955683e-05, "loss": 1.2973, "step": 13780 }, { "epoch": 4.04, "grad_norm": 18.783184051513672, "learning_rate": 2.5450814303802995e-05, "loss": 0.7363, "step": 13790 }, { "epoch": 4.04, "grad_norm": 10.384957313537598, "learning_rate": 2.542898310265031e-05, "loss": 1.4546, "step": 13800 }, { "epoch": 4.04, "grad_norm": 18.749160766601562, "learning_rate": 2.5407151901497622e-05, "loss": 1.033, "step": 13810 }, { "epoch": 4.04, "grad_norm": 7.442930221557617, "learning_rate": 2.5385320700344934e-05, "loss": 1.2899, "step": 13820 }, { "epoch": 4.04, "grad_norm": 12.670392036437988, "learning_rate": 2.5363489499192246e-05, "loss": 0.7839, "step": 13830 }, { "epoch": 4.04, "grad_norm": 6.5534257888793945, "learning_rate": 2.534165829803956e-05, "loss": 1.0528, "step": 13840 }, { "epoch": 4.04, "grad_norm": 15.77134895324707, "learning_rate": 2.5319827096886873e-05, "loss": 0.9637, "step": 13850 }, { "epoch": 4.04, "grad_norm": 15.274126052856445, "learning_rate": 2.5297995895734184e-05, "loss": 1.0434, "step": 13860 }, { "epoch": 4.05, "grad_norm": 14.11375617980957, "learning_rate": 2.5276164694581496e-05, "loss": 0.9472, "step": 13870 }, { "epoch": 4.05, "grad_norm": 16.868555068969727, "learning_rate": 2.525433349342881e-05, "loss": 1.5396, "step": 13880 }, { "epoch": 4.05, "grad_norm": 8.889870643615723, "learning_rate": 2.5232502292276123e-05, "loss": 1.4214, "step": 13890 }, { "epoch": 4.05, "grad_norm": 8.958718299865723, "learning_rate": 2.5210671091123435e-05, "loss": 0.8635, "step": 13900 }, { "epoch": 4.05, "grad_norm": 7.622828960418701, "learning_rate": 2.518883988997075e-05, "loss": 1.0823, "step": 13910 }, { "epoch": 4.05, "grad_norm": 7.238192558288574, "learning_rate": 2.5167008688818062e-05, "loss": 0.9808, "step": 13920 }, { "epoch": 4.05, "grad_norm": 14.741711616516113, "learning_rate": 2.5145177487665374e-05, "loss": 1.0875, "step": 13930 }, { "epoch": 4.05, "grad_norm": 27.616342544555664, "learning_rate": 2.5123346286512682e-05, "loss": 1.57, "step": 13940 }, { "epoch": 4.05, "grad_norm": 0.8020779490470886, "learning_rate": 2.510151508536e-05, "loss": 0.9303, "step": 13950 }, { "epoch": 4.05, "grad_norm": 12.362224578857422, "learning_rate": 2.5079683884207312e-05, "loss": 0.7828, "step": 13960 }, { "epoch": 4.05, "grad_norm": 16.245460510253906, "learning_rate": 2.505785268305462e-05, "loss": 1.3605, "step": 13970 }, { "epoch": 4.05, "grad_norm": 9.894335746765137, "learning_rate": 2.5036021481901933e-05, "loss": 1.455, "step": 13980 }, { "epoch": 4.05, "grad_norm": 8.316423416137695, "learning_rate": 2.501419028074925e-05, "loss": 0.9514, "step": 13990 }, { "epoch": 4.05, "grad_norm": 1.1758034229278564, "learning_rate": 2.4992359079596563e-05, "loss": 0.821, "step": 14000 }, { "epoch": 4.05, "grad_norm": 7.598605632781982, "learning_rate": 2.497052787844387e-05, "loss": 1.4293, "step": 14010 }, { "epoch": 4.05, "grad_norm": 11.498948097229004, "learning_rate": 2.4948696677291186e-05, "loss": 0.9355, "step": 14020 }, { "epoch": 4.05, "grad_norm": 1.6274681091308594, "learning_rate": 2.4926865476138498e-05, "loss": 0.8916, "step": 14030 }, { "epoch": 4.05, "grad_norm": 8.756082534790039, "learning_rate": 2.490503427498581e-05, "loss": 1.1911, "step": 14040 }, { "epoch": 4.05, "grad_norm": 7.499686241149902, "learning_rate": 2.4883203073833122e-05, "loss": 0.9112, "step": 14050 }, { "epoch": 4.05, "grad_norm": 6.491840362548828, "learning_rate": 2.4861371872680437e-05, "loss": 0.9151, "step": 14060 }, { "epoch": 4.05, "grad_norm": 6.883969306945801, "learning_rate": 2.483954067152775e-05, "loss": 0.7545, "step": 14070 }, { "epoch": 4.05, "grad_norm": 7.478753566741943, "learning_rate": 2.481770947037506e-05, "loss": 1.2034, "step": 14080 }, { "epoch": 4.05, "grad_norm": 13.122247695922852, "learning_rate": 2.4795878269222376e-05, "loss": 1.1106, "step": 14090 }, { "epoch": 4.05, "grad_norm": 11.167612075805664, "learning_rate": 2.4774047068069687e-05, "loss": 0.9138, "step": 14100 }, { "epoch": 4.05, "grad_norm": 9.339203834533691, "learning_rate": 2.4752215866917e-05, "loss": 0.8419, "step": 14110 }, { "epoch": 4.05, "grad_norm": 10.59921646118164, "learning_rate": 2.473038466576431e-05, "loss": 1.212, "step": 14120 }, { "epoch": 4.06, "grad_norm": 20.258886337280273, "learning_rate": 2.4708553464611626e-05, "loss": 1.2686, "step": 14130 }, { "epoch": 4.06, "grad_norm": 6.651147365570068, "learning_rate": 2.4686722263458935e-05, "loss": 1.1228, "step": 14140 }, { "epoch": 4.06, "grad_norm": 9.203714370727539, "learning_rate": 2.466489106230625e-05, "loss": 1.1968, "step": 14150 }, { "epoch": 4.06, "grad_norm": 8.39387035369873, "learning_rate": 2.464305986115356e-05, "loss": 0.8097, "step": 14160 }, { "epoch": 4.06, "grad_norm": 6.408330917358398, "learning_rate": 2.4621228660000877e-05, "loss": 1.2068, "step": 14170 }, { "epoch": 4.06, "grad_norm": 7.197762966156006, "learning_rate": 2.4599397458848185e-05, "loss": 1.1027, "step": 14180 }, { "epoch": 4.06, "grad_norm": 16.30938148498535, "learning_rate": 2.45775662576955e-05, "loss": 0.9763, "step": 14190 }, { "epoch": 4.06, "grad_norm": 8.004793167114258, "learning_rate": 2.4555735056542812e-05, "loss": 1.1766, "step": 14200 }, { "epoch": 4.06, "grad_norm": 12.945077896118164, "learning_rate": 2.4533903855390124e-05, "loss": 0.9756, "step": 14210 }, { "epoch": 4.06, "grad_norm": 14.830296516418457, "learning_rate": 2.4512072654237436e-05, "loss": 1.4361, "step": 14220 }, { "epoch": 4.06, "grad_norm": 3.220728635787964, "learning_rate": 2.449024145308475e-05, "loss": 0.9451, "step": 14230 }, { "epoch": 4.06, "grad_norm": 9.93178653717041, "learning_rate": 2.4468410251932063e-05, "loss": 1.1347, "step": 14240 }, { "epoch": 4.06, "grad_norm": 17.511709213256836, "learning_rate": 2.4446579050779374e-05, "loss": 0.8403, "step": 14250 }, { "epoch": 4.06, "grad_norm": 7.697203159332275, "learning_rate": 2.4424747849626686e-05, "loss": 0.7599, "step": 14260 }, { "epoch": 4.06, "grad_norm": 6.824232578277588, "learning_rate": 2.4402916648474e-05, "loss": 0.9953, "step": 14270 }, { "epoch": 4.06, "grad_norm": 12.450435638427734, "learning_rate": 2.4381085447321313e-05, "loss": 1.1575, "step": 14280 }, { "epoch": 4.06, "grad_norm": 2.7066357135772705, "learning_rate": 2.4359254246168625e-05, "loss": 0.7812, "step": 14290 }, { "epoch": 4.06, "grad_norm": 11.865926742553711, "learning_rate": 2.433742304501594e-05, "loss": 1.3358, "step": 14300 }, { "epoch": 4.06, "grad_norm": 8.115141868591309, "learning_rate": 2.4315591843863252e-05, "loss": 1.1591, "step": 14310 }, { "epoch": 4.06, "grad_norm": 7.908416271209717, "learning_rate": 2.4293760642710564e-05, "loss": 1.4353, "step": 14320 }, { "epoch": 4.06, "grad_norm": 6.612397193908691, "learning_rate": 2.4271929441557875e-05, "loss": 1.027, "step": 14330 }, { "epoch": 4.06, "grad_norm": 6.981642723083496, "learning_rate": 2.425009824040519e-05, "loss": 1.0294, "step": 14340 }, { "epoch": 4.06, "grad_norm": 13.808174133300781, "learning_rate": 2.42282670392525e-05, "loss": 1.1126, "step": 14350 }, { "epoch": 4.06, "grad_norm": 9.667133331298828, "learning_rate": 2.4206435838099814e-05, "loss": 1.0839, "step": 14360 }, { "epoch": 4.06, "grad_norm": 7.177286148071289, "learning_rate": 2.4184604636947126e-05, "loss": 1.2526, "step": 14370 }, { "epoch": 4.07, "grad_norm": 10.164685249328613, "learning_rate": 2.4162773435794438e-05, "loss": 1.0603, "step": 14380 }, { "epoch": 4.07, "grad_norm": 16.028968811035156, "learning_rate": 2.414094223464175e-05, "loss": 1.2508, "step": 14390 }, { "epoch": 4.07, "grad_norm": 6.405318260192871, "learning_rate": 2.4119111033489065e-05, "loss": 1.2076, "step": 14400 }, { "epoch": 4.07, "grad_norm": 13.187591552734375, "learning_rate": 2.4097279832336376e-05, "loss": 1.2774, "step": 14410 }, { "epoch": 4.07, "grad_norm": 14.681076049804688, "learning_rate": 2.4075448631183688e-05, "loss": 1.5274, "step": 14420 }, { "epoch": 4.07, "grad_norm": 17.252317428588867, "learning_rate": 2.4053617430031e-05, "loss": 1.1504, "step": 14430 }, { "epoch": 4.07, "grad_norm": 7.880256175994873, "learning_rate": 2.4031786228878315e-05, "loss": 0.9457, "step": 14440 }, { "epoch": 4.07, "grad_norm": 2.7197632789611816, "learning_rate": 2.4009955027725627e-05, "loss": 1.2914, "step": 14450 }, { "epoch": 4.07, "grad_norm": 13.747255325317383, "learning_rate": 2.398812382657294e-05, "loss": 1.2819, "step": 14460 }, { "epoch": 4.07, "grad_norm": 12.134635925292969, "learning_rate": 2.3966292625420254e-05, "loss": 1.0547, "step": 14470 }, { "epoch": 4.07, "grad_norm": 14.555671691894531, "learning_rate": 2.3944461424267566e-05, "loss": 0.9987, "step": 14480 }, { "epoch": 4.07, "grad_norm": 16.17243003845215, "learning_rate": 2.3922630223114877e-05, "loss": 0.9806, "step": 14490 }, { "epoch": 4.07, "grad_norm": 8.929733276367188, "learning_rate": 2.390079902196219e-05, "loss": 1.041, "step": 14500 }, { "epoch": 4.07, "grad_norm": 2.922783136367798, "learning_rate": 2.3878967820809504e-05, "loss": 1.0527, "step": 14510 }, { "epoch": 4.07, "grad_norm": 16.46654510498047, "learning_rate": 2.3857136619656813e-05, "loss": 1.0505, "step": 14520 }, { "epoch": 4.07, "grad_norm": 10.181805610656738, "learning_rate": 2.3835305418504128e-05, "loss": 1.196, "step": 14530 }, { "epoch": 4.07, "grad_norm": 9.33940601348877, "learning_rate": 2.381347421735144e-05, "loss": 1.0101, "step": 14540 }, { "epoch": 4.07, "grad_norm": 15.194351196289062, "learning_rate": 2.379164301619875e-05, "loss": 1.4859, "step": 14550 }, { "epoch": 4.07, "grad_norm": 10.166406631469727, "learning_rate": 2.3769811815046063e-05, "loss": 1.5657, "step": 14560 }, { "epoch": 4.07, "grad_norm": 5.704379558563232, "learning_rate": 2.374798061389338e-05, "loss": 0.958, "step": 14570 }, { "epoch": 4.07, "grad_norm": 8.521444320678711, "learning_rate": 2.372614941274069e-05, "loss": 1.209, "step": 14580 }, { "epoch": 4.07, "grad_norm": 6.480230808258057, "learning_rate": 2.3704318211588002e-05, "loss": 1.1421, "step": 14590 }, { "epoch": 4.07, "grad_norm": 8.353271484375, "learning_rate": 2.3682487010435314e-05, "loss": 1.363, "step": 14600 }, { "epoch": 4.07, "grad_norm": 8.387436866760254, "learning_rate": 2.366065580928263e-05, "loss": 1.0814, "step": 14610 }, { "epoch": 4.07, "grad_norm": 13.440296173095703, "learning_rate": 2.363882460812994e-05, "loss": 0.709, "step": 14620 }, { "epoch": 4.07, "grad_norm": 12.094486236572266, "learning_rate": 2.3616993406977253e-05, "loss": 1.4111, "step": 14630 }, { "epoch": 4.08, "grad_norm": 9.441247940063477, "learning_rate": 2.3595162205824568e-05, "loss": 1.0099, "step": 14640 }, { "epoch": 4.08, "grad_norm": 1.7629671096801758, "learning_rate": 2.357333100467188e-05, "loss": 1.6077, "step": 14650 }, { "epoch": 4.08, "grad_norm": 12.599310874938965, "learning_rate": 2.355149980351919e-05, "loss": 0.7662, "step": 14660 }, { "epoch": 4.08, "grad_norm": 11.41132926940918, "learning_rate": 2.3529668602366503e-05, "loss": 1.4468, "step": 14670 }, { "epoch": 4.08, "grad_norm": 9.263294219970703, "learning_rate": 2.350783740121382e-05, "loss": 1.2558, "step": 14680 }, { "epoch": 4.08, "grad_norm": 11.410943031311035, "learning_rate": 2.3486006200061127e-05, "loss": 1.0088, "step": 14690 }, { "epoch": 4.08, "grad_norm": 8.209218978881836, "learning_rate": 2.3464174998908442e-05, "loss": 1.0955, "step": 14700 }, { "epoch": 4.08, "grad_norm": 10.404569625854492, "learning_rate": 2.3442343797755754e-05, "loss": 1.1111, "step": 14710 }, { "epoch": 4.08, "grad_norm": 13.431899070739746, "learning_rate": 2.3420512596603065e-05, "loss": 1.1112, "step": 14720 }, { "epoch": 4.08, "grad_norm": 13.179576873779297, "learning_rate": 2.3398681395450377e-05, "loss": 1.3517, "step": 14730 }, { "epoch": 4.08, "grad_norm": 8.080814361572266, "learning_rate": 2.3376850194297692e-05, "loss": 1.8226, "step": 14740 }, { "epoch": 4.08, "grad_norm": 14.013104438781738, "learning_rate": 2.3355018993145004e-05, "loss": 1.2565, "step": 14750 }, { "epoch": 4.08, "grad_norm": 12.315030097961426, "learning_rate": 2.3333187791992316e-05, "loss": 0.9305, "step": 14760 }, { "epoch": 4.08, "grad_norm": 8.574878692626953, "learning_rate": 2.3311356590839628e-05, "loss": 0.8032, "step": 14770 }, { "epoch": 4.08, "grad_norm": 3.478322982788086, "learning_rate": 2.3289525389686943e-05, "loss": 0.8846, "step": 14780 }, { "epoch": 4.08, "grad_norm": 12.565533638000488, "learning_rate": 2.3267694188534255e-05, "loss": 0.9044, "step": 14790 }, { "epoch": 4.08, "grad_norm": 13.900773048400879, "learning_rate": 2.3245862987381566e-05, "loss": 1.143, "step": 14800 }, { "epoch": 4.08, "grad_norm": 9.95212459564209, "learning_rate": 2.3224031786228878e-05, "loss": 1.2195, "step": 14810 }, { "epoch": 4.08, "grad_norm": 2.379199981689453, "learning_rate": 2.3202200585076193e-05, "loss": 1.1116, "step": 14820 }, { "epoch": 4.08, "grad_norm": 12.756540298461914, "learning_rate": 2.3180369383923505e-05, "loss": 0.744, "step": 14830 }, { "epoch": 4.08, "grad_norm": 3.8526463508605957, "learning_rate": 2.3158538182770817e-05, "loss": 0.9885, "step": 14840 }, { "epoch": 4.08, "grad_norm": 7.764082431793213, "learning_rate": 2.3136706981618132e-05, "loss": 0.6559, "step": 14850 }, { "epoch": 4.08, "grad_norm": 10.631293296813965, "learning_rate": 2.311487578046544e-05, "loss": 1.2118, "step": 14860 }, { "epoch": 4.08, "grad_norm": 3.7059028148651123, "learning_rate": 2.3093044579312756e-05, "loss": 1.3401, "step": 14870 }, { "epoch": 4.08, "grad_norm": 13.204307556152344, "learning_rate": 2.3071213378160068e-05, "loss": 1.0929, "step": 14880 }, { "epoch": 4.09, "grad_norm": 18.120487213134766, "learning_rate": 2.304938217700738e-05, "loss": 0.9377, "step": 14890 }, { "epoch": 4.09, "grad_norm": 13.828512191772461, "learning_rate": 2.302755097585469e-05, "loss": 1.0495, "step": 14900 }, { "epoch": 4.09, "grad_norm": 10.844417572021484, "learning_rate": 2.3005719774702006e-05, "loss": 1.45, "step": 14910 }, { "epoch": 4.09, "grad_norm": 13.730113983154297, "learning_rate": 2.2983888573549318e-05, "loss": 0.8808, "step": 14920 }, { "epoch": 4.09, "grad_norm": 5.513245582580566, "learning_rate": 2.296205737239663e-05, "loss": 1.196, "step": 14930 }, { "epoch": 4.09, "grad_norm": 9.067357063293457, "learning_rate": 2.294022617124394e-05, "loss": 0.8421, "step": 14940 }, { "epoch": 4.09, "grad_norm": 16.54936981201172, "learning_rate": 2.2918394970091257e-05, "loss": 1.4222, "step": 14950 }, { "epoch": 4.09, "grad_norm": 15.98849868774414, "learning_rate": 2.289656376893857e-05, "loss": 1.2646, "step": 14960 }, { "epoch": 4.09, "grad_norm": 7.36518669128418, "learning_rate": 2.287473256778588e-05, "loss": 0.6694, "step": 14970 }, { "epoch": 4.09, "grad_norm": 8.441777229309082, "learning_rate": 2.2852901366633192e-05, "loss": 1.011, "step": 14980 }, { "epoch": 4.09, "grad_norm": 1.9819217920303345, "learning_rate": 2.2831070165480507e-05, "loss": 1.0736, "step": 14990 }, { "epoch": 4.09, "grad_norm": 3.026094436645508, "learning_rate": 2.2809238964327816e-05, "loss": 1.0935, "step": 15000 }, { "epoch": 4.09, "grad_norm": 12.656912803649902, "learning_rate": 2.278740776317513e-05, "loss": 0.9118, "step": 15010 }, { "epoch": 4.09, "grad_norm": 8.679171562194824, "learning_rate": 2.2765576562022446e-05, "loss": 0.9717, "step": 15020 }, { "epoch": 4.09, "grad_norm": 12.294328689575195, "learning_rate": 2.2743745360869754e-05, "loss": 0.584, "step": 15030 }, { "epoch": 4.09, "grad_norm": 7.023477077484131, "learning_rate": 2.272191415971707e-05, "loss": 0.8437, "step": 15040 }, { "epoch": 4.09, "grad_norm": 2.457037925720215, "learning_rate": 2.270008295856438e-05, "loss": 0.7175, "step": 15050 }, { "epoch": 4.09, "grad_norm": 14.272405624389648, "learning_rate": 2.2678251757411693e-05, "loss": 1.1087, "step": 15060 }, { "epoch": 4.09, "grad_norm": 16.612394332885742, "learning_rate": 2.2656420556259005e-05, "loss": 1.1998, "step": 15070 }, { "epoch": 4.09, "grad_norm": 9.487042427062988, "learning_rate": 2.263458935510632e-05, "loss": 1.2095, "step": 15080 }, { "epoch": 4.09, "grad_norm": 16.1021785736084, "learning_rate": 2.2612758153953632e-05, "loss": 1.2077, "step": 15090 }, { "epoch": 4.09, "grad_norm": 6.269824981689453, "learning_rate": 2.2590926952800944e-05, "loss": 1.4958, "step": 15100 }, { "epoch": 4.09, "grad_norm": 1.760103702545166, "learning_rate": 2.2569095751648255e-05, "loss": 0.8853, "step": 15110 }, { "epoch": 4.09, "grad_norm": 18.436809539794922, "learning_rate": 2.254726455049557e-05, "loss": 1.2572, "step": 15120 }, { "epoch": 4.09, "grad_norm": 10.232084274291992, "learning_rate": 2.2525433349342882e-05, "loss": 1.0609, "step": 15130 }, { "epoch": 4.09, "grad_norm": 26.723386764526367, "learning_rate": 2.2503602148190194e-05, "loss": 0.8545, "step": 15140 }, { "epoch": 4.1, "grad_norm": 10.723196029663086, "learning_rate": 2.2481770947037506e-05, "loss": 1.0456, "step": 15150 }, { "epoch": 4.1, "grad_norm": 12.38196086883545, "learning_rate": 2.245993974588482e-05, "loss": 0.9142, "step": 15160 }, { "epoch": 4.1, "grad_norm": 1.245741844177246, "learning_rate": 2.243810854473213e-05, "loss": 1.2001, "step": 15170 }, { "epoch": 4.1, "grad_norm": 0.7662307024002075, "learning_rate": 2.2416277343579445e-05, "loss": 1.31, "step": 15180 }, { "epoch": 4.1, "grad_norm": 5.021573066711426, "learning_rate": 2.239444614242676e-05, "loss": 0.714, "step": 15190 }, { "epoch": 4.1, "grad_norm": 9.740928649902344, "learning_rate": 2.2372614941274068e-05, "loss": 1.7255, "step": 15200 }, { "epoch": 4.1, "grad_norm": 14.394036293029785, "learning_rate": 2.2350783740121383e-05, "loss": 0.976, "step": 15210 }, { "epoch": 4.1, "grad_norm": 7.137668609619141, "learning_rate": 2.2328952538968695e-05, "loss": 1.0424, "step": 15220 }, { "epoch": 4.1, "grad_norm": 5.691535949707031, "learning_rate": 2.230712133781601e-05, "loss": 0.893, "step": 15230 }, { "epoch": 4.1, "grad_norm": 6.356419086456299, "learning_rate": 2.228529013666332e-05, "loss": 1.4814, "step": 15240 }, { "epoch": 4.1, "grad_norm": 8.038843154907227, "learning_rate": 2.2263458935510634e-05, "loss": 1.0241, "step": 15250 }, { "epoch": 4.1, "grad_norm": 17.072019577026367, "learning_rate": 2.2241627734357946e-05, "loss": 1.0734, "step": 15260 }, { "epoch": 4.1, "grad_norm": 4.3296732902526855, "learning_rate": 2.2219796533205258e-05, "loss": 0.8488, "step": 15270 }, { "epoch": 4.1, "grad_norm": 14.016191482543945, "learning_rate": 2.219796533205257e-05, "loss": 0.84, "step": 15280 }, { "epoch": 4.1, "grad_norm": 12.977688789367676, "learning_rate": 2.2176134130899885e-05, "loss": 1.1895, "step": 15290 }, { "epoch": 4.1, "grad_norm": 7.598196506500244, "learning_rate": 2.2154302929747196e-05, "loss": 1.3503, "step": 15300 }, { "epoch": 4.1, "grad_norm": 13.575299263000488, "learning_rate": 2.2132471728594508e-05, "loss": 1.188, "step": 15310 }, { "epoch": 4.1, "grad_norm": 15.606423377990723, "learning_rate": 2.211064052744182e-05, "loss": 1.2059, "step": 15320 }, { "epoch": 4.1, "grad_norm": 16.593473434448242, "learning_rate": 2.2088809326289135e-05, "loss": 1.2121, "step": 15330 }, { "epoch": 4.1, "grad_norm": 20.780080795288086, "learning_rate": 2.2066978125136443e-05, "loss": 1.1756, "step": 15340 }, { "epoch": 4.1, "grad_norm": 7.214114189147949, "learning_rate": 2.204514692398376e-05, "loss": 1.1789, "step": 15350 }, { "epoch": 4.1, "grad_norm": 5.472196102142334, "learning_rate": 2.202331572283107e-05, "loss": 0.9039, "step": 15360 }, { "epoch": 4.1, "grad_norm": 17.215797424316406, "learning_rate": 2.2001484521678382e-05, "loss": 1.0978, "step": 15370 }, { "epoch": 4.1, "grad_norm": 10.053537368774414, "learning_rate": 2.1979653320525697e-05, "loss": 1.0302, "step": 15380 }, { "epoch": 4.1, "grad_norm": 11.19655990600586, "learning_rate": 2.195782211937301e-05, "loss": 1.0611, "step": 15390 }, { "epoch": 4.11, "grad_norm": 7.358768463134766, "learning_rate": 2.1935990918220324e-05, "loss": 1.2858, "step": 15400 }, { "epoch": 4.11, "grad_norm": 5.900850772857666, "learning_rate": 2.1914159717067633e-05, "loss": 1.4144, "step": 15410 }, { "epoch": 4.11, "grad_norm": 9.213544845581055, "learning_rate": 2.1892328515914948e-05, "loss": 1.2379, "step": 15420 }, { "epoch": 4.11, "grad_norm": 1.5337578058242798, "learning_rate": 2.187049731476226e-05, "loss": 1.2671, "step": 15430 }, { "epoch": 4.11, "grad_norm": 4.450963497161865, "learning_rate": 2.184866611360957e-05, "loss": 0.7194, "step": 15440 }, { "epoch": 4.11, "grad_norm": 4.569442272186279, "learning_rate": 2.1826834912456883e-05, "loss": 1.0461, "step": 15450 }, { "epoch": 4.11, "grad_norm": 9.535881996154785, "learning_rate": 2.18050037113042e-05, "loss": 1.4377, "step": 15460 }, { "epoch": 4.11, "grad_norm": 1.6328961849212646, "learning_rate": 2.178317251015151e-05, "loss": 0.9351, "step": 15470 }, { "epoch": 4.11, "grad_norm": 16.266902923583984, "learning_rate": 2.1761341308998822e-05, "loss": 1.1276, "step": 15480 }, { "epoch": 4.11, "grad_norm": 6.932487487792969, "learning_rate": 2.1739510107846134e-05, "loss": 0.8495, "step": 15490 }, { "epoch": 4.11, "grad_norm": 11.423697471618652, "learning_rate": 2.171767890669345e-05, "loss": 1.3244, "step": 15500 }, { "epoch": 4.11, "grad_norm": 7.710949897766113, "learning_rate": 2.1695847705540757e-05, "loss": 0.7287, "step": 15510 }, { "epoch": 4.11, "grad_norm": 11.07575798034668, "learning_rate": 2.1674016504388072e-05, "loss": 1.626, "step": 15520 }, { "epoch": 4.11, "grad_norm": 20.116147994995117, "learning_rate": 2.1652185303235384e-05, "loss": 1.4789, "step": 15530 }, { "epoch": 4.11, "grad_norm": 13.951581001281738, "learning_rate": 2.16303541020827e-05, "loss": 1.5676, "step": 15540 }, { "epoch": 4.11, "grad_norm": 8.60759449005127, "learning_rate": 2.1608522900930008e-05, "loss": 1.3723, "step": 15550 }, { "epoch": 4.11, "grad_norm": 7.1869683265686035, "learning_rate": 2.1586691699777323e-05, "loss": 1.0133, "step": 15560 }, { "epoch": 4.11, "grad_norm": 15.143035888671875, "learning_rate": 2.1564860498624638e-05, "loss": 1.3139, "step": 15570 }, { "epoch": 4.11, "grad_norm": 5.7337822914123535, "learning_rate": 2.1543029297471947e-05, "loss": 1.1527, "step": 15580 }, { "epoch": 4.11, "grad_norm": 1.8082867860794067, "learning_rate": 2.1521198096319262e-05, "loss": 0.7519, "step": 15590 }, { "epoch": 4.11, "grad_norm": 8.28132438659668, "learning_rate": 2.1499366895166573e-05, "loss": 1.0078, "step": 15600 }, { "epoch": 4.11, "grad_norm": 10.866012573242188, "learning_rate": 2.1477535694013885e-05, "loss": 1.1509, "step": 15610 }, { "epoch": 4.11, "grad_norm": 10.80540657043457, "learning_rate": 2.1455704492861197e-05, "loss": 0.9838, "step": 15620 }, { "epoch": 4.11, "grad_norm": 7.286997318267822, "learning_rate": 2.1433873291708512e-05, "loss": 0.8202, "step": 15630 }, { "epoch": 4.11, "grad_norm": 6.843558311462402, "learning_rate": 2.1412042090555824e-05, "loss": 1.1174, "step": 15640 }, { "epoch": 4.11, "grad_norm": 2.8200693130493164, "learning_rate": 2.1390210889403136e-05, "loss": 0.6846, "step": 15650 }, { "epoch": 4.12, "grad_norm": 12.176255226135254, "learning_rate": 2.1368379688250448e-05, "loss": 0.8305, "step": 15660 }, { "epoch": 4.12, "grad_norm": 16.528547286987305, "learning_rate": 2.1346548487097763e-05, "loss": 1.6058, "step": 15670 }, { "epoch": 4.12, "grad_norm": 12.43004322052002, "learning_rate": 2.132471728594507e-05, "loss": 1.1798, "step": 15680 }, { "epoch": 4.12, "grad_norm": 15.097733497619629, "learning_rate": 2.1302886084792386e-05, "loss": 1.0834, "step": 15690 }, { "epoch": 4.12, "grad_norm": 1.498320460319519, "learning_rate": 2.1281054883639698e-05, "loss": 0.8029, "step": 15700 }, { "epoch": 4.12, "grad_norm": 15.281813621520996, "learning_rate": 2.1259223682487013e-05, "loss": 1.3136, "step": 15710 }, { "epoch": 4.12, "grad_norm": 11.053997039794922, "learning_rate": 2.123739248133432e-05, "loss": 1.0811, "step": 15720 }, { "epoch": 4.12, "grad_norm": 6.765857696533203, "learning_rate": 2.1215561280181637e-05, "loss": 0.9225, "step": 15730 }, { "epoch": 4.12, "grad_norm": 11.498769760131836, "learning_rate": 2.119373007902895e-05, "loss": 1.2389, "step": 15740 }, { "epoch": 4.12, "grad_norm": 15.940476417541504, "learning_rate": 2.117189887787626e-05, "loss": 1.2454, "step": 15750 }, { "epoch": 4.12, "grad_norm": 12.104440689086914, "learning_rate": 2.1150067676723576e-05, "loss": 1.1084, "step": 15760 }, { "epoch": 4.12, "grad_norm": 8.03203296661377, "learning_rate": 2.1128236475570887e-05, "loss": 0.9975, "step": 15770 }, { "epoch": 4.12, "grad_norm": 12.16697883605957, "learning_rate": 2.11064052744182e-05, "loss": 1.5036, "step": 15780 }, { "epoch": 4.12, "grad_norm": 7.02891206741333, "learning_rate": 2.108457407326551e-05, "loss": 0.8032, "step": 15790 }, { "epoch": 4.12, "grad_norm": 4.733098030090332, "learning_rate": 2.1062742872112826e-05, "loss": 0.7854, "step": 15800 }, { "epoch": 4.12, "grad_norm": 25.10373878479004, "learning_rate": 2.1040911670960138e-05, "loss": 1.3777, "step": 15810 }, { "epoch": 4.12, "grad_norm": 12.570293426513672, "learning_rate": 2.101908046980745e-05, "loss": 0.7527, "step": 15820 }, { "epoch": 4.12, "grad_norm": 7.165543079376221, "learning_rate": 2.099724926865476e-05, "loss": 0.765, "step": 15830 }, { "epoch": 4.12, "grad_norm": 9.649480819702148, "learning_rate": 2.0975418067502077e-05, "loss": 0.9119, "step": 15840 }, { "epoch": 4.12, "grad_norm": 17.62099266052246, "learning_rate": 2.095358686634939e-05, "loss": 1.2128, "step": 15850 }, { "epoch": 4.12, "grad_norm": 10.959212303161621, "learning_rate": 2.09317556651967e-05, "loss": 1.1109, "step": 15860 }, { "epoch": 4.12, "grad_norm": 8.288532257080078, "learning_rate": 2.0909924464044012e-05, "loss": 1.1385, "step": 15870 }, { "epoch": 4.12, "grad_norm": 1.937004804611206, "learning_rate": 2.0888093262891327e-05, "loss": 1.3, "step": 15880 }, { "epoch": 4.12, "grad_norm": 10.40786361694336, "learning_rate": 2.0866262061738636e-05, "loss": 1.3697, "step": 15890 }, { "epoch": 4.12, "grad_norm": 10.689018249511719, "learning_rate": 2.084443086058595e-05, "loss": 1.0752, "step": 15900 }, { "epoch": 4.12, "eval_accuracy": 0.6640735502121641, "eval_loss": 1.1753077507019043, "eval_runtime": 376.4251, "eval_samples_per_second": 11.269, "eval_steps_per_second": 2.819, "step": 15905 }, { "epoch": 5.0, "grad_norm": 10.574934005737305, "learning_rate": 2.0822599659433262e-05, "loss": 1.2747, "step": 15910 }, { "epoch": 5.0, "grad_norm": 12.970417022705078, "learning_rate": 2.0800768458280574e-05, "loss": 1.0719, "step": 15920 }, { "epoch": 5.0, "grad_norm": 13.848506927490234, "learning_rate": 2.077893725712789e-05, "loss": 0.9597, "step": 15930 }, { "epoch": 5.0, "grad_norm": 9.054560661315918, "learning_rate": 2.07571060559752e-05, "loss": 1.0381, "step": 15940 }, { "epoch": 5.0, "grad_norm": 15.943974494934082, "learning_rate": 2.0735274854822513e-05, "loss": 0.8905, "step": 15950 }, { "epoch": 5.0, "grad_norm": 7.103931903839111, "learning_rate": 2.0713443653669825e-05, "loss": 0.8657, "step": 15960 }, { "epoch": 5.0, "grad_norm": 4.462408542633057, "learning_rate": 2.069161245251714e-05, "loss": 0.8572, "step": 15970 }, { "epoch": 5.0, "grad_norm": 7.564704418182373, "learning_rate": 2.0669781251364452e-05, "loss": 1.1179, "step": 15980 }, { "epoch": 5.0, "grad_norm": 32.66106414794922, "learning_rate": 2.0647950050211764e-05, "loss": 0.9179, "step": 15990 }, { "epoch": 5.0, "grad_norm": 8.110369682312012, "learning_rate": 2.0626118849059075e-05, "loss": 1.1048, "step": 16000 }, { "epoch": 5.0, "grad_norm": 15.266033172607422, "learning_rate": 2.060428764790639e-05, "loss": 1.1136, "step": 16010 }, { "epoch": 5.0, "grad_norm": 6.084113597869873, "learning_rate": 2.0582456446753702e-05, "loss": 1.0353, "step": 16020 }, { "epoch": 5.0, "grad_norm": 7.481696605682373, "learning_rate": 2.0560625245601014e-05, "loss": 1.0937, "step": 16030 }, { "epoch": 5.01, "grad_norm": 16.246747970581055, "learning_rate": 2.0538794044448326e-05, "loss": 0.9168, "step": 16040 }, { "epoch": 5.01, "grad_norm": 21.125158309936523, "learning_rate": 2.051696284329564e-05, "loss": 0.986, "step": 16050 }, { "epoch": 5.01, "grad_norm": 9.857032775878906, "learning_rate": 2.049513164214295e-05, "loss": 0.7285, "step": 16060 }, { "epoch": 5.01, "grad_norm": 9.294692039489746, "learning_rate": 2.0473300440990265e-05, "loss": 0.7363, "step": 16070 }, { "epoch": 5.01, "grad_norm": 13.892821311950684, "learning_rate": 2.0451469239837576e-05, "loss": 1.3599, "step": 16080 }, { "epoch": 5.01, "grad_norm": 29.375200271606445, "learning_rate": 2.0429638038684888e-05, "loss": 0.9011, "step": 16090 }, { "epoch": 5.01, "grad_norm": 17.151151657104492, "learning_rate": 2.04078068375322e-05, "loss": 0.9354, "step": 16100 }, { "epoch": 5.01, "grad_norm": 9.543347358703613, "learning_rate": 2.0385975636379515e-05, "loss": 0.9754, "step": 16110 }, { "epoch": 5.01, "grad_norm": 15.834558486938477, "learning_rate": 2.0364144435226827e-05, "loss": 1.1989, "step": 16120 }, { "epoch": 5.01, "grad_norm": 11.3794527053833, "learning_rate": 2.034231323407414e-05, "loss": 1.2325, "step": 16130 }, { "epoch": 5.01, "grad_norm": 10.923517227172852, "learning_rate": 2.0320482032921454e-05, "loss": 1.0495, "step": 16140 }, { "epoch": 5.01, "grad_norm": 10.931344032287598, "learning_rate": 2.0298650831768766e-05, "loss": 0.9109, "step": 16150 }, { "epoch": 5.01, "grad_norm": 10.598140716552734, "learning_rate": 2.0276819630616077e-05, "loss": 1.007, "step": 16160 }, { "epoch": 5.01, "grad_norm": 18.302419662475586, "learning_rate": 2.025498842946339e-05, "loss": 0.9294, "step": 16170 }, { "epoch": 5.01, "grad_norm": 1.8738963603973389, "learning_rate": 2.0233157228310704e-05, "loss": 1.155, "step": 16180 }, { "epoch": 5.01, "grad_norm": 13.378024101257324, "learning_rate": 2.0211326027158016e-05, "loss": 1.2478, "step": 16190 }, { "epoch": 5.01, "grad_norm": 3.6828815937042236, "learning_rate": 2.0189494826005328e-05, "loss": 0.6931, "step": 16200 }, { "epoch": 5.01, "grad_norm": 15.233384132385254, "learning_rate": 2.016766362485264e-05, "loss": 1.062, "step": 16210 }, { "epoch": 5.01, "grad_norm": 7.888859272003174, "learning_rate": 2.0145832423699955e-05, "loss": 1.1412, "step": 16220 }, { "epoch": 5.01, "grad_norm": 12.290363311767578, "learning_rate": 2.0124001222547263e-05, "loss": 1.2896, "step": 16230 }, { "epoch": 5.01, "grad_norm": 7.716591835021973, "learning_rate": 2.010217002139458e-05, "loss": 1.225, "step": 16240 }, { "epoch": 5.01, "grad_norm": 13.84139633178711, "learning_rate": 2.008033882024189e-05, "loss": 1.1315, "step": 16250 }, { "epoch": 5.01, "grad_norm": 14.10618782043457, "learning_rate": 2.0058507619089202e-05, "loss": 1.1625, "step": 16260 }, { "epoch": 5.01, "grad_norm": 17.291889190673828, "learning_rate": 2.0036676417936514e-05, "loss": 1.3882, "step": 16270 }, { "epoch": 5.01, "grad_norm": 6.713531494140625, "learning_rate": 2.001484521678383e-05, "loss": 0.7729, "step": 16280 }, { "epoch": 5.02, "grad_norm": 14.574259757995605, "learning_rate": 1.999301401563114e-05, "loss": 0.8652, "step": 16290 }, { "epoch": 5.02, "grad_norm": 13.663325309753418, "learning_rate": 1.9971182814478453e-05, "loss": 1.5846, "step": 16300 }, { "epoch": 5.02, "grad_norm": 12.51400089263916, "learning_rate": 1.9949351613325768e-05, "loss": 1.1258, "step": 16310 }, { "epoch": 5.02, "grad_norm": 10.854379653930664, "learning_rate": 1.992752041217308e-05, "loss": 1.1717, "step": 16320 }, { "epoch": 5.02, "grad_norm": 15.260193824768066, "learning_rate": 1.990568921102039e-05, "loss": 1.0827, "step": 16330 }, { "epoch": 5.02, "grad_norm": 12.922003746032715, "learning_rate": 1.9883858009867703e-05, "loss": 0.9579, "step": 16340 }, { "epoch": 5.02, "grad_norm": 9.87225341796875, "learning_rate": 1.9862026808715018e-05, "loss": 1.1559, "step": 16350 }, { "epoch": 5.02, "grad_norm": 12.76365852355957, "learning_rate": 1.984019560756233e-05, "loss": 1.2905, "step": 16360 }, { "epoch": 5.02, "grad_norm": 6.333401203155518, "learning_rate": 1.9818364406409642e-05, "loss": 0.9349, "step": 16370 }, { "epoch": 5.02, "grad_norm": 3.383871555328369, "learning_rate": 1.9796533205256954e-05, "loss": 0.6397, "step": 16380 }, { "epoch": 5.02, "grad_norm": 3.9091715812683105, "learning_rate": 1.977470200410427e-05, "loss": 1.0482, "step": 16390 }, { "epoch": 5.02, "grad_norm": 2.4046404361724854, "learning_rate": 1.9752870802951577e-05, "loss": 0.5058, "step": 16400 }, { "epoch": 5.02, "grad_norm": 15.956851959228516, "learning_rate": 1.9731039601798892e-05, "loss": 1.2198, "step": 16410 }, { "epoch": 5.02, "grad_norm": 15.203241348266602, "learning_rate": 1.9709208400646204e-05, "loss": 1.2913, "step": 16420 }, { "epoch": 5.02, "grad_norm": 9.65035629272461, "learning_rate": 1.9687377199493516e-05, "loss": 1.3082, "step": 16430 }, { "epoch": 5.02, "grad_norm": 18.05831527709961, "learning_rate": 1.9665545998340828e-05, "loss": 0.8978, "step": 16440 }, { "epoch": 5.02, "grad_norm": 5.754335880279541, "learning_rate": 1.9643714797188143e-05, "loss": 0.9492, "step": 16450 }, { "epoch": 5.02, "grad_norm": 16.76416778564453, "learning_rate": 1.9621883596035455e-05, "loss": 1.3825, "step": 16460 }, { "epoch": 5.02, "grad_norm": 8.99234676361084, "learning_rate": 1.9600052394882766e-05, "loss": 0.9382, "step": 16470 }, { "epoch": 5.02, "grad_norm": 14.925118446350098, "learning_rate": 1.957822119373008e-05, "loss": 0.8439, "step": 16480 }, { "epoch": 5.02, "grad_norm": 5.391711711883545, "learning_rate": 1.9556389992577393e-05, "loss": 1.0815, "step": 16490 }, { "epoch": 5.02, "grad_norm": 11.149608612060547, "learning_rate": 1.9534558791424705e-05, "loss": 1.1135, "step": 16500 }, { "epoch": 5.02, "grad_norm": 9.765776634216309, "learning_rate": 1.9512727590272017e-05, "loss": 1.3721, "step": 16510 }, { "epoch": 5.02, "grad_norm": 1.6820274591445923, "learning_rate": 1.9490896389119332e-05, "loss": 0.9832, "step": 16520 }, { "epoch": 5.02, "grad_norm": 14.258615493774414, "learning_rate": 1.9469065187966644e-05, "loss": 1.0422, "step": 16530 }, { "epoch": 5.02, "grad_norm": 10.644720077514648, "learning_rate": 1.9447233986813956e-05, "loss": 1.0786, "step": 16540 }, { "epoch": 5.03, "grad_norm": 16.705280303955078, "learning_rate": 1.9425402785661267e-05, "loss": 1.4586, "step": 16550 }, { "epoch": 5.03, "grad_norm": 8.47806167602539, "learning_rate": 1.9403571584508583e-05, "loss": 1.0255, "step": 16560 }, { "epoch": 5.03, "grad_norm": 12.204362869262695, "learning_rate": 1.938174038335589e-05, "loss": 1.0506, "step": 16570 }, { "epoch": 5.03, "grad_norm": 10.369688034057617, "learning_rate": 1.9359909182203206e-05, "loss": 0.9877, "step": 16580 }, { "epoch": 5.03, "grad_norm": 2.803469657897949, "learning_rate": 1.9338077981050518e-05, "loss": 0.8515, "step": 16590 }, { "epoch": 5.03, "grad_norm": 13.40743350982666, "learning_rate": 1.931624677989783e-05, "loss": 1.2962, "step": 16600 }, { "epoch": 5.03, "grad_norm": 9.421445846557617, "learning_rate": 1.929441557874514e-05, "loss": 1.2046, "step": 16610 }, { "epoch": 5.03, "grad_norm": 7.884713649749756, "learning_rate": 1.9272584377592457e-05, "loss": 1.1864, "step": 16620 }, { "epoch": 5.03, "grad_norm": 14.929444313049316, "learning_rate": 1.925075317643977e-05, "loss": 0.7849, "step": 16630 }, { "epoch": 5.03, "grad_norm": 16.50315284729004, "learning_rate": 1.922892197528708e-05, "loss": 1.3749, "step": 16640 }, { "epoch": 5.03, "grad_norm": 14.393998146057129, "learning_rate": 1.9207090774134392e-05, "loss": 0.9604, "step": 16650 }, { "epoch": 5.03, "grad_norm": 9.451912879943848, "learning_rate": 1.9185259572981707e-05, "loss": 1.3165, "step": 16660 }, { "epoch": 5.03, "grad_norm": 14.681599617004395, "learning_rate": 1.916342837182902e-05, "loss": 0.7473, "step": 16670 }, { "epoch": 5.03, "grad_norm": 2.3132874965667725, "learning_rate": 1.914159717067633e-05, "loss": 1.2289, "step": 16680 }, { "epoch": 5.03, "grad_norm": 9.696444511413574, "learning_rate": 1.9119765969523646e-05, "loss": 1.1801, "step": 16690 }, { "epoch": 5.03, "grad_norm": 10.32034683227539, "learning_rate": 1.9097934768370958e-05, "loss": 0.7957, "step": 16700 }, { "epoch": 5.03, "grad_norm": 6.515714645385742, "learning_rate": 1.907610356721827e-05, "loss": 0.7047, "step": 16710 }, { "epoch": 5.03, "grad_norm": 13.403619766235352, "learning_rate": 1.905427236606558e-05, "loss": 1.2618, "step": 16720 }, { "epoch": 5.03, "grad_norm": 16.552072525024414, "learning_rate": 1.9032441164912896e-05, "loss": 0.9952, "step": 16730 }, { "epoch": 5.03, "grad_norm": 10.115706443786621, "learning_rate": 1.9010609963760205e-05, "loss": 0.9411, "step": 16740 }, { "epoch": 5.03, "grad_norm": 4.506774425506592, "learning_rate": 1.898877876260752e-05, "loss": 1.1159, "step": 16750 }, { "epoch": 5.03, "grad_norm": 15.125251770019531, "learning_rate": 1.8966947561454832e-05, "loss": 1.018, "step": 16760 }, { "epoch": 5.03, "grad_norm": 17.570297241210938, "learning_rate": 1.8945116360302147e-05, "loss": 1.3486, "step": 16770 }, { "epoch": 5.03, "grad_norm": 12.157319068908691, "learning_rate": 1.8923285159149455e-05, "loss": 1.5527, "step": 16780 }, { "epoch": 5.03, "grad_norm": 14.878063201904297, "learning_rate": 1.890145395799677e-05, "loss": 1.3085, "step": 16790 }, { "epoch": 5.04, "grad_norm": 11.876107215881348, "learning_rate": 1.8879622756844082e-05, "loss": 1.2264, "step": 16800 }, { "epoch": 5.04, "grad_norm": 5.870955467224121, "learning_rate": 1.8857791555691394e-05, "loss": 1.0939, "step": 16810 }, { "epoch": 5.04, "grad_norm": 8.176389694213867, "learning_rate": 1.8835960354538706e-05, "loss": 0.7982, "step": 16820 }, { "epoch": 5.04, "grad_norm": 19.979841232299805, "learning_rate": 1.881412915338602e-05, "loss": 1.3092, "step": 16830 }, { "epoch": 5.04, "grad_norm": 3.6618449687957764, "learning_rate": 1.8792297952233333e-05, "loss": 0.5024, "step": 16840 }, { "epoch": 5.04, "grad_norm": 10.252650260925293, "learning_rate": 1.8770466751080645e-05, "loss": 1.0506, "step": 16850 }, { "epoch": 5.04, "grad_norm": 16.792020797729492, "learning_rate": 1.874863554992796e-05, "loss": 1.3555, "step": 16860 }, { "epoch": 5.04, "grad_norm": 10.057008743286133, "learning_rate": 1.872680434877527e-05, "loss": 1.1523, "step": 16870 }, { "epoch": 5.04, "grad_norm": 10.020461082458496, "learning_rate": 1.8704973147622583e-05, "loss": 1.177, "step": 16880 }, { "epoch": 5.04, "grad_norm": 2.956209421157837, "learning_rate": 1.8683141946469895e-05, "loss": 0.6736, "step": 16890 }, { "epoch": 5.04, "grad_norm": 2.7244486808776855, "learning_rate": 1.866131074531721e-05, "loss": 0.7939, "step": 16900 }, { "epoch": 5.04, "grad_norm": 8.136406898498535, "learning_rate": 1.863947954416452e-05, "loss": 0.6011, "step": 16910 }, { "epoch": 5.04, "grad_norm": 15.968564987182617, "learning_rate": 1.8617648343011834e-05, "loss": 1.1065, "step": 16920 }, { "epoch": 5.04, "grad_norm": 2.198591947555542, "learning_rate": 1.8595817141859146e-05, "loss": 0.5878, "step": 16930 }, { "epoch": 5.04, "grad_norm": 22.69780731201172, "learning_rate": 1.857398594070646e-05, "loss": 0.8302, "step": 16940 }, { "epoch": 5.04, "grad_norm": 10.725274085998535, "learning_rate": 1.855215473955377e-05, "loss": 1.2354, "step": 16950 }, { "epoch": 5.04, "grad_norm": 11.707435607910156, "learning_rate": 1.8530323538401084e-05, "loss": 1.2117, "step": 16960 }, { "epoch": 5.04, "grad_norm": 18.559642791748047, "learning_rate": 1.8508492337248396e-05, "loss": 0.8822, "step": 16970 }, { "epoch": 5.04, "grad_norm": 17.385536193847656, "learning_rate": 1.8486661136095708e-05, "loss": 1.1128, "step": 16980 }, { "epoch": 5.04, "grad_norm": 13.808891296386719, "learning_rate": 1.846482993494302e-05, "loss": 0.9397, "step": 16990 }, { "epoch": 5.04, "grad_norm": 10.883342742919922, "learning_rate": 1.8442998733790335e-05, "loss": 1.2236, "step": 17000 }, { "epoch": 5.04, "grad_norm": 9.972495079040527, "learning_rate": 1.8421167532637647e-05, "loss": 0.951, "step": 17010 }, { "epoch": 5.04, "grad_norm": 20.686342239379883, "learning_rate": 1.839933633148496e-05, "loss": 1.5867, "step": 17020 }, { "epoch": 5.04, "grad_norm": 8.76779842376709, "learning_rate": 1.837750513033227e-05, "loss": 0.7325, "step": 17030 }, { "epoch": 5.04, "grad_norm": 9.286952018737793, "learning_rate": 1.8355673929179585e-05, "loss": 0.8497, "step": 17040 }, { "epoch": 5.04, "grad_norm": 16.644506454467773, "learning_rate": 1.8333842728026897e-05, "loss": 1.3984, "step": 17050 }, { "epoch": 5.05, "grad_norm": 6.982311248779297, "learning_rate": 1.831201152687421e-05, "loss": 1.0207, "step": 17060 }, { "epoch": 5.05, "grad_norm": 2.0991930961608887, "learning_rate": 1.8290180325721524e-05, "loss": 0.7286, "step": 17070 }, { "epoch": 5.05, "grad_norm": 21.561464309692383, "learning_rate": 1.8268349124568836e-05, "loss": 1.3778, "step": 17080 }, { "epoch": 5.05, "grad_norm": 7.250512599945068, "learning_rate": 1.8246517923416148e-05, "loss": 0.9682, "step": 17090 }, { "epoch": 5.05, "grad_norm": 16.515119552612305, "learning_rate": 1.822468672226346e-05, "loss": 1.1177, "step": 17100 }, { "epoch": 5.05, "grad_norm": 26.353736877441406, "learning_rate": 1.8202855521110775e-05, "loss": 0.9607, "step": 17110 }, { "epoch": 5.05, "grad_norm": 2.0812923908233643, "learning_rate": 1.8181024319958083e-05, "loss": 0.7441, "step": 17120 }, { "epoch": 5.05, "grad_norm": 25.468631744384766, "learning_rate": 1.8159193118805398e-05, "loss": 1.1726, "step": 17130 }, { "epoch": 5.05, "grad_norm": 9.197002410888672, "learning_rate": 1.813736191765271e-05, "loss": 0.9389, "step": 17140 }, { "epoch": 5.05, "grad_norm": 6.818742275238037, "learning_rate": 1.8115530716500022e-05, "loss": 0.9064, "step": 17150 }, { "epoch": 5.05, "grad_norm": 5.714310169219971, "learning_rate": 1.8093699515347334e-05, "loss": 1.2026, "step": 17160 }, { "epoch": 5.05, "grad_norm": 14.129135131835938, "learning_rate": 1.807186831419465e-05, "loss": 0.8447, "step": 17170 }, { "epoch": 5.05, "grad_norm": 16.9246826171875, "learning_rate": 1.805003711304196e-05, "loss": 1.2624, "step": 17180 }, { "epoch": 5.05, "grad_norm": 10.886422157287598, "learning_rate": 1.8028205911889272e-05, "loss": 1.3189, "step": 17190 }, { "epoch": 5.05, "grad_norm": 2.9026236534118652, "learning_rate": 1.8006374710736584e-05, "loss": 0.6308, "step": 17200 }, { "epoch": 5.05, "grad_norm": 13.140935897827148, "learning_rate": 1.79845435095839e-05, "loss": 1.2198, "step": 17210 }, { "epoch": 5.05, "grad_norm": 8.462525367736816, "learning_rate": 1.796271230843121e-05, "loss": 1.0083, "step": 17220 }, { "epoch": 5.05, "grad_norm": 2.120617389678955, "learning_rate": 1.7940881107278523e-05, "loss": 1.0684, "step": 17230 }, { "epoch": 5.05, "grad_norm": 12.117980003356934, "learning_rate": 1.7919049906125838e-05, "loss": 0.5352, "step": 17240 }, { "epoch": 5.05, "grad_norm": 4.850026607513428, "learning_rate": 1.789721870497315e-05, "loss": 0.6093, "step": 17250 }, { "epoch": 5.05, "grad_norm": 4.718018531799316, "learning_rate": 1.787538750382046e-05, "loss": 1.0792, "step": 17260 }, { "epoch": 5.05, "grad_norm": 6.126802921295166, "learning_rate": 1.7853556302667773e-05, "loss": 1.2551, "step": 17270 }, { "epoch": 5.05, "grad_norm": 10.99985408782959, "learning_rate": 1.783172510151509e-05, "loss": 0.3593, "step": 17280 }, { "epoch": 5.05, "grad_norm": 11.103346824645996, "learning_rate": 1.7809893900362397e-05, "loss": 1.3178, "step": 17290 }, { "epoch": 5.05, "grad_norm": 22.160974502563477, "learning_rate": 1.7788062699209712e-05, "loss": 0.9992, "step": 17300 }, { "epoch": 5.06, "grad_norm": 11.603459358215332, "learning_rate": 1.7766231498057024e-05, "loss": 0.7958, "step": 17310 }, { "epoch": 5.06, "grad_norm": 2.606255292892456, "learning_rate": 1.7744400296904336e-05, "loss": 0.9023, "step": 17320 }, { "epoch": 5.06, "grad_norm": 18.560928344726562, "learning_rate": 1.7722569095751647e-05, "loss": 1.1119, "step": 17330 }, { "epoch": 5.06, "grad_norm": 16.32120132446289, "learning_rate": 1.7700737894598963e-05, "loss": 0.922, "step": 17340 }, { "epoch": 5.06, "grad_norm": 14.608675003051758, "learning_rate": 1.7678906693446274e-05, "loss": 1.1674, "step": 17350 }, { "epoch": 5.06, "grad_norm": 5.11763334274292, "learning_rate": 1.7657075492293586e-05, "loss": 0.9036, "step": 17360 }, { "epoch": 5.06, "grad_norm": 11.688695907592773, "learning_rate": 1.7635244291140898e-05, "loss": 1.1373, "step": 17370 }, { "epoch": 5.06, "grad_norm": 6.197993278503418, "learning_rate": 1.7613413089988213e-05, "loss": 0.9641, "step": 17380 }, { "epoch": 5.06, "grad_norm": 3.5992565155029297, "learning_rate": 1.7591581888835525e-05, "loss": 0.5627, "step": 17390 }, { "epoch": 5.06, "grad_norm": 6.502362251281738, "learning_rate": 1.7569750687682837e-05, "loss": 1.1273, "step": 17400 }, { "epoch": 5.06, "grad_norm": 7.513978481292725, "learning_rate": 1.7547919486530152e-05, "loss": 1.1512, "step": 17410 }, { "epoch": 5.06, "grad_norm": 10.959603309631348, "learning_rate": 1.7526088285377464e-05, "loss": 0.7052, "step": 17420 }, { "epoch": 5.06, "grad_norm": 11.914216995239258, "learning_rate": 1.7504257084224775e-05, "loss": 0.8839, "step": 17430 }, { "epoch": 5.06, "grad_norm": 11.613728523254395, "learning_rate": 1.7482425883072087e-05, "loss": 1.48, "step": 17440 }, { "epoch": 5.06, "grad_norm": 14.877669334411621, "learning_rate": 1.7460594681919402e-05, "loss": 0.9517, "step": 17450 }, { "epoch": 5.06, "grad_norm": 10.036979675292969, "learning_rate": 1.743876348076671e-05, "loss": 1.0274, "step": 17460 }, { "epoch": 5.06, "grad_norm": 6.7218852043151855, "learning_rate": 1.7416932279614026e-05, "loss": 1.1196, "step": 17470 }, { "epoch": 5.06, "grad_norm": 8.696640968322754, "learning_rate": 1.7395101078461338e-05, "loss": 1.1189, "step": 17480 }, { "epoch": 5.06, "grad_norm": 3.029432773590088, "learning_rate": 1.737326987730865e-05, "loss": 0.9388, "step": 17490 }, { "epoch": 5.06, "grad_norm": 18.290916442871094, "learning_rate": 1.735143867615596e-05, "loss": 1.3121, "step": 17500 }, { "epoch": 5.06, "grad_norm": 14.119184494018555, "learning_rate": 1.7329607475003277e-05, "loss": 1.4059, "step": 17510 }, { "epoch": 5.06, "grad_norm": 11.076031684875488, "learning_rate": 1.7307776273850588e-05, "loss": 1.0305, "step": 17520 }, { "epoch": 5.06, "grad_norm": 10.12272834777832, "learning_rate": 1.72859450726979e-05, "loss": 1.2507, "step": 17530 }, { "epoch": 5.06, "grad_norm": 16.223949432373047, "learning_rate": 1.7264113871545212e-05, "loss": 0.6788, "step": 17540 }, { "epoch": 5.06, "grad_norm": 15.81699275970459, "learning_rate": 1.7242282670392527e-05, "loss": 0.899, "step": 17550 }, { "epoch": 5.07, "grad_norm": 16.068099975585938, "learning_rate": 1.722045146923984e-05, "loss": 0.9288, "step": 17560 }, { "epoch": 5.07, "grad_norm": 12.96339225769043, "learning_rate": 1.719862026808715e-05, "loss": 1.0382, "step": 17570 }, { "epoch": 5.07, "grad_norm": 16.720951080322266, "learning_rate": 1.7176789066934462e-05, "loss": 0.9943, "step": 17580 }, { "epoch": 5.07, "grad_norm": 9.627103805541992, "learning_rate": 1.7154957865781778e-05, "loss": 1.2077, "step": 17590 }, { "epoch": 5.07, "grad_norm": 1.6424877643585205, "learning_rate": 1.713312666462909e-05, "loss": 1.0335, "step": 17600 }, { "epoch": 5.07, "grad_norm": 9.488208770751953, "learning_rate": 1.71112954634764e-05, "loss": 0.7875, "step": 17610 }, { "epoch": 5.07, "grad_norm": 10.782718658447266, "learning_rate": 1.7089464262323716e-05, "loss": 1.3287, "step": 17620 }, { "epoch": 5.07, "grad_norm": 5.2250142097473145, "learning_rate": 1.7067633061171025e-05, "loss": 0.972, "step": 17630 }, { "epoch": 5.07, "grad_norm": 8.15955924987793, "learning_rate": 1.704580186001834e-05, "loss": 0.9912, "step": 17640 }, { "epoch": 5.07, "grad_norm": 17.579971313476562, "learning_rate": 1.702397065886565e-05, "loss": 0.8551, "step": 17650 }, { "epoch": 5.07, "grad_norm": 16.472959518432617, "learning_rate": 1.7002139457712963e-05, "loss": 1.5087, "step": 17660 }, { "epoch": 5.07, "grad_norm": 13.607599258422852, "learning_rate": 1.6980308256560275e-05, "loss": 1.2726, "step": 17670 }, { "epoch": 5.07, "grad_norm": 20.370607376098633, "learning_rate": 1.695847705540759e-05, "loss": 1.4286, "step": 17680 }, { "epoch": 5.07, "grad_norm": 10.226858139038086, "learning_rate": 1.6936645854254902e-05, "loss": 1.4171, "step": 17690 }, { "epoch": 5.07, "grad_norm": 18.607908248901367, "learning_rate": 1.6914814653102214e-05, "loss": 0.8273, "step": 17700 }, { "epoch": 5.07, "grad_norm": 15.916524887084961, "learning_rate": 1.6892983451949526e-05, "loss": 1.0766, "step": 17710 }, { "epoch": 5.07, "grad_norm": 12.274320602416992, "learning_rate": 1.687115225079684e-05, "loss": 1.0913, "step": 17720 }, { "epoch": 5.07, "grad_norm": 12.900050163269043, "learning_rate": 1.6849321049644153e-05, "loss": 1.0977, "step": 17730 }, { "epoch": 5.07, "grad_norm": 8.273165702819824, "learning_rate": 1.6827489848491464e-05, "loss": 0.7171, "step": 17740 }, { "epoch": 5.07, "grad_norm": 15.36609935760498, "learning_rate": 1.6805658647338776e-05, "loss": 0.8534, "step": 17750 }, { "epoch": 5.07, "grad_norm": 14.345377922058105, "learning_rate": 1.678382744618609e-05, "loss": 1.3854, "step": 17760 }, { "epoch": 5.07, "grad_norm": 2.0935044288635254, "learning_rate": 1.67619962450334e-05, "loss": 0.5494, "step": 17770 }, { "epoch": 5.07, "grad_norm": 5.562219619750977, "learning_rate": 1.6740165043880715e-05, "loss": 0.7348, "step": 17780 }, { "epoch": 5.07, "grad_norm": 6.326713562011719, "learning_rate": 1.671833384272803e-05, "loss": 0.7783, "step": 17790 }, { "epoch": 5.07, "grad_norm": 15.367618560791016, "learning_rate": 1.669650264157534e-05, "loss": 0.8149, "step": 17800 }, { "epoch": 5.07, "grad_norm": 7.653753757476807, "learning_rate": 1.6674671440422654e-05, "loss": 0.8891, "step": 17810 }, { "epoch": 5.08, "grad_norm": 7.2886271476745605, "learning_rate": 1.6652840239269965e-05, "loss": 0.8833, "step": 17820 }, { "epoch": 5.08, "grad_norm": 9.55782413482666, "learning_rate": 1.6631009038117277e-05, "loss": 1.045, "step": 17830 }, { "epoch": 5.08, "grad_norm": 8.952122688293457, "learning_rate": 1.660917783696459e-05, "loss": 1.2518, "step": 17840 }, { "epoch": 5.08, "grad_norm": 14.377588272094727, "learning_rate": 1.6587346635811904e-05, "loss": 0.8931, "step": 17850 }, { "epoch": 5.08, "grad_norm": 9.466032981872559, "learning_rate": 1.6565515434659216e-05, "loss": 1.0164, "step": 17860 }, { "epoch": 5.08, "grad_norm": 14.650469779968262, "learning_rate": 1.6543684233506528e-05, "loss": 1.3928, "step": 17870 }, { "epoch": 5.08, "grad_norm": 8.764126777648926, "learning_rate": 1.652185303235384e-05, "loss": 1.3566, "step": 17880 }, { "epoch": 5.08, "grad_norm": 23.299457550048828, "learning_rate": 1.6500021831201155e-05, "loss": 1.0356, "step": 17890 }, { "epoch": 5.08, "grad_norm": 7.281065464019775, "learning_rate": 1.6478190630048467e-05, "loss": 0.9025, "step": 17900 }, { "epoch": 5.08, "grad_norm": 8.289732933044434, "learning_rate": 1.645635942889578e-05, "loss": 1.0428, "step": 17910 }, { "epoch": 5.08, "grad_norm": 7.550114154815674, "learning_rate": 1.643452822774309e-05, "loss": 0.7994, "step": 17920 }, { "epoch": 5.08, "grad_norm": 8.787652969360352, "learning_rate": 1.6412697026590405e-05, "loss": 0.916, "step": 17930 }, { "epoch": 5.08, "grad_norm": 6.92379903793335, "learning_rate": 1.6390865825437714e-05, "loss": 1.36, "step": 17940 }, { "epoch": 5.08, "grad_norm": 17.2072811126709, "learning_rate": 1.636903462428503e-05, "loss": 1.1517, "step": 17950 }, { "epoch": 5.08, "grad_norm": 9.617305755615234, "learning_rate": 1.6347203423132344e-05, "loss": 0.587, "step": 17960 }, { "epoch": 5.08, "grad_norm": 3.0850846767425537, "learning_rate": 1.6325372221979652e-05, "loss": 1.0816, "step": 17970 }, { "epoch": 5.08, "grad_norm": 7.413936614990234, "learning_rate": 1.6303541020826968e-05, "loss": 1.1317, "step": 17980 }, { "epoch": 5.08, "grad_norm": 19.44146728515625, "learning_rate": 1.628170981967428e-05, "loss": 1.1413, "step": 17990 }, { "epoch": 5.08, "grad_norm": 10.17887020111084, "learning_rate": 1.6259878618521595e-05, "loss": 1.0409, "step": 18000 }, { "epoch": 5.08, "grad_norm": 12.31407642364502, "learning_rate": 1.6238047417368903e-05, "loss": 1.2444, "step": 18010 }, { "epoch": 5.08, "grad_norm": 15.09060001373291, "learning_rate": 1.6216216216216218e-05, "loss": 1.3268, "step": 18020 }, { "epoch": 5.08, "grad_norm": 23.001676559448242, "learning_rate": 1.619438501506353e-05, "loss": 1.1173, "step": 18030 }, { "epoch": 5.08, "grad_norm": 7.485833168029785, "learning_rate": 1.617255381391084e-05, "loss": 0.5609, "step": 18040 }, { "epoch": 5.08, "grad_norm": 7.914642333984375, "learning_rate": 1.6150722612758153e-05, "loss": 1.2382, "step": 18050 }, { "epoch": 5.08, "grad_norm": 3.8642818927764893, "learning_rate": 1.612889141160547e-05, "loss": 1.0682, "step": 18060 }, { "epoch": 5.09, "grad_norm": 8.96588134765625, "learning_rate": 1.610706021045278e-05, "loss": 0.699, "step": 18070 }, { "epoch": 5.09, "grad_norm": 5.151167392730713, "learning_rate": 1.6085229009300092e-05, "loss": 1.0005, "step": 18080 }, { "epoch": 5.09, "grad_norm": 18.026966094970703, "learning_rate": 1.6063397808147404e-05, "loss": 1.2644, "step": 18090 }, { "epoch": 5.09, "grad_norm": 2.0052404403686523, "learning_rate": 1.604156660699472e-05, "loss": 0.6725, "step": 18100 }, { "epoch": 5.09, "grad_norm": 9.023505210876465, "learning_rate": 1.6019735405842028e-05, "loss": 1.0806, "step": 18110 }, { "epoch": 5.09, "grad_norm": 14.213906288146973, "learning_rate": 1.5997904204689343e-05, "loss": 0.8979, "step": 18120 }, { "epoch": 5.09, "grad_norm": 22.621767044067383, "learning_rate": 1.5976073003536654e-05, "loss": 1.1198, "step": 18130 }, { "epoch": 5.09, "grad_norm": 8.10435676574707, "learning_rate": 1.5954241802383966e-05, "loss": 0.82, "step": 18140 }, { "epoch": 5.09, "grad_norm": 1.5562348365783691, "learning_rate": 1.593241060123128e-05, "loss": 0.9412, "step": 18150 }, { "epoch": 5.09, "grad_norm": 22.386442184448242, "learning_rate": 1.5910579400078593e-05, "loss": 0.9028, "step": 18160 }, { "epoch": 5.09, "grad_norm": 8.735527038574219, "learning_rate": 1.588874819892591e-05, "loss": 0.9486, "step": 18170 }, { "epoch": 5.09, "grad_norm": 7.260108947753906, "learning_rate": 1.5866916997773217e-05, "loss": 0.4357, "step": 18180 }, { "epoch": 5.09, "grad_norm": 9.167963981628418, "learning_rate": 1.5845085796620532e-05, "loss": 1.2025, "step": 18190 }, { "epoch": 5.09, "grad_norm": 17.001707077026367, "learning_rate": 1.5823254595467844e-05, "loss": 0.9136, "step": 18200 }, { "epoch": 5.09, "grad_norm": 14.9670991897583, "learning_rate": 1.5801423394315156e-05, "loss": 1.1845, "step": 18210 }, { "epoch": 5.09, "grad_norm": 15.41689682006836, "learning_rate": 1.5779592193162467e-05, "loss": 1.2345, "step": 18220 }, { "epoch": 5.09, "grad_norm": 16.562034606933594, "learning_rate": 1.5757760992009782e-05, "loss": 1.3559, "step": 18230 }, { "epoch": 5.09, "grad_norm": 16.239242553710938, "learning_rate": 1.5735929790857094e-05, "loss": 1.406, "step": 18240 }, { "epoch": 5.09, "grad_norm": 21.97365951538086, "learning_rate": 1.5714098589704406e-05, "loss": 1.0546, "step": 18250 }, { "epoch": 5.09, "grad_norm": 6.850479602813721, "learning_rate": 1.5692267388551718e-05, "loss": 0.9107, "step": 18260 }, { "epoch": 5.09, "grad_norm": 22.093599319458008, "learning_rate": 1.5670436187399033e-05, "loss": 1.2941, "step": 18270 }, { "epoch": 5.09, "grad_norm": 0.44992467761039734, "learning_rate": 1.564860498624634e-05, "loss": 0.7106, "step": 18280 }, { "epoch": 5.09, "grad_norm": 8.531197547912598, "learning_rate": 1.5626773785093657e-05, "loss": 1.3874, "step": 18290 }, { "epoch": 5.09, "grad_norm": 16.238969802856445, "learning_rate": 1.560494258394097e-05, "loss": 0.7537, "step": 18300 }, { "epoch": 5.09, "grad_norm": 3.762239456176758, "learning_rate": 1.5583111382788284e-05, "loss": 0.8293, "step": 18310 }, { "epoch": 5.09, "grad_norm": 20.46561050415039, "learning_rate": 1.5561280181635592e-05, "loss": 0.8761, "step": 18320 }, { "epoch": 5.1, "grad_norm": 10.434751510620117, "learning_rate": 1.5539448980482907e-05, "loss": 1.1204, "step": 18330 }, { "epoch": 5.1, "grad_norm": 11.48574447631836, "learning_rate": 1.5517617779330222e-05, "loss": 1.1092, "step": 18340 }, { "epoch": 5.1, "grad_norm": 7.6499433517456055, "learning_rate": 1.549578657817753e-05, "loss": 0.9963, "step": 18350 }, { "epoch": 5.1, "grad_norm": 11.333465576171875, "learning_rate": 1.5473955377024846e-05, "loss": 0.9399, "step": 18360 }, { "epoch": 5.1, "grad_norm": 1.7811930179595947, "learning_rate": 1.5452124175872158e-05, "loss": 1.0706, "step": 18370 }, { "epoch": 5.1, "grad_norm": 26.29073143005371, "learning_rate": 1.543029297471947e-05, "loss": 1.2153, "step": 18380 }, { "epoch": 5.1, "grad_norm": 18.950416564941406, "learning_rate": 1.540846177356678e-05, "loss": 1.5852, "step": 18390 }, { "epoch": 5.1, "grad_norm": 15.982751846313477, "learning_rate": 1.5386630572414096e-05, "loss": 1.3665, "step": 18400 }, { "epoch": 5.1, "grad_norm": 15.991207122802734, "learning_rate": 1.5364799371261408e-05, "loss": 1.0842, "step": 18410 }, { "epoch": 5.1, "grad_norm": 8.121706008911133, "learning_rate": 1.534296817010872e-05, "loss": 0.9248, "step": 18420 }, { "epoch": 5.1, "grad_norm": 5.642177581787109, "learning_rate": 1.532113696895603e-05, "loss": 1.122, "step": 18430 }, { "epoch": 5.1, "grad_norm": 19.470535278320312, "learning_rate": 1.5299305767803347e-05, "loss": 1.5846, "step": 18440 }, { "epoch": 5.1, "grad_norm": 8.943193435668945, "learning_rate": 1.5277474566650655e-05, "loss": 0.8214, "step": 18450 }, { "epoch": 5.1, "grad_norm": 10.343379974365234, "learning_rate": 1.525564336549797e-05, "loss": 1.1467, "step": 18460 }, { "epoch": 5.1, "grad_norm": 10.28848934173584, "learning_rate": 1.5233812164345282e-05, "loss": 1.1411, "step": 18470 }, { "epoch": 5.1, "grad_norm": 12.360368728637695, "learning_rate": 1.5211980963192596e-05, "loss": 0.7424, "step": 18480 }, { "epoch": 5.1, "grad_norm": 17.17974090576172, "learning_rate": 1.5190149762039907e-05, "loss": 0.7835, "step": 18490 }, { "epoch": 5.1, "grad_norm": 11.744301795959473, "learning_rate": 1.5168318560887221e-05, "loss": 1.3349, "step": 18500 }, { "epoch": 5.1, "grad_norm": 11.225785255432129, "learning_rate": 1.5146487359734534e-05, "loss": 1.0224, "step": 18510 }, { "epoch": 5.1, "grad_norm": 12.693541526794434, "learning_rate": 1.5124656158581846e-05, "loss": 1.0637, "step": 18520 }, { "epoch": 5.1, "grad_norm": 11.930280685424805, "learning_rate": 1.510282495742916e-05, "loss": 1.0569, "step": 18530 }, { "epoch": 5.1, "grad_norm": 13.09936809539795, "learning_rate": 1.5080993756276471e-05, "loss": 0.8516, "step": 18540 }, { "epoch": 5.1, "grad_norm": 12.095209121704102, "learning_rate": 1.5059162555123785e-05, "loss": 1.1523, "step": 18550 }, { "epoch": 5.1, "grad_norm": 1.697354793548584, "learning_rate": 1.5037331353971095e-05, "loss": 0.8816, "step": 18560 }, { "epoch": 5.1, "grad_norm": 10.504684448242188, "learning_rate": 1.501550015281841e-05, "loss": 0.9202, "step": 18570 }, { "epoch": 5.11, "grad_norm": 22.708742141723633, "learning_rate": 1.499366895166572e-05, "loss": 1.2456, "step": 18580 }, { "epoch": 5.11, "grad_norm": 17.721071243286133, "learning_rate": 1.4971837750513035e-05, "loss": 0.8406, "step": 18590 }, { "epoch": 5.11, "grad_norm": 14.318277359008789, "learning_rate": 1.4950006549360346e-05, "loss": 1.3574, "step": 18600 }, { "epoch": 5.11, "grad_norm": 5.688145160675049, "learning_rate": 1.4928175348207659e-05, "loss": 1.0934, "step": 18610 }, { "epoch": 5.11, "grad_norm": 22.90980339050293, "learning_rate": 1.490634414705497e-05, "loss": 1.0341, "step": 18620 }, { "epoch": 5.11, "grad_norm": 13.427457809448242, "learning_rate": 1.4884512945902284e-05, "loss": 1.1311, "step": 18630 }, { "epoch": 5.11, "grad_norm": 10.557838439941406, "learning_rate": 1.4862681744749596e-05, "loss": 1.2561, "step": 18640 }, { "epoch": 5.11, "grad_norm": 9.978479385375977, "learning_rate": 1.484085054359691e-05, "loss": 0.6529, "step": 18650 }, { "epoch": 5.11, "grad_norm": 9.879769325256348, "learning_rate": 1.4819019342444221e-05, "loss": 0.7928, "step": 18660 }, { "epoch": 5.11, "grad_norm": 11.378496170043945, "learning_rate": 1.4797188141291535e-05, "loss": 0.7299, "step": 18670 }, { "epoch": 5.11, "grad_norm": 9.323801040649414, "learning_rate": 1.4775356940138847e-05, "loss": 1.1239, "step": 18680 }, { "epoch": 5.11, "grad_norm": 14.799972534179688, "learning_rate": 1.475352573898616e-05, "loss": 1.3317, "step": 18690 }, { "epoch": 5.11, "grad_norm": 7.75312614440918, "learning_rate": 1.4731694537833474e-05, "loss": 1.0852, "step": 18700 }, { "epoch": 5.11, "grad_norm": 26.804323196411133, "learning_rate": 1.4709863336680785e-05, "loss": 1.0177, "step": 18710 }, { "epoch": 5.11, "grad_norm": 15.156867980957031, "learning_rate": 1.4688032135528099e-05, "loss": 1.4227, "step": 18720 }, { "epoch": 5.11, "grad_norm": 2.4774365425109863, "learning_rate": 1.466620093437541e-05, "loss": 0.5564, "step": 18730 }, { "epoch": 5.11, "grad_norm": 16.350753784179688, "learning_rate": 1.4644369733222724e-05, "loss": 0.9023, "step": 18740 }, { "epoch": 5.11, "grad_norm": 8.890925407409668, "learning_rate": 1.4622538532070034e-05, "loss": 0.8498, "step": 18750 }, { "epoch": 5.11, "grad_norm": 1.657368779182434, "learning_rate": 1.460070733091735e-05, "loss": 1.0283, "step": 18760 }, { "epoch": 5.11, "grad_norm": 12.567975997924805, "learning_rate": 1.457887612976466e-05, "loss": 0.8934, "step": 18770 }, { "epoch": 5.11, "grad_norm": 12.60466194152832, "learning_rate": 1.4557044928611973e-05, "loss": 0.9093, "step": 18780 }, { "epoch": 5.11, "grad_norm": 13.059163093566895, "learning_rate": 1.4535213727459285e-05, "loss": 1.0899, "step": 18790 }, { "epoch": 5.11, "grad_norm": 19.254384994506836, "learning_rate": 1.4513382526306598e-05, "loss": 1.1929, "step": 18800 }, { "epoch": 5.11, "grad_norm": 0.45910605788230896, "learning_rate": 1.449155132515391e-05, "loss": 0.7048, "step": 18810 }, { "epoch": 5.11, "grad_norm": 6.579792499542236, "learning_rate": 1.4469720124001223e-05, "loss": 0.9626, "step": 18820 }, { "epoch": 5.11, "grad_norm": 9.869888305664062, "learning_rate": 1.4447888922848535e-05, "loss": 1.0588, "step": 18830 }, { "epoch": 5.12, "grad_norm": 16.15755844116211, "learning_rate": 1.4426057721695849e-05, "loss": 1.4889, "step": 18840 }, { "epoch": 5.12, "grad_norm": 7.933180332183838, "learning_rate": 1.440422652054316e-05, "loss": 1.1863, "step": 18850 }, { "epoch": 5.12, "grad_norm": 8.97828483581543, "learning_rate": 1.4382395319390474e-05, "loss": 0.9781, "step": 18860 }, { "epoch": 5.12, "grad_norm": 11.166291236877441, "learning_rate": 1.4360564118237784e-05, "loss": 1.4933, "step": 18870 }, { "epoch": 5.12, "grad_norm": 9.7828950881958, "learning_rate": 1.43387329170851e-05, "loss": 0.88, "step": 18880 }, { "epoch": 5.12, "grad_norm": 9.525924682617188, "learning_rate": 1.4316901715932413e-05, "loss": 1.163, "step": 18890 }, { "epoch": 5.12, "grad_norm": 22.926471710205078, "learning_rate": 1.4295070514779724e-05, "loss": 0.9008, "step": 18900 }, { "epoch": 5.12, "grad_norm": 1.8941190242767334, "learning_rate": 1.4273239313627038e-05, "loss": 0.8, "step": 18910 }, { "epoch": 5.12, "grad_norm": 4.959818363189697, "learning_rate": 1.4251408112474348e-05, "loss": 1.0046, "step": 18920 }, { "epoch": 5.12, "grad_norm": 4.9487385749816895, "learning_rate": 1.4229576911321663e-05, "loss": 0.9173, "step": 18930 }, { "epoch": 5.12, "grad_norm": 13.545561790466309, "learning_rate": 1.4207745710168973e-05, "loss": 0.914, "step": 18940 }, { "epoch": 5.12, "grad_norm": 7.421730041503906, "learning_rate": 1.4185914509016288e-05, "loss": 0.9119, "step": 18950 }, { "epoch": 5.12, "grad_norm": 1.8136019706726074, "learning_rate": 1.4164083307863599e-05, "loss": 0.8829, "step": 18960 }, { "epoch": 5.12, "grad_norm": 14.179448127746582, "learning_rate": 1.4142252106710912e-05, "loss": 0.8155, "step": 18970 }, { "epoch": 5.12, "grad_norm": 16.389019012451172, "learning_rate": 1.4120420905558224e-05, "loss": 0.756, "step": 18980 }, { "epoch": 5.12, "grad_norm": 13.684218406677246, "learning_rate": 1.4098589704405537e-05, "loss": 0.9412, "step": 18990 }, { "epoch": 5.12, "grad_norm": 13.905356407165527, "learning_rate": 1.4076758503252849e-05, "loss": 0.8589, "step": 19000 }, { "epoch": 5.12, "grad_norm": 12.580118179321289, "learning_rate": 1.4054927302100163e-05, "loss": 1.0592, "step": 19010 }, { "epoch": 5.12, "grad_norm": 11.314577102661133, "learning_rate": 1.4033096100947474e-05, "loss": 1.162, "step": 19020 }, { "epoch": 5.12, "grad_norm": 6.442819118499756, "learning_rate": 1.4011264899794788e-05, "loss": 1.1072, "step": 19030 }, { "epoch": 5.12, "grad_norm": 22.509777069091797, "learning_rate": 1.39894336986421e-05, "loss": 0.7486, "step": 19040 }, { "epoch": 5.12, "grad_norm": 10.416544914245605, "learning_rate": 1.3967602497489413e-05, "loss": 0.7889, "step": 19050 }, { "epoch": 5.12, "grad_norm": 12.814079284667969, "learning_rate": 1.3945771296336723e-05, "loss": 1.5754, "step": 19060 }, { "epoch": 5.12, "grad_norm": 21.85147476196289, "learning_rate": 1.3923940095184038e-05, "loss": 0.99, "step": 19070 }, { "epoch": 5.12, "grad_norm": 15.901121139526367, "learning_rate": 1.3902108894031352e-05, "loss": 0.7721, "step": 19080 }, { "epoch": 5.12, "eval_accuracy": 0.6777463460631777, "eval_loss": 1.1413390636444092, "eval_runtime": 376.2448, "eval_samples_per_second": 11.275, "eval_steps_per_second": 2.82, "step": 19086 }, { "epoch": 6.0, "grad_norm": 15.180476188659668, "learning_rate": 1.3880277692878662e-05, "loss": 1.0321, "step": 19090 }, { "epoch": 6.0, "grad_norm": 22.01369857788086, "learning_rate": 1.3858446491725977e-05, "loss": 0.9948, "step": 19100 }, { "epoch": 6.0, "grad_norm": 9.111541748046875, "learning_rate": 1.3836615290573287e-05, "loss": 1.0863, "step": 19110 }, { "epoch": 6.0, "grad_norm": 8.824039459228516, "learning_rate": 1.3814784089420602e-05, "loss": 0.9853, "step": 19120 }, { "epoch": 6.0, "grad_norm": 15.332914352416992, "learning_rate": 1.3792952888267912e-05, "loss": 1.2223, "step": 19130 }, { "epoch": 6.0, "grad_norm": 12.9944429397583, "learning_rate": 1.3771121687115226e-05, "loss": 0.9095, "step": 19140 }, { "epoch": 6.0, "grad_norm": 8.413078308105469, "learning_rate": 1.3749290485962538e-05, "loss": 1.1107, "step": 19150 }, { "epoch": 6.0, "grad_norm": 14.440452575683594, "learning_rate": 1.3727459284809851e-05, "loss": 1.2088, "step": 19160 }, { "epoch": 6.0, "grad_norm": 9.766199111938477, "learning_rate": 1.3705628083657163e-05, "loss": 0.9525, "step": 19170 }, { "epoch": 6.0, "grad_norm": 5.649563312530518, "learning_rate": 1.3683796882504476e-05, "loss": 0.9432, "step": 19180 }, { "epoch": 6.0, "grad_norm": 10.622297286987305, "learning_rate": 1.3661965681351788e-05, "loss": 1.0212, "step": 19190 }, { "epoch": 6.0, "grad_norm": 1.4472098350524902, "learning_rate": 1.3640134480199102e-05, "loss": 0.8731, "step": 19200 }, { "epoch": 6.0, "grad_norm": 3.022843599319458, "learning_rate": 1.3618303279046413e-05, "loss": 0.7158, "step": 19210 }, { "epoch": 6.01, "grad_norm": 17.63628578186035, "learning_rate": 1.3596472077893727e-05, "loss": 1.1055, "step": 19220 }, { "epoch": 6.01, "grad_norm": 11.917749404907227, "learning_rate": 1.3574640876741037e-05, "loss": 0.7743, "step": 19230 }, { "epoch": 6.01, "grad_norm": 5.3785719871521, "learning_rate": 1.3552809675588352e-05, "loss": 0.5578, "step": 19240 }, { "epoch": 6.01, "grad_norm": 15.35848617553711, "learning_rate": 1.3530978474435666e-05, "loss": 0.9378, "step": 19250 }, { "epoch": 6.01, "grad_norm": 3.908442735671997, "learning_rate": 1.3509147273282976e-05, "loss": 0.7619, "step": 19260 }, { "epoch": 6.01, "grad_norm": 8.305680274963379, "learning_rate": 1.3487316072130291e-05, "loss": 0.864, "step": 19270 }, { "epoch": 6.01, "grad_norm": 2.0058398246765137, "learning_rate": 1.3465484870977601e-05, "loss": 0.8396, "step": 19280 }, { "epoch": 6.01, "grad_norm": 22.156827926635742, "learning_rate": 1.3443653669824916e-05, "loss": 1.132, "step": 19290 }, { "epoch": 6.01, "grad_norm": 14.584115028381348, "learning_rate": 1.3421822468672226e-05, "loss": 1.106, "step": 19300 }, { "epoch": 6.01, "grad_norm": 6.945678234100342, "learning_rate": 1.339999126751954e-05, "loss": 0.9229, "step": 19310 }, { "epoch": 6.01, "grad_norm": 1.9087910652160645, "learning_rate": 1.3378160066366852e-05, "loss": 0.884, "step": 19320 }, { "epoch": 6.01, "grad_norm": 14.006959915161133, "learning_rate": 1.3356328865214165e-05, "loss": 0.8935, "step": 19330 }, { "epoch": 6.01, "grad_norm": 8.553735733032227, "learning_rate": 1.3334497664061477e-05, "loss": 1.1401, "step": 19340 }, { "epoch": 6.01, "grad_norm": 4.0361104011535645, "learning_rate": 1.331266646290879e-05, "loss": 0.5517, "step": 19350 }, { "epoch": 6.01, "grad_norm": 17.48221778869629, "learning_rate": 1.3290835261756102e-05, "loss": 0.8757, "step": 19360 }, { "epoch": 6.01, "grad_norm": 8.670267105102539, "learning_rate": 1.3269004060603416e-05, "loss": 1.0839, "step": 19370 }, { "epoch": 6.01, "grad_norm": 13.145389556884766, "learning_rate": 1.3247172859450727e-05, "loss": 0.673, "step": 19380 }, { "epoch": 6.01, "grad_norm": 5.540602207183838, "learning_rate": 1.322534165829804e-05, "loss": 0.6177, "step": 19390 }, { "epoch": 6.01, "grad_norm": 9.156180381774902, "learning_rate": 1.3203510457145351e-05, "loss": 0.601, "step": 19400 }, { "epoch": 6.01, "grad_norm": 14.95980167388916, "learning_rate": 1.3181679255992666e-05, "loss": 0.992, "step": 19410 }, { "epoch": 6.01, "grad_norm": 14.796906471252441, "learning_rate": 1.3159848054839976e-05, "loss": 0.7576, "step": 19420 }, { "epoch": 6.01, "grad_norm": 14.570966720581055, "learning_rate": 1.3138016853687291e-05, "loss": 1.006, "step": 19430 }, { "epoch": 6.01, "grad_norm": 5.694338798522949, "learning_rate": 1.3116185652534605e-05, "loss": 0.9229, "step": 19440 }, { "epoch": 6.01, "grad_norm": 19.376192092895508, "learning_rate": 1.3094354451381915e-05, "loss": 1.123, "step": 19450 }, { "epoch": 6.01, "grad_norm": 12.55019474029541, "learning_rate": 1.307252325022923e-05, "loss": 1.4066, "step": 19460 }, { "epoch": 6.02, "grad_norm": 7.346521854400635, "learning_rate": 1.305069204907654e-05, "loss": 1.4577, "step": 19470 }, { "epoch": 6.02, "grad_norm": 11.374764442443848, "learning_rate": 1.3028860847923854e-05, "loss": 0.8926, "step": 19480 }, { "epoch": 6.02, "grad_norm": 5.0139288902282715, "learning_rate": 1.3007029646771165e-05, "loss": 0.4844, "step": 19490 }, { "epoch": 6.02, "grad_norm": 19.328943252563477, "learning_rate": 1.2985198445618479e-05, "loss": 0.6363, "step": 19500 }, { "epoch": 6.02, "grad_norm": 14.918673515319824, "learning_rate": 1.296336724446579e-05, "loss": 0.9846, "step": 19510 }, { "epoch": 6.02, "grad_norm": 17.552114486694336, "learning_rate": 1.2941536043313104e-05, "loss": 1.0785, "step": 19520 }, { "epoch": 6.02, "grad_norm": 2.675473928451538, "learning_rate": 1.2919704842160416e-05, "loss": 0.8533, "step": 19530 }, { "epoch": 6.02, "grad_norm": 32.22571563720703, "learning_rate": 1.289787364100773e-05, "loss": 1.0467, "step": 19540 }, { "epoch": 6.02, "grad_norm": 17.012359619140625, "learning_rate": 1.2876042439855041e-05, "loss": 1.6503, "step": 19550 }, { "epoch": 6.02, "grad_norm": 19.152799606323242, "learning_rate": 1.2854211238702355e-05, "loss": 0.9693, "step": 19560 }, { "epoch": 6.02, "grad_norm": 14.425495147705078, "learning_rate": 1.2832380037549665e-05, "loss": 0.9155, "step": 19570 }, { "epoch": 6.02, "grad_norm": 18.54916763305664, "learning_rate": 1.281054883639698e-05, "loss": 1.1365, "step": 19580 }, { "epoch": 6.02, "grad_norm": 11.173250198364258, "learning_rate": 1.278871763524429e-05, "loss": 1.2804, "step": 19590 }, { "epoch": 6.02, "grad_norm": 9.448962211608887, "learning_rate": 1.2766886434091605e-05, "loss": 1.4027, "step": 19600 }, { "epoch": 6.02, "grad_norm": 9.05895709991455, "learning_rate": 1.2745055232938915e-05, "loss": 1.0849, "step": 19610 }, { "epoch": 6.02, "grad_norm": 12.796313285827637, "learning_rate": 1.2723224031786229e-05, "loss": 0.9861, "step": 19620 }, { "epoch": 6.02, "grad_norm": 8.710278511047363, "learning_rate": 1.2701392830633544e-05, "loss": 0.6917, "step": 19630 }, { "epoch": 6.02, "grad_norm": 9.26023006439209, "learning_rate": 1.2679561629480854e-05, "loss": 0.8625, "step": 19640 }, { "epoch": 6.02, "grad_norm": 15.079761505126953, "learning_rate": 1.265773042832817e-05, "loss": 0.9741, "step": 19650 }, { "epoch": 6.02, "grad_norm": 9.078069686889648, "learning_rate": 1.263589922717548e-05, "loss": 0.8887, "step": 19660 }, { "epoch": 6.02, "grad_norm": 12.440805435180664, "learning_rate": 1.2614068026022793e-05, "loss": 1.0601, "step": 19670 }, { "epoch": 6.02, "grad_norm": 16.50569725036621, "learning_rate": 1.2592236824870105e-05, "loss": 1.0177, "step": 19680 }, { "epoch": 6.02, "grad_norm": 9.713330268859863, "learning_rate": 1.2570405623717418e-05, "loss": 1.0722, "step": 19690 }, { "epoch": 6.02, "grad_norm": 20.44059181213379, "learning_rate": 1.254857442256473e-05, "loss": 0.7604, "step": 19700 }, { "epoch": 6.02, "grad_norm": 12.339465141296387, "learning_rate": 1.2526743221412043e-05, "loss": 0.8027, "step": 19710 }, { "epoch": 6.02, "grad_norm": 4.405728816986084, "learning_rate": 1.2504912020259355e-05, "loss": 0.526, "step": 19720 }, { "epoch": 6.03, "grad_norm": 3.1987340450286865, "learning_rate": 1.2483080819106669e-05, "loss": 0.6485, "step": 19730 }, { "epoch": 6.03, "grad_norm": 14.758907318115234, "learning_rate": 1.246124961795398e-05, "loss": 1.0299, "step": 19740 }, { "epoch": 6.03, "grad_norm": 8.4321928024292, "learning_rate": 1.2439418416801294e-05, "loss": 0.8649, "step": 19750 }, { "epoch": 6.03, "grad_norm": 22.443330764770508, "learning_rate": 1.2417587215648606e-05, "loss": 0.9007, "step": 19760 }, { "epoch": 6.03, "grad_norm": 13.023670196533203, "learning_rate": 1.2395756014495919e-05, "loss": 1.0411, "step": 19770 }, { "epoch": 6.03, "grad_norm": 13.61885929107666, "learning_rate": 1.237392481334323e-05, "loss": 0.7225, "step": 19780 }, { "epoch": 6.03, "grad_norm": 12.228261947631836, "learning_rate": 1.2352093612190543e-05, "loss": 0.919, "step": 19790 }, { "epoch": 6.03, "grad_norm": 23.84542465209961, "learning_rate": 1.2330262411037856e-05, "loss": 0.9936, "step": 19800 }, { "epoch": 6.03, "grad_norm": 10.710442543029785, "learning_rate": 1.2308431209885168e-05, "loss": 0.7094, "step": 19810 }, { "epoch": 6.03, "grad_norm": 20.20848274230957, "learning_rate": 1.2286600008732481e-05, "loss": 1.2505, "step": 19820 }, { "epoch": 6.03, "grad_norm": 14.151534080505371, "learning_rate": 1.2264768807579793e-05, "loss": 0.9584, "step": 19830 }, { "epoch": 6.03, "grad_norm": 9.120428085327148, "learning_rate": 1.2242937606427107e-05, "loss": 0.8168, "step": 19840 }, { "epoch": 6.03, "grad_norm": 4.100675582885742, "learning_rate": 1.2221106405274418e-05, "loss": 0.9742, "step": 19850 }, { "epoch": 6.03, "grad_norm": 9.65624713897705, "learning_rate": 1.219927520412173e-05, "loss": 1.2749, "step": 19860 }, { "epoch": 6.03, "grad_norm": 16.977109909057617, "learning_rate": 1.2177444002969044e-05, "loss": 1.2981, "step": 19870 }, { "epoch": 6.03, "grad_norm": 11.914881706237793, "learning_rate": 1.2155612801816355e-05, "loss": 1.2447, "step": 19880 }, { "epoch": 6.03, "grad_norm": 1.9755802154541016, "learning_rate": 1.2133781600663669e-05, "loss": 0.7578, "step": 19890 }, { "epoch": 6.03, "grad_norm": 18.403108596801758, "learning_rate": 1.2111950399510982e-05, "loss": 1.1049, "step": 19900 }, { "epoch": 6.03, "grad_norm": 8.58592700958252, "learning_rate": 1.2090119198358294e-05, "loss": 0.6003, "step": 19910 }, { "epoch": 6.03, "grad_norm": 15.167259216308594, "learning_rate": 1.2068287997205608e-05, "loss": 0.698, "step": 19920 }, { "epoch": 6.03, "grad_norm": 14.04846477508545, "learning_rate": 1.204645679605292e-05, "loss": 1.0091, "step": 19930 }, { "epoch": 6.03, "grad_norm": 6.70612907409668, "learning_rate": 1.2024625594900233e-05, "loss": 0.5851, "step": 19940 }, { "epoch": 6.03, "grad_norm": 11.948375701904297, "learning_rate": 1.2002794393747545e-05, "loss": 1.0188, "step": 19950 }, { "epoch": 6.03, "grad_norm": 8.925446510314941, "learning_rate": 1.1980963192594858e-05, "loss": 0.9685, "step": 19960 }, { "epoch": 6.03, "grad_norm": 17.687366485595703, "learning_rate": 1.195913199144217e-05, "loss": 1.2155, "step": 19970 }, { "epoch": 6.04, "grad_norm": 11.535438537597656, "learning_rate": 1.1937300790289482e-05, "loss": 1.1188, "step": 19980 }, { "epoch": 6.04, "grad_norm": 4.907320976257324, "learning_rate": 1.1915469589136795e-05, "loss": 1.2147, "step": 19990 }, { "epoch": 6.04, "grad_norm": 22.2689266204834, "learning_rate": 1.1893638387984107e-05, "loss": 0.8476, "step": 20000 }, { "epoch": 6.04, "grad_norm": 0.4976605772972107, "learning_rate": 1.187180718683142e-05, "loss": 0.8115, "step": 20010 }, { "epoch": 6.04, "grad_norm": 12.891763687133789, "learning_rate": 1.1849975985678732e-05, "loss": 1.0538, "step": 20020 }, { "epoch": 6.04, "grad_norm": 3.1018552780151367, "learning_rate": 1.1828144784526044e-05, "loss": 1.1069, "step": 20030 }, { "epoch": 6.04, "grad_norm": 11.552604675292969, "learning_rate": 1.1806313583373357e-05, "loss": 1.266, "step": 20040 }, { "epoch": 6.04, "grad_norm": 11.094951629638672, "learning_rate": 1.178448238222067e-05, "loss": 0.9404, "step": 20050 }, { "epoch": 6.04, "grad_norm": 14.462247848510742, "learning_rate": 1.1762651181067983e-05, "loss": 0.7581, "step": 20060 }, { "epoch": 6.04, "grad_norm": 7.162161827087402, "learning_rate": 1.1740819979915295e-05, "loss": 0.8249, "step": 20070 }, { "epoch": 6.04, "grad_norm": 14.862044334411621, "learning_rate": 1.1718988778762608e-05, "loss": 1.1265, "step": 20080 }, { "epoch": 6.04, "grad_norm": 3.4600839614868164, "learning_rate": 1.1697157577609921e-05, "loss": 0.8085, "step": 20090 }, { "epoch": 6.04, "grad_norm": 10.59775447845459, "learning_rate": 1.1675326376457233e-05, "loss": 1.3397, "step": 20100 }, { "epoch": 6.04, "grad_norm": 3.948460817337036, "learning_rate": 1.1653495175304547e-05, "loss": 0.8698, "step": 20110 }, { "epoch": 6.04, "grad_norm": 9.43496322631836, "learning_rate": 1.1631663974151859e-05, "loss": 0.9724, "step": 20120 }, { "epoch": 6.04, "grad_norm": 8.688820838928223, "learning_rate": 1.1609832772999172e-05, "loss": 0.7916, "step": 20130 }, { "epoch": 6.04, "grad_norm": 7.970766067504883, "learning_rate": 1.1588001571846484e-05, "loss": 0.8868, "step": 20140 }, { "epoch": 6.04, "grad_norm": 3.132626533508301, "learning_rate": 1.1566170370693796e-05, "loss": 0.8783, "step": 20150 }, { "epoch": 6.04, "grad_norm": 20.212108612060547, "learning_rate": 1.1544339169541109e-05, "loss": 0.7285, "step": 20160 }, { "epoch": 6.04, "grad_norm": 13.403995513916016, "learning_rate": 1.152250796838842e-05, "loss": 0.9507, "step": 20170 }, { "epoch": 6.04, "grad_norm": 10.535225868225098, "learning_rate": 1.1500676767235734e-05, "loss": 0.8754, "step": 20180 }, { "epoch": 6.04, "grad_norm": 2.9035370349884033, "learning_rate": 1.1478845566083046e-05, "loss": 1.0473, "step": 20190 }, { "epoch": 6.04, "grad_norm": 5.57392692565918, "learning_rate": 1.145701436493036e-05, "loss": 0.9705, "step": 20200 }, { "epoch": 6.04, "grad_norm": 4.606277942657471, "learning_rate": 1.1435183163777671e-05, "loss": 1.1733, "step": 20210 }, { "epoch": 6.04, "grad_norm": 3.3531525135040283, "learning_rate": 1.1413351962624983e-05, "loss": 0.5378, "step": 20220 }, { "epoch": 6.04, "grad_norm": 16.04824447631836, "learning_rate": 1.1391520761472297e-05, "loss": 1.015, "step": 20230 }, { "epoch": 6.05, "grad_norm": 8.07902717590332, "learning_rate": 1.1369689560319608e-05, "loss": 0.855, "step": 20240 }, { "epoch": 6.05, "grad_norm": 32.14404296875, "learning_rate": 1.1347858359166922e-05, "loss": 0.9633, "step": 20250 }, { "epoch": 6.05, "grad_norm": 14.990850448608398, "learning_rate": 1.1326027158014234e-05, "loss": 0.8525, "step": 20260 }, { "epoch": 6.05, "grad_norm": 15.764461517333984, "learning_rate": 1.1304195956861547e-05, "loss": 1.4, "step": 20270 }, { "epoch": 6.05, "grad_norm": 15.154685020446777, "learning_rate": 1.128236475570886e-05, "loss": 0.9615, "step": 20280 }, { "epoch": 6.05, "grad_norm": 11.590630531311035, "learning_rate": 1.1260533554556172e-05, "loss": 0.8712, "step": 20290 }, { "epoch": 6.05, "grad_norm": 13.895352363586426, "learning_rate": 1.1238702353403486e-05, "loss": 1.1044, "step": 20300 }, { "epoch": 6.05, "grad_norm": 10.58617877960205, "learning_rate": 1.1216871152250798e-05, "loss": 0.9063, "step": 20310 }, { "epoch": 6.05, "grad_norm": 8.31143569946289, "learning_rate": 1.119503995109811e-05, "loss": 1.0207, "step": 20320 }, { "epoch": 6.05, "grad_norm": 16.123573303222656, "learning_rate": 1.1173208749945423e-05, "loss": 1.0759, "step": 20330 }, { "epoch": 6.05, "grad_norm": 5.104023456573486, "learning_rate": 1.1151377548792735e-05, "loss": 0.4777, "step": 20340 }, { "epoch": 6.05, "grad_norm": 9.543670654296875, "learning_rate": 1.1129546347640048e-05, "loss": 0.9613, "step": 20350 }, { "epoch": 6.05, "grad_norm": 0.3251878321170807, "learning_rate": 1.110771514648736e-05, "loss": 0.8304, "step": 20360 }, { "epoch": 6.05, "grad_norm": 8.986598014831543, "learning_rate": 1.1085883945334673e-05, "loss": 0.7382, "step": 20370 }, { "epoch": 6.05, "grad_norm": 6.264145851135254, "learning_rate": 1.1064052744181985e-05, "loss": 1.0875, "step": 20380 }, { "epoch": 6.05, "grad_norm": 8.829626083374023, "learning_rate": 1.1042221543029297e-05, "loss": 0.9781, "step": 20390 }, { "epoch": 6.05, "grad_norm": 16.54914093017578, "learning_rate": 1.102039034187661e-05, "loss": 1.2976, "step": 20400 }, { "epoch": 6.05, "grad_norm": 19.746023178100586, "learning_rate": 1.0998559140723922e-05, "loss": 0.8488, "step": 20410 }, { "epoch": 6.05, "grad_norm": 12.405964851379395, "learning_rate": 1.0976727939571236e-05, "loss": 1.18, "step": 20420 }, { "epoch": 6.05, "grad_norm": 9.574867248535156, "learning_rate": 1.0954896738418548e-05, "loss": 0.7254, "step": 20430 }, { "epoch": 6.05, "grad_norm": 8.595725059509277, "learning_rate": 1.0933065537265861e-05, "loss": 0.5145, "step": 20440 }, { "epoch": 6.05, "grad_norm": 7.9356279373168945, "learning_rate": 1.0911234336113174e-05, "loss": 1.1989, "step": 20450 }, { "epoch": 6.05, "grad_norm": 10.958845138549805, "learning_rate": 1.0889403134960486e-05, "loss": 1.1552, "step": 20460 }, { "epoch": 6.05, "grad_norm": 16.113046646118164, "learning_rate": 1.08675719338078e-05, "loss": 1.041, "step": 20470 }, { "epoch": 6.05, "grad_norm": 6.027303695678711, "learning_rate": 1.0845740732655112e-05, "loss": 0.8942, "step": 20480 }, { "epoch": 6.06, "grad_norm": 28.24263572692871, "learning_rate": 1.0823909531502425e-05, "loss": 1.2611, "step": 20490 }, { "epoch": 6.06, "grad_norm": 7.918956756591797, "learning_rate": 1.0802078330349737e-05, "loss": 0.6719, "step": 20500 }, { "epoch": 6.06, "grad_norm": 6.799624919891357, "learning_rate": 1.0780247129197049e-05, "loss": 1.2173, "step": 20510 }, { "epoch": 6.06, "grad_norm": 14.290821075439453, "learning_rate": 1.0758415928044362e-05, "loss": 1.0704, "step": 20520 }, { "epoch": 6.06, "grad_norm": 7.131523132324219, "learning_rate": 1.0736584726891674e-05, "loss": 1.202, "step": 20530 }, { "epoch": 6.06, "grad_norm": 1.7636173963546753, "learning_rate": 1.0714753525738987e-05, "loss": 1.3019, "step": 20540 }, { "epoch": 6.06, "grad_norm": 10.356038093566895, "learning_rate": 1.0692922324586299e-05, "loss": 1.0062, "step": 20550 }, { "epoch": 6.06, "grad_norm": 2.87709641456604, "learning_rate": 1.0671091123433611e-05, "loss": 0.6786, "step": 20560 }, { "epoch": 6.06, "grad_norm": 17.549829483032227, "learning_rate": 1.0649259922280924e-05, "loss": 1.3318, "step": 20570 }, { "epoch": 6.06, "grad_norm": 13.36909008026123, "learning_rate": 1.0627428721128236e-05, "loss": 0.8369, "step": 20580 }, { "epoch": 6.06, "grad_norm": 15.279455184936523, "learning_rate": 1.060559751997555e-05, "loss": 1.0796, "step": 20590 }, { "epoch": 6.06, "grad_norm": 15.034516334533691, "learning_rate": 1.0583766318822861e-05, "loss": 0.8869, "step": 20600 }, { "epoch": 6.06, "grad_norm": 12.585487365722656, "learning_rate": 1.0561935117670175e-05, "loss": 0.9458, "step": 20610 }, { "epoch": 6.06, "grad_norm": 14.438051223754883, "learning_rate": 1.0540103916517487e-05, "loss": 0.7751, "step": 20620 }, { "epoch": 6.06, "grad_norm": 10.37213134765625, "learning_rate": 1.0518272715364798e-05, "loss": 0.729, "step": 20630 }, { "epoch": 6.06, "grad_norm": 13.155344009399414, "learning_rate": 1.0496441514212114e-05, "loss": 0.7347, "step": 20640 }, { "epoch": 6.06, "grad_norm": 7.79851770401001, "learning_rate": 1.0474610313059425e-05, "loss": 0.9178, "step": 20650 }, { "epoch": 6.06, "grad_norm": 9.04112434387207, "learning_rate": 1.0452779111906739e-05, "loss": 1.19, "step": 20660 }, { "epoch": 6.06, "grad_norm": 0.9089367389678955, "learning_rate": 1.043094791075405e-05, "loss": 0.4832, "step": 20670 }, { "epoch": 6.06, "grad_norm": 9.40851879119873, "learning_rate": 1.0409116709601362e-05, "loss": 0.9592, "step": 20680 }, { "epoch": 6.06, "grad_norm": 10.140167236328125, "learning_rate": 1.0387285508448676e-05, "loss": 0.9616, "step": 20690 }, { "epoch": 6.06, "grad_norm": 22.334558486938477, "learning_rate": 1.0365454307295988e-05, "loss": 0.801, "step": 20700 }, { "epoch": 6.06, "grad_norm": 14.993794441223145, "learning_rate": 1.0343623106143301e-05, "loss": 0.8228, "step": 20710 }, { "epoch": 6.06, "grad_norm": 13.428932189941406, "learning_rate": 1.0321791904990613e-05, "loss": 1.0946, "step": 20720 }, { "epoch": 6.06, "grad_norm": 5.753325462341309, "learning_rate": 1.0299960703837926e-05, "loss": 0.631, "step": 20730 }, { "epoch": 6.06, "grad_norm": 5.023742198944092, "learning_rate": 1.0278129502685238e-05, "loss": 0.8829, "step": 20740 }, { "epoch": 6.07, "grad_norm": 17.32583999633789, "learning_rate": 1.025629830153255e-05, "loss": 1.0119, "step": 20750 }, { "epoch": 6.07, "grad_norm": 0.9809181690216064, "learning_rate": 1.0234467100379863e-05, "loss": 0.7042, "step": 20760 }, { "epoch": 6.07, "grad_norm": 9.597783088684082, "learning_rate": 1.0212635899227175e-05, "loss": 1.143, "step": 20770 }, { "epoch": 6.07, "grad_norm": 6.851977348327637, "learning_rate": 1.0190804698074489e-05, "loss": 0.5702, "step": 20780 }, { "epoch": 6.07, "grad_norm": 9.077513694763184, "learning_rate": 1.01689734969218e-05, "loss": 0.8957, "step": 20790 }, { "epoch": 6.07, "grad_norm": 5.540928363800049, "learning_rate": 1.0147142295769112e-05, "loss": 0.6874, "step": 20800 }, { "epoch": 6.07, "grad_norm": 12.658646583557129, "learning_rate": 1.0125311094616426e-05, "loss": 0.7236, "step": 20810 }, { "epoch": 6.07, "grad_norm": 11.710670471191406, "learning_rate": 1.010347989346374e-05, "loss": 0.5816, "step": 20820 }, { "epoch": 6.07, "grad_norm": 18.188371658325195, "learning_rate": 1.0081648692311053e-05, "loss": 0.7323, "step": 20830 }, { "epoch": 6.07, "grad_norm": 13.488285064697266, "learning_rate": 1.0059817491158364e-05, "loss": 0.7705, "step": 20840 }, { "epoch": 6.07, "grad_norm": 12.554931640625, "learning_rate": 1.0037986290005676e-05, "loss": 0.9455, "step": 20850 }, { "epoch": 6.07, "grad_norm": 13.885579109191895, "learning_rate": 1.001615508885299e-05, "loss": 0.9845, "step": 20860 }, { "epoch": 6.07, "grad_norm": 9.108217239379883, "learning_rate": 9.994323887700302e-06, "loss": 0.7895, "step": 20870 }, { "epoch": 6.07, "grad_norm": 16.092084884643555, "learning_rate": 9.972492686547615e-06, "loss": 0.8849, "step": 20880 }, { "epoch": 6.07, "grad_norm": 16.532115936279297, "learning_rate": 9.950661485394927e-06, "loss": 0.9726, "step": 20890 }, { "epoch": 6.07, "grad_norm": 22.5228271484375, "learning_rate": 9.92883028424224e-06, "loss": 1.0253, "step": 20900 }, { "epoch": 6.07, "grad_norm": 18.020206451416016, "learning_rate": 9.906999083089552e-06, "loss": 1.4122, "step": 20910 }, { "epoch": 6.07, "grad_norm": 17.558069229125977, "learning_rate": 9.885167881936864e-06, "loss": 0.8031, "step": 20920 }, { "epoch": 6.07, "grad_norm": 4.401589393615723, "learning_rate": 9.863336680784177e-06, "loss": 0.8285, "step": 20930 }, { "epoch": 6.07, "grad_norm": 1.5896601676940918, "learning_rate": 9.841505479631489e-06, "loss": 1.1535, "step": 20940 }, { "epoch": 6.07, "grad_norm": 7.720593452453613, "learning_rate": 9.819674278478803e-06, "loss": 0.9335, "step": 20950 }, { "epoch": 6.07, "grad_norm": 4.146795749664307, "learning_rate": 9.797843077326114e-06, "loss": 0.84, "step": 20960 }, { "epoch": 6.07, "grad_norm": 13.92165470123291, "learning_rate": 9.776011876173428e-06, "loss": 0.7417, "step": 20970 }, { "epoch": 6.07, "grad_norm": 10.809229850769043, "learning_rate": 9.75418067502074e-06, "loss": 1.2793, "step": 20980 }, { "epoch": 6.07, "grad_norm": 15.442631721496582, "learning_rate": 9.732349473868051e-06, "loss": 0.6351, "step": 20990 }, { "epoch": 6.08, "grad_norm": 14.932209968566895, "learning_rate": 9.710518272715365e-06, "loss": 1.0208, "step": 21000 }, { "epoch": 6.08, "grad_norm": 10.870864868164062, "learning_rate": 9.688687071562678e-06, "loss": 0.9546, "step": 21010 }, { "epoch": 6.08, "grad_norm": 5.001415252685547, "learning_rate": 9.66685587040999e-06, "loss": 0.7949, "step": 21020 }, { "epoch": 6.08, "grad_norm": 6.89686393737793, "learning_rate": 9.645024669257304e-06, "loss": 0.8683, "step": 21030 }, { "epoch": 6.08, "grad_norm": 4.120835304260254, "learning_rate": 9.623193468104615e-06, "loss": 0.8381, "step": 21040 }, { "epoch": 6.08, "grad_norm": 21.621309280395508, "learning_rate": 9.601362266951929e-06, "loss": 0.9873, "step": 21050 }, { "epoch": 6.08, "grad_norm": 10.413346290588379, "learning_rate": 9.57953106579924e-06, "loss": 0.9357, "step": 21060 }, { "epoch": 6.08, "grad_norm": 2.5423777103424072, "learning_rate": 9.557699864646554e-06, "loss": 0.5701, "step": 21070 }, { "epoch": 6.08, "grad_norm": 14.804628372192383, "learning_rate": 9.535868663493866e-06, "loss": 0.7334, "step": 21080 }, { "epoch": 6.08, "grad_norm": 1.4368655681610107, "learning_rate": 9.514037462341178e-06, "loss": 0.7756, "step": 21090 }, { "epoch": 6.08, "grad_norm": 13.961825370788574, "learning_rate": 9.492206261188491e-06, "loss": 0.9304, "step": 21100 }, { "epoch": 6.08, "grad_norm": 2.898294448852539, "learning_rate": 9.470375060035803e-06, "loss": 1.1231, "step": 21110 }, { "epoch": 6.08, "grad_norm": 12.19017505645752, "learning_rate": 9.448543858883116e-06, "loss": 1.0204, "step": 21120 }, { "epoch": 6.08, "grad_norm": 15.401390075683594, "learning_rate": 9.426712657730428e-06, "loss": 0.8214, "step": 21130 }, { "epoch": 6.08, "grad_norm": 1.3338301181793213, "learning_rate": 9.404881456577742e-06, "loss": 0.8738, "step": 21140 }, { "epoch": 6.08, "grad_norm": 5.401437759399414, "learning_rate": 9.383050255425053e-06, "loss": 0.9233, "step": 21150 }, { "epoch": 6.08, "grad_norm": 2.707165002822876, "learning_rate": 9.361219054272365e-06, "loss": 1.1428, "step": 21160 }, { "epoch": 6.08, "grad_norm": 11.466598510742188, "learning_rate": 9.339387853119679e-06, "loss": 1.1975, "step": 21170 }, { "epoch": 6.08, "grad_norm": 9.304337501525879, "learning_rate": 9.31755665196699e-06, "loss": 0.695, "step": 21180 }, { "epoch": 6.08, "grad_norm": 13.315038681030273, "learning_rate": 9.295725450814306e-06, "loss": 0.7036, "step": 21190 }, { "epoch": 6.08, "grad_norm": 19.887859344482422, "learning_rate": 9.273894249661617e-06, "loss": 1.5324, "step": 21200 }, { "epoch": 6.08, "grad_norm": 21.062702178955078, "learning_rate": 9.25206304850893e-06, "loss": 1.1726, "step": 21210 }, { "epoch": 6.08, "grad_norm": 28.665861129760742, "learning_rate": 9.230231847356243e-06, "loss": 1.1265, "step": 21220 }, { "epoch": 6.08, "grad_norm": 4.124011993408203, "learning_rate": 9.208400646203555e-06, "loss": 0.8292, "step": 21230 }, { "epoch": 6.08, "grad_norm": 14.323921203613281, "learning_rate": 9.186569445050868e-06, "loss": 0.6922, "step": 21240 }, { "epoch": 6.09, "grad_norm": 11.402443885803223, "learning_rate": 9.16473824389818e-06, "loss": 0.9509, "step": 21250 }, { "epoch": 6.09, "grad_norm": 17.296323776245117, "learning_rate": 9.142907042745493e-06, "loss": 1.1408, "step": 21260 }, { "epoch": 6.09, "grad_norm": 28.33347511291504, "learning_rate": 9.121075841592805e-06, "loss": 1.2543, "step": 21270 }, { "epoch": 6.09, "grad_norm": 12.737051963806152, "learning_rate": 9.099244640440117e-06, "loss": 1.0212, "step": 21280 }, { "epoch": 6.09, "grad_norm": 10.641096115112305, "learning_rate": 9.07741343928743e-06, "loss": 0.8657, "step": 21290 }, { "epoch": 6.09, "grad_norm": 14.535057067871094, "learning_rate": 9.055582238134742e-06, "loss": 0.885, "step": 21300 }, { "epoch": 6.09, "grad_norm": 4.204330921173096, "learning_rate": 9.033751036982056e-06, "loss": 1.3415, "step": 21310 }, { "epoch": 6.09, "grad_norm": 6.733258247375488, "learning_rate": 9.011919835829367e-06, "loss": 1.5911, "step": 21320 }, { "epoch": 6.09, "grad_norm": 9.88444995880127, "learning_rate": 8.990088634676679e-06, "loss": 1.0954, "step": 21330 }, { "epoch": 6.09, "grad_norm": 8.323349952697754, "learning_rate": 8.968257433523993e-06, "loss": 0.8466, "step": 21340 }, { "epoch": 6.09, "grad_norm": 17.35368537902832, "learning_rate": 8.946426232371304e-06, "loss": 1.2521, "step": 21350 }, { "epoch": 6.09, "grad_norm": 8.413402557373047, "learning_rate": 8.924595031218618e-06, "loss": 1.2708, "step": 21360 }, { "epoch": 6.09, "grad_norm": 9.765074729919434, "learning_rate": 8.902763830065931e-06, "loss": 0.7091, "step": 21370 }, { "epoch": 6.09, "grad_norm": 9.898675918579102, "learning_rate": 8.880932628913243e-06, "loss": 0.9043, "step": 21380 }, { "epoch": 6.09, "grad_norm": 22.1824951171875, "learning_rate": 8.859101427760557e-06, "loss": 1.0121, "step": 21390 }, { "epoch": 6.09, "grad_norm": 7.850949764251709, "learning_rate": 8.837270226607868e-06, "loss": 1.0129, "step": 21400 }, { "epoch": 6.09, "grad_norm": 17.360538482666016, "learning_rate": 8.815439025455182e-06, "loss": 1.1927, "step": 21410 }, { "epoch": 6.09, "grad_norm": 8.161169052124023, "learning_rate": 8.793607824302494e-06, "loss": 0.8799, "step": 21420 }, { "epoch": 6.09, "grad_norm": 10.52309799194336, "learning_rate": 8.771776623149807e-06, "loss": 0.6986, "step": 21430 }, { "epoch": 6.09, "grad_norm": 16.455080032348633, "learning_rate": 8.749945421997119e-06, "loss": 1.0261, "step": 21440 }, { "epoch": 6.09, "grad_norm": 21.763277053833008, "learning_rate": 8.72811422084443e-06, "loss": 1.1537, "step": 21450 }, { "epoch": 6.09, "grad_norm": 13.489154815673828, "learning_rate": 8.706283019691744e-06, "loss": 0.9482, "step": 21460 }, { "epoch": 6.09, "grad_norm": 9.123115539550781, "learning_rate": 8.684451818539056e-06, "loss": 0.9397, "step": 21470 }, { "epoch": 6.09, "grad_norm": 12.124760627746582, "learning_rate": 8.66262061738637e-06, "loss": 1.0483, "step": 21480 }, { "epoch": 6.09, "grad_norm": 14.57143497467041, "learning_rate": 8.640789416233681e-06, "loss": 1.4178, "step": 21490 }, { "epoch": 6.09, "grad_norm": 8.894169807434082, "learning_rate": 8.618958215080995e-06, "loss": 1.1189, "step": 21500 }, { "epoch": 6.1, "grad_norm": 17.721481323242188, "learning_rate": 8.597127013928306e-06, "loss": 0.4566, "step": 21510 }, { "epoch": 6.1, "grad_norm": 23.265424728393555, "learning_rate": 8.575295812775618e-06, "loss": 0.7617, "step": 21520 }, { "epoch": 6.1, "grad_norm": 12.722051620483398, "learning_rate": 8.553464611622932e-06, "loss": 1.219, "step": 21530 }, { "epoch": 6.1, "grad_norm": 9.28855037689209, "learning_rate": 8.531633410470244e-06, "loss": 0.857, "step": 21540 }, { "epoch": 6.1, "grad_norm": 14.718110084533691, "learning_rate": 8.509802209317557e-06, "loss": 0.8637, "step": 21550 }, { "epoch": 6.1, "grad_norm": 2.2475578784942627, "learning_rate": 8.48797100816487e-06, "loss": 0.7759, "step": 21560 }, { "epoch": 6.1, "grad_norm": 15.702960014343262, "learning_rate": 8.466139807012182e-06, "loss": 1.2992, "step": 21570 }, { "epoch": 6.1, "grad_norm": 10.665801048278809, "learning_rate": 8.444308605859496e-06, "loss": 0.9768, "step": 21580 }, { "epoch": 6.1, "grad_norm": 29.55537986755371, "learning_rate": 8.422477404706808e-06, "loss": 0.9243, "step": 21590 }, { "epoch": 6.1, "grad_norm": 14.316550254821777, "learning_rate": 8.400646203554121e-06, "loss": 1.2148, "step": 21600 }, { "epoch": 6.1, "grad_norm": 8.909773826599121, "learning_rate": 8.378815002401433e-06, "loss": 0.7115, "step": 21610 }, { "epoch": 6.1, "grad_norm": 8.9246244430542, "learning_rate": 8.356983801248745e-06, "loss": 1.1205, "step": 21620 }, { "epoch": 6.1, "grad_norm": 12.544243812561035, "learning_rate": 8.335152600096058e-06, "loss": 0.7842, "step": 21630 }, { "epoch": 6.1, "grad_norm": 11.330750465393066, "learning_rate": 8.31332139894337e-06, "loss": 0.8645, "step": 21640 }, { "epoch": 6.1, "grad_norm": 14.815391540527344, "learning_rate": 8.291490197790683e-06, "loss": 0.6481, "step": 21650 }, { "epoch": 6.1, "grad_norm": 15.599739074707031, "learning_rate": 8.269658996637995e-06, "loss": 0.7738, "step": 21660 }, { "epoch": 6.1, "grad_norm": 10.524014472961426, "learning_rate": 8.247827795485309e-06, "loss": 1.3818, "step": 21670 }, { "epoch": 6.1, "grad_norm": 21.113872528076172, "learning_rate": 8.22599659433262e-06, "loss": 0.956, "step": 21680 }, { "epoch": 6.1, "grad_norm": 9.72291088104248, "learning_rate": 8.204165393179932e-06, "loss": 0.7706, "step": 21690 }, { "epoch": 6.1, "grad_norm": 6.975581169128418, "learning_rate": 8.182334192027246e-06, "loss": 0.6685, "step": 21700 }, { "epoch": 6.1, "grad_norm": 3.5132322311401367, "learning_rate": 8.160502990874557e-06, "loss": 1.1738, "step": 21710 }, { "epoch": 6.1, "grad_norm": 9.985962867736816, "learning_rate": 8.138671789721871e-06, "loss": 0.7941, "step": 21720 }, { "epoch": 6.1, "grad_norm": 17.347122192382812, "learning_rate": 8.116840588569183e-06, "loss": 0.786, "step": 21730 }, { "epoch": 6.1, "grad_norm": 4.003395080566406, "learning_rate": 8.095009387416496e-06, "loss": 0.9184, "step": 21740 }, { "epoch": 6.1, "grad_norm": 27.621301651000977, "learning_rate": 8.07317818626381e-06, "loss": 0.6072, "step": 21750 }, { "epoch": 6.11, "grad_norm": 29.72891616821289, "learning_rate": 8.051346985111121e-06, "loss": 0.9557, "step": 21760 }, { "epoch": 6.11, "grad_norm": 16.735626220703125, "learning_rate": 8.029515783958435e-06, "loss": 0.8894, "step": 21770 }, { "epoch": 6.11, "grad_norm": 15.984378814697266, "learning_rate": 8.007684582805747e-06, "loss": 0.7223, "step": 21780 }, { "epoch": 6.11, "grad_norm": 14.316971778869629, "learning_rate": 7.985853381653058e-06, "loss": 0.947, "step": 21790 }, { "epoch": 6.11, "grad_norm": 12.876729011535645, "learning_rate": 7.964022180500372e-06, "loss": 1.3639, "step": 21800 }, { "epoch": 6.11, "grad_norm": 12.506220817565918, "learning_rate": 7.942190979347684e-06, "loss": 0.5414, "step": 21810 }, { "epoch": 6.11, "grad_norm": 13.942744255065918, "learning_rate": 7.920359778194997e-06, "loss": 0.9607, "step": 21820 }, { "epoch": 6.11, "grad_norm": 12.619991302490234, "learning_rate": 7.898528577042309e-06, "loss": 0.6879, "step": 21830 }, { "epoch": 6.11, "grad_norm": 10.627518653869629, "learning_rate": 7.876697375889622e-06, "loss": 1.0828, "step": 21840 }, { "epoch": 6.11, "grad_norm": 11.243753433227539, "learning_rate": 7.854866174736934e-06, "loss": 1.0331, "step": 21850 }, { "epoch": 6.11, "grad_norm": 0.7537599802017212, "learning_rate": 7.833034973584246e-06, "loss": 0.9732, "step": 21860 }, { "epoch": 6.11, "grad_norm": 0.6372535228729248, "learning_rate": 7.81120377243156e-06, "loss": 0.837, "step": 21870 }, { "epoch": 6.11, "grad_norm": 15.871705055236816, "learning_rate": 7.789372571278871e-06, "loss": 0.9987, "step": 21880 }, { "epoch": 6.11, "grad_norm": 13.437572479248047, "learning_rate": 7.767541370126185e-06, "loss": 0.9789, "step": 21890 }, { "epoch": 6.11, "grad_norm": 12.20001220703125, "learning_rate": 7.745710168973497e-06, "loss": 0.7814, "step": 21900 }, { "epoch": 6.11, "grad_norm": 3.191204309463501, "learning_rate": 7.72387896782081e-06, "loss": 0.9588, "step": 21910 }, { "epoch": 6.11, "grad_norm": 16.17407989501953, "learning_rate": 7.702047766668122e-06, "loss": 0.9378, "step": 21920 }, { "epoch": 6.11, "grad_norm": 9.865918159484863, "learning_rate": 7.680216565515435e-06, "loss": 0.9194, "step": 21930 }, { "epoch": 6.11, "grad_norm": 9.503559112548828, "learning_rate": 7.658385364362749e-06, "loss": 1.0729, "step": 21940 }, { "epoch": 6.11, "grad_norm": 10.614211082458496, "learning_rate": 7.63655416321006e-06, "loss": 0.6172, "step": 21950 }, { "epoch": 6.11, "grad_norm": 23.543678283691406, "learning_rate": 7.614722962057373e-06, "loss": 0.672, "step": 21960 }, { "epoch": 6.11, "grad_norm": 12.802196502685547, "learning_rate": 7.592891760904686e-06, "loss": 0.7566, "step": 21970 }, { "epoch": 6.11, "grad_norm": 13.475851058959961, "learning_rate": 7.571060559751998e-06, "loss": 0.885, "step": 21980 }, { "epoch": 6.11, "grad_norm": 13.462224960327148, "learning_rate": 7.549229358599311e-06, "loss": 0.9038, "step": 21990 }, { "epoch": 6.11, "grad_norm": 9.254766464233398, "learning_rate": 7.527398157446623e-06, "loss": 0.8986, "step": 22000 }, { "epoch": 6.11, "grad_norm": 14.946699142456055, "learning_rate": 7.5055669562939354e-06, "loss": 1.1127, "step": 22010 }, { "epoch": 6.12, "grad_norm": 5.1692118644714355, "learning_rate": 7.483735755141248e-06, "loss": 1.0801, "step": 22020 }, { "epoch": 6.12, "grad_norm": 14.043323516845703, "learning_rate": 7.461904553988561e-06, "loss": 0.8564, "step": 22030 }, { "epoch": 6.12, "grad_norm": 14.317754745483398, "learning_rate": 7.440073352835873e-06, "loss": 1.1019, "step": 22040 }, { "epoch": 6.12, "grad_norm": 21.511520385742188, "learning_rate": 7.418242151683186e-06, "loss": 0.9223, "step": 22050 }, { "epoch": 6.12, "grad_norm": 13.267854690551758, "learning_rate": 7.396410950530499e-06, "loss": 0.9343, "step": 22060 }, { "epoch": 6.12, "grad_norm": 4.049220561981201, "learning_rate": 7.37457974937781e-06, "loss": 1.1205, "step": 22070 }, { "epoch": 6.12, "grad_norm": 19.73366928100586, "learning_rate": 7.352748548225123e-06, "loss": 1.5657, "step": 22080 }, { "epoch": 6.12, "grad_norm": 18.005531311035156, "learning_rate": 7.330917347072436e-06, "loss": 1.0449, "step": 22090 }, { "epoch": 6.12, "grad_norm": 7.735836982727051, "learning_rate": 7.309086145919748e-06, "loss": 0.9682, "step": 22100 }, { "epoch": 6.12, "grad_norm": 24.13368797302246, "learning_rate": 7.287254944767062e-06, "loss": 1.0607, "step": 22110 }, { "epoch": 6.12, "grad_norm": 1.9197334051132202, "learning_rate": 7.265423743614374e-06, "loss": 0.9083, "step": 22120 }, { "epoch": 6.12, "grad_norm": 9.305731773376465, "learning_rate": 7.243592542461687e-06, "loss": 0.7475, "step": 22130 }, { "epoch": 6.12, "grad_norm": 16.268892288208008, "learning_rate": 7.221761341309e-06, "loss": 0.9091, "step": 22140 }, { "epoch": 6.12, "grad_norm": 9.121870040893555, "learning_rate": 7.199930140156312e-06, "loss": 1.0766, "step": 22150 }, { "epoch": 6.12, "grad_norm": 9.69632625579834, "learning_rate": 7.178098939003625e-06, "loss": 0.8342, "step": 22160 }, { "epoch": 6.12, "grad_norm": 8.912117958068848, "learning_rate": 7.1562677378509375e-06, "loss": 0.8764, "step": 22170 }, { "epoch": 6.12, "grad_norm": 12.770004272460938, "learning_rate": 7.134436536698249e-06, "loss": 0.9334, "step": 22180 }, { "epoch": 6.12, "grad_norm": 4.163743495941162, "learning_rate": 7.112605335545562e-06, "loss": 0.8597, "step": 22190 }, { "epoch": 6.12, "grad_norm": 24.231935501098633, "learning_rate": 7.0907741343928746e-06, "loss": 0.7762, "step": 22200 }, { "epoch": 6.12, "grad_norm": 13.032087326049805, "learning_rate": 7.068942933240187e-06, "loss": 0.6841, "step": 22210 }, { "epoch": 6.12, "grad_norm": 6.7593607902526855, "learning_rate": 7.0471117320875e-06, "loss": 0.7632, "step": 22220 }, { "epoch": 6.12, "grad_norm": 6.953234672546387, "learning_rate": 7.0252805309348124e-06, "loss": 1.0193, "step": 22230 }, { "epoch": 6.12, "grad_norm": 20.670001983642578, "learning_rate": 7.003449329782124e-06, "loss": 0.645, "step": 22240 }, { "epoch": 6.12, "grad_norm": 12.630159378051758, "learning_rate": 6.981618128629437e-06, "loss": 1.3178, "step": 22250 }, { "epoch": 6.12, "grad_norm": 10.160954475402832, "learning_rate": 6.9597869274767495e-06, "loss": 0.9696, "step": 22260 }, { "epoch": 6.12, "eval_accuracy": 0.6732673267326733, "eval_loss": 1.2086248397827148, "eval_runtime": 377.8847, "eval_samples_per_second": 11.226, "eval_steps_per_second": 2.808, "step": 22267 }, { "epoch": 7.0, "grad_norm": 24.566938400268555, "learning_rate": 6.937955726324062e-06, "loss": 0.8656, "step": 22270 }, { "epoch": 7.0, "grad_norm": 12.866881370544434, "learning_rate": 6.916124525171375e-06, "loss": 0.7449, "step": 22280 }, { "epoch": 7.0, "grad_norm": 12.54931640625, "learning_rate": 6.894293324018687e-06, "loss": 0.967, "step": 22290 }, { "epoch": 7.0, "grad_norm": 4.171769142150879, "learning_rate": 6.872462122866001e-06, "loss": 0.4451, "step": 22300 }, { "epoch": 7.0, "grad_norm": 21.893814086914062, "learning_rate": 6.8506309217133135e-06, "loss": 1.1437, "step": 22310 }, { "epoch": 7.0, "grad_norm": 11.064529418945312, "learning_rate": 6.828799720560626e-06, "loss": 0.5414, "step": 22320 }, { "epoch": 7.0, "grad_norm": 6.644165515899658, "learning_rate": 6.806968519407939e-06, "loss": 0.9183, "step": 22330 }, { "epoch": 7.0, "grad_norm": 27.031402587890625, "learning_rate": 6.785137318255251e-06, "loss": 1.3573, "step": 22340 }, { "epoch": 7.0, "grad_norm": 7.331912994384766, "learning_rate": 6.763306117102563e-06, "loss": 0.798, "step": 22350 }, { "epoch": 7.0, "grad_norm": 19.093603134155273, "learning_rate": 6.741474915949876e-06, "loss": 0.7462, "step": 22360 }, { "epoch": 7.0, "grad_norm": 0.9792558550834656, "learning_rate": 6.719643714797188e-06, "loss": 0.8659, "step": 22370 }, { "epoch": 7.0, "grad_norm": 15.488015174865723, "learning_rate": 6.697812513644501e-06, "loss": 1.1279, "step": 22380 }, { "epoch": 7.0, "grad_norm": 4.714751720428467, "learning_rate": 6.675981312491814e-06, "loss": 1.0684, "step": 22390 }, { "epoch": 7.01, "grad_norm": 2.470669746398926, "learning_rate": 6.654150111339126e-06, "loss": 0.7873, "step": 22400 }, { "epoch": 7.01, "grad_norm": 14.077079772949219, "learning_rate": 6.632318910186439e-06, "loss": 0.7347, "step": 22410 }, { "epoch": 7.01, "grad_norm": 1.516126036643982, "learning_rate": 6.610487709033751e-06, "loss": 1.017, "step": 22420 }, { "epoch": 7.01, "grad_norm": 21.819738388061523, "learning_rate": 6.588656507881063e-06, "loss": 1.0878, "step": 22430 }, { "epoch": 7.01, "grad_norm": 13.667936325073242, "learning_rate": 6.566825306728376e-06, "loss": 0.7215, "step": 22440 }, { "epoch": 7.01, "grad_norm": 11.390900611877441, "learning_rate": 6.544994105575689e-06, "loss": 0.7695, "step": 22450 }, { "epoch": 7.01, "grad_norm": 15.198098182678223, "learning_rate": 6.523162904423001e-06, "loss": 0.7556, "step": 22460 }, { "epoch": 7.01, "grad_norm": 10.800911903381348, "learning_rate": 6.501331703270314e-06, "loss": 1.0945, "step": 22470 }, { "epoch": 7.01, "grad_norm": 3.1775708198547363, "learning_rate": 6.479500502117627e-06, "loss": 0.7259, "step": 22480 }, { "epoch": 7.01, "grad_norm": 22.255386352539062, "learning_rate": 6.45766930096494e-06, "loss": 0.9007, "step": 22490 }, { "epoch": 7.01, "grad_norm": 11.408926010131836, "learning_rate": 6.435838099812253e-06, "loss": 1.0859, "step": 22500 }, { "epoch": 7.01, "grad_norm": 7.062247276306152, "learning_rate": 6.414006898659565e-06, "loss": 1.0549, "step": 22510 }, { "epoch": 7.01, "grad_norm": 6.482881546020508, "learning_rate": 6.392175697506878e-06, "loss": 0.7321, "step": 22520 }, { "epoch": 7.01, "grad_norm": 8.553717613220215, "learning_rate": 6.37034449635419e-06, "loss": 0.913, "step": 22530 }, { "epoch": 7.01, "grad_norm": 18.774763107299805, "learning_rate": 6.348513295201502e-06, "loss": 1.4235, "step": 22540 }, { "epoch": 7.01, "grad_norm": 39.36065673828125, "learning_rate": 6.326682094048815e-06, "loss": 0.9129, "step": 22550 }, { "epoch": 7.01, "grad_norm": 12.211271286010742, "learning_rate": 6.3048508928961275e-06, "loss": 1.1178, "step": 22560 }, { "epoch": 7.01, "grad_norm": 12.562443733215332, "learning_rate": 6.28301969174344e-06, "loss": 0.7438, "step": 22570 }, { "epoch": 7.01, "grad_norm": 8.817444801330566, "learning_rate": 6.261188490590753e-06, "loss": 0.8186, "step": 22580 }, { "epoch": 7.01, "grad_norm": 17.169815063476562, "learning_rate": 6.2393572894380654e-06, "loss": 1.2084, "step": 22590 }, { "epoch": 7.01, "grad_norm": 9.290719032287598, "learning_rate": 6.217526088285377e-06, "loss": 0.8715, "step": 22600 }, { "epoch": 7.01, "grad_norm": 8.68436050415039, "learning_rate": 6.19569488713269e-06, "loss": 0.6677, "step": 22610 }, { "epoch": 7.01, "grad_norm": 9.004079818725586, "learning_rate": 6.173863685980003e-06, "loss": 1.0963, "step": 22620 }, { "epoch": 7.01, "grad_norm": 7.435941219329834, "learning_rate": 6.152032484827316e-06, "loss": 0.9219, "step": 22630 }, { "epoch": 7.01, "grad_norm": 3.747330665588379, "learning_rate": 6.130201283674629e-06, "loss": 1.1059, "step": 22640 }, { "epoch": 7.02, "grad_norm": 17.886228561401367, "learning_rate": 6.10837008252194e-06, "loss": 0.6888, "step": 22650 }, { "epoch": 7.02, "grad_norm": 13.061933517456055, "learning_rate": 6.086538881369253e-06, "loss": 0.8836, "step": 22660 }, { "epoch": 7.02, "grad_norm": 16.27376365661621, "learning_rate": 6.064707680216566e-06, "loss": 1.0022, "step": 22670 }, { "epoch": 7.02, "grad_norm": 21.48235511779785, "learning_rate": 6.042876479063878e-06, "loss": 1.1584, "step": 22680 }, { "epoch": 7.02, "grad_norm": 10.399629592895508, "learning_rate": 6.021045277911191e-06, "loss": 0.9695, "step": 22690 }, { "epoch": 7.02, "grad_norm": 16.379108428955078, "learning_rate": 5.9992140767585035e-06, "loss": 0.8027, "step": 22700 }, { "epoch": 7.02, "grad_norm": 11.35714054107666, "learning_rate": 5.977382875605816e-06, "loss": 0.8776, "step": 22710 }, { "epoch": 7.02, "grad_norm": 14.8698091506958, "learning_rate": 5.955551674453129e-06, "loss": 0.6826, "step": 22720 }, { "epoch": 7.02, "grad_norm": 17.39885711669922, "learning_rate": 5.933720473300441e-06, "loss": 0.7544, "step": 22730 }, { "epoch": 7.02, "grad_norm": 21.03622817993164, "learning_rate": 5.911889272147754e-06, "loss": 1.3856, "step": 22740 }, { "epoch": 7.02, "grad_norm": 14.05392074584961, "learning_rate": 5.890058070995067e-06, "loss": 1.1597, "step": 22750 }, { "epoch": 7.02, "grad_norm": 5.29230260848999, "learning_rate": 5.868226869842379e-06, "loss": 0.707, "step": 22760 }, { "epoch": 7.02, "grad_norm": 13.723396301269531, "learning_rate": 5.846395668689691e-06, "loss": 0.6267, "step": 22770 }, { "epoch": 7.02, "grad_norm": 16.502593994140625, "learning_rate": 5.824564467537004e-06, "loss": 0.7602, "step": 22780 }, { "epoch": 7.02, "grad_norm": 1.0553020238876343, "learning_rate": 5.802733266384316e-06, "loss": 0.9847, "step": 22790 }, { "epoch": 7.02, "grad_norm": 13.689431190490723, "learning_rate": 5.78090206523163e-06, "loss": 0.8923, "step": 22800 }, { "epoch": 7.02, "grad_norm": 10.34310245513916, "learning_rate": 5.7590708640789424e-06, "loss": 0.7253, "step": 22810 }, { "epoch": 7.02, "grad_norm": 8.613569259643555, "learning_rate": 5.737239662926255e-06, "loss": 0.715, "step": 22820 }, { "epoch": 7.02, "grad_norm": 11.60677719116211, "learning_rate": 5.715408461773567e-06, "loss": 0.7096, "step": 22830 }, { "epoch": 7.02, "grad_norm": 21.794721603393555, "learning_rate": 5.6935772606208795e-06, "loss": 0.8873, "step": 22840 }, { "epoch": 7.02, "grad_norm": 12.340702056884766, "learning_rate": 5.671746059468192e-06, "loss": 0.9329, "step": 22850 }, { "epoch": 7.02, "grad_norm": 15.118240356445312, "learning_rate": 5.649914858315505e-06, "loss": 1.3958, "step": 22860 }, { "epoch": 7.02, "grad_norm": 15.176989555358887, "learning_rate": 5.628083657162817e-06, "loss": 0.9516, "step": 22870 }, { "epoch": 7.02, "grad_norm": 10.92980670928955, "learning_rate": 5.60625245601013e-06, "loss": 1.2174, "step": 22880 }, { "epoch": 7.02, "grad_norm": 11.025736808776855, "learning_rate": 5.584421254857442e-06, "loss": 1.0819, "step": 22890 }, { "epoch": 7.02, "grad_norm": 23.223430633544922, "learning_rate": 5.562590053704755e-06, "loss": 0.8388, "step": 22900 }, { "epoch": 7.03, "grad_norm": 1.704086184501648, "learning_rate": 5.540758852552068e-06, "loss": 1.0151, "step": 22910 }, { "epoch": 7.03, "grad_norm": 20.710552215576172, "learning_rate": 5.5189276513993805e-06, "loss": 0.6974, "step": 22920 }, { "epoch": 7.03, "grad_norm": 11.450077056884766, "learning_rate": 5.497096450246693e-06, "loss": 1.0273, "step": 22930 }, { "epoch": 7.03, "grad_norm": 9.025691986083984, "learning_rate": 5.475265249094006e-06, "loss": 0.4062, "step": 22940 }, { "epoch": 7.03, "grad_norm": 3.4410812854766846, "learning_rate": 5.4534340479413176e-06, "loss": 1.2953, "step": 22950 }, { "epoch": 7.03, "grad_norm": 7.897343635559082, "learning_rate": 5.43160284678863e-06, "loss": 0.7313, "step": 22960 }, { "epoch": 7.03, "grad_norm": 24.02825164794922, "learning_rate": 5.409771645635943e-06, "loss": 0.8518, "step": 22970 }, { "epoch": 7.03, "grad_norm": 17.86899185180664, "learning_rate": 5.3879404444832555e-06, "loss": 1.0217, "step": 22980 }, { "epoch": 7.03, "grad_norm": 21.70557403564453, "learning_rate": 5.366109243330569e-06, "loss": 0.8316, "step": 22990 }, { "epoch": 7.03, "grad_norm": 14.955513000488281, "learning_rate": 5.344278042177881e-06, "loss": 1.4034, "step": 23000 }, { "epoch": 7.03, "grad_norm": 23.103965759277344, "learning_rate": 5.322446841025193e-06, "loss": 1.2206, "step": 23010 }, { "epoch": 7.03, "grad_norm": 16.355716705322266, "learning_rate": 5.300615639872506e-06, "loss": 0.9341, "step": 23020 }, { "epoch": 7.03, "grad_norm": 17.01467514038086, "learning_rate": 5.278784438719819e-06, "loss": 0.8918, "step": 23030 }, { "epoch": 7.03, "grad_norm": 9.071610450744629, "learning_rate": 5.256953237567131e-06, "loss": 1.3791, "step": 23040 }, { "epoch": 7.03, "grad_norm": 5.33894157409668, "learning_rate": 5.235122036414444e-06, "loss": 0.8691, "step": 23050 }, { "epoch": 7.03, "grad_norm": 8.486804962158203, "learning_rate": 5.2132908352617565e-06, "loss": 0.9363, "step": 23060 }, { "epoch": 7.03, "grad_norm": 16.909337997436523, "learning_rate": 5.191459634109068e-06, "loss": 1.5038, "step": 23070 }, { "epoch": 7.03, "grad_norm": 6.249111175537109, "learning_rate": 5.169628432956382e-06, "loss": 0.661, "step": 23080 }, { "epoch": 7.03, "grad_norm": 13.780141830444336, "learning_rate": 5.147797231803694e-06, "loss": 0.5907, "step": 23090 }, { "epoch": 7.03, "grad_norm": 6.661269187927246, "learning_rate": 5.125966030651007e-06, "loss": 1.3042, "step": 23100 }, { "epoch": 7.03, "grad_norm": 18.024240493774414, "learning_rate": 5.10413482949832e-06, "loss": 0.7153, "step": 23110 }, { "epoch": 7.03, "grad_norm": 25.936298370361328, "learning_rate": 5.0823036283456314e-06, "loss": 0.8083, "step": 23120 }, { "epoch": 7.03, "grad_norm": 6.467824459075928, "learning_rate": 5.060472427192944e-06, "loss": 0.8431, "step": 23130 }, { "epoch": 7.03, "grad_norm": 27.269886016845703, "learning_rate": 5.038641226040257e-06, "loss": 0.7651, "step": 23140 }, { "epoch": 7.03, "grad_norm": 18.707489013671875, "learning_rate": 5.016810024887569e-06, "loss": 0.8242, "step": 23150 }, { "epoch": 7.04, "grad_norm": 13.538045883178711, "learning_rate": 4.994978823734882e-06, "loss": 0.979, "step": 23160 }, { "epoch": 7.04, "grad_norm": 0.6856327652931213, "learning_rate": 4.9731476225821954e-06, "loss": 0.9759, "step": 23170 }, { "epoch": 7.04, "grad_norm": 6.240645885467529, "learning_rate": 4.951316421429507e-06, "loss": 0.8098, "step": 23180 }, { "epoch": 7.04, "grad_norm": 1.8353832960128784, "learning_rate": 4.92948522027682e-06, "loss": 0.6065, "step": 23190 }, { "epoch": 7.04, "grad_norm": 8.427669525146484, "learning_rate": 4.9076540191241325e-06, "loss": 0.833, "step": 23200 }, { "epoch": 7.04, "grad_norm": 18.19917869567871, "learning_rate": 4.885822817971445e-06, "loss": 0.8599, "step": 23210 }, { "epoch": 7.04, "grad_norm": 12.511658668518066, "learning_rate": 4.863991616818758e-06, "loss": 0.9558, "step": 23220 }, { "epoch": 7.04, "grad_norm": 4.134636878967285, "learning_rate": 4.84216041566607e-06, "loss": 0.7504, "step": 23230 }, { "epoch": 7.04, "grad_norm": 10.412677764892578, "learning_rate": 4.820329214513383e-06, "loss": 0.8058, "step": 23240 }, { "epoch": 7.04, "grad_norm": 21.072742462158203, "learning_rate": 4.798498013360695e-06, "loss": 0.8851, "step": 23250 }, { "epoch": 7.04, "grad_norm": 11.047403335571289, "learning_rate": 4.776666812208007e-06, "loss": 0.6278, "step": 23260 }, { "epoch": 7.04, "grad_norm": 1.739803433418274, "learning_rate": 4.754835611055321e-06, "loss": 0.7051, "step": 23270 }, { "epoch": 7.04, "grad_norm": 8.430428504943848, "learning_rate": 4.7330044099026335e-06, "loss": 0.9971, "step": 23280 }, { "epoch": 7.04, "grad_norm": 10.30160140991211, "learning_rate": 4.711173208749946e-06, "loss": 0.8712, "step": 23290 }, { "epoch": 7.04, "grad_norm": 16.159887313842773, "learning_rate": 4.689342007597258e-06, "loss": 0.9551, "step": 23300 }, { "epoch": 7.04, "grad_norm": 15.109415054321289, "learning_rate": 4.6675108064445706e-06, "loss": 1.2491, "step": 23310 }, { "epoch": 7.04, "grad_norm": 19.414960861206055, "learning_rate": 4.645679605291883e-06, "loss": 0.8695, "step": 23320 }, { "epoch": 7.04, "grad_norm": 14.283536911010742, "learning_rate": 4.623848404139196e-06, "loss": 0.5716, "step": 23330 }, { "epoch": 7.04, "grad_norm": 10.28564739227295, "learning_rate": 4.6020172029865084e-06, "loss": 0.7963, "step": 23340 }, { "epoch": 7.04, "grad_norm": 0.5498999357223511, "learning_rate": 4.580186001833821e-06, "loss": 0.8409, "step": 23350 }, { "epoch": 7.04, "grad_norm": 6.093383312225342, "learning_rate": 4.558354800681134e-06, "loss": 0.9016, "step": 23360 }, { "epoch": 7.04, "grad_norm": 16.872297286987305, "learning_rate": 4.536523599528446e-06, "loss": 0.6694, "step": 23370 }, { "epoch": 7.04, "grad_norm": 10.391695022583008, "learning_rate": 4.514692398375759e-06, "loss": 0.9626, "step": 23380 }, { "epoch": 7.04, "grad_norm": 13.343799591064453, "learning_rate": 4.492861197223072e-06, "loss": 0.7144, "step": 23390 }, { "epoch": 7.04, "grad_norm": 3.48488712310791, "learning_rate": 4.471029996070384e-06, "loss": 0.6862, "step": 23400 }, { "epoch": 7.04, "grad_norm": 4.5250935554504395, "learning_rate": 4.449198794917697e-06, "loss": 0.7777, "step": 23410 }, { "epoch": 7.05, "grad_norm": 9.5841646194458, "learning_rate": 4.427367593765009e-06, "loss": 0.7492, "step": 23420 }, { "epoch": 7.05, "grad_norm": 3.997966766357422, "learning_rate": 4.405536392612321e-06, "loss": 0.8444, "step": 23430 }, { "epoch": 7.05, "grad_norm": 14.375734329223633, "learning_rate": 4.383705191459634e-06, "loss": 1.0804, "step": 23440 }, { "epoch": 7.05, "grad_norm": 10.048188209533691, "learning_rate": 4.361873990306947e-06, "loss": 0.4166, "step": 23450 }, { "epoch": 7.05, "grad_norm": 12.621820449829102, "learning_rate": 4.34004278915426e-06, "loss": 0.9862, "step": 23460 }, { "epoch": 7.05, "grad_norm": 13.038965225219727, "learning_rate": 4.318211588001573e-06, "loss": 0.5866, "step": 23470 }, { "epoch": 7.05, "grad_norm": 2.1622395515441895, "learning_rate": 4.296380386848884e-06, "loss": 0.7231, "step": 23480 }, { "epoch": 7.05, "grad_norm": 19.956249237060547, "learning_rate": 4.274549185696197e-06, "loss": 1.1706, "step": 23490 }, { "epoch": 7.05, "grad_norm": 7.50234317779541, "learning_rate": 4.25271798454351e-06, "loss": 0.8329, "step": 23500 }, { "epoch": 7.05, "grad_norm": 3.854886054992676, "learning_rate": 4.230886783390822e-06, "loss": 0.5939, "step": 23510 }, { "epoch": 7.05, "grad_norm": 0.9531391263008118, "learning_rate": 4.209055582238135e-06, "loss": 0.5558, "step": 23520 }, { "epoch": 7.05, "grad_norm": 15.257450103759766, "learning_rate": 4.1872243810854476e-06, "loss": 0.7121, "step": 23530 }, { "epoch": 7.05, "grad_norm": 8.079009056091309, "learning_rate": 4.16539317993276e-06, "loss": 0.4573, "step": 23540 }, { "epoch": 7.05, "grad_norm": 8.49425220489502, "learning_rate": 4.143561978780073e-06, "loss": 0.4373, "step": 23550 }, { "epoch": 7.05, "grad_norm": 10.378521919250488, "learning_rate": 4.1217307776273855e-06, "loss": 0.922, "step": 23560 }, { "epoch": 7.05, "grad_norm": 21.390535354614258, "learning_rate": 4.099899576474698e-06, "loss": 1.3892, "step": 23570 }, { "epoch": 7.05, "grad_norm": 1.3589413166046143, "learning_rate": 4.078068375322011e-06, "loss": 0.866, "step": 23580 }, { "epoch": 7.05, "grad_norm": 19.397750854492188, "learning_rate": 4.056237174169323e-06, "loss": 1.1069, "step": 23590 }, { "epoch": 7.05, "grad_norm": 19.77324867248535, "learning_rate": 4.034405973016635e-06, "loss": 1.1565, "step": 23600 }, { "epoch": 7.05, "grad_norm": 10.542643547058105, "learning_rate": 4.012574771863948e-06, "loss": 0.8809, "step": 23610 }, { "epoch": 7.05, "grad_norm": 18.369590759277344, "learning_rate": 3.99074357071126e-06, "loss": 0.9308, "step": 23620 }, { "epoch": 7.05, "grad_norm": 16.768144607543945, "learning_rate": 3.968912369558573e-06, "loss": 0.8965, "step": 23630 }, { "epoch": 7.05, "grad_norm": 10.184368133544922, "learning_rate": 3.9470811684058865e-06, "loss": 0.8279, "step": 23640 }, { "epoch": 7.05, "grad_norm": 8.216437339782715, "learning_rate": 3.925249967253198e-06, "loss": 0.5507, "step": 23650 }, { "epoch": 7.05, "grad_norm": 6.6208109855651855, "learning_rate": 3.903418766100511e-06, "loss": 0.63, "step": 23660 }, { "epoch": 7.06, "grad_norm": 12.18597412109375, "learning_rate": 3.8815875649478235e-06, "loss": 0.8562, "step": 23670 }, { "epoch": 7.06, "grad_norm": 12.935205459594727, "learning_rate": 3.859756363795136e-06, "loss": 0.8953, "step": 23680 }, { "epoch": 7.06, "grad_norm": 8.183343887329102, "learning_rate": 3.837925162642449e-06, "loss": 0.8083, "step": 23690 }, { "epoch": 7.06, "grad_norm": 17.558481216430664, "learning_rate": 3.8160939614897614e-06, "loss": 0.9331, "step": 23700 }, { "epoch": 7.06, "grad_norm": 12.761829376220703, "learning_rate": 3.7942627603370736e-06, "loss": 1.0621, "step": 23710 }, { "epoch": 7.06, "grad_norm": 16.400774002075195, "learning_rate": 3.7724315591843863e-06, "loss": 1.0942, "step": 23720 }, { "epoch": 7.06, "grad_norm": 11.519524574279785, "learning_rate": 3.7506003580316993e-06, "loss": 0.856, "step": 23730 }, { "epoch": 7.06, "grad_norm": 17.327651977539062, "learning_rate": 3.728769156879012e-06, "loss": 0.7153, "step": 23740 }, { "epoch": 7.06, "grad_norm": 3.3754522800445557, "learning_rate": 3.7069379557263246e-06, "loss": 0.8135, "step": 23750 }, { "epoch": 7.06, "grad_norm": 20.5135440826416, "learning_rate": 3.6851067545736368e-06, "loss": 1.0794, "step": 23760 }, { "epoch": 7.06, "grad_norm": 20.814950942993164, "learning_rate": 3.6632755534209494e-06, "loss": 0.7967, "step": 23770 }, { "epoch": 7.06, "grad_norm": 5.40897798538208, "learning_rate": 3.641444352268262e-06, "loss": 0.8899, "step": 23780 }, { "epoch": 7.06, "grad_norm": 10.100780487060547, "learning_rate": 3.6196131511155743e-06, "loss": 1.1645, "step": 23790 }, { "epoch": 7.06, "grad_norm": 4.774610996246338, "learning_rate": 3.597781949962887e-06, "loss": 0.6898, "step": 23800 }, { "epoch": 7.06, "grad_norm": 16.961591720581055, "learning_rate": 3.5759507488101995e-06, "loss": 0.5431, "step": 23810 }, { "epoch": 7.06, "grad_norm": 22.29293441772461, "learning_rate": 3.5541195476575126e-06, "loss": 0.5451, "step": 23820 }, { "epoch": 7.06, "grad_norm": 5.687443733215332, "learning_rate": 3.532288346504825e-06, "loss": 0.9708, "step": 23830 }, { "epoch": 7.06, "grad_norm": 18.295982360839844, "learning_rate": 3.5104571453521374e-06, "loss": 0.871, "step": 23840 }, { "epoch": 7.06, "grad_norm": 7.523390293121338, "learning_rate": 3.48862594419945e-06, "loss": 0.8485, "step": 23850 }, { "epoch": 7.06, "grad_norm": 11.493181228637695, "learning_rate": 3.4667947430467627e-06, "loss": 0.6442, "step": 23860 }, { "epoch": 7.06, "grad_norm": 23.334091186523438, "learning_rate": 3.4449635418940753e-06, "loss": 0.9452, "step": 23870 }, { "epoch": 7.06, "grad_norm": 0.3790530264377594, "learning_rate": 3.4231323407413875e-06, "loss": 0.7469, "step": 23880 }, { "epoch": 7.06, "grad_norm": 16.289031982421875, "learning_rate": 3.4013011395887e-06, "loss": 1.571, "step": 23890 }, { "epoch": 7.06, "grad_norm": 17.160968780517578, "learning_rate": 3.3794699384360128e-06, "loss": 0.6647, "step": 23900 }, { "epoch": 7.06, "grad_norm": 10.158751487731934, "learning_rate": 3.357638737283326e-06, "loss": 0.8742, "step": 23910 }, { "epoch": 7.06, "grad_norm": 1.2584426403045654, "learning_rate": 3.3358075361306384e-06, "loss": 0.6706, "step": 23920 }, { "epoch": 7.07, "grad_norm": 7.49001407623291, "learning_rate": 3.3139763349779506e-06, "loss": 1.1162, "step": 23930 }, { "epoch": 7.07, "grad_norm": 24.406400680541992, "learning_rate": 3.2921451338252633e-06, "loss": 0.6852, "step": 23940 }, { "epoch": 7.07, "grad_norm": 6.5853447914123535, "learning_rate": 3.270313932672576e-06, "loss": 1.3098, "step": 23950 }, { "epoch": 7.07, "grad_norm": 11.233528137207031, "learning_rate": 3.2484827315198885e-06, "loss": 0.5316, "step": 23960 }, { "epoch": 7.07, "grad_norm": 10.876388549804688, "learning_rate": 3.2266515303672007e-06, "loss": 0.847, "step": 23970 }, { "epoch": 7.07, "grad_norm": 6.558208465576172, "learning_rate": 3.2048203292145134e-06, "loss": 0.7609, "step": 23980 }, { "epoch": 7.07, "grad_norm": 15.549338340759277, "learning_rate": 3.182989128061826e-06, "loss": 0.7003, "step": 23990 }, { "epoch": 7.07, "grad_norm": 8.242082595825195, "learning_rate": 3.161157926909139e-06, "loss": 0.6523, "step": 24000 }, { "epoch": 7.07, "grad_norm": 2.7282044887542725, "learning_rate": 3.1393267257564517e-06, "loss": 0.3347, "step": 24010 }, { "epoch": 7.07, "grad_norm": 10.54309368133545, "learning_rate": 3.117495524603764e-06, "loss": 0.8421, "step": 24020 }, { "epoch": 7.07, "grad_norm": 17.911592483520508, "learning_rate": 3.0956643234510765e-06, "loss": 0.7005, "step": 24030 }, { "epoch": 7.07, "grad_norm": 13.500044822692871, "learning_rate": 3.073833122298389e-06, "loss": 0.8019, "step": 24040 }, { "epoch": 7.07, "grad_norm": 17.696617126464844, "learning_rate": 3.0520019211457014e-06, "loss": 0.5378, "step": 24050 }, { "epoch": 7.07, "grad_norm": 3.744534492492676, "learning_rate": 3.030170719993014e-06, "loss": 0.7327, "step": 24060 }, { "epoch": 7.07, "grad_norm": 13.558680534362793, "learning_rate": 3.008339518840327e-06, "loss": 0.6696, "step": 24070 }, { "epoch": 7.07, "grad_norm": 33.397762298583984, "learning_rate": 2.9865083176876392e-06, "loss": 0.8728, "step": 24080 }, { "epoch": 7.07, "grad_norm": 12.871822357177734, "learning_rate": 2.964677116534952e-06, "loss": 0.8255, "step": 24090 }, { "epoch": 7.07, "grad_norm": 17.987056732177734, "learning_rate": 2.9428459153822645e-06, "loss": 1.0269, "step": 24100 }, { "epoch": 7.07, "grad_norm": 14.662192344665527, "learning_rate": 2.9210147142295767e-06, "loss": 0.962, "step": 24110 }, { "epoch": 7.07, "grad_norm": 12.710545539855957, "learning_rate": 2.8991835130768898e-06, "loss": 0.8864, "step": 24120 }, { "epoch": 7.07, "grad_norm": 11.532308578491211, "learning_rate": 2.8773523119242024e-06, "loss": 1.1039, "step": 24130 }, { "epoch": 7.07, "grad_norm": 13.05734634399414, "learning_rate": 2.8555211107715146e-06, "loss": 0.8819, "step": 24140 }, { "epoch": 7.07, "grad_norm": 9.320040702819824, "learning_rate": 2.8336899096188272e-06, "loss": 0.9217, "step": 24150 }, { "epoch": 7.07, "grad_norm": 10.553701400756836, "learning_rate": 2.81185870846614e-06, "loss": 0.544, "step": 24160 }, { "epoch": 7.07, "grad_norm": 14.938994407653809, "learning_rate": 2.7900275073134525e-06, "loss": 0.8479, "step": 24170 }, { "epoch": 7.08, "grad_norm": 11.07756519317627, "learning_rate": 2.768196306160765e-06, "loss": 1.2725, "step": 24180 }, { "epoch": 7.08, "grad_norm": 27.06083869934082, "learning_rate": 2.7463651050080778e-06, "loss": 0.9264, "step": 24190 }, { "epoch": 7.08, "grad_norm": 11.948705673217773, "learning_rate": 2.72453390385539e-06, "loss": 1.0881, "step": 24200 }, { "epoch": 7.08, "grad_norm": 8.953108787536621, "learning_rate": 2.702702702702703e-06, "loss": 0.8102, "step": 24210 }, { "epoch": 7.08, "grad_norm": 17.345094680786133, "learning_rate": 2.6808715015500156e-06, "loss": 0.884, "step": 24220 }, { "epoch": 7.08, "grad_norm": 12.708348274230957, "learning_rate": 2.659040300397328e-06, "loss": 1.0964, "step": 24230 }, { "epoch": 7.08, "grad_norm": 10.813621520996094, "learning_rate": 2.6372090992446405e-06, "loss": 0.7013, "step": 24240 }, { "epoch": 7.08, "grad_norm": 13.178887367248535, "learning_rate": 2.615377898091953e-06, "loss": 0.7456, "step": 24250 }, { "epoch": 7.08, "grad_norm": 19.272401809692383, "learning_rate": 2.5935466969392657e-06, "loss": 1.0667, "step": 24260 }, { "epoch": 7.08, "grad_norm": 11.024786949157715, "learning_rate": 2.5717154957865784e-06, "loss": 0.6242, "step": 24270 }, { "epoch": 7.08, "grad_norm": 8.982081413269043, "learning_rate": 2.549884294633891e-06, "loss": 0.6095, "step": 24280 }, { "epoch": 7.08, "grad_norm": 1.3254083395004272, "learning_rate": 2.528053093481203e-06, "loss": 0.6069, "step": 24290 }, { "epoch": 7.08, "grad_norm": 2.119105815887451, "learning_rate": 2.5062218923285163e-06, "loss": 0.5777, "step": 24300 }, { "epoch": 7.08, "grad_norm": 19.908342361450195, "learning_rate": 2.484390691175829e-06, "loss": 0.8604, "step": 24310 }, { "epoch": 7.08, "grad_norm": 16.997121810913086, "learning_rate": 2.462559490023141e-06, "loss": 0.9678, "step": 24320 }, { "epoch": 7.08, "grad_norm": 17.20611000061035, "learning_rate": 2.4407282888704537e-06, "loss": 0.9643, "step": 24330 }, { "epoch": 7.08, "grad_norm": 22.053987503051758, "learning_rate": 2.4188970877177664e-06, "loss": 1.1076, "step": 24340 }, { "epoch": 7.08, "grad_norm": 7.580945014953613, "learning_rate": 2.397065886565079e-06, "loss": 0.6378, "step": 24350 }, { "epoch": 7.08, "grad_norm": 10.895210266113281, "learning_rate": 2.3752346854123916e-06, "loss": 0.8328, "step": 24360 }, { "epoch": 7.08, "grad_norm": 13.015052795410156, "learning_rate": 2.3534034842597042e-06, "loss": 0.9009, "step": 24370 }, { "epoch": 7.08, "grad_norm": 12.412936210632324, "learning_rate": 2.3315722831070165e-06, "loss": 0.6724, "step": 24380 }, { "epoch": 7.08, "grad_norm": 8.614873886108398, "learning_rate": 2.309741081954329e-06, "loss": 0.5701, "step": 24390 }, { "epoch": 7.08, "grad_norm": 22.714107513427734, "learning_rate": 2.2879098808016417e-06, "loss": 1.0538, "step": 24400 }, { "epoch": 7.08, "grad_norm": 12.148658752441406, "learning_rate": 2.2660786796489543e-06, "loss": 0.7045, "step": 24410 }, { "epoch": 7.08, "grad_norm": 7.043250560760498, "learning_rate": 2.244247478496267e-06, "loss": 0.7908, "step": 24420 }, { "epoch": 7.08, "grad_norm": 11.0164155960083, "learning_rate": 2.2224162773435796e-06, "loss": 1.0713, "step": 24430 }, { "epoch": 7.09, "grad_norm": 12.212179183959961, "learning_rate": 2.2005850761908922e-06, "loss": 1.03, "step": 24440 }, { "epoch": 7.09, "grad_norm": 11.677922248840332, "learning_rate": 2.178753875038205e-06, "loss": 0.8676, "step": 24450 }, { "epoch": 7.09, "grad_norm": 14.847953796386719, "learning_rate": 2.1569226738855175e-06, "loss": 0.7631, "step": 24460 }, { "epoch": 7.09, "grad_norm": 2.892458915710449, "learning_rate": 2.1350914727328297e-06, "loss": 1.1263, "step": 24470 }, { "epoch": 7.09, "grad_norm": 8.109782218933105, "learning_rate": 2.1132602715801423e-06, "loss": 1.2093, "step": 24480 }, { "epoch": 7.09, "grad_norm": 2.1379892826080322, "learning_rate": 2.091429070427455e-06, "loss": 0.8277, "step": 24490 }, { "epoch": 7.09, "grad_norm": 17.013763427734375, "learning_rate": 2.0695978692747676e-06, "loss": 0.8865, "step": 24500 }, { "epoch": 7.09, "grad_norm": 0.8392373323440552, "learning_rate": 2.0477666681220802e-06, "loss": 0.8332, "step": 24510 }, { "epoch": 7.09, "grad_norm": 6.880904674530029, "learning_rate": 2.025935466969393e-06, "loss": 1.2041, "step": 24520 }, { "epoch": 7.09, "grad_norm": 13.638782501220703, "learning_rate": 2.0041042658167055e-06, "loss": 0.8902, "step": 24530 }, { "epoch": 7.09, "grad_norm": 12.885167121887207, "learning_rate": 1.982273064664018e-06, "loss": 0.5747, "step": 24540 }, { "epoch": 7.09, "grad_norm": 17.75956916809082, "learning_rate": 1.9604418635113303e-06, "loss": 0.7702, "step": 24550 }, { "epoch": 7.09, "grad_norm": 22.505477905273438, "learning_rate": 1.938610662358643e-06, "loss": 1.2404, "step": 24560 }, { "epoch": 7.09, "grad_norm": 15.675399780273438, "learning_rate": 1.9167794612059556e-06, "loss": 1.5253, "step": 24570 }, { "epoch": 7.09, "grad_norm": 2.4935858249664307, "learning_rate": 1.8949482600532684e-06, "loss": 0.4015, "step": 24580 }, { "epoch": 7.09, "grad_norm": 18.096593856811523, "learning_rate": 1.8731170589005808e-06, "loss": 1.0052, "step": 24590 }, { "epoch": 7.09, "grad_norm": 16.974163055419922, "learning_rate": 1.8512858577478935e-06, "loss": 0.9866, "step": 24600 }, { "epoch": 7.09, "grad_norm": 8.443421363830566, "learning_rate": 1.8294546565952059e-06, "loss": 0.7569, "step": 24610 }, { "epoch": 7.09, "grad_norm": 14.004621505737305, "learning_rate": 1.8076234554425183e-06, "loss": 0.7701, "step": 24620 }, { "epoch": 7.09, "grad_norm": 10.717763900756836, "learning_rate": 1.7857922542898311e-06, "loss": 1.2174, "step": 24630 }, { "epoch": 7.09, "grad_norm": 10.493739128112793, "learning_rate": 1.7639610531371438e-06, "loss": 1.2388, "step": 24640 }, { "epoch": 7.09, "grad_norm": 15.393651962280273, "learning_rate": 1.7421298519844562e-06, "loss": 0.5955, "step": 24650 }, { "epoch": 7.09, "grad_norm": 16.138591766357422, "learning_rate": 1.7202986508317688e-06, "loss": 1.0444, "step": 24660 }, { "epoch": 7.09, "grad_norm": 5.676127910614014, "learning_rate": 1.6984674496790817e-06, "loss": 0.7586, "step": 24670 }, { "epoch": 7.09, "grad_norm": 9.72994613647461, "learning_rate": 1.676636248526394e-06, "loss": 0.8107, "step": 24680 }, { "epoch": 7.1, "grad_norm": 21.46115493774414, "learning_rate": 1.6548050473737065e-06, "loss": 0.7393, "step": 24690 }, { "epoch": 7.1, "grad_norm": 15.914698600769043, "learning_rate": 1.6329738462210191e-06, "loss": 0.9955, "step": 24700 }, { "epoch": 7.1, "grad_norm": 10.579070091247559, "learning_rate": 1.6111426450683315e-06, "loss": 0.7721, "step": 24710 }, { "epoch": 7.1, "grad_norm": 16.05234718322754, "learning_rate": 1.5893114439156444e-06, "loss": 0.8974, "step": 24720 }, { "epoch": 7.1, "grad_norm": 26.396526336669922, "learning_rate": 1.567480242762957e-06, "loss": 0.9057, "step": 24730 }, { "epoch": 7.1, "grad_norm": 0.6526234745979309, "learning_rate": 1.5456490416102694e-06, "loss": 0.5384, "step": 24740 }, { "epoch": 7.1, "grad_norm": 10.186136245727539, "learning_rate": 1.523817840457582e-06, "loss": 0.792, "step": 24750 }, { "epoch": 7.1, "grad_norm": 14.36878490447998, "learning_rate": 1.5019866393048947e-06, "loss": 1.2152, "step": 24760 }, { "epoch": 7.1, "grad_norm": 16.363309860229492, "learning_rate": 1.4801554381522071e-06, "loss": 0.8902, "step": 24770 }, { "epoch": 7.1, "grad_norm": 13.272995948791504, "learning_rate": 1.4583242369995197e-06, "loss": 0.5832, "step": 24780 }, { "epoch": 7.1, "grad_norm": 0.9992132782936096, "learning_rate": 1.4364930358468324e-06, "loss": 0.5749, "step": 24790 }, { "epoch": 7.1, "grad_norm": 26.46092987060547, "learning_rate": 1.414661834694145e-06, "loss": 1.2693, "step": 24800 }, { "epoch": 7.1, "grad_norm": 23.956892013549805, "learning_rate": 1.3928306335414574e-06, "loss": 0.5898, "step": 24810 }, { "epoch": 7.1, "grad_norm": 20.25436782836914, "learning_rate": 1.3709994323887703e-06, "loss": 1.4901, "step": 24820 }, { "epoch": 7.1, "grad_norm": 2.1372323036193848, "learning_rate": 1.3491682312360827e-06, "loss": 0.6667, "step": 24830 }, { "epoch": 7.1, "grad_norm": 10.164586067199707, "learning_rate": 1.327337030083395e-06, "loss": 0.9714, "step": 24840 }, { "epoch": 7.1, "grad_norm": 8.670980453491211, "learning_rate": 1.305505828930708e-06, "loss": 0.548, "step": 24850 }, { "epoch": 7.1, "grad_norm": 16.654539108276367, "learning_rate": 1.2836746277780204e-06, "loss": 1.4563, "step": 24860 }, { "epoch": 7.1, "grad_norm": 18.827083587646484, "learning_rate": 1.261843426625333e-06, "loss": 1.0585, "step": 24870 }, { "epoch": 7.1, "grad_norm": 10.401359558105469, "learning_rate": 1.2400122254726456e-06, "loss": 1.1042, "step": 24880 }, { "epoch": 7.1, "grad_norm": 7.8430304527282715, "learning_rate": 1.2181810243199583e-06, "loss": 0.8499, "step": 24890 }, { "epoch": 7.1, "grad_norm": 7.353401184082031, "learning_rate": 1.1963498231672707e-06, "loss": 0.6064, "step": 24900 }, { "epoch": 7.1, "grad_norm": 7.131856918334961, "learning_rate": 1.1745186220145833e-06, "loss": 0.4197, "step": 24910 }, { "epoch": 7.1, "grad_norm": 11.834263801574707, "learning_rate": 1.152687420861896e-06, "loss": 1.0883, "step": 24920 }, { "epoch": 7.1, "grad_norm": 17.790658950805664, "learning_rate": 1.1308562197092084e-06, "loss": 0.9971, "step": 24930 }, { "epoch": 7.11, "grad_norm": 16.537490844726562, "learning_rate": 1.109025018556521e-06, "loss": 0.7329, "step": 24940 }, { "epoch": 7.11, "grad_norm": 4.100344657897949, "learning_rate": 1.0871938174038336e-06, "loss": 0.7681, "step": 24950 }, { "epoch": 7.11, "grad_norm": 19.56581687927246, "learning_rate": 1.0653626162511462e-06, "loss": 0.7915, "step": 24960 }, { "epoch": 7.11, "grad_norm": 8.949625015258789, "learning_rate": 1.0435314150984587e-06, "loss": 0.602, "step": 24970 }, { "epoch": 7.11, "grad_norm": 6.615477561950684, "learning_rate": 1.0217002139457715e-06, "loss": 0.6882, "step": 24980 }, { "epoch": 7.11, "grad_norm": 9.177192687988281, "learning_rate": 9.99869012793084e-07, "loss": 0.5067, "step": 24990 }, { "epoch": 7.11, "grad_norm": 8.680255889892578, "learning_rate": 9.780378116403963e-07, "loss": 0.7513, "step": 25000 }, { "epoch": 7.11, "grad_norm": 11.872978210449219, "learning_rate": 9.562066104877092e-07, "loss": 0.708, "step": 25010 }, { "epoch": 7.11, "grad_norm": 6.160376071929932, "learning_rate": 9.343754093350216e-07, "loss": 0.8147, "step": 25020 }, { "epoch": 7.11, "grad_norm": 12.194184303283691, "learning_rate": 9.125442081823343e-07, "loss": 0.9149, "step": 25030 }, { "epoch": 7.11, "grad_norm": 2.7049248218536377, "learning_rate": 8.907130070296468e-07, "loss": 0.5998, "step": 25040 }, { "epoch": 7.11, "grad_norm": 10.01700210571289, "learning_rate": 8.688818058769595e-07, "loss": 0.8943, "step": 25050 }, { "epoch": 7.11, "grad_norm": 24.086641311645508, "learning_rate": 8.47050604724272e-07, "loss": 1.0311, "step": 25060 }, { "epoch": 7.11, "grad_norm": 7.036148548126221, "learning_rate": 8.252194035715844e-07, "loss": 0.9072, "step": 25070 }, { "epoch": 7.11, "grad_norm": 9.596638679504395, "learning_rate": 8.033882024188972e-07, "loss": 0.9473, "step": 25080 }, { "epoch": 7.11, "grad_norm": 12.967899322509766, "learning_rate": 7.815570012662097e-07, "loss": 0.8756, "step": 25090 }, { "epoch": 7.11, "grad_norm": 0.8889364004135132, "learning_rate": 7.597258001135222e-07, "loss": 0.6797, "step": 25100 }, { "epoch": 7.11, "grad_norm": 19.574743270874023, "learning_rate": 7.378945989608348e-07, "loss": 0.9851, "step": 25110 }, { "epoch": 7.11, "grad_norm": 14.977055549621582, "learning_rate": 7.160633978081475e-07, "loss": 0.9653, "step": 25120 }, { "epoch": 7.11, "grad_norm": 19.922693252563477, "learning_rate": 6.9423219665546e-07, "loss": 0.4912, "step": 25130 }, { "epoch": 7.11, "grad_norm": 7.497199058532715, "learning_rate": 6.724009955027726e-07, "loss": 0.8026, "step": 25140 }, { "epoch": 7.11, "grad_norm": 3.7217555046081543, "learning_rate": 6.505697943500852e-07, "loss": 0.6766, "step": 25150 }, { "epoch": 7.11, "grad_norm": 11.129315376281738, "learning_rate": 6.287385931973978e-07, "loss": 1.0283, "step": 25160 }, { "epoch": 7.11, "grad_norm": 26.756568908691406, "learning_rate": 6.069073920447103e-07, "loss": 0.7623, "step": 25170 }, { "epoch": 7.11, "grad_norm": 15.260847091674805, "learning_rate": 5.850761908920228e-07, "loss": 0.5897, "step": 25180 }, { "epoch": 7.11, "grad_norm": 10.736505508422852, "learning_rate": 5.632449897393355e-07, "loss": 0.6028, "step": 25190 }, { "epoch": 7.12, "grad_norm": 10.892223358154297, "learning_rate": 5.414137885866481e-07, "loss": 0.6336, "step": 25200 }, { "epoch": 7.12, "grad_norm": 0.7702347040176392, "learning_rate": 5.195825874339606e-07, "loss": 1.3553, "step": 25210 }, { "epoch": 7.12, "grad_norm": 6.698873996734619, "learning_rate": 4.977513862812732e-07, "loss": 0.8505, "step": 25220 }, { "epoch": 7.12, "grad_norm": 29.81500816345215, "learning_rate": 4.759201851285858e-07, "loss": 0.7004, "step": 25230 }, { "epoch": 7.12, "grad_norm": 6.137831211090088, "learning_rate": 4.540889839758984e-07, "loss": 0.9699, "step": 25240 }, { "epoch": 7.12, "grad_norm": 21.173114776611328, "learning_rate": 4.322577828232109e-07, "loss": 1.2876, "step": 25250 }, { "epoch": 7.12, "grad_norm": 19.042373657226562, "learning_rate": 4.104265816705235e-07, "loss": 1.0676, "step": 25260 }, { "epoch": 7.12, "grad_norm": 15.2430419921875, "learning_rate": 3.885953805178361e-07, "loss": 1.003, "step": 25270 }, { "epoch": 7.12, "grad_norm": 15.05865478515625, "learning_rate": 3.667641793651487e-07, "loss": 0.9564, "step": 25280 }, { "epoch": 7.12, "grad_norm": 11.467713356018066, "learning_rate": 3.449329782124613e-07, "loss": 0.8816, "step": 25290 }, { "epoch": 7.12, "grad_norm": 10.172699928283691, "learning_rate": 3.2310177705977386e-07, "loss": 0.69, "step": 25300 }, { "epoch": 7.12, "grad_norm": 14.137933731079102, "learning_rate": 3.0127057590708644e-07, "loss": 0.9147, "step": 25310 }, { "epoch": 7.12, "grad_norm": 20.67843246459961, "learning_rate": 2.79439374754399e-07, "loss": 0.9453, "step": 25320 }, { "epoch": 7.12, "grad_norm": 16.184593200683594, "learning_rate": 2.576081736017116e-07, "loss": 0.7121, "step": 25330 }, { "epoch": 7.12, "grad_norm": 13.22278118133545, "learning_rate": 2.3577697244902414e-07, "loss": 1.0581, "step": 25340 }, { "epoch": 7.12, "grad_norm": 12.613483428955078, "learning_rate": 2.1394577129633672e-07, "loss": 0.8204, "step": 25350 }, { "epoch": 7.12, "grad_norm": 1.6971631050109863, "learning_rate": 1.9211457014364932e-07, "loss": 0.9095, "step": 25360 }, { "epoch": 7.12, "grad_norm": 6.877547264099121, "learning_rate": 1.7028336899096187e-07, "loss": 1.1821, "step": 25370 }, { "epoch": 7.12, "grad_norm": 5.2098612785339355, "learning_rate": 1.4845216783827448e-07, "loss": 0.825, "step": 25380 }, { "epoch": 7.12, "grad_norm": 12.059030532836914, "learning_rate": 1.2662096668558705e-07, "loss": 0.4893, "step": 25390 }, { "epoch": 7.12, "grad_norm": 2.480212688446045, "learning_rate": 1.0478976553289963e-07, "loss": 0.3803, "step": 25400 }, { "epoch": 7.12, "grad_norm": 1.0061817169189453, "learning_rate": 8.29585643802122e-08, "loss": 0.9151, "step": 25410 }, { "epoch": 7.12, "grad_norm": 24.098403930664062, "learning_rate": 6.112736322752477e-08, "loss": 0.737, "step": 25420 }, { "epoch": 7.12, "grad_norm": 24.233139038085938, "learning_rate": 3.929616207483736e-08, "loss": 0.5689, "step": 25430 }, { "epoch": 7.12, "grad_norm": 1.6509398221969604, "learning_rate": 1.746496092214994e-08, "loss": 0.8068, "step": 25440 }, { "epoch": 7.12, "eval_accuracy": 0.6687883074021688, "eval_loss": 1.2041077613830566, "eval_runtime": 708.6333, "eval_samples_per_second": 5.986, "eval_steps_per_second": 1.497, "step": 25448 }, { "epoch": 7.12, "step": 25448, "total_flos": 1.2685976778118791e+20, "train_loss": 1.159527130113462, "train_runtime": 19539.0492, "train_samples_per_second": 5.21, "train_steps_per_second": 1.302 }, { "epoch": 7.12, "eval_accuracy": 0.6676096181046676, "eval_loss": 1.1090556383132935, "eval_runtime": 803.2715, "eval_samples_per_second": 5.281, "eval_steps_per_second": 1.321, "step": 25448 }, { "epoch": 7.12, "eval_accuracy": 0.6676096181046676, "eval_loss": 1.1090556383132935, "eval_runtime": 780.1485, "eval_samples_per_second": 5.437, "eval_steps_per_second": 1.36, "step": 25448 } ], "logging_steps": 10, "max_steps": 25448, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "total_flos": 1.2685976778118791e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }