diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,129489 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 16182, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001853911753800519, + "grad_norm": 20.296875, + "learning_rate": 9.999814608824621e-06, + "loss": 3.4381, + "mean_token_accuracy": 0.3741410027996946, + "step": 1 + }, + { + "epoch": 0.0003707823507601038, + "grad_norm": 33.5, + "learning_rate": 9.99962921764924e-06, + "loss": 3.499, + "mean_token_accuracy": 0.3633383761976803, + "step": 2 + }, + { + "epoch": 0.0005561735261401557, + "grad_norm": 23.40625, + "learning_rate": 9.99944382647386e-06, + "loss": 3.8999, + "mean_token_accuracy": 0.3120863309352518, + "step": 3 + }, + { + "epoch": 0.0007415647015202076, + "grad_norm": 25.8125, + "learning_rate": 9.99925843529848e-06, + "loss": 3.698, + "mean_token_accuracy": 0.3301929116195603, + "step": 4 + }, + { + "epoch": 0.0009269558769002596, + "grad_norm": 14.3359375, + "learning_rate": 9.999073044123102e-06, + "loss": 3.7705, + "mean_token_accuracy": 0.3312273154993951, + "step": 5 + }, + { + "epoch": 0.0011123470522803114, + "grad_norm": 15.921875, + "learning_rate": 9.99888765294772e-06, + "loss": 4.0455, + "mean_token_accuracy": 0.3116751269035533, + "step": 6 + }, + { + "epoch": 0.0012977382276603633, + "grad_norm": 23.28125, + "learning_rate": 9.998702261772341e-06, + "loss": 3.6514, + "mean_token_accuracy": 0.3503990362897154, + "step": 7 + }, + { + "epoch": 0.0014831294030404152, + "grad_norm": 21.71875, + "learning_rate": 9.99851687059696e-06, + "loss": 3.4188, + "mean_token_accuracy": 0.37161084529505584, + "step": 8 + }, + { + "epoch": 0.0016685205784204673, + "grad_norm": 17.125, + "learning_rate": 9.99833147942158e-06, + "loss": 4.2302, + "mean_token_accuracy": 0.3071310572687225, + "step": 9 + }, + { + "epoch": 0.0018539117538005192, + "grad_norm": 25.53125, + "learning_rate": 9.9981460882462e-06, + "loss": 3.4282, + "mean_token_accuracy": 0.364314715241962, + "step": 10 + }, + { + "epoch": 0.002039302929180571, + "grad_norm": 15.6484375, + "learning_rate": 9.99796069707082e-06, + "loss": 3.8982, + "mean_token_accuracy": 0.33768844221105526, + "step": 11 + }, + { + "epoch": 0.002224694104560623, + "grad_norm": 21.6875, + "learning_rate": 9.99777530589544e-06, + "loss": 3.6119, + "mean_token_accuracy": 0.3585068198133525, + "step": 12 + }, + { + "epoch": 0.002410085279940675, + "grad_norm": 26.03125, + "learning_rate": 9.99758991472006e-06, + "loss": 3.1203, + "mean_token_accuracy": 0.4051309460181721, + "step": 13 + }, + { + "epoch": 0.0025954764553207266, + "grad_norm": 22.875, + "learning_rate": 9.997404523544681e-06, + "loss": 3.7581, + "mean_token_accuracy": 0.3534979423868313, + "step": 14 + }, + { + "epoch": 0.0027808676307007787, + "grad_norm": 18.421875, + "learning_rate": 9.9972191323693e-06, + "loss": 4.1867, + "mean_token_accuracy": 0.32308126410835214, + "step": 15 + }, + { + "epoch": 0.0029662588060808304, + "grad_norm": 18.203125, + "learning_rate": 9.99703374119392e-06, + "loss": 3.9706, + "mean_token_accuracy": 0.3233863546733272, + "step": 16 + }, + { + "epoch": 0.0031516499814608825, + "grad_norm": 13.8828125, + "learning_rate": 9.996848350018539e-06, + "loss": 4.0017, + "mean_token_accuracy": 0.34279940438204637, + "step": 17 + }, + { + "epoch": 0.0033370411568409346, + "grad_norm": 14.1015625, + "learning_rate": 9.99666295884316e-06, + "loss": 3.5922, + "mean_token_accuracy": 0.36894628705606103, + "step": 18 + }, + { + "epoch": 0.0035224323322209862, + "grad_norm": 13.609375, + "learning_rate": 9.99647756766778e-06, + "loss": 3.7713, + "mean_token_accuracy": 0.348929889298893, + "step": 19 + }, + { + "epoch": 0.0037078235076010383, + "grad_norm": 13.65625, + "learning_rate": 9.996292176492399e-06, + "loss": 3.7535, + "mean_token_accuracy": 0.34879288437102923, + "step": 20 + }, + { + "epoch": 0.00389321468298109, + "grad_norm": 12.71875, + "learning_rate": 9.996106785317021e-06, + "loss": 3.9474, + "mean_token_accuracy": 0.33545048223947305, + "step": 21 + }, + { + "epoch": 0.004078605858361142, + "grad_norm": 11.3359375, + "learning_rate": 9.99592139414164e-06, + "loss": 3.6182, + "mean_token_accuracy": 0.37765814266487213, + "step": 22 + }, + { + "epoch": 0.004263997033741194, + "grad_norm": 16.9375, + "learning_rate": 9.99573600296626e-06, + "loss": 3.4863, + "mean_token_accuracy": 0.36946011281224816, + "step": 23 + }, + { + "epoch": 0.004449388209121246, + "grad_norm": 20.5625, + "learning_rate": 9.995550611790879e-06, + "loss": 3.5888, + "mean_token_accuracy": 0.36218828957486543, + "step": 24 + }, + { + "epoch": 0.0046347793845012975, + "grad_norm": 10.9765625, + "learning_rate": 9.9953652206155e-06, + "loss": 3.732, + "mean_token_accuracy": 0.36657595645910035, + "step": 25 + }, + { + "epoch": 0.00482017055988135, + "grad_norm": 13.1640625, + "learning_rate": 9.99517982944012e-06, + "loss": 3.8653, + "mean_token_accuracy": 0.3446526555544052, + "step": 26 + }, + { + "epoch": 0.005005561735261402, + "grad_norm": 17.203125, + "learning_rate": 9.994994438264739e-06, + "loss": 3.6163, + "mean_token_accuracy": 0.35840407470288627, + "step": 27 + }, + { + "epoch": 0.005190952910641453, + "grad_norm": 11.7890625, + "learning_rate": 9.99480904708936e-06, + "loss": 3.7773, + "mean_token_accuracy": 0.3563114134542706, + "step": 28 + }, + { + "epoch": 0.005376344086021506, + "grad_norm": 9.875, + "learning_rate": 9.99462365591398e-06, + "loss": 3.8295, + "mean_token_accuracy": 0.36033011610015386, + "step": 29 + }, + { + "epoch": 0.0055617352614015575, + "grad_norm": 15.3125, + "learning_rate": 9.9944382647386e-06, + "loss": 3.7272, + "mean_token_accuracy": 0.36409055425448866, + "step": 30 + }, + { + "epoch": 0.005747126436781609, + "grad_norm": 22.390625, + "learning_rate": 9.994252873563219e-06, + "loss": 3.0721, + "mean_token_accuracy": 0.3995652729829945, + "step": 31 + }, + { + "epoch": 0.005932517612161661, + "grad_norm": 12.671875, + "learning_rate": 9.99406748238784e-06, + "loss": 3.0176, + "mean_token_accuracy": 0.43850538315389487, + "step": 32 + }, + { + "epoch": 0.006117908787541713, + "grad_norm": 14.109375, + "learning_rate": 9.993882091212458e-06, + "loss": 3.7233, + "mean_token_accuracy": 0.3470967741935484, + "step": 33 + }, + { + "epoch": 0.006303299962921765, + "grad_norm": 22.578125, + "learning_rate": 9.993696700037079e-06, + "loss": 3.1235, + "mean_token_accuracy": 0.39344262295081966, + "step": 34 + }, + { + "epoch": 0.006488691138301817, + "grad_norm": 13.0859375, + "learning_rate": 9.9935113088617e-06, + "loss": 3.4322, + "mean_token_accuracy": 0.3896842518140812, + "step": 35 + }, + { + "epoch": 0.006674082313681869, + "grad_norm": 8.328125, + "learning_rate": 9.993325917686318e-06, + "loss": 3.2582, + "mean_token_accuracy": 0.38849028400597907, + "step": 36 + }, + { + "epoch": 0.006859473489061921, + "grad_norm": 13.234375, + "learning_rate": 9.993140526510939e-06, + "loss": 3.039, + "mean_token_accuracy": 0.4251584592881521, + "step": 37 + }, + { + "epoch": 0.0070448646644419724, + "grad_norm": 12.5859375, + "learning_rate": 9.99295513533556e-06, + "loss": 3.4285, + "mean_token_accuracy": 0.39289994008388257, + "step": 38 + }, + { + "epoch": 0.007230255839822024, + "grad_norm": 9.6640625, + "learning_rate": 9.99276974416018e-06, + "loss": 3.6749, + "mean_token_accuracy": 0.35631067961165047, + "step": 39 + }, + { + "epoch": 0.007415647015202077, + "grad_norm": 12.328125, + "learning_rate": 9.992584352984798e-06, + "loss": 4.1607, + "mean_token_accuracy": 0.32176572264326553, + "step": 40 + }, + { + "epoch": 0.007601038190582128, + "grad_norm": 20.578125, + "learning_rate": 9.992398961809419e-06, + "loss": 3.6988, + "mean_token_accuracy": 0.36075949367088606, + "step": 41 + }, + { + "epoch": 0.00778642936596218, + "grad_norm": 10.4140625, + "learning_rate": 9.992213570634038e-06, + "loss": 3.7204, + "mean_token_accuracy": 0.3503425466572171, + "step": 42 + }, + { + "epoch": 0.007971820541342232, + "grad_norm": 13.578125, + "learning_rate": 9.992028179458658e-06, + "loss": 4.0198, + "mean_token_accuracy": 0.3351313969571231, + "step": 43 + }, + { + "epoch": 0.008157211716722283, + "grad_norm": 15.03125, + "learning_rate": 9.991842788283279e-06, + "loss": 3.2864, + "mean_token_accuracy": 0.4051724137931034, + "step": 44 + }, + { + "epoch": 0.008342602892102336, + "grad_norm": 10.1875, + "learning_rate": 9.9916573971079e-06, + "loss": 3.4034, + "mean_token_accuracy": 0.39417650076962546, + "step": 45 + }, + { + "epoch": 0.008527994067482388, + "grad_norm": 18.015625, + "learning_rate": 9.991472005932518e-06, + "loss": 3.285, + "mean_token_accuracy": 0.39734923790589793, + "step": 46 + }, + { + "epoch": 0.008713385242862439, + "grad_norm": 8.9921875, + "learning_rate": 9.991286614757138e-06, + "loss": 3.4491, + "mean_token_accuracy": 0.38653762819586884, + "step": 47 + }, + { + "epoch": 0.008898776418242492, + "grad_norm": 8.640625, + "learning_rate": 9.991101223581759e-06, + "loss": 3.5815, + "mean_token_accuracy": 0.373163027056723, + "step": 48 + }, + { + "epoch": 0.009084167593622544, + "grad_norm": 8.3828125, + "learning_rate": 9.990915832406378e-06, + "loss": 3.9502, + "mean_token_accuracy": 0.3483087234265368, + "step": 49 + }, + { + "epoch": 0.009269558769002595, + "grad_norm": 11.3203125, + "learning_rate": 9.990730441230998e-06, + "loss": 3.0887, + "mean_token_accuracy": 0.4165704758240349, + "step": 50 + }, + { + "epoch": 0.009454949944382647, + "grad_norm": 17.828125, + "learning_rate": 9.990545050055617e-06, + "loss": 3.2425, + "mean_token_accuracy": 0.3741918458899591, + "step": 51 + }, + { + "epoch": 0.0096403411197627, + "grad_norm": 8.21875, + "learning_rate": 9.990359658880238e-06, + "loss": 3.7171, + "mean_token_accuracy": 0.3701393497013935, + "step": 52 + }, + { + "epoch": 0.00982573229514275, + "grad_norm": 28.3125, + "learning_rate": 9.990174267704858e-06, + "loss": 3.6864, + "mean_token_accuracy": 0.3573947313835304, + "step": 53 + }, + { + "epoch": 0.010011123470522803, + "grad_norm": 13.5703125, + "learning_rate": 9.989988876529479e-06, + "loss": 3.4587, + "mean_token_accuracy": 0.37662811929397444, + "step": 54 + }, + { + "epoch": 0.010196514645902856, + "grad_norm": 36.65625, + "learning_rate": 9.989803485354097e-06, + "loss": 3.4809, + "mean_token_accuracy": 0.3548863636363636, + "step": 55 + }, + { + "epoch": 0.010381905821282907, + "grad_norm": 10.4453125, + "learning_rate": 9.989618094178718e-06, + "loss": 3.6576, + "mean_token_accuracy": 0.3727217125382263, + "step": 56 + }, + { + "epoch": 0.010567296996662959, + "grad_norm": 9.6484375, + "learning_rate": 9.989432703003338e-06, + "loss": 4.2375, + "mean_token_accuracy": 0.322178762856729, + "step": 57 + }, + { + "epoch": 0.010752688172043012, + "grad_norm": 10.4375, + "learning_rate": 9.989247311827957e-06, + "loss": 3.9134, + "mean_token_accuracy": 0.34256864918113533, + "step": 58 + }, + { + "epoch": 0.010938079347423062, + "grad_norm": 16.984375, + "learning_rate": 9.989061920652578e-06, + "loss": 3.7722, + "mean_token_accuracy": 0.3469330332020259, + "step": 59 + }, + { + "epoch": 0.011123470522803115, + "grad_norm": 13.1015625, + "learning_rate": 9.988876529477196e-06, + "loss": 3.5898, + "mean_token_accuracy": 0.3768393172454385, + "step": 60 + }, + { + "epoch": 0.011308861698183166, + "grad_norm": 9.15625, + "learning_rate": 9.988691138301819e-06, + "loss": 3.7143, + "mean_token_accuracy": 0.36265190711105766, + "step": 61 + }, + { + "epoch": 0.011494252873563218, + "grad_norm": 9.6484375, + "learning_rate": 9.988505747126437e-06, + "loss": 3.339, + "mean_token_accuracy": 0.40487172095704815, + "step": 62 + }, + { + "epoch": 0.01167964404894327, + "grad_norm": 22.046875, + "learning_rate": 9.988320355951058e-06, + "loss": 3.3481, + "mean_token_accuracy": 0.3734547820429408, + "step": 63 + }, + { + "epoch": 0.011865035224323322, + "grad_norm": 12.84375, + "learning_rate": 9.988134964775678e-06, + "loss": 3.0666, + "mean_token_accuracy": 0.4502140588316531, + "step": 64 + }, + { + "epoch": 0.012050426399703374, + "grad_norm": 12.1171875, + "learning_rate": 9.987949573600297e-06, + "loss": 3.1151, + "mean_token_accuracy": 0.4391958559447459, + "step": 65 + }, + { + "epoch": 0.012235817575083427, + "grad_norm": 16.9375, + "learning_rate": 9.987764182424918e-06, + "loss": 3.5205, + "mean_token_accuracy": 0.38388911209321014, + "step": 66 + }, + { + "epoch": 0.012421208750463477, + "grad_norm": 8.7890625, + "learning_rate": 9.987578791249536e-06, + "loss": 3.6644, + "mean_token_accuracy": 0.35716242125445086, + "step": 67 + }, + { + "epoch": 0.01260659992584353, + "grad_norm": 11.5390625, + "learning_rate": 9.987393400074157e-06, + "loss": 3.1804, + "mean_token_accuracy": 0.4036207345662933, + "step": 68 + }, + { + "epoch": 0.012791991101223582, + "grad_norm": 9.9765625, + "learning_rate": 9.987208008898777e-06, + "loss": 3.261, + "mean_token_accuracy": 0.4166935744268647, + "step": 69 + }, + { + "epoch": 0.012977382276603633, + "grad_norm": 11.2265625, + "learning_rate": 9.987022617723398e-06, + "loss": 3.7567, + "mean_token_accuracy": 0.3566405193400054, + "step": 70 + }, + { + "epoch": 0.013162773451983686, + "grad_norm": 10.5234375, + "learning_rate": 9.986837226548017e-06, + "loss": 3.6839, + "mean_token_accuracy": 0.3646567164179104, + "step": 71 + }, + { + "epoch": 0.013348164627363738, + "grad_norm": 8.7421875, + "learning_rate": 9.986651835372637e-06, + "loss": 3.7606, + "mean_token_accuracy": 0.36064139941690965, + "step": 72 + }, + { + "epoch": 0.013533555802743789, + "grad_norm": 15.328125, + "learning_rate": 9.986466444197258e-06, + "loss": 3.7036, + "mean_token_accuracy": 0.35925591008913577, + "step": 73 + }, + { + "epoch": 0.013718946978123842, + "grad_norm": 10.015625, + "learning_rate": 9.986281053021877e-06, + "loss": 3.4698, + "mean_token_accuracy": 0.4113996069101066, + "step": 74 + }, + { + "epoch": 0.013904338153503892, + "grad_norm": 23.09375, + "learning_rate": 9.986095661846497e-06, + "loss": 3.0644, + "mean_token_accuracy": 0.4284487385936661, + "step": 75 + }, + { + "epoch": 0.014089729328883945, + "grad_norm": 13.78125, + "learning_rate": 9.985910270671116e-06, + "loss": 3.1365, + "mean_token_accuracy": 0.44471413160733547, + "step": 76 + }, + { + "epoch": 0.014275120504263997, + "grad_norm": 12.7734375, + "learning_rate": 9.985724879495738e-06, + "loss": 3.8205, + "mean_token_accuracy": 0.3564191533657183, + "step": 77 + }, + { + "epoch": 0.014460511679644048, + "grad_norm": 9.6328125, + "learning_rate": 9.985539488320357e-06, + "loss": 3.8257, + "mean_token_accuracy": 0.3372511848341232, + "step": 78 + }, + { + "epoch": 0.0146459028550241, + "grad_norm": 14.3828125, + "learning_rate": 9.985354097144977e-06, + "loss": 3.4389, + "mean_token_accuracy": 0.3951518691588785, + "step": 79 + }, + { + "epoch": 0.014831294030404153, + "grad_norm": 16.0, + "learning_rate": 9.985168705969596e-06, + "loss": 3.5757, + "mean_token_accuracy": 0.3722309647742404, + "step": 80 + }, + { + "epoch": 0.015016685205784204, + "grad_norm": 10.5078125, + "learning_rate": 9.984983314794217e-06, + "loss": 3.3855, + "mean_token_accuracy": 0.3938829787234043, + "step": 81 + }, + { + "epoch": 0.015202076381164257, + "grad_norm": 10.875, + "learning_rate": 9.984797923618837e-06, + "loss": 3.9025, + "mean_token_accuracy": 0.35135884636716586, + "step": 82 + }, + { + "epoch": 0.015387467556544309, + "grad_norm": 17.1875, + "learning_rate": 9.984612532443456e-06, + "loss": 3.5976, + "mean_token_accuracy": 0.37413598089732314, + "step": 83 + }, + { + "epoch": 0.01557285873192436, + "grad_norm": 11.40625, + "learning_rate": 9.984427141268076e-06, + "loss": 3.571, + "mean_token_accuracy": 0.3723349820910796, + "step": 84 + }, + { + "epoch": 0.01575824990730441, + "grad_norm": 10.9140625, + "learning_rate": 9.984241750092697e-06, + "loss": 3.4574, + "mean_token_accuracy": 0.38086979722518677, + "step": 85 + }, + { + "epoch": 0.015943641082684465, + "grad_norm": 13.5859375, + "learning_rate": 9.984056358917317e-06, + "loss": 3.154, + "mean_token_accuracy": 0.43227402969523315, + "step": 86 + }, + { + "epoch": 0.016129032258064516, + "grad_norm": 13.484375, + "learning_rate": 9.983870967741936e-06, + "loss": 3.2812, + "mean_token_accuracy": 0.38634423897581793, + "step": 87 + }, + { + "epoch": 0.016314423433444566, + "grad_norm": 9.8203125, + "learning_rate": 9.983685576566557e-06, + "loss": 2.9024, + "mean_token_accuracy": 0.4394335212429534, + "step": 88 + }, + { + "epoch": 0.01649981460882462, + "grad_norm": 11.59375, + "learning_rate": 9.983500185391175e-06, + "loss": 3.1165, + "mean_token_accuracy": 0.4171966420758077, + "step": 89 + }, + { + "epoch": 0.01668520578420467, + "grad_norm": 8.03125, + "learning_rate": 9.983314794215796e-06, + "loss": 3.6869, + "mean_token_accuracy": 0.36991727834090643, + "step": 90 + }, + { + "epoch": 0.016870596959584722, + "grad_norm": 8.71875, + "learning_rate": 9.983129403040416e-06, + "loss": 3.5475, + "mean_token_accuracy": 0.38127165046373895, + "step": 91 + }, + { + "epoch": 0.017055988134964777, + "grad_norm": 8.7109375, + "learning_rate": 9.982944011865035e-06, + "loss": 3.5756, + "mean_token_accuracy": 0.39057239057239057, + "step": 92 + }, + { + "epoch": 0.017241379310344827, + "grad_norm": 7.44921875, + "learning_rate": 9.982758620689656e-06, + "loss": 3.6211, + "mean_token_accuracy": 0.3890165441176471, + "step": 93 + }, + { + "epoch": 0.017426770485724878, + "grad_norm": 7.49609375, + "learning_rate": 9.982573229514276e-06, + "loss": 3.7896, + "mean_token_accuracy": 0.3646804260985353, + "step": 94 + }, + { + "epoch": 0.017612161661104932, + "grad_norm": 10.015625, + "learning_rate": 9.982387838338897e-06, + "loss": 3.4047, + "mean_token_accuracy": 0.39285714285714285, + "step": 95 + }, + { + "epoch": 0.017797552836484983, + "grad_norm": 10.53125, + "learning_rate": 9.982202447163515e-06, + "loss": 3.2716, + "mean_token_accuracy": 0.40177011761965764, + "step": 96 + }, + { + "epoch": 0.017982944011865034, + "grad_norm": 10.6484375, + "learning_rate": 9.982017055988136e-06, + "loss": 3.4895, + "mean_token_accuracy": 0.3729986431478969, + "step": 97 + }, + { + "epoch": 0.018168335187245088, + "grad_norm": 8.875, + "learning_rate": 9.981831664812755e-06, + "loss": 2.9862, + "mean_token_accuracy": 0.4264252696456086, + "step": 98 + }, + { + "epoch": 0.01835372636262514, + "grad_norm": 9.46875, + "learning_rate": 9.981646273637375e-06, + "loss": 3.5209, + "mean_token_accuracy": 0.3851343753514, + "step": 99 + }, + { + "epoch": 0.01853911753800519, + "grad_norm": 9.0, + "learning_rate": 9.981460882461996e-06, + "loss": 3.9077, + "mean_token_accuracy": 0.35223367697594504, + "step": 100 + }, + { + "epoch": 0.018724508713385244, + "grad_norm": 10.828125, + "learning_rate": 9.981275491286616e-06, + "loss": 3.6151, + "mean_token_accuracy": 0.38634969325153373, + "step": 101 + }, + { + "epoch": 0.018909899888765295, + "grad_norm": 9.140625, + "learning_rate": 9.981090100111237e-06, + "loss": 3.8936, + "mean_token_accuracy": 0.3572481572481572, + "step": 102 + }, + { + "epoch": 0.019095291064145346, + "grad_norm": 9.3984375, + "learning_rate": 9.980904708935856e-06, + "loss": 3.404, + "mean_token_accuracy": 0.39367396593673964, + "step": 103 + }, + { + "epoch": 0.0192806822395254, + "grad_norm": 9.5078125, + "learning_rate": 9.980719317760476e-06, + "loss": 3.8367, + "mean_token_accuracy": 0.3672547018515819, + "step": 104 + }, + { + "epoch": 0.01946607341490545, + "grad_norm": 21.1875, + "learning_rate": 9.980533926585095e-06, + "loss": 3.2517, + "mean_token_accuracy": 0.3914650537634409, + "step": 105 + }, + { + "epoch": 0.0196514645902855, + "grad_norm": 10.515625, + "learning_rate": 9.980348535409715e-06, + "loss": 3.2583, + "mean_token_accuracy": 0.39715972554651346, + "step": 106 + }, + { + "epoch": 0.019836855765665556, + "grad_norm": 7.29296875, + "learning_rate": 9.980163144234336e-06, + "loss": 3.4838, + "mean_token_accuracy": 0.3822471011595362, + "step": 107 + }, + { + "epoch": 0.020022246941045607, + "grad_norm": 11.2421875, + "learning_rate": 9.979977753058955e-06, + "loss": 3.5272, + "mean_token_accuracy": 0.3844527565457798, + "step": 108 + }, + { + "epoch": 0.020207638116425657, + "grad_norm": 10.4453125, + "learning_rate": 9.979792361883575e-06, + "loss": 3.931, + "mean_token_accuracy": 0.3280475718533201, + "step": 109 + }, + { + "epoch": 0.02039302929180571, + "grad_norm": 8.34375, + "learning_rate": 9.979606970708196e-06, + "loss": 3.5288, + "mean_token_accuracy": 0.37078954459714364, + "step": 110 + }, + { + "epoch": 0.020578420467185762, + "grad_norm": 9.125, + "learning_rate": 9.979421579532816e-06, + "loss": 3.1308, + "mean_token_accuracy": 0.40451592288848653, + "step": 111 + }, + { + "epoch": 0.020763811642565813, + "grad_norm": 9.8828125, + "learning_rate": 9.979236188357435e-06, + "loss": 3.8475, + "mean_token_accuracy": 0.35536105032822757, + "step": 112 + }, + { + "epoch": 0.020949202817945867, + "grad_norm": 9.5703125, + "learning_rate": 9.979050797182055e-06, + "loss": 2.8274, + "mean_token_accuracy": 0.4463582677165354, + "step": 113 + }, + { + "epoch": 0.021134593993325918, + "grad_norm": 7.41015625, + "learning_rate": 9.978865406006674e-06, + "loss": 3.2743, + "mean_token_accuracy": 0.4164364640883978, + "step": 114 + }, + { + "epoch": 0.02131998516870597, + "grad_norm": 7.42578125, + "learning_rate": 9.978680014831295e-06, + "loss": 3.0824, + "mean_token_accuracy": 0.42395408548557617, + "step": 115 + }, + { + "epoch": 0.021505376344086023, + "grad_norm": 14.859375, + "learning_rate": 9.978494623655915e-06, + "loss": 3.2852, + "mean_token_accuracy": 0.39287063861947347, + "step": 116 + }, + { + "epoch": 0.021690767519466074, + "grad_norm": 11.65625, + "learning_rate": 9.978309232480536e-06, + "loss": 3.6064, + "mean_token_accuracy": 0.37898894154818324, + "step": 117 + }, + { + "epoch": 0.021876158694846125, + "grad_norm": 9.359375, + "learning_rate": 9.978123841305154e-06, + "loss": 3.9121, + "mean_token_accuracy": 0.33431998753699954, + "step": 118 + }, + { + "epoch": 0.022061549870226176, + "grad_norm": 10.34375, + "learning_rate": 9.977938450129775e-06, + "loss": 3.1628, + "mean_token_accuracy": 0.41136394604169507, + "step": 119 + }, + { + "epoch": 0.02224694104560623, + "grad_norm": 10.7109375, + "learning_rate": 9.977753058954395e-06, + "loss": 3.2838, + "mean_token_accuracy": 0.3993351640410464, + "step": 120 + }, + { + "epoch": 0.02243233222098628, + "grad_norm": 9.1171875, + "learning_rate": 9.977567667779014e-06, + "loss": 3.3766, + "mean_token_accuracy": 0.3800491266375546, + "step": 121 + }, + { + "epoch": 0.02261772339636633, + "grad_norm": 13.5546875, + "learning_rate": 9.977382276603635e-06, + "loss": 3.2509, + "mean_token_accuracy": 0.3941845468401923, + "step": 122 + }, + { + "epoch": 0.022803114571746386, + "grad_norm": 8.546875, + "learning_rate": 9.977196885428253e-06, + "loss": 3.4839, + "mean_token_accuracy": 0.37543383129621244, + "step": 123 + }, + { + "epoch": 0.022988505747126436, + "grad_norm": 11.265625, + "learning_rate": 9.977011494252874e-06, + "loss": 3.9086, + "mean_token_accuracy": 0.339407160272039, + "step": 124 + }, + { + "epoch": 0.023173896922506487, + "grad_norm": 11.7109375, + "learning_rate": 9.976826103077494e-06, + "loss": 3.5963, + "mean_token_accuracy": 0.3780616276007374, + "step": 125 + }, + { + "epoch": 0.02335928809788654, + "grad_norm": 8.8203125, + "learning_rate": 9.976640711902115e-06, + "loss": 3.1831, + "mean_token_accuracy": 0.41598277949683843, + "step": 126 + }, + { + "epoch": 0.023544679273266592, + "grad_norm": 10.078125, + "learning_rate": 9.976455320726734e-06, + "loss": 2.7696, + "mean_token_accuracy": 0.44809306434987484, + "step": 127 + }, + { + "epoch": 0.023730070448646643, + "grad_norm": 10.1328125, + "learning_rate": 9.976269929551354e-06, + "loss": 3.2666, + "mean_token_accuracy": 0.3986628211851075, + "step": 128 + }, + { + "epoch": 0.023915461624026697, + "grad_norm": 12.2734375, + "learning_rate": 9.976084538375975e-06, + "loss": 3.0797, + "mean_token_accuracy": 0.41573642570716374, + "step": 129 + }, + { + "epoch": 0.024100852799406748, + "grad_norm": 12.1171875, + "learning_rate": 9.975899147200594e-06, + "loss": 3.4001, + "mean_token_accuracy": 0.3711010397227406, + "step": 130 + }, + { + "epoch": 0.0242862439747868, + "grad_norm": 8.40625, + "learning_rate": 9.975713756025214e-06, + "loss": 3.7864, + "mean_token_accuracy": 0.35381828316610925, + "step": 131 + }, + { + "epoch": 0.024471635150166853, + "grad_norm": 12.3984375, + "learning_rate": 9.975528364849833e-06, + "loss": 3.9022, + "mean_token_accuracy": 0.3447947811905409, + "step": 132 + }, + { + "epoch": 0.024657026325546904, + "grad_norm": 9.1796875, + "learning_rate": 9.975342973674453e-06, + "loss": 3.5614, + "mean_token_accuracy": 0.36817218327082996, + "step": 133 + }, + { + "epoch": 0.024842417500926955, + "grad_norm": 13.34375, + "learning_rate": 9.975157582499074e-06, + "loss": 3.3685, + "mean_token_accuracy": 0.3716986017607457, + "step": 134 + }, + { + "epoch": 0.02502780867630701, + "grad_norm": 12.578125, + "learning_rate": 9.974972191323694e-06, + "loss": 3.3435, + "mean_token_accuracy": 0.3836438269655826, + "step": 135 + }, + { + "epoch": 0.02521319985168706, + "grad_norm": 15.3984375, + "learning_rate": 9.974786800148313e-06, + "loss": 4.2441, + "mean_token_accuracy": 0.32710989678202795, + "step": 136 + }, + { + "epoch": 0.02539859102706711, + "grad_norm": 11.7265625, + "learning_rate": 9.974601408972934e-06, + "loss": 3.5882, + "mean_token_accuracy": 0.3715994020926756, + "step": 137 + }, + { + "epoch": 0.025583982202447165, + "grad_norm": 14.1875, + "learning_rate": 9.974416017797554e-06, + "loss": 3.8388, + "mean_token_accuracy": 0.3429962894248609, + "step": 138 + }, + { + "epoch": 0.025769373377827216, + "grad_norm": 18.828125, + "learning_rate": 9.974230626622173e-06, + "loss": 2.9681, + "mean_token_accuracy": 0.42415471648467307, + "step": 139 + }, + { + "epoch": 0.025954764553207266, + "grad_norm": 11.5, + "learning_rate": 9.974045235446793e-06, + "loss": 3.1797, + "mean_token_accuracy": 0.4128205128205128, + "step": 140 + }, + { + "epoch": 0.02614015572858732, + "grad_norm": 14.578125, + "learning_rate": 9.973859844271412e-06, + "loss": 3.1276, + "mean_token_accuracy": 0.40506966593923116, + "step": 141 + }, + { + "epoch": 0.02632554690396737, + "grad_norm": 14.421875, + "learning_rate": 9.973674453096034e-06, + "loss": 3.2541, + "mean_token_accuracy": 0.3984329991934555, + "step": 142 + }, + { + "epoch": 0.026510938079347422, + "grad_norm": 10.0234375, + "learning_rate": 9.973489061920653e-06, + "loss": 2.8935, + "mean_token_accuracy": 0.43506763787721126, + "step": 143 + }, + { + "epoch": 0.026696329254727477, + "grad_norm": 7.671875, + "learning_rate": 9.973303670745274e-06, + "loss": 4.186, + "mean_token_accuracy": 0.3357953919082791, + "step": 144 + }, + { + "epoch": 0.026881720430107527, + "grad_norm": 14.6171875, + "learning_rate": 9.973118279569894e-06, + "loss": 3.2942, + "mean_token_accuracy": 0.39072847682119205, + "step": 145 + }, + { + "epoch": 0.027067111605487578, + "grad_norm": 13.1953125, + "learning_rate": 9.972932888394513e-06, + "loss": 3.8442, + "mean_token_accuracy": 0.34437676422522656, + "step": 146 + }, + { + "epoch": 0.027252502780867632, + "grad_norm": 8.4375, + "learning_rate": 9.972747497219133e-06, + "loss": 3.5772, + "mean_token_accuracy": 0.3763837638376384, + "step": 147 + }, + { + "epoch": 0.027437893956247683, + "grad_norm": 14.8515625, + "learning_rate": 9.972562106043752e-06, + "loss": 3.3499, + "mean_token_accuracy": 0.38854277465430787, + "step": 148 + }, + { + "epoch": 0.027623285131627734, + "grad_norm": 11.40625, + "learning_rate": 9.972376714868373e-06, + "loss": 3.354, + "mean_token_accuracy": 0.3941627358490566, + "step": 149 + }, + { + "epoch": 0.027808676307007785, + "grad_norm": 11.2734375, + "learning_rate": 9.972191323692993e-06, + "loss": 3.3128, + "mean_token_accuracy": 0.4013104013104013, + "step": 150 + }, + { + "epoch": 0.02799406748238784, + "grad_norm": 10.078125, + "learning_rate": 9.972005932517614e-06, + "loss": 3.3116, + "mean_token_accuracy": 0.40372204712591925, + "step": 151 + }, + { + "epoch": 0.02817945865776789, + "grad_norm": 18.765625, + "learning_rate": 9.971820541342232e-06, + "loss": 2.9742, + "mean_token_accuracy": 0.42819805430521274, + "step": 152 + }, + { + "epoch": 0.02836484983314794, + "grad_norm": 9.9375, + "learning_rate": 9.971635150166853e-06, + "loss": 3.9243, + "mean_token_accuracy": 0.36853853383458646, + "step": 153 + }, + { + "epoch": 0.028550241008527995, + "grad_norm": 11.7265625, + "learning_rate": 9.971449758991473e-06, + "loss": 2.927, + "mean_token_accuracy": 0.4494630448515477, + "step": 154 + }, + { + "epoch": 0.028735632183908046, + "grad_norm": 8.96875, + "learning_rate": 9.971264367816092e-06, + "loss": 3.3215, + "mean_token_accuracy": 0.38849487785658, + "step": 155 + }, + { + "epoch": 0.028921023359288096, + "grad_norm": 9.0703125, + "learning_rate": 9.971078976640713e-06, + "loss": 3.3514, + "mean_token_accuracy": 0.3982483882739326, + "step": 156 + }, + { + "epoch": 0.02910641453466815, + "grad_norm": 7.76171875, + "learning_rate": 9.970893585465332e-06, + "loss": 3.5172, + "mean_token_accuracy": 0.3770096463022508, + "step": 157 + }, + { + "epoch": 0.0292918057100482, + "grad_norm": 9.5625, + "learning_rate": 9.970708194289954e-06, + "loss": 3.5161, + "mean_token_accuracy": 0.39988837728477744, + "step": 158 + }, + { + "epoch": 0.029477196885428252, + "grad_norm": 24.421875, + "learning_rate": 9.970522803114573e-06, + "loss": 2.3581, + "mean_token_accuracy": 0.4728568434290505, + "step": 159 + }, + { + "epoch": 0.029662588060808306, + "grad_norm": 9.3828125, + "learning_rate": 9.970337411939193e-06, + "loss": 3.7167, + "mean_token_accuracy": 0.3625486922648859, + "step": 160 + }, + { + "epoch": 0.029847979236188357, + "grad_norm": 10.859375, + "learning_rate": 9.970152020763812e-06, + "loss": 3.6196, + "mean_token_accuracy": 0.36540371789413933, + "step": 161 + }, + { + "epoch": 0.030033370411568408, + "grad_norm": 10.0390625, + "learning_rate": 9.969966629588432e-06, + "loss": 3.631, + "mean_token_accuracy": 0.38195468561469365, + "step": 162 + }, + { + "epoch": 0.030218761586948462, + "grad_norm": 13.46875, + "learning_rate": 9.969781238413053e-06, + "loss": 2.9555, + "mean_token_accuracy": 0.41974308529105425, + "step": 163 + }, + { + "epoch": 0.030404152762328513, + "grad_norm": 8.59375, + "learning_rate": 9.969595847237672e-06, + "loss": 4.2115, + "mean_token_accuracy": 0.31930638391870997, + "step": 164 + }, + { + "epoch": 0.030589543937708564, + "grad_norm": 16.109375, + "learning_rate": 9.969410456062292e-06, + "loss": 3.4556, + "mean_token_accuracy": 0.380061394532963, + "step": 165 + }, + { + "epoch": 0.030774935113088618, + "grad_norm": 13.3984375, + "learning_rate": 9.969225064886913e-06, + "loss": 3.4162, + "mean_token_accuracy": 0.3842827318818968, + "step": 166 + }, + { + "epoch": 0.03096032628846867, + "grad_norm": 12.0, + "learning_rate": 9.969039673711533e-06, + "loss": 3.7262, + "mean_token_accuracy": 0.35910087719298245, + "step": 167 + }, + { + "epoch": 0.03114571746384872, + "grad_norm": 9.1171875, + "learning_rate": 9.968854282536152e-06, + "loss": 3.3072, + "mean_token_accuracy": 0.39448027598620067, + "step": 168 + }, + { + "epoch": 0.03133110863922877, + "grad_norm": 14.0, + "learning_rate": 9.968668891360772e-06, + "loss": 2.9263, + "mean_token_accuracy": 0.42758710348413936, + "step": 169 + }, + { + "epoch": 0.03151649981460882, + "grad_norm": 11.15625, + "learning_rate": 9.968483500185391e-06, + "loss": 3.6215, + "mean_token_accuracy": 0.36975368102380624, + "step": 170 + }, + { + "epoch": 0.03170189098998888, + "grad_norm": 12.546875, + "learning_rate": 9.968298109010012e-06, + "loss": 2.6465, + "mean_token_accuracy": 0.4780934039480019, + "step": 171 + }, + { + "epoch": 0.03188728216536893, + "grad_norm": 13.9296875, + "learning_rate": 9.968112717834632e-06, + "loss": 3.106, + "mean_token_accuracy": 0.4053084648493544, + "step": 172 + }, + { + "epoch": 0.03207267334074898, + "grad_norm": 21.8125, + "learning_rate": 9.967927326659251e-06, + "loss": 3.0122, + "mean_token_accuracy": 0.41927899686520376, + "step": 173 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 12.4609375, + "learning_rate": 9.967741935483871e-06, + "loss": 2.9418, + "mean_token_accuracy": 0.44761374704327256, + "step": 174 + }, + { + "epoch": 0.03244345569150908, + "grad_norm": 9.9921875, + "learning_rate": 9.967556544308492e-06, + "loss": 3.6329, + "mean_token_accuracy": 0.3670821673874332, + "step": 175 + }, + { + "epoch": 0.03262884686688913, + "grad_norm": 8.359375, + "learning_rate": 9.967371153133112e-06, + "loss": 3.4014, + "mean_token_accuracy": 0.38483363425328865, + "step": 176 + }, + { + "epoch": 0.03281423804226919, + "grad_norm": 8.84375, + "learning_rate": 9.967185761957731e-06, + "loss": 3.0306, + "mean_token_accuracy": 0.42600828093850635, + "step": 177 + }, + { + "epoch": 0.03299962921764924, + "grad_norm": 9.25, + "learning_rate": 9.967000370782352e-06, + "loss": 3.6401, + "mean_token_accuracy": 0.36178257933828495, + "step": 178 + }, + { + "epoch": 0.03318502039302929, + "grad_norm": 9.5703125, + "learning_rate": 9.96681497960697e-06, + "loss": 3.1834, + "mean_token_accuracy": 0.41987080898185175, + "step": 179 + }, + { + "epoch": 0.03337041156840934, + "grad_norm": 8.7734375, + "learning_rate": 9.966629588431591e-06, + "loss": 3.1498, + "mean_token_accuracy": 0.4112565798353354, + "step": 180 + }, + { + "epoch": 0.033555802743789394, + "grad_norm": 9.984375, + "learning_rate": 9.966444197256211e-06, + "loss": 3.0118, + "mean_token_accuracy": 0.42468716205777846, + "step": 181 + }, + { + "epoch": 0.033741193919169445, + "grad_norm": 10.5078125, + "learning_rate": 9.966258806080832e-06, + "loss": 3.4268, + "mean_token_accuracy": 0.39398592450415865, + "step": 182 + }, + { + "epoch": 0.0339265850945495, + "grad_norm": 6.7109375, + "learning_rate": 9.966073414905452e-06, + "loss": 3.2557, + "mean_token_accuracy": 0.4154325798908807, + "step": 183 + }, + { + "epoch": 0.03411197626992955, + "grad_norm": 7.58203125, + "learning_rate": 9.965888023730071e-06, + "loss": 3.4369, + "mean_token_accuracy": 0.3830580713905168, + "step": 184 + }, + { + "epoch": 0.034297367445309604, + "grad_norm": 7.80078125, + "learning_rate": 9.965702632554692e-06, + "loss": 3.4169, + "mean_token_accuracy": 0.39291857798165136, + "step": 185 + }, + { + "epoch": 0.034482758620689655, + "grad_norm": 8.203125, + "learning_rate": 9.96551724137931e-06, + "loss": 3.8411, + "mean_token_accuracy": 0.35609889359377134, + "step": 186 + }, + { + "epoch": 0.034668149796069705, + "grad_norm": 12.53125, + "learning_rate": 9.965331850203931e-06, + "loss": 3.2983, + "mean_token_accuracy": 0.4042931688804554, + "step": 187 + }, + { + "epoch": 0.034853540971449756, + "grad_norm": 14.8671875, + "learning_rate": 9.965146459028552e-06, + "loss": 3.6223, + "mean_token_accuracy": 0.39536835039436147, + "step": 188 + }, + { + "epoch": 0.035038932146829814, + "grad_norm": 9.34375, + "learning_rate": 9.96496106785317e-06, + "loss": 3.7538, + "mean_token_accuracy": 0.3595505617977528, + "step": 189 + }, + { + "epoch": 0.035224323322209865, + "grad_norm": 9.46875, + "learning_rate": 9.96477567667779e-06, + "loss": 3.6352, + "mean_token_accuracy": 0.37782231128924515, + "step": 190 + }, + { + "epoch": 0.035409714497589916, + "grad_norm": 7.44140625, + "learning_rate": 9.964590285502411e-06, + "loss": 2.884, + "mean_token_accuracy": 0.4645614843840037, + "step": 191 + }, + { + "epoch": 0.035595105672969966, + "grad_norm": 9.296875, + "learning_rate": 9.964404894327032e-06, + "loss": 3.215, + "mean_token_accuracy": 0.39757904622769485, + "step": 192 + }, + { + "epoch": 0.03578049684835002, + "grad_norm": 9.9453125, + "learning_rate": 9.96421950315165e-06, + "loss": 3.123, + "mean_token_accuracy": 0.42290748898678415, + "step": 193 + }, + { + "epoch": 0.03596588802373007, + "grad_norm": 12.703125, + "learning_rate": 9.964034111976271e-06, + "loss": 3.7299, + "mean_token_accuracy": 0.37048567870485677, + "step": 194 + }, + { + "epoch": 0.036151279199110126, + "grad_norm": 7.2890625, + "learning_rate": 9.96384872080089e-06, + "loss": 3.1822, + "mean_token_accuracy": 0.4138442855215221, + "step": 195 + }, + { + "epoch": 0.036336670374490176, + "grad_norm": 9.765625, + "learning_rate": 9.96366332962551e-06, + "loss": 3.5492, + "mean_token_accuracy": 0.37267384916748286, + "step": 196 + }, + { + "epoch": 0.03652206154987023, + "grad_norm": 8.5625, + "learning_rate": 9.963477938450131e-06, + "loss": 3.4134, + "mean_token_accuracy": 0.37505614612966015, + "step": 197 + }, + { + "epoch": 0.03670745272525028, + "grad_norm": 16.390625, + "learning_rate": 9.963292547274751e-06, + "loss": 3.2484, + "mean_token_accuracy": 0.3829763744756017, + "step": 198 + }, + { + "epoch": 0.03689284390063033, + "grad_norm": 11.140625, + "learning_rate": 9.96310715609937e-06, + "loss": 3.1324, + "mean_token_accuracy": 0.41386061080657793, + "step": 199 + }, + { + "epoch": 0.03707823507601038, + "grad_norm": 10.8203125, + "learning_rate": 9.96292176492399e-06, + "loss": 3.4555, + "mean_token_accuracy": 0.3731955844891027, + "step": 200 + }, + { + "epoch": 0.03726362625139043, + "grad_norm": 10.96875, + "learning_rate": 9.962736373748611e-06, + "loss": 3.8081, + "mean_token_accuracy": 0.34589411129119396, + "step": 201 + }, + { + "epoch": 0.03744901742677049, + "grad_norm": 9.234375, + "learning_rate": 9.96255098257323e-06, + "loss": 3.3706, + "mean_token_accuracy": 0.38206023006333206, + "step": 202 + }, + { + "epoch": 0.03763440860215054, + "grad_norm": 9.5234375, + "learning_rate": 9.96236559139785e-06, + "loss": 3.1445, + "mean_token_accuracy": 0.40518207282913166, + "step": 203 + }, + { + "epoch": 0.03781979977753059, + "grad_norm": 14.484375, + "learning_rate": 9.96218020022247e-06, + "loss": 2.4262, + "mean_token_accuracy": 0.47050754458161864, + "step": 204 + }, + { + "epoch": 0.03800519095291064, + "grad_norm": 16.5625, + "learning_rate": 9.96199480904709e-06, + "loss": 3.4291, + "mean_token_accuracy": 0.39842632331902716, + "step": 205 + }, + { + "epoch": 0.03819058212829069, + "grad_norm": 34.4375, + "learning_rate": 9.96180941787171e-06, + "loss": 3.1806, + "mean_token_accuracy": 0.3886689003959793, + "step": 206 + }, + { + "epoch": 0.03837597330367074, + "grad_norm": 12.4375, + "learning_rate": 9.96162402669633e-06, + "loss": 2.9821, + "mean_token_accuracy": 0.41936780069649077, + "step": 207 + }, + { + "epoch": 0.0385613644790508, + "grad_norm": 11.6953125, + "learning_rate": 9.96143863552095e-06, + "loss": 3.2933, + "mean_token_accuracy": 0.3872255489021956, + "step": 208 + }, + { + "epoch": 0.03874675565443085, + "grad_norm": 9.515625, + "learning_rate": 9.96125324434557e-06, + "loss": 3.2637, + "mean_token_accuracy": 0.40516499282639884, + "step": 209 + }, + { + "epoch": 0.0389321468298109, + "grad_norm": 12.3125, + "learning_rate": 9.96106785317019e-06, + "loss": 3.7229, + "mean_token_accuracy": 0.37159841479524436, + "step": 210 + }, + { + "epoch": 0.03911753800519095, + "grad_norm": 21.125, + "learning_rate": 9.96088246199481e-06, + "loss": 3.1191, + "mean_token_accuracy": 0.403714893140875, + "step": 211 + }, + { + "epoch": 0.039302929180571, + "grad_norm": 18.21875, + "learning_rate": 9.96069707081943e-06, + "loss": 3.2282, + "mean_token_accuracy": 0.39150075288606323, + "step": 212 + }, + { + "epoch": 0.039488320355951054, + "grad_norm": 10.8359375, + "learning_rate": 9.960511679644049e-06, + "loss": 3.0031, + "mean_token_accuracy": 0.4197100641204349, + "step": 213 + }, + { + "epoch": 0.03967371153133111, + "grad_norm": 7.24609375, + "learning_rate": 9.96032628846867e-06, + "loss": 3.4147, + "mean_token_accuracy": 0.38453572661373836, + "step": 214 + }, + { + "epoch": 0.03985910270671116, + "grad_norm": 7.6796875, + "learning_rate": 9.96014089729329e-06, + "loss": 3.7369, + "mean_token_accuracy": 0.3621285418106427, + "step": 215 + }, + { + "epoch": 0.04004449388209121, + "grad_norm": 19.859375, + "learning_rate": 9.95995550611791e-06, + "loss": 3.1417, + "mean_token_accuracy": 0.3868991517436381, + "step": 216 + }, + { + "epoch": 0.040229885057471264, + "grad_norm": 12.640625, + "learning_rate": 9.959770114942529e-06, + "loss": 3.2834, + "mean_token_accuracy": 0.38451840645486635, + "step": 217 + }, + { + "epoch": 0.040415276232851315, + "grad_norm": 9.75, + "learning_rate": 9.95958472376715e-06, + "loss": 3.4675, + "mean_token_accuracy": 0.387737843551797, + "step": 218 + }, + { + "epoch": 0.040600667408231365, + "grad_norm": 10.328125, + "learning_rate": 9.95939933259177e-06, + "loss": 3.1948, + "mean_token_accuracy": 0.40479477846115225, + "step": 219 + }, + { + "epoch": 0.04078605858361142, + "grad_norm": 10.1640625, + "learning_rate": 9.959213941416389e-06, + "loss": 3.5703, + "mean_token_accuracy": 0.39093041438623927, + "step": 220 + }, + { + "epoch": 0.040971449758991474, + "grad_norm": 8.7421875, + "learning_rate": 9.959028550241009e-06, + "loss": 3.5699, + "mean_token_accuracy": 0.3735813366960908, + "step": 221 + }, + { + "epoch": 0.041156840934371525, + "grad_norm": 11.46875, + "learning_rate": 9.95884315906563e-06, + "loss": 3.1396, + "mean_token_accuracy": 0.4228142076502732, + "step": 222 + }, + { + "epoch": 0.041342232109751575, + "grad_norm": 9.6484375, + "learning_rate": 9.95865776789025e-06, + "loss": 3.3994, + "mean_token_accuracy": 0.39686998394863565, + "step": 223 + }, + { + "epoch": 0.041527623285131626, + "grad_norm": 7.89453125, + "learning_rate": 9.958472376714869e-06, + "loss": 3.2472, + "mean_token_accuracy": 0.39676926017969794, + "step": 224 + }, + { + "epoch": 0.04171301446051168, + "grad_norm": 8.6484375, + "learning_rate": 9.95828698553949e-06, + "loss": 3.7331, + "mean_token_accuracy": 0.35888177052999415, + "step": 225 + }, + { + "epoch": 0.041898405635891735, + "grad_norm": 9.3671875, + "learning_rate": 9.95810159436411e-06, + "loss": 3.2112, + "mean_token_accuracy": 0.4221007418976962, + "step": 226 + }, + { + "epoch": 0.042083796811271786, + "grad_norm": 9.5390625, + "learning_rate": 9.957916203188729e-06, + "loss": 3.3106, + "mean_token_accuracy": 0.3904628658038941, + "step": 227 + }, + { + "epoch": 0.042269187986651836, + "grad_norm": 9.296875, + "learning_rate": 9.95773081201335e-06, + "loss": 2.8992, + "mean_token_accuracy": 0.43082386363636366, + "step": 228 + }, + { + "epoch": 0.04245457916203189, + "grad_norm": 10.25, + "learning_rate": 9.957545420837968e-06, + "loss": 2.9827, + "mean_token_accuracy": 0.42902469306111185, + "step": 229 + }, + { + "epoch": 0.04263997033741194, + "grad_norm": 23.0625, + "learning_rate": 9.95736002966259e-06, + "loss": 2.8586, + "mean_token_accuracy": 0.42935244806220385, + "step": 230 + }, + { + "epoch": 0.04282536151279199, + "grad_norm": 15.5, + "learning_rate": 9.957174638487209e-06, + "loss": 2.8888, + "mean_token_accuracy": 0.4355179704016913, + "step": 231 + }, + { + "epoch": 0.043010752688172046, + "grad_norm": 8.359375, + "learning_rate": 9.95698924731183e-06, + "loss": 3.4322, + "mean_token_accuracy": 0.38012439920836866, + "step": 232 + }, + { + "epoch": 0.0431961438635521, + "grad_norm": 11.8671875, + "learning_rate": 9.956803856136448e-06, + "loss": 2.7555, + "mean_token_accuracy": 0.454233801851217, + "step": 233 + }, + { + "epoch": 0.04338153503893215, + "grad_norm": 10.0, + "learning_rate": 9.956618464961069e-06, + "loss": 3.2944, + "mean_token_accuracy": 0.375397019301246, + "step": 234 + }, + { + "epoch": 0.0435669262143122, + "grad_norm": 15.9296875, + "learning_rate": 9.95643307378569e-06, + "loss": 2.945, + "mean_token_accuracy": 0.40634441087613293, + "step": 235 + }, + { + "epoch": 0.04375231738969225, + "grad_norm": 13.296875, + "learning_rate": 9.956247682610308e-06, + "loss": 3.6443, + "mean_token_accuracy": 0.3519572143832499, + "step": 236 + }, + { + "epoch": 0.0439377085650723, + "grad_norm": 10.359375, + "learning_rate": 9.956062291434929e-06, + "loss": 3.0666, + "mean_token_accuracy": 0.42105263157894735, + "step": 237 + }, + { + "epoch": 0.04412309974045235, + "grad_norm": 9.8046875, + "learning_rate": 9.955876900259549e-06, + "loss": 3.062, + "mean_token_accuracy": 0.4213204592901879, + "step": 238 + }, + { + "epoch": 0.04430849091583241, + "grad_norm": 13.15625, + "learning_rate": 9.95569150908417e-06, + "loss": 2.9448, + "mean_token_accuracy": 0.4407938849403245, + "step": 239 + }, + { + "epoch": 0.04449388209121246, + "grad_norm": 10.4140625, + "learning_rate": 9.955506117908788e-06, + "loss": 3.5474, + "mean_token_accuracy": 0.3649173256649892, + "step": 240 + }, + { + "epoch": 0.04467927326659251, + "grad_norm": 7.78125, + "learning_rate": 9.955320726733409e-06, + "loss": 3.6224, + "mean_token_accuracy": 0.36583052276559863, + "step": 241 + }, + { + "epoch": 0.04486466444197256, + "grad_norm": 10.3515625, + "learning_rate": 9.955135335558028e-06, + "loss": 3.6969, + "mean_token_accuracy": 0.35927545284197376, + "step": 242 + }, + { + "epoch": 0.04505005561735261, + "grad_norm": 8.8203125, + "learning_rate": 9.954949944382648e-06, + "loss": 3.5021, + "mean_token_accuracy": 0.36568775642409845, + "step": 243 + }, + { + "epoch": 0.04523544679273266, + "grad_norm": 11.8359375, + "learning_rate": 9.954764553207269e-06, + "loss": 3.0969, + "mean_token_accuracy": 0.40544398001835047, + "step": 244 + }, + { + "epoch": 0.04542083796811272, + "grad_norm": 13.359375, + "learning_rate": 9.954579162031887e-06, + "loss": 3.5776, + "mean_token_accuracy": 0.3670543684068163, + "step": 245 + }, + { + "epoch": 0.04560622914349277, + "grad_norm": 9.265625, + "learning_rate": 9.954393770856508e-06, + "loss": 3.5283, + "mean_token_accuracy": 0.3642548737406777, + "step": 246 + }, + { + "epoch": 0.04579162031887282, + "grad_norm": 9.765625, + "learning_rate": 9.954208379681128e-06, + "loss": 3.191, + "mean_token_accuracy": 0.4285278878390735, + "step": 247 + }, + { + "epoch": 0.04597701149425287, + "grad_norm": 10.75, + "learning_rate": 9.954022988505749e-06, + "loss": 3.3679, + "mean_token_accuracy": 0.3937433722163309, + "step": 248 + }, + { + "epoch": 0.046162402669632924, + "grad_norm": 10.5078125, + "learning_rate": 9.953837597330368e-06, + "loss": 3.2337, + "mean_token_accuracy": 0.4002500852563374, + "step": 249 + }, + { + "epoch": 0.046347793845012975, + "grad_norm": 10.1796875, + "learning_rate": 9.953652206154988e-06, + "loss": 3.5841, + "mean_token_accuracy": 0.37393986121819583, + "step": 250 + }, + { + "epoch": 0.04653318502039303, + "grad_norm": 13.875, + "learning_rate": 9.953466814979607e-06, + "loss": 3.2986, + "mean_token_accuracy": 0.3838593425794474, + "step": 251 + }, + { + "epoch": 0.04671857619577308, + "grad_norm": 22.546875, + "learning_rate": 9.953281423804227e-06, + "loss": 2.995, + "mean_token_accuracy": 0.398406374501992, + "step": 252 + }, + { + "epoch": 0.046903967371153134, + "grad_norm": 13.015625, + "learning_rate": 9.953096032628848e-06, + "loss": 2.9509, + "mean_token_accuracy": 0.421219646799117, + "step": 253 + }, + { + "epoch": 0.047089358546533185, + "grad_norm": 10.4609375, + "learning_rate": 9.952910641453467e-06, + "loss": 3.1138, + "mean_token_accuracy": 0.41052864410528644, + "step": 254 + }, + { + "epoch": 0.047274749721913235, + "grad_norm": 10.5390625, + "learning_rate": 9.952725250278087e-06, + "loss": 2.6581, + "mean_token_accuracy": 0.4651910950971561, + "step": 255 + }, + { + "epoch": 0.047460140897293286, + "grad_norm": 20.65625, + "learning_rate": 9.952539859102708e-06, + "loss": 2.5778, + "mean_token_accuracy": 0.4835465372739303, + "step": 256 + }, + { + "epoch": 0.047645532072673344, + "grad_norm": 14.8828125, + "learning_rate": 9.952354467927328e-06, + "loss": 2.8463, + "mean_token_accuracy": 0.4397463002114165, + "step": 257 + }, + { + "epoch": 0.047830923248053395, + "grad_norm": 10.515625, + "learning_rate": 9.952169076751947e-06, + "loss": 3.0023, + "mean_token_accuracy": 0.4152579107065453, + "step": 258 + }, + { + "epoch": 0.048016314423433445, + "grad_norm": 10.2421875, + "learning_rate": 9.951983685576567e-06, + "loss": 3.172, + "mean_token_accuracy": 0.41314093583636874, + "step": 259 + }, + { + "epoch": 0.048201705598813496, + "grad_norm": 8.796875, + "learning_rate": 9.951798294401186e-06, + "loss": 4.0024, + "mean_token_accuracy": 0.3308007718282682, + "step": 260 + }, + { + "epoch": 0.04838709677419355, + "grad_norm": 8.859375, + "learning_rate": 9.951612903225807e-06, + "loss": 3.3318, + "mean_token_accuracy": 0.3919406771392331, + "step": 261 + }, + { + "epoch": 0.0485724879495736, + "grad_norm": 8.7109375, + "learning_rate": 9.951427512050427e-06, + "loss": 3.3641, + "mean_token_accuracy": 0.39799622477130825, + "step": 262 + }, + { + "epoch": 0.048757879124953656, + "grad_norm": 7.203125, + "learning_rate": 9.951242120875048e-06, + "loss": 3.9871, + "mean_token_accuracy": 0.32790697674418606, + "step": 263 + }, + { + "epoch": 0.048943270300333706, + "grad_norm": 9.84375, + "learning_rate": 9.951056729699668e-06, + "loss": 3.6859, + "mean_token_accuracy": 0.37404908856035596, + "step": 264 + }, + { + "epoch": 0.04912866147571376, + "grad_norm": 9.0703125, + "learning_rate": 9.950871338524287e-06, + "loss": 3.5331, + "mean_token_accuracy": 0.3781861292234736, + "step": 265 + }, + { + "epoch": 0.04931405265109381, + "grad_norm": 8.8125, + "learning_rate": 9.950685947348908e-06, + "loss": 3.0759, + "mean_token_accuracy": 0.39980732177263967, + "step": 266 + }, + { + "epoch": 0.04949944382647386, + "grad_norm": 9.359375, + "learning_rate": 9.950500556173526e-06, + "loss": 3.3699, + "mean_token_accuracy": 0.3899159663865546, + "step": 267 + }, + { + "epoch": 0.04968483500185391, + "grad_norm": 11.328125, + "learning_rate": 9.950315164998147e-06, + "loss": 3.1082, + "mean_token_accuracy": 0.4157303370786517, + "step": 268 + }, + { + "epoch": 0.04987022617723396, + "grad_norm": 7.4296875, + "learning_rate": 9.950129773822767e-06, + "loss": 2.8153, + "mean_token_accuracy": 0.4501039501039501, + "step": 269 + }, + { + "epoch": 0.05005561735261402, + "grad_norm": 8.8671875, + "learning_rate": 9.949944382647386e-06, + "loss": 2.8924, + "mean_token_accuracy": 0.42946110828673106, + "step": 270 + }, + { + "epoch": 0.05024100852799407, + "grad_norm": 8.671875, + "learning_rate": 9.949758991472007e-06, + "loss": 3.1582, + "mean_token_accuracy": 0.4190692395005675, + "step": 271 + }, + { + "epoch": 0.05042639970337412, + "grad_norm": 10.5703125, + "learning_rate": 9.949573600296627e-06, + "loss": 2.8333, + "mean_token_accuracy": 0.44422398589065254, + "step": 272 + }, + { + "epoch": 0.05061179087875417, + "grad_norm": 8.78125, + "learning_rate": 9.949388209121248e-06, + "loss": 3.1242, + "mean_token_accuracy": 0.40270494065691415, + "step": 273 + }, + { + "epoch": 0.05079718205413422, + "grad_norm": 8.234375, + "learning_rate": 9.949202817945866e-06, + "loss": 3.5041, + "mean_token_accuracy": 0.39269535673839184, + "step": 274 + }, + { + "epoch": 0.05098257322951427, + "grad_norm": 6.578125, + "learning_rate": 9.949017426770487e-06, + "loss": 3.1813, + "mean_token_accuracy": 0.40011883541295307, + "step": 275 + }, + { + "epoch": 0.05116796440489433, + "grad_norm": 7.16796875, + "learning_rate": 9.948832035595106e-06, + "loss": 3.5754, + "mean_token_accuracy": 0.36409686187299234, + "step": 276 + }, + { + "epoch": 0.05135335558027438, + "grad_norm": 7.625, + "learning_rate": 9.948646644419726e-06, + "loss": 3.3745, + "mean_token_accuracy": 0.3978096788515613, + "step": 277 + }, + { + "epoch": 0.05153874675565443, + "grad_norm": 7.98046875, + "learning_rate": 9.948461253244347e-06, + "loss": 3.459, + "mean_token_accuracy": 0.3805266579973992, + "step": 278 + }, + { + "epoch": 0.05172413793103448, + "grad_norm": 7.01953125, + "learning_rate": 9.948275862068967e-06, + "loss": 3.7506, + "mean_token_accuracy": 0.36843579330625964, + "step": 279 + }, + { + "epoch": 0.05190952910641453, + "grad_norm": 6.90625, + "learning_rate": 9.948090470893586e-06, + "loss": 3.0627, + "mean_token_accuracy": 0.4097297297297297, + "step": 280 + }, + { + "epoch": 0.052094920281794584, + "grad_norm": 8.015625, + "learning_rate": 9.947905079718206e-06, + "loss": 3.4648, + "mean_token_accuracy": 0.3831575729787559, + "step": 281 + }, + { + "epoch": 0.05228031145717464, + "grad_norm": 10.25, + "learning_rate": 9.947719688542827e-06, + "loss": 2.8794, + "mean_token_accuracy": 0.4348494554772582, + "step": 282 + }, + { + "epoch": 0.05246570263255469, + "grad_norm": 7.5234375, + "learning_rate": 9.947534297367446e-06, + "loss": 3.5441, + "mean_token_accuracy": 0.3593377483443709, + "step": 283 + }, + { + "epoch": 0.05265109380793474, + "grad_norm": 10.484375, + "learning_rate": 9.947348906192066e-06, + "loss": 3.3728, + "mean_token_accuracy": 0.3731917993547716, + "step": 284 + }, + { + "epoch": 0.052836484983314794, + "grad_norm": 13.0390625, + "learning_rate": 9.947163515016685e-06, + "loss": 3.1653, + "mean_token_accuracy": 0.3817497876471302, + "step": 285 + }, + { + "epoch": 0.053021876158694844, + "grad_norm": 9.828125, + "learning_rate": 9.946978123841305e-06, + "loss": 2.8067, + "mean_token_accuracy": 0.4605310155970949, + "step": 286 + }, + { + "epoch": 0.053207267334074895, + "grad_norm": 14.625, + "learning_rate": 9.946792732665926e-06, + "loss": 3.2071, + "mean_token_accuracy": 0.40423710678365077, + "step": 287 + }, + { + "epoch": 0.05339265850945495, + "grad_norm": 11.171875, + "learning_rate": 9.946607341490546e-06, + "loss": 3.2238, + "mean_token_accuracy": 0.4034090909090909, + "step": 288 + }, + { + "epoch": 0.053578049684835004, + "grad_norm": 13.09375, + "learning_rate": 9.946421950315165e-06, + "loss": 3.3854, + "mean_token_accuracy": 0.3754677754677755, + "step": 289 + }, + { + "epoch": 0.053763440860215055, + "grad_norm": 13.9921875, + "learning_rate": 9.946236559139786e-06, + "loss": 3.2726, + "mean_token_accuracy": 0.38038548752834467, + "step": 290 + }, + { + "epoch": 0.053948832035595105, + "grad_norm": 12.5546875, + "learning_rate": 9.946051167964406e-06, + "loss": 3.324, + "mean_token_accuracy": 0.392080137736735, + "step": 291 + }, + { + "epoch": 0.054134223210975156, + "grad_norm": 8.515625, + "learning_rate": 9.945865776789025e-06, + "loss": 3.2803, + "mean_token_accuracy": 0.38600987091875477, + "step": 292 + }, + { + "epoch": 0.05431961438635521, + "grad_norm": 8.1796875, + "learning_rate": 9.945680385613646e-06, + "loss": 3.2993, + "mean_token_accuracy": 0.40348886682740037, + "step": 293 + }, + { + "epoch": 0.054505005561735265, + "grad_norm": 12.3359375, + "learning_rate": 9.945494994438264e-06, + "loss": 3.2883, + "mean_token_accuracy": 0.39398750926219966, + "step": 294 + }, + { + "epoch": 0.054690396737115315, + "grad_norm": 9.5546875, + "learning_rate": 9.945309603262887e-06, + "loss": 3.0073, + "mean_token_accuracy": 0.43836276083467096, + "step": 295 + }, + { + "epoch": 0.054875787912495366, + "grad_norm": 17.671875, + "learning_rate": 9.945124212087505e-06, + "loss": 2.9189, + "mean_token_accuracy": 0.4206308169596691, + "step": 296 + }, + { + "epoch": 0.05506117908787542, + "grad_norm": 9.5078125, + "learning_rate": 9.944938820912126e-06, + "loss": 3.2944, + "mean_token_accuracy": 0.3911849512563504, + "step": 297 + }, + { + "epoch": 0.05524657026325547, + "grad_norm": 8.2578125, + "learning_rate": 9.944753429736745e-06, + "loss": 2.8802, + "mean_token_accuracy": 0.435657629927785, + "step": 298 + }, + { + "epoch": 0.05543196143863552, + "grad_norm": 9.2265625, + "learning_rate": 9.944568038561365e-06, + "loss": 2.9501, + "mean_token_accuracy": 0.4169729368526562, + "step": 299 + }, + { + "epoch": 0.05561735261401557, + "grad_norm": 7.32421875, + "learning_rate": 9.944382647385986e-06, + "loss": 3.428, + "mean_token_accuracy": 0.392517725147081, + "step": 300 + }, + { + "epoch": 0.05580274378939563, + "grad_norm": 7.921875, + "learning_rate": 9.944197256210604e-06, + "loss": 3.4753, + "mean_token_accuracy": 0.36510085284960064, + "step": 301 + }, + { + "epoch": 0.05598813496477568, + "grad_norm": 8.2265625, + "learning_rate": 9.944011865035225e-06, + "loss": 2.6533, + "mean_token_accuracy": 0.4446389496717724, + "step": 302 + }, + { + "epoch": 0.05617352614015573, + "grad_norm": 8.9453125, + "learning_rate": 9.943826473859845e-06, + "loss": 3.4928, + "mean_token_accuracy": 0.35917901938426455, + "step": 303 + }, + { + "epoch": 0.05635891731553578, + "grad_norm": 6.96484375, + "learning_rate": 9.943641082684466e-06, + "loss": 3.2542, + "mean_token_accuracy": 0.4170755642787046, + "step": 304 + }, + { + "epoch": 0.05654430849091583, + "grad_norm": 9.4453125, + "learning_rate": 9.943455691509085e-06, + "loss": 2.977, + "mean_token_accuracy": 0.43080593849416754, + "step": 305 + }, + { + "epoch": 0.05672969966629588, + "grad_norm": 8.109375, + "learning_rate": 9.943270300333705e-06, + "loss": 3.3331, + "mean_token_accuracy": 0.3952975753122704, + "step": 306 + }, + { + "epoch": 0.05691509084167594, + "grad_norm": 7.30859375, + "learning_rate": 9.943084909158326e-06, + "loss": 3.6364, + "mean_token_accuracy": 0.371665582303188, + "step": 307 + }, + { + "epoch": 0.05710048201705599, + "grad_norm": 7.671875, + "learning_rate": 9.942899517982944e-06, + "loss": 3.5286, + "mean_token_accuracy": 0.3780568407138136, + "step": 308 + }, + { + "epoch": 0.05728587319243604, + "grad_norm": 7.01171875, + "learning_rate": 9.942714126807565e-06, + "loss": 3.4153, + "mean_token_accuracy": 0.38449289563939243, + "step": 309 + }, + { + "epoch": 0.05747126436781609, + "grad_norm": 7.1953125, + "learning_rate": 9.942528735632184e-06, + "loss": 3.0392, + "mean_token_accuracy": 0.3950374646849281, + "step": 310 + }, + { + "epoch": 0.05765665554319614, + "grad_norm": 7.28515625, + "learning_rate": 9.942343344456806e-06, + "loss": 2.8286, + "mean_token_accuracy": 0.4390590464161667, + "step": 311 + }, + { + "epoch": 0.05784204671857619, + "grad_norm": 7.40625, + "learning_rate": 9.942157953281425e-06, + "loss": 3.4708, + "mean_token_accuracy": 0.36556501659691154, + "step": 312 + }, + { + "epoch": 0.05802743789395625, + "grad_norm": 12.171875, + "learning_rate": 9.941972562106045e-06, + "loss": 3.1521, + "mean_token_accuracy": 0.41821471652593484, + "step": 313 + }, + { + "epoch": 0.0582128290693363, + "grad_norm": 8.125, + "learning_rate": 9.941787170930664e-06, + "loss": 3.3218, + "mean_token_accuracy": 0.39399853622834835, + "step": 314 + }, + { + "epoch": 0.05839822024471635, + "grad_norm": 9.796875, + "learning_rate": 9.941601779755284e-06, + "loss": 3.2287, + "mean_token_accuracy": 0.40054598512661205, + "step": 315 + }, + { + "epoch": 0.0585836114200964, + "grad_norm": 9.4140625, + "learning_rate": 9.941416388579905e-06, + "loss": 3.0306, + "mean_token_accuracy": 0.41148190045248867, + "step": 316 + }, + { + "epoch": 0.058769002595476454, + "grad_norm": 7.82421875, + "learning_rate": 9.941230997404524e-06, + "loss": 3.1382, + "mean_token_accuracy": 0.42365652544782484, + "step": 317 + }, + { + "epoch": 0.058954393770856504, + "grad_norm": 7.359375, + "learning_rate": 9.941045606229144e-06, + "loss": 3.0658, + "mean_token_accuracy": 0.4279584775086505, + "step": 318 + }, + { + "epoch": 0.05913978494623656, + "grad_norm": 7.0390625, + "learning_rate": 9.940860215053765e-06, + "loss": 3.0132, + "mean_token_accuracy": 0.4064222712238148, + "step": 319 + }, + { + "epoch": 0.05932517612161661, + "grad_norm": 8.1875, + "learning_rate": 9.940674823878385e-06, + "loss": 3.3134, + "mean_token_accuracy": 0.3796250207400033, + "step": 320 + }, + { + "epoch": 0.059510567296996664, + "grad_norm": 8.390625, + "learning_rate": 9.940489432703004e-06, + "loss": 2.6812, + "mean_token_accuracy": 0.47064432638605097, + "step": 321 + }, + { + "epoch": 0.059695958472376714, + "grad_norm": 8.2265625, + "learning_rate": 9.940304041527625e-06, + "loss": 3.2405, + "mean_token_accuracy": 0.3894899536321484, + "step": 322 + }, + { + "epoch": 0.059881349647756765, + "grad_norm": 7.78125, + "learning_rate": 9.940118650352243e-06, + "loss": 3.5864, + "mean_token_accuracy": 0.3572093023255814, + "step": 323 + }, + { + "epoch": 0.060066740823136816, + "grad_norm": 6.88671875, + "learning_rate": 9.939933259176864e-06, + "loss": 3.335, + "mean_token_accuracy": 0.3887285690035648, + "step": 324 + }, + { + "epoch": 0.060252131998516874, + "grad_norm": 8.1796875, + "learning_rate": 9.939747868001484e-06, + "loss": 3.171, + "mean_token_accuracy": 0.4153274407392527, + "step": 325 + }, + { + "epoch": 0.060437523173896925, + "grad_norm": 9.6640625, + "learning_rate": 9.939562476826103e-06, + "loss": 2.8717, + "mean_token_accuracy": 0.42344694887300716, + "step": 326 + }, + { + "epoch": 0.060622914349276975, + "grad_norm": 10.09375, + "learning_rate": 9.939377085650724e-06, + "loss": 3.0943, + "mean_token_accuracy": 0.4212737127371274, + "step": 327 + }, + { + "epoch": 0.060808305524657026, + "grad_norm": 8.15625, + "learning_rate": 9.939191694475344e-06, + "loss": 2.8487, + "mean_token_accuracy": 0.4289034132171387, + "step": 328 + }, + { + "epoch": 0.06099369670003708, + "grad_norm": 7.00390625, + "learning_rate": 9.939006303299965e-06, + "loss": 3.358, + "mean_token_accuracy": 0.3919404517453799, + "step": 329 + }, + { + "epoch": 0.06117908787541713, + "grad_norm": 11.09375, + "learning_rate": 9.938820912124583e-06, + "loss": 2.9496, + "mean_token_accuracy": 0.41870147406323843, + "step": 330 + }, + { + "epoch": 0.061364479050797185, + "grad_norm": 8.1875, + "learning_rate": 9.938635520949204e-06, + "loss": 3.0177, + "mean_token_accuracy": 0.4148455622024173, + "step": 331 + }, + { + "epoch": 0.061549870226177236, + "grad_norm": 8.4140625, + "learning_rate": 9.938450129773823e-06, + "loss": 3.4472, + "mean_token_accuracy": 0.36963210702341137, + "step": 332 + }, + { + "epoch": 0.06173526140155729, + "grad_norm": 9.328125, + "learning_rate": 9.938264738598443e-06, + "loss": 3.0677, + "mean_token_accuracy": 0.39379347244515783, + "step": 333 + }, + { + "epoch": 0.06192065257693734, + "grad_norm": 8.75, + "learning_rate": 9.938079347423064e-06, + "loss": 3.2314, + "mean_token_accuracy": 0.38148958213422357, + "step": 334 + }, + { + "epoch": 0.06210604375231739, + "grad_norm": 10.4609375, + "learning_rate": 9.937893956247684e-06, + "loss": 3.256, + "mean_token_accuracy": 0.37461950246667364, + "step": 335 + }, + { + "epoch": 0.06229143492769744, + "grad_norm": 8.3359375, + "learning_rate": 9.937708565072303e-06, + "loss": 3.2837, + "mean_token_accuracy": 0.39097202192096914, + "step": 336 + }, + { + "epoch": 0.06247682610307749, + "grad_norm": 6.83984375, + "learning_rate": 9.937523173896923e-06, + "loss": 3.0507, + "mean_token_accuracy": 0.4053655660377358, + "step": 337 + }, + { + "epoch": 0.06266221727845754, + "grad_norm": 8.921875, + "learning_rate": 9.937337782721544e-06, + "loss": 2.9651, + "mean_token_accuracy": 0.4084235503485096, + "step": 338 + }, + { + "epoch": 0.06284760845383759, + "grad_norm": 11.9453125, + "learning_rate": 9.937152391546163e-06, + "loss": 3.2857, + "mean_token_accuracy": 0.3715688462396786, + "step": 339 + }, + { + "epoch": 0.06303299962921764, + "grad_norm": 7.9140625, + "learning_rate": 9.936967000370783e-06, + "loss": 2.8766, + "mean_token_accuracy": 0.43730668570069, + "step": 340 + }, + { + "epoch": 0.06321839080459771, + "grad_norm": 32.53125, + "learning_rate": 9.936781609195402e-06, + "loss": 3.2508, + "mean_token_accuracy": 0.3707114026236125, + "step": 341 + }, + { + "epoch": 0.06340378197997776, + "grad_norm": 8.7109375, + "learning_rate": 9.936596218020022e-06, + "loss": 3.0083, + "mean_token_accuracy": 0.4149775645984837, + "step": 342 + }, + { + "epoch": 0.06358917315535781, + "grad_norm": 8.6171875, + "learning_rate": 9.936410826844643e-06, + "loss": 3.0531, + "mean_token_accuracy": 0.4118731078057128, + "step": 343 + }, + { + "epoch": 0.06377456433073786, + "grad_norm": 7.37109375, + "learning_rate": 9.936225435669263e-06, + "loss": 3.1703, + "mean_token_accuracy": 0.39235832396853426, + "step": 344 + }, + { + "epoch": 0.06395995550611791, + "grad_norm": 6.90625, + "learning_rate": 9.936040044493884e-06, + "loss": 3.3839, + "mean_token_accuracy": 0.37969796752583174, + "step": 345 + }, + { + "epoch": 0.06414534668149796, + "grad_norm": 7.08203125, + "learning_rate": 9.935854653318503e-06, + "loss": 3.2928, + "mean_token_accuracy": 0.39885729832675826, + "step": 346 + }, + { + "epoch": 0.06433073785687801, + "grad_norm": 9.21875, + "learning_rate": 9.935669262143123e-06, + "loss": 3.1548, + "mean_token_accuracy": 0.40253516772298653, + "step": 347 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 7.96484375, + "learning_rate": 9.935483870967742e-06, + "loss": 3.379, + "mean_token_accuracy": 0.38612143742255267, + "step": 348 + }, + { + "epoch": 0.06470152020763811, + "grad_norm": 14.0859375, + "learning_rate": 9.935298479792363e-06, + "loss": 3.0443, + "mean_token_accuracy": 0.3883242732214606, + "step": 349 + }, + { + "epoch": 0.06488691138301816, + "grad_norm": 7.8984375, + "learning_rate": 9.935113088616983e-06, + "loss": 3.0287, + "mean_token_accuracy": 0.4172011182371321, + "step": 350 + }, + { + "epoch": 0.06507230255839822, + "grad_norm": 9.2890625, + "learning_rate": 9.934927697441604e-06, + "loss": 2.8132, + "mean_token_accuracy": 0.4324363849078678, + "step": 351 + }, + { + "epoch": 0.06525769373377827, + "grad_norm": 7.52734375, + "learning_rate": 9.934742306266222e-06, + "loss": 3.1722, + "mean_token_accuracy": 0.37766203703703705, + "step": 352 + }, + { + "epoch": 0.06544308490915833, + "grad_norm": 14.6796875, + "learning_rate": 9.934556915090843e-06, + "loss": 3.2414, + "mean_token_accuracy": 0.4192169837331128, + "step": 353 + }, + { + "epoch": 0.06562847608453838, + "grad_norm": 11.1171875, + "learning_rate": 9.934371523915463e-06, + "loss": 3.0014, + "mean_token_accuracy": 0.4087623220153341, + "step": 354 + }, + { + "epoch": 0.06581386725991843, + "grad_norm": 8.15625, + "learning_rate": 9.934186132740082e-06, + "loss": 3.0859, + "mean_token_accuracy": 0.4166019166019166, + "step": 355 + }, + { + "epoch": 0.06599925843529848, + "grad_norm": 7.78125, + "learning_rate": 9.934000741564703e-06, + "loss": 3.2385, + "mean_token_accuracy": 0.39176453530520516, + "step": 356 + }, + { + "epoch": 0.06618464961067853, + "grad_norm": 10.15625, + "learning_rate": 9.933815350389321e-06, + "loss": 2.9305, + "mean_token_accuracy": 0.4167066730676908, + "step": 357 + }, + { + "epoch": 0.06637004078605858, + "grad_norm": 8.0703125, + "learning_rate": 9.933629959213942e-06, + "loss": 3.4065, + "mean_token_accuracy": 0.3885509838998211, + "step": 358 + }, + { + "epoch": 0.06655543196143864, + "grad_norm": 13.6328125, + "learning_rate": 9.933444568038562e-06, + "loss": 3.1736, + "mean_token_accuracy": 0.3686071473670918, + "step": 359 + }, + { + "epoch": 0.06674082313681869, + "grad_norm": 10.0078125, + "learning_rate": 9.933259176863183e-06, + "loss": 3.4273, + "mean_token_accuracy": 0.37640109057861254, + "step": 360 + }, + { + "epoch": 0.06692621431219874, + "grad_norm": 13.15625, + "learning_rate": 9.933073785687802e-06, + "loss": 3.1109, + "mean_token_accuracy": 0.4104465835568881, + "step": 361 + }, + { + "epoch": 0.06711160548757879, + "grad_norm": 9.8359375, + "learning_rate": 9.932888394512422e-06, + "loss": 2.9534, + "mean_token_accuracy": 0.41854456724558636, + "step": 362 + }, + { + "epoch": 0.06729699666295884, + "grad_norm": 9.203125, + "learning_rate": 9.932703003337043e-06, + "loss": 3.1304, + "mean_token_accuracy": 0.39285248467457934, + "step": 363 + }, + { + "epoch": 0.06748238783833889, + "grad_norm": 14.5546875, + "learning_rate": 9.932517612161661e-06, + "loss": 3.5019, + "mean_token_accuracy": 0.36822405782137324, + "step": 364 + }, + { + "epoch": 0.06766777901371895, + "grad_norm": 13.6640625, + "learning_rate": 9.932332220986282e-06, + "loss": 2.765, + "mean_token_accuracy": 0.41987221280479853, + "step": 365 + }, + { + "epoch": 0.067853170189099, + "grad_norm": 8.484375, + "learning_rate": 9.9321468298109e-06, + "loss": 3.935, + "mean_token_accuracy": 0.3432546470521154, + "step": 366 + }, + { + "epoch": 0.06803856136447906, + "grad_norm": 11.6640625, + "learning_rate": 9.931961438635523e-06, + "loss": 2.8361, + "mean_token_accuracy": 0.42698781989872725, + "step": 367 + }, + { + "epoch": 0.0682239525398591, + "grad_norm": 7.37890625, + "learning_rate": 9.931776047460142e-06, + "loss": 3.107, + "mean_token_accuracy": 0.40388738030584537, + "step": 368 + }, + { + "epoch": 0.06840934371523916, + "grad_norm": 8.2265625, + "learning_rate": 9.931590656284762e-06, + "loss": 3.3055, + "mean_token_accuracy": 0.3911715713492504, + "step": 369 + }, + { + "epoch": 0.06859473489061921, + "grad_norm": 10.2421875, + "learning_rate": 9.931405265109381e-06, + "loss": 3.2748, + "mean_token_accuracy": 0.373602667189645, + "step": 370 + }, + { + "epoch": 0.06878012606599926, + "grad_norm": 9.9921875, + "learning_rate": 9.931219873934002e-06, + "loss": 3.1735, + "mean_token_accuracy": 0.40048396854204477, + "step": 371 + }, + { + "epoch": 0.06896551724137931, + "grad_norm": 12.8359375, + "learning_rate": 9.931034482758622e-06, + "loss": 3.1394, + "mean_token_accuracy": 0.3843685537474215, + "step": 372 + }, + { + "epoch": 0.06915090841675936, + "grad_norm": 19.09375, + "learning_rate": 9.93084909158324e-06, + "loss": 2.8149, + "mean_token_accuracy": 0.42526997840172787, + "step": 373 + }, + { + "epoch": 0.06933629959213941, + "grad_norm": 13.1328125, + "learning_rate": 9.930663700407861e-06, + "loss": 2.9503, + "mean_token_accuracy": 0.422032262512064, + "step": 374 + }, + { + "epoch": 0.06952169076751946, + "grad_norm": 12.9140625, + "learning_rate": 9.93047830923248e-06, + "loss": 3.3978, + "mean_token_accuracy": 0.39074910450085654, + "step": 375 + }, + { + "epoch": 0.06970708194289951, + "grad_norm": 8.9453125, + "learning_rate": 9.930292918057102e-06, + "loss": 3.4736, + "mean_token_accuracy": 0.38575803981623275, + "step": 376 + }, + { + "epoch": 0.06989247311827956, + "grad_norm": 7.98828125, + "learning_rate": 9.930107526881721e-06, + "loss": 3.3877, + "mean_token_accuracy": 0.3846567967698519, + "step": 377 + }, + { + "epoch": 0.07007786429365963, + "grad_norm": 6.87890625, + "learning_rate": 9.929922135706342e-06, + "loss": 3.6124, + "mean_token_accuracy": 0.365993811698836, + "step": 378 + }, + { + "epoch": 0.07026325546903968, + "grad_norm": 14.9375, + "learning_rate": 9.92973674453096e-06, + "loss": 2.9345, + "mean_token_accuracy": 0.4124666751301257, + "step": 379 + }, + { + "epoch": 0.07044864664441973, + "grad_norm": 17.203125, + "learning_rate": 9.92955135335558e-06, + "loss": 2.9832, + "mean_token_accuracy": 0.4029434719589698, + "step": 380 + }, + { + "epoch": 0.07063403781979978, + "grad_norm": 15.25, + "learning_rate": 9.929365962180201e-06, + "loss": 2.7983, + "mean_token_accuracy": 0.4282109177845663, + "step": 381 + }, + { + "epoch": 0.07081942899517983, + "grad_norm": 9.1171875, + "learning_rate": 9.92918057100482e-06, + "loss": 2.9869, + "mean_token_accuracy": 0.4103250478011472, + "step": 382 + }, + { + "epoch": 0.07100482017055988, + "grad_norm": 8.8515625, + "learning_rate": 9.92899517982944e-06, + "loss": 2.9022, + "mean_token_accuracy": 0.40809928151534947, + "step": 383 + }, + { + "epoch": 0.07119021134593993, + "grad_norm": 6.50390625, + "learning_rate": 9.928809788654061e-06, + "loss": 3.49, + "mean_token_accuracy": 0.37555982085732564, + "step": 384 + }, + { + "epoch": 0.07137560252131998, + "grad_norm": 8.5546875, + "learning_rate": 9.928624397478682e-06, + "loss": 3.0957, + "mean_token_accuracy": 0.40704993705413345, + "step": 385 + }, + { + "epoch": 0.07156099369670003, + "grad_norm": 7.35546875, + "learning_rate": 9.9284390063033e-06, + "loss": 3.3227, + "mean_token_accuracy": 0.3978283621140764, + "step": 386 + }, + { + "epoch": 0.07174638487208009, + "grad_norm": 9.7421875, + "learning_rate": 9.928253615127921e-06, + "loss": 2.8282, + "mean_token_accuracy": 0.4238902114549783, + "step": 387 + }, + { + "epoch": 0.07193177604746014, + "grad_norm": 10.140625, + "learning_rate": 9.928068223952541e-06, + "loss": 3.4092, + "mean_token_accuracy": 0.3781851274050962, + "step": 388 + }, + { + "epoch": 0.07211716722284019, + "grad_norm": 7.36328125, + "learning_rate": 9.92788283277716e-06, + "loss": 3.3315, + "mean_token_accuracy": 0.3859465128474043, + "step": 389 + }, + { + "epoch": 0.07230255839822025, + "grad_norm": 7.24609375, + "learning_rate": 9.92769744160178e-06, + "loss": 3.0029, + "mean_token_accuracy": 0.4217097277614011, + "step": 390 + }, + { + "epoch": 0.0724879495736003, + "grad_norm": 12.328125, + "learning_rate": 9.9275120504264e-06, + "loss": 3.1637, + "mean_token_accuracy": 0.39295695919508106, + "step": 391 + }, + { + "epoch": 0.07267334074898035, + "grad_norm": 6.78125, + "learning_rate": 9.927326659251022e-06, + "loss": 3.2558, + "mean_token_accuracy": 0.38784781374219196, + "step": 392 + }, + { + "epoch": 0.0728587319243604, + "grad_norm": 5.69921875, + "learning_rate": 9.92714126807564e-06, + "loss": 3.2201, + "mean_token_accuracy": 0.3929712460063898, + "step": 393 + }, + { + "epoch": 0.07304412309974045, + "grad_norm": 7.26953125, + "learning_rate": 9.926955876900261e-06, + "loss": 3.357, + "mean_token_accuracy": 0.39025779803446803, + "step": 394 + }, + { + "epoch": 0.0732295142751205, + "grad_norm": 8.6875, + "learning_rate": 9.92677048572488e-06, + "loss": 3.0221, + "mean_token_accuracy": 0.40974671369028537, + "step": 395 + }, + { + "epoch": 0.07341490545050056, + "grad_norm": 11.1328125, + "learning_rate": 9.9265850945495e-06, + "loss": 3.2178, + "mean_token_accuracy": 0.39492710679151355, + "step": 396 + }, + { + "epoch": 0.0736002966258806, + "grad_norm": 6.71875, + "learning_rate": 9.92639970337412e-06, + "loss": 3.4733, + "mean_token_accuracy": 0.37597911227154046, + "step": 397 + }, + { + "epoch": 0.07378568780126066, + "grad_norm": 10.546875, + "learning_rate": 9.92621431219874e-06, + "loss": 3.004, + "mean_token_accuracy": 0.4068651374654303, + "step": 398 + }, + { + "epoch": 0.07397107897664071, + "grad_norm": 6.87109375, + "learning_rate": 9.92602892102336e-06, + "loss": 3.0414, + "mean_token_accuracy": 0.42359005457853244, + "step": 399 + }, + { + "epoch": 0.07415647015202076, + "grad_norm": 7.52734375, + "learning_rate": 9.92584352984798e-06, + "loss": 3.3743, + "mean_token_accuracy": 0.3960429621254946, + "step": 400 + }, + { + "epoch": 0.07434186132740081, + "grad_norm": 16.5625, + "learning_rate": 9.925658138672601e-06, + "loss": 3.0779, + "mean_token_accuracy": 0.4026806526806527, + "step": 401 + }, + { + "epoch": 0.07452725250278086, + "grad_norm": 6.66015625, + "learning_rate": 9.92547274749722e-06, + "loss": 3.1661, + "mean_token_accuracy": 0.39506468615237184, + "step": 402 + }, + { + "epoch": 0.07471264367816093, + "grad_norm": 6.7109375, + "learning_rate": 9.92528735632184e-06, + "loss": 3.1679, + "mean_token_accuracy": 0.4066322370209296, + "step": 403 + }, + { + "epoch": 0.07489803485354098, + "grad_norm": 6.30859375, + "learning_rate": 9.925101965146459e-06, + "loss": 2.8993, + "mean_token_accuracy": 0.42763237979306146, + "step": 404 + }, + { + "epoch": 0.07508342602892103, + "grad_norm": 8.0078125, + "learning_rate": 9.92491657397108e-06, + "loss": 3.0765, + "mean_token_accuracy": 0.4217510457233521, + "step": 405 + }, + { + "epoch": 0.07526881720430108, + "grad_norm": 7.9609375, + "learning_rate": 9.9247311827957e-06, + "loss": 3.0178, + "mean_token_accuracy": 0.4194839857651246, + "step": 406 + }, + { + "epoch": 0.07545420837968113, + "grad_norm": 31.734375, + "learning_rate": 9.924545791620319e-06, + "loss": 3.415, + "mean_token_accuracy": 0.36237650933040616, + "step": 407 + }, + { + "epoch": 0.07563959955506118, + "grad_norm": 9.1953125, + "learning_rate": 9.92436040044494e-06, + "loss": 3.025, + "mean_token_accuracy": 0.42097791798107254, + "step": 408 + }, + { + "epoch": 0.07582499073044123, + "grad_norm": 7.0859375, + "learning_rate": 9.92417500926956e-06, + "loss": 2.794, + "mean_token_accuracy": 0.4515760040671073, + "step": 409 + }, + { + "epoch": 0.07601038190582128, + "grad_norm": 7.0859375, + "learning_rate": 9.92398961809418e-06, + "loss": 3.2994, + "mean_token_accuracy": 0.3854875283446712, + "step": 410 + }, + { + "epoch": 0.07619577308120133, + "grad_norm": 7.4375, + "learning_rate": 9.923804226918799e-06, + "loss": 3.2092, + "mean_token_accuracy": 0.40295767465578786, + "step": 411 + }, + { + "epoch": 0.07638116425658138, + "grad_norm": 7.83984375, + "learning_rate": 9.92361883574342e-06, + "loss": 3.7227, + "mean_token_accuracy": 0.35226628895184137, + "step": 412 + }, + { + "epoch": 0.07656655543196143, + "grad_norm": 6.5390625, + "learning_rate": 9.923433444568038e-06, + "loss": 3.0243, + "mean_token_accuracy": 0.4165524801315429, + "step": 413 + }, + { + "epoch": 0.07675194660734148, + "grad_norm": 8.03125, + "learning_rate": 9.923248053392659e-06, + "loss": 3.0522, + "mean_token_accuracy": 0.4143056200650255, + "step": 414 + }, + { + "epoch": 0.07693733778272155, + "grad_norm": 6.66796875, + "learning_rate": 9.92306266221728e-06, + "loss": 2.7394, + "mean_token_accuracy": 0.4457690812491901, + "step": 415 + }, + { + "epoch": 0.0771227289581016, + "grad_norm": 6.54296875, + "learning_rate": 9.9228772710419e-06, + "loss": 2.3393, + "mean_token_accuracy": 0.5100199071001991, + "step": 416 + }, + { + "epoch": 0.07730812013348165, + "grad_norm": 11.5234375, + "learning_rate": 9.922691879866519e-06, + "loss": 2.9385, + "mean_token_accuracy": 0.4253193580085162, + "step": 417 + }, + { + "epoch": 0.0774935113088617, + "grad_norm": 8.1171875, + "learning_rate": 9.92250648869114e-06, + "loss": 3.3027, + "mean_token_accuracy": 0.3856152512998267, + "step": 418 + }, + { + "epoch": 0.07767890248424175, + "grad_norm": 8.890625, + "learning_rate": 9.92232109751576e-06, + "loss": 3.2797, + "mean_token_accuracy": 0.38766840635999517, + "step": 419 + }, + { + "epoch": 0.0778642936596218, + "grad_norm": 7.9609375, + "learning_rate": 9.922135706340378e-06, + "loss": 3.0887, + "mean_token_accuracy": 0.4093029118870541, + "step": 420 + }, + { + "epoch": 0.07804968483500185, + "grad_norm": 9.3671875, + "learning_rate": 9.921950315164999e-06, + "loss": 3.1051, + "mean_token_accuracy": 0.39662897375720074, + "step": 421 + }, + { + "epoch": 0.0782350760103819, + "grad_norm": 6.80859375, + "learning_rate": 9.921764923989618e-06, + "loss": 3.2713, + "mean_token_accuracy": 0.39184581171237953, + "step": 422 + }, + { + "epoch": 0.07842046718576196, + "grad_norm": 8.7578125, + "learning_rate": 9.921579532814238e-06, + "loss": 2.8083, + "mean_token_accuracy": 0.43615977575332865, + "step": 423 + }, + { + "epoch": 0.078605858361142, + "grad_norm": 12.53125, + "learning_rate": 9.921394141638859e-06, + "loss": 2.6467, + "mean_token_accuracy": 0.4571752694271129, + "step": 424 + }, + { + "epoch": 0.07879124953652206, + "grad_norm": 8.2890625, + "learning_rate": 9.92120875046348e-06, + "loss": 3.017, + "mean_token_accuracy": 0.4204812974104107, + "step": 425 + }, + { + "epoch": 0.07897664071190211, + "grad_norm": 12.5234375, + "learning_rate": 9.9210233592881e-06, + "loss": 2.9766, + "mean_token_accuracy": 0.4030904489143466, + "step": 426 + }, + { + "epoch": 0.07916203188728217, + "grad_norm": 9.5625, + "learning_rate": 9.920837968112719e-06, + "loss": 3.3189, + "mean_token_accuracy": 0.39043691484618814, + "step": 427 + }, + { + "epoch": 0.07934742306266222, + "grad_norm": 8.703125, + "learning_rate": 9.920652576937339e-06, + "loss": 3.2277, + "mean_token_accuracy": 0.3963214915595868, + "step": 428 + }, + { + "epoch": 0.07953281423804227, + "grad_norm": 13.1171875, + "learning_rate": 9.920467185761958e-06, + "loss": 2.9014, + "mean_token_accuracy": 0.44818840579710145, + "step": 429 + }, + { + "epoch": 0.07971820541342232, + "grad_norm": 8.7265625, + "learning_rate": 9.920281794586578e-06, + "loss": 3.318, + "mean_token_accuracy": 0.3807124443402859, + "step": 430 + }, + { + "epoch": 0.07990359658880238, + "grad_norm": 11.0703125, + "learning_rate": 9.920096403411199e-06, + "loss": 3.514, + "mean_token_accuracy": 0.36650659037498723, + "step": 431 + }, + { + "epoch": 0.08008898776418243, + "grad_norm": 8.796875, + "learning_rate": 9.91991101223582e-06, + "loss": 3.5072, + "mean_token_accuracy": 0.3688764829030007, + "step": 432 + }, + { + "epoch": 0.08027437893956248, + "grad_norm": 7.54296875, + "learning_rate": 9.919725621060438e-06, + "loss": 3.6727, + "mean_token_accuracy": 0.35138888888888886, + "step": 433 + }, + { + "epoch": 0.08045977011494253, + "grad_norm": 7.65625, + "learning_rate": 9.919540229885059e-06, + "loss": 3.4004, + "mean_token_accuracy": 0.4000682128240109, + "step": 434 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 6.19921875, + "learning_rate": 9.919354838709679e-06, + "loss": 3.1433, + "mean_token_accuracy": 0.40173325499412454, + "step": 435 + }, + { + "epoch": 0.08083055246570263, + "grad_norm": 8.6484375, + "learning_rate": 9.919169447534298e-06, + "loss": 3.269, + "mean_token_accuracy": 0.39300750503386417, + "step": 436 + }, + { + "epoch": 0.08101594364108268, + "grad_norm": 7.29296875, + "learning_rate": 9.918984056358918e-06, + "loss": 3.4682, + "mean_token_accuracy": 0.3660212367270456, + "step": 437 + }, + { + "epoch": 0.08120133481646273, + "grad_norm": 10.25, + "learning_rate": 9.918798665183537e-06, + "loss": 3.6457, + "mean_token_accuracy": 0.3582458307597282, + "step": 438 + }, + { + "epoch": 0.08138672599184278, + "grad_norm": 9.8671875, + "learning_rate": 9.918613274008158e-06, + "loss": 2.6709, + "mean_token_accuracy": 0.450360162856248, + "step": 439 + }, + { + "epoch": 0.08157211716722285, + "grad_norm": 11.671875, + "learning_rate": 9.918427882832778e-06, + "loss": 3.1539, + "mean_token_accuracy": 0.39481687161179424, + "step": 440 + }, + { + "epoch": 0.0817575083426029, + "grad_norm": 11.8984375, + "learning_rate": 9.918242491657399e-06, + "loss": 3.2612, + "mean_token_accuracy": 0.39783751010509294, + "step": 441 + }, + { + "epoch": 0.08194289951798295, + "grad_norm": 9.015625, + "learning_rate": 9.918057100482017e-06, + "loss": 3.0612, + "mean_token_accuracy": 0.4160719006079831, + "step": 442 + }, + { + "epoch": 0.082128290693363, + "grad_norm": 9.84375, + "learning_rate": 9.917871709306638e-06, + "loss": 2.8218, + "mean_token_accuracy": 0.4304481097649978, + "step": 443 + }, + { + "epoch": 0.08231368186874305, + "grad_norm": 8.2578125, + "learning_rate": 9.917686318131258e-06, + "loss": 3.3211, + "mean_token_accuracy": 0.41575370961031755, + "step": 444 + }, + { + "epoch": 0.0824990730441231, + "grad_norm": 10.90625, + "learning_rate": 9.917500926955877e-06, + "loss": 2.7982, + "mean_token_accuracy": 0.433960281601345, + "step": 445 + }, + { + "epoch": 0.08268446421950315, + "grad_norm": 12.140625, + "learning_rate": 9.917315535780498e-06, + "loss": 3.3681, + "mean_token_accuracy": 0.38062105398788815, + "step": 446 + }, + { + "epoch": 0.0828698553948832, + "grad_norm": 9.21875, + "learning_rate": 9.917130144605116e-06, + "loss": 2.8688, + "mean_token_accuracy": 0.4178797033781928, + "step": 447 + }, + { + "epoch": 0.08305524657026325, + "grad_norm": 8.5859375, + "learning_rate": 9.916944753429739e-06, + "loss": 2.6755, + "mean_token_accuracy": 0.45230017641471026, + "step": 448 + }, + { + "epoch": 0.0832406377456433, + "grad_norm": 8.296875, + "learning_rate": 9.916759362254357e-06, + "loss": 3.1206, + "mean_token_accuracy": 0.4315057671038079, + "step": 449 + }, + { + "epoch": 0.08342602892102335, + "grad_norm": 8.1328125, + "learning_rate": 9.916573971078978e-06, + "loss": 3.3526, + "mean_token_accuracy": 0.3785459823195672, + "step": 450 + }, + { + "epoch": 0.0836114200964034, + "grad_norm": 8.6015625, + "learning_rate": 9.916388579903597e-06, + "loss": 3.3528, + "mean_token_accuracy": 0.38765657620041755, + "step": 451 + }, + { + "epoch": 0.08379681127178347, + "grad_norm": 8.3984375, + "learning_rate": 9.916203188728217e-06, + "loss": 2.3142, + "mean_token_accuracy": 0.5174269005847953, + "step": 452 + }, + { + "epoch": 0.08398220244716352, + "grad_norm": 16.234375, + "learning_rate": 9.916017797552838e-06, + "loss": 2.3648, + "mean_token_accuracy": 0.4698336085355011, + "step": 453 + }, + { + "epoch": 0.08416759362254357, + "grad_norm": 8.7578125, + "learning_rate": 9.915832406377457e-06, + "loss": 3.0772, + "mean_token_accuracy": 0.4197109067017083, + "step": 454 + }, + { + "epoch": 0.08435298479792362, + "grad_norm": 8.21875, + "learning_rate": 9.915647015202077e-06, + "loss": 3.1772, + "mean_token_accuracy": 0.4105668684645019, + "step": 455 + }, + { + "epoch": 0.08453837597330367, + "grad_norm": 8.03125, + "learning_rate": 9.915461624026698e-06, + "loss": 3.3031, + "mean_token_accuracy": 0.39910955636826206, + "step": 456 + }, + { + "epoch": 0.08472376714868372, + "grad_norm": 8.4921875, + "learning_rate": 9.915276232851318e-06, + "loss": 3.0853, + "mean_token_accuracy": 0.4126109169131487, + "step": 457 + }, + { + "epoch": 0.08490915832406377, + "grad_norm": 6.7890625, + "learning_rate": 9.915090841675937e-06, + "loss": 3.3507, + "mean_token_accuracy": 0.3846563665423548, + "step": 458 + }, + { + "epoch": 0.08509454949944383, + "grad_norm": 9.0546875, + "learning_rate": 9.914905450500557e-06, + "loss": 3.0794, + "mean_token_accuracy": 0.44009632751354605, + "step": 459 + }, + { + "epoch": 0.08527994067482388, + "grad_norm": 6.64453125, + "learning_rate": 9.914720059325176e-06, + "loss": 3.2129, + "mean_token_accuracy": 0.40235094179294717, + "step": 460 + }, + { + "epoch": 0.08546533185020393, + "grad_norm": 6.09375, + "learning_rate": 9.914534668149797e-06, + "loss": 3.1745, + "mean_token_accuracy": 0.4080825038973498, + "step": 461 + }, + { + "epoch": 0.08565072302558398, + "grad_norm": 6.06640625, + "learning_rate": 9.914349276974417e-06, + "loss": 3.2488, + "mean_token_accuracy": 0.40218763146823727, + "step": 462 + }, + { + "epoch": 0.08583611420096403, + "grad_norm": 8.1015625, + "learning_rate": 9.914163885799036e-06, + "loss": 3.1788, + "mean_token_accuracy": 0.39791580968051554, + "step": 463 + }, + { + "epoch": 0.08602150537634409, + "grad_norm": 7.64453125, + "learning_rate": 9.913978494623658e-06, + "loss": 3.1277, + "mean_token_accuracy": 0.3926210607225211, + "step": 464 + }, + { + "epoch": 0.08620689655172414, + "grad_norm": 7.4765625, + "learning_rate": 9.913793103448277e-06, + "loss": 3.1341, + "mean_token_accuracy": 0.39659232580689846, + "step": 465 + }, + { + "epoch": 0.0863922877271042, + "grad_norm": 7.7734375, + "learning_rate": 9.913607712272897e-06, + "loss": 3.2317, + "mean_token_accuracy": 0.3909190480931992, + "step": 466 + }, + { + "epoch": 0.08657767890248425, + "grad_norm": 6.375, + "learning_rate": 9.913422321097516e-06, + "loss": 3.2833, + "mean_token_accuracy": 0.3947193843704392, + "step": 467 + }, + { + "epoch": 0.0867630700778643, + "grad_norm": 7.23046875, + "learning_rate": 9.913236929922137e-06, + "loss": 2.9127, + "mean_token_accuracy": 0.42207425798442616, + "step": 468 + }, + { + "epoch": 0.08694846125324435, + "grad_norm": 7.61328125, + "learning_rate": 9.913051538746757e-06, + "loss": 3.0064, + "mean_token_accuracy": 0.3991683991683992, + "step": 469 + }, + { + "epoch": 0.0871338524286244, + "grad_norm": 9.0234375, + "learning_rate": 9.912866147571376e-06, + "loss": 2.9705, + "mean_token_accuracy": 0.42190547636909226, + "step": 470 + }, + { + "epoch": 0.08731924360400445, + "grad_norm": 6.9921875, + "learning_rate": 9.912680756395996e-06, + "loss": 2.6942, + "mean_token_accuracy": 0.4473190348525469, + "step": 471 + }, + { + "epoch": 0.0875046347793845, + "grad_norm": 8.6640625, + "learning_rate": 9.912495365220617e-06, + "loss": 2.7824, + "mean_token_accuracy": 0.4219273223365993, + "step": 472 + }, + { + "epoch": 0.08769002595476455, + "grad_norm": 9.8515625, + "learning_rate": 9.912309974045237e-06, + "loss": 2.9691, + "mean_token_accuracy": 0.4296920395119117, + "step": 473 + }, + { + "epoch": 0.0878754171301446, + "grad_norm": 11.4921875, + "learning_rate": 9.912124582869856e-06, + "loss": 3.1641, + "mean_token_accuracy": 0.3890962671905697, + "step": 474 + }, + { + "epoch": 0.08806080830552465, + "grad_norm": 7.84765625, + "learning_rate": 9.911939191694477e-06, + "loss": 3.0651, + "mean_token_accuracy": 0.40698455339153794, + "step": 475 + }, + { + "epoch": 0.0882461994809047, + "grad_norm": 6.94140625, + "learning_rate": 9.911753800519095e-06, + "loss": 2.7195, + "mean_token_accuracy": 0.45902570657577046, + "step": 476 + }, + { + "epoch": 0.08843159065628477, + "grad_norm": 9.984375, + "learning_rate": 9.911568409343716e-06, + "loss": 2.4893, + "mean_token_accuracy": 0.4737275064267352, + "step": 477 + }, + { + "epoch": 0.08861698183166482, + "grad_norm": 12.2734375, + "learning_rate": 9.911383018168336e-06, + "loss": 3.2334, + "mean_token_accuracy": 0.4050864361702128, + "step": 478 + }, + { + "epoch": 0.08880237300704487, + "grad_norm": 9.875, + "learning_rate": 9.911197626992955e-06, + "loss": 3.0104, + "mean_token_accuracy": 0.42554298348289765, + "step": 479 + }, + { + "epoch": 0.08898776418242492, + "grad_norm": 10.0, + "learning_rate": 9.911012235817576e-06, + "loss": 3.0575, + "mean_token_accuracy": 0.40624231998033916, + "step": 480 + }, + { + "epoch": 0.08917315535780497, + "grad_norm": 10.25, + "learning_rate": 9.910826844642196e-06, + "loss": 2.7407, + "mean_token_accuracy": 0.4360285006195787, + "step": 481 + }, + { + "epoch": 0.08935854653318502, + "grad_norm": 6.3125, + "learning_rate": 9.910641453466817e-06, + "loss": 3.2688, + "mean_token_accuracy": 0.39638615112458936, + "step": 482 + }, + { + "epoch": 0.08954393770856507, + "grad_norm": 9.703125, + "learning_rate": 9.910456062291436e-06, + "loss": 2.6631, + "mean_token_accuracy": 0.4357098701833917, + "step": 483 + }, + { + "epoch": 0.08972932888394512, + "grad_norm": 10.0859375, + "learning_rate": 9.910270671116056e-06, + "loss": 3.1845, + "mean_token_accuracy": 0.3989728341667793, + "step": 484 + }, + { + "epoch": 0.08991472005932517, + "grad_norm": 11.1015625, + "learning_rate": 9.910085279940675e-06, + "loss": 2.7018, + "mean_token_accuracy": 0.4283249460819554, + "step": 485 + }, + { + "epoch": 0.09010011123470522, + "grad_norm": 5.74609375, + "learning_rate": 9.909899888765295e-06, + "loss": 3.1636, + "mean_token_accuracy": 0.40561257632843967, + "step": 486 + }, + { + "epoch": 0.09028550241008527, + "grad_norm": 10.40625, + "learning_rate": 9.909714497589916e-06, + "loss": 3.0088, + "mean_token_accuracy": 0.4089074098189868, + "step": 487 + }, + { + "epoch": 0.09047089358546533, + "grad_norm": 12.03125, + "learning_rate": 9.909529106414536e-06, + "loss": 3.1917, + "mean_token_accuracy": 0.39633614422797325, + "step": 488 + }, + { + "epoch": 0.09065628476084539, + "grad_norm": 8.4296875, + "learning_rate": 9.909343715239155e-06, + "loss": 3.1894, + "mean_token_accuracy": 0.394391623540739, + "step": 489 + }, + { + "epoch": 0.09084167593622544, + "grad_norm": 8.765625, + "learning_rate": 9.909158324063776e-06, + "loss": 3.2605, + "mean_token_accuracy": 0.4022971360381862, + "step": 490 + }, + { + "epoch": 0.09102706711160549, + "grad_norm": 10.6015625, + "learning_rate": 9.908972932888396e-06, + "loss": 2.7009, + "mean_token_accuracy": 0.442383273070272, + "step": 491 + }, + { + "epoch": 0.09121245828698554, + "grad_norm": 11.5078125, + "learning_rate": 9.908787541713015e-06, + "loss": 3.1025, + "mean_token_accuracy": 0.4097849102864148, + "step": 492 + }, + { + "epoch": 0.0913978494623656, + "grad_norm": 7.56640625, + "learning_rate": 9.908602150537635e-06, + "loss": 3.0982, + "mean_token_accuracy": 0.41829085457271364, + "step": 493 + }, + { + "epoch": 0.09158324063774564, + "grad_norm": 11.234375, + "learning_rate": 9.908416759362254e-06, + "loss": 2.9722, + "mean_token_accuracy": 0.38609790569189256, + "step": 494 + }, + { + "epoch": 0.0917686318131257, + "grad_norm": 7.73046875, + "learning_rate": 9.908231368186875e-06, + "loss": 3.3339, + "mean_token_accuracy": 0.38137963178746215, + "step": 495 + }, + { + "epoch": 0.09195402298850575, + "grad_norm": 5.5625, + "learning_rate": 9.908045977011495e-06, + "loss": 3.4163, + "mean_token_accuracy": 0.3942775733111718, + "step": 496 + }, + { + "epoch": 0.0921394141638858, + "grad_norm": 12.203125, + "learning_rate": 9.907860585836116e-06, + "loss": 3.5249, + "mean_token_accuracy": 0.3502646694447573, + "step": 497 + }, + { + "epoch": 0.09232480533926585, + "grad_norm": 8.84375, + "learning_rate": 9.907675194660734e-06, + "loss": 3.2102, + "mean_token_accuracy": 0.39647936581953835, + "step": 498 + }, + { + "epoch": 0.0925101965146459, + "grad_norm": 9.0078125, + "learning_rate": 9.907489803485355e-06, + "loss": 3.1365, + "mean_token_accuracy": 0.3905526397036122, + "step": 499 + }, + { + "epoch": 0.09269558769002595, + "grad_norm": 9.2109375, + "learning_rate": 9.907304412309975e-06, + "loss": 3.1293, + "mean_token_accuracy": 0.4135810217145726, + "step": 500 + }, + { + "epoch": 0.092880978865406, + "grad_norm": 16.0, + "learning_rate": 9.907119021134594e-06, + "loss": 2.68, + "mean_token_accuracy": 0.42712124114154376, + "step": 501 + }, + { + "epoch": 0.09306637004078606, + "grad_norm": 7.4765625, + "learning_rate": 9.906933629959215e-06, + "loss": 3.1706, + "mean_token_accuracy": 0.39808671501311527, + "step": 502 + }, + { + "epoch": 0.09325176121616612, + "grad_norm": 7.7578125, + "learning_rate": 9.906748238783834e-06, + "loss": 3.3471, + "mean_token_accuracy": 0.37984034314309545, + "step": 503 + }, + { + "epoch": 0.09343715239154617, + "grad_norm": 9.3125, + "learning_rate": 9.906562847608454e-06, + "loss": 3.1901, + "mean_token_accuracy": 0.39259415106248685, + "step": 504 + }, + { + "epoch": 0.09362254356692622, + "grad_norm": 6.3828125, + "learning_rate": 9.906377456433074e-06, + "loss": 2.8863, + "mean_token_accuracy": 0.4313641704946053, + "step": 505 + }, + { + "epoch": 0.09380793474230627, + "grad_norm": 9.4765625, + "learning_rate": 9.906192065257695e-06, + "loss": 2.8253, + "mean_token_accuracy": 0.4282671344443052, + "step": 506 + }, + { + "epoch": 0.09399332591768632, + "grad_norm": 7.8671875, + "learning_rate": 9.906006674082315e-06, + "loss": 3.6233, + "mean_token_accuracy": 0.3689356207341819, + "step": 507 + }, + { + "epoch": 0.09417871709306637, + "grad_norm": 8.4765625, + "learning_rate": 9.905821282906934e-06, + "loss": 3.4721, + "mean_token_accuracy": 0.37449329591518554, + "step": 508 + }, + { + "epoch": 0.09436410826844642, + "grad_norm": 6.2890625, + "learning_rate": 9.905635891731555e-06, + "loss": 3.756, + "mean_token_accuracy": 0.3437748871781258, + "step": 509 + }, + { + "epoch": 0.09454949944382647, + "grad_norm": 11.421875, + "learning_rate": 9.905450500556174e-06, + "loss": 3.1838, + "mean_token_accuracy": 0.41393114491593275, + "step": 510 + }, + { + "epoch": 0.09473489061920652, + "grad_norm": 10.515625, + "learning_rate": 9.905265109380794e-06, + "loss": 2.8957, + "mean_token_accuracy": 0.4046385949110561, + "step": 511 + }, + { + "epoch": 0.09492028179458657, + "grad_norm": 5.91796875, + "learning_rate": 9.905079718205415e-06, + "loss": 2.7827, + "mean_token_accuracy": 0.43753187149413564, + "step": 512 + }, + { + "epoch": 0.09510567296996662, + "grad_norm": 7.1171875, + "learning_rate": 9.904894327030035e-06, + "loss": 2.9564, + "mean_token_accuracy": 0.44311168044718763, + "step": 513 + }, + { + "epoch": 0.09529106414534669, + "grad_norm": 10.2421875, + "learning_rate": 9.904708935854654e-06, + "loss": 3.4504, + "mean_token_accuracy": 0.35724786827403704, + "step": 514 + }, + { + "epoch": 0.09547645532072674, + "grad_norm": 9.0234375, + "learning_rate": 9.904523544679274e-06, + "loss": 3.5049, + "mean_token_accuracy": 0.3726017781937295, + "step": 515 + }, + { + "epoch": 0.09566184649610679, + "grad_norm": 9.2890625, + "learning_rate": 9.904338153503895e-06, + "loss": 3.1349, + "mean_token_accuracy": 0.3977930733092975, + "step": 516 + }, + { + "epoch": 0.09584723767148684, + "grad_norm": 9.9453125, + "learning_rate": 9.904152762328514e-06, + "loss": 3.0061, + "mean_token_accuracy": 0.4089308176100629, + "step": 517 + }, + { + "epoch": 0.09603262884686689, + "grad_norm": 7.37890625, + "learning_rate": 9.903967371153134e-06, + "loss": 3.3977, + "mean_token_accuracy": 0.3812831077104179, + "step": 518 + }, + { + "epoch": 0.09621802002224694, + "grad_norm": 6.78515625, + "learning_rate": 9.903781979977753e-06, + "loss": 3.0701, + "mean_token_accuracy": 0.41572499107886285, + "step": 519 + }, + { + "epoch": 0.09640341119762699, + "grad_norm": 13.4375, + "learning_rate": 9.903596588802373e-06, + "loss": 2.8534, + "mean_token_accuracy": 0.41735941320293396, + "step": 520 + }, + { + "epoch": 0.09658880237300704, + "grad_norm": 9.203125, + "learning_rate": 9.903411197626994e-06, + "loss": 3.3144, + "mean_token_accuracy": 0.3787737317149082, + "step": 521 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 9.21875, + "learning_rate": 9.903225806451614e-06, + "loss": 2.9402, + "mean_token_accuracy": 0.42441036216007216, + "step": 522 + }, + { + "epoch": 0.09695958472376714, + "grad_norm": 7.1640625, + "learning_rate": 9.903040415276233e-06, + "loss": 2.9591, + "mean_token_accuracy": 0.4172654884443581, + "step": 523 + }, + { + "epoch": 0.0971449758991472, + "grad_norm": 10.4609375, + "learning_rate": 9.902855024100854e-06, + "loss": 2.715, + "mean_token_accuracy": 0.44952380952380955, + "step": 524 + }, + { + "epoch": 0.09733036707452725, + "grad_norm": 8.4921875, + "learning_rate": 9.902669632925474e-06, + "loss": 3.3993, + "mean_token_accuracy": 0.3869340061080866, + "step": 525 + }, + { + "epoch": 0.09751575824990731, + "grad_norm": 7.46484375, + "learning_rate": 9.902484241750093e-06, + "loss": 3.2045, + "mean_token_accuracy": 0.3825547206795165, + "step": 526 + }, + { + "epoch": 0.09770114942528736, + "grad_norm": 11.1640625, + "learning_rate": 9.902298850574713e-06, + "loss": 2.7646, + "mean_token_accuracy": 0.44712335757111876, + "step": 527 + }, + { + "epoch": 0.09788654060066741, + "grad_norm": 6.7421875, + "learning_rate": 9.902113459399332e-06, + "loss": 3.063, + "mean_token_accuracy": 0.40072365445499775, + "step": 528 + }, + { + "epoch": 0.09807193177604746, + "grad_norm": 9.7890625, + "learning_rate": 9.901928068223954e-06, + "loss": 2.7241, + "mean_token_accuracy": 0.44005805515239477, + "step": 529 + }, + { + "epoch": 0.09825732295142751, + "grad_norm": 5.5703125, + "learning_rate": 9.901742677048573e-06, + "loss": 2.8571, + "mean_token_accuracy": 0.43361597080605346, + "step": 530 + }, + { + "epoch": 0.09844271412680757, + "grad_norm": 6.72265625, + "learning_rate": 9.901557285873194e-06, + "loss": 3.2402, + "mean_token_accuracy": 0.4043173089656999, + "step": 531 + }, + { + "epoch": 0.09862810530218762, + "grad_norm": 9.7109375, + "learning_rate": 9.901371894697813e-06, + "loss": 2.8658, + "mean_token_accuracy": 0.44091611634115496, + "step": 532 + }, + { + "epoch": 0.09881349647756767, + "grad_norm": 21.25, + "learning_rate": 9.901186503522433e-06, + "loss": 3.1902, + "mean_token_accuracy": 0.3711459403905447, + "step": 533 + }, + { + "epoch": 0.09899888765294772, + "grad_norm": 7.953125, + "learning_rate": 9.901001112347053e-06, + "loss": 2.6368, + "mean_token_accuracy": 0.4520154748533633, + "step": 534 + }, + { + "epoch": 0.09918427882832777, + "grad_norm": 12.546875, + "learning_rate": 9.900815721171672e-06, + "loss": 2.7065, + "mean_token_accuracy": 0.4474727452923687, + "step": 535 + }, + { + "epoch": 0.09936967000370782, + "grad_norm": 5.83203125, + "learning_rate": 9.900630329996293e-06, + "loss": 3.2118, + "mean_token_accuracy": 0.43668559973270965, + "step": 536 + }, + { + "epoch": 0.09955506117908787, + "grad_norm": 7.09375, + "learning_rate": 9.900444938820913e-06, + "loss": 2.9621, + "mean_token_accuracy": 0.41491299069202753, + "step": 537 + }, + { + "epoch": 0.09974045235446792, + "grad_norm": 8.4296875, + "learning_rate": 9.900259547645534e-06, + "loss": 2.8767, + "mean_token_accuracy": 0.4382619568615192, + "step": 538 + }, + { + "epoch": 0.09992584352984799, + "grad_norm": 11.125, + "learning_rate": 9.900074156470153e-06, + "loss": 3.0035, + "mean_token_accuracy": 0.4055264167839325, + "step": 539 + }, + { + "epoch": 0.10011123470522804, + "grad_norm": 8.6796875, + "learning_rate": 9.899888765294773e-06, + "loss": 2.8824, + "mean_token_accuracy": 0.4249751573368665, + "step": 540 + }, + { + "epoch": 0.10029662588060809, + "grad_norm": 8.390625, + "learning_rate": 9.899703374119392e-06, + "loss": 2.7838, + "mean_token_accuracy": 0.4313384113166485, + "step": 541 + }, + { + "epoch": 0.10048201705598814, + "grad_norm": 11.5078125, + "learning_rate": 9.899517982944012e-06, + "loss": 2.934, + "mean_token_accuracy": 0.414598961338719, + "step": 542 + }, + { + "epoch": 0.10066740823136819, + "grad_norm": 9.9921875, + "learning_rate": 9.899332591768633e-06, + "loss": 2.5087, + "mean_token_accuracy": 0.4635171902052931, + "step": 543 + }, + { + "epoch": 0.10085279940674824, + "grad_norm": 7.94921875, + "learning_rate": 9.899147200593252e-06, + "loss": 2.9936, + "mean_token_accuracy": 0.40667330677290836, + "step": 544 + }, + { + "epoch": 0.10103819058212829, + "grad_norm": 7.38671875, + "learning_rate": 9.898961809417874e-06, + "loss": 2.9504, + "mean_token_accuracy": 0.42535932830510886, + "step": 545 + }, + { + "epoch": 0.10122358175750834, + "grad_norm": 6.96875, + "learning_rate": 9.898776418242493e-06, + "loss": 3.1097, + "mean_token_accuracy": 0.409689557855127, + "step": 546 + }, + { + "epoch": 0.10140897293288839, + "grad_norm": 7.33984375, + "learning_rate": 9.898591027067113e-06, + "loss": 3.0158, + "mean_token_accuracy": 0.4100429645542428, + "step": 547 + }, + { + "epoch": 0.10159436410826844, + "grad_norm": 7.33203125, + "learning_rate": 9.898405635891732e-06, + "loss": 3.0057, + "mean_token_accuracy": 0.42205900975053817, + "step": 548 + }, + { + "epoch": 0.1017797552836485, + "grad_norm": 15.953125, + "learning_rate": 9.898220244716352e-06, + "loss": 2.541, + "mean_token_accuracy": 0.45236523652365235, + "step": 549 + }, + { + "epoch": 0.10196514645902854, + "grad_norm": 12.8828125, + "learning_rate": 9.898034853540973e-06, + "loss": 2.7674, + "mean_token_accuracy": 0.4315550265122822, + "step": 550 + }, + { + "epoch": 0.10215053763440861, + "grad_norm": 10.515625, + "learning_rate": 9.897849462365592e-06, + "loss": 3.2461, + "mean_token_accuracy": 0.39080459770114945, + "step": 551 + }, + { + "epoch": 0.10233592880978866, + "grad_norm": 7.88671875, + "learning_rate": 9.897664071190212e-06, + "loss": 3.2235, + "mean_token_accuracy": 0.4100203276336243, + "step": 552 + }, + { + "epoch": 0.10252131998516871, + "grad_norm": 12.7109375, + "learning_rate": 9.897478680014833e-06, + "loss": 2.5025, + "mean_token_accuracy": 0.4781881199952193, + "step": 553 + }, + { + "epoch": 0.10270671116054876, + "grad_norm": 16.125, + "learning_rate": 9.897293288839453e-06, + "loss": 2.6751, + "mean_token_accuracy": 0.4566615226337449, + "step": 554 + }, + { + "epoch": 0.10289210233592881, + "grad_norm": 13.90625, + "learning_rate": 9.897107897664072e-06, + "loss": 3.0205, + "mean_token_accuracy": 0.402683780630105, + "step": 555 + }, + { + "epoch": 0.10307749351130886, + "grad_norm": 8.1015625, + "learning_rate": 9.896922506488692e-06, + "loss": 3.1493, + "mean_token_accuracy": 0.4079531051964512, + "step": 556 + }, + { + "epoch": 0.10326288468668891, + "grad_norm": 8.625, + "learning_rate": 9.896737115313311e-06, + "loss": 2.9197, + "mean_token_accuracy": 0.42426735218509, + "step": 557 + }, + { + "epoch": 0.10344827586206896, + "grad_norm": 8.3203125, + "learning_rate": 9.896551724137932e-06, + "loss": 3.653, + "mean_token_accuracy": 0.3520766061685781, + "step": 558 + }, + { + "epoch": 0.10363366703744901, + "grad_norm": 8.3984375, + "learning_rate": 9.896366332962552e-06, + "loss": 3.2426, + "mean_token_accuracy": 0.3994003471674294, + "step": 559 + }, + { + "epoch": 0.10381905821282907, + "grad_norm": 7.9375, + "learning_rate": 9.896180941787171e-06, + "loss": 3.0199, + "mean_token_accuracy": 0.40754315441002603, + "step": 560 + }, + { + "epoch": 0.10400444938820912, + "grad_norm": 10.2578125, + "learning_rate": 9.895995550611792e-06, + "loss": 3.2425, + "mean_token_accuracy": 0.38952962460425145, + "step": 561 + }, + { + "epoch": 0.10418984056358917, + "grad_norm": 10.0546875, + "learning_rate": 9.895810159436412e-06, + "loss": 2.8934, + "mean_token_accuracy": 0.4219742755307609, + "step": 562 + }, + { + "epoch": 0.10437523173896923, + "grad_norm": 13.40625, + "learning_rate": 9.895624768261032e-06, + "loss": 2.7176, + "mean_token_accuracy": 0.4490403213807469, + "step": 563 + }, + { + "epoch": 0.10456062291434928, + "grad_norm": 5.7109375, + "learning_rate": 9.895439377085651e-06, + "loss": 3.0299, + "mean_token_accuracy": 0.42014849323045567, + "step": 564 + }, + { + "epoch": 0.10474601408972933, + "grad_norm": 7.70703125, + "learning_rate": 9.895253985910272e-06, + "loss": 3.0085, + "mean_token_accuracy": 0.4144959128065395, + "step": 565 + }, + { + "epoch": 0.10493140526510938, + "grad_norm": 11.046875, + "learning_rate": 9.89506859473489e-06, + "loss": 2.7066, + "mean_token_accuracy": 0.4501766784452297, + "step": 566 + }, + { + "epoch": 0.10511679644048944, + "grad_norm": 7.453125, + "learning_rate": 9.894883203559511e-06, + "loss": 3.4936, + "mean_token_accuracy": 0.3604332669322709, + "step": 567 + }, + { + "epoch": 0.10530218761586949, + "grad_norm": 7.2109375, + "learning_rate": 9.894697812384132e-06, + "loss": 3.2169, + "mean_token_accuracy": 0.39522168073220015, + "step": 568 + }, + { + "epoch": 0.10548757879124954, + "grad_norm": 6.40234375, + "learning_rate": 9.894512421208752e-06, + "loss": 2.944, + "mean_token_accuracy": 0.4186946902654867, + "step": 569 + }, + { + "epoch": 0.10567296996662959, + "grad_norm": 7.5703125, + "learning_rate": 9.894327030033371e-06, + "loss": 3.4881, + "mean_token_accuracy": 0.3633245382585752, + "step": 570 + }, + { + "epoch": 0.10585836114200964, + "grad_norm": 5.36328125, + "learning_rate": 9.894141638857991e-06, + "loss": 3.308, + "mean_token_accuracy": 0.39363320295523685, + "step": 571 + }, + { + "epoch": 0.10604375231738969, + "grad_norm": 6.37890625, + "learning_rate": 9.893956247682612e-06, + "loss": 3.7202, + "mean_token_accuracy": 0.34663497623217415, + "step": 572 + }, + { + "epoch": 0.10622914349276974, + "grad_norm": 11.0390625, + "learning_rate": 9.89377085650723e-06, + "loss": 2.4134, + "mean_token_accuracy": 0.46348547717842326, + "step": 573 + }, + { + "epoch": 0.10641453466814979, + "grad_norm": 8.2734375, + "learning_rate": 9.893585465331851e-06, + "loss": 2.4189, + "mean_token_accuracy": 0.4828383641674781, + "step": 574 + }, + { + "epoch": 0.10659992584352984, + "grad_norm": 6.29296875, + "learning_rate": 9.89340007415647e-06, + "loss": 3.4033, + "mean_token_accuracy": 0.36513242478786323, + "step": 575 + }, + { + "epoch": 0.1067853170189099, + "grad_norm": 15.03125, + "learning_rate": 9.89321468298109e-06, + "loss": 2.7942, + "mean_token_accuracy": 0.4102470041574957, + "step": 576 + }, + { + "epoch": 0.10697070819428996, + "grad_norm": 8.2109375, + "learning_rate": 9.893029291805711e-06, + "loss": 3.0124, + "mean_token_accuracy": 0.40227895571902494, + "step": 577 + }, + { + "epoch": 0.10715609936967001, + "grad_norm": 7.24609375, + "learning_rate": 9.892843900630331e-06, + "loss": 2.9258, + "mean_token_accuracy": 0.42379448909299655, + "step": 578 + }, + { + "epoch": 0.10734149054505006, + "grad_norm": 8.28125, + "learning_rate": 9.89265850945495e-06, + "loss": 2.8692, + "mean_token_accuracy": 0.4168611435239207, + "step": 579 + }, + { + "epoch": 0.10752688172043011, + "grad_norm": 7.74609375, + "learning_rate": 9.89247311827957e-06, + "loss": 3.1486, + "mean_token_accuracy": 0.3992425981179711, + "step": 580 + }, + { + "epoch": 0.10771227289581016, + "grad_norm": 12.3203125, + "learning_rate": 9.892287727104191e-06, + "loss": 3.4727, + "mean_token_accuracy": 0.3803249328386849, + "step": 581 + }, + { + "epoch": 0.10789766407119021, + "grad_norm": 8.46875, + "learning_rate": 9.89210233592881e-06, + "loss": 2.8421, + "mean_token_accuracy": 0.42551020408163265, + "step": 582 + }, + { + "epoch": 0.10808305524657026, + "grad_norm": 7.3203125, + "learning_rate": 9.89191694475343e-06, + "loss": 3.3514, + "mean_token_accuracy": 0.3931807884005437, + "step": 583 + }, + { + "epoch": 0.10826844642195031, + "grad_norm": 7.66796875, + "learning_rate": 9.89173155357805e-06, + "loss": 3.1343, + "mean_token_accuracy": 0.40583554376657827, + "step": 584 + }, + { + "epoch": 0.10845383759733036, + "grad_norm": 6.328125, + "learning_rate": 9.891546162402671e-06, + "loss": 3.0979, + "mean_token_accuracy": 0.3949170397682381, + "step": 585 + }, + { + "epoch": 0.10863922877271041, + "grad_norm": 7.5234375, + "learning_rate": 9.89136077122729e-06, + "loss": 2.7459, + "mean_token_accuracy": 0.4419388585377194, + "step": 586 + }, + { + "epoch": 0.10882461994809046, + "grad_norm": 6.80859375, + "learning_rate": 9.89117538005191e-06, + "loss": 2.948, + "mean_token_accuracy": 0.4154088463052894, + "step": 587 + }, + { + "epoch": 0.10901001112347053, + "grad_norm": 7.26171875, + "learning_rate": 9.890989988876531e-06, + "loss": 3.3296, + "mean_token_accuracy": 0.38610216290842153, + "step": 588 + }, + { + "epoch": 0.10919540229885058, + "grad_norm": 8.1328125, + "learning_rate": 9.89080459770115e-06, + "loss": 2.9242, + "mean_token_accuracy": 0.4286278641032394, + "step": 589 + }, + { + "epoch": 0.10938079347423063, + "grad_norm": 9.171875, + "learning_rate": 9.89061920652577e-06, + "loss": 2.9386, + "mean_token_accuracy": 0.3997347731363077, + "step": 590 + }, + { + "epoch": 0.10956618464961068, + "grad_norm": 7.13671875, + "learning_rate": 9.89043381535039e-06, + "loss": 2.7333, + "mean_token_accuracy": 0.4522235576923077, + "step": 591 + }, + { + "epoch": 0.10975157582499073, + "grad_norm": 9.3515625, + "learning_rate": 9.89024842417501e-06, + "loss": 2.7505, + "mean_token_accuracy": 0.43312723722746505, + "step": 592 + }, + { + "epoch": 0.10993696700037078, + "grad_norm": 8.1328125, + "learning_rate": 9.89006303299963e-06, + "loss": 3.2415, + "mean_token_accuracy": 0.39282921925611525, + "step": 593 + }, + { + "epoch": 0.11012235817575083, + "grad_norm": 7.2734375, + "learning_rate": 9.88987764182425e-06, + "loss": 2.6174, + "mean_token_accuracy": 0.47191466378611935, + "step": 594 + }, + { + "epoch": 0.11030774935113088, + "grad_norm": 13.7734375, + "learning_rate": 9.88969225064887e-06, + "loss": 2.5852, + "mean_token_accuracy": 0.4635452423451874, + "step": 595 + }, + { + "epoch": 0.11049314052651094, + "grad_norm": 6.69921875, + "learning_rate": 9.88950685947349e-06, + "loss": 3.0751, + "mean_token_accuracy": 0.4011641177813285, + "step": 596 + }, + { + "epoch": 0.11067853170189099, + "grad_norm": 7.35546875, + "learning_rate": 9.88932146829811e-06, + "loss": 2.8879, + "mean_token_accuracy": 0.43366461587001104, + "step": 597 + }, + { + "epoch": 0.11086392287727104, + "grad_norm": 7.484375, + "learning_rate": 9.88913607712273e-06, + "loss": 3.0651, + "mean_token_accuracy": 0.4127983599355689, + "step": 598 + }, + { + "epoch": 0.11104931405265109, + "grad_norm": 7.33203125, + "learning_rate": 9.88895068594735e-06, + "loss": 2.7786, + "mean_token_accuracy": 0.4375, + "step": 599 + }, + { + "epoch": 0.11123470522803114, + "grad_norm": 6.90234375, + "learning_rate": 9.888765294771969e-06, + "loss": 2.6548, + "mean_token_accuracy": 0.44922443044110516, + "step": 600 + }, + { + "epoch": 0.1114200964034112, + "grad_norm": 5.953125, + "learning_rate": 9.88857990359659e-06, + "loss": 3.0822, + "mean_token_accuracy": 0.39007669068092027, + "step": 601 + }, + { + "epoch": 0.11160548757879125, + "grad_norm": 5.98828125, + "learning_rate": 9.88839451242121e-06, + "loss": 3.364, + "mean_token_accuracy": 0.380719794344473, + "step": 602 + }, + { + "epoch": 0.1117908787541713, + "grad_norm": 10.3984375, + "learning_rate": 9.88820912124583e-06, + "loss": 3.4323, + "mean_token_accuracy": 0.3897013641391176, + "step": 603 + }, + { + "epoch": 0.11197626992955136, + "grad_norm": 7.0078125, + "learning_rate": 9.888023730070449e-06, + "loss": 2.8606, + "mean_token_accuracy": 0.4246031746031746, + "step": 604 + }, + { + "epoch": 0.1121616611049314, + "grad_norm": 6.8125, + "learning_rate": 9.88783833889507e-06, + "loss": 3.077, + "mean_token_accuracy": 0.3956997359486986, + "step": 605 + }, + { + "epoch": 0.11234705228031146, + "grad_norm": 10.3671875, + "learning_rate": 9.88765294771969e-06, + "loss": 2.9796, + "mean_token_accuracy": 0.41631701631701634, + "step": 606 + }, + { + "epoch": 0.11253244345569151, + "grad_norm": 9.625, + "learning_rate": 9.887467556544309e-06, + "loss": 3.1043, + "mean_token_accuracy": 0.41048087300635566, + "step": 607 + }, + { + "epoch": 0.11271783463107156, + "grad_norm": 9.3125, + "learning_rate": 9.88728216536893e-06, + "loss": 3.0037, + "mean_token_accuracy": 0.4041991601679664, + "step": 608 + }, + { + "epoch": 0.11290322580645161, + "grad_norm": 6.3984375, + "learning_rate": 9.88709677419355e-06, + "loss": 3.1005, + "mean_token_accuracy": 0.39710365853658536, + "step": 609 + }, + { + "epoch": 0.11308861698183166, + "grad_norm": 7.8828125, + "learning_rate": 9.88691138301817e-06, + "loss": 3.1799, + "mean_token_accuracy": 0.3976857490864799, + "step": 610 + }, + { + "epoch": 0.11327400815721171, + "grad_norm": 7.828125, + "learning_rate": 9.886725991842789e-06, + "loss": 2.9804, + "mean_token_accuracy": 0.414394497666421, + "step": 611 + }, + { + "epoch": 0.11345939933259176, + "grad_norm": 15.3046875, + "learning_rate": 9.88654060066741e-06, + "loss": 2.7082, + "mean_token_accuracy": 0.43707273338563263, + "step": 612 + }, + { + "epoch": 0.11364479050797183, + "grad_norm": 6.1796875, + "learning_rate": 9.886355209492028e-06, + "loss": 3.1872, + "mean_token_accuracy": 0.41010071090047395, + "step": 613 + }, + { + "epoch": 0.11383018168335188, + "grad_norm": 7.55078125, + "learning_rate": 9.886169818316649e-06, + "loss": 2.9411, + "mean_token_accuracy": 0.43027276219746446, + "step": 614 + }, + { + "epoch": 0.11401557285873193, + "grad_norm": 6.59765625, + "learning_rate": 9.88598442714127e-06, + "loss": 2.7976, + "mean_token_accuracy": 0.4294573643410853, + "step": 615 + }, + { + "epoch": 0.11420096403411198, + "grad_norm": 6.69921875, + "learning_rate": 9.885799035965888e-06, + "loss": 2.6748, + "mean_token_accuracy": 0.4445882704371204, + "step": 616 + }, + { + "epoch": 0.11438635520949203, + "grad_norm": 6.07421875, + "learning_rate": 9.885613644790509e-06, + "loss": 2.8201, + "mean_token_accuracy": 0.43463191459864137, + "step": 617 + }, + { + "epoch": 0.11457174638487208, + "grad_norm": 8.046875, + "learning_rate": 9.885428253615129e-06, + "loss": 3.2382, + "mean_token_accuracy": 0.40473330129745316, + "step": 618 + }, + { + "epoch": 0.11475713756025213, + "grad_norm": 5.34765625, + "learning_rate": 9.88524286243975e-06, + "loss": 2.7734, + "mean_token_accuracy": 0.4668, + "step": 619 + }, + { + "epoch": 0.11494252873563218, + "grad_norm": 7.04296875, + "learning_rate": 9.885057471264368e-06, + "loss": 2.7128, + "mean_token_accuracy": 0.44738628649015616, + "step": 620 + }, + { + "epoch": 0.11512791991101223, + "grad_norm": 8.78125, + "learning_rate": 9.884872080088989e-06, + "loss": 2.4175, + "mean_token_accuracy": 0.4752225322173509, + "step": 621 + }, + { + "epoch": 0.11531331108639228, + "grad_norm": 6.45703125, + "learning_rate": 9.884686688913608e-06, + "loss": 2.963, + "mean_token_accuracy": 0.42620897018291154, + "step": 622 + }, + { + "epoch": 0.11549870226177233, + "grad_norm": 7.15234375, + "learning_rate": 9.884501297738228e-06, + "loss": 3.2089, + "mean_token_accuracy": 0.38625725519565624, + "step": 623 + }, + { + "epoch": 0.11568409343715239, + "grad_norm": 6.80078125, + "learning_rate": 9.884315906562849e-06, + "loss": 2.9824, + "mean_token_accuracy": 0.4298110067752288, + "step": 624 + }, + { + "epoch": 0.11586948461253245, + "grad_norm": 6.421875, + "learning_rate": 9.884130515387467e-06, + "loss": 2.9457, + "mean_token_accuracy": 0.4301253496322387, + "step": 625 + }, + { + "epoch": 0.1160548757879125, + "grad_norm": 6.5546875, + "learning_rate": 9.88394512421209e-06, + "loss": 3.0506, + "mean_token_accuracy": 0.40153886280646844, + "step": 626 + }, + { + "epoch": 0.11624026696329255, + "grad_norm": 7.3828125, + "learning_rate": 9.883759733036708e-06, + "loss": 2.7686, + "mean_token_accuracy": 0.4399644233619923, + "step": 627 + }, + { + "epoch": 0.1164256581386726, + "grad_norm": 7.7109375, + "learning_rate": 9.883574341861329e-06, + "loss": 3.2799, + "mean_token_accuracy": 0.4031502212965374, + "step": 628 + }, + { + "epoch": 0.11661104931405265, + "grad_norm": 6.26953125, + "learning_rate": 9.883388950685948e-06, + "loss": 2.7435, + "mean_token_accuracy": 0.45799031476997576, + "step": 629 + }, + { + "epoch": 0.1167964404894327, + "grad_norm": 6.80078125, + "learning_rate": 9.883203559510568e-06, + "loss": 3.0745, + "mean_token_accuracy": 0.42391716125773454, + "step": 630 + }, + { + "epoch": 0.11698183166481275, + "grad_norm": 6.9140625, + "learning_rate": 9.883018168335189e-06, + "loss": 3.0794, + "mean_token_accuracy": 0.4025974025974026, + "step": 631 + }, + { + "epoch": 0.1171672228401928, + "grad_norm": 8.546875, + "learning_rate": 9.882832777159807e-06, + "loss": 2.3033, + "mean_token_accuracy": 0.494098955969133, + "step": 632 + }, + { + "epoch": 0.11735261401557286, + "grad_norm": 10.3515625, + "learning_rate": 9.882647385984428e-06, + "loss": 2.9588, + "mean_token_accuracy": 0.4038412617654541, + "step": 633 + }, + { + "epoch": 0.11753800519095291, + "grad_norm": 7.515625, + "learning_rate": 9.882461994809048e-06, + "loss": 2.9061, + "mean_token_accuracy": 0.41789748045178104, + "step": 634 + }, + { + "epoch": 0.11772339636633296, + "grad_norm": 7.6015625, + "learning_rate": 9.882276603633669e-06, + "loss": 2.8987, + "mean_token_accuracy": 0.4271176294522598, + "step": 635 + }, + { + "epoch": 0.11790878754171301, + "grad_norm": 8.34375, + "learning_rate": 9.882091212458288e-06, + "loss": 3.0162, + "mean_token_accuracy": 0.4147084421235857, + "step": 636 + }, + { + "epoch": 0.11809417871709306, + "grad_norm": 10.03125, + "learning_rate": 9.881905821282908e-06, + "loss": 2.5506, + "mean_token_accuracy": 0.4669016411499064, + "step": 637 + }, + { + "epoch": 0.11827956989247312, + "grad_norm": 6.921875, + "learning_rate": 9.881720430107527e-06, + "loss": 2.8896, + "mean_token_accuracy": 0.42791265427912656, + "step": 638 + }, + { + "epoch": 0.11846496106785318, + "grad_norm": 7.60546875, + "learning_rate": 9.881535038932147e-06, + "loss": 3.1056, + "mean_token_accuracy": 0.39513721029432675, + "step": 639 + }, + { + "epoch": 0.11865035224323323, + "grad_norm": 7.984375, + "learning_rate": 9.881349647756768e-06, + "loss": 3.2403, + "mean_token_accuracy": 0.3993878094224115, + "step": 640 + }, + { + "epoch": 0.11883574341861328, + "grad_norm": 11.3046875, + "learning_rate": 9.881164256581387e-06, + "loss": 3.0229, + "mean_token_accuracy": 0.42289750766360124, + "step": 641 + }, + { + "epoch": 0.11902113459399333, + "grad_norm": 10.1640625, + "learning_rate": 9.880978865406007e-06, + "loss": 3.3999, + "mean_token_accuracy": 0.35867973414441817, + "step": 642 + }, + { + "epoch": 0.11920652576937338, + "grad_norm": 12.3828125, + "learning_rate": 9.880793474230628e-06, + "loss": 2.7568, + "mean_token_accuracy": 0.452468380252958, + "step": 643 + }, + { + "epoch": 0.11939191694475343, + "grad_norm": 5.3671875, + "learning_rate": 9.880608083055248e-06, + "loss": 2.9786, + "mean_token_accuracy": 0.4222486615110054, + "step": 644 + }, + { + "epoch": 0.11957730812013348, + "grad_norm": 8.6796875, + "learning_rate": 9.880422691879867e-06, + "loss": 2.6843, + "mean_token_accuracy": 0.4693542272210287, + "step": 645 + }, + { + "epoch": 0.11976269929551353, + "grad_norm": 8.1484375, + "learning_rate": 9.880237300704488e-06, + "loss": 2.7184, + "mean_token_accuracy": 0.44553226696083836, + "step": 646 + }, + { + "epoch": 0.11994809047089358, + "grad_norm": 13.5546875, + "learning_rate": 9.880051909529106e-06, + "loss": 3.0748, + "mean_token_accuracy": 0.3765888825649782, + "step": 647 + }, + { + "epoch": 0.12013348164627363, + "grad_norm": 9.3984375, + "learning_rate": 9.879866518353727e-06, + "loss": 2.9943, + "mean_token_accuracy": 0.40639350052984813, + "step": 648 + }, + { + "epoch": 0.12031887282165368, + "grad_norm": 5.76171875, + "learning_rate": 9.879681127178347e-06, + "loss": 3.0106, + "mean_token_accuracy": 0.40795701675834717, + "step": 649 + }, + { + "epoch": 0.12050426399703375, + "grad_norm": 6.57421875, + "learning_rate": 9.879495736002968e-06, + "loss": 3.25, + "mean_token_accuracy": 0.39554494828957837, + "step": 650 + }, + { + "epoch": 0.1206896551724138, + "grad_norm": 8.0546875, + "learning_rate": 9.879310344827587e-06, + "loss": 2.7496, + "mean_token_accuracy": 0.44060114503816794, + "step": 651 + }, + { + "epoch": 0.12087504634779385, + "grad_norm": 7.1875, + "learning_rate": 9.879124953652207e-06, + "loss": 2.7566, + "mean_token_accuracy": 0.43593967811430745, + "step": 652 + }, + { + "epoch": 0.1210604375231739, + "grad_norm": 6.5859375, + "learning_rate": 9.878939562476828e-06, + "loss": 3.0319, + "mean_token_accuracy": 0.4093316803786769, + "step": 653 + }, + { + "epoch": 0.12124582869855395, + "grad_norm": 6.0390625, + "learning_rate": 9.878754171301446e-06, + "loss": 2.7842, + "mean_token_accuracy": 0.45045698534745393, + "step": 654 + }, + { + "epoch": 0.121431219873934, + "grad_norm": 9.3046875, + "learning_rate": 9.878568780126067e-06, + "loss": 3.4031, + "mean_token_accuracy": 0.37794656888423256, + "step": 655 + }, + { + "epoch": 0.12161661104931405, + "grad_norm": 7.484375, + "learning_rate": 9.878383388950686e-06, + "loss": 2.7452, + "mean_token_accuracy": 0.4482017481014472, + "step": 656 + }, + { + "epoch": 0.1218020022246941, + "grad_norm": 9.3984375, + "learning_rate": 9.878197997775306e-06, + "loss": 3.2324, + "mean_token_accuracy": 0.3964526605046215, + "step": 657 + }, + { + "epoch": 0.12198739340007415, + "grad_norm": 11.0, + "learning_rate": 9.878012606599927e-06, + "loss": 3.3105, + "mean_token_accuracy": 0.37773512476007676, + "step": 658 + }, + { + "epoch": 0.1221727845754542, + "grad_norm": 11.0703125, + "learning_rate": 9.877827215424547e-06, + "loss": 3.3004, + "mean_token_accuracy": 0.38499690018598887, + "step": 659 + }, + { + "epoch": 0.12235817575083426, + "grad_norm": 8.171875, + "learning_rate": 9.877641824249166e-06, + "loss": 3.3313, + "mean_token_accuracy": 0.3730419902593129, + "step": 660 + }, + { + "epoch": 0.1225435669262143, + "grad_norm": 7.3203125, + "learning_rate": 9.877456433073786e-06, + "loss": 2.6276, + "mean_token_accuracy": 0.44212479703085134, + "step": 661 + }, + { + "epoch": 0.12272895810159437, + "grad_norm": 6.55859375, + "learning_rate": 9.877271041898407e-06, + "loss": 3.1038, + "mean_token_accuracy": 0.3980857851825594, + "step": 662 + }, + { + "epoch": 0.12291434927697442, + "grad_norm": 6.03515625, + "learning_rate": 9.877085650723026e-06, + "loss": 3.2664, + "mean_token_accuracy": 0.39626853771328335, + "step": 663 + }, + { + "epoch": 0.12309974045235447, + "grad_norm": 7.05078125, + "learning_rate": 9.876900259547646e-06, + "loss": 3.1763, + "mean_token_accuracy": 0.3953143934293793, + "step": 664 + }, + { + "epoch": 0.12328513162773452, + "grad_norm": 8.28125, + "learning_rate": 9.876714868372265e-06, + "loss": 2.8905, + "mean_token_accuracy": 0.423560281729043, + "step": 665 + }, + { + "epoch": 0.12347052280311457, + "grad_norm": 6.21484375, + "learning_rate": 9.876529477196887e-06, + "loss": 2.899, + "mean_token_accuracy": 0.425415735914619, + "step": 666 + }, + { + "epoch": 0.12365591397849462, + "grad_norm": 9.3984375, + "learning_rate": 9.876344086021506e-06, + "loss": 2.8276, + "mean_token_accuracy": 0.4398491945618645, + "step": 667 + }, + { + "epoch": 0.12384130515387468, + "grad_norm": 9.8984375, + "learning_rate": 9.876158694846126e-06, + "loss": 3.076, + "mean_token_accuracy": 0.40351941747572817, + "step": 668 + }, + { + "epoch": 0.12402669632925473, + "grad_norm": 8.0546875, + "learning_rate": 9.875973303670747e-06, + "loss": 2.9766, + "mean_token_accuracy": 0.42509241139378123, + "step": 669 + }, + { + "epoch": 0.12421208750463478, + "grad_norm": 8.265625, + "learning_rate": 9.875787912495366e-06, + "loss": 2.8745, + "mean_token_accuracy": 0.42353594227033026, + "step": 670 + }, + { + "epoch": 0.12439747868001483, + "grad_norm": 9.1796875, + "learning_rate": 9.875602521319986e-06, + "loss": 3.0462, + "mean_token_accuracy": 0.41735537190082644, + "step": 671 + }, + { + "epoch": 0.12458286985539488, + "grad_norm": 8.96875, + "learning_rate": 9.875417130144605e-06, + "loss": 2.8012, + "mean_token_accuracy": 0.4233263840630067, + "step": 672 + }, + { + "epoch": 0.12476826103077493, + "grad_norm": 7.2734375, + "learning_rate": 9.875231738969226e-06, + "loss": 3.5253, + "mean_token_accuracy": 0.3751308153363143, + "step": 673 + }, + { + "epoch": 0.12495365220615498, + "grad_norm": 13.3671875, + "learning_rate": 9.875046347793846e-06, + "loss": 2.8286, + "mean_token_accuracy": 0.4217142857142857, + "step": 674 + }, + { + "epoch": 0.12513904338153503, + "grad_norm": 11.5703125, + "learning_rate": 9.874860956618467e-06, + "loss": 3.2542, + "mean_token_accuracy": 0.3853965183752418, + "step": 675 + }, + { + "epoch": 0.12532443455691508, + "grad_norm": 11.171875, + "learning_rate": 9.874675565443085e-06, + "loss": 2.5511, + "mean_token_accuracy": 0.4512676056338028, + "step": 676 + }, + { + "epoch": 0.12550982573229513, + "grad_norm": 6.765625, + "learning_rate": 9.874490174267706e-06, + "loss": 2.9929, + "mean_token_accuracy": 0.41382038770934987, + "step": 677 + }, + { + "epoch": 0.12569521690767518, + "grad_norm": 18.234375, + "learning_rate": 9.874304783092326e-06, + "loss": 3.1176, + "mean_token_accuracy": 0.38240428255556846, + "step": 678 + }, + { + "epoch": 0.12588060808305523, + "grad_norm": 12.8671875, + "learning_rate": 9.874119391916945e-06, + "loss": 2.2764, + "mean_token_accuracy": 0.485667382245233, + "step": 679 + }, + { + "epoch": 0.12606599925843529, + "grad_norm": 8.578125, + "learning_rate": 9.873934000741566e-06, + "loss": 3.1105, + "mean_token_accuracy": 0.3994442006728097, + "step": 680 + }, + { + "epoch": 0.12625139043381536, + "grad_norm": 8.40625, + "learning_rate": 9.873748609566184e-06, + "loss": 3.0083, + "mean_token_accuracy": 0.4122506786262245, + "step": 681 + }, + { + "epoch": 0.12643678160919541, + "grad_norm": 15.21875, + "learning_rate": 9.873563218390807e-06, + "loss": 2.8403, + "mean_token_accuracy": 0.42344559585492225, + "step": 682 + }, + { + "epoch": 0.12662217278457547, + "grad_norm": 11.9296875, + "learning_rate": 9.873377827215425e-06, + "loss": 2.9019, + "mean_token_accuracy": 0.427928870292887, + "step": 683 + }, + { + "epoch": 0.12680756395995552, + "grad_norm": 8.6640625, + "learning_rate": 9.873192436040046e-06, + "loss": 3.0652, + "mean_token_accuracy": 0.3993808049535604, + "step": 684 + }, + { + "epoch": 0.12699295513533557, + "grad_norm": 10.5078125, + "learning_rate": 9.873007044864665e-06, + "loss": 2.6465, + "mean_token_accuracy": 0.4363766339869281, + "step": 685 + }, + { + "epoch": 0.12717834631071562, + "grad_norm": 6.3359375, + "learning_rate": 9.872821653689285e-06, + "loss": 2.899, + "mean_token_accuracy": 0.442664311845591, + "step": 686 + }, + { + "epoch": 0.12736373748609567, + "grad_norm": 10.1953125, + "learning_rate": 9.872636262513906e-06, + "loss": 3.2416, + "mean_token_accuracy": 0.39878197320341047, + "step": 687 + }, + { + "epoch": 0.12754912866147572, + "grad_norm": 10.109375, + "learning_rate": 9.872450871338524e-06, + "loss": 2.6962, + "mean_token_accuracy": 0.4372042227884965, + "step": 688 + }, + { + "epoch": 0.12773451983685577, + "grad_norm": 20.03125, + "learning_rate": 9.872265480163145e-06, + "loss": 3.223, + "mean_token_accuracy": 0.36838487972508593, + "step": 689 + }, + { + "epoch": 0.12791991101223582, + "grad_norm": 8.6640625, + "learning_rate": 9.872080088987765e-06, + "loss": 3.2226, + "mean_token_accuracy": 0.39855274144169217, + "step": 690 + }, + { + "epoch": 0.12810530218761587, + "grad_norm": 10.3125, + "learning_rate": 9.871894697812386e-06, + "loss": 2.9064, + "mean_token_accuracy": 0.41621691081984064, + "step": 691 + }, + { + "epoch": 0.12829069336299592, + "grad_norm": 8.640625, + "learning_rate": 9.871709306637005e-06, + "loss": 2.9699, + "mean_token_accuracy": 0.4099699183498066, + "step": 692 + }, + { + "epoch": 0.12847608453837597, + "grad_norm": 6.9765625, + "learning_rate": 9.871523915461625e-06, + "loss": 3.0353, + "mean_token_accuracy": 0.4178212787593746, + "step": 693 + }, + { + "epoch": 0.12866147571375602, + "grad_norm": 7.75, + "learning_rate": 9.871338524286244e-06, + "loss": 2.6648, + "mean_token_accuracy": 0.45852187028657615, + "step": 694 + }, + { + "epoch": 0.12884686688913607, + "grad_norm": 6.50390625, + "learning_rate": 9.871153133110865e-06, + "loss": 3.0548, + "mean_token_accuracy": 0.41835616438356166, + "step": 695 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 8.875, + "learning_rate": 9.870967741935485e-06, + "loss": 3.2201, + "mean_token_accuracy": 0.40557903634826714, + "step": 696 + }, + { + "epoch": 0.12921764923989618, + "grad_norm": 6.6484375, + "learning_rate": 9.870782350760104e-06, + "loss": 2.9487, + "mean_token_accuracy": 0.4257959388317874, + "step": 697 + }, + { + "epoch": 0.12940304041527623, + "grad_norm": 15.0703125, + "learning_rate": 9.870596959584724e-06, + "loss": 3.0872, + "mean_token_accuracy": 0.38031737565008666, + "step": 698 + }, + { + "epoch": 0.12958843159065628, + "grad_norm": 7.10546875, + "learning_rate": 9.870411568409345e-06, + "loss": 3.2629, + "mean_token_accuracy": 0.40667157223448613, + "step": 699 + }, + { + "epoch": 0.12977382276603633, + "grad_norm": 6.98046875, + "learning_rate": 9.870226177233965e-06, + "loss": 2.8492, + "mean_token_accuracy": 0.43875347115087937, + "step": 700 + }, + { + "epoch": 0.12995921394141638, + "grad_norm": 5.9765625, + "learning_rate": 9.870040786058584e-06, + "loss": 2.978, + "mean_token_accuracy": 0.40629453681710215, + "step": 701 + }, + { + "epoch": 0.13014460511679643, + "grad_norm": 7.44140625, + "learning_rate": 9.869855394883205e-06, + "loss": 3.2567, + "mean_token_accuracy": 0.3889651386220148, + "step": 702 + }, + { + "epoch": 0.13032999629217648, + "grad_norm": 8.0625, + "learning_rate": 9.869670003707823e-06, + "loss": 2.9607, + "mean_token_accuracy": 0.4, + "step": 703 + }, + { + "epoch": 0.13051538746755653, + "grad_norm": 6.07421875, + "learning_rate": 9.869484612532444e-06, + "loss": 2.8669, + "mean_token_accuracy": 0.4226688303582574, + "step": 704 + }, + { + "epoch": 0.13070077864293658, + "grad_norm": 6.5546875, + "learning_rate": 9.869299221357064e-06, + "loss": 2.8474, + "mean_token_accuracy": 0.4220518867924528, + "step": 705 + }, + { + "epoch": 0.13088616981831666, + "grad_norm": 9.2734375, + "learning_rate": 9.869113830181685e-06, + "loss": 2.7965, + "mean_token_accuracy": 0.4494828957836118, + "step": 706 + }, + { + "epoch": 0.1310715609936967, + "grad_norm": 8.4296875, + "learning_rate": 9.868928439006305e-06, + "loss": 2.6898, + "mean_token_accuracy": 0.45481171548117155, + "step": 707 + }, + { + "epoch": 0.13125695216907676, + "grad_norm": 7.35546875, + "learning_rate": 9.868743047830924e-06, + "loss": 3.0509, + "mean_token_accuracy": 0.4099435917617736, + "step": 708 + }, + { + "epoch": 0.1314423433444568, + "grad_norm": 10.2890625, + "learning_rate": 9.868557656655545e-06, + "loss": 2.332, + "mean_token_accuracy": 0.5083138656039576, + "step": 709 + }, + { + "epoch": 0.13162773451983686, + "grad_norm": 7.9609375, + "learning_rate": 9.868372265480163e-06, + "loss": 3.4955, + "mean_token_accuracy": 0.3647892949870182, + "step": 710 + }, + { + "epoch": 0.13181312569521692, + "grad_norm": 9.828125, + "learning_rate": 9.868186874304784e-06, + "loss": 3.0041, + "mean_token_accuracy": 0.40580575797395774, + "step": 711 + }, + { + "epoch": 0.13199851687059697, + "grad_norm": 5.58984375, + "learning_rate": 9.868001483129403e-06, + "loss": 3.4287, + "mean_token_accuracy": 0.35912722069870034, + "step": 712 + }, + { + "epoch": 0.13218390804597702, + "grad_norm": 9.453125, + "learning_rate": 9.867816091954023e-06, + "loss": 2.9458, + "mean_token_accuracy": 0.4134373760084645, + "step": 713 + }, + { + "epoch": 0.13236929922135707, + "grad_norm": 11.046875, + "learning_rate": 9.867630700778644e-06, + "loss": 2.8708, + "mean_token_accuracy": 0.4248428532385898, + "step": 714 + }, + { + "epoch": 0.13255469039673712, + "grad_norm": 10.4765625, + "learning_rate": 9.867445309603264e-06, + "loss": 3.3693, + "mean_token_accuracy": 0.37528634954857837, + "step": 715 + }, + { + "epoch": 0.13274008157211717, + "grad_norm": 6.6953125, + "learning_rate": 9.867259918427885e-06, + "loss": 3.1915, + "mean_token_accuracy": 0.38454580695180257, + "step": 716 + }, + { + "epoch": 0.13292547274749722, + "grad_norm": 11.0234375, + "learning_rate": 9.867074527252503e-06, + "loss": 3.0738, + "mean_token_accuracy": 0.4051822976381564, + "step": 717 + }, + { + "epoch": 0.13311086392287727, + "grad_norm": 7.00390625, + "learning_rate": 9.866889136077124e-06, + "loss": 3.4506, + "mean_token_accuracy": 0.36359537050954455, + "step": 718 + }, + { + "epoch": 0.13329625509825732, + "grad_norm": 7.140625, + "learning_rate": 9.866703744901743e-06, + "loss": 3.1755, + "mean_token_accuracy": 0.41415640302715534, + "step": 719 + }, + { + "epoch": 0.13348164627363737, + "grad_norm": 10.53125, + "learning_rate": 9.866518353726363e-06, + "loss": 2.8631, + "mean_token_accuracy": 0.4196456985255118, + "step": 720 + }, + { + "epoch": 0.13366703744901742, + "grad_norm": 9.1484375, + "learning_rate": 9.866332962550984e-06, + "loss": 3.9717, + "mean_token_accuracy": 0.32326324194836303, + "step": 721 + }, + { + "epoch": 0.13385242862439747, + "grad_norm": 7.6015625, + "learning_rate": 9.866147571375604e-06, + "loss": 3.2262, + "mean_token_accuracy": 0.3940175953079179, + "step": 722 + }, + { + "epoch": 0.13403781979977752, + "grad_norm": 7.48046875, + "learning_rate": 9.865962180200223e-06, + "loss": 3.1626, + "mean_token_accuracy": 0.38652597402597405, + "step": 723 + }, + { + "epoch": 0.13422321097515758, + "grad_norm": 9.5546875, + "learning_rate": 9.865776789024844e-06, + "loss": 3.1989, + "mean_token_accuracy": 0.4015409018808067, + "step": 724 + }, + { + "epoch": 0.13440860215053763, + "grad_norm": 12.34375, + "learning_rate": 9.865591397849464e-06, + "loss": 2.7034, + "mean_token_accuracy": 0.45056320400500627, + "step": 725 + }, + { + "epoch": 0.13459399332591768, + "grad_norm": 12.1875, + "learning_rate": 9.865406006674083e-06, + "loss": 2.7679, + "mean_token_accuracy": 0.44550369948776325, + "step": 726 + }, + { + "epoch": 0.13477938450129773, + "grad_norm": 8.0, + "learning_rate": 9.865220615498703e-06, + "loss": 2.7701, + "mean_token_accuracy": 0.4434520357267138, + "step": 727 + }, + { + "epoch": 0.13496477567667778, + "grad_norm": 9.140625, + "learning_rate": 9.865035224323322e-06, + "loss": 2.7807, + "mean_token_accuracy": 0.4473251780171627, + "step": 728 + }, + { + "epoch": 0.13515016685205783, + "grad_norm": 7.7734375, + "learning_rate": 9.864849833147943e-06, + "loss": 3.026, + "mean_token_accuracy": 0.4076058772687986, + "step": 729 + }, + { + "epoch": 0.1353355580274379, + "grad_norm": 9.4609375, + "learning_rate": 9.864664441972563e-06, + "loss": 2.5247, + "mean_token_accuracy": 0.46952672795369427, + "step": 730 + }, + { + "epoch": 0.13552094920281796, + "grad_norm": 6.6953125, + "learning_rate": 9.864479050797184e-06, + "loss": 3.3639, + "mean_token_accuracy": 0.38154450261780104, + "step": 731 + }, + { + "epoch": 0.135706340378198, + "grad_norm": 5.75, + "learning_rate": 9.864293659621802e-06, + "loss": 3.377, + "mean_token_accuracy": 0.3736515641855448, + "step": 732 + }, + { + "epoch": 0.13589173155357806, + "grad_norm": 5.90625, + "learning_rate": 9.864108268446423e-06, + "loss": 3.122, + "mean_token_accuracy": 0.40932708148523633, + "step": 733 + }, + { + "epoch": 0.1360771227289581, + "grad_norm": 7.84375, + "learning_rate": 9.863922877271043e-06, + "loss": 3.045, + "mean_token_accuracy": 0.407824455031038, + "step": 734 + }, + { + "epoch": 0.13626251390433816, + "grad_norm": 5.93359375, + "learning_rate": 9.863737486095662e-06, + "loss": 2.9052, + "mean_token_accuracy": 0.4120286164665939, + "step": 735 + }, + { + "epoch": 0.1364479050797182, + "grad_norm": 6.86328125, + "learning_rate": 9.863552094920283e-06, + "loss": 2.9854, + "mean_token_accuracy": 0.39720634920634923, + "step": 736 + }, + { + "epoch": 0.13663329625509826, + "grad_norm": 8.3828125, + "learning_rate": 9.863366703744901e-06, + "loss": 2.5673, + "mean_token_accuracy": 0.46193265007320644, + "step": 737 + }, + { + "epoch": 0.13681868743047831, + "grad_norm": 6.7734375, + "learning_rate": 9.863181312569524e-06, + "loss": 3.1197, + "mean_token_accuracy": 0.4005707878911426, + "step": 738 + }, + { + "epoch": 0.13700407860585836, + "grad_norm": 6.328125, + "learning_rate": 9.862995921394142e-06, + "loss": 2.9011, + "mean_token_accuracy": 0.4281341821743389, + "step": 739 + }, + { + "epoch": 0.13718946978123842, + "grad_norm": 7.7578125, + "learning_rate": 9.862810530218763e-06, + "loss": 3.2804, + "mean_token_accuracy": 0.39657297830374755, + "step": 740 + }, + { + "epoch": 0.13737486095661847, + "grad_norm": 10.203125, + "learning_rate": 9.862625139043382e-06, + "loss": 2.973, + "mean_token_accuracy": 0.40604960677555957, + "step": 741 + }, + { + "epoch": 0.13756025213199852, + "grad_norm": 10.1015625, + "learning_rate": 9.862439747868002e-06, + "loss": 2.4093, + "mean_token_accuracy": 0.47001584068793845, + "step": 742 + }, + { + "epoch": 0.13774564330737857, + "grad_norm": 11.5234375, + "learning_rate": 9.862254356692623e-06, + "loss": 3.3727, + "mean_token_accuracy": 0.36067346308310166, + "step": 743 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 7.5859375, + "learning_rate": 9.862068965517241e-06, + "loss": 2.5664, + "mean_token_accuracy": 0.49264998013508143, + "step": 744 + }, + { + "epoch": 0.13811642565813867, + "grad_norm": 10.546875, + "learning_rate": 9.861883574341862e-06, + "loss": 2.6881, + "mean_token_accuracy": 0.4430678466076696, + "step": 745 + }, + { + "epoch": 0.13830181683351872, + "grad_norm": 7.5703125, + "learning_rate": 9.86169818316648e-06, + "loss": 2.9677, + "mean_token_accuracy": 0.41450480149061203, + "step": 746 + }, + { + "epoch": 0.13848720800889877, + "grad_norm": 8.171875, + "learning_rate": 9.861512791991103e-06, + "loss": 2.9933, + "mean_token_accuracy": 0.41045454545454546, + "step": 747 + }, + { + "epoch": 0.13867259918427882, + "grad_norm": 8.8046875, + "learning_rate": 9.861327400815722e-06, + "loss": 3.0249, + "mean_token_accuracy": 0.4226044226044226, + "step": 748 + }, + { + "epoch": 0.13885799035965887, + "grad_norm": 7.42578125, + "learning_rate": 9.861142009640342e-06, + "loss": 3.2242, + "mean_token_accuracy": 0.39226460953186093, + "step": 749 + }, + { + "epoch": 0.13904338153503892, + "grad_norm": 5.9609375, + "learning_rate": 9.860956618464963e-06, + "loss": 2.6403, + "mean_token_accuracy": 0.4613674263479711, + "step": 750 + }, + { + "epoch": 0.13922877271041897, + "grad_norm": 10.25, + "learning_rate": 9.860771227289582e-06, + "loss": 2.806, + "mean_token_accuracy": 0.42448889865904593, + "step": 751 + }, + { + "epoch": 0.13941416388579903, + "grad_norm": 7.19921875, + "learning_rate": 9.860585836114202e-06, + "loss": 2.7607, + "mean_token_accuracy": 0.4307969615037981, + "step": 752 + }, + { + "epoch": 0.13959955506117908, + "grad_norm": 7.1484375, + "learning_rate": 9.86040044493882e-06, + "loss": 3.3084, + "mean_token_accuracy": 0.388254940161425, + "step": 753 + }, + { + "epoch": 0.13978494623655913, + "grad_norm": 9.265625, + "learning_rate": 9.860215053763441e-06, + "loss": 2.8351, + "mean_token_accuracy": 0.4377808988764045, + "step": 754 + }, + { + "epoch": 0.1399703374119392, + "grad_norm": 10.8515625, + "learning_rate": 9.860029662588062e-06, + "loss": 2.7376, + "mean_token_accuracy": 0.4554277498202732, + "step": 755 + }, + { + "epoch": 0.14015572858731926, + "grad_norm": 6.63671875, + "learning_rate": 9.859844271412682e-06, + "loss": 3.2321, + "mean_token_accuracy": 0.3894289864789362, + "step": 756 + }, + { + "epoch": 0.1403411197626993, + "grad_norm": 7.44921875, + "learning_rate": 9.859658880237301e-06, + "loss": 2.6218, + "mean_token_accuracy": 0.4551959489211801, + "step": 757 + }, + { + "epoch": 0.14052651093807936, + "grad_norm": 5.6484375, + "learning_rate": 9.859473489061922e-06, + "loss": 3.1867, + "mean_token_accuracy": 0.3927792915531335, + "step": 758 + }, + { + "epoch": 0.1407119021134594, + "grad_norm": 7.45703125, + "learning_rate": 9.859288097886542e-06, + "loss": 3.4918, + "mean_token_accuracy": 0.3734524369221125, + "step": 759 + }, + { + "epoch": 0.14089729328883946, + "grad_norm": 6.5625, + "learning_rate": 9.859102706711161e-06, + "loss": 2.9985, + "mean_token_accuracy": 0.41660759493670885, + "step": 760 + }, + { + "epoch": 0.1410826844642195, + "grad_norm": 6.5546875, + "learning_rate": 9.858917315535781e-06, + "loss": 3.0234, + "mean_token_accuracy": 0.40899110135213224, + "step": 761 + }, + { + "epoch": 0.14126807563959956, + "grad_norm": 6.26171875, + "learning_rate": 9.8587319243604e-06, + "loss": 2.6739, + "mean_token_accuracy": 0.4517402749341913, + "step": 762 + }, + { + "epoch": 0.1414534668149796, + "grad_norm": 6.48828125, + "learning_rate": 9.858546533185022e-06, + "loss": 3.0713, + "mean_token_accuracy": 0.4106090373280943, + "step": 763 + }, + { + "epoch": 0.14163885799035966, + "grad_norm": 6.41015625, + "learning_rate": 9.858361142009641e-06, + "loss": 2.7636, + "mean_token_accuracy": 0.44991534988713316, + "step": 764 + }, + { + "epoch": 0.1418242491657397, + "grad_norm": 7.10546875, + "learning_rate": 9.858175750834262e-06, + "loss": 2.5906, + "mean_token_accuracy": 0.4720234604105572, + "step": 765 + }, + { + "epoch": 0.14200964034111976, + "grad_norm": 6.07421875, + "learning_rate": 9.85799035965888e-06, + "loss": 3.2116, + "mean_token_accuracy": 0.3828075105377443, + "step": 766 + }, + { + "epoch": 0.14219503151649981, + "grad_norm": 7.2890625, + "learning_rate": 9.857804968483501e-06, + "loss": 2.509, + "mean_token_accuracy": 0.4621794037234735, + "step": 767 + }, + { + "epoch": 0.14238042269187987, + "grad_norm": 8.46875, + "learning_rate": 9.857619577308121e-06, + "loss": 2.2698, + "mean_token_accuracy": 0.5228154690218119, + "step": 768 + }, + { + "epoch": 0.14256581386725992, + "grad_norm": 7.3359375, + "learning_rate": 9.85743418613274e-06, + "loss": 3.0399, + "mean_token_accuracy": 0.41105919003115265, + "step": 769 + }, + { + "epoch": 0.14275120504263997, + "grad_norm": 7.34375, + "learning_rate": 9.85724879495736e-06, + "loss": 2.9676, + "mean_token_accuracy": 0.41291251964379255, + "step": 770 + }, + { + "epoch": 0.14293659621802002, + "grad_norm": 6.56640625, + "learning_rate": 9.857063403781981e-06, + "loss": 2.975, + "mean_token_accuracy": 0.4261959929126346, + "step": 771 + }, + { + "epoch": 0.14312198739340007, + "grad_norm": 8.7265625, + "learning_rate": 9.856878012606602e-06, + "loss": 2.6482, + "mean_token_accuracy": 0.45067458843916325, + "step": 772 + }, + { + "epoch": 0.14330737856878012, + "grad_norm": 8.15625, + "learning_rate": 9.85669262143122e-06, + "loss": 3.1154, + "mean_token_accuracy": 0.40451977401129946, + "step": 773 + }, + { + "epoch": 0.14349276974416017, + "grad_norm": 6.75, + "learning_rate": 9.856507230255841e-06, + "loss": 3.0661, + "mean_token_accuracy": 0.4118069520252801, + "step": 774 + }, + { + "epoch": 0.14367816091954022, + "grad_norm": 7.4375, + "learning_rate": 9.85632183908046e-06, + "loss": 3.6535, + "mean_token_accuracy": 0.36909323116219667, + "step": 775 + }, + { + "epoch": 0.14386355209492027, + "grad_norm": 10.1484375, + "learning_rate": 9.85613644790508e-06, + "loss": 2.8093, + "mean_token_accuracy": 0.42281771501925547, + "step": 776 + }, + { + "epoch": 0.14404894327030032, + "grad_norm": 8.8203125, + "learning_rate": 9.8559510567297e-06, + "loss": 2.7525, + "mean_token_accuracy": 0.44396351831813263, + "step": 777 + }, + { + "epoch": 0.14423433444568037, + "grad_norm": 6.06640625, + "learning_rate": 9.85576566555432e-06, + "loss": 3.2859, + "mean_token_accuracy": 0.3887994052781564, + "step": 778 + }, + { + "epoch": 0.14441972562106042, + "grad_norm": 8.9140625, + "learning_rate": 9.85558027437894e-06, + "loss": 2.5743, + "mean_token_accuracy": 0.4697271176805003, + "step": 779 + }, + { + "epoch": 0.1446051167964405, + "grad_norm": 6.3203125, + "learning_rate": 9.85539488320356e-06, + "loss": 3.4889, + "mean_token_accuracy": 0.3886387253204018, + "step": 780 + }, + { + "epoch": 0.14479050797182055, + "grad_norm": 8.671875, + "learning_rate": 9.855209492028181e-06, + "loss": 3.4763, + "mean_token_accuracy": 0.38178107208078205, + "step": 781 + }, + { + "epoch": 0.1449758991472006, + "grad_norm": 8.0859375, + "learning_rate": 9.8550241008528e-06, + "loss": 2.5951, + "mean_token_accuracy": 0.4481503345139709, + "step": 782 + }, + { + "epoch": 0.14516129032258066, + "grad_norm": 7.65234375, + "learning_rate": 9.85483870967742e-06, + "loss": 2.8362, + "mean_token_accuracy": 0.4366177099672038, + "step": 783 + }, + { + "epoch": 0.1453466814979607, + "grad_norm": 7.78515625, + "learning_rate": 9.854653318502039e-06, + "loss": 2.9784, + "mean_token_accuracy": 0.40820424555364315, + "step": 784 + }, + { + "epoch": 0.14553207267334076, + "grad_norm": 8.5390625, + "learning_rate": 9.85446792732666e-06, + "loss": 2.7548, + "mean_token_accuracy": 0.44809133071708884, + "step": 785 + }, + { + "epoch": 0.1457174638487208, + "grad_norm": 6.03125, + "learning_rate": 9.85428253615128e-06, + "loss": 3.6404, + "mean_token_accuracy": 0.3442300118156755, + "step": 786 + }, + { + "epoch": 0.14590285502410086, + "grad_norm": 9.578125, + "learning_rate": 9.8540971449759e-06, + "loss": 3.0064, + "mean_token_accuracy": 0.4064428721962969, + "step": 787 + }, + { + "epoch": 0.1460882461994809, + "grad_norm": 7.6171875, + "learning_rate": 9.853911753800521e-06, + "loss": 2.9781, + "mean_token_accuracy": 0.4061837258622614, + "step": 788 + }, + { + "epoch": 0.14627363737486096, + "grad_norm": 5.88671875, + "learning_rate": 9.85372636262514e-06, + "loss": 3.2133, + "mean_token_accuracy": 0.38737244897959183, + "step": 789 + }, + { + "epoch": 0.146459028550241, + "grad_norm": 7.0234375, + "learning_rate": 9.85354097144976e-06, + "loss": 3.3739, + "mean_token_accuracy": 0.3860845839017735, + "step": 790 + }, + { + "epoch": 0.14664441972562106, + "grad_norm": 9.7265625, + "learning_rate": 9.85335558027438e-06, + "loss": 2.9039, + "mean_token_accuracy": 0.42658672126352737, + "step": 791 + }, + { + "epoch": 0.1468298109010011, + "grad_norm": 8.3828125, + "learning_rate": 9.853170189099e-06, + "loss": 3.1087, + "mean_token_accuracy": 0.4005235602094241, + "step": 792 + }, + { + "epoch": 0.14701520207638116, + "grad_norm": 5.421875, + "learning_rate": 9.852984797923618e-06, + "loss": 3.0261, + "mean_token_accuracy": 0.4257202881152461, + "step": 793 + }, + { + "epoch": 0.1472005932517612, + "grad_norm": 9.875, + "learning_rate": 9.852799406748239e-06, + "loss": 2.646, + "mean_token_accuracy": 0.459673730751639, + "step": 794 + }, + { + "epoch": 0.14738598442714126, + "grad_norm": 9.3203125, + "learning_rate": 9.85261401557286e-06, + "loss": 2.7779, + "mean_token_accuracy": 0.44240048250904707, + "step": 795 + }, + { + "epoch": 0.14757137560252132, + "grad_norm": 7.65625, + "learning_rate": 9.85242862439748e-06, + "loss": 2.5365, + "mean_token_accuracy": 0.4512967610852157, + "step": 796 + }, + { + "epoch": 0.14775676677790137, + "grad_norm": 9.2890625, + "learning_rate": 9.8522432332221e-06, + "loss": 3.1935, + "mean_token_accuracy": 0.369736621372253, + "step": 797 + }, + { + "epoch": 0.14794215795328142, + "grad_norm": 8.5234375, + "learning_rate": 9.85205784204672e-06, + "loss": 2.4953, + "mean_token_accuracy": 0.48764492329312564, + "step": 798 + }, + { + "epoch": 0.14812754912866147, + "grad_norm": 6.46875, + "learning_rate": 9.85187245087134e-06, + "loss": 2.7617, + "mean_token_accuracy": 0.4278936196778624, + "step": 799 + }, + { + "epoch": 0.14831294030404152, + "grad_norm": 8.2265625, + "learning_rate": 9.851687059695958e-06, + "loss": 2.8215, + "mean_token_accuracy": 0.4289504036908881, + "step": 800 + }, + { + "epoch": 0.14849833147942157, + "grad_norm": 7.09765625, + "learning_rate": 9.851501668520579e-06, + "loss": 3.0931, + "mean_token_accuracy": 0.40274963820549925, + "step": 801 + }, + { + "epoch": 0.14868372265480162, + "grad_norm": 6.65625, + "learning_rate": 9.8513162773452e-06, + "loss": 3.3212, + "mean_token_accuracy": 0.3741313606814616, + "step": 802 + }, + { + "epoch": 0.14886911383018167, + "grad_norm": 7.2421875, + "learning_rate": 9.85113088616982e-06, + "loss": 2.6435, + "mean_token_accuracy": 0.4512826282628263, + "step": 803 + }, + { + "epoch": 0.14905450500556172, + "grad_norm": 7.01171875, + "learning_rate": 9.850945494994439e-06, + "loss": 2.4941, + "mean_token_accuracy": 0.45881397238017874, + "step": 804 + }, + { + "epoch": 0.1492398961809418, + "grad_norm": 7.5859375, + "learning_rate": 9.85076010381906e-06, + "loss": 3.2045, + "mean_token_accuracy": 0.39095197774283297, + "step": 805 + }, + { + "epoch": 0.14942528735632185, + "grad_norm": 5.65234375, + "learning_rate": 9.85057471264368e-06, + "loss": 2.8881, + "mean_token_accuracy": 0.4630516592541909, + "step": 806 + }, + { + "epoch": 0.1496106785317019, + "grad_norm": 5.91796875, + "learning_rate": 9.850389321468299e-06, + "loss": 2.9351, + "mean_token_accuracy": 0.4184427394146064, + "step": 807 + }, + { + "epoch": 0.14979606970708195, + "grad_norm": 8.6328125, + "learning_rate": 9.850203930292919e-06, + "loss": 2.6712, + "mean_token_accuracy": 0.4437269372693727, + "step": 808 + }, + { + "epoch": 0.149981460882462, + "grad_norm": 7.703125, + "learning_rate": 9.850018539117538e-06, + "loss": 2.8661, + "mean_token_accuracy": 0.40842204132748905, + "step": 809 + }, + { + "epoch": 0.15016685205784205, + "grad_norm": 14.6171875, + "learning_rate": 9.849833147942158e-06, + "loss": 2.5576, + "mean_token_accuracy": 0.4495798319327731, + "step": 810 + }, + { + "epoch": 0.1503522432332221, + "grad_norm": 7.91015625, + "learning_rate": 9.849647756766779e-06, + "loss": 2.9951, + "mean_token_accuracy": 0.42762465811066697, + "step": 811 + }, + { + "epoch": 0.15053763440860216, + "grad_norm": 7.69921875, + "learning_rate": 9.8494623655914e-06, + "loss": 2.8932, + "mean_token_accuracy": 0.4242622950819672, + "step": 812 + }, + { + "epoch": 0.1507230255839822, + "grad_norm": 9.75, + "learning_rate": 9.849276974416018e-06, + "loss": 3.1525, + "mean_token_accuracy": 0.39692242833052277, + "step": 813 + }, + { + "epoch": 0.15090841675936226, + "grad_norm": 9.359375, + "learning_rate": 9.849091583240639e-06, + "loss": 3.0924, + "mean_token_accuracy": 0.40548554484803556, + "step": 814 + }, + { + "epoch": 0.1510938079347423, + "grad_norm": 9.0078125, + "learning_rate": 9.848906192065259e-06, + "loss": 3.1264, + "mean_token_accuracy": 0.37162837162837165, + "step": 815 + }, + { + "epoch": 0.15127919911012236, + "grad_norm": 6.90625, + "learning_rate": 9.848720800889878e-06, + "loss": 2.5484, + "mean_token_accuracy": 0.4744904418280795, + "step": 816 + }, + { + "epoch": 0.1514645902855024, + "grad_norm": 9.2109375, + "learning_rate": 9.848535409714498e-06, + "loss": 3.0358, + "mean_token_accuracy": 0.4024390243902439, + "step": 817 + }, + { + "epoch": 0.15164998146088246, + "grad_norm": 8.9921875, + "learning_rate": 9.848350018539117e-06, + "loss": 2.6225, + "mean_token_accuracy": 0.44414292175486203, + "step": 818 + }, + { + "epoch": 0.1518353726362625, + "grad_norm": 9.203125, + "learning_rate": 9.84816462736374e-06, + "loss": 3.1218, + "mean_token_accuracy": 0.4061935172912399, + "step": 819 + }, + { + "epoch": 0.15202076381164256, + "grad_norm": 10.4765625, + "learning_rate": 9.847979236188358e-06, + "loss": 2.6542, + "mean_token_accuracy": 0.45192066281697213, + "step": 820 + }, + { + "epoch": 0.1522061549870226, + "grad_norm": 8.6484375, + "learning_rate": 9.847793845012979e-06, + "loss": 2.8867, + "mean_token_accuracy": 0.4193506993455665, + "step": 821 + }, + { + "epoch": 0.15239154616240266, + "grad_norm": 9.6640625, + "learning_rate": 9.847608453837597e-06, + "loss": 3.1338, + "mean_token_accuracy": 0.3854957507082153, + "step": 822 + }, + { + "epoch": 0.15257693733778271, + "grad_norm": 7.94921875, + "learning_rate": 9.847423062662218e-06, + "loss": 3.2794, + "mean_token_accuracy": 0.3942507645259939, + "step": 823 + }, + { + "epoch": 0.15276232851316277, + "grad_norm": 6.21875, + "learning_rate": 9.847237671486838e-06, + "loss": 2.9785, + "mean_token_accuracy": 0.41218826835265193, + "step": 824 + }, + { + "epoch": 0.15294771968854282, + "grad_norm": 12.703125, + "learning_rate": 9.847052280311457e-06, + "loss": 2.5551, + "mean_token_accuracy": 0.48514375075183447, + "step": 825 + }, + { + "epoch": 0.15313311086392287, + "grad_norm": 7.171875, + "learning_rate": 9.846866889136078e-06, + "loss": 3.058, + "mean_token_accuracy": 0.4107867521926954, + "step": 826 + }, + { + "epoch": 0.15331850203930292, + "grad_norm": 7.75, + "learning_rate": 9.846681497960698e-06, + "loss": 3.0017, + "mean_token_accuracy": 0.4088648332358104, + "step": 827 + }, + { + "epoch": 0.15350389321468297, + "grad_norm": 6.71484375, + "learning_rate": 9.846496106785319e-06, + "loss": 3.0407, + "mean_token_accuracy": 0.41314935064935066, + "step": 828 + }, + { + "epoch": 0.15368928439006305, + "grad_norm": 6.59375, + "learning_rate": 9.846310715609937e-06, + "loss": 3.0433, + "mean_token_accuracy": 0.4014628199918732, + "step": 829 + }, + { + "epoch": 0.1538746755654431, + "grad_norm": 6.4296875, + "learning_rate": 9.846125324434558e-06, + "loss": 2.5572, + "mean_token_accuracy": 0.47892011834319526, + "step": 830 + }, + { + "epoch": 0.15406006674082315, + "grad_norm": 9.234375, + "learning_rate": 9.845939933259177e-06, + "loss": 2.5824, + "mean_token_accuracy": 0.4693154034229829, + "step": 831 + }, + { + "epoch": 0.1542454579162032, + "grad_norm": 6.515625, + "learning_rate": 9.845754542083797e-06, + "loss": 3.3004, + "mean_token_accuracy": 0.38484621155288823, + "step": 832 + }, + { + "epoch": 0.15443084909158325, + "grad_norm": 7.5234375, + "learning_rate": 9.845569150908418e-06, + "loss": 2.9477, + "mean_token_accuracy": 0.42285553839674295, + "step": 833 + }, + { + "epoch": 0.1546162402669633, + "grad_norm": 10.75, + "learning_rate": 9.845383759733037e-06, + "loss": 2.4653, + "mean_token_accuracy": 0.461119927454092, + "step": 834 + }, + { + "epoch": 0.15480163144234335, + "grad_norm": 9.875, + "learning_rate": 9.845198368557659e-06, + "loss": 2.6306, + "mean_token_accuracy": 0.45259219668626405, + "step": 835 + }, + { + "epoch": 0.1549870226177234, + "grad_norm": 10.59375, + "learning_rate": 9.845012977382278e-06, + "loss": 2.7674, + "mean_token_accuracy": 0.438645585560375, + "step": 836 + }, + { + "epoch": 0.15517241379310345, + "grad_norm": 7.1640625, + "learning_rate": 9.844827586206898e-06, + "loss": 3.2029, + "mean_token_accuracy": 0.38777717685235263, + "step": 837 + }, + { + "epoch": 0.1553578049684835, + "grad_norm": 7.6796875, + "learning_rate": 9.844642195031517e-06, + "loss": 3.1878, + "mean_token_accuracy": 0.42232225300092335, + "step": 838 + }, + { + "epoch": 0.15554319614386355, + "grad_norm": 11.7109375, + "learning_rate": 9.844456803856137e-06, + "loss": 3.455, + "mean_token_accuracy": 0.36750832408435075, + "step": 839 + }, + { + "epoch": 0.1557285873192436, + "grad_norm": 6.89453125, + "learning_rate": 9.844271412680758e-06, + "loss": 3.1067, + "mean_token_accuracy": 0.4134655101197852, + "step": 840 + }, + { + "epoch": 0.15591397849462366, + "grad_norm": 8.921875, + "learning_rate": 9.844086021505377e-06, + "loss": 2.8434, + "mean_token_accuracy": 0.42536265793167993, + "step": 841 + }, + { + "epoch": 0.1560993696700037, + "grad_norm": 8.6953125, + "learning_rate": 9.843900630329997e-06, + "loss": 2.9312, + "mean_token_accuracy": 0.4146890113598021, + "step": 842 + }, + { + "epoch": 0.15628476084538376, + "grad_norm": 10.875, + "learning_rate": 9.843715239154618e-06, + "loss": 2.7913, + "mean_token_accuracy": 0.42761656341701987, + "step": 843 + }, + { + "epoch": 0.1564701520207638, + "grad_norm": 8.6875, + "learning_rate": 9.843529847979238e-06, + "loss": 3.2152, + "mean_token_accuracy": 0.3933711737279546, + "step": 844 + }, + { + "epoch": 0.15665554319614386, + "grad_norm": 7.53125, + "learning_rate": 9.843344456803857e-06, + "loss": 2.8734, + "mean_token_accuracy": 0.4027480916030534, + "step": 845 + }, + { + "epoch": 0.1568409343715239, + "grad_norm": 8.125, + "learning_rate": 9.843159065628477e-06, + "loss": 2.6002, + "mean_token_accuracy": 0.47222923504165615, + "step": 846 + }, + { + "epoch": 0.15702632554690396, + "grad_norm": 9.6796875, + "learning_rate": 9.842973674453096e-06, + "loss": 3.3205, + "mean_token_accuracy": 0.3829444891391794, + "step": 847 + }, + { + "epoch": 0.157211716722284, + "grad_norm": 7.296875, + "learning_rate": 9.842788283277717e-06, + "loss": 3.3651, + "mean_token_accuracy": 0.37548226509023025, + "step": 848 + }, + { + "epoch": 0.15739710789766406, + "grad_norm": 8.7578125, + "learning_rate": 9.842602892102337e-06, + "loss": 3.1086, + "mean_token_accuracy": 0.3885805763073639, + "step": 849 + }, + { + "epoch": 0.1575824990730441, + "grad_norm": 8.8984375, + "learning_rate": 9.842417500926956e-06, + "loss": 3.0153, + "mean_token_accuracy": 0.40098593242755803, + "step": 850 + }, + { + "epoch": 0.15776789024842416, + "grad_norm": 8.84375, + "learning_rate": 9.842232109751576e-06, + "loss": 2.7477, + "mean_token_accuracy": 0.44428880682541705, + "step": 851 + }, + { + "epoch": 0.15795328142380421, + "grad_norm": 7.90234375, + "learning_rate": 9.842046718576197e-06, + "loss": 3.3033, + "mean_token_accuracy": 0.38580060422960727, + "step": 852 + }, + { + "epoch": 0.15813867259918427, + "grad_norm": 13.7109375, + "learning_rate": 9.841861327400817e-06, + "loss": 2.4676, + "mean_token_accuracy": 0.4555924958442175, + "step": 853 + }, + { + "epoch": 0.15832406377456434, + "grad_norm": 9.6171875, + "learning_rate": 9.841675936225436e-06, + "loss": 2.9422, + "mean_token_accuracy": 0.40244248792956544, + "step": 854 + }, + { + "epoch": 0.1585094549499444, + "grad_norm": 7.1640625, + "learning_rate": 9.841490545050057e-06, + "loss": 2.8004, + "mean_token_accuracy": 0.423541915769895, + "step": 855 + }, + { + "epoch": 0.15869484612532445, + "grad_norm": 8.84375, + "learning_rate": 9.841305153874676e-06, + "loss": 2.7139, + "mean_token_accuracy": 0.4201044119152421, + "step": 856 + }, + { + "epoch": 0.1588802373007045, + "grad_norm": 8.7109375, + "learning_rate": 9.841119762699296e-06, + "loss": 2.7176, + "mean_token_accuracy": 0.4476150510560918, + "step": 857 + }, + { + "epoch": 0.15906562847608455, + "grad_norm": 8.09375, + "learning_rate": 9.840934371523916e-06, + "loss": 3.0865, + "mean_token_accuracy": 0.405218525766471, + "step": 858 + }, + { + "epoch": 0.1592510196514646, + "grad_norm": 7.15625, + "learning_rate": 9.840748980348537e-06, + "loss": 3.3838, + "mean_token_accuracy": 0.36763754045307445, + "step": 859 + }, + { + "epoch": 0.15943641082684465, + "grad_norm": 10.640625, + "learning_rate": 9.840563589173156e-06, + "loss": 3.0425, + "mean_token_accuracy": 0.4020387588215526, + "step": 860 + }, + { + "epoch": 0.1596218020022247, + "grad_norm": 7.328125, + "learning_rate": 9.840378197997776e-06, + "loss": 2.8725, + "mean_token_accuracy": 0.43518271539077086, + "step": 861 + }, + { + "epoch": 0.15980719317760475, + "grad_norm": 5.49609375, + "learning_rate": 9.840192806822397e-06, + "loss": 3.0704, + "mean_token_accuracy": 0.39563271395632715, + "step": 862 + }, + { + "epoch": 0.1599925843529848, + "grad_norm": 8.953125, + "learning_rate": 9.840007415647016e-06, + "loss": 2.8088, + "mean_token_accuracy": 0.42668037527310115, + "step": 863 + }, + { + "epoch": 0.16017797552836485, + "grad_norm": 9.2109375, + "learning_rate": 9.839822024471636e-06, + "loss": 2.9694, + "mean_token_accuracy": 0.41420861051838836, + "step": 864 + }, + { + "epoch": 0.1603633667037449, + "grad_norm": 6.83984375, + "learning_rate": 9.839636633296255e-06, + "loss": 2.9639, + "mean_token_accuracy": 0.426615064007145, + "step": 865 + }, + { + "epoch": 0.16054875787912495, + "grad_norm": 6.4609375, + "learning_rate": 9.839451242120875e-06, + "loss": 2.4992, + "mean_token_accuracy": 0.465635507733692, + "step": 866 + }, + { + "epoch": 0.160734149054505, + "grad_norm": 10.6640625, + "learning_rate": 9.839265850945496e-06, + "loss": 2.6242, + "mean_token_accuracy": 0.42900403768506057, + "step": 867 + }, + { + "epoch": 0.16091954022988506, + "grad_norm": 6.3359375, + "learning_rate": 9.839080459770116e-06, + "loss": 3.102, + "mean_token_accuracy": 0.4029791195637718, + "step": 868 + }, + { + "epoch": 0.1611049314052651, + "grad_norm": 7.25390625, + "learning_rate": 9.838895068594737e-06, + "loss": 3.1046, + "mean_token_accuracy": 0.40268617340208657, + "step": 869 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 7.2421875, + "learning_rate": 9.838709677419356e-06, + "loss": 2.9799, + "mean_token_accuracy": 0.4182015167930661, + "step": 870 + }, + { + "epoch": 0.1614757137560252, + "grad_norm": 7.86328125, + "learning_rate": 9.838524286243976e-06, + "loss": 2.428, + "mean_token_accuracy": 0.4929448786925185, + "step": 871 + }, + { + "epoch": 0.16166110493140526, + "grad_norm": 7.06640625, + "learning_rate": 9.838338895068595e-06, + "loss": 3.1815, + "mean_token_accuracy": 0.37569850552306694, + "step": 872 + }, + { + "epoch": 0.1618464961067853, + "grad_norm": 8.1640625, + "learning_rate": 9.838153503893215e-06, + "loss": 2.9097, + "mean_token_accuracy": 0.4155888748998512, + "step": 873 + }, + { + "epoch": 0.16203188728216536, + "grad_norm": 6.66796875, + "learning_rate": 9.837968112717834e-06, + "loss": 2.9409, + "mean_token_accuracy": 0.4218113975576662, + "step": 874 + }, + { + "epoch": 0.1622172784575454, + "grad_norm": 8.9453125, + "learning_rate": 9.837782721542455e-06, + "loss": 2.719, + "mean_token_accuracy": 0.4326935631283457, + "step": 875 + }, + { + "epoch": 0.16240266963292546, + "grad_norm": 14.890625, + "learning_rate": 9.837597330367075e-06, + "loss": 2.5295, + "mean_token_accuracy": 0.4559766108174969, + "step": 876 + }, + { + "epoch": 0.1625880608083055, + "grad_norm": 7.08984375, + "learning_rate": 9.837411939191696e-06, + "loss": 2.9903, + "mean_token_accuracy": 0.42188698542572234, + "step": 877 + }, + { + "epoch": 0.16277345198368556, + "grad_norm": 7.546875, + "learning_rate": 9.837226548016316e-06, + "loss": 2.9975, + "mean_token_accuracy": 0.4095304835318851, + "step": 878 + }, + { + "epoch": 0.16295884315906564, + "grad_norm": 8.1953125, + "learning_rate": 9.837041156840935e-06, + "loss": 2.8369, + "mean_token_accuracy": 0.42687011537030145, + "step": 879 + }, + { + "epoch": 0.1631442343344457, + "grad_norm": 7.5078125, + "learning_rate": 9.836855765665555e-06, + "loss": 2.4864, + "mean_token_accuracy": 0.4581965142712806, + "step": 880 + }, + { + "epoch": 0.16332962550982574, + "grad_norm": 8.7265625, + "learning_rate": 9.836670374490174e-06, + "loss": 2.2708, + "mean_token_accuracy": 0.49271422357546757, + "step": 881 + }, + { + "epoch": 0.1635150166852058, + "grad_norm": 10.0390625, + "learning_rate": 9.836484983314795e-06, + "loss": 2.6299, + "mean_token_accuracy": 0.45739005046863734, + "step": 882 + }, + { + "epoch": 0.16370040786058584, + "grad_norm": 9.0859375, + "learning_rate": 9.836299592139415e-06, + "loss": 2.5935, + "mean_token_accuracy": 0.4530878115996493, + "step": 883 + }, + { + "epoch": 0.1638857990359659, + "grad_norm": 6.1875, + "learning_rate": 9.836114200964036e-06, + "loss": 2.7587, + "mean_token_accuracy": 0.43889588821440845, + "step": 884 + }, + { + "epoch": 0.16407119021134595, + "grad_norm": 11.046875, + "learning_rate": 9.835928809788655e-06, + "loss": 2.566, + "mean_token_accuracy": 0.4770997846374731, + "step": 885 + }, + { + "epoch": 0.164256581386726, + "grad_norm": 6.640625, + "learning_rate": 9.835743418613275e-06, + "loss": 3.6766, + "mean_token_accuracy": 0.3553476682490924, + "step": 886 + }, + { + "epoch": 0.16444197256210605, + "grad_norm": 5.9609375, + "learning_rate": 9.835558027437895e-06, + "loss": 2.9207, + "mean_token_accuracy": 0.4325127334465195, + "step": 887 + }, + { + "epoch": 0.1646273637374861, + "grad_norm": 7.98828125, + "learning_rate": 9.835372636262514e-06, + "loss": 3.0047, + "mean_token_accuracy": 0.42887544802867383, + "step": 888 + }, + { + "epoch": 0.16481275491286615, + "grad_norm": 7.10546875, + "learning_rate": 9.835187245087135e-06, + "loss": 3.0301, + "mean_token_accuracy": 0.41472980825101685, + "step": 889 + }, + { + "epoch": 0.1649981460882462, + "grad_norm": 5.5546875, + "learning_rate": 9.835001853911754e-06, + "loss": 3.1827, + "mean_token_accuracy": 0.3912381780628329, + "step": 890 + }, + { + "epoch": 0.16518353726362625, + "grad_norm": 6.95703125, + "learning_rate": 9.834816462736374e-06, + "loss": 3.2358, + "mean_token_accuracy": 0.39298099597725067, + "step": 891 + }, + { + "epoch": 0.1653689284390063, + "grad_norm": 7.484375, + "learning_rate": 9.834631071560995e-06, + "loss": 3.115, + "mean_token_accuracy": 0.4002200522624123, + "step": 892 + }, + { + "epoch": 0.16555431961438635, + "grad_norm": 6.36328125, + "learning_rate": 9.834445680385615e-06, + "loss": 2.7536, + "mean_token_accuracy": 0.4423791821561338, + "step": 893 + }, + { + "epoch": 0.1657397107897664, + "grad_norm": 10.5390625, + "learning_rate": 9.834260289210234e-06, + "loss": 2.8993, + "mean_token_accuracy": 0.42203570161957676, + "step": 894 + }, + { + "epoch": 0.16592510196514645, + "grad_norm": 6.16015625, + "learning_rate": 9.834074898034854e-06, + "loss": 3.2818, + "mean_token_accuracy": 0.39052152317880795, + "step": 895 + }, + { + "epoch": 0.1661104931405265, + "grad_norm": 6.75390625, + "learning_rate": 9.833889506859475e-06, + "loss": 3.1184, + "mean_token_accuracy": 0.40248468345813476, + "step": 896 + }, + { + "epoch": 0.16629588431590656, + "grad_norm": 8.109375, + "learning_rate": 9.833704115684094e-06, + "loss": 2.8024, + "mean_token_accuracy": 0.4240407204385278, + "step": 897 + }, + { + "epoch": 0.1664812754912866, + "grad_norm": 6.67578125, + "learning_rate": 9.833518724508714e-06, + "loss": 3.0508, + "mean_token_accuracy": 0.3948530339346537, + "step": 898 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 9.546875, + "learning_rate": 9.833333333333333e-06, + "loss": 3.1853, + "mean_token_accuracy": 0.391392610637434, + "step": 899 + }, + { + "epoch": 0.1668520578420467, + "grad_norm": 6.68359375, + "learning_rate": 9.833147942157955e-06, + "loss": 2.9536, + "mean_token_accuracy": 0.43108345732449616, + "step": 900 + }, + { + "epoch": 0.16703744901742676, + "grad_norm": 6.625, + "learning_rate": 9.832962550982574e-06, + "loss": 2.5083, + "mean_token_accuracy": 0.4743078500626638, + "step": 901 + }, + { + "epoch": 0.1672228401928068, + "grad_norm": 12.2265625, + "learning_rate": 9.832777159807194e-06, + "loss": 3.0457, + "mean_token_accuracy": 0.4022612723062253, + "step": 902 + }, + { + "epoch": 0.16740823136818686, + "grad_norm": 8.796875, + "learning_rate": 9.832591768631813e-06, + "loss": 3.3727, + "mean_token_accuracy": 0.3941884355738186, + "step": 903 + }, + { + "epoch": 0.16759362254356694, + "grad_norm": 11.078125, + "learning_rate": 9.832406377456434e-06, + "loss": 2.8555, + "mean_token_accuracy": 0.4276098719809951, + "step": 904 + }, + { + "epoch": 0.167779013718947, + "grad_norm": 11.4453125, + "learning_rate": 9.832220986281054e-06, + "loss": 2.6867, + "mean_token_accuracy": 0.4577971646673937, + "step": 905 + }, + { + "epoch": 0.16796440489432704, + "grad_norm": 8.9140625, + "learning_rate": 9.832035595105673e-06, + "loss": 2.989, + "mean_token_accuracy": 0.4172646227440748, + "step": 906 + }, + { + "epoch": 0.1681497960697071, + "grad_norm": 8.3515625, + "learning_rate": 9.831850203930293e-06, + "loss": 2.9702, + "mean_token_accuracy": 0.42941757156959526, + "step": 907 + }, + { + "epoch": 0.16833518724508714, + "grad_norm": 8.8359375, + "learning_rate": 9.831664812754914e-06, + "loss": 2.8582, + "mean_token_accuracy": 0.4194536322430479, + "step": 908 + }, + { + "epoch": 0.1685205784204672, + "grad_norm": 8.8125, + "learning_rate": 9.831479421579534e-06, + "loss": 3.2169, + "mean_token_accuracy": 0.37999403697078116, + "step": 909 + }, + { + "epoch": 0.16870596959584724, + "grad_norm": 9.7734375, + "learning_rate": 9.831294030404153e-06, + "loss": 3.4214, + "mean_token_accuracy": 0.3838383838383838, + "step": 910 + }, + { + "epoch": 0.1688913607712273, + "grad_norm": 7.76953125, + "learning_rate": 9.831108639228774e-06, + "loss": 2.8021, + "mean_token_accuracy": 0.449582236465606, + "step": 911 + }, + { + "epoch": 0.16907675194660735, + "grad_norm": 9.09375, + "learning_rate": 9.830923248053393e-06, + "loss": 2.8376, + "mean_token_accuracy": 0.4329275103317324, + "step": 912 + }, + { + "epoch": 0.1692621431219874, + "grad_norm": 7.80078125, + "learning_rate": 9.830737856878013e-06, + "loss": 3.6041, + "mean_token_accuracy": 0.35638665132336017, + "step": 913 + }, + { + "epoch": 0.16944753429736745, + "grad_norm": 8.125, + "learning_rate": 9.830552465702634e-06, + "loss": 3.0355, + "mean_token_accuracy": 0.4033574618820268, + "step": 914 + }, + { + "epoch": 0.1696329254727475, + "grad_norm": 7.69140625, + "learning_rate": 9.830367074527252e-06, + "loss": 3.2878, + "mean_token_accuracy": 0.37535730404693846, + "step": 915 + }, + { + "epoch": 0.16981831664812755, + "grad_norm": 10.15625, + "learning_rate": 9.830181683351874e-06, + "loss": 2.7633, + "mean_token_accuracy": 0.4202977735282065, + "step": 916 + }, + { + "epoch": 0.1700037078235076, + "grad_norm": 11.078125, + "learning_rate": 9.829996292176493e-06, + "loss": 2.9776, + "mean_token_accuracy": 0.42373367771559767, + "step": 917 + }, + { + "epoch": 0.17018909899888765, + "grad_norm": 10.7421875, + "learning_rate": 9.829810901001114e-06, + "loss": 3.1208, + "mean_token_accuracy": 0.39517230909366935, + "step": 918 + }, + { + "epoch": 0.1703744901742677, + "grad_norm": 6.74609375, + "learning_rate": 9.829625509825733e-06, + "loss": 3.0683, + "mean_token_accuracy": 0.3957353928811283, + "step": 919 + }, + { + "epoch": 0.17055988134964775, + "grad_norm": 9.53125, + "learning_rate": 9.829440118650353e-06, + "loss": 3.0807, + "mean_token_accuracy": 0.41297676457693994, + "step": 920 + }, + { + "epoch": 0.1707452725250278, + "grad_norm": 10.5703125, + "learning_rate": 9.829254727474974e-06, + "loss": 2.9419, + "mean_token_accuracy": 0.4179579707068563, + "step": 921 + }, + { + "epoch": 0.17093066370040785, + "grad_norm": 8.4140625, + "learning_rate": 9.829069336299592e-06, + "loss": 2.5302, + "mean_token_accuracy": 0.4503933011925907, + "step": 922 + }, + { + "epoch": 0.1711160548757879, + "grad_norm": 8.875, + "learning_rate": 9.828883945124213e-06, + "loss": 2.6073, + "mean_token_accuracy": 0.44781718963165074, + "step": 923 + }, + { + "epoch": 0.17130144605116795, + "grad_norm": 7.0546875, + "learning_rate": 9.828698553948833e-06, + "loss": 3.2873, + "mean_token_accuracy": 0.37283500455788515, + "step": 924 + }, + { + "epoch": 0.171486837226548, + "grad_norm": 9.1328125, + "learning_rate": 9.828513162773454e-06, + "loss": 2.7549, + "mean_token_accuracy": 0.4229826353421859, + "step": 925 + }, + { + "epoch": 0.17167222840192806, + "grad_norm": 6.18359375, + "learning_rate": 9.828327771598073e-06, + "loss": 2.8575, + "mean_token_accuracy": 0.4302970541106865, + "step": 926 + }, + { + "epoch": 0.1718576195773081, + "grad_norm": 5.80859375, + "learning_rate": 9.828142380422693e-06, + "loss": 3.078, + "mean_token_accuracy": 0.41122956645344705, + "step": 927 + }, + { + "epoch": 0.17204301075268819, + "grad_norm": 7.80859375, + "learning_rate": 9.827956989247312e-06, + "loss": 3.1563, + "mean_token_accuracy": 0.40778401122019636, + "step": 928 + }, + { + "epoch": 0.17222840192806824, + "grad_norm": 6.5546875, + "learning_rate": 9.827771598071932e-06, + "loss": 2.8492, + "mean_token_accuracy": 0.42769857433808556, + "step": 929 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 6.0859375, + "learning_rate": 9.827586206896553e-06, + "loss": 2.9861, + "mean_token_accuracy": 0.3925370941677685, + "step": 930 + }, + { + "epoch": 0.17259918427882834, + "grad_norm": 7.4921875, + "learning_rate": 9.827400815721172e-06, + "loss": 2.8912, + "mean_token_accuracy": 0.41513458608430676, + "step": 931 + }, + { + "epoch": 0.1727845754542084, + "grad_norm": 6.90234375, + "learning_rate": 9.827215424545792e-06, + "loss": 2.6906, + "mean_token_accuracy": 0.45125628140703516, + "step": 932 + }, + { + "epoch": 0.17296996662958844, + "grad_norm": 7.34375, + "learning_rate": 9.827030033370413e-06, + "loss": 2.5551, + "mean_token_accuracy": 0.45125895125895127, + "step": 933 + }, + { + "epoch": 0.1731553578049685, + "grad_norm": 9.640625, + "learning_rate": 9.826844642195033e-06, + "loss": 3.7212, + "mean_token_accuracy": 0.3551655083048651, + "step": 934 + }, + { + "epoch": 0.17334074898034854, + "grad_norm": 6.47265625, + "learning_rate": 9.826659251019652e-06, + "loss": 2.6948, + "mean_token_accuracy": 0.4425101782956619, + "step": 935 + }, + { + "epoch": 0.1735261401557286, + "grad_norm": 6.6640625, + "learning_rate": 9.826473859844272e-06, + "loss": 3.0416, + "mean_token_accuracy": 0.41005147656461666, + "step": 936 + }, + { + "epoch": 0.17371153133110864, + "grad_norm": 6.3671875, + "learning_rate": 9.826288468668891e-06, + "loss": 2.8398, + "mean_token_accuracy": 0.43751487741014045, + "step": 937 + }, + { + "epoch": 0.1738969225064887, + "grad_norm": 8.2890625, + "learning_rate": 9.826103077493512e-06, + "loss": 2.6089, + "mean_token_accuracy": 0.4540399858038566, + "step": 938 + }, + { + "epoch": 0.17408231368186874, + "grad_norm": 6.36328125, + "learning_rate": 9.825917686318132e-06, + "loss": 3.3504, + "mean_token_accuracy": 0.3643364034425487, + "step": 939 + }, + { + "epoch": 0.1742677048572488, + "grad_norm": 10.8515625, + "learning_rate": 9.825732295142753e-06, + "loss": 2.6028, + "mean_token_accuracy": 0.4558444902162719, + "step": 940 + }, + { + "epoch": 0.17445309603262885, + "grad_norm": 5.93359375, + "learning_rate": 9.825546903967372e-06, + "loss": 3.5589, + "mean_token_accuracy": 0.3567421566590484, + "step": 941 + }, + { + "epoch": 0.1746384872080089, + "grad_norm": 6.6015625, + "learning_rate": 9.825361512791992e-06, + "loss": 2.5763, + "mean_token_accuracy": 0.4640759150474469, + "step": 942 + }, + { + "epoch": 0.17482387838338895, + "grad_norm": 8.5546875, + "learning_rate": 9.825176121616613e-06, + "loss": 2.8157, + "mean_token_accuracy": 0.43114952987564453, + "step": 943 + }, + { + "epoch": 0.175009269558769, + "grad_norm": 8.8984375, + "learning_rate": 9.824990730441231e-06, + "loss": 2.6311, + "mean_token_accuracy": 0.45034553365753777, + "step": 944 + }, + { + "epoch": 0.17519466073414905, + "grad_norm": 7.09765625, + "learning_rate": 9.824805339265852e-06, + "loss": 3.032, + "mean_token_accuracy": 0.4153272101033295, + "step": 945 + }, + { + "epoch": 0.1753800519095291, + "grad_norm": 5.87890625, + "learning_rate": 9.82461994809047e-06, + "loss": 3.0415, + "mean_token_accuracy": 0.4156521739130435, + "step": 946 + }, + { + "epoch": 0.17556544308490915, + "grad_norm": 6.6171875, + "learning_rate": 9.824434556915091e-06, + "loss": 2.9468, + "mean_token_accuracy": 0.41203838325094505, + "step": 947 + }, + { + "epoch": 0.1757508342602892, + "grad_norm": 6.09375, + "learning_rate": 9.824249165739712e-06, + "loss": 2.6942, + "mean_token_accuracy": 0.44108949416342413, + "step": 948 + }, + { + "epoch": 0.17593622543566925, + "grad_norm": 6.671875, + "learning_rate": 9.824063774564332e-06, + "loss": 2.8791, + "mean_token_accuracy": 0.4248938087269919, + "step": 949 + }, + { + "epoch": 0.1761216166110493, + "grad_norm": 6.6875, + "learning_rate": 9.823878383388951e-06, + "loss": 3.0908, + "mean_token_accuracy": 0.40885381837635415, + "step": 950 + }, + { + "epoch": 0.17630700778642935, + "grad_norm": 5.80859375, + "learning_rate": 9.823692992213571e-06, + "loss": 2.7206, + "mean_token_accuracy": 0.44763470781684794, + "step": 951 + }, + { + "epoch": 0.1764923989618094, + "grad_norm": 15.25, + "learning_rate": 9.823507601038192e-06, + "loss": 2.9571, + "mean_token_accuracy": 0.40001937608990507, + "step": 952 + }, + { + "epoch": 0.17667779013718948, + "grad_norm": 6.4921875, + "learning_rate": 9.82332220986281e-06, + "loss": 3.2931, + "mean_token_accuracy": 0.37936225537156154, + "step": 953 + }, + { + "epoch": 0.17686318131256953, + "grad_norm": 7.2734375, + "learning_rate": 9.823136818687431e-06, + "loss": 2.6598, + "mean_token_accuracy": 0.45132444744390077, + "step": 954 + }, + { + "epoch": 0.17704857248794958, + "grad_norm": 7.16015625, + "learning_rate": 9.82295142751205e-06, + "loss": 2.9082, + "mean_token_accuracy": 0.43122376493380693, + "step": 955 + }, + { + "epoch": 0.17723396366332964, + "grad_norm": 5.98046875, + "learning_rate": 9.822766036336672e-06, + "loss": 3.1786, + "mean_token_accuracy": 0.41091632475444484, + "step": 956 + }, + { + "epoch": 0.1774193548387097, + "grad_norm": 8.875, + "learning_rate": 9.822580645161291e-06, + "loss": 3.1282, + "mean_token_accuracy": 0.3942307692307692, + "step": 957 + }, + { + "epoch": 0.17760474601408974, + "grad_norm": 7.92578125, + "learning_rate": 9.822395253985911e-06, + "loss": 3.2286, + "mean_token_accuracy": 0.38834370512206795, + "step": 958 + }, + { + "epoch": 0.1777901371894698, + "grad_norm": 8.1328125, + "learning_rate": 9.822209862810532e-06, + "loss": 2.6381, + "mean_token_accuracy": 0.4401952807160293, + "step": 959 + }, + { + "epoch": 0.17797552836484984, + "grad_norm": 14.2421875, + "learning_rate": 9.82202447163515e-06, + "loss": 2.5369, + "mean_token_accuracy": 0.4442168804570443, + "step": 960 + }, + { + "epoch": 0.1781609195402299, + "grad_norm": 7.26171875, + "learning_rate": 9.821839080459771e-06, + "loss": 2.8189, + "mean_token_accuracy": 0.43766816143497755, + "step": 961 + }, + { + "epoch": 0.17834631071560994, + "grad_norm": 5.9609375, + "learning_rate": 9.82165368928439e-06, + "loss": 3.3976, + "mean_token_accuracy": 0.3637658637658638, + "step": 962 + }, + { + "epoch": 0.17853170189099, + "grad_norm": 6.27734375, + "learning_rate": 9.82146829810901e-06, + "loss": 3.1762, + "mean_token_accuracy": 0.39179153094462543, + "step": 963 + }, + { + "epoch": 0.17871709306637004, + "grad_norm": 7.55078125, + "learning_rate": 9.821282906933631e-06, + "loss": 3.3903, + "mean_token_accuracy": 0.37871967991998, + "step": 964 + }, + { + "epoch": 0.1789024842417501, + "grad_norm": 9.734375, + "learning_rate": 9.821097515758251e-06, + "loss": 2.9571, + "mean_token_accuracy": 0.41045498547918685, + "step": 965 + }, + { + "epoch": 0.17908787541713014, + "grad_norm": 6.859375, + "learning_rate": 9.82091212458287e-06, + "loss": 3.5961, + "mean_token_accuracy": 0.3788939206806642, + "step": 966 + }, + { + "epoch": 0.1792732665925102, + "grad_norm": 8.3671875, + "learning_rate": 9.82072673340749e-06, + "loss": 2.6454, + "mean_token_accuracy": 0.4683882457702582, + "step": 967 + }, + { + "epoch": 0.17945865776789025, + "grad_norm": 6.90234375, + "learning_rate": 9.820541342232111e-06, + "loss": 2.8829, + "mean_token_accuracy": 0.4283302855535237, + "step": 968 + }, + { + "epoch": 0.1796440489432703, + "grad_norm": 11.0546875, + "learning_rate": 9.82035595105673e-06, + "loss": 3.1915, + "mean_token_accuracy": 0.3899988316392102, + "step": 969 + }, + { + "epoch": 0.17982944011865035, + "grad_norm": 7.70703125, + "learning_rate": 9.82017055988135e-06, + "loss": 2.7976, + "mean_token_accuracy": 0.44193806727175655, + "step": 970 + }, + { + "epoch": 0.1800148312940304, + "grad_norm": 7.35546875, + "learning_rate": 9.81998516870597e-06, + "loss": 2.9728, + "mean_token_accuracy": 0.4155251141552511, + "step": 971 + }, + { + "epoch": 0.18020022246941045, + "grad_norm": 7.4375, + "learning_rate": 9.819799777530592e-06, + "loss": 2.7307, + "mean_token_accuracy": 0.4378502658427935, + "step": 972 + }, + { + "epoch": 0.1803856136447905, + "grad_norm": 5.796875, + "learning_rate": 9.81961438635521e-06, + "loss": 2.8629, + "mean_token_accuracy": 0.4325975807457289, + "step": 973 + }, + { + "epoch": 0.18057100482017055, + "grad_norm": 6.0625, + "learning_rate": 9.81942899517983e-06, + "loss": 2.9161, + "mean_token_accuracy": 0.4185038868357334, + "step": 974 + }, + { + "epoch": 0.1807563959955506, + "grad_norm": 7.34765625, + "learning_rate": 9.81924360400445e-06, + "loss": 2.4493, + "mean_token_accuracy": 0.4842044801838024, + "step": 975 + }, + { + "epoch": 0.18094178717093065, + "grad_norm": 10.140625, + "learning_rate": 9.81905821282907e-06, + "loss": 3.1247, + "mean_token_accuracy": 0.39074141932331746, + "step": 976 + }, + { + "epoch": 0.1811271783463107, + "grad_norm": 8.859375, + "learning_rate": 9.81887282165369e-06, + "loss": 2.941, + "mean_token_accuracy": 0.4139446316097847, + "step": 977 + }, + { + "epoch": 0.18131256952169078, + "grad_norm": 8.6953125, + "learning_rate": 9.81868743047831e-06, + "loss": 2.9445, + "mean_token_accuracy": 0.42589613970588236, + "step": 978 + }, + { + "epoch": 0.18149796069707083, + "grad_norm": 8.34375, + "learning_rate": 9.81850203930293e-06, + "loss": 2.6382, + "mean_token_accuracy": 0.449826443673083, + "step": 979 + }, + { + "epoch": 0.18168335187245088, + "grad_norm": 6.3046875, + "learning_rate": 9.81831664812755e-06, + "loss": 2.3357, + "mean_token_accuracy": 0.5011212241129138, + "step": 980 + }, + { + "epoch": 0.18186874304783093, + "grad_norm": 7.3515625, + "learning_rate": 9.818131256952171e-06, + "loss": 2.9433, + "mean_token_accuracy": 0.4157101369105241, + "step": 981 + }, + { + "epoch": 0.18205413422321098, + "grad_norm": 6.50390625, + "learning_rate": 9.81794586577679e-06, + "loss": 3.0002, + "mean_token_accuracy": 0.40797546012269936, + "step": 982 + }, + { + "epoch": 0.18223952539859103, + "grad_norm": 8.359375, + "learning_rate": 9.81776047460141e-06, + "loss": 2.7719, + "mean_token_accuracy": 0.43075258475595096, + "step": 983 + }, + { + "epoch": 0.18242491657397109, + "grad_norm": 7.80859375, + "learning_rate": 9.817575083426029e-06, + "loss": 3.0833, + "mean_token_accuracy": 0.40502056745069087, + "step": 984 + }, + { + "epoch": 0.18261030774935114, + "grad_norm": 7.89453125, + "learning_rate": 9.81738969225065e-06, + "loss": 2.9024, + "mean_token_accuracy": 0.41554229372080953, + "step": 985 + }, + { + "epoch": 0.1827956989247312, + "grad_norm": 8.921875, + "learning_rate": 9.81720430107527e-06, + "loss": 2.6053, + "mean_token_accuracy": 0.46339239187076603, + "step": 986 + }, + { + "epoch": 0.18298109010011124, + "grad_norm": 8.703125, + "learning_rate": 9.817018909899889e-06, + "loss": 2.9021, + "mean_token_accuracy": 0.4314046877561056, + "step": 987 + }, + { + "epoch": 0.1831664812754913, + "grad_norm": 7.66796875, + "learning_rate": 9.816833518724511e-06, + "loss": 2.5, + "mean_token_accuracy": 0.463427432049533, + "step": 988 + }, + { + "epoch": 0.18335187245087134, + "grad_norm": 5.86328125, + "learning_rate": 9.81664812754913e-06, + "loss": 2.8316, + "mean_token_accuracy": 0.4455082176479975, + "step": 989 + }, + { + "epoch": 0.1835372636262514, + "grad_norm": 7.21484375, + "learning_rate": 9.81646273637375e-06, + "loss": 2.2925, + "mean_token_accuracy": 0.4972521181589192, + "step": 990 + }, + { + "epoch": 0.18372265480163144, + "grad_norm": 8.140625, + "learning_rate": 9.816277345198369e-06, + "loss": 3.1826, + "mean_token_accuracy": 0.39731561115454783, + "step": 991 + }, + { + "epoch": 0.1839080459770115, + "grad_norm": 7.06640625, + "learning_rate": 9.81609195402299e-06, + "loss": 3.2356, + "mean_token_accuracy": 0.3774932282688993, + "step": 992 + }, + { + "epoch": 0.18409343715239154, + "grad_norm": 6.515625, + "learning_rate": 9.815906562847608e-06, + "loss": 2.7563, + "mean_token_accuracy": 0.4504526851047997, + "step": 993 + }, + { + "epoch": 0.1842788283277716, + "grad_norm": 8.9921875, + "learning_rate": 9.815721171672229e-06, + "loss": 2.9787, + "mean_token_accuracy": 0.4243169398907104, + "step": 994 + }, + { + "epoch": 0.18446421950315164, + "grad_norm": 9.03125, + "learning_rate": 9.81553578049685e-06, + "loss": 2.6147, + "mean_token_accuracy": 0.47397880195307845, + "step": 995 + }, + { + "epoch": 0.1846496106785317, + "grad_norm": 8.5078125, + "learning_rate": 9.815350389321468e-06, + "loss": 2.3522, + "mean_token_accuracy": 0.47869738705110054, + "step": 996 + }, + { + "epoch": 0.18483500185391175, + "grad_norm": 7.359375, + "learning_rate": 9.81516499814609e-06, + "loss": 3.2108, + "mean_token_accuracy": 0.40973544973544973, + "step": 997 + }, + { + "epoch": 0.1850203930292918, + "grad_norm": 9.28125, + "learning_rate": 9.814979606970709e-06, + "loss": 2.8617, + "mean_token_accuracy": 0.4271401536235111, + "step": 998 + }, + { + "epoch": 0.18520578420467185, + "grad_norm": 7.3828125, + "learning_rate": 9.81479421579533e-06, + "loss": 3.1549, + "mean_token_accuracy": 0.41374045801526715, + "step": 999 + }, + { + "epoch": 0.1853911753800519, + "grad_norm": 6.76953125, + "learning_rate": 9.814608824619948e-06, + "loss": 2.7471, + "mean_token_accuracy": 0.4567751869775627, + "step": 1000 + }, + { + "epoch": 0.18557656655543195, + "grad_norm": 6.453125, + "learning_rate": 9.814423433444569e-06, + "loss": 2.6422, + "mean_token_accuracy": 0.46346863468634686, + "step": 1001 + }, + { + "epoch": 0.185761957730812, + "grad_norm": 14.5625, + "learning_rate": 9.81423804226919e-06, + "loss": 3.2903, + "mean_token_accuracy": 0.3851472471190781, + "step": 1002 + }, + { + "epoch": 0.18594734890619208, + "grad_norm": 7.12890625, + "learning_rate": 9.814052651093808e-06, + "loss": 3.0695, + "mean_token_accuracy": 0.4044219253800092, + "step": 1003 + }, + { + "epoch": 0.18613274008157213, + "grad_norm": 8.4609375, + "learning_rate": 9.813867259918429e-06, + "loss": 2.9237, + "mean_token_accuracy": 0.42072978924189997, + "step": 1004 + }, + { + "epoch": 0.18631813125695218, + "grad_norm": 11.5, + "learning_rate": 9.813681868743049e-06, + "loss": 3.2533, + "mean_token_accuracy": 0.39268362389254075, + "step": 1005 + }, + { + "epoch": 0.18650352243233223, + "grad_norm": 7.06640625, + "learning_rate": 9.81349647756767e-06, + "loss": 2.8126, + "mean_token_accuracy": 0.4327493447372051, + "step": 1006 + }, + { + "epoch": 0.18668891360771228, + "grad_norm": 9.015625, + "learning_rate": 9.813311086392288e-06, + "loss": 2.5965, + "mean_token_accuracy": 0.44166666666666665, + "step": 1007 + }, + { + "epoch": 0.18687430478309233, + "grad_norm": 10.53125, + "learning_rate": 9.813125695216909e-06, + "loss": 2.9113, + "mean_token_accuracy": 0.40158777711204313, + "step": 1008 + }, + { + "epoch": 0.18705969595847238, + "grad_norm": 13.765625, + "learning_rate": 9.812940304041528e-06, + "loss": 3.1759, + "mean_token_accuracy": 0.3845419847328244, + "step": 1009 + }, + { + "epoch": 0.18724508713385243, + "grad_norm": 9.390625, + "learning_rate": 9.812754912866148e-06, + "loss": 2.8142, + "mean_token_accuracy": 0.42088565763384006, + "step": 1010 + }, + { + "epoch": 0.18743047830923248, + "grad_norm": 6.74609375, + "learning_rate": 9.812569521690769e-06, + "loss": 2.9207, + "mean_token_accuracy": 0.4175474156652671, + "step": 1011 + }, + { + "epoch": 0.18761586948461254, + "grad_norm": 9.2421875, + "learning_rate": 9.812384130515387e-06, + "loss": 2.6521, + "mean_token_accuracy": 0.4331787521079258, + "step": 1012 + }, + { + "epoch": 0.18780126065999259, + "grad_norm": 6.53515625, + "learning_rate": 9.812198739340008e-06, + "loss": 2.6721, + "mean_token_accuracy": 0.4418887537321685, + "step": 1013 + }, + { + "epoch": 0.18798665183537264, + "grad_norm": 12.53125, + "learning_rate": 9.812013348164628e-06, + "loss": 2.6866, + "mean_token_accuracy": 0.44471544715447153, + "step": 1014 + }, + { + "epoch": 0.1881720430107527, + "grad_norm": 10.296875, + "learning_rate": 9.811827956989249e-06, + "loss": 2.7858, + "mean_token_accuracy": 0.4494631617919289, + "step": 1015 + }, + { + "epoch": 0.18835743418613274, + "grad_norm": 6.3125, + "learning_rate": 9.811642565813868e-06, + "loss": 2.9027, + "mean_token_accuracy": 0.43593202050175345, + "step": 1016 + }, + { + "epoch": 0.1885428253615128, + "grad_norm": 9.453125, + "learning_rate": 9.811457174638488e-06, + "loss": 2.7057, + "mean_token_accuracy": 0.44948985312254736, + "step": 1017 + }, + { + "epoch": 0.18872821653689284, + "grad_norm": 12.8359375, + "learning_rate": 9.811271783463107e-06, + "loss": 2.7934, + "mean_token_accuracy": 0.4257238469372909, + "step": 1018 + }, + { + "epoch": 0.1889136077122729, + "grad_norm": 6.63671875, + "learning_rate": 9.811086392287728e-06, + "loss": 2.9675, + "mean_token_accuracy": 0.40269897255022236, + "step": 1019 + }, + { + "epoch": 0.18909899888765294, + "grad_norm": 9.4765625, + "learning_rate": 9.810901001112348e-06, + "loss": 2.5321, + "mean_token_accuracy": 0.4683893195521103, + "step": 1020 + }, + { + "epoch": 0.189284390063033, + "grad_norm": 6.25390625, + "learning_rate": 9.810715609936968e-06, + "loss": 2.8699, + "mean_token_accuracy": 0.4243516687986975, + "step": 1021 + }, + { + "epoch": 0.18946978123841304, + "grad_norm": 6.52734375, + "learning_rate": 9.810530218761587e-06, + "loss": 3.0337, + "mean_token_accuracy": 0.41924460431654675, + "step": 1022 + }, + { + "epoch": 0.1896551724137931, + "grad_norm": 5.95703125, + "learning_rate": 9.810344827586208e-06, + "loss": 3.127, + "mean_token_accuracy": 0.40585569030993507, + "step": 1023 + }, + { + "epoch": 0.18984056358917314, + "grad_norm": 7.51953125, + "learning_rate": 9.810159436410828e-06, + "loss": 2.5803, + "mean_token_accuracy": 0.45498717775678227, + "step": 1024 + }, + { + "epoch": 0.1900259547645532, + "grad_norm": 8.5234375, + "learning_rate": 9.809974045235447e-06, + "loss": 2.7258, + "mean_token_accuracy": 0.43440384865427123, + "step": 1025 + }, + { + "epoch": 0.19021134593993325, + "grad_norm": 6.23046875, + "learning_rate": 9.809788654060068e-06, + "loss": 2.7584, + "mean_token_accuracy": 0.46048304796462186, + "step": 1026 + }, + { + "epoch": 0.19039673711531332, + "grad_norm": 8.78125, + "learning_rate": 9.809603262884686e-06, + "loss": 2.7614, + "mean_token_accuracy": 0.4380333715450261, + "step": 1027 + }, + { + "epoch": 0.19058212829069338, + "grad_norm": 7.125, + "learning_rate": 9.809417871709307e-06, + "loss": 2.8784, + "mean_token_accuracy": 0.43032699294721094, + "step": 1028 + }, + { + "epoch": 0.19076751946607343, + "grad_norm": 6.69140625, + "learning_rate": 9.809232480533927e-06, + "loss": 2.8202, + "mean_token_accuracy": 0.42830009496676164, + "step": 1029 + }, + { + "epoch": 0.19095291064145348, + "grad_norm": 8.3828125, + "learning_rate": 9.809047089358548e-06, + "loss": 2.5601, + "mean_token_accuracy": 0.46968545813706125, + "step": 1030 + }, + { + "epoch": 0.19113830181683353, + "grad_norm": 6.8515625, + "learning_rate": 9.808861698183167e-06, + "loss": 2.8931, + "mean_token_accuracy": 0.4442016806722689, + "step": 1031 + }, + { + "epoch": 0.19132369299221358, + "grad_norm": 6.6875, + "learning_rate": 9.808676307007787e-06, + "loss": 3.2197, + "mean_token_accuracy": 0.4037063435495367, + "step": 1032 + }, + { + "epoch": 0.19150908416759363, + "grad_norm": 6.2734375, + "learning_rate": 9.808490915832408e-06, + "loss": 3.1832, + "mean_token_accuracy": 0.3869047619047619, + "step": 1033 + }, + { + "epoch": 0.19169447534297368, + "grad_norm": 7.70703125, + "learning_rate": 9.808305524657026e-06, + "loss": 2.9157, + "mean_token_accuracy": 0.42330226364846874, + "step": 1034 + }, + { + "epoch": 0.19187986651835373, + "grad_norm": 7.453125, + "learning_rate": 9.808120133481647e-06, + "loss": 2.5107, + "mean_token_accuracy": 0.48006245496036515, + "step": 1035 + }, + { + "epoch": 0.19206525769373378, + "grad_norm": 5.61328125, + "learning_rate": 9.807934742306266e-06, + "loss": 2.8004, + "mean_token_accuracy": 0.44635676371240146, + "step": 1036 + }, + { + "epoch": 0.19225064886911383, + "grad_norm": 7.3359375, + "learning_rate": 9.807749351130888e-06, + "loss": 2.853, + "mean_token_accuracy": 0.4311504424778761, + "step": 1037 + }, + { + "epoch": 0.19243604004449388, + "grad_norm": 7.734375, + "learning_rate": 9.807563959955507e-06, + "loss": 2.7605, + "mean_token_accuracy": 0.43736263736263736, + "step": 1038 + }, + { + "epoch": 0.19262143121987393, + "grad_norm": 6.08203125, + "learning_rate": 9.807378568780127e-06, + "loss": 3.1788, + "mean_token_accuracy": 0.39853353461289626, + "step": 1039 + }, + { + "epoch": 0.19280682239525399, + "grad_norm": 16.40625, + "learning_rate": 9.807193177604748e-06, + "loss": 2.5596, + "mean_token_accuracy": 0.4575200270788672, + "step": 1040 + }, + { + "epoch": 0.19299221357063404, + "grad_norm": 8.359375, + "learning_rate": 9.807007786429366e-06, + "loss": 2.8605, + "mean_token_accuracy": 0.4215727209464161, + "step": 1041 + }, + { + "epoch": 0.1931776047460141, + "grad_norm": 11.6953125, + "learning_rate": 9.806822395253987e-06, + "loss": 2.8117, + "mean_token_accuracy": 0.4198639061821341, + "step": 1042 + }, + { + "epoch": 0.19336299592139414, + "grad_norm": 8.6875, + "learning_rate": 9.806637004078606e-06, + "loss": 3.0622, + "mean_token_accuracy": 0.39813895781637715, + "step": 1043 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 7.8046875, + "learning_rate": 9.806451612903226e-06, + "loss": 3.1685, + "mean_token_accuracy": 0.3736914600550964, + "step": 1044 + }, + { + "epoch": 0.19373377827215424, + "grad_norm": 5.85546875, + "learning_rate": 9.806266221727847e-06, + "loss": 3.3404, + "mean_token_accuracy": 0.3777093925608777, + "step": 1045 + }, + { + "epoch": 0.1939191694475343, + "grad_norm": 9.625, + "learning_rate": 9.806080830552467e-06, + "loss": 2.6389, + "mean_token_accuracy": 0.447255880256593, + "step": 1046 + }, + { + "epoch": 0.19410456062291434, + "grad_norm": 9.625, + "learning_rate": 9.805895439377086e-06, + "loss": 2.9779, + "mean_token_accuracy": 0.42087752131420253, + "step": 1047 + }, + { + "epoch": 0.1942899517982944, + "grad_norm": 7.984375, + "learning_rate": 9.805710048201707e-06, + "loss": 2.931, + "mean_token_accuracy": 0.42249962847377026, + "step": 1048 + }, + { + "epoch": 0.19447534297367444, + "grad_norm": 6.66796875, + "learning_rate": 9.805524657026327e-06, + "loss": 2.8385, + "mean_token_accuracy": 0.4353510895883777, + "step": 1049 + }, + { + "epoch": 0.1946607341490545, + "grad_norm": 10.6640625, + "learning_rate": 9.805339265850946e-06, + "loss": 2.5728, + "mean_token_accuracy": 0.4534898891063275, + "step": 1050 + }, + { + "epoch": 0.19484612532443454, + "grad_norm": 6.71484375, + "learning_rate": 9.805153874675566e-06, + "loss": 2.8725, + "mean_token_accuracy": 0.44359083795703513, + "step": 1051 + }, + { + "epoch": 0.19503151649981462, + "grad_norm": 6.45703125, + "learning_rate": 9.804968483500185e-06, + "loss": 2.8243, + "mean_token_accuracy": 0.43431442928930364, + "step": 1052 + }, + { + "epoch": 0.19521690767519467, + "grad_norm": 5.70703125, + "learning_rate": 9.804783092324807e-06, + "loss": 3.1598, + "mean_token_accuracy": 0.40442149854381787, + "step": 1053 + }, + { + "epoch": 0.19540229885057472, + "grad_norm": 6.97265625, + "learning_rate": 9.804597701149426e-06, + "loss": 3.4106, + "mean_token_accuracy": 0.36759236300520703, + "step": 1054 + }, + { + "epoch": 0.19558769002595477, + "grad_norm": 15.3203125, + "learning_rate": 9.804412309974047e-06, + "loss": 2.222, + "mean_token_accuracy": 0.46537396121883656, + "step": 1055 + }, + { + "epoch": 0.19577308120133483, + "grad_norm": 8.5859375, + "learning_rate": 9.804226918798665e-06, + "loss": 2.6373, + "mean_token_accuracy": 0.45280784844384303, + "step": 1056 + }, + { + "epoch": 0.19595847237671488, + "grad_norm": 7.50390625, + "learning_rate": 9.804041527623286e-06, + "loss": 3.2508, + "mean_token_accuracy": 0.3787171622883051, + "step": 1057 + }, + { + "epoch": 0.19614386355209493, + "grad_norm": 6.8046875, + "learning_rate": 9.803856136447906e-06, + "loss": 2.7307, + "mean_token_accuracy": 0.44530870199319394, + "step": 1058 + }, + { + "epoch": 0.19632925472747498, + "grad_norm": 8.1796875, + "learning_rate": 9.803670745272525e-06, + "loss": 2.7371, + "mean_token_accuracy": 0.4481875240169079, + "step": 1059 + }, + { + "epoch": 0.19651464590285503, + "grad_norm": 10.265625, + "learning_rate": 9.803485354097146e-06, + "loss": 2.8798, + "mean_token_accuracy": 0.4278382907415165, + "step": 1060 + }, + { + "epoch": 0.19670003707823508, + "grad_norm": 9.09375, + "learning_rate": 9.803299962921766e-06, + "loss": 2.5168, + "mean_token_accuracy": 0.47246184472461844, + "step": 1061 + }, + { + "epoch": 0.19688542825361513, + "grad_norm": 5.7109375, + "learning_rate": 9.803114571746387e-06, + "loss": 3.0063, + "mean_token_accuracy": 0.408772874058127, + "step": 1062 + }, + { + "epoch": 0.19707081942899518, + "grad_norm": 8.1171875, + "learning_rate": 9.802929180571005e-06, + "loss": 2.9961, + "mean_token_accuracy": 0.42447696214778086, + "step": 1063 + }, + { + "epoch": 0.19725621060437523, + "grad_norm": 7.59765625, + "learning_rate": 9.802743789395626e-06, + "loss": 3.3244, + "mean_token_accuracy": 0.36967936543402974, + "step": 1064 + }, + { + "epoch": 0.19744160177975528, + "grad_norm": 7.47265625, + "learning_rate": 9.802558398220245e-06, + "loss": 2.9134, + "mean_token_accuracy": 0.43033292231812575, + "step": 1065 + }, + { + "epoch": 0.19762699295513533, + "grad_norm": 6.15234375, + "learning_rate": 9.802373007044865e-06, + "loss": 2.9202, + "mean_token_accuracy": 0.4260120350109409, + "step": 1066 + }, + { + "epoch": 0.19781238413051538, + "grad_norm": 8.234375, + "learning_rate": 9.802187615869486e-06, + "loss": 3.2256, + "mean_token_accuracy": 0.4077703087615358, + "step": 1067 + }, + { + "epoch": 0.19799777530589543, + "grad_norm": 10.2109375, + "learning_rate": 9.802002224694104e-06, + "loss": 2.8775, + "mean_token_accuracy": 0.42418032786885246, + "step": 1068 + }, + { + "epoch": 0.19818316648127549, + "grad_norm": 7.55859375, + "learning_rate": 9.801816833518727e-06, + "loss": 3.1701, + "mean_token_accuracy": 0.39383715699505173, + "step": 1069 + }, + { + "epoch": 0.19836855765665554, + "grad_norm": 9.703125, + "learning_rate": 9.801631442343345e-06, + "loss": 2.8559, + "mean_token_accuracy": 0.41557115507338865, + "step": 1070 + }, + { + "epoch": 0.1985539488320356, + "grad_norm": 11.65625, + "learning_rate": 9.801446051167966e-06, + "loss": 2.6661, + "mean_token_accuracy": 0.43855539287457296, + "step": 1071 + }, + { + "epoch": 0.19873934000741564, + "grad_norm": 9.28125, + "learning_rate": 9.801260659992585e-06, + "loss": 2.6959, + "mean_token_accuracy": 0.47648841528986213, + "step": 1072 + }, + { + "epoch": 0.1989247311827957, + "grad_norm": 6.59765625, + "learning_rate": 9.801075268817205e-06, + "loss": 2.7289, + "mean_token_accuracy": 0.44233420125593503, + "step": 1073 + }, + { + "epoch": 0.19911012235817574, + "grad_norm": 8.2890625, + "learning_rate": 9.800889877641824e-06, + "loss": 3.0608, + "mean_token_accuracy": 0.43058682275251386, + "step": 1074 + }, + { + "epoch": 0.1992955135335558, + "grad_norm": 8.2421875, + "learning_rate": 9.800704486466445e-06, + "loss": 3.4542, + "mean_token_accuracy": 0.38519845644983464, + "step": 1075 + }, + { + "epoch": 0.19948090470893584, + "grad_norm": 7.58203125, + "learning_rate": 9.800519095291065e-06, + "loss": 2.809, + "mean_token_accuracy": 0.4396375701888719, + "step": 1076 + }, + { + "epoch": 0.19966629588431592, + "grad_norm": 7.734375, + "learning_rate": 9.800333704115686e-06, + "loss": 3.1592, + "mean_token_accuracy": 0.39954392702832453, + "step": 1077 + }, + { + "epoch": 0.19985168705969597, + "grad_norm": 8.828125, + "learning_rate": 9.800148312940306e-06, + "loss": 2.6994, + "mean_token_accuracy": 0.45293150684931505, + "step": 1078 + }, + { + "epoch": 0.20003707823507602, + "grad_norm": 6.16796875, + "learning_rate": 9.799962921764925e-06, + "loss": 3.5462, + "mean_token_accuracy": 0.3581456953642384, + "step": 1079 + }, + { + "epoch": 0.20022246941045607, + "grad_norm": 8.953125, + "learning_rate": 9.799777530589545e-06, + "loss": 2.9314, + "mean_token_accuracy": 0.42709660973744584, + "step": 1080 + }, + { + "epoch": 0.20040786058583612, + "grad_norm": 6.36328125, + "learning_rate": 9.799592139414164e-06, + "loss": 3.286, + "mean_token_accuracy": 0.38740204051456456, + "step": 1081 + }, + { + "epoch": 0.20059325176121617, + "grad_norm": 6.39453125, + "learning_rate": 9.799406748238785e-06, + "loss": 2.9125, + "mean_token_accuracy": 0.4343953838434521, + "step": 1082 + }, + { + "epoch": 0.20077864293659622, + "grad_norm": 7.03125, + "learning_rate": 9.799221357063405e-06, + "loss": 3.0987, + "mean_token_accuracy": 0.40029286474973375, + "step": 1083 + }, + { + "epoch": 0.20096403411197628, + "grad_norm": 8.609375, + "learning_rate": 9.799035965888024e-06, + "loss": 2.7017, + "mean_token_accuracy": 0.4223855285472018, + "step": 1084 + }, + { + "epoch": 0.20114942528735633, + "grad_norm": 7.8515625, + "learning_rate": 9.798850574712644e-06, + "loss": 2.577, + "mean_token_accuracy": 0.4589793915603533, + "step": 1085 + }, + { + "epoch": 0.20133481646273638, + "grad_norm": 6.96875, + "learning_rate": 9.798665183537265e-06, + "loss": 2.5672, + "mean_token_accuracy": 0.47958900305470703, + "step": 1086 + }, + { + "epoch": 0.20152020763811643, + "grad_norm": 6.26953125, + "learning_rate": 9.798479792361885e-06, + "loss": 2.8188, + "mean_token_accuracy": 0.44358299875398033, + "step": 1087 + }, + { + "epoch": 0.20170559881349648, + "grad_norm": 6.5625, + "learning_rate": 9.798294401186504e-06, + "loss": 3.2931, + "mean_token_accuracy": 0.38922655225837205, + "step": 1088 + }, + { + "epoch": 0.20189098998887653, + "grad_norm": 8.109375, + "learning_rate": 9.798109010011125e-06, + "loss": 3.3934, + "mean_token_accuracy": 0.3894822225226443, + "step": 1089 + }, + { + "epoch": 0.20207638116425658, + "grad_norm": 11.4453125, + "learning_rate": 9.797923618835743e-06, + "loss": 2.5942, + "mean_token_accuracy": 0.4601639946151022, + "step": 1090 + }, + { + "epoch": 0.20226177233963663, + "grad_norm": 8.6328125, + "learning_rate": 9.797738227660364e-06, + "loss": 2.9568, + "mean_token_accuracy": 0.4112517580872011, + "step": 1091 + }, + { + "epoch": 0.20244716351501668, + "grad_norm": 7.6953125, + "learning_rate": 9.797552836484984e-06, + "loss": 2.7999, + "mean_token_accuracy": 0.4473579129508851, + "step": 1092 + }, + { + "epoch": 0.20263255469039673, + "grad_norm": 6.0703125, + "learning_rate": 9.797367445309605e-06, + "loss": 2.4338, + "mean_token_accuracy": 0.47934721440630274, + "step": 1093 + }, + { + "epoch": 0.20281794586577678, + "grad_norm": 6.34375, + "learning_rate": 9.797182054134224e-06, + "loss": 2.4816, + "mean_token_accuracy": 0.47784146511953024, + "step": 1094 + }, + { + "epoch": 0.20300333704115683, + "grad_norm": 8.6875, + "learning_rate": 9.796996662958844e-06, + "loss": 3.0137, + "mean_token_accuracy": 0.4271518905665247, + "step": 1095 + }, + { + "epoch": 0.20318872821653688, + "grad_norm": 5.96484375, + "learning_rate": 9.796811271783465e-06, + "loss": 3.0285, + "mean_token_accuracy": 0.4043307585158921, + "step": 1096 + }, + { + "epoch": 0.20337411939191694, + "grad_norm": 6.15625, + "learning_rate": 9.796625880608083e-06, + "loss": 2.756, + "mean_token_accuracy": 0.4442196141912856, + "step": 1097 + }, + { + "epoch": 0.203559510567297, + "grad_norm": 10.0234375, + "learning_rate": 9.796440489432704e-06, + "loss": 2.5152, + "mean_token_accuracy": 0.4513156299260337, + "step": 1098 + }, + { + "epoch": 0.20374490174267704, + "grad_norm": 7.69921875, + "learning_rate": 9.796255098257323e-06, + "loss": 2.8944, + "mean_token_accuracy": 0.41858353510895885, + "step": 1099 + }, + { + "epoch": 0.2039302929180571, + "grad_norm": 5.78125, + "learning_rate": 9.796069707081943e-06, + "loss": 3.1327, + "mean_token_accuracy": 0.4126669965363681, + "step": 1100 + }, + { + "epoch": 0.20411568409343714, + "grad_norm": 6.609375, + "learning_rate": 9.795884315906564e-06, + "loss": 3.2587, + "mean_token_accuracy": 0.39227285331361494, + "step": 1101 + }, + { + "epoch": 0.20430107526881722, + "grad_norm": 8.1875, + "learning_rate": 9.795698924731184e-06, + "loss": 2.6494, + "mean_token_accuracy": 0.4513888888888889, + "step": 1102 + }, + { + "epoch": 0.20448646644419727, + "grad_norm": 9.0, + "learning_rate": 9.795513533555803e-06, + "loss": 3.3998, + "mean_token_accuracy": 0.3798411728772144, + "step": 1103 + }, + { + "epoch": 0.20467185761957732, + "grad_norm": 6.46875, + "learning_rate": 9.795328142380424e-06, + "loss": 2.4871, + "mean_token_accuracy": 0.4683992003690604, + "step": 1104 + }, + { + "epoch": 0.20485724879495737, + "grad_norm": 5.7421875, + "learning_rate": 9.795142751205044e-06, + "loss": 2.6401, + "mean_token_accuracy": 0.46563407550822844, + "step": 1105 + }, + { + "epoch": 0.20504263997033742, + "grad_norm": 11.2890625, + "learning_rate": 9.794957360029663e-06, + "loss": 2.6295, + "mean_token_accuracy": 0.4567544809334777, + "step": 1106 + }, + { + "epoch": 0.20522803114571747, + "grad_norm": 5.75, + "learning_rate": 9.794771968854283e-06, + "loss": 2.4093, + "mean_token_accuracy": 0.5070866141732283, + "step": 1107 + }, + { + "epoch": 0.20541342232109752, + "grad_norm": 6.33984375, + "learning_rate": 9.794586577678902e-06, + "loss": 2.6209, + "mean_token_accuracy": 0.4647347623039157, + "step": 1108 + }, + { + "epoch": 0.20559881349647757, + "grad_norm": 6.78515625, + "learning_rate": 9.794401186503524e-06, + "loss": 2.8667, + "mean_token_accuracy": 0.4042606832219841, + "step": 1109 + }, + { + "epoch": 0.20578420467185762, + "grad_norm": 5.6953125, + "learning_rate": 9.794215795328143e-06, + "loss": 3.1588, + "mean_token_accuracy": 0.4005568333131582, + "step": 1110 + }, + { + "epoch": 0.20596959584723767, + "grad_norm": 6.08984375, + "learning_rate": 9.794030404152764e-06, + "loss": 2.8841, + "mean_token_accuracy": 0.42847533632286994, + "step": 1111 + }, + { + "epoch": 0.20615498702261773, + "grad_norm": 8.171875, + "learning_rate": 9.793845012977382e-06, + "loss": 2.8357, + "mean_token_accuracy": 0.4432359550561798, + "step": 1112 + }, + { + "epoch": 0.20634037819799778, + "grad_norm": 7.10546875, + "learning_rate": 9.793659621802003e-06, + "loss": 2.9058, + "mean_token_accuracy": 0.42099605876030094, + "step": 1113 + }, + { + "epoch": 0.20652576937337783, + "grad_norm": 9.3203125, + "learning_rate": 9.793474230626623e-06, + "loss": 2.768, + "mean_token_accuracy": 0.4348502528199144, + "step": 1114 + }, + { + "epoch": 0.20671116054875788, + "grad_norm": 7.4375, + "learning_rate": 9.793288839451242e-06, + "loss": 2.9841, + "mean_token_accuracy": 0.4286407766990291, + "step": 1115 + }, + { + "epoch": 0.20689655172413793, + "grad_norm": 7.03125, + "learning_rate": 9.793103448275863e-06, + "loss": 2.8867, + "mean_token_accuracy": 0.41017344033134867, + "step": 1116 + }, + { + "epoch": 0.20708194289951798, + "grad_norm": 7.63671875, + "learning_rate": 9.792918057100481e-06, + "loss": 2.7961, + "mean_token_accuracy": 0.44319258713277876, + "step": 1117 + }, + { + "epoch": 0.20726733407489803, + "grad_norm": 8.6328125, + "learning_rate": 9.792732665925104e-06, + "loss": 2.687, + "mean_token_accuracy": 0.4358334427952187, + "step": 1118 + }, + { + "epoch": 0.20745272525027808, + "grad_norm": 6.98046875, + "learning_rate": 9.792547274749722e-06, + "loss": 3.0901, + "mean_token_accuracy": 0.389687235841082, + "step": 1119 + }, + { + "epoch": 0.20763811642565813, + "grad_norm": 5.7734375, + "learning_rate": 9.792361883574343e-06, + "loss": 2.5734, + "mean_token_accuracy": 0.46616753778782316, + "step": 1120 + }, + { + "epoch": 0.20782350760103818, + "grad_norm": 9.6484375, + "learning_rate": 9.792176492398963e-06, + "loss": 2.7232, + "mean_token_accuracy": 0.45009185548071035, + "step": 1121 + }, + { + "epoch": 0.20800889877641823, + "grad_norm": 8.6328125, + "learning_rate": 9.791991101223582e-06, + "loss": 3.2123, + "mean_token_accuracy": 0.3910644742535699, + "step": 1122 + }, + { + "epoch": 0.20819428995179828, + "grad_norm": 7.87109375, + "learning_rate": 9.791805710048203e-06, + "loss": 2.5421, + "mean_token_accuracy": 0.44792528667691217, + "step": 1123 + }, + { + "epoch": 0.20837968112717833, + "grad_norm": 6.19140625, + "learning_rate": 9.791620318872821e-06, + "loss": 2.7563, + "mean_token_accuracy": 0.4462567963195316, + "step": 1124 + }, + { + "epoch": 0.20856507230255839, + "grad_norm": 6.9921875, + "learning_rate": 9.791434927697442e-06, + "loss": 2.8709, + "mean_token_accuracy": 0.4415215989684075, + "step": 1125 + }, + { + "epoch": 0.20875046347793846, + "grad_norm": 6.0390625, + "learning_rate": 9.791249536522062e-06, + "loss": 2.9661, + "mean_token_accuracy": 0.4276850958224183, + "step": 1126 + }, + { + "epoch": 0.20893585465331851, + "grad_norm": 5.79296875, + "learning_rate": 9.791064145346683e-06, + "loss": 3.3114, + "mean_token_accuracy": 0.3963323971584338, + "step": 1127 + }, + { + "epoch": 0.20912124582869857, + "grad_norm": 8.234375, + "learning_rate": 9.790878754171302e-06, + "loss": 3.0924, + "mean_token_accuracy": 0.3976329917032699, + "step": 1128 + }, + { + "epoch": 0.20930663700407862, + "grad_norm": 8.484375, + "learning_rate": 9.790693362995922e-06, + "loss": 2.2897, + "mean_token_accuracy": 0.5095942120163574, + "step": 1129 + }, + { + "epoch": 0.20949202817945867, + "grad_norm": 6.53515625, + "learning_rate": 9.790507971820543e-06, + "loss": 2.709, + "mean_token_accuracy": 0.44899978017146625, + "step": 1130 + }, + { + "epoch": 0.20967741935483872, + "grad_norm": 7.0546875, + "learning_rate": 9.790322580645162e-06, + "loss": 3.2162, + "mean_token_accuracy": 0.3875460405156538, + "step": 1131 + }, + { + "epoch": 0.20986281053021877, + "grad_norm": 5.94921875, + "learning_rate": 9.790137189469782e-06, + "loss": 3.2783, + "mean_token_accuracy": 0.40025209121118366, + "step": 1132 + }, + { + "epoch": 0.21004820170559882, + "grad_norm": 7.921875, + "learning_rate": 9.7899517982944e-06, + "loss": 2.6593, + "mean_token_accuracy": 0.4517487639626442, + "step": 1133 + }, + { + "epoch": 0.21023359288097887, + "grad_norm": 6.6015625, + "learning_rate": 9.789766407119023e-06, + "loss": 3.0492, + "mean_token_accuracy": 0.415185588199734, + "step": 1134 + }, + { + "epoch": 0.21041898405635892, + "grad_norm": 7.4765625, + "learning_rate": 9.789581015943642e-06, + "loss": 3.0749, + "mean_token_accuracy": 0.4105833598979917, + "step": 1135 + }, + { + "epoch": 0.21060437523173897, + "grad_norm": 5.6796875, + "learning_rate": 9.789395624768262e-06, + "loss": 3.0489, + "mean_token_accuracy": 0.41921847613155994, + "step": 1136 + }, + { + "epoch": 0.21078976640711902, + "grad_norm": 7.2109375, + "learning_rate": 9.789210233592881e-06, + "loss": 2.7799, + "mean_token_accuracy": 0.43996188055908514, + "step": 1137 + }, + { + "epoch": 0.21097515758249907, + "grad_norm": 7.53125, + "learning_rate": 9.789024842417502e-06, + "loss": 3.2321, + "mean_token_accuracy": 0.3952324195470799, + "step": 1138 + }, + { + "epoch": 0.21116054875787912, + "grad_norm": 6.9609375, + "learning_rate": 9.788839451242122e-06, + "loss": 2.8977, + "mean_token_accuracy": 0.4318699680996506, + "step": 1139 + }, + { + "epoch": 0.21134593993325917, + "grad_norm": 6.29296875, + "learning_rate": 9.788654060066741e-06, + "loss": 3.1301, + "mean_token_accuracy": 0.4022207707380797, + "step": 1140 + }, + { + "epoch": 0.21153133110863923, + "grad_norm": 5.1171875, + "learning_rate": 9.788468668891361e-06, + "loss": 2.7294, + "mean_token_accuracy": 0.4444444444444444, + "step": 1141 + }, + { + "epoch": 0.21171672228401928, + "grad_norm": 8.375, + "learning_rate": 9.788283277715982e-06, + "loss": 2.7615, + "mean_token_accuracy": 0.4364328706547107, + "step": 1142 + }, + { + "epoch": 0.21190211345939933, + "grad_norm": 6.0703125, + "learning_rate": 9.788097886540602e-06, + "loss": 2.7843, + "mean_token_accuracy": 0.4460633230596806, + "step": 1143 + }, + { + "epoch": 0.21208750463477938, + "grad_norm": 8.75, + "learning_rate": 9.787912495365221e-06, + "loss": 2.8443, + "mean_token_accuracy": 0.4269622093023256, + "step": 1144 + }, + { + "epoch": 0.21227289581015943, + "grad_norm": 5.890625, + "learning_rate": 9.787727104189842e-06, + "loss": 2.6167, + "mean_token_accuracy": 0.44726810673443457, + "step": 1145 + }, + { + "epoch": 0.21245828698553948, + "grad_norm": 16.140625, + "learning_rate": 9.78754171301446e-06, + "loss": 2.5632, + "mean_token_accuracy": 0.45762554534484357, + "step": 1146 + }, + { + "epoch": 0.21264367816091953, + "grad_norm": 6.9375, + "learning_rate": 9.787356321839081e-06, + "loss": 2.8794, + "mean_token_accuracy": 0.4282909930715935, + "step": 1147 + }, + { + "epoch": 0.21282906933629958, + "grad_norm": 5.86328125, + "learning_rate": 9.787170930663701e-06, + "loss": 2.6344, + "mean_token_accuracy": 0.4487199872793767, + "step": 1148 + }, + { + "epoch": 0.21301446051167963, + "grad_norm": 6.14453125, + "learning_rate": 9.78698553948832e-06, + "loss": 2.9224, + "mean_token_accuracy": 0.42774301846482526, + "step": 1149 + }, + { + "epoch": 0.21319985168705968, + "grad_norm": 6.9140625, + "learning_rate": 9.78680014831294e-06, + "loss": 3.3961, + "mean_token_accuracy": 0.3858891288696904, + "step": 1150 + }, + { + "epoch": 0.21338524286243976, + "grad_norm": 9.546875, + "learning_rate": 9.786614757137561e-06, + "loss": 2.9146, + "mean_token_accuracy": 0.4180452654014006, + "step": 1151 + }, + { + "epoch": 0.2135706340378198, + "grad_norm": 5.9921875, + "learning_rate": 9.786429365962182e-06, + "loss": 2.4937, + "mean_token_accuracy": 0.49495576594754, + "step": 1152 + }, + { + "epoch": 0.21375602521319986, + "grad_norm": 10.1171875, + "learning_rate": 9.7862439747868e-06, + "loss": 2.7787, + "mean_token_accuracy": 0.43734939759036146, + "step": 1153 + }, + { + "epoch": 0.2139414163885799, + "grad_norm": 9.8203125, + "learning_rate": 9.786058583611421e-06, + "loss": 3.0979, + "mean_token_accuracy": 0.3965096881817015, + "step": 1154 + }, + { + "epoch": 0.21412680756395996, + "grad_norm": 9.015625, + "learning_rate": 9.78587319243604e-06, + "loss": 2.6714, + "mean_token_accuracy": 0.43896882494004796, + "step": 1155 + }, + { + "epoch": 0.21431219873934002, + "grad_norm": 6.4921875, + "learning_rate": 9.78568780126066e-06, + "loss": 3.2057, + "mean_token_accuracy": 0.4044987146529563, + "step": 1156 + }, + { + "epoch": 0.21449758991472007, + "grad_norm": 7.4453125, + "learning_rate": 9.78550241008528e-06, + "loss": 2.8015, + "mean_token_accuracy": 0.4235974409448819, + "step": 1157 + }, + { + "epoch": 0.21468298109010012, + "grad_norm": 7.97265625, + "learning_rate": 9.785317018909901e-06, + "loss": 2.7838, + "mean_token_accuracy": 0.4342302690807242, + "step": 1158 + }, + { + "epoch": 0.21486837226548017, + "grad_norm": 7.109375, + "learning_rate": 9.785131627734522e-06, + "loss": 2.8609, + "mean_token_accuracy": 0.4235016934177588, + "step": 1159 + }, + { + "epoch": 0.21505376344086022, + "grad_norm": 5.87890625, + "learning_rate": 9.78494623655914e-06, + "loss": 3.052, + "mean_token_accuracy": 0.411, + "step": 1160 + }, + { + "epoch": 0.21523915461624027, + "grad_norm": 9.328125, + "learning_rate": 9.784760845383761e-06, + "loss": 3.1273, + "mean_token_accuracy": 0.3937896207155904, + "step": 1161 + }, + { + "epoch": 0.21542454579162032, + "grad_norm": 8.2109375, + "learning_rate": 9.78457545420838e-06, + "loss": 2.7098, + "mean_token_accuracy": 0.44323027718550106, + "step": 1162 + }, + { + "epoch": 0.21560993696700037, + "grad_norm": 7.64453125, + "learning_rate": 9.784390063033e-06, + "loss": 2.7971, + "mean_token_accuracy": 0.46920380570856285, + "step": 1163 + }, + { + "epoch": 0.21579532814238042, + "grad_norm": 5.63671875, + "learning_rate": 9.78420467185762e-06, + "loss": 2.8516, + "mean_token_accuracy": 0.4115293420272673, + "step": 1164 + }, + { + "epoch": 0.21598071931776047, + "grad_norm": 8.78125, + "learning_rate": 9.78401928068224e-06, + "loss": 2.7246, + "mean_token_accuracy": 0.4480143263457284, + "step": 1165 + }, + { + "epoch": 0.21616611049314052, + "grad_norm": 5.77734375, + "learning_rate": 9.78383388950686e-06, + "loss": 2.8348, + "mean_token_accuracy": 0.43222976796830787, + "step": 1166 + }, + { + "epoch": 0.21635150166852057, + "grad_norm": 5.77734375, + "learning_rate": 9.78364849833148e-06, + "loss": 2.831, + "mean_token_accuracy": 0.41485913737222635, + "step": 1167 + }, + { + "epoch": 0.21653689284390062, + "grad_norm": 7.9921875, + "learning_rate": 9.783463107156101e-06, + "loss": 2.8661, + "mean_token_accuracy": 0.4450830140485313, + "step": 1168 + }, + { + "epoch": 0.21672228401928068, + "grad_norm": 5.6640625, + "learning_rate": 9.78327771598072e-06, + "loss": 3.0035, + "mean_token_accuracy": 0.41605335786568537, + "step": 1169 + }, + { + "epoch": 0.21690767519466073, + "grad_norm": 7.4921875, + "learning_rate": 9.78309232480534e-06, + "loss": 2.8281, + "mean_token_accuracy": 0.4479394449116905, + "step": 1170 + }, + { + "epoch": 0.21709306637004078, + "grad_norm": 7.08984375, + "learning_rate": 9.78290693362996e-06, + "loss": 3.1391, + "mean_token_accuracy": 0.40062272963155166, + "step": 1171 + }, + { + "epoch": 0.21727845754542083, + "grad_norm": 6.1484375, + "learning_rate": 9.78272154245458e-06, + "loss": 2.7473, + "mean_token_accuracy": 0.4432754468246926, + "step": 1172 + }, + { + "epoch": 0.21746384872080088, + "grad_norm": 6.8046875, + "learning_rate": 9.7825361512792e-06, + "loss": 2.7259, + "mean_token_accuracy": 0.45585822120118147, + "step": 1173 + }, + { + "epoch": 0.21764923989618093, + "grad_norm": 7.04296875, + "learning_rate": 9.78235076010382e-06, + "loss": 3.261, + "mean_token_accuracy": 0.38257439942631766, + "step": 1174 + }, + { + "epoch": 0.21783463107156098, + "grad_norm": 6.07421875, + "learning_rate": 9.78216536892844e-06, + "loss": 2.6737, + "mean_token_accuracy": 0.4419543429844098, + "step": 1175 + }, + { + "epoch": 0.21802002224694106, + "grad_norm": 10.0390625, + "learning_rate": 9.78197997775306e-06, + "loss": 2.4634, + "mean_token_accuracy": 0.4530123759009928, + "step": 1176 + }, + { + "epoch": 0.2182054134223211, + "grad_norm": 9.296875, + "learning_rate": 9.78179458657768e-06, + "loss": 2.839, + "mean_token_accuracy": 0.42993858020282816, + "step": 1177 + }, + { + "epoch": 0.21839080459770116, + "grad_norm": 17.25, + "learning_rate": 9.7816091954023e-06, + "loss": 2.197, + "mean_token_accuracy": 0.4897648847818458, + "step": 1178 + }, + { + "epoch": 0.2185761957730812, + "grad_norm": 5.33203125, + "learning_rate": 9.78142380422692e-06, + "loss": 2.4788, + "mean_token_accuracy": 0.48766217247519716, + "step": 1179 + }, + { + "epoch": 0.21876158694846126, + "grad_norm": 6.4453125, + "learning_rate": 9.781238413051539e-06, + "loss": 2.9532, + "mean_token_accuracy": 0.42524773804394655, + "step": 1180 + }, + { + "epoch": 0.2189469781238413, + "grad_norm": 7.6328125, + "learning_rate": 9.781053021876159e-06, + "loss": 2.6732, + "mean_token_accuracy": 0.4523900088753645, + "step": 1181 + }, + { + "epoch": 0.21913236929922136, + "grad_norm": 10.0078125, + "learning_rate": 9.78086763070078e-06, + "loss": 2.2347, + "mean_token_accuracy": 0.5073031170004517, + "step": 1182 + }, + { + "epoch": 0.21931776047460141, + "grad_norm": 10.1328125, + "learning_rate": 9.7806822395254e-06, + "loss": 3.0516, + "mean_token_accuracy": 0.40474940960377853, + "step": 1183 + }, + { + "epoch": 0.21950315164998146, + "grad_norm": 7.4609375, + "learning_rate": 9.780496848350019e-06, + "loss": 2.6507, + "mean_token_accuracy": 0.44364672364672364, + "step": 1184 + }, + { + "epoch": 0.21968854282536152, + "grad_norm": 7.55078125, + "learning_rate": 9.78031145717464e-06, + "loss": 2.9071, + "mean_token_accuracy": 0.42946872546453807, + "step": 1185 + }, + { + "epoch": 0.21987393400074157, + "grad_norm": 5.5625, + "learning_rate": 9.78012606599926e-06, + "loss": 3.155, + "mean_token_accuracy": 0.4002143048486472, + "step": 1186 + }, + { + "epoch": 0.22005932517612162, + "grad_norm": 7.703125, + "learning_rate": 9.779940674823879e-06, + "loss": 2.5232, + "mean_token_accuracy": 0.4785413744740533, + "step": 1187 + }, + { + "epoch": 0.22024471635150167, + "grad_norm": 5.546875, + "learning_rate": 9.779755283648499e-06, + "loss": 3.1383, + "mean_token_accuracy": 0.40865491858130715, + "step": 1188 + }, + { + "epoch": 0.22043010752688172, + "grad_norm": 9.578125, + "learning_rate": 9.779569892473118e-06, + "loss": 2.5066, + "mean_token_accuracy": 0.4689073343312015, + "step": 1189 + }, + { + "epoch": 0.22061549870226177, + "grad_norm": 7.33984375, + "learning_rate": 9.77938450129774e-06, + "loss": 2.5506, + "mean_token_accuracy": 0.47013341419041843, + "step": 1190 + }, + { + "epoch": 0.22080088987764182, + "grad_norm": 6.7421875, + "learning_rate": 9.779199110122359e-06, + "loss": 2.8008, + "mean_token_accuracy": 0.42859139183901623, + "step": 1191 + }, + { + "epoch": 0.22098628105302187, + "grad_norm": 6.66796875, + "learning_rate": 9.77901371894698e-06, + "loss": 2.8468, + "mean_token_accuracy": 0.4329132690882135, + "step": 1192 + }, + { + "epoch": 0.22117167222840192, + "grad_norm": 5.90234375, + "learning_rate": 9.778828327771598e-06, + "loss": 3.2222, + "mean_token_accuracy": 0.38843111404087016, + "step": 1193 + }, + { + "epoch": 0.22135706340378197, + "grad_norm": 5.609375, + "learning_rate": 9.778642936596219e-06, + "loss": 3.2838, + "mean_token_accuracy": 0.3781668656320467, + "step": 1194 + }, + { + "epoch": 0.22154245457916202, + "grad_norm": 7.5703125, + "learning_rate": 9.778457545420839e-06, + "loss": 2.6905, + "mean_token_accuracy": 0.4725356294536817, + "step": 1195 + }, + { + "epoch": 0.22172784575454207, + "grad_norm": 6.109375, + "learning_rate": 9.778272154245458e-06, + "loss": 3.0235, + "mean_token_accuracy": 0.42206235011990406, + "step": 1196 + }, + { + "epoch": 0.22191323692992213, + "grad_norm": 7.73828125, + "learning_rate": 9.778086763070078e-06, + "loss": 3.3764, + "mean_token_accuracy": 0.3687551428235571, + "step": 1197 + }, + { + "epoch": 0.22209862810530218, + "grad_norm": 6.09375, + "learning_rate": 9.777901371894699e-06, + "loss": 2.9945, + "mean_token_accuracy": 0.4039287906691222, + "step": 1198 + }, + { + "epoch": 0.22228401928068223, + "grad_norm": 9.2109375, + "learning_rate": 9.77771598071932e-06, + "loss": 2.336, + "mean_token_accuracy": 0.5019560232024821, + "step": 1199 + }, + { + "epoch": 0.22246941045606228, + "grad_norm": 8.640625, + "learning_rate": 9.777530589543938e-06, + "loss": 2.6591, + "mean_token_accuracy": 0.41135487696412687, + "step": 1200 + }, + { + "epoch": 0.22265480163144236, + "grad_norm": 7.15234375, + "learning_rate": 9.777345198368559e-06, + "loss": 2.5751, + "mean_token_accuracy": 0.45745992601726265, + "step": 1201 + }, + { + "epoch": 0.2228401928068224, + "grad_norm": 10.21875, + "learning_rate": 9.77715980719318e-06, + "loss": 2.1021, + "mean_token_accuracy": 0.5066852367688023, + "step": 1202 + }, + { + "epoch": 0.22302558398220246, + "grad_norm": 6.53125, + "learning_rate": 9.776974416017798e-06, + "loss": 3.0944, + "mean_token_accuracy": 0.41747450585421125, + "step": 1203 + }, + { + "epoch": 0.2232109751575825, + "grad_norm": 6.16796875, + "learning_rate": 9.776789024842418e-06, + "loss": 2.7315, + "mean_token_accuracy": 0.4425815342214056, + "step": 1204 + }, + { + "epoch": 0.22339636633296256, + "grad_norm": 6.91796875, + "learning_rate": 9.776603633667037e-06, + "loss": 3.0057, + "mean_token_accuracy": 0.4061111111111111, + "step": 1205 + }, + { + "epoch": 0.2235817575083426, + "grad_norm": 6.828125, + "learning_rate": 9.77641824249166e-06, + "loss": 2.4656, + "mean_token_accuracy": 0.48159316497047366, + "step": 1206 + }, + { + "epoch": 0.22376714868372266, + "grad_norm": 6.4453125, + "learning_rate": 9.776232851316278e-06, + "loss": 3.223, + "mean_token_accuracy": 0.37722624382396874, + "step": 1207 + }, + { + "epoch": 0.2239525398591027, + "grad_norm": 6.19140625, + "learning_rate": 9.776047460140899e-06, + "loss": 3.1511, + "mean_token_accuracy": 0.39880636604774533, + "step": 1208 + }, + { + "epoch": 0.22413793103448276, + "grad_norm": 8.9453125, + "learning_rate": 9.775862068965518e-06, + "loss": 2.5288, + "mean_token_accuracy": 0.4620878466849185, + "step": 1209 + }, + { + "epoch": 0.2243233222098628, + "grad_norm": 7.5546875, + "learning_rate": 9.775676677790138e-06, + "loss": 2.7911, + "mean_token_accuracy": 0.4411728009981285, + "step": 1210 + }, + { + "epoch": 0.22450871338524286, + "grad_norm": 5.4140625, + "learning_rate": 9.775491286614759e-06, + "loss": 3.3141, + "mean_token_accuracy": 0.3771800177357375, + "step": 1211 + }, + { + "epoch": 0.22469410456062291, + "grad_norm": 8.46875, + "learning_rate": 9.775305895439377e-06, + "loss": 2.681, + "mean_token_accuracy": 0.4592499715034766, + "step": 1212 + }, + { + "epoch": 0.22487949573600297, + "grad_norm": 6.85546875, + "learning_rate": 9.775120504263998e-06, + "loss": 3.2486, + "mean_token_accuracy": 0.40227934044616875, + "step": 1213 + }, + { + "epoch": 0.22506488691138302, + "grad_norm": 6.77734375, + "learning_rate": 9.774935113088618e-06, + "loss": 3.0283, + "mean_token_accuracy": 0.40954332552693207, + "step": 1214 + }, + { + "epoch": 0.22525027808676307, + "grad_norm": 7.9375, + "learning_rate": 9.774749721913239e-06, + "loss": 3.007, + "mean_token_accuracy": 0.4057942057942058, + "step": 1215 + }, + { + "epoch": 0.22543566926214312, + "grad_norm": 6.203125, + "learning_rate": 9.774564330737858e-06, + "loss": 2.9439, + "mean_token_accuracy": 0.4070287539936102, + "step": 1216 + }, + { + "epoch": 0.22562106043752317, + "grad_norm": 7.83203125, + "learning_rate": 9.774378939562478e-06, + "loss": 2.9385, + "mean_token_accuracy": 0.4318747255963706, + "step": 1217 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 7.75390625, + "learning_rate": 9.774193548387097e-06, + "loss": 2.8201, + "mean_token_accuracy": 0.4237463976945245, + "step": 1218 + }, + { + "epoch": 0.22599184278828327, + "grad_norm": 7.43359375, + "learning_rate": 9.774008157211717e-06, + "loss": 2.781, + "mean_token_accuracy": 0.4406497292794669, + "step": 1219 + }, + { + "epoch": 0.22617723396366332, + "grad_norm": 6.20703125, + "learning_rate": 9.773822766036338e-06, + "loss": 2.9886, + "mean_token_accuracy": 0.4228863425209043, + "step": 1220 + }, + { + "epoch": 0.22636262513904337, + "grad_norm": 10.5703125, + "learning_rate": 9.773637374860957e-06, + "loss": 2.855, + "mean_token_accuracy": 0.4242585450192845, + "step": 1221 + }, + { + "epoch": 0.22654801631442342, + "grad_norm": 7.5546875, + "learning_rate": 9.773451983685577e-06, + "loss": 2.6326, + "mean_token_accuracy": 0.4650132860938884, + "step": 1222 + }, + { + "epoch": 0.22673340748980347, + "grad_norm": 8.2265625, + "learning_rate": 9.773266592510198e-06, + "loss": 2.7328, + "mean_token_accuracy": 0.433692264097478, + "step": 1223 + }, + { + "epoch": 0.22691879866518352, + "grad_norm": 6.9453125, + "learning_rate": 9.773081201334818e-06, + "loss": 3.3291, + "mean_token_accuracy": 0.3768449559918754, + "step": 1224 + }, + { + "epoch": 0.2271041898405636, + "grad_norm": 6.04296875, + "learning_rate": 9.772895810159437e-06, + "loss": 3.1182, + "mean_token_accuracy": 0.41483343808925205, + "step": 1225 + }, + { + "epoch": 0.22728958101594365, + "grad_norm": 6.71875, + "learning_rate": 9.772710418984057e-06, + "loss": 3.0041, + "mean_token_accuracy": 0.4258512183314507, + "step": 1226 + }, + { + "epoch": 0.2274749721913237, + "grad_norm": 7.5234375, + "learning_rate": 9.772525027808676e-06, + "loss": 2.9577, + "mean_token_accuracy": 0.4220503866300823, + "step": 1227 + }, + { + "epoch": 0.22766036336670376, + "grad_norm": 6.5, + "learning_rate": 9.772339636633297e-06, + "loss": 3.2815, + "mean_token_accuracy": 0.37471541449042456, + "step": 1228 + }, + { + "epoch": 0.2278457545420838, + "grad_norm": 5.76953125, + "learning_rate": 9.772154245457917e-06, + "loss": 3.1414, + "mean_token_accuracy": 0.40584045584045586, + "step": 1229 + }, + { + "epoch": 0.22803114571746386, + "grad_norm": 5.59375, + "learning_rate": 9.771968854282538e-06, + "loss": 2.7879, + "mean_token_accuracy": 0.4580934101087652, + "step": 1230 + }, + { + "epoch": 0.2282165368928439, + "grad_norm": 5.82421875, + "learning_rate": 9.771783463107156e-06, + "loss": 2.8759, + "mean_token_accuracy": 0.42318092428711895, + "step": 1231 + }, + { + "epoch": 0.22840192806822396, + "grad_norm": 7.36328125, + "learning_rate": 9.771598071931777e-06, + "loss": 2.6194, + "mean_token_accuracy": 0.4455120693695805, + "step": 1232 + }, + { + "epoch": 0.228587319243604, + "grad_norm": 5.5078125, + "learning_rate": 9.771412680756397e-06, + "loss": 2.8271, + "mean_token_accuracy": 0.43392952482648156, + "step": 1233 + }, + { + "epoch": 0.22877271041898406, + "grad_norm": 6.28125, + "learning_rate": 9.771227289581016e-06, + "loss": 2.832, + "mean_token_accuracy": 0.4328850855745721, + "step": 1234 + }, + { + "epoch": 0.2289581015943641, + "grad_norm": 7.5234375, + "learning_rate": 9.771041898405637e-06, + "loss": 3.2831, + "mean_token_accuracy": 0.404125, + "step": 1235 + }, + { + "epoch": 0.22914349276974416, + "grad_norm": 5.5390625, + "learning_rate": 9.770856507230256e-06, + "loss": 3.1512, + "mean_token_accuracy": 0.39688625537353317, + "step": 1236 + }, + { + "epoch": 0.2293288839451242, + "grad_norm": 6.58984375, + "learning_rate": 9.770671116054876e-06, + "loss": 2.8749, + "mean_token_accuracy": 0.42575301204819277, + "step": 1237 + }, + { + "epoch": 0.22951427512050426, + "grad_norm": 6.14453125, + "learning_rate": 9.770485724879497e-06, + "loss": 2.996, + "mean_token_accuracy": 0.4151395545531435, + "step": 1238 + }, + { + "epoch": 0.2296996662958843, + "grad_norm": 5.39453125, + "learning_rate": 9.770300333704117e-06, + "loss": 3.1185, + "mean_token_accuracy": 0.39124326855537345, + "step": 1239 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 9.96875, + "learning_rate": 9.770114942528738e-06, + "loss": 2.6221, + "mean_token_accuracy": 0.4539731393396754, + "step": 1240 + }, + { + "epoch": 0.23007044864664442, + "grad_norm": 6.39453125, + "learning_rate": 9.769929551353356e-06, + "loss": 2.9471, + "mean_token_accuracy": 0.4212497325058849, + "step": 1241 + }, + { + "epoch": 0.23025583982202447, + "grad_norm": 5.69921875, + "learning_rate": 9.769744160177977e-06, + "loss": 3.1697, + "mean_token_accuracy": 0.4084507042253521, + "step": 1242 + }, + { + "epoch": 0.23044123099740452, + "grad_norm": 5.9375, + "learning_rate": 9.769558769002596e-06, + "loss": 3.2961, + "mean_token_accuracy": 0.3853113358169239, + "step": 1243 + }, + { + "epoch": 0.23062662217278457, + "grad_norm": 7.0703125, + "learning_rate": 9.769373377827216e-06, + "loss": 3.0753, + "mean_token_accuracy": 0.40752205540937936, + "step": 1244 + }, + { + "epoch": 0.23081201334816462, + "grad_norm": 6.08984375, + "learning_rate": 9.769187986651837e-06, + "loss": 2.8466, + "mean_token_accuracy": 0.46470323741007197, + "step": 1245 + }, + { + "epoch": 0.23099740452354467, + "grad_norm": 6.36328125, + "learning_rate": 9.769002595476455e-06, + "loss": 3.1019, + "mean_token_accuracy": 0.400490647040785, + "step": 1246 + }, + { + "epoch": 0.23118279569892472, + "grad_norm": 6.15234375, + "learning_rate": 9.768817204301076e-06, + "loss": 2.9671, + "mean_token_accuracy": 0.4104171690378587, + "step": 1247 + }, + { + "epoch": 0.23136818687430477, + "grad_norm": 6.24609375, + "learning_rate": 9.768631813125696e-06, + "loss": 3.2391, + "mean_token_accuracy": 0.40015255530129673, + "step": 1248 + }, + { + "epoch": 0.23155357804968482, + "grad_norm": 6.89453125, + "learning_rate": 9.768446421950317e-06, + "loss": 3.279, + "mean_token_accuracy": 0.38253638253638256, + "step": 1249 + }, + { + "epoch": 0.2317389692250649, + "grad_norm": 7.25390625, + "learning_rate": 9.768261030774936e-06, + "loss": 3.0371, + "mean_token_accuracy": 0.41655985644706484, + "step": 1250 + }, + { + "epoch": 0.23192436040044495, + "grad_norm": 6.33984375, + "learning_rate": 9.768075639599556e-06, + "loss": 2.8232, + "mean_token_accuracy": 0.45692940997103215, + "step": 1251 + }, + { + "epoch": 0.232109751575825, + "grad_norm": 9.03125, + "learning_rate": 9.767890248424175e-06, + "loss": 2.6696, + "mean_token_accuracy": 0.44629927594529367, + "step": 1252 + }, + { + "epoch": 0.23229514275120505, + "grad_norm": 7.4609375, + "learning_rate": 9.767704857248795e-06, + "loss": 2.6691, + "mean_token_accuracy": 0.4613948526470196, + "step": 1253 + }, + { + "epoch": 0.2324805339265851, + "grad_norm": 7.44921875, + "learning_rate": 9.767519466073416e-06, + "loss": 2.8696, + "mean_token_accuracy": 0.409873027174558, + "step": 1254 + }, + { + "epoch": 0.23266592510196515, + "grad_norm": 5.17578125, + "learning_rate": 9.767334074898036e-06, + "loss": 2.9466, + "mean_token_accuracy": 0.4135889846866555, + "step": 1255 + }, + { + "epoch": 0.2328513162773452, + "grad_norm": 7.4609375, + "learning_rate": 9.767148683722655e-06, + "loss": 3.0508, + "mean_token_accuracy": 0.4241460541813899, + "step": 1256 + }, + { + "epoch": 0.23303670745272526, + "grad_norm": 9.6328125, + "learning_rate": 9.766963292547276e-06, + "loss": 2.5362, + "mean_token_accuracy": 0.46022511698495006, + "step": 1257 + }, + { + "epoch": 0.2332220986281053, + "grad_norm": 9.9921875, + "learning_rate": 9.766777901371896e-06, + "loss": 2.7343, + "mean_token_accuracy": 0.45231062410671746, + "step": 1258 + }, + { + "epoch": 0.23340748980348536, + "grad_norm": 6.421875, + "learning_rate": 9.766592510196515e-06, + "loss": 2.7206, + "mean_token_accuracy": 0.438365947721754, + "step": 1259 + }, + { + "epoch": 0.2335928809788654, + "grad_norm": 12.46875, + "learning_rate": 9.766407119021135e-06, + "loss": 2.9712, + "mean_token_accuracy": 0.3971388783390303, + "step": 1260 + }, + { + "epoch": 0.23377827215424546, + "grad_norm": 6.7890625, + "learning_rate": 9.766221727845754e-06, + "loss": 2.8796, + "mean_token_accuracy": 0.426182092555332, + "step": 1261 + }, + { + "epoch": 0.2339636633296255, + "grad_norm": 6.63671875, + "learning_rate": 9.766036336670375e-06, + "loss": 2.8516, + "mean_token_accuracy": 0.4217195641875888, + "step": 1262 + }, + { + "epoch": 0.23414905450500556, + "grad_norm": 7.84765625, + "learning_rate": 9.765850945494995e-06, + "loss": 2.8203, + "mean_token_accuracy": 0.43216805644644, + "step": 1263 + }, + { + "epoch": 0.2343344456803856, + "grad_norm": 8.1953125, + "learning_rate": 9.765665554319616e-06, + "loss": 3.1013, + "mean_token_accuracy": 0.3959471112388617, + "step": 1264 + }, + { + "epoch": 0.23451983685576566, + "grad_norm": 10.4375, + "learning_rate": 9.765480163144235e-06, + "loss": 2.2, + "mean_token_accuracy": 0.5128857779191854, + "step": 1265 + }, + { + "epoch": 0.2347052280311457, + "grad_norm": 6.26953125, + "learning_rate": 9.765294771968855e-06, + "loss": 2.6077, + "mean_token_accuracy": 0.4615115465360392, + "step": 1266 + }, + { + "epoch": 0.23489061920652576, + "grad_norm": 5.57421875, + "learning_rate": 9.765109380793476e-06, + "loss": 2.6928, + "mean_token_accuracy": 0.4464652854657599, + "step": 1267 + }, + { + "epoch": 0.23507601038190581, + "grad_norm": 7.76953125, + "learning_rate": 9.764923989618094e-06, + "loss": 3.0746, + "mean_token_accuracy": 0.4199491740787802, + "step": 1268 + }, + { + "epoch": 0.23526140155728587, + "grad_norm": 6.234375, + "learning_rate": 9.764738598442715e-06, + "loss": 3.3705, + "mean_token_accuracy": 0.3713670613562971, + "step": 1269 + }, + { + "epoch": 0.23544679273266592, + "grad_norm": 8.4609375, + "learning_rate": 9.764553207267334e-06, + "loss": 2.8245, + "mean_token_accuracy": 0.43195439263265606, + "step": 1270 + }, + { + "epoch": 0.23563218390804597, + "grad_norm": 6.828125, + "learning_rate": 9.764367816091956e-06, + "loss": 2.8508, + "mean_token_accuracy": 0.4360630328080599, + "step": 1271 + }, + { + "epoch": 0.23581757508342602, + "grad_norm": 6.20703125, + "learning_rate": 9.764182424916575e-06, + "loss": 2.4791, + "mean_token_accuracy": 0.468463462072798, + "step": 1272 + }, + { + "epoch": 0.23600296625880607, + "grad_norm": 9.46875, + "learning_rate": 9.763997033741195e-06, + "loss": 2.819, + "mean_token_accuracy": 0.4251429992539169, + "step": 1273 + }, + { + "epoch": 0.23618835743418612, + "grad_norm": 6.28515625, + "learning_rate": 9.763811642565814e-06, + "loss": 2.9617, + "mean_token_accuracy": 0.4247799067840497, + "step": 1274 + }, + { + "epoch": 0.2363737486095662, + "grad_norm": 6.19140625, + "learning_rate": 9.763626251390434e-06, + "loss": 2.2891, + "mean_token_accuracy": 0.5138172819137966, + "step": 1275 + }, + { + "epoch": 0.23655913978494625, + "grad_norm": 6.63671875, + "learning_rate": 9.763440860215055e-06, + "loss": 2.7527, + "mean_token_accuracy": 0.4379596307307151, + "step": 1276 + }, + { + "epoch": 0.2367445309603263, + "grad_norm": 6.84375, + "learning_rate": 9.763255469039674e-06, + "loss": 3.0745, + "mean_token_accuracy": 0.4180413137167484, + "step": 1277 + }, + { + "epoch": 0.23692992213570635, + "grad_norm": 5.80078125, + "learning_rate": 9.763070077864294e-06, + "loss": 2.9202, + "mean_token_accuracy": 0.428454619787408, + "step": 1278 + }, + { + "epoch": 0.2371153133110864, + "grad_norm": 6.23046875, + "learning_rate": 9.762884686688915e-06, + "loss": 2.6945, + "mean_token_accuracy": 0.4513642669955295, + "step": 1279 + }, + { + "epoch": 0.23730070448646645, + "grad_norm": 6.18359375, + "learning_rate": 9.762699295513535e-06, + "loss": 2.7081, + "mean_token_accuracy": 0.4679776048067732, + "step": 1280 + }, + { + "epoch": 0.2374860956618465, + "grad_norm": 12.7890625, + "learning_rate": 9.762513904338154e-06, + "loss": 2.842, + "mean_token_accuracy": 0.43618162316767967, + "step": 1281 + }, + { + "epoch": 0.23767148683722655, + "grad_norm": 7.0078125, + "learning_rate": 9.762328513162774e-06, + "loss": 3.2867, + "mean_token_accuracy": 0.39192649495405935, + "step": 1282 + }, + { + "epoch": 0.2378568780126066, + "grad_norm": 6.4453125, + "learning_rate": 9.762143121987395e-06, + "loss": 2.7202, + "mean_token_accuracy": 0.45325260490002817, + "step": 1283 + }, + { + "epoch": 0.23804226918798665, + "grad_norm": 7.87890625, + "learning_rate": 9.761957730812014e-06, + "loss": 3.1419, + "mean_token_accuracy": 0.38686757319612886, + "step": 1284 + }, + { + "epoch": 0.2382276603633667, + "grad_norm": 8.109375, + "learning_rate": 9.761772339636634e-06, + "loss": 2.7957, + "mean_token_accuracy": 0.4225908372827804, + "step": 1285 + }, + { + "epoch": 0.23841305153874676, + "grad_norm": 5.97265625, + "learning_rate": 9.761586948461253e-06, + "loss": 3.1813, + "mean_token_accuracy": 0.3837990802971348, + "step": 1286 + }, + { + "epoch": 0.2385984427141268, + "grad_norm": 6.9140625, + "learning_rate": 9.761401557285875e-06, + "loss": 2.5476, + "mean_token_accuracy": 0.4707103825136612, + "step": 1287 + }, + { + "epoch": 0.23878383388950686, + "grad_norm": 5.55859375, + "learning_rate": 9.761216166110494e-06, + "loss": 3.4871, + "mean_token_accuracy": 0.3607650685793381, + "step": 1288 + }, + { + "epoch": 0.2389692250648869, + "grad_norm": 6.7421875, + "learning_rate": 9.761030774935114e-06, + "loss": 3.2974, + "mean_token_accuracy": 0.38389800495809234, + "step": 1289 + }, + { + "epoch": 0.23915461624026696, + "grad_norm": 5.3828125, + "learning_rate": 9.760845383759733e-06, + "loss": 3.3738, + "mean_token_accuracy": 0.37438625204582654, + "step": 1290 + }, + { + "epoch": 0.239340007415647, + "grad_norm": 11.28125, + "learning_rate": 9.760659992584354e-06, + "loss": 2.8283, + "mean_token_accuracy": 0.42223692918596956, + "step": 1291 + }, + { + "epoch": 0.23952539859102706, + "grad_norm": 6.765625, + "learning_rate": 9.760474601408974e-06, + "loss": 2.8955, + "mean_token_accuracy": 0.44009746852578857, + "step": 1292 + }, + { + "epoch": 0.2397107897664071, + "grad_norm": 5.83203125, + "learning_rate": 9.760289210233593e-06, + "loss": 2.6797, + "mean_token_accuracy": 0.4535785785785786, + "step": 1293 + }, + { + "epoch": 0.23989618094178716, + "grad_norm": 5.9375, + "learning_rate": 9.760103819058214e-06, + "loss": 3.1694, + "mean_token_accuracy": 0.4032159824582775, + "step": 1294 + }, + { + "epoch": 0.2400815721171672, + "grad_norm": 6.390625, + "learning_rate": 9.759918427882834e-06, + "loss": 2.8147, + "mean_token_accuracy": 0.4276430496116659, + "step": 1295 + }, + { + "epoch": 0.24026696329254726, + "grad_norm": 5.81640625, + "learning_rate": 9.759733036707455e-06, + "loss": 2.7573, + "mean_token_accuracy": 0.44427414057368075, + "step": 1296 + }, + { + "epoch": 0.24045235446792732, + "grad_norm": 8.109375, + "learning_rate": 9.759547645532073e-06, + "loss": 2.5628, + "mean_token_accuracy": 0.45892900411921494, + "step": 1297 + }, + { + "epoch": 0.24063774564330737, + "grad_norm": 7.2578125, + "learning_rate": 9.759362254356694e-06, + "loss": 2.6494, + "mean_token_accuracy": 0.46351824087956023, + "step": 1298 + }, + { + "epoch": 0.24082313681868742, + "grad_norm": 6.3515625, + "learning_rate": 9.759176863181313e-06, + "loss": 2.7979, + "mean_token_accuracy": 0.45256453234998323, + "step": 1299 + }, + { + "epoch": 0.2410085279940675, + "grad_norm": 6.58203125, + "learning_rate": 9.758991472005933e-06, + "loss": 3.4636, + "mean_token_accuracy": 0.3567902975602877, + "step": 1300 + }, + { + "epoch": 0.24119391916944755, + "grad_norm": 10.4609375, + "learning_rate": 9.758806080830554e-06, + "loss": 2.7455, + "mean_token_accuracy": 0.44208829365079366, + "step": 1301 + }, + { + "epoch": 0.2413793103448276, + "grad_norm": 8.2265625, + "learning_rate": 9.758620689655172e-06, + "loss": 2.7685, + "mean_token_accuracy": 0.4325964010282776, + "step": 1302 + }, + { + "epoch": 0.24156470152020765, + "grad_norm": 5.99609375, + "learning_rate": 9.758435298479793e-06, + "loss": 2.6965, + "mean_token_accuracy": 0.44941574415744157, + "step": 1303 + }, + { + "epoch": 0.2417500926955877, + "grad_norm": 5.52734375, + "learning_rate": 9.758249907304413e-06, + "loss": 2.7061, + "mean_token_accuracy": 0.43315508021390375, + "step": 1304 + }, + { + "epoch": 0.24193548387096775, + "grad_norm": 6.0234375, + "learning_rate": 9.758064516129034e-06, + "loss": 2.7839, + "mean_token_accuracy": 0.4376158940397351, + "step": 1305 + }, + { + "epoch": 0.2421208750463478, + "grad_norm": 6.1875, + "learning_rate": 9.757879124953653e-06, + "loss": 2.2454, + "mean_token_accuracy": 0.5157614325884268, + "step": 1306 + }, + { + "epoch": 0.24230626622172785, + "grad_norm": 8.640625, + "learning_rate": 9.757693733778273e-06, + "loss": 3.3332, + "mean_token_accuracy": 0.3804717623043774, + "step": 1307 + }, + { + "epoch": 0.2424916573971079, + "grad_norm": 6.8046875, + "learning_rate": 9.757508342602892e-06, + "loss": 3.3125, + "mean_token_accuracy": 0.36640094478889873, + "step": 1308 + }, + { + "epoch": 0.24267704857248795, + "grad_norm": 9.09375, + "learning_rate": 9.757322951427512e-06, + "loss": 3.2817, + "mean_token_accuracy": 0.414235548352242, + "step": 1309 + }, + { + "epoch": 0.242862439747868, + "grad_norm": 7.89453125, + "learning_rate": 9.757137560252133e-06, + "loss": 2.6533, + "mean_token_accuracy": 0.459037711313394, + "step": 1310 + }, + { + "epoch": 0.24304783092324805, + "grad_norm": 6.0234375, + "learning_rate": 9.756952169076753e-06, + "loss": 3.1318, + "mean_token_accuracy": 0.3974814814814815, + "step": 1311 + }, + { + "epoch": 0.2432332220986281, + "grad_norm": 5.12109375, + "learning_rate": 9.756766777901372e-06, + "loss": 2.9293, + "mean_token_accuracy": 0.4233637116818558, + "step": 1312 + }, + { + "epoch": 0.24341861327400816, + "grad_norm": 7.2890625, + "learning_rate": 9.756581386725993e-06, + "loss": 3.1417, + "mean_token_accuracy": 0.3913650645874549, + "step": 1313 + }, + { + "epoch": 0.2436040044493882, + "grad_norm": 6.51953125, + "learning_rate": 9.756395995550613e-06, + "loss": 2.8922, + "mean_token_accuracy": 0.42792228141648386, + "step": 1314 + }, + { + "epoch": 0.24378939562476826, + "grad_norm": 5.84765625, + "learning_rate": 9.756210604375232e-06, + "loss": 2.9139, + "mean_token_accuracy": 0.41500443262411346, + "step": 1315 + }, + { + "epoch": 0.2439747868001483, + "grad_norm": 5.44921875, + "learning_rate": 9.756025213199852e-06, + "loss": 3.0479, + "mean_token_accuracy": 0.4154043097930446, + "step": 1316 + }, + { + "epoch": 0.24416017797552836, + "grad_norm": 11.21875, + "learning_rate": 9.755839822024471e-06, + "loss": 2.8901, + "mean_token_accuracy": 0.4377629971494503, + "step": 1317 + }, + { + "epoch": 0.2443455691509084, + "grad_norm": 6.60546875, + "learning_rate": 9.755654430849092e-06, + "loss": 2.859, + "mean_token_accuracy": 0.4385823600483286, + "step": 1318 + }, + { + "epoch": 0.24453096032628846, + "grad_norm": 11.6171875, + "learning_rate": 9.755469039673712e-06, + "loss": 2.7477, + "mean_token_accuracy": 0.4397597898160891, + "step": 1319 + }, + { + "epoch": 0.2447163515016685, + "grad_norm": 7.5078125, + "learning_rate": 9.755283648498333e-06, + "loss": 3.0473, + "mean_token_accuracy": 0.4154910096818811, + "step": 1320 + }, + { + "epoch": 0.24490174267704856, + "grad_norm": 6.65234375, + "learning_rate": 9.755098257322953e-06, + "loss": 3.5174, + "mean_token_accuracy": 0.3668742701440249, + "step": 1321 + }, + { + "epoch": 0.2450871338524286, + "grad_norm": 5.94921875, + "learning_rate": 9.754912866147572e-06, + "loss": 3.4391, + "mean_token_accuracy": 0.3920505871725384, + "step": 1322 + }, + { + "epoch": 0.24527252502780866, + "grad_norm": 4.921875, + "learning_rate": 9.754727474972193e-06, + "loss": 3.4771, + "mean_token_accuracy": 0.37844001824540063, + "step": 1323 + }, + { + "epoch": 0.24545791620318874, + "grad_norm": 8.0078125, + "learning_rate": 9.754542083796811e-06, + "loss": 2.6134, + "mean_token_accuracy": 0.4582752944156009, + "step": 1324 + }, + { + "epoch": 0.2456433073785688, + "grad_norm": 8.625, + "learning_rate": 9.754356692621432e-06, + "loss": 2.4128, + "mean_token_accuracy": 0.4834384858044164, + "step": 1325 + }, + { + "epoch": 0.24582869855394884, + "grad_norm": 9.578125, + "learning_rate": 9.754171301446052e-06, + "loss": 2.2431, + "mean_token_accuracy": 0.49921895339755273, + "step": 1326 + }, + { + "epoch": 0.2460140897293289, + "grad_norm": 13.484375, + "learning_rate": 9.753985910270673e-06, + "loss": 3.1383, + "mean_token_accuracy": 0.42633156559833985, + "step": 1327 + }, + { + "epoch": 0.24619948090470894, + "grad_norm": 7.03125, + "learning_rate": 9.753800519095292e-06, + "loss": 2.7401, + "mean_token_accuracy": 0.4392338943702844, + "step": 1328 + }, + { + "epoch": 0.246384872080089, + "grad_norm": 6.89453125, + "learning_rate": 9.753615127919912e-06, + "loss": 2.7483, + "mean_token_accuracy": 0.43293539325842695, + "step": 1329 + }, + { + "epoch": 0.24657026325546905, + "grad_norm": 6.3984375, + "learning_rate": 9.753429736744533e-06, + "loss": 2.4277, + "mean_token_accuracy": 0.49623865110246435, + "step": 1330 + }, + { + "epoch": 0.2467556544308491, + "grad_norm": 5.31640625, + "learning_rate": 9.753244345569151e-06, + "loss": 2.7119, + "mean_token_accuracy": 0.4484464172479391, + "step": 1331 + }, + { + "epoch": 0.24694104560622915, + "grad_norm": 6.2265625, + "learning_rate": 9.753058954393772e-06, + "loss": 2.8247, + "mean_token_accuracy": 0.43430369787568845, + "step": 1332 + }, + { + "epoch": 0.2471264367816092, + "grad_norm": 6.14453125, + "learning_rate": 9.75287356321839e-06, + "loss": 2.9627, + "mean_token_accuracy": 0.42333004277722935, + "step": 1333 + }, + { + "epoch": 0.24731182795698925, + "grad_norm": 6.52734375, + "learning_rate": 9.752688172043011e-06, + "loss": 2.7786, + "mean_token_accuracy": 0.46099806201550386, + "step": 1334 + }, + { + "epoch": 0.2474972191323693, + "grad_norm": 5.91796875, + "learning_rate": 9.752502780867632e-06, + "loss": 2.8768, + "mean_token_accuracy": 0.4107065452969224, + "step": 1335 + }, + { + "epoch": 0.24768261030774935, + "grad_norm": 6.6484375, + "learning_rate": 9.752317389692252e-06, + "loss": 2.73, + "mean_token_accuracy": 0.4367726920093095, + "step": 1336 + }, + { + "epoch": 0.2478680014831294, + "grad_norm": 8.46875, + "learning_rate": 9.752131998516871e-06, + "loss": 2.9268, + "mean_token_accuracy": 0.4272496642292195, + "step": 1337 + }, + { + "epoch": 0.24805339265850945, + "grad_norm": 6.2109375, + "learning_rate": 9.751946607341491e-06, + "loss": 3.1549, + "mean_token_accuracy": 0.408105147864184, + "step": 1338 + }, + { + "epoch": 0.2482387838338895, + "grad_norm": 5.72265625, + "learning_rate": 9.751761216166112e-06, + "loss": 2.8773, + "mean_token_accuracy": 0.43840800879013553, + "step": 1339 + }, + { + "epoch": 0.24842417500926955, + "grad_norm": 5.6171875, + "learning_rate": 9.75157582499073e-06, + "loss": 3.3699, + "mean_token_accuracy": 0.3798907426546582, + "step": 1340 + }, + { + "epoch": 0.2486095661846496, + "grad_norm": 5.484375, + "learning_rate": 9.751390433815351e-06, + "loss": 2.9551, + "mean_token_accuracy": 0.4093256603216763, + "step": 1341 + }, + { + "epoch": 0.24879495736002966, + "grad_norm": 5.96875, + "learning_rate": 9.75120504263997e-06, + "loss": 2.7049, + "mean_token_accuracy": 0.45079138402951324, + "step": 1342 + }, + { + "epoch": 0.2489803485354097, + "grad_norm": 6.171875, + "learning_rate": 9.751019651464592e-06, + "loss": 2.9436, + "mean_token_accuracy": 0.418928133096959, + "step": 1343 + }, + { + "epoch": 0.24916573971078976, + "grad_norm": 8.6171875, + "learning_rate": 9.750834260289211e-06, + "loss": 2.6344, + "mean_token_accuracy": 0.45914198161389175, + "step": 1344 + }, + { + "epoch": 0.2493511308861698, + "grad_norm": 5.96875, + "learning_rate": 9.750648869113831e-06, + "loss": 2.7301, + "mean_token_accuracy": 0.44885799404170806, + "step": 1345 + }, + { + "epoch": 0.24953652206154986, + "grad_norm": 5.296875, + "learning_rate": 9.75046347793845e-06, + "loss": 2.7683, + "mean_token_accuracy": 0.43739515022113773, + "step": 1346 + }, + { + "epoch": 0.2497219132369299, + "grad_norm": 10.84375, + "learning_rate": 9.75027808676307e-06, + "loss": 3.3328, + "mean_token_accuracy": 0.4147045420021267, + "step": 1347 + }, + { + "epoch": 0.24990730441230996, + "grad_norm": 8.3828125, + "learning_rate": 9.750092695587691e-06, + "loss": 3.163, + "mean_token_accuracy": 0.390068233510235, + "step": 1348 + }, + { + "epoch": 0.25009269558769004, + "grad_norm": 8.765625, + "learning_rate": 9.74990730441231e-06, + "loss": 3.2138, + "mean_token_accuracy": 0.38623503092512174, + "step": 1349 + }, + { + "epoch": 0.25027808676307006, + "grad_norm": 6.94921875, + "learning_rate": 9.74972191323693e-06, + "loss": 2.5857, + "mean_token_accuracy": 0.4518430439952438, + "step": 1350 + }, + { + "epoch": 0.25046347793845014, + "grad_norm": 8.6953125, + "learning_rate": 9.749536522061551e-06, + "loss": 2.9861, + "mean_token_accuracy": 0.41586292976285033, + "step": 1351 + }, + { + "epoch": 0.25064886911383016, + "grad_norm": 12.875, + "learning_rate": 9.749351130886172e-06, + "loss": 2.8859, + "mean_token_accuracy": 0.4280434539142121, + "step": 1352 + }, + { + "epoch": 0.25083426028921024, + "grad_norm": 9.796875, + "learning_rate": 9.74916573971079e-06, + "loss": 3.4162, + "mean_token_accuracy": 0.36652010050251255, + "step": 1353 + }, + { + "epoch": 0.25101965146459027, + "grad_norm": 5.6875, + "learning_rate": 9.74898034853541e-06, + "loss": 2.991, + "mean_token_accuracy": 0.40743527995183626, + "step": 1354 + }, + { + "epoch": 0.25120504263997034, + "grad_norm": 11.6875, + "learning_rate": 9.74879495736003e-06, + "loss": 2.7729, + "mean_token_accuracy": 0.4278676099039919, + "step": 1355 + }, + { + "epoch": 0.25139043381535037, + "grad_norm": 10.28125, + "learning_rate": 9.74860956618465e-06, + "loss": 2.769, + "mean_token_accuracy": 0.44195710455764076, + "step": 1356 + }, + { + "epoch": 0.25157582499073045, + "grad_norm": 5.84765625, + "learning_rate": 9.74842417500927e-06, + "loss": 3.0844, + "mean_token_accuracy": 0.40223727745391524, + "step": 1357 + }, + { + "epoch": 0.25176121616611047, + "grad_norm": 9.578125, + "learning_rate": 9.74823878383389e-06, + "loss": 3.0617, + "mean_token_accuracy": 0.39359449444150346, + "step": 1358 + }, + { + "epoch": 0.25194660734149055, + "grad_norm": 7.79296875, + "learning_rate": 9.748053392658512e-06, + "loss": 2.722, + "mean_token_accuracy": 0.4307601649970536, + "step": 1359 + }, + { + "epoch": 0.25213199851687057, + "grad_norm": 8.5234375, + "learning_rate": 9.74786800148313e-06, + "loss": 2.6621, + "mean_token_accuracy": 0.44982698961937717, + "step": 1360 + }, + { + "epoch": 0.25231738969225065, + "grad_norm": 7.453125, + "learning_rate": 9.747682610307751e-06, + "loss": 2.4154, + "mean_token_accuracy": 0.48718294657312466, + "step": 1361 + }, + { + "epoch": 0.2525027808676307, + "grad_norm": 5.8125, + "learning_rate": 9.74749721913237e-06, + "loss": 2.762, + "mean_token_accuracy": 0.4316137566137566, + "step": 1362 + }, + { + "epoch": 0.25268817204301075, + "grad_norm": 8.21875, + "learning_rate": 9.74731182795699e-06, + "loss": 2.4951, + "mean_token_accuracy": 0.47023953544640695, + "step": 1363 + }, + { + "epoch": 0.25287356321839083, + "grad_norm": 10.625, + "learning_rate": 9.74712643678161e-06, + "loss": 2.5648, + "mean_token_accuracy": 0.4565192285483312, + "step": 1364 + }, + { + "epoch": 0.25305895439377085, + "grad_norm": 7.83984375, + "learning_rate": 9.74694104560623e-06, + "loss": 2.6409, + "mean_token_accuracy": 0.4434259954921112, + "step": 1365 + }, + { + "epoch": 0.25324434556915093, + "grad_norm": 5.21875, + "learning_rate": 9.74675565443085e-06, + "loss": 3.0594, + "mean_token_accuracy": 0.425772921108742, + "step": 1366 + }, + { + "epoch": 0.25342973674453095, + "grad_norm": 8.453125, + "learning_rate": 9.746570263255469e-06, + "loss": 2.5717, + "mean_token_accuracy": 0.47274540217150457, + "step": 1367 + }, + { + "epoch": 0.25361512791991103, + "grad_norm": 6.94921875, + "learning_rate": 9.746384872080091e-06, + "loss": 3.0585, + "mean_token_accuracy": 0.409443185882914, + "step": 1368 + }, + { + "epoch": 0.25380051909529106, + "grad_norm": 8.8125, + "learning_rate": 9.74619948090471e-06, + "loss": 2.9503, + "mean_token_accuracy": 0.41759367681498827, + "step": 1369 + }, + { + "epoch": 0.25398591027067113, + "grad_norm": 5.33203125, + "learning_rate": 9.74601408972933e-06, + "loss": 2.6677, + "mean_token_accuracy": 0.46084710743801655, + "step": 1370 + }, + { + "epoch": 0.25417130144605116, + "grad_norm": 7.37109375, + "learning_rate": 9.745828698553949e-06, + "loss": 2.63, + "mean_token_accuracy": 0.46974107553240385, + "step": 1371 + }, + { + "epoch": 0.25435669262143124, + "grad_norm": 7.20703125, + "learning_rate": 9.74564330737857e-06, + "loss": 2.5187, + "mean_token_accuracy": 0.4754081632653061, + "step": 1372 + }, + { + "epoch": 0.25454208379681126, + "grad_norm": 5.7578125, + "learning_rate": 9.74545791620319e-06, + "loss": 2.7271, + "mean_token_accuracy": 0.44862518089725034, + "step": 1373 + }, + { + "epoch": 0.25472747497219134, + "grad_norm": 7.703125, + "learning_rate": 9.745272525027809e-06, + "loss": 2.508, + "mean_token_accuracy": 0.4699866065992938, + "step": 1374 + }, + { + "epoch": 0.25491286614757136, + "grad_norm": 5.9375, + "learning_rate": 9.74508713385243e-06, + "loss": 3.0368, + "mean_token_accuracy": 0.41518443356810314, + "step": 1375 + }, + { + "epoch": 0.25509825732295144, + "grad_norm": 7.25, + "learning_rate": 9.74490174267705e-06, + "loss": 2.7556, + "mean_token_accuracy": 0.4320270924044509, + "step": 1376 + }, + { + "epoch": 0.25528364849833146, + "grad_norm": 6.1640625, + "learning_rate": 9.74471635150167e-06, + "loss": 2.841, + "mean_token_accuracy": 0.4222616933096507, + "step": 1377 + }, + { + "epoch": 0.25546903967371154, + "grad_norm": 6.140625, + "learning_rate": 9.744530960326289e-06, + "loss": 3.1002, + "mean_token_accuracy": 0.40975118653818493, + "step": 1378 + }, + { + "epoch": 0.25565443084909156, + "grad_norm": 6.98046875, + "learning_rate": 9.74434556915091e-06, + "loss": 2.8626, + "mean_token_accuracy": 0.4357833358907473, + "step": 1379 + }, + { + "epoch": 0.25583982202447164, + "grad_norm": 8.0234375, + "learning_rate": 9.744160177975528e-06, + "loss": 2.7693, + "mean_token_accuracy": 0.45619126589275844, + "step": 1380 + }, + { + "epoch": 0.25602521319985166, + "grad_norm": 6.13671875, + "learning_rate": 9.743974786800149e-06, + "loss": 2.7307, + "mean_token_accuracy": 0.43463391885589925, + "step": 1381 + }, + { + "epoch": 0.25621060437523174, + "grad_norm": 6.14453125, + "learning_rate": 9.74378939562477e-06, + "loss": 2.795, + "mean_token_accuracy": 0.4477874034184032, + "step": 1382 + }, + { + "epoch": 0.25639599555061177, + "grad_norm": 5.65234375, + "learning_rate": 9.743604004449388e-06, + "loss": 2.4475, + "mean_token_accuracy": 0.4994080292756431, + "step": 1383 + }, + { + "epoch": 0.25658138672599184, + "grad_norm": 7.2890625, + "learning_rate": 9.743418613274009e-06, + "loss": 2.8074, + "mean_token_accuracy": 0.4427828348504551, + "step": 1384 + }, + { + "epoch": 0.25676677790137187, + "grad_norm": 9.359375, + "learning_rate": 9.743233222098629e-06, + "loss": 2.7038, + "mean_token_accuracy": 0.44256410256410256, + "step": 1385 + }, + { + "epoch": 0.25695216907675195, + "grad_norm": 7.14453125, + "learning_rate": 9.74304783092325e-06, + "loss": 3.1913, + "mean_token_accuracy": 0.40829756795422034, + "step": 1386 + }, + { + "epoch": 0.257137560252132, + "grad_norm": 8.4609375, + "learning_rate": 9.742862439747868e-06, + "loss": 2.4865, + "mean_token_accuracy": 0.4666666666666667, + "step": 1387 + }, + { + "epoch": 0.25732295142751205, + "grad_norm": 5.6015625, + "learning_rate": 9.742677048572489e-06, + "loss": 2.7275, + "mean_token_accuracy": 0.43402292423818845, + "step": 1388 + }, + { + "epoch": 0.2575083426028921, + "grad_norm": 6.515625, + "learning_rate": 9.742491657397108e-06, + "loss": 2.8597, + "mean_token_accuracy": 0.44301730706433, + "step": 1389 + }, + { + "epoch": 0.25769373377827215, + "grad_norm": 8.6796875, + "learning_rate": 9.742306266221728e-06, + "loss": 2.7388, + "mean_token_accuracy": 0.42249240121580545, + "step": 1390 + }, + { + "epoch": 0.25787912495365223, + "grad_norm": 8.2734375, + "learning_rate": 9.742120875046349e-06, + "loss": 2.9099, + "mean_token_accuracy": 0.41530627527332636, + "step": 1391 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 6.96484375, + "learning_rate": 9.74193548387097e-06, + "loss": 2.7478, + "mean_token_accuracy": 0.45724076281287246, + "step": 1392 + }, + { + "epoch": 0.25824990730441233, + "grad_norm": 9.578125, + "learning_rate": 9.741750092695588e-06, + "loss": 2.9084, + "mean_token_accuracy": 0.41853372434017594, + "step": 1393 + }, + { + "epoch": 0.25843529847979235, + "grad_norm": 6.71484375, + "learning_rate": 9.741564701520208e-06, + "loss": 3.1329, + "mean_token_accuracy": 0.4059267867518884, + "step": 1394 + }, + { + "epoch": 0.25862068965517243, + "grad_norm": 7.83984375, + "learning_rate": 9.741379310344829e-06, + "loss": 3.1425, + "mean_token_accuracy": 0.3849829351535836, + "step": 1395 + }, + { + "epoch": 0.25880608083055245, + "grad_norm": 6.578125, + "learning_rate": 9.741193919169448e-06, + "loss": 2.8825, + "mean_token_accuracy": 0.4265486725663717, + "step": 1396 + }, + { + "epoch": 0.25899147200593253, + "grad_norm": 8.828125, + "learning_rate": 9.741008527994068e-06, + "loss": 2.7531, + "mean_token_accuracy": 0.43152962515114873, + "step": 1397 + }, + { + "epoch": 0.25917686318131256, + "grad_norm": 8.15625, + "learning_rate": 9.740823136818687e-06, + "loss": 2.5507, + "mean_token_accuracy": 0.450074294205052, + "step": 1398 + }, + { + "epoch": 0.25936225435669263, + "grad_norm": 8.4296875, + "learning_rate": 9.740637745643308e-06, + "loss": 2.8377, + "mean_token_accuracy": 0.4302340960190451, + "step": 1399 + }, + { + "epoch": 0.25954764553207266, + "grad_norm": 7.1875, + "learning_rate": 9.740452354467928e-06, + "loss": 2.8668, + "mean_token_accuracy": 0.42975495915985995, + "step": 1400 + }, + { + "epoch": 0.25973303670745274, + "grad_norm": 8.8125, + "learning_rate": 9.740266963292549e-06, + "loss": 3.134, + "mean_token_accuracy": 0.3986462415390096, + "step": 1401 + }, + { + "epoch": 0.25991842788283276, + "grad_norm": 10.2265625, + "learning_rate": 9.740081572117169e-06, + "loss": 2.8421, + "mean_token_accuracy": 0.4289079822616408, + "step": 1402 + }, + { + "epoch": 0.26010381905821284, + "grad_norm": 6.34765625, + "learning_rate": 9.739896180941788e-06, + "loss": 2.7373, + "mean_token_accuracy": 0.4688512783579402, + "step": 1403 + }, + { + "epoch": 0.26028921023359286, + "grad_norm": 10.5, + "learning_rate": 9.739710789766408e-06, + "loss": 2.878, + "mean_token_accuracy": 0.42034638789202683, + "step": 1404 + }, + { + "epoch": 0.26047460140897294, + "grad_norm": 8.578125, + "learning_rate": 9.739525398591027e-06, + "loss": 2.4952, + "mean_token_accuracy": 0.46924726328856786, + "step": 1405 + }, + { + "epoch": 0.26065999258435296, + "grad_norm": 6.1640625, + "learning_rate": 9.739340007415648e-06, + "loss": 2.6958, + "mean_token_accuracy": 0.44245943129373233, + "step": 1406 + }, + { + "epoch": 0.26084538375973304, + "grad_norm": 5.765625, + "learning_rate": 9.739154616240268e-06, + "loss": 2.801, + "mean_token_accuracy": 0.430661659976693, + "step": 1407 + }, + { + "epoch": 0.26103077493511306, + "grad_norm": 7.328125, + "learning_rate": 9.738969225064889e-06, + "loss": 2.4835, + "mean_token_accuracy": 0.480924568162048, + "step": 1408 + }, + { + "epoch": 0.26121616611049314, + "grad_norm": 6.109375, + "learning_rate": 9.738783833889507e-06, + "loss": 3.0193, + "mean_token_accuracy": 0.4113686391298718, + "step": 1409 + }, + { + "epoch": 0.26140155728587317, + "grad_norm": 5.4609375, + "learning_rate": 9.738598442714128e-06, + "loss": 3.0447, + "mean_token_accuracy": 0.41317440401505645, + "step": 1410 + }, + { + "epoch": 0.26158694846125324, + "grad_norm": 5.95703125, + "learning_rate": 9.738413051538748e-06, + "loss": 3.2015, + "mean_token_accuracy": 0.40329271454498317, + "step": 1411 + }, + { + "epoch": 0.2617723396366333, + "grad_norm": 7.953125, + "learning_rate": 9.738227660363367e-06, + "loss": 2.9105, + "mean_token_accuracy": 0.42351323478858716, + "step": 1412 + }, + { + "epoch": 0.26195773081201335, + "grad_norm": 8.34375, + "learning_rate": 9.738042269187988e-06, + "loss": 2.6713, + "mean_token_accuracy": 0.4406269592476489, + "step": 1413 + }, + { + "epoch": 0.2621431219873934, + "grad_norm": 7.45703125, + "learning_rate": 9.737856878012606e-06, + "loss": 2.7808, + "mean_token_accuracy": 0.43470550981633943, + "step": 1414 + }, + { + "epoch": 0.26232851316277345, + "grad_norm": 5.7421875, + "learning_rate": 9.737671486837227e-06, + "loss": 3.4136, + "mean_token_accuracy": 0.3727969348659004, + "step": 1415 + }, + { + "epoch": 0.2625139043381535, + "grad_norm": 6.578125, + "learning_rate": 9.737486095661847e-06, + "loss": 3.2447, + "mean_token_accuracy": 0.39578976718248987, + "step": 1416 + }, + { + "epoch": 0.26269929551353355, + "grad_norm": 5.47265625, + "learning_rate": 9.737300704486468e-06, + "loss": 2.8844, + "mean_token_accuracy": 0.43197332909985714, + "step": 1417 + }, + { + "epoch": 0.2628846866889136, + "grad_norm": 6.51171875, + "learning_rate": 9.737115313311087e-06, + "loss": 2.7373, + "mean_token_accuracy": 0.4543935850317407, + "step": 1418 + }, + { + "epoch": 0.26307007786429365, + "grad_norm": 7.1875, + "learning_rate": 9.736929922135707e-06, + "loss": 2.3903, + "mean_token_accuracy": 0.4763619467998606, + "step": 1419 + }, + { + "epoch": 0.26325546903967373, + "grad_norm": 7.484375, + "learning_rate": 9.736744530960328e-06, + "loss": 3.2405, + "mean_token_accuracy": 0.3924422673198041, + "step": 1420 + }, + { + "epoch": 0.26344086021505375, + "grad_norm": 6.7578125, + "learning_rate": 9.736559139784946e-06, + "loss": 2.9292, + "mean_token_accuracy": 0.42827523649086435, + "step": 1421 + }, + { + "epoch": 0.26362625139043383, + "grad_norm": 5.89453125, + "learning_rate": 9.736373748609567e-06, + "loss": 2.8133, + "mean_token_accuracy": 0.4319627618308767, + "step": 1422 + }, + { + "epoch": 0.26381164256581385, + "grad_norm": 7.66015625, + "learning_rate": 9.736188357434186e-06, + "loss": 2.2175, + "mean_token_accuracy": 0.5247799348848426, + "step": 1423 + }, + { + "epoch": 0.26399703374119393, + "grad_norm": 5.9140625, + "learning_rate": 9.736002966258808e-06, + "loss": 2.9609, + "mean_token_accuracy": 0.4266355140186916, + "step": 1424 + }, + { + "epoch": 0.26418242491657395, + "grad_norm": 8.7265625, + "learning_rate": 9.735817575083427e-06, + "loss": 2.7507, + "mean_token_accuracy": 0.44058205335489087, + "step": 1425 + }, + { + "epoch": 0.26436781609195403, + "grad_norm": 5.9140625, + "learning_rate": 9.735632183908047e-06, + "loss": 2.8315, + "mean_token_accuracy": 0.45780809728607447, + "step": 1426 + }, + { + "epoch": 0.26455320726733406, + "grad_norm": 5.3515625, + "learning_rate": 9.735446792732666e-06, + "loss": 2.9292, + "mean_token_accuracy": 0.41445945945945944, + "step": 1427 + }, + { + "epoch": 0.26473859844271413, + "grad_norm": 6.02734375, + "learning_rate": 9.735261401557287e-06, + "loss": 3.3309, + "mean_token_accuracy": 0.37969094922737306, + "step": 1428 + }, + { + "epoch": 0.26492398961809416, + "grad_norm": 5.86328125, + "learning_rate": 9.735076010381907e-06, + "loss": 2.6712, + "mean_token_accuracy": 0.45440470661093046, + "step": 1429 + }, + { + "epoch": 0.26510938079347424, + "grad_norm": 7.6640625, + "learning_rate": 9.734890619206526e-06, + "loss": 2.9152, + "mean_token_accuracy": 0.4271628125366827, + "step": 1430 + }, + { + "epoch": 0.26529477196885426, + "grad_norm": 7.703125, + "learning_rate": 9.734705228031146e-06, + "loss": 3.0648, + "mean_token_accuracy": 0.4190012180267966, + "step": 1431 + }, + { + "epoch": 0.26548016314423434, + "grad_norm": 7.69921875, + "learning_rate": 9.734519836855767e-06, + "loss": 2.6966, + "mean_token_accuracy": 0.4501140250855188, + "step": 1432 + }, + { + "epoch": 0.26566555431961436, + "grad_norm": 9.7265625, + "learning_rate": 9.734334445680387e-06, + "loss": 2.7761, + "mean_token_accuracy": 0.43670392513647, + "step": 1433 + }, + { + "epoch": 0.26585094549499444, + "grad_norm": 10.53125, + "learning_rate": 9.734149054505006e-06, + "loss": 3.0316, + "mean_token_accuracy": 0.40456284540172877, + "step": 1434 + }, + { + "epoch": 0.26603633667037446, + "grad_norm": 12.7109375, + "learning_rate": 9.733963663329627e-06, + "loss": 3.2042, + "mean_token_accuracy": 0.3867345877949891, + "step": 1435 + }, + { + "epoch": 0.26622172784575454, + "grad_norm": 14.8125, + "learning_rate": 9.733778272154245e-06, + "loss": 2.9063, + "mean_token_accuracy": 0.42193010580314205, + "step": 1436 + }, + { + "epoch": 0.2664071190211346, + "grad_norm": 13.6484375, + "learning_rate": 9.733592880978866e-06, + "loss": 2.8842, + "mean_token_accuracy": 0.419578745811393, + "step": 1437 + }, + { + "epoch": 0.26659251019651464, + "grad_norm": 7.6796875, + "learning_rate": 9.733407489803486e-06, + "loss": 2.8189, + "mean_token_accuracy": 0.45008299739151053, + "step": 1438 + }, + { + "epoch": 0.2667779013718947, + "grad_norm": 8.5078125, + "learning_rate": 9.733222098628105e-06, + "loss": 2.9613, + "mean_token_accuracy": 0.4184129645152277, + "step": 1439 + }, + { + "epoch": 0.26696329254727474, + "grad_norm": 11.828125, + "learning_rate": 9.733036707452727e-06, + "loss": 2.6886, + "mean_token_accuracy": 0.44348598249397436, + "step": 1440 + }, + { + "epoch": 0.2671486837226548, + "grad_norm": 7.8671875, + "learning_rate": 9.732851316277346e-06, + "loss": 3.1343, + "mean_token_accuracy": 0.4058238490720804, + "step": 1441 + }, + { + "epoch": 0.26733407489803485, + "grad_norm": 7.24609375, + "learning_rate": 9.732665925101967e-06, + "loss": 2.6312, + "mean_token_accuracy": 0.44463832487309646, + "step": 1442 + }, + { + "epoch": 0.2675194660734149, + "grad_norm": 5.7734375, + "learning_rate": 9.732480533926585e-06, + "loss": 3.0352, + "mean_token_accuracy": 0.41620846282572316, + "step": 1443 + }, + { + "epoch": 0.26770485724879495, + "grad_norm": 7.19921875, + "learning_rate": 9.732295142751206e-06, + "loss": 2.8949, + "mean_token_accuracy": 0.4381679389312977, + "step": 1444 + }, + { + "epoch": 0.267890248424175, + "grad_norm": 10.4609375, + "learning_rate": 9.732109751575826e-06, + "loss": 2.3839, + "mean_token_accuracy": 0.4860557768924303, + "step": 1445 + }, + { + "epoch": 0.26807563959955505, + "grad_norm": 7.82421875, + "learning_rate": 9.731924360400445e-06, + "loss": 3.2139, + "mean_token_accuracy": 0.39789695057833857, + "step": 1446 + }, + { + "epoch": 0.2682610307749351, + "grad_norm": 6.6875, + "learning_rate": 9.731738969225066e-06, + "loss": 3.0859, + "mean_token_accuracy": 0.4082422901396764, + "step": 1447 + }, + { + "epoch": 0.26844642195031515, + "grad_norm": 11.6640625, + "learning_rate": 9.731553578049686e-06, + "loss": 2.4961, + "mean_token_accuracy": 0.4647495361781076, + "step": 1448 + }, + { + "epoch": 0.26863181312569523, + "grad_norm": 6.8828125, + "learning_rate": 9.731368186874307e-06, + "loss": 2.6263, + "mean_token_accuracy": 0.46710291493158834, + "step": 1449 + }, + { + "epoch": 0.26881720430107525, + "grad_norm": 6.015625, + "learning_rate": 9.731182795698925e-06, + "loss": 2.7822, + "mean_token_accuracy": 0.45086133860491384, + "step": 1450 + }, + { + "epoch": 0.26900259547645533, + "grad_norm": 7.38671875, + "learning_rate": 9.730997404523546e-06, + "loss": 2.6993, + "mean_token_accuracy": 0.4557204404021063, + "step": 1451 + }, + { + "epoch": 0.26918798665183535, + "grad_norm": 7.96484375, + "learning_rate": 9.730812013348165e-06, + "loss": 2.765, + "mean_token_accuracy": 0.43163357400722024, + "step": 1452 + }, + { + "epoch": 0.26937337782721543, + "grad_norm": 7.8125, + "learning_rate": 9.730626622172785e-06, + "loss": 3.3332, + "mean_token_accuracy": 0.38722490606548576, + "step": 1453 + }, + { + "epoch": 0.26955876900259546, + "grad_norm": 6.484375, + "learning_rate": 9.730441230997406e-06, + "loss": 2.83, + "mean_token_accuracy": 0.43372591006423983, + "step": 1454 + }, + { + "epoch": 0.26974416017797553, + "grad_norm": 7.3984375, + "learning_rate": 9.730255839822025e-06, + "loss": 2.6394, + "mean_token_accuracy": 0.4675309229305423, + "step": 1455 + }, + { + "epoch": 0.26992955135335556, + "grad_norm": 12.46875, + "learning_rate": 9.730070448646645e-06, + "loss": 2.6017, + "mean_token_accuracy": 0.4224, + "step": 1456 + }, + { + "epoch": 0.27011494252873564, + "grad_norm": 9.84375, + "learning_rate": 9.729885057471266e-06, + "loss": 2.7468, + "mean_token_accuracy": 0.4348642403235124, + "step": 1457 + }, + { + "epoch": 0.27030033370411566, + "grad_norm": 5.75390625, + "learning_rate": 9.729699666295886e-06, + "loss": 2.808, + "mean_token_accuracy": 0.4474291140957808, + "step": 1458 + }, + { + "epoch": 0.27048572487949574, + "grad_norm": 10.015625, + "learning_rate": 9.729514275120505e-06, + "loss": 3.0718, + "mean_token_accuracy": 0.4106830122591944, + "step": 1459 + }, + { + "epoch": 0.2706711160548758, + "grad_norm": 5.84765625, + "learning_rate": 9.729328883945125e-06, + "loss": 2.732, + "mean_token_accuracy": 0.43124665596575706, + "step": 1460 + }, + { + "epoch": 0.27085650723025584, + "grad_norm": 7.26953125, + "learning_rate": 9.729143492769744e-06, + "loss": 2.9076, + "mean_token_accuracy": 0.44059610873464883, + "step": 1461 + }, + { + "epoch": 0.2710418984056359, + "grad_norm": 9.65625, + "learning_rate": 9.728958101594365e-06, + "loss": 2.8904, + "mean_token_accuracy": 0.408169580690082, + "step": 1462 + }, + { + "epoch": 0.27122728958101594, + "grad_norm": 12.515625, + "learning_rate": 9.728772710418985e-06, + "loss": 2.5016, + "mean_token_accuracy": 0.472670715449563, + "step": 1463 + }, + { + "epoch": 0.271412680756396, + "grad_norm": 7.953125, + "learning_rate": 9.728587319243606e-06, + "loss": 2.7231, + "mean_token_accuracy": 0.433780385582565, + "step": 1464 + }, + { + "epoch": 0.27159807193177604, + "grad_norm": 6.52734375, + "learning_rate": 9.728401928068224e-06, + "loss": 3.2028, + "mean_token_accuracy": 0.3951143854207057, + "step": 1465 + }, + { + "epoch": 0.2717834631071561, + "grad_norm": 9.046875, + "learning_rate": 9.728216536892845e-06, + "loss": 2.8251, + "mean_token_accuracy": 0.44425935417734497, + "step": 1466 + }, + { + "epoch": 0.27196885428253614, + "grad_norm": 7.19140625, + "learning_rate": 9.728031145717465e-06, + "loss": 2.4967, + "mean_token_accuracy": 0.4601251497803222, + "step": 1467 + }, + { + "epoch": 0.2721542454579162, + "grad_norm": 6.2890625, + "learning_rate": 9.727845754542084e-06, + "loss": 2.8892, + "mean_token_accuracy": 0.43394886363636365, + "step": 1468 + }, + { + "epoch": 0.27233963663329624, + "grad_norm": 8.609375, + "learning_rate": 9.727660363366705e-06, + "loss": 2.4747, + "mean_token_accuracy": 0.4760112888052681, + "step": 1469 + }, + { + "epoch": 0.2725250278086763, + "grad_norm": 6.0, + "learning_rate": 9.727474972191323e-06, + "loss": 3.0878, + "mean_token_accuracy": 0.40199735290578753, + "step": 1470 + }, + { + "epoch": 0.27271041898405635, + "grad_norm": 6.12890625, + "learning_rate": 9.727289581015944e-06, + "loss": 2.8279, + "mean_token_accuracy": 0.4272195936543279, + "step": 1471 + }, + { + "epoch": 0.2728958101594364, + "grad_norm": 8.078125, + "learning_rate": 9.727104189840564e-06, + "loss": 3.0491, + "mean_token_accuracy": 0.377616555661275, + "step": 1472 + }, + { + "epoch": 0.27308120133481645, + "grad_norm": 6.65625, + "learning_rate": 9.726918798665185e-06, + "loss": 3.0419, + "mean_token_accuracy": 0.42346089850249585, + "step": 1473 + }, + { + "epoch": 0.2732665925101965, + "grad_norm": 5.8984375, + "learning_rate": 9.726733407489804e-06, + "loss": 3.0996, + "mean_token_accuracy": 0.4057115315098205, + "step": 1474 + }, + { + "epoch": 0.27345198368557655, + "grad_norm": 5.53515625, + "learning_rate": 9.726548016314424e-06, + "loss": 3.0038, + "mean_token_accuracy": 0.40247599797877714, + "step": 1475 + }, + { + "epoch": 0.27363737486095663, + "grad_norm": 6.890625, + "learning_rate": 9.726362625139045e-06, + "loss": 3.4582, + "mean_token_accuracy": 0.38304054946506405, + "step": 1476 + }, + { + "epoch": 0.27382276603633665, + "grad_norm": 5.87109375, + "learning_rate": 9.726177233963664e-06, + "loss": 2.8892, + "mean_token_accuracy": 0.43549280177187155, + "step": 1477 + }, + { + "epoch": 0.27400815721171673, + "grad_norm": 5.5234375, + "learning_rate": 9.725991842788284e-06, + "loss": 3.3496, + "mean_token_accuracy": 0.3818830242510699, + "step": 1478 + }, + { + "epoch": 0.27419354838709675, + "grad_norm": 5.57421875, + "learning_rate": 9.725806451612903e-06, + "loss": 3.038, + "mean_token_accuracy": 0.41250959324635456, + "step": 1479 + }, + { + "epoch": 0.27437893956247683, + "grad_norm": 8.9375, + "learning_rate": 9.725621060437525e-06, + "loss": 2.832, + "mean_token_accuracy": 0.4136631330977621, + "step": 1480 + }, + { + "epoch": 0.27456433073785685, + "grad_norm": 6.94921875, + "learning_rate": 9.725435669262144e-06, + "loss": 2.3572, + "mean_token_accuracy": 0.48827844096073636, + "step": 1481 + }, + { + "epoch": 0.27474972191323693, + "grad_norm": 6.62890625, + "learning_rate": 9.725250278086764e-06, + "loss": 3.1226, + "mean_token_accuracy": 0.40572369254147145, + "step": 1482 + }, + { + "epoch": 0.27493511308861696, + "grad_norm": 7.2890625, + "learning_rate": 9.725064886911385e-06, + "loss": 2.6334, + "mean_token_accuracy": 0.4463179628355127, + "step": 1483 + }, + { + "epoch": 0.27512050426399703, + "grad_norm": 6.63671875, + "learning_rate": 9.724879495736004e-06, + "loss": 2.5939, + "mean_token_accuracy": 0.45490981963927857, + "step": 1484 + }, + { + "epoch": 0.2753058954393771, + "grad_norm": 5.890625, + "learning_rate": 9.724694104560624e-06, + "loss": 3.0145, + "mean_token_accuracy": 0.4225153085256712, + "step": 1485 + }, + { + "epoch": 0.27549128661475714, + "grad_norm": 5.76171875, + "learning_rate": 9.724508713385243e-06, + "loss": 3.1575, + "mean_token_accuracy": 0.39855274144169217, + "step": 1486 + }, + { + "epoch": 0.2756766777901372, + "grad_norm": 8.90625, + "learning_rate": 9.724323322209863e-06, + "loss": 2.738, + "mean_token_accuracy": 0.43975542500899173, + "step": 1487 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 6.0390625, + "learning_rate": 9.724137931034484e-06, + "loss": 3.0126, + "mean_token_accuracy": 0.42025748817656333, + "step": 1488 + }, + { + "epoch": 0.2760474601408973, + "grad_norm": 5.6875, + "learning_rate": 9.723952539859104e-06, + "loss": 3.1654, + "mean_token_accuracy": 0.40727994705493054, + "step": 1489 + }, + { + "epoch": 0.27623285131627734, + "grad_norm": 8.203125, + "learning_rate": 9.723767148683723e-06, + "loss": 2.8417, + "mean_token_accuracy": 0.4226177787252185, + "step": 1490 + }, + { + "epoch": 0.2764182424916574, + "grad_norm": 5.9921875, + "learning_rate": 9.723581757508344e-06, + "loss": 3.0496, + "mean_token_accuracy": 0.4052637448421992, + "step": 1491 + }, + { + "epoch": 0.27660363366703744, + "grad_norm": 9.4609375, + "learning_rate": 9.723396366332964e-06, + "loss": 3.1385, + "mean_token_accuracy": 0.39399348944660295, + "step": 1492 + }, + { + "epoch": 0.2767890248424175, + "grad_norm": 6.3359375, + "learning_rate": 9.723210975157583e-06, + "loss": 2.9989, + "mean_token_accuracy": 0.4144329896907217, + "step": 1493 + }, + { + "epoch": 0.27697441601779754, + "grad_norm": 10.5546875, + "learning_rate": 9.723025583982203e-06, + "loss": 2.5572, + "mean_token_accuracy": 0.45578146438817513, + "step": 1494 + }, + { + "epoch": 0.2771598071931776, + "grad_norm": 5.9765625, + "learning_rate": 9.722840192806822e-06, + "loss": 3.2437, + "mean_token_accuracy": 0.39901112484548823, + "step": 1495 + }, + { + "epoch": 0.27734519836855764, + "grad_norm": 8.375, + "learning_rate": 9.722654801631443e-06, + "loss": 2.857, + "mean_token_accuracy": 0.43163303672139364, + "step": 1496 + }, + { + "epoch": 0.2775305895439377, + "grad_norm": 5.890625, + "learning_rate": 9.722469410456063e-06, + "loss": 2.6753, + "mean_token_accuracy": 0.44427023945267957, + "step": 1497 + }, + { + "epoch": 0.27771598071931775, + "grad_norm": 15.15625, + "learning_rate": 9.722284019280684e-06, + "loss": 3.1551, + "mean_token_accuracy": 0.41700504491202167, + "step": 1498 + }, + { + "epoch": 0.2779013718946978, + "grad_norm": 7.15234375, + "learning_rate": 9.722098628105302e-06, + "loss": 2.4456, + "mean_token_accuracy": 0.48917511647026585, + "step": 1499 + }, + { + "epoch": 0.27808676307007785, + "grad_norm": 8.2109375, + "learning_rate": 9.721913236929923e-06, + "loss": 2.8178, + "mean_token_accuracy": 0.4240829592942269, + "step": 1500 + }, + { + "epoch": 0.2782721542454579, + "grad_norm": 7.01171875, + "learning_rate": 9.721727845754543e-06, + "loss": 2.7176, + "mean_token_accuracy": 0.4411332941867293, + "step": 1501 + }, + { + "epoch": 0.27845754542083795, + "grad_norm": 6.453125, + "learning_rate": 9.721542454579162e-06, + "loss": 2.6789, + "mean_token_accuracy": 0.46482445601388334, + "step": 1502 + }, + { + "epoch": 0.278642936596218, + "grad_norm": 7.99609375, + "learning_rate": 9.721357063403783e-06, + "loss": 2.6514, + "mean_token_accuracy": 0.4524929444967074, + "step": 1503 + }, + { + "epoch": 0.27882832777159805, + "grad_norm": 11.296875, + "learning_rate": 9.721171672228402e-06, + "loss": 2.9413, + "mean_token_accuracy": 0.41957160856782866, + "step": 1504 + }, + { + "epoch": 0.27901371894697813, + "grad_norm": 6.66015625, + "learning_rate": 9.720986281053024e-06, + "loss": 3.2105, + "mean_token_accuracy": 0.38066789215686275, + "step": 1505 + }, + { + "epoch": 0.27919911012235815, + "grad_norm": 5.65234375, + "learning_rate": 9.720800889877643e-06, + "loss": 2.9903, + "mean_token_accuracy": 0.41111873713109126, + "step": 1506 + }, + { + "epoch": 0.27938450129773823, + "grad_norm": 8.875, + "learning_rate": 9.720615498702263e-06, + "loss": 3.3211, + "mean_token_accuracy": 0.3888682285855956, + "step": 1507 + }, + { + "epoch": 0.27956989247311825, + "grad_norm": 5.9765625, + "learning_rate": 9.720430107526882e-06, + "loss": 3.24, + "mean_token_accuracy": 0.4085076869726043, + "step": 1508 + }, + { + "epoch": 0.27975528364849833, + "grad_norm": 4.9296875, + "learning_rate": 9.720244716351502e-06, + "loss": 2.7364, + "mean_token_accuracy": 0.4503887205165371, + "step": 1509 + }, + { + "epoch": 0.2799406748238784, + "grad_norm": 9.65625, + "learning_rate": 9.720059325176123e-06, + "loss": 2.9812, + "mean_token_accuracy": 0.421281390856407, + "step": 1510 + }, + { + "epoch": 0.28012606599925843, + "grad_norm": 8.78125, + "learning_rate": 9.719873934000742e-06, + "loss": 2.9257, + "mean_token_accuracy": 0.41490618029130033, + "step": 1511 + }, + { + "epoch": 0.2803114571746385, + "grad_norm": 8.7578125, + "learning_rate": 9.719688542825362e-06, + "loss": 2.6762, + "mean_token_accuracy": 0.44573863636363636, + "step": 1512 + }, + { + "epoch": 0.28049684835001854, + "grad_norm": 8.4765625, + "learning_rate": 9.719503151649983e-06, + "loss": 2.224, + "mean_token_accuracy": 0.5133496560568963, + "step": 1513 + }, + { + "epoch": 0.2806822395253986, + "grad_norm": 8.9453125, + "learning_rate": 9.719317760474603e-06, + "loss": 2.6199, + "mean_token_accuracy": 0.436931473620376, + "step": 1514 + }, + { + "epoch": 0.28086763070077864, + "grad_norm": 5.90234375, + "learning_rate": 9.719132369299222e-06, + "loss": 2.9279, + "mean_token_accuracy": 0.4355122263065367, + "step": 1515 + }, + { + "epoch": 0.2810530218761587, + "grad_norm": 6.04296875, + "learning_rate": 9.718946978123842e-06, + "loss": 2.8352, + "mean_token_accuracy": 0.4333778966131907, + "step": 1516 + }, + { + "epoch": 0.28123841305153874, + "grad_norm": 7.5703125, + "learning_rate": 9.718761586948461e-06, + "loss": 3.4339, + "mean_token_accuracy": 0.37950404164954105, + "step": 1517 + }, + { + "epoch": 0.2814238042269188, + "grad_norm": 7.578125, + "learning_rate": 9.718576195773082e-06, + "loss": 2.4594, + "mean_token_accuracy": 0.4937257079400333, + "step": 1518 + }, + { + "epoch": 0.28160919540229884, + "grad_norm": 8.140625, + "learning_rate": 9.718390804597702e-06, + "loss": 2.9719, + "mean_token_accuracy": 0.4154550658271322, + "step": 1519 + }, + { + "epoch": 0.2817945865776789, + "grad_norm": 7.875, + "learning_rate": 9.718205413422321e-06, + "loss": 2.5271, + "mean_token_accuracy": 0.461518572782901, + "step": 1520 + }, + { + "epoch": 0.28197997775305894, + "grad_norm": 8.65625, + "learning_rate": 9.718020022246943e-06, + "loss": 2.6488, + "mean_token_accuracy": 0.45604468679345656, + "step": 1521 + }, + { + "epoch": 0.282165368928439, + "grad_norm": 6.9140625, + "learning_rate": 9.717834631071562e-06, + "loss": 3.4569, + "mean_token_accuracy": 0.36576034977455935, + "step": 1522 + }, + { + "epoch": 0.28235076010381904, + "grad_norm": 7.4296875, + "learning_rate": 9.717649239896182e-06, + "loss": 2.6898, + "mean_token_accuracy": 0.45914967346394775, + "step": 1523 + }, + { + "epoch": 0.2825361512791991, + "grad_norm": 7.2421875, + "learning_rate": 9.717463848720801e-06, + "loss": 3.1185, + "mean_token_accuracy": 0.40501277139208175, + "step": 1524 + }, + { + "epoch": 0.28272154245457914, + "grad_norm": 8.9296875, + "learning_rate": 9.717278457545422e-06, + "loss": 2.6993, + "mean_token_accuracy": 0.44654151189639596, + "step": 1525 + }, + { + "epoch": 0.2829069336299592, + "grad_norm": 7.52734375, + "learning_rate": 9.717093066370042e-06, + "loss": 2.9636, + "mean_token_accuracy": 0.4294286103913814, + "step": 1526 + }, + { + "epoch": 0.28309232480533925, + "grad_norm": 6.4765625, + "learning_rate": 9.716907675194661e-06, + "loss": 2.7059, + "mean_token_accuracy": 0.4510593852581319, + "step": 1527 + }, + { + "epoch": 0.2832777159807193, + "grad_norm": 6.90625, + "learning_rate": 9.716722284019281e-06, + "loss": 2.9323, + "mean_token_accuracy": 0.4316329213778596, + "step": 1528 + }, + { + "epoch": 0.28346310715609935, + "grad_norm": 7.1875, + "learning_rate": 9.716536892843902e-06, + "loss": 3.0733, + "mean_token_accuracy": 0.40403549288926704, + "step": 1529 + }, + { + "epoch": 0.2836484983314794, + "grad_norm": 7.52734375, + "learning_rate": 9.716351501668522e-06, + "loss": 2.7215, + "mean_token_accuracy": 0.4451308730287758, + "step": 1530 + }, + { + "epoch": 0.28383388950685945, + "grad_norm": 9.421875, + "learning_rate": 9.716166110493141e-06, + "loss": 3.0258, + "mean_token_accuracy": 0.42528989508558807, + "step": 1531 + }, + { + "epoch": 0.2840192806822395, + "grad_norm": 7.984375, + "learning_rate": 9.715980719317762e-06, + "loss": 2.7971, + "mean_token_accuracy": 0.45991451884470924, + "step": 1532 + }, + { + "epoch": 0.28420467185761955, + "grad_norm": 10.4140625, + "learning_rate": 9.71579532814238e-06, + "loss": 2.821, + "mean_token_accuracy": 0.42618769263471146, + "step": 1533 + }, + { + "epoch": 0.28439006303299963, + "grad_norm": 9.3671875, + "learning_rate": 9.715609936967001e-06, + "loss": 2.8666, + "mean_token_accuracy": 0.4384868072480661, + "step": 1534 + }, + { + "epoch": 0.2845754542083797, + "grad_norm": 8.1875, + "learning_rate": 9.715424545791622e-06, + "loss": 2.8801, + "mean_token_accuracy": 0.44099762959909306, + "step": 1535 + }, + { + "epoch": 0.28476084538375973, + "grad_norm": 6.37890625, + "learning_rate": 9.71523915461624e-06, + "loss": 2.8135, + "mean_token_accuracy": 0.43271767810026385, + "step": 1536 + }, + { + "epoch": 0.2849462365591398, + "grad_norm": 7.80859375, + "learning_rate": 9.71505376344086e-06, + "loss": 2.7241, + "mean_token_accuracy": 0.4506398537477148, + "step": 1537 + }, + { + "epoch": 0.28513162773451983, + "grad_norm": 9.359375, + "learning_rate": 9.714868372265481e-06, + "loss": 3.1724, + "mean_token_accuracy": 0.40512629090456637, + "step": 1538 + }, + { + "epoch": 0.2853170189098999, + "grad_norm": 8.6875, + "learning_rate": 9.714682981090102e-06, + "loss": 3.2547, + "mean_token_accuracy": 0.39429928741092635, + "step": 1539 + }, + { + "epoch": 0.28550241008527993, + "grad_norm": 7.0703125, + "learning_rate": 9.71449758991472e-06, + "loss": 2.5974, + "mean_token_accuracy": 0.4521172638436482, + "step": 1540 + }, + { + "epoch": 0.28568780126066, + "grad_norm": 9.75, + "learning_rate": 9.714312198739341e-06, + "loss": 1.9374, + "mean_token_accuracy": 0.5431305715783954, + "step": 1541 + }, + { + "epoch": 0.28587319243604004, + "grad_norm": 6.546875, + "learning_rate": 9.71412680756396e-06, + "loss": 2.5831, + "mean_token_accuracy": 0.49724061810154524, + "step": 1542 + }, + { + "epoch": 0.2860585836114201, + "grad_norm": 7.0234375, + "learning_rate": 9.71394141638858e-06, + "loss": 3.1661, + "mean_token_accuracy": 0.38490813648293964, + "step": 1543 + }, + { + "epoch": 0.28624397478680014, + "grad_norm": 6.5859375, + "learning_rate": 9.7137560252132e-06, + "loss": 2.9869, + "mean_token_accuracy": 0.41638769328869757, + "step": 1544 + }, + { + "epoch": 0.2864293659621802, + "grad_norm": 6.390625, + "learning_rate": 9.713570634037821e-06, + "loss": 2.7195, + "mean_token_accuracy": 0.4671212859230395, + "step": 1545 + }, + { + "epoch": 0.28661475713756024, + "grad_norm": 5.890625, + "learning_rate": 9.71338524286244e-06, + "loss": 3.8243, + "mean_token_accuracy": 0.3476795436868032, + "step": 1546 + }, + { + "epoch": 0.2868001483129403, + "grad_norm": 7.81640625, + "learning_rate": 9.71319985168706e-06, + "loss": 2.8274, + "mean_token_accuracy": 0.41514726507713884, + "step": 1547 + }, + { + "epoch": 0.28698553948832034, + "grad_norm": 7.2109375, + "learning_rate": 9.713014460511681e-06, + "loss": 2.7271, + "mean_token_accuracy": 0.4655148583275743, + "step": 1548 + }, + { + "epoch": 0.2871709306637004, + "grad_norm": 7.57421875, + "learning_rate": 9.7128290693363e-06, + "loss": 2.549, + "mean_token_accuracy": 0.4806169237182159, + "step": 1549 + }, + { + "epoch": 0.28735632183908044, + "grad_norm": 6.91796875, + "learning_rate": 9.71264367816092e-06, + "loss": 2.791, + "mean_token_accuracy": 0.44431065623118604, + "step": 1550 + }, + { + "epoch": 0.2875417130144605, + "grad_norm": 5.5390625, + "learning_rate": 9.71245828698554e-06, + "loss": 3.0578, + "mean_token_accuracy": 0.40838820498139133, + "step": 1551 + }, + { + "epoch": 0.28772710418984054, + "grad_norm": 5.5078125, + "learning_rate": 9.71227289581016e-06, + "loss": 2.9479, + "mean_token_accuracy": 0.4099685675797036, + "step": 1552 + }, + { + "epoch": 0.2879124953652206, + "grad_norm": 7.28125, + "learning_rate": 9.71208750463478e-06, + "loss": 2.7056, + "mean_token_accuracy": 0.4480243161094225, + "step": 1553 + }, + { + "epoch": 0.28809788654060065, + "grad_norm": 6.79296875, + "learning_rate": 9.7119021134594e-06, + "loss": 2.9637, + "mean_token_accuracy": 0.42044134727061555, + "step": 1554 + }, + { + "epoch": 0.2882832777159807, + "grad_norm": 5.94921875, + "learning_rate": 9.71171672228402e-06, + "loss": 3.207, + "mean_token_accuracy": 0.396993152724025, + "step": 1555 + }, + { + "epoch": 0.28846866889136075, + "grad_norm": 8.9453125, + "learning_rate": 9.71153133110864e-06, + "loss": 2.4577, + "mean_token_accuracy": 0.48194807190044553, + "step": 1556 + }, + { + "epoch": 0.2886540600667408, + "grad_norm": 5.97265625, + "learning_rate": 9.71134593993326e-06, + "loss": 2.6438, + "mean_token_accuracy": 0.453713670613563, + "step": 1557 + }, + { + "epoch": 0.28883945124212085, + "grad_norm": 7.21484375, + "learning_rate": 9.71116054875788e-06, + "loss": 2.8583, + "mean_token_accuracy": 0.41842634489693314, + "step": 1558 + }, + { + "epoch": 0.2890248424175009, + "grad_norm": 5.4765625, + "learning_rate": 9.7109751575825e-06, + "loss": 2.8726, + "mean_token_accuracy": 0.430997526793075, + "step": 1559 + }, + { + "epoch": 0.289210233592881, + "grad_norm": 4.859375, + "learning_rate": 9.710789766407119e-06, + "loss": 2.675, + "mean_token_accuracy": 0.4443155452436195, + "step": 1560 + }, + { + "epoch": 0.28939562476826103, + "grad_norm": 5.77734375, + "learning_rate": 9.71060437523174e-06, + "loss": 2.7436, + "mean_token_accuracy": 0.46791685494803437, + "step": 1561 + }, + { + "epoch": 0.2895810159436411, + "grad_norm": 6.62890625, + "learning_rate": 9.71041898405636e-06, + "loss": 2.7468, + "mean_token_accuracy": 0.4430458109781263, + "step": 1562 + }, + { + "epoch": 0.28976640711902113, + "grad_norm": 9.9140625, + "learning_rate": 9.71023359288098e-06, + "loss": 2.8519, + "mean_token_accuracy": 0.41906180193596426, + "step": 1563 + }, + { + "epoch": 0.2899517982944012, + "grad_norm": 6.0703125, + "learning_rate": 9.7100482017056e-06, + "loss": 3.0702, + "mean_token_accuracy": 0.41227700519735916, + "step": 1564 + }, + { + "epoch": 0.29013718946978123, + "grad_norm": 6.65234375, + "learning_rate": 9.70986281053022e-06, + "loss": 2.5877, + "mean_token_accuracy": 0.4474368982565704, + "step": 1565 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 5.2734375, + "learning_rate": 9.70967741935484e-06, + "loss": 2.9484, + "mean_token_accuracy": 0.4313041049986037, + "step": 1566 + }, + { + "epoch": 0.29050797182054133, + "grad_norm": 6.99609375, + "learning_rate": 9.709492028179459e-06, + "loss": 2.7334, + "mean_token_accuracy": 0.4450015669069257, + "step": 1567 + }, + { + "epoch": 0.2906933629959214, + "grad_norm": 6.76953125, + "learning_rate": 9.709306637004079e-06, + "loss": 2.7194, + "mean_token_accuracy": 0.4512799339388935, + "step": 1568 + }, + { + "epoch": 0.29087875417130143, + "grad_norm": 6.90625, + "learning_rate": 9.7091212458287e-06, + "loss": 2.6758, + "mean_token_accuracy": 0.4457377610407395, + "step": 1569 + }, + { + "epoch": 0.2910641453466815, + "grad_norm": 5.21875, + "learning_rate": 9.70893585465332e-06, + "loss": 3.2338, + "mean_token_accuracy": 0.3921968787515006, + "step": 1570 + }, + { + "epoch": 0.29124953652206154, + "grad_norm": 7.515625, + "learning_rate": 9.708750463477939e-06, + "loss": 2.917, + "mean_token_accuracy": 0.42592592592592593, + "step": 1571 + }, + { + "epoch": 0.2914349276974416, + "grad_norm": 6.25390625, + "learning_rate": 9.70856507230256e-06, + "loss": 3.2265, + "mean_token_accuracy": 0.4004566210045662, + "step": 1572 + }, + { + "epoch": 0.29162031887282164, + "grad_norm": 7.76953125, + "learning_rate": 9.70837968112718e-06, + "loss": 2.3289, + "mean_token_accuracy": 0.49734349734349736, + "step": 1573 + }, + { + "epoch": 0.2918057100482017, + "grad_norm": 6.171875, + "learning_rate": 9.708194289951799e-06, + "loss": 3.2519, + "mean_token_accuracy": 0.3765662490002666, + "step": 1574 + }, + { + "epoch": 0.29199110122358174, + "grad_norm": 8.2109375, + "learning_rate": 9.708008898776419e-06, + "loss": 1.9985, + "mean_token_accuracy": 0.5430408381950232, + "step": 1575 + }, + { + "epoch": 0.2921764923989618, + "grad_norm": 7.39453125, + "learning_rate": 9.707823507601038e-06, + "loss": 2.8638, + "mean_token_accuracy": 0.4476762906514973, + "step": 1576 + }, + { + "epoch": 0.29236188357434184, + "grad_norm": 8.390625, + "learning_rate": 9.70763811642566e-06, + "loss": 2.8807, + "mean_token_accuracy": 0.4274538745387454, + "step": 1577 + }, + { + "epoch": 0.2925472747497219, + "grad_norm": 5.953125, + "learning_rate": 9.707452725250279e-06, + "loss": 2.8645, + "mean_token_accuracy": 0.428875, + "step": 1578 + }, + { + "epoch": 0.29273266592510194, + "grad_norm": 5.9765625, + "learning_rate": 9.7072673340749e-06, + "loss": 2.8208, + "mean_token_accuracy": 0.42228277958285293, + "step": 1579 + }, + { + "epoch": 0.292918057100482, + "grad_norm": 5.3359375, + "learning_rate": 9.707081942899518e-06, + "loss": 3.3632, + "mean_token_accuracy": 0.37762669962917184, + "step": 1580 + }, + { + "epoch": 0.29310344827586204, + "grad_norm": 7.4375, + "learning_rate": 9.706896551724139e-06, + "loss": 2.9174, + "mean_token_accuracy": 0.4350547730829421, + "step": 1581 + }, + { + "epoch": 0.2932888394512421, + "grad_norm": 6.2421875, + "learning_rate": 9.70671116054876e-06, + "loss": 3.2318, + "mean_token_accuracy": 0.3927633498686898, + "step": 1582 + }, + { + "epoch": 0.29347423062662215, + "grad_norm": 7.203125, + "learning_rate": 9.706525769373378e-06, + "loss": 2.6148, + "mean_token_accuracy": 0.44075, + "step": 1583 + }, + { + "epoch": 0.2936596218020022, + "grad_norm": 6.8359375, + "learning_rate": 9.706340378197998e-06, + "loss": 2.794, + "mean_token_accuracy": 0.4410968577144875, + "step": 1584 + }, + { + "epoch": 0.2938450129773823, + "grad_norm": 5.83984375, + "learning_rate": 9.706154987022619e-06, + "loss": 2.6049, + "mean_token_accuracy": 0.4608023072889355, + "step": 1585 + }, + { + "epoch": 0.2940304041527623, + "grad_norm": 7.8515625, + "learning_rate": 9.70596959584724e-06, + "loss": 2.6257, + "mean_token_accuracy": 0.44883203559510565, + "step": 1586 + }, + { + "epoch": 0.2942157953281424, + "grad_norm": 6.53125, + "learning_rate": 9.705784204671858e-06, + "loss": 2.7849, + "mean_token_accuracy": 0.44210905596925293, + "step": 1587 + }, + { + "epoch": 0.2944011865035224, + "grad_norm": 6.75390625, + "learning_rate": 9.705598813496479e-06, + "loss": 2.6874, + "mean_token_accuracy": 0.43751891074130106, + "step": 1588 + }, + { + "epoch": 0.2945865776789025, + "grad_norm": 5.18359375, + "learning_rate": 9.705413422321098e-06, + "loss": 3.0166, + "mean_token_accuracy": 0.4035426365391166, + "step": 1589 + }, + { + "epoch": 0.29477196885428253, + "grad_norm": 9.4609375, + "learning_rate": 9.705228031145718e-06, + "loss": 3.122, + "mean_token_accuracy": 0.3881394756935607, + "step": 1590 + }, + { + "epoch": 0.2949573600296626, + "grad_norm": 7.8046875, + "learning_rate": 9.705042639970339e-06, + "loss": 3.1501, + "mean_token_accuracy": 0.4048533251420673, + "step": 1591 + }, + { + "epoch": 0.29514275120504263, + "grad_norm": 6.26953125, + "learning_rate": 9.704857248794957e-06, + "loss": 3.2137, + "mean_token_accuracy": 0.3904655612244898, + "step": 1592 + }, + { + "epoch": 0.2953281423804227, + "grad_norm": 8.1015625, + "learning_rate": 9.704671857619578e-06, + "loss": 2.8807, + "mean_token_accuracy": 0.4326157860404436, + "step": 1593 + }, + { + "epoch": 0.29551353355580273, + "grad_norm": 7.43359375, + "learning_rate": 9.704486466444198e-06, + "loss": 2.2527, + "mean_token_accuracy": 0.5080840743734842, + "step": 1594 + }, + { + "epoch": 0.2956989247311828, + "grad_norm": 11.875, + "learning_rate": 9.704301075268819e-06, + "loss": 2.7885, + "mean_token_accuracy": 0.42993070638121833, + "step": 1595 + }, + { + "epoch": 0.29588431590656283, + "grad_norm": 9.015625, + "learning_rate": 9.704115684093438e-06, + "loss": 3.3049, + "mean_token_accuracy": 0.36860940695296524, + "step": 1596 + }, + { + "epoch": 0.2960697070819429, + "grad_norm": 9.6875, + "learning_rate": 9.703930292918058e-06, + "loss": 3.0422, + "mean_token_accuracy": 0.3937613019891501, + "step": 1597 + }, + { + "epoch": 0.29625509825732294, + "grad_norm": 6.13671875, + "learning_rate": 9.703744901742677e-06, + "loss": 3.2141, + "mean_token_accuracy": 0.3995263026676639, + "step": 1598 + }, + { + "epoch": 0.296440489432703, + "grad_norm": 6.0546875, + "learning_rate": 9.703559510567297e-06, + "loss": 2.4137, + "mean_token_accuracy": 0.4966857142857143, + "step": 1599 + }, + { + "epoch": 0.29662588060808304, + "grad_norm": 6.80859375, + "learning_rate": 9.703374119391918e-06, + "loss": 2.7705, + "mean_token_accuracy": 0.43312744232950967, + "step": 1600 + }, + { + "epoch": 0.2968112717834631, + "grad_norm": 7.23046875, + "learning_rate": 9.703188728216538e-06, + "loss": 2.9913, + "mean_token_accuracy": 0.41335978835978837, + "step": 1601 + }, + { + "epoch": 0.29699666295884314, + "grad_norm": 6.15234375, + "learning_rate": 9.703003337041159e-06, + "loss": 2.6985, + "mean_token_accuracy": 0.4531224786186865, + "step": 1602 + }, + { + "epoch": 0.2971820541342232, + "grad_norm": 5.7265625, + "learning_rate": 9.702817945865778e-06, + "loss": 3.1851, + "mean_token_accuracy": 0.4013269557167104, + "step": 1603 + }, + { + "epoch": 0.29736744530960324, + "grad_norm": 7.75390625, + "learning_rate": 9.702632554690398e-06, + "loss": 2.8564, + "mean_token_accuracy": 0.43471473176270226, + "step": 1604 + }, + { + "epoch": 0.2975528364849833, + "grad_norm": 7.609375, + "learning_rate": 9.702447163515017e-06, + "loss": 2.7967, + "mean_token_accuracy": 0.4384229779162715, + "step": 1605 + }, + { + "epoch": 0.29773822766036334, + "grad_norm": 4.98828125, + "learning_rate": 9.702261772339637e-06, + "loss": 3.0895, + "mean_token_accuracy": 0.40297766749379654, + "step": 1606 + }, + { + "epoch": 0.2979236188357434, + "grad_norm": 6.76171875, + "learning_rate": 9.702076381164258e-06, + "loss": 3.1842, + "mean_token_accuracy": 0.39545519508360727, + "step": 1607 + }, + { + "epoch": 0.29810901001112344, + "grad_norm": 7.15234375, + "learning_rate": 9.701890989988877e-06, + "loss": 2.9224, + "mean_token_accuracy": 0.3972678349661929, + "step": 1608 + }, + { + "epoch": 0.2982944011865035, + "grad_norm": 5.2265625, + "learning_rate": 9.701705598813497e-06, + "loss": 2.869, + "mean_token_accuracy": 0.43312101910828027, + "step": 1609 + }, + { + "epoch": 0.2984797923618836, + "grad_norm": 10.0625, + "learning_rate": 9.701520207638118e-06, + "loss": 2.9442, + "mean_token_accuracy": 0.42415384615384616, + "step": 1610 + }, + { + "epoch": 0.2986651835372636, + "grad_norm": 9.90625, + "learning_rate": 9.701334816462738e-06, + "loss": 2.9575, + "mean_token_accuracy": 0.43625827814569534, + "step": 1611 + }, + { + "epoch": 0.2988505747126437, + "grad_norm": 7.046875, + "learning_rate": 9.701149425287357e-06, + "loss": 2.6565, + "mean_token_accuracy": 0.4426437429537768, + "step": 1612 + }, + { + "epoch": 0.2990359658880237, + "grad_norm": 7.44921875, + "learning_rate": 9.700964034111977e-06, + "loss": 3.2336, + "mean_token_accuracy": 0.3844438249233343, + "step": 1613 + }, + { + "epoch": 0.2992213570634038, + "grad_norm": 8.1171875, + "learning_rate": 9.700778642936596e-06, + "loss": 3.1014, + "mean_token_accuracy": 0.39703597466236407, + "step": 1614 + }, + { + "epoch": 0.2994067482387838, + "grad_norm": 9.6171875, + "learning_rate": 9.700593251761217e-06, + "loss": 2.1395, + "mean_token_accuracy": 0.5212129840546698, + "step": 1615 + }, + { + "epoch": 0.2995921394141639, + "grad_norm": 8.3984375, + "learning_rate": 9.700407860585837e-06, + "loss": 2.5254, + "mean_token_accuracy": 0.49286624203821655, + "step": 1616 + }, + { + "epoch": 0.29977753058954393, + "grad_norm": 6.72265625, + "learning_rate": 9.700222469410456e-06, + "loss": 3.1263, + "mean_token_accuracy": 0.40148428405122233, + "step": 1617 + }, + { + "epoch": 0.299962921764924, + "grad_norm": 8.6015625, + "learning_rate": 9.700037078235077e-06, + "loss": 2.5444, + "mean_token_accuracy": 0.4650430146613353, + "step": 1618 + }, + { + "epoch": 0.30014831294030403, + "grad_norm": 5.89453125, + "learning_rate": 9.699851687059697e-06, + "loss": 3.2236, + "mean_token_accuracy": 0.3995475752863, + "step": 1619 + }, + { + "epoch": 0.3003337041156841, + "grad_norm": 5.81640625, + "learning_rate": 9.699666295884318e-06, + "loss": 3.2149, + "mean_token_accuracy": 0.389470664180175, + "step": 1620 + }, + { + "epoch": 0.30051909529106413, + "grad_norm": 6.26171875, + "learning_rate": 9.699480904708936e-06, + "loss": 2.8352, + "mean_token_accuracy": 0.4374616799509503, + "step": 1621 + }, + { + "epoch": 0.3007044864664442, + "grad_norm": 10.703125, + "learning_rate": 9.699295513533557e-06, + "loss": 2.7288, + "mean_token_accuracy": 0.42482803799541435, + "step": 1622 + }, + { + "epoch": 0.30088987764182423, + "grad_norm": 5.34765625, + "learning_rate": 9.699110122358176e-06, + "loss": 3.3044, + "mean_token_accuracy": 0.3780892103676914, + "step": 1623 + }, + { + "epoch": 0.3010752688172043, + "grad_norm": 6.02734375, + "learning_rate": 9.698924731182796e-06, + "loss": 3.2094, + "mean_token_accuracy": 0.40568475452196384, + "step": 1624 + }, + { + "epoch": 0.30126065999258433, + "grad_norm": 6.4296875, + "learning_rate": 9.698739340007417e-06, + "loss": 2.6631, + "mean_token_accuracy": 0.4592358604091456, + "step": 1625 + }, + { + "epoch": 0.3014460511679644, + "grad_norm": 5.2109375, + "learning_rate": 9.698553948832037e-06, + "loss": 3.0968, + "mean_token_accuracy": 0.42143127603180613, + "step": 1626 + }, + { + "epoch": 0.30163144234334444, + "grad_norm": 5.97265625, + "learning_rate": 9.698368557656656e-06, + "loss": 3.0453, + "mean_token_accuracy": 0.40907880724174656, + "step": 1627 + }, + { + "epoch": 0.3018168335187245, + "grad_norm": 23.8125, + "learning_rate": 9.698183166481276e-06, + "loss": 3.1499, + "mean_token_accuracy": 0.36487980105001383, + "step": 1628 + }, + { + "epoch": 0.30200222469410454, + "grad_norm": 9.4609375, + "learning_rate": 9.697997775305897e-06, + "loss": 2.9914, + "mean_token_accuracy": 0.4153034868704262, + "step": 1629 + }, + { + "epoch": 0.3021876158694846, + "grad_norm": 8.40625, + "learning_rate": 9.697812384130516e-06, + "loss": 3.042, + "mean_token_accuracy": 0.41618709440431, + "step": 1630 + }, + { + "epoch": 0.30237300704486464, + "grad_norm": 6.01171875, + "learning_rate": 9.697626992955136e-06, + "loss": 2.9885, + "mean_token_accuracy": 0.39879502627868224, + "step": 1631 + }, + { + "epoch": 0.3025583982202447, + "grad_norm": 20.09375, + "learning_rate": 9.697441601779755e-06, + "loss": 3.3641, + "mean_token_accuracy": 0.36503928170594835, + "step": 1632 + }, + { + "epoch": 0.30274378939562474, + "grad_norm": 9.0390625, + "learning_rate": 9.697256210604375e-06, + "loss": 3.5279, + "mean_token_accuracy": 0.38456375838926177, + "step": 1633 + }, + { + "epoch": 0.3029291805710048, + "grad_norm": 7.66015625, + "learning_rate": 9.697070819428996e-06, + "loss": 3.0595, + "mean_token_accuracy": 0.39925612415031425, + "step": 1634 + }, + { + "epoch": 0.3031145717463849, + "grad_norm": 6.16796875, + "learning_rate": 9.696885428253616e-06, + "loss": 3.0098, + "mean_token_accuracy": 0.41105289421157687, + "step": 1635 + }, + { + "epoch": 0.3032999629217649, + "grad_norm": 7.1328125, + "learning_rate": 9.696700037078235e-06, + "loss": 3.1967, + "mean_token_accuracy": 0.3983202533388407, + "step": 1636 + }, + { + "epoch": 0.303485354097145, + "grad_norm": 12.1953125, + "learning_rate": 9.696514645902856e-06, + "loss": 2.3336, + "mean_token_accuracy": 0.4772754965273547, + "step": 1637 + }, + { + "epoch": 0.303670745272525, + "grad_norm": 11.484375, + "learning_rate": 9.696329254727476e-06, + "loss": 2.2429, + "mean_token_accuracy": 0.48963903743315507, + "step": 1638 + }, + { + "epoch": 0.3038561364479051, + "grad_norm": 6.25, + "learning_rate": 9.696143863552095e-06, + "loss": 3.1508, + "mean_token_accuracy": 0.381734404536862, + "step": 1639 + }, + { + "epoch": 0.3040415276232851, + "grad_norm": 6.671875, + "learning_rate": 9.695958472376715e-06, + "loss": 2.8728, + "mean_token_accuracy": 0.41678843968176654, + "step": 1640 + }, + { + "epoch": 0.3042269187986652, + "grad_norm": 7.46875, + "learning_rate": 9.695773081201334e-06, + "loss": 2.5174, + "mean_token_accuracy": 0.45927700348432055, + "step": 1641 + }, + { + "epoch": 0.3044123099740452, + "grad_norm": 6.21484375, + "learning_rate": 9.695587690025956e-06, + "loss": 2.865, + "mean_token_accuracy": 0.4286121808586507, + "step": 1642 + }, + { + "epoch": 0.3045977011494253, + "grad_norm": 8.734375, + "learning_rate": 9.695402298850575e-06, + "loss": 2.7297, + "mean_token_accuracy": 0.4509151414309484, + "step": 1643 + }, + { + "epoch": 0.3047830923248053, + "grad_norm": 6.77734375, + "learning_rate": 9.695216907675196e-06, + "loss": 2.6571, + "mean_token_accuracy": 0.44252054794520546, + "step": 1644 + }, + { + "epoch": 0.3049684835001854, + "grad_norm": 5.8359375, + "learning_rate": 9.695031516499816e-06, + "loss": 2.7976, + "mean_token_accuracy": 0.42959958126144987, + "step": 1645 + }, + { + "epoch": 0.30515387467556543, + "grad_norm": 5.12109375, + "learning_rate": 9.694846125324435e-06, + "loss": 2.9451, + "mean_token_accuracy": 0.4317892593535251, + "step": 1646 + }, + { + "epoch": 0.3053392658509455, + "grad_norm": 6.2890625, + "learning_rate": 9.694660734149056e-06, + "loss": 2.6259, + "mean_token_accuracy": 0.4798870853916726, + "step": 1647 + }, + { + "epoch": 0.30552465702632553, + "grad_norm": 6.23828125, + "learning_rate": 9.694475342973674e-06, + "loss": 2.114, + "mean_token_accuracy": 0.5342583321413246, + "step": 1648 + }, + { + "epoch": 0.3057100482017056, + "grad_norm": 5.484375, + "learning_rate": 9.694289951798295e-06, + "loss": 3.1981, + "mean_token_accuracy": 0.4018670565740637, + "step": 1649 + }, + { + "epoch": 0.30589543937708563, + "grad_norm": 6.02734375, + "learning_rate": 9.694104560622915e-06, + "loss": 2.872, + "mean_token_accuracy": 0.4292744479495268, + "step": 1650 + }, + { + "epoch": 0.3060808305524657, + "grad_norm": 7.1328125, + "learning_rate": 9.693919169447536e-06, + "loss": 2.9167, + "mean_token_accuracy": 0.4280237937871778, + "step": 1651 + }, + { + "epoch": 0.30626622172784573, + "grad_norm": 5.359375, + "learning_rate": 9.693733778272155e-06, + "loss": 3.0145, + "mean_token_accuracy": 0.4054091158704009, + "step": 1652 + }, + { + "epoch": 0.3064516129032258, + "grad_norm": 11.6875, + "learning_rate": 9.693548387096775e-06, + "loss": 2.6207, + "mean_token_accuracy": 0.4476362094551622, + "step": 1653 + }, + { + "epoch": 0.30663700407860583, + "grad_norm": 7.13671875, + "learning_rate": 9.693362995921396e-06, + "loss": 2.897, + "mean_token_accuracy": 0.4419168941461935, + "step": 1654 + }, + { + "epoch": 0.3068223952539859, + "grad_norm": 6.25, + "learning_rate": 9.693177604746014e-06, + "loss": 2.8809, + "mean_token_accuracy": 0.43045134479972297, + "step": 1655 + }, + { + "epoch": 0.30700778642936594, + "grad_norm": 5.46875, + "learning_rate": 9.692992213570635e-06, + "loss": 2.7272, + "mean_token_accuracy": 0.44576226012793174, + "step": 1656 + }, + { + "epoch": 0.307193177604746, + "grad_norm": 4.80859375, + "learning_rate": 9.692806822395254e-06, + "loss": 3.0342, + "mean_token_accuracy": 0.4126184478829315, + "step": 1657 + }, + { + "epoch": 0.3073785687801261, + "grad_norm": 5.4765625, + "learning_rate": 9.692621431219876e-06, + "loss": 2.9861, + "mean_token_accuracy": 0.43495693495693494, + "step": 1658 + }, + { + "epoch": 0.3075639599555061, + "grad_norm": 7.91796875, + "learning_rate": 9.692436040044495e-06, + "loss": 2.6858, + "mean_token_accuracy": 0.45570006096321886, + "step": 1659 + }, + { + "epoch": 0.3077493511308862, + "grad_norm": 6.1796875, + "learning_rate": 9.692250648869115e-06, + "loss": 2.869, + "mean_token_accuracy": 0.4214335745972449, + "step": 1660 + }, + { + "epoch": 0.3079347423062662, + "grad_norm": 6.16796875, + "learning_rate": 9.692065257693734e-06, + "loss": 3.0517, + "mean_token_accuracy": 0.4017615971814445, + "step": 1661 + }, + { + "epoch": 0.3081201334816463, + "grad_norm": 7.6328125, + "learning_rate": 9.691879866518354e-06, + "loss": 2.9657, + "mean_token_accuracy": 0.4263743115452562, + "step": 1662 + }, + { + "epoch": 0.3083055246570263, + "grad_norm": 5.5625, + "learning_rate": 9.691694475342975e-06, + "loss": 2.7808, + "mean_token_accuracy": 0.45369195635625476, + "step": 1663 + }, + { + "epoch": 0.3084909158324064, + "grad_norm": 6.21875, + "learning_rate": 9.691509084167594e-06, + "loss": 2.7273, + "mean_token_accuracy": 0.4305889423076923, + "step": 1664 + }, + { + "epoch": 0.3086763070077864, + "grad_norm": 6.234375, + "learning_rate": 9.691323692992214e-06, + "loss": 2.8099, + "mean_token_accuracy": 0.4431085770946951, + "step": 1665 + }, + { + "epoch": 0.3088616981831665, + "grad_norm": 5.4921875, + "learning_rate": 9.691138301816835e-06, + "loss": 2.9046, + "mean_token_accuracy": 0.4237653074956264, + "step": 1666 + }, + { + "epoch": 0.3090470893585465, + "grad_norm": 5.36328125, + "learning_rate": 9.690952910641455e-06, + "loss": 2.7998, + "mean_token_accuracy": 0.4306955025804866, + "step": 1667 + }, + { + "epoch": 0.3092324805339266, + "grad_norm": 5.0546875, + "learning_rate": 9.690767519466074e-06, + "loss": 2.7191, + "mean_token_accuracy": 0.45101694915254237, + "step": 1668 + }, + { + "epoch": 0.3094178717093066, + "grad_norm": 5.0625, + "learning_rate": 9.690582128290694e-06, + "loss": 2.4871, + "mean_token_accuracy": 0.4999378495960224, + "step": 1669 + }, + { + "epoch": 0.3096032628846867, + "grad_norm": 7.10546875, + "learning_rate": 9.690396737115313e-06, + "loss": 2.5897, + "mean_token_accuracy": 0.45039991690038433, + "step": 1670 + }, + { + "epoch": 0.3097886540600667, + "grad_norm": 5.171875, + "learning_rate": 9.690211345939934e-06, + "loss": 2.7431, + "mean_token_accuracy": 0.44445851804939834, + "step": 1671 + }, + { + "epoch": 0.3099740452354468, + "grad_norm": 6.48828125, + "learning_rate": 9.690025954764554e-06, + "loss": 3.3281, + "mean_token_accuracy": 0.3828210424006804, + "step": 1672 + }, + { + "epoch": 0.3101594364108268, + "grad_norm": 6.9375, + "learning_rate": 9.689840563589173e-06, + "loss": 2.6799, + "mean_token_accuracy": 0.4417074877536739, + "step": 1673 + }, + { + "epoch": 0.3103448275862069, + "grad_norm": 17.34375, + "learning_rate": 9.689655172413794e-06, + "loss": 2.7592, + "mean_token_accuracy": 0.43471357029436913, + "step": 1674 + }, + { + "epoch": 0.31053021876158693, + "grad_norm": 5.328125, + "learning_rate": 9.689469781238414e-06, + "loss": 3.0806, + "mean_token_accuracy": 0.4251152073732719, + "step": 1675 + }, + { + "epoch": 0.310715609936967, + "grad_norm": 5.52734375, + "learning_rate": 9.689284390063035e-06, + "loss": 2.8179, + "mean_token_accuracy": 0.4434006031083275, + "step": 1676 + }, + { + "epoch": 0.31090100111234703, + "grad_norm": 5.7578125, + "learning_rate": 9.689098998887653e-06, + "loss": 3.0635, + "mean_token_accuracy": 0.41468771448181196, + "step": 1677 + }, + { + "epoch": 0.3110863922877271, + "grad_norm": 5.20703125, + "learning_rate": 9.688913607712274e-06, + "loss": 3.1912, + "mean_token_accuracy": 0.3950810508664058, + "step": 1678 + }, + { + "epoch": 0.31127178346310713, + "grad_norm": 5.6875, + "learning_rate": 9.688728216536893e-06, + "loss": 3.0033, + "mean_token_accuracy": 0.41600604001510005, + "step": 1679 + }, + { + "epoch": 0.3114571746384872, + "grad_norm": 5.87890625, + "learning_rate": 9.688542825361513e-06, + "loss": 2.7496, + "mean_token_accuracy": 0.4387962586417243, + "step": 1680 + }, + { + "epoch": 0.31164256581386723, + "grad_norm": 7.33203125, + "learning_rate": 9.688357434186134e-06, + "loss": 2.9002, + "mean_token_accuracy": 0.4183506280720918, + "step": 1681 + }, + { + "epoch": 0.3118279569892473, + "grad_norm": 6.1015625, + "learning_rate": 9.688172043010754e-06, + "loss": 3.0304, + "mean_token_accuracy": 0.4251551043429216, + "step": 1682 + }, + { + "epoch": 0.3120133481646274, + "grad_norm": 5.22265625, + "learning_rate": 9.687986651835375e-06, + "loss": 2.7653, + "mean_token_accuracy": 0.43670137245622337, + "step": 1683 + }, + { + "epoch": 0.3121987393400074, + "grad_norm": 8.390625, + "learning_rate": 9.687801260659993e-06, + "loss": 2.2176, + "mean_token_accuracy": 0.5166324903167009, + "step": 1684 + }, + { + "epoch": 0.3123841305153875, + "grad_norm": 6.85546875, + "learning_rate": 9.687615869484614e-06, + "loss": 2.5245, + "mean_token_accuracy": 0.45152299422655784, + "step": 1685 + }, + { + "epoch": 0.3125695216907675, + "grad_norm": 7.98046875, + "learning_rate": 9.687430478309233e-06, + "loss": 2.1454, + "mean_token_accuracy": 0.5059038515603036, + "step": 1686 + }, + { + "epoch": 0.3127549128661476, + "grad_norm": 9.4765625, + "learning_rate": 9.687245087133853e-06, + "loss": 2.4795, + "mean_token_accuracy": 0.4844777841892672, + "step": 1687 + }, + { + "epoch": 0.3129403040415276, + "grad_norm": 7.30859375, + "learning_rate": 9.687059695958474e-06, + "loss": 2.8524, + "mean_token_accuracy": 0.42684824902723734, + "step": 1688 + }, + { + "epoch": 0.3131256952169077, + "grad_norm": 6.8046875, + "learning_rate": 9.686874304783092e-06, + "loss": 2.7992, + "mean_token_accuracy": 0.4192585220204031, + "step": 1689 + }, + { + "epoch": 0.3133110863922877, + "grad_norm": 5.8984375, + "learning_rate": 9.686688913607713e-06, + "loss": 2.5237, + "mean_token_accuracy": 0.47726646459079636, + "step": 1690 + }, + { + "epoch": 0.3134964775676678, + "grad_norm": 6.15625, + "learning_rate": 9.686503522432333e-06, + "loss": 3.0805, + "mean_token_accuracy": 0.41820681228043377, + "step": 1691 + }, + { + "epoch": 0.3136818687430478, + "grad_norm": 6.44140625, + "learning_rate": 9.686318131256954e-06, + "loss": 3.1015, + "mean_token_accuracy": 0.4072164948453608, + "step": 1692 + }, + { + "epoch": 0.3138672599184279, + "grad_norm": 6.98828125, + "learning_rate": 9.686132740081573e-06, + "loss": 3.0463, + "mean_token_accuracy": 0.41208998366218424, + "step": 1693 + }, + { + "epoch": 0.3140526510938079, + "grad_norm": 5.71875, + "learning_rate": 9.685947348906193e-06, + "loss": 3.1958, + "mean_token_accuracy": 0.39973127309371853, + "step": 1694 + }, + { + "epoch": 0.314238042269188, + "grad_norm": 7.55078125, + "learning_rate": 9.685761957730812e-06, + "loss": 2.7279, + "mean_token_accuracy": 0.42528162695643507, + "step": 1695 + }, + { + "epoch": 0.314423433444568, + "grad_norm": 7.3046875, + "learning_rate": 9.685576566555433e-06, + "loss": 2.7829, + "mean_token_accuracy": 0.4410958904109589, + "step": 1696 + }, + { + "epoch": 0.3146088246199481, + "grad_norm": 7.1640625, + "learning_rate": 9.685391175380053e-06, + "loss": 3.5399, + "mean_token_accuracy": 0.3669496487119438, + "step": 1697 + }, + { + "epoch": 0.3147942157953281, + "grad_norm": 5.3828125, + "learning_rate": 9.685205784204673e-06, + "loss": 3.1652, + "mean_token_accuracy": 0.40021281626862143, + "step": 1698 + }, + { + "epoch": 0.3149796069707082, + "grad_norm": 5.72265625, + "learning_rate": 9.685020393029292e-06, + "loss": 2.8891, + "mean_token_accuracy": 0.42942030899247324, + "step": 1699 + }, + { + "epoch": 0.3151649981460882, + "grad_norm": 7.73046875, + "learning_rate": 9.684835001853913e-06, + "loss": 2.4826, + "mean_token_accuracy": 0.4716366007389698, + "step": 1700 + }, + { + "epoch": 0.3153503893214683, + "grad_norm": 5.73828125, + "learning_rate": 9.684649610678533e-06, + "loss": 3.078, + "mean_token_accuracy": 0.4200402819738167, + "step": 1701 + }, + { + "epoch": 0.31553578049684833, + "grad_norm": 6.10546875, + "learning_rate": 9.684464219503152e-06, + "loss": 2.8713, + "mean_token_accuracy": 0.4183555775251277, + "step": 1702 + }, + { + "epoch": 0.3157211716722284, + "grad_norm": 6.82421875, + "learning_rate": 9.684278828327773e-06, + "loss": 3.1498, + "mean_token_accuracy": 0.393552036199095, + "step": 1703 + }, + { + "epoch": 0.31590656284760843, + "grad_norm": 6.7890625, + "learning_rate": 9.684093437152391e-06, + "loss": 3.0194, + "mean_token_accuracy": 0.41530170136320405, + "step": 1704 + }, + { + "epoch": 0.3160919540229885, + "grad_norm": 6.609375, + "learning_rate": 9.683908045977012e-06, + "loss": 3.0437, + "mean_token_accuracy": 0.40755467196819084, + "step": 1705 + }, + { + "epoch": 0.31627734519836853, + "grad_norm": 6.60546875, + "learning_rate": 9.683722654801632e-06, + "loss": 3.3171, + "mean_token_accuracy": 0.37373876986869387, + "step": 1706 + }, + { + "epoch": 0.3164627363737486, + "grad_norm": 5.73046875, + "learning_rate": 9.683537263626253e-06, + "loss": 3.1819, + "mean_token_accuracy": 0.3945742117942809, + "step": 1707 + }, + { + "epoch": 0.3166481275491287, + "grad_norm": 5.36328125, + "learning_rate": 9.683351872450872e-06, + "loss": 3.0673, + "mean_token_accuracy": 0.40986908358509566, + "step": 1708 + }, + { + "epoch": 0.3168335187245087, + "grad_norm": 7.39453125, + "learning_rate": 9.683166481275492e-06, + "loss": 2.9868, + "mean_token_accuracy": 0.42795151877899146, + "step": 1709 + }, + { + "epoch": 0.3170189098998888, + "grad_norm": 7.265625, + "learning_rate": 9.682981090100113e-06, + "loss": 2.932, + "mean_token_accuracy": 0.4205020920502092, + "step": 1710 + }, + { + "epoch": 0.3172043010752688, + "grad_norm": 5.5, + "learning_rate": 9.682795698924731e-06, + "loss": 3.1569, + "mean_token_accuracy": 0.40234159779614326, + "step": 1711 + }, + { + "epoch": 0.3173896922506489, + "grad_norm": 6.0, + "learning_rate": 9.682610307749352e-06, + "loss": 3.0538, + "mean_token_accuracy": 0.41827991113932084, + "step": 1712 + }, + { + "epoch": 0.3175750834260289, + "grad_norm": 6.87109375, + "learning_rate": 9.68242491657397e-06, + "loss": 2.3621, + "mean_token_accuracy": 0.4851256175759487, + "step": 1713 + }, + { + "epoch": 0.317760474601409, + "grad_norm": 5.84375, + "learning_rate": 9.682239525398593e-06, + "loss": 3.4894, + "mean_token_accuracy": 0.38137913866069206, + "step": 1714 + }, + { + "epoch": 0.317945865776789, + "grad_norm": 5.33203125, + "learning_rate": 9.682054134223212e-06, + "loss": 2.4059, + "mean_token_accuracy": 0.48142816009213935, + "step": 1715 + }, + { + "epoch": 0.3181312569521691, + "grad_norm": 8.2734375, + "learning_rate": 9.681868743047832e-06, + "loss": 2.6264, + "mean_token_accuracy": 0.4357818837314105, + "step": 1716 + }, + { + "epoch": 0.3183166481275491, + "grad_norm": 6.76953125, + "learning_rate": 9.681683351872451e-06, + "loss": 2.5407, + "mean_token_accuracy": 0.4632869365342868, + "step": 1717 + }, + { + "epoch": 0.3185020393029292, + "grad_norm": 6.52734375, + "learning_rate": 9.681497960697071e-06, + "loss": 2.871, + "mean_token_accuracy": 0.4162655806491423, + "step": 1718 + }, + { + "epoch": 0.3186874304783092, + "grad_norm": 6.296875, + "learning_rate": 9.681312569521692e-06, + "loss": 2.9749, + "mean_token_accuracy": 0.41883035039072347, + "step": 1719 + }, + { + "epoch": 0.3188728216536893, + "grad_norm": 5.98046875, + "learning_rate": 9.68112717834631e-06, + "loss": 2.7921, + "mean_token_accuracy": 0.4387661743562933, + "step": 1720 + }, + { + "epoch": 0.3190582128290693, + "grad_norm": 6.87109375, + "learning_rate": 9.680941787170931e-06, + "loss": 2.734, + "mean_token_accuracy": 0.44269226374128673, + "step": 1721 + }, + { + "epoch": 0.3192436040044494, + "grad_norm": 8.6015625, + "learning_rate": 9.680756395995552e-06, + "loss": 3.1061, + "mean_token_accuracy": 0.4065630397236615, + "step": 1722 + }, + { + "epoch": 0.3194289951798294, + "grad_norm": 8.40625, + "learning_rate": 9.680571004820172e-06, + "loss": 2.1459, + "mean_token_accuracy": 0.5106964582838127, + "step": 1723 + }, + { + "epoch": 0.3196143863552095, + "grad_norm": 5.9921875, + "learning_rate": 9.680385613644791e-06, + "loss": 3.2803, + "mean_token_accuracy": 0.3998935745643209, + "step": 1724 + }, + { + "epoch": 0.3197997775305895, + "grad_norm": 7.1796875, + "learning_rate": 9.680200222469412e-06, + "loss": 2.739, + "mean_token_accuracy": 0.4352580480327031, + "step": 1725 + }, + { + "epoch": 0.3199851687059696, + "grad_norm": 7.71484375, + "learning_rate": 9.680014831294032e-06, + "loss": 2.7612, + "mean_token_accuracy": 0.44147023571713945, + "step": 1726 + }, + { + "epoch": 0.3201705598813496, + "grad_norm": 6.05078125, + "learning_rate": 9.67982944011865e-06, + "loss": 2.6809, + "mean_token_accuracy": 0.4778831752371443, + "step": 1727 + }, + { + "epoch": 0.3203559510567297, + "grad_norm": 5.7734375, + "learning_rate": 9.679644048943271e-06, + "loss": 2.6775, + "mean_token_accuracy": 0.4397283531409168, + "step": 1728 + }, + { + "epoch": 0.3205413422321097, + "grad_norm": 6.265625, + "learning_rate": 9.67945865776789e-06, + "loss": 2.7387, + "mean_token_accuracy": 0.47771696637998434, + "step": 1729 + }, + { + "epoch": 0.3207267334074898, + "grad_norm": 6.71484375, + "learning_rate": 9.679273266592512e-06, + "loss": 3.1066, + "mean_token_accuracy": 0.41353059465670783, + "step": 1730 + }, + { + "epoch": 0.32091212458286983, + "grad_norm": 6.3203125, + "learning_rate": 9.679087875417131e-06, + "loss": 2.9148, + "mean_token_accuracy": 0.4270994332818135, + "step": 1731 + }, + { + "epoch": 0.3210975157582499, + "grad_norm": 6.63671875, + "learning_rate": 9.678902484241752e-06, + "loss": 2.845, + "mean_token_accuracy": 0.4361012596306714, + "step": 1732 + }, + { + "epoch": 0.32128290693363, + "grad_norm": 9.5625, + "learning_rate": 9.67871709306637e-06, + "loss": 2.777, + "mean_token_accuracy": 0.42577943229409027, + "step": 1733 + }, + { + "epoch": 0.32146829810901, + "grad_norm": 12.59375, + "learning_rate": 9.678531701890991e-06, + "loss": 2.6566, + "mean_token_accuracy": 0.4566291517979687, + "step": 1734 + }, + { + "epoch": 0.3216536892843901, + "grad_norm": 9.359375, + "learning_rate": 9.678346310715611e-06, + "loss": 3.4603, + "mean_token_accuracy": 0.3717342622542921, + "step": 1735 + }, + { + "epoch": 0.3218390804597701, + "grad_norm": 7.83203125, + "learning_rate": 9.67816091954023e-06, + "loss": 3.1582, + "mean_token_accuracy": 0.3794565729542956, + "step": 1736 + }, + { + "epoch": 0.3220244716351502, + "grad_norm": 6.53125, + "learning_rate": 9.67797552836485e-06, + "loss": 2.8251, + "mean_token_accuracy": 0.44359316011547856, + "step": 1737 + }, + { + "epoch": 0.3222098628105302, + "grad_norm": 6.9453125, + "learning_rate": 9.67779013718947e-06, + "loss": 2.8963, + "mean_token_accuracy": 0.4299089393356419, + "step": 1738 + }, + { + "epoch": 0.3223952539859103, + "grad_norm": 5.625, + "learning_rate": 9.677604746014092e-06, + "loss": 2.9299, + "mean_token_accuracy": 0.4466729589428976, + "step": 1739 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 6.921875, + "learning_rate": 9.67741935483871e-06, + "loss": 3.1636, + "mean_token_accuracy": 0.3961205319019153, + "step": 1740 + }, + { + "epoch": 0.3227660363366704, + "grad_norm": 7.13671875, + "learning_rate": 9.677233963663331e-06, + "loss": 3.4525, + "mean_token_accuracy": 0.3732336956521739, + "step": 1741 + }, + { + "epoch": 0.3229514275120504, + "grad_norm": 9.46875, + "learning_rate": 9.67704857248795e-06, + "loss": 2.564, + "mean_token_accuracy": 0.46214676125130827, + "step": 1742 + }, + { + "epoch": 0.3231368186874305, + "grad_norm": 5.5859375, + "learning_rate": 9.67686318131257e-06, + "loss": 2.8633, + "mean_token_accuracy": 0.42934385503434164, + "step": 1743 + }, + { + "epoch": 0.3233222098628105, + "grad_norm": 5.2109375, + "learning_rate": 9.67667779013719e-06, + "loss": 3.2499, + "mean_token_accuracy": 0.40780548888657, + "step": 1744 + }, + { + "epoch": 0.3235076010381906, + "grad_norm": 8.3359375, + "learning_rate": 9.67649239896181e-06, + "loss": 3.282, + "mean_token_accuracy": 0.3755483072770217, + "step": 1745 + }, + { + "epoch": 0.3236929922135706, + "grad_norm": 7.11328125, + "learning_rate": 9.67630700778643e-06, + "loss": 2.9008, + "mean_token_accuracy": 0.4235454634051262, + "step": 1746 + }, + { + "epoch": 0.3238783833889507, + "grad_norm": 6.26953125, + "learning_rate": 9.67612161661105e-06, + "loss": 2.5453, + "mean_token_accuracy": 0.4646326446846354, + "step": 1747 + }, + { + "epoch": 0.3240637745643307, + "grad_norm": 8.171875, + "learning_rate": 9.675936225435671e-06, + "loss": 2.8352, + "mean_token_accuracy": 0.4373008175142026, + "step": 1748 + }, + { + "epoch": 0.3242491657397108, + "grad_norm": 6.8203125, + "learning_rate": 9.67575083426029e-06, + "loss": 2.7385, + "mean_token_accuracy": 0.45920954511558537, + "step": 1749 + }, + { + "epoch": 0.3244345569150908, + "grad_norm": 14.2265625, + "learning_rate": 9.67556544308491e-06, + "loss": 3.2331, + "mean_token_accuracy": 0.3976919087136929, + "step": 1750 + }, + { + "epoch": 0.3246199480904709, + "grad_norm": 7.21484375, + "learning_rate": 9.675380051909529e-06, + "loss": 2.8482, + "mean_token_accuracy": 0.4298931456867344, + "step": 1751 + }, + { + "epoch": 0.3248053392658509, + "grad_norm": 6.3515625, + "learning_rate": 9.67519466073415e-06, + "loss": 2.7205, + "mean_token_accuracy": 0.4370629370629371, + "step": 1752 + }, + { + "epoch": 0.324990730441231, + "grad_norm": 6.4765625, + "learning_rate": 9.67500926955877e-06, + "loss": 2.5599, + "mean_token_accuracy": 0.47160762942779294, + "step": 1753 + }, + { + "epoch": 0.325176121616611, + "grad_norm": 5.1328125, + "learning_rate": 9.674823878383389e-06, + "loss": 2.7731, + "mean_token_accuracy": 0.44122328331059574, + "step": 1754 + }, + { + "epoch": 0.3253615127919911, + "grad_norm": 14.21875, + "learning_rate": 9.67463848720801e-06, + "loss": 2.6185, + "mean_token_accuracy": 0.4422258111877845, + "step": 1755 + }, + { + "epoch": 0.3255469039673711, + "grad_norm": 5.453125, + "learning_rate": 9.67445309603263e-06, + "loss": 1.9098, + "mean_token_accuracy": 0.572147291800471, + "step": 1756 + }, + { + "epoch": 0.3257322951427512, + "grad_norm": 8.328125, + "learning_rate": 9.67426770485725e-06, + "loss": 2.5541, + "mean_token_accuracy": 0.46827717736808644, + "step": 1757 + }, + { + "epoch": 0.3259176863181313, + "grad_norm": 6.53515625, + "learning_rate": 9.674082313681869e-06, + "loss": 2.9023, + "mean_token_accuracy": 0.41196777905638665, + "step": 1758 + }, + { + "epoch": 0.3261030774935113, + "grad_norm": 9.5390625, + "learning_rate": 9.67389692250649e-06, + "loss": 2.7743, + "mean_token_accuracy": 0.4165660468485873, + "step": 1759 + }, + { + "epoch": 0.3262884686688914, + "grad_norm": 6.38671875, + "learning_rate": 9.673711531331108e-06, + "loss": 3.3841, + "mean_token_accuracy": 0.3949424788365531, + "step": 1760 + }, + { + "epoch": 0.3264738598442714, + "grad_norm": 7.859375, + "learning_rate": 9.673526140155729e-06, + "loss": 2.4976, + "mean_token_accuracy": 0.48478513356562136, + "step": 1761 + }, + { + "epoch": 0.3266592510196515, + "grad_norm": 6.859375, + "learning_rate": 9.67334074898035e-06, + "loss": 2.7439, + "mean_token_accuracy": 0.44480195273493844, + "step": 1762 + }, + { + "epoch": 0.3268446421950315, + "grad_norm": 6.5546875, + "learning_rate": 9.67315535780497e-06, + "loss": 2.8163, + "mean_token_accuracy": 0.4390068886337543, + "step": 1763 + }, + { + "epoch": 0.3270300333704116, + "grad_norm": 7.3125, + "learning_rate": 9.67296996662959e-06, + "loss": 3.3593, + "mean_token_accuracy": 0.37644341801385683, + "step": 1764 + }, + { + "epoch": 0.3272154245457916, + "grad_norm": 6.953125, + "learning_rate": 9.672784575454209e-06, + "loss": 2.8754, + "mean_token_accuracy": 0.4099620893007582, + "step": 1765 + }, + { + "epoch": 0.3274008157211717, + "grad_norm": 6.23828125, + "learning_rate": 9.67259918427883e-06, + "loss": 2.6543, + "mean_token_accuracy": 0.456540825285338, + "step": 1766 + }, + { + "epoch": 0.3275862068965517, + "grad_norm": 5.48828125, + "learning_rate": 9.672413793103448e-06, + "loss": 2.957, + "mean_token_accuracy": 0.42774711490215755, + "step": 1767 + }, + { + "epoch": 0.3277715980719318, + "grad_norm": 5.72265625, + "learning_rate": 9.672228401928069e-06, + "loss": 2.9854, + "mean_token_accuracy": 0.418646346929628, + "step": 1768 + }, + { + "epoch": 0.3279569892473118, + "grad_norm": 5.98828125, + "learning_rate": 9.67204301075269e-06, + "loss": 3.1308, + "mean_token_accuracy": 0.3994402239104358, + "step": 1769 + }, + { + "epoch": 0.3281423804226919, + "grad_norm": 6.85546875, + "learning_rate": 9.671857619577308e-06, + "loss": 3.219, + "mean_token_accuracy": 0.3911631846414455, + "step": 1770 + }, + { + "epoch": 0.3283277715980719, + "grad_norm": 6.1953125, + "learning_rate": 9.671672228401929e-06, + "loss": 2.8306, + "mean_token_accuracy": 0.43488399207182, + "step": 1771 + }, + { + "epoch": 0.328513162773452, + "grad_norm": 6.69921875, + "learning_rate": 9.67148683722655e-06, + "loss": 2.681, + "mean_token_accuracy": 0.4502289077828646, + "step": 1772 + }, + { + "epoch": 0.328698553948832, + "grad_norm": 6.5625, + "learning_rate": 9.67130144605117e-06, + "loss": 2.3006, + "mean_token_accuracy": 0.5141921397379913, + "step": 1773 + }, + { + "epoch": 0.3288839451242121, + "grad_norm": 7.2734375, + "learning_rate": 9.671116054875788e-06, + "loss": 2.703, + "mean_token_accuracy": 0.4460529909860694, + "step": 1774 + }, + { + "epoch": 0.3290693362995921, + "grad_norm": 6.60546875, + "learning_rate": 9.670930663700409e-06, + "loss": 2.7383, + "mean_token_accuracy": 0.4535855186818287, + "step": 1775 + }, + { + "epoch": 0.3292547274749722, + "grad_norm": 6.47265625, + "learning_rate": 9.670745272525028e-06, + "loss": 3.224, + "mean_token_accuracy": 0.4026381909547739, + "step": 1776 + }, + { + "epoch": 0.3294401186503522, + "grad_norm": 6.6640625, + "learning_rate": 9.670559881349648e-06, + "loss": 2.6125, + "mean_token_accuracy": 0.4525202520252025, + "step": 1777 + }, + { + "epoch": 0.3296255098257323, + "grad_norm": 5.8359375, + "learning_rate": 9.670374490174269e-06, + "loss": 3.0713, + "mean_token_accuracy": 0.4307923771313942, + "step": 1778 + }, + { + "epoch": 0.3298109010011123, + "grad_norm": 6.484375, + "learning_rate": 9.67018909899889e-06, + "loss": 2.965, + "mean_token_accuracy": 0.42915698865965296, + "step": 1779 + }, + { + "epoch": 0.3299962921764924, + "grad_norm": 5.875, + "learning_rate": 9.670003707823508e-06, + "loss": 2.7129, + "mean_token_accuracy": 0.45958656895986, + "step": 1780 + }, + { + "epoch": 0.3301816833518724, + "grad_norm": 8.3359375, + "learning_rate": 9.669818316648129e-06, + "loss": 2.8907, + "mean_token_accuracy": 0.42188604552523684, + "step": 1781 + }, + { + "epoch": 0.3303670745272525, + "grad_norm": 7.3515625, + "learning_rate": 9.669632925472749e-06, + "loss": 2.9502, + "mean_token_accuracy": 0.41456907551328154, + "step": 1782 + }, + { + "epoch": 0.3305524657026326, + "grad_norm": 5.71484375, + "learning_rate": 9.669447534297368e-06, + "loss": 3.1144, + "mean_token_accuracy": 0.42750506072874495, + "step": 1783 + }, + { + "epoch": 0.3307378568780126, + "grad_norm": 9.8984375, + "learning_rate": 9.669262143121988e-06, + "loss": 2.7557, + "mean_token_accuracy": 0.44374486723241174, + "step": 1784 + }, + { + "epoch": 0.3309232480533927, + "grad_norm": 9.7421875, + "learning_rate": 9.669076751946607e-06, + "loss": 2.9642, + "mean_token_accuracy": 0.41742654508611954, + "step": 1785 + }, + { + "epoch": 0.3311086392287727, + "grad_norm": 7.61328125, + "learning_rate": 9.668891360771228e-06, + "loss": 2.3543, + "mean_token_accuracy": 0.484174989449993, + "step": 1786 + }, + { + "epoch": 0.3312940304041528, + "grad_norm": 5.734375, + "learning_rate": 9.668705969595848e-06, + "loss": 3.0609, + "mean_token_accuracy": 0.4087670049665299, + "step": 1787 + }, + { + "epoch": 0.3314794215795328, + "grad_norm": 7.79296875, + "learning_rate": 9.668520578420469e-06, + "loss": 2.8457, + "mean_token_accuracy": 0.43902818875564553, + "step": 1788 + }, + { + "epoch": 0.3316648127549129, + "grad_norm": 5.80859375, + "learning_rate": 9.668335187245087e-06, + "loss": 2.9401, + "mean_token_accuracy": 0.41848347730700675, + "step": 1789 + }, + { + "epoch": 0.3318502039302929, + "grad_norm": 6.88671875, + "learning_rate": 9.668149796069708e-06, + "loss": 2.6617, + "mean_token_accuracy": 0.4510108864696734, + "step": 1790 + }, + { + "epoch": 0.332035595105673, + "grad_norm": 8.1796875, + "learning_rate": 9.667964404894328e-06, + "loss": 3.1361, + "mean_token_accuracy": 0.40336134453781514, + "step": 1791 + }, + { + "epoch": 0.332220986281053, + "grad_norm": 5.65625, + "learning_rate": 9.667779013718947e-06, + "loss": 3.0346, + "mean_token_accuracy": 0.4303886925795053, + "step": 1792 + }, + { + "epoch": 0.3324063774564331, + "grad_norm": 6.5, + "learning_rate": 9.667593622543568e-06, + "loss": 3.0776, + "mean_token_accuracy": 0.3987851886461862, + "step": 1793 + }, + { + "epoch": 0.3325917686318131, + "grad_norm": 6.625, + "learning_rate": 9.667408231368186e-06, + "loss": 2.4369, + "mean_token_accuracy": 0.45948150833937634, + "step": 1794 + }, + { + "epoch": 0.3327771598071932, + "grad_norm": 5.5, + "learning_rate": 9.667222840192809e-06, + "loss": 3.008, + "mean_token_accuracy": 0.40268242056522435, + "step": 1795 + }, + { + "epoch": 0.3329625509825732, + "grad_norm": 7.26171875, + "learning_rate": 9.667037449017427e-06, + "loss": 3.1859, + "mean_token_accuracy": 0.40474287236877166, + "step": 1796 + }, + { + "epoch": 0.3331479421579533, + "grad_norm": 6.78515625, + "learning_rate": 9.666852057842048e-06, + "loss": 2.7784, + "mean_token_accuracy": 0.45059793335655407, + "step": 1797 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 12.9453125, + "learning_rate": 9.666666666666667e-06, + "loss": 2.9279, + "mean_token_accuracy": 0.4151468612017662, + "step": 1798 + }, + { + "epoch": 0.3335187245087134, + "grad_norm": 7.61328125, + "learning_rate": 9.666481275491287e-06, + "loss": 3.095, + "mean_token_accuracy": 0.41327210783587615, + "step": 1799 + }, + { + "epoch": 0.3337041156840934, + "grad_norm": 5.34765625, + "learning_rate": 9.666295884315908e-06, + "loss": 2.6735, + "mean_token_accuracy": 0.45320796460176993, + "step": 1800 + }, + { + "epoch": 0.3338895068594735, + "grad_norm": 6.32421875, + "learning_rate": 9.666110493140527e-06, + "loss": 3.0857, + "mean_token_accuracy": 0.4119214586255259, + "step": 1801 + }, + { + "epoch": 0.3340748980348535, + "grad_norm": 8.3671875, + "learning_rate": 9.665925101965147e-06, + "loss": 2.6713, + "mean_token_accuracy": 0.463898005554153, + "step": 1802 + }, + { + "epoch": 0.3342602892102336, + "grad_norm": 7.9921875, + "learning_rate": 9.665739710789767e-06, + "loss": 3.2288, + "mean_token_accuracy": 0.388215859030837, + "step": 1803 + }, + { + "epoch": 0.3344456803856136, + "grad_norm": 6.77734375, + "learning_rate": 9.665554319614388e-06, + "loss": 3.2347, + "mean_token_accuracy": 0.3862101646724322, + "step": 1804 + }, + { + "epoch": 0.3346310715609937, + "grad_norm": 5.921875, + "learning_rate": 9.665368928439007e-06, + "loss": 3.3388, + "mean_token_accuracy": 0.3841633612457764, + "step": 1805 + }, + { + "epoch": 0.3348164627363737, + "grad_norm": 6.61328125, + "learning_rate": 9.665183537263627e-06, + "loss": 3.128, + "mean_token_accuracy": 0.3998084749820445, + "step": 1806 + }, + { + "epoch": 0.3350018539117538, + "grad_norm": 8.2109375, + "learning_rate": 9.664998146088248e-06, + "loss": 2.3419, + "mean_token_accuracy": 0.516506273602144, + "step": 1807 + }, + { + "epoch": 0.3351872450871339, + "grad_norm": 7.46484375, + "learning_rate": 9.664812754912867e-06, + "loss": 2.7152, + "mean_token_accuracy": 0.4534505208333333, + "step": 1808 + }, + { + "epoch": 0.3353726362625139, + "grad_norm": 7.28125, + "learning_rate": 9.664627363737487e-06, + "loss": 2.9138, + "mean_token_accuracy": 0.4178109062377403, + "step": 1809 + }, + { + "epoch": 0.335558027437894, + "grad_norm": 10.1015625, + "learning_rate": 9.664441972562106e-06, + "loss": 2.8979, + "mean_token_accuracy": 0.41860166773572804, + "step": 1810 + }, + { + "epoch": 0.335743418613274, + "grad_norm": 9.2421875, + "learning_rate": 9.664256581386728e-06, + "loss": 2.5886, + "mean_token_accuracy": 0.4322150492115568, + "step": 1811 + }, + { + "epoch": 0.3359288097886541, + "grad_norm": 7.828125, + "learning_rate": 9.664071190211347e-06, + "loss": 2.706, + "mean_token_accuracy": 0.4560032477418045, + "step": 1812 + }, + { + "epoch": 0.3361142009640341, + "grad_norm": 7.421875, + "learning_rate": 9.663885799035967e-06, + "loss": 3.3723, + "mean_token_accuracy": 0.3758598550857562, + "step": 1813 + }, + { + "epoch": 0.3362995921394142, + "grad_norm": 6.99609375, + "learning_rate": 9.663700407860586e-06, + "loss": 2.7197, + "mean_token_accuracy": 0.4407239819004525, + "step": 1814 + }, + { + "epoch": 0.3364849833147942, + "grad_norm": 8.40625, + "learning_rate": 9.663515016685207e-06, + "loss": 2.7143, + "mean_token_accuracy": 0.45119164218086844, + "step": 1815 + }, + { + "epoch": 0.3366703744901743, + "grad_norm": 6.83984375, + "learning_rate": 9.663329625509827e-06, + "loss": 3.1273, + "mean_token_accuracy": 0.40207253886010363, + "step": 1816 + }, + { + "epoch": 0.3368557656655543, + "grad_norm": 9.4140625, + "learning_rate": 9.663144234334446e-06, + "loss": 2.5952, + "mean_token_accuracy": 0.4694956790919644, + "step": 1817 + }, + { + "epoch": 0.3370411568409344, + "grad_norm": 7.5625, + "learning_rate": 9.662958843159066e-06, + "loss": 2.9857, + "mean_token_accuracy": 0.41074353095316085, + "step": 1818 + }, + { + "epoch": 0.3372265480163144, + "grad_norm": 5.953125, + "learning_rate": 9.662773451983687e-06, + "loss": 2.9329, + "mean_token_accuracy": 0.43982074263764404, + "step": 1819 + }, + { + "epoch": 0.3374119391916945, + "grad_norm": 5.75, + "learning_rate": 9.662588060808307e-06, + "loss": 3.2117, + "mean_token_accuracy": 0.3819151575291491, + "step": 1820 + }, + { + "epoch": 0.3375973303670745, + "grad_norm": 8.5625, + "learning_rate": 9.662402669632926e-06, + "loss": 2.708, + "mean_token_accuracy": 0.44365049517362415, + "step": 1821 + }, + { + "epoch": 0.3377827215424546, + "grad_norm": 7.19140625, + "learning_rate": 9.662217278457547e-06, + "loss": 2.4305, + "mean_token_accuracy": 0.48782911077993046, + "step": 1822 + }, + { + "epoch": 0.3379681127178346, + "grad_norm": 7.75390625, + "learning_rate": 9.662031887282165e-06, + "loss": 2.6495, + "mean_token_accuracy": 0.46012980992118685, + "step": 1823 + }, + { + "epoch": 0.3381535038932147, + "grad_norm": 5.7578125, + "learning_rate": 9.661846496106786e-06, + "loss": 2.4253, + "mean_token_accuracy": 0.4933253036234066, + "step": 1824 + }, + { + "epoch": 0.3383388950685947, + "grad_norm": 6.16796875, + "learning_rate": 9.661661104931406e-06, + "loss": 3.1025, + "mean_token_accuracy": 0.40685640362225095, + "step": 1825 + }, + { + "epoch": 0.3385242862439748, + "grad_norm": 12.7890625, + "learning_rate": 9.661475713756025e-06, + "loss": 3.1683, + "mean_token_accuracy": 0.38286549097359907, + "step": 1826 + }, + { + "epoch": 0.3387096774193548, + "grad_norm": 6.5546875, + "learning_rate": 9.661290322580646e-06, + "loss": 3.7282, + "mean_token_accuracy": 0.3434858135495078, + "step": 1827 + }, + { + "epoch": 0.3388950685947349, + "grad_norm": 5.74609375, + "learning_rate": 9.661104931405266e-06, + "loss": 3.2869, + "mean_token_accuracy": 0.3789942378208486, + "step": 1828 + }, + { + "epoch": 0.3390804597701149, + "grad_norm": 6.3984375, + "learning_rate": 9.660919540229887e-06, + "loss": 2.7726, + "mean_token_accuracy": 0.43242467718794836, + "step": 1829 + }, + { + "epoch": 0.339265850945495, + "grad_norm": 5.9296875, + "learning_rate": 9.660734149054506e-06, + "loss": 2.9348, + "mean_token_accuracy": 0.42353618184433894, + "step": 1830 + }, + { + "epoch": 0.3394512421208751, + "grad_norm": 5.8203125, + "learning_rate": 9.660548757879126e-06, + "loss": 2.6674, + "mean_token_accuracy": 0.46368715083798884, + "step": 1831 + }, + { + "epoch": 0.3396366332962551, + "grad_norm": 9.5390625, + "learning_rate": 9.660363366703745e-06, + "loss": 3.0592, + "mean_token_accuracy": 0.4122794832615641, + "step": 1832 + }, + { + "epoch": 0.3398220244716352, + "grad_norm": 8.3984375, + "learning_rate": 9.660177975528365e-06, + "loss": 3.1722, + "mean_token_accuracy": 0.4271512205898597, + "step": 1833 + }, + { + "epoch": 0.3400074156470152, + "grad_norm": 11.796875, + "learning_rate": 9.659992584352986e-06, + "loss": 1.8828, + "mean_token_accuracy": 0.5495128869157726, + "step": 1834 + }, + { + "epoch": 0.3401928068223953, + "grad_norm": 10.7578125, + "learning_rate": 9.659807193177606e-06, + "loss": 2.7099, + "mean_token_accuracy": 0.4307116104868914, + "step": 1835 + }, + { + "epoch": 0.3403781979977753, + "grad_norm": 9.875, + "learning_rate": 9.659621802002225e-06, + "loss": 2.9467, + "mean_token_accuracy": 0.4245494294550496, + "step": 1836 + }, + { + "epoch": 0.3405635891731554, + "grad_norm": 8.7578125, + "learning_rate": 9.659436410826846e-06, + "loss": 2.7993, + "mean_token_accuracy": 0.4289288692958418, + "step": 1837 + }, + { + "epoch": 0.3407489803485354, + "grad_norm": 6.98828125, + "learning_rate": 9.659251019651466e-06, + "loss": 2.7647, + "mean_token_accuracy": 0.42599067599067597, + "step": 1838 + }, + { + "epoch": 0.3409343715239155, + "grad_norm": 8.625, + "learning_rate": 9.659065628476085e-06, + "loss": 2.7569, + "mean_token_accuracy": 0.448, + "step": 1839 + }, + { + "epoch": 0.3411197626992955, + "grad_norm": 9.015625, + "learning_rate": 9.658880237300705e-06, + "loss": 2.7342, + "mean_token_accuracy": 0.4415486103601168, + "step": 1840 + }, + { + "epoch": 0.3413051538746756, + "grad_norm": 10.4921875, + "learning_rate": 9.658694846125324e-06, + "loss": 2.8034, + "mean_token_accuracy": 0.43900889453621345, + "step": 1841 + }, + { + "epoch": 0.3414905450500556, + "grad_norm": 6.94921875, + "learning_rate": 9.658509454949945e-06, + "loss": 3.235, + "mean_token_accuracy": 0.39839766933721776, + "step": 1842 + }, + { + "epoch": 0.3416759362254357, + "grad_norm": 6.2890625, + "learning_rate": 9.658324063774565e-06, + "loss": 2.6881, + "mean_token_accuracy": 0.4708338450202628, + "step": 1843 + }, + { + "epoch": 0.3418613274008157, + "grad_norm": 5.8125, + "learning_rate": 9.658138672599186e-06, + "loss": 2.4531, + "mean_token_accuracy": 0.4862250520386923, + "step": 1844 + }, + { + "epoch": 0.3420467185761958, + "grad_norm": 9.5, + "learning_rate": 9.657953281423806e-06, + "loss": 2.5952, + "mean_token_accuracy": 0.46513274336283184, + "step": 1845 + }, + { + "epoch": 0.3422321097515758, + "grad_norm": 8.5859375, + "learning_rate": 9.657767890248425e-06, + "loss": 2.7283, + "mean_token_accuracy": 0.43087971274685816, + "step": 1846 + }, + { + "epoch": 0.3424175009269559, + "grad_norm": 7.5234375, + "learning_rate": 9.657582499073045e-06, + "loss": 2.942, + "mean_token_accuracy": 0.43703616444810933, + "step": 1847 + }, + { + "epoch": 0.3426028921023359, + "grad_norm": 6.984375, + "learning_rate": 9.657397107897664e-06, + "loss": 2.8717, + "mean_token_accuracy": 0.41526894158473104, + "step": 1848 + }, + { + "epoch": 0.342788283277716, + "grad_norm": 8.078125, + "learning_rate": 9.657211716722285e-06, + "loss": 3.0608, + "mean_token_accuracy": 0.43319352905931696, + "step": 1849 + }, + { + "epoch": 0.342973674453096, + "grad_norm": 6.25390625, + "learning_rate": 9.657026325546905e-06, + "loss": 2.5444, + "mean_token_accuracy": 0.4657965088850448, + "step": 1850 + }, + { + "epoch": 0.3431590656284761, + "grad_norm": 6.44921875, + "learning_rate": 9.656840934371526e-06, + "loss": 2.763, + "mean_token_accuracy": 0.44189958592132506, + "step": 1851 + }, + { + "epoch": 0.3433444568038561, + "grad_norm": 7.09765625, + "learning_rate": 9.656655543196144e-06, + "loss": 2.7354, + "mean_token_accuracy": 0.45300296256396444, + "step": 1852 + }, + { + "epoch": 0.3435298479792362, + "grad_norm": 6.80859375, + "learning_rate": 9.656470152020765e-06, + "loss": 2.9231, + "mean_token_accuracy": 0.4343457410367664, + "step": 1853 + }, + { + "epoch": 0.3437152391546162, + "grad_norm": 8.7890625, + "learning_rate": 9.656284760845385e-06, + "loss": 2.6444, + "mean_token_accuracy": 0.44942541813630676, + "step": 1854 + }, + { + "epoch": 0.3439006303299963, + "grad_norm": 6.75390625, + "learning_rate": 9.656099369670004e-06, + "loss": 2.6828, + "mean_token_accuracy": 0.44754450195682366, + "step": 1855 + }, + { + "epoch": 0.34408602150537637, + "grad_norm": 6.73046875, + "learning_rate": 9.655913978494625e-06, + "loss": 3.2105, + "mean_token_accuracy": 0.3884664131812421, + "step": 1856 + }, + { + "epoch": 0.3442714126807564, + "grad_norm": 6.640625, + "learning_rate": 9.655728587319244e-06, + "loss": 2.9995, + "mean_token_accuracy": 0.4199944918755164, + "step": 1857 + }, + { + "epoch": 0.3444568038561365, + "grad_norm": 5.91015625, + "learning_rate": 9.655543196143864e-06, + "loss": 2.8224, + "mean_token_accuracy": 0.43491882654514386, + "step": 1858 + }, + { + "epoch": 0.3446421950315165, + "grad_norm": 7.09765625, + "learning_rate": 9.655357804968485e-06, + "loss": 2.5926, + "mean_token_accuracy": 0.4549968963376785, + "step": 1859 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 9.671875, + "learning_rate": 9.655172413793105e-06, + "loss": 2.7478, + "mean_token_accuracy": 0.44606323620582766, + "step": 1860 + }, + { + "epoch": 0.3450129773822766, + "grad_norm": 8.5390625, + "learning_rate": 9.654987022617724e-06, + "loss": 3.1071, + "mean_token_accuracy": 0.39231212208465305, + "step": 1861 + }, + { + "epoch": 0.3451983685576567, + "grad_norm": 7.03125, + "learning_rate": 9.654801631442344e-06, + "loss": 2.7875, + "mean_token_accuracy": 0.44498217014773306, + "step": 1862 + }, + { + "epoch": 0.3453837597330367, + "grad_norm": 8.5859375, + "learning_rate": 9.654616240266965e-06, + "loss": 2.7759, + "mean_token_accuracy": 0.4514241554427026, + "step": 1863 + }, + { + "epoch": 0.3455691509084168, + "grad_norm": 7.3515625, + "learning_rate": 9.654430849091584e-06, + "loss": 2.6367, + "mean_token_accuracy": 0.4512278776238167, + "step": 1864 + }, + { + "epoch": 0.3457545420837968, + "grad_norm": 6.63671875, + "learning_rate": 9.654245457916204e-06, + "loss": 2.7567, + "mean_token_accuracy": 0.439669634791586, + "step": 1865 + }, + { + "epoch": 0.3459399332591769, + "grad_norm": 7.5546875, + "learning_rate": 9.654060066740823e-06, + "loss": 2.7328, + "mean_token_accuracy": 0.43555612440803787, + "step": 1866 + }, + { + "epoch": 0.3461253244345569, + "grad_norm": 9.390625, + "learning_rate": 9.653874675565443e-06, + "loss": 2.3983, + "mean_token_accuracy": 0.4868520722865491, + "step": 1867 + }, + { + "epoch": 0.346310715609937, + "grad_norm": 7.36328125, + "learning_rate": 9.653689284390064e-06, + "loss": 3.2376, + "mean_token_accuracy": 0.40642750373692077, + "step": 1868 + }, + { + "epoch": 0.346496106785317, + "grad_norm": 6.7421875, + "learning_rate": 9.653503893214684e-06, + "loss": 3.0626, + "mean_token_accuracy": 0.4048688253367998, + "step": 1869 + }, + { + "epoch": 0.3466814979606971, + "grad_norm": 6.7421875, + "learning_rate": 9.653318502039303e-06, + "loss": 3.1158, + "mean_token_accuracy": 0.39600111080255485, + "step": 1870 + }, + { + "epoch": 0.3468668891360771, + "grad_norm": 12.8203125, + "learning_rate": 9.653133110863924e-06, + "loss": 2.4466, + "mean_token_accuracy": 0.4811119978717744, + "step": 1871 + }, + { + "epoch": 0.3470522803114572, + "grad_norm": 23.453125, + "learning_rate": 9.652947719688544e-06, + "loss": 2.7485, + "mean_token_accuracy": 0.423905625426538, + "step": 1872 + }, + { + "epoch": 0.3472376714868372, + "grad_norm": 9.1015625, + "learning_rate": 9.652762328513163e-06, + "loss": 2.9885, + "mean_token_accuracy": 0.40101343389111477, + "step": 1873 + }, + { + "epoch": 0.3474230626622173, + "grad_norm": 13.6875, + "learning_rate": 9.652576937337783e-06, + "loss": 2.41, + "mean_token_accuracy": 0.4916629777187546, + "step": 1874 + }, + { + "epoch": 0.3476084538375973, + "grad_norm": 14.8359375, + "learning_rate": 9.652391546162402e-06, + "loss": 3.1352, + "mean_token_accuracy": 0.3867057413081124, + "step": 1875 + }, + { + "epoch": 0.3477938450129774, + "grad_norm": 15.296875, + "learning_rate": 9.652206154987024e-06, + "loss": 2.6709, + "mean_token_accuracy": 0.45244316697151815, + "step": 1876 + }, + { + "epoch": 0.3479792361883574, + "grad_norm": 9.65625, + "learning_rate": 9.652020763811643e-06, + "loss": 2.7074, + "mean_token_accuracy": 0.4298653319032297, + "step": 1877 + }, + { + "epoch": 0.3481646273637375, + "grad_norm": 6.35546875, + "learning_rate": 9.651835372636264e-06, + "loss": 2.8416, + "mean_token_accuracy": 0.42488425925925927, + "step": 1878 + }, + { + "epoch": 0.3483500185391175, + "grad_norm": 12.8984375, + "learning_rate": 9.651649981460882e-06, + "loss": 2.9167, + "mean_token_accuracy": 0.4296344647519582, + "step": 1879 + }, + { + "epoch": 0.3485354097144976, + "grad_norm": 18.671875, + "learning_rate": 9.651464590285503e-06, + "loss": 2.3264, + "mean_token_accuracy": 0.4827279654559309, + "step": 1880 + }, + { + "epoch": 0.34872080088987767, + "grad_norm": 9.953125, + "learning_rate": 9.651279199110123e-06, + "loss": 2.7094, + "mean_token_accuracy": 0.428470629740695, + "step": 1881 + }, + { + "epoch": 0.3489061920652577, + "grad_norm": 9.984375, + "learning_rate": 9.651093807934742e-06, + "loss": 2.7984, + "mean_token_accuracy": 0.44339106654512306, + "step": 1882 + }, + { + "epoch": 0.34909158324063777, + "grad_norm": 7.53515625, + "learning_rate": 9.650908416759363e-06, + "loss": 3.1309, + "mean_token_accuracy": 0.4012059868633574, + "step": 1883 + }, + { + "epoch": 0.3492769744160178, + "grad_norm": 11.796875, + "learning_rate": 9.650723025583983e-06, + "loss": 2.7937, + "mean_token_accuracy": 0.4226482923906531, + "step": 1884 + }, + { + "epoch": 0.34946236559139787, + "grad_norm": 7.69921875, + "learning_rate": 9.650537634408604e-06, + "loss": 2.6282, + "mean_token_accuracy": 0.4644159000173581, + "step": 1885 + }, + { + "epoch": 0.3496477567667779, + "grad_norm": 6.515625, + "learning_rate": 9.650352243233223e-06, + "loss": 2.9318, + "mean_token_accuracy": 0.4122948614474038, + "step": 1886 + }, + { + "epoch": 0.349833147942158, + "grad_norm": 9.875, + "learning_rate": 9.650166852057843e-06, + "loss": 2.9755, + "mean_token_accuracy": 0.4144013880855986, + "step": 1887 + }, + { + "epoch": 0.350018539117538, + "grad_norm": 9.3046875, + "learning_rate": 9.649981460882464e-06, + "loss": 2.8052, + "mean_token_accuracy": 0.4299972655181843, + "step": 1888 + }, + { + "epoch": 0.3502039302929181, + "grad_norm": 8.21875, + "learning_rate": 9.649796069707082e-06, + "loss": 3.0619, + "mean_token_accuracy": 0.40293767368003175, + "step": 1889 + }, + { + "epoch": 0.3503893214682981, + "grad_norm": 9.21875, + "learning_rate": 9.649610678531703e-06, + "loss": 2.9595, + "mean_token_accuracy": 0.4108836744882676, + "step": 1890 + }, + { + "epoch": 0.3505747126436782, + "grad_norm": 6.78125, + "learning_rate": 9.649425287356322e-06, + "loss": 3.1242, + "mean_token_accuracy": 0.37603132429030905, + "step": 1891 + }, + { + "epoch": 0.3507601038190582, + "grad_norm": 5.51953125, + "learning_rate": 9.649239896180944e-06, + "loss": 3.0956, + "mean_token_accuracy": 0.4022231370934541, + "step": 1892 + }, + { + "epoch": 0.3509454949944383, + "grad_norm": 4.9375, + "learning_rate": 9.649054505005563e-06, + "loss": 2.3185, + "mean_token_accuracy": 0.5050731477111845, + "step": 1893 + }, + { + "epoch": 0.3511308861698183, + "grad_norm": 9.7421875, + "learning_rate": 9.648869113830183e-06, + "loss": 2.7189, + "mean_token_accuracy": 0.44263959390862945, + "step": 1894 + }, + { + "epoch": 0.3513162773451984, + "grad_norm": 8.9765625, + "learning_rate": 9.648683722654802e-06, + "loss": 2.3112, + "mean_token_accuracy": 0.48070460076486266, + "step": 1895 + }, + { + "epoch": 0.3515016685205784, + "grad_norm": 5.01171875, + "learning_rate": 9.648498331479422e-06, + "loss": 2.7872, + "mean_token_accuracy": 0.444634703196347, + "step": 1896 + }, + { + "epoch": 0.3516870596959585, + "grad_norm": 7.40234375, + "learning_rate": 9.648312940304043e-06, + "loss": 2.8237, + "mean_token_accuracy": 0.43508510373959497, + "step": 1897 + }, + { + "epoch": 0.3518724508713385, + "grad_norm": 7.28515625, + "learning_rate": 9.648127549128662e-06, + "loss": 2.6371, + "mean_token_accuracy": 0.4533402651416688, + "step": 1898 + }, + { + "epoch": 0.3520578420467186, + "grad_norm": 6.265625, + "learning_rate": 9.647942157953282e-06, + "loss": 2.6901, + "mean_token_accuracy": 0.4568251446726427, + "step": 1899 + }, + { + "epoch": 0.3522432332220986, + "grad_norm": 10.5625, + "learning_rate": 9.647756766777903e-06, + "loss": 2.709, + "mean_token_accuracy": 0.43246174237859986, + "step": 1900 + }, + { + "epoch": 0.3524286243974787, + "grad_norm": 7.90234375, + "learning_rate": 9.647571375602523e-06, + "loss": 2.8995, + "mean_token_accuracy": 0.4257629443364956, + "step": 1901 + }, + { + "epoch": 0.3526140155728587, + "grad_norm": 5.9765625, + "learning_rate": 9.647385984427142e-06, + "loss": 2.99, + "mean_token_accuracy": 0.41589648798521256, + "step": 1902 + }, + { + "epoch": 0.3527994067482388, + "grad_norm": 7.33203125, + "learning_rate": 9.647200593251762e-06, + "loss": 2.621, + "mean_token_accuracy": 0.45819659321769024, + "step": 1903 + }, + { + "epoch": 0.3529847979236188, + "grad_norm": 15.6953125, + "learning_rate": 9.647015202076381e-06, + "loss": 3.3667, + "mean_token_accuracy": 0.41359284256788575, + "step": 1904 + }, + { + "epoch": 0.3531701890989989, + "grad_norm": 10.171875, + "learning_rate": 9.646829810901002e-06, + "loss": 3.0174, + "mean_token_accuracy": 0.42861201875266375, + "step": 1905 + }, + { + "epoch": 0.35335558027437897, + "grad_norm": 12.7734375, + "learning_rate": 9.646644419725622e-06, + "loss": 3.2239, + "mean_token_accuracy": 0.4074025634318598, + "step": 1906 + }, + { + "epoch": 0.353540971449759, + "grad_norm": 6.48828125, + "learning_rate": 9.646459028550241e-06, + "loss": 2.8519, + "mean_token_accuracy": 0.428043400500775, + "step": 1907 + }, + { + "epoch": 0.35372636262513907, + "grad_norm": 6.0703125, + "learning_rate": 9.646273637374861e-06, + "loss": 2.8792, + "mean_token_accuracy": 0.4269440316988608, + "step": 1908 + }, + { + "epoch": 0.3539117538005191, + "grad_norm": 10.8359375, + "learning_rate": 9.646088246199482e-06, + "loss": 2.7078, + "mean_token_accuracy": 0.4334143899299099, + "step": 1909 + }, + { + "epoch": 0.35409714497589917, + "grad_norm": 10.7109375, + "learning_rate": 9.645902855024102e-06, + "loss": 2.6368, + "mean_token_accuracy": 0.46637820137995556, + "step": 1910 + }, + { + "epoch": 0.3542825361512792, + "grad_norm": 5.84375, + "learning_rate": 9.645717463848721e-06, + "loss": 3.0133, + "mean_token_accuracy": 0.4191351180104546, + "step": 1911 + }, + { + "epoch": 0.35446792732665927, + "grad_norm": 6.60546875, + "learning_rate": 9.645532072673342e-06, + "loss": 2.6503, + "mean_token_accuracy": 0.48240764011058057, + "step": 1912 + }, + { + "epoch": 0.3546533185020393, + "grad_norm": 6.57421875, + "learning_rate": 9.64534668149796e-06, + "loss": 3.0997, + "mean_token_accuracy": 0.4120201096892139, + "step": 1913 + }, + { + "epoch": 0.3548387096774194, + "grad_norm": 5.5546875, + "learning_rate": 9.645161290322581e-06, + "loss": 2.6042, + "mean_token_accuracy": 0.4689585694496915, + "step": 1914 + }, + { + "epoch": 0.3550241008527994, + "grad_norm": 5.1796875, + "learning_rate": 9.644975899147202e-06, + "loss": 2.813, + "mean_token_accuracy": 0.4416645712848881, + "step": 1915 + }, + { + "epoch": 0.3552094920281795, + "grad_norm": 6.484375, + "learning_rate": 9.644790507971822e-06, + "loss": 2.7345, + "mean_token_accuracy": 0.4460013218770654, + "step": 1916 + }, + { + "epoch": 0.3553948832035595, + "grad_norm": 6.7265625, + "learning_rate": 9.64460511679644e-06, + "loss": 2.6803, + "mean_token_accuracy": 0.468959629223076, + "step": 1917 + }, + { + "epoch": 0.3555802743789396, + "grad_norm": 6.67578125, + "learning_rate": 9.644419725621061e-06, + "loss": 2.9918, + "mean_token_accuracy": 0.4214811335525286, + "step": 1918 + }, + { + "epoch": 0.3557656655543196, + "grad_norm": 5.6015625, + "learning_rate": 9.644234334445682e-06, + "loss": 2.5548, + "mean_token_accuracy": 0.47409695817490494, + "step": 1919 + }, + { + "epoch": 0.3559510567296997, + "grad_norm": 7.625, + "learning_rate": 9.6440489432703e-06, + "loss": 3.0147, + "mean_token_accuracy": 0.4262320894347347, + "step": 1920 + }, + { + "epoch": 0.3561364479050797, + "grad_norm": 5.5859375, + "learning_rate": 9.643863552094921e-06, + "loss": 3.1127, + "mean_token_accuracy": 0.4058050383351588, + "step": 1921 + }, + { + "epoch": 0.3563218390804598, + "grad_norm": 5.57421875, + "learning_rate": 9.64367816091954e-06, + "loss": 2.7192, + "mean_token_accuracy": 0.44304980638269464, + "step": 1922 + }, + { + "epoch": 0.3565072302558398, + "grad_norm": 5.28515625, + "learning_rate": 9.64349276974416e-06, + "loss": 2.9308, + "mean_token_accuracy": 0.43669330055316535, + "step": 1923 + }, + { + "epoch": 0.3566926214312199, + "grad_norm": 5.22265625, + "learning_rate": 9.643307378568781e-06, + "loss": 2.8075, + "mean_token_accuracy": 0.4549289832653635, + "step": 1924 + }, + { + "epoch": 0.3568780126065999, + "grad_norm": 6.4765625, + "learning_rate": 9.643121987393401e-06, + "loss": 2.9893, + "mean_token_accuracy": 0.41002720559657985, + "step": 1925 + }, + { + "epoch": 0.35706340378198, + "grad_norm": 5.23828125, + "learning_rate": 9.642936596218022e-06, + "loss": 3.3922, + "mean_token_accuracy": 0.39280898876404496, + "step": 1926 + }, + { + "epoch": 0.35724879495736, + "grad_norm": 7.203125, + "learning_rate": 9.64275120504264e-06, + "loss": 3.1683, + "mean_token_accuracy": 0.414902170999732, + "step": 1927 + }, + { + "epoch": 0.3574341861327401, + "grad_norm": 7.08984375, + "learning_rate": 9.642565813867261e-06, + "loss": 3.0045, + "mean_token_accuracy": 0.4277951208232166, + "step": 1928 + }, + { + "epoch": 0.3576195773081201, + "grad_norm": 7.1640625, + "learning_rate": 9.64238042269188e-06, + "loss": 2.8381, + "mean_token_accuracy": 0.41798127484930103, + "step": 1929 + }, + { + "epoch": 0.3578049684835002, + "grad_norm": 5.40625, + "learning_rate": 9.6421950315165e-06, + "loss": 3.2355, + "mean_token_accuracy": 0.4127672297802362, + "step": 1930 + }, + { + "epoch": 0.35799035965888026, + "grad_norm": 8.6640625, + "learning_rate": 9.642009640341121e-06, + "loss": 3.0069, + "mean_token_accuracy": 0.4026418786692759, + "step": 1931 + }, + { + "epoch": 0.3581757508342603, + "grad_norm": 8.125, + "learning_rate": 9.641824249165741e-06, + "loss": 2.3459, + "mean_token_accuracy": 0.508284023668639, + "step": 1932 + }, + { + "epoch": 0.35836114200964037, + "grad_norm": 7.8671875, + "learning_rate": 9.64163885799036e-06, + "loss": 2.3111, + "mean_token_accuracy": 0.4984088498257312, + "step": 1933 + }, + { + "epoch": 0.3585465331850204, + "grad_norm": 5.6484375, + "learning_rate": 9.64145346681498e-06, + "loss": 2.7673, + "mean_token_accuracy": 0.44067255507076725, + "step": 1934 + }, + { + "epoch": 0.35873192436040047, + "grad_norm": 13.4375, + "learning_rate": 9.641268075639601e-06, + "loss": 2.7675, + "mean_token_accuracy": 0.4404713531084925, + "step": 1935 + }, + { + "epoch": 0.3589173155357805, + "grad_norm": 7.22265625, + "learning_rate": 9.64108268446422e-06, + "loss": 3.1261, + "mean_token_accuracy": 0.4106641721234799, + "step": 1936 + }, + { + "epoch": 0.35910270671116057, + "grad_norm": 5.984375, + "learning_rate": 9.64089729328884e-06, + "loss": 3.6334, + "mean_token_accuracy": 0.3606942889137738, + "step": 1937 + }, + { + "epoch": 0.3592880978865406, + "grad_norm": 10.3515625, + "learning_rate": 9.64071190211346e-06, + "loss": 2.8474, + "mean_token_accuracy": 0.4440717326796542, + "step": 1938 + }, + { + "epoch": 0.35947348906192067, + "grad_norm": 7.02734375, + "learning_rate": 9.64052651093808e-06, + "loss": 2.7504, + "mean_token_accuracy": 0.43622412045750686, + "step": 1939 + }, + { + "epoch": 0.3596588802373007, + "grad_norm": 5.91015625, + "learning_rate": 9.6403411197627e-06, + "loss": 3.3064, + "mean_token_accuracy": 0.41315177681833276, + "step": 1940 + }, + { + "epoch": 0.35984427141268077, + "grad_norm": 8.578125, + "learning_rate": 9.64015572858732e-06, + "loss": 2.3217, + "mean_token_accuracy": 0.4976589324732078, + "step": 1941 + }, + { + "epoch": 0.3600296625880608, + "grad_norm": 9.734375, + "learning_rate": 9.63997033741194e-06, + "loss": 3.0893, + "mean_token_accuracy": 0.40950704225352114, + "step": 1942 + }, + { + "epoch": 0.3602150537634409, + "grad_norm": 9.0234375, + "learning_rate": 9.63978494623656e-06, + "loss": 2.7221, + "mean_token_accuracy": 0.45620661494487547, + "step": 1943 + }, + { + "epoch": 0.3604004449388209, + "grad_norm": 5.83203125, + "learning_rate": 9.63959955506118e-06, + "loss": 3.4177, + "mean_token_accuracy": 0.3831385642737897, + "step": 1944 + }, + { + "epoch": 0.360585836114201, + "grad_norm": 8.5390625, + "learning_rate": 9.6394141638858e-06, + "loss": 3.0043, + "mean_token_accuracy": 0.43035900491583495, + "step": 1945 + }, + { + "epoch": 0.360771227289581, + "grad_norm": 13.6484375, + "learning_rate": 9.63922877271042e-06, + "loss": 2.5515, + "mean_token_accuracy": 0.45373272959479855, + "step": 1946 + }, + { + "epoch": 0.3609566184649611, + "grad_norm": 7.6328125, + "learning_rate": 9.639043381535039e-06, + "loss": 2.7821, + "mean_token_accuracy": 0.44484864232817983, + "step": 1947 + }, + { + "epoch": 0.3611420096403411, + "grad_norm": 8.4921875, + "learning_rate": 9.63885799035966e-06, + "loss": 3.1947, + "mean_token_accuracy": 0.41179495971198354, + "step": 1948 + }, + { + "epoch": 0.3613274008157212, + "grad_norm": 7.62109375, + "learning_rate": 9.63867259918428e-06, + "loss": 3.1753, + "mean_token_accuracy": 0.4215343203230148, + "step": 1949 + }, + { + "epoch": 0.3615127919911012, + "grad_norm": 12.1328125, + "learning_rate": 9.6384872080089e-06, + "loss": 2.5528, + "mean_token_accuracy": 0.4636156186612576, + "step": 1950 + }, + { + "epoch": 0.3616981831664813, + "grad_norm": 8.046875, + "learning_rate": 9.638301816833519e-06, + "loss": 3.0346, + "mean_token_accuracy": 0.4150741681143926, + "step": 1951 + }, + { + "epoch": 0.3618835743418613, + "grad_norm": 5.7734375, + "learning_rate": 9.63811642565814e-06, + "loss": 2.8098, + "mean_token_accuracy": 0.4306280367104553, + "step": 1952 + }, + { + "epoch": 0.3620689655172414, + "grad_norm": 8.5390625, + "learning_rate": 9.63793103448276e-06, + "loss": 2.5817, + "mean_token_accuracy": 0.4619392185238784, + "step": 1953 + }, + { + "epoch": 0.3622543566926214, + "grad_norm": 7.296875, + "learning_rate": 9.637745643307379e-06, + "loss": 2.732, + "mean_token_accuracy": 0.45091623036649214, + "step": 1954 + }, + { + "epoch": 0.3624397478680015, + "grad_norm": 5.2734375, + "learning_rate": 9.637560252132e-06, + "loss": 2.9312, + "mean_token_accuracy": 0.42870165386117987, + "step": 1955 + }, + { + "epoch": 0.36262513904338156, + "grad_norm": 7.265625, + "learning_rate": 9.63737486095662e-06, + "loss": 3.1522, + "mean_token_accuracy": 0.40306534014520035, + "step": 1956 + }, + { + "epoch": 0.3628105302187616, + "grad_norm": 8.4453125, + "learning_rate": 9.63718946978124e-06, + "loss": 2.9769, + "mean_token_accuracy": 0.41661151555261416, + "step": 1957 + }, + { + "epoch": 0.36299592139414166, + "grad_norm": 6.8046875, + "learning_rate": 9.637004078605859e-06, + "loss": 3.3178, + "mean_token_accuracy": 0.4092255125284738, + "step": 1958 + }, + { + "epoch": 0.3631813125695217, + "grad_norm": 8.2265625, + "learning_rate": 9.63681868743048e-06, + "loss": 2.4737, + "mean_token_accuracy": 0.4673913043478261, + "step": 1959 + }, + { + "epoch": 0.36336670374490176, + "grad_norm": 5.98046875, + "learning_rate": 9.636633296255098e-06, + "loss": 2.8654, + "mean_token_accuracy": 0.434767401189227, + "step": 1960 + }, + { + "epoch": 0.3635520949202818, + "grad_norm": 6.94140625, + "learning_rate": 9.636447905079719e-06, + "loss": 2.8771, + "mean_token_accuracy": 0.4288475836431227, + "step": 1961 + }, + { + "epoch": 0.36373748609566187, + "grad_norm": 8.5234375, + "learning_rate": 9.63626251390434e-06, + "loss": 2.7533, + "mean_token_accuracy": 0.44155649038461536, + "step": 1962 + }, + { + "epoch": 0.3639228772710419, + "grad_norm": 6.21875, + "learning_rate": 9.636077122728958e-06, + "loss": 2.7725, + "mean_token_accuracy": 0.43539630836047777, + "step": 1963 + }, + { + "epoch": 0.36410826844642197, + "grad_norm": 10.1953125, + "learning_rate": 9.63589173155358e-06, + "loss": 2.9197, + "mean_token_accuracy": 0.4285538461538462, + "step": 1964 + }, + { + "epoch": 0.364293659621802, + "grad_norm": 10.8984375, + "learning_rate": 9.635706340378199e-06, + "loss": 3.3504, + "mean_token_accuracy": 0.38387329013678906, + "step": 1965 + }, + { + "epoch": 0.36447905079718207, + "grad_norm": 10.453125, + "learning_rate": 9.63552094920282e-06, + "loss": 2.8602, + "mean_token_accuracy": 0.43345965225144895, + "step": 1966 + }, + { + "epoch": 0.3646644419725621, + "grad_norm": 6.8125, + "learning_rate": 9.635335558027438e-06, + "loss": 3.106, + "mean_token_accuracy": 0.4032211676732816, + "step": 1967 + }, + { + "epoch": 0.36484983314794217, + "grad_norm": 5.28125, + "learning_rate": 9.635150166852059e-06, + "loss": 2.9347, + "mean_token_accuracy": 0.42733545066257805, + "step": 1968 + }, + { + "epoch": 0.3650352243233222, + "grad_norm": 12.46875, + "learning_rate": 9.63496477567668e-06, + "loss": 3.0404, + "mean_token_accuracy": 0.422849277357192, + "step": 1969 + }, + { + "epoch": 0.3652206154987023, + "grad_norm": 11.9765625, + "learning_rate": 9.634779384501298e-06, + "loss": 3.107, + "mean_token_accuracy": 0.4027143738433066, + "step": 1970 + }, + { + "epoch": 0.3654060066740823, + "grad_norm": 11.1953125, + "learning_rate": 9.634593993325919e-06, + "loss": 2.8292, + "mean_token_accuracy": 0.4358153189218041, + "step": 1971 + }, + { + "epoch": 0.3655913978494624, + "grad_norm": 5.3671875, + "learning_rate": 9.634408602150539e-06, + "loss": 3.1158, + "mean_token_accuracy": 0.42786385572771146, + "step": 1972 + }, + { + "epoch": 0.3657767890248424, + "grad_norm": 7.6796875, + "learning_rate": 9.63422321097516e-06, + "loss": 2.6339, + "mean_token_accuracy": 0.4522490221642764, + "step": 1973 + }, + { + "epoch": 0.3659621802002225, + "grad_norm": 8.34375, + "learning_rate": 9.634037819799778e-06, + "loss": 2.389, + "mean_token_accuracy": 0.4936310505020231, + "step": 1974 + }, + { + "epoch": 0.3661475713756025, + "grad_norm": 8.5546875, + "learning_rate": 9.633852428624399e-06, + "loss": 2.9325, + "mean_token_accuracy": 0.4143014604574263, + "step": 1975 + }, + { + "epoch": 0.3663329625509826, + "grad_norm": 8.015625, + "learning_rate": 9.633667037449018e-06, + "loss": 3.1065, + "mean_token_accuracy": 0.409533352419124, + "step": 1976 + }, + { + "epoch": 0.3665183537263626, + "grad_norm": 6.515625, + "learning_rate": 9.633481646273638e-06, + "loss": 2.0513, + "mean_token_accuracy": 0.5394495412844037, + "step": 1977 + }, + { + "epoch": 0.3667037449017427, + "grad_norm": 10.2109375, + "learning_rate": 9.633296255098259e-06, + "loss": 2.736, + "mean_token_accuracy": 0.4470061940812113, + "step": 1978 + }, + { + "epoch": 0.3668891360771227, + "grad_norm": 6.28515625, + "learning_rate": 9.633110863922877e-06, + "loss": 2.8101, + "mean_token_accuracy": 0.4428782166857892, + "step": 1979 + }, + { + "epoch": 0.3670745272525028, + "grad_norm": 5.7421875, + "learning_rate": 9.632925472747498e-06, + "loss": 2.7887, + "mean_token_accuracy": 0.44549583648750946, + "step": 1980 + }, + { + "epoch": 0.36725991842788286, + "grad_norm": 10.1953125, + "learning_rate": 9.632740081572118e-06, + "loss": 2.6611, + "mean_token_accuracy": 0.46782544378698226, + "step": 1981 + }, + { + "epoch": 0.3674453096032629, + "grad_norm": 7.98828125, + "learning_rate": 9.632554690396739e-06, + "loss": 2.8348, + "mean_token_accuracy": 0.43077601410934746, + "step": 1982 + }, + { + "epoch": 0.36763070077864296, + "grad_norm": 9.1328125, + "learning_rate": 9.632369299221358e-06, + "loss": 2.5115, + "mean_token_accuracy": 0.46553715825953024, + "step": 1983 + }, + { + "epoch": 0.367816091954023, + "grad_norm": 6.26171875, + "learning_rate": 9.632183908045978e-06, + "loss": 2.6701, + "mean_token_accuracy": 0.4568436258577104, + "step": 1984 + }, + { + "epoch": 0.36800148312940306, + "grad_norm": 8.3359375, + "learning_rate": 9.631998516870597e-06, + "loss": 2.8724, + "mean_token_accuracy": 0.4445274003285053, + "step": 1985 + }, + { + "epoch": 0.3681868743047831, + "grad_norm": 8.8671875, + "learning_rate": 9.631813125695217e-06, + "loss": 3.4052, + "mean_token_accuracy": 0.39631197097944376, + "step": 1986 + }, + { + "epoch": 0.36837226548016316, + "grad_norm": 10.453125, + "learning_rate": 9.631627734519838e-06, + "loss": 3.0992, + "mean_token_accuracy": 0.40095208845208846, + "step": 1987 + }, + { + "epoch": 0.3685576566555432, + "grad_norm": 6.3125, + "learning_rate": 9.631442343344457e-06, + "loss": 2.8029, + "mean_token_accuracy": 0.4380938459289578, + "step": 1988 + }, + { + "epoch": 0.36874304783092327, + "grad_norm": 7.2109375, + "learning_rate": 9.631256952169077e-06, + "loss": 3.1823, + "mean_token_accuracy": 0.3995384197664947, + "step": 1989 + }, + { + "epoch": 0.3689284390063033, + "grad_norm": 8.3671875, + "learning_rate": 9.631071560993698e-06, + "loss": 2.175, + "mean_token_accuracy": 0.5395224109309156, + "step": 1990 + }, + { + "epoch": 0.36911383018168337, + "grad_norm": 9.765625, + "learning_rate": 9.630886169818318e-06, + "loss": 3.2129, + "mean_token_accuracy": 0.4031246436309727, + "step": 1991 + }, + { + "epoch": 0.3692992213570634, + "grad_norm": 9.125, + "learning_rate": 9.630700778642937e-06, + "loss": 2.8497, + "mean_token_accuracy": 0.42178414409303644, + "step": 1992 + }, + { + "epoch": 0.36948461253244347, + "grad_norm": 6.91015625, + "learning_rate": 9.630515387467557e-06, + "loss": 2.8404, + "mean_token_accuracy": 0.43538393449878293, + "step": 1993 + }, + { + "epoch": 0.3696700037078235, + "grad_norm": 6.37890625, + "learning_rate": 9.630329996292176e-06, + "loss": 2.8295, + "mean_token_accuracy": 0.45617752007136486, + "step": 1994 + }, + { + "epoch": 0.36985539488320357, + "grad_norm": 9.3203125, + "learning_rate": 9.630144605116797e-06, + "loss": 3.0167, + "mean_token_accuracy": 0.4160714285714286, + "step": 1995 + }, + { + "epoch": 0.3700407860585836, + "grad_norm": 5.3203125, + "learning_rate": 9.629959213941417e-06, + "loss": 3.2577, + "mean_token_accuracy": 0.38896687254351486, + "step": 1996 + }, + { + "epoch": 0.37022617723396367, + "grad_norm": 6.15625, + "learning_rate": 9.629773822766038e-06, + "loss": 3.1339, + "mean_token_accuracy": 0.40636223704463825, + "step": 1997 + }, + { + "epoch": 0.3704115684093437, + "grad_norm": 6.83203125, + "learning_rate": 9.629588431590657e-06, + "loss": 3.4045, + "mean_token_accuracy": 0.3807138384470883, + "step": 1998 + }, + { + "epoch": 0.3705969595847238, + "grad_norm": 9.1484375, + "learning_rate": 9.629403040415277e-06, + "loss": 2.821, + "mean_token_accuracy": 0.43425551756294684, + "step": 1999 + }, + { + "epoch": 0.3707823507601038, + "grad_norm": 6.15234375, + "learning_rate": 9.629217649239898e-06, + "loss": 2.3541, + "mean_token_accuracy": 0.49463428410053656, + "step": 2000 + }, + { + "epoch": 0.3709677419354839, + "grad_norm": 7.265625, + "learning_rate": 9.629032258064516e-06, + "loss": 2.8611, + "mean_token_accuracy": 0.4388441746932313, + "step": 2001 + }, + { + "epoch": 0.3711531331108639, + "grad_norm": 6.91796875, + "learning_rate": 9.628846866889137e-06, + "loss": 2.633, + "mean_token_accuracy": 0.46019615335626035, + "step": 2002 + }, + { + "epoch": 0.371338524286244, + "grad_norm": 6.11328125, + "learning_rate": 9.628661475713756e-06, + "loss": 2.7864, + "mean_token_accuracy": 0.45384073291050037, + "step": 2003 + }, + { + "epoch": 0.371523915461624, + "grad_norm": 6.64453125, + "learning_rate": 9.628476084538376e-06, + "loss": 2.5784, + "mean_token_accuracy": 0.47295758610873895, + "step": 2004 + }, + { + "epoch": 0.3717093066370041, + "grad_norm": 5.67578125, + "learning_rate": 9.628290693362997e-06, + "loss": 2.4409, + "mean_token_accuracy": 0.509175465057818, + "step": 2005 + }, + { + "epoch": 0.37189469781238416, + "grad_norm": 7.19140625, + "learning_rate": 9.628105302187617e-06, + "loss": 2.668, + "mean_token_accuracy": 0.45245486949522823, + "step": 2006 + }, + { + "epoch": 0.3720800889877642, + "grad_norm": 5.4921875, + "learning_rate": 9.627919911012238e-06, + "loss": 2.4966, + "mean_token_accuracy": 0.46609897172236503, + "step": 2007 + }, + { + "epoch": 0.37226548016314426, + "grad_norm": 4.8984375, + "learning_rate": 9.627734519836856e-06, + "loss": 3.14, + "mean_token_accuracy": 0.4135108891663199, + "step": 2008 + }, + { + "epoch": 0.3724508713385243, + "grad_norm": 8.265625, + "learning_rate": 9.627549128661477e-06, + "loss": 2.587, + "mean_token_accuracy": 0.45553964327561514, + "step": 2009 + }, + { + "epoch": 0.37263626251390436, + "grad_norm": 7.1484375, + "learning_rate": 9.627363737486096e-06, + "loss": 3.1487, + "mean_token_accuracy": 0.3924928066963118, + "step": 2010 + }, + { + "epoch": 0.3728216536892844, + "grad_norm": 5.28515625, + "learning_rate": 9.627178346310716e-06, + "loss": 3.2453, + "mean_token_accuracy": 0.3857559836544075, + "step": 2011 + }, + { + "epoch": 0.37300704486466446, + "grad_norm": 9.2890625, + "learning_rate": 9.626992955135337e-06, + "loss": 2.9718, + "mean_token_accuracy": 0.4310854776693286, + "step": 2012 + }, + { + "epoch": 0.3731924360400445, + "grad_norm": 6.68359375, + "learning_rate": 9.626807563959957e-06, + "loss": 3.0917, + "mean_token_accuracy": 0.41037366083093807, + "step": 2013 + }, + { + "epoch": 0.37337782721542456, + "grad_norm": 6.5546875, + "learning_rate": 9.626622172784576e-06, + "loss": 2.8792, + "mean_token_accuracy": 0.42188208616780043, + "step": 2014 + }, + { + "epoch": 0.3735632183908046, + "grad_norm": 5.8046875, + "learning_rate": 9.626436781609196e-06, + "loss": 2.5818, + "mean_token_accuracy": 0.4514761765565624, + "step": 2015 + }, + { + "epoch": 0.37374860956618466, + "grad_norm": 5.53125, + "learning_rate": 9.626251390433817e-06, + "loss": 3.1144, + "mean_token_accuracy": 0.4027985328080424, + "step": 2016 + }, + { + "epoch": 0.3739340007415647, + "grad_norm": 5.94921875, + "learning_rate": 9.626065999258436e-06, + "loss": 2.3861, + "mean_token_accuracy": 0.4908996359854394, + "step": 2017 + }, + { + "epoch": 0.37411939191694477, + "grad_norm": 5.4296875, + "learning_rate": 9.625880608083056e-06, + "loss": 2.9198, + "mean_token_accuracy": 0.4199238041484408, + "step": 2018 + }, + { + "epoch": 0.3743047830923248, + "grad_norm": 6.28515625, + "learning_rate": 9.625695216907675e-06, + "loss": 2.7408, + "mean_token_accuracy": 0.44878048780487806, + "step": 2019 + }, + { + "epoch": 0.37449017426770487, + "grad_norm": 8.7578125, + "learning_rate": 9.625509825732296e-06, + "loss": 2.7781, + "mean_token_accuracy": 0.41345080034743764, + "step": 2020 + }, + { + "epoch": 0.3746755654430849, + "grad_norm": 6.57421875, + "learning_rate": 9.625324434556916e-06, + "loss": 2.8937, + "mean_token_accuracy": 0.4264958127333266, + "step": 2021 + }, + { + "epoch": 0.37486095661846497, + "grad_norm": 5.33984375, + "learning_rate": 9.625139043381536e-06, + "loss": 2.3729, + "mean_token_accuracy": 0.5096894409937888, + "step": 2022 + }, + { + "epoch": 0.375046347793845, + "grad_norm": 9.1640625, + "learning_rate": 9.624953652206155e-06, + "loss": 3.0185, + "mean_token_accuracy": 0.43386636915829924, + "step": 2023 + }, + { + "epoch": 0.37523173896922507, + "grad_norm": 9.671875, + "learning_rate": 9.624768261030776e-06, + "loss": 2.5644, + "mean_token_accuracy": 0.47907502827698883, + "step": 2024 + }, + { + "epoch": 0.3754171301446051, + "grad_norm": 5.9375, + "learning_rate": 9.624582869855396e-06, + "loss": 2.9866, + "mean_token_accuracy": 0.4180306230200634, + "step": 2025 + }, + { + "epoch": 0.37560252131998517, + "grad_norm": 8.796875, + "learning_rate": 9.624397478680015e-06, + "loss": 2.6531, + "mean_token_accuracy": 0.45761967501097933, + "step": 2026 + }, + { + "epoch": 0.3757879124953652, + "grad_norm": 7.0, + "learning_rate": 9.624212087504636e-06, + "loss": 2.9049, + "mean_token_accuracy": 0.42466502597757727, + "step": 2027 + }, + { + "epoch": 0.3759733036707453, + "grad_norm": 6.56640625, + "learning_rate": 9.624026696329254e-06, + "loss": 3.2015, + "mean_token_accuracy": 0.3909145248057382, + "step": 2028 + }, + { + "epoch": 0.37615869484612535, + "grad_norm": 5.1015625, + "learning_rate": 9.623841305153877e-06, + "loss": 2.7174, + "mean_token_accuracy": 0.4525670313815078, + "step": 2029 + }, + { + "epoch": 0.3763440860215054, + "grad_norm": 7.5703125, + "learning_rate": 9.623655913978495e-06, + "loss": 2.8468, + "mean_token_accuracy": 0.43388305847076464, + "step": 2030 + }, + { + "epoch": 0.37652947719688545, + "grad_norm": 8.1953125, + "learning_rate": 9.623470522803116e-06, + "loss": 2.7492, + "mean_token_accuracy": 0.44240525216353327, + "step": 2031 + }, + { + "epoch": 0.3767148683722655, + "grad_norm": 5.515625, + "learning_rate": 9.623285131627735e-06, + "loss": 2.6994, + "mean_token_accuracy": 0.4571729464076761, + "step": 2032 + }, + { + "epoch": 0.37690025954764556, + "grad_norm": 9.9453125, + "learning_rate": 9.623099740452355e-06, + "loss": 2.6625, + "mean_token_accuracy": 0.454396504642272, + "step": 2033 + }, + { + "epoch": 0.3770856507230256, + "grad_norm": 6.41796875, + "learning_rate": 9.622914349276976e-06, + "loss": 3.2146, + "mean_token_accuracy": 0.3953187485297577, + "step": 2034 + }, + { + "epoch": 0.37727104189840566, + "grad_norm": 8.875, + "learning_rate": 9.622728958101594e-06, + "loss": 3.5484, + "mean_token_accuracy": 0.3681206685690991, + "step": 2035 + }, + { + "epoch": 0.3774564330737857, + "grad_norm": 5.72265625, + "learning_rate": 9.622543566926215e-06, + "loss": 3.07, + "mean_token_accuracy": 0.40710232473000185, + "step": 2036 + }, + { + "epoch": 0.37764182424916576, + "grad_norm": 6.0, + "learning_rate": 9.622358175750835e-06, + "loss": 2.6312, + "mean_token_accuracy": 0.4592440215993829, + "step": 2037 + }, + { + "epoch": 0.3778272154245458, + "grad_norm": 5.8828125, + "learning_rate": 9.622172784575456e-06, + "loss": 2.754, + "mean_token_accuracy": 0.4361594751450921, + "step": 2038 + }, + { + "epoch": 0.37801260659992586, + "grad_norm": 9.125, + "learning_rate": 9.621987393400075e-06, + "loss": 2.3551, + "mean_token_accuracy": 0.46254248810333104, + "step": 2039 + }, + { + "epoch": 0.3781979977753059, + "grad_norm": 7.25, + "learning_rate": 9.621802002224695e-06, + "loss": 2.9968, + "mean_token_accuracy": 0.42607640994542145, + "step": 2040 + }, + { + "epoch": 0.37838338895068596, + "grad_norm": 6.359375, + "learning_rate": 9.621616611049314e-06, + "loss": 3.0349, + "mean_token_accuracy": 0.4182504556105181, + "step": 2041 + }, + { + "epoch": 0.378568780126066, + "grad_norm": 7.11328125, + "learning_rate": 9.621431219873934e-06, + "loss": 2.6081, + "mean_token_accuracy": 0.44713656387665196, + "step": 2042 + }, + { + "epoch": 0.37875417130144606, + "grad_norm": 6.11328125, + "learning_rate": 9.621245828698555e-06, + "loss": 3.1703, + "mean_token_accuracy": 0.4067410035478966, + "step": 2043 + }, + { + "epoch": 0.3789395624768261, + "grad_norm": 9.84375, + "learning_rate": 9.621060437523174e-06, + "loss": 2.898, + "mean_token_accuracy": 0.4383773626616558, + "step": 2044 + }, + { + "epoch": 0.37912495365220616, + "grad_norm": 5.88671875, + "learning_rate": 9.620875046347796e-06, + "loss": 2.8187, + "mean_token_accuracy": 0.4430740037950664, + "step": 2045 + }, + { + "epoch": 0.3793103448275862, + "grad_norm": 7.15234375, + "learning_rate": 9.620689655172415e-06, + "loss": 2.8045, + "mean_token_accuracy": 0.4574505858681501, + "step": 2046 + }, + { + "epoch": 0.37949573600296627, + "grad_norm": 6.42578125, + "learning_rate": 9.620504263997035e-06, + "loss": 3.0148, + "mean_token_accuracy": 0.4261457934380186, + "step": 2047 + }, + { + "epoch": 0.3796811271783463, + "grad_norm": 6.83984375, + "learning_rate": 9.620318872821654e-06, + "loss": 2.8098, + "mean_token_accuracy": 0.45230682139824513, + "step": 2048 + }, + { + "epoch": 0.37986651835372637, + "grad_norm": 7.46484375, + "learning_rate": 9.620133481646275e-06, + "loss": 2.9976, + "mean_token_accuracy": 0.40401785714285715, + "step": 2049 + }, + { + "epoch": 0.3800519095291064, + "grad_norm": 7.50390625, + "learning_rate": 9.619948090470895e-06, + "loss": 2.6342, + "mean_token_accuracy": 0.4509297520661157, + "step": 2050 + }, + { + "epoch": 0.38023730070448647, + "grad_norm": 7.31640625, + "learning_rate": 9.619762699295514e-06, + "loss": 2.8952, + "mean_token_accuracy": 0.4513000702740689, + "step": 2051 + }, + { + "epoch": 0.3804226918798665, + "grad_norm": 6.8203125, + "learning_rate": 9.619577308120134e-06, + "loss": 3.02, + "mean_token_accuracy": 0.4429545989179017, + "step": 2052 + }, + { + "epoch": 0.38060808305524657, + "grad_norm": 5.8203125, + "learning_rate": 9.619391916944755e-06, + "loss": 2.9102, + "mean_token_accuracy": 0.41605153670648237, + "step": 2053 + }, + { + "epoch": 0.38079347423062665, + "grad_norm": 7.296875, + "learning_rate": 9.619206525769375e-06, + "loss": 2.9864, + "mean_token_accuracy": 0.42070532237126784, + "step": 2054 + }, + { + "epoch": 0.3809788654060067, + "grad_norm": 6.36328125, + "learning_rate": 9.619021134593994e-06, + "loss": 3.3018, + "mean_token_accuracy": 0.38093232238135355, + "step": 2055 + }, + { + "epoch": 0.38116425658138675, + "grad_norm": 8.0078125, + "learning_rate": 9.618835743418615e-06, + "loss": 2.9429, + "mean_token_accuracy": 0.4352600274515785, + "step": 2056 + }, + { + "epoch": 0.3813496477567668, + "grad_norm": 7.66796875, + "learning_rate": 9.618650352243233e-06, + "loss": 2.8707, + "mean_token_accuracy": 0.4282781275663838, + "step": 2057 + }, + { + "epoch": 0.38153503893214685, + "grad_norm": 6.72265625, + "learning_rate": 9.618464961067854e-06, + "loss": 3.138, + "mean_token_accuracy": 0.4047550432276657, + "step": 2058 + }, + { + "epoch": 0.3817204301075269, + "grad_norm": 7.92578125, + "learning_rate": 9.618279569892474e-06, + "loss": 2.4854, + "mean_token_accuracy": 0.47245017584994137, + "step": 2059 + }, + { + "epoch": 0.38190582128290695, + "grad_norm": 7.2734375, + "learning_rate": 9.618094178717093e-06, + "loss": 2.6575, + "mean_token_accuracy": 0.4482477587612062, + "step": 2060 + }, + { + "epoch": 0.382091212458287, + "grad_norm": 6.23046875, + "learning_rate": 9.617908787541714e-06, + "loss": 2.7923, + "mean_token_accuracy": 0.44017611447440835, + "step": 2061 + }, + { + "epoch": 0.38227660363366706, + "grad_norm": 7.7265625, + "learning_rate": 9.617723396366334e-06, + "loss": 2.9259, + "mean_token_accuracy": 0.4344192392972881, + "step": 2062 + }, + { + "epoch": 0.3824619948090471, + "grad_norm": 5.5546875, + "learning_rate": 9.617538005190955e-06, + "loss": 2.775, + "mean_token_accuracy": 0.45398860398860397, + "step": 2063 + }, + { + "epoch": 0.38264738598442716, + "grad_norm": 7.3828125, + "learning_rate": 9.617352614015573e-06, + "loss": 3.0458, + "mean_token_accuracy": 0.40997804342522565, + "step": 2064 + }, + { + "epoch": 0.3828327771598072, + "grad_norm": 60.53125, + "learning_rate": 9.617167222840194e-06, + "loss": 2.622, + "mean_token_accuracy": 0.46342464348004514, + "step": 2065 + }, + { + "epoch": 0.38301816833518726, + "grad_norm": 5.21484375, + "learning_rate": 9.616981831664813e-06, + "loss": 2.9078, + "mean_token_accuracy": 0.43449565504344956, + "step": 2066 + }, + { + "epoch": 0.3832035595105673, + "grad_norm": 5.4140625, + "learning_rate": 9.616796440489433e-06, + "loss": 3.0555, + "mean_token_accuracy": 0.40692484014959585, + "step": 2067 + }, + { + "epoch": 0.38338895068594736, + "grad_norm": 6.81640625, + "learning_rate": 9.616611049314054e-06, + "loss": 2.8763, + "mean_token_accuracy": 0.4387933547070825, + "step": 2068 + }, + { + "epoch": 0.3835743418613274, + "grad_norm": 8.703125, + "learning_rate": 9.616425658138674e-06, + "loss": 2.7112, + "mean_token_accuracy": 0.4468868821292776, + "step": 2069 + }, + { + "epoch": 0.38375973303670746, + "grad_norm": 7.13671875, + "learning_rate": 9.616240266963293e-06, + "loss": 2.7574, + "mean_token_accuracy": 0.4438002371853999, + "step": 2070 + }, + { + "epoch": 0.3839451242120875, + "grad_norm": 7.109375, + "learning_rate": 9.616054875787913e-06, + "loss": 2.5961, + "mean_token_accuracy": 0.4589008924377642, + "step": 2071 + }, + { + "epoch": 0.38413051538746756, + "grad_norm": 5.78515625, + "learning_rate": 9.615869484612534e-06, + "loss": 3.002, + "mean_token_accuracy": 0.4162524850894632, + "step": 2072 + }, + { + "epoch": 0.3843159065628476, + "grad_norm": 5.44140625, + "learning_rate": 9.615684093437153e-06, + "loss": 2.7992, + "mean_token_accuracy": 0.444640234948605, + "step": 2073 + }, + { + "epoch": 0.38450129773822767, + "grad_norm": 5.65234375, + "learning_rate": 9.615498702261773e-06, + "loss": 3.0131, + "mean_token_accuracy": 0.41701769165964614, + "step": 2074 + }, + { + "epoch": 0.3846866889136077, + "grad_norm": 8.9765625, + "learning_rate": 9.615313311086392e-06, + "loss": 2.7005, + "mean_token_accuracy": 0.4645742697327533, + "step": 2075 + }, + { + "epoch": 0.38487208008898777, + "grad_norm": 5.578125, + "learning_rate": 9.615127919911013e-06, + "loss": 2.5209, + "mean_token_accuracy": 0.4868745793134395, + "step": 2076 + }, + { + "epoch": 0.3850574712643678, + "grad_norm": 5.46875, + "learning_rate": 9.614942528735633e-06, + "loss": 2.8761, + "mean_token_accuracy": 0.44999288661260495, + "step": 2077 + }, + { + "epoch": 0.38524286243974787, + "grad_norm": 5.83984375, + "learning_rate": 9.614757137560254e-06, + "loss": 2.5907, + "mean_token_accuracy": 0.46751907609816457, + "step": 2078 + }, + { + "epoch": 0.38542825361512795, + "grad_norm": 5.703125, + "learning_rate": 9.614571746384872e-06, + "loss": 3.0025, + "mean_token_accuracy": 0.4178844056706652, + "step": 2079 + }, + { + "epoch": 0.38561364479050797, + "grad_norm": 7.05078125, + "learning_rate": 9.614386355209493e-06, + "loss": 2.6778, + "mean_token_accuracy": 0.44808281398542305, + "step": 2080 + }, + { + "epoch": 0.38579903596588805, + "grad_norm": 5.7890625, + "learning_rate": 9.614200964034113e-06, + "loss": 3.0175, + "mean_token_accuracy": 0.39784572619874914, + "step": 2081 + }, + { + "epoch": 0.38598442714126807, + "grad_norm": 5.87890625, + "learning_rate": 9.614015572858732e-06, + "loss": 3.2212, + "mean_token_accuracy": 0.38601868067717454, + "step": 2082 + }, + { + "epoch": 0.38616981831664815, + "grad_norm": 5.25390625, + "learning_rate": 9.613830181683353e-06, + "loss": 2.4727, + "mean_token_accuracy": 0.4817673378076063, + "step": 2083 + }, + { + "epoch": 0.3863552094920282, + "grad_norm": 6.05859375, + "learning_rate": 9.613644790507971e-06, + "loss": 2.5637, + "mean_token_accuracy": 0.4649891981192019, + "step": 2084 + }, + { + "epoch": 0.38654060066740825, + "grad_norm": 6.51171875, + "learning_rate": 9.613459399332594e-06, + "loss": 2.94, + "mean_token_accuracy": 0.42481442205726405, + "step": 2085 + }, + { + "epoch": 0.3867259918427883, + "grad_norm": 6.11328125, + "learning_rate": 9.613274008157212e-06, + "loss": 3.0626, + "mean_token_accuracy": 0.41205965543841605, + "step": 2086 + }, + { + "epoch": 0.38691138301816835, + "grad_norm": 7.16796875, + "learning_rate": 9.613088616981833e-06, + "loss": 2.6915, + "mean_token_accuracy": 0.4504725236261813, + "step": 2087 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 6.8828125, + "learning_rate": 9.612903225806453e-06, + "loss": 2.4441, + "mean_token_accuracy": 0.4721268789205823, + "step": 2088 + }, + { + "epoch": 0.38728216536892845, + "grad_norm": 9.1796875, + "learning_rate": 9.612717834631072e-06, + "loss": 2.735, + "mean_token_accuracy": 0.4508590001547748, + "step": 2089 + }, + { + "epoch": 0.3874675565443085, + "grad_norm": 11.9453125, + "learning_rate": 9.612532443455693e-06, + "loss": 2.2656, + "mean_token_accuracy": 0.5066921606118547, + "step": 2090 + }, + { + "epoch": 0.38765294771968856, + "grad_norm": 9.65625, + "learning_rate": 9.612347052280311e-06, + "loss": 2.6518, + "mean_token_accuracy": 0.448006509357201, + "step": 2091 + }, + { + "epoch": 0.3878383388950686, + "grad_norm": 5.96875, + "learning_rate": 9.612161661104932e-06, + "loss": 2.9964, + "mean_token_accuracy": 0.39593674246796723, + "step": 2092 + }, + { + "epoch": 0.38802373007044866, + "grad_norm": 7.15625, + "learning_rate": 9.611976269929552e-06, + "loss": 2.7437, + "mean_token_accuracy": 0.45685146443514646, + "step": 2093 + }, + { + "epoch": 0.3882091212458287, + "grad_norm": 5.50390625, + "learning_rate": 9.611790878754173e-06, + "loss": 3.1872, + "mean_token_accuracy": 0.39611407082419303, + "step": 2094 + }, + { + "epoch": 0.38839451242120876, + "grad_norm": 6.734375, + "learning_rate": 9.611605487578792e-06, + "loss": 2.7417, + "mean_token_accuracy": 0.4251856082238721, + "step": 2095 + }, + { + "epoch": 0.3885799035965888, + "grad_norm": 7.10546875, + "learning_rate": 9.611420096403412e-06, + "loss": 2.5485, + "mean_token_accuracy": 0.45921203204185057, + "step": 2096 + }, + { + "epoch": 0.38876529477196886, + "grad_norm": 7.171875, + "learning_rate": 9.611234705228033e-06, + "loss": 2.637, + "mean_token_accuracy": 0.46487006737247355, + "step": 2097 + }, + { + "epoch": 0.3889506859473489, + "grad_norm": 5.96484375, + "learning_rate": 9.611049314052651e-06, + "loss": 2.7538, + "mean_token_accuracy": 0.44766657674669547, + "step": 2098 + }, + { + "epoch": 0.38913607712272896, + "grad_norm": 9.1953125, + "learning_rate": 9.610863922877272e-06, + "loss": 2.4073, + "mean_token_accuracy": 0.49343533546986457, + "step": 2099 + }, + { + "epoch": 0.389321468298109, + "grad_norm": 7.26953125, + "learning_rate": 9.61067853170189e-06, + "loss": 3.1512, + "mean_token_accuracy": 0.40235878336437, + "step": 2100 + }, + { + "epoch": 0.38950685947348906, + "grad_norm": 6.18359375, + "learning_rate": 9.610493140526513e-06, + "loss": 2.995, + "mean_token_accuracy": 0.41995542152877935, + "step": 2101 + }, + { + "epoch": 0.3896922506488691, + "grad_norm": 8.2578125, + "learning_rate": 9.610307749351132e-06, + "loss": 3.0167, + "mean_token_accuracy": 0.4134461134606971, + "step": 2102 + }, + { + "epoch": 0.38987764182424917, + "grad_norm": 7.35546875, + "learning_rate": 9.610122358175752e-06, + "loss": 3.1865, + "mean_token_accuracy": 0.4180364952495853, + "step": 2103 + }, + { + "epoch": 0.39006303299962924, + "grad_norm": 6.3828125, + "learning_rate": 9.609936967000371e-06, + "loss": 2.7165, + "mean_token_accuracy": 0.45424430641821945, + "step": 2104 + }, + { + "epoch": 0.39024842417500927, + "grad_norm": 6.5859375, + "learning_rate": 9.609751575824992e-06, + "loss": 2.8965, + "mean_token_accuracy": 0.44299552906110284, + "step": 2105 + }, + { + "epoch": 0.39043381535038935, + "grad_norm": 6.00390625, + "learning_rate": 9.609566184649612e-06, + "loss": 3.0302, + "mean_token_accuracy": 0.4079285822363444, + "step": 2106 + }, + { + "epoch": 0.39061920652576937, + "grad_norm": 10.0703125, + "learning_rate": 9.60938079347423e-06, + "loss": 3.2112, + "mean_token_accuracy": 0.38524853019775523, + "step": 2107 + }, + { + "epoch": 0.39080459770114945, + "grad_norm": 5.13671875, + "learning_rate": 9.609195402298851e-06, + "loss": 2.4987, + "mean_token_accuracy": 0.47512437810945274, + "step": 2108 + }, + { + "epoch": 0.39098998887652947, + "grad_norm": 7.76171875, + "learning_rate": 9.60901001112347e-06, + "loss": 2.7383, + "mean_token_accuracy": 0.44074074074074077, + "step": 2109 + }, + { + "epoch": 0.39117538005190955, + "grad_norm": 9.0625, + "learning_rate": 9.608824619948092e-06, + "loss": 2.9336, + "mean_token_accuracy": 0.4272752782714433, + "step": 2110 + }, + { + "epoch": 0.39136077122728957, + "grad_norm": 5.51171875, + "learning_rate": 9.608639228772711e-06, + "loss": 3.1542, + "mean_token_accuracy": 0.42014487754398067, + "step": 2111 + }, + { + "epoch": 0.39154616240266965, + "grad_norm": 5.9140625, + "learning_rate": 9.608453837597332e-06, + "loss": 2.8332, + "mean_token_accuracy": 0.42817153734184304, + "step": 2112 + }, + { + "epoch": 0.3917315535780497, + "grad_norm": 7.25390625, + "learning_rate": 9.60826844642195e-06, + "loss": 3.1417, + "mean_token_accuracy": 0.39766662529477476, + "step": 2113 + }, + { + "epoch": 0.39191694475342975, + "grad_norm": 5.25390625, + "learning_rate": 9.608083055246571e-06, + "loss": 2.6411, + "mean_token_accuracy": 0.47155025553662694, + "step": 2114 + }, + { + "epoch": 0.3921023359288098, + "grad_norm": 11.09375, + "learning_rate": 9.607897664071191e-06, + "loss": 3.0105, + "mean_token_accuracy": 0.42351854295895475, + "step": 2115 + }, + { + "epoch": 0.39228772710418985, + "grad_norm": 10.3203125, + "learning_rate": 9.60771227289581e-06, + "loss": 3.0379, + "mean_token_accuracy": 0.400325545767905, + "step": 2116 + }, + { + "epoch": 0.3924731182795699, + "grad_norm": 9.875, + "learning_rate": 9.60752688172043e-06, + "loss": 2.7898, + "mean_token_accuracy": 0.4152623976889745, + "step": 2117 + }, + { + "epoch": 0.39265850945494996, + "grad_norm": 6.390625, + "learning_rate": 9.607341490545051e-06, + "loss": 2.8519, + "mean_token_accuracy": 0.4348458406050029, + "step": 2118 + }, + { + "epoch": 0.39284390063033, + "grad_norm": 7.98046875, + "learning_rate": 9.607156099369672e-06, + "loss": 2.4599, + "mean_token_accuracy": 0.4906051191004587, + "step": 2119 + }, + { + "epoch": 0.39302929180571006, + "grad_norm": 14.265625, + "learning_rate": 9.60697070819429e-06, + "loss": 2.804, + "mean_token_accuracy": 0.439311098961181, + "step": 2120 + }, + { + "epoch": 0.3932146829810901, + "grad_norm": 10.46875, + "learning_rate": 9.606785317018911e-06, + "loss": 2.7183, + "mean_token_accuracy": 0.4366297243535095, + "step": 2121 + }, + { + "epoch": 0.39340007415647016, + "grad_norm": 7.3671875, + "learning_rate": 9.60659992584353e-06, + "loss": 2.9443, + "mean_token_accuracy": 0.4348396501457726, + "step": 2122 + }, + { + "epoch": 0.3935854653318502, + "grad_norm": 6.40625, + "learning_rate": 9.60641453466815e-06, + "loss": 2.7636, + "mean_token_accuracy": 0.4421972860125261, + "step": 2123 + }, + { + "epoch": 0.39377085650723026, + "grad_norm": 7.66015625, + "learning_rate": 9.60622914349277e-06, + "loss": 2.6197, + "mean_token_accuracy": 0.467395600052694, + "step": 2124 + }, + { + "epoch": 0.3939562476826103, + "grad_norm": 10.09375, + "learning_rate": 9.60604375231739e-06, + "loss": 2.3695, + "mean_token_accuracy": 0.48359945537814086, + "step": 2125 + }, + { + "epoch": 0.39414163885799036, + "grad_norm": 7.42578125, + "learning_rate": 9.605858361142012e-06, + "loss": 3.2091, + "mean_token_accuracy": 0.39211837535859884, + "step": 2126 + }, + { + "epoch": 0.3943270300333704, + "grad_norm": 6.9453125, + "learning_rate": 9.60567296996663e-06, + "loss": 2.318, + "mean_token_accuracy": 0.5069799906933458, + "step": 2127 + }, + { + "epoch": 0.39451242120875046, + "grad_norm": 6.08984375, + "learning_rate": 9.605487578791251e-06, + "loss": 2.9046, + "mean_token_accuracy": 0.4173882311362431, + "step": 2128 + }, + { + "epoch": 0.39469781238413054, + "grad_norm": 6.1875, + "learning_rate": 9.60530218761587e-06, + "loss": 2.9756, + "mean_token_accuracy": 0.4227377560710894, + "step": 2129 + }, + { + "epoch": 0.39488320355951056, + "grad_norm": 7.2890625, + "learning_rate": 9.60511679644049e-06, + "loss": 2.7194, + "mean_token_accuracy": 0.4433718558803535, + "step": 2130 + }, + { + "epoch": 0.39506859473489064, + "grad_norm": 6.6484375, + "learning_rate": 9.60493140526511e-06, + "loss": 3.232, + "mean_token_accuracy": 0.39049103663289164, + "step": 2131 + }, + { + "epoch": 0.39525398591027067, + "grad_norm": 6.50390625, + "learning_rate": 9.60474601408973e-06, + "loss": 2.607, + "mean_token_accuracy": 0.4528549551520455, + "step": 2132 + }, + { + "epoch": 0.39543937708565075, + "grad_norm": 7.03515625, + "learning_rate": 9.60456062291435e-06, + "loss": 2.6888, + "mean_token_accuracy": 0.43695872230345295, + "step": 2133 + }, + { + "epoch": 0.39562476826103077, + "grad_norm": 7.9140625, + "learning_rate": 9.60437523173897e-06, + "loss": 2.9753, + "mean_token_accuracy": 0.41482404235440673, + "step": 2134 + }, + { + "epoch": 0.39581015943641085, + "grad_norm": 6.6875, + "learning_rate": 9.604189840563591e-06, + "loss": 2.8712, + "mean_token_accuracy": 0.43239524702939336, + "step": 2135 + }, + { + "epoch": 0.39599555061179087, + "grad_norm": 5.7890625, + "learning_rate": 9.60400444938821e-06, + "loss": 2.8143, + "mean_token_accuracy": 0.44885033732617374, + "step": 2136 + }, + { + "epoch": 0.39618094178717095, + "grad_norm": 7.94921875, + "learning_rate": 9.60381905821283e-06, + "loss": 2.8667, + "mean_token_accuracy": 0.42611118146131444, + "step": 2137 + }, + { + "epoch": 0.39636633296255097, + "grad_norm": 9.1640625, + "learning_rate": 9.603633667037449e-06, + "loss": 2.9556, + "mean_token_accuracy": 0.4074408343361412, + "step": 2138 + }, + { + "epoch": 0.39655172413793105, + "grad_norm": 6.80859375, + "learning_rate": 9.60344827586207e-06, + "loss": 2.8473, + "mean_token_accuracy": 0.43704873646209386, + "step": 2139 + }, + { + "epoch": 0.3967371153133111, + "grad_norm": 5.84375, + "learning_rate": 9.60326288468669e-06, + "loss": 3.104, + "mean_token_accuracy": 0.4044405197426517, + "step": 2140 + }, + { + "epoch": 0.39692250648869115, + "grad_norm": 7.203125, + "learning_rate": 9.603077493511309e-06, + "loss": 2.7151, + "mean_token_accuracy": 0.4464461185718965, + "step": 2141 + }, + { + "epoch": 0.3971078976640712, + "grad_norm": 6.8203125, + "learning_rate": 9.60289210233593e-06, + "loss": 3.3042, + "mean_token_accuracy": 0.403771491957848, + "step": 2142 + }, + { + "epoch": 0.39729328883945125, + "grad_norm": 7.71484375, + "learning_rate": 9.60270671116055e-06, + "loss": 2.7911, + "mean_token_accuracy": 0.44799419132328916, + "step": 2143 + }, + { + "epoch": 0.3974786800148313, + "grad_norm": 6.07421875, + "learning_rate": 9.60252131998517e-06, + "loss": 3.0516, + "mean_token_accuracy": 0.4171237777247794, + "step": 2144 + }, + { + "epoch": 0.39766407119021135, + "grad_norm": 7.16796875, + "learning_rate": 9.60233592880979e-06, + "loss": 3.2054, + "mean_token_accuracy": 0.39799222797927464, + "step": 2145 + }, + { + "epoch": 0.3978494623655914, + "grad_norm": 10.6875, + "learning_rate": 9.60215053763441e-06, + "loss": 2.9818, + "mean_token_accuracy": 0.4114414541355502, + "step": 2146 + }, + { + "epoch": 0.39803485354097146, + "grad_norm": 5.796875, + "learning_rate": 9.601965146459028e-06, + "loss": 2.4614, + "mean_token_accuracy": 0.4758304412493803, + "step": 2147 + }, + { + "epoch": 0.3982202447163515, + "grad_norm": 8.21875, + "learning_rate": 9.601779755283649e-06, + "loss": 2.9336, + "mean_token_accuracy": 0.4114009953249887, + "step": 2148 + }, + { + "epoch": 0.39840563589173156, + "grad_norm": 9.296875, + "learning_rate": 9.60159436410827e-06, + "loss": 2.7118, + "mean_token_accuracy": 0.4494267885647947, + "step": 2149 + }, + { + "epoch": 0.3985910270671116, + "grad_norm": 5.3203125, + "learning_rate": 9.60140897293289e-06, + "loss": 3.2308, + "mean_token_accuracy": 0.39852803006576887, + "step": 2150 + }, + { + "epoch": 0.39877641824249166, + "grad_norm": 7.66796875, + "learning_rate": 9.601223581757509e-06, + "loss": 3.5244, + "mean_token_accuracy": 0.38055772230889234, + "step": 2151 + }, + { + "epoch": 0.3989618094178717, + "grad_norm": 5.3046875, + "learning_rate": 9.60103819058213e-06, + "loss": 2.3049, + "mean_token_accuracy": 0.49959094627761114, + "step": 2152 + }, + { + "epoch": 0.39914720059325176, + "grad_norm": 6.0859375, + "learning_rate": 9.60085279940675e-06, + "loss": 2.7274, + "mean_token_accuracy": 0.4641638225255973, + "step": 2153 + }, + { + "epoch": 0.39933259176863184, + "grad_norm": 13.671875, + "learning_rate": 9.600667408231369e-06, + "loss": 2.176, + "mean_token_accuracy": 0.5069835824552805, + "step": 2154 + }, + { + "epoch": 0.39951798294401186, + "grad_norm": 12.1640625, + "learning_rate": 9.600482017055989e-06, + "loss": 2.4325, + "mean_token_accuracy": 0.4857462965268268, + "step": 2155 + }, + { + "epoch": 0.39970337411939194, + "grad_norm": 6.87109375, + "learning_rate": 9.600296625880608e-06, + "loss": 2.8785, + "mean_token_accuracy": 0.42439628482972136, + "step": 2156 + }, + { + "epoch": 0.39988876529477196, + "grad_norm": 7.71484375, + "learning_rate": 9.600111234705228e-06, + "loss": 2.919, + "mean_token_accuracy": 0.40865543442352165, + "step": 2157 + }, + { + "epoch": 0.40007415647015204, + "grad_norm": 6.7890625, + "learning_rate": 9.599925843529849e-06, + "loss": 2.5982, + "mean_token_accuracy": 0.460278276481149, + "step": 2158 + }, + { + "epoch": 0.40025954764553207, + "grad_norm": 6.23046875, + "learning_rate": 9.59974045235447e-06, + "loss": 3.1905, + "mean_token_accuracy": 0.40333660451422965, + "step": 2159 + }, + { + "epoch": 0.40044493882091214, + "grad_norm": 7.12890625, + "learning_rate": 9.599555061179088e-06, + "loss": 3.0669, + "mean_token_accuracy": 0.42526118403428875, + "step": 2160 + }, + { + "epoch": 0.40063032999629217, + "grad_norm": 9.078125, + "learning_rate": 9.599369670003709e-06, + "loss": 2.6433, + "mean_token_accuracy": 0.45611033892868597, + "step": 2161 + }, + { + "epoch": 0.40081572117167225, + "grad_norm": 5.37109375, + "learning_rate": 9.599184278828329e-06, + "loss": 3.0843, + "mean_token_accuracy": 0.4155622060709705, + "step": 2162 + }, + { + "epoch": 0.40100111234705227, + "grad_norm": 7.4921875, + "learning_rate": 9.598998887652948e-06, + "loss": 2.3621, + "mean_token_accuracy": 0.5131480890024487, + "step": 2163 + }, + { + "epoch": 0.40118650352243235, + "grad_norm": 8.5, + "learning_rate": 9.598813496477568e-06, + "loss": 2.8468, + "mean_token_accuracy": 0.4180476047002109, + "step": 2164 + }, + { + "epoch": 0.40137189469781237, + "grad_norm": 5.24609375, + "learning_rate": 9.598628105302187e-06, + "loss": 2.9088, + "mean_token_accuracy": 0.421875, + "step": 2165 + }, + { + "epoch": 0.40155728587319245, + "grad_norm": 7.4765625, + "learning_rate": 9.59844271412681e-06, + "loss": 3.1, + "mean_token_accuracy": 0.4135415146651102, + "step": 2166 + }, + { + "epoch": 0.40174267704857247, + "grad_norm": 6.86328125, + "learning_rate": 9.598257322951428e-06, + "loss": 3.2294, + "mean_token_accuracy": 0.3907189916929247, + "step": 2167 + }, + { + "epoch": 0.40192806822395255, + "grad_norm": 11.4375, + "learning_rate": 9.598071931776049e-06, + "loss": 2.7301, + "mean_token_accuracy": 0.44564518204039627, + "step": 2168 + }, + { + "epoch": 0.4021134593993326, + "grad_norm": 7.42578125, + "learning_rate": 9.597886540600669e-06, + "loss": 2.537, + "mean_token_accuracy": 0.46511627906976744, + "step": 2169 + }, + { + "epoch": 0.40229885057471265, + "grad_norm": 7.734375, + "learning_rate": 9.597701149425288e-06, + "loss": 2.9266, + "mean_token_accuracy": 0.43805704099821746, + "step": 2170 + }, + { + "epoch": 0.4024842417500927, + "grad_norm": 6.42578125, + "learning_rate": 9.597515758249908e-06, + "loss": 3.4523, + "mean_token_accuracy": 0.35805357351737044, + "step": 2171 + }, + { + "epoch": 0.40266963292547275, + "grad_norm": 5.9765625, + "learning_rate": 9.597330367074527e-06, + "loss": 2.7003, + "mean_token_accuracy": 0.452103467879477, + "step": 2172 + }, + { + "epoch": 0.4028550241008528, + "grad_norm": 8.0625, + "learning_rate": 9.597144975899148e-06, + "loss": 2.5305, + "mean_token_accuracy": 0.4557803877175605, + "step": 2173 + }, + { + "epoch": 0.40304041527623286, + "grad_norm": 6.46484375, + "learning_rate": 9.596959584723768e-06, + "loss": 2.6961, + "mean_token_accuracy": 0.47148495984755684, + "step": 2174 + }, + { + "epoch": 0.4032258064516129, + "grad_norm": 5.25390625, + "learning_rate": 9.596774193548389e-06, + "loss": 2.7688, + "mean_token_accuracy": 0.4398087787918296, + "step": 2175 + }, + { + "epoch": 0.40341119762699296, + "grad_norm": 8.953125, + "learning_rate": 9.596588802373007e-06, + "loss": 2.7808, + "mean_token_accuracy": 0.44367781610919454, + "step": 2176 + }, + { + "epoch": 0.403596588802373, + "grad_norm": 5.23828125, + "learning_rate": 9.596403411197628e-06, + "loss": 2.7649, + "mean_token_accuracy": 0.4540554414784394, + "step": 2177 + }, + { + "epoch": 0.40378197997775306, + "grad_norm": 7.40625, + "learning_rate": 9.596218020022248e-06, + "loss": 2.4962, + "mean_token_accuracy": 0.4739084132055378, + "step": 2178 + }, + { + "epoch": 0.40396737115313314, + "grad_norm": 6.1015625, + "learning_rate": 9.596032628846867e-06, + "loss": 3.4253, + "mean_token_accuracy": 0.37210464922512915, + "step": 2179 + }, + { + "epoch": 0.40415276232851316, + "grad_norm": 7.3125, + "learning_rate": 9.595847237671488e-06, + "loss": 3.0844, + "mean_token_accuracy": 0.42624969219404085, + "step": 2180 + }, + { + "epoch": 0.40433815350389324, + "grad_norm": 5.7890625, + "learning_rate": 9.595661846496107e-06, + "loss": 2.9273, + "mean_token_accuracy": 0.4272886552781428, + "step": 2181 + }, + { + "epoch": 0.40452354467927326, + "grad_norm": 8.8125, + "learning_rate": 9.595476455320729e-06, + "loss": 2.5441, + "mean_token_accuracy": 0.4539132484677039, + "step": 2182 + }, + { + "epoch": 0.40470893585465334, + "grad_norm": 6.84765625, + "learning_rate": 9.595291064145348e-06, + "loss": 2.5789, + "mean_token_accuracy": 0.45889023896314296, + "step": 2183 + }, + { + "epoch": 0.40489432703003336, + "grad_norm": 7.02734375, + "learning_rate": 9.595105672969968e-06, + "loss": 2.5914, + "mean_token_accuracy": 0.4703813903164728, + "step": 2184 + }, + { + "epoch": 0.40507971820541344, + "grad_norm": 5.83984375, + "learning_rate": 9.594920281794587e-06, + "loss": 3.1223, + "mean_token_accuracy": 0.3999721176634602, + "step": 2185 + }, + { + "epoch": 0.40526510938079346, + "grad_norm": 7.41796875, + "learning_rate": 9.594734890619207e-06, + "loss": 2.6828, + "mean_token_accuracy": 0.4577973015374961, + "step": 2186 + }, + { + "epoch": 0.40545050055617354, + "grad_norm": 10.6796875, + "learning_rate": 9.594549499443828e-06, + "loss": 3.0733, + "mean_token_accuracy": 0.4001912829621533, + "step": 2187 + }, + { + "epoch": 0.40563589173155357, + "grad_norm": 6.8359375, + "learning_rate": 9.594364108268447e-06, + "loss": 2.7907, + "mean_token_accuracy": 0.4379369476322385, + "step": 2188 + }, + { + "epoch": 0.40582128290693364, + "grad_norm": 7.05078125, + "learning_rate": 9.594178717093067e-06, + "loss": 2.5177, + "mean_token_accuracy": 0.4611839262566614, + "step": 2189 + }, + { + "epoch": 0.40600667408231367, + "grad_norm": 11.4375, + "learning_rate": 9.593993325917688e-06, + "loss": 2.8458, + "mean_token_accuracy": 0.4365539858728557, + "step": 2190 + }, + { + "epoch": 0.40619206525769375, + "grad_norm": 9.5703125, + "learning_rate": 9.593807934742308e-06, + "loss": 3.1345, + "mean_token_accuracy": 0.40004364906154516, + "step": 2191 + }, + { + "epoch": 0.40637745643307377, + "grad_norm": 6.9921875, + "learning_rate": 9.593622543566927e-06, + "loss": 2.7532, + "mean_token_accuracy": 0.437296858071506, + "step": 2192 + }, + { + "epoch": 0.40656284760845385, + "grad_norm": 5.94921875, + "learning_rate": 9.593437152391547e-06, + "loss": 2.9164, + "mean_token_accuracy": 0.4266436979615274, + "step": 2193 + }, + { + "epoch": 0.40674823878383387, + "grad_norm": 8.4296875, + "learning_rate": 9.593251761216166e-06, + "loss": 2.961, + "mean_token_accuracy": 0.40921889665241423, + "step": 2194 + }, + { + "epoch": 0.40693362995921395, + "grad_norm": 7.375, + "learning_rate": 9.593066370040787e-06, + "loss": 3.1608, + "mean_token_accuracy": 0.3966967583093968, + "step": 2195 + }, + { + "epoch": 0.407119021134594, + "grad_norm": 8.5625, + "learning_rate": 9.592880978865407e-06, + "loss": 2.8243, + "mean_token_accuracy": 0.4401072011186204, + "step": 2196 + }, + { + "epoch": 0.40730441230997405, + "grad_norm": 5.88671875, + "learning_rate": 9.592695587690026e-06, + "loss": 3.4901, + "mean_token_accuracy": 0.37498164733519307, + "step": 2197 + }, + { + "epoch": 0.4074898034853541, + "grad_norm": 7.1171875, + "learning_rate": 9.592510196514646e-06, + "loss": 2.6724, + "mean_token_accuracy": 0.45298109549200194, + "step": 2198 + }, + { + "epoch": 0.40767519466073415, + "grad_norm": 8.03125, + "learning_rate": 9.592324805339267e-06, + "loss": 2.5993, + "mean_token_accuracy": 0.4460420765854397, + "step": 2199 + }, + { + "epoch": 0.4078605858361142, + "grad_norm": 6.4921875, + "learning_rate": 9.592139414163887e-06, + "loss": 2.5064, + "mean_token_accuracy": 0.49355747936380107, + "step": 2200 + }, + { + "epoch": 0.40804597701149425, + "grad_norm": 6.87109375, + "learning_rate": 9.591954022988506e-06, + "loss": 2.891, + "mean_token_accuracy": 0.4084868619752341, + "step": 2201 + }, + { + "epoch": 0.4082313681868743, + "grad_norm": 7.6640625, + "learning_rate": 9.591768631813127e-06, + "loss": 2.9136, + "mean_token_accuracy": 0.42668661588683354, + "step": 2202 + }, + { + "epoch": 0.40841675936225436, + "grad_norm": 7.66015625, + "learning_rate": 9.591583240637745e-06, + "loss": 3.4947, + "mean_token_accuracy": 0.3673661555017487, + "step": 2203 + }, + { + "epoch": 0.40860215053763443, + "grad_norm": 6.83984375, + "learning_rate": 9.591397849462366e-06, + "loss": 3.3369, + "mean_token_accuracy": 0.4024370691037358, + "step": 2204 + }, + { + "epoch": 0.40878754171301446, + "grad_norm": 6.0546875, + "learning_rate": 9.591212458286986e-06, + "loss": 2.8524, + "mean_token_accuracy": 0.4304034197168047, + "step": 2205 + }, + { + "epoch": 0.40897293288839454, + "grad_norm": 6.95703125, + "learning_rate": 9.591027067111607e-06, + "loss": 3.0428, + "mean_token_accuracy": 0.4013611872577748, + "step": 2206 + }, + { + "epoch": 0.40915832406377456, + "grad_norm": 6.42578125, + "learning_rate": 9.590841675936227e-06, + "loss": 2.4655, + "mean_token_accuracy": 0.46412838263058526, + "step": 2207 + }, + { + "epoch": 0.40934371523915464, + "grad_norm": 6.75390625, + "learning_rate": 9.590656284760846e-06, + "loss": 2.2468, + "mean_token_accuracy": 0.5171443193449334, + "step": 2208 + }, + { + "epoch": 0.40952910641453466, + "grad_norm": 6.0859375, + "learning_rate": 9.590470893585467e-06, + "loss": 3.1099, + "mean_token_accuracy": 0.4083229391087425, + "step": 2209 + }, + { + "epoch": 0.40971449758991474, + "grad_norm": 7.49609375, + "learning_rate": 9.590285502410086e-06, + "loss": 2.7473, + "mean_token_accuracy": 0.44744659782147406, + "step": 2210 + }, + { + "epoch": 0.40989988876529476, + "grad_norm": 6.5546875, + "learning_rate": 9.590100111234706e-06, + "loss": 2.9004, + "mean_token_accuracy": 0.42825878812747914, + "step": 2211 + }, + { + "epoch": 0.41008527994067484, + "grad_norm": 6.71484375, + "learning_rate": 9.589914720059327e-06, + "loss": 2.6701, + "mean_token_accuracy": 0.4524519747046892, + "step": 2212 + }, + { + "epoch": 0.41027067111605486, + "grad_norm": 6.25390625, + "learning_rate": 9.589729328883945e-06, + "loss": 2.6886, + "mean_token_accuracy": 0.4447219666097016, + "step": 2213 + }, + { + "epoch": 0.41045606229143494, + "grad_norm": 5.359375, + "learning_rate": 9.589543937708566e-06, + "loss": 3.2223, + "mean_token_accuracy": 0.39273674532638675, + "step": 2214 + }, + { + "epoch": 0.41064145346681497, + "grad_norm": 8.75, + "learning_rate": 9.589358546533186e-06, + "loss": 2.7485, + "mean_token_accuracy": 0.43857634902411025, + "step": 2215 + }, + { + "epoch": 0.41082684464219504, + "grad_norm": 6.03515625, + "learning_rate": 9.589173155357807e-06, + "loss": 3.0011, + "mean_token_accuracy": 0.4093845630737844, + "step": 2216 + }, + { + "epoch": 0.41101223581757507, + "grad_norm": 5.87109375, + "learning_rate": 9.588987764182426e-06, + "loss": 2.8778, + "mean_token_accuracy": 0.4264786870800749, + "step": 2217 + }, + { + "epoch": 0.41119762699295515, + "grad_norm": 5.26171875, + "learning_rate": 9.588802373007046e-06, + "loss": 2.7295, + "mean_token_accuracy": 0.44439317028149516, + "step": 2218 + }, + { + "epoch": 0.41138301816833517, + "grad_norm": 5.40625, + "learning_rate": 9.588616981831665e-06, + "loss": 2.6191, + "mean_token_accuracy": 0.47444470981609743, + "step": 2219 + }, + { + "epoch": 0.41156840934371525, + "grad_norm": 4.84375, + "learning_rate": 9.588431590656285e-06, + "loss": 2.5209, + "mean_token_accuracy": 0.48569458807307825, + "step": 2220 + }, + { + "epoch": 0.41175380051909527, + "grad_norm": 5.85546875, + "learning_rate": 9.588246199480906e-06, + "loss": 2.6633, + "mean_token_accuracy": 0.4565012773967998, + "step": 2221 + }, + { + "epoch": 0.41193919169447535, + "grad_norm": 5.6171875, + "learning_rate": 9.588060808305526e-06, + "loss": 2.7262, + "mean_token_accuracy": 0.44534466728649225, + "step": 2222 + }, + { + "epoch": 0.41212458286985537, + "grad_norm": 6.2578125, + "learning_rate": 9.587875417130145e-06, + "loss": 2.4175, + "mean_token_accuracy": 0.473217166828009, + "step": 2223 + }, + { + "epoch": 0.41230997404523545, + "grad_norm": 6.3515625, + "learning_rate": 9.587690025954766e-06, + "loss": 2.209, + "mean_token_accuracy": 0.5235075442816531, + "step": 2224 + }, + { + "epoch": 0.4124953652206155, + "grad_norm": 5.91796875, + "learning_rate": 9.587504634779386e-06, + "loss": 3.1116, + "mean_token_accuracy": 0.40691489361702127, + "step": 2225 + }, + { + "epoch": 0.41268075639599555, + "grad_norm": 6.20703125, + "learning_rate": 9.587319243604005e-06, + "loss": 2.4577, + "mean_token_accuracy": 0.4869445716903344, + "step": 2226 + }, + { + "epoch": 0.41286614757137563, + "grad_norm": 6.37109375, + "learning_rate": 9.587133852428625e-06, + "loss": 3.2775, + "mean_token_accuracy": 0.3961429799778897, + "step": 2227 + }, + { + "epoch": 0.41305153874675565, + "grad_norm": 6.19140625, + "learning_rate": 9.586948461253244e-06, + "loss": 2.9515, + "mean_token_accuracy": 0.4275176877916604, + "step": 2228 + }, + { + "epoch": 0.41323692992213573, + "grad_norm": 5.85546875, + "learning_rate": 9.586763070077865e-06, + "loss": 2.2488, + "mean_token_accuracy": 0.5189208128941836, + "step": 2229 + }, + { + "epoch": 0.41342232109751575, + "grad_norm": 5.3984375, + "learning_rate": 9.586577678902485e-06, + "loss": 3.2799, + "mean_token_accuracy": 0.4077181208053691, + "step": 2230 + }, + { + "epoch": 0.41360771227289583, + "grad_norm": 5.49609375, + "learning_rate": 9.586392287727106e-06, + "loss": 2.805, + "mean_token_accuracy": 0.44299973732597847, + "step": 2231 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 7.0078125, + "learning_rate": 9.586206896551724e-06, + "loss": 2.9177, + "mean_token_accuracy": 0.42305061559507523, + "step": 2232 + }, + { + "epoch": 0.41397849462365593, + "grad_norm": 6.40234375, + "learning_rate": 9.586021505376345e-06, + "loss": 2.9239, + "mean_token_accuracy": 0.4374907966426152, + "step": 2233 + }, + { + "epoch": 0.41416388579903596, + "grad_norm": 5.00390625, + "learning_rate": 9.585836114200965e-06, + "loss": 3.1282, + "mean_token_accuracy": 0.4031123139377537, + "step": 2234 + }, + { + "epoch": 0.41434927697441604, + "grad_norm": 5.84765625, + "learning_rate": 9.585650723025584e-06, + "loss": 2.6302, + "mean_token_accuracy": 0.4598634196624146, + "step": 2235 + }, + { + "epoch": 0.41453466814979606, + "grad_norm": 5.20703125, + "learning_rate": 9.585465331850205e-06, + "loss": 3.2291, + "mean_token_accuracy": 0.41088498304039467, + "step": 2236 + }, + { + "epoch": 0.41472005932517614, + "grad_norm": 6.34765625, + "learning_rate": 9.585279940674824e-06, + "loss": 2.522, + "mean_token_accuracy": 0.4711177794448612, + "step": 2237 + }, + { + "epoch": 0.41490545050055616, + "grad_norm": 6.484375, + "learning_rate": 9.585094549499444e-06, + "loss": 3.0161, + "mean_token_accuracy": 0.4154883901932806, + "step": 2238 + }, + { + "epoch": 0.41509084167593624, + "grad_norm": 5.9453125, + "learning_rate": 9.584909158324065e-06, + "loss": 3.1629, + "mean_token_accuracy": 0.41485998193315266, + "step": 2239 + }, + { + "epoch": 0.41527623285131626, + "grad_norm": 5.953125, + "learning_rate": 9.584723767148685e-06, + "loss": 2.2707, + "mean_token_accuracy": 0.5178206251825884, + "step": 2240 + }, + { + "epoch": 0.41546162402669634, + "grad_norm": 6.41796875, + "learning_rate": 9.584538375973304e-06, + "loss": 2.8583, + "mean_token_accuracy": 0.4417053364269142, + "step": 2241 + }, + { + "epoch": 0.41564701520207636, + "grad_norm": 6.52734375, + "learning_rate": 9.584352984797924e-06, + "loss": 2.8294, + "mean_token_accuracy": 0.4526962457337884, + "step": 2242 + }, + { + "epoch": 0.41583240637745644, + "grad_norm": 6.11328125, + "learning_rate": 9.584167593622545e-06, + "loss": 2.9048, + "mean_token_accuracy": 0.4266694403994175, + "step": 2243 + }, + { + "epoch": 0.41601779755283647, + "grad_norm": 6.24609375, + "learning_rate": 9.583982202447164e-06, + "loss": 2.4896, + "mean_token_accuracy": 0.48919753086419754, + "step": 2244 + }, + { + "epoch": 0.41620318872821654, + "grad_norm": 7.5859375, + "learning_rate": 9.583796811271784e-06, + "loss": 2.288, + "mean_token_accuracy": 0.51, + "step": 2245 + }, + { + "epoch": 0.41638857990359657, + "grad_norm": 6.41015625, + "learning_rate": 9.583611420096403e-06, + "loss": 2.6851, + "mean_token_accuracy": 0.4567398119122257, + "step": 2246 + }, + { + "epoch": 0.41657397107897665, + "grad_norm": 6.890625, + "learning_rate": 9.583426028921025e-06, + "loss": 2.6854, + "mean_token_accuracy": 0.44308614923307577, + "step": 2247 + }, + { + "epoch": 0.41675936225435667, + "grad_norm": 5.95703125, + "learning_rate": 9.583240637745644e-06, + "loss": 2.4149, + "mean_token_accuracy": 0.4935596302470071, + "step": 2248 + }, + { + "epoch": 0.41694475342973675, + "grad_norm": 5.53515625, + "learning_rate": 9.583055246570264e-06, + "loss": 2.9091, + "mean_token_accuracy": 0.4063740368900304, + "step": 2249 + }, + { + "epoch": 0.41713014460511677, + "grad_norm": 9.59375, + "learning_rate": 9.582869855394885e-06, + "loss": 2.9351, + "mean_token_accuracy": 0.41020106781778937, + "step": 2250 + }, + { + "epoch": 0.41731553578049685, + "grad_norm": 7.80859375, + "learning_rate": 9.582684464219504e-06, + "loss": 3.2058, + "mean_token_accuracy": 0.41066003866335266, + "step": 2251 + }, + { + "epoch": 0.4175009269558769, + "grad_norm": 6.73828125, + "learning_rate": 9.582499073044124e-06, + "loss": 2.7334, + "mean_token_accuracy": 0.4456549935149157, + "step": 2252 + }, + { + "epoch": 0.41768631813125695, + "grad_norm": 8.125, + "learning_rate": 9.582313681868743e-06, + "loss": 2.6819, + "mean_token_accuracy": 0.4686708131766874, + "step": 2253 + }, + { + "epoch": 0.41787170930663703, + "grad_norm": 8.140625, + "learning_rate": 9.582128290693363e-06, + "loss": 3.0016, + "mean_token_accuracy": 0.4097625968992248, + "step": 2254 + }, + { + "epoch": 0.41805710048201705, + "grad_norm": 7.0390625, + "learning_rate": 9.581942899517984e-06, + "loss": 3.5263, + "mean_token_accuracy": 0.371763423276097, + "step": 2255 + }, + { + "epoch": 0.41824249165739713, + "grad_norm": 6.765625, + "learning_rate": 9.581757508342604e-06, + "loss": 3.1005, + "mean_token_accuracy": 0.39565943238731216, + "step": 2256 + }, + { + "epoch": 0.41842788283277715, + "grad_norm": 6.11328125, + "learning_rate": 9.581572117167223e-06, + "loss": 2.8462, + "mean_token_accuracy": 0.4264867237217468, + "step": 2257 + }, + { + "epoch": 0.41861327400815723, + "grad_norm": 6.43359375, + "learning_rate": 9.581386725991844e-06, + "loss": 3.4261, + "mean_token_accuracy": 0.3723460721868365, + "step": 2258 + }, + { + "epoch": 0.41879866518353726, + "grad_norm": 5.55078125, + "learning_rate": 9.581201334816464e-06, + "loss": 2.8604, + "mean_token_accuracy": 0.45174699471969443, + "step": 2259 + }, + { + "epoch": 0.41898405635891733, + "grad_norm": 5.453125, + "learning_rate": 9.581015943641083e-06, + "loss": 2.8714, + "mean_token_accuracy": 0.4332957534761368, + "step": 2260 + }, + { + "epoch": 0.41916944753429736, + "grad_norm": 5.42578125, + "learning_rate": 9.580830552465703e-06, + "loss": 2.8619, + "mean_token_accuracy": 0.4447099429178282, + "step": 2261 + }, + { + "epoch": 0.41935483870967744, + "grad_norm": 5.90234375, + "learning_rate": 9.580645161290322e-06, + "loss": 2.4376, + "mean_token_accuracy": 0.4953051643192488, + "step": 2262 + }, + { + "epoch": 0.41954022988505746, + "grad_norm": 6.1484375, + "learning_rate": 9.580459770114944e-06, + "loss": 2.6206, + "mean_token_accuracy": 0.45974770642201834, + "step": 2263 + }, + { + "epoch": 0.41972562106043754, + "grad_norm": 6.81640625, + "learning_rate": 9.580274378939563e-06, + "loss": 2.6863, + "mean_token_accuracy": 0.4523254005288536, + "step": 2264 + }, + { + "epoch": 0.41991101223581756, + "grad_norm": 11.6953125, + "learning_rate": 9.580088987764184e-06, + "loss": 2.9287, + "mean_token_accuracy": 0.4245207667731629, + "step": 2265 + }, + { + "epoch": 0.42009640341119764, + "grad_norm": 6.78125, + "learning_rate": 9.579903596588803e-06, + "loss": 2.9945, + "mean_token_accuracy": 0.417077570655442, + "step": 2266 + }, + { + "epoch": 0.42028179458657766, + "grad_norm": 7.203125, + "learning_rate": 9.579718205413423e-06, + "loss": 2.9123, + "mean_token_accuracy": 0.42065454545454545, + "step": 2267 + }, + { + "epoch": 0.42046718576195774, + "grad_norm": 6.73828125, + "learning_rate": 9.579532814238044e-06, + "loss": 2.767, + "mean_token_accuracy": 0.4510226049515608, + "step": 2268 + }, + { + "epoch": 0.42065257693733776, + "grad_norm": 7.34375, + "learning_rate": 9.579347423062662e-06, + "loss": 2.5501, + "mean_token_accuracy": 0.46499921346547113, + "step": 2269 + }, + { + "epoch": 0.42083796811271784, + "grad_norm": 5.9921875, + "learning_rate": 9.579162031887283e-06, + "loss": 2.6606, + "mean_token_accuracy": 0.44438164141985076, + "step": 2270 + }, + { + "epoch": 0.42102335928809786, + "grad_norm": 5.4375, + "learning_rate": 9.578976640711903e-06, + "loss": 2.7844, + "mean_token_accuracy": 0.44346289752650175, + "step": 2271 + }, + { + "epoch": 0.42120875046347794, + "grad_norm": 7.71484375, + "learning_rate": 9.578791249536524e-06, + "loss": 2.4964, + "mean_token_accuracy": 0.46576007770762506, + "step": 2272 + }, + { + "epoch": 0.42139414163885797, + "grad_norm": 8.7421875, + "learning_rate": 9.578605858361143e-06, + "loss": 2.7703, + "mean_token_accuracy": 0.4587604478788835, + "step": 2273 + }, + { + "epoch": 0.42157953281423804, + "grad_norm": 5.65625, + "learning_rate": 9.578420467185763e-06, + "loss": 2.9912, + "mean_token_accuracy": 0.4190771349862259, + "step": 2274 + }, + { + "epoch": 0.42176492398961807, + "grad_norm": 6.38671875, + "learning_rate": 9.578235076010382e-06, + "loss": 3.0331, + "mean_token_accuracy": 0.41849039749888345, + "step": 2275 + }, + { + "epoch": 0.42195031516499815, + "grad_norm": 5.7578125, + "learning_rate": 9.578049684835002e-06, + "loss": 3.2431, + "mean_token_accuracy": 0.39229741222208725, + "step": 2276 + }, + { + "epoch": 0.4221357063403782, + "grad_norm": 6.25390625, + "learning_rate": 9.577864293659623e-06, + "loss": 2.6444, + "mean_token_accuracy": 0.4572508842541773, + "step": 2277 + }, + { + "epoch": 0.42232109751575825, + "grad_norm": 7.94921875, + "learning_rate": 9.577678902484242e-06, + "loss": 3.124, + "mean_token_accuracy": 0.41318891366677285, + "step": 2278 + }, + { + "epoch": 0.4225064886911383, + "grad_norm": 5.48046875, + "learning_rate": 9.577493511308862e-06, + "loss": 2.6967, + "mean_token_accuracy": 0.4566266721831472, + "step": 2279 + }, + { + "epoch": 0.42269187986651835, + "grad_norm": 9.8359375, + "learning_rate": 9.577308120133483e-06, + "loss": 3.0674, + "mean_token_accuracy": 0.40586592178770947, + "step": 2280 + }, + { + "epoch": 0.42287727104189843, + "grad_norm": 8.9375, + "learning_rate": 9.577122728958103e-06, + "loss": 3.1422, + "mean_token_accuracy": 0.4288283303361461, + "step": 2281 + }, + { + "epoch": 0.42306266221727845, + "grad_norm": 5.9765625, + "learning_rate": 9.576937337782722e-06, + "loss": 2.8583, + "mean_token_accuracy": 0.4208240652473255, + "step": 2282 + }, + { + "epoch": 0.42324805339265853, + "grad_norm": 6.3828125, + "learning_rate": 9.576751946607342e-06, + "loss": 2.57, + "mean_token_accuracy": 0.4715649104458099, + "step": 2283 + }, + { + "epoch": 0.42343344456803855, + "grad_norm": 8.6796875, + "learning_rate": 9.576566555431961e-06, + "loss": 2.7068, + "mean_token_accuracy": 0.44070002892681515, + "step": 2284 + }, + { + "epoch": 0.42361883574341863, + "grad_norm": 6.45703125, + "learning_rate": 9.576381164256582e-06, + "loss": 2.7965, + "mean_token_accuracy": 0.43934426229508194, + "step": 2285 + }, + { + "epoch": 0.42380422691879865, + "grad_norm": 6.77734375, + "learning_rate": 9.576195773081202e-06, + "loss": 2.7099, + "mean_token_accuracy": 0.45409778403095324, + "step": 2286 + }, + { + "epoch": 0.42398961809417873, + "grad_norm": 5.3046875, + "learning_rate": 9.576010381905823e-06, + "loss": 2.6851, + "mean_token_accuracy": 0.4520547945205479, + "step": 2287 + }, + { + "epoch": 0.42417500926955876, + "grad_norm": 6.34765625, + "learning_rate": 9.575824990730443e-06, + "loss": 2.902, + "mean_token_accuracy": 0.4282057532910775, + "step": 2288 + }, + { + "epoch": 0.42436040044493883, + "grad_norm": 5.8671875, + "learning_rate": 9.575639599555062e-06, + "loss": 3.0857, + "mean_token_accuracy": 0.4180987600609093, + "step": 2289 + }, + { + "epoch": 0.42454579162031886, + "grad_norm": 6.171875, + "learning_rate": 9.575454208379682e-06, + "loss": 2.998, + "mean_token_accuracy": 0.42883945322969713, + "step": 2290 + }, + { + "epoch": 0.42473118279569894, + "grad_norm": 6.640625, + "learning_rate": 9.575268817204301e-06, + "loss": 2.8029, + "mean_token_accuracy": 0.44484672942312337, + "step": 2291 + }, + { + "epoch": 0.42491657397107896, + "grad_norm": 6.3515625, + "learning_rate": 9.575083426028922e-06, + "loss": 2.6079, + "mean_token_accuracy": 0.4652828533840082, + "step": 2292 + }, + { + "epoch": 0.42510196514645904, + "grad_norm": 7.15625, + "learning_rate": 9.57489803485354e-06, + "loss": 3.2783, + "mean_token_accuracy": 0.3789332738228074, + "step": 2293 + }, + { + "epoch": 0.42528735632183906, + "grad_norm": 5.84765625, + "learning_rate": 9.574712643678161e-06, + "loss": 3.2932, + "mean_token_accuracy": 0.39693890352527916, + "step": 2294 + }, + { + "epoch": 0.42547274749721914, + "grad_norm": 6.05859375, + "learning_rate": 9.574527252502782e-06, + "loss": 2.7962, + "mean_token_accuracy": 0.4486226497595103, + "step": 2295 + }, + { + "epoch": 0.42565813867259916, + "grad_norm": 7.83984375, + "learning_rate": 9.574341861327402e-06, + "loss": 2.7918, + "mean_token_accuracy": 0.4354358082940154, + "step": 2296 + }, + { + "epoch": 0.42584352984797924, + "grad_norm": 7.33203125, + "learning_rate": 9.574156470152023e-06, + "loss": 2.8192, + "mean_token_accuracy": 0.43056141831996625, + "step": 2297 + }, + { + "epoch": 0.42602892102335926, + "grad_norm": 6.8203125, + "learning_rate": 9.573971078976641e-06, + "loss": 2.6568, + "mean_token_accuracy": 0.45289427052569403, + "step": 2298 + }, + { + "epoch": 0.42621431219873934, + "grad_norm": 6.17578125, + "learning_rate": 9.573785687801262e-06, + "loss": 3.233, + "mean_token_accuracy": 0.40052459016393444, + "step": 2299 + }, + { + "epoch": 0.42639970337411937, + "grad_norm": 6.828125, + "learning_rate": 9.57360029662588e-06, + "loss": 3.5155, + "mean_token_accuracy": 0.3681762210972773, + "step": 2300 + }, + { + "epoch": 0.42658509454949944, + "grad_norm": 8.125, + "learning_rate": 9.573414905450501e-06, + "loss": 2.7311, + "mean_token_accuracy": 0.4388185654008439, + "step": 2301 + }, + { + "epoch": 0.4267704857248795, + "grad_norm": 6.0, + "learning_rate": 9.573229514275122e-06, + "loss": 3.2209, + "mean_token_accuracy": 0.38390630083505045, + "step": 2302 + }, + { + "epoch": 0.42695587690025955, + "grad_norm": 6.203125, + "learning_rate": 9.573044123099742e-06, + "loss": 2.3343, + "mean_token_accuracy": 0.482729089351984, + "step": 2303 + }, + { + "epoch": 0.4271412680756396, + "grad_norm": 6.90234375, + "learning_rate": 9.572858731924361e-06, + "loss": 2.3338, + "mean_token_accuracy": 0.49918330308529946, + "step": 2304 + }, + { + "epoch": 0.42732665925101965, + "grad_norm": 4.89453125, + "learning_rate": 9.572673340748981e-06, + "loss": 2.7545, + "mean_token_accuracy": 0.43779510266827715, + "step": 2305 + }, + { + "epoch": 0.4275120504263997, + "grad_norm": 7.27734375, + "learning_rate": 9.572487949573602e-06, + "loss": 2.5788, + "mean_token_accuracy": 0.4614881082260528, + "step": 2306 + }, + { + "epoch": 0.42769744160177975, + "grad_norm": 8.140625, + "learning_rate": 9.57230255839822e-06, + "loss": 2.7198, + "mean_token_accuracy": 0.4642602368383507, + "step": 2307 + }, + { + "epoch": 0.4278828327771598, + "grad_norm": 6.69921875, + "learning_rate": 9.572117167222841e-06, + "loss": 2.688, + "mean_token_accuracy": 0.45572126171307903, + "step": 2308 + }, + { + "epoch": 0.42806822395253985, + "grad_norm": 6.46875, + "learning_rate": 9.57193177604746e-06, + "loss": 3.0823, + "mean_token_accuracy": 0.4181265382944839, + "step": 2309 + }, + { + "epoch": 0.42825361512791993, + "grad_norm": 5.6171875, + "learning_rate": 9.57174638487208e-06, + "loss": 3.3536, + "mean_token_accuracy": 0.3746069182389937, + "step": 2310 + }, + { + "epoch": 0.42843900630329995, + "grad_norm": 6.59765625, + "learning_rate": 9.571560993696701e-06, + "loss": 3.0761, + "mean_token_accuracy": 0.40766457470957407, + "step": 2311 + }, + { + "epoch": 0.42862439747868003, + "grad_norm": 4.83203125, + "learning_rate": 9.571375602521321e-06, + "loss": 3.0865, + "mean_token_accuracy": 0.4098548073625243, + "step": 2312 + }, + { + "epoch": 0.42880978865406005, + "grad_norm": 5.90234375, + "learning_rate": 9.57119021134594e-06, + "loss": 2.7534, + "mean_token_accuracy": 0.4348197748967089, + "step": 2313 + }, + { + "epoch": 0.42899517982944013, + "grad_norm": 5.88671875, + "learning_rate": 9.57100482017056e-06, + "loss": 2.7977, + "mean_token_accuracy": 0.44222160044767767, + "step": 2314 + }, + { + "epoch": 0.42918057100482015, + "grad_norm": 6.6484375, + "learning_rate": 9.570819428995181e-06, + "loss": 2.5792, + "mean_token_accuracy": 0.46234522942461764, + "step": 2315 + }, + { + "epoch": 0.42936596218020023, + "grad_norm": 6.4765625, + "learning_rate": 9.5706340378198e-06, + "loss": 2.7415, + "mean_token_accuracy": 0.44592592592592595, + "step": 2316 + }, + { + "epoch": 0.42955135335558026, + "grad_norm": 6.0390625, + "learning_rate": 9.57044864664442e-06, + "loss": 3.2823, + "mean_token_accuracy": 0.39098291116228334, + "step": 2317 + }, + { + "epoch": 0.42973674453096034, + "grad_norm": 7.40234375, + "learning_rate": 9.57026325546904e-06, + "loss": 3.1571, + "mean_token_accuracy": 0.40427046263345195, + "step": 2318 + }, + { + "epoch": 0.42992213570634036, + "grad_norm": 10.71875, + "learning_rate": 9.570077864293661e-06, + "loss": 2.784, + "mean_token_accuracy": 0.43703358208955223, + "step": 2319 + }, + { + "epoch": 0.43010752688172044, + "grad_norm": 5.8515625, + "learning_rate": 9.56989247311828e-06, + "loss": 2.9161, + "mean_token_accuracy": 0.4267010088001717, + "step": 2320 + }, + { + "epoch": 0.43029291805710046, + "grad_norm": 8.1640625, + "learning_rate": 9.5697070819429e-06, + "loss": 2.8264, + "mean_token_accuracy": 0.44167794316644116, + "step": 2321 + }, + { + "epoch": 0.43047830923248054, + "grad_norm": 6.33984375, + "learning_rate": 9.56952169076752e-06, + "loss": 2.9504, + "mean_token_accuracy": 0.42596030272740254, + "step": 2322 + }, + { + "epoch": 0.43066370040786056, + "grad_norm": 7.79296875, + "learning_rate": 9.56933629959214e-06, + "loss": 2.7541, + "mean_token_accuracy": 0.44746650549007755, + "step": 2323 + }, + { + "epoch": 0.43084909158324064, + "grad_norm": 5.8515625, + "learning_rate": 9.56915090841676e-06, + "loss": 2.5844, + "mean_token_accuracy": 0.45316896690339004, + "step": 2324 + }, + { + "epoch": 0.43103448275862066, + "grad_norm": 6.2265625, + "learning_rate": 9.56896551724138e-06, + "loss": 2.8174, + "mean_token_accuracy": 0.4273395532937518, + "step": 2325 + }, + { + "epoch": 0.43121987393400074, + "grad_norm": 6.6328125, + "learning_rate": 9.568780126066e-06, + "loss": 2.8076, + "mean_token_accuracy": 0.43569154091097884, + "step": 2326 + }, + { + "epoch": 0.4314052651093808, + "grad_norm": 7.4453125, + "learning_rate": 9.56859473489062e-06, + "loss": 2.7529, + "mean_token_accuracy": 0.4272092627277371, + "step": 2327 + }, + { + "epoch": 0.43159065628476084, + "grad_norm": 6.33203125, + "learning_rate": 9.56840934371524e-06, + "loss": 2.7995, + "mean_token_accuracy": 0.44236709478133635, + "step": 2328 + }, + { + "epoch": 0.4317760474601409, + "grad_norm": 5.62109375, + "learning_rate": 9.56822395253986e-06, + "loss": 3.1791, + "mean_token_accuracy": 0.41021897810218977, + "step": 2329 + }, + { + "epoch": 0.43196143863552094, + "grad_norm": 8.34375, + "learning_rate": 9.56803856136448e-06, + "loss": 2.577, + "mean_token_accuracy": 0.46273964131106987, + "step": 2330 + }, + { + "epoch": 0.432146829810901, + "grad_norm": 6.609375, + "learning_rate": 9.5678531701891e-06, + "loss": 2.8645, + "mean_token_accuracy": 0.42329700272479565, + "step": 2331 + }, + { + "epoch": 0.43233222098628105, + "grad_norm": 8.234375, + "learning_rate": 9.56766777901372e-06, + "loss": 2.4701, + "mean_token_accuracy": 0.48970716149608584, + "step": 2332 + }, + { + "epoch": 0.4325176121616611, + "grad_norm": 6.546875, + "learning_rate": 9.56748238783834e-06, + "loss": 3.0891, + "mean_token_accuracy": 0.4025445292620865, + "step": 2333 + }, + { + "epoch": 0.43270300333704115, + "grad_norm": 7.6484375, + "learning_rate": 9.567296996662959e-06, + "loss": 3.1577, + "mean_token_accuracy": 0.4045156407669021, + "step": 2334 + }, + { + "epoch": 0.4328883945124212, + "grad_norm": 6.54296875, + "learning_rate": 9.567111605487581e-06, + "loss": 2.8963, + "mean_token_accuracy": 0.41485784163864264, + "step": 2335 + }, + { + "epoch": 0.43307378568780125, + "grad_norm": 5.66796875, + "learning_rate": 9.5669262143122e-06, + "loss": 2.4262, + "mean_token_accuracy": 0.498676293622142, + "step": 2336 + }, + { + "epoch": 0.43325917686318133, + "grad_norm": 6.58984375, + "learning_rate": 9.56674082313682e-06, + "loss": 2.8299, + "mean_token_accuracy": 0.43812036688026135, + "step": 2337 + }, + { + "epoch": 0.43344456803856135, + "grad_norm": 5.890625, + "learning_rate": 9.566555431961439e-06, + "loss": 3.2132, + "mean_token_accuracy": 0.4051290374939133, + "step": 2338 + }, + { + "epoch": 0.43362995921394143, + "grad_norm": 6.421875, + "learning_rate": 9.56637004078606e-06, + "loss": 3.0045, + "mean_token_accuracy": 0.41964285714285715, + "step": 2339 + }, + { + "epoch": 0.43381535038932145, + "grad_norm": 7.3125, + "learning_rate": 9.56618464961068e-06, + "loss": 2.7718, + "mean_token_accuracy": 0.4379874213836478, + "step": 2340 + }, + { + "epoch": 0.43400074156470153, + "grad_norm": 6.0703125, + "learning_rate": 9.565999258435299e-06, + "loss": 3.4193, + "mean_token_accuracy": 0.37652681890600104, + "step": 2341 + }, + { + "epoch": 0.43418613274008155, + "grad_norm": 9.0, + "learning_rate": 9.56581386725992e-06, + "loss": 3.2395, + "mean_token_accuracy": 0.402314137518287, + "step": 2342 + }, + { + "epoch": 0.43437152391546163, + "grad_norm": 9.6875, + "learning_rate": 9.56562847608454e-06, + "loss": 3.2184, + "mean_token_accuracy": 0.37963930998431783, + "step": 2343 + }, + { + "epoch": 0.43455691509084166, + "grad_norm": 6.58203125, + "learning_rate": 9.56544308490916e-06, + "loss": 3.0456, + "mean_token_accuracy": 0.4261033877716291, + "step": 2344 + }, + { + "epoch": 0.43474230626622173, + "grad_norm": 8.9453125, + "learning_rate": 9.565257693733779e-06, + "loss": 3.0722, + "mean_token_accuracy": 0.4226220223221723, + "step": 2345 + }, + { + "epoch": 0.43492769744160176, + "grad_norm": 6.67578125, + "learning_rate": 9.5650723025584e-06, + "loss": 2.719, + "mean_token_accuracy": 0.44951830443159924, + "step": 2346 + }, + { + "epoch": 0.43511308861698184, + "grad_norm": 5.71484375, + "learning_rate": 9.564886911383018e-06, + "loss": 2.4749, + "mean_token_accuracy": 0.4772456870910173, + "step": 2347 + }, + { + "epoch": 0.43529847979236186, + "grad_norm": 7.4921875, + "learning_rate": 9.564701520207639e-06, + "loss": 2.8291, + "mean_token_accuracy": 0.44114394059093065, + "step": 2348 + }, + { + "epoch": 0.43548387096774194, + "grad_norm": 9.1328125, + "learning_rate": 9.56451612903226e-06, + "loss": 2.6548, + "mean_token_accuracy": 0.4633623768033138, + "step": 2349 + }, + { + "epoch": 0.43566926214312196, + "grad_norm": 6.78125, + "learning_rate": 9.564330737856878e-06, + "loss": 2.4607, + "mean_token_accuracy": 0.486877405808935, + "step": 2350 + }, + { + "epoch": 0.43585465331850204, + "grad_norm": 8.0625, + "learning_rate": 9.564145346681499e-06, + "loss": 2.5791, + "mean_token_accuracy": 0.46268896751367, + "step": 2351 + }, + { + "epoch": 0.4360400444938821, + "grad_norm": 10.15625, + "learning_rate": 9.563959955506119e-06, + "loss": 2.9099, + "mean_token_accuracy": 0.4356060606060606, + "step": 2352 + }, + { + "epoch": 0.43622543566926214, + "grad_norm": 8.171875, + "learning_rate": 9.56377456433074e-06, + "loss": 2.9472, + "mean_token_accuracy": 0.4171475680131904, + "step": 2353 + }, + { + "epoch": 0.4364108268446422, + "grad_norm": 5.796875, + "learning_rate": 9.563589173155358e-06, + "loss": 3.0594, + "mean_token_accuracy": 0.4031359906213365, + "step": 2354 + }, + { + "epoch": 0.43659621802002224, + "grad_norm": 7.07421875, + "learning_rate": 9.563403781979979e-06, + "loss": 2.4322, + "mean_token_accuracy": 0.49316615836439626, + "step": 2355 + }, + { + "epoch": 0.4367816091954023, + "grad_norm": 8.3203125, + "learning_rate": 9.563218390804598e-06, + "loss": 2.6021, + "mean_token_accuracy": 0.4652005799903335, + "step": 2356 + }, + { + "epoch": 0.43696700037078234, + "grad_norm": 9.109375, + "learning_rate": 9.563032999629218e-06, + "loss": 3.1881, + "mean_token_accuracy": 0.40682058246567077, + "step": 2357 + }, + { + "epoch": 0.4371523915461624, + "grad_norm": 7.75, + "learning_rate": 9.562847608453839e-06, + "loss": 2.2675, + "mean_token_accuracy": 0.5075212557226946, + "step": 2358 + }, + { + "epoch": 0.43733778272154245, + "grad_norm": 8.625, + "learning_rate": 9.562662217278457e-06, + "loss": 3.2012, + "mean_token_accuracy": 0.4028294862248697, + "step": 2359 + }, + { + "epoch": 0.4375231738969225, + "grad_norm": 8.6171875, + "learning_rate": 9.562476826103078e-06, + "loss": 2.4626, + "mean_token_accuracy": 0.4647638292498106, + "step": 2360 + }, + { + "epoch": 0.43770856507230255, + "grad_norm": 6.73046875, + "learning_rate": 9.562291434927698e-06, + "loss": 2.8055, + "mean_token_accuracy": 0.4294911734164071, + "step": 2361 + }, + { + "epoch": 0.4378939562476826, + "grad_norm": 5.05859375, + "learning_rate": 9.562106043752319e-06, + "loss": 2.6261, + "mean_token_accuracy": 0.4549322584257082, + "step": 2362 + }, + { + "epoch": 0.43807934742306265, + "grad_norm": 6.28515625, + "learning_rate": 9.561920652576938e-06, + "loss": 2.6777, + "mean_token_accuracy": 0.4393713588944587, + "step": 2363 + }, + { + "epoch": 0.4382647385984427, + "grad_norm": 4.9375, + "learning_rate": 9.561735261401558e-06, + "loss": 2.9382, + "mean_token_accuracy": 0.43802674457804247, + "step": 2364 + }, + { + "epoch": 0.43845012977382275, + "grad_norm": 6.2421875, + "learning_rate": 9.561549870226177e-06, + "loss": 3.0734, + "mean_token_accuracy": 0.4129256428075052, + "step": 2365 + }, + { + "epoch": 0.43863552094920283, + "grad_norm": 5.89453125, + "learning_rate": 9.561364479050797e-06, + "loss": 2.3614, + "mean_token_accuracy": 0.48459586806814064, + "step": 2366 + }, + { + "epoch": 0.43882091212458285, + "grad_norm": 6.14453125, + "learning_rate": 9.561179087875418e-06, + "loss": 3.1097, + "mean_token_accuracy": 0.4179781275006668, + "step": 2367 + }, + { + "epoch": 0.43900630329996293, + "grad_norm": 6.59765625, + "learning_rate": 9.560993696700038e-06, + "loss": 2.5272, + "mean_token_accuracy": 0.4798509201024924, + "step": 2368 + }, + { + "epoch": 0.43919169447534295, + "grad_norm": 6.15234375, + "learning_rate": 9.560808305524659e-06, + "loss": 2.8779, + "mean_token_accuracy": 0.44212888746393075, + "step": 2369 + }, + { + "epoch": 0.43937708565072303, + "grad_norm": 5.94140625, + "learning_rate": 9.560622914349278e-06, + "loss": 3.344, + "mean_token_accuracy": 0.3872888396566048, + "step": 2370 + }, + { + "epoch": 0.43956247682610305, + "grad_norm": 4.64453125, + "learning_rate": 9.560437523173898e-06, + "loss": 2.633, + "mean_token_accuracy": 0.46213997132803336, + "step": 2371 + }, + { + "epoch": 0.43974786800148313, + "grad_norm": 6.6328125, + "learning_rate": 9.560252131998517e-06, + "loss": 2.9029, + "mean_token_accuracy": 0.42407795732083386, + "step": 2372 + }, + { + "epoch": 0.43993325917686316, + "grad_norm": 6.07421875, + "learning_rate": 9.560066740823138e-06, + "loss": 2.5744, + "mean_token_accuracy": 0.4665350010972131, + "step": 2373 + }, + { + "epoch": 0.44011865035224323, + "grad_norm": 6.22265625, + "learning_rate": 9.559881349647756e-06, + "loss": 2.4988, + "mean_token_accuracy": 0.4788270473618212, + "step": 2374 + }, + { + "epoch": 0.44030404152762326, + "grad_norm": 7.6640625, + "learning_rate": 9.559695958472377e-06, + "loss": 2.77, + "mean_token_accuracy": 0.4401874310915105, + "step": 2375 + }, + { + "epoch": 0.44048943270300334, + "grad_norm": 7.8515625, + "learning_rate": 9.559510567296997e-06, + "loss": 3.1516, + "mean_token_accuracy": 0.40589600109185203, + "step": 2376 + }, + { + "epoch": 0.4406748238783834, + "grad_norm": 8.8203125, + "learning_rate": 9.559325176121618e-06, + "loss": 2.8167, + "mean_token_accuracy": 0.4267715043577113, + "step": 2377 + }, + { + "epoch": 0.44086021505376344, + "grad_norm": 6.8515625, + "learning_rate": 9.559139784946238e-06, + "loss": 3.1831, + "mean_token_accuracy": 0.4064989845336666, + "step": 2378 + }, + { + "epoch": 0.4410456062291435, + "grad_norm": 5.78515625, + "learning_rate": 9.558954393770857e-06, + "loss": 3.0379, + "mean_token_accuracy": 0.4132280649486072, + "step": 2379 + }, + { + "epoch": 0.44123099740452354, + "grad_norm": 4.94921875, + "learning_rate": 9.558769002595478e-06, + "loss": 2.871, + "mean_token_accuracy": 0.4444712117562033, + "step": 2380 + }, + { + "epoch": 0.4414163885799036, + "grad_norm": 5.73828125, + "learning_rate": 9.558583611420096e-06, + "loss": 2.5853, + "mean_token_accuracy": 0.47842866988283944, + "step": 2381 + }, + { + "epoch": 0.44160177975528364, + "grad_norm": 6.67578125, + "learning_rate": 9.558398220244717e-06, + "loss": 2.8789, + "mean_token_accuracy": 0.42940461725394896, + "step": 2382 + }, + { + "epoch": 0.4417871709306637, + "grad_norm": 5.6171875, + "learning_rate": 9.558212829069337e-06, + "loss": 3.0894, + "mean_token_accuracy": 0.3994656917885264, + "step": 2383 + }, + { + "epoch": 0.44197256210604374, + "grad_norm": 5.92578125, + "learning_rate": 9.558027437893958e-06, + "loss": 2.6715, + "mean_token_accuracy": 0.46623990245389424, + "step": 2384 + }, + { + "epoch": 0.4421579532814238, + "grad_norm": 7.34765625, + "learning_rate": 9.557842046718577e-06, + "loss": 4.0918, + "mean_token_accuracy": 0.33005341579983133, + "step": 2385 + }, + { + "epoch": 0.44234334445680384, + "grad_norm": 7.55859375, + "learning_rate": 9.557656655543197e-06, + "loss": 3.1334, + "mean_token_accuracy": 0.40554048265029274, + "step": 2386 + }, + { + "epoch": 0.4425287356321839, + "grad_norm": 11.5859375, + "learning_rate": 9.557471264367818e-06, + "loss": 2.6694, + "mean_token_accuracy": 0.44031130457723255, + "step": 2387 + }, + { + "epoch": 0.44271412680756395, + "grad_norm": 8.2890625, + "learning_rate": 9.557285873192436e-06, + "loss": 2.7522, + "mean_token_accuracy": 0.4521486643437863, + "step": 2388 + }, + { + "epoch": 0.442899517982944, + "grad_norm": 7.2265625, + "learning_rate": 9.557100482017057e-06, + "loss": 2.7545, + "mean_token_accuracy": 0.4453001371452685, + "step": 2389 + }, + { + "epoch": 0.44308490915832405, + "grad_norm": 5.26171875, + "learning_rate": 9.556915090841676e-06, + "loss": 2.7894, + "mean_token_accuracy": 0.44506866416978774, + "step": 2390 + }, + { + "epoch": 0.4432703003337041, + "grad_norm": 5.91015625, + "learning_rate": 9.556729699666296e-06, + "loss": 3.3722, + "mean_token_accuracy": 0.37681856438619693, + "step": 2391 + }, + { + "epoch": 0.44345569150908415, + "grad_norm": 7.51171875, + "learning_rate": 9.556544308490917e-06, + "loss": 2.9046, + "mean_token_accuracy": 0.42397876419407166, + "step": 2392 + }, + { + "epoch": 0.4436410826844642, + "grad_norm": 5.38671875, + "learning_rate": 9.556358917315537e-06, + "loss": 2.8799, + "mean_token_accuracy": 0.420169014084507, + "step": 2393 + }, + { + "epoch": 0.44382647385984425, + "grad_norm": 5.63671875, + "learning_rate": 9.556173526140156e-06, + "loss": 3.1938, + "mean_token_accuracy": 0.39748180663047245, + "step": 2394 + }, + { + "epoch": 0.44401186503522433, + "grad_norm": 5.4921875, + "learning_rate": 9.555988134964776e-06, + "loss": 2.8314, + "mean_token_accuracy": 0.4373899924566256, + "step": 2395 + }, + { + "epoch": 0.44419725621060435, + "grad_norm": 5.92578125, + "learning_rate": 9.555802743789397e-06, + "loss": 2.5548, + "mean_token_accuracy": 0.4647790055248619, + "step": 2396 + }, + { + "epoch": 0.44438264738598443, + "grad_norm": 6.48828125, + "learning_rate": 9.555617352614016e-06, + "loss": 2.4911, + "mean_token_accuracy": 0.48619145362940847, + "step": 2397 + }, + { + "epoch": 0.44456803856136445, + "grad_norm": 9.25, + "learning_rate": 9.555431961438636e-06, + "loss": 3.1625, + "mean_token_accuracy": 0.38819238659676314, + "step": 2398 + }, + { + "epoch": 0.44475342973674453, + "grad_norm": 10.421875, + "learning_rate": 9.555246570263255e-06, + "loss": 2.6799, + "mean_token_accuracy": 0.4431665421956684, + "step": 2399 + }, + { + "epoch": 0.44493882091212456, + "grad_norm": 7.46484375, + "learning_rate": 9.555061179087877e-06, + "loss": 2.6659, + "mean_token_accuracy": 0.45439935717155483, + "step": 2400 + }, + { + "epoch": 0.44512421208750463, + "grad_norm": 6.1796875, + "learning_rate": 9.554875787912496e-06, + "loss": 2.7457, + "mean_token_accuracy": 0.45843536538213686, + "step": 2401 + }, + { + "epoch": 0.4453096032628847, + "grad_norm": 6.33203125, + "learning_rate": 9.554690396737117e-06, + "loss": 3.0558, + "mean_token_accuracy": 0.4189925681255161, + "step": 2402 + }, + { + "epoch": 0.44549499443826474, + "grad_norm": 8.8046875, + "learning_rate": 9.554505005561735e-06, + "loss": 3.0269, + "mean_token_accuracy": 0.40975212382171533, + "step": 2403 + }, + { + "epoch": 0.4456803856136448, + "grad_norm": 11.3671875, + "learning_rate": 9.554319614386356e-06, + "loss": 2.4509, + "mean_token_accuracy": 0.4916887496988677, + "step": 2404 + }, + { + "epoch": 0.44586577678902484, + "grad_norm": 5.82421875, + "learning_rate": 9.554134223210976e-06, + "loss": 2.7319, + "mean_token_accuracy": 0.45618293306080737, + "step": 2405 + }, + { + "epoch": 0.4460511679644049, + "grad_norm": 6.3671875, + "learning_rate": 9.553948832035595e-06, + "loss": 2.8403, + "mean_token_accuracy": 0.4468239039212849, + "step": 2406 + }, + { + "epoch": 0.44623655913978494, + "grad_norm": 6.171875, + "learning_rate": 9.553763440860216e-06, + "loss": 2.9621, + "mean_token_accuracy": 0.41920103092783506, + "step": 2407 + }, + { + "epoch": 0.446421950315165, + "grad_norm": 6.06640625, + "learning_rate": 9.553578049684836e-06, + "loss": 2.7995, + "mean_token_accuracy": 0.4360921285750443, + "step": 2408 + }, + { + "epoch": 0.44660734149054504, + "grad_norm": 4.71484375, + "learning_rate": 9.553392658509457e-06, + "loss": 2.3697, + "mean_token_accuracy": 0.4950892385679586, + "step": 2409 + }, + { + "epoch": 0.4467927326659251, + "grad_norm": 8.390625, + "learning_rate": 9.553207267334075e-06, + "loss": 3.0177, + "mean_token_accuracy": 0.4204398447606727, + "step": 2410 + }, + { + "epoch": 0.44697812384130514, + "grad_norm": 5.50390625, + "learning_rate": 9.553021876158696e-06, + "loss": 2.6889, + "mean_token_accuracy": 0.45392249527410206, + "step": 2411 + }, + { + "epoch": 0.4471635150166852, + "grad_norm": 7.3046875, + "learning_rate": 9.552836484983316e-06, + "loss": 2.1962, + "mean_token_accuracy": 0.5082932049224184, + "step": 2412 + }, + { + "epoch": 0.44734890619206524, + "grad_norm": 5.8203125, + "learning_rate": 9.552651093807935e-06, + "loss": 3.007, + "mean_token_accuracy": 0.4300649901102006, + "step": 2413 + }, + { + "epoch": 0.4475342973674453, + "grad_norm": 5.09375, + "learning_rate": 9.552465702632556e-06, + "loss": 3.198, + "mean_token_accuracy": 0.40052682656314986, + "step": 2414 + }, + { + "epoch": 0.44771968854282534, + "grad_norm": 6.99609375, + "learning_rate": 9.552280311457174e-06, + "loss": 3.1008, + "mean_token_accuracy": 0.411509900990099, + "step": 2415 + }, + { + "epoch": 0.4479050797182054, + "grad_norm": 6.76171875, + "learning_rate": 9.552094920281797e-06, + "loss": 2.7771, + "mean_token_accuracy": 0.4511797679572416, + "step": 2416 + }, + { + "epoch": 0.44809047089358545, + "grad_norm": 7.44140625, + "learning_rate": 9.551909529106415e-06, + "loss": 2.5745, + "mean_token_accuracy": 0.45569935291189645, + "step": 2417 + }, + { + "epoch": 0.4482758620689655, + "grad_norm": 5.48046875, + "learning_rate": 9.551724137931036e-06, + "loss": 2.9116, + "mean_token_accuracy": 0.4231526447040294, + "step": 2418 + }, + { + "epoch": 0.44846125324434555, + "grad_norm": 5.53515625, + "learning_rate": 9.551538746755655e-06, + "loss": 2.9503, + "mean_token_accuracy": 0.4347772277227723, + "step": 2419 + }, + { + "epoch": 0.4486466444197256, + "grad_norm": 10.3984375, + "learning_rate": 9.551353355580275e-06, + "loss": 3.1357, + "mean_token_accuracy": 0.4181628392484342, + "step": 2420 + }, + { + "epoch": 0.44883203559510565, + "grad_norm": 11.2421875, + "learning_rate": 9.551167964404896e-06, + "loss": 2.8802, + "mean_token_accuracy": 0.4169553026554511, + "step": 2421 + }, + { + "epoch": 0.44901742677048573, + "grad_norm": 6.265625, + "learning_rate": 9.550982573229514e-06, + "loss": 2.4579, + "mean_token_accuracy": 0.4744744744744745, + "step": 2422 + }, + { + "epoch": 0.44920281794586575, + "grad_norm": 9.625, + "learning_rate": 9.550797182054135e-06, + "loss": 2.4746, + "mean_token_accuracy": 0.48040533553924103, + "step": 2423 + }, + { + "epoch": 0.44938820912124583, + "grad_norm": 6.62109375, + "learning_rate": 9.550611790878755e-06, + "loss": 2.922, + "mean_token_accuracy": 0.4366966215447617, + "step": 2424 + }, + { + "epoch": 0.4495736002966259, + "grad_norm": 6.5546875, + "learning_rate": 9.550426399703376e-06, + "loss": 2.6024, + "mean_token_accuracy": 0.4517479074347612, + "step": 2425 + }, + { + "epoch": 0.44975899147200593, + "grad_norm": 6.3515625, + "learning_rate": 9.550241008527995e-06, + "loss": 2.8228, + "mean_token_accuracy": 0.4199166053470689, + "step": 2426 + }, + { + "epoch": 0.449944382647386, + "grad_norm": 6.91796875, + "learning_rate": 9.550055617352615e-06, + "loss": 2.7367, + "mean_token_accuracy": 0.4427722772277228, + "step": 2427 + }, + { + "epoch": 0.45012977382276603, + "grad_norm": 9.546875, + "learning_rate": 9.549870226177234e-06, + "loss": 2.388, + "mean_token_accuracy": 0.4941823566744235, + "step": 2428 + }, + { + "epoch": 0.4503151649981461, + "grad_norm": 6.58984375, + "learning_rate": 9.549684835001855e-06, + "loss": 2.5457, + "mean_token_accuracy": 0.4641255605381166, + "step": 2429 + }, + { + "epoch": 0.45050055617352613, + "grad_norm": 7.33984375, + "learning_rate": 9.549499443826475e-06, + "loss": 2.837, + "mean_token_accuracy": 0.4302994011976048, + "step": 2430 + }, + { + "epoch": 0.4506859473489062, + "grad_norm": 6.92578125, + "learning_rate": 9.549314052651094e-06, + "loss": 3.2055, + "mean_token_accuracy": 0.39765851091817944, + "step": 2431 + }, + { + "epoch": 0.45087133852428624, + "grad_norm": 8.984375, + "learning_rate": 9.549128661475714e-06, + "loss": 2.5708, + "mean_token_accuracy": 0.4770676226974904, + "step": 2432 + }, + { + "epoch": 0.4510567296996663, + "grad_norm": 5.28125, + "learning_rate": 9.548943270300335e-06, + "loss": 3.1242, + "mean_token_accuracy": 0.40918282383727844, + "step": 2433 + }, + { + "epoch": 0.45124212087504634, + "grad_norm": 6.37109375, + "learning_rate": 9.548757879124955e-06, + "loss": 2.972, + "mean_token_accuracy": 0.4305274971941639, + "step": 2434 + }, + { + "epoch": 0.4514275120504264, + "grad_norm": 6.40625, + "learning_rate": 9.548572487949574e-06, + "loss": 2.81, + "mean_token_accuracy": 0.4394027254276602, + "step": 2435 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 5.71875, + "learning_rate": 9.548387096774195e-06, + "loss": 3.103, + "mean_token_accuracy": 0.44838590892353014, + "step": 2436 + }, + { + "epoch": 0.4517982944011865, + "grad_norm": 6.60546875, + "learning_rate": 9.548201705598813e-06, + "loss": 2.5761, + "mean_token_accuracy": 0.46043364176218754, + "step": 2437 + }, + { + "epoch": 0.45198368557656654, + "grad_norm": 7.34765625, + "learning_rate": 9.548016314423434e-06, + "loss": 2.6479, + "mean_token_accuracy": 0.46520433476955214, + "step": 2438 + }, + { + "epoch": 0.4521690767519466, + "grad_norm": 5.79296875, + "learning_rate": 9.547830923248054e-06, + "loss": 3.2091, + "mean_token_accuracy": 0.40176177709689775, + "step": 2439 + }, + { + "epoch": 0.45235446792732664, + "grad_norm": 6.54296875, + "learning_rate": 9.547645532072675e-06, + "loss": 3.4627, + "mean_token_accuracy": 0.3813646670789528, + "step": 2440 + }, + { + "epoch": 0.4525398591027067, + "grad_norm": 4.88671875, + "learning_rate": 9.547460140897294e-06, + "loss": 2.8834, + "mean_token_accuracy": 0.4404322614654718, + "step": 2441 + }, + { + "epoch": 0.45272525027808674, + "grad_norm": 6.4453125, + "learning_rate": 9.547274749721914e-06, + "loss": 2.9165, + "mean_token_accuracy": 0.4363368134362233, + "step": 2442 + }, + { + "epoch": 0.4529106414534668, + "grad_norm": 7.5, + "learning_rate": 9.547089358546535e-06, + "loss": 2.6656, + "mean_token_accuracy": 0.4488610478359909, + "step": 2443 + }, + { + "epoch": 0.45309603262884685, + "grad_norm": 6.4453125, + "learning_rate": 9.546903967371153e-06, + "loss": 2.9692, + "mean_token_accuracy": 0.4321608040201005, + "step": 2444 + }, + { + "epoch": 0.4532814238042269, + "grad_norm": 5.8515625, + "learning_rate": 9.546718576195774e-06, + "loss": 2.8917, + "mean_token_accuracy": 0.4230594643667726, + "step": 2445 + }, + { + "epoch": 0.45346681497960695, + "grad_norm": 7.15625, + "learning_rate": 9.546533185020393e-06, + "loss": 2.5575, + "mean_token_accuracy": 0.46420598303318944, + "step": 2446 + }, + { + "epoch": 0.453652206154987, + "grad_norm": 5.75, + "learning_rate": 9.546347793845013e-06, + "loss": 2.8305, + "mean_token_accuracy": 0.43079412496888225, + "step": 2447 + }, + { + "epoch": 0.45383759733036705, + "grad_norm": 5.39453125, + "learning_rate": 9.546162402669634e-06, + "loss": 2.5969, + "mean_token_accuracy": 0.4633488600147095, + "step": 2448 + }, + { + "epoch": 0.4540229885057471, + "grad_norm": 6.09765625, + "learning_rate": 9.545977011494254e-06, + "loss": 2.3082, + "mean_token_accuracy": 0.4954656686339427, + "step": 2449 + }, + { + "epoch": 0.4542083796811272, + "grad_norm": 6.16796875, + "learning_rate": 9.545791620318875e-06, + "loss": 3.1047, + "mean_token_accuracy": 0.3958333333333333, + "step": 2450 + }, + { + "epoch": 0.45439377085650723, + "grad_norm": 6.1484375, + "learning_rate": 9.545606229143493e-06, + "loss": 2.9332, + "mean_token_accuracy": 0.43058846761453395, + "step": 2451 + }, + { + "epoch": 0.4545791620318873, + "grad_norm": 8.9921875, + "learning_rate": 9.545420837968114e-06, + "loss": 2.7718, + "mean_token_accuracy": 0.44430459408432976, + "step": 2452 + }, + { + "epoch": 0.45476455320726733, + "grad_norm": 5.671875, + "learning_rate": 9.545235446792733e-06, + "loss": 2.9261, + "mean_token_accuracy": 0.4274848746758859, + "step": 2453 + }, + { + "epoch": 0.4549499443826474, + "grad_norm": 9.6640625, + "learning_rate": 9.545050055617353e-06, + "loss": 2.756, + "mean_token_accuracy": 0.4496743271475974, + "step": 2454 + }, + { + "epoch": 0.45513533555802743, + "grad_norm": 5.26953125, + "learning_rate": 9.544864664441972e-06, + "loss": 3.2629, + "mean_token_accuracy": 0.38518107395587625, + "step": 2455 + }, + { + "epoch": 0.4553207267334075, + "grad_norm": 7.94140625, + "learning_rate": 9.544679273266594e-06, + "loss": 2.7715, + "mean_token_accuracy": 0.4507629816664837, + "step": 2456 + }, + { + "epoch": 0.45550611790878753, + "grad_norm": 5.82421875, + "learning_rate": 9.544493882091213e-06, + "loss": 2.6489, + "mean_token_accuracy": 0.4534412955465587, + "step": 2457 + }, + { + "epoch": 0.4556915090841676, + "grad_norm": 6.69921875, + "learning_rate": 9.544308490915834e-06, + "loss": 2.5872, + "mean_token_accuracy": 0.46634486457495306, + "step": 2458 + }, + { + "epoch": 0.45587690025954763, + "grad_norm": 8.296875, + "learning_rate": 9.544123099740454e-06, + "loss": 2.778, + "mean_token_accuracy": 0.4428692340245451, + "step": 2459 + }, + { + "epoch": 0.4560622914349277, + "grad_norm": 7.578125, + "learning_rate": 9.543937708565073e-06, + "loss": 2.6885, + "mean_token_accuracy": 0.45277008310249306, + "step": 2460 + }, + { + "epoch": 0.45624768261030774, + "grad_norm": 7.8515625, + "learning_rate": 9.543752317389693e-06, + "loss": 2.6234, + "mean_token_accuracy": 0.46833013435700577, + "step": 2461 + }, + { + "epoch": 0.4564330737856878, + "grad_norm": 6.19140625, + "learning_rate": 9.543566926214312e-06, + "loss": 2.5895, + "mean_token_accuracy": 0.473430458023268, + "step": 2462 + }, + { + "epoch": 0.45661846496106784, + "grad_norm": 6.98828125, + "learning_rate": 9.543381535038933e-06, + "loss": 3.2186, + "mean_token_accuracy": 0.4083129584352078, + "step": 2463 + }, + { + "epoch": 0.4568038561364479, + "grad_norm": 7.05859375, + "learning_rate": 9.543196143863553e-06, + "loss": 3.2707, + "mean_token_accuracy": 0.39071450105517036, + "step": 2464 + }, + { + "epoch": 0.45698924731182794, + "grad_norm": 9.28125, + "learning_rate": 9.543010752688174e-06, + "loss": 2.7024, + "mean_token_accuracy": 0.45409096451652237, + "step": 2465 + }, + { + "epoch": 0.457174638487208, + "grad_norm": 7.96484375, + "learning_rate": 9.542825361512792e-06, + "loss": 2.4721, + "mean_token_accuracy": 0.46788790753462, + "step": 2466 + }, + { + "epoch": 0.45736002966258804, + "grad_norm": 8.09375, + "learning_rate": 9.542639970337413e-06, + "loss": 3.2281, + "mean_token_accuracy": 0.3880880545312325, + "step": 2467 + }, + { + "epoch": 0.4575454208379681, + "grad_norm": 7.71484375, + "learning_rate": 9.542454579162033e-06, + "loss": 2.6807, + "mean_token_accuracy": 0.45368811047490737, + "step": 2468 + }, + { + "epoch": 0.45773081201334814, + "grad_norm": 7.4921875, + "learning_rate": 9.542269187986652e-06, + "loss": 2.9751, + "mean_token_accuracy": 0.4145720649061406, + "step": 2469 + }, + { + "epoch": 0.4579162031887282, + "grad_norm": 5.96875, + "learning_rate": 9.542083796811273e-06, + "loss": 3.0603, + "mean_token_accuracy": 0.40944977529454635, + "step": 2470 + }, + { + "epoch": 0.45810159436410824, + "grad_norm": 7.0078125, + "learning_rate": 9.541898405635891e-06, + "loss": 3.3703, + "mean_token_accuracy": 0.37236731255265376, + "step": 2471 + }, + { + "epoch": 0.4582869855394883, + "grad_norm": 6.28125, + "learning_rate": 9.541713014460514e-06, + "loss": 2.7076, + "mean_token_accuracy": 0.46057263553156536, + "step": 2472 + }, + { + "epoch": 0.45847237671486835, + "grad_norm": 6.23046875, + "learning_rate": 9.541527623285132e-06, + "loss": 3.3086, + "mean_token_accuracy": 0.38600031041440325, + "step": 2473 + }, + { + "epoch": 0.4586577678902484, + "grad_norm": 6.66796875, + "learning_rate": 9.541342232109753e-06, + "loss": 3.055, + "mean_token_accuracy": 0.42121102684482964, + "step": 2474 + }, + { + "epoch": 0.4588431590656285, + "grad_norm": 7.5625, + "learning_rate": 9.541156840934372e-06, + "loss": 3.066, + "mean_token_accuracy": 0.4144700713893465, + "step": 2475 + }, + { + "epoch": 0.4590285502410085, + "grad_norm": 7.6640625, + "learning_rate": 9.540971449758992e-06, + "loss": 2.2571, + "mean_token_accuracy": 0.5086698043101313, + "step": 2476 + }, + { + "epoch": 0.4592139414163886, + "grad_norm": 6.546875, + "learning_rate": 9.540786058583613e-06, + "loss": 2.9718, + "mean_token_accuracy": 0.40262197594269494, + "step": 2477 + }, + { + "epoch": 0.4593993325917686, + "grad_norm": 9.4375, + "learning_rate": 9.540600667408232e-06, + "loss": 3.2434, + "mean_token_accuracy": 0.39374124241008873, + "step": 2478 + }, + { + "epoch": 0.4595847237671487, + "grad_norm": 5.97265625, + "learning_rate": 9.540415276232852e-06, + "loss": 2.9946, + "mean_token_accuracy": 0.43155586334256696, + "step": 2479 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 6.875, + "learning_rate": 9.54022988505747e-06, + "loss": 2.8389, + "mean_token_accuracy": 0.44116073816528484, + "step": 2480 + }, + { + "epoch": 0.4599555061179088, + "grad_norm": 5.921875, + "learning_rate": 9.540044493882093e-06, + "loss": 3.0083, + "mean_token_accuracy": 0.420907418761496, + "step": 2481 + }, + { + "epoch": 0.46014089729328883, + "grad_norm": 5.1875, + "learning_rate": 9.539859102706712e-06, + "loss": 2.8483, + "mean_token_accuracy": 0.42932015573525006, + "step": 2482 + }, + { + "epoch": 0.4603262884686689, + "grad_norm": 5.86328125, + "learning_rate": 9.539673711531332e-06, + "loss": 2.5072, + "mean_token_accuracy": 0.46636389440817483, + "step": 2483 + }, + { + "epoch": 0.46051167964404893, + "grad_norm": 9.84375, + "learning_rate": 9.539488320355951e-06, + "loss": 2.5965, + "mean_token_accuracy": 0.4535322078347812, + "step": 2484 + }, + { + "epoch": 0.460697070819429, + "grad_norm": 6.734375, + "learning_rate": 9.539302929180572e-06, + "loss": 2.7789, + "mean_token_accuracy": 0.4339500582222797, + "step": 2485 + }, + { + "epoch": 0.46088246199480903, + "grad_norm": 6.59375, + "learning_rate": 9.539117538005192e-06, + "loss": 3.76, + "mean_token_accuracy": 0.3526640493550196, + "step": 2486 + }, + { + "epoch": 0.4610678531701891, + "grad_norm": 6.25390625, + "learning_rate": 9.538932146829811e-06, + "loss": 2.9707, + "mean_token_accuracy": 0.4208, + "step": 2487 + }, + { + "epoch": 0.46125324434556914, + "grad_norm": 8.21875, + "learning_rate": 9.538746755654431e-06, + "loss": 2.8035, + "mean_token_accuracy": 0.43923737447242034, + "step": 2488 + }, + { + "epoch": 0.4614386355209492, + "grad_norm": 8.3671875, + "learning_rate": 9.538561364479052e-06, + "loss": 2.4987, + "mean_token_accuracy": 0.4725289523296526, + "step": 2489 + }, + { + "epoch": 0.46162402669632924, + "grad_norm": 5.30859375, + "learning_rate": 9.538375973303672e-06, + "loss": 3.0194, + "mean_token_accuracy": 0.4134504275479547, + "step": 2490 + }, + { + "epoch": 0.4618094178717093, + "grad_norm": 7.19921875, + "learning_rate": 9.538190582128291e-06, + "loss": 2.3885, + "mean_token_accuracy": 0.4854383582314773, + "step": 2491 + }, + { + "epoch": 0.46199480904708934, + "grad_norm": 17.171875, + "learning_rate": 9.538005190952912e-06, + "loss": 2.308, + "mean_token_accuracy": 0.5088640190155491, + "step": 2492 + }, + { + "epoch": 0.4621802002224694, + "grad_norm": 5.63671875, + "learning_rate": 9.53781979977753e-06, + "loss": 2.9352, + "mean_token_accuracy": 0.42948717948717946, + "step": 2493 + }, + { + "epoch": 0.46236559139784944, + "grad_norm": 6.2734375, + "learning_rate": 9.537634408602151e-06, + "loss": 2.7943, + "mean_token_accuracy": 0.43605045436050455, + "step": 2494 + }, + { + "epoch": 0.4625509825732295, + "grad_norm": 8.21875, + "learning_rate": 9.537449017426771e-06, + "loss": 2.868, + "mean_token_accuracy": 0.42641439090315963, + "step": 2495 + }, + { + "epoch": 0.46273637374860954, + "grad_norm": 6.87109375, + "learning_rate": 9.53726362625139e-06, + "loss": 2.3044, + "mean_token_accuracy": 0.5143839899937461, + "step": 2496 + }, + { + "epoch": 0.4629217649239896, + "grad_norm": 7.29296875, + "learning_rate": 9.537078235076012e-06, + "loss": 2.463, + "mean_token_accuracy": 0.49532119372787053, + "step": 2497 + }, + { + "epoch": 0.46310715609936964, + "grad_norm": 7.12890625, + "learning_rate": 9.536892843900631e-06, + "loss": 2.5569, + "mean_token_accuracy": 0.47036823935558114, + "step": 2498 + }, + { + "epoch": 0.4632925472747497, + "grad_norm": 6.7265625, + "learning_rate": 9.536707452725252e-06, + "loss": 3.2376, + "mean_token_accuracy": 0.41187021033751836, + "step": 2499 + }, + { + "epoch": 0.4634779384501298, + "grad_norm": 7.58984375, + "learning_rate": 9.53652206154987e-06, + "loss": 2.7457, + "mean_token_accuracy": 0.45398856520555403, + "step": 2500 + }, + { + "epoch": 0.4636633296255098, + "grad_norm": 4.49609375, + "learning_rate": 9.536336670374491e-06, + "loss": 2.5008, + "mean_token_accuracy": 0.49551397077672393, + "step": 2501 + }, + { + "epoch": 0.4638487208008899, + "grad_norm": 5.359375, + "learning_rate": 9.536151279199111e-06, + "loss": 2.8365, + "mean_token_accuracy": 0.426619294889314, + "step": 2502 + }, + { + "epoch": 0.4640341119762699, + "grad_norm": 7.375, + "learning_rate": 9.53596588802373e-06, + "loss": 1.9702, + "mean_token_accuracy": 0.552683615819209, + "step": 2503 + }, + { + "epoch": 0.46421950315165, + "grad_norm": 6.2421875, + "learning_rate": 9.53578049684835e-06, + "loss": 3.0862, + "mean_token_accuracy": 0.4169428609306642, + "step": 2504 + }, + { + "epoch": 0.46440489432703, + "grad_norm": 6.49609375, + "learning_rate": 9.535595105672971e-06, + "loss": 2.8983, + "mean_token_accuracy": 0.41797346200241253, + "step": 2505 + }, + { + "epoch": 0.4645902855024101, + "grad_norm": 9.9453125, + "learning_rate": 9.535409714497592e-06, + "loss": 2.4401, + "mean_token_accuracy": 0.4834067928441794, + "step": 2506 + }, + { + "epoch": 0.46477567667779013, + "grad_norm": 6.71875, + "learning_rate": 9.53522432332221e-06, + "loss": 2.7282, + "mean_token_accuracy": 0.44796316964285715, + "step": 2507 + }, + { + "epoch": 0.4649610678531702, + "grad_norm": 6.34375, + "learning_rate": 9.535038932146831e-06, + "loss": 2.747, + "mean_token_accuracy": 0.436739785840524, + "step": 2508 + }, + { + "epoch": 0.46514645902855023, + "grad_norm": 6.4921875, + "learning_rate": 9.53485354097145e-06, + "loss": 2.8133, + "mean_token_accuracy": 0.4511774411279436, + "step": 2509 + }, + { + "epoch": 0.4653318502039303, + "grad_norm": 6.75390625, + "learning_rate": 9.53466814979607e-06, + "loss": 2.789, + "mean_token_accuracy": 0.43570419218087614, + "step": 2510 + }, + { + "epoch": 0.46551724137931033, + "grad_norm": 5.8671875, + "learning_rate": 9.53448275862069e-06, + "loss": 3.1694, + "mean_token_accuracy": 0.40257203320853, + "step": 2511 + }, + { + "epoch": 0.4657026325546904, + "grad_norm": 6.3828125, + "learning_rate": 9.53429736744531e-06, + "loss": 3.3972, + "mean_token_accuracy": 0.39360568383658967, + "step": 2512 + }, + { + "epoch": 0.46588802373007043, + "grad_norm": 6.6796875, + "learning_rate": 9.53411197626993e-06, + "loss": 2.8469, + "mean_token_accuracy": 0.43317972350230416, + "step": 2513 + }, + { + "epoch": 0.4660734149054505, + "grad_norm": 5.5234375, + "learning_rate": 9.53392658509455e-06, + "loss": 3.1231, + "mean_token_accuracy": 0.3945369916707496, + "step": 2514 + }, + { + "epoch": 0.46625880608083053, + "grad_norm": 6.4375, + "learning_rate": 9.533741193919171e-06, + "loss": 2.9609, + "mean_token_accuracy": 0.413142306102769, + "step": 2515 + }, + { + "epoch": 0.4664441972562106, + "grad_norm": 9.6875, + "learning_rate": 9.53355580274379e-06, + "loss": 2.7374, + "mean_token_accuracy": 0.4649122807017544, + "step": 2516 + }, + { + "epoch": 0.46662958843159064, + "grad_norm": 6.59375, + "learning_rate": 9.53337041156841e-06, + "loss": 2.9772, + "mean_token_accuracy": 0.43154246100519933, + "step": 2517 + }, + { + "epoch": 0.4668149796069707, + "grad_norm": 5.12890625, + "learning_rate": 9.533185020393029e-06, + "loss": 2.6205, + "mean_token_accuracy": 0.4678420310296192, + "step": 2518 + }, + { + "epoch": 0.46700037078235074, + "grad_norm": 7.30859375, + "learning_rate": 9.53299962921765e-06, + "loss": 2.3635, + "mean_token_accuracy": 0.497495183044316, + "step": 2519 + }, + { + "epoch": 0.4671857619577308, + "grad_norm": 5.89453125, + "learning_rate": 9.53281423804227e-06, + "loss": 2.9411, + "mean_token_accuracy": 0.43390482294504085, + "step": 2520 + }, + { + "epoch": 0.46737115313311084, + "grad_norm": 5.62109375, + "learning_rate": 9.53262884686689e-06, + "loss": 2.9649, + "mean_token_accuracy": 0.41629432455556614, + "step": 2521 + }, + { + "epoch": 0.4675565443084909, + "grad_norm": 5.3359375, + "learning_rate": 9.53244345569151e-06, + "loss": 2.3771, + "mean_token_accuracy": 0.494625, + "step": 2522 + }, + { + "epoch": 0.46774193548387094, + "grad_norm": 8.609375, + "learning_rate": 9.53225806451613e-06, + "loss": 3.0223, + "mean_token_accuracy": 0.424265605875153, + "step": 2523 + }, + { + "epoch": 0.467927326659251, + "grad_norm": 7.71875, + "learning_rate": 9.53207267334075e-06, + "loss": 2.8401, + "mean_token_accuracy": 0.4319723537693588, + "step": 2524 + }, + { + "epoch": 0.4681127178346311, + "grad_norm": 6.328125, + "learning_rate": 9.53188728216537e-06, + "loss": 2.7406, + "mean_token_accuracy": 0.4533976450560363, + "step": 2525 + }, + { + "epoch": 0.4682981090100111, + "grad_norm": 6.8203125, + "learning_rate": 9.53170189098999e-06, + "loss": 2.9489, + "mean_token_accuracy": 0.4349461668180049, + "step": 2526 + }, + { + "epoch": 0.4684835001853912, + "grad_norm": 7.328125, + "learning_rate": 9.531516499814608e-06, + "loss": 2.8208, + "mean_token_accuracy": 0.42471204188481676, + "step": 2527 + }, + { + "epoch": 0.4686688913607712, + "grad_norm": 5.5078125, + "learning_rate": 9.531331108639229e-06, + "loss": 2.9949, + "mean_token_accuracy": 0.4132737504137703, + "step": 2528 + }, + { + "epoch": 0.4688542825361513, + "grad_norm": 6.22265625, + "learning_rate": 9.53114571746385e-06, + "loss": 2.8006, + "mean_token_accuracy": 0.4358610914245216, + "step": 2529 + }, + { + "epoch": 0.4690396737115313, + "grad_norm": 5.265625, + "learning_rate": 9.53096032628847e-06, + "loss": 3.2051, + "mean_token_accuracy": 0.4030714210581677, + "step": 2530 + }, + { + "epoch": 0.4692250648869114, + "grad_norm": 5.21484375, + "learning_rate": 9.53077493511309e-06, + "loss": 2.8869, + "mean_token_accuracy": 0.44106123362812044, + "step": 2531 + }, + { + "epoch": 0.4694104560622914, + "grad_norm": 5.37890625, + "learning_rate": 9.53058954393771e-06, + "loss": 2.2242, + "mean_token_accuracy": 0.5402336860670194, + "step": 2532 + }, + { + "epoch": 0.4695958472376715, + "grad_norm": 5.4765625, + "learning_rate": 9.53040415276233e-06, + "loss": 2.8237, + "mean_token_accuracy": 0.43513177242393397, + "step": 2533 + }, + { + "epoch": 0.4697812384130515, + "grad_norm": 4.8984375, + "learning_rate": 9.530218761586949e-06, + "loss": 2.9234, + "mean_token_accuracy": 0.423139598044541, + "step": 2534 + }, + { + "epoch": 0.4699666295884316, + "grad_norm": 5.37890625, + "learning_rate": 9.530033370411569e-06, + "loss": 3.3577, + "mean_token_accuracy": 0.3962329961632368, + "step": 2535 + }, + { + "epoch": 0.47015202076381163, + "grad_norm": 6.08203125, + "learning_rate": 9.529847979236188e-06, + "loss": 3.0031, + "mean_token_accuracy": 0.4164496527777778, + "step": 2536 + }, + { + "epoch": 0.4703374119391917, + "grad_norm": 7.65625, + "learning_rate": 9.52966258806081e-06, + "loss": 2.822, + "mean_token_accuracy": 0.40602721970187944, + "step": 2537 + }, + { + "epoch": 0.47052280311457173, + "grad_norm": 5.77734375, + "learning_rate": 9.529477196885429e-06, + "loss": 3.0819, + "mean_token_accuracy": 0.42841490138787436, + "step": 2538 + }, + { + "epoch": 0.4707081942899518, + "grad_norm": 6.7109375, + "learning_rate": 9.52929180571005e-06, + "loss": 2.4889, + "mean_token_accuracy": 0.46345205479452056, + "step": 2539 + }, + { + "epoch": 0.47089358546533183, + "grad_norm": 9.0859375, + "learning_rate": 9.52910641453467e-06, + "loss": 2.8023, + "mean_token_accuracy": 0.4428508707741841, + "step": 2540 + }, + { + "epoch": 0.4710789766407119, + "grad_norm": 6.16796875, + "learning_rate": 9.528921023359289e-06, + "loss": 3.001, + "mean_token_accuracy": 0.4205665024630542, + "step": 2541 + }, + { + "epoch": 0.47126436781609193, + "grad_norm": 8.0078125, + "learning_rate": 9.528735632183909e-06, + "loss": 3.2606, + "mean_token_accuracy": 0.40451467268623026, + "step": 2542 + }, + { + "epoch": 0.471449758991472, + "grad_norm": 9.1875, + "learning_rate": 9.528550241008528e-06, + "loss": 2.4531, + "mean_token_accuracy": 0.467595818815331, + "step": 2543 + }, + { + "epoch": 0.47163515016685204, + "grad_norm": 5.53125, + "learning_rate": 9.528364849833148e-06, + "loss": 2.7229, + "mean_token_accuracy": 0.4499524865378524, + "step": 2544 + }, + { + "epoch": 0.4718205413422321, + "grad_norm": 5.14453125, + "learning_rate": 9.528179458657769e-06, + "loss": 2.6169, + "mean_token_accuracy": 0.4622204578536456, + "step": 2545 + }, + { + "epoch": 0.47200593251761214, + "grad_norm": 7.33984375, + "learning_rate": 9.52799406748239e-06, + "loss": 3.1118, + "mean_token_accuracy": 0.40165752305357766, + "step": 2546 + }, + { + "epoch": 0.4721913236929922, + "grad_norm": 7.00390625, + "learning_rate": 9.527808676307008e-06, + "loss": 2.9195, + "mean_token_accuracy": 0.4091026635634859, + "step": 2547 + }, + { + "epoch": 0.47237671486837224, + "grad_norm": 5.65234375, + "learning_rate": 9.527623285131629e-06, + "loss": 2.8457, + "mean_token_accuracy": 0.442680262199563, + "step": 2548 + }, + { + "epoch": 0.4725621060437523, + "grad_norm": 6.60546875, + "learning_rate": 9.527437893956249e-06, + "loss": 2.8663, + "mean_token_accuracy": 0.4360231832367365, + "step": 2549 + }, + { + "epoch": 0.4727474972191324, + "grad_norm": 5.76953125, + "learning_rate": 9.527252502780868e-06, + "loss": 2.7839, + "mean_token_accuracy": 0.45588972431077696, + "step": 2550 + }, + { + "epoch": 0.4729328883945124, + "grad_norm": 5.3671875, + "learning_rate": 9.527067111605488e-06, + "loss": 3.3261, + "mean_token_accuracy": 0.3899767218951116, + "step": 2551 + }, + { + "epoch": 0.4731182795698925, + "grad_norm": 7.52734375, + "learning_rate": 9.526881720430107e-06, + "loss": 2.8318, + "mean_token_accuracy": 0.43180246913580245, + "step": 2552 + }, + { + "epoch": 0.4733036707452725, + "grad_norm": 5.60546875, + "learning_rate": 9.52669632925473e-06, + "loss": 3.0814, + "mean_token_accuracy": 0.4179741433412828, + "step": 2553 + }, + { + "epoch": 0.4734890619206526, + "grad_norm": 5.6015625, + "learning_rate": 9.526510938079348e-06, + "loss": 2.7128, + "mean_token_accuracy": 0.45361599759434673, + "step": 2554 + }, + { + "epoch": 0.4736744530960326, + "grad_norm": 5.703125, + "learning_rate": 9.526325546903969e-06, + "loss": 3.2983, + "mean_token_accuracy": 0.3863498789346247, + "step": 2555 + }, + { + "epoch": 0.4738598442714127, + "grad_norm": 5.09765625, + "learning_rate": 9.526140155728587e-06, + "loss": 2.8733, + "mean_token_accuracy": 0.43207261724659607, + "step": 2556 + }, + { + "epoch": 0.4740452354467927, + "grad_norm": 5.56640625, + "learning_rate": 9.525954764553208e-06, + "loss": 3.0442, + "mean_token_accuracy": 0.42383275261324044, + "step": 2557 + }, + { + "epoch": 0.4742306266221728, + "grad_norm": 6.015625, + "learning_rate": 9.525769373377828e-06, + "loss": 2.6299, + "mean_token_accuracy": 0.4565102793885082, + "step": 2558 + }, + { + "epoch": 0.4744160177975528, + "grad_norm": 4.765625, + "learning_rate": 9.525583982202447e-06, + "loss": 2.909, + "mean_token_accuracy": 0.4377313432835821, + "step": 2559 + }, + { + "epoch": 0.4746014089729329, + "grad_norm": 5.94140625, + "learning_rate": 9.525398591027068e-06, + "loss": 2.9877, + "mean_token_accuracy": 0.4233587128286166, + "step": 2560 + }, + { + "epoch": 0.4747868001483129, + "grad_norm": 9.765625, + "learning_rate": 9.525213199851688e-06, + "loss": 2.5516, + "mean_token_accuracy": 0.4771517525359044, + "step": 2561 + }, + { + "epoch": 0.474972191323693, + "grad_norm": 9.1875, + "learning_rate": 9.525027808676309e-06, + "loss": 3.4366, + "mean_token_accuracy": 0.3910933485583785, + "step": 2562 + }, + { + "epoch": 0.47515758249907303, + "grad_norm": 7.6484375, + "learning_rate": 9.524842417500928e-06, + "loss": 2.6874, + "mean_token_accuracy": 0.46093085429578323, + "step": 2563 + }, + { + "epoch": 0.4753429736744531, + "grad_norm": 6.0078125, + "learning_rate": 9.524657026325548e-06, + "loss": 2.7747, + "mean_token_accuracy": 0.4500293083235639, + "step": 2564 + }, + { + "epoch": 0.47552836484983313, + "grad_norm": 15.1484375, + "learning_rate": 9.524471635150167e-06, + "loss": 2.1854, + "mean_token_accuracy": 0.5015339305436249, + "step": 2565 + }, + { + "epoch": 0.4757137560252132, + "grad_norm": 6.10546875, + "learning_rate": 9.524286243974787e-06, + "loss": 2.9815, + "mean_token_accuracy": 0.40945397407358913, + "step": 2566 + }, + { + "epoch": 0.47589914720059323, + "grad_norm": 4.8828125, + "learning_rate": 9.524100852799408e-06, + "loss": 2.9804, + "mean_token_accuracy": 0.40863838260070456, + "step": 2567 + }, + { + "epoch": 0.4760845383759733, + "grad_norm": 5.58203125, + "learning_rate": 9.523915461624027e-06, + "loss": 2.7612, + "mean_token_accuracy": 0.4418451833355237, + "step": 2568 + }, + { + "epoch": 0.47626992955135333, + "grad_norm": 7.234375, + "learning_rate": 9.523730070448649e-06, + "loss": 2.4868, + "mean_token_accuracy": 0.4754227912122649, + "step": 2569 + }, + { + "epoch": 0.4764553207267334, + "grad_norm": 11.84375, + "learning_rate": 9.523544679273268e-06, + "loss": 2.5776, + "mean_token_accuracy": 0.44609753217617193, + "step": 2570 + }, + { + "epoch": 0.47664071190211343, + "grad_norm": 7.37109375, + "learning_rate": 9.523359288097888e-06, + "loss": 2.9286, + "mean_token_accuracy": 0.4316546762589928, + "step": 2571 + }, + { + "epoch": 0.4768261030774935, + "grad_norm": 9.1171875, + "learning_rate": 9.523173896922507e-06, + "loss": 2.6578, + "mean_token_accuracy": 0.4539438856537851, + "step": 2572 + }, + { + "epoch": 0.47701149425287354, + "grad_norm": 6.38671875, + "learning_rate": 9.522988505747127e-06, + "loss": 2.7293, + "mean_token_accuracy": 0.45932469012679866, + "step": 2573 + }, + { + "epoch": 0.4771968854282536, + "grad_norm": 7.27734375, + "learning_rate": 9.522803114571746e-06, + "loss": 2.2807, + "mean_token_accuracy": 0.5143958559209734, + "step": 2574 + }, + { + "epoch": 0.4773822766036337, + "grad_norm": 9.8359375, + "learning_rate": 9.522617723396367e-06, + "loss": 2.2991, + "mean_token_accuracy": 0.48370237931508736, + "step": 2575 + }, + { + "epoch": 0.4775676677790137, + "grad_norm": 6.01171875, + "learning_rate": 9.522432332220987e-06, + "loss": 2.7155, + "mean_token_accuracy": 0.46001386001386, + "step": 2576 + }, + { + "epoch": 0.4777530589543938, + "grad_norm": 8.609375, + "learning_rate": 9.522246941045608e-06, + "loss": 2.3429, + "mean_token_accuracy": 0.49522673031026254, + "step": 2577 + }, + { + "epoch": 0.4779384501297738, + "grad_norm": 8.359375, + "learning_rate": 9.522061549870228e-06, + "loss": 2.4308, + "mean_token_accuracy": 0.47777003484320557, + "step": 2578 + }, + { + "epoch": 0.4781238413051539, + "grad_norm": 7.51953125, + "learning_rate": 9.521876158694847e-06, + "loss": 2.6773, + "mean_token_accuracy": 0.45350270167788415, + "step": 2579 + }, + { + "epoch": 0.4783092324805339, + "grad_norm": 7.32421875, + "learning_rate": 9.521690767519467e-06, + "loss": 3.1179, + "mean_token_accuracy": 0.41104594330400784, + "step": 2580 + }, + { + "epoch": 0.478494623655914, + "grad_norm": 6.58203125, + "learning_rate": 9.521505376344086e-06, + "loss": 2.5288, + "mean_token_accuracy": 0.4715392838054516, + "step": 2581 + }, + { + "epoch": 0.478680014831294, + "grad_norm": 7.203125, + "learning_rate": 9.521319985168707e-06, + "loss": 2.8641, + "mean_token_accuracy": 0.43681248401125605, + "step": 2582 + }, + { + "epoch": 0.4788654060066741, + "grad_norm": 5.4765625, + "learning_rate": 9.521134593993327e-06, + "loss": 2.7677, + "mean_token_accuracy": 0.44071315178898524, + "step": 2583 + }, + { + "epoch": 0.4790507971820541, + "grad_norm": 6.39453125, + "learning_rate": 9.520949202817946e-06, + "loss": 2.9314, + "mean_token_accuracy": 0.42054263565891475, + "step": 2584 + }, + { + "epoch": 0.4792361883574342, + "grad_norm": 5.1640625, + "learning_rate": 9.520763811642566e-06, + "loss": 3.0261, + "mean_token_accuracy": 0.420222503872694, + "step": 2585 + }, + { + "epoch": 0.4794215795328142, + "grad_norm": 6.7421875, + "learning_rate": 9.520578420467187e-06, + "loss": 3.0652, + "mean_token_accuracy": 0.4460624071322437, + "step": 2586 + }, + { + "epoch": 0.4796069707081943, + "grad_norm": 4.5078125, + "learning_rate": 9.520393029291807e-06, + "loss": 2.6088, + "mean_token_accuracy": 0.46253943217665616, + "step": 2587 + }, + { + "epoch": 0.4797923618835743, + "grad_norm": 5.88671875, + "learning_rate": 9.520207638116426e-06, + "loss": 3.6518, + "mean_token_accuracy": 0.374085033483881, + "step": 2588 + }, + { + "epoch": 0.4799777530589544, + "grad_norm": 6.9296875, + "learning_rate": 9.520022246941047e-06, + "loss": 3.0874, + "mean_token_accuracy": 0.41317710334229735, + "step": 2589 + }, + { + "epoch": 0.4801631442343344, + "grad_norm": 5.2890625, + "learning_rate": 9.519836855765666e-06, + "loss": 2.8353, + "mean_token_accuracy": 0.4370590419606387, + "step": 2590 + }, + { + "epoch": 0.4803485354097145, + "grad_norm": 5.0, + "learning_rate": 9.519651464590286e-06, + "loss": 2.6998, + "mean_token_accuracy": 0.44785600847009, + "step": 2591 + }, + { + "epoch": 0.48053392658509453, + "grad_norm": 5.69921875, + "learning_rate": 9.519466073414907e-06, + "loss": 2.5143, + "mean_token_accuracy": 0.5007346459006758, + "step": 2592 + }, + { + "epoch": 0.4807193177604746, + "grad_norm": 6.3125, + "learning_rate": 9.519280682239527e-06, + "loss": 2.726, + "mean_token_accuracy": 0.4499440089585666, + "step": 2593 + }, + { + "epoch": 0.48090470893585463, + "grad_norm": 6.98046875, + "learning_rate": 9.519095291064146e-06, + "loss": 2.6136, + "mean_token_accuracy": 0.4663587065675838, + "step": 2594 + }, + { + "epoch": 0.4810901001112347, + "grad_norm": 6.859375, + "learning_rate": 9.518909899888766e-06, + "loss": 2.7489, + "mean_token_accuracy": 0.44330023292877285, + "step": 2595 + }, + { + "epoch": 0.48127549128661473, + "grad_norm": 4.94140625, + "learning_rate": 9.518724508713387e-06, + "loss": 2.8618, + "mean_token_accuracy": 0.4410502398384246, + "step": 2596 + }, + { + "epoch": 0.4814608824619948, + "grad_norm": 7.0, + "learning_rate": 9.518539117538006e-06, + "loss": 2.785, + "mean_token_accuracy": 0.4428932322829411, + "step": 2597 + }, + { + "epoch": 0.48164627363737483, + "grad_norm": 7.4453125, + "learning_rate": 9.518353726362626e-06, + "loss": 2.8649, + "mean_token_accuracy": 0.4088635732147819, + "step": 2598 + }, + { + "epoch": 0.4818316648127549, + "grad_norm": 5.3515625, + "learning_rate": 9.518168335187245e-06, + "loss": 3.4449, + "mean_token_accuracy": 0.3666745116498, + "step": 2599 + }, + { + "epoch": 0.482017055988135, + "grad_norm": 6.28515625, + "learning_rate": 9.517982944011865e-06, + "loss": 3.191, + "mean_token_accuracy": 0.41676519271695434, + "step": 2600 + }, + { + "epoch": 0.482202447163515, + "grad_norm": 6.23046875, + "learning_rate": 9.517797552836486e-06, + "loss": 2.7283, + "mean_token_accuracy": 0.4406799259944496, + "step": 2601 + }, + { + "epoch": 0.4823878383388951, + "grad_norm": 9.4375, + "learning_rate": 9.517612161661106e-06, + "loss": 2.4779, + "mean_token_accuracy": 0.48483920367534455, + "step": 2602 + }, + { + "epoch": 0.4825732295142751, + "grad_norm": 5.5234375, + "learning_rate": 9.517426770485725e-06, + "loss": 2.9092, + "mean_token_accuracy": 0.42191036668175647, + "step": 2603 + }, + { + "epoch": 0.4827586206896552, + "grad_norm": 5.578125, + "learning_rate": 9.517241379310346e-06, + "loss": 3.0931, + "mean_token_accuracy": 0.4219535389213422, + "step": 2604 + }, + { + "epoch": 0.4829440118650352, + "grad_norm": 8.4765625, + "learning_rate": 9.517055988134966e-06, + "loss": 2.9972, + "mean_token_accuracy": 0.41296813862493015, + "step": 2605 + }, + { + "epoch": 0.4831294030404153, + "grad_norm": 9.1640625, + "learning_rate": 9.516870596959585e-06, + "loss": 3.2786, + "mean_token_accuracy": 0.39085204755614267, + "step": 2606 + }, + { + "epoch": 0.4833147942157953, + "grad_norm": 5.3671875, + "learning_rate": 9.516685205784205e-06, + "loss": 2.6184, + "mean_token_accuracy": 0.44262749445676275, + "step": 2607 + }, + { + "epoch": 0.4835001853911754, + "grad_norm": 6.28125, + "learning_rate": 9.516499814608824e-06, + "loss": 3.1467, + "mean_token_accuracy": 0.39843943522417635, + "step": 2608 + }, + { + "epoch": 0.4836855765665554, + "grad_norm": 6.33984375, + "learning_rate": 9.516314423433445e-06, + "loss": 2.7146, + "mean_token_accuracy": 0.4528700906344411, + "step": 2609 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 7.44921875, + "learning_rate": 9.516129032258065e-06, + "loss": 2.6833, + "mean_token_accuracy": 0.45627687896784763, + "step": 2610 + }, + { + "epoch": 0.4840563589173155, + "grad_norm": 6.20703125, + "learning_rate": 9.515943641082686e-06, + "loss": 3.2867, + "mean_token_accuracy": 0.3927054708968274, + "step": 2611 + }, + { + "epoch": 0.4842417500926956, + "grad_norm": 5.28125, + "learning_rate": 9.515758249907305e-06, + "loss": 2.9137, + "mean_token_accuracy": 0.450512682137075, + "step": 2612 + }, + { + "epoch": 0.4844271412680756, + "grad_norm": 7.13671875, + "learning_rate": 9.515572858731925e-06, + "loss": 2.8444, + "mean_token_accuracy": 0.43803859691710156, + "step": 2613 + }, + { + "epoch": 0.4846125324434557, + "grad_norm": 7.19921875, + "learning_rate": 9.515387467556545e-06, + "loss": 2.7975, + "mean_token_accuracy": 0.4414752305047664, + "step": 2614 + }, + { + "epoch": 0.4847979236188357, + "grad_norm": 7.40234375, + "learning_rate": 9.515202076381164e-06, + "loss": 3.154, + "mean_token_accuracy": 0.408891671884784, + "step": 2615 + }, + { + "epoch": 0.4849833147942158, + "grad_norm": 8.1484375, + "learning_rate": 9.515016685205785e-06, + "loss": 2.5771, + "mean_token_accuracy": 0.48056780595369347, + "step": 2616 + }, + { + "epoch": 0.4851687059695958, + "grad_norm": 7.91796875, + "learning_rate": 9.514831294030404e-06, + "loss": 2.6237, + "mean_token_accuracy": 0.44602415702063414, + "step": 2617 + }, + { + "epoch": 0.4853540971449759, + "grad_norm": 8.375, + "learning_rate": 9.514645902855026e-06, + "loss": 2.9895, + "mean_token_accuracy": 0.4290997013227137, + "step": 2618 + }, + { + "epoch": 0.4855394883203559, + "grad_norm": 10.25, + "learning_rate": 9.514460511679645e-06, + "loss": 3.4011, + "mean_token_accuracy": 0.378950378950379, + "step": 2619 + }, + { + "epoch": 0.485724879495736, + "grad_norm": 8.4140625, + "learning_rate": 9.514275120504265e-06, + "loss": 2.6076, + "mean_token_accuracy": 0.4735290783357896, + "step": 2620 + }, + { + "epoch": 0.48591027067111603, + "grad_norm": 6.03515625, + "learning_rate": 9.514089729328886e-06, + "loss": 2.7027, + "mean_token_accuracy": 0.43832715722604554, + "step": 2621 + }, + { + "epoch": 0.4860956618464961, + "grad_norm": 7.0, + "learning_rate": 9.513904338153504e-06, + "loss": 2.8333, + "mean_token_accuracy": 0.4431761786600496, + "step": 2622 + }, + { + "epoch": 0.4862810530218762, + "grad_norm": 9.71875, + "learning_rate": 9.513718946978125e-06, + "loss": 2.7316, + "mean_token_accuracy": 0.4498026640355205, + "step": 2623 + }, + { + "epoch": 0.4864664441972562, + "grad_norm": 8.3203125, + "learning_rate": 9.513533555802744e-06, + "loss": 2.6717, + "mean_token_accuracy": 0.4897985705003249, + "step": 2624 + }, + { + "epoch": 0.4866518353726363, + "grad_norm": 5.35546875, + "learning_rate": 9.513348164627364e-06, + "loss": 3.2429, + "mean_token_accuracy": 0.3886930776822219, + "step": 2625 + }, + { + "epoch": 0.4868372265480163, + "grad_norm": 9.0, + "learning_rate": 9.513162773451985e-06, + "loss": 2.8459, + "mean_token_accuracy": 0.45525027203482044, + "step": 2626 + }, + { + "epoch": 0.4870226177233964, + "grad_norm": 10.0390625, + "learning_rate": 9.512977382276605e-06, + "loss": 3.2139, + "mean_token_accuracy": 0.39025394646533973, + "step": 2627 + }, + { + "epoch": 0.4872080088987764, + "grad_norm": 6.40234375, + "learning_rate": 9.512791991101224e-06, + "loss": 3.1823, + "mean_token_accuracy": 0.3923456790123457, + "step": 2628 + }, + { + "epoch": 0.4873934000741565, + "grad_norm": 9.203125, + "learning_rate": 9.512606599925844e-06, + "loss": 2.0783, + "mean_token_accuracy": 0.5319126710540061, + "step": 2629 + }, + { + "epoch": 0.4875787912495365, + "grad_norm": 8.734375, + "learning_rate": 9.512421208750465e-06, + "loss": 2.5481, + "mean_token_accuracy": 0.46162112373349706, + "step": 2630 + }, + { + "epoch": 0.4877641824249166, + "grad_norm": 9.40625, + "learning_rate": 9.512235817575084e-06, + "loss": 2.5611, + "mean_token_accuracy": 0.45832326648949023, + "step": 2631 + }, + { + "epoch": 0.4879495736002966, + "grad_norm": 9.3125, + "learning_rate": 9.512050426399704e-06, + "loss": 2.6694, + "mean_token_accuracy": 0.4398270812996793, + "step": 2632 + }, + { + "epoch": 0.4881349647756767, + "grad_norm": 6.90234375, + "learning_rate": 9.511865035224323e-06, + "loss": 3.3448, + "mean_token_accuracy": 0.39632380712577114, + "step": 2633 + }, + { + "epoch": 0.4883203559510567, + "grad_norm": 5.89453125, + "learning_rate": 9.511679644048945e-06, + "loss": 2.9128, + "mean_token_accuracy": 0.428627399764921, + "step": 2634 + }, + { + "epoch": 0.4885057471264368, + "grad_norm": 9.4921875, + "learning_rate": 9.511494252873564e-06, + "loss": 2.8528, + "mean_token_accuracy": 0.4206848357791754, + "step": 2635 + }, + { + "epoch": 0.4886911383018168, + "grad_norm": 10.640625, + "learning_rate": 9.511308861698184e-06, + "loss": 2.6495, + "mean_token_accuracy": 0.45034324942791765, + "step": 2636 + }, + { + "epoch": 0.4888765294771969, + "grad_norm": 8.0546875, + "learning_rate": 9.511123470522803e-06, + "loss": 2.6312, + "mean_token_accuracy": 0.4571123321123321, + "step": 2637 + }, + { + "epoch": 0.4890619206525769, + "grad_norm": 6.859375, + "learning_rate": 9.510938079347424e-06, + "loss": 2.8677, + "mean_token_accuracy": 0.43048845947396674, + "step": 2638 + }, + { + "epoch": 0.489247311827957, + "grad_norm": 7.72265625, + "learning_rate": 9.510752688172044e-06, + "loss": 3.0873, + "mean_token_accuracy": 0.4007341206511331, + "step": 2639 + }, + { + "epoch": 0.489432703003337, + "grad_norm": 8.375, + "learning_rate": 9.510567296996663e-06, + "loss": 3.0211, + "mean_token_accuracy": 0.40501132502831255, + "step": 2640 + }, + { + "epoch": 0.4896180941787171, + "grad_norm": 7.2578125, + "learning_rate": 9.510381905821284e-06, + "loss": 2.8712, + "mean_token_accuracy": 0.44693473961766644, + "step": 2641 + }, + { + "epoch": 0.4898034853540971, + "grad_norm": 6.77734375, + "learning_rate": 9.510196514645904e-06, + "loss": 2.7596, + "mean_token_accuracy": 0.43508673754896476, + "step": 2642 + }, + { + "epoch": 0.4899888765294772, + "grad_norm": 7.62890625, + "learning_rate": 9.510011123470524e-06, + "loss": 2.5579, + "mean_token_accuracy": 0.4839506172839506, + "step": 2643 + }, + { + "epoch": 0.4901742677048572, + "grad_norm": 8.1171875, + "learning_rate": 9.509825732295143e-06, + "loss": 2.8385, + "mean_token_accuracy": 0.4312160129584635, + "step": 2644 + }, + { + "epoch": 0.4903596588802373, + "grad_norm": 6.19921875, + "learning_rate": 9.509640341119764e-06, + "loss": 2.9947, + "mean_token_accuracy": 0.4269778030734206, + "step": 2645 + }, + { + "epoch": 0.4905450500556173, + "grad_norm": 5.9609375, + "learning_rate": 9.509454949944383e-06, + "loss": 2.9403, + "mean_token_accuracy": 0.4356181150550796, + "step": 2646 + }, + { + "epoch": 0.4907304412309974, + "grad_norm": 9.0703125, + "learning_rate": 9.509269558769003e-06, + "loss": 2.8397, + "mean_token_accuracy": 0.46260843553694286, + "step": 2647 + }, + { + "epoch": 0.4909158324063775, + "grad_norm": 10.15625, + "learning_rate": 9.509084167593624e-06, + "loss": 2.9034, + "mean_token_accuracy": 0.42648767264747645, + "step": 2648 + }, + { + "epoch": 0.4911012235817575, + "grad_norm": 7.3828125, + "learning_rate": 9.508898776418242e-06, + "loss": 2.739, + "mean_token_accuracy": 0.45829566003616634, + "step": 2649 + }, + { + "epoch": 0.4912866147571376, + "grad_norm": 5.94140625, + "learning_rate": 9.508713385242865e-06, + "loss": 3.1552, + "mean_token_accuracy": 0.4168056018672891, + "step": 2650 + }, + { + "epoch": 0.4914720059325176, + "grad_norm": 10.7890625, + "learning_rate": 9.508527994067483e-06, + "loss": 3.2833, + "mean_token_accuracy": 0.40257744147552615, + "step": 2651 + }, + { + "epoch": 0.4916573971078977, + "grad_norm": 11.8515625, + "learning_rate": 9.508342602892104e-06, + "loss": 3.1695, + "mean_token_accuracy": 0.40180666353824496, + "step": 2652 + }, + { + "epoch": 0.4918427882832777, + "grad_norm": 9.828125, + "learning_rate": 9.508157211716723e-06, + "loss": 2.7199, + "mean_token_accuracy": 0.4552501033484911, + "step": 2653 + }, + { + "epoch": 0.4920281794586578, + "grad_norm": 6.8984375, + "learning_rate": 9.507971820541343e-06, + "loss": 2.5699, + "mean_token_accuracy": 0.46013491098086917, + "step": 2654 + }, + { + "epoch": 0.4922135706340378, + "grad_norm": 8.9296875, + "learning_rate": 9.507786429365962e-06, + "loss": 3.3893, + "mean_token_accuracy": 0.3743869209809264, + "step": 2655 + }, + { + "epoch": 0.4923989618094179, + "grad_norm": 8.0546875, + "learning_rate": 9.507601038190582e-06, + "loss": 2.3037, + "mean_token_accuracy": 0.4937106918238994, + "step": 2656 + }, + { + "epoch": 0.4925843529847979, + "grad_norm": 9.3359375, + "learning_rate": 9.507415647015203e-06, + "loss": 3.2925, + "mean_token_accuracy": 0.3945649333681735, + "step": 2657 + }, + { + "epoch": 0.492769744160178, + "grad_norm": 10.0859375, + "learning_rate": 9.507230255839823e-06, + "loss": 2.4257, + "mean_token_accuracy": 0.4860200668896321, + "step": 2658 + }, + { + "epoch": 0.492955135335558, + "grad_norm": 7.91796875, + "learning_rate": 9.507044864664444e-06, + "loss": 2.6985, + "mean_token_accuracy": 0.44668624696718173, + "step": 2659 + }, + { + "epoch": 0.4931405265109381, + "grad_norm": 7.203125, + "learning_rate": 9.506859473489063e-06, + "loss": 2.6933, + "mean_token_accuracy": 0.47582001682085784, + "step": 2660 + }, + { + "epoch": 0.4933259176863181, + "grad_norm": 6.875, + "learning_rate": 9.506674082313683e-06, + "loss": 2.6479, + "mean_token_accuracy": 0.47790279549936204, + "step": 2661 + }, + { + "epoch": 0.4935113088616982, + "grad_norm": 8.0625, + "learning_rate": 9.506488691138302e-06, + "loss": 2.5836, + "mean_token_accuracy": 0.4588976674191121, + "step": 2662 + }, + { + "epoch": 0.4936967000370782, + "grad_norm": 5.453125, + "learning_rate": 9.506303299962922e-06, + "loss": 2.9709, + "mean_token_accuracy": 0.4241708957866163, + "step": 2663 + }, + { + "epoch": 0.4938820912124583, + "grad_norm": 6.23828125, + "learning_rate": 9.506117908787543e-06, + "loss": 2.7588, + "mean_token_accuracy": 0.44397024753078895, + "step": 2664 + }, + { + "epoch": 0.4940674823878383, + "grad_norm": 7.92578125, + "learning_rate": 9.505932517612162e-06, + "loss": 2.7507, + "mean_token_accuracy": 0.4496847414880202, + "step": 2665 + }, + { + "epoch": 0.4942528735632184, + "grad_norm": 5.56640625, + "learning_rate": 9.505747126436782e-06, + "loss": 3.2054, + "mean_token_accuracy": 0.41854815661617606, + "step": 2666 + }, + { + "epoch": 0.4944382647385984, + "grad_norm": 5.14453125, + "learning_rate": 9.505561735261403e-06, + "loss": 2.8713, + "mean_token_accuracy": 0.4441292356185973, + "step": 2667 + }, + { + "epoch": 0.4946236559139785, + "grad_norm": 6.95703125, + "learning_rate": 9.505376344086023e-06, + "loss": 2.5086, + "mean_token_accuracy": 0.46820491690701826, + "step": 2668 + }, + { + "epoch": 0.4948090470893585, + "grad_norm": 5.3828125, + "learning_rate": 9.505190952910642e-06, + "loss": 2.6267, + "mean_token_accuracy": 0.4463042313872523, + "step": 2669 + }, + { + "epoch": 0.4949944382647386, + "grad_norm": 5.78125, + "learning_rate": 9.505005561735263e-06, + "loss": 2.7403, + "mean_token_accuracy": 0.4408957415565345, + "step": 2670 + }, + { + "epoch": 0.4951798294401186, + "grad_norm": 8.4453125, + "learning_rate": 9.504820170559881e-06, + "loss": 2.5424, + "mean_token_accuracy": 0.46377183967112023, + "step": 2671 + }, + { + "epoch": 0.4953652206154987, + "grad_norm": 4.98046875, + "learning_rate": 9.504634779384502e-06, + "loss": 3.6251, + "mean_token_accuracy": 0.36990865126276196, + "step": 2672 + }, + { + "epoch": 0.4955506117908788, + "grad_norm": 8.328125, + "learning_rate": 9.504449388209122e-06, + "loss": 2.6056, + "mean_token_accuracy": 0.4459787750926991, + "step": 2673 + }, + { + "epoch": 0.4957360029662588, + "grad_norm": 6.75390625, + "learning_rate": 9.504263997033743e-06, + "loss": 2.7746, + "mean_token_accuracy": 0.4481585564265641, + "step": 2674 + }, + { + "epoch": 0.4959213941416389, + "grad_norm": 9.2109375, + "learning_rate": 9.504078605858362e-06, + "loss": 2.5874, + "mean_token_accuracy": 0.45452066003000136, + "step": 2675 + }, + { + "epoch": 0.4961067853170189, + "grad_norm": 5.12109375, + "learning_rate": 9.503893214682982e-06, + "loss": 2.5571, + "mean_token_accuracy": 0.4779324055666004, + "step": 2676 + }, + { + "epoch": 0.496292176492399, + "grad_norm": 5.68359375, + "learning_rate": 9.503707823507603e-06, + "loss": 2.579, + "mean_token_accuracy": 0.4846173086543272, + "step": 2677 + }, + { + "epoch": 0.496477567667779, + "grad_norm": 5.39453125, + "learning_rate": 9.503522432332221e-06, + "loss": 2.6925, + "mean_token_accuracy": 0.4710943396226415, + "step": 2678 + }, + { + "epoch": 0.4966629588431591, + "grad_norm": 6.53515625, + "learning_rate": 9.503337041156842e-06, + "loss": 3.1251, + "mean_token_accuracy": 0.40388460439874324, + "step": 2679 + }, + { + "epoch": 0.4968483500185391, + "grad_norm": 5.42578125, + "learning_rate": 9.50315164998146e-06, + "loss": 2.7812, + "mean_token_accuracy": 0.45014992503748125, + "step": 2680 + }, + { + "epoch": 0.4970337411939192, + "grad_norm": 5.76953125, + "learning_rate": 9.502966258806081e-06, + "loss": 3.1132, + "mean_token_accuracy": 0.42337114217884464, + "step": 2681 + }, + { + "epoch": 0.4972191323692992, + "grad_norm": 6.53515625, + "learning_rate": 9.502780867630702e-06, + "loss": 3.1612, + "mean_token_accuracy": 0.41472089067573836, + "step": 2682 + }, + { + "epoch": 0.4974045235446793, + "grad_norm": 6.48828125, + "learning_rate": 9.502595476455322e-06, + "loss": 3.0188, + "mean_token_accuracy": 0.41335342529268576, + "step": 2683 + }, + { + "epoch": 0.4975899147200593, + "grad_norm": 6.6328125, + "learning_rate": 9.502410085279941e-06, + "loss": 2.4824, + "mean_token_accuracy": 0.48311729876780285, + "step": 2684 + }, + { + "epoch": 0.4977753058954394, + "grad_norm": 5.77734375, + "learning_rate": 9.502224694104561e-06, + "loss": 2.7865, + "mean_token_accuracy": 0.45614275909402885, + "step": 2685 + }, + { + "epoch": 0.4979606970708194, + "grad_norm": 5.45703125, + "learning_rate": 9.502039302929182e-06, + "loss": 2.9237, + "mean_token_accuracy": 0.4332480818414322, + "step": 2686 + }, + { + "epoch": 0.4981460882461995, + "grad_norm": 5.42578125, + "learning_rate": 9.5018539117538e-06, + "loss": 3.0401, + "mean_token_accuracy": 0.4134461134606971, + "step": 2687 + }, + { + "epoch": 0.4983314794215795, + "grad_norm": 7.046875, + "learning_rate": 9.501668520578421e-06, + "loss": 2.8413, + "mean_token_accuracy": 0.42769500438212094, + "step": 2688 + }, + { + "epoch": 0.4985168705969596, + "grad_norm": 9.171875, + "learning_rate": 9.50148312940304e-06, + "loss": 2.9259, + "mean_token_accuracy": 0.4205286239184544, + "step": 2689 + }, + { + "epoch": 0.4987022617723396, + "grad_norm": 8.6640625, + "learning_rate": 9.501297738227662e-06, + "loss": 2.8987, + "mean_token_accuracy": 0.4297736506094022, + "step": 2690 + }, + { + "epoch": 0.4988876529477197, + "grad_norm": 5.125, + "learning_rate": 9.501112347052281e-06, + "loss": 2.5725, + "mean_token_accuracy": 0.4769962397699624, + "step": 2691 + }, + { + "epoch": 0.4990730441230997, + "grad_norm": 8.828125, + "learning_rate": 9.500926955876901e-06, + "loss": 3.1194, + "mean_token_accuracy": 0.40703212078224743, + "step": 2692 + }, + { + "epoch": 0.4992584352984798, + "grad_norm": 6.15625, + "learning_rate": 9.50074156470152e-06, + "loss": 2.9328, + "mean_token_accuracy": 0.42680071615479964, + "step": 2693 + }, + { + "epoch": 0.4994438264738598, + "grad_norm": 7.77734375, + "learning_rate": 9.50055617352614e-06, + "loss": 2.282, + "mean_token_accuracy": 0.5137240800671221, + "step": 2694 + }, + { + "epoch": 0.4996292176492399, + "grad_norm": 7.3515625, + "learning_rate": 9.500370782350761e-06, + "loss": 2.8449, + "mean_token_accuracy": 0.4427303283633543, + "step": 2695 + }, + { + "epoch": 0.4998146088246199, + "grad_norm": 10.6953125, + "learning_rate": 9.50018539117538e-06, + "loss": 2.6711, + "mean_token_accuracy": 0.4525009693679721, + "step": 2696 + }, + { + "epoch": 0.5, + "grad_norm": 10.8984375, + "learning_rate": 9.5e-06, + "loss": 2.7526, + "mean_token_accuracy": 0.43572216097023153, + "step": 2697 + }, + { + "epoch": 0.5001853911753801, + "grad_norm": 6.4140625, + "learning_rate": 9.499814608824621e-06, + "loss": 2.5365, + "mean_token_accuracy": 0.4977139124755062, + "step": 2698 + }, + { + "epoch": 0.5003707823507602, + "grad_norm": 7.4296875, + "learning_rate": 9.499629217649242e-06, + "loss": 2.7446, + "mean_token_accuracy": 0.46442222875624817, + "step": 2699 + }, + { + "epoch": 0.5005561735261401, + "grad_norm": 6.88671875, + "learning_rate": 9.49944382647386e-06, + "loss": 3.187, + "mean_token_accuracy": 0.3971187427240978, + "step": 2700 + }, + { + "epoch": 0.5007415647015202, + "grad_norm": 9.015625, + "learning_rate": 9.49925843529848e-06, + "loss": 2.3528, + "mean_token_accuracy": 0.4882262996941896, + "step": 2701 + }, + { + "epoch": 0.5009269558769003, + "grad_norm": 5.5859375, + "learning_rate": 9.499073044123101e-06, + "loss": 3.0496, + "mean_token_accuracy": 0.4168702584544169, + "step": 2702 + }, + { + "epoch": 0.5011123470522804, + "grad_norm": 6.26953125, + "learning_rate": 9.49888765294772e-06, + "loss": 2.7018, + "mean_token_accuracy": 0.4654474199869366, + "step": 2703 + }, + { + "epoch": 0.5012977382276603, + "grad_norm": 7.234375, + "learning_rate": 9.49870226177234e-06, + "loss": 2.6871, + "mean_token_accuracy": 0.46608023457283754, + "step": 2704 + }, + { + "epoch": 0.5014831294030404, + "grad_norm": 6.125, + "learning_rate": 9.49851687059696e-06, + "loss": 3.4367, + "mean_token_accuracy": 0.3709294033753268, + "step": 2705 + }, + { + "epoch": 0.5016685205784205, + "grad_norm": 6.77734375, + "learning_rate": 9.498331479421582e-06, + "loss": 2.7517, + "mean_token_accuracy": 0.45950763061156286, + "step": 2706 + }, + { + "epoch": 0.5018539117538006, + "grad_norm": 5.58984375, + "learning_rate": 9.4981460882462e-06, + "loss": 2.9244, + "mean_token_accuracy": 0.43453510436432635, + "step": 2707 + }, + { + "epoch": 0.5020393029291805, + "grad_norm": 4.984375, + "learning_rate": 9.49796069707082e-06, + "loss": 2.5513, + "mean_token_accuracy": 0.4632984901277584, + "step": 2708 + }, + { + "epoch": 0.5022246941045606, + "grad_norm": 5.41796875, + "learning_rate": 9.49777530589544e-06, + "loss": 2.7881, + "mean_token_accuracy": 0.43689671814671815, + "step": 2709 + }, + { + "epoch": 0.5024100852799407, + "grad_norm": 5.58984375, + "learning_rate": 9.49758991472006e-06, + "loss": 2.3617, + "mean_token_accuracy": 0.5068433630241147, + "step": 2710 + }, + { + "epoch": 0.5025954764553208, + "grad_norm": 5.72265625, + "learning_rate": 9.49740452354468e-06, + "loss": 2.8423, + "mean_token_accuracy": 0.4383446956105729, + "step": 2711 + }, + { + "epoch": 0.5027808676307007, + "grad_norm": 5.66796875, + "learning_rate": 9.4972191323693e-06, + "loss": 2.8709, + "mean_token_accuracy": 0.41535626535626535, + "step": 2712 + }, + { + "epoch": 0.5029662588060808, + "grad_norm": 5.625, + "learning_rate": 9.49703374119392e-06, + "loss": 2.7515, + "mean_token_accuracy": 0.46480419665731365, + "step": 2713 + }, + { + "epoch": 0.5031516499814609, + "grad_norm": 5.10546875, + "learning_rate": 9.49684835001854e-06, + "loss": 2.6408, + "mean_token_accuracy": 0.45881662017324915, + "step": 2714 + }, + { + "epoch": 0.503337041156841, + "grad_norm": 6.875, + "learning_rate": 9.496662958843161e-06, + "loss": 2.5316, + "mean_token_accuracy": 0.481951560316721, + "step": 2715 + }, + { + "epoch": 0.5035224323322209, + "grad_norm": 6.984375, + "learning_rate": 9.49647756766778e-06, + "loss": 2.8637, + "mean_token_accuracy": 0.4162470182878346, + "step": 2716 + }, + { + "epoch": 0.503707823507601, + "grad_norm": 6.65234375, + "learning_rate": 9.4962921764924e-06, + "loss": 2.5862, + "mean_token_accuracy": 0.46290111804849715, + "step": 2717 + }, + { + "epoch": 0.5038932146829811, + "grad_norm": 4.9609375, + "learning_rate": 9.496106785317019e-06, + "loss": 2.6496, + "mean_token_accuracy": 0.4557100027255383, + "step": 2718 + }, + { + "epoch": 0.5040786058583612, + "grad_norm": 8.3125, + "learning_rate": 9.49592139414164e-06, + "loss": 3.0468, + "mean_token_accuracy": 0.43645242180282434, + "step": 2719 + }, + { + "epoch": 0.5042639970337411, + "grad_norm": 6.69140625, + "learning_rate": 9.49573600296626e-06, + "loss": 3.3426, + "mean_token_accuracy": 0.3912852112676056, + "step": 2720 + }, + { + "epoch": 0.5044493882091212, + "grad_norm": 7.30078125, + "learning_rate": 9.495550611790879e-06, + "loss": 2.9825, + "mean_token_accuracy": 0.41556928096958395, + "step": 2721 + }, + { + "epoch": 0.5046347793845013, + "grad_norm": 7.2109375, + "learning_rate": 9.4953652206155e-06, + "loss": 2.686, + "mean_token_accuracy": 0.44554198076423185, + "step": 2722 + }, + { + "epoch": 0.5048201705598814, + "grad_norm": 5.9921875, + "learning_rate": 9.49517982944012e-06, + "loss": 2.9192, + "mean_token_accuracy": 0.43998040176384123, + "step": 2723 + }, + { + "epoch": 0.5050055617352615, + "grad_norm": 5.3671875, + "learning_rate": 9.49499443826474e-06, + "loss": 2.9849, + "mean_token_accuracy": 0.441846787237171, + "step": 2724 + }, + { + "epoch": 0.5051909529106414, + "grad_norm": 8.0078125, + "learning_rate": 9.494809047089359e-06, + "loss": 2.3941, + "mean_token_accuracy": 0.5016685205784205, + "step": 2725 + }, + { + "epoch": 0.5053763440860215, + "grad_norm": 7.22265625, + "learning_rate": 9.49462365591398e-06, + "loss": 3.0942, + "mean_token_accuracy": 0.418109474137004, + "step": 2726 + }, + { + "epoch": 0.5055617352614016, + "grad_norm": 6.7734375, + "learning_rate": 9.494438264738598e-06, + "loss": 2.6008, + "mean_token_accuracy": 0.4416214673137311, + "step": 2727 + }, + { + "epoch": 0.5057471264367817, + "grad_norm": 7.0390625, + "learning_rate": 9.494252873563219e-06, + "loss": 2.5043, + "mean_token_accuracy": 0.4632187285728279, + "step": 2728 + }, + { + "epoch": 0.5059325176121616, + "grad_norm": 5.99609375, + "learning_rate": 9.49406748238784e-06, + "loss": 2.9766, + "mean_token_accuracy": 0.4378268758672217, + "step": 2729 + }, + { + "epoch": 0.5061179087875417, + "grad_norm": 6.14453125, + "learning_rate": 9.493882091212458e-06, + "loss": 2.963, + "mean_token_accuracy": 0.4203009828009828, + "step": 2730 + }, + { + "epoch": 0.5063032999629218, + "grad_norm": 6.5546875, + "learning_rate": 9.493696700037079e-06, + "loss": 3.1021, + "mean_token_accuracy": 0.4140923801117976, + "step": 2731 + }, + { + "epoch": 0.5064886911383019, + "grad_norm": 6.10546875, + "learning_rate": 9.493511308861699e-06, + "loss": 3.0982, + "mean_token_accuracy": 0.41766954551890134, + "step": 2732 + }, + { + "epoch": 0.5066740823136818, + "grad_norm": 4.8828125, + "learning_rate": 9.49332591768632e-06, + "loss": 2.7017, + "mean_token_accuracy": 0.4389864099660249, + "step": 2733 + }, + { + "epoch": 0.5068594734890619, + "grad_norm": 6.3671875, + "learning_rate": 9.493140526510938e-06, + "loss": 3.0815, + "mean_token_accuracy": 0.40806017063313876, + "step": 2734 + }, + { + "epoch": 0.507044864664442, + "grad_norm": 6.89453125, + "learning_rate": 9.492955135335559e-06, + "loss": 3.1153, + "mean_token_accuracy": 0.4172641238887296, + "step": 2735 + }, + { + "epoch": 0.5072302558398221, + "grad_norm": 6.1328125, + "learning_rate": 9.492769744160178e-06, + "loss": 2.9954, + "mean_token_accuracy": 0.4334806787394838, + "step": 2736 + }, + { + "epoch": 0.507415647015202, + "grad_norm": 6.17578125, + "learning_rate": 9.492584352984798e-06, + "loss": 2.9505, + "mean_token_accuracy": 0.4669270505228676, + "step": 2737 + }, + { + "epoch": 0.5076010381905821, + "grad_norm": 5.71484375, + "learning_rate": 9.492398961809419e-06, + "loss": 2.5795, + "mean_token_accuracy": 0.4636956823545745, + "step": 2738 + }, + { + "epoch": 0.5077864293659622, + "grad_norm": 5.875, + "learning_rate": 9.492213570634039e-06, + "loss": 2.6799, + "mean_token_accuracy": 0.45851578704770685, + "step": 2739 + }, + { + "epoch": 0.5079718205413423, + "grad_norm": 5.18359375, + "learning_rate": 9.49202817945866e-06, + "loss": 2.5443, + "mean_token_accuracy": 0.4568421052631579, + "step": 2740 + }, + { + "epoch": 0.5081572117167222, + "grad_norm": 5.99609375, + "learning_rate": 9.491842788283278e-06, + "loss": 2.5423, + "mean_token_accuracy": 0.4743191591017678, + "step": 2741 + }, + { + "epoch": 0.5083426028921023, + "grad_norm": 7.44921875, + "learning_rate": 9.491657397107899e-06, + "loss": 2.299, + "mean_token_accuracy": 0.5021197668256492, + "step": 2742 + }, + { + "epoch": 0.5085279940674824, + "grad_norm": 6.265625, + "learning_rate": 9.491472005932518e-06, + "loss": 3.1371, + "mean_token_accuracy": 0.399055640632976, + "step": 2743 + }, + { + "epoch": 0.5087133852428625, + "grad_norm": 5.484375, + "learning_rate": 9.491286614757138e-06, + "loss": 3.2764, + "mean_token_accuracy": 0.40266328471781865, + "step": 2744 + }, + { + "epoch": 0.5088987764182424, + "grad_norm": 6.82421875, + "learning_rate": 9.491101223581759e-06, + "loss": 3.1831, + "mean_token_accuracy": 0.38971827594509995, + "step": 2745 + }, + { + "epoch": 0.5090841675936225, + "grad_norm": 9.015625, + "learning_rate": 9.490915832406377e-06, + "loss": 3.3505, + "mean_token_accuracy": 0.36485324818862824, + "step": 2746 + }, + { + "epoch": 0.5092695587690026, + "grad_norm": 5.9921875, + "learning_rate": 9.490730441230998e-06, + "loss": 3.2547, + "mean_token_accuracy": 0.39100684261974583, + "step": 2747 + }, + { + "epoch": 0.5094549499443827, + "grad_norm": 5.828125, + "learning_rate": 9.490545050055618e-06, + "loss": 2.7544, + "mean_token_accuracy": 0.4356773134705214, + "step": 2748 + }, + { + "epoch": 0.5096403411197628, + "grad_norm": 7.28515625, + "learning_rate": 9.490359658880239e-06, + "loss": 3.0145, + "mean_token_accuracy": 0.4150917176209005, + "step": 2749 + }, + { + "epoch": 0.5098257322951427, + "grad_norm": 5.71875, + "learning_rate": 9.490174267704858e-06, + "loss": 2.9295, + "mean_token_accuracy": 0.4291912530371399, + "step": 2750 + }, + { + "epoch": 0.5100111234705228, + "grad_norm": 9.578125, + "learning_rate": 9.489988876529478e-06, + "loss": 3.1402, + "mean_token_accuracy": 0.4268292682926829, + "step": 2751 + }, + { + "epoch": 0.5101965146459029, + "grad_norm": 7.3671875, + "learning_rate": 9.489803485354097e-06, + "loss": 2.4615, + "mean_token_accuracy": 0.4672114191146049, + "step": 2752 + }, + { + "epoch": 0.510381905821283, + "grad_norm": 6.49609375, + "learning_rate": 9.489618094178718e-06, + "loss": 2.4887, + "mean_token_accuracy": 0.47094484696140765, + "step": 2753 + }, + { + "epoch": 0.5105672969966629, + "grad_norm": 7.0546875, + "learning_rate": 9.489432703003338e-06, + "loss": 2.8353, + "mean_token_accuracy": 0.43817899637868596, + "step": 2754 + }, + { + "epoch": 0.510752688172043, + "grad_norm": 6.00390625, + "learning_rate": 9.489247311827959e-06, + "loss": 2.5583, + "mean_token_accuracy": 0.45448397132422563, + "step": 2755 + }, + { + "epoch": 0.5109380793474231, + "grad_norm": 6.25390625, + "learning_rate": 9.489061920652577e-06, + "loss": 3.03, + "mean_token_accuracy": 0.4084987593052109, + "step": 2756 + }, + { + "epoch": 0.5111234705228032, + "grad_norm": 6.3046875, + "learning_rate": 9.488876529477198e-06, + "loss": 2.9225, + "mean_token_accuracy": 0.4413754227733935, + "step": 2757 + }, + { + "epoch": 0.5113088616981831, + "grad_norm": 6.08984375, + "learning_rate": 9.488691138301818e-06, + "loss": 3.0192, + "mean_token_accuracy": 0.42981410867492853, + "step": 2758 + }, + { + "epoch": 0.5114942528735632, + "grad_norm": 6.72265625, + "learning_rate": 9.488505747126437e-06, + "loss": 3.2597, + "mean_token_accuracy": 0.411298457991065, + "step": 2759 + }, + { + "epoch": 0.5116796440489433, + "grad_norm": 7.00390625, + "learning_rate": 9.488320355951058e-06, + "loss": 2.5509, + "mean_token_accuracy": 0.45740392367643107, + "step": 2760 + }, + { + "epoch": 0.5118650352243234, + "grad_norm": 4.81640625, + "learning_rate": 9.488134964775676e-06, + "loss": 2.827, + "mean_token_accuracy": 0.4475289431096726, + "step": 2761 + }, + { + "epoch": 0.5120504263997033, + "grad_norm": 6.72265625, + "learning_rate": 9.487949573600297e-06, + "loss": 3.2967, + "mean_token_accuracy": 0.3988713607797871, + "step": 2762 + }, + { + "epoch": 0.5122358175750834, + "grad_norm": 6.5625, + "learning_rate": 9.487764182424917e-06, + "loss": 2.554, + "mean_token_accuracy": 0.4677708146821844, + "step": 2763 + }, + { + "epoch": 0.5124212087504635, + "grad_norm": 6.1171875, + "learning_rate": 9.487578791249538e-06, + "loss": 3.162, + "mean_token_accuracy": 0.39616128141397405, + "step": 2764 + }, + { + "epoch": 0.5126065999258436, + "grad_norm": 9.375, + "learning_rate": 9.487393400074157e-06, + "loss": 2.555, + "mean_token_accuracy": 0.46008708272859217, + "step": 2765 + }, + { + "epoch": 0.5127919911012235, + "grad_norm": 5.953125, + "learning_rate": 9.487208008898777e-06, + "loss": 2.7105, + "mean_token_accuracy": 0.4464145614310372, + "step": 2766 + }, + { + "epoch": 0.5129773822766036, + "grad_norm": 7.1953125, + "learning_rate": 9.487022617723398e-06, + "loss": 2.2738, + "mean_token_accuracy": 0.5102252673102875, + "step": 2767 + }, + { + "epoch": 0.5131627734519837, + "grad_norm": 5.84375, + "learning_rate": 9.486837226548016e-06, + "loss": 3.2163, + "mean_token_accuracy": 0.40253447633246364, + "step": 2768 + }, + { + "epoch": 0.5133481646273638, + "grad_norm": 6.72265625, + "learning_rate": 9.486651835372637e-06, + "loss": 2.6652, + "mean_token_accuracy": 0.4756300193852119, + "step": 2769 + }, + { + "epoch": 0.5135335558027437, + "grad_norm": 7.3828125, + "learning_rate": 9.486466444197256e-06, + "loss": 2.3139, + "mean_token_accuracy": 0.5039230574538092, + "step": 2770 + }, + { + "epoch": 0.5137189469781238, + "grad_norm": 5.6875, + "learning_rate": 9.486281053021878e-06, + "loss": 2.7999, + "mean_token_accuracy": 0.44335587139506, + "step": 2771 + }, + { + "epoch": 0.5139043381535039, + "grad_norm": 8.09375, + "learning_rate": 9.486095661846497e-06, + "loss": 2.737, + "mean_token_accuracy": 0.45554956896551724, + "step": 2772 + }, + { + "epoch": 0.514089729328884, + "grad_norm": 10.46875, + "learning_rate": 9.485910270671117e-06, + "loss": 2.5759, + "mean_token_accuracy": 0.44888295288975233, + "step": 2773 + }, + { + "epoch": 0.514275120504264, + "grad_norm": 5.73828125, + "learning_rate": 9.485724879495736e-06, + "loss": 2.8279, + "mean_token_accuracy": 0.4282669290047549, + "step": 2774 + }, + { + "epoch": 0.514460511679644, + "grad_norm": 8.78125, + "learning_rate": 9.485539488320356e-06, + "loss": 3.4112, + "mean_token_accuracy": 0.37783711615487314, + "step": 2775 + }, + { + "epoch": 0.5146459028550241, + "grad_norm": 7.6328125, + "learning_rate": 9.485354097144977e-06, + "loss": 2.8436, + "mean_token_accuracy": 0.43803834398444014, + "step": 2776 + }, + { + "epoch": 0.5148312940304042, + "grad_norm": 8.3984375, + "learning_rate": 9.485168705969596e-06, + "loss": 2.7303, + "mean_token_accuracy": 0.43912205249888775, + "step": 2777 + }, + { + "epoch": 0.5150166852057843, + "grad_norm": 6.1015625, + "learning_rate": 9.484983314794216e-06, + "loss": 3.2752, + "mean_token_accuracy": 0.4089532144059239, + "step": 2778 + }, + { + "epoch": 0.5152020763811642, + "grad_norm": 5.79296875, + "learning_rate": 9.484797923618837e-06, + "loss": 2.4319, + "mean_token_accuracy": 0.47305389221556887, + "step": 2779 + }, + { + "epoch": 0.5153874675565443, + "grad_norm": 6.140625, + "learning_rate": 9.484612532443457e-06, + "loss": 2.9958, + "mean_token_accuracy": 0.4305191873589165, + "step": 2780 + }, + { + "epoch": 0.5155728587319244, + "grad_norm": 7.5703125, + "learning_rate": 9.484427141268076e-06, + "loss": 3.0578, + "mean_token_accuracy": 0.41756548536209553, + "step": 2781 + }, + { + "epoch": 0.5157582499073045, + "grad_norm": 8.640625, + "learning_rate": 9.484241750092697e-06, + "loss": 2.9061, + "mean_token_accuracy": 0.42881982177787437, + "step": 2782 + }, + { + "epoch": 0.5159436410826844, + "grad_norm": 7.73828125, + "learning_rate": 9.484056358917317e-06, + "loss": 2.7509, + "mean_token_accuracy": 0.4404933881542394, + "step": 2783 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 8.4296875, + "learning_rate": 9.483870967741936e-06, + "loss": 2.7492, + "mean_token_accuracy": 0.4400839195359743, + "step": 2784 + }, + { + "epoch": 0.5163144234334446, + "grad_norm": 8.8515625, + "learning_rate": 9.483685576566556e-06, + "loss": 2.8196, + "mean_token_accuracy": 0.44691430242761043, + "step": 2785 + }, + { + "epoch": 0.5164998146088247, + "grad_norm": 5.12109375, + "learning_rate": 9.483500185391175e-06, + "loss": 2.4553, + "mean_token_accuracy": 0.485009910802775, + "step": 2786 + }, + { + "epoch": 0.5166852057842046, + "grad_norm": 12.375, + "learning_rate": 9.483314794215797e-06, + "loss": 2.4553, + "mean_token_accuracy": 0.4479944674965422, + "step": 2787 + }, + { + "epoch": 0.5168705969595847, + "grad_norm": 7.98046875, + "learning_rate": 9.483129403040416e-06, + "loss": 3.0793, + "mean_token_accuracy": 0.40118066658467594, + "step": 2788 + }, + { + "epoch": 0.5170559881349648, + "grad_norm": 6.27734375, + "learning_rate": 9.482944011865037e-06, + "loss": 2.7857, + "mean_token_accuracy": 0.43041949258980156, + "step": 2789 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 6.078125, + "learning_rate": 9.482758620689655e-06, + "loss": 3.4801, + "mean_token_accuracy": 0.38734323194464465, + "step": 2790 + }, + { + "epoch": 0.5174267704857248, + "grad_norm": 8.015625, + "learning_rate": 9.482573229514276e-06, + "loss": 3.0161, + "mean_token_accuracy": 0.42345244086562656, + "step": 2791 + }, + { + "epoch": 0.5176121616611049, + "grad_norm": 6.78515625, + "learning_rate": 9.482387838338896e-06, + "loss": 2.7716, + "mean_token_accuracy": 0.43864651649082786, + "step": 2792 + }, + { + "epoch": 0.517797552836485, + "grad_norm": 7.0078125, + "learning_rate": 9.482202447163515e-06, + "loss": 2.2453, + "mean_token_accuracy": 0.5304557865994539, + "step": 2793 + }, + { + "epoch": 0.5179829440118651, + "grad_norm": 7.5390625, + "learning_rate": 9.482017055988136e-06, + "loss": 2.8603, + "mean_token_accuracy": 0.44338899954037075, + "step": 2794 + }, + { + "epoch": 0.518168335187245, + "grad_norm": 5.796875, + "learning_rate": 9.481831664812756e-06, + "loss": 2.6136, + "mean_token_accuracy": 0.4619238476953908, + "step": 2795 + }, + { + "epoch": 0.5183537263626251, + "grad_norm": 5.04296875, + "learning_rate": 9.481646273637377e-06, + "loss": 2.4041, + "mean_token_accuracy": 0.49196282121377805, + "step": 2796 + }, + { + "epoch": 0.5185391175380052, + "grad_norm": 6.6171875, + "learning_rate": 9.481460882461995e-06, + "loss": 3.0594, + "mean_token_accuracy": 0.4235943917292168, + "step": 2797 + }, + { + "epoch": 0.5187245087133853, + "grad_norm": 8.25, + "learning_rate": 9.481275491286616e-06, + "loss": 2.6918, + "mean_token_accuracy": 0.44341522351993556, + "step": 2798 + }, + { + "epoch": 0.5189098998887653, + "grad_norm": 10.234375, + "learning_rate": 9.481090100111235e-06, + "loss": 2.7453, + "mean_token_accuracy": 0.4241340782122905, + "step": 2799 + }, + { + "epoch": 0.5190952910641453, + "grad_norm": 7.2421875, + "learning_rate": 9.480904708935855e-06, + "loss": 3.0141, + "mean_token_accuracy": 0.43438238586888916, + "step": 2800 + }, + { + "epoch": 0.5192806822395254, + "grad_norm": 8.0625, + "learning_rate": 9.480719317760476e-06, + "loss": 2.551, + "mean_token_accuracy": 0.47785081451843386, + "step": 2801 + }, + { + "epoch": 0.5194660734149055, + "grad_norm": 6.31640625, + "learning_rate": 9.480533926585095e-06, + "loss": 2.6175, + "mean_token_accuracy": 0.4487438313144908, + "step": 2802 + }, + { + "epoch": 0.5196514645902856, + "grad_norm": 9.0703125, + "learning_rate": 9.480348535409715e-06, + "loss": 2.4346, + "mean_token_accuracy": 0.4910071942446043, + "step": 2803 + }, + { + "epoch": 0.5198368557656655, + "grad_norm": 6.10546875, + "learning_rate": 9.480163144234335e-06, + "loss": 2.4634, + "mean_token_accuracy": 0.48520070279767535, + "step": 2804 + }, + { + "epoch": 0.5200222469410456, + "grad_norm": 7.00390625, + "learning_rate": 9.479977753058956e-06, + "loss": 2.956, + "mean_token_accuracy": 0.44159426286029513, + "step": 2805 + }, + { + "epoch": 0.5202076381164257, + "grad_norm": 5.78515625, + "learning_rate": 9.479792361883575e-06, + "loss": 3.2777, + "mean_token_accuracy": 0.39315770215879986, + "step": 2806 + }, + { + "epoch": 0.5203930292918058, + "grad_norm": 6.43359375, + "learning_rate": 9.479606970708195e-06, + "loss": 2.0691, + "mean_token_accuracy": 0.5525837444819527, + "step": 2807 + }, + { + "epoch": 0.5205784204671857, + "grad_norm": 4.953125, + "learning_rate": 9.479421579532814e-06, + "loss": 3.038, + "mean_token_accuracy": 0.42873596314425566, + "step": 2808 + }, + { + "epoch": 0.5207638116425658, + "grad_norm": 5.3984375, + "learning_rate": 9.479236188357435e-06, + "loss": 2.6201, + "mean_token_accuracy": 0.46547153137230146, + "step": 2809 + }, + { + "epoch": 0.5209492028179459, + "grad_norm": 7.6171875, + "learning_rate": 9.479050797182055e-06, + "loss": 3.1412, + "mean_token_accuracy": 0.408819287038945, + "step": 2810 + }, + { + "epoch": 0.521134593993326, + "grad_norm": 5.4609375, + "learning_rate": 9.478865406006676e-06, + "loss": 3.0968, + "mean_token_accuracy": 0.39909946786737616, + "step": 2811 + }, + { + "epoch": 0.5213199851687059, + "grad_norm": 8.5078125, + "learning_rate": 9.478680014831294e-06, + "loss": 2.7351, + "mean_token_accuracy": 0.4606436603334626, + "step": 2812 + }, + { + "epoch": 0.521505376344086, + "grad_norm": 6.375, + "learning_rate": 9.478494623655915e-06, + "loss": 3.3519, + "mean_token_accuracy": 0.39111052838599714, + "step": 2813 + }, + { + "epoch": 0.5216907675194661, + "grad_norm": 7.5625, + "learning_rate": 9.478309232480535e-06, + "loss": 2.3896, + "mean_token_accuracy": 0.4905318623415116, + "step": 2814 + }, + { + "epoch": 0.5218761586948462, + "grad_norm": 7.05859375, + "learning_rate": 9.478123841305154e-06, + "loss": 2.9364, + "mean_token_accuracy": 0.4266785767455709, + "step": 2815 + }, + { + "epoch": 0.5220615498702261, + "grad_norm": 7.43359375, + "learning_rate": 9.477938450129775e-06, + "loss": 2.4496, + "mean_token_accuracy": 0.4929934605631923, + "step": 2816 + }, + { + "epoch": 0.5222469410456062, + "grad_norm": 4.859375, + "learning_rate": 9.477753058954393e-06, + "loss": 2.8325, + "mean_token_accuracy": 0.4291353622819305, + "step": 2817 + }, + { + "epoch": 0.5224323322209863, + "grad_norm": 10.5625, + "learning_rate": 9.477567667779014e-06, + "loss": 1.9907, + "mean_token_accuracy": 0.517725258493353, + "step": 2818 + }, + { + "epoch": 0.5226177233963664, + "grad_norm": 6.02734375, + "learning_rate": 9.477382276603634e-06, + "loss": 2.7372, + "mean_token_accuracy": 0.4534977759805904, + "step": 2819 + }, + { + "epoch": 0.5228031145717463, + "grad_norm": 6.3828125, + "learning_rate": 9.477196885428255e-06, + "loss": 2.8026, + "mean_token_accuracy": 0.43660800886795065, + "step": 2820 + }, + { + "epoch": 0.5229885057471264, + "grad_norm": 6.84765625, + "learning_rate": 9.477011494252875e-06, + "loss": 3.0844, + "mean_token_accuracy": 0.40865892291446676, + "step": 2821 + }, + { + "epoch": 0.5231738969225065, + "grad_norm": 7.56640625, + "learning_rate": 9.476826103077494e-06, + "loss": 3.0595, + "mean_token_accuracy": 0.4151254117050925, + "step": 2822 + }, + { + "epoch": 0.5233592880978866, + "grad_norm": 9.1328125, + "learning_rate": 9.476640711902115e-06, + "loss": 2.7696, + "mean_token_accuracy": 0.43795883981225175, + "step": 2823 + }, + { + "epoch": 0.5235446792732666, + "grad_norm": 7.6640625, + "learning_rate": 9.476455320726733e-06, + "loss": 2.5474, + "mean_token_accuracy": 0.45962732919254656, + "step": 2824 + }, + { + "epoch": 0.5237300704486466, + "grad_norm": 5.39453125, + "learning_rate": 9.476269929551354e-06, + "loss": 3.0124, + "mean_token_accuracy": 0.4206396385685412, + "step": 2825 + }, + { + "epoch": 0.5239154616240267, + "grad_norm": 7.1328125, + "learning_rate": 9.476084538375974e-06, + "loss": 2.9489, + "mean_token_accuracy": 0.41460750853242323, + "step": 2826 + }, + { + "epoch": 0.5241008527994068, + "grad_norm": 6.3046875, + "learning_rate": 9.475899147200595e-06, + "loss": 2.6346, + "mean_token_accuracy": 0.4561946902654867, + "step": 2827 + }, + { + "epoch": 0.5242862439747868, + "grad_norm": 5.92578125, + "learning_rate": 9.475713756025214e-06, + "loss": 3.0241, + "mean_token_accuracy": 0.4243879582991683, + "step": 2828 + }, + { + "epoch": 0.5244716351501668, + "grad_norm": 5.6484375, + "learning_rate": 9.475528364849834e-06, + "loss": 2.9676, + "mean_token_accuracy": 0.44555521378037527, + "step": 2829 + }, + { + "epoch": 0.5246570263255469, + "grad_norm": 6.5, + "learning_rate": 9.475342973674455e-06, + "loss": 2.8413, + "mean_token_accuracy": 0.45789821546596166, + "step": 2830 + }, + { + "epoch": 0.524842417500927, + "grad_norm": 7.89453125, + "learning_rate": 9.475157582499074e-06, + "loss": 2.3546, + "mean_token_accuracy": 0.5083143507972665, + "step": 2831 + }, + { + "epoch": 0.525027808676307, + "grad_norm": 6.69921875, + "learning_rate": 9.474972191323694e-06, + "loss": 3.0864, + "mean_token_accuracy": 0.4277812895069532, + "step": 2832 + }, + { + "epoch": 0.525213199851687, + "grad_norm": 5.90625, + "learning_rate": 9.474786800148313e-06, + "loss": 3.0219, + "mean_token_accuracy": 0.42331007663273473, + "step": 2833 + }, + { + "epoch": 0.5253985910270671, + "grad_norm": 7.84375, + "learning_rate": 9.474601408972933e-06, + "loss": 2.6578, + "mean_token_accuracy": 0.4644268774703557, + "step": 2834 + }, + { + "epoch": 0.5255839822024472, + "grad_norm": 7.78125, + "learning_rate": 9.474416017797554e-06, + "loss": 2.7074, + "mean_token_accuracy": 0.4436526150501268, + "step": 2835 + }, + { + "epoch": 0.5257693733778273, + "grad_norm": 7.140625, + "learning_rate": 9.474230626622174e-06, + "loss": 2.656, + "mean_token_accuracy": 0.4467005076142132, + "step": 2836 + }, + { + "epoch": 0.5259547645532072, + "grad_norm": 7.32421875, + "learning_rate": 9.474045235446793e-06, + "loss": 3.4224, + "mean_token_accuracy": 0.37782245592329106, + "step": 2837 + }, + { + "epoch": 0.5261401557285873, + "grad_norm": 5.98046875, + "learning_rate": 9.473859844271414e-06, + "loss": 2.6069, + "mean_token_accuracy": 0.4603002840524821, + "step": 2838 + }, + { + "epoch": 0.5263255469039674, + "grad_norm": 5.59765625, + "learning_rate": 9.473674453096034e-06, + "loss": 2.7357, + "mean_token_accuracy": 0.4543397080457169, + "step": 2839 + }, + { + "epoch": 0.5265109380793475, + "grad_norm": 5.859375, + "learning_rate": 9.473489061920653e-06, + "loss": 2.7469, + "mean_token_accuracy": 0.4561710137133638, + "step": 2840 + }, + { + "epoch": 0.5266963292547274, + "grad_norm": 5.734375, + "learning_rate": 9.473303670745273e-06, + "loss": 2.8876, + "mean_token_accuracy": 0.43061488030197675, + "step": 2841 + }, + { + "epoch": 0.5268817204301075, + "grad_norm": 7.40234375, + "learning_rate": 9.473118279569892e-06, + "loss": 3.0507, + "mean_token_accuracy": 0.4270610596341174, + "step": 2842 + }, + { + "epoch": 0.5270671116054876, + "grad_norm": 6.23828125, + "learning_rate": 9.472932888394514e-06, + "loss": 2.785, + "mean_token_accuracy": 0.4538633461047254, + "step": 2843 + }, + { + "epoch": 0.5272525027808677, + "grad_norm": 6.234375, + "learning_rate": 9.472747497219133e-06, + "loss": 2.3599, + "mean_token_accuracy": 0.49093484419263456, + "step": 2844 + }, + { + "epoch": 0.5274378939562476, + "grad_norm": 6.41015625, + "learning_rate": 9.472562106043754e-06, + "loss": 3.3318, + "mean_token_accuracy": 0.41741799347784386, + "step": 2845 + }, + { + "epoch": 0.5276232851316277, + "grad_norm": 7.20703125, + "learning_rate": 9.472376714868372e-06, + "loss": 2.8259, + "mean_token_accuracy": 0.4221891288160834, + "step": 2846 + }, + { + "epoch": 0.5278086763070078, + "grad_norm": 6.28515625, + "learning_rate": 9.472191323692993e-06, + "loss": 2.5206, + "mean_token_accuracy": 0.46992431284026026, + "step": 2847 + }, + { + "epoch": 0.5279940674823879, + "grad_norm": 5.93359375, + "learning_rate": 9.472005932517613e-06, + "loss": 3.0519, + "mean_token_accuracy": 0.4161189899688689, + "step": 2848 + }, + { + "epoch": 0.5281794586577679, + "grad_norm": 6.86328125, + "learning_rate": 9.471820541342232e-06, + "loss": 2.8259, + "mean_token_accuracy": 0.45199303207295827, + "step": 2849 + }, + { + "epoch": 0.5283648498331479, + "grad_norm": 6.11328125, + "learning_rate": 9.471635150166853e-06, + "loss": 2.6738, + "mean_token_accuracy": 0.46475475743768424, + "step": 2850 + }, + { + "epoch": 0.528550241008528, + "grad_norm": 6.125, + "learning_rate": 9.471449758991471e-06, + "loss": 2.4583, + "mean_token_accuracy": 0.48299732815156665, + "step": 2851 + }, + { + "epoch": 0.5287356321839081, + "grad_norm": 8.609375, + "learning_rate": 9.471264367816094e-06, + "loss": 2.8479, + "mean_token_accuracy": 0.4226044226044226, + "step": 2852 + }, + { + "epoch": 0.5289210233592881, + "grad_norm": 5.45703125, + "learning_rate": 9.471078976640712e-06, + "loss": 2.8219, + "mean_token_accuracy": 0.4557321225879682, + "step": 2853 + }, + { + "epoch": 0.5291064145346681, + "grad_norm": 5.55078125, + "learning_rate": 9.470893585465333e-06, + "loss": 2.5461, + "mean_token_accuracy": 0.4616444015975761, + "step": 2854 + }, + { + "epoch": 0.5292918057100482, + "grad_norm": 5.9375, + "learning_rate": 9.470708194289952e-06, + "loss": 3.7396, + "mean_token_accuracy": 0.35271155722849573, + "step": 2855 + }, + { + "epoch": 0.5294771968854283, + "grad_norm": 7.33203125, + "learning_rate": 9.470522803114572e-06, + "loss": 3.5868, + "mean_token_accuracy": 0.38662379421221865, + "step": 2856 + }, + { + "epoch": 0.5296625880608083, + "grad_norm": 7.640625, + "learning_rate": 9.470337411939193e-06, + "loss": 2.6328, + "mean_token_accuracy": 0.46317171938272955, + "step": 2857 + }, + { + "epoch": 0.5298479792361883, + "grad_norm": 7.84765625, + "learning_rate": 9.470152020763812e-06, + "loss": 2.7643, + "mean_token_accuracy": 0.446376181679887, + "step": 2858 + }, + { + "epoch": 0.5300333704115684, + "grad_norm": 7.23046875, + "learning_rate": 9.469966629588432e-06, + "loss": 2.6675, + "mean_token_accuracy": 0.4337309107122952, + "step": 2859 + }, + { + "epoch": 0.5302187615869485, + "grad_norm": 7.19921875, + "learning_rate": 9.469781238413053e-06, + "loss": 2.8339, + "mean_token_accuracy": 0.43744787322768974, + "step": 2860 + }, + { + "epoch": 0.5304041527623286, + "grad_norm": 6.71875, + "learning_rate": 9.469595847237673e-06, + "loss": 2.5633, + "mean_token_accuracy": 0.48539857932123126, + "step": 2861 + }, + { + "epoch": 0.5305895439377085, + "grad_norm": 7.05859375, + "learning_rate": 9.469410456062292e-06, + "loss": 2.7932, + "mean_token_accuracy": 0.4282890401932792, + "step": 2862 + }, + { + "epoch": 0.5307749351130886, + "grad_norm": 7.375, + "learning_rate": 9.469225064886912e-06, + "loss": 2.0528, + "mean_token_accuracy": 0.5328171091445427, + "step": 2863 + }, + { + "epoch": 0.5309603262884687, + "grad_norm": 5.9921875, + "learning_rate": 9.469039673711533e-06, + "loss": 2.9425, + "mean_token_accuracy": 0.45242537313432835, + "step": 2864 + }, + { + "epoch": 0.5311457174638488, + "grad_norm": 6.5859375, + "learning_rate": 9.468854282536152e-06, + "loss": 2.9216, + "mean_token_accuracy": 0.4252167982071519, + "step": 2865 + }, + { + "epoch": 0.5313311086392287, + "grad_norm": 5.25390625, + "learning_rate": 9.468668891360772e-06, + "loss": 2.718, + "mean_token_accuracy": 0.45558112773302645, + "step": 2866 + }, + { + "epoch": 0.5315164998146088, + "grad_norm": 7.66015625, + "learning_rate": 9.468483500185391e-06, + "loss": 2.9829, + "mean_token_accuracy": 0.43725187472430527, + "step": 2867 + }, + { + "epoch": 0.5317018909899889, + "grad_norm": 8.1328125, + "learning_rate": 9.468298109010013e-06, + "loss": 2.4871, + "mean_token_accuracy": 0.46457098685905696, + "step": 2868 + }, + { + "epoch": 0.531887282165369, + "grad_norm": 7.640625, + "learning_rate": 9.468112717834632e-06, + "loss": 2.441, + "mean_token_accuracy": 0.47692307692307695, + "step": 2869 + }, + { + "epoch": 0.5320726733407489, + "grad_norm": 7.26953125, + "learning_rate": 9.467927326659252e-06, + "loss": 2.3449, + "mean_token_accuracy": 0.49765411893071465, + "step": 2870 + }, + { + "epoch": 0.532258064516129, + "grad_norm": 7.14453125, + "learning_rate": 9.467741935483871e-06, + "loss": 2.8053, + "mean_token_accuracy": 0.43725915221579964, + "step": 2871 + }, + { + "epoch": 0.5324434556915091, + "grad_norm": 6.2421875, + "learning_rate": 9.467556544308492e-06, + "loss": 2.9417, + "mean_token_accuracy": 0.42860831396849985, + "step": 2872 + }, + { + "epoch": 0.5326288468668892, + "grad_norm": 8.1171875, + "learning_rate": 9.467371153133112e-06, + "loss": 2.6035, + "mean_token_accuracy": 0.46498980285520053, + "step": 2873 + }, + { + "epoch": 0.5328142380422692, + "grad_norm": 5.73828125, + "learning_rate": 9.467185761957731e-06, + "loss": 3.1883, + "mean_token_accuracy": 0.4301543824701195, + "step": 2874 + }, + { + "epoch": 0.5329996292176492, + "grad_norm": 5.6640625, + "learning_rate": 9.467000370782351e-06, + "loss": 2.8045, + "mean_token_accuracy": 0.43950039032006244, + "step": 2875 + }, + { + "epoch": 0.5331850203930293, + "grad_norm": 8.4296875, + "learning_rate": 9.466814979606972e-06, + "loss": 2.2215, + "mean_token_accuracy": 0.4894362671472915, + "step": 2876 + }, + { + "epoch": 0.5333704115684094, + "grad_norm": 6.4765625, + "learning_rate": 9.466629588431592e-06, + "loss": 3.0898, + "mean_token_accuracy": 0.39997390056113796, + "step": 2877 + }, + { + "epoch": 0.5335558027437894, + "grad_norm": 6.125, + "learning_rate": 9.466444197256211e-06, + "loss": 2.4839, + "mean_token_accuracy": 0.5142328164776672, + "step": 2878 + }, + { + "epoch": 0.5337411939191694, + "grad_norm": 5.7734375, + "learning_rate": 9.466258806080832e-06, + "loss": 2.9468, + "mean_token_accuracy": 0.4346468561584841, + "step": 2879 + }, + { + "epoch": 0.5339265850945495, + "grad_norm": 6.4296875, + "learning_rate": 9.46607341490545e-06, + "loss": 3.1948, + "mean_token_accuracy": 0.4038237953752745, + "step": 2880 + }, + { + "epoch": 0.5341119762699296, + "grad_norm": 6.2734375, + "learning_rate": 9.465888023730071e-06, + "loss": 2.7831, + "mean_token_accuracy": 0.443679880329095, + "step": 2881 + }, + { + "epoch": 0.5342973674453096, + "grad_norm": 6.4375, + "learning_rate": 9.465702632554691e-06, + "loss": 2.5558, + "mean_token_accuracy": 0.4688601645123384, + "step": 2882 + }, + { + "epoch": 0.5344827586206896, + "grad_norm": 5.3125, + "learning_rate": 9.46551724137931e-06, + "loss": 3.1536, + "mean_token_accuracy": 0.4200789343857918, + "step": 2883 + }, + { + "epoch": 0.5346681497960697, + "grad_norm": 4.85546875, + "learning_rate": 9.46533185020393e-06, + "loss": 3.0953, + "mean_token_accuracy": 0.43098938298199724, + "step": 2884 + }, + { + "epoch": 0.5348535409714498, + "grad_norm": 5.578125, + "learning_rate": 9.465146459028551e-06, + "loss": 3.1416, + "mean_token_accuracy": 0.40680737217598095, + "step": 2885 + }, + { + "epoch": 0.5350389321468298, + "grad_norm": 5.78125, + "learning_rate": 9.464961067853172e-06, + "loss": 2.9831, + "mean_token_accuracy": 0.42898587285570133, + "step": 2886 + }, + { + "epoch": 0.5352243233222098, + "grad_norm": 6.07421875, + "learning_rate": 9.46477567667779e-06, + "loss": 2.8269, + "mean_token_accuracy": 0.4522691705790297, + "step": 2887 + }, + { + "epoch": 0.5354097144975899, + "grad_norm": 7.375, + "learning_rate": 9.464590285502411e-06, + "loss": 2.4955, + "mean_token_accuracy": 0.4757738896366083, + "step": 2888 + }, + { + "epoch": 0.53559510567297, + "grad_norm": 4.9921875, + "learning_rate": 9.46440489432703e-06, + "loss": 2.8393, + "mean_token_accuracy": 0.4341190108191654, + "step": 2889 + }, + { + "epoch": 0.53578049684835, + "grad_norm": 7.0, + "learning_rate": 9.46421950315165e-06, + "loss": 3.0145, + "mean_token_accuracy": 0.40296851158274377, + "step": 2890 + }, + { + "epoch": 0.53596588802373, + "grad_norm": 5.4296875, + "learning_rate": 9.46403411197627e-06, + "loss": 3.3636, + "mean_token_accuracy": 0.38802889576883387, + "step": 2891 + }, + { + "epoch": 0.5361512791991101, + "grad_norm": 8.4609375, + "learning_rate": 9.463848720800891e-06, + "loss": 2.6097, + "mean_token_accuracy": 0.46801470588235294, + "step": 2892 + }, + { + "epoch": 0.5363366703744902, + "grad_norm": 6.0703125, + "learning_rate": 9.46366332962551e-06, + "loss": 2.7892, + "mean_token_accuracy": 0.4475542431634356, + "step": 2893 + }, + { + "epoch": 0.5365220615498703, + "grad_norm": 11.4921875, + "learning_rate": 9.46347793845013e-06, + "loss": 2.626, + "mean_token_accuracy": 0.47231705506902183, + "step": 2894 + }, + { + "epoch": 0.5367074527252503, + "grad_norm": 5.21484375, + "learning_rate": 9.463292547274751e-06, + "loss": 2.7213, + "mean_token_accuracy": 0.45841784989858014, + "step": 2895 + }, + { + "epoch": 0.5368928439006303, + "grad_norm": 6.36328125, + "learning_rate": 9.46310715609937e-06, + "loss": 2.6026, + "mean_token_accuracy": 0.4678819444444444, + "step": 2896 + }, + { + "epoch": 0.5370782350760104, + "grad_norm": 6.03515625, + "learning_rate": 9.46292176492399e-06, + "loss": 2.8458, + "mean_token_accuracy": 0.4189961880559085, + "step": 2897 + }, + { + "epoch": 0.5372636262513905, + "grad_norm": 9.3671875, + "learning_rate": 9.46273637374861e-06, + "loss": 2.5886, + "mean_token_accuracy": 0.4768760907504363, + "step": 2898 + }, + { + "epoch": 0.5374490174267705, + "grad_norm": 6.703125, + "learning_rate": 9.46255098257323e-06, + "loss": 3.2033, + "mean_token_accuracy": 0.39785843415380334, + "step": 2899 + }, + { + "epoch": 0.5376344086021505, + "grad_norm": 7.3671875, + "learning_rate": 9.46236559139785e-06, + "loss": 3.301, + "mean_token_accuracy": 0.38910632746249185, + "step": 2900 + }, + { + "epoch": 0.5378197997775306, + "grad_norm": 5.90234375, + "learning_rate": 9.46218020022247e-06, + "loss": 3.1398, + "mean_token_accuracy": 0.41653112524027797, + "step": 2901 + }, + { + "epoch": 0.5380051909529107, + "grad_norm": 5.52734375, + "learning_rate": 9.461994809047091e-06, + "loss": 2.8023, + "mean_token_accuracy": 0.44234960767218834, + "step": 2902 + }, + { + "epoch": 0.5381905821282907, + "grad_norm": 7.58984375, + "learning_rate": 9.46180941787171e-06, + "loss": 2.8442, + "mean_token_accuracy": 0.4365420812046249, + "step": 2903 + }, + { + "epoch": 0.5383759733036707, + "grad_norm": 10.3671875, + "learning_rate": 9.46162402669633e-06, + "loss": 2.2696, + "mean_token_accuracy": 0.5064337150277417, + "step": 2904 + }, + { + "epoch": 0.5385613644790508, + "grad_norm": 6.4609375, + "learning_rate": 9.46143863552095e-06, + "loss": 2.5671, + "mean_token_accuracy": 0.4740759116844456, + "step": 2905 + }, + { + "epoch": 0.5387467556544309, + "grad_norm": 6.0, + "learning_rate": 9.46125324434557e-06, + "loss": 3.1581, + "mean_token_accuracy": 0.40246105215624295, + "step": 2906 + }, + { + "epoch": 0.5389321468298109, + "grad_norm": 6.59765625, + "learning_rate": 9.46106785317019e-06, + "loss": 3.2238, + "mean_token_accuracy": 0.4177092021128718, + "step": 2907 + }, + { + "epoch": 0.5391175380051909, + "grad_norm": 6.35546875, + "learning_rate": 9.46088246199481e-06, + "loss": 3.2745, + "mean_token_accuracy": 0.41226740179186766, + "step": 2908 + }, + { + "epoch": 0.539302929180571, + "grad_norm": 5.359375, + "learning_rate": 9.46069707081943e-06, + "loss": 3.2445, + "mean_token_accuracy": 0.3871749313519625, + "step": 2909 + }, + { + "epoch": 0.5394883203559511, + "grad_norm": 8.71875, + "learning_rate": 9.46051167964405e-06, + "loss": 2.5877, + "mean_token_accuracy": 0.483264761481152, + "step": 2910 + }, + { + "epoch": 0.5396737115313311, + "grad_norm": 6.671875, + "learning_rate": 9.46032628846867e-06, + "loss": 2.737, + "mean_token_accuracy": 0.4463423253622335, + "step": 2911 + }, + { + "epoch": 0.5398591027067111, + "grad_norm": 5.40625, + "learning_rate": 9.46014089729329e-06, + "loss": 2.9582, + "mean_token_accuracy": 0.43233743409490333, + "step": 2912 + }, + { + "epoch": 0.5400444938820912, + "grad_norm": 5.703125, + "learning_rate": 9.45995550611791e-06, + "loss": 2.5401, + "mean_token_accuracy": 0.4796905222437137, + "step": 2913 + }, + { + "epoch": 0.5402298850574713, + "grad_norm": 4.90234375, + "learning_rate": 9.459770114942529e-06, + "loss": 2.5644, + "mean_token_accuracy": 0.4775074183976261, + "step": 2914 + }, + { + "epoch": 0.5404152762328513, + "grad_norm": 7.8984375, + "learning_rate": 9.459584723767149e-06, + "loss": 2.5894, + "mean_token_accuracy": 0.46721132897603485, + "step": 2915 + }, + { + "epoch": 0.5406006674082313, + "grad_norm": 8.1015625, + "learning_rate": 9.45939933259177e-06, + "loss": 3.0651, + "mean_token_accuracy": 0.41386580677167306, + "step": 2916 + }, + { + "epoch": 0.5407860585836114, + "grad_norm": 6.64453125, + "learning_rate": 9.45921394141639e-06, + "loss": 2.3646, + "mean_token_accuracy": 0.4939033348024093, + "step": 2917 + }, + { + "epoch": 0.5409714497589915, + "grad_norm": 10.0, + "learning_rate": 9.459028550241009e-06, + "loss": 2.7577, + "mean_token_accuracy": 0.44311887515977844, + "step": 2918 + }, + { + "epoch": 0.5411568409343716, + "grad_norm": 7.28515625, + "learning_rate": 9.45884315906563e-06, + "loss": 2.7212, + "mean_token_accuracy": 0.43828880511391577, + "step": 2919 + }, + { + "epoch": 0.5413422321097516, + "grad_norm": 6.32421875, + "learning_rate": 9.45865776789025e-06, + "loss": 2.5071, + "mean_token_accuracy": 0.47138209422822025, + "step": 2920 + }, + { + "epoch": 0.5415276232851316, + "grad_norm": 5.4609375, + "learning_rate": 9.458472376714869e-06, + "loss": 3.0111, + "mean_token_accuracy": 0.4313127892538661, + "step": 2921 + }, + { + "epoch": 0.5417130144605117, + "grad_norm": 8.234375, + "learning_rate": 9.458286985539489e-06, + "loss": 2.5636, + "mean_token_accuracy": 0.4563631790744467, + "step": 2922 + }, + { + "epoch": 0.5418984056358918, + "grad_norm": 6.828125, + "learning_rate": 9.458101594364108e-06, + "loss": 2.6473, + "mean_token_accuracy": 0.4421920872361247, + "step": 2923 + }, + { + "epoch": 0.5420837968112718, + "grad_norm": 5.9453125, + "learning_rate": 9.45791620318873e-06, + "loss": 2.7071, + "mean_token_accuracy": 0.44041031178296663, + "step": 2924 + }, + { + "epoch": 0.5422691879866518, + "grad_norm": 6.38671875, + "learning_rate": 9.457730812013349e-06, + "loss": 2.4567, + "mean_token_accuracy": 0.4776157585060118, + "step": 2925 + }, + { + "epoch": 0.5424545791620319, + "grad_norm": 12.4921875, + "learning_rate": 9.45754542083797e-06, + "loss": 2.6182, + "mean_token_accuracy": 0.45106642291285803, + "step": 2926 + }, + { + "epoch": 0.542639970337412, + "grad_norm": 7.875, + "learning_rate": 9.457360029662588e-06, + "loss": 2.6773, + "mean_token_accuracy": 0.4423870383415518, + "step": 2927 + }, + { + "epoch": 0.542825361512792, + "grad_norm": 9.3984375, + "learning_rate": 9.457174638487209e-06, + "loss": 2.6585, + "mean_token_accuracy": 0.441566356849002, + "step": 2928 + }, + { + "epoch": 0.543010752688172, + "grad_norm": 6.26171875, + "learning_rate": 9.456989247311829e-06, + "loss": 2.9728, + "mean_token_accuracy": 0.428536375904797, + "step": 2929 + }, + { + "epoch": 0.5431961438635521, + "grad_norm": 7.55859375, + "learning_rate": 9.456803856136448e-06, + "loss": 2.774, + "mean_token_accuracy": 0.4648145067376476, + "step": 2930 + }, + { + "epoch": 0.5433815350389322, + "grad_norm": 5.61328125, + "learning_rate": 9.456618464961068e-06, + "loss": 3.0686, + "mean_token_accuracy": 0.42081862168170525, + "step": 2931 + }, + { + "epoch": 0.5435669262143122, + "grad_norm": 4.7578125, + "learning_rate": 9.456433073785689e-06, + "loss": 3.1302, + "mean_token_accuracy": 0.4389503573876432, + "step": 2932 + }, + { + "epoch": 0.5437523173896922, + "grad_norm": 7.01171875, + "learning_rate": 9.45624768261031e-06, + "loss": 3.0709, + "mean_token_accuracy": 0.4141399829980164, + "step": 2933 + }, + { + "epoch": 0.5439377085650723, + "grad_norm": 5.66796875, + "learning_rate": 9.456062291434928e-06, + "loss": 3.158, + "mean_token_accuracy": 0.4203651685393258, + "step": 2934 + }, + { + "epoch": 0.5441230997404524, + "grad_norm": 6.2890625, + "learning_rate": 9.455876900259549e-06, + "loss": 2.6696, + "mean_token_accuracy": 0.45157593123209167, + "step": 2935 + }, + { + "epoch": 0.5443084909158324, + "grad_norm": 5.53125, + "learning_rate": 9.455691509084168e-06, + "loss": 2.4949, + "mean_token_accuracy": 0.48263291139240505, + "step": 2936 + }, + { + "epoch": 0.5444938820912124, + "grad_norm": 5.703125, + "learning_rate": 9.455506117908788e-06, + "loss": 3.5146, + "mean_token_accuracy": 0.38159203980099504, + "step": 2937 + }, + { + "epoch": 0.5446792732665925, + "grad_norm": 7.42578125, + "learning_rate": 9.455320726733408e-06, + "loss": 3.0979, + "mean_token_accuracy": 0.40545303752850925, + "step": 2938 + }, + { + "epoch": 0.5448646644419726, + "grad_norm": 5.65625, + "learning_rate": 9.455135335558027e-06, + "loss": 2.6137, + "mean_token_accuracy": 0.49430765157532436, + "step": 2939 + }, + { + "epoch": 0.5450500556173526, + "grad_norm": 7.38671875, + "learning_rate": 9.45494994438265e-06, + "loss": 2.5808, + "mean_token_accuracy": 0.4534042843498431, + "step": 2940 + }, + { + "epoch": 0.5452354467927326, + "grad_norm": 6.89453125, + "learning_rate": 9.454764553207268e-06, + "loss": 2.7685, + "mean_token_accuracy": 0.43670779111035996, + "step": 2941 + }, + { + "epoch": 0.5454208379681127, + "grad_norm": 5.9453125, + "learning_rate": 9.454579162031889e-06, + "loss": 2.3338, + "mean_token_accuracy": 0.5088801184015787, + "step": 2942 + }, + { + "epoch": 0.5456062291434928, + "grad_norm": 6.41796875, + "learning_rate": 9.454393770856508e-06, + "loss": 2.7579, + "mean_token_accuracy": 0.4438618925831202, + "step": 2943 + }, + { + "epoch": 0.5457916203188728, + "grad_norm": 5.4921875, + "learning_rate": 9.454208379681128e-06, + "loss": 2.7417, + "mean_token_accuracy": 0.4478921463727798, + "step": 2944 + }, + { + "epoch": 0.5459770114942529, + "grad_norm": 7.62109375, + "learning_rate": 9.454022988505749e-06, + "loss": 3.536, + "mean_token_accuracy": 0.36312514955731034, + "step": 2945 + }, + { + "epoch": 0.5461624026696329, + "grad_norm": 5.5859375, + "learning_rate": 9.453837597330367e-06, + "loss": 3.0594, + "mean_token_accuracy": 0.410159416373953, + "step": 2946 + }, + { + "epoch": 0.546347793845013, + "grad_norm": 7.42578125, + "learning_rate": 9.453652206154988e-06, + "loss": 2.5315, + "mean_token_accuracy": 0.45857005038053383, + "step": 2947 + }, + { + "epoch": 0.546533185020393, + "grad_norm": 7.29296875, + "learning_rate": 9.453466814979608e-06, + "loss": 3.0364, + "mean_token_accuracy": 0.40353121801432956, + "step": 2948 + }, + { + "epoch": 0.5467185761957731, + "grad_norm": 5.26171875, + "learning_rate": 9.453281423804229e-06, + "loss": 2.588, + "mean_token_accuracy": 0.47316704459561604, + "step": 2949 + }, + { + "epoch": 0.5469039673711531, + "grad_norm": 5.5390625, + "learning_rate": 9.453096032628848e-06, + "loss": 2.7083, + "mean_token_accuracy": 0.44113295286408793, + "step": 2950 + }, + { + "epoch": 0.5470893585465332, + "grad_norm": 7.05859375, + "learning_rate": 9.452910641453468e-06, + "loss": 2.768, + "mean_token_accuracy": 0.4349468713105077, + "step": 2951 + }, + { + "epoch": 0.5472747497219133, + "grad_norm": 6.171875, + "learning_rate": 9.452725250278087e-06, + "loss": 3.013, + "mean_token_accuracy": 0.4283493132935994, + "step": 2952 + }, + { + "epoch": 0.5474601408972933, + "grad_norm": 5.9609375, + "learning_rate": 9.452539859102707e-06, + "loss": 3.329, + "mean_token_accuracy": 0.38268927444794953, + "step": 2953 + }, + { + "epoch": 0.5476455320726733, + "grad_norm": 6.0078125, + "learning_rate": 9.452354467927328e-06, + "loss": 2.7009, + "mean_token_accuracy": 0.4593716143011918, + "step": 2954 + }, + { + "epoch": 0.5478309232480534, + "grad_norm": 6.05078125, + "learning_rate": 9.452169076751947e-06, + "loss": 2.8836, + "mean_token_accuracy": 0.4247697031729785, + "step": 2955 + }, + { + "epoch": 0.5480163144234335, + "grad_norm": 6.375, + "learning_rate": 9.451983685576567e-06, + "loss": 2.8153, + "mean_token_accuracy": 0.43841548847280604, + "step": 2956 + }, + { + "epoch": 0.5482017055988135, + "grad_norm": 6.5390625, + "learning_rate": 9.451798294401188e-06, + "loss": 3.0976, + "mean_token_accuracy": 0.4335485606672047, + "step": 2957 + }, + { + "epoch": 0.5483870967741935, + "grad_norm": 7.2890625, + "learning_rate": 9.451612903225808e-06, + "loss": 2.5472, + "mean_token_accuracy": 0.4703525641025641, + "step": 2958 + }, + { + "epoch": 0.5485724879495736, + "grad_norm": 5.52734375, + "learning_rate": 9.451427512050427e-06, + "loss": 2.7522, + "mean_token_accuracy": 0.4346079246328623, + "step": 2959 + }, + { + "epoch": 0.5487578791249537, + "grad_norm": 7.03515625, + "learning_rate": 9.451242120875047e-06, + "loss": 2.7747, + "mean_token_accuracy": 0.4698614125897479, + "step": 2960 + }, + { + "epoch": 0.5489432703003337, + "grad_norm": 6.85546875, + "learning_rate": 9.451056729699666e-06, + "loss": 2.9089, + "mean_token_accuracy": 0.4175675675675676, + "step": 2961 + }, + { + "epoch": 0.5491286614757137, + "grad_norm": 5.9765625, + "learning_rate": 9.450871338524287e-06, + "loss": 3.1658, + "mean_token_accuracy": 0.39394642235880784, + "step": 2962 + }, + { + "epoch": 0.5493140526510938, + "grad_norm": 6.37109375, + "learning_rate": 9.450685947348907e-06, + "loss": 2.9053, + "mean_token_accuracy": 0.4261866744084773, + "step": 2963 + }, + { + "epoch": 0.5494994438264739, + "grad_norm": 7.328125, + "learning_rate": 9.450500556173528e-06, + "loss": 2.8157, + "mean_token_accuracy": 0.43747619652151837, + "step": 2964 + }, + { + "epoch": 0.5496848350018539, + "grad_norm": 6.5859375, + "learning_rate": 9.450315164998147e-06, + "loss": 3.4574, + "mean_token_accuracy": 0.3892110586648685, + "step": 2965 + }, + { + "epoch": 0.5498702261772339, + "grad_norm": 10.390625, + "learning_rate": 9.450129773822767e-06, + "loss": 2.0949, + "mean_token_accuracy": 0.5319148936170213, + "step": 2966 + }, + { + "epoch": 0.550055617352614, + "grad_norm": 6.546875, + "learning_rate": 9.449944382647387e-06, + "loss": 2.7709, + "mean_token_accuracy": 0.44807162534435263, + "step": 2967 + }, + { + "epoch": 0.5502410085279941, + "grad_norm": 6.1640625, + "learning_rate": 9.449758991472006e-06, + "loss": 3.0282, + "mean_token_accuracy": 0.4307030514854147, + "step": 2968 + }, + { + "epoch": 0.5504263997033741, + "grad_norm": 5.515625, + "learning_rate": 9.449573600296627e-06, + "loss": 2.5391, + "mean_token_accuracy": 0.4617539585870889, + "step": 2969 + }, + { + "epoch": 0.5506117908787542, + "grad_norm": 6.375, + "learning_rate": 9.449388209121246e-06, + "loss": 2.6059, + "mean_token_accuracy": 0.4758575197889182, + "step": 2970 + }, + { + "epoch": 0.5507971820541342, + "grad_norm": 4.83203125, + "learning_rate": 9.449202817945866e-06, + "loss": 2.8657, + "mean_token_accuracy": 0.42155331534857304, + "step": 2971 + }, + { + "epoch": 0.5509825732295143, + "grad_norm": 5.75390625, + "learning_rate": 9.449017426770487e-06, + "loss": 3.047, + "mean_token_accuracy": 0.414181732801127, + "step": 2972 + }, + { + "epoch": 0.5511679644048944, + "grad_norm": 5.71484375, + "learning_rate": 9.448832035595107e-06, + "loss": 3.8133, + "mean_token_accuracy": 0.3526508742244783, + "step": 2973 + }, + { + "epoch": 0.5513533555802744, + "grad_norm": 6.3515625, + "learning_rate": 9.448646644419726e-06, + "loss": 2.6387, + "mean_token_accuracy": 0.4614587381785205, + "step": 2974 + }, + { + "epoch": 0.5515387467556544, + "grad_norm": 4.8984375, + "learning_rate": 9.448461253244346e-06, + "loss": 3.3632, + "mean_token_accuracy": 0.3841771332383789, + "step": 2975 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 6.38671875, + "learning_rate": 9.448275862068967e-06, + "loss": 2.5651, + "mean_token_accuracy": 0.4772513335425165, + "step": 2976 + }, + { + "epoch": 0.5519095291064146, + "grad_norm": 4.96484375, + "learning_rate": 9.448090470893586e-06, + "loss": 2.7462, + "mean_token_accuracy": 0.45734399236367973, + "step": 2977 + }, + { + "epoch": 0.5520949202817946, + "grad_norm": 6.16796875, + "learning_rate": 9.447905079718206e-06, + "loss": 3.2056, + "mean_token_accuracy": 0.4206884315117104, + "step": 2978 + }, + { + "epoch": 0.5522803114571746, + "grad_norm": 6.34375, + "learning_rate": 9.447719688542825e-06, + "loss": 2.9688, + "mean_token_accuracy": 0.4270218151138966, + "step": 2979 + }, + { + "epoch": 0.5524657026325547, + "grad_norm": 5.66796875, + "learning_rate": 9.447534297367445e-06, + "loss": 3.1863, + "mean_token_accuracy": 0.39348863317429134, + "step": 2980 + }, + { + "epoch": 0.5526510938079348, + "grad_norm": 7.2421875, + "learning_rate": 9.447348906192066e-06, + "loss": 2.5402, + "mean_token_accuracy": 0.47466196355085244, + "step": 2981 + }, + { + "epoch": 0.5528364849833148, + "grad_norm": 9.1796875, + "learning_rate": 9.447163515016686e-06, + "loss": 2.5158, + "mean_token_accuracy": 0.4834305960817007, + "step": 2982 + }, + { + "epoch": 0.5530218761586948, + "grad_norm": 7.0, + "learning_rate": 9.446978123841307e-06, + "loss": 2.8011, + "mean_token_accuracy": 0.4426123921164498, + "step": 2983 + }, + { + "epoch": 0.5532072673340749, + "grad_norm": 6.33984375, + "learning_rate": 9.446792732665926e-06, + "loss": 3.1543, + "mean_token_accuracy": 0.41539482415394824, + "step": 2984 + }, + { + "epoch": 0.553392658509455, + "grad_norm": 8.7890625, + "learning_rate": 9.446607341490546e-06, + "loss": 3.1289, + "mean_token_accuracy": 0.40719923463286295, + "step": 2985 + }, + { + "epoch": 0.553578049684835, + "grad_norm": 7.9375, + "learning_rate": 9.446421950315165e-06, + "loss": 2.9219, + "mean_token_accuracy": 0.4214385033497661, + "step": 2986 + }, + { + "epoch": 0.553763440860215, + "grad_norm": 6.62109375, + "learning_rate": 9.446236559139785e-06, + "loss": 2.798, + "mean_token_accuracy": 0.4602012808783166, + "step": 2987 + }, + { + "epoch": 0.5539488320355951, + "grad_norm": 5.36328125, + "learning_rate": 9.446051167964406e-06, + "loss": 3.1805, + "mean_token_accuracy": 0.40885446769159633, + "step": 2988 + }, + { + "epoch": 0.5541342232109752, + "grad_norm": 9.7734375, + "learning_rate": 9.445865776789026e-06, + "loss": 2.7911, + "mean_token_accuracy": 0.4408098028769313, + "step": 2989 + }, + { + "epoch": 0.5543196143863552, + "grad_norm": 6.3359375, + "learning_rate": 9.445680385613645e-06, + "loss": 2.9232, + "mean_token_accuracy": 0.4124911785462244, + "step": 2990 + }, + { + "epoch": 0.5545050055617352, + "grad_norm": 5.9296875, + "learning_rate": 9.445494994438266e-06, + "loss": 2.7367, + "mean_token_accuracy": 0.4612546125461255, + "step": 2991 + }, + { + "epoch": 0.5546903967371153, + "grad_norm": 6.3828125, + "learning_rate": 9.445309603262886e-06, + "loss": 2.4229, + "mean_token_accuracy": 0.48038161784466543, + "step": 2992 + }, + { + "epoch": 0.5548757879124954, + "grad_norm": 6.16015625, + "learning_rate": 9.445124212087505e-06, + "loss": 2.948, + "mean_token_accuracy": 0.43003091813415784, + "step": 2993 + }, + { + "epoch": 0.5550611790878754, + "grad_norm": 6.21484375, + "learning_rate": 9.444938820912126e-06, + "loss": 3.1607, + "mean_token_accuracy": 0.40956511546125896, + "step": 2994 + }, + { + "epoch": 0.5552465702632555, + "grad_norm": 6.21484375, + "learning_rate": 9.444753429736744e-06, + "loss": 2.9538, + "mean_token_accuracy": 0.42231172952520446, + "step": 2995 + }, + { + "epoch": 0.5554319614386355, + "grad_norm": 5.97265625, + "learning_rate": 9.444568038561365e-06, + "loss": 2.6489, + "mean_token_accuracy": 0.45564516129032256, + "step": 2996 + }, + { + "epoch": 0.5556173526140156, + "grad_norm": 6.0703125, + "learning_rate": 9.444382647385985e-06, + "loss": 2.4631, + "mean_token_accuracy": 0.49040011725047633, + "step": 2997 + }, + { + "epoch": 0.5558027437893956, + "grad_norm": 6.2734375, + "learning_rate": 9.444197256210606e-06, + "loss": 2.9895, + "mean_token_accuracy": 0.42658137882018476, + "step": 2998 + }, + { + "epoch": 0.5559881349647757, + "grad_norm": 7.36328125, + "learning_rate": 9.444011865035225e-06, + "loss": 2.9292, + "mean_token_accuracy": 0.40367264621590065, + "step": 2999 + }, + { + "epoch": 0.5561735261401557, + "grad_norm": 6.921875, + "learning_rate": 9.443826473859845e-06, + "loss": 2.6649, + "mean_token_accuracy": 0.45190246516613075, + "step": 3000 + }, + { + "epoch": 0.5563589173155358, + "grad_norm": 5.45703125, + "learning_rate": 9.443641082684466e-06, + "loss": 2.5858, + "mean_token_accuracy": 0.46935707678075855, + "step": 3001 + }, + { + "epoch": 0.5565443084909159, + "grad_norm": 5.80078125, + "learning_rate": 9.443455691509084e-06, + "loss": 3.2635, + "mean_token_accuracy": 0.40212363330529854, + "step": 3002 + }, + { + "epoch": 0.5567296996662959, + "grad_norm": 9.0546875, + "learning_rate": 9.443270300333705e-06, + "loss": 2.4568, + "mean_token_accuracy": 0.46580683863227357, + "step": 3003 + }, + { + "epoch": 0.5569150908416759, + "grad_norm": 10.28125, + "learning_rate": 9.443084909158324e-06, + "loss": 3.0562, + "mean_token_accuracy": 0.4003463476070529, + "step": 3004 + }, + { + "epoch": 0.557100482017056, + "grad_norm": 5.1875, + "learning_rate": 9.442899517982946e-06, + "loss": 2.8459, + "mean_token_accuracy": 0.43414223593771917, + "step": 3005 + }, + { + "epoch": 0.557285873192436, + "grad_norm": 7.66796875, + "learning_rate": 9.442714126807565e-06, + "loss": 2.4622, + "mean_token_accuracy": 0.4695671042638603, + "step": 3006 + }, + { + "epoch": 0.5574712643678161, + "grad_norm": 6.7578125, + "learning_rate": 9.442528735632185e-06, + "loss": 3.1757, + "mean_token_accuracy": 0.4162598889075913, + "step": 3007 + }, + { + "epoch": 0.5576566555431961, + "grad_norm": 6.91796875, + "learning_rate": 9.442343344456804e-06, + "loss": 2.7553, + "mean_token_accuracy": 0.4263616274535292, + "step": 3008 + }, + { + "epoch": 0.5578420467185762, + "grad_norm": 8.3203125, + "learning_rate": 9.442157953281424e-06, + "loss": 2.3425, + "mean_token_accuracy": 0.4893809893809894, + "step": 3009 + }, + { + "epoch": 0.5580274378939563, + "grad_norm": 5.45703125, + "learning_rate": 9.441972562106045e-06, + "loss": 2.6553, + "mean_token_accuracy": 0.4653410660038663, + "step": 3010 + }, + { + "epoch": 0.5582128290693363, + "grad_norm": 5.81640625, + "learning_rate": 9.441787170930664e-06, + "loss": 3.0349, + "mean_token_accuracy": 0.3961129106894956, + "step": 3011 + }, + { + "epoch": 0.5583982202447163, + "grad_norm": 11.078125, + "learning_rate": 9.441601779755284e-06, + "loss": 2.738, + "mean_token_accuracy": 0.4572911122937733, + "step": 3012 + }, + { + "epoch": 0.5585836114200964, + "grad_norm": 8.1328125, + "learning_rate": 9.441416388579905e-06, + "loss": 3.2793, + "mean_token_accuracy": 0.39879518072289155, + "step": 3013 + }, + { + "epoch": 0.5587690025954765, + "grad_norm": 6.56640625, + "learning_rate": 9.441230997404525e-06, + "loss": 2.9748, + "mean_token_accuracy": 0.418001800180018, + "step": 3014 + }, + { + "epoch": 0.5589543937708565, + "grad_norm": 5.921875, + "learning_rate": 9.441045606229144e-06, + "loss": 3.1678, + "mean_token_accuracy": 0.3848596468944494, + "step": 3015 + }, + { + "epoch": 0.5591397849462365, + "grad_norm": 6.35546875, + "learning_rate": 9.440860215053764e-06, + "loss": 2.815, + "mean_token_accuracy": 0.4421269768507907, + "step": 3016 + }, + { + "epoch": 0.5593251761216166, + "grad_norm": 7.61328125, + "learning_rate": 9.440674823878383e-06, + "loss": 3.1755, + "mean_token_accuracy": 0.4004226954966672, + "step": 3017 + }, + { + "epoch": 0.5595105672969967, + "grad_norm": 9.4140625, + "learning_rate": 9.440489432703004e-06, + "loss": 2.7947, + "mean_token_accuracy": 0.4178360199466543, + "step": 3018 + }, + { + "epoch": 0.5596959584723767, + "grad_norm": 6.328125, + "learning_rate": 9.440304041527624e-06, + "loss": 2.8986, + "mean_token_accuracy": 0.43343792021515015, + "step": 3019 + }, + { + "epoch": 0.5598813496477568, + "grad_norm": 6.81640625, + "learning_rate": 9.440118650352243e-06, + "loss": 3.1107, + "mean_token_accuracy": 0.39828693790149894, + "step": 3020 + }, + { + "epoch": 0.5600667408231368, + "grad_norm": 6.640625, + "learning_rate": 9.439933259176865e-06, + "loss": 3.0485, + "mean_token_accuracy": 0.4369486892781712, + "step": 3021 + }, + { + "epoch": 0.5602521319985169, + "grad_norm": 5.734375, + "learning_rate": 9.439747868001484e-06, + "loss": 2.9654, + "mean_token_accuracy": 0.41912688442211055, + "step": 3022 + }, + { + "epoch": 0.560437523173897, + "grad_norm": 7.82421875, + "learning_rate": 9.439562476826105e-06, + "loss": 3.0825, + "mean_token_accuracy": 0.42078722510155486, + "step": 3023 + }, + { + "epoch": 0.560622914349277, + "grad_norm": 6.18359375, + "learning_rate": 9.439377085650723e-06, + "loss": 2.9724, + "mean_token_accuracy": 0.41904761904761906, + "step": 3024 + }, + { + "epoch": 0.560808305524657, + "grad_norm": 5.14453125, + "learning_rate": 9.439191694475344e-06, + "loss": 3.0113, + "mean_token_accuracy": 0.4302105661328962, + "step": 3025 + }, + { + "epoch": 0.5609936967000371, + "grad_norm": 5.43359375, + "learning_rate": 9.439006303299964e-06, + "loss": 2.8233, + "mean_token_accuracy": 0.44569871545348383, + "step": 3026 + }, + { + "epoch": 0.5611790878754171, + "grad_norm": 7.515625, + "learning_rate": 9.438820912124583e-06, + "loss": 2.7591, + "mean_token_accuracy": 0.4564459930313589, + "step": 3027 + }, + { + "epoch": 0.5613644790507972, + "grad_norm": 7.69921875, + "learning_rate": 9.438635520949204e-06, + "loss": 2.3939, + "mean_token_accuracy": 0.5017647728566549, + "step": 3028 + }, + { + "epoch": 0.5615498702261772, + "grad_norm": 6.22265625, + "learning_rate": 9.438450129773824e-06, + "loss": 2.9177, + "mean_token_accuracy": 0.4369124851052562, + "step": 3029 + }, + { + "epoch": 0.5617352614015573, + "grad_norm": 7.1796875, + "learning_rate": 9.438264738598445e-06, + "loss": 2.5704, + "mean_token_accuracy": 0.4731827262947595, + "step": 3030 + }, + { + "epoch": 0.5619206525769374, + "grad_norm": 4.796875, + "learning_rate": 9.438079347423063e-06, + "loss": 2.8088, + "mean_token_accuracy": 0.44042695130086723, + "step": 3031 + }, + { + "epoch": 0.5621060437523174, + "grad_norm": 5.140625, + "learning_rate": 9.437893956247684e-06, + "loss": 3.172, + "mean_token_accuracy": 0.40962025316455697, + "step": 3032 + }, + { + "epoch": 0.5622914349276974, + "grad_norm": 8.8046875, + "learning_rate": 9.437708565072303e-06, + "loss": 2.9087, + "mean_token_accuracy": 0.4396851122945126, + "step": 3033 + }, + { + "epoch": 0.5624768261030775, + "grad_norm": 6.28125, + "learning_rate": 9.437523173896923e-06, + "loss": 3.0916, + "mean_token_accuracy": 0.4091234091234091, + "step": 3034 + }, + { + "epoch": 0.5626622172784576, + "grad_norm": 4.9453125, + "learning_rate": 9.437337782721544e-06, + "loss": 2.9472, + "mean_token_accuracy": 0.4344988344988345, + "step": 3035 + }, + { + "epoch": 0.5628476084538376, + "grad_norm": 7.94140625, + "learning_rate": 9.437152391546162e-06, + "loss": 3.5935, + "mean_token_accuracy": 0.3597399959374365, + "step": 3036 + }, + { + "epoch": 0.5630329996292176, + "grad_norm": 9.0703125, + "learning_rate": 9.436967000370783e-06, + "loss": 3.0361, + "mean_token_accuracy": 0.425531914893617, + "step": 3037 + }, + { + "epoch": 0.5632183908045977, + "grad_norm": 6.4140625, + "learning_rate": 9.436781609195403e-06, + "loss": 2.7566, + "mean_token_accuracy": 0.4479725700655933, + "step": 3038 + }, + { + "epoch": 0.5634037819799778, + "grad_norm": 4.75, + "learning_rate": 9.436596218020024e-06, + "loss": 2.9319, + "mean_token_accuracy": 0.430103995621237, + "step": 3039 + }, + { + "epoch": 0.5635891731553578, + "grad_norm": 6.10546875, + "learning_rate": 9.436410826844643e-06, + "loss": 3.2311, + "mean_token_accuracy": 0.41284403669724773, + "step": 3040 + }, + { + "epoch": 0.5637745643307378, + "grad_norm": 6.05859375, + "learning_rate": 9.436225435669263e-06, + "loss": 2.6596, + "mean_token_accuracy": 0.44805485169175735, + "step": 3041 + }, + { + "epoch": 0.5639599555061179, + "grad_norm": 8.3515625, + "learning_rate": 9.436040044493882e-06, + "loss": 2.4301, + "mean_token_accuracy": 0.46982656212877166, + "step": 3042 + }, + { + "epoch": 0.564145346681498, + "grad_norm": 6.3515625, + "learning_rate": 9.435854653318502e-06, + "loss": 2.5456, + "mean_token_accuracy": 0.46542587566240146, + "step": 3043 + }, + { + "epoch": 0.564330737856878, + "grad_norm": 6.8828125, + "learning_rate": 9.435669262143123e-06, + "loss": 2.4448, + "mean_token_accuracy": 0.5168363351605325, + "step": 3044 + }, + { + "epoch": 0.5645161290322581, + "grad_norm": 7.12890625, + "learning_rate": 9.435483870967743e-06, + "loss": 2.7131, + "mean_token_accuracy": 0.4623695071619325, + "step": 3045 + }, + { + "epoch": 0.5647015202076381, + "grad_norm": 6.79296875, + "learning_rate": 9.435298479792362e-06, + "loss": 2.8111, + "mean_token_accuracy": 0.4423773460651959, + "step": 3046 + }, + { + "epoch": 0.5648869113830182, + "grad_norm": 5.36328125, + "learning_rate": 9.435113088616983e-06, + "loss": 3.3342, + "mean_token_accuracy": 0.4002873563218391, + "step": 3047 + }, + { + "epoch": 0.5650723025583982, + "grad_norm": 5.7890625, + "learning_rate": 9.434927697441603e-06, + "loss": 2.7687, + "mean_token_accuracy": 0.4469172932330827, + "step": 3048 + }, + { + "epoch": 0.5652576937337783, + "grad_norm": 9.8046875, + "learning_rate": 9.434742306266222e-06, + "loss": 2.4697, + "mean_token_accuracy": 0.4825788402848423, + "step": 3049 + }, + { + "epoch": 0.5654430849091583, + "grad_norm": 6.234375, + "learning_rate": 9.434556915090843e-06, + "loss": 2.7925, + "mean_token_accuracy": 0.43545302414535025, + "step": 3050 + }, + { + "epoch": 0.5656284760845384, + "grad_norm": 5.92578125, + "learning_rate": 9.434371523915461e-06, + "loss": 3.2016, + "mean_token_accuracy": 0.41889014155079546, + "step": 3051 + }, + { + "epoch": 0.5658138672599184, + "grad_norm": 5.90625, + "learning_rate": 9.434186132740082e-06, + "loss": 2.6429, + "mean_token_accuracy": 0.4780015753347586, + "step": 3052 + }, + { + "epoch": 0.5659992584352985, + "grad_norm": 7.63671875, + "learning_rate": 9.434000741564702e-06, + "loss": 2.3174, + "mean_token_accuracy": 0.47181788333150926, + "step": 3053 + }, + { + "epoch": 0.5661846496106785, + "grad_norm": 5.14453125, + "learning_rate": 9.433815350389323e-06, + "loss": 2.4469, + "mean_token_accuracy": 0.48305843242552104, + "step": 3054 + }, + { + "epoch": 0.5663700407860586, + "grad_norm": 5.12890625, + "learning_rate": 9.433629959213942e-06, + "loss": 2.4647, + "mean_token_accuracy": 0.48691174367043344, + "step": 3055 + }, + { + "epoch": 0.5665554319614386, + "grad_norm": 5.25, + "learning_rate": 9.433444568038562e-06, + "loss": 2.5619, + "mean_token_accuracy": 0.4682970012172181, + "step": 3056 + }, + { + "epoch": 0.5667408231368187, + "grad_norm": 5.8359375, + "learning_rate": 9.433259176863183e-06, + "loss": 2.6465, + "mean_token_accuracy": 0.46729328083322746, + "step": 3057 + }, + { + "epoch": 0.5669262143121987, + "grad_norm": 5.24609375, + "learning_rate": 9.433073785687801e-06, + "loss": 2.7637, + "mean_token_accuracy": 0.4496858694494729, + "step": 3058 + }, + { + "epoch": 0.5671116054875788, + "grad_norm": 5.82421875, + "learning_rate": 9.432888394512422e-06, + "loss": 2.3114, + "mean_token_accuracy": 0.5154596674453759, + "step": 3059 + }, + { + "epoch": 0.5672969966629589, + "grad_norm": 6.65625, + "learning_rate": 9.43270300333704e-06, + "loss": 2.9054, + "mean_token_accuracy": 0.42919094728282964, + "step": 3060 + }, + { + "epoch": 0.5674823878383389, + "grad_norm": 5.03125, + "learning_rate": 9.432517612161663e-06, + "loss": 3.0225, + "mean_token_accuracy": 0.41094117647058825, + "step": 3061 + }, + { + "epoch": 0.5676677790137189, + "grad_norm": 6.265625, + "learning_rate": 9.432332220986282e-06, + "loss": 2.4744, + "mean_token_accuracy": 0.47181405289874434, + "step": 3062 + }, + { + "epoch": 0.567853170189099, + "grad_norm": 7.41796875, + "learning_rate": 9.432146829810902e-06, + "loss": 2.8384, + "mean_token_accuracy": 0.4414233805287558, + "step": 3063 + }, + { + "epoch": 0.568038561364479, + "grad_norm": 7.90234375, + "learning_rate": 9.431961438635523e-06, + "loss": 3.2231, + "mean_token_accuracy": 0.4073697585768742, + "step": 3064 + }, + { + "epoch": 0.5682239525398591, + "grad_norm": 6.03515625, + "learning_rate": 9.431776047460141e-06, + "loss": 2.9904, + "mean_token_accuracy": 0.4246031746031746, + "step": 3065 + }, + { + "epoch": 0.5684093437152391, + "grad_norm": 6.94921875, + "learning_rate": 9.431590656284762e-06, + "loss": 3.1075, + "mean_token_accuracy": 0.41306808992398514, + "step": 3066 + }, + { + "epoch": 0.5685947348906192, + "grad_norm": 10.65625, + "learning_rate": 9.43140526510938e-06, + "loss": 2.9475, + "mean_token_accuracy": 0.41959101237061347, + "step": 3067 + }, + { + "epoch": 0.5687801260659993, + "grad_norm": 7.07421875, + "learning_rate": 9.431219873934001e-06, + "loss": 2.9984, + "mean_token_accuracy": 0.417858038625533, + "step": 3068 + }, + { + "epoch": 0.5689655172413793, + "grad_norm": 6.29296875, + "learning_rate": 9.431034482758622e-06, + "loss": 3.2892, + "mean_token_accuracy": 0.3775735919686915, + "step": 3069 + }, + { + "epoch": 0.5691509084167594, + "grad_norm": 6.75, + "learning_rate": 9.430849091583242e-06, + "loss": 2.4359, + "mean_token_accuracy": 0.47815592565773596, + "step": 3070 + }, + { + "epoch": 0.5693362995921394, + "grad_norm": 6.02734375, + "learning_rate": 9.430663700407861e-06, + "loss": 2.946, + "mean_token_accuracy": 0.42396166134185304, + "step": 3071 + }, + { + "epoch": 0.5695216907675195, + "grad_norm": 5.40625, + "learning_rate": 9.430478309232481e-06, + "loss": 2.2017, + "mean_token_accuracy": 0.5070720570219401, + "step": 3072 + }, + { + "epoch": 0.5697070819428995, + "grad_norm": 5.26953125, + "learning_rate": 9.430292918057102e-06, + "loss": 3.1126, + "mean_token_accuracy": 0.3952633728052266, + "step": 3073 + }, + { + "epoch": 0.5698924731182796, + "grad_norm": 6.08203125, + "learning_rate": 9.43010752688172e-06, + "loss": 2.5552, + "mean_token_accuracy": 0.4871575342465753, + "step": 3074 + }, + { + "epoch": 0.5700778642936596, + "grad_norm": 6.4609375, + "learning_rate": 9.429922135706341e-06, + "loss": 2.6921, + "mean_token_accuracy": 0.4547441058079356, + "step": 3075 + }, + { + "epoch": 0.5702632554690397, + "grad_norm": 5.91015625, + "learning_rate": 9.42973674453096e-06, + "loss": 2.5115, + "mean_token_accuracy": 0.4806201550387597, + "step": 3076 + }, + { + "epoch": 0.5704486466444197, + "grad_norm": 5.80859375, + "learning_rate": 9.429551353355582e-06, + "loss": 2.8453, + "mean_token_accuracy": 0.4449781193297043, + "step": 3077 + }, + { + "epoch": 0.5706340378197998, + "grad_norm": 6.28125, + "learning_rate": 9.429365962180201e-06, + "loss": 2.7013, + "mean_token_accuracy": 0.44952352590827876, + "step": 3078 + }, + { + "epoch": 0.5708194289951798, + "grad_norm": 8.234375, + "learning_rate": 9.429180571004822e-06, + "loss": 2.4455, + "mean_token_accuracy": 0.4864663256606991, + "step": 3079 + }, + { + "epoch": 0.5710048201705599, + "grad_norm": 9.0625, + "learning_rate": 9.42899517982944e-06, + "loss": 2.5219, + "mean_token_accuracy": 0.46737326012003577, + "step": 3080 + }, + { + "epoch": 0.57119021134594, + "grad_norm": 6.39453125, + "learning_rate": 9.42880978865406e-06, + "loss": 3.1202, + "mean_token_accuracy": 0.4183418579754966, + "step": 3081 + }, + { + "epoch": 0.57137560252132, + "grad_norm": 4.84375, + "learning_rate": 9.428624397478681e-06, + "loss": 2.5251, + "mean_token_accuracy": 0.4836318715256331, + "step": 3082 + }, + { + "epoch": 0.5715609936967, + "grad_norm": 7.4453125, + "learning_rate": 9.4284390063033e-06, + "loss": 3.7518, + "mean_token_accuracy": 0.3526537260757432, + "step": 3083 + }, + { + "epoch": 0.5717463848720801, + "grad_norm": 8.25, + "learning_rate": 9.42825361512792e-06, + "loss": 2.6698, + "mean_token_accuracy": 0.48333604291985044, + "step": 3084 + }, + { + "epoch": 0.5719317760474601, + "grad_norm": 5.54296875, + "learning_rate": 9.428068223952541e-06, + "loss": 2.6782, + "mean_token_accuracy": 0.4541989425796139, + "step": 3085 + }, + { + "epoch": 0.5721171672228402, + "grad_norm": 6.4609375, + "learning_rate": 9.427882832777162e-06, + "loss": 2.8835, + "mean_token_accuracy": 0.4227449888641425, + "step": 3086 + }, + { + "epoch": 0.5723025583982202, + "grad_norm": 7.18359375, + "learning_rate": 9.42769744160178e-06, + "loss": 2.543, + "mean_token_accuracy": 0.4923646459972235, + "step": 3087 + }, + { + "epoch": 0.5724879495736003, + "grad_norm": 5.51953125, + "learning_rate": 9.427512050426401e-06, + "loss": 3.0018, + "mean_token_accuracy": 0.43718455872323014, + "step": 3088 + }, + { + "epoch": 0.5726733407489804, + "grad_norm": 6.63671875, + "learning_rate": 9.42732665925102e-06, + "loss": 2.4993, + "mean_token_accuracy": 0.46388151460108984, + "step": 3089 + }, + { + "epoch": 0.5728587319243604, + "grad_norm": 5.53125, + "learning_rate": 9.42714126807564e-06, + "loss": 3.2172, + "mean_token_accuracy": 0.3991467576791809, + "step": 3090 + }, + { + "epoch": 0.5730441230997404, + "grad_norm": 8.890625, + "learning_rate": 9.42695587690026e-06, + "loss": 2.4552, + "mean_token_accuracy": 0.5087814988545871, + "step": 3091 + }, + { + "epoch": 0.5732295142751205, + "grad_norm": 6.484375, + "learning_rate": 9.42677048572488e-06, + "loss": 2.621, + "mean_token_accuracy": 0.45777655324424854, + "step": 3092 + }, + { + "epoch": 0.5734149054505006, + "grad_norm": 6.6015625, + "learning_rate": 9.4265850945495e-06, + "loss": 2.9773, + "mean_token_accuracy": 0.4281560041053712, + "step": 3093 + }, + { + "epoch": 0.5736002966258806, + "grad_norm": 6.46484375, + "learning_rate": 9.42639970337412e-06, + "loss": 3.3191, + "mean_token_accuracy": 0.4092181069958848, + "step": 3094 + }, + { + "epoch": 0.5737856878012607, + "grad_norm": 7.8203125, + "learning_rate": 9.426214312198741e-06, + "loss": 2.7018, + "mean_token_accuracy": 0.45648134914963556, + "step": 3095 + }, + { + "epoch": 0.5739710789766407, + "grad_norm": 5.60546875, + "learning_rate": 9.42602892102336e-06, + "loss": 3.103, + "mean_token_accuracy": 0.4109918055948008, + "step": 3096 + }, + { + "epoch": 0.5741564701520208, + "grad_norm": 5.66796875, + "learning_rate": 9.42584352984798e-06, + "loss": 2.5586, + "mean_token_accuracy": 0.4963943601334625, + "step": 3097 + }, + { + "epoch": 0.5743418613274008, + "grad_norm": 6.38671875, + "learning_rate": 9.425658138672599e-06, + "loss": 2.8668, + "mean_token_accuracy": 0.43670886075949367, + "step": 3098 + }, + { + "epoch": 0.5745272525027809, + "grad_norm": 4.734375, + "learning_rate": 9.42547274749722e-06, + "loss": 2.8494, + "mean_token_accuracy": 0.43486114247008356, + "step": 3099 + }, + { + "epoch": 0.5747126436781609, + "grad_norm": 9.703125, + "learning_rate": 9.42528735632184e-06, + "loss": 2.6443, + "mean_token_accuracy": 0.47657945118059986, + "step": 3100 + }, + { + "epoch": 0.574898034853541, + "grad_norm": 7.10546875, + "learning_rate": 9.425101965146459e-06, + "loss": 3.0981, + "mean_token_accuracy": 0.4058312472389928, + "step": 3101 + }, + { + "epoch": 0.575083426028921, + "grad_norm": 7.1171875, + "learning_rate": 9.424916573971081e-06, + "loss": 2.6452, + "mean_token_accuracy": 0.4586416707778326, + "step": 3102 + }, + { + "epoch": 0.5752688172043011, + "grad_norm": 8.265625, + "learning_rate": 9.4247311827957e-06, + "loss": 2.6154, + "mean_token_accuracy": 0.45869731800766284, + "step": 3103 + }, + { + "epoch": 0.5754542083796811, + "grad_norm": 6.46484375, + "learning_rate": 9.42454579162032e-06, + "loss": 2.8656, + "mean_token_accuracy": 0.42342899554675906, + "step": 3104 + }, + { + "epoch": 0.5756395995550612, + "grad_norm": 5.28515625, + "learning_rate": 9.424360400444939e-06, + "loss": 3.1534, + "mean_token_accuracy": 0.41033966033966035, + "step": 3105 + }, + { + "epoch": 0.5758249907304412, + "grad_norm": 6.9921875, + "learning_rate": 9.42417500926956e-06, + "loss": 3.0204, + "mean_token_accuracy": 0.41279450261780104, + "step": 3106 + }, + { + "epoch": 0.5760103819058213, + "grad_norm": 5.96484375, + "learning_rate": 9.42398961809418e-06, + "loss": 2.4845, + "mean_token_accuracy": 0.4784120219804867, + "step": 3107 + }, + { + "epoch": 0.5761957730812013, + "grad_norm": 5.765625, + "learning_rate": 9.423804226918799e-06, + "loss": 3.2799, + "mean_token_accuracy": 0.4043154761904762, + "step": 3108 + }, + { + "epoch": 0.5763811642565814, + "grad_norm": 5.36328125, + "learning_rate": 9.42361883574342e-06, + "loss": 2.9894, + "mean_token_accuracy": 0.4227910817506193, + "step": 3109 + }, + { + "epoch": 0.5765665554319614, + "grad_norm": 6.2890625, + "learning_rate": 9.42343344456804e-06, + "loss": 2.8693, + "mean_token_accuracy": 0.429007245455701, + "step": 3110 + }, + { + "epoch": 0.5767519466073415, + "grad_norm": 4.91796875, + "learning_rate": 9.42324805339266e-06, + "loss": 3.0737, + "mean_token_accuracy": 0.4101744573178852, + "step": 3111 + }, + { + "epoch": 0.5769373377827215, + "grad_norm": 6.09375, + "learning_rate": 9.423062662217279e-06, + "loss": 3.3594, + "mean_token_accuracy": 0.38680348097155476, + "step": 3112 + }, + { + "epoch": 0.5771227289581016, + "grad_norm": 6.92578125, + "learning_rate": 9.4228772710419e-06, + "loss": 2.4139, + "mean_token_accuracy": 0.49304463529696385, + "step": 3113 + }, + { + "epoch": 0.5773081201334817, + "grad_norm": 6.859375, + "learning_rate": 9.422691879866518e-06, + "loss": 2.9333, + "mean_token_accuracy": 0.420321086089861, + "step": 3114 + }, + { + "epoch": 0.5774935113088617, + "grad_norm": 5.953125, + "learning_rate": 9.422506488691139e-06, + "loss": 3.0873, + "mean_token_accuracy": 0.41862627872990565, + "step": 3115 + }, + { + "epoch": 0.5776789024842417, + "grad_norm": 8.640625, + "learning_rate": 9.42232109751576e-06, + "loss": 2.8899, + "mean_token_accuracy": 0.4374440131382502, + "step": 3116 + }, + { + "epoch": 0.5778642936596218, + "grad_norm": 6.23828125, + "learning_rate": 9.422135706340378e-06, + "loss": 2.745, + "mean_token_accuracy": 0.4514747276955085, + "step": 3117 + }, + { + "epoch": 0.5780496848350019, + "grad_norm": 6.96484375, + "learning_rate": 9.421950315164999e-06, + "loss": 3.0952, + "mean_token_accuracy": 0.4092219020172911, + "step": 3118 + }, + { + "epoch": 0.5782350760103819, + "grad_norm": 8.5703125, + "learning_rate": 9.42176492398962e-06, + "loss": 3.1313, + "mean_token_accuracy": 0.41048436541998773, + "step": 3119 + }, + { + "epoch": 0.578420467185762, + "grad_norm": 10.3515625, + "learning_rate": 9.42157953281424e-06, + "loss": 2.6825, + "mean_token_accuracy": 0.45455987311657414, + "step": 3120 + }, + { + "epoch": 0.578605858361142, + "grad_norm": 7.3515625, + "learning_rate": 9.421394141638858e-06, + "loss": 2.8339, + "mean_token_accuracy": 0.4226912138901362, + "step": 3121 + }, + { + "epoch": 0.5787912495365221, + "grad_norm": 5.71875, + "learning_rate": 9.421208750463479e-06, + "loss": 3.2622, + "mean_token_accuracy": 0.39472198701665256, + "step": 3122 + }, + { + "epoch": 0.5789766407119021, + "grad_norm": 8.0625, + "learning_rate": 9.421023359288098e-06, + "loss": 2.5602, + "mean_token_accuracy": 0.4640846271967241, + "step": 3123 + }, + { + "epoch": 0.5791620318872822, + "grad_norm": 10.1328125, + "learning_rate": 9.420837968112718e-06, + "loss": 2.4376, + "mean_token_accuracy": 0.47997032640949555, + "step": 3124 + }, + { + "epoch": 0.5793474230626622, + "grad_norm": 8.3359375, + "learning_rate": 9.420652576937339e-06, + "loss": 2.4903, + "mean_token_accuracy": 0.4672224327396741, + "step": 3125 + }, + { + "epoch": 0.5795328142380423, + "grad_norm": 6.12890625, + "learning_rate": 9.42046718576196e-06, + "loss": 3.4373, + "mean_token_accuracy": 0.37851057115809406, + "step": 3126 + }, + { + "epoch": 0.5797182054134223, + "grad_norm": 7.875, + "learning_rate": 9.420281794586578e-06, + "loss": 3.1728, + "mean_token_accuracy": 0.4037108125399872, + "step": 3127 + }, + { + "epoch": 0.5799035965888024, + "grad_norm": 6.1953125, + "learning_rate": 9.420096403411198e-06, + "loss": 3.1119, + "mean_token_accuracy": 0.41893115942028986, + "step": 3128 + }, + { + "epoch": 0.5800889877641824, + "grad_norm": 6.67578125, + "learning_rate": 9.419911012235819e-06, + "loss": 2.9689, + "mean_token_accuracy": 0.4304301646309081, + "step": 3129 + }, + { + "epoch": 0.5802743789395625, + "grad_norm": 6.609375, + "learning_rate": 9.419725621060438e-06, + "loss": 3.0626, + "mean_token_accuracy": 0.39611964430072755, + "step": 3130 + }, + { + "epoch": 0.5804597701149425, + "grad_norm": 10.2578125, + "learning_rate": 9.419540229885058e-06, + "loss": 3.0787, + "mean_token_accuracy": 0.4143338517366511, + "step": 3131 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 7.58984375, + "learning_rate": 9.419354838709677e-06, + "loss": 2.5569, + "mean_token_accuracy": 0.4489263257800451, + "step": 3132 + }, + { + "epoch": 0.5808305524657026, + "grad_norm": 6.14453125, + "learning_rate": 9.419169447534298e-06, + "loss": 2.4433, + "mean_token_accuracy": 0.48450106157112527, + "step": 3133 + }, + { + "epoch": 0.5810159436410827, + "grad_norm": 8.0234375, + "learning_rate": 9.418984056358918e-06, + "loss": 2.9791, + "mean_token_accuracy": 0.4270921131848284, + "step": 3134 + }, + { + "epoch": 0.5812013348164627, + "grad_norm": 6.51953125, + "learning_rate": 9.418798665183539e-06, + "loss": 2.679, + "mean_token_accuracy": 0.4571132800946466, + "step": 3135 + }, + { + "epoch": 0.5813867259918428, + "grad_norm": 7.90234375, + "learning_rate": 9.418613274008157e-06, + "loss": 2.878, + "mean_token_accuracy": 0.43769416637901276, + "step": 3136 + }, + { + "epoch": 0.5815721171672228, + "grad_norm": 5.34375, + "learning_rate": 9.418427882832778e-06, + "loss": 2.7799, + "mean_token_accuracy": 0.4715629198387819, + "step": 3137 + }, + { + "epoch": 0.5817575083426029, + "grad_norm": 5.83984375, + "learning_rate": 9.418242491657398e-06, + "loss": 2.9228, + "mean_token_accuracy": 0.42862664261788197, + "step": 3138 + }, + { + "epoch": 0.581942899517983, + "grad_norm": 7.40625, + "learning_rate": 9.418057100482017e-06, + "loss": 2.633, + "mean_token_accuracy": 0.45174825174825173, + "step": 3139 + }, + { + "epoch": 0.582128290693363, + "grad_norm": 5.8984375, + "learning_rate": 9.417871709306638e-06, + "loss": 3.0056, + "mean_token_accuracy": 0.43594380303241065, + "step": 3140 + }, + { + "epoch": 0.582313681868743, + "grad_norm": 5.59765625, + "learning_rate": 9.417686318131256e-06, + "loss": 3.3847, + "mean_token_accuracy": 0.4069647905422771, + "step": 3141 + }, + { + "epoch": 0.5824990730441231, + "grad_norm": 8.5703125, + "learning_rate": 9.417500926955879e-06, + "loss": 2.718, + "mean_token_accuracy": 0.4511930585683297, + "step": 3142 + }, + { + "epoch": 0.5826844642195032, + "grad_norm": 7.01171875, + "learning_rate": 9.417315535780497e-06, + "loss": 2.7042, + "mean_token_accuracy": 0.43966395112016293, + "step": 3143 + }, + { + "epoch": 0.5828698553948832, + "grad_norm": 7.0390625, + "learning_rate": 9.417130144605118e-06, + "loss": 2.6724, + "mean_token_accuracy": 0.45600246571120356, + "step": 3144 + }, + { + "epoch": 0.5830552465702633, + "grad_norm": 7.95703125, + "learning_rate": 9.416944753429738e-06, + "loss": 2.5946, + "mean_token_accuracy": 0.4699317226890756, + "step": 3145 + }, + { + "epoch": 0.5832406377456433, + "grad_norm": 13.1796875, + "learning_rate": 9.416759362254357e-06, + "loss": 2.614, + "mean_token_accuracy": 0.4659346144247567, + "step": 3146 + }, + { + "epoch": 0.5834260289210234, + "grad_norm": 7.125, + "learning_rate": 9.416573971078978e-06, + "loss": 2.941, + "mean_token_accuracy": 0.425314333612741, + "step": 3147 + }, + { + "epoch": 0.5836114200964034, + "grad_norm": 5.44140625, + "learning_rate": 9.416388579903596e-06, + "loss": 2.9847, + "mean_token_accuracy": 0.4335957126109529, + "step": 3148 + }, + { + "epoch": 0.5837968112717835, + "grad_norm": 8.1796875, + "learning_rate": 9.416203188728217e-06, + "loss": 2.8809, + "mean_token_accuracy": 0.4257296009529482, + "step": 3149 + }, + { + "epoch": 0.5839822024471635, + "grad_norm": 7.1171875, + "learning_rate": 9.416017797552837e-06, + "loss": 3.099, + "mean_token_accuracy": 0.4249722706385676, + "step": 3150 + }, + { + "epoch": 0.5841675936225436, + "grad_norm": 5.73828125, + "learning_rate": 9.415832406377458e-06, + "loss": 2.13, + "mean_token_accuracy": 0.5275523797989584, + "step": 3151 + }, + { + "epoch": 0.5843529847979236, + "grad_norm": 5.15234375, + "learning_rate": 9.415647015202077e-06, + "loss": 3.1375, + "mean_token_accuracy": 0.4089178901576944, + "step": 3152 + }, + { + "epoch": 0.5845383759733037, + "grad_norm": 5.16015625, + "learning_rate": 9.415461624026697e-06, + "loss": 3.0362, + "mean_token_accuracy": 0.4301772589710333, + "step": 3153 + }, + { + "epoch": 0.5847237671486837, + "grad_norm": 5.859375, + "learning_rate": 9.415276232851318e-06, + "loss": 3.0178, + "mean_token_accuracy": 0.42737763629128533, + "step": 3154 + }, + { + "epoch": 0.5849091583240638, + "grad_norm": 4.859375, + "learning_rate": 9.415090841675937e-06, + "loss": 2.5903, + "mean_token_accuracy": 0.4624968217645563, + "step": 3155 + }, + { + "epoch": 0.5850945494994438, + "grad_norm": 7.86328125, + "learning_rate": 9.414905450500557e-06, + "loss": 2.8503, + "mean_token_accuracy": 0.4461461628100549, + "step": 3156 + }, + { + "epoch": 0.5852799406748239, + "grad_norm": 7.390625, + "learning_rate": 9.414720059325176e-06, + "loss": 3.2267, + "mean_token_accuracy": 0.38434163701067614, + "step": 3157 + }, + { + "epoch": 0.5854653318502039, + "grad_norm": 6.2109375, + "learning_rate": 9.414534668149798e-06, + "loss": 3.1227, + "mean_token_accuracy": 0.40094665220392467, + "step": 3158 + }, + { + "epoch": 0.585650723025584, + "grad_norm": 5.16796875, + "learning_rate": 9.414349276974417e-06, + "loss": 2.7075, + "mean_token_accuracy": 0.44745296007342816, + "step": 3159 + }, + { + "epoch": 0.585836114200964, + "grad_norm": 6.80078125, + "learning_rate": 9.414163885799037e-06, + "loss": 2.695, + "mean_token_accuracy": 0.45565845236226654, + "step": 3160 + }, + { + "epoch": 0.5860215053763441, + "grad_norm": 5.59375, + "learning_rate": 9.413978494623656e-06, + "loss": 2.7929, + "mean_token_accuracy": 0.4571939688218758, + "step": 3161 + }, + { + "epoch": 0.5862068965517241, + "grad_norm": 7.390625, + "learning_rate": 9.413793103448277e-06, + "loss": 2.9735, + "mean_token_accuracy": 0.4155455904334828, + "step": 3162 + }, + { + "epoch": 0.5863922877271042, + "grad_norm": 7.06640625, + "learning_rate": 9.413607712272897e-06, + "loss": 2.8829, + "mean_token_accuracy": 0.42983074753173484, + "step": 3163 + }, + { + "epoch": 0.5865776789024842, + "grad_norm": 5.98046875, + "learning_rate": 9.413422321097516e-06, + "loss": 3.0941, + "mean_token_accuracy": 0.40365177698076293, + "step": 3164 + }, + { + "epoch": 0.5867630700778643, + "grad_norm": 5.37109375, + "learning_rate": 9.413236929922136e-06, + "loss": 2.9516, + "mean_token_accuracy": 0.44508346191611725, + "step": 3165 + }, + { + "epoch": 0.5869484612532443, + "grad_norm": 5.265625, + "learning_rate": 9.413051538746757e-06, + "loss": 2.9248, + "mean_token_accuracy": 0.4228206945428774, + "step": 3166 + }, + { + "epoch": 0.5871338524286244, + "grad_norm": 6.24609375, + "learning_rate": 9.412866147571377e-06, + "loss": 2.9648, + "mean_token_accuracy": 0.41889450867052025, + "step": 3167 + }, + { + "epoch": 0.5873192436040044, + "grad_norm": 7.1171875, + "learning_rate": 9.412680756395996e-06, + "loss": 2.7879, + "mean_token_accuracy": 0.4654662725073905, + "step": 3168 + }, + { + "epoch": 0.5875046347793845, + "grad_norm": 9.0703125, + "learning_rate": 9.412495365220617e-06, + "loss": 2.644, + "mean_token_accuracy": 0.4652708541043536, + "step": 3169 + }, + { + "epoch": 0.5876900259547646, + "grad_norm": 6.83984375, + "learning_rate": 9.412309974045235e-06, + "loss": 2.8828, + "mean_token_accuracy": 0.4394069004847448, + "step": 3170 + }, + { + "epoch": 0.5878754171301446, + "grad_norm": 5.2421875, + "learning_rate": 9.412124582869856e-06, + "loss": 2.6269, + "mean_token_accuracy": 0.46454767726161367, + "step": 3171 + }, + { + "epoch": 0.5880608083055247, + "grad_norm": 6.3125, + "learning_rate": 9.411939191694476e-06, + "loss": 3.0064, + "mean_token_accuracy": 0.42638785691540176, + "step": 3172 + }, + { + "epoch": 0.5882461994809047, + "grad_norm": 5.69140625, + "learning_rate": 9.411753800519095e-06, + "loss": 2.785, + "mean_token_accuracy": 0.459290823314564, + "step": 3173 + }, + { + "epoch": 0.5884315906562848, + "grad_norm": 5.109375, + "learning_rate": 9.411568409343716e-06, + "loss": 2.736, + "mean_token_accuracy": 0.4481346678798908, + "step": 3174 + }, + { + "epoch": 0.5886169818316648, + "grad_norm": 6.171875, + "learning_rate": 9.411383018168336e-06, + "loss": 2.9975, + "mean_token_accuracy": 0.433127424220658, + "step": 3175 + }, + { + "epoch": 0.5888023730070449, + "grad_norm": 5.63671875, + "learning_rate": 9.411197626992957e-06, + "loss": 3.1198, + "mean_token_accuracy": 0.40941879637262985, + "step": 3176 + }, + { + "epoch": 0.5889877641824249, + "grad_norm": 4.984375, + "learning_rate": 9.411012235817575e-06, + "loss": 2.9452, + "mean_token_accuracy": 0.4280962128966223, + "step": 3177 + }, + { + "epoch": 0.589173155357805, + "grad_norm": 5.953125, + "learning_rate": 9.410826844642196e-06, + "loss": 2.4706, + "mean_token_accuracy": 0.4766761661916904, + "step": 3178 + }, + { + "epoch": 0.589358546533185, + "grad_norm": 6.5546875, + "learning_rate": 9.410641453466815e-06, + "loss": 2.3804, + "mean_token_accuracy": 0.5072129255626082, + "step": 3179 + }, + { + "epoch": 0.5895439377085651, + "grad_norm": 7.2734375, + "learning_rate": 9.410456062291435e-06, + "loss": 2.8843, + "mean_token_accuracy": 0.43477728830151735, + "step": 3180 + }, + { + "epoch": 0.5897293288839451, + "grad_norm": 5.76171875, + "learning_rate": 9.410270671116056e-06, + "loss": 2.6801, + "mean_token_accuracy": 0.4577025823686554, + "step": 3181 + }, + { + "epoch": 0.5899147200593252, + "grad_norm": 5.390625, + "learning_rate": 9.410085279940676e-06, + "loss": 3.212, + "mean_token_accuracy": 0.3875552747946936, + "step": 3182 + }, + { + "epoch": 0.5901001112347052, + "grad_norm": 5.1328125, + "learning_rate": 9.409899888765297e-06, + "loss": 2.9633, + "mean_token_accuracy": 0.41951912827194127, + "step": 3183 + }, + { + "epoch": 0.5902855024100853, + "grad_norm": 7.2578125, + "learning_rate": 9.409714497589916e-06, + "loss": 2.8132, + "mean_token_accuracy": 0.42996997780968543, + "step": 3184 + }, + { + "epoch": 0.5904708935854653, + "grad_norm": 6.26953125, + "learning_rate": 9.409529106414536e-06, + "loss": 3.0153, + "mean_token_accuracy": 0.4072061767229053, + "step": 3185 + }, + { + "epoch": 0.5906562847608454, + "grad_norm": 5.76953125, + "learning_rate": 9.409343715239155e-06, + "loss": 2.6653, + "mean_token_accuracy": 0.45952178662459037, + "step": 3186 + }, + { + "epoch": 0.5908416759362254, + "grad_norm": 8.0546875, + "learning_rate": 9.409158324063775e-06, + "loss": 2.3456, + "mean_token_accuracy": 0.47361751152073733, + "step": 3187 + }, + { + "epoch": 0.5910270671116055, + "grad_norm": 7.6875, + "learning_rate": 9.408972932888396e-06, + "loss": 3.007, + "mean_token_accuracy": 0.42670001190901513, + "step": 3188 + }, + { + "epoch": 0.5912124582869855, + "grad_norm": 7.1171875, + "learning_rate": 9.408787541713015e-06, + "loss": 3.3277, + "mean_token_accuracy": 0.41493368265924346, + "step": 3189 + }, + { + "epoch": 0.5913978494623656, + "grad_norm": 4.9921875, + "learning_rate": 9.408602150537635e-06, + "loss": 3.1807, + "mean_token_accuracy": 0.4102823857160947, + "step": 3190 + }, + { + "epoch": 0.5915832406377456, + "grad_norm": 7.2890625, + "learning_rate": 9.408416759362256e-06, + "loss": 2.5719, + "mean_token_accuracy": 0.46932750136686713, + "step": 3191 + }, + { + "epoch": 0.5917686318131257, + "grad_norm": 8.3671875, + "learning_rate": 9.408231368186876e-06, + "loss": 2.7128, + "mean_token_accuracy": 0.4530053754683173, + "step": 3192 + }, + { + "epoch": 0.5919540229885057, + "grad_norm": 5.2421875, + "learning_rate": 9.408045977011495e-06, + "loss": 2.9932, + "mean_token_accuracy": 0.42836106611691943, + "step": 3193 + }, + { + "epoch": 0.5921394141638858, + "grad_norm": 7.671875, + "learning_rate": 9.407860585836115e-06, + "loss": 3.129, + "mean_token_accuracy": 0.40983031012287885, + "step": 3194 + }, + { + "epoch": 0.5923248053392659, + "grad_norm": 6.48828125, + "learning_rate": 9.407675194660734e-06, + "loss": 2.7888, + "mean_token_accuracy": 0.45134032634032634, + "step": 3195 + }, + { + "epoch": 0.5925101965146459, + "grad_norm": 6.671875, + "learning_rate": 9.407489803485355e-06, + "loss": 2.9888, + "mean_token_accuracy": 0.41494845360824745, + "step": 3196 + }, + { + "epoch": 0.592695587690026, + "grad_norm": 6.5, + "learning_rate": 9.407304412309975e-06, + "loss": 2.6612, + "mean_token_accuracy": 0.4678086237448317, + "step": 3197 + }, + { + "epoch": 0.592880978865406, + "grad_norm": 5.296875, + "learning_rate": 9.407119021134596e-06, + "loss": 2.7146, + "mean_token_accuracy": 0.4420306965761511, + "step": 3198 + }, + { + "epoch": 0.5930663700407861, + "grad_norm": 10.25, + "learning_rate": 9.406933629959214e-06, + "loss": 2.7925, + "mean_token_accuracy": 0.43722078532800374, + "step": 3199 + }, + { + "epoch": 0.5932517612161661, + "grad_norm": 10.703125, + "learning_rate": 9.406748238783835e-06, + "loss": 2.5244, + "mean_token_accuracy": 0.470515014781391, + "step": 3200 + }, + { + "epoch": 0.5934371523915462, + "grad_norm": 6.234375, + "learning_rate": 9.406562847608455e-06, + "loss": 2.39, + "mean_token_accuracy": 0.489131902254624, + "step": 3201 + }, + { + "epoch": 0.5936225435669262, + "grad_norm": 8.5859375, + "learning_rate": 9.406377456433074e-06, + "loss": 2.4699, + "mean_token_accuracy": 0.4670863706648995, + "step": 3202 + }, + { + "epoch": 0.5938079347423063, + "grad_norm": 8.640625, + "learning_rate": 9.406192065257695e-06, + "loss": 3.0087, + "mean_token_accuracy": 0.42319375135604254, + "step": 3203 + }, + { + "epoch": 0.5939933259176863, + "grad_norm": 8.109375, + "learning_rate": 9.406006674082313e-06, + "loss": 3.024, + "mean_token_accuracy": 0.4209846650524617, + "step": 3204 + }, + { + "epoch": 0.5941787170930664, + "grad_norm": 5.328125, + "learning_rate": 9.405821282906934e-06, + "loss": 2.8248, + "mean_token_accuracy": 0.45910360420268653, + "step": 3205 + }, + { + "epoch": 0.5943641082684464, + "grad_norm": 5.72265625, + "learning_rate": 9.405635891731554e-06, + "loss": 3.0778, + "mean_token_accuracy": 0.3947733333333333, + "step": 3206 + }, + { + "epoch": 0.5945494994438265, + "grad_norm": 6.28125, + "learning_rate": 9.405450500556175e-06, + "loss": 2.4583, + "mean_token_accuracy": 0.4950823177250374, + "step": 3207 + }, + { + "epoch": 0.5947348906192065, + "grad_norm": 6.96875, + "learning_rate": 9.405265109380794e-06, + "loss": 3.269, + "mean_token_accuracy": 0.3958907887479316, + "step": 3208 + }, + { + "epoch": 0.5949202817945866, + "grad_norm": 6.640625, + "learning_rate": 9.405079718205414e-06, + "loss": 3.2785, + "mean_token_accuracy": 0.4062180143295803, + "step": 3209 + }, + { + "epoch": 0.5951056729699666, + "grad_norm": 6.359375, + "learning_rate": 9.404894327030035e-06, + "loss": 2.9842, + "mean_token_accuracy": 0.4102499048585564, + "step": 3210 + }, + { + "epoch": 0.5952910641453467, + "grad_norm": 5.4453125, + "learning_rate": 9.404708935854654e-06, + "loss": 2.7399, + "mean_token_accuracy": 0.450886370241209, + "step": 3211 + }, + { + "epoch": 0.5954764553207267, + "grad_norm": 5.25390625, + "learning_rate": 9.404523544679274e-06, + "loss": 3.0271, + "mean_token_accuracy": 0.41641375821952453, + "step": 3212 + }, + { + "epoch": 0.5956618464961068, + "grad_norm": 5.953125, + "learning_rate": 9.404338153503893e-06, + "loss": 2.786, + "mean_token_accuracy": 0.44434735117422175, + "step": 3213 + }, + { + "epoch": 0.5958472376714868, + "grad_norm": 6.70703125, + "learning_rate": 9.404152762328515e-06, + "loss": 2.2414, + "mean_token_accuracy": 0.5388317448334434, + "step": 3214 + }, + { + "epoch": 0.5960326288468669, + "grad_norm": 6.76171875, + "learning_rate": 9.403967371153134e-06, + "loss": 2.7463, + "mean_token_accuracy": 0.45646088109621336, + "step": 3215 + }, + { + "epoch": 0.5962180200222469, + "grad_norm": 6.57421875, + "learning_rate": 9.403781979977754e-06, + "loss": 1.7965, + "mean_token_accuracy": 0.5882433169667213, + "step": 3216 + }, + { + "epoch": 0.596403411197627, + "grad_norm": 7.59765625, + "learning_rate": 9.403596588802373e-06, + "loss": 2.9055, + "mean_token_accuracy": 0.4407884279128914, + "step": 3217 + }, + { + "epoch": 0.596588802373007, + "grad_norm": 5.9609375, + "learning_rate": 9.403411197626994e-06, + "loss": 3.2324, + "mean_token_accuracy": 0.4034789987271956, + "step": 3218 + }, + { + "epoch": 0.5967741935483871, + "grad_norm": 5.55078125, + "learning_rate": 9.403225806451614e-06, + "loss": 2.8423, + "mean_token_accuracy": 0.43194422437014734, + "step": 3219 + }, + { + "epoch": 0.5969595847237672, + "grad_norm": 8.53125, + "learning_rate": 9.403040415276233e-06, + "loss": 2.2564, + "mean_token_accuracy": 0.5041362530413626, + "step": 3220 + }, + { + "epoch": 0.5971449758991472, + "grad_norm": 7.5859375, + "learning_rate": 9.402855024100853e-06, + "loss": 2.3895, + "mean_token_accuracy": 0.477368290873114, + "step": 3221 + }, + { + "epoch": 0.5973303670745272, + "grad_norm": 5.11328125, + "learning_rate": 9.402669632925472e-06, + "loss": 2.7984, + "mean_token_accuracy": 0.44868248653370213, + "step": 3222 + }, + { + "epoch": 0.5975157582499073, + "grad_norm": 5.51953125, + "learning_rate": 9.402484241750094e-06, + "loss": 2.9314, + "mean_token_accuracy": 0.4287475345167653, + "step": 3223 + }, + { + "epoch": 0.5977011494252874, + "grad_norm": 9.75, + "learning_rate": 9.402298850574713e-06, + "loss": 2.9953, + "mean_token_accuracy": 0.42247493274639275, + "step": 3224 + }, + { + "epoch": 0.5978865406006674, + "grad_norm": 7.171875, + "learning_rate": 9.402113459399334e-06, + "loss": 2.4435, + "mean_token_accuracy": 0.4856185745639367, + "step": 3225 + }, + { + "epoch": 0.5980719317760474, + "grad_norm": 7.3984375, + "learning_rate": 9.401928068223954e-06, + "loss": 2.6402, + "mean_token_accuracy": 0.45655426401196914, + "step": 3226 + }, + { + "epoch": 0.5982573229514275, + "grad_norm": 4.9765625, + "learning_rate": 9.401742677048573e-06, + "loss": 3.177, + "mean_token_accuracy": 0.4127769919849128, + "step": 3227 + }, + { + "epoch": 0.5984427141268076, + "grad_norm": 5.859375, + "learning_rate": 9.401557285873193e-06, + "loss": 2.7735, + "mean_token_accuracy": 0.463007840342124, + "step": 3228 + }, + { + "epoch": 0.5986281053021876, + "grad_norm": 7.21484375, + "learning_rate": 9.401371894697812e-06, + "loss": 2.0978, + "mean_token_accuracy": 0.5410216718266254, + "step": 3229 + }, + { + "epoch": 0.5988134964775677, + "grad_norm": 7.84765625, + "learning_rate": 9.401186503522433e-06, + "loss": 2.8163, + "mean_token_accuracy": 0.47610241820768134, + "step": 3230 + }, + { + "epoch": 0.5989988876529477, + "grad_norm": 4.95703125, + "learning_rate": 9.401001112347053e-06, + "loss": 3.2793, + "mean_token_accuracy": 0.3946295037389531, + "step": 3231 + }, + { + "epoch": 0.5991842788283278, + "grad_norm": 6.203125, + "learning_rate": 9.400815721171674e-06, + "loss": 2.4602, + "mean_token_accuracy": 0.46789871594863797, + "step": 3232 + }, + { + "epoch": 0.5993696700037078, + "grad_norm": 6.484375, + "learning_rate": 9.400630329996292e-06, + "loss": 2.8624, + "mean_token_accuracy": 0.4425796220430818, + "step": 3233 + }, + { + "epoch": 0.5995550611790879, + "grad_norm": 6.35546875, + "learning_rate": 9.400444938820913e-06, + "loss": 3.099, + "mean_token_accuracy": 0.4287498438865992, + "step": 3234 + }, + { + "epoch": 0.5997404523544679, + "grad_norm": 6.37890625, + "learning_rate": 9.400259547645533e-06, + "loss": 2.6663, + "mean_token_accuracy": 0.45685347738014853, + "step": 3235 + }, + { + "epoch": 0.599925843529848, + "grad_norm": 12.265625, + "learning_rate": 9.400074156470152e-06, + "loss": 3.1976, + "mean_token_accuracy": 0.37873172740374716, + "step": 3236 + }, + { + "epoch": 0.600111234705228, + "grad_norm": 8.203125, + "learning_rate": 9.399888765294773e-06, + "loss": 3.3292, + "mean_token_accuracy": 0.3841243862520458, + "step": 3237 + }, + { + "epoch": 0.6002966258806081, + "grad_norm": 8.546875, + "learning_rate": 9.399703374119392e-06, + "loss": 2.8227, + "mean_token_accuracy": 0.4499832346037778, + "step": 3238 + }, + { + "epoch": 0.6004820170559881, + "grad_norm": 6.89453125, + "learning_rate": 9.399517982944014e-06, + "loss": 2.9902, + "mean_token_accuracy": 0.4328996918863403, + "step": 3239 + }, + { + "epoch": 0.6006674082313682, + "grad_norm": 8.5859375, + "learning_rate": 9.399332591768633e-06, + "loss": 2.6093, + "mean_token_accuracy": 0.4751387226634751, + "step": 3240 + }, + { + "epoch": 0.6008527994067482, + "grad_norm": 8.484375, + "learning_rate": 9.399147200593253e-06, + "loss": 2.7762, + "mean_token_accuracy": 0.4507522878858384, + "step": 3241 + }, + { + "epoch": 0.6010381905821283, + "grad_norm": 6.5625, + "learning_rate": 9.398961809417872e-06, + "loss": 2.8965, + "mean_token_accuracy": 0.4195001125872551, + "step": 3242 + }, + { + "epoch": 0.6012235817575083, + "grad_norm": 7.34765625, + "learning_rate": 9.398776418242492e-06, + "loss": 2.8068, + "mean_token_accuracy": 0.43954829998772554, + "step": 3243 + }, + { + "epoch": 0.6014089729328884, + "grad_norm": 6.53125, + "learning_rate": 9.398591027067113e-06, + "loss": 2.3951, + "mean_token_accuracy": 0.4932870719565545, + "step": 3244 + }, + { + "epoch": 0.6015943641082685, + "grad_norm": 7.41015625, + "learning_rate": 9.398405635891732e-06, + "loss": 2.7258, + "mean_token_accuracy": 0.45349918623575913, + "step": 3245 + }, + { + "epoch": 0.6017797552836485, + "grad_norm": 7.03125, + "learning_rate": 9.398220244716352e-06, + "loss": 2.2905, + "mean_token_accuracy": 0.49455469506292354, + "step": 3246 + }, + { + "epoch": 0.6019651464590285, + "grad_norm": 7.0859375, + "learning_rate": 9.398034853540973e-06, + "loss": 2.4706, + "mean_token_accuracy": 0.4789283564118001, + "step": 3247 + }, + { + "epoch": 0.6021505376344086, + "grad_norm": 5.5859375, + "learning_rate": 9.397849462365593e-06, + "loss": 2.7963, + "mean_token_accuracy": 0.4542505744019462, + "step": 3248 + }, + { + "epoch": 0.6023359288097887, + "grad_norm": 5.265625, + "learning_rate": 9.397664071190212e-06, + "loss": 2.7657, + "mean_token_accuracy": 0.4407371060953004, + "step": 3249 + }, + { + "epoch": 0.6025213199851687, + "grad_norm": 6.60546875, + "learning_rate": 9.397478680014832e-06, + "loss": 2.6778, + "mean_token_accuracy": 0.4477392836171462, + "step": 3250 + }, + { + "epoch": 0.6027067111605487, + "grad_norm": 5.90234375, + "learning_rate": 9.397293288839451e-06, + "loss": 2.5877, + "mean_token_accuracy": 0.47302258315719214, + "step": 3251 + }, + { + "epoch": 0.6028921023359288, + "grad_norm": 4.54296875, + "learning_rate": 9.397107897664072e-06, + "loss": 2.8369, + "mean_token_accuracy": 0.4482680552411139, + "step": 3252 + }, + { + "epoch": 0.6030774935113089, + "grad_norm": 6.625, + "learning_rate": 9.396922506488692e-06, + "loss": 2.6702, + "mean_token_accuracy": 0.4484597801240884, + "step": 3253 + }, + { + "epoch": 0.6032628846866889, + "grad_norm": 6.1015625, + "learning_rate": 9.396737115313311e-06, + "loss": 2.9859, + "mean_token_accuracy": 0.4129094645231476, + "step": 3254 + }, + { + "epoch": 0.603448275862069, + "grad_norm": 7.5390625, + "learning_rate": 9.396551724137931e-06, + "loss": 2.4414, + "mean_token_accuracy": 0.49485783424077434, + "step": 3255 + }, + { + "epoch": 0.603633667037449, + "grad_norm": 5.6328125, + "learning_rate": 9.396366332962552e-06, + "loss": 2.5731, + "mean_token_accuracy": 0.4683647359851128, + "step": 3256 + }, + { + "epoch": 0.6038190582128291, + "grad_norm": 5.01953125, + "learning_rate": 9.396180941787172e-06, + "loss": 2.7707, + "mean_token_accuracy": 0.44610957918714783, + "step": 3257 + }, + { + "epoch": 0.6040044493882091, + "grad_norm": 5.95703125, + "learning_rate": 9.395995550611791e-06, + "loss": 3.7238, + "mean_token_accuracy": 0.36298568507157464, + "step": 3258 + }, + { + "epoch": 0.6041898405635892, + "grad_norm": 6.37890625, + "learning_rate": 9.395810159436412e-06, + "loss": 3.1295, + "mean_token_accuracy": 0.4135831381733021, + "step": 3259 + }, + { + "epoch": 0.6043752317389692, + "grad_norm": 4.82421875, + "learning_rate": 9.39562476826103e-06, + "loss": 2.2456, + "mean_token_accuracy": 0.5299048482102402, + "step": 3260 + }, + { + "epoch": 0.6045606229143493, + "grad_norm": 5.17578125, + "learning_rate": 9.395439377085651e-06, + "loss": 2.6502, + "mean_token_accuracy": 0.4715215158086976, + "step": 3261 + }, + { + "epoch": 0.6047460140897293, + "grad_norm": 7.81640625, + "learning_rate": 9.395253985910271e-06, + "loss": 2.8411, + "mean_token_accuracy": 0.44408375263755884, + "step": 3262 + }, + { + "epoch": 0.6049314052651094, + "grad_norm": 6.7265625, + "learning_rate": 9.395068594734892e-06, + "loss": 2.5927, + "mean_token_accuracy": 0.4578479990773844, + "step": 3263 + }, + { + "epoch": 0.6051167964404894, + "grad_norm": 4.8671875, + "learning_rate": 9.394883203559512e-06, + "loss": 2.6667, + "mean_token_accuracy": 0.464867243655567, + "step": 3264 + }, + { + "epoch": 0.6053021876158695, + "grad_norm": 4.9453125, + "learning_rate": 9.394697812384131e-06, + "loss": 2.7551, + "mean_token_accuracy": 0.4578907435508346, + "step": 3265 + }, + { + "epoch": 0.6054875787912495, + "grad_norm": 5.23828125, + "learning_rate": 9.394512421208752e-06, + "loss": 2.2817, + "mean_token_accuracy": 0.5130372492836677, + "step": 3266 + }, + { + "epoch": 0.6056729699666296, + "grad_norm": 6.29296875, + "learning_rate": 9.39432703003337e-06, + "loss": 2.9149, + "mean_token_accuracy": 0.42651359254846744, + "step": 3267 + }, + { + "epoch": 0.6058583611420096, + "grad_norm": 6.28515625, + "learning_rate": 9.394141638857991e-06, + "loss": 3.215, + "mean_token_accuracy": 0.40575175085680226, + "step": 3268 + }, + { + "epoch": 0.6060437523173897, + "grad_norm": 8.28125, + "learning_rate": 9.393956247682612e-06, + "loss": 2.9843, + "mean_token_accuracy": 0.41335075424634754, + "step": 3269 + }, + { + "epoch": 0.6062291434927698, + "grad_norm": 7.546875, + "learning_rate": 9.39377085650723e-06, + "loss": 2.4258, + "mean_token_accuracy": 0.48796944621260346, + "step": 3270 + }, + { + "epoch": 0.6064145346681498, + "grad_norm": 6.2578125, + "learning_rate": 9.39358546533185e-06, + "loss": 2.3022, + "mean_token_accuracy": 0.5031607262945528, + "step": 3271 + }, + { + "epoch": 0.6065999258435298, + "grad_norm": 5.59375, + "learning_rate": 9.393400074156471e-06, + "loss": 2.6021, + "mean_token_accuracy": 0.47356884992264053, + "step": 3272 + }, + { + "epoch": 0.6067853170189099, + "grad_norm": 7.0078125, + "learning_rate": 9.393214682981092e-06, + "loss": 2.7835, + "mean_token_accuracy": 0.45358269490761605, + "step": 3273 + }, + { + "epoch": 0.60697070819429, + "grad_norm": 6.0546875, + "learning_rate": 9.39302929180571e-06, + "loss": 3.0623, + "mean_token_accuracy": 0.40664601520698396, + "step": 3274 + }, + { + "epoch": 0.60715609936967, + "grad_norm": 8.1015625, + "learning_rate": 9.392843900630331e-06, + "loss": 2.6747, + "mean_token_accuracy": 0.4606703146374829, + "step": 3275 + }, + { + "epoch": 0.60734149054505, + "grad_norm": 6.23046875, + "learning_rate": 9.39265850945495e-06, + "loss": 2.6422, + "mean_token_accuracy": 0.4656387665198238, + "step": 3276 + }, + { + "epoch": 0.6075268817204301, + "grad_norm": 10.453125, + "learning_rate": 9.39247311827957e-06, + "loss": 2.9345, + "mean_token_accuracy": 0.4209857364778986, + "step": 3277 + }, + { + "epoch": 0.6077122728958102, + "grad_norm": 12.8515625, + "learning_rate": 9.392287727104191e-06, + "loss": 2.2339, + "mean_token_accuracy": 0.4914018904452796, + "step": 3278 + }, + { + "epoch": 0.6078976640711902, + "grad_norm": 7.12890625, + "learning_rate": 9.392102335928811e-06, + "loss": 3.0181, + "mean_token_accuracy": 0.425511315502844, + "step": 3279 + }, + { + "epoch": 0.6080830552465702, + "grad_norm": 6.5859375, + "learning_rate": 9.39191694475343e-06, + "loss": 2.9267, + "mean_token_accuracy": 0.4270289942981924, + "step": 3280 + }, + { + "epoch": 0.6082684464219503, + "grad_norm": 5.00390625, + "learning_rate": 9.39173155357805e-06, + "loss": 2.8606, + "mean_token_accuracy": 0.4277681660899654, + "step": 3281 + }, + { + "epoch": 0.6084538375973304, + "grad_norm": 6.2890625, + "learning_rate": 9.391546162402671e-06, + "loss": 2.7298, + "mean_token_accuracy": 0.4482192680124733, + "step": 3282 + }, + { + "epoch": 0.6086392287727104, + "grad_norm": 5.66015625, + "learning_rate": 9.39136077122729e-06, + "loss": 2.784, + "mean_token_accuracy": 0.45640050697084916, + "step": 3283 + }, + { + "epoch": 0.6088246199480905, + "grad_norm": 4.328125, + "learning_rate": 9.39117538005191e-06, + "loss": 2.8484, + "mean_token_accuracy": 0.4346054597310879, + "step": 3284 + }, + { + "epoch": 0.6090100111234705, + "grad_norm": 6.34765625, + "learning_rate": 9.39098998887653e-06, + "loss": 2.8435, + "mean_token_accuracy": 0.43119266055045874, + "step": 3285 + }, + { + "epoch": 0.6091954022988506, + "grad_norm": 6.19921875, + "learning_rate": 9.39080459770115e-06, + "loss": 2.9239, + "mean_token_accuracy": 0.41387166181496016, + "step": 3286 + }, + { + "epoch": 0.6093807934742306, + "grad_norm": 6.265625, + "learning_rate": 9.39061920652577e-06, + "loss": 2.9919, + "mean_token_accuracy": 0.4200537719436976, + "step": 3287 + }, + { + "epoch": 0.6095661846496107, + "grad_norm": 5.60546875, + "learning_rate": 9.39043381535039e-06, + "loss": 3.0201, + "mean_token_accuracy": 0.42201715622563035, + "step": 3288 + }, + { + "epoch": 0.6097515758249907, + "grad_norm": 6.1796875, + "learning_rate": 9.39024842417501e-06, + "loss": 2.6116, + "mean_token_accuracy": 0.47947504959560505, + "step": 3289 + }, + { + "epoch": 0.6099369670003708, + "grad_norm": 5.91796875, + "learning_rate": 9.39006303299963e-06, + "loss": 2.5304, + "mean_token_accuracy": 0.4594727161250766, + "step": 3290 + }, + { + "epoch": 0.6101223581757509, + "grad_norm": 5.3828125, + "learning_rate": 9.38987764182425e-06, + "loss": 3.0694, + "mean_token_accuracy": 0.42217059409226687, + "step": 3291 + }, + { + "epoch": 0.6103077493511309, + "grad_norm": 6.8515625, + "learning_rate": 9.38969225064887e-06, + "loss": 2.5858, + "mean_token_accuracy": 0.4573710073710074, + "step": 3292 + }, + { + "epoch": 0.6104931405265109, + "grad_norm": 5.09375, + "learning_rate": 9.38950685947349e-06, + "loss": 3.1484, + "mean_token_accuracy": 0.4094962362478286, + "step": 3293 + }, + { + "epoch": 0.610678531701891, + "grad_norm": 5.35546875, + "learning_rate": 9.389321468298109e-06, + "loss": 3.1756, + "mean_token_accuracy": 0.40341952804860814, + "step": 3294 + }, + { + "epoch": 0.6108639228772711, + "grad_norm": 6.796875, + "learning_rate": 9.38913607712273e-06, + "loss": 3.1556, + "mean_token_accuracy": 0.4105771612583602, + "step": 3295 + }, + { + "epoch": 0.6110493140526511, + "grad_norm": 6.515625, + "learning_rate": 9.38895068594735e-06, + "loss": 2.8727, + "mean_token_accuracy": 0.4280675973548861, + "step": 3296 + }, + { + "epoch": 0.6112347052280311, + "grad_norm": 5.3515625, + "learning_rate": 9.38876529477197e-06, + "loss": 2.7825, + "mean_token_accuracy": 0.4493711091347986, + "step": 3297 + }, + { + "epoch": 0.6114200964034112, + "grad_norm": 5.53515625, + "learning_rate": 9.388579903596589e-06, + "loss": 3.6544, + "mean_token_accuracy": 0.345622633103141, + "step": 3298 + }, + { + "epoch": 0.6116054875787913, + "grad_norm": 5.69921875, + "learning_rate": 9.38839451242121e-06, + "loss": 2.2065, + "mean_token_accuracy": 0.5098673207715464, + "step": 3299 + }, + { + "epoch": 0.6117908787541713, + "grad_norm": 7.46484375, + "learning_rate": 9.38820912124583e-06, + "loss": 2.5895, + "mean_token_accuracy": 0.46120689655172414, + "step": 3300 + }, + { + "epoch": 0.6119762699295513, + "grad_norm": 5.28125, + "learning_rate": 9.388023730070449e-06, + "loss": 3.0934, + "mean_token_accuracy": 0.42255468946442043, + "step": 3301 + }, + { + "epoch": 0.6121616611049314, + "grad_norm": 7.06640625, + "learning_rate": 9.387838338895069e-06, + "loss": 2.4248, + "mean_token_accuracy": 0.48392728766560544, + "step": 3302 + }, + { + "epoch": 0.6123470522803115, + "grad_norm": 6.16015625, + "learning_rate": 9.38765294771969e-06, + "loss": 3.0696, + "mean_token_accuracy": 0.4247218358831711, + "step": 3303 + }, + { + "epoch": 0.6125324434556915, + "grad_norm": 6.4921875, + "learning_rate": 9.38746755654431e-06, + "loss": 3.324, + "mean_token_accuracy": 0.3956128240527688, + "step": 3304 + }, + { + "epoch": 0.6127178346310715, + "grad_norm": 6.69921875, + "learning_rate": 9.387282165368929e-06, + "loss": 2.43, + "mean_token_accuracy": 0.4745679012345679, + "step": 3305 + }, + { + "epoch": 0.6129032258064516, + "grad_norm": 5.046875, + "learning_rate": 9.38709677419355e-06, + "loss": 3.329, + "mean_token_accuracy": 0.4011318619128466, + "step": 3306 + }, + { + "epoch": 0.6130886169818317, + "grad_norm": 6.234375, + "learning_rate": 9.38691138301817e-06, + "loss": 2.5997, + "mean_token_accuracy": 0.46308243727598564, + "step": 3307 + }, + { + "epoch": 0.6132740081572117, + "grad_norm": 6.16796875, + "learning_rate": 9.386725991842789e-06, + "loss": 3.0764, + "mean_token_accuracy": 0.41845096484271743, + "step": 3308 + }, + { + "epoch": 0.6134593993325917, + "grad_norm": 5.24609375, + "learning_rate": 9.38654060066741e-06, + "loss": 3.6871, + "mean_token_accuracy": 0.37120400142908183, + "step": 3309 + }, + { + "epoch": 0.6136447905079718, + "grad_norm": 7.46875, + "learning_rate": 9.386355209492028e-06, + "loss": 2.9533, + "mean_token_accuracy": 0.4210170636931914, + "step": 3310 + }, + { + "epoch": 0.6138301816833519, + "grad_norm": 6.796875, + "learning_rate": 9.38616981831665e-06, + "loss": 2.6124, + "mean_token_accuracy": 0.45919871365402654, + "step": 3311 + }, + { + "epoch": 0.6140155728587319, + "grad_norm": 7.38671875, + "learning_rate": 9.385984427141269e-06, + "loss": 2.7932, + "mean_token_accuracy": 0.43405167550490803, + "step": 3312 + }, + { + "epoch": 0.614200964034112, + "grad_norm": 5.44140625, + "learning_rate": 9.38579903596589e-06, + "loss": 2.9543, + "mean_token_accuracy": 0.4246153846153846, + "step": 3313 + }, + { + "epoch": 0.614386355209492, + "grad_norm": 8.5078125, + "learning_rate": 9.385613644790508e-06, + "loss": 2.5322, + "mean_token_accuracy": 0.4759493670886076, + "step": 3314 + }, + { + "epoch": 0.6145717463848721, + "grad_norm": 6.4609375, + "learning_rate": 9.385428253615129e-06, + "loss": 3.202, + "mean_token_accuracy": 0.4118089507333584, + "step": 3315 + }, + { + "epoch": 0.6147571375602522, + "grad_norm": 5.24609375, + "learning_rate": 9.38524286243975e-06, + "loss": 3.0243, + "mean_token_accuracy": 0.4185344827586207, + "step": 3316 + }, + { + "epoch": 0.6149425287356322, + "grad_norm": 7.359375, + "learning_rate": 9.385057471264368e-06, + "loss": 2.7972, + "mean_token_accuracy": 0.4398455412091227, + "step": 3317 + }, + { + "epoch": 0.6151279199110122, + "grad_norm": 5.3671875, + "learning_rate": 9.384872080088989e-06, + "loss": 2.629, + "mean_token_accuracy": 0.47803946530872055, + "step": 3318 + }, + { + "epoch": 0.6153133110863923, + "grad_norm": 6.03125, + "learning_rate": 9.384686688913609e-06, + "loss": 2.508, + "mean_token_accuracy": 0.4892681665373675, + "step": 3319 + }, + { + "epoch": 0.6154987022617724, + "grad_norm": 6.0234375, + "learning_rate": 9.38450129773823e-06, + "loss": 2.7281, + "mean_token_accuracy": 0.45826056584002794, + "step": 3320 + }, + { + "epoch": 0.6156840934371524, + "grad_norm": 6.12109375, + "learning_rate": 9.384315906562848e-06, + "loss": 2.8716, + "mean_token_accuracy": 0.43319630010277493, + "step": 3321 + }, + { + "epoch": 0.6158694846125324, + "grad_norm": 6.80859375, + "learning_rate": 9.384130515387469e-06, + "loss": 2.6051, + "mean_token_accuracy": 0.509854528390427, + "step": 3322 + }, + { + "epoch": 0.6160548757879125, + "grad_norm": 6.69921875, + "learning_rate": 9.383945124212088e-06, + "loss": 3.0674, + "mean_token_accuracy": 0.43159138655462187, + "step": 3323 + }, + { + "epoch": 0.6162402669632926, + "grad_norm": 4.95703125, + "learning_rate": 9.383759733036708e-06, + "loss": 2.8316, + "mean_token_accuracy": 0.4422546314544738, + "step": 3324 + }, + { + "epoch": 0.6164256581386726, + "grad_norm": 6.0, + "learning_rate": 9.383574341861329e-06, + "loss": 2.849, + "mean_token_accuracy": 0.4341824391632276, + "step": 3325 + }, + { + "epoch": 0.6166110493140526, + "grad_norm": 8.96875, + "learning_rate": 9.383388950685947e-06, + "loss": 2.7341, + "mean_token_accuracy": 0.4353089136928156, + "step": 3326 + }, + { + "epoch": 0.6167964404894327, + "grad_norm": 6.73828125, + "learning_rate": 9.383203559510568e-06, + "loss": 2.59, + "mean_token_accuracy": 0.4655715263518138, + "step": 3327 + }, + { + "epoch": 0.6169818316648128, + "grad_norm": 6.63671875, + "learning_rate": 9.383018168335188e-06, + "loss": 2.289, + "mean_token_accuracy": 0.5260273972602739, + "step": 3328 + }, + { + "epoch": 0.6171672228401928, + "grad_norm": 5.90234375, + "learning_rate": 9.382832777159809e-06, + "loss": 2.7164, + "mean_token_accuracy": 0.4584216431866351, + "step": 3329 + }, + { + "epoch": 0.6173526140155728, + "grad_norm": 9.5078125, + "learning_rate": 9.382647385984428e-06, + "loss": 2.6979, + "mean_token_accuracy": 0.45758832785009296, + "step": 3330 + }, + { + "epoch": 0.6175380051909529, + "grad_norm": 8.0859375, + "learning_rate": 9.382461994809048e-06, + "loss": 2.7213, + "mean_token_accuracy": 0.45431161195492403, + "step": 3331 + }, + { + "epoch": 0.617723396366333, + "grad_norm": 8.6484375, + "learning_rate": 9.382276603633667e-06, + "loss": 3.341, + "mean_token_accuracy": 0.41003102378490175, + "step": 3332 + }, + { + "epoch": 0.617908787541713, + "grad_norm": 12.0625, + "learning_rate": 9.382091212458287e-06, + "loss": 2.9069, + "mean_token_accuracy": 0.41904145077720206, + "step": 3333 + }, + { + "epoch": 0.618094178717093, + "grad_norm": 8.6953125, + "learning_rate": 9.381905821282908e-06, + "loss": 2.999, + "mean_token_accuracy": 0.4226509467667024, + "step": 3334 + }, + { + "epoch": 0.6182795698924731, + "grad_norm": 8.390625, + "learning_rate": 9.381720430107528e-06, + "loss": 3.5712, + "mean_token_accuracy": 0.37801374141161775, + "step": 3335 + }, + { + "epoch": 0.6184649610678532, + "grad_norm": 10.1953125, + "learning_rate": 9.381535038932147e-06, + "loss": 2.9991, + "mean_token_accuracy": 0.4127486885083638, + "step": 3336 + }, + { + "epoch": 0.6186503522432332, + "grad_norm": 13.0859375, + "learning_rate": 9.381349647756768e-06, + "loss": 2.6949, + "mean_token_accuracy": 0.464, + "step": 3337 + }, + { + "epoch": 0.6188357434186132, + "grad_norm": 13.0859375, + "learning_rate": 9.381164256581388e-06, + "loss": 2.5913, + "mean_token_accuracy": 0.4687024559878287, + "step": 3338 + }, + { + "epoch": 0.6190211345939933, + "grad_norm": 7.24609375, + "learning_rate": 9.380978865406007e-06, + "loss": 2.3745, + "mean_token_accuracy": 0.49349442379182157, + "step": 3339 + }, + { + "epoch": 0.6192065257693734, + "grad_norm": 9.3125, + "learning_rate": 9.380793474230627e-06, + "loss": 2.6357, + "mean_token_accuracy": 0.45356075697211157, + "step": 3340 + }, + { + "epoch": 0.6193919169447535, + "grad_norm": 9.5859375, + "learning_rate": 9.380608083055246e-06, + "loss": 3.3249, + "mean_token_accuracy": 0.4005258545135846, + "step": 3341 + }, + { + "epoch": 0.6195773081201335, + "grad_norm": 8.3671875, + "learning_rate": 9.380422691879867e-06, + "loss": 2.8209, + "mean_token_accuracy": 0.4487102579484103, + "step": 3342 + }, + { + "epoch": 0.6197626992955135, + "grad_norm": 6.44140625, + "learning_rate": 9.380237300704487e-06, + "loss": 2.9379, + "mean_token_accuracy": 0.4540797658251006, + "step": 3343 + }, + { + "epoch": 0.6199480904708936, + "grad_norm": 7.47265625, + "learning_rate": 9.380051909529108e-06, + "loss": 2.9086, + "mean_token_accuracy": 0.4359072375127421, + "step": 3344 + }, + { + "epoch": 0.6201334816462737, + "grad_norm": 6.87109375, + "learning_rate": 9.379866518353728e-06, + "loss": 2.4419, + "mean_token_accuracy": 0.49290819131390873, + "step": 3345 + }, + { + "epoch": 0.6203188728216537, + "grad_norm": 6.98046875, + "learning_rate": 9.379681127178347e-06, + "loss": 2.5521, + "mean_token_accuracy": 0.4635254723750299, + "step": 3346 + }, + { + "epoch": 0.6205042639970337, + "grad_norm": 7.640625, + "learning_rate": 9.379495736002968e-06, + "loss": 2.9333, + "mean_token_accuracy": 0.4259064239506544, + "step": 3347 + }, + { + "epoch": 0.6206896551724138, + "grad_norm": 6.13671875, + "learning_rate": 9.379310344827586e-06, + "loss": 2.6625, + "mean_token_accuracy": 0.4558954558954559, + "step": 3348 + }, + { + "epoch": 0.6208750463477939, + "grad_norm": 7.41796875, + "learning_rate": 9.379124953652207e-06, + "loss": 2.6743, + "mean_token_accuracy": 0.45361296822053304, + "step": 3349 + }, + { + "epoch": 0.6210604375231739, + "grad_norm": 6.86328125, + "learning_rate": 9.378939562476827e-06, + "loss": 2.6462, + "mean_token_accuracy": 0.448040172966941, + "step": 3350 + }, + { + "epoch": 0.6212458286985539, + "grad_norm": 5.06640625, + "learning_rate": 9.378754171301446e-06, + "loss": 2.8061, + "mean_token_accuracy": 0.45023380093520377, + "step": 3351 + }, + { + "epoch": 0.621431219873934, + "grad_norm": 5.3515625, + "learning_rate": 9.378568780126067e-06, + "loss": 3.5268, + "mean_token_accuracy": 0.3732539252842447, + "step": 3352 + }, + { + "epoch": 0.6216166110493141, + "grad_norm": 6.68359375, + "learning_rate": 9.378383388950687e-06, + "loss": 2.8244, + "mean_token_accuracy": 0.4448719846085541, + "step": 3353 + }, + { + "epoch": 0.6218020022246941, + "grad_norm": 6.40625, + "learning_rate": 9.378197997775308e-06, + "loss": 2.66, + "mean_token_accuracy": 0.48051044083526684, + "step": 3354 + }, + { + "epoch": 0.6219873934000741, + "grad_norm": 7.7421875, + "learning_rate": 9.378012606599926e-06, + "loss": 2.508, + "mean_token_accuracy": 0.49870466321243523, + "step": 3355 + }, + { + "epoch": 0.6221727845754542, + "grad_norm": 7.2890625, + "learning_rate": 9.377827215424547e-06, + "loss": 2.7415, + "mean_token_accuracy": 0.45779456632282434, + "step": 3356 + }, + { + "epoch": 0.6223581757508343, + "grad_norm": 5.30078125, + "learning_rate": 9.377641824249166e-06, + "loss": 2.6945, + "mean_token_accuracy": 0.4684053651266766, + "step": 3357 + }, + { + "epoch": 0.6225435669262143, + "grad_norm": 10.09375, + "learning_rate": 9.377456433073786e-06, + "loss": 2.6231, + "mean_token_accuracy": 0.46809078771695595, + "step": 3358 + }, + { + "epoch": 0.6227289581015943, + "grad_norm": 7.38671875, + "learning_rate": 9.377271041898407e-06, + "loss": 2.2324, + "mean_token_accuracy": 0.5388011152416357, + "step": 3359 + }, + { + "epoch": 0.6229143492769744, + "grad_norm": 5.41015625, + "learning_rate": 9.377085650723027e-06, + "loss": 2.488, + "mean_token_accuracy": 0.49058006941001486, + "step": 3360 + }, + { + "epoch": 0.6230997404523545, + "grad_norm": 6.15625, + "learning_rate": 9.376900259547646e-06, + "loss": 2.7676, + "mean_token_accuracy": 0.44492722137736257, + "step": 3361 + }, + { + "epoch": 0.6232851316277345, + "grad_norm": 5.32421875, + "learning_rate": 9.376714868372266e-06, + "loss": 3.2614, + "mean_token_accuracy": 0.39415961945031713, + "step": 3362 + }, + { + "epoch": 0.6234705228031145, + "grad_norm": 5.484375, + "learning_rate": 9.376529477196887e-06, + "loss": 3.1397, + "mean_token_accuracy": 0.4145861703737328, + "step": 3363 + }, + { + "epoch": 0.6236559139784946, + "grad_norm": 6.328125, + "learning_rate": 9.376344086021506e-06, + "loss": 3.0144, + "mean_token_accuracy": 0.42267415389285173, + "step": 3364 + }, + { + "epoch": 0.6238413051538747, + "grad_norm": 6.828125, + "learning_rate": 9.376158694846126e-06, + "loss": 3.0636, + "mean_token_accuracy": 0.4309231353678612, + "step": 3365 + }, + { + "epoch": 0.6240266963292548, + "grad_norm": 10.1015625, + "learning_rate": 9.375973303670745e-06, + "loss": 2.5562, + "mean_token_accuracy": 0.47425434040658854, + "step": 3366 + }, + { + "epoch": 0.6242120875046347, + "grad_norm": 5.80859375, + "learning_rate": 9.375787912495365e-06, + "loss": 3.35, + "mean_token_accuracy": 0.3857797676153928, + "step": 3367 + }, + { + "epoch": 0.6243974786800148, + "grad_norm": 8.53125, + "learning_rate": 9.375602521319986e-06, + "loss": 2.4645, + "mean_token_accuracy": 0.5040338579552969, + "step": 3368 + }, + { + "epoch": 0.6245828698553949, + "grad_norm": 6.828125, + "learning_rate": 9.375417130144606e-06, + "loss": 2.8201, + "mean_token_accuracy": 0.448992133726647, + "step": 3369 + }, + { + "epoch": 0.624768261030775, + "grad_norm": 10.5546875, + "learning_rate": 9.375231738969225e-06, + "loss": 2.0473, + "mean_token_accuracy": 0.5443393815486839, + "step": 3370 + }, + { + "epoch": 0.624953652206155, + "grad_norm": 7.02734375, + "learning_rate": 9.375046347793846e-06, + "loss": 2.5197, + "mean_token_accuracy": 0.480499653018737, + "step": 3371 + }, + { + "epoch": 0.625139043381535, + "grad_norm": 5.94921875, + "learning_rate": 9.374860956618466e-06, + "loss": 2.9721, + "mean_token_accuracy": 0.43727947438861176, + "step": 3372 + }, + { + "epoch": 0.6253244345569151, + "grad_norm": 9.7421875, + "learning_rate": 9.374675565443085e-06, + "loss": 2.6704, + "mean_token_accuracy": 0.45610978000435637, + "step": 3373 + }, + { + "epoch": 0.6255098257322952, + "grad_norm": 4.7578125, + "learning_rate": 9.374490174267706e-06, + "loss": 2.5385, + "mean_token_accuracy": 0.45998494731560463, + "step": 3374 + }, + { + "epoch": 0.6256952169076752, + "grad_norm": 7.27734375, + "learning_rate": 9.374304783092324e-06, + "loss": 2.9244, + "mean_token_accuracy": 0.4298538381862125, + "step": 3375 + }, + { + "epoch": 0.6258806080830552, + "grad_norm": 8.7421875, + "learning_rate": 9.374119391916947e-06, + "loss": 2.7912, + "mean_token_accuracy": 0.4508859452950284, + "step": 3376 + }, + { + "epoch": 0.6260659992584353, + "grad_norm": 6.1875, + "learning_rate": 9.373934000741565e-06, + "loss": 2.4233, + "mean_token_accuracy": 0.5111460549440385, + "step": 3377 + }, + { + "epoch": 0.6262513904338154, + "grad_norm": 5.48046875, + "learning_rate": 9.373748609566186e-06, + "loss": 2.9703, + "mean_token_accuracy": 0.42111818433434817, + "step": 3378 + }, + { + "epoch": 0.6264367816091954, + "grad_norm": 6.0546875, + "learning_rate": 9.373563218390805e-06, + "loss": 2.5065, + "mean_token_accuracy": 0.4824498049978333, + "step": 3379 + }, + { + "epoch": 0.6266221727845754, + "grad_norm": 8.734375, + "learning_rate": 9.373377827215425e-06, + "loss": 2.7124, + "mean_token_accuracy": 0.4636157246982913, + "step": 3380 + }, + { + "epoch": 0.6268075639599555, + "grad_norm": 6.43359375, + "learning_rate": 9.373192436040046e-06, + "loss": 2.4534, + "mean_token_accuracy": 0.48397490249279296, + "step": 3381 + }, + { + "epoch": 0.6269929551353356, + "grad_norm": 6.84375, + "learning_rate": 9.373007044864664e-06, + "loss": 2.6778, + "mean_token_accuracy": 0.4515189346650021, + "step": 3382 + }, + { + "epoch": 0.6271783463107156, + "grad_norm": 7.12890625, + "learning_rate": 9.372821653689285e-06, + "loss": 3.0729, + "mean_token_accuracy": 0.4134971141038923, + "step": 3383 + }, + { + "epoch": 0.6273637374860956, + "grad_norm": 6.58203125, + "learning_rate": 9.372636262513905e-06, + "loss": 2.471, + "mean_token_accuracy": 0.49773276904474, + "step": 3384 + }, + { + "epoch": 0.6275491286614757, + "grad_norm": 5.4140625, + "learning_rate": 9.372450871338526e-06, + "loss": 2.8204, + "mean_token_accuracy": 0.4537483559842174, + "step": 3385 + }, + { + "epoch": 0.6277345198368558, + "grad_norm": 6.85546875, + "learning_rate": 9.372265480163145e-06, + "loss": 3.3555, + "mean_token_accuracy": 0.38568665377176015, + "step": 3386 + }, + { + "epoch": 0.6279199110122358, + "grad_norm": 7.2421875, + "learning_rate": 9.372080088987765e-06, + "loss": 2.6807, + "mean_token_accuracy": 0.4566077003121748, + "step": 3387 + }, + { + "epoch": 0.6281053021876158, + "grad_norm": 9.921875, + "learning_rate": 9.371894697812386e-06, + "loss": 2.8964, + "mean_token_accuracy": 0.43160588611644274, + "step": 3388 + }, + { + "epoch": 0.6282906933629959, + "grad_norm": 6.61328125, + "learning_rate": 9.371709306637004e-06, + "loss": 3.0339, + "mean_token_accuracy": 0.4283946798639035, + "step": 3389 + }, + { + "epoch": 0.628476084538376, + "grad_norm": 6.99609375, + "learning_rate": 9.371523915461625e-06, + "loss": 2.6339, + "mean_token_accuracy": 0.46181924368520955, + "step": 3390 + }, + { + "epoch": 0.6286614757137561, + "grad_norm": 4.53125, + "learning_rate": 9.371338524286244e-06, + "loss": 2.8568, + "mean_token_accuracy": 0.43065971780856743, + "step": 3391 + }, + { + "epoch": 0.628846866889136, + "grad_norm": 5.05078125, + "learning_rate": 9.371153133110866e-06, + "loss": 2.8206, + "mean_token_accuracy": 0.4509913432002234, + "step": 3392 + }, + { + "epoch": 0.6290322580645161, + "grad_norm": 4.921875, + "learning_rate": 9.370967741935485e-06, + "loss": 2.823, + "mean_token_accuracy": 0.45094746941712643, + "step": 3393 + }, + { + "epoch": 0.6292176492398962, + "grad_norm": 5.8359375, + "learning_rate": 9.370782350760105e-06, + "loss": 3.0987, + "mean_token_accuracy": 0.4206886182528943, + "step": 3394 + }, + { + "epoch": 0.6294030404152763, + "grad_norm": 7.1796875, + "learning_rate": 9.370596959584724e-06, + "loss": 2.0849, + "mean_token_accuracy": 0.5452862212621509, + "step": 3395 + }, + { + "epoch": 0.6295884315906563, + "grad_norm": 5.4375, + "learning_rate": 9.370411568409344e-06, + "loss": 2.9556, + "mean_token_accuracy": 0.4313694766530868, + "step": 3396 + }, + { + "epoch": 0.6297738227660363, + "grad_norm": 6.7421875, + "learning_rate": 9.370226177233965e-06, + "loss": 2.9863, + "mean_token_accuracy": 0.4477933261571582, + "step": 3397 + }, + { + "epoch": 0.6299592139414164, + "grad_norm": 6.2890625, + "learning_rate": 9.370040786058584e-06, + "loss": 3.0477, + "mean_token_accuracy": 0.41886670959433353, + "step": 3398 + }, + { + "epoch": 0.6301446051167965, + "grad_norm": 7.265625, + "learning_rate": 9.369855394883204e-06, + "loss": 3.1983, + "mean_token_accuracy": 0.4104352318222911, + "step": 3399 + }, + { + "epoch": 0.6303299962921765, + "grad_norm": 6.0078125, + "learning_rate": 9.369670003707825e-06, + "loss": 2.8942, + "mean_token_accuracy": 0.42099284334885834, + "step": 3400 + }, + { + "epoch": 0.6305153874675565, + "grad_norm": 5.25, + "learning_rate": 9.369484612532445e-06, + "loss": 2.9969, + "mean_token_accuracy": 0.4339466039562872, + "step": 3401 + }, + { + "epoch": 0.6307007786429366, + "grad_norm": 7.46875, + "learning_rate": 9.369299221357064e-06, + "loss": 3.0149, + "mean_token_accuracy": 0.4330156643663052, + "step": 3402 + }, + { + "epoch": 0.6308861698183167, + "grad_norm": 7.26953125, + "learning_rate": 9.369113830181685e-06, + "loss": 3.1874, + "mean_token_accuracy": 0.4159938059949121, + "step": 3403 + }, + { + "epoch": 0.6310715609936967, + "grad_norm": 6.59765625, + "learning_rate": 9.368928439006303e-06, + "loss": 2.5668, + "mean_token_accuracy": 0.45945689327763006, + "step": 3404 + }, + { + "epoch": 0.6312569521690767, + "grad_norm": 7.23046875, + "learning_rate": 9.368743047830924e-06, + "loss": 2.5722, + "mean_token_accuracy": 0.49531893646236425, + "step": 3405 + }, + { + "epoch": 0.6314423433444568, + "grad_norm": 9.359375, + "learning_rate": 9.368557656655544e-06, + "loss": 2.8377, + "mean_token_accuracy": 0.436622691292876, + "step": 3406 + }, + { + "epoch": 0.6316277345198369, + "grad_norm": 5.88671875, + "learning_rate": 9.368372265480163e-06, + "loss": 3.3391, + "mean_token_accuracy": 0.3802051155319412, + "step": 3407 + }, + { + "epoch": 0.6318131256952169, + "grad_norm": 8.625, + "learning_rate": 9.368186874304784e-06, + "loss": 2.7484, + "mean_token_accuracy": 0.45124890446976335, + "step": 3408 + }, + { + "epoch": 0.6319985168705969, + "grad_norm": 5.72265625, + "learning_rate": 9.368001483129404e-06, + "loss": 2.7559, + "mean_token_accuracy": 0.44590581247348327, + "step": 3409 + }, + { + "epoch": 0.632183908045977, + "grad_norm": 9.8046875, + "learning_rate": 9.367816091954025e-06, + "loss": 2.2211, + "mean_token_accuracy": 0.5271361929684834, + "step": 3410 + }, + { + "epoch": 0.6323692992213571, + "grad_norm": 7.87109375, + "learning_rate": 9.367630700778643e-06, + "loss": 2.7195, + "mean_token_accuracy": 0.4528119386486113, + "step": 3411 + }, + { + "epoch": 0.6325546903967371, + "grad_norm": 6.83203125, + "learning_rate": 9.367445309603264e-06, + "loss": 3.4319, + "mean_token_accuracy": 0.3698408416509307, + "step": 3412 + }, + { + "epoch": 0.6327400815721171, + "grad_norm": 6.3125, + "learning_rate": 9.367259918427883e-06, + "loss": 2.8753, + "mean_token_accuracy": 0.43326460481099655, + "step": 3413 + }, + { + "epoch": 0.6329254727474972, + "grad_norm": 8.546875, + "learning_rate": 9.367074527252503e-06, + "loss": 2.911, + "mean_token_accuracy": 0.4267657023716445, + "step": 3414 + }, + { + "epoch": 0.6331108639228773, + "grad_norm": 5.71875, + "learning_rate": 9.366889136077124e-06, + "loss": 2.8491, + "mean_token_accuracy": 0.4377935847537243, + "step": 3415 + }, + { + "epoch": 0.6332962550982574, + "grad_norm": 5.57421875, + "learning_rate": 9.366703744901744e-06, + "loss": 3.1554, + "mean_token_accuracy": 0.41284728366660767, + "step": 3416 + }, + { + "epoch": 0.6334816462736373, + "grad_norm": 5.2265625, + "learning_rate": 9.366518353726363e-06, + "loss": 2.8792, + "mean_token_accuracy": 0.43496309963099633, + "step": 3417 + }, + { + "epoch": 0.6336670374490174, + "grad_norm": 8.2890625, + "learning_rate": 9.366332962550983e-06, + "loss": 2.5987, + "mean_token_accuracy": 0.4505840071877808, + "step": 3418 + }, + { + "epoch": 0.6338524286243975, + "grad_norm": 5.77734375, + "learning_rate": 9.366147571375604e-06, + "loss": 2.9068, + "mean_token_accuracy": 0.43318025258323767, + "step": 3419 + }, + { + "epoch": 0.6340378197997776, + "grad_norm": 5.296875, + "learning_rate": 9.365962180200223e-06, + "loss": 2.8174, + "mean_token_accuracy": 0.4445637007620479, + "step": 3420 + }, + { + "epoch": 0.6342232109751575, + "grad_norm": 6.2890625, + "learning_rate": 9.365776789024843e-06, + "loss": 2.3499, + "mean_token_accuracy": 0.4919716206123973, + "step": 3421 + }, + { + "epoch": 0.6344086021505376, + "grad_norm": 6.53125, + "learning_rate": 9.365591397849462e-06, + "loss": 2.6285, + "mean_token_accuracy": 0.4574826352346874, + "step": 3422 + }, + { + "epoch": 0.6345939933259177, + "grad_norm": 6.4296875, + "learning_rate": 9.365406006674082e-06, + "loss": 2.64, + "mean_token_accuracy": 0.46707583480703213, + "step": 3423 + }, + { + "epoch": 0.6347793845012978, + "grad_norm": 7.98828125, + "learning_rate": 9.365220615498703e-06, + "loss": 2.4538, + "mean_token_accuracy": 0.45767987549383454, + "step": 3424 + }, + { + "epoch": 0.6349647756766778, + "grad_norm": 9.109375, + "learning_rate": 9.365035224323323e-06, + "loss": 2.4588, + "mean_token_accuracy": 0.47116652679534826, + "step": 3425 + }, + { + "epoch": 0.6351501668520578, + "grad_norm": 6.43359375, + "learning_rate": 9.364849833147944e-06, + "loss": 2.8657, + "mean_token_accuracy": 0.4372764435360245, + "step": 3426 + }, + { + "epoch": 0.6353355580274379, + "grad_norm": 6.38671875, + "learning_rate": 9.364664441972563e-06, + "loss": 3.0479, + "mean_token_accuracy": 0.42502555117535407, + "step": 3427 + }, + { + "epoch": 0.635520949202818, + "grad_norm": 6.81640625, + "learning_rate": 9.364479050797183e-06, + "loss": 3.1238, + "mean_token_accuracy": 0.41693548387096774, + "step": 3428 + }, + { + "epoch": 0.635706340378198, + "grad_norm": 8.34375, + "learning_rate": 9.364293659621802e-06, + "loss": 2.6401, + "mean_token_accuracy": 0.47024151811385856, + "step": 3429 + }, + { + "epoch": 0.635891731553578, + "grad_norm": 9.5390625, + "learning_rate": 9.364108268446423e-06, + "loss": 2.7015, + "mean_token_accuracy": 0.42945050199719315, + "step": 3430 + }, + { + "epoch": 0.6360771227289581, + "grad_norm": 5.296875, + "learning_rate": 9.363922877271043e-06, + "loss": 3.0209, + "mean_token_accuracy": 0.42886755400392756, + "step": 3431 + }, + { + "epoch": 0.6362625139043382, + "grad_norm": 7.28515625, + "learning_rate": 9.363737486095664e-06, + "loss": 2.7207, + "mean_token_accuracy": 0.44628422425032593, + "step": 3432 + }, + { + "epoch": 0.6364479050797182, + "grad_norm": 5.9375, + "learning_rate": 9.363552094920282e-06, + "loss": 2.9606, + "mean_token_accuracy": 0.4298418972332016, + "step": 3433 + }, + { + "epoch": 0.6366332962550982, + "grad_norm": 7.65234375, + "learning_rate": 9.363366703744903e-06, + "loss": 2.6574, + "mean_token_accuracy": 0.44995836802664446, + "step": 3434 + }, + { + "epoch": 0.6368186874304783, + "grad_norm": 7.69140625, + "learning_rate": 9.363181312569523e-06, + "loss": 2.6233, + "mean_token_accuracy": 0.47205138718053846, + "step": 3435 + }, + { + "epoch": 0.6370040786058584, + "grad_norm": 6.51171875, + "learning_rate": 9.362995921394142e-06, + "loss": 3.042, + "mean_token_accuracy": 0.45814415437003403, + "step": 3436 + }, + { + "epoch": 0.6371894697812384, + "grad_norm": 7.6171875, + "learning_rate": 9.362810530218763e-06, + "loss": 2.9216, + "mean_token_accuracy": 0.4396541786743516, + "step": 3437 + }, + { + "epoch": 0.6373748609566184, + "grad_norm": 5.59375, + "learning_rate": 9.362625139043381e-06, + "loss": 2.9363, + "mean_token_accuracy": 0.42065188351589633, + "step": 3438 + }, + { + "epoch": 0.6375602521319985, + "grad_norm": 5.125, + "learning_rate": 9.362439747868002e-06, + "loss": 2.992, + "mean_token_accuracy": 0.42803487592219985, + "step": 3439 + }, + { + "epoch": 0.6377456433073786, + "grad_norm": 6.515625, + "learning_rate": 9.362254356692622e-06, + "loss": 2.9044, + "mean_token_accuracy": 0.42348104157149385, + "step": 3440 + }, + { + "epoch": 0.6379310344827587, + "grad_norm": 7.8046875, + "learning_rate": 9.362068965517243e-06, + "loss": 2.9058, + "mean_token_accuracy": 0.4381881533101045, + "step": 3441 + }, + { + "epoch": 0.6381164256581386, + "grad_norm": 5.99609375, + "learning_rate": 9.361883574341862e-06, + "loss": 3.1916, + "mean_token_accuracy": 0.39945581756931847, + "step": 3442 + }, + { + "epoch": 0.6383018168335187, + "grad_norm": 6.5234375, + "learning_rate": 9.361698183166482e-06, + "loss": 2.9828, + "mean_token_accuracy": 0.4289079229122056, + "step": 3443 + }, + { + "epoch": 0.6384872080088988, + "grad_norm": 6.25, + "learning_rate": 9.361512791991103e-06, + "loss": 2.6105, + "mean_token_accuracy": 0.47671568627450983, + "step": 3444 + }, + { + "epoch": 0.6386725991842789, + "grad_norm": 6.74609375, + "learning_rate": 9.361327400815721e-06, + "loss": 2.9073, + "mean_token_accuracy": 0.40586001085187196, + "step": 3445 + }, + { + "epoch": 0.6388579903596588, + "grad_norm": 6.45703125, + "learning_rate": 9.361142009640342e-06, + "loss": 2.4019, + "mean_token_accuracy": 0.5067737681333173, + "step": 3446 + }, + { + "epoch": 0.6390433815350389, + "grad_norm": 6.69921875, + "learning_rate": 9.36095661846496e-06, + "loss": 2.7421, + "mean_token_accuracy": 0.44170537096582113, + "step": 3447 + }, + { + "epoch": 0.639228772710419, + "grad_norm": 7.24609375, + "learning_rate": 9.360771227289583e-06, + "loss": 2.6149, + "mean_token_accuracy": 0.46155446883779006, + "step": 3448 + }, + { + "epoch": 0.6394141638857991, + "grad_norm": 6.859375, + "learning_rate": 9.360585836114202e-06, + "loss": 2.679, + "mean_token_accuracy": 0.4713182221096619, + "step": 3449 + }, + { + "epoch": 0.639599555061179, + "grad_norm": 5.70703125, + "learning_rate": 9.360400444938822e-06, + "loss": 2.4673, + "mean_token_accuracy": 0.48581647755303403, + "step": 3450 + }, + { + "epoch": 0.6397849462365591, + "grad_norm": 10.8125, + "learning_rate": 9.360215053763441e-06, + "loss": 2.0817, + "mean_token_accuracy": 0.5285530771925727, + "step": 3451 + }, + { + "epoch": 0.6399703374119392, + "grad_norm": 5.14453125, + "learning_rate": 9.360029662588062e-06, + "loss": 2.5627, + "mean_token_accuracy": 0.46903520208604954, + "step": 3452 + }, + { + "epoch": 0.6401557285873193, + "grad_norm": 10.59375, + "learning_rate": 9.359844271412682e-06, + "loss": 2.7413, + "mean_token_accuracy": 0.44132718421726874, + "step": 3453 + }, + { + "epoch": 0.6403411197626993, + "grad_norm": 8.25, + "learning_rate": 9.3596588802373e-06, + "loss": 2.1288, + "mean_token_accuracy": 0.530690985619081, + "step": 3454 + }, + { + "epoch": 0.6405265109380793, + "grad_norm": 5.68359375, + "learning_rate": 9.359473489061921e-06, + "loss": 2.8097, + "mean_token_accuracy": 0.4371443352302121, + "step": 3455 + }, + { + "epoch": 0.6407119021134594, + "grad_norm": 7.2890625, + "learning_rate": 9.359288097886542e-06, + "loss": 2.7236, + "mean_token_accuracy": 0.4595070422535211, + "step": 3456 + }, + { + "epoch": 0.6408972932888395, + "grad_norm": 8.078125, + "learning_rate": 9.359102706711162e-06, + "loss": 2.8474, + "mean_token_accuracy": 0.45810428119273694, + "step": 3457 + }, + { + "epoch": 0.6410826844642195, + "grad_norm": 6.2265625, + "learning_rate": 9.358917315535781e-06, + "loss": 2.5827, + "mean_token_accuracy": 0.46051919956733367, + "step": 3458 + }, + { + "epoch": 0.6412680756395995, + "grad_norm": 5.42578125, + "learning_rate": 9.358731924360402e-06, + "loss": 2.9487, + "mean_token_accuracy": 0.43360737419033385, + "step": 3459 + }, + { + "epoch": 0.6414534668149796, + "grad_norm": 7.15234375, + "learning_rate": 9.35854653318502e-06, + "loss": 2.611, + "mean_token_accuracy": 0.4728219594228057, + "step": 3460 + }, + { + "epoch": 0.6416388579903597, + "grad_norm": 7.2890625, + "learning_rate": 9.35836114200964e-06, + "loss": 2.8574, + "mean_token_accuracy": 0.41953610712577716, + "step": 3461 + }, + { + "epoch": 0.6418242491657397, + "grad_norm": 6.40234375, + "learning_rate": 9.358175750834261e-06, + "loss": 2.9932, + "mean_token_accuracy": 0.4303088275485105, + "step": 3462 + }, + { + "epoch": 0.6420096403411197, + "grad_norm": 5.97265625, + "learning_rate": 9.35799035965888e-06, + "loss": 2.9876, + "mean_token_accuracy": 0.417957027967831, + "step": 3463 + }, + { + "epoch": 0.6421950315164998, + "grad_norm": 7.6171875, + "learning_rate": 9.357804968483502e-06, + "loss": 3.167, + "mean_token_accuracy": 0.4123508157232016, + "step": 3464 + }, + { + "epoch": 0.6423804226918799, + "grad_norm": 6.31640625, + "learning_rate": 9.357619577308121e-06, + "loss": 2.8087, + "mean_token_accuracy": 0.450201126307321, + "step": 3465 + }, + { + "epoch": 0.64256581386726, + "grad_norm": 6.37109375, + "learning_rate": 9.357434186132742e-06, + "loss": 2.9415, + "mean_token_accuracy": 0.43822721924434566, + "step": 3466 + }, + { + "epoch": 0.6427512050426399, + "grad_norm": 10.6640625, + "learning_rate": 9.35724879495736e-06, + "loss": 2.1166, + "mean_token_accuracy": 0.5081081081081081, + "step": 3467 + }, + { + "epoch": 0.64293659621802, + "grad_norm": 7.3671875, + "learning_rate": 9.357063403781981e-06, + "loss": 2.8479, + "mean_token_accuracy": 0.444538407329105, + "step": 3468 + }, + { + "epoch": 0.6431219873934001, + "grad_norm": 6.68359375, + "learning_rate": 9.356878012606601e-06, + "loss": 2.5465, + "mean_token_accuracy": 0.47539884703043306, + "step": 3469 + }, + { + "epoch": 0.6433073785687802, + "grad_norm": 7.33203125, + "learning_rate": 9.35669262143122e-06, + "loss": 3.033, + "mean_token_accuracy": 0.4175976045145687, + "step": 3470 + }, + { + "epoch": 0.6434927697441601, + "grad_norm": 7.2578125, + "learning_rate": 9.35650723025584e-06, + "loss": 2.9691, + "mean_token_accuracy": 0.4239418913256041, + "step": 3471 + }, + { + "epoch": 0.6436781609195402, + "grad_norm": 5.11328125, + "learning_rate": 9.35632183908046e-06, + "loss": 2.6975, + "mean_token_accuracy": 0.47306485355648537, + "step": 3472 + }, + { + "epoch": 0.6438635520949203, + "grad_norm": 7.625, + "learning_rate": 9.356136447905082e-06, + "loss": 2.9957, + "mean_token_accuracy": 0.42177998894416807, + "step": 3473 + }, + { + "epoch": 0.6440489432703004, + "grad_norm": 5.9609375, + "learning_rate": 9.3559510567297e-06, + "loss": 2.8651, + "mean_token_accuracy": 0.4343128781331029, + "step": 3474 + }, + { + "epoch": 0.6442343344456803, + "grad_norm": 6.6484375, + "learning_rate": 9.355765665554321e-06, + "loss": 2.6631, + "mean_token_accuracy": 0.4327756746855538, + "step": 3475 + }, + { + "epoch": 0.6444197256210604, + "grad_norm": 5.8515625, + "learning_rate": 9.35558027437894e-06, + "loss": 3.2914, + "mean_token_accuracy": 0.3833744543556652, + "step": 3476 + }, + { + "epoch": 0.6446051167964405, + "grad_norm": 7.13671875, + "learning_rate": 9.35539488320356e-06, + "loss": 2.7057, + "mean_token_accuracy": 0.4630716934487021, + "step": 3477 + }, + { + "epoch": 0.6447905079718206, + "grad_norm": 7.453125, + "learning_rate": 9.35520949202818e-06, + "loss": 2.6218, + "mean_token_accuracy": 0.47800216372160115, + "step": 3478 + }, + { + "epoch": 0.6449758991472005, + "grad_norm": 5.609375, + "learning_rate": 9.3550241008528e-06, + "loss": 2.5635, + "mean_token_accuracy": 0.48227341702303766, + "step": 3479 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 5.51171875, + "learning_rate": 9.35483870967742e-06, + "loss": 2.6569, + "mean_token_accuracy": 0.4987199180747568, + "step": 3480 + }, + { + "epoch": 0.6453466814979607, + "grad_norm": 7.7734375, + "learning_rate": 9.35465331850204e-06, + "loss": 2.7052, + "mean_token_accuracy": 0.46191391805945864, + "step": 3481 + }, + { + "epoch": 0.6455320726733408, + "grad_norm": 10.234375, + "learning_rate": 9.354467927326661e-06, + "loss": 2.9382, + "mean_token_accuracy": 0.4238544474393531, + "step": 3482 + }, + { + "epoch": 0.6457174638487208, + "grad_norm": 5.73828125, + "learning_rate": 9.35428253615128e-06, + "loss": 2.4242, + "mean_token_accuracy": 0.5008727795461546, + "step": 3483 + }, + { + "epoch": 0.6459028550241008, + "grad_norm": 9.2109375, + "learning_rate": 9.3540971449759e-06, + "loss": 2.6135, + "mean_token_accuracy": 0.4593809364174768, + "step": 3484 + }, + { + "epoch": 0.6460882461994809, + "grad_norm": 6.9453125, + "learning_rate": 9.353911753800519e-06, + "loss": 3.5662, + "mean_token_accuracy": 0.3657446551515884, + "step": 3485 + }, + { + "epoch": 0.646273637374861, + "grad_norm": 7.0390625, + "learning_rate": 9.35372636262514e-06, + "loss": 2.7669, + "mean_token_accuracy": 0.4478002378121284, + "step": 3486 + }, + { + "epoch": 0.646459028550241, + "grad_norm": 5.45703125, + "learning_rate": 9.35354097144976e-06, + "loss": 2.672, + "mean_token_accuracy": 0.4561678146524734, + "step": 3487 + }, + { + "epoch": 0.646644419725621, + "grad_norm": 4.61328125, + "learning_rate": 9.353355580274379e-06, + "loss": 3.1046, + "mean_token_accuracy": 0.4125609634716831, + "step": 3488 + }, + { + "epoch": 0.6468298109010011, + "grad_norm": 9.1640625, + "learning_rate": 9.353170189099e-06, + "loss": 2.2336, + "mean_token_accuracy": 0.49825970548862114, + "step": 3489 + }, + { + "epoch": 0.6470152020763812, + "grad_norm": 6.0546875, + "learning_rate": 9.35298479792362e-06, + "loss": 2.8191, + "mean_token_accuracy": 0.4522131378629828, + "step": 3490 + }, + { + "epoch": 0.6472005932517613, + "grad_norm": 6.5390625, + "learning_rate": 9.35279940674824e-06, + "loss": 2.5341, + "mean_token_accuracy": 0.49609282500591995, + "step": 3491 + }, + { + "epoch": 0.6473859844271412, + "grad_norm": 5.953125, + "learning_rate": 9.352614015572859e-06, + "loss": 3.1905, + "mean_token_accuracy": 0.4015497967479675, + "step": 3492 + }, + { + "epoch": 0.6475713756025213, + "grad_norm": 6.8359375, + "learning_rate": 9.35242862439748e-06, + "loss": 3.4976, + "mean_token_accuracy": 0.3823094004441155, + "step": 3493 + }, + { + "epoch": 0.6477567667779014, + "grad_norm": 6.34375, + "learning_rate": 9.352243233222098e-06, + "loss": 3.1655, + "mean_token_accuracy": 0.4075322561897013, + "step": 3494 + }, + { + "epoch": 0.6479421579532815, + "grad_norm": 6.6484375, + "learning_rate": 9.352057842046719e-06, + "loss": 2.9146, + "mean_token_accuracy": 0.4371799062874578, + "step": 3495 + }, + { + "epoch": 0.6481275491286614, + "grad_norm": 9.140625, + "learning_rate": 9.35187245087134e-06, + "loss": 3.2363, + "mean_token_accuracy": 0.3997482693517936, + "step": 3496 + }, + { + "epoch": 0.6483129403040415, + "grad_norm": 6.51953125, + "learning_rate": 9.35168705969596e-06, + "loss": 2.5667, + "mean_token_accuracy": 0.4763842643975829, + "step": 3497 + }, + { + "epoch": 0.6484983314794216, + "grad_norm": 6.015625, + "learning_rate": 9.351501668520579e-06, + "loss": 3.1906, + "mean_token_accuracy": 0.43952318460192474, + "step": 3498 + }, + { + "epoch": 0.6486837226548017, + "grad_norm": 5.328125, + "learning_rate": 9.3513162773452e-06, + "loss": 2.4719, + "mean_token_accuracy": 0.4950111515436084, + "step": 3499 + }, + { + "epoch": 0.6488691138301816, + "grad_norm": 9.34375, + "learning_rate": 9.35113088616982e-06, + "loss": 2.8902, + "mean_token_accuracy": 0.4483165977554637, + "step": 3500 + }, + { + "epoch": 0.6490545050055617, + "grad_norm": 7.40625, + "learning_rate": 9.350945494994438e-06, + "loss": 2.7395, + "mean_token_accuracy": 0.4655295972481009, + "step": 3501 + }, + { + "epoch": 0.6492398961809418, + "grad_norm": 4.96484375, + "learning_rate": 9.350760103819059e-06, + "loss": 2.8185, + "mean_token_accuracy": 0.45587241519124205, + "step": 3502 + }, + { + "epoch": 0.6494252873563219, + "grad_norm": 6.4921875, + "learning_rate": 9.350574712643678e-06, + "loss": 2.7182, + "mean_token_accuracy": 0.46024662360540225, + "step": 3503 + }, + { + "epoch": 0.6496106785317018, + "grad_norm": 6.37109375, + "learning_rate": 9.350389321468298e-06, + "loss": 2.6893, + "mean_token_accuracy": 0.4928236464402444, + "step": 3504 + }, + { + "epoch": 0.6497960697070819, + "grad_norm": 5.8984375, + "learning_rate": 9.350203930292919e-06, + "loss": 2.4816, + "mean_token_accuracy": 0.47257634153572486, + "step": 3505 + }, + { + "epoch": 0.649981460882462, + "grad_norm": 5.99609375, + "learning_rate": 9.35001853911754e-06, + "loss": 2.4901, + "mean_token_accuracy": 0.4763820399688554, + "step": 3506 + }, + { + "epoch": 0.6501668520578421, + "grad_norm": 7.16015625, + "learning_rate": 9.34983314794216e-06, + "loss": 2.4037, + "mean_token_accuracy": 0.46898115008351227, + "step": 3507 + }, + { + "epoch": 0.650352243233222, + "grad_norm": 6.37890625, + "learning_rate": 9.349647756766779e-06, + "loss": 2.6365, + "mean_token_accuracy": 0.4667563125653668, + "step": 3508 + }, + { + "epoch": 0.6505376344086021, + "grad_norm": 5.93359375, + "learning_rate": 9.349462365591399e-06, + "loss": 3.0668, + "mean_token_accuracy": 0.41711040113596026, + "step": 3509 + }, + { + "epoch": 0.6507230255839822, + "grad_norm": 11.3671875, + "learning_rate": 9.349276974416018e-06, + "loss": 2.9947, + "mean_token_accuracy": 0.42299602710913764, + "step": 3510 + }, + { + "epoch": 0.6509084167593623, + "grad_norm": 10.796875, + "learning_rate": 9.349091583240638e-06, + "loss": 2.9209, + "mean_token_accuracy": 0.4326204586446792, + "step": 3511 + }, + { + "epoch": 0.6510938079347423, + "grad_norm": 5.5546875, + "learning_rate": 9.348906192065259e-06, + "loss": 2.8007, + "mean_token_accuracy": 0.43424317617866004, + "step": 3512 + }, + { + "epoch": 0.6512791991101223, + "grad_norm": 6.32421875, + "learning_rate": 9.34872080088988e-06, + "loss": 3.0032, + "mean_token_accuracy": 0.4412568306010929, + "step": 3513 + }, + { + "epoch": 0.6514645902855024, + "grad_norm": 10.3984375, + "learning_rate": 9.348535409714498e-06, + "loss": 2.7403, + "mean_token_accuracy": 0.45398080180688877, + "step": 3514 + }, + { + "epoch": 0.6516499814608825, + "grad_norm": 6.60546875, + "learning_rate": 9.348350018539119e-06, + "loss": 2.8257, + "mean_token_accuracy": 0.4348434716212777, + "step": 3515 + }, + { + "epoch": 0.6518353726362626, + "grad_norm": 5.42578125, + "learning_rate": 9.348164627363739e-06, + "loss": 2.3386, + "mean_token_accuracy": 0.5146098683506368, + "step": 3516 + }, + { + "epoch": 0.6520207638116425, + "grad_norm": 8.3046875, + "learning_rate": 9.347979236188358e-06, + "loss": 2.7749, + "mean_token_accuracy": 0.44810450092290216, + "step": 3517 + }, + { + "epoch": 0.6522061549870226, + "grad_norm": 9.5625, + "learning_rate": 9.347793845012978e-06, + "loss": 2.8623, + "mean_token_accuracy": 0.4358325957769629, + "step": 3518 + }, + { + "epoch": 0.6523915461624027, + "grad_norm": 6.359375, + "learning_rate": 9.347608453837597e-06, + "loss": 3.0063, + "mean_token_accuracy": 0.4304139172165567, + "step": 3519 + }, + { + "epoch": 0.6525769373377828, + "grad_norm": 5.15625, + "learning_rate": 9.347423062662218e-06, + "loss": 2.7654, + "mean_token_accuracy": 0.4473443820957849, + "step": 3520 + }, + { + "epoch": 0.6527623285131627, + "grad_norm": 7.5625, + "learning_rate": 9.347237671486838e-06, + "loss": 3.2825, + "mean_token_accuracy": 0.38628239499553174, + "step": 3521 + }, + { + "epoch": 0.6529477196885428, + "grad_norm": 10.4609375, + "learning_rate": 9.347052280311459e-06, + "loss": 2.5936, + "mean_token_accuracy": 0.46189454668623686, + "step": 3522 + }, + { + "epoch": 0.6531331108639229, + "grad_norm": 6.03125, + "learning_rate": 9.346866889136077e-06, + "loss": 2.7846, + "mean_token_accuracy": 0.4692909651410956, + "step": 3523 + }, + { + "epoch": 0.653318502039303, + "grad_norm": 6.234375, + "learning_rate": 9.346681497960698e-06, + "loss": 2.7169, + "mean_token_accuracy": 0.4525537450362865, + "step": 3524 + }, + { + "epoch": 0.6535038932146829, + "grad_norm": 5.1953125, + "learning_rate": 9.346496106785318e-06, + "loss": 3.1949, + "mean_token_accuracy": 0.40904212503353904, + "step": 3525 + }, + { + "epoch": 0.653689284390063, + "grad_norm": 5.85546875, + "learning_rate": 9.346310715609937e-06, + "loss": 3.1406, + "mean_token_accuracy": 0.43195342820181115, + "step": 3526 + }, + { + "epoch": 0.6538746755654431, + "grad_norm": 5.17578125, + "learning_rate": 9.346125324434558e-06, + "loss": 3.0217, + "mean_token_accuracy": 0.41850097520200613, + "step": 3527 + }, + { + "epoch": 0.6540600667408232, + "grad_norm": 7.1875, + "learning_rate": 9.345939933259176e-06, + "loss": 3.3386, + "mean_token_accuracy": 0.3845357776463631, + "step": 3528 + }, + { + "epoch": 0.6542454579162031, + "grad_norm": 4.8828125, + "learning_rate": 9.345754542083799e-06, + "loss": 3.3616, + "mean_token_accuracy": 0.3970710909259431, + "step": 3529 + }, + { + "epoch": 0.6544308490915832, + "grad_norm": 6.96875, + "learning_rate": 9.345569150908417e-06, + "loss": 2.746, + "mean_token_accuracy": 0.44024289263041677, + "step": 3530 + }, + { + "epoch": 0.6546162402669633, + "grad_norm": 5.8203125, + "learning_rate": 9.345383759733038e-06, + "loss": 2.6817, + "mean_token_accuracy": 0.46041506533435816, + "step": 3531 + }, + { + "epoch": 0.6548016314423434, + "grad_norm": 7.046875, + "learning_rate": 9.345198368557657e-06, + "loss": 2.7184, + "mean_token_accuracy": 0.44545743665853915, + "step": 3532 + }, + { + "epoch": 0.6549870226177233, + "grad_norm": 8.0703125, + "learning_rate": 9.345012977382277e-06, + "loss": 2.948, + "mean_token_accuracy": 0.42253978564468986, + "step": 3533 + }, + { + "epoch": 0.6551724137931034, + "grad_norm": 6.4140625, + "learning_rate": 9.344827586206898e-06, + "loss": 2.5301, + "mean_token_accuracy": 0.470106810930781, + "step": 3534 + }, + { + "epoch": 0.6553578049684835, + "grad_norm": 6.37109375, + "learning_rate": 9.344642195031517e-06, + "loss": 2.5228, + "mean_token_accuracy": 0.49033329379670265, + "step": 3535 + }, + { + "epoch": 0.6555431961438636, + "grad_norm": 5.30078125, + "learning_rate": 9.344456803856137e-06, + "loss": 2.3649, + "mean_token_accuracy": 0.4945665298237141, + "step": 3536 + }, + { + "epoch": 0.6557285873192435, + "grad_norm": 6.3671875, + "learning_rate": 9.344271412680758e-06, + "loss": 2.6566, + "mean_token_accuracy": 0.4783092324805339, + "step": 3537 + }, + { + "epoch": 0.6559139784946236, + "grad_norm": 6.16015625, + "learning_rate": 9.344086021505378e-06, + "loss": 2.6125, + "mean_token_accuracy": 0.46535962110364576, + "step": 3538 + }, + { + "epoch": 0.6560993696700037, + "grad_norm": 6.12890625, + "learning_rate": 9.343900630329997e-06, + "loss": 2.7781, + "mean_token_accuracy": 0.45773028540511296, + "step": 3539 + }, + { + "epoch": 0.6562847608453838, + "grad_norm": 7.63671875, + "learning_rate": 9.343715239154617e-06, + "loss": 3.2221, + "mean_token_accuracy": 0.37738353849867246, + "step": 3540 + }, + { + "epoch": 0.6564701520207639, + "grad_norm": 5.53125, + "learning_rate": 9.343529847979236e-06, + "loss": 2.6657, + "mean_token_accuracy": 0.47190366972477066, + "step": 3541 + }, + { + "epoch": 0.6566555431961438, + "grad_norm": 6.375, + "learning_rate": 9.343344456803857e-06, + "loss": 3.0669, + "mean_token_accuracy": 0.4262676939616556, + "step": 3542 + }, + { + "epoch": 0.6568409343715239, + "grad_norm": 6.69921875, + "learning_rate": 9.343159065628477e-06, + "loss": 2.1978, + "mean_token_accuracy": 0.5372761349437735, + "step": 3543 + }, + { + "epoch": 0.657026325546904, + "grad_norm": 7.1953125, + "learning_rate": 9.342973674453096e-06, + "loss": 2.8334, + "mean_token_accuracy": 0.43960423251339836, + "step": 3544 + }, + { + "epoch": 0.6572117167222841, + "grad_norm": 6.515625, + "learning_rate": 9.342788283277718e-06, + "loss": 3.3283, + "mean_token_accuracy": 0.4047042545831892, + "step": 3545 + }, + { + "epoch": 0.657397107897664, + "grad_norm": 11.53125, + "learning_rate": 9.342602892102337e-06, + "loss": 2.5772, + "mean_token_accuracy": 0.4546969114443163, + "step": 3546 + }, + { + "epoch": 0.6575824990730441, + "grad_norm": 8.4140625, + "learning_rate": 9.342417500926957e-06, + "loss": 2.6725, + "mean_token_accuracy": 0.4457540851861773, + "step": 3547 + }, + { + "epoch": 0.6577678902484242, + "grad_norm": 5.33203125, + "learning_rate": 9.342232109751576e-06, + "loss": 2.8483, + "mean_token_accuracy": 0.4439712284686731, + "step": 3548 + }, + { + "epoch": 0.6579532814238043, + "grad_norm": 6.4453125, + "learning_rate": 9.342046718576197e-06, + "loss": 3.2162, + "mean_token_accuracy": 0.4350282485875706, + "step": 3549 + }, + { + "epoch": 0.6581386725991842, + "grad_norm": 7.57421875, + "learning_rate": 9.341861327400817e-06, + "loss": 3.2735, + "mean_token_accuracy": 0.39293333333333336, + "step": 3550 + }, + { + "epoch": 0.6583240637745643, + "grad_norm": 11.0859375, + "learning_rate": 9.341675936225436e-06, + "loss": 2.5121, + "mean_token_accuracy": 0.4711549759135237, + "step": 3551 + }, + { + "epoch": 0.6585094549499444, + "grad_norm": 5.39453125, + "learning_rate": 9.341490545050056e-06, + "loss": 2.9018, + "mean_token_accuracy": 0.4497547621763431, + "step": 3552 + }, + { + "epoch": 0.6586948461253245, + "grad_norm": 9.5625, + "learning_rate": 9.341305153874677e-06, + "loss": 2.9469, + "mean_token_accuracy": 0.4166553272554089, + "step": 3553 + }, + { + "epoch": 0.6588802373007044, + "grad_norm": 7.9765625, + "learning_rate": 9.341119762699297e-06, + "loss": 2.7343, + "mean_token_accuracy": 0.4546106067845198, + "step": 3554 + }, + { + "epoch": 0.6590656284760845, + "grad_norm": 8.3359375, + "learning_rate": 9.340934371523916e-06, + "loss": 2.4479, + "mean_token_accuracy": 0.48914141414141415, + "step": 3555 + }, + { + "epoch": 0.6592510196514646, + "grad_norm": 9.1484375, + "learning_rate": 9.340748980348537e-06, + "loss": 2.8444, + "mean_token_accuracy": 0.4476233050216063, + "step": 3556 + }, + { + "epoch": 0.6594364108268447, + "grad_norm": 6.23046875, + "learning_rate": 9.340563589173155e-06, + "loss": 2.8782, + "mean_token_accuracy": 0.4375287092328893, + "step": 3557 + }, + { + "epoch": 0.6596218020022246, + "grad_norm": 6.015625, + "learning_rate": 9.340378197997776e-06, + "loss": 2.8338, + "mean_token_accuracy": 0.44735807001442246, + "step": 3558 + }, + { + "epoch": 0.6598071931776047, + "grad_norm": 6.58984375, + "learning_rate": 9.340192806822396e-06, + "loss": 2.8294, + "mean_token_accuracy": 0.4365513809405325, + "step": 3559 + }, + { + "epoch": 0.6599925843529848, + "grad_norm": 5.046875, + "learning_rate": 9.340007415647015e-06, + "loss": 2.9411, + "mean_token_accuracy": 0.43771464252263503, + "step": 3560 + }, + { + "epoch": 0.6601779755283649, + "grad_norm": 8.265625, + "learning_rate": 9.339822024471636e-06, + "loss": 2.851, + "mean_token_accuracy": 0.4304155814743517, + "step": 3561 + }, + { + "epoch": 0.6603633667037448, + "grad_norm": 7.7578125, + "learning_rate": 9.339636633296256e-06, + "loss": 3.0229, + "mean_token_accuracy": 0.3970982687149476, + "step": 3562 + }, + { + "epoch": 0.6605487578791249, + "grad_norm": 4.55078125, + "learning_rate": 9.339451242120877e-06, + "loss": 2.9341, + "mean_token_accuracy": 0.42667102824119507, + "step": 3563 + }, + { + "epoch": 0.660734149054505, + "grad_norm": 6.25390625, + "learning_rate": 9.339265850945496e-06, + "loss": 3.1075, + "mean_token_accuracy": 0.4174991766384894, + "step": 3564 + }, + { + "epoch": 0.6609195402298851, + "grad_norm": 6.296875, + "learning_rate": 9.339080459770116e-06, + "loss": 2.8463, + "mean_token_accuracy": 0.4427916029473099, + "step": 3565 + }, + { + "epoch": 0.6611049314052652, + "grad_norm": 6.21875, + "learning_rate": 9.338895068594735e-06, + "loss": 2.7832, + "mean_token_accuracy": 0.44074028679985294, + "step": 3566 + }, + { + "epoch": 0.6612903225806451, + "grad_norm": 5.0703125, + "learning_rate": 9.338709677419355e-06, + "loss": 2.4891, + "mean_token_accuracy": 0.4980330448465775, + "step": 3567 + }, + { + "epoch": 0.6614757137560252, + "grad_norm": 9.015625, + "learning_rate": 9.338524286243976e-06, + "loss": 2.8717, + "mean_token_accuracy": 0.443019943019943, + "step": 3568 + }, + { + "epoch": 0.6616611049314053, + "grad_norm": 7.08984375, + "learning_rate": 9.338338895068596e-06, + "loss": 3.3111, + "mean_token_accuracy": 0.3847064393939394, + "step": 3569 + }, + { + "epoch": 0.6618464961067854, + "grad_norm": 9.21875, + "learning_rate": 9.338153503893215e-06, + "loss": 2.8339, + "mean_token_accuracy": 0.4424587364826409, + "step": 3570 + }, + { + "epoch": 0.6620318872821653, + "grad_norm": 6.94140625, + "learning_rate": 9.337968112717836e-06, + "loss": 2.4328, + "mean_token_accuracy": 0.48807964369924023, + "step": 3571 + }, + { + "epoch": 0.6622172784575454, + "grad_norm": 7.51171875, + "learning_rate": 9.337782721542456e-06, + "loss": 2.9582, + "mean_token_accuracy": 0.42636655948553054, + "step": 3572 + }, + { + "epoch": 0.6624026696329255, + "grad_norm": 6.56640625, + "learning_rate": 9.337597330367075e-06, + "loss": 3.6296, + "mean_token_accuracy": 0.36961099412612214, + "step": 3573 + }, + { + "epoch": 0.6625880608083056, + "grad_norm": 7.24609375, + "learning_rate": 9.337411939191695e-06, + "loss": 2.4658, + "mean_token_accuracy": 0.497765136123527, + "step": 3574 + }, + { + "epoch": 0.6627734519836855, + "grad_norm": 4.41015625, + "learning_rate": 9.337226548016314e-06, + "loss": 2.7012, + "mean_token_accuracy": 0.4707991803278688, + "step": 3575 + }, + { + "epoch": 0.6629588431590656, + "grad_norm": 4.90625, + "learning_rate": 9.337041156840935e-06, + "loss": 3.1077, + "mean_token_accuracy": 0.424341656433347, + "step": 3576 + }, + { + "epoch": 0.6631442343344457, + "grad_norm": 5.11328125, + "learning_rate": 9.336855765665555e-06, + "loss": 3.0677, + "mean_token_accuracy": 0.42668621700879766, + "step": 3577 + }, + { + "epoch": 0.6633296255098258, + "grad_norm": 9.4140625, + "learning_rate": 9.336670374490176e-06, + "loss": 2.6996, + "mean_token_accuracy": 0.46281843616551316, + "step": 3578 + }, + { + "epoch": 0.6635150166852057, + "grad_norm": 6.95703125, + "learning_rate": 9.336484983314794e-06, + "loss": 2.8427, + "mean_token_accuracy": 0.45353524349057633, + "step": 3579 + }, + { + "epoch": 0.6637004078605858, + "grad_norm": 6.21875, + "learning_rate": 9.336299592139415e-06, + "loss": 3.0039, + "mean_token_accuracy": 0.41881824675987345, + "step": 3580 + }, + { + "epoch": 0.6638857990359659, + "grad_norm": 9.5, + "learning_rate": 9.336114200964035e-06, + "loss": 3.1638, + "mean_token_accuracy": 0.4225332784410594, + "step": 3581 + }, + { + "epoch": 0.664071190211346, + "grad_norm": 6.62109375, + "learning_rate": 9.335928809788654e-06, + "loss": 3.1135, + "mean_token_accuracy": 0.4169104463527413, + "step": 3582 + }, + { + "epoch": 0.6642565813867259, + "grad_norm": 7.87890625, + "learning_rate": 9.335743418613275e-06, + "loss": 3.2602, + "mean_token_accuracy": 0.397458318759473, + "step": 3583 + }, + { + "epoch": 0.664441972562106, + "grad_norm": 6.90625, + "learning_rate": 9.335558027437894e-06, + "loss": 2.7016, + "mean_token_accuracy": 0.4630177514792899, + "step": 3584 + }, + { + "epoch": 0.6646273637374861, + "grad_norm": 8.0, + "learning_rate": 9.335372636262516e-06, + "loss": 2.6147, + "mean_token_accuracy": 0.47836630504148053, + "step": 3585 + }, + { + "epoch": 0.6648127549128662, + "grad_norm": 7.9140625, + "learning_rate": 9.335187245087134e-06, + "loss": 2.5899, + "mean_token_accuracy": 0.4601692121744863, + "step": 3586 + }, + { + "epoch": 0.6649981460882461, + "grad_norm": 8.6015625, + "learning_rate": 9.335001853911755e-06, + "loss": 3.018, + "mean_token_accuracy": 0.4043239369326326, + "step": 3587 + }, + { + "epoch": 0.6651835372636262, + "grad_norm": 6.171875, + "learning_rate": 9.334816462736375e-06, + "loss": 2.7246, + "mean_token_accuracy": 0.44770132675100277, + "step": 3588 + }, + { + "epoch": 0.6653689284390063, + "grad_norm": 6.9140625, + "learning_rate": 9.334631071560994e-06, + "loss": 2.5851, + "mean_token_accuracy": 0.4643243243243243, + "step": 3589 + }, + { + "epoch": 0.6655543196143864, + "grad_norm": 6.421875, + "learning_rate": 9.334445680385615e-06, + "loss": 2.7557, + "mean_token_accuracy": 0.4528301886792453, + "step": 3590 + }, + { + "epoch": 0.6657397107897665, + "grad_norm": 8.4140625, + "learning_rate": 9.334260289210234e-06, + "loss": 2.7999, + "mean_token_accuracy": 0.4544693906010319, + "step": 3591 + }, + { + "epoch": 0.6659251019651464, + "grad_norm": 7.125, + "learning_rate": 9.334074898034854e-06, + "loss": 2.5648, + "mean_token_accuracy": 0.4726477024070022, + "step": 3592 + }, + { + "epoch": 0.6661104931405265, + "grad_norm": 7.953125, + "learning_rate": 9.333889506859475e-06, + "loss": 2.7071, + "mean_token_accuracy": 0.47197408461764995, + "step": 3593 + }, + { + "epoch": 0.6662958843159066, + "grad_norm": 6.5078125, + "learning_rate": 9.333704115684095e-06, + "loss": 2.8733, + "mean_token_accuracy": 0.4277215189873418, + "step": 3594 + }, + { + "epoch": 0.6664812754912867, + "grad_norm": 6.2890625, + "learning_rate": 9.333518724508714e-06, + "loss": 2.603, + "mean_token_accuracy": 0.4681585677749361, + "step": 3595 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 7.38671875, + "learning_rate": 9.333333333333334e-06, + "loss": 2.3882, + "mean_token_accuracy": 0.49659304511278196, + "step": 3596 + }, + { + "epoch": 0.6668520578420467, + "grad_norm": 7.92578125, + "learning_rate": 9.333147942157955e-06, + "loss": 2.6524, + "mean_token_accuracy": 0.45775144590210187, + "step": 3597 + }, + { + "epoch": 0.6670374490174268, + "grad_norm": 6.0625, + "learning_rate": 9.332962550982574e-06, + "loss": 3.2304, + "mean_token_accuracy": 0.40518134715025905, + "step": 3598 + }, + { + "epoch": 0.6672228401928069, + "grad_norm": 7.97265625, + "learning_rate": 9.332777159807194e-06, + "loss": 2.674, + "mean_token_accuracy": 0.4665566696281254, + "step": 3599 + }, + { + "epoch": 0.6674082313681868, + "grad_norm": 7.56640625, + "learning_rate": 9.332591768631813e-06, + "loss": 2.5623, + "mean_token_accuracy": 0.48282009724473257, + "step": 3600 + }, + { + "epoch": 0.6675936225435669, + "grad_norm": 7.265625, + "learning_rate": 9.332406377456433e-06, + "loss": 3.2064, + "mean_token_accuracy": 0.39840054066231134, + "step": 3601 + }, + { + "epoch": 0.667779013718947, + "grad_norm": 8.9921875, + "learning_rate": 9.332220986281054e-06, + "loss": 2.6115, + "mean_token_accuracy": 0.4723478260869565, + "step": 3602 + }, + { + "epoch": 0.6679644048943271, + "grad_norm": 10.6796875, + "learning_rate": 9.332035595105674e-06, + "loss": 2.4115, + "mean_token_accuracy": 0.4828637815858263, + "step": 3603 + }, + { + "epoch": 0.668149796069707, + "grad_norm": 7.5390625, + "learning_rate": 9.331850203930293e-06, + "loss": 2.9932, + "mean_token_accuracy": 0.43436265183667494, + "step": 3604 + }, + { + "epoch": 0.6683351872450871, + "grad_norm": 7.359375, + "learning_rate": 9.331664812754914e-06, + "loss": 2.8746, + "mean_token_accuracy": 0.4319745353732416, + "step": 3605 + }, + { + "epoch": 0.6685205784204672, + "grad_norm": 8.265625, + "learning_rate": 9.331479421579534e-06, + "loss": 2.4358, + "mean_token_accuracy": 0.4887960501329282, + "step": 3606 + }, + { + "epoch": 0.6687059695958473, + "grad_norm": 7.2578125, + "learning_rate": 9.331294030404153e-06, + "loss": 2.8172, + "mean_token_accuracy": 0.439513998943476, + "step": 3607 + }, + { + "epoch": 0.6688913607712272, + "grad_norm": 9.0859375, + "learning_rate": 9.331108639228773e-06, + "loss": 3.0082, + "mean_token_accuracy": 0.42958129418162044, + "step": 3608 + }, + { + "epoch": 0.6690767519466073, + "grad_norm": 11.0, + "learning_rate": 9.330923248053392e-06, + "loss": 2.7527, + "mean_token_accuracy": 0.4611883691529709, + "step": 3609 + }, + { + "epoch": 0.6692621431219874, + "grad_norm": 10.78125, + "learning_rate": 9.330737856878014e-06, + "loss": 2.5035, + "mean_token_accuracy": 0.46387246078683464, + "step": 3610 + }, + { + "epoch": 0.6694475342973675, + "grad_norm": 5.68359375, + "learning_rate": 9.330552465702633e-06, + "loss": 3.1006, + "mean_token_accuracy": 0.4197340797760672, + "step": 3611 + }, + { + "epoch": 0.6696329254727474, + "grad_norm": 6.234375, + "learning_rate": 9.330367074527254e-06, + "loss": 2.6293, + "mean_token_accuracy": 0.4637002341920375, + "step": 3612 + }, + { + "epoch": 0.6698183166481275, + "grad_norm": 5.10546875, + "learning_rate": 9.330181683351873e-06, + "loss": 2.7377, + "mean_token_accuracy": 0.4565049044914817, + "step": 3613 + }, + { + "epoch": 0.6700037078235076, + "grad_norm": 6.05859375, + "learning_rate": 9.329996292176493e-06, + "loss": 2.7477, + "mean_token_accuracy": 0.47724265678750993, + "step": 3614 + }, + { + "epoch": 0.6701890989988877, + "grad_norm": 5.47265625, + "learning_rate": 9.329810901001113e-06, + "loss": 3.0433, + "mean_token_accuracy": 0.42712964075453563, + "step": 3615 + }, + { + "epoch": 0.6703744901742678, + "grad_norm": 8.515625, + "learning_rate": 9.329625509825732e-06, + "loss": 2.761, + "mean_token_accuracy": 0.43816049239345023, + "step": 3616 + }, + { + "epoch": 0.6705598813496477, + "grad_norm": 6.9140625, + "learning_rate": 9.329440118650353e-06, + "loss": 2.5539, + "mean_token_accuracy": 0.45009666505558243, + "step": 3617 + }, + { + "epoch": 0.6707452725250278, + "grad_norm": 8.578125, + "learning_rate": 9.329254727474973e-06, + "loss": 2.8629, + "mean_token_accuracy": 0.4499082989454379, + "step": 3618 + }, + { + "epoch": 0.6709306637004079, + "grad_norm": 6.234375, + "learning_rate": 9.329069336299594e-06, + "loss": 2.3471, + "mean_token_accuracy": 0.5064914992272025, + "step": 3619 + }, + { + "epoch": 0.671116054875788, + "grad_norm": 8.8515625, + "learning_rate": 9.328883945124213e-06, + "loss": 2.9939, + "mean_token_accuracy": 0.39386302994367034, + "step": 3620 + }, + { + "epoch": 0.6713014460511679, + "grad_norm": 6.03125, + "learning_rate": 9.328698553948833e-06, + "loss": 2.8051, + "mean_token_accuracy": 0.439498504715896, + "step": 3621 + }, + { + "epoch": 0.671486837226548, + "grad_norm": 6.9453125, + "learning_rate": 9.328513162773452e-06, + "loss": 3.2633, + "mean_token_accuracy": 0.4141247182569497, + "step": 3622 + }, + { + "epoch": 0.6716722284019281, + "grad_norm": 6.828125, + "learning_rate": 9.328327771598072e-06, + "loss": 2.487, + "mean_token_accuracy": 0.48067349926793557, + "step": 3623 + }, + { + "epoch": 0.6718576195773082, + "grad_norm": 5.90234375, + "learning_rate": 9.328142380422693e-06, + "loss": 2.6918, + "mean_token_accuracy": 0.4550006955070246, + "step": 3624 + }, + { + "epoch": 0.6720430107526881, + "grad_norm": 5.55078125, + "learning_rate": 9.327956989247312e-06, + "loss": 2.6263, + "mean_token_accuracy": 0.484351302909141, + "step": 3625 + }, + { + "epoch": 0.6722284019280682, + "grad_norm": 6.0625, + "learning_rate": 9.327771598071934e-06, + "loss": 2.9171, + "mean_token_accuracy": 0.42613138686131385, + "step": 3626 + }, + { + "epoch": 0.6724137931034483, + "grad_norm": 10.1171875, + "learning_rate": 9.327586206896553e-06, + "loss": 2.7052, + "mean_token_accuracy": 0.4434087882822903, + "step": 3627 + }, + { + "epoch": 0.6725991842788284, + "grad_norm": 6.015625, + "learning_rate": 9.327400815721173e-06, + "loss": 3.1685, + "mean_token_accuracy": 0.40927152317880794, + "step": 3628 + }, + { + "epoch": 0.6727845754542083, + "grad_norm": 9.3046875, + "learning_rate": 9.327215424545792e-06, + "loss": 2.6157, + "mean_token_accuracy": 0.45728921500761455, + "step": 3629 + }, + { + "epoch": 0.6729699666295884, + "grad_norm": 11.625, + "learning_rate": 9.327030033370412e-06, + "loss": 3.1095, + "mean_token_accuracy": 0.3970977917981073, + "step": 3630 + }, + { + "epoch": 0.6731553578049685, + "grad_norm": 9.953125, + "learning_rate": 9.326844642195033e-06, + "loss": 2.5221, + "mean_token_accuracy": 0.4811220420101037, + "step": 3631 + }, + { + "epoch": 0.6733407489803486, + "grad_norm": 5.68359375, + "learning_rate": 9.326659251019652e-06, + "loss": 3.0323, + "mean_token_accuracy": 0.4242041435068216, + "step": 3632 + }, + { + "epoch": 0.6735261401557285, + "grad_norm": 7.34765625, + "learning_rate": 9.326473859844272e-06, + "loss": 3.0189, + "mean_token_accuracy": 0.4166666666666667, + "step": 3633 + }, + { + "epoch": 0.6737115313311086, + "grad_norm": 8.359375, + "learning_rate": 9.326288468668893e-06, + "loss": 2.5294, + "mean_token_accuracy": 0.47661037214168284, + "step": 3634 + }, + { + "epoch": 0.6738969225064887, + "grad_norm": 5.91015625, + "learning_rate": 9.326103077493513e-06, + "loss": 2.7859, + "mean_token_accuracy": 0.4438757706176573, + "step": 3635 + }, + { + "epoch": 0.6740823136818688, + "grad_norm": 5.80078125, + "learning_rate": 9.325917686318132e-06, + "loss": 2.6873, + "mean_token_accuracy": 0.45653973509933776, + "step": 3636 + }, + { + "epoch": 0.6742677048572487, + "grad_norm": 6.5234375, + "learning_rate": 9.325732295142752e-06, + "loss": 2.5995, + "mean_token_accuracy": 0.46665730731433386, + "step": 3637 + }, + { + "epoch": 0.6744530960326288, + "grad_norm": 6.0078125, + "learning_rate": 9.325546903967371e-06, + "loss": 3.8748, + "mean_token_accuracy": 0.37796052631578947, + "step": 3638 + }, + { + "epoch": 0.6746384872080089, + "grad_norm": 6.66796875, + "learning_rate": 9.325361512791992e-06, + "loss": 2.6198, + "mean_token_accuracy": 0.4668118766479422, + "step": 3639 + }, + { + "epoch": 0.674823878383389, + "grad_norm": 5.34375, + "learning_rate": 9.325176121616612e-06, + "loss": 3.0215, + "mean_token_accuracy": 0.4283849918433931, + "step": 3640 + }, + { + "epoch": 0.675009269558769, + "grad_norm": 5.93359375, + "learning_rate": 9.324990730441231e-06, + "loss": 2.859, + "mean_token_accuracy": 0.43352295277153063, + "step": 3641 + }, + { + "epoch": 0.675194660734149, + "grad_norm": 5.39453125, + "learning_rate": 9.324805339265852e-06, + "loss": 2.3806, + "mean_token_accuracy": 0.5106893792157311, + "step": 3642 + }, + { + "epoch": 0.6753800519095291, + "grad_norm": 8.796875, + "learning_rate": 9.324619948090472e-06, + "loss": 2.5917, + "mean_token_accuracy": 0.4822549647661755, + "step": 3643 + }, + { + "epoch": 0.6755654430849092, + "grad_norm": 6.16015625, + "learning_rate": 9.324434556915092e-06, + "loss": 2.8296, + "mean_token_accuracy": 0.45179584120982985, + "step": 3644 + }, + { + "epoch": 0.6757508342602893, + "grad_norm": 5.8984375, + "learning_rate": 9.324249165739711e-06, + "loss": 2.427, + "mean_token_accuracy": 0.48312611012433393, + "step": 3645 + }, + { + "epoch": 0.6759362254356692, + "grad_norm": 5.2109375, + "learning_rate": 9.324063774564332e-06, + "loss": 2.7307, + "mean_token_accuracy": 0.4411764705882353, + "step": 3646 + }, + { + "epoch": 0.6761216166110493, + "grad_norm": 6.37109375, + "learning_rate": 9.32387838338895e-06, + "loss": 2.5482, + "mean_token_accuracy": 0.4753647452762497, + "step": 3647 + }, + { + "epoch": 0.6763070077864294, + "grad_norm": 6.9453125, + "learning_rate": 9.323692992213571e-06, + "loss": 2.7346, + "mean_token_accuracy": 0.4473684210526316, + "step": 3648 + }, + { + "epoch": 0.6764923989618095, + "grad_norm": 8.453125, + "learning_rate": 9.323507601038192e-06, + "loss": 2.7356, + "mean_token_accuracy": 0.42458928090492865, + "step": 3649 + }, + { + "epoch": 0.6766777901371894, + "grad_norm": 6.015625, + "learning_rate": 9.323322209862812e-06, + "loss": 2.4363, + "mean_token_accuracy": 0.4764303241637608, + "step": 3650 + }, + { + "epoch": 0.6768631813125695, + "grad_norm": 6.28515625, + "learning_rate": 9.323136818687431e-06, + "loss": 2.6247, + "mean_token_accuracy": 0.46879449454200284, + "step": 3651 + }, + { + "epoch": 0.6770485724879496, + "grad_norm": 10.5390625, + "learning_rate": 9.322951427512051e-06, + "loss": 2.6109, + "mean_token_accuracy": 0.4587063422014878, + "step": 3652 + }, + { + "epoch": 0.6772339636633297, + "grad_norm": 6.2890625, + "learning_rate": 9.322766036336672e-06, + "loss": 3.1121, + "mean_token_accuracy": 0.408524771657902, + "step": 3653 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 5.26953125, + "learning_rate": 9.32258064516129e-06, + "loss": 3.0713, + "mean_token_accuracy": 0.41240128692600175, + "step": 3654 + }, + { + "epoch": 0.6776047460140897, + "grad_norm": 6.5078125, + "learning_rate": 9.322395253985911e-06, + "loss": 2.8869, + "mean_token_accuracy": 0.43451864700780574, + "step": 3655 + }, + { + "epoch": 0.6777901371894698, + "grad_norm": 5.71875, + "learning_rate": 9.32220986281053e-06, + "loss": 2.2131, + "mean_token_accuracy": 0.5030404378230465, + "step": 3656 + }, + { + "epoch": 0.6779755283648499, + "grad_norm": 6.50390625, + "learning_rate": 9.32202447163515e-06, + "loss": 3.1317, + "mean_token_accuracy": 0.41773084479371314, + "step": 3657 + }, + { + "epoch": 0.6781609195402298, + "grad_norm": 7.51953125, + "learning_rate": 9.321839080459771e-06, + "loss": 2.5512, + "mean_token_accuracy": 0.45736724008975316, + "step": 3658 + }, + { + "epoch": 0.6783463107156099, + "grad_norm": 8.6015625, + "learning_rate": 9.321653689284391e-06, + "loss": 2.4088, + "mean_token_accuracy": 0.48516607545952917, + "step": 3659 + }, + { + "epoch": 0.67853170189099, + "grad_norm": 5.21484375, + "learning_rate": 9.32146829810901e-06, + "loss": 3.0705, + "mean_token_accuracy": 0.40913770913770914, + "step": 3660 + }, + { + "epoch": 0.6787170930663701, + "grad_norm": 5.55078125, + "learning_rate": 9.32128290693363e-06, + "loss": 2.5839, + "mean_token_accuracy": 0.47993527508090617, + "step": 3661 + }, + { + "epoch": 0.6789024842417501, + "grad_norm": 5.76171875, + "learning_rate": 9.321097515758251e-06, + "loss": 2.7031, + "mean_token_accuracy": 0.44138892409073627, + "step": 3662 + }, + { + "epoch": 0.6790878754171301, + "grad_norm": 6.15234375, + "learning_rate": 9.32091212458287e-06, + "loss": 2.707, + "mean_token_accuracy": 0.47384799521244764, + "step": 3663 + }, + { + "epoch": 0.6792732665925102, + "grad_norm": 5.34765625, + "learning_rate": 9.32072673340749e-06, + "loss": 2.3473, + "mean_token_accuracy": 0.5033642249587407, + "step": 3664 + }, + { + "epoch": 0.6794586577678903, + "grad_norm": 4.86328125, + "learning_rate": 9.32054134223211e-06, + "loss": 2.9438, + "mean_token_accuracy": 0.45585274662065, + "step": 3665 + }, + { + "epoch": 0.6796440489432704, + "grad_norm": 6.765625, + "learning_rate": 9.320355951056731e-06, + "loss": 2.416, + "mean_token_accuracy": 0.4872469103339469, + "step": 3666 + }, + { + "epoch": 0.6798294401186503, + "grad_norm": 11.1796875, + "learning_rate": 9.32017055988135e-06, + "loss": 3.1018, + "mean_token_accuracy": 0.4056912616469403, + "step": 3667 + }, + { + "epoch": 0.6800148312940304, + "grad_norm": 7.0078125, + "learning_rate": 9.31998516870597e-06, + "loss": 2.7959, + "mean_token_accuracy": 0.45966535165283634, + "step": 3668 + }, + { + "epoch": 0.6802002224694105, + "grad_norm": 6.23828125, + "learning_rate": 9.319799777530591e-06, + "loss": 2.6445, + "mean_token_accuracy": 0.4571267922406522, + "step": 3669 + }, + { + "epoch": 0.6803856136447906, + "grad_norm": 6.6875, + "learning_rate": 9.31961438635521e-06, + "loss": 2.7879, + "mean_token_accuracy": 0.43941166638809964, + "step": 3670 + }, + { + "epoch": 0.6805710048201705, + "grad_norm": 6.3359375, + "learning_rate": 9.31942899517983e-06, + "loss": 2.9837, + "mean_token_accuracy": 0.41839007317849186, + "step": 3671 + }, + { + "epoch": 0.6807563959955506, + "grad_norm": 8.0859375, + "learning_rate": 9.31924360400445e-06, + "loss": 2.927, + "mean_token_accuracy": 0.4268956342967711, + "step": 3672 + }, + { + "epoch": 0.6809417871709307, + "grad_norm": 7.43359375, + "learning_rate": 9.31905821282907e-06, + "loss": 2.7616, + "mean_token_accuracy": 0.44950055493895674, + "step": 3673 + }, + { + "epoch": 0.6811271783463108, + "grad_norm": 4.94921875, + "learning_rate": 9.31887282165369e-06, + "loss": 3.2001, + "mean_token_accuracy": 0.4100163309744148, + "step": 3674 + }, + { + "epoch": 0.6813125695216907, + "grad_norm": 7.0234375, + "learning_rate": 9.31868743047831e-06, + "loss": 2.5693, + "mean_token_accuracy": 0.4498342874359747, + "step": 3675 + }, + { + "epoch": 0.6814979606970708, + "grad_norm": 5.60546875, + "learning_rate": 9.31850203930293e-06, + "loss": 2.8776, + "mean_token_accuracy": 0.44008662175168434, + "step": 3676 + }, + { + "epoch": 0.6816833518724509, + "grad_norm": 7.2421875, + "learning_rate": 9.31831664812755e-06, + "loss": 1.9909, + "mean_token_accuracy": 0.5634180610550518, + "step": 3677 + }, + { + "epoch": 0.681868743047831, + "grad_norm": 5.671875, + "learning_rate": 9.31813125695217e-06, + "loss": 2.9693, + "mean_token_accuracy": 0.4477510990869124, + "step": 3678 + }, + { + "epoch": 0.6820541342232109, + "grad_norm": 5.640625, + "learning_rate": 9.31794586577679e-06, + "loss": 2.9028, + "mean_token_accuracy": 0.4410116856090924, + "step": 3679 + }, + { + "epoch": 0.682239525398591, + "grad_norm": 6.74609375, + "learning_rate": 9.31776047460141e-06, + "loss": 3.2196, + "mean_token_accuracy": 0.4018082701768382, + "step": 3680 + }, + { + "epoch": 0.6824249165739711, + "grad_norm": 6.75, + "learning_rate": 9.317575083426029e-06, + "loss": 2.5036, + "mean_token_accuracy": 0.47283682338996447, + "step": 3681 + }, + { + "epoch": 0.6826103077493512, + "grad_norm": 6.3671875, + "learning_rate": 9.31738969225065e-06, + "loss": 2.2638, + "mean_token_accuracy": 0.5205215587494444, + "step": 3682 + }, + { + "epoch": 0.6827956989247311, + "grad_norm": 6.4296875, + "learning_rate": 9.31720430107527e-06, + "loss": 2.7664, + "mean_token_accuracy": 0.4615570599613153, + "step": 3683 + }, + { + "epoch": 0.6829810901001112, + "grad_norm": 7.7734375, + "learning_rate": 9.31701890989989e-06, + "loss": 2.6736, + "mean_token_accuracy": 0.4461025559612637, + "step": 3684 + }, + { + "epoch": 0.6831664812754913, + "grad_norm": 7.00390625, + "learning_rate": 9.316833518724509e-06, + "loss": 3.0958, + "mean_token_accuracy": 0.40964610084559977, + "step": 3685 + }, + { + "epoch": 0.6833518724508714, + "grad_norm": 6.24609375, + "learning_rate": 9.31664812754913e-06, + "loss": 2.3771, + "mean_token_accuracy": 0.49792919171676686, + "step": 3686 + }, + { + "epoch": 0.6835372636262514, + "grad_norm": 7.484375, + "learning_rate": 9.31646273637375e-06, + "loss": 2.2255, + "mean_token_accuracy": 0.5134324916407241, + "step": 3687 + }, + { + "epoch": 0.6837226548016314, + "grad_norm": 5.6796875, + "learning_rate": 9.316277345198369e-06, + "loss": 3.3625, + "mean_token_accuracy": 0.3951213763846335, + "step": 3688 + }, + { + "epoch": 0.6839080459770115, + "grad_norm": 6.546875, + "learning_rate": 9.31609195402299e-06, + "loss": 2.6319, + "mean_token_accuracy": 0.45058626465661644, + "step": 3689 + }, + { + "epoch": 0.6840934371523916, + "grad_norm": 6.29296875, + "learning_rate": 9.31590656284761e-06, + "loss": 3.2104, + "mean_token_accuracy": 0.42191029364311067, + "step": 3690 + }, + { + "epoch": 0.6842788283277716, + "grad_norm": 8.8359375, + "learning_rate": 9.31572117167223e-06, + "loss": 2.5383, + "mean_token_accuracy": 0.47611464968152867, + "step": 3691 + }, + { + "epoch": 0.6844642195031516, + "grad_norm": 10.796875, + "learning_rate": 9.315535780496849e-06, + "loss": 2.3693, + "mean_token_accuracy": 0.49222011385199244, + "step": 3692 + }, + { + "epoch": 0.6846496106785317, + "grad_norm": 6.03125, + "learning_rate": 9.31535038932147e-06, + "loss": 2.7409, + "mean_token_accuracy": 0.4267748001880583, + "step": 3693 + }, + { + "epoch": 0.6848350018539118, + "grad_norm": 7.640625, + "learning_rate": 9.315164998146088e-06, + "loss": 2.974, + "mean_token_accuracy": 0.4579218250104326, + "step": 3694 + }, + { + "epoch": 0.6850203930292919, + "grad_norm": 6.83984375, + "learning_rate": 9.314979606970709e-06, + "loss": 2.7889, + "mean_token_accuracy": 0.4270300179831235, + "step": 3695 + }, + { + "epoch": 0.6852057842046718, + "grad_norm": 9.203125, + "learning_rate": 9.31479421579533e-06, + "loss": 2.6061, + "mean_token_accuracy": 0.46867833433916717, + "step": 3696 + }, + { + "epoch": 0.6853911753800519, + "grad_norm": 7.875, + "learning_rate": 9.314608824619948e-06, + "loss": 2.4554, + "mean_token_accuracy": 0.49648886896757805, + "step": 3697 + }, + { + "epoch": 0.685576566555432, + "grad_norm": 9.3046875, + "learning_rate": 9.314423433444569e-06, + "loss": 3.0638, + "mean_token_accuracy": 0.4152993097204934, + "step": 3698 + }, + { + "epoch": 0.685761957730812, + "grad_norm": 7.10546875, + "learning_rate": 9.314238042269189e-06, + "loss": 3.1631, + "mean_token_accuracy": 0.3914081145584726, + "step": 3699 + }, + { + "epoch": 0.685947348906192, + "grad_norm": 5.57421875, + "learning_rate": 9.31405265109381e-06, + "loss": 2.5044, + "mean_token_accuracy": 0.4739136753378869, + "step": 3700 + }, + { + "epoch": 0.6861327400815721, + "grad_norm": 7.57421875, + "learning_rate": 9.313867259918428e-06, + "loss": 3.2191, + "mean_token_accuracy": 0.40308471454880296, + "step": 3701 + }, + { + "epoch": 0.6863181312569522, + "grad_norm": 7.0234375, + "learning_rate": 9.313681868743049e-06, + "loss": 2.4089, + "mean_token_accuracy": 0.4782207882000461, + "step": 3702 + }, + { + "epoch": 0.6865035224323323, + "grad_norm": 6.0546875, + "learning_rate": 9.313496477567668e-06, + "loss": 2.6639, + "mean_token_accuracy": 0.45160481444333, + "step": 3703 + }, + { + "epoch": 0.6866889136077122, + "grad_norm": 5.6484375, + "learning_rate": 9.313311086392288e-06, + "loss": 2.7807, + "mean_token_accuracy": 0.46417019158994777, + "step": 3704 + }, + { + "epoch": 0.6868743047830923, + "grad_norm": 5.578125, + "learning_rate": 9.313125695216909e-06, + "loss": 2.6647, + "mean_token_accuracy": 0.4640559756235188, + "step": 3705 + }, + { + "epoch": 0.6870596959584724, + "grad_norm": 4.796875, + "learning_rate": 9.312940304041529e-06, + "loss": 2.7506, + "mean_token_accuracy": 0.4486500794070937, + "step": 3706 + }, + { + "epoch": 0.6872450871338525, + "grad_norm": 5.99609375, + "learning_rate": 9.31275491286615e-06, + "loss": 2.5841, + "mean_token_accuracy": 0.4736072168718761, + "step": 3707 + }, + { + "epoch": 0.6874304783092324, + "grad_norm": 5.796875, + "learning_rate": 9.312569521690768e-06, + "loss": 3.3288, + "mean_token_accuracy": 0.3968909664093923, + "step": 3708 + }, + { + "epoch": 0.6876158694846125, + "grad_norm": 6.33984375, + "learning_rate": 9.312384130515389e-06, + "loss": 2.7566, + "mean_token_accuracy": 0.44651213407334855, + "step": 3709 + }, + { + "epoch": 0.6878012606599926, + "grad_norm": 6.4375, + "learning_rate": 9.312198739340008e-06, + "loss": 3.3169, + "mean_token_accuracy": 0.40241286863270775, + "step": 3710 + }, + { + "epoch": 0.6879866518353727, + "grad_norm": 5.07421875, + "learning_rate": 9.312013348164628e-06, + "loss": 2.6545, + "mean_token_accuracy": 0.48212258796821794, + "step": 3711 + }, + { + "epoch": 0.6881720430107527, + "grad_norm": 6.87109375, + "learning_rate": 9.311827956989249e-06, + "loss": 2.463, + "mean_token_accuracy": 0.47846695557963165, + "step": 3712 + }, + { + "epoch": 0.6883574341861327, + "grad_norm": 7.05078125, + "learning_rate": 9.311642565813867e-06, + "loss": 2.4716, + "mean_token_accuracy": 0.4891616011010437, + "step": 3713 + }, + { + "epoch": 0.6885428253615128, + "grad_norm": 7.87890625, + "learning_rate": 9.311457174638488e-06, + "loss": 3.0846, + "mean_token_accuracy": 0.41446028513238287, + "step": 3714 + }, + { + "epoch": 0.6887282165368929, + "grad_norm": 7.09765625, + "learning_rate": 9.311271783463108e-06, + "loss": 2.5664, + "mean_token_accuracy": 0.48602180598266703, + "step": 3715 + }, + { + "epoch": 0.688913607712273, + "grad_norm": 7.54296875, + "learning_rate": 9.311086392287729e-06, + "loss": 2.0557, + "mean_token_accuracy": 0.527019174898315, + "step": 3716 + }, + { + "epoch": 0.6890989988876529, + "grad_norm": 5.75390625, + "learning_rate": 9.310901001112348e-06, + "loss": 3.0696, + "mean_token_accuracy": 0.4094616639477977, + "step": 3717 + }, + { + "epoch": 0.689284390063033, + "grad_norm": 6.77734375, + "learning_rate": 9.310715609936968e-06, + "loss": 2.8445, + "mean_token_accuracy": 0.43648902821316615, + "step": 3718 + }, + { + "epoch": 0.6894697812384131, + "grad_norm": 6.6875, + "learning_rate": 9.310530218761587e-06, + "loss": 3.2139, + "mean_token_accuracy": 0.4118034948895483, + "step": 3719 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 9.75, + "learning_rate": 9.310344827586207e-06, + "loss": 2.3257, + "mean_token_accuracy": 0.49427772126144454, + "step": 3720 + }, + { + "epoch": 0.6898405635891731, + "grad_norm": 6.1328125, + "learning_rate": 9.310159436410828e-06, + "loss": 2.9005, + "mean_token_accuracy": 0.4299740644683216, + "step": 3721 + }, + { + "epoch": 0.6900259547645532, + "grad_norm": 5.66015625, + "learning_rate": 9.309974045235447e-06, + "loss": 3.058, + "mean_token_accuracy": 0.4259070871481858, + "step": 3722 + }, + { + "epoch": 0.6902113459399333, + "grad_norm": 7.125, + "learning_rate": 9.309788654060067e-06, + "loss": 2.4398, + "mean_token_accuracy": 0.47286742034943474, + "step": 3723 + }, + { + "epoch": 0.6903967371153134, + "grad_norm": 5.88671875, + "learning_rate": 9.309603262884688e-06, + "loss": 2.3354, + "mean_token_accuracy": 0.5140526837870605, + "step": 3724 + }, + { + "epoch": 0.6905821282906933, + "grad_norm": 6.12890625, + "learning_rate": 9.309417871709308e-06, + "loss": 2.544, + "mean_token_accuracy": 0.49151989562948467, + "step": 3725 + }, + { + "epoch": 0.6907675194660734, + "grad_norm": 6.08203125, + "learning_rate": 9.309232480533927e-06, + "loss": 2.8818, + "mean_token_accuracy": 0.43655339187985237, + "step": 3726 + }, + { + "epoch": 0.6909529106414535, + "grad_norm": 7.25, + "learning_rate": 9.309047089358548e-06, + "loss": 2.6161, + "mean_token_accuracy": 0.4860829225862569, + "step": 3727 + }, + { + "epoch": 0.6911383018168336, + "grad_norm": 6.18359375, + "learning_rate": 9.308861698183166e-06, + "loss": 2.4938, + "mean_token_accuracy": 0.48931829092654827, + "step": 3728 + }, + { + "epoch": 0.6913236929922135, + "grad_norm": 5.98046875, + "learning_rate": 9.308676307007787e-06, + "loss": 2.8052, + "mean_token_accuracy": 0.45561899818323387, + "step": 3729 + }, + { + "epoch": 0.6915090841675936, + "grad_norm": 10.5390625, + "learning_rate": 9.308490915832407e-06, + "loss": 2.4368, + "mean_token_accuracy": 0.49649520112153567, + "step": 3730 + }, + { + "epoch": 0.6916944753429737, + "grad_norm": 7.1328125, + "learning_rate": 9.308305524657028e-06, + "loss": 2.1498, + "mean_token_accuracy": 0.5312959153687923, + "step": 3731 + }, + { + "epoch": 0.6918798665183538, + "grad_norm": 5.2734375, + "learning_rate": 9.308120133481647e-06, + "loss": 2.9191, + "mean_token_accuracy": 0.4215308370044053, + "step": 3732 + }, + { + "epoch": 0.6920652576937337, + "grad_norm": 5.68359375, + "learning_rate": 9.307934742306267e-06, + "loss": 2.9049, + "mean_token_accuracy": 0.45426829268292684, + "step": 3733 + }, + { + "epoch": 0.6922506488691138, + "grad_norm": 6.23046875, + "learning_rate": 9.307749351130888e-06, + "loss": 2.7945, + "mean_token_accuracy": 0.4490644490644491, + "step": 3734 + }, + { + "epoch": 0.6924360400444939, + "grad_norm": 7.4375, + "learning_rate": 9.307563959955506e-06, + "loss": 2.7582, + "mean_token_accuracy": 0.44988569365900616, + "step": 3735 + }, + { + "epoch": 0.692621431219874, + "grad_norm": 10.484375, + "learning_rate": 9.307378568780127e-06, + "loss": 2.1572, + "mean_token_accuracy": 0.5419969894631209, + "step": 3736 + }, + { + "epoch": 0.692806822395254, + "grad_norm": 8.5, + "learning_rate": 9.307193177604746e-06, + "loss": 2.9078, + "mean_token_accuracy": 0.43260485950861954, + "step": 3737 + }, + { + "epoch": 0.692992213570634, + "grad_norm": 7.1171875, + "learning_rate": 9.307007786429366e-06, + "loss": 2.7767, + "mean_token_accuracy": 0.4397390653406053, + "step": 3738 + }, + { + "epoch": 0.6931776047460141, + "grad_norm": 8.0625, + "learning_rate": 9.306822395253987e-06, + "loss": 3.3308, + "mean_token_accuracy": 0.3990030117353827, + "step": 3739 + }, + { + "epoch": 0.6933629959213942, + "grad_norm": 8.171875, + "learning_rate": 9.306637004078607e-06, + "loss": 2.8292, + "mean_token_accuracy": 0.4578696343402226, + "step": 3740 + }, + { + "epoch": 0.6935483870967742, + "grad_norm": 5.1015625, + "learning_rate": 9.306451612903226e-06, + "loss": 2.9635, + "mean_token_accuracy": 0.4141553082405902, + "step": 3741 + }, + { + "epoch": 0.6937337782721542, + "grad_norm": 9.8984375, + "learning_rate": 9.306266221727846e-06, + "loss": 2.6526, + "mean_token_accuracy": 0.44365446142907927, + "step": 3742 + }, + { + "epoch": 0.6939191694475343, + "grad_norm": 6.1328125, + "learning_rate": 9.306080830552467e-06, + "loss": 3.2672, + "mean_token_accuracy": 0.4056435137895812, + "step": 3743 + }, + { + "epoch": 0.6941045606229144, + "grad_norm": 7.33984375, + "learning_rate": 9.305895439377086e-06, + "loss": 2.6097, + "mean_token_accuracy": 0.4550501156515035, + "step": 3744 + }, + { + "epoch": 0.6942899517982944, + "grad_norm": 9.5859375, + "learning_rate": 9.305710048201706e-06, + "loss": 2.9654, + "mean_token_accuracy": 0.42474916387959866, + "step": 3745 + }, + { + "epoch": 0.6944753429736744, + "grad_norm": 6.09375, + "learning_rate": 9.305524657026325e-06, + "loss": 3.088, + "mean_token_accuracy": 0.41641952008627664, + "step": 3746 + }, + { + "epoch": 0.6946607341490545, + "grad_norm": 5.1328125, + "learning_rate": 9.305339265850947e-06, + "loss": 3.0227, + "mean_token_accuracy": 0.4286058851905451, + "step": 3747 + }, + { + "epoch": 0.6948461253244346, + "grad_norm": 4.90625, + "learning_rate": 9.305153874675566e-06, + "loss": 3.0253, + "mean_token_accuracy": 0.43477609704957315, + "step": 3748 + }, + { + "epoch": 0.6950315164998146, + "grad_norm": 5.79296875, + "learning_rate": 9.304968483500186e-06, + "loss": 3.0712, + "mean_token_accuracy": 0.4207501512401694, + "step": 3749 + }, + { + "epoch": 0.6952169076751946, + "grad_norm": 6.63671875, + "learning_rate": 9.304783092324807e-06, + "loss": 2.9522, + "mean_token_accuracy": 0.42444910807974817, + "step": 3750 + }, + { + "epoch": 0.6954022988505747, + "grad_norm": 5.82421875, + "learning_rate": 9.304597701149426e-06, + "loss": 3.2942, + "mean_token_accuracy": 0.40330220598186495, + "step": 3751 + }, + { + "epoch": 0.6955876900259548, + "grad_norm": 5.1875, + "learning_rate": 9.304412309974046e-06, + "loss": 3.1218, + "mean_token_accuracy": 0.4129016670693404, + "step": 3752 + }, + { + "epoch": 0.6957730812013349, + "grad_norm": 5.9375, + "learning_rate": 9.304226918798665e-06, + "loss": 3.5558, + "mean_token_accuracy": 0.37955361723961006, + "step": 3753 + }, + { + "epoch": 0.6959584723767148, + "grad_norm": 6.16796875, + "learning_rate": 9.304041527623286e-06, + "loss": 2.867, + "mean_token_accuracy": 0.42647903503733486, + "step": 3754 + }, + { + "epoch": 0.6961438635520949, + "grad_norm": 5.96484375, + "learning_rate": 9.303856136447906e-06, + "loss": 3.0602, + "mean_token_accuracy": 0.41829268292682925, + "step": 3755 + }, + { + "epoch": 0.696329254727475, + "grad_norm": 6.09765625, + "learning_rate": 9.303670745272527e-06, + "loss": 3.001, + "mean_token_accuracy": 0.43562795585916975, + "step": 3756 + }, + { + "epoch": 0.6965146459028551, + "grad_norm": 5.86328125, + "learning_rate": 9.303485354097145e-06, + "loss": 2.9652, + "mean_token_accuracy": 0.4584017169549299, + "step": 3757 + }, + { + "epoch": 0.696700037078235, + "grad_norm": 5.88671875, + "learning_rate": 9.303299962921766e-06, + "loss": 3.0853, + "mean_token_accuracy": 0.41798523206751054, + "step": 3758 + }, + { + "epoch": 0.6968854282536151, + "grad_norm": 5.46484375, + "learning_rate": 9.303114571746386e-06, + "loss": 2.1892, + "mean_token_accuracy": 0.5260829774252593, + "step": 3759 + }, + { + "epoch": 0.6970708194289952, + "grad_norm": 6.81640625, + "learning_rate": 9.302929180571005e-06, + "loss": 2.3943, + "mean_token_accuracy": 0.5048891008824231, + "step": 3760 + }, + { + "epoch": 0.6972562106043753, + "grad_norm": 8.2578125, + "learning_rate": 9.302743789395626e-06, + "loss": 3.0268, + "mean_token_accuracy": 0.40733812949640286, + "step": 3761 + }, + { + "epoch": 0.6974416017797553, + "grad_norm": 5.42578125, + "learning_rate": 9.302558398220244e-06, + "loss": 3.0889, + "mean_token_accuracy": 0.4246820140859876, + "step": 3762 + }, + { + "epoch": 0.6976269929551353, + "grad_norm": 5.4765625, + "learning_rate": 9.302373007044867e-06, + "loss": 2.5036, + "mean_token_accuracy": 0.4927477840451249, + "step": 3763 + }, + { + "epoch": 0.6978123841305154, + "grad_norm": 8.0390625, + "learning_rate": 9.302187615869485e-06, + "loss": 2.7553, + "mean_token_accuracy": 0.45544554455445546, + "step": 3764 + }, + { + "epoch": 0.6979977753058955, + "grad_norm": 6.69140625, + "learning_rate": 9.302002224694106e-06, + "loss": 3.2619, + "mean_token_accuracy": 0.3847634322373697, + "step": 3765 + }, + { + "epoch": 0.6981831664812755, + "grad_norm": 8.046875, + "learning_rate": 9.301816833518725e-06, + "loss": 2.7133, + "mean_token_accuracy": 0.4454521556256572, + "step": 3766 + }, + { + "epoch": 0.6983685576566555, + "grad_norm": 7.96875, + "learning_rate": 9.301631442343345e-06, + "loss": 2.3942, + "mean_token_accuracy": 0.4794557097118463, + "step": 3767 + }, + { + "epoch": 0.6985539488320356, + "grad_norm": 8.5, + "learning_rate": 9.301446051167966e-06, + "loss": 2.5215, + "mean_token_accuracy": 0.46710526315789475, + "step": 3768 + }, + { + "epoch": 0.6987393400074157, + "grad_norm": 6.44140625, + "learning_rate": 9.301260659992584e-06, + "loss": 3.127, + "mean_token_accuracy": 0.44329799318439983, + "step": 3769 + }, + { + "epoch": 0.6989247311827957, + "grad_norm": 5.23046875, + "learning_rate": 9.301075268817205e-06, + "loss": 2.5763, + "mean_token_accuracy": 0.4542807992589652, + "step": 3770 + }, + { + "epoch": 0.6991101223581757, + "grad_norm": 6.578125, + "learning_rate": 9.300889877641825e-06, + "loss": 2.4968, + "mean_token_accuracy": 0.4939518643222347, + "step": 3771 + }, + { + "epoch": 0.6992955135335558, + "grad_norm": 6.35546875, + "learning_rate": 9.300704486466446e-06, + "loss": 2.8044, + "mean_token_accuracy": 0.4478944562899787, + "step": 3772 + }, + { + "epoch": 0.6994809047089359, + "grad_norm": 7.02734375, + "learning_rate": 9.300519095291065e-06, + "loss": 3.0017, + "mean_token_accuracy": 0.4395097332372026, + "step": 3773 + }, + { + "epoch": 0.699666295884316, + "grad_norm": 6.16796875, + "learning_rate": 9.300333704115685e-06, + "loss": 2.7093, + "mean_token_accuracy": 0.4448196677022829, + "step": 3774 + }, + { + "epoch": 0.6998516870596959, + "grad_norm": 5.9453125, + "learning_rate": 9.300148312940304e-06, + "loss": 3.1104, + "mean_token_accuracy": 0.4289184169167024, + "step": 3775 + }, + { + "epoch": 0.700037078235076, + "grad_norm": 9.3984375, + "learning_rate": 9.299962921764925e-06, + "loss": 2.483, + "mean_token_accuracy": 0.47798807905468993, + "step": 3776 + }, + { + "epoch": 0.7002224694104561, + "grad_norm": 7.31640625, + "learning_rate": 9.299777530589545e-06, + "loss": 2.7822, + "mean_token_accuracy": 0.4536370315944159, + "step": 3777 + }, + { + "epoch": 0.7004078605858362, + "grad_norm": 4.8984375, + "learning_rate": 9.299592139414164e-06, + "loss": 3.2263, + "mean_token_accuracy": 0.42537724418988876, + "step": 3778 + }, + { + "epoch": 0.7005932517612161, + "grad_norm": 7.25, + "learning_rate": 9.299406748238784e-06, + "loss": 3.2735, + "mean_token_accuracy": 0.3994807540354442, + "step": 3779 + }, + { + "epoch": 0.7007786429365962, + "grad_norm": 6.94921875, + "learning_rate": 9.299221357063405e-06, + "loss": 2.6055, + "mean_token_accuracy": 0.46255506607929514, + "step": 3780 + }, + { + "epoch": 0.7009640341119763, + "grad_norm": 5.84765625, + "learning_rate": 9.299035965888025e-06, + "loss": 2.7275, + "mean_token_accuracy": 0.4404685287681812, + "step": 3781 + }, + { + "epoch": 0.7011494252873564, + "grad_norm": 9.3046875, + "learning_rate": 9.298850574712644e-06, + "loss": 2.3886, + "mean_token_accuracy": 0.48823060043885896, + "step": 3782 + }, + { + "epoch": 0.7013348164627363, + "grad_norm": 9.34375, + "learning_rate": 9.298665183537265e-06, + "loss": 2.7863, + "mean_token_accuracy": 0.4582369942196532, + "step": 3783 + }, + { + "epoch": 0.7015202076381164, + "grad_norm": 6.734375, + "learning_rate": 9.298479792361883e-06, + "loss": 2.377, + "mean_token_accuracy": 0.514839409134029, + "step": 3784 + }, + { + "epoch": 0.7017055988134965, + "grad_norm": 6.5, + "learning_rate": 9.298294401186504e-06, + "loss": 2.5365, + "mean_token_accuracy": 0.4765061642242382, + "step": 3785 + }, + { + "epoch": 0.7018909899888766, + "grad_norm": 11.890625, + "learning_rate": 9.298109010011124e-06, + "loss": 3.0053, + "mean_token_accuracy": 0.4141820067409904, + "step": 3786 + }, + { + "epoch": 0.7020763811642566, + "grad_norm": 13.9140625, + "learning_rate": 9.297923618835745e-06, + "loss": 3.1148, + "mean_token_accuracy": 0.4159607668301382, + "step": 3787 + }, + { + "epoch": 0.7022617723396366, + "grad_norm": 8.9921875, + "learning_rate": 9.297738227660365e-06, + "loss": 2.724, + "mean_token_accuracy": 0.45027988146196907, + "step": 3788 + }, + { + "epoch": 0.7024471635150167, + "grad_norm": 7.4609375, + "learning_rate": 9.297552836484984e-06, + "loss": 2.5233, + "mean_token_accuracy": 0.479294745389166, + "step": 3789 + }, + { + "epoch": 0.7026325546903968, + "grad_norm": 10.8203125, + "learning_rate": 9.297367445309605e-06, + "loss": 2.9294, + "mean_token_accuracy": 0.42621160409556313, + "step": 3790 + }, + { + "epoch": 0.7028179458657768, + "grad_norm": 11.359375, + "learning_rate": 9.297182054134223e-06, + "loss": 2.9372, + "mean_token_accuracy": 0.4305572343768887, + "step": 3791 + }, + { + "epoch": 0.7030033370411568, + "grad_norm": 5.5, + "learning_rate": 9.296996662958844e-06, + "loss": 2.7663, + "mean_token_accuracy": 0.4328397673188789, + "step": 3792 + }, + { + "epoch": 0.7031887282165369, + "grad_norm": 7.640625, + "learning_rate": 9.296811271783464e-06, + "loss": 2.926, + "mean_token_accuracy": 0.4357455075279262, + "step": 3793 + }, + { + "epoch": 0.703374119391917, + "grad_norm": 11.1875, + "learning_rate": 9.296625880608083e-06, + "loss": 2.7833, + "mean_token_accuracy": 0.4382592286826277, + "step": 3794 + }, + { + "epoch": 0.703559510567297, + "grad_norm": 9.609375, + "learning_rate": 9.296440489432704e-06, + "loss": 2.4775, + "mean_token_accuracy": 0.5096978958504761, + "step": 3795 + }, + { + "epoch": 0.703744901742677, + "grad_norm": 7.75390625, + "learning_rate": 9.296255098257324e-06, + "loss": 2.7215, + "mean_token_accuracy": 0.46455589801064723, + "step": 3796 + }, + { + "epoch": 0.7039302929180571, + "grad_norm": 8.21875, + "learning_rate": 9.296069707081945e-06, + "loss": 2.905, + "mean_token_accuracy": 0.4429616322340562, + "step": 3797 + }, + { + "epoch": 0.7041156840934372, + "grad_norm": 9.171875, + "learning_rate": 9.295884315906563e-06, + "loss": 2.7986, + "mean_token_accuracy": 0.4398177445892925, + "step": 3798 + }, + { + "epoch": 0.7043010752688172, + "grad_norm": 7.015625, + "learning_rate": 9.295698924731184e-06, + "loss": 2.6195, + "mean_token_accuracy": 0.45451508740674756, + "step": 3799 + }, + { + "epoch": 0.7044864664441972, + "grad_norm": 6.9453125, + "learning_rate": 9.295513533555803e-06, + "loss": 3.3091, + "mean_token_accuracy": 0.40289952798381656, + "step": 3800 + }, + { + "epoch": 0.7046718576195773, + "grad_norm": 5.9296875, + "learning_rate": 9.295328142380423e-06, + "loss": 2.9801, + "mean_token_accuracy": 0.41936181719848564, + "step": 3801 + }, + { + "epoch": 0.7048572487949574, + "grad_norm": 7.79296875, + "learning_rate": 9.295142751205044e-06, + "loss": 2.9512, + "mean_token_accuracy": 0.431203007518797, + "step": 3802 + }, + { + "epoch": 0.7050426399703374, + "grad_norm": 7.09375, + "learning_rate": 9.294957360029664e-06, + "loss": 3.1607, + "mean_token_accuracy": 0.4059082338152106, + "step": 3803 + }, + { + "epoch": 0.7052280311457174, + "grad_norm": 6.7734375, + "learning_rate": 9.294771968854283e-06, + "loss": 3.0824, + "mean_token_accuracy": 0.4139832377591531, + "step": 3804 + }, + { + "epoch": 0.7054134223210975, + "grad_norm": 9.4921875, + "learning_rate": 9.294586577678904e-06, + "loss": 2.8677, + "mean_token_accuracy": 0.43493115229319806, + "step": 3805 + }, + { + "epoch": 0.7055988134964776, + "grad_norm": 8.75, + "learning_rate": 9.294401186503524e-06, + "loss": 3.4116, + "mean_token_accuracy": 0.4032863849765258, + "step": 3806 + }, + { + "epoch": 0.7057842046718577, + "grad_norm": 5.8046875, + "learning_rate": 9.294215795328143e-06, + "loss": 2.6219, + "mean_token_accuracy": 0.4749961053123539, + "step": 3807 + }, + { + "epoch": 0.7059695958472376, + "grad_norm": 6.96875, + "learning_rate": 9.294030404152763e-06, + "loss": 2.3789, + "mean_token_accuracy": 0.482174566316077, + "step": 3808 + }, + { + "epoch": 0.7061549870226177, + "grad_norm": 5.390625, + "learning_rate": 9.293845012977382e-06, + "loss": 3.0125, + "mean_token_accuracy": 0.4328379743182802, + "step": 3809 + }, + { + "epoch": 0.7063403781979978, + "grad_norm": 5.82421875, + "learning_rate": 9.293659621802003e-06, + "loss": 3.1878, + "mean_token_accuracy": 0.4178818520489622, + "step": 3810 + }, + { + "epoch": 0.7065257693733779, + "grad_norm": 6.16796875, + "learning_rate": 9.293474230626623e-06, + "loss": 3.0366, + "mean_token_accuracy": 0.41045465035771983, + "step": 3811 + }, + { + "epoch": 0.7067111605487579, + "grad_norm": 4.390625, + "learning_rate": 9.293288839451244e-06, + "loss": 2.9539, + "mean_token_accuracy": 0.42405305445733965, + "step": 3812 + }, + { + "epoch": 0.7068965517241379, + "grad_norm": 5.0546875, + "learning_rate": 9.293103448275862e-06, + "loss": 2.8925, + "mean_token_accuracy": 0.42827186382138893, + "step": 3813 + }, + { + "epoch": 0.707081942899518, + "grad_norm": 5.96875, + "learning_rate": 9.292918057100483e-06, + "loss": 3.8499, + "mean_token_accuracy": 0.35547122074636306, + "step": 3814 + }, + { + "epoch": 0.7072673340748981, + "grad_norm": 5.66015625, + "learning_rate": 9.292732665925103e-06, + "loss": 2.6861, + "mean_token_accuracy": 0.45479583283439107, + "step": 3815 + }, + { + "epoch": 0.7074527252502781, + "grad_norm": 7.9453125, + "learning_rate": 9.292547274749722e-06, + "loss": 2.6778, + "mean_token_accuracy": 0.4694493336685944, + "step": 3816 + }, + { + "epoch": 0.7076381164256581, + "grad_norm": 7.5390625, + "learning_rate": 9.292361883574343e-06, + "loss": 2.5824, + "mean_token_accuracy": 0.4599832515851178, + "step": 3817 + }, + { + "epoch": 0.7078235076010382, + "grad_norm": 6.671875, + "learning_rate": 9.292176492398961e-06, + "loss": 2.8358, + "mean_token_accuracy": 0.44285218999191034, + "step": 3818 + }, + { + "epoch": 0.7080088987764183, + "grad_norm": 5.57421875, + "learning_rate": 9.291991101223584e-06, + "loss": 2.9752, + "mean_token_accuracy": 0.43184402924451665, + "step": 3819 + }, + { + "epoch": 0.7081942899517983, + "grad_norm": 8.2734375, + "learning_rate": 9.291805710048202e-06, + "loss": 2.7237, + "mean_token_accuracy": 0.44651842233403666, + "step": 3820 + }, + { + "epoch": 0.7083796811271783, + "grad_norm": 9.6640625, + "learning_rate": 9.291620318872823e-06, + "loss": 3.382, + "mean_token_accuracy": 0.4221745542297895, + "step": 3821 + }, + { + "epoch": 0.7085650723025584, + "grad_norm": 6.81640625, + "learning_rate": 9.291434927697442e-06, + "loss": 3.2157, + "mean_token_accuracy": 0.4180610236220472, + "step": 3822 + }, + { + "epoch": 0.7087504634779385, + "grad_norm": 9.0546875, + "learning_rate": 9.291249536522062e-06, + "loss": 2.7699, + "mean_token_accuracy": 0.43652790484903936, + "step": 3823 + }, + { + "epoch": 0.7089358546533185, + "grad_norm": 7.609375, + "learning_rate": 9.291064145346683e-06, + "loss": 2.7713, + "mean_token_accuracy": 0.44808743169398907, + "step": 3824 + }, + { + "epoch": 0.7091212458286985, + "grad_norm": 7.0234375, + "learning_rate": 9.290878754171301e-06, + "loss": 3.01, + "mean_token_accuracy": 0.43073742246726393, + "step": 3825 + }, + { + "epoch": 0.7093066370040786, + "grad_norm": 9.2109375, + "learning_rate": 9.290693362995922e-06, + "loss": 2.5356, + "mean_token_accuracy": 0.48332617718620297, + "step": 3826 + }, + { + "epoch": 0.7094920281794587, + "grad_norm": 6.37890625, + "learning_rate": 9.290507971820542e-06, + "loss": 2.5125, + "mean_token_accuracy": 0.4970414201183432, + "step": 3827 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 6.9765625, + "learning_rate": 9.290322580645163e-06, + "loss": 2.6819, + "mean_token_accuracy": 0.46208977744247454, + "step": 3828 + }, + { + "epoch": 0.7098628105302187, + "grad_norm": 5.46875, + "learning_rate": 9.290137189469782e-06, + "loss": 2.7806, + "mean_token_accuracy": 0.4633494527869687, + "step": 3829 + }, + { + "epoch": 0.7100482017055988, + "grad_norm": 10.4296875, + "learning_rate": 9.289951798294402e-06, + "loss": 2.8999, + "mean_token_accuracy": 0.4438202247191011, + "step": 3830 + }, + { + "epoch": 0.7102335928809789, + "grad_norm": 5.4609375, + "learning_rate": 9.289766407119023e-06, + "loss": 3.1191, + "mean_token_accuracy": 0.424476736435142, + "step": 3831 + }, + { + "epoch": 0.710418984056359, + "grad_norm": 5.45703125, + "learning_rate": 9.289581015943642e-06, + "loss": 2.7851, + "mean_token_accuracy": 0.4512842588606687, + "step": 3832 + }, + { + "epoch": 0.7106043752317389, + "grad_norm": 6.0703125, + "learning_rate": 9.289395624768262e-06, + "loss": 2.1701, + "mean_token_accuracy": 0.5377923559612093, + "step": 3833 + }, + { + "epoch": 0.710789766407119, + "grad_norm": 6.67578125, + "learning_rate": 9.28921023359288e-06, + "loss": 2.8134, + "mean_token_accuracy": 0.4498439589835042, + "step": 3834 + }, + { + "epoch": 0.7109751575824991, + "grad_norm": 5.94140625, + "learning_rate": 9.289024842417503e-06, + "loss": 2.5722, + "mean_token_accuracy": 0.4611955951756686, + "step": 3835 + }, + { + "epoch": 0.7111605487578792, + "grad_norm": 7.8046875, + "learning_rate": 9.288839451242122e-06, + "loss": 2.4677, + "mean_token_accuracy": 0.47398629883354937, + "step": 3836 + }, + { + "epoch": 0.7113459399332592, + "grad_norm": 7.45703125, + "learning_rate": 9.288654060066742e-06, + "loss": 3.3085, + "mean_token_accuracy": 0.40940927712435576, + "step": 3837 + }, + { + "epoch": 0.7115313311086392, + "grad_norm": 9.2265625, + "learning_rate": 9.288468668891361e-06, + "loss": 3.2965, + "mean_token_accuracy": 0.4, + "step": 3838 + }, + { + "epoch": 0.7117167222840193, + "grad_norm": 9.1796875, + "learning_rate": 9.288283277715982e-06, + "loss": 2.437, + "mean_token_accuracy": 0.49358407079646016, + "step": 3839 + }, + { + "epoch": 0.7119021134593994, + "grad_norm": 6.3359375, + "learning_rate": 9.288097886540602e-06, + "loss": 3.2576, + "mean_token_accuracy": 0.384384834407313, + "step": 3840 + }, + { + "epoch": 0.7120875046347794, + "grad_norm": 6.6875, + "learning_rate": 9.287912495365221e-06, + "loss": 3.1605, + "mean_token_accuracy": 0.4100135317997294, + "step": 3841 + }, + { + "epoch": 0.7122728958101594, + "grad_norm": 8.34375, + "learning_rate": 9.287727104189841e-06, + "loss": 3.1242, + "mean_token_accuracy": 0.4092058674759737, + "step": 3842 + }, + { + "epoch": 0.7124582869855395, + "grad_norm": 8.0546875, + "learning_rate": 9.28754171301446e-06, + "loss": 2.8401, + "mean_token_accuracy": 0.4523113708820404, + "step": 3843 + }, + { + "epoch": 0.7126436781609196, + "grad_norm": 6.55859375, + "learning_rate": 9.287356321839082e-06, + "loss": 2.7778, + "mean_token_accuracy": 0.4600933047534989, + "step": 3844 + }, + { + "epoch": 0.7128290693362996, + "grad_norm": 6.4765625, + "learning_rate": 9.287170930663701e-06, + "loss": 2.5912, + "mean_token_accuracy": 0.46860986547085204, + "step": 3845 + }, + { + "epoch": 0.7130144605116796, + "grad_norm": 7.53515625, + "learning_rate": 9.286985539488322e-06, + "loss": 2.9713, + "mean_token_accuracy": 0.42661576938265106, + "step": 3846 + }, + { + "epoch": 0.7131998516870597, + "grad_norm": 5.55859375, + "learning_rate": 9.28680014831294e-06, + "loss": 2.3253, + "mean_token_accuracy": 0.5053908355795148, + "step": 3847 + }, + { + "epoch": 0.7133852428624398, + "grad_norm": 5.30078125, + "learning_rate": 9.286614757137561e-06, + "loss": 2.8588, + "mean_token_accuracy": 0.44854713868798374, + "step": 3848 + }, + { + "epoch": 0.7135706340378198, + "grad_norm": 7.5390625, + "learning_rate": 9.286429365962181e-06, + "loss": 3.4518, + "mean_token_accuracy": 0.3614772103239978, + "step": 3849 + }, + { + "epoch": 0.7137560252131998, + "grad_norm": 8.6953125, + "learning_rate": 9.2862439747868e-06, + "loss": 2.8411, + "mean_token_accuracy": 0.4513692162417375, + "step": 3850 + }, + { + "epoch": 0.7139414163885799, + "grad_norm": 5.234375, + "learning_rate": 9.28605858361142e-06, + "loss": 2.9319, + "mean_token_accuracy": 0.42496640286695536, + "step": 3851 + }, + { + "epoch": 0.71412680756396, + "grad_norm": 5.2734375, + "learning_rate": 9.285873192436041e-06, + "loss": 2.2036, + "mean_token_accuracy": 0.5415636789384676, + "step": 3852 + }, + { + "epoch": 0.71431219873934, + "grad_norm": 8.078125, + "learning_rate": 9.285687801260662e-06, + "loss": 2.8999, + "mean_token_accuracy": 0.44234296194406236, + "step": 3853 + }, + { + "epoch": 0.71449758991472, + "grad_norm": 5.58203125, + "learning_rate": 9.28550241008528e-06, + "loss": 2.7645, + "mean_token_accuracy": 0.44176706827309237, + "step": 3854 + }, + { + "epoch": 0.7146829810901001, + "grad_norm": 6.55859375, + "learning_rate": 9.285317018909901e-06, + "loss": 2.7683, + "mean_token_accuracy": 0.4656292491312887, + "step": 3855 + }, + { + "epoch": 0.7148683722654802, + "grad_norm": 5.40625, + "learning_rate": 9.28513162773452e-06, + "loss": 2.8877, + "mean_token_accuracy": 0.43333333333333335, + "step": 3856 + }, + { + "epoch": 0.7150537634408602, + "grad_norm": 5.3671875, + "learning_rate": 9.28494623655914e-06, + "loss": 3.0976, + "mean_token_accuracy": 0.4210226192739023, + "step": 3857 + }, + { + "epoch": 0.7152391546162402, + "grad_norm": 5.078125, + "learning_rate": 9.28476084538376e-06, + "loss": 2.9418, + "mean_token_accuracy": 0.4238421955403088, + "step": 3858 + }, + { + "epoch": 0.7154245457916203, + "grad_norm": 5.5703125, + "learning_rate": 9.28457545420838e-06, + "loss": 3.224, + "mean_token_accuracy": 0.41537043438184124, + "step": 3859 + }, + { + "epoch": 0.7156099369670004, + "grad_norm": 7.21484375, + "learning_rate": 9.284390063033e-06, + "loss": 2.3118, + "mean_token_accuracy": 0.4927465362673187, + "step": 3860 + }, + { + "epoch": 0.7157953281423804, + "grad_norm": 6.81640625, + "learning_rate": 9.28420467185762e-06, + "loss": 3.2337, + "mean_token_accuracy": 0.39963503649635035, + "step": 3861 + }, + { + "epoch": 0.7159807193177605, + "grad_norm": 5.6484375, + "learning_rate": 9.284019280682241e-06, + "loss": 3.0017, + "mean_token_accuracy": 0.41752975730406555, + "step": 3862 + }, + { + "epoch": 0.7161661104931405, + "grad_norm": 13.1953125, + "learning_rate": 9.28383388950686e-06, + "loss": 2.4888, + "mean_token_accuracy": 0.5243933918430562, + "step": 3863 + }, + { + "epoch": 0.7163515016685206, + "grad_norm": 6.21875, + "learning_rate": 9.28364849833148e-06, + "loss": 2.9504, + "mean_token_accuracy": 0.4504148053605616, + "step": 3864 + }, + { + "epoch": 0.7165368928439007, + "grad_norm": 5.4765625, + "learning_rate": 9.283463107156099e-06, + "loss": 3.0543, + "mean_token_accuracy": 0.4324521688330656, + "step": 3865 + }, + { + "epoch": 0.7167222840192807, + "grad_norm": 5.99609375, + "learning_rate": 9.28327771598072e-06, + "loss": 3.1813, + "mean_token_accuracy": 0.3868104860731841, + "step": 3866 + }, + { + "epoch": 0.7169076751946607, + "grad_norm": 7.8125, + "learning_rate": 9.28309232480534e-06, + "loss": 2.6599, + "mean_token_accuracy": 0.47185174785904815, + "step": 3867 + }, + { + "epoch": 0.7170930663700408, + "grad_norm": 5.69140625, + "learning_rate": 9.28290693362996e-06, + "loss": 2.1933, + "mean_token_accuracy": 0.5489057151747744, + "step": 3868 + }, + { + "epoch": 0.7172784575454209, + "grad_norm": 12.296875, + "learning_rate": 9.282721542454581e-06, + "loss": 2.3496, + "mean_token_accuracy": 0.509635477130253, + "step": 3869 + }, + { + "epoch": 0.7174638487208009, + "grad_norm": 7.18359375, + "learning_rate": 9.2825361512792e-06, + "loss": 3.1087, + "mean_token_accuracy": 0.40987944722140546, + "step": 3870 + }, + { + "epoch": 0.7176492398961809, + "grad_norm": 5.81640625, + "learning_rate": 9.28235076010382e-06, + "loss": 2.3658, + "mean_token_accuracy": 0.507732670533002, + "step": 3871 + }, + { + "epoch": 0.717834631071561, + "grad_norm": 7.91015625, + "learning_rate": 9.28216536892844e-06, + "loss": 3.4865, + "mean_token_accuracy": 0.38901449660859155, + "step": 3872 + }, + { + "epoch": 0.7180200222469411, + "grad_norm": 6.78515625, + "learning_rate": 9.28197997775306e-06, + "loss": 2.8264, + "mean_token_accuracy": 0.453559990145356, + "step": 3873 + }, + { + "epoch": 0.7182054134223211, + "grad_norm": 9.84375, + "learning_rate": 9.28179458657768e-06, + "loss": 2.7351, + "mean_token_accuracy": 0.4610535794687078, + "step": 3874 + }, + { + "epoch": 0.7183908045977011, + "grad_norm": 8.2578125, + "learning_rate": 9.281609195402299e-06, + "loss": 2.4851, + "mean_token_accuracy": 0.47504223621563507, + "step": 3875 + }, + { + "epoch": 0.7185761957730812, + "grad_norm": 8.109375, + "learning_rate": 9.28142380422692e-06, + "loss": 2.5902, + "mean_token_accuracy": 0.4603890611784607, + "step": 3876 + }, + { + "epoch": 0.7187615869484613, + "grad_norm": 10.796875, + "learning_rate": 9.28123841305154e-06, + "loss": 2.6374, + "mean_token_accuracy": 0.46257758305951074, + "step": 3877 + }, + { + "epoch": 0.7189469781238413, + "grad_norm": 6.4140625, + "learning_rate": 9.28105302187616e-06, + "loss": 2.6443, + "mean_token_accuracy": 0.49711732487748633, + "step": 3878 + }, + { + "epoch": 0.7191323692992213, + "grad_norm": 5.3984375, + "learning_rate": 9.28086763070078e-06, + "loss": 2.6785, + "mean_token_accuracy": 0.48767682660055417, + "step": 3879 + }, + { + "epoch": 0.7193177604746014, + "grad_norm": 5.8125, + "learning_rate": 9.2806822395254e-06, + "loss": 2.7603, + "mean_token_accuracy": 0.45819867921877194, + "step": 3880 + }, + { + "epoch": 0.7195031516499815, + "grad_norm": 7.64453125, + "learning_rate": 9.280496848350018e-06, + "loss": 2.7929, + "mean_token_accuracy": 0.45042286380869057, + "step": 3881 + }, + { + "epoch": 0.7196885428253615, + "grad_norm": 5.43359375, + "learning_rate": 9.280311457174639e-06, + "loss": 3.2275, + "mean_token_accuracy": 0.417011751538892, + "step": 3882 + }, + { + "epoch": 0.7198739340007415, + "grad_norm": 5.95703125, + "learning_rate": 9.28012606599926e-06, + "loss": 2.6321, + "mean_token_accuracy": 0.4506729773702115, + "step": 3883 + }, + { + "epoch": 0.7200593251761216, + "grad_norm": 7.32421875, + "learning_rate": 9.27994067482388e-06, + "loss": 3.0024, + "mean_token_accuracy": 0.4272026661112268, + "step": 3884 + }, + { + "epoch": 0.7202447163515017, + "grad_norm": 4.953125, + "learning_rate": 9.279755283648499e-06, + "loss": 3.21, + "mean_token_accuracy": 0.42186761229314423, + "step": 3885 + }, + { + "epoch": 0.7204301075268817, + "grad_norm": 8.796875, + "learning_rate": 9.27956989247312e-06, + "loss": 2.1783, + "mean_token_accuracy": 0.5267933087609866, + "step": 3886 + }, + { + "epoch": 0.7206154987022618, + "grad_norm": 7.46875, + "learning_rate": 9.27938450129774e-06, + "loss": 2.1276, + "mean_token_accuracy": 0.5390696260261477, + "step": 3887 + }, + { + "epoch": 0.7208008898776418, + "grad_norm": 5.8828125, + "learning_rate": 9.279199110122359e-06, + "loss": 3.1521, + "mean_token_accuracy": 0.4157950583598605, + "step": 3888 + }, + { + "epoch": 0.7209862810530219, + "grad_norm": 5.6015625, + "learning_rate": 9.279013718946979e-06, + "loss": 2.8625, + "mean_token_accuracy": 0.4286047053342651, + "step": 3889 + }, + { + "epoch": 0.721171672228402, + "grad_norm": 6.70703125, + "learning_rate": 9.278828327771598e-06, + "loss": 3.0683, + "mean_token_accuracy": 0.4291015107341638, + "step": 3890 + }, + { + "epoch": 0.721357063403782, + "grad_norm": 5.06640625, + "learning_rate": 9.278642936596218e-06, + "loss": 2.922, + "mean_token_accuracy": 0.4243416743089218, + "step": 3891 + }, + { + "epoch": 0.721542454579162, + "grad_norm": 6.21875, + "learning_rate": 9.278457545420839e-06, + "loss": 2.7284, + "mean_token_accuracy": 0.43791408420602695, + "step": 3892 + }, + { + "epoch": 0.7217278457545421, + "grad_norm": 6.51953125, + "learning_rate": 9.27827215424546e-06, + "loss": 3.149, + "mean_token_accuracy": 0.4257305194805195, + "step": 3893 + }, + { + "epoch": 0.7219132369299222, + "grad_norm": 5.5234375, + "learning_rate": 9.278086763070078e-06, + "loss": 2.558, + "mean_token_accuracy": 0.48107681910340166, + "step": 3894 + }, + { + "epoch": 0.7220986281053022, + "grad_norm": 6.19140625, + "learning_rate": 9.277901371894699e-06, + "loss": 2.7365, + "mean_token_accuracy": 0.46621621621621623, + "step": 3895 + }, + { + "epoch": 0.7222840192806822, + "grad_norm": 7.63671875, + "learning_rate": 9.277715980719319e-06, + "loss": 2.7897, + "mean_token_accuracy": 0.45120702267739576, + "step": 3896 + }, + { + "epoch": 0.7224694104560623, + "grad_norm": 7.1015625, + "learning_rate": 9.277530589543938e-06, + "loss": 2.8925, + "mean_token_accuracy": 0.4322405018345366, + "step": 3897 + }, + { + "epoch": 0.7226548016314424, + "grad_norm": 5.7109375, + "learning_rate": 9.277345198368558e-06, + "loss": 2.3219, + "mean_token_accuracy": 0.5061345158906134, + "step": 3898 + }, + { + "epoch": 0.7228401928068224, + "grad_norm": 7.9921875, + "learning_rate": 9.277159807193177e-06, + "loss": 3.243, + "mean_token_accuracy": 0.41237307258367806, + "step": 3899 + }, + { + "epoch": 0.7230255839822024, + "grad_norm": 7.29296875, + "learning_rate": 9.2769744160178e-06, + "loss": 2.445, + "mean_token_accuracy": 0.48598811112459056, + "step": 3900 + }, + { + "epoch": 0.7232109751575825, + "grad_norm": 5.3125, + "learning_rate": 9.276789024842418e-06, + "loss": 2.9842, + "mean_token_accuracy": 0.43967031482232133, + "step": 3901 + }, + { + "epoch": 0.7233963663329626, + "grad_norm": 8.046875, + "learning_rate": 9.276603633667039e-06, + "loss": 3.0261, + "mean_token_accuracy": 0.4347930992882133, + "step": 3902 + }, + { + "epoch": 0.7235817575083426, + "grad_norm": 5.58203125, + "learning_rate": 9.276418242491657e-06, + "loss": 2.539, + "mean_token_accuracy": 0.473052394647993, + "step": 3903 + }, + { + "epoch": 0.7237671486837226, + "grad_norm": 5.4296875, + "learning_rate": 9.276232851316278e-06, + "loss": 3.0278, + "mean_token_accuracy": 0.4216754540128881, + "step": 3904 + }, + { + "epoch": 0.7239525398591027, + "grad_norm": 6.11328125, + "learning_rate": 9.276047460140898e-06, + "loss": 3.0518, + "mean_token_accuracy": 0.4218213058419244, + "step": 3905 + }, + { + "epoch": 0.7241379310344828, + "grad_norm": 6.6953125, + "learning_rate": 9.275862068965517e-06, + "loss": 2.9169, + "mean_token_accuracy": 0.44007569386038686, + "step": 3906 + }, + { + "epoch": 0.7243233222098628, + "grad_norm": 7.86328125, + "learning_rate": 9.275676677790138e-06, + "loss": 2.9699, + "mean_token_accuracy": 0.43709173530247086, + "step": 3907 + }, + { + "epoch": 0.7245087133852428, + "grad_norm": 9.3671875, + "learning_rate": 9.275491286614758e-06, + "loss": 2.5806, + "mean_token_accuracy": 0.46972526006935184, + "step": 3908 + }, + { + "epoch": 0.7246941045606229, + "grad_norm": 8.46875, + "learning_rate": 9.275305895439379e-06, + "loss": 2.8157, + "mean_token_accuracy": 0.4554568891435119, + "step": 3909 + }, + { + "epoch": 0.724879495736003, + "grad_norm": 8.71875, + "learning_rate": 9.275120504263997e-06, + "loss": 2.4995, + "mean_token_accuracy": 0.4755811681969135, + "step": 3910 + }, + { + "epoch": 0.725064886911383, + "grad_norm": 5.46484375, + "learning_rate": 9.274935113088618e-06, + "loss": 2.6272, + "mean_token_accuracy": 0.4806324110671937, + "step": 3911 + }, + { + "epoch": 0.7252502780867631, + "grad_norm": 6.1796875, + "learning_rate": 9.274749721913238e-06, + "loss": 2.6474, + "mean_token_accuracy": 0.46493150684931506, + "step": 3912 + }, + { + "epoch": 0.7254356692621431, + "grad_norm": 5.62109375, + "learning_rate": 9.274564330737857e-06, + "loss": 2.5719, + "mean_token_accuracy": 0.4821671195652174, + "step": 3913 + }, + { + "epoch": 0.7256210604375232, + "grad_norm": 4.84375, + "learning_rate": 9.274378939562478e-06, + "loss": 2.7854, + "mean_token_accuracy": 0.46368325665690785, + "step": 3914 + }, + { + "epoch": 0.7258064516129032, + "grad_norm": 4.8671875, + "learning_rate": 9.274193548387097e-06, + "loss": 2.9438, + "mean_token_accuracy": 0.4207401270367302, + "step": 3915 + }, + { + "epoch": 0.7259918427882833, + "grad_norm": 11.0703125, + "learning_rate": 9.274008157211719e-06, + "loss": 1.9193, + "mean_token_accuracy": 0.580967454743298, + "step": 3916 + }, + { + "epoch": 0.7261772339636633, + "grad_norm": 6.09765625, + "learning_rate": 9.273822766036338e-06, + "loss": 3.1129, + "mean_token_accuracy": 0.4153397027600849, + "step": 3917 + }, + { + "epoch": 0.7263626251390434, + "grad_norm": 4.90625, + "learning_rate": 9.273637374860958e-06, + "loss": 3.0335, + "mean_token_accuracy": 0.41949556918882075, + "step": 3918 + }, + { + "epoch": 0.7265480163144235, + "grad_norm": 5.63671875, + "learning_rate": 9.273451983685577e-06, + "loss": 2.201, + "mean_token_accuracy": 0.5010224948875256, + "step": 3919 + }, + { + "epoch": 0.7267334074898035, + "grad_norm": 7.890625, + "learning_rate": 9.273266592510197e-06, + "loss": 2.8647, + "mean_token_accuracy": 0.4287729196050776, + "step": 3920 + }, + { + "epoch": 0.7269187986651835, + "grad_norm": 5.484375, + "learning_rate": 9.273081201334818e-06, + "loss": 3.0507, + "mean_token_accuracy": 0.4214655810510733, + "step": 3921 + }, + { + "epoch": 0.7271041898405636, + "grad_norm": 5.8125, + "learning_rate": 9.272895810159437e-06, + "loss": 2.4188, + "mean_token_accuracy": 0.4922339405560882, + "step": 3922 + }, + { + "epoch": 0.7272895810159437, + "grad_norm": 7.46484375, + "learning_rate": 9.272710418984057e-06, + "loss": 2.7332, + "mean_token_accuracy": 0.44634912326616066, + "step": 3923 + }, + { + "epoch": 0.7274749721913237, + "grad_norm": 5.8984375, + "learning_rate": 9.272525027808678e-06, + "loss": 2.873, + "mean_token_accuracy": 0.45482246952835187, + "step": 3924 + }, + { + "epoch": 0.7276603633667037, + "grad_norm": 5.8984375, + "learning_rate": 9.272339636633298e-06, + "loss": 3.0332, + "mean_token_accuracy": 0.41354611711485295, + "step": 3925 + }, + { + "epoch": 0.7278457545420838, + "grad_norm": 5.5625, + "learning_rate": 9.272154245457917e-06, + "loss": 2.5159, + "mean_token_accuracy": 0.4718878345843365, + "step": 3926 + }, + { + "epoch": 0.7280311457174639, + "grad_norm": 5.13671875, + "learning_rate": 9.271968854282537e-06, + "loss": 2.9081, + "mean_token_accuracy": 0.4388247168330156, + "step": 3927 + }, + { + "epoch": 0.7282165368928439, + "grad_norm": 7.15625, + "learning_rate": 9.271783463107156e-06, + "loss": 2.3861, + "mean_token_accuracy": 0.48508600043544525, + "step": 3928 + }, + { + "epoch": 0.7284019280682239, + "grad_norm": 5.6875, + "learning_rate": 9.271598071931777e-06, + "loss": 3.0912, + "mean_token_accuracy": 0.41877880184331795, + "step": 3929 + }, + { + "epoch": 0.728587319243604, + "grad_norm": 5.79296875, + "learning_rate": 9.271412680756397e-06, + "loss": 2.6762, + "mean_token_accuracy": 0.46303162486368593, + "step": 3930 + }, + { + "epoch": 0.7287727104189841, + "grad_norm": 7.54296875, + "learning_rate": 9.271227289581016e-06, + "loss": 2.7837, + "mean_token_accuracy": 0.44346617238183506, + "step": 3931 + }, + { + "epoch": 0.7289581015943641, + "grad_norm": 8.2109375, + "learning_rate": 9.271041898405636e-06, + "loss": 2.4994, + "mean_token_accuracy": 0.4934435261707989, + "step": 3932 + }, + { + "epoch": 0.7291434927697441, + "grad_norm": 6.4765625, + "learning_rate": 9.270856507230257e-06, + "loss": 2.6423, + "mean_token_accuracy": 0.48200403109703427, + "step": 3933 + }, + { + "epoch": 0.7293288839451242, + "grad_norm": 11.6640625, + "learning_rate": 9.270671116054877e-06, + "loss": 2.3081, + "mean_token_accuracy": 0.47759405703330937, + "step": 3934 + }, + { + "epoch": 0.7295142751205043, + "grad_norm": 7.94921875, + "learning_rate": 9.270485724879496e-06, + "loss": 2.7722, + "mean_token_accuracy": 0.4535424697594045, + "step": 3935 + }, + { + "epoch": 0.7296996662958843, + "grad_norm": 7.17578125, + "learning_rate": 9.270300333704117e-06, + "loss": 2.5789, + "mean_token_accuracy": 0.4907862407862408, + "step": 3936 + }, + { + "epoch": 0.7298850574712644, + "grad_norm": 8.6640625, + "learning_rate": 9.270114942528736e-06, + "loss": 2.7871, + "mean_token_accuracy": 0.45421519393097554, + "step": 3937 + }, + { + "epoch": 0.7300704486466444, + "grad_norm": 8.6171875, + "learning_rate": 9.269929551353356e-06, + "loss": 2.9112, + "mean_token_accuracy": 0.44194796817888626, + "step": 3938 + }, + { + "epoch": 0.7302558398220245, + "grad_norm": 6.140625, + "learning_rate": 9.269744160177976e-06, + "loss": 3.073, + "mean_token_accuracy": 0.4136210384356035, + "step": 3939 + }, + { + "epoch": 0.7304412309974045, + "grad_norm": 9.1328125, + "learning_rate": 9.269558769002597e-06, + "loss": 2.2808, + "mean_token_accuracy": 0.5192578930237605, + "step": 3940 + }, + { + "epoch": 0.7306266221727846, + "grad_norm": 5.49609375, + "learning_rate": 9.269373377827216e-06, + "loss": 3.019, + "mean_token_accuracy": 0.42737547090832984, + "step": 3941 + }, + { + "epoch": 0.7308120133481646, + "grad_norm": 9.234375, + "learning_rate": 9.269187986651836e-06, + "loss": 2.9588, + "mean_token_accuracy": 0.4270209157716224, + "step": 3942 + }, + { + "epoch": 0.7309974045235447, + "grad_norm": 5.640625, + "learning_rate": 9.269002595476457e-06, + "loss": 2.6464, + "mean_token_accuracy": 0.464031007751938, + "step": 3943 + }, + { + "epoch": 0.7311827956989247, + "grad_norm": 5.68359375, + "learning_rate": 9.268817204301076e-06, + "loss": 2.9776, + "mean_token_accuracy": 0.44745502413339183, + "step": 3944 + }, + { + "epoch": 0.7313681868743048, + "grad_norm": 7.96484375, + "learning_rate": 9.268631813125696e-06, + "loss": 2.3587, + "mean_token_accuracy": 0.4868006518196632, + "step": 3945 + }, + { + "epoch": 0.7315535780496848, + "grad_norm": 6.171875, + "learning_rate": 9.268446421950315e-06, + "loss": 3.0466, + "mean_token_accuracy": 0.4223285978999382, + "step": 3946 + }, + { + "epoch": 0.7317389692250649, + "grad_norm": 8.71875, + "learning_rate": 9.268261030774935e-06, + "loss": 3.4374, + "mean_token_accuracy": 0.3771629587374025, + "step": 3947 + }, + { + "epoch": 0.731924360400445, + "grad_norm": 6.46484375, + "learning_rate": 9.268075639599556e-06, + "loss": 2.8142, + "mean_token_accuracy": 0.43495196052973256, + "step": 3948 + }, + { + "epoch": 0.732109751575825, + "grad_norm": 6.921875, + "learning_rate": 9.267890248424176e-06, + "loss": 2.7643, + "mean_token_accuracy": 0.43204502017413465, + "step": 3949 + }, + { + "epoch": 0.732295142751205, + "grad_norm": 6.92578125, + "learning_rate": 9.267704857248797e-06, + "loss": 2.6094, + "mean_token_accuracy": 0.4581783500238436, + "step": 3950 + }, + { + "epoch": 0.7324805339265851, + "grad_norm": 8.875, + "learning_rate": 9.267519466073416e-06, + "loss": 2.612, + "mean_token_accuracy": 0.4797630799605133, + "step": 3951 + }, + { + "epoch": 0.7326659251019652, + "grad_norm": 6.375, + "learning_rate": 9.267334074898036e-06, + "loss": 2.8794, + "mean_token_accuracy": 0.4428646105593309, + "step": 3952 + }, + { + "epoch": 0.7328513162773452, + "grad_norm": 6.95703125, + "learning_rate": 9.267148683722655e-06, + "loss": 2.7098, + "mean_token_accuracy": 0.46692131398013753, + "step": 3953 + }, + { + "epoch": 0.7330367074527252, + "grad_norm": 8.5, + "learning_rate": 9.266963292547275e-06, + "loss": 2.6399, + "mean_token_accuracy": 0.4718202141428403, + "step": 3954 + }, + { + "epoch": 0.7332220986281053, + "grad_norm": 6.76953125, + "learning_rate": 9.266777901371894e-06, + "loss": 2.771, + "mean_token_accuracy": 0.4521072796934866, + "step": 3955 + }, + { + "epoch": 0.7334074898034854, + "grad_norm": 5.94921875, + "learning_rate": 9.266592510196516e-06, + "loss": 3.0159, + "mean_token_accuracy": 0.4052898142937535, + "step": 3956 + }, + { + "epoch": 0.7335928809788654, + "grad_norm": 7.39453125, + "learning_rate": 9.266407119021135e-06, + "loss": 2.3836, + "mean_token_accuracy": 0.5151616499442586, + "step": 3957 + }, + { + "epoch": 0.7337782721542454, + "grad_norm": 8.7265625, + "learning_rate": 9.266221727845756e-06, + "loss": 3.2985, + "mean_token_accuracy": 0.39811815517507326, + "step": 3958 + }, + { + "epoch": 0.7339636633296255, + "grad_norm": 5.12109375, + "learning_rate": 9.266036336670376e-06, + "loss": 3.0436, + "mean_token_accuracy": 0.4278858625162127, + "step": 3959 + }, + { + "epoch": 0.7341490545050056, + "grad_norm": 7.32421875, + "learning_rate": 9.265850945494995e-06, + "loss": 3.0014, + "mean_token_accuracy": 0.4146214777301649, + "step": 3960 + }, + { + "epoch": 0.7343344456803856, + "grad_norm": 7.359375, + "learning_rate": 9.265665554319615e-06, + "loss": 2.9896, + "mean_token_accuracy": 0.4469578783151326, + "step": 3961 + }, + { + "epoch": 0.7345198368557657, + "grad_norm": 5.09765625, + "learning_rate": 9.265480163144234e-06, + "loss": 2.8436, + "mean_token_accuracy": 0.44556256062075655, + "step": 3962 + }, + { + "epoch": 0.7347052280311457, + "grad_norm": 5.6953125, + "learning_rate": 9.265294771968855e-06, + "loss": 2.9729, + "mean_token_accuracy": 0.4538906934048863, + "step": 3963 + }, + { + "epoch": 0.7348906192065258, + "grad_norm": 6.21484375, + "learning_rate": 9.265109380793475e-06, + "loss": 2.7902, + "mean_token_accuracy": 0.4344487737795095, + "step": 3964 + }, + { + "epoch": 0.7350760103819058, + "grad_norm": 5.546875, + "learning_rate": 9.264923989618096e-06, + "loss": 2.4585, + "mean_token_accuracy": 0.48615253515125695, + "step": 3965 + }, + { + "epoch": 0.7352614015572859, + "grad_norm": 6.36328125, + "learning_rate": 9.264738598442715e-06, + "loss": 2.7901, + "mean_token_accuracy": 0.4600071736011478, + "step": 3966 + }, + { + "epoch": 0.7354467927326659, + "grad_norm": 5.7890625, + "learning_rate": 9.264553207267335e-06, + "loss": 2.5737, + "mean_token_accuracy": 0.48561987516827804, + "step": 3967 + }, + { + "epoch": 0.735632183908046, + "grad_norm": 6.7109375, + "learning_rate": 9.264367816091955e-06, + "loss": 2.6687, + "mean_token_accuracy": 0.463013306624696, + "step": 3968 + }, + { + "epoch": 0.735817575083426, + "grad_norm": 8.03125, + "learning_rate": 9.264182424916574e-06, + "loss": 2.9828, + "mean_token_accuracy": 0.4225283432890406, + "step": 3969 + }, + { + "epoch": 0.7360029662588061, + "grad_norm": 6.984375, + "learning_rate": 9.263997033741195e-06, + "loss": 3.1496, + "mean_token_accuracy": 0.4039049235993209, + "step": 3970 + }, + { + "epoch": 0.7361883574341861, + "grad_norm": 6.4765625, + "learning_rate": 9.263811642565814e-06, + "loss": 3.1434, + "mean_token_accuracy": 0.416026474412008, + "step": 3971 + }, + { + "epoch": 0.7363737486095662, + "grad_norm": 5.33203125, + "learning_rate": 9.263626251390434e-06, + "loss": 2.6145, + "mean_token_accuracy": 0.44239226033421286, + "step": 3972 + }, + { + "epoch": 0.7365591397849462, + "grad_norm": 5.59375, + "learning_rate": 9.263440860215055e-06, + "loss": 2.8441, + "mean_token_accuracy": 0.4342137145626363, + "step": 3973 + }, + { + "epoch": 0.7367445309603263, + "grad_norm": 6.73828125, + "learning_rate": 9.263255469039675e-06, + "loss": 2.8735, + "mean_token_accuracy": 0.43698378709085345, + "step": 3974 + }, + { + "epoch": 0.7369299221357063, + "grad_norm": 4.671875, + "learning_rate": 9.263070077864294e-06, + "loss": 2.7441, + "mean_token_accuracy": 0.45172155688622756, + "step": 3975 + }, + { + "epoch": 0.7371153133110864, + "grad_norm": 5.19921875, + "learning_rate": 9.262884686688914e-06, + "loss": 2.7072, + "mean_token_accuracy": 0.43791544801914384, + "step": 3976 + }, + { + "epoch": 0.7373007044864665, + "grad_norm": 5.77734375, + "learning_rate": 9.262699295513535e-06, + "loss": 2.6231, + "mean_token_accuracy": 0.47018794556059623, + "step": 3977 + }, + { + "epoch": 0.7374860956618465, + "grad_norm": 6.37890625, + "learning_rate": 9.262513904338154e-06, + "loss": 2.9971, + "mean_token_accuracy": 0.4163275686673449, + "step": 3978 + }, + { + "epoch": 0.7376714868372265, + "grad_norm": 8.53125, + "learning_rate": 9.262328513162774e-06, + "loss": 2.1871, + "mean_token_accuracy": 0.49588719153936545, + "step": 3979 + }, + { + "epoch": 0.7378568780126066, + "grad_norm": 5.8046875, + "learning_rate": 9.262143121987393e-06, + "loss": 3.2811, + "mean_token_accuracy": 0.40675324675324676, + "step": 3980 + }, + { + "epoch": 0.7380422691879867, + "grad_norm": 5.5859375, + "learning_rate": 9.261957730812015e-06, + "loss": 2.8501, + "mean_token_accuracy": 0.4452680344142952, + "step": 3981 + }, + { + "epoch": 0.7382276603633667, + "grad_norm": 5.8046875, + "learning_rate": 9.261772339636634e-06, + "loss": 2.8001, + "mean_token_accuracy": 0.4525697102241662, + "step": 3982 + }, + { + "epoch": 0.7384130515387467, + "grad_norm": 6.05078125, + "learning_rate": 9.261586948461254e-06, + "loss": 3.3582, + "mean_token_accuracy": 0.3859226087954989, + "step": 3983 + }, + { + "epoch": 0.7385984427141268, + "grad_norm": 6.234375, + "learning_rate": 9.261401557285873e-06, + "loss": 3.0026, + "mean_token_accuracy": 0.432781364019085, + "step": 3984 + }, + { + "epoch": 0.7387838338895069, + "grad_norm": 6.8515625, + "learning_rate": 9.261216166110494e-06, + "loss": 2.7704, + "mean_token_accuracy": 0.45082823459185195, + "step": 3985 + }, + { + "epoch": 0.7389692250648869, + "grad_norm": 6.04296875, + "learning_rate": 9.261030774935114e-06, + "loss": 2.7325, + "mean_token_accuracy": 0.4503319251659626, + "step": 3986 + }, + { + "epoch": 0.739154616240267, + "grad_norm": 9.2734375, + "learning_rate": 9.260845383759733e-06, + "loss": 2.5619, + "mean_token_accuracy": 0.47041593438781487, + "step": 3987 + }, + { + "epoch": 0.739340007415647, + "grad_norm": 6.09765625, + "learning_rate": 9.260659992584353e-06, + "loss": 2.197, + "mean_token_accuracy": 0.5171734234234234, + "step": 3988 + }, + { + "epoch": 0.7395253985910271, + "grad_norm": 5.44140625, + "learning_rate": 9.260474601408974e-06, + "loss": 2.8406, + "mean_token_accuracy": 0.45006105006105007, + "step": 3989 + }, + { + "epoch": 0.7397107897664071, + "grad_norm": 6.8359375, + "learning_rate": 9.260289210233594e-06, + "loss": 2.5518, + "mean_token_accuracy": 0.46858606807368547, + "step": 3990 + }, + { + "epoch": 0.7398961809417872, + "grad_norm": 5.69921875, + "learning_rate": 9.260103819058213e-06, + "loss": 2.5798, + "mean_token_accuracy": 0.4768961493582264, + "step": 3991 + }, + { + "epoch": 0.7400815721171672, + "grad_norm": 7.15234375, + "learning_rate": 9.259918427882834e-06, + "loss": 2.6615, + "mean_token_accuracy": 0.47436245252306025, + "step": 3992 + }, + { + "epoch": 0.7402669632925473, + "grad_norm": 5.3046875, + "learning_rate": 9.259733036707454e-06, + "loss": 2.9924, + "mean_token_accuracy": 0.41944739638682255, + "step": 3993 + }, + { + "epoch": 0.7404523544679273, + "grad_norm": 5.19921875, + "learning_rate": 9.259547645532073e-06, + "loss": 3.1124, + "mean_token_accuracy": 0.40746870797558704, + "step": 3994 + }, + { + "epoch": 0.7406377456433074, + "grad_norm": 7.765625, + "learning_rate": 9.259362254356694e-06, + "loss": 2.6614, + "mean_token_accuracy": 0.4447263501268576, + "step": 3995 + }, + { + "epoch": 0.7408231368186874, + "grad_norm": 6.56640625, + "learning_rate": 9.259176863181312e-06, + "loss": 2.7696, + "mean_token_accuracy": 0.44548369855692144, + "step": 3996 + }, + { + "epoch": 0.7410085279940675, + "grad_norm": 6.67578125, + "learning_rate": 9.258991472005934e-06, + "loss": 2.866, + "mean_token_accuracy": 0.444063245823389, + "step": 3997 + }, + { + "epoch": 0.7411939191694475, + "grad_norm": 9.203125, + "learning_rate": 9.258806080830553e-06, + "loss": 2.7762, + "mean_token_accuracy": 0.4304151144225652, + "step": 3998 + }, + { + "epoch": 0.7413793103448276, + "grad_norm": 8.2734375, + "learning_rate": 9.258620689655174e-06, + "loss": 2.6137, + "mean_token_accuracy": 0.4697869873931314, + "step": 3999 + }, + { + "epoch": 0.7415647015202076, + "grad_norm": 6.875, + "learning_rate": 9.258435298479793e-06, + "loss": 2.8907, + "mean_token_accuracy": 0.4239005000641108, + "step": 4000 + }, + { + "epoch": 0.7417500926955877, + "grad_norm": 10.171875, + "learning_rate": 9.258249907304413e-06, + "loss": 3.0554, + "mean_token_accuracy": 0.4281957633308985, + "step": 4001 + }, + { + "epoch": 0.7419354838709677, + "grad_norm": 9.3046875, + "learning_rate": 9.258064516129034e-06, + "loss": 3.4044, + "mean_token_accuracy": 0.40085942295887045, + "step": 4002 + }, + { + "epoch": 0.7421208750463478, + "grad_norm": 5.41015625, + "learning_rate": 9.257879124953652e-06, + "loss": 2.9375, + "mean_token_accuracy": 0.4327706635622817, + "step": 4003 + }, + { + "epoch": 0.7423062662217278, + "grad_norm": 5.734375, + "learning_rate": 9.257693733778273e-06, + "loss": 2.8585, + "mean_token_accuracy": 0.4462566844919786, + "step": 4004 + }, + { + "epoch": 0.7424916573971079, + "grad_norm": 7.7734375, + "learning_rate": 9.257508342602893e-06, + "loss": 2.667, + "mean_token_accuracy": 0.4480372776051963, + "step": 4005 + }, + { + "epoch": 0.742677048572488, + "grad_norm": 6.5859375, + "learning_rate": 9.257322951427514e-06, + "loss": 2.8993, + "mean_token_accuracy": 0.43162175902389427, + "step": 4006 + }, + { + "epoch": 0.742862439747868, + "grad_norm": 5.58984375, + "learning_rate": 9.257137560252133e-06, + "loss": 3.0923, + "mean_token_accuracy": 0.42166563595135026, + "step": 4007 + }, + { + "epoch": 0.743047830923248, + "grad_norm": 7.46484375, + "learning_rate": 9.256952169076753e-06, + "loss": 3.0216, + "mean_token_accuracy": 0.42411232304478996, + "step": 4008 + }, + { + "epoch": 0.7432332220986281, + "grad_norm": 6.97265625, + "learning_rate": 9.256766777901372e-06, + "loss": 2.441, + "mean_token_accuracy": 0.5162257131045886, + "step": 4009 + }, + { + "epoch": 0.7434186132740082, + "grad_norm": 5.59765625, + "learning_rate": 9.256581386725992e-06, + "loss": 2.7914, + "mean_token_accuracy": 0.45378044115772026, + "step": 4010 + }, + { + "epoch": 0.7436040044493882, + "grad_norm": 7.3984375, + "learning_rate": 9.256395995550613e-06, + "loss": 2.3503, + "mean_token_accuracy": 0.5054146856840993, + "step": 4011 + }, + { + "epoch": 0.7437893956247683, + "grad_norm": 11.78125, + "learning_rate": 9.256210604375232e-06, + "loss": 2.4851, + "mean_token_accuracy": 0.4681976674281592, + "step": 4012 + }, + { + "epoch": 0.7439747868001483, + "grad_norm": 5.75, + "learning_rate": 9.256025213199852e-06, + "loss": 3.202, + "mean_token_accuracy": 0.3881325455946571, + "step": 4013 + }, + { + "epoch": 0.7441601779755284, + "grad_norm": 7.36328125, + "learning_rate": 9.255839822024473e-06, + "loss": 2.4837, + "mean_token_accuracy": 0.5, + "step": 4014 + }, + { + "epoch": 0.7443455691509084, + "grad_norm": 7.609375, + "learning_rate": 9.255654430849093e-06, + "loss": 2.8105, + "mean_token_accuracy": 0.43729754743174454, + "step": 4015 + }, + { + "epoch": 0.7445309603262885, + "grad_norm": 9.671875, + "learning_rate": 9.255469039673712e-06, + "loss": 2.4517, + "mean_token_accuracy": 0.4835727492533068, + "step": 4016 + }, + { + "epoch": 0.7447163515016685, + "grad_norm": 6.0859375, + "learning_rate": 9.255283648498332e-06, + "loss": 2.7048, + "mean_token_accuracy": 0.46290762634792776, + "step": 4017 + }, + { + "epoch": 0.7449017426770486, + "grad_norm": 8.4453125, + "learning_rate": 9.255098257322951e-06, + "loss": 2.7873, + "mean_token_accuracy": 0.43796042178246425, + "step": 4018 + }, + { + "epoch": 0.7450871338524286, + "grad_norm": 9.2890625, + "learning_rate": 9.254912866147572e-06, + "loss": 2.8432, + "mean_token_accuracy": 0.45245460659045056, + "step": 4019 + }, + { + "epoch": 0.7452725250278087, + "grad_norm": 6.61328125, + "learning_rate": 9.254727474972192e-06, + "loss": 3.0157, + "mean_token_accuracy": 0.4337513969949087, + "step": 4020 + }, + { + "epoch": 0.7454579162031887, + "grad_norm": 8.1796875, + "learning_rate": 9.254542083796813e-06, + "loss": 2.4191, + "mean_token_accuracy": 0.4832964601769911, + "step": 4021 + }, + { + "epoch": 0.7456433073785688, + "grad_norm": 7.21484375, + "learning_rate": 9.254356692621432e-06, + "loss": 2.5919, + "mean_token_accuracy": 0.4706507868991918, + "step": 4022 + }, + { + "epoch": 0.7458286985539488, + "grad_norm": 6.3125, + "learning_rate": 9.254171301446052e-06, + "loss": 2.9334, + "mean_token_accuracy": 0.4576191225035383, + "step": 4023 + }, + { + "epoch": 0.7460140897293289, + "grad_norm": 6.734375, + "learning_rate": 9.253985910270673e-06, + "loss": 3.5581, + "mean_token_accuracy": 0.3625686199412741, + "step": 4024 + }, + { + "epoch": 0.7461994809047089, + "grad_norm": 8.453125, + "learning_rate": 9.253800519095291e-06, + "loss": 2.3195, + "mean_token_accuracy": 0.5072563135641641, + "step": 4025 + }, + { + "epoch": 0.746384872080089, + "grad_norm": 7.3203125, + "learning_rate": 9.253615127919912e-06, + "loss": 2.86, + "mean_token_accuracy": 0.443035745729882, + "step": 4026 + }, + { + "epoch": 0.746570263255469, + "grad_norm": 5.99609375, + "learning_rate": 9.25342973674453e-06, + "loss": 2.8685, + "mean_token_accuracy": 0.4346381093057607, + "step": 4027 + }, + { + "epoch": 0.7467556544308491, + "grad_norm": 7.05078125, + "learning_rate": 9.253244345569151e-06, + "loss": 2.5989, + "mean_token_accuracy": 0.4639344262295082, + "step": 4028 + }, + { + "epoch": 0.7469410456062291, + "grad_norm": 6.8359375, + "learning_rate": 9.253058954393772e-06, + "loss": 3.0865, + "mean_token_accuracy": 0.41778762462414937, + "step": 4029 + }, + { + "epoch": 0.7471264367816092, + "grad_norm": 11.7890625, + "learning_rate": 9.252873563218392e-06, + "loss": 2.4202, + "mean_token_accuracy": 0.4833012202954399, + "step": 4030 + }, + { + "epoch": 0.7473118279569892, + "grad_norm": 5.91796875, + "learning_rate": 9.252688172043013e-06, + "loss": 2.8348, + "mean_token_accuracy": 0.4420213389611912, + "step": 4031 + }, + { + "epoch": 0.7474972191323693, + "grad_norm": 8.5, + "learning_rate": 9.252502780867631e-06, + "loss": 3.1172, + "mean_token_accuracy": 0.4095894703854591, + "step": 4032 + }, + { + "epoch": 0.7476826103077493, + "grad_norm": 7.8125, + "learning_rate": 9.252317389692252e-06, + "loss": 3.0157, + "mean_token_accuracy": 0.43785725951331045, + "step": 4033 + }, + { + "epoch": 0.7478680014831294, + "grad_norm": 5.390625, + "learning_rate": 9.25213199851687e-06, + "loss": 2.8455, + "mean_token_accuracy": 0.44370701098105464, + "step": 4034 + }, + { + "epoch": 0.7480533926585095, + "grad_norm": 6.7890625, + "learning_rate": 9.251946607341491e-06, + "loss": 2.8581, + "mean_token_accuracy": 0.4390838867055157, + "step": 4035 + }, + { + "epoch": 0.7482387838338895, + "grad_norm": 8.0390625, + "learning_rate": 9.25176121616611e-06, + "loss": 3.1941, + "mean_token_accuracy": 0.4116521114965905, + "step": 4036 + }, + { + "epoch": 0.7484241750092696, + "grad_norm": 7.90234375, + "learning_rate": 9.251575824990732e-06, + "loss": 2.5129, + "mean_token_accuracy": 0.4682274247491639, + "step": 4037 + }, + { + "epoch": 0.7486095661846496, + "grad_norm": 7.92578125, + "learning_rate": 9.251390433815351e-06, + "loss": 2.7454, + "mean_token_accuracy": 0.43207514350321796, + "step": 4038 + }, + { + "epoch": 0.7487949573600297, + "grad_norm": 7.74609375, + "learning_rate": 9.251205042639971e-06, + "loss": 2.7854, + "mean_token_accuracy": 0.44140323824209715, + "step": 4039 + }, + { + "epoch": 0.7489803485354097, + "grad_norm": 7.953125, + "learning_rate": 9.251019651464592e-06, + "loss": 2.7242, + "mean_token_accuracy": 0.44894155238982825, + "step": 4040 + }, + { + "epoch": 0.7491657397107898, + "grad_norm": 5.1015625, + "learning_rate": 9.25083426028921e-06, + "loss": 2.8172, + "mean_token_accuracy": 0.42783505154639173, + "step": 4041 + }, + { + "epoch": 0.7493511308861698, + "grad_norm": 6.78125, + "learning_rate": 9.250648869113831e-06, + "loss": 2.8606, + "mean_token_accuracy": 0.4449648711943794, + "step": 4042 + }, + { + "epoch": 0.7495365220615499, + "grad_norm": 10.40625, + "learning_rate": 9.25046347793845e-06, + "loss": 1.8386, + "mean_token_accuracy": 0.5713257225136275, + "step": 4043 + }, + { + "epoch": 0.7497219132369299, + "grad_norm": 5.234375, + "learning_rate": 9.25027808676307e-06, + "loss": 3.2198, + "mean_token_accuracy": 0.4348412406522006, + "step": 4044 + }, + { + "epoch": 0.74990730441231, + "grad_norm": 6.68359375, + "learning_rate": 9.250092695587691e-06, + "loss": 3.0722, + "mean_token_accuracy": 0.41564605021432943, + "step": 4045 + }, + { + "epoch": 0.75009269558769, + "grad_norm": 9.515625, + "learning_rate": 9.249907304412311e-06, + "loss": 3.2434, + "mean_token_accuracy": 0.4159209296113263, + "step": 4046 + }, + { + "epoch": 0.7502780867630701, + "grad_norm": 9.4140625, + "learning_rate": 9.24972191323693e-06, + "loss": 3.0931, + "mean_token_accuracy": 0.4188545609964587, + "step": 4047 + }, + { + "epoch": 0.7504634779384501, + "grad_norm": 6.98828125, + "learning_rate": 9.24953652206155e-06, + "loss": 2.9671, + "mean_token_accuracy": 0.44380995888533575, + "step": 4048 + }, + { + "epoch": 0.7506488691138302, + "grad_norm": 7.59375, + "learning_rate": 9.249351130886171e-06, + "loss": 2.9613, + "mean_token_accuracy": 0.42479476833171004, + "step": 4049 + }, + { + "epoch": 0.7508342602892102, + "grad_norm": 7.47265625, + "learning_rate": 9.24916573971079e-06, + "loss": 2.692, + "mean_token_accuracy": 0.4621295279912184, + "step": 4050 + }, + { + "epoch": 0.7510196514645903, + "grad_norm": 7.5625, + "learning_rate": 9.24898034853541e-06, + "loss": 2.9086, + "mean_token_accuracy": 0.43951965065502185, + "step": 4051 + }, + { + "epoch": 0.7512050426399703, + "grad_norm": 7.203125, + "learning_rate": 9.24879495736003e-06, + "loss": 3.0167, + "mean_token_accuracy": 0.4137583469824871, + "step": 4052 + }, + { + "epoch": 0.7513904338153504, + "grad_norm": 7.21484375, + "learning_rate": 9.248609566184652e-06, + "loss": 2.6742, + "mean_token_accuracy": 0.45527648168370893, + "step": 4053 + }, + { + "epoch": 0.7515758249907304, + "grad_norm": 10.125, + "learning_rate": 9.24842417500927e-06, + "loss": 2.7397, + "mean_token_accuracy": 0.4435548438751001, + "step": 4054 + }, + { + "epoch": 0.7517612161661105, + "grad_norm": 8.09375, + "learning_rate": 9.24823878383389e-06, + "loss": 2.4387, + "mean_token_accuracy": 0.48414350434676134, + "step": 4055 + }, + { + "epoch": 0.7519466073414905, + "grad_norm": 6.52734375, + "learning_rate": 9.24805339265851e-06, + "loss": 3.4818, + "mean_token_accuracy": 0.4006942722539053, + "step": 4056 + }, + { + "epoch": 0.7521319985168706, + "grad_norm": 6.36328125, + "learning_rate": 9.24786800148313e-06, + "loss": 2.9611, + "mean_token_accuracy": 0.4347442680776014, + "step": 4057 + }, + { + "epoch": 0.7523173896922507, + "grad_norm": 6.1328125, + "learning_rate": 9.24768261030775e-06, + "loss": 2.4162, + "mean_token_accuracy": 0.48296957671957674, + "step": 4058 + }, + { + "epoch": 0.7525027808676307, + "grad_norm": 5.9375, + "learning_rate": 9.24749721913237e-06, + "loss": 3.4389, + "mean_token_accuracy": 0.40722114764667955, + "step": 4059 + }, + { + "epoch": 0.7526881720430108, + "grad_norm": 6.28125, + "learning_rate": 9.24731182795699e-06, + "loss": 2.2765, + "mean_token_accuracy": 0.5191870739712194, + "step": 4060 + }, + { + "epoch": 0.7528735632183908, + "grad_norm": 6.0390625, + "learning_rate": 9.24712643678161e-06, + "loss": 2.9485, + "mean_token_accuracy": 0.4388227927363807, + "step": 4061 + }, + { + "epoch": 0.7530589543937709, + "grad_norm": 5.42578125, + "learning_rate": 9.246941045606231e-06, + "loss": 2.8608, + "mean_token_accuracy": 0.4466173962478681, + "step": 4062 + }, + { + "epoch": 0.7532443455691509, + "grad_norm": 5.73046875, + "learning_rate": 9.24675565443085e-06, + "loss": 2.5679, + "mean_token_accuracy": 0.48073022312373226, + "step": 4063 + }, + { + "epoch": 0.753429736744531, + "grad_norm": 8.84375, + "learning_rate": 9.24657026325547e-06, + "loss": 2.5652, + "mean_token_accuracy": 0.4735376044568245, + "step": 4064 + }, + { + "epoch": 0.753615127919911, + "grad_norm": 6.10546875, + "learning_rate": 9.246384872080089e-06, + "loss": 3.3759, + "mean_token_accuracy": 0.39525230250356724, + "step": 4065 + }, + { + "epoch": 0.7538005190952911, + "grad_norm": 7.8046875, + "learning_rate": 9.24619948090471e-06, + "loss": 2.9538, + "mean_token_accuracy": 0.41501494851068543, + "step": 4066 + }, + { + "epoch": 0.7539859102706711, + "grad_norm": 6.828125, + "learning_rate": 9.24601408972933e-06, + "loss": 2.7162, + "mean_token_accuracy": 0.44494777903611427, + "step": 4067 + }, + { + "epoch": 0.7541713014460512, + "grad_norm": 7.671875, + "learning_rate": 9.245828698553949e-06, + "loss": 2.5477, + "mean_token_accuracy": 0.47215865751334857, + "step": 4068 + }, + { + "epoch": 0.7543566926214312, + "grad_norm": 6.35546875, + "learning_rate": 9.245643307378571e-06, + "loss": 2.6635, + "mean_token_accuracy": 0.4515411973918198, + "step": 4069 + }, + { + "epoch": 0.7545420837968113, + "grad_norm": 6.12890625, + "learning_rate": 9.24545791620319e-06, + "loss": 2.3599, + "mean_token_accuracy": 0.4759839893262175, + "step": 4070 + }, + { + "epoch": 0.7547274749721913, + "grad_norm": 5.16796875, + "learning_rate": 9.24527252502781e-06, + "loss": 3.1606, + "mean_token_accuracy": 0.4105604793409063, + "step": 4071 + }, + { + "epoch": 0.7549128661475714, + "grad_norm": 5.8671875, + "learning_rate": 9.245087133852429e-06, + "loss": 2.9074, + "mean_token_accuracy": 0.43739304050199657, + "step": 4072 + }, + { + "epoch": 0.7550982573229514, + "grad_norm": 6.62109375, + "learning_rate": 9.24490174267705e-06, + "loss": 2.6379, + "mean_token_accuracy": 0.44672545901402344, + "step": 4073 + }, + { + "epoch": 0.7552836484983315, + "grad_norm": 6.48046875, + "learning_rate": 9.244716351501668e-06, + "loss": 2.5613, + "mean_token_accuracy": 0.47007863521258164, + "step": 4074 + }, + { + "epoch": 0.7554690396737115, + "grad_norm": 6.41015625, + "learning_rate": 9.244530960326289e-06, + "loss": 2.751, + "mean_token_accuracy": 0.4574554294975689, + "step": 4075 + }, + { + "epoch": 0.7556544308490916, + "grad_norm": 6.6875, + "learning_rate": 9.24434556915091e-06, + "loss": 3.018, + "mean_token_accuracy": 0.41948938321536905, + "step": 4076 + }, + { + "epoch": 0.7558398220244716, + "grad_norm": 5.5703125, + "learning_rate": 9.24416017797553e-06, + "loss": 2.8349, + "mean_token_accuracy": 0.43303638834365094, + "step": 4077 + }, + { + "epoch": 0.7560252131998517, + "grad_norm": 7.48828125, + "learning_rate": 9.24397478680015e-06, + "loss": 2.5955, + "mean_token_accuracy": 0.46965012205044754, + "step": 4078 + }, + { + "epoch": 0.7562106043752317, + "grad_norm": 5.27734375, + "learning_rate": 9.243789395624769e-06, + "loss": 2.9772, + "mean_token_accuracy": 0.435501257635645, + "step": 4079 + }, + { + "epoch": 0.7563959955506118, + "grad_norm": 7.5390625, + "learning_rate": 9.24360400444939e-06, + "loss": 2.6255, + "mean_token_accuracy": 0.45963926670609107, + "step": 4080 + }, + { + "epoch": 0.7565813867259918, + "grad_norm": 8.3828125, + "learning_rate": 9.243418613274008e-06, + "loss": 2.2806, + "mean_token_accuracy": 0.524390243902439, + "step": 4081 + }, + { + "epoch": 0.7567667779013719, + "grad_norm": 5.97265625, + "learning_rate": 9.243233222098629e-06, + "loss": 2.6644, + "mean_token_accuracy": 0.4671719867620369, + "step": 4082 + }, + { + "epoch": 0.756952169076752, + "grad_norm": 6.71875, + "learning_rate": 9.24304783092325e-06, + "loss": 2.9834, + "mean_token_accuracy": 0.42625, + "step": 4083 + }, + { + "epoch": 0.757137560252132, + "grad_norm": 5.94140625, + "learning_rate": 9.242862439747868e-06, + "loss": 3.0921, + "mean_token_accuracy": 0.4190197123068727, + "step": 4084 + }, + { + "epoch": 0.757322951427512, + "grad_norm": 6.18359375, + "learning_rate": 9.242677048572489e-06, + "loss": 2.4582, + "mean_token_accuracy": 0.485120718697361, + "step": 4085 + }, + { + "epoch": 0.7575083426028921, + "grad_norm": 5.72265625, + "learning_rate": 9.242491657397109e-06, + "loss": 3.0648, + "mean_token_accuracy": 0.42507645259938837, + "step": 4086 + }, + { + "epoch": 0.7576937337782722, + "grad_norm": 5.2734375, + "learning_rate": 9.24230626622173e-06, + "loss": 2.83, + "mean_token_accuracy": 0.45915450579208666, + "step": 4087 + }, + { + "epoch": 0.7578791249536522, + "grad_norm": 5.53515625, + "learning_rate": 9.242120875046348e-06, + "loss": 2.8726, + "mean_token_accuracy": 0.45125827814569536, + "step": 4088 + }, + { + "epoch": 0.7580645161290323, + "grad_norm": 5.2109375, + "learning_rate": 9.241935483870969e-06, + "loss": 2.5661, + "mean_token_accuracy": 0.47138519037608034, + "step": 4089 + }, + { + "epoch": 0.7582499073044123, + "grad_norm": 10.5625, + "learning_rate": 9.241750092695588e-06, + "loss": 2.572, + "mean_token_accuracy": 0.4420249186216186, + "step": 4090 + }, + { + "epoch": 0.7584352984797924, + "grad_norm": 5.64453125, + "learning_rate": 9.241564701520208e-06, + "loss": 2.398, + "mean_token_accuracy": 0.5001670936838587, + "step": 4091 + }, + { + "epoch": 0.7586206896551724, + "grad_norm": 5.54296875, + "learning_rate": 9.241379310344829e-06, + "loss": 2.6036, + "mean_token_accuracy": 0.461252721916229, + "step": 4092 + }, + { + "epoch": 0.7588060808305525, + "grad_norm": 5.11328125, + "learning_rate": 9.241193919169447e-06, + "loss": 3.0353, + "mean_token_accuracy": 0.44257630625969996, + "step": 4093 + }, + { + "epoch": 0.7589914720059325, + "grad_norm": 6.734375, + "learning_rate": 9.241008527994068e-06, + "loss": 2.5399, + "mean_token_accuracy": 0.48105911652903394, + "step": 4094 + }, + { + "epoch": 0.7591768631813126, + "grad_norm": 6.78125, + "learning_rate": 9.240823136818688e-06, + "loss": 2.7591, + "mean_token_accuracy": 0.45267997308813634, + "step": 4095 + }, + { + "epoch": 0.7593622543566926, + "grad_norm": 6.1015625, + "learning_rate": 9.240637745643309e-06, + "loss": 3.0812, + "mean_token_accuracy": 0.4214317375886525, + "step": 4096 + }, + { + "epoch": 0.7595476455320727, + "grad_norm": 5.484375, + "learning_rate": 9.240452354467928e-06, + "loss": 2.8984, + "mean_token_accuracy": 0.4418202052677136, + "step": 4097 + }, + { + "epoch": 0.7597330367074527, + "grad_norm": 7.09375, + "learning_rate": 9.240266963292548e-06, + "loss": 2.6811, + "mean_token_accuracy": 0.45031395031395033, + "step": 4098 + }, + { + "epoch": 0.7599184278828328, + "grad_norm": 6.44921875, + "learning_rate": 9.240081572117167e-06, + "loss": 2.801, + "mean_token_accuracy": 0.4308017372931095, + "step": 4099 + }, + { + "epoch": 0.7601038190582128, + "grad_norm": 7.0703125, + "learning_rate": 9.239896180941788e-06, + "loss": 2.4322, + "mean_token_accuracy": 0.49240473738414003, + "step": 4100 + }, + { + "epoch": 0.7602892102335929, + "grad_norm": 8.546875, + "learning_rate": 9.239710789766408e-06, + "loss": 2.8106, + "mean_token_accuracy": 0.4342989571263036, + "step": 4101 + }, + { + "epoch": 0.7604746014089729, + "grad_norm": 8.0234375, + "learning_rate": 9.239525398591028e-06, + "loss": 2.3234, + "mean_token_accuracy": 0.5059210526315789, + "step": 4102 + }, + { + "epoch": 0.760659992584353, + "grad_norm": 5.4296875, + "learning_rate": 9.239340007415647e-06, + "loss": 3.6483, + "mean_token_accuracy": 0.3774027715690657, + "step": 4103 + }, + { + "epoch": 0.760845383759733, + "grad_norm": 7.73828125, + "learning_rate": 9.239154616240268e-06, + "loss": 2.407, + "mean_token_accuracy": 0.5005283550545967, + "step": 4104 + }, + { + "epoch": 0.7610307749351131, + "grad_norm": 7.4375, + "learning_rate": 9.238969225064888e-06, + "loss": 2.9656, + "mean_token_accuracy": 0.4222117350951707, + "step": 4105 + }, + { + "epoch": 0.7612161661104931, + "grad_norm": 6.25, + "learning_rate": 9.238783833889507e-06, + "loss": 2.7779, + "mean_token_accuracy": 0.4521497919556172, + "step": 4106 + }, + { + "epoch": 0.7614015572858732, + "grad_norm": 5.4375, + "learning_rate": 9.238598442714128e-06, + "loss": 3.346, + "mean_token_accuracy": 0.40461971830985916, + "step": 4107 + }, + { + "epoch": 0.7615869484612533, + "grad_norm": 5.40234375, + "learning_rate": 9.238413051538746e-06, + "loss": 2.6364, + "mean_token_accuracy": 0.4711763178395222, + "step": 4108 + }, + { + "epoch": 0.7617723396366333, + "grad_norm": 7.16796875, + "learning_rate": 9.238227660363367e-06, + "loss": 2.8688, + "mean_token_accuracy": 0.4258139235619611, + "step": 4109 + }, + { + "epoch": 0.7619577308120133, + "grad_norm": 6.2890625, + "learning_rate": 9.238042269187987e-06, + "loss": 2.4537, + "mean_token_accuracy": 0.49286823894930093, + "step": 4110 + }, + { + "epoch": 0.7621431219873934, + "grad_norm": 5.89453125, + "learning_rate": 9.237856878012608e-06, + "loss": 2.7115, + "mean_token_accuracy": 0.4467812259553732, + "step": 4111 + }, + { + "epoch": 0.7623285131627735, + "grad_norm": 5.5390625, + "learning_rate": 9.237671486837228e-06, + "loss": 3.4207, + "mean_token_accuracy": 0.3841204057149089, + "step": 4112 + }, + { + "epoch": 0.7625139043381535, + "grad_norm": 7.17578125, + "learning_rate": 9.237486095661847e-06, + "loss": 2.9447, + "mean_token_accuracy": 0.4154901169826543, + "step": 4113 + }, + { + "epoch": 0.7626992955135335, + "grad_norm": 11.8671875, + "learning_rate": 9.237300704486468e-06, + "loss": 2.6685, + "mean_token_accuracy": 0.46116449971735446, + "step": 4114 + }, + { + "epoch": 0.7628846866889136, + "grad_norm": 7.30859375, + "learning_rate": 9.237115313311086e-06, + "loss": 2.5841, + "mean_token_accuracy": 0.4966654083301875, + "step": 4115 + }, + { + "epoch": 0.7630700778642937, + "grad_norm": 6.00390625, + "learning_rate": 9.236929922135707e-06, + "loss": 3.0009, + "mean_token_accuracy": 0.4423462390547526, + "step": 4116 + }, + { + "epoch": 0.7632554690396737, + "grad_norm": 7.40625, + "learning_rate": 9.236744530960326e-06, + "loss": 2.7945, + "mean_token_accuracy": 0.44911616161616164, + "step": 4117 + }, + { + "epoch": 0.7634408602150538, + "grad_norm": 6.44140625, + "learning_rate": 9.236559139784948e-06, + "loss": 2.9311, + "mean_token_accuracy": 0.42918210316329436, + "step": 4118 + }, + { + "epoch": 0.7636262513904338, + "grad_norm": 6.15234375, + "learning_rate": 9.236373748609567e-06, + "loss": 2.4578, + "mean_token_accuracy": 0.4905496415381273, + "step": 4119 + }, + { + "epoch": 0.7638116425658139, + "grad_norm": 5.078125, + "learning_rate": 9.236188357434187e-06, + "loss": 2.7956, + "mean_token_accuracy": 0.44071315178898524, + "step": 4120 + }, + { + "epoch": 0.7639970337411939, + "grad_norm": 8.375, + "learning_rate": 9.236002966258808e-06, + "loss": 2.423, + "mean_token_accuracy": 0.4873111339298812, + "step": 4121 + }, + { + "epoch": 0.764182424916574, + "grad_norm": 6.265625, + "learning_rate": 9.235817575083426e-06, + "loss": 3.3581, + "mean_token_accuracy": 0.38879070941681393, + "step": 4122 + }, + { + "epoch": 0.764367816091954, + "grad_norm": 6.5390625, + "learning_rate": 9.235632183908047e-06, + "loss": 2.7319, + "mean_token_accuracy": 0.4613774335356919, + "step": 4123 + }, + { + "epoch": 0.7645532072673341, + "grad_norm": 8.75, + "learning_rate": 9.235446792732666e-06, + "loss": 2.7523, + "mean_token_accuracy": 0.4466780724265754, + "step": 4124 + }, + { + "epoch": 0.7647385984427141, + "grad_norm": 5.28515625, + "learning_rate": 9.235261401557286e-06, + "loss": 2.7499, + "mean_token_accuracy": 0.45650557620817844, + "step": 4125 + }, + { + "epoch": 0.7649239896180942, + "grad_norm": 8.6796875, + "learning_rate": 9.235076010381907e-06, + "loss": 2.835, + "mean_token_accuracy": 0.4424226324877142, + "step": 4126 + }, + { + "epoch": 0.7651093807934742, + "grad_norm": 11.7734375, + "learning_rate": 9.234890619206527e-06, + "loss": 3.3589, + "mean_token_accuracy": 0.3951010410287814, + "step": 4127 + }, + { + "epoch": 0.7652947719688543, + "grad_norm": 10.640625, + "learning_rate": 9.234705228031146e-06, + "loss": 2.7618, + "mean_token_accuracy": 0.4427973699940227, + "step": 4128 + }, + { + "epoch": 0.7654801631442343, + "grad_norm": 9.0859375, + "learning_rate": 9.234519836855767e-06, + "loss": 2.836, + "mean_token_accuracy": 0.44959816303099887, + "step": 4129 + }, + { + "epoch": 0.7656655543196144, + "grad_norm": 9.59375, + "learning_rate": 9.234334445680387e-06, + "loss": 2.873, + "mean_token_accuracy": 0.4414353419092756, + "step": 4130 + }, + { + "epoch": 0.7658509454949944, + "grad_norm": 6.01953125, + "learning_rate": 9.234149054505006e-06, + "loss": 2.663, + "mean_token_accuracy": 0.4666402953586498, + "step": 4131 + }, + { + "epoch": 0.7660363366703745, + "grad_norm": 5.85546875, + "learning_rate": 9.233963663329626e-06, + "loss": 2.9813, + "mean_token_accuracy": 0.4395527603074773, + "step": 4132 + }, + { + "epoch": 0.7662217278457546, + "grad_norm": 6.4296875, + "learning_rate": 9.233778272154245e-06, + "loss": 2.3067, + "mean_token_accuracy": 0.5062653957373889, + "step": 4133 + }, + { + "epoch": 0.7664071190211346, + "grad_norm": 6.859375, + "learning_rate": 9.233592880978867e-06, + "loss": 2.3876, + "mean_token_accuracy": 0.49891316782976775, + "step": 4134 + }, + { + "epoch": 0.7665925101965146, + "grad_norm": 6.0078125, + "learning_rate": 9.233407489803486e-06, + "loss": 3.1111, + "mean_token_accuracy": 0.40258924082453496, + "step": 4135 + }, + { + "epoch": 0.7667779013718947, + "grad_norm": 5.53515625, + "learning_rate": 9.233222098628107e-06, + "loss": 3.1174, + "mean_token_accuracy": 0.42552891396332865, + "step": 4136 + }, + { + "epoch": 0.7669632925472748, + "grad_norm": 7.84765625, + "learning_rate": 9.233036707452725e-06, + "loss": 2.1024, + "mean_token_accuracy": 0.5307548134264781, + "step": 4137 + }, + { + "epoch": 0.7671486837226548, + "grad_norm": 5.40234375, + "learning_rate": 9.232851316277346e-06, + "loss": 3.1837, + "mean_token_accuracy": 0.40727453911310413, + "step": 4138 + }, + { + "epoch": 0.7673340748980348, + "grad_norm": 7.0, + "learning_rate": 9.232665925101966e-06, + "loss": 2.6321, + "mean_token_accuracy": 0.46714182169606183, + "step": 4139 + }, + { + "epoch": 0.7675194660734149, + "grad_norm": 7.41796875, + "learning_rate": 9.232480533926585e-06, + "loss": 2.7366, + "mean_token_accuracy": 0.4531009738595592, + "step": 4140 + }, + { + "epoch": 0.767704857248795, + "grad_norm": 5.37890625, + "learning_rate": 9.232295142751206e-06, + "loss": 2.981, + "mean_token_accuracy": 0.43306672279520103, + "step": 4141 + }, + { + "epoch": 0.767890248424175, + "grad_norm": 6.7421875, + "learning_rate": 9.232109751575826e-06, + "loss": 2.3775, + "mean_token_accuracy": 0.5042005600746766, + "step": 4142 + }, + { + "epoch": 0.768075639599555, + "grad_norm": 6.7890625, + "learning_rate": 9.231924360400447e-06, + "loss": 2.8148, + "mean_token_accuracy": 0.4447220487195503, + "step": 4143 + }, + { + "epoch": 0.7682610307749351, + "grad_norm": 5.5859375, + "learning_rate": 9.231738969225065e-06, + "loss": 2.9425, + "mean_token_accuracy": 0.4293394777265745, + "step": 4144 + }, + { + "epoch": 0.7684464219503152, + "grad_norm": 7.19140625, + "learning_rate": 9.231553578049686e-06, + "loss": 2.7067, + "mean_token_accuracy": 0.4512902827509569, + "step": 4145 + }, + { + "epoch": 0.7686318131256952, + "grad_norm": 6.92578125, + "learning_rate": 9.231368186874305e-06, + "loss": 2.9598, + "mean_token_accuracy": 0.4323198667221991, + "step": 4146 + }, + { + "epoch": 0.7688172043010753, + "grad_norm": 6.99609375, + "learning_rate": 9.231182795698925e-06, + "loss": 2.915, + "mean_token_accuracy": 0.4269088844734601, + "step": 4147 + }, + { + "epoch": 0.7690025954764553, + "grad_norm": 6.97265625, + "learning_rate": 9.230997404523546e-06, + "loss": 3.2325, + "mean_token_accuracy": 0.40055370985603544, + "step": 4148 + }, + { + "epoch": 0.7691879866518354, + "grad_norm": 8.5390625, + "learning_rate": 9.230812013348164e-06, + "loss": 2.9646, + "mean_token_accuracy": 0.4490572565300121, + "step": 4149 + }, + { + "epoch": 0.7693733778272154, + "grad_norm": 6.50390625, + "learning_rate": 9.230626622172787e-06, + "loss": 2.5494, + "mean_token_accuracy": 0.4760922925871379, + "step": 4150 + }, + { + "epoch": 0.7695587690025955, + "grad_norm": 7.953125, + "learning_rate": 9.230441230997405e-06, + "loss": 2.389, + "mean_token_accuracy": 0.4922844877596009, + "step": 4151 + }, + { + "epoch": 0.7697441601779755, + "grad_norm": 5.26953125, + "learning_rate": 9.230255839822026e-06, + "loss": 3.2441, + "mean_token_accuracy": 0.398993158642055, + "step": 4152 + }, + { + "epoch": 0.7699295513533556, + "grad_norm": 6.30078125, + "learning_rate": 9.230070448646645e-06, + "loss": 2.7994, + "mean_token_accuracy": 0.4609221069223696, + "step": 4153 + }, + { + "epoch": 0.7701149425287356, + "grad_norm": 7.23046875, + "learning_rate": 9.229885057471265e-06, + "loss": 2.3821, + "mean_token_accuracy": 0.49879027123392333, + "step": 4154 + }, + { + "epoch": 0.7703003337041157, + "grad_norm": 4.71484375, + "learning_rate": 9.229699666295884e-06, + "loss": 2.9933, + "mean_token_accuracy": 0.41575956073595993, + "step": 4155 + }, + { + "epoch": 0.7704857248794957, + "grad_norm": 4.921875, + "learning_rate": 9.229514275120505e-06, + "loss": 2.5524, + "mean_token_accuracy": 0.4837806301050175, + "step": 4156 + }, + { + "epoch": 0.7706711160548758, + "grad_norm": 9.0, + "learning_rate": 9.229328883945125e-06, + "loss": 2.834, + "mean_token_accuracy": 0.45734400883489784, + "step": 4157 + }, + { + "epoch": 0.7708565072302559, + "grad_norm": 6.671875, + "learning_rate": 9.229143492769746e-06, + "loss": 2.9378, + "mean_token_accuracy": 0.4311533159748813, + "step": 4158 + }, + { + "epoch": 0.7710418984056359, + "grad_norm": 7.515625, + "learning_rate": 9.228958101594366e-06, + "loss": 2.7893, + "mean_token_accuracy": 0.4407091125283753, + "step": 4159 + }, + { + "epoch": 0.7712272895810159, + "grad_norm": 6.25, + "learning_rate": 9.228772710418985e-06, + "loss": 3.0159, + "mean_token_accuracy": 0.4393793985924504, + "step": 4160 + }, + { + "epoch": 0.771412680756396, + "grad_norm": 7.11328125, + "learning_rate": 9.228587319243605e-06, + "loss": 3.1598, + "mean_token_accuracy": 0.425963808025177, + "step": 4161 + }, + { + "epoch": 0.7715980719317761, + "grad_norm": 7.25, + "learning_rate": 9.228401928068224e-06, + "loss": 2.897, + "mean_token_accuracy": 0.4387332521315469, + "step": 4162 + }, + { + "epoch": 0.7717834631071561, + "grad_norm": 5.83203125, + "learning_rate": 9.228216536892845e-06, + "loss": 2.7121, + "mean_token_accuracy": 0.45878378378378376, + "step": 4163 + }, + { + "epoch": 0.7719688542825361, + "grad_norm": 5.7734375, + "learning_rate": 9.228031145717465e-06, + "loss": 2.8085, + "mean_token_accuracy": 0.47136273864384465, + "step": 4164 + }, + { + "epoch": 0.7721542454579162, + "grad_norm": 5.9765625, + "learning_rate": 9.227845754542084e-06, + "loss": 2.8246, + "mean_token_accuracy": 0.44946401225114857, + "step": 4165 + }, + { + "epoch": 0.7723396366332963, + "grad_norm": 7.42578125, + "learning_rate": 9.227660363366704e-06, + "loss": 3.3767, + "mean_token_accuracy": 0.39490445859872614, + "step": 4166 + }, + { + "epoch": 0.7725250278086763, + "grad_norm": 5.83203125, + "learning_rate": 9.227474972191325e-06, + "loss": 2.823, + "mean_token_accuracy": 0.4447122200170116, + "step": 4167 + }, + { + "epoch": 0.7727104189840563, + "grad_norm": 6.62109375, + "learning_rate": 9.227289581015945e-06, + "loss": 2.51, + "mean_token_accuracy": 0.4813487322447896, + "step": 4168 + }, + { + "epoch": 0.7728958101594364, + "grad_norm": 7.06640625, + "learning_rate": 9.227104189840564e-06, + "loss": 2.4711, + "mean_token_accuracy": 0.4863375161252492, + "step": 4169 + }, + { + "epoch": 0.7730812013348165, + "grad_norm": 5.90625, + "learning_rate": 9.226918798665185e-06, + "loss": 2.754, + "mean_token_accuracy": 0.4450073323556859, + "step": 4170 + }, + { + "epoch": 0.7732665925101965, + "grad_norm": 5.4609375, + "learning_rate": 9.226733407489803e-06, + "loss": 2.7942, + "mean_token_accuracy": 0.4496114314364502, + "step": 4171 + }, + { + "epoch": 0.7734519836855765, + "grad_norm": 7.18359375, + "learning_rate": 9.226548016314424e-06, + "loss": 2.5152, + "mean_token_accuracy": 0.4770898341271145, + "step": 4172 + }, + { + "epoch": 0.7736373748609566, + "grad_norm": 8.15625, + "learning_rate": 9.226362625139044e-06, + "loss": 2.8384, + "mean_token_accuracy": 0.4507244297805193, + "step": 4173 + }, + { + "epoch": 0.7738227660363367, + "grad_norm": 5.48046875, + "learning_rate": 9.226177233963665e-06, + "loss": 2.9016, + "mean_token_accuracy": 0.4589261744966443, + "step": 4174 + }, + { + "epoch": 0.7740081572117167, + "grad_norm": 5.5625, + "learning_rate": 9.225991842788284e-06, + "loss": 2.3125, + "mean_token_accuracy": 0.501249256395003, + "step": 4175 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 6.63671875, + "learning_rate": 9.225806451612904e-06, + "loss": 2.8131, + "mean_token_accuracy": 0.444589820751014, + "step": 4176 + }, + { + "epoch": 0.7743789395624768, + "grad_norm": 6.24609375, + "learning_rate": 9.225621060437525e-06, + "loss": 3.6458, + "mean_token_accuracy": 0.38483241169168264, + "step": 4177 + }, + { + "epoch": 0.7745643307378569, + "grad_norm": 7.4296875, + "learning_rate": 9.225435669262143e-06, + "loss": 3.0916, + "mean_token_accuracy": 0.4105047748976808, + "step": 4178 + }, + { + "epoch": 0.7747497219132369, + "grad_norm": 4.484375, + "learning_rate": 9.225250278086764e-06, + "loss": 2.8858, + "mean_token_accuracy": 0.4352795373175434, + "step": 4179 + }, + { + "epoch": 0.774935113088617, + "grad_norm": 8.21875, + "learning_rate": 9.225064886911383e-06, + "loss": 2.6781, + "mean_token_accuracy": 0.4359771258080557, + "step": 4180 + }, + { + "epoch": 0.775120504263997, + "grad_norm": 6.90625, + "learning_rate": 9.224879495736003e-06, + "loss": 2.8168, + "mean_token_accuracy": 0.4501251340722202, + "step": 4181 + }, + { + "epoch": 0.7753058954393771, + "grad_norm": 6.359375, + "learning_rate": 9.224694104560624e-06, + "loss": 3.041, + "mean_token_accuracy": 0.41057134971018494, + "step": 4182 + }, + { + "epoch": 0.7754912866147572, + "grad_norm": 6.23046875, + "learning_rate": 9.224508713385244e-06, + "loss": 3.0925, + "mean_token_accuracy": 0.4010741943542343, + "step": 4183 + }, + { + "epoch": 0.7756766777901372, + "grad_norm": 8.8828125, + "learning_rate": 9.224323322209863e-06, + "loss": 3.3827, + "mean_token_accuracy": 0.3868409458070201, + "step": 4184 + }, + { + "epoch": 0.7758620689655172, + "grad_norm": 6.06640625, + "learning_rate": 9.224137931034484e-06, + "loss": 3.0692, + "mean_token_accuracy": 0.4360035100915131, + "step": 4185 + }, + { + "epoch": 0.7760474601408973, + "grad_norm": 5.78125, + "learning_rate": 9.223952539859104e-06, + "loss": 2.7654, + "mean_token_accuracy": 0.4539018250471995, + "step": 4186 + }, + { + "epoch": 0.7762328513162774, + "grad_norm": 7.01953125, + "learning_rate": 9.223767148683723e-06, + "loss": 3.183, + "mean_token_accuracy": 0.4066538355787737, + "step": 4187 + }, + { + "epoch": 0.7764182424916574, + "grad_norm": 5.99609375, + "learning_rate": 9.223581757508343e-06, + "loss": 3.0951, + "mean_token_accuracy": 0.41848477583313354, + "step": 4188 + }, + { + "epoch": 0.7766036336670374, + "grad_norm": 6.4609375, + "learning_rate": 9.223396366332962e-06, + "loss": 3.4678, + "mean_token_accuracy": 0.36137474834627553, + "step": 4189 + }, + { + "epoch": 0.7767890248424175, + "grad_norm": 5.66015625, + "learning_rate": 9.223210975157584e-06, + "loss": 2.8044, + "mean_token_accuracy": 0.447041166380789, + "step": 4190 + }, + { + "epoch": 0.7769744160177976, + "grad_norm": 4.93359375, + "learning_rate": 9.223025583982203e-06, + "loss": 2.6011, + "mean_token_accuracy": 0.4681701030927835, + "step": 4191 + }, + { + "epoch": 0.7771598071931776, + "grad_norm": 7.42578125, + "learning_rate": 9.222840192806824e-06, + "loss": 2.639, + "mean_token_accuracy": 0.471261309207025, + "step": 4192 + }, + { + "epoch": 0.7773451983685576, + "grad_norm": 6.11328125, + "learning_rate": 9.222654801631442e-06, + "loss": 3.0532, + "mean_token_accuracy": 0.4170703575547866, + "step": 4193 + }, + { + "epoch": 0.7775305895439377, + "grad_norm": 6.3671875, + "learning_rate": 9.222469410456063e-06, + "loss": 2.9521, + "mean_token_accuracy": 0.4315774996561683, + "step": 4194 + }, + { + "epoch": 0.7777159807193178, + "grad_norm": 6.93359375, + "learning_rate": 9.222284019280683e-06, + "loss": 3.1715, + "mean_token_accuracy": 0.41615853658536583, + "step": 4195 + }, + { + "epoch": 0.7779013718946978, + "grad_norm": 5.68359375, + "learning_rate": 9.222098628105302e-06, + "loss": 3.0222, + "mean_token_accuracy": 0.43789691330674935, + "step": 4196 + }, + { + "epoch": 0.7780867630700778, + "grad_norm": 5.74609375, + "learning_rate": 9.221913236929923e-06, + "loss": 3.1901, + "mean_token_accuracy": 0.40142570728447313, + "step": 4197 + }, + { + "epoch": 0.7782721542454579, + "grad_norm": 6.7734375, + "learning_rate": 9.221727845754543e-06, + "loss": 3.0015, + "mean_token_accuracy": 0.4218315203642803, + "step": 4198 + }, + { + "epoch": 0.778457545420838, + "grad_norm": 6.7265625, + "learning_rate": 9.221542454579164e-06, + "loss": 2.5276, + "mean_token_accuracy": 0.47587523870146403, + "step": 4199 + }, + { + "epoch": 0.778642936596218, + "grad_norm": 6.7890625, + "learning_rate": 9.221357063403782e-06, + "loss": 2.8134, + "mean_token_accuracy": 0.439819252362043, + "step": 4200 + }, + { + "epoch": 0.778828327771598, + "grad_norm": 6.171875, + "learning_rate": 9.221171672228403e-06, + "loss": 3.1924, + "mean_token_accuracy": 0.42214285714285715, + "step": 4201 + }, + { + "epoch": 0.7790137189469781, + "grad_norm": 11.046875, + "learning_rate": 9.220986281053023e-06, + "loss": 2.6358, + "mean_token_accuracy": 0.46728077521234596, + "step": 4202 + }, + { + "epoch": 0.7791991101223582, + "grad_norm": 9.5078125, + "learning_rate": 9.220800889877642e-06, + "loss": 2.4564, + "mean_token_accuracy": 0.48792212474462204, + "step": 4203 + }, + { + "epoch": 0.7793845012977382, + "grad_norm": 6.88671875, + "learning_rate": 9.220615498702263e-06, + "loss": 3.0597, + "mean_token_accuracy": 0.42879090490090643, + "step": 4204 + }, + { + "epoch": 0.7795698924731183, + "grad_norm": 6.1875, + "learning_rate": 9.220430107526881e-06, + "loss": 3.0234, + "mean_token_accuracy": 0.4236957581667479, + "step": 4205 + }, + { + "epoch": 0.7797552836484983, + "grad_norm": 7.80078125, + "learning_rate": 9.220244716351504e-06, + "loss": 2.927, + "mean_token_accuracy": 0.43412563667232595, + "step": 4206 + }, + { + "epoch": 0.7799406748238784, + "grad_norm": 6.96484375, + "learning_rate": 9.220059325176122e-06, + "loss": 2.4846, + "mean_token_accuracy": 0.493886230728336, + "step": 4207 + }, + { + "epoch": 0.7801260659992585, + "grad_norm": 5.63671875, + "learning_rate": 9.219873934000743e-06, + "loss": 2.2268, + "mean_token_accuracy": 0.5343340785691473, + "step": 4208 + }, + { + "epoch": 0.7803114571746385, + "grad_norm": 7.1484375, + "learning_rate": 9.219688542825362e-06, + "loss": 2.542, + "mean_token_accuracy": 0.4632611064035831, + "step": 4209 + }, + { + "epoch": 0.7804968483500185, + "grad_norm": 6.21484375, + "learning_rate": 9.219503151649982e-06, + "loss": 2.7727, + "mean_token_accuracy": 0.47639831878435174, + "step": 4210 + }, + { + "epoch": 0.7806822395253986, + "grad_norm": 6.9609375, + "learning_rate": 9.219317760474603e-06, + "loss": 2.7549, + "mean_token_accuracy": 0.45268024851900013, + "step": 4211 + }, + { + "epoch": 0.7808676307007787, + "grad_norm": 5.37890625, + "learning_rate": 9.219132369299222e-06, + "loss": 2.7618, + "mean_token_accuracy": 0.47688641779189833, + "step": 4212 + }, + { + "epoch": 0.7810530218761587, + "grad_norm": 6.6640625, + "learning_rate": 9.218946978123842e-06, + "loss": 2.7575, + "mean_token_accuracy": 0.4550787280024699, + "step": 4213 + }, + { + "epoch": 0.7812384130515387, + "grad_norm": 8.4921875, + "learning_rate": 9.21876158694846e-06, + "loss": 2.2068, + "mean_token_accuracy": 0.5121336274818783, + "step": 4214 + }, + { + "epoch": 0.7814238042269188, + "grad_norm": 5.1640625, + "learning_rate": 9.218576195773083e-06, + "loss": 2.683, + "mean_token_accuracy": 0.45683105401298524, + "step": 4215 + }, + { + "epoch": 0.7816091954022989, + "grad_norm": 10.03125, + "learning_rate": 9.218390804597702e-06, + "loss": 2.8649, + "mean_token_accuracy": 0.463226649248857, + "step": 4216 + }, + { + "epoch": 0.7817945865776789, + "grad_norm": 7.4921875, + "learning_rate": 9.218205413422322e-06, + "loss": 2.687, + "mean_token_accuracy": 0.46336526784509513, + "step": 4217 + }, + { + "epoch": 0.7819799777530589, + "grad_norm": 5.69921875, + "learning_rate": 9.218020022246941e-06, + "loss": 2.9644, + "mean_token_accuracy": 0.44504943401633473, + "step": 4218 + }, + { + "epoch": 0.782165368928439, + "grad_norm": 6.95703125, + "learning_rate": 9.217834631071562e-06, + "loss": 2.7178, + "mean_token_accuracy": 0.4540048083006453, + "step": 4219 + }, + { + "epoch": 0.7823507601038191, + "grad_norm": 6.05078125, + "learning_rate": 9.217649239896182e-06, + "loss": 3.5448, + "mean_token_accuracy": 0.4, + "step": 4220 + }, + { + "epoch": 0.7825361512791991, + "grad_norm": 5.82421875, + "learning_rate": 9.217463848720801e-06, + "loss": 3.0608, + "mean_token_accuracy": 0.42347792508688376, + "step": 4221 + }, + { + "epoch": 0.7827215424545791, + "grad_norm": 6.08203125, + "learning_rate": 9.217278457545421e-06, + "loss": 3.1666, + "mean_token_accuracy": 0.4160839160839161, + "step": 4222 + }, + { + "epoch": 0.7829069336299592, + "grad_norm": 6.21484375, + "learning_rate": 9.217093066370042e-06, + "loss": 3.1761, + "mean_token_accuracy": 0.4104833219877468, + "step": 4223 + }, + { + "epoch": 0.7830923248053393, + "grad_norm": 7.6640625, + "learning_rate": 9.216907675194662e-06, + "loss": 3.0579, + "mean_token_accuracy": 0.4257872999483738, + "step": 4224 + }, + { + "epoch": 0.7832777159807193, + "grad_norm": 12.203125, + "learning_rate": 9.216722284019281e-06, + "loss": 2.5231, + "mean_token_accuracy": 0.4465006729475101, + "step": 4225 + }, + { + "epoch": 0.7834631071560993, + "grad_norm": 8.015625, + "learning_rate": 9.216536892843902e-06, + "loss": 3.2124, + "mean_token_accuracy": 0.43701142513529767, + "step": 4226 + }, + { + "epoch": 0.7836484983314794, + "grad_norm": 9.4140625, + "learning_rate": 9.21635150166852e-06, + "loss": 2.6305, + "mean_token_accuracy": 0.4792484243072898, + "step": 4227 + }, + { + "epoch": 0.7838338895068595, + "grad_norm": 5.5, + "learning_rate": 9.216166110493141e-06, + "loss": 2.9033, + "mean_token_accuracy": 0.4327833050230527, + "step": 4228 + }, + { + "epoch": 0.7840192806822395, + "grad_norm": 7.10546875, + "learning_rate": 9.215980719317761e-06, + "loss": 2.9471, + "mean_token_accuracy": 0.44229017566688356, + "step": 4229 + }, + { + "epoch": 0.7842046718576196, + "grad_norm": 7.00390625, + "learning_rate": 9.21579532814238e-06, + "loss": 2.9005, + "mean_token_accuracy": 0.438558752352783, + "step": 4230 + }, + { + "epoch": 0.7843900630329996, + "grad_norm": 8.0234375, + "learning_rate": 9.215609936967002e-06, + "loss": 2.5694, + "mean_token_accuracy": 0.46709549727857497, + "step": 4231 + }, + { + "epoch": 0.7845754542083797, + "grad_norm": 6.8125, + "learning_rate": 9.215424545791621e-06, + "loss": 2.976, + "mean_token_accuracy": 0.4227266155847911, + "step": 4232 + }, + { + "epoch": 0.7847608453837598, + "grad_norm": 8.890625, + "learning_rate": 9.215239154616242e-06, + "loss": 3.0059, + "mean_token_accuracy": 0.44510181618051736, + "step": 4233 + }, + { + "epoch": 0.7849462365591398, + "grad_norm": 6.21484375, + "learning_rate": 9.21505376344086e-06, + "loss": 2.9082, + "mean_token_accuracy": 0.45241162158293974, + "step": 4234 + }, + { + "epoch": 0.7851316277345198, + "grad_norm": 8.96875, + "learning_rate": 9.214868372265481e-06, + "loss": 3.0119, + "mean_token_accuracy": 0.4393545592376002, + "step": 4235 + }, + { + "epoch": 0.7853170189098999, + "grad_norm": 8.21875, + "learning_rate": 9.2146829810901e-06, + "loss": 2.8862, + "mean_token_accuracy": 0.4429034783581255, + "step": 4236 + }, + { + "epoch": 0.78550241008528, + "grad_norm": 12.0234375, + "learning_rate": 9.21449758991472e-06, + "loss": 2.8171, + "mean_token_accuracy": 0.44640914036996737, + "step": 4237 + }, + { + "epoch": 0.78568780126066, + "grad_norm": 7.8125, + "learning_rate": 9.21431219873934e-06, + "loss": 2.8472, + "mean_token_accuracy": 0.4338443396226415, + "step": 4238 + }, + { + "epoch": 0.78587319243604, + "grad_norm": 5.44140625, + "learning_rate": 9.214126807563961e-06, + "loss": 2.9486, + "mean_token_accuracy": 0.42700889601866704, + "step": 4239 + }, + { + "epoch": 0.7860585836114201, + "grad_norm": 6.48828125, + "learning_rate": 9.213941416388582e-06, + "loss": 2.7167, + "mean_token_accuracy": 0.461724041941171, + "step": 4240 + }, + { + "epoch": 0.7862439747868002, + "grad_norm": 14.625, + "learning_rate": 9.2137560252132e-06, + "loss": 2.5803, + "mean_token_accuracy": 0.46241979835013747, + "step": 4241 + }, + { + "epoch": 0.7864293659621802, + "grad_norm": 6.8515625, + "learning_rate": 9.213570634037821e-06, + "loss": 2.4896, + "mean_token_accuracy": 0.48712849408905096, + "step": 4242 + }, + { + "epoch": 0.7866147571375602, + "grad_norm": 6.703125, + "learning_rate": 9.21338524286244e-06, + "loss": 2.9513, + "mean_token_accuracy": 0.4453387671930718, + "step": 4243 + }, + { + "epoch": 0.7868001483129403, + "grad_norm": 6.03125, + "learning_rate": 9.21319985168706e-06, + "loss": 3.1561, + "mean_token_accuracy": 0.43246578415974873, + "step": 4244 + }, + { + "epoch": 0.7869855394883204, + "grad_norm": 7.828125, + "learning_rate": 9.21301446051168e-06, + "loss": 3.2209, + "mean_token_accuracy": 0.41520035487209817, + "step": 4245 + }, + { + "epoch": 0.7871709306637004, + "grad_norm": 7.53515625, + "learning_rate": 9.2128290693363e-06, + "loss": 2.8638, + "mean_token_accuracy": 0.43796268877702105, + "step": 4246 + }, + { + "epoch": 0.7873563218390804, + "grad_norm": 5.20703125, + "learning_rate": 9.21264367816092e-06, + "loss": 2.7127, + "mean_token_accuracy": 0.4641888838680524, + "step": 4247 + }, + { + "epoch": 0.7875417130144605, + "grad_norm": 6.609375, + "learning_rate": 9.21245828698554e-06, + "loss": 2.5332, + "mean_token_accuracy": 0.4695767195767196, + "step": 4248 + }, + { + "epoch": 0.7877271041898406, + "grad_norm": 7.23828125, + "learning_rate": 9.212272895810161e-06, + "loss": 2.6835, + "mean_token_accuracy": 0.4561465304684748, + "step": 4249 + }, + { + "epoch": 0.7879124953652206, + "grad_norm": 6.90625, + "learning_rate": 9.21208750463478e-06, + "loss": 2.5804, + "mean_token_accuracy": 0.4583576684966709, + "step": 4250 + }, + { + "epoch": 0.7880978865406006, + "grad_norm": 9.71875, + "learning_rate": 9.2119021134594e-06, + "loss": 2.0372, + "mean_token_accuracy": 0.5474542758279782, + "step": 4251 + }, + { + "epoch": 0.7882832777159807, + "grad_norm": 10.0859375, + "learning_rate": 9.21171672228402e-06, + "loss": 2.9795, + "mean_token_accuracy": 0.4281604602844814, + "step": 4252 + }, + { + "epoch": 0.7884686688913608, + "grad_norm": 11.0859375, + "learning_rate": 9.21153133110864e-06, + "loss": 2.0194, + "mean_token_accuracy": 0.531136449066533, + "step": 4253 + }, + { + "epoch": 0.7886540600667408, + "grad_norm": 6.4296875, + "learning_rate": 9.21134593993326e-06, + "loss": 2.8925, + "mean_token_accuracy": 0.43632596685082875, + "step": 4254 + }, + { + "epoch": 0.7888394512421208, + "grad_norm": 7.734375, + "learning_rate": 9.21116054875788e-06, + "loss": 2.3587, + "mean_token_accuracy": 0.5004965243296922, + "step": 4255 + }, + { + "epoch": 0.7890248424175009, + "grad_norm": 7.890625, + "learning_rate": 9.2109751575825e-06, + "loss": 2.9216, + "mean_token_accuracy": 0.430595286745675, + "step": 4256 + }, + { + "epoch": 0.789210233592881, + "grad_norm": 5.43359375, + "learning_rate": 9.21078976640712e-06, + "loss": 2.8603, + "mean_token_accuracy": 0.44718485301444943, + "step": 4257 + }, + { + "epoch": 0.7893956247682611, + "grad_norm": 6.83984375, + "learning_rate": 9.21060437523174e-06, + "loss": 2.569, + "mean_token_accuracy": 0.47357954545454545, + "step": 4258 + }, + { + "epoch": 0.789581015943641, + "grad_norm": 7.9765625, + "learning_rate": 9.21041898405636e-06, + "loss": 2.7196, + "mean_token_accuracy": 0.4508126603934987, + "step": 4259 + }, + { + "epoch": 0.7897664071190211, + "grad_norm": 6.51171875, + "learning_rate": 9.21023359288098e-06, + "loss": 2.4467, + "mean_token_accuracy": 0.49926144756277696, + "step": 4260 + }, + { + "epoch": 0.7899517982944012, + "grad_norm": 6.328125, + "learning_rate": 9.210048201705599e-06, + "loss": 2.5325, + "mean_token_accuracy": 0.47717231222385864, + "step": 4261 + }, + { + "epoch": 0.7901371894697813, + "grad_norm": 5.34765625, + "learning_rate": 9.209862810530219e-06, + "loss": 3.2595, + "mean_token_accuracy": 0.41034952337721287, + "step": 4262 + }, + { + "epoch": 0.7903225806451613, + "grad_norm": 7.26953125, + "learning_rate": 9.20967741935484e-06, + "loss": 2.4621, + "mean_token_accuracy": 0.4769585253456221, + "step": 4263 + }, + { + "epoch": 0.7905079718205413, + "grad_norm": 5.50390625, + "learning_rate": 9.20949202817946e-06, + "loss": 3.29, + "mean_token_accuracy": 0.40071852977753214, + "step": 4264 + }, + { + "epoch": 0.7906933629959214, + "grad_norm": 7.828125, + "learning_rate": 9.209306637004079e-06, + "loss": 2.6563, + "mean_token_accuracy": 0.47397908366533864, + "step": 4265 + }, + { + "epoch": 0.7908787541713015, + "grad_norm": 5.2734375, + "learning_rate": 9.2091212458287e-06, + "loss": 2.8021, + "mean_token_accuracy": 0.4498983444670346, + "step": 4266 + }, + { + "epoch": 0.7910641453466815, + "grad_norm": 6.89453125, + "learning_rate": 9.20893585465332e-06, + "loss": 2.5447, + "mean_token_accuracy": 0.4834315169366716, + "step": 4267 + }, + { + "epoch": 0.7912495365220615, + "grad_norm": 6.0546875, + "learning_rate": 9.208750463477939e-06, + "loss": 2.9085, + "mean_token_accuracy": 0.4450795785698274, + "step": 4268 + }, + { + "epoch": 0.7914349276974416, + "grad_norm": 7.21484375, + "learning_rate": 9.208565072302559e-06, + "loss": 2.8274, + "mean_token_accuracy": 0.43431008391800796, + "step": 4269 + }, + { + "epoch": 0.7916203188728217, + "grad_norm": 6.37109375, + "learning_rate": 9.208379681127178e-06, + "loss": 2.661, + "mean_token_accuracy": 0.46020338983050846, + "step": 4270 + }, + { + "epoch": 0.7918057100482017, + "grad_norm": 6.85546875, + "learning_rate": 9.2081942899518e-06, + "loss": 2.4655, + "mean_token_accuracy": 0.4787803360298693, + "step": 4271 + }, + { + "epoch": 0.7919911012235817, + "grad_norm": 8.734375, + "learning_rate": 9.208008898776419e-06, + "loss": 2.7239, + "mean_token_accuracy": 0.4778111739745403, + "step": 4272 + }, + { + "epoch": 0.7921764923989618, + "grad_norm": 8.09375, + "learning_rate": 9.20782350760104e-06, + "loss": 2.6054, + "mean_token_accuracy": 0.4744768912393654, + "step": 4273 + }, + { + "epoch": 0.7923618835743419, + "grad_norm": 6.390625, + "learning_rate": 9.207638116425658e-06, + "loss": 2.9531, + "mean_token_accuracy": 0.43011452368558045, + "step": 4274 + }, + { + "epoch": 0.7925472747497219, + "grad_norm": 8.984375, + "learning_rate": 9.207452725250279e-06, + "loss": 3.0171, + "mean_token_accuracy": 0.4263424782212976, + "step": 4275 + }, + { + "epoch": 0.7927326659251019, + "grad_norm": 10.0, + "learning_rate": 9.207267334074899e-06, + "loss": 2.8858, + "mean_token_accuracy": 0.43310782824796357, + "step": 4276 + }, + { + "epoch": 0.792918057100482, + "grad_norm": 6.796875, + "learning_rate": 9.207081942899518e-06, + "loss": 2.9521, + "mean_token_accuracy": 0.425511958730655, + "step": 4277 + }, + { + "epoch": 0.7931034482758621, + "grad_norm": 7.0078125, + "learning_rate": 9.206896551724138e-06, + "loss": 2.802, + "mean_token_accuracy": 0.48226661231145534, + "step": 4278 + }, + { + "epoch": 0.7932888394512421, + "grad_norm": 11.0625, + "learning_rate": 9.206711160548759e-06, + "loss": 2.8343, + "mean_token_accuracy": 0.4278266050945926, + "step": 4279 + }, + { + "epoch": 0.7934742306266221, + "grad_norm": 9.3828125, + "learning_rate": 9.20652576937338e-06, + "loss": 2.8539, + "mean_token_accuracy": 0.42800055119195257, + "step": 4280 + }, + { + "epoch": 0.7936596218020022, + "grad_norm": 7.02734375, + "learning_rate": 9.206340378197998e-06, + "loss": 2.6687, + "mean_token_accuracy": 0.46119336025123375, + "step": 4281 + }, + { + "epoch": 0.7938450129773823, + "grad_norm": 8.015625, + "learning_rate": 9.206154987022619e-06, + "loss": 2.7846, + "mean_token_accuracy": 0.4513231756214916, + "step": 4282 + }, + { + "epoch": 0.7940304041527624, + "grad_norm": 12.0703125, + "learning_rate": 9.20596959584724e-06, + "loss": 2.5652, + "mean_token_accuracy": 0.45425616547334924, + "step": 4283 + }, + { + "epoch": 0.7942157953281423, + "grad_norm": 9.3125, + "learning_rate": 9.205784204671858e-06, + "loss": 3.1699, + "mean_token_accuracy": 0.4195605146305363, + "step": 4284 + }, + { + "epoch": 0.7944011865035224, + "grad_norm": 7.20703125, + "learning_rate": 9.205598813496478e-06, + "loss": 2.6675, + "mean_token_accuracy": 0.4723760160207327, + "step": 4285 + }, + { + "epoch": 0.7945865776789025, + "grad_norm": 11.3671875, + "learning_rate": 9.205413422321097e-06, + "loss": 3.2542, + "mean_token_accuracy": 0.3997289972899729, + "step": 4286 + }, + { + "epoch": 0.7947719688542826, + "grad_norm": 13.953125, + "learning_rate": 9.20522803114572e-06, + "loss": 2.7178, + "mean_token_accuracy": 0.46816364195632115, + "step": 4287 + }, + { + "epoch": 0.7949573600296626, + "grad_norm": 16.828125, + "learning_rate": 9.205042639970338e-06, + "loss": 2.9686, + "mean_token_accuracy": 0.4101194522676508, + "step": 4288 + }, + { + "epoch": 0.7951427512050426, + "grad_norm": 17.046875, + "learning_rate": 9.204857248794959e-06, + "loss": 2.6443, + "mean_token_accuracy": 0.462800875273523, + "step": 4289 + }, + { + "epoch": 0.7953281423804227, + "grad_norm": 6.62890625, + "learning_rate": 9.204671857619578e-06, + "loss": 2.4414, + "mean_token_accuracy": 0.487135749822317, + "step": 4290 + }, + { + "epoch": 0.7955135335558028, + "grad_norm": 10.7890625, + "learning_rate": 9.204486466444198e-06, + "loss": 2.5573, + "mean_token_accuracy": 0.4753227821740941, + "step": 4291 + }, + { + "epoch": 0.7956989247311828, + "grad_norm": 16.375, + "learning_rate": 9.204301075268819e-06, + "loss": 2.8874, + "mean_token_accuracy": 0.4446066267698146, + "step": 4292 + }, + { + "epoch": 0.7958843159065628, + "grad_norm": 9.859375, + "learning_rate": 9.204115684093437e-06, + "loss": 2.809, + "mean_token_accuracy": 0.43368990849336253, + "step": 4293 + }, + { + "epoch": 0.7960697070819429, + "grad_norm": 8.8359375, + "learning_rate": 9.203930292918058e-06, + "loss": 2.5457, + "mean_token_accuracy": 0.47324630612537655, + "step": 4294 + }, + { + "epoch": 0.796255098257323, + "grad_norm": 5.6640625, + "learning_rate": 9.203744901742678e-06, + "loss": 3.0402, + "mean_token_accuracy": 0.428014440433213, + "step": 4295 + }, + { + "epoch": 0.796440489432703, + "grad_norm": 10.5234375, + "learning_rate": 9.203559510567299e-06, + "loss": 2.7035, + "mean_token_accuracy": 0.4578252544593369, + "step": 4296 + }, + { + "epoch": 0.796625880608083, + "grad_norm": 14.6875, + "learning_rate": 9.203374119391918e-06, + "loss": 2.4191, + "mean_token_accuracy": 0.4831178611773566, + "step": 4297 + }, + { + "epoch": 0.7968112717834631, + "grad_norm": 12.6796875, + "learning_rate": 9.203188728216538e-06, + "loss": 2.6992, + "mean_token_accuracy": 0.4486863711001642, + "step": 4298 + }, + { + "epoch": 0.7969966629588432, + "grad_norm": 7.52734375, + "learning_rate": 9.203003337041157e-06, + "loss": 2.374, + "mean_token_accuracy": 0.5143055555555556, + "step": 4299 + }, + { + "epoch": 0.7971820541342232, + "grad_norm": 6.7578125, + "learning_rate": 9.202817945865777e-06, + "loss": 2.5986, + "mean_token_accuracy": 0.47138174483106404, + "step": 4300 + }, + { + "epoch": 0.7973674453096032, + "grad_norm": 8.171875, + "learning_rate": 9.202632554690398e-06, + "loss": 3.1837, + "mean_token_accuracy": 0.41209372637944064, + "step": 4301 + }, + { + "epoch": 0.7975528364849833, + "grad_norm": 7.3671875, + "learning_rate": 9.202447163515017e-06, + "loss": 2.8176, + "mean_token_accuracy": 0.4446984541693882, + "step": 4302 + }, + { + "epoch": 0.7977382276603634, + "grad_norm": 6.18359375, + "learning_rate": 9.202261772339637e-06, + "loss": 3.0559, + "mean_token_accuracy": 0.4158186864014801, + "step": 4303 + }, + { + "epoch": 0.7979236188357434, + "grad_norm": 5.4609375, + "learning_rate": 9.202076381164258e-06, + "loss": 2.8058, + "mean_token_accuracy": 0.4567886079057206, + "step": 4304 + }, + { + "epoch": 0.7981090100111234, + "grad_norm": 6.33203125, + "learning_rate": 9.201890989988878e-06, + "loss": 3.1767, + "mean_token_accuracy": 0.40560920756713853, + "step": 4305 + }, + { + "epoch": 0.7982944011865035, + "grad_norm": 10.4140625, + "learning_rate": 9.201705598813497e-06, + "loss": 3.1918, + "mean_token_accuracy": 0.4176054071451561, + "step": 4306 + }, + { + "epoch": 0.7984797923618836, + "grad_norm": 6.4296875, + "learning_rate": 9.201520207638117e-06, + "loss": 3.1628, + "mean_token_accuracy": 0.4073435985786583, + "step": 4307 + }, + { + "epoch": 0.7986651835372637, + "grad_norm": 5.53515625, + "learning_rate": 9.201334816462736e-06, + "loss": 2.556, + "mean_token_accuracy": 0.4820005496015389, + "step": 4308 + }, + { + "epoch": 0.7988505747126436, + "grad_norm": 7.2734375, + "learning_rate": 9.201149425287357e-06, + "loss": 2.3083, + "mean_token_accuracy": 0.49704287826515525, + "step": 4309 + }, + { + "epoch": 0.7990359658880237, + "grad_norm": 7.9609375, + "learning_rate": 9.200964034111977e-06, + "loss": 2.7381, + "mean_token_accuracy": 0.4722558759389387, + "step": 4310 + }, + { + "epoch": 0.7992213570634038, + "grad_norm": 7.6171875, + "learning_rate": 9.200778642936598e-06, + "loss": 3.7411, + "mean_token_accuracy": 0.3593866866118175, + "step": 4311 + }, + { + "epoch": 0.7994067482387839, + "grad_norm": 5.4609375, + "learning_rate": 9.200593251761216e-06, + "loss": 3.0992, + "mean_token_accuracy": 0.409438202247191, + "step": 4312 + }, + { + "epoch": 0.7995921394141638, + "grad_norm": 8.8203125, + "learning_rate": 9.200407860585837e-06, + "loss": 2.8268, + "mean_token_accuracy": 0.463680387409201, + "step": 4313 + }, + { + "epoch": 0.7997775305895439, + "grad_norm": 7.953125, + "learning_rate": 9.200222469410457e-06, + "loss": 2.8848, + "mean_token_accuracy": 0.4409634993350081, + "step": 4314 + }, + { + "epoch": 0.799962921764924, + "grad_norm": 6.1796875, + "learning_rate": 9.200037078235076e-06, + "loss": 2.6711, + "mean_token_accuracy": 0.4618150438659571, + "step": 4315 + }, + { + "epoch": 0.8001483129403041, + "grad_norm": 9.515625, + "learning_rate": 9.199851687059697e-06, + "loss": 2.9536, + "mean_token_accuracy": 0.4299923488905891, + "step": 4316 + }, + { + "epoch": 0.800333704115684, + "grad_norm": 8.3671875, + "learning_rate": 9.199666295884316e-06, + "loss": 2.9296, + "mean_token_accuracy": 0.4384624567267518, + "step": 4317 + }, + { + "epoch": 0.8005190952910641, + "grad_norm": 6.2578125, + "learning_rate": 9.199480904708936e-06, + "loss": 2.8893, + "mean_token_accuracy": 0.4388138138138138, + "step": 4318 + }, + { + "epoch": 0.8007044864664442, + "grad_norm": 5.95703125, + "learning_rate": 9.199295513533557e-06, + "loss": 2.6951, + "mean_token_accuracy": 0.4737760059798181, + "step": 4319 + }, + { + "epoch": 0.8008898776418243, + "grad_norm": 6.7109375, + "learning_rate": 9.199110122358177e-06, + "loss": 2.9045, + "mean_token_accuracy": 0.442278686622578, + "step": 4320 + }, + { + "epoch": 0.8010752688172043, + "grad_norm": 5.79296875, + "learning_rate": 9.198924731182798e-06, + "loss": 2.9112, + "mean_token_accuracy": 0.4473835537665325, + "step": 4321 + }, + { + "epoch": 0.8012606599925843, + "grad_norm": 7.9375, + "learning_rate": 9.198739340007416e-06, + "loss": 2.5082, + "mean_token_accuracy": 0.46463897131552917, + "step": 4322 + }, + { + "epoch": 0.8014460511679644, + "grad_norm": 6.73046875, + "learning_rate": 9.198553948832037e-06, + "loss": 2.7017, + "mean_token_accuracy": 0.4599382786604183, + "step": 4323 + }, + { + "epoch": 0.8016314423433445, + "grad_norm": 8.484375, + "learning_rate": 9.198368557656656e-06, + "loss": 2.9793, + "mean_token_accuracy": 0.41475295755045233, + "step": 4324 + }, + { + "epoch": 0.8018168335187245, + "grad_norm": 5.00390625, + "learning_rate": 9.198183166481276e-06, + "loss": 2.8176, + "mean_token_accuracy": 0.4557622504537205, + "step": 4325 + }, + { + "epoch": 0.8020022246941045, + "grad_norm": 6.57421875, + "learning_rate": 9.197997775305897e-06, + "loss": 2.7547, + "mean_token_accuracy": 0.44747225647348954, + "step": 4326 + }, + { + "epoch": 0.8021876158694846, + "grad_norm": 5.34375, + "learning_rate": 9.197812384130517e-06, + "loss": 2.8768, + "mean_token_accuracy": 0.4258984258984259, + "step": 4327 + }, + { + "epoch": 0.8023730070448647, + "grad_norm": 6.4765625, + "learning_rate": 9.197626992955136e-06, + "loss": 2.8811, + "mean_token_accuracy": 0.46300512236767216, + "step": 4328 + }, + { + "epoch": 0.8025583982202447, + "grad_norm": 6.2734375, + "learning_rate": 9.197441601779756e-06, + "loss": 2.2912, + "mean_token_accuracy": 0.5259444109801222, + "step": 4329 + }, + { + "epoch": 0.8027437893956247, + "grad_norm": 6.32421875, + "learning_rate": 9.197256210604377e-06, + "loss": 2.6276, + "mean_token_accuracy": 0.48133225893096965, + "step": 4330 + }, + { + "epoch": 0.8029291805710048, + "grad_norm": 5.3046875, + "learning_rate": 9.197070819428996e-06, + "loss": 3.2074, + "mean_token_accuracy": 0.4130811078140455, + "step": 4331 + }, + { + "epoch": 0.8031145717463849, + "grad_norm": 7.03125, + "learning_rate": 9.196885428253616e-06, + "loss": 2.9675, + "mean_token_accuracy": 0.4250409612233752, + "step": 4332 + }, + { + "epoch": 0.803299962921765, + "grad_norm": 9.2890625, + "learning_rate": 9.196700037078235e-06, + "loss": 2.8195, + "mean_token_accuracy": 0.44739944975763135, + "step": 4333 + }, + { + "epoch": 0.8034853540971449, + "grad_norm": 7.71484375, + "learning_rate": 9.196514645902855e-06, + "loss": 3.6465, + "mean_token_accuracy": 0.3808119117853631, + "step": 4334 + }, + { + "epoch": 0.803670745272525, + "grad_norm": 6.828125, + "learning_rate": 9.196329254727476e-06, + "loss": 2.5206, + "mean_token_accuracy": 0.47836326170150134, + "step": 4335 + }, + { + "epoch": 0.8038561364479051, + "grad_norm": 7.046875, + "learning_rate": 9.196143863552096e-06, + "loss": 2.4567, + "mean_token_accuracy": 0.48352090032154343, + "step": 4336 + }, + { + "epoch": 0.8040415276232852, + "grad_norm": 6.1171875, + "learning_rate": 9.195958472376715e-06, + "loss": 2.7378, + "mean_token_accuracy": 0.43367760226812474, + "step": 4337 + }, + { + "epoch": 0.8042269187986651, + "grad_norm": 6.66015625, + "learning_rate": 9.195773081201336e-06, + "loss": 2.8549, + "mean_token_accuracy": 0.43636363636363634, + "step": 4338 + }, + { + "epoch": 0.8044123099740452, + "grad_norm": 7.421875, + "learning_rate": 9.195587690025956e-06, + "loss": 3.0817, + "mean_token_accuracy": 0.4194732641660016, + "step": 4339 + }, + { + "epoch": 0.8045977011494253, + "grad_norm": 7.19921875, + "learning_rate": 9.195402298850575e-06, + "loss": 2.5639, + "mean_token_accuracy": 0.492090395480226, + "step": 4340 + }, + { + "epoch": 0.8047830923248054, + "grad_norm": 5.65234375, + "learning_rate": 9.195216907675195e-06, + "loss": 2.8191, + "mean_token_accuracy": 0.45700613129953643, + "step": 4341 + }, + { + "epoch": 0.8049684835001854, + "grad_norm": 8.09375, + "learning_rate": 9.195031516499814e-06, + "loss": 2.5304, + "mean_token_accuracy": 0.47136815636072693, + "step": 4342 + }, + { + "epoch": 0.8051538746755654, + "grad_norm": 6.2421875, + "learning_rate": 9.194846125324435e-06, + "loss": 3.0323, + "mean_token_accuracy": 0.43833881995441887, + "step": 4343 + }, + { + "epoch": 0.8053392658509455, + "grad_norm": 6.484375, + "learning_rate": 9.194660734149055e-06, + "loss": 2.9594, + "mean_token_accuracy": 0.43760642179518366, + "step": 4344 + }, + { + "epoch": 0.8055246570263256, + "grad_norm": 11.984375, + "learning_rate": 9.194475342973676e-06, + "loss": 3.015, + "mean_token_accuracy": 0.4372923588039867, + "step": 4345 + }, + { + "epoch": 0.8057100482017056, + "grad_norm": 7.43359375, + "learning_rate": 9.194289951798295e-06, + "loss": 3.3535, + "mean_token_accuracy": 0.38111430813497255, + "step": 4346 + }, + { + "epoch": 0.8058954393770856, + "grad_norm": 6.1796875, + "learning_rate": 9.194104560622915e-06, + "loss": 2.9831, + "mean_token_accuracy": 0.4292652552926525, + "step": 4347 + }, + { + "epoch": 0.8060808305524657, + "grad_norm": 7.0078125, + "learning_rate": 9.193919169447536e-06, + "loss": 2.1176, + "mean_token_accuracy": 0.5565469293163383, + "step": 4348 + }, + { + "epoch": 0.8062662217278458, + "grad_norm": 6.53515625, + "learning_rate": 9.193733778272154e-06, + "loss": 2.8928, + "mean_token_accuracy": 0.42583960798374565, + "step": 4349 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 9.703125, + "learning_rate": 9.193548387096775e-06, + "loss": 2.4723, + "mean_token_accuracy": 0.5023058252427185, + "step": 4350 + }, + { + "epoch": 0.8066370040786058, + "grad_norm": 9.09375, + "learning_rate": 9.193362995921394e-06, + "loss": 3.0332, + "mean_token_accuracy": 0.4210726995424504, + "step": 4351 + }, + { + "epoch": 0.8068223952539859, + "grad_norm": 5.953125, + "learning_rate": 9.193177604746016e-06, + "loss": 2.6328, + "mean_token_accuracy": 0.47431526977783, + "step": 4352 + }, + { + "epoch": 0.807007786429366, + "grad_norm": 8.0, + "learning_rate": 9.192992213570635e-06, + "loss": 2.8475, + "mean_token_accuracy": 0.45464494163424124, + "step": 4353 + }, + { + "epoch": 0.807193177604746, + "grad_norm": 5.60546875, + "learning_rate": 9.192806822395255e-06, + "loss": 2.7887, + "mean_token_accuracy": 0.45560481317289425, + "step": 4354 + }, + { + "epoch": 0.807378568780126, + "grad_norm": 6.41796875, + "learning_rate": 9.192621431219874e-06, + "loss": 2.923, + "mean_token_accuracy": 0.42575241340147646, + "step": 4355 + }, + { + "epoch": 0.8075639599555061, + "grad_norm": 6.3828125, + "learning_rate": 9.192436040044494e-06, + "loss": 3.0856, + "mean_token_accuracy": 0.4237333691708104, + "step": 4356 + }, + { + "epoch": 0.8077493511308862, + "grad_norm": 5.92578125, + "learning_rate": 9.192250648869115e-06, + "loss": 3.1427, + "mean_token_accuracy": 0.394333936106088, + "step": 4357 + }, + { + "epoch": 0.8079347423062663, + "grad_norm": 5.5625, + "learning_rate": 9.192065257693734e-06, + "loss": 2.9239, + "mean_token_accuracy": 0.43770192442758815, + "step": 4358 + }, + { + "epoch": 0.8081201334816462, + "grad_norm": 8.9765625, + "learning_rate": 9.191879866518354e-06, + "loss": 2.1578, + "mean_token_accuracy": 0.527965889139704, + "step": 4359 + }, + { + "epoch": 0.8083055246570263, + "grad_norm": 5.86328125, + "learning_rate": 9.191694475342975e-06, + "loss": 3.2135, + "mean_token_accuracy": 0.41491976290299265, + "step": 4360 + }, + { + "epoch": 0.8084909158324064, + "grad_norm": 6.76171875, + "learning_rate": 9.191509084167595e-06, + "loss": 3.4393, + "mean_token_accuracy": 0.37162805458584575, + "step": 4361 + }, + { + "epoch": 0.8086763070077865, + "grad_norm": 9.578125, + "learning_rate": 9.191323692992214e-06, + "loss": 2.3494, + "mean_token_accuracy": 0.49133102160576153, + "step": 4362 + }, + { + "epoch": 0.8088616981831664, + "grad_norm": 5.51171875, + "learning_rate": 9.191138301816834e-06, + "loss": 2.8712, + "mean_token_accuracy": 0.4543939393939394, + "step": 4363 + }, + { + "epoch": 0.8090470893585465, + "grad_norm": 5.99609375, + "learning_rate": 9.190952910641455e-06, + "loss": 2.7282, + "mean_token_accuracy": 0.4700222057735011, + "step": 4364 + }, + { + "epoch": 0.8092324805339266, + "grad_norm": 7.73046875, + "learning_rate": 9.190767519466074e-06, + "loss": 2.9636, + "mean_token_accuracy": 0.41929848138691034, + "step": 4365 + }, + { + "epoch": 0.8094178717093067, + "grad_norm": 7.4609375, + "learning_rate": 9.190582128290694e-06, + "loss": 2.5495, + "mean_token_accuracy": 0.4725782727463753, + "step": 4366 + }, + { + "epoch": 0.8096032628846866, + "grad_norm": 5.02734375, + "learning_rate": 9.190396737115313e-06, + "loss": 2.867, + "mean_token_accuracy": 0.43786839889329965, + "step": 4367 + }, + { + "epoch": 0.8097886540600667, + "grad_norm": 8.8046875, + "learning_rate": 9.190211345939935e-06, + "loss": 2.7449, + "mean_token_accuracy": 0.4451419213973799, + "step": 4368 + }, + { + "epoch": 0.8099740452354468, + "grad_norm": 6.109375, + "learning_rate": 9.190025954764554e-06, + "loss": 2.7499, + "mean_token_accuracy": 0.4649339334978946, + "step": 4369 + }, + { + "epoch": 0.8101594364108269, + "grad_norm": 5.03125, + "learning_rate": 9.189840563589174e-06, + "loss": 2.9489, + "mean_token_accuracy": 0.4270986745213549, + "step": 4370 + }, + { + "epoch": 0.8103448275862069, + "grad_norm": 11.8203125, + "learning_rate": 9.189655172413793e-06, + "loss": 2.5224, + "mean_token_accuracy": 0.46034543531899896, + "step": 4371 + }, + { + "epoch": 0.8105302187615869, + "grad_norm": 5.9140625, + "learning_rate": 9.189469781238414e-06, + "loss": 2.3851, + "mean_token_accuracy": 0.5172600514417219, + "step": 4372 + }, + { + "epoch": 0.810715609936967, + "grad_norm": 8.3359375, + "learning_rate": 9.189284390063034e-06, + "loss": 2.6574, + "mean_token_accuracy": 0.45333491855902985, + "step": 4373 + }, + { + "epoch": 0.8109010011123471, + "grad_norm": 6.796875, + "learning_rate": 9.189098998887653e-06, + "loss": 2.722, + "mean_token_accuracy": 0.43742113112309283, + "step": 4374 + }, + { + "epoch": 0.811086392287727, + "grad_norm": 5.18359375, + "learning_rate": 9.188913607712274e-06, + "loss": 3.0958, + "mean_token_accuracy": 0.41936231884057973, + "step": 4375 + }, + { + "epoch": 0.8112717834631071, + "grad_norm": 5.671875, + "learning_rate": 9.188728216536894e-06, + "loss": 2.6637, + "mean_token_accuracy": 0.4841623360554318, + "step": 4376 + }, + { + "epoch": 0.8114571746384872, + "grad_norm": 9.1328125, + "learning_rate": 9.188542825361515e-06, + "loss": 2.7581, + "mean_token_accuracy": 0.46237996839674245, + "step": 4377 + }, + { + "epoch": 0.8116425658138673, + "grad_norm": 8.4453125, + "learning_rate": 9.188357434186133e-06, + "loss": 2.4451, + "mean_token_accuracy": 0.4785202863961814, + "step": 4378 + }, + { + "epoch": 0.8118279569892473, + "grad_norm": 4.91015625, + "learning_rate": 9.188172043010754e-06, + "loss": 3.722, + "mean_token_accuracy": 0.3836290784201488, + "step": 4379 + }, + { + "epoch": 0.8120133481646273, + "grad_norm": 10.6015625, + "learning_rate": 9.187986651835373e-06, + "loss": 2.4961, + "mean_token_accuracy": 0.46847190439867786, + "step": 4380 + }, + { + "epoch": 0.8121987393400074, + "grad_norm": 7.9609375, + "learning_rate": 9.187801260659993e-06, + "loss": 2.7773, + "mean_token_accuracy": 0.45524908528004504, + "step": 4381 + }, + { + "epoch": 0.8123841305153875, + "grad_norm": 7.86328125, + "learning_rate": 9.187615869484614e-06, + "loss": 2.825, + "mean_token_accuracy": 0.45626389918458116, + "step": 4382 + }, + { + "epoch": 0.8125695216907676, + "grad_norm": 11.4453125, + "learning_rate": 9.187430478309232e-06, + "loss": 2.8006, + "mean_token_accuracy": 0.44647184604419105, + "step": 4383 + }, + { + "epoch": 0.8127549128661475, + "grad_norm": 8.8671875, + "learning_rate": 9.187245087133853e-06, + "loss": 2.1886, + "mean_token_accuracy": 0.5300118114463653, + "step": 4384 + }, + { + "epoch": 0.8129403040415276, + "grad_norm": 6.63671875, + "learning_rate": 9.187059695958473e-06, + "loss": 2.8098, + "mean_token_accuracy": 0.46061902365374935, + "step": 4385 + }, + { + "epoch": 0.8131256952169077, + "grad_norm": 6.078125, + "learning_rate": 9.186874304783094e-06, + "loss": 2.8019, + "mean_token_accuracy": 0.45435909803448693, + "step": 4386 + }, + { + "epoch": 0.8133110863922878, + "grad_norm": 9.0078125, + "learning_rate": 9.186688913607713e-06, + "loss": 3.0977, + "mean_token_accuracy": 0.42460186710598574, + "step": 4387 + }, + { + "epoch": 0.8134964775676677, + "grad_norm": 7.4921875, + "learning_rate": 9.186503522432333e-06, + "loss": 2.7664, + "mean_token_accuracy": 0.45870144439770016, + "step": 4388 + }, + { + "epoch": 0.8136818687430478, + "grad_norm": 5.77734375, + "learning_rate": 9.186318131256952e-06, + "loss": 2.8988, + "mean_token_accuracy": 0.4316022799240025, + "step": 4389 + }, + { + "epoch": 0.8138672599184279, + "grad_norm": 8.21875, + "learning_rate": 9.186132740081572e-06, + "loss": 2.9953, + "mean_token_accuracy": 0.4364118092354277, + "step": 4390 + }, + { + "epoch": 0.814052651093808, + "grad_norm": 10.1328125, + "learning_rate": 9.185947348906193e-06, + "loss": 2.5499, + "mean_token_accuracy": 0.45371953826421546, + "step": 4391 + }, + { + "epoch": 0.814238042269188, + "grad_norm": 5.546875, + "learning_rate": 9.185761957730813e-06, + "loss": 2.7855, + "mean_token_accuracy": 0.44581519109820994, + "step": 4392 + }, + { + "epoch": 0.814423433444568, + "grad_norm": 5.88671875, + "learning_rate": 9.185576566555432e-06, + "loss": 2.57, + "mean_token_accuracy": 0.4981707954426675, + "step": 4393 + }, + { + "epoch": 0.8146088246199481, + "grad_norm": 8.1171875, + "learning_rate": 9.185391175380053e-06, + "loss": 2.9629, + "mean_token_accuracy": 0.42522106881968474, + "step": 4394 + }, + { + "epoch": 0.8147942157953282, + "grad_norm": 8.7421875, + "learning_rate": 9.185205784204673e-06, + "loss": 2.6271, + "mean_token_accuracy": 0.46552150271873455, + "step": 4395 + }, + { + "epoch": 0.8149796069707081, + "grad_norm": 6.4375, + "learning_rate": 9.185020393029292e-06, + "loss": 2.3936, + "mean_token_accuracy": 0.4743816254416961, + "step": 4396 + }, + { + "epoch": 0.8151649981460882, + "grad_norm": 11.171875, + "learning_rate": 9.184835001853912e-06, + "loss": 2.6163, + "mean_token_accuracy": 0.4695697796432319, + "step": 4397 + }, + { + "epoch": 0.8153503893214683, + "grad_norm": 8.25, + "learning_rate": 9.184649610678531e-06, + "loss": 2.9028, + "mean_token_accuracy": 0.44392655367231637, + "step": 4398 + }, + { + "epoch": 0.8155357804968484, + "grad_norm": 9.265625, + "learning_rate": 9.184464219503152e-06, + "loss": 2.586, + "mean_token_accuracy": 0.46875800256081945, + "step": 4399 + }, + { + "epoch": 0.8157211716722284, + "grad_norm": 6.31640625, + "learning_rate": 9.184278828327772e-06, + "loss": 2.8025, + "mean_token_accuracy": 0.46569129480614485, + "step": 4400 + }, + { + "epoch": 0.8159065628476084, + "grad_norm": 5.33203125, + "learning_rate": 9.184093437152393e-06, + "loss": 2.9963, + "mean_token_accuracy": 0.4361865709892363, + "step": 4401 + }, + { + "epoch": 0.8160919540229885, + "grad_norm": 8.21875, + "learning_rate": 9.183908045977013e-06, + "loss": 2.8542, + "mean_token_accuracy": 0.4517895809451025, + "step": 4402 + }, + { + "epoch": 0.8162773451983686, + "grad_norm": 5.1328125, + "learning_rate": 9.183722654801632e-06, + "loss": 2.761, + "mean_token_accuracy": 0.47304810248972684, + "step": 4403 + }, + { + "epoch": 0.8164627363737486, + "grad_norm": 5.21484375, + "learning_rate": 9.183537263626253e-06, + "loss": 3.0951, + "mean_token_accuracy": 0.4181457262961233, + "step": 4404 + }, + { + "epoch": 0.8166481275491286, + "grad_norm": 7.67578125, + "learning_rate": 9.183351872450871e-06, + "loss": 2.9994, + "mean_token_accuracy": 0.43290482634190347, + "step": 4405 + }, + { + "epoch": 0.8168335187245087, + "grad_norm": 8.84375, + "learning_rate": 9.183166481275492e-06, + "loss": 2.68, + "mean_token_accuracy": 0.47159940209267565, + "step": 4406 + }, + { + "epoch": 0.8170189098998888, + "grad_norm": 7.48046875, + "learning_rate": 9.182981090100112e-06, + "loss": 2.6111, + "mean_token_accuracy": 0.4858509366281387, + "step": 4407 + }, + { + "epoch": 0.8172043010752689, + "grad_norm": 6.8125, + "learning_rate": 9.182795698924733e-06, + "loss": 2.5157, + "mean_token_accuracy": 0.4702430846605197, + "step": 4408 + }, + { + "epoch": 0.8173896922506488, + "grad_norm": 7.22265625, + "learning_rate": 9.182610307749352e-06, + "loss": 3.9058, + "mean_token_accuracy": 0.36253776435045315, + "step": 4409 + }, + { + "epoch": 0.8175750834260289, + "grad_norm": 7.32421875, + "learning_rate": 9.182424916573972e-06, + "loss": 2.9919, + "mean_token_accuracy": 0.4170714781401804, + "step": 4410 + }, + { + "epoch": 0.817760474601409, + "grad_norm": 7.578125, + "learning_rate": 9.182239525398593e-06, + "loss": 2.5637, + "mean_token_accuracy": 0.4683528836754643, + "step": 4411 + }, + { + "epoch": 0.8179458657767891, + "grad_norm": 6.359375, + "learning_rate": 9.182054134223211e-06, + "loss": 2.3226, + "mean_token_accuracy": 0.49394166043380705, + "step": 4412 + }, + { + "epoch": 0.818131256952169, + "grad_norm": 9.515625, + "learning_rate": 9.181868743047832e-06, + "loss": 3.0967, + "mean_token_accuracy": 0.42943155657871435, + "step": 4413 + }, + { + "epoch": 0.8183166481275491, + "grad_norm": 10.28125, + "learning_rate": 9.18168335187245e-06, + "loss": 2.5182, + "mean_token_accuracy": 0.49058516801853996, + "step": 4414 + }, + { + "epoch": 0.8185020393029292, + "grad_norm": 7.3671875, + "learning_rate": 9.181497960697071e-06, + "loss": 3.2666, + "mean_token_accuracy": 0.4092020129403307, + "step": 4415 + }, + { + "epoch": 0.8186874304783093, + "grad_norm": 7.53125, + "learning_rate": 9.181312569521692e-06, + "loss": 2.5404, + "mean_token_accuracy": 0.4786148081147435, + "step": 4416 + }, + { + "epoch": 0.8188728216536892, + "grad_norm": 7.44140625, + "learning_rate": 9.181127178346312e-06, + "loss": 2.816, + "mean_token_accuracy": 0.4342594889605557, + "step": 4417 + }, + { + "epoch": 0.8190582128290693, + "grad_norm": 9.546875, + "learning_rate": 9.180941787170931e-06, + "loss": 2.8622, + "mean_token_accuracy": 0.43490304709141275, + "step": 4418 + }, + { + "epoch": 0.8192436040044494, + "grad_norm": 5.7421875, + "learning_rate": 9.180756395995551e-06, + "loss": 3.0761, + "mean_token_accuracy": 0.43468502169684275, + "step": 4419 + }, + { + "epoch": 0.8194289951798295, + "grad_norm": 5.40234375, + "learning_rate": 9.180571004820172e-06, + "loss": 2.7504, + "mean_token_accuracy": 0.4477154247163447, + "step": 4420 + }, + { + "epoch": 0.8196143863552094, + "grad_norm": 7.3203125, + "learning_rate": 9.18038561364479e-06, + "loss": 2.4056, + "mean_token_accuracy": 0.48785185185185187, + "step": 4421 + }, + { + "epoch": 0.8197997775305895, + "grad_norm": 8.3359375, + "learning_rate": 9.180200222469411e-06, + "loss": 3.0176, + "mean_token_accuracy": 0.42487629329734594, + "step": 4422 + }, + { + "epoch": 0.8199851687059696, + "grad_norm": 6.42578125, + "learning_rate": 9.18001483129403e-06, + "loss": 2.2026, + "mean_token_accuracy": 0.5066332916145182, + "step": 4423 + }, + { + "epoch": 0.8201705598813497, + "grad_norm": 6.1875, + "learning_rate": 9.179829440118652e-06, + "loss": 2.4433, + "mean_token_accuracy": 0.4946714031971581, + "step": 4424 + }, + { + "epoch": 0.8203559510567296, + "grad_norm": 6.58203125, + "learning_rate": 9.179644048943271e-06, + "loss": 3.5816, + "mean_token_accuracy": 0.39780658025922233, + "step": 4425 + }, + { + "epoch": 0.8205413422321097, + "grad_norm": 5.0703125, + "learning_rate": 9.179458657767891e-06, + "loss": 2.8443, + "mean_token_accuracy": 0.45802161263507896, + "step": 4426 + }, + { + "epoch": 0.8207267334074898, + "grad_norm": 7.21875, + "learning_rate": 9.17927326659251e-06, + "loss": 2.6387, + "mean_token_accuracy": 0.4645669291338583, + "step": 4427 + }, + { + "epoch": 0.8209121245828699, + "grad_norm": 6.21484375, + "learning_rate": 9.17908787541713e-06, + "loss": 2.6687, + "mean_token_accuracy": 0.4563298843578819, + "step": 4428 + }, + { + "epoch": 0.8210975157582499, + "grad_norm": 4.69921875, + "learning_rate": 9.178902484241751e-06, + "loss": 3.1248, + "mean_token_accuracy": 0.4328288707799767, + "step": 4429 + }, + { + "epoch": 0.8212829069336299, + "grad_norm": 12.9921875, + "learning_rate": 9.17871709306637e-06, + "loss": 2.8007, + "mean_token_accuracy": 0.4327208061647896, + "step": 4430 + }, + { + "epoch": 0.82146829810901, + "grad_norm": 7.8984375, + "learning_rate": 9.17853170189099e-06, + "loss": 2.497, + "mean_token_accuracy": 0.4802651401024405, + "step": 4431 + }, + { + "epoch": 0.8216536892843901, + "grad_norm": 6.5390625, + "learning_rate": 9.178346310715611e-06, + "loss": 2.8033, + "mean_token_accuracy": 0.4549240897487572, + "step": 4432 + }, + { + "epoch": 0.8218390804597702, + "grad_norm": 6.30859375, + "learning_rate": 9.178160919540232e-06, + "loss": 3.034, + "mean_token_accuracy": 0.43412010755900804, + "step": 4433 + }, + { + "epoch": 0.8220244716351501, + "grad_norm": 6.6171875, + "learning_rate": 9.17797552836485e-06, + "loss": 3.1262, + "mean_token_accuracy": 0.429, + "step": 4434 + }, + { + "epoch": 0.8222098628105302, + "grad_norm": 5.41796875, + "learning_rate": 9.17779013718947e-06, + "loss": 2.863, + "mean_token_accuracy": 0.43613707165109034, + "step": 4435 + }, + { + "epoch": 0.8223952539859103, + "grad_norm": 7.05078125, + "learning_rate": 9.17760474601409e-06, + "loss": 3.2017, + "mean_token_accuracy": 0.3989723189126471, + "step": 4436 + }, + { + "epoch": 0.8225806451612904, + "grad_norm": 6.02734375, + "learning_rate": 9.17741935483871e-06, + "loss": 2.1676, + "mean_token_accuracy": 0.5301844235106806, + "step": 4437 + }, + { + "epoch": 0.8227660363366703, + "grad_norm": 5.06640625, + "learning_rate": 9.17723396366333e-06, + "loss": 2.6664, + "mean_token_accuracy": 0.45574341123818995, + "step": 4438 + }, + { + "epoch": 0.8229514275120504, + "grad_norm": 6.3125, + "learning_rate": 9.17704857248795e-06, + "loss": 3.1937, + "mean_token_accuracy": 0.40545004128819156, + "step": 4439 + }, + { + "epoch": 0.8231368186874305, + "grad_norm": 6.31640625, + "learning_rate": 9.176863181312572e-06, + "loss": 3.2905, + "mean_token_accuracy": 0.40193732193732196, + "step": 4440 + }, + { + "epoch": 0.8233222098628106, + "grad_norm": 5.796875, + "learning_rate": 9.17667779013719e-06, + "loss": 2.8553, + "mean_token_accuracy": 0.43833652007648183, + "step": 4441 + }, + { + "epoch": 0.8235076010381905, + "grad_norm": 7.5234375, + "learning_rate": 9.176492398961811e-06, + "loss": 2.7666, + "mean_token_accuracy": 0.44815032295948326, + "step": 4442 + }, + { + "epoch": 0.8236929922135706, + "grad_norm": 5.4765625, + "learning_rate": 9.17630700778643e-06, + "loss": 2.8879, + "mean_token_accuracy": 0.4640972136982939, + "step": 4443 + }, + { + "epoch": 0.8238783833889507, + "grad_norm": 5.08203125, + "learning_rate": 9.17612161661105e-06, + "loss": 2.7549, + "mean_token_accuracy": 0.4592476489028213, + "step": 4444 + }, + { + "epoch": 0.8240637745643308, + "grad_norm": 6.1796875, + "learning_rate": 9.17593622543567e-06, + "loss": 2.9661, + "mean_token_accuracy": 0.4505977067577458, + "step": 4445 + }, + { + "epoch": 0.8242491657397107, + "grad_norm": 6.21875, + "learning_rate": 9.17575083426029e-06, + "loss": 2.681, + "mean_token_accuracy": 0.45746164574616455, + "step": 4446 + }, + { + "epoch": 0.8244345569150908, + "grad_norm": 6.2421875, + "learning_rate": 9.17556544308491e-06, + "loss": 2.966, + "mean_token_accuracy": 0.42812330989724173, + "step": 4447 + }, + { + "epoch": 0.8246199480904709, + "grad_norm": 7.2734375, + "learning_rate": 9.17538005190953e-06, + "loss": 3.4448, + "mean_token_accuracy": 0.4073864280049847, + "step": 4448 + }, + { + "epoch": 0.824805339265851, + "grad_norm": 5.31640625, + "learning_rate": 9.175194660734151e-06, + "loss": 3.4096, + "mean_token_accuracy": 0.39215435727063636, + "step": 4449 + }, + { + "epoch": 0.824990730441231, + "grad_norm": 6.15625, + "learning_rate": 9.17500926955877e-06, + "loss": 3.16, + "mean_token_accuracy": 0.41272123893805307, + "step": 4450 + }, + { + "epoch": 0.825176121616611, + "grad_norm": 6.6328125, + "learning_rate": 9.17482387838339e-06, + "loss": 2.8029, + "mean_token_accuracy": 0.43438287153652394, + "step": 4451 + }, + { + "epoch": 0.8253615127919911, + "grad_norm": 9.484375, + "learning_rate": 9.174638487208009e-06, + "loss": 1.838, + "mean_token_accuracy": 0.5679012345679012, + "step": 4452 + }, + { + "epoch": 0.8255469039673712, + "grad_norm": 6.46484375, + "learning_rate": 9.17445309603263e-06, + "loss": 2.9247, + "mean_token_accuracy": 0.4426871516794698, + "step": 4453 + }, + { + "epoch": 0.8257322951427513, + "grad_norm": 6.0625, + "learning_rate": 9.17426770485725e-06, + "loss": 3.4541, + "mean_token_accuracy": 0.4095221958658082, + "step": 4454 + }, + { + "epoch": 0.8259176863181312, + "grad_norm": 5.27734375, + "learning_rate": 9.174082313681869e-06, + "loss": 2.9604, + "mean_token_accuracy": 0.4255979314802844, + "step": 4455 + }, + { + "epoch": 0.8261030774935113, + "grad_norm": 7.05859375, + "learning_rate": 9.17389692250649e-06, + "loss": 2.9849, + "mean_token_accuracy": 0.4316702819956616, + "step": 4456 + }, + { + "epoch": 0.8262884686688914, + "grad_norm": 6.07421875, + "learning_rate": 9.17371153133111e-06, + "loss": 3.1013, + "mean_token_accuracy": 0.41817192600652886, + "step": 4457 + }, + { + "epoch": 0.8264738598442715, + "grad_norm": 5.66796875, + "learning_rate": 9.17352614015573e-06, + "loss": 2.755, + "mean_token_accuracy": 0.46497797356828197, + "step": 4458 + }, + { + "epoch": 0.8266592510196514, + "grad_norm": 6.0546875, + "learning_rate": 9.173340748980349e-06, + "loss": 2.4322, + "mean_token_accuracy": 0.48446383710167923, + "step": 4459 + }, + { + "epoch": 0.8268446421950315, + "grad_norm": 6.18359375, + "learning_rate": 9.17315535780497e-06, + "loss": 2.5635, + "mean_token_accuracy": 0.48299968223705114, + "step": 4460 + }, + { + "epoch": 0.8270300333704116, + "grad_norm": 7.15625, + "learning_rate": 9.172969966629588e-06, + "loss": 3.3559, + "mean_token_accuracy": 0.4238287309959665, + "step": 4461 + }, + { + "epoch": 0.8272154245457917, + "grad_norm": 6.93359375, + "learning_rate": 9.172784575454209e-06, + "loss": 2.8607, + "mean_token_accuracy": 0.4582779991146525, + "step": 4462 + }, + { + "epoch": 0.8274008157211716, + "grad_norm": 8.609375, + "learning_rate": 9.17259918427883e-06, + "loss": 2.6722, + "mean_token_accuracy": 0.44519621109607577, + "step": 4463 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 6.734375, + "learning_rate": 9.172413793103448e-06, + "loss": 2.7328, + "mean_token_accuracy": 0.47839933536416507, + "step": 4464 + }, + { + "epoch": 0.8277715980719318, + "grad_norm": 9.1796875, + "learning_rate": 9.172228401928069e-06, + "loss": 3.0857, + "mean_token_accuracy": 0.40721142013290673, + "step": 4465 + }, + { + "epoch": 0.8279569892473119, + "grad_norm": 6.1015625, + "learning_rate": 9.172043010752689e-06, + "loss": 3.448, + "mean_token_accuracy": 0.3917889857631608, + "step": 4466 + }, + { + "epoch": 0.8281423804226918, + "grad_norm": 6.04296875, + "learning_rate": 9.17185761957731e-06, + "loss": 3.0551, + "mean_token_accuracy": 0.4189074740630955, + "step": 4467 + }, + { + "epoch": 0.8283277715980719, + "grad_norm": 6.32421875, + "learning_rate": 9.171672228401928e-06, + "loss": 3.0626, + "mean_token_accuracy": 0.43795171459326465, + "step": 4468 + }, + { + "epoch": 0.828513162773452, + "grad_norm": 5.796875, + "learning_rate": 9.171486837226549e-06, + "loss": 2.9824, + "mean_token_accuracy": 0.44318351500671943, + "step": 4469 + }, + { + "epoch": 0.8286985539488321, + "grad_norm": 7.21484375, + "learning_rate": 9.171301446051168e-06, + "loss": 2.2097, + "mean_token_accuracy": 0.5083261379055137, + "step": 4470 + }, + { + "epoch": 0.828883945124212, + "grad_norm": 7.8046875, + "learning_rate": 9.171116054875788e-06, + "loss": 2.9279, + "mean_token_accuracy": 0.432826281477694, + "step": 4471 + }, + { + "epoch": 0.8290693362995921, + "grad_norm": 8.625, + "learning_rate": 9.170930663700409e-06, + "loss": 2.7359, + "mean_token_accuracy": 0.4522503998172264, + "step": 4472 + }, + { + "epoch": 0.8292547274749722, + "grad_norm": 7.01953125, + "learning_rate": 9.17074527252503e-06, + "loss": 2.6236, + "mean_token_accuracy": 0.4603505843071786, + "step": 4473 + }, + { + "epoch": 0.8294401186503523, + "grad_norm": 11.4765625, + "learning_rate": 9.170559881349648e-06, + "loss": 2.6719, + "mean_token_accuracy": 0.4640632122777693, + "step": 4474 + }, + { + "epoch": 0.8296255098257322, + "grad_norm": 6.19140625, + "learning_rate": 9.170374490174268e-06, + "loss": 3.0002, + "mean_token_accuracy": 0.42925474463817315, + "step": 4475 + }, + { + "epoch": 0.8298109010011123, + "grad_norm": 4.96484375, + "learning_rate": 9.170189098998889e-06, + "loss": 3.2094, + "mean_token_accuracy": 0.42144004282655245, + "step": 4476 + }, + { + "epoch": 0.8299962921764924, + "grad_norm": 5.7421875, + "learning_rate": 9.170003707823508e-06, + "loss": 2.2935, + "mean_token_accuracy": 0.5074814711229199, + "step": 4477 + }, + { + "epoch": 0.8301816833518725, + "grad_norm": 6.69140625, + "learning_rate": 9.169818316648128e-06, + "loss": 2.9473, + "mean_token_accuracy": 0.43269918466599916, + "step": 4478 + }, + { + "epoch": 0.8303670745272526, + "grad_norm": 6.27734375, + "learning_rate": 9.169632925472747e-06, + "loss": 2.7936, + "mean_token_accuracy": 0.4447274579724911, + "step": 4479 + }, + { + "epoch": 0.8305524657026325, + "grad_norm": 5.90625, + "learning_rate": 9.169447534297368e-06, + "loss": 2.9834, + "mean_token_accuracy": 0.4481435813125086, + "step": 4480 + }, + { + "epoch": 0.8307378568780126, + "grad_norm": 5.05078125, + "learning_rate": 9.169262143121988e-06, + "loss": 2.7337, + "mean_token_accuracy": 0.4577012563983248, + "step": 4481 + }, + { + "epoch": 0.8309232480533927, + "grad_norm": 4.71875, + "learning_rate": 9.169076751946609e-06, + "loss": 3.0813, + "mean_token_accuracy": 0.4241962305986696, + "step": 4482 + }, + { + "epoch": 0.8311086392287728, + "grad_norm": 7.10546875, + "learning_rate": 9.168891360771229e-06, + "loss": 2.8972, + "mean_token_accuracy": 0.4438296229571484, + "step": 4483 + }, + { + "epoch": 0.8312940304041527, + "grad_norm": 7.5625, + "learning_rate": 9.168705969595848e-06, + "loss": 2.7619, + "mean_token_accuracy": 0.4556809024979855, + "step": 4484 + }, + { + "epoch": 0.8314794215795328, + "grad_norm": 5.2734375, + "learning_rate": 9.168520578420468e-06, + "loss": 3.0797, + "mean_token_accuracy": 0.4078762306610408, + "step": 4485 + }, + { + "epoch": 0.8316648127549129, + "grad_norm": 6.66796875, + "learning_rate": 9.168335187245087e-06, + "loss": 3.0787, + "mean_token_accuracy": 0.41968911917098445, + "step": 4486 + }, + { + "epoch": 0.831850203930293, + "grad_norm": 5.67578125, + "learning_rate": 9.168149796069708e-06, + "loss": 2.9729, + "mean_token_accuracy": 0.43402545210984594, + "step": 4487 + }, + { + "epoch": 0.8320355951056729, + "grad_norm": 6.03515625, + "learning_rate": 9.167964404894328e-06, + "loss": 2.4158, + "mean_token_accuracy": 0.48830011142751023, + "step": 4488 + }, + { + "epoch": 0.832220986281053, + "grad_norm": 5.890625, + "learning_rate": 9.167779013718949e-06, + "loss": 2.8134, + "mean_token_accuracy": 0.43807242496829646, + "step": 4489 + }, + { + "epoch": 0.8324063774564331, + "grad_norm": 6.03515625, + "learning_rate": 9.167593622543567e-06, + "loss": 2.6587, + "mean_token_accuracy": 0.4577040990121494, + "step": 4490 + }, + { + "epoch": 0.8325917686318132, + "grad_norm": 6.01171875, + "learning_rate": 9.167408231368188e-06, + "loss": 3.3818, + "mean_token_accuracy": 0.3963035903650962, + "step": 4491 + }, + { + "epoch": 0.8327771598071931, + "grad_norm": 6.265625, + "learning_rate": 9.167222840192808e-06, + "loss": 3.0869, + "mean_token_accuracy": 0.4224632391281677, + "step": 4492 + }, + { + "epoch": 0.8329625509825732, + "grad_norm": 5.95703125, + "learning_rate": 9.167037449017427e-06, + "loss": 3.0958, + "mean_token_accuracy": 0.4208832238959701, + "step": 4493 + }, + { + "epoch": 0.8331479421579533, + "grad_norm": 5.125, + "learning_rate": 9.166852057842048e-06, + "loss": 3.4583, + "mean_token_accuracy": 0.39850357839947953, + "step": 4494 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 6.74609375, + "learning_rate": 9.166666666666666e-06, + "loss": 2.7788, + "mean_token_accuracy": 0.44272873934086193, + "step": 4495 + }, + { + "epoch": 0.8335187245087133, + "grad_norm": 9.3359375, + "learning_rate": 9.166481275491287e-06, + "loss": 3.1218, + "mean_token_accuracy": 0.44252463230044264, + "step": 4496 + }, + { + "epoch": 0.8337041156840934, + "grad_norm": 7.5625, + "learning_rate": 9.166295884315907e-06, + "loss": 2.8186, + "mean_token_accuracy": 0.44422015879424037, + "step": 4497 + }, + { + "epoch": 0.8338895068594735, + "grad_norm": 5.2421875, + "learning_rate": 9.166110493140528e-06, + "loss": 3.4462, + "mean_token_accuracy": 0.4011142061281337, + "step": 4498 + }, + { + "epoch": 0.8340748980348536, + "grad_norm": 10.8125, + "learning_rate": 9.165925101965147e-06, + "loss": 3.1669, + "mean_token_accuracy": 0.4164394234514998, + "step": 4499 + }, + { + "epoch": 0.8342602892102335, + "grad_norm": 9.0625, + "learning_rate": 9.165739710789767e-06, + "loss": 3.1177, + "mean_token_accuracy": 0.4205810828107001, + "step": 4500 + }, + { + "epoch": 0.8344456803856136, + "grad_norm": 6.87109375, + "learning_rate": 9.165554319614388e-06, + "loss": 2.9973, + "mean_token_accuracy": 0.4370713305898491, + "step": 4501 + }, + { + "epoch": 0.8346310715609937, + "grad_norm": 5.80078125, + "learning_rate": 9.165368928439006e-06, + "loss": 2.7574, + "mean_token_accuracy": 0.4560009487666034, + "step": 4502 + }, + { + "epoch": 0.8348164627363738, + "grad_norm": 5.7734375, + "learning_rate": 9.165183537263627e-06, + "loss": 2.842, + "mean_token_accuracy": 0.4471161657189277, + "step": 4503 + }, + { + "epoch": 0.8350018539117539, + "grad_norm": 6.53515625, + "learning_rate": 9.164998146088246e-06, + "loss": 2.5804, + "mean_token_accuracy": 0.4946051986267778, + "step": 4504 + }, + { + "epoch": 0.8351872450871338, + "grad_norm": 6.55859375, + "learning_rate": 9.164812754912868e-06, + "loss": 2.6598, + "mean_token_accuracy": 0.46208024147905924, + "step": 4505 + }, + { + "epoch": 0.8353726362625139, + "grad_norm": 5.85546875, + "learning_rate": 9.164627363737487e-06, + "loss": 3.0803, + "mean_token_accuracy": 0.4314106395696354, + "step": 4506 + }, + { + "epoch": 0.835558027437894, + "grad_norm": 8.0, + "learning_rate": 9.164441972562107e-06, + "loss": 2.7174, + "mean_token_accuracy": 0.4535132466677637, + "step": 4507 + }, + { + "epoch": 0.8357434186132741, + "grad_norm": 6.66796875, + "learning_rate": 9.164256581386726e-06, + "loss": 3.0228, + "mean_token_accuracy": 0.42738359201773835, + "step": 4508 + }, + { + "epoch": 0.835928809788654, + "grad_norm": 5.4921875, + "learning_rate": 9.164071190211347e-06, + "loss": 2.278, + "mean_token_accuracy": 0.5260058881256133, + "step": 4509 + }, + { + "epoch": 0.8361142009640341, + "grad_norm": 7.8828125, + "learning_rate": 9.163885799035967e-06, + "loss": 2.9531, + "mean_token_accuracy": 0.42908555537379356, + "step": 4510 + }, + { + "epoch": 0.8362995921394142, + "grad_norm": 5.7890625, + "learning_rate": 9.163700407860586e-06, + "loss": 3.0234, + "mean_token_accuracy": 0.4222619047619048, + "step": 4511 + }, + { + "epoch": 0.8364849833147943, + "grad_norm": 8.5234375, + "learning_rate": 9.163515016685206e-06, + "loss": 2.7892, + "mean_token_accuracy": 0.43832629139975465, + "step": 4512 + }, + { + "epoch": 0.8366703744901742, + "grad_norm": 6.73046875, + "learning_rate": 9.163329625509827e-06, + "loss": 3.4254, + "mean_token_accuracy": 0.3868283739952052, + "step": 4513 + }, + { + "epoch": 0.8368557656655543, + "grad_norm": 7.078125, + "learning_rate": 9.163144234334447e-06, + "loss": 2.6741, + "mean_token_accuracy": 0.44507827009816925, + "step": 4514 + }, + { + "epoch": 0.8370411568409344, + "grad_norm": 5.953125, + "learning_rate": 9.162958843159066e-06, + "loss": 2.9697, + "mean_token_accuracy": 0.43411778760004777, + "step": 4515 + }, + { + "epoch": 0.8372265480163145, + "grad_norm": 9.34375, + "learning_rate": 9.162773451983687e-06, + "loss": 2.7934, + "mean_token_accuracy": 0.43499617945639124, + "step": 4516 + }, + { + "epoch": 0.8374119391916944, + "grad_norm": 6.8359375, + "learning_rate": 9.162588060808305e-06, + "loss": 2.5805, + "mean_token_accuracy": 0.4850143017037682, + "step": 4517 + }, + { + "epoch": 0.8375973303670745, + "grad_norm": 6.0859375, + "learning_rate": 9.162402669632926e-06, + "loss": 2.9658, + "mean_token_accuracy": 0.4365394149727318, + "step": 4518 + }, + { + "epoch": 0.8377827215424546, + "grad_norm": 6.00390625, + "learning_rate": 9.162217278457546e-06, + "loss": 2.9585, + "mean_token_accuracy": 0.44438246122949904, + "step": 4519 + }, + { + "epoch": 0.8379681127178347, + "grad_norm": 6.390625, + "learning_rate": 9.162031887282165e-06, + "loss": 2.5613, + "mean_token_accuracy": 0.46315653632726805, + "step": 4520 + }, + { + "epoch": 0.8381535038932146, + "grad_norm": 5.28125, + "learning_rate": 9.161846496106787e-06, + "loss": 2.7581, + "mean_token_accuracy": 0.44249965814303294, + "step": 4521 + }, + { + "epoch": 0.8383388950685947, + "grad_norm": 6.85546875, + "learning_rate": 9.161661104931406e-06, + "loss": 2.8028, + "mean_token_accuracy": 0.4475448168355417, + "step": 4522 + }, + { + "epoch": 0.8385242862439748, + "grad_norm": 5.875, + "learning_rate": 9.161475713756027e-06, + "loss": 2.0241, + "mean_token_accuracy": 0.5649028801071668, + "step": 4523 + }, + { + "epoch": 0.8387096774193549, + "grad_norm": 7.078125, + "learning_rate": 9.161290322580645e-06, + "loss": 2.8361, + "mean_token_accuracy": 0.4526646588974728, + "step": 4524 + }, + { + "epoch": 0.8388950685947348, + "grad_norm": 5.59375, + "learning_rate": 9.161104931405266e-06, + "loss": 2.6212, + "mean_token_accuracy": 0.475275624931776, + "step": 4525 + }, + { + "epoch": 0.8390804597701149, + "grad_norm": 6.7890625, + "learning_rate": 9.160919540229886e-06, + "loss": 2.6739, + "mean_token_accuracy": 0.44297445255474455, + "step": 4526 + }, + { + "epoch": 0.839265850945495, + "grad_norm": 9.3515625, + "learning_rate": 9.160734149054505e-06, + "loss": 2.8221, + "mean_token_accuracy": 0.4253541076487252, + "step": 4527 + }, + { + "epoch": 0.8394512421208751, + "grad_norm": 6.51171875, + "learning_rate": 9.160548757879126e-06, + "loss": 2.92, + "mean_token_accuracy": 0.44187898089171973, + "step": 4528 + }, + { + "epoch": 0.8396366332962552, + "grad_norm": 7.48828125, + "learning_rate": 9.160363366703746e-06, + "loss": 2.6921, + "mean_token_accuracy": 0.4561128526645768, + "step": 4529 + }, + { + "epoch": 0.8398220244716351, + "grad_norm": 7.375, + "learning_rate": 9.160177975528367e-06, + "loss": 2.5289, + "mean_token_accuracy": 0.4934302488118535, + "step": 4530 + }, + { + "epoch": 0.8400074156470152, + "grad_norm": 5.5625, + "learning_rate": 9.159992584352985e-06, + "loss": 3.1383, + "mean_token_accuracy": 0.4193734828343544, + "step": 4531 + }, + { + "epoch": 0.8401928068223953, + "grad_norm": 6.4609375, + "learning_rate": 9.159807193177606e-06, + "loss": 2.9498, + "mean_token_accuracy": 0.4337333153566132, + "step": 4532 + }, + { + "epoch": 0.8403781979977754, + "grad_norm": 5.78125, + "learning_rate": 9.159621802002225e-06, + "loss": 2.8378, + "mean_token_accuracy": 0.4612153825961412, + "step": 4533 + }, + { + "epoch": 0.8405635891731553, + "grad_norm": 6.8828125, + "learning_rate": 9.159436410826845e-06, + "loss": 2.9113, + "mean_token_accuracy": 0.45061196345457677, + "step": 4534 + }, + { + "epoch": 0.8407489803485354, + "grad_norm": 6.4921875, + "learning_rate": 9.159251019651466e-06, + "loss": 2.7866, + "mean_token_accuracy": 0.47717231222385864, + "step": 4535 + }, + { + "epoch": 0.8409343715239155, + "grad_norm": 5.6484375, + "learning_rate": 9.159065628476085e-06, + "loss": 2.7865, + "mean_token_accuracy": 0.46956619213092704, + "step": 4536 + }, + { + "epoch": 0.8411197626992956, + "grad_norm": 5.12890625, + "learning_rate": 9.158880237300705e-06, + "loss": 2.8227, + "mean_token_accuracy": 0.44983105626560893, + "step": 4537 + }, + { + "epoch": 0.8413051538746755, + "grad_norm": 7.85546875, + "learning_rate": 9.158694846125326e-06, + "loss": 3.0909, + "mean_token_accuracy": 0.4114566284779051, + "step": 4538 + }, + { + "epoch": 0.8414905450500556, + "grad_norm": 5.93359375, + "learning_rate": 9.158509454949946e-06, + "loss": 2.5092, + "mean_token_accuracy": 0.48292682926829267, + "step": 4539 + }, + { + "epoch": 0.8416759362254357, + "grad_norm": 5.75390625, + "learning_rate": 9.158324063774565e-06, + "loss": 2.928, + "mean_token_accuracy": 0.4328305444261136, + "step": 4540 + }, + { + "epoch": 0.8418613274008158, + "grad_norm": 7.18359375, + "learning_rate": 9.158138672599185e-06, + "loss": 2.5774, + "mean_token_accuracy": 0.4721444133208839, + "step": 4541 + }, + { + "epoch": 0.8420467185761957, + "grad_norm": 6.1953125, + "learning_rate": 9.157953281423804e-06, + "loss": 2.9274, + "mean_token_accuracy": 0.42944187180112114, + "step": 4542 + }, + { + "epoch": 0.8422321097515758, + "grad_norm": 5.26953125, + "learning_rate": 9.157767890248425e-06, + "loss": 2.7483, + "mean_token_accuracy": 0.459679378188001, + "step": 4543 + }, + { + "epoch": 0.8424175009269559, + "grad_norm": 5.48046875, + "learning_rate": 9.157582499073045e-06, + "loss": 2.6776, + "mean_token_accuracy": 0.466697790227202, + "step": 4544 + }, + { + "epoch": 0.842602892102336, + "grad_norm": 4.78515625, + "learning_rate": 9.157397107897666e-06, + "loss": 2.8987, + "mean_token_accuracy": 0.4579643178025927, + "step": 4545 + }, + { + "epoch": 0.8427882832777159, + "grad_norm": 7.140625, + "learning_rate": 9.157211716722284e-06, + "loss": 1.943, + "mean_token_accuracy": 0.5541836490050348, + "step": 4546 + }, + { + "epoch": 0.842973674453096, + "grad_norm": 6.5625, + "learning_rate": 9.157026325546905e-06, + "loss": 2.7451, + "mean_token_accuracy": 0.451925666576892, + "step": 4547 + }, + { + "epoch": 0.8431590656284761, + "grad_norm": 5.22265625, + "learning_rate": 9.156840934371525e-06, + "loss": 2.748, + "mean_token_accuracy": 0.44850163686728783, + "step": 4548 + }, + { + "epoch": 0.8433444568038562, + "grad_norm": 5.54296875, + "learning_rate": 9.156655543196144e-06, + "loss": 2.8334, + "mean_token_accuracy": 0.44949894514767935, + "step": 4549 + }, + { + "epoch": 0.8435298479792361, + "grad_norm": 6.34375, + "learning_rate": 9.156470152020765e-06, + "loss": 2.6232, + "mean_token_accuracy": 0.495189050006776, + "step": 4550 + }, + { + "epoch": 0.8437152391546162, + "grad_norm": 8.5390625, + "learning_rate": 9.156284760845383e-06, + "loss": 3.0298, + "mean_token_accuracy": 0.4131203511766858, + "step": 4551 + }, + { + "epoch": 0.8439006303299963, + "grad_norm": 8.203125, + "learning_rate": 9.156099369670004e-06, + "loss": 3.5428, + "mean_token_accuracy": 0.37496542185338866, + "step": 4552 + }, + { + "epoch": 0.8440860215053764, + "grad_norm": 5.4921875, + "learning_rate": 9.155913978494624e-06, + "loss": 3.0428, + "mean_token_accuracy": 0.4306308384269404, + "step": 4553 + }, + { + "epoch": 0.8442714126807565, + "grad_norm": 6.296875, + "learning_rate": 9.155728587319245e-06, + "loss": 2.9943, + "mean_token_accuracy": 0.4311404507170499, + "step": 4554 + }, + { + "epoch": 0.8444568038561364, + "grad_norm": 5.27734375, + "learning_rate": 9.155543196143864e-06, + "loss": 2.9444, + "mean_token_accuracy": 0.4255471320268917, + "step": 4555 + }, + { + "epoch": 0.8446421950315165, + "grad_norm": 5.046875, + "learning_rate": 9.155357804968484e-06, + "loss": 2.5808, + "mean_token_accuracy": 0.4636100893326327, + "step": 4556 + }, + { + "epoch": 0.8448275862068966, + "grad_norm": 5.6640625, + "learning_rate": 9.155172413793105e-06, + "loss": 2.9773, + "mean_token_accuracy": 0.4701862669458834, + "step": 4557 + }, + { + "epoch": 0.8450129773822767, + "grad_norm": 6.21875, + "learning_rate": 9.154987022617723e-06, + "loss": 3.02, + "mean_token_accuracy": 0.42545855744035305, + "step": 4558 + }, + { + "epoch": 0.8451983685576566, + "grad_norm": 4.875, + "learning_rate": 9.154801631442344e-06, + "loss": 2.3505, + "mean_token_accuracy": 0.5260732476352171, + "step": 4559 + }, + { + "epoch": 0.8453837597330367, + "grad_norm": 5.93359375, + "learning_rate": 9.154616240266963e-06, + "loss": 2.4597, + "mean_token_accuracy": 0.48118081180811806, + "step": 4560 + }, + { + "epoch": 0.8455691509084168, + "grad_norm": 6.3203125, + "learning_rate": 9.154430849091585e-06, + "loss": 2.7555, + "mean_token_accuracy": 0.4737773152965661, + "step": 4561 + }, + { + "epoch": 0.8457545420837969, + "grad_norm": 6.48046875, + "learning_rate": 9.154245457916204e-06, + "loss": 2.6971, + "mean_token_accuracy": 0.4532354163779964, + "step": 4562 + }, + { + "epoch": 0.8459399332591768, + "grad_norm": 8.5859375, + "learning_rate": 9.154060066740824e-06, + "loss": 2.9542, + "mean_token_accuracy": 0.44809030056437854, + "step": 4563 + }, + { + "epoch": 0.8461253244345569, + "grad_norm": 5.671875, + "learning_rate": 9.153874675565445e-06, + "loss": 2.9057, + "mean_token_accuracy": 0.4352548036758563, + "step": 4564 + }, + { + "epoch": 0.846310715609937, + "grad_norm": 6.62109375, + "learning_rate": 9.153689284390064e-06, + "loss": 2.9032, + "mean_token_accuracy": 0.4343241869918699, + "step": 4565 + }, + { + "epoch": 0.8464961067853171, + "grad_norm": 5.52734375, + "learning_rate": 9.153503893214684e-06, + "loss": 3.5836, + "mean_token_accuracy": 0.37475984630163306, + "step": 4566 + }, + { + "epoch": 0.846681497960697, + "grad_norm": 8.9453125, + "learning_rate": 9.153318502039303e-06, + "loss": 3.2336, + "mean_token_accuracy": 0.416925562321362, + "step": 4567 + }, + { + "epoch": 0.8468668891360771, + "grad_norm": 8.1328125, + "learning_rate": 9.153133110863923e-06, + "loss": 2.8175, + "mean_token_accuracy": 0.4372207327971403, + "step": 4568 + }, + { + "epoch": 0.8470522803114572, + "grad_norm": 6.37890625, + "learning_rate": 9.152947719688544e-06, + "loss": 2.8959, + "mean_token_accuracy": 0.43572564160799454, + "step": 4569 + }, + { + "epoch": 0.8472376714868373, + "grad_norm": 5.62890625, + "learning_rate": 9.152762328513164e-06, + "loss": 2.8433, + "mean_token_accuracy": 0.4590340996759759, + "step": 4570 + }, + { + "epoch": 0.8474230626622172, + "grad_norm": 7.0703125, + "learning_rate": 9.152576937337783e-06, + "loss": 2.7195, + "mean_token_accuracy": 0.4498693664271718, + "step": 4571 + }, + { + "epoch": 0.8476084538375973, + "grad_norm": 6.0, + "learning_rate": 9.152391546162404e-06, + "loss": 2.9667, + "mean_token_accuracy": 0.44157918758557735, + "step": 4572 + }, + { + "epoch": 0.8477938450129774, + "grad_norm": 7.96484375, + "learning_rate": 9.152206154987024e-06, + "loss": 3.1863, + "mean_token_accuracy": 0.42868217054263563, + "step": 4573 + }, + { + "epoch": 0.8479792361883575, + "grad_norm": 7.76953125, + "learning_rate": 9.152020763811643e-06, + "loss": 2.9351, + "mean_token_accuracy": 0.4354955739370193, + "step": 4574 + }, + { + "epoch": 0.8481646273637374, + "grad_norm": 5.41796875, + "learning_rate": 9.151835372636263e-06, + "loss": 3.3094, + "mean_token_accuracy": 0.4124111182934712, + "step": 4575 + }, + { + "epoch": 0.8483500185391175, + "grad_norm": 6.15625, + "learning_rate": 9.151649981460882e-06, + "loss": 3.3164, + "mean_token_accuracy": 0.41254469606674615, + "step": 4576 + }, + { + "epoch": 0.8485354097144976, + "grad_norm": 7.04296875, + "learning_rate": 9.151464590285504e-06, + "loss": 2.6545, + "mean_token_accuracy": 0.4619790920807676, + "step": 4577 + }, + { + "epoch": 0.8487208008898777, + "grad_norm": 5.16796875, + "learning_rate": 9.151279199110123e-06, + "loss": 2.4827, + "mean_token_accuracy": 0.47848007870142645, + "step": 4578 + }, + { + "epoch": 0.8489061920652577, + "grad_norm": 7.0546875, + "learning_rate": 9.151093807934744e-06, + "loss": 2.7514, + "mean_token_accuracy": 0.46519377931374967, + "step": 4579 + }, + { + "epoch": 0.8490915832406377, + "grad_norm": 6.7421875, + "learning_rate": 9.150908416759362e-06, + "loss": 3.1352, + "mean_token_accuracy": 0.43538355217691777, + "step": 4580 + }, + { + "epoch": 0.8492769744160178, + "grad_norm": 5.42578125, + "learning_rate": 9.150723025583983e-06, + "loss": 2.8222, + "mean_token_accuracy": 0.4423152644704029, + "step": 4581 + }, + { + "epoch": 0.8494623655913979, + "grad_norm": 6.22265625, + "learning_rate": 9.150537634408603e-06, + "loss": 3.3133, + "mean_token_accuracy": 0.3756401384083045, + "step": 4582 + }, + { + "epoch": 0.849647756766778, + "grad_norm": 5.81640625, + "learning_rate": 9.150352243233222e-06, + "loss": 3.0334, + "mean_token_accuracy": 0.4282283884738527, + "step": 4583 + }, + { + "epoch": 0.8498331479421579, + "grad_norm": 7.27734375, + "learning_rate": 9.150166852057843e-06, + "loss": 2.9441, + "mean_token_accuracy": 0.4366584226435834, + "step": 4584 + }, + { + "epoch": 0.850018539117538, + "grad_norm": 8.1484375, + "learning_rate": 9.149981460882462e-06, + "loss": 2.6243, + "mean_token_accuracy": 0.46327752657949406, + "step": 4585 + }, + { + "epoch": 0.8502039302929181, + "grad_norm": 7.125, + "learning_rate": 9.149796069707084e-06, + "loss": 2.9501, + "mean_token_accuracy": 0.4533790650406504, + "step": 4586 + }, + { + "epoch": 0.8503893214682982, + "grad_norm": 7.58984375, + "learning_rate": 9.149610678531703e-06, + "loss": 2.5239, + "mean_token_accuracy": 0.47652723442672285, + "step": 4587 + }, + { + "epoch": 0.8505747126436781, + "grad_norm": 9.046875, + "learning_rate": 9.149425287356323e-06, + "loss": 2.4094, + "mean_token_accuracy": 0.4916861957370829, + "step": 4588 + }, + { + "epoch": 0.8507601038190582, + "grad_norm": 5.71875, + "learning_rate": 9.149239896180942e-06, + "loss": 2.8939, + "mean_token_accuracy": 0.43102124392116714, + "step": 4589 + }, + { + "epoch": 0.8509454949944383, + "grad_norm": 11.1875, + "learning_rate": 9.149054505005562e-06, + "loss": 2.7724, + "mean_token_accuracy": 0.45392528424472117, + "step": 4590 + }, + { + "epoch": 0.8511308861698184, + "grad_norm": 7.3828125, + "learning_rate": 9.148869113830183e-06, + "loss": 2.5521, + "mean_token_accuracy": 0.4860710854947166, + "step": 4591 + }, + { + "epoch": 0.8513162773451983, + "grad_norm": 7.29296875, + "learning_rate": 9.148683722654802e-06, + "loss": 2.5617, + "mean_token_accuracy": 0.47959889349930845, + "step": 4592 + }, + { + "epoch": 0.8515016685205784, + "grad_norm": 7.82421875, + "learning_rate": 9.148498331479422e-06, + "loss": 2.6148, + "mean_token_accuracy": 0.4486041982254923, + "step": 4593 + }, + { + "epoch": 0.8516870596959585, + "grad_norm": 8.921875, + "learning_rate": 9.148312940304043e-06, + "loss": 2.2292, + "mean_token_accuracy": 0.5175372192783245, + "step": 4594 + }, + { + "epoch": 0.8518724508713386, + "grad_norm": 7.83984375, + "learning_rate": 9.148127549128663e-06, + "loss": 3.4512, + "mean_token_accuracy": 0.3966630785791173, + "step": 4595 + }, + { + "epoch": 0.8520578420467185, + "grad_norm": 8.0234375, + "learning_rate": 9.147942157953282e-06, + "loss": 2.3139, + "mean_token_accuracy": 0.5229763912310287, + "step": 4596 + }, + { + "epoch": 0.8522432332220986, + "grad_norm": 8.5546875, + "learning_rate": 9.147756766777902e-06, + "loss": 2.5622, + "mean_token_accuracy": 0.4678040020523345, + "step": 4597 + }, + { + "epoch": 0.8524286243974787, + "grad_norm": 6.10546875, + "learning_rate": 9.147571375602521e-06, + "loss": 2.3706, + "mean_token_accuracy": 0.4842660052705007, + "step": 4598 + }, + { + "epoch": 0.8526140155728588, + "grad_norm": 6.26953125, + "learning_rate": 9.147385984427142e-06, + "loss": 3.1207, + "mean_token_accuracy": 0.4147045420021267, + "step": 4599 + }, + { + "epoch": 0.8527994067482387, + "grad_norm": 7.98828125, + "learning_rate": 9.147200593251762e-06, + "loss": 2.8139, + "mean_token_accuracy": 0.4517083271705369, + "step": 4600 + }, + { + "epoch": 0.8529847979236188, + "grad_norm": 7.890625, + "learning_rate": 9.147015202076381e-06, + "loss": 2.5942, + "mean_token_accuracy": 0.4730592584294836, + "step": 4601 + }, + { + "epoch": 0.8531701890989989, + "grad_norm": 8.078125, + "learning_rate": 9.146829810901003e-06, + "loss": 3.352, + "mean_token_accuracy": 0.41313535122786976, + "step": 4602 + }, + { + "epoch": 0.853355580274379, + "grad_norm": 8.3203125, + "learning_rate": 9.146644419725622e-06, + "loss": 3.0535, + "mean_token_accuracy": 0.4485408560311284, + "step": 4603 + }, + { + "epoch": 0.853540971449759, + "grad_norm": 6.125, + "learning_rate": 9.146459028550242e-06, + "loss": 2.922, + "mean_token_accuracy": 0.43643805035346644, + "step": 4604 + }, + { + "epoch": 0.853726362625139, + "grad_norm": 8.7421875, + "learning_rate": 9.146273637374861e-06, + "loss": 2.6824, + "mean_token_accuracy": 0.4525065963060686, + "step": 4605 + }, + { + "epoch": 0.8539117538005191, + "grad_norm": 4.66796875, + "learning_rate": 9.146088246199482e-06, + "loss": 3.4257, + "mean_token_accuracy": 0.39802850672705475, + "step": 4606 + }, + { + "epoch": 0.8540971449758992, + "grad_norm": 7.77734375, + "learning_rate": 9.145902855024102e-06, + "loss": 2.9667, + "mean_token_accuracy": 0.43364904776453816, + "step": 4607 + }, + { + "epoch": 0.8542825361512792, + "grad_norm": 6.8828125, + "learning_rate": 9.145717463848721e-06, + "loss": 2.7681, + "mean_token_accuracy": 0.4552487502975482, + "step": 4608 + }, + { + "epoch": 0.8544679273266592, + "grad_norm": 7.34765625, + "learning_rate": 9.145532072673341e-06, + "loss": 2.7308, + "mean_token_accuracy": 0.4507749712973594, + "step": 4609 + }, + { + "epoch": 0.8546533185020393, + "grad_norm": 6.96875, + "learning_rate": 9.145346681497962e-06, + "loss": 3.0644, + "mean_token_accuracy": 0.44049967126890205, + "step": 4610 + }, + { + "epoch": 0.8548387096774194, + "grad_norm": 5.68359375, + "learning_rate": 9.145161290322582e-06, + "loss": 3.0464, + "mean_token_accuracy": 0.43349701110162253, + "step": 4611 + }, + { + "epoch": 0.8550241008527995, + "grad_norm": 5.13671875, + "learning_rate": 9.144975899147201e-06, + "loss": 2.9461, + "mean_token_accuracy": 0.4390436153441934, + "step": 4612 + }, + { + "epoch": 0.8552094920281794, + "grad_norm": 7.390625, + "learning_rate": 9.144790507971822e-06, + "loss": 2.2476, + "mean_token_accuracy": 0.5270928687435406, + "step": 4613 + }, + { + "epoch": 0.8553948832035595, + "grad_norm": 5.6953125, + "learning_rate": 9.14460511679644e-06, + "loss": 2.4497, + "mean_token_accuracy": 0.49178303410278335, + "step": 4614 + }, + { + "epoch": 0.8555802743789396, + "grad_norm": 5.51953125, + "learning_rate": 9.144419725621061e-06, + "loss": 3.1192, + "mean_token_accuracy": 0.42038555756736606, + "step": 4615 + }, + { + "epoch": 0.8557656655543197, + "grad_norm": 8.2890625, + "learning_rate": 9.144234334445682e-06, + "loss": 2.8057, + "mean_token_accuracy": 0.4424301134791032, + "step": 4616 + }, + { + "epoch": 0.8559510567296996, + "grad_norm": 7.5859375, + "learning_rate": 9.1440489432703e-06, + "loss": 3.3223, + "mean_token_accuracy": 0.3915565979508703, + "step": 4617 + }, + { + "epoch": 0.8561364479050797, + "grad_norm": 6.34765625, + "learning_rate": 9.14386355209492e-06, + "loss": 2.6406, + "mean_token_accuracy": 0.4780232558139535, + "step": 4618 + }, + { + "epoch": 0.8563218390804598, + "grad_norm": 9.078125, + "learning_rate": 9.143678160919541e-06, + "loss": 2.8121, + "mean_token_accuracy": 0.44038956677487967, + "step": 4619 + }, + { + "epoch": 0.8565072302558399, + "grad_norm": 11.7734375, + "learning_rate": 9.143492769744162e-06, + "loss": 2.4807, + "mean_token_accuracy": 0.4883034987794955, + "step": 4620 + }, + { + "epoch": 0.8566926214312198, + "grad_norm": 13.0078125, + "learning_rate": 9.14330737856878e-06, + "loss": 2.5472, + "mean_token_accuracy": 0.4632571354772052, + "step": 4621 + }, + { + "epoch": 0.8568780126065999, + "grad_norm": 8.2890625, + "learning_rate": 9.143121987393401e-06, + "loss": 2.6146, + "mean_token_accuracy": 0.4562543192812716, + "step": 4622 + }, + { + "epoch": 0.85706340378198, + "grad_norm": 6.00390625, + "learning_rate": 9.14293659621802e-06, + "loss": 3.1022, + "mean_token_accuracy": 0.40301702171032516, + "step": 4623 + }, + { + "epoch": 0.8572487949573601, + "grad_norm": 8.5546875, + "learning_rate": 9.14275120504264e-06, + "loss": 2.5246, + "mean_token_accuracy": 0.47846543612015924, + "step": 4624 + }, + { + "epoch": 0.85743418613274, + "grad_norm": 9.28125, + "learning_rate": 9.14256581386726e-06, + "loss": 2.8902, + "mean_token_accuracy": 0.43981791404471815, + "step": 4625 + }, + { + "epoch": 0.8576195773081201, + "grad_norm": 6.1640625, + "learning_rate": 9.142380422691881e-06, + "loss": 3.3155, + "mean_token_accuracy": 0.3850203804347826, + "step": 4626 + }, + { + "epoch": 0.8578049684835002, + "grad_norm": 10.2734375, + "learning_rate": 9.1421950315165e-06, + "loss": 2.6996, + "mean_token_accuracy": 0.4652313452794865, + "step": 4627 + }, + { + "epoch": 0.8579903596588803, + "grad_norm": 6.99609375, + "learning_rate": 9.14200964034112e-06, + "loss": 2.7668, + "mean_token_accuracy": 0.45175537938844845, + "step": 4628 + }, + { + "epoch": 0.8581757508342603, + "grad_norm": 7.375, + "learning_rate": 9.141824249165741e-06, + "loss": 2.7455, + "mean_token_accuracy": 0.4558682443236329, + "step": 4629 + }, + { + "epoch": 0.8583611420096403, + "grad_norm": 5.51953125, + "learning_rate": 9.14163885799036e-06, + "loss": 2.9031, + "mean_token_accuracy": 0.45484994640943194, + "step": 4630 + }, + { + "epoch": 0.8585465331850204, + "grad_norm": 5.62890625, + "learning_rate": 9.14145346681498e-06, + "loss": 2.7776, + "mean_token_accuracy": 0.4491803278688525, + "step": 4631 + }, + { + "epoch": 0.8587319243604005, + "grad_norm": 5.734375, + "learning_rate": 9.1412680756396e-06, + "loss": 2.8336, + "mean_token_accuracy": 0.458836716901233, + "step": 4632 + }, + { + "epoch": 0.8589173155357805, + "grad_norm": 5.46875, + "learning_rate": 9.14108268446422e-06, + "loss": 2.9393, + "mean_token_accuracy": 0.44688060593980466, + "step": 4633 + }, + { + "epoch": 0.8591027067111605, + "grad_norm": 7.41796875, + "learning_rate": 9.14089729328884e-06, + "loss": 2.9663, + "mean_token_accuracy": 0.45310198068423635, + "step": 4634 + }, + { + "epoch": 0.8592880978865406, + "grad_norm": 6.44921875, + "learning_rate": 9.14071190211346e-06, + "loss": 2.7802, + "mean_token_accuracy": 0.4563735120767364, + "step": 4635 + }, + { + "epoch": 0.8594734890619207, + "grad_norm": 7.6015625, + "learning_rate": 9.14052651093808e-06, + "loss": 2.467, + "mean_token_accuracy": 0.501809268121863, + "step": 4636 + }, + { + "epoch": 0.8596588802373007, + "grad_norm": 5.84375, + "learning_rate": 9.1403411197627e-06, + "loss": 3.2175, + "mean_token_accuracy": 0.4216244865509474, + "step": 4637 + }, + { + "epoch": 0.8598442714126807, + "grad_norm": 6.0859375, + "learning_rate": 9.14015572858732e-06, + "loss": 3.7964, + "mean_token_accuracy": 0.37756883814640696, + "step": 4638 + }, + { + "epoch": 0.8600296625880608, + "grad_norm": 5.61328125, + "learning_rate": 9.13997033741194e-06, + "loss": 3.1182, + "mean_token_accuracy": 0.4431397574984046, + "step": 4639 + }, + { + "epoch": 0.8602150537634409, + "grad_norm": 10.3984375, + "learning_rate": 9.13978494623656e-06, + "loss": 2.1447, + "mean_token_accuracy": 0.4900475150892513, + "step": 4640 + }, + { + "epoch": 0.860400444938821, + "grad_norm": 6.6953125, + "learning_rate": 9.139599555061179e-06, + "loss": 2.5984, + "mean_token_accuracy": 0.45654008438818566, + "step": 4641 + }, + { + "epoch": 0.8605858361142009, + "grad_norm": 5.97265625, + "learning_rate": 9.1394141638858e-06, + "loss": 2.8633, + "mean_token_accuracy": 0.4496532237349088, + "step": 4642 + }, + { + "epoch": 0.860771227289581, + "grad_norm": 6.12890625, + "learning_rate": 9.13922877271042e-06, + "loss": 2.7367, + "mean_token_accuracy": 0.4566436301995045, + "step": 4643 + }, + { + "epoch": 0.8609566184649611, + "grad_norm": 6.61328125, + "learning_rate": 9.13904338153504e-06, + "loss": 2.7607, + "mean_token_accuracy": 0.4419168941461935, + "step": 4644 + }, + { + "epoch": 0.8611420096403412, + "grad_norm": 6.37109375, + "learning_rate": 9.13885799035966e-06, + "loss": 3.4442, + "mean_token_accuracy": 0.3995751711116356, + "step": 4645 + }, + { + "epoch": 0.8613274008157211, + "grad_norm": 9.7109375, + "learning_rate": 9.13867259918428e-06, + "loss": 2.6569, + "mean_token_accuracy": 0.4661331809088311, + "step": 4646 + }, + { + "epoch": 0.8615127919911012, + "grad_norm": 6.6328125, + "learning_rate": 9.1384872080089e-06, + "loss": 2.5787, + "mean_token_accuracy": 0.4753445305770887, + "step": 4647 + }, + { + "epoch": 0.8616981831664813, + "grad_norm": 5.84375, + "learning_rate": 9.138301816833519e-06, + "loss": 2.8818, + "mean_token_accuracy": 0.4417092924126172, + "step": 4648 + }, + { + "epoch": 0.8618835743418614, + "grad_norm": 5.7734375, + "learning_rate": 9.138116425658139e-06, + "loss": 2.3646, + "mean_token_accuracy": 0.5134058361149995, + "step": 4649 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 5.73828125, + "learning_rate": 9.13793103448276e-06, + "loss": 3.6174, + "mean_token_accuracy": 0.39386679444178246, + "step": 4650 + }, + { + "epoch": 0.8622543566926214, + "grad_norm": 8.0234375, + "learning_rate": 9.13774564330738e-06, + "loss": 2.6811, + "mean_token_accuracy": 0.45411140583554377, + "step": 4651 + }, + { + "epoch": 0.8624397478680015, + "grad_norm": 7.32421875, + "learning_rate": 9.137560252131999e-06, + "loss": 3.5312, + "mean_token_accuracy": 0.37929984779299847, + "step": 4652 + }, + { + "epoch": 0.8626251390433816, + "grad_norm": 8.6484375, + "learning_rate": 9.13737486095662e-06, + "loss": 2.8069, + "mean_token_accuracy": 0.45248322147651004, + "step": 4653 + }, + { + "epoch": 0.8628105302187616, + "grad_norm": 6.9140625, + "learning_rate": 9.13718946978124e-06, + "loss": 2.5251, + "mean_token_accuracy": 0.4804425744117189, + "step": 4654 + }, + { + "epoch": 0.8629959213941416, + "grad_norm": 5.93359375, + "learning_rate": 9.137004078605859e-06, + "loss": 2.9191, + "mean_token_accuracy": 0.4490266393442623, + "step": 4655 + }, + { + "epoch": 0.8631813125695217, + "grad_norm": 8.140625, + "learning_rate": 9.136818687430479e-06, + "loss": 3.1519, + "mean_token_accuracy": 0.42240145700533366, + "step": 4656 + }, + { + "epoch": 0.8633667037449018, + "grad_norm": 8.2421875, + "learning_rate": 9.136633296255098e-06, + "loss": 2.6647, + "mean_token_accuracy": 0.4605709973588228, + "step": 4657 + }, + { + "epoch": 0.8635520949202818, + "grad_norm": 5.3359375, + "learning_rate": 9.13644790507972e-06, + "loss": 3.4337, + "mean_token_accuracy": 0.38745294855708906, + "step": 4658 + }, + { + "epoch": 0.8637374860956618, + "grad_norm": 6.67578125, + "learning_rate": 9.136262513904339e-06, + "loss": 2.6868, + "mean_token_accuracy": 0.46804326450344147, + "step": 4659 + }, + { + "epoch": 0.8639228772710419, + "grad_norm": 5.47265625, + "learning_rate": 9.13607712272896e-06, + "loss": 3.0203, + "mean_token_accuracy": 0.44267291910902695, + "step": 4660 + }, + { + "epoch": 0.864108268446422, + "grad_norm": 6.6015625, + "learning_rate": 9.135891731553578e-06, + "loss": 2.8542, + "mean_token_accuracy": 0.4447283740299072, + "step": 4661 + }, + { + "epoch": 0.864293659621802, + "grad_norm": 5.65234375, + "learning_rate": 9.135706340378199e-06, + "loss": 3.0972, + "mean_token_accuracy": 0.43463230672533, + "step": 4662 + }, + { + "epoch": 0.864479050797182, + "grad_norm": 7.8671875, + "learning_rate": 9.13552094920282e-06, + "loss": 2.6636, + "mean_token_accuracy": 0.4828038325769037, + "step": 4663 + }, + { + "epoch": 0.8646644419725621, + "grad_norm": 6.015625, + "learning_rate": 9.135335558027438e-06, + "loss": 2.5573, + "mean_token_accuracy": 0.49047733847637415, + "step": 4664 + }, + { + "epoch": 0.8648498331479422, + "grad_norm": 4.83203125, + "learning_rate": 9.135150166852058e-06, + "loss": 2.1404, + "mean_token_accuracy": 0.538717402873869, + "step": 4665 + }, + { + "epoch": 0.8650352243233222, + "grad_norm": 6.4140625, + "learning_rate": 9.134964775676679e-06, + "loss": 3.0335, + "mean_token_accuracy": 0.4249540246145141, + "step": 4666 + }, + { + "epoch": 0.8652206154987022, + "grad_norm": 5.859375, + "learning_rate": 9.1347793845013e-06, + "loss": 2.8254, + "mean_token_accuracy": 0.45067228252065444, + "step": 4667 + }, + { + "epoch": 0.8654060066740823, + "grad_norm": 8.21875, + "learning_rate": 9.134593993325918e-06, + "loss": 2.8289, + "mean_token_accuracy": 0.43745261561789234, + "step": 4668 + }, + { + "epoch": 0.8655913978494624, + "grad_norm": 5.87109375, + "learning_rate": 9.134408602150539e-06, + "loss": 3.0943, + "mean_token_accuracy": 0.4046817849305048, + "step": 4669 + }, + { + "epoch": 0.8657767890248425, + "grad_norm": 6.09375, + "learning_rate": 9.134223210975158e-06, + "loss": 3.3359, + "mean_token_accuracy": 0.4003673094582185, + "step": 4670 + }, + { + "epoch": 0.8659621802002224, + "grad_norm": 6.0078125, + "learning_rate": 9.134037819799778e-06, + "loss": 2.8823, + "mean_token_accuracy": 0.45887265135699373, + "step": 4671 + }, + { + "epoch": 0.8661475713756025, + "grad_norm": 5.1484375, + "learning_rate": 9.133852428624399e-06, + "loss": 3.2096, + "mean_token_accuracy": 0.43425414364640885, + "step": 4672 + }, + { + "epoch": 0.8663329625509826, + "grad_norm": 6.12890625, + "learning_rate": 9.133667037449017e-06, + "loss": 2.94, + "mean_token_accuracy": 0.4308415967454869, + "step": 4673 + }, + { + "epoch": 0.8665183537263627, + "grad_norm": 7.58984375, + "learning_rate": 9.133481646273638e-06, + "loss": 3.1473, + "mean_token_accuracy": 0.41728100607111884, + "step": 4674 + }, + { + "epoch": 0.8667037449017426, + "grad_norm": 8.8515625, + "learning_rate": 9.133296255098258e-06, + "loss": 2.2464, + "mean_token_accuracy": 0.5138820029747149, + "step": 4675 + }, + { + "epoch": 0.8668891360771227, + "grad_norm": 6.515625, + "learning_rate": 9.133110863922879e-06, + "loss": 2.7876, + "mean_token_accuracy": 0.436130007558579, + "step": 4676 + }, + { + "epoch": 0.8670745272525028, + "grad_norm": 5.53125, + "learning_rate": 9.132925472747498e-06, + "loss": 3.1226, + "mean_token_accuracy": 0.4367537915456599, + "step": 4677 + }, + { + "epoch": 0.8672599184278829, + "grad_norm": 6.6640625, + "learning_rate": 9.132740081572118e-06, + "loss": 2.5767, + "mean_token_accuracy": 0.47606793618116316, + "step": 4678 + }, + { + "epoch": 0.8674453096032629, + "grad_norm": 7.46875, + "learning_rate": 9.132554690396737e-06, + "loss": 3.0034, + "mean_token_accuracy": 0.4336448598130841, + "step": 4679 + }, + { + "epoch": 0.8676307007786429, + "grad_norm": 5.7265625, + "learning_rate": 9.132369299221357e-06, + "loss": 3.0675, + "mean_token_accuracy": 0.4206041828040279, + "step": 4680 + }, + { + "epoch": 0.867816091954023, + "grad_norm": 8.109375, + "learning_rate": 9.132183908045978e-06, + "loss": 2.9316, + "mean_token_accuracy": 0.44249712165792504, + "step": 4681 + }, + { + "epoch": 0.8680014831294031, + "grad_norm": 10.3203125, + "learning_rate": 9.131998516870598e-06, + "loss": 2.7924, + "mean_token_accuracy": 0.4553583168967784, + "step": 4682 + }, + { + "epoch": 0.8681868743047831, + "grad_norm": 7.40234375, + "learning_rate": 9.131813125695219e-06, + "loss": 3.0827, + "mean_token_accuracy": 0.42592079756300194, + "step": 4683 + }, + { + "epoch": 0.8683722654801631, + "grad_norm": 6.91015625, + "learning_rate": 9.131627734519838e-06, + "loss": 2.8227, + "mean_token_accuracy": 0.43492488472408153, + "step": 4684 + }, + { + "epoch": 0.8685576566555432, + "grad_norm": 6.44140625, + "learning_rate": 9.131442343344458e-06, + "loss": 3.4536, + "mean_token_accuracy": 0.3873053576141462, + "step": 4685 + }, + { + "epoch": 0.8687430478309233, + "grad_norm": 5.921875, + "learning_rate": 9.131256952169077e-06, + "loss": 3.3182, + "mean_token_accuracy": 0.40918023582257157, + "step": 4686 + }, + { + "epoch": 0.8689284390063033, + "grad_norm": 6.3359375, + "learning_rate": 9.131071560993697e-06, + "loss": 2.6813, + "mean_token_accuracy": 0.44119131751640583, + "step": 4687 + }, + { + "epoch": 0.8691138301816833, + "grad_norm": 9.84375, + "learning_rate": 9.130886169818318e-06, + "loss": 2.9004, + "mean_token_accuracy": 0.44009843284548633, + "step": 4688 + }, + { + "epoch": 0.8692992213570634, + "grad_norm": 7.765625, + "learning_rate": 9.130700778642937e-06, + "loss": 2.9159, + "mean_token_accuracy": 0.44710021839359376, + "step": 4689 + }, + { + "epoch": 0.8694846125324435, + "grad_norm": 7.5703125, + "learning_rate": 9.130515387467557e-06, + "loss": 2.8601, + "mean_token_accuracy": 0.4358793022159359, + "step": 4690 + }, + { + "epoch": 0.8696700037078235, + "grad_norm": 6.27734375, + "learning_rate": 9.130329996292178e-06, + "loss": 3.1576, + "mean_token_accuracy": 0.41571259376233716, + "step": 4691 + }, + { + "epoch": 0.8698553948832035, + "grad_norm": 5.88671875, + "learning_rate": 9.130144605116798e-06, + "loss": 2.7038, + "mean_token_accuracy": 0.44497742663656886, + "step": 4692 + }, + { + "epoch": 0.8700407860585836, + "grad_norm": 6.6953125, + "learning_rate": 9.129959213941417e-06, + "loss": 3.104, + "mean_token_accuracy": 0.40666564745451766, + "step": 4693 + }, + { + "epoch": 0.8702261772339637, + "grad_norm": 7.08203125, + "learning_rate": 9.129773822766037e-06, + "loss": 2.8246, + "mean_token_accuracy": 0.45953326713008935, + "step": 4694 + }, + { + "epoch": 0.8704115684093437, + "grad_norm": 6.234375, + "learning_rate": 9.129588431590656e-06, + "loss": 2.9804, + "mean_token_accuracy": 0.4315159574468085, + "step": 4695 + }, + { + "epoch": 0.8705969595847237, + "grad_norm": 7.7734375, + "learning_rate": 9.129403040415277e-06, + "loss": 2.7531, + "mean_token_accuracy": 0.4513397517689363, + "step": 4696 + }, + { + "epoch": 0.8707823507601038, + "grad_norm": 5.27734375, + "learning_rate": 9.129217649239897e-06, + "loss": 2.3878, + "mean_token_accuracy": 0.48323956868260665, + "step": 4697 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 7.01171875, + "learning_rate": 9.129032258064518e-06, + "loss": 2.4063, + "mean_token_accuracy": 0.49667752442996743, + "step": 4698 + }, + { + "epoch": 0.871153133110864, + "grad_norm": 12.46875, + "learning_rate": 9.128846866889137e-06, + "loss": 2.1499, + "mean_token_accuracy": 0.5686990727732509, + "step": 4699 + }, + { + "epoch": 0.8713385242862439, + "grad_norm": 9.734375, + "learning_rate": 9.128661475713757e-06, + "loss": 2.7293, + "mean_token_accuracy": 0.4741484184914842, + "step": 4700 + }, + { + "epoch": 0.871523915461624, + "grad_norm": 8.1640625, + "learning_rate": 9.128476084538378e-06, + "loss": 1.9806, + "mean_token_accuracy": 0.5505342188853595, + "step": 4701 + }, + { + "epoch": 0.8717093066370041, + "grad_norm": 5.50390625, + "learning_rate": 9.128290693362996e-06, + "loss": 2.6967, + "mean_token_accuracy": 0.4743479507022069, + "step": 4702 + }, + { + "epoch": 0.8718946978123842, + "grad_norm": 6.14453125, + "learning_rate": 9.128105302187617e-06, + "loss": 2.9239, + "mean_token_accuracy": 0.4404992729035385, + "step": 4703 + }, + { + "epoch": 0.8720800889877642, + "grad_norm": 8.4609375, + "learning_rate": 9.127919911012236e-06, + "loss": 2.4697, + "mean_token_accuracy": 0.48561822936122523, + "step": 4704 + }, + { + "epoch": 0.8722654801631442, + "grad_norm": 6.2421875, + "learning_rate": 9.127734519836856e-06, + "loss": 2.4644, + "mean_token_accuracy": 0.510533035429301, + "step": 4705 + }, + { + "epoch": 0.8724508713385243, + "grad_norm": 6.5625, + "learning_rate": 9.127549128661477e-06, + "loss": 2.5303, + "mean_token_accuracy": 0.49799433695139217, + "step": 4706 + }, + { + "epoch": 0.8726362625139044, + "grad_norm": 5.41796875, + "learning_rate": 9.127363737486097e-06, + "loss": 4.009, + "mean_token_accuracy": 0.35014090177133655, + "step": 4707 + }, + { + "epoch": 0.8728216536892844, + "grad_norm": 6.5546875, + "learning_rate": 9.127178346310716e-06, + "loss": 3.5426, + "mean_token_accuracy": 0.38655102259342544, + "step": 4708 + }, + { + "epoch": 0.8730070448646644, + "grad_norm": 7.765625, + "learning_rate": 9.126992955135336e-06, + "loss": 2.8556, + "mean_token_accuracy": 0.42280720681842704, + "step": 4709 + }, + { + "epoch": 0.8731924360400445, + "grad_norm": 7.40625, + "learning_rate": 9.126807563959957e-06, + "loss": 3.0304, + "mean_token_accuracy": 0.4362083218517498, + "step": 4710 + }, + { + "epoch": 0.8733778272154246, + "grad_norm": 6.37109375, + "learning_rate": 9.126622172784576e-06, + "loss": 3.0044, + "mean_token_accuracy": 0.4462358923477614, + "step": 4711 + }, + { + "epoch": 0.8735632183908046, + "grad_norm": 7.22265625, + "learning_rate": 9.126436781609196e-06, + "loss": 2.7918, + "mean_token_accuracy": 0.44680015704750686, + "step": 4712 + }, + { + "epoch": 0.8737486095661846, + "grad_norm": 6.54296875, + "learning_rate": 9.126251390433815e-06, + "loss": 2.6877, + "mean_token_accuracy": 0.45240715268225584, + "step": 4713 + }, + { + "epoch": 0.8739340007415647, + "grad_norm": 6.5234375, + "learning_rate": 9.126065999258435e-06, + "loss": 2.6366, + "mean_token_accuracy": 0.47361729179911, + "step": 4714 + }, + { + "epoch": 0.8741193919169448, + "grad_norm": 5.83203125, + "learning_rate": 9.125880608083056e-06, + "loss": 2.311, + "mean_token_accuracy": 0.5197648489762822, + "step": 4715 + }, + { + "epoch": 0.8743047830923248, + "grad_norm": 6.9140625, + "learning_rate": 9.125695216907676e-06, + "loss": 2.6729, + "mean_token_accuracy": 0.44745044258820593, + "step": 4716 + }, + { + "epoch": 0.8744901742677048, + "grad_norm": 7.6953125, + "learning_rate": 9.125509825732295e-06, + "loss": 3.0622, + "mean_token_accuracy": 0.4200859710824541, + "step": 4717 + }, + { + "epoch": 0.8746755654430849, + "grad_norm": 7.44921875, + "learning_rate": 9.125324434556916e-06, + "loss": 2.6061, + "mean_token_accuracy": 0.4707744249841739, + "step": 4718 + }, + { + "epoch": 0.874860956618465, + "grad_norm": 6.1328125, + "learning_rate": 9.125139043381536e-06, + "loss": 2.705, + "mean_token_accuracy": 0.44725453070056803, + "step": 4719 + }, + { + "epoch": 0.875046347793845, + "grad_norm": 6.63671875, + "learning_rate": 9.124953652206155e-06, + "loss": 3.3261, + "mean_token_accuracy": 0.40443250503693756, + "step": 4720 + }, + { + "epoch": 0.875231738969225, + "grad_norm": 7.99609375, + "learning_rate": 9.124768261030775e-06, + "loss": 2.7319, + "mean_token_accuracy": 0.46303901437371664, + "step": 4721 + }, + { + "epoch": 0.8754171301446051, + "grad_norm": 8.4765625, + "learning_rate": 9.124582869855394e-06, + "loss": 2.665, + "mean_token_accuracy": 0.46344249809596344, + "step": 4722 + }, + { + "epoch": 0.8756025213199852, + "grad_norm": 6.80859375, + "learning_rate": 9.124397478680016e-06, + "loss": 3.0257, + "mean_token_accuracy": 0.42859334253719894, + "step": 4723 + }, + { + "epoch": 0.8757879124953653, + "grad_norm": 7.671875, + "learning_rate": 9.124212087504635e-06, + "loss": 2.7506, + "mean_token_accuracy": 0.44644484958979036, + "step": 4724 + }, + { + "epoch": 0.8759733036707452, + "grad_norm": 7.0, + "learning_rate": 9.124026696329256e-06, + "loss": 2.346, + "mean_token_accuracy": 0.49849883242521964, + "step": 4725 + }, + { + "epoch": 0.8761586948461253, + "grad_norm": 7.01953125, + "learning_rate": 9.123841305153876e-06, + "loss": 2.2492, + "mean_token_accuracy": 0.5295158771473191, + "step": 4726 + }, + { + "epoch": 0.8763440860215054, + "grad_norm": 8.5234375, + "learning_rate": 9.123655913978495e-06, + "loss": 2.83, + "mean_token_accuracy": 0.44706454357603015, + "step": 4727 + }, + { + "epoch": 0.8765294771968855, + "grad_norm": 6.03515625, + "learning_rate": 9.123470522803116e-06, + "loss": 2.5547, + "mean_token_accuracy": 0.48062110282430615, + "step": 4728 + }, + { + "epoch": 0.8767148683722655, + "grad_norm": 6.25390625, + "learning_rate": 9.123285131627734e-06, + "loss": 3.1319, + "mean_token_accuracy": 0.4159621578099839, + "step": 4729 + }, + { + "epoch": 0.8769002595476455, + "grad_norm": 6.1953125, + "learning_rate": 9.123099740452355e-06, + "loss": 3.3989, + "mean_token_accuracy": 0.3944636678200692, + "step": 4730 + }, + { + "epoch": 0.8770856507230256, + "grad_norm": 4.66015625, + "learning_rate": 9.122914349276975e-06, + "loss": 2.6745, + "mean_token_accuracy": 0.4903914590747331, + "step": 4731 + }, + { + "epoch": 0.8772710418984057, + "grad_norm": 7.9921875, + "learning_rate": 9.122728958101596e-06, + "loss": 2.5371, + "mean_token_accuracy": 0.4905070618198657, + "step": 4732 + }, + { + "epoch": 0.8774564330737857, + "grad_norm": 6.49609375, + "learning_rate": 9.122543566926215e-06, + "loss": 2.2538, + "mean_token_accuracy": 0.513311052206984, + "step": 4733 + }, + { + "epoch": 0.8776418242491657, + "grad_norm": 6.55078125, + "learning_rate": 9.122358175750835e-06, + "loss": 3.086, + "mean_token_accuracy": 0.4284332688588008, + "step": 4734 + }, + { + "epoch": 0.8778272154245458, + "grad_norm": 8.53125, + "learning_rate": 9.122172784575456e-06, + "loss": 2.7871, + "mean_token_accuracy": 0.45256108148331886, + "step": 4735 + }, + { + "epoch": 0.8780126065999259, + "grad_norm": 6.60546875, + "learning_rate": 9.121987393400074e-06, + "loss": 2.8625, + "mean_token_accuracy": 0.45506820005349025, + "step": 4736 + }, + { + "epoch": 0.8781979977753059, + "grad_norm": 7.89453125, + "learning_rate": 9.121802002224695e-06, + "loss": 3.0716, + "mean_token_accuracy": 0.4388914837303441, + "step": 4737 + }, + { + "epoch": 0.8783833889506859, + "grad_norm": 8.890625, + "learning_rate": 9.121616611049314e-06, + "loss": 2.8772, + "mean_token_accuracy": 0.43386714116251485, + "step": 4738 + }, + { + "epoch": 0.878568780126066, + "grad_norm": 6.2890625, + "learning_rate": 9.121431219873936e-06, + "loss": 3.6184, + "mean_token_accuracy": 0.3801114140097191, + "step": 4739 + }, + { + "epoch": 0.8787541713014461, + "grad_norm": 11.1015625, + "learning_rate": 9.121245828698555e-06, + "loss": 2.7374, + "mean_token_accuracy": 0.45813860328481776, + "step": 4740 + }, + { + "epoch": 0.8789395624768261, + "grad_norm": 11.6171875, + "learning_rate": 9.121060437523175e-06, + "loss": 2.805, + "mean_token_accuracy": 0.4422068386826096, + "step": 4741 + }, + { + "epoch": 0.8791249536522061, + "grad_norm": 6.84765625, + "learning_rate": 9.120875046347794e-06, + "loss": 2.8808, + "mean_token_accuracy": 0.4439571150097466, + "step": 4742 + }, + { + "epoch": 0.8793103448275862, + "grad_norm": 6.13671875, + "learning_rate": 9.120689655172414e-06, + "loss": 3.0456, + "mean_token_accuracy": 0.4279114740008595, + "step": 4743 + }, + { + "epoch": 0.8794957360029663, + "grad_norm": 7.38671875, + "learning_rate": 9.120504263997035e-06, + "loss": 3.1233, + "mean_token_accuracy": 0.4172205438066465, + "step": 4744 + }, + { + "epoch": 0.8796811271783463, + "grad_norm": 8.1171875, + "learning_rate": 9.120318872821654e-06, + "loss": 2.2689, + "mean_token_accuracy": 0.5117899761336515, + "step": 4745 + }, + { + "epoch": 0.8798665183537263, + "grad_norm": 7.97265625, + "learning_rate": 9.120133481646274e-06, + "loss": 2.6436, + "mean_token_accuracy": 0.46176279974076473, + "step": 4746 + }, + { + "epoch": 0.8800519095291064, + "grad_norm": 7.94140625, + "learning_rate": 9.119948090470895e-06, + "loss": 3.0196, + "mean_token_accuracy": 0.4321796071094481, + "step": 4747 + }, + { + "epoch": 0.8802373007044865, + "grad_norm": 6.90625, + "learning_rate": 9.119762699295515e-06, + "loss": 3.0322, + "mean_token_accuracy": 0.4455611390284757, + "step": 4748 + }, + { + "epoch": 0.8804226918798665, + "grad_norm": 7.18359375, + "learning_rate": 9.119577308120134e-06, + "loss": 2.6006, + "mean_token_accuracy": 0.4733405875952122, + "step": 4749 + }, + { + "epoch": 0.8806080830552465, + "grad_norm": 9.125, + "learning_rate": 9.119391916944754e-06, + "loss": 3.0766, + "mean_token_accuracy": 0.41508737386167854, + "step": 4750 + }, + { + "epoch": 0.8807934742306266, + "grad_norm": 8.4296875, + "learning_rate": 9.119206525769373e-06, + "loss": 2.5999, + "mean_token_accuracy": 0.47165566886622673, + "step": 4751 + }, + { + "epoch": 0.8809788654060067, + "grad_norm": 5.83984375, + "learning_rate": 9.119021134593994e-06, + "loss": 2.756, + "mean_token_accuracy": 0.46314203189752606, + "step": 4752 + }, + { + "epoch": 0.8811642565813868, + "grad_norm": 7.2109375, + "learning_rate": 9.118835743418614e-06, + "loss": 2.9488, + "mean_token_accuracy": 0.42040443971415536, + "step": 4753 + }, + { + "epoch": 0.8813496477567668, + "grad_norm": 9.625, + "learning_rate": 9.118650352243233e-06, + "loss": 2.9176, + "mean_token_accuracy": 0.4297745529930034, + "step": 4754 + }, + { + "epoch": 0.8815350389321468, + "grad_norm": 10.4921875, + "learning_rate": 9.118464961067854e-06, + "loss": 2.4174, + "mean_token_accuracy": 0.48940269749518306, + "step": 4755 + }, + { + "epoch": 0.8817204301075269, + "grad_norm": 6.23046875, + "learning_rate": 9.118279569892474e-06, + "loss": 2.7697, + "mean_token_accuracy": 0.44727891156462585, + "step": 4756 + }, + { + "epoch": 0.881905821282907, + "grad_norm": 6.4609375, + "learning_rate": 9.118094178717095e-06, + "loss": 2.4987, + "mean_token_accuracy": 0.4994364609749225, + "step": 4757 + }, + { + "epoch": 0.882091212458287, + "grad_norm": 7.0625, + "learning_rate": 9.117908787541713e-06, + "loss": 3.2226, + "mean_token_accuracy": 0.40342243596913097, + "step": 4758 + }, + { + "epoch": 0.882276603633667, + "grad_norm": 6.296875, + "learning_rate": 9.117723396366334e-06, + "loss": 2.9266, + "mean_token_accuracy": 0.45411003236245956, + "step": 4759 + }, + { + "epoch": 0.8824619948090471, + "grad_norm": 8.5703125, + "learning_rate": 9.117538005190953e-06, + "loss": 2.6909, + "mean_token_accuracy": 0.46436443791329907, + "step": 4760 + }, + { + "epoch": 0.8826473859844272, + "grad_norm": 9.6171875, + "learning_rate": 9.117352614015573e-06, + "loss": 2.5413, + "mean_token_accuracy": 0.47476261869065467, + "step": 4761 + }, + { + "epoch": 0.8828327771598072, + "grad_norm": 10.375, + "learning_rate": 9.117167222840194e-06, + "loss": 2.6112, + "mean_token_accuracy": 0.48375410392845014, + "step": 4762 + }, + { + "epoch": 0.8830181683351872, + "grad_norm": 6.3671875, + "learning_rate": 9.116981831664814e-06, + "loss": 2.7955, + "mean_token_accuracy": 0.4416274790431405, + "step": 4763 + }, + { + "epoch": 0.8832035595105673, + "grad_norm": 7.96484375, + "learning_rate": 9.116796440489435e-06, + "loss": 2.6604, + "mean_token_accuracy": 0.4526788142507479, + "step": 4764 + }, + { + "epoch": 0.8833889506859474, + "grad_norm": 6.90234375, + "learning_rate": 9.116611049314053e-06, + "loss": 2.7788, + "mean_token_accuracy": 0.4520460358056266, + "step": 4765 + }, + { + "epoch": 0.8835743418613274, + "grad_norm": 5.70703125, + "learning_rate": 9.116425658138674e-06, + "loss": 3.2422, + "mean_token_accuracy": 0.40122341165021547, + "step": 4766 + }, + { + "epoch": 0.8837597330367074, + "grad_norm": 5.23046875, + "learning_rate": 9.116240266963293e-06, + "loss": 2.397, + "mean_token_accuracy": 0.5078902402251483, + "step": 4767 + }, + { + "epoch": 0.8839451242120875, + "grad_norm": 7.015625, + "learning_rate": 9.116054875787913e-06, + "loss": 2.812, + "mean_token_accuracy": 0.4486486486486487, + "step": 4768 + }, + { + "epoch": 0.8841305153874676, + "grad_norm": 7.0703125, + "learning_rate": 9.115869484612534e-06, + "loss": 2.9554, + "mean_token_accuracy": 0.4558604973744179, + "step": 4769 + }, + { + "epoch": 0.8843159065628476, + "grad_norm": 4.76953125, + "learning_rate": 9.115684093437152e-06, + "loss": 2.8999, + "mean_token_accuracy": 0.44882600842865744, + "step": 4770 + }, + { + "epoch": 0.8845012977382276, + "grad_norm": 11.4921875, + "learning_rate": 9.115498702261773e-06, + "loss": 2.7147, + "mean_token_accuracy": 0.42719725919093426, + "step": 4771 + }, + { + "epoch": 0.8846866889136077, + "grad_norm": 8.71875, + "learning_rate": 9.115313311086393e-06, + "loss": 3.3462, + "mean_token_accuracy": 0.4055905727596602, + "step": 4772 + }, + { + "epoch": 0.8848720800889878, + "grad_norm": 6.21484375, + "learning_rate": 9.115127919911014e-06, + "loss": 2.5715, + "mean_token_accuracy": 0.47835547411818596, + "step": 4773 + }, + { + "epoch": 0.8850574712643678, + "grad_norm": 5.63671875, + "learning_rate": 9.114942528735633e-06, + "loss": 2.8258, + "mean_token_accuracy": 0.4738380590470623, + "step": 4774 + }, + { + "epoch": 0.8852428624397478, + "grad_norm": 7.390625, + "learning_rate": 9.114757137560253e-06, + "loss": 3.3038, + "mean_token_accuracy": 0.40875232774674114, + "step": 4775 + }, + { + "epoch": 0.8854282536151279, + "grad_norm": 11.453125, + "learning_rate": 9.114571746384872e-06, + "loss": 2.7958, + "mean_token_accuracy": 0.43519048163825647, + "step": 4776 + }, + { + "epoch": 0.885613644790508, + "grad_norm": 6.7578125, + "learning_rate": 9.114386355209493e-06, + "loss": 2.9668, + "mean_token_accuracy": 0.434561791899987, + "step": 4777 + }, + { + "epoch": 0.885799035965888, + "grad_norm": 8.1484375, + "learning_rate": 9.114200964034113e-06, + "loss": 2.9992, + "mean_token_accuracy": 0.43731629392971244, + "step": 4778 + }, + { + "epoch": 0.8859844271412681, + "grad_norm": 7.33203125, + "learning_rate": 9.114015572858733e-06, + "loss": 2.7213, + "mean_token_accuracy": 0.4538140643623361, + "step": 4779 + }, + { + "epoch": 0.8861698183166481, + "grad_norm": 4.703125, + "learning_rate": 9.113830181683352e-06, + "loss": 2.7574, + "mean_token_accuracy": 0.45145468732582766, + "step": 4780 + }, + { + "epoch": 0.8863552094920282, + "grad_norm": 6.1484375, + "learning_rate": 9.113644790507973e-06, + "loss": 2.5406, + "mean_token_accuracy": 0.46855345911949686, + "step": 4781 + }, + { + "epoch": 0.8865406006674083, + "grad_norm": 5.68359375, + "learning_rate": 9.113459399332593e-06, + "loss": 2.9505, + "mean_token_accuracy": 0.44532130777903045, + "step": 4782 + }, + { + "epoch": 0.8867259918427883, + "grad_norm": 10.1640625, + "learning_rate": 9.113274008157212e-06, + "loss": 2.5814, + "mean_token_accuracy": 0.471386040357261, + "step": 4783 + }, + { + "epoch": 0.8869113830181683, + "grad_norm": 8.9609375, + "learning_rate": 9.113088616981833e-06, + "loss": 2.58, + "mean_token_accuracy": 0.45769427402862983, + "step": 4784 + }, + { + "epoch": 0.8870967741935484, + "grad_norm": 6.71875, + "learning_rate": 9.112903225806451e-06, + "loss": 2.7992, + "mean_token_accuracy": 0.4433485078401619, + "step": 4785 + }, + { + "epoch": 0.8872821653689285, + "grad_norm": 6.34375, + "learning_rate": 9.112717834631072e-06, + "loss": 3.2165, + "mean_token_accuracy": 0.4023738872403561, + "step": 4786 + }, + { + "epoch": 0.8874675565443085, + "grad_norm": 6.484375, + "learning_rate": 9.112532443455692e-06, + "loss": 2.7502, + "mean_token_accuracy": 0.45825486503452606, + "step": 4787 + }, + { + "epoch": 0.8876529477196885, + "grad_norm": 5.6796875, + "learning_rate": 9.112347052280313e-06, + "loss": 3.1176, + "mean_token_accuracy": 0.4289069171648164, + "step": 4788 + }, + { + "epoch": 0.8878383388950686, + "grad_norm": 6.32421875, + "learning_rate": 9.112161661104932e-06, + "loss": 3.4602, + "mean_token_accuracy": 0.38835965026751923, + "step": 4789 + }, + { + "epoch": 0.8880237300704487, + "grad_norm": 10.3125, + "learning_rate": 9.111976269929552e-06, + "loss": 2.7586, + "mean_token_accuracy": 0.4556019070321812, + "step": 4790 + }, + { + "epoch": 0.8882091212458287, + "grad_norm": 8.6796875, + "learning_rate": 9.111790878754173e-06, + "loss": 3.0236, + "mean_token_accuracy": 0.44246277915632753, + "step": 4791 + }, + { + "epoch": 0.8883945124212087, + "grad_norm": 8.328125, + "learning_rate": 9.111605487578791e-06, + "loss": 2.9589, + "mean_token_accuracy": 0.433116413593637, + "step": 4792 + }, + { + "epoch": 0.8885799035965888, + "grad_norm": 9.8828125, + "learning_rate": 9.111420096403412e-06, + "loss": 2.6919, + "mean_token_accuracy": 0.469671603348358, + "step": 4793 + }, + { + "epoch": 0.8887652947719689, + "grad_norm": 8.734375, + "learning_rate": 9.11123470522803e-06, + "loss": 3.0708, + "mean_token_accuracy": 0.427466483327604, + "step": 4794 + }, + { + "epoch": 0.8889506859473489, + "grad_norm": 5.65234375, + "learning_rate": 9.111049314052653e-06, + "loss": 3.2376, + "mean_token_accuracy": 0.4072338380197068, + "step": 4795 + }, + { + "epoch": 0.8891360771227289, + "grad_norm": 6.23046875, + "learning_rate": 9.110863922877272e-06, + "loss": 3.0328, + "mean_token_accuracy": 0.42715812988670815, + "step": 4796 + }, + { + "epoch": 0.889321468298109, + "grad_norm": 7.4765625, + "learning_rate": 9.110678531701892e-06, + "loss": 2.4425, + "mean_token_accuracy": 0.48037399664814323, + "step": 4797 + }, + { + "epoch": 0.8895068594734891, + "grad_norm": 6.67578125, + "learning_rate": 9.110493140526511e-06, + "loss": 2.8499, + "mean_token_accuracy": 0.44973821989528795, + "step": 4798 + }, + { + "epoch": 0.8896922506488691, + "grad_norm": 5.30859375, + "learning_rate": 9.110307749351131e-06, + "loss": 2.9762, + "mean_token_accuracy": 0.4624066126212049, + "step": 4799 + }, + { + "epoch": 0.8898776418242491, + "grad_norm": 6.1484375, + "learning_rate": 9.110122358175752e-06, + "loss": 2.4517, + "mean_token_accuracy": 0.4829152249134948, + "step": 4800 + }, + { + "epoch": 0.8900630329996292, + "grad_norm": 7.984375, + "learning_rate": 9.10993696700037e-06, + "loss": 3.2612, + "mean_token_accuracy": 0.4140942073545689, + "step": 4801 + }, + { + "epoch": 0.8902484241750093, + "grad_norm": 7.46875, + "learning_rate": 9.109751575824991e-06, + "loss": 3.1539, + "mean_token_accuracy": 0.4131321370309951, + "step": 4802 + }, + { + "epoch": 0.8904338153503893, + "grad_norm": 5.3828125, + "learning_rate": 9.109566184649612e-06, + "loss": 2.6659, + "mean_token_accuracy": 0.48419756464632646, + "step": 4803 + }, + { + "epoch": 0.8906192065257694, + "grad_norm": 6.41796875, + "learning_rate": 9.109380793474232e-06, + "loss": 3.3095, + "mean_token_accuracy": 0.39717925386715197, + "step": 4804 + }, + { + "epoch": 0.8908045977011494, + "grad_norm": 16.21875, + "learning_rate": 9.109195402298851e-06, + "loss": 2.0979, + "mean_token_accuracy": 0.5234252174824442, + "step": 4805 + }, + { + "epoch": 0.8909899888765295, + "grad_norm": 7.30859375, + "learning_rate": 9.109010011123472e-06, + "loss": 2.5693, + "mean_token_accuracy": 0.47193611850480266, + "step": 4806 + }, + { + "epoch": 0.8911753800519095, + "grad_norm": 6.91796875, + "learning_rate": 9.108824619948092e-06, + "loss": 2.8091, + "mean_token_accuracy": 0.4536549707602339, + "step": 4807 + }, + { + "epoch": 0.8913607712272896, + "grad_norm": 7.12109375, + "learning_rate": 9.10863922877271e-06, + "loss": 2.9058, + "mean_token_accuracy": 0.43007518796992483, + "step": 4808 + }, + { + "epoch": 0.8915461624026696, + "grad_norm": 8.875, + "learning_rate": 9.108453837597331e-06, + "loss": 2.5582, + "mean_token_accuracy": 0.4794846134600951, + "step": 4809 + }, + { + "epoch": 0.8917315535780497, + "grad_norm": 7.4140625, + "learning_rate": 9.10826844642195e-06, + "loss": 2.4695, + "mean_token_accuracy": 0.48445154419595315, + "step": 4810 + }, + { + "epoch": 0.8919169447534298, + "grad_norm": 8.40625, + "learning_rate": 9.108083055246572e-06, + "loss": 2.8693, + "mean_token_accuracy": 0.4464115969581749, + "step": 4811 + }, + { + "epoch": 0.8921023359288098, + "grad_norm": 6.73046875, + "learning_rate": 9.107897664071191e-06, + "loss": 3.1149, + "mean_token_accuracy": 0.4265320836337419, + "step": 4812 + }, + { + "epoch": 0.8922877271041898, + "grad_norm": 9.3828125, + "learning_rate": 9.107712272895812e-06, + "loss": 2.5036, + "mean_token_accuracy": 0.4868949482139083, + "step": 4813 + }, + { + "epoch": 0.8924731182795699, + "grad_norm": 5.61328125, + "learning_rate": 9.10752688172043e-06, + "loss": 2.9838, + "mean_token_accuracy": 0.44264175680104884, + "step": 4814 + }, + { + "epoch": 0.89265850945495, + "grad_norm": 5.83984375, + "learning_rate": 9.107341490545051e-06, + "loss": 2.7201, + "mean_token_accuracy": 0.44769403824521936, + "step": 4815 + }, + { + "epoch": 0.89284390063033, + "grad_norm": 5.75390625, + "learning_rate": 9.107156099369671e-06, + "loss": 2.5148, + "mean_token_accuracy": 0.495776630689817, + "step": 4816 + }, + { + "epoch": 0.89302929180571, + "grad_norm": 5.8046875, + "learning_rate": 9.10697070819429e-06, + "loss": 3.2708, + "mean_token_accuracy": 0.4203842396613481, + "step": 4817 + }, + { + "epoch": 0.8932146829810901, + "grad_norm": 5.66796875, + "learning_rate": 9.10678531701891e-06, + "loss": 2.6114, + "mean_token_accuracy": 0.47569060773480665, + "step": 4818 + }, + { + "epoch": 0.8934000741564702, + "grad_norm": 7.0703125, + "learning_rate": 9.106599925843531e-06, + "loss": 2.782, + "mean_token_accuracy": 0.46246396791577893, + "step": 4819 + }, + { + "epoch": 0.8935854653318502, + "grad_norm": 5.36328125, + "learning_rate": 9.106414534668152e-06, + "loss": 3.0879, + "mean_token_accuracy": 0.41668705652067534, + "step": 4820 + }, + { + "epoch": 0.8937708565072302, + "grad_norm": 6.41015625, + "learning_rate": 9.10622914349277e-06, + "loss": 3.0243, + "mean_token_accuracy": 0.42051802945657696, + "step": 4821 + }, + { + "epoch": 0.8939562476826103, + "grad_norm": 8.953125, + "learning_rate": 9.106043752317391e-06, + "loss": 3.012, + "mean_token_accuracy": 0.4723398044081156, + "step": 4822 + }, + { + "epoch": 0.8941416388579904, + "grad_norm": 6.5390625, + "learning_rate": 9.10585836114201e-06, + "loss": 2.4209, + "mean_token_accuracy": 0.4869003062266077, + "step": 4823 + }, + { + "epoch": 0.8943270300333704, + "grad_norm": 5.8203125, + "learning_rate": 9.10567296996663e-06, + "loss": 2.9491, + "mean_token_accuracy": 0.4342634192480492, + "step": 4824 + }, + { + "epoch": 0.8945124212087505, + "grad_norm": 7.11328125, + "learning_rate": 9.10548757879125e-06, + "loss": 3.2459, + "mean_token_accuracy": 0.41229985443959244, + "step": 4825 + }, + { + "epoch": 0.8946978123841305, + "grad_norm": 4.9609375, + "learning_rate": 9.10530218761587e-06, + "loss": 2.8846, + "mean_token_accuracy": 0.4488990444536768, + "step": 4826 + }, + { + "epoch": 0.8948832035595106, + "grad_norm": 5.5078125, + "learning_rate": 9.10511679644049e-06, + "loss": 2.9612, + "mean_token_accuracy": 0.43195975083852417, + "step": 4827 + }, + { + "epoch": 0.8950685947348906, + "grad_norm": 6.140625, + "learning_rate": 9.10493140526511e-06, + "loss": 3.4644, + "mean_token_accuracy": 0.3847089487402259, + "step": 4828 + }, + { + "epoch": 0.8952539859102707, + "grad_norm": 6.36328125, + "learning_rate": 9.104746014089731e-06, + "loss": 2.7245, + "mean_token_accuracy": 0.47996424847311187, + "step": 4829 + }, + { + "epoch": 0.8954393770856507, + "grad_norm": 5.13671875, + "learning_rate": 9.10456062291435e-06, + "loss": 2.7935, + "mean_token_accuracy": 0.46265440965239873, + "step": 4830 + }, + { + "epoch": 0.8956247682610308, + "grad_norm": 5.953125, + "learning_rate": 9.10437523173897e-06, + "loss": 3.0615, + "mean_token_accuracy": 0.4243281471004243, + "step": 4831 + }, + { + "epoch": 0.8958101594364108, + "grad_norm": 6.0625, + "learning_rate": 9.104189840563589e-06, + "loss": 2.9999, + "mean_token_accuracy": 0.41274362818590704, + "step": 4832 + }, + { + "epoch": 0.8959955506117909, + "grad_norm": 5.31640625, + "learning_rate": 9.10400444938821e-06, + "loss": 3.0061, + "mean_token_accuracy": 0.44740400216333154, + "step": 4833 + }, + { + "epoch": 0.8961809417871709, + "grad_norm": 5.1328125, + "learning_rate": 9.10381905821283e-06, + "loss": 2.7098, + "mean_token_accuracy": 0.47615085967831394, + "step": 4834 + }, + { + "epoch": 0.896366332962551, + "grad_norm": 5.6640625, + "learning_rate": 9.103633667037449e-06, + "loss": 2.8705, + "mean_token_accuracy": 0.44950625411454903, + "step": 4835 + }, + { + "epoch": 0.896551724137931, + "grad_norm": 6.09765625, + "learning_rate": 9.10344827586207e-06, + "loss": 2.4584, + "mean_token_accuracy": 0.48600987538208323, + "step": 4836 + }, + { + "epoch": 0.8967371153133111, + "grad_norm": 5.671875, + "learning_rate": 9.10326288468669e-06, + "loss": 2.6915, + "mean_token_accuracy": 0.4685051389030223, + "step": 4837 + }, + { + "epoch": 0.8969225064886911, + "grad_norm": 6.7421875, + "learning_rate": 9.10307749351131e-06, + "loss": 2.7308, + "mean_token_accuracy": 0.4481563626282229, + "step": 4838 + }, + { + "epoch": 0.8971078976640712, + "grad_norm": 6.125, + "learning_rate": 9.102892102335929e-06, + "loss": 2.7726, + "mean_token_accuracy": 0.44919590643274854, + "step": 4839 + }, + { + "epoch": 0.8972932888394513, + "grad_norm": 7.3515625, + "learning_rate": 9.10270671116055e-06, + "loss": 3.0127, + "mean_token_accuracy": 0.4177123552123552, + "step": 4840 + }, + { + "epoch": 0.8974786800148313, + "grad_norm": 9.078125, + "learning_rate": 9.102521319985168e-06, + "loss": 2.6159, + "mean_token_accuracy": 0.467239878101872, + "step": 4841 + }, + { + "epoch": 0.8976640711902113, + "grad_norm": 6.62109375, + "learning_rate": 9.102335928809789e-06, + "loss": 2.7854, + "mean_token_accuracy": 0.4451139638459523, + "step": 4842 + }, + { + "epoch": 0.8978494623655914, + "grad_norm": 6.59765625, + "learning_rate": 9.10215053763441e-06, + "loss": 2.6825, + "mean_token_accuracy": 0.4688304997424008, + "step": 4843 + }, + { + "epoch": 0.8980348535409715, + "grad_norm": 5.6640625, + "learning_rate": 9.10196514645903e-06, + "loss": 2.5821, + "mean_token_accuracy": 0.4620289033720601, + "step": 4844 + }, + { + "epoch": 0.8982202447163515, + "grad_norm": 10.984375, + "learning_rate": 9.10177975528365e-06, + "loss": 2.5949, + "mean_token_accuracy": 0.4669365721997301, + "step": 4845 + }, + { + "epoch": 0.8984056358917315, + "grad_norm": 6.08984375, + "learning_rate": 9.101594364108269e-06, + "loss": 3.2982, + "mean_token_accuracy": 0.4004474272930649, + "step": 4846 + }, + { + "epoch": 0.8985910270671116, + "grad_norm": 6.26953125, + "learning_rate": 9.10140897293289e-06, + "loss": 3.2974, + "mean_token_accuracy": 0.456050796812749, + "step": 4847 + }, + { + "epoch": 0.8987764182424917, + "grad_norm": 6.87890625, + "learning_rate": 9.101223581757508e-06, + "loss": 2.7279, + "mean_token_accuracy": 0.4588701684836472, + "step": 4848 + }, + { + "epoch": 0.8989618094178717, + "grad_norm": 5.80859375, + "learning_rate": 9.101038190582129e-06, + "loss": 2.1861, + "mean_token_accuracy": 0.5260521042084169, + "step": 4849 + }, + { + "epoch": 0.8991472005932518, + "grad_norm": 5.15234375, + "learning_rate": 9.10085279940675e-06, + "loss": 2.8569, + "mean_token_accuracy": 0.438132390588809, + "step": 4850 + }, + { + "epoch": 0.8993325917686318, + "grad_norm": 7.42578125, + "learning_rate": 9.100667408231368e-06, + "loss": 2.7418, + "mean_token_accuracy": 0.46966378977199535, + "step": 4851 + }, + { + "epoch": 0.8995179829440119, + "grad_norm": 5.69140625, + "learning_rate": 9.100482017055989e-06, + "loss": 2.8113, + "mean_token_accuracy": 0.4586403613025909, + "step": 4852 + }, + { + "epoch": 0.8997033741193919, + "grad_norm": 7.87890625, + "learning_rate": 9.10029662588061e-06, + "loss": 3.0107, + "mean_token_accuracy": 0.42656436827421285, + "step": 4853 + }, + { + "epoch": 0.899888765294772, + "grad_norm": 9.734375, + "learning_rate": 9.10011123470523e-06, + "loss": 2.8243, + "mean_token_accuracy": 0.4418943533697632, + "step": 4854 + }, + { + "epoch": 0.900074156470152, + "grad_norm": 7.9140625, + "learning_rate": 9.099925843529848e-06, + "loss": 2.5415, + "mean_token_accuracy": 0.4780982261373235, + "step": 4855 + }, + { + "epoch": 0.9002595476455321, + "grad_norm": 6.43359375, + "learning_rate": 9.099740452354469e-06, + "loss": 2.6513, + "mean_token_accuracy": 0.46058631921824106, + "step": 4856 + }, + { + "epoch": 0.9004449388209121, + "grad_norm": 9.59375, + "learning_rate": 9.099555061179088e-06, + "loss": 2.3358, + "mean_token_accuracy": 0.5088495575221239, + "step": 4857 + }, + { + "epoch": 0.9006303299962922, + "grad_norm": 7.1015625, + "learning_rate": 9.099369670003708e-06, + "loss": 2.4836, + "mean_token_accuracy": 0.49608114338404796, + "step": 4858 + }, + { + "epoch": 0.9008157211716722, + "grad_norm": 6.95703125, + "learning_rate": 9.099184278828329e-06, + "loss": 3.0634, + "mean_token_accuracy": 0.42346771550311374, + "step": 4859 + }, + { + "epoch": 0.9010011123470523, + "grad_norm": 6.63671875, + "learning_rate": 9.09899888765295e-06, + "loss": 2.7997, + "mean_token_accuracy": 0.44312078898545065, + "step": 4860 + }, + { + "epoch": 0.9011865035224323, + "grad_norm": 6.04296875, + "learning_rate": 9.098813496477568e-06, + "loss": 2.9203, + "mean_token_accuracy": 0.44080480922586185, + "step": 4861 + }, + { + "epoch": 0.9013718946978124, + "grad_norm": 5.6796875, + "learning_rate": 9.098628105302189e-06, + "loss": 2.5721, + "mean_token_accuracy": 0.4629313738354547, + "step": 4862 + }, + { + "epoch": 0.9015572858731924, + "grad_norm": 6.578125, + "learning_rate": 9.098442714126809e-06, + "loss": 3.5373, + "mean_token_accuracy": 0.3878078650156484, + "step": 4863 + }, + { + "epoch": 0.9017426770485725, + "grad_norm": 6.484375, + "learning_rate": 9.098257322951428e-06, + "loss": 2.6734, + "mean_token_accuracy": 0.46557745073584433, + "step": 4864 + }, + { + "epoch": 0.9019280682239526, + "grad_norm": 5.06640625, + "learning_rate": 9.098071931776048e-06, + "loss": 3.0073, + "mean_token_accuracy": 0.4343629343629344, + "step": 4865 + }, + { + "epoch": 0.9021134593993326, + "grad_norm": 4.94921875, + "learning_rate": 9.097886540600667e-06, + "loss": 3.0821, + "mean_token_accuracy": 0.4202827289489859, + "step": 4866 + }, + { + "epoch": 0.9022988505747126, + "grad_norm": 5.8671875, + "learning_rate": 9.097701149425288e-06, + "loss": 3.0367, + "mean_token_accuracy": 0.4293239683933275, + "step": 4867 + }, + { + "epoch": 0.9024842417500927, + "grad_norm": 6.9921875, + "learning_rate": 9.097515758249908e-06, + "loss": 2.4986, + "mean_token_accuracy": 0.4861438679245283, + "step": 4868 + }, + { + "epoch": 0.9026696329254728, + "grad_norm": 6.45703125, + "learning_rate": 9.097330367074529e-06, + "loss": 2.666, + "mean_token_accuracy": 0.4762937265105522, + "step": 4869 + }, + { + "epoch": 0.9028550241008528, + "grad_norm": 6.22265625, + "learning_rate": 9.097144975899147e-06, + "loss": 3.0184, + "mean_token_accuracy": 0.42739095096179897, + "step": 4870 + }, + { + "epoch": 0.9030404152762328, + "grad_norm": 5.359375, + "learning_rate": 9.096959584723768e-06, + "loss": 2.9694, + "mean_token_accuracy": 0.4375404530744337, + "step": 4871 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 8.34375, + "learning_rate": 9.096774193548388e-06, + "loss": 2.6604, + "mean_token_accuracy": 0.47622687662436936, + "step": 4872 + }, + { + "epoch": 0.903411197626993, + "grad_norm": 5.43359375, + "learning_rate": 9.096588802373007e-06, + "loss": 3.0543, + "mean_token_accuracy": 0.4283646888567294, + "step": 4873 + }, + { + "epoch": 0.903596588802373, + "grad_norm": 6.94921875, + "learning_rate": 9.096403411197628e-06, + "loss": 2.6828, + "mean_token_accuracy": 0.4630102040816326, + "step": 4874 + }, + { + "epoch": 0.9037819799777531, + "grad_norm": 8.21875, + "learning_rate": 9.096218020022246e-06, + "loss": 2.8308, + "mean_token_accuracy": 0.46330335028027636, + "step": 4875 + }, + { + "epoch": 0.9039673711531331, + "grad_norm": 6.2421875, + "learning_rate": 9.096032628846869e-06, + "loss": 3.0079, + "mean_token_accuracy": 0.43385469960532086, + "step": 4876 + }, + { + "epoch": 0.9041527623285132, + "grad_norm": 8.203125, + "learning_rate": 9.095847237671487e-06, + "loss": 2.8519, + "mean_token_accuracy": 0.44860203088051187, + "step": 4877 + }, + { + "epoch": 0.9043381535038932, + "grad_norm": 7.59765625, + "learning_rate": 9.095661846496108e-06, + "loss": 2.811, + "mean_token_accuracy": 0.4399512789281364, + "step": 4878 + }, + { + "epoch": 0.9045235446792733, + "grad_norm": 5.359375, + "learning_rate": 9.095476455320727e-06, + "loss": 3.3264, + "mean_token_accuracy": 0.40315804722584386, + "step": 4879 + }, + { + "epoch": 0.9047089358546533, + "grad_norm": 8.3671875, + "learning_rate": 9.095291064145347e-06, + "loss": 2.5031, + "mean_token_accuracy": 0.4665497369080431, + "step": 4880 + }, + { + "epoch": 0.9048943270300334, + "grad_norm": 9.828125, + "learning_rate": 9.095105672969968e-06, + "loss": 2.8232, + "mean_token_accuracy": 0.451505940867643, + "step": 4881 + }, + { + "epoch": 0.9050797182054134, + "grad_norm": 5.53125, + "learning_rate": 9.094920281794587e-06, + "loss": 3.142, + "mean_token_accuracy": 0.4085560016613595, + "step": 4882 + }, + { + "epoch": 0.9052651093807935, + "grad_norm": 7.41015625, + "learning_rate": 9.094734890619207e-06, + "loss": 2.7081, + "mean_token_accuracy": 0.4691276058956314, + "step": 4883 + }, + { + "epoch": 0.9054505005561735, + "grad_norm": 5.84375, + "learning_rate": 9.094549499443827e-06, + "loss": 2.9185, + "mean_token_accuracy": 0.43350785340314135, + "step": 4884 + }, + { + "epoch": 0.9056358917315536, + "grad_norm": 7.765625, + "learning_rate": 9.094364108268448e-06, + "loss": 3.0834, + "mean_token_accuracy": 0.43157327586206895, + "step": 4885 + }, + { + "epoch": 0.9058212829069336, + "grad_norm": 5.93359375, + "learning_rate": 9.094178717093067e-06, + "loss": 3.0475, + "mean_token_accuracy": 0.4337931034482759, + "step": 4886 + }, + { + "epoch": 0.9060066740823137, + "grad_norm": 5.7109375, + "learning_rate": 9.093993325917687e-06, + "loss": 2.9418, + "mean_token_accuracy": 0.4622816032887975, + "step": 4887 + }, + { + "epoch": 0.9061920652576937, + "grad_norm": 5.96484375, + "learning_rate": 9.093807934742308e-06, + "loss": 2.3742, + "mean_token_accuracy": 0.497049356223176, + "step": 4888 + }, + { + "epoch": 0.9063774564330738, + "grad_norm": 6.37890625, + "learning_rate": 9.093622543566927e-06, + "loss": 2.3453, + "mean_token_accuracy": 0.4954711087975013, + "step": 4889 + }, + { + "epoch": 0.9065628476084538, + "grad_norm": 6.46875, + "learning_rate": 9.093437152391547e-06, + "loss": 3.0197, + "mean_token_accuracy": 0.42697674418604653, + "step": 4890 + }, + { + "epoch": 0.9067482387838339, + "grad_norm": 6.8984375, + "learning_rate": 9.093251761216166e-06, + "loss": 3.1975, + "mean_token_accuracy": 0.41771602257924445, + "step": 4891 + }, + { + "epoch": 0.9069336299592139, + "grad_norm": 5.7734375, + "learning_rate": 9.093066370040788e-06, + "loss": 2.5583, + "mean_token_accuracy": 0.4734808102345416, + "step": 4892 + }, + { + "epoch": 0.907119021134594, + "grad_norm": 8.0234375, + "learning_rate": 9.092880978865407e-06, + "loss": 3.2496, + "mean_token_accuracy": 0.42100852074312056, + "step": 4893 + }, + { + "epoch": 0.907304412309974, + "grad_norm": 6.90625, + "learning_rate": 9.092695587690027e-06, + "loss": 2.7741, + "mean_token_accuracy": 0.4510031986042454, + "step": 4894 + }, + { + "epoch": 0.9074898034853541, + "grad_norm": 5.75390625, + "learning_rate": 9.092510196514646e-06, + "loss": 2.8908, + "mean_token_accuracy": 0.433028010088942, + "step": 4895 + }, + { + "epoch": 0.9076751946607341, + "grad_norm": 8.8515625, + "learning_rate": 9.092324805339267e-06, + "loss": 2.8886, + "mean_token_accuracy": 0.43879074396616075, + "step": 4896 + }, + { + "epoch": 0.9078605858361142, + "grad_norm": 7.41796875, + "learning_rate": 9.092139414163887e-06, + "loss": 3.0682, + "mean_token_accuracy": 0.42445565586829526, + "step": 4897 + }, + { + "epoch": 0.9080459770114943, + "grad_norm": 6.703125, + "learning_rate": 9.091954022988506e-06, + "loss": 2.7231, + "mean_token_accuracy": 0.45677498467198036, + "step": 4898 + }, + { + "epoch": 0.9082313681868743, + "grad_norm": 6.51953125, + "learning_rate": 9.091768631813126e-06, + "loss": 2.887, + "mean_token_accuracy": 0.4489100817438692, + "step": 4899 + }, + { + "epoch": 0.9084167593622544, + "grad_norm": 9.203125, + "learning_rate": 9.091583240637747e-06, + "loss": 2.5935, + "mean_token_accuracy": 0.4736216376876207, + "step": 4900 + }, + { + "epoch": 0.9086021505376344, + "grad_norm": 6.875, + "learning_rate": 9.091397849462367e-06, + "loss": 3.0335, + "mean_token_accuracy": 0.4358125318390219, + "step": 4901 + }, + { + "epoch": 0.9087875417130145, + "grad_norm": 6.2578125, + "learning_rate": 9.091212458286986e-06, + "loss": 3.2273, + "mean_token_accuracy": 0.408009286128845, + "step": 4902 + }, + { + "epoch": 0.9089729328883945, + "grad_norm": 6.3203125, + "learning_rate": 9.091027067111607e-06, + "loss": 2.5726, + "mean_token_accuracy": 0.4705722070844687, + "step": 4903 + }, + { + "epoch": 0.9091583240637746, + "grad_norm": 6.73828125, + "learning_rate": 9.090841675936225e-06, + "loss": 2.3397, + "mean_token_accuracy": 0.5137127690946623, + "step": 4904 + }, + { + "epoch": 0.9093437152391546, + "grad_norm": 8.453125, + "learning_rate": 9.090656284760846e-06, + "loss": 2.9487, + "mean_token_accuracy": 0.4239875792469919, + "step": 4905 + }, + { + "epoch": 0.9095291064145347, + "grad_norm": 5.89453125, + "learning_rate": 9.090470893585466e-06, + "loss": 3.0251, + "mean_token_accuracy": 0.43301913738931735, + "step": 4906 + }, + { + "epoch": 0.9097144975899147, + "grad_norm": 7.0625, + "learning_rate": 9.090285502410085e-06, + "loss": 3.1901, + "mean_token_accuracy": 0.43487193535132174, + "step": 4907 + }, + { + "epoch": 0.9098998887652948, + "grad_norm": 6.76953125, + "learning_rate": 9.090100111234706e-06, + "loss": 2.5914, + "mean_token_accuracy": 0.4698652450762624, + "step": 4908 + }, + { + "epoch": 0.9100852799406748, + "grad_norm": 6.2109375, + "learning_rate": 9.089914720059326e-06, + "loss": 2.5262, + "mean_token_accuracy": 0.49608310626702995, + "step": 4909 + }, + { + "epoch": 0.9102706711160549, + "grad_norm": 4.90234375, + "learning_rate": 9.089729328883947e-06, + "loss": 2.761, + "mean_token_accuracy": 0.43695479777954005, + "step": 4910 + }, + { + "epoch": 0.9104560622914349, + "grad_norm": 7.796875, + "learning_rate": 9.089543937708566e-06, + "loss": 3.1522, + "mean_token_accuracy": 0.41713543920380786, + "step": 4911 + }, + { + "epoch": 0.910641453466815, + "grad_norm": 8.828125, + "learning_rate": 9.089358546533186e-06, + "loss": 2.8721, + "mean_token_accuracy": 0.44594594594594594, + "step": 4912 + }, + { + "epoch": 0.910826844642195, + "grad_norm": 6.30859375, + "learning_rate": 9.089173155357805e-06, + "loss": 2.5212, + "mean_token_accuracy": 0.4867914849961529, + "step": 4913 + }, + { + "epoch": 0.9110122358175751, + "grad_norm": 6.24609375, + "learning_rate": 9.088987764182425e-06, + "loss": 3.2299, + "mean_token_accuracy": 0.43695594829860196, + "step": 4914 + }, + { + "epoch": 0.9111976269929551, + "grad_norm": 6.30078125, + "learning_rate": 9.088802373007046e-06, + "loss": 3.4052, + "mean_token_accuracy": 0.4020790020790021, + "step": 4915 + }, + { + "epoch": 0.9113830181683352, + "grad_norm": 5.84375, + "learning_rate": 9.088616981831666e-06, + "loss": 3.1535, + "mean_token_accuracy": 0.4408988764044944, + "step": 4916 + }, + { + "epoch": 0.9115684093437152, + "grad_norm": 9.609375, + "learning_rate": 9.088431590656285e-06, + "loss": 2.4482, + "mean_token_accuracy": 0.4887955182072829, + "step": 4917 + }, + { + "epoch": 0.9117538005190953, + "grad_norm": 10.8125, + "learning_rate": 9.088246199480906e-06, + "loss": 2.8664, + "mean_token_accuracy": 0.4285477453580902, + "step": 4918 + }, + { + "epoch": 0.9119391916944753, + "grad_norm": 5.86328125, + "learning_rate": 9.088060808305526e-06, + "loss": 3.0842, + "mean_token_accuracy": 0.4392743550477088, + "step": 4919 + }, + { + "epoch": 0.9121245828698554, + "grad_norm": 9.0078125, + "learning_rate": 9.087875417130145e-06, + "loss": 3.1244, + "mean_token_accuracy": 0.41763754045307444, + "step": 4920 + }, + { + "epoch": 0.9123099740452354, + "grad_norm": 6.453125, + "learning_rate": 9.087690025954765e-06, + "loss": 3.2078, + "mean_token_accuracy": 0.40724863600935307, + "step": 4921 + }, + { + "epoch": 0.9124953652206155, + "grad_norm": 6.296875, + "learning_rate": 9.087504634779384e-06, + "loss": 3.4093, + "mean_token_accuracy": 0.3962237162065513, + "step": 4922 + }, + { + "epoch": 0.9126807563959956, + "grad_norm": 4.78515625, + "learning_rate": 9.087319243604005e-06, + "loss": 3.1374, + "mean_token_accuracy": 0.41914227071405424, + "step": 4923 + }, + { + "epoch": 0.9128661475713756, + "grad_norm": 6.56640625, + "learning_rate": 9.087133852428625e-06, + "loss": 3.7136, + "mean_token_accuracy": 0.3911639512696316, + "step": 4924 + }, + { + "epoch": 0.9130515387467557, + "grad_norm": 8.0703125, + "learning_rate": 9.086948461253246e-06, + "loss": 2.7809, + "mean_token_accuracy": 0.44657503879405896, + "step": 4925 + }, + { + "epoch": 0.9132369299221357, + "grad_norm": 5.515625, + "learning_rate": 9.086763070077866e-06, + "loss": 3.0026, + "mean_token_accuracy": 0.4470687984496124, + "step": 4926 + }, + { + "epoch": 0.9134223210975158, + "grad_norm": 7.2265625, + "learning_rate": 9.086577678902485e-06, + "loss": 2.9416, + "mean_token_accuracy": 0.4349683108294296, + "step": 4927 + }, + { + "epoch": 0.9136077122728958, + "grad_norm": 6.5546875, + "learning_rate": 9.086392287727105e-06, + "loss": 2.9747, + "mean_token_accuracy": 0.4269135480505933, + "step": 4928 + }, + { + "epoch": 0.9137931034482759, + "grad_norm": 6.421875, + "learning_rate": 9.086206896551724e-06, + "loss": 2.5886, + "mean_token_accuracy": 0.46509768079515595, + "step": 4929 + }, + { + "epoch": 0.9139784946236559, + "grad_norm": 5.7265625, + "learning_rate": 9.086021505376345e-06, + "loss": 2.9867, + "mean_token_accuracy": 0.45144508670520234, + "step": 4930 + }, + { + "epoch": 0.914163885799036, + "grad_norm": 5.99609375, + "learning_rate": 9.085836114200965e-06, + "loss": 2.3185, + "mean_token_accuracy": 0.5096470588235295, + "step": 4931 + }, + { + "epoch": 0.914349276974416, + "grad_norm": 5.9296875, + "learning_rate": 9.085650723025586e-06, + "loss": 2.8879, + "mean_token_accuracy": 0.4369295062135709, + "step": 4932 + }, + { + "epoch": 0.9145346681497961, + "grad_norm": 4.78125, + "learning_rate": 9.085465331850204e-06, + "loss": 2.8352, + "mean_token_accuracy": 0.4431062601932004, + "step": 4933 + }, + { + "epoch": 0.9147200593251761, + "grad_norm": 5.80859375, + "learning_rate": 9.085279940674825e-06, + "loss": 2.7868, + "mean_token_accuracy": 0.450063211125158, + "step": 4934 + }, + { + "epoch": 0.9149054505005562, + "grad_norm": 9.8984375, + "learning_rate": 9.085094549499445e-06, + "loss": 3.4765, + "mean_token_accuracy": 0.4296482412060301, + "step": 4935 + }, + { + "epoch": 0.9150908416759362, + "grad_norm": 7.19140625, + "learning_rate": 9.084909158324064e-06, + "loss": 2.9212, + "mean_token_accuracy": 0.4395873629916183, + "step": 4936 + }, + { + "epoch": 0.9152762328513163, + "grad_norm": 5.58984375, + "learning_rate": 9.084723767148685e-06, + "loss": 2.9659, + "mean_token_accuracy": 0.44335497548694847, + "step": 4937 + }, + { + "epoch": 0.9154616240266963, + "grad_norm": 5.78125, + "learning_rate": 9.084538375973304e-06, + "loss": 2.6547, + "mean_token_accuracy": 0.473407056345445, + "step": 4938 + }, + { + "epoch": 0.9156470152020764, + "grad_norm": 5.8203125, + "learning_rate": 9.084352984797924e-06, + "loss": 2.3808, + "mean_token_accuracy": 0.5084867320105187, + "step": 4939 + }, + { + "epoch": 0.9158324063774564, + "grad_norm": 8.0703125, + "learning_rate": 9.084167593622545e-06, + "loss": 2.943, + "mean_token_accuracy": 0.42499644532916253, + "step": 4940 + }, + { + "epoch": 0.9160177975528365, + "grad_norm": 6.33203125, + "learning_rate": 9.083982202447165e-06, + "loss": 3.1771, + "mean_token_accuracy": 0.41646489104116224, + "step": 4941 + }, + { + "epoch": 0.9162031887282165, + "grad_norm": 5.0, + "learning_rate": 9.083796811271784e-06, + "loss": 2.9426, + "mean_token_accuracy": 0.4365754679931928, + "step": 4942 + }, + { + "epoch": 0.9163885799035966, + "grad_norm": 5.6796875, + "learning_rate": 9.083611420096404e-06, + "loss": 2.5685, + "mean_token_accuracy": 0.4848862574010595, + "step": 4943 + }, + { + "epoch": 0.9165739710789766, + "grad_norm": 7.18359375, + "learning_rate": 9.083426028921025e-06, + "loss": 2.6273, + "mean_token_accuracy": 0.46711746100992474, + "step": 4944 + }, + { + "epoch": 0.9167593622543567, + "grad_norm": 6.41796875, + "learning_rate": 9.083240637745644e-06, + "loss": 2.54, + "mean_token_accuracy": 0.4940161424993042, + "step": 4945 + }, + { + "epoch": 0.9169447534297367, + "grad_norm": 6.50390625, + "learning_rate": 9.083055246570264e-06, + "loss": 2.8788, + "mean_token_accuracy": 0.46175089754211546, + "step": 4946 + }, + { + "epoch": 0.9171301446051168, + "grad_norm": 7.57421875, + "learning_rate": 9.082869855394883e-06, + "loss": 2.9309, + "mean_token_accuracy": 0.45634333378178393, + "step": 4947 + }, + { + "epoch": 0.9173155357804968, + "grad_norm": 8.21875, + "learning_rate": 9.082684464219505e-06, + "loss": 2.7246, + "mean_token_accuracy": 0.4640896614821592, + "step": 4948 + }, + { + "epoch": 0.9175009269558769, + "grad_norm": 5.796875, + "learning_rate": 9.082499073044124e-06, + "loss": 3.4655, + "mean_token_accuracy": 0.39344901540261257, + "step": 4949 + }, + { + "epoch": 0.917686318131257, + "grad_norm": 6.1640625, + "learning_rate": 9.082313681868744e-06, + "loss": 2.7255, + "mean_token_accuracy": 0.4713722290221678, + "step": 4950 + }, + { + "epoch": 0.917871709306637, + "grad_norm": 7.26171875, + "learning_rate": 9.082128290693363e-06, + "loss": 2.9082, + "mean_token_accuracy": 0.44174940304134724, + "step": 4951 + }, + { + "epoch": 0.918057100482017, + "grad_norm": 7.3046875, + "learning_rate": 9.081942899517984e-06, + "loss": 2.5959, + "mean_token_accuracy": 0.4730250481695568, + "step": 4952 + }, + { + "epoch": 0.9182424916573971, + "grad_norm": 5.83984375, + "learning_rate": 9.081757508342604e-06, + "loss": 2.62, + "mean_token_accuracy": 0.493306781723642, + "step": 4953 + }, + { + "epoch": 0.9184278828327772, + "grad_norm": 6.1171875, + "learning_rate": 9.081572117167223e-06, + "loss": 3.2458, + "mean_token_accuracy": 0.4190392758066871, + "step": 4954 + }, + { + "epoch": 0.9186132740081572, + "grad_norm": 6.98828125, + "learning_rate": 9.081386725991843e-06, + "loss": 2.7548, + "mean_token_accuracy": 0.4729447282861124, + "step": 4955 + }, + { + "epoch": 0.9187986651835373, + "grad_norm": 8.796875, + "learning_rate": 9.081201334816462e-06, + "loss": 2.8006, + "mean_token_accuracy": 0.45722061575097805, + "step": 4956 + }, + { + "epoch": 0.9189840563589173, + "grad_norm": 5.48828125, + "learning_rate": 9.081015943641084e-06, + "loss": 2.8963, + "mean_token_accuracy": 0.4548692128208277, + "step": 4957 + }, + { + "epoch": 0.9191694475342974, + "grad_norm": 5.92578125, + "learning_rate": 9.080830552465703e-06, + "loss": 2.6877, + "mean_token_accuracy": 0.46788783355947533, + "step": 4958 + }, + { + "epoch": 0.9193548387096774, + "grad_norm": 6.1796875, + "learning_rate": 9.080645161290324e-06, + "loss": 2.9887, + "mean_token_accuracy": 0.4341346829748348, + "step": 4959 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 5.984375, + "learning_rate": 9.080459770114942e-06, + "loss": 3.1555, + "mean_token_accuracy": 0.4420103092783505, + "step": 4960 + }, + { + "epoch": 0.9197256210604375, + "grad_norm": 7.5546875, + "learning_rate": 9.080274378939563e-06, + "loss": 3.0522, + "mean_token_accuracy": 0.4373956594323873, + "step": 4961 + }, + { + "epoch": 0.9199110122358176, + "grad_norm": 6.17578125, + "learning_rate": 9.080088987764183e-06, + "loss": 2.5525, + "mean_token_accuracy": 0.4836576610617797, + "step": 4962 + }, + { + "epoch": 0.9200964034111976, + "grad_norm": 7.390625, + "learning_rate": 9.079903596588802e-06, + "loss": 3.1188, + "mean_token_accuracy": 0.4197799385875128, + "step": 4963 + }, + { + "epoch": 0.9202817945865777, + "grad_norm": 6.5703125, + "learning_rate": 9.079718205413423e-06, + "loss": 2.6088, + "mean_token_accuracy": 0.4762982689747004, + "step": 4964 + }, + { + "epoch": 0.9204671857619577, + "grad_norm": 6.67578125, + "learning_rate": 9.079532814238043e-06, + "loss": 3.117, + "mean_token_accuracy": 0.41052767384853567, + "step": 4965 + }, + { + "epoch": 0.9206525769373378, + "grad_norm": 5.6640625, + "learning_rate": 9.079347423062664e-06, + "loss": 3.0117, + "mean_token_accuracy": 0.4299807815502883, + "step": 4966 + }, + { + "epoch": 0.9208379681127178, + "grad_norm": 5.94921875, + "learning_rate": 9.079162031887283e-06, + "loss": 2.7008, + "mean_token_accuracy": 0.465907603716791, + "step": 4967 + }, + { + "epoch": 0.9210233592880979, + "grad_norm": 6.61328125, + "learning_rate": 9.078976640711903e-06, + "loss": 3.0792, + "mean_token_accuracy": 0.43321230651633863, + "step": 4968 + }, + { + "epoch": 0.9212087504634779, + "grad_norm": 7.1171875, + "learning_rate": 9.078791249536524e-06, + "loss": 2.2011, + "mean_token_accuracy": 0.523759899958316, + "step": 4969 + }, + { + "epoch": 0.921394141638858, + "grad_norm": 6.703125, + "learning_rate": 9.078605858361142e-06, + "loss": 2.5621, + "mean_token_accuracy": 0.47427154370737756, + "step": 4970 + }, + { + "epoch": 0.921579532814238, + "grad_norm": 7.41015625, + "learning_rate": 9.078420467185763e-06, + "loss": 3.3999, + "mean_token_accuracy": 0.40147819660014783, + "step": 4971 + }, + { + "epoch": 0.9217649239896181, + "grad_norm": 7.11328125, + "learning_rate": 9.078235076010382e-06, + "loss": 2.5382, + "mean_token_accuracy": 0.4794092928322728, + "step": 4972 + }, + { + "epoch": 0.9219503151649981, + "grad_norm": 6.48828125, + "learning_rate": 9.078049684835004e-06, + "loss": 2.7149, + "mean_token_accuracy": 0.45803085299455537, + "step": 4973 + }, + { + "epoch": 0.9221357063403782, + "grad_norm": 10.0, + "learning_rate": 9.077864293659623e-06, + "loss": 2.6543, + "mean_token_accuracy": 0.47031039136302294, + "step": 4974 + }, + { + "epoch": 0.9223210975157583, + "grad_norm": 5.96875, + "learning_rate": 9.077678902484243e-06, + "loss": 2.8497, + "mean_token_accuracy": 0.44804183355585225, + "step": 4975 + }, + { + "epoch": 0.9225064886911383, + "grad_norm": 5.21484375, + "learning_rate": 9.077493511308862e-06, + "loss": 2.3512, + "mean_token_accuracy": 0.48171990768857037, + "step": 4976 + }, + { + "epoch": 0.9226918798665183, + "grad_norm": 9.46875, + "learning_rate": 9.077308120133482e-06, + "loss": 2.0914, + "mean_token_accuracy": 0.5437234510033689, + "step": 4977 + }, + { + "epoch": 0.9228772710418984, + "grad_norm": 7.9921875, + "learning_rate": 9.077122728958103e-06, + "loss": 2.4773, + "mean_token_accuracy": 0.47540292847361215, + "step": 4978 + }, + { + "epoch": 0.9230626622172785, + "grad_norm": 5.47265625, + "learning_rate": 9.076937337782722e-06, + "loss": 2.8065, + "mean_token_accuracy": 0.44319474282825727, + "step": 4979 + }, + { + "epoch": 0.9232480533926585, + "grad_norm": 6.671875, + "learning_rate": 9.076751946607342e-06, + "loss": 3.3323, + "mean_token_accuracy": 0.4120306933406413, + "step": 4980 + }, + { + "epoch": 0.9234334445680386, + "grad_norm": 5.078125, + "learning_rate": 9.076566555431963e-06, + "loss": 2.8433, + "mean_token_accuracy": 0.44940607127144744, + "step": 4981 + }, + { + "epoch": 0.9236188357434186, + "grad_norm": 6.015625, + "learning_rate": 9.076381164256583e-06, + "loss": 3.0572, + "mean_token_accuracy": 0.42929022588587945, + "step": 4982 + }, + { + "epoch": 0.9238042269187987, + "grad_norm": 5.46875, + "learning_rate": 9.076195773081202e-06, + "loss": 3.5526, + "mean_token_accuracy": 0.384774677053545, + "step": 4983 + }, + { + "epoch": 0.9239896180941787, + "grad_norm": 9.109375, + "learning_rate": 9.076010381905822e-06, + "loss": 2.7878, + "mean_token_accuracy": 0.4655041698256255, + "step": 4984 + }, + { + "epoch": 0.9241750092695588, + "grad_norm": 7.05078125, + "learning_rate": 9.075824990730441e-06, + "loss": 2.8629, + "mean_token_accuracy": 0.44124117170313765, + "step": 4985 + }, + { + "epoch": 0.9243604004449388, + "grad_norm": 6.9453125, + "learning_rate": 9.075639599555062e-06, + "loss": 3.0968, + "mean_token_accuracy": 0.41042691847320123, + "step": 4986 + }, + { + "epoch": 0.9245457916203189, + "grad_norm": 7.85546875, + "learning_rate": 9.075454208379682e-06, + "loss": 2.7458, + "mean_token_accuracy": 0.4569264752287788, + "step": 4987 + }, + { + "epoch": 0.9247311827956989, + "grad_norm": 6.44921875, + "learning_rate": 9.075268817204301e-06, + "loss": 3.2206, + "mean_token_accuracy": 0.41347508646086845, + "step": 4988 + }, + { + "epoch": 0.924916573971079, + "grad_norm": 7.046875, + "learning_rate": 9.075083426028921e-06, + "loss": 2.8966, + "mean_token_accuracy": 0.4460118425635667, + "step": 4989 + }, + { + "epoch": 0.925101965146459, + "grad_norm": 5.4609375, + "learning_rate": 9.074898034853542e-06, + "loss": 3.1181, + "mean_token_accuracy": 0.4161348585189645, + "step": 4990 + }, + { + "epoch": 0.9252873563218391, + "grad_norm": 5.609375, + "learning_rate": 9.074712643678162e-06, + "loss": 2.8617, + "mean_token_accuracy": 0.4566467065868263, + "step": 4991 + }, + { + "epoch": 0.9254727474972191, + "grad_norm": 6.6796875, + "learning_rate": 9.074527252502781e-06, + "loss": 2.9827, + "mean_token_accuracy": 0.4315217391304348, + "step": 4992 + }, + { + "epoch": 0.9256581386725992, + "grad_norm": 6.24609375, + "learning_rate": 9.074341861327402e-06, + "loss": 3.56, + "mean_token_accuracy": 0.41300056401579244, + "step": 4993 + }, + { + "epoch": 0.9258435298479792, + "grad_norm": 5.25390625, + "learning_rate": 9.07415647015202e-06, + "loss": 2.7723, + "mean_token_accuracy": 0.447431693989071, + "step": 4994 + }, + { + "epoch": 0.9260289210233593, + "grad_norm": 6.8046875, + "learning_rate": 9.073971078976641e-06, + "loss": 2.7114, + "mean_token_accuracy": 0.4684607717499644, + "step": 4995 + }, + { + "epoch": 0.9262143121987393, + "grad_norm": 8.0078125, + "learning_rate": 9.073785687801262e-06, + "loss": 2.9449, + "mean_token_accuracy": 0.46177924217462935, + "step": 4996 + }, + { + "epoch": 0.9263997033741194, + "grad_norm": 5.671875, + "learning_rate": 9.073600296625882e-06, + "loss": 2.8734, + "mean_token_accuracy": 0.442861504019538, + "step": 4997 + }, + { + "epoch": 0.9265850945494994, + "grad_norm": 7.87109375, + "learning_rate": 9.0734149054505e-06, + "loss": 2.3334, + "mean_token_accuracy": 0.5043661547726589, + "step": 4998 + }, + { + "epoch": 0.9267704857248795, + "grad_norm": 6.90625, + "learning_rate": 9.073229514275121e-06, + "loss": 3.4988, + "mean_token_accuracy": 0.39118457300275483, + "step": 4999 + }, + { + "epoch": 0.9269558769002596, + "grad_norm": 6.0078125, + "learning_rate": 9.073044123099742e-06, + "loss": 2.6032, + "mean_token_accuracy": 0.4696289293311274, + "step": 5000 + }, + { + "epoch": 0.9271412680756396, + "grad_norm": 6.69140625, + "learning_rate": 9.07285873192436e-06, + "loss": 3.2134, + "mean_token_accuracy": 0.39330346616364065, + "step": 5001 + }, + { + "epoch": 0.9273266592510196, + "grad_norm": 8.046875, + "learning_rate": 9.072673340748981e-06, + "loss": 2.857, + "mean_token_accuracy": 0.4456237278278569, + "step": 5002 + }, + { + "epoch": 0.9275120504263997, + "grad_norm": 8.984375, + "learning_rate": 9.0724879495736e-06, + "loss": 2.9382, + "mean_token_accuracy": 0.45203509276571263, + "step": 5003 + }, + { + "epoch": 0.9276974416017798, + "grad_norm": 6.3203125, + "learning_rate": 9.07230255839822e-06, + "loss": 2.8767, + "mean_token_accuracy": 0.44558051930579845, + "step": 5004 + }, + { + "epoch": 0.9278828327771598, + "grad_norm": 7.36328125, + "learning_rate": 9.072117167222841e-06, + "loss": 2.5572, + "mean_token_accuracy": 0.4724333063864187, + "step": 5005 + }, + { + "epoch": 0.9280682239525399, + "grad_norm": 5.84765625, + "learning_rate": 9.071931776047461e-06, + "loss": 2.6134, + "mean_token_accuracy": 0.4877289149121714, + "step": 5006 + }, + { + "epoch": 0.9282536151279199, + "grad_norm": 5.34765625, + "learning_rate": 9.071746384872082e-06, + "loss": 3.2212, + "mean_token_accuracy": 0.42715141612200436, + "step": 5007 + }, + { + "epoch": 0.9284390063033, + "grad_norm": 7.6015625, + "learning_rate": 9.0715609936967e-06, + "loss": 2.988, + "mean_token_accuracy": 0.4092832414412104, + "step": 5008 + }, + { + "epoch": 0.92862439747868, + "grad_norm": 6.1796875, + "learning_rate": 9.071375602521321e-06, + "loss": 2.8125, + "mean_token_accuracy": 0.45897097625329814, + "step": 5009 + }, + { + "epoch": 0.92880978865406, + "grad_norm": 5.40625, + "learning_rate": 9.07119021134594e-06, + "loss": 2.6624, + "mean_token_accuracy": 0.45803008248423094, + "step": 5010 + }, + { + "epoch": 0.9289951798294401, + "grad_norm": 5.71875, + "learning_rate": 9.07100482017056e-06, + "loss": 2.9895, + "mean_token_accuracy": 0.4239567621920563, + "step": 5011 + }, + { + "epoch": 0.9291805710048202, + "grad_norm": 5.1953125, + "learning_rate": 9.070819428995181e-06, + "loss": 2.6746, + "mean_token_accuracy": 0.46896863010607087, + "step": 5012 + }, + { + "epoch": 0.9293659621802002, + "grad_norm": 5.515625, + "learning_rate": 9.070634037819801e-06, + "loss": 3.2647, + "mean_token_accuracy": 0.41482632338253245, + "step": 5013 + }, + { + "epoch": 0.9295513533555803, + "grad_norm": 6.11328125, + "learning_rate": 9.07044864664442e-06, + "loss": 3.3161, + "mean_token_accuracy": 0.41375257126065235, + "step": 5014 + }, + { + "epoch": 0.9297367445309603, + "grad_norm": 6.7890625, + "learning_rate": 9.07026325546904e-06, + "loss": 2.5915, + "mean_token_accuracy": 0.4809070383822519, + "step": 5015 + }, + { + "epoch": 0.9299221357063404, + "grad_norm": 6.3984375, + "learning_rate": 9.070077864293661e-06, + "loss": 3.0112, + "mean_token_accuracy": 0.4320102432778489, + "step": 5016 + }, + { + "epoch": 0.9301075268817204, + "grad_norm": 8.6484375, + "learning_rate": 9.06989247311828e-06, + "loss": 2.255, + "mean_token_accuracy": 0.5033981161321092, + "step": 5017 + }, + { + "epoch": 0.9302929180571005, + "grad_norm": 8.421875, + "learning_rate": 9.0697070819429e-06, + "loss": 2.5231, + "mean_token_accuracy": 0.4760695051315923, + "step": 5018 + }, + { + "epoch": 0.9304783092324805, + "grad_norm": 8.03125, + "learning_rate": 9.06952169076752e-06, + "loss": 3.0176, + "mean_token_accuracy": 0.43518187239117473, + "step": 5019 + }, + { + "epoch": 0.9306637004078606, + "grad_norm": 5.50390625, + "learning_rate": 9.06933629959214e-06, + "loss": 2.5805, + "mean_token_accuracy": 0.4695041684949539, + "step": 5020 + }, + { + "epoch": 0.9308490915832406, + "grad_norm": 7.58203125, + "learning_rate": 9.06915090841676e-06, + "loss": 2.8542, + "mean_token_accuracy": 0.4540878319736157, + "step": 5021 + }, + { + "epoch": 0.9310344827586207, + "grad_norm": 9.34375, + "learning_rate": 9.06896551724138e-06, + "loss": 2.9099, + "mean_token_accuracy": 0.44472396925227115, + "step": 5022 + }, + { + "epoch": 0.9312198739340007, + "grad_norm": 6.25390625, + "learning_rate": 9.068780126066e-06, + "loss": 3.1117, + "mean_token_accuracy": 0.4439751000444642, + "step": 5023 + }, + { + "epoch": 0.9314052651093808, + "grad_norm": 6.21875, + "learning_rate": 9.06859473489062e-06, + "loss": 2.9356, + "mean_token_accuracy": 0.4418349127867051, + "step": 5024 + }, + { + "epoch": 0.9315906562847609, + "grad_norm": 7.578125, + "learning_rate": 9.06840934371524e-06, + "loss": 2.809, + "mean_token_accuracy": 0.44796851487440675, + "step": 5025 + }, + { + "epoch": 0.9317760474601409, + "grad_norm": 7.67578125, + "learning_rate": 9.06822395253986e-06, + "loss": 2.3462, + "mean_token_accuracy": 0.49228626526676666, + "step": 5026 + }, + { + "epoch": 0.931961438635521, + "grad_norm": 7.54296875, + "learning_rate": 9.06803856136448e-06, + "loss": 2.4471, + "mean_token_accuracy": 0.47226298796595245, + "step": 5027 + }, + { + "epoch": 0.932146829810901, + "grad_norm": 5.9921875, + "learning_rate": 9.067853170189099e-06, + "loss": 2.7297, + "mean_token_accuracy": 0.45342147141633005, + "step": 5028 + }, + { + "epoch": 0.9323322209862811, + "grad_norm": 5.30078125, + "learning_rate": 9.06766777901372e-06, + "loss": 2.9192, + "mean_token_accuracy": 0.41695114773396114, + "step": 5029 + }, + { + "epoch": 0.9325176121616611, + "grad_norm": 7.73828125, + "learning_rate": 9.06748238783834e-06, + "loss": 2.6566, + "mean_token_accuracy": 0.4622186495176849, + "step": 5030 + }, + { + "epoch": 0.9327030033370411, + "grad_norm": 7.70703125, + "learning_rate": 9.06729699666296e-06, + "loss": 2.9391, + "mean_token_accuracy": 0.44421620233662856, + "step": 5031 + }, + { + "epoch": 0.9328883945124212, + "grad_norm": 6.8046875, + "learning_rate": 9.067111605487579e-06, + "loss": 3.0469, + "mean_token_accuracy": 0.4385182644486366, + "step": 5032 + }, + { + "epoch": 0.9330737856878013, + "grad_norm": 5.3828125, + "learning_rate": 9.0669262143122e-06, + "loss": 2.6182, + "mean_token_accuracy": 0.4535098960558166, + "step": 5033 + }, + { + "epoch": 0.9332591768631813, + "grad_norm": 6.78515625, + "learning_rate": 9.06674082313682e-06, + "loss": 2.5284, + "mean_token_accuracy": 0.47643176997407494, + "step": 5034 + }, + { + "epoch": 0.9334445680385614, + "grad_norm": 6.98828125, + "learning_rate": 9.066555431961439e-06, + "loss": 2.7124, + "mean_token_accuracy": 0.46201592227769533, + "step": 5035 + }, + { + "epoch": 0.9336299592139414, + "grad_norm": 9.234375, + "learning_rate": 9.06637004078606e-06, + "loss": 2.5526, + "mean_token_accuracy": 0.4859393806762228, + "step": 5036 + }, + { + "epoch": 0.9338153503893215, + "grad_norm": 7.45703125, + "learning_rate": 9.06618464961068e-06, + "loss": 2.9983, + "mean_token_accuracy": 0.43536977491961415, + "step": 5037 + }, + { + "epoch": 0.9340007415647015, + "grad_norm": 6.296875, + "learning_rate": 9.0659992584353e-06, + "loss": 2.4304, + "mean_token_accuracy": 0.5121806298276886, + "step": 5038 + }, + { + "epoch": 0.9341861327400816, + "grad_norm": 5.23828125, + "learning_rate": 9.065813867259919e-06, + "loss": 3.2227, + "mean_token_accuracy": 0.4051336332363059, + "step": 5039 + }, + { + "epoch": 0.9343715239154616, + "grad_norm": 6.97265625, + "learning_rate": 9.06562847608454e-06, + "loss": 2.7968, + "mean_token_accuracy": 0.44825305852529485, + "step": 5040 + }, + { + "epoch": 0.9345569150908417, + "grad_norm": 5.73046875, + "learning_rate": 9.065443084909158e-06, + "loss": 2.524, + "mean_token_accuracy": 0.4919157275845174, + "step": 5041 + }, + { + "epoch": 0.9347423062662217, + "grad_norm": 4.3046875, + "learning_rate": 9.065257693733779e-06, + "loss": 2.7022, + "mean_token_accuracy": 0.4552004648460198, + "step": 5042 + }, + { + "epoch": 0.9349276974416018, + "grad_norm": 6.30859375, + "learning_rate": 9.0650723025584e-06, + "loss": 3.0691, + "mean_token_accuracy": 0.4241849886277483, + "step": 5043 + }, + { + "epoch": 0.9351130886169818, + "grad_norm": 5.640625, + "learning_rate": 9.064886911383018e-06, + "loss": 2.8632, + "mean_token_accuracy": 0.43985042735042734, + "step": 5044 + }, + { + "epoch": 0.9352984797923619, + "grad_norm": 6.23046875, + "learning_rate": 9.06470152020764e-06, + "loss": 2.615, + "mean_token_accuracy": 0.47962529274004684, + "step": 5045 + }, + { + "epoch": 0.9354838709677419, + "grad_norm": 6.08203125, + "learning_rate": 9.064516129032259e-06, + "loss": 3.0175, + "mean_token_accuracy": 0.43249277646440765, + "step": 5046 + }, + { + "epoch": 0.935669262143122, + "grad_norm": 8.5078125, + "learning_rate": 9.06433073785688e-06, + "loss": 2.2219, + "mean_token_accuracy": 0.5098530992475815, + "step": 5047 + }, + { + "epoch": 0.935854653318502, + "grad_norm": 7.078125, + "learning_rate": 9.064145346681498e-06, + "loss": 2.9151, + "mean_token_accuracy": 0.4275294117647059, + "step": 5048 + }, + { + "epoch": 0.9360400444938821, + "grad_norm": 7.69921875, + "learning_rate": 9.063959955506119e-06, + "loss": 2.79, + "mean_token_accuracy": 0.46177152022812334, + "step": 5049 + }, + { + "epoch": 0.9362254356692622, + "grad_norm": 5.88671875, + "learning_rate": 9.06377456433074e-06, + "loss": 3.5509, + "mean_token_accuracy": 0.372310570626754, + "step": 5050 + }, + { + "epoch": 0.9364108268446422, + "grad_norm": 6.59375, + "learning_rate": 9.063589173155358e-06, + "loss": 2.6417, + "mean_token_accuracy": 0.4562129515714126, + "step": 5051 + }, + { + "epoch": 0.9365962180200222, + "grad_norm": 6.22265625, + "learning_rate": 9.063403781979979e-06, + "loss": 2.8197, + "mean_token_accuracy": 0.44240400667779634, + "step": 5052 + }, + { + "epoch": 0.9367816091954023, + "grad_norm": 7.74609375, + "learning_rate": 9.063218390804599e-06, + "loss": 3.8114, + "mean_token_accuracy": 0.38948380010982975, + "step": 5053 + }, + { + "epoch": 0.9369670003707824, + "grad_norm": 6.08203125, + "learning_rate": 9.06303299962922e-06, + "loss": 2.7711, + "mean_token_accuracy": 0.4529924740174412, + "step": 5054 + }, + { + "epoch": 0.9371523915461624, + "grad_norm": 5.0859375, + "learning_rate": 9.062847608453838e-06, + "loss": 2.6152, + "mean_token_accuracy": 0.4714538765499387, + "step": 5055 + }, + { + "epoch": 0.9373377827215424, + "grad_norm": 7.8359375, + "learning_rate": 9.062662217278459e-06, + "loss": 2.826, + "mean_token_accuracy": 0.4443468072642062, + "step": 5056 + }, + { + "epoch": 0.9375231738969225, + "grad_norm": 8.1953125, + "learning_rate": 9.062476826103078e-06, + "loss": 2.7945, + "mean_token_accuracy": 0.4477390659747961, + "step": 5057 + }, + { + "epoch": 0.9377085650723026, + "grad_norm": 6.69921875, + "learning_rate": 9.062291434927698e-06, + "loss": 2.7296, + "mean_token_accuracy": 0.45717904068386894, + "step": 5058 + }, + { + "epoch": 0.9378939562476826, + "grad_norm": 5.625, + "learning_rate": 9.062106043752319e-06, + "loss": 2.6638, + "mean_token_accuracy": 0.46437768240343347, + "step": 5059 + }, + { + "epoch": 0.9380793474230626, + "grad_norm": 5.61328125, + "learning_rate": 9.061920652576937e-06, + "loss": 2.9125, + "mean_token_accuracy": 0.44624912362701563, + "step": 5060 + }, + { + "epoch": 0.9382647385984427, + "grad_norm": 9.0546875, + "learning_rate": 9.061735261401558e-06, + "loss": 2.6639, + "mean_token_accuracy": 0.46072642691000176, + "step": 5061 + }, + { + "epoch": 0.9384501297738228, + "grad_norm": 12.9140625, + "learning_rate": 9.061549870226178e-06, + "loss": 2.7657, + "mean_token_accuracy": 0.4651775804661487, + "step": 5062 + }, + { + "epoch": 0.9386355209492028, + "grad_norm": 5.75390625, + "learning_rate": 9.061364479050799e-06, + "loss": 2.6755, + "mean_token_accuracy": 0.46603598014888337, + "step": 5063 + }, + { + "epoch": 0.9388209121245829, + "grad_norm": 6.23828125, + "learning_rate": 9.061179087875418e-06, + "loss": 2.5667, + "mean_token_accuracy": 0.47568710359408034, + "step": 5064 + }, + { + "epoch": 0.9390063032999629, + "grad_norm": 7.4453125, + "learning_rate": 9.060993696700038e-06, + "loss": 2.9105, + "mean_token_accuracy": 0.4552574143022211, + "step": 5065 + }, + { + "epoch": 0.939191694475343, + "grad_norm": 7.33984375, + "learning_rate": 9.060808305524657e-06, + "loss": 2.6523, + "mean_token_accuracy": 0.4586961375306988, + "step": 5066 + }, + { + "epoch": 0.939377085650723, + "grad_norm": 6.28125, + "learning_rate": 9.060622914349277e-06, + "loss": 3.3778, + "mean_token_accuracy": 0.43209876543209874, + "step": 5067 + }, + { + "epoch": 0.939562476826103, + "grad_norm": 7.87109375, + "learning_rate": 9.060437523173898e-06, + "loss": 2.663, + "mean_token_accuracy": 0.46652615918670964, + "step": 5068 + }, + { + "epoch": 0.9397478680014831, + "grad_norm": 6.6015625, + "learning_rate": 9.060252131998518e-06, + "loss": 2.9997, + "mean_token_accuracy": 0.4517708689215351, + "step": 5069 + }, + { + "epoch": 0.9399332591768632, + "grad_norm": 7.66796875, + "learning_rate": 9.060066740823137e-06, + "loss": 2.6694, + "mean_token_accuracy": 0.47691123653155465, + "step": 5070 + }, + { + "epoch": 0.9401186503522432, + "grad_norm": 9.53125, + "learning_rate": 9.059881349647758e-06, + "loss": 1.9966, + "mean_token_accuracy": 0.5260179603887317, + "step": 5071 + }, + { + "epoch": 0.9403040415276233, + "grad_norm": 9.2265625, + "learning_rate": 9.059695958472378e-06, + "loss": 2.3724, + "mean_token_accuracy": 0.4982536270822139, + "step": 5072 + }, + { + "epoch": 0.9404894327030033, + "grad_norm": 7.6171875, + "learning_rate": 9.059510567296997e-06, + "loss": 2.6403, + "mean_token_accuracy": 0.46832160059281214, + "step": 5073 + }, + { + "epoch": 0.9406748238783834, + "grad_norm": 7.484375, + "learning_rate": 9.059325176121617e-06, + "loss": 3.6392, + "mean_token_accuracy": 0.3676419163072619, + "step": 5074 + }, + { + "epoch": 0.9408602150537635, + "grad_norm": 8.1953125, + "learning_rate": 9.059139784946236e-06, + "loss": 3.1878, + "mean_token_accuracy": 0.43460081013981444, + "step": 5075 + }, + { + "epoch": 0.9410456062291435, + "grad_norm": 6.23046875, + "learning_rate": 9.058954393770857e-06, + "loss": 2.8118, + "mean_token_accuracy": 0.4408090422367638, + "step": 5076 + }, + { + "epoch": 0.9412309974045235, + "grad_norm": 5.94921875, + "learning_rate": 9.058769002595477e-06, + "loss": 2.9962, + "mean_token_accuracy": 0.4208365409289197, + "step": 5077 + }, + { + "epoch": 0.9414163885799036, + "grad_norm": 6.19921875, + "learning_rate": 9.058583611420098e-06, + "loss": 2.9726, + "mean_token_accuracy": 0.4538629965592743, + "step": 5078 + }, + { + "epoch": 0.9416017797552837, + "grad_norm": 6.15625, + "learning_rate": 9.058398220244717e-06, + "loss": 2.8894, + "mean_token_accuracy": 0.4475616438356164, + "step": 5079 + }, + { + "epoch": 0.9417871709306637, + "grad_norm": 6.0390625, + "learning_rate": 9.058212829069337e-06, + "loss": 3.0562, + "mean_token_accuracy": 0.43910865434444635, + "step": 5080 + }, + { + "epoch": 0.9419725621060437, + "grad_norm": 8.8359375, + "learning_rate": 9.058027437893958e-06, + "loss": 3.125, + "mean_token_accuracy": 0.41188603841691374, + "step": 5081 + }, + { + "epoch": 0.9421579532814238, + "grad_norm": 20.90625, + "learning_rate": 9.057842046718576e-06, + "loss": 3.1916, + "mean_token_accuracy": 0.47096848666743624, + "step": 5082 + }, + { + "epoch": 0.9423433444568039, + "grad_norm": 7.65625, + "learning_rate": 9.057656655543197e-06, + "loss": 2.5256, + "mean_token_accuracy": 0.4723337406179089, + "step": 5083 + }, + { + "epoch": 0.9425287356321839, + "grad_norm": 11.109375, + "learning_rate": 9.057471264367816e-06, + "loss": 3.8565, + "mean_token_accuracy": 0.39267910554052127, + "step": 5084 + }, + { + "epoch": 0.942714126807564, + "grad_norm": 6.87109375, + "learning_rate": 9.057285873192436e-06, + "loss": 2.5803, + "mean_token_accuracy": 0.48431214802896216, + "step": 5085 + }, + { + "epoch": 0.942899517982944, + "grad_norm": 6.359375, + "learning_rate": 9.057100482017057e-06, + "loss": 3.1399, + "mean_token_accuracy": 0.3987034035656402, + "step": 5086 + }, + { + "epoch": 0.9430849091583241, + "grad_norm": 6.390625, + "learning_rate": 9.056915090841677e-06, + "loss": 2.493, + "mean_token_accuracy": 0.4829188302814977, + "step": 5087 + }, + { + "epoch": 0.9432703003337041, + "grad_norm": 5.9140625, + "learning_rate": 9.056729699666298e-06, + "loss": 2.5376, + "mean_token_accuracy": 0.46644388749651905, + "step": 5088 + }, + { + "epoch": 0.9434556915090841, + "grad_norm": 9.1953125, + "learning_rate": 9.056544308490916e-06, + "loss": 2.8615, + "mean_token_accuracy": 0.4608012568735271, + "step": 5089 + }, + { + "epoch": 0.9436410826844642, + "grad_norm": 7.5, + "learning_rate": 9.056358917315537e-06, + "loss": 2.6794, + "mean_token_accuracy": 0.44646051622795807, + "step": 5090 + }, + { + "epoch": 0.9438264738598443, + "grad_norm": 8.0, + "learning_rate": 9.056173526140156e-06, + "loss": 2.7121, + "mean_token_accuracy": 0.46624843161856966, + "step": 5091 + }, + { + "epoch": 0.9440118650352243, + "grad_norm": 8.359375, + "learning_rate": 9.055988134964776e-06, + "loss": 3.4026, + "mean_token_accuracy": 0.40621135083138654, + "step": 5092 + }, + { + "epoch": 0.9441972562106044, + "grad_norm": 6.81640625, + "learning_rate": 9.055802743789397e-06, + "loss": 3.1415, + "mean_token_accuracy": 0.4341147018661812, + "step": 5093 + }, + { + "epoch": 0.9443826473859844, + "grad_norm": 7.1640625, + "learning_rate": 9.055617352614017e-06, + "loss": 2.8629, + "mean_token_accuracy": 0.44316016766162836, + "step": 5094 + }, + { + "epoch": 0.9445680385613645, + "grad_norm": 6.46484375, + "learning_rate": 9.055431961438636e-06, + "loss": 2.8548, + "mean_token_accuracy": 0.4570001189484953, + "step": 5095 + }, + { + "epoch": 0.9447534297367445, + "grad_norm": 6.4921875, + "learning_rate": 9.055246570263256e-06, + "loss": 2.5373, + "mean_token_accuracy": 0.48470209339774556, + "step": 5096 + }, + { + "epoch": 0.9449388209121246, + "grad_norm": 6.9453125, + "learning_rate": 9.055061179087877e-06, + "loss": 3.6893, + "mean_token_accuracy": 0.3631725417439703, + "step": 5097 + }, + { + "epoch": 0.9451242120875046, + "grad_norm": 8.28125, + "learning_rate": 9.054875787912496e-06, + "loss": 2.7876, + "mean_token_accuracy": 0.47638660076880834, + "step": 5098 + }, + { + "epoch": 0.9453096032628847, + "grad_norm": 8.5625, + "learning_rate": 9.054690396737116e-06, + "loss": 3.4998, + "mean_token_accuracy": 0.3807572760666855, + "step": 5099 + }, + { + "epoch": 0.9454949944382648, + "grad_norm": 7.640625, + "learning_rate": 9.054505005561735e-06, + "loss": 3.4905, + "mean_token_accuracy": 0.408623417721519, + "step": 5100 + }, + { + "epoch": 0.9456803856136448, + "grad_norm": 9.6875, + "learning_rate": 9.054319614386356e-06, + "loss": 2.83, + "mean_token_accuracy": 0.4509908232477723, + "step": 5101 + }, + { + "epoch": 0.9458657767890248, + "grad_norm": 6.046875, + "learning_rate": 9.054134223210976e-06, + "loss": 2.5938, + "mean_token_accuracy": 0.4827832547506787, + "step": 5102 + }, + { + "epoch": 0.9460511679644049, + "grad_norm": 5.9453125, + "learning_rate": 9.053948832035596e-06, + "loss": 2.8142, + "mean_token_accuracy": 0.42935288640595903, + "step": 5103 + }, + { + "epoch": 0.946236559139785, + "grad_norm": 5.59375, + "learning_rate": 9.053763440860215e-06, + "loss": 3.0578, + "mean_token_accuracy": 0.430240669689571, + "step": 5104 + }, + { + "epoch": 0.946421950315165, + "grad_norm": 5.40234375, + "learning_rate": 9.053578049684836e-06, + "loss": 2.2834, + "mean_token_accuracy": 0.5416318241548203, + "step": 5105 + }, + { + "epoch": 0.946607341490545, + "grad_norm": 6.34765625, + "learning_rate": 9.053392658509456e-06, + "loss": 2.7072, + "mean_token_accuracy": 0.465514456469068, + "step": 5106 + }, + { + "epoch": 0.9467927326659251, + "grad_norm": 6.1015625, + "learning_rate": 9.053207267334075e-06, + "loss": 3.038, + "mean_token_accuracy": 0.4057322529931068, + "step": 5107 + }, + { + "epoch": 0.9469781238413052, + "grad_norm": 7.03125, + "learning_rate": 9.053021876158696e-06, + "loss": 3.0117, + "mean_token_accuracy": 0.4217802570943488, + "step": 5108 + }, + { + "epoch": 0.9471635150166852, + "grad_norm": 9.828125, + "learning_rate": 9.052836484983314e-06, + "loss": 2.7111, + "mean_token_accuracy": 0.43283582089552236, + "step": 5109 + }, + { + "epoch": 0.9473489061920652, + "grad_norm": 6.734375, + "learning_rate": 9.052651093807937e-06, + "loss": 2.7074, + "mean_token_accuracy": 0.4521354933726068, + "step": 5110 + }, + { + "epoch": 0.9475342973674453, + "grad_norm": 5.88671875, + "learning_rate": 9.052465702632555e-06, + "loss": 3.2006, + "mean_token_accuracy": 0.41606337931976506, + "step": 5111 + }, + { + "epoch": 0.9477196885428254, + "grad_norm": 5.98828125, + "learning_rate": 9.052280311457176e-06, + "loss": 2.9331, + "mean_token_accuracy": 0.4601884114079236, + "step": 5112 + }, + { + "epoch": 0.9479050797182054, + "grad_norm": 8.1796875, + "learning_rate": 9.052094920281795e-06, + "loss": 2.8859, + "mean_token_accuracy": 0.4550881577120644, + "step": 5113 + }, + { + "epoch": 0.9480904708935854, + "grad_norm": 5.84765625, + "learning_rate": 9.051909529106415e-06, + "loss": 2.8655, + "mean_token_accuracy": 0.44434675575874805, + "step": 5114 + }, + { + "epoch": 0.9482758620689655, + "grad_norm": 6.70703125, + "learning_rate": 9.051724137931036e-06, + "loss": 3.096, + "mean_token_accuracy": 0.43569001779647304, + "step": 5115 + }, + { + "epoch": 0.9484612532443456, + "grad_norm": 5.9609375, + "learning_rate": 9.051538746755654e-06, + "loss": 2.3927, + "mean_token_accuracy": 0.5415167650021586, + "step": 5116 + }, + { + "epoch": 0.9486466444197256, + "grad_norm": 8.2890625, + "learning_rate": 9.051353355580275e-06, + "loss": 2.478, + "mean_token_accuracy": 0.4662173546756529, + "step": 5117 + }, + { + "epoch": 0.9488320355951056, + "grad_norm": 5.4765625, + "learning_rate": 9.051167964404895e-06, + "loss": 2.7463, + "mean_token_accuracy": 0.45396536007292615, + "step": 5118 + }, + { + "epoch": 0.9490174267704857, + "grad_norm": 6.12890625, + "learning_rate": 9.050982573229516e-06, + "loss": 2.4743, + "mean_token_accuracy": 0.48412415269354264, + "step": 5119 + }, + { + "epoch": 0.9492028179458658, + "grad_norm": 7.57421875, + "learning_rate": 9.050797182054135e-06, + "loss": 2.8984, + "mean_token_accuracy": 0.44429795649307846, + "step": 5120 + }, + { + "epoch": 0.9493882091212458, + "grad_norm": 7.59375, + "learning_rate": 9.050611790878755e-06, + "loss": 2.8695, + "mean_token_accuracy": 0.4414830452566302, + "step": 5121 + }, + { + "epoch": 0.9495736002966259, + "grad_norm": 8.578125, + "learning_rate": 9.050426399703374e-06, + "loss": 2.7572, + "mean_token_accuracy": 0.4468139121604468, + "step": 5122 + }, + { + "epoch": 0.9497589914720059, + "grad_norm": 5.84375, + "learning_rate": 9.050241008527994e-06, + "loss": 3.2567, + "mean_token_accuracy": 0.4127341879137907, + "step": 5123 + }, + { + "epoch": 0.949944382647386, + "grad_norm": 9.6640625, + "learning_rate": 9.050055617352615e-06, + "loss": 2.5905, + "mean_token_accuracy": 0.46847395674687786, + "step": 5124 + }, + { + "epoch": 0.9501297738227661, + "grad_norm": 6.796875, + "learning_rate": 9.049870226177234e-06, + "loss": 3.4738, + "mean_token_accuracy": 0.40679611650485437, + "step": 5125 + }, + { + "epoch": 0.9503151649981461, + "grad_norm": 5.7265625, + "learning_rate": 9.049684835001856e-06, + "loss": 2.7231, + "mean_token_accuracy": 0.44760110160892885, + "step": 5126 + }, + { + "epoch": 0.9505005561735261, + "grad_norm": 7.3359375, + "learning_rate": 9.049499443826475e-06, + "loss": 2.9174, + "mean_token_accuracy": 0.45037504076530055, + "step": 5127 + }, + { + "epoch": 0.9506859473489062, + "grad_norm": 6.078125, + "learning_rate": 9.049314052651095e-06, + "loss": 3.0871, + "mean_token_accuracy": 0.43454871488344293, + "step": 5128 + }, + { + "epoch": 0.9508713385242863, + "grad_norm": 8.2109375, + "learning_rate": 9.049128661475714e-06, + "loss": 2.7704, + "mean_token_accuracy": 0.48210987996306554, + "step": 5129 + }, + { + "epoch": 0.9510567296996663, + "grad_norm": 8.9921875, + "learning_rate": 9.048943270300335e-06, + "loss": 2.6709, + "mean_token_accuracy": 0.47009653873322343, + "step": 5130 + }, + { + "epoch": 0.9512421208750463, + "grad_norm": 7.9921875, + "learning_rate": 9.048757879124955e-06, + "loss": 2.6169, + "mean_token_accuracy": 0.47681389942493574, + "step": 5131 + }, + { + "epoch": 0.9514275120504264, + "grad_norm": 7.40625, + "learning_rate": 9.048572487949574e-06, + "loss": 3.1055, + "mean_token_accuracy": 0.4241910229645094, + "step": 5132 + }, + { + "epoch": 0.9516129032258065, + "grad_norm": 8.1328125, + "learning_rate": 9.048387096774194e-06, + "loss": 2.6471, + "mean_token_accuracy": 0.482382682888292, + "step": 5133 + }, + { + "epoch": 0.9517982944011865, + "grad_norm": 7.58984375, + "learning_rate": 9.048201705598815e-06, + "loss": 2.4732, + "mean_token_accuracy": 0.5067913604987754, + "step": 5134 + }, + { + "epoch": 0.9519836855765665, + "grad_norm": 5.828125, + "learning_rate": 9.048016314423435e-06, + "loss": 2.9442, + "mean_token_accuracy": 0.42762507415463713, + "step": 5135 + }, + { + "epoch": 0.9521690767519466, + "grad_norm": 5.6484375, + "learning_rate": 9.047830923248054e-06, + "loss": 3.4461, + "mean_token_accuracy": 0.38918106686701726, + "step": 5136 + }, + { + "epoch": 0.9523544679273267, + "grad_norm": 7.81640625, + "learning_rate": 9.047645532072675e-06, + "loss": 2.8653, + "mean_token_accuracy": 0.43877109476417137, + "step": 5137 + }, + { + "epoch": 0.9525398591027067, + "grad_norm": 8.125, + "learning_rate": 9.047460140897293e-06, + "loss": 2.6481, + "mean_token_accuracy": 0.4720449323461833, + "step": 5138 + }, + { + "epoch": 0.9527252502780867, + "grad_norm": 8.71875, + "learning_rate": 9.047274749721914e-06, + "loss": 2.9996, + "mean_token_accuracy": 0.4242208623879323, + "step": 5139 + }, + { + "epoch": 0.9529106414534668, + "grad_norm": 7.67578125, + "learning_rate": 9.047089358546534e-06, + "loss": 3.0508, + "mean_token_accuracy": 0.43106382978723407, + "step": 5140 + }, + { + "epoch": 0.9530960326288469, + "grad_norm": 6.15234375, + "learning_rate": 9.046903967371153e-06, + "loss": 2.5805, + "mean_token_accuracy": 0.4807121661721068, + "step": 5141 + }, + { + "epoch": 0.9532814238042269, + "grad_norm": 8.046875, + "learning_rate": 9.046718576195774e-06, + "loss": 2.2334, + "mean_token_accuracy": 0.5171836787231304, + "step": 5142 + }, + { + "epoch": 0.953466814979607, + "grad_norm": 8.125, + "learning_rate": 9.046533185020394e-06, + "loss": 2.8847, + "mean_token_accuracy": 0.4715713583483533, + "step": 5143 + }, + { + "epoch": 0.953652206154987, + "grad_norm": 6.63671875, + "learning_rate": 9.046347793845015e-06, + "loss": 2.7305, + "mean_token_accuracy": 0.44865027899261045, + "step": 5144 + }, + { + "epoch": 0.9538375973303671, + "grad_norm": 7.9765625, + "learning_rate": 9.046162402669633e-06, + "loss": 2.8197, + "mean_token_accuracy": 0.4576057431121459, + "step": 5145 + }, + { + "epoch": 0.9540229885057471, + "grad_norm": 6.26953125, + "learning_rate": 9.045977011494254e-06, + "loss": 2.8074, + "mean_token_accuracy": 0.4429158110882957, + "step": 5146 + }, + { + "epoch": 0.9542083796811272, + "grad_norm": 6.3203125, + "learning_rate": 9.045791620318873e-06, + "loss": 3.0816, + "mean_token_accuracy": 0.4258615238697439, + "step": 5147 + }, + { + "epoch": 0.9543937708565072, + "grad_norm": 5.49609375, + "learning_rate": 9.045606229143493e-06, + "loss": 2.5723, + "mean_token_accuracy": 0.4476231758416414, + "step": 5148 + }, + { + "epoch": 0.9545791620318873, + "grad_norm": 8.2265625, + "learning_rate": 9.045420837968114e-06, + "loss": 2.9124, + "mean_token_accuracy": 0.4569065583284968, + "step": 5149 + }, + { + "epoch": 0.9547645532072674, + "grad_norm": 7.3828125, + "learning_rate": 9.045235446792734e-06, + "loss": 2.7596, + "mean_token_accuracy": 0.4764292878635908, + "step": 5150 + }, + { + "epoch": 0.9549499443826474, + "grad_norm": 7.19921875, + "learning_rate": 9.045050055617353e-06, + "loss": 2.3494, + "mean_token_accuracy": 0.5188296234075318, + "step": 5151 + }, + { + "epoch": 0.9551353355580274, + "grad_norm": 6.3828125, + "learning_rate": 9.044864664441973e-06, + "loss": 2.9067, + "mean_token_accuracy": 0.4331896551724138, + "step": 5152 + }, + { + "epoch": 0.9553207267334075, + "grad_norm": 5.4140625, + "learning_rate": 9.044679273266594e-06, + "loss": 2.907, + "mean_token_accuracy": 0.4327605019094381, + "step": 5153 + }, + { + "epoch": 0.9555061179087876, + "grad_norm": 5.44140625, + "learning_rate": 9.044493882091213e-06, + "loss": 3.4317, + "mean_token_accuracy": 0.37875710804224205, + "step": 5154 + }, + { + "epoch": 0.9556915090841676, + "grad_norm": 11.4296875, + "learning_rate": 9.044308490915833e-06, + "loss": 1.8909, + "mean_token_accuracy": 0.5749523204068658, + "step": 5155 + }, + { + "epoch": 0.9558769002595476, + "grad_norm": 7.46875, + "learning_rate": 9.044123099740452e-06, + "loss": 2.6433, + "mean_token_accuracy": 0.47192420231294413, + "step": 5156 + }, + { + "epoch": 0.9560622914349277, + "grad_norm": 5.84765625, + "learning_rate": 9.043937708565073e-06, + "loss": 2.753, + "mean_token_accuracy": 0.45921488656640325, + "step": 5157 + }, + { + "epoch": 0.9562476826103078, + "grad_norm": 6.0625, + "learning_rate": 9.043752317389693e-06, + "loss": 3.0041, + "mean_token_accuracy": 0.4222384784198976, + "step": 5158 + }, + { + "epoch": 0.9564330737856878, + "grad_norm": 6.05859375, + "learning_rate": 9.043566926214314e-06, + "loss": 2.7249, + "mean_token_accuracy": 0.4567667075755885, + "step": 5159 + }, + { + "epoch": 0.9566184649610678, + "grad_norm": 5.55078125, + "learning_rate": 9.043381535038932e-06, + "loss": 2.7053, + "mean_token_accuracy": 0.43915997529339096, + "step": 5160 + }, + { + "epoch": 0.9568038561364479, + "grad_norm": 6.9296875, + "learning_rate": 9.043196143863553e-06, + "loss": 2.3705, + "mean_token_accuracy": 0.495202123315639, + "step": 5161 + }, + { + "epoch": 0.956989247311828, + "grad_norm": 5.77734375, + "learning_rate": 9.043010752688173e-06, + "loss": 2.9452, + "mean_token_accuracy": 0.4395433316035288, + "step": 5162 + }, + { + "epoch": 0.957174638487208, + "grad_norm": 5.90625, + "learning_rate": 9.042825361512792e-06, + "loss": 3.0524, + "mean_token_accuracy": 0.4274028629856851, + "step": 5163 + }, + { + "epoch": 0.957360029662588, + "grad_norm": 6.79296875, + "learning_rate": 9.042639970337413e-06, + "loss": 2.8446, + "mean_token_accuracy": 0.44437215354586856, + "step": 5164 + }, + { + "epoch": 0.9575454208379681, + "grad_norm": 6.6171875, + "learning_rate": 9.042454579162031e-06, + "loss": 2.6049, + "mean_token_accuracy": 0.4754803996925442, + "step": 5165 + }, + { + "epoch": 0.9577308120133482, + "grad_norm": 6.33203125, + "learning_rate": 9.042269187986654e-06, + "loss": 3.2309, + "mean_token_accuracy": 0.4251479289940828, + "step": 5166 + }, + { + "epoch": 0.9579162031887282, + "grad_norm": 7.0234375, + "learning_rate": 9.042083796811272e-06, + "loss": 2.0495, + "mean_token_accuracy": 0.569643058214165, + "step": 5167 + }, + { + "epoch": 0.9581015943641082, + "grad_norm": 6.46875, + "learning_rate": 9.041898405635893e-06, + "loss": 2.6429, + "mean_token_accuracy": 0.502425799086758, + "step": 5168 + }, + { + "epoch": 0.9582869855394883, + "grad_norm": 7.1484375, + "learning_rate": 9.041713014460513e-06, + "loss": 2.6652, + "mean_token_accuracy": 0.46747479596735475, + "step": 5169 + }, + { + "epoch": 0.9584723767148684, + "grad_norm": 7.07421875, + "learning_rate": 9.041527623285132e-06, + "loss": 3.0166, + "mean_token_accuracy": 0.4230959446092977, + "step": 5170 + }, + { + "epoch": 0.9586577678902484, + "grad_norm": 7.046875, + "learning_rate": 9.041342232109753e-06, + "loss": 3.1354, + "mean_token_accuracy": 0.42696629213483145, + "step": 5171 + }, + { + "epoch": 0.9588431590656284, + "grad_norm": 4.9921875, + "learning_rate": 9.041156840934371e-06, + "loss": 2.6096, + "mean_token_accuracy": 0.4646479713603819, + "step": 5172 + }, + { + "epoch": 0.9590285502410085, + "grad_norm": 5.32421875, + "learning_rate": 9.040971449758992e-06, + "loss": 2.8641, + "mean_token_accuracy": 0.44739756367663347, + "step": 5173 + }, + { + "epoch": 0.9592139414163886, + "grad_norm": 5.88671875, + "learning_rate": 9.040786058583612e-06, + "loss": 3.0097, + "mean_token_accuracy": 0.4460464584048518, + "step": 5174 + }, + { + "epoch": 0.9593993325917687, + "grad_norm": 6.27734375, + "learning_rate": 9.040600667408233e-06, + "loss": 2.7316, + "mean_token_accuracy": 0.4506109230089798, + "step": 5175 + }, + { + "epoch": 0.9595847237671487, + "grad_norm": 7.62890625, + "learning_rate": 9.040415276232852e-06, + "loss": 2.6282, + "mean_token_accuracy": 0.49907846234860453, + "step": 5176 + }, + { + "epoch": 0.9597701149425287, + "grad_norm": 7.25390625, + "learning_rate": 9.040229885057472e-06, + "loss": 2.2252, + "mean_token_accuracy": 0.5144230769230769, + "step": 5177 + }, + { + "epoch": 0.9599555061179088, + "grad_norm": 7.953125, + "learning_rate": 9.040044493882093e-06, + "loss": 2.5846, + "mean_token_accuracy": 0.4631650750341064, + "step": 5178 + }, + { + "epoch": 0.9601408972932889, + "grad_norm": 8.546875, + "learning_rate": 9.039859102706711e-06, + "loss": 2.7318, + "mean_token_accuracy": 0.44526850707320426, + "step": 5179 + }, + { + "epoch": 0.9603262884686689, + "grad_norm": 7.89453125, + "learning_rate": 9.039673711531332e-06, + "loss": 2.5774, + "mean_token_accuracy": 0.4708860759493671, + "step": 5180 + }, + { + "epoch": 0.9605116796440489, + "grad_norm": 5.578125, + "learning_rate": 9.03948832035595e-06, + "loss": 2.6955, + "mean_token_accuracy": 0.45662750682330605, + "step": 5181 + }, + { + "epoch": 0.960697070819429, + "grad_norm": 5.72265625, + "learning_rate": 9.039302929180573e-06, + "loss": 2.8256, + "mean_token_accuracy": 0.4638644293071939, + "step": 5182 + }, + { + "epoch": 0.9608824619948091, + "grad_norm": 7.765625, + "learning_rate": 9.039117538005192e-06, + "loss": 2.5729, + "mean_token_accuracy": 0.4906201146430432, + "step": 5183 + }, + { + "epoch": 0.9610678531701891, + "grad_norm": 6.18359375, + "learning_rate": 9.038932146829812e-06, + "loss": 2.5885, + "mean_token_accuracy": 0.4794503589997524, + "step": 5184 + }, + { + "epoch": 0.9612532443455691, + "grad_norm": 5.66015625, + "learning_rate": 9.038746755654431e-06, + "loss": 2.6422, + "mean_token_accuracy": 0.45679554774458114, + "step": 5185 + }, + { + "epoch": 0.9614386355209492, + "grad_norm": 8.03125, + "learning_rate": 9.038561364479052e-06, + "loss": 2.6093, + "mean_token_accuracy": 0.46732700509010866, + "step": 5186 + }, + { + "epoch": 0.9616240266963293, + "grad_norm": 7.65625, + "learning_rate": 9.038375973303672e-06, + "loss": 2.4173, + "mean_token_accuracy": 0.506962962962963, + "step": 5187 + }, + { + "epoch": 0.9618094178717093, + "grad_norm": 5.35546875, + "learning_rate": 9.03819058212829e-06, + "loss": 2.7887, + "mean_token_accuracy": 0.4503105590062112, + "step": 5188 + }, + { + "epoch": 0.9619948090470893, + "grad_norm": 10.0625, + "learning_rate": 9.038005190952911e-06, + "loss": 2.238, + "mean_token_accuracy": 0.5131551252367923, + "step": 5189 + }, + { + "epoch": 0.9621802002224694, + "grad_norm": 8.2265625, + "learning_rate": 9.037819799777532e-06, + "loss": 3.0787, + "mean_token_accuracy": 0.429923805125837, + "step": 5190 + }, + { + "epoch": 0.9623655913978495, + "grad_norm": 10.671875, + "learning_rate": 9.037634408602152e-06, + "loss": 2.8502, + "mean_token_accuracy": 0.45515375854214124, + "step": 5191 + }, + { + "epoch": 0.9625509825732295, + "grad_norm": 5.3828125, + "learning_rate": 9.037449017426771e-06, + "loss": 2.9714, + "mean_token_accuracy": 0.44384707287933095, + "step": 5192 + }, + { + "epoch": 0.9627363737486095, + "grad_norm": 8.7421875, + "learning_rate": 9.037263626251392e-06, + "loss": 2.6406, + "mean_token_accuracy": 0.4695277815148661, + "step": 5193 + }, + { + "epoch": 0.9629217649239896, + "grad_norm": 7.70703125, + "learning_rate": 9.03707823507601e-06, + "loss": 2.7539, + "mean_token_accuracy": 0.4553635194972147, + "step": 5194 + }, + { + "epoch": 0.9631071560993697, + "grad_norm": 6.41796875, + "learning_rate": 9.036892843900631e-06, + "loss": 3.5296, + "mean_token_accuracy": 0.3985784001697433, + "step": 5195 + }, + { + "epoch": 0.9632925472747497, + "grad_norm": 6.359375, + "learning_rate": 9.036707452725251e-06, + "loss": 3.4724, + "mean_token_accuracy": 0.40551234820575793, + "step": 5196 + }, + { + "epoch": 0.9634779384501297, + "grad_norm": 6.82421875, + "learning_rate": 9.03652206154987e-06, + "loss": 2.9556, + "mean_token_accuracy": 0.4259637188208617, + "step": 5197 + }, + { + "epoch": 0.9636633296255098, + "grad_norm": 8.953125, + "learning_rate": 9.03633667037449e-06, + "loss": 2.8666, + "mean_token_accuracy": 0.4495681063122924, + "step": 5198 + }, + { + "epoch": 0.9638487208008899, + "grad_norm": 5.5, + "learning_rate": 9.036151279199111e-06, + "loss": 2.726, + "mean_token_accuracy": 0.4518666666666667, + "step": 5199 + }, + { + "epoch": 0.96403411197627, + "grad_norm": 6.6796875, + "learning_rate": 9.035965888023732e-06, + "loss": 2.4821, + "mean_token_accuracy": 0.4727700379952958, + "step": 5200 + }, + { + "epoch": 0.96421950315165, + "grad_norm": 7.65625, + "learning_rate": 9.03578049684835e-06, + "loss": 2.2674, + "mean_token_accuracy": 0.5025125628140703, + "step": 5201 + }, + { + "epoch": 0.96440489432703, + "grad_norm": 6.30078125, + "learning_rate": 9.035595105672971e-06, + "loss": 3.0469, + "mean_token_accuracy": 0.44163424124513617, + "step": 5202 + }, + { + "epoch": 0.9645902855024101, + "grad_norm": 7.92578125, + "learning_rate": 9.03540971449759e-06, + "loss": 2.7477, + "mean_token_accuracy": 0.44185490617903667, + "step": 5203 + }, + { + "epoch": 0.9647756766777902, + "grad_norm": 10.9765625, + "learning_rate": 9.03522432332221e-06, + "loss": 3.2446, + "mean_token_accuracy": 0.4154057771664374, + "step": 5204 + }, + { + "epoch": 0.9649610678531702, + "grad_norm": 8.15625, + "learning_rate": 9.03503893214683e-06, + "loss": 3.2804, + "mean_token_accuracy": 0.404245553643144, + "step": 5205 + }, + { + "epoch": 0.9651464590285502, + "grad_norm": 6.27734375, + "learning_rate": 9.03485354097145e-06, + "loss": 3.3929, + "mean_token_accuracy": 0.39731653888280394, + "step": 5206 + }, + { + "epoch": 0.9653318502039303, + "grad_norm": 10.0625, + "learning_rate": 9.034668149796072e-06, + "loss": 2.5174, + "mean_token_accuracy": 0.46927263730826324, + "step": 5207 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 13.3515625, + "learning_rate": 9.03448275862069e-06, + "loss": 3.0429, + "mean_token_accuracy": 0.4377210712481051, + "step": 5208 + }, + { + "epoch": 0.9657026325546904, + "grad_norm": 12.0, + "learning_rate": 9.034297367445311e-06, + "loss": 2.4607, + "mean_token_accuracy": 0.48308556713098444, + "step": 5209 + }, + { + "epoch": 0.9658880237300704, + "grad_norm": 7.8046875, + "learning_rate": 9.03411197626993e-06, + "loss": 3.4499, + "mean_token_accuracy": 0.408208851040067, + "step": 5210 + }, + { + "epoch": 0.9660734149054505, + "grad_norm": 6.53125, + "learning_rate": 9.03392658509455e-06, + "loss": 3.6012, + "mean_token_accuracy": 0.38065414057063324, + "step": 5211 + }, + { + "epoch": 0.9662588060808306, + "grad_norm": 9.9921875, + "learning_rate": 9.03374119391917e-06, + "loss": 3.7186, + "mean_token_accuracy": 0.3613712109189018, + "step": 5212 + }, + { + "epoch": 0.9664441972562106, + "grad_norm": 10.265625, + "learning_rate": 9.03355580274379e-06, + "loss": 2.5571, + "mean_token_accuracy": 0.47859116022099446, + "step": 5213 + }, + { + "epoch": 0.9666295884315906, + "grad_norm": 8.75, + "learning_rate": 9.03337041156841e-06, + "loss": 2.949, + "mean_token_accuracy": 0.44313603966532383, + "step": 5214 + }, + { + "epoch": 0.9668149796069707, + "grad_norm": 6.265625, + "learning_rate": 9.03318502039303e-06, + "loss": 3.0346, + "mean_token_accuracy": 0.42360797761934843, + "step": 5215 + }, + { + "epoch": 0.9670003707823508, + "grad_norm": 10.1875, + "learning_rate": 9.032999629217651e-06, + "loss": 3.0643, + "mean_token_accuracy": 0.4169190192439191, + "step": 5216 + }, + { + "epoch": 0.9671857619577308, + "grad_norm": 9.8984375, + "learning_rate": 9.03281423804227e-06, + "loss": 2.6794, + "mean_token_accuracy": 0.46551130494663145, + "step": 5217 + }, + { + "epoch": 0.9673711531331108, + "grad_norm": 7.22265625, + "learning_rate": 9.03262884686689e-06, + "loss": 2.8222, + "mean_token_accuracy": 0.4433491062039958, + "step": 5218 + }, + { + "epoch": 0.9675565443084909, + "grad_norm": 8.3984375, + "learning_rate": 9.032443455691509e-06, + "loss": 2.4979, + "mean_token_accuracy": 0.48482303539292143, + "step": 5219 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 10.1484375, + "learning_rate": 9.03225806451613e-06, + "loss": 2.6135, + "mean_token_accuracy": 0.4622072391767211, + "step": 5220 + }, + { + "epoch": 0.9679273266592511, + "grad_norm": 8.9296875, + "learning_rate": 9.03207267334075e-06, + "loss": 3.2953, + "mean_token_accuracy": 0.40985163204747777, + "step": 5221 + }, + { + "epoch": 0.968112717834631, + "grad_norm": 5.25, + "learning_rate": 9.031887282165369e-06, + "loss": 2.971, + "mean_token_accuracy": 0.44638949671772427, + "step": 5222 + }, + { + "epoch": 0.9682981090100111, + "grad_norm": 6.15625, + "learning_rate": 9.03170189098999e-06, + "loss": 2.9264, + "mean_token_accuracy": 0.43218785796105386, + "step": 5223 + }, + { + "epoch": 0.9684835001853912, + "grad_norm": 8.1015625, + "learning_rate": 9.03151649981461e-06, + "loss": 3.3585, + "mean_token_accuracy": 0.4038132206872931, + "step": 5224 + }, + { + "epoch": 0.9686688913607713, + "grad_norm": 6.55859375, + "learning_rate": 9.03133110863923e-06, + "loss": 2.9971, + "mean_token_accuracy": 0.4357034795763994, + "step": 5225 + }, + { + "epoch": 0.9688542825361512, + "grad_norm": 5.6015625, + "learning_rate": 9.03114571746385e-06, + "loss": 2.5248, + "mean_token_accuracy": 0.4737704918032787, + "step": 5226 + }, + { + "epoch": 0.9690396737115313, + "grad_norm": 8.375, + "learning_rate": 9.03096032628847e-06, + "loss": 2.4894, + "mean_token_accuracy": 0.48577844311377244, + "step": 5227 + }, + { + "epoch": 0.9692250648869114, + "grad_norm": 8.75, + "learning_rate": 9.030774935113088e-06, + "loss": 3.2596, + "mean_token_accuracy": 0.40288098776723447, + "step": 5228 + }, + { + "epoch": 0.9694104560622915, + "grad_norm": 5.42578125, + "learning_rate": 9.030589543937709e-06, + "loss": 2.495, + "mean_token_accuracy": 0.49743062692702983, + "step": 5229 + }, + { + "epoch": 0.9695958472376714, + "grad_norm": 7.52734375, + "learning_rate": 9.03040415276233e-06, + "loss": 2.9977, + "mean_token_accuracy": 0.43370100915278104, + "step": 5230 + }, + { + "epoch": 0.9697812384130515, + "grad_norm": 7.4453125, + "learning_rate": 9.03021876158695e-06, + "loss": 2.98, + "mean_token_accuracy": 0.43478260869565216, + "step": 5231 + }, + { + "epoch": 0.9699666295884316, + "grad_norm": 5.36328125, + "learning_rate": 9.030033370411569e-06, + "loss": 3.0468, + "mean_token_accuracy": 0.4341298371748784, + "step": 5232 + }, + { + "epoch": 0.9701520207638117, + "grad_norm": 7.01953125, + "learning_rate": 9.02984797923619e-06, + "loss": 2.5397, + "mean_token_accuracy": 0.48105234460196294, + "step": 5233 + }, + { + "epoch": 0.9703374119391917, + "grad_norm": 6.54296875, + "learning_rate": 9.02966258806081e-06, + "loss": 2.4322, + "mean_token_accuracy": 0.5385243670024503, + "step": 5234 + }, + { + "epoch": 0.9705228031145717, + "grad_norm": 6.1953125, + "learning_rate": 9.029477196885429e-06, + "loss": 3.202, + "mean_token_accuracy": 0.42265415549597857, + "step": 5235 + }, + { + "epoch": 0.9707081942899518, + "grad_norm": 5.21875, + "learning_rate": 9.029291805710049e-06, + "loss": 3.0352, + "mean_token_accuracy": 0.4229080263478897, + "step": 5236 + }, + { + "epoch": 0.9708935854653319, + "grad_norm": 7.6015625, + "learning_rate": 9.029106414534668e-06, + "loss": 2.8363, + "mean_token_accuracy": 0.4495102188898295, + "step": 5237 + }, + { + "epoch": 0.9710789766407119, + "grad_norm": 7.296875, + "learning_rate": 9.028921023359288e-06, + "loss": 3.108, + "mean_token_accuracy": 0.45034872135503157, + "step": 5238 + }, + { + "epoch": 0.9712643678160919, + "grad_norm": 8.375, + "learning_rate": 9.028735632183909e-06, + "loss": 2.9931, + "mean_token_accuracy": 0.44179750947482405, + "step": 5239 + }, + { + "epoch": 0.971449758991472, + "grad_norm": 7.96484375, + "learning_rate": 9.02855024100853e-06, + "loss": 3.0694, + "mean_token_accuracy": 0.44374846475067553, + "step": 5240 + }, + { + "epoch": 0.9716351501668521, + "grad_norm": 6.16796875, + "learning_rate": 9.028364849833148e-06, + "loss": 3.0564, + "mean_token_accuracy": 0.4228007181328546, + "step": 5241 + }, + { + "epoch": 0.9718205413422321, + "grad_norm": 6.42578125, + "learning_rate": 9.028179458657769e-06, + "loss": 2.9868, + "mean_token_accuracy": 0.43331603528801244, + "step": 5242 + }, + { + "epoch": 0.9720059325176121, + "grad_norm": 7.42578125, + "learning_rate": 9.027994067482389e-06, + "loss": 2.7804, + "mean_token_accuracy": 0.4559925093632959, + "step": 5243 + }, + { + "epoch": 0.9721913236929922, + "grad_norm": 7.38671875, + "learning_rate": 9.027808676307008e-06, + "loss": 3.0058, + "mean_token_accuracy": 0.4321780699133552, + "step": 5244 + }, + { + "epoch": 0.9723767148683723, + "grad_norm": 5.63671875, + "learning_rate": 9.027623285131628e-06, + "loss": 2.3579, + "mean_token_accuracy": 0.5018023430459597, + "step": 5245 + }, + { + "epoch": 0.9725621060437524, + "grad_norm": 5.4453125, + "learning_rate": 9.027437893956247e-06, + "loss": 2.8203, + "mean_token_accuracy": 0.44840345347403043, + "step": 5246 + }, + { + "epoch": 0.9727474972191323, + "grad_norm": 7.16796875, + "learning_rate": 9.02725250278087e-06, + "loss": 3.0954, + "mean_token_accuracy": 0.42288125077553046, + "step": 5247 + }, + { + "epoch": 0.9729328883945124, + "grad_norm": 9.734375, + "learning_rate": 9.027067111605488e-06, + "loss": 2.1252, + "mean_token_accuracy": 0.5217251367878983, + "step": 5248 + }, + { + "epoch": 0.9731182795698925, + "grad_norm": 6.890625, + "learning_rate": 9.026881720430109e-06, + "loss": 2.4225, + "mean_token_accuracy": 0.5027578599007171, + "step": 5249 + }, + { + "epoch": 0.9733036707452726, + "grad_norm": 8.2578125, + "learning_rate": 9.026696329254729e-06, + "loss": 2.5053, + "mean_token_accuracy": 0.49483766637641496, + "step": 5250 + }, + { + "epoch": 0.9734890619206525, + "grad_norm": 7.03515625, + "learning_rate": 9.026510938079348e-06, + "loss": 3.095, + "mean_token_accuracy": 0.42227135210612055, + "step": 5251 + }, + { + "epoch": 0.9736744530960326, + "grad_norm": 7.91015625, + "learning_rate": 9.026325546903968e-06, + "loss": 2.8561, + "mean_token_accuracy": 0.4489284085727314, + "step": 5252 + }, + { + "epoch": 0.9738598442714127, + "grad_norm": 8.953125, + "learning_rate": 9.026140155728587e-06, + "loss": 3.2796, + "mean_token_accuracy": 0.4184190031152648, + "step": 5253 + }, + { + "epoch": 0.9740452354467928, + "grad_norm": 5.4765625, + "learning_rate": 9.025954764553208e-06, + "loss": 2.4272, + "mean_token_accuracy": 0.5006065016982048, + "step": 5254 + }, + { + "epoch": 0.9742306266221727, + "grad_norm": 7.41015625, + "learning_rate": 9.025769373377828e-06, + "loss": 2.9454, + "mean_token_accuracy": 0.44598687531549724, + "step": 5255 + }, + { + "epoch": 0.9744160177975528, + "grad_norm": 7.08203125, + "learning_rate": 9.025583982202449e-06, + "loss": 2.6943, + "mean_token_accuracy": 0.46075353218210363, + "step": 5256 + }, + { + "epoch": 0.9746014089729329, + "grad_norm": 7.73828125, + "learning_rate": 9.025398591027067e-06, + "loss": 2.6617, + "mean_token_accuracy": 0.45768187104277036, + "step": 5257 + }, + { + "epoch": 0.974786800148313, + "grad_norm": 6.6640625, + "learning_rate": 9.025213199851688e-06, + "loss": 3.1012, + "mean_token_accuracy": 0.42696936542669583, + "step": 5258 + }, + { + "epoch": 0.974972191323693, + "grad_norm": 8.1640625, + "learning_rate": 9.025027808676308e-06, + "loss": 2.8136, + "mean_token_accuracy": 0.4522706209453197, + "step": 5259 + }, + { + "epoch": 0.975157582499073, + "grad_norm": 13.234375, + "learning_rate": 9.024842417500927e-06, + "loss": 3.001, + "mean_token_accuracy": 0.44327301337529507, + "step": 5260 + }, + { + "epoch": 0.9753429736744531, + "grad_norm": 8.9609375, + "learning_rate": 9.024657026325548e-06, + "loss": 3.377, + "mean_token_accuracy": 0.41501625775938517, + "step": 5261 + }, + { + "epoch": 0.9755283648498332, + "grad_norm": 5.8046875, + "learning_rate": 9.024471635150167e-06, + "loss": 2.5436, + "mean_token_accuracy": 0.48532094874771936, + "step": 5262 + }, + { + "epoch": 0.9757137560252132, + "grad_norm": 12.15625, + "learning_rate": 9.024286243974789e-06, + "loss": 2.7106, + "mean_token_accuracy": 0.4552808988764045, + "step": 5263 + }, + { + "epoch": 0.9758991472005932, + "grad_norm": 10.453125, + "learning_rate": 9.024100852799408e-06, + "loss": 2.8337, + "mean_token_accuracy": 0.4589080459770115, + "step": 5264 + }, + { + "epoch": 0.9760845383759733, + "grad_norm": 6.9296875, + "learning_rate": 9.023915461624028e-06, + "loss": 2.8556, + "mean_token_accuracy": 0.45927740355174523, + "step": 5265 + }, + { + "epoch": 0.9762699295513534, + "grad_norm": 10.4296875, + "learning_rate": 9.023730070448647e-06, + "loss": 2.011, + "mean_token_accuracy": 0.5403192227619709, + "step": 5266 + }, + { + "epoch": 0.9764553207267334, + "grad_norm": 10.1484375, + "learning_rate": 9.023544679273267e-06, + "loss": 3.1366, + "mean_token_accuracy": 0.4303623561737936, + "step": 5267 + }, + { + "epoch": 0.9766407119021134, + "grad_norm": 12.1875, + "learning_rate": 9.023359288097888e-06, + "loss": 2.6736, + "mean_token_accuracy": 0.4656447492268388, + "step": 5268 + }, + { + "epoch": 0.9768261030774935, + "grad_norm": 7.3671875, + "learning_rate": 9.023173896922507e-06, + "loss": 2.7089, + "mean_token_accuracy": 0.4565191315821039, + "step": 5269 + }, + { + "epoch": 0.9770114942528736, + "grad_norm": 9.34375, + "learning_rate": 9.022988505747127e-06, + "loss": 2.6935, + "mean_token_accuracy": 0.46748704663212437, + "step": 5270 + }, + { + "epoch": 0.9771968854282537, + "grad_norm": 10.2265625, + "learning_rate": 9.022803114571748e-06, + "loss": 3.1434, + "mean_token_accuracy": 0.4215041464112096, + "step": 5271 + }, + { + "epoch": 0.9773822766036336, + "grad_norm": 7.44140625, + "learning_rate": 9.022617723396368e-06, + "loss": 3.0097, + "mean_token_accuracy": 0.42879256965944273, + "step": 5272 + }, + { + "epoch": 0.9775676677790137, + "grad_norm": 6.28125, + "learning_rate": 9.022432332220987e-06, + "loss": 3.2989, + "mean_token_accuracy": 0.4105042693732882, + "step": 5273 + }, + { + "epoch": 0.9777530589543938, + "grad_norm": 7.24609375, + "learning_rate": 9.022246941045607e-06, + "loss": 2.7939, + "mean_token_accuracy": 0.43898201144726084, + "step": 5274 + }, + { + "epoch": 0.9779384501297739, + "grad_norm": 5.0, + "learning_rate": 9.022061549870226e-06, + "loss": 2.5596, + "mean_token_accuracy": 0.47299554565701557, + "step": 5275 + }, + { + "epoch": 0.9781238413051538, + "grad_norm": 5.81640625, + "learning_rate": 9.021876158694847e-06, + "loss": 3.4474, + "mean_token_accuracy": 0.3967266775777414, + "step": 5276 + }, + { + "epoch": 0.9783092324805339, + "grad_norm": 6.2890625, + "learning_rate": 9.021690767519467e-06, + "loss": 2.5461, + "mean_token_accuracy": 0.47848360655737704, + "step": 5277 + }, + { + "epoch": 0.978494623655914, + "grad_norm": 6.10546875, + "learning_rate": 9.021505376344086e-06, + "loss": 3.0837, + "mean_token_accuracy": 0.4216088651754566, + "step": 5278 + }, + { + "epoch": 0.9786800148312941, + "grad_norm": 5.77734375, + "learning_rate": 9.021319985168706e-06, + "loss": 2.8152, + "mean_token_accuracy": 0.4521991045562286, + "step": 5279 + }, + { + "epoch": 0.978865406006674, + "grad_norm": 5.65625, + "learning_rate": 9.021134593993327e-06, + "loss": 3.1958, + "mean_token_accuracy": 0.41112200588317693, + "step": 5280 + }, + { + "epoch": 0.9790507971820541, + "grad_norm": 7.0, + "learning_rate": 9.020949202817947e-06, + "loss": 2.479, + "mean_token_accuracy": 0.4852670349907919, + "step": 5281 + }, + { + "epoch": 0.9792361883574342, + "grad_norm": 5.51953125, + "learning_rate": 9.020763811642566e-06, + "loss": 2.936, + "mean_token_accuracy": 0.4313387860525183, + "step": 5282 + }, + { + "epoch": 0.9794215795328143, + "grad_norm": 6.9375, + "learning_rate": 9.020578420467187e-06, + "loss": 3.0654, + "mean_token_accuracy": 0.43891004980955173, + "step": 5283 + }, + { + "epoch": 0.9796069707081942, + "grad_norm": 6.4921875, + "learning_rate": 9.020393029291805e-06, + "loss": 3.3757, + "mean_token_accuracy": 0.42258726899383986, + "step": 5284 + }, + { + "epoch": 0.9797923618835743, + "grad_norm": 7.8515625, + "learning_rate": 9.020207638116426e-06, + "loss": 3.3513, + "mean_token_accuracy": 0.3975588491717524, + "step": 5285 + }, + { + "epoch": 0.9799777530589544, + "grad_norm": 6.83984375, + "learning_rate": 9.020022246941046e-06, + "loss": 2.9679, + "mean_token_accuracy": 0.4574526489157288, + "step": 5286 + }, + { + "epoch": 0.9801631442343345, + "grad_norm": 5.26171875, + "learning_rate": 9.019836855765667e-06, + "loss": 2.8174, + "mean_token_accuracy": 0.45067167412971254, + "step": 5287 + }, + { + "epoch": 0.9803485354097144, + "grad_norm": 5.58203125, + "learning_rate": 9.019651464590287e-06, + "loss": 3.1026, + "mean_token_accuracy": 0.4354686020826759, + "step": 5288 + }, + { + "epoch": 0.9805339265850945, + "grad_norm": 7.6953125, + "learning_rate": 9.019466073414906e-06, + "loss": 2.4289, + "mean_token_accuracy": 0.49359768690623707, + "step": 5289 + }, + { + "epoch": 0.9807193177604746, + "grad_norm": 5.41796875, + "learning_rate": 9.019280682239527e-06, + "loss": 3.0133, + "mean_token_accuracy": 0.42977564518220723, + "step": 5290 + }, + { + "epoch": 0.9809047089358547, + "grad_norm": 5.96875, + "learning_rate": 9.019095291064146e-06, + "loss": 2.2879, + "mean_token_accuracy": 0.5046447729948973, + "step": 5291 + }, + { + "epoch": 0.9810901001112347, + "grad_norm": 7.29296875, + "learning_rate": 9.018909899888766e-06, + "loss": 2.6908, + "mean_token_accuracy": 0.4632418069087688, + "step": 5292 + }, + { + "epoch": 0.9812754912866147, + "grad_norm": 8.265625, + "learning_rate": 9.018724508713387e-06, + "loss": 2.6789, + "mean_token_accuracy": 0.4739330746847721, + "step": 5293 + }, + { + "epoch": 0.9814608824619948, + "grad_norm": 5.78125, + "learning_rate": 9.018539117538005e-06, + "loss": 2.6383, + "mean_token_accuracy": 0.47544361763022325, + "step": 5294 + }, + { + "epoch": 0.9816462736373749, + "grad_norm": 8.2265625, + "learning_rate": 9.018353726362626e-06, + "loss": 2.9225, + "mean_token_accuracy": 0.42470119521912353, + "step": 5295 + }, + { + "epoch": 0.981831664812755, + "grad_norm": 7.2421875, + "learning_rate": 9.018168335187246e-06, + "loss": 3.6038, + "mean_token_accuracy": 0.3933632286995516, + "step": 5296 + }, + { + "epoch": 0.9820170559881349, + "grad_norm": 5.984375, + "learning_rate": 9.017982944011867e-06, + "loss": 2.706, + "mean_token_accuracy": 0.47243090007087174, + "step": 5297 + }, + { + "epoch": 0.982202447163515, + "grad_norm": 8.3359375, + "learning_rate": 9.017797552836486e-06, + "loss": 2.9854, + "mean_token_accuracy": 0.40631054525325216, + "step": 5298 + }, + { + "epoch": 0.9823878383388951, + "grad_norm": 6.83203125, + "learning_rate": 9.017612161661106e-06, + "loss": 3.0442, + "mean_token_accuracy": 0.42088297126839525, + "step": 5299 + }, + { + "epoch": 0.9825732295142752, + "grad_norm": 6.2890625, + "learning_rate": 9.017426770485725e-06, + "loss": 2.4303, + "mean_token_accuracy": 0.4971590909090909, + "step": 5300 + }, + { + "epoch": 0.9827586206896551, + "grad_norm": 6.01953125, + "learning_rate": 9.017241379310345e-06, + "loss": 2.6119, + "mean_token_accuracy": 0.4545100077982844, + "step": 5301 + }, + { + "epoch": 0.9829440118650352, + "grad_norm": 5.32421875, + "learning_rate": 9.017055988134966e-06, + "loss": 2.3698, + "mean_token_accuracy": 0.49861395685187415, + "step": 5302 + }, + { + "epoch": 0.9831294030404153, + "grad_norm": 4.9921875, + "learning_rate": 9.016870596959586e-06, + "loss": 2.6448, + "mean_token_accuracy": 0.4657534246575342, + "step": 5303 + }, + { + "epoch": 0.9833147942157954, + "grad_norm": 6.24609375, + "learning_rate": 9.016685205784205e-06, + "loss": 2.7036, + "mean_token_accuracy": 0.4474397830594991, + "step": 5304 + }, + { + "epoch": 0.9835001853911753, + "grad_norm": 7.28125, + "learning_rate": 9.016499814608826e-06, + "loss": 2.6528, + "mean_token_accuracy": 0.4669168751737559, + "step": 5305 + }, + { + "epoch": 0.9836855765665554, + "grad_norm": 9.09375, + "learning_rate": 9.016314423433446e-06, + "loss": 2.8119, + "mean_token_accuracy": 0.44283623353390794, + "step": 5306 + }, + { + "epoch": 0.9838709677419355, + "grad_norm": 6.609375, + "learning_rate": 9.016129032258065e-06, + "loss": 3.7831, + "mean_token_accuracy": 0.3781437125748503, + "step": 5307 + }, + { + "epoch": 0.9840563589173156, + "grad_norm": 7.12890625, + "learning_rate": 9.015943641082685e-06, + "loss": 2.877, + "mean_token_accuracy": 0.4429403005279545, + "step": 5308 + }, + { + "epoch": 0.9842417500926955, + "grad_norm": 6.03125, + "learning_rate": 9.015758249907304e-06, + "loss": 2.9062, + "mean_token_accuracy": 0.4446273218904303, + "step": 5309 + }, + { + "epoch": 0.9844271412680756, + "grad_norm": 8.0859375, + "learning_rate": 9.015572858731925e-06, + "loss": 2.753, + "mean_token_accuracy": 0.45505452402004126, + "step": 5310 + }, + { + "epoch": 0.9846125324434557, + "grad_norm": 6.69140625, + "learning_rate": 9.015387467556545e-06, + "loss": 2.3676, + "mean_token_accuracy": 0.5065679733110926, + "step": 5311 + }, + { + "epoch": 0.9847979236188358, + "grad_norm": 6.2109375, + "learning_rate": 9.015202076381166e-06, + "loss": 2.6653, + "mean_token_accuracy": 0.47414406176978563, + "step": 5312 + }, + { + "epoch": 0.9849833147942157, + "grad_norm": 8.7265625, + "learning_rate": 9.015016685205784e-06, + "loss": 2.7725, + "mean_token_accuracy": 0.463492597577389, + "step": 5313 + }, + { + "epoch": 0.9851687059695958, + "grad_norm": 9.921875, + "learning_rate": 9.014831294030405e-06, + "loss": 3.0038, + "mean_token_accuracy": 0.43437423761893146, + "step": 5314 + }, + { + "epoch": 0.9853540971449759, + "grad_norm": 6.00390625, + "learning_rate": 9.014645902855025e-06, + "loss": 3.1276, + "mean_token_accuracy": 0.4335689469270397, + "step": 5315 + }, + { + "epoch": 0.985539488320356, + "grad_norm": 7.3046875, + "learning_rate": 9.014460511679644e-06, + "loss": 2.5009, + "mean_token_accuracy": 0.474618149146451, + "step": 5316 + }, + { + "epoch": 0.985724879495736, + "grad_norm": 7.64453125, + "learning_rate": 9.014275120504265e-06, + "loss": 2.5951, + "mean_token_accuracy": 0.4977079240340537, + "step": 5317 + }, + { + "epoch": 0.985910270671116, + "grad_norm": 6.17578125, + "learning_rate": 9.014089729328884e-06, + "loss": 3.2152, + "mean_token_accuracy": 0.41438937779836343, + "step": 5318 + }, + { + "epoch": 0.9860956618464961, + "grad_norm": 6.1484375, + "learning_rate": 9.013904338153506e-06, + "loss": 2.0302, + "mean_token_accuracy": 0.5498069498069498, + "step": 5319 + }, + { + "epoch": 0.9862810530218762, + "grad_norm": 8.2109375, + "learning_rate": 9.013718946978125e-06, + "loss": 2.7719, + "mean_token_accuracy": 0.4525238263325097, + "step": 5320 + }, + { + "epoch": 0.9864664441972563, + "grad_norm": 5.74609375, + "learning_rate": 9.013533555802745e-06, + "loss": 3.0582, + "mean_token_accuracy": 0.44462257849031395, + "step": 5321 + }, + { + "epoch": 0.9866518353726362, + "grad_norm": 6.390625, + "learning_rate": 9.013348164627364e-06, + "loss": 2.9786, + "mean_token_accuracy": 0.43372126028952596, + "step": 5322 + }, + { + "epoch": 0.9868372265480163, + "grad_norm": 6.5078125, + "learning_rate": 9.013162773451984e-06, + "loss": 3.3364, + "mean_token_accuracy": 0.4204374057315234, + "step": 5323 + }, + { + "epoch": 0.9870226177233964, + "grad_norm": 6.59765625, + "learning_rate": 9.012977382276605e-06, + "loss": 2.6318, + "mean_token_accuracy": 0.4917234664070107, + "step": 5324 + }, + { + "epoch": 0.9872080088987765, + "grad_norm": 4.609375, + "learning_rate": 9.012791991101224e-06, + "loss": 2.9282, + "mean_token_accuracy": 0.4313700051894136, + "step": 5325 + }, + { + "epoch": 0.9873934000741564, + "grad_norm": 6.9765625, + "learning_rate": 9.012606599925844e-06, + "loss": 3.0229, + "mean_token_accuracy": 0.42448266595001344, + "step": 5326 + }, + { + "epoch": 0.9875787912495365, + "grad_norm": 6.3984375, + "learning_rate": 9.012421208750463e-06, + "loss": 2.9413, + "mean_token_accuracy": 0.46790766939687267, + "step": 5327 + }, + { + "epoch": 0.9877641824249166, + "grad_norm": 5.5859375, + "learning_rate": 9.012235817575085e-06, + "loss": 3.0831, + "mean_token_accuracy": 0.4244882675986021, + "step": 5328 + }, + { + "epoch": 0.9879495736002967, + "grad_norm": 6.73828125, + "learning_rate": 9.012050426399704e-06, + "loss": 3.2177, + "mean_token_accuracy": 0.4176895721004933, + "step": 5329 + }, + { + "epoch": 0.9881349647756766, + "grad_norm": 5.96484375, + "learning_rate": 9.011865035224324e-06, + "loss": 3.0277, + "mean_token_accuracy": 0.4436307146027519, + "step": 5330 + }, + { + "epoch": 0.9883203559510567, + "grad_norm": 6.421875, + "learning_rate": 9.011679644048945e-06, + "loss": 2.4874, + "mean_token_accuracy": 0.4778126964173476, + "step": 5331 + }, + { + "epoch": 0.9885057471264368, + "grad_norm": 5.5390625, + "learning_rate": 9.011494252873564e-06, + "loss": 2.4293, + "mean_token_accuracy": 0.51498561751879, + "step": 5332 + }, + { + "epoch": 0.9886911383018169, + "grad_norm": 7.08984375, + "learning_rate": 9.011308861698184e-06, + "loss": 2.8724, + "mean_token_accuracy": 0.43947272947152016, + "step": 5333 + }, + { + "epoch": 0.9888765294771968, + "grad_norm": 6.22265625, + "learning_rate": 9.011123470522803e-06, + "loss": 2.8004, + "mean_token_accuracy": 0.4588452899986788, + "step": 5334 + }, + { + "epoch": 0.9890619206525769, + "grad_norm": 7.19921875, + "learning_rate": 9.010938079347423e-06, + "loss": 2.769, + "mean_token_accuracy": 0.46843335931410757, + "step": 5335 + }, + { + "epoch": 0.989247311827957, + "grad_norm": 5.9453125, + "learning_rate": 9.010752688172044e-06, + "loss": 3.2927, + "mean_token_accuracy": 0.4335483870967742, + "step": 5336 + }, + { + "epoch": 0.9894327030033371, + "grad_norm": 8.765625, + "learning_rate": 9.010567296996664e-06, + "loss": 2.5567, + "mean_token_accuracy": 0.48309302028837564, + "step": 5337 + }, + { + "epoch": 0.989618094178717, + "grad_norm": 6.609375, + "learning_rate": 9.010381905821283e-06, + "loss": 3.3472, + "mean_token_accuracy": 0.42130872483221476, + "step": 5338 + }, + { + "epoch": 0.9898034853540971, + "grad_norm": 5.96484375, + "learning_rate": 9.010196514645904e-06, + "loss": 2.9473, + "mean_token_accuracy": 0.4480499695307739, + "step": 5339 + }, + { + "epoch": 0.9899888765294772, + "grad_norm": 6.3046875, + "learning_rate": 9.010011123470524e-06, + "loss": 2.755, + "mean_token_accuracy": 0.4540897941772422, + "step": 5340 + }, + { + "epoch": 0.9901742677048573, + "grad_norm": 8.109375, + "learning_rate": 9.009825732295143e-06, + "loss": 2.6095, + "mean_token_accuracy": 0.4766005241482591, + "step": 5341 + }, + { + "epoch": 0.9903596588802372, + "grad_norm": 6.0703125, + "learning_rate": 9.009640341119763e-06, + "loss": 3.0822, + "mean_token_accuracy": 0.4536385936222404, + "step": 5342 + }, + { + "epoch": 0.9905450500556173, + "grad_norm": 7.49609375, + "learning_rate": 9.009454949944382e-06, + "loss": 2.2223, + "mean_token_accuracy": 0.5430619053503027, + "step": 5343 + }, + { + "epoch": 0.9907304412309974, + "grad_norm": 7.7734375, + "learning_rate": 9.009269558769004e-06, + "loss": 2.3547, + "mean_token_accuracy": 0.4967394030599448, + "step": 5344 + }, + { + "epoch": 0.9909158324063775, + "grad_norm": 7.25390625, + "learning_rate": 9.009084167593623e-06, + "loss": 2.9127, + "mean_token_accuracy": 0.43528753323072616, + "step": 5345 + }, + { + "epoch": 0.9911012235817576, + "grad_norm": 6.76953125, + "learning_rate": 9.008898776418244e-06, + "loss": 2.5838, + "mean_token_accuracy": 0.46951807228915665, + "step": 5346 + }, + { + "epoch": 0.9912866147571375, + "grad_norm": 7.23046875, + "learning_rate": 9.008713385242863e-06, + "loss": 2.4568, + "mean_token_accuracy": 0.48092594779733083, + "step": 5347 + }, + { + "epoch": 0.9914720059325176, + "grad_norm": 7.76953125, + "learning_rate": 9.008527994067483e-06, + "loss": 2.8586, + "mean_token_accuracy": 0.43542882307607617, + "step": 5348 + }, + { + "epoch": 0.9916573971078977, + "grad_norm": 14.0703125, + "learning_rate": 9.008342602892104e-06, + "loss": 2.8178, + "mean_token_accuracy": 0.4403052064631957, + "step": 5349 + }, + { + "epoch": 0.9918427882832778, + "grad_norm": 6.68359375, + "learning_rate": 9.008157211716722e-06, + "loss": 3.051, + "mean_token_accuracy": 0.4163214581607291, + "step": 5350 + }, + { + "epoch": 0.9920281794586577, + "grad_norm": 4.88671875, + "learning_rate": 9.007971820541343e-06, + "loss": 2.933, + "mean_token_accuracy": 0.4471948978488812, + "step": 5351 + }, + { + "epoch": 0.9922135706340378, + "grad_norm": 11.6875, + "learning_rate": 9.007786429365963e-06, + "loss": 2.6764, + "mean_token_accuracy": 0.45178335535006603, + "step": 5352 + }, + { + "epoch": 0.9923989618094179, + "grad_norm": 6.3828125, + "learning_rate": 9.007601038190584e-06, + "loss": 2.6396, + "mean_token_accuracy": 0.47504682163710477, + "step": 5353 + }, + { + "epoch": 0.992584352984798, + "grad_norm": 7.15625, + "learning_rate": 9.007415647015203e-06, + "loss": 2.969, + "mean_token_accuracy": 0.44990892531876137, + "step": 5354 + }, + { + "epoch": 0.9927697441601779, + "grad_norm": 6.8671875, + "learning_rate": 9.007230255839823e-06, + "loss": 2.5238, + "mean_token_accuracy": 0.46928872430085555, + "step": 5355 + }, + { + "epoch": 0.992955135335558, + "grad_norm": 6.23046875, + "learning_rate": 9.007044864664442e-06, + "loss": 2.8182, + "mean_token_accuracy": 0.4621351008320406, + "step": 5356 + }, + { + "epoch": 0.9931405265109381, + "grad_norm": 5.29296875, + "learning_rate": 9.006859473489062e-06, + "loss": 2.9151, + "mean_token_accuracy": 0.45566891794798103, + "step": 5357 + }, + { + "epoch": 0.9933259176863182, + "grad_norm": 6.32421875, + "learning_rate": 9.006674082313683e-06, + "loss": 2.6284, + "mean_token_accuracy": 0.48252125792954514, + "step": 5358 + }, + { + "epoch": 0.9935113088616981, + "grad_norm": 7.484375, + "learning_rate": 9.006488691138302e-06, + "loss": 2.9684, + "mean_token_accuracy": 0.43583923087194554, + "step": 5359 + }, + { + "epoch": 0.9936967000370782, + "grad_norm": 5.828125, + "learning_rate": 9.006303299962922e-06, + "loss": 2.5793, + "mean_token_accuracy": 0.477866188501625, + "step": 5360 + }, + { + "epoch": 0.9938820912124583, + "grad_norm": 7.1796875, + "learning_rate": 9.006117908787543e-06, + "loss": 3.138, + "mean_token_accuracy": 0.40840760941865106, + "step": 5361 + }, + { + "epoch": 0.9940674823878384, + "grad_norm": 10.7578125, + "learning_rate": 9.005932517612163e-06, + "loss": 3.3873, + "mean_token_accuracy": 0.38589277241815595, + "step": 5362 + }, + { + "epoch": 0.9942528735632183, + "grad_norm": 9.640625, + "learning_rate": 9.005747126436782e-06, + "loss": 2.3354, + "mean_token_accuracy": 0.5178206381654964, + "step": 5363 + }, + { + "epoch": 0.9944382647385984, + "grad_norm": 7.50390625, + "learning_rate": 9.005561735261402e-06, + "loss": 3.5893, + "mean_token_accuracy": 0.3940066592674806, + "step": 5364 + }, + { + "epoch": 0.9946236559139785, + "grad_norm": 7.5625, + "learning_rate": 9.005376344086021e-06, + "loss": 2.7911, + "mean_token_accuracy": 0.45264483627204033, + "step": 5365 + }, + { + "epoch": 0.9948090470893586, + "grad_norm": 5.44921875, + "learning_rate": 9.005190952910642e-06, + "loss": 3.1668, + "mean_token_accuracy": 0.41986190686058467, + "step": 5366 + }, + { + "epoch": 0.9949944382647385, + "grad_norm": 7.14453125, + "learning_rate": 9.005005561735262e-06, + "loss": 2.952, + "mean_token_accuracy": 0.4292821606254442, + "step": 5367 + }, + { + "epoch": 0.9951798294401186, + "grad_norm": 6.05859375, + "learning_rate": 9.004820170559883e-06, + "loss": 3.1834, + "mean_token_accuracy": 0.43790849673202614, + "step": 5368 + }, + { + "epoch": 0.9953652206154987, + "grad_norm": 5.01171875, + "learning_rate": 9.004634779384503e-06, + "loss": 3.0007, + "mean_token_accuracy": 0.4277613703484938, + "step": 5369 + }, + { + "epoch": 0.9955506117908788, + "grad_norm": 5.76171875, + "learning_rate": 9.004449388209122e-06, + "loss": 2.5067, + "mean_token_accuracy": 0.4837253829321663, + "step": 5370 + }, + { + "epoch": 0.9957360029662589, + "grad_norm": 6.48828125, + "learning_rate": 9.004263997033742e-06, + "loss": 2.5389, + "mean_token_accuracy": 0.4820298658567451, + "step": 5371 + }, + { + "epoch": 0.9959213941416388, + "grad_norm": 6.578125, + "learning_rate": 9.004078605858361e-06, + "loss": 2.4616, + "mean_token_accuracy": 0.48828031571394404, + "step": 5372 + }, + { + "epoch": 0.9961067853170189, + "grad_norm": 5.26171875, + "learning_rate": 9.003893214682982e-06, + "loss": 2.6077, + "mean_token_accuracy": 0.4764300847457627, + "step": 5373 + }, + { + "epoch": 0.996292176492399, + "grad_norm": 6.66796875, + "learning_rate": 9.003707823507602e-06, + "loss": 3.2044, + "mean_token_accuracy": 0.4174381811789523, + "step": 5374 + }, + { + "epoch": 0.9964775676677791, + "grad_norm": 7.0703125, + "learning_rate": 9.003522432332221e-06, + "loss": 2.8274, + "mean_token_accuracy": 0.45360262008733626, + "step": 5375 + }, + { + "epoch": 0.996662958843159, + "grad_norm": 9.3125, + "learning_rate": 9.003337041156842e-06, + "loss": 3.1837, + "mean_token_accuracy": 0.4358379715522573, + "step": 5376 + }, + { + "epoch": 0.9968483500185391, + "grad_norm": 10.5, + "learning_rate": 9.003151649981462e-06, + "loss": 2.5045, + "mean_token_accuracy": 0.48051391862955034, + "step": 5377 + }, + { + "epoch": 0.9970337411939192, + "grad_norm": 8.3515625, + "learning_rate": 9.002966258806083e-06, + "loss": 2.7759, + "mean_token_accuracy": 0.450172229480936, + "step": 5378 + }, + { + "epoch": 0.9972191323692993, + "grad_norm": 9.765625, + "learning_rate": 9.002780867630701e-06, + "loss": 3.0243, + "mean_token_accuracy": 0.43997089497938396, + "step": 5379 + }, + { + "epoch": 0.9974045235446792, + "grad_norm": 8.171875, + "learning_rate": 9.002595476455322e-06, + "loss": 3.0958, + "mean_token_accuracy": 0.4264875239923225, + "step": 5380 + }, + { + "epoch": 0.9975899147200593, + "grad_norm": 8.203125, + "learning_rate": 9.00241008527994e-06, + "loss": 2.605, + "mean_token_accuracy": 0.47257528913317304, + "step": 5381 + }, + { + "epoch": 0.9977753058954394, + "grad_norm": 8.1796875, + "learning_rate": 9.002224694104561e-06, + "loss": 2.7325, + "mean_token_accuracy": 0.47313371616078753, + "step": 5382 + }, + { + "epoch": 0.9979606970708195, + "grad_norm": 7.546875, + "learning_rate": 9.002039302929182e-06, + "loss": 2.6217, + "mean_token_accuracy": 0.4649734859056656, + "step": 5383 + }, + { + "epoch": 0.9981460882461994, + "grad_norm": 6.98046875, + "learning_rate": 9.001853911753802e-06, + "loss": 2.4382, + "mean_token_accuracy": 0.5214841259722045, + "step": 5384 + }, + { + "epoch": 0.9983314794215795, + "grad_norm": 8.21875, + "learning_rate": 9.001668520578421e-06, + "loss": 2.9507, + "mean_token_accuracy": 0.4603713768115942, + "step": 5385 + }, + { + "epoch": 0.9985168705969596, + "grad_norm": 6.6015625, + "learning_rate": 9.001483129403041e-06, + "loss": 2.5004, + "mean_token_accuracy": 0.49199300228771364, + "step": 5386 + }, + { + "epoch": 0.9987022617723397, + "grad_norm": 7.265625, + "learning_rate": 9.001297738227662e-06, + "loss": 2.7984, + "mean_token_accuracy": 0.4648922076357943, + "step": 5387 + }, + { + "epoch": 0.9988876529477196, + "grad_norm": 12.0546875, + "learning_rate": 9.00111234705228e-06, + "loss": 2.629, + "mean_token_accuracy": 0.4644888082274652, + "step": 5388 + }, + { + "epoch": 0.9990730441230997, + "grad_norm": 7.83203125, + "learning_rate": 9.000926955876901e-06, + "loss": 3.0245, + "mean_token_accuracy": 0.4527827116637063, + "step": 5389 + }, + { + "epoch": 0.9992584352984798, + "grad_norm": 8.3671875, + "learning_rate": 9.00074156470152e-06, + "loss": 2.0859, + "mean_token_accuracy": 0.5297951582867784, + "step": 5390 + }, + { + "epoch": 0.9994438264738599, + "grad_norm": 11.0078125, + "learning_rate": 9.00055617352614e-06, + "loss": 3.0128, + "mean_token_accuracy": 0.4428139835994821, + "step": 5391 + }, + { + "epoch": 0.9996292176492398, + "grad_norm": 8.4765625, + "learning_rate": 9.000370782350761e-06, + "loss": 2.9123, + "mean_token_accuracy": 0.44999464036874265, + "step": 5392 + }, + { + "epoch": 0.9998146088246199, + "grad_norm": 7.53515625, + "learning_rate": 9.000185391175381e-06, + "loss": 2.8187, + "mean_token_accuracy": 0.4531951640759931, + "step": 5393 + }, + { + "epoch": 1.0, + "grad_norm": 8.6015625, + "learning_rate": 9e-06, + "loss": 2.6479, + "mean_token_accuracy": 0.47234957020057305, + "step": 5394 + }, + { + "epoch": 1.00018539117538, + "grad_norm": 7.25, + "learning_rate": 8.99981460882462e-06, + "loss": 2.7427, + "mean_token_accuracy": 0.45970028351559333, + "step": 5395 + }, + { + "epoch": 1.0003707823507602, + "grad_norm": 6.48046875, + "learning_rate": 8.999629217649241e-06, + "loss": 2.4549, + "mean_token_accuracy": 0.4942700548081714, + "step": 5396 + }, + { + "epoch": 1.0005561735261401, + "grad_norm": 5.8046875, + "learning_rate": 8.99944382647386e-06, + "loss": 2.8212, + "mean_token_accuracy": 0.46564102564102566, + "step": 5397 + }, + { + "epoch": 1.0007415647015203, + "grad_norm": 5.671875, + "learning_rate": 8.99925843529848e-06, + "loss": 2.6737, + "mean_token_accuracy": 0.4762461059190031, + "step": 5398 + }, + { + "epoch": 1.0009269558769003, + "grad_norm": 8.0859375, + "learning_rate": 8.9990730441231e-06, + "loss": 2.7601, + "mean_token_accuracy": 0.48086188436830835, + "step": 5399 + }, + { + "epoch": 1.0011123470522802, + "grad_norm": 6.15625, + "learning_rate": 8.998887652947721e-06, + "loss": 3.1373, + "mean_token_accuracy": 0.4137083141232519, + "step": 5400 + }, + { + "epoch": 1.0012977382276604, + "grad_norm": 7.9375, + "learning_rate": 8.99870226177234e-06, + "loss": 2.8595, + "mean_token_accuracy": 0.46283362434522324, + "step": 5401 + }, + { + "epoch": 1.0014831294030404, + "grad_norm": 7.7265625, + "learning_rate": 8.99851687059696e-06, + "loss": 2.987, + "mean_token_accuracy": 0.4263896103896104, + "step": 5402 + }, + { + "epoch": 1.0016685205784204, + "grad_norm": 9.84375, + "learning_rate": 8.99833147942158e-06, + "loss": 2.8224, + "mean_token_accuracy": 0.440157771477875, + "step": 5403 + }, + { + "epoch": 1.0018539117538006, + "grad_norm": 11.21875, + "learning_rate": 8.9981460882462e-06, + "loss": 3.122, + "mean_token_accuracy": 0.3997254197908966, + "step": 5404 + }, + { + "epoch": 1.0020393029291805, + "grad_norm": 7.17578125, + "learning_rate": 8.99796069707082e-06, + "loss": 2.9079, + "mean_token_accuracy": 0.444659193402912, + "step": 5405 + }, + { + "epoch": 1.0022246941045607, + "grad_norm": 8.3046875, + "learning_rate": 8.99777530589544e-06, + "loss": 2.8867, + "mean_token_accuracy": 0.42133566783391696, + "step": 5406 + }, + { + "epoch": 1.0024100852799407, + "grad_norm": 6.5703125, + "learning_rate": 8.99758991472006e-06, + "loss": 2.3106, + "mean_token_accuracy": 0.5282411820781697, + "step": 5407 + }, + { + "epoch": 1.0025954764553207, + "grad_norm": 6.546875, + "learning_rate": 8.99740452354468e-06, + "loss": 2.8621, + "mean_token_accuracy": 0.4527790329868956, + "step": 5408 + }, + { + "epoch": 1.0027808676307008, + "grad_norm": 8.2734375, + "learning_rate": 8.9972191323693e-06, + "loss": 2.6703, + "mean_token_accuracy": 0.47613252197430694, + "step": 5409 + }, + { + "epoch": 1.0029662588060808, + "grad_norm": 6.5078125, + "learning_rate": 8.99703374119392e-06, + "loss": 2.4359, + "mean_token_accuracy": 0.48895582329317266, + "step": 5410 + }, + { + "epoch": 1.0031516499814608, + "grad_norm": 7.328125, + "learning_rate": 8.99684835001854e-06, + "loss": 2.3395, + "mean_token_accuracy": 0.5009856100926473, + "step": 5411 + }, + { + "epoch": 1.003337041156841, + "grad_norm": 9.5859375, + "learning_rate": 8.99666295884316e-06, + "loss": 2.7966, + "mean_token_accuracy": 0.4670528602461984, + "step": 5412 + }, + { + "epoch": 1.003522432332221, + "grad_norm": 7.55078125, + "learning_rate": 8.99647756766778e-06, + "loss": 3.3766, + "mean_token_accuracy": 0.39555498458376154, + "step": 5413 + }, + { + "epoch": 1.0037078235076011, + "grad_norm": 7.18359375, + "learning_rate": 8.9962921764924e-06, + "loss": 2.6531, + "mean_token_accuracy": 0.46013356957884693, + "step": 5414 + }, + { + "epoch": 1.003893214682981, + "grad_norm": 11.6484375, + "learning_rate": 8.996106785317019e-06, + "loss": 2.3123, + "mean_token_accuracy": 0.5195154777927322, + "step": 5415 + }, + { + "epoch": 1.004078605858361, + "grad_norm": 7.54296875, + "learning_rate": 8.995921394141641e-06, + "loss": 2.4731, + "mean_token_accuracy": 0.4807906741003548, + "step": 5416 + }, + { + "epoch": 1.0042639970337413, + "grad_norm": 8.71875, + "learning_rate": 8.99573600296626e-06, + "loss": 2.9618, + "mean_token_accuracy": 0.4386507189893831, + "step": 5417 + }, + { + "epoch": 1.0044493882091212, + "grad_norm": 7.578125, + "learning_rate": 8.99555061179088e-06, + "loss": 2.5025, + "mean_token_accuracy": 0.5083069118579582, + "step": 5418 + }, + { + "epoch": 1.0046347793845012, + "grad_norm": 6.7265625, + "learning_rate": 8.995365220615499e-06, + "loss": 2.4786, + "mean_token_accuracy": 0.4991372368572415, + "step": 5419 + }, + { + "epoch": 1.0048201705598814, + "grad_norm": 7.953125, + "learning_rate": 8.99517982944012e-06, + "loss": 2.7431, + "mean_token_accuracy": 0.4607252612169637, + "step": 5420 + }, + { + "epoch": 1.0050055617352613, + "grad_norm": 10.3359375, + "learning_rate": 8.99499443826474e-06, + "loss": 2.7897, + "mean_token_accuracy": 0.46403054939640304, + "step": 5421 + }, + { + "epoch": 1.0051909529106415, + "grad_norm": 5.484375, + "learning_rate": 8.994809047089359e-06, + "loss": 2.8096, + "mean_token_accuracy": 0.47230993441826574, + "step": 5422 + }, + { + "epoch": 1.0053763440860215, + "grad_norm": 9.1875, + "learning_rate": 8.99462365591398e-06, + "loss": 2.7145, + "mean_token_accuracy": 0.4539916639948702, + "step": 5423 + }, + { + "epoch": 1.0055617352614015, + "grad_norm": 7.51171875, + "learning_rate": 8.9944382647386e-06, + "loss": 2.5722, + "mean_token_accuracy": 0.4759963768115942, + "step": 5424 + }, + { + "epoch": 1.0057471264367817, + "grad_norm": 7.15625, + "learning_rate": 8.99425287356322e-06, + "loss": 2.7145, + "mean_token_accuracy": 0.47188441949901255, + "step": 5425 + }, + { + "epoch": 1.0059325176121616, + "grad_norm": 6.8203125, + "learning_rate": 8.994067482387839e-06, + "loss": 2.8991, + "mean_token_accuracy": 0.4482009176527409, + "step": 5426 + }, + { + "epoch": 1.0061179087875418, + "grad_norm": 7.62109375, + "learning_rate": 8.99388209121246e-06, + "loss": 2.5275, + "mean_token_accuracy": 0.48938079569249177, + "step": 5427 + }, + { + "epoch": 1.0063032999629218, + "grad_norm": 6.38671875, + "learning_rate": 8.993696700037078e-06, + "loss": 3.03, + "mean_token_accuracy": 0.44650062613051344, + "step": 5428 + }, + { + "epoch": 1.0064886911383017, + "grad_norm": 6.26171875, + "learning_rate": 8.993511308861699e-06, + "loss": 2.8335, + "mean_token_accuracy": 0.4438024019941083, + "step": 5429 + }, + { + "epoch": 1.006674082313682, + "grad_norm": 8.8828125, + "learning_rate": 8.99332591768632e-06, + "loss": 2.4428, + "mean_token_accuracy": 0.4889772727272727, + "step": 5430 + }, + { + "epoch": 1.006859473489062, + "grad_norm": 9.03125, + "learning_rate": 8.993140526510938e-06, + "loss": 2.7231, + "mean_token_accuracy": 0.4471385738495633, + "step": 5431 + }, + { + "epoch": 1.0070448646644419, + "grad_norm": 6.23828125, + "learning_rate": 8.992955135335559e-06, + "loss": 3.0997, + "mean_token_accuracy": 0.43875735596003834, + "step": 5432 + }, + { + "epoch": 1.007230255839822, + "grad_norm": 9.46875, + "learning_rate": 8.992769744160179e-06, + "loss": 2.8572, + "mean_token_accuracy": 0.4532510206113711, + "step": 5433 + }, + { + "epoch": 1.007415647015202, + "grad_norm": 13.8125, + "learning_rate": 8.9925843529848e-06, + "loss": 2.4115, + "mean_token_accuracy": 0.5002764976958526, + "step": 5434 + }, + { + "epoch": 1.0076010381905822, + "grad_norm": 8.5546875, + "learning_rate": 8.992398961809418e-06, + "loss": 2.9994, + "mean_token_accuracy": 0.42680180180180183, + "step": 5435 + }, + { + "epoch": 1.0077864293659622, + "grad_norm": 5.51953125, + "learning_rate": 8.992213570634039e-06, + "loss": 2.3776, + "mean_token_accuracy": 0.49168314528323837, + "step": 5436 + }, + { + "epoch": 1.0079718205413422, + "grad_norm": 9.046875, + "learning_rate": 8.992028179458658e-06, + "loss": 3.1377, + "mean_token_accuracy": 0.4369191597708466, + "step": 5437 + }, + { + "epoch": 1.0081572117167223, + "grad_norm": 6.69140625, + "learning_rate": 8.991842788283278e-06, + "loss": 2.5676, + "mean_token_accuracy": 0.4746033862151713, + "step": 5438 + }, + { + "epoch": 1.0083426028921023, + "grad_norm": 10.0859375, + "learning_rate": 8.991657397107899e-06, + "loss": 2.5001, + "mean_token_accuracy": 0.4820772058823529, + "step": 5439 + }, + { + "epoch": 1.0085279940674823, + "grad_norm": 7.41796875, + "learning_rate": 8.991472005932519e-06, + "loss": 2.8444, + "mean_token_accuracy": 0.462759881197167, + "step": 5440 + }, + { + "epoch": 1.0087133852428625, + "grad_norm": 8.9453125, + "learning_rate": 8.991286614757138e-06, + "loss": 2.201, + "mean_token_accuracy": 0.5431593312862504, + "step": 5441 + }, + { + "epoch": 1.0088987764182424, + "grad_norm": 9.0, + "learning_rate": 8.991101223581758e-06, + "loss": 2.6431, + "mean_token_accuracy": 0.46113400596845244, + "step": 5442 + }, + { + "epoch": 1.0090841675936226, + "grad_norm": 10.2890625, + "learning_rate": 8.990915832406379e-06, + "loss": 2.4818, + "mean_token_accuracy": 0.49086651053864166, + "step": 5443 + }, + { + "epoch": 1.0092695587690026, + "grad_norm": 6.24609375, + "learning_rate": 8.990730441230998e-06, + "loss": 2.5717, + "mean_token_accuracy": 0.4621790423317141, + "step": 5444 + }, + { + "epoch": 1.0094549499443826, + "grad_norm": 7.61328125, + "learning_rate": 8.990545050055618e-06, + "loss": 3.2259, + "mean_token_accuracy": 0.41433189655172414, + "step": 5445 + }, + { + "epoch": 1.0096403411197628, + "grad_norm": 5.79296875, + "learning_rate": 8.990359658880237e-06, + "loss": 2.4473, + "mean_token_accuracy": 0.5157223992242693, + "step": 5446 + }, + { + "epoch": 1.0098257322951427, + "grad_norm": 7.03515625, + "learning_rate": 8.990174267704857e-06, + "loss": 2.4858, + "mean_token_accuracy": 0.4995558187740598, + "step": 5447 + }, + { + "epoch": 1.010011123470523, + "grad_norm": 7.03515625, + "learning_rate": 8.989988876529478e-06, + "loss": 2.5313, + "mean_token_accuracy": 0.47881164228623135, + "step": 5448 + }, + { + "epoch": 1.0101965146459029, + "grad_norm": 7.484375, + "learning_rate": 8.989803485354098e-06, + "loss": 3.1031, + "mean_token_accuracy": 0.43690065209052553, + "step": 5449 + }, + { + "epoch": 1.0103819058212828, + "grad_norm": 6.390625, + "learning_rate": 8.989618094178719e-06, + "loss": 2.9093, + "mean_token_accuracy": 0.4510751791965328, + "step": 5450 + }, + { + "epoch": 1.010567296996663, + "grad_norm": 5.6640625, + "learning_rate": 8.989432703003338e-06, + "loss": 2.5987, + "mean_token_accuracy": 0.48045700541190617, + "step": 5451 + }, + { + "epoch": 1.010752688172043, + "grad_norm": 4.95703125, + "learning_rate": 8.989247311827958e-06, + "loss": 2.5317, + "mean_token_accuracy": 0.48695762819697164, + "step": 5452 + }, + { + "epoch": 1.010938079347423, + "grad_norm": 6.5625, + "learning_rate": 8.989061920652577e-06, + "loss": 2.8766, + "mean_token_accuracy": 0.4480246567666013, + "step": 5453 + }, + { + "epoch": 1.0111234705228032, + "grad_norm": 5.0625, + "learning_rate": 8.988876529477198e-06, + "loss": 3.3495, + "mean_token_accuracy": 0.41807335956339636, + "step": 5454 + }, + { + "epoch": 1.0113088616981831, + "grad_norm": 6.4453125, + "learning_rate": 8.988691138301818e-06, + "loss": 3.1902, + "mean_token_accuracy": 0.4173813607775872, + "step": 5455 + }, + { + "epoch": 1.0114942528735633, + "grad_norm": 7.47265625, + "learning_rate": 8.988505747126437e-06, + "loss": 2.925, + "mean_token_accuracy": 0.4321966693100714, + "step": 5456 + }, + { + "epoch": 1.0116796440489433, + "grad_norm": 8.1328125, + "learning_rate": 8.988320355951057e-06, + "loss": 2.6556, + "mean_token_accuracy": 0.5039230574538092, + "step": 5457 + }, + { + "epoch": 1.0118650352243233, + "grad_norm": 5.95703125, + "learning_rate": 8.988134964775678e-06, + "loss": 2.7498, + "mean_token_accuracy": 0.46423594983743616, + "step": 5458 + }, + { + "epoch": 1.0120504263997034, + "grad_norm": 11.3203125, + "learning_rate": 8.987949573600298e-06, + "loss": 3.0391, + "mean_token_accuracy": 0.43049095607235144, + "step": 5459 + }, + { + "epoch": 1.0122358175750834, + "grad_norm": 12.609375, + "learning_rate": 8.987764182424917e-06, + "loss": 2.5737, + "mean_token_accuracy": 0.47603358486555425, + "step": 5460 + }, + { + "epoch": 1.0124212087504634, + "grad_norm": 10.1796875, + "learning_rate": 8.987578791249538e-06, + "loss": 2.8516, + "mean_token_accuracy": 0.4436400089106705, + "step": 5461 + }, + { + "epoch": 1.0126065999258436, + "grad_norm": 6.953125, + "learning_rate": 8.987393400074156e-06, + "loss": 2.8708, + "mean_token_accuracy": 0.4493557331593541, + "step": 5462 + }, + { + "epoch": 1.0127919911012235, + "grad_norm": 8.734375, + "learning_rate": 8.987208008898777e-06, + "loss": 2.5812, + "mean_token_accuracy": 0.4994443209602134, + "step": 5463 + }, + { + "epoch": 1.0129773822766037, + "grad_norm": 11.09375, + "learning_rate": 8.987022617723397e-06, + "loss": 2.8234, + "mean_token_accuracy": 0.44006776789495977, + "step": 5464 + }, + { + "epoch": 1.0131627734519837, + "grad_norm": 8.390625, + "learning_rate": 8.986837226548018e-06, + "loss": 2.7436, + "mean_token_accuracy": 0.45945366898768075, + "step": 5465 + }, + { + "epoch": 1.0133481646273637, + "grad_norm": 6.59375, + "learning_rate": 8.986651835372637e-06, + "loss": 3.2904, + "mean_token_accuracy": 0.4235717400784998, + "step": 5466 + }, + { + "epoch": 1.0135335558027438, + "grad_norm": 7.8984375, + "learning_rate": 8.986466444197257e-06, + "loss": 2.8725, + "mean_token_accuracy": 0.43998285224349815, + "step": 5467 + }, + { + "epoch": 1.0137189469781238, + "grad_norm": 14.7890625, + "learning_rate": 8.986281053021878e-06, + "loss": 2.4853, + "mean_token_accuracy": 0.4687905604719764, + "step": 5468 + }, + { + "epoch": 1.013904338153504, + "grad_norm": 8.46875, + "learning_rate": 8.986095661846496e-06, + "loss": 3.0284, + "mean_token_accuracy": 0.42964001870032725, + "step": 5469 + }, + { + "epoch": 1.014089729328884, + "grad_norm": 6.30859375, + "learning_rate": 8.985910270671117e-06, + "loss": 2.8018, + "mean_token_accuracy": 0.4569841484893646, + "step": 5470 + }, + { + "epoch": 1.014275120504264, + "grad_norm": 10.0625, + "learning_rate": 8.985724879495736e-06, + "loss": 2.4392, + "mean_token_accuracy": 0.4865923113431419, + "step": 5471 + }, + { + "epoch": 1.0144605116796441, + "grad_norm": 9.09375, + "learning_rate": 8.985539488320356e-06, + "loss": 2.8995, + "mean_token_accuracy": 0.43331284572833434, + "step": 5472 + }, + { + "epoch": 1.014645902855024, + "grad_norm": 9.4375, + "learning_rate": 8.985354097144977e-06, + "loss": 2.9138, + "mean_token_accuracy": 0.4381491973559962, + "step": 5473 + }, + { + "epoch": 1.014831294030404, + "grad_norm": 5.12109375, + "learning_rate": 8.985168705969597e-06, + "loss": 2.9833, + "mean_token_accuracy": 0.4370777510592007, + "step": 5474 + }, + { + "epoch": 1.0150166852057843, + "grad_norm": 8.34375, + "learning_rate": 8.984983314794216e-06, + "loss": 3.2008, + "mean_token_accuracy": 0.44190871369294604, + "step": 5475 + }, + { + "epoch": 1.0152020763811642, + "grad_norm": 8.3515625, + "learning_rate": 8.984797923618836e-06, + "loss": 2.7833, + "mean_token_accuracy": 0.45383867832847424, + "step": 5476 + }, + { + "epoch": 1.0153874675565444, + "grad_norm": 5.4375, + "learning_rate": 8.984612532443457e-06, + "loss": 2.7174, + "mean_token_accuracy": 0.45886567164179104, + "step": 5477 + }, + { + "epoch": 1.0155728587319244, + "grad_norm": 4.92578125, + "learning_rate": 8.984427141268076e-06, + "loss": 2.5576, + "mean_token_accuracy": 0.48444994584558254, + "step": 5478 + }, + { + "epoch": 1.0157582499073043, + "grad_norm": 7.08984375, + "learning_rate": 8.984241750092696e-06, + "loss": 2.4713, + "mean_token_accuracy": 0.49972572682391664, + "step": 5479 + }, + { + "epoch": 1.0159436410826845, + "grad_norm": 11.2265625, + "learning_rate": 8.984056358917315e-06, + "loss": 2.9527, + "mean_token_accuracy": 0.44204809930178435, + "step": 5480 + }, + { + "epoch": 1.0161290322580645, + "grad_norm": 6.44921875, + "learning_rate": 8.983870967741937e-06, + "loss": 2.9697, + "mean_token_accuracy": 0.4401859940820065, + "step": 5481 + }, + { + "epoch": 1.0163144234334445, + "grad_norm": 5.3046875, + "learning_rate": 8.983685576566556e-06, + "loss": 2.6541, + "mean_token_accuracy": 0.48264556246466933, + "step": 5482 + }, + { + "epoch": 1.0164998146088247, + "grad_norm": 9.7734375, + "learning_rate": 8.983500185391177e-06, + "loss": 2.7513, + "mean_token_accuracy": 0.445993031358885, + "step": 5483 + }, + { + "epoch": 1.0166852057842046, + "grad_norm": 9.46875, + "learning_rate": 8.983314794215795e-06, + "loss": 2.4654, + "mean_token_accuracy": 0.5058866813833701, + "step": 5484 + }, + { + "epoch": 1.0168705969595848, + "grad_norm": 7.53515625, + "learning_rate": 8.983129403040416e-06, + "loss": 2.8264, + "mean_token_accuracy": 0.44897619443982023, + "step": 5485 + }, + { + "epoch": 1.0170559881349648, + "grad_norm": 7.94921875, + "learning_rate": 8.982944011865036e-06, + "loss": 2.7339, + "mean_token_accuracy": 0.4598187311178248, + "step": 5486 + }, + { + "epoch": 1.0172413793103448, + "grad_norm": 9.7265625, + "learning_rate": 8.982758620689655e-06, + "loss": 3.0269, + "mean_token_accuracy": 0.4277370590929913, + "step": 5487 + }, + { + "epoch": 1.017426770485725, + "grad_norm": 7.63671875, + "learning_rate": 8.982573229514276e-06, + "loss": 3.2608, + "mean_token_accuracy": 0.40989825832040006, + "step": 5488 + }, + { + "epoch": 1.017612161661105, + "grad_norm": 6.21484375, + "learning_rate": 8.982387838338896e-06, + "loss": 2.3214, + "mean_token_accuracy": 0.5200439319055464, + "step": 5489 + }, + { + "epoch": 1.0177975528364849, + "grad_norm": 8.453125, + "learning_rate": 8.982202447163517e-06, + "loss": 3.4448, + "mean_token_accuracy": 0.4241750726619935, + "step": 5490 + }, + { + "epoch": 1.017982944011865, + "grad_norm": 12.75, + "learning_rate": 8.982017055988135e-06, + "loss": 2.7642, + "mean_token_accuracy": 0.4415407480802576, + "step": 5491 + }, + { + "epoch": 1.018168335187245, + "grad_norm": 10.296875, + "learning_rate": 8.981831664812756e-06, + "loss": 2.7, + "mean_token_accuracy": 0.4550252409362093, + "step": 5492 + }, + { + "epoch": 1.0183537263626252, + "grad_norm": 5.8203125, + "learning_rate": 8.981646273637376e-06, + "loss": 2.8245, + "mean_token_accuracy": 0.4846382556987116, + "step": 5493 + }, + { + "epoch": 1.0185391175380052, + "grad_norm": 6.34375, + "learning_rate": 8.981460882461995e-06, + "loss": 3.0094, + "mean_token_accuracy": 0.43735117360244924, + "step": 5494 + }, + { + "epoch": 1.0187245087133852, + "grad_norm": 8.6640625, + "learning_rate": 8.981275491286616e-06, + "loss": 2.909, + "mean_token_accuracy": 0.4720408742548964, + "step": 5495 + }, + { + "epoch": 1.0189098998887653, + "grad_norm": 9.765625, + "learning_rate": 8.981090100111234e-06, + "loss": 2.7555, + "mean_token_accuracy": 0.4686874081986914, + "step": 5496 + }, + { + "epoch": 1.0190952910641453, + "grad_norm": 4.96875, + "learning_rate": 8.980904708935857e-06, + "loss": 2.4565, + "mean_token_accuracy": 0.4967860422405877, + "step": 5497 + }, + { + "epoch": 1.0192806822395255, + "grad_norm": 5.47265625, + "learning_rate": 8.980719317760475e-06, + "loss": 2.6386, + "mean_token_accuracy": 0.46288274831964155, + "step": 5498 + }, + { + "epoch": 1.0194660734149055, + "grad_norm": 6.61328125, + "learning_rate": 8.980533926585096e-06, + "loss": 2.3363, + "mean_token_accuracy": 0.5300979934671022, + "step": 5499 + }, + { + "epoch": 1.0196514645902854, + "grad_norm": 6.96875, + "learning_rate": 8.980348535409715e-06, + "loss": 3.0539, + "mean_token_accuracy": 0.42207792207792205, + "step": 5500 + }, + { + "epoch": 1.0198368557656656, + "grad_norm": 5.1171875, + "learning_rate": 8.980163144234335e-06, + "loss": 2.0198, + "mean_token_accuracy": 0.5498520373321193, + "step": 5501 + }, + { + "epoch": 1.0200222469410456, + "grad_norm": 6.4765625, + "learning_rate": 8.979977753058956e-06, + "loss": 2.5637, + "mean_token_accuracy": 0.47623842126459925, + "step": 5502 + }, + { + "epoch": 1.0202076381164256, + "grad_norm": 9.3359375, + "learning_rate": 8.979792361883574e-06, + "loss": 2.3095, + "mean_token_accuracy": 0.4949611287071696, + "step": 5503 + }, + { + "epoch": 1.0203930292918058, + "grad_norm": 6.01171875, + "learning_rate": 8.979606970708195e-06, + "loss": 2.6835, + "mean_token_accuracy": 0.4731083575006591, + "step": 5504 + }, + { + "epoch": 1.0205784204671857, + "grad_norm": 9.2734375, + "learning_rate": 8.979421579532815e-06, + "loss": 2.5066, + "mean_token_accuracy": 0.47296322999279017, + "step": 5505 + }, + { + "epoch": 1.020763811642566, + "grad_norm": 10.765625, + "learning_rate": 8.979236188357436e-06, + "loss": 2.2761, + "mean_token_accuracy": 0.5217729393468118, + "step": 5506 + }, + { + "epoch": 1.0209492028179459, + "grad_norm": 10.609375, + "learning_rate": 8.979050797182055e-06, + "loss": 2.862, + "mean_token_accuracy": 0.44781923279033103, + "step": 5507 + }, + { + "epoch": 1.0211345939933258, + "grad_norm": 7.69140625, + "learning_rate": 8.978865406006675e-06, + "loss": 3.0876, + "mean_token_accuracy": 0.4302705223880597, + "step": 5508 + }, + { + "epoch": 1.021319985168706, + "grad_norm": 7.9140625, + "learning_rate": 8.978680014831294e-06, + "loss": 2.5951, + "mean_token_accuracy": 0.48896648044692737, + "step": 5509 + }, + { + "epoch": 1.021505376344086, + "grad_norm": 8.4609375, + "learning_rate": 8.978494623655915e-06, + "loss": 2.8644, + "mean_token_accuracy": 0.4558106169296987, + "step": 5510 + }, + { + "epoch": 1.021690767519466, + "grad_norm": 6.44921875, + "learning_rate": 8.978309232480535e-06, + "loss": 3.4437, + "mean_token_accuracy": 0.4086298042577511, + "step": 5511 + }, + { + "epoch": 1.0218761586948462, + "grad_norm": 8.9375, + "learning_rate": 8.978123841305154e-06, + "loss": 2.1186, + "mean_token_accuracy": 0.5306296431616124, + "step": 5512 + }, + { + "epoch": 1.0220615498702261, + "grad_norm": 5.78515625, + "learning_rate": 8.977938450129774e-06, + "loss": 2.9658, + "mean_token_accuracy": 0.43958705072648485, + "step": 5513 + }, + { + "epoch": 1.0222469410456063, + "grad_norm": 6.1171875, + "learning_rate": 8.977753058954395e-06, + "loss": 2.9204, + "mean_token_accuracy": 0.4574953070528292, + "step": 5514 + }, + { + "epoch": 1.0224323322209863, + "grad_norm": 5.12109375, + "learning_rate": 8.977567667779015e-06, + "loss": 3.096, + "mean_token_accuracy": 0.45022551546391754, + "step": 5515 + }, + { + "epoch": 1.0226177233963663, + "grad_norm": 4.8984375, + "learning_rate": 8.977382276603634e-06, + "loss": 2.3322, + "mean_token_accuracy": 0.5193633952254642, + "step": 5516 + }, + { + "epoch": 1.0228031145717464, + "grad_norm": 8.234375, + "learning_rate": 8.977196885428255e-06, + "loss": 2.5278, + "mean_token_accuracy": 0.4796555639666919, + "step": 5517 + }, + { + "epoch": 1.0229885057471264, + "grad_norm": 6.3828125, + "learning_rate": 8.977011494252873e-06, + "loss": 3.1843, + "mean_token_accuracy": 0.4291914483202058, + "step": 5518 + }, + { + "epoch": 1.0231738969225064, + "grad_norm": 8.1484375, + "learning_rate": 8.976826103077494e-06, + "loss": 2.7885, + "mean_token_accuracy": 0.4369158878504673, + "step": 5519 + }, + { + "epoch": 1.0233592880978866, + "grad_norm": 5.44140625, + "learning_rate": 8.976640711902114e-06, + "loss": 2.662, + "mean_token_accuracy": 0.47323369565217394, + "step": 5520 + }, + { + "epoch": 1.0235446792732665, + "grad_norm": 6.91015625, + "learning_rate": 8.976455320726735e-06, + "loss": 2.5861, + "mean_token_accuracy": 0.4894597235795058, + "step": 5521 + }, + { + "epoch": 1.0237300704486467, + "grad_norm": 7.33984375, + "learning_rate": 8.976269929551354e-06, + "loss": 2.4696, + "mean_token_accuracy": 0.48795108500179835, + "step": 5522 + }, + { + "epoch": 1.0239154616240267, + "grad_norm": 5.7734375, + "learning_rate": 8.976084538375974e-06, + "loss": 2.5893, + "mean_token_accuracy": 0.47368421052631576, + "step": 5523 + }, + { + "epoch": 1.0241008527994067, + "grad_norm": 5.875, + "learning_rate": 8.975899147200595e-06, + "loss": 3.0833, + "mean_token_accuracy": 0.44462613349189833, + "step": 5524 + }, + { + "epoch": 1.0242862439747868, + "grad_norm": 7.36328125, + "learning_rate": 8.975713756025213e-06, + "loss": 2.3263, + "mean_token_accuracy": 0.5102139489906926, + "step": 5525 + }, + { + "epoch": 1.0244716351501668, + "grad_norm": 7.55078125, + "learning_rate": 8.975528364849834e-06, + "loss": 2.7095, + "mean_token_accuracy": 0.4616011961397309, + "step": 5526 + }, + { + "epoch": 1.024657026325547, + "grad_norm": 5.54296875, + "learning_rate": 8.975342973674453e-06, + "loss": 3.4361, + "mean_token_accuracy": 0.4043360433604336, + "step": 5527 + }, + { + "epoch": 1.024842417500927, + "grad_norm": 8.6015625, + "learning_rate": 8.975157582499073e-06, + "loss": 2.9329, + "mean_token_accuracy": 0.4526143790849673, + "step": 5528 + }, + { + "epoch": 1.025027808676307, + "grad_norm": 12.765625, + "learning_rate": 8.974972191323694e-06, + "loss": 3.8307, + "mean_token_accuracy": 0.3935156540032174, + "step": 5529 + }, + { + "epoch": 1.0252131998516871, + "grad_norm": 7.9375, + "learning_rate": 8.974786800148314e-06, + "loss": 2.0652, + "mean_token_accuracy": 0.5345549738219896, + "step": 5530 + }, + { + "epoch": 1.025398591027067, + "grad_norm": 7.703125, + "learning_rate": 8.974601408972935e-06, + "loss": 2.4594, + "mean_token_accuracy": 0.4909354604786077, + "step": 5531 + }, + { + "epoch": 1.025583982202447, + "grad_norm": 5.9921875, + "learning_rate": 8.974416017797553e-06, + "loss": 2.5402, + "mean_token_accuracy": 0.4727535765954027, + "step": 5532 + }, + { + "epoch": 1.0257693733778273, + "grad_norm": 12.46875, + "learning_rate": 8.974230626622174e-06, + "loss": 2.8028, + "mean_token_accuracy": 0.43747587803936705, + "step": 5533 + }, + { + "epoch": 1.0259547645532072, + "grad_norm": 6.59375, + "learning_rate": 8.974045235446793e-06, + "loss": 2.6011, + "mean_token_accuracy": 0.48085558969510755, + "step": 5534 + }, + { + "epoch": 1.0261401557285874, + "grad_norm": 7.1015625, + "learning_rate": 8.973859844271413e-06, + "loss": 2.6265, + "mean_token_accuracy": 0.48827208756841284, + "step": 5535 + }, + { + "epoch": 1.0263255469039674, + "grad_norm": 8.65625, + "learning_rate": 8.973674453096032e-06, + "loss": 2.4597, + "mean_token_accuracy": 0.47446344542637436, + "step": 5536 + }, + { + "epoch": 1.0265109380793473, + "grad_norm": 7.9921875, + "learning_rate": 8.973489061920654e-06, + "loss": 2.743, + "mean_token_accuracy": 0.45961566247652075, + "step": 5537 + }, + { + "epoch": 1.0266963292547275, + "grad_norm": 7.1328125, + "learning_rate": 8.973303670745273e-06, + "loss": 2.798, + "mean_token_accuracy": 0.46198246198246196, + "step": 5538 + }, + { + "epoch": 1.0268817204301075, + "grad_norm": 7.21484375, + "learning_rate": 8.973118279569894e-06, + "loss": 2.8432, + "mean_token_accuracy": 0.45358649789029537, + "step": 5539 + }, + { + "epoch": 1.0270671116054875, + "grad_norm": 6.29296875, + "learning_rate": 8.972932888394514e-06, + "loss": 2.3655, + "mean_token_accuracy": 0.5073823651166, + "step": 5540 + }, + { + "epoch": 1.0272525027808677, + "grad_norm": 7.0625, + "learning_rate": 8.972747497219133e-06, + "loss": 2.3819, + "mean_token_accuracy": 0.5251381942616478, + "step": 5541 + }, + { + "epoch": 1.0274378939562476, + "grad_norm": 7.81640625, + "learning_rate": 8.972562106043753e-06, + "loss": 2.521, + "mean_token_accuracy": 0.4916988856038208, + "step": 5542 + }, + { + "epoch": 1.0276232851316278, + "grad_norm": 7.515625, + "learning_rate": 8.972376714868372e-06, + "loss": 2.759, + "mean_token_accuracy": 0.46452815120457874, + "step": 5543 + }, + { + "epoch": 1.0278086763070078, + "grad_norm": 5.21875, + "learning_rate": 8.972191323692993e-06, + "loss": 2.1372, + "mean_token_accuracy": 0.5526788003076134, + "step": 5544 + }, + { + "epoch": 1.0279940674823878, + "grad_norm": 6.58203125, + "learning_rate": 8.972005932517613e-06, + "loss": 2.6057, + "mean_token_accuracy": 0.4781368821292776, + "step": 5545 + }, + { + "epoch": 1.028179458657768, + "grad_norm": 5.796875, + "learning_rate": 8.971820541342234e-06, + "loss": 3.1485, + "mean_token_accuracy": 0.41814420803782504, + "step": 5546 + }, + { + "epoch": 1.028364849833148, + "grad_norm": 5.76953125, + "learning_rate": 8.971635150166852e-06, + "loss": 3.0989, + "mean_token_accuracy": 0.4389733421458471, + "step": 5547 + }, + { + "epoch": 1.028550241008528, + "grad_norm": 6.89453125, + "learning_rate": 8.971449758991473e-06, + "loss": 2.2856, + "mean_token_accuracy": 0.4929492587682295, + "step": 5548 + }, + { + "epoch": 1.028735632183908, + "grad_norm": 6.0390625, + "learning_rate": 8.971264367816093e-06, + "loss": 2.8145, + "mean_token_accuracy": 0.4703251541157984, + "step": 5549 + }, + { + "epoch": 1.028921023359288, + "grad_norm": 5.796875, + "learning_rate": 8.971078976640712e-06, + "loss": 3.0712, + "mean_token_accuracy": 0.4100673700266938, + "step": 5550 + }, + { + "epoch": 1.0291064145346682, + "grad_norm": 6.421875, + "learning_rate": 8.970893585465333e-06, + "loss": 2.6634, + "mean_token_accuracy": 0.4408254599701641, + "step": 5551 + }, + { + "epoch": 1.0292918057100482, + "grad_norm": 8.65625, + "learning_rate": 8.970708194289951e-06, + "loss": 2.551, + "mean_token_accuracy": 0.4813048933500627, + "step": 5552 + }, + { + "epoch": 1.0294771968854282, + "grad_norm": 5.99609375, + "learning_rate": 8.970522803114574e-06, + "loss": 2.4918, + "mean_token_accuracy": 0.48578924355050285, + "step": 5553 + }, + { + "epoch": 1.0296625880608083, + "grad_norm": 7.1015625, + "learning_rate": 8.970337411939192e-06, + "loss": 2.2307, + "mean_token_accuracy": 0.5139676322028657, + "step": 5554 + }, + { + "epoch": 1.0298479792361883, + "grad_norm": 6.171875, + "learning_rate": 8.970152020763813e-06, + "loss": 2.502, + "mean_token_accuracy": 0.4964777947932619, + "step": 5555 + }, + { + "epoch": 1.0300333704115685, + "grad_norm": 7.49609375, + "learning_rate": 8.969966629588432e-06, + "loss": 2.998, + "mean_token_accuracy": 0.438280725319006, + "step": 5556 + }, + { + "epoch": 1.0302187615869485, + "grad_norm": 6.109375, + "learning_rate": 8.969781238413052e-06, + "loss": 2.5377, + "mean_token_accuracy": 0.4846938775510204, + "step": 5557 + }, + { + "epoch": 1.0304041527623284, + "grad_norm": 7.23828125, + "learning_rate": 8.969595847237673e-06, + "loss": 2.7634, + "mean_token_accuracy": 0.4433853264009243, + "step": 5558 + }, + { + "epoch": 1.0305895439377086, + "grad_norm": 10.0546875, + "learning_rate": 8.969410456062292e-06, + "loss": 3.5801, + "mean_token_accuracy": 0.3630314649825195, + "step": 5559 + }, + { + "epoch": 1.0307749351130886, + "grad_norm": 6.05859375, + "learning_rate": 8.969225064886912e-06, + "loss": 2.8917, + "mean_token_accuracy": 0.4431622231065128, + "step": 5560 + }, + { + "epoch": 1.0309603262884686, + "grad_norm": 8.1015625, + "learning_rate": 8.969039673711532e-06, + "loss": 2.6158, + "mean_token_accuracy": 0.455750273822563, + "step": 5561 + }, + { + "epoch": 1.0311457174638488, + "grad_norm": 14.9921875, + "learning_rate": 8.968854282536153e-06, + "loss": 2.1778, + "mean_token_accuracy": 0.5068895126307732, + "step": 5562 + }, + { + "epoch": 1.0313311086392287, + "grad_norm": 8.1953125, + "learning_rate": 8.968668891360772e-06, + "loss": 2.7626, + "mean_token_accuracy": 0.44863096680397385, + "step": 5563 + }, + { + "epoch": 1.031516499814609, + "grad_norm": 5.46484375, + "learning_rate": 8.968483500185392e-06, + "loss": 2.6416, + "mean_token_accuracy": 0.47481092167670813, + "step": 5564 + }, + { + "epoch": 1.0317018909899889, + "grad_norm": 7.71875, + "learning_rate": 8.968298109010011e-06, + "loss": 3.14, + "mean_token_accuracy": 0.42704449415858814, + "step": 5565 + }, + { + "epoch": 1.0318872821653688, + "grad_norm": 9.4609375, + "learning_rate": 8.968112717834632e-06, + "loss": 2.5653, + "mean_token_accuracy": 0.476685189659338, + "step": 5566 + }, + { + "epoch": 1.032072673340749, + "grad_norm": 8.9921875, + "learning_rate": 8.967927326659252e-06, + "loss": 2.7088, + "mean_token_accuracy": 0.47718416090509114, + "step": 5567 + }, + { + "epoch": 1.032258064516129, + "grad_norm": 7.390625, + "learning_rate": 8.967741935483871e-06, + "loss": 2.8832, + "mean_token_accuracy": 0.45079212674027846, + "step": 5568 + }, + { + "epoch": 1.0324434556915092, + "grad_norm": 6.828125, + "learning_rate": 8.967556544308491e-06, + "loss": 2.416, + "mean_token_accuracy": 0.5176904176904177, + "step": 5569 + }, + { + "epoch": 1.0326288468668892, + "grad_norm": 8.671875, + "learning_rate": 8.967371153133112e-06, + "loss": 2.6533, + "mean_token_accuracy": 0.4761355443403028, + "step": 5570 + }, + { + "epoch": 1.0328142380422691, + "grad_norm": 7.6953125, + "learning_rate": 8.967185761957732e-06, + "loss": 2.9078, + "mean_token_accuracy": 0.4454123112659698, + "step": 5571 + }, + { + "epoch": 1.0329996292176493, + "grad_norm": 7.77734375, + "learning_rate": 8.967000370782351e-06, + "loss": 2.7456, + "mean_token_accuracy": 0.4697633654688869, + "step": 5572 + }, + { + "epoch": 1.0331850203930293, + "grad_norm": 11.2109375, + "learning_rate": 8.966814979606972e-06, + "loss": 2.4755, + "mean_token_accuracy": 0.48849441157133466, + "step": 5573 + }, + { + "epoch": 1.0333704115684093, + "grad_norm": 8.078125, + "learning_rate": 8.966629588431592e-06, + "loss": 2.6445, + "mean_token_accuracy": 0.4667390714091773, + "step": 5574 + }, + { + "epoch": 1.0335558027437894, + "grad_norm": 8.8359375, + "learning_rate": 8.966444197256211e-06, + "loss": 2.5863, + "mean_token_accuracy": 0.4709214938143138, + "step": 5575 + }, + { + "epoch": 1.0337411939191694, + "grad_norm": 10.140625, + "learning_rate": 8.966258806080831e-06, + "loss": 2.7975, + "mean_token_accuracy": 0.44244946492271103, + "step": 5576 + }, + { + "epoch": 1.0339265850945496, + "grad_norm": 10.25, + "learning_rate": 8.96607341490545e-06, + "loss": 2.6415, + "mean_token_accuracy": 0.4659902292371289, + "step": 5577 + }, + { + "epoch": 1.0341119762699296, + "grad_norm": 6.66015625, + "learning_rate": 8.965888023730072e-06, + "loss": 2.8957, + "mean_token_accuracy": 0.4733075874602453, + "step": 5578 + }, + { + "epoch": 1.0342973674453095, + "grad_norm": 6.796875, + "learning_rate": 8.965702632554691e-06, + "loss": 2.4616, + "mean_token_accuracy": 0.5248847193512483, + "step": 5579 + }, + { + "epoch": 1.0344827586206897, + "grad_norm": 5.80078125, + "learning_rate": 8.965517241379312e-06, + "loss": 2.8631, + "mean_token_accuracy": 0.44561495917829863, + "step": 5580 + }, + { + "epoch": 1.0346681497960697, + "grad_norm": 8.5390625, + "learning_rate": 8.96533185020393e-06, + "loss": 2.324, + "mean_token_accuracy": 0.49885233358837033, + "step": 5581 + }, + { + "epoch": 1.0348535409714497, + "grad_norm": 7.13671875, + "learning_rate": 8.965146459028551e-06, + "loss": 2.4044, + "mean_token_accuracy": 0.49252316764953663, + "step": 5582 + }, + { + "epoch": 1.0350389321468298, + "grad_norm": 5.15625, + "learning_rate": 8.964961067853171e-06, + "loss": 2.5914, + "mean_token_accuracy": 0.48448197170241897, + "step": 5583 + }, + { + "epoch": 1.0352243233222098, + "grad_norm": 7.0625, + "learning_rate": 8.96477567667779e-06, + "loss": 3.13, + "mean_token_accuracy": 0.4157202630005977, + "step": 5584 + }, + { + "epoch": 1.03540971449759, + "grad_norm": 11.203125, + "learning_rate": 8.96459028550241e-06, + "loss": 2.39, + "mean_token_accuracy": 0.519563090968373, + "step": 5585 + }, + { + "epoch": 1.03559510567297, + "grad_norm": 7.05078125, + "learning_rate": 8.964404894327031e-06, + "loss": 2.9875, + "mean_token_accuracy": 0.4204845814977974, + "step": 5586 + }, + { + "epoch": 1.03578049684835, + "grad_norm": 8.375, + "learning_rate": 8.964219503151652e-06, + "loss": 2.4857, + "mean_token_accuracy": 0.47370133391172325, + "step": 5587 + }, + { + "epoch": 1.0359658880237301, + "grad_norm": 6.05859375, + "learning_rate": 8.96403411197627e-06, + "loss": 2.5342, + "mean_token_accuracy": 0.47273974168476396, + "step": 5588 + }, + { + "epoch": 1.03615127919911, + "grad_norm": 8.203125, + "learning_rate": 8.963848720800891e-06, + "loss": 2.5891, + "mean_token_accuracy": 0.4813193228254524, + "step": 5589 + }, + { + "epoch": 1.03633667037449, + "grad_norm": 6.4140625, + "learning_rate": 8.96366332962551e-06, + "loss": 2.5965, + "mean_token_accuracy": 0.46768328445747803, + "step": 5590 + }, + { + "epoch": 1.0365220615498703, + "grad_norm": 7.83203125, + "learning_rate": 8.96347793845013e-06, + "loss": 2.7904, + "mean_token_accuracy": 0.4707792207792208, + "step": 5591 + }, + { + "epoch": 1.0367074527252502, + "grad_norm": 7.23828125, + "learning_rate": 8.96329254727475e-06, + "loss": 2.4898, + "mean_token_accuracy": 0.4925648273319405, + "step": 5592 + }, + { + "epoch": 1.0368928439006304, + "grad_norm": 8.3515625, + "learning_rate": 8.96310715609937e-06, + "loss": 3.3168, + "mean_token_accuracy": 0.4000669194735668, + "step": 5593 + }, + { + "epoch": 1.0370782350760104, + "grad_norm": 7.8515625, + "learning_rate": 8.96292176492399e-06, + "loss": 2.842, + "mean_token_accuracy": 0.4589775734782003, + "step": 5594 + }, + { + "epoch": 1.0372636262513903, + "grad_norm": 6.60546875, + "learning_rate": 8.96273637374861e-06, + "loss": 3.0455, + "mean_token_accuracy": 0.4381054897739505, + "step": 5595 + }, + { + "epoch": 1.0374490174267705, + "grad_norm": 5.984375, + "learning_rate": 8.962550982573231e-06, + "loss": 2.3983, + "mean_token_accuracy": 0.4983991462113127, + "step": 5596 + }, + { + "epoch": 1.0376344086021505, + "grad_norm": 10.8125, + "learning_rate": 8.96236559139785e-06, + "loss": 2.5216, + "mean_token_accuracy": 0.48427754677754675, + "step": 5597 + }, + { + "epoch": 1.0378197997775307, + "grad_norm": 7.453125, + "learning_rate": 8.96218020022247e-06, + "loss": 3.0371, + "mean_token_accuracy": 0.4277363729358469, + "step": 5598 + }, + { + "epoch": 1.0380051909529107, + "grad_norm": 6.44140625, + "learning_rate": 8.961994809047089e-06, + "loss": 3.1914, + "mean_token_accuracy": 0.4148964418481147, + "step": 5599 + }, + { + "epoch": 1.0381905821282906, + "grad_norm": 8.515625, + "learning_rate": 8.96180941787171e-06, + "loss": 2.1297, + "mean_token_accuracy": 0.5437125748502994, + "step": 5600 + }, + { + "epoch": 1.0383759733036708, + "grad_norm": 6.203125, + "learning_rate": 8.96162402669633e-06, + "loss": 2.8923, + "mean_token_accuracy": 0.44300737338329504, + "step": 5601 + }, + { + "epoch": 1.0385613644790508, + "grad_norm": 6.51953125, + "learning_rate": 8.96143863552095e-06, + "loss": 2.8181, + "mean_token_accuracy": 0.4305235903337169, + "step": 5602 + }, + { + "epoch": 1.0387467556544308, + "grad_norm": 5.9296875, + "learning_rate": 8.96125324434557e-06, + "loss": 3.0045, + "mean_token_accuracy": 0.4406193378356119, + "step": 5603 + }, + { + "epoch": 1.038932146829811, + "grad_norm": 6.46875, + "learning_rate": 8.96106785317019e-06, + "loss": 2.9437, + "mean_token_accuracy": 0.4326349382544748, + "step": 5604 + }, + { + "epoch": 1.039117538005191, + "grad_norm": 6.078125, + "learning_rate": 8.96088246199481e-06, + "loss": 2.7796, + "mean_token_accuracy": 0.47644927536231885, + "step": 5605 + }, + { + "epoch": 1.039302929180571, + "grad_norm": 6.2109375, + "learning_rate": 8.96069707081943e-06, + "loss": 2.9562, + "mean_token_accuracy": 0.4398704902867715, + "step": 5606 + }, + { + "epoch": 1.039488320355951, + "grad_norm": 6.40625, + "learning_rate": 8.96051167964405e-06, + "loss": 2.75, + "mean_token_accuracy": 0.46301564722617355, + "step": 5607 + }, + { + "epoch": 1.039673711531331, + "grad_norm": 6.546875, + "learning_rate": 8.960326288468668e-06, + "loss": 2.8534, + "mean_token_accuracy": 0.45798367628463044, + "step": 5608 + }, + { + "epoch": 1.0398591027067112, + "grad_norm": 8.078125, + "learning_rate": 8.960140897293289e-06, + "loss": 2.9492, + "mean_token_accuracy": 0.42568318151521883, + "step": 5609 + }, + { + "epoch": 1.0400444938820912, + "grad_norm": 7.7421875, + "learning_rate": 8.95995550611791e-06, + "loss": 2.7774, + "mean_token_accuracy": 0.4832951945080092, + "step": 5610 + }, + { + "epoch": 1.0402298850574712, + "grad_norm": 8.8515625, + "learning_rate": 8.95977011494253e-06, + "loss": 2.547, + "mean_token_accuracy": 0.49680453394429036, + "step": 5611 + }, + { + "epoch": 1.0404152762328513, + "grad_norm": 7.1484375, + "learning_rate": 8.95958472376715e-06, + "loss": 2.7894, + "mean_token_accuracy": 0.4746865389164892, + "step": 5612 + }, + { + "epoch": 1.0406006674082313, + "grad_norm": 5.83984375, + "learning_rate": 8.95939933259177e-06, + "loss": 3.0426, + "mean_token_accuracy": 0.4395138496325608, + "step": 5613 + }, + { + "epoch": 1.0407860585836115, + "grad_norm": 6.51171875, + "learning_rate": 8.95921394141639e-06, + "loss": 3.11, + "mean_token_accuracy": 0.41501605995717344, + "step": 5614 + }, + { + "epoch": 1.0409714497589915, + "grad_norm": 5.80078125, + "learning_rate": 8.959028550241009e-06, + "loss": 2.5915, + "mean_token_accuracy": 0.46768828700403897, + "step": 5615 + }, + { + "epoch": 1.0411568409343714, + "grad_norm": 5.40234375, + "learning_rate": 8.958843159065629e-06, + "loss": 2.5451, + "mean_token_accuracy": 0.47551766138855056, + "step": 5616 + }, + { + "epoch": 1.0413422321097516, + "grad_norm": 5.0703125, + "learning_rate": 8.958657767890248e-06, + "loss": 2.9057, + "mean_token_accuracy": 0.4242699545749513, + "step": 5617 + }, + { + "epoch": 1.0415276232851316, + "grad_norm": 6.63671875, + "learning_rate": 8.95847237671487e-06, + "loss": 2.6428, + "mean_token_accuracy": 0.4813992951311839, + "step": 5618 + }, + { + "epoch": 1.0417130144605116, + "grad_norm": 6.84765625, + "learning_rate": 8.958286985539489e-06, + "loss": 2.5617, + "mean_token_accuracy": 0.4784663156473606, + "step": 5619 + }, + { + "epoch": 1.0418984056358918, + "grad_norm": 6.1328125, + "learning_rate": 8.95810159436411e-06, + "loss": 2.5549, + "mean_token_accuracy": 0.48664487771892256, + "step": 5620 + }, + { + "epoch": 1.0420837968112717, + "grad_norm": 5.6875, + "learning_rate": 8.95791620318873e-06, + "loss": 2.2521, + "mean_token_accuracy": 0.5407463006648081, + "step": 5621 + }, + { + "epoch": 1.042269187986652, + "grad_norm": 7.7265625, + "learning_rate": 8.957730812013349e-06, + "loss": 2.3945, + "mean_token_accuracy": 0.48872472783825816, + "step": 5622 + }, + { + "epoch": 1.0424545791620319, + "grad_norm": 6.984375, + "learning_rate": 8.957545420837969e-06, + "loss": 2.6269, + "mean_token_accuracy": 0.45908100819417075, + "step": 5623 + }, + { + "epoch": 1.0426399703374118, + "grad_norm": 6.3828125, + "learning_rate": 8.957360029662588e-06, + "loss": 2.3866, + "mean_token_accuracy": 0.48110140215403374, + "step": 5624 + }, + { + "epoch": 1.042825361512792, + "grad_norm": 5.6484375, + "learning_rate": 8.957174638487208e-06, + "loss": 2.9371, + "mean_token_accuracy": 0.44050818470559494, + "step": 5625 + }, + { + "epoch": 1.043010752688172, + "grad_norm": 7.140625, + "learning_rate": 8.956989247311829e-06, + "loss": 3.4247, + "mean_token_accuracy": 0.4192400413270577, + "step": 5626 + }, + { + "epoch": 1.0431961438635522, + "grad_norm": 6.5, + "learning_rate": 8.95680385613645e-06, + "loss": 2.5695, + "mean_token_accuracy": 0.4887134745880072, + "step": 5627 + }, + { + "epoch": 1.0433815350389322, + "grad_norm": 6.73046875, + "learning_rate": 8.956618464961068e-06, + "loss": 2.6947, + "mean_token_accuracy": 0.4662992306809048, + "step": 5628 + }, + { + "epoch": 1.0435669262143121, + "grad_norm": 7.796875, + "learning_rate": 8.956433073785689e-06, + "loss": 2.4385, + "mean_token_accuracy": 0.5011981566820276, + "step": 5629 + }, + { + "epoch": 1.0437523173896923, + "grad_norm": 9.0859375, + "learning_rate": 8.956247682610309e-06, + "loss": 2.8683, + "mean_token_accuracy": 0.45062549906840565, + "step": 5630 + }, + { + "epoch": 1.0439377085650723, + "grad_norm": 6.3046875, + "learning_rate": 8.956062291434928e-06, + "loss": 2.7964, + "mean_token_accuracy": 0.4394172853340804, + "step": 5631 + }, + { + "epoch": 1.0441230997404523, + "grad_norm": 6.48828125, + "learning_rate": 8.955876900259548e-06, + "loss": 2.7648, + "mean_token_accuracy": 0.4543467702768334, + "step": 5632 + }, + { + "epoch": 1.0443084909158324, + "grad_norm": 6.109375, + "learning_rate": 8.955691509084167e-06, + "loss": 2.4697, + "mean_token_accuracy": 0.4915514592933948, + "step": 5633 + }, + { + "epoch": 1.0444938820912124, + "grad_norm": 8.3359375, + "learning_rate": 8.95550611790879e-06, + "loss": 3.0593, + "mean_token_accuracy": 0.4274000252302258, + "step": 5634 + }, + { + "epoch": 1.0446792732665926, + "grad_norm": 6.4375, + "learning_rate": 8.955320726733408e-06, + "loss": 2.7248, + "mean_token_accuracy": 0.4629392656757555, + "step": 5635 + }, + { + "epoch": 1.0448646644419726, + "grad_norm": 7.14453125, + "learning_rate": 8.955135335558029e-06, + "loss": 2.6308, + "mean_token_accuracy": 0.4556735144078312, + "step": 5636 + }, + { + "epoch": 1.0450500556173525, + "grad_norm": 8.5078125, + "learning_rate": 8.954949944382647e-06, + "loss": 2.6641, + "mean_token_accuracy": 0.4824143756719398, + "step": 5637 + }, + { + "epoch": 1.0452354467927327, + "grad_norm": 6.8515625, + "learning_rate": 8.954764553207268e-06, + "loss": 2.2615, + "mean_token_accuracy": 0.5200527414569828, + "step": 5638 + }, + { + "epoch": 1.0454208379681127, + "grad_norm": 7.0078125, + "learning_rate": 8.954579162031888e-06, + "loss": 3.0472, + "mean_token_accuracy": 0.42877353357275005, + "step": 5639 + }, + { + "epoch": 1.0456062291434929, + "grad_norm": 7.25, + "learning_rate": 8.954393770856507e-06, + "loss": 3.158, + "mean_token_accuracy": 0.43897291593387266, + "step": 5640 + }, + { + "epoch": 1.0457916203188728, + "grad_norm": 7.80859375, + "learning_rate": 8.954208379681128e-06, + "loss": 3.0234, + "mean_token_accuracy": 0.43511053315994797, + "step": 5641 + }, + { + "epoch": 1.0459770114942528, + "grad_norm": 8.1875, + "learning_rate": 8.954022988505748e-06, + "loss": 2.762, + "mean_token_accuracy": 0.46043460434604344, + "step": 5642 + }, + { + "epoch": 1.046162402669633, + "grad_norm": 6.67578125, + "learning_rate": 8.953837597330369e-06, + "loss": 3.1795, + "mean_token_accuracy": 0.4272343791194305, + "step": 5643 + }, + { + "epoch": 1.046347793845013, + "grad_norm": 8.46875, + "learning_rate": 8.953652206154988e-06, + "loss": 2.6132, + "mean_token_accuracy": 0.4865735767991407, + "step": 5644 + }, + { + "epoch": 1.046533185020393, + "grad_norm": 10.921875, + "learning_rate": 8.953466814979608e-06, + "loss": 1.9001, + "mean_token_accuracy": 0.5672187567218757, + "step": 5645 + }, + { + "epoch": 1.0467185761957731, + "grad_norm": 8.0546875, + "learning_rate": 8.953281423804227e-06, + "loss": 2.8291, + "mean_token_accuracy": 0.4414668547249647, + "step": 5646 + }, + { + "epoch": 1.046903967371153, + "grad_norm": 8.4453125, + "learning_rate": 8.953096032628847e-06, + "loss": 2.5627, + "mean_token_accuracy": 0.4692361042306549, + "step": 5647 + }, + { + "epoch": 1.0470893585465333, + "grad_norm": 7.0859375, + "learning_rate": 8.952910641453468e-06, + "loss": 2.5749, + "mean_token_accuracy": 0.49135856486545615, + "step": 5648 + }, + { + "epoch": 1.0472747497219133, + "grad_norm": 5.1875, + "learning_rate": 8.952725250278087e-06, + "loss": 2.6549, + "mean_token_accuracy": 0.4620816702486951, + "step": 5649 + }, + { + "epoch": 1.0474601408972932, + "grad_norm": 6.97265625, + "learning_rate": 8.952539859102709e-06, + "loss": 2.6919, + "mean_token_accuracy": 0.4596832776405637, + "step": 5650 + }, + { + "epoch": 1.0476455320726734, + "grad_norm": 8.53125, + "learning_rate": 8.952354467927328e-06, + "loss": 3.01, + "mean_token_accuracy": 0.44061566735584656, + "step": 5651 + }, + { + "epoch": 1.0478309232480534, + "grad_norm": 7.37890625, + "learning_rate": 8.952169076751948e-06, + "loss": 2.2429, + "mean_token_accuracy": 0.538016628509459, + "step": 5652 + }, + { + "epoch": 1.0480163144234333, + "grad_norm": 5.87109375, + "learning_rate": 8.951983685576567e-06, + "loss": 2.6003, + "mean_token_accuracy": 0.457256046705588, + "step": 5653 + }, + { + "epoch": 1.0482017055988135, + "grad_norm": 8.328125, + "learning_rate": 8.951798294401187e-06, + "loss": 2.6015, + "mean_token_accuracy": 0.48035298035298035, + "step": 5654 + }, + { + "epoch": 1.0483870967741935, + "grad_norm": 6.3828125, + "learning_rate": 8.951612903225806e-06, + "loss": 2.9956, + "mean_token_accuracy": 0.4323971260613978, + "step": 5655 + }, + { + "epoch": 1.0485724879495737, + "grad_norm": 7.9609375, + "learning_rate": 8.951427512050427e-06, + "loss": 2.7272, + "mean_token_accuracy": 0.4562591714640479, + "step": 5656 + }, + { + "epoch": 1.0487578791249537, + "grad_norm": 7.01953125, + "learning_rate": 8.951242120875047e-06, + "loss": 2.9276, + "mean_token_accuracy": 0.44190900236538194, + "step": 5657 + }, + { + "epoch": 1.0489432703003336, + "grad_norm": 6.30859375, + "learning_rate": 8.951056729699668e-06, + "loss": 3.2293, + "mean_token_accuracy": 0.42861804535813347, + "step": 5658 + }, + { + "epoch": 1.0491286614757138, + "grad_norm": 8.046875, + "learning_rate": 8.950871338524288e-06, + "loss": 2.7459, + "mean_token_accuracy": 0.47488055595772405, + "step": 5659 + }, + { + "epoch": 1.0493140526510938, + "grad_norm": 9.9140625, + "learning_rate": 8.950685947348907e-06, + "loss": 2.8, + "mean_token_accuracy": 0.4492506396978189, + "step": 5660 + }, + { + "epoch": 1.0494994438264738, + "grad_norm": 6.703125, + "learning_rate": 8.950500556173527e-06, + "loss": 2.8064, + "mean_token_accuracy": 0.4726089785296031, + "step": 5661 + }, + { + "epoch": 1.049684835001854, + "grad_norm": 6.4140625, + "learning_rate": 8.950315164998146e-06, + "loss": 2.214, + "mean_token_accuracy": 0.5288023679417122, + "step": 5662 + }, + { + "epoch": 1.049870226177234, + "grad_norm": 9.4296875, + "learning_rate": 8.950129773822767e-06, + "loss": 2.7581, + "mean_token_accuracy": 0.4600056529112493, + "step": 5663 + }, + { + "epoch": 1.050055617352614, + "grad_norm": 15.109375, + "learning_rate": 8.949944382647387e-06, + "loss": 2.9905, + "mean_token_accuracy": 0.42480156684877846, + "step": 5664 + }, + { + "epoch": 1.050241008527994, + "grad_norm": 5.92578125, + "learning_rate": 8.949758991472006e-06, + "loss": 2.5476, + "mean_token_accuracy": 0.46461967899511514, + "step": 5665 + }, + { + "epoch": 1.050426399703374, + "grad_norm": 6.90625, + "learning_rate": 8.949573600296626e-06, + "loss": 2.6666, + "mean_token_accuracy": 0.4581861012956419, + "step": 5666 + }, + { + "epoch": 1.0506117908787542, + "grad_norm": 5.37109375, + "learning_rate": 8.949388209121247e-06, + "loss": 2.9847, + "mean_token_accuracy": 0.4569648189017266, + "step": 5667 + }, + { + "epoch": 1.0507971820541342, + "grad_norm": 4.98828125, + "learning_rate": 8.949202817945867e-06, + "loss": 2.6743, + "mean_token_accuracy": 0.4740740740740741, + "step": 5668 + }, + { + "epoch": 1.0509825732295144, + "grad_norm": 5.90625, + "learning_rate": 8.949017426770486e-06, + "loss": 3.0292, + "mean_token_accuracy": 0.4361972547025928, + "step": 5669 + }, + { + "epoch": 1.0511679644048944, + "grad_norm": 12.640625, + "learning_rate": 8.948832035595107e-06, + "loss": 2.4846, + "mean_token_accuracy": 0.47592931139549055, + "step": 5670 + }, + { + "epoch": 1.0513533555802743, + "grad_norm": 7.90625, + "learning_rate": 8.948646644419726e-06, + "loss": 3.2244, + "mean_token_accuracy": 0.42754798594214655, + "step": 5671 + }, + { + "epoch": 1.0515387467556545, + "grad_norm": 6.90234375, + "learning_rate": 8.948461253244346e-06, + "loss": 2.9034, + "mean_token_accuracy": 0.43752549286199865, + "step": 5672 + }, + { + "epoch": 1.0517241379310345, + "grad_norm": 7.75390625, + "learning_rate": 8.948275862068967e-06, + "loss": 3.1433, + "mean_token_accuracy": 0.41396718552797646, + "step": 5673 + }, + { + "epoch": 1.0519095291064144, + "grad_norm": 5.48828125, + "learning_rate": 8.948090470893587e-06, + "loss": 2.8565, + "mean_token_accuracy": 0.4496324530356657, + "step": 5674 + }, + { + "epoch": 1.0520949202817946, + "grad_norm": 5.62890625, + "learning_rate": 8.947905079718206e-06, + "loss": 3.0286, + "mean_token_accuracy": 0.4350290697674419, + "step": 5675 + }, + { + "epoch": 1.0522803114571746, + "grad_norm": 5.78515625, + "learning_rate": 8.947719688542826e-06, + "loss": 2.86, + "mean_token_accuracy": 0.4517029592406477, + "step": 5676 + }, + { + "epoch": 1.0524657026325548, + "grad_norm": 5.84765625, + "learning_rate": 8.947534297367447e-06, + "loss": 2.8077, + "mean_token_accuracy": 0.4557408985754292, + "step": 5677 + }, + { + "epoch": 1.0526510938079348, + "grad_norm": 5.390625, + "learning_rate": 8.947348906192066e-06, + "loss": 2.9929, + "mean_token_accuracy": 0.43848199178148417, + "step": 5678 + }, + { + "epoch": 1.0528364849833147, + "grad_norm": 5.23046875, + "learning_rate": 8.947163515016686e-06, + "loss": 2.6964, + "mean_token_accuracy": 0.47115795470730665, + "step": 5679 + }, + { + "epoch": 1.053021876158695, + "grad_norm": 5.96875, + "learning_rate": 8.946978123841305e-06, + "loss": 3.4283, + "mean_token_accuracy": 0.4009882643607165, + "step": 5680 + }, + { + "epoch": 1.0532072673340749, + "grad_norm": 6.32421875, + "learning_rate": 8.946792732665925e-06, + "loss": 2.8485, + "mean_token_accuracy": 0.45848327335732375, + "step": 5681 + }, + { + "epoch": 1.0533926585094548, + "grad_norm": 8.75, + "learning_rate": 8.946607341490546e-06, + "loss": 2.9152, + "mean_token_accuracy": 0.43754611573732477, + "step": 5682 + }, + { + "epoch": 1.053578049684835, + "grad_norm": 6.27734375, + "learning_rate": 8.946421950315166e-06, + "loss": 2.5218, + "mean_token_accuracy": 0.4813002826701457, + "step": 5683 + }, + { + "epoch": 1.053763440860215, + "grad_norm": 6.6875, + "learning_rate": 8.946236559139785e-06, + "loss": 3.5092, + "mean_token_accuracy": 0.42909504550050553, + "step": 5684 + }, + { + "epoch": 1.0539488320355952, + "grad_norm": 5.65234375, + "learning_rate": 8.946051167964406e-06, + "loss": 2.9554, + "mean_token_accuracy": 0.4472736306048938, + "step": 5685 + }, + { + "epoch": 1.0541342232109752, + "grad_norm": 6.9921875, + "learning_rate": 8.945865776789026e-06, + "loss": 2.8285, + "mean_token_accuracy": 0.45864338866628507, + "step": 5686 + }, + { + "epoch": 1.0543196143863551, + "grad_norm": 6.125, + "learning_rate": 8.945680385613645e-06, + "loss": 2.3288, + "mean_token_accuracy": 0.4971605575632421, + "step": 5687 + }, + { + "epoch": 1.0545050055617353, + "grad_norm": 5.92578125, + "learning_rate": 8.945494994438265e-06, + "loss": 2.7666, + "mean_token_accuracy": 0.46943231441048033, + "step": 5688 + }, + { + "epoch": 1.0546903967371153, + "grad_norm": 7.20703125, + "learning_rate": 8.945309603262884e-06, + "loss": 2.9318, + "mean_token_accuracy": 0.42982782113216894, + "step": 5689 + }, + { + "epoch": 1.0548757879124953, + "grad_norm": 7.2578125, + "learning_rate": 8.945124212087506e-06, + "loss": 3.0356, + "mean_token_accuracy": 0.4259826129545733, + "step": 5690 + }, + { + "epoch": 1.0550611790878754, + "grad_norm": 6.28515625, + "learning_rate": 8.944938820912125e-06, + "loss": 2.8971, + "mean_token_accuracy": 0.46479469451107663, + "step": 5691 + }, + { + "epoch": 1.0552465702632554, + "grad_norm": 6.38671875, + "learning_rate": 8.944753429736746e-06, + "loss": 2.9445, + "mean_token_accuracy": 0.4489905067522396, + "step": 5692 + }, + { + "epoch": 1.0554319614386356, + "grad_norm": 7.78515625, + "learning_rate": 8.944568038561366e-06, + "loss": 2.696, + "mean_token_accuracy": 0.48539928486293205, + "step": 5693 + }, + { + "epoch": 1.0556173526140156, + "grad_norm": 11.8046875, + "learning_rate": 8.944382647385985e-06, + "loss": 2.4134, + "mean_token_accuracy": 0.500067231410515, + "step": 5694 + }, + { + "epoch": 1.0558027437893955, + "grad_norm": 6.14453125, + "learning_rate": 8.944197256210605e-06, + "loss": 3.0006, + "mean_token_accuracy": 0.4424843271613938, + "step": 5695 + }, + { + "epoch": 1.0559881349647757, + "grad_norm": 7.19921875, + "learning_rate": 8.944011865035224e-06, + "loss": 2.5536, + "mean_token_accuracy": 0.4783821478382148, + "step": 5696 + }, + { + "epoch": 1.0561735261401557, + "grad_norm": 7.3359375, + "learning_rate": 8.943826473859845e-06, + "loss": 2.7178, + "mean_token_accuracy": 0.470316301703163, + "step": 5697 + }, + { + "epoch": 1.0563589173155359, + "grad_norm": 9.6875, + "learning_rate": 8.943641082684464e-06, + "loss": 2.9396, + "mean_token_accuracy": 0.44092174450974325, + "step": 5698 + }, + { + "epoch": 1.0565443084909159, + "grad_norm": 6.90625, + "learning_rate": 8.943455691509086e-06, + "loss": 2.8208, + "mean_token_accuracy": 0.4552061145219897, + "step": 5699 + }, + { + "epoch": 1.0567296996662958, + "grad_norm": 8.65625, + "learning_rate": 8.943270300333705e-06, + "loss": 2.457, + "mean_token_accuracy": 0.47265865004827523, + "step": 5700 + }, + { + "epoch": 1.056915090841676, + "grad_norm": 7.38671875, + "learning_rate": 8.943084909158325e-06, + "loss": 2.0218, + "mean_token_accuracy": 0.5308861799753458, + "step": 5701 + }, + { + "epoch": 1.057100482017056, + "grad_norm": 5.6875, + "learning_rate": 8.942899517982946e-06, + "loss": 2.5764, + "mean_token_accuracy": 0.48502994011976047, + "step": 5702 + }, + { + "epoch": 1.057285873192436, + "grad_norm": 6.24609375, + "learning_rate": 8.942714126807564e-06, + "loss": 2.5405, + "mean_token_accuracy": 0.5013395847287341, + "step": 5703 + }, + { + "epoch": 1.0574712643678161, + "grad_norm": 7.37109375, + "learning_rate": 8.942528735632185e-06, + "loss": 2.1832, + "mean_token_accuracy": 0.5244963939318578, + "step": 5704 + }, + { + "epoch": 1.057656655543196, + "grad_norm": 6.7578125, + "learning_rate": 8.942343344456804e-06, + "loss": 3.0618, + "mean_token_accuracy": 0.42559818773892116, + "step": 5705 + }, + { + "epoch": 1.0578420467185763, + "grad_norm": 5.3515625, + "learning_rate": 8.942157953281424e-06, + "loss": 2.7036, + "mean_token_accuracy": 0.49498619386717047, + "step": 5706 + }, + { + "epoch": 1.0580274378939563, + "grad_norm": 6.52734375, + "learning_rate": 8.941972562106045e-06, + "loss": 2.6175, + "mean_token_accuracy": 0.47662048490846115, + "step": 5707 + }, + { + "epoch": 1.0582128290693362, + "grad_norm": 6.234375, + "learning_rate": 8.941787170930665e-06, + "loss": 2.5017, + "mean_token_accuracy": 0.4891633064516129, + "step": 5708 + }, + { + "epoch": 1.0583982202447164, + "grad_norm": 4.88671875, + "learning_rate": 8.941601779755284e-06, + "loss": 2.4592, + "mean_token_accuracy": 0.4957750380939188, + "step": 5709 + }, + { + "epoch": 1.0585836114200964, + "grad_norm": 5.4609375, + "learning_rate": 8.941416388579904e-06, + "loss": 2.7625, + "mean_token_accuracy": 0.46655376799322607, + "step": 5710 + }, + { + "epoch": 1.0587690025954763, + "grad_norm": 5.91796875, + "learning_rate": 8.941230997404525e-06, + "loss": 3.2089, + "mean_token_accuracy": 0.42674842558833276, + "step": 5711 + }, + { + "epoch": 1.0589543937708565, + "grad_norm": 7.51171875, + "learning_rate": 8.941045606229144e-06, + "loss": 3.0842, + "mean_token_accuracy": 0.42810364464692485, + "step": 5712 + }, + { + "epoch": 1.0591397849462365, + "grad_norm": 6.08203125, + "learning_rate": 8.940860215053764e-06, + "loss": 2.9793, + "mean_token_accuracy": 0.42973939703628, + "step": 5713 + }, + { + "epoch": 1.0593251761216167, + "grad_norm": 8.234375, + "learning_rate": 8.940674823878383e-06, + "loss": 2.5483, + "mean_token_accuracy": 0.4722112211221122, + "step": 5714 + }, + { + "epoch": 1.0595105672969967, + "grad_norm": 7.44921875, + "learning_rate": 8.940489432703005e-06, + "loss": 3.2442, + "mean_token_accuracy": 0.3954865378170037, + "step": 5715 + }, + { + "epoch": 1.0596959584723766, + "grad_norm": 6.57421875, + "learning_rate": 8.940304041527624e-06, + "loss": 3.1388, + "mean_token_accuracy": 0.41407254952405453, + "step": 5716 + }, + { + "epoch": 1.0598813496477568, + "grad_norm": 5.80078125, + "learning_rate": 8.940118650352244e-06, + "loss": 2.9944, + "mean_token_accuracy": 0.4496709461646348, + "step": 5717 + }, + { + "epoch": 1.0600667408231368, + "grad_norm": 6.51953125, + "learning_rate": 8.939933259176863e-06, + "loss": 2.8126, + "mean_token_accuracy": 0.47378570567675066, + "step": 5718 + }, + { + "epoch": 1.060252131998517, + "grad_norm": 5.578125, + "learning_rate": 8.939747868001484e-06, + "loss": 2.6748, + "mean_token_accuracy": 0.4578462384816394, + "step": 5719 + }, + { + "epoch": 1.060437523173897, + "grad_norm": 8.1484375, + "learning_rate": 8.939562476826104e-06, + "loss": 3.3277, + "mean_token_accuracy": 0.4234686346863469, + "step": 5720 + }, + { + "epoch": 1.060622914349277, + "grad_norm": 5.6328125, + "learning_rate": 8.939377085650723e-06, + "loss": 2.4503, + "mean_token_accuracy": 0.5293363714041583, + "step": 5721 + }, + { + "epoch": 1.060808305524657, + "grad_norm": 6.578125, + "learning_rate": 8.939191694475344e-06, + "loss": 2.9995, + "mean_token_accuracy": 0.45446910617876424, + "step": 5722 + }, + { + "epoch": 1.060993696700037, + "grad_norm": 7.734375, + "learning_rate": 8.939006303299964e-06, + "loss": 2.2549, + "mean_token_accuracy": 0.5109130385098846, + "step": 5723 + }, + { + "epoch": 1.061179087875417, + "grad_norm": 8.0234375, + "learning_rate": 8.938820912124584e-06, + "loss": 2.9035, + "mean_token_accuracy": 0.4328173374613003, + "step": 5724 + }, + { + "epoch": 1.0613644790507972, + "grad_norm": 6.609375, + "learning_rate": 8.938635520949203e-06, + "loss": 3.1527, + "mean_token_accuracy": 0.4398438825725437, + "step": 5725 + }, + { + "epoch": 1.0615498702261772, + "grad_norm": 7.1484375, + "learning_rate": 8.938450129773824e-06, + "loss": 2.9403, + "mean_token_accuracy": 0.41848958333333336, + "step": 5726 + }, + { + "epoch": 1.0617352614015574, + "grad_norm": 5.44140625, + "learning_rate": 8.938264738598443e-06, + "loss": 2.4523, + "mean_token_accuracy": 0.49534474081529944, + "step": 5727 + }, + { + "epoch": 1.0619206525769374, + "grad_norm": 6.5703125, + "learning_rate": 8.938079347423063e-06, + "loss": 2.2225, + "mean_token_accuracy": 0.5306122448979592, + "step": 5728 + }, + { + "epoch": 1.0621060437523173, + "grad_norm": 8.6015625, + "learning_rate": 8.937893956247684e-06, + "loss": 3.0911, + "mean_token_accuracy": 0.42921137618501926, + "step": 5729 + }, + { + "epoch": 1.0622914349276975, + "grad_norm": 8.1328125, + "learning_rate": 8.937708565072302e-06, + "loss": 2.6733, + "mean_token_accuracy": 0.47794404684450226, + "step": 5730 + }, + { + "epoch": 1.0624768261030775, + "grad_norm": 5.69921875, + "learning_rate": 8.937523173896925e-06, + "loss": 2.6952, + "mean_token_accuracy": 0.4754469336954062, + "step": 5731 + }, + { + "epoch": 1.0626622172784574, + "grad_norm": 6.66796875, + "learning_rate": 8.937337782721543e-06, + "loss": 2.9965, + "mean_token_accuracy": 0.43822497976800645, + "step": 5732 + }, + { + "epoch": 1.0628476084538376, + "grad_norm": 5.625, + "learning_rate": 8.937152391546164e-06, + "loss": 2.9898, + "mean_token_accuracy": 0.42593153589821264, + "step": 5733 + }, + { + "epoch": 1.0630329996292176, + "grad_norm": 5.53125, + "learning_rate": 8.936967000370783e-06, + "loss": 2.594, + "mean_token_accuracy": 0.4858052901260213, + "step": 5734 + }, + { + "epoch": 1.0632183908045978, + "grad_norm": 6.46484375, + "learning_rate": 8.936781609195403e-06, + "loss": 2.9792, + "mean_token_accuracy": 0.45512335401846526, + "step": 5735 + }, + { + "epoch": 1.0634037819799778, + "grad_norm": 5.34765625, + "learning_rate": 8.936596218020022e-06, + "loss": 2.331, + "mean_token_accuracy": 0.5089388223716582, + "step": 5736 + }, + { + "epoch": 1.0635891731553577, + "grad_norm": 5.66015625, + "learning_rate": 8.936410826844642e-06, + "loss": 2.6855, + "mean_token_accuracy": 0.4709402693060191, + "step": 5737 + }, + { + "epoch": 1.063774564330738, + "grad_norm": 7.1171875, + "learning_rate": 8.936225435669263e-06, + "loss": 3.047, + "mean_token_accuracy": 0.4472206625491297, + "step": 5738 + }, + { + "epoch": 1.0639599555061179, + "grad_norm": 8.8984375, + "learning_rate": 8.936040044493883e-06, + "loss": 2.2704, + "mean_token_accuracy": 0.5379146919431279, + "step": 5739 + }, + { + "epoch": 1.064145346681498, + "grad_norm": 7.078125, + "learning_rate": 8.935854653318504e-06, + "loss": 2.6071, + "mean_token_accuracy": 0.4864927806241267, + "step": 5740 + }, + { + "epoch": 1.064330737856878, + "grad_norm": 6.73828125, + "learning_rate": 8.935669262143123e-06, + "loss": 3.0707, + "mean_token_accuracy": 0.4472066292435178, + "step": 5741 + }, + { + "epoch": 1.064516129032258, + "grad_norm": 5.90625, + "learning_rate": 8.935483870967743e-06, + "loss": 2.7683, + "mean_token_accuracy": 0.47944368026898976, + "step": 5742 + }, + { + "epoch": 1.0647015202076382, + "grad_norm": 8.5390625, + "learning_rate": 8.935298479792362e-06, + "loss": 3.0508, + "mean_token_accuracy": 0.40850845720143514, + "step": 5743 + }, + { + "epoch": 1.0648869113830182, + "grad_norm": 9.0078125, + "learning_rate": 8.935113088616982e-06, + "loss": 2.9536, + "mean_token_accuracy": 0.4547327596654951, + "step": 5744 + }, + { + "epoch": 1.0650723025583981, + "grad_norm": 6.79296875, + "learning_rate": 8.934927697441603e-06, + "loss": 3.1622, + "mean_token_accuracy": 0.4279286035698215, + "step": 5745 + }, + { + "epoch": 1.0652576937337783, + "grad_norm": 11.53125, + "learning_rate": 8.934742306266222e-06, + "loss": 2.8398, + "mean_token_accuracy": 0.4524207011686144, + "step": 5746 + }, + { + "epoch": 1.0654430849091583, + "grad_norm": 8.5, + "learning_rate": 8.934556915090842e-06, + "loss": 2.8321, + "mean_token_accuracy": 0.45915655690352397, + "step": 5747 + }, + { + "epoch": 1.0656284760845385, + "grad_norm": 6.05078125, + "learning_rate": 8.934371523915463e-06, + "loss": 2.7909, + "mean_token_accuracy": 0.46559466898449947, + "step": 5748 + }, + { + "epoch": 1.0658138672599184, + "grad_norm": 9.9609375, + "learning_rate": 8.934186132740083e-06, + "loss": 2.1183, + "mean_token_accuracy": 0.5217225124219645, + "step": 5749 + }, + { + "epoch": 1.0659992584352984, + "grad_norm": 7.58984375, + "learning_rate": 8.934000741564702e-06, + "loss": 2.424, + "mean_token_accuracy": 0.5076538920855499, + "step": 5750 + }, + { + "epoch": 1.0661846496106786, + "grad_norm": 5.05859375, + "learning_rate": 8.933815350389323e-06, + "loss": 3.3062, + "mean_token_accuracy": 0.3998315485501143, + "step": 5751 + }, + { + "epoch": 1.0663700407860586, + "grad_norm": 6.6328125, + "learning_rate": 8.933629959213941e-06, + "loss": 2.0356, + "mean_token_accuracy": 0.571405596654873, + "step": 5752 + }, + { + "epoch": 1.0665554319614385, + "grad_norm": 6.58203125, + "learning_rate": 8.933444568038562e-06, + "loss": 3.2009, + "mean_token_accuracy": 0.4553668232743043, + "step": 5753 + }, + { + "epoch": 1.0667408231368187, + "grad_norm": 7.12890625, + "learning_rate": 8.933259176863182e-06, + "loss": 2.6738, + "mean_token_accuracy": 0.46766917293233085, + "step": 5754 + }, + { + "epoch": 1.0669262143121987, + "grad_norm": 6.328125, + "learning_rate": 8.933073785687803e-06, + "loss": 2.7168, + "mean_token_accuracy": 0.4710581639803784, + "step": 5755 + }, + { + "epoch": 1.0671116054875789, + "grad_norm": 6.1640625, + "learning_rate": 8.932888394512422e-06, + "loss": 3.3006, + "mean_token_accuracy": 0.4151372137738158, + "step": 5756 + }, + { + "epoch": 1.0672969966629589, + "grad_norm": 6.82421875, + "learning_rate": 8.932703003337042e-06, + "loss": 3.2766, + "mean_token_accuracy": 0.42860548271752086, + "step": 5757 + }, + { + "epoch": 1.0674823878383388, + "grad_norm": 6.32421875, + "learning_rate": 8.932517612161663e-06, + "loss": 2.222, + "mean_token_accuracy": 0.5101957585644372, + "step": 5758 + }, + { + "epoch": 1.067667779013719, + "grad_norm": 4.94921875, + "learning_rate": 8.932332220986281e-06, + "loss": 3.0586, + "mean_token_accuracy": 0.44321070234113713, + "step": 5759 + }, + { + "epoch": 1.067853170189099, + "grad_norm": 7.13671875, + "learning_rate": 8.932146829810902e-06, + "loss": 2.6122, + "mean_token_accuracy": 0.48635259834871297, + "step": 5760 + }, + { + "epoch": 1.068038561364479, + "grad_norm": 4.703125, + "learning_rate": 8.93196143863552e-06, + "loss": 2.788, + "mean_token_accuracy": 0.45470517184583287, + "step": 5761 + }, + { + "epoch": 1.0682239525398591, + "grad_norm": 5.12890625, + "learning_rate": 8.931776047460141e-06, + "loss": 2.7694, + "mean_token_accuracy": 0.45633971291866027, + "step": 5762 + }, + { + "epoch": 1.068409343715239, + "grad_norm": 7.80859375, + "learning_rate": 8.931590656284762e-06, + "loss": 2.4383, + "mean_token_accuracy": 0.49179037336932074, + "step": 5763 + }, + { + "epoch": 1.0685947348906193, + "grad_norm": 6.328125, + "learning_rate": 8.931405265109382e-06, + "loss": 2.4456, + "mean_token_accuracy": 0.5048680682940595, + "step": 5764 + }, + { + "epoch": 1.0687801260659993, + "grad_norm": 7.25390625, + "learning_rate": 8.931219873934001e-06, + "loss": 2.8644, + "mean_token_accuracy": 0.4505724657916783, + "step": 5765 + }, + { + "epoch": 1.0689655172413792, + "grad_norm": 6.390625, + "learning_rate": 8.931034482758621e-06, + "loss": 2.9415, + "mean_token_accuracy": 0.44723946422737665, + "step": 5766 + }, + { + "epoch": 1.0691509084167594, + "grad_norm": 6.75390625, + "learning_rate": 8.930849091583242e-06, + "loss": 2.4076, + "mean_token_accuracy": 0.4897292885321725, + "step": 5767 + }, + { + "epoch": 1.0693362995921394, + "grad_norm": 5.1015625, + "learning_rate": 8.93066370040786e-06, + "loss": 2.9646, + "mean_token_accuracy": 0.44514887354239785, + "step": 5768 + }, + { + "epoch": 1.0695216907675196, + "grad_norm": 5.890625, + "learning_rate": 8.930478309232481e-06, + "loss": 2.4079, + "mean_token_accuracy": 0.4857183257918552, + "step": 5769 + }, + { + "epoch": 1.0697070819428995, + "grad_norm": 5.16015625, + "learning_rate": 8.9302929180571e-06, + "loss": 2.5148, + "mean_token_accuracy": 0.5208443972384493, + "step": 5770 + }, + { + "epoch": 1.0698924731182795, + "grad_norm": 6.9765625, + "learning_rate": 8.930107526881722e-06, + "loss": 2.5371, + "mean_token_accuracy": 0.47689463955637706, + "step": 5771 + }, + { + "epoch": 1.0700778642936597, + "grad_norm": 5.34375, + "learning_rate": 8.929922135706341e-06, + "loss": 2.6064, + "mean_token_accuracy": 0.4703448275862069, + "step": 5772 + }, + { + "epoch": 1.0702632554690397, + "grad_norm": 5.96484375, + "learning_rate": 8.929736744530961e-06, + "loss": 3.2041, + "mean_token_accuracy": 0.4141294005708849, + "step": 5773 + }, + { + "epoch": 1.0704486466444196, + "grad_norm": 6.56640625, + "learning_rate": 8.92955135335558e-06, + "loss": 2.5006, + "mean_token_accuracy": 0.48738194766991794, + "step": 5774 + }, + { + "epoch": 1.0706340378197998, + "grad_norm": 8.2109375, + "learning_rate": 8.9293659621802e-06, + "loss": 3.1463, + "mean_token_accuracy": 0.4229600694444444, + "step": 5775 + }, + { + "epoch": 1.0708194289951798, + "grad_norm": 5.3671875, + "learning_rate": 8.929180571004821e-06, + "loss": 2.9061, + "mean_token_accuracy": 0.45018933877075445, + "step": 5776 + }, + { + "epoch": 1.07100482017056, + "grad_norm": 6.6015625, + "learning_rate": 8.92899517982944e-06, + "loss": 2.1618, + "mean_token_accuracy": 0.5411734758813797, + "step": 5777 + }, + { + "epoch": 1.07119021134594, + "grad_norm": 5.6875, + "learning_rate": 8.92880978865406e-06, + "loss": 2.7697, + "mean_token_accuracy": 0.4490987560294491, + "step": 5778 + }, + { + "epoch": 1.07137560252132, + "grad_norm": 7.66796875, + "learning_rate": 8.928624397478681e-06, + "loss": 2.953, + "mean_token_accuracy": 0.4488862837045721, + "step": 5779 + }, + { + "epoch": 1.0715609936967, + "grad_norm": 6.4609375, + "learning_rate": 8.928439006303302e-06, + "loss": 2.9553, + "mean_token_accuracy": 0.44451675886755615, + "step": 5780 + }, + { + "epoch": 1.07174638487208, + "grad_norm": 9.140625, + "learning_rate": 8.92825361512792e-06, + "loss": 2.3443, + "mean_token_accuracy": 0.5105550118831259, + "step": 5781 + }, + { + "epoch": 1.07193177604746, + "grad_norm": 7.28515625, + "learning_rate": 8.92806822395254e-06, + "loss": 3.1704, + "mean_token_accuracy": 0.4397890418986229, + "step": 5782 + }, + { + "epoch": 1.0721171672228402, + "grad_norm": 6.578125, + "learning_rate": 8.927882832777161e-06, + "loss": 3.3804, + "mean_token_accuracy": 0.41244555071561917, + "step": 5783 + }, + { + "epoch": 1.0723025583982202, + "grad_norm": 11.5859375, + "learning_rate": 8.92769744160178e-06, + "loss": 3.1856, + "mean_token_accuracy": 0.4259179882840406, + "step": 5784 + }, + { + "epoch": 1.0724879495736004, + "grad_norm": 7.44921875, + "learning_rate": 8.9275120504264e-06, + "loss": 2.1791, + "mean_token_accuracy": 0.5471489714206821, + "step": 5785 + }, + { + "epoch": 1.0726733407489804, + "grad_norm": 7.1796875, + "learning_rate": 8.92732665925102e-06, + "loss": 2.5774, + "mean_token_accuracy": 0.48245363766048505, + "step": 5786 + }, + { + "epoch": 1.0728587319243603, + "grad_norm": 5.58984375, + "learning_rate": 8.927141268075642e-06, + "loss": 2.6968, + "mean_token_accuracy": 0.46965784377017433, + "step": 5787 + }, + { + "epoch": 1.0730441230997405, + "grad_norm": 7.5703125, + "learning_rate": 8.92695587690026e-06, + "loss": 2.2704, + "mean_token_accuracy": 0.5076905804423487, + "step": 5788 + }, + { + "epoch": 1.0732295142751205, + "grad_norm": 6.12890625, + "learning_rate": 8.92677048572488e-06, + "loss": 2.6718, + "mean_token_accuracy": 0.4686438463548471, + "step": 5789 + }, + { + "epoch": 1.0734149054505004, + "grad_norm": 5.1875, + "learning_rate": 8.9265850945495e-06, + "loss": 2.7832, + "mean_token_accuracy": 0.4414613894270689, + "step": 5790 + }, + { + "epoch": 1.0736002966258806, + "grad_norm": 6.1171875, + "learning_rate": 8.92639970337412e-06, + "loss": 2.9742, + "mean_token_accuracy": 0.4287641207216321, + "step": 5791 + }, + { + "epoch": 1.0737856878012606, + "grad_norm": 6.57421875, + "learning_rate": 8.92621431219874e-06, + "loss": 2.7078, + "mean_token_accuracy": 0.4662937062937063, + "step": 5792 + }, + { + "epoch": 1.0739710789766408, + "grad_norm": 6.84765625, + "learning_rate": 8.92602892102336e-06, + "loss": 3.0665, + "mean_token_accuracy": 0.4523151347615757, + "step": 5793 + }, + { + "epoch": 1.0741564701520208, + "grad_norm": 7.01171875, + "learning_rate": 8.92584352984798e-06, + "loss": 3.3212, + "mean_token_accuracy": 0.40908463343918267, + "step": 5794 + }, + { + "epoch": 1.0743418613274007, + "grad_norm": 9.7890625, + "learning_rate": 8.9256581386726e-06, + "loss": 2.1224, + "mean_token_accuracy": 0.5443458980044346, + "step": 5795 + }, + { + "epoch": 1.074527252502781, + "grad_norm": 6.32421875, + "learning_rate": 8.925472747497221e-06, + "loss": 2.6615, + "mean_token_accuracy": 0.4880177749563561, + "step": 5796 + }, + { + "epoch": 1.0747126436781609, + "grad_norm": 7.015625, + "learning_rate": 8.92528735632184e-06, + "loss": 2.8996, + "mean_token_accuracy": 0.44215032103739144, + "step": 5797 + }, + { + "epoch": 1.074898034853541, + "grad_norm": 5.40234375, + "learning_rate": 8.92510196514646e-06, + "loss": 2.711, + "mean_token_accuracy": 0.457280947926411, + "step": 5798 + }, + { + "epoch": 1.075083426028921, + "grad_norm": 7.26171875, + "learning_rate": 8.924916573971079e-06, + "loss": 2.6011, + "mean_token_accuracy": 0.4884597268016957, + "step": 5799 + }, + { + "epoch": 1.075268817204301, + "grad_norm": 5.2890625, + "learning_rate": 8.9247311827957e-06, + "loss": 2.3566, + "mean_token_accuracy": 0.49474635634391595, + "step": 5800 + }, + { + "epoch": 1.0754542083796812, + "grad_norm": 6.140625, + "learning_rate": 8.92454579162032e-06, + "loss": 3.244, + "mean_token_accuracy": 0.41499359248184536, + "step": 5801 + }, + { + "epoch": 1.0756395995550612, + "grad_norm": 8.4375, + "learning_rate": 8.924360400444939e-06, + "loss": 3.4775, + "mean_token_accuracy": 0.3731082654249127, + "step": 5802 + }, + { + "epoch": 1.0758249907304411, + "grad_norm": 6.30078125, + "learning_rate": 8.92417500926956e-06, + "loss": 2.519, + "mean_token_accuracy": 0.483072546230441, + "step": 5803 + }, + { + "epoch": 1.0760103819058213, + "grad_norm": 5.3515625, + "learning_rate": 8.92398961809418e-06, + "loss": 2.7978, + "mean_token_accuracy": 0.47080056722959907, + "step": 5804 + }, + { + "epoch": 1.0761957730812013, + "grad_norm": 5.89453125, + "learning_rate": 8.9238042269188e-06, + "loss": 2.279, + "mean_token_accuracy": 0.5139495390587093, + "step": 5805 + }, + { + "epoch": 1.0763811642565815, + "grad_norm": 6.7265625, + "learning_rate": 8.923618835743419e-06, + "loss": 2.3307, + "mean_token_accuracy": 0.5020222446916077, + "step": 5806 + }, + { + "epoch": 1.0765665554319614, + "grad_norm": 6.0078125, + "learning_rate": 8.92343344456804e-06, + "loss": 3.3019, + "mean_token_accuracy": 0.4198668714797747, + "step": 5807 + }, + { + "epoch": 1.0767519466073414, + "grad_norm": 5.98046875, + "learning_rate": 8.923248053392658e-06, + "loss": 2.6598, + "mean_token_accuracy": 0.4862234201856843, + "step": 5808 + }, + { + "epoch": 1.0769373377827216, + "grad_norm": 6.28125, + "learning_rate": 8.923062662217279e-06, + "loss": 2.7145, + "mean_token_accuracy": 0.4743436754176611, + "step": 5809 + }, + { + "epoch": 1.0771227289581016, + "grad_norm": 6.015625, + "learning_rate": 8.9228772710419e-06, + "loss": 2.3365, + "mean_token_accuracy": 0.5075952995127544, + "step": 5810 + }, + { + "epoch": 1.0773081201334818, + "grad_norm": 6.4765625, + "learning_rate": 8.92269187986652e-06, + "loss": 2.6944, + "mean_token_accuracy": 0.45133149678604223, + "step": 5811 + }, + { + "epoch": 1.0774935113088617, + "grad_norm": 7.11328125, + "learning_rate": 8.92250648869114e-06, + "loss": 2.5046, + "mean_token_accuracy": 0.4671948846260773, + "step": 5812 + }, + { + "epoch": 1.0776789024842417, + "grad_norm": 8.078125, + "learning_rate": 8.922321097515759e-06, + "loss": 2.9621, + "mean_token_accuracy": 0.4457315138051412, + "step": 5813 + }, + { + "epoch": 1.0778642936596219, + "grad_norm": 5.24609375, + "learning_rate": 8.92213570634038e-06, + "loss": 2.5833, + "mean_token_accuracy": 0.4936175644625989, + "step": 5814 + }, + { + "epoch": 1.0780496848350019, + "grad_norm": 6.3203125, + "learning_rate": 8.921950315164998e-06, + "loss": 2.6766, + "mean_token_accuracy": 0.4688198757763975, + "step": 5815 + }, + { + "epoch": 1.0782350760103818, + "grad_norm": 5.90234375, + "learning_rate": 8.921764923989619e-06, + "loss": 3.1408, + "mean_token_accuracy": 0.43957951586459637, + "step": 5816 + }, + { + "epoch": 1.078420467185762, + "grad_norm": 5.765625, + "learning_rate": 8.921579532814238e-06, + "loss": 3.0975, + "mean_token_accuracy": 0.4502564102564103, + "step": 5817 + }, + { + "epoch": 1.078605858361142, + "grad_norm": 5.265625, + "learning_rate": 8.921394141638858e-06, + "loss": 2.7628, + "mean_token_accuracy": 0.45309358945722583, + "step": 5818 + }, + { + "epoch": 1.078791249536522, + "grad_norm": 6.55078125, + "learning_rate": 8.921208750463479e-06, + "loss": 2.8176, + "mean_token_accuracy": 0.4581901489117984, + "step": 5819 + }, + { + "epoch": 1.0789766407119021, + "grad_norm": 8.7578125, + "learning_rate": 8.921023359288099e-06, + "loss": 2.4382, + "mean_token_accuracy": 0.4910374029640085, + "step": 5820 + }, + { + "epoch": 1.079162031887282, + "grad_norm": 5.08203125, + "learning_rate": 8.92083796811272e-06, + "loss": 2.5454, + "mean_token_accuracy": 0.4872926858370123, + "step": 5821 + }, + { + "epoch": 1.0793474230626623, + "grad_norm": 5.43359375, + "learning_rate": 8.920652576937338e-06, + "loss": 2.5622, + "mean_token_accuracy": 0.48317801295214025, + "step": 5822 + }, + { + "epoch": 1.0795328142380423, + "grad_norm": 7.62109375, + "learning_rate": 8.920467185761959e-06, + "loss": 2.8763, + "mean_token_accuracy": 0.43857493857493857, + "step": 5823 + }, + { + "epoch": 1.0797182054134222, + "grad_norm": 6.5859375, + "learning_rate": 8.920281794586578e-06, + "loss": 2.8661, + "mean_token_accuracy": 0.45296884185773073, + "step": 5824 + }, + { + "epoch": 1.0799035965888024, + "grad_norm": 5.296875, + "learning_rate": 8.920096403411198e-06, + "loss": 2.6759, + "mean_token_accuracy": 0.4666357738646895, + "step": 5825 + }, + { + "epoch": 1.0800889877641824, + "grad_norm": 6.0546875, + "learning_rate": 8.919911012235819e-06, + "loss": 2.9337, + "mean_token_accuracy": 0.46113989637305697, + "step": 5826 + }, + { + "epoch": 1.0802743789395626, + "grad_norm": 5.97265625, + "learning_rate": 8.919725621060437e-06, + "loss": 2.5825, + "mean_token_accuracy": 0.465184318314804, + "step": 5827 + }, + { + "epoch": 1.0804597701149425, + "grad_norm": 5.2578125, + "learning_rate": 8.919540229885058e-06, + "loss": 2.3426, + "mean_token_accuracy": 0.49328859060402686, + "step": 5828 + }, + { + "epoch": 1.0806451612903225, + "grad_norm": 6.93359375, + "learning_rate": 8.919354838709678e-06, + "loss": 2.5996, + "mean_token_accuracy": 0.5106905012267788, + "step": 5829 + }, + { + "epoch": 1.0808305524657027, + "grad_norm": 7.609375, + "learning_rate": 8.919169447534299e-06, + "loss": 3.3304, + "mean_token_accuracy": 0.43202065848934795, + "step": 5830 + }, + { + "epoch": 1.0810159436410827, + "grad_norm": 7.78125, + "learning_rate": 8.918984056358918e-06, + "loss": 2.2893, + "mean_token_accuracy": 0.49600798403193613, + "step": 5831 + }, + { + "epoch": 1.0812013348164626, + "grad_norm": 7.7265625, + "learning_rate": 8.918798665183538e-06, + "loss": 2.763, + "mean_token_accuracy": 0.47485760781122865, + "step": 5832 + }, + { + "epoch": 1.0813867259918428, + "grad_norm": 6.30859375, + "learning_rate": 8.918613274008157e-06, + "loss": 2.9272, + "mean_token_accuracy": 0.45123287671232876, + "step": 5833 + }, + { + "epoch": 1.0815721171672228, + "grad_norm": 6.57421875, + "learning_rate": 8.918427882832778e-06, + "loss": 2.9774, + "mean_token_accuracy": 0.4481352560914968, + "step": 5834 + }, + { + "epoch": 1.081757508342603, + "grad_norm": 5.78515625, + "learning_rate": 8.918242491657398e-06, + "loss": 2.6549, + "mean_token_accuracy": 0.4764795144157815, + "step": 5835 + }, + { + "epoch": 1.081942899517983, + "grad_norm": 6.625, + "learning_rate": 8.918057100482019e-06, + "loss": 2.6245, + "mean_token_accuracy": 0.46986301369863015, + "step": 5836 + }, + { + "epoch": 1.082128290693363, + "grad_norm": 6.84375, + "learning_rate": 8.917871709306637e-06, + "loss": 2.985, + "mean_token_accuracy": 0.4320270924044509, + "step": 5837 + }, + { + "epoch": 1.082313681868743, + "grad_norm": 5.453125, + "learning_rate": 8.917686318131258e-06, + "loss": 2.6733, + "mean_token_accuracy": 0.4623047926763597, + "step": 5838 + }, + { + "epoch": 1.082499073044123, + "grad_norm": 6.83203125, + "learning_rate": 8.917500926955878e-06, + "loss": 2.3709, + "mean_token_accuracy": 0.48354999197560583, + "step": 5839 + }, + { + "epoch": 1.0826844642195033, + "grad_norm": 6.0703125, + "learning_rate": 8.917315535780497e-06, + "loss": 2.9644, + "mean_token_accuracy": 0.43162193698949824, + "step": 5840 + }, + { + "epoch": 1.0828698553948832, + "grad_norm": 7.6875, + "learning_rate": 8.917130144605118e-06, + "loss": 2.8106, + "mean_token_accuracy": 0.45687560738581146, + "step": 5841 + }, + { + "epoch": 1.0830552465702632, + "grad_norm": 7.765625, + "learning_rate": 8.916944753429736e-06, + "loss": 2.816, + "mean_token_accuracy": 0.4612623392162728, + "step": 5842 + }, + { + "epoch": 1.0832406377456434, + "grad_norm": 7.6484375, + "learning_rate": 8.916759362254357e-06, + "loss": 2.1659, + "mean_token_accuracy": 0.5413478516774574, + "step": 5843 + }, + { + "epoch": 1.0834260289210234, + "grad_norm": 5.79296875, + "learning_rate": 8.916573971078977e-06, + "loss": 3.0518, + "mean_token_accuracy": 0.41382941382941385, + "step": 5844 + }, + { + "epoch": 1.0836114200964033, + "grad_norm": 7.83984375, + "learning_rate": 8.916388579903598e-06, + "loss": 2.5888, + "mean_token_accuracy": 0.46653980336187756, + "step": 5845 + }, + { + "epoch": 1.0837968112717835, + "grad_norm": 7.71484375, + "learning_rate": 8.916203188728217e-06, + "loss": 2.769, + "mean_token_accuracy": 0.45228777844671886, + "step": 5846 + }, + { + "epoch": 1.0839822024471635, + "grad_norm": 7.203125, + "learning_rate": 8.916017797552837e-06, + "loss": 3.1581, + "mean_token_accuracy": 0.4230212037402871, + "step": 5847 + }, + { + "epoch": 1.0841675936225437, + "grad_norm": 6.3671875, + "learning_rate": 8.915832406377458e-06, + "loss": 2.9627, + "mean_token_accuracy": 0.4391309094879354, + "step": 5848 + }, + { + "epoch": 1.0843529847979236, + "grad_norm": 7.72265625, + "learning_rate": 8.915647015202076e-06, + "loss": 2.4092, + "mean_token_accuracy": 0.5011114639568117, + "step": 5849 + }, + { + "epoch": 1.0845383759733036, + "grad_norm": 6.1796875, + "learning_rate": 8.915461624026697e-06, + "loss": 3.1298, + "mean_token_accuracy": 0.43235430157261795, + "step": 5850 + }, + { + "epoch": 1.0847237671486838, + "grad_norm": 10.71875, + "learning_rate": 8.915276232851316e-06, + "loss": 2.7077, + "mean_token_accuracy": 0.4445817882159044, + "step": 5851 + }, + { + "epoch": 1.0849091583240638, + "grad_norm": 8.8984375, + "learning_rate": 8.915090841675938e-06, + "loss": 2.9048, + "mean_token_accuracy": 0.4332615715823466, + "step": 5852 + }, + { + "epoch": 1.0850945494994437, + "grad_norm": 5.58203125, + "learning_rate": 8.914905450500557e-06, + "loss": 2.9546, + "mean_token_accuracy": 0.46609006040637013, + "step": 5853 + }, + { + "epoch": 1.085279940674824, + "grad_norm": 7.640625, + "learning_rate": 8.914720059325177e-06, + "loss": 2.342, + "mean_token_accuracy": 0.5257707129094412, + "step": 5854 + }, + { + "epoch": 1.0854653318502039, + "grad_norm": 8.28125, + "learning_rate": 8.914534668149796e-06, + "loss": 2.3807, + "mean_token_accuracy": 0.48614756488772237, + "step": 5855 + }, + { + "epoch": 1.085650723025584, + "grad_norm": 6.62890625, + "learning_rate": 8.914349276974416e-06, + "loss": 2.8633, + "mean_token_accuracy": 0.43610421836228286, + "step": 5856 + }, + { + "epoch": 1.085836114200964, + "grad_norm": 5.47265625, + "learning_rate": 8.914163885799037e-06, + "loss": 2.3424, + "mean_token_accuracy": 0.5262197902416781, + "step": 5857 + }, + { + "epoch": 1.086021505376344, + "grad_norm": 10.828125, + "learning_rate": 8.913978494623656e-06, + "loss": 2.6676, + "mean_token_accuracy": 0.47401055408970977, + "step": 5858 + }, + { + "epoch": 1.0862068965517242, + "grad_norm": 9.4375, + "learning_rate": 8.913793103448276e-06, + "loss": 2.3748, + "mean_token_accuracy": 0.5046898140529866, + "step": 5859 + }, + { + "epoch": 1.0863922877271042, + "grad_norm": 7.69140625, + "learning_rate": 8.913607712272897e-06, + "loss": 2.4944, + "mean_token_accuracy": 0.5003229974160207, + "step": 5860 + }, + { + "epoch": 1.0865776789024841, + "grad_norm": 9.7109375, + "learning_rate": 8.913422321097517e-06, + "loss": 2.8139, + "mean_token_accuracy": 0.45675884102677095, + "step": 5861 + }, + { + "epoch": 1.0867630700778643, + "grad_norm": 6.60546875, + "learning_rate": 8.913236929922136e-06, + "loss": 2.9569, + "mean_token_accuracy": 0.4216105046669831, + "step": 5862 + }, + { + "epoch": 1.0869484612532443, + "grad_norm": 7.51953125, + "learning_rate": 8.913051538746757e-06, + "loss": 2.7842, + "mean_token_accuracy": 0.46524064171123, + "step": 5863 + }, + { + "epoch": 1.0871338524286245, + "grad_norm": 8.796875, + "learning_rate": 8.912866147571377e-06, + "loss": 2.7076, + "mean_token_accuracy": 0.44183877415056627, + "step": 5864 + }, + { + "epoch": 1.0873192436040044, + "grad_norm": 6.66796875, + "learning_rate": 8.912680756395996e-06, + "loss": 2.893, + "mean_token_accuracy": 0.4463802943826975, + "step": 5865 + }, + { + "epoch": 1.0875046347793844, + "grad_norm": 5.8984375, + "learning_rate": 8.912495365220616e-06, + "loss": 2.4264, + "mean_token_accuracy": 0.5079887218045113, + "step": 5866 + }, + { + "epoch": 1.0876900259547646, + "grad_norm": 6.2734375, + "learning_rate": 8.912309974045235e-06, + "loss": 2.9513, + "mean_token_accuracy": 0.44932810750279956, + "step": 5867 + }, + { + "epoch": 1.0878754171301446, + "grad_norm": 10.6640625, + "learning_rate": 8.912124582869857e-06, + "loss": 2.0251, + "mean_token_accuracy": 0.5367423782339366, + "step": 5868 + }, + { + "epoch": 1.0880608083055248, + "grad_norm": 6.28125, + "learning_rate": 8.911939191694476e-06, + "loss": 2.7347, + "mean_token_accuracy": 0.4708604483007954, + "step": 5869 + }, + { + "epoch": 1.0882461994809047, + "grad_norm": 5.61328125, + "learning_rate": 8.911753800519097e-06, + "loss": 3.0234, + "mean_token_accuracy": 0.42890700566533096, + "step": 5870 + }, + { + "epoch": 1.0884315906562847, + "grad_norm": 6.296875, + "learning_rate": 8.911568409343715e-06, + "loss": 2.6174, + "mean_token_accuracy": 0.47026169706582077, + "step": 5871 + }, + { + "epoch": 1.0886169818316649, + "grad_norm": 9.6875, + "learning_rate": 8.911383018168336e-06, + "loss": 3.0913, + "mean_token_accuracy": 0.4249743062692703, + "step": 5872 + }, + { + "epoch": 1.0888023730070449, + "grad_norm": 6.96484375, + "learning_rate": 8.911197626992956e-06, + "loss": 3.2522, + "mean_token_accuracy": 0.4463864613677348, + "step": 5873 + }, + { + "epoch": 1.0889877641824248, + "grad_norm": 6.16796875, + "learning_rate": 8.911012235817575e-06, + "loss": 2.7667, + "mean_token_accuracy": 0.46302981682043054, + "step": 5874 + }, + { + "epoch": 1.089173155357805, + "grad_norm": 7.078125, + "learning_rate": 8.910826844642196e-06, + "loss": 2.6224, + "mean_token_accuracy": 0.47866018368449487, + "step": 5875 + }, + { + "epoch": 1.089358546533185, + "grad_norm": 8.546875, + "learning_rate": 8.910641453466816e-06, + "loss": 3.3146, + "mean_token_accuracy": 0.41527415143603136, + "step": 5876 + }, + { + "epoch": 1.0895439377085652, + "grad_norm": 6.25390625, + "learning_rate": 8.910456062291437e-06, + "loss": 3.0159, + "mean_token_accuracy": 0.46915224145583667, + "step": 5877 + }, + { + "epoch": 1.0897293288839451, + "grad_norm": 5.81640625, + "learning_rate": 8.910270671116055e-06, + "loss": 3.417, + "mean_token_accuracy": 0.4017118001964361, + "step": 5878 + }, + { + "epoch": 1.089914720059325, + "grad_norm": 9.3671875, + "learning_rate": 8.910085279940676e-06, + "loss": 2.7975, + "mean_token_accuracy": 0.4427985716175109, + "step": 5879 + }, + { + "epoch": 1.0901001112347053, + "grad_norm": 10.0, + "learning_rate": 8.909899888765295e-06, + "loss": 2.654, + "mean_token_accuracy": 0.4635085369936391, + "step": 5880 + }, + { + "epoch": 1.0902855024100853, + "grad_norm": 7.109375, + "learning_rate": 8.909714497589915e-06, + "loss": 2.7899, + "mean_token_accuracy": 0.4829136690647482, + "step": 5881 + }, + { + "epoch": 1.0904708935854652, + "grad_norm": 6.49609375, + "learning_rate": 8.909529106414536e-06, + "loss": 2.5318, + "mean_token_accuracy": 0.4765746638358103, + "step": 5882 + }, + { + "epoch": 1.0906562847608454, + "grad_norm": 7.1875, + "learning_rate": 8.909343715239155e-06, + "loss": 2.5793, + "mean_token_accuracy": 0.4867669953295278, + "step": 5883 + }, + { + "epoch": 1.0908416759362254, + "grad_norm": 6.16796875, + "learning_rate": 8.909158324063775e-06, + "loss": 2.9176, + "mean_token_accuracy": 0.4633587786259542, + "step": 5884 + }, + { + "epoch": 1.0910270671116056, + "grad_norm": 6.02734375, + "learning_rate": 8.908972932888395e-06, + "loss": 2.9709, + "mean_token_accuracy": 0.4574265505984766, + "step": 5885 + }, + { + "epoch": 1.0912124582869855, + "grad_norm": 8.546875, + "learning_rate": 8.908787541713016e-06, + "loss": 2.5235, + "mean_token_accuracy": 0.4960962498400102, + "step": 5886 + }, + { + "epoch": 1.0913978494623655, + "grad_norm": 7.265625, + "learning_rate": 8.908602150537635e-06, + "loss": 2.5001, + "mean_token_accuracy": 0.4853275992916772, + "step": 5887 + }, + { + "epoch": 1.0915832406377457, + "grad_norm": 5.54296875, + "learning_rate": 8.908416759362255e-06, + "loss": 2.4464, + "mean_token_accuracy": 0.5046728971962616, + "step": 5888 + }, + { + "epoch": 1.0917686318131257, + "grad_norm": 5.51953125, + "learning_rate": 8.908231368186874e-06, + "loss": 2.7388, + "mean_token_accuracy": 0.45863719234275296, + "step": 5889 + }, + { + "epoch": 1.0919540229885056, + "grad_norm": 6.40625, + "learning_rate": 8.908045977011495e-06, + "loss": 2.9976, + "mean_token_accuracy": 0.4380753138075314, + "step": 5890 + }, + { + "epoch": 1.0921394141638858, + "grad_norm": 6.7265625, + "learning_rate": 8.907860585836115e-06, + "loss": 2.7043, + "mean_token_accuracy": 0.45947947524333477, + "step": 5891 + }, + { + "epoch": 1.0923248053392658, + "grad_norm": 7.9140625, + "learning_rate": 8.907675194660736e-06, + "loss": 2.6479, + "mean_token_accuracy": 0.47155460906601765, + "step": 5892 + }, + { + "epoch": 1.092510196514646, + "grad_norm": 5.796875, + "learning_rate": 8.907489803485356e-06, + "loss": 3.1548, + "mean_token_accuracy": 0.4369444802267165, + "step": 5893 + }, + { + "epoch": 1.092695587690026, + "grad_norm": 5.91015625, + "learning_rate": 8.907304412309975e-06, + "loss": 2.8659, + "mean_token_accuracy": 0.455086258179655, + "step": 5894 + }, + { + "epoch": 1.092880978865406, + "grad_norm": 12.3203125, + "learning_rate": 8.907119021134595e-06, + "loss": 2.3595, + "mean_token_accuracy": 0.49942223249364454, + "step": 5895 + }, + { + "epoch": 1.093066370040786, + "grad_norm": 8.125, + "learning_rate": 8.906933629959214e-06, + "loss": 2.9794, + "mean_token_accuracy": 0.434631743899709, + "step": 5896 + }, + { + "epoch": 1.093251761216166, + "grad_norm": 7.1875, + "learning_rate": 8.906748238783835e-06, + "loss": 2.8822, + "mean_token_accuracy": 0.45534901858832705, + "step": 5897 + }, + { + "epoch": 1.0934371523915463, + "grad_norm": 7.45703125, + "learning_rate": 8.906562847608453e-06, + "loss": 2.6458, + "mean_token_accuracy": 0.4851845434066886, + "step": 5898 + }, + { + "epoch": 1.0936225435669262, + "grad_norm": 9.171875, + "learning_rate": 8.906377456433074e-06, + "loss": 2.5894, + "mean_token_accuracy": 0.46762200892253614, + "step": 5899 + }, + { + "epoch": 1.0938079347423062, + "grad_norm": 10.484375, + "learning_rate": 8.906192065257694e-06, + "loss": 2.6075, + "mean_token_accuracy": 0.4638793331569198, + "step": 5900 + }, + { + "epoch": 1.0939933259176864, + "grad_norm": 7.34375, + "learning_rate": 8.906006674082315e-06, + "loss": 3.4862, + "mean_token_accuracy": 0.41389024987751105, + "step": 5901 + }, + { + "epoch": 1.0941787170930664, + "grad_norm": 7.01953125, + "learning_rate": 8.905821282906935e-06, + "loss": 2.5768, + "mean_token_accuracy": 0.48210188159706285, + "step": 5902 + }, + { + "epoch": 1.0943641082684463, + "grad_norm": 5.9921875, + "learning_rate": 8.905635891731554e-06, + "loss": 2.5851, + "mean_token_accuracy": 0.4748700173310225, + "step": 5903 + }, + { + "epoch": 1.0945494994438265, + "grad_norm": 6.5390625, + "learning_rate": 8.905450500556175e-06, + "loss": 2.7191, + "mean_token_accuracy": 0.4780799524446426, + "step": 5904 + }, + { + "epoch": 1.0947348906192065, + "grad_norm": 8.8203125, + "learning_rate": 8.905265109380793e-06, + "loss": 2.8511, + "mean_token_accuracy": 0.44391067255258376, + "step": 5905 + }, + { + "epoch": 1.0949202817945867, + "grad_norm": 8.28125, + "learning_rate": 8.905079718205414e-06, + "loss": 2.2929, + "mean_token_accuracy": 0.5014671361502347, + "step": 5906 + }, + { + "epoch": 1.0951056729699666, + "grad_norm": 9.3203125, + "learning_rate": 8.904894327030034e-06, + "loss": 2.8546, + "mean_token_accuracy": 0.4511727078891258, + "step": 5907 + }, + { + "epoch": 1.0952910641453466, + "grad_norm": 8.234375, + "learning_rate": 8.904708935854655e-06, + "loss": 2.6402, + "mean_token_accuracy": 0.48106575963718823, + "step": 5908 + }, + { + "epoch": 1.0954764553207268, + "grad_norm": 6.171875, + "learning_rate": 8.904523544679274e-06, + "loss": 2.6611, + "mean_token_accuracy": 0.4696384211979023, + "step": 5909 + }, + { + "epoch": 1.0956618464961068, + "grad_norm": 9.59375, + "learning_rate": 8.904338153503894e-06, + "loss": 2.4396, + "mean_token_accuracy": 0.48324447829398326, + "step": 5910 + }, + { + "epoch": 1.095847237671487, + "grad_norm": 6.88671875, + "learning_rate": 8.904152762328515e-06, + "loss": 2.6302, + "mean_token_accuracy": 0.4809052333804809, + "step": 5911 + }, + { + "epoch": 1.096032628846867, + "grad_norm": 5.75, + "learning_rate": 8.903967371153134e-06, + "loss": 3.3385, + "mean_token_accuracy": 0.40424188865042293, + "step": 5912 + }, + { + "epoch": 1.0962180200222469, + "grad_norm": 5.26953125, + "learning_rate": 8.903781979977754e-06, + "loss": 2.9783, + "mean_token_accuracy": 0.45852593733949665, + "step": 5913 + }, + { + "epoch": 1.096403411197627, + "grad_norm": 6.2109375, + "learning_rate": 8.903596588802373e-06, + "loss": 2.4342, + "mean_token_accuracy": 0.523036253776435, + "step": 5914 + }, + { + "epoch": 1.096588802373007, + "grad_norm": 7.2734375, + "learning_rate": 8.903411197626993e-06, + "loss": 2.6705, + "mean_token_accuracy": 0.4719347376111043, + "step": 5915 + }, + { + "epoch": 1.096774193548387, + "grad_norm": 8.078125, + "learning_rate": 8.903225806451614e-06, + "loss": 2.782, + "mean_token_accuracy": 0.5186085035891772, + "step": 5916 + }, + { + "epoch": 1.0969595847237672, + "grad_norm": 6.74609375, + "learning_rate": 8.903040415276234e-06, + "loss": 2.5854, + "mean_token_accuracy": 0.4954763171899947, + "step": 5917 + }, + { + "epoch": 1.0971449758991472, + "grad_norm": 5.7265625, + "learning_rate": 8.902855024100853e-06, + "loss": 2.7441, + "mean_token_accuracy": 0.47062154891689084, + "step": 5918 + }, + { + "epoch": 1.0973303670745271, + "grad_norm": 6.05078125, + "learning_rate": 8.902669632925474e-06, + "loss": 3.0181, + "mean_token_accuracy": 0.44542124542124545, + "step": 5919 + }, + { + "epoch": 1.0975157582499073, + "grad_norm": 5.9453125, + "learning_rate": 8.902484241750094e-06, + "loss": 2.5784, + "mean_token_accuracy": 0.4626079320942485, + "step": 5920 + }, + { + "epoch": 1.0977011494252873, + "grad_norm": 5.0859375, + "learning_rate": 8.902298850574713e-06, + "loss": 2.6459, + "mean_token_accuracy": 0.47695652173913045, + "step": 5921 + }, + { + "epoch": 1.0978865406006675, + "grad_norm": 5.7265625, + "learning_rate": 8.902113459399333e-06, + "loss": 2.4642, + "mean_token_accuracy": 0.4829562008351947, + "step": 5922 + }, + { + "epoch": 1.0980719317760474, + "grad_norm": 5.9921875, + "learning_rate": 8.901928068223952e-06, + "loss": 3.5238, + "mean_token_accuracy": 0.4046744982868331, + "step": 5923 + }, + { + "epoch": 1.0982573229514274, + "grad_norm": 6.61328125, + "learning_rate": 8.901742677048574e-06, + "loss": 3.3899, + "mean_token_accuracy": 0.4277487853158179, + "step": 5924 + }, + { + "epoch": 1.0984427141268076, + "grad_norm": 6.4453125, + "learning_rate": 8.901557285873193e-06, + "loss": 3.0662, + "mean_token_accuracy": 0.4227543083727314, + "step": 5925 + }, + { + "epoch": 1.0986281053021876, + "grad_norm": 6.9296875, + "learning_rate": 8.901371894697814e-06, + "loss": 2.3191, + "mean_token_accuracy": 0.5286597358808655, + "step": 5926 + }, + { + "epoch": 1.0988134964775678, + "grad_norm": 6.078125, + "learning_rate": 8.901186503522432e-06, + "loss": 2.8647, + "mean_token_accuracy": 0.46147596479350034, + "step": 5927 + }, + { + "epoch": 1.0989988876529477, + "grad_norm": 5.328125, + "learning_rate": 8.901001112347053e-06, + "loss": 2.5455, + "mean_token_accuracy": 0.4766666666666667, + "step": 5928 + }, + { + "epoch": 1.0991842788283277, + "grad_norm": 9.40625, + "learning_rate": 8.900815721171673e-06, + "loss": 2.3132, + "mean_token_accuracy": 0.5059283551967709, + "step": 5929 + }, + { + "epoch": 1.0993696700037079, + "grad_norm": 5.5859375, + "learning_rate": 8.900630329996292e-06, + "loss": 2.6389, + "mean_token_accuracy": 0.45506623702112425, + "step": 5930 + }, + { + "epoch": 1.0995550611790879, + "grad_norm": 5.90625, + "learning_rate": 8.900444938820913e-06, + "loss": 2.6946, + "mean_token_accuracy": 0.47724079797695984, + "step": 5931 + }, + { + "epoch": 1.0997404523544678, + "grad_norm": 7.05859375, + "learning_rate": 8.900259547645533e-06, + "loss": 3.09, + "mean_token_accuracy": 0.44373325394462637, + "step": 5932 + }, + { + "epoch": 1.099925843529848, + "grad_norm": 6.4921875, + "learning_rate": 8.900074156470154e-06, + "loss": 2.6832, + "mean_token_accuracy": 0.4835182767624021, + "step": 5933 + }, + { + "epoch": 1.100111234705228, + "grad_norm": 6.3515625, + "learning_rate": 8.899888765294772e-06, + "loss": 2.9061, + "mean_token_accuracy": 0.4621715732826844, + "step": 5934 + }, + { + "epoch": 1.1002966258806082, + "grad_norm": 6.6015625, + "learning_rate": 8.899703374119393e-06, + "loss": 3.5846, + "mean_token_accuracy": 0.40544321505786596, + "step": 5935 + }, + { + "epoch": 1.1004820170559881, + "grad_norm": 6.9765625, + "learning_rate": 8.899517982944012e-06, + "loss": 2.375, + "mean_token_accuracy": 0.5258087968011632, + "step": 5936 + }, + { + "epoch": 1.100667408231368, + "grad_norm": 6.953125, + "learning_rate": 8.899332591768632e-06, + "loss": 2.6999, + "mean_token_accuracy": 0.47862805928990004, + "step": 5937 + }, + { + "epoch": 1.1008527994067483, + "grad_norm": 5.98828125, + "learning_rate": 8.899147200593253e-06, + "loss": 3.1124, + "mean_token_accuracy": 0.41685193419297467, + "step": 5938 + }, + { + "epoch": 1.1010381905821283, + "grad_norm": 6.984375, + "learning_rate": 8.898961809417872e-06, + "loss": 2.763, + "mean_token_accuracy": 0.4514224369296833, + "step": 5939 + }, + { + "epoch": 1.1012235817575085, + "grad_norm": 5.73046875, + "learning_rate": 8.898776418242492e-06, + "loss": 2.6891, + "mean_token_accuracy": 0.46892265193370164, + "step": 5940 + }, + { + "epoch": 1.1014089729328884, + "grad_norm": 7.25, + "learning_rate": 8.898591027067113e-06, + "loss": 2.1948, + "mean_token_accuracy": 0.5301125949201362, + "step": 5941 + }, + { + "epoch": 1.1015943641082684, + "grad_norm": 6.50390625, + "learning_rate": 8.898405635891733e-06, + "loss": 2.5704, + "mean_token_accuracy": 0.4674279499728113, + "step": 5942 + }, + { + "epoch": 1.1017797552836486, + "grad_norm": 6.5, + "learning_rate": 8.898220244716352e-06, + "loss": 2.9638, + "mean_token_accuracy": 0.44887328728343334, + "step": 5943 + }, + { + "epoch": 1.1019651464590285, + "grad_norm": 6.89453125, + "learning_rate": 8.898034853540972e-06, + "loss": 2.497, + "mean_token_accuracy": 0.4917267713194739, + "step": 5944 + }, + { + "epoch": 1.1021505376344085, + "grad_norm": 7.5078125, + "learning_rate": 8.897849462365593e-06, + "loss": 2.4281, + "mean_token_accuracy": 0.4920677601505781, + "step": 5945 + }, + { + "epoch": 1.1023359288097887, + "grad_norm": 7.0390625, + "learning_rate": 8.897664071190212e-06, + "loss": 3.2403, + "mean_token_accuracy": 0.42653766413268834, + "step": 5946 + }, + { + "epoch": 1.1025213199851687, + "grad_norm": 6.296875, + "learning_rate": 8.897478680014832e-06, + "loss": 2.2185, + "mean_token_accuracy": 0.5519073569482289, + "step": 5947 + }, + { + "epoch": 1.1027067111605489, + "grad_norm": 10.53125, + "learning_rate": 8.897293288839451e-06, + "loss": 2.6036, + "mean_token_accuracy": 0.4622032726434662, + "step": 5948 + }, + { + "epoch": 1.1028921023359288, + "grad_norm": 6.234375, + "learning_rate": 8.897107897664073e-06, + "loss": 2.2669, + "mean_token_accuracy": 0.5373719489060327, + "step": 5949 + }, + { + "epoch": 1.1030774935113088, + "grad_norm": 7.26171875, + "learning_rate": 8.896922506488692e-06, + "loss": 3.1001, + "mean_token_accuracy": 0.4223356009070295, + "step": 5950 + }, + { + "epoch": 1.103262884686689, + "grad_norm": 7.4296875, + "learning_rate": 8.896737115313312e-06, + "loss": 2.808, + "mean_token_accuracy": 0.4609227186726823, + "step": 5951 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 7.42578125, + "learning_rate": 8.896551724137931e-06, + "loss": 2.7813, + "mean_token_accuracy": 0.48213881980556184, + "step": 5952 + }, + { + "epoch": 1.103633667037449, + "grad_norm": 5.734375, + "learning_rate": 8.896366332962552e-06, + "loss": 2.9833, + "mean_token_accuracy": 0.43242947736715504, + "step": 5953 + }, + { + "epoch": 1.103819058212829, + "grad_norm": 7.55078125, + "learning_rate": 8.896180941787172e-06, + "loss": 3.1817, + "mean_token_accuracy": 0.4266177921894826, + "step": 5954 + }, + { + "epoch": 1.104004449388209, + "grad_norm": 8.1875, + "learning_rate": 8.895995550611791e-06, + "loss": 2.7619, + "mean_token_accuracy": 0.4552127225771374, + "step": 5955 + }, + { + "epoch": 1.1041898405635893, + "grad_norm": 5.50390625, + "learning_rate": 8.895810159436411e-06, + "loss": 2.7201, + "mean_token_accuracy": 0.4820232246538633, + "step": 5956 + }, + { + "epoch": 1.1043752317389692, + "grad_norm": 6.39453125, + "learning_rate": 8.895624768261032e-06, + "loss": 2.7391, + "mean_token_accuracy": 0.47200864968765016, + "step": 5957 + }, + { + "epoch": 1.1045606229143492, + "grad_norm": 7.609375, + "learning_rate": 8.895439377085652e-06, + "loss": 2.8617, + "mean_token_accuracy": 0.4474417240873772, + "step": 5958 + }, + { + "epoch": 1.1047460140897294, + "grad_norm": 7.828125, + "learning_rate": 8.895253985910271e-06, + "loss": 2.8975, + "mean_token_accuracy": 0.4643700513960272, + "step": 5959 + }, + { + "epoch": 1.1049314052651094, + "grad_norm": 8.2578125, + "learning_rate": 8.895068594734892e-06, + "loss": 2.8567, + "mean_token_accuracy": 0.45591633223884226, + "step": 5960 + }, + { + "epoch": 1.1051167964404893, + "grad_norm": 10.5078125, + "learning_rate": 8.89488320355951e-06, + "loss": 2.5254, + "mean_token_accuracy": 0.4897573869536486, + "step": 5961 + }, + { + "epoch": 1.1053021876158695, + "grad_norm": 6.03515625, + "learning_rate": 8.894697812384131e-06, + "loss": 2.9708, + "mean_token_accuracy": 0.46972401482905396, + "step": 5962 + }, + { + "epoch": 1.1054875787912495, + "grad_norm": 5.31640625, + "learning_rate": 8.894512421208751e-06, + "loss": 2.9248, + "mean_token_accuracy": 0.46721673788124357, + "step": 5963 + }, + { + "epoch": 1.1056729699666297, + "grad_norm": 6.30078125, + "learning_rate": 8.89432703003337e-06, + "loss": 3.2314, + "mean_token_accuracy": 0.42909737209597565, + "step": 5964 + }, + { + "epoch": 1.1058583611420096, + "grad_norm": 6.1328125, + "learning_rate": 8.89414163885799e-06, + "loss": 2.6957, + "mean_token_accuracy": 0.4861111111111111, + "step": 5965 + }, + { + "epoch": 1.1060437523173896, + "grad_norm": 5.7734375, + "learning_rate": 8.893956247682611e-06, + "loss": 3.1668, + "mean_token_accuracy": 0.434232250963126, + "step": 5966 + }, + { + "epoch": 1.1062291434927698, + "grad_norm": 5.78125, + "learning_rate": 8.893770856507232e-06, + "loss": 2.7599, + "mean_token_accuracy": 0.47367799811142586, + "step": 5967 + }, + { + "epoch": 1.1064145346681498, + "grad_norm": 6.4765625, + "learning_rate": 8.89358546533185e-06, + "loss": 3.4905, + "mean_token_accuracy": 0.41115510013972983, + "step": 5968 + }, + { + "epoch": 1.10659992584353, + "grad_norm": 5.94140625, + "learning_rate": 8.893400074156471e-06, + "loss": 2.672, + "mean_token_accuracy": 0.49429657794676807, + "step": 5969 + }, + { + "epoch": 1.10678531701891, + "grad_norm": 6.30078125, + "learning_rate": 8.89321468298109e-06, + "loss": 3.0852, + "mean_token_accuracy": 0.43815426997245177, + "step": 5970 + }, + { + "epoch": 1.1069707081942899, + "grad_norm": 6.58203125, + "learning_rate": 8.89302929180571e-06, + "loss": 2.7507, + "mean_token_accuracy": 0.47865714719732644, + "step": 5971 + }, + { + "epoch": 1.10715609936967, + "grad_norm": 6.0078125, + "learning_rate": 8.89284390063033e-06, + "loss": 2.8232, + "mean_token_accuracy": 0.4657687991021324, + "step": 5972 + }, + { + "epoch": 1.10734149054505, + "grad_norm": 4.859375, + "learning_rate": 8.892658509454951e-06, + "loss": 2.579, + "mean_token_accuracy": 0.48058201612059037, + "step": 5973 + }, + { + "epoch": 1.10752688172043, + "grad_norm": 5.08984375, + "learning_rate": 8.89247311827957e-06, + "loss": 2.5007, + "mean_token_accuracy": 0.47933121345779056, + "step": 5974 + }, + { + "epoch": 1.1077122728958102, + "grad_norm": 5.66015625, + "learning_rate": 8.89228772710419e-06, + "loss": 2.5891, + "mean_token_accuracy": 0.48149417409184375, + "step": 5975 + }, + { + "epoch": 1.1078976640711902, + "grad_norm": 5.13671875, + "learning_rate": 8.892102335928811e-06, + "loss": 2.5206, + "mean_token_accuracy": 0.49604288996681134, + "step": 5976 + }, + { + "epoch": 1.1080830552465704, + "grad_norm": 5.98046875, + "learning_rate": 8.89191694475343e-06, + "loss": 2.2843, + "mean_token_accuracy": 0.5371428571428571, + "step": 5977 + }, + { + "epoch": 1.1082684464219503, + "grad_norm": 6.42578125, + "learning_rate": 8.89173155357805e-06, + "loss": 2.9641, + "mean_token_accuracy": 0.4724199288256228, + "step": 5978 + }, + { + "epoch": 1.1084538375973303, + "grad_norm": 6.59765625, + "learning_rate": 8.89154616240267e-06, + "loss": 2.781, + "mean_token_accuracy": 0.4551843175933917, + "step": 5979 + }, + { + "epoch": 1.1086392287727105, + "grad_norm": 8.09375, + "learning_rate": 8.89136077122729e-06, + "loss": 2.3051, + "mean_token_accuracy": 0.5087342935948513, + "step": 5980 + }, + { + "epoch": 1.1088246199480905, + "grad_norm": 4.9453125, + "learning_rate": 8.89117538005191e-06, + "loss": 2.9737, + "mean_token_accuracy": 0.44810624692572554, + "step": 5981 + }, + { + "epoch": 1.1090100111234706, + "grad_norm": 7.69140625, + "learning_rate": 8.89098998887653e-06, + "loss": 3.0245, + "mean_token_accuracy": 0.4263418749340926, + "step": 5982 + }, + { + "epoch": 1.1091954022988506, + "grad_norm": 13.40625, + "learning_rate": 8.890804597701151e-06, + "loss": 2.6665, + "mean_token_accuracy": 0.47416514371158, + "step": 5983 + }, + { + "epoch": 1.1093807934742306, + "grad_norm": 8.015625, + "learning_rate": 8.89061920652577e-06, + "loss": 3.2707, + "mean_token_accuracy": 0.4064094179202093, + "step": 5984 + }, + { + "epoch": 1.1095661846496108, + "grad_norm": 5.62109375, + "learning_rate": 8.89043381535039e-06, + "loss": 2.799, + "mean_token_accuracy": 0.4718036352338046, + "step": 5985 + }, + { + "epoch": 1.1097515758249907, + "grad_norm": 7.15234375, + "learning_rate": 8.89024842417501e-06, + "loss": 2.4969, + "mean_token_accuracy": 0.5173534381905629, + "step": 5986 + }, + { + "epoch": 1.1099369670003707, + "grad_norm": 8.4453125, + "learning_rate": 8.89006303299963e-06, + "loss": 2.4693, + "mean_token_accuracy": 0.49584043299589053, + "step": 5987 + }, + { + "epoch": 1.110122358175751, + "grad_norm": 7.83203125, + "learning_rate": 8.88987764182425e-06, + "loss": 2.585, + "mean_token_accuracy": 0.48588030214991285, + "step": 5988 + }, + { + "epoch": 1.1103077493511309, + "grad_norm": 7.84765625, + "learning_rate": 8.88969225064887e-06, + "loss": 2.7907, + "mean_token_accuracy": 0.4524405506883605, + "step": 5989 + }, + { + "epoch": 1.1104931405265108, + "grad_norm": 6.3828125, + "learning_rate": 8.88950685947349e-06, + "loss": 2.8931, + "mean_token_accuracy": 0.4735067437379576, + "step": 5990 + }, + { + "epoch": 1.110678531701891, + "grad_norm": 11.2265625, + "learning_rate": 8.88932146829811e-06, + "loss": 2.4846, + "mean_token_accuracy": 0.49226579520697167, + "step": 5991 + }, + { + "epoch": 1.110863922877271, + "grad_norm": 7.609375, + "learning_rate": 8.88913607712273e-06, + "loss": 2.7628, + "mean_token_accuracy": 0.48059490084985834, + "step": 5992 + }, + { + "epoch": 1.1110493140526512, + "grad_norm": 7.23046875, + "learning_rate": 8.88895068594735e-06, + "loss": 2.9749, + "mean_token_accuracy": 0.46493921159277907, + "step": 5993 + }, + { + "epoch": 1.1112347052280311, + "grad_norm": 6.65625, + "learning_rate": 8.88876529477197e-06, + "loss": 2.7282, + "mean_token_accuracy": 0.46918037170744753, + "step": 5994 + }, + { + "epoch": 1.111420096403411, + "grad_norm": 6.58984375, + "learning_rate": 8.888579903596589e-06, + "loss": 2.9356, + "mean_token_accuracy": 0.4409030544488712, + "step": 5995 + }, + { + "epoch": 1.1116054875787913, + "grad_norm": 7.53515625, + "learning_rate": 8.888394512421209e-06, + "loss": 2.8876, + "mean_token_accuracy": 0.45124617402710976, + "step": 5996 + }, + { + "epoch": 1.1117908787541713, + "grad_norm": 7.71875, + "learning_rate": 8.88820912124583e-06, + "loss": 2.5171, + "mean_token_accuracy": 0.4881751358261425, + "step": 5997 + }, + { + "epoch": 1.1119762699295515, + "grad_norm": 8.28125, + "learning_rate": 8.88802373007045e-06, + "loss": 2.7551, + "mean_token_accuracy": 0.4795714285714286, + "step": 5998 + }, + { + "epoch": 1.1121616611049314, + "grad_norm": 6.515625, + "learning_rate": 8.887838338895069e-06, + "loss": 2.648, + "mean_token_accuracy": 0.46601756395570826, + "step": 5999 + }, + { + "epoch": 1.1123470522803114, + "grad_norm": 6.38671875, + "learning_rate": 8.88765294771969e-06, + "loss": 2.5822, + "mean_token_accuracy": 0.4814588924127836, + "step": 6000 + }, + { + "epoch": 1.1125324434556916, + "grad_norm": 6.37890625, + "learning_rate": 8.88746755654431e-06, + "loss": 2.7859, + "mean_token_accuracy": 0.46122402634268195, + "step": 6001 + }, + { + "epoch": 1.1127178346310715, + "grad_norm": 6.60546875, + "learning_rate": 8.887282165368929e-06, + "loss": 3.0582, + "mean_token_accuracy": 0.4555933205260825, + "step": 6002 + }, + { + "epoch": 1.1129032258064515, + "grad_norm": 7.12890625, + "learning_rate": 8.887096774193549e-06, + "loss": 2.8978, + "mean_token_accuracy": 0.4502908514013749, + "step": 6003 + }, + { + "epoch": 1.1130886169818317, + "grad_norm": 7.984375, + "learning_rate": 8.886911383018168e-06, + "loss": 2.6144, + "mean_token_accuracy": 0.4776023890784983, + "step": 6004 + }, + { + "epoch": 1.1132740081572117, + "grad_norm": 6.06640625, + "learning_rate": 8.88672599184279e-06, + "loss": 2.0241, + "mean_token_accuracy": 0.5485391140433553, + "step": 6005 + }, + { + "epoch": 1.1134593993325919, + "grad_norm": 5.50390625, + "learning_rate": 8.886540600667409e-06, + "loss": 3.4702, + "mean_token_accuracy": 0.401330376940133, + "step": 6006 + }, + { + "epoch": 1.1136447905079718, + "grad_norm": 5.53515625, + "learning_rate": 8.88635520949203e-06, + "loss": 3.2897, + "mean_token_accuracy": 0.4184006527947776, + "step": 6007 + }, + { + "epoch": 1.1138301816833518, + "grad_norm": 8.21875, + "learning_rate": 8.886169818316648e-06, + "loss": 3.4552, + "mean_token_accuracy": 0.425670294664189, + "step": 6008 + }, + { + "epoch": 1.114015572858732, + "grad_norm": 4.9765625, + "learning_rate": 8.885984427141269e-06, + "loss": 2.622, + "mean_token_accuracy": 0.4809813189286518, + "step": 6009 + }, + { + "epoch": 1.114200964034112, + "grad_norm": 7.359375, + "learning_rate": 8.885799035965889e-06, + "loss": 2.7759, + "mean_token_accuracy": 0.4727088948787062, + "step": 6010 + }, + { + "epoch": 1.1143863552094921, + "grad_norm": 9.5703125, + "learning_rate": 8.885613644790508e-06, + "loss": 2.8099, + "mean_token_accuracy": 0.4615613382899628, + "step": 6011 + }, + { + "epoch": 1.114571746384872, + "grad_norm": 5.51953125, + "learning_rate": 8.885428253615128e-06, + "loss": 2.4475, + "mean_token_accuracy": 0.488135593220339, + "step": 6012 + }, + { + "epoch": 1.114757137560252, + "grad_norm": 6.453125, + "learning_rate": 8.885242862439749e-06, + "loss": 2.8682, + "mean_token_accuracy": 0.45306403488638974, + "step": 6013 + }, + { + "epoch": 1.1149425287356323, + "grad_norm": 8.34375, + "learning_rate": 8.88505747126437e-06, + "loss": 2.7924, + "mean_token_accuracy": 0.4776268228842079, + "step": 6014 + }, + { + "epoch": 1.1151279199110122, + "grad_norm": 6.35546875, + "learning_rate": 8.884872080088988e-06, + "loss": 2.8932, + "mean_token_accuracy": 0.45159343312409467, + "step": 6015 + }, + { + "epoch": 1.1153133110863922, + "grad_norm": 9.234375, + "learning_rate": 8.884686688913609e-06, + "loss": 3.2073, + "mean_token_accuracy": 0.45677083333333335, + "step": 6016 + }, + { + "epoch": 1.1154987022617724, + "grad_norm": 7.125, + "learning_rate": 8.884501297738228e-06, + "loss": 2.6975, + "mean_token_accuracy": 0.47155049786628733, + "step": 6017 + }, + { + "epoch": 1.1156840934371524, + "grad_norm": 5.96484375, + "learning_rate": 8.884315906562848e-06, + "loss": 3.0363, + "mean_token_accuracy": 0.44541484716157204, + "step": 6018 + }, + { + "epoch": 1.1158694846125325, + "grad_norm": 6.61328125, + "learning_rate": 8.884130515387468e-06, + "loss": 2.7733, + "mean_token_accuracy": 0.45345345345345345, + "step": 6019 + }, + { + "epoch": 1.1160548757879125, + "grad_norm": 8.3515625, + "learning_rate": 8.883945124212087e-06, + "loss": 2.6205, + "mean_token_accuracy": 0.4738770525173022, + "step": 6020 + }, + { + "epoch": 1.1162402669632925, + "grad_norm": 6.0859375, + "learning_rate": 8.88375973303671e-06, + "loss": 3.1421, + "mean_token_accuracy": 0.43269339997728046, + "step": 6021 + }, + { + "epoch": 1.1164256581386727, + "grad_norm": 5.21875, + "learning_rate": 8.883574341861328e-06, + "loss": 2.7327, + "mean_token_accuracy": 0.4588117106773823, + "step": 6022 + }, + { + "epoch": 1.1166110493140526, + "grad_norm": 5.75390625, + "learning_rate": 8.883388950685949e-06, + "loss": 2.8152, + "mean_token_accuracy": 0.4725463591135233, + "step": 6023 + }, + { + "epoch": 1.1167964404894326, + "grad_norm": 6.03125, + "learning_rate": 8.883203559510568e-06, + "loss": 2.8547, + "mean_token_accuracy": 0.4547422540928156, + "step": 6024 + }, + { + "epoch": 1.1169818316648128, + "grad_norm": 5.1640625, + "learning_rate": 8.883018168335188e-06, + "loss": 2.4855, + "mean_token_accuracy": 0.5194069431051109, + "step": 6025 + }, + { + "epoch": 1.1171672228401928, + "grad_norm": 7.43359375, + "learning_rate": 8.882832777159809e-06, + "loss": 2.4647, + "mean_token_accuracy": 0.52, + "step": 6026 + }, + { + "epoch": 1.117352614015573, + "grad_norm": 5.66796875, + "learning_rate": 8.882647385984427e-06, + "loss": 2.9139, + "mean_token_accuracy": 0.45119425652353995, + "step": 6027 + }, + { + "epoch": 1.117538005190953, + "grad_norm": 5.43359375, + "learning_rate": 8.882461994809048e-06, + "loss": 2.7944, + "mean_token_accuracy": 0.4552732335537765, + "step": 6028 + }, + { + "epoch": 1.1177233963663329, + "grad_norm": 6.79296875, + "learning_rate": 8.882276603633668e-06, + "loss": 2.9487, + "mean_token_accuracy": 0.4645888594164456, + "step": 6029 + }, + { + "epoch": 1.117908787541713, + "grad_norm": 8.2578125, + "learning_rate": 8.882091212458289e-06, + "loss": 2.2868, + "mean_token_accuracy": 0.5099697885196375, + "step": 6030 + }, + { + "epoch": 1.118094178717093, + "grad_norm": 7.7265625, + "learning_rate": 8.881905821282908e-06, + "loss": 3.2369, + "mean_token_accuracy": 0.42538190364277323, + "step": 6031 + }, + { + "epoch": 1.118279569892473, + "grad_norm": 7.203125, + "learning_rate": 8.881720430107528e-06, + "loss": 2.7403, + "mean_token_accuracy": 0.4516409098801875, + "step": 6032 + }, + { + "epoch": 1.1184649610678532, + "grad_norm": 9.046875, + "learning_rate": 8.881535038932147e-06, + "loss": 2.4627, + "mean_token_accuracy": 0.49254933548127267, + "step": 6033 + }, + { + "epoch": 1.1186503522432332, + "grad_norm": 6.82421875, + "learning_rate": 8.881349647756767e-06, + "loss": 2.8884, + "mean_token_accuracy": 0.4507178354500276, + "step": 6034 + }, + { + "epoch": 1.1188357434186134, + "grad_norm": 7.21484375, + "learning_rate": 8.881164256581388e-06, + "loss": 2.9667, + "mean_token_accuracy": 0.42517449426239207, + "step": 6035 + }, + { + "epoch": 1.1190211345939933, + "grad_norm": 5.71875, + "learning_rate": 8.880978865406007e-06, + "loss": 2.6325, + "mean_token_accuracy": 0.4747104530127547, + "step": 6036 + }, + { + "epoch": 1.1192065257693733, + "grad_norm": 7.53515625, + "learning_rate": 8.880793474230627e-06, + "loss": 2.6773, + "mean_token_accuracy": 0.45834738617200677, + "step": 6037 + }, + { + "epoch": 1.1193919169447535, + "grad_norm": 6.4609375, + "learning_rate": 8.880608083055248e-06, + "loss": 2.5723, + "mean_token_accuracy": 0.45986009327115257, + "step": 6038 + }, + { + "epoch": 1.1195773081201335, + "grad_norm": 8.3828125, + "learning_rate": 8.880422691879868e-06, + "loss": 2.8636, + "mean_token_accuracy": 0.44751670816406763, + "step": 6039 + }, + { + "epoch": 1.1197626992955136, + "grad_norm": 6.6015625, + "learning_rate": 8.880237300704487e-06, + "loss": 2.5343, + "mean_token_accuracy": 0.4836795252225519, + "step": 6040 + }, + { + "epoch": 1.1199480904708936, + "grad_norm": 8.0390625, + "learning_rate": 8.880051909529107e-06, + "loss": 3.0737, + "mean_token_accuracy": 0.4221553549537501, + "step": 6041 + }, + { + "epoch": 1.1201334816462736, + "grad_norm": 7.1171875, + "learning_rate": 8.879866518353726e-06, + "loss": 2.5655, + "mean_token_accuracy": 0.4824609482049877, + "step": 6042 + }, + { + "epoch": 1.1203188728216538, + "grad_norm": 7.76171875, + "learning_rate": 8.879681127178347e-06, + "loss": 2.1774, + "mean_token_accuracy": 0.5496136012364761, + "step": 6043 + }, + { + "epoch": 1.1205042639970337, + "grad_norm": 6.45703125, + "learning_rate": 8.879495736002967e-06, + "loss": 2.3754, + "mean_token_accuracy": 0.5075975359342916, + "step": 6044 + }, + { + "epoch": 1.1206896551724137, + "grad_norm": 8.2578125, + "learning_rate": 8.879310344827588e-06, + "loss": 2.805, + "mean_token_accuracy": 0.44574613284804365, + "step": 6045 + }, + { + "epoch": 1.120875046347794, + "grad_norm": 6.734375, + "learning_rate": 8.879124953652207e-06, + "loss": 2.7607, + "mean_token_accuracy": 0.48055908513341805, + "step": 6046 + }, + { + "epoch": 1.1210604375231739, + "grad_norm": 6.921875, + "learning_rate": 8.878939562476827e-06, + "loss": 3.5432, + "mean_token_accuracy": 0.41027457927369354, + "step": 6047 + }, + { + "epoch": 1.121245828698554, + "grad_norm": 6.8203125, + "learning_rate": 8.878754171301447e-06, + "loss": 2.5396, + "mean_token_accuracy": 0.473972602739726, + "step": 6048 + }, + { + "epoch": 1.121431219873934, + "grad_norm": 6.2890625, + "learning_rate": 8.878568780126066e-06, + "loss": 2.5447, + "mean_token_accuracy": 0.501987434286447, + "step": 6049 + }, + { + "epoch": 1.121616611049314, + "grad_norm": 6.24609375, + "learning_rate": 8.878383388950687e-06, + "loss": 2.9563, + "mean_token_accuracy": 0.45418759332156916, + "step": 6050 + }, + { + "epoch": 1.1218020022246942, + "grad_norm": 6.20703125, + "learning_rate": 8.878197997775306e-06, + "loss": 3.0918, + "mean_token_accuracy": 0.4252929097717116, + "step": 6051 + }, + { + "epoch": 1.1219873934000741, + "grad_norm": 7.38671875, + "learning_rate": 8.878012606599926e-06, + "loss": 2.6327, + "mean_token_accuracy": 0.48363718718151966, + "step": 6052 + }, + { + "epoch": 1.122172784575454, + "grad_norm": 5.80859375, + "learning_rate": 8.877827215424547e-06, + "loss": 3.0127, + "mean_token_accuracy": 0.4557859107256188, + "step": 6053 + }, + { + "epoch": 1.1223581757508343, + "grad_norm": 5.828125, + "learning_rate": 8.877641824249167e-06, + "loss": 3.1603, + "mean_token_accuracy": 0.4222945484133442, + "step": 6054 + }, + { + "epoch": 1.1225435669262143, + "grad_norm": 5.17578125, + "learning_rate": 8.877456433073786e-06, + "loss": 2.4638, + "mean_token_accuracy": 0.49341813341221713, + "step": 6055 + }, + { + "epoch": 1.1227289581015945, + "grad_norm": 5.3203125, + "learning_rate": 8.877271041898406e-06, + "loss": 2.4586, + "mean_token_accuracy": 0.5209993114979836, + "step": 6056 + }, + { + "epoch": 1.1229143492769744, + "grad_norm": 5.9453125, + "learning_rate": 8.877085650723027e-06, + "loss": 2.4775, + "mean_token_accuracy": 0.48168031136760164, + "step": 6057 + }, + { + "epoch": 1.1230997404523544, + "grad_norm": 5.62109375, + "learning_rate": 8.876900259547646e-06, + "loss": 3.3435, + "mean_token_accuracy": 0.4280414620840153, + "step": 6058 + }, + { + "epoch": 1.1232851316277346, + "grad_norm": 5.0703125, + "learning_rate": 8.876714868372266e-06, + "loss": 3.1785, + "mean_token_accuracy": 0.42826450226784435, + "step": 6059 + }, + { + "epoch": 1.1234705228031145, + "grad_norm": 5.71875, + "learning_rate": 8.876529477196885e-06, + "loss": 2.5868, + "mean_token_accuracy": 0.4804042059265328, + "step": 6060 + }, + { + "epoch": 1.1236559139784945, + "grad_norm": 12.9765625, + "learning_rate": 8.876344086021507e-06, + "loss": 2.7781, + "mean_token_accuracy": 0.4370037056643727, + "step": 6061 + }, + { + "epoch": 1.1238413051538747, + "grad_norm": 6.2265625, + "learning_rate": 8.876158694846126e-06, + "loss": 3.2056, + "mean_token_accuracy": 0.41947608200455583, + "step": 6062 + }, + { + "epoch": 1.1240266963292547, + "grad_norm": 6.0078125, + "learning_rate": 8.875973303670746e-06, + "loss": 2.9121, + "mean_token_accuracy": 0.4424515975769094, + "step": 6063 + }, + { + "epoch": 1.1242120875046349, + "grad_norm": 6.15234375, + "learning_rate": 8.875787912495367e-06, + "loss": 2.8088, + "mean_token_accuracy": 0.4528239202657807, + "step": 6064 + }, + { + "epoch": 1.1243974786800148, + "grad_norm": 5.40234375, + "learning_rate": 8.875602521319986e-06, + "loss": 2.571, + "mean_token_accuracy": 0.4760516451478551, + "step": 6065 + }, + { + "epoch": 1.1245828698553948, + "grad_norm": 5.4296875, + "learning_rate": 8.875417130144606e-06, + "loss": 2.659, + "mean_token_accuracy": 0.46417629036059393, + "step": 6066 + }, + { + "epoch": 1.124768261030775, + "grad_norm": 6.10546875, + "learning_rate": 8.875231738969225e-06, + "loss": 2.6214, + "mean_token_accuracy": 0.47344007182403114, + "step": 6067 + }, + { + "epoch": 1.124953652206155, + "grad_norm": 7.05859375, + "learning_rate": 8.875046347793845e-06, + "loss": 2.6659, + "mean_token_accuracy": 0.4788432267884323, + "step": 6068 + }, + { + "epoch": 1.1251390433815351, + "grad_norm": 6.66796875, + "learning_rate": 8.874860956618466e-06, + "loss": 2.6639, + "mean_token_accuracy": 0.48020850301352014, + "step": 6069 + }, + { + "epoch": 1.125324434556915, + "grad_norm": 6.15625, + "learning_rate": 8.874675565443086e-06, + "loss": 2.5833, + "mean_token_accuracy": 0.4834914611005693, + "step": 6070 + }, + { + "epoch": 1.125509825732295, + "grad_norm": 6.1953125, + "learning_rate": 8.874490174267705e-06, + "loss": 2.6939, + "mean_token_accuracy": 0.47797313314895296, + "step": 6071 + }, + { + "epoch": 1.1256952169076753, + "grad_norm": 7.5234375, + "learning_rate": 8.874304783092326e-06, + "loss": 2.8667, + "mean_token_accuracy": 0.4502591952604295, + "step": 6072 + }, + { + "epoch": 1.1258806080830552, + "grad_norm": 8.21875, + "learning_rate": 8.874119391916946e-06, + "loss": 2.3519, + "mean_token_accuracy": 0.5079067274189225, + "step": 6073 + }, + { + "epoch": 1.1260659992584352, + "grad_norm": 5.41796875, + "learning_rate": 8.873934000741565e-06, + "loss": 2.8346, + "mean_token_accuracy": 0.46578673150399447, + "step": 6074 + }, + { + "epoch": 1.1262513904338154, + "grad_norm": 8.328125, + "learning_rate": 8.873748609566186e-06, + "loss": 2.4558, + "mean_token_accuracy": 0.4904042988741044, + "step": 6075 + }, + { + "epoch": 1.1264367816091954, + "grad_norm": 7.4609375, + "learning_rate": 8.873563218390804e-06, + "loss": 2.8993, + "mean_token_accuracy": 0.4347881087919039, + "step": 6076 + }, + { + "epoch": 1.1266221727845755, + "grad_norm": 9.15625, + "learning_rate": 8.873377827215425e-06, + "loss": 2.6044, + "mean_token_accuracy": 0.473974111814927, + "step": 6077 + }, + { + "epoch": 1.1268075639599555, + "grad_norm": 6.85546875, + "learning_rate": 8.873192436040045e-06, + "loss": 2.5192, + "mean_token_accuracy": 0.4753384343731607, + "step": 6078 + }, + { + "epoch": 1.1269929551353355, + "grad_norm": 6.796875, + "learning_rate": 8.873007044864666e-06, + "loss": 3.0628, + "mean_token_accuracy": 0.4322377307519136, + "step": 6079 + }, + { + "epoch": 1.1271783463107157, + "grad_norm": 8.3203125, + "learning_rate": 8.872821653689285e-06, + "loss": 2.6804, + "mean_token_accuracy": 0.46040575916230364, + "step": 6080 + }, + { + "epoch": 1.1273637374860956, + "grad_norm": 10.7578125, + "learning_rate": 8.872636262513905e-06, + "loss": 2.4161, + "mean_token_accuracy": 0.4973072780427758, + "step": 6081 + }, + { + "epoch": 1.1275491286614758, + "grad_norm": 8.7578125, + "learning_rate": 8.872450871338526e-06, + "loss": 3.3332, + "mean_token_accuracy": 0.40064102564102566, + "step": 6082 + }, + { + "epoch": 1.1277345198368558, + "grad_norm": 10.6484375, + "learning_rate": 8.872265480163144e-06, + "loss": 2.1574, + "mean_token_accuracy": 0.5366781595529664, + "step": 6083 + }, + { + "epoch": 1.1279199110122358, + "grad_norm": 6.94140625, + "learning_rate": 8.872080088987765e-06, + "loss": 3.0475, + "mean_token_accuracy": 0.4360379628391926, + "step": 6084 + }, + { + "epoch": 1.128105302187616, + "grad_norm": 7.49609375, + "learning_rate": 8.871894697812384e-06, + "loss": 2.7365, + "mean_token_accuracy": 0.476449515722436, + "step": 6085 + }, + { + "epoch": 1.128290693362996, + "grad_norm": 6.00390625, + "learning_rate": 8.871709306637006e-06, + "loss": 2.4432, + "mean_token_accuracy": 0.4930013458950202, + "step": 6086 + }, + { + "epoch": 1.128476084538376, + "grad_norm": 6.11328125, + "learning_rate": 8.871523915461625e-06, + "loss": 3.1485, + "mean_token_accuracy": 0.4186150409530901, + "step": 6087 + }, + { + "epoch": 1.128661475713756, + "grad_norm": 5.7890625, + "learning_rate": 8.871338524286245e-06, + "loss": 2.9741, + "mean_token_accuracy": 0.4474813089839441, + "step": 6088 + }, + { + "epoch": 1.128846866889136, + "grad_norm": 9.5859375, + "learning_rate": 8.871153133110864e-06, + "loss": 2.327, + "mean_token_accuracy": 0.5031246014538961, + "step": 6089 + }, + { + "epoch": 1.129032258064516, + "grad_norm": 7.48046875, + "learning_rate": 8.870967741935484e-06, + "loss": 2.3146, + "mean_token_accuracy": 0.4942584310407349, + "step": 6090 + }, + { + "epoch": 1.1292176492398962, + "grad_norm": 7.76171875, + "learning_rate": 8.870782350760105e-06, + "loss": 2.3077, + "mean_token_accuracy": 0.5253214379427972, + "step": 6091 + }, + { + "epoch": 1.1294030404152762, + "grad_norm": 5.453125, + "learning_rate": 8.870596959584724e-06, + "loss": 2.8183, + "mean_token_accuracy": 0.4626738575078059, + "step": 6092 + }, + { + "epoch": 1.1295884315906564, + "grad_norm": 7.24609375, + "learning_rate": 8.870411568409344e-06, + "loss": 2.9489, + "mean_token_accuracy": 0.4522628642281463, + "step": 6093 + }, + { + "epoch": 1.1297738227660363, + "grad_norm": 7.52734375, + "learning_rate": 8.870226177233965e-06, + "loss": 2.6075, + "mean_token_accuracy": 0.46719981455725546, + "step": 6094 + }, + { + "epoch": 1.1299592139414163, + "grad_norm": 5.3984375, + "learning_rate": 8.870040786058585e-06, + "loss": 2.8817, + "mean_token_accuracy": 0.44915349254400716, + "step": 6095 + }, + { + "epoch": 1.1301446051167965, + "grad_norm": 7.52734375, + "learning_rate": 8.869855394883204e-06, + "loss": 2.5496, + "mean_token_accuracy": 0.484251968503937, + "step": 6096 + }, + { + "epoch": 1.1303299962921765, + "grad_norm": 8.078125, + "learning_rate": 8.869670003707824e-06, + "loss": 2.8245, + "mean_token_accuracy": 0.4768660667019587, + "step": 6097 + }, + { + "epoch": 1.1305153874675566, + "grad_norm": 5.87890625, + "learning_rate": 8.869484612532443e-06, + "loss": 2.5123, + "mean_token_accuracy": 0.4964493221433183, + "step": 6098 + }, + { + "epoch": 1.1307007786429366, + "grad_norm": 5.9375, + "learning_rate": 8.869299221357064e-06, + "loss": 3.3111, + "mean_token_accuracy": 0.4147701918786256, + "step": 6099 + }, + { + "epoch": 1.1308861698183166, + "grad_norm": 8.703125, + "learning_rate": 8.869113830181684e-06, + "loss": 2.4049, + "mean_token_accuracy": 0.5108347697611426, + "step": 6100 + }, + { + "epoch": 1.1310715609936968, + "grad_norm": 5.765625, + "learning_rate": 8.868928439006303e-06, + "loss": 2.6086, + "mean_token_accuracy": 0.47177688710754845, + "step": 6101 + }, + { + "epoch": 1.1312569521690767, + "grad_norm": 5.51953125, + "learning_rate": 8.868743047830925e-06, + "loss": 3.0713, + "mean_token_accuracy": 0.4107723822147247, + "step": 6102 + }, + { + "epoch": 1.1314423433444567, + "grad_norm": 6.14453125, + "learning_rate": 8.868557656655544e-06, + "loss": 2.3673, + "mean_token_accuracy": 0.5261427162117724, + "step": 6103 + }, + { + "epoch": 1.131627734519837, + "grad_norm": 7.734375, + "learning_rate": 8.868372265480165e-06, + "loss": 2.4421, + "mean_token_accuracy": 0.47541733363823463, + "step": 6104 + }, + { + "epoch": 1.1318131256952169, + "grad_norm": 5.47265625, + "learning_rate": 8.868186874304783e-06, + "loss": 2.6935, + "mean_token_accuracy": 0.4624617268830373, + "step": 6105 + }, + { + "epoch": 1.131998516870597, + "grad_norm": 7.76171875, + "learning_rate": 8.868001483129404e-06, + "loss": 2.8987, + "mean_token_accuracy": 0.4656620021528525, + "step": 6106 + }, + { + "epoch": 1.132183908045977, + "grad_norm": 7.2578125, + "learning_rate": 8.867816091954024e-06, + "loss": 2.0636, + "mean_token_accuracy": 0.5696166042087056, + "step": 6107 + }, + { + "epoch": 1.132369299221357, + "grad_norm": 6.375, + "learning_rate": 8.867630700778643e-06, + "loss": 2.3469, + "mean_token_accuracy": 0.5098680227630463, + "step": 6108 + }, + { + "epoch": 1.1325546903967372, + "grad_norm": 6.703125, + "learning_rate": 8.867445309603264e-06, + "loss": 3.2382, + "mean_token_accuracy": 0.4299039398579981, + "step": 6109 + }, + { + "epoch": 1.1327400815721171, + "grad_norm": 10.421875, + "learning_rate": 8.867259918427884e-06, + "loss": 2.6108, + "mean_token_accuracy": 0.47201534642292936, + "step": 6110 + }, + { + "epoch": 1.1329254727474973, + "grad_norm": 6.0, + "learning_rate": 8.867074527252505e-06, + "loss": 3.4837, + "mean_token_accuracy": 0.41144321093082836, + "step": 6111 + }, + { + "epoch": 1.1331108639228773, + "grad_norm": 5.9609375, + "learning_rate": 8.866889136077123e-06, + "loss": 2.7884, + "mean_token_accuracy": 0.47805190644024353, + "step": 6112 + }, + { + "epoch": 1.1332962550982573, + "grad_norm": 9.0234375, + "learning_rate": 8.866703744901744e-06, + "loss": 2.7015, + "mean_token_accuracy": 0.47411081538833777, + "step": 6113 + }, + { + "epoch": 1.1334816462736375, + "grad_norm": 5.828125, + "learning_rate": 8.866518353726363e-06, + "loss": 2.3777, + "mean_token_accuracy": 0.49588839941262847, + "step": 6114 + }, + { + "epoch": 1.1336670374490174, + "grad_norm": 6.65625, + "learning_rate": 8.866332962550983e-06, + "loss": 2.912, + "mean_token_accuracy": 0.4566843866902934, + "step": 6115 + }, + { + "epoch": 1.1338524286243974, + "grad_norm": 6.9453125, + "learning_rate": 8.866147571375604e-06, + "loss": 2.8876, + "mean_token_accuracy": 0.44988344988344986, + "step": 6116 + }, + { + "epoch": 1.1340378197997776, + "grad_norm": 9.53125, + "learning_rate": 8.865962180200222e-06, + "loss": 2.9036, + "mean_token_accuracy": 0.44989830508474576, + "step": 6117 + }, + { + "epoch": 1.1342232109751575, + "grad_norm": 5.93359375, + "learning_rate": 8.865776789024843e-06, + "loss": 3.3547, + "mean_token_accuracy": 0.42628960460853627, + "step": 6118 + }, + { + "epoch": 1.1344086021505375, + "grad_norm": 7.00390625, + "learning_rate": 8.865591397849463e-06, + "loss": 3.6309, + "mean_token_accuracy": 0.37717828418230565, + "step": 6119 + }, + { + "epoch": 1.1345939933259177, + "grad_norm": 8.9140625, + "learning_rate": 8.865406006674084e-06, + "loss": 2.8457, + "mean_token_accuracy": 0.4421704591355864, + "step": 6120 + }, + { + "epoch": 1.1347793845012977, + "grad_norm": 7.609375, + "learning_rate": 8.865220615498703e-06, + "loss": 2.8997, + "mean_token_accuracy": 0.441622760800843, + "step": 6121 + }, + { + "epoch": 1.1349647756766779, + "grad_norm": 5.8125, + "learning_rate": 8.865035224323323e-06, + "loss": 2.4551, + "mean_token_accuracy": 0.5312924330329589, + "step": 6122 + }, + { + "epoch": 1.1351501668520578, + "grad_norm": 7.90234375, + "learning_rate": 8.864849833147942e-06, + "loss": 2.9831, + "mean_token_accuracy": 0.4339234252778921, + "step": 6123 + }, + { + "epoch": 1.135335558027438, + "grad_norm": 9.0625, + "learning_rate": 8.864664441972562e-06, + "loss": 2.4746, + "mean_token_accuracy": 0.5060065878705677, + "step": 6124 + }, + { + "epoch": 1.135520949202818, + "grad_norm": 6.33984375, + "learning_rate": 8.864479050797183e-06, + "loss": 3.0591, + "mean_token_accuracy": 0.42217648572163624, + "step": 6125 + }, + { + "epoch": 1.135706340378198, + "grad_norm": 7.55859375, + "learning_rate": 8.864293659621803e-06, + "loss": 2.7672, + "mean_token_accuracy": 0.4686385844447379, + "step": 6126 + }, + { + "epoch": 1.1358917315535781, + "grad_norm": 8.921875, + "learning_rate": 8.864108268446422e-06, + "loss": 2.8735, + "mean_token_accuracy": 0.4532710280373832, + "step": 6127 + }, + { + "epoch": 1.136077122728958, + "grad_norm": 7.5234375, + "learning_rate": 8.863922877271043e-06, + "loss": 2.7409, + "mean_token_accuracy": 0.47589760638297873, + "step": 6128 + }, + { + "epoch": 1.136262513904338, + "grad_norm": 7.9921875, + "learning_rate": 8.863737486095663e-06, + "loss": 2.2411, + "mean_token_accuracy": 0.5232399179767601, + "step": 6129 + }, + { + "epoch": 1.1364479050797183, + "grad_norm": 9.2421875, + "learning_rate": 8.863552094920282e-06, + "loss": 2.3582, + "mean_token_accuracy": 0.512446240062557, + "step": 6130 + }, + { + "epoch": 1.1366332962550982, + "grad_norm": 6.65625, + "learning_rate": 8.863366703744903e-06, + "loss": 2.819, + "mean_token_accuracy": 0.45714647423724525, + "step": 6131 + }, + { + "epoch": 1.1368186874304782, + "grad_norm": 7.11328125, + "learning_rate": 8.863181312569521e-06, + "loss": 2.694, + "mean_token_accuracy": 0.493714436945878, + "step": 6132 + }, + { + "epoch": 1.1370040786058584, + "grad_norm": 6.70703125, + "learning_rate": 8.862995921394142e-06, + "loss": 2.6374, + "mean_token_accuracy": 0.4800796812749004, + "step": 6133 + }, + { + "epoch": 1.1371894697812384, + "grad_norm": 6.12890625, + "learning_rate": 8.862810530218762e-06, + "loss": 2.7784, + "mean_token_accuracy": 0.4684875977222276, + "step": 6134 + }, + { + "epoch": 1.1373748609566185, + "grad_norm": 6.265625, + "learning_rate": 8.862625139043383e-06, + "loss": 2.4664, + "mean_token_accuracy": 0.4958939348966063, + "step": 6135 + }, + { + "epoch": 1.1375602521319985, + "grad_norm": 6.14453125, + "learning_rate": 8.862439747868002e-06, + "loss": 2.564, + "mean_token_accuracy": 0.484965922758252, + "step": 6136 + }, + { + "epoch": 1.1377456433073785, + "grad_norm": 8.046875, + "learning_rate": 8.862254356692622e-06, + "loss": 3.1018, + "mean_token_accuracy": 0.4235631573355997, + "step": 6137 + }, + { + "epoch": 1.1379310344827587, + "grad_norm": 7.98828125, + "learning_rate": 8.862068965517243e-06, + "loss": 2.7619, + "mean_token_accuracy": 0.4607142857142857, + "step": 6138 + }, + { + "epoch": 1.1381164256581386, + "grad_norm": 5.58984375, + "learning_rate": 8.861883574341861e-06, + "loss": 2.6263, + "mean_token_accuracy": 0.4780308258569128, + "step": 6139 + }, + { + "epoch": 1.1383018168335188, + "grad_norm": 7.81640625, + "learning_rate": 8.861698183166482e-06, + "loss": 2.5254, + "mean_token_accuracy": 0.45831752055660974, + "step": 6140 + }, + { + "epoch": 1.1384872080088988, + "grad_norm": 7.66015625, + "learning_rate": 8.8615127919911e-06, + "loss": 2.9118, + "mean_token_accuracy": 0.4401151956632221, + "step": 6141 + }, + { + "epoch": 1.1386725991842788, + "grad_norm": 8.4375, + "learning_rate": 8.861327400815723e-06, + "loss": 2.5593, + "mean_token_accuracy": 0.4883792048929664, + "step": 6142 + }, + { + "epoch": 1.138857990359659, + "grad_norm": 7.8359375, + "learning_rate": 8.861142009640342e-06, + "loss": 3.0609, + "mean_token_accuracy": 0.42585794094173984, + "step": 6143 + }, + { + "epoch": 1.139043381535039, + "grad_norm": 9.703125, + "learning_rate": 8.860956618464962e-06, + "loss": 2.5034, + "mean_token_accuracy": 0.5018406521167499, + "step": 6144 + }, + { + "epoch": 1.139228772710419, + "grad_norm": 7.53515625, + "learning_rate": 8.860771227289583e-06, + "loss": 2.9299, + "mean_token_accuracy": 0.4520629266844761, + "step": 6145 + }, + { + "epoch": 1.139414163885799, + "grad_norm": 6.44921875, + "learning_rate": 8.860585836114201e-06, + "loss": 2.7818, + "mean_token_accuracy": 0.4752210018751674, + "step": 6146 + }, + { + "epoch": 1.139599555061179, + "grad_norm": 7.875, + "learning_rate": 8.860400444938822e-06, + "loss": 3.0154, + "mean_token_accuracy": 0.4488506519693507, + "step": 6147 + }, + { + "epoch": 1.139784946236559, + "grad_norm": 7.3359375, + "learning_rate": 8.86021505376344e-06, + "loss": 3.096, + "mean_token_accuracy": 0.42114485981308414, + "step": 6148 + }, + { + "epoch": 1.1399703374119392, + "grad_norm": 7.24609375, + "learning_rate": 8.860029662588061e-06, + "loss": 3.2497, + "mean_token_accuracy": 0.42070760628554904, + "step": 6149 + }, + { + "epoch": 1.1401557285873192, + "grad_norm": 8.1484375, + "learning_rate": 8.859844271412682e-06, + "loss": 2.6299, + "mean_token_accuracy": 0.48242530755711777, + "step": 6150 + }, + { + "epoch": 1.1403411197626994, + "grad_norm": 6.3984375, + "learning_rate": 8.859658880237302e-06, + "loss": 3.2276, + "mean_token_accuracy": 0.4148978246539222, + "step": 6151 + }, + { + "epoch": 1.1405265109380793, + "grad_norm": 6.25390625, + "learning_rate": 8.859473489061921e-06, + "loss": 2.6551, + "mean_token_accuracy": 0.4745433974462583, + "step": 6152 + }, + { + "epoch": 1.1407119021134595, + "grad_norm": 7.86328125, + "learning_rate": 8.859288097886541e-06, + "loss": 2.6942, + "mean_token_accuracy": 0.4392332268370607, + "step": 6153 + }, + { + "epoch": 1.1408972932888395, + "grad_norm": 9.9296875, + "learning_rate": 8.859102706711162e-06, + "loss": 3.2602, + "mean_token_accuracy": 0.4218085717568654, + "step": 6154 + }, + { + "epoch": 1.1410826844642195, + "grad_norm": 7.20703125, + "learning_rate": 8.85891731553578e-06, + "loss": 3.003, + "mean_token_accuracy": 0.44236327899179095, + "step": 6155 + }, + { + "epoch": 1.1412680756395996, + "grad_norm": 7.47265625, + "learning_rate": 8.858731924360401e-06, + "loss": 2.7516, + "mean_token_accuracy": 0.4636749389224637, + "step": 6156 + }, + { + "epoch": 1.1414534668149796, + "grad_norm": 6.35546875, + "learning_rate": 8.85854653318502e-06, + "loss": 2.7147, + "mean_token_accuracy": 0.47580522057701113, + "step": 6157 + }, + { + "epoch": 1.1416388579903596, + "grad_norm": 6.4296875, + "learning_rate": 8.858361142009642e-06, + "loss": 2.7357, + "mean_token_accuracy": 0.4905345211581292, + "step": 6158 + }, + { + "epoch": 1.1418242491657398, + "grad_norm": 8.21875, + "learning_rate": 8.858175750834261e-06, + "loss": 2.5916, + "mean_token_accuracy": 0.48572150619155924, + "step": 6159 + }, + { + "epoch": 1.1420096403411197, + "grad_norm": 6.5859375, + "learning_rate": 8.857990359658882e-06, + "loss": 2.3482, + "mean_token_accuracy": 0.5140864714086472, + "step": 6160 + }, + { + "epoch": 1.1421950315164997, + "grad_norm": 5.5859375, + "learning_rate": 8.8578049684835e-06, + "loss": 2.8062, + "mean_token_accuracy": 0.46034919365587096, + "step": 6161 + }, + { + "epoch": 1.14238042269188, + "grad_norm": 6.50390625, + "learning_rate": 8.85761957730812e-06, + "loss": 2.6138, + "mean_token_accuracy": 0.4618403837767117, + "step": 6162 + }, + { + "epoch": 1.1425658138672599, + "grad_norm": 5.13671875, + "learning_rate": 8.857434186132741e-06, + "loss": 2.8813, + "mean_token_accuracy": 0.4548903488479689, + "step": 6163 + }, + { + "epoch": 1.14275120504264, + "grad_norm": 6.75, + "learning_rate": 8.85724879495736e-06, + "loss": 2.1379, + "mean_token_accuracy": 0.5262407011107715, + "step": 6164 + }, + { + "epoch": 1.14293659621802, + "grad_norm": 6.8828125, + "learning_rate": 8.85706340378198e-06, + "loss": 2.6396, + "mean_token_accuracy": 0.47533185840707964, + "step": 6165 + }, + { + "epoch": 1.1431219873934, + "grad_norm": 7.125, + "learning_rate": 8.856878012606601e-06, + "loss": 2.3762, + "mean_token_accuracy": 0.5093226022803488, + "step": 6166 + }, + { + "epoch": 1.1433073785687802, + "grad_norm": 6.2265625, + "learning_rate": 8.856692621431222e-06, + "loss": 2.9756, + "mean_token_accuracy": 0.4512119328775637, + "step": 6167 + }, + { + "epoch": 1.1434927697441601, + "grad_norm": 5.55078125, + "learning_rate": 8.85650723025584e-06, + "loss": 2.741, + "mean_token_accuracy": 0.477198341697578, + "step": 6168 + }, + { + "epoch": 1.1436781609195403, + "grad_norm": 6.2421875, + "learning_rate": 8.856321839080461e-06, + "loss": 2.9421, + "mean_token_accuracy": 0.4389093588798821, + "step": 6169 + }, + { + "epoch": 1.1438635520949203, + "grad_norm": 6.46484375, + "learning_rate": 8.85613644790508e-06, + "loss": 2.8186, + "mean_token_accuracy": 0.46213503649635035, + "step": 6170 + }, + { + "epoch": 1.1440489432703003, + "grad_norm": 7.03515625, + "learning_rate": 8.8559510567297e-06, + "loss": 2.5902, + "mean_token_accuracy": 0.47406340057636887, + "step": 6171 + }, + { + "epoch": 1.1442343344456805, + "grad_norm": 6.33203125, + "learning_rate": 8.85576566555432e-06, + "loss": 2.3843, + "mean_token_accuracy": 0.5090029561945714, + "step": 6172 + }, + { + "epoch": 1.1444197256210604, + "grad_norm": 7.03125, + "learning_rate": 8.85558027437894e-06, + "loss": 2.4378, + "mean_token_accuracy": 0.5029572074683984, + "step": 6173 + }, + { + "epoch": 1.1446051167964404, + "grad_norm": 7.3125, + "learning_rate": 8.85539488320356e-06, + "loss": 2.41, + "mean_token_accuracy": 0.5085627779354717, + "step": 6174 + }, + { + "epoch": 1.1447905079718206, + "grad_norm": 6.609375, + "learning_rate": 8.85520949202818e-06, + "loss": 3.1158, + "mean_token_accuracy": 0.43976565360673747, + "step": 6175 + }, + { + "epoch": 1.1449758991472005, + "grad_norm": 6.07421875, + "learning_rate": 8.855024100852801e-06, + "loss": 2.7264, + "mean_token_accuracy": 0.4723886428757489, + "step": 6176 + }, + { + "epoch": 1.1451612903225807, + "grad_norm": 6.13671875, + "learning_rate": 8.85483870967742e-06, + "loss": 2.7601, + "mean_token_accuracy": 0.47481927710843375, + "step": 6177 + }, + { + "epoch": 1.1453466814979607, + "grad_norm": 6.359375, + "learning_rate": 8.85465331850204e-06, + "loss": 3.1006, + "mean_token_accuracy": 0.42986490370796204, + "step": 6178 + }, + { + "epoch": 1.1455320726733407, + "grad_norm": 5.30078125, + "learning_rate": 8.854467927326659e-06, + "loss": 2.6905, + "mean_token_accuracy": 0.48302312464749014, + "step": 6179 + }, + { + "epoch": 1.1457174638487209, + "grad_norm": 5.7578125, + "learning_rate": 8.85428253615128e-06, + "loss": 2.8808, + "mean_token_accuracy": 0.44185007261758463, + "step": 6180 + }, + { + "epoch": 1.1459028550241008, + "grad_norm": 6.31640625, + "learning_rate": 8.8540971449759e-06, + "loss": 2.9105, + "mean_token_accuracy": 0.4383685459270139, + "step": 6181 + }, + { + "epoch": 1.146088246199481, + "grad_norm": 5.88671875, + "learning_rate": 8.85391175380052e-06, + "loss": 3.1046, + "mean_token_accuracy": 0.4342357706246003, + "step": 6182 + }, + { + "epoch": 1.146273637374861, + "grad_norm": 6.09765625, + "learning_rate": 8.853726362625141e-06, + "loss": 2.5872, + "mean_token_accuracy": 0.4873096446700508, + "step": 6183 + }, + { + "epoch": 1.146459028550241, + "grad_norm": 7.90625, + "learning_rate": 8.85354097144976e-06, + "loss": 3.003, + "mean_token_accuracy": 0.44263640092985096, + "step": 6184 + }, + { + "epoch": 1.1466444197256211, + "grad_norm": 7.60546875, + "learning_rate": 8.85335558027438e-06, + "loss": 2.549, + "mean_token_accuracy": 0.47827758554402156, + "step": 6185 + }, + { + "epoch": 1.1468298109010011, + "grad_norm": 6.21484375, + "learning_rate": 8.853170189098999e-06, + "loss": 2.8155, + "mean_token_accuracy": 0.4563193343898574, + "step": 6186 + }, + { + "epoch": 1.147015202076381, + "grad_norm": 6.77734375, + "learning_rate": 8.85298479792362e-06, + "loss": 2.6995, + "mean_token_accuracy": 0.4885710637211084, + "step": 6187 + }, + { + "epoch": 1.1472005932517613, + "grad_norm": 7.890625, + "learning_rate": 8.85279940674824e-06, + "loss": 2.6052, + "mean_token_accuracy": 0.5041039671682627, + "step": 6188 + }, + { + "epoch": 1.1473859844271412, + "grad_norm": 6.5078125, + "learning_rate": 8.852614015572859e-06, + "loss": 2.816, + "mean_token_accuracy": 0.4597826086956522, + "step": 6189 + }, + { + "epoch": 1.1475713756025212, + "grad_norm": 10.2421875, + "learning_rate": 8.85242862439748e-06, + "loss": 2.6783, + "mean_token_accuracy": 0.4799685781618225, + "step": 6190 + }, + { + "epoch": 1.1477567667779014, + "grad_norm": 7.70703125, + "learning_rate": 8.8522432332221e-06, + "loss": 3.0254, + "mean_token_accuracy": 0.4185829345161667, + "step": 6191 + }, + { + "epoch": 1.1479421579532814, + "grad_norm": 6.5625, + "learning_rate": 8.85205784204672e-06, + "loss": 2.7407, + "mean_token_accuracy": 0.4522696929238985, + "step": 6192 + }, + { + "epoch": 1.1481275491286616, + "grad_norm": 7.265625, + "learning_rate": 8.851872450871339e-06, + "loss": 2.8813, + "mean_token_accuracy": 0.44809537656295434, + "step": 6193 + }, + { + "epoch": 1.1483129403040415, + "grad_norm": 6.09765625, + "learning_rate": 8.85168705969596e-06, + "loss": 2.6448, + "mean_token_accuracy": 0.4782293178519594, + "step": 6194 + }, + { + "epoch": 1.1484983314794215, + "grad_norm": 6.83203125, + "learning_rate": 8.851501668520578e-06, + "loss": 2.6886, + "mean_token_accuracy": 0.4938891679223171, + "step": 6195 + }, + { + "epoch": 1.1486837226548017, + "grad_norm": 6.3203125, + "learning_rate": 8.851316277345199e-06, + "loss": 2.8137, + "mean_token_accuracy": 0.46339633963396337, + "step": 6196 + }, + { + "epoch": 1.1488691138301816, + "grad_norm": 5.8984375, + "learning_rate": 8.85113088616982e-06, + "loss": 3.0294, + "mean_token_accuracy": 0.4501965188096575, + "step": 6197 + }, + { + "epoch": 1.1490545050055618, + "grad_norm": 10.1953125, + "learning_rate": 8.850945494994438e-06, + "loss": 2.4977, + "mean_token_accuracy": 0.48570675800289814, + "step": 6198 + }, + { + "epoch": 1.1492398961809418, + "grad_norm": 5.88671875, + "learning_rate": 8.850760103819059e-06, + "loss": 2.6489, + "mean_token_accuracy": 0.4861492673992674, + "step": 6199 + }, + { + "epoch": 1.1494252873563218, + "grad_norm": 6.0703125, + "learning_rate": 8.85057471264368e-06, + "loss": 2.7254, + "mean_token_accuracy": 0.46112024276141106, + "step": 6200 + }, + { + "epoch": 1.149610678531702, + "grad_norm": 5.0234375, + "learning_rate": 8.8503893214683e-06, + "loss": 2.6529, + "mean_token_accuracy": 0.4836140191169777, + "step": 6201 + }, + { + "epoch": 1.149796069707082, + "grad_norm": 5.01171875, + "learning_rate": 8.850203930292918e-06, + "loss": 2.5626, + "mean_token_accuracy": 0.4747630028183449, + "step": 6202 + }, + { + "epoch": 1.149981460882462, + "grad_norm": 6.9921875, + "learning_rate": 8.850018539117539e-06, + "loss": 2.9312, + "mean_token_accuracy": 0.44868968895420036, + "step": 6203 + }, + { + "epoch": 1.150166852057842, + "grad_norm": 5.78515625, + "learning_rate": 8.849833147942158e-06, + "loss": 3.0616, + "mean_token_accuracy": 0.44425675675675674, + "step": 6204 + }, + { + "epoch": 1.150352243233222, + "grad_norm": 6.37890625, + "learning_rate": 8.849647756766778e-06, + "loss": 2.6839, + "mean_token_accuracy": 0.490343616754452, + "step": 6205 + }, + { + "epoch": 1.1505376344086022, + "grad_norm": 6.73828125, + "learning_rate": 8.849462365591399e-06, + "loss": 2.5792, + "mean_token_accuracy": 0.48501362397820164, + "step": 6206 + }, + { + "epoch": 1.1507230255839822, + "grad_norm": 5.6875, + "learning_rate": 8.84927697441602e-06, + "loss": 3.2156, + "mean_token_accuracy": 0.4222932954276238, + "step": 6207 + }, + { + "epoch": 1.1509084167593622, + "grad_norm": 6.5234375, + "learning_rate": 8.849091583240638e-06, + "loss": 2.5741, + "mean_token_accuracy": 0.48509933774834435, + "step": 6208 + }, + { + "epoch": 1.1510938079347424, + "grad_norm": 5.44140625, + "learning_rate": 8.848906192065258e-06, + "loss": 2.7665, + "mean_token_accuracy": 0.4680750619615248, + "step": 6209 + }, + { + "epoch": 1.1512791991101223, + "grad_norm": 6.4609375, + "learning_rate": 8.848720800889879e-06, + "loss": 2.9077, + "mean_token_accuracy": 0.4458077709611452, + "step": 6210 + }, + { + "epoch": 1.1514645902855025, + "grad_norm": 5.75390625, + "learning_rate": 8.848535409714498e-06, + "loss": 2.7499, + "mean_token_accuracy": 0.46450079239302694, + "step": 6211 + }, + { + "epoch": 1.1516499814608825, + "grad_norm": 6.359375, + "learning_rate": 8.848350018539118e-06, + "loss": 2.8467, + "mean_token_accuracy": 0.4379429701664744, + "step": 6212 + }, + { + "epoch": 1.1518353726362625, + "grad_norm": 6.06640625, + "learning_rate": 8.848164627363737e-06, + "loss": 2.9641, + "mean_token_accuracy": 0.4384881422924901, + "step": 6213 + }, + { + "epoch": 1.1520207638116426, + "grad_norm": 7.65234375, + "learning_rate": 8.847979236188358e-06, + "loss": 2.9871, + "mean_token_accuracy": 0.48944531776330563, + "step": 6214 + }, + { + "epoch": 1.1522061549870226, + "grad_norm": 6.0390625, + "learning_rate": 8.847793845012978e-06, + "loss": 2.4876, + "mean_token_accuracy": 0.504351563339422, + "step": 6215 + }, + { + "epoch": 1.1523915461624026, + "grad_norm": 6.94921875, + "learning_rate": 8.847608453837599e-06, + "loss": 3.1185, + "mean_token_accuracy": 0.4230883224659158, + "step": 6216 + }, + { + "epoch": 1.1525769373377828, + "grad_norm": 6.9140625, + "learning_rate": 8.847423062662217e-06, + "loss": 2.5818, + "mean_token_accuracy": 0.47446083995459704, + "step": 6217 + }, + { + "epoch": 1.1527623285131627, + "grad_norm": 5.47265625, + "learning_rate": 8.847237671486838e-06, + "loss": 2.5684, + "mean_token_accuracy": 0.5115008260261786, + "step": 6218 + }, + { + "epoch": 1.1529477196885427, + "grad_norm": 6.69921875, + "learning_rate": 8.847052280311458e-06, + "loss": 2.5594, + "mean_token_accuracy": 0.4939550949913644, + "step": 6219 + }, + { + "epoch": 1.153133110863923, + "grad_norm": 10.765625, + "learning_rate": 8.846866889136077e-06, + "loss": 2.8199, + "mean_token_accuracy": 0.4374415341440599, + "step": 6220 + }, + { + "epoch": 1.1533185020393029, + "grad_norm": 9.625, + "learning_rate": 8.846681497960698e-06, + "loss": 2.5348, + "mean_token_accuracy": 0.48185053380782916, + "step": 6221 + }, + { + "epoch": 1.153503893214683, + "grad_norm": 7.05078125, + "learning_rate": 8.846496106785316e-06, + "loss": 3.0819, + "mean_token_accuracy": 0.4322967678746327, + "step": 6222 + }, + { + "epoch": 1.153689284390063, + "grad_norm": 6.15625, + "learning_rate": 8.846310715609939e-06, + "loss": 3.1256, + "mean_token_accuracy": 0.46271421954608616, + "step": 6223 + }, + { + "epoch": 1.1538746755654432, + "grad_norm": 7.44921875, + "learning_rate": 8.846125324434557e-06, + "loss": 3.386, + "mean_token_accuracy": 0.408676393955185, + "step": 6224 + }, + { + "epoch": 1.1540600667408232, + "grad_norm": 6.1796875, + "learning_rate": 8.845939933259178e-06, + "loss": 2.8037, + "mean_token_accuracy": 0.47654656696125086, + "step": 6225 + }, + { + "epoch": 1.1542454579162031, + "grad_norm": 6.53125, + "learning_rate": 8.845754542083798e-06, + "loss": 2.5673, + "mean_token_accuracy": 0.47266717518433765, + "step": 6226 + }, + { + "epoch": 1.1544308490915833, + "grad_norm": 5.0390625, + "learning_rate": 8.845569150908417e-06, + "loss": 2.7396, + "mean_token_accuracy": 0.47694910475116314, + "step": 6227 + }, + { + "epoch": 1.1546162402669633, + "grad_norm": 6.28515625, + "learning_rate": 8.845383759733038e-06, + "loss": 2.8478, + "mean_token_accuracy": 0.4412562455389008, + "step": 6228 + }, + { + "epoch": 1.1548016314423433, + "grad_norm": 5.9765625, + "learning_rate": 8.845198368557656e-06, + "loss": 2.8053, + "mean_token_accuracy": 0.4449157829070493, + "step": 6229 + }, + { + "epoch": 1.1549870226177235, + "grad_norm": 5.38671875, + "learning_rate": 8.845012977382277e-06, + "loss": 3.0872, + "mean_token_accuracy": 0.43776987553975105, + "step": 6230 + }, + { + "epoch": 1.1551724137931034, + "grad_norm": 5.1796875, + "learning_rate": 8.844827586206897e-06, + "loss": 2.8646, + "mean_token_accuracy": 0.4440495299243293, + "step": 6231 + }, + { + "epoch": 1.1553578049684834, + "grad_norm": 6.55859375, + "learning_rate": 8.844642195031518e-06, + "loss": 3.0487, + "mean_token_accuracy": 0.43379766427465877, + "step": 6232 + }, + { + "epoch": 1.1555431961438636, + "grad_norm": 5.23828125, + "learning_rate": 8.844456803856137e-06, + "loss": 2.9256, + "mean_token_accuracy": 0.45817312688350825, + "step": 6233 + }, + { + "epoch": 1.1557285873192435, + "grad_norm": 7.55078125, + "learning_rate": 8.844271412680757e-06, + "loss": 2.6379, + "mean_token_accuracy": 0.4900171889461854, + "step": 6234 + }, + { + "epoch": 1.1559139784946237, + "grad_norm": 6.58203125, + "learning_rate": 8.844086021505378e-06, + "loss": 2.942, + "mean_token_accuracy": 0.4361308238198087, + "step": 6235 + }, + { + "epoch": 1.1560993696700037, + "grad_norm": 7.13671875, + "learning_rate": 8.843900630329997e-06, + "loss": 2.7927, + "mean_token_accuracy": 0.4555878084179971, + "step": 6236 + }, + { + "epoch": 1.1562847608453837, + "grad_norm": 6.46484375, + "learning_rate": 8.843715239154617e-06, + "loss": 2.2697, + "mean_token_accuracy": 0.5258587167854828, + "step": 6237 + }, + { + "epoch": 1.1564701520207639, + "grad_norm": 7.328125, + "learning_rate": 8.843529847979236e-06, + "loss": 2.3706, + "mean_token_accuracy": 0.5077056922694506, + "step": 6238 + }, + { + "epoch": 1.1566555431961438, + "grad_norm": 5.9453125, + "learning_rate": 8.843344456803858e-06, + "loss": 3.4275, + "mean_token_accuracy": 0.40905416329830235, + "step": 6239 + }, + { + "epoch": 1.156840934371524, + "grad_norm": 5.27734375, + "learning_rate": 8.843159065628477e-06, + "loss": 3.287, + "mean_token_accuracy": 0.42162162162162165, + "step": 6240 + }, + { + "epoch": 1.157026325546904, + "grad_norm": 6.25390625, + "learning_rate": 8.842973674453097e-06, + "loss": 2.8897, + "mean_token_accuracy": 0.4485800604229607, + "step": 6241 + }, + { + "epoch": 1.157211716722284, + "grad_norm": 5.87890625, + "learning_rate": 8.842788283277716e-06, + "loss": 3.1861, + "mean_token_accuracy": 0.407801842766543, + "step": 6242 + }, + { + "epoch": 1.1573971078976641, + "grad_norm": 6.6640625, + "learning_rate": 8.842602892102337e-06, + "loss": 2.6684, + "mean_token_accuracy": 0.4861517976031957, + "step": 6243 + }, + { + "epoch": 1.1575824990730441, + "grad_norm": 5.79296875, + "learning_rate": 8.842417500926957e-06, + "loss": 3.1253, + "mean_token_accuracy": 0.4390753862237194, + "step": 6244 + }, + { + "epoch": 1.157767890248424, + "grad_norm": 7.1875, + "learning_rate": 8.842232109751576e-06, + "loss": 2.8409, + "mean_token_accuracy": 0.458041958041958, + "step": 6245 + }, + { + "epoch": 1.1579532814238043, + "grad_norm": 6.7109375, + "learning_rate": 8.842046718576196e-06, + "loss": 3.1423, + "mean_token_accuracy": 0.4358725761772853, + "step": 6246 + }, + { + "epoch": 1.1581386725991842, + "grad_norm": 8.3984375, + "learning_rate": 8.841861327400817e-06, + "loss": 2.1037, + "mean_token_accuracy": 0.5110396869759642, + "step": 6247 + }, + { + "epoch": 1.1583240637745644, + "grad_norm": 5.796875, + "learning_rate": 8.841675936225437e-06, + "loss": 3.0579, + "mean_token_accuracy": 0.43250063083522583, + "step": 6248 + }, + { + "epoch": 1.1585094549499444, + "grad_norm": 6.5703125, + "learning_rate": 8.841490545050056e-06, + "loss": 3.2946, + "mean_token_accuracy": 0.4102183106640759, + "step": 6249 + }, + { + "epoch": 1.1586948461253244, + "grad_norm": 10.1640625, + "learning_rate": 8.841305153874677e-06, + "loss": 2.6042, + "mean_token_accuracy": 0.48033573141486813, + "step": 6250 + }, + { + "epoch": 1.1588802373007046, + "grad_norm": 5.66015625, + "learning_rate": 8.841119762699295e-06, + "loss": 2.9208, + "mean_token_accuracy": 0.4442389758179232, + "step": 6251 + }, + { + "epoch": 1.1590656284760845, + "grad_norm": 7.98828125, + "learning_rate": 8.840934371523916e-06, + "loss": 2.3404, + "mean_token_accuracy": 0.5026996305768684, + "step": 6252 + }, + { + "epoch": 1.1592510196514647, + "grad_norm": 8.3671875, + "learning_rate": 8.840748980348536e-06, + "loss": 2.5435, + "mean_token_accuracy": 0.48536846943465606, + "step": 6253 + }, + { + "epoch": 1.1594364108268447, + "grad_norm": 6.265625, + "learning_rate": 8.840563589173155e-06, + "loss": 2.8096, + "mean_token_accuracy": 0.45177728063634104, + "step": 6254 + }, + { + "epoch": 1.1596218020022246, + "grad_norm": 7.08984375, + "learning_rate": 8.840378197997776e-06, + "loss": 2.7046, + "mean_token_accuracy": 0.4654343807763401, + "step": 6255 + }, + { + "epoch": 1.1598071931776048, + "grad_norm": 7.02734375, + "learning_rate": 8.840192806822396e-06, + "loss": 2.7457, + "mean_token_accuracy": 0.4650558586645882, + "step": 6256 + }, + { + "epoch": 1.1599925843529848, + "grad_norm": 5.3359375, + "learning_rate": 8.840007415647017e-06, + "loss": 2.965, + "mean_token_accuracy": 0.45515592988845893, + "step": 6257 + }, + { + "epoch": 1.1601779755283648, + "grad_norm": 8.1640625, + "learning_rate": 8.839822024471635e-06, + "loss": 3.1483, + "mean_token_accuracy": 0.41381544841886986, + "step": 6258 + }, + { + "epoch": 1.160363366703745, + "grad_norm": 7.18359375, + "learning_rate": 8.839636633296256e-06, + "loss": 2.761, + "mean_token_accuracy": 0.46389185343294204, + "step": 6259 + }, + { + "epoch": 1.160548757879125, + "grad_norm": 7.296875, + "learning_rate": 8.839451242120875e-06, + "loss": 1.9678, + "mean_token_accuracy": 0.5932790224032587, + "step": 6260 + }, + { + "epoch": 1.160734149054505, + "grad_norm": 8.8359375, + "learning_rate": 8.839265850945495e-06, + "loss": 2.7525, + "mean_token_accuracy": 0.45690468700849324, + "step": 6261 + }, + { + "epoch": 1.160919540229885, + "grad_norm": 6.8203125, + "learning_rate": 8.839080459770116e-06, + "loss": 2.281, + "mean_token_accuracy": 0.5114660114660114, + "step": 6262 + }, + { + "epoch": 1.161104931405265, + "grad_norm": 6.04296875, + "learning_rate": 8.838895068594736e-06, + "loss": 2.8721, + "mean_token_accuracy": 0.47297297297297297, + "step": 6263 + }, + { + "epoch": 1.1612903225806452, + "grad_norm": 7.47265625, + "learning_rate": 8.838709677419357e-06, + "loss": 2.3967, + "mean_token_accuracy": 0.5031376377074327, + "step": 6264 + }, + { + "epoch": 1.1614757137560252, + "grad_norm": 7.96484375, + "learning_rate": 8.838524286243976e-06, + "loss": 2.6904, + "mean_token_accuracy": 0.474112623432079, + "step": 6265 + }, + { + "epoch": 1.1616611049314052, + "grad_norm": 6.62890625, + "learning_rate": 8.838338895068596e-06, + "loss": 2.6536, + "mean_token_accuracy": 0.4803471994759253, + "step": 6266 + }, + { + "epoch": 1.1618464961067854, + "grad_norm": 9.0, + "learning_rate": 8.838153503893215e-06, + "loss": 2.9049, + "mean_token_accuracy": 0.43220022413149045, + "step": 6267 + }, + { + "epoch": 1.1620318872821653, + "grad_norm": 7.6171875, + "learning_rate": 8.837968112717835e-06, + "loss": 2.8885, + "mean_token_accuracy": 0.4385288966725044, + "step": 6268 + }, + { + "epoch": 1.1622172784575455, + "grad_norm": 5.2265625, + "learning_rate": 8.837782721542456e-06, + "loss": 2.5944, + "mean_token_accuracy": 0.48122555410691004, + "step": 6269 + }, + { + "epoch": 1.1624026696329255, + "grad_norm": 7.8359375, + "learning_rate": 8.837597330367075e-06, + "loss": 3.3007, + "mean_token_accuracy": 0.4160255001099143, + "step": 6270 + }, + { + "epoch": 1.1625880608083055, + "grad_norm": 9.2265625, + "learning_rate": 8.837411939191695e-06, + "loss": 2.6935, + "mean_token_accuracy": 0.4792046144505161, + "step": 6271 + }, + { + "epoch": 1.1627734519836856, + "grad_norm": 5.3984375, + "learning_rate": 8.837226548016316e-06, + "loss": 3.0817, + "mean_token_accuracy": 0.44159786304031085, + "step": 6272 + }, + { + "epoch": 1.1629588431590656, + "grad_norm": 6.30078125, + "learning_rate": 8.837041156840936e-06, + "loss": 2.4583, + "mean_token_accuracy": 0.5039006067610518, + "step": 6273 + }, + { + "epoch": 1.1631442343344456, + "grad_norm": 8.8046875, + "learning_rate": 8.836855765665555e-06, + "loss": 3.4902, + "mean_token_accuracy": 0.39008394543546693, + "step": 6274 + }, + { + "epoch": 1.1633296255098258, + "grad_norm": 8.3515625, + "learning_rate": 8.836670374490175e-06, + "loss": 2.4008, + "mean_token_accuracy": 0.49205340114431023, + "step": 6275 + }, + { + "epoch": 1.1635150166852057, + "grad_norm": 7.72265625, + "learning_rate": 8.836484983314794e-06, + "loss": 2.2125, + "mean_token_accuracy": 0.5218600953895072, + "step": 6276 + }, + { + "epoch": 1.163700407860586, + "grad_norm": 6.6796875, + "learning_rate": 8.836299592139415e-06, + "loss": 3.018, + "mean_token_accuracy": 0.43903448275862067, + "step": 6277 + }, + { + "epoch": 1.163885799035966, + "grad_norm": 6.99609375, + "learning_rate": 8.836114200964035e-06, + "loss": 2.7199, + "mean_token_accuracy": 0.4672369270497095, + "step": 6278 + }, + { + "epoch": 1.1640711902113459, + "grad_norm": 4.890625, + "learning_rate": 8.835928809788656e-06, + "loss": 3.0414, + "mean_token_accuracy": 0.4319129226493747, + "step": 6279 + }, + { + "epoch": 1.164256581386726, + "grad_norm": 5.29296875, + "learning_rate": 8.835743418613274e-06, + "loss": 3.7187, + "mean_token_accuracy": 0.3777634130575307, + "step": 6280 + }, + { + "epoch": 1.164441972562106, + "grad_norm": 6.15234375, + "learning_rate": 8.835558027437895e-06, + "loss": 2.7329, + "mean_token_accuracy": 0.46790299572039945, + "step": 6281 + }, + { + "epoch": 1.1646273637374862, + "grad_norm": 6.90234375, + "learning_rate": 8.835372636262515e-06, + "loss": 2.6391, + "mean_token_accuracy": 0.48060109289617486, + "step": 6282 + }, + { + "epoch": 1.1648127549128662, + "grad_norm": 5.32421875, + "learning_rate": 8.835187245087134e-06, + "loss": 2.6662, + "mean_token_accuracy": 0.48919472247497725, + "step": 6283 + }, + { + "epoch": 1.1649981460882461, + "grad_norm": 7.390625, + "learning_rate": 8.835001853911755e-06, + "loss": 2.6174, + "mean_token_accuracy": 0.47289972899729, + "step": 6284 + }, + { + "epoch": 1.1651835372636263, + "grad_norm": 6.99609375, + "learning_rate": 8.834816462736373e-06, + "loss": 2.7107, + "mean_token_accuracy": 0.4818551028429189, + "step": 6285 + }, + { + "epoch": 1.1653689284390063, + "grad_norm": 4.90625, + "learning_rate": 8.834631071560994e-06, + "loss": 2.6862, + "mean_token_accuracy": 0.47253634894991925, + "step": 6286 + }, + { + "epoch": 1.1655543196143863, + "grad_norm": 8.9609375, + "learning_rate": 8.834445680385614e-06, + "loss": 2.4417, + "mean_token_accuracy": 0.4875769318440848, + "step": 6287 + }, + { + "epoch": 1.1657397107897665, + "grad_norm": 5.94140625, + "learning_rate": 8.834260289210235e-06, + "loss": 2.894, + "mean_token_accuracy": 0.4424284717376134, + "step": 6288 + }, + { + "epoch": 1.1659251019651464, + "grad_norm": 6.578125, + "learning_rate": 8.834074898034854e-06, + "loss": 2.9189, + "mean_token_accuracy": 0.44844844844844844, + "step": 6289 + }, + { + "epoch": 1.1661104931405264, + "grad_norm": 7.06640625, + "learning_rate": 8.833889506859474e-06, + "loss": 2.3077, + "mean_token_accuracy": 0.4882315112540193, + "step": 6290 + }, + { + "epoch": 1.1662958843159066, + "grad_norm": 7.6875, + "learning_rate": 8.833704115684095e-06, + "loss": 2.8007, + "mean_token_accuracy": 0.45584158415841586, + "step": 6291 + }, + { + "epoch": 1.1664812754912866, + "grad_norm": 6.96875, + "learning_rate": 8.833518724508714e-06, + "loss": 2.4177, + "mean_token_accuracy": 0.485842242126553, + "step": 6292 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 9.2890625, + "learning_rate": 8.833333333333334e-06, + "loss": 2.8094, + "mean_token_accuracy": 0.4373887240356083, + "step": 6293 + }, + { + "epoch": 1.1668520578420467, + "grad_norm": 9.1484375, + "learning_rate": 8.833147942157953e-06, + "loss": 2.2688, + "mean_token_accuracy": 0.5540246555474981, + "step": 6294 + }, + { + "epoch": 1.1670374490174267, + "grad_norm": 6.2265625, + "learning_rate": 8.832962550982575e-06, + "loss": 3.2563, + "mean_token_accuracy": 0.42158273381294964, + "step": 6295 + }, + { + "epoch": 1.1672228401928069, + "grad_norm": 7.8828125, + "learning_rate": 8.832777159807194e-06, + "loss": 2.6791, + "mean_token_accuracy": 0.4647398843930636, + "step": 6296 + }, + { + "epoch": 1.1674082313681868, + "grad_norm": 5.86328125, + "learning_rate": 8.832591768631814e-06, + "loss": 2.8409, + "mean_token_accuracy": 0.48254149971379506, + "step": 6297 + }, + { + "epoch": 1.167593622543567, + "grad_norm": 8.203125, + "learning_rate": 8.832406377456433e-06, + "loss": 3.0331, + "mean_token_accuracy": 0.4413630011824143, + "step": 6298 + }, + { + "epoch": 1.167779013718947, + "grad_norm": 6.07421875, + "learning_rate": 8.832220986281054e-06, + "loss": 2.9287, + "mean_token_accuracy": 0.43567292755260556, + "step": 6299 + }, + { + "epoch": 1.167964404894327, + "grad_norm": 9.328125, + "learning_rate": 8.832035595105674e-06, + "loss": 3.2963, + "mean_token_accuracy": 0.4292892156862745, + "step": 6300 + }, + { + "epoch": 1.1681497960697071, + "grad_norm": 6.78125, + "learning_rate": 8.831850203930293e-06, + "loss": 3.5141, + "mean_token_accuracy": 0.4124339699309224, + "step": 6301 + }, + { + "epoch": 1.1683351872450871, + "grad_norm": 6.765625, + "learning_rate": 8.831664812754913e-06, + "loss": 3.1951, + "mean_token_accuracy": 0.4376654481280726, + "step": 6302 + }, + { + "epoch": 1.168520578420467, + "grad_norm": 8.8125, + "learning_rate": 8.831479421579534e-06, + "loss": 2.7285, + "mean_token_accuracy": 0.45105623326390953, + "step": 6303 + }, + { + "epoch": 1.1687059695958473, + "grad_norm": 10.703125, + "learning_rate": 8.831294030404154e-06, + "loss": 2.5187, + "mean_token_accuracy": 0.5028266666666666, + "step": 6304 + }, + { + "epoch": 1.1688913607712272, + "grad_norm": 7.14453125, + "learning_rate": 8.831108639228773e-06, + "loss": 3.028, + "mean_token_accuracy": 0.4359763920964845, + "step": 6305 + }, + { + "epoch": 1.1690767519466074, + "grad_norm": 6.25, + "learning_rate": 8.830923248053394e-06, + "loss": 3.4223, + "mean_token_accuracy": 0.405736480430236, + "step": 6306 + }, + { + "epoch": 1.1692621431219874, + "grad_norm": 7.14453125, + "learning_rate": 8.830737856878014e-06, + "loss": 3.4809, + "mean_token_accuracy": 0.3938528491772066, + "step": 6307 + }, + { + "epoch": 1.1694475342973674, + "grad_norm": 6.58984375, + "learning_rate": 8.830552465702633e-06, + "loss": 4.1768, + "mean_token_accuracy": 0.3549208903191204, + "step": 6308 + }, + { + "epoch": 1.1696329254727476, + "grad_norm": 6.6953125, + "learning_rate": 8.830367074527253e-06, + "loss": 2.4775, + "mean_token_accuracy": 0.4905234657039711, + "step": 6309 + }, + { + "epoch": 1.1698183166481275, + "grad_norm": 6.6328125, + "learning_rate": 8.830181683351872e-06, + "loss": 2.8528, + "mean_token_accuracy": 0.45447434292866085, + "step": 6310 + }, + { + "epoch": 1.1700037078235077, + "grad_norm": 9.140625, + "learning_rate": 8.829996292176493e-06, + "loss": 2.582, + "mean_token_accuracy": 0.4651473154706909, + "step": 6311 + }, + { + "epoch": 1.1701890989988877, + "grad_norm": 6.1796875, + "learning_rate": 8.829810901001113e-06, + "loss": 2.67, + "mean_token_accuracy": 0.48894570612122124, + "step": 6312 + }, + { + "epoch": 1.1703744901742676, + "grad_norm": 6.3984375, + "learning_rate": 8.829625509825734e-06, + "loss": 3.2249, + "mean_token_accuracy": 0.42331347299634825, + "step": 6313 + }, + { + "epoch": 1.1705598813496478, + "grad_norm": 6.41796875, + "learning_rate": 8.829440118650352e-06, + "loss": 2.3696, + "mean_token_accuracy": 0.49776071657069737, + "step": 6314 + }, + { + "epoch": 1.1707452725250278, + "grad_norm": 9.984375, + "learning_rate": 8.829254727474973e-06, + "loss": 2.3138, + "mean_token_accuracy": 0.492573402417962, + "step": 6315 + }, + { + "epoch": 1.1709306637004078, + "grad_norm": 6.68359375, + "learning_rate": 8.829069336299593e-06, + "loss": 2.7926, + "mean_token_accuracy": 0.4692871877148751, + "step": 6316 + }, + { + "epoch": 1.171116054875788, + "grad_norm": 5.48046875, + "learning_rate": 8.828883945124212e-06, + "loss": 2.607, + "mean_token_accuracy": 0.49708565636087176, + "step": 6317 + }, + { + "epoch": 1.171301446051168, + "grad_norm": 6.27734375, + "learning_rate": 8.828698553948833e-06, + "loss": 2.3901, + "mean_token_accuracy": 0.508356940509915, + "step": 6318 + }, + { + "epoch": 1.171486837226548, + "grad_norm": 5.9765625, + "learning_rate": 8.828513162773452e-06, + "loss": 2.3003, + "mean_token_accuracy": 0.5313873548968782, + "step": 6319 + }, + { + "epoch": 1.171672228401928, + "grad_norm": 5.484375, + "learning_rate": 8.828327771598074e-06, + "loss": 2.9595, + "mean_token_accuracy": 0.4595854922279793, + "step": 6320 + }, + { + "epoch": 1.171857619577308, + "grad_norm": 5.640625, + "learning_rate": 8.828142380422693e-06, + "loss": 2.9775, + "mean_token_accuracy": 0.43765156349712825, + "step": 6321 + }, + { + "epoch": 1.1720430107526882, + "grad_norm": 5.8046875, + "learning_rate": 8.827956989247313e-06, + "loss": 2.8408, + "mean_token_accuracy": 0.46032745591939545, + "step": 6322 + }, + { + "epoch": 1.1722284019280682, + "grad_norm": 6.73046875, + "learning_rate": 8.827771598071932e-06, + "loss": 2.6551, + "mean_token_accuracy": 0.4846560846560847, + "step": 6323 + }, + { + "epoch": 1.1724137931034484, + "grad_norm": 15.2109375, + "learning_rate": 8.827586206896552e-06, + "loss": 2.3105, + "mean_token_accuracy": 0.49884054017187285, + "step": 6324 + }, + { + "epoch": 1.1725991842788284, + "grad_norm": 10.375, + "learning_rate": 8.827400815721173e-06, + "loss": 2.9711, + "mean_token_accuracy": 0.4347183003899422, + "step": 6325 + }, + { + "epoch": 1.1727845754542083, + "grad_norm": 10.046875, + "learning_rate": 8.827215424545792e-06, + "loss": 2.6262, + "mean_token_accuracy": 0.4604775890690844, + "step": 6326 + }, + { + "epoch": 1.1729699666295885, + "grad_norm": 6.92578125, + "learning_rate": 8.827030033370412e-06, + "loss": 2.6615, + "mean_token_accuracy": 0.4831445523193096, + "step": 6327 + }, + { + "epoch": 1.1731553578049685, + "grad_norm": 5.6640625, + "learning_rate": 8.826844642195033e-06, + "loss": 2.6929, + "mean_token_accuracy": 0.4762435416877723, + "step": 6328 + }, + { + "epoch": 1.1733407489803485, + "grad_norm": 7.15625, + "learning_rate": 8.826659251019653e-06, + "loss": 2.4264, + "mean_token_accuracy": 0.4958384332925337, + "step": 6329 + }, + { + "epoch": 1.1735261401557286, + "grad_norm": 9.6875, + "learning_rate": 8.826473859844272e-06, + "loss": 2.7945, + "mean_token_accuracy": 0.44794903666873837, + "step": 6330 + }, + { + "epoch": 1.1737115313311086, + "grad_norm": 8.7109375, + "learning_rate": 8.826288468668892e-06, + "loss": 2.5257, + "mean_token_accuracy": 0.5103480714957667, + "step": 6331 + }, + { + "epoch": 1.1738969225064886, + "grad_norm": 7.24609375, + "learning_rate": 8.826103077493511e-06, + "loss": 2.8784, + "mean_token_accuracy": 0.4468599033816425, + "step": 6332 + }, + { + "epoch": 1.1740823136818688, + "grad_norm": 12.5390625, + "learning_rate": 8.825917686318132e-06, + "loss": 2.7923, + "mean_token_accuracy": 0.4553501180173092, + "step": 6333 + }, + { + "epoch": 1.1742677048572487, + "grad_norm": 9.5234375, + "learning_rate": 8.825732295142752e-06, + "loss": 2.3155, + "mean_token_accuracy": 0.5107069521853916, + "step": 6334 + }, + { + "epoch": 1.174453096032629, + "grad_norm": 6.90625, + "learning_rate": 8.825546903967371e-06, + "loss": 2.4689, + "mean_token_accuracy": 0.5278425655976676, + "step": 6335 + }, + { + "epoch": 1.174638487208009, + "grad_norm": 6.3828125, + "learning_rate": 8.825361512791991e-06, + "loss": 2.7515, + "mean_token_accuracy": 0.4889873582692558, + "step": 6336 + }, + { + "epoch": 1.1748238783833889, + "grad_norm": 7.4453125, + "learning_rate": 8.825176121616612e-06, + "loss": 2.7022, + "mean_token_accuracy": 0.4718196457326892, + "step": 6337 + }, + { + "epoch": 1.175009269558769, + "grad_norm": 8.875, + "learning_rate": 8.824990730441232e-06, + "loss": 2.1254, + "mean_token_accuracy": 0.5217060167555218, + "step": 6338 + }, + { + "epoch": 1.175194660734149, + "grad_norm": 6.37890625, + "learning_rate": 8.824805339265851e-06, + "loss": 2.4546, + "mean_token_accuracy": 0.4835164835164835, + "step": 6339 + }, + { + "epoch": 1.1753800519095292, + "grad_norm": 8.28125, + "learning_rate": 8.824619948090472e-06, + "loss": 3.0341, + "mean_token_accuracy": 0.43360905528950805, + "step": 6340 + }, + { + "epoch": 1.1755654430849092, + "grad_norm": 6.94140625, + "learning_rate": 8.82443455691509e-06, + "loss": 3.1317, + "mean_token_accuracy": 0.4315977254989408, + "step": 6341 + }, + { + "epoch": 1.1757508342602891, + "grad_norm": 11.15625, + "learning_rate": 8.824249165739711e-06, + "loss": 2.5288, + "mean_token_accuracy": 0.48819757129212715, + "step": 6342 + }, + { + "epoch": 1.1759362254356693, + "grad_norm": 9.421875, + "learning_rate": 8.824063774564331e-06, + "loss": 2.2493, + "mean_token_accuracy": 0.49625508238818744, + "step": 6343 + }, + { + "epoch": 1.1761216166110493, + "grad_norm": 7.3046875, + "learning_rate": 8.823878383388952e-06, + "loss": 2.6195, + "mean_token_accuracy": 0.4844215659712815, + "step": 6344 + }, + { + "epoch": 1.1763070077864293, + "grad_norm": 9.171875, + "learning_rate": 8.823692992213572e-06, + "loss": 2.8151, + "mean_token_accuracy": 0.46156693399136334, + "step": 6345 + }, + { + "epoch": 1.1764923989618095, + "grad_norm": 8.3828125, + "learning_rate": 8.823507601038191e-06, + "loss": 2.7746, + "mean_token_accuracy": 0.4703600436416535, + "step": 6346 + }, + { + "epoch": 1.1766777901371894, + "grad_norm": 5.70703125, + "learning_rate": 8.823322209862812e-06, + "loss": 3.7264, + "mean_token_accuracy": 0.3795459111914808, + "step": 6347 + }, + { + "epoch": 1.1768631813125696, + "grad_norm": 5.75, + "learning_rate": 8.82313681868743e-06, + "loss": 2.7436, + "mean_token_accuracy": 0.4621694417238002, + "step": 6348 + }, + { + "epoch": 1.1770485724879496, + "grad_norm": 7.2109375, + "learning_rate": 8.822951427512051e-06, + "loss": 2.9344, + "mean_token_accuracy": 0.44876997210246006, + "step": 6349 + }, + { + "epoch": 1.1772339636633296, + "grad_norm": 7.4375, + "learning_rate": 8.822766036336672e-06, + "loss": 3.5666, + "mean_token_accuracy": 0.39516767890150517, + "step": 6350 + }, + { + "epoch": 1.1774193548387097, + "grad_norm": 5.4296875, + "learning_rate": 8.82258064516129e-06, + "loss": 2.789, + "mean_token_accuracy": 0.454745650737723, + "step": 6351 + }, + { + "epoch": 1.1776047460140897, + "grad_norm": 9.0703125, + "learning_rate": 8.82239525398591e-06, + "loss": 2.4672, + "mean_token_accuracy": 0.4940248565965583, + "step": 6352 + }, + { + "epoch": 1.17779013718947, + "grad_norm": 6.15234375, + "learning_rate": 8.822209862810531e-06, + "loss": 2.6406, + "mean_token_accuracy": 0.48978741142142557, + "step": 6353 + }, + { + "epoch": 1.1779755283648499, + "grad_norm": 5.9609375, + "learning_rate": 8.822024471635152e-06, + "loss": 3.0807, + "mean_token_accuracy": 0.4498792270531401, + "step": 6354 + }, + { + "epoch": 1.1781609195402298, + "grad_norm": 5.484375, + "learning_rate": 8.82183908045977e-06, + "loss": 3.0307, + "mean_token_accuracy": 0.441527446300716, + "step": 6355 + }, + { + "epoch": 1.17834631071561, + "grad_norm": 7.9609375, + "learning_rate": 8.821653689284391e-06, + "loss": 3.1834, + "mean_token_accuracy": 0.42505207861606736, + "step": 6356 + }, + { + "epoch": 1.17853170189099, + "grad_norm": 7.56640625, + "learning_rate": 8.82146829810901e-06, + "loss": 2.8839, + "mean_token_accuracy": 0.47977856373981237, + "step": 6357 + }, + { + "epoch": 1.17871709306637, + "grad_norm": 6.39453125, + "learning_rate": 8.82128290693363e-06, + "loss": 2.8867, + "mean_token_accuracy": 0.4513958964009418, + "step": 6358 + }, + { + "epoch": 1.1789024842417501, + "grad_norm": 8.9453125, + "learning_rate": 8.821097515758251e-06, + "loss": 2.606, + "mean_token_accuracy": 0.47661171424741994, + "step": 6359 + }, + { + "epoch": 1.1790878754171301, + "grad_norm": 6.2734375, + "learning_rate": 8.820912124582871e-06, + "loss": 2.403, + "mean_token_accuracy": 0.5047009663097415, + "step": 6360 + }, + { + "epoch": 1.17927326659251, + "grad_norm": 5.390625, + "learning_rate": 8.82072673340749e-06, + "loss": 3.5323, + "mean_token_accuracy": 0.4063740228502706, + "step": 6361 + }, + { + "epoch": 1.1794586577678903, + "grad_norm": 6.6484375, + "learning_rate": 8.82054134223211e-06, + "loss": 3.0652, + "mean_token_accuracy": 0.42973651191969886, + "step": 6362 + }, + { + "epoch": 1.1796440489432702, + "grad_norm": 6.89453125, + "learning_rate": 8.820355951056731e-06, + "loss": 2.9167, + "mean_token_accuracy": 0.45741690094261617, + "step": 6363 + }, + { + "epoch": 1.1798294401186504, + "grad_norm": 5.4453125, + "learning_rate": 8.82017055988135e-06, + "loss": 2.8703, + "mean_token_accuracy": 0.4465591397849462, + "step": 6364 + }, + { + "epoch": 1.1800148312940304, + "grad_norm": 6.56640625, + "learning_rate": 8.81998516870597e-06, + "loss": 2.8019, + "mean_token_accuracy": 0.467167842896594, + "step": 6365 + }, + { + "epoch": 1.1802002224694104, + "grad_norm": 5.3046875, + "learning_rate": 8.81979977753059e-06, + "loss": 2.3345, + "mean_token_accuracy": 0.5, + "step": 6366 + }, + { + "epoch": 1.1803856136447906, + "grad_norm": 5.5625, + "learning_rate": 8.81961438635521e-06, + "loss": 3.0838, + "mean_token_accuracy": 0.4562492085602127, + "step": 6367 + }, + { + "epoch": 1.1805710048201705, + "grad_norm": 7.52734375, + "learning_rate": 8.81942899517983e-06, + "loss": 2.5619, + "mean_token_accuracy": 0.47722018223854207, + "step": 6368 + }, + { + "epoch": 1.1807563959955507, + "grad_norm": 7.34375, + "learning_rate": 8.81924360400445e-06, + "loss": 4.0755, + "mean_token_accuracy": 0.3647910205287255, + "step": 6369 + }, + { + "epoch": 1.1809417871709307, + "grad_norm": 6.02734375, + "learning_rate": 8.81905821282907e-06, + "loss": 2.9377, + "mean_token_accuracy": 0.4447466848010881, + "step": 6370 + }, + { + "epoch": 1.1811271783463106, + "grad_norm": 6.54296875, + "learning_rate": 8.81887282165369e-06, + "loss": 3.1299, + "mean_token_accuracy": 0.4263312274368231, + "step": 6371 + }, + { + "epoch": 1.1813125695216908, + "grad_norm": 7.546875, + "learning_rate": 8.81868743047831e-06, + "loss": 2.7367, + "mean_token_accuracy": 0.4625487646293888, + "step": 6372 + }, + { + "epoch": 1.1814979606970708, + "grad_norm": 6.0078125, + "learning_rate": 8.81850203930293e-06, + "loss": 2.9158, + "mean_token_accuracy": 0.43024618991793667, + "step": 6373 + }, + { + "epoch": 1.1816833518724508, + "grad_norm": 5.5703125, + "learning_rate": 8.81831664812755e-06, + "loss": 2.2095, + "mean_token_accuracy": 0.5179052234787291, + "step": 6374 + }, + { + "epoch": 1.181868743047831, + "grad_norm": 6.92578125, + "learning_rate": 8.818131256952169e-06, + "loss": 2.6794, + "mean_token_accuracy": 0.46814799714606053, + "step": 6375 + }, + { + "epoch": 1.182054134223211, + "grad_norm": 6.390625, + "learning_rate": 8.81794586577679e-06, + "loss": 3.2691, + "mean_token_accuracy": 0.44510875167508507, + "step": 6376 + }, + { + "epoch": 1.1822395253985911, + "grad_norm": 6.1171875, + "learning_rate": 8.81776047460141e-06, + "loss": 2.2124, + "mean_token_accuracy": 0.5571020255996023, + "step": 6377 + }, + { + "epoch": 1.182424916573971, + "grad_norm": 5.12890625, + "learning_rate": 8.81757508342603e-06, + "loss": 2.651, + "mean_token_accuracy": 0.4798093804865814, + "step": 6378 + }, + { + "epoch": 1.182610307749351, + "grad_norm": 5.8671875, + "learning_rate": 8.817389692250649e-06, + "loss": 2.5449, + "mean_token_accuracy": 0.49141707114952055, + "step": 6379 + }, + { + "epoch": 1.1827956989247312, + "grad_norm": 5.9375, + "learning_rate": 8.81720430107527e-06, + "loss": 2.6384, + "mean_token_accuracy": 0.4539434787749793, + "step": 6380 + }, + { + "epoch": 1.1829810901001112, + "grad_norm": 6.0234375, + "learning_rate": 8.81701890989989e-06, + "loss": 2.8109, + "mean_token_accuracy": 0.46606668633815285, + "step": 6381 + }, + { + "epoch": 1.1831664812754914, + "grad_norm": 8.2421875, + "learning_rate": 8.816833518724509e-06, + "loss": 3.044, + "mean_token_accuracy": 0.410218387705581, + "step": 6382 + }, + { + "epoch": 1.1833518724508714, + "grad_norm": 5.33203125, + "learning_rate": 8.816648127549129e-06, + "loss": 2.8462, + "mean_token_accuracy": 0.4774321641297154, + "step": 6383 + }, + { + "epoch": 1.1835372636262513, + "grad_norm": 5.71875, + "learning_rate": 8.81646273637375e-06, + "loss": 2.6792, + "mean_token_accuracy": 0.4960244648318043, + "step": 6384 + }, + { + "epoch": 1.1837226548016315, + "grad_norm": 5.6484375, + "learning_rate": 8.81627734519837e-06, + "loss": 2.9034, + "mean_token_accuracy": 0.446986301369863, + "step": 6385 + }, + { + "epoch": 1.1839080459770115, + "grad_norm": 5.50390625, + "learning_rate": 8.816091954022989e-06, + "loss": 2.7844, + "mean_token_accuracy": 0.46271556958950694, + "step": 6386 + }, + { + "epoch": 1.1840934371523915, + "grad_norm": 6.48828125, + "learning_rate": 8.81590656284761e-06, + "loss": 3.5053, + "mean_token_accuracy": 0.41990625861593606, + "step": 6387 + }, + { + "epoch": 1.1842788283277716, + "grad_norm": 5.7265625, + "learning_rate": 8.81572117167223e-06, + "loss": 2.9776, + "mean_token_accuracy": 0.45307725883893496, + "step": 6388 + }, + { + "epoch": 1.1844642195031516, + "grad_norm": 7.515625, + "learning_rate": 8.815535780496849e-06, + "loss": 2.6269, + "mean_token_accuracy": 0.47132815390307065, + "step": 6389 + }, + { + "epoch": 1.1846496106785316, + "grad_norm": 6.3046875, + "learning_rate": 8.81535038932147e-06, + "loss": 2.7769, + "mean_token_accuracy": 0.4734870654336884, + "step": 6390 + }, + { + "epoch": 1.1848350018539118, + "grad_norm": 7.43359375, + "learning_rate": 8.815164998146088e-06, + "loss": 3.12, + "mean_token_accuracy": 0.43461226695487704, + "step": 6391 + }, + { + "epoch": 1.1850203930292917, + "grad_norm": 7.54296875, + "learning_rate": 8.81497960697071e-06, + "loss": 2.5001, + "mean_token_accuracy": 0.4799328295549958, + "step": 6392 + }, + { + "epoch": 1.185205784204672, + "grad_norm": 6.58984375, + "learning_rate": 8.814794215795329e-06, + "loss": 3.0162, + "mean_token_accuracy": 0.4430566747246844, + "step": 6393 + }, + { + "epoch": 1.185391175380052, + "grad_norm": 6.28515625, + "learning_rate": 8.81460882461995e-06, + "loss": 2.461, + "mean_token_accuracy": 0.4929384965831435, + "step": 6394 + }, + { + "epoch": 1.1855765665554319, + "grad_norm": 7.05078125, + "learning_rate": 8.814423433444568e-06, + "loss": 3.0958, + "mean_token_accuracy": 0.4270176739587196, + "step": 6395 + }, + { + "epoch": 1.185761957730812, + "grad_norm": 6.90234375, + "learning_rate": 8.814238042269189e-06, + "loss": 2.8033, + "mean_token_accuracy": 0.4609099350046425, + "step": 6396 + }, + { + "epoch": 1.185947348906192, + "grad_norm": 7.18359375, + "learning_rate": 8.81405265109381e-06, + "loss": 2.7717, + "mean_token_accuracy": 0.46469833119383824, + "step": 6397 + }, + { + "epoch": 1.1861327400815722, + "grad_norm": 6.8046875, + "learning_rate": 8.813867259918428e-06, + "loss": 2.4397, + "mean_token_accuracy": 0.4994773779304166, + "step": 6398 + }, + { + "epoch": 1.1863181312569522, + "grad_norm": 7.79296875, + "learning_rate": 8.813681868743049e-06, + "loss": 2.576, + "mean_token_accuracy": 0.473090390351446, + "step": 6399 + }, + { + "epoch": 1.1865035224323321, + "grad_norm": 8.734375, + "learning_rate": 8.813496477567669e-06, + "loss": 2.7637, + "mean_token_accuracy": 0.47411444141689374, + "step": 6400 + }, + { + "epoch": 1.1866889136077123, + "grad_norm": 7.51953125, + "learning_rate": 8.81331108639229e-06, + "loss": 3.0461, + "mean_token_accuracy": 0.4305811059409155, + "step": 6401 + }, + { + "epoch": 1.1868743047830923, + "grad_norm": 6.80859375, + "learning_rate": 8.813125695216908e-06, + "loss": 2.6728, + "mean_token_accuracy": 0.47608083908928117, + "step": 6402 + }, + { + "epoch": 1.1870596959584723, + "grad_norm": 7.68359375, + "learning_rate": 8.812940304041529e-06, + "loss": 3.5118, + "mean_token_accuracy": 0.40063974410235903, + "step": 6403 + }, + { + "epoch": 1.1872450871338525, + "grad_norm": 7.6640625, + "learning_rate": 8.812754912866148e-06, + "loss": 2.8287, + "mean_token_accuracy": 0.45515232431120284, + "step": 6404 + }, + { + "epoch": 1.1874304783092324, + "grad_norm": 5.87890625, + "learning_rate": 8.812569521690768e-06, + "loss": 3.2195, + "mean_token_accuracy": 0.4374731413837559, + "step": 6405 + }, + { + "epoch": 1.1876158694846126, + "grad_norm": 7.81640625, + "learning_rate": 8.812384130515389e-06, + "loss": 2.9887, + "mean_token_accuracy": 0.4493107104984093, + "step": 6406 + }, + { + "epoch": 1.1878012606599926, + "grad_norm": 8.2421875, + "learning_rate": 8.812198739340007e-06, + "loss": 2.8959, + "mean_token_accuracy": 0.4631480324797002, + "step": 6407 + }, + { + "epoch": 1.1879866518353726, + "grad_norm": 15.234375, + "learning_rate": 8.812013348164628e-06, + "loss": 2.7402, + "mean_token_accuracy": 0.444, + "step": 6408 + }, + { + "epoch": 1.1881720430107527, + "grad_norm": 8.7578125, + "learning_rate": 8.811827956989248e-06, + "loss": 2.6256, + "mean_token_accuracy": 0.46747737556561086, + "step": 6409 + }, + { + "epoch": 1.1883574341861327, + "grad_norm": 6.53515625, + "learning_rate": 8.811642565813869e-06, + "loss": 3.052, + "mean_token_accuracy": 0.43136599230897843, + "step": 6410 + }, + { + "epoch": 1.188542825361513, + "grad_norm": 8.3515625, + "learning_rate": 8.811457174638488e-06, + "loss": 2.3425, + "mean_token_accuracy": 0.5219915987150976, + "step": 6411 + }, + { + "epoch": 1.1887282165368929, + "grad_norm": 6.69140625, + "learning_rate": 8.811271783463108e-06, + "loss": 2.6473, + "mean_token_accuracy": 0.49348154247289694, + "step": 6412 + }, + { + "epoch": 1.1889136077122728, + "grad_norm": 6.38671875, + "learning_rate": 8.811086392287727e-06, + "loss": 2.4697, + "mean_token_accuracy": 0.4959971322738678, + "step": 6413 + }, + { + "epoch": 1.189098998887653, + "grad_norm": 8.2109375, + "learning_rate": 8.810901001112347e-06, + "loss": 3.0116, + "mean_token_accuracy": 0.4202618883528601, + "step": 6414 + }, + { + "epoch": 1.189284390063033, + "grad_norm": 8.1484375, + "learning_rate": 8.810715609936968e-06, + "loss": 2.9244, + "mean_token_accuracy": 0.4568576182552888, + "step": 6415 + }, + { + "epoch": 1.189469781238413, + "grad_norm": 7.15625, + "learning_rate": 8.810530218761588e-06, + "loss": 2.8936, + "mean_token_accuracy": 0.4493371212121212, + "step": 6416 + }, + { + "epoch": 1.1896551724137931, + "grad_norm": 6.203125, + "learning_rate": 8.810344827586207e-06, + "loss": 2.2097, + "mean_token_accuracy": 0.5103169251517194, + "step": 6417 + }, + { + "epoch": 1.1898405635891731, + "grad_norm": 6.28515625, + "learning_rate": 8.810159436410828e-06, + "loss": 2.9877, + "mean_token_accuracy": 0.45045649838882923, + "step": 6418 + }, + { + "epoch": 1.190025954764553, + "grad_norm": 5.828125, + "learning_rate": 8.809974045235448e-06, + "loss": 2.8231, + "mean_token_accuracy": 0.4565071556350626, + "step": 6419 + }, + { + "epoch": 1.1902113459399333, + "grad_norm": 7.30859375, + "learning_rate": 8.809788654060067e-06, + "loss": 2.728, + "mean_token_accuracy": 0.46048020765736536, + "step": 6420 + }, + { + "epoch": 1.1903967371153132, + "grad_norm": 6.52734375, + "learning_rate": 8.809603262884687e-06, + "loss": 2.771, + "mean_token_accuracy": 0.4693314955203308, + "step": 6421 + }, + { + "epoch": 1.1905821282906934, + "grad_norm": 9.3046875, + "learning_rate": 8.809417871709306e-06, + "loss": 2.8314, + "mean_token_accuracy": 0.4578091439091746, + "step": 6422 + }, + { + "epoch": 1.1907675194660734, + "grad_norm": 5.96875, + "learning_rate": 8.809232480533927e-06, + "loss": 3.0668, + "mean_token_accuracy": 0.44035437983528825, + "step": 6423 + }, + { + "epoch": 1.1909529106414536, + "grad_norm": 6.14453125, + "learning_rate": 8.809047089358547e-06, + "loss": 2.6839, + "mean_token_accuracy": 0.4656220451168445, + "step": 6424 + }, + { + "epoch": 1.1911383018168336, + "grad_norm": 7.015625, + "learning_rate": 8.808861698183168e-06, + "loss": 3.0735, + "mean_token_accuracy": 0.46038573608991257, + "step": 6425 + }, + { + "epoch": 1.1913236929922135, + "grad_norm": 6.18359375, + "learning_rate": 8.808676307007788e-06, + "loss": 2.7196, + "mean_token_accuracy": 0.45421483309520716, + "step": 6426 + }, + { + "epoch": 1.1915090841675937, + "grad_norm": 6.640625, + "learning_rate": 8.808490915832407e-06, + "loss": 3.2395, + "mean_token_accuracy": 0.4069446462298416, + "step": 6427 + }, + { + "epoch": 1.1916944753429737, + "grad_norm": 6.578125, + "learning_rate": 8.808305524657028e-06, + "loss": 2.5925, + "mean_token_accuracy": 0.4931056095267941, + "step": 6428 + }, + { + "epoch": 1.1918798665183536, + "grad_norm": 6.17578125, + "learning_rate": 8.808120133481646e-06, + "loss": 2.5755, + "mean_token_accuracy": 0.48601186871745183, + "step": 6429 + }, + { + "epoch": 1.1920652576937338, + "grad_norm": 6.6484375, + "learning_rate": 8.807934742306267e-06, + "loss": 2.7582, + "mean_token_accuracy": 0.4620494391810955, + "step": 6430 + }, + { + "epoch": 1.1922506488691138, + "grad_norm": 7.69140625, + "learning_rate": 8.807749351130887e-06, + "loss": 2.8932, + "mean_token_accuracy": 0.4397296698426767, + "step": 6431 + }, + { + "epoch": 1.1924360400444938, + "grad_norm": 6.3359375, + "learning_rate": 8.807563959955508e-06, + "loss": 2.5842, + "mean_token_accuracy": 0.48234683281412255, + "step": 6432 + }, + { + "epoch": 1.192621431219874, + "grad_norm": 5.84375, + "learning_rate": 8.807378568780127e-06, + "loss": 2.6112, + "mean_token_accuracy": 0.4844789356984479, + "step": 6433 + }, + { + "epoch": 1.192806822395254, + "grad_norm": 7.33203125, + "learning_rate": 8.807193177604747e-06, + "loss": 3.2604, + "mean_token_accuracy": 0.42578042047143766, + "step": 6434 + }, + { + "epoch": 1.1929922135706341, + "grad_norm": 6.6484375, + "learning_rate": 8.807007786429368e-06, + "loss": 2.9666, + "mean_token_accuracy": 0.4721104708056975, + "step": 6435 + }, + { + "epoch": 1.193177604746014, + "grad_norm": 8.4453125, + "learning_rate": 8.806822395253986e-06, + "loss": 2.792, + "mean_token_accuracy": 0.46206896551724136, + "step": 6436 + }, + { + "epoch": 1.193362995921394, + "grad_norm": 6.0703125, + "learning_rate": 8.806637004078607e-06, + "loss": 3.0656, + "mean_token_accuracy": 0.4203590093160645, + "step": 6437 + }, + { + "epoch": 1.1935483870967742, + "grad_norm": 5.36328125, + "learning_rate": 8.806451612903226e-06, + "loss": 2.991, + "mean_token_accuracy": 0.4492753623188406, + "step": 6438 + }, + { + "epoch": 1.1937337782721542, + "grad_norm": 6.82421875, + "learning_rate": 8.806266221727846e-06, + "loss": 2.687, + "mean_token_accuracy": 0.463241322765783, + "step": 6439 + }, + { + "epoch": 1.1939191694475344, + "grad_norm": 7.21484375, + "learning_rate": 8.806080830552467e-06, + "loss": 3.1144, + "mean_token_accuracy": 0.43771863117870724, + "step": 6440 + }, + { + "epoch": 1.1941045606229144, + "grad_norm": 8.2109375, + "learning_rate": 8.805895439377087e-06, + "loss": 3.225, + "mean_token_accuracy": 0.4190995907230559, + "step": 6441 + }, + { + "epoch": 1.1942899517982943, + "grad_norm": 6.39453125, + "learning_rate": 8.805710048201706e-06, + "loss": 3.0193, + "mean_token_accuracy": 0.42626904944633265, + "step": 6442 + }, + { + "epoch": 1.1944753429736745, + "grad_norm": 6.15234375, + "learning_rate": 8.805524657026326e-06, + "loss": 3.2261, + "mean_token_accuracy": 0.4257966616084977, + "step": 6443 + }, + { + "epoch": 1.1946607341490545, + "grad_norm": 5.40625, + "learning_rate": 8.805339265850947e-06, + "loss": 2.6869, + "mean_token_accuracy": 0.45699414443721537, + "step": 6444 + }, + { + "epoch": 1.1948461253244345, + "grad_norm": 7.13671875, + "learning_rate": 8.805153874675566e-06, + "loss": 2.7952, + "mean_token_accuracy": 0.4552152557064432, + "step": 6445 + }, + { + "epoch": 1.1950315164998146, + "grad_norm": 6.67578125, + "learning_rate": 8.804968483500186e-06, + "loss": 2.7549, + "mean_token_accuracy": 0.46132468735525706, + "step": 6446 + }, + { + "epoch": 1.1952169076751946, + "grad_norm": 6.2265625, + "learning_rate": 8.804783092324805e-06, + "loss": 2.8782, + "mean_token_accuracy": 0.4632373497344143, + "step": 6447 + }, + { + "epoch": 1.1954022988505748, + "grad_norm": 5.8125, + "learning_rate": 8.804597701149425e-06, + "loss": 2.7742, + "mean_token_accuracy": 0.4583876858857292, + "step": 6448 + }, + { + "epoch": 1.1955876900259548, + "grad_norm": 6.89453125, + "learning_rate": 8.804412309974046e-06, + "loss": 3.2342, + "mean_token_accuracy": 0.4338526211671612, + "step": 6449 + }, + { + "epoch": 1.1957730812013347, + "grad_norm": 6.65625, + "learning_rate": 8.804226918798666e-06, + "loss": 3.0143, + "mean_token_accuracy": 0.43004587155963303, + "step": 6450 + }, + { + "epoch": 1.195958472376715, + "grad_norm": 6.6328125, + "learning_rate": 8.804041527623285e-06, + "loss": 2.475, + "mean_token_accuracy": 0.49002849002849, + "step": 6451 + }, + { + "epoch": 1.196143863552095, + "grad_norm": 8.1953125, + "learning_rate": 8.803856136447906e-06, + "loss": 2.6906, + "mean_token_accuracy": 0.4831537307325058, + "step": 6452 + }, + { + "epoch": 1.196329254727475, + "grad_norm": 8.984375, + "learning_rate": 8.803670745272526e-06, + "loss": 2.6896, + "mean_token_accuracy": 0.48764769065520946, + "step": 6453 + }, + { + "epoch": 1.196514645902855, + "grad_norm": 8.3125, + "learning_rate": 8.803485354097145e-06, + "loss": 2.6573, + "mean_token_accuracy": 0.45057624113475175, + "step": 6454 + }, + { + "epoch": 1.196700037078235, + "grad_norm": 8.203125, + "learning_rate": 8.803299962921766e-06, + "loss": 3.2449, + "mean_token_accuracy": 0.4351993332407279, + "step": 6455 + }, + { + "epoch": 1.1968854282536152, + "grad_norm": 9.0390625, + "learning_rate": 8.803114571746384e-06, + "loss": 2.7258, + "mean_token_accuracy": 0.4681406417933364, + "step": 6456 + }, + { + "epoch": 1.1970708194289952, + "grad_norm": 7.48046875, + "learning_rate": 8.802929180571007e-06, + "loss": 2.6919, + "mean_token_accuracy": 0.4594152301985599, + "step": 6457 + }, + { + "epoch": 1.1972562106043751, + "grad_norm": 6.9765625, + "learning_rate": 8.802743789395625e-06, + "loss": 2.7389, + "mean_token_accuracy": 0.48030552643402724, + "step": 6458 + }, + { + "epoch": 1.1974416017797553, + "grad_norm": 9.7421875, + "learning_rate": 8.802558398220246e-06, + "loss": 3.2004, + "mean_token_accuracy": 0.42887563884156726, + "step": 6459 + }, + { + "epoch": 1.1976269929551353, + "grad_norm": 13.609375, + "learning_rate": 8.802373007044865e-06, + "loss": 2.6192, + "mean_token_accuracy": 0.45671299247736835, + "step": 6460 + }, + { + "epoch": 1.1978123841305153, + "grad_norm": 8.6640625, + "learning_rate": 8.802187615869485e-06, + "loss": 2.9635, + "mean_token_accuracy": 0.4364450927276516, + "step": 6461 + }, + { + "epoch": 1.1979977753058955, + "grad_norm": 7.80859375, + "learning_rate": 8.802002224694106e-06, + "loss": 2.0386, + "mean_token_accuracy": 0.5340122731427079, + "step": 6462 + }, + { + "epoch": 1.1981831664812754, + "grad_norm": 6.53125, + "learning_rate": 8.801816833518724e-06, + "loss": 2.875, + "mean_token_accuracy": 0.4582284275577935, + "step": 6463 + }, + { + "epoch": 1.1983685576566556, + "grad_norm": 8.8515625, + "learning_rate": 8.801631442343345e-06, + "loss": 2.8027, + "mean_token_accuracy": 0.46124445803393976, + "step": 6464 + }, + { + "epoch": 1.1985539488320356, + "grad_norm": 5.93359375, + "learning_rate": 8.801446051167965e-06, + "loss": 2.4499, + "mean_token_accuracy": 0.4884163003445408, + "step": 6465 + }, + { + "epoch": 1.1987393400074156, + "grad_norm": 10.171875, + "learning_rate": 8.801260659992586e-06, + "loss": 2.502, + "mean_token_accuracy": 0.4791789548189218, + "step": 6466 + }, + { + "epoch": 1.1989247311827957, + "grad_norm": 9.6796875, + "learning_rate": 8.801075268817205e-06, + "loss": 3.0017, + "mean_token_accuracy": 0.45891022778025903, + "step": 6467 + }, + { + "epoch": 1.1991101223581757, + "grad_norm": 5.796875, + "learning_rate": 8.800889877641825e-06, + "loss": 2.7854, + "mean_token_accuracy": 0.4601137716629184, + "step": 6468 + }, + { + "epoch": 1.199295513533556, + "grad_norm": 4.94140625, + "learning_rate": 8.800704486466446e-06, + "loss": 2.6037, + "mean_token_accuracy": 0.48540706605222733, + "step": 6469 + }, + { + "epoch": 1.1994809047089359, + "grad_norm": 5.17578125, + "learning_rate": 8.800519095291064e-06, + "loss": 2.9095, + "mean_token_accuracy": 0.45133772780147347, + "step": 6470 + }, + { + "epoch": 1.1996662958843158, + "grad_norm": 6.0625, + "learning_rate": 8.800333704115685e-06, + "loss": 2.7478, + "mean_token_accuracy": 0.4475543854787308, + "step": 6471 + }, + { + "epoch": 1.199851687059696, + "grad_norm": 6.75, + "learning_rate": 8.800148312940304e-06, + "loss": 3.0276, + "mean_token_accuracy": 0.44599056603773585, + "step": 6472 + }, + { + "epoch": 1.200037078235076, + "grad_norm": 6.55078125, + "learning_rate": 8.799962921764926e-06, + "loss": 2.2506, + "mean_token_accuracy": 0.5217576187101347, + "step": 6473 + }, + { + "epoch": 1.200222469410456, + "grad_norm": 7.3125, + "learning_rate": 8.799777530589545e-06, + "loss": 2.8597, + "mean_token_accuracy": 0.4486624203821656, + "step": 6474 + }, + { + "epoch": 1.2004078605858362, + "grad_norm": 7.53125, + "learning_rate": 8.799592139414165e-06, + "loss": 2.7053, + "mean_token_accuracy": 0.4638033495407888, + "step": 6475 + }, + { + "epoch": 1.2005932517612161, + "grad_norm": 7.62109375, + "learning_rate": 8.799406748238784e-06, + "loss": 2.8359, + "mean_token_accuracy": 0.4814912050406931, + "step": 6476 + }, + { + "epoch": 1.2007786429365963, + "grad_norm": 8.3828125, + "learning_rate": 8.799221357063404e-06, + "loss": 2.6657, + "mean_token_accuracy": 0.48215928841631733, + "step": 6477 + }, + { + "epoch": 1.2009640341119763, + "grad_norm": 5.515625, + "learning_rate": 8.799035965888025e-06, + "loss": 3.0571, + "mean_token_accuracy": 0.4348221388794181, + "step": 6478 + }, + { + "epoch": 1.2011494252873562, + "grad_norm": 7.25, + "learning_rate": 8.798850574712644e-06, + "loss": 3.0542, + "mean_token_accuracy": 0.44322508398656213, + "step": 6479 + }, + { + "epoch": 1.2013348164627364, + "grad_norm": 6.1171875, + "learning_rate": 8.798665183537264e-06, + "loss": 3.3947, + "mean_token_accuracy": 0.4144047619047619, + "step": 6480 + }, + { + "epoch": 1.2015202076381164, + "grad_norm": 6.8125, + "learning_rate": 8.798479792361885e-06, + "loss": 2.9937, + "mean_token_accuracy": 0.4382205513784461, + "step": 6481 + }, + { + "epoch": 1.2017055988134966, + "grad_norm": 6.55859375, + "learning_rate": 8.798294401186505e-06, + "loss": 2.5797, + "mean_token_accuracy": 0.47920665387076133, + "step": 6482 + }, + { + "epoch": 1.2018909899888766, + "grad_norm": 7.2109375, + "learning_rate": 8.798109010011124e-06, + "loss": 3.0179, + "mean_token_accuracy": 0.4571956769055745, + "step": 6483 + }, + { + "epoch": 1.2020763811642565, + "grad_norm": 7.125, + "learning_rate": 8.797923618835745e-06, + "loss": 1.8922, + "mean_token_accuracy": 0.5804347826086956, + "step": 6484 + }, + { + "epoch": 1.2022617723396367, + "grad_norm": 7.86328125, + "learning_rate": 8.797738227660363e-06, + "loss": 2.7083, + "mean_token_accuracy": 0.4742751801575331, + "step": 6485 + }, + { + "epoch": 1.2024471635150167, + "grad_norm": 8.90625, + "learning_rate": 8.797552836484984e-06, + "loss": 2.5284, + "mean_token_accuracy": 0.47640086206896554, + "step": 6486 + }, + { + "epoch": 1.2026325546903966, + "grad_norm": 9.953125, + "learning_rate": 8.797367445309604e-06, + "loss": 2.6027, + "mean_token_accuracy": 0.4840656687590536, + "step": 6487 + }, + { + "epoch": 1.2028179458657768, + "grad_norm": 8.109375, + "learning_rate": 8.797182054134223e-06, + "loss": 2.4776, + "mean_token_accuracy": 0.5026155652823743, + "step": 6488 + }, + { + "epoch": 1.2030033370411568, + "grad_norm": 6.625, + "learning_rate": 8.796996662958844e-06, + "loss": 2.5836, + "mean_token_accuracy": 0.46923076923076923, + "step": 6489 + }, + { + "epoch": 1.2031887282165368, + "grad_norm": 7.3984375, + "learning_rate": 8.796811271783464e-06, + "loss": 2.493, + "mean_token_accuracy": 0.5124506268246608, + "step": 6490 + }, + { + "epoch": 1.203374119391917, + "grad_norm": 6.41796875, + "learning_rate": 8.796625880608085e-06, + "loss": 3.4614, + "mean_token_accuracy": 0.43246509129967775, + "step": 6491 + }, + { + "epoch": 1.203559510567297, + "grad_norm": 6.09765625, + "learning_rate": 8.796440489432703e-06, + "loss": 3.0401, + "mean_token_accuracy": 0.42487266553480474, + "step": 6492 + }, + { + "epoch": 1.2037449017426771, + "grad_norm": 7.421875, + "learning_rate": 8.796255098257324e-06, + "loss": 2.9117, + "mean_token_accuracy": 0.43437945791726107, + "step": 6493 + }, + { + "epoch": 1.203930292918057, + "grad_norm": 6.47265625, + "learning_rate": 8.796069707081943e-06, + "loss": 2.4475, + "mean_token_accuracy": 0.501031177969186, + "step": 6494 + }, + { + "epoch": 1.204115684093437, + "grad_norm": 6.48828125, + "learning_rate": 8.795884315906563e-06, + "loss": 2.3678, + "mean_token_accuracy": 0.515285084601166, + "step": 6495 + }, + { + "epoch": 1.2043010752688172, + "grad_norm": 5.9609375, + "learning_rate": 8.795698924731184e-06, + "loss": 2.7263, + "mean_token_accuracy": 0.46972318339100344, + "step": 6496 + }, + { + "epoch": 1.2044864664441972, + "grad_norm": 4.91015625, + "learning_rate": 8.795513533555804e-06, + "loss": 2.8675, + "mean_token_accuracy": 0.4575106223444139, + "step": 6497 + }, + { + "epoch": 1.2046718576195774, + "grad_norm": 5.87109375, + "learning_rate": 8.795328142380423e-06, + "loss": 2.4637, + "mean_token_accuracy": 0.49696519261736655, + "step": 6498 + }, + { + "epoch": 1.2048572487949574, + "grad_norm": 6.04296875, + "learning_rate": 8.795142751205043e-06, + "loss": 2.7673, + "mean_token_accuracy": 0.4739399829911311, + "step": 6499 + }, + { + "epoch": 1.2050426399703373, + "grad_norm": 5.25, + "learning_rate": 8.794957360029664e-06, + "loss": 2.9244, + "mean_token_accuracy": 0.4391359863800809, + "step": 6500 + }, + { + "epoch": 1.2052280311457175, + "grad_norm": 5.25390625, + "learning_rate": 8.794771968854283e-06, + "loss": 2.013, + "mean_token_accuracy": 0.557653922923384, + "step": 6501 + }, + { + "epoch": 1.2054134223210975, + "grad_norm": 8.1640625, + "learning_rate": 8.794586577678903e-06, + "loss": 2.8808, + "mean_token_accuracy": 0.46299702844205465, + "step": 6502 + }, + { + "epoch": 1.2055988134964775, + "grad_norm": 8.1875, + "learning_rate": 8.794401186503522e-06, + "loss": 2.9953, + "mean_token_accuracy": 0.4427980702963473, + "step": 6503 + }, + { + "epoch": 1.2057842046718577, + "grad_norm": 9.75, + "learning_rate": 8.794215795328142e-06, + "loss": 2.6529, + "mean_token_accuracy": 0.463200200954534, + "step": 6504 + }, + { + "epoch": 1.2059695958472376, + "grad_norm": 7.76953125, + "learning_rate": 8.794030404152763e-06, + "loss": 2.3556, + "mean_token_accuracy": 0.5079012345679013, + "step": 6505 + }, + { + "epoch": 1.2061549870226178, + "grad_norm": 6.609375, + "learning_rate": 8.793845012977383e-06, + "loss": 3.0243, + "mean_token_accuracy": 0.4335307893649263, + "step": 6506 + }, + { + "epoch": 1.2063403781979978, + "grad_norm": 8.75, + "learning_rate": 8.793659621802004e-06, + "loss": 2.5779, + "mean_token_accuracy": 0.4926173028546429, + "step": 6507 + }, + { + "epoch": 1.2065257693733777, + "grad_norm": 18.421875, + "learning_rate": 8.793474230626623e-06, + "loss": 1.7085, + "mean_token_accuracy": 0.5883407126291377, + "step": 6508 + }, + { + "epoch": 1.206711160548758, + "grad_norm": 6.37890625, + "learning_rate": 8.793288839451243e-06, + "loss": 2.8361, + "mean_token_accuracy": 0.44144144144144143, + "step": 6509 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 5.95703125, + "learning_rate": 8.793103448275862e-06, + "loss": 2.3865, + "mean_token_accuracy": 0.48699103336763194, + "step": 6510 + }, + { + "epoch": 1.207081942899518, + "grad_norm": 6.34375, + "learning_rate": 8.792918057100483e-06, + "loss": 2.7577, + "mean_token_accuracy": 0.47492802303262954, + "step": 6511 + }, + { + "epoch": 1.207267334074898, + "grad_norm": 8.515625, + "learning_rate": 8.792732665925103e-06, + "loss": 2.3418, + "mean_token_accuracy": 0.500956937799043, + "step": 6512 + }, + { + "epoch": 1.207452725250278, + "grad_norm": 7.57421875, + "learning_rate": 8.792547274749724e-06, + "loss": 2.484, + "mean_token_accuracy": 0.5129479466387654, + "step": 6513 + }, + { + "epoch": 1.2076381164256582, + "grad_norm": 11.25, + "learning_rate": 8.792361883574342e-06, + "loss": 2.1724, + "mean_token_accuracy": 0.5210348706411698, + "step": 6514 + }, + { + "epoch": 1.2078235076010382, + "grad_norm": 9.5859375, + "learning_rate": 8.792176492398963e-06, + "loss": 2.6263, + "mean_token_accuracy": 0.4723419540229885, + "step": 6515 + }, + { + "epoch": 1.2080088987764181, + "grad_norm": 13.28125, + "learning_rate": 8.791991101223583e-06, + "loss": 2.5701, + "mean_token_accuracy": 0.48403783624002367, + "step": 6516 + }, + { + "epoch": 1.2081942899517983, + "grad_norm": 14.65625, + "learning_rate": 8.791805710048202e-06, + "loss": 2.1428, + "mean_token_accuracy": 0.5228815690218758, + "step": 6517 + }, + { + "epoch": 1.2083796811271783, + "grad_norm": 14.6796875, + "learning_rate": 8.791620318872823e-06, + "loss": 3.1227, + "mean_token_accuracy": 0.4292944328669247, + "step": 6518 + }, + { + "epoch": 1.2085650723025583, + "grad_norm": 15.28125, + "learning_rate": 8.791434927697441e-06, + "loss": 2.6561, + "mean_token_accuracy": 0.46710287168302433, + "step": 6519 + }, + { + "epoch": 1.2087504634779385, + "grad_norm": 13.6015625, + "learning_rate": 8.791249536522062e-06, + "loss": 3.0709, + "mean_token_accuracy": 0.4283391695847924, + "step": 6520 + }, + { + "epoch": 1.2089358546533184, + "grad_norm": 8.703125, + "learning_rate": 8.791064145346682e-06, + "loss": 2.9144, + "mean_token_accuracy": 0.46511627906976744, + "step": 6521 + }, + { + "epoch": 1.2091212458286986, + "grad_norm": 7.67578125, + "learning_rate": 8.790878754171303e-06, + "loss": 3.4172, + "mean_token_accuracy": 0.41921470342522976, + "step": 6522 + }, + { + "epoch": 1.2093066370040786, + "grad_norm": 13.2109375, + "learning_rate": 8.790693362995922e-06, + "loss": 2.5537, + "mean_token_accuracy": 0.4745624270711785, + "step": 6523 + }, + { + "epoch": 1.2094920281794588, + "grad_norm": 14.296875, + "learning_rate": 8.790507971820542e-06, + "loss": 2.7245, + "mean_token_accuracy": 0.4663027503674155, + "step": 6524 + }, + { + "epoch": 1.2096774193548387, + "grad_norm": 6.84375, + "learning_rate": 8.790322580645163e-06, + "loss": 2.4258, + "mean_token_accuracy": 0.4893456058504602, + "step": 6525 + }, + { + "epoch": 1.2098628105302187, + "grad_norm": 5.98828125, + "learning_rate": 8.790137189469781e-06, + "loss": 2.5397, + "mean_token_accuracy": 0.4941347040991169, + "step": 6526 + }, + { + "epoch": 1.210048201705599, + "grad_norm": 7.453125, + "learning_rate": 8.789951798294402e-06, + "loss": 2.632, + "mean_token_accuracy": 0.4711797890605838, + "step": 6527 + }, + { + "epoch": 1.2102335928809789, + "grad_norm": 7.09375, + "learning_rate": 8.78976640711902e-06, + "loss": 2.8446, + "mean_token_accuracy": 0.4395064549297384, + "step": 6528 + }, + { + "epoch": 1.2104189840563588, + "grad_norm": 6.49609375, + "learning_rate": 8.789581015943643e-06, + "loss": 2.6643, + "mean_token_accuracy": 0.47111442415206856, + "step": 6529 + }, + { + "epoch": 1.210604375231739, + "grad_norm": 7.29296875, + "learning_rate": 8.789395624768262e-06, + "loss": 2.5416, + "mean_token_accuracy": 0.4812166381307282, + "step": 6530 + }, + { + "epoch": 1.210789766407119, + "grad_norm": 10.921875, + "learning_rate": 8.789210233592882e-06, + "loss": 2.7768, + "mean_token_accuracy": 0.4612835930789054, + "step": 6531 + }, + { + "epoch": 1.210975157582499, + "grad_norm": 7.7421875, + "learning_rate": 8.789024842417501e-06, + "loss": 2.8109, + "mean_token_accuracy": 0.46142208774583965, + "step": 6532 + }, + { + "epoch": 1.2111605487578792, + "grad_norm": 6.67578125, + "learning_rate": 8.788839451242121e-06, + "loss": 2.4654, + "mean_token_accuracy": 0.5169617515880525, + "step": 6533 + }, + { + "epoch": 1.2113459399332591, + "grad_norm": 8.1484375, + "learning_rate": 8.788654060066742e-06, + "loss": 2.5383, + "mean_token_accuracy": 0.4841102867899457, + "step": 6534 + }, + { + "epoch": 1.2115313311086393, + "grad_norm": 8.484375, + "learning_rate": 8.78846866889136e-06, + "loss": 2.5999, + "mean_token_accuracy": 0.4870810055865922, + "step": 6535 + }, + { + "epoch": 1.2117167222840193, + "grad_norm": 7.14453125, + "learning_rate": 8.788283277715981e-06, + "loss": 3.0674, + "mean_token_accuracy": 0.45516476552598223, + "step": 6536 + }, + { + "epoch": 1.2119021134593992, + "grad_norm": 6.7265625, + "learning_rate": 8.788097886540602e-06, + "loss": 3.1784, + "mean_token_accuracy": 0.4386917540093446, + "step": 6537 + }, + { + "epoch": 1.2120875046347794, + "grad_norm": 10.015625, + "learning_rate": 8.787912495365222e-06, + "loss": 2.5552, + "mean_token_accuracy": 0.4789989755597834, + "step": 6538 + }, + { + "epoch": 1.2122728958101594, + "grad_norm": 10.34375, + "learning_rate": 8.787727104189841e-06, + "loss": 2.2256, + "mean_token_accuracy": 0.5405751969225133, + "step": 6539 + }, + { + "epoch": 1.2124582869855396, + "grad_norm": 5.76953125, + "learning_rate": 8.787541713014462e-06, + "loss": 2.6028, + "mean_token_accuracy": 0.47942360119433985, + "step": 6540 + }, + { + "epoch": 1.2126436781609196, + "grad_norm": 6.6640625, + "learning_rate": 8.78735632183908e-06, + "loss": 2.9211, + "mean_token_accuracy": 0.43910961563349293, + "step": 6541 + }, + { + "epoch": 1.2128290693362995, + "grad_norm": 7.80859375, + "learning_rate": 8.7871709306637e-06, + "loss": 2.3351, + "mean_token_accuracy": 0.506635004888951, + "step": 6542 + }, + { + "epoch": 1.2130144605116797, + "grad_norm": 5.37890625, + "learning_rate": 8.786985539488321e-06, + "loss": 2.3846, + "mean_token_accuracy": 0.49985775248933145, + "step": 6543 + }, + { + "epoch": 1.2131998516870597, + "grad_norm": 5.72265625, + "learning_rate": 8.78680014831294e-06, + "loss": 2.6279, + "mean_token_accuracy": 0.46471115665428464, + "step": 6544 + }, + { + "epoch": 1.2133852428624397, + "grad_norm": 8.703125, + "learning_rate": 8.786614757137562e-06, + "loss": 2.6802, + "mean_token_accuracy": 0.4800498753117207, + "step": 6545 + }, + { + "epoch": 1.2135706340378198, + "grad_norm": 7.1015625, + "learning_rate": 8.786429365962181e-06, + "loss": 2.8154, + "mean_token_accuracy": 0.47454431175361406, + "step": 6546 + }, + { + "epoch": 1.2137560252131998, + "grad_norm": 7.8359375, + "learning_rate": 8.786243974786802e-06, + "loss": 2.34, + "mean_token_accuracy": 0.5004863813229572, + "step": 6547 + }, + { + "epoch": 1.21394141638858, + "grad_norm": 5.8125, + "learning_rate": 8.78605858361142e-06, + "loss": 2.5902, + "mean_token_accuracy": 0.47715617715617714, + "step": 6548 + }, + { + "epoch": 1.21412680756396, + "grad_norm": 6.76953125, + "learning_rate": 8.785873192436041e-06, + "loss": 2.7777, + "mean_token_accuracy": 0.4603174603174603, + "step": 6549 + }, + { + "epoch": 1.21431219873934, + "grad_norm": 6.05859375, + "learning_rate": 8.785687801260661e-06, + "loss": 3.1634, + "mean_token_accuracy": 0.44045009498757853, + "step": 6550 + }, + { + "epoch": 1.2144975899147201, + "grad_norm": 6.78515625, + "learning_rate": 8.78550241008528e-06, + "loss": 3.6676, + "mean_token_accuracy": 0.4106062556313554, + "step": 6551 + }, + { + "epoch": 1.2146829810901, + "grad_norm": 5.5703125, + "learning_rate": 8.7853170189099e-06, + "loss": 2.8238, + "mean_token_accuracy": 0.44954240390482003, + "step": 6552 + }, + { + "epoch": 1.2148683722654803, + "grad_norm": 7.015625, + "learning_rate": 8.785131627734521e-06, + "loss": 2.7361, + "mean_token_accuracy": 0.46141439205955337, + "step": 6553 + }, + { + "epoch": 1.2150537634408602, + "grad_norm": 5.8671875, + "learning_rate": 8.784946236559142e-06, + "loss": 3.0113, + "mean_token_accuracy": 0.44121753700083777, + "step": 6554 + }, + { + "epoch": 1.2152391546162402, + "grad_norm": 6.5078125, + "learning_rate": 8.78476084538376e-06, + "loss": 2.3536, + "mean_token_accuracy": 0.5021189161422884, + "step": 6555 + }, + { + "epoch": 1.2154245457916204, + "grad_norm": 5.73828125, + "learning_rate": 8.784575454208381e-06, + "loss": 3.0559, + "mean_token_accuracy": 0.4519927536231884, + "step": 6556 + }, + { + "epoch": 1.2156099369670004, + "grad_norm": 5.76171875, + "learning_rate": 8.784390063033e-06, + "loss": 3.085, + "mean_token_accuracy": 0.43568665377176014, + "step": 6557 + }, + { + "epoch": 1.2157953281423803, + "grad_norm": 5.515625, + "learning_rate": 8.78420467185762e-06, + "loss": 3.0064, + "mean_token_accuracy": 0.43690426854537917, + "step": 6558 + }, + { + "epoch": 1.2159807193177605, + "grad_norm": 7.015625, + "learning_rate": 8.78401928068224e-06, + "loss": 2.3874, + "mean_token_accuracy": 0.5045945325063175, + "step": 6559 + }, + { + "epoch": 1.2161661104931405, + "grad_norm": 6.43359375, + "learning_rate": 8.78383388950686e-06, + "loss": 3.0243, + "mean_token_accuracy": 0.4625298329355609, + "step": 6560 + }, + { + "epoch": 1.2163515016685205, + "grad_norm": 9.890625, + "learning_rate": 8.78364849833148e-06, + "loss": 2.6999, + "mean_token_accuracy": 0.4770965468639887, + "step": 6561 + }, + { + "epoch": 1.2165368928439007, + "grad_norm": 6.95703125, + "learning_rate": 8.7834631071561e-06, + "loss": 2.8364, + "mean_token_accuracy": 0.46932814021421615, + "step": 6562 + }, + { + "epoch": 1.2167222840192806, + "grad_norm": 8.125, + "learning_rate": 8.783277715980721e-06, + "loss": 2.5424, + "mean_token_accuracy": 0.48943929493997873, + "step": 6563 + }, + { + "epoch": 1.2169076751946608, + "grad_norm": 6.29296875, + "learning_rate": 8.78309232480534e-06, + "loss": 2.7572, + "mean_token_accuracy": 0.4604563579973416, + "step": 6564 + }, + { + "epoch": 1.2170930663700408, + "grad_norm": 5.6328125, + "learning_rate": 8.78290693362996e-06, + "loss": 2.7735, + "mean_token_accuracy": 0.4751958224543081, + "step": 6565 + }, + { + "epoch": 1.2172784575454207, + "grad_norm": 8.4140625, + "learning_rate": 8.782721542454579e-06, + "loss": 2.5586, + "mean_token_accuracy": 0.47953216374269003, + "step": 6566 + }, + { + "epoch": 1.217463848720801, + "grad_norm": 5.46875, + "learning_rate": 8.7825361512792e-06, + "loss": 2.6858, + "mean_token_accuracy": 0.4750920245398773, + "step": 6567 + }, + { + "epoch": 1.217649239896181, + "grad_norm": 8.1875, + "learning_rate": 8.78235076010382e-06, + "loss": 2.9169, + "mean_token_accuracy": 0.454884246188594, + "step": 6568 + }, + { + "epoch": 1.217834631071561, + "grad_norm": 7.859375, + "learning_rate": 8.782165368928439e-06, + "loss": 2.6228, + "mean_token_accuracy": 0.4932895488292404, + "step": 6569 + }, + { + "epoch": 1.218020022246941, + "grad_norm": 6.8515625, + "learning_rate": 8.78197997775306e-06, + "loss": 2.7818, + "mean_token_accuracy": 0.45407725321888415, + "step": 6570 + }, + { + "epoch": 1.218205413422321, + "grad_norm": 8.15625, + "learning_rate": 8.78179458657768e-06, + "loss": 2.3944, + "mean_token_accuracy": 0.4964326484018265, + "step": 6571 + }, + { + "epoch": 1.2183908045977012, + "grad_norm": 7.73828125, + "learning_rate": 8.7816091954023e-06, + "loss": 2.7251, + "mean_token_accuracy": 0.4636610959968908, + "step": 6572 + }, + { + "epoch": 1.2185761957730812, + "grad_norm": 8.4140625, + "learning_rate": 8.781423804226919e-06, + "loss": 3.0118, + "mean_token_accuracy": 0.45545545545545546, + "step": 6573 + }, + { + "epoch": 1.2187615869484612, + "grad_norm": 6.83203125, + "learning_rate": 8.78123841305154e-06, + "loss": 2.6366, + "mean_token_accuracy": 0.49918454397189815, + "step": 6574 + }, + { + "epoch": 1.2189469781238413, + "grad_norm": 8.0234375, + "learning_rate": 8.781053021876158e-06, + "loss": 2.7346, + "mean_token_accuracy": 0.4712581344902386, + "step": 6575 + }, + { + "epoch": 1.2191323692992213, + "grad_norm": 6.453125, + "learning_rate": 8.780867630700779e-06, + "loss": 2.9366, + "mean_token_accuracy": 0.44063981042654027, + "step": 6576 + }, + { + "epoch": 1.2193177604746015, + "grad_norm": 6.76953125, + "learning_rate": 8.7806822395254e-06, + "loss": 2.4907, + "mean_token_accuracy": 0.5099505810070789, + "step": 6577 + }, + { + "epoch": 1.2195031516499815, + "grad_norm": 7.81640625, + "learning_rate": 8.78049684835002e-06, + "loss": 2.5438, + "mean_token_accuracy": 0.4837504246404711, + "step": 6578 + }, + { + "epoch": 1.2196885428253614, + "grad_norm": 7.59765625, + "learning_rate": 8.780311457174639e-06, + "loss": 2.8274, + "mean_token_accuracy": 0.4489819662594532, + "step": 6579 + }, + { + "epoch": 1.2198739340007416, + "grad_norm": 6.8125, + "learning_rate": 8.78012606599926e-06, + "loss": 2.1961, + "mean_token_accuracy": 0.5433338254835376, + "step": 6580 + }, + { + "epoch": 1.2200593251761216, + "grad_norm": 6.00390625, + "learning_rate": 8.77994067482388e-06, + "loss": 3.028, + "mean_token_accuracy": 0.430343386865126, + "step": 6581 + }, + { + "epoch": 1.2202447163515018, + "grad_norm": 7.86328125, + "learning_rate": 8.779755283648498e-06, + "loss": 3.389, + "mean_token_accuracy": 0.3994683421942968, + "step": 6582 + }, + { + "epoch": 1.2204301075268817, + "grad_norm": 6.68359375, + "learning_rate": 8.779569892473119e-06, + "loss": 2.8746, + "mean_token_accuracy": 0.45725699067909453, + "step": 6583 + }, + { + "epoch": 1.2206154987022617, + "grad_norm": 4.875, + "learning_rate": 8.779384501297738e-06, + "loss": 2.5537, + "mean_token_accuracy": 0.4982276119402985, + "step": 6584 + }, + { + "epoch": 1.220800889877642, + "grad_norm": 6.70703125, + "learning_rate": 8.779199110122358e-06, + "loss": 3.3035, + "mean_token_accuracy": 0.42238845845578415, + "step": 6585 + }, + { + "epoch": 1.2209862810530219, + "grad_norm": 7.73828125, + "learning_rate": 8.779013718946979e-06, + "loss": 2.677, + "mean_token_accuracy": 0.4865290914302092, + "step": 6586 + }, + { + "epoch": 1.2211716722284018, + "grad_norm": 6.203125, + "learning_rate": 8.7788283277716e-06, + "loss": 2.5919, + "mean_token_accuracy": 0.48125437981779956, + "step": 6587 + }, + { + "epoch": 1.221357063403782, + "grad_norm": 6.375, + "learning_rate": 8.77864293659622e-06, + "loss": 3.0411, + "mean_token_accuracy": 0.45101637492941843, + "step": 6588 + }, + { + "epoch": 1.221542454579162, + "grad_norm": 8.765625, + "learning_rate": 8.778457545420839e-06, + "loss": 2.9616, + "mean_token_accuracy": 0.43680734355759543, + "step": 6589 + }, + { + "epoch": 1.221727845754542, + "grad_norm": 6.74609375, + "learning_rate": 8.778272154245459e-06, + "loss": 2.7919, + "mean_token_accuracy": 0.47368421052631576, + "step": 6590 + }, + { + "epoch": 1.2219132369299222, + "grad_norm": 12.1875, + "learning_rate": 8.778086763070078e-06, + "loss": 3.2453, + "mean_token_accuracy": 0.4563861094761624, + "step": 6591 + }, + { + "epoch": 1.2220986281053021, + "grad_norm": 7.1640625, + "learning_rate": 8.777901371894698e-06, + "loss": 2.4058, + "mean_token_accuracy": 0.5050644567219152, + "step": 6592 + }, + { + "epoch": 1.2222840192806823, + "grad_norm": 6.40234375, + "learning_rate": 8.777715980719319e-06, + "loss": 2.4092, + "mean_token_accuracy": 0.5071184510250569, + "step": 6593 + }, + { + "epoch": 1.2224694104560623, + "grad_norm": 6.1875, + "learning_rate": 8.77753058954394e-06, + "loss": 2.7646, + "mean_token_accuracy": 0.45621805792163544, + "step": 6594 + }, + { + "epoch": 1.2226548016314425, + "grad_norm": 6.421875, + "learning_rate": 8.777345198368558e-06, + "loss": 2.8224, + "mean_token_accuracy": 0.44771513353115727, + "step": 6595 + }, + { + "epoch": 1.2228401928068224, + "grad_norm": 5.5859375, + "learning_rate": 8.777159807193179e-06, + "loss": 2.6155, + "mean_token_accuracy": 0.4716468151216986, + "step": 6596 + }, + { + "epoch": 1.2230255839822024, + "grad_norm": 7.3359375, + "learning_rate": 8.776974416017799e-06, + "loss": 3.1917, + "mean_token_accuracy": 0.43752733634640617, + "step": 6597 + }, + { + "epoch": 1.2232109751575826, + "grad_norm": 6.3125, + "learning_rate": 8.776789024842418e-06, + "loss": 2.9273, + "mean_token_accuracy": 0.4437559580552908, + "step": 6598 + }, + { + "epoch": 1.2233963663329626, + "grad_norm": 6.2734375, + "learning_rate": 8.776603633667038e-06, + "loss": 3.8564, + "mean_token_accuracy": 0.3781504731565633, + "step": 6599 + }, + { + "epoch": 1.2235817575083425, + "grad_norm": 5.63671875, + "learning_rate": 8.776418242491657e-06, + "loss": 2.6114, + "mean_token_accuracy": 0.47034327518289254, + "step": 6600 + }, + { + "epoch": 1.2237671486837227, + "grad_norm": 5.4296875, + "learning_rate": 8.776232851316278e-06, + "loss": 2.6162, + "mean_token_accuracy": 0.4822253000923361, + "step": 6601 + }, + { + "epoch": 1.2239525398591027, + "grad_norm": 5.1328125, + "learning_rate": 8.776047460140898e-06, + "loss": 2.5476, + "mean_token_accuracy": 0.47247150133398014, + "step": 6602 + }, + { + "epoch": 1.2241379310344827, + "grad_norm": 6.37890625, + "learning_rate": 8.775862068965519e-06, + "loss": 3.1398, + "mean_token_accuracy": 0.42875264270613106, + "step": 6603 + }, + { + "epoch": 1.2243233222098628, + "grad_norm": 6.4765625, + "learning_rate": 8.775676677790137e-06, + "loss": 2.6803, + "mean_token_accuracy": 0.4771668219944082, + "step": 6604 + }, + { + "epoch": 1.2245087133852428, + "grad_norm": 6.27734375, + "learning_rate": 8.775491286614758e-06, + "loss": 2.7994, + "mean_token_accuracy": 0.45821489482660605, + "step": 6605 + }, + { + "epoch": 1.224694104560623, + "grad_norm": 7.98828125, + "learning_rate": 8.775305895439378e-06, + "loss": 2.5261, + "mean_token_accuracy": 0.4983674680914218, + "step": 6606 + }, + { + "epoch": 1.224879495736003, + "grad_norm": 6.69921875, + "learning_rate": 8.775120504263997e-06, + "loss": 2.6688, + "mean_token_accuracy": 0.4730549006399461, + "step": 6607 + }, + { + "epoch": 1.225064886911383, + "grad_norm": 6.390625, + "learning_rate": 8.774935113088618e-06, + "loss": 2.8597, + "mean_token_accuracy": 0.4580811138014528, + "step": 6608 + }, + { + "epoch": 1.2252502780867631, + "grad_norm": 5.52734375, + "learning_rate": 8.774749721913236e-06, + "loss": 2.6141, + "mean_token_accuracy": 0.4949182501104728, + "step": 6609 + }, + { + "epoch": 1.225435669262143, + "grad_norm": 6.4375, + "learning_rate": 8.774564330737859e-06, + "loss": 2.7782, + "mean_token_accuracy": 0.4840261739799846, + "step": 6610 + }, + { + "epoch": 1.2256210604375233, + "grad_norm": 7.3125, + "learning_rate": 8.774378939562477e-06, + "loss": 2.7077, + "mean_token_accuracy": 0.4884555032503923, + "step": 6611 + }, + { + "epoch": 1.2258064516129032, + "grad_norm": 5.9140625, + "learning_rate": 8.774193548387098e-06, + "loss": 3.0008, + "mean_token_accuracy": 0.4347759829320329, + "step": 6612 + }, + { + "epoch": 1.2259918427882832, + "grad_norm": 7.46484375, + "learning_rate": 8.774008157211717e-06, + "loss": 2.7343, + "mean_token_accuracy": 0.44486732212707475, + "step": 6613 + }, + { + "epoch": 1.2261772339636634, + "grad_norm": 7.29296875, + "learning_rate": 8.773822766036337e-06, + "loss": 3.0451, + "mean_token_accuracy": 0.4677891654465593, + "step": 6614 + }, + { + "epoch": 1.2263626251390434, + "grad_norm": 8.2421875, + "learning_rate": 8.773637374860958e-06, + "loss": 2.457, + "mean_token_accuracy": 0.5040123891313529, + "step": 6615 + }, + { + "epoch": 1.2265480163144233, + "grad_norm": 7.54296875, + "learning_rate": 8.773451983685577e-06, + "loss": 2.923, + "mean_token_accuracy": 0.4517538384721009, + "step": 6616 + }, + { + "epoch": 1.2267334074898035, + "grad_norm": 7.234375, + "learning_rate": 8.773266592510197e-06, + "loss": 3.0446, + "mean_token_accuracy": 0.431665868836764, + "step": 6617 + }, + { + "epoch": 1.2269187986651835, + "grad_norm": 7.5234375, + "learning_rate": 8.773081201334818e-06, + "loss": 2.6618, + "mean_token_accuracy": 0.48417579121043947, + "step": 6618 + }, + { + "epoch": 1.2271041898405637, + "grad_norm": 6.765625, + "learning_rate": 8.772895810159438e-06, + "loss": 3.3239, + "mean_token_accuracy": 0.42322560692747796, + "step": 6619 + }, + { + "epoch": 1.2272895810159437, + "grad_norm": 7.41796875, + "learning_rate": 8.772710418984057e-06, + "loss": 3.0598, + "mean_token_accuracy": 0.4347728295096716, + "step": 6620 + }, + { + "epoch": 1.2274749721913236, + "grad_norm": 8.390625, + "learning_rate": 8.772525027808677e-06, + "loss": 2.2123, + "mean_token_accuracy": 0.5282861896838602, + "step": 6621 + }, + { + "epoch": 1.2276603633667038, + "grad_norm": 7.24609375, + "learning_rate": 8.772339636633296e-06, + "loss": 2.5786, + "mean_token_accuracy": 0.48101615352754384, + "step": 6622 + }, + { + "epoch": 1.2278457545420838, + "grad_norm": 7.859375, + "learning_rate": 8.772154245457917e-06, + "loss": 2.8736, + "mean_token_accuracy": 0.4778467311300332, + "step": 6623 + }, + { + "epoch": 1.228031145717464, + "grad_norm": 9.25, + "learning_rate": 8.771968854282537e-06, + "loss": 2.8536, + "mean_token_accuracy": 0.45817857673451196, + "step": 6624 + }, + { + "epoch": 1.228216536892844, + "grad_norm": 5.8046875, + "learning_rate": 8.771783463107156e-06, + "loss": 2.7602, + "mean_token_accuracy": 0.4623800706951692, + "step": 6625 + }, + { + "epoch": 1.228401928068224, + "grad_norm": 8.1953125, + "learning_rate": 8.771598071931778e-06, + "loss": 2.6336, + "mean_token_accuracy": 0.46781276277679157, + "step": 6626 + }, + { + "epoch": 1.228587319243604, + "grad_norm": 6.84375, + "learning_rate": 8.771412680756397e-06, + "loss": 3.1214, + "mean_token_accuracy": 0.4359861591695502, + "step": 6627 + }, + { + "epoch": 1.228772710418984, + "grad_norm": 6.125, + "learning_rate": 8.771227289581017e-06, + "loss": 2.7447, + "mean_token_accuracy": 0.46699779249448126, + "step": 6628 + }, + { + "epoch": 1.228958101594364, + "grad_norm": 6.25, + "learning_rate": 8.771041898405636e-06, + "loss": 3.5625, + "mean_token_accuracy": 0.39941010200319527, + "step": 6629 + }, + { + "epoch": 1.2291434927697442, + "grad_norm": 6.859375, + "learning_rate": 8.770856507230257e-06, + "loss": 2.4065, + "mean_token_accuracy": 0.5225885225885226, + "step": 6630 + }, + { + "epoch": 1.2293288839451242, + "grad_norm": 5.7109375, + "learning_rate": 8.770671116054877e-06, + "loss": 2.7583, + "mean_token_accuracy": 0.4509597686037339, + "step": 6631 + }, + { + "epoch": 1.2295142751205042, + "grad_norm": 5.640625, + "learning_rate": 8.770485724879496e-06, + "loss": 3.0195, + "mean_token_accuracy": 0.43264913406029504, + "step": 6632 + }, + { + "epoch": 1.2296996662958843, + "grad_norm": 5.18359375, + "learning_rate": 8.770300333704116e-06, + "loss": 2.3864, + "mean_token_accuracy": 0.5240506329113924, + "step": 6633 + }, + { + "epoch": 1.2298850574712643, + "grad_norm": 5.015625, + "learning_rate": 8.770114942528737e-06, + "loss": 2.7436, + "mean_token_accuracy": 0.48985855350947427, + "step": 6634 + }, + { + "epoch": 1.2300704486466445, + "grad_norm": 6.109375, + "learning_rate": 8.769929551353357e-06, + "loss": 2.7309, + "mean_token_accuracy": 0.48079618727221757, + "step": 6635 + }, + { + "epoch": 1.2302558398220245, + "grad_norm": 6.0703125, + "learning_rate": 8.769744160177976e-06, + "loss": 2.8647, + "mean_token_accuracy": 0.4470198675496689, + "step": 6636 + }, + { + "epoch": 1.2304412309974044, + "grad_norm": 6.86328125, + "learning_rate": 8.769558769002597e-06, + "loss": 3.2423, + "mean_token_accuracy": 0.42991960852848654, + "step": 6637 + }, + { + "epoch": 1.2306266221727846, + "grad_norm": 5.50390625, + "learning_rate": 8.769373377827215e-06, + "loss": 2.9553, + "mean_token_accuracy": 0.45041380277738813, + "step": 6638 + }, + { + "epoch": 1.2308120133481646, + "grad_norm": 7.84765625, + "learning_rate": 8.769187986651836e-06, + "loss": 2.4645, + "mean_token_accuracy": 0.5039218052371184, + "step": 6639 + }, + { + "epoch": 1.2309974045235448, + "grad_norm": 5.640625, + "learning_rate": 8.769002595476456e-06, + "loss": 2.7687, + "mean_token_accuracy": 0.46917269581629445, + "step": 6640 + }, + { + "epoch": 1.2311827956989247, + "grad_norm": 5.4375, + "learning_rate": 8.768817204301075e-06, + "loss": 2.752, + "mean_token_accuracy": 0.4710373242850218, + "step": 6641 + }, + { + "epoch": 1.2313681868743047, + "grad_norm": 7.30078125, + "learning_rate": 8.768631813125696e-06, + "loss": 2.6614, + "mean_token_accuracy": 0.4807282036694638, + "step": 6642 + }, + { + "epoch": 1.231553578049685, + "grad_norm": 6.59375, + "learning_rate": 8.768446421950316e-06, + "loss": 3.493, + "mean_token_accuracy": 0.4035029742233972, + "step": 6643 + }, + { + "epoch": 1.2317389692250649, + "grad_norm": 5.8828125, + "learning_rate": 8.768261030774937e-06, + "loss": 3.1014, + "mean_token_accuracy": 0.4492247520603436, + "step": 6644 + }, + { + "epoch": 1.2319243604004448, + "grad_norm": 5.30859375, + "learning_rate": 8.768075639599556e-06, + "loss": 2.636, + "mean_token_accuracy": 0.4631069198419778, + "step": 6645 + }, + { + "epoch": 1.232109751575825, + "grad_norm": 7.27734375, + "learning_rate": 8.767890248424176e-06, + "loss": 2.5508, + "mean_token_accuracy": 0.4787527956847783, + "step": 6646 + }, + { + "epoch": 1.232295142751205, + "grad_norm": 5.3671875, + "learning_rate": 8.767704857248795e-06, + "loss": 2.0723, + "mean_token_accuracy": 0.5542236164015236, + "step": 6647 + }, + { + "epoch": 1.2324805339265852, + "grad_norm": 5.79296875, + "learning_rate": 8.767519466073415e-06, + "loss": 2.8559, + "mean_token_accuracy": 0.45510485997739547, + "step": 6648 + }, + { + "epoch": 1.2326659251019652, + "grad_norm": 8.4375, + "learning_rate": 8.767334074898036e-06, + "loss": 3.1364, + "mean_token_accuracy": 0.4938002452650225, + "step": 6649 + }, + { + "epoch": 1.2328513162773451, + "grad_norm": 5.99609375, + "learning_rate": 8.767148683722656e-06, + "loss": 2.7389, + "mean_token_accuracy": 0.46055698687277635, + "step": 6650 + }, + { + "epoch": 1.2330367074527253, + "grad_norm": 6.52734375, + "learning_rate": 8.766963292547275e-06, + "loss": 2.7879, + "mean_token_accuracy": 0.4698723656871475, + "step": 6651 + }, + { + "epoch": 1.2332220986281053, + "grad_norm": 5.5625, + "learning_rate": 8.766777901371896e-06, + "loss": 2.5989, + "mean_token_accuracy": 0.48546429579452444, + "step": 6652 + }, + { + "epoch": 1.2334074898034855, + "grad_norm": 5.47265625, + "learning_rate": 8.766592510196516e-06, + "loss": 3.167, + "mean_token_accuracy": 0.4293281293560078, + "step": 6653 + }, + { + "epoch": 1.2335928809788654, + "grad_norm": 7.54296875, + "learning_rate": 8.766407119021135e-06, + "loss": 2.5692, + "mean_token_accuracy": 0.47995434441432444, + "step": 6654 + }, + { + "epoch": 1.2337782721542454, + "grad_norm": 6.52734375, + "learning_rate": 8.766221727845755e-06, + "loss": 2.7878, + "mean_token_accuracy": 0.44339356295878035, + "step": 6655 + }, + { + "epoch": 1.2339636633296256, + "grad_norm": 8.3671875, + "learning_rate": 8.766036336670374e-06, + "loss": 2.5729, + "mean_token_accuracy": 0.48121387283236994, + "step": 6656 + }, + { + "epoch": 1.2341490545050056, + "grad_norm": 7.22265625, + "learning_rate": 8.765850945494995e-06, + "loss": 2.7465, + "mean_token_accuracy": 0.4404864267289031, + "step": 6657 + }, + { + "epoch": 1.2343344456803855, + "grad_norm": 7.68359375, + "learning_rate": 8.765665554319615e-06, + "loss": 3.2222, + "mean_token_accuracy": 0.4331965344277246, + "step": 6658 + }, + { + "epoch": 1.2345198368557657, + "grad_norm": 7.11328125, + "learning_rate": 8.765480163144236e-06, + "loss": 2.2198, + "mean_token_accuracy": 0.5459989806320081, + "step": 6659 + }, + { + "epoch": 1.2347052280311457, + "grad_norm": 7.71875, + "learning_rate": 8.765294771968854e-06, + "loss": 2.8011, + "mean_token_accuracy": 0.4532710280373832, + "step": 6660 + }, + { + "epoch": 1.2348906192065257, + "grad_norm": 6.79296875, + "learning_rate": 8.765109380793475e-06, + "loss": 2.9427, + "mean_token_accuracy": 0.47419566644780037, + "step": 6661 + }, + { + "epoch": 1.2350760103819058, + "grad_norm": 6.6015625, + "learning_rate": 8.764923989618095e-06, + "loss": 2.8941, + "mean_token_accuracy": 0.44818387030576645, + "step": 6662 + }, + { + "epoch": 1.2352614015572858, + "grad_norm": 7.52734375, + "learning_rate": 8.764738598442714e-06, + "loss": 2.4172, + "mean_token_accuracy": 0.4943465684985538, + "step": 6663 + }, + { + "epoch": 1.235446792732666, + "grad_norm": 8.671875, + "learning_rate": 8.764553207267335e-06, + "loss": 2.4013, + "mean_token_accuracy": 0.49782293178519593, + "step": 6664 + }, + { + "epoch": 1.235632183908046, + "grad_norm": 5.7890625, + "learning_rate": 8.764367816091954e-06, + "loss": 2.6393, + "mean_token_accuracy": 0.48471678980153554, + "step": 6665 + }, + { + "epoch": 1.235817575083426, + "grad_norm": 9.421875, + "learning_rate": 8.764182424916576e-06, + "loss": 2.6939, + "mean_token_accuracy": 0.4692993964838625, + "step": 6666 + }, + { + "epoch": 1.2360029662588061, + "grad_norm": 6.49609375, + "learning_rate": 8.763997033741194e-06, + "loss": 3.1109, + "mean_token_accuracy": 0.4267895109851169, + "step": 6667 + }, + { + "epoch": 1.236188357434186, + "grad_norm": 6.40234375, + "learning_rate": 8.763811642565815e-06, + "loss": 2.7938, + "mean_token_accuracy": 0.45982812713135995, + "step": 6668 + }, + { + "epoch": 1.2363737486095663, + "grad_norm": 7.7265625, + "learning_rate": 8.763626251390435e-06, + "loss": 3.2074, + "mean_token_accuracy": 0.43894121468009095, + "step": 6669 + }, + { + "epoch": 1.2365591397849462, + "grad_norm": 8.6640625, + "learning_rate": 8.763440860215054e-06, + "loss": 2.623, + "mean_token_accuracy": 0.4936941671045717, + "step": 6670 + }, + { + "epoch": 1.2367445309603262, + "grad_norm": 5.5625, + "learning_rate": 8.763255469039675e-06, + "loss": 2.8535, + "mean_token_accuracy": 0.4628241174632927, + "step": 6671 + }, + { + "epoch": 1.2369299221357064, + "grad_norm": 7.23828125, + "learning_rate": 8.763070077864294e-06, + "loss": 2.4929, + "mean_token_accuracy": 0.49324001908700493, + "step": 6672 + }, + { + "epoch": 1.2371153133110864, + "grad_norm": 10.0078125, + "learning_rate": 8.762884686688914e-06, + "loss": 3.155, + "mean_token_accuracy": 0.43948264125255276, + "step": 6673 + }, + { + "epoch": 1.2373007044864663, + "grad_norm": 5.69140625, + "learning_rate": 8.762699295513535e-06, + "loss": 2.4574, + "mean_token_accuracy": 0.48737953189536487, + "step": 6674 + }, + { + "epoch": 1.2374860956618465, + "grad_norm": 6.06640625, + "learning_rate": 8.762513904338155e-06, + "loss": 2.9325, + "mean_token_accuracy": 0.4470570837030464, + "step": 6675 + }, + { + "epoch": 1.2376714868372265, + "grad_norm": 6.08203125, + "learning_rate": 8.762328513162774e-06, + "loss": 2.8394, + "mean_token_accuracy": 0.4507323568575233, + "step": 6676 + }, + { + "epoch": 1.2378568780126067, + "grad_norm": 6.32421875, + "learning_rate": 8.762143121987394e-06, + "loss": 2.5263, + "mean_token_accuracy": 0.5110150585610709, + "step": 6677 + }, + { + "epoch": 1.2380422691879867, + "grad_norm": 5.84375, + "learning_rate": 8.761957730812015e-06, + "loss": 2.4604, + "mean_token_accuracy": 0.4934403457323661, + "step": 6678 + }, + { + "epoch": 1.2382276603633666, + "grad_norm": 6.70703125, + "learning_rate": 8.761772339636634e-06, + "loss": 2.7038, + "mean_token_accuracy": 0.4991596638655462, + "step": 6679 + }, + { + "epoch": 1.2384130515387468, + "grad_norm": 5.7421875, + "learning_rate": 8.761586948461254e-06, + "loss": 2.6639, + "mean_token_accuracy": 0.4620160288971667, + "step": 6680 + }, + { + "epoch": 1.2385984427141268, + "grad_norm": 6.33984375, + "learning_rate": 8.761401557285873e-06, + "loss": 2.5681, + "mean_token_accuracy": 0.4895148026315789, + "step": 6681 + }, + { + "epoch": 1.238783833889507, + "grad_norm": 7.43359375, + "learning_rate": 8.761216166110493e-06, + "loss": 2.5821, + "mean_token_accuracy": 0.4804010938924339, + "step": 6682 + }, + { + "epoch": 1.238969225064887, + "grad_norm": 5.94921875, + "learning_rate": 8.761030774935114e-06, + "loss": 2.7824, + "mean_token_accuracy": 0.46069761729304837, + "step": 6683 + }, + { + "epoch": 1.239154616240267, + "grad_norm": 7.59765625, + "learning_rate": 8.760845383759734e-06, + "loss": 3.1791, + "mean_token_accuracy": 0.4210680751173709, + "step": 6684 + }, + { + "epoch": 1.239340007415647, + "grad_norm": 8.0390625, + "learning_rate": 8.760659992584353e-06, + "loss": 2.5043, + "mean_token_accuracy": 0.48750604546187326, + "step": 6685 + }, + { + "epoch": 1.239525398591027, + "grad_norm": 6.1796875, + "learning_rate": 8.760474601408974e-06, + "loss": 2.7138, + "mean_token_accuracy": 0.47574497574497576, + "step": 6686 + }, + { + "epoch": 1.239710789766407, + "grad_norm": 7.41015625, + "learning_rate": 8.760289210233594e-06, + "loss": 2.4869, + "mean_token_accuracy": 0.48620938628158844, + "step": 6687 + }, + { + "epoch": 1.2398961809417872, + "grad_norm": 6.24609375, + "learning_rate": 8.760103819058213e-06, + "loss": 2.4615, + "mean_token_accuracy": 0.4866682974559687, + "step": 6688 + }, + { + "epoch": 1.2400815721171672, + "grad_norm": 7.00390625, + "learning_rate": 8.759918427882833e-06, + "loss": 2.404, + "mean_token_accuracy": 0.4994980639609924, + "step": 6689 + }, + { + "epoch": 1.2402669632925472, + "grad_norm": 6.390625, + "learning_rate": 8.759733036707452e-06, + "loss": 2.6777, + "mean_token_accuracy": 0.46771117166212534, + "step": 6690 + }, + { + "epoch": 1.2404523544679273, + "grad_norm": 6.78125, + "learning_rate": 8.759547645532074e-06, + "loss": 2.7794, + "mean_token_accuracy": 0.46309006863706403, + "step": 6691 + }, + { + "epoch": 1.2406377456433073, + "grad_norm": 6.4453125, + "learning_rate": 8.759362254356693e-06, + "loss": 2.7323, + "mean_token_accuracy": 0.44880480634027864, + "step": 6692 + }, + { + "epoch": 1.2408231368186875, + "grad_norm": 6.3984375, + "learning_rate": 8.759176863181314e-06, + "loss": 2.8165, + "mean_token_accuracy": 0.4481180811808118, + "step": 6693 + }, + { + "epoch": 1.2410085279940675, + "grad_norm": 6.74609375, + "learning_rate": 8.758991472005933e-06, + "loss": 2.9653, + "mean_token_accuracy": 0.4480534439069339, + "step": 6694 + }, + { + "epoch": 1.2411939191694477, + "grad_norm": 10.0390625, + "learning_rate": 8.758806080830553e-06, + "loss": 2.5681, + "mean_token_accuracy": 0.4628238341968912, + "step": 6695 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 8.5, + "learning_rate": 8.758620689655173e-06, + "loss": 2.4418, + "mean_token_accuracy": 0.4923014586709887, + "step": 6696 + }, + { + "epoch": 1.2415647015202076, + "grad_norm": 5.5859375, + "learning_rate": 8.758435298479792e-06, + "loss": 3.1324, + "mean_token_accuracy": 0.4509918319719953, + "step": 6697 + }, + { + "epoch": 1.2417500926955878, + "grad_norm": 6.7109375, + "learning_rate": 8.758249907304413e-06, + "loss": 2.7712, + "mean_token_accuracy": 0.46607237892496006, + "step": 6698 + }, + { + "epoch": 1.2419354838709677, + "grad_norm": 5.17578125, + "learning_rate": 8.758064516129033e-06, + "loss": 2.457, + "mean_token_accuracy": 0.48798619407938404, + "step": 6699 + }, + { + "epoch": 1.2421208750463477, + "grad_norm": 7.8984375, + "learning_rate": 8.757879124953654e-06, + "loss": 2.9329, + "mean_token_accuracy": 0.4502302968270215, + "step": 6700 + }, + { + "epoch": 1.242306266221728, + "grad_norm": 5.88671875, + "learning_rate": 8.757693733778273e-06, + "loss": 2.7687, + "mean_token_accuracy": 0.4925673583152679, + "step": 6701 + }, + { + "epoch": 1.2424916573971079, + "grad_norm": 6.93359375, + "learning_rate": 8.757508342602893e-06, + "loss": 3.2166, + "mean_token_accuracy": 0.43183420273509093, + "step": 6702 + }, + { + "epoch": 1.2426770485724878, + "grad_norm": 11.8359375, + "learning_rate": 8.757322951427512e-06, + "loss": 2.6865, + "mean_token_accuracy": 0.4767163384779138, + "step": 6703 + }, + { + "epoch": 1.242862439747868, + "grad_norm": 5.66015625, + "learning_rate": 8.757137560252132e-06, + "loss": 3.051, + "mean_token_accuracy": 0.4544376358964222, + "step": 6704 + }, + { + "epoch": 1.243047830923248, + "grad_norm": 6.453125, + "learning_rate": 8.756952169076753e-06, + "loss": 3.2754, + "mean_token_accuracy": 0.40908366533864543, + "step": 6705 + }, + { + "epoch": 1.2432332220986282, + "grad_norm": 6.93359375, + "learning_rate": 8.756766777901372e-06, + "loss": 2.1753, + "mean_token_accuracy": 0.5378930395055093, + "step": 6706 + }, + { + "epoch": 1.2434186132740082, + "grad_norm": 6.95703125, + "learning_rate": 8.756581386725994e-06, + "loss": 3.4061, + "mean_token_accuracy": 0.40572597137014316, + "step": 6707 + }, + { + "epoch": 1.2436040044493881, + "grad_norm": 6.16796875, + "learning_rate": 8.756395995550613e-06, + "loss": 3.321, + "mean_token_accuracy": 0.4239890164752871, + "step": 6708 + }, + { + "epoch": 1.2437893956247683, + "grad_norm": 6.27734375, + "learning_rate": 8.756210604375233e-06, + "loss": 2.8756, + "mean_token_accuracy": 0.4459138187221397, + "step": 6709 + }, + { + "epoch": 1.2439747868001483, + "grad_norm": 6.52734375, + "learning_rate": 8.756025213199852e-06, + "loss": 2.457, + "mean_token_accuracy": 0.5065158593557905, + "step": 6710 + }, + { + "epoch": 1.2441601779755285, + "grad_norm": 6.08203125, + "learning_rate": 8.755839822024472e-06, + "loss": 2.9262, + "mean_token_accuracy": 0.44918552688482055, + "step": 6711 + }, + { + "epoch": 1.2443455691509084, + "grad_norm": 7.19140625, + "learning_rate": 8.755654430849093e-06, + "loss": 3.0281, + "mean_token_accuracy": 0.44071438163018667, + "step": 6712 + }, + { + "epoch": 1.2445309603262884, + "grad_norm": 6.19921875, + "learning_rate": 8.755469039673712e-06, + "loss": 3.1844, + "mean_token_accuracy": 0.42174993099641184, + "step": 6713 + }, + { + "epoch": 1.2447163515016686, + "grad_norm": 6.9453125, + "learning_rate": 8.755283648498332e-06, + "loss": 2.8651, + "mean_token_accuracy": 0.46655934435826135, + "step": 6714 + }, + { + "epoch": 1.2449017426770486, + "grad_norm": 6.54296875, + "learning_rate": 8.755098257322953e-06, + "loss": 3.747, + "mean_token_accuracy": 0.39570032573289904, + "step": 6715 + }, + { + "epoch": 1.2450871338524285, + "grad_norm": 6.24609375, + "learning_rate": 8.754912866147573e-06, + "loss": 2.623, + "mean_token_accuracy": 0.47602950215119855, + "step": 6716 + }, + { + "epoch": 1.2452725250278087, + "grad_norm": 7.22265625, + "learning_rate": 8.754727474972192e-06, + "loss": 3.2433, + "mean_token_accuracy": 0.4242527948893452, + "step": 6717 + }, + { + "epoch": 1.2454579162031887, + "grad_norm": 6.76953125, + "learning_rate": 8.754542083796812e-06, + "loss": 2.6421, + "mean_token_accuracy": 0.47396316417119383, + "step": 6718 + }, + { + "epoch": 1.2456433073785689, + "grad_norm": 6.8203125, + "learning_rate": 8.754356692621431e-06, + "loss": 3.1563, + "mean_token_accuracy": 0.4281524926686217, + "step": 6719 + }, + { + "epoch": 1.2458286985539488, + "grad_norm": 5.765625, + "learning_rate": 8.754171301446052e-06, + "loss": 2.4215, + "mean_token_accuracy": 0.5042219541616405, + "step": 6720 + }, + { + "epoch": 1.2460140897293288, + "grad_norm": 8.875, + "learning_rate": 8.753985910270672e-06, + "loss": 2.7341, + "mean_token_accuracy": 0.46629791613069205, + "step": 6721 + }, + { + "epoch": 1.246199480904709, + "grad_norm": 6.81640625, + "learning_rate": 8.753800519095291e-06, + "loss": 2.2938, + "mean_token_accuracy": 0.5015186490098409, + "step": 6722 + }, + { + "epoch": 1.246384872080089, + "grad_norm": 7.921875, + "learning_rate": 8.753615127919912e-06, + "loss": 2.4598, + "mean_token_accuracy": 0.4870848708487085, + "step": 6723 + }, + { + "epoch": 1.2465702632554692, + "grad_norm": 6.6953125, + "learning_rate": 8.753429736744532e-06, + "loss": 2.4729, + "mean_token_accuracy": 0.5131323586492431, + "step": 6724 + }, + { + "epoch": 1.2467556544308491, + "grad_norm": 6.13671875, + "learning_rate": 8.753244345569152e-06, + "loss": 2.8349, + "mean_token_accuracy": 0.4689895470383275, + "step": 6725 + }, + { + "epoch": 1.246941045606229, + "grad_norm": 4.97265625, + "learning_rate": 8.753058954393771e-06, + "loss": 2.8458, + "mean_token_accuracy": 0.4401634559252773, + "step": 6726 + }, + { + "epoch": 1.2471264367816093, + "grad_norm": 7.7265625, + "learning_rate": 8.752873563218392e-06, + "loss": 3.0488, + "mean_token_accuracy": 0.4328901154039137, + "step": 6727 + }, + { + "epoch": 1.2473118279569892, + "grad_norm": 5.28125, + "learning_rate": 8.75268817204301e-06, + "loss": 2.4577, + "mean_token_accuracy": 0.49883025734338443, + "step": 6728 + }, + { + "epoch": 1.2474972191323692, + "grad_norm": 7.765625, + "learning_rate": 8.752502780867631e-06, + "loss": 2.727, + "mean_token_accuracy": 0.4638447971781305, + "step": 6729 + }, + { + "epoch": 1.2476826103077494, + "grad_norm": 5.29296875, + "learning_rate": 8.752317389692252e-06, + "loss": 2.9785, + "mean_token_accuracy": 0.4434279564106014, + "step": 6730 + }, + { + "epoch": 1.2478680014831294, + "grad_norm": 6.9765625, + "learning_rate": 8.752131998516872e-06, + "loss": 2.0136, + "mean_token_accuracy": 0.5574194884539713, + "step": 6731 + }, + { + "epoch": 1.2480533926585093, + "grad_norm": 6.80859375, + "learning_rate": 8.751946607341491e-06, + "loss": 2.8212, + "mean_token_accuracy": 0.4519568489713999, + "step": 6732 + }, + { + "epoch": 1.2482387838338895, + "grad_norm": 5.08984375, + "learning_rate": 8.751761216166111e-06, + "loss": 2.8478, + "mean_token_accuracy": 0.45098501828477056, + "step": 6733 + }, + { + "epoch": 1.2484241750092695, + "grad_norm": 7.015625, + "learning_rate": 8.751575824990732e-06, + "loss": 2.5982, + "mean_token_accuracy": 0.4870630957784839, + "step": 6734 + }, + { + "epoch": 1.2486095661846497, + "grad_norm": 9.96875, + "learning_rate": 8.75139043381535e-06, + "loss": 3.0341, + "mean_token_accuracy": 0.4554203226719502, + "step": 6735 + }, + { + "epoch": 1.2487949573600297, + "grad_norm": 7.12109375, + "learning_rate": 8.751205042639971e-06, + "loss": 2.7293, + "mean_token_accuracy": 0.4726410299847535, + "step": 6736 + }, + { + "epoch": 1.2489803485354096, + "grad_norm": 6.0859375, + "learning_rate": 8.75101965146459e-06, + "loss": 3.0175, + "mean_token_accuracy": 0.4456856079002752, + "step": 6737 + }, + { + "epoch": 1.2491657397107898, + "grad_norm": 8.078125, + "learning_rate": 8.75083426028921e-06, + "loss": 2.8711, + "mean_token_accuracy": 0.44144736842105264, + "step": 6738 + }, + { + "epoch": 1.2493511308861698, + "grad_norm": 7.0859375, + "learning_rate": 8.750648869113831e-06, + "loss": 3.2202, + "mean_token_accuracy": 0.4200706001008573, + "step": 6739 + }, + { + "epoch": 1.24953652206155, + "grad_norm": 6.421875, + "learning_rate": 8.750463477938451e-06, + "loss": 2.5511, + "mean_token_accuracy": 0.4797720005560962, + "step": 6740 + }, + { + "epoch": 1.24972191323693, + "grad_norm": 6.14453125, + "learning_rate": 8.75027808676307e-06, + "loss": 2.5444, + "mean_token_accuracy": 0.49317124418430136, + "step": 6741 + }, + { + "epoch": 1.24990730441231, + "grad_norm": 7.4609375, + "learning_rate": 8.75009269558769e-06, + "loss": 2.9241, + "mean_token_accuracy": 0.4326327299234601, + "step": 6742 + }, + { + "epoch": 1.25009269558769, + "grad_norm": 7.81640625, + "learning_rate": 8.749907304412311e-06, + "loss": 3.1704, + "mean_token_accuracy": 0.43146985841482993, + "step": 6743 + }, + { + "epoch": 1.25027808676307, + "grad_norm": 6.625, + "learning_rate": 8.74972191323693e-06, + "loss": 2.9301, + "mean_token_accuracy": 0.44171632896305124, + "step": 6744 + }, + { + "epoch": 1.25046347793845, + "grad_norm": 6.41796875, + "learning_rate": 8.74953652206155e-06, + "loss": 2.4918, + "mean_token_accuracy": 0.4941389728096677, + "step": 6745 + }, + { + "epoch": 1.2506488691138302, + "grad_norm": 6.63671875, + "learning_rate": 8.74935113088617e-06, + "loss": 2.6894, + "mean_token_accuracy": 0.4709908735332464, + "step": 6746 + }, + { + "epoch": 1.2508342602892102, + "grad_norm": 8.2109375, + "learning_rate": 8.749165739710791e-06, + "loss": 2.5561, + "mean_token_accuracy": 0.4878086902156924, + "step": 6747 + }, + { + "epoch": 1.2510196514645902, + "grad_norm": 6.06640625, + "learning_rate": 8.74898034853541e-06, + "loss": 3.5381, + "mean_token_accuracy": 0.40937038858829317, + "step": 6748 + }, + { + "epoch": 1.2512050426399703, + "grad_norm": 5.5234375, + "learning_rate": 8.74879495736003e-06, + "loss": 2.1615, + "mean_token_accuracy": 0.548824494259158, + "step": 6749 + }, + { + "epoch": 1.2513904338153503, + "grad_norm": 9.9609375, + "learning_rate": 8.748609566184651e-06, + "loss": 2.9008, + "mean_token_accuracy": 0.45496838840510556, + "step": 6750 + }, + { + "epoch": 1.2515758249907305, + "grad_norm": 7.6953125, + "learning_rate": 8.74842417500927e-06, + "loss": 2.8284, + "mean_token_accuracy": 0.47239642232959805, + "step": 6751 + }, + { + "epoch": 1.2517612161661105, + "grad_norm": 9.171875, + "learning_rate": 8.74823878383389e-06, + "loss": 3.8383, + "mean_token_accuracy": 0.4061146181458101, + "step": 6752 + }, + { + "epoch": 1.2519466073414907, + "grad_norm": 10.0390625, + "learning_rate": 8.74805339265851e-06, + "loss": 2.7514, + "mean_token_accuracy": 0.5150321648943154, + "step": 6753 + }, + { + "epoch": 1.2521319985168706, + "grad_norm": 5.6484375, + "learning_rate": 8.74786800148313e-06, + "loss": 2.9846, + "mean_token_accuracy": 0.44584708076316165, + "step": 6754 + }, + { + "epoch": 1.2523173896922506, + "grad_norm": 7.67578125, + "learning_rate": 8.74768261030775e-06, + "loss": 3.2229, + "mean_token_accuracy": 0.4276060388209921, + "step": 6755 + }, + { + "epoch": 1.2525027808676308, + "grad_norm": 6.2265625, + "learning_rate": 8.74749721913237e-06, + "loss": 2.4108, + "mean_token_accuracy": 0.4786336059087312, + "step": 6756 + }, + { + "epoch": 1.2526881720430108, + "grad_norm": 6.015625, + "learning_rate": 8.74731182795699e-06, + "loss": 2.6911, + "mean_token_accuracy": 0.46736292428198434, + "step": 6757 + }, + { + "epoch": 1.2528735632183907, + "grad_norm": 6.24609375, + "learning_rate": 8.74712643678161e-06, + "loss": 2.5607, + "mean_token_accuracy": 0.4932332537245536, + "step": 6758 + }, + { + "epoch": 1.253058954393771, + "grad_norm": 6.41796875, + "learning_rate": 8.74694104560623e-06, + "loss": 2.2509, + "mean_token_accuracy": 0.5244332493702771, + "step": 6759 + }, + { + "epoch": 1.2532443455691509, + "grad_norm": 5.7265625, + "learning_rate": 8.74675565443085e-06, + "loss": 2.5473, + "mean_token_accuracy": 0.4777234447161098, + "step": 6760 + }, + { + "epoch": 1.2534297367445308, + "grad_norm": 8.1953125, + "learning_rate": 8.74657026325547e-06, + "loss": 2.9633, + "mean_token_accuracy": 0.43644767249917465, + "step": 6761 + }, + { + "epoch": 1.253615127919911, + "grad_norm": 6.63671875, + "learning_rate": 8.746384872080089e-06, + "loss": 2.5463, + "mean_token_accuracy": 0.48415968177297913, + "step": 6762 + }, + { + "epoch": 1.253800519095291, + "grad_norm": 7.01953125, + "learning_rate": 8.74619948090471e-06, + "loss": 2.7518, + "mean_token_accuracy": 0.4630438055095589, + "step": 6763 + }, + { + "epoch": 1.2539859102706712, + "grad_norm": 6.52734375, + "learning_rate": 8.74601408972933e-06, + "loss": 2.2152, + "mean_token_accuracy": 0.5293951384963256, + "step": 6764 + }, + { + "epoch": 1.2541713014460512, + "grad_norm": 6.34765625, + "learning_rate": 8.74582869855395e-06, + "loss": 2.6572, + "mean_token_accuracy": 0.46649544711650714, + "step": 6765 + }, + { + "epoch": 1.2543566926214313, + "grad_norm": 7.33203125, + "learning_rate": 8.745643307378569e-06, + "loss": 2.393, + "mean_token_accuracy": 0.49072192875524856, + "step": 6766 + }, + { + "epoch": 1.2545420837968113, + "grad_norm": 6.28125, + "learning_rate": 8.74545791620319e-06, + "loss": 3.1449, + "mean_token_accuracy": 0.43002058218171124, + "step": 6767 + }, + { + "epoch": 1.2547274749721913, + "grad_norm": 7.2890625, + "learning_rate": 8.74527252502781e-06, + "loss": 2.9077, + "mean_token_accuracy": 0.4720194647201946, + "step": 6768 + }, + { + "epoch": 1.2549128661475715, + "grad_norm": 7.1796875, + "learning_rate": 8.745087133852429e-06, + "loss": 2.5595, + "mean_token_accuracy": 0.5012501644953283, + "step": 6769 + }, + { + "epoch": 1.2550982573229514, + "grad_norm": 6.18359375, + "learning_rate": 8.74490174267705e-06, + "loss": 2.6439, + "mean_token_accuracy": 0.48436770881941205, + "step": 6770 + }, + { + "epoch": 1.2552836484983314, + "grad_norm": 6.83203125, + "learning_rate": 8.74471635150167e-06, + "loss": 2.7475, + "mean_token_accuracy": 0.46584272714925984, + "step": 6771 + }, + { + "epoch": 1.2554690396737116, + "grad_norm": 9.2421875, + "learning_rate": 8.74453096032629e-06, + "loss": 2.8991, + "mean_token_accuracy": 0.46563918505225377, + "step": 6772 + }, + { + "epoch": 1.2556544308490916, + "grad_norm": 6.06640625, + "learning_rate": 8.744345569150909e-06, + "loss": 2.894, + "mean_token_accuracy": 0.46795366795366794, + "step": 6773 + }, + { + "epoch": 1.2558398220244715, + "grad_norm": 6.83984375, + "learning_rate": 8.74416017797553e-06, + "loss": 2.7069, + "mean_token_accuracy": 0.46266094420600856, + "step": 6774 + }, + { + "epoch": 1.2560252131998517, + "grad_norm": 5.44140625, + "learning_rate": 8.743974786800148e-06, + "loss": 2.9142, + "mean_token_accuracy": 0.45024424284717374, + "step": 6775 + }, + { + "epoch": 1.2562106043752317, + "grad_norm": 8.9921875, + "learning_rate": 8.743789395624769e-06, + "loss": 2.6917, + "mean_token_accuracy": 0.44411918452692106, + "step": 6776 + }, + { + "epoch": 1.2563959955506117, + "grad_norm": 7.2421875, + "learning_rate": 8.74360400444939e-06, + "loss": 2.5661, + "mean_token_accuracy": 0.5127145991996902, + "step": 6777 + }, + { + "epoch": 1.2565813867259918, + "grad_norm": 7.65234375, + "learning_rate": 8.743418613274008e-06, + "loss": 3.1534, + "mean_token_accuracy": 0.45713179794851944, + "step": 6778 + }, + { + "epoch": 1.2567667779013718, + "grad_norm": 5.56640625, + "learning_rate": 8.743233222098629e-06, + "loss": 3.115, + "mean_token_accuracy": 0.4294346579270791, + "step": 6779 + }, + { + "epoch": 1.256952169076752, + "grad_norm": 6.02734375, + "learning_rate": 8.743047830923249e-06, + "loss": 2.6003, + "mean_token_accuracy": 0.48210502843126324, + "step": 6780 + }, + { + "epoch": 1.257137560252132, + "grad_norm": 5.66796875, + "learning_rate": 8.74286243974787e-06, + "loss": 2.4566, + "mean_token_accuracy": 0.5140995260663507, + "step": 6781 + }, + { + "epoch": 1.2573229514275122, + "grad_norm": 8.7109375, + "learning_rate": 8.742677048572488e-06, + "loss": 2.4733, + "mean_token_accuracy": 0.46385193753614806, + "step": 6782 + }, + { + "epoch": 1.2575083426028921, + "grad_norm": 6.0390625, + "learning_rate": 8.742491657397109e-06, + "loss": 2.9813, + "mean_token_accuracy": 0.44597417394606914, + "step": 6783 + }, + { + "epoch": 1.257693733778272, + "grad_norm": 7.46484375, + "learning_rate": 8.742306266221728e-06, + "loss": 2.7349, + "mean_token_accuracy": 0.4406564197388189, + "step": 6784 + }, + { + "epoch": 1.2578791249536523, + "grad_norm": 6.69921875, + "learning_rate": 8.742120875046348e-06, + "loss": 2.8005, + "mean_token_accuracy": 0.47238934250107434, + "step": 6785 + }, + { + "epoch": 1.2580645161290323, + "grad_norm": 4.94140625, + "learning_rate": 8.741935483870969e-06, + "loss": 2.5127, + "mean_token_accuracy": 0.5139984866500918, + "step": 6786 + }, + { + "epoch": 1.2582499073044122, + "grad_norm": 5.62109375, + "learning_rate": 8.741750092695589e-06, + "loss": 2.4651, + "mean_token_accuracy": 0.49170991013795723, + "step": 6787 + }, + { + "epoch": 1.2584352984797924, + "grad_norm": 6.36328125, + "learning_rate": 8.74156470152021e-06, + "loss": 2.2244, + "mean_token_accuracy": 0.5210989678202793, + "step": 6788 + }, + { + "epoch": 1.2586206896551724, + "grad_norm": 6.02734375, + "learning_rate": 8.741379310344828e-06, + "loss": 2.8136, + "mean_token_accuracy": 0.4666281421554464, + "step": 6789 + }, + { + "epoch": 1.2588060808305523, + "grad_norm": 9.03125, + "learning_rate": 8.741193919169449e-06, + "loss": 2.8093, + "mean_token_accuracy": 0.4558382257012394, + "step": 6790 + }, + { + "epoch": 1.2589914720059325, + "grad_norm": 5.90234375, + "learning_rate": 8.741008527994068e-06, + "loss": 3.3896, + "mean_token_accuracy": 0.4233520694941237, + "step": 6791 + }, + { + "epoch": 1.2591768631813125, + "grad_norm": 6.796875, + "learning_rate": 8.740823136818688e-06, + "loss": 2.9796, + "mean_token_accuracy": 0.45075519194461927, + "step": 6792 + }, + { + "epoch": 1.2593622543566927, + "grad_norm": 5.8671875, + "learning_rate": 8.740637745643309e-06, + "loss": 2.5326, + "mean_token_accuracy": 0.4676577394462097, + "step": 6793 + }, + { + "epoch": 1.2595476455320727, + "grad_norm": 5.6328125, + "learning_rate": 8.740452354467927e-06, + "loss": 2.7728, + "mean_token_accuracy": 0.47548312662244013, + "step": 6794 + }, + { + "epoch": 1.2597330367074528, + "grad_norm": 7.11328125, + "learning_rate": 8.740266963292548e-06, + "loss": 3.1244, + "mean_token_accuracy": 0.4441281138790036, + "step": 6795 + }, + { + "epoch": 1.2599184278828328, + "grad_norm": 5.59765625, + "learning_rate": 8.740081572117168e-06, + "loss": 3.1198, + "mean_token_accuracy": 0.434337123578658, + "step": 6796 + }, + { + "epoch": 1.2601038190582128, + "grad_norm": 6.24609375, + "learning_rate": 8.739896180941789e-06, + "loss": 2.7971, + "mean_token_accuracy": 0.4645439163205661, + "step": 6797 + }, + { + "epoch": 1.260289210233593, + "grad_norm": 7.46484375, + "learning_rate": 8.739710789766408e-06, + "loss": 2.4278, + "mean_token_accuracy": 0.5083766608896592, + "step": 6798 + }, + { + "epoch": 1.260474601408973, + "grad_norm": 7.0625, + "learning_rate": 8.739525398591028e-06, + "loss": 2.5152, + "mean_token_accuracy": 0.48158834844737564, + "step": 6799 + }, + { + "epoch": 1.260659992584353, + "grad_norm": 8.1953125, + "learning_rate": 8.739340007415647e-06, + "loss": 3.0181, + "mean_token_accuracy": 0.4427994616419919, + "step": 6800 + }, + { + "epoch": 1.260845383759733, + "grad_norm": 6.125, + "learning_rate": 8.739154616240267e-06, + "loss": 3.1782, + "mean_token_accuracy": 0.4527429934406679, + "step": 6801 + }, + { + "epoch": 1.261030774935113, + "grad_norm": 10.171875, + "learning_rate": 8.738969225064888e-06, + "loss": 2.9036, + "mean_token_accuracy": 0.4509746445844936, + "step": 6802 + }, + { + "epoch": 1.261216166110493, + "grad_norm": 10.5, + "learning_rate": 8.738783833889508e-06, + "loss": 2.9682, + "mean_token_accuracy": 0.43660326993660326, + "step": 6803 + }, + { + "epoch": 1.2614015572858732, + "grad_norm": 6.4296875, + "learning_rate": 8.738598442714127e-06, + "loss": 2.7514, + "mean_token_accuracy": 0.46253164556962023, + "step": 6804 + }, + { + "epoch": 1.2615869484612532, + "grad_norm": 8.5, + "learning_rate": 8.738413051538748e-06, + "loss": 2.5542, + "mean_token_accuracy": 0.5015659436260295, + "step": 6805 + }, + { + "epoch": 1.2617723396366334, + "grad_norm": 8.921875, + "learning_rate": 8.738227660363368e-06, + "loss": 2.4894, + "mean_token_accuracy": 0.4945043864071796, + "step": 6806 + }, + { + "epoch": 1.2619577308120133, + "grad_norm": 8.1171875, + "learning_rate": 8.738042269187987e-06, + "loss": 2.9399, + "mean_token_accuracy": 0.44983857467416, + "step": 6807 + }, + { + "epoch": 1.2621431219873935, + "grad_norm": 5.88671875, + "learning_rate": 8.737856878012608e-06, + "loss": 2.8639, + "mean_token_accuracy": 0.46305631571366845, + "step": 6808 + }, + { + "epoch": 1.2623285131627735, + "grad_norm": 11.6875, + "learning_rate": 8.737671486837226e-06, + "loss": 2.7446, + "mean_token_accuracy": 0.4537424980183445, + "step": 6809 + }, + { + "epoch": 1.2625139043381535, + "grad_norm": 7.90234375, + "learning_rate": 8.737486095661847e-06, + "loss": 2.8229, + "mean_token_accuracy": 0.44919143356643354, + "step": 6810 + }, + { + "epoch": 1.2626992955135337, + "grad_norm": 6.4765625, + "learning_rate": 8.737300704486467e-06, + "loss": 2.953, + "mean_token_accuracy": 0.4464570352122881, + "step": 6811 + }, + { + "epoch": 1.2628846866889136, + "grad_norm": 8.234375, + "learning_rate": 8.737115313311088e-06, + "loss": 2.7617, + "mean_token_accuracy": 0.478149446992177, + "step": 6812 + }, + { + "epoch": 1.2630700778642936, + "grad_norm": 7.37109375, + "learning_rate": 8.736929922135707e-06, + "loss": 2.5801, + "mean_token_accuracy": 0.4879990050988683, + "step": 6813 + }, + { + "epoch": 1.2632554690396738, + "grad_norm": 6.54296875, + "learning_rate": 8.736744530960327e-06, + "loss": 1.7893, + "mean_token_accuracy": 0.5961298377028714, + "step": 6814 + }, + { + "epoch": 1.2634408602150538, + "grad_norm": 8.59375, + "learning_rate": 8.736559139784948e-06, + "loss": 2.9098, + "mean_token_accuracy": 0.4634653644420329, + "step": 6815 + }, + { + "epoch": 1.2636262513904337, + "grad_norm": 6.5859375, + "learning_rate": 8.736373748609566e-06, + "loss": 2.8137, + "mean_token_accuracy": 0.46824044779751767, + "step": 6816 + }, + { + "epoch": 1.263811642565814, + "grad_norm": 9.015625, + "learning_rate": 8.736188357434187e-06, + "loss": 2.9523, + "mean_token_accuracy": 0.4348478646472916, + "step": 6817 + }, + { + "epoch": 1.2639970337411939, + "grad_norm": 7.1484375, + "learning_rate": 8.736002966258806e-06, + "loss": 2.5326, + "mean_token_accuracy": 0.4890496078231639, + "step": 6818 + }, + { + "epoch": 1.2641824249165738, + "grad_norm": 7.72265625, + "learning_rate": 8.735817575083426e-06, + "loss": 2.7792, + "mean_token_accuracy": 0.4627399546917849, + "step": 6819 + }, + { + "epoch": 1.264367816091954, + "grad_norm": 5.72265625, + "learning_rate": 8.735632183908047e-06, + "loss": 2.2659, + "mean_token_accuracy": 0.5080310163389643, + "step": 6820 + }, + { + "epoch": 1.264553207267334, + "grad_norm": 6.171875, + "learning_rate": 8.735446792732667e-06, + "loss": 2.9384, + "mean_token_accuracy": 0.4402029475718773, + "step": 6821 + }, + { + "epoch": 1.2647385984427142, + "grad_norm": 5.98828125, + "learning_rate": 8.735261401557286e-06, + "loss": 2.8103, + "mean_token_accuracy": 0.4395251051199604, + "step": 6822 + }, + { + "epoch": 1.2649239896180942, + "grad_norm": 7.79296875, + "learning_rate": 8.735076010381906e-06, + "loss": 2.5809, + "mean_token_accuracy": 0.4697717104508432, + "step": 6823 + }, + { + "epoch": 1.2651093807934743, + "grad_norm": 6.6171875, + "learning_rate": 8.734890619206527e-06, + "loss": 2.8641, + "mean_token_accuracy": 0.4699742442727396, + "step": 6824 + }, + { + "epoch": 1.2652947719688543, + "grad_norm": 6.16796875, + "learning_rate": 8.734705228031146e-06, + "loss": 2.7156, + "mean_token_accuracy": 0.4700846999459362, + "step": 6825 + }, + { + "epoch": 1.2654801631442343, + "grad_norm": 5.4765625, + "learning_rate": 8.734519836855766e-06, + "loss": 3.5967, + "mean_token_accuracy": 0.40620138960904284, + "step": 6826 + }, + { + "epoch": 1.2656655543196145, + "grad_norm": 6.92578125, + "learning_rate": 8.734334445680385e-06, + "loss": 3.8365, + "mean_token_accuracy": 0.3940274227283291, + "step": 6827 + }, + { + "epoch": 1.2658509454949944, + "grad_norm": 6.421875, + "learning_rate": 8.734149054505007e-06, + "loss": 3.158, + "mean_token_accuracy": 0.4304426377597109, + "step": 6828 + }, + { + "epoch": 1.2660363366703744, + "grad_norm": 5.8984375, + "learning_rate": 8.733963663329626e-06, + "loss": 2.9354, + "mean_token_accuracy": 0.4633601983880967, + "step": 6829 + }, + { + "epoch": 1.2662217278457546, + "grad_norm": 9.2578125, + "learning_rate": 8.733778272154246e-06, + "loss": 2.5492, + "mean_token_accuracy": 0.4778959149412423, + "step": 6830 + }, + { + "epoch": 1.2664071190211346, + "grad_norm": 5.73046875, + "learning_rate": 8.733592880978867e-06, + "loss": 3.2573, + "mean_token_accuracy": 0.4330474148201096, + "step": 6831 + }, + { + "epoch": 1.2665925101965145, + "grad_norm": 6.609375, + "learning_rate": 8.733407489803486e-06, + "loss": 2.3335, + "mean_token_accuracy": 0.532866023012829, + "step": 6832 + }, + { + "epoch": 1.2667779013718947, + "grad_norm": 5.3515625, + "learning_rate": 8.733222098628106e-06, + "loss": 2.9168, + "mean_token_accuracy": 0.44966666666666666, + "step": 6833 + }, + { + "epoch": 1.2669632925472747, + "grad_norm": 5.80078125, + "learning_rate": 8.733036707452725e-06, + "loss": 2.9226, + "mean_token_accuracy": 0.46327615366030445, + "step": 6834 + }, + { + "epoch": 1.2671486837226549, + "grad_norm": 5.2109375, + "learning_rate": 8.732851316277346e-06, + "loss": 2.6512, + "mean_token_accuracy": 0.4747636363636364, + "step": 6835 + }, + { + "epoch": 1.2673340748980348, + "grad_norm": 5.609375, + "learning_rate": 8.732665925101966e-06, + "loss": 2.9665, + "mean_token_accuracy": 0.4320665797653665, + "step": 6836 + }, + { + "epoch": 1.267519466073415, + "grad_norm": 5.93359375, + "learning_rate": 8.732480533926587e-06, + "loss": 2.6722, + "mean_token_accuracy": 0.473407977606718, + "step": 6837 + }, + { + "epoch": 1.267704857248795, + "grad_norm": 5.26171875, + "learning_rate": 8.732295142751205e-06, + "loss": 3.0403, + "mean_token_accuracy": 0.44613809577977476, + "step": 6838 + }, + { + "epoch": 1.267890248424175, + "grad_norm": 6.58203125, + "learning_rate": 8.732109751575826e-06, + "loss": 2.8695, + "mean_token_accuracy": 0.4500271591526344, + "step": 6839 + }, + { + "epoch": 1.2680756395995552, + "grad_norm": 8.5703125, + "learning_rate": 8.731924360400446e-06, + "loss": 2.6207, + "mean_token_accuracy": 0.48713550600343053, + "step": 6840 + }, + { + "epoch": 1.2682610307749351, + "grad_norm": 6.52734375, + "learning_rate": 8.731738969225065e-06, + "loss": 3.0926, + "mean_token_accuracy": 0.4469162995594714, + "step": 6841 + }, + { + "epoch": 1.268446421950315, + "grad_norm": 5.73828125, + "learning_rate": 8.731553578049686e-06, + "loss": 2.5039, + "mean_token_accuracy": 0.4686949371549395, + "step": 6842 + }, + { + "epoch": 1.2686318131256953, + "grad_norm": 8.28125, + "learning_rate": 8.731368186874304e-06, + "loss": 2.8682, + "mean_token_accuracy": 0.45308985046976313, + "step": 6843 + }, + { + "epoch": 1.2688172043010753, + "grad_norm": 7.4453125, + "learning_rate": 8.731182795698927e-06, + "loss": 2.7532, + "mean_token_accuracy": 0.4778502985696431, + "step": 6844 + }, + { + "epoch": 1.2690025954764552, + "grad_norm": 6.453125, + "learning_rate": 8.730997404523545e-06, + "loss": 3.0764, + "mean_token_accuracy": 0.429648970118944, + "step": 6845 + }, + { + "epoch": 1.2691879866518354, + "grad_norm": 9.2578125, + "learning_rate": 8.730812013348166e-06, + "loss": 3.2888, + "mean_token_accuracy": 0.39870093974571585, + "step": 6846 + }, + { + "epoch": 1.2693733778272154, + "grad_norm": 7.83984375, + "learning_rate": 8.730626622172785e-06, + "loss": 2.9456, + "mean_token_accuracy": 0.4395667870036101, + "step": 6847 + }, + { + "epoch": 1.2695587690025953, + "grad_norm": 7.0546875, + "learning_rate": 8.730441230997405e-06, + "loss": 2.691, + "mean_token_accuracy": 0.4701117318435754, + "step": 6848 + }, + { + "epoch": 1.2697441601779755, + "grad_norm": 5.6796875, + "learning_rate": 8.730255839822026e-06, + "loss": 2.5011, + "mean_token_accuracy": 0.4988110964332893, + "step": 6849 + }, + { + "epoch": 1.2699295513533555, + "grad_norm": 6.69921875, + "learning_rate": 8.730070448646644e-06, + "loss": 2.808, + "mean_token_accuracy": 0.4668904839083121, + "step": 6850 + }, + { + "epoch": 1.2701149425287357, + "grad_norm": 8.7421875, + "learning_rate": 8.729885057471265e-06, + "loss": 2.4938, + "mean_token_accuracy": 0.476169781254527, + "step": 6851 + }, + { + "epoch": 1.2703003337041157, + "grad_norm": 6.96875, + "learning_rate": 8.729699666295885e-06, + "loss": 3.0007, + "mean_token_accuracy": 0.4197422378441711, + "step": 6852 + }, + { + "epoch": 1.2704857248794958, + "grad_norm": 13.546875, + "learning_rate": 8.729514275120506e-06, + "loss": 2.8055, + "mean_token_accuracy": 0.44871794871794873, + "step": 6853 + }, + { + "epoch": 1.2706711160548758, + "grad_norm": 11.6015625, + "learning_rate": 8.729328883945125e-06, + "loss": 2.9314, + "mean_token_accuracy": 0.44271230786002397, + "step": 6854 + }, + { + "epoch": 1.2708565072302558, + "grad_norm": 7.25, + "learning_rate": 8.729143492769745e-06, + "loss": 2.6627, + "mean_token_accuracy": 0.4730885009030704, + "step": 6855 + }, + { + "epoch": 1.271041898405636, + "grad_norm": 6.1328125, + "learning_rate": 8.728958101594364e-06, + "loss": 2.2544, + "mean_token_accuracy": 0.5156273822228998, + "step": 6856 + }, + { + "epoch": 1.271227289581016, + "grad_norm": 12.109375, + "learning_rate": 8.728772710418985e-06, + "loss": 2.7693, + "mean_token_accuracy": 0.4431924882629108, + "step": 6857 + }, + { + "epoch": 1.271412680756396, + "grad_norm": 11.6328125, + "learning_rate": 8.728587319243605e-06, + "loss": 2.7558, + "mean_token_accuracy": 0.4644632540642214, + "step": 6858 + }, + { + "epoch": 1.271598071931776, + "grad_norm": 10.8671875, + "learning_rate": 8.728401928068224e-06, + "loss": 3.2253, + "mean_token_accuracy": 0.4214511041009464, + "step": 6859 + }, + { + "epoch": 1.271783463107156, + "grad_norm": 4.98828125, + "learning_rate": 8.728216536892844e-06, + "loss": 2.8842, + "mean_token_accuracy": 0.4524959742351047, + "step": 6860 + }, + { + "epoch": 1.271968854282536, + "grad_norm": 10.578125, + "learning_rate": 8.728031145717465e-06, + "loss": 2.9777, + "mean_token_accuracy": 0.43530411786928597, + "step": 6861 + }, + { + "epoch": 1.2721542454579162, + "grad_norm": 14.5703125, + "learning_rate": 8.727845754542085e-06, + "loss": 3.0789, + "mean_token_accuracy": 0.4350140056022409, + "step": 6862 + }, + { + "epoch": 1.2723396366332962, + "grad_norm": 7.5078125, + "learning_rate": 8.727660363366704e-06, + "loss": 2.8479, + "mean_token_accuracy": 0.4682926829268293, + "step": 6863 + }, + { + "epoch": 1.2725250278086764, + "grad_norm": 6.16796875, + "learning_rate": 8.727474972191325e-06, + "loss": 3.0979, + "mean_token_accuracy": 0.4301659988551803, + "step": 6864 + }, + { + "epoch": 1.2727104189840563, + "grad_norm": 5.52734375, + "learning_rate": 8.727289581015943e-06, + "loss": 2.9038, + "mean_token_accuracy": 0.44987991404373656, + "step": 6865 + }, + { + "epoch": 1.2728958101594365, + "grad_norm": 6.0859375, + "learning_rate": 8.727104189840564e-06, + "loss": 2.5616, + "mean_token_accuracy": 0.481492873987491, + "step": 6866 + }, + { + "epoch": 1.2730812013348165, + "grad_norm": 6.4296875, + "learning_rate": 8.726918798665184e-06, + "loss": 2.8801, + "mean_token_accuracy": 0.4825196850393701, + "step": 6867 + }, + { + "epoch": 1.2732665925101965, + "grad_norm": 6.02734375, + "learning_rate": 8.726733407489805e-06, + "loss": 3.1763, + "mean_token_accuracy": 0.4097202990861257, + "step": 6868 + }, + { + "epoch": 1.2734519836855767, + "grad_norm": 8.109375, + "learning_rate": 8.726548016314425e-06, + "loss": 3.0444, + "mean_token_accuracy": 0.45615763546798027, + "step": 6869 + }, + { + "epoch": 1.2736373748609566, + "grad_norm": 5.71875, + "learning_rate": 8.726362625139044e-06, + "loss": 2.7271, + "mean_token_accuracy": 0.4561996779388084, + "step": 6870 + }, + { + "epoch": 1.2738227660363366, + "grad_norm": 6.86328125, + "learning_rate": 8.726177233963665e-06, + "loss": 2.4195, + "mean_token_accuracy": 0.5001418842224744, + "step": 6871 + }, + { + "epoch": 1.2740081572117168, + "grad_norm": 6.4609375, + "learning_rate": 8.725991842788283e-06, + "loss": 3.4894, + "mean_token_accuracy": 0.42713973044580117, + "step": 6872 + }, + { + "epoch": 1.2741935483870968, + "grad_norm": 7.44140625, + "learning_rate": 8.725806451612904e-06, + "loss": 2.9024, + "mean_token_accuracy": 0.4456261234272019, + "step": 6873 + }, + { + "epoch": 1.2743789395624767, + "grad_norm": 7.47265625, + "learning_rate": 8.725621060437524e-06, + "loss": 2.5909, + "mean_token_accuracy": 0.4893787117405208, + "step": 6874 + }, + { + "epoch": 1.274564330737857, + "grad_norm": 8.3984375, + "learning_rate": 8.725435669262143e-06, + "loss": 3.5365, + "mean_token_accuracy": 0.41929269299573413, + "step": 6875 + }, + { + "epoch": 1.2747497219132369, + "grad_norm": 11.578125, + "learning_rate": 8.725250278086764e-06, + "loss": 2.5456, + "mean_token_accuracy": 0.4736973323360758, + "step": 6876 + }, + { + "epoch": 1.2749351130886168, + "grad_norm": 7.00390625, + "learning_rate": 8.725064886911384e-06, + "loss": 3.2688, + "mean_token_accuracy": 0.452212389380531, + "step": 6877 + }, + { + "epoch": 1.275120504263997, + "grad_norm": 5.86328125, + "learning_rate": 8.724879495736005e-06, + "loss": 2.6558, + "mean_token_accuracy": 0.4800524934383202, + "step": 6878 + }, + { + "epoch": 1.2753058954393772, + "grad_norm": 9.7421875, + "learning_rate": 8.724694104560623e-06, + "loss": 2.6867, + "mean_token_accuracy": 0.4744180407371484, + "step": 6879 + }, + { + "epoch": 1.2754912866147572, + "grad_norm": 6.72265625, + "learning_rate": 8.724508713385244e-06, + "loss": 2.8819, + "mean_token_accuracy": 0.44857142857142857, + "step": 6880 + }, + { + "epoch": 1.2756766777901372, + "grad_norm": 12.9765625, + "learning_rate": 8.724323322209863e-06, + "loss": 2.2067, + "mean_token_accuracy": 0.5220318960425281, + "step": 6881 + }, + { + "epoch": 1.2758620689655173, + "grad_norm": 5.57421875, + "learning_rate": 8.724137931034483e-06, + "loss": 2.518, + "mean_token_accuracy": 0.48923331755797445, + "step": 6882 + }, + { + "epoch": 1.2760474601408973, + "grad_norm": 7.9453125, + "learning_rate": 8.723952539859104e-06, + "loss": 2.9759, + "mean_token_accuracy": 0.442486281131279, + "step": 6883 + }, + { + "epoch": 1.2762328513162773, + "grad_norm": 9.78125, + "learning_rate": 8.723767148683724e-06, + "loss": 2.7313, + "mean_token_accuracy": 0.4579280531110082, + "step": 6884 + }, + { + "epoch": 1.2764182424916575, + "grad_norm": 7.2890625, + "learning_rate": 8.723581757508343e-06, + "loss": 2.9476, + "mean_token_accuracy": 0.4561038961038961, + "step": 6885 + }, + { + "epoch": 1.2766036336670374, + "grad_norm": 8.4609375, + "learning_rate": 8.723396366332964e-06, + "loss": 3.3309, + "mean_token_accuracy": 0.42711244893571276, + "step": 6886 + }, + { + "epoch": 1.2767890248424174, + "grad_norm": 8.0, + "learning_rate": 8.723210975157584e-06, + "loss": 2.8451, + "mean_token_accuracy": 0.4404369949117031, + "step": 6887 + }, + { + "epoch": 1.2769744160177976, + "grad_norm": 6.98046875, + "learning_rate": 8.723025583982203e-06, + "loss": 2.536, + "mean_token_accuracy": 0.4799697656840514, + "step": 6888 + }, + { + "epoch": 1.2771598071931776, + "grad_norm": 6.6875, + "learning_rate": 8.722840192806823e-06, + "loss": 2.804, + "mean_token_accuracy": 0.45852080989876265, + "step": 6889 + }, + { + "epoch": 1.2773451983685575, + "grad_norm": 15.859375, + "learning_rate": 8.722654801631442e-06, + "loss": 2.7743, + "mean_token_accuracy": 0.466089273817455, + "step": 6890 + }, + { + "epoch": 1.2775305895439377, + "grad_norm": 6.33203125, + "learning_rate": 8.722469410456063e-06, + "loss": 3.0711, + "mean_token_accuracy": 0.4412729260293939, + "step": 6891 + }, + { + "epoch": 1.2777159807193177, + "grad_norm": 6.046875, + "learning_rate": 8.722284019280683e-06, + "loss": 2.3227, + "mean_token_accuracy": 0.5013854930725347, + "step": 6892 + }, + { + "epoch": 1.2779013718946979, + "grad_norm": 5.95703125, + "learning_rate": 8.722098628105304e-06, + "loss": 2.8942, + "mean_token_accuracy": 0.4855994641661085, + "step": 6893 + }, + { + "epoch": 1.2780867630700778, + "grad_norm": 6.640625, + "learning_rate": 8.721913236929922e-06, + "loss": 3.1615, + "mean_token_accuracy": 0.4101966873706004, + "step": 6894 + }, + { + "epoch": 1.278272154245458, + "grad_norm": 6.9453125, + "learning_rate": 8.721727845754543e-06, + "loss": 2.6892, + "mean_token_accuracy": 0.48331322878970645, + "step": 6895 + }, + { + "epoch": 1.278457545420838, + "grad_norm": 7.53515625, + "learning_rate": 8.721542454579163e-06, + "loss": 2.446, + "mean_token_accuracy": 0.4849949135300102, + "step": 6896 + }, + { + "epoch": 1.278642936596218, + "grad_norm": 9.2265625, + "learning_rate": 8.721357063403782e-06, + "loss": 3.4468, + "mean_token_accuracy": 0.4271095717884131, + "step": 6897 + }, + { + "epoch": 1.2788283277715982, + "grad_norm": 9.3359375, + "learning_rate": 8.721171672228403e-06, + "loss": 2.6155, + "mean_token_accuracy": 0.4935453186574263, + "step": 6898 + }, + { + "epoch": 1.2790137189469781, + "grad_norm": 6.90234375, + "learning_rate": 8.720986281053021e-06, + "loss": 4.2466, + "mean_token_accuracy": 0.3687603058012292, + "step": 6899 + }, + { + "epoch": 1.279199110122358, + "grad_norm": 5.375, + "learning_rate": 8.720800889877644e-06, + "loss": 2.4405, + "mean_token_accuracy": 0.4988418871144703, + "step": 6900 + }, + { + "epoch": 1.2793845012977383, + "grad_norm": 6.4375, + "learning_rate": 8.720615498702262e-06, + "loss": 2.568, + "mean_token_accuracy": 0.47586206896551725, + "step": 6901 + }, + { + "epoch": 1.2795698924731183, + "grad_norm": 6.03125, + "learning_rate": 8.720430107526883e-06, + "loss": 2.9857, + "mean_token_accuracy": 0.4620599981424724, + "step": 6902 + }, + { + "epoch": 1.2797552836484982, + "grad_norm": 6.359375, + "learning_rate": 8.720244716351502e-06, + "loss": 2.763, + "mean_token_accuracy": 0.4634146341463415, + "step": 6903 + }, + { + "epoch": 1.2799406748238784, + "grad_norm": 6.62109375, + "learning_rate": 8.720059325176122e-06, + "loss": 2.5361, + "mean_token_accuracy": 0.5068493150684932, + "step": 6904 + }, + { + "epoch": 1.2801260659992584, + "grad_norm": 7.1015625, + "learning_rate": 8.719873934000743e-06, + "loss": 2.2169, + "mean_token_accuracy": 0.5367710676587645, + "step": 6905 + }, + { + "epoch": 1.2803114571746386, + "grad_norm": 5.53125, + "learning_rate": 8.719688542825361e-06, + "loss": 2.7148, + "mean_token_accuracy": 0.492800622648852, + "step": 6906 + }, + { + "epoch": 1.2804968483500185, + "grad_norm": 8.7421875, + "learning_rate": 8.719503151649982e-06, + "loss": 3.4514, + "mean_token_accuracy": 0.40084449621432733, + "step": 6907 + }, + { + "epoch": 1.2806822395253987, + "grad_norm": 13.171875, + "learning_rate": 8.719317760474602e-06, + "loss": 2.9262, + "mean_token_accuracy": 0.4486500794070937, + "step": 6908 + }, + { + "epoch": 1.2808676307007787, + "grad_norm": 10.7265625, + "learning_rate": 8.719132369299223e-06, + "loss": 2.3029, + "mean_token_accuracy": 0.5029123455036226, + "step": 6909 + }, + { + "epoch": 1.2810530218761587, + "grad_norm": 6.8671875, + "learning_rate": 8.718946978123842e-06, + "loss": 3.1173, + "mean_token_accuracy": 0.42355889724310775, + "step": 6910 + }, + { + "epoch": 1.2812384130515388, + "grad_norm": 6.91796875, + "learning_rate": 8.718761586948462e-06, + "loss": 2.9953, + "mean_token_accuracy": 0.4447870778267254, + "step": 6911 + }, + { + "epoch": 1.2814238042269188, + "grad_norm": 6.58203125, + "learning_rate": 8.718576195773083e-06, + "loss": 2.6755, + "mean_token_accuracy": 0.48339532412327313, + "step": 6912 + }, + { + "epoch": 1.2816091954022988, + "grad_norm": 5.83203125, + "learning_rate": 8.718390804597702e-06, + "loss": 2.77, + "mean_token_accuracy": 0.4582531742978069, + "step": 6913 + }, + { + "epoch": 1.281794586577679, + "grad_norm": 7.9453125, + "learning_rate": 8.718205413422322e-06, + "loss": 3.0574, + "mean_token_accuracy": 0.44884807475430966, + "step": 6914 + }, + { + "epoch": 1.281979977753059, + "grad_norm": 7.17578125, + "learning_rate": 8.71802002224694e-06, + "loss": 2.502, + "mean_token_accuracy": 0.4822074437055186, + "step": 6915 + }, + { + "epoch": 1.282165368928439, + "grad_norm": 5.71484375, + "learning_rate": 8.717834631071563e-06, + "loss": 2.6671, + "mean_token_accuracy": 0.4868812201111542, + "step": 6916 + }, + { + "epoch": 1.282350760103819, + "grad_norm": 5.234375, + "learning_rate": 8.717649239896182e-06, + "loss": 3.0133, + "mean_token_accuracy": 0.4406181552689219, + "step": 6917 + }, + { + "epoch": 1.282536151279199, + "grad_norm": 6.51953125, + "learning_rate": 8.717463848720802e-06, + "loss": 2.417, + "mean_token_accuracy": 0.5251918585251919, + "step": 6918 + }, + { + "epoch": 1.282721542454579, + "grad_norm": 10.5390625, + "learning_rate": 8.717278457545421e-06, + "loss": 2.577, + "mean_token_accuracy": 0.47660311958405543, + "step": 6919 + }, + { + "epoch": 1.2829069336299592, + "grad_norm": 7.125, + "learning_rate": 8.717093066370042e-06, + "loss": 2.8554, + "mean_token_accuracy": 0.45697509617331444, + "step": 6920 + }, + { + "epoch": 1.2830923248053392, + "grad_norm": 6.9921875, + "learning_rate": 8.716907675194662e-06, + "loss": 3.263, + "mean_token_accuracy": 0.422355854262469, + "step": 6921 + }, + { + "epoch": 1.2832777159807194, + "grad_norm": 5.46484375, + "learning_rate": 8.716722284019281e-06, + "loss": 2.5434, + "mean_token_accuracy": 0.504313205043132, + "step": 6922 + }, + { + "epoch": 1.2834631071560993, + "grad_norm": 8.515625, + "learning_rate": 8.716536892843901e-06, + "loss": 2.7635, + "mean_token_accuracy": 0.4735520094562648, + "step": 6923 + }, + { + "epoch": 1.2836484983314795, + "grad_norm": 5.52734375, + "learning_rate": 8.716351501668522e-06, + "loss": 2.24, + "mean_token_accuracy": 0.5216906123587669, + "step": 6924 + }, + { + "epoch": 1.2838338895068595, + "grad_norm": 6.16796875, + "learning_rate": 8.716166110493142e-06, + "loss": 2.7268, + "mean_token_accuracy": 0.49910946705028086, + "step": 6925 + }, + { + "epoch": 1.2840192806822395, + "grad_norm": 9.640625, + "learning_rate": 8.715980719317761e-06, + "loss": 3.7165, + "mean_token_accuracy": 0.42424242424242425, + "step": 6926 + }, + { + "epoch": 1.2842046718576197, + "grad_norm": 6.1328125, + "learning_rate": 8.715795328142382e-06, + "loss": 2.6352, + "mean_token_accuracy": 0.4812910938433951, + "step": 6927 + }, + { + "epoch": 1.2843900630329996, + "grad_norm": 6.5078125, + "learning_rate": 8.715609936967e-06, + "loss": 3.0123, + "mean_token_accuracy": 0.4396551724137931, + "step": 6928 + }, + { + "epoch": 1.2845754542083796, + "grad_norm": 8.6015625, + "learning_rate": 8.715424545791621e-06, + "loss": 2.939, + "mean_token_accuracy": 0.4322480248982523, + "step": 6929 + }, + { + "epoch": 1.2847608453837598, + "grad_norm": 5.24609375, + "learning_rate": 8.715239154616241e-06, + "loss": 2.6858, + "mean_token_accuracy": 0.4668402511870118, + "step": 6930 + }, + { + "epoch": 1.2849462365591398, + "grad_norm": 5.35546875, + "learning_rate": 8.71505376344086e-06, + "loss": 2.7304, + "mean_token_accuracy": 0.4777634777634778, + "step": 6931 + }, + { + "epoch": 1.2851316277345197, + "grad_norm": 6.9375, + "learning_rate": 8.71486837226548e-06, + "loss": 2.3899, + "mean_token_accuracy": 0.49371995658241585, + "step": 6932 + }, + { + "epoch": 1.2853170189099, + "grad_norm": 6.9921875, + "learning_rate": 8.714682981090101e-06, + "loss": 3.1327, + "mean_token_accuracy": 0.4503478052290717, + "step": 6933 + }, + { + "epoch": 1.2855024100852799, + "grad_norm": 6.08984375, + "learning_rate": 8.714497589914722e-06, + "loss": 2.3297, + "mean_token_accuracy": 0.5160578302615605, + "step": 6934 + }, + { + "epoch": 1.28568780126066, + "grad_norm": 6.40234375, + "learning_rate": 8.71431219873934e-06, + "loss": 3.6942, + "mean_token_accuracy": 0.3682753164556962, + "step": 6935 + }, + { + "epoch": 1.28587319243604, + "grad_norm": 10.4296875, + "learning_rate": 8.714126807563961e-06, + "loss": 3.1291, + "mean_token_accuracy": 0.4283894870904249, + "step": 6936 + }, + { + "epoch": 1.2860585836114202, + "grad_norm": 9.359375, + "learning_rate": 8.71394141638858e-06, + "loss": 2.7199, + "mean_token_accuracy": 0.44988576537200753, + "step": 6937 + }, + { + "epoch": 1.2862439747868002, + "grad_norm": 6.8046875, + "learning_rate": 8.7137560252132e-06, + "loss": 2.6832, + "mean_token_accuracy": 0.4782758620689655, + "step": 6938 + }, + { + "epoch": 1.2864293659621802, + "grad_norm": 8.953125, + "learning_rate": 8.71357063403782e-06, + "loss": 3.1268, + "mean_token_accuracy": 0.42401311640812206, + "step": 6939 + }, + { + "epoch": 1.2866147571375603, + "grad_norm": 8.1484375, + "learning_rate": 8.71338524286244e-06, + "loss": 2.4169, + "mean_token_accuracy": 0.49859550561797755, + "step": 6940 + }, + { + "epoch": 1.2868001483129403, + "grad_norm": 8.59375, + "learning_rate": 8.71319985168706e-06, + "loss": 2.7931, + "mean_token_accuracy": 0.47214381221215274, + "step": 6941 + }, + { + "epoch": 1.2869855394883203, + "grad_norm": 7.67578125, + "learning_rate": 8.71301446051168e-06, + "loss": 2.2485, + "mean_token_accuracy": 0.5122319956019791, + "step": 6942 + }, + { + "epoch": 1.2871709306637005, + "grad_norm": 9.015625, + "learning_rate": 8.712829069336301e-06, + "loss": 2.8287, + "mean_token_accuracy": 0.4582123600165906, + "step": 6943 + }, + { + "epoch": 1.2873563218390804, + "grad_norm": 6.76953125, + "learning_rate": 8.71264367816092e-06, + "loss": 2.8152, + "mean_token_accuracy": 0.4588443717634233, + "step": 6944 + }, + { + "epoch": 1.2875417130144604, + "grad_norm": 7.3671875, + "learning_rate": 8.71245828698554e-06, + "loss": 3.3397, + "mean_token_accuracy": 0.4276899924755455, + "step": 6945 + }, + { + "epoch": 1.2877271041898406, + "grad_norm": 7.671875, + "learning_rate": 8.712272895810159e-06, + "loss": 3.1164, + "mean_token_accuracy": 0.4264766911052704, + "step": 6946 + }, + { + "epoch": 1.2879124953652206, + "grad_norm": 7.2890625, + "learning_rate": 8.71208750463478e-06, + "loss": 3.4786, + "mean_token_accuracy": 0.40805653710247347, + "step": 6947 + }, + { + "epoch": 1.2880978865406005, + "grad_norm": 7.8984375, + "learning_rate": 8.7119021134594e-06, + "loss": 2.4963, + "mean_token_accuracy": 0.4802825947334618, + "step": 6948 + }, + { + "epoch": 1.2882832777159807, + "grad_norm": 6.1328125, + "learning_rate": 8.71171672228402e-06, + "loss": 2.4055, + "mean_token_accuracy": 0.5127755511022044, + "step": 6949 + }, + { + "epoch": 1.2884686688913607, + "grad_norm": 9.046875, + "learning_rate": 8.711531331108641e-06, + "loss": 2.6307, + "mean_token_accuracy": 0.4718600429113715, + "step": 6950 + }, + { + "epoch": 1.2886540600667409, + "grad_norm": 6.32421875, + "learning_rate": 8.71134593993326e-06, + "loss": 2.7671, + "mean_token_accuracy": 0.4769659011830202, + "step": 6951 + }, + { + "epoch": 1.2888394512421208, + "grad_norm": 5.58984375, + "learning_rate": 8.71116054875788e-06, + "loss": 2.367, + "mean_token_accuracy": 0.5385068993985139, + "step": 6952 + }, + { + "epoch": 1.289024842417501, + "grad_norm": 9.75, + "learning_rate": 8.710975157582499e-06, + "loss": 2.8695, + "mean_token_accuracy": 0.47386875939221906, + "step": 6953 + }, + { + "epoch": 1.289210233592881, + "grad_norm": 8.9140625, + "learning_rate": 8.71078976640712e-06, + "loss": 3.0407, + "mean_token_accuracy": 0.4362370133576892, + "step": 6954 + }, + { + "epoch": 1.289395624768261, + "grad_norm": 6.890625, + "learning_rate": 8.71060437523174e-06, + "loss": 2.8899, + "mean_token_accuracy": 0.4619191919191919, + "step": 6955 + }, + { + "epoch": 1.2895810159436412, + "grad_norm": 5.93359375, + "learning_rate": 8.710418984056359e-06, + "loss": 3.0542, + "mean_token_accuracy": 0.44437704719155646, + "step": 6956 + }, + { + "epoch": 1.2897664071190211, + "grad_norm": 11.765625, + "learning_rate": 8.71023359288098e-06, + "loss": 2.2009, + "mean_token_accuracy": 0.535036325287087, + "step": 6957 + }, + { + "epoch": 1.289951798294401, + "grad_norm": 9.765625, + "learning_rate": 8.7100482017056e-06, + "loss": 2.2476, + "mean_token_accuracy": 0.5053329864724245, + "step": 6958 + }, + { + "epoch": 1.2901371894697813, + "grad_norm": 6.7109375, + "learning_rate": 8.70986281053022e-06, + "loss": 2.9455, + "mean_token_accuracy": 0.45460358056265987, + "step": 6959 + }, + { + "epoch": 1.2903225806451613, + "grad_norm": 8.5859375, + "learning_rate": 8.70967741935484e-06, + "loss": 2.9178, + "mean_token_accuracy": 0.44893460690668624, + "step": 6960 + }, + { + "epoch": 1.2905079718205412, + "grad_norm": 7.74609375, + "learning_rate": 8.70949202817946e-06, + "loss": 2.7003, + "mean_token_accuracy": 0.47106662496090085, + "step": 6961 + }, + { + "epoch": 1.2906933629959214, + "grad_norm": 8.1875, + "learning_rate": 8.709306637004078e-06, + "loss": 3.283, + "mean_token_accuracy": 0.4214817938984957, + "step": 6962 + }, + { + "epoch": 1.2908787541713014, + "grad_norm": 8.8515625, + "learning_rate": 8.709121245828699e-06, + "loss": 3.1283, + "mean_token_accuracy": 0.4319349826700131, + "step": 6963 + }, + { + "epoch": 1.2910641453466816, + "grad_norm": 8.328125, + "learning_rate": 8.70893585465332e-06, + "loss": 3.3758, + "mean_token_accuracy": 0.4087332372825003, + "step": 6964 + }, + { + "epoch": 1.2912495365220615, + "grad_norm": 6.8515625, + "learning_rate": 8.70875046347794e-06, + "loss": 3.3302, + "mean_token_accuracy": 0.4220293325351691, + "step": 6965 + }, + { + "epoch": 1.2914349276974417, + "grad_norm": 6.69140625, + "learning_rate": 8.708565072302559e-06, + "loss": 2.9249, + "mean_token_accuracy": 0.4680795050677899, + "step": 6966 + }, + { + "epoch": 1.2916203188728217, + "grad_norm": 10.6171875, + "learning_rate": 8.70837968112718e-06, + "loss": 2.445, + "mean_token_accuracy": 0.4813256180957391, + "step": 6967 + }, + { + "epoch": 1.2918057100482017, + "grad_norm": 10.46875, + "learning_rate": 8.7081942899518e-06, + "loss": 3.2887, + "mean_token_accuracy": 0.41794963599474877, + "step": 6968 + }, + { + "epoch": 1.2919911012235819, + "grad_norm": 7.28125, + "learning_rate": 8.708008898776419e-06, + "loss": 2.5879, + "mean_token_accuracy": 0.49240034413535994, + "step": 6969 + }, + { + "epoch": 1.2921764923989618, + "grad_norm": 7.48046875, + "learning_rate": 8.707823507601039e-06, + "loss": 2.8653, + "mean_token_accuracy": 0.4529799341120096, + "step": 6970 + }, + { + "epoch": 1.2923618835743418, + "grad_norm": 9.5859375, + "learning_rate": 8.707638116425658e-06, + "loss": 2.8389, + "mean_token_accuracy": 0.4537901060974051, + "step": 6971 + }, + { + "epoch": 1.292547274749722, + "grad_norm": 10.9609375, + "learning_rate": 8.707452725250278e-06, + "loss": 2.3883, + "mean_token_accuracy": 0.49167410050550103, + "step": 6972 + }, + { + "epoch": 1.292732665925102, + "grad_norm": 8.0703125, + "learning_rate": 8.707267334074899e-06, + "loss": 2.0286, + "mean_token_accuracy": 0.5577903292464527, + "step": 6973 + }, + { + "epoch": 1.292918057100482, + "grad_norm": 6.9609375, + "learning_rate": 8.70708194289952e-06, + "loss": 3.47, + "mean_token_accuracy": 0.41597510373443985, + "step": 6974 + }, + { + "epoch": 1.293103448275862, + "grad_norm": 7.328125, + "learning_rate": 8.706896551724138e-06, + "loss": 2.6679, + "mean_token_accuracy": 0.5031482541499713, + "step": 6975 + }, + { + "epoch": 1.293288839451242, + "grad_norm": 6.890625, + "learning_rate": 8.706711160548759e-06, + "loss": 3.0831, + "mean_token_accuracy": 0.41405520736098145, + "step": 6976 + }, + { + "epoch": 1.293474230626622, + "grad_norm": 6.265625, + "learning_rate": 8.706525769373379e-06, + "loss": 2.4937, + "mean_token_accuracy": 0.48798001873243835, + "step": 6977 + }, + { + "epoch": 1.2936596218020022, + "grad_norm": 5.83984375, + "learning_rate": 8.706340378197998e-06, + "loss": 3.2133, + "mean_token_accuracy": 0.4313488576449912, + "step": 6978 + }, + { + "epoch": 1.2938450129773824, + "grad_norm": 6.47265625, + "learning_rate": 8.706154987022618e-06, + "loss": 2.5218, + "mean_token_accuracy": 0.5307808841757333, + "step": 6979 + }, + { + "epoch": 1.2940304041527624, + "grad_norm": 4.875, + "learning_rate": 8.705969595847237e-06, + "loss": 2.9492, + "mean_token_accuracy": 0.44651312957382694, + "step": 6980 + }, + { + "epoch": 1.2942157953281423, + "grad_norm": 6.21875, + "learning_rate": 8.70578420467186e-06, + "loss": 2.6896, + "mean_token_accuracy": 0.47109670448406266, + "step": 6981 + }, + { + "epoch": 1.2944011865035225, + "grad_norm": 8.4375, + "learning_rate": 8.705598813496478e-06, + "loss": 3.2205, + "mean_token_accuracy": 0.42215771649733913, + "step": 6982 + }, + { + "epoch": 1.2945865776789025, + "grad_norm": 6.4765625, + "learning_rate": 8.705413422321099e-06, + "loss": 3.1936, + "mean_token_accuracy": 0.44059925093632957, + "step": 6983 + }, + { + "epoch": 1.2947719688542825, + "grad_norm": 5.5078125, + "learning_rate": 8.705228031145717e-06, + "loss": 2.5444, + "mean_token_accuracy": 0.4750243495199666, + "step": 6984 + }, + { + "epoch": 1.2949573600296627, + "grad_norm": 6.12109375, + "learning_rate": 8.705042639970338e-06, + "loss": 2.5734, + "mean_token_accuracy": 0.49076002082248826, + "step": 6985 + }, + { + "epoch": 1.2951427512050426, + "grad_norm": 6.28125, + "learning_rate": 8.704857248794958e-06, + "loss": 2.5851, + "mean_token_accuracy": 0.4896254378873619, + "step": 6986 + }, + { + "epoch": 1.2953281423804226, + "grad_norm": 5.19921875, + "learning_rate": 8.704671857619577e-06, + "loss": 2.3294, + "mean_token_accuracy": 0.4987305041712006, + "step": 6987 + }, + { + "epoch": 1.2955135335558028, + "grad_norm": 6.4140625, + "learning_rate": 8.704486466444198e-06, + "loss": 3.2613, + "mean_token_accuracy": 0.4525993883792049, + "step": 6988 + }, + { + "epoch": 1.2956989247311828, + "grad_norm": 7.1171875, + "learning_rate": 8.704301075268818e-06, + "loss": 2.6497, + "mean_token_accuracy": 0.47506275237367673, + "step": 6989 + }, + { + "epoch": 1.2958843159065627, + "grad_norm": 7.703125, + "learning_rate": 8.704115684093439e-06, + "loss": 3.1065, + "mean_token_accuracy": 0.4464530892448513, + "step": 6990 + }, + { + "epoch": 1.296069707081943, + "grad_norm": 6.3046875, + "learning_rate": 8.703930292918057e-06, + "loss": 2.5805, + "mean_token_accuracy": 0.49241475295755044, + "step": 6991 + }, + { + "epoch": 1.2962550982573229, + "grad_norm": 6.953125, + "learning_rate": 8.703744901742678e-06, + "loss": 2.3696, + "mean_token_accuracy": 0.5400304028375982, + "step": 6992 + }, + { + "epoch": 1.296440489432703, + "grad_norm": 7.390625, + "learning_rate": 8.703559510567298e-06, + "loss": 2.6299, + "mean_token_accuracy": 0.4651370299553856, + "step": 6993 + }, + { + "epoch": 1.296625880608083, + "grad_norm": 5.53515625, + "learning_rate": 8.703374119391917e-06, + "loss": 3.21, + "mean_token_accuracy": 0.4156959813628422, + "step": 6994 + }, + { + "epoch": 1.2968112717834632, + "grad_norm": 6.05859375, + "learning_rate": 8.703188728216538e-06, + "loss": 3.4343, + "mean_token_accuracy": 0.40844493030882756, + "step": 6995 + }, + { + "epoch": 1.2969966629588432, + "grad_norm": 5.9921875, + "learning_rate": 8.703003337041157e-06, + "loss": 3.6598, + "mean_token_accuracy": 0.4075333837238599, + "step": 6996 + }, + { + "epoch": 1.2971820541342232, + "grad_norm": 9.2109375, + "learning_rate": 8.702817945865779e-06, + "loss": 2.9394, + "mean_token_accuracy": 0.4366993217784476, + "step": 6997 + }, + { + "epoch": 1.2973674453096034, + "grad_norm": 8.6171875, + "learning_rate": 8.702632554690398e-06, + "loss": 2.2277, + "mean_token_accuracy": 0.52773737793476, + "step": 6998 + }, + { + "epoch": 1.2975528364849833, + "grad_norm": 5.32421875, + "learning_rate": 8.702447163515018e-06, + "loss": 2.4328, + "mean_token_accuracy": 0.5355625748829104, + "step": 6999 + }, + { + "epoch": 1.2977382276603633, + "grad_norm": 8.5859375, + "learning_rate": 8.702261772339637e-06, + "loss": 2.6627, + "mean_token_accuracy": 0.4696329254727475, + "step": 7000 + }, + { + "epoch": 1.2979236188357435, + "grad_norm": 6.0625, + "learning_rate": 8.702076381164257e-06, + "loss": 2.8589, + "mean_token_accuracy": 0.46010834769761144, + "step": 7001 + }, + { + "epoch": 1.2981090100111234, + "grad_norm": 5.4765625, + "learning_rate": 8.701890989988878e-06, + "loss": 2.743, + "mean_token_accuracy": 0.4884253370643602, + "step": 7002 + }, + { + "epoch": 1.2982944011865034, + "grad_norm": 8.546875, + "learning_rate": 8.701705598813497e-06, + "loss": 3.1546, + "mean_token_accuracy": 0.42584030988379357, + "step": 7003 + }, + { + "epoch": 1.2984797923618836, + "grad_norm": 6.75390625, + "learning_rate": 8.701520207638117e-06, + "loss": 2.8375, + "mean_token_accuracy": 0.47386231038506416, + "step": 7004 + }, + { + "epoch": 1.2986651835372636, + "grad_norm": 7.296875, + "learning_rate": 8.701334816462738e-06, + "loss": 2.3171, + "mean_token_accuracy": 0.5175499930079709, + "step": 7005 + }, + { + "epoch": 1.2988505747126438, + "grad_norm": 5.99609375, + "learning_rate": 8.701149425287358e-06, + "loss": 3.3306, + "mean_token_accuracy": 0.4201019664967225, + "step": 7006 + }, + { + "epoch": 1.2990359658880237, + "grad_norm": 6.08984375, + "learning_rate": 8.700964034111977e-06, + "loss": 3.4746, + "mean_token_accuracy": 0.41622090501662573, + "step": 7007 + }, + { + "epoch": 1.299221357063404, + "grad_norm": 7.03515625, + "learning_rate": 8.700778642936597e-06, + "loss": 3.1027, + "mean_token_accuracy": 0.43416370106761565, + "step": 7008 + }, + { + "epoch": 1.2994067482387839, + "grad_norm": 7.98046875, + "learning_rate": 8.700593251761216e-06, + "loss": 2.6505, + "mean_token_accuracy": 0.4667373844521897, + "step": 7009 + }, + { + "epoch": 1.2995921394141638, + "grad_norm": 5.77734375, + "learning_rate": 8.700407860585837e-06, + "loss": 2.618, + "mean_token_accuracy": 0.48220387243735763, + "step": 7010 + }, + { + "epoch": 1.299777530589544, + "grad_norm": 6.2421875, + "learning_rate": 8.700222469410457e-06, + "loss": 2.9512, + "mean_token_accuracy": 0.46461787302871005, + "step": 7011 + }, + { + "epoch": 1.299962921764924, + "grad_norm": 9.296875, + "learning_rate": 8.700037078235076e-06, + "loss": 3.1871, + "mean_token_accuracy": 0.44493545761135017, + "step": 7012 + }, + { + "epoch": 1.300148312940304, + "grad_norm": 6.546875, + "learning_rate": 8.699851687059696e-06, + "loss": 2.4898, + "mean_token_accuracy": 0.4873815092945956, + "step": 7013 + }, + { + "epoch": 1.3003337041156842, + "grad_norm": 5.76171875, + "learning_rate": 8.699666295884317e-06, + "loss": 2.6573, + "mean_token_accuracy": 0.4707173227266493, + "step": 7014 + }, + { + "epoch": 1.3005190952910641, + "grad_norm": 8.390625, + "learning_rate": 8.699480904708937e-06, + "loss": 2.7062, + "mean_token_accuracy": 0.4790137138107771, + "step": 7015 + }, + { + "epoch": 1.300704486466444, + "grad_norm": 9.3671875, + "learning_rate": 8.699295513533556e-06, + "loss": 2.957, + "mean_token_accuracy": 0.4494427286155499, + "step": 7016 + }, + { + "epoch": 1.3008898776418243, + "grad_norm": 8.7109375, + "learning_rate": 8.699110122358177e-06, + "loss": 2.9677, + "mean_token_accuracy": 0.4471116816431322, + "step": 7017 + }, + { + "epoch": 1.3010752688172043, + "grad_norm": 5.9921875, + "learning_rate": 8.698924731182796e-06, + "loss": 3.0336, + "mean_token_accuracy": 0.4657353426465735, + "step": 7018 + }, + { + "epoch": 1.3012606599925842, + "grad_norm": 6.91796875, + "learning_rate": 8.698739340007416e-06, + "loss": 2.6471, + "mean_token_accuracy": 0.46966785616250345, + "step": 7019 + }, + { + "epoch": 1.3014460511679644, + "grad_norm": 7.12109375, + "learning_rate": 8.698553948832036e-06, + "loss": 2.9234, + "mean_token_accuracy": 0.46406968303895474, + "step": 7020 + }, + { + "epoch": 1.3016314423433444, + "grad_norm": 10.453125, + "learning_rate": 8.698368557656657e-06, + "loss": 2.5199, + "mean_token_accuracy": 0.48504273504273504, + "step": 7021 + }, + { + "epoch": 1.3018168335187246, + "grad_norm": 6.11328125, + "learning_rate": 8.698183166481276e-06, + "loss": 2.7721, + "mean_token_accuracy": 0.45688172043010755, + "step": 7022 + }, + { + "epoch": 1.3020022246941045, + "grad_norm": 5.98828125, + "learning_rate": 8.697997775305896e-06, + "loss": 2.9321, + "mean_token_accuracy": 0.45676511355155136, + "step": 7023 + }, + { + "epoch": 1.3021876158694847, + "grad_norm": 7.24609375, + "learning_rate": 8.697812384130517e-06, + "loss": 3.0765, + "mean_token_accuracy": 0.4544716766758643, + "step": 7024 + }, + { + "epoch": 1.3023730070448647, + "grad_norm": 6.3203125, + "learning_rate": 8.697626992955136e-06, + "loss": 2.8539, + "mean_token_accuracy": 0.46716216216216216, + "step": 7025 + }, + { + "epoch": 1.3025583982202447, + "grad_norm": 6.3125, + "learning_rate": 8.697441601779756e-06, + "loss": 2.5717, + "mean_token_accuracy": 0.4861842950777535, + "step": 7026 + }, + { + "epoch": 1.3027437893956249, + "grad_norm": 6.28125, + "learning_rate": 8.697256210604375e-06, + "loss": 2.6859, + "mean_token_accuracy": 0.4808729139922978, + "step": 7027 + }, + { + "epoch": 1.3029291805710048, + "grad_norm": 6.32421875, + "learning_rate": 8.697070819428995e-06, + "loss": 3.0713, + "mean_token_accuracy": 0.43382937634897195, + "step": 7028 + }, + { + "epoch": 1.3031145717463848, + "grad_norm": 6.55859375, + "learning_rate": 8.696885428253616e-06, + "loss": 2.6334, + "mean_token_accuracy": 0.4744678233050326, + "step": 7029 + }, + { + "epoch": 1.303299962921765, + "grad_norm": 6.1015625, + "learning_rate": 8.696700037078236e-06, + "loss": 2.6485, + "mean_token_accuracy": 0.48314606741573035, + "step": 7030 + }, + { + "epoch": 1.303485354097145, + "grad_norm": 7.05859375, + "learning_rate": 8.696514645902857e-06, + "loss": 2.8046, + "mean_token_accuracy": 0.4669852848698277, + "step": 7031 + }, + { + "epoch": 1.303670745272525, + "grad_norm": 6.07421875, + "learning_rate": 8.696329254727476e-06, + "loss": 3.2299, + "mean_token_accuracy": 0.42602970019613334, + "step": 7032 + }, + { + "epoch": 1.303856136447905, + "grad_norm": 6.76171875, + "learning_rate": 8.696143863552096e-06, + "loss": 1.9862, + "mean_token_accuracy": 0.5550348152115694, + "step": 7033 + }, + { + "epoch": 1.304041527623285, + "grad_norm": 7.375, + "learning_rate": 8.695958472376715e-06, + "loss": 3.0631, + "mean_token_accuracy": 0.4619227857683573, + "step": 7034 + }, + { + "epoch": 1.3042269187986653, + "grad_norm": 10.578125, + "learning_rate": 8.695773081201335e-06, + "loss": 2.9494, + "mean_token_accuracy": 0.4373391139412004, + "step": 7035 + }, + { + "epoch": 1.3044123099740452, + "grad_norm": 6.0234375, + "learning_rate": 8.695587690025956e-06, + "loss": 2.9487, + "mean_token_accuracy": 0.436052854891502, + "step": 7036 + }, + { + "epoch": 1.3045977011494254, + "grad_norm": 6.8125, + "learning_rate": 8.695402298850576e-06, + "loss": 3.0095, + "mean_token_accuracy": 0.4550379198266522, + "step": 7037 + }, + { + "epoch": 1.3047830923248054, + "grad_norm": 14.0625, + "learning_rate": 8.695216907675195e-06, + "loss": 2.7531, + "mean_token_accuracy": 0.47096456692913385, + "step": 7038 + }, + { + "epoch": 1.3049684835001854, + "grad_norm": 9.46875, + "learning_rate": 8.695031516499816e-06, + "loss": 2.9083, + "mean_token_accuracy": 0.4520629266844761, + "step": 7039 + }, + { + "epoch": 1.3051538746755655, + "grad_norm": 6.63671875, + "learning_rate": 8.694846125324436e-06, + "loss": 2.4455, + "mean_token_accuracy": 0.5008966599417171, + "step": 7040 + }, + { + "epoch": 1.3053392658509455, + "grad_norm": 6.7421875, + "learning_rate": 8.694660734149055e-06, + "loss": 2.8823, + "mean_token_accuracy": 0.4670502659863207, + "step": 7041 + }, + { + "epoch": 1.3055246570263255, + "grad_norm": 5.37890625, + "learning_rate": 8.694475342973675e-06, + "loss": 3.0578, + "mean_token_accuracy": 0.4740466101694915, + "step": 7042 + }, + { + "epoch": 1.3057100482017057, + "grad_norm": 5.84375, + "learning_rate": 8.694289951798294e-06, + "loss": 2.9404, + "mean_token_accuracy": 0.44580670405634537, + "step": 7043 + }, + { + "epoch": 1.3058954393770856, + "grad_norm": 6.4609375, + "learning_rate": 8.694104560622915e-06, + "loss": 2.5324, + "mean_token_accuracy": 0.5004758883248731, + "step": 7044 + }, + { + "epoch": 1.3060808305524656, + "grad_norm": 5.73046875, + "learning_rate": 8.693919169447535e-06, + "loss": 2.6681, + "mean_token_accuracy": 0.47325981264081585, + "step": 7045 + }, + { + "epoch": 1.3062662217278458, + "grad_norm": 7.6484375, + "learning_rate": 8.693733778272156e-06, + "loss": 2.8457, + "mean_token_accuracy": 0.4764309764309764, + "step": 7046 + }, + { + "epoch": 1.3064516129032258, + "grad_norm": 9.125, + "learning_rate": 8.693548387096775e-06, + "loss": 2.8976, + "mean_token_accuracy": 0.45290970983952017, + "step": 7047 + }, + { + "epoch": 1.3066370040786057, + "grad_norm": 6.6015625, + "learning_rate": 8.693362995921395e-06, + "loss": 2.3853, + "mean_token_accuracy": 0.49202361512372816, + "step": 7048 + }, + { + "epoch": 1.306822395253986, + "grad_norm": 5.3515625, + "learning_rate": 8.693177604746015e-06, + "loss": 2.3149, + "mean_token_accuracy": 0.545260663507109, + "step": 7049 + }, + { + "epoch": 1.3070077864293659, + "grad_norm": 7.44921875, + "learning_rate": 8.692992213570634e-06, + "loss": 3.0918, + "mean_token_accuracy": 0.44785794813979707, + "step": 7050 + }, + { + "epoch": 1.307193177604746, + "grad_norm": 6.96484375, + "learning_rate": 8.692806822395255e-06, + "loss": 2.6991, + "mean_token_accuracy": 0.45714285714285713, + "step": 7051 + }, + { + "epoch": 1.307378568780126, + "grad_norm": 6.67578125, + "learning_rate": 8.692621431219874e-06, + "loss": 2.948, + "mean_token_accuracy": 0.46090239079633294, + "step": 7052 + }, + { + "epoch": 1.3075639599555062, + "grad_norm": 6.36328125, + "learning_rate": 8.692436040044494e-06, + "loss": 3.2028, + "mean_token_accuracy": 0.43376623376623374, + "step": 7053 + }, + { + "epoch": 1.3077493511308862, + "grad_norm": 5.96484375, + "learning_rate": 8.692250648869115e-06, + "loss": 2.636, + "mean_token_accuracy": 0.4822245168903968, + "step": 7054 + }, + { + "epoch": 1.3079347423062662, + "grad_norm": 6.953125, + "learning_rate": 8.692065257693735e-06, + "loss": 2.4221, + "mean_token_accuracy": 0.49278438030560273, + "step": 7055 + }, + { + "epoch": 1.3081201334816464, + "grad_norm": 5.7890625, + "learning_rate": 8.691879866518354e-06, + "loss": 2.7917, + "mean_token_accuracy": 0.4626759730273276, + "step": 7056 + }, + { + "epoch": 1.3083055246570263, + "grad_norm": 7.3203125, + "learning_rate": 8.691694475342974e-06, + "loss": 2.5592, + "mean_token_accuracy": 0.4687456494500905, + "step": 7057 + }, + { + "epoch": 1.3084909158324063, + "grad_norm": 5.83203125, + "learning_rate": 8.691509084167595e-06, + "loss": 2.4699, + "mean_token_accuracy": 0.5067580997813556, + "step": 7058 + }, + { + "epoch": 1.3086763070077865, + "grad_norm": 6.07421875, + "learning_rate": 8.691323692992214e-06, + "loss": 2.4696, + "mean_token_accuracy": 0.5194647201946472, + "step": 7059 + }, + { + "epoch": 1.3088616981831664, + "grad_norm": 8.515625, + "learning_rate": 8.691138301816834e-06, + "loss": 2.8756, + "mean_token_accuracy": 0.4609012175222794, + "step": 7060 + }, + { + "epoch": 1.3090470893585464, + "grad_norm": 10.203125, + "learning_rate": 8.690952910641453e-06, + "loss": 2.7576, + "mean_token_accuracy": 0.4730829831932773, + "step": 7061 + }, + { + "epoch": 1.3092324805339266, + "grad_norm": 6.390625, + "learning_rate": 8.690767519466075e-06, + "loss": 2.8571, + "mean_token_accuracy": 0.45482560582714665, + "step": 7062 + }, + { + "epoch": 1.3094178717093066, + "grad_norm": 6.96875, + "learning_rate": 8.690582128290694e-06, + "loss": 2.8342, + "mean_token_accuracy": 0.4501955671447197, + "step": 7063 + }, + { + "epoch": 1.3096032628846868, + "grad_norm": 9.1796875, + "learning_rate": 8.690396737115314e-06, + "loss": 2.6845, + "mean_token_accuracy": 0.4937195590874135, + "step": 7064 + }, + { + "epoch": 1.3097886540600667, + "grad_norm": 7.8515625, + "learning_rate": 8.690211345939933e-06, + "loss": 2.8854, + "mean_token_accuracy": 0.4691790826760152, + "step": 7065 + }, + { + "epoch": 1.309974045235447, + "grad_norm": 6.90625, + "learning_rate": 8.690025954764554e-06, + "loss": 2.1812, + "mean_token_accuracy": 0.527256009457507, + "step": 7066 + }, + { + "epoch": 1.3101594364108269, + "grad_norm": 8.4296875, + "learning_rate": 8.689840563589174e-06, + "loss": 2.7556, + "mean_token_accuracy": 0.4637736938323872, + "step": 7067 + }, + { + "epoch": 1.3103448275862069, + "grad_norm": 8.53125, + "learning_rate": 8.689655172413793e-06, + "loss": 2.6931, + "mean_token_accuracy": 0.4608634111818825, + "step": 7068 + }, + { + "epoch": 1.310530218761587, + "grad_norm": 7.7421875, + "learning_rate": 8.689469781238413e-06, + "loss": 2.6171, + "mean_token_accuracy": 0.4862726291811533, + "step": 7069 + }, + { + "epoch": 1.310715609936967, + "grad_norm": 5.98828125, + "learning_rate": 8.689284390063034e-06, + "loss": 3.0025, + "mean_token_accuracy": 0.4476728174056734, + "step": 7070 + }, + { + "epoch": 1.310901001112347, + "grad_norm": 8.7421875, + "learning_rate": 8.689098998887654e-06, + "loss": 2.9269, + "mean_token_accuracy": 0.4602696688048174, + "step": 7071 + }, + { + "epoch": 1.3110863922877272, + "grad_norm": 9.1953125, + "learning_rate": 8.688913607712273e-06, + "loss": 3.3894, + "mean_token_accuracy": 0.4304994954591322, + "step": 7072 + }, + { + "epoch": 1.3112717834631071, + "grad_norm": 8.5859375, + "learning_rate": 8.688728216536894e-06, + "loss": 2.5144, + "mean_token_accuracy": 0.49756750182437365, + "step": 7073 + }, + { + "epoch": 1.311457174638487, + "grad_norm": 9.2109375, + "learning_rate": 8.688542825361514e-06, + "loss": 2.8469, + "mean_token_accuracy": 0.46040970507046347, + "step": 7074 + }, + { + "epoch": 1.3116425658138673, + "grad_norm": 9.03125, + "learning_rate": 8.688357434186133e-06, + "loss": 2.8453, + "mean_token_accuracy": 0.49150704734369355, + "step": 7075 + }, + { + "epoch": 1.3118279569892473, + "grad_norm": 6.7265625, + "learning_rate": 8.688172043010754e-06, + "loss": 2.8478, + "mean_token_accuracy": 0.44589444508603765, + "step": 7076 + }, + { + "epoch": 1.3120133481646274, + "grad_norm": 8.6875, + "learning_rate": 8.687986651835372e-06, + "loss": 2.9035, + "mean_token_accuracy": 0.4370629370629371, + "step": 7077 + }, + { + "epoch": 1.3121987393400074, + "grad_norm": 10.453125, + "learning_rate": 8.687801260659994e-06, + "loss": 3.0436, + "mean_token_accuracy": 0.4145097539206936, + "step": 7078 + }, + { + "epoch": 1.3123841305153876, + "grad_norm": 7.1015625, + "learning_rate": 8.687615869484613e-06, + "loss": 2.5394, + "mean_token_accuracy": 0.4783068783068783, + "step": 7079 + }, + { + "epoch": 1.3125695216907676, + "grad_norm": 5.9296875, + "learning_rate": 8.687430478309234e-06, + "loss": 3.0164, + "mean_token_accuracy": 0.44342993251290197, + "step": 7080 + }, + { + "epoch": 1.3127549128661475, + "grad_norm": 7.6875, + "learning_rate": 8.687245087133853e-06, + "loss": 2.9231, + "mean_token_accuracy": 0.4477571115973742, + "step": 7081 + }, + { + "epoch": 1.3129403040415277, + "grad_norm": 6.08203125, + "learning_rate": 8.687059695958473e-06, + "loss": 3.1596, + "mean_token_accuracy": 0.4309063893016345, + "step": 7082 + }, + { + "epoch": 1.3131256952169077, + "grad_norm": 5.80859375, + "learning_rate": 8.686874304783094e-06, + "loss": 2.92, + "mean_token_accuracy": 0.4457390597480016, + "step": 7083 + }, + { + "epoch": 1.3133110863922877, + "grad_norm": 6.046875, + "learning_rate": 8.686688913607712e-06, + "loss": 3.0056, + "mean_token_accuracy": 0.4255860683188212, + "step": 7084 + }, + { + "epoch": 1.3134964775676679, + "grad_norm": 6.52734375, + "learning_rate": 8.686503522432333e-06, + "loss": 3.5776, + "mean_token_accuracy": 0.43008314436885864, + "step": 7085 + }, + { + "epoch": 1.3136818687430478, + "grad_norm": 6.6796875, + "learning_rate": 8.686318131256953e-06, + "loss": 2.9636, + "mean_token_accuracy": 0.45851431117050895, + "step": 7086 + }, + { + "epoch": 1.3138672599184278, + "grad_norm": 6.14453125, + "learning_rate": 8.686132740081574e-06, + "loss": 3.172, + "mean_token_accuracy": 0.4369565217391304, + "step": 7087 + }, + { + "epoch": 1.314052651093808, + "grad_norm": 7.0859375, + "learning_rate": 8.685947348906193e-06, + "loss": 2.3996, + "mean_token_accuracy": 0.5025900493916395, + "step": 7088 + }, + { + "epoch": 1.314238042269188, + "grad_norm": 5.80859375, + "learning_rate": 8.685761957730813e-06, + "loss": 3.1272, + "mean_token_accuracy": 0.43207100591715975, + "step": 7089 + }, + { + "epoch": 1.314423433444568, + "grad_norm": 7.5625, + "learning_rate": 8.685576566555432e-06, + "loss": 2.6418, + "mean_token_accuracy": 0.45469155643466436, + "step": 7090 + }, + { + "epoch": 1.314608824619948, + "grad_norm": 7.90234375, + "learning_rate": 8.685391175380052e-06, + "loss": 2.761, + "mean_token_accuracy": 0.4907859078590786, + "step": 7091 + }, + { + "epoch": 1.314794215795328, + "grad_norm": 5.2265625, + "learning_rate": 8.685205784204673e-06, + "loss": 2.3342, + "mean_token_accuracy": 0.5463866584311303, + "step": 7092 + }, + { + "epoch": 1.3149796069707083, + "grad_norm": 6.453125, + "learning_rate": 8.685020393029292e-06, + "loss": 2.6479, + "mean_token_accuracy": 0.4678931333489571, + "step": 7093 + }, + { + "epoch": 1.3151649981460882, + "grad_norm": 7.8828125, + "learning_rate": 8.684835001853912e-06, + "loss": 2.7444, + "mean_token_accuracy": 0.48464007336084364, + "step": 7094 + }, + { + "epoch": 1.3153503893214684, + "grad_norm": 7.83984375, + "learning_rate": 8.684649610678533e-06, + "loss": 3.172, + "mean_token_accuracy": 0.4223687484448868, + "step": 7095 + }, + { + "epoch": 1.3155357804968484, + "grad_norm": 6.10546875, + "learning_rate": 8.684464219503153e-06, + "loss": 3.267, + "mean_token_accuracy": 0.4257607926397735, + "step": 7096 + }, + { + "epoch": 1.3157211716722284, + "grad_norm": 8.6015625, + "learning_rate": 8.684278828327772e-06, + "loss": 3.1144, + "mean_token_accuracy": 0.4262607040913416, + "step": 7097 + }, + { + "epoch": 1.3159065628476085, + "grad_norm": 8.1484375, + "learning_rate": 8.684093437152392e-06, + "loss": 3.1164, + "mean_token_accuracy": 0.3965156794425087, + "step": 7098 + }, + { + "epoch": 1.3160919540229885, + "grad_norm": 7.15234375, + "learning_rate": 8.683908045977011e-06, + "loss": 2.461, + "mean_token_accuracy": 0.5323812686815012, + "step": 7099 + }, + { + "epoch": 1.3162773451983685, + "grad_norm": 6.2734375, + "learning_rate": 8.683722654801632e-06, + "loss": 2.5135, + "mean_token_accuracy": 0.4849141265666099, + "step": 7100 + }, + { + "epoch": 1.3164627363737487, + "grad_norm": 8.6328125, + "learning_rate": 8.683537263626252e-06, + "loss": 2.3052, + "mean_token_accuracy": 0.5043419267299865, + "step": 7101 + }, + { + "epoch": 1.3166481275491286, + "grad_norm": 5.0, + "learning_rate": 8.683351872450873e-06, + "loss": 2.7761, + "mean_token_accuracy": 0.46165560752746654, + "step": 7102 + }, + { + "epoch": 1.3168335187245086, + "grad_norm": 5.16015625, + "learning_rate": 8.683166481275492e-06, + "loss": 2.8358, + "mean_token_accuracy": 0.4650896604349485, + "step": 7103 + }, + { + "epoch": 1.3170189098998888, + "grad_norm": 5.0859375, + "learning_rate": 8.682981090100112e-06, + "loss": 2.0835, + "mean_token_accuracy": 0.5486594891281402, + "step": 7104 + }, + { + "epoch": 1.3172043010752688, + "grad_norm": 5.83984375, + "learning_rate": 8.682795698924733e-06, + "loss": 2.8427, + "mean_token_accuracy": 0.44196540486337427, + "step": 7105 + }, + { + "epoch": 1.317389692250649, + "grad_norm": 5.7421875, + "learning_rate": 8.682610307749351e-06, + "loss": 2.4071, + "mean_token_accuracy": 0.5011117287381879, + "step": 7106 + }, + { + "epoch": 1.317575083426029, + "grad_norm": 7.87890625, + "learning_rate": 8.682424916573972e-06, + "loss": 2.7258, + "mean_token_accuracy": 0.49034194727225267, + "step": 7107 + }, + { + "epoch": 1.317760474601409, + "grad_norm": 5.703125, + "learning_rate": 8.68223952539859e-06, + "loss": 2.9129, + "mean_token_accuracy": 0.44031040714361647, + "step": 7108 + }, + { + "epoch": 1.317945865776789, + "grad_norm": 5.203125, + "learning_rate": 8.682054134223211e-06, + "loss": 2.7827, + "mean_token_accuracy": 0.47100110823790176, + "step": 7109 + }, + { + "epoch": 1.318131256952169, + "grad_norm": 6.5078125, + "learning_rate": 8.681868743047832e-06, + "loss": 2.7137, + "mean_token_accuracy": 0.4750436735712503, + "step": 7110 + }, + { + "epoch": 1.3183166481275492, + "grad_norm": 5.38671875, + "learning_rate": 8.681683351872452e-06, + "loss": 2.8891, + "mean_token_accuracy": 0.4566747728145399, + "step": 7111 + }, + { + "epoch": 1.3185020393029292, + "grad_norm": 6.7734375, + "learning_rate": 8.681497960697073e-06, + "loss": 3.0353, + "mean_token_accuracy": 0.44500917690244246, + "step": 7112 + }, + { + "epoch": 1.3186874304783092, + "grad_norm": 6.31640625, + "learning_rate": 8.681312569521691e-06, + "loss": 2.8088, + "mean_token_accuracy": 0.4563767000256608, + "step": 7113 + }, + { + "epoch": 1.3188728216536894, + "grad_norm": 5.7109375, + "learning_rate": 8.681127178346312e-06, + "loss": 3.1201, + "mean_token_accuracy": 0.4438229972034874, + "step": 7114 + }, + { + "epoch": 1.3190582128290693, + "grad_norm": 6.02734375, + "learning_rate": 8.68094178717093e-06, + "loss": 2.4667, + "mean_token_accuracy": 0.49524475524475525, + "step": 7115 + }, + { + "epoch": 1.3192436040044493, + "grad_norm": 6.71484375, + "learning_rate": 8.680756395995551e-06, + "loss": 3.5577, + "mean_token_accuracy": 0.4018622886547415, + "step": 7116 + }, + { + "epoch": 1.3194289951798295, + "grad_norm": 6.484375, + "learning_rate": 8.68057100482017e-06, + "loss": 3.3033, + "mean_token_accuracy": 0.4118141097424412, + "step": 7117 + }, + { + "epoch": 1.3196143863552094, + "grad_norm": 7.04296875, + "learning_rate": 8.680385613644792e-06, + "loss": 2.4446, + "mean_token_accuracy": 0.4937671667018804, + "step": 7118 + }, + { + "epoch": 1.3197997775305894, + "grad_norm": 6.00390625, + "learning_rate": 8.680200222469411e-06, + "loss": 3.0815, + "mean_token_accuracy": 0.42424242424242425, + "step": 7119 + }, + { + "epoch": 1.3199851687059696, + "grad_norm": 6.30078125, + "learning_rate": 8.680014831294031e-06, + "loss": 3.0184, + "mean_token_accuracy": 0.434826883910387, + "step": 7120 + }, + { + "epoch": 1.3201705598813496, + "grad_norm": 9.3125, + "learning_rate": 8.679829440118652e-06, + "loss": 2.3845, + "mean_token_accuracy": 0.4953023367863166, + "step": 7121 + }, + { + "epoch": 1.3203559510567298, + "grad_norm": 6.83203125, + "learning_rate": 8.67964404894327e-06, + "loss": 2.9299, + "mean_token_accuracy": 0.45575332163449483, + "step": 7122 + }, + { + "epoch": 1.3205413422321097, + "grad_norm": 6.53125, + "learning_rate": 8.679458657767891e-06, + "loss": 3.7051, + "mean_token_accuracy": 0.3856749311294766, + "step": 7123 + }, + { + "epoch": 1.32072673340749, + "grad_norm": 7.6328125, + "learning_rate": 8.67927326659251e-06, + "loss": 3.2865, + "mean_token_accuracy": 0.4438340320948388, + "step": 7124 + }, + { + "epoch": 1.3209121245828699, + "grad_norm": 6.9765625, + "learning_rate": 8.67908787541713e-06, + "loss": 2.5585, + "mean_token_accuracy": 0.52065999153857, + "step": 7125 + }, + { + "epoch": 1.3210975157582499, + "grad_norm": 5.98046875, + "learning_rate": 8.678902484241751e-06, + "loss": 3.0205, + "mean_token_accuracy": 0.4617616033755274, + "step": 7126 + }, + { + "epoch": 1.32128290693363, + "grad_norm": 7.30859375, + "learning_rate": 8.678717093066371e-06, + "loss": 2.9563, + "mean_token_accuracy": 0.4740767913915383, + "step": 7127 + }, + { + "epoch": 1.32146829810901, + "grad_norm": 7.21875, + "learning_rate": 8.67853170189099e-06, + "loss": 3.2205, + "mean_token_accuracy": 0.4268053148469093, + "step": 7128 + }, + { + "epoch": 1.32165368928439, + "grad_norm": 7.7890625, + "learning_rate": 8.67834631071561e-06, + "loss": 3.1278, + "mean_token_accuracy": 0.43245812047047644, + "step": 7129 + }, + { + "epoch": 1.3218390804597702, + "grad_norm": 5.71875, + "learning_rate": 8.678160919540231e-06, + "loss": 2.4728, + "mean_token_accuracy": 0.5037855281122006, + "step": 7130 + }, + { + "epoch": 1.3220244716351501, + "grad_norm": 6.48828125, + "learning_rate": 8.67797552836485e-06, + "loss": 2.9364, + "mean_token_accuracy": 0.44659546061415223, + "step": 7131 + }, + { + "epoch": 1.32220986281053, + "grad_norm": 6.9765625, + "learning_rate": 8.67779013718947e-06, + "loss": 3.1549, + "mean_token_accuracy": 0.43215434083601284, + "step": 7132 + }, + { + "epoch": 1.3223952539859103, + "grad_norm": 5.73046875, + "learning_rate": 8.67760474601409e-06, + "loss": 3.2268, + "mean_token_accuracy": 0.4274521621214404, + "step": 7133 + }, + { + "epoch": 1.3225806451612903, + "grad_norm": 5.53515625, + "learning_rate": 8.677419354838712e-06, + "loss": 2.9665, + "mean_token_accuracy": 0.4478368756068616, + "step": 7134 + }, + { + "epoch": 1.3227660363366704, + "grad_norm": 5.921875, + "learning_rate": 8.67723396366333e-06, + "loss": 2.5995, + "mean_token_accuracy": 0.4923810989687548, + "step": 7135 + }, + { + "epoch": 1.3229514275120504, + "grad_norm": 6.40625, + "learning_rate": 8.67704857248795e-06, + "loss": 2.2696, + "mean_token_accuracy": 0.5179230363732209, + "step": 7136 + }, + { + "epoch": 1.3231368186874306, + "grad_norm": 8.296875, + "learning_rate": 8.67686318131257e-06, + "loss": 2.1416, + "mean_token_accuracy": 0.5397967161845192, + "step": 7137 + }, + { + "epoch": 1.3233222098628106, + "grad_norm": 5.91796875, + "learning_rate": 8.67667779013719e-06, + "loss": 2.6511, + "mean_token_accuracy": 0.4809310889005786, + "step": 7138 + }, + { + "epoch": 1.3235076010381905, + "grad_norm": 7.83203125, + "learning_rate": 8.67649239896181e-06, + "loss": 2.5325, + "mean_token_accuracy": 0.4686435584051292, + "step": 7139 + }, + { + "epoch": 1.3236929922135707, + "grad_norm": 11.015625, + "learning_rate": 8.67630700778643e-06, + "loss": 2.3235, + "mean_token_accuracy": 0.5308602999210734, + "step": 7140 + }, + { + "epoch": 1.3238783833889507, + "grad_norm": 9.2421875, + "learning_rate": 8.67612161661105e-06, + "loss": 2.6769, + "mean_token_accuracy": 0.4757016632016632, + "step": 7141 + }, + { + "epoch": 1.3240637745643307, + "grad_norm": 6.37109375, + "learning_rate": 8.67593622543567e-06, + "loss": 3.2522, + "mean_token_accuracy": 0.4418165807019751, + "step": 7142 + }, + { + "epoch": 1.3242491657397109, + "grad_norm": 8.0390625, + "learning_rate": 8.675750834260291e-06, + "loss": 2.8933, + "mean_token_accuracy": 0.44660062423666713, + "step": 7143 + }, + { + "epoch": 1.3244345569150908, + "grad_norm": 7.9140625, + "learning_rate": 8.67556544308491e-06, + "loss": 2.7649, + "mean_token_accuracy": 0.47042177604096486, + "step": 7144 + }, + { + "epoch": 1.3246199480904708, + "grad_norm": 5.4375, + "learning_rate": 8.67538005190953e-06, + "loss": 2.3969, + "mean_token_accuracy": 0.505421293272371, + "step": 7145 + }, + { + "epoch": 1.324805339265851, + "grad_norm": 7.80078125, + "learning_rate": 8.675194660734149e-06, + "loss": 2.3275, + "mean_token_accuracy": 0.5078463607831415, + "step": 7146 + }, + { + "epoch": 1.324990730441231, + "grad_norm": 6.625, + "learning_rate": 8.67500926955877e-06, + "loss": 2.827, + "mean_token_accuracy": 0.45971622455274525, + "step": 7147 + }, + { + "epoch": 1.325176121616611, + "grad_norm": 6.3125, + "learning_rate": 8.67482387838339e-06, + "loss": 2.9196, + "mean_token_accuracy": 0.4626025791324736, + "step": 7148 + }, + { + "epoch": 1.325361512791991, + "grad_norm": 6.3515625, + "learning_rate": 8.674638487208009e-06, + "loss": 3.1549, + "mean_token_accuracy": 0.4340051522958024, + "step": 7149 + }, + { + "epoch": 1.325546903967371, + "grad_norm": 7.88671875, + "learning_rate": 8.674453096032631e-06, + "loss": 3.1961, + "mean_token_accuracy": 0.428740581270183, + "step": 7150 + }, + { + "epoch": 1.3257322951427513, + "grad_norm": 8.2265625, + "learning_rate": 8.67426770485725e-06, + "loss": 2.1752, + "mean_token_accuracy": 0.5267996260517295, + "step": 7151 + }, + { + "epoch": 1.3259176863181312, + "grad_norm": 7.5390625, + "learning_rate": 8.67408231368187e-06, + "loss": 2.3468, + "mean_token_accuracy": 0.4941696823482107, + "step": 7152 + }, + { + "epoch": 1.3261030774935114, + "grad_norm": 8.578125, + "learning_rate": 8.673896922506489e-06, + "loss": 2.67, + "mean_token_accuracy": 0.47132390096008087, + "step": 7153 + }, + { + "epoch": 1.3262884686688914, + "grad_norm": 14.2890625, + "learning_rate": 8.67371153133111e-06, + "loss": 3.2165, + "mean_token_accuracy": 0.41689866369710465, + "step": 7154 + }, + { + "epoch": 1.3264738598442714, + "grad_norm": 6.70703125, + "learning_rate": 8.67352614015573e-06, + "loss": 2.8587, + "mean_token_accuracy": 0.44610169491525425, + "step": 7155 + }, + { + "epoch": 1.3266592510196515, + "grad_norm": 7.52734375, + "learning_rate": 8.673340748980349e-06, + "loss": 2.96, + "mean_token_accuracy": 0.4502058672156459, + "step": 7156 + }, + { + "epoch": 1.3268446421950315, + "grad_norm": 9.4921875, + "learning_rate": 8.67315535780497e-06, + "loss": 3.2149, + "mean_token_accuracy": 0.4133034379671151, + "step": 7157 + }, + { + "epoch": 1.3270300333704115, + "grad_norm": 10.34375, + "learning_rate": 8.67296996662959e-06, + "loss": 2.4036, + "mean_token_accuracy": 0.4868845560387151, + "step": 7158 + }, + { + "epoch": 1.3272154245457917, + "grad_norm": 5.015625, + "learning_rate": 8.67278457545421e-06, + "loss": 2.4743, + "mean_token_accuracy": 0.4812383900928793, + "step": 7159 + }, + { + "epoch": 1.3274008157211716, + "grad_norm": 7.53125, + "learning_rate": 8.672599184278829e-06, + "loss": 2.7616, + "mean_token_accuracy": 0.47458807147830123, + "step": 7160 + }, + { + "epoch": 1.3275862068965516, + "grad_norm": 11.96875, + "learning_rate": 8.67241379310345e-06, + "loss": 2.4532, + "mean_token_accuracy": 0.4858880778588808, + "step": 7161 + }, + { + "epoch": 1.3277715980719318, + "grad_norm": 10.2265625, + "learning_rate": 8.672228401928068e-06, + "loss": 3.1639, + "mean_token_accuracy": 0.4442881700265667, + "step": 7162 + }, + { + "epoch": 1.3279569892473118, + "grad_norm": 5.87890625, + "learning_rate": 8.672043010752689e-06, + "loss": 2.5357, + "mean_token_accuracy": 0.4957315862616637, + "step": 7163 + }, + { + "epoch": 1.328142380422692, + "grad_norm": 6.37109375, + "learning_rate": 8.67185761957731e-06, + "loss": 2.5916, + "mean_token_accuracy": 0.5005800464037123, + "step": 7164 + }, + { + "epoch": 1.328327771598072, + "grad_norm": 8.234375, + "learning_rate": 8.671672228401928e-06, + "loss": 2.9886, + "mean_token_accuracy": 0.4495830174374526, + "step": 7165 + }, + { + "epoch": 1.328513162773452, + "grad_norm": 8.2734375, + "learning_rate": 8.671486837226549e-06, + "loss": 3.0486, + "mean_token_accuracy": 0.41974248927038627, + "step": 7166 + }, + { + "epoch": 1.328698553948832, + "grad_norm": 7.390625, + "learning_rate": 8.671301446051169e-06, + "loss": 3.4328, + "mean_token_accuracy": 0.41117850953206236, + "step": 7167 + }, + { + "epoch": 1.328883945124212, + "grad_norm": 5.984375, + "learning_rate": 8.67111605487579e-06, + "loss": 2.859, + "mean_token_accuracy": 0.44726350126857556, + "step": 7168 + }, + { + "epoch": 1.3290693362995922, + "grad_norm": 7.38671875, + "learning_rate": 8.670930663700408e-06, + "loss": 3.1067, + "mean_token_accuracy": 0.44147784673113005, + "step": 7169 + }, + { + "epoch": 1.3292547274749722, + "grad_norm": 6.02734375, + "learning_rate": 8.670745272525029e-06, + "loss": 2.5338, + "mean_token_accuracy": 0.4917005950516755, + "step": 7170 + }, + { + "epoch": 1.3294401186503522, + "grad_norm": 6.390625, + "learning_rate": 8.670559881349648e-06, + "loss": 2.4196, + "mean_token_accuracy": 0.49652402286420516, + "step": 7171 + }, + { + "epoch": 1.3296255098257324, + "grad_norm": 6.10546875, + "learning_rate": 8.670374490174268e-06, + "loss": 2.5324, + "mean_token_accuracy": 0.48323496169785257, + "step": 7172 + }, + { + "epoch": 1.3298109010011123, + "grad_norm": 5.640625, + "learning_rate": 8.670189098998889e-06, + "loss": 2.7242, + "mean_token_accuracy": 0.4735545335085414, + "step": 7173 + }, + { + "epoch": 1.3299962921764923, + "grad_norm": 7.7734375, + "learning_rate": 8.670003707823509e-06, + "loss": 3.1069, + "mean_token_accuracy": 0.43765508684863524, + "step": 7174 + }, + { + "epoch": 1.3301816833518725, + "grad_norm": 6.04296875, + "learning_rate": 8.669818316648128e-06, + "loss": 2.9094, + "mean_token_accuracy": 0.456682629516761, + "step": 7175 + }, + { + "epoch": 1.3303670745272524, + "grad_norm": 6.375, + "learning_rate": 8.669632925472748e-06, + "loss": 3.0595, + "mean_token_accuracy": 0.41973718113888175, + "step": 7176 + }, + { + "epoch": 1.3305524657026326, + "grad_norm": 6.02734375, + "learning_rate": 8.669447534297369e-06, + "loss": 2.7826, + "mean_token_accuracy": 0.451067615658363, + "step": 7177 + }, + { + "epoch": 1.3307378568780126, + "grad_norm": 6.5625, + "learning_rate": 8.669262143121988e-06, + "loss": 3.3065, + "mean_token_accuracy": 0.45878861459598, + "step": 7178 + }, + { + "epoch": 1.3309232480533928, + "grad_norm": 5.8359375, + "learning_rate": 8.669076751946608e-06, + "loss": 2.473, + "mean_token_accuracy": 0.5121262805770437, + "step": 7179 + }, + { + "epoch": 1.3311086392287728, + "grad_norm": 5.75, + "learning_rate": 8.668891360771227e-06, + "loss": 2.5927, + "mean_token_accuracy": 0.5213785213785214, + "step": 7180 + }, + { + "epoch": 1.3312940304041527, + "grad_norm": 5.640625, + "learning_rate": 8.668705969595848e-06, + "loss": 2.5694, + "mean_token_accuracy": 0.47198022133270545, + "step": 7181 + }, + { + "epoch": 1.331479421579533, + "grad_norm": 5.921875, + "learning_rate": 8.668520578420468e-06, + "loss": 3.4492, + "mean_token_accuracy": 0.41316685584562995, + "step": 7182 + }, + { + "epoch": 1.3316648127549129, + "grad_norm": 8.75, + "learning_rate": 8.668335187245088e-06, + "loss": 2.1682, + "mean_token_accuracy": 0.532287403365166, + "step": 7183 + }, + { + "epoch": 1.3318502039302929, + "grad_norm": 6.15234375, + "learning_rate": 8.668149796069707e-06, + "loss": 2.3086, + "mean_token_accuracy": 0.5148763793825953, + "step": 7184 + }, + { + "epoch": 1.332035595105673, + "grad_norm": 5.67578125, + "learning_rate": 8.667964404894328e-06, + "loss": 2.8262, + "mean_token_accuracy": 0.47252481239409344, + "step": 7185 + }, + { + "epoch": 1.332220986281053, + "grad_norm": 9.046875, + "learning_rate": 8.667779013718948e-06, + "loss": 3.2013, + "mean_token_accuracy": 0.4354686020826759, + "step": 7186 + }, + { + "epoch": 1.332406377456433, + "grad_norm": 6.72265625, + "learning_rate": 8.667593622543567e-06, + "loss": 2.7543, + "mean_token_accuracy": 0.46911455345190284, + "step": 7187 + }, + { + "epoch": 1.3325917686318132, + "grad_norm": 6.21875, + "learning_rate": 8.667408231368188e-06, + "loss": 3.0154, + "mean_token_accuracy": 0.4423076923076923, + "step": 7188 + }, + { + "epoch": 1.3327771598071931, + "grad_norm": 7.01171875, + "learning_rate": 8.667222840192806e-06, + "loss": 2.873, + "mean_token_accuracy": 0.47605692323673005, + "step": 7189 + }, + { + "epoch": 1.332962550982573, + "grad_norm": 7.5234375, + "learning_rate": 8.667037449017427e-06, + "loss": 3.7216, + "mean_token_accuracy": 0.3854370660494926, + "step": 7190 + }, + { + "epoch": 1.3331479421579533, + "grad_norm": 6.4296875, + "learning_rate": 8.666852057842047e-06, + "loss": 3.8675, + "mean_token_accuracy": 0.4056393076493579, + "step": 7191 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 8.96875, + "learning_rate": 8.666666666666668e-06, + "loss": 3.2868, + "mean_token_accuracy": 0.42080507547582585, + "step": 7192 + }, + { + "epoch": 1.3335187245087134, + "grad_norm": 7.83984375, + "learning_rate": 8.666481275491288e-06, + "loss": 2.5227, + "mean_token_accuracy": 0.489426907753601, + "step": 7193 + }, + { + "epoch": 1.3337041156840934, + "grad_norm": 5.8359375, + "learning_rate": 8.666295884315907e-06, + "loss": 2.5026, + "mean_token_accuracy": 0.4859406952965235, + "step": 7194 + }, + { + "epoch": 1.3338895068594736, + "grad_norm": 7.6171875, + "learning_rate": 8.666110493140528e-06, + "loss": 2.9627, + "mean_token_accuracy": 0.4659868026394721, + "step": 7195 + }, + { + "epoch": 1.3340748980348536, + "grad_norm": 5.46484375, + "learning_rate": 8.665925101965146e-06, + "loss": 2.4394, + "mean_token_accuracy": 0.5033815143880122, + "step": 7196 + }, + { + "epoch": 1.3342602892102335, + "grad_norm": 6.40625, + "learning_rate": 8.665739710789767e-06, + "loss": 3.1604, + "mean_token_accuracy": 0.4410924102889806, + "step": 7197 + }, + { + "epoch": 1.3344456803856137, + "grad_norm": 6.17578125, + "learning_rate": 8.665554319614386e-06, + "loss": 3.0865, + "mean_token_accuracy": 0.4511842105263158, + "step": 7198 + }, + { + "epoch": 1.3346310715609937, + "grad_norm": 6.13671875, + "learning_rate": 8.665368928439008e-06, + "loss": 2.675, + "mean_token_accuracy": 0.4800531914893617, + "step": 7199 + }, + { + "epoch": 1.3348164627363737, + "grad_norm": 5.9765625, + "learning_rate": 8.665183537263627e-06, + "loss": 2.8052, + "mean_token_accuracy": 0.44724845327254964, + "step": 7200 + }, + { + "epoch": 1.3350018539117539, + "grad_norm": 6.69140625, + "learning_rate": 8.664998146088247e-06, + "loss": 2.8431, + "mean_token_accuracy": 0.46218887697987376, + "step": 7201 + }, + { + "epoch": 1.3351872450871338, + "grad_norm": 8.1953125, + "learning_rate": 8.664812754912868e-06, + "loss": 2.6405, + "mean_token_accuracy": 0.4659643435980551, + "step": 7202 + }, + { + "epoch": 1.3353726362625138, + "grad_norm": 7.140625, + "learning_rate": 8.664627363737486e-06, + "loss": 2.608, + "mean_token_accuracy": 0.48135311243587964, + "step": 7203 + }, + { + "epoch": 1.335558027437894, + "grad_norm": 7.0, + "learning_rate": 8.664441972562107e-06, + "loss": 3.9419, + "mean_token_accuracy": 0.36348722756034685, + "step": 7204 + }, + { + "epoch": 1.335743418613274, + "grad_norm": 8.984375, + "learning_rate": 8.664256581386726e-06, + "loss": 2.172, + "mean_token_accuracy": 0.5345589113047631, + "step": 7205 + }, + { + "epoch": 1.3359288097886541, + "grad_norm": 5.44921875, + "learning_rate": 8.664071190211346e-06, + "loss": 3.3444, + "mean_token_accuracy": 0.41488381615313746, + "step": 7206 + }, + { + "epoch": 1.336114200964034, + "grad_norm": 7.0, + "learning_rate": 8.663885799035967e-06, + "loss": 2.6724, + "mean_token_accuracy": 0.4723557692307692, + "step": 7207 + }, + { + "epoch": 1.3362995921394143, + "grad_norm": 6.16015625, + "learning_rate": 8.663700407860587e-06, + "loss": 2.2136, + "mean_token_accuracy": 0.5317876932050161, + "step": 7208 + }, + { + "epoch": 1.3364849833147943, + "grad_norm": 5.1875, + "learning_rate": 8.663515016685206e-06, + "loss": 2.7921, + "mean_token_accuracy": 0.4575627798325871, + "step": 7209 + }, + { + "epoch": 1.3366703744901742, + "grad_norm": 6.66796875, + "learning_rate": 8.663329625509827e-06, + "loss": 3.0123, + "mean_token_accuracy": 0.4531194716358061, + "step": 7210 + }, + { + "epoch": 1.3368557656655544, + "grad_norm": 8.4140625, + "learning_rate": 8.663144234334447e-06, + "loss": 3.2027, + "mean_token_accuracy": 0.4120079644662276, + "step": 7211 + }, + { + "epoch": 1.3370411568409344, + "grad_norm": 6.88671875, + "learning_rate": 8.662958843159066e-06, + "loss": 3.1639, + "mean_token_accuracy": 0.4173508907823393, + "step": 7212 + }, + { + "epoch": 1.3372265480163144, + "grad_norm": 6.2734375, + "learning_rate": 8.662773451983686e-06, + "loss": 2.8919, + "mean_token_accuracy": 0.47506600176004693, + "step": 7213 + }, + { + "epoch": 1.3374119391916945, + "grad_norm": 5.98046875, + "learning_rate": 8.662588060808305e-06, + "loss": 3.2255, + "mean_token_accuracy": 0.4355846042120552, + "step": 7214 + }, + { + "epoch": 1.3375973303670745, + "grad_norm": 5.890625, + "learning_rate": 8.662402669632927e-06, + "loss": 2.7012, + "mean_token_accuracy": 0.4690909090909091, + "step": 7215 + }, + { + "epoch": 1.3377827215424545, + "grad_norm": 6.50390625, + "learning_rate": 8.662217278457546e-06, + "loss": 2.6946, + "mean_token_accuracy": 0.48770069229636176, + "step": 7216 + }, + { + "epoch": 1.3379681127178347, + "grad_norm": 9.9921875, + "learning_rate": 8.662031887282167e-06, + "loss": 2.8079, + "mean_token_accuracy": 0.4461745482506728, + "step": 7217 + }, + { + "epoch": 1.3381535038932146, + "grad_norm": 6.9140625, + "learning_rate": 8.661846496106785e-06, + "loss": 2.6653, + "mean_token_accuracy": 0.4593780135004822, + "step": 7218 + }, + { + "epoch": 1.3383388950685946, + "grad_norm": 5.7421875, + "learning_rate": 8.661661104931406e-06, + "loss": 2.8977, + "mean_token_accuracy": 0.44960538232630354, + "step": 7219 + }, + { + "epoch": 1.3385242862439748, + "grad_norm": 6.06640625, + "learning_rate": 8.661475713756026e-06, + "loss": 2.3178, + "mean_token_accuracy": 0.5185917721518988, + "step": 7220 + }, + { + "epoch": 1.3387096774193548, + "grad_norm": 8.796875, + "learning_rate": 8.661290322580645e-06, + "loss": 3.1137, + "mean_token_accuracy": 0.428271744392448, + "step": 7221 + }, + { + "epoch": 1.338895068594735, + "grad_norm": 6.49609375, + "learning_rate": 8.661104931405266e-06, + "loss": 2.9439, + "mean_token_accuracy": 0.46454767726161367, + "step": 7222 + }, + { + "epoch": 1.339080459770115, + "grad_norm": 5.9921875, + "learning_rate": 8.660919540229886e-06, + "loss": 3.6439, + "mean_token_accuracy": 0.3886658795749705, + "step": 7223 + }, + { + "epoch": 1.339265850945495, + "grad_norm": 7.2109375, + "learning_rate": 8.660734149054507e-06, + "loss": 3.0451, + "mean_token_accuracy": 0.4427671460834423, + "step": 7224 + }, + { + "epoch": 1.339451242120875, + "grad_norm": 5.83984375, + "learning_rate": 8.660548757879125e-06, + "loss": 3.1548, + "mean_token_accuracy": 0.4389814438575105, + "step": 7225 + }, + { + "epoch": 1.339636633296255, + "grad_norm": 7.9453125, + "learning_rate": 8.660363366703746e-06, + "loss": 2.7117, + "mean_token_accuracy": 0.465438919582566, + "step": 7226 + }, + { + "epoch": 1.3398220244716352, + "grad_norm": 5.4921875, + "learning_rate": 8.660177975528365e-06, + "loss": 2.9679, + "mean_token_accuracy": 0.46248048985472445, + "step": 7227 + }, + { + "epoch": 1.3400074156470152, + "grad_norm": 5.109375, + "learning_rate": 8.659992584352985e-06, + "loss": 2.1292, + "mean_token_accuracy": 0.5307682342272315, + "step": 7228 + }, + { + "epoch": 1.3401928068223952, + "grad_norm": 6.171875, + "learning_rate": 8.659807193177606e-06, + "loss": 2.857, + "mean_token_accuracy": 0.4735756229169973, + "step": 7229 + }, + { + "epoch": 1.3403781979977754, + "grad_norm": 5.77734375, + "learning_rate": 8.659621802002224e-06, + "loss": 2.5436, + "mean_token_accuracy": 0.47965274009766684, + "step": 7230 + }, + { + "epoch": 1.3405635891731553, + "grad_norm": 6.515625, + "learning_rate": 8.659436410826847e-06, + "loss": 2.6775, + "mean_token_accuracy": 0.4804823470909995, + "step": 7231 + }, + { + "epoch": 1.3407489803485353, + "grad_norm": 7.703125, + "learning_rate": 8.659251019651465e-06, + "loss": 2.7673, + "mean_token_accuracy": 0.47003704510786665, + "step": 7232 + }, + { + "epoch": 1.3409343715239155, + "grad_norm": 6.7578125, + "learning_rate": 8.659065628476086e-06, + "loss": 2.8615, + "mean_token_accuracy": 0.4645161290322581, + "step": 7233 + }, + { + "epoch": 1.3411197626992954, + "grad_norm": 8.015625, + "learning_rate": 8.658880237300705e-06, + "loss": 2.6794, + "mean_token_accuracy": 0.4670530156685984, + "step": 7234 + }, + { + "epoch": 1.3413051538746756, + "grad_norm": 6.875, + "learning_rate": 8.658694846125325e-06, + "loss": 3.2448, + "mean_token_accuracy": 0.4317507418397626, + "step": 7235 + }, + { + "epoch": 1.3414905450500556, + "grad_norm": 6.35546875, + "learning_rate": 8.658509454949946e-06, + "loss": 2.7127, + "mean_token_accuracy": 0.47053872053872053, + "step": 7236 + }, + { + "epoch": 1.3416759362254358, + "grad_norm": 6.59375, + "learning_rate": 8.658324063774565e-06, + "loss": 3.463, + "mean_token_accuracy": 0.40838757870024545, + "step": 7237 + }, + { + "epoch": 1.3418613274008158, + "grad_norm": 6.66796875, + "learning_rate": 8.658138672599185e-06, + "loss": 2.8505, + "mean_token_accuracy": 0.47598415449368653, + "step": 7238 + }, + { + "epoch": 1.3420467185761957, + "grad_norm": 11.3203125, + "learning_rate": 8.657953281423806e-06, + "loss": 2.7571, + "mean_token_accuracy": 0.47897897897897895, + "step": 7239 + }, + { + "epoch": 1.342232109751576, + "grad_norm": 6.296875, + "learning_rate": 8.657767890248426e-06, + "loss": 3.0187, + "mean_token_accuracy": 0.4676860025220681, + "step": 7240 + }, + { + "epoch": 1.3424175009269559, + "grad_norm": 6.93359375, + "learning_rate": 8.657582499073045e-06, + "loss": 3.3356, + "mean_token_accuracy": 0.4300136425648022, + "step": 7241 + }, + { + "epoch": 1.3426028921023359, + "grad_norm": 7.05859375, + "learning_rate": 8.657397107897665e-06, + "loss": 3.0434, + "mean_token_accuracy": 0.4514336917562724, + "step": 7242 + }, + { + "epoch": 1.342788283277716, + "grad_norm": 6.65234375, + "learning_rate": 8.657211716722284e-06, + "loss": 2.749, + "mean_token_accuracy": 0.46653543307086615, + "step": 7243 + }, + { + "epoch": 1.342973674453096, + "grad_norm": 6.34375, + "learning_rate": 8.657026325546905e-06, + "loss": 2.5434, + "mean_token_accuracy": 0.49780701754385964, + "step": 7244 + }, + { + "epoch": 1.343159065628476, + "grad_norm": 7.6875, + "learning_rate": 8.656840934371525e-06, + "loss": 3.1214, + "mean_token_accuracy": 0.4290519496698642, + "step": 7245 + }, + { + "epoch": 1.3433444568038562, + "grad_norm": 6.44921875, + "learning_rate": 8.656655543196144e-06, + "loss": 3.3053, + "mean_token_accuracy": 0.454360569446106, + "step": 7246 + }, + { + "epoch": 1.3435298479792361, + "grad_norm": 6.41796875, + "learning_rate": 8.656470152020764e-06, + "loss": 2.7181, + "mean_token_accuracy": 0.48188260462628696, + "step": 7247 + }, + { + "epoch": 1.343715239154616, + "grad_norm": 5.2421875, + "learning_rate": 8.656284760845385e-06, + "loss": 2.9339, + "mean_token_accuracy": 0.4404277577589213, + "step": 7248 + }, + { + "epoch": 1.3439006303299963, + "grad_norm": 6.68359375, + "learning_rate": 8.656099369670005e-06, + "loss": 2.9291, + "mean_token_accuracy": 0.4462477120195241, + "step": 7249 + }, + { + "epoch": 1.3440860215053765, + "grad_norm": 6.22265625, + "learning_rate": 8.655913978494624e-06, + "loss": 2.8486, + "mean_token_accuracy": 0.4581450969970768, + "step": 7250 + }, + { + "epoch": 1.3442714126807565, + "grad_norm": 4.9375, + "learning_rate": 8.655728587319245e-06, + "loss": 2.3275, + "mean_token_accuracy": 0.5249717514124294, + "step": 7251 + }, + { + "epoch": 1.3444568038561364, + "grad_norm": 5.33203125, + "learning_rate": 8.655543196143863e-06, + "loss": 2.4292, + "mean_token_accuracy": 0.5123260922626311, + "step": 7252 + }, + { + "epoch": 1.3446421950315166, + "grad_norm": 5.99609375, + "learning_rate": 8.655357804968484e-06, + "loss": 2.715, + "mean_token_accuracy": 0.47298553033320057, + "step": 7253 + }, + { + "epoch": 1.3448275862068966, + "grad_norm": 6.07421875, + "learning_rate": 8.655172413793104e-06, + "loss": 2.6695, + "mean_token_accuracy": 0.4816747716759578, + "step": 7254 + }, + { + "epoch": 1.3450129773822765, + "grad_norm": 5.703125, + "learning_rate": 8.654987022617725e-06, + "loss": 3.0237, + "mean_token_accuracy": 0.437412685107572, + "step": 7255 + }, + { + "epoch": 1.3451983685576567, + "grad_norm": 6.0, + "learning_rate": 8.654801631442344e-06, + "loss": 2.8516, + "mean_token_accuracy": 0.4623085983510012, + "step": 7256 + }, + { + "epoch": 1.3453837597330367, + "grad_norm": 9.6953125, + "learning_rate": 8.654616240266964e-06, + "loss": 2.8629, + "mean_token_accuracy": 0.4451311425987338, + "step": 7257 + }, + { + "epoch": 1.3455691509084167, + "grad_norm": 6.75390625, + "learning_rate": 8.654430849091585e-06, + "loss": 3.5636, + "mean_token_accuracy": 0.3974084958589367, + "step": 7258 + }, + { + "epoch": 1.3457545420837969, + "grad_norm": 5.88671875, + "learning_rate": 8.654245457916203e-06, + "loss": 3.1171, + "mean_token_accuracy": 0.4337084820203129, + "step": 7259 + }, + { + "epoch": 1.3459399332591768, + "grad_norm": 6.95703125, + "learning_rate": 8.654060066740824e-06, + "loss": 2.7768, + "mean_token_accuracy": 0.4583804569102013, + "step": 7260 + }, + { + "epoch": 1.3461253244345568, + "grad_norm": 8.375, + "learning_rate": 8.653874675565443e-06, + "loss": 2.4179, + "mean_token_accuracy": 0.4946236559139785, + "step": 7261 + }, + { + "epoch": 1.346310715609937, + "grad_norm": 6.05078125, + "learning_rate": 8.653689284390063e-06, + "loss": 2.7718, + "mean_token_accuracy": 0.47497232812692164, + "step": 7262 + }, + { + "epoch": 1.346496106785317, + "grad_norm": 8.0859375, + "learning_rate": 8.653503893214684e-06, + "loss": 2.8715, + "mean_token_accuracy": 0.44639855166170955, + "step": 7263 + }, + { + "epoch": 1.3466814979606971, + "grad_norm": 10.2578125, + "learning_rate": 8.653318502039304e-06, + "loss": 3.3435, + "mean_token_accuracy": 0.43461733400912317, + "step": 7264 + }, + { + "epoch": 1.346866889136077, + "grad_norm": 8.0390625, + "learning_rate": 8.653133110863923e-06, + "loss": 2.6704, + "mean_token_accuracy": 0.47962838594134766, + "step": 7265 + }, + { + "epoch": 1.3470522803114573, + "grad_norm": 8.8515625, + "learning_rate": 8.652947719688544e-06, + "loss": 2.1593, + "mean_token_accuracy": 0.56312625250501, + "step": 7266 + }, + { + "epoch": 1.3472376714868373, + "grad_norm": 6.47265625, + "learning_rate": 8.652762328513164e-06, + "loss": 2.8806, + "mean_token_accuracy": 0.45191420664206644, + "step": 7267 + }, + { + "epoch": 1.3474230626622172, + "grad_norm": 7.18359375, + "learning_rate": 8.652576937337783e-06, + "loss": 3.2226, + "mean_token_accuracy": 0.42981236970118136, + "step": 7268 + }, + { + "epoch": 1.3476084538375974, + "grad_norm": 7.17578125, + "learning_rate": 8.652391546162403e-06, + "loss": 2.3328, + "mean_token_accuracy": 0.5088454376163873, + "step": 7269 + }, + { + "epoch": 1.3477938450129774, + "grad_norm": 6.5, + "learning_rate": 8.652206154987022e-06, + "loss": 2.6998, + "mean_token_accuracy": 0.46749393040194226, + "step": 7270 + }, + { + "epoch": 1.3479792361883574, + "grad_norm": 9.2265625, + "learning_rate": 8.652020763811644e-06, + "loss": 2.4339, + "mean_token_accuracy": 0.490522834245007, + "step": 7271 + }, + { + "epoch": 1.3481646273637375, + "grad_norm": 5.60546875, + "learning_rate": 8.651835372636263e-06, + "loss": 2.8724, + "mean_token_accuracy": 0.45015882619875963, + "step": 7272 + }, + { + "epoch": 1.3483500185391175, + "grad_norm": 6.078125, + "learning_rate": 8.651649981460884e-06, + "loss": 2.7544, + "mean_token_accuracy": 0.4889453241708993, + "step": 7273 + }, + { + "epoch": 1.3485354097144975, + "grad_norm": 7.3828125, + "learning_rate": 8.651464590285504e-06, + "loss": 3.0883, + "mean_token_accuracy": 0.44939172749391726, + "step": 7274 + }, + { + "epoch": 1.3487208008898777, + "grad_norm": 5.6328125, + "learning_rate": 8.651279199110123e-06, + "loss": 3.014, + "mean_token_accuracy": 0.4529142984509083, + "step": 7275 + }, + { + "epoch": 1.3489061920652576, + "grad_norm": 5.61328125, + "learning_rate": 8.651093807934743e-06, + "loss": 2.8706, + "mean_token_accuracy": 0.4515597410241318, + "step": 7276 + }, + { + "epoch": 1.3490915832406378, + "grad_norm": 5.796875, + "learning_rate": 8.650908416759362e-06, + "loss": 3.6346, + "mean_token_accuracy": 0.40131500298864314, + "step": 7277 + }, + { + "epoch": 1.3492769744160178, + "grad_norm": 5.46875, + "learning_rate": 8.650723025583983e-06, + "loss": 3.1167, + "mean_token_accuracy": 0.42902881536819637, + "step": 7278 + }, + { + "epoch": 1.349462365591398, + "grad_norm": 6.390625, + "learning_rate": 8.650537634408603e-06, + "loss": 2.6441, + "mean_token_accuracy": 0.4784546805349183, + "step": 7279 + }, + { + "epoch": 1.349647756766778, + "grad_norm": 5.90625, + "learning_rate": 8.650352243233224e-06, + "loss": 2.7791, + "mean_token_accuracy": 0.4521497919556172, + "step": 7280 + }, + { + "epoch": 1.349833147942158, + "grad_norm": 6.69921875, + "learning_rate": 8.650166852057842e-06, + "loss": 3.0139, + "mean_token_accuracy": 0.4565560821484992, + "step": 7281 + }, + { + "epoch": 1.350018539117538, + "grad_norm": 7.48828125, + "learning_rate": 8.649981460882463e-06, + "loss": 2.9816, + "mean_token_accuracy": 0.4662576687116564, + "step": 7282 + }, + { + "epoch": 1.350203930292918, + "grad_norm": 6.48046875, + "learning_rate": 8.649796069707083e-06, + "loss": 3.4044, + "mean_token_accuracy": 0.4219853602157442, + "step": 7283 + }, + { + "epoch": 1.350389321468298, + "grad_norm": 9.4140625, + "learning_rate": 8.649610678531702e-06, + "loss": 2.4514, + "mean_token_accuracy": 0.5141999249343175, + "step": 7284 + }, + { + "epoch": 1.3505747126436782, + "grad_norm": 8.6328125, + "learning_rate": 8.649425287356323e-06, + "loss": 2.4216, + "mean_token_accuracy": 0.5134003350083752, + "step": 7285 + }, + { + "epoch": 1.3507601038190582, + "grad_norm": 7.0546875, + "learning_rate": 8.649239896180941e-06, + "loss": 2.7765, + "mean_token_accuracy": 0.4772782799902272, + "step": 7286 + }, + { + "epoch": 1.3509454949944382, + "grad_norm": 9.0703125, + "learning_rate": 8.649054505005564e-06, + "loss": 3.2421, + "mean_token_accuracy": 0.43143556779920417, + "step": 7287 + }, + { + "epoch": 1.3511308861698184, + "grad_norm": 9.296875, + "learning_rate": 8.648869113830182e-06, + "loss": 3.1066, + "mean_token_accuracy": 0.4344214514718741, + "step": 7288 + }, + { + "epoch": 1.3513162773451983, + "grad_norm": 8.578125, + "learning_rate": 8.648683722654803e-06, + "loss": 2.6411, + "mean_token_accuracy": 0.4764950756421972, + "step": 7289 + }, + { + "epoch": 1.3515016685205783, + "grad_norm": 8.3984375, + "learning_rate": 8.648498331479422e-06, + "loss": 2.2674, + "mean_token_accuracy": 0.539740426901212, + "step": 7290 + }, + { + "epoch": 1.3516870596959585, + "grad_norm": 7.953125, + "learning_rate": 8.648312940304042e-06, + "loss": 3.2944, + "mean_token_accuracy": 0.443556739162598, + "step": 7291 + }, + { + "epoch": 1.3518724508713384, + "grad_norm": 6.91796875, + "learning_rate": 8.648127549128663e-06, + "loss": 2.7469, + "mean_token_accuracy": 0.4670287044220326, + "step": 7292 + }, + { + "epoch": 1.3520578420467186, + "grad_norm": 8.3203125, + "learning_rate": 8.647942157953282e-06, + "loss": 2.4673, + "mean_token_accuracy": 0.4789122497796802, + "step": 7293 + }, + { + "epoch": 1.3522432332220986, + "grad_norm": 8.1875, + "learning_rate": 8.647756766777902e-06, + "loss": 2.7897, + "mean_token_accuracy": 0.45630198336532307, + "step": 7294 + }, + { + "epoch": 1.3524286243974788, + "grad_norm": 7.10546875, + "learning_rate": 8.647571375602523e-06, + "loss": 3.0645, + "mean_token_accuracy": 0.4429060107509366, + "step": 7295 + }, + { + "epoch": 1.3526140155728588, + "grad_norm": 7.30078125, + "learning_rate": 8.647385984427143e-06, + "loss": 2.823, + "mean_token_accuracy": 0.4820408681304734, + "step": 7296 + }, + { + "epoch": 1.3527994067482387, + "grad_norm": 5.3984375, + "learning_rate": 8.647200593251762e-06, + "loss": 3.198, + "mean_token_accuracy": 0.4487571312143439, + "step": 7297 + }, + { + "epoch": 1.352984797923619, + "grad_norm": 6.59375, + "learning_rate": 8.647015202076382e-06, + "loss": 3.0485, + "mean_token_accuracy": 0.4268653445820094, + "step": 7298 + }, + { + "epoch": 1.3531701890989989, + "grad_norm": 4.9921875, + "learning_rate": 8.646829810901001e-06, + "loss": 2.6847, + "mean_token_accuracy": 0.4755072463768116, + "step": 7299 + }, + { + "epoch": 1.3533555802743789, + "grad_norm": 10.421875, + "learning_rate": 8.646644419725622e-06, + "loss": 2.4317, + "mean_token_accuracy": 0.49184747721145206, + "step": 7300 + }, + { + "epoch": 1.353540971449759, + "grad_norm": 7.453125, + "learning_rate": 8.646459028550242e-06, + "loss": 2.6152, + "mean_token_accuracy": 0.4799290869950614, + "step": 7301 + }, + { + "epoch": 1.353726362625139, + "grad_norm": 6.69921875, + "learning_rate": 8.646273637374861e-06, + "loss": 2.5994, + "mean_token_accuracy": 0.47489970725360514, + "step": 7302 + }, + { + "epoch": 1.353911753800519, + "grad_norm": 5.109375, + "learning_rate": 8.646088246199481e-06, + "loss": 2.9922, + "mean_token_accuracy": 0.4417139256458727, + "step": 7303 + }, + { + "epoch": 1.3540971449758992, + "grad_norm": 8.6484375, + "learning_rate": 8.645902855024102e-06, + "loss": 2.8287, + "mean_token_accuracy": 0.4797136038186158, + "step": 7304 + }, + { + "epoch": 1.3542825361512791, + "grad_norm": 12.765625, + "learning_rate": 8.645717463848722e-06, + "loss": 2.393, + "mean_token_accuracy": 0.49369722188226656, + "step": 7305 + }, + { + "epoch": 1.3544679273266593, + "grad_norm": 7.26171875, + "learning_rate": 8.645532072673341e-06, + "loss": 2.6199, + "mean_token_accuracy": 0.47749834546657843, + "step": 7306 + }, + { + "epoch": 1.3546533185020393, + "grad_norm": 11.5546875, + "learning_rate": 8.645346681497962e-06, + "loss": 2.5834, + "mean_token_accuracy": 0.48481012658227846, + "step": 7307 + }, + { + "epoch": 1.3548387096774195, + "grad_norm": 9.5703125, + "learning_rate": 8.64516129032258e-06, + "loss": 2.8783, + "mean_token_accuracy": 0.46375587986973826, + "step": 7308 + }, + { + "epoch": 1.3550241008527995, + "grad_norm": 5.953125, + "learning_rate": 8.644975899147201e-06, + "loss": 2.5174, + "mean_token_accuracy": 0.4992014196983141, + "step": 7309 + }, + { + "epoch": 1.3552094920281794, + "grad_norm": 6.7109375, + "learning_rate": 8.644790507971821e-06, + "loss": 3.0934, + "mean_token_accuracy": 0.44241815048822514, + "step": 7310 + }, + { + "epoch": 1.3553948832035596, + "grad_norm": 11.6171875, + "learning_rate": 8.64460511679644e-06, + "loss": 2.5826, + "mean_token_accuracy": 0.4882753403933434, + "step": 7311 + }, + { + "epoch": 1.3555802743789396, + "grad_norm": 6.890625, + "learning_rate": 8.644419725621062e-06, + "loss": 2.5368, + "mean_token_accuracy": 0.49647795460474825, + "step": 7312 + }, + { + "epoch": 1.3557656655543195, + "grad_norm": 5.97265625, + "learning_rate": 8.644234334445681e-06, + "loss": 2.9189, + "mean_token_accuracy": 0.4441280173582859, + "step": 7313 + }, + { + "epoch": 1.3559510567296997, + "grad_norm": 8.859375, + "learning_rate": 8.644048943270302e-06, + "loss": 2.6071, + "mean_token_accuracy": 0.4887050170738114, + "step": 7314 + }, + { + "epoch": 1.3561364479050797, + "grad_norm": 7.82421875, + "learning_rate": 8.64386355209492e-06, + "loss": 2.8921, + "mean_token_accuracy": 0.4551282051282051, + "step": 7315 + }, + { + "epoch": 1.3563218390804597, + "grad_norm": 5.8046875, + "learning_rate": 8.643678160919541e-06, + "loss": 3.3541, + "mean_token_accuracy": 0.4180748529227688, + "step": 7316 + }, + { + "epoch": 1.3565072302558399, + "grad_norm": 7.0234375, + "learning_rate": 8.64349276974416e-06, + "loss": 2.9752, + "mean_token_accuracy": 0.44296316657504126, + "step": 7317 + }, + { + "epoch": 1.3566926214312198, + "grad_norm": 7.23828125, + "learning_rate": 8.64330737856878e-06, + "loss": 2.774, + "mean_token_accuracy": 0.4824197758329495, + "step": 7318 + }, + { + "epoch": 1.3568780126065998, + "grad_norm": 8.2578125, + "learning_rate": 8.6431219873934e-06, + "loss": 2.4836, + "mean_token_accuracy": 0.5028967076444821, + "step": 7319 + }, + { + "epoch": 1.35706340378198, + "grad_norm": 6.23828125, + "learning_rate": 8.642936596218021e-06, + "loss": 3.2388, + "mean_token_accuracy": 0.4162319543855392, + "step": 7320 + }, + { + "epoch": 1.35724879495736, + "grad_norm": 7.86328125, + "learning_rate": 8.642751205042642e-06, + "loss": 2.988, + "mean_token_accuracy": 0.4471707561342013, + "step": 7321 + }, + { + "epoch": 1.3574341861327401, + "grad_norm": 11.1015625, + "learning_rate": 8.64256581386726e-06, + "loss": 2.3031, + "mean_token_accuracy": 0.5095163050066674, + "step": 7322 + }, + { + "epoch": 1.35761957730812, + "grad_norm": 7.86328125, + "learning_rate": 8.642380422691881e-06, + "loss": 2.6977, + "mean_token_accuracy": 0.4797011559063998, + "step": 7323 + }, + { + "epoch": 1.3578049684835003, + "grad_norm": 6.26171875, + "learning_rate": 8.6421950315165e-06, + "loss": 2.3932, + "mean_token_accuracy": 0.5140370234154714, + "step": 7324 + }, + { + "epoch": 1.3579903596588803, + "grad_norm": 5.5625, + "learning_rate": 8.64200964034112e-06, + "loss": 2.587, + "mean_token_accuracy": 0.5044184380224505, + "step": 7325 + }, + { + "epoch": 1.3581757508342602, + "grad_norm": 7.671875, + "learning_rate": 8.64182424916574e-06, + "loss": 3.2915, + "mean_token_accuracy": 0.42289988492520136, + "step": 7326 + }, + { + "epoch": 1.3583611420096404, + "grad_norm": 6.421875, + "learning_rate": 8.64163885799036e-06, + "loss": 2.5378, + "mean_token_accuracy": 0.47769893563101873, + "step": 7327 + }, + { + "epoch": 1.3585465331850204, + "grad_norm": 7.48046875, + "learning_rate": 8.64145346681498e-06, + "loss": 2.721, + "mean_token_accuracy": 0.4774829600778968, + "step": 7328 + }, + { + "epoch": 1.3587319243604004, + "grad_norm": 8.7578125, + "learning_rate": 8.6412680756396e-06, + "loss": 2.7644, + "mean_token_accuracy": 0.4697347207973032, + "step": 7329 + }, + { + "epoch": 1.3589173155357805, + "grad_norm": 7.63671875, + "learning_rate": 8.641082684464221e-06, + "loss": 3.0469, + "mean_token_accuracy": 0.4492142025611176, + "step": 7330 + }, + { + "epoch": 1.3591027067111605, + "grad_norm": 7.29296875, + "learning_rate": 8.64089729328884e-06, + "loss": 2.6576, + "mean_token_accuracy": 0.48411464119084646, + "step": 7331 + }, + { + "epoch": 1.3592880978865405, + "grad_norm": 7.9375, + "learning_rate": 8.64071190211346e-06, + "loss": 3.0635, + "mean_token_accuracy": 0.4640647310804379, + "step": 7332 + }, + { + "epoch": 1.3594734890619207, + "grad_norm": 6.8125, + "learning_rate": 8.64052651093808e-06, + "loss": 3.0955, + "mean_token_accuracy": 0.42598496925529494, + "step": 7333 + }, + { + "epoch": 1.3596588802373006, + "grad_norm": 7.7265625, + "learning_rate": 8.6403411197627e-06, + "loss": 2.3649, + "mean_token_accuracy": 0.5146190987124464, + "step": 7334 + }, + { + "epoch": 1.3598442714126808, + "grad_norm": 5.20703125, + "learning_rate": 8.64015572858732e-06, + "loss": 2.5601, + "mean_token_accuracy": 0.4849726775956284, + "step": 7335 + }, + { + "epoch": 1.3600296625880608, + "grad_norm": 7.0859375, + "learning_rate": 8.63997033741194e-06, + "loss": 2.2697, + "mean_token_accuracy": 0.5223143130379343, + "step": 7336 + }, + { + "epoch": 1.360215053763441, + "grad_norm": 6.9921875, + "learning_rate": 8.63978494623656e-06, + "loss": 2.4996, + "mean_token_accuracy": 0.511329196132198, + "step": 7337 + }, + { + "epoch": 1.360400444938821, + "grad_norm": 9.859375, + "learning_rate": 8.63959955506118e-06, + "loss": 2.1067, + "mean_token_accuracy": 0.5356511490866235, + "step": 7338 + }, + { + "epoch": 1.360585836114201, + "grad_norm": 8.265625, + "learning_rate": 8.6394141638858e-06, + "loss": 3.2059, + "mean_token_accuracy": 0.4281349812526038, + "step": 7339 + }, + { + "epoch": 1.360771227289581, + "grad_norm": 8.7890625, + "learning_rate": 8.63922877271042e-06, + "loss": 2.3977, + "mean_token_accuracy": 0.49827143972343035, + "step": 7340 + }, + { + "epoch": 1.360956618464961, + "grad_norm": 6.6484375, + "learning_rate": 8.63904338153504e-06, + "loss": 2.6319, + "mean_token_accuracy": 0.4815571444784928, + "step": 7341 + }, + { + "epoch": 1.361142009640341, + "grad_norm": 9.296875, + "learning_rate": 8.638857990359659e-06, + "loss": 2.6013, + "mean_token_accuracy": 0.4659907055344318, + "step": 7342 + }, + { + "epoch": 1.3613274008157212, + "grad_norm": 8.1484375, + "learning_rate": 8.638672599184279e-06, + "loss": 2.0642, + "mean_token_accuracy": 0.5718364698247891, + "step": 7343 + }, + { + "epoch": 1.3615127919911012, + "grad_norm": 6.94921875, + "learning_rate": 8.6384872080089e-06, + "loss": 2.4559, + "mean_token_accuracy": 0.5107373479501427, + "step": 7344 + }, + { + "epoch": 1.3616981831664812, + "grad_norm": 4.875, + "learning_rate": 8.63830181683352e-06, + "loss": 2.9818, + "mean_token_accuracy": 0.44495916128245766, + "step": 7345 + }, + { + "epoch": 1.3618835743418614, + "grad_norm": 6.8984375, + "learning_rate": 8.638116425658139e-06, + "loss": 3.0588, + "mean_token_accuracy": 0.43302990897269183, + "step": 7346 + }, + { + "epoch": 1.3620689655172413, + "grad_norm": 6.1484375, + "learning_rate": 8.63793103448276e-06, + "loss": 2.5581, + "mean_token_accuracy": 0.48720431594964725, + "step": 7347 + }, + { + "epoch": 1.3622543566926213, + "grad_norm": 5.91015625, + "learning_rate": 8.63774564330738e-06, + "loss": 2.8318, + "mean_token_accuracy": 0.45915583373455304, + "step": 7348 + }, + { + "epoch": 1.3624397478680015, + "grad_norm": 7.41015625, + "learning_rate": 8.637560252131999e-06, + "loss": 2.9738, + "mean_token_accuracy": 0.4688416852482103, + "step": 7349 + }, + { + "epoch": 1.3626251390433817, + "grad_norm": 14.3046875, + "learning_rate": 8.637374860956619e-06, + "loss": 2.4147, + "mean_token_accuracy": 0.5191066997518611, + "step": 7350 + }, + { + "epoch": 1.3628105302187616, + "grad_norm": 5.48828125, + "learning_rate": 8.637189469781238e-06, + "loss": 2.6388, + "mean_token_accuracy": 0.48697549082509944, + "step": 7351 + }, + { + "epoch": 1.3629959213941416, + "grad_norm": 5.6640625, + "learning_rate": 8.63700407860586e-06, + "loss": 2.3863, + "mean_token_accuracy": 0.5041995089804885, + "step": 7352 + }, + { + "epoch": 1.3631813125695218, + "grad_norm": 6.9609375, + "learning_rate": 8.636818687430479e-06, + "loss": 2.9356, + "mean_token_accuracy": 0.4636036534735677, + "step": 7353 + }, + { + "epoch": 1.3633667037449018, + "grad_norm": 6.453125, + "learning_rate": 8.6366332962551e-06, + "loss": 2.8832, + "mean_token_accuracy": 0.460431654676259, + "step": 7354 + }, + { + "epoch": 1.3635520949202817, + "grad_norm": 9.296875, + "learning_rate": 8.63644790507972e-06, + "loss": 3.1493, + "mean_token_accuracy": 0.44529262086513993, + "step": 7355 + }, + { + "epoch": 1.363737486095662, + "grad_norm": 10.9453125, + "learning_rate": 8.636262513904339e-06, + "loss": 3.0728, + "mean_token_accuracy": 0.4516035827795435, + "step": 7356 + }, + { + "epoch": 1.363922877271042, + "grad_norm": 8.9296875, + "learning_rate": 8.636077122728959e-06, + "loss": 2.6297, + "mean_token_accuracy": 0.48235671514114625, + "step": 7357 + }, + { + "epoch": 1.3641082684464219, + "grad_norm": 8.21875, + "learning_rate": 8.635891731553578e-06, + "loss": 2.4467, + "mean_token_accuracy": 0.5075614366729678, + "step": 7358 + }, + { + "epoch": 1.364293659621802, + "grad_norm": 6.46875, + "learning_rate": 8.635706340378198e-06, + "loss": 2.9179, + "mean_token_accuracy": 0.4692737430167598, + "step": 7359 + }, + { + "epoch": 1.364479050797182, + "grad_norm": 7.11328125, + "learning_rate": 8.635520949202819e-06, + "loss": 2.5782, + "mean_token_accuracy": 0.4915512465373961, + "step": 7360 + }, + { + "epoch": 1.364664441972562, + "grad_norm": 6.0859375, + "learning_rate": 8.63533555802744e-06, + "loss": 3.4511, + "mean_token_accuracy": 0.42676121832132063, + "step": 7361 + }, + { + "epoch": 1.3648498331479422, + "grad_norm": 8.015625, + "learning_rate": 8.635150166852058e-06, + "loss": 2.6058, + "mean_token_accuracy": 0.4903758020164986, + "step": 7362 + }, + { + "epoch": 1.3650352243233221, + "grad_norm": 5.41796875, + "learning_rate": 8.634964775676679e-06, + "loss": 2.8949, + "mean_token_accuracy": 0.45038855726996285, + "step": 7363 + }, + { + "epoch": 1.3652206154987023, + "grad_norm": 5.19921875, + "learning_rate": 8.6347793845013e-06, + "loss": 2.9809, + "mean_token_accuracy": 0.43647932131495226, + "step": 7364 + }, + { + "epoch": 1.3654060066740823, + "grad_norm": 9.7578125, + "learning_rate": 8.634593993325918e-06, + "loss": 2.6062, + "mean_token_accuracy": 0.4904148164193653, + "step": 7365 + }, + { + "epoch": 1.3655913978494625, + "grad_norm": 6.171875, + "learning_rate": 8.634408602150538e-06, + "loss": 3.1044, + "mean_token_accuracy": 0.4492141064588562, + "step": 7366 + }, + { + "epoch": 1.3657767890248425, + "grad_norm": 14.203125, + "learning_rate": 8.634223210975157e-06, + "loss": 3.1804, + "mean_token_accuracy": 0.4076360310928212, + "step": 7367 + }, + { + "epoch": 1.3659621802002224, + "grad_norm": 6.7890625, + "learning_rate": 8.63403781979978e-06, + "loss": 3.0431, + "mean_token_accuracy": 0.42553875844322936, + "step": 7368 + }, + { + "epoch": 1.3661475713756026, + "grad_norm": 6.984375, + "learning_rate": 8.633852428624398e-06, + "loss": 2.8844, + "mean_token_accuracy": 0.44863852470825527, + "step": 7369 + }, + { + "epoch": 1.3663329625509826, + "grad_norm": 8.8125, + "learning_rate": 8.633667037449019e-06, + "loss": 3.1541, + "mean_token_accuracy": 0.4231031194660128, + "step": 7370 + }, + { + "epoch": 1.3665183537263625, + "grad_norm": 6.55078125, + "learning_rate": 8.633481646273638e-06, + "loss": 2.8224, + "mean_token_accuracy": 0.457565011820331, + "step": 7371 + }, + { + "epoch": 1.3667037449017427, + "grad_norm": 5.77734375, + "learning_rate": 8.633296255098258e-06, + "loss": 2.5783, + "mean_token_accuracy": 0.4784038901601831, + "step": 7372 + }, + { + "epoch": 1.3668891360771227, + "grad_norm": 5.046875, + "learning_rate": 8.633110863922878e-06, + "loss": 2.5815, + "mean_token_accuracy": 0.5002965599051008, + "step": 7373 + }, + { + "epoch": 1.3670745272525027, + "grad_norm": 6.390625, + "learning_rate": 8.632925472747497e-06, + "loss": 2.9484, + "mean_token_accuracy": 0.462430426716141, + "step": 7374 + }, + { + "epoch": 1.3672599184278829, + "grad_norm": 6.703125, + "learning_rate": 8.632740081572118e-06, + "loss": 2.6047, + "mean_token_accuracy": 0.48940710257582687, + "step": 7375 + }, + { + "epoch": 1.3674453096032628, + "grad_norm": 6.77734375, + "learning_rate": 8.632554690396738e-06, + "loss": 2.4381, + "mean_token_accuracy": 0.5100177184135205, + "step": 7376 + }, + { + "epoch": 1.367630700778643, + "grad_norm": 6.44140625, + "learning_rate": 8.632369299221359e-06, + "loss": 2.4171, + "mean_token_accuracy": 0.47732893652102226, + "step": 7377 + }, + { + "epoch": 1.367816091954023, + "grad_norm": 6.32421875, + "learning_rate": 8.632183908045978e-06, + "loss": 3.2222, + "mean_token_accuracy": 0.4439612848926918, + "step": 7378 + }, + { + "epoch": 1.3680014831294032, + "grad_norm": 8.890625, + "learning_rate": 8.631998516870598e-06, + "loss": 2.4916, + "mean_token_accuracy": 0.4841125737630504, + "step": 7379 + }, + { + "epoch": 1.3681868743047831, + "grad_norm": 6.078125, + "learning_rate": 8.631813125695217e-06, + "loss": 2.6611, + "mean_token_accuracy": 0.46060154113845386, + "step": 7380 + }, + { + "epoch": 1.368372265480163, + "grad_norm": 5.72265625, + "learning_rate": 8.631627734519837e-06, + "loss": 2.6812, + "mean_token_accuracy": 0.48614958448753465, + "step": 7381 + }, + { + "epoch": 1.3685576566555433, + "grad_norm": 6.6640625, + "learning_rate": 8.631442343344458e-06, + "loss": 2.7439, + "mean_token_accuracy": 0.46784100127477113, + "step": 7382 + }, + { + "epoch": 1.3687430478309233, + "grad_norm": 5.28515625, + "learning_rate": 8.631256952169077e-06, + "loss": 2.5386, + "mean_token_accuracy": 0.47807181016275313, + "step": 7383 + }, + { + "epoch": 1.3689284390063032, + "grad_norm": 7.203125, + "learning_rate": 8.631071560993697e-06, + "loss": 1.8222, + "mean_token_accuracy": 0.5953983266642415, + "step": 7384 + }, + { + "epoch": 1.3691138301816834, + "grad_norm": 8.3125, + "learning_rate": 8.630886169818318e-06, + "loss": 2.6158, + "mean_token_accuracy": 0.4783786968304466, + "step": 7385 + }, + { + "epoch": 1.3692992213570634, + "grad_norm": 7.34765625, + "learning_rate": 8.630700778642938e-06, + "loss": 2.5117, + "mean_token_accuracy": 0.5161290322580645, + "step": 7386 + }, + { + "epoch": 1.3694846125324434, + "grad_norm": 6.9140625, + "learning_rate": 8.630515387467557e-06, + "loss": 2.7967, + "mean_token_accuracy": 0.4590844062947067, + "step": 7387 + }, + { + "epoch": 1.3696700037078235, + "grad_norm": 7.72265625, + "learning_rate": 8.630329996292177e-06, + "loss": 2.8173, + "mean_token_accuracy": 0.4557356608478803, + "step": 7388 + }, + { + "epoch": 1.3698553948832035, + "grad_norm": 5.68359375, + "learning_rate": 8.630144605116796e-06, + "loss": 2.189, + "mean_token_accuracy": 0.538038613987981, + "step": 7389 + }, + { + "epoch": 1.3700407860585835, + "grad_norm": 7.015625, + "learning_rate": 8.629959213941417e-06, + "loss": 2.7052, + "mean_token_accuracy": 0.4675904180366369, + "step": 7390 + }, + { + "epoch": 1.3702261772339637, + "grad_norm": 7.15234375, + "learning_rate": 8.629773822766037e-06, + "loss": 2.8478, + "mean_token_accuracy": 0.4726126999515269, + "step": 7391 + }, + { + "epoch": 1.3704115684093436, + "grad_norm": 6.44921875, + "learning_rate": 8.629588431590658e-06, + "loss": 2.4109, + "mean_token_accuracy": 0.5066270283661588, + "step": 7392 + }, + { + "epoch": 1.3705969595847238, + "grad_norm": 5.5234375, + "learning_rate": 8.629403040415278e-06, + "loss": 3.062, + "mean_token_accuracy": 0.42673663858583066, + "step": 7393 + }, + { + "epoch": 1.3707823507601038, + "grad_norm": 7.234375, + "learning_rate": 8.629217649239897e-06, + "loss": 3.2006, + "mean_token_accuracy": 0.39489069649212, + "step": 7394 + }, + { + "epoch": 1.370967741935484, + "grad_norm": 8.203125, + "learning_rate": 8.629032258064517e-06, + "loss": 2.7331, + "mean_token_accuracy": 0.46336379669713, + "step": 7395 + }, + { + "epoch": 1.371153133110864, + "grad_norm": 6.921875, + "learning_rate": 8.628846866889136e-06, + "loss": 2.559, + "mean_token_accuracy": 0.48073871014283653, + "step": 7396 + }, + { + "epoch": 1.371338524286244, + "grad_norm": 5.34375, + "learning_rate": 8.628661475713757e-06, + "loss": 2.2595, + "mean_token_accuracy": 0.5477027027027027, + "step": 7397 + }, + { + "epoch": 1.371523915461624, + "grad_norm": 5.59765625, + "learning_rate": 8.628476084538376e-06, + "loss": 2.6096, + "mean_token_accuracy": 0.47985592315901815, + "step": 7398 + }, + { + "epoch": 1.371709306637004, + "grad_norm": 5.8984375, + "learning_rate": 8.628290693362996e-06, + "loss": 2.6805, + "mean_token_accuracy": 0.458012467913458, + "step": 7399 + }, + { + "epoch": 1.371894697812384, + "grad_norm": 9.0078125, + "learning_rate": 8.628105302187617e-06, + "loss": 2.3659, + "mean_token_accuracy": 0.5038386271262983, + "step": 7400 + }, + { + "epoch": 1.3720800889877642, + "grad_norm": 5.73828125, + "learning_rate": 8.627919911012237e-06, + "loss": 3.1338, + "mean_token_accuracy": 0.429970876356897, + "step": 7401 + }, + { + "epoch": 1.3722654801631442, + "grad_norm": 6.35546875, + "learning_rate": 8.627734519836858e-06, + "loss": 2.8711, + "mean_token_accuracy": 0.4637496459926366, + "step": 7402 + }, + { + "epoch": 1.3724508713385242, + "grad_norm": 7.0, + "learning_rate": 8.627549128661476e-06, + "loss": 3.1086, + "mean_token_accuracy": 0.4351287473566364, + "step": 7403 + }, + { + "epoch": 1.3726362625139044, + "grad_norm": 10.7265625, + "learning_rate": 8.627363737486097e-06, + "loss": 2.4281, + "mean_token_accuracy": 0.4818607002907344, + "step": 7404 + }, + { + "epoch": 1.3728216536892843, + "grad_norm": 7.1328125, + "learning_rate": 8.627178346310716e-06, + "loss": 2.8394, + "mean_token_accuracy": 0.4556173820879703, + "step": 7405 + }, + { + "epoch": 1.3730070448646645, + "grad_norm": 9.6875, + "learning_rate": 8.626992955135336e-06, + "loss": 2.8406, + "mean_token_accuracy": 0.4698564593301435, + "step": 7406 + }, + { + "epoch": 1.3731924360400445, + "grad_norm": 10.8828125, + "learning_rate": 8.626807563959957e-06, + "loss": 2.4231, + "mean_token_accuracy": 0.5076489096235217, + "step": 7407 + }, + { + "epoch": 1.3733778272154247, + "grad_norm": 6.17578125, + "learning_rate": 8.626622172784577e-06, + "loss": 3.2381, + "mean_token_accuracy": 0.43894610137755785, + "step": 7408 + }, + { + "epoch": 1.3735632183908046, + "grad_norm": 6.74609375, + "learning_rate": 8.626436781609196e-06, + "loss": 2.4361, + "mean_token_accuracy": 0.49498187900752716, + "step": 7409 + }, + { + "epoch": 1.3737486095661846, + "grad_norm": 8.359375, + "learning_rate": 8.626251390433816e-06, + "loss": 3.1712, + "mean_token_accuracy": 0.4538426032771752, + "step": 7410 + }, + { + "epoch": 1.3739340007415648, + "grad_norm": 9.2265625, + "learning_rate": 8.626065999258437e-06, + "loss": 2.5257, + "mean_token_accuracy": 0.5097571875465514, + "step": 7411 + }, + { + "epoch": 1.3741193919169448, + "grad_norm": 6.765625, + "learning_rate": 8.625880608083056e-06, + "loss": 2.6559, + "mean_token_accuracy": 0.4908521906596052, + "step": 7412 + }, + { + "epoch": 1.3743047830923247, + "grad_norm": 7.9453125, + "learning_rate": 8.625695216907676e-06, + "loss": 2.9155, + "mean_token_accuracy": 0.47449547115843, + "step": 7413 + }, + { + "epoch": 1.374490174267705, + "grad_norm": 8.84375, + "learning_rate": 8.625509825732295e-06, + "loss": 2.4313, + "mean_token_accuracy": 0.4968423942888523, + "step": 7414 + }, + { + "epoch": 1.374675565443085, + "grad_norm": 5.65234375, + "learning_rate": 8.625324434556915e-06, + "loss": 2.7591, + "mean_token_accuracy": 0.4628863134657837, + "step": 7415 + }, + { + "epoch": 1.3748609566184649, + "grad_norm": 8.03125, + "learning_rate": 8.625139043381536e-06, + "loss": 2.9286, + "mean_token_accuracy": 0.4363797020954304, + "step": 7416 + }, + { + "epoch": 1.375046347793845, + "grad_norm": 7.4453125, + "learning_rate": 8.624953652206156e-06, + "loss": 2.8084, + "mean_token_accuracy": 0.48120405049244, + "step": 7417 + }, + { + "epoch": 1.375231738969225, + "grad_norm": 5.26953125, + "learning_rate": 8.624768261030775e-06, + "loss": 3.0796, + "mean_token_accuracy": 0.425513698630137, + "step": 7418 + }, + { + "epoch": 1.375417130144605, + "grad_norm": 5.41796875, + "learning_rate": 8.624582869855396e-06, + "loss": 2.9322, + "mean_token_accuracy": 0.4543315804040963, + "step": 7419 + }, + { + "epoch": 1.3756025213199852, + "grad_norm": 10.0234375, + "learning_rate": 8.624397478680016e-06, + "loss": 2.7116, + "mean_token_accuracy": 0.47493440132578374, + "step": 7420 + }, + { + "epoch": 1.3757879124953651, + "grad_norm": 5.83984375, + "learning_rate": 8.624212087504635e-06, + "loss": 2.8615, + "mean_token_accuracy": 0.45558596665837275, + "step": 7421 + }, + { + "epoch": 1.3759733036707453, + "grad_norm": 6.6484375, + "learning_rate": 8.624026696329255e-06, + "loss": 2.5107, + "mean_token_accuracy": 0.5022156573116692, + "step": 7422 + }, + { + "epoch": 1.3761586948461253, + "grad_norm": 6.78515625, + "learning_rate": 8.623841305153874e-06, + "loss": 3.082, + "mean_token_accuracy": 0.4315381420462412, + "step": 7423 + }, + { + "epoch": 1.3763440860215055, + "grad_norm": 5.71484375, + "learning_rate": 8.623655913978495e-06, + "loss": 2.1851, + "mean_token_accuracy": 0.5349178403755869, + "step": 7424 + }, + { + "epoch": 1.3765294771968855, + "grad_norm": 5.203125, + "learning_rate": 8.623470522803115e-06, + "loss": 2.443, + "mean_token_accuracy": 0.502370820668693, + "step": 7425 + }, + { + "epoch": 1.3767148683722654, + "grad_norm": 6.92578125, + "learning_rate": 8.623285131627736e-06, + "loss": 2.6219, + "mean_token_accuracy": 0.47525664648591737, + "step": 7426 + }, + { + "epoch": 1.3769002595476456, + "grad_norm": 6.77734375, + "learning_rate": 8.623099740452355e-06, + "loss": 2.446, + "mean_token_accuracy": 0.49406824146981626, + "step": 7427 + }, + { + "epoch": 1.3770856507230256, + "grad_norm": 9.5703125, + "learning_rate": 8.622914349276975e-06, + "loss": 2.767, + "mean_token_accuracy": 0.4626664671554691, + "step": 7428 + }, + { + "epoch": 1.3772710418984055, + "grad_norm": 10.0390625, + "learning_rate": 8.622728958101596e-06, + "loss": 2.8705, + "mean_token_accuracy": 0.4491525423728814, + "step": 7429 + }, + { + "epoch": 1.3774564330737857, + "grad_norm": 6.53515625, + "learning_rate": 8.622543566926214e-06, + "loss": 2.9715, + "mean_token_accuracy": 0.43729694606887587, + "step": 7430 + }, + { + "epoch": 1.3776418242491657, + "grad_norm": 10.109375, + "learning_rate": 8.622358175750835e-06, + "loss": 2.9818, + "mean_token_accuracy": 0.44787322768974147, + "step": 7431 + }, + { + "epoch": 1.3778272154245457, + "grad_norm": 11.578125, + "learning_rate": 8.622172784575454e-06, + "loss": 2.8537, + "mean_token_accuracy": 0.4495240480961924, + "step": 7432 + }, + { + "epoch": 1.3780126065999259, + "grad_norm": 7.61328125, + "learning_rate": 8.621987393400076e-06, + "loss": 2.7611, + "mean_token_accuracy": 0.46386614453542185, + "step": 7433 + }, + { + "epoch": 1.3781979977753058, + "grad_norm": 7.3671875, + "learning_rate": 8.621802002224695e-06, + "loss": 2.9143, + "mean_token_accuracy": 0.45947578209928736, + "step": 7434 + }, + { + "epoch": 1.378383388950686, + "grad_norm": 10.8671875, + "learning_rate": 8.621616611049315e-06, + "loss": 3.3139, + "mean_token_accuracy": 0.4023842917251052, + "step": 7435 + }, + { + "epoch": 1.378568780126066, + "grad_norm": 8.0625, + "learning_rate": 8.621431219873934e-06, + "loss": 2.951, + "mean_token_accuracy": 0.4455558731066019, + "step": 7436 + }, + { + "epoch": 1.3787541713014462, + "grad_norm": 8.0390625, + "learning_rate": 8.621245828698554e-06, + "loss": 2.8326, + "mean_token_accuracy": 0.45309800049370524, + "step": 7437 + }, + { + "epoch": 1.3789395624768261, + "grad_norm": 9.71875, + "learning_rate": 8.621060437523175e-06, + "loss": 3.2739, + "mean_token_accuracy": 0.43939622641509435, + "step": 7438 + }, + { + "epoch": 1.379124953652206, + "grad_norm": 7.2890625, + "learning_rate": 8.620875046347794e-06, + "loss": 2.5585, + "mean_token_accuracy": 0.4921118184334348, + "step": 7439 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 10.234375, + "learning_rate": 8.620689655172414e-06, + "loss": 2.7522, + "mean_token_accuracy": 0.46027159780410287, + "step": 7440 + }, + { + "epoch": 1.3794957360029663, + "grad_norm": 6.27734375, + "learning_rate": 8.620504263997035e-06, + "loss": 2.4853, + "mean_token_accuracy": 0.49250053567602314, + "step": 7441 + }, + { + "epoch": 1.3796811271783462, + "grad_norm": 8.46875, + "learning_rate": 8.620318872821655e-06, + "loss": 2.5383, + "mean_token_accuracy": 0.5088557445816826, + "step": 7442 + }, + { + "epoch": 1.3798665183537264, + "grad_norm": 7.46484375, + "learning_rate": 8.620133481646274e-06, + "loss": 2.7757, + "mean_token_accuracy": 0.47903348141727164, + "step": 7443 + }, + { + "epoch": 1.3800519095291064, + "grad_norm": 10.0390625, + "learning_rate": 8.619948090470894e-06, + "loss": 2.6176, + "mean_token_accuracy": 0.45368620037807184, + "step": 7444 + }, + { + "epoch": 1.3802373007044864, + "grad_norm": 7.37109375, + "learning_rate": 8.619762699295515e-06, + "loss": 2.2462, + "mean_token_accuracy": 0.5062491946914057, + "step": 7445 + }, + { + "epoch": 1.3804226918798665, + "grad_norm": 8.25, + "learning_rate": 8.619577308120134e-06, + "loss": 2.9541, + "mean_token_accuracy": 0.4614381216370455, + "step": 7446 + }, + { + "epoch": 1.3806080830552465, + "grad_norm": 8.703125, + "learning_rate": 8.619391916944754e-06, + "loss": 2.6027, + "mean_token_accuracy": 0.4776796093931644, + "step": 7447 + }, + { + "epoch": 1.3807934742306267, + "grad_norm": 9.3046875, + "learning_rate": 8.619206525769373e-06, + "loss": 2.5998, + "mean_token_accuracy": 0.4815970056144729, + "step": 7448 + }, + { + "epoch": 1.3809788654060067, + "grad_norm": 7.671875, + "learning_rate": 8.619021134593995e-06, + "loss": 2.2996, + "mean_token_accuracy": 0.5394704281118535, + "step": 7449 + }, + { + "epoch": 1.3811642565813869, + "grad_norm": 9.5, + "learning_rate": 8.618835743418614e-06, + "loss": 3.1821, + "mean_token_accuracy": 0.4260977118119975, + "step": 7450 + }, + { + "epoch": 1.3813496477567668, + "grad_norm": 6.6640625, + "learning_rate": 8.618650352243234e-06, + "loss": 2.7154, + "mean_token_accuracy": 0.4883951843117786, + "step": 7451 + }, + { + "epoch": 1.3815350389321468, + "grad_norm": 12.390625, + "learning_rate": 8.618464961067853e-06, + "loss": 3.2936, + "mean_token_accuracy": 0.41905168731311404, + "step": 7452 + }, + { + "epoch": 1.381720430107527, + "grad_norm": 8.59375, + "learning_rate": 8.618279569892474e-06, + "loss": 3.0574, + "mean_token_accuracy": 0.45986779981114256, + "step": 7453 + }, + { + "epoch": 1.381905821282907, + "grad_norm": 7.3359375, + "learning_rate": 8.618094178717094e-06, + "loss": 2.6872, + "mean_token_accuracy": 0.4658268894986281, + "step": 7454 + }, + { + "epoch": 1.382091212458287, + "grad_norm": 9.0625, + "learning_rate": 8.617908787541713e-06, + "loss": 2.8389, + "mean_token_accuracy": 0.4682004495570541, + "step": 7455 + }, + { + "epoch": 1.3822766036336671, + "grad_norm": 11.1015625, + "learning_rate": 8.617723396366334e-06, + "loss": 2.4567, + "mean_token_accuracy": 0.5081764463377766, + "step": 7456 + }, + { + "epoch": 1.382461994809047, + "grad_norm": 6.41796875, + "learning_rate": 8.617538005190954e-06, + "loss": 2.5723, + "mean_token_accuracy": 0.4796665540159964, + "step": 7457 + }, + { + "epoch": 1.382647385984427, + "grad_norm": 6.6953125, + "learning_rate": 8.617352614015575e-06, + "loss": 2.3135, + "mean_token_accuracy": 0.5423728813559322, + "step": 7458 + }, + { + "epoch": 1.3828327771598072, + "grad_norm": 10.9453125, + "learning_rate": 8.617167222840193e-06, + "loss": 2.6908, + "mean_token_accuracy": 0.4795481773061783, + "step": 7459 + }, + { + "epoch": 1.3830181683351872, + "grad_norm": 5.37109375, + "learning_rate": 8.616981831664814e-06, + "loss": 2.7969, + "mean_token_accuracy": 0.4533803644914756, + "step": 7460 + }, + { + "epoch": 1.3832035595105672, + "grad_norm": 7.14453125, + "learning_rate": 8.616796440489433e-06, + "loss": 2.8248, + "mean_token_accuracy": 0.45457532446196813, + "step": 7461 + }, + { + "epoch": 1.3833889506859474, + "grad_norm": 7.7890625, + "learning_rate": 8.616611049314053e-06, + "loss": 2.4295, + "mean_token_accuracy": 0.491754860940192, + "step": 7462 + }, + { + "epoch": 1.3835743418613273, + "grad_norm": 5.9453125, + "learning_rate": 8.616425658138674e-06, + "loss": 3.0007, + "mean_token_accuracy": 0.4679860302677532, + "step": 7463 + }, + { + "epoch": 1.3837597330367075, + "grad_norm": 7.30859375, + "learning_rate": 8.616240266963292e-06, + "loss": 2.3924, + "mean_token_accuracy": 0.5044800754539024, + "step": 7464 + }, + { + "epoch": 1.3839451242120875, + "grad_norm": 6.09375, + "learning_rate": 8.616054875787913e-06, + "loss": 2.955, + "mean_token_accuracy": 0.4501477679265827, + "step": 7465 + }, + { + "epoch": 1.3841305153874677, + "grad_norm": 8.1640625, + "learning_rate": 8.615869484612533e-06, + "loss": 2.4401, + "mean_token_accuracy": 0.5099355178312935, + "step": 7466 + }, + { + "epoch": 1.3843159065628476, + "grad_norm": 6.484375, + "learning_rate": 8.615684093437154e-06, + "loss": 3.801, + "mean_token_accuracy": 0.37421760554001865, + "step": 7467 + }, + { + "epoch": 1.3845012977382276, + "grad_norm": 5.515625, + "learning_rate": 8.615498702261773e-06, + "loss": 2.4741, + "mean_token_accuracy": 0.4952840781495621, + "step": 7468 + }, + { + "epoch": 1.3846866889136078, + "grad_norm": 7.68359375, + "learning_rate": 8.615313311086393e-06, + "loss": 2.9287, + "mean_token_accuracy": 0.45156908028800435, + "step": 7469 + }, + { + "epoch": 1.3848720800889878, + "grad_norm": 7.78515625, + "learning_rate": 8.615127919911012e-06, + "loss": 2.9389, + "mean_token_accuracy": 0.4543524416135881, + "step": 7470 + }, + { + "epoch": 1.3850574712643677, + "grad_norm": 5.8828125, + "learning_rate": 8.614942528735632e-06, + "loss": 2.8659, + "mean_token_accuracy": 0.45898604386214753, + "step": 7471 + }, + { + "epoch": 1.385242862439748, + "grad_norm": 5.3203125, + "learning_rate": 8.614757137560253e-06, + "loss": 3.0361, + "mean_token_accuracy": 0.4255263157894737, + "step": 7472 + }, + { + "epoch": 1.385428253615128, + "grad_norm": 5.9375, + "learning_rate": 8.614571746384873e-06, + "loss": 2.5852, + "mean_token_accuracy": 0.4952340902719372, + "step": 7473 + }, + { + "epoch": 1.3856136447905079, + "grad_norm": 6.17578125, + "learning_rate": 8.614386355209494e-06, + "loss": 3.2013, + "mean_token_accuracy": 0.44165621079046424, + "step": 7474 + }, + { + "epoch": 1.385799035965888, + "grad_norm": 5.9453125, + "learning_rate": 8.614200964034113e-06, + "loss": 2.1264, + "mean_token_accuracy": 0.5561867382772154, + "step": 7475 + }, + { + "epoch": 1.385984427141268, + "grad_norm": 5.86328125, + "learning_rate": 8.614015572858733e-06, + "loss": 2.9316, + "mean_token_accuracy": 0.4616190725700812, + "step": 7476 + }, + { + "epoch": 1.3861698183166482, + "grad_norm": 6.421875, + "learning_rate": 8.613830181683352e-06, + "loss": 3.4656, + "mean_token_accuracy": 0.4162466072120977, + "step": 7477 + }, + { + "epoch": 1.3863552094920282, + "grad_norm": 7.35546875, + "learning_rate": 8.613644790507972e-06, + "loss": 2.5155, + "mean_token_accuracy": 0.4760656642295489, + "step": 7478 + }, + { + "epoch": 1.3865406006674084, + "grad_norm": 5.75390625, + "learning_rate": 8.613459399332591e-06, + "loss": 3.1748, + "mean_token_accuracy": 0.4321182815696485, + "step": 7479 + }, + { + "epoch": 1.3867259918427883, + "grad_norm": 7.375, + "learning_rate": 8.613274008157212e-06, + "loss": 3.2484, + "mean_token_accuracy": 0.430954717510164, + "step": 7480 + }, + { + "epoch": 1.3869113830181683, + "grad_norm": 5.625, + "learning_rate": 8.613088616981832e-06, + "loss": 2.8459, + "mean_token_accuracy": 0.48796660117878193, + "step": 7481 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 5.53125, + "learning_rate": 8.612903225806453e-06, + "loss": 2.6467, + "mean_token_accuracy": 0.503669028340081, + "step": 7482 + }, + { + "epoch": 1.3872821653689285, + "grad_norm": 6.16796875, + "learning_rate": 8.612717834631073e-06, + "loss": 2.8613, + "mean_token_accuracy": 0.46348019165595417, + "step": 7483 + }, + { + "epoch": 1.3874675565443084, + "grad_norm": 5.34375, + "learning_rate": 8.612532443455692e-06, + "loss": 2.6693, + "mean_token_accuracy": 0.46148884014965813, + "step": 7484 + }, + { + "epoch": 1.3876529477196886, + "grad_norm": 5.1484375, + "learning_rate": 8.612347052280313e-06, + "loss": 3.1279, + "mean_token_accuracy": 0.42932645034414946, + "step": 7485 + }, + { + "epoch": 1.3878383388950686, + "grad_norm": 7.1953125, + "learning_rate": 8.612161661104931e-06, + "loss": 2.3405, + "mean_token_accuracy": 0.49112938122025096, + "step": 7486 + }, + { + "epoch": 1.3880237300704485, + "grad_norm": 5.21875, + "learning_rate": 8.611976269929552e-06, + "loss": 3.2291, + "mean_token_accuracy": 0.41451823690150885, + "step": 7487 + }, + { + "epoch": 1.3882091212458287, + "grad_norm": 6.125, + "learning_rate": 8.611790878754172e-06, + "loss": 2.9358, + "mean_token_accuracy": 0.46724470134874757, + "step": 7488 + }, + { + "epoch": 1.3883945124212087, + "grad_norm": 5.76171875, + "learning_rate": 8.611605487578793e-06, + "loss": 3.2967, + "mean_token_accuracy": 0.4176644931831654, + "step": 7489 + }, + { + "epoch": 1.3885799035965887, + "grad_norm": 6.74609375, + "learning_rate": 8.611420096403412e-06, + "loss": 3.3399, + "mean_token_accuracy": 0.4237076153418566, + "step": 7490 + }, + { + "epoch": 1.3887652947719689, + "grad_norm": 5.88671875, + "learning_rate": 8.611234705228032e-06, + "loss": 2.7049, + "mean_token_accuracy": 0.4617714770386526, + "step": 7491 + }, + { + "epoch": 1.3889506859473488, + "grad_norm": 6.83203125, + "learning_rate": 8.611049314052653e-06, + "loss": 3.1766, + "mean_token_accuracy": 0.4301994301994302, + "step": 7492 + }, + { + "epoch": 1.389136077122729, + "grad_norm": 5.48046875, + "learning_rate": 8.610863922877271e-06, + "loss": 2.5951, + "mean_token_accuracy": 0.481351689612015, + "step": 7493 + }, + { + "epoch": 1.389321468298109, + "grad_norm": 6.24609375, + "learning_rate": 8.610678531701892e-06, + "loss": 2.2081, + "mean_token_accuracy": 0.5127291913995515, + "step": 7494 + }, + { + "epoch": 1.3895068594734892, + "grad_norm": 5.96875, + "learning_rate": 8.61049314052651e-06, + "loss": 3.5189, + "mean_token_accuracy": 0.43254845843632106, + "step": 7495 + }, + { + "epoch": 1.3896922506488691, + "grad_norm": 5.55859375, + "learning_rate": 8.610307749351131e-06, + "loss": 2.8908, + "mean_token_accuracy": 0.4518822724161533, + "step": 7496 + }, + { + "epoch": 1.389877641824249, + "grad_norm": 6.19921875, + "learning_rate": 8.610122358175752e-06, + "loss": 2.6292, + "mean_token_accuracy": 0.49087661094806684, + "step": 7497 + }, + { + "epoch": 1.3900630329996293, + "grad_norm": 7.41796875, + "learning_rate": 8.609936967000372e-06, + "loss": 3.2729, + "mean_token_accuracy": 0.4257347991922818, + "step": 7498 + }, + { + "epoch": 1.3902484241750093, + "grad_norm": 6.1875, + "learning_rate": 8.609751575824991e-06, + "loss": 2.9784, + "mean_token_accuracy": 0.4564425004556224, + "step": 7499 + }, + { + "epoch": 1.3904338153503892, + "grad_norm": 5.7578125, + "learning_rate": 8.609566184649611e-06, + "loss": 2.8498, + "mean_token_accuracy": 0.4636938836999096, + "step": 7500 + }, + { + "epoch": 1.3906192065257694, + "grad_norm": 6.69140625, + "learning_rate": 8.609380793474232e-06, + "loss": 2.847, + "mean_token_accuracy": 0.4925982049189882, + "step": 7501 + }, + { + "epoch": 1.3908045977011494, + "grad_norm": 4.921875, + "learning_rate": 8.60919540229885e-06, + "loss": 2.7124, + "mean_token_accuracy": 0.49513070220399796, + "step": 7502 + }, + { + "epoch": 1.3909899888765294, + "grad_norm": 8.328125, + "learning_rate": 8.609010011123471e-06, + "loss": 3.1974, + "mean_token_accuracy": 0.42016179215930305, + "step": 7503 + }, + { + "epoch": 1.3911753800519095, + "grad_norm": 5.81640625, + "learning_rate": 8.60882461994809e-06, + "loss": 2.9254, + "mean_token_accuracy": 0.4465551839464883, + "step": 7504 + }, + { + "epoch": 1.3913607712272895, + "grad_norm": 5.41015625, + "learning_rate": 8.608639228772712e-06, + "loss": 3.1074, + "mean_token_accuracy": 0.43101326405032137, + "step": 7505 + }, + { + "epoch": 1.3915461624026697, + "grad_norm": 6.890625, + "learning_rate": 8.608453837597331e-06, + "loss": 2.9061, + "mean_token_accuracy": 0.4397948164146868, + "step": 7506 + }, + { + "epoch": 1.3917315535780497, + "grad_norm": 5.53515625, + "learning_rate": 8.608268446421951e-06, + "loss": 2.845, + "mean_token_accuracy": 0.45052463806614423, + "step": 7507 + }, + { + "epoch": 1.3919169447534299, + "grad_norm": 5.8046875, + "learning_rate": 8.60808305524657e-06, + "loss": 2.5062, + "mean_token_accuracy": 0.48328437543712405, + "step": 7508 + }, + { + "epoch": 1.3921023359288098, + "grad_norm": 6.2265625, + "learning_rate": 8.60789766407119e-06, + "loss": 2.8011, + "mean_token_accuracy": 0.473595333128646, + "step": 7509 + }, + { + "epoch": 1.3922877271041898, + "grad_norm": 6.03125, + "learning_rate": 8.607712272895811e-06, + "loss": 2.6691, + "mean_token_accuracy": 0.47118772018117766, + "step": 7510 + }, + { + "epoch": 1.39247311827957, + "grad_norm": 5.62109375, + "learning_rate": 8.60752688172043e-06, + "loss": 2.6488, + "mean_token_accuracy": 0.49080841638981176, + "step": 7511 + }, + { + "epoch": 1.39265850945495, + "grad_norm": 5.80859375, + "learning_rate": 8.60734149054505e-06, + "loss": 2.4396, + "mean_token_accuracy": 0.49196624667668476, + "step": 7512 + }, + { + "epoch": 1.39284390063033, + "grad_norm": 6.52734375, + "learning_rate": 8.607156099369671e-06, + "loss": 2.724, + "mean_token_accuracy": 0.4842219804134929, + "step": 7513 + }, + { + "epoch": 1.3930292918057101, + "grad_norm": 5.47265625, + "learning_rate": 8.606970708194292e-06, + "loss": 2.7364, + "mean_token_accuracy": 0.47397675593734206, + "step": 7514 + }, + { + "epoch": 1.39321468298109, + "grad_norm": 9.65625, + "learning_rate": 8.60678531701891e-06, + "loss": 2.5389, + "mean_token_accuracy": 0.4800979791794244, + "step": 7515 + }, + { + "epoch": 1.39340007415647, + "grad_norm": 6.66796875, + "learning_rate": 8.60659992584353e-06, + "loss": 2.7499, + "mean_token_accuracy": 0.47573786893446723, + "step": 7516 + }, + { + "epoch": 1.3935854653318502, + "grad_norm": 6.31640625, + "learning_rate": 8.60641453466815e-06, + "loss": 4.0145, + "mean_token_accuracy": 0.37669296026194377, + "step": 7517 + }, + { + "epoch": 1.3937708565072302, + "grad_norm": 5.80859375, + "learning_rate": 8.60622914349277e-06, + "loss": 2.9368, + "mean_token_accuracy": 0.44560903632848275, + "step": 7518 + }, + { + "epoch": 1.3939562476826102, + "grad_norm": 6.0703125, + "learning_rate": 8.60604375231739e-06, + "loss": 3.2694, + "mean_token_accuracy": 0.4164484829816726, + "step": 7519 + }, + { + "epoch": 1.3941416388579904, + "grad_norm": 6.203125, + "learning_rate": 8.60585836114201e-06, + "loss": 3.2934, + "mean_token_accuracy": 0.40725957340651114, + "step": 7520 + }, + { + "epoch": 1.3943270300333703, + "grad_norm": 6.28515625, + "learning_rate": 8.605672969966632e-06, + "loss": 3.2264, + "mean_token_accuracy": 0.4428301441058351, + "step": 7521 + }, + { + "epoch": 1.3945124212087505, + "grad_norm": 5.13671875, + "learning_rate": 8.60548757879125e-06, + "loss": 2.8318, + "mean_token_accuracy": 0.4493539796824144, + "step": 7522 + }, + { + "epoch": 1.3946978123841305, + "grad_norm": 8.234375, + "learning_rate": 8.605302187615871e-06, + "loss": 2.535, + "mean_token_accuracy": 0.47875218844501033, + "step": 7523 + }, + { + "epoch": 1.3948832035595107, + "grad_norm": 5.8828125, + "learning_rate": 8.60511679644049e-06, + "loss": 2.3364, + "mean_token_accuracy": 0.5174635906689007, + "step": 7524 + }, + { + "epoch": 1.3950685947348906, + "grad_norm": 6.10546875, + "learning_rate": 8.60493140526511e-06, + "loss": 3.1571, + "mean_token_accuracy": 0.42610981308411217, + "step": 7525 + }, + { + "epoch": 1.3952539859102706, + "grad_norm": 8.2265625, + "learning_rate": 8.60474601408973e-06, + "loss": 3.0124, + "mean_token_accuracy": 0.44651913324112497, + "step": 7526 + }, + { + "epoch": 1.3954393770856508, + "grad_norm": 6.39453125, + "learning_rate": 8.60456062291435e-06, + "loss": 2.6643, + "mean_token_accuracy": 0.47797867408437644, + "step": 7527 + }, + { + "epoch": 1.3956247682610308, + "grad_norm": 11.4921875, + "learning_rate": 8.60437523173897e-06, + "loss": 2.666, + "mean_token_accuracy": 0.46880991004276656, + "step": 7528 + }, + { + "epoch": 1.3958101594364107, + "grad_norm": 5.53515625, + "learning_rate": 8.60418984056359e-06, + "loss": 2.6346, + "mean_token_accuracy": 0.4814997533300444, + "step": 7529 + }, + { + "epoch": 1.395995550611791, + "grad_norm": 8.2109375, + "learning_rate": 8.604004449388211e-06, + "loss": 2.5642, + "mean_token_accuracy": 0.4786364916893605, + "step": 7530 + }, + { + "epoch": 1.396180941787171, + "grad_norm": 5.62109375, + "learning_rate": 8.60381905821283e-06, + "loss": 2.526, + "mean_token_accuracy": 0.5019698410542046, + "step": 7531 + }, + { + "epoch": 1.3963663329625509, + "grad_norm": 7.4453125, + "learning_rate": 8.60363366703745e-06, + "loss": 3.074, + "mean_token_accuracy": 0.4331819656179046, + "step": 7532 + }, + { + "epoch": 1.396551724137931, + "grad_norm": 9.8515625, + "learning_rate": 8.603448275862069e-06, + "loss": 2.8006, + "mean_token_accuracy": 0.4627552487776819, + "step": 7533 + }, + { + "epoch": 1.396737115313311, + "grad_norm": 5.4375, + "learning_rate": 8.60326288468669e-06, + "loss": 3.133, + "mean_token_accuracy": 0.4440777411074441, + "step": 7534 + }, + { + "epoch": 1.3969225064886912, + "grad_norm": 6.92578125, + "learning_rate": 8.60307749351131e-06, + "loss": 3.1841, + "mean_token_accuracy": 0.44229326129371155, + "step": 7535 + }, + { + "epoch": 1.3971078976640712, + "grad_norm": 9.5078125, + "learning_rate": 8.602892102335929e-06, + "loss": 2.8663, + "mean_token_accuracy": 0.4626280892103677, + "step": 7536 + }, + { + "epoch": 1.3972932888394514, + "grad_norm": 8.59375, + "learning_rate": 8.60270671116055e-06, + "loss": 3.2102, + "mean_token_accuracy": 0.4276973761619014, + "step": 7537 + }, + { + "epoch": 1.3974786800148313, + "grad_norm": 6.859375, + "learning_rate": 8.60252131998517e-06, + "loss": 2.8966, + "mean_token_accuracy": 0.44309151499615484, + "step": 7538 + }, + { + "epoch": 1.3976640711902113, + "grad_norm": 6.734375, + "learning_rate": 8.60233592880979e-06, + "loss": 2.4738, + "mean_token_accuracy": 0.48750709823963656, + "step": 7539 + }, + { + "epoch": 1.3978494623655915, + "grad_norm": 7.45703125, + "learning_rate": 8.602150537634409e-06, + "loss": 2.6338, + "mean_token_accuracy": 0.4611682345219864, + "step": 7540 + }, + { + "epoch": 1.3980348535409715, + "grad_norm": 7.57421875, + "learning_rate": 8.60196514645903e-06, + "loss": 2.5671, + "mean_token_accuracy": 0.48541309144398137, + "step": 7541 + }, + { + "epoch": 1.3982202447163514, + "grad_norm": 5.99609375, + "learning_rate": 8.601779755283648e-06, + "loss": 3.1215, + "mean_token_accuracy": 0.4295327102803738, + "step": 7542 + }, + { + "epoch": 1.3984056358917316, + "grad_norm": 7.484375, + "learning_rate": 8.601594364108269e-06, + "loss": 2.9964, + "mean_token_accuracy": 0.456026600166251, + "step": 7543 + }, + { + "epoch": 1.3985910270671116, + "grad_norm": 6.62890625, + "learning_rate": 8.60140897293289e-06, + "loss": 2.2819, + "mean_token_accuracy": 0.5139420448332422, + "step": 7544 + }, + { + "epoch": 1.3987764182424915, + "grad_norm": 6.34375, + "learning_rate": 8.60122358175751e-06, + "loss": 2.7956, + "mean_token_accuracy": 0.46491728465487736, + "step": 7545 + }, + { + "epoch": 1.3989618094178717, + "grad_norm": 6.33984375, + "learning_rate": 8.601038190582129e-06, + "loss": 2.9488, + "mean_token_accuracy": 0.4514762969062463, + "step": 7546 + }, + { + "epoch": 1.3991472005932517, + "grad_norm": 7.96484375, + "learning_rate": 8.600852799406749e-06, + "loss": 3.3419, + "mean_token_accuracy": 0.4166666666666667, + "step": 7547 + }, + { + "epoch": 1.399332591768632, + "grad_norm": 10.265625, + "learning_rate": 8.60066740823137e-06, + "loss": 2.7964, + "mean_token_accuracy": 0.4583766137409111, + "step": 7548 + }, + { + "epoch": 1.3995179829440119, + "grad_norm": 8.9296875, + "learning_rate": 8.600482017055988e-06, + "loss": 2.5309, + "mean_token_accuracy": 0.504236262406197, + "step": 7549 + }, + { + "epoch": 1.399703374119392, + "grad_norm": 7.70703125, + "learning_rate": 8.600296625880609e-06, + "loss": 3.3935, + "mean_token_accuracy": 0.42084648025242083, + "step": 7550 + }, + { + "epoch": 1.399888765294772, + "grad_norm": 8.8515625, + "learning_rate": 8.600111234705228e-06, + "loss": 3.1819, + "mean_token_accuracy": 0.4427761094427761, + "step": 7551 + }, + { + "epoch": 1.400074156470152, + "grad_norm": 5.1953125, + "learning_rate": 8.599925843529848e-06, + "loss": 2.8434, + "mean_token_accuracy": 0.4539869106451182, + "step": 7552 + }, + { + "epoch": 1.4002595476455322, + "grad_norm": 6.203125, + "learning_rate": 8.599740452354469e-06, + "loss": 2.6241, + "mean_token_accuracy": 0.4700646262650896, + "step": 7553 + }, + { + "epoch": 1.4004449388209121, + "grad_norm": 6.15625, + "learning_rate": 8.59955506117909e-06, + "loss": 2.8078, + "mean_token_accuracy": 0.4507689021785562, + "step": 7554 + }, + { + "epoch": 1.4006303299962921, + "grad_norm": 5.07421875, + "learning_rate": 8.599369670003708e-06, + "loss": 2.5778, + "mean_token_accuracy": 0.4873939393939394, + "step": 7555 + }, + { + "epoch": 1.4008157211716723, + "grad_norm": 6.7265625, + "learning_rate": 8.599184278828328e-06, + "loss": 2.8147, + "mean_token_accuracy": 0.4830166954519286, + "step": 7556 + }, + { + "epoch": 1.4010011123470523, + "grad_norm": 6.2734375, + "learning_rate": 8.598998887652949e-06, + "loss": 2.7944, + "mean_token_accuracy": 0.4523403217942467, + "step": 7557 + }, + { + "epoch": 1.4011865035224322, + "grad_norm": 5.6953125, + "learning_rate": 8.598813496477568e-06, + "loss": 2.3654, + "mean_token_accuracy": 0.5004765146358067, + "step": 7558 + }, + { + "epoch": 1.4013718946978124, + "grad_norm": 5.9140625, + "learning_rate": 8.598628105302188e-06, + "loss": 1.8951, + "mean_token_accuracy": 0.5707808564231738, + "step": 7559 + }, + { + "epoch": 1.4015572858731924, + "grad_norm": 7.80078125, + "learning_rate": 8.598442714126807e-06, + "loss": 2.5186, + "mean_token_accuracy": 0.4939209726443769, + "step": 7560 + }, + { + "epoch": 1.4017426770485724, + "grad_norm": 9.171875, + "learning_rate": 8.598257322951428e-06, + "loss": 2.9644, + "mean_token_accuracy": 0.46165113182423434, + "step": 7561 + }, + { + "epoch": 1.4019280682239526, + "grad_norm": 7.484375, + "learning_rate": 8.598071931776048e-06, + "loss": 3.4942, + "mean_token_accuracy": 0.40168539325842695, + "step": 7562 + }, + { + "epoch": 1.4021134593993325, + "grad_norm": 6.14453125, + "learning_rate": 8.597886540600669e-06, + "loss": 3.3395, + "mean_token_accuracy": 0.4283195592286501, + "step": 7563 + }, + { + "epoch": 1.4022988505747127, + "grad_norm": 7.5390625, + "learning_rate": 8.597701149425289e-06, + "loss": 2.7002, + "mean_token_accuracy": 0.47047078604455655, + "step": 7564 + }, + { + "epoch": 1.4024842417500927, + "grad_norm": 7.9296875, + "learning_rate": 8.597515758249908e-06, + "loss": 2.8029, + "mean_token_accuracy": 0.467110125646711, + "step": 7565 + }, + { + "epoch": 1.4026696329254729, + "grad_norm": 7.41796875, + "learning_rate": 8.597330367074528e-06, + "loss": 2.785, + "mean_token_accuracy": 0.4522977694168234, + "step": 7566 + }, + { + "epoch": 1.4028550241008528, + "grad_norm": 6.8515625, + "learning_rate": 8.597144975899147e-06, + "loss": 2.6134, + "mean_token_accuracy": 0.4581900130470881, + "step": 7567 + }, + { + "epoch": 1.4030404152762328, + "grad_norm": 9.1640625, + "learning_rate": 8.596959584723768e-06, + "loss": 2.7567, + "mean_token_accuracy": 0.46747967479674796, + "step": 7568 + }, + { + "epoch": 1.403225806451613, + "grad_norm": 7.37109375, + "learning_rate": 8.596774193548388e-06, + "loss": 2.9283, + "mean_token_accuracy": 0.44391622767570343, + "step": 7569 + }, + { + "epoch": 1.403411197626993, + "grad_norm": 5.80078125, + "learning_rate": 8.596588802373009e-06, + "loss": 2.9584, + "mean_token_accuracy": 0.4473519469137348, + "step": 7570 + }, + { + "epoch": 1.403596588802373, + "grad_norm": 9.7734375, + "learning_rate": 8.596403411197627e-06, + "loss": 2.9271, + "mean_token_accuracy": 0.4757441210327249, + "step": 7571 + }, + { + "epoch": 1.4037819799777531, + "grad_norm": 7.17578125, + "learning_rate": 8.596218020022248e-06, + "loss": 2.5833, + "mean_token_accuracy": 0.49915984036967026, + "step": 7572 + }, + { + "epoch": 1.403967371153133, + "grad_norm": 7.859375, + "learning_rate": 8.596032628846868e-06, + "loss": 2.6068, + "mean_token_accuracy": 0.47024066868424846, + "step": 7573 + }, + { + "epoch": 1.404152762328513, + "grad_norm": 10.71875, + "learning_rate": 8.595847237671487e-06, + "loss": 2.3354, + "mean_token_accuracy": 0.5339408346539883, + "step": 7574 + }, + { + "epoch": 1.4043381535038932, + "grad_norm": 10.5390625, + "learning_rate": 8.595661846496108e-06, + "loss": 2.4981, + "mean_token_accuracy": 0.5108205590622182, + "step": 7575 + }, + { + "epoch": 1.4045235446792732, + "grad_norm": 20.953125, + "learning_rate": 8.595476455320726e-06, + "loss": 2.1395, + "mean_token_accuracy": 0.5107518442944593, + "step": 7576 + }, + { + "epoch": 1.4047089358546534, + "grad_norm": 16.0, + "learning_rate": 8.595291064145347e-06, + "loss": 2.1315, + "mean_token_accuracy": 0.5292562363490056, + "step": 7577 + }, + { + "epoch": 1.4048943270300334, + "grad_norm": 9.1640625, + "learning_rate": 8.595105672969967e-06, + "loss": 2.7965, + "mean_token_accuracy": 0.47703960864762507, + "step": 7578 + }, + { + "epoch": 1.4050797182054136, + "grad_norm": 8.1796875, + "learning_rate": 8.594920281794588e-06, + "loss": 3.5582, + "mean_token_accuracy": 0.4154103852596315, + "step": 7579 + }, + { + "epoch": 1.4052651093807935, + "grad_norm": 7.69921875, + "learning_rate": 8.594734890619207e-06, + "loss": 2.5722, + "mean_token_accuracy": 0.49080141129032256, + "step": 7580 + }, + { + "epoch": 1.4054505005561735, + "grad_norm": 12.1953125, + "learning_rate": 8.594549499443827e-06, + "loss": 2.6134, + "mean_token_accuracy": 0.46299425481581613, + "step": 7581 + }, + { + "epoch": 1.4056358917315537, + "grad_norm": 7.19921875, + "learning_rate": 8.594364108268448e-06, + "loss": 2.4422, + "mean_token_accuracy": 0.4998076676496987, + "step": 7582 + }, + { + "epoch": 1.4058212829069336, + "grad_norm": 7.4765625, + "learning_rate": 8.594178717093066e-06, + "loss": 3.078, + "mean_token_accuracy": 0.45777866083846613, + "step": 7583 + }, + { + "epoch": 1.4060066740823136, + "grad_norm": 14.390625, + "learning_rate": 8.593993325917687e-06, + "loss": 2.6235, + "mean_token_accuracy": 0.507523475506279, + "step": 7584 + }, + { + "epoch": 1.4061920652576938, + "grad_norm": 17.28125, + "learning_rate": 8.593807934742306e-06, + "loss": 2.5532, + "mean_token_accuracy": 0.47417840375586856, + "step": 7585 + }, + { + "epoch": 1.4063774564330738, + "grad_norm": 8.1484375, + "learning_rate": 8.593622543566928e-06, + "loss": 3.0859, + "mean_token_accuracy": 0.4334071885770556, + "step": 7586 + }, + { + "epoch": 1.4065628476084537, + "grad_norm": 6.40234375, + "learning_rate": 8.593437152391547e-06, + "loss": 3.1547, + "mean_token_accuracy": 0.42726517040731504, + "step": 7587 + }, + { + "epoch": 1.406748238783834, + "grad_norm": 11.46875, + "learning_rate": 8.593251761216167e-06, + "loss": 2.6421, + "mean_token_accuracy": 0.47375522871407366, + "step": 7588 + }, + { + "epoch": 1.406933629959214, + "grad_norm": 8.578125, + "learning_rate": 8.593066370040786e-06, + "loss": 2.9107, + "mean_token_accuracy": 0.4473378669762813, + "step": 7589 + }, + { + "epoch": 1.4071190211345939, + "grad_norm": 9.2578125, + "learning_rate": 8.592880978865407e-06, + "loss": 2.5158, + "mean_token_accuracy": 0.5033976624082631, + "step": 7590 + }, + { + "epoch": 1.407304412309974, + "grad_norm": 7.08203125, + "learning_rate": 8.592695587690027e-06, + "loss": 3.6767, + "mean_token_accuracy": 0.418, + "step": 7591 + }, + { + "epoch": 1.407489803485354, + "grad_norm": 10.2109375, + "learning_rate": 8.592510196514646e-06, + "loss": 2.6872, + "mean_token_accuracy": 0.4579204965978274, + "step": 7592 + }, + { + "epoch": 1.4076751946607342, + "grad_norm": 12.8984375, + "learning_rate": 8.592324805339266e-06, + "loss": 2.9897, + "mean_token_accuracy": 0.46375739644970415, + "step": 7593 + }, + { + "epoch": 1.4078605858361142, + "grad_norm": 6.00390625, + "learning_rate": 8.592139414163887e-06, + "loss": 2.5909, + "mean_token_accuracy": 0.46290762634792776, + "step": 7594 + }, + { + "epoch": 1.4080459770114944, + "grad_norm": 7.51171875, + "learning_rate": 8.591954022988507e-06, + "loss": 2.8017, + "mean_token_accuracy": 0.460446247464503, + "step": 7595 + }, + { + "epoch": 1.4082313681868743, + "grad_norm": 9.1796875, + "learning_rate": 8.591768631813126e-06, + "loss": 3.1331, + "mean_token_accuracy": 0.4244349419670128, + "step": 7596 + }, + { + "epoch": 1.4084167593622543, + "grad_norm": 9.328125, + "learning_rate": 8.591583240637747e-06, + "loss": 2.4731, + "mean_token_accuracy": 0.49089798411728774, + "step": 7597 + }, + { + "epoch": 1.4086021505376345, + "grad_norm": 5.73828125, + "learning_rate": 8.591397849462365e-06, + "loss": 2.7015, + "mean_token_accuracy": 0.4763336674462635, + "step": 7598 + }, + { + "epoch": 1.4087875417130145, + "grad_norm": 7.59765625, + "learning_rate": 8.591212458286986e-06, + "loss": 2.4549, + "mean_token_accuracy": 0.5109761793554414, + "step": 7599 + }, + { + "epoch": 1.4089729328883944, + "grad_norm": 6.25390625, + "learning_rate": 8.591027067111606e-06, + "loss": 2.5481, + "mean_token_accuracy": 0.48603504928806135, + "step": 7600 + }, + { + "epoch": 1.4091583240637746, + "grad_norm": 5.68359375, + "learning_rate": 8.590841675936225e-06, + "loss": 2.7613, + "mean_token_accuracy": 0.4571329799492971, + "step": 7601 + }, + { + "epoch": 1.4093437152391546, + "grad_norm": 9.15625, + "learning_rate": 8.590656284760847e-06, + "loss": 2.4423, + "mean_token_accuracy": 0.5031454298754163, + "step": 7602 + }, + { + "epoch": 1.4095291064145345, + "grad_norm": 5.9296875, + "learning_rate": 8.590470893585466e-06, + "loss": 2.4711, + "mean_token_accuracy": 0.4877356347944583, + "step": 7603 + }, + { + "epoch": 1.4097144975899147, + "grad_norm": 5.40625, + "learning_rate": 8.590285502410087e-06, + "loss": 2.6203, + "mean_token_accuracy": 0.48739495798319327, + "step": 7604 + }, + { + "epoch": 1.4098998887652947, + "grad_norm": 6.43359375, + "learning_rate": 8.590100111234705e-06, + "loss": 3.2928, + "mean_token_accuracy": 0.422360248447205, + "step": 7605 + }, + { + "epoch": 1.410085279940675, + "grad_norm": 5.53515625, + "learning_rate": 8.589914720059326e-06, + "loss": 3.5048, + "mean_token_accuracy": 0.40875912408759124, + "step": 7606 + }, + { + "epoch": 1.4102706711160549, + "grad_norm": 6.82421875, + "learning_rate": 8.589729328883946e-06, + "loss": 3.0202, + "mean_token_accuracy": 0.44676737160120844, + "step": 7607 + }, + { + "epoch": 1.410456062291435, + "grad_norm": 6.234375, + "learning_rate": 8.589543937708565e-06, + "loss": 2.7959, + "mean_token_accuracy": 0.47561937825469464, + "step": 7608 + }, + { + "epoch": 1.410641453466815, + "grad_norm": 5.3671875, + "learning_rate": 8.589358546533186e-06, + "loss": 2.0832, + "mean_token_accuracy": 0.5656208033207178, + "step": 7609 + }, + { + "epoch": 1.410826844642195, + "grad_norm": 7.28515625, + "learning_rate": 8.589173155357806e-06, + "loss": 2.9394, + "mean_token_accuracy": 0.4381611597231292, + "step": 7610 + }, + { + "epoch": 1.4110122358175752, + "grad_norm": 6.23828125, + "learning_rate": 8.588987764182427e-06, + "loss": 2.9802, + "mean_token_accuracy": 0.4591329068941009, + "step": 7611 + }, + { + "epoch": 1.4111976269929551, + "grad_norm": 10.15625, + "learning_rate": 8.588802373007045e-06, + "loss": 2.3685, + "mean_token_accuracy": 0.4988829018267841, + "step": 7612 + }, + { + "epoch": 1.4113830181683351, + "grad_norm": 5.39453125, + "learning_rate": 8.588616981831666e-06, + "loss": 2.8282, + "mean_token_accuracy": 0.4635056525079574, + "step": 7613 + }, + { + "epoch": 1.4115684093437153, + "grad_norm": 7.0625, + "learning_rate": 8.588431590656285e-06, + "loss": 2.5143, + "mean_token_accuracy": 0.49886694723211394, + "step": 7614 + }, + { + "epoch": 1.4117538005190953, + "grad_norm": 6.82421875, + "learning_rate": 8.588246199480905e-06, + "loss": 3.0048, + "mean_token_accuracy": 0.4475635593220339, + "step": 7615 + }, + { + "epoch": 1.4119391916944752, + "grad_norm": 6.6015625, + "learning_rate": 8.588060808305526e-06, + "loss": 2.7165, + "mean_token_accuracy": 0.46893813244524146, + "step": 7616 + }, + { + "epoch": 1.4121245828698554, + "grad_norm": 7.046875, + "learning_rate": 8.587875417130145e-06, + "loss": 2.2444, + "mean_token_accuracy": 0.5148683092608326, + "step": 7617 + }, + { + "epoch": 1.4123099740452354, + "grad_norm": 9.3125, + "learning_rate": 8.587690025954765e-06, + "loss": 2.5737, + "mean_token_accuracy": 0.47648514851485146, + "step": 7618 + }, + { + "epoch": 1.4124953652206154, + "grad_norm": 5.515625, + "learning_rate": 8.587504634779386e-06, + "loss": 2.8072, + "mean_token_accuracy": 0.4528301886792453, + "step": 7619 + }, + { + "epoch": 1.4126807563959956, + "grad_norm": 6.03125, + "learning_rate": 8.587319243604006e-06, + "loss": 2.4569, + "mean_token_accuracy": 0.4955592740378427, + "step": 7620 + }, + { + "epoch": 1.4128661475713757, + "grad_norm": 11.484375, + "learning_rate": 8.587133852428625e-06, + "loss": 2.9092, + "mean_token_accuracy": 0.45873055694932263, + "step": 7621 + }, + { + "epoch": 1.4130515387467557, + "grad_norm": 7.765625, + "learning_rate": 8.586948461253245e-06, + "loss": 2.272, + "mean_token_accuracy": 0.5451114518221488, + "step": 7622 + }, + { + "epoch": 1.4132369299221357, + "grad_norm": 6.1953125, + "learning_rate": 8.586763070077864e-06, + "loss": 2.6782, + "mean_token_accuracy": 0.4687468545546049, + "step": 7623 + }, + { + "epoch": 1.4134223210975159, + "grad_norm": 7.01171875, + "learning_rate": 8.586577678902485e-06, + "loss": 3.4935, + "mean_token_accuracy": 0.39215686274509803, + "step": 7624 + }, + { + "epoch": 1.4136077122728958, + "grad_norm": 7.94140625, + "learning_rate": 8.586392287727105e-06, + "loss": 2.8194, + "mean_token_accuracy": 0.4632405424696645, + "step": 7625 + }, + { + "epoch": 1.4137931034482758, + "grad_norm": 5.84375, + "learning_rate": 8.586206896551726e-06, + "loss": 3.0251, + "mean_token_accuracy": 0.43719706411854314, + "step": 7626 + }, + { + "epoch": 1.413978494623656, + "grad_norm": 7.0234375, + "learning_rate": 8.586021505376344e-06, + "loss": 3.3982, + "mean_token_accuracy": 0.39950654773201744, + "step": 7627 + }, + { + "epoch": 1.414163885799036, + "grad_norm": 6.79296875, + "learning_rate": 8.585836114200965e-06, + "loss": 2.9332, + "mean_token_accuracy": 0.4433802816901408, + "step": 7628 + }, + { + "epoch": 1.414349276974416, + "grad_norm": 7.85546875, + "learning_rate": 8.585650723025585e-06, + "loss": 2.771, + "mean_token_accuracy": 0.4742998537847261, + "step": 7629 + }, + { + "epoch": 1.4145346681497961, + "grad_norm": 7.45703125, + "learning_rate": 8.585465331850204e-06, + "loss": 2.1059, + "mean_token_accuracy": 0.5609429689108503, + "step": 7630 + }, + { + "epoch": 1.414720059325176, + "grad_norm": 7.88671875, + "learning_rate": 8.585279940674825e-06, + "loss": 2.222, + "mean_token_accuracy": 0.5087658989343417, + "step": 7631 + }, + { + "epoch": 1.414905450500556, + "grad_norm": 8.546875, + "learning_rate": 8.585094549499443e-06, + "loss": 3.206, + "mean_token_accuracy": 0.4407299493942647, + "step": 7632 + }, + { + "epoch": 1.4150908416759362, + "grad_norm": 7.05078125, + "learning_rate": 8.584909158324064e-06, + "loss": 2.6048, + "mean_token_accuracy": 0.47247013593299464, + "step": 7633 + }, + { + "epoch": 1.4152762328513162, + "grad_norm": 8.109375, + "learning_rate": 8.584723767148684e-06, + "loss": 2.5969, + "mean_token_accuracy": 0.4695606830921653, + "step": 7634 + }, + { + "epoch": 1.4154616240266964, + "grad_norm": 6.78515625, + "learning_rate": 8.584538375973305e-06, + "loss": 3.0512, + "mean_token_accuracy": 0.43850703650826023, + "step": 7635 + }, + { + "epoch": 1.4156470152020764, + "grad_norm": 7.984375, + "learning_rate": 8.584352984797924e-06, + "loss": 2.8041, + "mean_token_accuracy": 0.44735507858712187, + "step": 7636 + }, + { + "epoch": 1.4158324063774566, + "grad_norm": 7.6953125, + "learning_rate": 8.584167593622544e-06, + "loss": 2.8911, + "mean_token_accuracy": 0.44742818971275883, + "step": 7637 + }, + { + "epoch": 1.4160177975528365, + "grad_norm": 6.65625, + "learning_rate": 8.583982202447165e-06, + "loss": 3.0607, + "mean_token_accuracy": 0.43715680292861503, + "step": 7638 + }, + { + "epoch": 1.4162031887282165, + "grad_norm": 7.1328125, + "learning_rate": 8.583796811271783e-06, + "loss": 3.06, + "mean_token_accuracy": 0.44552746471723015, + "step": 7639 + }, + { + "epoch": 1.4163885799035967, + "grad_norm": 6.91015625, + "learning_rate": 8.583611420096404e-06, + "loss": 3.1304, + "mean_token_accuracy": 0.4266384088686012, + "step": 7640 + }, + { + "epoch": 1.4165739710789766, + "grad_norm": 5.77734375, + "learning_rate": 8.583426028921023e-06, + "loss": 3.0355, + "mean_token_accuracy": 0.4720872347990992, + "step": 7641 + }, + { + "epoch": 1.4167593622543566, + "grad_norm": 5.953125, + "learning_rate": 8.583240637745645e-06, + "loss": 2.5934, + "mean_token_accuracy": 0.47260686333534013, + "step": 7642 + }, + { + "epoch": 1.4169447534297368, + "grad_norm": 6.1875, + "learning_rate": 8.583055246570264e-06, + "loss": 2.7985, + "mean_token_accuracy": 0.46864975211431903, + "step": 7643 + }, + { + "epoch": 1.4171301446051168, + "grad_norm": 11.3359375, + "learning_rate": 8.582869855394884e-06, + "loss": 2.4086, + "mean_token_accuracy": 0.48792212474462204, + "step": 7644 + }, + { + "epoch": 1.4173155357804967, + "grad_norm": 6.44921875, + "learning_rate": 8.582684464219505e-06, + "loss": 3.0392, + "mean_token_accuracy": 0.4287531806615776, + "step": 7645 + }, + { + "epoch": 1.417500926955877, + "grad_norm": 6.11328125, + "learning_rate": 8.582499073044124e-06, + "loss": 2.5881, + "mean_token_accuracy": 0.4728149663840982, + "step": 7646 + }, + { + "epoch": 1.417686318131257, + "grad_norm": 9.34375, + "learning_rate": 8.582313681868744e-06, + "loss": 2.4769, + "mean_token_accuracy": 0.5116625983184161, + "step": 7647 + }, + { + "epoch": 1.417871709306637, + "grad_norm": 8.6796875, + "learning_rate": 8.582128290693363e-06, + "loss": 2.9074, + "mean_token_accuracy": 0.4423733263323707, + "step": 7648 + }, + { + "epoch": 1.418057100482017, + "grad_norm": 6.140625, + "learning_rate": 8.581942899517983e-06, + "loss": 2.7226, + "mean_token_accuracy": 0.4660577971646674, + "step": 7649 + }, + { + "epoch": 1.4182424916573972, + "grad_norm": 7.34765625, + "learning_rate": 8.581757508342604e-06, + "loss": 2.1998, + "mean_token_accuracy": 0.5422664790561749, + "step": 7650 + }, + { + "epoch": 1.4184278828327772, + "grad_norm": 9.75, + "learning_rate": 8.581572117167224e-06, + "loss": 3.0443, + "mean_token_accuracy": 0.4627366403067338, + "step": 7651 + }, + { + "epoch": 1.4186132740081572, + "grad_norm": 15.359375, + "learning_rate": 8.581386725991843e-06, + "loss": 2.5842, + "mean_token_accuracy": 0.4750251907298114, + "step": 7652 + }, + { + "epoch": 1.4187986651835374, + "grad_norm": 9.6484375, + "learning_rate": 8.581201334816464e-06, + "loss": 2.6313, + "mean_token_accuracy": 0.4674500587544066, + "step": 7653 + }, + { + "epoch": 1.4189840563589173, + "grad_norm": 6.83203125, + "learning_rate": 8.581015943641084e-06, + "loss": 2.5113, + "mean_token_accuracy": 0.49000799360511593, + "step": 7654 + }, + { + "epoch": 1.4191694475342973, + "grad_norm": 11.8359375, + "learning_rate": 8.580830552465703e-06, + "loss": 2.8825, + "mean_token_accuracy": 0.4323339406990227, + "step": 7655 + }, + { + "epoch": 1.4193548387096775, + "grad_norm": 7.83203125, + "learning_rate": 8.580645161290323e-06, + "loss": 2.8854, + "mean_token_accuracy": 0.44866385372714485, + "step": 7656 + }, + { + "epoch": 1.4195402298850575, + "grad_norm": 6.53515625, + "learning_rate": 8.580459770114942e-06, + "loss": 2.78, + "mean_token_accuracy": 0.46712714249168585, + "step": 7657 + }, + { + "epoch": 1.4197256210604374, + "grad_norm": 9.1484375, + "learning_rate": 8.580274378939564e-06, + "loss": 2.6633, + "mean_token_accuracy": 0.47552255225522555, + "step": 7658 + }, + { + "epoch": 1.4199110122358176, + "grad_norm": 8.7734375, + "learning_rate": 8.580088987764183e-06, + "loss": 3.576, + "mean_token_accuracy": 0.413478012564249, + "step": 7659 + }, + { + "epoch": 1.4200964034111976, + "grad_norm": 7.578125, + "learning_rate": 8.579903596588804e-06, + "loss": 2.247, + "mean_token_accuracy": 0.5579925650557621, + "step": 7660 + }, + { + "epoch": 1.4202817945865776, + "grad_norm": 5.65234375, + "learning_rate": 8.579718205413422e-06, + "loss": 2.7061, + "mean_token_accuracy": 0.45941807044410415, + "step": 7661 + }, + { + "epoch": 1.4204671857619577, + "grad_norm": 5.92578125, + "learning_rate": 8.579532814238043e-06, + "loss": 2.3245, + "mean_token_accuracy": 0.530825901512214, + "step": 7662 + }, + { + "epoch": 1.4206525769373377, + "grad_norm": 5.7109375, + "learning_rate": 8.579347423062663e-06, + "loss": 2.6495, + "mean_token_accuracy": 0.4896580835795694, + "step": 7663 + }, + { + "epoch": 1.420837968112718, + "grad_norm": 5.30078125, + "learning_rate": 8.579162031887282e-06, + "loss": 2.5158, + "mean_token_accuracy": 0.5123512095225905, + "step": 7664 + }, + { + "epoch": 1.4210233592880979, + "grad_norm": 7.19140625, + "learning_rate": 8.578976640711903e-06, + "loss": 2.4797, + "mean_token_accuracy": 0.5045794167269222, + "step": 7665 + }, + { + "epoch": 1.421208750463478, + "grad_norm": 4.89453125, + "learning_rate": 8.578791249536523e-06, + "loss": 2.4155, + "mean_token_accuracy": 0.5364612150049796, + "step": 7666 + }, + { + "epoch": 1.421394141638858, + "grad_norm": 5.9375, + "learning_rate": 8.578605858361144e-06, + "loss": 3.0173, + "mean_token_accuracy": 0.44624644708242767, + "step": 7667 + }, + { + "epoch": 1.421579532814238, + "grad_norm": 6.30078125, + "learning_rate": 8.578420467185763e-06, + "loss": 2.3086, + "mean_token_accuracy": 0.5440675657267402, + "step": 7668 + }, + { + "epoch": 1.4217649239896182, + "grad_norm": 7.11328125, + "learning_rate": 8.578235076010383e-06, + "loss": 2.6213, + "mean_token_accuracy": 0.47236965344989074, + "step": 7669 + }, + { + "epoch": 1.4219503151649981, + "grad_norm": 7.4921875, + "learning_rate": 8.578049684835002e-06, + "loss": 3.0035, + "mean_token_accuracy": 0.4582432432432432, + "step": 7670 + }, + { + "epoch": 1.4221357063403781, + "grad_norm": 5.69921875, + "learning_rate": 8.577864293659622e-06, + "loss": 3.0463, + "mean_token_accuracy": 0.44479085476747204, + "step": 7671 + }, + { + "epoch": 1.4223210975157583, + "grad_norm": 6.640625, + "learning_rate": 8.577678902484243e-06, + "loss": 2.3926, + "mean_token_accuracy": 0.5022312373225152, + "step": 7672 + }, + { + "epoch": 1.4225064886911383, + "grad_norm": 6.7421875, + "learning_rate": 8.577493511308862e-06, + "loss": 3.6338, + "mean_token_accuracy": 0.39423301424235346, + "step": 7673 + }, + { + "epoch": 1.4226918798665182, + "grad_norm": 6.0859375, + "learning_rate": 8.577308120133482e-06, + "loss": 2.859, + "mean_token_accuracy": 0.46866325785244706, + "step": 7674 + }, + { + "epoch": 1.4228772710418984, + "grad_norm": 7.25, + "learning_rate": 8.577122728958103e-06, + "loss": 3.2182, + "mean_token_accuracy": 0.43261490521942353, + "step": 7675 + }, + { + "epoch": 1.4230626622172784, + "grad_norm": 11.421875, + "learning_rate": 8.576937337782723e-06, + "loss": 2.7552, + "mean_token_accuracy": 0.46250515039142975, + "step": 7676 + }, + { + "epoch": 1.4232480533926586, + "grad_norm": 9.046875, + "learning_rate": 8.576751946607342e-06, + "loss": 2.6495, + "mean_token_accuracy": 0.48495627686379134, + "step": 7677 + }, + { + "epoch": 1.4234334445680386, + "grad_norm": 5.83984375, + "learning_rate": 8.576566555431962e-06, + "loss": 2.8948, + "mean_token_accuracy": 0.458525667925584, + "step": 7678 + }, + { + "epoch": 1.4236188357434187, + "grad_norm": 10.1484375, + "learning_rate": 8.576381164256581e-06, + "loss": 2.3759, + "mean_token_accuracy": 0.47884167114936343, + "step": 7679 + }, + { + "epoch": 1.4238042269187987, + "grad_norm": 9.78125, + "learning_rate": 8.576195773081202e-06, + "loss": 2.9394, + "mean_token_accuracy": 0.44356464152029945, + "step": 7680 + }, + { + "epoch": 1.4239896180941787, + "grad_norm": 5.43359375, + "learning_rate": 8.576010381905822e-06, + "loss": 2.4844, + "mean_token_accuracy": 0.5092643051771117, + "step": 7681 + }, + { + "epoch": 1.4241750092695589, + "grad_norm": 7.9921875, + "learning_rate": 8.575824990730441e-06, + "loss": 3.3362, + "mean_token_accuracy": 0.414251497005988, + "step": 7682 + }, + { + "epoch": 1.4243604004449388, + "grad_norm": 7.1796875, + "learning_rate": 8.575639599555063e-06, + "loss": 2.3663, + "mean_token_accuracy": 0.5028409090909091, + "step": 7683 + }, + { + "epoch": 1.4245457916203188, + "grad_norm": 10.84375, + "learning_rate": 8.575454208379682e-06, + "loss": 2.2392, + "mean_token_accuracy": 0.5089828830103268, + "step": 7684 + }, + { + "epoch": 1.424731182795699, + "grad_norm": 7.91796875, + "learning_rate": 8.575268817204302e-06, + "loss": 2.4604, + "mean_token_accuracy": 0.49593593114989243, + "step": 7685 + }, + { + "epoch": 1.424916573971079, + "grad_norm": 9.4296875, + "learning_rate": 8.575083426028921e-06, + "loss": 3.0835, + "mean_token_accuracy": 0.44105192779139735, + "step": 7686 + }, + { + "epoch": 1.425101965146459, + "grad_norm": 9.1328125, + "learning_rate": 8.574898034853542e-06, + "loss": 3.0851, + "mean_token_accuracy": 0.4320540067508439, + "step": 7687 + }, + { + "epoch": 1.4252873563218391, + "grad_norm": 6.49609375, + "learning_rate": 8.574712643678162e-06, + "loss": 2.7192, + "mean_token_accuracy": 0.47838874680306903, + "step": 7688 + }, + { + "epoch": 1.425472747497219, + "grad_norm": 7.796875, + "learning_rate": 8.574527252502781e-06, + "loss": 2.8867, + "mean_token_accuracy": 0.4886224281237226, + "step": 7689 + }, + { + "epoch": 1.425658138672599, + "grad_norm": 7.203125, + "learning_rate": 8.574341861327401e-06, + "loss": 2.8562, + "mean_token_accuracy": 0.46066738947114805, + "step": 7690 + }, + { + "epoch": 1.4258435298479792, + "grad_norm": 7.32421875, + "learning_rate": 8.574156470152022e-06, + "loss": 2.9982, + "mean_token_accuracy": 0.45857367593078135, + "step": 7691 + }, + { + "epoch": 1.4260289210233592, + "grad_norm": 5.078125, + "learning_rate": 8.573971078976642e-06, + "loss": 3.5242, + "mean_token_accuracy": 0.4244358331433782, + "step": 7692 + }, + { + "epoch": 1.4262143121987394, + "grad_norm": 5.90625, + "learning_rate": 8.573785687801261e-06, + "loss": 2.8703, + "mean_token_accuracy": 0.4478164322723908, + "step": 7693 + }, + { + "epoch": 1.4263997033741194, + "grad_norm": 5.7421875, + "learning_rate": 8.573600296625882e-06, + "loss": 3.0314, + "mean_token_accuracy": 0.4577229503983552, + "step": 7694 + }, + { + "epoch": 1.4265850945494996, + "grad_norm": 6.90234375, + "learning_rate": 8.5734149054505e-06, + "loss": 2.6894, + "mean_token_accuracy": 0.467238818286846, + "step": 7695 + }, + { + "epoch": 1.4267704857248795, + "grad_norm": 6.0, + "learning_rate": 8.573229514275121e-06, + "loss": 3.5127, + "mean_token_accuracy": 0.38895816800816235, + "step": 7696 + }, + { + "epoch": 1.4269558769002595, + "grad_norm": 7.671875, + "learning_rate": 8.573044123099742e-06, + "loss": 2.2727, + "mean_token_accuracy": 0.5161997002190707, + "step": 7697 + }, + { + "epoch": 1.4271412680756397, + "grad_norm": 7.265625, + "learning_rate": 8.57285873192436e-06, + "loss": 2.7687, + "mean_token_accuracy": 0.4763688760806916, + "step": 7698 + }, + { + "epoch": 1.4273266592510196, + "grad_norm": 6.859375, + "learning_rate": 8.57267334074898e-06, + "loss": 2.7991, + "mean_token_accuracy": 0.45061523112736945, + "step": 7699 + }, + { + "epoch": 1.4275120504263996, + "grad_norm": 5.984375, + "learning_rate": 8.572487949573601e-06, + "loss": 2.8377, + "mean_token_accuracy": 0.4703621581670362, + "step": 7700 + }, + { + "epoch": 1.4276974416017798, + "grad_norm": 6.53515625, + "learning_rate": 8.572302558398222e-06, + "loss": 2.8186, + "mean_token_accuracy": 0.48189030700241464, + "step": 7701 + }, + { + "epoch": 1.4278828327771598, + "grad_norm": 7.78515625, + "learning_rate": 8.57211716722284e-06, + "loss": 2.557, + "mean_token_accuracy": 0.5141654340653089, + "step": 7702 + }, + { + "epoch": 1.4280682239525397, + "grad_norm": 7.18359375, + "learning_rate": 8.571931776047461e-06, + "loss": 3.5275, + "mean_token_accuracy": 0.3997610196494955, + "step": 7703 + }, + { + "epoch": 1.42825361512792, + "grad_norm": 6.96875, + "learning_rate": 8.57174638487208e-06, + "loss": 2.9627, + "mean_token_accuracy": 0.46830685118742527, + "step": 7704 + }, + { + "epoch": 1.4284390063033, + "grad_norm": 6.1328125, + "learning_rate": 8.5715609936967e-06, + "loss": 2.5705, + "mean_token_accuracy": 0.5050747110234001, + "step": 7705 + }, + { + "epoch": 1.42862439747868, + "grad_norm": 5.99609375, + "learning_rate": 8.57137560252132e-06, + "loss": 2.95, + "mean_token_accuracy": 0.43867611246402477, + "step": 7706 + }, + { + "epoch": 1.42880978865406, + "grad_norm": 5.58203125, + "learning_rate": 8.571190211345941e-06, + "loss": 3.1608, + "mean_token_accuracy": 0.4391939665331134, + "step": 7707 + }, + { + "epoch": 1.4289951798294402, + "grad_norm": 5.609375, + "learning_rate": 8.57100482017056e-06, + "loss": 2.7587, + "mean_token_accuracy": 0.47407319108356183, + "step": 7708 + }, + { + "epoch": 1.4291805710048202, + "grad_norm": 5.58984375, + "learning_rate": 8.57081942899518e-06, + "loss": 2.3133, + "mean_token_accuracy": 0.5212418300653595, + "step": 7709 + }, + { + "epoch": 1.4293659621802002, + "grad_norm": 7.63671875, + "learning_rate": 8.570634037819801e-06, + "loss": 3.1539, + "mean_token_accuracy": 0.4487219943199748, + "step": 7710 + }, + { + "epoch": 1.4295513533555804, + "grad_norm": 6.83203125, + "learning_rate": 8.57044864664442e-06, + "loss": 2.5381, + "mean_token_accuracy": 0.4917435964113899, + "step": 7711 + }, + { + "epoch": 1.4297367445309603, + "grad_norm": 6.421875, + "learning_rate": 8.57026325546904e-06, + "loss": 2.8026, + "mean_token_accuracy": 0.4737026647966339, + "step": 7712 + }, + { + "epoch": 1.4299221357063403, + "grad_norm": 5.73828125, + "learning_rate": 8.57007786429366e-06, + "loss": 2.6224, + "mean_token_accuracy": 0.48362175525339923, + "step": 7713 + }, + { + "epoch": 1.4301075268817205, + "grad_norm": 5.42578125, + "learning_rate": 8.56989247311828e-06, + "loss": 3.0498, + "mean_token_accuracy": 0.4448973649730674, + "step": 7714 + }, + { + "epoch": 1.4302929180571005, + "grad_norm": 5.84765625, + "learning_rate": 8.5697070819429e-06, + "loss": 2.5673, + "mean_token_accuracy": 0.49084735754354886, + "step": 7715 + }, + { + "epoch": 1.4304783092324804, + "grad_norm": 5.60546875, + "learning_rate": 8.56952169076752e-06, + "loss": 2.2223, + "mean_token_accuracy": 0.5291132817455284, + "step": 7716 + }, + { + "epoch": 1.4306637004078606, + "grad_norm": 7.4609375, + "learning_rate": 8.56933629959214e-06, + "loss": 2.5602, + "mean_token_accuracy": 0.4908998988877654, + "step": 7717 + }, + { + "epoch": 1.4308490915832406, + "grad_norm": 6.6640625, + "learning_rate": 8.56915090841676e-06, + "loss": 2.8266, + "mean_token_accuracy": 0.47608958837772397, + "step": 7718 + }, + { + "epoch": 1.4310344827586206, + "grad_norm": 6.8515625, + "learning_rate": 8.56896551724138e-06, + "loss": 3.219, + "mean_token_accuracy": 0.45487195502810746, + "step": 7719 + }, + { + "epoch": 1.4312198739340007, + "grad_norm": 5.32421875, + "learning_rate": 8.568780126066e-06, + "loss": 2.5192, + "mean_token_accuracy": 0.49591309959130997, + "step": 7720 + }, + { + "epoch": 1.431405265109381, + "grad_norm": 5.36328125, + "learning_rate": 8.56859473489062e-06, + "loss": 3.1209, + "mean_token_accuracy": 0.42112526539278133, + "step": 7721 + }, + { + "epoch": 1.431590656284761, + "grad_norm": 8.34375, + "learning_rate": 8.568409343715239e-06, + "loss": 2.957, + "mean_token_accuracy": 0.433922145894668, + "step": 7722 + }, + { + "epoch": 1.4317760474601409, + "grad_norm": 6.45703125, + "learning_rate": 8.56822395253986e-06, + "loss": 2.8379, + "mean_token_accuracy": 0.44290260980267343, + "step": 7723 + }, + { + "epoch": 1.431961438635521, + "grad_norm": 6.296875, + "learning_rate": 8.56803856136448e-06, + "loss": 3.0794, + "mean_token_accuracy": 0.43896103896103894, + "step": 7724 + }, + { + "epoch": 1.432146829810901, + "grad_norm": 5.5390625, + "learning_rate": 8.5678531701891e-06, + "loss": 3.2006, + "mean_token_accuracy": 0.4332078411008698, + "step": 7725 + }, + { + "epoch": 1.432332220986281, + "grad_norm": 7.234375, + "learning_rate": 8.56766777901372e-06, + "loss": 2.4364, + "mean_token_accuracy": 0.5022441651705566, + "step": 7726 + }, + { + "epoch": 1.4325176121616612, + "grad_norm": 5.7421875, + "learning_rate": 8.56748238783834e-06, + "loss": 3.2995, + "mean_token_accuracy": 0.4223314606741573, + "step": 7727 + }, + { + "epoch": 1.4327030033370411, + "grad_norm": 5.9921875, + "learning_rate": 8.56729699666296e-06, + "loss": 2.9298, + "mean_token_accuracy": 0.4463683052090976, + "step": 7728 + }, + { + "epoch": 1.4328883945124211, + "grad_norm": 6.9453125, + "learning_rate": 8.567111605487579e-06, + "loss": 2.9435, + "mean_token_accuracy": 0.4688860435339309, + "step": 7729 + }, + { + "epoch": 1.4330737856878013, + "grad_norm": 6.765625, + "learning_rate": 8.566926214312199e-06, + "loss": 2.9749, + "mean_token_accuracy": 0.4762388818297332, + "step": 7730 + }, + { + "epoch": 1.4332591768631813, + "grad_norm": 7.3671875, + "learning_rate": 8.56674082313682e-06, + "loss": 2.5405, + "mean_token_accuracy": 0.48712953944622234, + "step": 7731 + }, + { + "epoch": 1.4334445680385612, + "grad_norm": 6.49609375, + "learning_rate": 8.56655543196144e-06, + "loss": 3.418, + "mean_token_accuracy": 0.41239316239316237, + "step": 7732 + }, + { + "epoch": 1.4336299592139414, + "grad_norm": 5.9375, + "learning_rate": 8.566370040786059e-06, + "loss": 2.8689, + "mean_token_accuracy": 0.4678377041068778, + "step": 7733 + }, + { + "epoch": 1.4338153503893214, + "grad_norm": 4.5546875, + "learning_rate": 8.56618464961068e-06, + "loss": 3.0953, + "mean_token_accuracy": 0.45127488648271047, + "step": 7734 + }, + { + "epoch": 1.4340007415647016, + "grad_norm": 7.2734375, + "learning_rate": 8.5659992584353e-06, + "loss": 3.0279, + "mean_token_accuracy": 0.44628237259816206, + "step": 7735 + }, + { + "epoch": 1.4341861327400816, + "grad_norm": 6.1640625, + "learning_rate": 8.565813867259919e-06, + "loss": 2.5604, + "mean_token_accuracy": 0.5005012531328321, + "step": 7736 + }, + { + "epoch": 1.4343715239154617, + "grad_norm": 5.5859375, + "learning_rate": 8.565628476084539e-06, + "loss": 2.8696, + "mean_token_accuracy": 0.5053003533568905, + "step": 7737 + }, + { + "epoch": 1.4345569150908417, + "grad_norm": 7.5078125, + "learning_rate": 8.565443084909158e-06, + "loss": 2.9236, + "mean_token_accuracy": 0.45195683266155245, + "step": 7738 + }, + { + "epoch": 1.4347423062662217, + "grad_norm": 6.63671875, + "learning_rate": 8.56525769373378e-06, + "loss": 2.8162, + "mean_token_accuracy": 0.45701575087170854, + "step": 7739 + }, + { + "epoch": 1.4349276974416019, + "grad_norm": 5.4609375, + "learning_rate": 8.565072302558399e-06, + "loss": 2.5311, + "mean_token_accuracy": 0.5023166459486947, + "step": 7740 + }, + { + "epoch": 1.4351130886169818, + "grad_norm": 7.71875, + "learning_rate": 8.56488691138302e-06, + "loss": 2.6726, + "mean_token_accuracy": 0.47072179732313574, + "step": 7741 + }, + { + "epoch": 1.4352984797923618, + "grad_norm": 10.0078125, + "learning_rate": 8.564701520207638e-06, + "loss": 3.5682, + "mean_token_accuracy": 0.44333649889205445, + "step": 7742 + }, + { + "epoch": 1.435483870967742, + "grad_norm": 5.19921875, + "learning_rate": 8.564516129032259e-06, + "loss": 2.9369, + "mean_token_accuracy": 0.45014245014245013, + "step": 7743 + }, + { + "epoch": 1.435669262143122, + "grad_norm": 6.1953125, + "learning_rate": 8.56433073785688e-06, + "loss": 2.7812, + "mean_token_accuracy": 0.47208317289179824, + "step": 7744 + }, + { + "epoch": 1.435854653318502, + "grad_norm": 7.1328125, + "learning_rate": 8.564145346681498e-06, + "loss": 2.9734, + "mean_token_accuracy": 0.45430031223389156, + "step": 7745 + }, + { + "epoch": 1.4360400444938821, + "grad_norm": 5.734375, + "learning_rate": 8.563959955506118e-06, + "loss": 3.4659, + "mean_token_accuracy": 0.4102279043913285, + "step": 7746 + }, + { + "epoch": 1.436225435669262, + "grad_norm": 5.16796875, + "learning_rate": 8.563774564330739e-06, + "loss": 2.5578, + "mean_token_accuracy": 0.49498327759197325, + "step": 7747 + }, + { + "epoch": 1.4364108268446423, + "grad_norm": 7.73046875, + "learning_rate": 8.56358917315536e-06, + "loss": 3.353, + "mean_token_accuracy": 0.4169259489732421, + "step": 7748 + }, + { + "epoch": 1.4365962180200222, + "grad_norm": 7.13671875, + "learning_rate": 8.563403781979978e-06, + "loss": 3.2716, + "mean_token_accuracy": 0.4422768572955639, + "step": 7749 + }, + { + "epoch": 1.4367816091954024, + "grad_norm": 8.984375, + "learning_rate": 8.563218390804599e-06, + "loss": 3.1125, + "mean_token_accuracy": 0.46705597179374175, + "step": 7750 + }, + { + "epoch": 1.4369670003707824, + "grad_norm": 6.40625, + "learning_rate": 8.563032999629218e-06, + "loss": 3.058, + "mean_token_accuracy": 0.4373939599592806, + "step": 7751 + }, + { + "epoch": 1.4371523915461624, + "grad_norm": 8.984375, + "learning_rate": 8.562847608453838e-06, + "loss": 2.3864, + "mean_token_accuracy": 0.5244825845532559, + "step": 7752 + }, + { + "epoch": 1.4373377827215426, + "grad_norm": 8.1796875, + "learning_rate": 8.562662217278459e-06, + "loss": 3.3342, + "mean_token_accuracy": 0.4258579207293909, + "step": 7753 + }, + { + "epoch": 1.4375231738969225, + "grad_norm": 7.51171875, + "learning_rate": 8.562476826103077e-06, + "loss": 2.9517, + "mean_token_accuracy": 0.49028884462151395, + "step": 7754 + }, + { + "epoch": 1.4377085650723025, + "grad_norm": 6.41796875, + "learning_rate": 8.562291434927698e-06, + "loss": 3.0015, + "mean_token_accuracy": 0.45316209799266133, + "step": 7755 + }, + { + "epoch": 1.4378939562476827, + "grad_norm": 8.859375, + "learning_rate": 8.562106043752318e-06, + "loss": 2.7533, + "mean_token_accuracy": 0.47435753563012006, + "step": 7756 + }, + { + "epoch": 1.4380793474230626, + "grad_norm": 7.296875, + "learning_rate": 8.561920652576939e-06, + "loss": 3.0408, + "mean_token_accuracy": 0.42903930131004364, + "step": 7757 + }, + { + "epoch": 1.4382647385984426, + "grad_norm": 6.01953125, + "learning_rate": 8.561735261401558e-06, + "loss": 2.8047, + "mean_token_accuracy": 0.48461074230537116, + "step": 7758 + }, + { + "epoch": 1.4384501297738228, + "grad_norm": 7.0546875, + "learning_rate": 8.561549870226178e-06, + "loss": 3.2329, + "mean_token_accuracy": 0.42741477272727274, + "step": 7759 + }, + { + "epoch": 1.4386355209492028, + "grad_norm": 9.0703125, + "learning_rate": 8.561364479050797e-06, + "loss": 2.7614, + "mean_token_accuracy": 0.46244945118428654, + "step": 7760 + }, + { + "epoch": 1.4388209121245827, + "grad_norm": 10.2109375, + "learning_rate": 8.561179087875417e-06, + "loss": 2.421, + "mean_token_accuracy": 0.5106582651830004, + "step": 7761 + }, + { + "epoch": 1.439006303299963, + "grad_norm": 6.0078125, + "learning_rate": 8.560993696700038e-06, + "loss": 2.9374, + "mean_token_accuracy": 0.45057893250494213, + "step": 7762 + }, + { + "epoch": 1.439191694475343, + "grad_norm": 5.875, + "learning_rate": 8.560808305524658e-06, + "loss": 2.5396, + "mean_token_accuracy": 0.49355634768302714, + "step": 7763 + }, + { + "epoch": 1.439377085650723, + "grad_norm": 9.5, + "learning_rate": 8.560622914349279e-06, + "loss": 2.3521, + "mean_token_accuracy": 0.5118898623279099, + "step": 7764 + }, + { + "epoch": 1.439562476826103, + "grad_norm": 9.0390625, + "learning_rate": 8.560437523173898e-06, + "loss": 2.5188, + "mean_token_accuracy": 0.5155672823218997, + "step": 7765 + }, + { + "epoch": 1.4397478680014832, + "grad_norm": 7.421875, + "learning_rate": 8.560252131998518e-06, + "loss": 2.4701, + "mean_token_accuracy": 0.47448036951501155, + "step": 7766 + }, + { + "epoch": 1.4399332591768632, + "grad_norm": 7.43359375, + "learning_rate": 8.560066740823137e-06, + "loss": 2.911, + "mean_token_accuracy": 0.45908538296081547, + "step": 7767 + }, + { + "epoch": 1.4401186503522432, + "grad_norm": 6.20703125, + "learning_rate": 8.559881349647757e-06, + "loss": 2.9295, + "mean_token_accuracy": 0.43610441346053047, + "step": 7768 + }, + { + "epoch": 1.4403040415276234, + "grad_norm": 8.3828125, + "learning_rate": 8.559695958472378e-06, + "loss": 3.2024, + "mean_token_accuracy": 0.4308510638297872, + "step": 7769 + }, + { + "epoch": 1.4404894327030033, + "grad_norm": 6.453125, + "learning_rate": 8.559510567296997e-06, + "loss": 2.9795, + "mean_token_accuracy": 0.4340358143501035, + "step": 7770 + }, + { + "epoch": 1.4406748238783833, + "grad_norm": 6.83203125, + "learning_rate": 8.559325176121617e-06, + "loss": 2.9526, + "mean_token_accuracy": 0.4266003166704366, + "step": 7771 + }, + { + "epoch": 1.4408602150537635, + "grad_norm": 6.71875, + "learning_rate": 8.559139784946238e-06, + "loss": 2.7802, + "mean_token_accuracy": 0.4729751149152005, + "step": 7772 + }, + { + "epoch": 1.4410456062291435, + "grad_norm": 7.84375, + "learning_rate": 8.558954393770858e-06, + "loss": 2.4028, + "mean_token_accuracy": 0.5047740292807129, + "step": 7773 + }, + { + "epoch": 1.4412309974045234, + "grad_norm": 5.828125, + "learning_rate": 8.558769002595477e-06, + "loss": 2.6473, + "mean_token_accuracy": 0.4792722547108512, + "step": 7774 + }, + { + "epoch": 1.4414163885799036, + "grad_norm": 6.734375, + "learning_rate": 8.558583611420097e-06, + "loss": 2.7631, + "mean_token_accuracy": 0.4400798934753662, + "step": 7775 + }, + { + "epoch": 1.4416017797552836, + "grad_norm": 7.5, + "learning_rate": 8.558398220244716e-06, + "loss": 2.8502, + "mean_token_accuracy": 0.43957139297283826, + "step": 7776 + }, + { + "epoch": 1.4417871709306638, + "grad_norm": 6.4453125, + "learning_rate": 8.558212829069337e-06, + "loss": 2.6056, + "mean_token_accuracy": 0.48202653799758743, + "step": 7777 + }, + { + "epoch": 1.4419725621060437, + "grad_norm": 6.93359375, + "learning_rate": 8.558027437893957e-06, + "loss": 2.777, + "mean_token_accuracy": 0.4604223344992875, + "step": 7778 + }, + { + "epoch": 1.442157953281424, + "grad_norm": 5.35546875, + "learning_rate": 8.557842046718578e-06, + "loss": 2.5362, + "mean_token_accuracy": 0.47174610195731503, + "step": 7779 + }, + { + "epoch": 1.442343344456804, + "grad_norm": 6.5703125, + "learning_rate": 8.557656655543197e-06, + "loss": 2.3456, + "mean_token_accuracy": 0.4970290492957746, + "step": 7780 + }, + { + "epoch": 1.4425287356321839, + "grad_norm": 5.63671875, + "learning_rate": 8.557471264367817e-06, + "loss": 2.7937, + "mean_token_accuracy": 0.45845697329376855, + "step": 7781 + }, + { + "epoch": 1.442714126807564, + "grad_norm": 6.2734375, + "learning_rate": 8.557285873192438e-06, + "loss": 2.7995, + "mean_token_accuracy": 0.4560407569141194, + "step": 7782 + }, + { + "epoch": 1.442899517982944, + "grad_norm": 6.40625, + "learning_rate": 8.557100482017056e-06, + "loss": 2.1102, + "mean_token_accuracy": 0.5638780462074059, + "step": 7783 + }, + { + "epoch": 1.443084909158324, + "grad_norm": 5.48046875, + "learning_rate": 8.556915090841677e-06, + "loss": 3.5781, + "mean_token_accuracy": 0.40634809905824903, + "step": 7784 + }, + { + "epoch": 1.4432703003337042, + "grad_norm": 8.7421875, + "learning_rate": 8.556729699666296e-06, + "loss": 2.9379, + "mean_token_accuracy": 0.4715755278830536, + "step": 7785 + }, + { + "epoch": 1.4434556915090841, + "grad_norm": 9.59375, + "learning_rate": 8.556544308490916e-06, + "loss": 3.2369, + "mean_token_accuracy": 0.43086037430995017, + "step": 7786 + }, + { + "epoch": 1.4436410826844641, + "grad_norm": 7.2734375, + "learning_rate": 8.556358917315537e-06, + "loss": 2.5112, + "mean_token_accuracy": 0.5290492957746479, + "step": 7787 + }, + { + "epoch": 1.4438264738598443, + "grad_norm": 5.78125, + "learning_rate": 8.556173526140157e-06, + "loss": 3.2346, + "mean_token_accuracy": 0.4644572526416907, + "step": 7788 + }, + { + "epoch": 1.4440118650352243, + "grad_norm": 8.375, + "learning_rate": 8.555988134964776e-06, + "loss": 2.1068, + "mean_token_accuracy": 0.5578516243135064, + "step": 7789 + }, + { + "epoch": 1.4441972562106042, + "grad_norm": 6.44921875, + "learning_rate": 8.555802743789396e-06, + "loss": 2.8469, + "mean_token_accuracy": 0.47569359194696786, + "step": 7790 + }, + { + "epoch": 1.4443826473859844, + "grad_norm": 6.37890625, + "learning_rate": 8.555617352614017e-06, + "loss": 3.0, + "mean_token_accuracy": 0.45267631518492996, + "step": 7791 + }, + { + "epoch": 1.4445680385613644, + "grad_norm": 13.1484375, + "learning_rate": 8.555431961438636e-06, + "loss": 2.745, + "mean_token_accuracy": 0.4603275898744948, + "step": 7792 + }, + { + "epoch": 1.4447534297367446, + "grad_norm": 6.4296875, + "learning_rate": 8.555246570263256e-06, + "loss": 2.9126, + "mean_token_accuracy": 0.46118276953029935, + "step": 7793 + }, + { + "epoch": 1.4449388209121246, + "grad_norm": 5.90625, + "learning_rate": 8.555061179087875e-06, + "loss": 2.4989, + "mean_token_accuracy": 0.5202341824157765, + "step": 7794 + }, + { + "epoch": 1.4451242120875047, + "grad_norm": 5.75, + "learning_rate": 8.554875787912495e-06, + "loss": 3.1943, + "mean_token_accuracy": 0.42486805339956535, + "step": 7795 + }, + { + "epoch": 1.4453096032628847, + "grad_norm": 6.578125, + "learning_rate": 8.554690396737116e-06, + "loss": 2.7526, + "mean_token_accuracy": 0.47821376986037556, + "step": 7796 + }, + { + "epoch": 1.4454949944382647, + "grad_norm": 5.5625, + "learning_rate": 8.554505005561736e-06, + "loss": 2.9519, + "mean_token_accuracy": 0.4281893554426317, + "step": 7797 + }, + { + "epoch": 1.4456803856136449, + "grad_norm": 7.45703125, + "learning_rate": 8.554319614386355e-06, + "loss": 3.0369, + "mean_token_accuracy": 0.44507493088898586, + "step": 7798 + }, + { + "epoch": 1.4458657767890248, + "grad_norm": 5.91796875, + "learning_rate": 8.554134223210976e-06, + "loss": 3.1418, + "mean_token_accuracy": 0.42290061445430605, + "step": 7799 + }, + { + "epoch": 1.4460511679644048, + "grad_norm": 9.234375, + "learning_rate": 8.553948832035596e-06, + "loss": 2.6364, + "mean_token_accuracy": 0.470231822971549, + "step": 7800 + }, + { + "epoch": 1.446236559139785, + "grad_norm": 6.26953125, + "learning_rate": 8.553763440860215e-06, + "loss": 2.8642, + "mean_token_accuracy": 0.43511791662009613, + "step": 7801 + }, + { + "epoch": 1.446421950315165, + "grad_norm": 6.6796875, + "learning_rate": 8.553578049684835e-06, + "loss": 2.8581, + "mean_token_accuracy": 0.45207793670458385, + "step": 7802 + }, + { + "epoch": 1.446607341490545, + "grad_norm": 6.58203125, + "learning_rate": 8.553392658509454e-06, + "loss": 2.9414, + "mean_token_accuracy": 0.45637514264965245, + "step": 7803 + }, + { + "epoch": 1.4467927326659251, + "grad_norm": 6.9453125, + "learning_rate": 8.553207267334076e-06, + "loss": 3.1133, + "mean_token_accuracy": 0.4517704517704518, + "step": 7804 + }, + { + "epoch": 1.446978123841305, + "grad_norm": 5.953125, + "learning_rate": 8.553021876158695e-06, + "loss": 2.506, + "mean_token_accuracy": 0.4781306171360096, + "step": 7805 + }, + { + "epoch": 1.4471635150166853, + "grad_norm": 6.53515625, + "learning_rate": 8.552836484983316e-06, + "loss": 2.5493, + "mean_token_accuracy": 0.4895051520162829, + "step": 7806 + }, + { + "epoch": 1.4473489061920652, + "grad_norm": 5.77734375, + "learning_rate": 8.552651093807936e-06, + "loss": 2.8263, + "mean_token_accuracy": 0.44857142857142857, + "step": 7807 + }, + { + "epoch": 1.4475342973674454, + "grad_norm": 6.0859375, + "learning_rate": 8.552465702632555e-06, + "loss": 2.7532, + "mean_token_accuracy": 0.4802801888229024, + "step": 7808 + }, + { + "epoch": 1.4477196885428254, + "grad_norm": 6.1484375, + "learning_rate": 8.552280311457176e-06, + "loss": 3.4146, + "mean_token_accuracy": 0.42637285764253235, + "step": 7809 + }, + { + "epoch": 1.4479050797182054, + "grad_norm": 8.8359375, + "learning_rate": 8.552094920281794e-06, + "loss": 2.5432, + "mean_token_accuracy": 0.4844570044408559, + "step": 7810 + }, + { + "epoch": 1.4480904708935856, + "grad_norm": 6.5625, + "learning_rate": 8.551909529106415e-06, + "loss": 2.894, + "mean_token_accuracy": 0.46410927056088347, + "step": 7811 + }, + { + "epoch": 1.4482758620689655, + "grad_norm": 6.43359375, + "learning_rate": 8.551724137931035e-06, + "loss": 2.7324, + "mean_token_accuracy": 0.47217235188509876, + "step": 7812 + }, + { + "epoch": 1.4484612532443455, + "grad_norm": 11.5, + "learning_rate": 8.551538746755656e-06, + "loss": 3.1898, + "mean_token_accuracy": 0.42460796139927626, + "step": 7813 + }, + { + "epoch": 1.4486466444197257, + "grad_norm": 6.8828125, + "learning_rate": 8.551353355580275e-06, + "loss": 2.7847, + "mean_token_accuracy": 0.4439958127710483, + "step": 7814 + }, + { + "epoch": 1.4488320355951056, + "grad_norm": 8.4609375, + "learning_rate": 8.551167964404895e-06, + "loss": 2.2698, + "mean_token_accuracy": 0.5164261168384879, + "step": 7815 + }, + { + "epoch": 1.4490174267704856, + "grad_norm": 10.09375, + "learning_rate": 8.550982573229516e-06, + "loss": 2.4291, + "mean_token_accuracy": 0.5037783375314862, + "step": 7816 + }, + { + "epoch": 1.4492028179458658, + "grad_norm": 6.59375, + "learning_rate": 8.550797182054134e-06, + "loss": 2.6747, + "mean_token_accuracy": 0.5052889576883385, + "step": 7817 + }, + { + "epoch": 1.4493882091212458, + "grad_norm": 5.33984375, + "learning_rate": 8.550611790878755e-06, + "loss": 2.9248, + "mean_token_accuracy": 0.44591346153846156, + "step": 7818 + }, + { + "epoch": 1.449573600296626, + "grad_norm": 6.86328125, + "learning_rate": 8.550426399703374e-06, + "loss": 3.2043, + "mean_token_accuracy": 0.4389318341531975, + "step": 7819 + }, + { + "epoch": 1.449758991472006, + "grad_norm": 10.6875, + "learning_rate": 8.550241008527996e-06, + "loss": 2.5162, + "mean_token_accuracy": 0.47845953002610964, + "step": 7820 + }, + { + "epoch": 1.4499443826473861, + "grad_norm": 6.98828125, + "learning_rate": 8.550055617352615e-06, + "loss": 2.5938, + "mean_token_accuracy": 0.5034168564920274, + "step": 7821 + }, + { + "epoch": 1.450129773822766, + "grad_norm": 11.21875, + "learning_rate": 8.549870226177235e-06, + "loss": 3.2477, + "mean_token_accuracy": 0.43099809026032765, + "step": 7822 + }, + { + "epoch": 1.450315164998146, + "grad_norm": 8.1796875, + "learning_rate": 8.549684835001854e-06, + "loss": 2.6303, + "mean_token_accuracy": 0.49120549120549123, + "step": 7823 + }, + { + "epoch": 1.4505005561735262, + "grad_norm": 7.140625, + "learning_rate": 8.549499443826474e-06, + "loss": 3.3362, + "mean_token_accuracy": 0.40784313725490196, + "step": 7824 + }, + { + "epoch": 1.4506859473489062, + "grad_norm": 9.59375, + "learning_rate": 8.549314052651095e-06, + "loss": 3.2848, + "mean_token_accuracy": 0.40755444646098005, + "step": 7825 + }, + { + "epoch": 1.4508713385242862, + "grad_norm": 7.171875, + "learning_rate": 8.549128661475714e-06, + "loss": 3.2753, + "mean_token_accuracy": 0.4197776012708499, + "step": 7826 + }, + { + "epoch": 1.4510567296996664, + "grad_norm": 8.0859375, + "learning_rate": 8.548943270300334e-06, + "loss": 2.6093, + "mean_token_accuracy": 0.4801697998787144, + "step": 7827 + }, + { + "epoch": 1.4512421208750463, + "grad_norm": 11.1796875, + "learning_rate": 8.548757879124955e-06, + "loss": 3.0634, + "mean_token_accuracy": 0.4393236978456504, + "step": 7828 + }, + { + "epoch": 1.4514275120504263, + "grad_norm": 7.0, + "learning_rate": 8.548572487949575e-06, + "loss": 2.8975, + "mean_token_accuracy": 0.46254571192638905, + "step": 7829 + }, + { + "epoch": 1.4516129032258065, + "grad_norm": 6.4140625, + "learning_rate": 8.548387096774194e-06, + "loss": 1.963, + "mean_token_accuracy": 0.5665490472829923, + "step": 7830 + }, + { + "epoch": 1.4517982944011865, + "grad_norm": 10.125, + "learning_rate": 8.548201705598814e-06, + "loss": 3.4484, + "mean_token_accuracy": 0.40723367485495615, + "step": 7831 + }, + { + "epoch": 1.4519836855765664, + "grad_norm": 13.75, + "learning_rate": 8.548016314423433e-06, + "loss": 2.3999, + "mean_token_accuracy": 0.49074759437453735, + "step": 7832 + }, + { + "epoch": 1.4521690767519466, + "grad_norm": 7.7109375, + "learning_rate": 8.547830923248054e-06, + "loss": 3.0958, + "mean_token_accuracy": 0.43508137432188065, + "step": 7833 + }, + { + "epoch": 1.4523544679273266, + "grad_norm": 5.40234375, + "learning_rate": 8.547645532072674e-06, + "loss": 2.7748, + "mean_token_accuracy": 0.453654299540886, + "step": 7834 + }, + { + "epoch": 1.4525398591027068, + "grad_norm": 6.140625, + "learning_rate": 8.547460140897293e-06, + "loss": 3.1231, + "mean_token_accuracy": 0.43324051003957204, + "step": 7835 + }, + { + "epoch": 1.4527252502780867, + "grad_norm": 10.4296875, + "learning_rate": 8.547274749721914e-06, + "loss": 3.1024, + "mean_token_accuracy": 0.4361518157822882, + "step": 7836 + }, + { + "epoch": 1.452910641453467, + "grad_norm": 6.3515625, + "learning_rate": 8.547089358546534e-06, + "loss": 3.0712, + "mean_token_accuracy": 0.4516714728185411, + "step": 7837 + }, + { + "epoch": 1.453096032628847, + "grad_norm": 6.04296875, + "learning_rate": 8.546903967371155e-06, + "loss": 2.9177, + "mean_token_accuracy": 0.45163277880468267, + "step": 7838 + }, + { + "epoch": 1.4532814238042269, + "grad_norm": 10.8671875, + "learning_rate": 8.546718576195773e-06, + "loss": 3.0304, + "mean_token_accuracy": 0.4455910902217227, + "step": 7839 + }, + { + "epoch": 1.453466814979607, + "grad_norm": 10.25, + "learning_rate": 8.546533185020394e-06, + "loss": 3.0305, + "mean_token_accuracy": 0.45133891706971807, + "step": 7840 + }, + { + "epoch": 1.453652206154987, + "grad_norm": 11.4375, + "learning_rate": 8.546347793845013e-06, + "loss": 2.8, + "mean_token_accuracy": 0.4720753256803818, + "step": 7841 + }, + { + "epoch": 1.453837597330367, + "grad_norm": 6.70703125, + "learning_rate": 8.546162402669633e-06, + "loss": 3.2404, + "mean_token_accuracy": 0.42706708268330734, + "step": 7842 + }, + { + "epoch": 1.4540229885057472, + "grad_norm": 7.109375, + "learning_rate": 8.545977011494254e-06, + "loss": 2.9074, + "mean_token_accuracy": 0.47761847433512095, + "step": 7843 + }, + { + "epoch": 1.4542083796811272, + "grad_norm": 10.2421875, + "learning_rate": 8.545791620318874e-06, + "loss": 3.4967, + "mean_token_accuracy": 0.3991161231331911, + "step": 7844 + }, + { + "epoch": 1.4543937708565071, + "grad_norm": 6.2421875, + "learning_rate": 8.545606229143495e-06, + "loss": 3.4977, + "mean_token_accuracy": 0.42142943817497874, + "step": 7845 + }, + { + "epoch": 1.4545791620318873, + "grad_norm": 5.80859375, + "learning_rate": 8.545420837968113e-06, + "loss": 3.1832, + "mean_token_accuracy": 0.42845287492590395, + "step": 7846 + }, + { + "epoch": 1.4547645532072673, + "grad_norm": 10.1484375, + "learning_rate": 8.545235446792734e-06, + "loss": 2.7948, + "mean_token_accuracy": 0.4635144350988861, + "step": 7847 + }, + { + "epoch": 1.4549499443826475, + "grad_norm": 10.1953125, + "learning_rate": 8.545050055617353e-06, + "loss": 2.7546, + "mean_token_accuracy": 0.4820136553551411, + "step": 7848 + }, + { + "epoch": 1.4551353355580274, + "grad_norm": 6.49609375, + "learning_rate": 8.544864664441973e-06, + "loss": 2.6429, + "mean_token_accuracy": 0.4621101364522417, + "step": 7849 + }, + { + "epoch": 1.4553207267334076, + "grad_norm": 6.90625, + "learning_rate": 8.544679273266594e-06, + "loss": 2.61, + "mean_token_accuracy": 0.480463347164592, + "step": 7850 + }, + { + "epoch": 1.4555061179087876, + "grad_norm": 7.0390625, + "learning_rate": 8.544493882091212e-06, + "loss": 2.55, + "mean_token_accuracy": 0.47000833101916134, + "step": 7851 + }, + { + "epoch": 1.4556915090841676, + "grad_norm": 6.80859375, + "learning_rate": 8.544308490915833e-06, + "loss": 3.3924, + "mean_token_accuracy": 0.4094776803246498, + "step": 7852 + }, + { + "epoch": 1.4558769002595477, + "grad_norm": 6.0859375, + "learning_rate": 8.544123099740453e-06, + "loss": 2.6124, + "mean_token_accuracy": 0.4919107391910739, + "step": 7853 + }, + { + "epoch": 1.4560622914349277, + "grad_norm": 6.15234375, + "learning_rate": 8.543937708565074e-06, + "loss": 2.5512, + "mean_token_accuracy": 0.48447946513849094, + "step": 7854 + }, + { + "epoch": 1.4562476826103077, + "grad_norm": 7.20703125, + "learning_rate": 8.543752317389693e-06, + "loss": 3.0899, + "mean_token_accuracy": 0.45342533267619517, + "step": 7855 + }, + { + "epoch": 1.4564330737856879, + "grad_norm": 5.94140625, + "learning_rate": 8.543566926214313e-06, + "loss": 2.874, + "mean_token_accuracy": 0.4457006843201428, + "step": 7856 + }, + { + "epoch": 1.4566184649610678, + "grad_norm": 9.5234375, + "learning_rate": 8.543381535038932e-06, + "loss": 2.9142, + "mean_token_accuracy": 0.44180091752907286, + "step": 7857 + }, + { + "epoch": 1.4568038561364478, + "grad_norm": 9.0859375, + "learning_rate": 8.543196143863553e-06, + "loss": 3.3034, + "mean_token_accuracy": 0.4189010460502585, + "step": 7858 + }, + { + "epoch": 1.456989247311828, + "grad_norm": 6.50390625, + "learning_rate": 8.543010752688173e-06, + "loss": 2.5361, + "mean_token_accuracy": 0.4909827015090173, + "step": 7859 + }, + { + "epoch": 1.457174638487208, + "grad_norm": 6.8359375, + "learning_rate": 8.542825361512793e-06, + "loss": 2.8885, + "mean_token_accuracy": 0.47942569567791593, + "step": 7860 + }, + { + "epoch": 1.457360029662588, + "grad_norm": 7.55859375, + "learning_rate": 8.542639970337412e-06, + "loss": 2.8284, + "mean_token_accuracy": 0.46159981768459435, + "step": 7861 + }, + { + "epoch": 1.4575454208379681, + "grad_norm": 9.71875, + "learning_rate": 8.542454579162033e-06, + "loss": 2.7749, + "mean_token_accuracy": 0.4645190023752969, + "step": 7862 + }, + { + "epoch": 1.457730812013348, + "grad_norm": 9.7734375, + "learning_rate": 8.542269187986653e-06, + "loss": 2.5592, + "mean_token_accuracy": 0.5148818208345491, + "step": 7863 + }, + { + "epoch": 1.4579162031887283, + "grad_norm": 5.96875, + "learning_rate": 8.542083796811272e-06, + "loss": 3.2341, + "mean_token_accuracy": 0.42153306026916326, + "step": 7864 + }, + { + "epoch": 1.4581015943641082, + "grad_norm": 7.265625, + "learning_rate": 8.541898405635893e-06, + "loss": 2.715, + "mean_token_accuracy": 0.5018199082133249, + "step": 7865 + }, + { + "epoch": 1.4582869855394884, + "grad_norm": 8.8515625, + "learning_rate": 8.541713014460511e-06, + "loss": 3.0273, + "mean_token_accuracy": 0.44452066540902074, + "step": 7866 + }, + { + "epoch": 1.4584723767148684, + "grad_norm": 6.390625, + "learning_rate": 8.541527623285132e-06, + "loss": 2.8537, + "mean_token_accuracy": 0.4850897510133179, + "step": 7867 + }, + { + "epoch": 1.4586577678902484, + "grad_norm": 8.328125, + "learning_rate": 8.541342232109752e-06, + "loss": 2.9865, + "mean_token_accuracy": 0.4617848303974378, + "step": 7868 + }, + { + "epoch": 1.4588431590656286, + "grad_norm": 7.17578125, + "learning_rate": 8.541156840934373e-06, + "loss": 2.9651, + "mean_token_accuracy": 0.46307053941908716, + "step": 7869 + }, + { + "epoch": 1.4590285502410085, + "grad_norm": 8.5390625, + "learning_rate": 8.540971449758992e-06, + "loss": 2.9879, + "mean_token_accuracy": 0.4407940724171676, + "step": 7870 + }, + { + "epoch": 1.4592139414163885, + "grad_norm": 6.4140625, + "learning_rate": 8.540786058583612e-06, + "loss": 2.3967, + "mean_token_accuracy": 0.5297192172116203, + "step": 7871 + }, + { + "epoch": 1.4593993325917687, + "grad_norm": 6.77734375, + "learning_rate": 8.540600667408233e-06, + "loss": 2.9023, + "mean_token_accuracy": 0.45032790867136263, + "step": 7872 + }, + { + "epoch": 1.4595847237671487, + "grad_norm": 6.421875, + "learning_rate": 8.540415276232851e-06, + "loss": 2.7547, + "mean_token_accuracy": 0.4853306086702671, + "step": 7873 + }, + { + "epoch": 1.4597701149425286, + "grad_norm": 5.89453125, + "learning_rate": 8.540229885057472e-06, + "loss": 2.7977, + "mean_token_accuracy": 0.48594132029339854, + "step": 7874 + }, + { + "epoch": 1.4599555061179088, + "grad_norm": 6.6015625, + "learning_rate": 8.54004449388209e-06, + "loss": 2.6217, + "mean_token_accuracy": 0.4970807875084861, + "step": 7875 + }, + { + "epoch": 1.4601408972932888, + "grad_norm": 6.421875, + "learning_rate": 8.539859102706713e-06, + "loss": 3.7582, + "mean_token_accuracy": 0.3912672450576021, + "step": 7876 + }, + { + "epoch": 1.460326288468669, + "grad_norm": 7.39453125, + "learning_rate": 8.539673711531332e-06, + "loss": 3.7371, + "mean_token_accuracy": 0.3766009852216749, + "step": 7877 + }, + { + "epoch": 1.460511679644049, + "grad_norm": 5.34765625, + "learning_rate": 8.539488320355952e-06, + "loss": 3.2332, + "mean_token_accuracy": 0.43877917414721723, + "step": 7878 + }, + { + "epoch": 1.4606970708194291, + "grad_norm": 7.07421875, + "learning_rate": 8.539302929180571e-06, + "loss": 3.1175, + "mean_token_accuracy": 0.43656716417910446, + "step": 7879 + }, + { + "epoch": 1.460882461994809, + "grad_norm": 7.71875, + "learning_rate": 8.539117538005191e-06, + "loss": 2.9536, + "mean_token_accuracy": 0.4444853392221813, + "step": 7880 + }, + { + "epoch": 1.461067853170189, + "grad_norm": 13.203125, + "learning_rate": 8.538932146829812e-06, + "loss": 2.9463, + "mean_token_accuracy": 0.4631434282858571, + "step": 7881 + }, + { + "epoch": 1.4612532443455692, + "grad_norm": 5.9765625, + "learning_rate": 8.53874675565443e-06, + "loss": 2.639, + "mean_token_accuracy": 0.48976548732547104, + "step": 7882 + }, + { + "epoch": 1.4614386355209492, + "grad_norm": 7.6953125, + "learning_rate": 8.538561364479051e-06, + "loss": 2.5873, + "mean_token_accuracy": 0.507802711014922, + "step": 7883 + }, + { + "epoch": 1.4616240266963292, + "grad_norm": 8.25, + "learning_rate": 8.538375973303672e-06, + "loss": 2.9473, + "mean_token_accuracy": 0.44396664060463903, + "step": 7884 + }, + { + "epoch": 1.4618094178717094, + "grad_norm": 6.0390625, + "learning_rate": 8.538190582128292e-06, + "loss": 3.0263, + "mean_token_accuracy": 0.4480888771326544, + "step": 7885 + }, + { + "epoch": 1.4619948090470893, + "grad_norm": 6.04296875, + "learning_rate": 8.538005190952911e-06, + "loss": 2.1345, + "mean_token_accuracy": 0.5498054474708172, + "step": 7886 + }, + { + "epoch": 1.4621802002224693, + "grad_norm": 6.19921875, + "learning_rate": 8.537819799777532e-06, + "loss": 2.9138, + "mean_token_accuracy": 0.4485056976994195, + "step": 7887 + }, + { + "epoch": 1.4623655913978495, + "grad_norm": 6.90234375, + "learning_rate": 8.537634408602152e-06, + "loss": 3.6099, + "mean_token_accuracy": 0.39652777777777776, + "step": 7888 + }, + { + "epoch": 1.4625509825732295, + "grad_norm": 6.01953125, + "learning_rate": 8.53744901742677e-06, + "loss": 3.2111, + "mean_token_accuracy": 0.44321608040201005, + "step": 7889 + }, + { + "epoch": 1.4627363737486094, + "grad_norm": 7.125, + "learning_rate": 8.537263626251391e-06, + "loss": 2.608, + "mean_token_accuracy": 0.4647887323943662, + "step": 7890 + }, + { + "epoch": 1.4629217649239896, + "grad_norm": 6.80078125, + "learning_rate": 8.53707823507601e-06, + "loss": 3.7678, + "mean_token_accuracy": 0.38359598853868193, + "step": 7891 + }, + { + "epoch": 1.4631071560993696, + "grad_norm": 8.734375, + "learning_rate": 8.536892843900632e-06, + "loss": 3.1978, + "mean_token_accuracy": 0.43611446997178555, + "step": 7892 + }, + { + "epoch": 1.4632925472747498, + "grad_norm": 7.66015625, + "learning_rate": 8.536707452725251e-06, + "loss": 3.1695, + "mean_token_accuracy": 0.4296479707180536, + "step": 7893 + }, + { + "epoch": 1.4634779384501297, + "grad_norm": 6.78515625, + "learning_rate": 8.536522061549872e-06, + "loss": 2.4814, + "mean_token_accuracy": 0.502254850550603, + "step": 7894 + }, + { + "epoch": 1.46366332962551, + "grad_norm": 9.015625, + "learning_rate": 8.53633667037449e-06, + "loss": 2.5606, + "mean_token_accuracy": 0.5031363088057901, + "step": 7895 + }, + { + "epoch": 1.46384872080089, + "grad_norm": 9.3515625, + "learning_rate": 8.536151279199111e-06, + "loss": 3.4927, + "mean_token_accuracy": 0.39601040763226364, + "step": 7896 + }, + { + "epoch": 1.4640341119762699, + "grad_norm": 6.19921875, + "learning_rate": 8.535965888023731e-06, + "loss": 3.4803, + "mean_token_accuracy": 0.4235897435897436, + "step": 7897 + }, + { + "epoch": 1.46421950315165, + "grad_norm": 7.61328125, + "learning_rate": 8.53578049684835e-06, + "loss": 2.7534, + "mean_token_accuracy": 0.47451330063902514, + "step": 7898 + }, + { + "epoch": 1.46440489432703, + "grad_norm": 11.1171875, + "learning_rate": 8.53559510567297e-06, + "loss": 3.651, + "mean_token_accuracy": 0.4040595399188092, + "step": 7899 + }, + { + "epoch": 1.46459028550241, + "grad_norm": 7.64453125, + "learning_rate": 8.535409714497591e-06, + "loss": 3.2656, + "mean_token_accuracy": 0.46500777604976673, + "step": 7900 + }, + { + "epoch": 1.4647756766777902, + "grad_norm": 6.70703125, + "learning_rate": 8.535224323322212e-06, + "loss": 2.6647, + "mean_token_accuracy": 0.4787762293769021, + "step": 7901 + }, + { + "epoch": 1.4649610678531702, + "grad_norm": 9.21875, + "learning_rate": 8.53503893214683e-06, + "loss": 2.7427, + "mean_token_accuracy": 0.4627295149355988, + "step": 7902 + }, + { + "epoch": 1.4651464590285501, + "grad_norm": 5.734375, + "learning_rate": 8.534853540971451e-06, + "loss": 2.6982, + "mean_token_accuracy": 0.4755515417365501, + "step": 7903 + }, + { + "epoch": 1.4653318502039303, + "grad_norm": 5.375, + "learning_rate": 8.53466814979607e-06, + "loss": 2.716, + "mean_token_accuracy": 0.46497622820919177, + "step": 7904 + }, + { + "epoch": 1.4655172413793103, + "grad_norm": 7.15234375, + "learning_rate": 8.53448275862069e-06, + "loss": 3.1299, + "mean_token_accuracy": 0.42488561260803254, + "step": 7905 + }, + { + "epoch": 1.4657026325546905, + "grad_norm": 6.890625, + "learning_rate": 8.53429736744531e-06, + "loss": 3.622, + "mean_token_accuracy": 0.3879200340280732, + "step": 7906 + }, + { + "epoch": 1.4658880237300704, + "grad_norm": 7.4453125, + "learning_rate": 8.53411197626993e-06, + "loss": 2.9202, + "mean_token_accuracy": 0.4488838153221715, + "step": 7907 + }, + { + "epoch": 1.4660734149054506, + "grad_norm": 7.8671875, + "learning_rate": 8.53392658509455e-06, + "loss": 2.7822, + "mean_token_accuracy": 0.49189862899875364, + "step": 7908 + }, + { + "epoch": 1.4662588060808306, + "grad_norm": 6.90234375, + "learning_rate": 8.53374119391917e-06, + "loss": 3.0192, + "mean_token_accuracy": 0.4500153798831129, + "step": 7909 + }, + { + "epoch": 1.4664441972562106, + "grad_norm": 23.671875, + "learning_rate": 8.533555802743791e-06, + "loss": 3.4542, + "mean_token_accuracy": 0.44963240036115054, + "step": 7910 + }, + { + "epoch": 1.4666295884315907, + "grad_norm": 5.62890625, + "learning_rate": 8.53337041156841e-06, + "loss": 2.6252, + "mean_token_accuracy": 0.4949092518813634, + "step": 7911 + }, + { + "epoch": 1.4668149796069707, + "grad_norm": 5.953125, + "learning_rate": 8.53318502039303e-06, + "loss": 2.5613, + "mean_token_accuracy": 0.4935454818372861, + "step": 7912 + }, + { + "epoch": 1.4670003707823507, + "grad_norm": 5.3359375, + "learning_rate": 8.532999629217649e-06, + "loss": 2.805, + "mean_token_accuracy": 0.4658688690375864, + "step": 7913 + }, + { + "epoch": 1.4671857619577309, + "grad_norm": 5.75390625, + "learning_rate": 8.53281423804227e-06, + "loss": 3.1783, + "mean_token_accuracy": 0.4179614667495339, + "step": 7914 + }, + { + "epoch": 1.4673711531331108, + "grad_norm": 6.1484375, + "learning_rate": 8.53262884686689e-06, + "loss": 3.2537, + "mean_token_accuracy": 0.43605616789974333, + "step": 7915 + }, + { + "epoch": 1.4675565443084908, + "grad_norm": 5.0390625, + "learning_rate": 8.53244345569151e-06, + "loss": 2.7442, + "mean_token_accuracy": 0.4841582712804789, + "step": 7916 + }, + { + "epoch": 1.467741935483871, + "grad_norm": 5.14453125, + "learning_rate": 8.53225806451613e-06, + "loss": 2.4418, + "mean_token_accuracy": 0.5245980901728514, + "step": 7917 + }, + { + "epoch": 1.467927326659251, + "grad_norm": 5.83984375, + "learning_rate": 8.53207267334075e-06, + "loss": 2.8591, + "mean_token_accuracy": 0.4605495741603728, + "step": 7918 + }, + { + "epoch": 1.4681127178346312, + "grad_norm": 7.98828125, + "learning_rate": 8.53188728216537e-06, + "loss": 2.7281, + "mean_token_accuracy": 0.4822843474360891, + "step": 7919 + }, + { + "epoch": 1.4682981090100111, + "grad_norm": 5.5, + "learning_rate": 8.531701890989989e-06, + "loss": 2.8218, + "mean_token_accuracy": 0.45784794604537093, + "step": 7920 + }, + { + "epoch": 1.4684835001853913, + "grad_norm": 6.80859375, + "learning_rate": 8.53151649981461e-06, + "loss": 2.4831, + "mean_token_accuracy": 0.49508742714404663, + "step": 7921 + }, + { + "epoch": 1.4686688913607713, + "grad_norm": 6.8671875, + "learning_rate": 8.531331108639228e-06, + "loss": 3.4171, + "mean_token_accuracy": 0.4472391903221943, + "step": 7922 + }, + { + "epoch": 1.4688542825361512, + "grad_norm": 7.1171875, + "learning_rate": 8.531145717463849e-06, + "loss": 2.6813, + "mean_token_accuracy": 0.474243399871217, + "step": 7923 + }, + { + "epoch": 1.4690396737115314, + "grad_norm": 5.515625, + "learning_rate": 8.53096032628847e-06, + "loss": 2.7112, + "mean_token_accuracy": 0.4808935094127564, + "step": 7924 + }, + { + "epoch": 1.4692250648869114, + "grad_norm": 7.85546875, + "learning_rate": 8.53077493511309e-06, + "loss": 2.5896, + "mean_token_accuracy": 0.467614756406646, + "step": 7925 + }, + { + "epoch": 1.4694104560622914, + "grad_norm": 6.33984375, + "learning_rate": 8.53058954393771e-06, + "loss": 2.503, + "mean_token_accuracy": 0.5060560985975351, + "step": 7926 + }, + { + "epoch": 1.4695958472376716, + "grad_norm": 6.58203125, + "learning_rate": 8.530404152762329e-06, + "loss": 2.8306, + "mean_token_accuracy": 0.48042995677065076, + "step": 7927 + }, + { + "epoch": 1.4697812384130515, + "grad_norm": 5.5546875, + "learning_rate": 8.53021876158695e-06, + "loss": 2.6565, + "mean_token_accuracy": 0.49041165942214265, + "step": 7928 + }, + { + "epoch": 1.4699666295884315, + "grad_norm": 8.609375, + "learning_rate": 8.530033370411568e-06, + "loss": 2.8217, + "mean_token_accuracy": 0.45671572604009736, + "step": 7929 + }, + { + "epoch": 1.4701520207638117, + "grad_norm": 10.3203125, + "learning_rate": 8.529847979236189e-06, + "loss": 2.6475, + "mean_token_accuracy": 0.49955357142857143, + "step": 7930 + }, + { + "epoch": 1.4703374119391917, + "grad_norm": 7.8046875, + "learning_rate": 8.52966258806081e-06, + "loss": 2.4959, + "mean_token_accuracy": 0.48393943681901797, + "step": 7931 + }, + { + "epoch": 1.4705228031145716, + "grad_norm": 4.90625, + "learning_rate": 8.529477196885428e-06, + "loss": 2.7817, + "mean_token_accuracy": 0.45967643838949035, + "step": 7932 + }, + { + "epoch": 1.4707081942899518, + "grad_norm": 10.9921875, + "learning_rate": 8.529291805710049e-06, + "loss": 2.401, + "mean_token_accuracy": 0.5024745269286754, + "step": 7933 + }, + { + "epoch": 1.4708935854653318, + "grad_norm": 6.890625, + "learning_rate": 8.52910641453467e-06, + "loss": 2.9299, + "mean_token_accuracy": 0.4428520243640272, + "step": 7934 + }, + { + "epoch": 1.471078976640712, + "grad_norm": 6.45703125, + "learning_rate": 8.52892102335929e-06, + "loss": 2.7991, + "mean_token_accuracy": 0.45350223546944857, + "step": 7935 + }, + { + "epoch": 1.471264367816092, + "grad_norm": 6.1328125, + "learning_rate": 8.528735632183908e-06, + "loss": 3.2484, + "mean_token_accuracy": 0.4583579444772593, + "step": 7936 + }, + { + "epoch": 1.4714497589914721, + "grad_norm": 13.09375, + "learning_rate": 8.528550241008529e-06, + "loss": 3.0654, + "mean_token_accuracy": 0.4769716088328076, + "step": 7937 + }, + { + "epoch": 1.471635150166852, + "grad_norm": 7.41015625, + "learning_rate": 8.528364849833148e-06, + "loss": 2.432, + "mean_token_accuracy": 0.5048942598187312, + "step": 7938 + }, + { + "epoch": 1.471820541342232, + "grad_norm": 7.4375, + "learning_rate": 8.528179458657768e-06, + "loss": 3.3368, + "mean_token_accuracy": 0.42369251577998196, + "step": 7939 + }, + { + "epoch": 1.4720059325176122, + "grad_norm": 10.765625, + "learning_rate": 8.527994067482389e-06, + "loss": 3.0787, + "mean_token_accuracy": 0.4425763944109963, + "step": 7940 + }, + { + "epoch": 1.4721913236929922, + "grad_norm": 6.69140625, + "learning_rate": 8.52780867630701e-06, + "loss": 2.9073, + "mean_token_accuracy": 0.4663755458515284, + "step": 7941 + }, + { + "epoch": 1.4723767148683722, + "grad_norm": 6.13671875, + "learning_rate": 8.527623285131628e-06, + "loss": 2.9467, + "mean_token_accuracy": 0.4519519519519519, + "step": 7942 + }, + { + "epoch": 1.4725621060437524, + "grad_norm": 6.37109375, + "learning_rate": 8.527437893956249e-06, + "loss": 3.0013, + "mean_token_accuracy": 0.44687219395390604, + "step": 7943 + }, + { + "epoch": 1.4727474972191323, + "grad_norm": 7.8671875, + "learning_rate": 8.527252502780869e-06, + "loss": 2.591, + "mean_token_accuracy": 0.48645703611457036, + "step": 7944 + }, + { + "epoch": 1.4729328883945123, + "grad_norm": 5.25, + "learning_rate": 8.527067111605488e-06, + "loss": 3.1381, + "mean_token_accuracy": 0.43822558963705244, + "step": 7945 + }, + { + "epoch": 1.4731182795698925, + "grad_norm": 7.1015625, + "learning_rate": 8.526881720430108e-06, + "loss": 2.952, + "mean_token_accuracy": 0.4658246656760773, + "step": 7946 + }, + { + "epoch": 1.4733036707452725, + "grad_norm": 5.34765625, + "learning_rate": 8.526696329254727e-06, + "loss": 2.6892, + "mean_token_accuracy": 0.472513423676809, + "step": 7947 + }, + { + "epoch": 1.4734890619206527, + "grad_norm": 5.48046875, + "learning_rate": 8.526510938079348e-06, + "loss": 2.1906, + "mean_token_accuracy": 0.5672640080767289, + "step": 7948 + }, + { + "epoch": 1.4736744530960326, + "grad_norm": 5.93359375, + "learning_rate": 8.526325546903968e-06, + "loss": 2.855, + "mean_token_accuracy": 0.45520361990950226, + "step": 7949 + }, + { + "epoch": 1.4738598442714128, + "grad_norm": 5.8515625, + "learning_rate": 8.526140155728589e-06, + "loss": 3.4076, + "mean_token_accuracy": 0.42464902472356814, + "step": 7950 + }, + { + "epoch": 1.4740452354467928, + "grad_norm": 5.22265625, + "learning_rate": 8.525954764553207e-06, + "loss": 3.3606, + "mean_token_accuracy": 0.42683444083133043, + "step": 7951 + }, + { + "epoch": 1.4742306266221727, + "grad_norm": 8.4765625, + "learning_rate": 8.525769373377828e-06, + "loss": 2.5967, + "mean_token_accuracy": 0.48363431151241537, + "step": 7952 + }, + { + "epoch": 1.474416017797553, + "grad_norm": 5.72265625, + "learning_rate": 8.525583982202448e-06, + "loss": 2.9269, + "mean_token_accuracy": 0.4478978770639656, + "step": 7953 + }, + { + "epoch": 1.474601408972933, + "grad_norm": 6.6875, + "learning_rate": 8.525398591027067e-06, + "loss": 3.2819, + "mean_token_accuracy": 0.4549947581249064, + "step": 7954 + }, + { + "epoch": 1.4747868001483129, + "grad_norm": 10.71875, + "learning_rate": 8.525213199851688e-06, + "loss": 2.3159, + "mean_token_accuracy": 0.5185848634124496, + "step": 7955 + }, + { + "epoch": 1.474972191323693, + "grad_norm": 6.1640625, + "learning_rate": 8.525027808676306e-06, + "loss": 3.1898, + "mean_token_accuracy": 0.44335325932251635, + "step": 7956 + }, + { + "epoch": 1.475157582499073, + "grad_norm": 5.89453125, + "learning_rate": 8.524842417500929e-06, + "loss": 2.8931, + "mean_token_accuracy": 0.47636462142760394, + "step": 7957 + }, + { + "epoch": 1.475342973674453, + "grad_norm": 8.1171875, + "learning_rate": 8.524657026325547e-06, + "loss": 2.4912, + "mean_token_accuracy": 0.4952357683850476, + "step": 7958 + }, + { + "epoch": 1.4755283648498332, + "grad_norm": 8.7109375, + "learning_rate": 8.524471635150168e-06, + "loss": 2.802, + "mean_token_accuracy": 0.47229072031148606, + "step": 7959 + }, + { + "epoch": 1.4757137560252132, + "grad_norm": 5.9140625, + "learning_rate": 8.524286243974787e-06, + "loss": 3.6226, + "mean_token_accuracy": 0.39038262668045504, + "step": 7960 + }, + { + "epoch": 1.4758991472005931, + "grad_norm": 6.8046875, + "learning_rate": 8.524100852799407e-06, + "loss": 2.6566, + "mean_token_accuracy": 0.47561489810260016, + "step": 7961 + }, + { + "epoch": 1.4760845383759733, + "grad_norm": 6.0234375, + "learning_rate": 8.523915461624028e-06, + "loss": 2.5882, + "mean_token_accuracy": 0.4835987477882129, + "step": 7962 + }, + { + "epoch": 1.4762699295513533, + "grad_norm": 6.54296875, + "learning_rate": 8.523730070448647e-06, + "loss": 3.0214, + "mean_token_accuracy": 0.4448405826627279, + "step": 7963 + }, + { + "epoch": 1.4764553207267335, + "grad_norm": 6.16015625, + "learning_rate": 8.523544679273267e-06, + "loss": 3.0039, + "mean_token_accuracy": 0.4574182335282035, + "step": 7964 + }, + { + "epoch": 1.4766407119021134, + "grad_norm": 6.53125, + "learning_rate": 8.523359288097887e-06, + "loss": 2.7845, + "mean_token_accuracy": 0.4779474130619169, + "step": 7965 + }, + { + "epoch": 1.4768261030774936, + "grad_norm": 7.57421875, + "learning_rate": 8.523173896922508e-06, + "loss": 2.4045, + "mean_token_accuracy": 0.5024134014764339, + "step": 7966 + }, + { + "epoch": 1.4770114942528736, + "grad_norm": 5.80859375, + "learning_rate": 8.522988505747127e-06, + "loss": 2.7409, + "mean_token_accuracy": 0.4648303000491884, + "step": 7967 + }, + { + "epoch": 1.4771968854282536, + "grad_norm": 6.32421875, + "learning_rate": 8.522803114571747e-06, + "loss": 2.5199, + "mean_token_accuracy": 0.4842148421484215, + "step": 7968 + }, + { + "epoch": 1.4773822766036337, + "grad_norm": 6.125, + "learning_rate": 8.522617723396368e-06, + "loss": 2.4571, + "mean_token_accuracy": 0.5066621499548328, + "step": 7969 + }, + { + "epoch": 1.4775676677790137, + "grad_norm": 6.2421875, + "learning_rate": 8.522432332220987e-06, + "loss": 3.0045, + "mean_token_accuracy": 0.4302426343154246, + "step": 7970 + }, + { + "epoch": 1.4777530589543937, + "grad_norm": 5.55078125, + "learning_rate": 8.522246941045607e-06, + "loss": 2.7451, + "mean_token_accuracy": 0.47583892617449663, + "step": 7971 + }, + { + "epoch": 1.4779384501297739, + "grad_norm": 5.4375, + "learning_rate": 8.522061549870226e-06, + "loss": 2.5533, + "mean_token_accuracy": 0.48322147651006714, + "step": 7972 + }, + { + "epoch": 1.4781238413051538, + "grad_norm": 5.828125, + "learning_rate": 8.521876158694848e-06, + "loss": 2.7393, + "mean_token_accuracy": 0.46737579452587885, + "step": 7973 + }, + { + "epoch": 1.4783092324805338, + "grad_norm": 6.05859375, + "learning_rate": 8.521690767519467e-06, + "loss": 2.5999, + "mean_token_accuracy": 0.5125786163522013, + "step": 7974 + }, + { + "epoch": 1.478494623655914, + "grad_norm": 5.08984375, + "learning_rate": 8.521505376344087e-06, + "loss": 2.5591, + "mean_token_accuracy": 0.484051724137931, + "step": 7975 + }, + { + "epoch": 1.478680014831294, + "grad_norm": 5.7734375, + "learning_rate": 8.521319985168706e-06, + "loss": 2.5576, + "mean_token_accuracy": 0.4828080229226361, + "step": 7976 + }, + { + "epoch": 1.4788654060066742, + "grad_norm": 5.7421875, + "learning_rate": 8.521134593993327e-06, + "loss": 3.3981, + "mean_token_accuracy": 0.4406198399593548, + "step": 7977 + }, + { + "epoch": 1.4790507971820541, + "grad_norm": 5.73046875, + "learning_rate": 8.520949202817947e-06, + "loss": 3.8009, + "mean_token_accuracy": 0.38714918759231903, + "step": 7978 + }, + { + "epoch": 1.4792361883574343, + "grad_norm": 5.65234375, + "learning_rate": 8.520763811642566e-06, + "loss": 3.0012, + "mean_token_accuracy": 0.46612466124661245, + "step": 7979 + }, + { + "epoch": 1.4794215795328143, + "grad_norm": 5.94921875, + "learning_rate": 8.520578420467186e-06, + "loss": 2.8497, + "mean_token_accuracy": 0.46706708744782655, + "step": 7980 + }, + { + "epoch": 1.4796069707081942, + "grad_norm": 5.65625, + "learning_rate": 8.520393029291807e-06, + "loss": 3.0038, + "mean_token_accuracy": 0.4650495877643685, + "step": 7981 + }, + { + "epoch": 1.4797923618835744, + "grad_norm": 6.25390625, + "learning_rate": 8.520207638116427e-06, + "loss": 2.6513, + "mean_token_accuracy": 0.4771202683277432, + "step": 7982 + }, + { + "epoch": 1.4799777530589544, + "grad_norm": 5.17578125, + "learning_rate": 8.520022246941046e-06, + "loss": 1.716, + "mean_token_accuracy": 0.6261138613861386, + "step": 7983 + }, + { + "epoch": 1.4801631442343344, + "grad_norm": 5.8671875, + "learning_rate": 8.519836855765667e-06, + "loss": 2.8564, + "mean_token_accuracy": 0.4692716705824162, + "step": 7984 + }, + { + "epoch": 1.4803485354097146, + "grad_norm": 5.67578125, + "learning_rate": 8.519651464590285e-06, + "loss": 3.0229, + "mean_token_accuracy": 0.45672031317964334, + "step": 7985 + }, + { + "epoch": 1.4805339265850945, + "grad_norm": 5.0625, + "learning_rate": 8.519466073414906e-06, + "loss": 2.4549, + "mean_token_accuracy": 0.49318568994889267, + "step": 7986 + }, + { + "epoch": 1.4807193177604745, + "grad_norm": 5.64453125, + "learning_rate": 8.519280682239526e-06, + "loss": 3.1252, + "mean_token_accuracy": 0.4632725042171446, + "step": 7987 + }, + { + "epoch": 1.4809047089358547, + "grad_norm": 7.40234375, + "learning_rate": 8.519095291064145e-06, + "loss": 3.0907, + "mean_token_accuracy": 0.4764595103578154, + "step": 7988 + }, + { + "epoch": 1.4810901001112347, + "grad_norm": 7.48046875, + "learning_rate": 8.518909899888766e-06, + "loss": 2.7589, + "mean_token_accuracy": 0.5023857164845313, + "step": 7989 + }, + { + "epoch": 1.4812754912866146, + "grad_norm": 7.0859375, + "learning_rate": 8.518724508713386e-06, + "loss": 2.7447, + "mean_token_accuracy": 0.4683005576753742, + "step": 7990 + }, + { + "epoch": 1.4814608824619948, + "grad_norm": 6.6796875, + "learning_rate": 8.518539117538007e-06, + "loss": 3.489, + "mean_token_accuracy": 0.41300906605266946, + "step": 7991 + }, + { + "epoch": 1.4816462736373748, + "grad_norm": 6.9765625, + "learning_rate": 8.518353726362626e-06, + "loss": 3.5141, + "mean_token_accuracy": 0.3938763012859767, + "step": 7992 + }, + { + "epoch": 1.481831664812755, + "grad_norm": 6.41796875, + "learning_rate": 8.518168335187246e-06, + "loss": 2.7942, + "mean_token_accuracy": 0.450038236043844, + "step": 7993 + }, + { + "epoch": 1.482017055988135, + "grad_norm": 6.03125, + "learning_rate": 8.517982944011865e-06, + "loss": 3.0007, + "mean_token_accuracy": 0.44292003685881026, + "step": 7994 + }, + { + "epoch": 1.4822024471635151, + "grad_norm": 7.2734375, + "learning_rate": 8.517797552836485e-06, + "loss": 3.4898, + "mean_token_accuracy": 0.4078859060402685, + "step": 7995 + }, + { + "epoch": 1.482387838338895, + "grad_norm": 7.29296875, + "learning_rate": 8.517612161661106e-06, + "loss": 2.9845, + "mean_token_accuracy": 0.46103575832305793, + "step": 7996 + }, + { + "epoch": 1.482573229514275, + "grad_norm": 7.359375, + "learning_rate": 8.517426770485726e-06, + "loss": 2.7333, + "mean_token_accuracy": 0.46562138542603776, + "step": 7997 + }, + { + "epoch": 1.4827586206896552, + "grad_norm": 5.89453125, + "learning_rate": 8.517241379310345e-06, + "loss": 3.0431, + "mean_token_accuracy": 0.4291243853959619, + "step": 7998 + }, + { + "epoch": 1.4829440118650352, + "grad_norm": 5.328125, + "learning_rate": 8.517055988134966e-06, + "loss": 2.3431, + "mean_token_accuracy": 0.5415029177087055, + "step": 7999 + }, + { + "epoch": 1.4831294030404152, + "grad_norm": 6.28515625, + "learning_rate": 8.516870596959586e-06, + "loss": 2.9362, + "mean_token_accuracy": 0.4457454050374404, + "step": 8000 + }, + { + "epoch": 1.4833147942157954, + "grad_norm": 7.9453125, + "learning_rate": 8.516685205784205e-06, + "loss": 2.6462, + "mean_token_accuracy": 0.4828295042321645, + "step": 8001 + }, + { + "epoch": 1.4835001853911753, + "grad_norm": 6.578125, + "learning_rate": 8.516499814608825e-06, + "loss": 3.0169, + "mean_token_accuracy": 0.46002273588480486, + "step": 8002 + }, + { + "epoch": 1.4836855765665553, + "grad_norm": 5.62890625, + "learning_rate": 8.516314423433444e-06, + "loss": 3.276, + "mean_token_accuracy": 0.415587219343696, + "step": 8003 + }, + { + "epoch": 1.4838709677419355, + "grad_norm": 6.2421875, + "learning_rate": 8.516129032258065e-06, + "loss": 3.5254, + "mean_token_accuracy": 0.4, + "step": 8004 + }, + { + "epoch": 1.4840563589173155, + "grad_norm": 6.09375, + "learning_rate": 8.515943641082685e-06, + "loss": 2.7737, + "mean_token_accuracy": 0.4678609062170706, + "step": 8005 + }, + { + "epoch": 1.4842417500926957, + "grad_norm": 6.3046875, + "learning_rate": 8.515758249907306e-06, + "loss": 3.2617, + "mean_token_accuracy": 0.4360674643217529, + "step": 8006 + }, + { + "epoch": 1.4844271412680756, + "grad_norm": 5.49609375, + "learning_rate": 8.515572858731926e-06, + "loss": 2.9263, + "mean_token_accuracy": 0.4251207729468599, + "step": 8007 + }, + { + "epoch": 1.4846125324434558, + "grad_norm": 5.58984375, + "learning_rate": 8.515387467556545e-06, + "loss": 3.2865, + "mean_token_accuracy": 0.43304046858359957, + "step": 8008 + }, + { + "epoch": 1.4847979236188358, + "grad_norm": 6.07421875, + "learning_rate": 8.515202076381165e-06, + "loss": 2.5854, + "mean_token_accuracy": 0.4826239224137931, + "step": 8009 + }, + { + "epoch": 1.4849833147942157, + "grad_norm": 6.43359375, + "learning_rate": 8.515016685205784e-06, + "loss": 2.9379, + "mean_token_accuracy": 0.4546051551469381, + "step": 8010 + }, + { + "epoch": 1.485168705969596, + "grad_norm": 7.328125, + "learning_rate": 8.514831294030405e-06, + "loss": 2.9528, + "mean_token_accuracy": 0.4526656701544594, + "step": 8011 + }, + { + "epoch": 1.485354097144976, + "grad_norm": 7.66015625, + "learning_rate": 8.514645902855025e-06, + "loss": 2.7272, + "mean_token_accuracy": 0.48482169171824346, + "step": 8012 + }, + { + "epoch": 1.4855394883203559, + "grad_norm": 7.56640625, + "learning_rate": 8.514460511679646e-06, + "loss": 2.8881, + "mean_token_accuracy": 0.4569899665551839, + "step": 8013 + }, + { + "epoch": 1.485724879495736, + "grad_norm": 5.75390625, + "learning_rate": 8.514275120504264e-06, + "loss": 2.7165, + "mean_token_accuracy": 0.4841328413284133, + "step": 8014 + }, + { + "epoch": 1.485910270671116, + "grad_norm": 5.68359375, + "learning_rate": 8.514089729328885e-06, + "loss": 2.8969, + "mean_token_accuracy": 0.45917482344195265, + "step": 8015 + }, + { + "epoch": 1.486095661846496, + "grad_norm": 5.265625, + "learning_rate": 8.513904338153505e-06, + "loss": 2.6781, + "mean_token_accuracy": 0.49518510592766957, + "step": 8016 + }, + { + "epoch": 1.4862810530218762, + "grad_norm": 6.0390625, + "learning_rate": 8.513718946978124e-06, + "loss": 2.853, + "mean_token_accuracy": 0.4585130239779196, + "step": 8017 + }, + { + "epoch": 1.4864664441972562, + "grad_norm": 7.95703125, + "learning_rate": 8.513533555802745e-06, + "loss": 2.6231, + "mean_token_accuracy": 0.4911820781696854, + "step": 8018 + }, + { + "epoch": 1.4866518353726363, + "grad_norm": 5.92578125, + "learning_rate": 8.513348164627364e-06, + "loss": 2.4723, + "mean_token_accuracy": 0.5340948425060574, + "step": 8019 + }, + { + "epoch": 1.4868372265480163, + "grad_norm": 6.37890625, + "learning_rate": 8.513162773451984e-06, + "loss": 2.8321, + "mean_token_accuracy": 0.4672766552293811, + "step": 8020 + }, + { + "epoch": 1.4870226177233965, + "grad_norm": 9.9609375, + "learning_rate": 8.512977382276605e-06, + "loss": 3.1043, + "mean_token_accuracy": 0.4352419460023477, + "step": 8021 + }, + { + "epoch": 1.4872080088987765, + "grad_norm": 10.0, + "learning_rate": 8.512791991101225e-06, + "loss": 2.8343, + "mean_token_accuracy": 0.4513677811550152, + "step": 8022 + }, + { + "epoch": 1.4873934000741564, + "grad_norm": 6.2890625, + "learning_rate": 8.512606599925844e-06, + "loss": 2.6724, + "mean_token_accuracy": 0.5019402985074627, + "step": 8023 + }, + { + "epoch": 1.4875787912495366, + "grad_norm": 9.71875, + "learning_rate": 8.512421208750464e-06, + "loss": 2.7667, + "mean_token_accuracy": 0.4716202270381837, + "step": 8024 + }, + { + "epoch": 1.4877641824249166, + "grad_norm": 8.953125, + "learning_rate": 8.512235817575085e-06, + "loss": 2.9192, + "mean_token_accuracy": 0.47719828672209624, + "step": 8025 + }, + { + "epoch": 1.4879495736002966, + "grad_norm": 6.03515625, + "learning_rate": 8.512050426399704e-06, + "loss": 2.8504, + "mean_token_accuracy": 0.45606117588065687, + "step": 8026 + }, + { + "epoch": 1.4881349647756767, + "grad_norm": 6.7578125, + "learning_rate": 8.511865035224324e-06, + "loss": 2.508, + "mean_token_accuracy": 0.4936126724578436, + "step": 8027 + }, + { + "epoch": 1.4883203559510567, + "grad_norm": 6.1328125, + "learning_rate": 8.511679644048943e-06, + "loss": 3.1028, + "mean_token_accuracy": 0.4385822510822511, + "step": 8028 + }, + { + "epoch": 1.4885057471264367, + "grad_norm": 7.55859375, + "learning_rate": 8.511494252873565e-06, + "loss": 3.4797, + "mean_token_accuracy": 0.44996247185389043, + "step": 8029 + }, + { + "epoch": 1.4886911383018169, + "grad_norm": 5.375, + "learning_rate": 8.511308861698184e-06, + "loss": 3.0795, + "mean_token_accuracy": 0.44145923877255366, + "step": 8030 + }, + { + "epoch": 1.4888765294771968, + "grad_norm": 5.5546875, + "learning_rate": 8.511123470522804e-06, + "loss": 2.8607, + "mean_token_accuracy": 0.45576947275701524, + "step": 8031 + }, + { + "epoch": 1.4890619206525768, + "grad_norm": 10.203125, + "learning_rate": 8.510938079347423e-06, + "loss": 3.3016, + "mean_token_accuracy": 0.42113529279784606, + "step": 8032 + }, + { + "epoch": 1.489247311827957, + "grad_norm": 5.89453125, + "learning_rate": 8.510752688172044e-06, + "loss": 3.1071, + "mean_token_accuracy": 0.45742395114780054, + "step": 8033 + }, + { + "epoch": 1.489432703003337, + "grad_norm": 7.05078125, + "learning_rate": 8.510567296996664e-06, + "loss": 3.1208, + "mean_token_accuracy": 0.43698665297741274, + "step": 8034 + }, + { + "epoch": 1.4896180941787172, + "grad_norm": 5.80078125, + "learning_rate": 8.510381905821283e-06, + "loss": 3.8054, + "mean_token_accuracy": 0.3904093567251462, + "step": 8035 + }, + { + "epoch": 1.4898034853540971, + "grad_norm": 6.55859375, + "learning_rate": 8.510196514645903e-06, + "loss": 3.0763, + "mean_token_accuracy": 0.42972579149556234, + "step": 8036 + }, + { + "epoch": 1.4899888765294773, + "grad_norm": 5.64453125, + "learning_rate": 8.510011123470524e-06, + "loss": 3.0177, + "mean_token_accuracy": 0.4467764060356653, + "step": 8037 + }, + { + "epoch": 1.4901742677048573, + "grad_norm": 6.01171875, + "learning_rate": 8.509825732295144e-06, + "loss": 2.623, + "mean_token_accuracy": 0.47832090582711956, + "step": 8038 + }, + { + "epoch": 1.4903596588802372, + "grad_norm": 5.6953125, + "learning_rate": 8.509640341119763e-06, + "loss": 2.3997, + "mean_token_accuracy": 0.5045176333430487, + "step": 8039 + }, + { + "epoch": 1.4905450500556174, + "grad_norm": 5.8671875, + "learning_rate": 8.509454949944384e-06, + "loss": 3.6561, + "mean_token_accuracy": 0.4093258426966292, + "step": 8040 + }, + { + "epoch": 1.4907304412309974, + "grad_norm": 7.0078125, + "learning_rate": 8.509269558769002e-06, + "loss": 2.6608, + "mean_token_accuracy": 0.46412213740458014, + "step": 8041 + }, + { + "epoch": 1.4909158324063774, + "grad_norm": 7.79296875, + "learning_rate": 8.509084167593623e-06, + "loss": 1.7924, + "mean_token_accuracy": 0.6134769291571979, + "step": 8042 + }, + { + "epoch": 1.4911012235817576, + "grad_norm": 7.73046875, + "learning_rate": 8.508898776418243e-06, + "loss": 2.6075, + "mean_token_accuracy": 0.4886890543768442, + "step": 8043 + }, + { + "epoch": 1.4912866147571375, + "grad_norm": 5.32421875, + "learning_rate": 8.508713385242862e-06, + "loss": 2.7405, + "mean_token_accuracy": 0.47257427799709123, + "step": 8044 + }, + { + "epoch": 1.4914720059325175, + "grad_norm": 7.70703125, + "learning_rate": 8.508527994067483e-06, + "loss": 3.1725, + "mean_token_accuracy": 0.4302888368462139, + "step": 8045 + }, + { + "epoch": 1.4916573971078977, + "grad_norm": 7.1953125, + "learning_rate": 8.508342602892103e-06, + "loss": 3.4588, + "mean_token_accuracy": 0.4131944444444444, + "step": 8046 + }, + { + "epoch": 1.4918427882832777, + "grad_norm": 6.6484375, + "learning_rate": 8.508157211716724e-06, + "loss": 3.1421, + "mean_token_accuracy": 0.4522912361735748, + "step": 8047 + }, + { + "epoch": 1.4920281794586578, + "grad_norm": 7.17578125, + "learning_rate": 8.507971820541343e-06, + "loss": 2.4212, + "mean_token_accuracy": 0.506042122223501, + "step": 8048 + }, + { + "epoch": 1.4922135706340378, + "grad_norm": 5.5625, + "learning_rate": 8.507786429365963e-06, + "loss": 2.6861, + "mean_token_accuracy": 0.4791395045632334, + "step": 8049 + }, + { + "epoch": 1.492398961809418, + "grad_norm": 5.9921875, + "learning_rate": 8.507601038190584e-06, + "loss": 2.5831, + "mean_token_accuracy": 0.499843961302403, + "step": 8050 + }, + { + "epoch": 1.492584352984798, + "grad_norm": 6.484375, + "learning_rate": 8.507415647015202e-06, + "loss": 3.1776, + "mean_token_accuracy": 0.4453522429474333, + "step": 8051 + }, + { + "epoch": 1.492769744160178, + "grad_norm": 6.56640625, + "learning_rate": 8.507230255839823e-06, + "loss": 2.9997, + "mean_token_accuracy": 0.4337108594270486, + "step": 8052 + }, + { + "epoch": 1.4929551353355581, + "grad_norm": 6.1953125, + "learning_rate": 8.507044864664442e-06, + "loss": 3.1659, + "mean_token_accuracy": 0.4317763268039511, + "step": 8053 + }, + { + "epoch": 1.493140526510938, + "grad_norm": 5.4921875, + "learning_rate": 8.506859473489064e-06, + "loss": 2.6612, + "mean_token_accuracy": 0.4779655900996076, + "step": 8054 + }, + { + "epoch": 1.493325917686318, + "grad_norm": 7.01953125, + "learning_rate": 8.506674082313683e-06, + "loss": 2.6773, + "mean_token_accuracy": 0.46616732258474036, + "step": 8055 + }, + { + "epoch": 1.4935113088616983, + "grad_norm": 7.2734375, + "learning_rate": 8.506488691138303e-06, + "loss": 2.5989, + "mean_token_accuracy": 0.5004418634010858, + "step": 8056 + }, + { + "epoch": 1.4936967000370782, + "grad_norm": 5.48046875, + "learning_rate": 8.506303299962922e-06, + "loss": 2.6704, + "mean_token_accuracy": 0.48788443616029825, + "step": 8057 + }, + { + "epoch": 1.4938820912124582, + "grad_norm": 7.375, + "learning_rate": 8.506117908787542e-06, + "loss": 3.2257, + "mean_token_accuracy": 0.438489488710096, + "step": 8058 + }, + { + "epoch": 1.4940674823878384, + "grad_norm": 7.3125, + "learning_rate": 8.505932517612163e-06, + "loss": 2.5419, + "mean_token_accuracy": 0.4870290302655961, + "step": 8059 + }, + { + "epoch": 1.4942528735632183, + "grad_norm": 6.203125, + "learning_rate": 8.505747126436782e-06, + "loss": 3.5305, + "mean_token_accuracy": 0.4162080352228949, + "step": 8060 + }, + { + "epoch": 1.4944382647385983, + "grad_norm": 6.73046875, + "learning_rate": 8.505561735261402e-06, + "loss": 2.8037, + "mean_token_accuracy": 0.47136836886465044, + "step": 8061 + }, + { + "epoch": 1.4946236559139785, + "grad_norm": 7.11328125, + "learning_rate": 8.505376344086023e-06, + "loss": 2.29, + "mean_token_accuracy": 0.5176174496644296, + "step": 8062 + }, + { + "epoch": 1.4948090470893585, + "grad_norm": 6.34765625, + "learning_rate": 8.505190952910643e-06, + "loss": 3.1231, + "mean_token_accuracy": 0.44406538734896944, + "step": 8063 + }, + { + "epoch": 1.4949944382647387, + "grad_norm": 5.86328125, + "learning_rate": 8.505005561735262e-06, + "loss": 2.8737, + "mean_token_accuracy": 0.4923916465526288, + "step": 8064 + }, + { + "epoch": 1.4951798294401186, + "grad_norm": 5.99609375, + "learning_rate": 8.504820170559882e-06, + "loss": 2.9468, + "mean_token_accuracy": 0.454889957732109, + "step": 8065 + }, + { + "epoch": 1.4953652206154988, + "grad_norm": 7.12109375, + "learning_rate": 8.504634779384501e-06, + "loss": 2.7109, + "mean_token_accuracy": 0.4582555348092825, + "step": 8066 + }, + { + "epoch": 1.4955506117908788, + "grad_norm": 5.64453125, + "learning_rate": 8.504449388209122e-06, + "loss": 2.7472, + "mean_token_accuracy": 0.4600153295861012, + "step": 8067 + }, + { + "epoch": 1.4957360029662587, + "grad_norm": 5.890625, + "learning_rate": 8.504263997033742e-06, + "loss": 3.4505, + "mean_token_accuracy": 0.4184286400768861, + "step": 8068 + }, + { + "epoch": 1.495921394141639, + "grad_norm": 6.515625, + "learning_rate": 8.504078605858361e-06, + "loss": 2.9889, + "mean_token_accuracy": 0.44995278564683666, + "step": 8069 + }, + { + "epoch": 1.496106785317019, + "grad_norm": 5.734375, + "learning_rate": 8.503893214682981e-06, + "loss": 2.5889, + "mean_token_accuracy": 0.483691431529899, + "step": 8070 + }, + { + "epoch": 1.4962921764923989, + "grad_norm": 5.55859375, + "learning_rate": 8.503707823507602e-06, + "loss": 2.8222, + "mean_token_accuracy": 0.4794265065858492, + "step": 8071 + }, + { + "epoch": 1.496477567667779, + "grad_norm": 5.9453125, + "learning_rate": 8.503522432332222e-06, + "loss": 2.2889, + "mean_token_accuracy": 0.5097512554011444, + "step": 8072 + }, + { + "epoch": 1.496662958843159, + "grad_norm": 5.3828125, + "learning_rate": 8.503337041156841e-06, + "loss": 2.425, + "mean_token_accuracy": 0.5084414678387051, + "step": 8073 + }, + { + "epoch": 1.496848350018539, + "grad_norm": 6.22265625, + "learning_rate": 8.503151649981462e-06, + "loss": 2.6927, + "mean_token_accuracy": 0.4720052083333333, + "step": 8074 + }, + { + "epoch": 1.4970337411939192, + "grad_norm": 5.7734375, + "learning_rate": 8.50296625880608e-06, + "loss": 3.0126, + "mean_token_accuracy": 0.44879131145626533, + "step": 8075 + }, + { + "epoch": 1.4972191323692992, + "grad_norm": 5.40625, + "learning_rate": 8.502780867630701e-06, + "loss": 2.6459, + "mean_token_accuracy": 0.46622291460133125, + "step": 8076 + }, + { + "epoch": 1.4974045235446793, + "grad_norm": 5.43359375, + "learning_rate": 8.502595476455322e-06, + "loss": 3.2082, + "mean_token_accuracy": 0.43470902226272623, + "step": 8077 + }, + { + "epoch": 1.4975899147200593, + "grad_norm": 5.23828125, + "learning_rate": 8.502410085279942e-06, + "loss": 2.9601, + "mean_token_accuracy": 0.4596949891067538, + "step": 8078 + }, + { + "epoch": 1.4977753058954395, + "grad_norm": 6.23046875, + "learning_rate": 8.50222469410456e-06, + "loss": 3.3594, + "mean_token_accuracy": 0.4082827860280276, + "step": 8079 + }, + { + "epoch": 1.4979606970708195, + "grad_norm": 5.97265625, + "learning_rate": 8.502039302929181e-06, + "loss": 2.5842, + "mean_token_accuracy": 0.4871861924686193, + "step": 8080 + }, + { + "epoch": 1.4981460882461994, + "grad_norm": 6.67578125, + "learning_rate": 8.501853911753802e-06, + "loss": 3.2203, + "mean_token_accuracy": 0.4296975546975547, + "step": 8081 + }, + { + "epoch": 1.4983314794215796, + "grad_norm": 6.62890625, + "learning_rate": 8.50166852057842e-06, + "loss": 2.6627, + "mean_token_accuracy": 0.4765916476110242, + "step": 8082 + }, + { + "epoch": 1.4985168705969596, + "grad_norm": 5.4453125, + "learning_rate": 8.501483129403041e-06, + "loss": 2.9307, + "mean_token_accuracy": 0.4637900874635569, + "step": 8083 + }, + { + "epoch": 1.4987022617723396, + "grad_norm": 6.19140625, + "learning_rate": 8.50129773822766e-06, + "loss": 3.6897, + "mean_token_accuracy": 0.40058020065272576, + "step": 8084 + }, + { + "epoch": 1.4988876529477198, + "grad_norm": 6.58984375, + "learning_rate": 8.50111234705228e-06, + "loss": 2.7901, + "mean_token_accuracy": 0.45515558267236117, + "step": 8085 + }, + { + "epoch": 1.4990730441230997, + "grad_norm": 5.5546875, + "learning_rate": 8.500926955876901e-06, + "loss": 3.3627, + "mean_token_accuracy": 0.42082210242587603, + "step": 8086 + }, + { + "epoch": 1.4992584352984797, + "grad_norm": 5.125, + "learning_rate": 8.500741564701521e-06, + "loss": 2.1672, + "mean_token_accuracy": 0.552065404475043, + "step": 8087 + }, + { + "epoch": 1.4994438264738599, + "grad_norm": 6.83984375, + "learning_rate": 8.500556173526142e-06, + "loss": 2.5806, + "mean_token_accuracy": 0.5043033889187736, + "step": 8088 + }, + { + "epoch": 1.4996292176492398, + "grad_norm": 6.671875, + "learning_rate": 8.50037078235076e-06, + "loss": 3.0413, + "mean_token_accuracy": 0.44064602960969046, + "step": 8089 + }, + { + "epoch": 1.4998146088246198, + "grad_norm": 5.6484375, + "learning_rate": 8.500185391175381e-06, + "loss": 3.1397, + "mean_token_accuracy": 0.43716448726772195, + "step": 8090 + }, + { + "epoch": 1.5, + "grad_norm": 5.50390625, + "learning_rate": 8.5e-06, + "loss": 2.6092, + "mean_token_accuracy": 0.47680293982544786, + "step": 8091 + }, + { + "epoch": 1.5001853911753802, + "grad_norm": 5.4453125, + "learning_rate": 8.49981460882462e-06, + "loss": 2.9471, + "mean_token_accuracy": 0.45460758628545395, + "step": 8092 + }, + { + "epoch": 1.5003707823507602, + "grad_norm": 7.83203125, + "learning_rate": 8.499629217649241e-06, + "loss": 2.4695, + "mean_token_accuracy": 0.49622411693057245, + "step": 8093 + }, + { + "epoch": 1.5005561735261401, + "grad_norm": 5.921875, + "learning_rate": 8.499443826473861e-06, + "loss": 2.3143, + "mean_token_accuracy": 0.5380031972097079, + "step": 8094 + }, + { + "epoch": 1.5007415647015203, + "grad_norm": 6.55859375, + "learning_rate": 8.49925843529848e-06, + "loss": 2.5854, + "mean_token_accuracy": 0.4996929124186218, + "step": 8095 + }, + { + "epoch": 1.5009269558769003, + "grad_norm": 6.78515625, + "learning_rate": 8.4990730441231e-06, + "loss": 3.0328, + "mean_token_accuracy": 0.463856993736952, + "step": 8096 + }, + { + "epoch": 1.5011123470522802, + "grad_norm": 5.4765625, + "learning_rate": 8.498887652947721e-06, + "loss": 2.8831, + "mean_token_accuracy": 0.478944820909971, + "step": 8097 + }, + { + "epoch": 1.5012977382276604, + "grad_norm": 5.328125, + "learning_rate": 8.49870226177234e-06, + "loss": 2.6734, + "mean_token_accuracy": 0.4641961549178044, + "step": 8098 + }, + { + "epoch": 1.5014831294030404, + "grad_norm": 6.53515625, + "learning_rate": 8.49851687059696e-06, + "loss": 2.7245, + "mean_token_accuracy": 0.4691149909692956, + "step": 8099 + }, + { + "epoch": 1.5016685205784204, + "grad_norm": 5.25390625, + "learning_rate": 8.49833147942158e-06, + "loss": 3.2307, + "mean_token_accuracy": 0.43688427689478526, + "step": 8100 + }, + { + "epoch": 1.5018539117538006, + "grad_norm": 5.8671875, + "learning_rate": 8.4981460882462e-06, + "loss": 2.1917, + "mean_token_accuracy": 0.5314391599752933, + "step": 8101 + }, + { + "epoch": 1.5020393029291805, + "grad_norm": 8.3203125, + "learning_rate": 8.49796069707082e-06, + "loss": 2.8677, + "mean_token_accuracy": 0.46004490820235105, + "step": 8102 + }, + { + "epoch": 1.5022246941045605, + "grad_norm": 8.359375, + "learning_rate": 8.49777530589544e-06, + "loss": 2.7993, + "mean_token_accuracy": 0.475659924580048, + "step": 8103 + }, + { + "epoch": 1.5024100852799407, + "grad_norm": 6.42578125, + "learning_rate": 8.49758991472006e-06, + "loss": 2.9356, + "mean_token_accuracy": 0.4481491205040693, + "step": 8104 + }, + { + "epoch": 1.5025954764553209, + "grad_norm": 5.05078125, + "learning_rate": 8.49740452354468e-06, + "loss": 3.0479, + "mean_token_accuracy": 0.4527973927213471, + "step": 8105 + }, + { + "epoch": 1.5027808676307006, + "grad_norm": 5.28515625, + "learning_rate": 8.4972191323693e-06, + "loss": 2.9286, + "mean_token_accuracy": 0.43651452282157677, + "step": 8106 + }, + { + "epoch": 1.5029662588060808, + "grad_norm": 7.1484375, + "learning_rate": 8.49703374119392e-06, + "loss": 3.1495, + "mean_token_accuracy": 0.4466930469191634, + "step": 8107 + }, + { + "epoch": 1.503151649981461, + "grad_norm": 6.5078125, + "learning_rate": 8.49684835001854e-06, + "loss": 2.3214, + "mean_token_accuracy": 0.5451505016722408, + "step": 8108 + }, + { + "epoch": 1.503337041156841, + "grad_norm": 5.87109375, + "learning_rate": 8.496662958843159e-06, + "loss": 3.4007, + "mean_token_accuracy": 0.41091424521615877, + "step": 8109 + }, + { + "epoch": 1.503522432332221, + "grad_norm": 5.81640625, + "learning_rate": 8.49647756766778e-06, + "loss": 2.9649, + "mean_token_accuracy": 0.4525146962769432, + "step": 8110 + }, + { + "epoch": 1.5037078235076011, + "grad_norm": 5.66015625, + "learning_rate": 8.4962921764924e-06, + "loss": 2.7966, + "mean_token_accuracy": 0.4538901317424807, + "step": 8111 + }, + { + "epoch": 1.503893214682981, + "grad_norm": 6.4609375, + "learning_rate": 8.49610678531702e-06, + "loss": 2.2029, + "mean_token_accuracy": 0.5537365791431912, + "step": 8112 + }, + { + "epoch": 1.504078605858361, + "grad_norm": 8.375, + "learning_rate": 8.495921394141639e-06, + "loss": 3.0267, + "mean_token_accuracy": 0.4945741532390661, + "step": 8113 + }, + { + "epoch": 1.5042639970337413, + "grad_norm": 6.80859375, + "learning_rate": 8.49573600296626e-06, + "loss": 3.2295, + "mean_token_accuracy": 0.43619281959512357, + "step": 8114 + }, + { + "epoch": 1.5044493882091212, + "grad_norm": 6.46484375, + "learning_rate": 8.49555061179088e-06, + "loss": 3.7332, + "mean_token_accuracy": 0.40746835443037976, + "step": 8115 + }, + { + "epoch": 1.5046347793845012, + "grad_norm": 9.203125, + "learning_rate": 8.495365220615499e-06, + "loss": 2.7778, + "mean_token_accuracy": 0.46215483234714005, + "step": 8116 + }, + { + "epoch": 1.5048201705598814, + "grad_norm": 6.77734375, + "learning_rate": 8.49517982944012e-06, + "loss": 2.6319, + "mean_token_accuracy": 0.4542042042042042, + "step": 8117 + }, + { + "epoch": 1.5050055617352616, + "grad_norm": 8.328125, + "learning_rate": 8.49499443826474e-06, + "loss": 2.8717, + "mean_token_accuracy": 0.45720850086157383, + "step": 8118 + }, + { + "epoch": 1.5051909529106413, + "grad_norm": 5.77734375, + "learning_rate": 8.49480904708936e-06, + "loss": 2.9367, + "mean_token_accuracy": 0.4533754249635745, + "step": 8119 + }, + { + "epoch": 1.5053763440860215, + "grad_norm": 7.82421875, + "learning_rate": 8.494623655913979e-06, + "loss": 2.9803, + "mean_token_accuracy": 0.44193078732220853, + "step": 8120 + }, + { + "epoch": 1.5055617352614017, + "grad_norm": 6.6015625, + "learning_rate": 8.4944382647386e-06, + "loss": 2.7145, + "mean_token_accuracy": 0.4983169516628518, + "step": 8121 + }, + { + "epoch": 1.5057471264367817, + "grad_norm": 6.953125, + "learning_rate": 8.494252873563218e-06, + "loss": 3.0248, + "mean_token_accuracy": 0.4381896689588997, + "step": 8122 + }, + { + "epoch": 1.5059325176121616, + "grad_norm": 5.90234375, + "learning_rate": 8.494067482387839e-06, + "loss": 3.3629, + "mean_token_accuracy": 0.4031028487361241, + "step": 8123 + }, + { + "epoch": 1.5061179087875418, + "grad_norm": 6.18359375, + "learning_rate": 8.49388209121246e-06, + "loss": 3.0878, + "mean_token_accuracy": 0.4521038495971352, + "step": 8124 + }, + { + "epoch": 1.5063032999629218, + "grad_norm": 6.61328125, + "learning_rate": 8.493696700037078e-06, + "loss": 2.5116, + "mean_token_accuracy": 0.48084748140872735, + "step": 8125 + }, + { + "epoch": 1.5064886911383017, + "grad_norm": 6.3515625, + "learning_rate": 8.4935113088617e-06, + "loss": 2.8296, + "mean_token_accuracy": 0.4741880983668106, + "step": 8126 + }, + { + "epoch": 1.506674082313682, + "grad_norm": 6.90234375, + "learning_rate": 8.493325917686319e-06, + "loss": 3.4793, + "mean_token_accuracy": 0.39510011192637734, + "step": 8127 + }, + { + "epoch": 1.506859473489062, + "grad_norm": 7.0859375, + "learning_rate": 8.49314052651094e-06, + "loss": 2.7283, + "mean_token_accuracy": 0.49664529595944534, + "step": 8128 + }, + { + "epoch": 1.5070448646644419, + "grad_norm": 9.21875, + "learning_rate": 8.492955135335558e-06, + "loss": 2.4883, + "mean_token_accuracy": 0.5003728560775541, + "step": 8129 + }, + { + "epoch": 1.507230255839822, + "grad_norm": 5.3046875, + "learning_rate": 8.492769744160179e-06, + "loss": 2.2301, + "mean_token_accuracy": 0.5423567258429643, + "step": 8130 + }, + { + "epoch": 1.507415647015202, + "grad_norm": 5.3515625, + "learning_rate": 8.4925843529848e-06, + "loss": 2.6135, + "mean_token_accuracy": 0.481780210283739, + "step": 8131 + }, + { + "epoch": 1.507601038190582, + "grad_norm": 6.21484375, + "learning_rate": 8.492398961809418e-06, + "loss": 2.6376, + "mean_token_accuracy": 0.4885966139639009, + "step": 8132 + }, + { + "epoch": 1.5077864293659622, + "grad_norm": 6.35546875, + "learning_rate": 8.492213570634039e-06, + "loss": 2.6117, + "mean_token_accuracy": 0.511895722601873, + "step": 8133 + }, + { + "epoch": 1.5079718205413424, + "grad_norm": 5.859375, + "learning_rate": 8.492028179458659e-06, + "loss": 3.1044, + "mean_token_accuracy": 0.45163014430785675, + "step": 8134 + }, + { + "epoch": 1.5081572117167221, + "grad_norm": 5.53515625, + "learning_rate": 8.49184278828328e-06, + "loss": 3.212, + "mean_token_accuracy": 0.4069689524234979, + "step": 8135 + }, + { + "epoch": 1.5083426028921023, + "grad_norm": 9.8359375, + "learning_rate": 8.491657397107898e-06, + "loss": 2.7327, + "mean_token_accuracy": 0.4587066825002699, + "step": 8136 + }, + { + "epoch": 1.5085279940674825, + "grad_norm": 7.08203125, + "learning_rate": 8.491472005932519e-06, + "loss": 3.0077, + "mean_token_accuracy": 0.4471955533097524, + "step": 8137 + }, + { + "epoch": 1.5087133852428625, + "grad_norm": 8.4375, + "learning_rate": 8.491286614757138e-06, + "loss": 3.3161, + "mean_token_accuracy": 0.41439710701774213, + "step": 8138 + }, + { + "epoch": 1.5088987764182424, + "grad_norm": 8.5859375, + "learning_rate": 8.491101223581758e-06, + "loss": 2.1968, + "mean_token_accuracy": 0.5201405152224824, + "step": 8139 + }, + { + "epoch": 1.5090841675936226, + "grad_norm": 5.515625, + "learning_rate": 8.490915832406379e-06, + "loss": 2.6471, + "mean_token_accuracy": 0.4886032191854813, + "step": 8140 + }, + { + "epoch": 1.5092695587690026, + "grad_norm": 6.875, + "learning_rate": 8.490730441230997e-06, + "loss": 2.5118, + "mean_token_accuracy": 0.48185401658284627, + "step": 8141 + }, + { + "epoch": 1.5094549499443826, + "grad_norm": 7.578125, + "learning_rate": 8.490545050055618e-06, + "loss": 2.7685, + "mean_token_accuracy": 0.4690656565656566, + "step": 8142 + }, + { + "epoch": 1.5096403411197628, + "grad_norm": 6.69140625, + "learning_rate": 8.490359658880238e-06, + "loss": 2.8193, + "mean_token_accuracy": 0.47326596683998823, + "step": 8143 + }, + { + "epoch": 1.5098257322951427, + "grad_norm": 6.29296875, + "learning_rate": 8.490174267704859e-06, + "loss": 2.5314, + "mean_token_accuracy": 0.4626950354609929, + "step": 8144 + }, + { + "epoch": 1.5100111234705227, + "grad_norm": 5.6796875, + "learning_rate": 8.489988876529478e-06, + "loss": 2.8902, + "mean_token_accuracy": 0.46473616473616475, + "step": 8145 + }, + { + "epoch": 1.5101965146459029, + "grad_norm": 8.984375, + "learning_rate": 8.489803485354098e-06, + "loss": 2.4983, + "mean_token_accuracy": 0.49335956714215445, + "step": 8146 + }, + { + "epoch": 1.510381905821283, + "grad_norm": 7.078125, + "learning_rate": 8.489618094178717e-06, + "loss": 2.7449, + "mean_token_accuracy": 0.49389875558777335, + "step": 8147 + }, + { + "epoch": 1.5105672969966628, + "grad_norm": 5.21875, + "learning_rate": 8.489432703003337e-06, + "loss": 2.9781, + "mean_token_accuracy": 0.46951478285471776, + "step": 8148 + }, + { + "epoch": 1.510752688172043, + "grad_norm": 8.421875, + "learning_rate": 8.489247311827958e-06, + "loss": 2.8853, + "mean_token_accuracy": 0.46274311694872444, + "step": 8149 + }, + { + "epoch": 1.5109380793474232, + "grad_norm": 8.0625, + "learning_rate": 8.489061920652578e-06, + "loss": 3.4396, + "mean_token_accuracy": 0.4183177570093458, + "step": 8150 + }, + { + "epoch": 1.5111234705228032, + "grad_norm": 6.5546875, + "learning_rate": 8.488876529477197e-06, + "loss": 2.9917, + "mean_token_accuracy": 0.4447632711621234, + "step": 8151 + }, + { + "epoch": 1.5113088616981831, + "grad_norm": 8.9453125, + "learning_rate": 8.488691138301818e-06, + "loss": 2.9226, + "mean_token_accuracy": 0.4702569517775431, + "step": 8152 + }, + { + "epoch": 1.5114942528735633, + "grad_norm": 7.609375, + "learning_rate": 8.488505747126438e-06, + "loss": 2.6935, + "mean_token_accuracy": 0.4769170579029734, + "step": 8153 + }, + { + "epoch": 1.5116796440489433, + "grad_norm": 5.19140625, + "learning_rate": 8.488320355951057e-06, + "loss": 3.0485, + "mean_token_accuracy": 0.45320447609359105, + "step": 8154 + }, + { + "epoch": 1.5118650352243233, + "grad_norm": 5.04296875, + "learning_rate": 8.488134964775677e-06, + "loss": 2.66, + "mean_token_accuracy": 0.47689503591722954, + "step": 8155 + }, + { + "epoch": 1.5120504263997034, + "grad_norm": 8.1953125, + "learning_rate": 8.487949573600296e-06, + "loss": 3.149, + "mean_token_accuracy": 0.4368541135418065, + "step": 8156 + }, + { + "epoch": 1.5122358175750834, + "grad_norm": 6.58984375, + "learning_rate": 8.487764182424917e-06, + "loss": 2.7056, + "mean_token_accuracy": 0.46638005159071366, + "step": 8157 + }, + { + "epoch": 1.5124212087504634, + "grad_norm": 6.0546875, + "learning_rate": 8.487578791249537e-06, + "loss": 2.593, + "mean_token_accuracy": 0.4851537645811241, + "step": 8158 + }, + { + "epoch": 1.5126065999258436, + "grad_norm": 6.17578125, + "learning_rate": 8.487393400074158e-06, + "loss": 2.7058, + "mean_token_accuracy": 0.47496871088861076, + "step": 8159 + }, + { + "epoch": 1.5127919911012235, + "grad_norm": 7.16015625, + "learning_rate": 8.487208008898777e-06, + "loss": 2.5877, + "mean_token_accuracy": 0.48800599700149927, + "step": 8160 + }, + { + "epoch": 1.5129773822766035, + "grad_norm": 6.6796875, + "learning_rate": 8.487022617723397e-06, + "loss": 3.1071, + "mean_token_accuracy": 0.4526328590538733, + "step": 8161 + }, + { + "epoch": 1.5131627734519837, + "grad_norm": 6.37890625, + "learning_rate": 8.486837226548018e-06, + "loss": 2.2745, + "mean_token_accuracy": 0.5206062217495347, + "step": 8162 + }, + { + "epoch": 1.5133481646273639, + "grad_norm": 6.54296875, + "learning_rate": 8.486651835372636e-06, + "loss": 2.7022, + "mean_token_accuracy": 0.48041566746602715, + "step": 8163 + }, + { + "epoch": 1.5135335558027436, + "grad_norm": 6.19921875, + "learning_rate": 8.486466444197257e-06, + "loss": 2.9882, + "mean_token_accuracy": 0.45608540925266905, + "step": 8164 + }, + { + "epoch": 1.5137189469781238, + "grad_norm": 6.90625, + "learning_rate": 8.486281053021876e-06, + "loss": 3.0707, + "mean_token_accuracy": 0.4313310069790628, + "step": 8165 + }, + { + "epoch": 1.513904338153504, + "grad_norm": 6.47265625, + "learning_rate": 8.486095661846496e-06, + "loss": 3.4558, + "mean_token_accuracy": 0.45211658570437196, + "step": 8166 + }, + { + "epoch": 1.514089729328884, + "grad_norm": 7.09375, + "learning_rate": 8.485910270671117e-06, + "loss": 3.2208, + "mean_token_accuracy": 0.4428692940175787, + "step": 8167 + }, + { + "epoch": 1.514275120504264, + "grad_norm": 6.1015625, + "learning_rate": 8.485724879495737e-06, + "loss": 2.9897, + "mean_token_accuracy": 0.46046119827313886, + "step": 8168 + }, + { + "epoch": 1.5144605116796441, + "grad_norm": 5.8515625, + "learning_rate": 8.485539488320358e-06, + "loss": 2.7912, + "mean_token_accuracy": 0.46390822376901264, + "step": 8169 + }, + { + "epoch": 1.514645902855024, + "grad_norm": 5.8125, + "learning_rate": 8.485354097144976e-06, + "loss": 3.1433, + "mean_token_accuracy": 0.43471267729200175, + "step": 8170 + }, + { + "epoch": 1.514831294030404, + "grad_norm": 7.94140625, + "learning_rate": 8.485168705969597e-06, + "loss": 3.0334, + "mean_token_accuracy": 0.4529342723004695, + "step": 8171 + }, + { + "epoch": 1.5150166852057843, + "grad_norm": 7.06640625, + "learning_rate": 8.484983314794216e-06, + "loss": 2.5931, + "mean_token_accuracy": 0.48040504997369804, + "step": 8172 + }, + { + "epoch": 1.5152020763811642, + "grad_norm": 7.8515625, + "learning_rate": 8.484797923618836e-06, + "loss": 2.8142, + "mean_token_accuracy": 0.4424185547660544, + "step": 8173 + }, + { + "epoch": 1.5153874675565442, + "grad_norm": 5.59375, + "learning_rate": 8.484612532443457e-06, + "loss": 3.3718, + "mean_token_accuracy": 0.4184770682466535, + "step": 8174 + }, + { + "epoch": 1.5155728587319244, + "grad_norm": 6.0078125, + "learning_rate": 8.484427141268077e-06, + "loss": 2.9492, + "mean_token_accuracy": 0.4616157267900703, + "step": 8175 + }, + { + "epoch": 1.5157582499073046, + "grad_norm": 6.73828125, + "learning_rate": 8.484241750092696e-06, + "loss": 2.7291, + "mean_token_accuracy": 0.47106109324758844, + "step": 8176 + }, + { + "epoch": 1.5159436410826843, + "grad_norm": 7.39453125, + "learning_rate": 8.484056358917316e-06, + "loss": 3.4673, + "mean_token_accuracy": 0.42366412213740456, + "step": 8177 + }, + { + "epoch": 1.5161290322580645, + "grad_norm": 6.02734375, + "learning_rate": 8.483870967741937e-06, + "loss": 2.6788, + "mean_token_accuracy": 0.4745308310991957, + "step": 8178 + }, + { + "epoch": 1.5163144234334447, + "grad_norm": 5.45703125, + "learning_rate": 8.483685576566556e-06, + "loss": 3.0684, + "mean_token_accuracy": 0.435375, + "step": 8179 + }, + { + "epoch": 1.5164998146088247, + "grad_norm": 6.51171875, + "learning_rate": 8.483500185391176e-06, + "loss": 3.0418, + "mean_token_accuracy": 0.4603131749460043, + "step": 8180 + }, + { + "epoch": 1.5166852057842046, + "grad_norm": 5.71484375, + "learning_rate": 8.483314794215795e-06, + "loss": 3.086, + "mean_token_accuracy": 0.4425393883225209, + "step": 8181 + }, + { + "epoch": 1.5168705969595848, + "grad_norm": 6.23046875, + "learning_rate": 8.483129403040416e-06, + "loss": 2.984, + "mean_token_accuracy": 0.4871883258646444, + "step": 8182 + }, + { + "epoch": 1.5170559881349648, + "grad_norm": 6.4921875, + "learning_rate": 8.482944011865036e-06, + "loss": 2.9324, + "mean_token_accuracy": 0.4427722772277228, + "step": 8183 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 7.359375, + "learning_rate": 8.482758620689656e-06, + "loss": 2.8921, + "mean_token_accuracy": 0.46385266529398306, + "step": 8184 + }, + { + "epoch": 1.517426770485725, + "grad_norm": 5.83203125, + "learning_rate": 8.482573229514275e-06, + "loss": 3.2658, + "mean_token_accuracy": 0.4217965653896962, + "step": 8185 + }, + { + "epoch": 1.517612161661105, + "grad_norm": 11.703125, + "learning_rate": 8.482387838338896e-06, + "loss": 3.7565, + "mean_token_accuracy": 0.4201095809269954, + "step": 8186 + }, + { + "epoch": 1.5177975528364849, + "grad_norm": 9.4296875, + "learning_rate": 8.482202447163516e-06, + "loss": 2.9116, + "mean_token_accuracy": 0.45659928656361476, + "step": 8187 + }, + { + "epoch": 1.517982944011865, + "grad_norm": 8.703125, + "learning_rate": 8.482017055988135e-06, + "loss": 2.8034, + "mean_token_accuracy": 0.4729808932129879, + "step": 8188 + }, + { + "epoch": 1.518168335187245, + "grad_norm": 5.02734375, + "learning_rate": 8.481831664812756e-06, + "loss": 2.5335, + "mean_token_accuracy": 0.5016320820704127, + "step": 8189 + }, + { + "epoch": 1.518353726362625, + "grad_norm": 7.20703125, + "learning_rate": 8.481646273637374e-06, + "loss": 2.9734, + "mean_token_accuracy": 0.47483431846677415, + "step": 8190 + }, + { + "epoch": 1.5185391175380052, + "grad_norm": 6.12890625, + "learning_rate": 8.481460882461997e-06, + "loss": 3.1681, + "mean_token_accuracy": 0.47367218732153055, + "step": 8191 + }, + { + "epoch": 1.5187245087133854, + "grad_norm": 6.1171875, + "learning_rate": 8.481275491286615e-06, + "loss": 2.5867, + "mean_token_accuracy": 0.5146371213661314, + "step": 8192 + }, + { + "epoch": 1.5189098998887653, + "grad_norm": 5.5703125, + "learning_rate": 8.481090100111236e-06, + "loss": 2.6973, + "mean_token_accuracy": 0.47012665198237885, + "step": 8193 + }, + { + "epoch": 1.5190952910641453, + "grad_norm": 9.3671875, + "learning_rate": 8.480904708935855e-06, + "loss": 2.919, + "mean_token_accuracy": 0.44632833085118573, + "step": 8194 + }, + { + "epoch": 1.5192806822395255, + "grad_norm": 6.84375, + "learning_rate": 8.480719317760475e-06, + "loss": 2.5664, + "mean_token_accuracy": 0.48267352185089973, + "step": 8195 + }, + { + "epoch": 1.5194660734149055, + "grad_norm": 5.50390625, + "learning_rate": 8.480533926585096e-06, + "loss": 3.0994, + "mean_token_accuracy": 0.4313366466126079, + "step": 8196 + }, + { + "epoch": 1.5196514645902854, + "grad_norm": 5.2578125, + "learning_rate": 8.480348535409714e-06, + "loss": 2.7937, + "mean_token_accuracy": 0.46343849351919786, + "step": 8197 + }, + { + "epoch": 1.5198368557656656, + "grad_norm": 5.97265625, + "learning_rate": 8.480163144234335e-06, + "loss": 2.4882, + "mean_token_accuracy": 0.4899538106235566, + "step": 8198 + }, + { + "epoch": 1.5200222469410456, + "grad_norm": 6.90625, + "learning_rate": 8.479977753058955e-06, + "loss": 2.4123, + "mean_token_accuracy": 0.5082036180058898, + "step": 8199 + }, + { + "epoch": 1.5202076381164256, + "grad_norm": 6.65234375, + "learning_rate": 8.479792361883576e-06, + "loss": 2.4942, + "mean_token_accuracy": 0.4986691717551276, + "step": 8200 + }, + { + "epoch": 1.5203930292918058, + "grad_norm": 9.9921875, + "learning_rate": 8.479606970708195e-06, + "loss": 3.6525, + "mean_token_accuracy": 0.3978559262598938, + "step": 8201 + }, + { + "epoch": 1.5205784204671857, + "grad_norm": 8.375, + "learning_rate": 8.479421579532815e-06, + "loss": 3.102, + "mean_token_accuracy": 0.4873556497588072, + "step": 8202 + }, + { + "epoch": 1.5207638116425657, + "grad_norm": 7.8984375, + "learning_rate": 8.479236188357434e-06, + "loss": 2.454, + "mean_token_accuracy": 0.5201238390092879, + "step": 8203 + }, + { + "epoch": 1.5209492028179459, + "grad_norm": 6.47265625, + "learning_rate": 8.479050797182054e-06, + "loss": 2.9256, + "mean_token_accuracy": 0.4726725003586286, + "step": 8204 + }, + { + "epoch": 1.521134593993326, + "grad_norm": 11.140625, + "learning_rate": 8.478865406006675e-06, + "loss": 4.4489, + "mean_token_accuracy": 0.39100684261974583, + "step": 8205 + }, + { + "epoch": 1.5213199851687058, + "grad_norm": 8.0234375, + "learning_rate": 8.478680014831294e-06, + "loss": 3.802, + "mean_token_accuracy": 0.41375336150595465, + "step": 8206 + }, + { + "epoch": 1.521505376344086, + "grad_norm": 7.92578125, + "learning_rate": 8.478494623655916e-06, + "loss": 2.3995, + "mean_token_accuracy": 0.5010091416359966, + "step": 8207 + }, + { + "epoch": 1.5216907675194662, + "grad_norm": 6.01953125, + "learning_rate": 8.478309232480535e-06, + "loss": 2.7623, + "mean_token_accuracy": 0.46704826732673266, + "step": 8208 + }, + { + "epoch": 1.5218761586948462, + "grad_norm": 11.9453125, + "learning_rate": 8.478123841305155e-06, + "loss": 2.9639, + "mean_token_accuracy": 0.4564993564993565, + "step": 8209 + }, + { + "epoch": 1.5220615498702261, + "grad_norm": 8.640625, + "learning_rate": 8.477938450129774e-06, + "loss": 2.3315, + "mean_token_accuracy": 0.5261520225218551, + "step": 8210 + }, + { + "epoch": 1.5222469410456063, + "grad_norm": 6.0859375, + "learning_rate": 8.477753058954395e-06, + "loss": 2.5042, + "mean_token_accuracy": 0.5022047431772136, + "step": 8211 + }, + { + "epoch": 1.5224323322209863, + "grad_norm": 8.65625, + "learning_rate": 8.477567667779015e-06, + "loss": 2.6928, + "mean_token_accuracy": 0.45893719806763283, + "step": 8212 + }, + { + "epoch": 1.5226177233963663, + "grad_norm": 8.0078125, + "learning_rate": 8.477382276603634e-06, + "loss": 3.128, + "mean_token_accuracy": 0.44004477403106196, + "step": 8213 + }, + { + "epoch": 1.5228031145717464, + "grad_norm": 11.3046875, + "learning_rate": 8.477196885428254e-06, + "loss": 2.6875, + "mean_token_accuracy": 0.4664804469273743, + "step": 8214 + }, + { + "epoch": 1.5229885057471264, + "grad_norm": 6.04296875, + "learning_rate": 8.477011494252875e-06, + "loss": 2.8386, + "mean_token_accuracy": 0.4670073650984518, + "step": 8215 + }, + { + "epoch": 1.5231738969225064, + "grad_norm": 8.046875, + "learning_rate": 8.476826103077495e-06, + "loss": 2.7261, + "mean_token_accuracy": 0.4602086438152012, + "step": 8216 + }, + { + "epoch": 1.5233592880978866, + "grad_norm": 9.1640625, + "learning_rate": 8.476640711902114e-06, + "loss": 2.6466, + "mean_token_accuracy": 0.46522430020779854, + "step": 8217 + }, + { + "epoch": 1.5235446792732668, + "grad_norm": 6.7578125, + "learning_rate": 8.476455320726735e-06, + "loss": 2.894, + "mean_token_accuracy": 0.4687819856704197, + "step": 8218 + }, + { + "epoch": 1.5237300704486465, + "grad_norm": 5.5, + "learning_rate": 8.476269929551353e-06, + "loss": 2.8042, + "mean_token_accuracy": 0.4664469742360695, + "step": 8219 + }, + { + "epoch": 1.5239154616240267, + "grad_norm": 8.4296875, + "learning_rate": 8.476084538375974e-06, + "loss": 2.5862, + "mean_token_accuracy": 0.49321941958231624, + "step": 8220 + }, + { + "epoch": 1.5241008527994069, + "grad_norm": 6.05859375, + "learning_rate": 8.475899147200594e-06, + "loss": 3.2337, + "mean_token_accuracy": 0.4468392692863714, + "step": 8221 + }, + { + "epoch": 1.5242862439747868, + "grad_norm": 6.3828125, + "learning_rate": 8.475713756025213e-06, + "loss": 2.5924, + "mean_token_accuracy": 0.49598416947968804, + "step": 8222 + }, + { + "epoch": 1.5244716351501668, + "grad_norm": 6.36328125, + "learning_rate": 8.475528364849834e-06, + "loss": 2.7448, + "mean_token_accuracy": 0.4636542239685658, + "step": 8223 + }, + { + "epoch": 1.524657026325547, + "grad_norm": 8.671875, + "learning_rate": 8.475342973674454e-06, + "loss": 2.7723, + "mean_token_accuracy": 0.47716150081566067, + "step": 8224 + }, + { + "epoch": 1.524842417500927, + "grad_norm": 6.21484375, + "learning_rate": 8.475157582499075e-06, + "loss": 2.6926, + "mean_token_accuracy": 0.4734565473849784, + "step": 8225 + }, + { + "epoch": 1.525027808676307, + "grad_norm": 6.46484375, + "learning_rate": 8.474972191323693e-06, + "loss": 2.5611, + "mean_token_accuracy": 0.48408057179987, + "step": 8226 + }, + { + "epoch": 1.5252131998516871, + "grad_norm": 7.5703125, + "learning_rate": 8.474786800148314e-06, + "loss": 2.5052, + "mean_token_accuracy": 0.49529395716818986, + "step": 8227 + }, + { + "epoch": 1.525398591027067, + "grad_norm": 10.7109375, + "learning_rate": 8.474601408972933e-06, + "loss": 3.8109, + "mean_token_accuracy": 0.3927765237020316, + "step": 8228 + }, + { + "epoch": 1.525583982202447, + "grad_norm": 10.2109375, + "learning_rate": 8.474416017797553e-06, + "loss": 2.8466, + "mean_token_accuracy": 0.44885297936805657, + "step": 8229 + }, + { + "epoch": 1.5257693733778273, + "grad_norm": 5.04296875, + "learning_rate": 8.474230626622174e-06, + "loss": 2.5406, + "mean_token_accuracy": 0.48607120549656463, + "step": 8230 + }, + { + "epoch": 1.5259547645532072, + "grad_norm": 8.25, + "learning_rate": 8.474045235446794e-06, + "loss": 3.0445, + "mean_token_accuracy": 0.4446354038792045, + "step": 8231 + }, + { + "epoch": 1.5261401557285872, + "grad_norm": 8.25, + "learning_rate": 8.473859844271413e-06, + "loss": 3.0869, + "mean_token_accuracy": 0.45315225517069524, + "step": 8232 + }, + { + "epoch": 1.5263255469039674, + "grad_norm": 8.09375, + "learning_rate": 8.473674453096033e-06, + "loss": 3.0268, + "mean_token_accuracy": 0.46030136192408, + "step": 8233 + }, + { + "epoch": 1.5265109380793476, + "grad_norm": 6.11328125, + "learning_rate": 8.473489061920654e-06, + "loss": 3.1729, + "mean_token_accuracy": 0.4381590373402804, + "step": 8234 + }, + { + "epoch": 1.5266963292547273, + "grad_norm": 9.2265625, + "learning_rate": 8.473303670745273e-06, + "loss": 2.9986, + "mean_token_accuracy": 0.44305724725943973, + "step": 8235 + }, + { + "epoch": 1.5268817204301075, + "grad_norm": 7.5390625, + "learning_rate": 8.473118279569893e-06, + "loss": 3.2708, + "mean_token_accuracy": 0.4195917351257157, + "step": 8236 + }, + { + "epoch": 1.5270671116054877, + "grad_norm": 7.7890625, + "learning_rate": 8.472932888394512e-06, + "loss": 2.9537, + "mean_token_accuracy": 0.4593348239239799, + "step": 8237 + }, + { + "epoch": 1.5272525027808677, + "grad_norm": 8.34375, + "learning_rate": 8.472747497219133e-06, + "loss": 2.5839, + "mean_token_accuracy": 0.4740268735281895, + "step": 8238 + }, + { + "epoch": 1.5274378939562476, + "grad_norm": 8.671875, + "learning_rate": 8.472562106043753e-06, + "loss": 2.4349, + "mean_token_accuracy": 0.5113572787288799, + "step": 8239 + }, + { + "epoch": 1.5276232851316278, + "grad_norm": 12.2734375, + "learning_rate": 8.472376714868374e-06, + "loss": 2.6082, + "mean_token_accuracy": 0.4818398541579021, + "step": 8240 + }, + { + "epoch": 1.5278086763070078, + "grad_norm": 5.94921875, + "learning_rate": 8.472191323692992e-06, + "loss": 2.7817, + "mean_token_accuracy": 0.47315096251266464, + "step": 8241 + }, + { + "epoch": 1.5279940674823878, + "grad_norm": 7.83984375, + "learning_rate": 8.472005932517613e-06, + "loss": 3.148, + "mean_token_accuracy": 0.43857975622681505, + "step": 8242 + }, + { + "epoch": 1.528179458657768, + "grad_norm": 8.640625, + "learning_rate": 8.471820541342233e-06, + "loss": 2.7277, + "mean_token_accuracy": 0.46743564495709666, + "step": 8243 + }, + { + "epoch": 1.528364849833148, + "grad_norm": 7.328125, + "learning_rate": 8.471635150166852e-06, + "loss": 2.6818, + "mean_token_accuracy": 0.4806254248810333, + "step": 8244 + }, + { + "epoch": 1.5285502410085279, + "grad_norm": 6.09375, + "learning_rate": 8.471449758991473e-06, + "loss": 2.3158, + "mean_token_accuracy": 0.5528224534847052, + "step": 8245 + }, + { + "epoch": 1.528735632183908, + "grad_norm": 7.34765625, + "learning_rate": 8.471264367816091e-06, + "loss": 2.6678, + "mean_token_accuracy": 0.4762285897230671, + "step": 8246 + }, + { + "epoch": 1.5289210233592883, + "grad_norm": 7.03125, + "learning_rate": 8.471078976640714e-06, + "loss": 3.0301, + "mean_token_accuracy": 0.45836650214933927, + "step": 8247 + }, + { + "epoch": 1.529106414534668, + "grad_norm": 6.44140625, + "learning_rate": 8.470893585465332e-06, + "loss": 2.9204, + "mean_token_accuracy": 0.4623800959232614, + "step": 8248 + }, + { + "epoch": 1.5292918057100482, + "grad_norm": 6.7578125, + "learning_rate": 8.470708194289953e-06, + "loss": 3.1226, + "mean_token_accuracy": 0.4064873214055863, + "step": 8249 + }, + { + "epoch": 1.5294771968854284, + "grad_norm": 6.96875, + "learning_rate": 8.470522803114573e-06, + "loss": 3.1172, + "mean_token_accuracy": 0.45636420919974796, + "step": 8250 + }, + { + "epoch": 1.5296625880608083, + "grad_norm": 11.5, + "learning_rate": 8.470337411939192e-06, + "loss": 2.2962, + "mean_token_accuracy": 0.5254882489241973, + "step": 8251 + }, + { + "epoch": 1.5298479792361883, + "grad_norm": 6.4609375, + "learning_rate": 8.470152020763813e-06, + "loss": 2.5618, + "mean_token_accuracy": 0.4758269720101781, + "step": 8252 + }, + { + "epoch": 1.5300333704115685, + "grad_norm": 5.29296875, + "learning_rate": 8.469966629588431e-06, + "loss": 3.3506, + "mean_token_accuracy": 0.4041125079906243, + "step": 8253 + }, + { + "epoch": 1.5302187615869485, + "grad_norm": 8.9765625, + "learning_rate": 8.469781238413052e-06, + "loss": 2.6715, + "mean_token_accuracy": 0.5057456254896839, + "step": 8254 + }, + { + "epoch": 1.5304041527623284, + "grad_norm": 8.7734375, + "learning_rate": 8.469595847237672e-06, + "loss": 2.688, + "mean_token_accuracy": 0.4634875668870003, + "step": 8255 + }, + { + "epoch": 1.5305895439377086, + "grad_norm": 7.48046875, + "learning_rate": 8.469410456062293e-06, + "loss": 3.4164, + "mean_token_accuracy": 0.4211187055377319, + "step": 8256 + }, + { + "epoch": 1.5307749351130886, + "grad_norm": 8.0859375, + "learning_rate": 8.469225064886912e-06, + "loss": 2.6893, + "mean_token_accuracy": 0.5069643386347921, + "step": 8257 + }, + { + "epoch": 1.5309603262884686, + "grad_norm": 6.37109375, + "learning_rate": 8.469039673711532e-06, + "loss": 2.5801, + "mean_token_accuracy": 0.48939929328621906, + "step": 8258 + }, + { + "epoch": 1.5311457174638488, + "grad_norm": 8.140625, + "learning_rate": 8.468854282536153e-06, + "loss": 2.9098, + "mean_token_accuracy": 0.44330937066464876, + "step": 8259 + }, + { + "epoch": 1.5313311086392287, + "grad_norm": 8.0390625, + "learning_rate": 8.468668891360771e-06, + "loss": 3.0435, + "mean_token_accuracy": 0.44524761611811753, + "step": 8260 + }, + { + "epoch": 1.5315164998146087, + "grad_norm": 7.33203125, + "learning_rate": 8.468483500185392e-06, + "loss": 3.027, + "mean_token_accuracy": 0.4457318861836316, + "step": 8261 + }, + { + "epoch": 1.5317018909899889, + "grad_norm": 6.203125, + "learning_rate": 8.46829810901001e-06, + "loss": 2.6156, + "mean_token_accuracy": 0.48277439024390245, + "step": 8262 + }, + { + "epoch": 1.531887282165369, + "grad_norm": 7.69921875, + "learning_rate": 8.468112717834633e-06, + "loss": 3.8167, + "mean_token_accuracy": 0.3946355065728323, + "step": 8263 + }, + { + "epoch": 1.5320726733407488, + "grad_norm": 7.703125, + "learning_rate": 8.467927326659252e-06, + "loss": 2.8456, + "mean_token_accuracy": 0.4896875, + "step": 8264 + }, + { + "epoch": 1.532258064516129, + "grad_norm": 6.40625, + "learning_rate": 8.467741935483872e-06, + "loss": 2.7337, + "mean_token_accuracy": 0.4832987054498961, + "step": 8265 + }, + { + "epoch": 1.5324434556915092, + "grad_norm": 5.45703125, + "learning_rate": 8.467556544308491e-06, + "loss": 2.6817, + "mean_token_accuracy": 0.4656932703919201, + "step": 8266 + }, + { + "epoch": 1.5326288468668892, + "grad_norm": 8.859375, + "learning_rate": 8.467371153133112e-06, + "loss": 3.1206, + "mean_token_accuracy": 0.4492307692307692, + "step": 8267 + }, + { + "epoch": 1.5328142380422691, + "grad_norm": 7.75390625, + "learning_rate": 8.467185761957732e-06, + "loss": 3.2165, + "mean_token_accuracy": 0.43408328399447405, + "step": 8268 + }, + { + "epoch": 1.5329996292176493, + "grad_norm": 6.921875, + "learning_rate": 8.46700037078235e-06, + "loss": 2.4233, + "mean_token_accuracy": 0.5070378803560339, + "step": 8269 + }, + { + "epoch": 1.5331850203930293, + "grad_norm": 7.64453125, + "learning_rate": 8.466814979606971e-06, + "loss": 2.0224, + "mean_token_accuracy": 0.5635934462649264, + "step": 8270 + }, + { + "epoch": 1.5333704115684093, + "grad_norm": 4.94921875, + "learning_rate": 8.466629588431592e-06, + "loss": 2.9274, + "mean_token_accuracy": 0.4729085392284352, + "step": 8271 + }, + { + "epoch": 1.5335558027437894, + "grad_norm": 6.6640625, + "learning_rate": 8.466444197256212e-06, + "loss": 2.5367, + "mean_token_accuracy": 0.4700025859839669, + "step": 8272 + }, + { + "epoch": 1.5337411939191694, + "grad_norm": 6.97265625, + "learning_rate": 8.466258806080831e-06, + "loss": 3.4702, + "mean_token_accuracy": 0.40429136081309996, + "step": 8273 + }, + { + "epoch": 1.5339265850945494, + "grad_norm": 6.33203125, + "learning_rate": 8.466073414905452e-06, + "loss": 2.5163, + "mean_token_accuracy": 0.5131054475178162, + "step": 8274 + }, + { + "epoch": 1.5341119762699296, + "grad_norm": 7.80078125, + "learning_rate": 8.46588802373007e-06, + "loss": 2.9597, + "mean_token_accuracy": 0.45412780175414175, + "step": 8275 + }, + { + "epoch": 1.5342973674453098, + "grad_norm": 7.81640625, + "learning_rate": 8.465702632554691e-06, + "loss": 2.2468, + "mean_token_accuracy": 0.5265990737263737, + "step": 8276 + }, + { + "epoch": 1.5344827586206895, + "grad_norm": 6.1328125, + "learning_rate": 8.465517241379311e-06, + "loss": 2.8233, + "mean_token_accuracy": 0.45545023696682463, + "step": 8277 + }, + { + "epoch": 1.5346681497960697, + "grad_norm": 5.6953125, + "learning_rate": 8.46533185020393e-06, + "loss": 2.7845, + "mean_token_accuracy": 0.47294324681038274, + "step": 8278 + }, + { + "epoch": 1.5348535409714499, + "grad_norm": 7.09765625, + "learning_rate": 8.46514645902855e-06, + "loss": 2.7992, + "mean_token_accuracy": 0.4610792461664521, + "step": 8279 + }, + { + "epoch": 1.5350389321468298, + "grad_norm": 6.88671875, + "learning_rate": 8.464961067853171e-06, + "loss": 2.8371, + "mean_token_accuracy": 0.46764243377283177, + "step": 8280 + }, + { + "epoch": 1.5352243233222098, + "grad_norm": 5.50390625, + "learning_rate": 8.464775676677792e-06, + "loss": 2.905, + "mean_token_accuracy": 0.4720666379434152, + "step": 8281 + }, + { + "epoch": 1.53540971449759, + "grad_norm": 5.1171875, + "learning_rate": 8.46459028550241e-06, + "loss": 2.572, + "mean_token_accuracy": 0.48058017727639, + "step": 8282 + }, + { + "epoch": 1.53559510567297, + "grad_norm": 5.16796875, + "learning_rate": 8.464404894327031e-06, + "loss": 3.1354, + "mean_token_accuracy": 0.4449255751014885, + "step": 8283 + }, + { + "epoch": 1.53578049684835, + "grad_norm": 6.1015625, + "learning_rate": 8.46421950315165e-06, + "loss": 2.8049, + "mean_token_accuracy": 0.46330413772274237, + "step": 8284 + }, + { + "epoch": 1.5359658880237301, + "grad_norm": 5.8984375, + "learning_rate": 8.46403411197627e-06, + "loss": 2.4932, + "mean_token_accuracy": 0.5093723545773371, + "step": 8285 + }, + { + "epoch": 1.53615127919911, + "grad_norm": 11.7578125, + "learning_rate": 8.46384872080089e-06, + "loss": 2.5753, + "mean_token_accuracy": 0.5040721714070918, + "step": 8286 + }, + { + "epoch": 1.53633667037449, + "grad_norm": 5.97265625, + "learning_rate": 8.463663329625511e-06, + "loss": 2.7301, + "mean_token_accuracy": 0.5111734859116916, + "step": 8287 + }, + { + "epoch": 1.5365220615498703, + "grad_norm": 7.0703125, + "learning_rate": 8.463477938450132e-06, + "loss": 2.8774, + "mean_token_accuracy": 0.4550398381181232, + "step": 8288 + }, + { + "epoch": 1.5367074527252504, + "grad_norm": 5.91015625, + "learning_rate": 8.46329254727475e-06, + "loss": 3.1831, + "mean_token_accuracy": 0.4555691827405508, + "step": 8289 + }, + { + "epoch": 1.5368928439006302, + "grad_norm": 6.08984375, + "learning_rate": 8.463107156099371e-06, + "loss": 2.9793, + "mean_token_accuracy": 0.474928744853795, + "step": 8290 + }, + { + "epoch": 1.5370782350760104, + "grad_norm": 5.79296875, + "learning_rate": 8.46292176492399e-06, + "loss": 2.4181, + "mean_token_accuracy": 0.513204765999263, + "step": 8291 + }, + { + "epoch": 1.5372636262513906, + "grad_norm": 6.01171875, + "learning_rate": 8.46273637374861e-06, + "loss": 2.4434, + "mean_token_accuracy": 0.4942728722633029, + "step": 8292 + }, + { + "epoch": 1.5374490174267705, + "grad_norm": 6.77734375, + "learning_rate": 8.46255098257323e-06, + "loss": 2.8601, + "mean_token_accuracy": 0.4796938456732271, + "step": 8293 + }, + { + "epoch": 1.5376344086021505, + "grad_norm": 5.87109375, + "learning_rate": 8.46236559139785e-06, + "loss": 2.6536, + "mean_token_accuracy": 0.4689096158116811, + "step": 8294 + }, + { + "epoch": 1.5378197997775307, + "grad_norm": 7.00390625, + "learning_rate": 8.46218020022247e-06, + "loss": 2.818, + "mean_token_accuracy": 0.4736763386180078, + "step": 8295 + }, + { + "epoch": 1.5380051909529107, + "grad_norm": 5.82421875, + "learning_rate": 8.46199480904709e-06, + "loss": 2.36, + "mean_token_accuracy": 0.5431269674711438, + "step": 8296 + }, + { + "epoch": 1.5381905821282906, + "grad_norm": 6.0859375, + "learning_rate": 8.461809417871711e-06, + "loss": 2.8016, + "mean_token_accuracy": 0.47885878489326766, + "step": 8297 + }, + { + "epoch": 1.5383759733036708, + "grad_norm": 5.9453125, + "learning_rate": 8.46162402669633e-06, + "loss": 3.0624, + "mean_token_accuracy": 0.4649161845423528, + "step": 8298 + }, + { + "epoch": 1.5385613644790508, + "grad_norm": 7.6015625, + "learning_rate": 8.46143863552095e-06, + "loss": 3.2099, + "mean_token_accuracy": 0.4384648066846056, + "step": 8299 + }, + { + "epoch": 1.5387467556544308, + "grad_norm": 6.37109375, + "learning_rate": 8.461253244345569e-06, + "loss": 2.5515, + "mean_token_accuracy": 0.5203161836083761, + "step": 8300 + }, + { + "epoch": 1.538932146829811, + "grad_norm": 5.51171875, + "learning_rate": 8.46106785317019e-06, + "loss": 2.7736, + "mean_token_accuracy": 0.4962667994026879, + "step": 8301 + }, + { + "epoch": 1.539117538005191, + "grad_norm": 6.6640625, + "learning_rate": 8.46088246199481e-06, + "loss": 2.4737, + "mean_token_accuracy": 0.47905686546463244, + "step": 8302 + }, + { + "epoch": 1.5393029291805709, + "grad_norm": 7.1171875, + "learning_rate": 8.460697070819429e-06, + "loss": 2.9505, + "mean_token_accuracy": 0.47694981089645305, + "step": 8303 + }, + { + "epoch": 1.539488320355951, + "grad_norm": 5.6640625, + "learning_rate": 8.46051167964405e-06, + "loss": 2.626, + "mean_token_accuracy": 0.5056267409470752, + "step": 8304 + }, + { + "epoch": 1.5396737115313313, + "grad_norm": 8.4765625, + "learning_rate": 8.46032628846867e-06, + "loss": 2.3865, + "mean_token_accuracy": 0.4964625850340136, + "step": 8305 + }, + { + "epoch": 1.539859102706711, + "grad_norm": 6.46484375, + "learning_rate": 8.46014089729329e-06, + "loss": 2.8396, + "mean_token_accuracy": 0.4595394313649813, + "step": 8306 + }, + { + "epoch": 1.5400444938820912, + "grad_norm": 5.87109375, + "learning_rate": 8.45995550611791e-06, + "loss": 3.5105, + "mean_token_accuracy": 0.4338784216139353, + "step": 8307 + }, + { + "epoch": 1.5402298850574714, + "grad_norm": 8.9765625, + "learning_rate": 8.45977011494253e-06, + "loss": 2.8756, + "mean_token_accuracy": 0.45482734319943624, + "step": 8308 + }, + { + "epoch": 1.5404152762328513, + "grad_norm": 5.9296875, + "learning_rate": 8.459584723767148e-06, + "loss": 3.0344, + "mean_token_accuracy": 0.4293410692084542, + "step": 8309 + }, + { + "epoch": 1.5406006674082313, + "grad_norm": 6.875, + "learning_rate": 8.459399332591769e-06, + "loss": 2.5309, + "mean_token_accuracy": 0.49761677788369874, + "step": 8310 + }, + { + "epoch": 1.5407860585836115, + "grad_norm": 11.6953125, + "learning_rate": 8.45921394141639e-06, + "loss": 2.5846, + "mean_token_accuracy": 0.4719591226321037, + "step": 8311 + }, + { + "epoch": 1.5409714497589915, + "grad_norm": 9.4609375, + "learning_rate": 8.45902855024101e-06, + "loss": 2.7333, + "mean_token_accuracy": 0.46911827346301294, + "step": 8312 + }, + { + "epoch": 1.5411568409343714, + "grad_norm": 6.08984375, + "learning_rate": 8.458843159065629e-06, + "loss": 3.1097, + "mean_token_accuracy": 0.4497566561694818, + "step": 8313 + }, + { + "epoch": 1.5413422321097516, + "grad_norm": 10.7109375, + "learning_rate": 8.45865776789025e-06, + "loss": 2.3564, + "mean_token_accuracy": 0.5270201592410638, + "step": 8314 + }, + { + "epoch": 1.5415276232851316, + "grad_norm": 10.125, + "learning_rate": 8.45847237671487e-06, + "loss": 2.9697, + "mean_token_accuracy": 0.45194841214810705, + "step": 8315 + }, + { + "epoch": 1.5417130144605116, + "grad_norm": 7.3046875, + "learning_rate": 8.458286985539489e-06, + "loss": 2.881, + "mean_token_accuracy": 0.4797864225781846, + "step": 8316 + }, + { + "epoch": 1.5418984056358918, + "grad_norm": 7.12109375, + "learning_rate": 8.458101594364109e-06, + "loss": 3.1344, + "mean_token_accuracy": 0.443884620229248, + "step": 8317 + }, + { + "epoch": 1.542083796811272, + "grad_norm": 11.375, + "learning_rate": 8.457916203188728e-06, + "loss": 2.8157, + "mean_token_accuracy": 0.45394381415451107, + "step": 8318 + }, + { + "epoch": 1.5422691879866517, + "grad_norm": 5.71484375, + "learning_rate": 8.457730812013348e-06, + "loss": 2.49, + "mean_token_accuracy": 0.5247035573122529, + "step": 8319 + }, + { + "epoch": 1.5424545791620319, + "grad_norm": 6.125, + "learning_rate": 8.457545420837969e-06, + "loss": 2.6034, + "mean_token_accuracy": 0.5059982862039417, + "step": 8320 + }, + { + "epoch": 1.542639970337412, + "grad_norm": 13.046875, + "learning_rate": 8.45736002966259e-06, + "loss": 2.4745, + "mean_token_accuracy": 0.48938178386031145, + "step": 8321 + }, + { + "epoch": 1.542825361512792, + "grad_norm": 10.8203125, + "learning_rate": 8.457174638487208e-06, + "loss": 2.118, + "mean_token_accuracy": 0.5397564849126522, + "step": 8322 + }, + { + "epoch": 1.543010752688172, + "grad_norm": 6.6953125, + "learning_rate": 8.456989247311829e-06, + "loss": 2.797, + "mean_token_accuracy": 0.46263125386040765, + "step": 8323 + }, + { + "epoch": 1.5431961438635522, + "grad_norm": 6.171875, + "learning_rate": 8.456803856136449e-06, + "loss": 2.6753, + "mean_token_accuracy": 0.47368421052631576, + "step": 8324 + }, + { + "epoch": 1.5433815350389322, + "grad_norm": 7.7734375, + "learning_rate": 8.456618464961068e-06, + "loss": 2.6547, + "mean_token_accuracy": 0.5095279976546467, + "step": 8325 + }, + { + "epoch": 1.5435669262143121, + "grad_norm": 9.1484375, + "learning_rate": 8.456433073785688e-06, + "loss": 2.4895, + "mean_token_accuracy": 0.5024351717783335, + "step": 8326 + }, + { + "epoch": 1.5437523173896923, + "grad_norm": 6.2265625, + "learning_rate": 8.456247682610307e-06, + "loss": 3.1105, + "mean_token_accuracy": 0.45373891001267425, + "step": 8327 + }, + { + "epoch": 1.5439377085650723, + "grad_norm": 5.1640625, + "learning_rate": 8.45606229143493e-06, + "loss": 2.3699, + "mean_token_accuracy": 0.5329433497536946, + "step": 8328 + }, + { + "epoch": 1.5441230997404523, + "grad_norm": 7.8515625, + "learning_rate": 8.455876900259548e-06, + "loss": 3.1118, + "mean_token_accuracy": 0.4311143623506462, + "step": 8329 + }, + { + "epoch": 1.5443084909158324, + "grad_norm": 6.1640625, + "learning_rate": 8.455691509084169e-06, + "loss": 2.5738, + "mean_token_accuracy": 0.49162011173184356, + "step": 8330 + }, + { + "epoch": 1.5444938820912124, + "grad_norm": 5.453125, + "learning_rate": 8.455506117908789e-06, + "loss": 2.4346, + "mean_token_accuracy": 0.49577639751552793, + "step": 8331 + }, + { + "epoch": 1.5446792732665924, + "grad_norm": 7.59765625, + "learning_rate": 8.455320726733408e-06, + "loss": 2.9831, + "mean_token_accuracy": 0.4492063492063492, + "step": 8332 + }, + { + "epoch": 1.5448646644419726, + "grad_norm": 6.9375, + "learning_rate": 8.455135335558028e-06, + "loss": 2.2194, + "mean_token_accuracy": 0.5542857142857143, + "step": 8333 + }, + { + "epoch": 1.5450500556173528, + "grad_norm": 6.0390625, + "learning_rate": 8.454949944382647e-06, + "loss": 2.728, + "mean_token_accuracy": 0.46300156331422615, + "step": 8334 + }, + { + "epoch": 1.5452354467927325, + "grad_norm": 6.359375, + "learning_rate": 8.454764553207268e-06, + "loss": 3.801, + "mean_token_accuracy": 0.3954456415279138, + "step": 8335 + }, + { + "epoch": 1.5454208379681127, + "grad_norm": 5.83984375, + "learning_rate": 8.454579162031888e-06, + "loss": 3.0618, + "mean_token_accuracy": 0.45435452254270514, + "step": 8336 + }, + { + "epoch": 1.5456062291434929, + "grad_norm": 5.9765625, + "learning_rate": 8.454393770856509e-06, + "loss": 3.3953, + "mean_token_accuracy": 0.43796526054590573, + "step": 8337 + }, + { + "epoch": 1.5457916203188728, + "grad_norm": 5.6640625, + "learning_rate": 8.454208379681127e-06, + "loss": 3.0747, + "mean_token_accuracy": 0.4418501715170964, + "step": 8338 + }, + { + "epoch": 1.5459770114942528, + "grad_norm": 5.25, + "learning_rate": 8.454022988505748e-06, + "loss": 2.8709, + "mean_token_accuracy": 0.4779563246806757, + "step": 8339 + }, + { + "epoch": 1.546162402669633, + "grad_norm": 6.640625, + "learning_rate": 8.453837597330368e-06, + "loss": 2.4073, + "mean_token_accuracy": 0.48337982333798235, + "step": 8340 + }, + { + "epoch": 1.546347793845013, + "grad_norm": 5.92578125, + "learning_rate": 8.453652206154987e-06, + "loss": 3.161, + "mean_token_accuracy": 0.42514438027543316, + "step": 8341 + }, + { + "epoch": 1.546533185020393, + "grad_norm": 6.8046875, + "learning_rate": 8.453466814979608e-06, + "loss": 2.5927, + "mean_token_accuracy": 0.48689019545526774, + "step": 8342 + }, + { + "epoch": 1.5467185761957731, + "grad_norm": 5.9375, + "learning_rate": 8.453281423804227e-06, + "loss": 3.4112, + "mean_token_accuracy": 0.4292901062045836, + "step": 8343 + }, + { + "epoch": 1.546903967371153, + "grad_norm": 9.5625, + "learning_rate": 8.453096032628849e-06, + "loss": 2.5495, + "mean_token_accuracy": 0.470580404685836, + "step": 8344 + }, + { + "epoch": 1.547089358546533, + "grad_norm": 6.57421875, + "learning_rate": 8.452910641453468e-06, + "loss": 3.2944, + "mean_token_accuracy": 0.42242473180297613, + "step": 8345 + }, + { + "epoch": 1.5472747497219133, + "grad_norm": 5.11328125, + "learning_rate": 8.452725250278088e-06, + "loss": 2.7521, + "mean_token_accuracy": 0.47368421052631576, + "step": 8346 + }, + { + "epoch": 1.5474601408972934, + "grad_norm": 6.33984375, + "learning_rate": 8.452539859102707e-06, + "loss": 2.3273, + "mean_token_accuracy": 0.5154548130703562, + "step": 8347 + }, + { + "epoch": 1.5476455320726732, + "grad_norm": 6.88671875, + "learning_rate": 8.452354467927327e-06, + "loss": 2.5139, + "mean_token_accuracy": 0.48480355819125276, + "step": 8348 + }, + { + "epoch": 1.5478309232480534, + "grad_norm": 7.49609375, + "learning_rate": 8.452169076751948e-06, + "loss": 2.6049, + "mean_token_accuracy": 0.5007653619068445, + "step": 8349 + }, + { + "epoch": 1.5480163144234336, + "grad_norm": 5.99609375, + "learning_rate": 8.451983685576567e-06, + "loss": 2.9472, + "mean_token_accuracy": 0.4402050792893764, + "step": 8350 + }, + { + "epoch": 1.5482017055988135, + "grad_norm": 6.24609375, + "learning_rate": 8.451798294401187e-06, + "loss": 3.3317, + "mean_token_accuracy": 0.43426034730382557, + "step": 8351 + }, + { + "epoch": 1.5483870967741935, + "grad_norm": 6.9921875, + "learning_rate": 8.451612903225808e-06, + "loss": 3.3378, + "mean_token_accuracy": 0.4583196407054442, + "step": 8352 + }, + { + "epoch": 1.5485724879495737, + "grad_norm": 7.79296875, + "learning_rate": 8.451427512050428e-06, + "loss": 2.359, + "mean_token_accuracy": 0.5041722745625841, + "step": 8353 + }, + { + "epoch": 1.5487578791249537, + "grad_norm": 5.765625, + "learning_rate": 8.451242120875047e-06, + "loss": 2.835, + "mean_token_accuracy": 0.49053080821552414, + "step": 8354 + }, + { + "epoch": 1.5489432703003336, + "grad_norm": 6.55078125, + "learning_rate": 8.451056729699667e-06, + "loss": 2.9094, + "mean_token_accuracy": 0.47598152424942264, + "step": 8355 + }, + { + "epoch": 1.5491286614757138, + "grad_norm": 7.50390625, + "learning_rate": 8.450871338524286e-06, + "loss": 2.5188, + "mean_token_accuracy": 0.5249426259210049, + "step": 8356 + }, + { + "epoch": 1.5493140526510938, + "grad_norm": 7.2109375, + "learning_rate": 8.450685947348907e-06, + "loss": 2.431, + "mean_token_accuracy": 0.5111341273951321, + "step": 8357 + }, + { + "epoch": 1.5494994438264738, + "grad_norm": 6.2265625, + "learning_rate": 8.450500556173527e-06, + "loss": 2.9914, + "mean_token_accuracy": 0.4403846153846154, + "step": 8358 + }, + { + "epoch": 1.549684835001854, + "grad_norm": 6.2890625, + "learning_rate": 8.450315164998146e-06, + "loss": 2.9276, + "mean_token_accuracy": 0.4477098639987494, + "step": 8359 + }, + { + "epoch": 1.549870226177234, + "grad_norm": 5.80078125, + "learning_rate": 8.450129773822766e-06, + "loss": 2.8693, + "mean_token_accuracy": 0.46636005256241786, + "step": 8360 + }, + { + "epoch": 1.5500556173526139, + "grad_norm": 5.2578125, + "learning_rate": 8.449944382647387e-06, + "loss": 3.059, + "mean_token_accuracy": 0.44615600056014565, + "step": 8361 + }, + { + "epoch": 1.550241008527994, + "grad_norm": 6.22265625, + "learning_rate": 8.449758991472007e-06, + "loss": 2.8033, + "mean_token_accuracy": 0.4682950306822284, + "step": 8362 + }, + { + "epoch": 1.5504263997033743, + "grad_norm": 6.67578125, + "learning_rate": 8.449573600296626e-06, + "loss": 3.3225, + "mean_token_accuracy": 0.41184387617765816, + "step": 8363 + }, + { + "epoch": 1.5506117908787542, + "grad_norm": 6.30078125, + "learning_rate": 8.449388209121247e-06, + "loss": 3.3009, + "mean_token_accuracy": 0.4326310632043116, + "step": 8364 + }, + { + "epoch": 1.5507971820541342, + "grad_norm": 5.93359375, + "learning_rate": 8.449202817945865e-06, + "loss": 3.0693, + "mean_token_accuracy": 0.4468957714608966, + "step": 8365 + }, + { + "epoch": 1.5509825732295144, + "grad_norm": 6.62890625, + "learning_rate": 8.449017426770486e-06, + "loss": 2.6706, + "mean_token_accuracy": 0.4885249968986478, + "step": 8366 + }, + { + "epoch": 1.5511679644048944, + "grad_norm": 5.234375, + "learning_rate": 8.448832035595106e-06, + "loss": 2.5995, + "mean_token_accuracy": 0.48256075607560756, + "step": 8367 + }, + { + "epoch": 1.5513533555802743, + "grad_norm": 6.9375, + "learning_rate": 8.448646644419727e-06, + "loss": 2.998, + "mean_token_accuracy": 0.44861731674918487, + "step": 8368 + }, + { + "epoch": 1.5515387467556545, + "grad_norm": 7.08203125, + "learning_rate": 8.448461253244347e-06, + "loss": 2.7109, + "mean_token_accuracy": 0.4887955182072829, + "step": 8369 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 6.23828125, + "learning_rate": 8.448275862068966e-06, + "loss": 2.9689, + "mean_token_accuracy": 0.4404202719406675, + "step": 8370 + }, + { + "epoch": 1.5519095291064144, + "grad_norm": 6.3203125, + "learning_rate": 8.448090470893587e-06, + "loss": 3.4496, + "mean_token_accuracy": 0.4451539338654504, + "step": 8371 + }, + { + "epoch": 1.5520949202817946, + "grad_norm": 5.921875, + "learning_rate": 8.447905079718206e-06, + "loss": 2.7598, + "mean_token_accuracy": 0.4789072887706999, + "step": 8372 + }, + { + "epoch": 1.5522803114571746, + "grad_norm": 8.7109375, + "learning_rate": 8.447719688542826e-06, + "loss": 3.4347, + "mean_token_accuracy": 0.42412993039443153, + "step": 8373 + }, + { + "epoch": 1.5524657026325546, + "grad_norm": 7.1796875, + "learning_rate": 8.447534297367447e-06, + "loss": 3.0919, + "mean_token_accuracy": 0.46592902442315476, + "step": 8374 + }, + { + "epoch": 1.5526510938079348, + "grad_norm": 8.015625, + "learning_rate": 8.447348906192065e-06, + "loss": 2.5119, + "mean_token_accuracy": 0.49633016484177594, + "step": 8375 + }, + { + "epoch": 1.552836484983315, + "grad_norm": 5.65625, + "learning_rate": 8.447163515016686e-06, + "loss": 2.5972, + "mean_token_accuracy": 0.4932650562172993, + "step": 8376 + }, + { + "epoch": 1.5530218761586947, + "grad_norm": 8.6484375, + "learning_rate": 8.446978123841306e-06, + "loss": 3.2344, + "mean_token_accuracy": 0.45692216280451575, + "step": 8377 + }, + { + "epoch": 1.5532072673340749, + "grad_norm": 6.953125, + "learning_rate": 8.446792732665927e-06, + "loss": 3.1933, + "mean_token_accuracy": 0.4149122807017544, + "step": 8378 + }, + { + "epoch": 1.553392658509455, + "grad_norm": 5.66796875, + "learning_rate": 8.446607341490546e-06, + "loss": 2.6754, + "mean_token_accuracy": 0.46707572583058965, + "step": 8379 + }, + { + "epoch": 1.553578049684835, + "grad_norm": 7.359375, + "learning_rate": 8.446421950315166e-06, + "loss": 2.881, + "mean_token_accuracy": 0.44709997085397846, + "step": 8380 + }, + { + "epoch": 1.553763440860215, + "grad_norm": 6.703125, + "learning_rate": 8.446236559139785e-06, + "loss": 2.923, + "mean_token_accuracy": 0.43796308770039066, + "step": 8381 + }, + { + "epoch": 1.5539488320355952, + "grad_norm": 6.30078125, + "learning_rate": 8.446051167964405e-06, + "loss": 2.7977, + "mean_token_accuracy": 0.47877059569074776, + "step": 8382 + }, + { + "epoch": 1.5541342232109752, + "grad_norm": 5.65625, + "learning_rate": 8.445865776789026e-06, + "loss": 2.5994, + "mean_token_accuracy": 0.48074757937401486, + "step": 8383 + }, + { + "epoch": 1.5543196143863551, + "grad_norm": 5.48828125, + "learning_rate": 8.445680385613646e-06, + "loss": 2.5245, + "mean_token_accuracy": 0.4875305623471883, + "step": 8384 + }, + { + "epoch": 1.5545050055617353, + "grad_norm": 6.01953125, + "learning_rate": 8.445494994438265e-06, + "loss": 3.3098, + "mean_token_accuracy": 0.4410510545119281, + "step": 8385 + }, + { + "epoch": 1.5546903967371153, + "grad_norm": 7.5546875, + "learning_rate": 8.445309603262886e-06, + "loss": 2.699, + "mean_token_accuracy": 0.48598480129562727, + "step": 8386 + }, + { + "epoch": 1.5548757879124953, + "grad_norm": 7.40625, + "learning_rate": 8.445124212087506e-06, + "loss": 2.7595, + "mean_token_accuracy": 0.4619133983223759, + "step": 8387 + }, + { + "epoch": 1.5550611790878754, + "grad_norm": 5.515625, + "learning_rate": 8.444938820912125e-06, + "loss": 2.9571, + "mean_token_accuracy": 0.45032802249297094, + "step": 8388 + }, + { + "epoch": 1.5552465702632556, + "grad_norm": 7.0625, + "learning_rate": 8.444753429736745e-06, + "loss": 2.6437, + "mean_token_accuracy": 0.5005512679162073, + "step": 8389 + }, + { + "epoch": 1.5554319614386354, + "grad_norm": 6.03125, + "learning_rate": 8.444568038561364e-06, + "loss": 3.5808, + "mean_token_accuracy": 0.39526355996944235, + "step": 8390 + }, + { + "epoch": 1.5556173526140156, + "grad_norm": 5.95703125, + "learning_rate": 8.444382647385985e-06, + "loss": 2.916, + "mean_token_accuracy": 0.4557610241820768, + "step": 8391 + }, + { + "epoch": 1.5558027437893958, + "grad_norm": 5.640625, + "learning_rate": 8.444197256210605e-06, + "loss": 2.8687, + "mean_token_accuracy": 0.46607454863133374, + "step": 8392 + }, + { + "epoch": 1.5559881349647757, + "grad_norm": 6.01953125, + "learning_rate": 8.444011865035226e-06, + "loss": 3.1959, + "mean_token_accuracy": 0.4544605947459661, + "step": 8393 + }, + { + "epoch": 1.5561735261401557, + "grad_norm": 6.31640625, + "learning_rate": 8.443826473859844e-06, + "loss": 2.87, + "mean_token_accuracy": 0.47581120943952804, + "step": 8394 + }, + { + "epoch": 1.5563589173155359, + "grad_norm": 6.60546875, + "learning_rate": 8.443641082684465e-06, + "loss": 2.9675, + "mean_token_accuracy": 0.46794577205882354, + "step": 8395 + }, + { + "epoch": 1.5565443084909159, + "grad_norm": 7.68359375, + "learning_rate": 8.443455691509085e-06, + "loss": 2.3252, + "mean_token_accuracy": 0.5242279020234292, + "step": 8396 + }, + { + "epoch": 1.5567296996662958, + "grad_norm": 6.13671875, + "learning_rate": 8.443270300333704e-06, + "loss": 2.2919, + "mean_token_accuracy": 0.5244206773618538, + "step": 8397 + }, + { + "epoch": 1.556915090841676, + "grad_norm": 9.453125, + "learning_rate": 8.443084909158325e-06, + "loss": 2.5541, + "mean_token_accuracy": 0.48144772816039577, + "step": 8398 + }, + { + "epoch": 1.557100482017056, + "grad_norm": 7.859375, + "learning_rate": 8.442899517982944e-06, + "loss": 2.8134, + "mean_token_accuracy": 0.4628579735426661, + "step": 8399 + }, + { + "epoch": 1.557285873192436, + "grad_norm": 6.81640625, + "learning_rate": 8.442714126807566e-06, + "loss": 3.0545, + "mean_token_accuracy": 0.45233995930505555, + "step": 8400 + }, + { + "epoch": 1.5574712643678161, + "grad_norm": 6.1953125, + "learning_rate": 8.442528735632185e-06, + "loss": 3.377, + "mean_token_accuracy": 0.41705213151643955, + "step": 8401 + }, + { + "epoch": 1.557656655543196, + "grad_norm": 5.56640625, + "learning_rate": 8.442343344456805e-06, + "loss": 2.7175, + "mean_token_accuracy": 0.4952866807142065, + "step": 8402 + }, + { + "epoch": 1.557842046718576, + "grad_norm": 9.53125, + "learning_rate": 8.442157953281424e-06, + "loss": 2.1349, + "mean_token_accuracy": 0.536872747435542, + "step": 8403 + }, + { + "epoch": 1.5580274378939563, + "grad_norm": 6.33984375, + "learning_rate": 8.441972562106044e-06, + "loss": 2.6698, + "mean_token_accuracy": 0.4799600449494319, + "step": 8404 + }, + { + "epoch": 1.5582128290693364, + "grad_norm": 5.80078125, + "learning_rate": 8.441787170930665e-06, + "loss": 2.9482, + "mean_token_accuracy": 0.4440196613872201, + "step": 8405 + }, + { + "epoch": 1.5583982202447162, + "grad_norm": 6.48828125, + "learning_rate": 8.441601779755284e-06, + "loss": 2.8323, + "mean_token_accuracy": 0.47560813505250565, + "step": 8406 + }, + { + "epoch": 1.5585836114200964, + "grad_norm": 6.8203125, + "learning_rate": 8.441416388579904e-06, + "loss": 2.4759, + "mean_token_accuracy": 0.5145191703331238, + "step": 8407 + }, + { + "epoch": 1.5587690025954766, + "grad_norm": 7.38671875, + "learning_rate": 8.441230997404525e-06, + "loss": 2.8462, + "mean_token_accuracy": 0.49516129032258066, + "step": 8408 + }, + { + "epoch": 1.5589543937708565, + "grad_norm": 7.2109375, + "learning_rate": 8.441045606229145e-06, + "loss": 2.0692, + "mean_token_accuracy": 0.5750350631136045, + "step": 8409 + }, + { + "epoch": 1.5591397849462365, + "grad_norm": 5.12109375, + "learning_rate": 8.440860215053764e-06, + "loss": 2.2935, + "mean_token_accuracy": 0.5374163879598662, + "step": 8410 + }, + { + "epoch": 1.5593251761216167, + "grad_norm": 9.703125, + "learning_rate": 8.440674823878384e-06, + "loss": 2.3745, + "mean_token_accuracy": 0.488303307340683, + "step": 8411 + }, + { + "epoch": 1.5595105672969967, + "grad_norm": 6.1328125, + "learning_rate": 8.440489432703005e-06, + "loss": 2.7654, + "mean_token_accuracy": 0.4771478667445938, + "step": 8412 + }, + { + "epoch": 1.5596959584723766, + "grad_norm": 5.2578125, + "learning_rate": 8.440304041527624e-06, + "loss": 3.0409, + "mean_token_accuracy": 0.44935428660339194, + "step": 8413 + }, + { + "epoch": 1.5598813496477568, + "grad_norm": 7.70703125, + "learning_rate": 8.440118650352244e-06, + "loss": 3.1878, + "mean_token_accuracy": 0.42570993914807304, + "step": 8414 + }, + { + "epoch": 1.5600667408231368, + "grad_norm": 6.1875, + "learning_rate": 8.439933259176863e-06, + "loss": 2.8927, + "mean_token_accuracy": 0.46062805744042923, + "step": 8415 + }, + { + "epoch": 1.5602521319985168, + "grad_norm": 5.87890625, + "learning_rate": 8.439747868001483e-06, + "loss": 2.2906, + "mean_token_accuracy": 0.529060874885286, + "step": 8416 + }, + { + "epoch": 1.560437523173897, + "grad_norm": 5.6328125, + "learning_rate": 8.439562476826104e-06, + "loss": 3.3128, + "mean_token_accuracy": 0.4397498085269339, + "step": 8417 + }, + { + "epoch": 1.5606229143492771, + "grad_norm": 6.21484375, + "learning_rate": 8.439377085650724e-06, + "loss": 2.6274, + "mean_token_accuracy": 0.48127935017134155, + "step": 8418 + }, + { + "epoch": 1.5608083055246569, + "grad_norm": 9.3046875, + "learning_rate": 8.439191694475343e-06, + "loss": 3.0029, + "mean_token_accuracy": 0.4575427682737169, + "step": 8419 + }, + { + "epoch": 1.560993696700037, + "grad_norm": 7.48828125, + "learning_rate": 8.439006303299964e-06, + "loss": 3.0026, + "mean_token_accuracy": 0.4358916222650493, + "step": 8420 + }, + { + "epoch": 1.5611790878754173, + "grad_norm": 8.578125, + "learning_rate": 8.438820912124584e-06, + "loss": 2.1261, + "mean_token_accuracy": 0.5424522168768691, + "step": 8421 + }, + { + "epoch": 1.5613644790507972, + "grad_norm": 6.203125, + "learning_rate": 8.438635520949203e-06, + "loss": 2.8001, + "mean_token_accuracy": 0.4609452244413684, + "step": 8422 + }, + { + "epoch": 1.5615498702261772, + "grad_norm": 5.22265625, + "learning_rate": 8.438450129773823e-06, + "loss": 2.5867, + "mean_token_accuracy": 0.502964681618974, + "step": 8423 + }, + { + "epoch": 1.5617352614015574, + "grad_norm": 8.1015625, + "learning_rate": 8.438264738598442e-06, + "loss": 3.6961, + "mean_token_accuracy": 0.4135758270084491, + "step": 8424 + }, + { + "epoch": 1.5619206525769374, + "grad_norm": 5.6953125, + "learning_rate": 8.438079347423064e-06, + "loss": 3.3276, + "mean_token_accuracy": 0.4387755102040816, + "step": 8425 + }, + { + "epoch": 1.5621060437523173, + "grad_norm": 8.78125, + "learning_rate": 8.437893956247683e-06, + "loss": 2.9403, + "mean_token_accuracy": 0.4440535633227417, + "step": 8426 + }, + { + "epoch": 1.5622914349276975, + "grad_norm": 8.3828125, + "learning_rate": 8.437708565072304e-06, + "loss": 2.8296, + "mean_token_accuracy": 0.4684845775592311, + "step": 8427 + }, + { + "epoch": 1.5624768261030775, + "grad_norm": 5.86328125, + "learning_rate": 8.437523173896923e-06, + "loss": 2.8187, + "mean_token_accuracy": 0.46486877454019426, + "step": 8428 + }, + { + "epoch": 1.5626622172784574, + "grad_norm": 5.453125, + "learning_rate": 8.437337782721543e-06, + "loss": 3.0312, + "mean_token_accuracy": 0.4531678797279561, + "step": 8429 + }, + { + "epoch": 1.5628476084538376, + "grad_norm": 6.41015625, + "learning_rate": 8.437152391546164e-06, + "loss": 2.5548, + "mean_token_accuracy": 0.484992288527702, + "step": 8430 + }, + { + "epoch": 1.5630329996292176, + "grad_norm": 7.03125, + "learning_rate": 8.436967000370782e-06, + "loss": 3.0103, + "mean_token_accuracy": 0.4444220903329645, + "step": 8431 + }, + { + "epoch": 1.5632183908045976, + "grad_norm": 6.7890625, + "learning_rate": 8.436781609195403e-06, + "loss": 2.9194, + "mean_token_accuracy": 0.45368916797488223, + "step": 8432 + }, + { + "epoch": 1.5634037819799778, + "grad_norm": 5.76953125, + "learning_rate": 8.436596218020023e-06, + "loss": 2.8125, + "mean_token_accuracy": 0.4692723697148476, + "step": 8433 + }, + { + "epoch": 1.563589173155358, + "grad_norm": 5.921875, + "learning_rate": 8.436410826844644e-06, + "loss": 2.8724, + "mean_token_accuracy": 0.45999757193152846, + "step": 8434 + }, + { + "epoch": 1.5637745643307377, + "grad_norm": 5.828125, + "learning_rate": 8.436225435669263e-06, + "loss": 2.7728, + "mean_token_accuracy": 0.4893233082706767, + "step": 8435 + }, + { + "epoch": 1.5639599555061179, + "grad_norm": 6.66796875, + "learning_rate": 8.436040044493883e-06, + "loss": 2.6715, + "mean_token_accuracy": 0.4991968793024323, + "step": 8436 + }, + { + "epoch": 1.564145346681498, + "grad_norm": 5.546875, + "learning_rate": 8.435854653318502e-06, + "loss": 2.8158, + "mean_token_accuracy": 0.464476386036961, + "step": 8437 + }, + { + "epoch": 1.564330737856878, + "grad_norm": 6.78125, + "learning_rate": 8.435669262143122e-06, + "loss": 2.4507, + "mean_token_accuracy": 0.5100829038763164, + "step": 8438 + }, + { + "epoch": 1.564516129032258, + "grad_norm": 5.10546875, + "learning_rate": 8.435483870967743e-06, + "loss": 2.7281, + "mean_token_accuracy": 0.4666173022337406, + "step": 8439 + }, + { + "epoch": 1.5647015202076382, + "grad_norm": 6.41796875, + "learning_rate": 8.435298479792362e-06, + "loss": 3.0754, + "mean_token_accuracy": 0.43266381297332895, + "step": 8440 + }, + { + "epoch": 1.5648869113830182, + "grad_norm": 6.64453125, + "learning_rate": 8.435113088616982e-06, + "loss": 2.1925, + "mean_token_accuracy": 0.5324232081911263, + "step": 8441 + }, + { + "epoch": 1.5650723025583981, + "grad_norm": 5.703125, + "learning_rate": 8.434927697441603e-06, + "loss": 3.0943, + "mean_token_accuracy": 0.4386896283827528, + "step": 8442 + }, + { + "epoch": 1.5652576937337783, + "grad_norm": 8.3828125, + "learning_rate": 8.434742306266223e-06, + "loss": 2.9414, + "mean_token_accuracy": 0.45081549439347607, + "step": 8443 + }, + { + "epoch": 1.5654430849091583, + "grad_norm": 6.4921875, + "learning_rate": 8.434556915090842e-06, + "loss": 2.6132, + "mean_token_accuracy": 0.48522378908645003, + "step": 8444 + }, + { + "epoch": 1.5656284760845383, + "grad_norm": 6.5546875, + "learning_rate": 8.434371523915462e-06, + "loss": 2.4626, + "mean_token_accuracy": 0.4992660251182515, + "step": 8445 + }, + { + "epoch": 1.5658138672599184, + "grad_norm": 6.08203125, + "learning_rate": 8.434186132740081e-06, + "loss": 2.6058, + "mean_token_accuracy": 0.48985383851284237, + "step": 8446 + }, + { + "epoch": 1.5659992584352986, + "grad_norm": 6.32421875, + "learning_rate": 8.434000741564702e-06, + "loss": 2.3627, + "mean_token_accuracy": 0.5083064419744233, + "step": 8447 + }, + { + "epoch": 1.5661846496106784, + "grad_norm": 8.0390625, + "learning_rate": 8.433815350389322e-06, + "loss": 2.8576, + "mean_token_accuracy": 0.4631894761135472, + "step": 8448 + }, + { + "epoch": 1.5663700407860586, + "grad_norm": 6.46875, + "learning_rate": 8.433629959213943e-06, + "loss": 2.5598, + "mean_token_accuracy": 0.5002834467120182, + "step": 8449 + }, + { + "epoch": 1.5665554319614388, + "grad_norm": 5.83203125, + "learning_rate": 8.433444568038563e-06, + "loss": 2.6172, + "mean_token_accuracy": 0.4694871794871795, + "step": 8450 + }, + { + "epoch": 1.5667408231368187, + "grad_norm": 5.6875, + "learning_rate": 8.433259176863182e-06, + "loss": 2.9658, + "mean_token_accuracy": 0.44785358632754674, + "step": 8451 + }, + { + "epoch": 1.5669262143121987, + "grad_norm": 5.31640625, + "learning_rate": 8.433073785687802e-06, + "loss": 2.3299, + "mean_token_accuracy": 0.499515503875969, + "step": 8452 + }, + { + "epoch": 1.5671116054875789, + "grad_norm": 8.78125, + "learning_rate": 8.432888394512421e-06, + "loss": 2.3122, + "mean_token_accuracy": 0.5362698066579444, + "step": 8453 + }, + { + "epoch": 1.5672969966629589, + "grad_norm": 7.81640625, + "learning_rate": 8.432703003337042e-06, + "loss": 2.9139, + "mean_token_accuracy": 0.4539308176100629, + "step": 8454 + }, + { + "epoch": 1.5674823878383388, + "grad_norm": 6.4609375, + "learning_rate": 8.432517612161662e-06, + "loss": 2.6785, + "mean_token_accuracy": 0.5007900677200903, + "step": 8455 + }, + { + "epoch": 1.567667779013719, + "grad_norm": 5.82421875, + "learning_rate": 8.432332220986281e-06, + "loss": 2.9801, + "mean_token_accuracy": 0.45579742336739226, + "step": 8456 + }, + { + "epoch": 1.567853170189099, + "grad_norm": 8.0234375, + "learning_rate": 8.432146829810902e-06, + "loss": 3.1714, + "mean_token_accuracy": 0.4217910056378655, + "step": 8457 + }, + { + "epoch": 1.568038561364479, + "grad_norm": 5.9765625, + "learning_rate": 8.431961438635522e-06, + "loss": 3.3776, + "mean_token_accuracy": 0.4361509539061412, + "step": 8458 + }, + { + "epoch": 1.5682239525398591, + "grad_norm": 7.49609375, + "learning_rate": 8.431776047460143e-06, + "loss": 2.8761, + "mean_token_accuracy": 0.4672897196261682, + "step": 8459 + }, + { + "epoch": 1.568409343715239, + "grad_norm": 5.53515625, + "learning_rate": 8.431590656284761e-06, + "loss": 2.1784, + "mean_token_accuracy": 0.5402506372132541, + "step": 8460 + }, + { + "epoch": 1.568594734890619, + "grad_norm": 5.78125, + "learning_rate": 8.431405265109382e-06, + "loss": 2.5836, + "mean_token_accuracy": 0.4813252392221422, + "step": 8461 + }, + { + "epoch": 1.5687801260659993, + "grad_norm": 5.765625, + "learning_rate": 8.431219873934e-06, + "loss": 3.1844, + "mean_token_accuracy": 0.4172632158590308, + "step": 8462 + }, + { + "epoch": 1.5689655172413794, + "grad_norm": 5.7421875, + "learning_rate": 8.431034482758621e-06, + "loss": 2.8508, + "mean_token_accuracy": 0.47329746752394697, + "step": 8463 + }, + { + "epoch": 1.5691509084167594, + "grad_norm": 7.1640625, + "learning_rate": 8.430849091583242e-06, + "loss": 3.3646, + "mean_token_accuracy": 0.4275109170305677, + "step": 8464 + }, + { + "epoch": 1.5693362995921394, + "grad_norm": 7.46484375, + "learning_rate": 8.430663700407862e-06, + "loss": 2.4379, + "mean_token_accuracy": 0.5061133753241941, + "step": 8465 + }, + { + "epoch": 1.5695216907675196, + "grad_norm": 10.0625, + "learning_rate": 8.430478309232481e-06, + "loss": 3.3212, + "mean_token_accuracy": 0.4708789740849586, + "step": 8466 + }, + { + "epoch": 1.5697070819428995, + "grad_norm": 6.21875, + "learning_rate": 8.430292918057101e-06, + "loss": 2.7067, + "mean_token_accuracy": 0.4847483757295452, + "step": 8467 + }, + { + "epoch": 1.5698924731182795, + "grad_norm": 7.00390625, + "learning_rate": 8.430107526881722e-06, + "loss": 1.93, + "mean_token_accuracy": 0.5710831721470019, + "step": 8468 + }, + { + "epoch": 1.5700778642936597, + "grad_norm": 5.9765625, + "learning_rate": 8.42992213570634e-06, + "loss": 2.4411, + "mean_token_accuracy": 0.5247413685653746, + "step": 8469 + }, + { + "epoch": 1.5702632554690397, + "grad_norm": 7.390625, + "learning_rate": 8.429736744530961e-06, + "loss": 2.6506, + "mean_token_accuracy": 0.4908442715929347, + "step": 8470 + }, + { + "epoch": 1.5704486466444196, + "grad_norm": 8.8984375, + "learning_rate": 8.42955135335558e-06, + "loss": 3.061, + "mean_token_accuracy": 0.4353319295134656, + "step": 8471 + }, + { + "epoch": 1.5706340378197998, + "grad_norm": 7.5546875, + "learning_rate": 8.4293659621802e-06, + "loss": 2.9731, + "mean_token_accuracy": 0.4525856368205264, + "step": 8472 + }, + { + "epoch": 1.5708194289951798, + "grad_norm": 6.37109375, + "learning_rate": 8.429180571004821e-06, + "loss": 2.6035, + "mean_token_accuracy": 0.4880643841222207, + "step": 8473 + }, + { + "epoch": 1.5710048201705598, + "grad_norm": 9.2734375, + "learning_rate": 8.428995179829441e-06, + "loss": 2.7968, + "mean_token_accuracy": 0.4525010969723563, + "step": 8474 + }, + { + "epoch": 1.57119021134594, + "grad_norm": 6.79296875, + "learning_rate": 8.42880978865406e-06, + "loss": 2.7208, + "mean_token_accuracy": 0.4759751271905031, + "step": 8475 + }, + { + "epoch": 1.5713756025213201, + "grad_norm": 7.69921875, + "learning_rate": 8.42862439747868e-06, + "loss": 3.0084, + "mean_token_accuracy": 0.43533946779408106, + "step": 8476 + }, + { + "epoch": 1.5715609936966999, + "grad_norm": 8.1171875, + "learning_rate": 8.428439006303301e-06, + "loss": 2.4103, + "mean_token_accuracy": 0.5210897359302187, + "step": 8477 + }, + { + "epoch": 1.57174638487208, + "grad_norm": 6.4921875, + "learning_rate": 8.42825361512792e-06, + "loss": 2.6997, + "mean_token_accuracy": 0.46847339088624196, + "step": 8478 + }, + { + "epoch": 1.5719317760474603, + "grad_norm": 6.87109375, + "learning_rate": 8.42806822395254e-06, + "loss": 2.6708, + "mean_token_accuracy": 0.4825951510226195, + "step": 8479 + }, + { + "epoch": 1.5721171672228402, + "grad_norm": 7.1640625, + "learning_rate": 8.42788283277716e-06, + "loss": 2.1141, + "mean_token_accuracy": 0.5741696588868941, + "step": 8480 + }, + { + "epoch": 1.5723025583982202, + "grad_norm": 9.1328125, + "learning_rate": 8.427697441601781e-06, + "loss": 2.521, + "mean_token_accuracy": 0.49967500812479687, + "step": 8481 + }, + { + "epoch": 1.5724879495736004, + "grad_norm": 6.109375, + "learning_rate": 8.4275120504264e-06, + "loss": 2.8349, + "mean_token_accuracy": 0.49317523783262096, + "step": 8482 + }, + { + "epoch": 1.5726733407489804, + "grad_norm": 5.8515625, + "learning_rate": 8.42732665925102e-06, + "loss": 3.3681, + "mean_token_accuracy": 0.428452694278986, + "step": 8483 + }, + { + "epoch": 1.5728587319243603, + "grad_norm": 9.234375, + "learning_rate": 8.42714126807564e-06, + "loss": 2.4523, + "mean_token_accuracy": 0.4965366067514248, + "step": 8484 + }, + { + "epoch": 1.5730441230997405, + "grad_norm": 8.1875, + "learning_rate": 8.42695587690026e-06, + "loss": 3.3614, + "mean_token_accuracy": 0.42931078010603385, + "step": 8485 + }, + { + "epoch": 1.5732295142751205, + "grad_norm": 5.67578125, + "learning_rate": 8.42677048572488e-06, + "loss": 2.8377, + "mean_token_accuracy": 0.44548686549476596, + "step": 8486 + }, + { + "epoch": 1.5734149054505004, + "grad_norm": 8.453125, + "learning_rate": 8.4265850945495e-06, + "loss": 3.379, + "mean_token_accuracy": 0.42384887839433294, + "step": 8487 + }, + { + "epoch": 1.5736002966258806, + "grad_norm": 6.4296875, + "learning_rate": 8.42639970337412e-06, + "loss": 3.1337, + "mean_token_accuracy": 0.4355234460196292, + "step": 8488 + }, + { + "epoch": 1.5737856878012608, + "grad_norm": 5.890625, + "learning_rate": 8.42621431219874e-06, + "loss": 2.9977, + "mean_token_accuracy": 0.4712328767123288, + "step": 8489 + }, + { + "epoch": 1.5739710789766406, + "grad_norm": 7.203125, + "learning_rate": 8.42602892102336e-06, + "loss": 3.2388, + "mean_token_accuracy": 0.4267581475128645, + "step": 8490 + }, + { + "epoch": 1.5741564701520208, + "grad_norm": 9.4765625, + "learning_rate": 8.42584352984798e-06, + "loss": 3.1719, + "mean_token_accuracy": 0.45363048166786485, + "step": 8491 + }, + { + "epoch": 1.574341861327401, + "grad_norm": 5.79296875, + "learning_rate": 8.4256581386726e-06, + "loss": 2.6594, + "mean_token_accuracy": 0.4928061173156253, + "step": 8492 + }, + { + "epoch": 1.574527252502781, + "grad_norm": 9.7265625, + "learning_rate": 8.42547274749722e-06, + "loss": 2.5923, + "mean_token_accuracy": 0.5029758289809304, + "step": 8493 + }, + { + "epoch": 1.5747126436781609, + "grad_norm": 12.890625, + "learning_rate": 8.42528735632184e-06, + "loss": 2.4506, + "mean_token_accuracy": 0.5001299038711353, + "step": 8494 + }, + { + "epoch": 1.574898034853541, + "grad_norm": 5.54296875, + "learning_rate": 8.42510196514646e-06, + "loss": 2.8877, + "mean_token_accuracy": 0.45436199937907484, + "step": 8495 + }, + { + "epoch": 1.575083426028921, + "grad_norm": 6.98828125, + "learning_rate": 8.424916573971079e-06, + "loss": 2.8114, + "mean_token_accuracy": 0.4441413354117953, + "step": 8496 + }, + { + "epoch": 1.575268817204301, + "grad_norm": 6.9453125, + "learning_rate": 8.424731182795701e-06, + "loss": 2.4207, + "mean_token_accuracy": 0.4998148833765272, + "step": 8497 + }, + { + "epoch": 1.5754542083796812, + "grad_norm": 8.9765625, + "learning_rate": 8.42454579162032e-06, + "loss": 2.4778, + "mean_token_accuracy": 0.5161745676979179, + "step": 8498 + }, + { + "epoch": 1.5756395995550612, + "grad_norm": 6.60546875, + "learning_rate": 8.42436040044494e-06, + "loss": 2.5788, + "mean_token_accuracy": 0.49110049924028654, + "step": 8499 + }, + { + "epoch": 1.5758249907304411, + "grad_norm": 6.3828125, + "learning_rate": 8.424175009269559e-06, + "loss": 3.3314, + "mean_token_accuracy": 0.42735042735042733, + "step": 8500 + }, + { + "epoch": 1.5760103819058213, + "grad_norm": 9.640625, + "learning_rate": 8.42398961809418e-06, + "loss": 2.9186, + "mean_token_accuracy": 0.4283121597096189, + "step": 8501 + }, + { + "epoch": 1.5761957730812013, + "grad_norm": 8.1171875, + "learning_rate": 8.4238042269188e-06, + "loss": 2.5448, + "mean_token_accuracy": 0.48713480266529985, + "step": 8502 + }, + { + "epoch": 1.5763811642565813, + "grad_norm": 6.21875, + "learning_rate": 8.423618835743419e-06, + "loss": 3.2833, + "mean_token_accuracy": 0.47560679611650486, + "step": 8503 + }, + { + "epoch": 1.5765665554319614, + "grad_norm": 8.2421875, + "learning_rate": 8.42343344456804e-06, + "loss": 2.7554, + "mean_token_accuracy": 0.4461690885072655, + "step": 8504 + }, + { + "epoch": 1.5767519466073416, + "grad_norm": 12.3203125, + "learning_rate": 8.42324805339266e-06, + "loss": 2.826, + "mean_token_accuracy": 0.42980365452275565, + "step": 8505 + }, + { + "epoch": 1.5769373377827214, + "grad_norm": 6.515625, + "learning_rate": 8.42306266221728e-06, + "loss": 2.3576, + "mean_token_accuracy": 0.5284066638188808, + "step": 8506 + }, + { + "epoch": 1.5771227289581016, + "grad_norm": 6.36328125, + "learning_rate": 8.422877271041899e-06, + "loss": 2.8022, + "mean_token_accuracy": 0.48289812431040824, + "step": 8507 + }, + { + "epoch": 1.5773081201334818, + "grad_norm": 9.5625, + "learning_rate": 8.42269187986652e-06, + "loss": 2.5185, + "mean_token_accuracy": 0.4984953703703704, + "step": 8508 + }, + { + "epoch": 1.5774935113088617, + "grad_norm": 7.71875, + "learning_rate": 8.422506488691138e-06, + "loss": 2.2664, + "mean_token_accuracy": 0.5409507923269391, + "step": 8509 + }, + { + "epoch": 1.5776789024842417, + "grad_norm": 5.6953125, + "learning_rate": 8.422321097515759e-06, + "loss": 2.9023, + "mean_token_accuracy": 0.43927948866937827, + "step": 8510 + }, + { + "epoch": 1.5778642936596219, + "grad_norm": 7.24609375, + "learning_rate": 8.42213570634038e-06, + "loss": 2.7093, + "mean_token_accuracy": 0.4667966211825861, + "step": 8511 + }, + { + "epoch": 1.5780496848350019, + "grad_norm": 6.671875, + "learning_rate": 8.421950315164998e-06, + "loss": 2.7113, + "mean_token_accuracy": 0.47303958177744587, + "step": 8512 + }, + { + "epoch": 1.5782350760103818, + "grad_norm": 6.171875, + "learning_rate": 8.421764923989619e-06, + "loss": 3.188, + "mean_token_accuracy": 0.44219292158223455, + "step": 8513 + }, + { + "epoch": 1.578420467185762, + "grad_norm": 6.1875, + "learning_rate": 8.421579532814239e-06, + "loss": 2.7631, + "mean_token_accuracy": 0.46224601867105986, + "step": 8514 + }, + { + "epoch": 1.578605858361142, + "grad_norm": 7.19140625, + "learning_rate": 8.42139414163886e-06, + "loss": 2.271, + "mean_token_accuracy": 0.5368063420158551, + "step": 8515 + }, + { + "epoch": 1.578791249536522, + "grad_norm": 7.41796875, + "learning_rate": 8.421208750463478e-06, + "loss": 2.8469, + "mean_token_accuracy": 0.48372781065088755, + "step": 8516 + }, + { + "epoch": 1.5789766407119021, + "grad_norm": 6.40625, + "learning_rate": 8.421023359288099e-06, + "loss": 2.8499, + "mean_token_accuracy": 0.4705999205403258, + "step": 8517 + }, + { + "epoch": 1.5791620318872823, + "grad_norm": 6.4609375, + "learning_rate": 8.420837968112718e-06, + "loss": 2.5319, + "mean_token_accuracy": 0.4864171621779177, + "step": 8518 + }, + { + "epoch": 1.579347423062662, + "grad_norm": 6.14453125, + "learning_rate": 8.420652576937338e-06, + "loss": 3.4918, + "mean_token_accuracy": 0.4133238837703756, + "step": 8519 + }, + { + "epoch": 1.5795328142380423, + "grad_norm": 7.7421875, + "learning_rate": 8.420467185761959e-06, + "loss": 2.7205, + "mean_token_accuracy": 0.4586458333333333, + "step": 8520 + }, + { + "epoch": 1.5797182054134224, + "grad_norm": 5.6796875, + "learning_rate": 8.420281794586579e-06, + "loss": 2.677, + "mean_token_accuracy": 0.49216171617161714, + "step": 8521 + }, + { + "epoch": 1.5799035965888024, + "grad_norm": 5.5546875, + "learning_rate": 8.420096403411198e-06, + "loss": 3.5245, + "mean_token_accuracy": 0.40401076057715823, + "step": 8522 + }, + { + "epoch": 1.5800889877641824, + "grad_norm": 5.9375, + "learning_rate": 8.419911012235818e-06, + "loss": 2.7, + "mean_token_accuracy": 0.47946030598723044, + "step": 8523 + }, + { + "epoch": 1.5802743789395626, + "grad_norm": 5.88671875, + "learning_rate": 8.419725621060439e-06, + "loss": 3.1547, + "mean_token_accuracy": 0.45251460648413333, + "step": 8524 + }, + { + "epoch": 1.5804597701149425, + "grad_norm": 6.3203125, + "learning_rate": 8.419540229885058e-06, + "loss": 2.9989, + "mean_token_accuracy": 0.4875985663082437, + "step": 8525 + }, + { + "epoch": 1.5806451612903225, + "grad_norm": 8.8046875, + "learning_rate": 8.419354838709678e-06, + "loss": 2.778, + "mean_token_accuracy": 0.4647747074931541, + "step": 8526 + }, + { + "epoch": 1.5808305524657027, + "grad_norm": 7.796875, + "learning_rate": 8.419169447534297e-06, + "loss": 2.5678, + "mean_token_accuracy": 0.49796452457109625, + "step": 8527 + }, + { + "epoch": 1.5810159436410827, + "grad_norm": 6.23046875, + "learning_rate": 8.418984056358917e-06, + "loss": 3.0137, + "mean_token_accuracy": 0.46067242442936457, + "step": 8528 + }, + { + "epoch": 1.5812013348164626, + "grad_norm": 6.64453125, + "learning_rate": 8.418798665183538e-06, + "loss": 2.885, + "mean_token_accuracy": 0.441711988649799, + "step": 8529 + }, + { + "epoch": 1.5813867259918428, + "grad_norm": 6.7421875, + "learning_rate": 8.418613274008158e-06, + "loss": 3.2051, + "mean_token_accuracy": 0.4299418604651163, + "step": 8530 + }, + { + "epoch": 1.5815721171672228, + "grad_norm": 6.87890625, + "learning_rate": 8.418427882832779e-06, + "loss": 3.1126, + "mean_token_accuracy": 0.44399121430812677, + "step": 8531 + }, + { + "epoch": 1.5817575083426028, + "grad_norm": 10.6640625, + "learning_rate": 8.418242491657398e-06, + "loss": 2.64, + "mean_token_accuracy": 0.4772877618522602, + "step": 8532 + }, + { + "epoch": 1.581942899517983, + "grad_norm": 8.6328125, + "learning_rate": 8.418057100482018e-06, + "loss": 2.431, + "mean_token_accuracy": 0.5036755386565273, + "step": 8533 + }, + { + "epoch": 1.5821282906933631, + "grad_norm": 11.40625, + "learning_rate": 8.417871709306637e-06, + "loss": 2.3335, + "mean_token_accuracy": 0.513957509881423, + "step": 8534 + }, + { + "epoch": 1.5823136818687429, + "grad_norm": 6.77734375, + "learning_rate": 8.417686318131258e-06, + "loss": 3.08, + "mean_token_accuracy": 0.4370174277520406, + "step": 8535 + }, + { + "epoch": 1.582499073044123, + "grad_norm": 5.90625, + "learning_rate": 8.417500926955878e-06, + "loss": 2.7527, + "mean_token_accuracy": 0.4726314366806325, + "step": 8536 + }, + { + "epoch": 1.5826844642195033, + "grad_norm": 6.328125, + "learning_rate": 8.417315535780497e-06, + "loss": 2.6442, + "mean_token_accuracy": 0.46855573225386726, + "step": 8537 + }, + { + "epoch": 1.5828698553948832, + "grad_norm": 5.83203125, + "learning_rate": 8.417130144605117e-06, + "loss": 2.9191, + "mean_token_accuracy": 0.49198697068403907, + "step": 8538 + }, + { + "epoch": 1.5830552465702632, + "grad_norm": 6.70703125, + "learning_rate": 8.416944753429738e-06, + "loss": 2.3842, + "mean_token_accuracy": 0.5192307692307693, + "step": 8539 + }, + { + "epoch": 1.5832406377456434, + "grad_norm": 6.078125, + "learning_rate": 8.416759362254358e-06, + "loss": 3.7909, + "mean_token_accuracy": 0.41010984084286034, + "step": 8540 + }, + { + "epoch": 1.5834260289210234, + "grad_norm": 6.05859375, + "learning_rate": 8.416573971078977e-06, + "loss": 2.8041, + "mean_token_accuracy": 0.47223523898781633, + "step": 8541 + }, + { + "epoch": 1.5836114200964033, + "grad_norm": 5.328125, + "learning_rate": 8.416388579903598e-06, + "loss": 3.3487, + "mean_token_accuracy": 0.4473744554001376, + "step": 8542 + }, + { + "epoch": 1.5837968112717835, + "grad_norm": 5.62890625, + "learning_rate": 8.416203188728216e-06, + "loss": 2.8092, + "mean_token_accuracy": 0.45176110260336905, + "step": 8543 + }, + { + "epoch": 1.5839822024471635, + "grad_norm": 7.51171875, + "learning_rate": 8.416017797552837e-06, + "loss": 2.6675, + "mean_token_accuracy": 0.4629073260919806, + "step": 8544 + }, + { + "epoch": 1.5841675936225434, + "grad_norm": 6.1328125, + "learning_rate": 8.415832406377457e-06, + "loss": 2.8197, + "mean_token_accuracy": 0.44554165037152915, + "step": 8545 + }, + { + "epoch": 1.5843529847979236, + "grad_norm": 7.171875, + "learning_rate": 8.415647015202078e-06, + "loss": 2.6288, + "mean_token_accuracy": 0.45631207715560673, + "step": 8546 + }, + { + "epoch": 1.5845383759733038, + "grad_norm": 6.875, + "learning_rate": 8.415461624026697e-06, + "loss": 3.223, + "mean_token_accuracy": 0.44672131147540983, + "step": 8547 + }, + { + "epoch": 1.5847237671486836, + "grad_norm": 7.68359375, + "learning_rate": 8.415276232851317e-06, + "loss": 2.6178, + "mean_token_accuracy": 0.492586778301064, + "step": 8548 + }, + { + "epoch": 1.5849091583240638, + "grad_norm": 5.53515625, + "learning_rate": 8.415090841675938e-06, + "loss": 2.8437, + "mean_token_accuracy": 0.4559015964407223, + "step": 8549 + }, + { + "epoch": 1.585094549499444, + "grad_norm": 6.1484375, + "learning_rate": 8.414905450500556e-06, + "loss": 3.1616, + "mean_token_accuracy": 0.44719314938154137, + "step": 8550 + }, + { + "epoch": 1.585279940674824, + "grad_norm": 7.3359375, + "learning_rate": 8.414720059325177e-06, + "loss": 2.6281, + "mean_token_accuracy": 0.48656798245614036, + "step": 8551 + }, + { + "epoch": 1.5854653318502039, + "grad_norm": 6.625, + "learning_rate": 8.414534668149796e-06, + "loss": 2.3305, + "mean_token_accuracy": 0.5082903981264637, + "step": 8552 + }, + { + "epoch": 1.585650723025584, + "grad_norm": 6.109375, + "learning_rate": 8.414349276974416e-06, + "loss": 2.5596, + "mean_token_accuracy": 0.47637318255250405, + "step": 8553 + }, + { + "epoch": 1.585836114200964, + "grad_norm": 7.57421875, + "learning_rate": 8.414163885799037e-06, + "loss": 3.4852, + "mean_token_accuracy": 0.4205808940322215, + "step": 8554 + }, + { + "epoch": 1.586021505376344, + "grad_norm": 6.42578125, + "learning_rate": 8.413978494623657e-06, + "loss": 3.6456, + "mean_token_accuracy": 0.4002157497303128, + "step": 8555 + }, + { + "epoch": 1.5862068965517242, + "grad_norm": 5.22265625, + "learning_rate": 8.413793103448276e-06, + "loss": 3.2209, + "mean_token_accuracy": 0.4351843032669778, + "step": 8556 + }, + { + "epoch": 1.5863922877271042, + "grad_norm": 5.34375, + "learning_rate": 8.413607712272896e-06, + "loss": 2.9638, + "mean_token_accuracy": 0.4638513964987305, + "step": 8557 + }, + { + "epoch": 1.5865776789024841, + "grad_norm": 6.703125, + "learning_rate": 8.413422321097517e-06, + "loss": 3.0011, + "mean_token_accuracy": 0.4484749777909387, + "step": 8558 + }, + { + "epoch": 1.5867630700778643, + "grad_norm": 7.71875, + "learning_rate": 8.413236929922136e-06, + "loss": 2.981, + "mean_token_accuracy": 0.45481770833333335, + "step": 8559 + }, + { + "epoch": 1.5869484612532443, + "grad_norm": 6.94140625, + "learning_rate": 8.413051538746756e-06, + "loss": 3.0304, + "mean_token_accuracy": 0.44808414725770096, + "step": 8560 + }, + { + "epoch": 1.5871338524286243, + "grad_norm": 6.32421875, + "learning_rate": 8.412866147571375e-06, + "loss": 2.7095, + "mean_token_accuracy": 0.4721120186697783, + "step": 8561 + }, + { + "epoch": 1.5873192436040044, + "grad_norm": 6.0234375, + "learning_rate": 8.412680756395997e-06, + "loss": 2.6582, + "mean_token_accuracy": 0.4550159841680621, + "step": 8562 + }, + { + "epoch": 1.5875046347793846, + "grad_norm": 7.93359375, + "learning_rate": 8.412495365220616e-06, + "loss": 2.2023, + "mean_token_accuracy": 0.550316856780735, + "step": 8563 + }, + { + "epoch": 1.5876900259547646, + "grad_norm": 5.90234375, + "learning_rate": 8.412309974045237e-06, + "loss": 3.2143, + "mean_token_accuracy": 0.42887776983559683, + "step": 8564 + }, + { + "epoch": 1.5878754171301446, + "grad_norm": 5.9296875, + "learning_rate": 8.412124582869855e-06, + "loss": 2.8184, + "mean_token_accuracy": 0.44744780982261373, + "step": 8565 + }, + { + "epoch": 1.5880608083055248, + "grad_norm": 5.46484375, + "learning_rate": 8.411939191694476e-06, + "loss": 2.381, + "mean_token_accuracy": 0.514172335600907, + "step": 8566 + }, + { + "epoch": 1.5882461994809047, + "grad_norm": 5.30078125, + "learning_rate": 8.411753800519096e-06, + "loss": 2.5785, + "mean_token_accuracy": 0.48451327433628316, + "step": 8567 + }, + { + "epoch": 1.5884315906562847, + "grad_norm": 6.38671875, + "learning_rate": 8.411568409343715e-06, + "loss": 3.1453, + "mean_token_accuracy": 0.44353758070410526, + "step": 8568 + }, + { + "epoch": 1.5886169818316649, + "grad_norm": 6.5859375, + "learning_rate": 8.411383018168336e-06, + "loss": 2.9786, + "mean_token_accuracy": 0.44592440801457195, + "step": 8569 + }, + { + "epoch": 1.5888023730070449, + "grad_norm": 6.5234375, + "learning_rate": 8.411197626992956e-06, + "loss": 3.0007, + "mean_token_accuracy": 0.46088902451429153, + "step": 8570 + }, + { + "epoch": 1.5889877641824248, + "grad_norm": 6.08984375, + "learning_rate": 8.411012235817577e-06, + "loss": 3.1898, + "mean_token_accuracy": 0.5014020707506471, + "step": 8571 + }, + { + "epoch": 1.589173155357805, + "grad_norm": 5.359375, + "learning_rate": 8.410826844642195e-06, + "loss": 2.5308, + "mean_token_accuracy": 0.5078087264530672, + "step": 8572 + }, + { + "epoch": 1.589358546533185, + "grad_norm": 7.15234375, + "learning_rate": 8.410641453466816e-06, + "loss": 3.9028, + "mean_token_accuracy": 0.4039861558658551, + "step": 8573 + }, + { + "epoch": 1.589543937708565, + "grad_norm": 6.94921875, + "learning_rate": 8.410456062291436e-06, + "loss": 2.6449, + "mean_token_accuracy": 0.503921568627451, + "step": 8574 + }, + { + "epoch": 1.5897293288839451, + "grad_norm": 6.5546875, + "learning_rate": 8.410270671116055e-06, + "loss": 3.465, + "mean_token_accuracy": 0.41294232225949573, + "step": 8575 + }, + { + "epoch": 1.5899147200593253, + "grad_norm": 7.60546875, + "learning_rate": 8.410085279940676e-06, + "loss": 2.8748, + "mean_token_accuracy": 0.4710275560133917, + "step": 8576 + }, + { + "epoch": 1.590100111234705, + "grad_norm": 7.05078125, + "learning_rate": 8.409899888765294e-06, + "loss": 3.1863, + "mean_token_accuracy": 0.4696335742360847, + "step": 8577 + }, + { + "epoch": 1.5902855024100853, + "grad_norm": 7.93359375, + "learning_rate": 8.409714497589917e-06, + "loss": 2.3812, + "mean_token_accuracy": 0.5145588874402434, + "step": 8578 + }, + { + "epoch": 1.5904708935854655, + "grad_norm": 8.859375, + "learning_rate": 8.409529106414535e-06, + "loss": 3.6163, + "mean_token_accuracy": 0.42421330771078547, + "step": 8579 + }, + { + "epoch": 1.5906562847608454, + "grad_norm": 9.1015625, + "learning_rate": 8.409343715239156e-06, + "loss": 2.8344, + "mean_token_accuracy": 0.47660628176723574, + "step": 8580 + }, + { + "epoch": 1.5908416759362254, + "grad_norm": 9.0859375, + "learning_rate": 8.409158324063775e-06, + "loss": 3.1078, + "mean_token_accuracy": 0.4412874322093846, + "step": 8581 + }, + { + "epoch": 1.5910270671116056, + "grad_norm": 6.2265625, + "learning_rate": 8.408972932888395e-06, + "loss": 2.7176, + "mean_token_accuracy": 0.4824162184526272, + "step": 8582 + }, + { + "epoch": 1.5912124582869855, + "grad_norm": 5.875, + "learning_rate": 8.408787541713016e-06, + "loss": 2.6757, + "mean_token_accuracy": 0.4731616059864729, + "step": 8583 + }, + { + "epoch": 1.5913978494623655, + "grad_norm": 8.765625, + "learning_rate": 8.408602150537634e-06, + "loss": 2.9229, + "mean_token_accuracy": 0.4496810772501772, + "step": 8584 + }, + { + "epoch": 1.5915832406377457, + "grad_norm": 8.5390625, + "learning_rate": 8.408416759362255e-06, + "loss": 2.6517, + "mean_token_accuracy": 0.4864112639161755, + "step": 8585 + }, + { + "epoch": 1.5917686318131257, + "grad_norm": 5.62109375, + "learning_rate": 8.408231368186875e-06, + "loss": 2.4607, + "mean_token_accuracy": 0.5052899936265137, + "step": 8586 + }, + { + "epoch": 1.5919540229885056, + "grad_norm": 7.4765625, + "learning_rate": 8.408045977011496e-06, + "loss": 3.1103, + "mean_token_accuracy": 0.45030020013342226, + "step": 8587 + }, + { + "epoch": 1.5921394141638858, + "grad_norm": 7.06640625, + "learning_rate": 8.407860585836115e-06, + "loss": 2.5483, + "mean_token_accuracy": 0.48023200757575757, + "step": 8588 + }, + { + "epoch": 1.592324805339266, + "grad_norm": 6.33203125, + "learning_rate": 8.407675194660735e-06, + "loss": 3.418, + "mean_token_accuracy": 0.41613614103819785, + "step": 8589 + }, + { + "epoch": 1.5925101965146458, + "grad_norm": 6.00390625, + "learning_rate": 8.407489803485354e-06, + "loss": 2.8001, + "mean_token_accuracy": 0.5009683666881859, + "step": 8590 + }, + { + "epoch": 1.592695587690026, + "grad_norm": 5.6328125, + "learning_rate": 8.407304412309975e-06, + "loss": 2.6756, + "mean_token_accuracy": 0.4856084656084656, + "step": 8591 + }, + { + "epoch": 1.5928809788654061, + "grad_norm": 6.73828125, + "learning_rate": 8.407119021134595e-06, + "loss": 2.8308, + "mean_token_accuracy": 0.4684332464523603, + "step": 8592 + }, + { + "epoch": 1.593066370040786, + "grad_norm": 6.90625, + "learning_rate": 8.406933629959214e-06, + "loss": 2.9859, + "mean_token_accuracy": 0.4724200761967865, + "step": 8593 + }, + { + "epoch": 1.593251761216166, + "grad_norm": 7.609375, + "learning_rate": 8.406748238783834e-06, + "loss": 2.638, + "mean_token_accuracy": 0.4976752836153989, + "step": 8594 + }, + { + "epoch": 1.5934371523915463, + "grad_norm": 6.171875, + "learning_rate": 8.406562847608455e-06, + "loss": 2.7275, + "mean_token_accuracy": 0.4715752566992236, + "step": 8595 + }, + { + "epoch": 1.5936225435669262, + "grad_norm": 5.8359375, + "learning_rate": 8.406377456433075e-06, + "loss": 2.8831, + "mean_token_accuracy": 0.46126126126126127, + "step": 8596 + }, + { + "epoch": 1.5938079347423062, + "grad_norm": 8.09375, + "learning_rate": 8.406192065257694e-06, + "loss": 2.6847, + "mean_token_accuracy": 0.4756987316197104, + "step": 8597 + }, + { + "epoch": 1.5939933259176864, + "grad_norm": 6.10546875, + "learning_rate": 8.406006674082315e-06, + "loss": 3.2149, + "mean_token_accuracy": 0.4498239436619718, + "step": 8598 + }, + { + "epoch": 1.5941787170930664, + "grad_norm": 6.83984375, + "learning_rate": 8.405821282906933e-06, + "loss": 3.1685, + "mean_token_accuracy": 0.4303857566765579, + "step": 8599 + }, + { + "epoch": 1.5943641082684463, + "grad_norm": 8.2265625, + "learning_rate": 8.405635891731554e-06, + "loss": 2.7891, + "mean_token_accuracy": 0.4682016004492489, + "step": 8600 + }, + { + "epoch": 1.5945494994438265, + "grad_norm": 5.90234375, + "learning_rate": 8.405450500556174e-06, + "loss": 3.0223, + "mean_token_accuracy": 0.45625, + "step": 8601 + }, + { + "epoch": 1.5947348906192065, + "grad_norm": 8.15625, + "learning_rate": 8.405265109380795e-06, + "loss": 1.9168, + "mean_token_accuracy": 0.5838676583739421, + "step": 8602 + }, + { + "epoch": 1.5949202817945864, + "grad_norm": 8.2578125, + "learning_rate": 8.405079718205414e-06, + "loss": 2.7637, + "mean_token_accuracy": 0.4684414864115363, + "step": 8603 + }, + { + "epoch": 1.5951056729699666, + "grad_norm": 8.6875, + "learning_rate": 8.404894327030034e-06, + "loss": 2.8355, + "mean_token_accuracy": 0.45230197691389146, + "step": 8604 + }, + { + "epoch": 1.5952910641453468, + "grad_norm": 5.42578125, + "learning_rate": 8.404708935854655e-06, + "loss": 2.856, + "mean_token_accuracy": 0.4762416427889207, + "step": 8605 + }, + { + "epoch": 1.5954764553207266, + "grad_norm": 5.32421875, + "learning_rate": 8.404523544679273e-06, + "loss": 2.8356, + "mean_token_accuracy": 0.4732682473268247, + "step": 8606 + }, + { + "epoch": 1.5956618464961068, + "grad_norm": 8.9453125, + "learning_rate": 8.404338153503894e-06, + "loss": 2.7425, + "mean_token_accuracy": 0.47544318512060446, + "step": 8607 + }, + { + "epoch": 1.595847237671487, + "grad_norm": 7.50390625, + "learning_rate": 8.404152762328513e-06, + "loss": 2.9613, + "mean_token_accuracy": 0.4596247960848287, + "step": 8608 + }, + { + "epoch": 1.596032628846867, + "grad_norm": 5.5234375, + "learning_rate": 8.403967371153133e-06, + "loss": 2.5058, + "mean_token_accuracy": 0.493202258941644, + "step": 8609 + }, + { + "epoch": 1.5962180200222469, + "grad_norm": 7.07421875, + "learning_rate": 8.403781979977754e-06, + "loss": 2.7966, + "mean_token_accuracy": 0.4609459251270991, + "step": 8610 + }, + { + "epoch": 1.596403411197627, + "grad_norm": 10.0390625, + "learning_rate": 8.403596588802374e-06, + "loss": 3.0163, + "mean_token_accuracy": 0.43963520555877245, + "step": 8611 + }, + { + "epoch": 1.596588802373007, + "grad_norm": 5.94921875, + "learning_rate": 8.403411197626995e-06, + "loss": 2.8013, + "mean_token_accuracy": 0.46428034290870807, + "step": 8612 + }, + { + "epoch": 1.596774193548387, + "grad_norm": 8.03125, + "learning_rate": 8.403225806451613e-06, + "loss": 2.3093, + "mean_token_accuracy": 0.5013246982631734, + "step": 8613 + }, + { + "epoch": 1.5969595847237672, + "grad_norm": 6.82421875, + "learning_rate": 8.403040415276234e-06, + "loss": 3.2666, + "mean_token_accuracy": 0.4300713985720286, + "step": 8614 + }, + { + "epoch": 1.5971449758991472, + "grad_norm": 6.43359375, + "learning_rate": 8.402855024100853e-06, + "loss": 3.0618, + "mean_token_accuracy": 0.441352141314017, + "step": 8615 + }, + { + "epoch": 1.5973303670745271, + "grad_norm": 6.58203125, + "learning_rate": 8.402669632925473e-06, + "loss": 2.7172, + "mean_token_accuracy": 0.48085542322960684, + "step": 8616 + }, + { + "epoch": 1.5975157582499073, + "grad_norm": 7.0234375, + "learning_rate": 8.402484241750094e-06, + "loss": 3.0322, + "mean_token_accuracy": 0.4524315705554079, + "step": 8617 + }, + { + "epoch": 1.5977011494252875, + "grad_norm": 5.703125, + "learning_rate": 8.402298850574714e-06, + "loss": 2.7527, + "mean_token_accuracy": 0.4898522877386814, + "step": 8618 + }, + { + "epoch": 1.5978865406006673, + "grad_norm": 6.66796875, + "learning_rate": 8.402113459399333e-06, + "loss": 2.9781, + "mean_token_accuracy": 0.44206065421798435, + "step": 8619 + }, + { + "epoch": 1.5980719317760474, + "grad_norm": 6.5078125, + "learning_rate": 8.401928068223954e-06, + "loss": 3.499, + "mean_token_accuracy": 0.4003276897870016, + "step": 8620 + }, + { + "epoch": 1.5982573229514276, + "grad_norm": 7.4296875, + "learning_rate": 8.401742677048574e-06, + "loss": 2.7849, + "mean_token_accuracy": 0.47256977863330124, + "step": 8621 + }, + { + "epoch": 1.5984427141268076, + "grad_norm": 8.2578125, + "learning_rate": 8.401557285873193e-06, + "loss": 2.9765, + "mean_token_accuracy": 0.46650373778033355, + "step": 8622 + }, + { + "epoch": 1.5986281053021876, + "grad_norm": 8.53125, + "learning_rate": 8.401371894697813e-06, + "loss": 3.6432, + "mean_token_accuracy": 0.4119150080688542, + "step": 8623 + }, + { + "epoch": 1.5988134964775678, + "grad_norm": 10.1875, + "learning_rate": 8.401186503522432e-06, + "loss": 2.8262, + "mean_token_accuracy": 0.4548213546696096, + "step": 8624 + }, + { + "epoch": 1.5989988876529477, + "grad_norm": 5.5703125, + "learning_rate": 8.401001112347053e-06, + "loss": 3.2812, + "mean_token_accuracy": 0.42887624466571833, + "step": 8625 + }, + { + "epoch": 1.5991842788283277, + "grad_norm": 7.80859375, + "learning_rate": 8.400815721171673e-06, + "loss": 2.3344, + "mean_token_accuracy": 0.5284240825137922, + "step": 8626 + }, + { + "epoch": 1.5993696700037079, + "grad_norm": 6.8515625, + "learning_rate": 8.400630329996294e-06, + "loss": 3.0744, + "mean_token_accuracy": 0.42608089260808923, + "step": 8627 + }, + { + "epoch": 1.5995550611790879, + "grad_norm": 6.3515625, + "learning_rate": 8.400444938820912e-06, + "loss": 2.4912, + "mean_token_accuracy": 0.5222658808120497, + "step": 8628 + }, + { + "epoch": 1.5997404523544678, + "grad_norm": 6.5078125, + "learning_rate": 8.400259547645533e-06, + "loss": 2.8949, + "mean_token_accuracy": 0.436965645888388, + "step": 8629 + }, + { + "epoch": 1.599925843529848, + "grad_norm": 6.05078125, + "learning_rate": 8.400074156470153e-06, + "loss": 2.8493, + "mean_token_accuracy": 0.45618312339157524, + "step": 8630 + }, + { + "epoch": 1.600111234705228, + "grad_norm": 6.4140625, + "learning_rate": 8.399888765294772e-06, + "loss": 2.4854, + "mean_token_accuracy": 0.5345849802371542, + "step": 8631 + }, + { + "epoch": 1.600296625880608, + "grad_norm": 5.80078125, + "learning_rate": 8.399703374119393e-06, + "loss": 2.8014, + "mean_token_accuracy": 0.4791158317783838, + "step": 8632 + }, + { + "epoch": 1.6004820170559881, + "grad_norm": 5.7734375, + "learning_rate": 8.399517982944011e-06, + "loss": 2.5197, + "mean_token_accuracy": 0.5085915699809076, + "step": 8633 + }, + { + "epoch": 1.6006674082313683, + "grad_norm": 7.2421875, + "learning_rate": 8.399332591768634e-06, + "loss": 2.7264, + "mean_token_accuracy": 0.4726089588377724, + "step": 8634 + }, + { + "epoch": 1.600852799406748, + "grad_norm": 7.265625, + "learning_rate": 8.399147200593252e-06, + "loss": 2.6153, + "mean_token_accuracy": 0.4830630919890833, + "step": 8635 + }, + { + "epoch": 1.6010381905821283, + "grad_norm": 6.0234375, + "learning_rate": 8.398961809417873e-06, + "loss": 2.7931, + "mean_token_accuracy": 0.4743985194324491, + "step": 8636 + }, + { + "epoch": 1.6012235817575085, + "grad_norm": 8.296875, + "learning_rate": 8.398776418242492e-06, + "loss": 3.1076, + "mean_token_accuracy": 0.42414355628058725, + "step": 8637 + }, + { + "epoch": 1.6014089729328884, + "grad_norm": 5.2734375, + "learning_rate": 8.398591027067112e-06, + "loss": 2.9813, + "mean_token_accuracy": 0.4670192906036092, + "step": 8638 + }, + { + "epoch": 1.6015943641082684, + "grad_norm": 5.3359375, + "learning_rate": 8.398405635891733e-06, + "loss": 3.1805, + "mean_token_accuracy": 0.4517241379310345, + "step": 8639 + }, + { + "epoch": 1.6017797552836486, + "grad_norm": 6.87890625, + "learning_rate": 8.398220244716352e-06, + "loss": 3.3873, + "mean_token_accuracy": 0.4171504596032898, + "step": 8640 + }, + { + "epoch": 1.6019651464590285, + "grad_norm": 9.1953125, + "learning_rate": 8.398034853540972e-06, + "loss": 2.7982, + "mean_token_accuracy": 0.44830297470842617, + "step": 8641 + }, + { + "epoch": 1.6021505376344085, + "grad_norm": 6.109375, + "learning_rate": 8.397849462365592e-06, + "loss": 2.7956, + "mean_token_accuracy": 0.4911080711354309, + "step": 8642 + }, + { + "epoch": 1.6023359288097887, + "grad_norm": 6.6484375, + "learning_rate": 8.397664071190213e-06, + "loss": 2.8701, + "mean_token_accuracy": 0.47009860417467025, + "step": 8643 + }, + { + "epoch": 1.6025213199851687, + "grad_norm": 5.2734375, + "learning_rate": 8.397478680014832e-06, + "loss": 2.985, + "mean_token_accuracy": 0.4557747727776546, + "step": 8644 + }, + { + "epoch": 1.6027067111605486, + "grad_norm": 6.02734375, + "learning_rate": 8.397293288839452e-06, + "loss": 2.6442, + "mean_token_accuracy": 0.478779375657664, + "step": 8645 + }, + { + "epoch": 1.6028921023359288, + "grad_norm": 6.3828125, + "learning_rate": 8.397107897664071e-06, + "loss": 3.9453, + "mean_token_accuracy": 0.3714471968709257, + "step": 8646 + }, + { + "epoch": 1.603077493511309, + "grad_norm": 7.19921875, + "learning_rate": 8.396922506488692e-06, + "loss": 2.8062, + "mean_token_accuracy": 0.4599919039265956, + "step": 8647 + }, + { + "epoch": 1.6032628846866888, + "grad_norm": 8.4921875, + "learning_rate": 8.396737115313312e-06, + "loss": 2.2855, + "mean_token_accuracy": 0.5318315377081293, + "step": 8648 + }, + { + "epoch": 1.603448275862069, + "grad_norm": 6.734375, + "learning_rate": 8.396551724137931e-06, + "loss": 2.8772, + "mean_token_accuracy": 0.4736248590402205, + "step": 8649 + }, + { + "epoch": 1.6036336670374491, + "grad_norm": 6.16796875, + "learning_rate": 8.396366332962553e-06, + "loss": 2.7806, + "mean_token_accuracy": 0.4805728871242201, + "step": 8650 + }, + { + "epoch": 1.603819058212829, + "grad_norm": 6.96875, + "learning_rate": 8.396180941787172e-06, + "loss": 2.799, + "mean_token_accuracy": 0.49238253744718985, + "step": 8651 + }, + { + "epoch": 1.604004449388209, + "grad_norm": 5.7578125, + "learning_rate": 8.395995550611792e-06, + "loss": 2.5007, + "mean_token_accuracy": 0.5115071403281011, + "step": 8652 + }, + { + "epoch": 1.6041898405635893, + "grad_norm": 6.11328125, + "learning_rate": 8.395810159436411e-06, + "loss": 2.8007, + "mean_token_accuracy": 0.4666385135135135, + "step": 8653 + }, + { + "epoch": 1.6043752317389692, + "grad_norm": 5.76953125, + "learning_rate": 8.395624768261032e-06, + "loss": 2.6047, + "mean_token_accuracy": 0.47375565610859727, + "step": 8654 + }, + { + "epoch": 1.6045606229143492, + "grad_norm": 5.6796875, + "learning_rate": 8.395439377085652e-06, + "loss": 2.4953, + "mean_token_accuracy": 0.5057455350962204, + "step": 8655 + }, + { + "epoch": 1.6047460140897294, + "grad_norm": 7.8515625, + "learning_rate": 8.395253985910271e-06, + "loss": 3.532, + "mean_token_accuracy": 0.42178601720586456, + "step": 8656 + }, + { + "epoch": 1.6049314052651094, + "grad_norm": 10.2265625, + "learning_rate": 8.395068594734891e-06, + "loss": 3.0786, + "mean_token_accuracy": 0.4359581360578121, + "step": 8657 + }, + { + "epoch": 1.6051167964404893, + "grad_norm": 7.1015625, + "learning_rate": 8.394883203559512e-06, + "loss": 2.9055, + "mean_token_accuracy": 0.46379027853631893, + "step": 8658 + }, + { + "epoch": 1.6053021876158695, + "grad_norm": 8.84375, + "learning_rate": 8.394697812384132e-06, + "loss": 2.4032, + "mean_token_accuracy": 0.5116225546605293, + "step": 8659 + }, + { + "epoch": 1.6054875787912495, + "grad_norm": 7.60546875, + "learning_rate": 8.394512421208751e-06, + "loss": 3.6699, + "mean_token_accuracy": 0.40378951502061655, + "step": 8660 + }, + { + "epoch": 1.6056729699666294, + "grad_norm": 6.7890625, + "learning_rate": 8.394327030033372e-06, + "loss": 2.4313, + "mean_token_accuracy": 0.5136150234741784, + "step": 8661 + }, + { + "epoch": 1.6058583611420096, + "grad_norm": 7.60546875, + "learning_rate": 8.39414163885799e-06, + "loss": 3.4075, + "mean_token_accuracy": 0.4122950819672131, + "step": 8662 + }, + { + "epoch": 1.6060437523173898, + "grad_norm": 7.0390625, + "learning_rate": 8.393956247682611e-06, + "loss": 2.5689, + "mean_token_accuracy": 0.5111719763586565, + "step": 8663 + }, + { + "epoch": 1.6062291434927698, + "grad_norm": 7.70703125, + "learning_rate": 8.393770856507231e-06, + "loss": 2.6302, + "mean_token_accuracy": 0.48648273949812837, + "step": 8664 + }, + { + "epoch": 1.6064145346681498, + "grad_norm": 8.578125, + "learning_rate": 8.39358546533185e-06, + "loss": 2.7151, + "mean_token_accuracy": 0.4719166184134337, + "step": 8665 + }, + { + "epoch": 1.60659992584353, + "grad_norm": 5.765625, + "learning_rate": 8.39340007415647e-06, + "loss": 2.8652, + "mean_token_accuracy": 0.4518396649715824, + "step": 8666 + }, + { + "epoch": 1.60678531701891, + "grad_norm": 7.63671875, + "learning_rate": 8.393214682981091e-06, + "loss": 2.7526, + "mean_token_accuracy": 0.47163486333161425, + "step": 8667 + }, + { + "epoch": 1.6069707081942899, + "grad_norm": 10.890625, + "learning_rate": 8.393029291805712e-06, + "loss": 3.1586, + "mean_token_accuracy": 0.4612741210679355, + "step": 8668 + }, + { + "epoch": 1.60715609936967, + "grad_norm": 7.453125, + "learning_rate": 8.39284390063033e-06, + "loss": 2.7412, + "mean_token_accuracy": 0.49185611009493946, + "step": 8669 + }, + { + "epoch": 1.60734149054505, + "grad_norm": 9.6484375, + "learning_rate": 8.392658509454951e-06, + "loss": 2.5046, + "mean_token_accuracy": 0.49555254032865975, + "step": 8670 + }, + { + "epoch": 1.60752688172043, + "grad_norm": 6.19140625, + "learning_rate": 8.39247311827957e-06, + "loss": 3.1556, + "mean_token_accuracy": 0.4467073998642227, + "step": 8671 + }, + { + "epoch": 1.6077122728958102, + "grad_norm": 8.6171875, + "learning_rate": 8.39228772710419e-06, + "loss": 2.7868, + "mean_token_accuracy": 0.479288076862498, + "step": 8672 + }, + { + "epoch": 1.6078976640711902, + "grad_norm": 7.59375, + "learning_rate": 8.39210233592881e-06, + "loss": 3.2884, + "mean_token_accuracy": 0.44635845471817603, + "step": 8673 + }, + { + "epoch": 1.6080830552465701, + "grad_norm": 6.671875, + "learning_rate": 8.39191694475343e-06, + "loss": 2.6022, + "mean_token_accuracy": 0.48223615464994773, + "step": 8674 + }, + { + "epoch": 1.6082684464219503, + "grad_norm": 6.00390625, + "learning_rate": 8.39173155357805e-06, + "loss": 3.1312, + "mean_token_accuracy": 0.4388614580307871, + "step": 8675 + }, + { + "epoch": 1.6084538375973305, + "grad_norm": 6.68359375, + "learning_rate": 8.39154616240267e-06, + "loss": 3.131, + "mean_token_accuracy": 0.44995152690256907, + "step": 8676 + }, + { + "epoch": 1.6086392287727103, + "grad_norm": 6.55859375, + "learning_rate": 8.391360771227291e-06, + "loss": 3.7159, + "mean_token_accuracy": 0.39223663954255555, + "step": 8677 + }, + { + "epoch": 1.6088246199480905, + "grad_norm": 7.14453125, + "learning_rate": 8.39117538005191e-06, + "loss": 3.0674, + "mean_token_accuracy": 0.45244186046511625, + "step": 8678 + }, + { + "epoch": 1.6090100111234706, + "grad_norm": 6.9609375, + "learning_rate": 8.39098998887653e-06, + "loss": 2.9394, + "mean_token_accuracy": 0.47921419518377695, + "step": 8679 + }, + { + "epoch": 1.6091954022988506, + "grad_norm": 5.76171875, + "learning_rate": 8.390804597701149e-06, + "loss": 2.4442, + "mean_token_accuracy": 0.49082626850563077, + "step": 8680 + }, + { + "epoch": 1.6093807934742306, + "grad_norm": 5.94140625, + "learning_rate": 8.39061920652577e-06, + "loss": 3.21, + "mean_token_accuracy": 0.42822662601626016, + "step": 8681 + }, + { + "epoch": 1.6095661846496108, + "grad_norm": 6.2734375, + "learning_rate": 8.39043381535039e-06, + "loss": 2.7947, + "mean_token_accuracy": 0.46371158392434986, + "step": 8682 + }, + { + "epoch": 1.6097515758249907, + "grad_norm": 6.0, + "learning_rate": 8.39024842417501e-06, + "loss": 2.8765, + "mean_token_accuracy": 0.4587135293454996, + "step": 8683 + }, + { + "epoch": 1.6099369670003707, + "grad_norm": 8.234375, + "learning_rate": 8.39006303299963e-06, + "loss": 2.4366, + "mean_token_accuracy": 0.531859410430839, + "step": 8684 + }, + { + "epoch": 1.610122358175751, + "grad_norm": 6.0703125, + "learning_rate": 8.38987764182425e-06, + "loss": 3.1128, + "mean_token_accuracy": 0.4520777948789702, + "step": 8685 + }, + { + "epoch": 1.6103077493511309, + "grad_norm": 7.953125, + "learning_rate": 8.38969225064887e-06, + "loss": 2.8506, + "mean_token_accuracy": 0.5088055588834312, + "step": 8686 + }, + { + "epoch": 1.6104931405265108, + "grad_norm": 6.4609375, + "learning_rate": 8.38950685947349e-06, + "loss": 3.0663, + "mean_token_accuracy": 0.4762446297700278, + "step": 8687 + }, + { + "epoch": 1.610678531701891, + "grad_norm": 7.11328125, + "learning_rate": 8.38932146829811e-06, + "loss": 3.1893, + "mean_token_accuracy": 0.4541069459757442, + "step": 8688 + }, + { + "epoch": 1.6108639228772712, + "grad_norm": 6.16015625, + "learning_rate": 8.389136077122728e-06, + "loss": 2.7016, + "mean_token_accuracy": 0.48244810744810745, + "step": 8689 + }, + { + "epoch": 1.611049314052651, + "grad_norm": 8.1875, + "learning_rate": 8.388950685947349e-06, + "loss": 3.0439, + "mean_token_accuracy": 0.4649674500717202, + "step": 8690 + }, + { + "epoch": 1.6112347052280311, + "grad_norm": 6.16796875, + "learning_rate": 8.38876529477197e-06, + "loss": 2.8254, + "mean_token_accuracy": 0.4778393351800554, + "step": 8691 + }, + { + "epoch": 1.6114200964034113, + "grad_norm": 6.69140625, + "learning_rate": 8.38857990359659e-06, + "loss": 3.0551, + "mean_token_accuracy": 0.46653543307086615, + "step": 8692 + }, + { + "epoch": 1.6116054875787913, + "grad_norm": 5.71484375, + "learning_rate": 8.38839451242121e-06, + "loss": 3.0346, + "mean_token_accuracy": 0.45984102503262547, + "step": 8693 + }, + { + "epoch": 1.6117908787541713, + "grad_norm": 7.88671875, + "learning_rate": 8.38820912124583e-06, + "loss": 2.7222, + "mean_token_accuracy": 0.45812518366147514, + "step": 8694 + }, + { + "epoch": 1.6119762699295515, + "grad_norm": 7.54296875, + "learning_rate": 8.38802373007045e-06, + "loss": 3.2794, + "mean_token_accuracy": 0.42385001932740624, + "step": 8695 + }, + { + "epoch": 1.6121616611049314, + "grad_norm": 6.4609375, + "learning_rate": 8.387838338895069e-06, + "loss": 2.9902, + "mean_token_accuracy": 0.46371769383697814, + "step": 8696 + }, + { + "epoch": 1.6123470522803114, + "grad_norm": 6.08203125, + "learning_rate": 8.387652947719689e-06, + "loss": 3.1507, + "mean_token_accuracy": 0.43814016172506737, + "step": 8697 + }, + { + "epoch": 1.6125324434556916, + "grad_norm": 6.73046875, + "learning_rate": 8.38746755654431e-06, + "loss": 2.5095, + "mean_token_accuracy": 0.4918279569892473, + "step": 8698 + }, + { + "epoch": 1.6127178346310715, + "grad_norm": 8.21875, + "learning_rate": 8.38728216536893e-06, + "loss": 2.7553, + "mean_token_accuracy": 0.46977150978462884, + "step": 8699 + }, + { + "epoch": 1.6129032258064515, + "grad_norm": 6.3671875, + "learning_rate": 8.387096774193549e-06, + "loss": 2.8277, + "mean_token_accuracy": 0.4714035964035964, + "step": 8700 + }, + { + "epoch": 1.6130886169818317, + "grad_norm": 6.125, + "learning_rate": 8.38691138301817e-06, + "loss": 2.6838, + "mean_token_accuracy": 0.47739955357142855, + "step": 8701 + }, + { + "epoch": 1.6132740081572117, + "grad_norm": 6.51953125, + "learning_rate": 8.38672599184279e-06, + "loss": 2.8808, + "mean_token_accuracy": 0.46365584308554475, + "step": 8702 + }, + { + "epoch": 1.6134593993325916, + "grad_norm": 10.5625, + "learning_rate": 8.386540600667409e-06, + "loss": 2.836, + "mean_token_accuracy": 0.46285093842097097, + "step": 8703 + }, + { + "epoch": 1.6136447905079718, + "grad_norm": 9.5234375, + "learning_rate": 8.386355209492029e-06, + "loss": 3.6062, + "mean_token_accuracy": 0.4108641975308642, + "step": 8704 + }, + { + "epoch": 1.613830181683352, + "grad_norm": 7.9140625, + "learning_rate": 8.386169818316648e-06, + "loss": 2.6586, + "mean_token_accuracy": 0.48470106260401996, + "step": 8705 + }, + { + "epoch": 1.6140155728587318, + "grad_norm": 8.484375, + "learning_rate": 8.385984427141268e-06, + "loss": 2.3907, + "mean_token_accuracy": 0.5153913808267371, + "step": 8706 + }, + { + "epoch": 1.614200964034112, + "grad_norm": 11.515625, + "learning_rate": 8.385799035965889e-06, + "loss": 2.8371, + "mean_token_accuracy": 0.46099205393691306, + "step": 8707 + }, + { + "epoch": 1.6143863552094921, + "grad_norm": 8.1015625, + "learning_rate": 8.38561364479051e-06, + "loss": 2.6495, + "mean_token_accuracy": 0.4885974914481186, + "step": 8708 + }, + { + "epoch": 1.614571746384872, + "grad_norm": 9.6015625, + "learning_rate": 8.385428253615128e-06, + "loss": 2.6431, + "mean_token_accuracy": 0.48355736917906866, + "step": 8709 + }, + { + "epoch": 1.614757137560252, + "grad_norm": 6.125, + "learning_rate": 8.385242862439749e-06, + "loss": 3.2684, + "mean_token_accuracy": 0.43847361429779014, + "step": 8710 + }, + { + "epoch": 1.6149425287356323, + "grad_norm": 8.5, + "learning_rate": 8.385057471264369e-06, + "loss": 3.1125, + "mean_token_accuracy": 0.445178521225752, + "step": 8711 + }, + { + "epoch": 1.6151279199110122, + "grad_norm": 10.0546875, + "learning_rate": 8.384872080088988e-06, + "loss": 3.1396, + "mean_token_accuracy": 0.4551998774697503, + "step": 8712 + }, + { + "epoch": 1.6153133110863922, + "grad_norm": 8.828125, + "learning_rate": 8.384686688913608e-06, + "loss": 3.1585, + "mean_token_accuracy": 0.435092180546726, + "step": 8713 + }, + { + "epoch": 1.6154987022617724, + "grad_norm": 7.0625, + "learning_rate": 8.384501297738227e-06, + "loss": 2.3264, + "mean_token_accuracy": 0.5415637860082304, + "step": 8714 + }, + { + "epoch": 1.6156840934371524, + "grad_norm": 8.359375, + "learning_rate": 8.38431590656285e-06, + "loss": 3.3887, + "mean_token_accuracy": 0.4264190154565528, + "step": 8715 + }, + { + "epoch": 1.6158694846125323, + "grad_norm": 7.10546875, + "learning_rate": 8.384130515387468e-06, + "loss": 3.2293, + "mean_token_accuracy": 0.43281121187139326, + "step": 8716 + }, + { + "epoch": 1.6160548757879125, + "grad_norm": 6.85546875, + "learning_rate": 8.383945124212089e-06, + "loss": 2.7756, + "mean_token_accuracy": 0.44758016092230096, + "step": 8717 + }, + { + "epoch": 1.6162402669632927, + "grad_norm": 6.76953125, + "learning_rate": 8.383759733036707e-06, + "loss": 2.3221, + "mean_token_accuracy": 0.5683209341745927, + "step": 8718 + }, + { + "epoch": 1.6164256581386724, + "grad_norm": 6.68359375, + "learning_rate": 8.383574341861328e-06, + "loss": 2.8779, + "mean_token_accuracy": 0.46162458017166313, + "step": 8719 + }, + { + "epoch": 1.6166110493140526, + "grad_norm": 6.10546875, + "learning_rate": 8.383388950685948e-06, + "loss": 2.7845, + "mean_token_accuracy": 0.47326709250059146, + "step": 8720 + }, + { + "epoch": 1.6167964404894328, + "grad_norm": 7.09375, + "learning_rate": 8.383203559510567e-06, + "loss": 2.6524, + "mean_token_accuracy": 0.4699955548970218, + "step": 8721 + }, + { + "epoch": 1.6169818316648128, + "grad_norm": 5.2265625, + "learning_rate": 8.383018168335188e-06, + "loss": 2.3545, + "mean_token_accuracy": 0.5278416347381865, + "step": 8722 + }, + { + "epoch": 1.6171672228401928, + "grad_norm": 8.4921875, + "learning_rate": 8.382832777159808e-06, + "loss": 3.2611, + "mean_token_accuracy": 0.44083384426732064, + "step": 8723 + }, + { + "epoch": 1.617352614015573, + "grad_norm": 5.8359375, + "learning_rate": 8.382647385984429e-06, + "loss": 3.1706, + "mean_token_accuracy": 0.4355317884729649, + "step": 8724 + }, + { + "epoch": 1.617538005190953, + "grad_norm": 5.6640625, + "learning_rate": 8.382461994809048e-06, + "loss": 2.5865, + "mean_token_accuracy": 0.4916753381893861, + "step": 8725 + }, + { + "epoch": 1.6177233963663329, + "grad_norm": 6.6015625, + "learning_rate": 8.382276603633668e-06, + "loss": 3.1512, + "mean_token_accuracy": 0.4276888959290353, + "step": 8726 + }, + { + "epoch": 1.617908787541713, + "grad_norm": 5.3671875, + "learning_rate": 8.382091212458287e-06, + "loss": 3.2208, + "mean_token_accuracy": 0.4450572177030194, + "step": 8727 + }, + { + "epoch": 1.618094178717093, + "grad_norm": 6.67578125, + "learning_rate": 8.381905821282907e-06, + "loss": 3.0537, + "mean_token_accuracy": 0.44220616838010557, + "step": 8728 + }, + { + "epoch": 1.618279569892473, + "grad_norm": 4.9375, + "learning_rate": 8.381720430107528e-06, + "loss": 2.736, + "mean_token_accuracy": 0.4629345904537419, + "step": 8729 + }, + { + "epoch": 1.6184649610678532, + "grad_norm": 6.19140625, + "learning_rate": 8.381535038932147e-06, + "loss": 3.5557, + "mean_token_accuracy": 0.40139073827489274, + "step": 8730 + }, + { + "epoch": 1.6186503522432332, + "grad_norm": 9.59375, + "learning_rate": 8.381349647756769e-06, + "loss": 3.157, + "mean_token_accuracy": 0.4577866954776972, + "step": 8731 + }, + { + "epoch": 1.6188357434186131, + "grad_norm": 5.96484375, + "learning_rate": 8.381164256581388e-06, + "loss": 3.4271, + "mean_token_accuracy": 0.4199318568994889, + "step": 8732 + }, + { + "epoch": 1.6190211345939933, + "grad_norm": 6.2578125, + "learning_rate": 8.380978865406008e-06, + "loss": 2.7057, + "mean_token_accuracy": 0.49382030273573113, + "step": 8733 + }, + { + "epoch": 1.6192065257693735, + "grad_norm": 5.3515625, + "learning_rate": 8.380793474230627e-06, + "loss": 3.178, + "mean_token_accuracy": 0.4324363636363636, + "step": 8734 + }, + { + "epoch": 1.6193919169447535, + "grad_norm": 6.03515625, + "learning_rate": 8.380608083055247e-06, + "loss": 2.7252, + "mean_token_accuracy": 0.4905340122731427, + "step": 8735 + }, + { + "epoch": 1.6195773081201335, + "grad_norm": 6.12109375, + "learning_rate": 8.380422691879868e-06, + "loss": 2.5136, + "mean_token_accuracy": 0.4964589235127479, + "step": 8736 + }, + { + "epoch": 1.6197626992955136, + "grad_norm": 6.15234375, + "learning_rate": 8.380237300704487e-06, + "loss": 2.1226, + "mean_token_accuracy": 0.519751327548245, + "step": 8737 + }, + { + "epoch": 1.6199480904708936, + "grad_norm": 5.91015625, + "learning_rate": 8.380051909529107e-06, + "loss": 3.3436, + "mean_token_accuracy": 0.42542306178669814, + "step": 8738 + }, + { + "epoch": 1.6201334816462736, + "grad_norm": 5.9296875, + "learning_rate": 8.379866518353728e-06, + "loss": 2.8995, + "mean_token_accuracy": 0.4425174825174825, + "step": 8739 + }, + { + "epoch": 1.6203188728216538, + "grad_norm": 6.43359375, + "learning_rate": 8.379681127178348e-06, + "loss": 3.3522, + "mean_token_accuracy": 0.42960832648589425, + "step": 8740 + }, + { + "epoch": 1.6205042639970337, + "grad_norm": 6.71484375, + "learning_rate": 8.379495736002967e-06, + "loss": 3.2208, + "mean_token_accuracy": 0.43608297153883263, + "step": 8741 + }, + { + "epoch": 1.6206896551724137, + "grad_norm": 6.06640625, + "learning_rate": 8.379310344827587e-06, + "loss": 2.7103, + "mean_token_accuracy": 0.47791103689084563, + "step": 8742 + }, + { + "epoch": 1.620875046347794, + "grad_norm": 8.6640625, + "learning_rate": 8.379124953652206e-06, + "loss": 2.8738, + "mean_token_accuracy": 0.47170081673592207, + "step": 8743 + }, + { + "epoch": 1.6210604375231739, + "grad_norm": 8.7421875, + "learning_rate": 8.378939562476827e-06, + "loss": 2.364, + "mean_token_accuracy": 0.5155636143850105, + "step": 8744 + }, + { + "epoch": 1.6212458286985538, + "grad_norm": 6.3203125, + "learning_rate": 8.378754171301447e-06, + "loss": 2.8848, + "mean_token_accuracy": 0.4480376304571513, + "step": 8745 + }, + { + "epoch": 1.621431219873934, + "grad_norm": 8.25, + "learning_rate": 8.378568780126066e-06, + "loss": 3.3542, + "mean_token_accuracy": 0.41583214115402956, + "step": 8746 + }, + { + "epoch": 1.6216166110493142, + "grad_norm": 14.15625, + "learning_rate": 8.378383388950686e-06, + "loss": 2.6231, + "mean_token_accuracy": 0.4831294030404153, + "step": 8747 + }, + { + "epoch": 1.621802002224694, + "grad_norm": 6.4765625, + "learning_rate": 8.378197997775307e-06, + "loss": 3.0147, + "mean_token_accuracy": 0.45090361445783134, + "step": 8748 + }, + { + "epoch": 1.6219873934000741, + "grad_norm": 6.609375, + "learning_rate": 8.378012606599927e-06, + "loss": 2.9997, + "mean_token_accuracy": 0.4414651002073255, + "step": 8749 + }, + { + "epoch": 1.6221727845754543, + "grad_norm": 7.109375, + "learning_rate": 8.377827215424546e-06, + "loss": 3.2786, + "mean_token_accuracy": 0.41841941505911634, + "step": 8750 + }, + { + "epoch": 1.6223581757508343, + "grad_norm": 6.85546875, + "learning_rate": 8.377641824249167e-06, + "loss": 2.424, + "mean_token_accuracy": 0.49204898556022664, + "step": 8751 + }, + { + "epoch": 1.6225435669262143, + "grad_norm": 5.43359375, + "learning_rate": 8.377456433073786e-06, + "loss": 3.1259, + "mean_token_accuracy": 0.43714220288527594, + "step": 8752 + }, + { + "epoch": 1.6227289581015945, + "grad_norm": 6.56640625, + "learning_rate": 8.377271041898406e-06, + "loss": 2.7814, + "mean_token_accuracy": 0.4806212596181248, + "step": 8753 + }, + { + "epoch": 1.6229143492769744, + "grad_norm": 6.921875, + "learning_rate": 8.377085650723027e-06, + "loss": 2.8349, + "mean_token_accuracy": 0.4693958141371594, + "step": 8754 + }, + { + "epoch": 1.6230997404523544, + "grad_norm": 7.81640625, + "learning_rate": 8.376900259547647e-06, + "loss": 3.27, + "mean_token_accuracy": 0.4076861058855146, + "step": 8755 + }, + { + "epoch": 1.6232851316277346, + "grad_norm": 6.58984375, + "learning_rate": 8.376714868372266e-06, + "loss": 3.212, + "mean_token_accuracy": 0.43562650740783276, + "step": 8756 + }, + { + "epoch": 1.6234705228031145, + "grad_norm": 6.60546875, + "learning_rate": 8.376529477196886e-06, + "loss": 2.5984, + "mean_token_accuracy": 0.4968465311843027, + "step": 8757 + }, + { + "epoch": 1.6236559139784945, + "grad_norm": 6.484375, + "learning_rate": 8.376344086021507e-06, + "loss": 2.6933, + "mean_token_accuracy": 0.49328897556498796, + "step": 8758 + }, + { + "epoch": 1.6238413051538747, + "grad_norm": 10.6484375, + "learning_rate": 8.376158694846126e-06, + "loss": 3.0886, + "mean_token_accuracy": 0.44981729598051157, + "step": 8759 + }, + { + "epoch": 1.624026696329255, + "grad_norm": 11.0390625, + "learning_rate": 8.375973303670746e-06, + "loss": 2.8943, + "mean_token_accuracy": 0.45137236236712674, + "step": 8760 + }, + { + "epoch": 1.6242120875046346, + "grad_norm": 9.7578125, + "learning_rate": 8.375787912495365e-06, + "loss": 2.6662, + "mean_token_accuracy": 0.4861612515042118, + "step": 8761 + }, + { + "epoch": 1.6243974786800148, + "grad_norm": 6.58984375, + "learning_rate": 8.375602521319985e-06, + "loss": 2.6576, + "mean_token_accuracy": 0.5018670649738611, + "step": 8762 + }, + { + "epoch": 1.624582869855395, + "grad_norm": 8.7109375, + "learning_rate": 8.375417130144606e-06, + "loss": 2.7496, + "mean_token_accuracy": 0.47197558268590456, + "step": 8763 + }, + { + "epoch": 1.624768261030775, + "grad_norm": 8.265625, + "learning_rate": 8.375231738969226e-06, + "loss": 2.6224, + "mean_token_accuracy": 0.4936582442824328, + "step": 8764 + }, + { + "epoch": 1.624953652206155, + "grad_norm": 13.171875, + "learning_rate": 8.375046347793845e-06, + "loss": 2.835, + "mean_token_accuracy": 0.44433552723708164, + "step": 8765 + }, + { + "epoch": 1.6251390433815351, + "grad_norm": 9.03125, + "learning_rate": 8.374860956618466e-06, + "loss": 3.2026, + "mean_token_accuracy": 0.4111567821491486, + "step": 8766 + }, + { + "epoch": 1.625324434556915, + "grad_norm": 5.8359375, + "learning_rate": 8.374675565443086e-06, + "loss": 2.7225, + "mean_token_accuracy": 0.45979899497487436, + "step": 8767 + }, + { + "epoch": 1.625509825732295, + "grad_norm": 8.3671875, + "learning_rate": 8.374490174267705e-06, + "loss": 3.1076, + "mean_token_accuracy": 0.4501593740944654, + "step": 8768 + }, + { + "epoch": 1.6256952169076753, + "grad_norm": 10.90625, + "learning_rate": 8.374304783092325e-06, + "loss": 2.8233, + "mean_token_accuracy": 0.5003565825131936, + "step": 8769 + }, + { + "epoch": 1.6258806080830552, + "grad_norm": 8.6875, + "learning_rate": 8.374119391916944e-06, + "loss": 2.4493, + "mean_token_accuracy": 0.5243741765480896, + "step": 8770 + }, + { + "epoch": 1.6260659992584352, + "grad_norm": 5.45703125, + "learning_rate": 8.373934000741566e-06, + "loss": 3.3252, + "mean_token_accuracy": 0.446087786259542, + "step": 8771 + }, + { + "epoch": 1.6262513904338154, + "grad_norm": 8.2109375, + "learning_rate": 8.373748609566185e-06, + "loss": 3.1779, + "mean_token_accuracy": 0.43308832108272083, + "step": 8772 + }, + { + "epoch": 1.6264367816091954, + "grad_norm": 7.32421875, + "learning_rate": 8.373563218390806e-06, + "loss": 2.9791, + "mean_token_accuracy": 0.44981810400171196, + "step": 8773 + }, + { + "epoch": 1.6266221727845753, + "grad_norm": 6.26953125, + "learning_rate": 8.373377827215426e-06, + "loss": 3.0773, + "mean_token_accuracy": 0.4402861860209136, + "step": 8774 + }, + { + "epoch": 1.6268075639599555, + "grad_norm": 6.31640625, + "learning_rate": 8.373192436040045e-06, + "loss": 2.91, + "mean_token_accuracy": 0.48240880638894884, + "step": 8775 + }, + { + "epoch": 1.6269929551353357, + "grad_norm": 6.21875, + "learning_rate": 8.373007044864665e-06, + "loss": 2.7576, + "mean_token_accuracy": 0.4987667854206632, + "step": 8776 + }, + { + "epoch": 1.6271783463107155, + "grad_norm": 6.1875, + "learning_rate": 8.372821653689284e-06, + "loss": 2.7194, + "mean_token_accuracy": 0.48885793871866295, + "step": 8777 + }, + { + "epoch": 1.6273637374860956, + "grad_norm": 7.65234375, + "learning_rate": 8.372636262513905e-06, + "loss": 3.0453, + "mean_token_accuracy": 0.4468552240733566, + "step": 8778 + }, + { + "epoch": 1.6275491286614758, + "grad_norm": 6.76953125, + "learning_rate": 8.372450871338525e-06, + "loss": 3.1302, + "mean_token_accuracy": 0.4250386398763524, + "step": 8779 + }, + { + "epoch": 1.6277345198368558, + "grad_norm": 8.671875, + "learning_rate": 8.372265480163146e-06, + "loss": 2.8536, + "mean_token_accuracy": 0.4679523539421441, + "step": 8780 + }, + { + "epoch": 1.6279199110122358, + "grad_norm": 7.38671875, + "learning_rate": 8.372080088987765e-06, + "loss": 3.6165, + "mean_token_accuracy": 0.41606929510155316, + "step": 8781 + }, + { + "epoch": 1.628105302187616, + "grad_norm": 6.7109375, + "learning_rate": 8.371894697812385e-06, + "loss": 2.4377, + "mean_token_accuracy": 0.500449121006031, + "step": 8782 + }, + { + "epoch": 1.628290693362996, + "grad_norm": 8.25, + "learning_rate": 8.371709306637006e-06, + "loss": 2.4435, + "mean_token_accuracy": 0.48975055569276366, + "step": 8783 + }, + { + "epoch": 1.628476084538376, + "grad_norm": 5.265625, + "learning_rate": 8.371523915461624e-06, + "loss": 2.4908, + "mean_token_accuracy": 0.5074250490333426, + "step": 8784 + }, + { + "epoch": 1.628661475713756, + "grad_norm": 6.76171875, + "learning_rate": 8.371338524286245e-06, + "loss": 2.6093, + "mean_token_accuracy": 0.48430634023854363, + "step": 8785 + }, + { + "epoch": 1.628846866889136, + "grad_norm": 8.234375, + "learning_rate": 8.371153133110864e-06, + "loss": 2.659, + "mean_token_accuracy": 0.4823463478423314, + "step": 8786 + }, + { + "epoch": 1.629032258064516, + "grad_norm": 6.19140625, + "learning_rate": 8.370967741935484e-06, + "loss": 2.8055, + "mean_token_accuracy": 0.4695269526952695, + "step": 8787 + }, + { + "epoch": 1.6292176492398962, + "grad_norm": 6.6640625, + "learning_rate": 8.370782350760105e-06, + "loss": 2.8646, + "mean_token_accuracy": 0.4707775489186406, + "step": 8788 + }, + { + "epoch": 1.6294030404152764, + "grad_norm": 8.5234375, + "learning_rate": 8.370596959584725e-06, + "loss": 3.1852, + "mean_token_accuracy": 0.4490778970547757, + "step": 8789 + }, + { + "epoch": 1.6295884315906561, + "grad_norm": 6.04296875, + "learning_rate": 8.370411568409344e-06, + "loss": 2.8226, + "mean_token_accuracy": 0.4623541887592789, + "step": 8790 + }, + { + "epoch": 1.6297738227660363, + "grad_norm": 5.87109375, + "learning_rate": 8.370226177233964e-06, + "loss": 3.2292, + "mean_token_accuracy": 0.4213668499607227, + "step": 8791 + }, + { + "epoch": 1.6299592139414165, + "grad_norm": 6.69921875, + "learning_rate": 8.370040786058585e-06, + "loss": 3.3587, + "mean_token_accuracy": 0.4348685022842326, + "step": 8792 + }, + { + "epoch": 1.6301446051167965, + "grad_norm": 6.5234375, + "learning_rate": 8.369855394883204e-06, + "loss": 3.2033, + "mean_token_accuracy": 0.44258752341161217, + "step": 8793 + }, + { + "epoch": 1.6303299962921765, + "grad_norm": 8.8515625, + "learning_rate": 8.369670003707824e-06, + "loss": 2.2662, + "mean_token_accuracy": 0.5220538030861669, + "step": 8794 + }, + { + "epoch": 1.6305153874675566, + "grad_norm": 9.2421875, + "learning_rate": 8.369484612532443e-06, + "loss": 2.6571, + "mean_token_accuracy": 0.4830764581444545, + "step": 8795 + }, + { + "epoch": 1.6307007786429366, + "grad_norm": 9.625, + "learning_rate": 8.369299221357065e-06, + "loss": 2.9988, + "mean_token_accuracy": 0.4165642286416718, + "step": 8796 + }, + { + "epoch": 1.6308861698183166, + "grad_norm": 6.46484375, + "learning_rate": 8.369113830181684e-06, + "loss": 2.7927, + "mean_token_accuracy": 0.4953470959460905, + "step": 8797 + }, + { + "epoch": 1.6310715609936968, + "grad_norm": 8.6640625, + "learning_rate": 8.368928439006304e-06, + "loss": 2.7785, + "mean_token_accuracy": 0.49072418417523467, + "step": 8798 + }, + { + "epoch": 1.6312569521690767, + "grad_norm": 5.83984375, + "learning_rate": 8.368743047830923e-06, + "loss": 3.5161, + "mean_token_accuracy": 0.39652618823212604, + "step": 8799 + }, + { + "epoch": 1.6314423433444567, + "grad_norm": 5.61328125, + "learning_rate": 8.368557656655544e-06, + "loss": 2.7975, + "mean_token_accuracy": 0.46272054638588506, + "step": 8800 + }, + { + "epoch": 1.631627734519837, + "grad_norm": 5.81640625, + "learning_rate": 8.368372265480164e-06, + "loss": 2.7163, + "mean_token_accuracy": 0.5060637820032939, + "step": 8801 + }, + { + "epoch": 1.6318131256952169, + "grad_norm": 5.83203125, + "learning_rate": 8.368186874304783e-06, + "loss": 3.2057, + "mean_token_accuracy": 0.4193423597678917, + "step": 8802 + }, + { + "epoch": 1.6319985168705968, + "grad_norm": 5.89453125, + "learning_rate": 8.368001483129404e-06, + "loss": 3.3967, + "mean_token_accuracy": 0.4403158853903007, + "step": 8803 + }, + { + "epoch": 1.632183908045977, + "grad_norm": 6.31640625, + "learning_rate": 8.367816091954024e-06, + "loss": 2.6812, + "mean_token_accuracy": 0.49637571730594987, + "step": 8804 + }, + { + "epoch": 1.6323692992213572, + "grad_norm": 6.69140625, + "learning_rate": 8.367630700778644e-06, + "loss": 3.6396, + "mean_token_accuracy": 0.4328578455484506, + "step": 8805 + }, + { + "epoch": 1.632554690396737, + "grad_norm": 7.94921875, + "learning_rate": 8.367445309603263e-06, + "loss": 2.6067, + "mean_token_accuracy": 0.49641611778380473, + "step": 8806 + }, + { + "epoch": 1.6327400815721171, + "grad_norm": 6.7734375, + "learning_rate": 8.367259918427884e-06, + "loss": 2.9741, + "mean_token_accuracy": 0.46098868374032165, + "step": 8807 + }, + { + "epoch": 1.6329254727474973, + "grad_norm": 6.93359375, + "learning_rate": 8.367074527252503e-06, + "loss": 2.9466, + "mean_token_accuracy": 0.44734325911760686, + "step": 8808 + }, + { + "epoch": 1.6331108639228773, + "grad_norm": 6.5859375, + "learning_rate": 8.366889136077123e-06, + "loss": 3.0028, + "mean_token_accuracy": 0.4577215878194671, + "step": 8809 + }, + { + "epoch": 1.6332962550982573, + "grad_norm": 6.29296875, + "learning_rate": 8.366703744901744e-06, + "loss": 2.8746, + "mean_token_accuracy": 0.4679935449166218, + "step": 8810 + }, + { + "epoch": 1.6334816462736375, + "grad_norm": 6.55859375, + "learning_rate": 8.366518353726362e-06, + "loss": 2.4088, + "mean_token_accuracy": 0.5062162162162163, + "step": 8811 + }, + { + "epoch": 1.6336670374490174, + "grad_norm": 6.4296875, + "learning_rate": 8.366332962550985e-06, + "loss": 3.1861, + "mean_token_accuracy": 0.42810539523212043, + "step": 8812 + }, + { + "epoch": 1.6338524286243974, + "grad_norm": 10.3125, + "learning_rate": 8.366147571375603e-06, + "loss": 2.6235, + "mean_token_accuracy": 0.5053533190578159, + "step": 8813 + }, + { + "epoch": 1.6340378197997776, + "grad_norm": 6.34375, + "learning_rate": 8.365962180200224e-06, + "loss": 2.6298, + "mean_token_accuracy": 0.505017629509086, + "step": 8814 + }, + { + "epoch": 1.6342232109751575, + "grad_norm": 6.93359375, + "learning_rate": 8.365776789024843e-06, + "loss": 3.0334, + "mean_token_accuracy": 0.4579090291921249, + "step": 8815 + }, + { + "epoch": 1.6344086021505375, + "grad_norm": 8.234375, + "learning_rate": 8.365591397849463e-06, + "loss": 3.137, + "mean_token_accuracy": 0.44664466446644663, + "step": 8816 + }, + { + "epoch": 1.6345939933259177, + "grad_norm": 8.2890625, + "learning_rate": 8.365406006674084e-06, + "loss": 2.7034, + "mean_token_accuracy": 0.4574607329842932, + "step": 8817 + }, + { + "epoch": 1.634779384501298, + "grad_norm": 5.84375, + "learning_rate": 8.365220615498702e-06, + "loss": 2.7892, + "mean_token_accuracy": 0.49075081610446136, + "step": 8818 + }, + { + "epoch": 1.6349647756766776, + "grad_norm": 8.28125, + "learning_rate": 8.365035224323323e-06, + "loss": 2.8311, + "mean_token_accuracy": 0.4548637159289822, + "step": 8819 + }, + { + "epoch": 1.6351501668520578, + "grad_norm": 8.3515625, + "learning_rate": 8.364849833147943e-06, + "loss": 2.2074, + "mean_token_accuracy": 0.5258861439312568, + "step": 8820 + }, + { + "epoch": 1.635335558027438, + "grad_norm": 6.03125, + "learning_rate": 8.364664441972564e-06, + "loss": 2.2845, + "mean_token_accuracy": 0.5308416100365917, + "step": 8821 + }, + { + "epoch": 1.635520949202818, + "grad_norm": 6.88671875, + "learning_rate": 8.364479050797183e-06, + "loss": 3.169, + "mean_token_accuracy": 0.43691473632331196, + "step": 8822 + }, + { + "epoch": 1.635706340378198, + "grad_norm": 6.11328125, + "learning_rate": 8.364293659621803e-06, + "loss": 2.4554, + "mean_token_accuracy": 0.5282762938230384, + "step": 8823 + }, + { + "epoch": 1.6358917315535781, + "grad_norm": 7.40625, + "learning_rate": 8.364108268446422e-06, + "loss": 2.809, + "mean_token_accuracy": 0.4753790839455995, + "step": 8824 + }, + { + "epoch": 1.636077122728958, + "grad_norm": 5.84375, + "learning_rate": 8.363922877271042e-06, + "loss": 2.6758, + "mean_token_accuracy": 0.4891379708805177, + "step": 8825 + }, + { + "epoch": 1.636262513904338, + "grad_norm": 6.34375, + "learning_rate": 8.363737486095663e-06, + "loss": 2.3031, + "mean_token_accuracy": 0.5506055363321799, + "step": 8826 + }, + { + "epoch": 1.6364479050797183, + "grad_norm": 8.828125, + "learning_rate": 8.363552094920282e-06, + "loss": 2.4782, + "mean_token_accuracy": 0.4932441654155862, + "step": 8827 + }, + { + "epoch": 1.6366332962550982, + "grad_norm": 6.453125, + "learning_rate": 8.363366703744902e-06, + "loss": 2.1661, + "mean_token_accuracy": 0.563860103626943, + "step": 8828 + }, + { + "epoch": 1.6368186874304782, + "grad_norm": 6.62890625, + "learning_rate": 8.363181312569523e-06, + "loss": 3.1639, + "mean_token_accuracy": 0.44660886090984675, + "step": 8829 + }, + { + "epoch": 1.6370040786058584, + "grad_norm": 8.0234375, + "learning_rate": 8.362995921394143e-06, + "loss": 3.4875, + "mean_token_accuracy": 0.41378768844221103, + "step": 8830 + }, + { + "epoch": 1.6371894697812384, + "grad_norm": 7.8203125, + "learning_rate": 8.362810530218762e-06, + "loss": 3.1557, + "mean_token_accuracy": 0.4383342840844267, + "step": 8831 + }, + { + "epoch": 1.6373748609566183, + "grad_norm": 7.421875, + "learning_rate": 8.362625139043383e-06, + "loss": 2.7168, + "mean_token_accuracy": 0.49100609756097563, + "step": 8832 + }, + { + "epoch": 1.6375602521319985, + "grad_norm": 10.0703125, + "learning_rate": 8.362439747868001e-06, + "loss": 2.4154, + "mean_token_accuracy": 0.5020499933871181, + "step": 8833 + }, + { + "epoch": 1.6377456433073787, + "grad_norm": 8.9453125, + "learning_rate": 8.362254356692622e-06, + "loss": 2.6216, + "mean_token_accuracy": 0.4719281790164813, + "step": 8834 + }, + { + "epoch": 1.6379310344827587, + "grad_norm": 5.97265625, + "learning_rate": 8.362068965517242e-06, + "loss": 2.6687, + "mean_token_accuracy": 0.48147727272727275, + "step": 8835 + }, + { + "epoch": 1.6381164256581386, + "grad_norm": 8.375, + "learning_rate": 8.361883574341863e-06, + "loss": 4.605, + "mean_token_accuracy": 0.38232301206570724, + "step": 8836 + }, + { + "epoch": 1.6383018168335188, + "grad_norm": 7.80859375, + "learning_rate": 8.361698183166482e-06, + "loss": 2.8881, + "mean_token_accuracy": 0.4531287461040518, + "step": 8837 + }, + { + "epoch": 1.6384872080088988, + "grad_norm": 7.25, + "learning_rate": 8.361512791991102e-06, + "loss": 3.9553, + "mean_token_accuracy": 0.3836397486618571, + "step": 8838 + }, + { + "epoch": 1.6386725991842788, + "grad_norm": 6.23046875, + "learning_rate": 8.361327400815723e-06, + "loss": 2.6187, + "mean_token_accuracy": 0.48896969696969694, + "step": 8839 + }, + { + "epoch": 1.638857990359659, + "grad_norm": 7.0546875, + "learning_rate": 8.361142009640341e-06, + "loss": 3.4368, + "mean_token_accuracy": 0.4477919402600747, + "step": 8840 + }, + { + "epoch": 1.639043381535039, + "grad_norm": 9.390625, + "learning_rate": 8.360956618464962e-06, + "loss": 2.6954, + "mean_token_accuracy": 0.5128427441205159, + "step": 8841 + }, + { + "epoch": 1.639228772710419, + "grad_norm": 6.6796875, + "learning_rate": 8.36077122728958e-06, + "loss": 3.0075, + "mean_token_accuracy": 0.45757471659223636, + "step": 8842 + }, + { + "epoch": 1.639414163885799, + "grad_norm": 6.75390625, + "learning_rate": 8.360585836114201e-06, + "loss": 3.1961, + "mean_token_accuracy": 0.438194723449846, + "step": 8843 + }, + { + "epoch": 1.639599555061179, + "grad_norm": 6.49609375, + "learning_rate": 8.360400444938822e-06, + "loss": 2.701, + "mean_token_accuracy": 0.47089678510998306, + "step": 8844 + }, + { + "epoch": 1.639784946236559, + "grad_norm": 6.171875, + "learning_rate": 8.360215053763442e-06, + "loss": 2.8426, + "mean_token_accuracy": 0.47523838818824976, + "step": 8845 + }, + { + "epoch": 1.6399703374119392, + "grad_norm": 5.82421875, + "learning_rate": 8.360029662588061e-06, + "loss": 2.545, + "mean_token_accuracy": 0.49304148088096206, + "step": 8846 + }, + { + "epoch": 1.6401557285873194, + "grad_norm": 7.23828125, + "learning_rate": 8.359844271412681e-06, + "loss": 3.3549, + "mean_token_accuracy": 0.45261522527187986, + "step": 8847 + }, + { + "epoch": 1.6403411197626991, + "grad_norm": 8.921875, + "learning_rate": 8.359658880237302e-06, + "loss": 2.5924, + "mean_token_accuracy": 0.47549407114624503, + "step": 8848 + }, + { + "epoch": 1.6405265109380793, + "grad_norm": 6.54296875, + "learning_rate": 8.35947348906192e-06, + "loss": 2.6524, + "mean_token_accuracy": 0.5071950662402924, + "step": 8849 + }, + { + "epoch": 1.6407119021134595, + "grad_norm": 7.1953125, + "learning_rate": 8.359288097886541e-06, + "loss": 2.6618, + "mean_token_accuracy": 0.47151931688014165, + "step": 8850 + }, + { + "epoch": 1.6408972932888395, + "grad_norm": 10.3125, + "learning_rate": 8.35910270671116e-06, + "loss": 2.7937, + "mean_token_accuracy": 0.4871468567065411, + "step": 8851 + }, + { + "epoch": 1.6410826844642195, + "grad_norm": 7.00390625, + "learning_rate": 8.358917315535782e-06, + "loss": 2.6599, + "mean_token_accuracy": 0.4843529743445165, + "step": 8852 + }, + { + "epoch": 1.6412680756395996, + "grad_norm": 6.8828125, + "learning_rate": 8.358731924360401e-06, + "loss": 2.6812, + "mean_token_accuracy": 0.48415596654494053, + "step": 8853 + }, + { + "epoch": 1.6414534668149796, + "grad_norm": 8.15625, + "learning_rate": 8.358546533185021e-06, + "loss": 1.9835, + "mean_token_accuracy": 0.546678870292887, + "step": 8854 + }, + { + "epoch": 1.6416388579903596, + "grad_norm": 9.6640625, + "learning_rate": 8.358361142009642e-06, + "loss": 2.8561, + "mean_token_accuracy": 0.4505708039834831, + "step": 8855 + }, + { + "epoch": 1.6418242491657398, + "grad_norm": 7.38671875, + "learning_rate": 8.35817575083426e-06, + "loss": 2.7305, + "mean_token_accuracy": 0.493120470538809, + "step": 8856 + }, + { + "epoch": 1.6420096403411197, + "grad_norm": 6.00390625, + "learning_rate": 8.357990359658881e-06, + "loss": 2.9431, + "mean_token_accuracy": 0.4471526195899772, + "step": 8857 + }, + { + "epoch": 1.6421950315164997, + "grad_norm": 7.5078125, + "learning_rate": 8.3578049684835e-06, + "loss": 2.9658, + "mean_token_accuracy": 0.44033176528843776, + "step": 8858 + }, + { + "epoch": 1.64238042269188, + "grad_norm": 9.3359375, + "learning_rate": 8.35761957730812e-06, + "loss": 2.7248, + "mean_token_accuracy": 0.47452819897458276, + "step": 8859 + }, + { + "epoch": 1.64256581386726, + "grad_norm": 5.60546875, + "learning_rate": 8.357434186132741e-06, + "loss": 2.8646, + "mean_token_accuracy": 0.4497469269703543, + "step": 8860 + }, + { + "epoch": 1.6427512050426398, + "grad_norm": 6.24609375, + "learning_rate": 8.357248794957362e-06, + "loss": 3.2009, + "mean_token_accuracy": 0.4492247520603436, + "step": 8861 + }, + { + "epoch": 1.64293659621802, + "grad_norm": 5.7265625, + "learning_rate": 8.35706340378198e-06, + "loss": 3.0196, + "mean_token_accuracy": 0.4486628793127338, + "step": 8862 + }, + { + "epoch": 1.6431219873934002, + "grad_norm": 7.84765625, + "learning_rate": 8.3568780126066e-06, + "loss": 2.9143, + "mean_token_accuracy": 0.4354253112033195, + "step": 8863 + }, + { + "epoch": 1.6433073785687802, + "grad_norm": 10.421875, + "learning_rate": 8.356692621431221e-06, + "loss": 3.671, + "mean_token_accuracy": 0.4350613154960981, + "step": 8864 + }, + { + "epoch": 1.6434927697441601, + "grad_norm": 7.0, + "learning_rate": 8.35650723025584e-06, + "loss": 2.8428, + "mean_token_accuracy": 0.4612221749610095, + "step": 8865 + }, + { + "epoch": 1.6436781609195403, + "grad_norm": 5.98828125, + "learning_rate": 8.35632183908046e-06, + "loss": 3.1397, + "mean_token_accuracy": 0.4563353445538266, + "step": 8866 + }, + { + "epoch": 1.6438635520949203, + "grad_norm": 6.97265625, + "learning_rate": 8.35613644790508e-06, + "loss": 2.9386, + "mean_token_accuracy": 0.4553683969290707, + "step": 8867 + }, + { + "epoch": 1.6440489432703003, + "grad_norm": 7.48046875, + "learning_rate": 8.355951056729702e-06, + "loss": 3.1005, + "mean_token_accuracy": 0.42735949098621423, + "step": 8868 + }, + { + "epoch": 1.6442343344456805, + "grad_norm": 6.12890625, + "learning_rate": 8.35576566555432e-06, + "loss": 3.0006, + "mean_token_accuracy": 0.4480252764612954, + "step": 8869 + }, + { + "epoch": 1.6444197256210604, + "grad_norm": 5.89453125, + "learning_rate": 8.35558027437894e-06, + "loss": 2.7202, + "mean_token_accuracy": 0.47690671528410816, + "step": 8870 + }, + { + "epoch": 1.6446051167964404, + "grad_norm": 6.3671875, + "learning_rate": 8.35539488320356e-06, + "loss": 3.161, + "mean_token_accuracy": 0.4351408028759736, + "step": 8871 + }, + { + "epoch": 1.6447905079718206, + "grad_norm": 6.4140625, + "learning_rate": 8.35520949202818e-06, + "loss": 2.6233, + "mean_token_accuracy": 0.4792492422476102, + "step": 8872 + }, + { + "epoch": 1.6449758991472005, + "grad_norm": 5.6640625, + "learning_rate": 8.3550241008528e-06, + "loss": 2.8209, + "mean_token_accuracy": 0.45934959349593496, + "step": 8873 + }, + { + "epoch": 1.6451612903225805, + "grad_norm": 6.2578125, + "learning_rate": 8.35483870967742e-06, + "loss": 2.5556, + "mean_token_accuracy": 0.5086761824797089, + "step": 8874 + }, + { + "epoch": 1.6453466814979607, + "grad_norm": 6.7734375, + "learning_rate": 8.35465331850204e-06, + "loss": 2.7282, + "mean_token_accuracy": 0.4775630873191849, + "step": 8875 + }, + { + "epoch": 1.645532072673341, + "grad_norm": 6.18359375, + "learning_rate": 8.35446792732666e-06, + "loss": 2.8468, + "mean_token_accuracy": 0.46067730198712564, + "step": 8876 + }, + { + "epoch": 1.6457174638487206, + "grad_norm": 6.14453125, + "learning_rate": 8.354282536151281e-06, + "loss": 2.8684, + "mean_token_accuracy": 0.4735873850197109, + "step": 8877 + }, + { + "epoch": 1.6459028550241008, + "grad_norm": 7.1796875, + "learning_rate": 8.3540971449759e-06, + "loss": 3.9834, + "mean_token_accuracy": 0.41897696212417024, + "step": 8878 + }, + { + "epoch": 1.646088246199481, + "grad_norm": 5.03125, + "learning_rate": 8.35391175380052e-06, + "loss": 2.3121, + "mean_token_accuracy": 0.5300807043286867, + "step": 8879 + }, + { + "epoch": 1.646273637374861, + "grad_norm": 9.3359375, + "learning_rate": 8.353726362625139e-06, + "loss": 3.1224, + "mean_token_accuracy": 0.4431006848317062, + "step": 8880 + }, + { + "epoch": 1.646459028550241, + "grad_norm": 7.2265625, + "learning_rate": 8.35354097144976e-06, + "loss": 3.2936, + "mean_token_accuracy": 0.4360096589194084, + "step": 8881 + }, + { + "epoch": 1.6466444197256211, + "grad_norm": 5.8671875, + "learning_rate": 8.35335558027438e-06, + "loss": 2.9265, + "mean_token_accuracy": 0.4557272858587318, + "step": 8882 + }, + { + "epoch": 1.6468298109010011, + "grad_norm": 5.76171875, + "learning_rate": 8.353170189098999e-06, + "loss": 2.9484, + "mean_token_accuracy": 0.4702726218097448, + "step": 8883 + }, + { + "epoch": 1.647015202076381, + "grad_norm": 6.01171875, + "learning_rate": 8.35298479792362e-06, + "loss": 2.6761, + "mean_token_accuracy": 0.4667458432304038, + "step": 8884 + }, + { + "epoch": 1.6472005932517613, + "grad_norm": 6.3984375, + "learning_rate": 8.35279940674824e-06, + "loss": 2.9143, + "mean_token_accuracy": 0.47463468431210365, + "step": 8885 + }, + { + "epoch": 1.6473859844271412, + "grad_norm": 6.83203125, + "learning_rate": 8.35261401557286e-06, + "loss": 3.4555, + "mean_token_accuracy": 0.4168646080760095, + "step": 8886 + }, + { + "epoch": 1.6475713756025212, + "grad_norm": 7.1953125, + "learning_rate": 8.352428624397479e-06, + "loss": 2.8409, + "mean_token_accuracy": 0.4543462381300219, + "step": 8887 + }, + { + "epoch": 1.6477567667779014, + "grad_norm": 7.02734375, + "learning_rate": 8.3522432332221e-06, + "loss": 2.6208, + "mean_token_accuracy": 0.4878709677419355, + "step": 8888 + }, + { + "epoch": 1.6479421579532816, + "grad_norm": 6.08984375, + "learning_rate": 8.352057842046718e-06, + "loss": 2.5263, + "mean_token_accuracy": 0.4846723044397463, + "step": 8889 + }, + { + "epoch": 1.6481275491286613, + "grad_norm": 6.796875, + "learning_rate": 8.351872450871339e-06, + "loss": 2.5631, + "mean_token_accuracy": 0.499277858015776, + "step": 8890 + }, + { + "epoch": 1.6483129403040415, + "grad_norm": 6.1484375, + "learning_rate": 8.35168705969596e-06, + "loss": 2.4274, + "mean_token_accuracy": 0.5145564405383136, + "step": 8891 + }, + { + "epoch": 1.6484983314794217, + "grad_norm": 5.8125, + "learning_rate": 8.35150166852058e-06, + "loss": 3.0021, + "mean_token_accuracy": 0.43482805477161074, + "step": 8892 + }, + { + "epoch": 1.6486837226548017, + "grad_norm": 6.26171875, + "learning_rate": 8.3513162773452e-06, + "loss": 2.6814, + "mean_token_accuracy": 0.466122574684324, + "step": 8893 + }, + { + "epoch": 1.6488691138301816, + "grad_norm": 6.46875, + "learning_rate": 8.351130886169819e-06, + "loss": 2.1462, + "mean_token_accuracy": 0.5381827271591051, + "step": 8894 + }, + { + "epoch": 1.6490545050055618, + "grad_norm": 7.2890625, + "learning_rate": 8.35094549499444e-06, + "loss": 2.4187, + "mean_token_accuracy": 0.5052848985542462, + "step": 8895 + }, + { + "epoch": 1.6492398961809418, + "grad_norm": 6.3203125, + "learning_rate": 8.350760103819058e-06, + "loss": 2.5877, + "mean_token_accuracy": 0.4992602708546717, + "step": 8896 + }, + { + "epoch": 1.6494252873563218, + "grad_norm": 5.58984375, + "learning_rate": 8.350574712643679e-06, + "loss": 3.3992, + "mean_token_accuracy": 0.40399581832965503, + "step": 8897 + }, + { + "epoch": 1.649610678531702, + "grad_norm": 6.70703125, + "learning_rate": 8.350389321468298e-06, + "loss": 2.7946, + "mean_token_accuracy": 0.4770566349731294, + "step": 8898 + }, + { + "epoch": 1.649796069707082, + "grad_norm": 5.67578125, + "learning_rate": 8.350203930292918e-06, + "loss": 3.0669, + "mean_token_accuracy": 0.4374927854092116, + "step": 8899 + }, + { + "epoch": 1.649981460882462, + "grad_norm": 5.87109375, + "learning_rate": 8.350018539117539e-06, + "loss": 3.1234, + "mean_token_accuracy": 0.43770384866275275, + "step": 8900 + }, + { + "epoch": 1.650166852057842, + "grad_norm": 6.88671875, + "learning_rate": 8.349833147942159e-06, + "loss": 2.8359, + "mean_token_accuracy": 0.4533399429916966, + "step": 8901 + }, + { + "epoch": 1.650352243233222, + "grad_norm": 5.57421875, + "learning_rate": 8.34964775676678e-06, + "loss": 2.5231, + "mean_token_accuracy": 0.4983351831298557, + "step": 8902 + }, + { + "epoch": 1.650537634408602, + "grad_norm": 5.875, + "learning_rate": 8.349462365591398e-06, + "loss": 2.9503, + "mean_token_accuracy": 0.4602080461210678, + "step": 8903 + }, + { + "epoch": 1.6507230255839822, + "grad_norm": 5.453125, + "learning_rate": 8.349276974416019e-06, + "loss": 3.3628, + "mean_token_accuracy": 0.4294787781026313, + "step": 8904 + }, + { + "epoch": 1.6509084167593624, + "grad_norm": 5.87890625, + "learning_rate": 8.349091583240638e-06, + "loss": 2.7305, + "mean_token_accuracy": 0.46581945661700264, + "step": 8905 + }, + { + "epoch": 1.6510938079347421, + "grad_norm": 5.86328125, + "learning_rate": 8.348906192065258e-06, + "loss": 2.5961, + "mean_token_accuracy": 0.4800856021876115, + "step": 8906 + }, + { + "epoch": 1.6512791991101223, + "grad_norm": 6.18359375, + "learning_rate": 8.348720800889879e-06, + "loss": 2.88, + "mean_token_accuracy": 0.48093496996604856, + "step": 8907 + }, + { + "epoch": 1.6514645902855025, + "grad_norm": 6.64453125, + "learning_rate": 8.348535409714497e-06, + "loss": 2.6665, + "mean_token_accuracy": 0.4768935479108224, + "step": 8908 + }, + { + "epoch": 1.6516499814608825, + "grad_norm": 6.375, + "learning_rate": 8.348350018539118e-06, + "loss": 2.722, + "mean_token_accuracy": 0.4585215379530682, + "step": 8909 + }, + { + "epoch": 1.6518353726362625, + "grad_norm": 6.84765625, + "learning_rate": 8.348164627363738e-06, + "loss": 2.9283, + "mean_token_accuracy": 0.44984288131496253, + "step": 8910 + }, + { + "epoch": 1.6520207638116426, + "grad_norm": 9.5078125, + "learning_rate": 8.347979236188359e-06, + "loss": 2.975, + "mean_token_accuracy": 0.46616541353383456, + "step": 8911 + }, + { + "epoch": 1.6522061549870226, + "grad_norm": 6.5234375, + "learning_rate": 8.347793845012978e-06, + "loss": 2.5324, + "mean_token_accuracy": 0.49645704162976084, + "step": 8912 + }, + { + "epoch": 1.6523915461624026, + "grad_norm": 7.359375, + "learning_rate": 8.347608453837598e-06, + "loss": 3.6046, + "mean_token_accuracy": 0.43010752688172044, + "step": 8913 + }, + { + "epoch": 1.6525769373377828, + "grad_norm": 7.46875, + "learning_rate": 8.347423062662217e-06, + "loss": 2.685, + "mean_token_accuracy": 0.4922027290448343, + "step": 8914 + }, + { + "epoch": 1.6527623285131627, + "grad_norm": 7.33984375, + "learning_rate": 8.347237671486838e-06, + "loss": 2.5033, + "mean_token_accuracy": 0.48224400871459694, + "step": 8915 + }, + { + "epoch": 1.6529477196885427, + "grad_norm": 5.8828125, + "learning_rate": 8.347052280311458e-06, + "loss": 2.8199, + "mean_token_accuracy": 0.46099290780141844, + "step": 8916 + }, + { + "epoch": 1.653133110863923, + "grad_norm": 7.75390625, + "learning_rate": 8.346866889136079e-06, + "loss": 2.7777, + "mean_token_accuracy": 0.47224797986488276, + "step": 8917 + }, + { + "epoch": 1.653318502039303, + "grad_norm": 7.1953125, + "learning_rate": 8.346681497960697e-06, + "loss": 2.423, + "mean_token_accuracy": 0.5005189413596264, + "step": 8918 + }, + { + "epoch": 1.6535038932146828, + "grad_norm": 5.5234375, + "learning_rate": 8.346496106785318e-06, + "loss": 2.9997, + "mean_token_accuracy": 0.4807347670250896, + "step": 8919 + }, + { + "epoch": 1.653689284390063, + "grad_norm": 10.0234375, + "learning_rate": 8.346310715609938e-06, + "loss": 2.5239, + "mean_token_accuracy": 0.4829603627321925, + "step": 8920 + }, + { + "epoch": 1.6538746755654432, + "grad_norm": 6.28515625, + "learning_rate": 8.346125324434557e-06, + "loss": 3.1977, + "mean_token_accuracy": 0.42775712515489467, + "step": 8921 + }, + { + "epoch": 1.6540600667408232, + "grad_norm": 6.15234375, + "learning_rate": 8.345939933259178e-06, + "loss": 2.9899, + "mean_token_accuracy": 0.4540427439844567, + "step": 8922 + }, + { + "epoch": 1.6542454579162031, + "grad_norm": 5.72265625, + "learning_rate": 8.345754542083796e-06, + "loss": 2.8329, + "mean_token_accuracy": 0.4738286969253294, + "step": 8923 + }, + { + "epoch": 1.6544308490915833, + "grad_norm": 8.03125, + "learning_rate": 8.345569150908417e-06, + "loss": 2.926, + "mean_token_accuracy": 0.45076361978573054, + "step": 8924 + }, + { + "epoch": 1.6546162402669633, + "grad_norm": 9.671875, + "learning_rate": 8.345383759733037e-06, + "loss": 2.7679, + "mean_token_accuracy": 0.48451507742461286, + "step": 8925 + }, + { + "epoch": 1.6548016314423433, + "grad_norm": 6.05078125, + "learning_rate": 8.345198368557658e-06, + "loss": 3.097, + "mean_token_accuracy": 0.4546755104695019, + "step": 8926 + }, + { + "epoch": 1.6549870226177235, + "grad_norm": 7.4296875, + "learning_rate": 8.345012977382277e-06, + "loss": 3.1438, + "mean_token_accuracy": 0.4406298093997869, + "step": 8927 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 7.4609375, + "learning_rate": 8.344827586206897e-06, + "loss": 2.8248, + "mean_token_accuracy": 0.47302540993866565, + "step": 8928 + }, + { + "epoch": 1.6553578049684834, + "grad_norm": 6.40625, + "learning_rate": 8.344642195031518e-06, + "loss": 2.7979, + "mean_token_accuracy": 0.4591714399893433, + "step": 8929 + }, + { + "epoch": 1.6555431961438636, + "grad_norm": 5.80859375, + "learning_rate": 8.344456803856136e-06, + "loss": 3.6356, + "mean_token_accuracy": 0.38245083207261726, + "step": 8930 + }, + { + "epoch": 1.6557285873192435, + "grad_norm": 6.7890625, + "learning_rate": 8.344271412680757e-06, + "loss": 2.658, + "mean_token_accuracy": 0.4933811362382791, + "step": 8931 + }, + { + "epoch": 1.6559139784946235, + "grad_norm": 5.90625, + "learning_rate": 8.344086021505376e-06, + "loss": 2.7383, + "mean_token_accuracy": 0.4720262096774194, + "step": 8932 + }, + { + "epoch": 1.6560993696700037, + "grad_norm": 5.81640625, + "learning_rate": 8.343900630329998e-06, + "loss": 2.4528, + "mean_token_accuracy": 0.5091585181104531, + "step": 8933 + }, + { + "epoch": 1.656284760845384, + "grad_norm": 5.40234375, + "learning_rate": 8.343715239154617e-06, + "loss": 2.7788, + "mean_token_accuracy": 0.47177594442032134, + "step": 8934 + }, + { + "epoch": 1.6564701520207639, + "grad_norm": 6.84375, + "learning_rate": 8.343529847979237e-06, + "loss": 2.4675, + "mean_token_accuracy": 0.5143762183235867, + "step": 8935 + }, + { + "epoch": 1.6566555431961438, + "grad_norm": 5.3515625, + "learning_rate": 8.343344456803858e-06, + "loss": 3.0416, + "mean_token_accuracy": 0.4616797900262467, + "step": 8936 + }, + { + "epoch": 1.656840934371524, + "grad_norm": 7.28515625, + "learning_rate": 8.343159065628476e-06, + "loss": 2.7382, + "mean_token_accuracy": 0.48330229671011793, + "step": 8937 + }, + { + "epoch": 1.657026325546904, + "grad_norm": 5.96875, + "learning_rate": 8.342973674453097e-06, + "loss": 3.2798, + "mean_token_accuracy": 0.4383697813121272, + "step": 8938 + }, + { + "epoch": 1.657211716722284, + "grad_norm": 5.04296875, + "learning_rate": 8.342788283277716e-06, + "loss": 2.706, + "mean_token_accuracy": 0.4714981729598051, + "step": 8939 + }, + { + "epoch": 1.6573971078976641, + "grad_norm": 8.8203125, + "learning_rate": 8.342602892102336e-06, + "loss": 2.3911, + "mean_token_accuracy": 0.5070240825688074, + "step": 8940 + }, + { + "epoch": 1.6575824990730441, + "grad_norm": 7.84765625, + "learning_rate": 8.342417500926957e-06, + "loss": 2.5925, + "mean_token_accuracy": 0.48901715568382237, + "step": 8941 + }, + { + "epoch": 1.657767890248424, + "grad_norm": 6.11328125, + "learning_rate": 8.342232109751577e-06, + "loss": 3.574, + "mean_token_accuracy": 0.40052164840897236, + "step": 8942 + }, + { + "epoch": 1.6579532814238043, + "grad_norm": 8.546875, + "learning_rate": 8.342046718576196e-06, + "loss": 2.3831, + "mean_token_accuracy": 0.5049430609435616, + "step": 8943 + }, + { + "epoch": 1.6581386725991842, + "grad_norm": 6.82421875, + "learning_rate": 8.341861327400817e-06, + "loss": 3.0246, + "mean_token_accuracy": 0.4502415458937198, + "step": 8944 + }, + { + "epoch": 1.6583240637745642, + "grad_norm": 5.7109375, + "learning_rate": 8.341675936225437e-06, + "loss": 2.9924, + "mean_token_accuracy": 0.46675567423230974, + "step": 8945 + }, + { + "epoch": 1.6585094549499444, + "grad_norm": 7.1953125, + "learning_rate": 8.341490545050056e-06, + "loss": 3.1273, + "mean_token_accuracy": 0.4332704797821077, + "step": 8946 + }, + { + "epoch": 1.6586948461253246, + "grad_norm": 10.328125, + "learning_rate": 8.341305153874676e-06, + "loss": 2.506, + "mean_token_accuracy": 0.4873920945024989, + "step": 8947 + }, + { + "epoch": 1.6588802373007043, + "grad_norm": 6.18359375, + "learning_rate": 8.341119762699295e-06, + "loss": 3.1364, + "mean_token_accuracy": 0.4446677384780279, + "step": 8948 + }, + { + "epoch": 1.6590656284760845, + "grad_norm": 6.7421875, + "learning_rate": 8.340934371523917e-06, + "loss": 2.5963, + "mean_token_accuracy": 0.47680511182108626, + "step": 8949 + }, + { + "epoch": 1.6592510196514647, + "grad_norm": 8.2109375, + "learning_rate": 8.340748980348536e-06, + "loss": 2.869, + "mean_token_accuracy": 0.4663911510878814, + "step": 8950 + }, + { + "epoch": 1.6594364108268447, + "grad_norm": 7.65625, + "learning_rate": 8.340563589173157e-06, + "loss": 2.7554, + "mean_token_accuracy": 0.47129489124936774, + "step": 8951 + }, + { + "epoch": 1.6596218020022246, + "grad_norm": 5.58203125, + "learning_rate": 8.340378197997775e-06, + "loss": 2.6674, + "mean_token_accuracy": 0.464941112024103, + "step": 8952 + }, + { + "epoch": 1.6598071931776048, + "grad_norm": 9.9140625, + "learning_rate": 8.340192806822396e-06, + "loss": 3.1767, + "mean_token_accuracy": 0.43936731107205623, + "step": 8953 + }, + { + "epoch": 1.6599925843529848, + "grad_norm": 9.7734375, + "learning_rate": 8.340007415647016e-06, + "loss": 3.0985, + "mean_token_accuracy": 0.42906415267877673, + "step": 8954 + }, + { + "epoch": 1.6601779755283648, + "grad_norm": 7.01953125, + "learning_rate": 8.339822024471635e-06, + "loss": 3.0179, + "mean_token_accuracy": 0.4436495983935743, + "step": 8955 + }, + { + "epoch": 1.660363366703745, + "grad_norm": 10.234375, + "learning_rate": 8.339636633296256e-06, + "loss": 2.9426, + "mean_token_accuracy": 0.46073227167551384, + "step": 8956 + }, + { + "epoch": 1.660548757879125, + "grad_norm": 6.9609375, + "learning_rate": 8.339451242120876e-06, + "loss": 2.3345, + "mean_token_accuracy": 0.5381624983801996, + "step": 8957 + }, + { + "epoch": 1.660734149054505, + "grad_norm": 5.80078125, + "learning_rate": 8.339265850945497e-06, + "loss": 2.6572, + "mean_token_accuracy": 0.4807121661721068, + "step": 8958 + }, + { + "epoch": 1.660919540229885, + "grad_norm": 7.74609375, + "learning_rate": 8.339080459770115e-06, + "loss": 3.3757, + "mean_token_accuracy": 0.42642746248059343, + "step": 8959 + }, + { + "epoch": 1.6611049314052653, + "grad_norm": 9.078125, + "learning_rate": 8.338895068594736e-06, + "loss": 3.4042, + "mean_token_accuracy": 0.42110596409959467, + "step": 8960 + }, + { + "epoch": 1.661290322580645, + "grad_norm": 9.578125, + "learning_rate": 8.338709677419355e-06, + "loss": 2.6259, + "mean_token_accuracy": 0.47948521916411824, + "step": 8961 + }, + { + "epoch": 1.6614757137560252, + "grad_norm": 6.2578125, + "learning_rate": 8.338524286243975e-06, + "loss": 3.0731, + "mean_token_accuracy": 0.4330493000608643, + "step": 8962 + }, + { + "epoch": 1.6616611049314054, + "grad_norm": 8.09375, + "learning_rate": 8.338338895068596e-06, + "loss": 3.3641, + "mean_token_accuracy": 0.42841091492776884, + "step": 8963 + }, + { + "epoch": 1.6618464961067854, + "grad_norm": 7.44140625, + "learning_rate": 8.338153503893215e-06, + "loss": 3.119, + "mean_token_accuracy": 0.4395997140814868, + "step": 8964 + }, + { + "epoch": 1.6620318872821653, + "grad_norm": 5.9140625, + "learning_rate": 8.337968112717835e-06, + "loss": 3.0123, + "mean_token_accuracy": 0.4436574372182872, + "step": 8965 + }, + { + "epoch": 1.6622172784575455, + "grad_norm": 7.50390625, + "learning_rate": 8.337782721542455e-06, + "loss": 3.1252, + "mean_token_accuracy": 0.41665666626665065, + "step": 8966 + }, + { + "epoch": 1.6624026696329255, + "grad_norm": 10.03125, + "learning_rate": 8.337597330367076e-06, + "loss": 2.9077, + "mean_token_accuracy": 0.4581852641554134, + "step": 8967 + }, + { + "epoch": 1.6625880608083055, + "grad_norm": 6.3984375, + "learning_rate": 8.337411939191695e-06, + "loss": 2.8411, + "mean_token_accuracy": 0.4494369494926971, + "step": 8968 + }, + { + "epoch": 1.6627734519836856, + "grad_norm": 6.703125, + "learning_rate": 8.337226548016315e-06, + "loss": 2.8383, + "mean_token_accuracy": 0.4523720582741875, + "step": 8969 + }, + { + "epoch": 1.6629588431590656, + "grad_norm": 9.6640625, + "learning_rate": 8.337041156840934e-06, + "loss": 2.2965, + "mean_token_accuracy": 0.5086042065009561, + "step": 8970 + }, + { + "epoch": 1.6631442343344456, + "grad_norm": 6.0703125, + "learning_rate": 8.336855765665555e-06, + "loss": 3.091, + "mean_token_accuracy": 0.4368794326241135, + "step": 8971 + }, + { + "epoch": 1.6633296255098258, + "grad_norm": 6.46875, + "learning_rate": 8.336670374490175e-06, + "loss": 3.115, + "mean_token_accuracy": 0.44277175116227585, + "step": 8972 + }, + { + "epoch": 1.6635150166852057, + "grad_norm": 6.19921875, + "learning_rate": 8.336484983314796e-06, + "loss": 3.1499, + "mean_token_accuracy": 0.4318181818181818, + "step": 8973 + }, + { + "epoch": 1.6637004078605857, + "grad_norm": 7.09765625, + "learning_rate": 8.336299592139416e-06, + "loss": 2.9243, + "mean_token_accuracy": 0.4716981132075472, + "step": 8974 + }, + { + "epoch": 1.663885799035966, + "grad_norm": 6.0390625, + "learning_rate": 8.336114200964035e-06, + "loss": 2.7871, + "mean_token_accuracy": 0.4651128701260627, + "step": 8975 + }, + { + "epoch": 1.664071190211346, + "grad_norm": 6.40625, + "learning_rate": 8.335928809788655e-06, + "loss": 3.1742, + "mean_token_accuracy": 0.4586917929810187, + "step": 8976 + }, + { + "epoch": 1.6642565813867258, + "grad_norm": 8.03125, + "learning_rate": 8.335743418613274e-06, + "loss": 3.0432, + "mean_token_accuracy": 0.44231963243511735, + "step": 8977 + }, + { + "epoch": 1.664441972562106, + "grad_norm": 6.12890625, + "learning_rate": 8.335558027437895e-06, + "loss": 3.0977, + "mean_token_accuracy": 0.44981949458483755, + "step": 8978 + }, + { + "epoch": 1.6646273637374862, + "grad_norm": 6.32421875, + "learning_rate": 8.335372636262513e-06, + "loss": 2.0904, + "mean_token_accuracy": 0.5561176098640531, + "step": 8979 + }, + { + "epoch": 1.6648127549128662, + "grad_norm": 7.96484375, + "learning_rate": 8.335187245087134e-06, + "loss": 2.8054, + "mean_token_accuracy": 0.48493057907213005, + "step": 8980 + }, + { + "epoch": 1.6649981460882461, + "grad_norm": 9.0625, + "learning_rate": 8.335001853911754e-06, + "loss": 2.3613, + "mean_token_accuracy": 0.5271335542099852, + "step": 8981 + }, + { + "epoch": 1.6651835372636263, + "grad_norm": 6.28515625, + "learning_rate": 8.334816462736375e-06, + "loss": 3.0985, + "mean_token_accuracy": 0.45264373716632444, + "step": 8982 + }, + { + "epoch": 1.6653689284390063, + "grad_norm": 6.18359375, + "learning_rate": 8.334631071560995e-06, + "loss": 2.9048, + "mean_token_accuracy": 0.4543681747269891, + "step": 8983 + }, + { + "epoch": 1.6655543196143863, + "grad_norm": 5.24609375, + "learning_rate": 8.334445680385614e-06, + "loss": 2.6863, + "mean_token_accuracy": 0.47003154574132494, + "step": 8984 + }, + { + "epoch": 1.6657397107897665, + "grad_norm": 5.4921875, + "learning_rate": 8.334260289210235e-06, + "loss": 2.8223, + "mean_token_accuracy": 0.47709074733096085, + "step": 8985 + }, + { + "epoch": 1.6659251019651464, + "grad_norm": 7.75, + "learning_rate": 8.334074898034853e-06, + "loss": 3.0837, + "mean_token_accuracy": 0.44212198997677543, + "step": 8986 + }, + { + "epoch": 1.6661104931405264, + "grad_norm": 6.62890625, + "learning_rate": 8.333889506859474e-06, + "loss": 2.7917, + "mean_token_accuracy": 0.46207624323967816, + "step": 8987 + }, + { + "epoch": 1.6662958843159066, + "grad_norm": 7.41796875, + "learning_rate": 8.333704115684094e-06, + "loss": 2.9036, + "mean_token_accuracy": 0.46514527996009475, + "step": 8988 + }, + { + "epoch": 1.6664812754912868, + "grad_norm": 8.15625, + "learning_rate": 8.333518724508715e-06, + "loss": 3.101, + "mean_token_accuracy": 0.45799164474702153, + "step": 8989 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 6.37890625, + "learning_rate": 8.333333333333334e-06, + "loss": 3.3409, + "mean_token_accuracy": 0.43858688210324526, + "step": 8990 + }, + { + "epoch": 1.6668520578420467, + "grad_norm": 7.76171875, + "learning_rate": 8.333147942157954e-06, + "loss": 3.3431, + "mean_token_accuracy": 0.41430886202168765, + "step": 8991 + }, + { + "epoch": 1.667037449017427, + "grad_norm": 6.8125, + "learning_rate": 8.332962550982575e-06, + "loss": 3.332, + "mean_token_accuracy": 0.4458502024291498, + "step": 8992 + }, + { + "epoch": 1.6672228401928069, + "grad_norm": 5.96484375, + "learning_rate": 8.332777159807194e-06, + "loss": 2.6627, + "mean_token_accuracy": 0.4687168610816543, + "step": 8993 + }, + { + "epoch": 1.6674082313681868, + "grad_norm": 7.71484375, + "learning_rate": 8.332591768631814e-06, + "loss": 3.0384, + "mean_token_accuracy": 0.447800741918389, + "step": 8994 + }, + { + "epoch": 1.667593622543567, + "grad_norm": 7.61328125, + "learning_rate": 8.332406377456433e-06, + "loss": 2.6316, + "mean_token_accuracy": 0.4780055136042191, + "step": 8995 + }, + { + "epoch": 1.667779013718947, + "grad_norm": 5.83203125, + "learning_rate": 8.332220986281053e-06, + "loss": 2.6068, + "mean_token_accuracy": 0.498416354736539, + "step": 8996 + }, + { + "epoch": 1.667964404894327, + "grad_norm": 8.484375, + "learning_rate": 8.332035595105674e-06, + "loss": 2.5613, + "mean_token_accuracy": 0.4997301672962763, + "step": 8997 + }, + { + "epoch": 1.6681497960697071, + "grad_norm": 7.53515625, + "learning_rate": 8.331850203930294e-06, + "loss": 3.7567, + "mean_token_accuracy": 0.3870479857005362, + "step": 8998 + }, + { + "epoch": 1.6683351872450871, + "grad_norm": 5.7265625, + "learning_rate": 8.331664812754913e-06, + "loss": 2.6293, + "mean_token_accuracy": 0.4918627666593358, + "step": 8999 + }, + { + "epoch": 1.668520578420467, + "grad_norm": 7.2890625, + "learning_rate": 8.331479421579534e-06, + "loss": 2.5366, + "mean_token_accuracy": 0.4916379734382686, + "step": 9000 + }, + { + "epoch": 1.6687059695958473, + "grad_norm": 6.71484375, + "learning_rate": 8.331294030404154e-06, + "loss": 2.8305, + "mean_token_accuracy": 0.49238445378151263, + "step": 9001 + }, + { + "epoch": 1.6688913607712272, + "grad_norm": 5.51953125, + "learning_rate": 8.331108639228773e-06, + "loss": 3.6064, + "mean_token_accuracy": 0.40842560773832254, + "step": 9002 + }, + { + "epoch": 1.6690767519466072, + "grad_norm": 7.12109375, + "learning_rate": 8.330923248053393e-06, + "loss": 2.5349, + "mean_token_accuracy": 0.510231120270867, + "step": 9003 + }, + { + "epoch": 1.6692621431219874, + "grad_norm": 8.2734375, + "learning_rate": 8.330737856878012e-06, + "loss": 3.027, + "mean_token_accuracy": 0.46778668310727495, + "step": 9004 + }, + { + "epoch": 1.6694475342973676, + "grad_norm": 6.01171875, + "learning_rate": 8.330552465702634e-06, + "loss": 3.4727, + "mean_token_accuracy": 0.4431224608241439, + "step": 9005 + }, + { + "epoch": 1.6696329254727473, + "grad_norm": 7.98828125, + "learning_rate": 8.330367074527253e-06, + "loss": 2.7563, + "mean_token_accuracy": 0.4786685419596812, + "step": 9006 + }, + { + "epoch": 1.6698183166481275, + "grad_norm": 7.05078125, + "learning_rate": 8.330181683351874e-06, + "loss": 3.0117, + "mean_token_accuracy": 0.44002541296060993, + "step": 9007 + }, + { + "epoch": 1.6700037078235077, + "grad_norm": 6.9609375, + "learning_rate": 8.329996292176492e-06, + "loss": 3.0331, + "mean_token_accuracy": 0.4196854816062904, + "step": 9008 + }, + { + "epoch": 1.6701890989988877, + "grad_norm": 7.7265625, + "learning_rate": 8.329810901001113e-06, + "loss": 2.7892, + "mean_token_accuracy": 0.4558925356598809, + "step": 9009 + }, + { + "epoch": 1.6703744901742676, + "grad_norm": 5.61328125, + "learning_rate": 8.329625509825733e-06, + "loss": 2.3497, + "mean_token_accuracy": 0.5170417160152652, + "step": 9010 + }, + { + "epoch": 1.6705598813496478, + "grad_norm": 5.80859375, + "learning_rate": 8.329440118650352e-06, + "loss": 3.2756, + "mean_token_accuracy": 0.44947275922671354, + "step": 9011 + }, + { + "epoch": 1.6707452725250278, + "grad_norm": 7.1953125, + "learning_rate": 8.329254727474973e-06, + "loss": 2.9402, + "mean_token_accuracy": 0.46925512414597564, + "step": 9012 + }, + { + "epoch": 1.6709306637004078, + "grad_norm": 7.40234375, + "learning_rate": 8.329069336299593e-06, + "loss": 3.0045, + "mean_token_accuracy": 0.45862776384773196, + "step": 9013 + }, + { + "epoch": 1.671116054875788, + "grad_norm": 8.1484375, + "learning_rate": 8.328883945124214e-06, + "loss": 2.5807, + "mean_token_accuracy": 0.47954999474292925, + "step": 9014 + }, + { + "epoch": 1.671301446051168, + "grad_norm": 6.44921875, + "learning_rate": 8.328698553948832e-06, + "loss": 2.528, + "mean_token_accuracy": 0.5122330646301796, + "step": 9015 + }, + { + "epoch": 1.671486837226548, + "grad_norm": 6.60546875, + "learning_rate": 8.328513162773453e-06, + "loss": 2.9889, + "mean_token_accuracy": 0.4682550246239851, + "step": 9016 + }, + { + "epoch": 1.671672228401928, + "grad_norm": 6.16015625, + "learning_rate": 8.328327771598072e-06, + "loss": 2.8057, + "mean_token_accuracy": 0.4726643598615917, + "step": 9017 + }, + { + "epoch": 1.6718576195773083, + "grad_norm": 7.25, + "learning_rate": 8.328142380422692e-06, + "loss": 2.8836, + "mean_token_accuracy": 0.4497442855651811, + "step": 9018 + }, + { + "epoch": 1.672043010752688, + "grad_norm": 9.0, + "learning_rate": 8.327956989247313e-06, + "loss": 2.4302, + "mean_token_accuracy": 0.48620917917034423, + "step": 9019 + }, + { + "epoch": 1.6722284019280682, + "grad_norm": 5.08203125, + "learning_rate": 8.327771598071932e-06, + "loss": 2.803, + "mean_token_accuracy": 0.45986580516898606, + "step": 9020 + }, + { + "epoch": 1.6724137931034484, + "grad_norm": 5.703125, + "learning_rate": 8.327586206896554e-06, + "loss": 3.3039, + "mean_token_accuracy": 0.43088975937325125, + "step": 9021 + }, + { + "epoch": 1.6725991842788284, + "grad_norm": 6.8359375, + "learning_rate": 8.327400815721173e-06, + "loss": 2.8829, + "mean_token_accuracy": 0.4466959215281363, + "step": 9022 + }, + { + "epoch": 1.6727845754542083, + "grad_norm": 7.06640625, + "learning_rate": 8.327215424545793e-06, + "loss": 3.3077, + "mean_token_accuracy": 0.4373962138824311, + "step": 9023 + }, + { + "epoch": 1.6729699666295885, + "grad_norm": 5.78125, + "learning_rate": 8.327030033370412e-06, + "loss": 2.7014, + "mean_token_accuracy": 0.4916142557651992, + "step": 9024 + }, + { + "epoch": 1.6731553578049685, + "grad_norm": 6.7578125, + "learning_rate": 8.326844642195032e-06, + "loss": 2.2777, + "mean_token_accuracy": 0.5315116279069767, + "step": 9025 + }, + { + "epoch": 1.6733407489803485, + "grad_norm": 5.46875, + "learning_rate": 8.326659251019653e-06, + "loss": 2.6853, + "mean_token_accuracy": 0.48560923037910486, + "step": 9026 + }, + { + "epoch": 1.6735261401557286, + "grad_norm": 5.671875, + "learning_rate": 8.326473859844272e-06, + "loss": 2.8309, + "mean_token_accuracy": 0.4642772088592522, + "step": 9027 + }, + { + "epoch": 1.6737115313311086, + "grad_norm": 5.359375, + "learning_rate": 8.326288468668892e-06, + "loss": 2.4247, + "mean_token_accuracy": 0.49465942917177375, + "step": 9028 + }, + { + "epoch": 1.6738969225064886, + "grad_norm": 6.1640625, + "learning_rate": 8.326103077493513e-06, + "loss": 3.0541, + "mean_token_accuracy": 0.443935119887165, + "step": 9029 + }, + { + "epoch": 1.6740823136818688, + "grad_norm": 6.2265625, + "learning_rate": 8.325917686318133e-06, + "loss": 2.9046, + "mean_token_accuracy": 0.46596022644796053, + "step": 9030 + }, + { + "epoch": 1.6742677048572487, + "grad_norm": 5.2265625, + "learning_rate": 8.325732295142752e-06, + "loss": 3.2694, + "mean_token_accuracy": 0.4195157561747171, + "step": 9031 + }, + { + "epoch": 1.6744530960326287, + "grad_norm": 5.16796875, + "learning_rate": 8.325546903967372e-06, + "loss": 2.7101, + "mean_token_accuracy": 0.48514851485148514, + "step": 9032 + }, + { + "epoch": 1.674638487208009, + "grad_norm": 5.72265625, + "learning_rate": 8.325361512791991e-06, + "loss": 2.3203, + "mean_token_accuracy": 0.5085653104925053, + "step": 9033 + }, + { + "epoch": 1.674823878383389, + "grad_norm": 5.66015625, + "learning_rate": 8.325176121616612e-06, + "loss": 3.1346, + "mean_token_accuracy": 0.4413498208713741, + "step": 9034 + }, + { + "epoch": 1.675009269558769, + "grad_norm": 5.7578125, + "learning_rate": 8.324990730441232e-06, + "loss": 2.7465, + "mean_token_accuracy": 0.46360804304729536, + "step": 9035 + }, + { + "epoch": 1.675194660734149, + "grad_norm": 4.90625, + "learning_rate": 8.324805339265851e-06, + "loss": 2.7933, + "mean_token_accuracy": 0.48440279919345275, + "step": 9036 + }, + { + "epoch": 1.6753800519095292, + "grad_norm": 7.2578125, + "learning_rate": 8.324619948090471e-06, + "loss": 2.3204, + "mean_token_accuracy": 0.5199916483975363, + "step": 9037 + }, + { + "epoch": 1.6755654430849092, + "grad_norm": 7.26953125, + "learning_rate": 8.324434556915092e-06, + "loss": 2.8912, + "mean_token_accuracy": 0.46266094420600856, + "step": 9038 + }, + { + "epoch": 1.6757508342602891, + "grad_norm": 6.515625, + "learning_rate": 8.324249165739712e-06, + "loss": 2.9263, + "mean_token_accuracy": 0.45702005730659023, + "step": 9039 + }, + { + "epoch": 1.6759362254356693, + "grad_norm": 7.6640625, + "learning_rate": 8.324063774564331e-06, + "loss": 3.4648, + "mean_token_accuracy": 0.43855534709193245, + "step": 9040 + }, + { + "epoch": 1.6761216166110493, + "grad_norm": 5.6953125, + "learning_rate": 8.323878383388952e-06, + "loss": 2.9746, + "mean_token_accuracy": 0.44357976653696496, + "step": 9041 + }, + { + "epoch": 1.6763070077864293, + "grad_norm": 6.3828125, + "learning_rate": 8.32369299221357e-06, + "loss": 2.2308, + "mean_token_accuracy": 0.5568052832525023, + "step": 9042 + }, + { + "epoch": 1.6764923989618095, + "grad_norm": 6.01171875, + "learning_rate": 8.323507601038191e-06, + "loss": 2.7737, + "mean_token_accuracy": 0.48327016783974014, + "step": 9043 + }, + { + "epoch": 1.6766777901371894, + "grad_norm": 7.5859375, + "learning_rate": 8.323322209862811e-06, + "loss": 2.5998, + "mean_token_accuracy": 0.5013791374122367, + "step": 9044 + }, + { + "epoch": 1.6768631813125694, + "grad_norm": 5.625, + "learning_rate": 8.32313681868743e-06, + "loss": 3.1244, + "mean_token_accuracy": 0.4639464257903753, + "step": 9045 + }, + { + "epoch": 1.6770485724879496, + "grad_norm": 8.9609375, + "learning_rate": 8.32295142751205e-06, + "loss": 2.7473, + "mean_token_accuracy": 0.46039453717754175, + "step": 9046 + }, + { + "epoch": 1.6772339636633298, + "grad_norm": 5.7890625, + "learning_rate": 8.322766036336671e-06, + "loss": 2.586, + "mean_token_accuracy": 0.49952278692436175, + "step": 9047 + }, + { + "epoch": 1.6774193548387095, + "grad_norm": 5.49609375, + "learning_rate": 8.322580645161292e-06, + "loss": 3.265, + "mean_token_accuracy": 0.4274952919020716, + "step": 9048 + }, + { + "epoch": 1.6776047460140897, + "grad_norm": 6.6953125, + "learning_rate": 8.32239525398591e-06, + "loss": 3.0674, + "mean_token_accuracy": 0.4355092129695758, + "step": 9049 + }, + { + "epoch": 1.67779013718947, + "grad_norm": 6.84375, + "learning_rate": 8.322209862810531e-06, + "loss": 2.5567, + "mean_token_accuracy": 0.4922271037512673, + "step": 9050 + }, + { + "epoch": 1.6779755283648499, + "grad_norm": 5.41796875, + "learning_rate": 8.32202447163515e-06, + "loss": 2.5311, + "mean_token_accuracy": 0.49187087653157396, + "step": 9051 + }, + { + "epoch": 1.6781609195402298, + "grad_norm": 5.51953125, + "learning_rate": 8.32183908045977e-06, + "loss": 2.6122, + "mean_token_accuracy": 0.4788783685360524, + "step": 9052 + }, + { + "epoch": 1.67834631071561, + "grad_norm": 6.140625, + "learning_rate": 8.32165368928439e-06, + "loss": 3.0602, + "mean_token_accuracy": 0.4343867166577397, + "step": 9053 + }, + { + "epoch": 1.67853170189099, + "grad_norm": 7.078125, + "learning_rate": 8.321468298109011e-06, + "loss": 2.9969, + "mean_token_accuracy": 0.443505212735982, + "step": 9054 + }, + { + "epoch": 1.67871709306637, + "grad_norm": 6.5859375, + "learning_rate": 8.321282906933632e-06, + "loss": 3.214, + "mean_token_accuracy": 0.42210753720595295, + "step": 9055 + }, + { + "epoch": 1.6789024842417501, + "grad_norm": 6.32421875, + "learning_rate": 8.32109751575825e-06, + "loss": 2.9148, + "mean_token_accuracy": 0.4652930001461347, + "step": 9056 + }, + { + "epoch": 1.6790878754171301, + "grad_norm": 6.00390625, + "learning_rate": 8.320912124582871e-06, + "loss": 2.6057, + "mean_token_accuracy": 0.48931855056787454, + "step": 9057 + }, + { + "epoch": 1.67927326659251, + "grad_norm": 6.03515625, + "learning_rate": 8.32072673340749e-06, + "loss": 2.8581, + "mean_token_accuracy": 0.4650215620507906, + "step": 9058 + }, + { + "epoch": 1.6794586577678903, + "grad_norm": 8.2109375, + "learning_rate": 8.32054134223211e-06, + "loss": 2.5866, + "mean_token_accuracy": 0.5024516129032258, + "step": 9059 + }, + { + "epoch": 1.6796440489432705, + "grad_norm": 6.140625, + "learning_rate": 8.32035595105673e-06, + "loss": 3.1098, + "mean_token_accuracy": 0.43600791894724583, + "step": 9060 + }, + { + "epoch": 1.6798294401186502, + "grad_norm": 6.9140625, + "learning_rate": 8.32017055988135e-06, + "loss": 3.1584, + "mean_token_accuracy": 0.46320730778990105, + "step": 9061 + }, + { + "epoch": 1.6800148312940304, + "grad_norm": 6.390625, + "learning_rate": 8.31998516870597e-06, + "loss": 2.5249, + "mean_token_accuracy": 0.527389903329753, + "step": 9062 + }, + { + "epoch": 1.6802002224694106, + "grad_norm": 6.23828125, + "learning_rate": 8.31979977753059e-06, + "loss": 2.4759, + "mean_token_accuracy": 0.49195205479452053, + "step": 9063 + }, + { + "epoch": 1.6803856136447906, + "grad_norm": 6.6796875, + "learning_rate": 8.319614386355211e-06, + "loss": 3.4126, + "mean_token_accuracy": 0.4267648864333947, + "step": 9064 + }, + { + "epoch": 1.6805710048201705, + "grad_norm": 6.14453125, + "learning_rate": 8.31942899517983e-06, + "loss": 2.9261, + "mean_token_accuracy": 0.46221216296004725, + "step": 9065 + }, + { + "epoch": 1.6807563959955507, + "grad_norm": 6.14453125, + "learning_rate": 8.31924360400445e-06, + "loss": 3.2519, + "mean_token_accuracy": 0.43577087933497854, + "step": 9066 + }, + { + "epoch": 1.6809417871709307, + "grad_norm": 6.17578125, + "learning_rate": 8.31905821282907e-06, + "loss": 2.9324, + "mean_token_accuracy": 0.4508689474259482, + "step": 9067 + }, + { + "epoch": 1.6811271783463106, + "grad_norm": 6.3828125, + "learning_rate": 8.31887282165369e-06, + "loss": 2.5088, + "mean_token_accuracy": 0.5060381201804162, + "step": 9068 + }, + { + "epoch": 1.6813125695216908, + "grad_norm": 6.03515625, + "learning_rate": 8.31868743047831e-06, + "loss": 2.7209, + "mean_token_accuracy": 0.47542828560710265, + "step": 9069 + }, + { + "epoch": 1.6814979606970708, + "grad_norm": 6.5625, + "learning_rate": 8.31850203930293e-06, + "loss": 2.7264, + "mean_token_accuracy": 0.46617715981877833, + "step": 9070 + }, + { + "epoch": 1.6816833518724508, + "grad_norm": 6.42578125, + "learning_rate": 8.31831664812755e-06, + "loss": 3.1624, + "mean_token_accuracy": 0.4611142533936652, + "step": 9071 + }, + { + "epoch": 1.681868743047831, + "grad_norm": 33.53125, + "learning_rate": 8.31813125695217e-06, + "loss": 3.0743, + "mean_token_accuracy": 0.5067851567191065, + "step": 9072 + }, + { + "epoch": 1.682054134223211, + "grad_norm": 7.07421875, + "learning_rate": 8.31794586577679e-06, + "loss": 3.0683, + "mean_token_accuracy": 0.4524196573715143, + "step": 9073 + }, + { + "epoch": 1.682239525398591, + "grad_norm": 9.8046875, + "learning_rate": 8.31776047460141e-06, + "loss": 3.0395, + "mean_token_accuracy": 0.4491747296528173, + "step": 9074 + }, + { + "epoch": 1.682424916573971, + "grad_norm": 6.96484375, + "learning_rate": 8.31757508342603e-06, + "loss": 2.7205, + "mean_token_accuracy": 0.45052359101070716, + "step": 9075 + }, + { + "epoch": 1.6826103077493513, + "grad_norm": 7.3828125, + "learning_rate": 8.317389692250649e-06, + "loss": 2.6822, + "mean_token_accuracy": 0.4628480509148767, + "step": 9076 + }, + { + "epoch": 1.682795698924731, + "grad_norm": 8.5546875, + "learning_rate": 8.317204301075269e-06, + "loss": 3.2102, + "mean_token_accuracy": 0.4362955774296409, + "step": 9077 + }, + { + "epoch": 1.6829810901001112, + "grad_norm": 9.5234375, + "learning_rate": 8.31701890989989e-06, + "loss": 2.946, + "mean_token_accuracy": 0.44569816643159377, + "step": 9078 + }, + { + "epoch": 1.6831664812754914, + "grad_norm": 10.9140625, + "learning_rate": 8.31683351872451e-06, + "loss": 3.0545, + "mean_token_accuracy": 0.45707070707070707, + "step": 9079 + }, + { + "epoch": 1.6833518724508714, + "grad_norm": 8.4140625, + "learning_rate": 8.316648127549129e-06, + "loss": 2.4792, + "mean_token_accuracy": 0.5179000801496126, + "step": 9080 + }, + { + "epoch": 1.6835372636262513, + "grad_norm": 5.81640625, + "learning_rate": 8.31646273637375e-06, + "loss": 3.2532, + "mean_token_accuracy": 0.43907156673114117, + "step": 9081 + }, + { + "epoch": 1.6837226548016315, + "grad_norm": 5.59375, + "learning_rate": 8.31627734519837e-06, + "loss": 2.8948, + "mean_token_accuracy": 0.47361461714148334, + "step": 9082 + }, + { + "epoch": 1.6839080459770115, + "grad_norm": 6.81640625, + "learning_rate": 8.316091954022989e-06, + "loss": 3.1187, + "mean_token_accuracy": 0.45247904531893735, + "step": 9083 + }, + { + "epoch": 1.6840934371523915, + "grad_norm": 5.84375, + "learning_rate": 8.315906562847609e-06, + "loss": 2.9655, + "mean_token_accuracy": 0.4411298000634719, + "step": 9084 + }, + { + "epoch": 1.6842788283277716, + "grad_norm": 8.6640625, + "learning_rate": 8.315721171672228e-06, + "loss": 2.769, + "mean_token_accuracy": 0.47344643186428176, + "step": 9085 + }, + { + "epoch": 1.6844642195031516, + "grad_norm": 6.0390625, + "learning_rate": 8.31553578049685e-06, + "loss": 3.2741, + "mean_token_accuracy": 0.4599254426840634, + "step": 9086 + }, + { + "epoch": 1.6846496106785316, + "grad_norm": 6.88671875, + "learning_rate": 8.315350389321469e-06, + "loss": 2.901, + "mean_token_accuracy": 0.47023411371237456, + "step": 9087 + }, + { + "epoch": 1.6848350018539118, + "grad_norm": 6.39453125, + "learning_rate": 8.31516499814609e-06, + "loss": 3.1977, + "mean_token_accuracy": 0.44329896907216493, + "step": 9088 + }, + { + "epoch": 1.685020393029292, + "grad_norm": 6.015625, + "learning_rate": 8.314979606970708e-06, + "loss": 3.1098, + "mean_token_accuracy": 0.42452305982589367, + "step": 9089 + }, + { + "epoch": 1.6852057842046717, + "grad_norm": 5.984375, + "learning_rate": 8.314794215795329e-06, + "loss": 3.1597, + "mean_token_accuracy": 0.4833976833976834, + "step": 9090 + }, + { + "epoch": 1.685391175380052, + "grad_norm": 6.9921875, + "learning_rate": 8.314608824619949e-06, + "loss": 3.1161, + "mean_token_accuracy": 0.45440792188144685, + "step": 9091 + }, + { + "epoch": 1.685576566555432, + "grad_norm": 6.19140625, + "learning_rate": 8.314423433444568e-06, + "loss": 2.4878, + "mean_token_accuracy": 0.51136638452237, + "step": 9092 + }, + { + "epoch": 1.685761957730812, + "grad_norm": 6.40234375, + "learning_rate": 8.314238042269188e-06, + "loss": 2.6714, + "mean_token_accuracy": 0.5005433306166802, + "step": 9093 + }, + { + "epoch": 1.685947348906192, + "grad_norm": 6.7734375, + "learning_rate": 8.314052651093809e-06, + "loss": 3.189, + "mean_token_accuracy": 0.428958417987268, + "step": 9094 + }, + { + "epoch": 1.6861327400815722, + "grad_norm": 7.9921875, + "learning_rate": 8.31386725991843e-06, + "loss": 2.5998, + "mean_token_accuracy": 0.4912023460410557, + "step": 9095 + }, + { + "epoch": 1.6863181312569522, + "grad_norm": 6.62109375, + "learning_rate": 8.313681868743048e-06, + "loss": 2.0204, + "mean_token_accuracy": 0.5506028993361333, + "step": 9096 + }, + { + "epoch": 1.6865035224323321, + "grad_norm": 7.9921875, + "learning_rate": 8.313496477567669e-06, + "loss": 2.5934, + "mean_token_accuracy": 0.4632554945054945, + "step": 9097 + }, + { + "epoch": 1.6866889136077123, + "grad_norm": 6.44140625, + "learning_rate": 8.313311086392288e-06, + "loss": 2.6643, + "mean_token_accuracy": 0.49024628139478177, + "step": 9098 + }, + { + "epoch": 1.6868743047830923, + "grad_norm": 6.6328125, + "learning_rate": 8.313125695216908e-06, + "loss": 3.2855, + "mean_token_accuracy": 0.4303356554781507, + "step": 9099 + }, + { + "epoch": 1.6870596959584723, + "grad_norm": 5.83984375, + "learning_rate": 8.312940304041528e-06, + "loss": 2.2972, + "mean_token_accuracy": 0.5406933333333334, + "step": 9100 + }, + { + "epoch": 1.6872450871338525, + "grad_norm": 6.07421875, + "learning_rate": 8.312754912866147e-06, + "loss": 2.7214, + "mean_token_accuracy": 0.48404255319148937, + "step": 9101 + }, + { + "epoch": 1.6874304783092324, + "grad_norm": 8.2109375, + "learning_rate": 8.31256952169077e-06, + "loss": 2.5909, + "mean_token_accuracy": 0.48796112215595316, + "step": 9102 + }, + { + "epoch": 1.6876158694846124, + "grad_norm": 6.20703125, + "learning_rate": 8.312384130515388e-06, + "loss": 3.1369, + "mean_token_accuracy": 0.42562893081761005, + "step": 9103 + }, + { + "epoch": 1.6878012606599926, + "grad_norm": 7.109375, + "learning_rate": 8.312198739340009e-06, + "loss": 3.0262, + "mean_token_accuracy": 0.4610630407911001, + "step": 9104 + }, + { + "epoch": 1.6879866518353728, + "grad_norm": 6.6484375, + "learning_rate": 8.312013348164628e-06, + "loss": 2.7802, + "mean_token_accuracy": 0.47671840354767187, + "step": 9105 + }, + { + "epoch": 1.6881720430107527, + "grad_norm": 10.421875, + "learning_rate": 8.311827956989248e-06, + "loss": 2.5358, + "mean_token_accuracy": 0.48253968253968255, + "step": 9106 + }, + { + "epoch": 1.6883574341861327, + "grad_norm": 5.515625, + "learning_rate": 8.311642565813869e-06, + "loss": 2.5276, + "mean_token_accuracy": 0.5057920648711266, + "step": 9107 + }, + { + "epoch": 1.688542825361513, + "grad_norm": 5.66796875, + "learning_rate": 8.311457174638487e-06, + "loss": 2.7112, + "mean_token_accuracy": 0.47992403689636465, + "step": 9108 + }, + { + "epoch": 1.6887282165368929, + "grad_norm": 6.9609375, + "learning_rate": 8.311271783463108e-06, + "loss": 3.8169, + "mean_token_accuracy": 0.448292164046803, + "step": 9109 + }, + { + "epoch": 1.6889136077122728, + "grad_norm": 6.42578125, + "learning_rate": 8.311086392287728e-06, + "loss": 2.9506, + "mean_token_accuracy": 0.5024707725683982, + "step": 9110 + }, + { + "epoch": 1.689098998887653, + "grad_norm": 7.9609375, + "learning_rate": 8.310901001112349e-06, + "loss": 2.8987, + "mean_token_accuracy": 0.45084897229669346, + "step": 9111 + }, + { + "epoch": 1.689284390063033, + "grad_norm": 6.29296875, + "learning_rate": 8.310715609936968e-06, + "loss": 2.9296, + "mean_token_accuracy": 0.448702269330072, + "step": 9112 + }, + { + "epoch": 1.689469781238413, + "grad_norm": 5.46875, + "learning_rate": 8.310530218761588e-06, + "loss": 3.2255, + "mean_token_accuracy": 0.4287630402384501, + "step": 9113 + }, + { + "epoch": 1.6896551724137931, + "grad_norm": 5.8671875, + "learning_rate": 8.310344827586207e-06, + "loss": 3.1278, + "mean_token_accuracy": 0.44597279535381323, + "step": 9114 + }, + { + "epoch": 1.6898405635891731, + "grad_norm": 6.640625, + "learning_rate": 8.310159436410827e-06, + "loss": 3.0455, + "mean_token_accuracy": 0.4509682224428997, + "step": 9115 + }, + { + "epoch": 1.690025954764553, + "grad_norm": 6.19921875, + "learning_rate": 8.309974045235448e-06, + "loss": 2.872, + "mean_token_accuracy": 0.4431968295904888, + "step": 9116 + }, + { + "epoch": 1.6902113459399333, + "grad_norm": 6.76171875, + "learning_rate": 8.309788654060067e-06, + "loss": 2.564, + "mean_token_accuracy": 0.4997751798561151, + "step": 9117 + }, + { + "epoch": 1.6903967371153135, + "grad_norm": 7.09765625, + "learning_rate": 8.309603262884687e-06, + "loss": 2.4654, + "mean_token_accuracy": 0.4890738813735692, + "step": 9118 + }, + { + "epoch": 1.6905821282906932, + "grad_norm": 6.6484375, + "learning_rate": 8.309417871709308e-06, + "loss": 3.405, + "mean_token_accuracy": 0.424188467333242, + "step": 9119 + }, + { + "epoch": 1.6907675194660734, + "grad_norm": 5.5234375, + "learning_rate": 8.309232480533928e-06, + "loss": 3.1678, + "mean_token_accuracy": 0.44377059986816086, + "step": 9120 + }, + { + "epoch": 1.6909529106414536, + "grad_norm": 6.12109375, + "learning_rate": 8.309047089358547e-06, + "loss": 2.6648, + "mean_token_accuracy": 0.4827944230198754, + "step": 9121 + }, + { + "epoch": 1.6911383018168336, + "grad_norm": 5.69921875, + "learning_rate": 8.308861698183167e-06, + "loss": 3.0312, + "mean_token_accuracy": 0.43607245996324495, + "step": 9122 + }, + { + "epoch": 1.6913236929922135, + "grad_norm": 6.01171875, + "learning_rate": 8.308676307007786e-06, + "loss": 3.0037, + "mean_token_accuracy": 0.4627159745377387, + "step": 9123 + }, + { + "epoch": 1.6915090841675937, + "grad_norm": 5.9140625, + "learning_rate": 8.308490915832407e-06, + "loss": 3.2002, + "mean_token_accuracy": 0.44009475127789555, + "step": 9124 + }, + { + "epoch": 1.6916944753429737, + "grad_norm": 5.5234375, + "learning_rate": 8.308305524657027e-06, + "loss": 2.7782, + "mean_token_accuracy": 0.47016314464031533, + "step": 9125 + }, + { + "epoch": 1.6918798665183536, + "grad_norm": 5.7578125, + "learning_rate": 8.308120133481648e-06, + "loss": 3.0414, + "mean_token_accuracy": 0.4603316326530612, + "step": 9126 + }, + { + "epoch": 1.6920652576937338, + "grad_norm": 7.6796875, + "learning_rate": 8.307934742306267e-06, + "loss": 2.6886, + "mean_token_accuracy": 0.47412060301507536, + "step": 9127 + }, + { + "epoch": 1.6922506488691138, + "grad_norm": 5.9140625, + "learning_rate": 8.307749351130887e-06, + "loss": 2.418, + "mean_token_accuracy": 0.49349069229833314, + "step": 9128 + }, + { + "epoch": 1.6924360400444938, + "grad_norm": 5.54296875, + "learning_rate": 8.307563959955507e-06, + "loss": 3.019, + "mean_token_accuracy": 0.44623799359658484, + "step": 9129 + }, + { + "epoch": 1.692621431219874, + "grad_norm": 6.51171875, + "learning_rate": 8.307378568780126e-06, + "loss": 2.6168, + "mean_token_accuracy": 0.4892263759086189, + "step": 9130 + }, + { + "epoch": 1.6928068223952542, + "grad_norm": 5.9921875, + "learning_rate": 8.307193177604747e-06, + "loss": 3.3726, + "mean_token_accuracy": 0.41840161182001345, + "step": 9131 + }, + { + "epoch": 1.692992213570634, + "grad_norm": 6.66015625, + "learning_rate": 8.307007786429366e-06, + "loss": 2.371, + "mean_token_accuracy": 0.5277275467148885, + "step": 9132 + }, + { + "epoch": 1.693177604746014, + "grad_norm": 7.6796875, + "learning_rate": 8.306822395253986e-06, + "loss": 2.9492, + "mean_token_accuracy": 0.4437097321125805, + "step": 9133 + }, + { + "epoch": 1.6933629959213943, + "grad_norm": 7.59765625, + "learning_rate": 8.306637004078607e-06, + "loss": 2.9861, + "mean_token_accuracy": 0.45683563748079875, + "step": 9134 + }, + { + "epoch": 1.6935483870967742, + "grad_norm": 5.83984375, + "learning_rate": 8.306451612903227e-06, + "loss": 2.6193, + "mean_token_accuracy": 0.4721680420105026, + "step": 9135 + }, + { + "epoch": 1.6937337782721542, + "grad_norm": 6.609375, + "learning_rate": 8.306266221727846e-06, + "loss": 3.0677, + "mean_token_accuracy": 0.44235895157707683, + "step": 9136 + }, + { + "epoch": 1.6939191694475344, + "grad_norm": 7.86328125, + "learning_rate": 8.306080830552466e-06, + "loss": 3.1931, + "mean_token_accuracy": 0.44338819523269013, + "step": 9137 + }, + { + "epoch": 1.6941045606229144, + "grad_norm": 7.09375, + "learning_rate": 8.305895439377087e-06, + "loss": 2.1719, + "mean_token_accuracy": 0.531744975376015, + "step": 9138 + }, + { + "epoch": 1.6942899517982943, + "grad_norm": 10.1796875, + "learning_rate": 8.305710048201706e-06, + "loss": 3.0565, + "mean_token_accuracy": 0.4555105461656656, + "step": 9139 + }, + { + "epoch": 1.6944753429736745, + "grad_norm": 7.66015625, + "learning_rate": 8.305524657026326e-06, + "loss": 2.9333, + "mean_token_accuracy": 0.4652434956637759, + "step": 9140 + }, + { + "epoch": 1.6946607341490545, + "grad_norm": 7.47265625, + "learning_rate": 8.305339265850945e-06, + "loss": 3.2352, + "mean_token_accuracy": 0.4527670074021854, + "step": 9141 + }, + { + "epoch": 1.6948461253244345, + "grad_norm": 6.17578125, + "learning_rate": 8.305153874675567e-06, + "loss": 2.5131, + "mean_token_accuracy": 0.4810828440965427, + "step": 9142 + }, + { + "epoch": 1.6950315164998146, + "grad_norm": 5.85546875, + "learning_rate": 8.304968483500186e-06, + "loss": 2.4858, + "mean_token_accuracy": 0.5054072553045859, + "step": 9143 + }, + { + "epoch": 1.6952169076751946, + "grad_norm": 6.0625, + "learning_rate": 8.304783092324806e-06, + "loss": 2.6147, + "mean_token_accuracy": 0.49845616861323666, + "step": 9144 + }, + { + "epoch": 1.6954022988505746, + "grad_norm": 6.94921875, + "learning_rate": 8.304597701149427e-06, + "loss": 2.9895, + "mean_token_accuracy": 0.4867490106727425, + "step": 9145 + }, + { + "epoch": 1.6955876900259548, + "grad_norm": 6.90625, + "learning_rate": 8.304412309974046e-06, + "loss": 2.7804, + "mean_token_accuracy": 0.4589828373484491, + "step": 9146 + }, + { + "epoch": 1.695773081201335, + "grad_norm": 7.94921875, + "learning_rate": 8.304226918798666e-06, + "loss": 2.3435, + "mean_token_accuracy": 0.5245294204831777, + "step": 9147 + }, + { + "epoch": 1.6959584723767147, + "grad_norm": 6.80859375, + "learning_rate": 8.304041527623285e-06, + "loss": 3.4427, + "mean_token_accuracy": 0.4133880290881969, + "step": 9148 + }, + { + "epoch": 1.696143863552095, + "grad_norm": 8.8046875, + "learning_rate": 8.303856136447905e-06, + "loss": 3.057, + "mean_token_accuracy": 0.44923903312444047, + "step": 9149 + }, + { + "epoch": 1.696329254727475, + "grad_norm": 7.5, + "learning_rate": 8.303670745272526e-06, + "loss": 3.2175, + "mean_token_accuracy": 0.44012042818911684, + "step": 9150 + }, + { + "epoch": 1.696514645902855, + "grad_norm": 6.1875, + "learning_rate": 8.303485354097146e-06, + "loss": 2.9516, + "mean_token_accuracy": 0.4599203187250996, + "step": 9151 + }, + { + "epoch": 1.696700037078235, + "grad_norm": 5.84375, + "learning_rate": 8.303299962921765e-06, + "loss": 3.0816, + "mean_token_accuracy": 0.42380614417520024, + "step": 9152 + }, + { + "epoch": 1.6968854282536152, + "grad_norm": 7.4140625, + "learning_rate": 8.303114571746386e-06, + "loss": 2.6362, + "mean_token_accuracy": 0.5126780482877058, + "step": 9153 + }, + { + "epoch": 1.6970708194289952, + "grad_norm": 7.640625, + "learning_rate": 8.302929180571006e-06, + "loss": 3.095, + "mean_token_accuracy": 0.4669619785898856, + "step": 9154 + }, + { + "epoch": 1.6972562106043751, + "grad_norm": 6.19921875, + "learning_rate": 8.302743789395625e-06, + "loss": 3.6208, + "mean_token_accuracy": 0.4143248306672431, + "step": 9155 + }, + { + "epoch": 1.6974416017797553, + "grad_norm": 14.515625, + "learning_rate": 8.302558398220246e-06, + "loss": 3.1389, + "mean_token_accuracy": 0.47212905286695417, + "step": 9156 + }, + { + "epoch": 1.6976269929551353, + "grad_norm": 4.984375, + "learning_rate": 8.302373007044864e-06, + "loss": 2.618, + "mean_token_accuracy": 0.4993939393939394, + "step": 9157 + }, + { + "epoch": 1.6978123841305153, + "grad_norm": 6.0625, + "learning_rate": 8.302187615869485e-06, + "loss": 3.0672, + "mean_token_accuracy": 0.4473547717842324, + "step": 9158 + }, + { + "epoch": 1.6979977753058955, + "grad_norm": 7.78125, + "learning_rate": 8.302002224694105e-06, + "loss": 2.5315, + "mean_token_accuracy": 0.5020033741037537, + "step": 9159 + }, + { + "epoch": 1.6981831664812757, + "grad_norm": 6.65625, + "learning_rate": 8.301816833518726e-06, + "loss": 2.8877, + "mean_token_accuracy": 0.4588102833552261, + "step": 9160 + }, + { + "epoch": 1.6983685576566554, + "grad_norm": 5.9140625, + "learning_rate": 8.301631442343345e-06, + "loss": 2.7359, + "mean_token_accuracy": 0.4795289855072464, + "step": 9161 + }, + { + "epoch": 1.6985539488320356, + "grad_norm": 8.1640625, + "learning_rate": 8.301446051167965e-06, + "loss": 3.0187, + "mean_token_accuracy": 0.4553030303030303, + "step": 9162 + }, + { + "epoch": 1.6987393400074158, + "grad_norm": 10.9921875, + "learning_rate": 8.301260659992586e-06, + "loss": 2.7128, + "mean_token_accuracy": 0.465080778526905, + "step": 9163 + }, + { + "epoch": 1.6989247311827957, + "grad_norm": 6.1171875, + "learning_rate": 8.301075268817204e-06, + "loss": 3.1521, + "mean_token_accuracy": 0.44034786869181775, + "step": 9164 + }, + { + "epoch": 1.6991101223581757, + "grad_norm": 8.265625, + "learning_rate": 8.300889877641825e-06, + "loss": 3.1169, + "mean_token_accuracy": 0.4561344537815126, + "step": 9165 + }, + { + "epoch": 1.699295513533556, + "grad_norm": 6.06640625, + "learning_rate": 8.300704486466444e-06, + "loss": 3.2531, + "mean_token_accuracy": 0.4515744906059804, + "step": 9166 + }, + { + "epoch": 1.6994809047089359, + "grad_norm": 5.53515625, + "learning_rate": 8.300519095291066e-06, + "loss": 3.0218, + "mean_token_accuracy": 0.46347125495015706, + "step": 9167 + }, + { + "epoch": 1.6996662958843158, + "grad_norm": 8.65625, + "learning_rate": 8.300333704115685e-06, + "loss": 2.7546, + "mean_token_accuracy": 0.45079400581525386, + "step": 9168 + }, + { + "epoch": 1.699851687059696, + "grad_norm": 9.2265625, + "learning_rate": 8.300148312940305e-06, + "loss": 2.6805, + "mean_token_accuracy": 0.49092607419268747, + "step": 9169 + }, + { + "epoch": 1.700037078235076, + "grad_norm": 5.4296875, + "learning_rate": 8.299962921764924e-06, + "loss": 2.7725, + "mean_token_accuracy": 0.4744058500914077, + "step": 9170 + }, + { + "epoch": 1.700222469410456, + "grad_norm": 6.875, + "learning_rate": 8.299777530589544e-06, + "loss": 2.8927, + "mean_token_accuracy": 0.4659549228944247, + "step": 9171 + }, + { + "epoch": 1.7004078605858362, + "grad_norm": 5.8828125, + "learning_rate": 8.299592139414165e-06, + "loss": 2.6225, + "mean_token_accuracy": 0.49113960524420114, + "step": 9172 + }, + { + "epoch": 1.7005932517612161, + "grad_norm": 8.4140625, + "learning_rate": 8.299406748238784e-06, + "loss": 3.0099, + "mean_token_accuracy": 0.4428220326731242, + "step": 9173 + }, + { + "epoch": 1.700778642936596, + "grad_norm": 6.63671875, + "learning_rate": 8.299221357063404e-06, + "loss": 3.1017, + "mean_token_accuracy": 0.4497628288055196, + "step": 9174 + }, + { + "epoch": 1.7009640341119763, + "grad_norm": 6.0, + "learning_rate": 8.299035965888025e-06, + "loss": 3.1422, + "mean_token_accuracy": 0.46157643574131973, + "step": 9175 + }, + { + "epoch": 1.7011494252873565, + "grad_norm": 6.65625, + "learning_rate": 8.298850574712645e-06, + "loss": 2.8155, + "mean_token_accuracy": 0.502017238217495, + "step": 9176 + }, + { + "epoch": 1.7013348164627362, + "grad_norm": 6.51171875, + "learning_rate": 8.298665183537264e-06, + "loss": 2.9695, + "mean_token_accuracy": 0.4541035967226774, + "step": 9177 + }, + { + "epoch": 1.7015202076381164, + "grad_norm": 6.44140625, + "learning_rate": 8.298479792361884e-06, + "loss": 2.8345, + "mean_token_accuracy": 0.4619680338917774, + "step": 9178 + }, + { + "epoch": 1.7017055988134966, + "grad_norm": 5.23828125, + "learning_rate": 8.298294401186503e-06, + "loss": 2.4934, + "mean_token_accuracy": 0.5127464523420822, + "step": 9179 + }, + { + "epoch": 1.7018909899888766, + "grad_norm": 5.625, + "learning_rate": 8.298109010011124e-06, + "loss": 3.3165, + "mean_token_accuracy": 0.4198581560283688, + "step": 9180 + }, + { + "epoch": 1.7020763811642565, + "grad_norm": 6.44921875, + "learning_rate": 8.297923618835744e-06, + "loss": 3.1553, + "mean_token_accuracy": 0.42332230623818523, + "step": 9181 + }, + { + "epoch": 1.7022617723396367, + "grad_norm": 9.1171875, + "learning_rate": 8.297738227660363e-06, + "loss": 2.7647, + "mean_token_accuracy": 0.4835945427894915, + "step": 9182 + }, + { + "epoch": 1.7024471635150167, + "grad_norm": 4.9140625, + "learning_rate": 8.297552836484985e-06, + "loss": 2.4592, + "mean_token_accuracy": 0.4954534283607766, + "step": 9183 + }, + { + "epoch": 1.7026325546903966, + "grad_norm": 6.125, + "learning_rate": 8.297367445309604e-06, + "loss": 3.2922, + "mean_token_accuracy": 0.4340136054421769, + "step": 9184 + }, + { + "epoch": 1.7028179458657768, + "grad_norm": 6.26171875, + "learning_rate": 8.297182054134225e-06, + "loss": 3.2195, + "mean_token_accuracy": 0.4219881500987492, + "step": 9185 + }, + { + "epoch": 1.7030033370411568, + "grad_norm": 7.85546875, + "learning_rate": 8.296996662958843e-06, + "loss": 3.0717, + "mean_token_accuracy": 0.45837912087912086, + "step": 9186 + }, + { + "epoch": 1.7031887282165368, + "grad_norm": 7.41015625, + "learning_rate": 8.296811271783464e-06, + "loss": 2.836, + "mean_token_accuracy": 0.45192192551790206, + "step": 9187 + }, + { + "epoch": 1.703374119391917, + "grad_norm": 8.1953125, + "learning_rate": 8.296625880608084e-06, + "loss": 2.6632, + "mean_token_accuracy": 0.47739846174605316, + "step": 9188 + }, + { + "epoch": 1.7035595105672972, + "grad_norm": 6.4140625, + "learning_rate": 8.296440489432703e-06, + "loss": 2.998, + "mean_token_accuracy": 0.45372960372960375, + "step": 9189 + }, + { + "epoch": 1.703744901742677, + "grad_norm": 6.96484375, + "learning_rate": 8.296255098257324e-06, + "loss": 2.5713, + "mean_token_accuracy": 0.5227439471753484, + "step": 9190 + }, + { + "epoch": 1.703930292918057, + "grad_norm": 7.65625, + "learning_rate": 8.296069707081944e-06, + "loss": 2.9373, + "mean_token_accuracy": 0.44803242253889397, + "step": 9191 + }, + { + "epoch": 1.7041156840934373, + "grad_norm": 6.484375, + "learning_rate": 8.295884315906565e-06, + "loss": 2.6226, + "mean_token_accuracy": 0.505586592178771, + "step": 9192 + }, + { + "epoch": 1.7043010752688172, + "grad_norm": 7.01953125, + "learning_rate": 8.295698924731183e-06, + "loss": 3.0122, + "mean_token_accuracy": 0.43734910671173344, + "step": 9193 + }, + { + "epoch": 1.7044864664441972, + "grad_norm": 5.5234375, + "learning_rate": 8.295513533555804e-06, + "loss": 3.536, + "mean_token_accuracy": 0.4166666666666667, + "step": 9194 + }, + { + "epoch": 1.7046718576195774, + "grad_norm": 7.4765625, + "learning_rate": 8.295328142380423e-06, + "loss": 2.5758, + "mean_token_accuracy": 0.4848899390623057, + "step": 9195 + }, + { + "epoch": 1.7048572487949574, + "grad_norm": 7.26953125, + "learning_rate": 8.295142751205043e-06, + "loss": 2.6302, + "mean_token_accuracy": 0.5172917681441792, + "step": 9196 + }, + { + "epoch": 1.7050426399703373, + "grad_norm": 6.80078125, + "learning_rate": 8.294957360029664e-06, + "loss": 2.5561, + "mean_token_accuracy": 0.49020798542583877, + "step": 9197 + }, + { + "epoch": 1.7052280311457175, + "grad_norm": 6.0703125, + "learning_rate": 8.294771968854282e-06, + "loss": 3.5002, + "mean_token_accuracy": 0.43086719223771985, + "step": 9198 + }, + { + "epoch": 1.7054134223210975, + "grad_norm": 5.21484375, + "learning_rate": 8.294586577678903e-06, + "loss": 2.8274, + "mean_token_accuracy": 0.46041412911084045, + "step": 9199 + }, + { + "epoch": 1.7055988134964775, + "grad_norm": 6.8046875, + "learning_rate": 8.294401186503523e-06, + "loss": 2.5256, + "mean_token_accuracy": 0.5048094484354073, + "step": 9200 + }, + { + "epoch": 1.7057842046718577, + "grad_norm": 5.96484375, + "learning_rate": 8.294215795328144e-06, + "loss": 2.9088, + "mean_token_accuracy": 0.476592478894858, + "step": 9201 + }, + { + "epoch": 1.7059695958472376, + "grad_norm": 7.859375, + "learning_rate": 8.294030404152763e-06, + "loss": 2.65, + "mean_token_accuracy": 0.4851347596554598, + "step": 9202 + }, + { + "epoch": 1.7061549870226176, + "grad_norm": 5.8046875, + "learning_rate": 8.293845012977383e-06, + "loss": 3.079, + "mean_token_accuracy": 0.4226062687848862, + "step": 9203 + }, + { + "epoch": 1.7063403781979978, + "grad_norm": 7.08984375, + "learning_rate": 8.293659621802002e-06, + "loss": 3.1121, + "mean_token_accuracy": 0.44983563445101904, + "step": 9204 + }, + { + "epoch": 1.706525769373378, + "grad_norm": 6.48046875, + "learning_rate": 8.293474230626622e-06, + "loss": 3.2747, + "mean_token_accuracy": 0.45783464025312975, + "step": 9205 + }, + { + "epoch": 1.706711160548758, + "grad_norm": 5.3125, + "learning_rate": 8.293288839451243e-06, + "loss": 3.0289, + "mean_token_accuracy": 0.4499727371864776, + "step": 9206 + }, + { + "epoch": 1.706896551724138, + "grad_norm": 5.73046875, + "learning_rate": 8.293103448275863e-06, + "loss": 2.8939, + "mean_token_accuracy": 0.4688015393073117, + "step": 9207 + }, + { + "epoch": 1.707081942899518, + "grad_norm": 7.26953125, + "learning_rate": 8.292918057100482e-06, + "loss": 3.2186, + "mean_token_accuracy": 0.4258528660942214, + "step": 9208 + }, + { + "epoch": 1.707267334074898, + "grad_norm": 5.3671875, + "learning_rate": 8.292732665925103e-06, + "loss": 2.9231, + "mean_token_accuracy": 0.4463087248322148, + "step": 9209 + }, + { + "epoch": 1.707452725250278, + "grad_norm": 7.109375, + "learning_rate": 8.292547274749723e-06, + "loss": 2.6988, + "mean_token_accuracy": 0.4710302766281856, + "step": 9210 + }, + { + "epoch": 1.7076381164256582, + "grad_norm": 6.7890625, + "learning_rate": 8.292361883574342e-06, + "loss": 2.9817, + "mean_token_accuracy": 0.45598740767647417, + "step": 9211 + }, + { + "epoch": 1.7078235076010382, + "grad_norm": 8.3515625, + "learning_rate": 8.292176492398963e-06, + "loss": 3.4509, + "mean_token_accuracy": 0.4278626452539496, + "step": 9212 + }, + { + "epoch": 1.7080088987764181, + "grad_norm": 9.9921875, + "learning_rate": 8.291991101223581e-06, + "loss": 2.4456, + "mean_token_accuracy": 0.48627092050209203, + "step": 9213 + }, + { + "epoch": 1.7081942899517983, + "grad_norm": 5.94140625, + "learning_rate": 8.291805710048202e-06, + "loss": 2.8301, + "mean_token_accuracy": 0.46034129692832765, + "step": 9214 + }, + { + "epoch": 1.7083796811271783, + "grad_norm": 6.36328125, + "learning_rate": 8.291620318872822e-06, + "loss": 2.6948, + "mean_token_accuracy": 0.4872531867033242, + "step": 9215 + }, + { + "epoch": 1.7085650723025583, + "grad_norm": 6.84765625, + "learning_rate": 8.291434927697443e-06, + "loss": 3.2476, + "mean_token_accuracy": 0.4539614561027837, + "step": 9216 + }, + { + "epoch": 1.7087504634779385, + "grad_norm": 6.4375, + "learning_rate": 8.291249536522062e-06, + "loss": 3.4262, + "mean_token_accuracy": 0.42397806580259223, + "step": 9217 + }, + { + "epoch": 1.7089358546533187, + "grad_norm": 7.6328125, + "learning_rate": 8.291064145346682e-06, + "loss": 3.2596, + "mean_token_accuracy": 0.4411130284728214, + "step": 9218 + }, + { + "epoch": 1.7091212458286984, + "grad_norm": 7.6796875, + "learning_rate": 8.290878754171303e-06, + "loss": 3.3181, + "mean_token_accuracy": 0.44378204363896506, + "step": 9219 + }, + { + "epoch": 1.7093066370040786, + "grad_norm": 7.40234375, + "learning_rate": 8.290693362995921e-06, + "loss": 3.0276, + "mean_token_accuracy": 0.4325952914798206, + "step": 9220 + }, + { + "epoch": 1.7094920281794588, + "grad_norm": 6.6640625, + "learning_rate": 8.290507971820542e-06, + "loss": 2.9692, + "mean_token_accuracy": 0.4624371487725525, + "step": 9221 + }, + { + "epoch": 1.7096774193548387, + "grad_norm": 8.3203125, + "learning_rate": 8.29032258064516e-06, + "loss": 2.4669, + "mean_token_accuracy": 0.4934699103713188, + "step": 9222 + }, + { + "epoch": 1.7098628105302187, + "grad_norm": 6.18359375, + "learning_rate": 8.290137189469783e-06, + "loss": 2.8584, + "mean_token_accuracy": 0.46683811586356255, + "step": 9223 + }, + { + "epoch": 1.710048201705599, + "grad_norm": 6.41796875, + "learning_rate": 8.289951798294402e-06, + "loss": 2.0392, + "mean_token_accuracy": 0.568015602145295, + "step": 9224 + }, + { + "epoch": 1.7102335928809789, + "grad_norm": 6.89453125, + "learning_rate": 8.289766407119022e-06, + "loss": 3.2076, + "mean_token_accuracy": 0.4288857006217121, + "step": 9225 + }, + { + "epoch": 1.7104189840563588, + "grad_norm": 6.3828125, + "learning_rate": 8.289581015943643e-06, + "loss": 2.5082, + "mean_token_accuracy": 0.5105743209381578, + "step": 9226 + }, + { + "epoch": 1.710604375231739, + "grad_norm": 6.36328125, + "learning_rate": 8.289395624768261e-06, + "loss": 2.7186, + "mean_token_accuracy": 0.4736082722135385, + "step": 9227 + }, + { + "epoch": 1.710789766407119, + "grad_norm": 6.50390625, + "learning_rate": 8.289210233592882e-06, + "loss": 2.754, + "mean_token_accuracy": 0.4829787234042553, + "step": 9228 + }, + { + "epoch": 1.710975157582499, + "grad_norm": 7.1875, + "learning_rate": 8.2890248424175e-06, + "loss": 2.7404, + "mean_token_accuracy": 0.4737308520065831, + "step": 9229 + }, + { + "epoch": 1.7111605487578792, + "grad_norm": 5.14453125, + "learning_rate": 8.288839451242121e-06, + "loss": 3.2076, + "mean_token_accuracy": 0.4420117732107818, + "step": 9230 + }, + { + "epoch": 1.7113459399332593, + "grad_norm": 6.62109375, + "learning_rate": 8.288654060066742e-06, + "loss": 2.8795, + "mean_token_accuracy": 0.4672639558924879, + "step": 9231 + }, + { + "epoch": 1.711531331108639, + "grad_norm": 6.3984375, + "learning_rate": 8.288468668891362e-06, + "loss": 3.4243, + "mean_token_accuracy": 0.4143802190651773, + "step": 9232 + }, + { + "epoch": 1.7117167222840193, + "grad_norm": 6.47265625, + "learning_rate": 8.288283277715981e-06, + "loss": 3.3869, + "mean_token_accuracy": 0.4306097927305497, + "step": 9233 + }, + { + "epoch": 1.7119021134593995, + "grad_norm": 7.28515625, + "learning_rate": 8.288097886540601e-06, + "loss": 2.7603, + "mean_token_accuracy": 0.4820444295974612, + "step": 9234 + }, + { + "epoch": 1.7120875046347794, + "grad_norm": 5.51171875, + "learning_rate": 8.287912495365222e-06, + "loss": 3.3483, + "mean_token_accuracy": 0.42633504023408925, + "step": 9235 + }, + { + "epoch": 1.7122728958101594, + "grad_norm": 6.96484375, + "learning_rate": 8.28772710418984e-06, + "loss": 2.4689, + "mean_token_accuracy": 0.4989429175475687, + "step": 9236 + }, + { + "epoch": 1.7124582869855396, + "grad_norm": 5.9296875, + "learning_rate": 8.287541713014461e-06, + "loss": 3.0406, + "mean_token_accuracy": 0.43970588235294117, + "step": 9237 + }, + { + "epoch": 1.7126436781609196, + "grad_norm": 5.6015625, + "learning_rate": 8.28735632183908e-06, + "loss": 2.9407, + "mean_token_accuracy": 0.4594490216271885, + "step": 9238 + }, + { + "epoch": 1.7128290693362995, + "grad_norm": 5.8515625, + "learning_rate": 8.287170930663702e-06, + "loss": 3.2096, + "mean_token_accuracy": 0.4372937293729373, + "step": 9239 + }, + { + "epoch": 1.7130144605116797, + "grad_norm": 5.421875, + "learning_rate": 8.286985539488321e-06, + "loss": 2.894, + "mean_token_accuracy": 0.4685230024213075, + "step": 9240 + }, + { + "epoch": 1.7131998516870597, + "grad_norm": 6.15625, + "learning_rate": 8.286800148312942e-06, + "loss": 3.0391, + "mean_token_accuracy": 0.4578050443081118, + "step": 9241 + }, + { + "epoch": 1.7133852428624397, + "grad_norm": 5.78515625, + "learning_rate": 8.28661475713756e-06, + "loss": 2.9948, + "mean_token_accuracy": 0.447605561277034, + "step": 9242 + }, + { + "epoch": 1.7135706340378198, + "grad_norm": 5.9921875, + "learning_rate": 8.28642936596218e-06, + "loss": 2.7272, + "mean_token_accuracy": 0.4799179442884906, + "step": 9243 + }, + { + "epoch": 1.7137560252131998, + "grad_norm": 7.09375, + "learning_rate": 8.286243974786801e-06, + "loss": 3.0862, + "mean_token_accuracy": 0.44486732212707475, + "step": 9244 + }, + { + "epoch": 1.7139414163885798, + "grad_norm": 5.9375, + "learning_rate": 8.28605858361142e-06, + "loss": 2.9846, + "mean_token_accuracy": 0.4597345132743363, + "step": 9245 + }, + { + "epoch": 1.71412680756396, + "grad_norm": 6.125, + "learning_rate": 8.28587319243604e-06, + "loss": 2.7016, + "mean_token_accuracy": 0.4590254706533776, + "step": 9246 + }, + { + "epoch": 1.7143121987393402, + "grad_norm": 6.0625, + "learning_rate": 8.285687801260661e-06, + "loss": 3.0713, + "mean_token_accuracy": 0.43330821401657876, + "step": 9247 + }, + { + "epoch": 1.71449758991472, + "grad_norm": 5.92578125, + "learning_rate": 8.285502410085282e-06, + "loss": 2.8442, + "mean_token_accuracy": 0.48251509283517485, + "step": 9248 + }, + { + "epoch": 1.7146829810901, + "grad_norm": 6.6015625, + "learning_rate": 8.2853170189099e-06, + "loss": 2.642, + "mean_token_accuracy": 0.495844414893617, + "step": 9249 + }, + { + "epoch": 1.7148683722654803, + "grad_norm": 6.19140625, + "learning_rate": 8.285131627734521e-06, + "loss": 2.5013, + "mean_token_accuracy": 0.4824617756649107, + "step": 9250 + }, + { + "epoch": 1.7150537634408602, + "grad_norm": 6.13671875, + "learning_rate": 8.28494623655914e-06, + "loss": 2.8992, + "mean_token_accuracy": 0.4868729488982654, + "step": 9251 + }, + { + "epoch": 1.7152391546162402, + "grad_norm": 5.8203125, + "learning_rate": 8.28476084538376e-06, + "loss": 2.4433, + "mean_token_accuracy": 0.49912810194500334, + "step": 9252 + }, + { + "epoch": 1.7154245457916204, + "grad_norm": 8.203125, + "learning_rate": 8.28457545420838e-06, + "loss": 2.8871, + "mean_token_accuracy": 0.4303702913242984, + "step": 9253 + }, + { + "epoch": 1.7156099369670004, + "grad_norm": 6.82421875, + "learning_rate": 8.284390063033e-06, + "loss": 2.9166, + "mean_token_accuracy": 0.5072049279507205, + "step": 9254 + }, + { + "epoch": 1.7157953281423803, + "grad_norm": 9.1953125, + "learning_rate": 8.284204671857622e-06, + "loss": 2.9052, + "mean_token_accuracy": 0.4379371828281443, + "step": 9255 + }, + { + "epoch": 1.7159807193177605, + "grad_norm": 5.58984375, + "learning_rate": 8.28401928068224e-06, + "loss": 2.8858, + "mean_token_accuracy": 0.44904204364023415, + "step": 9256 + }, + { + "epoch": 1.7161661104931405, + "grad_norm": 7.6328125, + "learning_rate": 8.283833889506861e-06, + "loss": 2.713, + "mean_token_accuracy": 0.46743111960503264, + "step": 9257 + }, + { + "epoch": 1.7163515016685205, + "grad_norm": 5.37890625, + "learning_rate": 8.28364849833148e-06, + "loss": 3.3529, + "mean_token_accuracy": 0.41551064991807757, + "step": 9258 + }, + { + "epoch": 1.7165368928439007, + "grad_norm": 5.9140625, + "learning_rate": 8.2834631071561e-06, + "loss": 2.9271, + "mean_token_accuracy": 0.4464420607342499, + "step": 9259 + }, + { + "epoch": 1.7167222840192808, + "grad_norm": 7.15625, + "learning_rate": 8.283277715980719e-06, + "loss": 2.8367, + "mean_token_accuracy": 0.46855072463768116, + "step": 9260 + }, + { + "epoch": 1.7169076751946606, + "grad_norm": 5.828125, + "learning_rate": 8.28309232480534e-06, + "loss": 2.402, + "mean_token_accuracy": 0.5226076184577269, + "step": 9261 + }, + { + "epoch": 1.7170930663700408, + "grad_norm": 6.359375, + "learning_rate": 8.28290693362996e-06, + "loss": 2.9922, + "mean_token_accuracy": 0.44770979221673746, + "step": 9262 + }, + { + "epoch": 1.717278457545421, + "grad_norm": 6.09765625, + "learning_rate": 8.28272154245458e-06, + "loss": 3.0814, + "mean_token_accuracy": 0.4475608294378521, + "step": 9263 + }, + { + "epoch": 1.717463848720801, + "grad_norm": 7.18359375, + "learning_rate": 8.282536151279201e-06, + "loss": 2.1824, + "mean_token_accuracy": 0.5180355452728994, + "step": 9264 + }, + { + "epoch": 1.717649239896181, + "grad_norm": 7.51953125, + "learning_rate": 8.28235076010382e-06, + "loss": 2.67, + "mean_token_accuracy": 0.48107900792636155, + "step": 9265 + }, + { + "epoch": 1.717834631071561, + "grad_norm": 6.828125, + "learning_rate": 8.28216536892844e-06, + "loss": 2.8987, + "mean_token_accuracy": 0.47889740363719546, + "step": 9266 + }, + { + "epoch": 1.718020022246941, + "grad_norm": 6.5625, + "learning_rate": 8.281979977753059e-06, + "loss": 2.8734, + "mean_token_accuracy": 0.46486657192483816, + "step": 9267 + }, + { + "epoch": 1.718205413422321, + "grad_norm": 6.4453125, + "learning_rate": 8.28179458657768e-06, + "loss": 2.7657, + "mean_token_accuracy": 0.4826663097309597, + "step": 9268 + }, + { + "epoch": 1.7183908045977012, + "grad_norm": 6.125, + "learning_rate": 8.2816091954023e-06, + "loss": 3.0378, + "mean_token_accuracy": 0.4746412666996536, + "step": 9269 + }, + { + "epoch": 1.7185761957730812, + "grad_norm": 4.81640625, + "learning_rate": 8.281423804226919e-06, + "loss": 2.6302, + "mean_token_accuracy": 0.47150153217568946, + "step": 9270 + }, + { + "epoch": 1.7187615869484612, + "grad_norm": 5.65234375, + "learning_rate": 8.28123841305154e-06, + "loss": 3.159, + "mean_token_accuracy": 0.44659365666334805, + "step": 9271 + }, + { + "epoch": 1.7189469781238413, + "grad_norm": 6.3671875, + "learning_rate": 8.28105302187616e-06, + "loss": 3.0113, + "mean_token_accuracy": 0.46462303231151614, + "step": 9272 + }, + { + "epoch": 1.7191323692992213, + "grad_norm": 5.55859375, + "learning_rate": 8.28086763070078e-06, + "loss": 3.053, + "mean_token_accuracy": 0.44271364099577815, + "step": 9273 + }, + { + "epoch": 1.7193177604746013, + "grad_norm": 6.65625, + "learning_rate": 8.280682239525399e-06, + "loss": 2.2959, + "mean_token_accuracy": 0.5494971715901948, + "step": 9274 + }, + { + "epoch": 1.7195031516499815, + "grad_norm": 6.12109375, + "learning_rate": 8.28049684835002e-06, + "loss": 3.0604, + "mean_token_accuracy": 0.4432760663507109, + "step": 9275 + }, + { + "epoch": 1.7196885428253617, + "grad_norm": 5.67578125, + "learning_rate": 8.280311457174638e-06, + "loss": 2.6921, + "mean_token_accuracy": 0.46728845633955124, + "step": 9276 + }, + { + "epoch": 1.7198739340007414, + "grad_norm": 5.70703125, + "learning_rate": 8.280126065999259e-06, + "loss": 2.4873, + "mean_token_accuracy": 0.49040011725047633, + "step": 9277 + }, + { + "epoch": 1.7200593251761216, + "grad_norm": 6.65625, + "learning_rate": 8.27994067482388e-06, + "loss": 3.1706, + "mean_token_accuracy": 0.4393828067597355, + "step": 9278 + }, + { + "epoch": 1.7202447163515018, + "grad_norm": 5.50390625, + "learning_rate": 8.279755283648498e-06, + "loss": 3.0636, + "mean_token_accuracy": 0.43363711681855843, + "step": 9279 + }, + { + "epoch": 1.7204301075268817, + "grad_norm": 8.3359375, + "learning_rate": 8.279569892473119e-06, + "loss": 2.8496, + "mean_token_accuracy": 0.4640439932318105, + "step": 9280 + }, + { + "epoch": 1.7206154987022617, + "grad_norm": 5.3671875, + "learning_rate": 8.27938450129774e-06, + "loss": 2.7917, + "mean_token_accuracy": 0.48745724059293044, + "step": 9281 + }, + { + "epoch": 1.720800889877642, + "grad_norm": 5.9453125, + "learning_rate": 8.27919911012236e-06, + "loss": 3.078, + "mean_token_accuracy": 0.43871800892228224, + "step": 9282 + }, + { + "epoch": 1.7209862810530219, + "grad_norm": 7.66015625, + "learning_rate": 8.279013718946978e-06, + "loss": 2.5141, + "mean_token_accuracy": 0.49268094334507995, + "step": 9283 + }, + { + "epoch": 1.7211716722284018, + "grad_norm": 6.5078125, + "learning_rate": 8.278828327771599e-06, + "loss": 3.1947, + "mean_token_accuracy": 0.404654823476501, + "step": 9284 + }, + { + "epoch": 1.721357063403782, + "grad_norm": 5.54296875, + "learning_rate": 8.278642936596218e-06, + "loss": 2.763, + "mean_token_accuracy": 0.4711985688729875, + "step": 9285 + }, + { + "epoch": 1.721542454579162, + "grad_norm": 6.33203125, + "learning_rate": 8.278457545420838e-06, + "loss": 3.0974, + "mean_token_accuracy": 0.4415665147119768, + "step": 9286 + }, + { + "epoch": 1.721727845754542, + "grad_norm": 7.06640625, + "learning_rate": 8.278272154245459e-06, + "loss": 3.5288, + "mean_token_accuracy": 0.42699036058550516, + "step": 9287 + }, + { + "epoch": 1.7219132369299222, + "grad_norm": 5.703125, + "learning_rate": 8.27808676307008e-06, + "loss": 2.5371, + "mean_token_accuracy": 0.47725887086940616, + "step": 9288 + }, + { + "epoch": 1.7220986281053023, + "grad_norm": 5.546875, + "learning_rate": 8.277901371894698e-06, + "loss": 2.8843, + "mean_token_accuracy": 0.46002300834052345, + "step": 9289 + }, + { + "epoch": 1.722284019280682, + "grad_norm": 6.1953125, + "learning_rate": 8.277715980719318e-06, + "loss": 3.4144, + "mean_token_accuracy": 0.43328566714332856, + "step": 9290 + }, + { + "epoch": 1.7224694104560623, + "grad_norm": 6.87109375, + "learning_rate": 8.277530589543939e-06, + "loss": 2.6404, + "mean_token_accuracy": 0.5106971328773104, + "step": 9291 + }, + { + "epoch": 1.7226548016314425, + "grad_norm": 5.671875, + "learning_rate": 8.277345198368558e-06, + "loss": 3.1405, + "mean_token_accuracy": 0.44143749315218583, + "step": 9292 + }, + { + "epoch": 1.7228401928068224, + "grad_norm": 8.3828125, + "learning_rate": 8.277159807193178e-06, + "loss": 3.0832, + "mean_token_accuracy": 0.4482724870314182, + "step": 9293 + }, + { + "epoch": 1.7230255839822024, + "grad_norm": 7.25390625, + "learning_rate": 8.276974416017797e-06, + "loss": 3.4287, + "mean_token_accuracy": 0.4282844990548204, + "step": 9294 + }, + { + "epoch": 1.7232109751575826, + "grad_norm": 5.85546875, + "learning_rate": 8.276789024842418e-06, + "loss": 3.2222, + "mean_token_accuracy": 0.43010176754151047, + "step": 9295 + }, + { + "epoch": 1.7233963663329626, + "grad_norm": 5.75390625, + "learning_rate": 8.276603633667038e-06, + "loss": 2.6772, + "mean_token_accuracy": 0.48214686728048506, + "step": 9296 + }, + { + "epoch": 1.7235817575083425, + "grad_norm": 6.85546875, + "learning_rate": 8.276418242491659e-06, + "loss": 3.1786, + "mean_token_accuracy": 0.4303312027890761, + "step": 9297 + }, + { + "epoch": 1.7237671486837227, + "grad_norm": 5.8515625, + "learning_rate": 8.276232851316277e-06, + "loss": 3.4945, + "mean_token_accuracy": 0.43135917030567683, + "step": 9298 + }, + { + "epoch": 1.7239525398591027, + "grad_norm": 6.91796875, + "learning_rate": 8.276047460140898e-06, + "loss": 3.0121, + "mean_token_accuracy": 0.4461756373937677, + "step": 9299 + }, + { + "epoch": 1.7241379310344827, + "grad_norm": 5.82421875, + "learning_rate": 8.275862068965518e-06, + "loss": 3.5069, + "mean_token_accuracy": 0.392887383573243, + "step": 9300 + }, + { + "epoch": 1.7243233222098628, + "grad_norm": 6.4296875, + "learning_rate": 8.275676677790137e-06, + "loss": 3.561, + "mean_token_accuracy": 0.42029174264131286, + "step": 9301 + }, + { + "epoch": 1.7245087133852428, + "grad_norm": 6.6640625, + "learning_rate": 8.275491286614758e-06, + "loss": 2.8963, + "mean_token_accuracy": 0.4639070606812542, + "step": 9302 + }, + { + "epoch": 1.7246941045606228, + "grad_norm": 8.21875, + "learning_rate": 8.275305895439376e-06, + "loss": 2.147, + "mean_token_accuracy": 0.5434427260153402, + "step": 9303 + }, + { + "epoch": 1.724879495736003, + "grad_norm": 7.56640625, + "learning_rate": 8.275120504263999e-06, + "loss": 2.5683, + "mean_token_accuracy": 0.504697631335186, + "step": 9304 + }, + { + "epoch": 1.7250648869113832, + "grad_norm": 5.6875, + "learning_rate": 8.274935113088617e-06, + "loss": 2.6573, + "mean_token_accuracy": 0.5122749590834698, + "step": 9305 + }, + { + "epoch": 1.7252502780867631, + "grad_norm": 8.0234375, + "learning_rate": 8.274749721913238e-06, + "loss": 2.573, + "mean_token_accuracy": 0.5083012352238013, + "step": 9306 + }, + { + "epoch": 1.725435669262143, + "grad_norm": 7.0, + "learning_rate": 8.274564330737858e-06, + "loss": 2.6803, + "mean_token_accuracy": 0.4829891546575546, + "step": 9307 + }, + { + "epoch": 1.7256210604375233, + "grad_norm": 6.39453125, + "learning_rate": 8.274378939562477e-06, + "loss": 3.4856, + "mean_token_accuracy": 0.40768775872264934, + "step": 9308 + }, + { + "epoch": 1.7258064516129032, + "grad_norm": 7.34765625, + "learning_rate": 8.274193548387098e-06, + "loss": 2.4347, + "mean_token_accuracy": 0.47008642945336176, + "step": 9309 + }, + { + "epoch": 1.7259918427882832, + "grad_norm": 6.15234375, + "learning_rate": 8.274008157211716e-06, + "loss": 3.0024, + "mean_token_accuracy": 0.4743477718137561, + "step": 9310 + }, + { + "epoch": 1.7261772339636634, + "grad_norm": 6.35546875, + "learning_rate": 8.273822766036337e-06, + "loss": 3.0678, + "mean_token_accuracy": 0.46008492569002124, + "step": 9311 + }, + { + "epoch": 1.7263626251390434, + "grad_norm": 7.49609375, + "learning_rate": 8.273637374860957e-06, + "loss": 2.3818, + "mean_token_accuracy": 0.5042965627498002, + "step": 9312 + }, + { + "epoch": 1.7265480163144233, + "grad_norm": 5.57421875, + "learning_rate": 8.273451983685578e-06, + "loss": 3.0811, + "mean_token_accuracy": 0.44785276073619634, + "step": 9313 + }, + { + "epoch": 1.7267334074898035, + "grad_norm": 5.72265625, + "learning_rate": 8.273266592510197e-06, + "loss": 2.7298, + "mean_token_accuracy": 0.4797364495275982, + "step": 9314 + }, + { + "epoch": 1.7269187986651835, + "grad_norm": 6.66796875, + "learning_rate": 8.273081201334817e-06, + "loss": 3.0762, + "mean_token_accuracy": 0.4327558696147709, + "step": 9315 + }, + { + "epoch": 1.7271041898405635, + "grad_norm": 5.45703125, + "learning_rate": 8.272895810159438e-06, + "loss": 3.0138, + "mean_token_accuracy": 0.4311095827602017, + "step": 9316 + }, + { + "epoch": 1.7272895810159437, + "grad_norm": 5.61328125, + "learning_rate": 8.272710418984057e-06, + "loss": 3.0153, + "mean_token_accuracy": 0.4666947132239518, + "step": 9317 + }, + { + "epoch": 1.7274749721913238, + "grad_norm": 6.2421875, + "learning_rate": 8.272525027808677e-06, + "loss": 2.9025, + "mean_token_accuracy": 0.4673913043478261, + "step": 9318 + }, + { + "epoch": 1.7276603633667036, + "grad_norm": 6.2421875, + "learning_rate": 8.272339636633296e-06, + "loss": 3.3398, + "mean_token_accuracy": 0.43284201785343573, + "step": 9319 + }, + { + "epoch": 1.7278457545420838, + "grad_norm": 6.40625, + "learning_rate": 8.272154245457918e-06, + "loss": 2.6005, + "mean_token_accuracy": 0.4953247409653778, + "step": 9320 + }, + { + "epoch": 1.728031145717464, + "grad_norm": 5.98828125, + "learning_rate": 8.271968854282537e-06, + "loss": 2.806, + "mean_token_accuracy": 0.48205128205128206, + "step": 9321 + }, + { + "epoch": 1.728216536892844, + "grad_norm": 7.98046875, + "learning_rate": 8.271783463107157e-06, + "loss": 2.6799, + "mean_token_accuracy": 0.4909946082367787, + "step": 9322 + }, + { + "epoch": 1.728401928068224, + "grad_norm": 7.6953125, + "learning_rate": 8.271598071931776e-06, + "loss": 2.8031, + "mean_token_accuracy": 0.47549501474512007, + "step": 9323 + }, + { + "epoch": 1.728587319243604, + "grad_norm": 5.1015625, + "learning_rate": 8.271412680756397e-06, + "loss": 2.6223, + "mean_token_accuracy": 0.4805940871514642, + "step": 9324 + }, + { + "epoch": 1.728772710418984, + "grad_norm": 5.11328125, + "learning_rate": 8.271227289581017e-06, + "loss": 3.0015, + "mean_token_accuracy": 0.4448269267546376, + "step": 9325 + }, + { + "epoch": 1.728958101594364, + "grad_norm": 7.4140625, + "learning_rate": 8.271041898405636e-06, + "loss": 2.7509, + "mean_token_accuracy": 0.4804869913275517, + "step": 9326 + }, + { + "epoch": 1.7291434927697442, + "grad_norm": 8.875, + "learning_rate": 8.270856507230256e-06, + "loss": 3.0281, + "mean_token_accuracy": 0.45755128764731556, + "step": 9327 + }, + { + "epoch": 1.7293288839451242, + "grad_norm": 6.49609375, + "learning_rate": 8.270671116054877e-06, + "loss": 3.6008, + "mean_token_accuracy": 0.41527034030057003, + "step": 9328 + }, + { + "epoch": 1.7295142751205042, + "grad_norm": 5.87890625, + "learning_rate": 8.270485724879497e-06, + "loss": 3.1132, + "mean_token_accuracy": 0.450685117351784, + "step": 9329 + }, + { + "epoch": 1.7296996662958843, + "grad_norm": 7.5703125, + "learning_rate": 8.270300333704116e-06, + "loss": 2.4219, + "mean_token_accuracy": 0.5050994390617032, + "step": 9330 + }, + { + "epoch": 1.7298850574712645, + "grad_norm": 6.40234375, + "learning_rate": 8.270114942528737e-06, + "loss": 3.5794, + "mean_token_accuracy": 0.42763636363636365, + "step": 9331 + }, + { + "epoch": 1.7300704486466443, + "grad_norm": 6.27734375, + "learning_rate": 8.269929551353355e-06, + "loss": 2.9938, + "mean_token_accuracy": 0.46228981206726016, + "step": 9332 + }, + { + "epoch": 1.7302558398220245, + "grad_norm": 6.421875, + "learning_rate": 8.269744160177976e-06, + "loss": 2.9216, + "mean_token_accuracy": 0.47494587070832045, + "step": 9333 + }, + { + "epoch": 1.7304412309974047, + "grad_norm": 6.19140625, + "learning_rate": 8.269558769002596e-06, + "loss": 2.6744, + "mean_token_accuracy": 0.49249482401656314, + "step": 9334 + }, + { + "epoch": 1.7306266221727846, + "grad_norm": 6.2109375, + "learning_rate": 8.269373377827215e-06, + "loss": 2.5632, + "mean_token_accuracy": 0.5022277227722772, + "step": 9335 + }, + { + "epoch": 1.7308120133481646, + "grad_norm": 5.390625, + "learning_rate": 8.269187986651836e-06, + "loss": 2.5703, + "mean_token_accuracy": 0.49089861751152075, + "step": 9336 + }, + { + "epoch": 1.7309974045235448, + "grad_norm": 6.234375, + "learning_rate": 8.269002595476456e-06, + "loss": 2.9509, + "mean_token_accuracy": 0.46065318818040435, + "step": 9337 + }, + { + "epoch": 1.7311827956989247, + "grad_norm": 5.83203125, + "learning_rate": 8.268817204301077e-06, + "loss": 2.6467, + "mean_token_accuracy": 0.49814183957881697, + "step": 9338 + }, + { + "epoch": 1.7313681868743047, + "grad_norm": 5.98046875, + "learning_rate": 8.268631813125695e-06, + "loss": 2.6043, + "mean_token_accuracy": 0.48254378746914306, + "step": 9339 + }, + { + "epoch": 1.731553578049685, + "grad_norm": 6.2734375, + "learning_rate": 8.268446421950316e-06, + "loss": 3.3792, + "mean_token_accuracy": 0.42272832907273994, + "step": 9340 + }, + { + "epoch": 1.7317389692250649, + "grad_norm": 6.421875, + "learning_rate": 8.268261030774935e-06, + "loss": 2.4625, + "mean_token_accuracy": 0.5006131959774344, + "step": 9341 + }, + { + "epoch": 1.7319243604004448, + "grad_norm": 6.9375, + "learning_rate": 8.268075639599555e-06, + "loss": 3.8282, + "mean_token_accuracy": 0.41473178542834266, + "step": 9342 + }, + { + "epoch": 1.732109751575825, + "grad_norm": 7.76953125, + "learning_rate": 8.267890248424176e-06, + "loss": 3.033, + "mean_token_accuracy": 0.45601064616294545, + "step": 9343 + }, + { + "epoch": 1.732295142751205, + "grad_norm": 5.22265625, + "learning_rate": 8.267704857248796e-06, + "loss": 2.7972, + "mean_token_accuracy": 0.48488476504040706, + "step": 9344 + }, + { + "epoch": 1.732480533926585, + "grad_norm": 8.3203125, + "learning_rate": 8.267519466073417e-06, + "loss": 2.5188, + "mean_token_accuracy": 0.48247223563495895, + "step": 9345 + }, + { + "epoch": 1.7326659251019652, + "grad_norm": 6.68359375, + "learning_rate": 8.267334074898036e-06, + "loss": 2.8236, + "mean_token_accuracy": 0.48553459119496856, + "step": 9346 + }, + { + "epoch": 1.7328513162773453, + "grad_norm": 7.9140625, + "learning_rate": 8.267148683722656e-06, + "loss": 2.8859, + "mean_token_accuracy": 0.466206616602395, + "step": 9347 + }, + { + "epoch": 1.733036707452725, + "grad_norm": 7.421875, + "learning_rate": 8.266963292547275e-06, + "loss": 3.2492, + "mean_token_accuracy": 0.43426025917926564, + "step": 9348 + }, + { + "epoch": 1.7332220986281053, + "grad_norm": 10.828125, + "learning_rate": 8.266777901371895e-06, + "loss": 2.4428, + "mean_token_accuracy": 0.5040750251200179, + "step": 9349 + }, + { + "epoch": 1.7334074898034855, + "grad_norm": 6.24609375, + "learning_rate": 8.266592510196516e-06, + "loss": 3.376, + "mean_token_accuracy": 0.42036027494666983, + "step": 9350 + }, + { + "epoch": 1.7335928809788654, + "grad_norm": 9.1953125, + "learning_rate": 8.266407119021135e-06, + "loss": 2.4273, + "mean_token_accuracy": 0.5161923454367027, + "step": 9351 + }, + { + "epoch": 1.7337782721542454, + "grad_norm": 7.5625, + "learning_rate": 8.266221727845755e-06, + "loss": 2.684, + "mean_token_accuracy": 0.47750346100599905, + "step": 9352 + }, + { + "epoch": 1.7339636633296256, + "grad_norm": 6.01953125, + "learning_rate": 8.266036336670376e-06, + "loss": 3.2636, + "mean_token_accuracy": 0.4480085538626036, + "step": 9353 + }, + { + "epoch": 1.7341490545050056, + "grad_norm": 8.234375, + "learning_rate": 8.265850945494996e-06, + "loss": 2.5755, + "mean_token_accuracy": 0.47633136094674555, + "step": 9354 + }, + { + "epoch": 1.7343344456803855, + "grad_norm": 7.0703125, + "learning_rate": 8.265665554319615e-06, + "loss": 2.6987, + "mean_token_accuracy": 0.5024768195097168, + "step": 9355 + }, + { + "epoch": 1.7345198368557657, + "grad_norm": 7.78515625, + "learning_rate": 8.265480163144235e-06, + "loss": 3.0672, + "mean_token_accuracy": 0.4562201753412496, + "step": 9356 + }, + { + "epoch": 1.7347052280311457, + "grad_norm": 6.421875, + "learning_rate": 8.265294771968854e-06, + "loss": 3.6915, + "mean_token_accuracy": 0.41029923451635353, + "step": 9357 + }, + { + "epoch": 1.7348906192065257, + "grad_norm": 8.703125, + "learning_rate": 8.265109380793475e-06, + "loss": 2.9138, + "mean_token_accuracy": 0.46609868654597086, + "step": 9358 + }, + { + "epoch": 1.7350760103819058, + "grad_norm": 10.8046875, + "learning_rate": 8.264923989618095e-06, + "loss": 4.3301, + "mean_token_accuracy": 0.4281911353653667, + "step": 9359 + }, + { + "epoch": 1.735261401557286, + "grad_norm": 12.5703125, + "learning_rate": 8.264738598442716e-06, + "loss": 2.875, + "mean_token_accuracy": 0.45106996874248617, + "step": 9360 + }, + { + "epoch": 1.7354467927326658, + "grad_norm": 9.328125, + "learning_rate": 8.264553207267334e-06, + "loss": 3.1992, + "mean_token_accuracy": 0.44613396938978334, + "step": 9361 + }, + { + "epoch": 1.735632183908046, + "grad_norm": 9.03125, + "learning_rate": 8.264367816091955e-06, + "loss": 3.6802, + "mean_token_accuracy": 0.4004074596458235, + "step": 9362 + }, + { + "epoch": 1.7358175750834262, + "grad_norm": 10.546875, + "learning_rate": 8.264182424916575e-06, + "loss": 3.0055, + "mean_token_accuracy": 0.4527725962353739, + "step": 9363 + }, + { + "epoch": 1.7360029662588061, + "grad_norm": 10.3984375, + "learning_rate": 8.263997033741194e-06, + "loss": 3.1479, + "mean_token_accuracy": 0.4545003309066843, + "step": 9364 + }, + { + "epoch": 1.736188357434186, + "grad_norm": 9.8515625, + "learning_rate": 8.263811642565815e-06, + "loss": 2.4416, + "mean_token_accuracy": 0.501778093883357, + "step": 9365 + }, + { + "epoch": 1.7363737486095663, + "grad_norm": 11.125, + "learning_rate": 8.263626251390433e-06, + "loss": 2.6524, + "mean_token_accuracy": 0.47113478691774036, + "step": 9366 + }, + { + "epoch": 1.7365591397849462, + "grad_norm": 9.3046875, + "learning_rate": 8.263440860215054e-06, + "loss": 2.723, + "mean_token_accuracy": 0.45169043422528443, + "step": 9367 + }, + { + "epoch": 1.7367445309603262, + "grad_norm": 7.16015625, + "learning_rate": 8.263255469039674e-06, + "loss": 2.5012, + "mean_token_accuracy": 0.5, + "step": 9368 + }, + { + "epoch": 1.7369299221357064, + "grad_norm": 6.4609375, + "learning_rate": 8.263070077864295e-06, + "loss": 3.229, + "mean_token_accuracy": 0.4164645239539115, + "step": 9369 + }, + { + "epoch": 1.7371153133110864, + "grad_norm": 7.7421875, + "learning_rate": 8.262884686688914e-06, + "loss": 2.4474, + "mean_token_accuracy": 0.5226224783861672, + "step": 9370 + }, + { + "epoch": 1.7373007044864663, + "grad_norm": 6.73046875, + "learning_rate": 8.262699295513534e-06, + "loss": 3.4926, + "mean_token_accuracy": 0.42429696287964, + "step": 9371 + }, + { + "epoch": 1.7374860956618465, + "grad_norm": 5.8125, + "learning_rate": 8.262513904338155e-06, + "loss": 3.2431, + "mean_token_accuracy": 0.4444765342960289, + "step": 9372 + }, + { + "epoch": 1.7376714868372265, + "grad_norm": 5.34765625, + "learning_rate": 8.262328513162774e-06, + "loss": 2.8934, + "mean_token_accuracy": 0.45482411326813604, + "step": 9373 + }, + { + "epoch": 1.7378568780126065, + "grad_norm": 6.046875, + "learning_rate": 8.262143121987394e-06, + "loss": 2.8084, + "mean_token_accuracy": 0.47649424490361947, + "step": 9374 + }, + { + "epoch": 1.7380422691879867, + "grad_norm": 5.515625, + "learning_rate": 8.261957730812013e-06, + "loss": 3.1988, + "mean_token_accuracy": 0.4539656771799629, + "step": 9375 + }, + { + "epoch": 1.7382276603633668, + "grad_norm": 5.3515625, + "learning_rate": 8.261772339636635e-06, + "loss": 2.2626, + "mean_token_accuracy": 0.5365316362746297, + "step": 9376 + }, + { + "epoch": 1.7384130515387466, + "grad_norm": 6.15234375, + "learning_rate": 8.261586948461254e-06, + "loss": 3.3794, + "mean_token_accuracy": 0.4443375040102663, + "step": 9377 + }, + { + "epoch": 1.7385984427141268, + "grad_norm": 5.23828125, + "learning_rate": 8.261401557285874e-06, + "loss": 2.6585, + "mean_token_accuracy": 0.48484848484848486, + "step": 9378 + }, + { + "epoch": 1.738783833889507, + "grad_norm": 7.20703125, + "learning_rate": 8.261216166110493e-06, + "loss": 2.7338, + "mean_token_accuracy": 0.48241559485530544, + "step": 9379 + }, + { + "epoch": 1.738969225064887, + "grad_norm": 7.41796875, + "learning_rate": 8.261030774935114e-06, + "loss": 2.9851, + "mean_token_accuracy": 0.47561144120491916, + "step": 9380 + }, + { + "epoch": 1.739154616240267, + "grad_norm": 6.56640625, + "learning_rate": 8.260845383759734e-06, + "loss": 3.702, + "mean_token_accuracy": 0.410084985835694, + "step": 9381 + }, + { + "epoch": 1.739340007415647, + "grad_norm": 6.36328125, + "learning_rate": 8.260659992584353e-06, + "loss": 2.4, + "mean_token_accuracy": 0.5186509355261589, + "step": 9382 + }, + { + "epoch": 1.739525398591027, + "grad_norm": 5.2578125, + "learning_rate": 8.260474601408973e-06, + "loss": 2.4216, + "mean_token_accuracy": 0.49484978540772534, + "step": 9383 + }, + { + "epoch": 1.739710789766407, + "grad_norm": 6.88671875, + "learning_rate": 8.260289210233594e-06, + "loss": 2.8913, + "mean_token_accuracy": 0.45968475295544103, + "step": 9384 + }, + { + "epoch": 1.7398961809417872, + "grad_norm": 6.82421875, + "learning_rate": 8.260103819058214e-06, + "loss": 3.2902, + "mean_token_accuracy": 0.4588020674977197, + "step": 9385 + }, + { + "epoch": 1.7400815721171672, + "grad_norm": 6.99609375, + "learning_rate": 8.259918427882833e-06, + "loss": 2.6656, + "mean_token_accuracy": 0.4816723940435281, + "step": 9386 + }, + { + "epoch": 1.7402669632925472, + "grad_norm": 6.5625, + "learning_rate": 8.259733036707454e-06, + "loss": 3.0478, + "mean_token_accuracy": 0.44549707602339184, + "step": 9387 + }, + { + "epoch": 1.7404523544679273, + "grad_norm": 8.0, + "learning_rate": 8.259547645532074e-06, + "loss": 2.6733, + "mean_token_accuracy": 0.4737997256515775, + "step": 9388 + }, + { + "epoch": 1.7406377456433075, + "grad_norm": 6.671875, + "learning_rate": 8.259362254356693e-06, + "loss": 3.4276, + "mean_token_accuracy": 0.4253090909090909, + "step": 9389 + }, + { + "epoch": 1.7408231368186873, + "grad_norm": 5.7109375, + "learning_rate": 8.259176863181313e-06, + "loss": 2.8682, + "mean_token_accuracy": 0.47598979705001665, + "step": 9390 + }, + { + "epoch": 1.7410085279940675, + "grad_norm": 5.90625, + "learning_rate": 8.258991472005932e-06, + "loss": 3.0569, + "mean_token_accuracy": 0.43924107815088087, + "step": 9391 + }, + { + "epoch": 1.7411939191694477, + "grad_norm": 6.19921875, + "learning_rate": 8.258806080830554e-06, + "loss": 2.3682, + "mean_token_accuracy": 0.518824027072758, + "step": 9392 + }, + { + "epoch": 1.7413793103448276, + "grad_norm": 5.8515625, + "learning_rate": 8.258620689655173e-06, + "loss": 3.0434, + "mean_token_accuracy": 0.4694537762846833, + "step": 9393 + }, + { + "epoch": 1.7415647015202076, + "grad_norm": 6.68359375, + "learning_rate": 8.258435298479794e-06, + "loss": 3.3234, + "mean_token_accuracy": 0.4292787217283673, + "step": 9394 + }, + { + "epoch": 1.7417500926955878, + "grad_norm": 6.98828125, + "learning_rate": 8.258249907304412e-06, + "loss": 3.0662, + "mean_token_accuracy": 0.44380979580179925, + "step": 9395 + }, + { + "epoch": 1.7419354838709677, + "grad_norm": 5.86328125, + "learning_rate": 8.258064516129033e-06, + "loss": 3.1963, + "mean_token_accuracy": 0.45581773799837266, + "step": 9396 + }, + { + "epoch": 1.7421208750463477, + "grad_norm": 5.85546875, + "learning_rate": 8.257879124953653e-06, + "loss": 2.7508, + "mean_token_accuracy": 0.48428422463876064, + "step": 9397 + }, + { + "epoch": 1.742306266221728, + "grad_norm": 6.953125, + "learning_rate": 8.257693733778272e-06, + "loss": 2.4479, + "mean_token_accuracy": 0.51224381155177, + "step": 9398 + }, + { + "epoch": 1.7424916573971079, + "grad_norm": 6.70703125, + "learning_rate": 8.257508342602893e-06, + "loss": 2.9268, + "mean_token_accuracy": 0.4686219262295082, + "step": 9399 + }, + { + "epoch": 1.7426770485724878, + "grad_norm": 5.45703125, + "learning_rate": 8.257322951427513e-06, + "loss": 2.4966, + "mean_token_accuracy": 0.5131662638341178, + "step": 9400 + }, + { + "epoch": 1.742862439747868, + "grad_norm": 5.56640625, + "learning_rate": 8.257137560252134e-06, + "loss": 2.8623, + "mean_token_accuracy": 0.46940592799898234, + "step": 9401 + }, + { + "epoch": 1.743047830923248, + "grad_norm": 6.46484375, + "learning_rate": 8.256952169076753e-06, + "loss": 2.8002, + "mean_token_accuracy": 0.4510169228380686, + "step": 9402 + }, + { + "epoch": 1.743233222098628, + "grad_norm": 6.3203125, + "learning_rate": 8.256766777901373e-06, + "loss": 2.9303, + "mean_token_accuracy": 0.4623583378305451, + "step": 9403 + }, + { + "epoch": 1.7434186132740082, + "grad_norm": 6.703125, + "learning_rate": 8.256581386725992e-06, + "loss": 2.7939, + "mean_token_accuracy": 0.4607034899697719, + "step": 9404 + }, + { + "epoch": 1.7436040044493883, + "grad_norm": 6.3984375, + "learning_rate": 8.256395995550612e-06, + "loss": 3.2591, + "mean_token_accuracy": 0.4199567333693889, + "step": 9405 + }, + { + "epoch": 1.7437893956247683, + "grad_norm": 7.890625, + "learning_rate": 8.256210604375233e-06, + "loss": 2.5303, + "mean_token_accuracy": 0.4744815596672048, + "step": 9406 + }, + { + "epoch": 1.7439747868001483, + "grad_norm": 6.953125, + "learning_rate": 8.256025213199852e-06, + "loss": 3.1461, + "mean_token_accuracy": 0.4511627906976744, + "step": 9407 + }, + { + "epoch": 1.7441601779755285, + "grad_norm": 5.5703125, + "learning_rate": 8.255839822024472e-06, + "loss": 2.624, + "mean_token_accuracy": 0.5181224004753416, + "step": 9408 + }, + { + "epoch": 1.7443455691509084, + "grad_norm": 5.43359375, + "learning_rate": 8.255654430849093e-06, + "loss": 2.9124, + "mean_token_accuracy": 0.45454545454545453, + "step": 9409 + }, + { + "epoch": 1.7445309603262884, + "grad_norm": 7.14453125, + "learning_rate": 8.255469039673713e-06, + "loss": 2.8364, + "mean_token_accuracy": 0.45881030253475064, + "step": 9410 + }, + { + "epoch": 1.7447163515016686, + "grad_norm": 6.3671875, + "learning_rate": 8.255283648498332e-06, + "loss": 3.0072, + "mean_token_accuracy": 0.44368697628177817, + "step": 9411 + }, + { + "epoch": 1.7449017426770486, + "grad_norm": 5.43359375, + "learning_rate": 8.255098257322952e-06, + "loss": 2.7304, + "mean_token_accuracy": 0.4976258309591643, + "step": 9412 + }, + { + "epoch": 1.7450871338524285, + "grad_norm": 6.12109375, + "learning_rate": 8.254912866147571e-06, + "loss": 3.0609, + "mean_token_accuracy": 0.4456648009293484, + "step": 9413 + }, + { + "epoch": 1.7452725250278087, + "grad_norm": 6.1015625, + "learning_rate": 8.254727474972192e-06, + "loss": 2.3035, + "mean_token_accuracy": 0.5139121689574254, + "step": 9414 + }, + { + "epoch": 1.7454579162031887, + "grad_norm": 9.28125, + "learning_rate": 8.254542083796812e-06, + "loss": 2.9757, + "mean_token_accuracy": 0.445869907720009, + "step": 9415 + }, + { + "epoch": 1.7456433073785687, + "grad_norm": 8.8203125, + "learning_rate": 8.254356692621431e-06, + "loss": 2.032, + "mean_token_accuracy": 0.589329268292683, + "step": 9416 + }, + { + "epoch": 1.7458286985539488, + "grad_norm": 5.59375, + "learning_rate": 8.254171301446051e-06, + "loss": 2.6346, + "mean_token_accuracy": 0.4829240756421112, + "step": 9417 + }, + { + "epoch": 1.746014089729329, + "grad_norm": 6.07421875, + "learning_rate": 8.253985910270672e-06, + "loss": 2.9825, + "mean_token_accuracy": 0.46016959273856445, + "step": 9418 + }, + { + "epoch": 1.7461994809047088, + "grad_norm": 10.15625, + "learning_rate": 8.253800519095292e-06, + "loss": 2.9424, + "mean_token_accuracy": 0.4825133372851215, + "step": 9419 + }, + { + "epoch": 1.746384872080089, + "grad_norm": 9.0625, + "learning_rate": 8.253615127919911e-06, + "loss": 2.7412, + "mean_token_accuracy": 0.46885107579805935, + "step": 9420 + }, + { + "epoch": 1.7465702632554692, + "grad_norm": 5.51953125, + "learning_rate": 8.253429736744532e-06, + "loss": 3.4558, + "mean_token_accuracy": 0.4116436342252279, + "step": 9421 + }, + { + "epoch": 1.7467556544308491, + "grad_norm": 9.6796875, + "learning_rate": 8.25324434556915e-06, + "loss": 3.088, + "mean_token_accuracy": 0.44594594594594594, + "step": 9422 + }, + { + "epoch": 1.746941045606229, + "grad_norm": 15.5703125, + "learning_rate": 8.253058954393771e-06, + "loss": 2.7433, + "mean_token_accuracy": 0.4756082188327241, + "step": 9423 + }, + { + "epoch": 1.7471264367816093, + "grad_norm": 8.8515625, + "learning_rate": 8.252873563218391e-06, + "loss": 2.6124, + "mean_token_accuracy": 0.4821522034962839, + "step": 9424 + }, + { + "epoch": 1.7473118279569892, + "grad_norm": 7.359375, + "learning_rate": 8.252688172043012e-06, + "loss": 2.5557, + "mean_token_accuracy": 0.527177089421391, + "step": 9425 + }, + { + "epoch": 1.7474972191323692, + "grad_norm": 13.25, + "learning_rate": 8.252502780867632e-06, + "loss": 2.844, + "mean_token_accuracy": 0.4844280860702152, + "step": 9426 + }, + { + "epoch": 1.7476826103077494, + "grad_norm": 9.578125, + "learning_rate": 8.252317389692251e-06, + "loss": 2.8621, + "mean_token_accuracy": 0.48211169284467714, + "step": 9427 + }, + { + "epoch": 1.7478680014831294, + "grad_norm": 5.64453125, + "learning_rate": 8.252131998516872e-06, + "loss": 2.8808, + "mean_token_accuracy": 0.48101124846864907, + "step": 9428 + }, + { + "epoch": 1.7480533926585093, + "grad_norm": 5.56640625, + "learning_rate": 8.25194660734149e-06, + "loss": 2.4273, + "mean_token_accuracy": 0.5336857280153772, + "step": 9429 + }, + { + "epoch": 1.7482387838338895, + "grad_norm": 7.3046875, + "learning_rate": 8.251761216166111e-06, + "loss": 2.8881, + "mean_token_accuracy": 0.4724942570426792, + "step": 9430 + }, + { + "epoch": 1.7484241750092697, + "grad_norm": 7.5234375, + "learning_rate": 8.251575824990732e-06, + "loss": 2.225, + "mean_token_accuracy": 0.5436857107448259, + "step": 9431 + }, + { + "epoch": 1.7486095661846495, + "grad_norm": 7.1875, + "learning_rate": 8.25139043381535e-06, + "loss": 3.5422, + "mean_token_accuracy": 0.39936367409046897, + "step": 9432 + }, + { + "epoch": 1.7487949573600297, + "grad_norm": 7.359375, + "learning_rate": 8.25120504263997e-06, + "loss": 2.9385, + "mean_token_accuracy": 0.4784204088764634, + "step": 9433 + }, + { + "epoch": 1.7489803485354098, + "grad_norm": 12.6171875, + "learning_rate": 8.251019651464591e-06, + "loss": 2.5904, + "mean_token_accuracy": 0.50630068621335, + "step": 9434 + }, + { + "epoch": 1.7491657397107898, + "grad_norm": 7.8125, + "learning_rate": 8.250834260289212e-06, + "loss": 3.3426, + "mean_token_accuracy": 0.41955427749820273, + "step": 9435 + }, + { + "epoch": 1.7493511308861698, + "grad_norm": 8.3828125, + "learning_rate": 8.25064886911383e-06, + "loss": 3.2227, + "mean_token_accuracy": 0.4277071051815079, + "step": 9436 + }, + { + "epoch": 1.74953652206155, + "grad_norm": 7.85546875, + "learning_rate": 8.250463477938451e-06, + "loss": 3.2273, + "mean_token_accuracy": 0.4475897192940898, + "step": 9437 + }, + { + "epoch": 1.74972191323693, + "grad_norm": 8.375, + "learning_rate": 8.25027808676307e-06, + "loss": 2.6869, + "mean_token_accuracy": 0.45848771785371206, + "step": 9438 + }, + { + "epoch": 1.74990730441231, + "grad_norm": 7.9375, + "learning_rate": 8.25009269558769e-06, + "loss": 2.5631, + "mean_token_accuracy": 0.49118671858987495, + "step": 9439 + }, + { + "epoch": 1.75009269558769, + "grad_norm": 8.0234375, + "learning_rate": 8.249907304412311e-06, + "loss": 2.2374, + "mean_token_accuracy": 0.55092655214392, + "step": 9440 + }, + { + "epoch": 1.75027808676307, + "grad_norm": 8.515625, + "learning_rate": 8.249721913236931e-06, + "loss": 3.1711, + "mean_token_accuracy": 0.43751735628991945, + "step": 9441 + }, + { + "epoch": 1.75046347793845, + "grad_norm": 7.52734375, + "learning_rate": 8.24953652206155e-06, + "loss": 2.5655, + "mean_token_accuracy": 0.4840611091062192, + "step": 9442 + }, + { + "epoch": 1.7506488691138302, + "grad_norm": 6.43359375, + "learning_rate": 8.24935113088617e-06, + "loss": 3.2653, + "mean_token_accuracy": 0.42957835116425425, + "step": 9443 + }, + { + "epoch": 1.7508342602892102, + "grad_norm": 7.1796875, + "learning_rate": 8.249165739710791e-06, + "loss": 2.6631, + "mean_token_accuracy": 0.495279307631786, + "step": 9444 + }, + { + "epoch": 1.7510196514645902, + "grad_norm": 8.4609375, + "learning_rate": 8.24898034853541e-06, + "loss": 2.1668, + "mean_token_accuracy": 0.5481319880010909, + "step": 9445 + }, + { + "epoch": 1.7512050426399703, + "grad_norm": 6.0859375, + "learning_rate": 8.24879495736003e-06, + "loss": 2.7202, + "mean_token_accuracy": 0.4739385065885798, + "step": 9446 + }, + { + "epoch": 1.7513904338153505, + "grad_norm": 7.90234375, + "learning_rate": 8.24860956618465e-06, + "loss": 2.8585, + "mean_token_accuracy": 0.4541351549158331, + "step": 9447 + }, + { + "epoch": 1.7515758249907303, + "grad_norm": 11.0703125, + "learning_rate": 8.24842417500927e-06, + "loss": 2.5817, + "mean_token_accuracy": 0.490241891148983, + "step": 9448 + }, + { + "epoch": 1.7517612161661105, + "grad_norm": 7.12890625, + "learning_rate": 8.24823878383389e-06, + "loss": 2.66, + "mean_token_accuracy": 0.4823131344201784, + "step": 9449 + }, + { + "epoch": 1.7519466073414907, + "grad_norm": 8.1953125, + "learning_rate": 8.24805339265851e-06, + "loss": 3.0404, + "mean_token_accuracy": 0.4599828803766317, + "step": 9450 + }, + { + "epoch": 1.7521319985168706, + "grad_norm": 8.390625, + "learning_rate": 8.24786800148313e-06, + "loss": 2.2881, + "mean_token_accuracy": 0.5435881238155401, + "step": 9451 + }, + { + "epoch": 1.7523173896922506, + "grad_norm": 6.16015625, + "learning_rate": 8.24768261030775e-06, + "loss": 2.6605, + "mean_token_accuracy": 0.47557807222655235, + "step": 9452 + }, + { + "epoch": 1.7525027808676308, + "grad_norm": 9.1796875, + "learning_rate": 8.24749721913237e-06, + "loss": 2.1359, + "mean_token_accuracy": 0.5668958223162348, + "step": 9453 + }, + { + "epoch": 1.7526881720430108, + "grad_norm": 7.67578125, + "learning_rate": 8.24731182795699e-06, + "loss": 2.1717, + "mean_token_accuracy": 0.5342292089249493, + "step": 9454 + }, + { + "epoch": 1.7528735632183907, + "grad_norm": 6.82421875, + "learning_rate": 8.24712643678161e-06, + "loss": 3.2054, + "mean_token_accuracy": 0.45174696013544713, + "step": 9455 + }, + { + "epoch": 1.753058954393771, + "grad_norm": 6.44140625, + "learning_rate": 8.246941045606229e-06, + "loss": 3.1688, + "mean_token_accuracy": 0.44142550233792494, + "step": 9456 + }, + { + "epoch": 1.7532443455691509, + "grad_norm": 7.41015625, + "learning_rate": 8.24675565443085e-06, + "loss": 2.6262, + "mean_token_accuracy": 0.5370440970898462, + "step": 9457 + }, + { + "epoch": 1.7534297367445308, + "grad_norm": 5.8828125, + "learning_rate": 8.24657026325547e-06, + "loss": 2.5084, + "mean_token_accuracy": 0.485688629475811, + "step": 9458 + }, + { + "epoch": 1.753615127919911, + "grad_norm": 7.0859375, + "learning_rate": 8.24638487208009e-06, + "loss": 2.6368, + "mean_token_accuracy": 0.4763805721889554, + "step": 9459 + }, + { + "epoch": 1.7538005190952912, + "grad_norm": 6.09375, + "learning_rate": 8.246199480904709e-06, + "loss": 3.6168, + "mean_token_accuracy": 0.4252432506509524, + "step": 9460 + }, + { + "epoch": 1.753985910270671, + "grad_norm": 9.9296875, + "learning_rate": 8.24601408972933e-06, + "loss": 2.8573, + "mean_token_accuracy": 0.4588820906356943, + "step": 9461 + }, + { + "epoch": 1.7541713014460512, + "grad_norm": 5.8515625, + "learning_rate": 8.24582869855395e-06, + "loss": 2.7066, + "mean_token_accuracy": 0.504407680231856, + "step": 9462 + }, + { + "epoch": 1.7543566926214313, + "grad_norm": 6.125, + "learning_rate": 8.245643307378569e-06, + "loss": 3.3165, + "mean_token_accuracy": 0.44382801664355065, + "step": 9463 + }, + { + "epoch": 1.7545420837968113, + "grad_norm": 9.1171875, + "learning_rate": 8.245457916203189e-06, + "loss": 2.201, + "mean_token_accuracy": 0.5501640085963126, + "step": 9464 + }, + { + "epoch": 1.7547274749721913, + "grad_norm": 6.59765625, + "learning_rate": 8.24527252502781e-06, + "loss": 2.3556, + "mean_token_accuracy": 0.5039231881065456, + "step": 9465 + }, + { + "epoch": 1.7549128661475715, + "grad_norm": 6.44921875, + "learning_rate": 8.24508713385243e-06, + "loss": 2.7625, + "mean_token_accuracy": 0.47192533493903355, + "step": 9466 + }, + { + "epoch": 1.7550982573229514, + "grad_norm": 5.96875, + "learning_rate": 8.244901742677049e-06, + "loss": 2.5207, + "mean_token_accuracy": 0.49774947853770996, + "step": 9467 + }, + { + "epoch": 1.7552836484983314, + "grad_norm": 7.03125, + "learning_rate": 8.24471635150167e-06, + "loss": 2.39, + "mean_token_accuracy": 0.5091093117408907, + "step": 9468 + }, + { + "epoch": 1.7554690396737116, + "grad_norm": 6.89453125, + "learning_rate": 8.24453096032629e-06, + "loss": 2.6462, + "mean_token_accuracy": 0.47351524879614765, + "step": 9469 + }, + { + "epoch": 1.7556544308490916, + "grad_norm": 7.01953125, + "learning_rate": 8.244345569150909e-06, + "loss": 2.6854, + "mean_token_accuracy": 0.46433941997851774, + "step": 9470 + }, + { + "epoch": 1.7558398220244715, + "grad_norm": 8.984375, + "learning_rate": 8.24416017797553e-06, + "loss": 2.6181, + "mean_token_accuracy": 0.4770395074397127, + "step": 9471 + }, + { + "epoch": 1.7560252131998517, + "grad_norm": 6.39453125, + "learning_rate": 8.243974786800148e-06, + "loss": 3.6557, + "mean_token_accuracy": 0.4180726800778715, + "step": 9472 + }, + { + "epoch": 1.7562106043752317, + "grad_norm": 5.83984375, + "learning_rate": 8.24378939562477e-06, + "loss": 2.5307, + "mean_token_accuracy": 0.5004932587964486, + "step": 9473 + }, + { + "epoch": 1.7563959955506117, + "grad_norm": 6.84375, + "learning_rate": 8.243604004449389e-06, + "loss": 3.007, + "mean_token_accuracy": 0.4706298655343241, + "step": 9474 + }, + { + "epoch": 1.7565813867259918, + "grad_norm": 5.47265625, + "learning_rate": 8.24341861327401e-06, + "loss": 2.6142, + "mean_token_accuracy": 0.49333715432010317, + "step": 9475 + }, + { + "epoch": 1.756766777901372, + "grad_norm": 7.234375, + "learning_rate": 8.243233222098628e-06, + "loss": 2.9798, + "mean_token_accuracy": 0.4787966252220249, + "step": 9476 + }, + { + "epoch": 1.756952169076752, + "grad_norm": 6.96875, + "learning_rate": 8.243047830923249e-06, + "loss": 2.8049, + "mean_token_accuracy": 0.46520982822352686, + "step": 9477 + }, + { + "epoch": 1.757137560252132, + "grad_norm": 6.046875, + "learning_rate": 8.24286243974787e-06, + "loss": 2.6237, + "mean_token_accuracy": 0.4904397705544933, + "step": 9478 + }, + { + "epoch": 1.7573229514275122, + "grad_norm": 7.296875, + "learning_rate": 8.242677048572488e-06, + "loss": 3.0178, + "mean_token_accuracy": 0.4608788853161844, + "step": 9479 + }, + { + "epoch": 1.7575083426028921, + "grad_norm": 6.8203125, + "learning_rate": 8.242491657397109e-06, + "loss": 2.973, + "mean_token_accuracy": 0.4524380495603517, + "step": 9480 + }, + { + "epoch": 1.757693733778272, + "grad_norm": 6.0546875, + "learning_rate": 8.242306266221729e-06, + "loss": 2.5458, + "mean_token_accuracy": 0.49889494622071606, + "step": 9481 + }, + { + "epoch": 1.7578791249536523, + "grad_norm": 7.26953125, + "learning_rate": 8.24212087504635e-06, + "loss": 2.9402, + "mean_token_accuracy": 0.4562556663644606, + "step": 9482 + }, + { + "epoch": 1.7580645161290323, + "grad_norm": 6.66796875, + "learning_rate": 8.241935483870968e-06, + "loss": 2.5337, + "mean_token_accuracy": 0.49869420702754036, + "step": 9483 + }, + { + "epoch": 1.7582499073044122, + "grad_norm": 5.35546875, + "learning_rate": 8.241750092695589e-06, + "loss": 3.3672, + "mean_token_accuracy": 0.4471954940154893, + "step": 9484 + }, + { + "epoch": 1.7584352984797924, + "grad_norm": 5.9296875, + "learning_rate": 8.241564701520208e-06, + "loss": 3.193, + "mean_token_accuracy": 0.4397254952738573, + "step": 9485 + }, + { + "epoch": 1.7586206896551724, + "grad_norm": 23.21875, + "learning_rate": 8.241379310344828e-06, + "loss": 3.94, + "mean_token_accuracy": 0.47316730971588916, + "step": 9486 + }, + { + "epoch": 1.7588060808305523, + "grad_norm": 7.296875, + "learning_rate": 8.241193919169449e-06, + "loss": 3.1978, + "mean_token_accuracy": 0.43987014903349564, + "step": 9487 + }, + { + "epoch": 1.7589914720059325, + "grad_norm": 6.109375, + "learning_rate": 8.241008527994067e-06, + "loss": 2.9465, + "mean_token_accuracy": 0.45274467013597447, + "step": 9488 + }, + { + "epoch": 1.7591768631813127, + "grad_norm": 5.69921875, + "learning_rate": 8.240823136818688e-06, + "loss": 2.9341, + "mean_token_accuracy": 0.48116646415552855, + "step": 9489 + }, + { + "epoch": 1.7593622543566925, + "grad_norm": 9.1875, + "learning_rate": 8.240637745643308e-06, + "loss": 3.4204, + "mean_token_accuracy": 0.42246575342465753, + "step": 9490 + }, + { + "epoch": 1.7595476455320727, + "grad_norm": 5.99609375, + "learning_rate": 8.240452354467929e-06, + "loss": 3.1232, + "mean_token_accuracy": 0.4532803180914513, + "step": 9491 + }, + { + "epoch": 1.7597330367074528, + "grad_norm": 6.5859375, + "learning_rate": 8.240266963292548e-06, + "loss": 2.7409, + "mean_token_accuracy": 0.47150370290866156, + "step": 9492 + }, + { + "epoch": 1.7599184278828328, + "grad_norm": 7.81640625, + "learning_rate": 8.240081572117168e-06, + "loss": 4.1064, + "mean_token_accuracy": 0.3837837837837838, + "step": 9493 + }, + { + "epoch": 1.7601038190582128, + "grad_norm": 6.7265625, + "learning_rate": 8.239896180941787e-06, + "loss": 3.4168, + "mean_token_accuracy": 0.4287435974691172, + "step": 9494 + }, + { + "epoch": 1.760289210233593, + "grad_norm": 7.7734375, + "learning_rate": 8.239710789766407e-06, + "loss": 2.7165, + "mean_token_accuracy": 0.5046208530805687, + "step": 9495 + }, + { + "epoch": 1.760474601408973, + "grad_norm": 6.0, + "learning_rate": 8.239525398591028e-06, + "loss": 3.088, + "mean_token_accuracy": 0.4526572290554653, + "step": 9496 + }, + { + "epoch": 1.760659992584353, + "grad_norm": 9.1796875, + "learning_rate": 8.239340007415648e-06, + "loss": 3.5986, + "mean_token_accuracy": 0.3952473326867119, + "step": 9497 + }, + { + "epoch": 1.760845383759733, + "grad_norm": 8.328125, + "learning_rate": 8.239154616240267e-06, + "loss": 3.5032, + "mean_token_accuracy": 0.4232600025730091, + "step": 9498 + }, + { + "epoch": 1.761030774935113, + "grad_norm": 11.4765625, + "learning_rate": 8.238969225064888e-06, + "loss": 2.8789, + "mean_token_accuracy": 0.46553122465531227, + "step": 9499 + }, + { + "epoch": 1.761216166110493, + "grad_norm": 6.93359375, + "learning_rate": 8.238783833889508e-06, + "loss": 2.964, + "mean_token_accuracy": 0.46408839779005523, + "step": 9500 + }, + { + "epoch": 1.7614015572858732, + "grad_norm": 8.1875, + "learning_rate": 8.238598442714127e-06, + "loss": 3.3812, + "mean_token_accuracy": 0.4151216305062459, + "step": 9501 + }, + { + "epoch": 1.7615869484612534, + "grad_norm": 15.6015625, + "learning_rate": 8.238413051538747e-06, + "loss": 2.384, + "mean_token_accuracy": 0.4917832405269591, + "step": 9502 + }, + { + "epoch": 1.7617723396366332, + "grad_norm": 15.53125, + "learning_rate": 8.238227660363366e-06, + "loss": 2.6349, + "mean_token_accuracy": 0.45594405594405596, + "step": 9503 + }, + { + "epoch": 1.7619577308120133, + "grad_norm": 6.875, + "learning_rate": 8.238042269187987e-06, + "loss": 2.9421, + "mean_token_accuracy": 0.49014925373134327, + "step": 9504 + }, + { + "epoch": 1.7621431219873935, + "grad_norm": 9.0859375, + "learning_rate": 8.237856878012607e-06, + "loss": 2.8547, + "mean_token_accuracy": 0.48193787981093855, + "step": 9505 + }, + { + "epoch": 1.7623285131627735, + "grad_norm": 13.609375, + "learning_rate": 8.237671486837228e-06, + "loss": 2.4167, + "mean_token_accuracy": 0.5197938627313188, + "step": 9506 + }, + { + "epoch": 1.7625139043381535, + "grad_norm": 7.91015625, + "learning_rate": 8.237486095661848e-06, + "loss": 3.4779, + "mean_token_accuracy": 0.4281540989592843, + "step": 9507 + }, + { + "epoch": 1.7626992955135337, + "grad_norm": 8.96875, + "learning_rate": 8.237300704486467e-06, + "loss": 2.8377, + "mean_token_accuracy": 0.49267217630853993, + "step": 9508 + }, + { + "epoch": 1.7628846866889136, + "grad_norm": 11.1328125, + "learning_rate": 8.237115313311088e-06, + "loss": 2.9774, + "mean_token_accuracy": 0.4681141439205955, + "step": 9509 + }, + { + "epoch": 1.7630700778642936, + "grad_norm": 7.9375, + "learning_rate": 8.236929922135706e-06, + "loss": 2.6403, + "mean_token_accuracy": 0.48046875, + "step": 9510 + }, + { + "epoch": 1.7632554690396738, + "grad_norm": 6.5859375, + "learning_rate": 8.236744530960327e-06, + "loss": 2.598, + "mean_token_accuracy": 0.472168410176934, + "step": 9511 + }, + { + "epoch": 1.7634408602150538, + "grad_norm": 8.96875, + "learning_rate": 8.236559139784947e-06, + "loss": 2.3104, + "mean_token_accuracy": 0.5360123647604328, + "step": 9512 + }, + { + "epoch": 1.7636262513904337, + "grad_norm": 7.19921875, + "learning_rate": 8.236373748609568e-06, + "loss": 2.7339, + "mean_token_accuracy": 0.46906127376999696, + "step": 9513 + }, + { + "epoch": 1.763811642565814, + "grad_norm": 8.8671875, + "learning_rate": 8.236188357434187e-06, + "loss": 2.9717, + "mean_token_accuracy": 0.4657777777777778, + "step": 9514 + }, + { + "epoch": 1.7639970337411939, + "grad_norm": 6.45703125, + "learning_rate": 8.236002966258807e-06, + "loss": 3.4706, + "mean_token_accuracy": 0.4333295049959802, + "step": 9515 + }, + { + "epoch": 1.7641824249165738, + "grad_norm": 4.7890625, + "learning_rate": 8.235817575083428e-06, + "loss": 2.8686, + "mean_token_accuracy": 0.47574626865671643, + "step": 9516 + }, + { + "epoch": 1.764367816091954, + "grad_norm": 8.21875, + "learning_rate": 8.235632183908046e-06, + "loss": 2.9334, + "mean_token_accuracy": 0.48323119777158774, + "step": 9517 + }, + { + "epoch": 1.7645532072673342, + "grad_norm": 11.4140625, + "learning_rate": 8.235446792732667e-06, + "loss": 2.8059, + "mean_token_accuracy": 0.4714636987187783, + "step": 9518 + }, + { + "epoch": 1.764738598442714, + "grad_norm": 7.55859375, + "learning_rate": 8.235261401557286e-06, + "loss": 2.6959, + "mean_token_accuracy": 0.5063379300738264, + "step": 9519 + }, + { + "epoch": 1.7649239896180942, + "grad_norm": 7.25390625, + "learning_rate": 8.235076010381906e-06, + "loss": 2.9252, + "mean_token_accuracy": 0.4820542412002308, + "step": 9520 + }, + { + "epoch": 1.7651093807934743, + "grad_norm": 7.46875, + "learning_rate": 8.234890619206527e-06, + "loss": 2.8497, + "mean_token_accuracy": 0.46867454568560174, + "step": 9521 + }, + { + "epoch": 1.7652947719688543, + "grad_norm": 8.5390625, + "learning_rate": 8.234705228031147e-06, + "loss": 2.7107, + "mean_token_accuracy": 0.5033872377622378, + "step": 9522 + }, + { + "epoch": 1.7654801631442343, + "grad_norm": 6.109375, + "learning_rate": 8.234519836855766e-06, + "loss": 2.8375, + "mean_token_accuracy": 0.4396047328045768, + "step": 9523 + }, + { + "epoch": 1.7656655543196145, + "grad_norm": 6.58203125, + "learning_rate": 8.234334445680386e-06, + "loss": 3.5387, + "mean_token_accuracy": 0.4346833130328867, + "step": 9524 + }, + { + "epoch": 1.7658509454949944, + "grad_norm": 7.046875, + "learning_rate": 8.234149054505007e-06, + "loss": 3.9087, + "mean_token_accuracy": 0.3810373012334671, + "step": 9525 + }, + { + "epoch": 1.7660363366703744, + "grad_norm": 10.9375, + "learning_rate": 8.233963663329626e-06, + "loss": 3.6271, + "mean_token_accuracy": 0.42084251101321585, + "step": 9526 + }, + { + "epoch": 1.7662217278457546, + "grad_norm": 7.734375, + "learning_rate": 8.233778272154246e-06, + "loss": 3.0169, + "mean_token_accuracy": 0.46674462797836574, + "step": 9527 + }, + { + "epoch": 1.7664071190211346, + "grad_norm": 11.796875, + "learning_rate": 8.233592880978865e-06, + "loss": 2.2579, + "mean_token_accuracy": 0.5234688536032106, + "step": 9528 + }, + { + "epoch": 1.7665925101965145, + "grad_norm": 7.7421875, + "learning_rate": 8.233407489803485e-06, + "loss": 3.0129, + "mean_token_accuracy": 0.46468535675610634, + "step": 9529 + }, + { + "epoch": 1.7667779013718947, + "grad_norm": 9.0390625, + "learning_rate": 8.233222098628106e-06, + "loss": 2.8426, + "mean_token_accuracy": 0.48530549110595517, + "step": 9530 + }, + { + "epoch": 1.766963292547275, + "grad_norm": 10.0078125, + "learning_rate": 8.233036707452726e-06, + "loss": 2.7707, + "mean_token_accuracy": 0.47758152173913043, + "step": 9531 + }, + { + "epoch": 1.7671486837226547, + "grad_norm": 7.3359375, + "learning_rate": 8.232851316277345e-06, + "loss": 2.7291, + "mean_token_accuracy": 0.4583999016836672, + "step": 9532 + }, + { + "epoch": 1.7673340748980348, + "grad_norm": 6.9609375, + "learning_rate": 8.232665925101966e-06, + "loss": 2.7015, + "mean_token_accuracy": 0.5235050770966528, + "step": 9533 + }, + { + "epoch": 1.767519466073415, + "grad_norm": 7.26171875, + "learning_rate": 8.232480533926586e-06, + "loss": 2.7186, + "mean_token_accuracy": 0.47739754964089565, + "step": 9534 + }, + { + "epoch": 1.767704857248795, + "grad_norm": 6.3125, + "learning_rate": 8.232295142751205e-06, + "loss": 2.5374, + "mean_token_accuracy": 0.5085859340002986, + "step": 9535 + }, + { + "epoch": 1.767890248424175, + "grad_norm": 5.98046875, + "learning_rate": 8.232109751575826e-06, + "loss": 2.5203, + "mean_token_accuracy": 0.5136948781155848, + "step": 9536 + }, + { + "epoch": 1.7680756395995552, + "grad_norm": 6.2890625, + "learning_rate": 8.231924360400444e-06, + "loss": 2.2888, + "mean_token_accuracy": 0.5246317927882174, + "step": 9537 + }, + { + "epoch": 1.7682610307749351, + "grad_norm": 8.796875, + "learning_rate": 8.231738969225067e-06, + "loss": 2.433, + "mean_token_accuracy": 0.510969387755102, + "step": 9538 + }, + { + "epoch": 1.768446421950315, + "grad_norm": 10.8125, + "learning_rate": 8.231553578049685e-06, + "loss": 3.1167, + "mean_token_accuracy": 0.4309111880046136, + "step": 9539 + }, + { + "epoch": 1.7686318131256953, + "grad_norm": 8.046875, + "learning_rate": 8.231368186874306e-06, + "loss": 2.2262, + "mean_token_accuracy": 0.5443245778611632, + "step": 9540 + }, + { + "epoch": 1.7688172043010753, + "grad_norm": 12.8203125, + "learning_rate": 8.231182795698925e-06, + "loss": 2.8502, + "mean_token_accuracy": 0.4690436039508552, + "step": 9541 + }, + { + "epoch": 1.7690025954764552, + "grad_norm": 6.69921875, + "learning_rate": 8.230997404523545e-06, + "loss": 2.8884, + "mean_token_accuracy": 0.46958153914099593, + "step": 9542 + }, + { + "epoch": 1.7691879866518354, + "grad_norm": 5.73828125, + "learning_rate": 8.230812013348166e-06, + "loss": 3.0683, + "mean_token_accuracy": 0.4351640427833193, + "step": 9543 + }, + { + "epoch": 1.7693733778272154, + "grad_norm": 7.00390625, + "learning_rate": 8.230626622172784e-06, + "loss": 2.9766, + "mean_token_accuracy": 0.4453111457791645, + "step": 9544 + }, + { + "epoch": 1.7695587690025953, + "grad_norm": 6.12890625, + "learning_rate": 8.230441230997405e-06, + "loss": 3.0401, + "mean_token_accuracy": 0.462758219377364, + "step": 9545 + }, + { + "epoch": 1.7697441601779755, + "grad_norm": 6.2109375, + "learning_rate": 8.230255839822025e-06, + "loss": 2.9325, + "mean_token_accuracy": 0.47351970828268797, + "step": 9546 + }, + { + "epoch": 1.7699295513533557, + "grad_norm": 6.5625, + "learning_rate": 8.230070448646646e-06, + "loss": 2.8599, + "mean_token_accuracy": 0.47406434668417596, + "step": 9547 + }, + { + "epoch": 1.7701149425287355, + "grad_norm": 6.01953125, + "learning_rate": 8.229885057471265e-06, + "loss": 2.5419, + "mean_token_accuracy": 0.48299136069114473, + "step": 9548 + }, + { + "epoch": 1.7703003337041157, + "grad_norm": 7.16015625, + "learning_rate": 8.229699666295885e-06, + "loss": 2.8444, + "mean_token_accuracy": 0.45300772936036743, + "step": 9549 + }, + { + "epoch": 1.7704857248794958, + "grad_norm": 6.58984375, + "learning_rate": 8.229514275120506e-06, + "loss": 2.3771, + "mean_token_accuracy": 0.502724358974359, + "step": 9550 + }, + { + "epoch": 1.7706711160548758, + "grad_norm": 6.1953125, + "learning_rate": 8.229328883945124e-06, + "loss": 3.0421, + "mean_token_accuracy": 0.4472931075602194, + "step": 9551 + }, + { + "epoch": 1.7708565072302558, + "grad_norm": 6.1796875, + "learning_rate": 8.229143492769745e-06, + "loss": 2.7132, + "mean_token_accuracy": 0.4877537511032657, + "step": 9552 + }, + { + "epoch": 1.771041898405636, + "grad_norm": 6.22265625, + "learning_rate": 8.228958101594364e-06, + "loss": 2.8283, + "mean_token_accuracy": 0.469858857670492, + "step": 9553 + }, + { + "epoch": 1.771227289581016, + "grad_norm": 7.6875, + "learning_rate": 8.228772710418986e-06, + "loss": 2.8835, + "mean_token_accuracy": 0.45665992487720314, + "step": 9554 + }, + { + "epoch": 1.771412680756396, + "grad_norm": 9.5703125, + "learning_rate": 8.228587319243605e-06, + "loss": 2.4835, + "mean_token_accuracy": 0.495383767396996, + "step": 9555 + }, + { + "epoch": 1.771598071931776, + "grad_norm": 5.0625, + "learning_rate": 8.228401928068225e-06, + "loss": 2.6586, + "mean_token_accuracy": 0.48697999364877737, + "step": 9556 + }, + { + "epoch": 1.771783463107156, + "grad_norm": 7.97265625, + "learning_rate": 8.228216536892844e-06, + "loss": 2.7101, + "mean_token_accuracy": 0.48128165602290246, + "step": 9557 + }, + { + "epoch": 1.771968854282536, + "grad_norm": 6.1796875, + "learning_rate": 8.228031145717464e-06, + "loss": 3.0213, + "mean_token_accuracy": 0.43493975903614457, + "step": 9558 + }, + { + "epoch": 1.7721542454579162, + "grad_norm": 5.421875, + "learning_rate": 8.227845754542085e-06, + "loss": 3.0755, + "mean_token_accuracy": 0.4640784212562879, + "step": 9559 + }, + { + "epoch": 1.7723396366332964, + "grad_norm": 6.7890625, + "learning_rate": 8.227660363366704e-06, + "loss": 2.4714, + "mean_token_accuracy": 0.5230096640589047, + "step": 9560 + }, + { + "epoch": 1.7725250278086762, + "grad_norm": 6.3125, + "learning_rate": 8.227474972191324e-06, + "loss": 2.8238, + "mean_token_accuracy": 0.4774542327067207, + "step": 9561 + }, + { + "epoch": 1.7727104189840563, + "grad_norm": 6.16796875, + "learning_rate": 8.227289581015945e-06, + "loss": 2.2713, + "mean_token_accuracy": 0.519268451992162, + "step": 9562 + }, + { + "epoch": 1.7728958101594365, + "grad_norm": 7.1171875, + "learning_rate": 8.227104189840565e-06, + "loss": 2.9995, + "mean_token_accuracy": 0.44350226779727864, + "step": 9563 + }, + { + "epoch": 1.7730812013348165, + "grad_norm": 6.18359375, + "learning_rate": 8.226918798665184e-06, + "loss": 3.3824, + "mean_token_accuracy": 0.457194303486659, + "step": 9564 + }, + { + "epoch": 1.7732665925101965, + "grad_norm": 6.21484375, + "learning_rate": 8.226733407489805e-06, + "loss": 2.9569, + "mean_token_accuracy": 0.457089552238806, + "step": 9565 + }, + { + "epoch": 1.7734519836855767, + "grad_norm": 5.484375, + "learning_rate": 8.226548016314423e-06, + "loss": 2.7289, + "mean_token_accuracy": 0.4761834319526627, + "step": 9566 + }, + { + "epoch": 1.7736373748609566, + "grad_norm": 5.5703125, + "learning_rate": 8.226362625139044e-06, + "loss": 2.903, + "mean_token_accuracy": 0.4675893235418726, + "step": 9567 + }, + { + "epoch": 1.7738227660363366, + "grad_norm": 6.5703125, + "learning_rate": 8.226177233963664e-06, + "loss": 3.0034, + "mean_token_accuracy": 0.4648783814503296, + "step": 9568 + }, + { + "epoch": 1.7740081572117168, + "grad_norm": 5.6875, + "learning_rate": 8.225991842788283e-06, + "loss": 3.351, + "mean_token_accuracy": 0.4181791569086651, + "step": 9569 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 5.5, + "learning_rate": 8.225806451612904e-06, + "loss": 2.8396, + "mean_token_accuracy": 0.465905383360522, + "step": 9570 + }, + { + "epoch": 1.7743789395624767, + "grad_norm": 5.83203125, + "learning_rate": 8.225621060437524e-06, + "loss": 2.415, + "mean_token_accuracy": 0.5131720844245878, + "step": 9571 + }, + { + "epoch": 1.774564330737857, + "grad_norm": 7.4140625, + "learning_rate": 8.225435669262145e-06, + "loss": 3.2642, + "mean_token_accuracy": 0.4519869413486435, + "step": 9572 + }, + { + "epoch": 1.7747497219132369, + "grad_norm": 6.9921875, + "learning_rate": 8.225250278086763e-06, + "loss": 3.4175, + "mean_token_accuracy": 0.4090725535065285, + "step": 9573 + }, + { + "epoch": 1.7749351130886168, + "grad_norm": 6.75390625, + "learning_rate": 8.225064886911384e-06, + "loss": 2.4711, + "mean_token_accuracy": 0.5050037341299477, + "step": 9574 + }, + { + "epoch": 1.775120504263997, + "grad_norm": 6.890625, + "learning_rate": 8.224879495736003e-06, + "loss": 3.3154, + "mean_token_accuracy": 0.44387755102040816, + "step": 9575 + }, + { + "epoch": 1.7753058954393772, + "grad_norm": 5.3203125, + "learning_rate": 8.224694104560623e-06, + "loss": 2.7816, + "mean_token_accuracy": 0.4677017723743958, + "step": 9576 + }, + { + "epoch": 1.7754912866147572, + "grad_norm": 6.27734375, + "learning_rate": 8.224508713385244e-06, + "loss": 3.3314, + "mean_token_accuracy": 0.4208096590909091, + "step": 9577 + }, + { + "epoch": 1.7756766777901372, + "grad_norm": 5.44140625, + "learning_rate": 8.224323322209864e-06, + "loss": 3.0081, + "mean_token_accuracy": 0.44363310274148093, + "step": 9578 + }, + { + "epoch": 1.7758620689655173, + "grad_norm": 7.30859375, + "learning_rate": 8.224137931034483e-06, + "loss": 2.7279, + "mean_token_accuracy": 0.47324169224609636, + "step": 9579 + }, + { + "epoch": 1.7760474601408973, + "grad_norm": 5.89453125, + "learning_rate": 8.223952539859103e-06, + "loss": 4.0705, + "mean_token_accuracy": 0.39968445963712856, + "step": 9580 + }, + { + "epoch": 1.7762328513162773, + "grad_norm": 7.27734375, + "learning_rate": 8.223767148683724e-06, + "loss": 2.4423, + "mean_token_accuracy": 0.5002010993430754, + "step": 9581 + }, + { + "epoch": 1.7764182424916575, + "grad_norm": 6.3515625, + "learning_rate": 8.223581757508343e-06, + "loss": 2.4231, + "mean_token_accuracy": 0.5058267716535433, + "step": 9582 + }, + { + "epoch": 1.7766036336670374, + "grad_norm": 6.09765625, + "learning_rate": 8.223396366332963e-06, + "loss": 2.9354, + "mean_token_accuracy": 0.47269279393173197, + "step": 9583 + }, + { + "epoch": 1.7767890248424174, + "grad_norm": 5.265625, + "learning_rate": 8.223210975157582e-06, + "loss": 2.7054, + "mean_token_accuracy": 0.4783329648463409, + "step": 9584 + }, + { + "epoch": 1.7769744160177976, + "grad_norm": 5.5859375, + "learning_rate": 8.223025583982202e-06, + "loss": 3.2465, + "mean_token_accuracy": 0.4434366339128244, + "step": 9585 + }, + { + "epoch": 1.7771598071931776, + "grad_norm": 7.39453125, + "learning_rate": 8.222840192806823e-06, + "loss": 3.0398, + "mean_token_accuracy": 0.44512443900448795, + "step": 9586 + }, + { + "epoch": 1.7773451983685575, + "grad_norm": 6.125, + "learning_rate": 8.222654801631443e-06, + "loss": 2.4514, + "mean_token_accuracy": 0.47677419354838707, + "step": 9587 + }, + { + "epoch": 1.7775305895439377, + "grad_norm": 7.1328125, + "learning_rate": 8.222469410456064e-06, + "loss": 2.4678, + "mean_token_accuracy": 0.5025551371705218, + "step": 9588 + }, + { + "epoch": 1.777715980719318, + "grad_norm": 7.953125, + "learning_rate": 8.222284019280683e-06, + "loss": 3.0658, + "mean_token_accuracy": 0.45177728063634104, + "step": 9589 + }, + { + "epoch": 1.7779013718946977, + "grad_norm": 7.08203125, + "learning_rate": 8.222098628105303e-06, + "loss": 3.5725, + "mean_token_accuracy": 0.42514670995130477, + "step": 9590 + }, + { + "epoch": 1.7780867630700778, + "grad_norm": 7.76171875, + "learning_rate": 8.221913236929922e-06, + "loss": 2.6558, + "mean_token_accuracy": 0.4767092829349639, + "step": 9591 + }, + { + "epoch": 1.778272154245458, + "grad_norm": 7.91015625, + "learning_rate": 8.221727845754543e-06, + "loss": 3.0104, + "mean_token_accuracy": 0.47475157168931253, + "step": 9592 + }, + { + "epoch": 1.778457545420838, + "grad_norm": 7.23828125, + "learning_rate": 8.221542454579163e-06, + "loss": 2.2635, + "mean_token_accuracy": 0.5650228576986674, + "step": 9593 + }, + { + "epoch": 1.778642936596218, + "grad_norm": 7.6640625, + "learning_rate": 8.221357063403784e-06, + "loss": 2.8007, + "mean_token_accuracy": 0.4931309904153355, + "step": 9594 + }, + { + "epoch": 1.7788283277715982, + "grad_norm": 6.625, + "learning_rate": 8.221171672228402e-06, + "loss": 2.4612, + "mean_token_accuracy": 0.4998455677957377, + "step": 9595 + }, + { + "epoch": 1.7790137189469781, + "grad_norm": 7.24609375, + "learning_rate": 8.220986281053023e-06, + "loss": 2.1666, + "mean_token_accuracy": 0.5443766937669376, + "step": 9596 + }, + { + "epoch": 1.779199110122358, + "grad_norm": 5.3359375, + "learning_rate": 8.220800889877643e-06, + "loss": 2.626, + "mean_token_accuracy": 0.4946322521699406, + "step": 9597 + }, + { + "epoch": 1.7793845012977383, + "grad_norm": 6.02734375, + "learning_rate": 8.220615498702262e-06, + "loss": 2.9786, + "mean_token_accuracy": 0.4478200618270973, + "step": 9598 + }, + { + "epoch": 1.7795698924731183, + "grad_norm": 5.5859375, + "learning_rate": 8.220430107526883e-06, + "loss": 2.8726, + "mean_token_accuracy": 0.4552467609534171, + "step": 9599 + }, + { + "epoch": 1.7797552836484982, + "grad_norm": 5.484375, + "learning_rate": 8.220244716351501e-06, + "loss": 3.4032, + "mean_token_accuracy": 0.43150248502612465, + "step": 9600 + }, + { + "epoch": 1.7799406748238784, + "grad_norm": 5.06640625, + "learning_rate": 8.220059325176122e-06, + "loss": 3.0623, + "mean_token_accuracy": 0.4571986811576505, + "step": 9601 + }, + { + "epoch": 1.7801260659992586, + "grad_norm": 9.0234375, + "learning_rate": 8.219873934000742e-06, + "loss": 2.7846, + "mean_token_accuracy": 0.46526131969564677, + "step": 9602 + }, + { + "epoch": 1.7803114571746383, + "grad_norm": 6.58984375, + "learning_rate": 8.219688542825363e-06, + "loss": 2.8147, + "mean_token_accuracy": 0.4742242242242242, + "step": 9603 + }, + { + "epoch": 1.7804968483500185, + "grad_norm": 9.328125, + "learning_rate": 8.219503151649982e-06, + "loss": 2.6249, + "mean_token_accuracy": 0.49377299745258985, + "step": 9604 + }, + { + "epoch": 1.7806822395253987, + "grad_norm": 7.05859375, + "learning_rate": 8.219317760474602e-06, + "loss": 2.7874, + "mean_token_accuracy": 0.4858474858474858, + "step": 9605 + }, + { + "epoch": 1.7808676307007787, + "grad_norm": 7.86328125, + "learning_rate": 8.219132369299223e-06, + "loss": 2.3155, + "mean_token_accuracy": 0.5153146100401214, + "step": 9606 + }, + { + "epoch": 1.7810530218761587, + "grad_norm": 8.7578125, + "learning_rate": 8.218946978123841e-06, + "loss": 3.0138, + "mean_token_accuracy": 0.4533134259790973, + "step": 9607 + }, + { + "epoch": 1.7812384130515388, + "grad_norm": 6.40234375, + "learning_rate": 8.218761586948462e-06, + "loss": 3.2122, + "mean_token_accuracy": 0.43982494529540483, + "step": 9608 + }, + { + "epoch": 1.7814238042269188, + "grad_norm": 8.03125, + "learning_rate": 8.21857619577308e-06, + "loss": 2.9155, + "mean_token_accuracy": 0.45991892245324967, + "step": 9609 + }, + { + "epoch": 1.7816091954022988, + "grad_norm": 6.79296875, + "learning_rate": 8.218390804597703e-06, + "loss": 2.9605, + "mean_token_accuracy": 0.4465611083621969, + "step": 9610 + }, + { + "epoch": 1.781794586577679, + "grad_norm": 6.73828125, + "learning_rate": 8.218205413422322e-06, + "loss": 2.9695, + "mean_token_accuracy": 0.45112521638776687, + "step": 9611 + }, + { + "epoch": 1.781979977753059, + "grad_norm": 6.8671875, + "learning_rate": 8.218020022246942e-06, + "loss": 2.9868, + "mean_token_accuracy": 0.4624202162302983, + "step": 9612 + }, + { + "epoch": 1.782165368928439, + "grad_norm": 8.0703125, + "learning_rate": 8.217834631071561e-06, + "loss": 2.6783, + "mean_token_accuracy": 0.4740061162079511, + "step": 9613 + }, + { + "epoch": 1.782350760103819, + "grad_norm": 9.1171875, + "learning_rate": 8.217649239896181e-06, + "loss": 2.6579, + "mean_token_accuracy": 0.47734587769249814, + "step": 9614 + }, + { + "epoch": 1.782536151279199, + "grad_norm": 6.51953125, + "learning_rate": 8.217463848720802e-06, + "loss": 3.014, + "mean_token_accuracy": 0.45768025078369906, + "step": 9615 + }, + { + "epoch": 1.782721542454579, + "grad_norm": 7.5234375, + "learning_rate": 8.21727845754542e-06, + "loss": 2.3631, + "mean_token_accuracy": 0.5174494455316373, + "step": 9616 + }, + { + "epoch": 1.7829069336299592, + "grad_norm": 6.375, + "learning_rate": 8.217093066370041e-06, + "loss": 3.3574, + "mean_token_accuracy": 0.43539923415118426, + "step": 9617 + }, + { + "epoch": 1.7830923248053394, + "grad_norm": 6.34375, + "learning_rate": 8.216907675194662e-06, + "loss": 2.7432, + "mean_token_accuracy": 0.5021739130434782, + "step": 9618 + }, + { + "epoch": 1.7832777159807192, + "grad_norm": 7.01171875, + "learning_rate": 8.216722284019282e-06, + "loss": 3.1228, + "mean_token_accuracy": 0.4425440940673437, + "step": 9619 + }, + { + "epoch": 1.7834631071560993, + "grad_norm": 5.9765625, + "learning_rate": 8.216536892843901e-06, + "loss": 2.9486, + "mean_token_accuracy": 0.47592620368981553, + "step": 9620 + }, + { + "epoch": 1.7836484983314795, + "grad_norm": 5.93359375, + "learning_rate": 8.216351501668522e-06, + "loss": 3.1292, + "mean_token_accuracy": 0.4308355345474767, + "step": 9621 + }, + { + "epoch": 1.7838338895068595, + "grad_norm": 6.48828125, + "learning_rate": 8.21616611049314e-06, + "loss": 2.4951, + "mean_token_accuracy": 0.493866424352567, + "step": 9622 + }, + { + "epoch": 1.7840192806822395, + "grad_norm": 6.7109375, + "learning_rate": 8.21598071931776e-06, + "loss": 2.5534, + "mean_token_accuracy": 0.4870761204647854, + "step": 9623 + }, + { + "epoch": 1.7842046718576197, + "grad_norm": 5.5859375, + "learning_rate": 8.215795328142381e-06, + "loss": 2.7453, + "mean_token_accuracy": 0.4646808510638298, + "step": 9624 + }, + { + "epoch": 1.7843900630329996, + "grad_norm": 6.4921875, + "learning_rate": 8.215609936967e-06, + "loss": 2.8947, + "mean_token_accuracy": 0.45857359635811834, + "step": 9625 + }, + { + "epoch": 1.7845754542083796, + "grad_norm": 9.5078125, + "learning_rate": 8.215424545791622e-06, + "loss": 3.2034, + "mean_token_accuracy": 0.4329777365491651, + "step": 9626 + }, + { + "epoch": 1.7847608453837598, + "grad_norm": 7.734375, + "learning_rate": 8.215239154616241e-06, + "loss": 2.5492, + "mean_token_accuracy": 0.49579045837231056, + "step": 9627 + }, + { + "epoch": 1.7849462365591398, + "grad_norm": 6.27734375, + "learning_rate": 8.215053763440862e-06, + "loss": 2.6523, + "mean_token_accuracy": 0.4826128460384563, + "step": 9628 + }, + { + "epoch": 1.7851316277345197, + "grad_norm": 9.65625, + "learning_rate": 8.21486837226548e-06, + "loss": 2.8742, + "mean_token_accuracy": 0.4262508122157245, + "step": 9629 + }, + { + "epoch": 1.7853170189099, + "grad_norm": 5.48828125, + "learning_rate": 8.214682981090101e-06, + "loss": 2.7872, + "mean_token_accuracy": 0.4583948793697686, + "step": 9630 + }, + { + "epoch": 1.78550241008528, + "grad_norm": 5.7109375, + "learning_rate": 8.214497589914721e-06, + "loss": 3.2122, + "mean_token_accuracy": 0.43611793611793614, + "step": 9631 + }, + { + "epoch": 1.7856878012606598, + "grad_norm": 6.4765625, + "learning_rate": 8.21431219873934e-06, + "loss": 2.197, + "mean_token_accuracy": 0.5417288641606731, + "step": 9632 + }, + { + "epoch": 1.78587319243604, + "grad_norm": 6.23046875, + "learning_rate": 8.21412680756396e-06, + "loss": 2.8691, + "mean_token_accuracy": 0.46356519120746764, + "step": 9633 + }, + { + "epoch": 1.7860585836114202, + "grad_norm": 7.52734375, + "learning_rate": 8.213941416388581e-06, + "loss": 2.8759, + "mean_token_accuracy": 0.48735163649442514, + "step": 9634 + }, + { + "epoch": 1.7862439747868002, + "grad_norm": 5.2578125, + "learning_rate": 8.213756025213202e-06, + "loss": 3.1746, + "mean_token_accuracy": 0.4302978515625, + "step": 9635 + }, + { + "epoch": 1.7864293659621802, + "grad_norm": 6.18359375, + "learning_rate": 8.21357063403782e-06, + "loss": 4.1098, + "mean_token_accuracy": 0.3805104408352668, + "step": 9636 + }, + { + "epoch": 1.7866147571375603, + "grad_norm": 6.921875, + "learning_rate": 8.213385242862441e-06, + "loss": 2.913, + "mean_token_accuracy": 0.4610784837159637, + "step": 9637 + }, + { + "epoch": 1.7868001483129403, + "grad_norm": 7.44140625, + "learning_rate": 8.21319985168706e-06, + "loss": 2.5299, + "mean_token_accuracy": 0.4878655880522713, + "step": 9638 + }, + { + "epoch": 1.7869855394883203, + "grad_norm": 5.7890625, + "learning_rate": 8.21301446051168e-06, + "loss": 3.5456, + "mean_token_accuracy": 0.41764783701547825, + "step": 9639 + }, + { + "epoch": 1.7871709306637005, + "grad_norm": 7.68359375, + "learning_rate": 8.2128290693363e-06, + "loss": 3.226, + "mean_token_accuracy": 0.4437374413931681, + "step": 9640 + }, + { + "epoch": 1.7873563218390804, + "grad_norm": 7.58203125, + "learning_rate": 8.21264367816092e-06, + "loss": 2.8886, + "mean_token_accuracy": 0.4631212053069485, + "step": 9641 + }, + { + "epoch": 1.7875417130144604, + "grad_norm": 7.4921875, + "learning_rate": 8.21245828698554e-06, + "loss": 3.1694, + "mean_token_accuracy": 0.4325104662913238, + "step": 9642 + }, + { + "epoch": 1.7877271041898406, + "grad_norm": 6.7734375, + "learning_rate": 8.21227289581016e-06, + "loss": 3.1785, + "mean_token_accuracy": 0.44219549946422193, + "step": 9643 + }, + { + "epoch": 1.7879124953652206, + "grad_norm": 6.328125, + "learning_rate": 8.212087504634781e-06, + "loss": 3.1936, + "mean_token_accuracy": 0.4452657897597565, + "step": 9644 + }, + { + "epoch": 1.7880978865406005, + "grad_norm": 8.1328125, + "learning_rate": 8.2119021134594e-06, + "loss": 2.6013, + "mean_token_accuracy": 0.48523636363636363, + "step": 9645 + }, + { + "epoch": 1.7882832777159807, + "grad_norm": 8.921875, + "learning_rate": 8.21171672228402e-06, + "loss": 3.0429, + "mean_token_accuracy": 0.462721110927426, + "step": 9646 + }, + { + "epoch": 1.788468668891361, + "grad_norm": 6.37890625, + "learning_rate": 8.211531331108639e-06, + "loss": 2.6957, + "mean_token_accuracy": 0.4734561213434453, + "step": 9647 + }, + { + "epoch": 1.7886540600667407, + "grad_norm": 6.1328125, + "learning_rate": 8.21134593993326e-06, + "loss": 3.2295, + "mean_token_accuracy": 0.42920110192837463, + "step": 9648 + }, + { + "epoch": 1.7888394512421208, + "grad_norm": 11.2109375, + "learning_rate": 8.21116054875788e-06, + "loss": 2.49, + "mean_token_accuracy": 0.4917073170731707, + "step": 9649 + }, + { + "epoch": 1.789024842417501, + "grad_norm": 6.3984375, + "learning_rate": 8.210975157582499e-06, + "loss": 2.9582, + "mean_token_accuracy": 0.44768289128533956, + "step": 9650 + }, + { + "epoch": 1.789210233592881, + "grad_norm": 7.12109375, + "learning_rate": 8.21078976640712e-06, + "loss": 3.4092, + "mean_token_accuracy": 0.44888435175732827, + "step": 9651 + }, + { + "epoch": 1.789395624768261, + "grad_norm": 6.74609375, + "learning_rate": 8.21060437523174e-06, + "loss": 3.0891, + "mean_token_accuracy": 0.44674428633031477, + "step": 9652 + }, + { + "epoch": 1.7895810159436412, + "grad_norm": 8.6171875, + "learning_rate": 8.21041898405636e-06, + "loss": 2.785, + "mean_token_accuracy": 0.48672566371681414, + "step": 9653 + }, + { + "epoch": 1.7897664071190211, + "grad_norm": 8.1328125, + "learning_rate": 8.210233592880979e-06, + "loss": 2.7493, + "mean_token_accuracy": 0.4682337139019476, + "step": 9654 + }, + { + "epoch": 1.789951798294401, + "grad_norm": 7.171875, + "learning_rate": 8.2100482017056e-06, + "loss": 3.4636, + "mean_token_accuracy": 0.40964840556009813, + "step": 9655 + }, + { + "epoch": 1.7901371894697813, + "grad_norm": 5.58984375, + "learning_rate": 8.209862810530218e-06, + "loss": 3.2928, + "mean_token_accuracy": 0.42924393723252496, + "step": 9656 + }, + { + "epoch": 1.7903225806451613, + "grad_norm": 5.51171875, + "learning_rate": 8.209677419354839e-06, + "loss": 2.8689, + "mean_token_accuracy": 0.46484708175947365, + "step": 9657 + }, + { + "epoch": 1.7905079718205412, + "grad_norm": 9.1484375, + "learning_rate": 8.20949202817946e-06, + "loss": 3.4795, + "mean_token_accuracy": 0.40644187216909916, + "step": 9658 + }, + { + "epoch": 1.7906933629959214, + "grad_norm": 7.0234375, + "learning_rate": 8.20930663700408e-06, + "loss": 2.6107, + "mean_token_accuracy": 0.48478071810162754, + "step": 9659 + }, + { + "epoch": 1.7908787541713016, + "grad_norm": 8.546875, + "learning_rate": 8.209121245828699e-06, + "loss": 2.3993, + "mean_token_accuracy": 0.5128870157237801, + "step": 9660 + }, + { + "epoch": 1.7910641453466813, + "grad_norm": 7.68359375, + "learning_rate": 8.20893585465332e-06, + "loss": 2.9357, + "mean_token_accuracy": 0.48224023581429626, + "step": 9661 + }, + { + "epoch": 1.7912495365220615, + "grad_norm": 6.59765625, + "learning_rate": 8.20875046347794e-06, + "loss": 2.6467, + "mean_token_accuracy": 0.5277408437084492, + "step": 9662 + }, + { + "epoch": 1.7914349276974417, + "grad_norm": 6.9765625, + "learning_rate": 8.208565072302558e-06, + "loss": 2.6732, + "mean_token_accuracy": 0.47769355039145023, + "step": 9663 + }, + { + "epoch": 1.7916203188728217, + "grad_norm": 5.86328125, + "learning_rate": 8.208379681127179e-06, + "loss": 3.4767, + "mean_token_accuracy": 0.427953689496443, + "step": 9664 + }, + { + "epoch": 1.7918057100482017, + "grad_norm": 6.1484375, + "learning_rate": 8.208194289951798e-06, + "loss": 2.6623, + "mean_token_accuracy": 0.4913562895291637, + "step": 9665 + }, + { + "epoch": 1.7919911012235819, + "grad_norm": 6.0625, + "learning_rate": 8.208008898776418e-06, + "loss": 3.2077, + "mean_token_accuracy": 0.4350403034613561, + "step": 9666 + }, + { + "epoch": 1.7921764923989618, + "grad_norm": 5.66796875, + "learning_rate": 8.207823507601039e-06, + "loss": 2.9873, + "mean_token_accuracy": 0.4515014615997874, + "step": 9667 + }, + { + "epoch": 1.7923618835743418, + "grad_norm": 8.0390625, + "learning_rate": 8.20763811642566e-06, + "loss": 2.9095, + "mean_token_accuracy": 0.4779992478375329, + "step": 9668 + }, + { + "epoch": 1.792547274749722, + "grad_norm": 8.109375, + "learning_rate": 8.20745272525028e-06, + "loss": 2.78, + "mean_token_accuracy": 0.4652019650655022, + "step": 9669 + }, + { + "epoch": 1.792732665925102, + "grad_norm": 5.765625, + "learning_rate": 8.207267334074899e-06, + "loss": 2.6805, + "mean_token_accuracy": 0.46454802259887007, + "step": 9670 + }, + { + "epoch": 1.792918057100482, + "grad_norm": 6.828125, + "learning_rate": 8.207081942899519e-06, + "loss": 3.4574, + "mean_token_accuracy": 0.4247166756610901, + "step": 9671 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 10.3359375, + "learning_rate": 8.206896551724138e-06, + "loss": 3.2071, + "mean_token_accuracy": 0.4386710239651416, + "step": 9672 + }, + { + "epoch": 1.793288839451242, + "grad_norm": 7.546875, + "learning_rate": 8.206711160548758e-06, + "loss": 2.956, + "mean_token_accuracy": 0.4792671166827387, + "step": 9673 + }, + { + "epoch": 1.793474230626622, + "grad_norm": 5.8671875, + "learning_rate": 8.206525769373379e-06, + "loss": 2.8832, + "mean_token_accuracy": 0.4693033261625021, + "step": 9674 + }, + { + "epoch": 1.7936596218020022, + "grad_norm": 9.078125, + "learning_rate": 8.206340378198e-06, + "loss": 2.4613, + "mean_token_accuracy": 0.48221767514372016, + "step": 9675 + }, + { + "epoch": 1.7938450129773824, + "grad_norm": 9.9375, + "learning_rate": 8.206154987022618e-06, + "loss": 2.3304, + "mean_token_accuracy": 0.517806111233238, + "step": 9676 + }, + { + "epoch": 1.7940304041527624, + "grad_norm": 7.90234375, + "learning_rate": 8.205969595847239e-06, + "loss": 2.8567, + "mean_token_accuracy": 0.4871621621621622, + "step": 9677 + }, + { + "epoch": 1.7942157953281423, + "grad_norm": 9.390625, + "learning_rate": 8.205784204671859e-06, + "loss": 3.6323, + "mean_token_accuracy": 0.41097456416118894, + "step": 9678 + }, + { + "epoch": 1.7944011865035225, + "grad_norm": 8.4921875, + "learning_rate": 8.205598813496478e-06, + "loss": 3.6311, + "mean_token_accuracy": 0.3952914798206278, + "step": 9679 + }, + { + "epoch": 1.7945865776789025, + "grad_norm": 10.09375, + "learning_rate": 8.205413422321098e-06, + "loss": 2.6286, + "mean_token_accuracy": 0.46046301864101025, + "step": 9680 + }, + { + "epoch": 1.7947719688542825, + "grad_norm": 6.74609375, + "learning_rate": 8.205228031145717e-06, + "loss": 2.9353, + "mean_token_accuracy": 0.47051133062173156, + "step": 9681 + }, + { + "epoch": 1.7949573600296627, + "grad_norm": 7.703125, + "learning_rate": 8.205042639970338e-06, + "loss": 3.289, + "mean_token_accuracy": 0.436469824789098, + "step": 9682 + }, + { + "epoch": 1.7951427512050426, + "grad_norm": 12.5, + "learning_rate": 8.204857248794958e-06, + "loss": 2.3779, + "mean_token_accuracy": 0.4870514820592824, + "step": 9683 + }, + { + "epoch": 1.7953281423804226, + "grad_norm": 9.46875, + "learning_rate": 8.204671857619579e-06, + "loss": 2.7959, + "mean_token_accuracy": 0.4654761904761905, + "step": 9684 + }, + { + "epoch": 1.7955135335558028, + "grad_norm": 6.4609375, + "learning_rate": 8.204486466444197e-06, + "loss": 3.0577, + "mean_token_accuracy": 0.4372146118721461, + "step": 9685 + }, + { + "epoch": 1.7956989247311828, + "grad_norm": 7.03515625, + "learning_rate": 8.204301075268818e-06, + "loss": 3.3634, + "mean_token_accuracy": 0.4482315112540193, + "step": 9686 + }, + { + "epoch": 1.7958843159065627, + "grad_norm": 9.21875, + "learning_rate": 8.204115684093438e-06, + "loss": 2.9286, + "mean_token_accuracy": 0.4506856540084388, + "step": 9687 + }, + { + "epoch": 1.796069707081943, + "grad_norm": 6.52734375, + "learning_rate": 8.203930292918057e-06, + "loss": 2.6642, + "mean_token_accuracy": 0.47392182487822265, + "step": 9688 + }, + { + "epoch": 1.796255098257323, + "grad_norm": 7.3046875, + "learning_rate": 8.203744901742678e-06, + "loss": 2.7706, + "mean_token_accuracy": 0.4916674339379431, + "step": 9689 + }, + { + "epoch": 1.7964404894327028, + "grad_norm": 7.10546875, + "learning_rate": 8.203559510567296e-06, + "loss": 3.2834, + "mean_token_accuracy": 0.44178847807394667, + "step": 9690 + }, + { + "epoch": 1.796625880608083, + "grad_norm": 5.35546875, + "learning_rate": 8.203374119391919e-06, + "loss": 2.6904, + "mean_token_accuracy": 0.4775310740465314, + "step": 9691 + }, + { + "epoch": 1.7968112717834632, + "grad_norm": 5.125, + "learning_rate": 8.203188728216537e-06, + "loss": 2.6956, + "mean_token_accuracy": 0.46671750735374223, + "step": 9692 + }, + { + "epoch": 1.7969966629588432, + "grad_norm": 6.16796875, + "learning_rate": 8.203003337041158e-06, + "loss": 3.2733, + "mean_token_accuracy": 0.4330246913580247, + "step": 9693 + }, + { + "epoch": 1.7971820541342232, + "grad_norm": 8.9140625, + "learning_rate": 8.202817945865777e-06, + "loss": 3.0777, + "mean_token_accuracy": 0.4467065868263473, + "step": 9694 + }, + { + "epoch": 1.7973674453096034, + "grad_norm": 6.38671875, + "learning_rate": 8.202632554690397e-06, + "loss": 3.2288, + "mean_token_accuracy": 0.43284913353720694, + "step": 9695 + }, + { + "epoch": 1.7975528364849833, + "grad_norm": 6.1171875, + "learning_rate": 8.202447163515018e-06, + "loss": 3.0861, + "mean_token_accuracy": 0.44957768108523166, + "step": 9696 + }, + { + "epoch": 1.7977382276603633, + "grad_norm": 5.78515625, + "learning_rate": 8.202261772339637e-06, + "loss": 2.5436, + "mean_token_accuracy": 0.4953366813569745, + "step": 9697 + }, + { + "epoch": 1.7979236188357435, + "grad_norm": 7.46484375, + "learning_rate": 8.202076381164257e-06, + "loss": 2.9292, + "mean_token_accuracy": 0.4704201680672269, + "step": 9698 + }, + { + "epoch": 1.7981090100111234, + "grad_norm": 5.6015625, + "learning_rate": 8.201890989988878e-06, + "loss": 2.5926, + "mean_token_accuracy": 0.4812778214192197, + "step": 9699 + }, + { + "epoch": 1.7982944011865034, + "grad_norm": 5.421875, + "learning_rate": 8.201705598813498e-06, + "loss": 2.7337, + "mean_token_accuracy": 0.4800646551724138, + "step": 9700 + }, + { + "epoch": 1.7984797923618836, + "grad_norm": 7.63671875, + "learning_rate": 8.201520207638117e-06, + "loss": 2.6219, + "mean_token_accuracy": 0.5316345736209642, + "step": 9701 + }, + { + "epoch": 1.7986651835372638, + "grad_norm": 6.9375, + "learning_rate": 8.201334816462737e-06, + "loss": 2.0324, + "mean_token_accuracy": 0.568499660556687, + "step": 9702 + }, + { + "epoch": 1.7988505747126435, + "grad_norm": 7.09765625, + "learning_rate": 8.201149425287356e-06, + "loss": 2.771, + "mean_token_accuracy": 0.4827756513149772, + "step": 9703 + }, + { + "epoch": 1.7990359658880237, + "grad_norm": 7.47265625, + "learning_rate": 8.200964034111977e-06, + "loss": 2.7877, + "mean_token_accuracy": 0.49731077003804275, + "step": 9704 + }, + { + "epoch": 1.799221357063404, + "grad_norm": 7.1875, + "learning_rate": 8.200778642936597e-06, + "loss": 2.8058, + "mean_token_accuracy": 0.47399684810280035, + "step": 9705 + }, + { + "epoch": 1.7994067482387839, + "grad_norm": 6.18359375, + "learning_rate": 8.200593251761216e-06, + "loss": 2.9046, + "mean_token_accuracy": 0.45935937789298276, + "step": 9706 + }, + { + "epoch": 1.7995921394141638, + "grad_norm": 6.23828125, + "learning_rate": 8.200407860585838e-06, + "loss": 2.9626, + "mean_token_accuracy": 0.44463559587112916, + "step": 9707 + }, + { + "epoch": 1.799777530589544, + "grad_norm": 5.828125, + "learning_rate": 8.200222469410457e-06, + "loss": 2.2819, + "mean_token_accuracy": 0.5537817028985508, + "step": 9708 + }, + { + "epoch": 1.799962921764924, + "grad_norm": 5.4375, + "learning_rate": 8.200037078235077e-06, + "loss": 2.642, + "mean_token_accuracy": 0.5057347670250896, + "step": 9709 + }, + { + "epoch": 1.800148312940304, + "grad_norm": 6.09375, + "learning_rate": 8.199851687059696e-06, + "loss": 3.0603, + "mean_token_accuracy": 0.4719835876394409, + "step": 9710 + }, + { + "epoch": 1.8003337041156842, + "grad_norm": 6.38671875, + "learning_rate": 8.199666295884317e-06, + "loss": 2.6213, + "mean_token_accuracy": 0.4742669993761697, + "step": 9711 + }, + { + "epoch": 1.8005190952910641, + "grad_norm": 6.43359375, + "learning_rate": 8.199480904708937e-06, + "loss": 3.066, + "mean_token_accuracy": 0.43764013452914796, + "step": 9712 + }, + { + "epoch": 1.800704486466444, + "grad_norm": 8.265625, + "learning_rate": 8.199295513533556e-06, + "loss": 3.1842, + "mean_token_accuracy": 0.44929006085192696, + "step": 9713 + }, + { + "epoch": 1.8008898776418243, + "grad_norm": 6.28125, + "learning_rate": 8.199110122358176e-06, + "loss": 3.8345, + "mean_token_accuracy": 0.4025541365907829, + "step": 9714 + }, + { + "epoch": 1.8010752688172043, + "grad_norm": 6.25, + "learning_rate": 8.198924731182797e-06, + "loss": 3.0789, + "mean_token_accuracy": 0.4502623773671002, + "step": 9715 + }, + { + "epoch": 1.8012606599925842, + "grad_norm": 7.56640625, + "learning_rate": 8.198739340007417e-06, + "loss": 2.8256, + "mean_token_accuracy": 0.4991372368572415, + "step": 9716 + }, + { + "epoch": 1.8014460511679644, + "grad_norm": 6.33984375, + "learning_rate": 8.198553948832036e-06, + "loss": 2.7627, + "mean_token_accuracy": 0.47356095155922595, + "step": 9717 + }, + { + "epoch": 1.8016314423433446, + "grad_norm": 6.7578125, + "learning_rate": 8.198368557656657e-06, + "loss": 2.8067, + "mean_token_accuracy": 0.4675972083748754, + "step": 9718 + }, + { + "epoch": 1.8018168335187243, + "grad_norm": 6.515625, + "learning_rate": 8.198183166481275e-06, + "loss": 3.1668, + "mean_token_accuracy": 0.46179354094579006, + "step": 9719 + }, + { + "epoch": 1.8020022246941045, + "grad_norm": 6.91796875, + "learning_rate": 8.197997775305896e-06, + "loss": 2.7347, + "mean_token_accuracy": 0.46636971046770603, + "step": 9720 + }, + { + "epoch": 1.8021876158694847, + "grad_norm": 7.0859375, + "learning_rate": 8.197812384130516e-06, + "loss": 4.1433, + "mean_token_accuracy": 0.3972676856485865, + "step": 9721 + }, + { + "epoch": 1.8023730070448647, + "grad_norm": 6.19140625, + "learning_rate": 8.197626992955135e-06, + "loss": 3.0488, + "mean_token_accuracy": 0.46537741734248284, + "step": 9722 + }, + { + "epoch": 1.8025583982202447, + "grad_norm": 5.9453125, + "learning_rate": 8.197441601779756e-06, + "loss": 3.0955, + "mean_token_accuracy": 0.4326722338204593, + "step": 9723 + }, + { + "epoch": 1.8027437893956249, + "grad_norm": 5.84765625, + "learning_rate": 8.197256210604376e-06, + "loss": 2.6003, + "mean_token_accuracy": 0.48848, + "step": 9724 + }, + { + "epoch": 1.8029291805710048, + "grad_norm": 5.82421875, + "learning_rate": 8.197070819428997e-06, + "loss": 3.3642, + "mean_token_accuracy": 0.42700548081714, + "step": 9725 + }, + { + "epoch": 1.8031145717463848, + "grad_norm": 5.984375, + "learning_rate": 8.196885428253616e-06, + "loss": 2.1562, + "mean_token_accuracy": 0.5632199485025071, + "step": 9726 + }, + { + "epoch": 1.803299962921765, + "grad_norm": 6.5078125, + "learning_rate": 8.196700037078236e-06, + "loss": 3.561, + "mean_token_accuracy": 0.4193629929221436, + "step": 9727 + }, + { + "epoch": 1.803485354097145, + "grad_norm": 7.296875, + "learning_rate": 8.196514645902855e-06, + "loss": 2.4221, + "mean_token_accuracy": 0.48274898728872745, + "step": 9728 + }, + { + "epoch": 1.803670745272525, + "grad_norm": 6.9765625, + "learning_rate": 8.196329254727475e-06, + "loss": 2.7572, + "mean_token_accuracy": 0.4813626642224259, + "step": 9729 + }, + { + "epoch": 1.803856136447905, + "grad_norm": 5.28515625, + "learning_rate": 8.196143863552096e-06, + "loss": 2.7202, + "mean_token_accuracy": 0.45993413830954993, + "step": 9730 + }, + { + "epoch": 1.8040415276232853, + "grad_norm": 8.5859375, + "learning_rate": 8.195958472376716e-06, + "loss": 2.8889, + "mean_token_accuracy": 0.4806515085264539, + "step": 9731 + }, + { + "epoch": 1.804226918798665, + "grad_norm": 7.5859375, + "learning_rate": 8.195773081201335e-06, + "loss": 3.3232, + "mean_token_accuracy": 0.4307084391758397, + "step": 9732 + }, + { + "epoch": 1.8044123099740452, + "grad_norm": 7.33203125, + "learning_rate": 8.195587690025956e-06, + "loss": 3.5033, + "mean_token_accuracy": 0.41774845711853587, + "step": 9733 + }, + { + "epoch": 1.8045977011494254, + "grad_norm": 8.171875, + "learning_rate": 8.195402298850576e-06, + "loss": 2.5851, + "mean_token_accuracy": 0.48422800221361373, + "step": 9734 + }, + { + "epoch": 1.8047830923248054, + "grad_norm": 7.9765625, + "learning_rate": 8.195216907675195e-06, + "loss": 2.727, + "mean_token_accuracy": 0.4723446452181578, + "step": 9735 + }, + { + "epoch": 1.8049684835001854, + "grad_norm": 6.91015625, + "learning_rate": 8.195031516499815e-06, + "loss": 2.8107, + "mean_token_accuracy": 0.45479302832244006, + "step": 9736 + }, + { + "epoch": 1.8051538746755655, + "grad_norm": 6.1953125, + "learning_rate": 8.194846125324434e-06, + "loss": 2.7632, + "mean_token_accuracy": 0.46462513199577615, + "step": 9737 + }, + { + "epoch": 1.8053392658509455, + "grad_norm": 16.609375, + "learning_rate": 8.194660734149055e-06, + "loss": 2.5031, + "mean_token_accuracy": 0.5150624540778839, + "step": 9738 + }, + { + "epoch": 1.8055246570263255, + "grad_norm": 5.68359375, + "learning_rate": 8.194475342973675e-06, + "loss": 2.5784, + "mean_token_accuracy": 0.4961229946524064, + "step": 9739 + }, + { + "epoch": 1.8057100482017057, + "grad_norm": 5.0859375, + "learning_rate": 8.194289951798296e-06, + "loss": 2.6473, + "mean_token_accuracy": 0.48227894036530783, + "step": 9740 + }, + { + "epoch": 1.8058954393770856, + "grad_norm": 5.58984375, + "learning_rate": 8.194104560622914e-06, + "loss": 2.9946, + "mean_token_accuracy": 0.45447750037374796, + "step": 9741 + }, + { + "epoch": 1.8060808305524656, + "grad_norm": 6.65234375, + "learning_rate": 8.193919169447535e-06, + "loss": 2.7202, + "mean_token_accuracy": 0.530852224512062, + "step": 9742 + }, + { + "epoch": 1.8062662217278458, + "grad_norm": 5.60546875, + "learning_rate": 8.193733778272155e-06, + "loss": 2.9559, + "mean_token_accuracy": 0.4725196288365453, + "step": 9743 + }, + { + "epoch": 1.8064516129032258, + "grad_norm": 5.71875, + "learning_rate": 8.193548387096774e-06, + "loss": 2.3046, + "mean_token_accuracy": 0.51862689926843, + "step": 9744 + }, + { + "epoch": 1.8066370040786057, + "grad_norm": 5.953125, + "learning_rate": 8.193362995921395e-06, + "loss": 2.8691, + "mean_token_accuracy": 0.4575927472265299, + "step": 9745 + }, + { + "epoch": 1.806822395253986, + "grad_norm": 5.77734375, + "learning_rate": 8.193177604746014e-06, + "loss": 3.112, + "mean_token_accuracy": 0.47305924412665984, + "step": 9746 + }, + { + "epoch": 1.807007786429366, + "grad_norm": 6.3671875, + "learning_rate": 8.192992213570636e-06, + "loss": 3.0639, + "mean_token_accuracy": 0.4385299503008109, + "step": 9747 + }, + { + "epoch": 1.8071931776047458, + "grad_norm": 6.15234375, + "learning_rate": 8.192806822395254e-06, + "loss": 2.7636, + "mean_token_accuracy": 0.49243172096084237, + "step": 9748 + }, + { + "epoch": 1.807378568780126, + "grad_norm": 8.84375, + "learning_rate": 8.192621431219875e-06, + "loss": 2.265, + "mean_token_accuracy": 0.5377920293174531, + "step": 9749 + }, + { + "epoch": 1.8075639599555062, + "grad_norm": 6.265625, + "learning_rate": 8.192436040044495e-06, + "loss": 2.3752, + "mean_token_accuracy": 0.5179594689028651, + "step": 9750 + }, + { + "epoch": 1.8077493511308862, + "grad_norm": 5.62890625, + "learning_rate": 8.192250648869114e-06, + "loss": 2.874, + "mean_token_accuracy": 0.4565040650406504, + "step": 9751 + }, + { + "epoch": 1.8079347423062662, + "grad_norm": 6.25390625, + "learning_rate": 8.192065257693735e-06, + "loss": 2.7652, + "mean_token_accuracy": 0.4917075759586042, + "step": 9752 + }, + { + "epoch": 1.8081201334816464, + "grad_norm": 6.828125, + "learning_rate": 8.191879866518354e-06, + "loss": 2.113, + "mean_token_accuracy": 0.5598546387345019, + "step": 9753 + }, + { + "epoch": 1.8083055246570263, + "grad_norm": 10.515625, + "learning_rate": 8.191694475342974e-06, + "loss": 3.0867, + "mean_token_accuracy": 0.4830652543481104, + "step": 9754 + }, + { + "epoch": 1.8084909158324063, + "grad_norm": 8.4765625, + "learning_rate": 8.191509084167595e-06, + "loss": 2.483, + "mean_token_accuracy": 0.4932237600922722, + "step": 9755 + }, + { + "epoch": 1.8086763070077865, + "grad_norm": 6.796875, + "learning_rate": 8.191323692992215e-06, + "loss": 3.2837, + "mean_token_accuracy": 0.4286278121299829, + "step": 9756 + }, + { + "epoch": 1.8088616981831664, + "grad_norm": 7.51953125, + "learning_rate": 8.191138301816834e-06, + "loss": 2.9066, + "mean_token_accuracy": 0.4557165861513688, + "step": 9757 + }, + { + "epoch": 1.8090470893585464, + "grad_norm": 7.5, + "learning_rate": 8.190952910641454e-06, + "loss": 2.9211, + "mean_token_accuracy": 0.4515810276679842, + "step": 9758 + }, + { + "epoch": 1.8092324805339266, + "grad_norm": 6.078125, + "learning_rate": 8.190767519466075e-06, + "loss": 3.6522, + "mean_token_accuracy": 0.4365850123206262, + "step": 9759 + }, + { + "epoch": 1.8094178717093068, + "grad_norm": 5.734375, + "learning_rate": 8.190582128290694e-06, + "loss": 2.6315, + "mean_token_accuracy": 0.4782507015902713, + "step": 9760 + }, + { + "epoch": 1.8096032628846865, + "grad_norm": 8.7265625, + "learning_rate": 8.190396737115314e-06, + "loss": 3.0339, + "mean_token_accuracy": 0.46263858093126387, + "step": 9761 + }, + { + "epoch": 1.8097886540600667, + "grad_norm": 6.03125, + "learning_rate": 8.190211345939933e-06, + "loss": 3.0578, + "mean_token_accuracy": 0.4490238611713666, + "step": 9762 + }, + { + "epoch": 1.809974045235447, + "grad_norm": 6.17578125, + "learning_rate": 8.190025954764555e-06, + "loss": 2.5821, + "mean_token_accuracy": 0.5032177703965124, + "step": 9763 + }, + { + "epoch": 1.8101594364108269, + "grad_norm": 7.890625, + "learning_rate": 8.189840563589174e-06, + "loss": 2.7666, + "mean_token_accuracy": 0.47734420500873614, + "step": 9764 + }, + { + "epoch": 1.8103448275862069, + "grad_norm": 8.28125, + "learning_rate": 8.189655172413794e-06, + "loss": 2.8335, + "mean_token_accuracy": 0.4757357545397621, + "step": 9765 + }, + { + "epoch": 1.810530218761587, + "grad_norm": 5.890625, + "learning_rate": 8.189469781238413e-06, + "loss": 2.9625, + "mean_token_accuracy": 0.46898620275944813, + "step": 9766 + }, + { + "epoch": 1.810715609936967, + "grad_norm": 10.0546875, + "learning_rate": 8.189284390063034e-06, + "loss": 2.2506, + "mean_token_accuracy": 0.5117647058823529, + "step": 9767 + }, + { + "epoch": 1.810901001112347, + "grad_norm": 8.015625, + "learning_rate": 8.189098998887654e-06, + "loss": 3.1272, + "mean_token_accuracy": 0.4377541650268923, + "step": 9768 + }, + { + "epoch": 1.8110863922877272, + "grad_norm": 5.75390625, + "learning_rate": 8.188913607712273e-06, + "loss": 2.5252, + "mean_token_accuracy": 0.49521503516660903, + "step": 9769 + }, + { + "epoch": 1.8112717834631071, + "grad_norm": 6.03125, + "learning_rate": 8.188728216536893e-06, + "loss": 3.3686, + "mean_token_accuracy": 0.4259910860896083, + "step": 9770 + }, + { + "epoch": 1.811457174638487, + "grad_norm": 5.75, + "learning_rate": 8.188542825361514e-06, + "loss": 3.2681, + "mean_token_accuracy": 0.4480208786428882, + "step": 9771 + }, + { + "epoch": 1.8116425658138673, + "grad_norm": 8.6953125, + "learning_rate": 8.188357434186134e-06, + "loss": 2.7263, + "mean_token_accuracy": 0.48052115583075333, + "step": 9772 + }, + { + "epoch": 1.8118279569892473, + "grad_norm": 6.87890625, + "learning_rate": 8.188172043010753e-06, + "loss": 2.9371, + "mean_token_accuracy": 0.4658648744251857, + "step": 9773 + }, + { + "epoch": 1.8120133481646272, + "grad_norm": 5.88671875, + "learning_rate": 8.187986651835374e-06, + "loss": 2.6383, + "mean_token_accuracy": 0.48497613030047737, + "step": 9774 + }, + { + "epoch": 1.8121987393400074, + "grad_norm": 8.5234375, + "learning_rate": 8.187801260659993e-06, + "loss": 2.6084, + "mean_token_accuracy": 0.491070110701107, + "step": 9775 + }, + { + "epoch": 1.8123841305153876, + "grad_norm": 10.8671875, + "learning_rate": 8.187615869484613e-06, + "loss": 3.2205, + "mean_token_accuracy": 0.4423391494002181, + "step": 9776 + }, + { + "epoch": 1.8125695216907676, + "grad_norm": 6.08203125, + "learning_rate": 8.187430478309233e-06, + "loss": 3.0314, + "mean_token_accuracy": 0.443636925931653, + "step": 9777 + }, + { + "epoch": 1.8127549128661475, + "grad_norm": 8.25, + "learning_rate": 8.187245087133852e-06, + "loss": 3.0045, + "mean_token_accuracy": 0.4608355091383812, + "step": 9778 + }, + { + "epoch": 1.8129403040415277, + "grad_norm": 9.46875, + "learning_rate": 8.187059695958473e-06, + "loss": 2.4985, + "mean_token_accuracy": 0.5064224282363453, + "step": 9779 + }, + { + "epoch": 1.8131256952169077, + "grad_norm": 6.80078125, + "learning_rate": 8.186874304783093e-06, + "loss": 3.3964, + "mean_token_accuracy": 0.4038800705467372, + "step": 9780 + }, + { + "epoch": 1.8133110863922877, + "grad_norm": 5.40625, + "learning_rate": 8.186688913607714e-06, + "loss": 2.6853, + "mean_token_accuracy": 0.48298959985898116, + "step": 9781 + }, + { + "epoch": 1.8134964775676679, + "grad_norm": 8.3203125, + "learning_rate": 8.186503522432333e-06, + "loss": 2.9409, + "mean_token_accuracy": 0.45241417806307566, + "step": 9782 + }, + { + "epoch": 1.8136818687430478, + "grad_norm": 8.4375, + "learning_rate": 8.186318131256953e-06, + "loss": 2.323, + "mean_token_accuracy": 0.5307000886188125, + "step": 9783 + }, + { + "epoch": 1.8138672599184278, + "grad_norm": 4.9921875, + "learning_rate": 8.186132740081572e-06, + "loss": 2.9498, + "mean_token_accuracy": 0.4759418653873289, + "step": 9784 + }, + { + "epoch": 1.814052651093808, + "grad_norm": 7.09375, + "learning_rate": 8.185947348906192e-06, + "loss": 2.3819, + "mean_token_accuracy": 0.5059413027916965, + "step": 9785 + }, + { + "epoch": 1.814238042269188, + "grad_norm": 7.8203125, + "learning_rate": 8.185761957730813e-06, + "loss": 2.5279, + "mean_token_accuracy": 0.5114387391967463, + "step": 9786 + }, + { + "epoch": 1.814423433444568, + "grad_norm": 4.609375, + "learning_rate": 8.185576566555432e-06, + "loss": 2.9901, + "mean_token_accuracy": 0.4519632414369256, + "step": 9787 + }, + { + "epoch": 1.814608824619948, + "grad_norm": 5.671875, + "learning_rate": 8.185391175380054e-06, + "loss": 3.4827, + "mean_token_accuracy": 0.43407159412825136, + "step": 9788 + }, + { + "epoch": 1.8147942157953283, + "grad_norm": 6.828125, + "learning_rate": 8.185205784204673e-06, + "loss": 2.583, + "mean_token_accuracy": 0.48542349491116094, + "step": 9789 + }, + { + "epoch": 1.814979606970708, + "grad_norm": 5.359375, + "learning_rate": 8.185020393029293e-06, + "loss": 2.9935, + "mean_token_accuracy": 0.455950991831972, + "step": 9790 + }, + { + "epoch": 1.8151649981460882, + "grad_norm": 6.98828125, + "learning_rate": 8.184835001853912e-06, + "loss": 2.8281, + "mean_token_accuracy": 0.5339027595269382, + "step": 9791 + }, + { + "epoch": 1.8153503893214684, + "grad_norm": 6.41015625, + "learning_rate": 8.184649610678532e-06, + "loss": 3.0733, + "mean_token_accuracy": 0.452537865649208, + "step": 9792 + }, + { + "epoch": 1.8155357804968484, + "grad_norm": 5.82421875, + "learning_rate": 8.184464219503153e-06, + "loss": 3.1223, + "mean_token_accuracy": 0.45197670095426945, + "step": 9793 + }, + { + "epoch": 1.8157211716722284, + "grad_norm": 5.64453125, + "learning_rate": 8.184278828327772e-06, + "loss": 3.5586, + "mean_token_accuracy": 0.42010217417131995, + "step": 9794 + }, + { + "epoch": 1.8159065628476085, + "grad_norm": 5.44921875, + "learning_rate": 8.184093437152392e-06, + "loss": 2.2773, + "mean_token_accuracy": 0.5234433408095696, + "step": 9795 + }, + { + "epoch": 1.8160919540229885, + "grad_norm": 6.65625, + "learning_rate": 8.183908045977013e-06, + "loss": 3.0704, + "mean_token_accuracy": 0.4691806564770734, + "step": 9796 + }, + { + "epoch": 1.8162773451983685, + "grad_norm": 6.20703125, + "learning_rate": 8.183722654801633e-06, + "loss": 2.6987, + "mean_token_accuracy": 0.4841087056655919, + "step": 9797 + }, + { + "epoch": 1.8164627363737487, + "grad_norm": 6.03125, + "learning_rate": 8.183537263626252e-06, + "loss": 3.2751, + "mean_token_accuracy": 0.44124732334047106, + "step": 9798 + }, + { + "epoch": 1.8166481275491286, + "grad_norm": 6.08203125, + "learning_rate": 8.183351872450872e-06, + "loss": 3.1651, + "mean_token_accuracy": 0.4688142563399589, + "step": 9799 + }, + { + "epoch": 1.8168335187245086, + "grad_norm": 5.28515625, + "learning_rate": 8.183166481275491e-06, + "loss": 2.075, + "mean_token_accuracy": 0.5517497034400949, + "step": 9800 + }, + { + "epoch": 1.8170189098998888, + "grad_norm": 5.9765625, + "learning_rate": 8.182981090100112e-06, + "loss": 2.9874, + "mean_token_accuracy": 0.45034224460803307, + "step": 9801 + }, + { + "epoch": 1.817204301075269, + "grad_norm": 5.50390625, + "learning_rate": 8.182795698924732e-06, + "loss": 3.0302, + "mean_token_accuracy": 0.46223129578479644, + "step": 9802 + }, + { + "epoch": 1.8173896922506487, + "grad_norm": 6.05859375, + "learning_rate": 8.182610307749351e-06, + "loss": 3.0606, + "mean_token_accuracy": 0.4563717778046714, + "step": 9803 + }, + { + "epoch": 1.817575083426029, + "grad_norm": 5.08984375, + "learning_rate": 8.182424916573972e-06, + "loss": 2.6675, + "mean_token_accuracy": 0.5217276099629041, + "step": 9804 + }, + { + "epoch": 1.817760474601409, + "grad_norm": 6.1796875, + "learning_rate": 8.182239525398592e-06, + "loss": 3.0867, + "mean_token_accuracy": 0.4517724649629019, + "step": 9805 + }, + { + "epoch": 1.817945865776789, + "grad_norm": 6.0546875, + "learning_rate": 8.182054134223212e-06, + "loss": 2.9183, + "mean_token_accuracy": 0.4608718837488335, + "step": 9806 + }, + { + "epoch": 1.818131256952169, + "grad_norm": 5.82421875, + "learning_rate": 8.181868743047831e-06, + "loss": 2.7907, + "mean_token_accuracy": 0.5035245335176227, + "step": 9807 + }, + { + "epoch": 1.8183166481275492, + "grad_norm": 7.70703125, + "learning_rate": 8.181683351872452e-06, + "loss": 3.377, + "mean_token_accuracy": 0.43843416370106764, + "step": 9808 + }, + { + "epoch": 1.8185020393029292, + "grad_norm": 7.59375, + "learning_rate": 8.18149796069707e-06, + "loss": 2.7963, + "mean_token_accuracy": 0.46919967663702505, + "step": 9809 + }, + { + "epoch": 1.8186874304783092, + "grad_norm": 8.109375, + "learning_rate": 8.181312569521691e-06, + "loss": 2.2368, + "mean_token_accuracy": 0.5364663585002568, + "step": 9810 + }, + { + "epoch": 1.8188728216536894, + "grad_norm": 5.9921875, + "learning_rate": 8.181127178346312e-06, + "loss": 2.4569, + "mean_token_accuracy": 0.5049197307094769, + "step": 9811 + }, + { + "epoch": 1.8190582128290693, + "grad_norm": 5.25390625, + "learning_rate": 8.180941787170932e-06, + "loss": 2.1116, + "mean_token_accuracy": 0.5403482018045495, + "step": 9812 + }, + { + "epoch": 1.8192436040044493, + "grad_norm": 5.875, + "learning_rate": 8.180756395995551e-06, + "loss": 2.9597, + "mean_token_accuracy": 0.45045857765749514, + "step": 9813 + }, + { + "epoch": 1.8194289951798295, + "grad_norm": 6.91015625, + "learning_rate": 8.180571004820171e-06, + "loss": 2.0494, + "mean_token_accuracy": 0.5486033519553073, + "step": 9814 + }, + { + "epoch": 1.8196143863552094, + "grad_norm": 6.56640625, + "learning_rate": 8.180385613644792e-06, + "loss": 3.0762, + "mean_token_accuracy": 0.4500611995104039, + "step": 9815 + }, + { + "epoch": 1.8197997775305894, + "grad_norm": 6.078125, + "learning_rate": 8.18020022246941e-06, + "loss": 2.7934, + "mean_token_accuracy": 0.47223734349482854, + "step": 9816 + }, + { + "epoch": 1.8199851687059696, + "grad_norm": 6.08984375, + "learning_rate": 8.180014831294031e-06, + "loss": 3.1129, + "mean_token_accuracy": 0.4422683923705722, + "step": 9817 + }, + { + "epoch": 1.8201705598813498, + "grad_norm": 5.31640625, + "learning_rate": 8.17982944011865e-06, + "loss": 3.3861, + "mean_token_accuracy": 0.39957466918714557, + "step": 9818 + }, + { + "epoch": 1.8203559510567295, + "grad_norm": 5.62109375, + "learning_rate": 8.17964404894327e-06, + "loss": 2.6603, + "mean_token_accuracy": 0.4887010242237035, + "step": 9819 + }, + { + "epoch": 1.8205413422321097, + "grad_norm": 5.58984375, + "learning_rate": 8.179458657767891e-06, + "loss": 2.6692, + "mean_token_accuracy": 0.4914947520810713, + "step": 9820 + }, + { + "epoch": 1.82072673340749, + "grad_norm": 7.00390625, + "learning_rate": 8.179273266592511e-06, + "loss": 2.4118, + "mean_token_accuracy": 0.5110850286906625, + "step": 9821 + }, + { + "epoch": 1.8209121245828699, + "grad_norm": 8.109375, + "learning_rate": 8.17908787541713e-06, + "loss": 2.8918, + "mean_token_accuracy": 0.4678201599161096, + "step": 9822 + }, + { + "epoch": 1.8210975157582499, + "grad_norm": 8.28125, + "learning_rate": 8.17890248424175e-06, + "loss": 2.5796, + "mean_token_accuracy": 0.48856664807585054, + "step": 9823 + }, + { + "epoch": 1.82128290693363, + "grad_norm": 6.7890625, + "learning_rate": 8.178717093066371e-06, + "loss": 2.877, + "mean_token_accuracy": 0.47789790611742167, + "step": 9824 + }, + { + "epoch": 1.82146829810901, + "grad_norm": 6.28125, + "learning_rate": 8.17853170189099e-06, + "loss": 3.0604, + "mean_token_accuracy": 0.4393972483074907, + "step": 9825 + }, + { + "epoch": 1.82165368928439, + "grad_norm": 6.5703125, + "learning_rate": 8.17834631071561e-06, + "loss": 2.4296, + "mean_token_accuracy": 0.48900714185688277, + "step": 9826 + }, + { + "epoch": 1.8218390804597702, + "grad_norm": 5.2578125, + "learning_rate": 8.17816091954023e-06, + "loss": 2.6679, + "mean_token_accuracy": 0.5003408316291752, + "step": 9827 + }, + { + "epoch": 1.8220244716351501, + "grad_norm": 7.54296875, + "learning_rate": 8.177975528364851e-06, + "loss": 3.0166, + "mean_token_accuracy": 0.44657050338534, + "step": 9828 + }, + { + "epoch": 1.82220986281053, + "grad_norm": 5.9296875, + "learning_rate": 8.17779013718947e-06, + "loss": 3.3837, + "mean_token_accuracy": 0.4261609259517501, + "step": 9829 + }, + { + "epoch": 1.8223952539859103, + "grad_norm": 5.69140625, + "learning_rate": 8.17760474601409e-06, + "loss": 3.2641, + "mean_token_accuracy": 0.4339457567804024, + "step": 9830 + }, + { + "epoch": 1.8225806451612905, + "grad_norm": 6.73828125, + "learning_rate": 8.177419354838711e-06, + "loss": 2.8617, + "mean_token_accuracy": 0.45494755465689374, + "step": 9831 + }, + { + "epoch": 1.8227660363366702, + "grad_norm": 5.5546875, + "learning_rate": 8.17723396366333e-06, + "loss": 3.3221, + "mean_token_accuracy": 0.4438561930558976, + "step": 9832 + }, + { + "epoch": 1.8229514275120504, + "grad_norm": 5.91796875, + "learning_rate": 8.17704857248795e-06, + "loss": 3.6643, + "mean_token_accuracy": 0.4020573108008817, + "step": 9833 + }, + { + "epoch": 1.8231368186874306, + "grad_norm": 7.65234375, + "learning_rate": 8.17686318131257e-06, + "loss": 2.4775, + "mean_token_accuracy": 0.5040042712226375, + "step": 9834 + }, + { + "epoch": 1.8233222098628106, + "grad_norm": 5.91796875, + "learning_rate": 8.17667779013719e-06, + "loss": 2.8397, + "mean_token_accuracy": 0.46711769973137, + "step": 9835 + }, + { + "epoch": 1.8235076010381905, + "grad_norm": 8.0859375, + "learning_rate": 8.17649239896181e-06, + "loss": 2.5356, + "mean_token_accuracy": 0.48488252363568285, + "step": 9836 + }, + { + "epoch": 1.8236929922135707, + "grad_norm": 6.9375, + "learning_rate": 8.17630700778643e-06, + "loss": 3.7763, + "mean_token_accuracy": 0.41598842466992225, + "step": 9837 + }, + { + "epoch": 1.8238783833889507, + "grad_norm": 6.45703125, + "learning_rate": 8.17612161661105e-06, + "loss": 2.8534, + "mean_token_accuracy": 0.48314432188907874, + "step": 9838 + }, + { + "epoch": 1.8240637745643307, + "grad_norm": 6.34765625, + "learning_rate": 8.17593622543567e-06, + "loss": 3.2312, + "mean_token_accuracy": 0.42416596579758903, + "step": 9839 + }, + { + "epoch": 1.8242491657397109, + "grad_norm": 6.17578125, + "learning_rate": 8.17575083426029e-06, + "loss": 2.8948, + "mean_token_accuracy": 0.46728221597751907, + "step": 9840 + }, + { + "epoch": 1.8244345569150908, + "grad_norm": 6.21484375, + "learning_rate": 8.17556544308491e-06, + "loss": 2.8339, + "mean_token_accuracy": 0.4582139446036294, + "step": 9841 + }, + { + "epoch": 1.8246199480904708, + "grad_norm": 5.92578125, + "learning_rate": 8.17538005190953e-06, + "loss": 3.296, + "mean_token_accuracy": 0.43596189468113256, + "step": 9842 + }, + { + "epoch": 1.824805339265851, + "grad_norm": 7.359375, + "learning_rate": 8.175194660734149e-06, + "loss": 2.0158, + "mean_token_accuracy": 0.5690138027605521, + "step": 9843 + }, + { + "epoch": 1.824990730441231, + "grad_norm": 8.2421875, + "learning_rate": 8.17500926955877e-06, + "loss": 2.5757, + "mean_token_accuracy": 0.4889845530514054, + "step": 9844 + }, + { + "epoch": 1.825176121616611, + "grad_norm": 7.71875, + "learning_rate": 8.17482387838339e-06, + "loss": 2.8355, + "mean_token_accuracy": 0.4741042345276873, + "step": 9845 + }, + { + "epoch": 1.825361512791991, + "grad_norm": 6.4140625, + "learning_rate": 8.17463848720801e-06, + "loss": 3.1864, + "mean_token_accuracy": 0.4386013597890939, + "step": 9846 + }, + { + "epoch": 1.8255469039673713, + "grad_norm": 8.375, + "learning_rate": 8.174453096032629e-06, + "loss": 2.5643, + "mean_token_accuracy": 0.496025198740063, + "step": 9847 + }, + { + "epoch": 1.8257322951427513, + "grad_norm": 7.75390625, + "learning_rate": 8.17426770485725e-06, + "loss": 3.2, + "mean_token_accuracy": 0.4153522607781283, + "step": 9848 + }, + { + "epoch": 1.8259176863181312, + "grad_norm": 6.97265625, + "learning_rate": 8.17408231368187e-06, + "loss": 3.3362, + "mean_token_accuracy": 0.4476205434270457, + "step": 9849 + }, + { + "epoch": 1.8261030774935114, + "grad_norm": 6.3984375, + "learning_rate": 8.173896922506489e-06, + "loss": 3.1812, + "mean_token_accuracy": 0.4113464447806354, + "step": 9850 + }, + { + "epoch": 1.8262884686688914, + "grad_norm": 8.0703125, + "learning_rate": 8.17371153133111e-06, + "loss": 3.4171, + "mean_token_accuracy": 0.4261029411764706, + "step": 9851 + }, + { + "epoch": 1.8264738598442714, + "grad_norm": 9.2578125, + "learning_rate": 8.17352614015573e-06, + "loss": 3.2803, + "mean_token_accuracy": 0.45393871028388794, + "step": 9852 + }, + { + "epoch": 1.8266592510196515, + "grad_norm": 7.2734375, + "learning_rate": 8.17334074898035e-06, + "loss": 2.8363, + "mean_token_accuracy": 0.46935180836073276, + "step": 9853 + }, + { + "epoch": 1.8268446421950315, + "grad_norm": 7.7109375, + "learning_rate": 8.173155357804969e-06, + "loss": 2.9806, + "mean_token_accuracy": 0.46115317414094353, + "step": 9854 + }, + { + "epoch": 1.8270300333704115, + "grad_norm": 10.5546875, + "learning_rate": 8.17296996662959e-06, + "loss": 2.6667, + "mean_token_accuracy": 0.5090268604139145, + "step": 9855 + }, + { + "epoch": 1.8272154245457917, + "grad_norm": 6.67578125, + "learning_rate": 8.172784575454208e-06, + "loss": 3.0599, + "mean_token_accuracy": 0.45188536953242836, + "step": 9856 + }, + { + "epoch": 1.8274008157211716, + "grad_norm": 9.03125, + "learning_rate": 8.172599184278829e-06, + "loss": 2.1617, + "mean_token_accuracy": 0.5504184934236748, + "step": 9857 + }, + { + "epoch": 1.8275862068965516, + "grad_norm": 8.625, + "learning_rate": 8.17241379310345e-06, + "loss": 3.0517, + "mean_token_accuracy": 0.4832857382832185, + "step": 9858 + }, + { + "epoch": 1.8277715980719318, + "grad_norm": 5.453125, + "learning_rate": 8.172228401928068e-06, + "loss": 2.856, + "mean_token_accuracy": 0.47587392550143265, + "step": 9859 + }, + { + "epoch": 1.827956989247312, + "grad_norm": 5.8515625, + "learning_rate": 8.172043010752689e-06, + "loss": 2.6661, + "mean_token_accuracy": 0.47032229838121625, + "step": 9860 + }, + { + "epoch": 1.8281423804226917, + "grad_norm": 10.1875, + "learning_rate": 8.171857619577309e-06, + "loss": 2.2526, + "mean_token_accuracy": 0.5315723840834372, + "step": 9861 + }, + { + "epoch": 1.828327771598072, + "grad_norm": 7.11328125, + "learning_rate": 8.17167222840193e-06, + "loss": 2.9516, + "mean_token_accuracy": 0.4582560296846011, + "step": 9862 + }, + { + "epoch": 1.828513162773452, + "grad_norm": 6.26953125, + "learning_rate": 8.171486837226548e-06, + "loss": 2.347, + "mean_token_accuracy": 0.5147873058744091, + "step": 9863 + }, + { + "epoch": 1.828698553948832, + "grad_norm": 7.96875, + "learning_rate": 8.171301446051169e-06, + "loss": 2.9877, + "mean_token_accuracy": 0.4392204960479695, + "step": 9864 + }, + { + "epoch": 1.828883945124212, + "grad_norm": 7.3984375, + "learning_rate": 8.171116054875788e-06, + "loss": 3.3785, + "mean_token_accuracy": 0.45092262366258334, + "step": 9865 + }, + { + "epoch": 1.8290693362995922, + "grad_norm": 5.5546875, + "learning_rate": 8.170930663700408e-06, + "loss": 3.3754, + "mean_token_accuracy": 0.4195064629847238, + "step": 9866 + }, + { + "epoch": 1.8292547274749722, + "grad_norm": 5.67578125, + "learning_rate": 8.170745272525029e-06, + "loss": 2.5538, + "mean_token_accuracy": 0.5019165727170237, + "step": 9867 + }, + { + "epoch": 1.8294401186503522, + "grad_norm": 6.2578125, + "learning_rate": 8.170559881349649e-06, + "loss": 3.2508, + "mean_token_accuracy": 0.44874164652353193, + "step": 9868 + }, + { + "epoch": 1.8296255098257324, + "grad_norm": 6.1796875, + "learning_rate": 8.17037449017427e-06, + "loss": 3.2008, + "mean_token_accuracy": 0.44672607516466484, + "step": 9869 + }, + { + "epoch": 1.8298109010011123, + "grad_norm": 8.0390625, + "learning_rate": 8.170189098998888e-06, + "loss": 3.1967, + "mean_token_accuracy": 0.43868548742831937, + "step": 9870 + }, + { + "epoch": 1.8299962921764923, + "grad_norm": 6.1015625, + "learning_rate": 8.170003707823509e-06, + "loss": 2.8199, + "mean_token_accuracy": 0.4828819068255688, + "step": 9871 + }, + { + "epoch": 1.8301816833518725, + "grad_norm": 7.12109375, + "learning_rate": 8.169818316648128e-06, + "loss": 2.9534, + "mean_token_accuracy": 0.46561147802322933, + "step": 9872 + }, + { + "epoch": 1.8303670745272527, + "grad_norm": 5.73046875, + "learning_rate": 8.169632925472748e-06, + "loss": 2.6069, + "mean_token_accuracy": 0.5093678598629093, + "step": 9873 + }, + { + "epoch": 1.8305524657026324, + "grad_norm": 7.52734375, + "learning_rate": 8.169447534297369e-06, + "loss": 2.6319, + "mean_token_accuracy": 0.49981709547616143, + "step": 9874 + }, + { + "epoch": 1.8307378568780126, + "grad_norm": 6.234375, + "learning_rate": 8.169262143121987e-06, + "loss": 3.2729, + "mean_token_accuracy": 0.4259123552689433, + "step": 9875 + }, + { + "epoch": 1.8309232480533928, + "grad_norm": 6.40234375, + "learning_rate": 8.169076751946608e-06, + "loss": 2.8166, + "mean_token_accuracy": 0.4746349913387775, + "step": 9876 + }, + { + "epoch": 1.8311086392287728, + "grad_norm": 6.6953125, + "learning_rate": 8.168891360771228e-06, + "loss": 2.5001, + "mean_token_accuracy": 0.5214254797287549, + "step": 9877 + }, + { + "epoch": 1.8312940304041527, + "grad_norm": 6.171875, + "learning_rate": 8.168705969595849e-06, + "loss": 3.8367, + "mean_token_accuracy": 0.3778600714191245, + "step": 9878 + }, + { + "epoch": 1.831479421579533, + "grad_norm": 5.65625, + "learning_rate": 8.168520578420468e-06, + "loss": 3.215, + "mean_token_accuracy": 0.44952089987501737, + "step": 9879 + }, + { + "epoch": 1.8316648127549129, + "grad_norm": 10.25, + "learning_rate": 8.168335187245088e-06, + "loss": 2.0783, + "mean_token_accuracy": 0.5462147230103032, + "step": 9880 + }, + { + "epoch": 1.8318502039302929, + "grad_norm": 10.8125, + "learning_rate": 8.168149796069707e-06, + "loss": 2.4704, + "mean_token_accuracy": 0.5054429996976111, + "step": 9881 + }, + { + "epoch": 1.832035595105673, + "grad_norm": 8.265625, + "learning_rate": 8.167964404894327e-06, + "loss": 2.9189, + "mean_token_accuracy": 0.44876946258161726, + "step": 9882 + }, + { + "epoch": 1.832220986281053, + "grad_norm": 7.171875, + "learning_rate": 8.167779013718948e-06, + "loss": 2.9213, + "mean_token_accuracy": 0.4702416028285209, + "step": 9883 + }, + { + "epoch": 1.832406377456433, + "grad_norm": 9.6484375, + "learning_rate": 8.167593622543568e-06, + "loss": 2.562, + "mean_token_accuracy": 0.4769986601161233, + "step": 9884 + }, + { + "epoch": 1.8325917686318132, + "grad_norm": 5.140625, + "learning_rate": 8.167408231368187e-06, + "loss": 2.7043, + "mean_token_accuracy": 0.48070460076486266, + "step": 9885 + }, + { + "epoch": 1.8327771598071931, + "grad_norm": 5.62890625, + "learning_rate": 8.167222840192808e-06, + "loss": 2.9202, + "mean_token_accuracy": 0.4669733427695211, + "step": 9886 + }, + { + "epoch": 1.832962550982573, + "grad_norm": 6.19140625, + "learning_rate": 8.167037449017428e-06, + "loss": 2.6217, + "mean_token_accuracy": 0.47896484915582616, + "step": 9887 + }, + { + "epoch": 1.8331479421579533, + "grad_norm": 6.24609375, + "learning_rate": 8.166852057842047e-06, + "loss": 2.7821, + "mean_token_accuracy": 0.4508919623170976, + "step": 9888 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 5.5625, + "learning_rate": 8.166666666666668e-06, + "loss": 2.7016, + "mean_token_accuracy": 0.47558284562985703, + "step": 9889 + }, + { + "epoch": 1.8335187245087132, + "grad_norm": 6.60546875, + "learning_rate": 8.166481275491286e-06, + "loss": 2.9647, + "mean_token_accuracy": 0.45293618825524695, + "step": 9890 + }, + { + "epoch": 1.8337041156840934, + "grad_norm": 5.0234375, + "learning_rate": 8.166295884315907e-06, + "loss": 2.4817, + "mean_token_accuracy": 0.49432700346351366, + "step": 9891 + }, + { + "epoch": 1.8338895068594736, + "grad_norm": 7.81640625, + "learning_rate": 8.166110493140527e-06, + "loss": 3.2523, + "mean_token_accuracy": 0.42291438409766785, + "step": 9892 + }, + { + "epoch": 1.8340748980348536, + "grad_norm": 4.9765625, + "learning_rate": 8.165925101965148e-06, + "loss": 2.9318, + "mean_token_accuracy": 0.43706777316735823, + "step": 9893 + }, + { + "epoch": 1.8342602892102335, + "grad_norm": 5.52734375, + "learning_rate": 8.165739710789767e-06, + "loss": 2.9282, + "mean_token_accuracy": 0.45150794643974473, + "step": 9894 + }, + { + "epoch": 1.8344456803856137, + "grad_norm": 6.20703125, + "learning_rate": 8.165554319614387e-06, + "loss": 3.4705, + "mean_token_accuracy": 0.4322184138990663, + "step": 9895 + }, + { + "epoch": 1.8346310715609937, + "grad_norm": 8.1015625, + "learning_rate": 8.165368928439008e-06, + "loss": 2.375, + "mean_token_accuracy": 0.508030303030303, + "step": 9896 + }, + { + "epoch": 1.8348164627363737, + "grad_norm": 6.8828125, + "learning_rate": 8.165183537263626e-06, + "loss": 2.878, + "mean_token_accuracy": 0.44756671899529044, + "step": 9897 + }, + { + "epoch": 1.8350018539117539, + "grad_norm": 5.48828125, + "learning_rate": 8.164998146088247e-06, + "loss": 3.0532, + "mean_token_accuracy": 0.4302161954714085, + "step": 9898 + }, + { + "epoch": 1.8351872450871338, + "grad_norm": 5.96484375, + "learning_rate": 8.164812754912866e-06, + "loss": 2.7995, + "mean_token_accuracy": 0.4733405875952122, + "step": 9899 + }, + { + "epoch": 1.8353726362625138, + "grad_norm": 6.57421875, + "learning_rate": 8.164627363737486e-06, + "loss": 2.905, + "mean_token_accuracy": 0.47527472527472525, + "step": 9900 + }, + { + "epoch": 1.835558027437894, + "grad_norm": 9.0859375, + "learning_rate": 8.164441972562107e-06, + "loss": 2.8332, + "mean_token_accuracy": 0.4676726511730367, + "step": 9901 + }, + { + "epoch": 1.8357434186132742, + "grad_norm": 5.31640625, + "learning_rate": 8.164256581386727e-06, + "loss": 2.7991, + "mean_token_accuracy": 0.49108402822322, + "step": 9902 + }, + { + "epoch": 1.835928809788654, + "grad_norm": 6.9921875, + "learning_rate": 8.164071190211346e-06, + "loss": 2.4431, + "mean_token_accuracy": 0.5142450142450142, + "step": 9903 + }, + { + "epoch": 1.836114200964034, + "grad_norm": 9.8515625, + "learning_rate": 8.163885799035966e-06, + "loss": 1.9152, + "mean_token_accuracy": 0.5636960087479497, + "step": 9904 + }, + { + "epoch": 1.8362995921394143, + "grad_norm": 5.875, + "learning_rate": 8.163700407860587e-06, + "loss": 3.1739, + "mean_token_accuracy": 0.44848621780388614, + "step": 9905 + }, + { + "epoch": 1.8364849833147943, + "grad_norm": 8.7578125, + "learning_rate": 8.163515016685206e-06, + "loss": 3.2362, + "mean_token_accuracy": 0.4287764153404644, + "step": 9906 + }, + { + "epoch": 1.8366703744901742, + "grad_norm": 8.625, + "learning_rate": 8.163329625509826e-06, + "loss": 3.5704, + "mean_token_accuracy": 0.4211150652431791, + "step": 9907 + }, + { + "epoch": 1.8368557656655544, + "grad_norm": 6.859375, + "learning_rate": 8.163144234334445e-06, + "loss": 2.6843, + "mean_token_accuracy": 0.4796943540399038, + "step": 9908 + }, + { + "epoch": 1.8370411568409344, + "grad_norm": 10.609375, + "learning_rate": 8.162958843159067e-06, + "loss": 2.5302, + "mean_token_accuracy": 0.495017015070491, + "step": 9909 + }, + { + "epoch": 1.8372265480163144, + "grad_norm": 8.0390625, + "learning_rate": 8.162773451983686e-06, + "loss": 2.4259, + "mean_token_accuracy": 0.500880503144654, + "step": 9910 + }, + { + "epoch": 1.8374119391916945, + "grad_norm": 7.1171875, + "learning_rate": 8.162588060808306e-06, + "loss": 2.2755, + "mean_token_accuracy": 0.5408628081457664, + "step": 9911 + }, + { + "epoch": 1.8375973303670745, + "grad_norm": 6.67578125, + "learning_rate": 8.162402669632927e-06, + "loss": 2.8986, + "mean_token_accuracy": 0.48922825197709296, + "step": 9912 + }, + { + "epoch": 1.8377827215424545, + "grad_norm": 10.78125, + "learning_rate": 8.162217278457546e-06, + "loss": 2.7058, + "mean_token_accuracy": 0.46162452450212577, + "step": 9913 + }, + { + "epoch": 1.8379681127178347, + "grad_norm": 6.453125, + "learning_rate": 8.162031887282166e-06, + "loss": 2.7801, + "mean_token_accuracy": 0.4758316747227751, + "step": 9914 + }, + { + "epoch": 1.8381535038932146, + "grad_norm": 6.05078125, + "learning_rate": 8.161846496106785e-06, + "loss": 3.0257, + "mean_token_accuracy": 0.47233718144195164, + "step": 9915 + }, + { + "epoch": 1.8383388950685946, + "grad_norm": 7.8984375, + "learning_rate": 8.161661104931406e-06, + "loss": 2.867, + "mean_token_accuracy": 0.48376201610808, + "step": 9916 + }, + { + "epoch": 1.8385242862439748, + "grad_norm": 6.4609375, + "learning_rate": 8.161475713756026e-06, + "loss": 3.3436, + "mean_token_accuracy": 0.44700139470013944, + "step": 9917 + }, + { + "epoch": 1.838709677419355, + "grad_norm": 9.765625, + "learning_rate": 8.161290322580647e-06, + "loss": 2.5837, + "mean_token_accuracy": 0.48219241443108235, + "step": 9918 + }, + { + "epoch": 1.8388950685947347, + "grad_norm": 7.8203125, + "learning_rate": 8.161104931405265e-06, + "loss": 2.1196, + "mean_token_accuracy": 0.5406290956749672, + "step": 9919 + }, + { + "epoch": 1.839080459770115, + "grad_norm": 5.12109375, + "learning_rate": 8.160919540229886e-06, + "loss": 2.3421, + "mean_token_accuracy": 0.5113786875376279, + "step": 9920 + }, + { + "epoch": 1.839265850945495, + "grad_norm": 7.99609375, + "learning_rate": 8.160734149054506e-06, + "loss": 3.2918, + "mean_token_accuracy": 0.4302621995630007, + "step": 9921 + }, + { + "epoch": 1.839451242120875, + "grad_norm": 5.60546875, + "learning_rate": 8.160548757879125e-06, + "loss": 2.7983, + "mean_token_accuracy": 0.49405656510452245, + "step": 9922 + }, + { + "epoch": 1.839636633296255, + "grad_norm": 5.8984375, + "learning_rate": 8.160363366703746e-06, + "loss": 2.9442, + "mean_token_accuracy": 0.44370782526340713, + "step": 9923 + }, + { + "epoch": 1.8398220244716352, + "grad_norm": 7.53125, + "learning_rate": 8.160177975528364e-06, + "loss": 3.295, + "mean_token_accuracy": 0.44047467143039426, + "step": 9924 + }, + { + "epoch": 1.8400074156470152, + "grad_norm": 7.2265625, + "learning_rate": 8.159992584352987e-06, + "loss": 2.2591, + "mean_token_accuracy": 0.5414471562197614, + "step": 9925 + }, + { + "epoch": 1.8401928068223952, + "grad_norm": 6.75, + "learning_rate": 8.159807193177605e-06, + "loss": 2.8694, + "mean_token_accuracy": 0.46465249582039647, + "step": 9926 + }, + { + "epoch": 1.8403781979977754, + "grad_norm": 10.1328125, + "learning_rate": 8.159621802002226e-06, + "loss": 2.1149, + "mean_token_accuracy": 0.5379284274193549, + "step": 9927 + }, + { + "epoch": 1.8405635891731553, + "grad_norm": 6.55859375, + "learning_rate": 8.159436410826845e-06, + "loss": 2.5451, + "mean_token_accuracy": 0.4833965299322073, + "step": 9928 + }, + { + "epoch": 1.8407489803485353, + "grad_norm": 7.41015625, + "learning_rate": 8.159251019651465e-06, + "loss": 2.9423, + "mean_token_accuracy": 0.49673103238558614, + "step": 9929 + }, + { + "epoch": 1.8409343715239155, + "grad_norm": 6.9296875, + "learning_rate": 8.159065628476086e-06, + "loss": 2.8812, + "mean_token_accuracy": 0.46237878973855406, + "step": 9930 + }, + { + "epoch": 1.8411197626992957, + "grad_norm": 7.16015625, + "learning_rate": 8.158880237300704e-06, + "loss": 2.3804, + "mean_token_accuracy": 0.516580310880829, + "step": 9931 + }, + { + "epoch": 1.8413051538746754, + "grad_norm": 7.76171875, + "learning_rate": 8.158694846125325e-06, + "loss": 3.0806, + "mean_token_accuracy": 0.4563608434576176, + "step": 9932 + }, + { + "epoch": 1.8414905450500556, + "grad_norm": 7.32421875, + "learning_rate": 8.158509454949945e-06, + "loss": 2.784, + "mean_token_accuracy": 0.4812752331894751, + "step": 9933 + }, + { + "epoch": 1.8416759362254358, + "grad_norm": 5.73828125, + "learning_rate": 8.158324063774566e-06, + "loss": 2.9815, + "mean_token_accuracy": 0.4635841644001209, + "step": 9934 + }, + { + "epoch": 1.8418613274008158, + "grad_norm": 6.3046875, + "learning_rate": 8.158138672599185e-06, + "loss": 3.1736, + "mean_token_accuracy": 0.4578762863281692, + "step": 9935 + }, + { + "epoch": 1.8420467185761957, + "grad_norm": 7.2734375, + "learning_rate": 8.157953281423805e-06, + "loss": 2.8627, + "mean_token_accuracy": 0.46904982977406373, + "step": 9936 + }, + { + "epoch": 1.842232109751576, + "grad_norm": 5.5078125, + "learning_rate": 8.157767890248424e-06, + "loss": 3.0206, + "mean_token_accuracy": 0.46620475113122173, + "step": 9937 + }, + { + "epoch": 1.8424175009269559, + "grad_norm": 5.57421875, + "learning_rate": 8.157582499073045e-06, + "loss": 3.1131, + "mean_token_accuracy": 0.44411447084233263, + "step": 9938 + }, + { + "epoch": 1.8426028921023359, + "grad_norm": 10.75, + "learning_rate": 8.157397107897665e-06, + "loss": 3.0653, + "mean_token_accuracy": 0.4419742729306488, + "step": 9939 + }, + { + "epoch": 1.842788283277716, + "grad_norm": 10.375, + "learning_rate": 8.157211716722284e-06, + "loss": 2.3796, + "mean_token_accuracy": 0.5429635541367385, + "step": 9940 + }, + { + "epoch": 1.842973674453096, + "grad_norm": 6.00390625, + "learning_rate": 8.157026325546904e-06, + "loss": 2.7982, + "mean_token_accuracy": 0.48338398196255267, + "step": 9941 + }, + { + "epoch": 1.843159065628476, + "grad_norm": 6.7421875, + "learning_rate": 8.156840934371525e-06, + "loss": 3.0193, + "mean_token_accuracy": 0.433843085106383, + "step": 9942 + }, + { + "epoch": 1.8433444568038562, + "grad_norm": 6.4453125, + "learning_rate": 8.156655543196145e-06, + "loss": 2.8091, + "mean_token_accuracy": 0.4510950891035233, + "step": 9943 + }, + { + "epoch": 1.8435298479792361, + "grad_norm": 5.171875, + "learning_rate": 8.156470152020764e-06, + "loss": 2.4815, + "mean_token_accuracy": 0.49093214965123655, + "step": 9944 + }, + { + "epoch": 1.843715239154616, + "grad_norm": 6.5859375, + "learning_rate": 8.156284760845385e-06, + "loss": 3.5451, + "mean_token_accuracy": 0.4009827448291624, + "step": 9945 + }, + { + "epoch": 1.8439006303299963, + "grad_norm": 7.53125, + "learning_rate": 8.156099369670003e-06, + "loss": 2.6191, + "mean_token_accuracy": 0.49115838031778575, + "step": 9946 + }, + { + "epoch": 1.8440860215053765, + "grad_norm": 5.375, + "learning_rate": 8.155913978494624e-06, + "loss": 3.359, + "mean_token_accuracy": 0.4231683596282689, + "step": 9947 + }, + { + "epoch": 1.8442714126807565, + "grad_norm": 5.640625, + "learning_rate": 8.155728587319244e-06, + "loss": 2.8164, + "mean_token_accuracy": 0.4651907952228372, + "step": 9948 + }, + { + "epoch": 1.8444568038561364, + "grad_norm": 6.7109375, + "learning_rate": 8.155543196143865e-06, + "loss": 2.9187, + "mean_token_accuracy": 0.46469888636609363, + "step": 9949 + }, + { + "epoch": 1.8446421950315166, + "grad_norm": 6.45703125, + "learning_rate": 8.155357804968485e-06, + "loss": 3.7768, + "mean_token_accuracy": 0.40051306717973384, + "step": 9950 + }, + { + "epoch": 1.8448275862068966, + "grad_norm": 6.046875, + "learning_rate": 8.155172413793104e-06, + "loss": 3.0859, + "mean_token_accuracy": 0.45669553630912724, + "step": 9951 + }, + { + "epoch": 1.8450129773822765, + "grad_norm": 8.1875, + "learning_rate": 8.154987022617725e-06, + "loss": 2.538, + "mean_token_accuracy": 0.4975929978118162, + "step": 9952 + }, + { + "epoch": 1.8451983685576567, + "grad_norm": 9.7578125, + "learning_rate": 8.154801631442343e-06, + "loss": 3.7664, + "mean_token_accuracy": 0.4131888710540396, + "step": 9953 + }, + { + "epoch": 1.8453837597330367, + "grad_norm": 8.609375, + "learning_rate": 8.154616240266964e-06, + "loss": 2.6419, + "mean_token_accuracy": 0.4950580146110872, + "step": 9954 + }, + { + "epoch": 1.8455691509084167, + "grad_norm": 6.1484375, + "learning_rate": 8.154430849091584e-06, + "loss": 2.6192, + "mean_token_accuracy": 0.47983014861995754, + "step": 9955 + }, + { + "epoch": 1.8457545420837969, + "grad_norm": 7.16015625, + "learning_rate": 8.154245457916203e-06, + "loss": 2.8454, + "mean_token_accuracy": 0.44467787114845936, + "step": 9956 + }, + { + "epoch": 1.8459399332591768, + "grad_norm": 9.9296875, + "learning_rate": 8.154060066740824e-06, + "loss": 2.5857, + "mean_token_accuracy": 0.48365145228215767, + "step": 9957 + }, + { + "epoch": 1.8461253244345568, + "grad_norm": 6.61328125, + "learning_rate": 8.153874675565444e-06, + "loss": 3.0831, + "mean_token_accuracy": 0.4460812356979405, + "step": 9958 + }, + { + "epoch": 1.846310715609937, + "grad_norm": 8.1875, + "learning_rate": 8.153689284390065e-06, + "loss": 2.9066, + "mean_token_accuracy": 0.4807849550286182, + "step": 9959 + }, + { + "epoch": 1.8464961067853172, + "grad_norm": 9.1875, + "learning_rate": 8.153503893214683e-06, + "loss": 2.8255, + "mean_token_accuracy": 0.4528210704361484, + "step": 9960 + }, + { + "epoch": 1.846681497960697, + "grad_norm": 6.13671875, + "learning_rate": 8.153318502039304e-06, + "loss": 3.0219, + "mean_token_accuracy": 0.4478114478114478, + "step": 9961 + }, + { + "epoch": 1.846866889136077, + "grad_norm": 6.8984375, + "learning_rate": 8.153133110863923e-06, + "loss": 2.6925, + "mean_token_accuracy": 0.48782456612404146, + "step": 9962 + }, + { + "epoch": 1.8470522803114573, + "grad_norm": 8.109375, + "learning_rate": 8.152947719688543e-06, + "loss": 3.1296, + "mean_token_accuracy": 0.4542721122525683, + "step": 9963 + }, + { + "epoch": 1.8472376714868373, + "grad_norm": 7.91015625, + "learning_rate": 8.152762328513164e-06, + "loss": 2.9401, + "mean_token_accuracy": 0.46347497089639117, + "step": 9964 + }, + { + "epoch": 1.8474230626622172, + "grad_norm": 5.71484375, + "learning_rate": 8.152576937337784e-06, + "loss": 2.5118, + "mean_token_accuracy": 0.5021300766827606, + "step": 9965 + }, + { + "epoch": 1.8476084538375974, + "grad_norm": 13.7109375, + "learning_rate": 8.152391546162403e-06, + "loss": 2.5042, + "mean_token_accuracy": 0.4783227643328588, + "step": 9966 + }, + { + "epoch": 1.8477938450129774, + "grad_norm": 17.9375, + "learning_rate": 8.152206154987024e-06, + "loss": 3.0861, + "mean_token_accuracy": 0.4556627461345315, + "step": 9967 + }, + { + "epoch": 1.8479792361883574, + "grad_norm": 12.78125, + "learning_rate": 8.152020763811644e-06, + "loss": 2.1583, + "mean_token_accuracy": 0.5589119916307048, + "step": 9968 + }, + { + "epoch": 1.8481646273637375, + "grad_norm": 5.56640625, + "learning_rate": 8.151835372636263e-06, + "loss": 3.0719, + "mean_token_accuracy": 0.44946528091580057, + "step": 9969 + }, + { + "epoch": 1.8483500185391175, + "grad_norm": 10.2890625, + "learning_rate": 8.151649981460883e-06, + "loss": 3.1589, + "mean_token_accuracy": 0.445649400357234, + "step": 9970 + }, + { + "epoch": 1.8485354097144975, + "grad_norm": 10.5546875, + "learning_rate": 8.151464590285502e-06, + "loss": 3.0036, + "mean_token_accuracy": 0.44089834515366433, + "step": 9971 + }, + { + "epoch": 1.8487208008898777, + "grad_norm": 10.96875, + "learning_rate": 8.151279199110123e-06, + "loss": 2.5958, + "mean_token_accuracy": 0.46606855302507477, + "step": 9972 + }, + { + "epoch": 1.8489061920652579, + "grad_norm": 5.62890625, + "learning_rate": 8.151093807934743e-06, + "loss": 3.1624, + "mean_token_accuracy": 0.44402218570254726, + "step": 9973 + }, + { + "epoch": 1.8490915832406376, + "grad_norm": 8.2734375, + "learning_rate": 8.150908416759364e-06, + "loss": 3.128, + "mean_token_accuracy": 0.4478021978021978, + "step": 9974 + }, + { + "epoch": 1.8492769744160178, + "grad_norm": 10.9453125, + "learning_rate": 8.150723025583982e-06, + "loss": 2.8193, + "mean_token_accuracy": 0.48189280540801543, + "step": 9975 + }, + { + "epoch": 1.849462365591398, + "grad_norm": 7.31640625, + "learning_rate": 8.150537634408603e-06, + "loss": 2.7037, + "mean_token_accuracy": 0.48055000587613117, + "step": 9976 + }, + { + "epoch": 1.849647756766778, + "grad_norm": 5.7890625, + "learning_rate": 8.150352243233223e-06, + "loss": 2.8575, + "mean_token_accuracy": 0.46745087555139686, + "step": 9977 + }, + { + "epoch": 1.849833147942158, + "grad_norm": 7.515625, + "learning_rate": 8.150166852057842e-06, + "loss": 3.2801, + "mean_token_accuracy": 0.44025, + "step": 9978 + }, + { + "epoch": 1.850018539117538, + "grad_norm": 9.109375, + "learning_rate": 8.149981460882463e-06, + "loss": 2.5739, + "mean_token_accuracy": 0.5059568530642911, + "step": 9979 + }, + { + "epoch": 1.850203930292918, + "grad_norm": 6.05078125, + "learning_rate": 8.149796069707081e-06, + "loss": 2.9637, + "mean_token_accuracy": 0.43939051918735894, + "step": 9980 + }, + { + "epoch": 1.850389321468298, + "grad_norm": 5.859375, + "learning_rate": 8.149610678531704e-06, + "loss": 3.0351, + "mean_token_accuracy": 0.4342086980686397, + "step": 9981 + }, + { + "epoch": 1.8505747126436782, + "grad_norm": 8.796875, + "learning_rate": 8.149425287356322e-06, + "loss": 2.9432, + "mean_token_accuracy": 0.4728248192209004, + "step": 9982 + }, + { + "epoch": 1.8507601038190582, + "grad_norm": 9.375, + "learning_rate": 8.149239896180943e-06, + "loss": 2.816, + "mean_token_accuracy": 0.4644736842105263, + "step": 9983 + }, + { + "epoch": 1.8509454949944382, + "grad_norm": 7.109375, + "learning_rate": 8.149054505005562e-06, + "loss": 3.3016, + "mean_token_accuracy": 0.4351917866818765, + "step": 9984 + }, + { + "epoch": 1.8511308861698184, + "grad_norm": 6.61328125, + "learning_rate": 8.148869113830182e-06, + "loss": 3.4519, + "mean_token_accuracy": 0.4630037783375315, + "step": 9985 + }, + { + "epoch": 1.8513162773451983, + "grad_norm": 7.5078125, + "learning_rate": 8.148683722654803e-06, + "loss": 3.4055, + "mean_token_accuracy": 0.4355115026921194, + "step": 9986 + }, + { + "epoch": 1.8515016685205783, + "grad_norm": 7.3671875, + "learning_rate": 8.148498331479421e-06, + "loss": 3.4726, + "mean_token_accuracy": 0.42811416377700723, + "step": 9987 + }, + { + "epoch": 1.8516870596959585, + "grad_norm": 5.87109375, + "learning_rate": 8.148312940304042e-06, + "loss": 2.8772, + "mean_token_accuracy": 0.45348113090048575, + "step": 9988 + }, + { + "epoch": 1.8518724508713387, + "grad_norm": 7.51953125, + "learning_rate": 8.148127549128662e-06, + "loss": 2.5587, + "mean_token_accuracy": 0.4889494833524684, + "step": 9989 + }, + { + "epoch": 1.8520578420467184, + "grad_norm": 9.046875, + "learning_rate": 8.147942157953283e-06, + "loss": 3.0855, + "mean_token_accuracy": 0.4576296517710357, + "step": 9990 + }, + { + "epoch": 1.8522432332220986, + "grad_norm": 6.50390625, + "learning_rate": 8.147756766777902e-06, + "loss": 2.8655, + "mean_token_accuracy": 0.4732809430255403, + "step": 9991 + }, + { + "epoch": 1.8524286243974788, + "grad_norm": 6.80078125, + "learning_rate": 8.147571375602522e-06, + "loss": 2.6175, + "mean_token_accuracy": 0.4849949135300102, + "step": 9992 + }, + { + "epoch": 1.8526140155728588, + "grad_norm": 7.0234375, + "learning_rate": 8.147385984427143e-06, + "loss": 3.2387, + "mean_token_accuracy": 0.48017743276961466, + "step": 9993 + }, + { + "epoch": 1.8527994067482387, + "grad_norm": 6.1015625, + "learning_rate": 8.147200593251762e-06, + "loss": 3.4517, + "mean_token_accuracy": 0.40099812850904554, + "step": 9994 + }, + { + "epoch": 1.852984797923619, + "grad_norm": 5.51171875, + "learning_rate": 8.147015202076382e-06, + "loss": 2.9321, + "mean_token_accuracy": 0.4466759002770083, + "step": 9995 + }, + { + "epoch": 1.8531701890989989, + "grad_norm": 6.44140625, + "learning_rate": 8.146829810901e-06, + "loss": 2.7863, + "mean_token_accuracy": 0.4731091244501206, + "step": 9996 + }, + { + "epoch": 1.8533555802743789, + "grad_norm": 6.546875, + "learning_rate": 8.146644419725623e-06, + "loss": 3.5497, + "mean_token_accuracy": 0.3927816369676835, + "step": 9997 + }, + { + "epoch": 1.853540971449759, + "grad_norm": 6.39453125, + "learning_rate": 8.146459028550242e-06, + "loss": 2.8374, + "mean_token_accuracy": 0.46509671993271656, + "step": 9998 + }, + { + "epoch": 1.853726362625139, + "grad_norm": 5.515625, + "learning_rate": 8.146273637374862e-06, + "loss": 2.8204, + "mean_token_accuracy": 0.4816753926701571, + "step": 9999 + }, + { + "epoch": 1.853911753800519, + "grad_norm": 7.078125, + "learning_rate": 8.146088246199481e-06, + "loss": 3.8214, + "mean_token_accuracy": 0.41460016717748677, + "step": 10000 + }, + { + "epoch": 1.8540971449758992, + "grad_norm": 6.625, + "learning_rate": 8.145902855024102e-06, + "loss": 3.7568, + "mean_token_accuracy": 0.3967456329265375, + "step": 10001 + }, + { + "epoch": 1.8542825361512794, + "grad_norm": 5.37109375, + "learning_rate": 8.145717463848722e-06, + "loss": 2.8804, + "mean_token_accuracy": 0.4695168502562984, + "step": 10002 + }, + { + "epoch": 1.854467927326659, + "grad_norm": 6.1875, + "learning_rate": 8.145532072673341e-06, + "loss": 2.7417, + "mean_token_accuracy": 0.479466271312083, + "step": 10003 + }, + { + "epoch": 1.8546533185020393, + "grad_norm": 7.54296875, + "learning_rate": 8.145346681497961e-06, + "loss": 2.8913, + "mean_token_accuracy": 0.4465077273271834, + "step": 10004 + }, + { + "epoch": 1.8548387096774195, + "grad_norm": 8.2578125, + "learning_rate": 8.145161290322582e-06, + "loss": 3.2007, + "mean_token_accuracy": 0.44229973803943196, + "step": 10005 + }, + { + "epoch": 1.8550241008527995, + "grad_norm": 6.5625, + "learning_rate": 8.144975899147202e-06, + "loss": 2.7217, + "mean_token_accuracy": 0.4804333407030732, + "step": 10006 + }, + { + "epoch": 1.8552094920281794, + "grad_norm": 7.5546875, + "learning_rate": 8.144790507971821e-06, + "loss": 2.6358, + "mean_token_accuracy": 0.4964329643296433, + "step": 10007 + }, + { + "epoch": 1.8553948832035596, + "grad_norm": 6.36328125, + "learning_rate": 8.144605116796442e-06, + "loss": 2.9489, + "mean_token_accuracy": 0.47580756966588106, + "step": 10008 + }, + { + "epoch": 1.8555802743789396, + "grad_norm": 5.5234375, + "learning_rate": 8.14441972562106e-06, + "loss": 3.0713, + "mean_token_accuracy": 0.451479052335698, + "step": 10009 + }, + { + "epoch": 1.8557656655543195, + "grad_norm": 7.5, + "learning_rate": 8.144234334445681e-06, + "loss": 2.7688, + "mean_token_accuracy": 0.45563689604685215, + "step": 10010 + }, + { + "epoch": 1.8559510567296997, + "grad_norm": 5.6328125, + "learning_rate": 8.144048943270301e-06, + "loss": 2.9745, + "mean_token_accuracy": 0.4631933265037319, + "step": 10011 + }, + { + "epoch": 1.8561364479050797, + "grad_norm": 5.6640625, + "learning_rate": 8.14386355209492e-06, + "loss": 3.2617, + "mean_token_accuracy": 0.41563731931668857, + "step": 10012 + }, + { + "epoch": 1.8563218390804597, + "grad_norm": 6.30078125, + "learning_rate": 8.14367816091954e-06, + "loss": 2.5986, + "mean_token_accuracy": 0.5061658398299078, + "step": 10013 + }, + { + "epoch": 1.8565072302558399, + "grad_norm": 6.57421875, + "learning_rate": 8.143492769744161e-06, + "loss": 2.89, + "mean_token_accuracy": 0.45018300122000815, + "step": 10014 + }, + { + "epoch": 1.8566926214312198, + "grad_norm": 5.9375, + "learning_rate": 8.143307378568782e-06, + "loss": 2.899, + "mean_token_accuracy": 0.46830193150847255, + "step": 10015 + }, + { + "epoch": 1.8568780126065998, + "grad_norm": 7.09375, + "learning_rate": 8.1431219873934e-06, + "loss": 2.2012, + "mean_token_accuracy": 0.5535831689677844, + "step": 10016 + }, + { + "epoch": 1.85706340378198, + "grad_norm": 6.95703125, + "learning_rate": 8.142936596218021e-06, + "loss": 3.0441, + "mean_token_accuracy": 0.4692197958959728, + "step": 10017 + }, + { + "epoch": 1.8572487949573602, + "grad_norm": 8.9375, + "learning_rate": 8.14275120504264e-06, + "loss": 2.4698, + "mean_token_accuracy": 0.5164878823996821, + "step": 10018 + }, + { + "epoch": 1.85743418613274, + "grad_norm": 6.4453125, + "learning_rate": 8.14256581386726e-06, + "loss": 2.9722, + "mean_token_accuracy": 0.4692706609290522, + "step": 10019 + }, + { + "epoch": 1.85761957730812, + "grad_norm": 10.453125, + "learning_rate": 8.14238042269188e-06, + "loss": 2.5891, + "mean_token_accuracy": 0.5196545946642609, + "step": 10020 + }, + { + "epoch": 1.8578049684835003, + "grad_norm": 7.15625, + "learning_rate": 8.1421950315165e-06, + "loss": 2.7676, + "mean_token_accuracy": 0.4744458692068144, + "step": 10021 + }, + { + "epoch": 1.8579903596588803, + "grad_norm": 6.25390625, + "learning_rate": 8.14200964034112e-06, + "loss": 2.9565, + "mean_token_accuracy": 0.45191733365189346, + "step": 10022 + }, + { + "epoch": 1.8581757508342602, + "grad_norm": 9.25, + "learning_rate": 8.14182424916574e-06, + "loss": 3.2358, + "mean_token_accuracy": 0.5224670688169396, + "step": 10023 + }, + { + "epoch": 1.8583611420096404, + "grad_norm": 6.2421875, + "learning_rate": 8.141638857990361e-06, + "loss": 3.3779, + "mean_token_accuracy": 0.4293432584944171, + "step": 10024 + }, + { + "epoch": 1.8585465331850204, + "grad_norm": 6.3671875, + "learning_rate": 8.14145346681498e-06, + "loss": 2.6535, + "mean_token_accuracy": 0.4979274611398964, + "step": 10025 + }, + { + "epoch": 1.8587319243604004, + "grad_norm": 6.7421875, + "learning_rate": 8.1412680756396e-06, + "loss": 3.1015, + "mean_token_accuracy": 0.4486022131624927, + "step": 10026 + }, + { + "epoch": 1.8589173155357805, + "grad_norm": 6.49609375, + "learning_rate": 8.141082684464219e-06, + "loss": 3.4688, + "mean_token_accuracy": 0.4399370307580528, + "step": 10027 + }, + { + "epoch": 1.8591027067111605, + "grad_norm": 10.6796875, + "learning_rate": 8.14089729328884e-06, + "loss": 2.6873, + "mean_token_accuracy": 0.5049084959398861, + "step": 10028 + }, + { + "epoch": 1.8592880978865405, + "grad_norm": 6.21484375, + "learning_rate": 8.14071190211346e-06, + "loss": 3.0716, + "mean_token_accuracy": 0.44325648735939416, + "step": 10029 + }, + { + "epoch": 1.8594734890619207, + "grad_norm": 7.546875, + "learning_rate": 8.14052651093808e-06, + "loss": 3.1817, + "mean_token_accuracy": 0.46569014084507043, + "step": 10030 + }, + { + "epoch": 1.8596588802373009, + "grad_norm": 8.0546875, + "learning_rate": 8.140341119762701e-06, + "loss": 2.7788, + "mean_token_accuracy": 0.47570750858825456, + "step": 10031 + }, + { + "epoch": 1.8598442714126806, + "grad_norm": 7.8984375, + "learning_rate": 8.14015572858732e-06, + "loss": 2.3437, + "mean_token_accuracy": 0.520845231296402, + "step": 10032 + }, + { + "epoch": 1.8600296625880608, + "grad_norm": 6.70703125, + "learning_rate": 8.13997033741194e-06, + "loss": 2.2979, + "mean_token_accuracy": 0.5494066762781413, + "step": 10033 + }, + { + "epoch": 1.860215053763441, + "grad_norm": 6.015625, + "learning_rate": 8.139784946236559e-06, + "loss": 3.5025, + "mean_token_accuracy": 0.4078773460216759, + "step": 10034 + }, + { + "epoch": 1.860400444938821, + "grad_norm": 9.734375, + "learning_rate": 8.13959955506118e-06, + "loss": 2.4133, + "mean_token_accuracy": 0.5071715433161216, + "step": 10035 + }, + { + "epoch": 1.860585836114201, + "grad_norm": 5.91015625, + "learning_rate": 8.1394141638858e-06, + "loss": 3.044, + "mean_token_accuracy": 0.4548951048951049, + "step": 10036 + }, + { + "epoch": 1.860771227289581, + "grad_norm": 7.08984375, + "learning_rate": 8.139228772710419e-06, + "loss": 2.7719, + "mean_token_accuracy": 0.4637427687340239, + "step": 10037 + }, + { + "epoch": 1.860956618464961, + "grad_norm": 6.0234375, + "learning_rate": 8.13904338153504e-06, + "loss": 3.3162, + "mean_token_accuracy": 0.45489655172413795, + "step": 10038 + }, + { + "epoch": 1.861142009640341, + "grad_norm": 6.25390625, + "learning_rate": 8.13885799035966e-06, + "loss": 2.6496, + "mean_token_accuracy": 0.5134001636661211, + "step": 10039 + }, + { + "epoch": 1.8613274008157212, + "grad_norm": 6.86328125, + "learning_rate": 8.13867259918428e-06, + "loss": 2.8543, + "mean_token_accuracy": 0.45669789820207646, + "step": 10040 + }, + { + "epoch": 1.8615127919911012, + "grad_norm": 5.81640625, + "learning_rate": 8.1384872080089e-06, + "loss": 3.231, + "mean_token_accuracy": 0.4535752401280683, + "step": 10041 + }, + { + "epoch": 1.8616981831664812, + "grad_norm": 7.66015625, + "learning_rate": 8.13830181683352e-06, + "loss": 2.7361, + "mean_token_accuracy": 0.4631645569620253, + "step": 10042 + }, + { + "epoch": 1.8618835743418614, + "grad_norm": 6.41796875, + "learning_rate": 8.138116425658138e-06, + "loss": 2.4694, + "mean_token_accuracy": 0.5153933865450399, + "step": 10043 + }, + { + "epoch": 1.8620689655172413, + "grad_norm": 6.765625, + "learning_rate": 8.137931034482759e-06, + "loss": 3.0429, + "mean_token_accuracy": 0.4396620015843676, + "step": 10044 + }, + { + "epoch": 1.8622543566926213, + "grad_norm": 5.98828125, + "learning_rate": 8.13774564330738e-06, + "loss": 3.3068, + "mean_token_accuracy": 0.4488695652173913, + "step": 10045 + }, + { + "epoch": 1.8624397478680015, + "grad_norm": 6.58984375, + "learning_rate": 8.137560252132e-06, + "loss": 2.6463, + "mean_token_accuracy": 0.48038302965819923, + "step": 10046 + }, + { + "epoch": 1.8626251390433817, + "grad_norm": 7.453125, + "learning_rate": 8.137374860956619e-06, + "loss": 3.2315, + "mean_token_accuracy": 0.4383149448345035, + "step": 10047 + }, + { + "epoch": 1.8628105302187616, + "grad_norm": 7.11328125, + "learning_rate": 8.13718946978124e-06, + "loss": 2.549, + "mean_token_accuracy": 0.46773914547656115, + "step": 10048 + }, + { + "epoch": 1.8629959213941416, + "grad_norm": 5.90234375, + "learning_rate": 8.13700407860586e-06, + "loss": 3.1311, + "mean_token_accuracy": 0.441596161131456, + "step": 10049 + }, + { + "epoch": 1.8631813125695218, + "grad_norm": 6.0, + "learning_rate": 8.136818687430479e-06, + "loss": 2.9813, + "mean_token_accuracy": 0.4544702842377261, + "step": 10050 + }, + { + "epoch": 1.8633667037449018, + "grad_norm": 6.828125, + "learning_rate": 8.136633296255099e-06, + "loss": 2.7688, + "mean_token_accuracy": 0.4627846912420756, + "step": 10051 + }, + { + "epoch": 1.8635520949202817, + "grad_norm": 7.32421875, + "learning_rate": 8.136447905079718e-06, + "loss": 3.1986, + "mean_token_accuracy": 0.4503259857187209, + "step": 10052 + }, + { + "epoch": 1.863737486095662, + "grad_norm": 5.73046875, + "learning_rate": 8.136262513904338e-06, + "loss": 2.7262, + "mean_token_accuracy": 0.4857604775525432, + "step": 10053 + }, + { + "epoch": 1.863922877271042, + "grad_norm": 6.90234375, + "learning_rate": 8.136077122728959e-06, + "loss": 3.2629, + "mean_token_accuracy": 0.43776667073022063, + "step": 10054 + }, + { + "epoch": 1.8641082684464219, + "grad_norm": 7.30859375, + "learning_rate": 8.13589173155358e-06, + "loss": 2.7272, + "mean_token_accuracy": 0.4948280682135868, + "step": 10055 + }, + { + "epoch": 1.864293659621802, + "grad_norm": 6.30859375, + "learning_rate": 8.135706340378198e-06, + "loss": 2.95, + "mean_token_accuracy": 0.48943862987630826, + "step": 10056 + }, + { + "epoch": 1.864479050797182, + "grad_norm": 6.92578125, + "learning_rate": 8.135520949202819e-06, + "loss": 3.4681, + "mean_token_accuracy": 0.5044055849261218, + "step": 10057 + }, + { + "epoch": 1.864664441972562, + "grad_norm": 10.1484375, + "learning_rate": 8.135335558027439e-06, + "loss": 2.588, + "mean_token_accuracy": 0.5019032849288031, + "step": 10058 + }, + { + "epoch": 1.8648498331479422, + "grad_norm": 6.81640625, + "learning_rate": 8.135150166852058e-06, + "loss": 2.6152, + "mean_token_accuracy": 0.4832443653618031, + "step": 10059 + }, + { + "epoch": 1.8650352243233224, + "grad_norm": 5.90625, + "learning_rate": 8.134964775676678e-06, + "loss": 2.9631, + "mean_token_accuracy": 0.4725109409190372, + "step": 10060 + }, + { + "epoch": 1.865220615498702, + "grad_norm": 6.92578125, + "learning_rate": 8.134779384501297e-06, + "loss": 2.7595, + "mean_token_accuracy": 0.4825163908835467, + "step": 10061 + }, + { + "epoch": 1.8654060066740823, + "grad_norm": 7.8046875, + "learning_rate": 8.13459399332592e-06, + "loss": 2.371, + "mean_token_accuracy": 0.5030015356694123, + "step": 10062 + }, + { + "epoch": 1.8655913978494625, + "grad_norm": 6.09375, + "learning_rate": 8.134408602150538e-06, + "loss": 3.0158, + "mean_token_accuracy": 0.4490874764002517, + "step": 10063 + }, + { + "epoch": 1.8657767890248425, + "grad_norm": 9.734375, + "learning_rate": 8.134223210975159e-06, + "loss": 2.8755, + "mean_token_accuracy": 0.4687551798441903, + "step": 10064 + }, + { + "epoch": 1.8659621802002224, + "grad_norm": 6.1328125, + "learning_rate": 8.134037819799777e-06, + "loss": 2.9597, + "mean_token_accuracy": 0.46167471819645733, + "step": 10065 + }, + { + "epoch": 1.8661475713756026, + "grad_norm": 7.34765625, + "learning_rate": 8.133852428624398e-06, + "loss": 2.518, + "mean_token_accuracy": 0.49220294311443, + "step": 10066 + }, + { + "epoch": 1.8663329625509826, + "grad_norm": 6.9140625, + "learning_rate": 8.133667037449018e-06, + "loss": 3.0132, + "mean_token_accuracy": 0.45056614194973765, + "step": 10067 + }, + { + "epoch": 1.8665183537263625, + "grad_norm": 5.65625, + "learning_rate": 8.133481646273637e-06, + "loss": 2.8332, + "mean_token_accuracy": 0.46616219303255285, + "step": 10068 + }, + { + "epoch": 1.8667037449017427, + "grad_norm": 7.2421875, + "learning_rate": 8.133296255098258e-06, + "loss": 3.1782, + "mean_token_accuracy": 0.4431973720229948, + "step": 10069 + }, + { + "epoch": 1.8668891360771227, + "grad_norm": 6.8984375, + "learning_rate": 8.133110863922878e-06, + "loss": 2.3584, + "mean_token_accuracy": 0.5256545887331394, + "step": 10070 + }, + { + "epoch": 1.8670745272525027, + "grad_norm": 6.1328125, + "learning_rate": 8.132925472747499e-06, + "loss": 3.3312, + "mean_token_accuracy": 0.43197603159471604, + "step": 10071 + }, + { + "epoch": 1.8672599184278829, + "grad_norm": 7.90234375, + "learning_rate": 8.132740081572117e-06, + "loss": 2.3161, + "mean_token_accuracy": 0.5472336824288586, + "step": 10072 + }, + { + "epoch": 1.867445309603263, + "grad_norm": 9.4375, + "learning_rate": 8.132554690396738e-06, + "loss": 2.8848, + "mean_token_accuracy": 0.46260650047333546, + "step": 10073 + }, + { + "epoch": 1.8676307007786428, + "grad_norm": 6.09765625, + "learning_rate": 8.132369299221358e-06, + "loss": 3.3155, + "mean_token_accuracy": 0.4501431918771153, + "step": 10074 + }, + { + "epoch": 1.867816091954023, + "grad_norm": 5.640625, + "learning_rate": 8.132183908045977e-06, + "loss": 2.6753, + "mean_token_accuracy": 0.4748073503260225, + "step": 10075 + }, + { + "epoch": 1.8680014831294032, + "grad_norm": 6.62890625, + "learning_rate": 8.131998516870598e-06, + "loss": 3.3371, + "mean_token_accuracy": 0.4504396482813749, + "step": 10076 + }, + { + "epoch": 1.8681868743047831, + "grad_norm": 10.078125, + "learning_rate": 8.131813125695217e-06, + "loss": 3.5119, + "mean_token_accuracy": 0.4208205265267941, + "step": 10077 + }, + { + "epoch": 1.868372265480163, + "grad_norm": 6.53515625, + "learning_rate": 8.131627734519839e-06, + "loss": 3.2437, + "mean_token_accuracy": 0.4361607933415973, + "step": 10078 + }, + { + "epoch": 1.8685576566555433, + "grad_norm": 8.5703125, + "learning_rate": 8.131442343344458e-06, + "loss": 2.022, + "mean_token_accuracy": 0.5736419280795715, + "step": 10079 + }, + { + "epoch": 1.8687430478309233, + "grad_norm": 9.1328125, + "learning_rate": 8.131256952169078e-06, + "loss": 3.0501, + "mean_token_accuracy": 0.4652041438147471, + "step": 10080 + }, + { + "epoch": 1.8689284390063032, + "grad_norm": 8.328125, + "learning_rate": 8.131071560993697e-06, + "loss": 3.0278, + "mean_token_accuracy": 0.44563279857397503, + "step": 10081 + }, + { + "epoch": 1.8691138301816834, + "grad_norm": 6.546875, + "learning_rate": 8.130886169818317e-06, + "loss": 3.2802, + "mean_token_accuracy": 0.4242276224203199, + "step": 10082 + }, + { + "epoch": 1.8692992213570634, + "grad_norm": 7.91796875, + "learning_rate": 8.130700778642938e-06, + "loss": 2.6393, + "mean_token_accuracy": 0.4713003845606039, + "step": 10083 + }, + { + "epoch": 1.8694846125324434, + "grad_norm": 6.08203125, + "learning_rate": 8.130515387467557e-06, + "loss": 3.0436, + "mean_token_accuracy": 0.44912968864917874, + "step": 10084 + }, + { + "epoch": 1.8696700037078235, + "grad_norm": 7.609375, + "learning_rate": 8.130329996292177e-06, + "loss": 2.7074, + "mean_token_accuracy": 0.48401475561020596, + "step": 10085 + }, + { + "epoch": 1.8698553948832035, + "grad_norm": 7.578125, + "learning_rate": 8.130144605116798e-06, + "loss": 2.275, + "mean_token_accuracy": 0.5303368398682393, + "step": 10086 + }, + { + "epoch": 1.8700407860585835, + "grad_norm": 6.58984375, + "learning_rate": 8.129959213941418e-06, + "loss": 3.4136, + "mean_token_accuracy": 0.4453793703757435, + "step": 10087 + }, + { + "epoch": 1.8702261772339637, + "grad_norm": 5.75, + "learning_rate": 8.129773822766037e-06, + "loss": 2.6269, + "mean_token_accuracy": 0.47276036926277465, + "step": 10088 + }, + { + "epoch": 1.8704115684093439, + "grad_norm": 6.6015625, + "learning_rate": 8.129588431590657e-06, + "loss": 2.9661, + "mean_token_accuracy": 0.4627847604084839, + "step": 10089 + }, + { + "epoch": 1.8705969595847236, + "grad_norm": 7.31640625, + "learning_rate": 8.129403040415276e-06, + "loss": 2.8455, + "mean_token_accuracy": 0.4611319868482856, + "step": 10090 + }, + { + "epoch": 1.8707823507601038, + "grad_norm": 5.515625, + "learning_rate": 8.129217649239897e-06, + "loss": 3.1213, + "mean_token_accuracy": 0.43246854470548934, + "step": 10091 + }, + { + "epoch": 1.870967741935484, + "grad_norm": 6.58203125, + "learning_rate": 8.129032258064517e-06, + "loss": 2.3151, + "mean_token_accuracy": 0.536333231985406, + "step": 10092 + }, + { + "epoch": 1.871153133110864, + "grad_norm": 6.26171875, + "learning_rate": 8.128846866889136e-06, + "loss": 2.9521, + "mean_token_accuracy": 0.4721120475954388, + "step": 10093 + }, + { + "epoch": 1.871338524286244, + "grad_norm": 10.15625, + "learning_rate": 8.128661475713756e-06, + "loss": 2.7966, + "mean_token_accuracy": 0.4749476622470342, + "step": 10094 + }, + { + "epoch": 1.871523915461624, + "grad_norm": 6.27734375, + "learning_rate": 8.128476084538377e-06, + "loss": 3.0571, + "mean_token_accuracy": 0.4620541700187718, + "step": 10095 + }, + { + "epoch": 1.871709306637004, + "grad_norm": 7.13671875, + "learning_rate": 8.128290693362997e-06, + "loss": 3.5073, + "mean_token_accuracy": 0.41513429176418143, + "step": 10096 + }, + { + "epoch": 1.871894697812384, + "grad_norm": 6.8828125, + "learning_rate": 8.128105302187616e-06, + "loss": 2.9056, + "mean_token_accuracy": 0.4556067083196317, + "step": 10097 + }, + { + "epoch": 1.8720800889877642, + "grad_norm": 5.71875, + "learning_rate": 8.127919911012237e-06, + "loss": 2.8175, + "mean_token_accuracy": 0.4741745283018868, + "step": 10098 + }, + { + "epoch": 1.8722654801631442, + "grad_norm": 7.9609375, + "learning_rate": 8.127734519836856e-06, + "loss": 2.2557, + "mean_token_accuracy": 0.5196486361534906, + "step": 10099 + }, + { + "epoch": 1.8724508713385242, + "grad_norm": 8.890625, + "learning_rate": 8.127549128661476e-06, + "loss": 3.0002, + "mean_token_accuracy": 0.4482452243447357, + "step": 10100 + }, + { + "epoch": 1.8726362625139044, + "grad_norm": 13.4296875, + "learning_rate": 8.127363737486096e-06, + "loss": 2.6813, + "mean_token_accuracy": 0.47229085774797036, + "step": 10101 + }, + { + "epoch": 1.8728216536892845, + "grad_norm": 6.83984375, + "learning_rate": 8.127178346310717e-06, + "loss": 2.9567, + "mean_token_accuracy": 0.47224435590969455, + "step": 10102 + }, + { + "epoch": 1.8730070448646643, + "grad_norm": 9.3125, + "learning_rate": 8.126992955135336e-06, + "loss": 3.1554, + "mean_token_accuracy": 0.4297907488986784, + "step": 10103 + }, + { + "epoch": 1.8731924360400445, + "grad_norm": 9.8515625, + "learning_rate": 8.126807563959956e-06, + "loss": 2.5324, + "mean_token_accuracy": 0.5000681477443096, + "step": 10104 + }, + { + "epoch": 1.8733778272154247, + "grad_norm": 10.28125, + "learning_rate": 8.126622172784577e-06, + "loss": 2.9469, + "mean_token_accuracy": 0.4587481019065294, + "step": 10105 + }, + { + "epoch": 1.8735632183908046, + "grad_norm": 5.6171875, + "learning_rate": 8.126436781609196e-06, + "loss": 2.9074, + "mean_token_accuracy": 0.46707726763717805, + "step": 10106 + }, + { + "epoch": 1.8737486095661846, + "grad_norm": 9.9453125, + "learning_rate": 8.126251390433816e-06, + "loss": 2.9564, + "mean_token_accuracy": 0.4639951792708647, + "step": 10107 + }, + { + "epoch": 1.8739340007415648, + "grad_norm": 6.44140625, + "learning_rate": 8.126065999258435e-06, + "loss": 3.4303, + "mean_token_accuracy": 0.42714465937762824, + "step": 10108 + }, + { + "epoch": 1.8741193919169448, + "grad_norm": 7.59375, + "learning_rate": 8.125880608083055e-06, + "loss": 2.7195, + "mean_token_accuracy": 0.4618293547624675, + "step": 10109 + }, + { + "epoch": 1.8743047830923247, + "grad_norm": 10.4140625, + "learning_rate": 8.125695216907676e-06, + "loss": 2.6316, + "mean_token_accuracy": 0.5033467202141901, + "step": 10110 + }, + { + "epoch": 1.874490174267705, + "grad_norm": 6.03125, + "learning_rate": 8.125509825732296e-06, + "loss": 3.3669, + "mean_token_accuracy": 0.4336404896162839, + "step": 10111 + }, + { + "epoch": 1.874675565443085, + "grad_norm": 6.53125, + "learning_rate": 8.125324434556917e-06, + "loss": 2.6287, + "mean_token_accuracy": 0.4826904055390702, + "step": 10112 + }, + { + "epoch": 1.8748609566184649, + "grad_norm": 7.76171875, + "learning_rate": 8.125139043381536e-06, + "loss": 3.0431, + "mean_token_accuracy": 0.46282229592382584, + "step": 10113 + }, + { + "epoch": 1.875046347793845, + "grad_norm": 6.64453125, + "learning_rate": 8.124953652206156e-06, + "loss": 3.5989, + "mean_token_accuracy": 0.41878830740216877, + "step": 10114 + }, + { + "epoch": 1.875231738969225, + "grad_norm": 5.984375, + "learning_rate": 8.124768261030775e-06, + "loss": 2.3945, + "mean_token_accuracy": 0.5489812363301485, + "step": 10115 + }, + { + "epoch": 1.875417130144605, + "grad_norm": 6.453125, + "learning_rate": 8.124582869855395e-06, + "loss": 2.7672, + "mean_token_accuracy": 0.47782772445632365, + "step": 10116 + }, + { + "epoch": 1.8756025213199852, + "grad_norm": 8.3515625, + "learning_rate": 8.124397478680016e-06, + "loss": 3.28, + "mean_token_accuracy": 0.43952451708766715, + "step": 10117 + }, + { + "epoch": 1.8757879124953654, + "grad_norm": 4.98046875, + "learning_rate": 8.124212087504636e-06, + "loss": 3.1113, + "mean_token_accuracy": 0.46519708980345315, + "step": 10118 + }, + { + "epoch": 1.875973303670745, + "grad_norm": 5.46484375, + "learning_rate": 8.124026696329255e-06, + "loss": 2.7496, + "mean_token_accuracy": 0.48156620021528523, + "step": 10119 + }, + { + "epoch": 1.8761586948461253, + "grad_norm": 6.48046875, + "learning_rate": 8.123841305153876e-06, + "loss": 2.9656, + "mean_token_accuracy": 0.44422546314544736, + "step": 10120 + }, + { + "epoch": 1.8763440860215055, + "grad_norm": 8.421875, + "learning_rate": 8.123655913978496e-06, + "loss": 2.5989, + "mean_token_accuracy": 0.48301366428481884, + "step": 10121 + }, + { + "epoch": 1.8765294771968855, + "grad_norm": 7.0703125, + "learning_rate": 8.123470522803115e-06, + "loss": 3.3154, + "mean_token_accuracy": 0.4249201277955272, + "step": 10122 + }, + { + "epoch": 1.8767148683722654, + "grad_norm": 7.765625, + "learning_rate": 8.123285131627735e-06, + "loss": 2.725, + "mean_token_accuracy": 0.49347568208778175, + "step": 10123 + }, + { + "epoch": 1.8769002595476456, + "grad_norm": 6.01953125, + "learning_rate": 8.123099740452354e-06, + "loss": 2.8754, + "mean_token_accuracy": 0.45204143814747105, + "step": 10124 + }, + { + "epoch": 1.8770856507230256, + "grad_norm": 7.3203125, + "learning_rate": 8.122914349276975e-06, + "loss": 2.4927, + "mean_token_accuracy": 0.5107468838943959, + "step": 10125 + }, + { + "epoch": 1.8772710418984055, + "grad_norm": 9.6171875, + "learning_rate": 8.122728958101595e-06, + "loss": 3.3556, + "mean_token_accuracy": 0.4277137757585591, + "step": 10126 + }, + { + "epoch": 1.8774564330737857, + "grad_norm": 9.953125, + "learning_rate": 8.122543566926216e-06, + "loss": 2.8704, + "mean_token_accuracy": 0.4784792784564448, + "step": 10127 + }, + { + "epoch": 1.8776418242491657, + "grad_norm": 6.70703125, + "learning_rate": 8.122358175750835e-06, + "loss": 2.3334, + "mean_token_accuracy": 0.5217520415738679, + "step": 10128 + }, + { + "epoch": 1.8778272154245457, + "grad_norm": 5.7421875, + "learning_rate": 8.122172784575455e-06, + "loss": 3.0312, + "mean_token_accuracy": 0.4983666061705989, + "step": 10129 + }, + { + "epoch": 1.8780126065999259, + "grad_norm": 11.40625, + "learning_rate": 8.121987393400075e-06, + "loss": 2.5276, + "mean_token_accuracy": 0.49204720369420213, + "step": 10130 + }, + { + "epoch": 1.878197997775306, + "grad_norm": 9.8515625, + "learning_rate": 8.121802002224694e-06, + "loss": 2.5466, + "mean_token_accuracy": 0.49810574864108054, + "step": 10131 + }, + { + "epoch": 1.8783833889506858, + "grad_norm": 6.7578125, + "learning_rate": 8.121616611049315e-06, + "loss": 2.622, + "mean_token_accuracy": 0.48539847161572053, + "step": 10132 + }, + { + "epoch": 1.878568780126066, + "grad_norm": 9.5234375, + "learning_rate": 8.121431219873934e-06, + "loss": 2.7148, + "mean_token_accuracy": 0.4747528720277852, + "step": 10133 + }, + { + "epoch": 1.8787541713014462, + "grad_norm": 9.421875, + "learning_rate": 8.121245828698556e-06, + "loss": 2.8769, + "mean_token_accuracy": 0.4526807737245229, + "step": 10134 + }, + { + "epoch": 1.8789395624768261, + "grad_norm": 7.88671875, + "learning_rate": 8.121060437523175e-06, + "loss": 3.5884, + "mean_token_accuracy": 0.4243703199455412, + "step": 10135 + }, + { + "epoch": 1.879124953652206, + "grad_norm": 5.921875, + "learning_rate": 8.120875046347795e-06, + "loss": 2.7137, + "mean_token_accuracy": 0.48685774666490556, + "step": 10136 + }, + { + "epoch": 1.8793103448275863, + "grad_norm": 6.82421875, + "learning_rate": 8.120689655172414e-06, + "loss": 2.399, + "mean_token_accuracy": 0.5368325173630878, + "step": 10137 + }, + { + "epoch": 1.8794957360029663, + "grad_norm": 6.6015625, + "learning_rate": 8.120504263997034e-06, + "loss": 2.0357, + "mean_token_accuracy": 0.5621627023785729, + "step": 10138 + }, + { + "epoch": 1.8796811271783462, + "grad_norm": 6.7265625, + "learning_rate": 8.120318872821655e-06, + "loss": 2.7578, + "mean_token_accuracy": 0.46712586098935505, + "step": 10139 + }, + { + "epoch": 1.8798665183537264, + "grad_norm": 6.33203125, + "learning_rate": 8.120133481646274e-06, + "loss": 2.7437, + "mean_token_accuracy": 0.49986958789775693, + "step": 10140 + }, + { + "epoch": 1.8800519095291064, + "grad_norm": 6.65625, + "learning_rate": 8.119948090470894e-06, + "loss": 2.8836, + "mean_token_accuracy": 0.47645561565383393, + "step": 10141 + }, + { + "epoch": 1.8802373007044864, + "grad_norm": 6.54296875, + "learning_rate": 8.119762699295515e-06, + "loss": 2.7803, + "mean_token_accuracy": 0.4742000576535024, + "step": 10142 + }, + { + "epoch": 1.8804226918798665, + "grad_norm": 7.60546875, + "learning_rate": 8.119577308120135e-06, + "loss": 2.5352, + "mean_token_accuracy": 0.4858186506231199, + "step": 10143 + }, + { + "epoch": 1.8806080830552465, + "grad_norm": 6.7734375, + "learning_rate": 8.119391916944754e-06, + "loss": 3.0937, + "mean_token_accuracy": 0.44979203802733214, + "step": 10144 + }, + { + "epoch": 1.8807934742306265, + "grad_norm": 6.48046875, + "learning_rate": 8.119206525769374e-06, + "loss": 3.2022, + "mean_token_accuracy": 0.4332659251769464, + "step": 10145 + }, + { + "epoch": 1.8809788654060067, + "grad_norm": 8.1484375, + "learning_rate": 8.119021134593993e-06, + "loss": 2.9449, + "mean_token_accuracy": 0.4693407100046104, + "step": 10146 + }, + { + "epoch": 1.8811642565813869, + "grad_norm": 6.12109375, + "learning_rate": 8.118835743418614e-06, + "loss": 3.2414, + "mean_token_accuracy": 0.44424007744433686, + "step": 10147 + }, + { + "epoch": 1.8813496477567668, + "grad_norm": 6.25390625, + "learning_rate": 8.118650352243234e-06, + "loss": 3.5111, + "mean_token_accuracy": 0.41730474732006123, + "step": 10148 + }, + { + "epoch": 1.8815350389321468, + "grad_norm": 6.90625, + "learning_rate": 8.118464961067853e-06, + "loss": 3.1791, + "mean_token_accuracy": 0.4261992619926199, + "step": 10149 + }, + { + "epoch": 1.881720430107527, + "grad_norm": 6.609375, + "learning_rate": 8.118279569892473e-06, + "loss": 2.8779, + "mean_token_accuracy": 0.4701067615658363, + "step": 10150 + }, + { + "epoch": 1.881905821282907, + "grad_norm": 6.36328125, + "learning_rate": 8.118094178717094e-06, + "loss": 3.5744, + "mean_token_accuracy": 0.40881337934696044, + "step": 10151 + }, + { + "epoch": 1.882091212458287, + "grad_norm": 6.55078125, + "learning_rate": 8.117908787541714e-06, + "loss": 3.0412, + "mean_token_accuracy": 0.4582822085889571, + "step": 10152 + }, + { + "epoch": 1.8822766036336671, + "grad_norm": 6.6875, + "learning_rate": 8.117723396366333e-06, + "loss": 2.565, + "mean_token_accuracy": 0.4875977653631285, + "step": 10153 + }, + { + "epoch": 1.882461994809047, + "grad_norm": 6.421875, + "learning_rate": 8.117538005190954e-06, + "loss": 2.9909, + "mean_token_accuracy": 0.47181910226122176, + "step": 10154 + }, + { + "epoch": 1.882647385984427, + "grad_norm": 8.9296875, + "learning_rate": 8.117352614015574e-06, + "loss": 2.7223, + "mean_token_accuracy": 0.46560920830993824, + "step": 10155 + }, + { + "epoch": 1.8828327771598072, + "grad_norm": 10.46875, + "learning_rate": 8.117167222840193e-06, + "loss": 2.7738, + "mean_token_accuracy": 0.46884038916644755, + "step": 10156 + }, + { + "epoch": 1.8830181683351872, + "grad_norm": 5.6015625, + "learning_rate": 8.116981831664814e-06, + "loss": 2.7535, + "mean_token_accuracy": 0.48962117280747275, + "step": 10157 + }, + { + "epoch": 1.8832035595105672, + "grad_norm": 6.9296875, + "learning_rate": 8.116796440489432e-06, + "loss": 2.5823, + "mean_token_accuracy": 0.48230888429752067, + "step": 10158 + }, + { + "epoch": 1.8833889506859474, + "grad_norm": 11.03125, + "learning_rate": 8.116611049314054e-06, + "loss": 3.3281, + "mean_token_accuracy": 0.4120691683331373, + "step": 10159 + }, + { + "epoch": 1.8835743418613276, + "grad_norm": 6.1875, + "learning_rate": 8.116425658138673e-06, + "loss": 2.7301, + "mean_token_accuracy": 0.4890616004605642, + "step": 10160 + }, + { + "epoch": 1.8837597330367073, + "grad_norm": 6.9296875, + "learning_rate": 8.116240266963294e-06, + "loss": 2.8867, + "mean_token_accuracy": 0.4628535903015232, + "step": 10161 + }, + { + "epoch": 1.8839451242120875, + "grad_norm": 7.45703125, + "learning_rate": 8.116054875787913e-06, + "loss": 2.2121, + "mean_token_accuracy": 0.5138355620283331, + "step": 10162 + }, + { + "epoch": 1.8841305153874677, + "grad_norm": 6.12109375, + "learning_rate": 8.115869484612533e-06, + "loss": 2.8463, + "mean_token_accuracy": 0.47619047619047616, + "step": 10163 + }, + { + "epoch": 1.8843159065628476, + "grad_norm": 6.17578125, + "learning_rate": 8.115684093437154e-06, + "loss": 2.9441, + "mean_token_accuracy": 0.436, + "step": 10164 + }, + { + "epoch": 1.8845012977382276, + "grad_norm": 6.421875, + "learning_rate": 8.115498702261772e-06, + "loss": 3.2196, + "mean_token_accuracy": 0.42308148499584863, + "step": 10165 + }, + { + "epoch": 1.8846866889136078, + "grad_norm": 8.53125, + "learning_rate": 8.115313311086393e-06, + "loss": 2.2704, + "mean_token_accuracy": 0.5262193725669796, + "step": 10166 + }, + { + "epoch": 1.8848720800889878, + "grad_norm": 5.62890625, + "learning_rate": 8.115127919911013e-06, + "loss": 3.1664, + "mean_token_accuracy": 0.4422132976349844, + "step": 10167 + }, + { + "epoch": 1.8850574712643677, + "grad_norm": 8.8671875, + "learning_rate": 8.114942528735634e-06, + "loss": 2.7669, + "mean_token_accuracy": 0.4784598214285714, + "step": 10168 + }, + { + "epoch": 1.885242862439748, + "grad_norm": 8.25, + "learning_rate": 8.114757137560253e-06, + "loss": 3.097, + "mean_token_accuracy": 0.4444283646888567, + "step": 10169 + }, + { + "epoch": 1.885428253615128, + "grad_norm": 5.8359375, + "learning_rate": 8.114571746384873e-06, + "loss": 2.7841, + "mean_token_accuracy": 0.4897499436810092, + "step": 10170 + }, + { + "epoch": 1.8856136447905079, + "grad_norm": 6.13671875, + "learning_rate": 8.114386355209492e-06, + "loss": 3.0454, + "mean_token_accuracy": 0.4658125609246623, + "step": 10171 + }, + { + "epoch": 1.885799035965888, + "grad_norm": 5.96875, + "learning_rate": 8.114200964034112e-06, + "loss": 3.1056, + "mean_token_accuracy": 0.4548320983761627, + "step": 10172 + }, + { + "epoch": 1.8859844271412682, + "grad_norm": 5.91015625, + "learning_rate": 8.114015572858733e-06, + "loss": 2.8177, + "mean_token_accuracy": 0.4608807182556648, + "step": 10173 + }, + { + "epoch": 1.886169818316648, + "grad_norm": 5.03515625, + "learning_rate": 8.113830181683352e-06, + "loss": 2.718, + "mean_token_accuracy": 0.4823258196721312, + "step": 10174 + }, + { + "epoch": 1.8863552094920282, + "grad_norm": 5.984375, + "learning_rate": 8.113644790507972e-06, + "loss": 2.838, + "mean_token_accuracy": 0.49235833529273454, + "step": 10175 + }, + { + "epoch": 1.8865406006674084, + "grad_norm": 8.3984375, + "learning_rate": 8.113459399332593e-06, + "loss": 2.6655, + "mean_token_accuracy": 0.4907505596795098, + "step": 10176 + }, + { + "epoch": 1.8867259918427883, + "grad_norm": 6.9453125, + "learning_rate": 8.113274008157213e-06, + "loss": 2.5754, + "mean_token_accuracy": 0.5122862642962291, + "step": 10177 + }, + { + "epoch": 1.8869113830181683, + "grad_norm": 6.48828125, + "learning_rate": 8.113088616981832e-06, + "loss": 3.4611, + "mean_token_accuracy": 0.4354364540931705, + "step": 10178 + }, + { + "epoch": 1.8870967741935485, + "grad_norm": 6.7578125, + "learning_rate": 8.112903225806452e-06, + "loss": 2.8304, + "mean_token_accuracy": 0.4632697155203457, + "step": 10179 + }, + { + "epoch": 1.8872821653689285, + "grad_norm": 6.7421875, + "learning_rate": 8.112717834631071e-06, + "loss": 3.308, + "mean_token_accuracy": 0.42641209228321403, + "step": 10180 + }, + { + "epoch": 1.8874675565443084, + "grad_norm": 6.484375, + "learning_rate": 8.112532443455692e-06, + "loss": 3.098, + "mean_token_accuracy": 0.46327047565437657, + "step": 10181 + }, + { + "epoch": 1.8876529477196886, + "grad_norm": 4.94140625, + "learning_rate": 8.112347052280312e-06, + "loss": 2.707, + "mean_token_accuracy": 0.48258885686839575, + "step": 10182 + }, + { + "epoch": 1.8878383388950686, + "grad_norm": 7.453125, + "learning_rate": 8.112161661104933e-06, + "loss": 2.1207, + "mean_token_accuracy": 0.5361473347060169, + "step": 10183 + }, + { + "epoch": 1.8880237300704485, + "grad_norm": 6.0859375, + "learning_rate": 8.111976269929552e-06, + "loss": 3.6855, + "mean_token_accuracy": 0.40639486508974204, + "step": 10184 + }, + { + "epoch": 1.8882091212458287, + "grad_norm": 6.98828125, + "learning_rate": 8.111790878754172e-06, + "loss": 3.0511, + "mean_token_accuracy": 0.4448014394036756, + "step": 10185 + }, + { + "epoch": 1.8883945124212087, + "grad_norm": 7.4453125, + "learning_rate": 8.111605487578793e-06, + "loss": 2.46, + "mean_token_accuracy": 0.49883213161368944, + "step": 10186 + }, + { + "epoch": 1.8885799035965887, + "grad_norm": 6.20703125, + "learning_rate": 8.111420096403411e-06, + "loss": 3.4081, + "mean_token_accuracy": 0.41220387652548457, + "step": 10187 + }, + { + "epoch": 1.8887652947719689, + "grad_norm": 6.3203125, + "learning_rate": 8.111234705228032e-06, + "loss": 3.4229, + "mean_token_accuracy": 0.4361374553516443, + "step": 10188 + }, + { + "epoch": 1.888950685947349, + "grad_norm": 9.5078125, + "learning_rate": 8.11104931405265e-06, + "loss": 2.6671, + "mean_token_accuracy": 0.4957492703971577, + "step": 10189 + }, + { + "epoch": 1.8891360771227288, + "grad_norm": 6.25, + "learning_rate": 8.110863922877271e-06, + "loss": 2.3141, + "mean_token_accuracy": 0.4990070921985816, + "step": 10190 + }, + { + "epoch": 1.889321468298109, + "grad_norm": 6.43359375, + "learning_rate": 8.110678531701892e-06, + "loss": 3.0731, + "mean_token_accuracy": 0.4689890710382514, + "step": 10191 + }, + { + "epoch": 1.8895068594734892, + "grad_norm": 6.3828125, + "learning_rate": 8.110493140526512e-06, + "loss": 2.8349, + "mean_token_accuracy": 0.4778192855475134, + "step": 10192 + }, + { + "epoch": 1.8896922506488691, + "grad_norm": 6.94140625, + "learning_rate": 8.110307749351133e-06, + "loss": 2.9536, + "mean_token_accuracy": 0.47776084245531386, + "step": 10193 + }, + { + "epoch": 1.889877641824249, + "grad_norm": 6.0546875, + "learning_rate": 8.110122358175751e-06, + "loss": 3.0804, + "mean_token_accuracy": 0.4528101802757158, + "step": 10194 + }, + { + "epoch": 1.8900630329996293, + "grad_norm": 6.65625, + "learning_rate": 8.109936967000372e-06, + "loss": 3.7011, + "mean_token_accuracy": 0.3937895026106073, + "step": 10195 + }, + { + "epoch": 1.8902484241750093, + "grad_norm": 6.2734375, + "learning_rate": 8.10975157582499e-06, + "loss": 2.7686, + "mean_token_accuracy": 0.4699491129143171, + "step": 10196 + }, + { + "epoch": 1.8904338153503892, + "grad_norm": 7.3046875, + "learning_rate": 8.109566184649611e-06, + "loss": 3.0357, + "mean_token_accuracy": 0.45566166439290584, + "step": 10197 + }, + { + "epoch": 1.8906192065257694, + "grad_norm": 5.796875, + "learning_rate": 8.109380793474232e-06, + "loss": 3.9371, + "mean_token_accuracy": 0.38888207141980613, + "step": 10198 + }, + { + "epoch": 1.8908045977011494, + "grad_norm": 5.8671875, + "learning_rate": 8.109195402298852e-06, + "loss": 2.8585, + "mean_token_accuracy": 0.4585698070374574, + "step": 10199 + }, + { + "epoch": 1.8909899888765294, + "grad_norm": 7.375, + "learning_rate": 8.109010011123471e-06, + "loss": 3.3664, + "mean_token_accuracy": 0.43427437141596825, + "step": 10200 + }, + { + "epoch": 1.8911753800519095, + "grad_norm": 6.01953125, + "learning_rate": 8.108824619948091e-06, + "loss": 2.836, + "mean_token_accuracy": 0.4595010779180782, + "step": 10201 + }, + { + "epoch": 1.8913607712272897, + "grad_norm": 6.671875, + "learning_rate": 8.108639228772712e-06, + "loss": 3.0316, + "mean_token_accuracy": 0.46031488394741116, + "step": 10202 + }, + { + "epoch": 1.8915461624026695, + "grad_norm": 5.6171875, + "learning_rate": 8.10845383759733e-06, + "loss": 2.6315, + "mean_token_accuracy": 0.47295864262990456, + "step": 10203 + }, + { + "epoch": 1.8917315535780497, + "grad_norm": 5.859375, + "learning_rate": 8.108268446421951e-06, + "loss": 2.5142, + "mean_token_accuracy": 0.49154471544715445, + "step": 10204 + }, + { + "epoch": 1.8919169447534299, + "grad_norm": 6.11328125, + "learning_rate": 8.10808305524657e-06, + "loss": 2.682, + "mean_token_accuracy": 0.4906141603754336, + "step": 10205 + }, + { + "epoch": 1.8921023359288098, + "grad_norm": 7.28125, + "learning_rate": 8.10789766407119e-06, + "loss": 3.5345, + "mean_token_accuracy": 0.4322392414296134, + "step": 10206 + }, + { + "epoch": 1.8922877271041898, + "grad_norm": 5.94921875, + "learning_rate": 8.107712272895811e-06, + "loss": 4.0479, + "mean_token_accuracy": 0.3861627906976744, + "step": 10207 + }, + { + "epoch": 1.89247311827957, + "grad_norm": 7.0234375, + "learning_rate": 8.107526881720431e-06, + "loss": 3.6692, + "mean_token_accuracy": 0.42349911912183225, + "step": 10208 + }, + { + "epoch": 1.89265850945495, + "grad_norm": 7.46875, + "learning_rate": 8.10734149054505e-06, + "loss": 3.2852, + "mean_token_accuracy": 0.45218821148494814, + "step": 10209 + }, + { + "epoch": 1.89284390063033, + "grad_norm": 5.80859375, + "learning_rate": 8.10715609936967e-06, + "loss": 3.2614, + "mean_token_accuracy": 0.4279962103268593, + "step": 10210 + }, + { + "epoch": 1.8930292918057101, + "grad_norm": 5.921875, + "learning_rate": 8.106970708194291e-06, + "loss": 3.1745, + "mean_token_accuracy": 0.4562043795620438, + "step": 10211 + }, + { + "epoch": 1.89321468298109, + "grad_norm": 6.11328125, + "learning_rate": 8.10678531701891e-06, + "loss": 2.6065, + "mean_token_accuracy": 0.4901935316274841, + "step": 10212 + }, + { + "epoch": 1.89340007415647, + "grad_norm": 7.015625, + "learning_rate": 8.10659992584353e-06, + "loss": 2.9656, + "mean_token_accuracy": 0.4572724612232953, + "step": 10213 + }, + { + "epoch": 1.8935854653318502, + "grad_norm": 5.3125, + "learning_rate": 8.10641453466815e-06, + "loss": 2.4477, + "mean_token_accuracy": 0.5052414231257941, + "step": 10214 + }, + { + "epoch": 1.8937708565072302, + "grad_norm": 5.796875, + "learning_rate": 8.106229143492772e-06, + "loss": 3.4245, + "mean_token_accuracy": 0.41787858572381587, + "step": 10215 + }, + { + "epoch": 1.8939562476826102, + "grad_norm": 5.20703125, + "learning_rate": 8.10604375231739e-06, + "loss": 2.6053, + "mean_token_accuracy": 0.4931606343057741, + "step": 10216 + }, + { + "epoch": 1.8941416388579904, + "grad_norm": 5.02734375, + "learning_rate": 8.10585836114201e-06, + "loss": 2.7455, + "mean_token_accuracy": 0.46941544885177455, + "step": 10217 + }, + { + "epoch": 1.8943270300333706, + "grad_norm": 5.7578125, + "learning_rate": 8.10567296996663e-06, + "loss": 3.325, + "mean_token_accuracy": 0.42923850574712646, + "step": 10218 + }, + { + "epoch": 1.8945124212087505, + "grad_norm": 5.6484375, + "learning_rate": 8.10548757879125e-06, + "loss": 2.8436, + "mean_token_accuracy": 0.48911245865490627, + "step": 10219 + }, + { + "epoch": 1.8946978123841305, + "grad_norm": 5.375, + "learning_rate": 8.10530218761587e-06, + "loss": 3.3293, + "mean_token_accuracy": 0.43183415319747015, + "step": 10220 + }, + { + "epoch": 1.8948832035595107, + "grad_norm": 5.296875, + "learning_rate": 8.10511679644049e-06, + "loss": 3.2274, + "mean_token_accuracy": 0.4331697977743209, + "step": 10221 + }, + { + "epoch": 1.8950685947348906, + "grad_norm": 6.359375, + "learning_rate": 8.10493140526511e-06, + "loss": 3.0228, + "mean_token_accuracy": 0.4664996043260353, + "step": 10222 + }, + { + "epoch": 1.8952539859102706, + "grad_norm": 6.21875, + "learning_rate": 8.10474601408973e-06, + "loss": 2.8443, + "mean_token_accuracy": 0.4662805662805663, + "step": 10223 + }, + { + "epoch": 1.8954393770856508, + "grad_norm": 5.859375, + "learning_rate": 8.104560622914351e-06, + "loss": 3.2023, + "mean_token_accuracy": 0.4566687154271666, + "step": 10224 + }, + { + "epoch": 1.8956247682610308, + "grad_norm": 6.26953125, + "learning_rate": 8.10437523173897e-06, + "loss": 2.6744, + "mean_token_accuracy": 0.49914129586260736, + "step": 10225 + }, + { + "epoch": 1.8958101594364107, + "grad_norm": 7.0703125, + "learning_rate": 8.10418984056359e-06, + "loss": 2.3553, + "mean_token_accuracy": 0.5187844595460956, + "step": 10226 + }, + { + "epoch": 1.895995550611791, + "grad_norm": 5.7734375, + "learning_rate": 8.104004449388209e-06, + "loss": 2.9762, + "mean_token_accuracy": 0.46474358974358976, + "step": 10227 + }, + { + "epoch": 1.896180941787171, + "grad_norm": 11.4140625, + "learning_rate": 8.10381905821283e-06, + "loss": 2.8779, + "mean_token_accuracy": 0.43937694704049846, + "step": 10228 + }, + { + "epoch": 1.8963663329625509, + "grad_norm": 6.36328125, + "learning_rate": 8.10363366703745e-06, + "loss": 4.1701, + "mean_token_accuracy": 0.3883248730964467, + "step": 10229 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 10.15625, + "learning_rate": 8.103448275862069e-06, + "loss": 2.7331, + "mean_token_accuracy": 0.4723637789102313, + "step": 10230 + }, + { + "epoch": 1.8967371153133112, + "grad_norm": 6.6796875, + "learning_rate": 8.103262884686691e-06, + "loss": 3.0446, + "mean_token_accuracy": 0.4592933947772657, + "step": 10231 + }, + { + "epoch": 1.896922506488691, + "grad_norm": 7.41796875, + "learning_rate": 8.10307749351131e-06, + "loss": 3.2741, + "mean_token_accuracy": 0.43571988502743664, + "step": 10232 + }, + { + "epoch": 1.8971078976640712, + "grad_norm": 5.05078125, + "learning_rate": 8.10289210233593e-06, + "loss": 3.0238, + "mean_token_accuracy": 0.46801980198019805, + "step": 10233 + }, + { + "epoch": 1.8972932888394514, + "grad_norm": 5.5703125, + "learning_rate": 8.102706711160549e-06, + "loss": 3.1857, + "mean_token_accuracy": 0.4452408256880734, + "step": 10234 + }, + { + "epoch": 1.8974786800148313, + "grad_norm": 5.5625, + "learning_rate": 8.10252131998517e-06, + "loss": 2.776, + "mean_token_accuracy": 0.4755217220107626, + "step": 10235 + }, + { + "epoch": 1.8976640711902113, + "grad_norm": 6.79296875, + "learning_rate": 8.10233592880979e-06, + "loss": 2.7818, + "mean_token_accuracy": 0.48226846424384523, + "step": 10236 + }, + { + "epoch": 1.8978494623655915, + "grad_norm": 7.09375, + "learning_rate": 8.102150537634409e-06, + "loss": 2.9528, + "mean_token_accuracy": 0.4675055309734513, + "step": 10237 + }, + { + "epoch": 1.8980348535409715, + "grad_norm": 6.4765625, + "learning_rate": 8.10196514645903e-06, + "loss": 2.9773, + "mean_token_accuracy": 0.44704870444038275, + "step": 10238 + }, + { + "epoch": 1.8982202447163514, + "grad_norm": 5.7265625, + "learning_rate": 8.10177975528365e-06, + "loss": 2.5908, + "mean_token_accuracy": 0.5, + "step": 10239 + }, + { + "epoch": 1.8984056358917316, + "grad_norm": 7.66796875, + "learning_rate": 8.10159436410827e-06, + "loss": 3.0848, + "mean_token_accuracy": 0.4497307001795332, + "step": 10240 + }, + { + "epoch": 1.8985910270671116, + "grad_norm": 7.5703125, + "learning_rate": 8.101408972932889e-06, + "loss": 3.0003, + "mean_token_accuracy": 0.47577142857142857, + "step": 10241 + }, + { + "epoch": 1.8987764182424915, + "grad_norm": 5.84765625, + "learning_rate": 8.10122358175751e-06, + "loss": 2.5143, + "mean_token_accuracy": 0.4969033558980268, + "step": 10242 + }, + { + "epoch": 1.8989618094178717, + "grad_norm": 9.3671875, + "learning_rate": 8.101038190582128e-06, + "loss": 3.0333, + "mean_token_accuracy": 0.46844919786096256, + "step": 10243 + }, + { + "epoch": 1.899147200593252, + "grad_norm": 8.6953125, + "learning_rate": 8.100852799406749e-06, + "loss": 3.0233, + "mean_token_accuracy": 0.4796336796063423, + "step": 10244 + }, + { + "epoch": 1.8993325917686317, + "grad_norm": 6.50390625, + "learning_rate": 8.10066740823137e-06, + "loss": 2.7202, + "mean_token_accuracy": 0.49641932043272896, + "step": 10245 + }, + { + "epoch": 1.8995179829440119, + "grad_norm": 10.484375, + "learning_rate": 8.100482017055988e-06, + "loss": 2.6624, + "mean_token_accuracy": 0.5025544202576633, + "step": 10246 + }, + { + "epoch": 1.899703374119392, + "grad_norm": 6.5234375, + "learning_rate": 8.100296625880609e-06, + "loss": 2.9508, + "mean_token_accuracy": 0.4321632454755253, + "step": 10247 + }, + { + "epoch": 1.899888765294772, + "grad_norm": 11.546875, + "learning_rate": 8.100111234705229e-06, + "loss": 2.406, + "mean_token_accuracy": 0.5115220901218259, + "step": 10248 + }, + { + "epoch": 1.900074156470152, + "grad_norm": 6.73828125, + "learning_rate": 8.09992584352985e-06, + "loss": 3.5183, + "mean_token_accuracy": 0.41292829388908514, + "step": 10249 + }, + { + "epoch": 1.9002595476455322, + "grad_norm": 8.15625, + "learning_rate": 8.099740452354468e-06, + "loss": 3.1894, + "mean_token_accuracy": 0.4647098065376918, + "step": 10250 + }, + { + "epoch": 1.9004449388209121, + "grad_norm": 6.640625, + "learning_rate": 8.099555061179089e-06, + "loss": 3.3746, + "mean_token_accuracy": 0.4244885045349083, + "step": 10251 + }, + { + "epoch": 1.9006303299962921, + "grad_norm": 9.0390625, + "learning_rate": 8.099369670003708e-06, + "loss": 2.7523, + "mean_token_accuracy": 0.4865411348414002, + "step": 10252 + }, + { + "epoch": 1.9008157211716723, + "grad_norm": 6.78515625, + "learning_rate": 8.099184278828328e-06, + "loss": 2.3637, + "mean_token_accuracy": 0.5105720586327669, + "step": 10253 + }, + { + "epoch": 1.9010011123470523, + "grad_norm": 7.76171875, + "learning_rate": 8.098998887652949e-06, + "loss": 2.9138, + "mean_token_accuracy": 0.45197978096040436, + "step": 10254 + }, + { + "epoch": 1.9011865035224322, + "grad_norm": 8.3671875, + "learning_rate": 8.098813496477569e-06, + "loss": 2.8814, + "mean_token_accuracy": 0.45235294117647057, + "step": 10255 + }, + { + "epoch": 1.9013718946978124, + "grad_norm": 7.87109375, + "learning_rate": 8.098628105302188e-06, + "loss": 3.1355, + "mean_token_accuracy": 0.4411764705882353, + "step": 10256 + }, + { + "epoch": 1.9015572858731924, + "grad_norm": 6.16796875, + "learning_rate": 8.098442714126808e-06, + "loss": 2.8745, + "mean_token_accuracy": 0.47480106100795755, + "step": 10257 + }, + { + "epoch": 1.9017426770485724, + "grad_norm": 7.9140625, + "learning_rate": 8.098257322951429e-06, + "loss": 2.9194, + "mean_token_accuracy": 0.4636769248369752, + "step": 10258 + }, + { + "epoch": 1.9019280682239526, + "grad_norm": 7.90234375, + "learning_rate": 8.098071931776048e-06, + "loss": 2.6303, + "mean_token_accuracy": 0.5020851762921774, + "step": 10259 + }, + { + "epoch": 1.9021134593993327, + "grad_norm": 6.05078125, + "learning_rate": 8.097886540600668e-06, + "loss": 2.4992, + "mean_token_accuracy": 0.5027665068240502, + "step": 10260 + }, + { + "epoch": 1.9022988505747125, + "grad_norm": 5.68359375, + "learning_rate": 8.097701149425287e-06, + "loss": 2.5759, + "mean_token_accuracy": 0.5078587992785365, + "step": 10261 + }, + { + "epoch": 1.9024842417500927, + "grad_norm": 8.6484375, + "learning_rate": 8.097515758249908e-06, + "loss": 2.6642, + "mean_token_accuracy": 0.47744116499804967, + "step": 10262 + }, + { + "epoch": 1.9026696329254729, + "grad_norm": 6.89453125, + "learning_rate": 8.097330367074528e-06, + "loss": 3.0275, + "mean_token_accuracy": 0.4750856258976909, + "step": 10263 + }, + { + "epoch": 1.9028550241008528, + "grad_norm": 8.03125, + "learning_rate": 8.097144975899148e-06, + "loss": 2.2295, + "mean_token_accuracy": 0.5027366172740622, + "step": 10264 + }, + { + "epoch": 1.9030404152762328, + "grad_norm": 5.75390625, + "learning_rate": 8.096959584723767e-06, + "loss": 3.2299, + "mean_token_accuracy": 0.4493662943907642, + "step": 10265 + }, + { + "epoch": 1.903225806451613, + "grad_norm": 7.70703125, + "learning_rate": 8.096774193548388e-06, + "loss": 3.1596, + "mean_token_accuracy": 0.4524808332127875, + "step": 10266 + }, + { + "epoch": 1.903411197626993, + "grad_norm": 8.1875, + "learning_rate": 8.096588802373008e-06, + "loss": 2.4824, + "mean_token_accuracy": 0.5213193885760258, + "step": 10267 + }, + { + "epoch": 1.903596588802373, + "grad_norm": 8.078125, + "learning_rate": 8.096403411197627e-06, + "loss": 1.9392, + "mean_token_accuracy": 0.5655058043117744, + "step": 10268 + }, + { + "epoch": 1.9037819799777531, + "grad_norm": 6.98828125, + "learning_rate": 8.096218020022248e-06, + "loss": 2.8976, + "mean_token_accuracy": 0.4680600033709759, + "step": 10269 + }, + { + "epoch": 1.903967371153133, + "grad_norm": 8.234375, + "learning_rate": 8.096032628846866e-06, + "loss": 3.0843, + "mean_token_accuracy": 0.4384149046281236, + "step": 10270 + }, + { + "epoch": 1.904152762328513, + "grad_norm": 10.203125, + "learning_rate": 8.095847237671487e-06, + "loss": 2.8745, + "mean_token_accuracy": 0.441351888667992, + "step": 10271 + }, + { + "epoch": 1.9043381535038932, + "grad_norm": 9.7265625, + "learning_rate": 8.095661846496107e-06, + "loss": 2.5442, + "mean_token_accuracy": 0.5048254079663099, + "step": 10272 + }, + { + "epoch": 1.9045235446792734, + "grad_norm": 6.72265625, + "learning_rate": 8.095476455320728e-06, + "loss": 3.1269, + "mean_token_accuracy": 0.43475572047000616, + "step": 10273 + }, + { + "epoch": 1.9047089358546532, + "grad_norm": 10.2265625, + "learning_rate": 8.095291064145348e-06, + "loss": 2.5302, + "mean_token_accuracy": 0.5091968117719191, + "step": 10274 + }, + { + "epoch": 1.9048943270300334, + "grad_norm": 7.47265625, + "learning_rate": 8.095105672969967e-06, + "loss": 3.5961, + "mean_token_accuracy": 0.40953098106712565, + "step": 10275 + }, + { + "epoch": 1.9050797182054136, + "grad_norm": 7.27734375, + "learning_rate": 8.094920281794588e-06, + "loss": 2.9623, + "mean_token_accuracy": 0.4692132269099202, + "step": 10276 + }, + { + "epoch": 1.9052651093807935, + "grad_norm": 7.32421875, + "learning_rate": 8.094734890619206e-06, + "loss": 3.2354, + "mean_token_accuracy": 0.439851445180042, + "step": 10277 + }, + { + "epoch": 1.9054505005561735, + "grad_norm": 5.89453125, + "learning_rate": 8.094549499443827e-06, + "loss": 3.0347, + "mean_token_accuracy": 0.4633295978084921, + "step": 10278 + }, + { + "epoch": 1.9056358917315537, + "grad_norm": 6.83984375, + "learning_rate": 8.094364108268447e-06, + "loss": 2.9089, + "mean_token_accuracy": 0.4681907250163292, + "step": 10279 + }, + { + "epoch": 1.9058212829069336, + "grad_norm": 6.67578125, + "learning_rate": 8.094178717093068e-06, + "loss": 3.2072, + "mean_token_accuracy": 0.45765587445214195, + "step": 10280 + }, + { + "epoch": 1.9060066740823136, + "grad_norm": 5.55078125, + "learning_rate": 8.093993325917687e-06, + "loss": 2.7036, + "mean_token_accuracy": 0.47793589190743135, + "step": 10281 + }, + { + "epoch": 1.9061920652576938, + "grad_norm": 6.5859375, + "learning_rate": 8.093807934742307e-06, + "loss": 2.8824, + "mean_token_accuracy": 0.4811320754716981, + "step": 10282 + }, + { + "epoch": 1.9063774564330738, + "grad_norm": 7.06640625, + "learning_rate": 8.093622543566928e-06, + "loss": 2.8615, + "mean_token_accuracy": 0.47151898734177217, + "step": 10283 + }, + { + "epoch": 1.9065628476084537, + "grad_norm": 6.80859375, + "learning_rate": 8.093437152391546e-06, + "loss": 2.6127, + "mean_token_accuracy": 0.49407343162763806, + "step": 10284 + }, + { + "epoch": 1.906748238783834, + "grad_norm": 7.34765625, + "learning_rate": 8.093251761216167e-06, + "loss": 2.861, + "mean_token_accuracy": 0.4880141307090588, + "step": 10285 + }, + { + "epoch": 1.906933629959214, + "grad_norm": 6.515625, + "learning_rate": 8.093066370040786e-06, + "loss": 2.8208, + "mean_token_accuracy": 0.49135864765989656, + "step": 10286 + }, + { + "epoch": 1.9071190211345939, + "grad_norm": 6.43359375, + "learning_rate": 8.092880978865406e-06, + "loss": 2.7112, + "mean_token_accuracy": 0.488235982675875, + "step": 10287 + }, + { + "epoch": 1.907304412309974, + "grad_norm": 7.92578125, + "learning_rate": 8.092695587690027e-06, + "loss": 3.3615, + "mean_token_accuracy": 0.4293885332970176, + "step": 10288 + }, + { + "epoch": 1.9074898034853542, + "grad_norm": 6.91796875, + "learning_rate": 8.092510196514647e-06, + "loss": 2.9566, + "mean_token_accuracy": 0.4629401625215464, + "step": 10289 + }, + { + "epoch": 1.907675194660734, + "grad_norm": 6.5625, + "learning_rate": 8.092324805339266e-06, + "loss": 2.8477, + "mean_token_accuracy": 0.46130141045395284, + "step": 10290 + }, + { + "epoch": 1.9078605858361142, + "grad_norm": 7.1875, + "learning_rate": 8.092139414163887e-06, + "loss": 2.9925, + "mean_token_accuracy": 0.48203479576399394, + "step": 10291 + }, + { + "epoch": 1.9080459770114944, + "grad_norm": 8.234375, + "learning_rate": 8.091954022988507e-06, + "loss": 2.7513, + "mean_token_accuracy": 0.4864825791528119, + "step": 10292 + }, + { + "epoch": 1.9082313681868743, + "grad_norm": 9.1953125, + "learning_rate": 8.091768631813126e-06, + "loss": 2.8324, + "mean_token_accuracy": 0.4657139692035006, + "step": 10293 + }, + { + "epoch": 1.9084167593622543, + "grad_norm": 9.671875, + "learning_rate": 8.091583240637746e-06, + "loss": 2.9701, + "mean_token_accuracy": 0.4721758128775222, + "step": 10294 + }, + { + "epoch": 1.9086021505376345, + "grad_norm": 10.4921875, + "learning_rate": 8.091397849462365e-06, + "loss": 2.7357, + "mean_token_accuracy": 0.4798964968152866, + "step": 10295 + }, + { + "epoch": 1.9087875417130145, + "grad_norm": 9.765625, + "learning_rate": 8.091212458286987e-06, + "loss": 2.5448, + "mean_token_accuracy": 0.48547666335650447, + "step": 10296 + }, + { + "epoch": 1.9089729328883944, + "grad_norm": 5.48828125, + "learning_rate": 8.091027067111606e-06, + "loss": 4.1756, + "mean_token_accuracy": 0.37635776494764656, + "step": 10297 + }, + { + "epoch": 1.9091583240637746, + "grad_norm": 9.2265625, + "learning_rate": 8.090841675936227e-06, + "loss": 2.7633, + "mean_token_accuracy": 0.4745273950656841, + "step": 10298 + }, + { + "epoch": 1.9093437152391546, + "grad_norm": 7.12890625, + "learning_rate": 8.090656284760845e-06, + "loss": 3.1784, + "mean_token_accuracy": 0.4439259604915228, + "step": 10299 + }, + { + "epoch": 1.9095291064145345, + "grad_norm": 8.71875, + "learning_rate": 8.090470893585466e-06, + "loss": 2.5782, + "mean_token_accuracy": 0.48118939883645767, + "step": 10300 + }, + { + "epoch": 1.9097144975899147, + "grad_norm": 6.0859375, + "learning_rate": 8.090285502410086e-06, + "loss": 2.1835, + "mean_token_accuracy": 0.5498420720151611, + "step": 10301 + }, + { + "epoch": 1.909899888765295, + "grad_norm": 9.515625, + "learning_rate": 8.090100111234705e-06, + "loss": 2.4372, + "mean_token_accuracy": 0.5194482415807133, + "step": 10302 + }, + { + "epoch": 1.9100852799406747, + "grad_norm": 10.984375, + "learning_rate": 8.089914720059326e-06, + "loss": 2.9623, + "mean_token_accuracy": 0.451738006320023, + "step": 10303 + }, + { + "epoch": 1.9102706711160549, + "grad_norm": 7.359375, + "learning_rate": 8.089729328883946e-06, + "loss": 3.4499, + "mean_token_accuracy": 0.44393766628658304, + "step": 10304 + }, + { + "epoch": 1.910456062291435, + "grad_norm": 8.8984375, + "learning_rate": 8.089543937708567e-06, + "loss": 2.9217, + "mean_token_accuracy": 0.4655773158716809, + "step": 10305 + }, + { + "epoch": 1.910641453466815, + "grad_norm": 10.921875, + "learning_rate": 8.089358546533185e-06, + "loss": 3.0264, + "mean_token_accuracy": 0.46019988242210463, + "step": 10306 + }, + { + "epoch": 1.910826844642195, + "grad_norm": 6.46875, + "learning_rate": 8.089173155357806e-06, + "loss": 3.2135, + "mean_token_accuracy": 0.4380597014925373, + "step": 10307 + }, + { + "epoch": 1.9110122358175752, + "grad_norm": 8.4375, + "learning_rate": 8.088987764182425e-06, + "loss": 3.6797, + "mean_token_accuracy": 0.3981517636339971, + "step": 10308 + }, + { + "epoch": 1.9111976269929551, + "grad_norm": 9.6015625, + "learning_rate": 8.088802373007045e-06, + "loss": 3.0903, + "mean_token_accuracy": 0.46560477849676457, + "step": 10309 + }, + { + "epoch": 1.9113830181683351, + "grad_norm": 10.5, + "learning_rate": 8.088616981831666e-06, + "loss": 3.372, + "mean_token_accuracy": 0.43158002876515306, + "step": 10310 + }, + { + "epoch": 1.9115684093437153, + "grad_norm": 7.09765625, + "learning_rate": 8.088431590656284e-06, + "loss": 2.5558, + "mean_token_accuracy": 0.4810109100952907, + "step": 10311 + }, + { + "epoch": 1.9117538005190953, + "grad_norm": 12.109375, + "learning_rate": 8.088246199480907e-06, + "loss": 2.385, + "mean_token_accuracy": 0.5205712342740565, + "step": 10312 + }, + { + "epoch": 1.9119391916944752, + "grad_norm": 14.5390625, + "learning_rate": 8.088060808305525e-06, + "loss": 2.6275, + "mean_token_accuracy": 0.47574039067422813, + "step": 10313 + }, + { + "epoch": 1.9121245828698554, + "grad_norm": 8.2421875, + "learning_rate": 8.087875417130146e-06, + "loss": 2.5504, + "mean_token_accuracy": 0.5081716637272193, + "step": 10314 + }, + { + "epoch": 1.9123099740452354, + "grad_norm": 7.81640625, + "learning_rate": 8.087690025954765e-06, + "loss": 2.3461, + "mean_token_accuracy": 0.5279630123588512, + "step": 10315 + }, + { + "epoch": 1.9124953652206154, + "grad_norm": 8.2578125, + "learning_rate": 8.087504634779385e-06, + "loss": 3.0193, + "mean_token_accuracy": 0.4689250102445021, + "step": 10316 + }, + { + "epoch": 1.9126807563959956, + "grad_norm": 8.125, + "learning_rate": 8.087319243604006e-06, + "loss": 2.9256, + "mean_token_accuracy": 0.448177591120444, + "step": 10317 + }, + { + "epoch": 1.9128661475713757, + "grad_norm": 7.875, + "learning_rate": 8.087133852428625e-06, + "loss": 2.9542, + "mean_token_accuracy": 0.4679600235155791, + "step": 10318 + }, + { + "epoch": 1.9130515387467557, + "grad_norm": 14.765625, + "learning_rate": 8.086948461253245e-06, + "loss": 2.9623, + "mean_token_accuracy": 0.4629825489159175, + "step": 10319 + }, + { + "epoch": 1.9132369299221357, + "grad_norm": 6.55078125, + "learning_rate": 8.086763070077866e-06, + "loss": 2.7826, + "mean_token_accuracy": 0.4946314567614626, + "step": 10320 + }, + { + "epoch": 1.9134223210975159, + "grad_norm": 6.3203125, + "learning_rate": 8.086577678902486e-06, + "loss": 3.3196, + "mean_token_accuracy": 0.44623282919383933, + "step": 10321 + }, + { + "epoch": 1.9136077122728958, + "grad_norm": 7.87109375, + "learning_rate": 8.086392287727105e-06, + "loss": 2.6543, + "mean_token_accuracy": 0.49813200498132004, + "step": 10322 + }, + { + "epoch": 1.9137931034482758, + "grad_norm": 7.8515625, + "learning_rate": 8.086206896551725e-06, + "loss": 2.7293, + "mean_token_accuracy": 0.4696587438587173, + "step": 10323 + }, + { + "epoch": 1.913978494623656, + "grad_norm": 7.09765625, + "learning_rate": 8.086021505376344e-06, + "loss": 2.5154, + "mean_token_accuracy": 0.5053430220132507, + "step": 10324 + }, + { + "epoch": 1.914163885799036, + "grad_norm": 10.984375, + "learning_rate": 8.085836114200965e-06, + "loss": 2.7474, + "mean_token_accuracy": 0.5030978934324659, + "step": 10325 + }, + { + "epoch": 1.914349276974416, + "grad_norm": 6.8203125, + "learning_rate": 8.085650723025585e-06, + "loss": 3.2159, + "mean_token_accuracy": 0.4538786396977106, + "step": 10326 + }, + { + "epoch": 1.9145346681497961, + "grad_norm": 7.81640625, + "learning_rate": 8.085465331850204e-06, + "loss": 2.9475, + "mean_token_accuracy": 0.46917450365726227, + "step": 10327 + }, + { + "epoch": 1.914720059325176, + "grad_norm": 6.09375, + "learning_rate": 8.085279940674824e-06, + "loss": 2.8872, + "mean_token_accuracy": 0.48288075560802834, + "step": 10328 + }, + { + "epoch": 1.914905450500556, + "grad_norm": 10.9296875, + "learning_rate": 8.085094549499445e-06, + "loss": 3.072, + "mean_token_accuracy": 0.4654073991281994, + "step": 10329 + }, + { + "epoch": 1.9150908416759362, + "grad_norm": 8.6015625, + "learning_rate": 8.084909158324065e-06, + "loss": 3.0468, + "mean_token_accuracy": 0.44217391304347825, + "step": 10330 + }, + { + "epoch": 1.9152762328513164, + "grad_norm": 6.32421875, + "learning_rate": 8.084723767148684e-06, + "loss": 2.6005, + "mean_token_accuracy": 0.48831677626976355, + "step": 10331 + }, + { + "epoch": 1.9154616240266962, + "grad_norm": 6.13671875, + "learning_rate": 8.084538375973305e-06, + "loss": 2.7467, + "mean_token_accuracy": 0.467689620758483, + "step": 10332 + }, + { + "epoch": 1.9156470152020764, + "grad_norm": 7.578125, + "learning_rate": 8.084352984797923e-06, + "loss": 2.4015, + "mean_token_accuracy": 0.5086884075220186, + "step": 10333 + }, + { + "epoch": 1.9158324063774566, + "grad_norm": 6.43359375, + "learning_rate": 8.084167593622544e-06, + "loss": 2.8891, + "mean_token_accuracy": 0.4600989979476035, + "step": 10334 + }, + { + "epoch": 1.9160177975528365, + "grad_norm": 5.88671875, + "learning_rate": 8.083982202447164e-06, + "loss": 3.3017, + "mean_token_accuracy": 0.46669391091132, + "step": 10335 + }, + { + "epoch": 1.9162031887282165, + "grad_norm": 5.78125, + "learning_rate": 8.083796811271785e-06, + "loss": 3.0785, + "mean_token_accuracy": 0.45549230955552866, + "step": 10336 + }, + { + "epoch": 1.9163885799035967, + "grad_norm": 7.0859375, + "learning_rate": 8.083611420096404e-06, + "loss": 2.1819, + "mean_token_accuracy": 0.555714968914395, + "step": 10337 + }, + { + "epoch": 1.9165739710789766, + "grad_norm": 6.22265625, + "learning_rate": 8.083426028921024e-06, + "loss": 2.4666, + "mean_token_accuracy": 0.5310273351470398, + "step": 10338 + }, + { + "epoch": 1.9167593622543566, + "grad_norm": 5.66015625, + "learning_rate": 8.083240637745645e-06, + "loss": 3.443, + "mean_token_accuracy": 0.434717880005955, + "step": 10339 + }, + { + "epoch": 1.9169447534297368, + "grad_norm": 5.9609375, + "learning_rate": 8.083055246570263e-06, + "loss": 2.8922, + "mean_token_accuracy": 0.4551706571574121, + "step": 10340 + }, + { + "epoch": 1.9171301446051168, + "grad_norm": 6.5625, + "learning_rate": 8.082869855394884e-06, + "loss": 2.1332, + "mean_token_accuracy": 0.5711869781207061, + "step": 10341 + }, + { + "epoch": 1.9173155357804967, + "grad_norm": 7.94140625, + "learning_rate": 8.082684464219503e-06, + "loss": 1.8187, + "mean_token_accuracy": 0.6020258367586612, + "step": 10342 + }, + { + "epoch": 1.917500926955877, + "grad_norm": 8.875, + "learning_rate": 8.082499073044123e-06, + "loss": 2.6121, + "mean_token_accuracy": 0.4914769340803433, + "step": 10343 + }, + { + "epoch": 1.9176863181312571, + "grad_norm": 5.1953125, + "learning_rate": 8.082313681868744e-06, + "loss": 3.0569, + "mean_token_accuracy": 0.4718068018167719, + "step": 10344 + }, + { + "epoch": 1.9178717093066369, + "grad_norm": 7.04296875, + "learning_rate": 8.082128290693364e-06, + "loss": 3.9936, + "mean_token_accuracy": 0.4157936351057521, + "step": 10345 + }, + { + "epoch": 1.918057100482017, + "grad_norm": 6.9609375, + "learning_rate": 8.081942899517983e-06, + "loss": 2.5488, + "mean_token_accuracy": 0.5059743954480797, + "step": 10346 + }, + { + "epoch": 1.9182424916573972, + "grad_norm": 8.0234375, + "learning_rate": 8.081757508342604e-06, + "loss": 3.1018, + "mean_token_accuracy": 0.46181242580134546, + "step": 10347 + }, + { + "epoch": 1.9184278828327772, + "grad_norm": 5.16796875, + "learning_rate": 8.081572117167224e-06, + "loss": 2.9348, + "mean_token_accuracy": 0.4611624834874505, + "step": 10348 + }, + { + "epoch": 1.9186132740081572, + "grad_norm": 7.24609375, + "learning_rate": 8.081386725991843e-06, + "loss": 3.824, + "mean_token_accuracy": 0.404228280961183, + "step": 10349 + }, + { + "epoch": 1.9187986651835374, + "grad_norm": 8.7890625, + "learning_rate": 8.081201334816463e-06, + "loss": 2.3782, + "mean_token_accuracy": 0.5364560318161732, + "step": 10350 + }, + { + "epoch": 1.9189840563589173, + "grad_norm": 5.96875, + "learning_rate": 8.081015943641082e-06, + "loss": 3.5373, + "mean_token_accuracy": 0.4282586483725004, + "step": 10351 + }, + { + "epoch": 1.9191694475342973, + "grad_norm": 5.68359375, + "learning_rate": 8.080830552465704e-06, + "loss": 2.7985, + "mean_token_accuracy": 0.4622412562455389, + "step": 10352 + }, + { + "epoch": 1.9193548387096775, + "grad_norm": 10.234375, + "learning_rate": 8.080645161290323e-06, + "loss": 2.9669, + "mean_token_accuracy": 0.45303446172613604, + "step": 10353 + }, + { + "epoch": 1.9195402298850575, + "grad_norm": 6.47265625, + "learning_rate": 8.080459770114944e-06, + "loss": 3.0813, + "mean_token_accuracy": 0.4490177736202058, + "step": 10354 + }, + { + "epoch": 1.9197256210604374, + "grad_norm": 6.60546875, + "learning_rate": 8.080274378939564e-06, + "loss": 3.3202, + "mean_token_accuracy": 0.4305711786026747, + "step": 10355 + }, + { + "epoch": 1.9199110122358176, + "grad_norm": 9.09375, + "learning_rate": 8.080088987764183e-06, + "loss": 3.046, + "mean_token_accuracy": 0.44542228530872957, + "step": 10356 + }, + { + "epoch": 1.9200964034111976, + "grad_norm": 6.35546875, + "learning_rate": 8.079903596588803e-06, + "loss": 3.2808, + "mean_token_accuracy": 0.4639618138424821, + "step": 10357 + }, + { + "epoch": 1.9202817945865776, + "grad_norm": 7.02734375, + "learning_rate": 8.079718205413422e-06, + "loss": 2.6198, + "mean_token_accuracy": 0.4948529411764706, + "step": 10358 + }, + { + "epoch": 1.9204671857619577, + "grad_norm": 6.1875, + "learning_rate": 8.079532814238043e-06, + "loss": 2.846, + "mean_token_accuracy": 0.4687459389213775, + "step": 10359 + }, + { + "epoch": 1.920652576937338, + "grad_norm": 6.48046875, + "learning_rate": 8.079347423062663e-06, + "loss": 2.364, + "mean_token_accuracy": 0.5096587250482936, + "step": 10360 + }, + { + "epoch": 1.9208379681127177, + "grad_norm": 5.328125, + "learning_rate": 8.079162031887284e-06, + "loss": 3.7651, + "mean_token_accuracy": 0.41081081081081083, + "step": 10361 + }, + { + "epoch": 1.9210233592880979, + "grad_norm": 6.08203125, + "learning_rate": 8.078976640711902e-06, + "loss": 2.8441, + "mean_token_accuracy": 0.4825749167591565, + "step": 10362 + }, + { + "epoch": 1.921208750463478, + "grad_norm": 6.55859375, + "learning_rate": 8.078791249536523e-06, + "loss": 2.6774, + "mean_token_accuracy": 0.4939595660749507, + "step": 10363 + }, + { + "epoch": 1.921394141638858, + "grad_norm": 6.38671875, + "learning_rate": 8.078605858361143e-06, + "loss": 2.8022, + "mean_token_accuracy": 0.46705619094977624, + "step": 10364 + }, + { + "epoch": 1.921579532814238, + "grad_norm": 6.93359375, + "learning_rate": 8.078420467185762e-06, + "loss": 3.0185, + "mean_token_accuracy": 0.4496509129967777, + "step": 10365 + }, + { + "epoch": 1.9217649239896182, + "grad_norm": 5.21875, + "learning_rate": 8.078235076010383e-06, + "loss": 2.5587, + "mean_token_accuracy": 0.5148629148629149, + "step": 10366 + }, + { + "epoch": 1.9219503151649981, + "grad_norm": 5.7421875, + "learning_rate": 8.078049684835001e-06, + "loss": 2.9501, + "mean_token_accuracy": 0.47554617676266137, + "step": 10367 + }, + { + "epoch": 1.9221357063403781, + "grad_norm": 6.28515625, + "learning_rate": 8.077864293659624e-06, + "loss": 2.6439, + "mean_token_accuracy": 0.48576688854269934, + "step": 10368 + }, + { + "epoch": 1.9223210975157583, + "grad_norm": 5.78515625, + "learning_rate": 8.077678902484242e-06, + "loss": 3.3892, + "mean_token_accuracy": 0.4363131079203335, + "step": 10369 + }, + { + "epoch": 1.9225064886911383, + "grad_norm": 6.953125, + "learning_rate": 8.077493511308863e-06, + "loss": 3.2938, + "mean_token_accuracy": 0.45559766465225854, + "step": 10370 + }, + { + "epoch": 1.9226918798665182, + "grad_norm": 7.58203125, + "learning_rate": 8.077308120133482e-06, + "loss": 2.7635, + "mean_token_accuracy": 0.48502276539659717, + "step": 10371 + }, + { + "epoch": 1.9228772710418984, + "grad_norm": 5.96484375, + "learning_rate": 8.077122728958102e-06, + "loss": 2.1215, + "mean_token_accuracy": 0.5575916230366492, + "step": 10372 + }, + { + "epoch": 1.9230626622172786, + "grad_norm": 6.77734375, + "learning_rate": 8.076937337782723e-06, + "loss": 2.7117, + "mean_token_accuracy": 0.45497197966896913, + "step": 10373 + }, + { + "epoch": 1.9232480533926584, + "grad_norm": 6.54296875, + "learning_rate": 8.076751946607342e-06, + "loss": 3.0793, + "mean_token_accuracy": 0.43742203742203745, + "step": 10374 + }, + { + "epoch": 1.9234334445680386, + "grad_norm": 6.67578125, + "learning_rate": 8.076566555431962e-06, + "loss": 2.7692, + "mean_token_accuracy": 0.46850344395742016, + "step": 10375 + }, + { + "epoch": 1.9236188357434187, + "grad_norm": 8.203125, + "learning_rate": 8.076381164256583e-06, + "loss": 2.7727, + "mean_token_accuracy": 0.468324960176449, + "step": 10376 + }, + { + "epoch": 1.9238042269187987, + "grad_norm": 6.140625, + "learning_rate": 8.076195773081203e-06, + "loss": 2.9475, + "mean_token_accuracy": 0.46644676979071886, + "step": 10377 + }, + { + "epoch": 1.9239896180941787, + "grad_norm": 8.640625, + "learning_rate": 8.076010381905822e-06, + "loss": 3.1439, + "mean_token_accuracy": 0.46116504854368934, + "step": 10378 + }, + { + "epoch": 1.9241750092695589, + "grad_norm": 6.578125, + "learning_rate": 8.075824990730442e-06, + "loss": 2.471, + "mean_token_accuracy": 0.5143592522351667, + "step": 10379 + }, + { + "epoch": 1.9243604004449388, + "grad_norm": 9.046875, + "learning_rate": 8.075639599555061e-06, + "loss": 2.2921, + "mean_token_accuracy": 0.5141073982577038, + "step": 10380 + }, + { + "epoch": 1.9245457916203188, + "grad_norm": 7.1796875, + "learning_rate": 8.075454208379682e-06, + "loss": 2.9535, + "mean_token_accuracy": 0.4465894465894466, + "step": 10381 + }, + { + "epoch": 1.924731182795699, + "grad_norm": 6.5859375, + "learning_rate": 8.075268817204302e-06, + "loss": 2.8682, + "mean_token_accuracy": 0.46201329534662866, + "step": 10382 + }, + { + "epoch": 1.924916573971079, + "grad_norm": 6.65625, + "learning_rate": 8.075083426028921e-06, + "loss": 2.8094, + "mean_token_accuracy": 0.46615656268393174, + "step": 10383 + }, + { + "epoch": 1.925101965146459, + "grad_norm": 10.2578125, + "learning_rate": 8.074898034853541e-06, + "loss": 2.0601, + "mean_token_accuracy": 0.575619448340346, + "step": 10384 + }, + { + "epoch": 1.9252873563218391, + "grad_norm": 6.4921875, + "learning_rate": 8.074712643678162e-06, + "loss": 3.3101, + "mean_token_accuracy": 0.4569380549909062, + "step": 10385 + }, + { + "epoch": 1.925472747497219, + "grad_norm": 6.09375, + "learning_rate": 8.074527252502782e-06, + "loss": 3.1516, + "mean_token_accuracy": 0.43795798729848556, + "step": 10386 + }, + { + "epoch": 1.925658138672599, + "grad_norm": 6.73046875, + "learning_rate": 8.074341861327401e-06, + "loss": 2.7083, + "mean_token_accuracy": 0.4701336338753824, + "step": 10387 + }, + { + "epoch": 1.9258435298479792, + "grad_norm": 5.203125, + "learning_rate": 8.074156470152022e-06, + "loss": 2.9933, + "mean_token_accuracy": 0.4438324727481354, + "step": 10388 + }, + { + "epoch": 1.9260289210233594, + "grad_norm": 6.15234375, + "learning_rate": 8.07397107897664e-06, + "loss": 3.0362, + "mean_token_accuracy": 0.4450777202072539, + "step": 10389 + }, + { + "epoch": 1.9262143121987392, + "grad_norm": 5.48046875, + "learning_rate": 8.073785687801261e-06, + "loss": 3.0521, + "mean_token_accuracy": 0.4445106687328645, + "step": 10390 + }, + { + "epoch": 1.9263997033741194, + "grad_norm": 8.296875, + "learning_rate": 8.073600296625881e-06, + "loss": 2.3927, + "mean_token_accuracy": 0.4922572043892934, + "step": 10391 + }, + { + "epoch": 1.9265850945494996, + "grad_norm": 6.8046875, + "learning_rate": 8.073414905450502e-06, + "loss": 3.6478, + "mean_token_accuracy": 0.43342911877394635, + "step": 10392 + }, + { + "epoch": 1.9267704857248795, + "grad_norm": 6.76171875, + "learning_rate": 8.073229514275122e-06, + "loss": 2.445, + "mean_token_accuracy": 0.5221228923842933, + "step": 10393 + }, + { + "epoch": 1.9269558769002595, + "grad_norm": 7.4375, + "learning_rate": 8.073044123099741e-06, + "loss": 3.6617, + "mean_token_accuracy": 0.4127818508784064, + "step": 10394 + }, + { + "epoch": 1.9271412680756397, + "grad_norm": 7.9453125, + "learning_rate": 8.072858731924362e-06, + "loss": 2.593, + "mean_token_accuracy": 0.5104820403258112, + "step": 10395 + }, + { + "epoch": 1.9273266592510196, + "grad_norm": 6.125, + "learning_rate": 8.07267334074898e-06, + "loss": 3.2392, + "mean_token_accuracy": 0.44300991131977047, + "step": 10396 + }, + { + "epoch": 1.9275120504263996, + "grad_norm": 14.2109375, + "learning_rate": 8.072487949573601e-06, + "loss": 3.0613, + "mean_token_accuracy": 0.45460835870467753, + "step": 10397 + }, + { + "epoch": 1.9276974416017798, + "grad_norm": 10.4765625, + "learning_rate": 8.072302558398221e-06, + "loss": 3.2837, + "mean_token_accuracy": 0.42866733803289697, + "step": 10398 + }, + { + "epoch": 1.9278828327771598, + "grad_norm": 11.25, + "learning_rate": 8.07211716722284e-06, + "loss": 3.3254, + "mean_token_accuracy": 0.42421848549825075, + "step": 10399 + }, + { + "epoch": 1.9280682239525397, + "grad_norm": 7.78125, + "learning_rate": 8.07193177604746e-06, + "loss": 3.5773, + "mean_token_accuracy": 0.42247658688865763, + "step": 10400 + }, + { + "epoch": 1.92825361512792, + "grad_norm": 18.515625, + "learning_rate": 8.071746384872081e-06, + "loss": 2.0805, + "mean_token_accuracy": 0.5460359760159893, + "step": 10401 + }, + { + "epoch": 1.9284390063033001, + "grad_norm": 11.8359375, + "learning_rate": 8.071560993696702e-06, + "loss": 2.8686, + "mean_token_accuracy": 0.45147820277267414, + "step": 10402 + }, + { + "epoch": 1.9286243974786799, + "grad_norm": 13.34375, + "learning_rate": 8.07137560252132e-06, + "loss": 2.4216, + "mean_token_accuracy": 0.5168491397696573, + "step": 10403 + }, + { + "epoch": 1.92880978865406, + "grad_norm": 5.61328125, + "learning_rate": 8.071190211345941e-06, + "loss": 2.9512, + "mean_token_accuracy": 0.4650275540483256, + "step": 10404 + }, + { + "epoch": 1.9289951798294402, + "grad_norm": 7.24609375, + "learning_rate": 8.07100482017056e-06, + "loss": 3.0431, + "mean_token_accuracy": 0.4736293516781322, + "step": 10405 + }, + { + "epoch": 1.9291805710048202, + "grad_norm": 10.0, + "learning_rate": 8.07081942899518e-06, + "loss": 3.2925, + "mean_token_accuracy": 0.4232345707928329, + "step": 10406 + }, + { + "epoch": 1.9293659621802002, + "grad_norm": 8.5859375, + "learning_rate": 8.0706340378198e-06, + "loss": 2.8336, + "mean_token_accuracy": 0.463197803634462, + "step": 10407 + }, + { + "epoch": 1.9295513533555804, + "grad_norm": 5.9609375, + "learning_rate": 8.07044864664442e-06, + "loss": 3.046, + "mean_token_accuracy": 0.4464165658195509, + "step": 10408 + }, + { + "epoch": 1.9297367445309603, + "grad_norm": 8.46875, + "learning_rate": 8.07026325546904e-06, + "loss": 2.7676, + "mean_token_accuracy": 0.5019873399087296, + "step": 10409 + }, + { + "epoch": 1.9299221357063403, + "grad_norm": 6.6953125, + "learning_rate": 8.07007786429366e-06, + "loss": 2.5103, + "mean_token_accuracy": 0.502814845704754, + "step": 10410 + }, + { + "epoch": 1.9301075268817205, + "grad_norm": 12.1796875, + "learning_rate": 8.069892473118281e-06, + "loss": 2.8528, + "mean_token_accuracy": 0.45404696626534746, + "step": 10411 + }, + { + "epoch": 1.9302929180571005, + "grad_norm": 6.75, + "learning_rate": 8.0697070819429e-06, + "loss": 2.9298, + "mean_token_accuracy": 0.4493141877411059, + "step": 10412 + }, + { + "epoch": 1.9304783092324804, + "grad_norm": 10.609375, + "learning_rate": 8.06952169076752e-06, + "loss": 4.0295, + "mean_token_accuracy": 0.41983695652173914, + "step": 10413 + }, + { + "epoch": 1.9306637004078606, + "grad_norm": 8.265625, + "learning_rate": 8.06933629959214e-06, + "loss": 3.1219, + "mean_token_accuracy": 0.42972315181016124, + "step": 10414 + }, + { + "epoch": 1.9308490915832406, + "grad_norm": 8.8515625, + "learning_rate": 8.06915090841676e-06, + "loss": 2.5215, + "mean_token_accuracy": 0.49205461056401073, + "step": 10415 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 6.15625, + "learning_rate": 8.06896551724138e-06, + "loss": 2.9309, + "mean_token_accuracy": 0.4779299847792998, + "step": 10416 + }, + { + "epoch": 1.9312198739340007, + "grad_norm": 6.640625, + "learning_rate": 8.068780126066e-06, + "loss": 2.6323, + "mean_token_accuracy": 0.5032974661575842, + "step": 10417 + }, + { + "epoch": 1.931405265109381, + "grad_norm": 8.15625, + "learning_rate": 8.06859473489062e-06, + "loss": 2.5085, + "mean_token_accuracy": 0.49014014634751335, + "step": 10418 + }, + { + "epoch": 1.931590656284761, + "grad_norm": 6.390625, + "learning_rate": 8.06840934371524e-06, + "loss": 3.2134, + "mean_token_accuracy": 0.4583525080533824, + "step": 10419 + }, + { + "epoch": 1.9317760474601409, + "grad_norm": 5.9296875, + "learning_rate": 8.06822395253986e-06, + "loss": 2.9791, + "mean_token_accuracy": 0.44430596666147376, + "step": 10420 + }, + { + "epoch": 1.931961438635521, + "grad_norm": 7.0390625, + "learning_rate": 8.06803856136448e-06, + "loss": 3.1352, + "mean_token_accuracy": 0.43994928538497, + "step": 10421 + }, + { + "epoch": 1.932146829810901, + "grad_norm": 5.17578125, + "learning_rate": 8.0678531701891e-06, + "loss": 2.8088, + "mean_token_accuracy": 0.488679476196304, + "step": 10422 + }, + { + "epoch": 1.932332220986281, + "grad_norm": 7.86328125, + "learning_rate": 8.067667779013719e-06, + "loss": 3.2063, + "mean_token_accuracy": 0.4427150886294855, + "step": 10423 + }, + { + "epoch": 1.9325176121616612, + "grad_norm": 7.1328125, + "learning_rate": 8.067482387838339e-06, + "loss": 3.2191, + "mean_token_accuracy": 0.45500750837472564, + "step": 10424 + }, + { + "epoch": 1.9327030033370411, + "grad_norm": 6.20703125, + "learning_rate": 8.06729699666296e-06, + "loss": 2.7669, + "mean_token_accuracy": 0.48097795041699987, + "step": 10425 + }, + { + "epoch": 1.9328883945124211, + "grad_norm": 6.09765625, + "learning_rate": 8.06711160548758e-06, + "loss": 3.2506, + "mean_token_accuracy": 0.45327954190525765, + "step": 10426 + }, + { + "epoch": 1.9330737856878013, + "grad_norm": 5.47265625, + "learning_rate": 8.066926214312199e-06, + "loss": 2.6359, + "mean_token_accuracy": 0.4958767067730161, + "step": 10427 + }, + { + "epoch": 1.9332591768631813, + "grad_norm": 6.15625, + "learning_rate": 8.06674082313682e-06, + "loss": 3.4798, + "mean_token_accuracy": 0.43817427385892116, + "step": 10428 + }, + { + "epoch": 1.9334445680385612, + "grad_norm": 7.2890625, + "learning_rate": 8.06655543196144e-06, + "loss": 2.6007, + "mean_token_accuracy": 0.4999341151666886, + "step": 10429 + }, + { + "epoch": 1.9336299592139414, + "grad_norm": 5.921875, + "learning_rate": 8.066370040786059e-06, + "loss": 2.9893, + "mean_token_accuracy": 0.4700162074554295, + "step": 10430 + }, + { + "epoch": 1.9338153503893216, + "grad_norm": 6.70703125, + "learning_rate": 8.066184649610679e-06, + "loss": 3.1471, + "mean_token_accuracy": 0.4241455347298787, + "step": 10431 + }, + { + "epoch": 1.9340007415647014, + "grad_norm": 7.80859375, + "learning_rate": 8.065999258435298e-06, + "loss": 2.6235, + "mean_token_accuracy": 0.472627953924712, + "step": 10432 + }, + { + "epoch": 1.9341861327400816, + "grad_norm": 5.07421875, + "learning_rate": 8.06581386725992e-06, + "loss": 2.9562, + "mean_token_accuracy": 0.4500165690931183, + "step": 10433 + }, + { + "epoch": 1.9343715239154617, + "grad_norm": 8.515625, + "learning_rate": 8.065628476084539e-06, + "loss": 3.4685, + "mean_token_accuracy": 0.4381852111033357, + "step": 10434 + }, + { + "epoch": 1.9345569150908417, + "grad_norm": 6.765625, + "learning_rate": 8.06544308490916e-06, + "loss": 2.1582, + "mean_token_accuracy": 0.5354643984085609, + "step": 10435 + }, + { + "epoch": 1.9347423062662217, + "grad_norm": 6.125, + "learning_rate": 8.06525769373378e-06, + "loss": 3.1293, + "mean_token_accuracy": 0.4457030660021145, + "step": 10436 + }, + { + "epoch": 1.9349276974416019, + "grad_norm": 6.50390625, + "learning_rate": 8.065072302558399e-06, + "loss": 3.2835, + "mean_token_accuracy": 0.45025536261491317, + "step": 10437 + }, + { + "epoch": 1.9351130886169818, + "grad_norm": 6.37109375, + "learning_rate": 8.064886911383019e-06, + "loss": 3.1872, + "mean_token_accuracy": 0.4247378931602596, + "step": 10438 + }, + { + "epoch": 1.9352984797923618, + "grad_norm": 8.1640625, + "learning_rate": 8.064701520207638e-06, + "loss": 2.3702, + "mean_token_accuracy": 0.5336713325250744, + "step": 10439 + }, + { + "epoch": 1.935483870967742, + "grad_norm": 6.32421875, + "learning_rate": 8.064516129032258e-06, + "loss": 2.5949, + "mean_token_accuracy": 0.48480614739783445, + "step": 10440 + }, + { + "epoch": 1.935669262143122, + "grad_norm": 9.453125, + "learning_rate": 8.064330737856879e-06, + "loss": 2.622, + "mean_token_accuracy": 0.47119110111236095, + "step": 10441 + }, + { + "epoch": 1.935854653318502, + "grad_norm": 6.109375, + "learning_rate": 8.0641453466815e-06, + "loss": 2.7047, + "mean_token_accuracy": 0.4800064236389915, + "step": 10442 + }, + { + "epoch": 1.9360400444938821, + "grad_norm": 6.05078125, + "learning_rate": 8.063959955506118e-06, + "loss": 3.3167, + "mean_token_accuracy": 0.42436431547191494, + "step": 10443 + }, + { + "epoch": 1.9362254356692623, + "grad_norm": 6.08203125, + "learning_rate": 8.063774564330739e-06, + "loss": 2.7186, + "mean_token_accuracy": 0.5052604234316145, + "step": 10444 + }, + { + "epoch": 1.936410826844642, + "grad_norm": 7.0, + "learning_rate": 8.06358917315536e-06, + "loss": 2.6626, + "mean_token_accuracy": 0.4682219557396708, + "step": 10445 + }, + { + "epoch": 1.9365962180200222, + "grad_norm": 7.9609375, + "learning_rate": 8.063403781979978e-06, + "loss": 3.2389, + "mean_token_accuracy": 0.45312157721796276, + "step": 10446 + }, + { + "epoch": 1.9367816091954024, + "grad_norm": 6.30859375, + "learning_rate": 8.063218390804598e-06, + "loss": 3.0901, + "mean_token_accuracy": 0.4410851666891618, + "step": 10447 + }, + { + "epoch": 1.9369670003707824, + "grad_norm": 5.26953125, + "learning_rate": 8.063032999629217e-06, + "loss": 3.1376, + "mean_token_accuracy": 0.45340022296544036, + "step": 10448 + }, + { + "epoch": 1.9371523915461624, + "grad_norm": 5.48046875, + "learning_rate": 8.06284760845384e-06, + "loss": 2.5845, + "mean_token_accuracy": 0.5070149423976275, + "step": 10449 + }, + { + "epoch": 1.9373377827215426, + "grad_norm": 8.9453125, + "learning_rate": 8.062662217278458e-06, + "loss": 2.4323, + "mean_token_accuracy": 0.5157000369412634, + "step": 10450 + }, + { + "epoch": 1.9375231738969225, + "grad_norm": 6.1953125, + "learning_rate": 8.062476826103079e-06, + "loss": 2.8212, + "mean_token_accuracy": 0.45074483421432004, + "step": 10451 + }, + { + "epoch": 1.9377085650723025, + "grad_norm": 5.91015625, + "learning_rate": 8.062291434927698e-06, + "loss": 3.0841, + "mean_token_accuracy": 0.44957632175677287, + "step": 10452 + }, + { + "epoch": 1.9378939562476827, + "grad_norm": 6.625, + "learning_rate": 8.062106043752318e-06, + "loss": 2.6862, + "mean_token_accuracy": 0.48737435645991667, + "step": 10453 + }, + { + "epoch": 1.9380793474230626, + "grad_norm": 6.59375, + "learning_rate": 8.061920652576938e-06, + "loss": 2.7423, + "mean_token_accuracy": 0.4707379134860051, + "step": 10454 + }, + { + "epoch": 1.9382647385984426, + "grad_norm": 5.45703125, + "learning_rate": 8.061735261401557e-06, + "loss": 2.9807, + "mean_token_accuracy": 0.4604999331640155, + "step": 10455 + }, + { + "epoch": 1.9384501297738228, + "grad_norm": 5.84375, + "learning_rate": 8.061549870226178e-06, + "loss": 2.5858, + "mean_token_accuracy": 0.5099452291726723, + "step": 10456 + }, + { + "epoch": 1.9386355209492028, + "grad_norm": 5.875, + "learning_rate": 8.061364479050798e-06, + "loss": 2.9005, + "mean_token_accuracy": 0.4639815215822145, + "step": 10457 + }, + { + "epoch": 1.9388209121245827, + "grad_norm": 6.2890625, + "learning_rate": 8.061179087875419e-06, + "loss": 2.7863, + "mean_token_accuracy": 0.4829837328767123, + "step": 10458 + }, + { + "epoch": 1.939006303299963, + "grad_norm": 6.47265625, + "learning_rate": 8.060993696700038e-06, + "loss": 3.2447, + "mean_token_accuracy": 0.4280918913248715, + "step": 10459 + }, + { + "epoch": 1.9391916944753431, + "grad_norm": 5.515625, + "learning_rate": 8.060808305524658e-06, + "loss": 2.981, + "mean_token_accuracy": 0.4626738228836302, + "step": 10460 + }, + { + "epoch": 1.9393770856507229, + "grad_norm": 6.046875, + "learning_rate": 8.060622914349277e-06, + "loss": 3.1321, + "mean_token_accuracy": 0.46447459186019774, + "step": 10461 + }, + { + "epoch": 1.939562476826103, + "grad_norm": 6.65625, + "learning_rate": 8.060437523173897e-06, + "loss": 2.5942, + "mean_token_accuracy": 0.47986822840409954, + "step": 10462 + }, + { + "epoch": 1.9397478680014832, + "grad_norm": 9.015625, + "learning_rate": 8.060252131998518e-06, + "loss": 3.3177, + "mean_token_accuracy": 0.4402570122327938, + "step": 10463 + }, + { + "epoch": 1.9399332591768632, + "grad_norm": 6.50390625, + "learning_rate": 8.060066740823137e-06, + "loss": 2.5836, + "mean_token_accuracy": 0.49836717307965356, + "step": 10464 + }, + { + "epoch": 1.9401186503522432, + "grad_norm": 6.38671875, + "learning_rate": 8.059881349647757e-06, + "loss": 3.0579, + "mean_token_accuracy": 0.4377117539744592, + "step": 10465 + }, + { + "epoch": 1.9403040415276234, + "grad_norm": 9.40625, + "learning_rate": 8.059695958472378e-06, + "loss": 2.9538, + "mean_token_accuracy": 0.4717967201297531, + "step": 10466 + }, + { + "epoch": 1.9404894327030033, + "grad_norm": 6.390625, + "learning_rate": 8.059510567296998e-06, + "loss": 2.8775, + "mean_token_accuracy": 0.47337278106508873, + "step": 10467 + }, + { + "epoch": 1.9406748238783833, + "grad_norm": 8.546875, + "learning_rate": 8.059325176121617e-06, + "loss": 2.4804, + "mean_token_accuracy": 0.4950964884530212, + "step": 10468 + }, + { + "epoch": 1.9408602150537635, + "grad_norm": 6.49609375, + "learning_rate": 8.059139784946237e-06, + "loss": 2.7158, + "mean_token_accuracy": 0.49169741697416974, + "step": 10469 + }, + { + "epoch": 1.9410456062291435, + "grad_norm": 8.1640625, + "learning_rate": 8.058954393770856e-06, + "loss": 2.8394, + "mean_token_accuracy": 0.4626769626769627, + "step": 10470 + }, + { + "epoch": 1.9412309974045234, + "grad_norm": 8.2109375, + "learning_rate": 8.058769002595477e-06, + "loss": 3.7991, + "mean_token_accuracy": 0.40977550440466043, + "step": 10471 + }, + { + "epoch": 1.9414163885799036, + "grad_norm": 5.78515625, + "learning_rate": 8.058583611420097e-06, + "loss": 2.5155, + "mean_token_accuracy": 0.4830421377183967, + "step": 10472 + }, + { + "epoch": 1.9416017797552838, + "grad_norm": 8.4140625, + "learning_rate": 8.058398220244718e-06, + "loss": 3.1837, + "mean_token_accuracy": 0.4492995330220147, + "step": 10473 + }, + { + "epoch": 1.9417871709306636, + "grad_norm": 6.68359375, + "learning_rate": 8.058212829069338e-06, + "loss": 2.9098, + "mean_token_accuracy": 0.45946335833814195, + "step": 10474 + }, + { + "epoch": 1.9419725621060437, + "grad_norm": 8.984375, + "learning_rate": 8.058027437893957e-06, + "loss": 3.5126, + "mean_token_accuracy": 0.39788499669530736, + "step": 10475 + }, + { + "epoch": 1.942157953281424, + "grad_norm": 8.90625, + "learning_rate": 8.057842046718577e-06, + "loss": 2.9853, + "mean_token_accuracy": 0.46144523557036815, + "step": 10476 + }, + { + "epoch": 1.942343344456804, + "grad_norm": 6.3671875, + "learning_rate": 8.057656655543196e-06, + "loss": 2.9652, + "mean_token_accuracy": 0.45551203133743706, + "step": 10477 + }, + { + "epoch": 1.9425287356321839, + "grad_norm": 8.1796875, + "learning_rate": 8.057471264367817e-06, + "loss": 2.9007, + "mean_token_accuracy": 0.46941827426349636, + "step": 10478 + }, + { + "epoch": 1.942714126807564, + "grad_norm": 5.99609375, + "learning_rate": 8.057285873192436e-06, + "loss": 3.3413, + "mean_token_accuracy": 0.44204420442044207, + "step": 10479 + }, + { + "epoch": 1.942899517982944, + "grad_norm": 7.2109375, + "learning_rate": 8.057100482017056e-06, + "loss": 2.6892, + "mean_token_accuracy": 0.47316807409425227, + "step": 10480 + }, + { + "epoch": 1.943084909158324, + "grad_norm": 5.98828125, + "learning_rate": 8.056915090841677e-06, + "loss": 3.3456, + "mean_token_accuracy": 0.43257097791798105, + "step": 10481 + }, + { + "epoch": 1.9432703003337042, + "grad_norm": 5.5390625, + "learning_rate": 8.056729699666297e-06, + "loss": 2.9839, + "mean_token_accuracy": 0.44241182496707154, + "step": 10482 + }, + { + "epoch": 1.9434556915090841, + "grad_norm": 5.546875, + "learning_rate": 8.056544308490917e-06, + "loss": 2.9067, + "mean_token_accuracy": 0.47664562669071237, + "step": 10483 + }, + { + "epoch": 1.9436410826844641, + "grad_norm": 6.15234375, + "learning_rate": 8.056358917315536e-06, + "loss": 2.6108, + "mean_token_accuracy": 0.49892008639308855, + "step": 10484 + }, + { + "epoch": 1.9438264738598443, + "grad_norm": 8.109375, + "learning_rate": 8.056173526140157e-06, + "loss": 2.6796, + "mean_token_accuracy": 0.4784561233159364, + "step": 10485 + }, + { + "epoch": 1.9440118650352243, + "grad_norm": 11.09375, + "learning_rate": 8.055988134964776e-06, + "loss": 1.846, + "mean_token_accuracy": 0.5865133917990045, + "step": 10486 + }, + { + "epoch": 1.9441972562106042, + "grad_norm": 8.140625, + "learning_rate": 8.055802743789396e-06, + "loss": 2.8276, + "mean_token_accuracy": 0.47274061648144267, + "step": 10487 + }, + { + "epoch": 1.9443826473859844, + "grad_norm": 7.8515625, + "learning_rate": 8.055617352614017e-06, + "loss": 2.5337, + "mean_token_accuracy": 0.48117994100294986, + "step": 10488 + }, + { + "epoch": 1.9445680385613646, + "grad_norm": 6.8359375, + "learning_rate": 8.055431961438637e-06, + "loss": 2.7193, + "mean_token_accuracy": 0.4626111560226354, + "step": 10489 + }, + { + "epoch": 1.9447534297367444, + "grad_norm": 6.44140625, + "learning_rate": 8.055246570263256e-06, + "loss": 2.9138, + "mean_token_accuracy": 0.4703710899553138, + "step": 10490 + }, + { + "epoch": 1.9449388209121246, + "grad_norm": 13.0546875, + "learning_rate": 8.055061179087876e-06, + "loss": 2.6443, + "mean_token_accuracy": 0.5158277744087326, + "step": 10491 + }, + { + "epoch": 1.9451242120875047, + "grad_norm": 7.8203125, + "learning_rate": 8.054875787912497e-06, + "loss": 2.9872, + "mean_token_accuracy": 0.46629213483146065, + "step": 10492 + }, + { + "epoch": 1.9453096032628847, + "grad_norm": 6.703125, + "learning_rate": 8.054690396737116e-06, + "loss": 2.2749, + "mean_token_accuracy": 0.5698972755694507, + "step": 10493 + }, + { + "epoch": 1.9454949944382647, + "grad_norm": 8.0234375, + "learning_rate": 8.054505005561736e-06, + "loss": 2.8601, + "mean_token_accuracy": 0.4625447535114294, + "step": 10494 + }, + { + "epoch": 1.9456803856136449, + "grad_norm": 6.25, + "learning_rate": 8.054319614386355e-06, + "loss": 3.442, + "mean_token_accuracy": 0.4168589821450261, + "step": 10495 + }, + { + "epoch": 1.9458657767890248, + "grad_norm": 6.26171875, + "learning_rate": 8.054134223210975e-06, + "loss": 2.8337, + "mean_token_accuracy": 0.4772261443862446, + "step": 10496 + }, + { + "epoch": 1.9460511679644048, + "grad_norm": 7.1484375, + "learning_rate": 8.053948832035596e-06, + "loss": 3.0354, + "mean_token_accuracy": 0.4549929676511955, + "step": 10497 + }, + { + "epoch": 1.946236559139785, + "grad_norm": 6.30078125, + "learning_rate": 8.053763440860216e-06, + "loss": 2.2133, + "mean_token_accuracy": 0.5409836065573771, + "step": 10498 + }, + { + "epoch": 1.946421950315165, + "grad_norm": 6.015625, + "learning_rate": 8.053578049684835e-06, + "loss": 2.7547, + "mean_token_accuracy": 0.47962541128828146, + "step": 10499 + }, + { + "epoch": 1.946607341490545, + "grad_norm": 6.40625, + "learning_rate": 8.053392658509456e-06, + "loss": 3.0949, + "mean_token_accuracy": 0.449935723468076, + "step": 10500 + }, + { + "epoch": 1.9467927326659251, + "grad_norm": 6.390625, + "learning_rate": 8.053207267334076e-06, + "loss": 3.4475, + "mean_token_accuracy": 0.4132399457749661, + "step": 10501 + }, + { + "epoch": 1.9469781238413053, + "grad_norm": 7.72265625, + "learning_rate": 8.053021876158695e-06, + "loss": 3.345, + "mean_token_accuracy": 0.4551383747576238, + "step": 10502 + }, + { + "epoch": 1.947163515016685, + "grad_norm": 7.01953125, + "learning_rate": 8.052836484983315e-06, + "loss": 2.6486, + "mean_token_accuracy": 0.4955171952361836, + "step": 10503 + }, + { + "epoch": 1.9473489061920652, + "grad_norm": 7.80078125, + "learning_rate": 8.052651093807934e-06, + "loss": 2.4398, + "mean_token_accuracy": 0.5034168564920274, + "step": 10504 + }, + { + "epoch": 1.9475342973674454, + "grad_norm": 6.01953125, + "learning_rate": 8.052465702632556e-06, + "loss": 3.0308, + "mean_token_accuracy": 0.46256598082516426, + "step": 10505 + }, + { + "epoch": 1.9477196885428254, + "grad_norm": 5.8515625, + "learning_rate": 8.052280311457175e-06, + "loss": 3.2298, + "mean_token_accuracy": 0.44019581899973537, + "step": 10506 + }, + { + "epoch": 1.9479050797182054, + "grad_norm": 5.48046875, + "learning_rate": 8.052094920281796e-06, + "loss": 3.6104, + "mean_token_accuracy": 0.416658023026657, + "step": 10507 + }, + { + "epoch": 1.9480904708935856, + "grad_norm": 5.61328125, + "learning_rate": 8.051909529106415e-06, + "loss": 2.9139, + "mean_token_accuracy": 0.45685087055261164, + "step": 10508 + }, + { + "epoch": 1.9482758620689655, + "grad_norm": 6.85546875, + "learning_rate": 8.051724137931035e-06, + "loss": 3.2905, + "mean_token_accuracy": 0.4073987081620669, + "step": 10509 + }, + { + "epoch": 1.9484612532443455, + "grad_norm": 7.85546875, + "learning_rate": 8.051538746755656e-06, + "loss": 3.2761, + "mean_token_accuracy": 0.43172454384932313, + "step": 10510 + }, + { + "epoch": 1.9486466444197257, + "grad_norm": 5.734375, + "learning_rate": 8.051353355580274e-06, + "loss": 2.4804, + "mean_token_accuracy": 0.48996188055908513, + "step": 10511 + }, + { + "epoch": 1.9488320355951056, + "grad_norm": 6.65625, + "learning_rate": 8.051167964404895e-06, + "loss": 2.3101, + "mean_token_accuracy": 0.515110448570267, + "step": 10512 + }, + { + "epoch": 1.9490174267704856, + "grad_norm": 5.859375, + "learning_rate": 8.050982573229515e-06, + "loss": 2.7689, + "mean_token_accuracy": 0.47511061946902655, + "step": 10513 + }, + { + "epoch": 1.9492028179458658, + "grad_norm": 6.79296875, + "learning_rate": 8.050797182054136e-06, + "loss": 2.699, + "mean_token_accuracy": 0.48139880952380953, + "step": 10514 + }, + { + "epoch": 1.9493882091212458, + "grad_norm": 5.23828125, + "learning_rate": 8.050611790878755e-06, + "loss": 2.9543, + "mean_token_accuracy": 0.4460211767274315, + "step": 10515 + }, + { + "epoch": 1.9495736002966257, + "grad_norm": 6.34375, + "learning_rate": 8.050426399703375e-06, + "loss": 3.6145, + "mean_token_accuracy": 0.4283913659996849, + "step": 10516 + }, + { + "epoch": 1.949758991472006, + "grad_norm": 6.3046875, + "learning_rate": 8.050241008527996e-06, + "loss": 2.8108, + "mean_token_accuracy": 0.45940005171967935, + "step": 10517 + }, + { + "epoch": 1.9499443826473861, + "grad_norm": 5.4296875, + "learning_rate": 8.050055617352614e-06, + "loss": 2.9966, + "mean_token_accuracy": 0.4540406427221172, + "step": 10518 + }, + { + "epoch": 1.950129773822766, + "grad_norm": 5.921875, + "learning_rate": 8.049870226177235e-06, + "loss": 3.8425, + "mean_token_accuracy": 0.3959035843636818, + "step": 10519 + }, + { + "epoch": 1.950315164998146, + "grad_norm": 6.89453125, + "learning_rate": 8.049684835001854e-06, + "loss": 2.6853, + "mean_token_accuracy": 0.4767041031913711, + "step": 10520 + }, + { + "epoch": 1.9505005561735262, + "grad_norm": 7.07421875, + "learning_rate": 8.049499443826474e-06, + "loss": 2.9303, + "mean_token_accuracy": 0.4880968762900784, + "step": 10521 + }, + { + "epoch": 1.9506859473489062, + "grad_norm": 5.7890625, + "learning_rate": 8.049314052651095e-06, + "loss": 2.5162, + "mean_token_accuracy": 0.4987246312520794, + "step": 10522 + }, + { + "epoch": 1.9508713385242862, + "grad_norm": 5.80859375, + "learning_rate": 8.049128661475715e-06, + "loss": 3.0992, + "mean_token_accuracy": 0.46462370437133843, + "step": 10523 + }, + { + "epoch": 1.9510567296996664, + "grad_norm": 6.51953125, + "learning_rate": 8.048943270300334e-06, + "loss": 2.9218, + "mean_token_accuracy": 0.4610543979862956, + "step": 10524 + }, + { + "epoch": 1.9512421208750463, + "grad_norm": 5.93359375, + "learning_rate": 8.048757879124954e-06, + "loss": 3.123, + "mean_token_accuracy": 0.45618141916605703, + "step": 10525 + }, + { + "epoch": 1.9514275120504263, + "grad_norm": 6.97265625, + "learning_rate": 8.048572487949575e-06, + "loss": 2.453, + "mean_token_accuracy": 0.5111425539441103, + "step": 10526 + }, + { + "epoch": 1.9516129032258065, + "grad_norm": 7.50390625, + "learning_rate": 8.048387096774194e-06, + "loss": 2.5441, + "mean_token_accuracy": 0.4904710046283692, + "step": 10527 + }, + { + "epoch": 1.9517982944011865, + "grad_norm": 6.9609375, + "learning_rate": 8.048201705598814e-06, + "loss": 3.277, + "mean_token_accuracy": 0.4361100144755889, + "step": 10528 + }, + { + "epoch": 1.9519836855765664, + "grad_norm": 7.58203125, + "learning_rate": 8.048016314423433e-06, + "loss": 3.6208, + "mean_token_accuracy": 0.40683760683760684, + "step": 10529 + }, + { + "epoch": 1.9521690767519466, + "grad_norm": 7.03125, + "learning_rate": 8.047830923248055e-06, + "loss": 3.0153, + "mean_token_accuracy": 0.44150139757753226, + "step": 10530 + }, + { + "epoch": 1.9523544679273268, + "grad_norm": 7.75390625, + "learning_rate": 8.047645532072674e-06, + "loss": 3.5224, + "mean_token_accuracy": 0.4205135400589308, + "step": 10531 + }, + { + "epoch": 1.9525398591027066, + "grad_norm": 13.1484375, + "learning_rate": 8.047460140897294e-06, + "loss": 3.4522, + "mean_token_accuracy": 0.4147567103811742, + "step": 10532 + }, + { + "epoch": 1.9527252502780867, + "grad_norm": 13.8125, + "learning_rate": 8.047274749721913e-06, + "loss": 2.7297, + "mean_token_accuracy": 0.4707655213984328, + "step": 10533 + }, + { + "epoch": 1.952910641453467, + "grad_norm": 6.96875, + "learning_rate": 8.047089358546534e-06, + "loss": 2.7687, + "mean_token_accuracy": 0.4621772414687946, + "step": 10534 + }, + { + "epoch": 1.953096032628847, + "grad_norm": 6.50390625, + "learning_rate": 8.046903967371154e-06, + "loss": 3.0116, + "mean_token_accuracy": 0.448531226857566, + "step": 10535 + }, + { + "epoch": 1.9532814238042269, + "grad_norm": 7.36328125, + "learning_rate": 8.046718576195773e-06, + "loss": 2.8412, + "mean_token_accuracy": 0.4760946149974836, + "step": 10536 + }, + { + "epoch": 1.953466814979607, + "grad_norm": 5.38671875, + "learning_rate": 8.046533185020394e-06, + "loss": 2.77, + "mean_token_accuracy": 0.49051817852397633, + "step": 10537 + }, + { + "epoch": 1.953652206154987, + "grad_norm": 7.4375, + "learning_rate": 8.046347793845014e-06, + "loss": 2.8354, + "mean_token_accuracy": 0.4973399691093187, + "step": 10538 + }, + { + "epoch": 1.953837597330367, + "grad_norm": 7.72265625, + "learning_rate": 8.046162402669635e-06, + "loss": 2.4162, + "mean_token_accuracy": 0.523253980766199, + "step": 10539 + }, + { + "epoch": 1.9540229885057472, + "grad_norm": 7.43359375, + "learning_rate": 8.045977011494253e-06, + "loss": 2.8244, + "mean_token_accuracy": 0.47996089931573804, + "step": 10540 + }, + { + "epoch": 1.9542083796811272, + "grad_norm": 6.25, + "learning_rate": 8.045791620318874e-06, + "loss": 3.141, + "mean_token_accuracy": 0.4533728429492766, + "step": 10541 + }, + { + "epoch": 1.9543937708565071, + "grad_norm": 6.43359375, + "learning_rate": 8.045606229143493e-06, + "loss": 2.8538, + "mean_token_accuracy": 0.48956617243272926, + "step": 10542 + }, + { + "epoch": 1.9545791620318873, + "grad_norm": 6.56640625, + "learning_rate": 8.045420837968113e-06, + "loss": 2.8376, + "mean_token_accuracy": 0.4777652687869406, + "step": 10543 + }, + { + "epoch": 1.9547645532072675, + "grad_norm": 6.33203125, + "learning_rate": 8.045235446792734e-06, + "loss": 2.9892, + "mean_token_accuracy": 0.4557752341311134, + "step": 10544 + }, + { + "epoch": 1.9549499443826472, + "grad_norm": 9.09375, + "learning_rate": 8.045050055617352e-06, + "loss": 2.6309, + "mean_token_accuracy": 0.4774703557312253, + "step": 10545 + }, + { + "epoch": 1.9551353355580274, + "grad_norm": 7.41015625, + "learning_rate": 8.044864664441973e-06, + "loss": 2.5797, + "mean_token_accuracy": 0.5078280318091452, + "step": 10546 + }, + { + "epoch": 1.9553207267334076, + "grad_norm": 5.6953125, + "learning_rate": 8.044679273266593e-06, + "loss": 2.924, + "mean_token_accuracy": 0.447505498321565, + "step": 10547 + }, + { + "epoch": 1.9555061179087876, + "grad_norm": 6.2578125, + "learning_rate": 8.044493882091214e-06, + "loss": 3.5033, + "mean_token_accuracy": 0.4209031909140076, + "step": 10548 + }, + { + "epoch": 1.9556915090841676, + "grad_norm": 6.76953125, + "learning_rate": 8.044308490915833e-06, + "loss": 2.7625, + "mean_token_accuracy": 0.4569965870307167, + "step": 10549 + }, + { + "epoch": 1.9558769002595477, + "grad_norm": 7.49609375, + "learning_rate": 8.044123099740453e-06, + "loss": 2.6469, + "mean_token_accuracy": 0.47849355797819626, + "step": 10550 + }, + { + "epoch": 1.9560622914349277, + "grad_norm": 10.1953125, + "learning_rate": 8.043937708565072e-06, + "loss": 3.3117, + "mean_token_accuracy": 0.4523127919240621, + "step": 10551 + }, + { + "epoch": 1.9562476826103077, + "grad_norm": 11.7265625, + "learning_rate": 8.043752317389692e-06, + "loss": 2.5625, + "mean_token_accuracy": 0.48314902580305424, + "step": 10552 + }, + { + "epoch": 1.9564330737856879, + "grad_norm": 9.2890625, + "learning_rate": 8.043566926214313e-06, + "loss": 3.4114, + "mean_token_accuracy": 0.4431753283837807, + "step": 10553 + }, + { + "epoch": 1.9566184649610678, + "grad_norm": 6.17578125, + "learning_rate": 8.043381535038933e-06, + "loss": 2.3534, + "mean_token_accuracy": 0.5067531779661016, + "step": 10554 + }, + { + "epoch": 1.9568038561364478, + "grad_norm": 5.953125, + "learning_rate": 8.043196143863554e-06, + "loss": 2.627, + "mean_token_accuracy": 0.46782255545142143, + "step": 10555 + }, + { + "epoch": 1.956989247311828, + "grad_norm": 6.23046875, + "learning_rate": 8.043010752688173e-06, + "loss": 2.5411, + "mean_token_accuracy": 0.5180926096289482, + "step": 10556 + }, + { + "epoch": 1.957174638487208, + "grad_norm": 5.51953125, + "learning_rate": 8.042825361512793e-06, + "loss": 2.6001, + "mean_token_accuracy": 0.5063050998772458, + "step": 10557 + }, + { + "epoch": 1.957360029662588, + "grad_norm": 5.9140625, + "learning_rate": 8.042639970337412e-06, + "loss": 3.2724, + "mean_token_accuracy": 0.4320300997873385, + "step": 10558 + }, + { + "epoch": 1.9575454208379681, + "grad_norm": 6.3203125, + "learning_rate": 8.042454579162032e-06, + "loss": 3.0506, + "mean_token_accuracy": 0.45068825910931176, + "step": 10559 + }, + { + "epoch": 1.9577308120133483, + "grad_norm": 6.83203125, + "learning_rate": 8.042269187986651e-06, + "loss": 2.6215, + "mean_token_accuracy": 0.4752245113576334, + "step": 10560 + }, + { + "epoch": 1.957916203188728, + "grad_norm": 6.765625, + "learning_rate": 8.042083796811272e-06, + "loss": 2.9711, + "mean_token_accuracy": 0.45222849968612683, + "step": 10561 + }, + { + "epoch": 1.9581015943641082, + "grad_norm": 6.94140625, + "learning_rate": 8.041898405635892e-06, + "loss": 2.9325, + "mean_token_accuracy": 0.46841404730213987, + "step": 10562 + }, + { + "epoch": 1.9582869855394884, + "grad_norm": 6.72265625, + "learning_rate": 8.041713014460513e-06, + "loss": 2.8092, + "mean_token_accuracy": 0.46603131381892443, + "step": 10563 + }, + { + "epoch": 1.9584723767148684, + "grad_norm": 6.19921875, + "learning_rate": 8.041527623285133e-06, + "loss": 2.587, + "mean_token_accuracy": 0.4912476129853596, + "step": 10564 + }, + { + "epoch": 1.9586577678902484, + "grad_norm": 6.32421875, + "learning_rate": 8.041342232109752e-06, + "loss": 3.3039, + "mean_token_accuracy": 0.4194163646767118, + "step": 10565 + }, + { + "epoch": 1.9588431590656286, + "grad_norm": 5.7890625, + "learning_rate": 8.041156840934373e-06, + "loss": 2.6794, + "mean_token_accuracy": 0.48881193393713374, + "step": 10566 + }, + { + "epoch": 1.9590285502410085, + "grad_norm": 6.27734375, + "learning_rate": 8.040971449758991e-06, + "loss": 2.518, + "mean_token_accuracy": 0.4898589519201071, + "step": 10567 + }, + { + "epoch": 1.9592139414163885, + "grad_norm": 5.92578125, + "learning_rate": 8.040786058583612e-06, + "loss": 3.2332, + "mean_token_accuracy": 0.42391304347826086, + "step": 10568 + }, + { + "epoch": 1.9593993325917687, + "grad_norm": 5.6640625, + "learning_rate": 8.040600667408232e-06, + "loss": 2.915, + "mean_token_accuracy": 0.4814674016346702, + "step": 10569 + }, + { + "epoch": 1.9595847237671487, + "grad_norm": 5.3203125, + "learning_rate": 8.040415276232853e-06, + "loss": 2.5864, + "mean_token_accuracy": 0.4891011840688913, + "step": 10570 + }, + { + "epoch": 1.9597701149425286, + "grad_norm": 6.2109375, + "learning_rate": 8.040229885057472e-06, + "loss": 2.8914, + "mean_token_accuracy": 0.4819497587755781, + "step": 10571 + }, + { + "epoch": 1.9599555061179088, + "grad_norm": 6.85546875, + "learning_rate": 8.040044493882092e-06, + "loss": 2.562, + "mean_token_accuracy": 0.4825344212136665, + "step": 10572 + }, + { + "epoch": 1.960140897293289, + "grad_norm": 5.63671875, + "learning_rate": 8.039859102706713e-06, + "loss": 3.1942, + "mean_token_accuracy": 0.4466324200913242, + "step": 10573 + }, + { + "epoch": 1.9603262884686687, + "grad_norm": 5.66015625, + "learning_rate": 8.039673711531331e-06, + "loss": 3.0959, + "mean_token_accuracy": 0.4499563754206656, + "step": 10574 + }, + { + "epoch": 1.960511679644049, + "grad_norm": 6.86328125, + "learning_rate": 8.039488320355952e-06, + "loss": 2.6456, + "mean_token_accuracy": 0.4760304004676995, + "step": 10575 + }, + { + "epoch": 1.9606970708194291, + "grad_norm": 5.88671875, + "learning_rate": 8.03930292918057e-06, + "loss": 3.2979, + "mean_token_accuracy": 0.4250236518448439, + "step": 10576 + }, + { + "epoch": 1.960882461994809, + "grad_norm": 6.875, + "learning_rate": 8.039117538005191e-06, + "loss": 3.0132, + "mean_token_accuracy": 0.4568432374294165, + "step": 10577 + }, + { + "epoch": 1.961067853170189, + "grad_norm": 6.01171875, + "learning_rate": 8.038932146829812e-06, + "loss": 2.6318, + "mean_token_accuracy": 0.5182692307692308, + "step": 10578 + }, + { + "epoch": 1.9612532443455692, + "grad_norm": 6.6796875, + "learning_rate": 8.038746755654432e-06, + "loss": 2.7356, + "mean_token_accuracy": 0.47750575434191256, + "step": 10579 + }, + { + "epoch": 1.9614386355209492, + "grad_norm": 5.796875, + "learning_rate": 8.038561364479051e-06, + "loss": 3.27, + "mean_token_accuracy": 0.43345823764051733, + "step": 10580 + }, + { + "epoch": 1.9616240266963292, + "grad_norm": 5.80078125, + "learning_rate": 8.038375973303671e-06, + "loss": 3.5975, + "mean_token_accuracy": 0.4071534078919603, + "step": 10581 + }, + { + "epoch": 1.9618094178717094, + "grad_norm": 6.53125, + "learning_rate": 8.038190582128292e-06, + "loss": 3.0283, + "mean_token_accuracy": 0.46247987117552336, + "step": 10582 + }, + { + "epoch": 1.9619948090470893, + "grad_norm": 6.34375, + "learning_rate": 8.03800519095291e-06, + "loss": 3.6128, + "mean_token_accuracy": 0.4159510988211323, + "step": 10583 + }, + { + "epoch": 1.9621802002224693, + "grad_norm": 6.0078125, + "learning_rate": 8.037819799777531e-06, + "loss": 3.3801, + "mean_token_accuracy": 0.41379310344827586, + "step": 10584 + }, + { + "epoch": 1.9623655913978495, + "grad_norm": 7.98046875, + "learning_rate": 8.03763440860215e-06, + "loss": 2.5335, + "mean_token_accuracy": 0.49079858179976366, + "step": 10585 + }, + { + "epoch": 1.9625509825732295, + "grad_norm": 5.578125, + "learning_rate": 8.037449017426772e-06, + "loss": 3.2632, + "mean_token_accuracy": 0.4355491706934831, + "step": 10586 + }, + { + "epoch": 1.9627363737486094, + "grad_norm": 6.19921875, + "learning_rate": 8.037263626251391e-06, + "loss": 2.9972, + "mean_token_accuracy": 0.44512482336316533, + "step": 10587 + }, + { + "epoch": 1.9629217649239896, + "grad_norm": 6.31640625, + "learning_rate": 8.037078235076011e-06, + "loss": 2.5911, + "mean_token_accuracy": 0.4903589021815623, + "step": 10588 + }, + { + "epoch": 1.9631071560993698, + "grad_norm": 5.8359375, + "learning_rate": 8.03689284390063e-06, + "loss": 2.7078, + "mean_token_accuracy": 0.4768100413286392, + "step": 10589 + }, + { + "epoch": 1.9632925472747496, + "grad_norm": 5.8671875, + "learning_rate": 8.03670745272525e-06, + "loss": 2.8359, + "mean_token_accuracy": 0.45407239819004525, + "step": 10590 + }, + { + "epoch": 1.9634779384501297, + "grad_norm": 7.90234375, + "learning_rate": 8.036522061549871e-06, + "loss": 2.6195, + "mean_token_accuracy": 0.48159708674730134, + "step": 10591 + }, + { + "epoch": 1.96366332962551, + "grad_norm": 8.03125, + "learning_rate": 8.03633667037449e-06, + "loss": 2.2452, + "mean_token_accuracy": 0.5224158573899161, + "step": 10592 + }, + { + "epoch": 1.96384872080089, + "grad_norm": 6.3046875, + "learning_rate": 8.03615127919911e-06, + "loss": 3.3375, + "mean_token_accuracy": 0.44288025889967636, + "step": 10593 + }, + { + "epoch": 1.9640341119762699, + "grad_norm": 8.6484375, + "learning_rate": 8.035965888023731e-06, + "loss": 2.5639, + "mean_token_accuracy": 0.5264267826545407, + "step": 10594 + }, + { + "epoch": 1.96421950315165, + "grad_norm": 8.34375, + "learning_rate": 8.035780496848352e-06, + "loss": 2.557, + "mean_token_accuracy": 0.47866391995291346, + "step": 10595 + }, + { + "epoch": 1.96440489432703, + "grad_norm": 5.8515625, + "learning_rate": 8.03559510567297e-06, + "loss": 3.4585, + "mean_token_accuracy": 0.4276387377584331, + "step": 10596 + }, + { + "epoch": 1.96459028550241, + "grad_norm": 7.46484375, + "learning_rate": 8.03540971449759e-06, + "loss": 2.1089, + "mean_token_accuracy": 0.5613308223477715, + "step": 10597 + }, + { + "epoch": 1.9647756766777902, + "grad_norm": 6.3203125, + "learning_rate": 8.035224323322211e-06, + "loss": 2.9303, + "mean_token_accuracy": 0.4532082711575357, + "step": 10598 + }, + { + "epoch": 1.9649610678531702, + "grad_norm": 5.96484375, + "learning_rate": 8.03503893214683e-06, + "loss": 3.375, + "mean_token_accuracy": 0.4267900758452243, + "step": 10599 + }, + { + "epoch": 1.9651464590285501, + "grad_norm": 5.79296875, + "learning_rate": 8.03485354097145e-06, + "loss": 3.2035, + "mean_token_accuracy": 0.4290803536086555, + "step": 10600 + }, + { + "epoch": 1.9653318502039303, + "grad_norm": 5.56640625, + "learning_rate": 8.03466814979607e-06, + "loss": 2.6259, + "mean_token_accuracy": 0.49467238211879977, + "step": 10601 + }, + { + "epoch": 1.9655172413793105, + "grad_norm": 5.30859375, + "learning_rate": 8.034482758620692e-06, + "loss": 2.8318, + "mean_token_accuracy": 0.4607976653696498, + "step": 10602 + }, + { + "epoch": 1.9657026325546902, + "grad_norm": 5.953125, + "learning_rate": 8.03429736744531e-06, + "loss": 3.0009, + "mean_token_accuracy": 0.4699017199017199, + "step": 10603 + }, + { + "epoch": 1.9658880237300704, + "grad_norm": 7.2265625, + "learning_rate": 8.034111976269931e-06, + "loss": 2.525, + "mean_token_accuracy": 0.4879690949227373, + "step": 10604 + }, + { + "epoch": 1.9660734149054506, + "grad_norm": 5.38671875, + "learning_rate": 8.03392658509455e-06, + "loss": 3.2323, + "mean_token_accuracy": 0.4411607809666062, + "step": 10605 + }, + { + "epoch": 1.9662588060808306, + "grad_norm": 10.5, + "learning_rate": 8.03374119391917e-06, + "loss": 2.4949, + "mean_token_accuracy": 0.4899717324000538, + "step": 10606 + }, + { + "epoch": 1.9664441972562106, + "grad_norm": 6.7578125, + "learning_rate": 8.03355580274379e-06, + "loss": 2.9911, + "mean_token_accuracy": 0.4533273110508883, + "step": 10607 + }, + { + "epoch": 1.9666295884315907, + "grad_norm": 7.02734375, + "learning_rate": 8.03337041156841e-06, + "loss": 3.3464, + "mean_token_accuracy": 0.4457762557077626, + "step": 10608 + }, + { + "epoch": 1.9668149796069707, + "grad_norm": 6.734375, + "learning_rate": 8.03318502039303e-06, + "loss": 3.1309, + "mean_token_accuracy": 0.45518788558609086, + "step": 10609 + }, + { + "epoch": 1.9670003707823507, + "grad_norm": 7.71875, + "learning_rate": 8.03299962921765e-06, + "loss": 2.7598, + "mean_token_accuracy": 0.471722621902478, + "step": 10610 + }, + { + "epoch": 1.9671857619577309, + "grad_norm": 6.11328125, + "learning_rate": 8.032814238042271e-06, + "loss": 2.8744, + "mean_token_accuracy": 0.4774156660949114, + "step": 10611 + }, + { + "epoch": 1.9673711531331108, + "grad_norm": 16.203125, + "learning_rate": 8.03262884686689e-06, + "loss": 2.9014, + "mean_token_accuracy": 0.4712213383247543, + "step": 10612 + }, + { + "epoch": 1.9675565443084908, + "grad_norm": 9.3359375, + "learning_rate": 8.03244345569151e-06, + "loss": 3.072, + "mean_token_accuracy": 0.4389261744966443, + "step": 10613 + }, + { + "epoch": 1.967741935483871, + "grad_norm": 7.53515625, + "learning_rate": 8.032258064516129e-06, + "loss": 3.0121, + "mean_token_accuracy": 0.4968789013732834, + "step": 10614 + }, + { + "epoch": 1.9679273266592512, + "grad_norm": 8.9765625, + "learning_rate": 8.03207267334075e-06, + "loss": 2.2504, + "mean_token_accuracy": 0.5257638967189086, + "step": 10615 + }, + { + "epoch": 1.968112717834631, + "grad_norm": 5.92578125, + "learning_rate": 8.03188728216537e-06, + "loss": 3.0954, + "mean_token_accuracy": 0.45932499663842946, + "step": 10616 + }, + { + "epoch": 1.9682981090100111, + "grad_norm": 6.51171875, + "learning_rate": 8.031701890989989e-06, + "loss": 2.9062, + "mean_token_accuracy": 0.47510937684758187, + "step": 10617 + }, + { + "epoch": 1.9684835001853913, + "grad_norm": 6.625, + "learning_rate": 8.03151649981461e-06, + "loss": 3.3537, + "mean_token_accuracy": 0.4477894986369563, + "step": 10618 + }, + { + "epoch": 1.9686688913607713, + "grad_norm": 7.62109375, + "learning_rate": 8.03133110863923e-06, + "loss": 2.7757, + "mean_token_accuracy": 0.4804917827123322, + "step": 10619 + }, + { + "epoch": 1.9688542825361512, + "grad_norm": 6.60546875, + "learning_rate": 8.03114571746385e-06, + "loss": 3.1903, + "mean_token_accuracy": 0.4413394066231688, + "step": 10620 + }, + { + "epoch": 1.9690396737115314, + "grad_norm": 7.12109375, + "learning_rate": 8.030960326288469e-06, + "loss": 2.8262, + "mean_token_accuracy": 0.4662219699120068, + "step": 10621 + }, + { + "epoch": 1.9692250648869114, + "grad_norm": 7.6015625, + "learning_rate": 8.03077493511309e-06, + "loss": 3.1873, + "mean_token_accuracy": 0.4777592768791627, + "step": 10622 + }, + { + "epoch": 1.9694104560622914, + "grad_norm": 8.796875, + "learning_rate": 8.030589543937708e-06, + "loss": 3.19, + "mean_token_accuracy": 0.4783060535203949, + "step": 10623 + }, + { + "epoch": 1.9695958472376716, + "grad_norm": 6.03515625, + "learning_rate": 8.030404152762329e-06, + "loss": 3.1569, + "mean_token_accuracy": 0.4679006169751542, + "step": 10624 + }, + { + "epoch": 1.9697812384130515, + "grad_norm": 6.96484375, + "learning_rate": 8.03021876158695e-06, + "loss": 3.2334, + "mean_token_accuracy": 0.44752066115702477, + "step": 10625 + }, + { + "epoch": 1.9699666295884315, + "grad_norm": 6.48046875, + "learning_rate": 8.03003337041157e-06, + "loss": 3.5067, + "mean_token_accuracy": 0.3862623762376238, + "step": 10626 + }, + { + "epoch": 1.9701520207638117, + "grad_norm": 5.42578125, + "learning_rate": 8.029847979236189e-06, + "loss": 3.3476, + "mean_token_accuracy": 0.4401382374722291, + "step": 10627 + }, + { + "epoch": 1.9703374119391917, + "grad_norm": 6.84375, + "learning_rate": 8.029662588060809e-06, + "loss": 2.4076, + "mean_token_accuracy": 0.5136352568785001, + "step": 10628 + }, + { + "epoch": 1.9705228031145716, + "grad_norm": 7.25390625, + "learning_rate": 8.02947719688543e-06, + "loss": 3.0193, + "mean_token_accuracy": 0.46475981935130695, + "step": 10629 + }, + { + "epoch": 1.9707081942899518, + "grad_norm": 6.203125, + "learning_rate": 8.029291805710048e-06, + "loss": 2.7658, + "mean_token_accuracy": 0.46382003152314083, + "step": 10630 + }, + { + "epoch": 1.970893585465332, + "grad_norm": 5.89453125, + "learning_rate": 8.029106414534669e-06, + "loss": 2.6314, + "mean_token_accuracy": 0.4692511504671594, + "step": 10631 + }, + { + "epoch": 1.9710789766407117, + "grad_norm": 6.8359375, + "learning_rate": 8.028921023359288e-06, + "loss": 3.0423, + "mean_token_accuracy": 0.4620600115406809, + "step": 10632 + }, + { + "epoch": 1.971264367816092, + "grad_norm": 6.7265625, + "learning_rate": 8.028735632183908e-06, + "loss": 3.2103, + "mean_token_accuracy": 0.4374867331776693, + "step": 10633 + }, + { + "epoch": 1.9714497589914721, + "grad_norm": 6.60546875, + "learning_rate": 8.028550241008529e-06, + "loss": 2.8704, + "mean_token_accuracy": 0.4605589362181654, + "step": 10634 + }, + { + "epoch": 1.971635150166852, + "grad_norm": 5.50390625, + "learning_rate": 8.02836484983315e-06, + "loss": 2.7121, + "mean_token_accuracy": 0.49283596837944665, + "step": 10635 + }, + { + "epoch": 1.971820541342232, + "grad_norm": 5.921875, + "learning_rate": 8.02817945865777e-06, + "loss": 2.261, + "mean_token_accuracy": 0.5492651757188498, + "step": 10636 + }, + { + "epoch": 1.9720059325176122, + "grad_norm": 6.703125, + "learning_rate": 8.027994067482388e-06, + "loss": 2.6194, + "mean_token_accuracy": 0.5007116424708227, + "step": 10637 + }, + { + "epoch": 1.9721913236929922, + "grad_norm": 7.43359375, + "learning_rate": 8.027808676307009e-06, + "loss": 2.9138, + "mean_token_accuracy": 0.4536758677234217, + "step": 10638 + }, + { + "epoch": 1.9723767148683722, + "grad_norm": 7.54296875, + "learning_rate": 8.027623285131628e-06, + "loss": 2.7717, + "mean_token_accuracy": 0.48630338733431516, + "step": 10639 + }, + { + "epoch": 1.9725621060437524, + "grad_norm": 8.15625, + "learning_rate": 8.027437893956248e-06, + "loss": 3.6419, + "mean_token_accuracy": 0.4408547871445398, + "step": 10640 + }, + { + "epoch": 1.9727474972191323, + "grad_norm": 5.95703125, + "learning_rate": 8.027252502780867e-06, + "loss": 3.021, + "mean_token_accuracy": 0.44087272727272725, + "step": 10641 + }, + { + "epoch": 1.9729328883945123, + "grad_norm": 10.734375, + "learning_rate": 8.027067111605488e-06, + "loss": 3.1236, + "mean_token_accuracy": 0.4875593269092478, + "step": 10642 + }, + { + "epoch": 1.9731182795698925, + "grad_norm": 6.1640625, + "learning_rate": 8.026881720430108e-06, + "loss": 3.0518, + "mean_token_accuracy": 0.46080198242847487, + "step": 10643 + }, + { + "epoch": 1.9733036707452727, + "grad_norm": 6.4375, + "learning_rate": 8.026696329254729e-06, + "loss": 2.6385, + "mean_token_accuracy": 0.4874191229331416, + "step": 10644 + }, + { + "epoch": 1.9734890619206524, + "grad_norm": 5.48046875, + "learning_rate": 8.026510938079349e-06, + "loss": 3.0651, + "mean_token_accuracy": 0.4495970405601797, + "step": 10645 + }, + { + "epoch": 1.9736744530960326, + "grad_norm": 5.75390625, + "learning_rate": 8.026325546903968e-06, + "loss": 2.8533, + "mean_token_accuracy": 0.4759027266028003, + "step": 10646 + }, + { + "epoch": 1.9738598442714128, + "grad_norm": 5.8515625, + "learning_rate": 8.026140155728588e-06, + "loss": 2.4104, + "mean_token_accuracy": 0.5054945054945055, + "step": 10647 + }, + { + "epoch": 1.9740452354467928, + "grad_norm": 5.56640625, + "learning_rate": 8.025954764553207e-06, + "loss": 2.9778, + "mean_token_accuracy": 0.46188449848024316, + "step": 10648 + }, + { + "epoch": 1.9742306266221727, + "grad_norm": 7.95703125, + "learning_rate": 8.025769373377828e-06, + "loss": 3.6646, + "mean_token_accuracy": 0.4354908461871126, + "step": 10649 + }, + { + "epoch": 1.974416017797553, + "grad_norm": 7.14453125, + "learning_rate": 8.025583982202448e-06, + "loss": 2.5355, + "mean_token_accuracy": 0.5058131939908557, + "step": 10650 + }, + { + "epoch": 1.974601408972933, + "grad_norm": 8.6640625, + "learning_rate": 8.025398591027069e-06, + "loss": 2.8217, + "mean_token_accuracy": 0.48036253776435045, + "step": 10651 + }, + { + "epoch": 1.9747868001483129, + "grad_norm": 7.5546875, + "learning_rate": 8.025213199851687e-06, + "loss": 2.4395, + "mean_token_accuracy": 0.529796511627907, + "step": 10652 + }, + { + "epoch": 1.974972191323693, + "grad_norm": 6.9140625, + "learning_rate": 8.025027808676308e-06, + "loss": 2.6031, + "mean_token_accuracy": 0.5236822568671121, + "step": 10653 + }, + { + "epoch": 1.975157582499073, + "grad_norm": 8.734375, + "learning_rate": 8.024842417500928e-06, + "loss": 3.1856, + "mean_token_accuracy": 0.44300265103948655, + "step": 10654 + }, + { + "epoch": 1.975342973674453, + "grad_norm": 7.15234375, + "learning_rate": 8.024657026325547e-06, + "loss": 2.8198, + "mean_token_accuracy": 0.47390515089630136, + "step": 10655 + }, + { + "epoch": 1.9755283648498332, + "grad_norm": 10.1015625, + "learning_rate": 8.024471635150168e-06, + "loss": 2.3306, + "mean_token_accuracy": 0.512817290776577, + "step": 10656 + }, + { + "epoch": 1.9757137560252132, + "grad_norm": 8.3515625, + "learning_rate": 8.024286243974786e-06, + "loss": 2.0155, + "mean_token_accuracy": 0.598697539797395, + "step": 10657 + }, + { + "epoch": 1.9758991472005931, + "grad_norm": 10.4765625, + "learning_rate": 8.024100852799407e-06, + "loss": 2.659, + "mean_token_accuracy": 0.4819435325016415, + "step": 10658 + }, + { + "epoch": 1.9760845383759733, + "grad_norm": 9.71875, + "learning_rate": 8.023915461624027e-06, + "loss": 2.8298, + "mean_token_accuracy": 0.4708695652173913, + "step": 10659 + }, + { + "epoch": 1.9762699295513535, + "grad_norm": 6.75390625, + "learning_rate": 8.023730070448648e-06, + "loss": 3.3916, + "mean_token_accuracy": 0.4241001564945227, + "step": 10660 + }, + { + "epoch": 1.9764553207267332, + "grad_norm": 7.96875, + "learning_rate": 8.023544679273267e-06, + "loss": 3.2713, + "mean_token_accuracy": 0.43509385937002865, + "step": 10661 + }, + { + "epoch": 1.9766407119021134, + "grad_norm": 7.51953125, + "learning_rate": 8.023359288097887e-06, + "loss": 3.5524, + "mean_token_accuracy": 0.4191852825229961, + "step": 10662 + }, + { + "epoch": 1.9768261030774936, + "grad_norm": 8.15625, + "learning_rate": 8.023173896922508e-06, + "loss": 3.0429, + "mean_token_accuracy": 0.46165843514426824, + "step": 10663 + }, + { + "epoch": 1.9770114942528736, + "grad_norm": 7.62109375, + "learning_rate": 8.022988505747126e-06, + "loss": 3.1729, + "mean_token_accuracy": 0.44058744993324434, + "step": 10664 + }, + { + "epoch": 1.9771968854282536, + "grad_norm": 5.734375, + "learning_rate": 8.022803114571747e-06, + "loss": 3.2362, + "mean_token_accuracy": 0.44572984008166044, + "step": 10665 + }, + { + "epoch": 1.9773822766036337, + "grad_norm": 7.31640625, + "learning_rate": 8.022617723396366e-06, + "loss": 2.0301, + "mean_token_accuracy": 0.5658025372311086, + "step": 10666 + }, + { + "epoch": 1.9775676677790137, + "grad_norm": 7.3984375, + "learning_rate": 8.022432332220988e-06, + "loss": 3.2387, + "mean_token_accuracy": 0.4350767629456154, + "step": 10667 + }, + { + "epoch": 1.9777530589543937, + "grad_norm": 6.98828125, + "learning_rate": 8.022246941045607e-06, + "loss": 3.1533, + "mean_token_accuracy": 0.47295946696279845, + "step": 10668 + }, + { + "epoch": 1.9779384501297739, + "grad_norm": 6.73046875, + "learning_rate": 8.022061549870227e-06, + "loss": 3.1954, + "mean_token_accuracy": 0.44366197183098594, + "step": 10669 + }, + { + "epoch": 1.9781238413051538, + "grad_norm": 6.28125, + "learning_rate": 8.021876158694846e-06, + "loss": 3.0928, + "mean_token_accuracy": 0.4833887043189369, + "step": 10670 + }, + { + "epoch": 1.9783092324805338, + "grad_norm": 6.0078125, + "learning_rate": 8.021690767519467e-06, + "loss": 3.1318, + "mean_token_accuracy": 0.4527487821851079, + "step": 10671 + }, + { + "epoch": 1.978494623655914, + "grad_norm": 5.44140625, + "learning_rate": 8.021505376344087e-06, + "loss": 2.7924, + "mean_token_accuracy": 0.46773120425815035, + "step": 10672 + }, + { + "epoch": 1.9786800148312942, + "grad_norm": 6.19921875, + "learning_rate": 8.021319985168706e-06, + "loss": 2.8581, + "mean_token_accuracy": 0.4737927687454501, + "step": 10673 + }, + { + "epoch": 1.978865406006674, + "grad_norm": 6.23046875, + "learning_rate": 8.021134593993326e-06, + "loss": 2.5478, + "mean_token_accuracy": 0.4771878184082418, + "step": 10674 + }, + { + "epoch": 1.9790507971820541, + "grad_norm": 6.19921875, + "learning_rate": 8.020949202817947e-06, + "loss": 3.783, + "mean_token_accuracy": 0.4208070271754049, + "step": 10675 + }, + { + "epoch": 1.9792361883574343, + "grad_norm": 7.328125, + "learning_rate": 8.020763811642567e-06, + "loss": 2.2361, + "mean_token_accuracy": 0.5335989661856558, + "step": 10676 + }, + { + "epoch": 1.9794215795328143, + "grad_norm": 7.48046875, + "learning_rate": 8.020578420467186e-06, + "loss": 3.131, + "mean_token_accuracy": 0.43209486166007904, + "step": 10677 + }, + { + "epoch": 1.9796069707081942, + "grad_norm": 8.984375, + "learning_rate": 8.020393029291807e-06, + "loss": 2.8919, + "mean_token_accuracy": 0.44433208615633535, + "step": 10678 + }, + { + "epoch": 1.9797923618835744, + "grad_norm": 10.1953125, + "learning_rate": 8.020207638116425e-06, + "loss": 2.5593, + "mean_token_accuracy": 0.4896719319562576, + "step": 10679 + }, + { + "epoch": 1.9799777530589544, + "grad_norm": 7.453125, + "learning_rate": 8.020022246941046e-06, + "loss": 3.285, + "mean_token_accuracy": 0.4269027882441598, + "step": 10680 + }, + { + "epoch": 1.9801631442343344, + "grad_norm": 10.953125, + "learning_rate": 8.019836855765666e-06, + "loss": 2.6711, + "mean_token_accuracy": 0.4742864025051186, + "step": 10681 + }, + { + "epoch": 1.9803485354097146, + "grad_norm": 8.9375, + "learning_rate": 8.019651464590285e-06, + "loss": 3.0672, + "mean_token_accuracy": 0.4702925634746746, + "step": 10682 + }, + { + "epoch": 1.9805339265850945, + "grad_norm": 8.09375, + "learning_rate": 8.019466073414907e-06, + "loss": 2.431, + "mean_token_accuracy": 0.5255681818181818, + "step": 10683 + }, + { + "epoch": 1.9807193177604745, + "grad_norm": 7.1796875, + "learning_rate": 8.019280682239526e-06, + "loss": 3.1742, + "mean_token_accuracy": 0.4424148974024434, + "step": 10684 + }, + { + "epoch": 1.9809047089358547, + "grad_norm": 6.94921875, + "learning_rate": 8.019095291064147e-06, + "loss": 2.6353, + "mean_token_accuracy": 0.4828155981493721, + "step": 10685 + }, + { + "epoch": 1.9810901001112347, + "grad_norm": 6.09375, + "learning_rate": 8.018909899888765e-06, + "loss": 2.3101, + "mean_token_accuracy": 0.5647118947233933, + "step": 10686 + }, + { + "epoch": 1.9812754912866146, + "grad_norm": 6.30078125, + "learning_rate": 8.018724508713386e-06, + "loss": 2.6596, + "mean_token_accuracy": 0.4853117107536887, + "step": 10687 + }, + { + "epoch": 1.9814608824619948, + "grad_norm": 5.4140625, + "learning_rate": 8.018539117538006e-06, + "loss": 2.662, + "mean_token_accuracy": 0.4828291684933509, + "step": 10688 + }, + { + "epoch": 1.981646273637375, + "grad_norm": 5.921875, + "learning_rate": 8.018353726362625e-06, + "loss": 3.189, + "mean_token_accuracy": 0.44811028500619576, + "step": 10689 + }, + { + "epoch": 1.981831664812755, + "grad_norm": 8.2109375, + "learning_rate": 8.018168335187246e-06, + "loss": 3.0401, + "mean_token_accuracy": 0.48121718055742563, + "step": 10690 + }, + { + "epoch": 1.982017055988135, + "grad_norm": 6.7421875, + "learning_rate": 8.017982944011866e-06, + "loss": 2.8874, + "mean_token_accuracy": 0.4606582278481013, + "step": 10691 + }, + { + "epoch": 1.9822024471635151, + "grad_norm": 7.69140625, + "learning_rate": 8.017797552836487e-06, + "loss": 3.1193, + "mean_token_accuracy": 0.4639175257731959, + "step": 10692 + }, + { + "epoch": 1.982387838338895, + "grad_norm": 6.5, + "learning_rate": 8.017612161661105e-06, + "loss": 3.0379, + "mean_token_accuracy": 0.4620075046904315, + "step": 10693 + }, + { + "epoch": 1.982573229514275, + "grad_norm": 7.578125, + "learning_rate": 8.017426770485726e-06, + "loss": 2.769, + "mean_token_accuracy": 0.46250282358256156, + "step": 10694 + }, + { + "epoch": 1.9827586206896552, + "grad_norm": 7.9609375, + "learning_rate": 8.017241379310345e-06, + "loss": 2.7203, + "mean_token_accuracy": 0.4851439890294073, + "step": 10695 + }, + { + "epoch": 1.9829440118650352, + "grad_norm": 8.4921875, + "learning_rate": 8.017055988134965e-06, + "loss": 3.1994, + "mean_token_accuracy": 0.4620359281437126, + "step": 10696 + }, + { + "epoch": 1.9831294030404152, + "grad_norm": 5.2421875, + "learning_rate": 8.016870596959586e-06, + "loss": 3.025, + "mean_token_accuracy": 0.4575773064145157, + "step": 10697 + }, + { + "epoch": 1.9833147942157954, + "grad_norm": 8.3984375, + "learning_rate": 8.016685205784205e-06, + "loss": 3.2198, + "mean_token_accuracy": 0.4391107853914743, + "step": 10698 + }, + { + "epoch": 1.9835001853911753, + "grad_norm": 9.3125, + "learning_rate": 8.016499814608825e-06, + "loss": 2.5917, + "mean_token_accuracy": 0.4902886431076342, + "step": 10699 + }, + { + "epoch": 1.9836855765665553, + "grad_norm": 7.69140625, + "learning_rate": 8.016314423433446e-06, + "loss": 2.8955, + "mean_token_accuracy": 0.4513635861764381, + "step": 10700 + }, + { + "epoch": 1.9838709677419355, + "grad_norm": 7.59375, + "learning_rate": 8.016129032258066e-06, + "loss": 2.5019, + "mean_token_accuracy": 0.4971257185703574, + "step": 10701 + }, + { + "epoch": 1.9840563589173157, + "grad_norm": 11.9765625, + "learning_rate": 8.015943641082685e-06, + "loss": 2.8997, + "mean_token_accuracy": 0.4768067922965417, + "step": 10702 + }, + { + "epoch": 1.9842417500926954, + "grad_norm": 7.1796875, + "learning_rate": 8.015758249907305e-06, + "loss": 3.556, + "mean_token_accuracy": 0.41295595949737707, + "step": 10703 + }, + { + "epoch": 1.9844271412680756, + "grad_norm": 8.2734375, + "learning_rate": 8.015572858731924e-06, + "loss": 2.6868, + "mean_token_accuracy": 0.4712054965646471, + "step": 10704 + }, + { + "epoch": 1.9846125324434558, + "grad_norm": 11.5390625, + "learning_rate": 8.015387467556545e-06, + "loss": 2.9923, + "mean_token_accuracy": 0.4474330487662262, + "step": 10705 + }, + { + "epoch": 1.9847979236188358, + "grad_norm": 8.46875, + "learning_rate": 8.015202076381165e-06, + "loss": 2.9238, + "mean_token_accuracy": 0.46416510318949344, + "step": 10706 + }, + { + "epoch": 1.9849833147942157, + "grad_norm": 6.62109375, + "learning_rate": 8.015016685205786e-06, + "loss": 2.8922, + "mean_token_accuracy": 0.44942424926620006, + "step": 10707 + }, + { + "epoch": 1.985168705969596, + "grad_norm": 11.5, + "learning_rate": 8.014831294030404e-06, + "loss": 2.7848, + "mean_token_accuracy": 0.45894102726696256, + "step": 10708 + }, + { + "epoch": 1.985354097144976, + "grad_norm": 9.7109375, + "learning_rate": 8.014645902855025e-06, + "loss": 3.0297, + "mean_token_accuracy": 0.4752231455832564, + "step": 10709 + }, + { + "epoch": 1.9855394883203559, + "grad_norm": 6.44140625, + "learning_rate": 8.014460511679645e-06, + "loss": 3.0374, + "mean_token_accuracy": 0.4555663385553861, + "step": 10710 + }, + { + "epoch": 1.985724879495736, + "grad_norm": 8.9453125, + "learning_rate": 8.014275120504264e-06, + "loss": 2.6101, + "mean_token_accuracy": 0.5046206225680934, + "step": 10711 + }, + { + "epoch": 1.985910270671116, + "grad_norm": 6.30859375, + "learning_rate": 8.014089729328885e-06, + "loss": 2.4811, + "mean_token_accuracy": 0.5099959200326397, + "step": 10712 + }, + { + "epoch": 1.986095661846496, + "grad_norm": 6.5234375, + "learning_rate": 8.013904338153503e-06, + "loss": 3.4257, + "mean_token_accuracy": 0.4121285627653123, + "step": 10713 + }, + { + "epoch": 1.9862810530218762, + "grad_norm": 9.421875, + "learning_rate": 8.013718946978124e-06, + "loss": 2.7287, + "mean_token_accuracy": 0.49371657754010695, + "step": 10714 + }, + { + "epoch": 1.9864664441972564, + "grad_norm": 6.74609375, + "learning_rate": 8.013533555802744e-06, + "loss": 3.4397, + "mean_token_accuracy": 0.43787477339283226, + "step": 10715 + }, + { + "epoch": 1.9866518353726361, + "grad_norm": 7.39453125, + "learning_rate": 8.013348164627365e-06, + "loss": 2.644, + "mean_token_accuracy": 0.5053722179585571, + "step": 10716 + }, + { + "epoch": 1.9868372265480163, + "grad_norm": 6.9296875, + "learning_rate": 8.013162773451985e-06, + "loss": 3.1884, + "mean_token_accuracy": 0.45357142857142857, + "step": 10717 + }, + { + "epoch": 1.9870226177233965, + "grad_norm": 5.9921875, + "learning_rate": 8.012977382276604e-06, + "loss": 2.7344, + "mean_token_accuracy": 0.49273447820343463, + "step": 10718 + }, + { + "epoch": 1.9872080088987765, + "grad_norm": 6.12109375, + "learning_rate": 8.012791991101225e-06, + "loss": 2.6594, + "mean_token_accuracy": 0.48302425106990016, + "step": 10719 + }, + { + "epoch": 1.9873934000741564, + "grad_norm": 6.55859375, + "learning_rate": 8.012606599925843e-06, + "loss": 3.5412, + "mean_token_accuracy": 0.417531556802244, + "step": 10720 + }, + { + "epoch": 1.9875787912495366, + "grad_norm": 10.84375, + "learning_rate": 8.012421208750464e-06, + "loss": 2.5985, + "mean_token_accuracy": 0.49404919333509656, + "step": 10721 + }, + { + "epoch": 1.9877641824249166, + "grad_norm": 5.96875, + "learning_rate": 8.012235817575083e-06, + "loss": 3.3425, + "mean_token_accuracy": 0.4218241042345277, + "step": 10722 + }, + { + "epoch": 1.9879495736002966, + "grad_norm": 8.4296875, + "learning_rate": 8.012050426399705e-06, + "loss": 3.8381, + "mean_token_accuracy": 0.39969079665332846, + "step": 10723 + }, + { + "epoch": 1.9881349647756767, + "grad_norm": 6.84375, + "learning_rate": 8.011865035224324e-06, + "loss": 2.8277, + "mean_token_accuracy": 0.4730851758559567, + "step": 10724 + }, + { + "epoch": 1.9883203559510567, + "grad_norm": 8.5, + "learning_rate": 8.011679644048944e-06, + "loss": 2.3781, + "mean_token_accuracy": 0.5369665397715888, + "step": 10725 + }, + { + "epoch": 1.9885057471264367, + "grad_norm": 6.09375, + "learning_rate": 8.011494252873565e-06, + "loss": 2.7855, + "mean_token_accuracy": 0.4789915966386555, + "step": 10726 + }, + { + "epoch": 1.9886911383018169, + "grad_norm": 8.3359375, + "learning_rate": 8.011308861698184e-06, + "loss": 2.7558, + "mean_token_accuracy": 0.4632361760825237, + "step": 10727 + }, + { + "epoch": 1.9888765294771968, + "grad_norm": 7.7890625, + "learning_rate": 8.011123470522804e-06, + "loss": 2.6262, + "mean_token_accuracy": 0.5144725557461407, + "step": 10728 + }, + { + "epoch": 1.9890619206525768, + "grad_norm": 7.1875, + "learning_rate": 8.010938079347423e-06, + "loss": 3.3445, + "mean_token_accuracy": 0.4565354928508161, + "step": 10729 + }, + { + "epoch": 1.989247311827957, + "grad_norm": 8.4140625, + "learning_rate": 8.010752688172043e-06, + "loss": 3.0721, + "mean_token_accuracy": 0.47480077536075815, + "step": 10730 + }, + { + "epoch": 1.9894327030033372, + "grad_norm": 9.875, + "learning_rate": 8.010567296996664e-06, + "loss": 2.4567, + "mean_token_accuracy": 0.5297147882899862, + "step": 10731 + }, + { + "epoch": 1.989618094178717, + "grad_norm": 10.8203125, + "learning_rate": 8.010381905821284e-06, + "loss": 3.8611, + "mean_token_accuracy": 0.42245534448539834, + "step": 10732 + }, + { + "epoch": 1.9898034853540971, + "grad_norm": 7.80859375, + "learning_rate": 8.010196514645903e-06, + "loss": 3.1811, + "mean_token_accuracy": 0.45581480801028845, + "step": 10733 + }, + { + "epoch": 1.9899888765294773, + "grad_norm": 7.98046875, + "learning_rate": 8.010011123470524e-06, + "loss": 3.1298, + "mean_token_accuracy": 0.45045385779122543, + "step": 10734 + }, + { + "epoch": 1.9901742677048573, + "grad_norm": 8.265625, + "learning_rate": 8.009825732295144e-06, + "loss": 2.9986, + "mean_token_accuracy": 0.4547320878669678, + "step": 10735 + }, + { + "epoch": 1.9903596588802372, + "grad_norm": 10.8359375, + "learning_rate": 8.009640341119763e-06, + "loss": 3.1925, + "mean_token_accuracy": 0.4679950866657568, + "step": 10736 + }, + { + "epoch": 1.9905450500556174, + "grad_norm": 9.3828125, + "learning_rate": 8.009454949944383e-06, + "loss": 3.1173, + "mean_token_accuracy": 0.44630973986690864, + "step": 10737 + }, + { + "epoch": 1.9907304412309974, + "grad_norm": 14.9921875, + "learning_rate": 8.009269558769002e-06, + "loss": 2.8305, + "mean_token_accuracy": 0.43517074016293983, + "step": 10738 + }, + { + "epoch": 1.9909158324063774, + "grad_norm": 8.21875, + "learning_rate": 8.009084167593624e-06, + "loss": 3.5514, + "mean_token_accuracy": 0.42876693581497194, + "step": 10739 + }, + { + "epoch": 1.9911012235817576, + "grad_norm": 10.0, + "learning_rate": 8.008898776418243e-06, + "loss": 2.8148, + "mean_token_accuracy": 0.4732837627528964, + "step": 10740 + }, + { + "epoch": 1.9912866147571375, + "grad_norm": 8.0234375, + "learning_rate": 8.008713385242864e-06, + "loss": 3.0343, + "mean_token_accuracy": 0.4490351872871737, + "step": 10741 + }, + { + "epoch": 1.9914720059325175, + "grad_norm": 8.8671875, + "learning_rate": 8.008527994067482e-06, + "loss": 2.7333, + "mean_token_accuracy": 0.4609048978695366, + "step": 10742 + }, + { + "epoch": 1.9916573971078977, + "grad_norm": 8.703125, + "learning_rate": 8.008342602892103e-06, + "loss": 2.6406, + "mean_token_accuracy": 0.4728003444811253, + "step": 10743 + }, + { + "epoch": 1.9918427882832779, + "grad_norm": 8.28125, + "learning_rate": 8.008157211716723e-06, + "loss": 2.7926, + "mean_token_accuracy": 0.4745317496573778, + "step": 10744 + }, + { + "epoch": 1.9920281794586576, + "grad_norm": 9.5234375, + "learning_rate": 8.007971820541342e-06, + "loss": 2.5409, + "mean_token_accuracy": 0.49304461942257216, + "step": 10745 + }, + { + "epoch": 1.9922135706340378, + "grad_norm": 6.87109375, + "learning_rate": 8.007786429365963e-06, + "loss": 2.5533, + "mean_token_accuracy": 0.4864109728219456, + "step": 10746 + }, + { + "epoch": 1.992398961809418, + "grad_norm": 6.7578125, + "learning_rate": 8.007601038190583e-06, + "loss": 3.0521, + "mean_token_accuracy": 0.46139574711003284, + "step": 10747 + }, + { + "epoch": 1.992584352984798, + "grad_norm": 7.59765625, + "learning_rate": 8.007415647015204e-06, + "loss": 3.3525, + "mean_token_accuracy": 0.45082726671078754, + "step": 10748 + }, + { + "epoch": 1.992769744160178, + "grad_norm": 7.84375, + "learning_rate": 8.007230255839822e-06, + "loss": 2.2662, + "mean_token_accuracy": 0.5143433437920215, + "step": 10749 + }, + { + "epoch": 1.9929551353355581, + "grad_norm": 6.51953125, + "learning_rate": 8.007044864664443e-06, + "loss": 2.9754, + "mean_token_accuracy": 0.48411758371917735, + "step": 10750 + }, + { + "epoch": 1.993140526510938, + "grad_norm": 6.7265625, + "learning_rate": 8.006859473489062e-06, + "loss": 2.2903, + "mean_token_accuracy": 0.5112095583299772, + "step": 10751 + }, + { + "epoch": 1.993325917686318, + "grad_norm": 5.67578125, + "learning_rate": 8.006674082313682e-06, + "loss": 3.0573, + "mean_token_accuracy": 0.45158144071339, + "step": 10752 + }, + { + "epoch": 1.9935113088616983, + "grad_norm": 5.88671875, + "learning_rate": 8.006488691138303e-06, + "loss": 3.0611, + "mean_token_accuracy": 0.4632192279679534, + "step": 10753 + }, + { + "epoch": 1.9936967000370782, + "grad_norm": 7.3515625, + "learning_rate": 8.006303299962922e-06, + "loss": 2.6784, + "mean_token_accuracy": 0.47740986019131715, + "step": 10754 + }, + { + "epoch": 1.9938820912124582, + "grad_norm": 7.171875, + "learning_rate": 8.006117908787544e-06, + "loss": 3.0903, + "mean_token_accuracy": 0.4571873378308251, + "step": 10755 + }, + { + "epoch": 1.9940674823878384, + "grad_norm": 8.5625, + "learning_rate": 8.005932517612163e-06, + "loss": 2.4697, + "mean_token_accuracy": 0.5211170724996561, + "step": 10756 + }, + { + "epoch": 1.9942528735632183, + "grad_norm": 8.0, + "learning_rate": 8.005747126436783e-06, + "loss": 3.4159, + "mean_token_accuracy": 0.4333435021354484, + "step": 10757 + }, + { + "epoch": 1.9944382647385983, + "grad_norm": 8.0546875, + "learning_rate": 8.005561735261402e-06, + "loss": 2.8394, + "mean_token_accuracy": 0.4744318181818182, + "step": 10758 + }, + { + "epoch": 1.9946236559139785, + "grad_norm": 6.9296875, + "learning_rate": 8.005376344086022e-06, + "loss": 2.4562, + "mean_token_accuracy": 0.539283171136957, + "step": 10759 + }, + { + "epoch": 1.9948090470893587, + "grad_norm": 6.30078125, + "learning_rate": 8.005190952910641e-06, + "loss": 2.6911, + "mean_token_accuracy": 0.4833207547169811, + "step": 10760 + }, + { + "epoch": 1.9949944382647384, + "grad_norm": 6.80859375, + "learning_rate": 8.005005561735262e-06, + "loss": 2.9502, + "mean_token_accuracy": 0.4690017513134851, + "step": 10761 + }, + { + "epoch": 1.9951798294401186, + "grad_norm": 8.3046875, + "learning_rate": 8.004820170559882e-06, + "loss": 3.6189, + "mean_token_accuracy": 0.45224908820748344, + "step": 10762 + }, + { + "epoch": 1.9953652206154988, + "grad_norm": 6.11328125, + "learning_rate": 8.004634779384503e-06, + "loss": 2.8786, + "mean_token_accuracy": 0.4625249358791679, + "step": 10763 + }, + { + "epoch": 1.9955506117908788, + "grad_norm": 6.0859375, + "learning_rate": 8.004449388209123e-06, + "loss": 3.063, + "mean_token_accuracy": 0.4746670328161472, + "step": 10764 + }, + { + "epoch": 1.9957360029662587, + "grad_norm": 7.2421875, + "learning_rate": 8.004263997033742e-06, + "loss": 3.327, + "mean_token_accuracy": 0.438682652029467, + "step": 10765 + }, + { + "epoch": 1.995921394141639, + "grad_norm": 7.02734375, + "learning_rate": 8.004078605858362e-06, + "loss": 2.6294, + "mean_token_accuracy": 0.49117381780698366, + "step": 10766 + }, + { + "epoch": 1.996106785317019, + "grad_norm": 7.2109375, + "learning_rate": 8.003893214682981e-06, + "loss": 3.8101, + "mean_token_accuracy": 0.4112128657583203, + "step": 10767 + }, + { + "epoch": 1.9962921764923989, + "grad_norm": 5.37890625, + "learning_rate": 8.003707823507602e-06, + "loss": 2.8985, + "mean_token_accuracy": 0.47346706501636077, + "step": 10768 + }, + { + "epoch": 1.996477567667779, + "grad_norm": 6.8046875, + "learning_rate": 8.003522432332222e-06, + "loss": 2.8025, + "mean_token_accuracy": 0.47616702777230646, + "step": 10769 + }, + { + "epoch": 1.996662958843159, + "grad_norm": 7.0078125, + "learning_rate": 8.003337041156841e-06, + "loss": 2.8019, + "mean_token_accuracy": 0.475470697427738, + "step": 10770 + }, + { + "epoch": 1.996848350018539, + "grad_norm": 7.9296875, + "learning_rate": 8.003151649981461e-06, + "loss": 2.7185, + "mean_token_accuracy": 0.48474033965050456, + "step": 10771 + }, + { + "epoch": 1.9970337411939192, + "grad_norm": 7.10546875, + "learning_rate": 8.002966258806082e-06, + "loss": 2.4837, + "mean_token_accuracy": 0.49939702532493635, + "step": 10772 + }, + { + "epoch": 1.9972191323692994, + "grad_norm": 6.26171875, + "learning_rate": 8.002780867630702e-06, + "loss": 3.3297, + "mean_token_accuracy": 0.4303306116985678, + "step": 10773 + }, + { + "epoch": 1.9974045235446791, + "grad_norm": 6.1171875, + "learning_rate": 8.002595476455321e-06, + "loss": 2.4726, + "mean_token_accuracy": 0.5159839552692355, + "step": 10774 + }, + { + "epoch": 1.9975899147200593, + "grad_norm": 9.3828125, + "learning_rate": 8.002410085279942e-06, + "loss": 2.7426, + "mean_token_accuracy": 0.47046390032261654, + "step": 10775 + }, + { + "epoch": 1.9977753058954395, + "grad_norm": 6.45703125, + "learning_rate": 8.00222469410456e-06, + "loss": 2.3168, + "mean_token_accuracy": 0.5241369632332304, + "step": 10776 + }, + { + "epoch": 1.9979606970708195, + "grad_norm": 5.6953125, + "learning_rate": 8.002039302929181e-06, + "loss": 3.0936, + "mean_token_accuracy": 0.4397991211550534, + "step": 10777 + }, + { + "epoch": 1.9981460882461994, + "grad_norm": 7.296875, + "learning_rate": 8.001853911753802e-06, + "loss": 2.5426, + "mean_token_accuracy": 0.4850966262692434, + "step": 10778 + }, + { + "epoch": 1.9983314794215796, + "grad_norm": 6.328125, + "learning_rate": 8.00166852057842e-06, + "loss": 3.1363, + "mean_token_accuracy": 0.43676767676767675, + "step": 10779 + }, + { + "epoch": 1.9985168705969596, + "grad_norm": 7.78515625, + "learning_rate": 8.00148312940304e-06, + "loss": 1.9923, + "mean_token_accuracy": 0.5631584437258788, + "step": 10780 + }, + { + "epoch": 1.9987022617723396, + "grad_norm": 6.421875, + "learning_rate": 8.001297738227661e-06, + "loss": 3.1193, + "mean_token_accuracy": 0.43542234332425067, + "step": 10781 + }, + { + "epoch": 1.9988876529477198, + "grad_norm": 7.203125, + "learning_rate": 8.001112347052282e-06, + "loss": 3.7044, + "mean_token_accuracy": 0.41835147744945567, + "step": 10782 + }, + { + "epoch": 1.9990730441230997, + "grad_norm": 7.21875, + "learning_rate": 8.0009269558769e-06, + "loss": 4.3436, + "mean_token_accuracy": 0.4155860349127182, + "step": 10783 + }, + { + "epoch": 1.9992584352984797, + "grad_norm": 5.484375, + "learning_rate": 8.000741564701521e-06, + "loss": 3.3995, + "mean_token_accuracy": 0.4310155735906997, + "step": 10784 + }, + { + "epoch": 1.9994438264738599, + "grad_norm": 6.65234375, + "learning_rate": 8.00055617352614e-06, + "loss": 2.9603, + "mean_token_accuracy": 0.465089065141074, + "step": 10785 + }, + { + "epoch": 1.9996292176492398, + "grad_norm": 8.0546875, + "learning_rate": 8.00037078235076e-06, + "loss": 3.042, + "mean_token_accuracy": 0.4503063308373043, + "step": 10786 + }, + { + "epoch": 1.9998146088246198, + "grad_norm": 6.80859375, + "learning_rate": 8.00018539117538e-06, + "loss": 2.5963, + "mean_token_accuracy": 0.5153462749213011, + "step": 10787 + }, + { + "epoch": 2.0, + "grad_norm": 7.71875, + "learning_rate": 8.000000000000001e-06, + "loss": 3.6965, + "mean_token_accuracy": 0.4419183538001991, + "step": 10788 + }, + { + "epoch": 2.00018539117538, + "grad_norm": 8.40625, + "learning_rate": 7.99981460882462e-06, + "loss": 2.4972, + "mean_token_accuracy": 0.48521563536708495, + "step": 10789 + }, + { + "epoch": 2.00037078235076, + "grad_norm": 8.75, + "learning_rate": 7.99962921764924e-06, + "loss": 2.8787, + "mean_token_accuracy": 0.47733677782996714, + "step": 10790 + }, + { + "epoch": 2.00055617352614, + "grad_norm": 8.9296875, + "learning_rate": 7.999443826473861e-06, + "loss": 2.4858, + "mean_token_accuracy": 0.5247357293868922, + "step": 10791 + }, + { + "epoch": 2.0007415647015203, + "grad_norm": 10.0390625, + "learning_rate": 7.99925843529848e-06, + "loss": 2.4654, + "mean_token_accuracy": 0.5227120908483633, + "step": 10792 + }, + { + "epoch": 2.0009269558769, + "grad_norm": 8.6015625, + "learning_rate": 7.9990730441231e-06, + "loss": 2.7942, + "mean_token_accuracy": 0.47761897282864774, + "step": 10793 + }, + { + "epoch": 2.0011123470522802, + "grad_norm": 7.37109375, + "learning_rate": 7.99888765294772e-06, + "loss": 3.0089, + "mean_token_accuracy": 0.47504078303425773, + "step": 10794 + }, + { + "epoch": 2.0012977382276604, + "grad_norm": 8.421875, + "learning_rate": 7.99870226177234e-06, + "loss": 2.2437, + "mean_token_accuracy": 0.5568345323741007, + "step": 10795 + }, + { + "epoch": 2.0014831294030406, + "grad_norm": 8.671875, + "learning_rate": 7.99851687059696e-06, + "loss": 2.5008, + "mean_token_accuracy": 0.5194006915097964, + "step": 10796 + }, + { + "epoch": 2.0016685205784204, + "grad_norm": 8.6875, + "learning_rate": 7.99833147942158e-06, + "loss": 2.4547, + "mean_token_accuracy": 0.5127133009379591, + "step": 10797 + }, + { + "epoch": 2.0018539117538006, + "grad_norm": 6.265625, + "learning_rate": 7.9981460882462e-06, + "loss": 2.5066, + "mean_token_accuracy": 0.5023052959501557, + "step": 10798 + }, + { + "epoch": 2.0020393029291808, + "grad_norm": 7.83203125, + "learning_rate": 7.99796069707082e-06, + "loss": 2.964, + "mean_token_accuracy": 0.4700488269620102, + "step": 10799 + }, + { + "epoch": 2.0022246941045605, + "grad_norm": 7.9453125, + "learning_rate": 7.99777530589544e-06, + "loss": 2.7573, + "mean_token_accuracy": 0.4727040090986636, + "step": 10800 + }, + { + "epoch": 2.0024100852799407, + "grad_norm": 7.94140625, + "learning_rate": 7.99758991472006e-06, + "loss": 3.0897, + "mean_token_accuracy": 0.427367055771725, + "step": 10801 + }, + { + "epoch": 2.002595476455321, + "grad_norm": 9.75, + "learning_rate": 7.99740452354468e-06, + "loss": 2.7898, + "mean_token_accuracy": 0.47820593809222994, + "step": 10802 + }, + { + "epoch": 2.0027808676307006, + "grad_norm": 6.2734375, + "learning_rate": 7.997219132369299e-06, + "loss": 3.0672, + "mean_token_accuracy": 0.4681102828067684, + "step": 10803 + }, + { + "epoch": 2.002966258806081, + "grad_norm": 6.328125, + "learning_rate": 7.99703374119392e-06, + "loss": 2.9532, + "mean_token_accuracy": 0.4748270102600811, + "step": 10804 + }, + { + "epoch": 2.003151649981461, + "grad_norm": 7.41015625, + "learning_rate": 7.99684835001854e-06, + "loss": 2.4855, + "mean_token_accuracy": 0.5426364467933733, + "step": 10805 + }, + { + "epoch": 2.0033370411568407, + "grad_norm": 6.87109375, + "learning_rate": 7.99666295884316e-06, + "loss": 3.4007, + "mean_token_accuracy": 0.4587834664161578, + "step": 10806 + }, + { + "epoch": 2.003522432332221, + "grad_norm": 5.671875, + "learning_rate": 7.99647756766778e-06, + "loss": 3.0086, + "mean_token_accuracy": 0.4724717473196175, + "step": 10807 + }, + { + "epoch": 2.003707823507601, + "grad_norm": 7.90625, + "learning_rate": 7.9962921764924e-06, + "loss": 3.0367, + "mean_token_accuracy": 0.4762157148535842, + "step": 10808 + }, + { + "epoch": 2.003893214682981, + "grad_norm": 5.8984375, + "learning_rate": 7.99610678531702e-06, + "loss": 2.8093, + "mean_token_accuracy": 0.4808464064210142, + "step": 10809 + }, + { + "epoch": 2.004078605858361, + "grad_norm": 6.99609375, + "learning_rate": 7.995921394141639e-06, + "loss": 3.4001, + "mean_token_accuracy": 0.4218236928517302, + "step": 10810 + }, + { + "epoch": 2.0042639970337413, + "grad_norm": 7.48046875, + "learning_rate": 7.995736002966259e-06, + "loss": 2.5082, + "mean_token_accuracy": 0.5175164216452924, + "step": 10811 + }, + { + "epoch": 2.0044493882091214, + "grad_norm": 5.81640625, + "learning_rate": 7.99555061179088e-06, + "loss": 2.2787, + "mean_token_accuracy": 0.5540441176470589, + "step": 10812 + }, + { + "epoch": 2.004634779384501, + "grad_norm": 6.32421875, + "learning_rate": 7.9953652206155e-06, + "loss": 2.8187, + "mean_token_accuracy": 0.4836655592469546, + "step": 10813 + }, + { + "epoch": 2.0048201705598814, + "grad_norm": 5.23828125, + "learning_rate": 7.995179829440119e-06, + "loss": 2.4098, + "mean_token_accuracy": 0.5099636545933074, + "step": 10814 + }, + { + "epoch": 2.0050055617352616, + "grad_norm": 6.05859375, + "learning_rate": 7.99499443826474e-06, + "loss": 3.0315, + "mean_token_accuracy": 0.46397273612463485, + "step": 10815 + }, + { + "epoch": 2.0051909529106413, + "grad_norm": 8.984375, + "learning_rate": 7.99480904708936e-06, + "loss": 3.1042, + "mean_token_accuracy": 0.4725745629070963, + "step": 10816 + }, + { + "epoch": 2.0053763440860215, + "grad_norm": 7.39453125, + "learning_rate": 7.994623655913979e-06, + "loss": 2.541, + "mean_token_accuracy": 0.5356822932218572, + "step": 10817 + }, + { + "epoch": 2.0055617352614017, + "grad_norm": 6.66015625, + "learning_rate": 7.994438264738599e-06, + "loss": 2.3002, + "mean_token_accuracy": 0.5514898065865134, + "step": 10818 + }, + { + "epoch": 2.0057471264367814, + "grad_norm": 6.171875, + "learning_rate": 7.994252873563218e-06, + "loss": 2.7205, + "mean_token_accuracy": 0.49355376914909754, + "step": 10819 + }, + { + "epoch": 2.0059325176121616, + "grad_norm": 7.01953125, + "learning_rate": 7.99406748238784e-06, + "loss": 2.774, + "mean_token_accuracy": 0.4949189450762158, + "step": 10820 + }, + { + "epoch": 2.006117908787542, + "grad_norm": 7.54296875, + "learning_rate": 7.993882091212459e-06, + "loss": 2.7606, + "mean_token_accuracy": 0.47726341663252764, + "step": 10821 + }, + { + "epoch": 2.0063032999629216, + "grad_norm": 5.703125, + "learning_rate": 7.99369670003708e-06, + "loss": 3.1624, + "mean_token_accuracy": 0.4446133796698523, + "step": 10822 + }, + { + "epoch": 2.0064886911383017, + "grad_norm": 6.84375, + "learning_rate": 7.993511308861698e-06, + "loss": 2.5116, + "mean_token_accuracy": 0.5036338859868271, + "step": 10823 + }, + { + "epoch": 2.006674082313682, + "grad_norm": 7.3359375, + "learning_rate": 7.993325917686319e-06, + "loss": 2.6711, + "mean_token_accuracy": 0.4898547244567175, + "step": 10824 + }, + { + "epoch": 2.006859473489062, + "grad_norm": 6.58984375, + "learning_rate": 7.99314052651094e-06, + "loss": 2.7956, + "mean_token_accuracy": 0.47031766566048583, + "step": 10825 + }, + { + "epoch": 2.007044864664442, + "grad_norm": 8.0546875, + "learning_rate": 7.992955135335558e-06, + "loss": 2.5852, + "mean_token_accuracy": 0.500428134556575, + "step": 10826 + }, + { + "epoch": 2.007230255839822, + "grad_norm": 6.90234375, + "learning_rate": 7.992769744160178e-06, + "loss": 3.3116, + "mean_token_accuracy": 0.4528529563868299, + "step": 10827 + }, + { + "epoch": 2.0074156470152023, + "grad_norm": 8.0625, + "learning_rate": 7.992584352984799e-06, + "loss": 2.4038, + "mean_token_accuracy": 0.522578360191252, + "step": 10828 + }, + { + "epoch": 2.007601038190582, + "grad_norm": 7.75, + "learning_rate": 7.99239896180942e-06, + "loss": 2.6199, + "mean_token_accuracy": 0.5260590500641849, + "step": 10829 + }, + { + "epoch": 2.007786429365962, + "grad_norm": 9.234375, + "learning_rate": 7.992213570634038e-06, + "loss": 2.4868, + "mean_token_accuracy": 0.5149768399382398, + "step": 10830 + }, + { + "epoch": 2.0079718205413424, + "grad_norm": 6.234375, + "learning_rate": 7.992028179458659e-06, + "loss": 2.7919, + "mean_token_accuracy": 0.4741869181477451, + "step": 10831 + }, + { + "epoch": 2.008157211716722, + "grad_norm": 8.046875, + "learning_rate": 7.991842788283278e-06, + "loss": 2.6223, + "mean_token_accuracy": 0.4823549664313996, + "step": 10832 + }, + { + "epoch": 2.0083426028921023, + "grad_norm": 7.75390625, + "learning_rate": 7.991657397107898e-06, + "loss": 3.3037, + "mean_token_accuracy": 0.46093133385951063, + "step": 10833 + }, + { + "epoch": 2.0085279940674825, + "grad_norm": 7.69921875, + "learning_rate": 7.991472005932519e-06, + "loss": 2.6483, + "mean_token_accuracy": 0.4977668258124134, + "step": 10834 + }, + { + "epoch": 2.0087133852428622, + "grad_norm": 6.88671875, + "learning_rate": 7.991286614757137e-06, + "loss": 3.1207, + "mean_token_accuracy": 0.446443172526574, + "step": 10835 + }, + { + "epoch": 2.0088987764182424, + "grad_norm": 8.3359375, + "learning_rate": 7.99110122358176e-06, + "loss": 3.3604, + "mean_token_accuracy": 0.4459804658151766, + "step": 10836 + }, + { + "epoch": 2.0090841675936226, + "grad_norm": 9.8984375, + "learning_rate": 7.990915832406378e-06, + "loss": 3.2352, + "mean_token_accuracy": 0.4567315055904426, + "step": 10837 + }, + { + "epoch": 2.0092695587690024, + "grad_norm": 7.83984375, + "learning_rate": 7.990730441230999e-06, + "loss": 3.4465, + "mean_token_accuracy": 0.4493016037247801, + "step": 10838 + }, + { + "epoch": 2.0094549499443826, + "grad_norm": 7.85546875, + "learning_rate": 7.990545050055618e-06, + "loss": 2.1356, + "mean_token_accuracy": 0.5392230711288066, + "step": 10839 + }, + { + "epoch": 2.0096403411197628, + "grad_norm": 8.9921875, + "learning_rate": 7.990359658880238e-06, + "loss": 3.1533, + "mean_token_accuracy": 0.45023466580352806, + "step": 10840 + }, + { + "epoch": 2.009825732295143, + "grad_norm": 10.8203125, + "learning_rate": 7.990174267704857e-06, + "loss": 2.6224, + "mean_token_accuracy": 0.4899822815864795, + "step": 10841 + }, + { + "epoch": 2.0100111234705227, + "grad_norm": 8.6875, + "learning_rate": 7.989988876529477e-06, + "loss": 2.3736, + "mean_token_accuracy": 0.5177955005356505, + "step": 10842 + }, + { + "epoch": 2.010196514645903, + "grad_norm": 9.3359375, + "learning_rate": 7.989803485354098e-06, + "loss": 2.6667, + "mean_token_accuracy": 0.49387755102040815, + "step": 10843 + }, + { + "epoch": 2.010381905821283, + "grad_norm": 8.3671875, + "learning_rate": 7.989618094178718e-06, + "loss": 3.2224, + "mean_token_accuracy": 0.45077978789769185, + "step": 10844 + }, + { + "epoch": 2.010567296996663, + "grad_norm": 6.59375, + "learning_rate": 7.989432703003339e-06, + "loss": 2.4606, + "mean_token_accuracy": 0.5144292557541769, + "step": 10845 + }, + { + "epoch": 2.010752688172043, + "grad_norm": 8.375, + "learning_rate": 7.989247311827958e-06, + "loss": 2.7893, + "mean_token_accuracy": 0.4937189599766287, + "step": 10846 + }, + { + "epoch": 2.010938079347423, + "grad_norm": 7.49609375, + "learning_rate": 7.989061920652578e-06, + "loss": 3.2547, + "mean_token_accuracy": 0.43530444964871196, + "step": 10847 + }, + { + "epoch": 2.011123470522803, + "grad_norm": 7.94921875, + "learning_rate": 7.988876529477197e-06, + "loss": 2.7162, + "mean_token_accuracy": 0.474468085106383, + "step": 10848 + }, + { + "epoch": 2.011308861698183, + "grad_norm": 7.671875, + "learning_rate": 7.988691138301817e-06, + "loss": 2.9402, + "mean_token_accuracy": 0.46807917479788125, + "step": 10849 + }, + { + "epoch": 2.0114942528735633, + "grad_norm": 9.140625, + "learning_rate": 7.988505747126438e-06, + "loss": 3.1294, + "mean_token_accuracy": 0.44867807153965783, + "step": 10850 + }, + { + "epoch": 2.011679644048943, + "grad_norm": 7.3359375, + "learning_rate": 7.988320355951057e-06, + "loss": 2.8828, + "mean_token_accuracy": 0.4852829037669666, + "step": 10851 + }, + { + "epoch": 2.0118650352243233, + "grad_norm": 6.9921875, + "learning_rate": 7.988134964775677e-06, + "loss": 2.8591, + "mean_token_accuracy": 0.46142046326123637, + "step": 10852 + }, + { + "epoch": 2.0120504263997034, + "grad_norm": 10.5859375, + "learning_rate": 7.987949573600298e-06, + "loss": 2.0983, + "mean_token_accuracy": 0.5826745718050066, + "step": 10853 + }, + { + "epoch": 2.0122358175750836, + "grad_norm": 6.8671875, + "learning_rate": 7.987764182424918e-06, + "loss": 2.4871, + "mean_token_accuracy": 0.5379694593479158, + "step": 10854 + }, + { + "epoch": 2.0124212087504634, + "grad_norm": 6.29296875, + "learning_rate": 7.987578791249537e-06, + "loss": 2.9109, + "mean_token_accuracy": 0.4916935283907761, + "step": 10855 + }, + { + "epoch": 2.0126065999258436, + "grad_norm": 6.31640625, + "learning_rate": 7.987393400074157e-06, + "loss": 3.7756, + "mean_token_accuracy": 0.41575239635061784, + "step": 10856 + }, + { + "epoch": 2.0127919911012238, + "grad_norm": 6.875, + "learning_rate": 7.987208008898776e-06, + "loss": 2.7283, + "mean_token_accuracy": 0.4952919020715631, + "step": 10857 + }, + { + "epoch": 2.0129773822766035, + "grad_norm": 8.890625, + "learning_rate": 7.987022617723397e-06, + "loss": 2.7272, + "mean_token_accuracy": 0.48917618761274806, + "step": 10858 + }, + { + "epoch": 2.0131627734519837, + "grad_norm": 7.08203125, + "learning_rate": 7.986837226548017e-06, + "loss": 2.4486, + "mean_token_accuracy": 0.5209363186972957, + "step": 10859 + }, + { + "epoch": 2.013348164627364, + "grad_norm": 6.30859375, + "learning_rate": 7.986651835372638e-06, + "loss": 2.1878, + "mean_token_accuracy": 0.5513748191027497, + "step": 10860 + }, + { + "epoch": 2.0135335558027436, + "grad_norm": 9.453125, + "learning_rate": 7.986466444197257e-06, + "loss": 2.906, + "mean_token_accuracy": 0.4704703649019194, + "step": 10861 + }, + { + "epoch": 2.013718946978124, + "grad_norm": 6.24609375, + "learning_rate": 7.986281053021877e-06, + "loss": 2.4332, + "mean_token_accuracy": 0.5122103944896681, + "step": 10862 + }, + { + "epoch": 2.013904338153504, + "grad_norm": 8.65625, + "learning_rate": 7.986095661846498e-06, + "loss": 2.7561, + "mean_token_accuracy": 0.4879302103250478, + "step": 10863 + }, + { + "epoch": 2.0140897293288837, + "grad_norm": 8.03125, + "learning_rate": 7.985910270671116e-06, + "loss": 2.5752, + "mean_token_accuracy": 0.5098098400241473, + "step": 10864 + }, + { + "epoch": 2.014275120504264, + "grad_norm": 7.53515625, + "learning_rate": 7.985724879495737e-06, + "loss": 2.524, + "mean_token_accuracy": 0.5269403075432861, + "step": 10865 + }, + { + "epoch": 2.014460511679644, + "grad_norm": 7.4453125, + "learning_rate": 7.985539488320356e-06, + "loss": 2.5592, + "mean_token_accuracy": 0.5008982035928143, + "step": 10866 + }, + { + "epoch": 2.0146459028550243, + "grad_norm": 6.07421875, + "learning_rate": 7.985354097144976e-06, + "loss": 2.8083, + "mean_token_accuracy": 0.48344170573113565, + "step": 10867 + }, + { + "epoch": 2.014831294030404, + "grad_norm": 6.2578125, + "learning_rate": 7.985168705969597e-06, + "loss": 3.3403, + "mean_token_accuracy": 0.4456066945606695, + "step": 10868 + }, + { + "epoch": 2.0150166852057843, + "grad_norm": 7.8984375, + "learning_rate": 7.984983314794217e-06, + "loss": 2.6673, + "mean_token_accuracy": 0.5100611309220581, + "step": 10869 + }, + { + "epoch": 2.0152020763811644, + "grad_norm": 6.765625, + "learning_rate": 7.984797923618836e-06, + "loss": 3.2952, + "mean_token_accuracy": 0.45961779885618637, + "step": 10870 + }, + { + "epoch": 2.015387467556544, + "grad_norm": 6.1328125, + "learning_rate": 7.984612532443456e-06, + "loss": 2.6104, + "mean_token_accuracy": 0.48666331152491193, + "step": 10871 + }, + { + "epoch": 2.0155728587319244, + "grad_norm": 7.64453125, + "learning_rate": 7.984427141268077e-06, + "loss": 2.4744, + "mean_token_accuracy": 0.511060507482108, + "step": 10872 + }, + { + "epoch": 2.0157582499073046, + "grad_norm": 6.2421875, + "learning_rate": 7.984241750092696e-06, + "loss": 2.5579, + "mean_token_accuracy": 0.520836536982931, + "step": 10873 + }, + { + "epoch": 2.0159436410826843, + "grad_norm": 6.69921875, + "learning_rate": 7.984056358917316e-06, + "loss": 2.7044, + "mean_token_accuracy": 0.4944746825004123, + "step": 10874 + }, + { + "epoch": 2.0161290322580645, + "grad_norm": 9.71875, + "learning_rate": 7.983870967741935e-06, + "loss": 2.6038, + "mean_token_accuracy": 0.4991414608374059, + "step": 10875 + }, + { + "epoch": 2.0163144234334447, + "grad_norm": 6.125, + "learning_rate": 7.983685576566557e-06, + "loss": 2.6446, + "mean_token_accuracy": 0.5006514657980456, + "step": 10876 + }, + { + "epoch": 2.0164998146088244, + "grad_norm": 7.1796875, + "learning_rate": 7.983500185391176e-06, + "loss": 2.4562, + "mean_token_accuracy": 0.5112548861032484, + "step": 10877 + }, + { + "epoch": 2.0166852057842046, + "grad_norm": 5.72265625, + "learning_rate": 7.983314794215796e-06, + "loss": 2.5686, + "mean_token_accuracy": 0.5081474296799224, + "step": 10878 + }, + { + "epoch": 2.016870596959585, + "grad_norm": 6.77734375, + "learning_rate": 7.983129403040415e-06, + "loss": 3.7187, + "mean_token_accuracy": 0.4262147570485903, + "step": 10879 + }, + { + "epoch": 2.0170559881349646, + "grad_norm": 7.109375, + "learning_rate": 7.982944011865036e-06, + "loss": 3.4231, + "mean_token_accuracy": 0.45064438464806683, + "step": 10880 + }, + { + "epoch": 2.0172413793103448, + "grad_norm": 6.54296875, + "learning_rate": 7.982758620689656e-06, + "loss": 2.7103, + "mean_token_accuracy": 0.4659292497130335, + "step": 10881 + }, + { + "epoch": 2.017426770485725, + "grad_norm": 6.5625, + "learning_rate": 7.982573229514275e-06, + "loss": 2.7305, + "mean_token_accuracy": 0.5007112375533428, + "step": 10882 + }, + { + "epoch": 2.017612161661105, + "grad_norm": 6.05078125, + "learning_rate": 7.982387838338895e-06, + "loss": 3.1448, + "mean_token_accuracy": 0.4521201185719809, + "step": 10883 + }, + { + "epoch": 2.017797552836485, + "grad_norm": 6.19921875, + "learning_rate": 7.982202447163516e-06, + "loss": 2.7618, + "mean_token_accuracy": 0.5175892738616753, + "step": 10884 + }, + { + "epoch": 2.017982944011865, + "grad_norm": 7.63671875, + "learning_rate": 7.982017055988136e-06, + "loss": 3.4577, + "mean_token_accuracy": 0.4248587570621469, + "step": 10885 + }, + { + "epoch": 2.0181683351872453, + "grad_norm": 7.63671875, + "learning_rate": 7.981831664812755e-06, + "loss": 3.2489, + "mean_token_accuracy": 0.47715404699738906, + "step": 10886 + }, + { + "epoch": 2.018353726362625, + "grad_norm": 7.15234375, + "learning_rate": 7.981646273637376e-06, + "loss": 3.5504, + "mean_token_accuracy": 0.46905537459283386, + "step": 10887 + }, + { + "epoch": 2.018539117538005, + "grad_norm": 5.9375, + "learning_rate": 7.981460882461996e-06, + "loss": 2.5756, + "mean_token_accuracy": 0.49847581179589134, + "step": 10888 + }, + { + "epoch": 2.0187245087133854, + "grad_norm": 7.21484375, + "learning_rate": 7.981275491286615e-06, + "loss": 3.0198, + "mean_token_accuracy": 0.47399483585392843, + "step": 10889 + }, + { + "epoch": 2.018909899888765, + "grad_norm": 7.6015625, + "learning_rate": 7.981090100111236e-06, + "loss": 2.9589, + "mean_token_accuracy": 0.47300245432233434, + "step": 10890 + }, + { + "epoch": 2.0190952910641453, + "grad_norm": 6.3359375, + "learning_rate": 7.980904708935854e-06, + "loss": 3.0154, + "mean_token_accuracy": 0.4870176890452053, + "step": 10891 + }, + { + "epoch": 2.0192806822395255, + "grad_norm": 8.9453125, + "learning_rate": 7.980719317760475e-06, + "loss": 2.6939, + "mean_token_accuracy": 0.5089450956199877, + "step": 10892 + }, + { + "epoch": 2.0194660734149052, + "grad_norm": 7.66796875, + "learning_rate": 7.980533926585095e-06, + "loss": 2.7276, + "mean_token_accuracy": 0.47743113176236307, + "step": 10893 + }, + { + "epoch": 2.0196514645902854, + "grad_norm": 6.5859375, + "learning_rate": 7.980348535409716e-06, + "loss": 3.1401, + "mean_token_accuracy": 0.44131646946824427, + "step": 10894 + }, + { + "epoch": 2.0198368557656656, + "grad_norm": 7.03125, + "learning_rate": 7.980163144234335e-06, + "loss": 2.5318, + "mean_token_accuracy": 0.48655256723716384, + "step": 10895 + }, + { + "epoch": 2.020022246941046, + "grad_norm": 7.0703125, + "learning_rate": 7.979977753058955e-06, + "loss": 3.2773, + "mean_token_accuracy": 0.4398320895522388, + "step": 10896 + }, + { + "epoch": 2.0202076381164256, + "grad_norm": 7.7890625, + "learning_rate": 7.979792361883576e-06, + "loss": 2.7659, + "mean_token_accuracy": 0.47199885123492247, + "step": 10897 + }, + { + "epoch": 2.0203930292918058, + "grad_norm": 7.06640625, + "learning_rate": 7.979606970708194e-06, + "loss": 2.8227, + "mean_token_accuracy": 0.458602931467054, + "step": 10898 + }, + { + "epoch": 2.020578420467186, + "grad_norm": 6.484375, + "learning_rate": 7.979421579532815e-06, + "loss": 3.4512, + "mean_token_accuracy": 0.437013082463619, + "step": 10899 + }, + { + "epoch": 2.0207638116425657, + "grad_norm": 7.05859375, + "learning_rate": 7.979236188357434e-06, + "loss": 2.8602, + "mean_token_accuracy": 0.47945372515546886, + "step": 10900 + }, + { + "epoch": 2.020949202817946, + "grad_norm": 6.02734375, + "learning_rate": 7.979050797182056e-06, + "loss": 2.8821, + "mean_token_accuracy": 0.505519093078759, + "step": 10901 + }, + { + "epoch": 2.021134593993326, + "grad_norm": 9.3359375, + "learning_rate": 7.978865406006675e-06, + "loss": 2.4815, + "mean_token_accuracy": 0.5040837455423904, + "step": 10902 + }, + { + "epoch": 2.021319985168706, + "grad_norm": 8.3359375, + "learning_rate": 7.978680014831295e-06, + "loss": 2.8917, + "mean_token_accuracy": 0.4811552907654449, + "step": 10903 + }, + { + "epoch": 2.021505376344086, + "grad_norm": 7.52734375, + "learning_rate": 7.978494623655914e-06, + "loss": 3.5858, + "mean_token_accuracy": 0.42793682132280353, + "step": 10904 + }, + { + "epoch": 2.021690767519466, + "grad_norm": 6.71875, + "learning_rate": 7.978309232480534e-06, + "loss": 2.4233, + "mean_token_accuracy": 0.5106638229805385, + "step": 10905 + }, + { + "epoch": 2.021876158694846, + "grad_norm": 7.12890625, + "learning_rate": 7.978123841305155e-06, + "loss": 3.006, + "mean_token_accuracy": 0.475947622329428, + "step": 10906 + }, + { + "epoch": 2.022061549870226, + "grad_norm": 7.296875, + "learning_rate": 7.977938450129774e-06, + "loss": 2.7761, + "mean_token_accuracy": 0.48358348968105064, + "step": 10907 + }, + { + "epoch": 2.0222469410456063, + "grad_norm": 7.78125, + "learning_rate": 7.977753058954394e-06, + "loss": 2.9711, + "mean_token_accuracy": 0.4762996019780485, + "step": 10908 + }, + { + "epoch": 2.022432332220986, + "grad_norm": 6.7890625, + "learning_rate": 7.977567667779015e-06, + "loss": 3.3472, + "mean_token_accuracy": 0.44415797492089537, + "step": 10909 + }, + { + "epoch": 2.0226177233963663, + "grad_norm": 10.59375, + "learning_rate": 7.977382276603635e-06, + "loss": 2.7198, + "mean_token_accuracy": 0.4860913993755322, + "step": 10910 + }, + { + "epoch": 2.0228031145717464, + "grad_norm": 7.125, + "learning_rate": 7.977196885428254e-06, + "loss": 2.6829, + "mean_token_accuracy": 0.4999343228687771, + "step": 10911 + }, + { + "epoch": 2.0229885057471266, + "grad_norm": 6.87890625, + "learning_rate": 7.977011494252874e-06, + "loss": 2.8713, + "mean_token_accuracy": 0.4770951226084613, + "step": 10912 + }, + { + "epoch": 2.0231738969225064, + "grad_norm": 8.0234375, + "learning_rate": 7.976826103077493e-06, + "loss": 2.5937, + "mean_token_accuracy": 0.5087930257026905, + "step": 10913 + }, + { + "epoch": 2.0233592880978866, + "grad_norm": 6.671875, + "learning_rate": 7.976640711902114e-06, + "loss": 2.8037, + "mean_token_accuracy": 0.5107650903498654, + "step": 10914 + }, + { + "epoch": 2.0235446792732668, + "grad_norm": 5.94921875, + "learning_rate": 7.976455320726734e-06, + "loss": 2.7132, + "mean_token_accuracy": 0.48962177121771217, + "step": 10915 + }, + { + "epoch": 2.0237300704486465, + "grad_norm": 14.46875, + "learning_rate": 7.976269929551353e-06, + "loss": 2.3827, + "mean_token_accuracy": 0.5002660281989891, + "step": 10916 + }, + { + "epoch": 2.0239154616240267, + "grad_norm": 6.30859375, + "learning_rate": 7.976084538375974e-06, + "loss": 2.8899, + "mean_token_accuracy": 0.4838774485183325, + "step": 10917 + }, + { + "epoch": 2.024100852799407, + "grad_norm": 7.23828125, + "learning_rate": 7.975899147200594e-06, + "loss": 2.8815, + "mean_token_accuracy": 0.4639410868461148, + "step": 10918 + }, + { + "epoch": 2.0242862439747866, + "grad_norm": 6.92578125, + "learning_rate": 7.975713756025215e-06, + "loss": 2.7172, + "mean_token_accuracy": 0.4928514694201747, + "step": 10919 + }, + { + "epoch": 2.024471635150167, + "grad_norm": 5.5546875, + "learning_rate": 7.975528364849833e-06, + "loss": 2.6342, + "mean_token_accuracy": 0.4960535117056856, + "step": 10920 + }, + { + "epoch": 2.024657026325547, + "grad_norm": 6.09375, + "learning_rate": 7.975342973674454e-06, + "loss": 2.5127, + "mean_token_accuracy": 0.5042016806722689, + "step": 10921 + }, + { + "epoch": 2.0248424175009268, + "grad_norm": 11.515625, + "learning_rate": 7.975157582499073e-06, + "loss": 3.2478, + "mean_token_accuracy": 0.4659694288012872, + "step": 10922 + }, + { + "epoch": 2.025027808676307, + "grad_norm": 8.8984375, + "learning_rate": 7.974972191323693e-06, + "loss": 3.0458, + "mean_token_accuracy": 0.481604820805582, + "step": 10923 + }, + { + "epoch": 2.025213199851687, + "grad_norm": 6.3515625, + "learning_rate": 7.974786800148314e-06, + "loss": 2.9322, + "mean_token_accuracy": 0.4709795824730443, + "step": 10924 + }, + { + "epoch": 2.0253985910270673, + "grad_norm": 7.6015625, + "learning_rate": 7.974601408972934e-06, + "loss": 2.2006, + "mean_token_accuracy": 0.523441126411883, + "step": 10925 + }, + { + "epoch": 2.025583982202447, + "grad_norm": 6.86328125, + "learning_rate": 7.974416017797555e-06, + "loss": 2.7883, + "mean_token_accuracy": 0.47608105285147273, + "step": 10926 + }, + { + "epoch": 2.0257693733778273, + "grad_norm": 7.8984375, + "learning_rate": 7.974230626622173e-06, + "loss": 2.3551, + "mean_token_accuracy": 0.51952770208901, + "step": 10927 + }, + { + "epoch": 2.0259547645532074, + "grad_norm": 6.23046875, + "learning_rate": 7.974045235446794e-06, + "loss": 2.6286, + "mean_token_accuracy": 0.5008053887831307, + "step": 10928 + }, + { + "epoch": 2.026140155728587, + "grad_norm": 5.890625, + "learning_rate": 7.973859844271413e-06, + "loss": 2.8286, + "mean_token_accuracy": 0.47745149449396956, + "step": 10929 + }, + { + "epoch": 2.0263255469039674, + "grad_norm": 7.53515625, + "learning_rate": 7.973674453096033e-06, + "loss": 2.5628, + "mean_token_accuracy": 0.5028285751874754, + "step": 10930 + }, + { + "epoch": 2.0265109380793476, + "grad_norm": 5.84375, + "learning_rate": 7.973489061920654e-06, + "loss": 2.2795, + "mean_token_accuracy": 0.5443146500910943, + "step": 10931 + }, + { + "epoch": 2.0266963292547273, + "grad_norm": 6.78125, + "learning_rate": 7.973303670745272e-06, + "loss": 2.8108, + "mean_token_accuracy": 0.5187316868982838, + "step": 10932 + }, + { + "epoch": 2.0268817204301075, + "grad_norm": 6.7109375, + "learning_rate": 7.973118279569893e-06, + "loss": 2.0401, + "mean_token_accuracy": 0.5779006699989365, + "step": 10933 + }, + { + "epoch": 2.0270671116054877, + "grad_norm": 6.078125, + "learning_rate": 7.972932888394513e-06, + "loss": 2.4783, + "mean_token_accuracy": 0.5137121854679106, + "step": 10934 + }, + { + "epoch": 2.0272525027808674, + "grad_norm": 6.26171875, + "learning_rate": 7.972747497219134e-06, + "loss": 2.735, + "mean_token_accuracy": 0.49124012366884234, + "step": 10935 + }, + { + "epoch": 2.0274378939562476, + "grad_norm": 6.2578125, + "learning_rate": 7.972562106043753e-06, + "loss": 3.2162, + "mean_token_accuracy": 0.4650735294117647, + "step": 10936 + }, + { + "epoch": 2.027623285131628, + "grad_norm": 6.796875, + "learning_rate": 7.972376714868373e-06, + "loss": 3.0123, + "mean_token_accuracy": 0.45938183807439825, + "step": 10937 + }, + { + "epoch": 2.027808676307008, + "grad_norm": 6.6640625, + "learning_rate": 7.972191323692992e-06, + "loss": 2.9601, + "mean_token_accuracy": 0.4889024950252564, + "step": 10938 + }, + { + "epoch": 2.0279940674823878, + "grad_norm": 6.828125, + "learning_rate": 7.972005932517613e-06, + "loss": 2.5415, + "mean_token_accuracy": 0.509349593495935, + "step": 10939 + }, + { + "epoch": 2.028179458657768, + "grad_norm": 7.79296875, + "learning_rate": 7.971820541342233e-06, + "loss": 3.7182, + "mean_token_accuracy": 0.4177583697234352, + "step": 10940 + }, + { + "epoch": 2.028364849833148, + "grad_norm": 9.2265625, + "learning_rate": 7.971635150166853e-06, + "loss": 3.4607, + "mean_token_accuracy": 0.4235700197238659, + "step": 10941 + }, + { + "epoch": 2.028550241008528, + "grad_norm": 6.9765625, + "learning_rate": 7.971449758991472e-06, + "loss": 3.1936, + "mean_token_accuracy": 0.46866051543111675, + "step": 10942 + }, + { + "epoch": 2.028735632183908, + "grad_norm": 8.7265625, + "learning_rate": 7.971264367816093e-06, + "loss": 2.5056, + "mean_token_accuracy": 0.4971355080088858, + "step": 10943 + }, + { + "epoch": 2.0289210233592883, + "grad_norm": 6.27734375, + "learning_rate": 7.971078976640713e-06, + "loss": 2.8824, + "mean_token_accuracy": 0.4587579834216606, + "step": 10944 + }, + { + "epoch": 2.029106414534668, + "grad_norm": 6.51953125, + "learning_rate": 7.970893585465332e-06, + "loss": 2.5519, + "mean_token_accuracy": 0.5053327150475307, + "step": 10945 + }, + { + "epoch": 2.029291805710048, + "grad_norm": 6.2109375, + "learning_rate": 7.970708194289953e-06, + "loss": 2.6741, + "mean_token_accuracy": 0.4846927374301676, + "step": 10946 + }, + { + "epoch": 2.0294771968854284, + "grad_norm": 6.8828125, + "learning_rate": 7.970522803114571e-06, + "loss": 2.5006, + "mean_token_accuracy": 0.5080246913580246, + "step": 10947 + }, + { + "epoch": 2.029662588060808, + "grad_norm": 6.328125, + "learning_rate": 7.970337411939192e-06, + "loss": 2.5913, + "mean_token_accuracy": 0.4989371752479924, + "step": 10948 + }, + { + "epoch": 2.0298479792361883, + "grad_norm": 6.84375, + "learning_rate": 7.970152020763812e-06, + "loss": 3.1748, + "mean_token_accuracy": 0.47072497457504, + "step": 10949 + }, + { + "epoch": 2.0300333704115685, + "grad_norm": 6.3359375, + "learning_rate": 7.969966629588433e-06, + "loss": 2.6583, + "mean_token_accuracy": 0.4880167451596023, + "step": 10950 + }, + { + "epoch": 2.0302187615869483, + "grad_norm": 6.73828125, + "learning_rate": 7.969781238413052e-06, + "loss": 2.8343, + "mean_token_accuracy": 0.47580174927113705, + "step": 10951 + }, + { + "epoch": 2.0304041527623284, + "grad_norm": 6.9140625, + "learning_rate": 7.969595847237672e-06, + "loss": 2.7853, + "mean_token_accuracy": 0.4761566678499586, + "step": 10952 + }, + { + "epoch": 2.0305895439377086, + "grad_norm": 7.32421875, + "learning_rate": 7.969410456062293e-06, + "loss": 2.9687, + "mean_token_accuracy": 0.4613072877535687, + "step": 10953 + }, + { + "epoch": 2.030774935113089, + "grad_norm": 8.0859375, + "learning_rate": 7.969225064886911e-06, + "loss": 2.3049, + "mean_token_accuracy": 0.5537666174298376, + "step": 10954 + }, + { + "epoch": 2.0309603262884686, + "grad_norm": 6.578125, + "learning_rate": 7.969039673711532e-06, + "loss": 2.5755, + "mean_token_accuracy": 0.5130607941899965, + "step": 10955 + }, + { + "epoch": 2.0311457174638488, + "grad_norm": 12.515625, + "learning_rate": 7.96885428253615e-06, + "loss": 2.4719, + "mean_token_accuracy": 0.4801307590152212, + "step": 10956 + }, + { + "epoch": 2.031331108639229, + "grad_norm": 7.94140625, + "learning_rate": 7.968668891360773e-06, + "loss": 2.6705, + "mean_token_accuracy": 0.4752743337608665, + "step": 10957 + }, + { + "epoch": 2.0315164998146087, + "grad_norm": 5.72265625, + "learning_rate": 7.968483500185392e-06, + "loss": 2.5384, + "mean_token_accuracy": 0.5143737166324436, + "step": 10958 + }, + { + "epoch": 2.031701890989989, + "grad_norm": 7.2265625, + "learning_rate": 7.968298109010012e-06, + "loss": 2.3619, + "mean_token_accuracy": 0.5525588738323904, + "step": 10959 + }, + { + "epoch": 2.031887282165369, + "grad_norm": 9.6171875, + "learning_rate": 7.968112717834631e-06, + "loss": 2.6587, + "mean_token_accuracy": 0.48940269749518306, + "step": 10960 + }, + { + "epoch": 2.032072673340749, + "grad_norm": 6.453125, + "learning_rate": 7.967927326659251e-06, + "loss": 2.7894, + "mean_token_accuracy": 0.4845300642148278, + "step": 10961 + }, + { + "epoch": 2.032258064516129, + "grad_norm": 6.1015625, + "learning_rate": 7.967741935483872e-06, + "loss": 3.1091, + "mean_token_accuracy": 0.4636527485731451, + "step": 10962 + }, + { + "epoch": 2.032443455691509, + "grad_norm": 6.53125, + "learning_rate": 7.96755654430849e-06, + "loss": 2.6311, + "mean_token_accuracy": 0.48192019950124687, + "step": 10963 + }, + { + "epoch": 2.032628846866889, + "grad_norm": 6.96875, + "learning_rate": 7.967371153133111e-06, + "loss": 3.2455, + "mean_token_accuracy": 0.4506332757628094, + "step": 10964 + }, + { + "epoch": 2.032814238042269, + "grad_norm": 7.109375, + "learning_rate": 7.967185761957732e-06, + "loss": 3.3215, + "mean_token_accuracy": 0.46040593122305185, + "step": 10965 + }, + { + "epoch": 2.0329996292176493, + "grad_norm": 6.8359375, + "learning_rate": 7.967000370782352e-06, + "loss": 2.4834, + "mean_token_accuracy": 0.5414207898320472, + "step": 10966 + }, + { + "epoch": 2.0331850203930295, + "grad_norm": 6.18359375, + "learning_rate": 7.966814979606971e-06, + "loss": 1.9284, + "mean_token_accuracy": 0.5845596558750283, + "step": 10967 + }, + { + "epoch": 2.0333704115684093, + "grad_norm": 6.328125, + "learning_rate": 7.966629588431592e-06, + "loss": 2.6183, + "mean_token_accuracy": 0.5165553502913173, + "step": 10968 + }, + { + "epoch": 2.0335558027437894, + "grad_norm": 7.43359375, + "learning_rate": 7.966444197256212e-06, + "loss": 2.8584, + "mean_token_accuracy": 0.4656469408224674, + "step": 10969 + }, + { + "epoch": 2.0337411939191696, + "grad_norm": 7.2890625, + "learning_rate": 7.96625880608083e-06, + "loss": 3.08, + "mean_token_accuracy": 0.4680874316939891, + "step": 10970 + }, + { + "epoch": 2.0339265850945494, + "grad_norm": 8.5625, + "learning_rate": 7.966073414905451e-06, + "loss": 3.034, + "mean_token_accuracy": 0.4369439344725321, + "step": 10971 + }, + { + "epoch": 2.0341119762699296, + "grad_norm": 7.45703125, + "learning_rate": 7.96588802373007e-06, + "loss": 2.5981, + "mean_token_accuracy": 0.49218932152016787, + "step": 10972 + }, + { + "epoch": 2.0342973674453098, + "grad_norm": 6.54296875, + "learning_rate": 7.965702632554692e-06, + "loss": 3.5325, + "mean_token_accuracy": 0.4231917010089527, + "step": 10973 + }, + { + "epoch": 2.0344827586206895, + "grad_norm": 8.359375, + "learning_rate": 7.965517241379311e-06, + "loss": 2.7116, + "mean_token_accuracy": 0.4628733697938578, + "step": 10974 + }, + { + "epoch": 2.0346681497960697, + "grad_norm": 6.2421875, + "learning_rate": 7.965331850203932e-06, + "loss": 2.4349, + "mean_token_accuracy": 0.5247459416655669, + "step": 10975 + }, + { + "epoch": 2.03485354097145, + "grad_norm": 7.83984375, + "learning_rate": 7.96514645902855e-06, + "loss": 2.6804, + "mean_token_accuracy": 0.5245264691597863, + "step": 10976 + }, + { + "epoch": 2.0350389321468296, + "grad_norm": 8.5390625, + "learning_rate": 7.964961067853171e-06, + "loss": 2.9453, + "mean_token_accuracy": 0.45475113122171945, + "step": 10977 + }, + { + "epoch": 2.03522432332221, + "grad_norm": 6.2421875, + "learning_rate": 7.964775676677791e-06, + "loss": 2.4727, + "mean_token_accuracy": 0.5122006841505131, + "step": 10978 + }, + { + "epoch": 2.03540971449759, + "grad_norm": 6.15234375, + "learning_rate": 7.96459028550241e-06, + "loss": 2.7964, + "mean_token_accuracy": 0.4700374531835206, + "step": 10979 + }, + { + "epoch": 2.0355951056729698, + "grad_norm": 7.4296875, + "learning_rate": 7.96440489432703e-06, + "loss": 2.8148, + "mean_token_accuracy": 0.4783099864437415, + "step": 10980 + }, + { + "epoch": 2.03578049684835, + "grad_norm": 6.9609375, + "learning_rate": 7.964219503151651e-06, + "loss": 2.8028, + "mean_token_accuracy": 0.5144400352733686, + "step": 10981 + }, + { + "epoch": 2.03596588802373, + "grad_norm": 6.57421875, + "learning_rate": 7.964034111976272e-06, + "loss": 3.2683, + "mean_token_accuracy": 0.4507530321204851, + "step": 10982 + }, + { + "epoch": 2.0361512791991103, + "grad_norm": 6.875, + "learning_rate": 7.96384872080089e-06, + "loss": 2.8861, + "mean_token_accuracy": 0.5095316545069428, + "step": 10983 + }, + { + "epoch": 2.03633667037449, + "grad_norm": 6.65625, + "learning_rate": 7.963663329625511e-06, + "loss": 2.8372, + "mean_token_accuracy": 0.47297894271830365, + "step": 10984 + }, + { + "epoch": 2.0365220615498703, + "grad_norm": 6.359375, + "learning_rate": 7.96347793845013e-06, + "loss": 2.4382, + "mean_token_accuracy": 0.5084179104477612, + "step": 10985 + }, + { + "epoch": 2.0367074527252504, + "grad_norm": 6.29296875, + "learning_rate": 7.96329254727475e-06, + "loss": 2.5758, + "mean_token_accuracy": 0.49111656796186626, + "step": 10986 + }, + { + "epoch": 2.03689284390063, + "grad_norm": 6.9375, + "learning_rate": 7.96310715609937e-06, + "loss": 3.2392, + "mean_token_accuracy": 0.4381979695431472, + "step": 10987 + }, + { + "epoch": 2.0370782350760104, + "grad_norm": 7.87890625, + "learning_rate": 7.96292176492399e-06, + "loss": 3.4449, + "mean_token_accuracy": 0.4191159181754056, + "step": 10988 + }, + { + "epoch": 2.0372636262513906, + "grad_norm": 9.7890625, + "learning_rate": 7.96273637374861e-06, + "loss": 2.4293, + "mean_token_accuracy": 0.5186659346692286, + "step": 10989 + }, + { + "epoch": 2.0374490174267703, + "grad_norm": 6.27734375, + "learning_rate": 7.96255098257323e-06, + "loss": 2.8573, + "mean_token_accuracy": 0.4880298704151109, + "step": 10990 + }, + { + "epoch": 2.0376344086021505, + "grad_norm": 8.125, + "learning_rate": 7.962365591397851e-06, + "loss": 2.7946, + "mean_token_accuracy": 0.5091701936642299, + "step": 10991 + }, + { + "epoch": 2.0378197997775307, + "grad_norm": 6.30078125, + "learning_rate": 7.96218020022247e-06, + "loss": 3.0101, + "mean_token_accuracy": 0.4514452744397532, + "step": 10992 + }, + { + "epoch": 2.0380051909529104, + "grad_norm": 6.17578125, + "learning_rate": 7.96199480904709e-06, + "loss": 2.5807, + "mean_token_accuracy": 0.5050998263888888, + "step": 10993 + }, + { + "epoch": 2.0381905821282906, + "grad_norm": 6.91015625, + "learning_rate": 7.961809417871709e-06, + "loss": 3.1641, + "mean_token_accuracy": 0.42320695484182563, + "step": 10994 + }, + { + "epoch": 2.038375973303671, + "grad_norm": 6.33203125, + "learning_rate": 7.96162402669633e-06, + "loss": 2.3743, + "mean_token_accuracy": 0.5270337922403003, + "step": 10995 + }, + { + "epoch": 2.038561364479051, + "grad_norm": 6.5078125, + "learning_rate": 7.96143863552095e-06, + "loss": 2.6811, + "mean_token_accuracy": 0.5005936319481922, + "step": 10996 + }, + { + "epoch": 2.0387467556544308, + "grad_norm": 6.99609375, + "learning_rate": 7.96125324434557e-06, + "loss": 3.3177, + "mean_token_accuracy": 0.4190463540974897, + "step": 10997 + }, + { + "epoch": 2.038932146829811, + "grad_norm": 7.55859375, + "learning_rate": 7.96106785317019e-06, + "loss": 2.0541, + "mean_token_accuracy": 0.5514201762977473, + "step": 10998 + }, + { + "epoch": 2.039117538005191, + "grad_norm": 6.078125, + "learning_rate": 7.96088246199481e-06, + "loss": 2.2914, + "mean_token_accuracy": 0.5371554831957205, + "step": 10999 + }, + { + "epoch": 2.039302929180571, + "grad_norm": 7.234375, + "learning_rate": 7.96069707081943e-06, + "loss": 3.0013, + "mean_token_accuracy": 0.49369544131910764, + "step": 11000 + }, + { + "epoch": 2.039488320355951, + "grad_norm": 7.16015625, + "learning_rate": 7.960511679644049e-06, + "loss": 2.9114, + "mean_token_accuracy": 0.47163695299837927, + "step": 11001 + }, + { + "epoch": 2.0396737115313313, + "grad_norm": 6.92578125, + "learning_rate": 7.96032628846867e-06, + "loss": 2.8633, + "mean_token_accuracy": 0.47253585596582237, + "step": 11002 + }, + { + "epoch": 2.039859102706711, + "grad_norm": 6.09765625, + "learning_rate": 7.960140897293288e-06, + "loss": 2.4309, + "mean_token_accuracy": 0.5130370370370371, + "step": 11003 + }, + { + "epoch": 2.040044493882091, + "grad_norm": 7.69140625, + "learning_rate": 7.959955506117909e-06, + "loss": 3.302, + "mean_token_accuracy": 0.4816934767591039, + "step": 11004 + }, + { + "epoch": 2.0402298850574714, + "grad_norm": 7.81640625, + "learning_rate": 7.95977011494253e-06, + "loss": 3.1727, + "mean_token_accuracy": 0.44715447154471544, + "step": 11005 + }, + { + "epoch": 2.040415276232851, + "grad_norm": 8.6875, + "learning_rate": 7.95958472376715e-06, + "loss": 2.8182, + "mean_token_accuracy": 0.46822870240672354, + "step": 11006 + }, + { + "epoch": 2.0406006674082313, + "grad_norm": 9.28125, + "learning_rate": 7.95939933259177e-06, + "loss": 2.7062, + "mean_token_accuracy": 0.4748201438848921, + "step": 11007 + }, + { + "epoch": 2.0407860585836115, + "grad_norm": 9.75, + "learning_rate": 7.959213941416389e-06, + "loss": 2.7599, + "mean_token_accuracy": 0.5070969469737547, + "step": 11008 + }, + { + "epoch": 2.0409714497589917, + "grad_norm": 7.8046875, + "learning_rate": 7.95902855024101e-06, + "loss": 3.1223, + "mean_token_accuracy": 0.45673779596609176, + "step": 11009 + }, + { + "epoch": 2.0411568409343714, + "grad_norm": 7.484375, + "learning_rate": 7.958843159065628e-06, + "loss": 3.3623, + "mean_token_accuracy": 0.43852813852813854, + "step": 11010 + }, + { + "epoch": 2.0413422321097516, + "grad_norm": 8.0, + "learning_rate": 7.958657767890249e-06, + "loss": 3.0495, + "mean_token_accuracy": 0.4620418848167539, + "step": 11011 + }, + { + "epoch": 2.041527623285132, + "grad_norm": 7.11328125, + "learning_rate": 7.95847237671487e-06, + "loss": 2.6402, + "mean_token_accuracy": 0.48354555978170993, + "step": 11012 + }, + { + "epoch": 2.0417130144605116, + "grad_norm": 5.7890625, + "learning_rate": 7.958286985539488e-06, + "loss": 2.2659, + "mean_token_accuracy": 0.5388254940161425, + "step": 11013 + }, + { + "epoch": 2.0418984056358918, + "grad_norm": 6.01953125, + "learning_rate": 7.958101594364109e-06, + "loss": 2.9277, + "mean_token_accuracy": 0.4634539014704304, + "step": 11014 + }, + { + "epoch": 2.042083796811272, + "grad_norm": 8.2109375, + "learning_rate": 7.95791620318873e-06, + "loss": 3.0561, + "mean_token_accuracy": 0.4542488990376774, + "step": 11015 + }, + { + "epoch": 2.0422691879866517, + "grad_norm": 6.66796875, + "learning_rate": 7.95773081201335e-06, + "loss": 2.7688, + "mean_token_accuracy": 0.47597290058295255, + "step": 11016 + }, + { + "epoch": 2.042454579162032, + "grad_norm": 6.453125, + "learning_rate": 7.957545420837968e-06, + "loss": 2.6242, + "mean_token_accuracy": 0.4846203763789747, + "step": 11017 + }, + { + "epoch": 2.042639970337412, + "grad_norm": 6.67578125, + "learning_rate": 7.957360029662589e-06, + "loss": 3.1075, + "mean_token_accuracy": 0.4505978602894902, + "step": 11018 + }, + { + "epoch": 2.042825361512792, + "grad_norm": 7.25, + "learning_rate": 7.957174638487208e-06, + "loss": 3.3198, + "mean_token_accuracy": 0.4637937124690922, + "step": 11019 + }, + { + "epoch": 2.043010752688172, + "grad_norm": 8.359375, + "learning_rate": 7.956989247311828e-06, + "loss": 3.2756, + "mean_token_accuracy": 0.44936776113059185, + "step": 11020 + }, + { + "epoch": 2.043196143863552, + "grad_norm": 8.1015625, + "learning_rate": 7.956803856136449e-06, + "loss": 2.7053, + "mean_token_accuracy": 0.4486138336600392, + "step": 11021 + }, + { + "epoch": 2.043381535038932, + "grad_norm": 7.15625, + "learning_rate": 7.95661846496107e-06, + "loss": 2.4052, + "mean_token_accuracy": 0.5379181660669066, + "step": 11022 + }, + { + "epoch": 2.043566926214312, + "grad_norm": 8.9609375, + "learning_rate": 7.956433073785688e-06, + "loss": 2.74, + "mean_token_accuracy": 0.49881201956673654, + "step": 11023 + }, + { + "epoch": 2.0437523173896923, + "grad_norm": 8.984375, + "learning_rate": 7.956247682610309e-06, + "loss": 3.0969, + "mean_token_accuracy": 0.4695234577022534, + "step": 11024 + }, + { + "epoch": 2.0439377085650725, + "grad_norm": 9.2734375, + "learning_rate": 7.956062291434929e-06, + "loss": 2.9824, + "mean_token_accuracy": 0.4573502722323049, + "step": 11025 + }, + { + "epoch": 2.0441230997404523, + "grad_norm": 9.09375, + "learning_rate": 7.955876900259548e-06, + "loss": 2.4877, + "mean_token_accuracy": 0.4992274412855377, + "step": 11026 + }, + { + "epoch": 2.0443084909158324, + "grad_norm": 9.234375, + "learning_rate": 7.955691509084168e-06, + "loss": 2.4069, + "mean_token_accuracy": 0.518968980138362, + "step": 11027 + }, + { + "epoch": 2.0444938820912126, + "grad_norm": 10.90625, + "learning_rate": 7.955506117908787e-06, + "loss": 2.4174, + "mean_token_accuracy": 0.4935080694090523, + "step": 11028 + }, + { + "epoch": 2.0446792732665924, + "grad_norm": 8.9921875, + "learning_rate": 7.955320726733408e-06, + "loss": 2.7527, + "mean_token_accuracy": 0.47843905915894513, + "step": 11029 + }, + { + "epoch": 2.0448646644419726, + "grad_norm": 9.8125, + "learning_rate": 7.955135335558028e-06, + "loss": 3.7395, + "mean_token_accuracy": 0.42388059701492536, + "step": 11030 + }, + { + "epoch": 2.0450500556173528, + "grad_norm": 8.515625, + "learning_rate": 7.954949944382649e-06, + "loss": 3.3123, + "mean_token_accuracy": 0.45037438266687907, + "step": 11031 + }, + { + "epoch": 2.0452354467927325, + "grad_norm": 5.85546875, + "learning_rate": 7.954764553207267e-06, + "loss": 3.1424, + "mean_token_accuracy": 0.46837708830548924, + "step": 11032 + }, + { + "epoch": 2.0454208379681127, + "grad_norm": 10.734375, + "learning_rate": 7.954579162031888e-06, + "loss": 2.6922, + "mean_token_accuracy": 0.46987485485743774, + "step": 11033 + }, + { + "epoch": 2.045606229143493, + "grad_norm": 9.9453125, + "learning_rate": 7.954393770856508e-06, + "loss": 2.437, + "mean_token_accuracy": 0.5067294014662436, + "step": 11034 + }, + { + "epoch": 2.0457916203188726, + "grad_norm": 13.71875, + "learning_rate": 7.954208379681127e-06, + "loss": 2.5411, + "mean_token_accuracy": 0.512496711391739, + "step": 11035 + }, + { + "epoch": 2.045977011494253, + "grad_norm": 9.2109375, + "learning_rate": 7.954022988505748e-06, + "loss": 3.4063, + "mean_token_accuracy": 0.42857142857142855, + "step": 11036 + }, + { + "epoch": 2.046162402669633, + "grad_norm": 8.1328125, + "learning_rate": 7.953837597330366e-06, + "loss": 2.8277, + "mean_token_accuracy": 0.47345903977182696, + "step": 11037 + }, + { + "epoch": 2.0463477938450128, + "grad_norm": 6.4609375, + "learning_rate": 7.953652206154989e-06, + "loss": 3.0639, + "mean_token_accuracy": 0.46919431279620855, + "step": 11038 + }, + { + "epoch": 2.046533185020393, + "grad_norm": 7.484375, + "learning_rate": 7.953466814979607e-06, + "loss": 3.0062, + "mean_token_accuracy": 0.4790504451038576, + "step": 11039 + }, + { + "epoch": 2.046718576195773, + "grad_norm": 7.3125, + "learning_rate": 7.953281423804228e-06, + "loss": 2.9183, + "mean_token_accuracy": 0.45355721634475393, + "step": 11040 + }, + { + "epoch": 2.0469039673711533, + "grad_norm": 8.265625, + "learning_rate": 7.953096032628847e-06, + "loss": 3.0324, + "mean_token_accuracy": 0.4486007995431182, + "step": 11041 + }, + { + "epoch": 2.047089358546533, + "grad_norm": 5.76953125, + "learning_rate": 7.952910641453467e-06, + "loss": 2.2781, + "mean_token_accuracy": 0.5317887931034483, + "step": 11042 + }, + { + "epoch": 2.0472747497219133, + "grad_norm": 8.5078125, + "learning_rate": 7.952725250278088e-06, + "loss": 2.9737, + "mean_token_accuracy": 0.45422832980972516, + "step": 11043 + }, + { + "epoch": 2.0474601408972934, + "grad_norm": 10.2265625, + "learning_rate": 7.952539859102707e-06, + "loss": 2.2307, + "mean_token_accuracy": 0.5644883920894239, + "step": 11044 + }, + { + "epoch": 2.047645532072673, + "grad_norm": 7.01953125, + "learning_rate": 7.952354467927327e-06, + "loss": 2.811, + "mean_token_accuracy": 0.5130250529727035, + "step": 11045 + }, + { + "epoch": 2.0478309232480534, + "grad_norm": 9.59375, + "learning_rate": 7.952169076751947e-06, + "loss": 2.1242, + "mean_token_accuracy": 0.5831739961759083, + "step": 11046 + }, + { + "epoch": 2.0480163144234336, + "grad_norm": 11.1953125, + "learning_rate": 7.951983685576568e-06, + "loss": 2.6954, + "mean_token_accuracy": 0.48087178298168326, + "step": 11047 + }, + { + "epoch": 2.0482017055988133, + "grad_norm": 11.4921875, + "learning_rate": 7.951798294401187e-06, + "loss": 2.4739, + "mean_token_accuracy": 0.51024655779699, + "step": 11048 + }, + { + "epoch": 2.0483870967741935, + "grad_norm": 6.4375, + "learning_rate": 7.951612903225807e-06, + "loss": 3.0471, + "mean_token_accuracy": 0.4578625235404896, + "step": 11049 + }, + { + "epoch": 2.0485724879495737, + "grad_norm": 11.890625, + "learning_rate": 7.951427512050428e-06, + "loss": 2.5466, + "mean_token_accuracy": 0.4874804381846635, + "step": 11050 + }, + { + "epoch": 2.0487578791249534, + "grad_norm": 9.6640625, + "learning_rate": 7.951242120875047e-06, + "loss": 2.8876, + "mean_token_accuracy": 0.4689655172413793, + "step": 11051 + }, + { + "epoch": 2.0489432703003336, + "grad_norm": 6.61328125, + "learning_rate": 7.951056729699667e-06, + "loss": 2.5437, + "mean_token_accuracy": 0.518523795953263, + "step": 11052 + }, + { + "epoch": 2.049128661475714, + "grad_norm": 8.0, + "learning_rate": 7.950871338524286e-06, + "loss": 3.5081, + "mean_token_accuracy": 0.447575115322279, + "step": 11053 + }, + { + "epoch": 2.049314052651094, + "grad_norm": 9.3359375, + "learning_rate": 7.950685947348908e-06, + "loss": 2.3483, + "mean_token_accuracy": 0.5288818987703746, + "step": 11054 + }, + { + "epoch": 2.0494994438264738, + "grad_norm": 6.71484375, + "learning_rate": 7.950500556173527e-06, + "loss": 3.3799, + "mean_token_accuracy": 0.4470314318975553, + "step": 11055 + }, + { + "epoch": 2.049684835001854, + "grad_norm": 8.59375, + "learning_rate": 7.950315164998147e-06, + "loss": 2.9384, + "mean_token_accuracy": 0.4537899773356837, + "step": 11056 + }, + { + "epoch": 2.049870226177234, + "grad_norm": 9.015625, + "learning_rate": 7.950129773822766e-06, + "loss": 2.9968, + "mean_token_accuracy": 0.4688079061148857, + "step": 11057 + }, + { + "epoch": 2.050055617352614, + "grad_norm": 5.69140625, + "learning_rate": 7.949944382647387e-06, + "loss": 2.9191, + "mean_token_accuracy": 0.4786099460754943, + "step": 11058 + }, + { + "epoch": 2.050241008527994, + "grad_norm": 5.59375, + "learning_rate": 7.949758991472007e-06, + "loss": 2.5432, + "mean_token_accuracy": 0.5020955574182733, + "step": 11059 + }, + { + "epoch": 2.0504263997033743, + "grad_norm": 6.41796875, + "learning_rate": 7.949573600296626e-06, + "loss": 3.1121, + "mean_token_accuracy": 0.4605304212168487, + "step": 11060 + }, + { + "epoch": 2.050611790878754, + "grad_norm": 6.90625, + "learning_rate": 7.949388209121246e-06, + "loss": 3.0774, + "mean_token_accuracy": 0.464281214564697, + "step": 11061 + }, + { + "epoch": 2.050797182054134, + "grad_norm": 5.921875, + "learning_rate": 7.949202817945867e-06, + "loss": 3.0524, + "mean_token_accuracy": 0.47358024691358025, + "step": 11062 + }, + { + "epoch": 2.0509825732295144, + "grad_norm": 7.1875, + "learning_rate": 7.949017426770487e-06, + "loss": 2.6627, + "mean_token_accuracy": 0.5082650567773466, + "step": 11063 + }, + { + "epoch": 2.051167964404894, + "grad_norm": 7.7734375, + "learning_rate": 7.948832035595106e-06, + "loss": 2.2627, + "mean_token_accuracy": 0.5312225153913809, + "step": 11064 + }, + { + "epoch": 2.0513533555802743, + "grad_norm": 6.0234375, + "learning_rate": 7.948646644419727e-06, + "loss": 2.9321, + "mean_token_accuracy": 0.5119151833479234, + "step": 11065 + }, + { + "epoch": 2.0515387467556545, + "grad_norm": 6.41015625, + "learning_rate": 7.948461253244345e-06, + "loss": 2.2864, + "mean_token_accuracy": 0.553906904391528, + "step": 11066 + }, + { + "epoch": 2.0517241379310347, + "grad_norm": 7.75, + "learning_rate": 7.948275862068966e-06, + "loss": 2.5996, + "mean_token_accuracy": 0.5200213561131874, + "step": 11067 + }, + { + "epoch": 2.0519095291064144, + "grad_norm": 5.9140625, + "learning_rate": 7.948090470893586e-06, + "loss": 2.631, + "mean_token_accuracy": 0.5, + "step": 11068 + }, + { + "epoch": 2.0520949202817946, + "grad_norm": 6.3828125, + "learning_rate": 7.947905079718205e-06, + "loss": 3.0364, + "mean_token_accuracy": 0.4728038367060475, + "step": 11069 + }, + { + "epoch": 2.052280311457175, + "grad_norm": 6.83984375, + "learning_rate": 7.947719688542826e-06, + "loss": 2.7977, + "mean_token_accuracy": 0.46464539383017067, + "step": 11070 + }, + { + "epoch": 2.0524657026325546, + "grad_norm": 7.359375, + "learning_rate": 7.947534297367446e-06, + "loss": 2.7611, + "mean_token_accuracy": 0.4973718791064389, + "step": 11071 + }, + { + "epoch": 2.0526510938079348, + "grad_norm": 10.0859375, + "learning_rate": 7.947348906192067e-06, + "loss": 2.6969, + "mean_token_accuracy": 0.47619625941219335, + "step": 11072 + }, + { + "epoch": 2.052836484983315, + "grad_norm": 6.5625, + "learning_rate": 7.947163515016686e-06, + "loss": 2.6009, + "mean_token_accuracy": 0.482989403234802, + "step": 11073 + }, + { + "epoch": 2.0530218761586947, + "grad_norm": 8.0859375, + "learning_rate": 7.946978123841306e-06, + "loss": 2.8616, + "mean_token_accuracy": 0.47792734114922564, + "step": 11074 + }, + { + "epoch": 2.053207267334075, + "grad_norm": 7.2734375, + "learning_rate": 7.946792732665925e-06, + "loss": 3.3049, + "mean_token_accuracy": 0.4371144403877628, + "step": 11075 + }, + { + "epoch": 2.053392658509455, + "grad_norm": 8.0234375, + "learning_rate": 7.946607341490545e-06, + "loss": 3.1219, + "mean_token_accuracy": 0.456532877882152, + "step": 11076 + }, + { + "epoch": 2.053578049684835, + "grad_norm": 8.375, + "learning_rate": 7.946421950315166e-06, + "loss": 3.1793, + "mean_token_accuracy": 0.4536959786417487, + "step": 11077 + }, + { + "epoch": 2.053763440860215, + "grad_norm": 7.82421875, + "learning_rate": 7.946236559139786e-06, + "loss": 2.9693, + "mean_token_accuracy": 0.48003237992444686, + "step": 11078 + }, + { + "epoch": 2.053948832035595, + "grad_norm": 11.9921875, + "learning_rate": 7.946051167964405e-06, + "loss": 2.3218, + "mean_token_accuracy": 0.499479979199168, + "step": 11079 + }, + { + "epoch": 2.054134223210975, + "grad_norm": 6.72265625, + "learning_rate": 7.945865776789026e-06, + "loss": 2.3435, + "mean_token_accuracy": 0.5111767186840995, + "step": 11080 + }, + { + "epoch": 2.054319614386355, + "grad_norm": 5.72265625, + "learning_rate": 7.945680385613646e-06, + "loss": 2.9086, + "mean_token_accuracy": 0.47548048922539315, + "step": 11081 + }, + { + "epoch": 2.0545050055617353, + "grad_norm": 6.84765625, + "learning_rate": 7.945494994438265e-06, + "loss": 2.3728, + "mean_token_accuracy": 0.5271374379924132, + "step": 11082 + }, + { + "epoch": 2.0546903967371155, + "grad_norm": 7.76953125, + "learning_rate": 7.945309603262885e-06, + "loss": 3.1417, + "mean_token_accuracy": 0.47062262496346097, + "step": 11083 + }, + { + "epoch": 2.0548757879124953, + "grad_norm": 7.0859375, + "learning_rate": 7.945124212087504e-06, + "loss": 2.8168, + "mean_token_accuracy": 0.4705746329055382, + "step": 11084 + }, + { + "epoch": 2.0550611790878754, + "grad_norm": 9.6796875, + "learning_rate": 7.944938820912125e-06, + "loss": 2.2831, + "mean_token_accuracy": 0.5317373461012312, + "step": 11085 + }, + { + "epoch": 2.0552465702632556, + "grad_norm": 7.19140625, + "learning_rate": 7.944753429736745e-06, + "loss": 2.5842, + "mean_token_accuracy": 0.5192584075883875, + "step": 11086 + }, + { + "epoch": 2.0554319614386354, + "grad_norm": 6.58203125, + "learning_rate": 7.944568038561366e-06, + "loss": 2.9399, + "mean_token_accuracy": 0.4646300237886566, + "step": 11087 + }, + { + "epoch": 2.0556173526140156, + "grad_norm": 6.73828125, + "learning_rate": 7.944382647385986e-06, + "loss": 2.6766, + "mean_token_accuracy": 0.5211722179189259, + "step": 11088 + }, + { + "epoch": 2.0558027437893958, + "grad_norm": 9.140625, + "learning_rate": 7.944197256210605e-06, + "loss": 2.5845, + "mean_token_accuracy": 0.5061963775023832, + "step": 11089 + }, + { + "epoch": 2.0559881349647755, + "grad_norm": 9.0703125, + "learning_rate": 7.944011865035225e-06, + "loss": 3.0319, + "mean_token_accuracy": 0.46777003484320556, + "step": 11090 + }, + { + "epoch": 2.0561735261401557, + "grad_norm": 9.9609375, + "learning_rate": 7.943826473859844e-06, + "loss": 3.0523, + "mean_token_accuracy": 0.4605847237103934, + "step": 11091 + }, + { + "epoch": 2.056358917315536, + "grad_norm": 6.6640625, + "learning_rate": 7.943641082684465e-06, + "loss": 2.4852, + "mean_token_accuracy": 0.49620599577061825, + "step": 11092 + }, + { + "epoch": 2.0565443084909156, + "grad_norm": 7.9453125, + "learning_rate": 7.943455691509085e-06, + "loss": 3.8453, + "mean_token_accuracy": 0.41403508771929826, + "step": 11093 + }, + { + "epoch": 2.056729699666296, + "grad_norm": 10.078125, + "learning_rate": 7.943270300333706e-06, + "loss": 3.087, + "mean_token_accuracy": 0.4543714866642746, + "step": 11094 + }, + { + "epoch": 2.056915090841676, + "grad_norm": 7.703125, + "learning_rate": 7.943084909158324e-06, + "loss": 2.9585, + "mean_token_accuracy": 0.47253653936822254, + "step": 11095 + }, + { + "epoch": 2.057100482017056, + "grad_norm": 5.7109375, + "learning_rate": 7.942899517982945e-06, + "loss": 2.9384, + "mean_token_accuracy": 0.46965339791130617, + "step": 11096 + }, + { + "epoch": 2.057285873192436, + "grad_norm": 11.4609375, + "learning_rate": 7.942714126807565e-06, + "loss": 3.8138, + "mean_token_accuracy": 0.4392918483287092, + "step": 11097 + }, + { + "epoch": 2.057471264367816, + "grad_norm": 12.578125, + "learning_rate": 7.942528735632184e-06, + "loss": 2.7812, + "mean_token_accuracy": 0.47592030610898534, + "step": 11098 + }, + { + "epoch": 2.0576566555431963, + "grad_norm": 10.4375, + "learning_rate": 7.942343344456805e-06, + "loss": 2.5163, + "mean_token_accuracy": 0.5046496398166339, + "step": 11099 + }, + { + "epoch": 2.057842046718576, + "grad_norm": 6.25, + "learning_rate": 7.942157953281424e-06, + "loss": 2.691, + "mean_token_accuracy": 0.48757763975155277, + "step": 11100 + }, + { + "epoch": 2.0580274378939563, + "grad_norm": 10.0234375, + "learning_rate": 7.941972562106044e-06, + "loss": 3.0978, + "mean_token_accuracy": 0.44033816425120775, + "step": 11101 + }, + { + "epoch": 2.0582128290693364, + "grad_norm": 10.8359375, + "learning_rate": 7.941787170930665e-06, + "loss": 3.6431, + "mean_token_accuracy": 0.4451305575158786, + "step": 11102 + }, + { + "epoch": 2.058398220244716, + "grad_norm": 7.55078125, + "learning_rate": 7.941601779755285e-06, + "loss": 2.9076, + "mean_token_accuracy": 0.45856862971072954, + "step": 11103 + }, + { + "epoch": 2.0585836114200964, + "grad_norm": 6.44140625, + "learning_rate": 7.941416388579904e-06, + "loss": 3.3467, + "mean_token_accuracy": 0.4203998073217726, + "step": 11104 + }, + { + "epoch": 2.0587690025954766, + "grad_norm": 5.80859375, + "learning_rate": 7.941230997404524e-06, + "loss": 3.0493, + "mean_token_accuracy": 0.4525930445393533, + "step": 11105 + }, + { + "epoch": 2.0589543937708563, + "grad_norm": 12.96875, + "learning_rate": 7.941045606229145e-06, + "loss": 2.66, + "mean_token_accuracy": 0.4881414980570816, + "step": 11106 + }, + { + "epoch": 2.0591397849462365, + "grad_norm": 11.5, + "learning_rate": 7.940860215053764e-06, + "loss": 2.7268, + "mean_token_accuracy": 0.4617248062015504, + "step": 11107 + }, + { + "epoch": 2.0593251761216167, + "grad_norm": 7.86328125, + "learning_rate": 7.940674823878384e-06, + "loss": 2.5614, + "mean_token_accuracy": 0.4952796956460476, + "step": 11108 + }, + { + "epoch": 2.0595105672969964, + "grad_norm": 7.828125, + "learning_rate": 7.940489432703003e-06, + "loss": 2.8869, + "mean_token_accuracy": 0.465930800254723, + "step": 11109 + }, + { + "epoch": 2.0596959584723766, + "grad_norm": 7.98046875, + "learning_rate": 7.940304041527625e-06, + "loss": 2.9661, + "mean_token_accuracy": 0.47250658087159986, + "step": 11110 + }, + { + "epoch": 2.059881349647757, + "grad_norm": 8.6640625, + "learning_rate": 7.940118650352244e-06, + "loss": 2.4165, + "mean_token_accuracy": 0.5272820644498651, + "step": 11111 + }, + { + "epoch": 2.060066740823137, + "grad_norm": 7.5234375, + "learning_rate": 7.939933259176864e-06, + "loss": 3.0407, + "mean_token_accuracy": 0.4624349119761964, + "step": 11112 + }, + { + "epoch": 2.0602521319985168, + "grad_norm": 8.8515625, + "learning_rate": 7.939747868001483e-06, + "loss": 2.8667, + "mean_token_accuracy": 0.48247232472324725, + "step": 11113 + }, + { + "epoch": 2.060437523173897, + "grad_norm": 7.921875, + "learning_rate": 7.939562476826104e-06, + "loss": 3.0871, + "mean_token_accuracy": 0.46703444564047364, + "step": 11114 + }, + { + "epoch": 2.060622914349277, + "grad_norm": 7.0625, + "learning_rate": 7.939377085650724e-06, + "loss": 3.4347, + "mean_token_accuracy": 0.45660749506903353, + "step": 11115 + }, + { + "epoch": 2.060808305524657, + "grad_norm": 7.21875, + "learning_rate": 7.939191694475343e-06, + "loss": 2.4902, + "mean_token_accuracy": 0.5036619718309859, + "step": 11116 + }, + { + "epoch": 2.060993696700037, + "grad_norm": 7.515625, + "learning_rate": 7.939006303299963e-06, + "loss": 2.0226, + "mean_token_accuracy": 0.556198347107438, + "step": 11117 + }, + { + "epoch": 2.0611790878754173, + "grad_norm": 5.75390625, + "learning_rate": 7.938820912124584e-06, + "loss": 3.2231, + "mean_token_accuracy": 0.4441551679250195, + "step": 11118 + }, + { + "epoch": 2.061364479050797, + "grad_norm": 7.55078125, + "learning_rate": 7.938635520949204e-06, + "loss": 3.2792, + "mean_token_accuracy": 0.4496652465003043, + "step": 11119 + }, + { + "epoch": 2.061549870226177, + "grad_norm": 6.578125, + "learning_rate": 7.938450129773823e-06, + "loss": 2.7822, + "mean_token_accuracy": 0.4740213523131673, + "step": 11120 + }, + { + "epoch": 2.0617352614015574, + "grad_norm": 6.1875, + "learning_rate": 7.938264738598444e-06, + "loss": 2.5927, + "mean_token_accuracy": 0.5084284232365145, + "step": 11121 + }, + { + "epoch": 2.061920652576937, + "grad_norm": 8.3671875, + "learning_rate": 7.938079347423062e-06, + "loss": 3.0243, + "mean_token_accuracy": 0.4475457170356112, + "step": 11122 + }, + { + "epoch": 2.0621060437523173, + "grad_norm": 7.71875, + "learning_rate": 7.937893956247683e-06, + "loss": 2.595, + "mean_token_accuracy": 0.507460126907906, + "step": 11123 + }, + { + "epoch": 2.0622914349276975, + "grad_norm": 6.87890625, + "learning_rate": 7.937708565072303e-06, + "loss": 3.0396, + "mean_token_accuracy": 0.4526813880126183, + "step": 11124 + }, + { + "epoch": 2.0624768261030777, + "grad_norm": 6.77734375, + "learning_rate": 7.937523173896922e-06, + "loss": 2.7191, + "mean_token_accuracy": 0.5110604638691514, + "step": 11125 + }, + { + "epoch": 2.0626622172784574, + "grad_norm": 8.1015625, + "learning_rate": 7.937337782721544e-06, + "loss": 2.9408, + "mean_token_accuracy": 0.4760226882090065, + "step": 11126 + }, + { + "epoch": 2.0628476084538376, + "grad_norm": 5.93359375, + "learning_rate": 7.937152391546163e-06, + "loss": 2.7256, + "mean_token_accuracy": 0.49726231956197114, + "step": 11127 + }, + { + "epoch": 2.063032999629218, + "grad_norm": 7.56640625, + "learning_rate": 7.936967000370784e-06, + "loss": 3.4549, + "mean_token_accuracy": 0.4217378141083863, + "step": 11128 + }, + { + "epoch": 2.0632183908045976, + "grad_norm": 7.51171875, + "learning_rate": 7.936781609195403e-06, + "loss": 3.0561, + "mean_token_accuracy": 0.4519373279695109, + "step": 11129 + }, + { + "epoch": 2.0634037819799778, + "grad_norm": 6.0234375, + "learning_rate": 7.936596218020023e-06, + "loss": 2.4912, + "mean_token_accuracy": 0.5134871628209294, + "step": 11130 + }, + { + "epoch": 2.063589173155358, + "grad_norm": 6.94140625, + "learning_rate": 7.936410826844644e-06, + "loss": 1.9724, + "mean_token_accuracy": 0.5705767984445884, + "step": 11131 + }, + { + "epoch": 2.0637745643307377, + "grad_norm": 5.2734375, + "learning_rate": 7.936225435669262e-06, + "loss": 2.3906, + "mean_token_accuracy": 0.5038128068526063, + "step": 11132 + }, + { + "epoch": 2.063959955506118, + "grad_norm": 8.2578125, + "learning_rate": 7.936040044493883e-06, + "loss": 3.2315, + "mean_token_accuracy": 0.45629232950070875, + "step": 11133 + }, + { + "epoch": 2.064145346681498, + "grad_norm": 6.609375, + "learning_rate": 7.935854653318503e-06, + "loss": 2.7083, + "mean_token_accuracy": 0.4942213233169604, + "step": 11134 + }, + { + "epoch": 2.064330737856878, + "grad_norm": 6.5546875, + "learning_rate": 7.935669262143124e-06, + "loss": 1.9959, + "mean_token_accuracy": 0.5652116576552816, + "step": 11135 + }, + { + "epoch": 2.064516129032258, + "grad_norm": 7.484375, + "learning_rate": 7.935483870967743e-06, + "loss": 2.6744, + "mean_token_accuracy": 0.5312447078746825, + "step": 11136 + }, + { + "epoch": 2.064701520207638, + "grad_norm": 9.140625, + "learning_rate": 7.935298479792363e-06, + "loss": 2.6932, + "mean_token_accuracy": 0.47133838383838383, + "step": 11137 + }, + { + "epoch": 2.0648869113830184, + "grad_norm": 7.484375, + "learning_rate": 7.935113088616982e-06, + "loss": 2.9929, + "mean_token_accuracy": 0.45969983324068925, + "step": 11138 + }, + { + "epoch": 2.065072302558398, + "grad_norm": 7.10546875, + "learning_rate": 7.934927697441602e-06, + "loss": 3.8653, + "mean_token_accuracy": 0.4068852073786864, + "step": 11139 + }, + { + "epoch": 2.0652576937337783, + "grad_norm": 7.00390625, + "learning_rate": 7.934742306266223e-06, + "loss": 2.9088, + "mean_token_accuracy": 0.48401761613865607, + "step": 11140 + }, + { + "epoch": 2.0654430849091585, + "grad_norm": 8.5546875, + "learning_rate": 7.934556915090842e-06, + "loss": 2.5674, + "mean_token_accuracy": 0.49857217030114226, + "step": 11141 + }, + { + "epoch": 2.0656284760845383, + "grad_norm": 8.6328125, + "learning_rate": 7.934371523915462e-06, + "loss": 3.1848, + "mean_token_accuracy": 0.4727343547130615, + "step": 11142 + }, + { + "epoch": 2.0658138672599184, + "grad_norm": 6.6796875, + "learning_rate": 7.934186132740083e-06, + "loss": 2.7549, + "mean_token_accuracy": 0.49181698721777567, + "step": 11143 + }, + { + "epoch": 2.0659992584352986, + "grad_norm": 9.65625, + "learning_rate": 7.934000741564703e-06, + "loss": 2.7603, + "mean_token_accuracy": 0.47869609856262835, + "step": 11144 + }, + { + "epoch": 2.0661846496106784, + "grad_norm": 6.921875, + "learning_rate": 7.933815350389322e-06, + "loss": 3.2121, + "mean_token_accuracy": 0.4722010662604722, + "step": 11145 + }, + { + "epoch": 2.0663700407860586, + "grad_norm": 6.55078125, + "learning_rate": 7.933629959213942e-06, + "loss": 2.6413, + "mean_token_accuracy": 0.5013069997095556, + "step": 11146 + }, + { + "epoch": 2.0665554319614388, + "grad_norm": 5.91796875, + "learning_rate": 7.933444568038561e-06, + "loss": 2.9234, + "mean_token_accuracy": 0.4948948948948949, + "step": 11147 + }, + { + "epoch": 2.0667408231368185, + "grad_norm": 5.79296875, + "learning_rate": 7.933259176863182e-06, + "loss": 3.265, + "mean_token_accuracy": 0.44822739340933937, + "step": 11148 + }, + { + "epoch": 2.0669262143121987, + "grad_norm": 5.36328125, + "learning_rate": 7.933073785687802e-06, + "loss": 2.6778, + "mean_token_accuracy": 0.5074057939446743, + "step": 11149 + }, + { + "epoch": 2.067111605487579, + "grad_norm": 6.609375, + "learning_rate": 7.932888394512421e-06, + "loss": 2.8743, + "mean_token_accuracy": 0.48020571701949527, + "step": 11150 + }, + { + "epoch": 2.0672969966629586, + "grad_norm": 6.2890625, + "learning_rate": 7.932703003337041e-06, + "loss": 2.6268, + "mean_token_accuracy": 0.502021018593371, + "step": 11151 + }, + { + "epoch": 2.067482387838339, + "grad_norm": 6.046875, + "learning_rate": 7.932517612161662e-06, + "loss": 2.8148, + "mean_token_accuracy": 0.49466728495246926, + "step": 11152 + }, + { + "epoch": 2.067667779013719, + "grad_norm": 6.6171875, + "learning_rate": 7.932332220986282e-06, + "loss": 2.6459, + "mean_token_accuracy": 0.504446871586831, + "step": 11153 + }, + { + "epoch": 2.067853170189099, + "grad_norm": 6.234375, + "learning_rate": 7.932146829810901e-06, + "loss": 2.7008, + "mean_token_accuracy": 0.47444108761329307, + "step": 11154 + }, + { + "epoch": 2.068038561364479, + "grad_norm": 6.41796875, + "learning_rate": 7.931961438635522e-06, + "loss": 3.479, + "mean_token_accuracy": 0.40685191032522894, + "step": 11155 + }, + { + "epoch": 2.068223952539859, + "grad_norm": 6.44921875, + "learning_rate": 7.93177604746014e-06, + "loss": 2.2386, + "mean_token_accuracy": 0.5574893791969302, + "step": 11156 + }, + { + "epoch": 2.0684093437152393, + "grad_norm": 6.07421875, + "learning_rate": 7.931590656284761e-06, + "loss": 2.4362, + "mean_token_accuracy": 0.5046847888953152, + "step": 11157 + }, + { + "epoch": 2.068594734890619, + "grad_norm": 9.1640625, + "learning_rate": 7.931405265109382e-06, + "loss": 3.2901, + "mean_token_accuracy": 0.44636927963043166, + "step": 11158 + }, + { + "epoch": 2.0687801260659993, + "grad_norm": 7.5859375, + "learning_rate": 7.931219873934002e-06, + "loss": 2.1656, + "mean_token_accuracy": 0.5534294234592445, + "step": 11159 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 7.265625, + "learning_rate": 7.93103448275862e-06, + "loss": 2.6748, + "mean_token_accuracy": 0.47854077253218885, + "step": 11160 + }, + { + "epoch": 2.069150908416759, + "grad_norm": 6.68359375, + "learning_rate": 7.930849091583241e-06, + "loss": 2.7176, + "mean_token_accuracy": 0.4877137519758586, + "step": 11161 + }, + { + "epoch": 2.0693362995921394, + "grad_norm": 7.328125, + "learning_rate": 7.930663700407862e-06, + "loss": 2.7989, + "mean_token_accuracy": 0.4841511072514112, + "step": 11162 + }, + { + "epoch": 2.0695216907675196, + "grad_norm": 7.1796875, + "learning_rate": 7.93047830923248e-06, + "loss": 2.3613, + "mean_token_accuracy": 0.5160154284755996, + "step": 11163 + }, + { + "epoch": 2.0697070819428993, + "grad_norm": 6.21875, + "learning_rate": 7.930292918057101e-06, + "loss": 2.7173, + "mean_token_accuracy": 0.502951149064423, + "step": 11164 + }, + { + "epoch": 2.0698924731182795, + "grad_norm": 6.64453125, + "learning_rate": 7.93010752688172e-06, + "loss": 2.7911, + "mean_token_accuracy": 0.49530878028287356, + "step": 11165 + }, + { + "epoch": 2.0700778642936597, + "grad_norm": 6.46875, + "learning_rate": 7.92992213570634e-06, + "loss": 2.3694, + "mean_token_accuracy": 0.5239942369500167, + "step": 11166 + }, + { + "epoch": 2.07026325546904, + "grad_norm": 10.5859375, + "learning_rate": 7.929736744530961e-06, + "loss": 2.4885, + "mean_token_accuracy": 0.5001112594570538, + "step": 11167 + }, + { + "epoch": 2.0704486466444196, + "grad_norm": 11.4375, + "learning_rate": 7.929551353355581e-06, + "loss": 2.3009, + "mean_token_accuracy": 0.5181575224021381, + "step": 11168 + }, + { + "epoch": 2.0706340378198, + "grad_norm": 5.9609375, + "learning_rate": 7.929365962180202e-06, + "loss": 2.7442, + "mean_token_accuracy": 0.5027252324462969, + "step": 11169 + }, + { + "epoch": 2.07081942899518, + "grad_norm": 7.8046875, + "learning_rate": 7.92918057100482e-06, + "loss": 2.4878, + "mean_token_accuracy": 0.49542374895994296, + "step": 11170 + }, + { + "epoch": 2.0710048201705598, + "grad_norm": 8.390625, + "learning_rate": 7.928995179829441e-06, + "loss": 3.2973, + "mean_token_accuracy": 0.4512610088070456, + "step": 11171 + }, + { + "epoch": 2.07119021134594, + "grad_norm": 6.125, + "learning_rate": 7.92880978865406e-06, + "loss": 2.8435, + "mean_token_accuracy": 0.4988403963736032, + "step": 11172 + }, + { + "epoch": 2.07137560252132, + "grad_norm": 5.90625, + "learning_rate": 7.92862439747868e-06, + "loss": 2.7373, + "mean_token_accuracy": 0.4900874635568513, + "step": 11173 + }, + { + "epoch": 2.0715609936967, + "grad_norm": 8.28125, + "learning_rate": 7.928439006303301e-06, + "loss": 3.2347, + "mean_token_accuracy": 0.45322410147991543, + "step": 11174 + }, + { + "epoch": 2.07174638487208, + "grad_norm": 7.19140625, + "learning_rate": 7.928253615127921e-06, + "loss": 3.3862, + "mean_token_accuracy": 0.4428425804572594, + "step": 11175 + }, + { + "epoch": 2.0719317760474603, + "grad_norm": 8.453125, + "learning_rate": 7.92806822395254e-06, + "loss": 2.1702, + "mean_token_accuracy": 0.5416078984485191, + "step": 11176 + }, + { + "epoch": 2.07211716722284, + "grad_norm": 9.796875, + "learning_rate": 7.92788283277716e-06, + "loss": 3.212, + "mean_token_accuracy": 0.45143620574482296, + "step": 11177 + }, + { + "epoch": 2.07230255839822, + "grad_norm": 6.73046875, + "learning_rate": 7.927697441601781e-06, + "loss": 2.714, + "mean_token_accuracy": 0.48797385620915035, + "step": 11178 + }, + { + "epoch": 2.0724879495736004, + "grad_norm": 8.4140625, + "learning_rate": 7.9275120504264e-06, + "loss": 2.9505, + "mean_token_accuracy": 0.46327052060044716, + "step": 11179 + }, + { + "epoch": 2.07267334074898, + "grad_norm": 10.2421875, + "learning_rate": 7.92732665925102e-06, + "loss": 2.5954, + "mean_token_accuracy": 0.4898520361398455, + "step": 11180 + }, + { + "epoch": 2.0728587319243603, + "grad_norm": 8.5234375, + "learning_rate": 7.92714126807564e-06, + "loss": 2.7867, + "mean_token_accuracy": 0.4818530539982296, + "step": 11181 + }, + { + "epoch": 2.0730441230997405, + "grad_norm": 6.046875, + "learning_rate": 7.92695587690026e-06, + "loss": 2.9596, + "mean_token_accuracy": 0.47026022304832715, + "step": 11182 + }, + { + "epoch": 2.0732295142751207, + "grad_norm": 10.1953125, + "learning_rate": 7.92677048572488e-06, + "loss": 2.9316, + "mean_token_accuracy": 0.45340201245807377, + "step": 11183 + }, + { + "epoch": 2.0734149054505004, + "grad_norm": 6.33203125, + "learning_rate": 7.9265850945495e-06, + "loss": 2.7745, + "mean_token_accuracy": 0.478374672815655, + "step": 11184 + }, + { + "epoch": 2.0736002966258806, + "grad_norm": 5.5859375, + "learning_rate": 7.92639970337412e-06, + "loss": 2.1262, + "mean_token_accuracy": 0.5523270440251572, + "step": 11185 + }, + { + "epoch": 2.073785687801261, + "grad_norm": 6.9609375, + "learning_rate": 7.92621431219874e-06, + "loss": 2.8001, + "mean_token_accuracy": 0.49698228950232515, + "step": 11186 + }, + { + "epoch": 2.0739710789766406, + "grad_norm": 7.1328125, + "learning_rate": 7.92602892102336e-06, + "loss": 3.0336, + "mean_token_accuracy": 0.4376704111680663, + "step": 11187 + }, + { + "epoch": 2.0741564701520208, + "grad_norm": 6.12109375, + "learning_rate": 7.92584352984798e-06, + "loss": 3.0557, + "mean_token_accuracy": 0.4730881494454174, + "step": 11188 + }, + { + "epoch": 2.074341861327401, + "grad_norm": 6.48046875, + "learning_rate": 7.9256581386726e-06, + "loss": 3.4424, + "mean_token_accuracy": 0.4170359428852782, + "step": 11189 + }, + { + "epoch": 2.0745272525027807, + "grad_norm": 5.6875, + "learning_rate": 7.925472747497219e-06, + "loss": 2.6061, + "mean_token_accuracy": 0.49539447336804165, + "step": 11190 + }, + { + "epoch": 2.074712643678161, + "grad_norm": 7.390625, + "learning_rate": 7.92528735632184e-06, + "loss": 2.8948, + "mean_token_accuracy": 0.5060207224866984, + "step": 11191 + }, + { + "epoch": 2.074898034853541, + "grad_norm": 7.875, + "learning_rate": 7.92510196514646e-06, + "loss": 2.8509, + "mean_token_accuracy": 0.47306075659151703, + "step": 11192 + }, + { + "epoch": 2.075083426028921, + "grad_norm": 9.9765625, + "learning_rate": 7.92491657397108e-06, + "loss": 2.7732, + "mean_token_accuracy": 0.4957157784743992, + "step": 11193 + }, + { + "epoch": 2.075268817204301, + "grad_norm": 9.6796875, + "learning_rate": 7.924731182795699e-06, + "loss": 3.0266, + "mean_token_accuracy": 0.49566587864460204, + "step": 11194 + }, + { + "epoch": 2.075454208379681, + "grad_norm": 7.484375, + "learning_rate": 7.92454579162032e-06, + "loss": 2.7491, + "mean_token_accuracy": 0.480374464245432, + "step": 11195 + }, + { + "epoch": 2.0756395995550614, + "grad_norm": 6.92578125, + "learning_rate": 7.92436040044494e-06, + "loss": 2.4174, + "mean_token_accuracy": 0.5198670254867817, + "step": 11196 + }, + { + "epoch": 2.075824990730441, + "grad_norm": 6.46875, + "learning_rate": 7.924175009269559e-06, + "loss": 2.4569, + "mean_token_accuracy": 0.5028197198471894, + "step": 11197 + }, + { + "epoch": 2.0760103819058213, + "grad_norm": 5.87109375, + "learning_rate": 7.92398961809418e-06, + "loss": 2.3831, + "mean_token_accuracy": 0.5193777292576419, + "step": 11198 + }, + { + "epoch": 2.0761957730812015, + "grad_norm": 7.3828125, + "learning_rate": 7.9238042269188e-06, + "loss": 2.8468, + "mean_token_accuracy": 0.45149497323410365, + "step": 11199 + }, + { + "epoch": 2.0763811642565813, + "grad_norm": 6.328125, + "learning_rate": 7.92361883574342e-06, + "loss": 2.4394, + "mean_token_accuracy": 0.5022538897775193, + "step": 11200 + }, + { + "epoch": 2.0765665554319614, + "grad_norm": 6.96484375, + "learning_rate": 7.923433444568039e-06, + "loss": 3.0919, + "mean_token_accuracy": 0.46627349797272394, + "step": 11201 + }, + { + "epoch": 2.0767519466073416, + "grad_norm": 6.72265625, + "learning_rate": 7.92324805339266e-06, + "loss": 3.1912, + "mean_token_accuracy": 0.44732724902216425, + "step": 11202 + }, + { + "epoch": 2.0769373377827214, + "grad_norm": 6.0703125, + "learning_rate": 7.923062662217278e-06, + "loss": 2.7846, + "mean_token_accuracy": 0.4866810655147588, + "step": 11203 + }, + { + "epoch": 2.0771227289581016, + "grad_norm": 5.61328125, + "learning_rate": 7.922877271041899e-06, + "loss": 2.7956, + "mean_token_accuracy": 0.4658833522083805, + "step": 11204 + }, + { + "epoch": 2.0773081201334818, + "grad_norm": 12.671875, + "learning_rate": 7.92269187986652e-06, + "loss": 3.1031, + "mean_token_accuracy": 0.4868867633446467, + "step": 11205 + }, + { + "epoch": 2.0774935113088615, + "grad_norm": 7.34375, + "learning_rate": 7.922506488691138e-06, + "loss": 3.2396, + "mean_token_accuracy": 0.4571019398515818, + "step": 11206 + }, + { + "epoch": 2.0776789024842417, + "grad_norm": 7.5390625, + "learning_rate": 7.92232109751576e-06, + "loss": 2.8835, + "mean_token_accuracy": 0.48490749756572543, + "step": 11207 + }, + { + "epoch": 2.077864293659622, + "grad_norm": 7.2578125, + "learning_rate": 7.922135706340379e-06, + "loss": 3.9074, + "mean_token_accuracy": 0.41481151677055506, + "step": 11208 + }, + { + "epoch": 2.078049684835002, + "grad_norm": 5.72265625, + "learning_rate": 7.921950315165e-06, + "loss": 2.8236, + "mean_token_accuracy": 0.47912110798708735, + "step": 11209 + }, + { + "epoch": 2.078235076010382, + "grad_norm": 7.48828125, + "learning_rate": 7.921764923989618e-06, + "loss": 4.0121, + "mean_token_accuracy": 0.41961356179365655, + "step": 11210 + }, + { + "epoch": 2.078420467185762, + "grad_norm": 8.125, + "learning_rate": 7.921579532814239e-06, + "loss": 2.7882, + "mean_token_accuracy": 0.4696940542620743, + "step": 11211 + }, + { + "epoch": 2.078605858361142, + "grad_norm": 8.7578125, + "learning_rate": 7.92139414163886e-06, + "loss": 2.6548, + "mean_token_accuracy": 0.5006422982599557, + "step": 11212 + }, + { + "epoch": 2.078791249536522, + "grad_norm": 8.90625, + "learning_rate": 7.921208750463478e-06, + "loss": 2.8819, + "mean_token_accuracy": 0.4953345774460144, + "step": 11213 + }, + { + "epoch": 2.078976640711902, + "grad_norm": 7.8203125, + "learning_rate": 7.921023359288099e-06, + "loss": 3.2911, + "mean_token_accuracy": 0.44637173774665817, + "step": 11214 + }, + { + "epoch": 2.0791620318872823, + "grad_norm": 7.39453125, + "learning_rate": 7.920837968112719e-06, + "loss": 2.6889, + "mean_token_accuracy": 0.4697923956146489, + "step": 11215 + }, + { + "epoch": 2.079347423062662, + "grad_norm": 8.234375, + "learning_rate": 7.92065257693734e-06, + "loss": 2.763, + "mean_token_accuracy": 0.48798328108672934, + "step": 11216 + }, + { + "epoch": 2.0795328142380423, + "grad_norm": 7.11328125, + "learning_rate": 7.920467185761958e-06, + "loss": 2.6089, + "mean_token_accuracy": 0.5180096051227321, + "step": 11217 + }, + { + "epoch": 2.0797182054134224, + "grad_norm": 9.3984375, + "learning_rate": 7.920281794586579e-06, + "loss": 2.8342, + "mean_token_accuracy": 0.46156077748767044, + "step": 11218 + }, + { + "epoch": 2.079903596588802, + "grad_norm": 7.01171875, + "learning_rate": 7.920096403411198e-06, + "loss": 3.4359, + "mean_token_accuracy": 0.45224513675588707, + "step": 11219 + }, + { + "epoch": 2.0800889877641824, + "grad_norm": 8.1640625, + "learning_rate": 7.919911012235818e-06, + "loss": 2.6304, + "mean_token_accuracy": 0.5109052883178966, + "step": 11220 + }, + { + "epoch": 2.0802743789395626, + "grad_norm": 10.7421875, + "learning_rate": 7.919725621060439e-06, + "loss": 2.4993, + "mean_token_accuracy": 0.5180995475113123, + "step": 11221 + }, + { + "epoch": 2.0804597701149423, + "grad_norm": 7.23046875, + "learning_rate": 7.919540229885057e-06, + "loss": 2.5783, + "mean_token_accuracy": 0.4964309614097702, + "step": 11222 + }, + { + "epoch": 2.0806451612903225, + "grad_norm": 7.44140625, + "learning_rate": 7.919354838709678e-06, + "loss": 3.1658, + "mean_token_accuracy": 0.4681544028950543, + "step": 11223 + }, + { + "epoch": 2.0808305524657027, + "grad_norm": 10.671875, + "learning_rate": 7.919169447534298e-06, + "loss": 2.4823, + "mean_token_accuracy": 0.526896551724138, + "step": 11224 + }, + { + "epoch": 2.081015943641083, + "grad_norm": 9.40625, + "learning_rate": 7.918984056358919e-06, + "loss": 2.181, + "mean_token_accuracy": 0.5634290662433251, + "step": 11225 + }, + { + "epoch": 2.0812013348164626, + "grad_norm": 6.34375, + "learning_rate": 7.918798665183538e-06, + "loss": 2.4768, + "mean_token_accuracy": 0.5597649918962723, + "step": 11226 + }, + { + "epoch": 2.081386725991843, + "grad_norm": 10.390625, + "learning_rate": 7.918613274008158e-06, + "loss": 2.8365, + "mean_token_accuracy": 0.46663706006439437, + "step": 11227 + }, + { + "epoch": 2.081572117167223, + "grad_norm": 14.5234375, + "learning_rate": 7.918427882832777e-06, + "loss": 2.4348, + "mean_token_accuracy": 0.5061224489795918, + "step": 11228 + }, + { + "epoch": 2.0817575083426028, + "grad_norm": 12.9296875, + "learning_rate": 7.918242491657397e-06, + "loss": 3.4438, + "mean_token_accuracy": 0.4461184588844163, + "step": 11229 + }, + { + "epoch": 2.081942899517983, + "grad_norm": 7.12890625, + "learning_rate": 7.918057100482018e-06, + "loss": 2.5604, + "mean_token_accuracy": 0.4986726281961716, + "step": 11230 + }, + { + "epoch": 2.082128290693363, + "grad_norm": 11.8828125, + "learning_rate": 7.917871709306638e-06, + "loss": 3.311, + "mean_token_accuracy": 0.4283367556468172, + "step": 11231 + }, + { + "epoch": 2.082313681868743, + "grad_norm": 21.453125, + "learning_rate": 7.917686318131257e-06, + "loss": 2.5196, + "mean_token_accuracy": 0.492292600896861, + "step": 11232 + }, + { + "epoch": 2.082499073044123, + "grad_norm": 15.640625, + "learning_rate": 7.917500926955878e-06, + "loss": 2.764, + "mean_token_accuracy": 0.48109460055907016, + "step": 11233 + }, + { + "epoch": 2.0826844642195033, + "grad_norm": 7.92578125, + "learning_rate": 7.917315535780498e-06, + "loss": 3.0109, + "mean_token_accuracy": 0.4821953609931395, + "step": 11234 + }, + { + "epoch": 2.082869855394883, + "grad_norm": 7.90234375, + "learning_rate": 7.917130144605117e-06, + "loss": 2.7545, + "mean_token_accuracy": 0.47784679089026916, + "step": 11235 + }, + { + "epoch": 2.083055246570263, + "grad_norm": 11.640625, + "learning_rate": 7.916944753429737e-06, + "loss": 3.0109, + "mean_token_accuracy": 0.4734542565867867, + "step": 11236 + }, + { + "epoch": 2.0832406377456434, + "grad_norm": 13.6875, + "learning_rate": 7.916759362254356e-06, + "loss": 2.5166, + "mean_token_accuracy": 0.5026559604694256, + "step": 11237 + }, + { + "epoch": 2.083426028921023, + "grad_norm": 6.29296875, + "learning_rate": 7.916573971078977e-06, + "loss": 2.1478, + "mean_token_accuracy": 0.5757575757575758, + "step": 11238 + }, + { + "epoch": 2.0836114200964033, + "grad_norm": 6.36328125, + "learning_rate": 7.916388579903597e-06, + "loss": 2.5818, + "mean_token_accuracy": 0.4994792441600952, + "step": 11239 + }, + { + "epoch": 2.0837968112717835, + "grad_norm": 8.640625, + "learning_rate": 7.916203188728218e-06, + "loss": 3.3312, + "mean_token_accuracy": 0.4643041237113402, + "step": 11240 + }, + { + "epoch": 2.0839822024471637, + "grad_norm": 6.6328125, + "learning_rate": 7.916017797552837e-06, + "loss": 2.5959, + "mean_token_accuracy": 0.5050780282387912, + "step": 11241 + }, + { + "epoch": 2.0841675936225434, + "grad_norm": 6.625, + "learning_rate": 7.915832406377457e-06, + "loss": 2.9855, + "mean_token_accuracy": 0.4720715835140998, + "step": 11242 + }, + { + "epoch": 2.0843529847979236, + "grad_norm": 6.625, + "learning_rate": 7.915647015202078e-06, + "loss": 2.9797, + "mean_token_accuracy": 0.4456854141736471, + "step": 11243 + }, + { + "epoch": 2.084538375973304, + "grad_norm": 7.15625, + "learning_rate": 7.915461624026696e-06, + "loss": 3.0983, + "mean_token_accuracy": 0.4613572101790763, + "step": 11244 + }, + { + "epoch": 2.0847237671486836, + "grad_norm": 6.328125, + "learning_rate": 7.915276232851317e-06, + "loss": 3.2398, + "mean_token_accuracy": 0.4519333096842852, + "step": 11245 + }, + { + "epoch": 2.0849091583240638, + "grad_norm": 7.32421875, + "learning_rate": 7.915090841675936e-06, + "loss": 2.4483, + "mean_token_accuracy": 0.5056314761976273, + "step": 11246 + }, + { + "epoch": 2.085094549499444, + "grad_norm": 6.5703125, + "learning_rate": 7.914905450500558e-06, + "loss": 2.7771, + "mean_token_accuracy": 0.4961982540129541, + "step": 11247 + }, + { + "epoch": 2.0852799406748237, + "grad_norm": 6.61328125, + "learning_rate": 7.914720059325177e-06, + "loss": 2.4718, + "mean_token_accuracy": 0.5181505540695452, + "step": 11248 + }, + { + "epoch": 2.085465331850204, + "grad_norm": 6.1328125, + "learning_rate": 7.914534668149797e-06, + "loss": 1.9649, + "mean_token_accuracy": 0.5939055481364575, + "step": 11249 + }, + { + "epoch": 2.085650723025584, + "grad_norm": 6.12109375, + "learning_rate": 7.914349276974418e-06, + "loss": 3.2338, + "mean_token_accuracy": 0.45025088728429813, + "step": 11250 + }, + { + "epoch": 2.085836114200964, + "grad_norm": 6.359375, + "learning_rate": 7.914163885799036e-06, + "loss": 3.312, + "mean_token_accuracy": 0.4452756996397894, + "step": 11251 + }, + { + "epoch": 2.086021505376344, + "grad_norm": 8.234375, + "learning_rate": 7.913978494623657e-06, + "loss": 3.0548, + "mean_token_accuracy": 0.48682232688646154, + "step": 11252 + }, + { + "epoch": 2.086206896551724, + "grad_norm": 6.4375, + "learning_rate": 7.913793103448276e-06, + "loss": 2.8616, + "mean_token_accuracy": 0.46505700871898054, + "step": 11253 + }, + { + "epoch": 2.0863922877271044, + "grad_norm": 6.1640625, + "learning_rate": 7.913607712272896e-06, + "loss": 2.7472, + "mean_token_accuracy": 0.48307846578089747, + "step": 11254 + }, + { + "epoch": 2.086577678902484, + "grad_norm": 7.11328125, + "learning_rate": 7.913422321097517e-06, + "loss": 2.7216, + "mean_token_accuracy": 0.4607990012484395, + "step": 11255 + }, + { + "epoch": 2.0867630700778643, + "grad_norm": 6.79296875, + "learning_rate": 7.913236929922137e-06, + "loss": 2.6188, + "mean_token_accuracy": 0.4904751232631107, + "step": 11256 + }, + { + "epoch": 2.0869484612532445, + "grad_norm": 7.4609375, + "learning_rate": 7.913051538746756e-06, + "loss": 2.614, + "mean_token_accuracy": 0.505157201441531, + "step": 11257 + }, + { + "epoch": 2.0871338524286243, + "grad_norm": 8.484375, + "learning_rate": 7.912866147571376e-06, + "loss": 3.0615, + "mean_token_accuracy": 0.47309197651663404, + "step": 11258 + }, + { + "epoch": 2.0873192436040044, + "grad_norm": 8.5078125, + "learning_rate": 7.912680756395997e-06, + "loss": 3.0495, + "mean_token_accuracy": 0.4587781731909846, + "step": 11259 + }, + { + "epoch": 2.0875046347793846, + "grad_norm": 8.6953125, + "learning_rate": 7.912495365220616e-06, + "loss": 3.632, + "mean_token_accuracy": 0.43979416809605487, + "step": 11260 + }, + { + "epoch": 2.0876900259547644, + "grad_norm": 8.1171875, + "learning_rate": 7.912309974045236e-06, + "loss": 2.7262, + "mean_token_accuracy": 0.48568235787121106, + "step": 11261 + }, + { + "epoch": 2.0878754171301446, + "grad_norm": 9.921875, + "learning_rate": 7.912124582869855e-06, + "loss": 2.9473, + "mean_token_accuracy": 0.4910102186004398, + "step": 11262 + }, + { + "epoch": 2.0880608083055248, + "grad_norm": 7.30859375, + "learning_rate": 7.911939191694476e-06, + "loss": 2.6281, + "mean_token_accuracy": 0.48457099849473156, + "step": 11263 + }, + { + "epoch": 2.0882461994809045, + "grad_norm": 7.328125, + "learning_rate": 7.911753800519096e-06, + "loss": 3.1874, + "mean_token_accuracy": 0.43985952589991223, + "step": 11264 + }, + { + "epoch": 2.0884315906562847, + "grad_norm": 8.53125, + "learning_rate": 7.911568409343716e-06, + "loss": 2.6595, + "mean_token_accuracy": 0.48931855056787454, + "step": 11265 + }, + { + "epoch": 2.088616981831665, + "grad_norm": 10.4140625, + "learning_rate": 7.911383018168335e-06, + "loss": 2.9954, + "mean_token_accuracy": 0.46571508536244055, + "step": 11266 + }, + { + "epoch": 2.088802373007045, + "grad_norm": 6.71875, + "learning_rate": 7.911197626992956e-06, + "loss": 2.8687, + "mean_token_accuracy": 0.47425258610890075, + "step": 11267 + }, + { + "epoch": 2.088987764182425, + "grad_norm": 6.921875, + "learning_rate": 7.911012235817576e-06, + "loss": 2.9902, + "mean_token_accuracy": 0.4499319727891156, + "step": 11268 + }, + { + "epoch": 2.089173155357805, + "grad_norm": 8.15625, + "learning_rate": 7.910826844642195e-06, + "loss": 2.8762, + "mean_token_accuracy": 0.5055301755229622, + "step": 11269 + }, + { + "epoch": 2.089358546533185, + "grad_norm": 6.67578125, + "learning_rate": 7.910641453466816e-06, + "loss": 3.0175, + "mean_token_accuracy": 0.4822316986496091, + "step": 11270 + }, + { + "epoch": 2.089543937708565, + "grad_norm": 6.35546875, + "learning_rate": 7.910456062291434e-06, + "loss": 2.8184, + "mean_token_accuracy": 0.4807247494217425, + "step": 11271 + }, + { + "epoch": 2.089729328883945, + "grad_norm": 6.97265625, + "learning_rate": 7.910270671116057e-06, + "loss": 3.1166, + "mean_token_accuracy": 0.4668491105858938, + "step": 11272 + }, + { + "epoch": 2.0899147200593253, + "grad_norm": 6.7265625, + "learning_rate": 7.910085279940675e-06, + "loss": 2.888, + "mean_token_accuracy": 0.4923469387755102, + "step": 11273 + }, + { + "epoch": 2.090100111234705, + "grad_norm": 5.83203125, + "learning_rate": 7.909899888765296e-06, + "loss": 2.6623, + "mean_token_accuracy": 0.48323605266730013, + "step": 11274 + }, + { + "epoch": 2.0902855024100853, + "grad_norm": 6.81640625, + "learning_rate": 7.909714497589915e-06, + "loss": 3.0822, + "mean_token_accuracy": 0.45037868895272914, + "step": 11275 + }, + { + "epoch": 2.0904708935854655, + "grad_norm": 6.12109375, + "learning_rate": 7.909529106414535e-06, + "loss": 2.3874, + "mean_token_accuracy": 0.5391040242976461, + "step": 11276 + }, + { + "epoch": 2.090656284760845, + "grad_norm": 9.6484375, + "learning_rate": 7.909343715239156e-06, + "loss": 2.1114, + "mean_token_accuracy": 0.5447983681154872, + "step": 11277 + }, + { + "epoch": 2.0908416759362254, + "grad_norm": 6.05078125, + "learning_rate": 7.909158324063774e-06, + "loss": 3.0152, + "mean_token_accuracy": 0.4966131907308378, + "step": 11278 + }, + { + "epoch": 2.0910270671116056, + "grad_norm": 6.08203125, + "learning_rate": 7.908972932888395e-06, + "loss": 2.7518, + "mean_token_accuracy": 0.48224919835089325, + "step": 11279 + }, + { + "epoch": 2.0912124582869858, + "grad_norm": 8.71875, + "learning_rate": 7.908787541713015e-06, + "loss": 3.0807, + "mean_token_accuracy": 0.45693035835023665, + "step": 11280 + }, + { + "epoch": 2.0913978494623655, + "grad_norm": 7.48046875, + "learning_rate": 7.908602150537636e-06, + "loss": 3.3614, + "mean_token_accuracy": 0.44888832098134424, + "step": 11281 + }, + { + "epoch": 2.0915832406377457, + "grad_norm": 6.24609375, + "learning_rate": 7.908416759362255e-06, + "loss": 2.8361, + "mean_token_accuracy": 0.4682956627978009, + "step": 11282 + }, + { + "epoch": 2.091768631813126, + "grad_norm": 7.47265625, + "learning_rate": 7.908231368186875e-06, + "loss": 2.6449, + "mean_token_accuracy": 0.5144083384426732, + "step": 11283 + }, + { + "epoch": 2.0919540229885056, + "grad_norm": 8.40625, + "learning_rate": 7.908045977011494e-06, + "loss": 2.349, + "mean_token_accuracy": 0.5066478961363992, + "step": 11284 + }, + { + "epoch": 2.092139414163886, + "grad_norm": 5.94921875, + "learning_rate": 7.907860585836114e-06, + "loss": 2.344, + "mean_token_accuracy": 0.5207397622192866, + "step": 11285 + }, + { + "epoch": 2.092324805339266, + "grad_norm": 10.078125, + "learning_rate": 7.907675194660735e-06, + "loss": 2.6016, + "mean_token_accuracy": 0.49842857142857144, + "step": 11286 + }, + { + "epoch": 2.0925101965146458, + "grad_norm": 7.15625, + "learning_rate": 7.907489803485354e-06, + "loss": 2.706, + "mean_token_accuracy": 0.49681616832779624, + "step": 11287 + }, + { + "epoch": 2.092695587690026, + "grad_norm": 7.3671875, + "learning_rate": 7.907304412309976e-06, + "loss": 2.9226, + "mean_token_accuracy": 0.4780289560579121, + "step": 11288 + }, + { + "epoch": 2.092880978865406, + "grad_norm": 7.75, + "learning_rate": 7.907119021134595e-06, + "loss": 2.2471, + "mean_token_accuracy": 0.5176368123094789, + "step": 11289 + }, + { + "epoch": 2.093066370040786, + "grad_norm": 8.0390625, + "learning_rate": 7.906933629959215e-06, + "loss": 3.1462, + "mean_token_accuracy": 0.4358316221765914, + "step": 11290 + }, + { + "epoch": 2.093251761216166, + "grad_norm": 7.26953125, + "learning_rate": 7.906748238783834e-06, + "loss": 2.5687, + "mean_token_accuracy": 0.5039747807017544, + "step": 11291 + }, + { + "epoch": 2.0934371523915463, + "grad_norm": 7.296875, + "learning_rate": 7.906562847608455e-06, + "loss": 2.8499, + "mean_token_accuracy": 0.46308954203691044, + "step": 11292 + }, + { + "epoch": 2.093622543566926, + "grad_norm": 7.54296875, + "learning_rate": 7.906377456433075e-06, + "loss": 2.9991, + "mean_token_accuracy": 0.45910687405920725, + "step": 11293 + }, + { + "epoch": 2.093807934742306, + "grad_norm": 9.046875, + "learning_rate": 7.906192065257694e-06, + "loss": 2.4361, + "mean_token_accuracy": 0.5240981240981241, + "step": 11294 + }, + { + "epoch": 2.0939933259176864, + "grad_norm": 7.55859375, + "learning_rate": 7.906006674082314e-06, + "loss": 2.8269, + "mean_token_accuracy": 0.47691472026072784, + "step": 11295 + }, + { + "epoch": 2.0941787170930666, + "grad_norm": 7.73828125, + "learning_rate": 7.905821282906935e-06, + "loss": 2.9652, + "mean_token_accuracy": 0.48208077025942764, + "step": 11296 + }, + { + "epoch": 2.0943641082684463, + "grad_norm": 6.171875, + "learning_rate": 7.905635891731555e-06, + "loss": 2.9948, + "mean_token_accuracy": 0.4394129024917511, + "step": 11297 + }, + { + "epoch": 2.0945494994438265, + "grad_norm": 6.87109375, + "learning_rate": 7.905450500556174e-06, + "loss": 2.7124, + "mean_token_accuracy": 0.468534253850239, + "step": 11298 + }, + { + "epoch": 2.0947348906192067, + "grad_norm": 7.05078125, + "learning_rate": 7.905265109380795e-06, + "loss": 2.0024, + "mean_token_accuracy": 0.5802606661468197, + "step": 11299 + }, + { + "epoch": 2.0949202817945864, + "grad_norm": 6.359375, + "learning_rate": 7.905079718205413e-06, + "loss": 2.781, + "mean_token_accuracy": 0.4672542166843945, + "step": 11300 + }, + { + "epoch": 2.0951056729699666, + "grad_norm": 6.58984375, + "learning_rate": 7.904894327030034e-06, + "loss": 2.8979, + "mean_token_accuracy": 0.4804295116037409, + "step": 11301 + }, + { + "epoch": 2.095291064145347, + "grad_norm": 6.3515625, + "learning_rate": 7.904708935854654e-06, + "loss": 2.7591, + "mean_token_accuracy": 0.48442426535502586, + "step": 11302 + }, + { + "epoch": 2.0954764553207266, + "grad_norm": 9.0390625, + "learning_rate": 7.904523544679273e-06, + "loss": 3.4266, + "mean_token_accuracy": 0.4580855281789861, + "step": 11303 + }, + { + "epoch": 2.0956618464961068, + "grad_norm": 6.2109375, + "learning_rate": 7.904338153503894e-06, + "loss": 3.0724, + "mean_token_accuracy": 0.4642050737149688, + "step": 11304 + }, + { + "epoch": 2.095847237671487, + "grad_norm": 8.9296875, + "learning_rate": 7.904152762328514e-06, + "loss": 2.6057, + "mean_token_accuracy": 0.4832904884318766, + "step": 11305 + }, + { + "epoch": 2.0960326288468667, + "grad_norm": 8.140625, + "learning_rate": 7.903967371153135e-06, + "loss": 2.154, + "mean_token_accuracy": 0.5661466650288662, + "step": 11306 + }, + { + "epoch": 2.096218020022247, + "grad_norm": 7.24609375, + "learning_rate": 7.903781979977753e-06, + "loss": 2.2587, + "mean_token_accuracy": 0.5490127758420441, + "step": 11307 + }, + { + "epoch": 2.096403411197627, + "grad_norm": 6.66796875, + "learning_rate": 7.903596588802374e-06, + "loss": 2.4575, + "mean_token_accuracy": 0.5357698289269052, + "step": 11308 + }, + { + "epoch": 2.096588802373007, + "grad_norm": 7.19140625, + "learning_rate": 7.903411197626993e-06, + "loss": 2.6227, + "mean_token_accuracy": 0.4932681759250025, + "step": 11309 + }, + { + "epoch": 2.096774193548387, + "grad_norm": 6.81640625, + "learning_rate": 7.903225806451613e-06, + "loss": 2.9474, + "mean_token_accuracy": 0.48655913978494625, + "step": 11310 + }, + { + "epoch": 2.096959584723767, + "grad_norm": 6.578125, + "learning_rate": 7.903040415276234e-06, + "loss": 2.6751, + "mean_token_accuracy": 0.48142593447161974, + "step": 11311 + }, + { + "epoch": 2.0971449758991474, + "grad_norm": 7.69140625, + "learning_rate": 7.902855024100854e-06, + "loss": 3.1818, + "mean_token_accuracy": 0.45396007726980037, + "step": 11312 + }, + { + "epoch": 2.097330367074527, + "grad_norm": 6.12109375, + "learning_rate": 7.902669632925473e-06, + "loss": 2.6713, + "mean_token_accuracy": 0.48611753817677, + "step": 11313 + }, + { + "epoch": 2.0975157582499073, + "grad_norm": 7.9375, + "learning_rate": 7.902484241750093e-06, + "loss": 3.3448, + "mean_token_accuracy": 0.4425367362722351, + "step": 11314 + }, + { + "epoch": 2.0977011494252875, + "grad_norm": 8.84375, + "learning_rate": 7.902298850574714e-06, + "loss": 2.6868, + "mean_token_accuracy": 0.48989597034556975, + "step": 11315 + }, + { + "epoch": 2.0978865406006673, + "grad_norm": 6.6015625, + "learning_rate": 7.902113459399333e-06, + "loss": 3.1244, + "mean_token_accuracy": 0.45454545454545453, + "step": 11316 + }, + { + "epoch": 2.0980719317760474, + "grad_norm": 6.55859375, + "learning_rate": 7.901928068223953e-06, + "loss": 2.3703, + "mean_token_accuracy": 0.5241027181258447, + "step": 11317 + }, + { + "epoch": 2.0982573229514276, + "grad_norm": 6.3046875, + "learning_rate": 7.901742677048572e-06, + "loss": 2.622, + "mean_token_accuracy": 0.5051435590357746, + "step": 11318 + }, + { + "epoch": 2.0984427141268074, + "grad_norm": 10.2734375, + "learning_rate": 7.901557285873193e-06, + "loss": 3.1607, + "mean_token_accuracy": 0.47352386479036707, + "step": 11319 + }, + { + "epoch": 2.0986281053021876, + "grad_norm": 7.3515625, + "learning_rate": 7.901371894697813e-06, + "loss": 2.4999, + "mean_token_accuracy": 0.5051663128096249, + "step": 11320 + }, + { + "epoch": 2.0988134964775678, + "grad_norm": 6.90625, + "learning_rate": 7.901186503522434e-06, + "loss": 3.1715, + "mean_token_accuracy": 0.45490633916387546, + "step": 11321 + }, + { + "epoch": 2.0989988876529475, + "grad_norm": 7.72265625, + "learning_rate": 7.901001112347052e-06, + "loss": 3.1255, + "mean_token_accuracy": 0.44624479964966063, + "step": 11322 + }, + { + "epoch": 2.0991842788283277, + "grad_norm": 6.390625, + "learning_rate": 7.900815721171673e-06, + "loss": 2.823, + "mean_token_accuracy": 0.4885814954978468, + "step": 11323 + }, + { + "epoch": 2.099369670003708, + "grad_norm": 8.3984375, + "learning_rate": 7.900630329996293e-06, + "loss": 2.5891, + "mean_token_accuracy": 0.5083705357142857, + "step": 11324 + }, + { + "epoch": 2.099555061179088, + "grad_norm": 6.41015625, + "learning_rate": 7.900444938820912e-06, + "loss": 3.1322, + "mean_token_accuracy": 0.4505723204994797, + "step": 11325 + }, + { + "epoch": 2.099740452354468, + "grad_norm": 7.19140625, + "learning_rate": 7.900259547645533e-06, + "loss": 2.7176, + "mean_token_accuracy": 0.47370431682159686, + "step": 11326 + }, + { + "epoch": 2.099925843529848, + "grad_norm": 6.77734375, + "learning_rate": 7.900074156470151e-06, + "loss": 2.9175, + "mean_token_accuracy": 0.49468831429144117, + "step": 11327 + }, + { + "epoch": 2.100111234705228, + "grad_norm": 6.22265625, + "learning_rate": 7.899888765294774e-06, + "loss": 2.79, + "mean_token_accuracy": 0.4715087803129853, + "step": 11328 + }, + { + "epoch": 2.100296625880608, + "grad_norm": 6.296875, + "learning_rate": 7.899703374119392e-06, + "loss": 2.0651, + "mean_token_accuracy": 0.5835576217274284, + "step": 11329 + }, + { + "epoch": 2.100482017055988, + "grad_norm": 10.0546875, + "learning_rate": 7.899517982944013e-06, + "loss": 2.3867, + "mean_token_accuracy": 0.5006275275414865, + "step": 11330 + }, + { + "epoch": 2.1006674082313683, + "grad_norm": 6.22265625, + "learning_rate": 7.899332591768633e-06, + "loss": 2.4466, + "mean_token_accuracy": 0.545332257196664, + "step": 11331 + }, + { + "epoch": 2.100852799406748, + "grad_norm": 9.53125, + "learning_rate": 7.899147200593252e-06, + "loss": 2.662, + "mean_token_accuracy": 0.5035637515556058, + "step": 11332 + }, + { + "epoch": 2.1010381905821283, + "grad_norm": 9.875, + "learning_rate": 7.898961809417873e-06, + "loss": 2.5317, + "mean_token_accuracy": 0.49613633957870223, + "step": 11333 + }, + { + "epoch": 2.1012235817575085, + "grad_norm": 9.84375, + "learning_rate": 7.898776418242491e-06, + "loss": 3.1199, + "mean_token_accuracy": 0.4750234155479238, + "step": 11334 + }, + { + "epoch": 2.101408972932888, + "grad_norm": 11.6484375, + "learning_rate": 7.898591027067112e-06, + "loss": 2.6282, + "mean_token_accuracy": 0.49972943722943725, + "step": 11335 + }, + { + "epoch": 2.1015943641082684, + "grad_norm": 7.421875, + "learning_rate": 7.898405635891732e-06, + "loss": 2.7258, + "mean_token_accuracy": 0.49649517259621745, + "step": 11336 + }, + { + "epoch": 2.1017797552836486, + "grad_norm": 10.921875, + "learning_rate": 7.898220244716353e-06, + "loss": 2.5406, + "mean_token_accuracy": 0.48103607770582796, + "step": 11337 + }, + { + "epoch": 2.1019651464590288, + "grad_norm": 7.23828125, + "learning_rate": 7.898034853540972e-06, + "loss": 2.6286, + "mean_token_accuracy": 0.5033387742988574, + "step": 11338 + }, + { + "epoch": 2.1021505376344085, + "grad_norm": 8.25, + "learning_rate": 7.897849462365592e-06, + "loss": 3.1336, + "mean_token_accuracy": 0.4638655462184874, + "step": 11339 + }, + { + "epoch": 2.1023359288097887, + "grad_norm": 6.3984375, + "learning_rate": 7.897664071190213e-06, + "loss": 3.159, + "mean_token_accuracy": 0.45870275314979003, + "step": 11340 + }, + { + "epoch": 2.102521319985169, + "grad_norm": 10.296875, + "learning_rate": 7.897478680014831e-06, + "loss": 2.5997, + "mean_token_accuracy": 0.48745082078415414, + "step": 11341 + }, + { + "epoch": 2.1027067111605486, + "grad_norm": 8.0859375, + "learning_rate": 7.897293288839452e-06, + "loss": 2.3333, + "mean_token_accuracy": 0.5429379371930346, + "step": 11342 + }, + { + "epoch": 2.102892102335929, + "grad_norm": 6.39453125, + "learning_rate": 7.89710789766407e-06, + "loss": 3.4395, + "mean_token_accuracy": 0.4435817157169693, + "step": 11343 + }, + { + "epoch": 2.103077493511309, + "grad_norm": 8.21875, + "learning_rate": 7.896922506488693e-06, + "loss": 2.633, + "mean_token_accuracy": 0.48471741637831606, + "step": 11344 + }, + { + "epoch": 2.1032628846866888, + "grad_norm": 7.02734375, + "learning_rate": 7.896737115313312e-06, + "loss": 3.0752, + "mean_token_accuracy": 0.4471607103705328, + "step": 11345 + }, + { + "epoch": 2.103448275862069, + "grad_norm": 6.44921875, + "learning_rate": 7.896551724137932e-06, + "loss": 2.9675, + "mean_token_accuracy": 0.47425742574257423, + "step": 11346 + }, + { + "epoch": 2.103633667037449, + "grad_norm": 8.734375, + "learning_rate": 7.896366332962551e-06, + "loss": 2.471, + "mean_token_accuracy": 0.5067905646890636, + "step": 11347 + }, + { + "epoch": 2.103819058212829, + "grad_norm": 8.734375, + "learning_rate": 7.896180941787172e-06, + "loss": 2.6899, + "mean_token_accuracy": 0.47939346811819594, + "step": 11348 + }, + { + "epoch": 2.104004449388209, + "grad_norm": 6.51953125, + "learning_rate": 7.895995550611792e-06, + "loss": 3.0898, + "mean_token_accuracy": 0.4760630389533155, + "step": 11349 + }, + { + "epoch": 2.1041898405635893, + "grad_norm": 6.19140625, + "learning_rate": 7.89581015943641e-06, + "loss": 2.8013, + "mean_token_accuracy": 0.5156864830373286, + "step": 11350 + }, + { + "epoch": 2.1043752317389695, + "grad_norm": 8.4140625, + "learning_rate": 7.895624768261031e-06, + "loss": 2.1929, + "mean_token_accuracy": 0.5333640128854119, + "step": 11351 + }, + { + "epoch": 2.104560622914349, + "grad_norm": 6.4453125, + "learning_rate": 7.895439377085652e-06, + "loss": 2.9384, + "mean_token_accuracy": 0.4659201033703026, + "step": 11352 + }, + { + "epoch": 2.1047460140897294, + "grad_norm": 5.91796875, + "learning_rate": 7.895253985910272e-06, + "loss": 2.6375, + "mean_token_accuracy": 0.496870925684485, + "step": 11353 + }, + { + "epoch": 2.1049314052651096, + "grad_norm": 7.1328125, + "learning_rate": 7.895068594734891e-06, + "loss": 2.519, + "mean_token_accuracy": 0.5207536865101038, + "step": 11354 + }, + { + "epoch": 2.1051167964404893, + "grad_norm": 6.2421875, + "learning_rate": 7.894883203559512e-06, + "loss": 2.3526, + "mean_token_accuracy": 0.5189421015010722, + "step": 11355 + }, + { + "epoch": 2.1053021876158695, + "grad_norm": 5.8671875, + "learning_rate": 7.89469781238413e-06, + "loss": 2.5606, + "mean_token_accuracy": 0.4931017691933128, + "step": 11356 + }, + { + "epoch": 2.1054875787912497, + "grad_norm": 6.5078125, + "learning_rate": 7.894512421208751e-06, + "loss": 2.7606, + "mean_token_accuracy": 0.485230352303523, + "step": 11357 + }, + { + "epoch": 2.1056729699666294, + "grad_norm": 6.21875, + "learning_rate": 7.894327030033371e-06, + "loss": 2.8605, + "mean_token_accuracy": 0.4949232585596222, + "step": 11358 + }, + { + "epoch": 2.1058583611420096, + "grad_norm": 6.4296875, + "learning_rate": 7.89414163885799e-06, + "loss": 2.5093, + "mean_token_accuracy": 0.5299879243257749, + "step": 11359 + }, + { + "epoch": 2.10604375231739, + "grad_norm": 7.296875, + "learning_rate": 7.89395624768261e-06, + "loss": 3.3161, + "mean_token_accuracy": 0.44537576360278447, + "step": 11360 + }, + { + "epoch": 2.1062291434927696, + "grad_norm": 6.3125, + "learning_rate": 7.893770856507231e-06, + "loss": 2.8225, + "mean_token_accuracy": 0.47462941847206386, + "step": 11361 + }, + { + "epoch": 2.1064145346681498, + "grad_norm": 6.5, + "learning_rate": 7.893585465331852e-06, + "loss": 2.684, + "mean_token_accuracy": 0.48043505438179773, + "step": 11362 + }, + { + "epoch": 2.10659992584353, + "grad_norm": 6.34765625, + "learning_rate": 7.89340007415647e-06, + "loss": 3.0781, + "mean_token_accuracy": 0.45559336919530136, + "step": 11363 + }, + { + "epoch": 2.1067853170189097, + "grad_norm": 5.71484375, + "learning_rate": 7.893214682981091e-06, + "loss": 2.1556, + "mean_token_accuracy": 0.580490335970943, + "step": 11364 + }, + { + "epoch": 2.10697070819429, + "grad_norm": 6.55859375, + "learning_rate": 7.89302929180571e-06, + "loss": 2.7745, + "mean_token_accuracy": 0.4796402289452167, + "step": 11365 + }, + { + "epoch": 2.10715609936967, + "grad_norm": 6.69921875, + "learning_rate": 7.89284390063033e-06, + "loss": 2.7886, + "mean_token_accuracy": 0.4724464236014473, + "step": 11366 + }, + { + "epoch": 2.1073414905450503, + "grad_norm": 6.8046875, + "learning_rate": 7.89265850945495e-06, + "loss": 2.4865, + "mean_token_accuracy": 0.49732620320855614, + "step": 11367 + }, + { + "epoch": 2.10752688172043, + "grad_norm": 10.078125, + "learning_rate": 7.892473118279571e-06, + "loss": 2.6186, + "mean_token_accuracy": 0.4750963126031921, + "step": 11368 + }, + { + "epoch": 2.10771227289581, + "grad_norm": 8.8046875, + "learning_rate": 7.892287727104192e-06, + "loss": 2.6249, + "mean_token_accuracy": 0.49086936163571093, + "step": 11369 + }, + { + "epoch": 2.1078976640711904, + "grad_norm": 7.39453125, + "learning_rate": 7.89210233592881e-06, + "loss": 2.9338, + "mean_token_accuracy": 0.4702110606465402, + "step": 11370 + }, + { + "epoch": 2.10808305524657, + "grad_norm": 6.54296875, + "learning_rate": 7.891916944753431e-06, + "loss": 2.7356, + "mean_token_accuracy": 0.5041186161449753, + "step": 11371 + }, + { + "epoch": 2.1082684464219503, + "grad_norm": 6.0859375, + "learning_rate": 7.89173155357805e-06, + "loss": 2.3997, + "mean_token_accuracy": 0.5230616082291782, + "step": 11372 + }, + { + "epoch": 2.1084538375973305, + "grad_norm": 8.0390625, + "learning_rate": 7.89154616240267e-06, + "loss": 2.5174, + "mean_token_accuracy": 0.49872053872053873, + "step": 11373 + }, + { + "epoch": 2.1086392287727103, + "grad_norm": 6.953125, + "learning_rate": 7.89136077122729e-06, + "loss": 2.2907, + "mean_token_accuracy": 0.5358354058331449, + "step": 11374 + }, + { + "epoch": 2.1088246199480905, + "grad_norm": 6.33203125, + "learning_rate": 7.89117538005191e-06, + "loss": 3.5177, + "mean_token_accuracy": 0.4452301719356628, + "step": 11375 + }, + { + "epoch": 2.1090100111234706, + "grad_norm": 6.19921875, + "learning_rate": 7.89098998887653e-06, + "loss": 2.4695, + "mean_token_accuracy": 0.49985101311084623, + "step": 11376 + }, + { + "epoch": 2.1091954022988504, + "grad_norm": 5.921875, + "learning_rate": 7.89080459770115e-06, + "loss": 3.2945, + "mean_token_accuracy": 0.4594631236442516, + "step": 11377 + }, + { + "epoch": 2.1093807934742306, + "grad_norm": 7.5625, + "learning_rate": 7.890619206525771e-06, + "loss": 2.4619, + "mean_token_accuracy": 0.531015157304802, + "step": 11378 + }, + { + "epoch": 2.1095661846496108, + "grad_norm": 6.25, + "learning_rate": 7.89043381535039e-06, + "loss": 2.6581, + "mean_token_accuracy": 0.5039646579066607, + "step": 11379 + }, + { + "epoch": 2.1097515758249905, + "grad_norm": 6.6015625, + "learning_rate": 7.89024842417501e-06, + "loss": 3.5985, + "mean_token_accuracy": 0.432475884244373, + "step": 11380 + }, + { + "epoch": 2.1099369670003707, + "grad_norm": 6.40625, + "learning_rate": 7.890063032999629e-06, + "loss": 2.0106, + "mean_token_accuracy": 0.5544057698338037, + "step": 11381 + }, + { + "epoch": 2.110122358175751, + "grad_norm": 7.46875, + "learning_rate": 7.88987764182425e-06, + "loss": 3.1674, + "mean_token_accuracy": 0.42293994842195753, + "step": 11382 + }, + { + "epoch": 2.110307749351131, + "grad_norm": 6.70703125, + "learning_rate": 7.88969225064887e-06, + "loss": 2.7461, + "mean_token_accuracy": 0.5075306479859895, + "step": 11383 + }, + { + "epoch": 2.110493140526511, + "grad_norm": 7.0390625, + "learning_rate": 7.889506859473489e-06, + "loss": 3.0897, + "mean_token_accuracy": 0.46999668471654327, + "step": 11384 + }, + { + "epoch": 2.110678531701891, + "grad_norm": 6.734375, + "learning_rate": 7.88932146829811e-06, + "loss": 2.588, + "mean_token_accuracy": 0.48945849977807365, + "step": 11385 + }, + { + "epoch": 2.110863922877271, + "grad_norm": 10.1640625, + "learning_rate": 7.88913607712273e-06, + "loss": 2.2732, + "mean_token_accuracy": 0.5444887118193891, + "step": 11386 + }, + { + "epoch": 2.111049314052651, + "grad_norm": 6.83984375, + "learning_rate": 7.88895068594735e-06, + "loss": 2.6596, + "mean_token_accuracy": 0.4612432847275518, + "step": 11387 + }, + { + "epoch": 2.111234705228031, + "grad_norm": 7.515625, + "learning_rate": 7.88876529477197e-06, + "loss": 3.1292, + "mean_token_accuracy": 0.4538564422648239, + "step": 11388 + }, + { + "epoch": 2.1114200964034113, + "grad_norm": 6.79296875, + "learning_rate": 7.88857990359659e-06, + "loss": 2.6225, + "mean_token_accuracy": 0.4931452675982072, + "step": 11389 + }, + { + "epoch": 2.111605487578791, + "grad_norm": 8.734375, + "learning_rate": 7.888394512421208e-06, + "loss": 2.4968, + "mean_token_accuracy": 0.4959771606540358, + "step": 11390 + }, + { + "epoch": 2.1117908787541713, + "grad_norm": 7.1953125, + "learning_rate": 7.888209121245829e-06, + "loss": 2.8489, + "mean_token_accuracy": 0.4687676493843895, + "step": 11391 + }, + { + "epoch": 2.1119762699295515, + "grad_norm": 8.859375, + "learning_rate": 7.88802373007045e-06, + "loss": 3.431, + "mean_token_accuracy": 0.4613022898810285, + "step": 11392 + }, + { + "epoch": 2.112161661104931, + "grad_norm": 7.65625, + "learning_rate": 7.88783833889507e-06, + "loss": 3.2983, + "mean_token_accuracy": 0.460904044409199, + "step": 11393 + }, + { + "epoch": 2.1123470522803114, + "grad_norm": 8.7109375, + "learning_rate": 7.887652947719689e-06, + "loss": 2.5795, + "mean_token_accuracy": 0.4711596842744384, + "step": 11394 + }, + { + "epoch": 2.1125324434556916, + "grad_norm": 8.4140625, + "learning_rate": 7.88746755654431e-06, + "loss": 2.8672, + "mean_token_accuracy": 0.48810188805720034, + "step": 11395 + }, + { + "epoch": 2.1127178346310718, + "grad_norm": 11.46875, + "learning_rate": 7.88728216536893e-06, + "loss": 2.4195, + "mean_token_accuracy": 0.5025879917184265, + "step": 11396 + }, + { + "epoch": 2.1129032258064515, + "grad_norm": 7.046875, + "learning_rate": 7.887096774193549e-06, + "loss": 3.221, + "mean_token_accuracy": 0.47037980290111836, + "step": 11397 + }, + { + "epoch": 2.1130886169818317, + "grad_norm": 7.5234375, + "learning_rate": 7.886911383018169e-06, + "loss": 3.2469, + "mean_token_accuracy": 0.43959190979058527, + "step": 11398 + }, + { + "epoch": 2.113274008157212, + "grad_norm": 6.8828125, + "learning_rate": 7.886725991842788e-06, + "loss": 2.9992, + "mean_token_accuracy": 0.458060587608464, + "step": 11399 + }, + { + "epoch": 2.1134593993325916, + "grad_norm": 7.515625, + "learning_rate": 7.886540600667408e-06, + "loss": 3.0594, + "mean_token_accuracy": 0.4699634337454292, + "step": 11400 + }, + { + "epoch": 2.113644790507972, + "grad_norm": 5.6875, + "learning_rate": 7.886355209492029e-06, + "loss": 2.1893, + "mean_token_accuracy": 0.536655069582505, + "step": 11401 + }, + { + "epoch": 2.113830181683352, + "grad_norm": 8.765625, + "learning_rate": 7.88616981831665e-06, + "loss": 3.9438, + "mean_token_accuracy": 0.40982028241335045, + "step": 11402 + }, + { + "epoch": 2.1140155728587318, + "grad_norm": 7.27734375, + "learning_rate": 7.885984427141268e-06, + "loss": 2.6758, + "mean_token_accuracy": 0.5003026268006294, + "step": 11403 + }, + { + "epoch": 2.114200964034112, + "grad_norm": 6.19921875, + "learning_rate": 7.885799035965889e-06, + "loss": 2.7982, + "mean_token_accuracy": 0.4918112930823759, + "step": 11404 + }, + { + "epoch": 2.114386355209492, + "grad_norm": 6.2734375, + "learning_rate": 7.885613644790509e-06, + "loss": 2.3918, + "mean_token_accuracy": 0.5169584914901433, + "step": 11405 + }, + { + "epoch": 2.114571746384872, + "grad_norm": 7.36328125, + "learning_rate": 7.885428253615128e-06, + "loss": 2.8001, + "mean_token_accuracy": 0.46820603907637653, + "step": 11406 + }, + { + "epoch": 2.114757137560252, + "grad_norm": 6.6015625, + "learning_rate": 7.885242862439748e-06, + "loss": 2.5317, + "mean_token_accuracy": 0.49456390432471614, + "step": 11407 + }, + { + "epoch": 2.1149425287356323, + "grad_norm": 9.4375, + "learning_rate": 7.885057471264367e-06, + "loss": 2.8676, + "mean_token_accuracy": 0.47792508688376883, + "step": 11408 + }, + { + "epoch": 2.1151279199110125, + "grad_norm": 6.1171875, + "learning_rate": 7.88487208008899e-06, + "loss": 2.3359, + "mean_token_accuracy": 0.5318199711323973, + "step": 11409 + }, + { + "epoch": 2.115313311086392, + "grad_norm": 6.33984375, + "learning_rate": 7.884686688913608e-06, + "loss": 2.6462, + "mean_token_accuracy": 0.4847010826926094, + "step": 11410 + }, + { + "epoch": 2.1154987022617724, + "grad_norm": 6.35546875, + "learning_rate": 7.884501297738229e-06, + "loss": 2.8685, + "mean_token_accuracy": 0.467244564445076, + "step": 11411 + }, + { + "epoch": 2.1156840934371526, + "grad_norm": 6.26171875, + "learning_rate": 7.884315906562849e-06, + "loss": 3.1337, + "mean_token_accuracy": 0.46775871034841393, + "step": 11412 + }, + { + "epoch": 2.1158694846125323, + "grad_norm": 6.82421875, + "learning_rate": 7.884130515387468e-06, + "loss": 2.5842, + "mean_token_accuracy": 0.48522130532633156, + "step": 11413 + }, + { + "epoch": 2.1160548757879125, + "grad_norm": 8.7734375, + "learning_rate": 7.883945124212088e-06, + "loss": 3.4982, + "mean_token_accuracy": 0.43869038249645836, + "step": 11414 + }, + { + "epoch": 2.1162402669632927, + "grad_norm": 7.19140625, + "learning_rate": 7.883759733036707e-06, + "loss": 3.0445, + "mean_token_accuracy": 0.46871648194494103, + "step": 11415 + }, + { + "epoch": 2.1164256581386724, + "grad_norm": 6.01953125, + "learning_rate": 7.883574341861328e-06, + "loss": 2.7898, + "mean_token_accuracy": 0.4855997083485235, + "step": 11416 + }, + { + "epoch": 2.1166110493140526, + "grad_norm": 9.4765625, + "learning_rate": 7.883388950685948e-06, + "loss": 2.5163, + "mean_token_accuracy": 0.48711477151965993, + "step": 11417 + }, + { + "epoch": 2.116796440489433, + "grad_norm": 10.0, + "learning_rate": 7.883203559510569e-06, + "loss": 2.5753, + "mean_token_accuracy": 0.48977751052315094, + "step": 11418 + }, + { + "epoch": 2.1169818316648126, + "grad_norm": 8.3515625, + "learning_rate": 7.883018168335187e-06, + "loss": 2.0136, + "mean_token_accuracy": 0.5458218158622156, + "step": 11419 + }, + { + "epoch": 2.1171672228401928, + "grad_norm": 6.0625, + "learning_rate": 7.882832777159808e-06, + "loss": 2.2165, + "mean_token_accuracy": 0.5333425759046166, + "step": 11420 + }, + { + "epoch": 2.117352614015573, + "grad_norm": 7.6640625, + "learning_rate": 7.882647385984428e-06, + "loss": 3.1282, + "mean_token_accuracy": 0.4411401776900296, + "step": 11421 + }, + { + "epoch": 2.1175380051909527, + "grad_norm": 6.6640625, + "learning_rate": 7.882461994809047e-06, + "loss": 2.7005, + "mean_token_accuracy": 0.4928825622775801, + "step": 11422 + }, + { + "epoch": 2.117723396366333, + "grad_norm": 7.61328125, + "learning_rate": 7.882276603633668e-06, + "loss": 2.4166, + "mean_token_accuracy": 0.515887621150779, + "step": 11423 + }, + { + "epoch": 2.117908787541713, + "grad_norm": 8.6015625, + "learning_rate": 7.882091212458287e-06, + "loss": 2.4793, + "mean_token_accuracy": 0.5161048689138577, + "step": 11424 + }, + { + "epoch": 2.1180941787170933, + "grad_norm": 7.87109375, + "learning_rate": 7.881905821282909e-06, + "loss": 2.6848, + "mean_token_accuracy": 0.5023094688221709, + "step": 11425 + }, + { + "epoch": 2.118279569892473, + "grad_norm": 8.34375, + "learning_rate": 7.881720430107528e-06, + "loss": 3.2211, + "mean_token_accuracy": 0.4461174713787954, + "step": 11426 + }, + { + "epoch": 2.118464961067853, + "grad_norm": 6.68359375, + "learning_rate": 7.881535038932148e-06, + "loss": 2.5061, + "mean_token_accuracy": 0.5205206378986866, + "step": 11427 + }, + { + "epoch": 2.1186503522432334, + "grad_norm": 5.6796875, + "learning_rate": 7.881349647756767e-06, + "loss": 2.2642, + "mean_token_accuracy": 0.569510778365819, + "step": 11428 + }, + { + "epoch": 2.118835743418613, + "grad_norm": 10.7265625, + "learning_rate": 7.881164256581387e-06, + "loss": 3.0339, + "mean_token_accuracy": 0.4728633811603244, + "step": 11429 + }, + { + "epoch": 2.1190211345939933, + "grad_norm": 6.28125, + "learning_rate": 7.880978865406008e-06, + "loss": 2.8041, + "mean_token_accuracy": 0.4718823044135845, + "step": 11430 + }, + { + "epoch": 2.1192065257693735, + "grad_norm": 6.859375, + "learning_rate": 7.880793474230627e-06, + "loss": 3.4319, + "mean_token_accuracy": 0.4161579892280072, + "step": 11431 + }, + { + "epoch": 2.1193919169447533, + "grad_norm": 9.3359375, + "learning_rate": 7.880608083055247e-06, + "loss": 2.4395, + "mean_token_accuracy": 0.5122081387591728, + "step": 11432 + }, + { + "epoch": 2.1195773081201335, + "grad_norm": 6.484375, + "learning_rate": 7.880422691879868e-06, + "loss": 2.8505, + "mean_token_accuracy": 0.48547844695811676, + "step": 11433 + }, + { + "epoch": 2.1197626992955136, + "grad_norm": 6.109375, + "learning_rate": 7.880237300704488e-06, + "loss": 2.7771, + "mean_token_accuracy": 0.4701402805611222, + "step": 11434 + }, + { + "epoch": 2.1199480904708934, + "grad_norm": 9.9609375, + "learning_rate": 7.880051909529107e-06, + "loss": 2.7191, + "mean_token_accuracy": 0.49423829157416216, + "step": 11435 + }, + { + "epoch": 2.1201334816462736, + "grad_norm": 9.515625, + "learning_rate": 7.879866518353727e-06, + "loss": 2.9555, + "mean_token_accuracy": 0.4778365667254556, + "step": 11436 + }, + { + "epoch": 2.1203188728216538, + "grad_norm": 7.8671875, + "learning_rate": 7.879681127178346e-06, + "loss": 3.7456, + "mean_token_accuracy": 0.39080612924716857, + "step": 11437 + }, + { + "epoch": 2.120504263997034, + "grad_norm": 7.37109375, + "learning_rate": 7.879495736002967e-06, + "loss": 2.7234, + "mean_token_accuracy": 0.49799548985216735, + "step": 11438 + }, + { + "epoch": 2.1206896551724137, + "grad_norm": 7.53515625, + "learning_rate": 7.879310344827587e-06, + "loss": 3.2699, + "mean_token_accuracy": 0.4485481065167938, + "step": 11439 + }, + { + "epoch": 2.120875046347794, + "grad_norm": 6.2421875, + "learning_rate": 7.879124953652206e-06, + "loss": 3.2053, + "mean_token_accuracy": 0.46002331002331004, + "step": 11440 + }, + { + "epoch": 2.121060437523174, + "grad_norm": 6.265625, + "learning_rate": 7.878939562476826e-06, + "loss": 2.7628, + "mean_token_accuracy": 0.47265625, + "step": 11441 + }, + { + "epoch": 2.121245828698554, + "grad_norm": 7.70703125, + "learning_rate": 7.878754171301447e-06, + "loss": 2.9482, + "mean_token_accuracy": 0.4832099418297197, + "step": 11442 + }, + { + "epoch": 2.121431219873934, + "grad_norm": 8.4375, + "learning_rate": 7.878568780126067e-06, + "loss": 3.6468, + "mean_token_accuracy": 0.41553094832481546, + "step": 11443 + }, + { + "epoch": 2.121616611049314, + "grad_norm": 7.234375, + "learning_rate": 7.878383388950686e-06, + "loss": 3.3835, + "mean_token_accuracy": 0.4437705592105263, + "step": 11444 + }, + { + "epoch": 2.121802002224694, + "grad_norm": 6.8828125, + "learning_rate": 7.878197997775307e-06, + "loss": 2.7104, + "mean_token_accuracy": 0.5244548286604361, + "step": 11445 + }, + { + "epoch": 2.121987393400074, + "grad_norm": 6.11328125, + "learning_rate": 7.878012606599925e-06, + "loss": 2.4811, + "mean_token_accuracy": 0.5014702278853223, + "step": 11446 + }, + { + "epoch": 2.1221727845754543, + "grad_norm": 8.8984375, + "learning_rate": 7.877827215424546e-06, + "loss": 3.1323, + "mean_token_accuracy": 0.4387086712414223, + "step": 11447 + }, + { + "epoch": 2.122358175750834, + "grad_norm": 6.05859375, + "learning_rate": 7.877641824249166e-06, + "loss": 2.5822, + "mean_token_accuracy": 0.5008764241893077, + "step": 11448 + }, + { + "epoch": 2.1225435669262143, + "grad_norm": 5.93359375, + "learning_rate": 7.877456433073787e-06, + "loss": 2.6828, + "mean_token_accuracy": 0.48578811369509045, + "step": 11449 + }, + { + "epoch": 2.1227289581015945, + "grad_norm": 8.53125, + "learning_rate": 7.877271041898407e-06, + "loss": 2.8419, + "mean_token_accuracy": 0.46367357380404195, + "step": 11450 + }, + { + "epoch": 2.122914349276974, + "grad_norm": 5.59765625, + "learning_rate": 7.877085650723026e-06, + "loss": 2.9522, + "mean_token_accuracy": 0.4520653007559987, + "step": 11451 + }, + { + "epoch": 2.1230997404523544, + "grad_norm": 6.21484375, + "learning_rate": 7.876900259547647e-06, + "loss": 2.2091, + "mean_token_accuracy": 0.5495506586236613, + "step": 11452 + }, + { + "epoch": 2.1232851316277346, + "grad_norm": 8.6328125, + "learning_rate": 7.876714868372266e-06, + "loss": 2.7831, + "mean_token_accuracy": 0.48254051917248647, + "step": 11453 + }, + { + "epoch": 2.1234705228031148, + "grad_norm": 7.81640625, + "learning_rate": 7.876529477196886e-06, + "loss": 2.2761, + "mean_token_accuracy": 0.5523625310859354, + "step": 11454 + }, + { + "epoch": 2.1236559139784945, + "grad_norm": 6.58203125, + "learning_rate": 7.876344086021507e-06, + "loss": 2.4132, + "mean_token_accuracy": 0.5098591549295775, + "step": 11455 + }, + { + "epoch": 2.1238413051538747, + "grad_norm": 9.140625, + "learning_rate": 7.876158694846125e-06, + "loss": 2.2637, + "mean_token_accuracy": 0.525768852689229, + "step": 11456 + }, + { + "epoch": 2.124026696329255, + "grad_norm": 7.06640625, + "learning_rate": 7.875973303670746e-06, + "loss": 3.0759, + "mean_token_accuracy": 0.45397960535028475, + "step": 11457 + }, + { + "epoch": 2.1242120875046346, + "grad_norm": 6.9140625, + "learning_rate": 7.875787912495366e-06, + "loss": 2.8676, + "mean_token_accuracy": 0.4756224066390041, + "step": 11458 + }, + { + "epoch": 2.124397478680015, + "grad_norm": 8.7109375, + "learning_rate": 7.875602521319987e-06, + "loss": 2.1941, + "mean_token_accuracy": 0.5396975425330813, + "step": 11459 + }, + { + "epoch": 2.124582869855395, + "grad_norm": 7.54296875, + "learning_rate": 7.875417130144606e-06, + "loss": 2.58, + "mean_token_accuracy": 0.487146937480165, + "step": 11460 + }, + { + "epoch": 2.1247682610307748, + "grad_norm": 7.859375, + "learning_rate": 7.875231738969226e-06, + "loss": 2.6915, + "mean_token_accuracy": 0.48961073119410836, + "step": 11461 + }, + { + "epoch": 2.124953652206155, + "grad_norm": 7.03515625, + "learning_rate": 7.875046347793845e-06, + "loss": 3.4177, + "mean_token_accuracy": 0.4389347113186922, + "step": 11462 + }, + { + "epoch": 2.125139043381535, + "grad_norm": 6.58984375, + "learning_rate": 7.874860956618465e-06, + "loss": 2.6228, + "mean_token_accuracy": 0.4976711690731253, + "step": 11463 + }, + { + "epoch": 2.125324434556915, + "grad_norm": 7.7890625, + "learning_rate": 7.874675565443086e-06, + "loss": 3.474, + "mean_token_accuracy": 0.44262761268306555, + "step": 11464 + }, + { + "epoch": 2.125509825732295, + "grad_norm": 8.203125, + "learning_rate": 7.874490174267706e-06, + "loss": 2.8924, + "mean_token_accuracy": 0.4666278053045759, + "step": 11465 + }, + { + "epoch": 2.1256952169076753, + "grad_norm": 6.70703125, + "learning_rate": 7.874304783092325e-06, + "loss": 2.564, + "mean_token_accuracy": 0.5220028208744711, + "step": 11466 + }, + { + "epoch": 2.1258806080830555, + "grad_norm": 6.43359375, + "learning_rate": 7.874119391916946e-06, + "loss": 3.0559, + "mean_token_accuracy": 0.45654872749844816, + "step": 11467 + }, + { + "epoch": 2.126065999258435, + "grad_norm": 6.79296875, + "learning_rate": 7.873934000741566e-06, + "loss": 2.5733, + "mean_token_accuracy": 0.5139307683029791, + "step": 11468 + }, + { + "epoch": 2.1262513904338154, + "grad_norm": 7.734375, + "learning_rate": 7.873748609566185e-06, + "loss": 3.0504, + "mean_token_accuracy": 0.4625754527162978, + "step": 11469 + }, + { + "epoch": 2.1264367816091956, + "grad_norm": 6.44921875, + "learning_rate": 7.873563218390805e-06, + "loss": 3.4013, + "mean_token_accuracy": 0.4378836238644734, + "step": 11470 + }, + { + "epoch": 2.1266221727845753, + "grad_norm": 6.421875, + "learning_rate": 7.873377827215424e-06, + "loss": 3.2089, + "mean_token_accuracy": 0.4607442041691019, + "step": 11471 + }, + { + "epoch": 2.1268075639599555, + "grad_norm": 6.26171875, + "learning_rate": 7.873192436040045e-06, + "loss": 2.9077, + "mean_token_accuracy": 0.4668155315717653, + "step": 11472 + }, + { + "epoch": 2.1269929551353357, + "grad_norm": 6.1171875, + "learning_rate": 7.873007044864665e-06, + "loss": 2.8879, + "mean_token_accuracy": 0.4642435375934917, + "step": 11473 + }, + { + "epoch": 2.1271783463107155, + "grad_norm": 7.4375, + "learning_rate": 7.872821653689286e-06, + "loss": 2.7947, + "mean_token_accuracy": 0.4775175980462577, + "step": 11474 + }, + { + "epoch": 2.1273637374860956, + "grad_norm": 6.20703125, + "learning_rate": 7.872636262513904e-06, + "loss": 3.0391, + "mean_token_accuracy": 0.46111805121798877, + "step": 11475 + }, + { + "epoch": 2.127549128661476, + "grad_norm": 6.80078125, + "learning_rate": 7.872450871338525e-06, + "loss": 1.9665, + "mean_token_accuracy": 0.5618881587769881, + "step": 11476 + }, + { + "epoch": 2.1277345198368556, + "grad_norm": 5.95703125, + "learning_rate": 7.872265480163145e-06, + "loss": 2.6681, + "mean_token_accuracy": 0.4909418571564071, + "step": 11477 + }, + { + "epoch": 2.1279199110122358, + "grad_norm": 7.7734375, + "learning_rate": 7.872080088987764e-06, + "loss": 2.9948, + "mean_token_accuracy": 0.49434333497294636, + "step": 11478 + }, + { + "epoch": 2.128105302187616, + "grad_norm": 6.640625, + "learning_rate": 7.871894697812385e-06, + "loss": 3.383, + "mean_token_accuracy": 0.4333373771685066, + "step": 11479 + }, + { + "epoch": 2.128290693362996, + "grad_norm": 7.1875, + "learning_rate": 7.871709306637004e-06, + "loss": 2.6971, + "mean_token_accuracy": 0.4794791494972804, + "step": 11480 + }, + { + "epoch": 2.128476084538376, + "grad_norm": 6.84765625, + "learning_rate": 7.871523915461626e-06, + "loss": 2.5837, + "mean_token_accuracy": 0.5145454545454545, + "step": 11481 + }, + { + "epoch": 2.128661475713756, + "grad_norm": 6.75390625, + "learning_rate": 7.871338524286245e-06, + "loss": 2.4486, + "mean_token_accuracy": 0.5107474691443628, + "step": 11482 + }, + { + "epoch": 2.1288468668891363, + "grad_norm": 7.23046875, + "learning_rate": 7.871153133110865e-06, + "loss": 2.7773, + "mean_token_accuracy": 0.5265191897654584, + "step": 11483 + }, + { + "epoch": 2.129032258064516, + "grad_norm": 7.3203125, + "learning_rate": 7.870967741935484e-06, + "loss": 2.5754, + "mean_token_accuracy": 0.47774436090225564, + "step": 11484 + }, + { + "epoch": 2.129217649239896, + "grad_norm": 7.16796875, + "learning_rate": 7.870782350760104e-06, + "loss": 2.8913, + "mean_token_accuracy": 0.46409540725704135, + "step": 11485 + }, + { + "epoch": 2.1294030404152764, + "grad_norm": 7.26171875, + "learning_rate": 7.870596959584725e-06, + "loss": 2.9117, + "mean_token_accuracy": 0.4724526066350711, + "step": 11486 + }, + { + "epoch": 2.129588431590656, + "grad_norm": 7.00390625, + "learning_rate": 7.870411568409344e-06, + "loss": 2.7702, + "mean_token_accuracy": 0.49097027481772293, + "step": 11487 + }, + { + "epoch": 2.1297738227660363, + "grad_norm": 6.4453125, + "learning_rate": 7.870226177233964e-06, + "loss": 2.9172, + "mean_token_accuracy": 0.4742967992240543, + "step": 11488 + }, + { + "epoch": 2.1299592139414165, + "grad_norm": 8.84375, + "learning_rate": 7.870040786058585e-06, + "loss": 4.5595, + "mean_token_accuracy": 0.39080459770114945, + "step": 11489 + }, + { + "epoch": 2.1301446051167963, + "grad_norm": 7.640625, + "learning_rate": 7.869855394883205e-06, + "loss": 2.9024, + "mean_token_accuracy": 0.4945005273466928, + "step": 11490 + }, + { + "epoch": 2.1303299962921765, + "grad_norm": 6.2578125, + "learning_rate": 7.869670003707824e-06, + "loss": 3.2204, + "mean_token_accuracy": 0.4407552083333333, + "step": 11491 + }, + { + "epoch": 2.1305153874675566, + "grad_norm": 6.51171875, + "learning_rate": 7.869484612532444e-06, + "loss": 2.4181, + "mean_token_accuracy": 0.5043558606124604, + "step": 11492 + }, + { + "epoch": 2.1307007786429364, + "grad_norm": 6.5625, + "learning_rate": 7.869299221357065e-06, + "loss": 2.9503, + "mean_token_accuracy": 0.461010922021844, + "step": 11493 + }, + { + "epoch": 2.1308861698183166, + "grad_norm": 6.11328125, + "learning_rate": 7.869113830181684e-06, + "loss": 3.0311, + "mean_token_accuracy": 0.45435349646006706, + "step": 11494 + }, + { + "epoch": 2.1310715609936968, + "grad_norm": 7.890625, + "learning_rate": 7.868928439006304e-06, + "loss": 3.2539, + "mean_token_accuracy": 0.455690013445789, + "step": 11495 + }, + { + "epoch": 2.131256952169077, + "grad_norm": 6.26171875, + "learning_rate": 7.868743047830923e-06, + "loss": 2.8262, + "mean_token_accuracy": 0.47705779334500875, + "step": 11496 + }, + { + "epoch": 2.1314423433444567, + "grad_norm": 6.54296875, + "learning_rate": 7.868557656655545e-06, + "loss": 3.0703, + "mean_token_accuracy": 0.4620554808749847, + "step": 11497 + }, + { + "epoch": 2.131627734519837, + "grad_norm": 7.1328125, + "learning_rate": 7.868372265480164e-06, + "loss": 2.9176, + "mean_token_accuracy": 0.47026963381608383, + "step": 11498 + }, + { + "epoch": 2.131813125695217, + "grad_norm": 7.09375, + "learning_rate": 7.868186874304784e-06, + "loss": 3.1349, + "mean_token_accuracy": 0.44756856418529617, + "step": 11499 + }, + { + "epoch": 2.131998516870597, + "grad_norm": 7.125, + "learning_rate": 7.868001483129403e-06, + "loss": 3.3496, + "mean_token_accuracy": 0.4487854843429909, + "step": 11500 + }, + { + "epoch": 2.132183908045977, + "grad_norm": 6.14453125, + "learning_rate": 7.867816091954024e-06, + "loss": 3.4223, + "mean_token_accuracy": 0.4303899082568807, + "step": 11501 + }, + { + "epoch": 2.132369299221357, + "grad_norm": 6.89453125, + "learning_rate": 7.867630700778644e-06, + "loss": 2.755, + "mean_token_accuracy": 0.4735441452723857, + "step": 11502 + }, + { + "epoch": 2.132554690396737, + "grad_norm": 6.03125, + "learning_rate": 7.867445309603263e-06, + "loss": 2.5196, + "mean_token_accuracy": 0.5060477923099617, + "step": 11503 + }, + { + "epoch": 2.132740081572117, + "grad_norm": 7.0625, + "learning_rate": 7.867259918427883e-06, + "loss": 2.2935, + "mean_token_accuracy": 0.5233087427405431, + "step": 11504 + }, + { + "epoch": 2.1329254727474973, + "grad_norm": 10.53125, + "learning_rate": 7.867074527252504e-06, + "loss": 3.4776, + "mean_token_accuracy": 0.4454939000393546, + "step": 11505 + }, + { + "epoch": 2.133110863922877, + "grad_norm": 6.37109375, + "learning_rate": 7.866889136077124e-06, + "loss": 2.978, + "mean_token_accuracy": 0.48537578674564974, + "step": 11506 + }, + { + "epoch": 2.1332962550982573, + "grad_norm": 6.7265625, + "learning_rate": 7.866703744901743e-06, + "loss": 3.1822, + "mean_token_accuracy": 0.4754569190600522, + "step": 11507 + }, + { + "epoch": 2.1334816462736375, + "grad_norm": 6.76171875, + "learning_rate": 7.866518353726364e-06, + "loss": 3.5361, + "mean_token_accuracy": 0.4441272861824717, + "step": 11508 + }, + { + "epoch": 2.133667037449017, + "grad_norm": 6.640625, + "learning_rate": 7.866332962550983e-06, + "loss": 2.6576, + "mean_token_accuracy": 0.5000692041522491, + "step": 11509 + }, + { + "epoch": 2.1338524286243974, + "grad_norm": 7.62109375, + "learning_rate": 7.866147571375603e-06, + "loss": 2.7419, + "mean_token_accuracy": 0.4789335088874259, + "step": 11510 + }, + { + "epoch": 2.1340378197997776, + "grad_norm": 6.33203125, + "learning_rate": 7.865962180200224e-06, + "loss": 2.4501, + "mean_token_accuracy": 0.49930715935334874, + "step": 11511 + }, + { + "epoch": 2.1342232109751578, + "grad_norm": 11.0625, + "learning_rate": 7.865776789024842e-06, + "loss": 3.669, + "mean_token_accuracy": 0.4354520817935452, + "step": 11512 + }, + { + "epoch": 2.1344086021505375, + "grad_norm": 10.9140625, + "learning_rate": 7.865591397849463e-06, + "loss": 2.5974, + "mean_token_accuracy": 0.5125889726973335, + "step": 11513 + }, + { + "epoch": 2.1345939933259177, + "grad_norm": 7.80078125, + "learning_rate": 7.865406006674083e-06, + "loss": 2.6478, + "mean_token_accuracy": 0.4849318658280922, + "step": 11514 + }, + { + "epoch": 2.134779384501298, + "grad_norm": 8.7109375, + "learning_rate": 7.865220615498704e-06, + "loss": 2.9767, + "mean_token_accuracy": 0.47253797939020503, + "step": 11515 + }, + { + "epoch": 2.1349647756766776, + "grad_norm": 13.8515625, + "learning_rate": 7.865035224323323e-06, + "loss": 2.4862, + "mean_token_accuracy": 0.5020785838809173, + "step": 11516 + }, + { + "epoch": 2.135150166852058, + "grad_norm": 10.703125, + "learning_rate": 7.864849833147943e-06, + "loss": 2.4018, + "mean_token_accuracy": 0.547474528506395, + "step": 11517 + }, + { + "epoch": 2.135335558027438, + "grad_norm": 7.37109375, + "learning_rate": 7.864664441972562e-06, + "loss": 3.1074, + "mean_token_accuracy": 0.4631336405529954, + "step": 11518 + }, + { + "epoch": 2.1355209492028178, + "grad_norm": 9.328125, + "learning_rate": 7.864479050797182e-06, + "loss": 3.0804, + "mean_token_accuracy": 0.46291208791208793, + "step": 11519 + }, + { + "epoch": 2.135706340378198, + "grad_norm": 13.125, + "learning_rate": 7.864293659621803e-06, + "loss": 3.0889, + "mean_token_accuracy": 0.45961571161367115, + "step": 11520 + }, + { + "epoch": 2.135891731553578, + "grad_norm": 9.546875, + "learning_rate": 7.864108268446422e-06, + "loss": 2.6312, + "mean_token_accuracy": 0.4962029161603888, + "step": 11521 + }, + { + "epoch": 2.136077122728958, + "grad_norm": 8.578125, + "learning_rate": 7.863922877271042e-06, + "loss": 2.2144, + "mean_token_accuracy": 0.5345740413925878, + "step": 11522 + }, + { + "epoch": 2.136262513904338, + "grad_norm": 6.4296875, + "learning_rate": 7.863737486095663e-06, + "loss": 3.0574, + "mean_token_accuracy": 0.4842914438502674, + "step": 11523 + }, + { + "epoch": 2.1364479050797183, + "grad_norm": 7.7265625, + "learning_rate": 7.863552094920283e-06, + "loss": 2.9194, + "mean_token_accuracy": 0.4962624584717608, + "step": 11524 + }, + { + "epoch": 2.1366332962550985, + "grad_norm": 7.0, + "learning_rate": 7.863366703744902e-06, + "loss": 2.3082, + "mean_token_accuracy": 0.5306609130138542, + "step": 11525 + }, + { + "epoch": 2.136818687430478, + "grad_norm": 6.578125, + "learning_rate": 7.863181312569522e-06, + "loss": 2.602, + "mean_token_accuracy": 0.5008792965627498, + "step": 11526 + }, + { + "epoch": 2.1370040786058584, + "grad_norm": 8.3515625, + "learning_rate": 7.862995921394141e-06, + "loss": 2.7841, + "mean_token_accuracy": 0.48615635179153094, + "step": 11527 + }, + { + "epoch": 2.1371894697812386, + "grad_norm": 10.078125, + "learning_rate": 7.862810530218762e-06, + "loss": 2.8097, + "mean_token_accuracy": 0.475115379817887, + "step": 11528 + }, + { + "epoch": 2.1373748609566183, + "grad_norm": 7.82421875, + "learning_rate": 7.862625139043382e-06, + "loss": 2.5396, + "mean_token_accuracy": 0.49343419925777904, + "step": 11529 + }, + { + "epoch": 2.1375602521319985, + "grad_norm": 6.3984375, + "learning_rate": 7.862439747868003e-06, + "loss": 2.9166, + "mean_token_accuracy": 0.48377642872095455, + "step": 11530 + }, + { + "epoch": 2.1377456433073787, + "grad_norm": 8.7265625, + "learning_rate": 7.862254356692623e-06, + "loss": 3.2184, + "mean_token_accuracy": 0.4796818510484454, + "step": 11531 + }, + { + "epoch": 2.1379310344827585, + "grad_norm": 9.265625, + "learning_rate": 7.862068965517242e-06, + "loss": 2.1981, + "mean_token_accuracy": 0.5180988353792887, + "step": 11532 + }, + { + "epoch": 2.1381164256581386, + "grad_norm": 6.7265625, + "learning_rate": 7.861883574341862e-06, + "loss": 2.9197, + "mean_token_accuracy": 0.4667146455559554, + "step": 11533 + }, + { + "epoch": 2.138301816833519, + "grad_norm": 6.87109375, + "learning_rate": 7.861698183166481e-06, + "loss": 2.5734, + "mean_token_accuracy": 0.5165238678090576, + "step": 11534 + }, + { + "epoch": 2.1384872080088986, + "grad_norm": 6.99609375, + "learning_rate": 7.861512791991102e-06, + "loss": 2.8794, + "mean_token_accuracy": 0.49425915800984144, + "step": 11535 + }, + { + "epoch": 2.1386725991842788, + "grad_norm": 7.60546875, + "learning_rate": 7.861327400815722e-06, + "loss": 3.0902, + "mean_token_accuracy": 0.48586500743946975, + "step": 11536 + }, + { + "epoch": 2.138857990359659, + "grad_norm": 6.80859375, + "learning_rate": 7.861142009640341e-06, + "loss": 2.3077, + "mean_token_accuracy": 0.5285754112071369, + "step": 11537 + }, + { + "epoch": 2.139043381535039, + "grad_norm": 6.73046875, + "learning_rate": 7.860956618464962e-06, + "loss": 2.3313, + "mean_token_accuracy": 0.5224636497304362, + "step": 11538 + }, + { + "epoch": 2.139228772710419, + "grad_norm": 7.015625, + "learning_rate": 7.860771227289582e-06, + "loss": 3.2249, + "mean_token_accuracy": 0.44451047392660825, + "step": 11539 + }, + { + "epoch": 2.139414163885799, + "grad_norm": 7.64453125, + "learning_rate": 7.860585836114203e-06, + "loss": 2.7683, + "mean_token_accuracy": 0.4900969812867095, + "step": 11540 + }, + { + "epoch": 2.1395995550611793, + "grad_norm": 6.78125, + "learning_rate": 7.860400444938821e-06, + "loss": 2.9554, + "mean_token_accuracy": 0.4647802528597231, + "step": 11541 + }, + { + "epoch": 2.139784946236559, + "grad_norm": 8.3203125, + "learning_rate": 7.860215053763442e-06, + "loss": 3.3257, + "mean_token_accuracy": 0.43535040082751486, + "step": 11542 + }, + { + "epoch": 2.139970337411939, + "grad_norm": 6.265625, + "learning_rate": 7.86002966258806e-06, + "loss": 2.7118, + "mean_token_accuracy": 0.4671033478893741, + "step": 11543 + }, + { + "epoch": 2.1401557285873194, + "grad_norm": 6.42578125, + "learning_rate": 7.859844271412681e-06, + "loss": 3.5109, + "mean_token_accuracy": 0.43784639746634996, + "step": 11544 + }, + { + "epoch": 2.140341119762699, + "grad_norm": 7.28515625, + "learning_rate": 7.859658880237302e-06, + "loss": 2.4091, + "mean_token_accuracy": 0.4960323185687491, + "step": 11545 + }, + { + "epoch": 2.1405265109380793, + "grad_norm": 6.6328125, + "learning_rate": 7.859473489061922e-06, + "loss": 2.4744, + "mean_token_accuracy": 0.5087961000423908, + "step": 11546 + }, + { + "epoch": 2.1407119021134595, + "grad_norm": 6.0078125, + "learning_rate": 7.859288097886541e-06, + "loss": 2.9778, + "mean_token_accuracy": 0.4682507169192954, + "step": 11547 + }, + { + "epoch": 2.1408972932888393, + "grad_norm": 6.61328125, + "learning_rate": 7.859102706711161e-06, + "loss": 3.2734, + "mean_token_accuracy": 0.4403899721448468, + "step": 11548 + }, + { + "epoch": 2.1410826844642195, + "grad_norm": 6.63671875, + "learning_rate": 7.858917315535782e-06, + "loss": 2.6088, + "mean_token_accuracy": 0.48622167789344767, + "step": 11549 + }, + { + "epoch": 2.1412680756395996, + "grad_norm": 7.36328125, + "learning_rate": 7.8587319243604e-06, + "loss": 2.793, + "mean_token_accuracy": 0.4679232232405326, + "step": 11550 + }, + { + "epoch": 2.14145346681498, + "grad_norm": 7.390625, + "learning_rate": 7.858546533185021e-06, + "loss": 3.0518, + "mean_token_accuracy": 0.4737612887478643, + "step": 11551 + }, + { + "epoch": 2.1416388579903596, + "grad_norm": 6.18359375, + "learning_rate": 7.85836114200964e-06, + "loss": 2.8566, + "mean_token_accuracy": 0.4680456112437019, + "step": 11552 + }, + { + "epoch": 2.1418242491657398, + "grad_norm": 6.59375, + "learning_rate": 7.85817575083426e-06, + "loss": 2.6595, + "mean_token_accuracy": 0.49789076376554176, + "step": 11553 + }, + { + "epoch": 2.14200964034112, + "grad_norm": 9.6171875, + "learning_rate": 7.857990359658881e-06, + "loss": 2.9908, + "mean_token_accuracy": 0.47617804464169167, + "step": 11554 + }, + { + "epoch": 2.1421950315164997, + "grad_norm": 8.4296875, + "learning_rate": 7.857804968483501e-06, + "loss": 3.6647, + "mean_token_accuracy": 0.43835192069392814, + "step": 11555 + }, + { + "epoch": 2.14238042269188, + "grad_norm": 6.34375, + "learning_rate": 7.85761957730812e-06, + "loss": 2.85, + "mean_token_accuracy": 0.4720938943688351, + "step": 11556 + }, + { + "epoch": 2.14256581386726, + "grad_norm": 7.8046875, + "learning_rate": 7.85743418613274e-06, + "loss": 2.5237, + "mean_token_accuracy": 0.5241442144995444, + "step": 11557 + }, + { + "epoch": 2.14275120504264, + "grad_norm": 7.51171875, + "learning_rate": 7.857248794957361e-06, + "loss": 3.225, + "mean_token_accuracy": 0.4528099910793934, + "step": 11558 + }, + { + "epoch": 2.14293659621802, + "grad_norm": 7.328125, + "learning_rate": 7.85706340378198e-06, + "loss": 3.0033, + "mean_token_accuracy": 0.4754738015607581, + "step": 11559 + }, + { + "epoch": 2.1431219873934, + "grad_norm": 6.16015625, + "learning_rate": 7.8568780126066e-06, + "loss": 3.0342, + "mean_token_accuracy": 0.44600651996740015, + "step": 11560 + }, + { + "epoch": 2.14330737856878, + "grad_norm": 6.5234375, + "learning_rate": 7.85669262143122e-06, + "loss": 2.8363, + "mean_token_accuracy": 0.47174122174122174, + "step": 11561 + }, + { + "epoch": 2.14349276974416, + "grad_norm": 7.50390625, + "learning_rate": 7.856507230255841e-06, + "loss": 3.0203, + "mean_token_accuracy": 0.47149087384913585, + "step": 11562 + }, + { + "epoch": 2.1436781609195403, + "grad_norm": 7.96484375, + "learning_rate": 7.85632183908046e-06, + "loss": 2.7691, + "mean_token_accuracy": 0.4774445564516129, + "step": 11563 + }, + { + "epoch": 2.14386355209492, + "grad_norm": 7.31640625, + "learning_rate": 7.85613644790508e-06, + "loss": 2.5543, + "mean_token_accuracy": 0.5138274336283186, + "step": 11564 + }, + { + "epoch": 2.1440489432703003, + "grad_norm": 6.1328125, + "learning_rate": 7.8559510567297e-06, + "loss": 3.075, + "mean_token_accuracy": 0.4583333333333333, + "step": 11565 + }, + { + "epoch": 2.1442343344456805, + "grad_norm": 7.25, + "learning_rate": 7.85576566555432e-06, + "loss": 2.7653, + "mean_token_accuracy": 0.499865627519484, + "step": 11566 + }, + { + "epoch": 2.14441972562106, + "grad_norm": 7.80078125, + "learning_rate": 7.85558027437894e-06, + "loss": 2.4609, + "mean_token_accuracy": 0.5140958517921869, + "step": 11567 + }, + { + "epoch": 2.1446051167964404, + "grad_norm": 6.05859375, + "learning_rate": 7.85539488320356e-06, + "loss": 2.5637, + "mean_token_accuracy": 0.48964745383324004, + "step": 11568 + }, + { + "epoch": 2.1447905079718206, + "grad_norm": 6.1171875, + "learning_rate": 7.85520949202818e-06, + "loss": 2.9872, + "mean_token_accuracy": 0.4796568308852203, + "step": 11569 + }, + { + "epoch": 2.1449758991472008, + "grad_norm": 6.48828125, + "learning_rate": 7.8550241008528e-06, + "loss": 2.9157, + "mean_token_accuracy": 0.4640637450199203, + "step": 11570 + }, + { + "epoch": 2.1451612903225805, + "grad_norm": 7.03125, + "learning_rate": 7.85483870967742e-06, + "loss": 3.1482, + "mean_token_accuracy": 0.46536532465771996, + "step": 11571 + }, + { + "epoch": 2.1453466814979607, + "grad_norm": 5.859375, + "learning_rate": 7.85465331850204e-06, + "loss": 2.725, + "mean_token_accuracy": 0.48176881303335917, + "step": 11572 + }, + { + "epoch": 2.145532072673341, + "grad_norm": 6.25, + "learning_rate": 7.85446792732666e-06, + "loss": 2.8241, + "mean_token_accuracy": 0.4754338792471278, + "step": 11573 + }, + { + "epoch": 2.1457174638487206, + "grad_norm": 5.51953125, + "learning_rate": 7.85428253615128e-06, + "loss": 2.2883, + "mean_token_accuracy": 0.5590506472859414, + "step": 11574 + }, + { + "epoch": 2.145902855024101, + "grad_norm": 6.62890625, + "learning_rate": 7.8540971449759e-06, + "loss": 2.6107, + "mean_token_accuracy": 0.49267139479905436, + "step": 11575 + }, + { + "epoch": 2.146088246199481, + "grad_norm": 8.984375, + "learning_rate": 7.85391175380052e-06, + "loss": 2.9945, + "mean_token_accuracy": 0.4733044733044733, + "step": 11576 + }, + { + "epoch": 2.1462736373748608, + "grad_norm": 6.828125, + "learning_rate": 7.853726362625139e-06, + "loss": 2.4319, + "mean_token_accuracy": 0.529519033508092, + "step": 11577 + }, + { + "epoch": 2.146459028550241, + "grad_norm": 8.90625, + "learning_rate": 7.853540971449761e-06, + "loss": 3.1455, + "mean_token_accuracy": 0.4513576204120617, + "step": 11578 + }, + { + "epoch": 2.146644419725621, + "grad_norm": 6.5390625, + "learning_rate": 7.85335558027438e-06, + "loss": 2.9674, + "mean_token_accuracy": 0.46641969407265776, + "step": 11579 + }, + { + "epoch": 2.146829810901001, + "grad_norm": 6.7421875, + "learning_rate": 7.853170189099e-06, + "loss": 2.9822, + "mean_token_accuracy": 0.4791574605980262, + "step": 11580 + }, + { + "epoch": 2.147015202076381, + "grad_norm": 8.2734375, + "learning_rate": 7.852984797923619e-06, + "loss": 2.6795, + "mean_token_accuracy": 0.4926612305411416, + "step": 11581 + }, + { + "epoch": 2.1472005932517613, + "grad_norm": 7.0859375, + "learning_rate": 7.85279940674824e-06, + "loss": 2.8516, + "mean_token_accuracy": 0.5040310540459839, + "step": 11582 + }, + { + "epoch": 2.1473859844271415, + "grad_norm": 7.32421875, + "learning_rate": 7.85261401557286e-06, + "loss": 3.3562, + "mean_token_accuracy": 0.4627863953322897, + "step": 11583 + }, + { + "epoch": 2.147571375602521, + "grad_norm": 8.7109375, + "learning_rate": 7.852428624397479e-06, + "loss": 3.2026, + "mean_token_accuracy": 0.4621901083842033, + "step": 11584 + }, + { + "epoch": 2.1477567667779014, + "grad_norm": 7.77734375, + "learning_rate": 7.8522432332221e-06, + "loss": 2.5969, + "mean_token_accuracy": 0.501910132799709, + "step": 11585 + }, + { + "epoch": 2.1479421579532816, + "grad_norm": 6.82421875, + "learning_rate": 7.85205784204672e-06, + "loss": 2.7833, + "mean_token_accuracy": 0.5053075241960662, + "step": 11586 + }, + { + "epoch": 2.1481275491286613, + "grad_norm": 7.08984375, + "learning_rate": 7.85187245087134e-06, + "loss": 2.661, + "mean_token_accuracy": 0.4914048606994665, + "step": 11587 + }, + { + "epoch": 2.1483129403040415, + "grad_norm": 7.21484375, + "learning_rate": 7.851687059695959e-06, + "loss": 2.8188, + "mean_token_accuracy": 0.5149067585223899, + "step": 11588 + }, + { + "epoch": 2.1484983314794217, + "grad_norm": 6.109375, + "learning_rate": 7.85150166852058e-06, + "loss": 2.901, + "mean_token_accuracy": 0.4795124481327801, + "step": 11589 + }, + { + "epoch": 2.1486837226548015, + "grad_norm": 7.609375, + "learning_rate": 7.851316277345198e-06, + "loss": 2.6326, + "mean_token_accuracy": 0.5131058720164007, + "step": 11590 + }, + { + "epoch": 2.1488691138301816, + "grad_norm": 8.859375, + "learning_rate": 7.851130886169819e-06, + "loss": 2.3959, + "mean_token_accuracy": 0.510192329839917, + "step": 11591 + }, + { + "epoch": 2.149054505005562, + "grad_norm": 5.71484375, + "learning_rate": 7.85094549499444e-06, + "loss": 2.941, + "mean_token_accuracy": 0.47459714666940367, + "step": 11592 + }, + { + "epoch": 2.1492398961809416, + "grad_norm": 8.3515625, + "learning_rate": 7.850760103819058e-06, + "loss": 3.0336, + "mean_token_accuracy": 0.4475625632865616, + "step": 11593 + }, + { + "epoch": 2.1494252873563218, + "grad_norm": 6.36328125, + "learning_rate": 7.850574712643679e-06, + "loss": 2.5556, + "mean_token_accuracy": 0.5026570803376055, + "step": 11594 + }, + { + "epoch": 2.149610678531702, + "grad_norm": 7.37890625, + "learning_rate": 7.850389321468299e-06, + "loss": 3.1539, + "mean_token_accuracy": 0.45358133463860334, + "step": 11595 + }, + { + "epoch": 2.149796069707082, + "grad_norm": 7.68359375, + "learning_rate": 7.85020393029292e-06, + "loss": 3.7394, + "mean_token_accuracy": 0.42516483516483516, + "step": 11596 + }, + { + "epoch": 2.149981460882462, + "grad_norm": 6.90234375, + "learning_rate": 7.850018539117538e-06, + "loss": 3.2947, + "mean_token_accuracy": 0.44200138504155123, + "step": 11597 + }, + { + "epoch": 2.150166852057842, + "grad_norm": 8.203125, + "learning_rate": 7.849833147942159e-06, + "loss": 2.2175, + "mean_token_accuracy": 0.5135293428604897, + "step": 11598 + }, + { + "epoch": 2.1503522432332223, + "grad_norm": 7.76953125, + "learning_rate": 7.849647756766778e-06, + "loss": 3.227, + "mean_token_accuracy": 0.45668196356556806, + "step": 11599 + }, + { + "epoch": 2.150537634408602, + "grad_norm": 5.609375, + "learning_rate": 7.849462365591398e-06, + "loss": 2.5068, + "mean_token_accuracy": 0.4908872639965077, + "step": 11600 + }, + { + "epoch": 2.150723025583982, + "grad_norm": 6.125, + "learning_rate": 7.849276974416019e-06, + "loss": 2.4503, + "mean_token_accuracy": 0.525016160310278, + "step": 11601 + }, + { + "epoch": 2.1509084167593624, + "grad_norm": 7.32421875, + "learning_rate": 7.849091583240639e-06, + "loss": 3.8412, + "mean_token_accuracy": 0.41142638036809814, + "step": 11602 + }, + { + "epoch": 2.151093807934742, + "grad_norm": 6.671875, + "learning_rate": 7.848906192065258e-06, + "loss": 3.0111, + "mean_token_accuracy": 0.44451400845202693, + "step": 11603 + }, + { + "epoch": 2.1512791991101223, + "grad_norm": 5.78125, + "learning_rate": 7.848720800889878e-06, + "loss": 2.4512, + "mean_token_accuracy": 0.4895552992438806, + "step": 11604 + }, + { + "epoch": 2.1514645902855025, + "grad_norm": 6.86328125, + "learning_rate": 7.848535409714499e-06, + "loss": 3.2933, + "mean_token_accuracy": 0.454488971730351, + "step": 11605 + }, + { + "epoch": 2.1516499814608823, + "grad_norm": 6.1875, + "learning_rate": 7.848350018539118e-06, + "loss": 3.2659, + "mean_token_accuracy": 0.4210592527361932, + "step": 11606 + }, + { + "epoch": 2.1518353726362625, + "grad_norm": 5.52734375, + "learning_rate": 7.848164627363738e-06, + "loss": 2.4509, + "mean_token_accuracy": 0.5072610178052784, + "step": 11607 + }, + { + "epoch": 2.1520207638116426, + "grad_norm": 7.328125, + "learning_rate": 7.847979236188357e-06, + "loss": 2.7195, + "mean_token_accuracy": 0.4990375978442192, + "step": 11608 + }, + { + "epoch": 2.152206154987023, + "grad_norm": 6.73046875, + "learning_rate": 7.847793845012977e-06, + "loss": 2.3363, + "mean_token_accuracy": 0.5520441434662654, + "step": 11609 + }, + { + "epoch": 2.1523915461624026, + "grad_norm": 7.796875, + "learning_rate": 7.847608453837598e-06, + "loss": 3.5386, + "mean_token_accuracy": 0.44560185185185186, + "step": 11610 + }, + { + "epoch": 2.1525769373377828, + "grad_norm": 7.28515625, + "learning_rate": 7.847423062662218e-06, + "loss": 2.657, + "mean_token_accuracy": 0.4910896476669841, + "step": 11611 + }, + { + "epoch": 2.152762328513163, + "grad_norm": 7.62890625, + "learning_rate": 7.847237671486839e-06, + "loss": 3.4563, + "mean_token_accuracy": 0.46261859582542697, + "step": 11612 + }, + { + "epoch": 2.1529477196885427, + "grad_norm": 7.27734375, + "learning_rate": 7.847052280311458e-06, + "loss": 2.8229, + "mean_token_accuracy": 0.4602754237288136, + "step": 11613 + }, + { + "epoch": 2.153133110863923, + "grad_norm": 13.046875, + "learning_rate": 7.846866889136078e-06, + "loss": 2.1644, + "mean_token_accuracy": 0.5269957840337277, + "step": 11614 + }, + { + "epoch": 2.153318502039303, + "grad_norm": 6.328125, + "learning_rate": 7.846681497960697e-06, + "loss": 2.8322, + "mean_token_accuracy": 0.46472053126729385, + "step": 11615 + }, + { + "epoch": 2.153503893214683, + "grad_norm": 6.953125, + "learning_rate": 7.846496106785318e-06, + "loss": 3.0569, + "mean_token_accuracy": 0.44762897342365815, + "step": 11616 + }, + { + "epoch": 2.153689284390063, + "grad_norm": 10.4140625, + "learning_rate": 7.846310715609938e-06, + "loss": 2.5276, + "mean_token_accuracy": 0.500125156445557, + "step": 11617 + }, + { + "epoch": 2.153874675565443, + "grad_norm": 7.875, + "learning_rate": 7.846125324434559e-06, + "loss": 2.7325, + "mean_token_accuracy": 0.5011786892975012, + "step": 11618 + }, + { + "epoch": 2.154060066740823, + "grad_norm": 6.52734375, + "learning_rate": 7.845939933259177e-06, + "loss": 2.8773, + "mean_token_accuracy": 0.46140238018655516, + "step": 11619 + }, + { + "epoch": 2.154245457916203, + "grad_norm": 8.1015625, + "learning_rate": 7.845754542083798e-06, + "loss": 3.1528, + "mean_token_accuracy": 0.46240820608462524, + "step": 11620 + }, + { + "epoch": 2.1544308490915833, + "grad_norm": 8.0234375, + "learning_rate": 7.845569150908418e-06, + "loss": 2.8611, + "mean_token_accuracy": 0.47739291380222104, + "step": 11621 + }, + { + "epoch": 2.1546162402669635, + "grad_norm": 6.44140625, + "learning_rate": 7.845383759733037e-06, + "loss": 2.6951, + "mean_token_accuracy": 0.5041375743470391, + "step": 11622 + }, + { + "epoch": 2.1548016314423433, + "grad_norm": 8.3828125, + "learning_rate": 7.845198368557658e-06, + "loss": 2.868, + "mean_token_accuracy": 0.4819632560010728, + "step": 11623 + }, + { + "epoch": 2.1549870226177235, + "grad_norm": 11.6015625, + "learning_rate": 7.845012977382276e-06, + "loss": 3.8543, + "mean_token_accuracy": 0.42042755344418054, + "step": 11624 + }, + { + "epoch": 2.1551724137931036, + "grad_norm": 9.3046875, + "learning_rate": 7.844827586206897e-06, + "loss": 1.9827, + "mean_token_accuracy": 0.5706386014983946, + "step": 11625 + }, + { + "epoch": 2.1553578049684834, + "grad_norm": 7.03515625, + "learning_rate": 7.844642195031517e-06, + "loss": 2.6455, + "mean_token_accuracy": 0.49177153920619554, + "step": 11626 + }, + { + "epoch": 2.1555431961438636, + "grad_norm": 11.109375, + "learning_rate": 7.844456803856138e-06, + "loss": 3.7379, + "mean_token_accuracy": 0.41831873367143074, + "step": 11627 + }, + { + "epoch": 2.1557285873192438, + "grad_norm": 7.6015625, + "learning_rate": 7.844271412680757e-06, + "loss": 2.9217, + "mean_token_accuracy": 0.46221966647498564, + "step": 11628 + }, + { + "epoch": 2.1559139784946235, + "grad_norm": 11.9296875, + "learning_rate": 7.844086021505377e-06, + "loss": 2.9899, + "mean_token_accuracy": 0.4551961823966066, + "step": 11629 + }, + { + "epoch": 2.1560993696700037, + "grad_norm": 8.9296875, + "learning_rate": 7.843900630329998e-06, + "loss": 2.6481, + "mean_token_accuracy": 0.5051657050852006, + "step": 11630 + }, + { + "epoch": 2.156284760845384, + "grad_norm": 5.890625, + "learning_rate": 7.843715239154616e-06, + "loss": 2.8358, + "mean_token_accuracy": 0.4611421842802473, + "step": 11631 + }, + { + "epoch": 2.1564701520207636, + "grad_norm": 7.7421875, + "learning_rate": 7.843529847979237e-06, + "loss": 2.838, + "mean_token_accuracy": 0.48794505186431175, + "step": 11632 + }, + { + "epoch": 2.156655543196144, + "grad_norm": 12.3359375, + "learning_rate": 7.843344456803856e-06, + "loss": 2.4521, + "mean_token_accuracy": 0.5150042869391255, + "step": 11633 + }, + { + "epoch": 2.156840934371524, + "grad_norm": 9.5078125, + "learning_rate": 7.843159065628476e-06, + "loss": 3.0226, + "mean_token_accuracy": 0.44533737680060653, + "step": 11634 + }, + { + "epoch": 2.1570263255469038, + "grad_norm": 7.546875, + "learning_rate": 7.842973674453097e-06, + "loss": 2.4126, + "mean_token_accuracy": 0.513152514651765, + "step": 11635 + }, + { + "epoch": 2.157211716722284, + "grad_norm": 10.265625, + "learning_rate": 7.842788283277717e-06, + "loss": 2.7197, + "mean_token_accuracy": 0.4929230769230769, + "step": 11636 + }, + { + "epoch": 2.157397107897664, + "grad_norm": 12.0546875, + "learning_rate": 7.842602892102336e-06, + "loss": 2.9867, + "mean_token_accuracy": 0.4549065033923548, + "step": 11637 + }, + { + "epoch": 2.157582499073044, + "grad_norm": 7.42578125, + "learning_rate": 7.842417500926956e-06, + "loss": 2.8258, + "mean_token_accuracy": 0.46657034020753974, + "step": 11638 + }, + { + "epoch": 2.157767890248424, + "grad_norm": 9.125, + "learning_rate": 7.842232109751577e-06, + "loss": 2.9667, + "mean_token_accuracy": 0.4741003547896604, + "step": 11639 + }, + { + "epoch": 2.1579532814238043, + "grad_norm": 8.140625, + "learning_rate": 7.842046718576196e-06, + "loss": 3.1668, + "mean_token_accuracy": 0.4607828089025326, + "step": 11640 + }, + { + "epoch": 2.1581386725991845, + "grad_norm": 8.7890625, + "learning_rate": 7.841861327400816e-06, + "loss": 2.8076, + "mean_token_accuracy": 0.4656987698104843, + "step": 11641 + }, + { + "epoch": 2.158324063774564, + "grad_norm": 6.38671875, + "learning_rate": 7.841675936225435e-06, + "loss": 2.3557, + "mean_token_accuracy": 0.547244567174931, + "step": 11642 + }, + { + "epoch": 2.1585094549499444, + "grad_norm": 9.09375, + "learning_rate": 7.841490545050057e-06, + "loss": 3.2148, + "mean_token_accuracy": 0.43332416827055265, + "step": 11643 + }, + { + "epoch": 2.1586948461253246, + "grad_norm": 8.0390625, + "learning_rate": 7.841305153874676e-06, + "loss": 2.7146, + "mean_token_accuracy": 0.5033197437390798, + "step": 11644 + }, + { + "epoch": 2.1588802373007043, + "grad_norm": 7.55078125, + "learning_rate": 7.841119762699297e-06, + "loss": 2.8969, + "mean_token_accuracy": 0.4794195250659631, + "step": 11645 + }, + { + "epoch": 2.1590656284760845, + "grad_norm": 6.65234375, + "learning_rate": 7.840934371523915e-06, + "loss": 3.2974, + "mean_token_accuracy": 0.442995372318048, + "step": 11646 + }, + { + "epoch": 2.1592510196514647, + "grad_norm": 8.1484375, + "learning_rate": 7.840748980348536e-06, + "loss": 2.9653, + "mean_token_accuracy": 0.5250687863038827, + "step": 11647 + }, + { + "epoch": 2.1594364108268445, + "grad_norm": 7.6015625, + "learning_rate": 7.840563589173156e-06, + "loss": 2.8016, + "mean_token_accuracy": 0.525, + "step": 11648 + }, + { + "epoch": 2.1596218020022246, + "grad_norm": 8.1015625, + "learning_rate": 7.840378197997775e-06, + "loss": 3.5144, + "mean_token_accuracy": 0.44856039325842695, + "step": 11649 + }, + { + "epoch": 2.159807193177605, + "grad_norm": 12.953125, + "learning_rate": 7.840192806822396e-06, + "loss": 2.9875, + "mean_token_accuracy": 0.45776199804113615, + "step": 11650 + }, + { + "epoch": 2.1599925843529846, + "grad_norm": 6.66015625, + "learning_rate": 7.840007415647016e-06, + "loss": 2.5638, + "mean_token_accuracy": 0.521012069216228, + "step": 11651 + }, + { + "epoch": 2.1601779755283648, + "grad_norm": 6.33984375, + "learning_rate": 7.839822024471637e-06, + "loss": 2.6743, + "mean_token_accuracy": 0.47246945240609445, + "step": 11652 + }, + { + "epoch": 2.160363366703745, + "grad_norm": 6.11328125, + "learning_rate": 7.839636633296255e-06, + "loss": 2.2056, + "mean_token_accuracy": 0.5294992563212693, + "step": 11653 + }, + { + "epoch": 2.160548757879125, + "grad_norm": 7.375, + "learning_rate": 7.839451242120876e-06, + "loss": 3.0935, + "mean_token_accuracy": 0.4359602892511337, + "step": 11654 + }, + { + "epoch": 2.160734149054505, + "grad_norm": 5.96875, + "learning_rate": 7.839265850945496e-06, + "loss": 2.3627, + "mean_token_accuracy": 0.5240197351337315, + "step": 11655 + }, + { + "epoch": 2.160919540229885, + "grad_norm": 8.0546875, + "learning_rate": 7.839080459770115e-06, + "loss": 2.4664, + "mean_token_accuracy": 0.5241483724451174, + "step": 11656 + }, + { + "epoch": 2.1611049314052653, + "grad_norm": 5.91015625, + "learning_rate": 7.838895068594736e-06, + "loss": 2.8979, + "mean_token_accuracy": 0.46977950713359273, + "step": 11657 + }, + { + "epoch": 2.161290322580645, + "grad_norm": 6.44921875, + "learning_rate": 7.838709677419354e-06, + "loss": 2.3012, + "mean_token_accuracy": 0.5335451545887816, + "step": 11658 + }, + { + "epoch": 2.161475713756025, + "grad_norm": 6.30078125, + "learning_rate": 7.838524286243977e-06, + "loss": 3.052, + "mean_token_accuracy": 0.4672131147540984, + "step": 11659 + }, + { + "epoch": 2.1616611049314054, + "grad_norm": 7.05078125, + "learning_rate": 7.838338895068595e-06, + "loss": 3.0092, + "mean_token_accuracy": 0.475129111171514, + "step": 11660 + }, + { + "epoch": 2.161846496106785, + "grad_norm": 6.26171875, + "learning_rate": 7.838153503893216e-06, + "loss": 2.3227, + "mean_token_accuracy": 0.5131421092812739, + "step": 11661 + }, + { + "epoch": 2.1620318872821653, + "grad_norm": 7.19140625, + "learning_rate": 7.837968112717835e-06, + "loss": 3.5782, + "mean_token_accuracy": 0.4424010592908636, + "step": 11662 + }, + { + "epoch": 2.1622172784575455, + "grad_norm": 7.50390625, + "learning_rate": 7.837782721542455e-06, + "loss": 2.7696, + "mean_token_accuracy": 0.4787002487562189, + "step": 11663 + }, + { + "epoch": 2.1624026696329253, + "grad_norm": 5.30078125, + "learning_rate": 7.837597330367076e-06, + "loss": 3.0263, + "mean_token_accuracy": 0.4685141255121846, + "step": 11664 + }, + { + "epoch": 2.1625880608083055, + "grad_norm": 6.3828125, + "learning_rate": 7.837411939191694e-06, + "loss": 2.854, + "mean_token_accuracy": 0.453438673746924, + "step": 11665 + }, + { + "epoch": 2.1627734519836856, + "grad_norm": 7.4140625, + "learning_rate": 7.837226548016315e-06, + "loss": 2.6018, + "mean_token_accuracy": 0.4949720670391061, + "step": 11666 + }, + { + "epoch": 2.162958843159066, + "grad_norm": 6.67578125, + "learning_rate": 7.837041156840935e-06, + "loss": 2.4184, + "mean_token_accuracy": 0.5136090491339697, + "step": 11667 + }, + { + "epoch": 2.1631442343344456, + "grad_norm": 6.37109375, + "learning_rate": 7.836855765665556e-06, + "loss": 2.8056, + "mean_token_accuracy": 0.4586860137041516, + "step": 11668 + }, + { + "epoch": 2.1633296255098258, + "grad_norm": 7.3515625, + "learning_rate": 7.836670374490175e-06, + "loss": 3.0698, + "mean_token_accuracy": 0.46689723320158105, + "step": 11669 + }, + { + "epoch": 2.163515016685206, + "grad_norm": 5.72265625, + "learning_rate": 7.836484983314795e-06, + "loss": 2.8279, + "mean_token_accuracy": 0.5243033636937505, + "step": 11670 + }, + { + "epoch": 2.1637004078605857, + "grad_norm": 7.83203125, + "learning_rate": 7.836299592139414e-06, + "loss": 2.2005, + "mean_token_accuracy": 0.5512626262626262, + "step": 11671 + }, + { + "epoch": 2.163885799035966, + "grad_norm": 6.64453125, + "learning_rate": 7.836114200964035e-06, + "loss": 2.6436, + "mean_token_accuracy": 0.49793032080027594, + "step": 11672 + }, + { + "epoch": 2.164071190211346, + "grad_norm": 6.10546875, + "learning_rate": 7.835928809788655e-06, + "loss": 2.7201, + "mean_token_accuracy": 0.5073295870999267, + "step": 11673 + }, + { + "epoch": 2.164256581386726, + "grad_norm": 7.01171875, + "learning_rate": 7.835743418613274e-06, + "loss": 2.403, + "mean_token_accuracy": 0.5456586826347305, + "step": 11674 + }, + { + "epoch": 2.164441972562106, + "grad_norm": 7.08203125, + "learning_rate": 7.835558027437894e-06, + "loss": 3.5827, + "mean_token_accuracy": 0.41216539196940727, + "step": 11675 + }, + { + "epoch": 2.164627363737486, + "grad_norm": 6.96875, + "learning_rate": 7.835372636262515e-06, + "loss": 3.4009, + "mean_token_accuracy": 0.43710335448776066, + "step": 11676 + }, + { + "epoch": 2.164812754912866, + "grad_norm": 6.77734375, + "learning_rate": 7.835187245087135e-06, + "loss": 2.5886, + "mean_token_accuracy": 0.5185280826339574, + "step": 11677 + }, + { + "epoch": 2.164998146088246, + "grad_norm": 6.359375, + "learning_rate": 7.835001853911754e-06, + "loss": 2.6307, + "mean_token_accuracy": 0.4783954438529157, + "step": 11678 + }, + { + "epoch": 2.1651835372636263, + "grad_norm": 6.73828125, + "learning_rate": 7.834816462736375e-06, + "loss": 2.9812, + "mean_token_accuracy": 0.4775578204625637, + "step": 11679 + }, + { + "epoch": 2.1653689284390065, + "grad_norm": 7.8515625, + "learning_rate": 7.834631071560993e-06, + "loss": 2.8219, + "mean_token_accuracy": 0.48304628632938645, + "step": 11680 + }, + { + "epoch": 2.1655543196143863, + "grad_norm": 8.2421875, + "learning_rate": 7.834445680385614e-06, + "loss": 3.319, + "mean_token_accuracy": 0.43412620831975673, + "step": 11681 + }, + { + "epoch": 2.1657397107897665, + "grad_norm": 6.76953125, + "learning_rate": 7.834260289210234e-06, + "loss": 2.9016, + "mean_token_accuracy": 0.48965198288819517, + "step": 11682 + }, + { + "epoch": 2.1659251019651466, + "grad_norm": 5.921875, + "learning_rate": 7.834074898034855e-06, + "loss": 2.6194, + "mean_token_accuracy": 0.5012262811410869, + "step": 11683 + }, + { + "epoch": 2.1661104931405264, + "grad_norm": 6.63671875, + "learning_rate": 7.833889506859474e-06, + "loss": 2.8585, + "mean_token_accuracy": 0.4804632662611137, + "step": 11684 + }, + { + "epoch": 2.1662958843159066, + "grad_norm": 7.109375, + "learning_rate": 7.833704115684094e-06, + "loss": 2.5131, + "mean_token_accuracy": 0.5015402843601896, + "step": 11685 + }, + { + "epoch": 2.1664812754912868, + "grad_norm": 6.2109375, + "learning_rate": 7.833518724508715e-06, + "loss": 2.5351, + "mean_token_accuracy": 0.4964262508122157, + "step": 11686 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 7.00390625, + "learning_rate": 7.833333333333333e-06, + "loss": 2.6604, + "mean_token_accuracy": 0.5026633024950939, + "step": 11687 + }, + { + "epoch": 2.1668520578420467, + "grad_norm": 7.46875, + "learning_rate": 7.833147942157954e-06, + "loss": 2.3652, + "mean_token_accuracy": 0.5031928480204342, + "step": 11688 + }, + { + "epoch": 2.167037449017427, + "grad_norm": 7.08203125, + "learning_rate": 7.832962550982573e-06, + "loss": 2.9314, + "mean_token_accuracy": 0.4708355228586443, + "step": 11689 + }, + { + "epoch": 2.1672228401928066, + "grad_norm": 7.50390625, + "learning_rate": 7.832777159807193e-06, + "loss": 2.8314, + "mean_token_accuracy": 0.48037341131481276, + "step": 11690 + }, + { + "epoch": 2.167408231368187, + "grad_norm": 8.296875, + "learning_rate": 7.832591768631814e-06, + "loss": 3.1353, + "mean_token_accuracy": 0.44041778762462414, + "step": 11691 + }, + { + "epoch": 2.167593622543567, + "grad_norm": 7.92578125, + "learning_rate": 7.832406377456434e-06, + "loss": 2.9751, + "mean_token_accuracy": 0.45702247191011236, + "step": 11692 + }, + { + "epoch": 2.167779013718947, + "grad_norm": 9.0078125, + "learning_rate": 7.832220986281055e-06, + "loss": 2.4721, + "mean_token_accuracy": 0.5291273313377849, + "step": 11693 + }, + { + "epoch": 2.167964404894327, + "grad_norm": 8.5546875, + "learning_rate": 7.832035595105673e-06, + "loss": 3.2931, + "mean_token_accuracy": 0.46428090945782324, + "step": 11694 + }, + { + "epoch": 2.168149796069707, + "grad_norm": 6.890625, + "learning_rate": 7.831850203930294e-06, + "loss": 2.8872, + "mean_token_accuracy": 0.47872797593467986, + "step": 11695 + }, + { + "epoch": 2.1683351872450873, + "grad_norm": 7.95703125, + "learning_rate": 7.831664812754913e-06, + "loss": 2.8209, + "mean_token_accuracy": 0.4648729446935725, + "step": 11696 + }, + { + "epoch": 2.168520578420467, + "grad_norm": 7.90234375, + "learning_rate": 7.831479421579533e-06, + "loss": 2.8322, + "mean_token_accuracy": 0.46124210201880106, + "step": 11697 + }, + { + "epoch": 2.1687059695958473, + "grad_norm": 8.6171875, + "learning_rate": 7.831294030404154e-06, + "loss": 2.7701, + "mean_token_accuracy": 0.502021018593371, + "step": 11698 + }, + { + "epoch": 2.1688913607712275, + "grad_norm": 7.359375, + "learning_rate": 7.831108639228774e-06, + "loss": 2.5262, + "mean_token_accuracy": 0.4878946623247912, + "step": 11699 + }, + { + "epoch": 2.169076751946607, + "grad_norm": 6.625, + "learning_rate": 7.830923248053393e-06, + "loss": 2.1812, + "mean_token_accuracy": 0.5614816700610998, + "step": 11700 + }, + { + "epoch": 2.1692621431219874, + "grad_norm": 6.61328125, + "learning_rate": 7.830737856878014e-06, + "loss": 2.9918, + "mean_token_accuracy": 0.470474879559532, + "step": 11701 + }, + { + "epoch": 2.1694475342973676, + "grad_norm": 8.3125, + "learning_rate": 7.830552465702634e-06, + "loss": 3.0012, + "mean_token_accuracy": 0.4483498153222924, + "step": 11702 + }, + { + "epoch": 2.1696329254727473, + "grad_norm": 8.3046875, + "learning_rate": 7.830367074527253e-06, + "loss": 3.1714, + "mean_token_accuracy": 0.4405934262693439, + "step": 11703 + }, + { + "epoch": 2.1698183166481275, + "grad_norm": 7.20703125, + "learning_rate": 7.830181683351873e-06, + "loss": 2.8371, + "mean_token_accuracy": 0.4741596917762721, + "step": 11704 + }, + { + "epoch": 2.1700037078235077, + "grad_norm": 6.35546875, + "learning_rate": 7.829996292176492e-06, + "loss": 3.0208, + "mean_token_accuracy": 0.4547750781813808, + "step": 11705 + }, + { + "epoch": 2.1701890989988875, + "grad_norm": 6.13671875, + "learning_rate": 7.829810901001113e-06, + "loss": 2.5817, + "mean_token_accuracy": 0.5038339502908514, + "step": 11706 + }, + { + "epoch": 2.1703744901742676, + "grad_norm": 8.203125, + "learning_rate": 7.829625509825733e-06, + "loss": 2.8177, + "mean_token_accuracy": 0.46447778092272857, + "step": 11707 + }, + { + "epoch": 2.170559881349648, + "grad_norm": 7.29296875, + "learning_rate": 7.829440118650354e-06, + "loss": 3.2234, + "mean_token_accuracy": 0.46598984771573604, + "step": 11708 + }, + { + "epoch": 2.1707452725250276, + "grad_norm": 6.75390625, + "learning_rate": 7.829254727474972e-06, + "loss": 3.0202, + "mean_token_accuracy": 0.448267295988076, + "step": 11709 + }, + { + "epoch": 2.1709306637004078, + "grad_norm": 7.23046875, + "learning_rate": 7.829069336299593e-06, + "loss": 4.1314, + "mean_token_accuracy": 0.40699084794354395, + "step": 11710 + }, + { + "epoch": 2.171116054875788, + "grad_norm": 6.06640625, + "learning_rate": 7.828883945124213e-06, + "loss": 2.8943, + "mean_token_accuracy": 0.46569259418421427, + "step": 11711 + }, + { + "epoch": 2.171301446051168, + "grad_norm": 7.0546875, + "learning_rate": 7.828698553948832e-06, + "loss": 2.5443, + "mean_token_accuracy": 0.4901391224583397, + "step": 11712 + }, + { + "epoch": 2.171486837226548, + "grad_norm": 7.21875, + "learning_rate": 7.828513162773453e-06, + "loss": 2.5347, + "mean_token_accuracy": 0.49584002335425487, + "step": 11713 + }, + { + "epoch": 2.171672228401928, + "grad_norm": 6.38671875, + "learning_rate": 7.828327771598071e-06, + "loss": 3.1344, + "mean_token_accuracy": 0.4525451950523311, + "step": 11714 + }, + { + "epoch": 2.1718576195773083, + "grad_norm": 7.71875, + "learning_rate": 7.828142380422694e-06, + "loss": 2.6198, + "mean_token_accuracy": 0.49417249417249415, + "step": 11715 + }, + { + "epoch": 2.172043010752688, + "grad_norm": 7.875, + "learning_rate": 7.827956989247312e-06, + "loss": 2.5994, + "mean_token_accuracy": 0.4996309184857113, + "step": 11716 + }, + { + "epoch": 2.172228401928068, + "grad_norm": 7.61328125, + "learning_rate": 7.827771598071933e-06, + "loss": 2.7492, + "mean_token_accuracy": 0.48932423508694695, + "step": 11717 + }, + { + "epoch": 2.1724137931034484, + "grad_norm": 7.82421875, + "learning_rate": 7.827586206896552e-06, + "loss": 2.458, + "mean_token_accuracy": 0.5064543889845095, + "step": 11718 + }, + { + "epoch": 2.172599184278828, + "grad_norm": 9.4453125, + "learning_rate": 7.827400815721172e-06, + "loss": 2.2809, + "mean_token_accuracy": 0.4801214798453893, + "step": 11719 + }, + { + "epoch": 2.1727845754542083, + "grad_norm": 9.3125, + "learning_rate": 7.827215424545793e-06, + "loss": 2.7367, + "mean_token_accuracy": 0.46733860678744577, + "step": 11720 + }, + { + "epoch": 2.1729699666295885, + "grad_norm": 7.2109375, + "learning_rate": 7.827030033370412e-06, + "loss": 2.9765, + "mean_token_accuracy": 0.4969896004378763, + "step": 11721 + }, + { + "epoch": 2.1731553578049683, + "grad_norm": 6.6796875, + "learning_rate": 7.826844642195032e-06, + "loss": 3.167, + "mean_token_accuracy": 0.4637145033184637, + "step": 11722 + }, + { + "epoch": 2.1733407489803485, + "grad_norm": 11.0859375, + "learning_rate": 7.826659251019652e-06, + "loss": 2.497, + "mean_token_accuracy": 0.49565323864332067, + "step": 11723 + }, + { + "epoch": 2.1735261401557286, + "grad_norm": 8.75, + "learning_rate": 7.826473859844273e-06, + "loss": 2.3306, + "mean_token_accuracy": 0.5187713310580204, + "step": 11724 + }, + { + "epoch": 2.173711531331109, + "grad_norm": 6.73046875, + "learning_rate": 7.826288468668892e-06, + "loss": 2.7449, + "mean_token_accuracy": 0.4832349785407725, + "step": 11725 + }, + { + "epoch": 2.1738969225064886, + "grad_norm": 7.78515625, + "learning_rate": 7.826103077493512e-06, + "loss": 3.0016, + "mean_token_accuracy": 0.4648970955953068, + "step": 11726 + }, + { + "epoch": 2.1740823136818688, + "grad_norm": 10.734375, + "learning_rate": 7.825917686318131e-06, + "loss": 2.4921, + "mean_token_accuracy": 0.5087548638132295, + "step": 11727 + }, + { + "epoch": 2.174267704857249, + "grad_norm": 9.453125, + "learning_rate": 7.825732295142752e-06, + "loss": 2.9936, + "mean_token_accuracy": 0.4735768903993203, + "step": 11728 + }, + { + "epoch": 2.1744530960326287, + "grad_norm": 7.11328125, + "learning_rate": 7.825546903967372e-06, + "loss": 2.8808, + "mean_token_accuracy": 0.4661866440773684, + "step": 11729 + }, + { + "epoch": 2.174638487208009, + "grad_norm": 6.34375, + "learning_rate": 7.825361512791991e-06, + "loss": 3.0877, + "mean_token_accuracy": 0.46234033181617973, + "step": 11730 + }, + { + "epoch": 2.174823878383389, + "grad_norm": 7.98828125, + "learning_rate": 7.825176121616613e-06, + "loss": 2.8563, + "mean_token_accuracy": 0.4757229320780094, + "step": 11731 + }, + { + "epoch": 2.175009269558769, + "grad_norm": 6.828125, + "learning_rate": 7.824990730441232e-06, + "loss": 2.7179, + "mean_token_accuracy": 0.49141055949566587, + "step": 11732 + }, + { + "epoch": 2.175194660734149, + "grad_norm": 6.80078125, + "learning_rate": 7.824805339265852e-06, + "loss": 3.2425, + "mean_token_accuracy": 0.4406758130081301, + "step": 11733 + }, + { + "epoch": 2.175380051909529, + "grad_norm": 8.9765625, + "learning_rate": 7.824619948090471e-06, + "loss": 2.7108, + "mean_token_accuracy": 0.4924299772899319, + "step": 11734 + }, + { + "epoch": 2.175565443084909, + "grad_norm": 14.0078125, + "learning_rate": 7.824434556915092e-06, + "loss": 2.6569, + "mean_token_accuracy": 0.4727356040934779, + "step": 11735 + }, + { + "epoch": 2.175750834260289, + "grad_norm": 5.82421875, + "learning_rate": 7.824249165739712e-06, + "loss": 3.0185, + "mean_token_accuracy": 0.46004206098843325, + "step": 11736 + }, + { + "epoch": 2.1759362254356693, + "grad_norm": 7.671875, + "learning_rate": 7.824063774564331e-06, + "loss": 2.5249, + "mean_token_accuracy": 0.5070440573770492, + "step": 11737 + }, + { + "epoch": 2.1761216166110495, + "grad_norm": 11.390625, + "learning_rate": 7.823878383388951e-06, + "loss": 2.5691, + "mean_token_accuracy": 0.5041547649961049, + "step": 11738 + }, + { + "epoch": 2.1763070077864293, + "grad_norm": 9.84375, + "learning_rate": 7.823692992213572e-06, + "loss": 2.3748, + "mean_token_accuracy": 0.5261975162625665, + "step": 11739 + }, + { + "epoch": 2.1764923989618095, + "grad_norm": 6.96875, + "learning_rate": 7.823507601038192e-06, + "loss": 2.6315, + "mean_token_accuracy": 0.5006216972334473, + "step": 11740 + }, + { + "epoch": 2.1766777901371896, + "grad_norm": 12.203125, + "learning_rate": 7.823322209862811e-06, + "loss": 3.162, + "mean_token_accuracy": 0.4213425570074302, + "step": 11741 + }, + { + "epoch": 2.1768631813125694, + "grad_norm": 7.67578125, + "learning_rate": 7.823136818687432e-06, + "loss": 2.7865, + "mean_token_accuracy": 0.48819875776397514, + "step": 11742 + }, + { + "epoch": 2.1770485724879496, + "grad_norm": 9.015625, + "learning_rate": 7.82295142751205e-06, + "loss": 2.7514, + "mean_token_accuracy": 0.474950971652701, + "step": 11743 + }, + { + "epoch": 2.1772339636633298, + "grad_norm": 7.75, + "learning_rate": 7.822766036336671e-06, + "loss": 2.9702, + "mean_token_accuracy": 0.4678945589726259, + "step": 11744 + }, + { + "epoch": 2.1774193548387095, + "grad_norm": 8.4375, + "learning_rate": 7.822580645161291e-06, + "loss": 2.9056, + "mean_token_accuracy": 0.4845917920257047, + "step": 11745 + }, + { + "epoch": 2.1776047460140897, + "grad_norm": 8.3984375, + "learning_rate": 7.82239525398591e-06, + "loss": 2.7136, + "mean_token_accuracy": 0.4955364134690681, + "step": 11746 + }, + { + "epoch": 2.17779013718947, + "grad_norm": 6.0703125, + "learning_rate": 7.82220986281053e-06, + "loss": 3.0423, + "mean_token_accuracy": 0.44992481203007517, + "step": 11747 + }, + { + "epoch": 2.1779755283648496, + "grad_norm": 10.734375, + "learning_rate": 7.822024471635151e-06, + "loss": 2.5889, + "mean_token_accuracy": 0.4938467645891227, + "step": 11748 + }, + { + "epoch": 2.17816091954023, + "grad_norm": 8.5546875, + "learning_rate": 7.821839080459772e-06, + "loss": 3.4367, + "mean_token_accuracy": 0.4332432816721172, + "step": 11749 + }, + { + "epoch": 2.17834631071561, + "grad_norm": 7.26171875, + "learning_rate": 7.82165368928439e-06, + "loss": 2.9641, + "mean_token_accuracy": 0.46042889966811335, + "step": 11750 + }, + { + "epoch": 2.17853170189099, + "grad_norm": 9.2734375, + "learning_rate": 7.821468298109011e-06, + "loss": 2.3124, + "mean_token_accuracy": 0.5506470325747435, + "step": 11751 + }, + { + "epoch": 2.17871709306637, + "grad_norm": 12.578125, + "learning_rate": 7.82128290693363e-06, + "loss": 2.6574, + "mean_token_accuracy": 0.4768280123583934, + "step": 11752 + }, + { + "epoch": 2.17890248424175, + "grad_norm": 7.6796875, + "learning_rate": 7.82109751575825e-06, + "loss": 2.6732, + "mean_token_accuracy": 0.4917677642980936, + "step": 11753 + }, + { + "epoch": 2.1790878754171303, + "grad_norm": 6.46484375, + "learning_rate": 7.82091212458287e-06, + "loss": 2.9914, + "mean_token_accuracy": 0.466643051127642, + "step": 11754 + }, + { + "epoch": 2.17927326659251, + "grad_norm": 7.3671875, + "learning_rate": 7.82072673340749e-06, + "loss": 2.4799, + "mean_token_accuracy": 0.5151087263484891, + "step": 11755 + }, + { + "epoch": 2.1794586577678903, + "grad_norm": 8.25, + "learning_rate": 7.82054134223211e-06, + "loss": 2.5673, + "mean_token_accuracy": 0.4981121115306419, + "step": 11756 + }, + { + "epoch": 2.1796440489432705, + "grad_norm": 6.33203125, + "learning_rate": 7.82035595105673e-06, + "loss": 3.095, + "mean_token_accuracy": 0.45077220077220076, + "step": 11757 + }, + { + "epoch": 2.17982944011865, + "grad_norm": 11.0703125, + "learning_rate": 7.820170559881351e-06, + "loss": 2.2856, + "mean_token_accuracy": 0.531239252550728, + "step": 11758 + }, + { + "epoch": 2.1800148312940304, + "grad_norm": 9.96875, + "learning_rate": 7.81998516870597e-06, + "loss": 2.5912, + "mean_token_accuracy": 0.49404580152671757, + "step": 11759 + }, + { + "epoch": 2.1802002224694106, + "grad_norm": 6.2890625, + "learning_rate": 7.81979977753059e-06, + "loss": 2.7074, + "mean_token_accuracy": 0.48426383697093933, + "step": 11760 + }, + { + "epoch": 2.1803856136447903, + "grad_norm": 8.328125, + "learning_rate": 7.819614386355209e-06, + "loss": 3.1886, + "mean_token_accuracy": 0.47280163599182007, + "step": 11761 + }, + { + "epoch": 2.1805710048201705, + "grad_norm": 12.1640625, + "learning_rate": 7.81942899517983e-06, + "loss": 2.7125, + "mean_token_accuracy": 0.478110599078341, + "step": 11762 + }, + { + "epoch": 2.1807563959955507, + "grad_norm": 12.1484375, + "learning_rate": 7.81924360400445e-06, + "loss": 2.5696, + "mean_token_accuracy": 0.5125634517766497, + "step": 11763 + }, + { + "epoch": 2.1809417871709305, + "grad_norm": 7.24609375, + "learning_rate": 7.81905821282907e-06, + "loss": 2.382, + "mean_token_accuracy": 0.502837947411097, + "step": 11764 + }, + { + "epoch": 2.1811271783463106, + "grad_norm": 8.3125, + "learning_rate": 7.81887282165369e-06, + "loss": 3.1348, + "mean_token_accuracy": 0.44934402332361517, + "step": 11765 + }, + { + "epoch": 2.181312569521691, + "grad_norm": 8.3828125, + "learning_rate": 7.81868743047831e-06, + "loss": 3.2821, + "mean_token_accuracy": 0.44374209860935526, + "step": 11766 + }, + { + "epoch": 2.181497960697071, + "grad_norm": 9.6640625, + "learning_rate": 7.81850203930293e-06, + "loss": 2.6517, + "mean_token_accuracy": 0.47556294779938585, + "step": 11767 + }, + { + "epoch": 2.1816833518724508, + "grad_norm": 7.24609375, + "learning_rate": 7.81831664812755e-06, + "loss": 2.335, + "mean_token_accuracy": 0.5643899895724713, + "step": 11768 + }, + { + "epoch": 2.181868743047831, + "grad_norm": 6.50390625, + "learning_rate": 7.81813125695217e-06, + "loss": 2.9825, + "mean_token_accuracy": 0.44958245045494205, + "step": 11769 + }, + { + "epoch": 2.182054134223211, + "grad_norm": 5.47265625, + "learning_rate": 7.817945865776788e-06, + "loss": 2.2381, + "mean_token_accuracy": 0.5518856875146404, + "step": 11770 + }, + { + "epoch": 2.182239525398591, + "grad_norm": 8.2578125, + "learning_rate": 7.817760474601409e-06, + "loss": 2.8548, + "mean_token_accuracy": 0.4906176700547303, + "step": 11771 + }, + { + "epoch": 2.182424916573971, + "grad_norm": 6.62109375, + "learning_rate": 7.81757508342603e-06, + "loss": 3.2429, + "mean_token_accuracy": 0.45691747572815533, + "step": 11772 + }, + { + "epoch": 2.1826103077493513, + "grad_norm": 6.453125, + "learning_rate": 7.81738969225065e-06, + "loss": 2.9636, + "mean_token_accuracy": 0.4780254376023171, + "step": 11773 + }, + { + "epoch": 2.182795698924731, + "grad_norm": 6.12890625, + "learning_rate": 7.81720430107527e-06, + "loss": 3.2501, + "mean_token_accuracy": 0.4278056170609486, + "step": 11774 + }, + { + "epoch": 2.182981090100111, + "grad_norm": 6.27734375, + "learning_rate": 7.81701890989989e-06, + "loss": 2.8703, + "mean_token_accuracy": 0.4702523240371846, + "step": 11775 + }, + { + "epoch": 2.1831664812754914, + "grad_norm": 6.77734375, + "learning_rate": 7.81683351872451e-06, + "loss": 2.631, + "mean_token_accuracy": 0.5062853985357093, + "step": 11776 + }, + { + "epoch": 2.183351872450871, + "grad_norm": 6.3125, + "learning_rate": 7.816648127549129e-06, + "loss": 3.031, + "mean_token_accuracy": 0.4707396369547548, + "step": 11777 + }, + { + "epoch": 2.1835372636262513, + "grad_norm": 6.53125, + "learning_rate": 7.816462736373749e-06, + "loss": 2.915, + "mean_token_accuracy": 0.4781431334622824, + "step": 11778 + }, + { + "epoch": 2.1837226548016315, + "grad_norm": 6.45703125, + "learning_rate": 7.81627734519837e-06, + "loss": 2.5358, + "mean_token_accuracy": 0.5004770342101676, + "step": 11779 + }, + { + "epoch": 2.1839080459770113, + "grad_norm": 6.1171875, + "learning_rate": 7.81609195402299e-06, + "loss": 2.9821, + "mean_token_accuracy": 0.46633872101194657, + "step": 11780 + }, + { + "epoch": 2.1840934371523915, + "grad_norm": 6.04296875, + "learning_rate": 7.815906562847609e-06, + "loss": 2.7786, + "mean_token_accuracy": 0.5052825552825553, + "step": 11781 + }, + { + "epoch": 2.1842788283277716, + "grad_norm": 6.14453125, + "learning_rate": 7.81572117167223e-06, + "loss": 2.9774, + "mean_token_accuracy": 0.4675678622042952, + "step": 11782 + }, + { + "epoch": 2.184464219503152, + "grad_norm": 6.2265625, + "learning_rate": 7.81553578049685e-06, + "loss": 2.7744, + "mean_token_accuracy": 0.5113224071062179, + "step": 11783 + }, + { + "epoch": 2.1846496106785316, + "grad_norm": 7.6015625, + "learning_rate": 7.815350389321469e-06, + "loss": 2.8205, + "mean_token_accuracy": 0.5114470842332614, + "step": 11784 + }, + { + "epoch": 2.1848350018539118, + "grad_norm": 6.61328125, + "learning_rate": 7.815164998146089e-06, + "loss": 2.5907, + "mean_token_accuracy": 0.4918096220817748, + "step": 11785 + }, + { + "epoch": 2.185020393029292, + "grad_norm": 8.0703125, + "learning_rate": 7.814979606970708e-06, + "loss": 2.5211, + "mean_token_accuracy": 0.489434139469359, + "step": 11786 + }, + { + "epoch": 2.1852057842046717, + "grad_norm": 6.21484375, + "learning_rate": 7.814794215795328e-06, + "loss": 2.1906, + "mean_token_accuracy": 0.5390372512332029, + "step": 11787 + }, + { + "epoch": 2.185391175380052, + "grad_norm": 7.07421875, + "learning_rate": 7.814608824619949e-06, + "loss": 2.7153, + "mean_token_accuracy": 0.5018808397039194, + "step": 11788 + }, + { + "epoch": 2.185576566555432, + "grad_norm": 9.3125, + "learning_rate": 7.81442343344457e-06, + "loss": 3.8291, + "mean_token_accuracy": 0.42101374316536133, + "step": 11789 + }, + { + "epoch": 2.185761957730812, + "grad_norm": 8.4765625, + "learning_rate": 7.814238042269188e-06, + "loss": 2.8814, + "mean_token_accuracy": 0.48073022312373226, + "step": 11790 + }, + { + "epoch": 2.185947348906192, + "grad_norm": 7.37109375, + "learning_rate": 7.814052651093809e-06, + "loss": 2.668, + "mean_token_accuracy": 0.4969786656801085, + "step": 11791 + }, + { + "epoch": 2.186132740081572, + "grad_norm": 6.7734375, + "learning_rate": 7.813867259918429e-06, + "loss": 2.7012, + "mean_token_accuracy": 0.4903770102820986, + "step": 11792 + }, + { + "epoch": 2.186318131256952, + "grad_norm": 7.32421875, + "learning_rate": 7.813681868743048e-06, + "loss": 2.257, + "mean_token_accuracy": 0.5540407762220585, + "step": 11793 + }, + { + "epoch": 2.186503522432332, + "grad_norm": 7.41015625, + "learning_rate": 7.813496477567668e-06, + "loss": 3.0187, + "mean_token_accuracy": 0.48068838675775094, + "step": 11794 + }, + { + "epoch": 2.1866889136077123, + "grad_norm": 7.7265625, + "learning_rate": 7.813311086392287e-06, + "loss": 2.6653, + "mean_token_accuracy": 0.4883298755186722, + "step": 11795 + }, + { + "epoch": 2.1868743047830925, + "grad_norm": 7.51171875, + "learning_rate": 7.81312569521691e-06, + "loss": 3.8253, + "mean_token_accuracy": 0.39764606977721734, + "step": 11796 + }, + { + "epoch": 2.1870596959584723, + "grad_norm": 7.1796875, + "learning_rate": 7.812940304041528e-06, + "loss": 2.6718, + "mean_token_accuracy": 0.5177797051170858, + "step": 11797 + }, + { + "epoch": 2.1872450871338525, + "grad_norm": 10.0, + "learning_rate": 7.812754912866149e-06, + "loss": 2.9222, + "mean_token_accuracy": 0.46614304415089847, + "step": 11798 + }, + { + "epoch": 2.1874304783092327, + "grad_norm": 7.73828125, + "learning_rate": 7.812569521690767e-06, + "loss": 2.5826, + "mean_token_accuracy": 0.5208474806464756, + "step": 11799 + }, + { + "epoch": 2.1876158694846124, + "grad_norm": 14.828125, + "learning_rate": 7.812384130515388e-06, + "loss": 2.683, + "mean_token_accuracy": 0.4780304087041428, + "step": 11800 + }, + { + "epoch": 2.1878012606599926, + "grad_norm": 10.0078125, + "learning_rate": 7.812198739340008e-06, + "loss": 2.3743, + "mean_token_accuracy": 0.5165443928731847, + "step": 11801 + }, + { + "epoch": 2.1879866518353728, + "grad_norm": 8.671875, + "learning_rate": 7.812013348164627e-06, + "loss": 3.0875, + "mean_token_accuracy": 0.4553081147040879, + "step": 11802 + }, + { + "epoch": 2.1881720430107525, + "grad_norm": 8.2421875, + "learning_rate": 7.811827956989248e-06, + "loss": 4.0577, + "mean_token_accuracy": 0.4005803684077719, + "step": 11803 + }, + { + "epoch": 2.1883574341861327, + "grad_norm": 9.2109375, + "learning_rate": 7.811642565813868e-06, + "loss": 3.1167, + "mean_token_accuracy": 0.4564408041697692, + "step": 11804 + }, + { + "epoch": 2.188542825361513, + "grad_norm": 8.4765625, + "learning_rate": 7.811457174638489e-06, + "loss": 2.8267, + "mean_token_accuracy": 0.47233991042246704, + "step": 11805 + }, + { + "epoch": 2.1887282165368926, + "grad_norm": 7.01953125, + "learning_rate": 7.811271783463108e-06, + "loss": 2.8637, + "mean_token_accuracy": 0.47626301091647627, + "step": 11806 + }, + { + "epoch": 2.188913607712273, + "grad_norm": 8.703125, + "learning_rate": 7.811086392287728e-06, + "loss": 3.8213, + "mean_token_accuracy": 0.4155307091438199, + "step": 11807 + }, + { + "epoch": 2.189098998887653, + "grad_norm": 7.7734375, + "learning_rate": 7.810901001112347e-06, + "loss": 2.8084, + "mean_token_accuracy": 0.46352300415651426, + "step": 11808 + }, + { + "epoch": 2.189284390063033, + "grad_norm": 7.05078125, + "learning_rate": 7.810715609936967e-06, + "loss": 2.9067, + "mean_token_accuracy": 0.4750364190173487, + "step": 11809 + }, + { + "epoch": 2.189469781238413, + "grad_norm": 7.25, + "learning_rate": 7.810530218761588e-06, + "loss": 2.7101, + "mean_token_accuracy": 0.49149010072941995, + "step": 11810 + }, + { + "epoch": 2.189655172413793, + "grad_norm": 7.890625, + "learning_rate": 7.810344827586207e-06, + "loss": 2.7017, + "mean_token_accuracy": 0.4917340837143862, + "step": 11811 + }, + { + "epoch": 2.1898405635891733, + "grad_norm": 6.66796875, + "learning_rate": 7.810159436410829e-06, + "loss": 2.7602, + "mean_token_accuracy": 0.4845577211394303, + "step": 11812 + }, + { + "epoch": 2.190025954764553, + "grad_norm": 6.5078125, + "learning_rate": 7.809974045235448e-06, + "loss": 2.5966, + "mean_token_accuracy": 0.5026645542272793, + "step": 11813 + }, + { + "epoch": 2.1902113459399333, + "grad_norm": 8.8515625, + "learning_rate": 7.809788654060068e-06, + "loss": 2.6584, + "mean_token_accuracy": 0.48828504089837793, + "step": 11814 + }, + { + "epoch": 2.1903967371153135, + "grad_norm": 7.2734375, + "learning_rate": 7.809603262884687e-06, + "loss": 2.9241, + "mean_token_accuracy": 0.47361214836942994, + "step": 11815 + }, + { + "epoch": 2.190582128290693, + "grad_norm": 7.9296875, + "learning_rate": 7.809417871709307e-06, + "loss": 2.6258, + "mean_token_accuracy": 0.4866608353378526, + "step": 11816 + }, + { + "epoch": 2.1907675194660734, + "grad_norm": 7.37890625, + "learning_rate": 7.809232480533928e-06, + "loss": 2.9412, + "mean_token_accuracy": 0.4495462549099282, + "step": 11817 + }, + { + "epoch": 2.1909529106414536, + "grad_norm": 6.59375, + "learning_rate": 7.809047089358547e-06, + "loss": 2.5933, + "mean_token_accuracy": 0.5104875283446711, + "step": 11818 + }, + { + "epoch": 2.1911383018168333, + "grad_norm": 7.36328125, + "learning_rate": 7.808861698183167e-06, + "loss": 1.9793, + "mean_token_accuracy": 0.5658318014705882, + "step": 11819 + }, + { + "epoch": 2.1913236929922135, + "grad_norm": 6.6171875, + "learning_rate": 7.808676307007788e-06, + "loss": 3.1333, + "mean_token_accuracy": 0.45281007751937985, + "step": 11820 + }, + { + "epoch": 2.1915090841675937, + "grad_norm": 7.45703125, + "learning_rate": 7.808490915832408e-06, + "loss": 3.0255, + "mean_token_accuracy": 0.4729045796815208, + "step": 11821 + }, + { + "epoch": 2.191694475342974, + "grad_norm": 6.890625, + "learning_rate": 7.808305524657027e-06, + "loss": 3.1095, + "mean_token_accuracy": 0.46998956158663885, + "step": 11822 + }, + { + "epoch": 2.1918798665183536, + "grad_norm": 7.1015625, + "learning_rate": 7.808120133481647e-06, + "loss": 2.7337, + "mean_token_accuracy": 0.4671521035598705, + "step": 11823 + }, + { + "epoch": 2.192065257693734, + "grad_norm": 7.80859375, + "learning_rate": 7.807934742306266e-06, + "loss": 3.3008, + "mean_token_accuracy": 0.4612629344653754, + "step": 11824 + }, + { + "epoch": 2.192250648869114, + "grad_norm": 7.25, + "learning_rate": 7.807749351130887e-06, + "loss": 2.156, + "mean_token_accuracy": 0.540268456375839, + "step": 11825 + }, + { + "epoch": 2.1924360400444938, + "grad_norm": 6.1328125, + "learning_rate": 7.807563959955507e-06, + "loss": 2.8329, + "mean_token_accuracy": 0.4742003789145213, + "step": 11826 + }, + { + "epoch": 2.192621431219874, + "grad_norm": 8.5234375, + "learning_rate": 7.807378568780126e-06, + "loss": 2.7903, + "mean_token_accuracy": 0.4961832061068702, + "step": 11827 + }, + { + "epoch": 2.192806822395254, + "grad_norm": 8.0859375, + "learning_rate": 7.807193177604746e-06, + "loss": 2.8301, + "mean_token_accuracy": 0.4761796733212341, + "step": 11828 + }, + { + "epoch": 2.192992213570634, + "grad_norm": 6.6953125, + "learning_rate": 7.807007786429367e-06, + "loss": 2.8573, + "mean_token_accuracy": 0.4635564745412251, + "step": 11829 + }, + { + "epoch": 2.193177604746014, + "grad_norm": 6.74609375, + "learning_rate": 7.806822395253987e-06, + "loss": 2.6224, + "mean_token_accuracy": 0.5138790035587188, + "step": 11830 + }, + { + "epoch": 2.1933629959213943, + "grad_norm": 7.125, + "learning_rate": 7.806637004078606e-06, + "loss": 2.568, + "mean_token_accuracy": 0.49701884092535176, + "step": 11831 + }, + { + "epoch": 2.193548387096774, + "grad_norm": 6.171875, + "learning_rate": 7.806451612903227e-06, + "loss": 2.8077, + "mean_token_accuracy": 0.4831932773109244, + "step": 11832 + }, + { + "epoch": 2.193733778272154, + "grad_norm": 6.10546875, + "learning_rate": 7.806266221727846e-06, + "loss": 3.084, + "mean_token_accuracy": 0.4701397712833545, + "step": 11833 + }, + { + "epoch": 2.1939191694475344, + "grad_norm": 5.9921875, + "learning_rate": 7.806080830552466e-06, + "loss": 2.3446, + "mean_token_accuracy": 0.5230318729586665, + "step": 11834 + }, + { + "epoch": 2.194104560622914, + "grad_norm": 11.1171875, + "learning_rate": 7.805895439377087e-06, + "loss": 3.557, + "mean_token_accuracy": 0.44720678560982746, + "step": 11835 + }, + { + "epoch": 2.1942899517982943, + "grad_norm": 7.6015625, + "learning_rate": 7.805710048201707e-06, + "loss": 2.681, + "mean_token_accuracy": 0.49961113703530874, + "step": 11836 + }, + { + "epoch": 2.1944753429736745, + "grad_norm": 6.76953125, + "learning_rate": 7.805524657026326e-06, + "loss": 3.0562, + "mean_token_accuracy": 0.4476699770817418, + "step": 11837 + }, + { + "epoch": 2.1946607341490543, + "grad_norm": 6.734375, + "learning_rate": 7.805339265850946e-06, + "loss": 3.0839, + "mean_token_accuracy": 0.4598327420172326, + "step": 11838 + }, + { + "epoch": 2.1948461253244345, + "grad_norm": 6.62109375, + "learning_rate": 7.805153874675567e-06, + "loss": 3.035, + "mean_token_accuracy": 0.4723774053382992, + "step": 11839 + }, + { + "epoch": 2.1950315164998146, + "grad_norm": 7.765625, + "learning_rate": 7.804968483500186e-06, + "loss": 3.2996, + "mean_token_accuracy": 0.4732905982905983, + "step": 11840 + }, + { + "epoch": 2.195216907675195, + "grad_norm": 7.5078125, + "learning_rate": 7.804783092324806e-06, + "loss": 2.8382, + "mean_token_accuracy": 0.4895900064061499, + "step": 11841 + }, + { + "epoch": 2.1954022988505746, + "grad_norm": 7.0546875, + "learning_rate": 7.804597701149425e-06, + "loss": 2.4008, + "mean_token_accuracy": 0.5292988929889298, + "step": 11842 + }, + { + "epoch": 2.1955876900259548, + "grad_norm": 6.66015625, + "learning_rate": 7.804412309974045e-06, + "loss": 2.3661, + "mean_token_accuracy": 0.5214861827317202, + "step": 11843 + }, + { + "epoch": 2.195773081201335, + "grad_norm": 8.296875, + "learning_rate": 7.804226918798666e-06, + "loss": 3.6342, + "mean_token_accuracy": 0.42572440015733576, + "step": 11844 + }, + { + "epoch": 2.1959584723767147, + "grad_norm": 7.31640625, + "learning_rate": 7.804041527623286e-06, + "loss": 2.7076, + "mean_token_accuracy": 0.4934247499157019, + "step": 11845 + }, + { + "epoch": 2.196143863552095, + "grad_norm": 6.92578125, + "learning_rate": 7.803856136447905e-06, + "loss": 2.9512, + "mean_token_accuracy": 0.4423737774253238, + "step": 11846 + }, + { + "epoch": 2.196329254727475, + "grad_norm": 5.765625, + "learning_rate": 7.803670745272526e-06, + "loss": 2.5791, + "mean_token_accuracy": 0.5262521968365553, + "step": 11847 + }, + { + "epoch": 2.196514645902855, + "grad_norm": 8.34375, + "learning_rate": 7.803485354097146e-06, + "loss": 2.7669, + "mean_token_accuracy": 0.4960148021633931, + "step": 11848 + }, + { + "epoch": 2.196700037078235, + "grad_norm": 8.4765625, + "learning_rate": 7.803299962921765e-06, + "loss": 2.8539, + "mean_token_accuracy": 0.48688790902761664, + "step": 11849 + }, + { + "epoch": 2.196885428253615, + "grad_norm": 6.59375, + "learning_rate": 7.803114571746385e-06, + "loss": 2.7752, + "mean_token_accuracy": 0.4676324587174552, + "step": 11850 + }, + { + "epoch": 2.197070819428995, + "grad_norm": 9.3828125, + "learning_rate": 7.802929180571004e-06, + "loss": 3.0113, + "mean_token_accuracy": 0.48169418521177315, + "step": 11851 + }, + { + "epoch": 2.197256210604375, + "grad_norm": 8.5625, + "learning_rate": 7.802743789395626e-06, + "loss": 2.9911, + "mean_token_accuracy": 0.4735769922109047, + "step": 11852 + }, + { + "epoch": 2.1974416017797553, + "grad_norm": 6.78125, + "learning_rate": 7.802558398220245e-06, + "loss": 3.0147, + "mean_token_accuracy": 0.4595535285645703, + "step": 11853 + }, + { + "epoch": 2.1976269929551355, + "grad_norm": 6.68359375, + "learning_rate": 7.802373007044866e-06, + "loss": 2.4068, + "mean_token_accuracy": 0.5611698655176917, + "step": 11854 + }, + { + "epoch": 2.1978123841305153, + "grad_norm": 7.56640625, + "learning_rate": 7.802187615869486e-06, + "loss": 2.7999, + "mean_token_accuracy": 0.46388399512129014, + "step": 11855 + }, + { + "epoch": 2.1979977753058955, + "grad_norm": 7.87109375, + "learning_rate": 7.802002224694105e-06, + "loss": 3.1556, + "mean_token_accuracy": 0.4858278955954323, + "step": 11856 + }, + { + "epoch": 2.1981831664812757, + "grad_norm": 8.0390625, + "learning_rate": 7.801816833518725e-06, + "loss": 2.743, + "mean_token_accuracy": 0.47861070135443207, + "step": 11857 + }, + { + "epoch": 2.1983685576566554, + "grad_norm": 6.8203125, + "learning_rate": 7.801631442343344e-06, + "loss": 2.7668, + "mean_token_accuracy": 0.48966613672496023, + "step": 11858 + }, + { + "epoch": 2.1985539488320356, + "grad_norm": 7.4609375, + "learning_rate": 7.801446051167965e-06, + "loss": 2.7806, + "mean_token_accuracy": 0.4899024591290012, + "step": 11859 + }, + { + "epoch": 2.1987393400074158, + "grad_norm": 6.1328125, + "learning_rate": 7.801260659992585e-06, + "loss": 2.4354, + "mean_token_accuracy": 0.5388636143645339, + "step": 11860 + }, + { + "epoch": 2.1989247311827955, + "grad_norm": 6.5390625, + "learning_rate": 7.801075268817206e-06, + "loss": 3.4223, + "mean_token_accuracy": 0.44740853658536583, + "step": 11861 + }, + { + "epoch": 2.1991101223581757, + "grad_norm": 6.10546875, + "learning_rate": 7.800889877641825e-06, + "loss": 2.8677, + "mean_token_accuracy": 0.47115384615384615, + "step": 11862 + }, + { + "epoch": 2.199295513533556, + "grad_norm": 7.1953125, + "learning_rate": 7.800704486466445e-06, + "loss": 3.1164, + "mean_token_accuracy": 0.45913392601089764, + "step": 11863 + }, + { + "epoch": 2.1994809047089356, + "grad_norm": 5.55859375, + "learning_rate": 7.800519095291066e-06, + "loss": 3.1536, + "mean_token_accuracy": 0.44969199178644764, + "step": 11864 + }, + { + "epoch": 2.199666295884316, + "grad_norm": 6.28125, + "learning_rate": 7.800333704115684e-06, + "loss": 2.6105, + "mean_token_accuracy": 0.5046666666666667, + "step": 11865 + }, + { + "epoch": 2.199851687059696, + "grad_norm": 7.50390625, + "learning_rate": 7.800148312940305e-06, + "loss": 2.6588, + "mean_token_accuracy": 0.508198727361723, + "step": 11866 + }, + { + "epoch": 2.200037078235076, + "grad_norm": 6.76171875, + "learning_rate": 7.799962921764924e-06, + "loss": 3.3561, + "mean_token_accuracy": 0.4353909465020576, + "step": 11867 + }, + { + "epoch": 2.200222469410456, + "grad_norm": 9.6484375, + "learning_rate": 7.799777530589546e-06, + "loss": 2.5715, + "mean_token_accuracy": 0.4887495981999357, + "step": 11868 + }, + { + "epoch": 2.200407860585836, + "grad_norm": 6.53515625, + "learning_rate": 7.799592139414165e-06, + "loss": 2.4468, + "mean_token_accuracy": 0.49375866851595007, + "step": 11869 + }, + { + "epoch": 2.2005932517612163, + "grad_norm": 7.11328125, + "learning_rate": 7.799406748238785e-06, + "loss": 3.0639, + "mean_token_accuracy": 0.4786485218207414, + "step": 11870 + }, + { + "epoch": 2.200778642936596, + "grad_norm": 7.9296875, + "learning_rate": 7.799221357063404e-06, + "loss": 2.628, + "mean_token_accuracy": 0.48329276388581544, + "step": 11871 + }, + { + "epoch": 2.2009640341119763, + "grad_norm": 7.8203125, + "learning_rate": 7.799035965888024e-06, + "loss": 2.787, + "mean_token_accuracy": 0.48743582815455283, + "step": 11872 + }, + { + "epoch": 2.2011494252873565, + "grad_norm": 6.27734375, + "learning_rate": 7.798850574712645e-06, + "loss": 2.8709, + "mean_token_accuracy": 0.47448912326961107, + "step": 11873 + }, + { + "epoch": 2.201334816462736, + "grad_norm": 7.9921875, + "learning_rate": 7.798665183537264e-06, + "loss": 3.1996, + "mean_token_accuracy": 0.45327510917030567, + "step": 11874 + }, + { + "epoch": 2.2015202076381164, + "grad_norm": 8.125, + "learning_rate": 7.798479792361884e-06, + "loss": 2.6945, + "mean_token_accuracy": 0.5060163022383232, + "step": 11875 + }, + { + "epoch": 2.2017055988134966, + "grad_norm": 6.4453125, + "learning_rate": 7.798294401186505e-06, + "loss": 3.0816, + "mean_token_accuracy": 0.4609872611464968, + "step": 11876 + }, + { + "epoch": 2.2018909899888763, + "grad_norm": 8.765625, + "learning_rate": 7.798109010011125e-06, + "loss": 2.3322, + "mean_token_accuracy": 0.5349624457844071, + "step": 11877 + }, + { + "epoch": 2.2020763811642565, + "grad_norm": 8.1015625, + "learning_rate": 7.797923618835744e-06, + "loss": 3.1153, + "mean_token_accuracy": 0.44488809498791637, + "step": 11878 + }, + { + "epoch": 2.2022617723396367, + "grad_norm": 9.609375, + "learning_rate": 7.797738227660364e-06, + "loss": 3.4308, + "mean_token_accuracy": 0.41430073606729756, + "step": 11879 + }, + { + "epoch": 2.202447163515017, + "grad_norm": 8.0234375, + "learning_rate": 7.797552836484983e-06, + "loss": 2.4783, + "mean_token_accuracy": 0.5010259040779688, + "step": 11880 + }, + { + "epoch": 2.2026325546903966, + "grad_norm": 6.859375, + "learning_rate": 7.797367445309604e-06, + "loss": 3.26, + "mean_token_accuracy": 0.4545644018340975, + "step": 11881 + }, + { + "epoch": 2.202817945865777, + "grad_norm": 7.83984375, + "learning_rate": 7.797182054134224e-06, + "loss": 2.4844, + "mean_token_accuracy": 0.510914142082286, + "step": 11882 + }, + { + "epoch": 2.203003337041157, + "grad_norm": 8.21875, + "learning_rate": 7.796996662958843e-06, + "loss": 2.1678, + "mean_token_accuracy": 0.5746996996996997, + "step": 11883 + }, + { + "epoch": 2.2031887282165368, + "grad_norm": 7.11328125, + "learning_rate": 7.796811271783464e-06, + "loss": 2.6749, + "mean_token_accuracy": 0.48767403244347934, + "step": 11884 + }, + { + "epoch": 2.203374119391917, + "grad_norm": 5.84765625, + "learning_rate": 7.796625880608084e-06, + "loss": 2.3036, + "mean_token_accuracy": 0.5433215202248564, + "step": 11885 + }, + { + "epoch": 2.203559510567297, + "grad_norm": 7.44921875, + "learning_rate": 7.796440489432704e-06, + "loss": 3.3311, + "mean_token_accuracy": 0.42900302114803623, + "step": 11886 + }, + { + "epoch": 2.203744901742677, + "grad_norm": 7.515625, + "learning_rate": 7.796255098257323e-06, + "loss": 2.6592, + "mean_token_accuracy": 0.4876387487386478, + "step": 11887 + }, + { + "epoch": 2.203930292918057, + "grad_norm": 6.73828125, + "learning_rate": 7.796069707081944e-06, + "loss": 3.0023, + "mean_token_accuracy": 0.47071207430340556, + "step": 11888 + }, + { + "epoch": 2.2041156840934373, + "grad_norm": 6.5078125, + "learning_rate": 7.795884315906563e-06, + "loss": 2.165, + "mean_token_accuracy": 0.5381306218224482, + "step": 11889 + }, + { + "epoch": 2.204301075268817, + "grad_norm": 6.23046875, + "learning_rate": 7.795698924731183e-06, + "loss": 2.3951, + "mean_token_accuracy": 0.5587950808345999, + "step": 11890 + }, + { + "epoch": 2.204486466444197, + "grad_norm": 7.12109375, + "learning_rate": 7.795513533555804e-06, + "loss": 2.5515, + "mean_token_accuracy": 0.5019286403085824, + "step": 11891 + }, + { + "epoch": 2.2046718576195774, + "grad_norm": 9.0078125, + "learning_rate": 7.795328142380422e-06, + "loss": 2.997, + "mean_token_accuracy": 0.4725957878710987, + "step": 11892 + }, + { + "epoch": 2.2048572487949576, + "grad_norm": 6.7265625, + "learning_rate": 7.795142751205045e-06, + "loss": 2.4579, + "mean_token_accuracy": 0.4974031162604874, + "step": 11893 + }, + { + "epoch": 2.2050426399703373, + "grad_norm": 7.2265625, + "learning_rate": 7.794957360029663e-06, + "loss": 3.901, + "mean_token_accuracy": 0.4089327953248922, + "step": 11894 + }, + { + "epoch": 2.2052280311457175, + "grad_norm": 9.03125, + "learning_rate": 7.794771968854284e-06, + "loss": 2.9855, + "mean_token_accuracy": 0.45355361596009974, + "step": 11895 + }, + { + "epoch": 2.2054134223210977, + "grad_norm": 9.6328125, + "learning_rate": 7.794586577678903e-06, + "loss": 2.7867, + "mean_token_accuracy": 0.47638205930561767, + "step": 11896 + }, + { + "epoch": 2.2055988134964775, + "grad_norm": 8.8515625, + "learning_rate": 7.794401186503523e-06, + "loss": 2.5441, + "mean_token_accuracy": 0.5295192578015181, + "step": 11897 + }, + { + "epoch": 2.2057842046718577, + "grad_norm": 9.3359375, + "learning_rate": 7.794215795328144e-06, + "loss": 3.3979, + "mean_token_accuracy": 0.4167782987273945, + "step": 11898 + }, + { + "epoch": 2.205969595847238, + "grad_norm": 9.453125, + "learning_rate": 7.794030404152762e-06, + "loss": 3.787, + "mean_token_accuracy": 0.4149960357911428, + "step": 11899 + }, + { + "epoch": 2.2061549870226176, + "grad_norm": 10.9453125, + "learning_rate": 7.793845012977383e-06, + "loss": 3.1969, + "mean_token_accuracy": 0.4264071786505069, + "step": 11900 + }, + { + "epoch": 2.2063403781979978, + "grad_norm": 9.6484375, + "learning_rate": 7.793659621802003e-06, + "loss": 2.7032, + "mean_token_accuracy": 0.4776641550053821, + "step": 11901 + }, + { + "epoch": 2.206525769373378, + "grad_norm": 9.875, + "learning_rate": 7.793474230626624e-06, + "loss": 2.4513, + "mean_token_accuracy": 0.5256026244472971, + "step": 11902 + }, + { + "epoch": 2.2067111605487577, + "grad_norm": 7.33984375, + "learning_rate": 7.793288839451243e-06, + "loss": 3.3082, + "mean_token_accuracy": 0.4279402985074627, + "step": 11903 + }, + { + "epoch": 2.206896551724138, + "grad_norm": 7.078125, + "learning_rate": 7.793103448275863e-06, + "loss": 2.855, + "mean_token_accuracy": 0.4734380621177599, + "step": 11904 + }, + { + "epoch": 2.207081942899518, + "grad_norm": 10.4609375, + "learning_rate": 7.792918057100482e-06, + "loss": 2.2214, + "mean_token_accuracy": 0.553426844566085, + "step": 11905 + }, + { + "epoch": 2.207267334074898, + "grad_norm": 10.09375, + "learning_rate": 7.792732665925102e-06, + "loss": 3.3526, + "mean_token_accuracy": 0.4753329297820823, + "step": 11906 + }, + { + "epoch": 2.207452725250278, + "grad_norm": 7.43359375, + "learning_rate": 7.792547274749723e-06, + "loss": 2.4005, + "mean_token_accuracy": 0.5142055419151175, + "step": 11907 + }, + { + "epoch": 2.207638116425658, + "grad_norm": 6.484375, + "learning_rate": 7.792361883574342e-06, + "loss": 2.7077, + "mean_token_accuracy": 0.49850924269528923, + "step": 11908 + }, + { + "epoch": 2.207823507601038, + "grad_norm": 7.71484375, + "learning_rate": 7.792176492398962e-06, + "loss": 2.6095, + "mean_token_accuracy": 0.4817401425869344, + "step": 11909 + }, + { + "epoch": 2.208008898776418, + "grad_norm": 9.3515625, + "learning_rate": 7.791991101223583e-06, + "loss": 2.7584, + "mean_token_accuracy": 0.48442584844258485, + "step": 11910 + }, + { + "epoch": 2.2081942899517983, + "grad_norm": 8.0859375, + "learning_rate": 7.791805710048203e-06, + "loss": 2.6275, + "mean_token_accuracy": 0.5121604828688088, + "step": 11911 + }, + { + "epoch": 2.2083796811271785, + "grad_norm": 7.40234375, + "learning_rate": 7.791620318872822e-06, + "loss": 2.6232, + "mean_token_accuracy": 0.5022554452893414, + "step": 11912 + }, + { + "epoch": 2.2085650723025583, + "grad_norm": 7.859375, + "learning_rate": 7.791434927697443e-06, + "loss": 2.7867, + "mean_token_accuracy": 0.47111319868482854, + "step": 11913 + }, + { + "epoch": 2.2087504634779385, + "grad_norm": 7.390625, + "learning_rate": 7.791249536522061e-06, + "loss": 2.758, + "mean_token_accuracy": 0.48204590913420325, + "step": 11914 + }, + { + "epoch": 2.2089358546533187, + "grad_norm": 8.9453125, + "learning_rate": 7.791064145346682e-06, + "loss": 2.9664, + "mean_token_accuracy": 0.47520530638029057, + "step": 11915 + }, + { + "epoch": 2.2091212458286984, + "grad_norm": 9.0703125, + "learning_rate": 7.790878754171302e-06, + "loss": 3.0299, + "mean_token_accuracy": 0.4846272098385857, + "step": 11916 + }, + { + "epoch": 2.2093066370040786, + "grad_norm": 7.40625, + "learning_rate": 7.790693362995923e-06, + "loss": 2.5467, + "mean_token_accuracy": 0.5323657847418444, + "step": 11917 + }, + { + "epoch": 2.209492028179459, + "grad_norm": 7.08203125, + "learning_rate": 7.790507971820542e-06, + "loss": 2.8027, + "mean_token_accuracy": 0.49663692518874397, + "step": 11918 + }, + { + "epoch": 2.2096774193548385, + "grad_norm": 7.65234375, + "learning_rate": 7.790322580645162e-06, + "loss": 2.2149, + "mean_token_accuracy": 0.5773037167363096, + "step": 11919 + }, + { + "epoch": 2.2098628105302187, + "grad_norm": 6.23828125, + "learning_rate": 7.790137189469783e-06, + "loss": 2.4778, + "mean_token_accuracy": 0.4977643504531722, + "step": 11920 + }, + { + "epoch": 2.210048201705599, + "grad_norm": 7.65625, + "learning_rate": 7.789951798294401e-06, + "loss": 2.9266, + "mean_token_accuracy": 0.48638132295719844, + "step": 11921 + }, + { + "epoch": 2.2102335928809786, + "grad_norm": 7.21484375, + "learning_rate": 7.789766407119022e-06, + "loss": 2.6366, + "mean_token_accuracy": 0.48789075002667237, + "step": 11922 + }, + { + "epoch": 2.210418984056359, + "grad_norm": 6.43359375, + "learning_rate": 7.78958101594364e-06, + "loss": 2.3747, + "mean_token_accuracy": 0.5431301652892562, + "step": 11923 + }, + { + "epoch": 2.210604375231739, + "grad_norm": 7.35546875, + "learning_rate": 7.789395624768261e-06, + "loss": 2.3608, + "mean_token_accuracy": 0.5604260089686098, + "step": 11924 + }, + { + "epoch": 2.210789766407119, + "grad_norm": 8.7265625, + "learning_rate": 7.789210233592882e-06, + "loss": 2.5968, + "mean_token_accuracy": 0.5228236061297685, + "step": 11925 + }, + { + "epoch": 2.210975157582499, + "grad_norm": 8.1328125, + "learning_rate": 7.789024842417502e-06, + "loss": 2.7785, + "mean_token_accuracy": 0.48058124174372524, + "step": 11926 + }, + { + "epoch": 2.211160548757879, + "grad_norm": 8.484375, + "learning_rate": 7.788839451242121e-06, + "loss": 3.1148, + "mean_token_accuracy": 0.46057975073519114, + "step": 11927 + }, + { + "epoch": 2.2113459399332593, + "grad_norm": 6.94921875, + "learning_rate": 7.788654060066741e-06, + "loss": 3.5398, + "mean_token_accuracy": 0.4428125812321289, + "step": 11928 + }, + { + "epoch": 2.211531331108639, + "grad_norm": 8.671875, + "learning_rate": 7.788468668891362e-06, + "loss": 2.9515, + "mean_token_accuracy": 0.465014164305949, + "step": 11929 + }, + { + "epoch": 2.2117167222840193, + "grad_norm": 6.390625, + "learning_rate": 7.78828327771598e-06, + "loss": 3.289, + "mean_token_accuracy": 0.4286797220467467, + "step": 11930 + }, + { + "epoch": 2.2119021134593995, + "grad_norm": 7.671875, + "learning_rate": 7.788097886540601e-06, + "loss": 2.4063, + "mean_token_accuracy": 0.551678445229682, + "step": 11931 + }, + { + "epoch": 2.212087504634779, + "grad_norm": 8.2578125, + "learning_rate": 7.78791249536522e-06, + "loss": 2.6868, + "mean_token_accuracy": 0.48629984406326576, + "step": 11932 + }, + { + "epoch": 2.2122728958101594, + "grad_norm": 8.890625, + "learning_rate": 7.787727104189842e-06, + "loss": 2.8754, + "mean_token_accuracy": 0.4559852670349908, + "step": 11933 + }, + { + "epoch": 2.2124582869855396, + "grad_norm": 7.4921875, + "learning_rate": 7.787541713014461e-06, + "loss": 3.2408, + "mean_token_accuracy": 0.4418604651162791, + "step": 11934 + }, + { + "epoch": 2.2126436781609193, + "grad_norm": 8.3984375, + "learning_rate": 7.787356321839081e-06, + "loss": 2.2244, + "mean_token_accuracy": 0.5321828358208955, + "step": 11935 + }, + { + "epoch": 2.2128290693362995, + "grad_norm": 8.71875, + "learning_rate": 7.787170930663702e-06, + "loss": 2.395, + "mean_token_accuracy": 0.5322836567801782, + "step": 11936 + }, + { + "epoch": 2.2130144605116797, + "grad_norm": 8.1328125, + "learning_rate": 7.78698553948832e-06, + "loss": 3.1417, + "mean_token_accuracy": 0.4822123401889939, + "step": 11937 + }, + { + "epoch": 2.21319985168706, + "grad_norm": 7.76953125, + "learning_rate": 7.786800148312941e-06, + "loss": 2.3425, + "mean_token_accuracy": 0.5300977533870691, + "step": 11938 + }, + { + "epoch": 2.2133852428624397, + "grad_norm": 6.9609375, + "learning_rate": 7.78661475713756e-06, + "loss": 2.5862, + "mean_token_accuracy": 0.5133358614587283, + "step": 11939 + }, + { + "epoch": 2.21357063403782, + "grad_norm": 8.65625, + "learning_rate": 7.78642936596218e-06, + "loss": 2.913, + "mean_token_accuracy": 0.47900575761831204, + "step": 11940 + }, + { + "epoch": 2.2137560252132, + "grad_norm": 10.984375, + "learning_rate": 7.786243974786801e-06, + "loss": 2.4861, + "mean_token_accuracy": 0.5132455231611662, + "step": 11941 + }, + { + "epoch": 2.2139414163885798, + "grad_norm": 6.87890625, + "learning_rate": 7.786058583611422e-06, + "loss": 3.0099, + "mean_token_accuracy": 0.445326278659612, + "step": 11942 + }, + { + "epoch": 2.21412680756396, + "grad_norm": 10.359375, + "learning_rate": 7.78587319243604e-06, + "loss": 2.8093, + "mean_token_accuracy": 0.4887249443207127, + "step": 11943 + }, + { + "epoch": 2.21431219873934, + "grad_norm": 9.578125, + "learning_rate": 7.78568780126066e-06, + "loss": 2.9031, + "mean_token_accuracy": 0.47985651214128033, + "step": 11944 + }, + { + "epoch": 2.21449758991472, + "grad_norm": 10.859375, + "learning_rate": 7.785502410085281e-06, + "loss": 2.2878, + "mean_token_accuracy": 0.5188163884673748, + "step": 11945 + }, + { + "epoch": 2.2146829810901, + "grad_norm": 6.5703125, + "learning_rate": 7.7853170189099e-06, + "loss": 2.0436, + "mean_token_accuracy": 0.57378754388246, + "step": 11946 + }, + { + "epoch": 2.2148683722654803, + "grad_norm": 9.96875, + "learning_rate": 7.78513162773452e-06, + "loss": 3.6276, + "mean_token_accuracy": 0.44884527940648555, + "step": 11947 + }, + { + "epoch": 2.21505376344086, + "grad_norm": 9.796875, + "learning_rate": 7.78494623655914e-06, + "loss": 3.3843, + "mean_token_accuracy": 0.43529411764705883, + "step": 11948 + }, + { + "epoch": 2.21523915461624, + "grad_norm": 10.5078125, + "learning_rate": 7.784760845383762e-06, + "loss": 2.8625, + "mean_token_accuracy": 0.4746483001172333, + "step": 11949 + }, + { + "epoch": 2.2154245457916204, + "grad_norm": 6.82421875, + "learning_rate": 7.78457545420838e-06, + "loss": 2.6808, + "mean_token_accuracy": 0.49445041174364485, + "step": 11950 + }, + { + "epoch": 2.2156099369670006, + "grad_norm": 8.46875, + "learning_rate": 7.784390063033e-06, + "loss": 2.8473, + "mean_token_accuracy": 0.4697036063658928, + "step": 11951 + }, + { + "epoch": 2.2157953281423803, + "grad_norm": 8.375, + "learning_rate": 7.78420467185762e-06, + "loss": 2.182, + "mean_token_accuracy": 0.5804651162790697, + "step": 11952 + }, + { + "epoch": 2.2159807193177605, + "grad_norm": 7.1015625, + "learning_rate": 7.78401928068224e-06, + "loss": 2.727, + "mean_token_accuracy": 0.4769946157611356, + "step": 11953 + }, + { + "epoch": 2.2161661104931407, + "grad_norm": 8.40625, + "learning_rate": 7.78383388950686e-06, + "loss": 2.3702, + "mean_token_accuracy": 0.5205972536995067, + "step": 11954 + }, + { + "epoch": 2.2163515016685205, + "grad_norm": 8.0078125, + "learning_rate": 7.78364849833148e-06, + "loss": 3.0988, + "mean_token_accuracy": 0.463436928702011, + "step": 11955 + }, + { + "epoch": 2.2165368928439007, + "grad_norm": 8.65625, + "learning_rate": 7.7834631071561e-06, + "loss": 2.4038, + "mean_token_accuracy": 0.5274940678840401, + "step": 11956 + }, + { + "epoch": 2.216722284019281, + "grad_norm": 7.8984375, + "learning_rate": 7.78327771598072e-06, + "loss": 3.1949, + "mean_token_accuracy": 0.4634555475676971, + "step": 11957 + }, + { + "epoch": 2.2169076751946606, + "grad_norm": 10.2421875, + "learning_rate": 7.783092324805341e-06, + "loss": 2.6592, + "mean_token_accuracy": 0.5072027239392352, + "step": 11958 + }, + { + "epoch": 2.2170930663700408, + "grad_norm": 8.2890625, + "learning_rate": 7.78290693362996e-06, + "loss": 3.6864, + "mean_token_accuracy": 0.4215857928964482, + "step": 11959 + }, + { + "epoch": 2.217278457545421, + "grad_norm": 5.9375, + "learning_rate": 7.78272154245458e-06, + "loss": 2.8368, + "mean_token_accuracy": 0.48934356351236147, + "step": 11960 + }, + { + "epoch": 2.2174638487208007, + "grad_norm": 6.73828125, + "learning_rate": 7.782536151279199e-06, + "loss": 2.4766, + "mean_token_accuracy": 0.5211141060197664, + "step": 11961 + }, + { + "epoch": 2.217649239896181, + "grad_norm": 7.0078125, + "learning_rate": 7.78235076010382e-06, + "loss": 3.3719, + "mean_token_accuracy": 0.4455611390284757, + "step": 11962 + }, + { + "epoch": 2.217834631071561, + "grad_norm": 7.0, + "learning_rate": 7.78216536892844e-06, + "loss": 2.3562, + "mean_token_accuracy": 0.5208003617454217, + "step": 11963 + }, + { + "epoch": 2.2180200222469413, + "grad_norm": 8.03125, + "learning_rate": 7.781979977753059e-06, + "loss": 3.0717, + "mean_token_accuracy": 0.47812149382913927, + "step": 11964 + }, + { + "epoch": 2.218205413422321, + "grad_norm": 7.6484375, + "learning_rate": 7.78179458657768e-06, + "loss": 2.3392, + "mean_token_accuracy": 0.5286779992662346, + "step": 11965 + }, + { + "epoch": 2.218390804597701, + "grad_norm": 7.77734375, + "learning_rate": 7.7816091954023e-06, + "loss": 2.8786, + "mean_token_accuracy": 0.4892343120322395, + "step": 11966 + }, + { + "epoch": 2.2185761957730814, + "grad_norm": 6.71484375, + "learning_rate": 7.78142380422692e-06, + "loss": 3.0756, + "mean_token_accuracy": 0.45655024436762426, + "step": 11967 + }, + { + "epoch": 2.218761586948461, + "grad_norm": 7.85546875, + "learning_rate": 7.781238413051539e-06, + "loss": 2.8121, + "mean_token_accuracy": 0.5020055913455694, + "step": 11968 + }, + { + "epoch": 2.2189469781238413, + "grad_norm": 6.671875, + "learning_rate": 7.78105302187616e-06, + "loss": 2.8269, + "mean_token_accuracy": 0.48883224083515414, + "step": 11969 + }, + { + "epoch": 2.2191323692992215, + "grad_norm": 7.76171875, + "learning_rate": 7.780867630700778e-06, + "loss": 2.665, + "mean_token_accuracy": 0.5027013443899987, + "step": 11970 + }, + { + "epoch": 2.2193177604746013, + "grad_norm": 9.1484375, + "learning_rate": 7.780682239525399e-06, + "loss": 2.7637, + "mean_token_accuracy": 0.4811472369885029, + "step": 11971 + }, + { + "epoch": 2.2195031516499815, + "grad_norm": 9.828125, + "learning_rate": 7.78049684835002e-06, + "loss": 2.4618, + "mean_token_accuracy": 0.5366212734633787, + "step": 11972 + }, + { + "epoch": 2.2196885428253617, + "grad_norm": 7.578125, + "learning_rate": 7.78031145717464e-06, + "loss": 2.25, + "mean_token_accuracy": 0.531975217893521, + "step": 11973 + }, + { + "epoch": 2.2198739340007414, + "grad_norm": 7.7890625, + "learning_rate": 7.78012606599926e-06, + "loss": 3.1534, + "mean_token_accuracy": 0.48495807989478873, + "step": 11974 + }, + { + "epoch": 2.2200593251761216, + "grad_norm": 6.8203125, + "learning_rate": 7.779940674823879e-06, + "loss": 2.5977, + "mean_token_accuracy": 0.4901147959183674, + "step": 11975 + }, + { + "epoch": 2.220244716351502, + "grad_norm": 7.28125, + "learning_rate": 7.7797552836485e-06, + "loss": 2.5241, + "mean_token_accuracy": 0.49740932642487046, + "step": 11976 + }, + { + "epoch": 2.2204301075268815, + "grad_norm": 7.671875, + "learning_rate": 7.779569892473118e-06, + "loss": 2.6926, + "mean_token_accuracy": 0.4974965862539827, + "step": 11977 + }, + { + "epoch": 2.2206154987022617, + "grad_norm": 7.23828125, + "learning_rate": 7.779384501297739e-06, + "loss": 2.7638, + "mean_token_accuracy": 0.47655088525476413, + "step": 11978 + }, + { + "epoch": 2.220800889877642, + "grad_norm": 7.7890625, + "learning_rate": 7.77919911012236e-06, + "loss": 3.4153, + "mean_token_accuracy": 0.4239278885550396, + "step": 11979 + }, + { + "epoch": 2.2209862810530216, + "grad_norm": 8.9375, + "learning_rate": 7.779013718946978e-06, + "loss": 2.6558, + "mean_token_accuracy": 0.4710344827586207, + "step": 11980 + }, + { + "epoch": 2.221171672228402, + "grad_norm": 6.18359375, + "learning_rate": 7.778828327771599e-06, + "loss": 3.0493, + "mean_token_accuracy": 0.46605482997999764, + "step": 11981 + }, + { + "epoch": 2.221357063403782, + "grad_norm": 6.828125, + "learning_rate": 7.778642936596219e-06, + "loss": 2.2945, + "mean_token_accuracy": 0.5442914585110272, + "step": 11982 + }, + { + "epoch": 2.221542454579162, + "grad_norm": 7.3125, + "learning_rate": 7.77845754542084e-06, + "loss": 3.2996, + "mean_token_accuracy": 0.4552517555655162, + "step": 11983 + }, + { + "epoch": 2.221727845754542, + "grad_norm": 6.66796875, + "learning_rate": 7.778272154245458e-06, + "loss": 3.1781, + "mean_token_accuracy": 0.4547076060644668, + "step": 11984 + }, + { + "epoch": 2.221913236929922, + "grad_norm": 8.9375, + "learning_rate": 7.778086763070079e-06, + "loss": 2.6273, + "mean_token_accuracy": 0.5001198753296572, + "step": 11985 + }, + { + "epoch": 2.2220986281053023, + "grad_norm": 9.21875, + "learning_rate": 7.777901371894698e-06, + "loss": 3.7597, + "mean_token_accuracy": 0.4258783204798629, + "step": 11986 + }, + { + "epoch": 2.222284019280682, + "grad_norm": 7.4609375, + "learning_rate": 7.777715980719318e-06, + "loss": 2.4037, + "mean_token_accuracy": 0.5353858456617353, + "step": 11987 + }, + { + "epoch": 2.2224694104560623, + "grad_norm": 7.828125, + "learning_rate": 7.777530589543939e-06, + "loss": 2.5603, + "mean_token_accuracy": 0.49210084033613444, + "step": 11988 + }, + { + "epoch": 2.2226548016314425, + "grad_norm": 8.234375, + "learning_rate": 7.77734519836856e-06, + "loss": 2.2506, + "mean_token_accuracy": 0.5515804098645363, + "step": 11989 + }, + { + "epoch": 2.222840192806822, + "grad_norm": 8.796875, + "learning_rate": 7.777159807193178e-06, + "loss": 2.2632, + "mean_token_accuracy": 0.5265197060788244, + "step": 11990 + }, + { + "epoch": 2.2230255839822024, + "grad_norm": 6.7578125, + "learning_rate": 7.776974416017798e-06, + "loss": 2.2753, + "mean_token_accuracy": 0.5423159913689762, + "step": 11991 + }, + { + "epoch": 2.2232109751575826, + "grad_norm": 7.55859375, + "learning_rate": 7.776789024842419e-06, + "loss": 2.8279, + "mean_token_accuracy": 0.4670143783478996, + "step": 11992 + }, + { + "epoch": 2.2233963663329623, + "grad_norm": 7.265625, + "learning_rate": 7.776603633667038e-06, + "loss": 3.4709, + "mean_token_accuracy": 0.4638262716865337, + "step": 11993 + }, + { + "epoch": 2.2235817575083425, + "grad_norm": 6.46875, + "learning_rate": 7.776418242491658e-06, + "loss": 2.8486, + "mean_token_accuracy": 0.4538021259198692, + "step": 11994 + }, + { + "epoch": 2.2237671486837227, + "grad_norm": 6.4140625, + "learning_rate": 7.776232851316277e-06, + "loss": 2.6459, + "mean_token_accuracy": 0.4869117869571796, + "step": 11995 + }, + { + "epoch": 2.223952539859103, + "grad_norm": 7.23046875, + "learning_rate": 7.776047460140898e-06, + "loss": 2.3697, + "mean_token_accuracy": 0.5180772960243801, + "step": 11996 + }, + { + "epoch": 2.2241379310344827, + "grad_norm": 8.7578125, + "learning_rate": 7.775862068965518e-06, + "loss": 3.4246, + "mean_token_accuracy": 0.5109034267912772, + "step": 11997 + }, + { + "epoch": 2.224323322209863, + "grad_norm": 9.59375, + "learning_rate": 7.775676677790139e-06, + "loss": 2.6548, + "mean_token_accuracy": 0.47973238882329794, + "step": 11998 + }, + { + "epoch": 2.224508713385243, + "grad_norm": 8.46875, + "learning_rate": 7.775491286614757e-06, + "loss": 2.9374, + "mean_token_accuracy": 0.4498797113071371, + "step": 11999 + }, + { + "epoch": 2.2246941045606228, + "grad_norm": 6.48046875, + "learning_rate": 7.775305895439378e-06, + "loss": 2.2535, + "mean_token_accuracy": 0.5514945829526171, + "step": 12000 + }, + { + "epoch": 2.224879495736003, + "grad_norm": 9.5, + "learning_rate": 7.775120504263998e-06, + "loss": 2.5706, + "mean_token_accuracy": 0.49743975903614457, + "step": 12001 + }, + { + "epoch": 2.225064886911383, + "grad_norm": 8.1015625, + "learning_rate": 7.774935113088617e-06, + "loss": 2.3838, + "mean_token_accuracy": 0.5169693361119381, + "step": 12002 + }, + { + "epoch": 2.225250278086763, + "grad_norm": 11.171875, + "learning_rate": 7.774749721913238e-06, + "loss": 2.1912, + "mean_token_accuracy": 0.5551322277005827, + "step": 12003 + }, + { + "epoch": 2.225435669262143, + "grad_norm": 9.7109375, + "learning_rate": 7.774564330737856e-06, + "loss": 3.0355, + "mean_token_accuracy": 0.45807275047862156, + "step": 12004 + }, + { + "epoch": 2.2256210604375233, + "grad_norm": 8.671875, + "learning_rate": 7.774378939562477e-06, + "loss": 3.3282, + "mean_token_accuracy": 0.4469732890785522, + "step": 12005 + }, + { + "epoch": 2.225806451612903, + "grad_norm": 7.28515625, + "learning_rate": 7.774193548387097e-06, + "loss": 2.9265, + "mean_token_accuracy": 0.46012781328146707, + "step": 12006 + }, + { + "epoch": 2.225991842788283, + "grad_norm": 10.4765625, + "learning_rate": 7.774008157211718e-06, + "loss": 1.8749, + "mean_token_accuracy": 0.5751047973917094, + "step": 12007 + }, + { + "epoch": 2.2261772339636634, + "grad_norm": 7.91015625, + "learning_rate": 7.773822766036337e-06, + "loss": 2.7655, + "mean_token_accuracy": 0.4764431928950504, + "step": 12008 + }, + { + "epoch": 2.2263626251390436, + "grad_norm": 8.171875, + "learning_rate": 7.773637374860957e-06, + "loss": 2.9449, + "mean_token_accuracy": 0.4619849100406268, + "step": 12009 + }, + { + "epoch": 2.2265480163144233, + "grad_norm": 6.953125, + "learning_rate": 7.773451983685578e-06, + "loss": 2.7073, + "mean_token_accuracy": 0.46598157335223245, + "step": 12010 + }, + { + "epoch": 2.2267334074898035, + "grad_norm": 7.734375, + "learning_rate": 7.773266592510196e-06, + "loss": 2.5881, + "mean_token_accuracy": 0.5057515337423313, + "step": 12011 + }, + { + "epoch": 2.2269187986651837, + "grad_norm": 7.46875, + "learning_rate": 7.773081201334817e-06, + "loss": 2.5666, + "mean_token_accuracy": 0.47752207653197754, + "step": 12012 + }, + { + "epoch": 2.2271041898405635, + "grad_norm": 5.50390625, + "learning_rate": 7.772895810159436e-06, + "loss": 1.8873, + "mean_token_accuracy": 0.6180646700871018, + "step": 12013 + }, + { + "epoch": 2.2272895810159437, + "grad_norm": 9.953125, + "learning_rate": 7.772710418984058e-06, + "loss": 2.5095, + "mean_token_accuracy": 0.5322902796271638, + "step": 12014 + }, + { + "epoch": 2.227474972191324, + "grad_norm": 9.7890625, + "learning_rate": 7.772525027808677e-06, + "loss": 3.0417, + "mean_token_accuracy": 0.45217391304347826, + "step": 12015 + }, + { + "epoch": 2.2276603633667036, + "grad_norm": 8.0078125, + "learning_rate": 7.772339636633297e-06, + "loss": 2.579, + "mean_token_accuracy": 0.4932673811486672, + "step": 12016 + }, + { + "epoch": 2.227845754542084, + "grad_norm": 7.11328125, + "learning_rate": 7.772154245457918e-06, + "loss": 2.8692, + "mean_token_accuracy": 0.4742093373493976, + "step": 12017 + }, + { + "epoch": 2.228031145717464, + "grad_norm": 8.3359375, + "learning_rate": 7.771968854282536e-06, + "loss": 3.4776, + "mean_token_accuracy": 0.4562784915043514, + "step": 12018 + }, + { + "epoch": 2.2282165368928437, + "grad_norm": 7.9453125, + "learning_rate": 7.771783463107157e-06, + "loss": 3.1038, + "mean_token_accuracy": 0.45587382581468167, + "step": 12019 + }, + { + "epoch": 2.228401928068224, + "grad_norm": 8.046875, + "learning_rate": 7.771598071931776e-06, + "loss": 3.0998, + "mean_token_accuracy": 0.45685005393743255, + "step": 12020 + }, + { + "epoch": 2.228587319243604, + "grad_norm": 7.30859375, + "learning_rate": 7.771412680756396e-06, + "loss": 2.3569, + "mean_token_accuracy": 0.5152247462542291, + "step": 12021 + }, + { + "epoch": 2.2287727104189843, + "grad_norm": 10.203125, + "learning_rate": 7.771227289581017e-06, + "loss": 3.1015, + "mean_token_accuracy": 0.47266265718972117, + "step": 12022 + }, + { + "epoch": 2.228958101594364, + "grad_norm": 11.0078125, + "learning_rate": 7.771041898405637e-06, + "loss": 3.0635, + "mean_token_accuracy": 0.4423125473365312, + "step": 12023 + }, + { + "epoch": 2.229143492769744, + "grad_norm": 10.2890625, + "learning_rate": 7.770856507230256e-06, + "loss": 3.4326, + "mean_token_accuracy": 0.41913499344692007, + "step": 12024 + }, + { + "epoch": 2.2293288839451244, + "grad_norm": 8.46875, + "learning_rate": 7.770671116054877e-06, + "loss": 3.1609, + "mean_token_accuracy": 0.4506513026052104, + "step": 12025 + }, + { + "epoch": 2.229514275120504, + "grad_norm": 6.640625, + "learning_rate": 7.770485724879497e-06, + "loss": 2.8656, + "mean_token_accuracy": 0.4934078212290503, + "step": 12026 + }, + { + "epoch": 2.2296996662958843, + "grad_norm": 18.140625, + "learning_rate": 7.770300333704116e-06, + "loss": 2.5152, + "mean_token_accuracy": 0.5103658536585366, + "step": 12027 + }, + { + "epoch": 2.2298850574712645, + "grad_norm": 11.875, + "learning_rate": 7.770114942528736e-06, + "loss": 2.8123, + "mean_token_accuracy": 0.4768059744363062, + "step": 12028 + }, + { + "epoch": 2.2300704486466443, + "grad_norm": 9.28125, + "learning_rate": 7.769929551353355e-06, + "loss": 2.5078, + "mean_token_accuracy": 0.5463003343509231, + "step": 12029 + }, + { + "epoch": 2.2302558398220245, + "grad_norm": 6.39453125, + "learning_rate": 7.769744160177977e-06, + "loss": 2.1212, + "mean_token_accuracy": 0.5544589047078348, + "step": 12030 + }, + { + "epoch": 2.2304412309974047, + "grad_norm": 8.2421875, + "learning_rate": 7.769558769002596e-06, + "loss": 3.4837, + "mean_token_accuracy": 0.44278825076720735, + "step": 12031 + }, + { + "epoch": 2.2306266221727844, + "grad_norm": 9.6484375, + "learning_rate": 7.769373377827217e-06, + "loss": 2.9991, + "mean_token_accuracy": 0.48059701492537316, + "step": 12032 + }, + { + "epoch": 2.2308120133481646, + "grad_norm": 13.90625, + "learning_rate": 7.769187986651835e-06, + "loss": 2.549, + "mean_token_accuracy": 0.5010088138472975, + "step": 12033 + }, + { + "epoch": 2.230997404523545, + "grad_norm": 8.4140625, + "learning_rate": 7.769002595476456e-06, + "loss": 3.4452, + "mean_token_accuracy": 0.42181244692466335, + "step": 12034 + }, + { + "epoch": 2.2311827956989245, + "grad_norm": 10.8984375, + "learning_rate": 7.768817204301076e-06, + "loss": 3.2512, + "mean_token_accuracy": 0.448995877111318, + "step": 12035 + }, + { + "epoch": 2.2313681868743047, + "grad_norm": 17.640625, + "learning_rate": 7.768631813125695e-06, + "loss": 2.7173, + "mean_token_accuracy": 0.47703885731838425, + "step": 12036 + }, + { + "epoch": 2.231553578049685, + "grad_norm": 9.109375, + "learning_rate": 7.768446421950316e-06, + "loss": 3.429, + "mean_token_accuracy": 0.43478260869565216, + "step": 12037 + }, + { + "epoch": 2.231738969225065, + "grad_norm": 17.046875, + "learning_rate": 7.768261030774936e-06, + "loss": 2.5474, + "mean_token_accuracy": 0.5048292749285811, + "step": 12038 + }, + { + "epoch": 2.231924360400445, + "grad_norm": 10.8125, + "learning_rate": 7.768075639599557e-06, + "loss": 2.8851, + "mean_token_accuracy": 0.4807533681605719, + "step": 12039 + }, + { + "epoch": 2.232109751575825, + "grad_norm": 9.375, + "learning_rate": 7.767890248424175e-06, + "loss": 3.0266, + "mean_token_accuracy": 0.4669542229024628, + "step": 12040 + }, + { + "epoch": 2.232295142751205, + "grad_norm": 8.421875, + "learning_rate": 7.767704857248796e-06, + "loss": 3.1411, + "mean_token_accuracy": 0.4681952662721893, + "step": 12041 + }, + { + "epoch": 2.232480533926585, + "grad_norm": 7.171875, + "learning_rate": 7.767519466073415e-06, + "loss": 2.8008, + "mean_token_accuracy": 0.48014359434597265, + "step": 12042 + }, + { + "epoch": 2.232665925101965, + "grad_norm": 6.9296875, + "learning_rate": 7.767334074898035e-06, + "loss": 2.5464, + "mean_token_accuracy": 0.5203527815468114, + "step": 12043 + }, + { + "epoch": 2.2328513162773453, + "grad_norm": 6.921875, + "learning_rate": 7.767148683722656e-06, + "loss": 2.9109, + "mean_token_accuracy": 0.4917730683164407, + "step": 12044 + }, + { + "epoch": 2.233036707452725, + "grad_norm": 9.15625, + "learning_rate": 7.766963292547275e-06, + "loss": 3.1433, + "mean_token_accuracy": 0.45906286476571617, + "step": 12045 + }, + { + "epoch": 2.2332220986281053, + "grad_norm": 7.26171875, + "learning_rate": 7.766777901371895e-06, + "loss": 2.3254, + "mean_token_accuracy": 0.5052090380192126, + "step": 12046 + }, + { + "epoch": 2.2334074898034855, + "grad_norm": 8.515625, + "learning_rate": 7.766592510196515e-06, + "loss": 2.9677, + "mean_token_accuracy": 0.47182398864442865, + "step": 12047 + }, + { + "epoch": 2.233592880978865, + "grad_norm": 6.26953125, + "learning_rate": 7.766407119021136e-06, + "loss": 2.7347, + "mean_token_accuracy": 0.4868785295094674, + "step": 12048 + }, + { + "epoch": 2.2337782721542454, + "grad_norm": 8.9296875, + "learning_rate": 7.766221727845755e-06, + "loss": 2.5619, + "mean_token_accuracy": 0.47712814298697365, + "step": 12049 + }, + { + "epoch": 2.2339636633296256, + "grad_norm": 7.28515625, + "learning_rate": 7.766036336670375e-06, + "loss": 3.0943, + "mean_token_accuracy": 0.4607336139506915, + "step": 12050 + }, + { + "epoch": 2.2341490545050053, + "grad_norm": 6.921875, + "learning_rate": 7.765850945494994e-06, + "loss": 2.6011, + "mean_token_accuracy": 0.49840425531914895, + "step": 12051 + }, + { + "epoch": 2.2343344456803855, + "grad_norm": 7.171875, + "learning_rate": 7.765665554319615e-06, + "loss": 2.7287, + "mean_token_accuracy": 0.49435566632458583, + "step": 12052 + }, + { + "epoch": 2.2345198368557657, + "grad_norm": 6.5234375, + "learning_rate": 7.765480163144235e-06, + "loss": 3.3745, + "mean_token_accuracy": 0.4345413764608665, + "step": 12053 + }, + { + "epoch": 2.234705228031146, + "grad_norm": 7.18359375, + "learning_rate": 7.765294771968856e-06, + "loss": 2.552, + "mean_token_accuracy": 0.50757977313686, + "step": 12054 + }, + { + "epoch": 2.2348906192065257, + "grad_norm": 10.3203125, + "learning_rate": 7.765109380793476e-06, + "loss": 2.6136, + "mean_token_accuracy": 0.514277245331489, + "step": 12055 + }, + { + "epoch": 2.235076010381906, + "grad_norm": 6.7578125, + "learning_rate": 7.764923989618095e-06, + "loss": 2.9183, + "mean_token_accuracy": 0.4629731970157502, + "step": 12056 + }, + { + "epoch": 2.235261401557286, + "grad_norm": 7.12890625, + "learning_rate": 7.764738598442715e-06, + "loss": 2.9702, + "mean_token_accuracy": 0.4732010660349423, + "step": 12057 + }, + { + "epoch": 2.2354467927326658, + "grad_norm": 10.5, + "learning_rate": 7.764553207267334e-06, + "loss": 1.9867, + "mean_token_accuracy": 0.562273276904474, + "step": 12058 + }, + { + "epoch": 2.235632183908046, + "grad_norm": 9.25, + "learning_rate": 7.764367816091955e-06, + "loss": 2.9177, + "mean_token_accuracy": 0.4767565919829294, + "step": 12059 + }, + { + "epoch": 2.235817575083426, + "grad_norm": 7.30078125, + "learning_rate": 7.764182424916575e-06, + "loss": 2.6563, + "mean_token_accuracy": 0.5096551724137931, + "step": 12060 + }, + { + "epoch": 2.236002966258806, + "grad_norm": 7.02734375, + "learning_rate": 7.763997033741194e-06, + "loss": 2.6784, + "mean_token_accuracy": 0.4756410256410256, + "step": 12061 + }, + { + "epoch": 2.236188357434186, + "grad_norm": 7.34765625, + "learning_rate": 7.763811642565814e-06, + "loss": 2.4739, + "mean_token_accuracy": 0.5207944548120501, + "step": 12062 + }, + { + "epoch": 2.2363737486095663, + "grad_norm": 9.4453125, + "learning_rate": 7.763626251390435e-06, + "loss": 2.8722, + "mean_token_accuracy": 0.47250621718706826, + "step": 12063 + }, + { + "epoch": 2.236559139784946, + "grad_norm": 6.89453125, + "learning_rate": 7.763440860215055e-06, + "loss": 2.2982, + "mean_token_accuracy": 0.5625094711319897, + "step": 12064 + }, + { + "epoch": 2.236744530960326, + "grad_norm": 10.359375, + "learning_rate": 7.763255469039674e-06, + "loss": 3.4173, + "mean_token_accuracy": 0.4757344237598192, + "step": 12065 + }, + { + "epoch": 2.2369299221357064, + "grad_norm": 7.4375, + "learning_rate": 7.763070077864295e-06, + "loss": 2.9677, + "mean_token_accuracy": 0.47505197505197505, + "step": 12066 + }, + { + "epoch": 2.2371153133110866, + "grad_norm": 7.0703125, + "learning_rate": 7.762884686688913e-06, + "loss": 2.7488, + "mean_token_accuracy": 0.4832523315381584, + "step": 12067 + }, + { + "epoch": 2.2373007044864663, + "grad_norm": 6.10546875, + "learning_rate": 7.762699295513534e-06, + "loss": 3.1267, + "mean_token_accuracy": 0.4605229428351705, + "step": 12068 + }, + { + "epoch": 2.2374860956618465, + "grad_norm": 7.4765625, + "learning_rate": 7.762513904338154e-06, + "loss": 2.6699, + "mean_token_accuracy": 0.4881410654414504, + "step": 12069 + }, + { + "epoch": 2.2376714868372267, + "grad_norm": 8.5859375, + "learning_rate": 7.762328513162775e-06, + "loss": 2.474, + "mean_token_accuracy": 0.5048532335399368, + "step": 12070 + }, + { + "epoch": 2.2378568780126065, + "grad_norm": 6.453125, + "learning_rate": 7.762143121987394e-06, + "loss": 2.7844, + "mean_token_accuracy": 0.48481262327416175, + "step": 12071 + }, + { + "epoch": 2.2380422691879867, + "grad_norm": 7.59765625, + "learning_rate": 7.761957730812014e-06, + "loss": 2.5888, + "mean_token_accuracy": 0.4884855879213055, + "step": 12072 + }, + { + "epoch": 2.238227660363367, + "grad_norm": 8.1484375, + "learning_rate": 7.761772339636635e-06, + "loss": 2.2001, + "mean_token_accuracy": 0.5277821501652763, + "step": 12073 + }, + { + "epoch": 2.2384130515387466, + "grad_norm": 7.98828125, + "learning_rate": 7.761586948461254e-06, + "loss": 1.9494, + "mean_token_accuracy": 0.5598180970149254, + "step": 12074 + }, + { + "epoch": 2.238598442714127, + "grad_norm": 9.765625, + "learning_rate": 7.761401557285874e-06, + "loss": 2.6822, + "mean_token_accuracy": 0.49797160243407707, + "step": 12075 + }, + { + "epoch": 2.238783833889507, + "grad_norm": 8.4453125, + "learning_rate": 7.761216166110493e-06, + "loss": 3.0987, + "mean_token_accuracy": 0.4543436025534277, + "step": 12076 + }, + { + "epoch": 2.2389692250648867, + "grad_norm": 10.6953125, + "learning_rate": 7.761030774935113e-06, + "loss": 2.4638, + "mean_token_accuracy": 0.5197465369879163, + "step": 12077 + }, + { + "epoch": 2.239154616240267, + "grad_norm": 9.6328125, + "learning_rate": 7.760845383759734e-06, + "loss": 3.1135, + "mean_token_accuracy": 0.4840090357864701, + "step": 12078 + }, + { + "epoch": 2.239340007415647, + "grad_norm": 6.7734375, + "learning_rate": 7.760659992584354e-06, + "loss": 2.6245, + "mean_token_accuracy": 0.5267494773090641, + "step": 12079 + }, + { + "epoch": 2.2395253985910273, + "grad_norm": 9.4296875, + "learning_rate": 7.760474601408973e-06, + "loss": 2.1503, + "mean_token_accuracy": 0.5302083333333333, + "step": 12080 + }, + { + "epoch": 2.239710789766407, + "grad_norm": 9.734375, + "learning_rate": 7.760289210233594e-06, + "loss": 2.6242, + "mean_token_accuracy": 0.4913735899137359, + "step": 12081 + }, + { + "epoch": 2.239896180941787, + "grad_norm": 6.51953125, + "learning_rate": 7.760103819058214e-06, + "loss": 2.6773, + "mean_token_accuracy": 0.49447317716255956, + "step": 12082 + }, + { + "epoch": 2.2400815721171674, + "grad_norm": 6.20703125, + "learning_rate": 7.759918427882833e-06, + "loss": 3.5276, + "mean_token_accuracy": 0.43000658761528326, + "step": 12083 + }, + { + "epoch": 2.240266963292547, + "grad_norm": 7.87109375, + "learning_rate": 7.759733036707453e-06, + "loss": 2.6657, + "mean_token_accuracy": 0.5113230035756854, + "step": 12084 + }, + { + "epoch": 2.2404523544679273, + "grad_norm": 8.640625, + "learning_rate": 7.759547645532072e-06, + "loss": 2.6708, + "mean_token_accuracy": 0.4780566366930613, + "step": 12085 + }, + { + "epoch": 2.2406377456433075, + "grad_norm": 7.67578125, + "learning_rate": 7.759362254356694e-06, + "loss": 2.9049, + "mean_token_accuracy": 0.465109964702688, + "step": 12086 + }, + { + "epoch": 2.2408231368186873, + "grad_norm": 8.890625, + "learning_rate": 7.759176863181313e-06, + "loss": 2.6978, + "mean_token_accuracy": 0.4842185128983308, + "step": 12087 + }, + { + "epoch": 2.2410085279940675, + "grad_norm": 9.6015625, + "learning_rate": 7.758991472005934e-06, + "loss": 2.4653, + "mean_token_accuracy": 0.5004153316720067, + "step": 12088 + }, + { + "epoch": 2.2411939191694477, + "grad_norm": 7.58984375, + "learning_rate": 7.758806080830552e-06, + "loss": 2.8256, + "mean_token_accuracy": 0.46210643633324044, + "step": 12089 + }, + { + "epoch": 2.2413793103448274, + "grad_norm": 7.09765625, + "learning_rate": 7.758620689655173e-06, + "loss": 3.3098, + "mean_token_accuracy": 0.45077978789769185, + "step": 12090 + }, + { + "epoch": 2.2415647015202076, + "grad_norm": 6.796875, + "learning_rate": 7.758435298479793e-06, + "loss": 2.9374, + "mean_token_accuracy": 0.4592112371690978, + "step": 12091 + }, + { + "epoch": 2.241750092695588, + "grad_norm": 10.4921875, + "learning_rate": 7.758249907304412e-06, + "loss": 2.8221, + "mean_token_accuracy": 0.49542786952367956, + "step": 12092 + }, + { + "epoch": 2.241935483870968, + "grad_norm": 8.4375, + "learning_rate": 7.758064516129033e-06, + "loss": 2.9644, + "mean_token_accuracy": 0.46271664651350264, + "step": 12093 + }, + { + "epoch": 2.2421208750463477, + "grad_norm": 8.5, + "learning_rate": 7.757879124953653e-06, + "loss": 2.6776, + "mean_token_accuracy": 0.49653808110781406, + "step": 12094 + }, + { + "epoch": 2.242306266221728, + "grad_norm": 9.109375, + "learning_rate": 7.757693733778274e-06, + "loss": 2.4345, + "mean_token_accuracy": 0.5036297640653358, + "step": 12095 + }, + { + "epoch": 2.242491657397108, + "grad_norm": 7.43359375, + "learning_rate": 7.757508342602892e-06, + "loss": 3.0578, + "mean_token_accuracy": 0.46436015006252607, + "step": 12096 + }, + { + "epoch": 2.242677048572488, + "grad_norm": 7.640625, + "learning_rate": 7.757322951427513e-06, + "loss": 2.4669, + "mean_token_accuracy": 0.497488138431482, + "step": 12097 + }, + { + "epoch": 2.242862439747868, + "grad_norm": 7.984375, + "learning_rate": 7.757137560252133e-06, + "loss": 2.1927, + "mean_token_accuracy": 0.5394279604383855, + "step": 12098 + }, + { + "epoch": 2.243047830923248, + "grad_norm": 8.0, + "learning_rate": 7.756952169076752e-06, + "loss": 2.6437, + "mean_token_accuracy": 0.47671607274256284, + "step": 12099 + }, + { + "epoch": 2.243233222098628, + "grad_norm": 7.51171875, + "learning_rate": 7.756766777901373e-06, + "loss": 3.3141, + "mean_token_accuracy": 0.44250816866345105, + "step": 12100 + }, + { + "epoch": 2.243418613274008, + "grad_norm": 6.8671875, + "learning_rate": 7.756581386725992e-06, + "loss": 2.9843, + "mean_token_accuracy": 0.4984126984126984, + "step": 12101 + }, + { + "epoch": 2.2436040044493883, + "grad_norm": 8.296875, + "learning_rate": 7.756395995550614e-06, + "loss": 2.9832, + "mean_token_accuracy": 0.46637533381888807, + "step": 12102 + }, + { + "epoch": 2.243789395624768, + "grad_norm": 8.4921875, + "learning_rate": 7.756210604375233e-06, + "loss": 2.8825, + "mean_token_accuracy": 0.48408910103420844, + "step": 12103 + }, + { + "epoch": 2.2439747868001483, + "grad_norm": 7.58984375, + "learning_rate": 7.756025213199853e-06, + "loss": 2.5594, + "mean_token_accuracy": 0.48502304147465436, + "step": 12104 + }, + { + "epoch": 2.2441601779755285, + "grad_norm": 6.08984375, + "learning_rate": 7.755839822024472e-06, + "loss": 2.5902, + "mean_token_accuracy": 0.49101961867919314, + "step": 12105 + }, + { + "epoch": 2.244345569150908, + "grad_norm": 7.35546875, + "learning_rate": 7.755654430849092e-06, + "loss": 2.868, + "mean_token_accuracy": 0.5196820590461771, + "step": 12106 + }, + { + "epoch": 2.2445309603262884, + "grad_norm": 11.3671875, + "learning_rate": 7.755469039673713e-06, + "loss": 2.7644, + "mean_token_accuracy": 0.4795240302889816, + "step": 12107 + }, + { + "epoch": 2.2447163515016686, + "grad_norm": 7.23828125, + "learning_rate": 7.755283648498332e-06, + "loss": 3.1238, + "mean_token_accuracy": 0.4791483757682177, + "step": 12108 + }, + { + "epoch": 2.2449017426770483, + "grad_norm": 6.6484375, + "learning_rate": 7.755098257322952e-06, + "loss": 2.3348, + "mean_token_accuracy": 0.5593173685659235, + "step": 12109 + }, + { + "epoch": 2.2450871338524285, + "grad_norm": 8.8046875, + "learning_rate": 7.754912866147573e-06, + "loss": 2.3053, + "mean_token_accuracy": 0.5271523178807948, + "step": 12110 + }, + { + "epoch": 2.2452725250278087, + "grad_norm": 8.4921875, + "learning_rate": 7.754727474972193e-06, + "loss": 2.5885, + "mean_token_accuracy": 0.49887402304941053, + "step": 12111 + }, + { + "epoch": 2.245457916203189, + "grad_norm": 8.390625, + "learning_rate": 7.754542083796812e-06, + "loss": 2.5485, + "mean_token_accuracy": 0.5562073669849932, + "step": 12112 + }, + { + "epoch": 2.2456433073785687, + "grad_norm": 10.6328125, + "learning_rate": 7.754356692621432e-06, + "loss": 2.9785, + "mean_token_accuracy": 0.4877943088256536, + "step": 12113 + }, + { + "epoch": 2.245828698553949, + "grad_norm": 10.1171875, + "learning_rate": 7.754171301446051e-06, + "loss": 2.7546, + "mean_token_accuracy": 0.47619665326242056, + "step": 12114 + }, + { + "epoch": 2.246014089729329, + "grad_norm": 8.796875, + "learning_rate": 7.753985910270672e-06, + "loss": 2.7775, + "mean_token_accuracy": 0.49025300705101615, + "step": 12115 + }, + { + "epoch": 2.246199480904709, + "grad_norm": 8.6640625, + "learning_rate": 7.753800519095292e-06, + "loss": 2.21, + "mean_token_accuracy": 0.5727558230829626, + "step": 12116 + }, + { + "epoch": 2.246384872080089, + "grad_norm": 9.078125, + "learning_rate": 7.753615127919911e-06, + "loss": 3.8653, + "mean_token_accuracy": 0.4176839703365659, + "step": 12117 + }, + { + "epoch": 2.246570263255469, + "grad_norm": 6.50390625, + "learning_rate": 7.753429736744531e-06, + "loss": 2.8779, + "mean_token_accuracy": 0.466182478438493, + "step": 12118 + }, + { + "epoch": 2.246755654430849, + "grad_norm": 7.23046875, + "learning_rate": 7.753244345569152e-06, + "loss": 3.0161, + "mean_token_accuracy": 0.4658457550226831, + "step": 12119 + }, + { + "epoch": 2.246941045606229, + "grad_norm": 8.4765625, + "learning_rate": 7.753058954393772e-06, + "loss": 2.5899, + "mean_token_accuracy": 0.4954117215220849, + "step": 12120 + }, + { + "epoch": 2.2471264367816093, + "grad_norm": 8.46875, + "learning_rate": 7.752873563218391e-06, + "loss": 2.7308, + "mean_token_accuracy": 0.49852625937834943, + "step": 12121 + }, + { + "epoch": 2.247311827956989, + "grad_norm": 7.86328125, + "learning_rate": 7.752688172043012e-06, + "loss": 4.0007, + "mean_token_accuracy": 0.411214953271028, + "step": 12122 + }, + { + "epoch": 2.247497219132369, + "grad_norm": 7.8984375, + "learning_rate": 7.75250278086763e-06, + "loss": 2.3514, + "mean_token_accuracy": 0.5447880870561282, + "step": 12123 + }, + { + "epoch": 2.2476826103077494, + "grad_norm": 8.8046875, + "learning_rate": 7.752317389692251e-06, + "loss": 2.1949, + "mean_token_accuracy": 0.5611441037450006, + "step": 12124 + }, + { + "epoch": 2.2478680014831296, + "grad_norm": 6.515625, + "learning_rate": 7.752131998516871e-06, + "loss": 2.9224, + "mean_token_accuracy": 0.487448588852645, + "step": 12125 + }, + { + "epoch": 2.2480533926585093, + "grad_norm": 8.1015625, + "learning_rate": 7.75194660734149e-06, + "loss": 2.6046, + "mean_token_accuracy": 0.5054762694988384, + "step": 12126 + }, + { + "epoch": 2.2482387838338895, + "grad_norm": 8.7578125, + "learning_rate": 7.75176121616611e-06, + "loss": 2.9216, + "mean_token_accuracy": 0.4445641923750505, + "step": 12127 + }, + { + "epoch": 2.2484241750092697, + "grad_norm": 6.9921875, + "learning_rate": 7.751575824990731e-06, + "loss": 3.2858, + "mean_token_accuracy": 0.44481054365733114, + "step": 12128 + }, + { + "epoch": 2.2486095661846495, + "grad_norm": 7.34375, + "learning_rate": 7.751390433815352e-06, + "loss": 3.1713, + "mean_token_accuracy": 0.46034725480994837, + "step": 12129 + }, + { + "epoch": 2.2487949573600297, + "grad_norm": 7.89453125, + "learning_rate": 7.75120504263997e-06, + "loss": 2.4901, + "mean_token_accuracy": 0.5337311251826595, + "step": 12130 + }, + { + "epoch": 2.24898034853541, + "grad_norm": 9.6875, + "learning_rate": 7.751019651464591e-06, + "loss": 2.5727, + "mean_token_accuracy": 0.5099528548978522, + "step": 12131 + }, + { + "epoch": 2.2491657397107896, + "grad_norm": 10.421875, + "learning_rate": 7.75083426028921e-06, + "loss": 3.3868, + "mean_token_accuracy": 0.4244176706827309, + "step": 12132 + }, + { + "epoch": 2.24935113088617, + "grad_norm": 8.125, + "learning_rate": 7.75064886911383e-06, + "loss": 2.752, + "mean_token_accuracy": 0.46797180892717305, + "step": 12133 + }, + { + "epoch": 2.24953652206155, + "grad_norm": 8.359375, + "learning_rate": 7.75046347793845e-06, + "loss": 3.1624, + "mean_token_accuracy": 0.4776416378678396, + "step": 12134 + }, + { + "epoch": 2.2497219132369297, + "grad_norm": 7.3984375, + "learning_rate": 7.750278086763071e-06, + "loss": 2.8279, + "mean_token_accuracy": 0.48582839474362277, + "step": 12135 + }, + { + "epoch": 2.24990730441231, + "grad_norm": 8.9296875, + "learning_rate": 7.750092695587692e-06, + "loss": 2.7376, + "mean_token_accuracy": 0.48172189955585926, + "step": 12136 + }, + { + "epoch": 2.25009269558769, + "grad_norm": 8.609375, + "learning_rate": 7.74990730441231e-06, + "loss": 2.4682, + "mean_token_accuracy": 0.5165595650024716, + "step": 12137 + }, + { + "epoch": 2.2502780867630703, + "grad_norm": 6.04296875, + "learning_rate": 7.749721913236931e-06, + "loss": 2.1244, + "mean_token_accuracy": 0.5573440643863179, + "step": 12138 + }, + { + "epoch": 2.25046347793845, + "grad_norm": 7.4609375, + "learning_rate": 7.74953652206155e-06, + "loss": 2.424, + "mean_token_accuracy": 0.5202831472910427, + "step": 12139 + }, + { + "epoch": 2.25064886911383, + "grad_norm": 6.15234375, + "learning_rate": 7.74935113088617e-06, + "loss": 2.0929, + "mean_token_accuracy": 0.5698474827911999, + "step": 12140 + }, + { + "epoch": 2.2508342602892104, + "grad_norm": 8.984375, + "learning_rate": 7.74916573971079e-06, + "loss": 3.034, + "mean_token_accuracy": 0.47167344567112146, + "step": 12141 + }, + { + "epoch": 2.25101965146459, + "grad_norm": 6.28125, + "learning_rate": 7.74898034853541e-06, + "loss": 2.2375, + "mean_token_accuracy": 0.5411836485661989, + "step": 12142 + }, + { + "epoch": 2.2512050426399703, + "grad_norm": 6.4921875, + "learning_rate": 7.74879495736003e-06, + "loss": 2.4503, + "mean_token_accuracy": 0.5004016870857602, + "step": 12143 + }, + { + "epoch": 2.2513904338153505, + "grad_norm": 8.1328125, + "learning_rate": 7.74860956618465e-06, + "loss": 3.0079, + "mean_token_accuracy": 0.4508568917533324, + "step": 12144 + }, + { + "epoch": 2.2515758249907303, + "grad_norm": 7.4375, + "learning_rate": 7.748424175009271e-06, + "loss": 2.8543, + "mean_token_accuracy": 0.4930673457838144, + "step": 12145 + }, + { + "epoch": 2.2517612161661105, + "grad_norm": 6.7421875, + "learning_rate": 7.74823878383389e-06, + "loss": 2.0357, + "mean_token_accuracy": 0.5958986731001207, + "step": 12146 + }, + { + "epoch": 2.2519466073414907, + "grad_norm": 11.21875, + "learning_rate": 7.74805339265851e-06, + "loss": 2.5374, + "mean_token_accuracy": 0.5060950554718532, + "step": 12147 + }, + { + "epoch": 2.2521319985168704, + "grad_norm": 7.23828125, + "learning_rate": 7.74786800148313e-06, + "loss": 2.9639, + "mean_token_accuracy": 0.45950351053159477, + "step": 12148 + }, + { + "epoch": 2.2523173896922506, + "grad_norm": 8.2421875, + "learning_rate": 7.74768261030775e-06, + "loss": 2.5864, + "mean_token_accuracy": 0.5086425443650611, + "step": 12149 + }, + { + "epoch": 2.252502780867631, + "grad_norm": 9.9140625, + "learning_rate": 7.74749721913237e-06, + "loss": 2.9266, + "mean_token_accuracy": 0.46787792423702645, + "step": 12150 + }, + { + "epoch": 2.252688172043011, + "grad_norm": 6.40625, + "learning_rate": 7.74731182795699e-06, + "loss": 2.6967, + "mean_token_accuracy": 0.48873373532211994, + "step": 12151 + }, + { + "epoch": 2.2528735632183907, + "grad_norm": 17.203125, + "learning_rate": 7.74712643678161e-06, + "loss": 3.5208, + "mean_token_accuracy": 0.43896781675848073, + "step": 12152 + }, + { + "epoch": 2.253058954393771, + "grad_norm": 8.3046875, + "learning_rate": 7.74694104560623e-06, + "loss": 3.5203, + "mean_token_accuracy": 0.4177996115617503, + "step": 12153 + }, + { + "epoch": 2.253244345569151, + "grad_norm": 11.734375, + "learning_rate": 7.74675565443085e-06, + "loss": 3.2452, + "mean_token_accuracy": 0.4503130335799658, + "step": 12154 + }, + { + "epoch": 2.253429736744531, + "grad_norm": 10.2421875, + "learning_rate": 7.74657026325547e-06, + "loss": 2.8815, + "mean_token_accuracy": 0.4617801047120419, + "step": 12155 + }, + { + "epoch": 2.253615127919911, + "grad_norm": 6.41796875, + "learning_rate": 7.74638487208009e-06, + "loss": 2.6623, + "mean_token_accuracy": 0.4972477064220184, + "step": 12156 + }, + { + "epoch": 2.253800519095291, + "grad_norm": 9.6796875, + "learning_rate": 7.746199480904709e-06, + "loss": 2.7382, + "mean_token_accuracy": 0.4926342467923333, + "step": 12157 + }, + { + "epoch": 2.253985910270671, + "grad_norm": 8.859375, + "learning_rate": 7.746014089729329e-06, + "loss": 3.59, + "mean_token_accuracy": 0.424284346067894, + "step": 12158 + }, + { + "epoch": 2.254171301446051, + "grad_norm": 6.8984375, + "learning_rate": 7.74582869855395e-06, + "loss": 3.1383, + "mean_token_accuracy": 0.43444165621079045, + "step": 12159 + }, + { + "epoch": 2.2543566926214313, + "grad_norm": 9.875, + "learning_rate": 7.74564330737857e-06, + "loss": 2.2417, + "mean_token_accuracy": 0.5447269910933265, + "step": 12160 + }, + { + "epoch": 2.254542083796811, + "grad_norm": 11.8984375, + "learning_rate": 7.745457916203189e-06, + "loss": 2.4905, + "mean_token_accuracy": 0.5002277904328019, + "step": 12161 + }, + { + "epoch": 2.2547274749721913, + "grad_norm": 7.6171875, + "learning_rate": 7.74527252502781e-06, + "loss": 2.9058, + "mean_token_accuracy": 0.462475442043222, + "step": 12162 + }, + { + "epoch": 2.2549128661475715, + "grad_norm": 8.1953125, + "learning_rate": 7.74508713385243e-06, + "loss": 3.0764, + "mean_token_accuracy": 0.45381335859782096, + "step": 12163 + }, + { + "epoch": 2.2550982573229517, + "grad_norm": 7.484375, + "learning_rate": 7.744901742677049e-06, + "loss": 2.5664, + "mean_token_accuracy": 0.5121342708097074, + "step": 12164 + }, + { + "epoch": 2.2552836484983314, + "grad_norm": 7.671875, + "learning_rate": 7.744716351501669e-06, + "loss": 3.2198, + "mean_token_accuracy": 0.45454545454545453, + "step": 12165 + }, + { + "epoch": 2.2554690396737116, + "grad_norm": 7.60546875, + "learning_rate": 7.744530960326288e-06, + "loss": 2.3133, + "mean_token_accuracy": 0.5286697247706422, + "step": 12166 + }, + { + "epoch": 2.2556544308490913, + "grad_norm": 7.078125, + "learning_rate": 7.74434556915091e-06, + "loss": 2.6853, + "mean_token_accuracy": 0.49432081594807603, + "step": 12167 + }, + { + "epoch": 2.2558398220244715, + "grad_norm": 9.71875, + "learning_rate": 7.744160177975529e-06, + "loss": 2.9387, + "mean_token_accuracy": 0.4880048587913757, + "step": 12168 + }, + { + "epoch": 2.2560252131998517, + "grad_norm": 8.2421875, + "learning_rate": 7.74397478680015e-06, + "loss": 2.2322, + "mean_token_accuracy": 0.5385499253131104, + "step": 12169 + }, + { + "epoch": 2.256210604375232, + "grad_norm": 7.57421875, + "learning_rate": 7.743789395624768e-06, + "loss": 2.5122, + "mean_token_accuracy": 0.5107234314980794, + "step": 12170 + }, + { + "epoch": 2.2563959955506117, + "grad_norm": 7.1875, + "learning_rate": 7.743604004449389e-06, + "loss": 3.2273, + "mean_token_accuracy": 0.4438337801608579, + "step": 12171 + }, + { + "epoch": 2.256581386725992, + "grad_norm": 7.125, + "learning_rate": 7.743418613274009e-06, + "loss": 3.1777, + "mean_token_accuracy": 0.46206237304337433, + "step": 12172 + }, + { + "epoch": 2.256766777901372, + "grad_norm": 6.50390625, + "learning_rate": 7.743233222098628e-06, + "loss": 3.0609, + "mean_token_accuracy": 0.4763869286722397, + "step": 12173 + }, + { + "epoch": 2.256952169076752, + "grad_norm": 7.70703125, + "learning_rate": 7.743047830923248e-06, + "loss": 2.8586, + "mean_token_accuracy": 0.4877212237618329, + "step": 12174 + }, + { + "epoch": 2.257137560252132, + "grad_norm": 11.015625, + "learning_rate": 7.742862439747869e-06, + "loss": 2.8551, + "mean_token_accuracy": 0.47454989997777286, + "step": 12175 + }, + { + "epoch": 2.257322951427512, + "grad_norm": 7.48046875, + "learning_rate": 7.74267704857249e-06, + "loss": 3.2161, + "mean_token_accuracy": 0.43216928469427496, + "step": 12176 + }, + { + "epoch": 2.2575083426028923, + "grad_norm": 7.57421875, + "learning_rate": 7.742491657397108e-06, + "loss": 2.2127, + "mean_token_accuracy": 0.5366089965397924, + "step": 12177 + }, + { + "epoch": 2.257693733778272, + "grad_norm": 6.8828125, + "learning_rate": 7.742306266221729e-06, + "loss": 3.2162, + "mean_token_accuracy": 0.4553827751196172, + "step": 12178 + }, + { + "epoch": 2.2578791249536523, + "grad_norm": 6.36328125, + "learning_rate": 7.74212087504635e-06, + "loss": 2.559, + "mean_token_accuracy": 0.4988824318283415, + "step": 12179 + }, + { + "epoch": 2.258064516129032, + "grad_norm": 7.27734375, + "learning_rate": 7.741935483870968e-06, + "loss": 2.1841, + "mean_token_accuracy": 0.587200846225043, + "step": 12180 + }, + { + "epoch": 2.258249907304412, + "grad_norm": 7.546875, + "learning_rate": 7.741750092695588e-06, + "loss": 2.4472, + "mean_token_accuracy": 0.5087586505190311, + "step": 12181 + }, + { + "epoch": 2.2584352984797924, + "grad_norm": 6.33203125, + "learning_rate": 7.741564701520207e-06, + "loss": 2.6842, + "mean_token_accuracy": 0.4960493559909081, + "step": 12182 + }, + { + "epoch": 2.2586206896551726, + "grad_norm": 8.21875, + "learning_rate": 7.74137931034483e-06, + "loss": 3.0518, + "mean_token_accuracy": 0.45153933865450396, + "step": 12183 + }, + { + "epoch": 2.2588060808305523, + "grad_norm": 6.0859375, + "learning_rate": 7.741193919169448e-06, + "loss": 2.6314, + "mean_token_accuracy": 0.49850707516551995, + "step": 12184 + }, + { + "epoch": 2.2589914720059325, + "grad_norm": 6.54296875, + "learning_rate": 7.741008527994069e-06, + "loss": 3.2296, + "mean_token_accuracy": 0.4461427243622844, + "step": 12185 + }, + { + "epoch": 2.2591768631813127, + "grad_norm": 7.80078125, + "learning_rate": 7.740823136818688e-06, + "loss": 2.5161, + "mean_token_accuracy": 0.5158302063789869, + "step": 12186 + }, + { + "epoch": 2.2593622543566925, + "grad_norm": 6.5546875, + "learning_rate": 7.740637745643308e-06, + "loss": 2.8974, + "mean_token_accuracy": 0.4920493575880931, + "step": 12187 + }, + { + "epoch": 2.2595476455320727, + "grad_norm": 8.1171875, + "learning_rate": 7.740452354467929e-06, + "loss": 2.799, + "mean_token_accuracy": 0.5207956600361664, + "step": 12188 + }, + { + "epoch": 2.259733036707453, + "grad_norm": 8.140625, + "learning_rate": 7.740266963292547e-06, + "loss": 3.625, + "mean_token_accuracy": 0.4387947269303202, + "step": 12189 + }, + { + "epoch": 2.2599184278828326, + "grad_norm": 6.8125, + "learning_rate": 7.740081572117168e-06, + "loss": 3.3853, + "mean_token_accuracy": 0.4233378561736771, + "step": 12190 + }, + { + "epoch": 2.260103819058213, + "grad_norm": 8.6015625, + "learning_rate": 7.739896180941788e-06, + "loss": 2.965, + "mean_token_accuracy": 0.48697132381825325, + "step": 12191 + }, + { + "epoch": 2.260289210233593, + "grad_norm": 6.79296875, + "learning_rate": 7.739710789766409e-06, + "loss": 2.9977, + "mean_token_accuracy": 0.46733481811432814, + "step": 12192 + }, + { + "epoch": 2.2604746014089727, + "grad_norm": 6.27734375, + "learning_rate": 7.739525398591028e-06, + "loss": 2.9364, + "mean_token_accuracy": 0.45676728334956185, + "step": 12193 + }, + { + "epoch": 2.260659992584353, + "grad_norm": 8.046875, + "learning_rate": 7.739340007415648e-06, + "loss": 2.4516, + "mean_token_accuracy": 0.5364423717521652, + "step": 12194 + }, + { + "epoch": 2.260845383759733, + "grad_norm": 7.47265625, + "learning_rate": 7.739154616240267e-06, + "loss": 2.6279, + "mean_token_accuracy": 0.4958997369642581, + "step": 12195 + }, + { + "epoch": 2.2610307749351133, + "grad_norm": 6.3359375, + "learning_rate": 7.738969225064887e-06, + "loss": 3.505, + "mean_token_accuracy": 0.4298963447899618, + "step": 12196 + }, + { + "epoch": 2.261216166110493, + "grad_norm": 7.1484375, + "learning_rate": 7.738783833889508e-06, + "loss": 2.7582, + "mean_token_accuracy": 0.5571134791549822, + "step": 12197 + }, + { + "epoch": 2.261401557285873, + "grad_norm": 6.9296875, + "learning_rate": 7.738598442714127e-06, + "loss": 2.4797, + "mean_token_accuracy": 0.5063159265529366, + "step": 12198 + }, + { + "epoch": 2.2615869484612534, + "grad_norm": 6.3671875, + "learning_rate": 7.738413051538747e-06, + "loss": 3.014, + "mean_token_accuracy": 0.4521922219469903, + "step": 12199 + }, + { + "epoch": 2.261772339636633, + "grad_norm": 6.265625, + "learning_rate": 7.738227660363368e-06, + "loss": 2.4482, + "mean_token_accuracy": 0.5417820286936823, + "step": 12200 + }, + { + "epoch": 2.2619577308120133, + "grad_norm": 6.5625, + "learning_rate": 7.738042269187988e-06, + "loss": 3.1296, + "mean_token_accuracy": 0.4521158129175947, + "step": 12201 + }, + { + "epoch": 2.2621431219873935, + "grad_norm": 7.515625, + "learning_rate": 7.737856878012607e-06, + "loss": 2.262, + "mean_token_accuracy": 0.5319122413363251, + "step": 12202 + }, + { + "epoch": 2.2623285131627733, + "grad_norm": 6.67578125, + "learning_rate": 7.737671486837227e-06, + "loss": 2.3516, + "mean_token_accuracy": 0.5136397889977392, + "step": 12203 + }, + { + "epoch": 2.2625139043381535, + "grad_norm": 6.53125, + "learning_rate": 7.737486095661846e-06, + "loss": 2.9853, + "mean_token_accuracy": 0.47758647432914536, + "step": 12204 + }, + { + "epoch": 2.2626992955135337, + "grad_norm": 6.67578125, + "learning_rate": 7.737300704486467e-06, + "loss": 3.0608, + "mean_token_accuracy": 0.4665823984831727, + "step": 12205 + }, + { + "epoch": 2.2628846866889134, + "grad_norm": 8.390625, + "learning_rate": 7.737115313311087e-06, + "loss": 2.5472, + "mean_token_accuracy": 0.5084219858156028, + "step": 12206 + }, + { + "epoch": 2.2630700778642936, + "grad_norm": 7.57421875, + "learning_rate": 7.736929922135708e-06, + "loss": 2.1493, + "mean_token_accuracy": 0.5377178476020863, + "step": 12207 + }, + { + "epoch": 2.263255469039674, + "grad_norm": 8.8046875, + "learning_rate": 7.736744530960327e-06, + "loss": 2.9281, + "mean_token_accuracy": 0.45609968125181105, + "step": 12208 + }, + { + "epoch": 2.263440860215054, + "grad_norm": 7.890625, + "learning_rate": 7.736559139784947e-06, + "loss": 2.4428, + "mean_token_accuracy": 0.5407435701553348, + "step": 12209 + }, + { + "epoch": 2.2636262513904337, + "grad_norm": 6.21875, + "learning_rate": 7.736373748609567e-06, + "loss": 2.4876, + "mean_token_accuracy": 0.5139507620164127, + "step": 12210 + }, + { + "epoch": 2.263811642565814, + "grad_norm": 6.84765625, + "learning_rate": 7.736188357434186e-06, + "loss": 2.7385, + "mean_token_accuracy": 0.48383852894699036, + "step": 12211 + }, + { + "epoch": 2.263997033741194, + "grad_norm": 6.61328125, + "learning_rate": 7.736002966258807e-06, + "loss": 2.9654, + "mean_token_accuracy": 0.4527909395585346, + "step": 12212 + }, + { + "epoch": 2.264182424916574, + "grad_norm": 6.5, + "learning_rate": 7.735817575083426e-06, + "loss": 2.6396, + "mean_token_accuracy": 0.4938284198714679, + "step": 12213 + }, + { + "epoch": 2.264367816091954, + "grad_norm": 6.37109375, + "learning_rate": 7.735632183908046e-06, + "loss": 2.6389, + "mean_token_accuracy": 0.5134359509522567, + "step": 12214 + }, + { + "epoch": 2.2645532072673342, + "grad_norm": 6.39453125, + "learning_rate": 7.735446792732667e-06, + "loss": 2.9048, + "mean_token_accuracy": 0.4741388174807198, + "step": 12215 + }, + { + "epoch": 2.264738598442714, + "grad_norm": 7.0703125, + "learning_rate": 7.735261401557287e-06, + "loss": 2.4109, + "mean_token_accuracy": 0.5444931648181583, + "step": 12216 + }, + { + "epoch": 2.264923989618094, + "grad_norm": 7.2265625, + "learning_rate": 7.735076010381908e-06, + "loss": 3.0247, + "mean_token_accuracy": 0.4494396664060464, + "step": 12217 + }, + { + "epoch": 2.2651093807934743, + "grad_norm": 6.734375, + "learning_rate": 7.734890619206526e-06, + "loss": 2.8215, + "mean_token_accuracy": 0.47888421380707497, + "step": 12218 + }, + { + "epoch": 2.265294771968854, + "grad_norm": 6.8515625, + "learning_rate": 7.734705228031147e-06, + "loss": 3.0242, + "mean_token_accuracy": 0.4396551724137931, + "step": 12219 + }, + { + "epoch": 2.2654801631442343, + "grad_norm": 7.3671875, + "learning_rate": 7.734519836855766e-06, + "loss": 2.7896, + "mean_token_accuracy": 0.48385964912280705, + "step": 12220 + }, + { + "epoch": 2.2656655543196145, + "grad_norm": 7.79296875, + "learning_rate": 7.734334445680386e-06, + "loss": 2.6054, + "mean_token_accuracy": 0.5064251432110234, + "step": 12221 + }, + { + "epoch": 2.2658509454949947, + "grad_norm": 6.3125, + "learning_rate": 7.734149054505005e-06, + "loss": 2.5042, + "mean_token_accuracy": 0.49104436483879854, + "step": 12222 + }, + { + "epoch": 2.2660363366703744, + "grad_norm": 8.484375, + "learning_rate": 7.733963663329627e-06, + "loss": 2.6719, + "mean_token_accuracy": 0.4715068493150685, + "step": 12223 + }, + { + "epoch": 2.2662217278457546, + "grad_norm": 8.4609375, + "learning_rate": 7.733778272154246e-06, + "loss": 2.9348, + "mean_token_accuracy": 0.4837801306600586, + "step": 12224 + }, + { + "epoch": 2.266407119021135, + "grad_norm": 8.15625, + "learning_rate": 7.733592880978866e-06, + "loss": 2.9207, + "mean_token_accuracy": 0.47441664323868427, + "step": 12225 + }, + { + "epoch": 2.2665925101965145, + "grad_norm": 11.5078125, + "learning_rate": 7.733407489803487e-06, + "loss": 3.1518, + "mean_token_accuracy": 0.43690165361183636, + "step": 12226 + }, + { + "epoch": 2.2667779013718947, + "grad_norm": 7.28125, + "learning_rate": 7.733222098628106e-06, + "loss": 2.6889, + "mean_token_accuracy": 0.4771084337349398, + "step": 12227 + }, + { + "epoch": 2.266963292547275, + "grad_norm": 7.98046875, + "learning_rate": 7.733036707452726e-06, + "loss": 2.456, + "mean_token_accuracy": 0.5015974440894568, + "step": 12228 + }, + { + "epoch": 2.2671486837226547, + "grad_norm": 7.9375, + "learning_rate": 7.732851316277345e-06, + "loss": 3.1607, + "mean_token_accuracy": 0.47321071571371454, + "step": 12229 + }, + { + "epoch": 2.267334074898035, + "grad_norm": 9.9296875, + "learning_rate": 7.732665925101965e-06, + "loss": 2.781, + "mean_token_accuracy": 0.4948721117014704, + "step": 12230 + }, + { + "epoch": 2.267519466073415, + "grad_norm": 7.42578125, + "learning_rate": 7.732480533926586e-06, + "loss": 2.3298, + "mean_token_accuracy": 0.5564197373454035, + "step": 12231 + }, + { + "epoch": 2.267704857248795, + "grad_norm": 9.046875, + "learning_rate": 7.732295142751206e-06, + "loss": 3.0407, + "mean_token_accuracy": 0.4636433710174717, + "step": 12232 + }, + { + "epoch": 2.267890248424175, + "grad_norm": 7.94921875, + "learning_rate": 7.732109751575825e-06, + "loss": 2.1698, + "mean_token_accuracy": 0.5451970126191089, + "step": 12233 + }, + { + "epoch": 2.268075639599555, + "grad_norm": 10.1875, + "learning_rate": 7.731924360400446e-06, + "loss": 2.6631, + "mean_token_accuracy": 0.47915849993466614, + "step": 12234 + }, + { + "epoch": 2.2682610307749353, + "grad_norm": 8.375, + "learning_rate": 7.731738969225066e-06, + "loss": 2.3374, + "mean_token_accuracy": 0.5316393240409523, + "step": 12235 + }, + { + "epoch": 2.268446421950315, + "grad_norm": 7.34375, + "learning_rate": 7.731553578049685e-06, + "loss": 2.4193, + "mean_token_accuracy": 0.5041509433962265, + "step": 12236 + }, + { + "epoch": 2.2686318131256953, + "grad_norm": 5.81640625, + "learning_rate": 7.731368186874306e-06, + "loss": 2.3568, + "mean_token_accuracy": 0.5251770766258854, + "step": 12237 + }, + { + "epoch": 2.268817204301075, + "grad_norm": 6.83203125, + "learning_rate": 7.731182795698924e-06, + "loss": 2.3761, + "mean_token_accuracy": 0.5247570569180935, + "step": 12238 + }, + { + "epoch": 2.269002595476455, + "grad_norm": 9.328125, + "learning_rate": 7.730997404523546e-06, + "loss": 2.932, + "mean_token_accuracy": 0.5094170403587444, + "step": 12239 + }, + { + "epoch": 2.2691879866518354, + "grad_norm": 11.3515625, + "learning_rate": 7.730812013348165e-06, + "loss": 3.0937, + "mean_token_accuracy": 0.45418745077448147, + "step": 12240 + }, + { + "epoch": 2.2693733778272156, + "grad_norm": 6.75, + "learning_rate": 7.730626622172786e-06, + "loss": 2.5064, + "mean_token_accuracy": 0.5114265095020447, + "step": 12241 + }, + { + "epoch": 2.2695587690025953, + "grad_norm": 9.546875, + "learning_rate": 7.730441230997405e-06, + "loss": 3.231, + "mean_token_accuracy": 0.4383599339570721, + "step": 12242 + }, + { + "epoch": 2.2697441601779755, + "grad_norm": 10.1875, + "learning_rate": 7.730255839822025e-06, + "loss": 2.0396, + "mean_token_accuracy": 0.556033920417482, + "step": 12243 + }, + { + "epoch": 2.2699295513533557, + "grad_norm": 6.046875, + "learning_rate": 7.730070448646646e-06, + "loss": 2.5544, + "mean_token_accuracy": 0.5052040696994503, + "step": 12244 + }, + { + "epoch": 2.2701149425287355, + "grad_norm": 7.7421875, + "learning_rate": 7.729885057471264e-06, + "loss": 2.7289, + "mean_token_accuracy": 0.5369396922488769, + "step": 12245 + }, + { + "epoch": 2.2703003337041157, + "grad_norm": 7.44921875, + "learning_rate": 7.729699666295885e-06, + "loss": 3.0965, + "mean_token_accuracy": 0.47558981546367673, + "step": 12246 + }, + { + "epoch": 2.270485724879496, + "grad_norm": 6.4296875, + "learning_rate": 7.729514275120505e-06, + "loss": 2.8389, + "mean_token_accuracy": 0.48094277524153306, + "step": 12247 + }, + { + "epoch": 2.270671116054876, + "grad_norm": 6.046875, + "learning_rate": 7.729328883945126e-06, + "loss": 2.4416, + "mean_token_accuracy": 0.5253339752076062, + "step": 12248 + }, + { + "epoch": 2.270856507230256, + "grad_norm": 6.83203125, + "learning_rate": 7.729143492769745e-06, + "loss": 2.7874, + "mean_token_accuracy": 0.47637238256932657, + "step": 12249 + }, + { + "epoch": 2.271041898405636, + "grad_norm": 7.62109375, + "learning_rate": 7.728958101594365e-06, + "loss": 3.348, + "mean_token_accuracy": 0.468896080546566, + "step": 12250 + }, + { + "epoch": 2.2712272895810157, + "grad_norm": 6.85546875, + "learning_rate": 7.728772710418984e-06, + "loss": 2.5723, + "mean_token_accuracy": 0.5007606490872211, + "step": 12251 + }, + { + "epoch": 2.271412680756396, + "grad_norm": 7.2109375, + "learning_rate": 7.728587319243604e-06, + "loss": 2.6499, + "mean_token_accuracy": 0.514693416219271, + "step": 12252 + }, + { + "epoch": 2.271598071931776, + "grad_norm": 7.75390625, + "learning_rate": 7.728401928068225e-06, + "loss": 2.5572, + "mean_token_accuracy": 0.5251751024183957, + "step": 12253 + }, + { + "epoch": 2.2717834631071563, + "grad_norm": 8.0546875, + "learning_rate": 7.728216536892844e-06, + "loss": 2.6036, + "mean_token_accuracy": 0.5134105232302506, + "step": 12254 + }, + { + "epoch": 2.271968854282536, + "grad_norm": 8.78125, + "learning_rate": 7.728031145717464e-06, + "loss": 2.6388, + "mean_token_accuracy": 0.5277387091733083, + "step": 12255 + }, + { + "epoch": 2.272154245457916, + "grad_norm": 6.81640625, + "learning_rate": 7.727845754542085e-06, + "loss": 2.3917, + "mean_token_accuracy": 0.5258649093904448, + "step": 12256 + }, + { + "epoch": 2.2723396366332964, + "grad_norm": 6.87890625, + "learning_rate": 7.727660363366705e-06, + "loss": 2.8243, + "mean_token_accuracy": 0.47924406396381847, + "step": 12257 + }, + { + "epoch": 2.272525027808676, + "grad_norm": 6.0234375, + "learning_rate": 7.727474972191324e-06, + "loss": 2.6253, + "mean_token_accuracy": 0.4980694980694981, + "step": 12258 + }, + { + "epoch": 2.2727104189840563, + "grad_norm": 6.30859375, + "learning_rate": 7.727289581015944e-06, + "loss": 2.8235, + "mean_token_accuracy": 0.4723259389413573, + "step": 12259 + }, + { + "epoch": 2.2728958101594365, + "grad_norm": 6.7421875, + "learning_rate": 7.727104189840563e-06, + "loss": 2.5097, + "mean_token_accuracy": 0.49756888168557534, + "step": 12260 + }, + { + "epoch": 2.2730812013348163, + "grad_norm": 25.984375, + "learning_rate": 7.726918798665184e-06, + "loss": 2.6903, + "mean_token_accuracy": 0.49481020166073547, + "step": 12261 + }, + { + "epoch": 2.2732665925101965, + "grad_norm": 7.4609375, + "learning_rate": 7.726733407489804e-06, + "loss": 2.8883, + "mean_token_accuracy": 0.4928325167846126, + "step": 12262 + }, + { + "epoch": 2.2734519836855767, + "grad_norm": 6.8515625, + "learning_rate": 7.726548016314423e-06, + "loss": 3.4679, + "mean_token_accuracy": 0.4385512584407612, + "step": 12263 + }, + { + "epoch": 2.2736373748609564, + "grad_norm": 6.8515625, + "learning_rate": 7.726362625139045e-06, + "loss": 2.7812, + "mean_token_accuracy": 0.4682325109834404, + "step": 12264 + }, + { + "epoch": 2.2738227660363366, + "grad_norm": 7.72265625, + "learning_rate": 7.726177233963664e-06, + "loss": 3.4717, + "mean_token_accuracy": 0.4509090909090909, + "step": 12265 + }, + { + "epoch": 2.274008157211717, + "grad_norm": 6.859375, + "learning_rate": 7.725991842788285e-06, + "loss": 3.0445, + "mean_token_accuracy": 0.4805668016194332, + "step": 12266 + }, + { + "epoch": 2.274193548387097, + "grad_norm": 6.99609375, + "learning_rate": 7.725806451612903e-06, + "loss": 2.9308, + "mean_token_accuracy": 0.4836852207293666, + "step": 12267 + }, + { + "epoch": 2.2743789395624767, + "grad_norm": 6.8046875, + "learning_rate": 7.725621060437524e-06, + "loss": 2.9904, + "mean_token_accuracy": 0.4756466241182398, + "step": 12268 + }, + { + "epoch": 2.274564330737857, + "grad_norm": 6.66796875, + "learning_rate": 7.725435669262144e-06, + "loss": 2.9606, + "mean_token_accuracy": 0.4692737430167598, + "step": 12269 + }, + { + "epoch": 2.274749721913237, + "grad_norm": 6.3984375, + "learning_rate": 7.725250278086763e-06, + "loss": 2.5514, + "mean_token_accuracy": 0.5100564317754305, + "step": 12270 + }, + { + "epoch": 2.274935113088617, + "grad_norm": 6.05078125, + "learning_rate": 7.725064886911384e-06, + "loss": 2.969, + "mean_token_accuracy": 0.46182231053079764, + "step": 12271 + }, + { + "epoch": 2.275120504263997, + "grad_norm": 6.59375, + "learning_rate": 7.724879495736004e-06, + "loss": 3.2892, + "mean_token_accuracy": 0.43890625, + "step": 12272 + }, + { + "epoch": 2.2753058954393772, + "grad_norm": 6.33984375, + "learning_rate": 7.724694104560625e-06, + "loss": 2.4695, + "mean_token_accuracy": 0.530638852672751, + "step": 12273 + }, + { + "epoch": 2.275491286614757, + "grad_norm": 6.640625, + "learning_rate": 7.724508713385243e-06, + "loss": 3.0003, + "mean_token_accuracy": 0.46051246051246053, + "step": 12274 + }, + { + "epoch": 2.275676677790137, + "grad_norm": 6.296875, + "learning_rate": 7.724323322209864e-06, + "loss": 2.2532, + "mean_token_accuracy": 0.5320970042796006, + "step": 12275 + }, + { + "epoch": 2.2758620689655173, + "grad_norm": 6.73828125, + "learning_rate": 7.724137931034483e-06, + "loss": 2.2557, + "mean_token_accuracy": 0.5378576854899975, + "step": 12276 + }, + { + "epoch": 2.276047460140897, + "grad_norm": 7.92578125, + "learning_rate": 7.723952539859103e-06, + "loss": 2.788, + "mean_token_accuracy": 0.48034527003382715, + "step": 12277 + }, + { + "epoch": 2.2762328513162773, + "grad_norm": 7.03515625, + "learning_rate": 7.723767148683724e-06, + "loss": 2.9047, + "mean_token_accuracy": 0.4802168815943728, + "step": 12278 + }, + { + "epoch": 2.2764182424916575, + "grad_norm": 8.0078125, + "learning_rate": 7.723581757508342e-06, + "loss": 3.4169, + "mean_token_accuracy": 0.4406285436578649, + "step": 12279 + }, + { + "epoch": 2.2766036336670377, + "grad_norm": 6.578125, + "learning_rate": 7.723396366332963e-06, + "loss": 2.9437, + "mean_token_accuracy": 0.46548428207306713, + "step": 12280 + }, + { + "epoch": 2.2767890248424174, + "grad_norm": 6.44921875, + "learning_rate": 7.723210975157583e-06, + "loss": 2.5239, + "mean_token_accuracy": 0.5050145157033518, + "step": 12281 + }, + { + "epoch": 2.2769744160177976, + "grad_norm": 6.4609375, + "learning_rate": 7.723025583982204e-06, + "loss": 2.4356, + "mean_token_accuracy": 0.5245378374783881, + "step": 12282 + }, + { + "epoch": 2.277159807193178, + "grad_norm": 7.140625, + "learning_rate": 7.722840192806823e-06, + "loss": 2.4411, + "mean_token_accuracy": 0.5283252929014473, + "step": 12283 + }, + { + "epoch": 2.2773451983685575, + "grad_norm": 7.69921875, + "learning_rate": 7.722654801631443e-06, + "loss": 2.706, + "mean_token_accuracy": 0.4868383809793377, + "step": 12284 + }, + { + "epoch": 2.2775305895439377, + "grad_norm": 6.76171875, + "learning_rate": 7.722469410456062e-06, + "loss": 2.8171, + "mean_token_accuracy": 0.48856891624532867, + "step": 12285 + }, + { + "epoch": 2.277715980719318, + "grad_norm": 6.80859375, + "learning_rate": 7.722284019280682e-06, + "loss": 3.0878, + "mean_token_accuracy": 0.45571536714610145, + "step": 12286 + }, + { + "epoch": 2.2779013718946977, + "grad_norm": 7.23828125, + "learning_rate": 7.722098628105303e-06, + "loss": 3.3486, + "mean_token_accuracy": 0.45006388415672915, + "step": 12287 + }, + { + "epoch": 2.278086763070078, + "grad_norm": 6.09375, + "learning_rate": 7.721913236929923e-06, + "loss": 2.5485, + "mean_token_accuracy": 0.5066442645486482, + "step": 12288 + }, + { + "epoch": 2.278272154245458, + "grad_norm": 7.33203125, + "learning_rate": 7.721727845754542e-06, + "loss": 2.842, + "mean_token_accuracy": 0.46078190312269485, + "step": 12289 + }, + { + "epoch": 2.278457545420838, + "grad_norm": 6.47265625, + "learning_rate": 7.721542454579163e-06, + "loss": 3.3868, + "mean_token_accuracy": 0.45248868778280543, + "step": 12290 + }, + { + "epoch": 2.278642936596218, + "grad_norm": 6.671875, + "learning_rate": 7.721357063403783e-06, + "loss": 2.901, + "mean_token_accuracy": 0.45994269340974214, + "step": 12291 + }, + { + "epoch": 2.278828327771598, + "grad_norm": 8.5703125, + "learning_rate": 7.721171672228402e-06, + "loss": 2.54, + "mean_token_accuracy": 0.5339021615472127, + "step": 12292 + }, + { + "epoch": 2.2790137189469784, + "grad_norm": 6.83984375, + "learning_rate": 7.720986281053023e-06, + "loss": 3.0206, + "mean_token_accuracy": 0.47869071476285907, + "step": 12293 + }, + { + "epoch": 2.279199110122358, + "grad_norm": 6.46875, + "learning_rate": 7.720800889877641e-06, + "loss": 2.9749, + "mean_token_accuracy": 0.4593609865470852, + "step": 12294 + }, + { + "epoch": 2.2793845012977383, + "grad_norm": 6.1796875, + "learning_rate": 7.720615498702262e-06, + "loss": 2.7716, + "mean_token_accuracy": 0.4835558001926517, + "step": 12295 + }, + { + "epoch": 2.279569892473118, + "grad_norm": 6.671875, + "learning_rate": 7.720430107526882e-06, + "loss": 3.4171, + "mean_token_accuracy": 0.42994241842610365, + "step": 12296 + }, + { + "epoch": 2.279755283648498, + "grad_norm": 7.41796875, + "learning_rate": 7.720244716351503e-06, + "loss": 2.4573, + "mean_token_accuracy": 0.5135351914978945, + "step": 12297 + }, + { + "epoch": 2.2799406748238784, + "grad_norm": 6.95703125, + "learning_rate": 7.720059325176123e-06, + "loss": 2.8525, + "mean_token_accuracy": 0.4672969966629588, + "step": 12298 + }, + { + "epoch": 2.2801260659992586, + "grad_norm": 6.59765625, + "learning_rate": 7.719873934000742e-06, + "loss": 2.9846, + "mean_token_accuracy": 0.4550363676438175, + "step": 12299 + }, + { + "epoch": 2.2803114571746383, + "grad_norm": 8.6484375, + "learning_rate": 7.719688542825363e-06, + "loss": 2.7551, + "mean_token_accuracy": 0.4882758620689655, + "step": 12300 + }, + { + "epoch": 2.2804968483500185, + "grad_norm": 6.74609375, + "learning_rate": 7.719503151649981e-06, + "loss": 2.7783, + "mean_token_accuracy": 0.47149098241290466, + "step": 12301 + }, + { + "epoch": 2.2806822395253987, + "grad_norm": 7.921875, + "learning_rate": 7.719317760474602e-06, + "loss": 2.4022, + "mean_token_accuracy": 0.5127513995438524, + "step": 12302 + }, + { + "epoch": 2.2808676307007785, + "grad_norm": 7.9921875, + "learning_rate": 7.71913236929922e-06, + "loss": 3.5181, + "mean_token_accuracy": 0.41800152555301295, + "step": 12303 + }, + { + "epoch": 2.2810530218761587, + "grad_norm": 8.125, + "learning_rate": 7.718946978123843e-06, + "loss": 2.9149, + "mean_token_accuracy": 0.49525101763907736, + "step": 12304 + }, + { + "epoch": 2.281238413051539, + "grad_norm": 7.75390625, + "learning_rate": 7.718761586948462e-06, + "loss": 3.2494, + "mean_token_accuracy": 0.4395968322534197, + "step": 12305 + }, + { + "epoch": 2.281423804226919, + "grad_norm": 7.015625, + "learning_rate": 7.718576195773082e-06, + "loss": 2.9502, + "mean_token_accuracy": 0.43871039964177766, + "step": 12306 + }, + { + "epoch": 2.281609195402299, + "grad_norm": 6.875, + "learning_rate": 7.718390804597703e-06, + "loss": 2.6265, + "mean_token_accuracy": 0.48924022837066317, + "step": 12307 + }, + { + "epoch": 2.281794586577679, + "grad_norm": 7.75, + "learning_rate": 7.718205413422321e-06, + "loss": 2.7879, + "mean_token_accuracy": 0.47962555066079293, + "step": 12308 + }, + { + "epoch": 2.2819799777530587, + "grad_norm": 7.98828125, + "learning_rate": 7.718020022246942e-06, + "loss": 2.6771, + "mean_token_accuracy": 0.5035128805620609, + "step": 12309 + }, + { + "epoch": 2.282165368928439, + "grad_norm": 7.5625, + "learning_rate": 7.71783463107156e-06, + "loss": 2.6573, + "mean_token_accuracy": 0.47474747474747475, + "step": 12310 + }, + { + "epoch": 2.282350760103819, + "grad_norm": 7.1328125, + "learning_rate": 7.717649239896181e-06, + "loss": 3.0201, + "mean_token_accuracy": 0.4600777511961722, + "step": 12311 + }, + { + "epoch": 2.2825361512791993, + "grad_norm": 6.78125, + "learning_rate": 7.717463848720802e-06, + "loss": 3.0153, + "mean_token_accuracy": 0.4663976624460832, + "step": 12312 + }, + { + "epoch": 2.282721542454579, + "grad_norm": 7.4453125, + "learning_rate": 7.717278457545422e-06, + "loss": 3.0663, + "mean_token_accuracy": 0.46137506987143656, + "step": 12313 + }, + { + "epoch": 2.2829069336299592, + "grad_norm": 9.234375, + "learning_rate": 7.717093066370041e-06, + "loss": 2.5378, + "mean_token_accuracy": 0.48985204855842185, + "step": 12314 + }, + { + "epoch": 2.2830923248053394, + "grad_norm": 9.2578125, + "learning_rate": 7.716907675194661e-06, + "loss": 3.11, + "mean_token_accuracy": 0.452896512935883, + "step": 12315 + }, + { + "epoch": 2.283277715980719, + "grad_norm": 9.2890625, + "learning_rate": 7.716722284019282e-06, + "loss": 3.2963, + "mean_token_accuracy": 0.46542280041081824, + "step": 12316 + }, + { + "epoch": 2.2834631071560993, + "grad_norm": 11.84375, + "learning_rate": 7.7165368928439e-06, + "loss": 2.8924, + "mean_token_accuracy": 0.46913073237508557, + "step": 12317 + }, + { + "epoch": 2.2836484983314795, + "grad_norm": 8.921875, + "learning_rate": 7.716351501668521e-06, + "loss": 2.5914, + "mean_token_accuracy": 0.5074664964901084, + "step": 12318 + }, + { + "epoch": 2.2838338895068593, + "grad_norm": 6.6328125, + "learning_rate": 7.71616611049314e-06, + "loss": 2.9147, + "mean_token_accuracy": 0.459067211497816, + "step": 12319 + }, + { + "epoch": 2.2840192806822395, + "grad_norm": 8.7421875, + "learning_rate": 7.715980719317762e-06, + "loss": 2.4882, + "mean_token_accuracy": 0.5079697986577181, + "step": 12320 + }, + { + "epoch": 2.2842046718576197, + "grad_norm": 12.046875, + "learning_rate": 7.715795328142381e-06, + "loss": 3.0884, + "mean_token_accuracy": 0.4631737529293606, + "step": 12321 + }, + { + "epoch": 2.2843900630329994, + "grad_norm": 12.21875, + "learning_rate": 7.715609936967002e-06, + "loss": 2.566, + "mean_token_accuracy": 0.4939542063287883, + "step": 12322 + }, + { + "epoch": 2.2845754542083796, + "grad_norm": 10.828125, + "learning_rate": 7.71542454579162e-06, + "loss": 2.4587, + "mean_token_accuracy": 0.5019508057675997, + "step": 12323 + }, + { + "epoch": 2.28476084538376, + "grad_norm": 12.6875, + "learning_rate": 7.71523915461624e-06, + "loss": 2.8035, + "mean_token_accuracy": 0.47122692725298587, + "step": 12324 + }, + { + "epoch": 2.28494623655914, + "grad_norm": 13.6484375, + "learning_rate": 7.715053763440861e-06, + "loss": 2.815, + "mean_token_accuracy": 0.481635301752109, + "step": 12325 + }, + { + "epoch": 2.2851316277345197, + "grad_norm": 10.7890625, + "learning_rate": 7.71486837226548e-06, + "loss": 2.8068, + "mean_token_accuracy": 0.45285664213109006, + "step": 12326 + }, + { + "epoch": 2.2853170189099, + "grad_norm": 9.75, + "learning_rate": 7.7146829810901e-06, + "loss": 3.2683, + "mean_token_accuracy": 0.42665635473060065, + "step": 12327 + }, + { + "epoch": 2.28550241008528, + "grad_norm": 9.4921875, + "learning_rate": 7.714497589914721e-06, + "loss": 2.9497, + "mean_token_accuracy": 0.4868401705631525, + "step": 12328 + }, + { + "epoch": 2.28568780126066, + "grad_norm": 6.76171875, + "learning_rate": 7.714312198739342e-06, + "loss": 2.6697, + "mean_token_accuracy": 0.5071115973741794, + "step": 12329 + }, + { + "epoch": 2.28587319243604, + "grad_norm": 8.3125, + "learning_rate": 7.71412680756396e-06, + "loss": 2.6884, + "mean_token_accuracy": 0.504228178300129, + "step": 12330 + }, + { + "epoch": 2.2860585836114202, + "grad_norm": 6.7890625, + "learning_rate": 7.713941416388581e-06, + "loss": 3.3915, + "mean_token_accuracy": 0.4232496986961762, + "step": 12331 + }, + { + "epoch": 2.2862439747868, + "grad_norm": 7.94140625, + "learning_rate": 7.7137560252132e-06, + "loss": 2.9875, + "mean_token_accuracy": 0.47459710461622506, + "step": 12332 + }, + { + "epoch": 2.28642936596218, + "grad_norm": 7.8125, + "learning_rate": 7.71357063403782e-06, + "loss": 2.7082, + "mean_token_accuracy": 0.4925227568270481, + "step": 12333 + }, + { + "epoch": 2.2866147571375603, + "grad_norm": 10.8828125, + "learning_rate": 7.71338524286244e-06, + "loss": 3.1996, + "mean_token_accuracy": 0.46999735659529474, + "step": 12334 + }, + { + "epoch": 2.28680014831294, + "grad_norm": 10.0390625, + "learning_rate": 7.71319985168706e-06, + "loss": 2.9266, + "mean_token_accuracy": 0.4394951744617669, + "step": 12335 + }, + { + "epoch": 2.2869855394883203, + "grad_norm": 8.015625, + "learning_rate": 7.713014460511682e-06, + "loss": 2.9532, + "mean_token_accuracy": 0.4674034695019586, + "step": 12336 + }, + { + "epoch": 2.2871709306637005, + "grad_norm": 7.73828125, + "learning_rate": 7.7128290693363e-06, + "loss": 2.8164, + "mean_token_accuracy": 0.4815611995982207, + "step": 12337 + }, + { + "epoch": 2.2873563218390807, + "grad_norm": 6.37890625, + "learning_rate": 7.712643678160921e-06, + "loss": 2.5643, + "mean_token_accuracy": 0.517648536616003, + "step": 12338 + }, + { + "epoch": 2.2875417130144604, + "grad_norm": 7.19921875, + "learning_rate": 7.71245828698554e-06, + "loss": 2.4472, + "mean_token_accuracy": 0.5068360556563823, + "step": 12339 + }, + { + "epoch": 2.2877271041898406, + "grad_norm": 8.421875, + "learning_rate": 7.71227289581016e-06, + "loss": 2.874, + "mean_token_accuracy": 0.5015486725663717, + "step": 12340 + }, + { + "epoch": 2.287912495365221, + "grad_norm": 8.671875, + "learning_rate": 7.712087504634779e-06, + "loss": 3.2409, + "mean_token_accuracy": 0.4880908757786735, + "step": 12341 + }, + { + "epoch": 2.2880978865406005, + "grad_norm": 6.09765625, + "learning_rate": 7.7119021134594e-06, + "loss": 2.4699, + "mean_token_accuracy": 0.5313754732132614, + "step": 12342 + }, + { + "epoch": 2.2882832777159807, + "grad_norm": 6.4765625, + "learning_rate": 7.71171672228402e-06, + "loss": 2.899, + "mean_token_accuracy": 0.47677703695332807, + "step": 12343 + }, + { + "epoch": 2.288468668891361, + "grad_norm": 7.734375, + "learning_rate": 7.71153133110864e-06, + "loss": 2.5331, + "mean_token_accuracy": 0.5130286202477574, + "step": 12344 + }, + { + "epoch": 2.2886540600667407, + "grad_norm": 7.30078125, + "learning_rate": 7.711345939933261e-06, + "loss": 2.3787, + "mean_token_accuracy": 0.5046035454170674, + "step": 12345 + }, + { + "epoch": 2.288839451242121, + "grad_norm": 6.9453125, + "learning_rate": 7.71116054875788e-06, + "loss": 2.679, + "mean_token_accuracy": 0.5092690278824415, + "step": 12346 + }, + { + "epoch": 2.289024842417501, + "grad_norm": 7.25, + "learning_rate": 7.7109751575825e-06, + "loss": 3.7638, + "mean_token_accuracy": 0.4088009614495701, + "step": 12347 + }, + { + "epoch": 2.289210233592881, + "grad_norm": 6.5703125, + "learning_rate": 7.710789766407119e-06, + "loss": 2.987, + "mean_token_accuracy": 0.45755060433959516, + "step": 12348 + }, + { + "epoch": 2.289395624768261, + "grad_norm": 6.9453125, + "learning_rate": 7.71060437523174e-06, + "loss": 2.9332, + "mean_token_accuracy": 0.4879344186583535, + "step": 12349 + }, + { + "epoch": 2.289581015943641, + "grad_norm": 7.45703125, + "learning_rate": 7.71041898405636e-06, + "loss": 2.8706, + "mean_token_accuracy": 0.47400346620450606, + "step": 12350 + }, + { + "epoch": 2.2897664071190214, + "grad_norm": 5.93359375, + "learning_rate": 7.710233592880979e-06, + "loss": 2.5068, + "mean_token_accuracy": 0.5312593478911157, + "step": 12351 + }, + { + "epoch": 2.289951798294401, + "grad_norm": 7.3125, + "learning_rate": 7.7100482017056e-06, + "loss": 2.8078, + "mean_token_accuracy": 0.4725554343874955, + "step": 12352 + }, + { + "epoch": 2.2901371894697813, + "grad_norm": 7.765625, + "learning_rate": 7.70986281053022e-06, + "loss": 3.297, + "mean_token_accuracy": 0.4491356891540462, + "step": 12353 + }, + { + "epoch": 2.2903225806451615, + "grad_norm": 8.203125, + "learning_rate": 7.70967741935484e-06, + "loss": 3.1961, + "mean_token_accuracy": 0.4396215257244234, + "step": 12354 + }, + { + "epoch": 2.290507971820541, + "grad_norm": 8.171875, + "learning_rate": 7.709492028179459e-06, + "loss": 2.4419, + "mean_token_accuracy": 0.5054302422723476, + "step": 12355 + }, + { + "epoch": 2.2906933629959214, + "grad_norm": 7.02734375, + "learning_rate": 7.70930663700408e-06, + "loss": 3.4039, + "mean_token_accuracy": 0.4561869357408391, + "step": 12356 + }, + { + "epoch": 2.2908787541713016, + "grad_norm": 8.0703125, + "learning_rate": 7.709121245828698e-06, + "loss": 3.0542, + "mean_token_accuracy": 0.46413071227980596, + "step": 12357 + }, + { + "epoch": 2.2910641453466813, + "grad_norm": 10.59375, + "learning_rate": 7.708935854653319e-06, + "loss": 2.664, + "mean_token_accuracy": 0.4823845076994867, + "step": 12358 + }, + { + "epoch": 2.2912495365220615, + "grad_norm": 9.578125, + "learning_rate": 7.70875046347794e-06, + "loss": 2.5334, + "mean_token_accuracy": 0.48986402966625464, + "step": 12359 + }, + { + "epoch": 2.2914349276974417, + "grad_norm": 6.8125, + "learning_rate": 7.70856507230256e-06, + "loss": 2.8376, + "mean_token_accuracy": 0.4892575793745524, + "step": 12360 + }, + { + "epoch": 2.2916203188728215, + "grad_norm": 10.5546875, + "learning_rate": 7.708379681127179e-06, + "loss": 2.9273, + "mean_token_accuracy": 0.4968144232072658, + "step": 12361 + }, + { + "epoch": 2.2918057100482017, + "grad_norm": 9.6640625, + "learning_rate": 7.7081942899518e-06, + "loss": 2.6689, + "mean_token_accuracy": 0.5133662081494939, + "step": 12362 + }, + { + "epoch": 2.291991101223582, + "grad_norm": 8.03125, + "learning_rate": 7.70800889877642e-06, + "loss": 3.2168, + "mean_token_accuracy": 0.4609375, + "step": 12363 + }, + { + "epoch": 2.292176492398962, + "grad_norm": 6.9765625, + "learning_rate": 7.707823507601038e-06, + "loss": 2.669, + "mean_token_accuracy": 0.4822877753445833, + "step": 12364 + }, + { + "epoch": 2.292361883574342, + "grad_norm": 6.25, + "learning_rate": 7.707638116425659e-06, + "loss": 2.4671, + "mean_token_accuracy": 0.5294596165020337, + "step": 12365 + }, + { + "epoch": 2.292547274749722, + "grad_norm": 11.7421875, + "learning_rate": 7.707452725250278e-06, + "loss": 3.6039, + "mean_token_accuracy": 0.4441521429354379, + "step": 12366 + }, + { + "epoch": 2.2927326659251017, + "grad_norm": 9.03125, + "learning_rate": 7.707267334074898e-06, + "loss": 2.6187, + "mean_token_accuracy": 0.5063834240267895, + "step": 12367 + }, + { + "epoch": 2.292918057100482, + "grad_norm": 7.2421875, + "learning_rate": 7.707081942899519e-06, + "loss": 2.5171, + "mean_token_accuracy": 0.5306551135617698, + "step": 12368 + }, + { + "epoch": 2.293103448275862, + "grad_norm": 7.140625, + "learning_rate": 7.70689655172414e-06, + "loss": 2.5132, + "mean_token_accuracy": 0.5106628982528263, + "step": 12369 + }, + { + "epoch": 2.2932888394512423, + "grad_norm": 10.515625, + "learning_rate": 7.706711160548758e-06, + "loss": 3.0694, + "mean_token_accuracy": 0.4666370106761566, + "step": 12370 + }, + { + "epoch": 2.293474230626622, + "grad_norm": 8.4375, + "learning_rate": 7.706525769373378e-06, + "loss": 2.7084, + "mean_token_accuracy": 0.4901458634895891, + "step": 12371 + }, + { + "epoch": 2.2936596218020022, + "grad_norm": 8.4765625, + "learning_rate": 7.706340378197999e-06, + "loss": 2.7464, + "mean_token_accuracy": 0.4696879819670186, + "step": 12372 + }, + { + "epoch": 2.2938450129773824, + "grad_norm": 6.68359375, + "learning_rate": 7.706154987022618e-06, + "loss": 3.0206, + "mean_token_accuracy": 0.4787629413326254, + "step": 12373 + }, + { + "epoch": 2.294030404152762, + "grad_norm": 9.8984375, + "learning_rate": 7.705969595847238e-06, + "loss": 2.2587, + "mean_token_accuracy": 0.540482284790516, + "step": 12374 + }, + { + "epoch": 2.2942157953281423, + "grad_norm": 7.16015625, + "learning_rate": 7.705784204671857e-06, + "loss": 2.7156, + "mean_token_accuracy": 0.498600265212907, + "step": 12375 + }, + { + "epoch": 2.2944011865035225, + "grad_norm": 8.375, + "learning_rate": 7.705598813496478e-06, + "loss": 2.4011, + "mean_token_accuracy": 0.5066717988196048, + "step": 12376 + }, + { + "epoch": 2.2945865776789027, + "grad_norm": 6.5546875, + "learning_rate": 7.705413422321098e-06, + "loss": 2.4112, + "mean_token_accuracy": 0.5198913941683391, + "step": 12377 + }, + { + "epoch": 2.2947719688542825, + "grad_norm": 6.96484375, + "learning_rate": 7.705228031145719e-06, + "loss": 3.0232, + "mean_token_accuracy": 0.4693517565779803, + "step": 12378 + }, + { + "epoch": 2.2949573600296627, + "grad_norm": 7.41015625, + "learning_rate": 7.705042639970337e-06, + "loss": 3.0818, + "mean_token_accuracy": 0.4465294807548779, + "step": 12379 + }, + { + "epoch": 2.2951427512050424, + "grad_norm": 6.7421875, + "learning_rate": 7.704857248794958e-06, + "loss": 2.7467, + "mean_token_accuracy": 0.47884393063583813, + "step": 12380 + }, + { + "epoch": 2.2953281423804226, + "grad_norm": 6.05078125, + "learning_rate": 7.704671857619578e-06, + "loss": 3.0658, + "mean_token_accuracy": 0.45109809663250366, + "step": 12381 + }, + { + "epoch": 2.295513533555803, + "grad_norm": 6.6171875, + "learning_rate": 7.704486466444197e-06, + "loss": 2.7133, + "mean_token_accuracy": 0.505515587529976, + "step": 12382 + }, + { + "epoch": 2.295698924731183, + "grad_norm": 7.1796875, + "learning_rate": 7.704301075268818e-06, + "loss": 3.0049, + "mean_token_accuracy": 0.4654887504766747, + "step": 12383 + }, + { + "epoch": 2.2958843159065627, + "grad_norm": 7.04296875, + "learning_rate": 7.704115684093436e-06, + "loss": 2.846, + "mean_token_accuracy": 0.5020045101478326, + "step": 12384 + }, + { + "epoch": 2.296069707081943, + "grad_norm": 6.13671875, + "learning_rate": 7.703930292918059e-06, + "loss": 2.8673, + "mean_token_accuracy": 0.4724012328383301, + "step": 12385 + }, + { + "epoch": 2.296255098257323, + "grad_norm": 7.25390625, + "learning_rate": 7.703744901742677e-06, + "loss": 3.2847, + "mean_token_accuracy": 0.45753517545346667, + "step": 12386 + }, + { + "epoch": 2.296440489432703, + "grad_norm": 7.37890625, + "learning_rate": 7.703559510567298e-06, + "loss": 2.7863, + "mean_token_accuracy": 0.4740223773125069, + "step": 12387 + }, + { + "epoch": 2.296625880608083, + "grad_norm": 6.3515625, + "learning_rate": 7.703374119391918e-06, + "loss": 2.4772, + "mean_token_accuracy": 0.544108008913357, + "step": 12388 + }, + { + "epoch": 2.2968112717834632, + "grad_norm": 6.17578125, + "learning_rate": 7.703188728216537e-06, + "loss": 2.9356, + "mean_token_accuracy": 0.4666732109551389, + "step": 12389 + }, + { + "epoch": 2.296996662958843, + "grad_norm": 7.25, + "learning_rate": 7.703003337041158e-06, + "loss": 3.061, + "mean_token_accuracy": 0.4674210839785587, + "step": 12390 + }, + { + "epoch": 2.297182054134223, + "grad_norm": 6.46484375, + "learning_rate": 7.702817945865776e-06, + "loss": 2.6022, + "mean_token_accuracy": 0.5003592814371257, + "step": 12391 + }, + { + "epoch": 2.2973674453096034, + "grad_norm": 6.2109375, + "learning_rate": 7.702632554690397e-06, + "loss": 2.6589, + "mean_token_accuracy": 0.4874165983846424, + "step": 12392 + }, + { + "epoch": 2.297552836484983, + "grad_norm": 7.09765625, + "learning_rate": 7.702447163515017e-06, + "loss": 3.0223, + "mean_token_accuracy": 0.44184630373888684, + "step": 12393 + }, + { + "epoch": 2.2977382276603633, + "grad_norm": 7.38671875, + "learning_rate": 7.702261772339638e-06, + "loss": 3.0053, + "mean_token_accuracy": 0.4431543299467828, + "step": 12394 + }, + { + "epoch": 2.2979236188357435, + "grad_norm": 6.5078125, + "learning_rate": 7.702076381164257e-06, + "loss": 2.6815, + "mean_token_accuracy": 0.4946334089191232, + "step": 12395 + }, + { + "epoch": 2.2981090100111237, + "grad_norm": 9.78125, + "learning_rate": 7.701890989988877e-06, + "loss": 3.3622, + "mean_token_accuracy": 0.44126250962278674, + "step": 12396 + }, + { + "epoch": 2.2982944011865034, + "grad_norm": 7.74609375, + "learning_rate": 7.701705598813498e-06, + "loss": 3.1534, + "mean_token_accuracy": 0.4761852741970723, + "step": 12397 + }, + { + "epoch": 2.2984797923618836, + "grad_norm": 7.9296875, + "learning_rate": 7.701520207638117e-06, + "loss": 2.992, + "mean_token_accuracy": 0.4489993098688751, + "step": 12398 + }, + { + "epoch": 2.298665183537264, + "grad_norm": 7.0234375, + "learning_rate": 7.701334816462737e-06, + "loss": 3.1628, + "mean_token_accuracy": 0.4569832402234637, + "step": 12399 + }, + { + "epoch": 2.2988505747126435, + "grad_norm": 7.27734375, + "learning_rate": 7.701149425287356e-06, + "loss": 3.0383, + "mean_token_accuracy": 0.4636502886767998, + "step": 12400 + }, + { + "epoch": 2.2990359658880237, + "grad_norm": 6.8203125, + "learning_rate": 7.700964034111978e-06, + "loss": 2.8046, + "mean_token_accuracy": 0.48128646648640955, + "step": 12401 + }, + { + "epoch": 2.299221357063404, + "grad_norm": 8.453125, + "learning_rate": 7.700778642936597e-06, + "loss": 2.5885, + "mean_token_accuracy": 0.4975621358068736, + "step": 12402 + }, + { + "epoch": 2.2994067482387837, + "grad_norm": 6.515625, + "learning_rate": 7.700593251761217e-06, + "loss": 2.5951, + "mean_token_accuracy": 0.5140799794265141, + "step": 12403 + }, + { + "epoch": 2.299592139414164, + "grad_norm": 6.2890625, + "learning_rate": 7.700407860585836e-06, + "loss": 2.6908, + "mean_token_accuracy": 0.4959918564702888, + "step": 12404 + }, + { + "epoch": 2.299777530589544, + "grad_norm": 9.8125, + "learning_rate": 7.700222469410457e-06, + "loss": 2.0845, + "mean_token_accuracy": 0.5581501309944185, + "step": 12405 + }, + { + "epoch": 2.299962921764924, + "grad_norm": 7.9609375, + "learning_rate": 7.700037078235077e-06, + "loss": 2.6887, + "mean_token_accuracy": 0.4795968777788756, + "step": 12406 + }, + { + "epoch": 2.300148312940304, + "grad_norm": 7.53125, + "learning_rate": 7.699851687059696e-06, + "loss": 2.2549, + "mean_token_accuracy": 0.5320754716981132, + "step": 12407 + }, + { + "epoch": 2.300333704115684, + "grad_norm": 7.72265625, + "learning_rate": 7.699666295884316e-06, + "loss": 2.4278, + "mean_token_accuracy": 0.5091470134450077, + "step": 12408 + }, + { + "epoch": 2.3005190952910644, + "grad_norm": 8.078125, + "learning_rate": 7.699480904708937e-06, + "loss": 2.8942, + "mean_token_accuracy": 0.45304645083450634, + "step": 12409 + }, + { + "epoch": 2.300704486466444, + "grad_norm": 9.421875, + "learning_rate": 7.699295513533557e-06, + "loss": 3.5095, + "mean_token_accuracy": 0.44678310195551574, + "step": 12410 + }, + { + "epoch": 2.3008898776418243, + "grad_norm": 8.953125, + "learning_rate": 7.699110122358176e-06, + "loss": 2.3591, + "mean_token_accuracy": 0.5284805844946556, + "step": 12411 + }, + { + "epoch": 2.3010752688172045, + "grad_norm": 7.46875, + "learning_rate": 7.698924731182797e-06, + "loss": 2.5706, + "mean_token_accuracy": 0.5013755158184319, + "step": 12412 + }, + { + "epoch": 2.3012606599925842, + "grad_norm": 7.171875, + "learning_rate": 7.698739340007415e-06, + "loss": 3.2727, + "mean_token_accuracy": 0.4400807202732071, + "step": 12413 + }, + { + "epoch": 2.3014460511679644, + "grad_norm": 8.0703125, + "learning_rate": 7.698553948832036e-06, + "loss": 3.4123, + "mean_token_accuracy": 0.45824777549623547, + "step": 12414 + }, + { + "epoch": 2.3016314423433446, + "grad_norm": 6.7890625, + "learning_rate": 7.698368557656656e-06, + "loss": 2.8796, + "mean_token_accuracy": 0.47637213881368445, + "step": 12415 + }, + { + "epoch": 2.3018168335187243, + "grad_norm": 6.87890625, + "learning_rate": 7.698183166481275e-06, + "loss": 2.5544, + "mean_token_accuracy": 0.5032796660703638, + "step": 12416 + }, + { + "epoch": 2.3020022246941045, + "grad_norm": 6.34375, + "learning_rate": 7.697997775305897e-06, + "loss": 2.4755, + "mean_token_accuracy": 0.5306316289377869, + "step": 12417 + }, + { + "epoch": 2.3021876158694847, + "grad_norm": 6.76171875, + "learning_rate": 7.697812384130516e-06, + "loss": 2.6128, + "mean_token_accuracy": 0.49884022376859055, + "step": 12418 + }, + { + "epoch": 2.3023730070448645, + "grad_norm": 6.6796875, + "learning_rate": 7.697626992955137e-06, + "loss": 2.2853, + "mean_token_accuracy": 0.525100516944285, + "step": 12419 + }, + { + "epoch": 2.3025583982202447, + "grad_norm": 6.21875, + "learning_rate": 7.697441601779755e-06, + "loss": 2.4777, + "mean_token_accuracy": 0.5133424351495594, + "step": 12420 + }, + { + "epoch": 2.302743789395625, + "grad_norm": 7.6875, + "learning_rate": 7.697256210604376e-06, + "loss": 2.7159, + "mean_token_accuracy": 0.46685748719493825, + "step": 12421 + }, + { + "epoch": 2.302929180571005, + "grad_norm": 9.46875, + "learning_rate": 7.697070819428995e-06, + "loss": 2.1954, + "mean_token_accuracy": 0.5409767780901802, + "step": 12422 + }, + { + "epoch": 2.303114571746385, + "grad_norm": 7.19921875, + "learning_rate": 7.696885428253615e-06, + "loss": 3.0459, + "mean_token_accuracy": 0.48126254740129376, + "step": 12423 + }, + { + "epoch": 2.303299962921765, + "grad_norm": 7.3984375, + "learning_rate": 7.696700037078236e-06, + "loss": 2.5737, + "mean_token_accuracy": 0.49570786210655404, + "step": 12424 + }, + { + "epoch": 2.303485354097145, + "grad_norm": 7.5625, + "learning_rate": 7.696514645902856e-06, + "loss": 2.8621, + "mean_token_accuracy": 0.507607593102449, + "step": 12425 + }, + { + "epoch": 2.303670745272525, + "grad_norm": 6.94140625, + "learning_rate": 7.696329254727477e-06, + "loss": 3.2183, + "mean_token_accuracy": 0.45683183183183185, + "step": 12426 + }, + { + "epoch": 2.303856136447905, + "grad_norm": 8.53125, + "learning_rate": 7.696143863552096e-06, + "loss": 2.1204, + "mean_token_accuracy": 0.5615902964959568, + "step": 12427 + }, + { + "epoch": 2.3040415276232853, + "grad_norm": 7.26171875, + "learning_rate": 7.695958472376716e-06, + "loss": 2.5169, + "mean_token_accuracy": 0.5221266133988937, + "step": 12428 + }, + { + "epoch": 2.304226918798665, + "grad_norm": 6.859375, + "learning_rate": 7.695773081201335e-06, + "loss": 2.4897, + "mean_token_accuracy": 0.5156316916488223, + "step": 12429 + }, + { + "epoch": 2.3044123099740452, + "grad_norm": 6.78125, + "learning_rate": 7.695587690025955e-06, + "loss": 2.4151, + "mean_token_accuracy": 0.5531581485053038, + "step": 12430 + }, + { + "epoch": 2.3045977011494254, + "grad_norm": 6.6171875, + "learning_rate": 7.695402298850576e-06, + "loss": 2.9494, + "mean_token_accuracy": 0.4794857768052516, + "step": 12431 + }, + { + "epoch": 2.304783092324805, + "grad_norm": 8.640625, + "learning_rate": 7.695216907675195e-06, + "loss": 2.5851, + "mean_token_accuracy": 0.48889201349831274, + "step": 12432 + }, + { + "epoch": 2.3049684835001854, + "grad_norm": 6.7265625, + "learning_rate": 7.695031516499815e-06, + "loss": 2.3769, + "mean_token_accuracy": 0.5613221657194137, + "step": 12433 + }, + { + "epoch": 2.3051538746755655, + "grad_norm": 6.6328125, + "learning_rate": 7.694846125324436e-06, + "loss": 3.0874, + "mean_token_accuracy": 0.46653279785809904, + "step": 12434 + }, + { + "epoch": 2.3053392658509457, + "grad_norm": 7.42578125, + "learning_rate": 7.694660734149056e-06, + "loss": 2.9401, + "mean_token_accuracy": 0.46952686447473935, + "step": 12435 + }, + { + "epoch": 2.3055246570263255, + "grad_norm": 6.87109375, + "learning_rate": 7.694475342973675e-06, + "loss": 2.7312, + "mean_token_accuracy": 0.5100373805897827, + "step": 12436 + }, + { + "epoch": 2.3057100482017057, + "grad_norm": 7.44140625, + "learning_rate": 7.694289951798295e-06, + "loss": 2.6576, + "mean_token_accuracy": 0.4916907018266722, + "step": 12437 + }, + { + "epoch": 2.3058954393770854, + "grad_norm": 6.2578125, + "learning_rate": 7.694104560622914e-06, + "loss": 2.9314, + "mean_token_accuracy": 0.4694353070175439, + "step": 12438 + }, + { + "epoch": 2.3060808305524656, + "grad_norm": 7.0390625, + "learning_rate": 7.693919169447535e-06, + "loss": 3.2699, + "mean_token_accuracy": 0.44308614923307577, + "step": 12439 + }, + { + "epoch": 2.306266221727846, + "grad_norm": 7.671875, + "learning_rate": 7.693733778272155e-06, + "loss": 3.2509, + "mean_token_accuracy": 0.4649919828968466, + "step": 12440 + }, + { + "epoch": 2.306451612903226, + "grad_norm": 7.21875, + "learning_rate": 7.693548387096776e-06, + "loss": 2.4587, + "mean_token_accuracy": 0.5065188253929572, + "step": 12441 + }, + { + "epoch": 2.3066370040786057, + "grad_norm": 6.3046875, + "learning_rate": 7.693362995921394e-06, + "loss": 2.7282, + "mean_token_accuracy": 0.4948906844106464, + "step": 12442 + }, + { + "epoch": 2.306822395253986, + "grad_norm": 6.7890625, + "learning_rate": 7.693177604746015e-06, + "loss": 3.6955, + "mean_token_accuracy": 0.4130308318789994, + "step": 12443 + }, + { + "epoch": 2.307007786429366, + "grad_norm": 6.88671875, + "learning_rate": 7.692992213570635e-06, + "loss": 2.7446, + "mean_token_accuracy": 0.4637839147286822, + "step": 12444 + }, + { + "epoch": 2.307193177604746, + "grad_norm": 7.09765625, + "learning_rate": 7.692806822395254e-06, + "loss": 2.7511, + "mean_token_accuracy": 0.47876220731850805, + "step": 12445 + }, + { + "epoch": 2.307378568780126, + "grad_norm": 8.15625, + "learning_rate": 7.692621431219875e-06, + "loss": 3.2737, + "mean_token_accuracy": 0.4591760299625468, + "step": 12446 + }, + { + "epoch": 2.3075639599555062, + "grad_norm": 8.265625, + "learning_rate": 7.692436040044493e-06, + "loss": 3.5588, + "mean_token_accuracy": 0.43906757155503096, + "step": 12447 + }, + { + "epoch": 2.3077493511308864, + "grad_norm": 8.734375, + "learning_rate": 7.692250648869114e-06, + "loss": 2.9043, + "mean_token_accuracy": 0.49074074074074076, + "step": 12448 + }, + { + "epoch": 2.307934742306266, + "grad_norm": 7.6328125, + "learning_rate": 7.692065257693734e-06, + "loss": 2.8362, + "mean_token_accuracy": 0.48956903650837225, + "step": 12449 + }, + { + "epoch": 2.3081201334816464, + "grad_norm": 7.24609375, + "learning_rate": 7.691879866518355e-06, + "loss": 2.743, + "mean_token_accuracy": 0.4937799043062201, + "step": 12450 + }, + { + "epoch": 2.308305524657026, + "grad_norm": 7.15625, + "learning_rate": 7.691694475342974e-06, + "loss": 2.9813, + "mean_token_accuracy": 0.4714104193138501, + "step": 12451 + }, + { + "epoch": 2.3084909158324063, + "grad_norm": 7.953125, + "learning_rate": 7.691509084167594e-06, + "loss": 3.2915, + "mean_token_accuracy": 0.4258417958311064, + "step": 12452 + }, + { + "epoch": 2.3086763070077865, + "grad_norm": 9.796875, + "learning_rate": 7.691323692992215e-06, + "loss": 3.5705, + "mean_token_accuracy": 0.4273372415921012, + "step": 12453 + }, + { + "epoch": 2.3088616981831667, + "grad_norm": 7.51953125, + "learning_rate": 7.691138301816834e-06, + "loss": 2.7226, + "mean_token_accuracy": 0.5039849297203304, + "step": 12454 + }, + { + "epoch": 2.3090470893585464, + "grad_norm": 7.55078125, + "learning_rate": 7.690952910641454e-06, + "loss": 3.657, + "mean_token_accuracy": 0.4143646408839779, + "step": 12455 + }, + { + "epoch": 2.3092324805339266, + "grad_norm": 8.625, + "learning_rate": 7.690767519466073e-06, + "loss": 2.4227, + "mean_token_accuracy": 0.5124690097710369, + "step": 12456 + }, + { + "epoch": 2.309417871709307, + "grad_norm": 8.640625, + "learning_rate": 7.690582128290695e-06, + "loss": 2.8058, + "mean_token_accuracy": 0.4929073130422807, + "step": 12457 + }, + { + "epoch": 2.3096032628846865, + "grad_norm": 7.55078125, + "learning_rate": 7.690396737115314e-06, + "loss": 2.8245, + "mean_token_accuracy": 0.4721133901820696, + "step": 12458 + }, + { + "epoch": 2.3097886540600667, + "grad_norm": 9.140625, + "learning_rate": 7.690211345939934e-06, + "loss": 3.1665, + "mean_token_accuracy": 0.46516779266806957, + "step": 12459 + }, + { + "epoch": 2.309974045235447, + "grad_norm": 11.5546875, + "learning_rate": 7.690025954764553e-06, + "loss": 2.6003, + "mean_token_accuracy": 0.4995551035973052, + "step": 12460 + }, + { + "epoch": 2.3101594364108267, + "grad_norm": 7.56640625, + "learning_rate": 7.689840563589174e-06, + "loss": 2.752, + "mean_token_accuracy": 0.4958196476560167, + "step": 12461 + }, + { + "epoch": 2.310344827586207, + "grad_norm": 7.69921875, + "learning_rate": 7.689655172413794e-06, + "loss": 3.3258, + "mean_token_accuracy": 0.4518234709805591, + "step": 12462 + }, + { + "epoch": 2.310530218761587, + "grad_norm": 8.6171875, + "learning_rate": 7.689469781238413e-06, + "loss": 4.0492, + "mean_token_accuracy": 0.4021437078205637, + "step": 12463 + }, + { + "epoch": 2.310715609936967, + "grad_norm": 8.484375, + "learning_rate": 7.689284390063033e-06, + "loss": 2.775, + "mean_token_accuracy": 0.4768550504352632, + "step": 12464 + }, + { + "epoch": 2.310901001112347, + "grad_norm": 8.8125, + "learning_rate": 7.689098998887654e-06, + "loss": 3.1076, + "mean_token_accuracy": 0.46593319194061505, + "step": 12465 + }, + { + "epoch": 2.311086392287727, + "grad_norm": 8.2734375, + "learning_rate": 7.688913607712274e-06, + "loss": 2.8516, + "mean_token_accuracy": 0.4827098598791619, + "step": 12466 + }, + { + "epoch": 2.3112717834631074, + "grad_norm": 8.65625, + "learning_rate": 7.688728216536893e-06, + "loss": 3.4446, + "mean_token_accuracy": 0.42568306010928963, + "step": 12467 + }, + { + "epoch": 2.311457174638487, + "grad_norm": 7.1875, + "learning_rate": 7.688542825361514e-06, + "loss": 2.9673, + "mean_token_accuracy": 0.4704364652762847, + "step": 12468 + }, + { + "epoch": 2.3116425658138673, + "grad_norm": 10.71875, + "learning_rate": 7.688357434186134e-06, + "loss": 1.8509, + "mean_token_accuracy": 0.5848529411764706, + "step": 12469 + }, + { + "epoch": 2.3118279569892475, + "grad_norm": 8.2734375, + "learning_rate": 7.688172043010753e-06, + "loss": 2.7619, + "mean_token_accuracy": 0.49602203182374544, + "step": 12470 + }, + { + "epoch": 2.3120133481646272, + "grad_norm": 13.6875, + "learning_rate": 7.687986651835373e-06, + "loss": 2.8185, + "mean_token_accuracy": 0.4682190056639396, + "step": 12471 + }, + { + "epoch": 2.3121987393400074, + "grad_norm": 9.21875, + "learning_rate": 7.687801260659992e-06, + "loss": 3.6923, + "mean_token_accuracy": 0.41774255400741406, + "step": 12472 + }, + { + "epoch": 2.3123841305153876, + "grad_norm": 10.671875, + "learning_rate": 7.687615869484614e-06, + "loss": 3.0597, + "mean_token_accuracy": 0.4607232968881413, + "step": 12473 + }, + { + "epoch": 2.3125695216907673, + "grad_norm": 9.890625, + "learning_rate": 7.687430478309233e-06, + "loss": 2.7601, + "mean_token_accuracy": 0.507159186620142, + "step": 12474 + }, + { + "epoch": 2.3127549128661475, + "grad_norm": 11.8984375, + "learning_rate": 7.687245087133854e-06, + "loss": 3.2629, + "mean_token_accuracy": 0.4358403797656134, + "step": 12475 + }, + { + "epoch": 2.3129403040415277, + "grad_norm": 7.26953125, + "learning_rate": 7.687059695958472e-06, + "loss": 2.6676, + "mean_token_accuracy": 0.4924833276817, + "step": 12476 + }, + { + "epoch": 2.3131256952169075, + "grad_norm": 12.3984375, + "learning_rate": 7.686874304783093e-06, + "loss": 2.9411, + "mean_token_accuracy": 0.4782487838776928, + "step": 12477 + }, + { + "epoch": 2.3133110863922877, + "grad_norm": 6.9453125, + "learning_rate": 7.686688913607713e-06, + "loss": 3.0105, + "mean_token_accuracy": 0.4765661252900232, + "step": 12478 + }, + { + "epoch": 2.313496477567668, + "grad_norm": 9.1171875, + "learning_rate": 7.686503522432332e-06, + "loss": 2.8064, + "mean_token_accuracy": 0.46832373446498937, + "step": 12479 + }, + { + "epoch": 2.313681868743048, + "grad_norm": 7.87890625, + "learning_rate": 7.686318131256953e-06, + "loss": 2.8239, + "mean_token_accuracy": 0.4911072862880092, + "step": 12480 + }, + { + "epoch": 2.313867259918428, + "grad_norm": 5.8125, + "learning_rate": 7.686132740081573e-06, + "loss": 2.6431, + "mean_token_accuracy": 0.48807495741056217, + "step": 12481 + }, + { + "epoch": 2.314052651093808, + "grad_norm": 7.93359375, + "learning_rate": 7.685947348906194e-06, + "loss": 2.9797, + "mean_token_accuracy": 0.4630954192664739, + "step": 12482 + }, + { + "epoch": 2.314238042269188, + "grad_norm": 7.44140625, + "learning_rate": 7.685761957730813e-06, + "loss": 2.7672, + "mean_token_accuracy": 0.47533126585847196, + "step": 12483 + }, + { + "epoch": 2.314423433444568, + "grad_norm": 6.7578125, + "learning_rate": 7.685576566555433e-06, + "loss": 3.0652, + "mean_token_accuracy": 0.4541126686061888, + "step": 12484 + }, + { + "epoch": 2.314608824619948, + "grad_norm": 6.8046875, + "learning_rate": 7.685391175380052e-06, + "loss": 3.1711, + "mean_token_accuracy": 0.46094637223974766, + "step": 12485 + }, + { + "epoch": 2.3147942157953283, + "grad_norm": 7.453125, + "learning_rate": 7.685205784204672e-06, + "loss": 3.4849, + "mean_token_accuracy": 0.4294614147909968, + "step": 12486 + }, + { + "epoch": 2.314979606970708, + "grad_norm": 6.9765625, + "learning_rate": 7.685020393029293e-06, + "loss": 2.1792, + "mean_token_accuracy": 0.5681881051175657, + "step": 12487 + }, + { + "epoch": 2.3151649981460882, + "grad_norm": 6.16015625, + "learning_rate": 7.684835001853912e-06, + "loss": 2.5951, + "mean_token_accuracy": 0.5031096563011457, + "step": 12488 + }, + { + "epoch": 2.3153503893214684, + "grad_norm": 8.171875, + "learning_rate": 7.684649610678532e-06, + "loss": 2.7706, + "mean_token_accuracy": 0.4960662525879917, + "step": 12489 + }, + { + "epoch": 2.315535780496848, + "grad_norm": 7.1171875, + "learning_rate": 7.684464219503153e-06, + "loss": 3.5488, + "mean_token_accuracy": 0.43252944475602917, + "step": 12490 + }, + { + "epoch": 2.3157211716722284, + "grad_norm": 7.14453125, + "learning_rate": 7.684278828327773e-06, + "loss": 2.762, + "mean_token_accuracy": 0.47824870611274306, + "step": 12491 + }, + { + "epoch": 2.3159065628476085, + "grad_norm": 6.57421875, + "learning_rate": 7.684093437152392e-06, + "loss": 2.6737, + "mean_token_accuracy": 0.48744892002335083, + "step": 12492 + }, + { + "epoch": 2.3160919540229887, + "grad_norm": 7.34375, + "learning_rate": 7.683908045977012e-06, + "loss": 2.661, + "mean_token_accuracy": 0.4969766772243018, + "step": 12493 + }, + { + "epoch": 2.3162773451983685, + "grad_norm": 8.0859375, + "learning_rate": 7.683722654801631e-06, + "loss": 2.9318, + "mean_token_accuracy": 0.4909963985594238, + "step": 12494 + }, + { + "epoch": 2.3164627363737487, + "grad_norm": 7.125, + "learning_rate": 7.683537263626252e-06, + "loss": 3.1949, + "mean_token_accuracy": 0.46098149637972646, + "step": 12495 + }, + { + "epoch": 2.316648127549129, + "grad_norm": 6.81640625, + "learning_rate": 7.683351872450872e-06, + "loss": 2.8188, + "mean_token_accuracy": 0.49283739633073637, + "step": 12496 + }, + { + "epoch": 2.3168335187245086, + "grad_norm": 7.1875, + "learning_rate": 7.683166481275491e-06, + "loss": 3.1061, + "mean_token_accuracy": 0.44430217669654287, + "step": 12497 + }, + { + "epoch": 2.317018909899889, + "grad_norm": 6.57421875, + "learning_rate": 7.682981090100111e-06, + "loss": 2.732, + "mean_token_accuracy": 0.5001197318007663, + "step": 12498 + }, + { + "epoch": 2.317204301075269, + "grad_norm": 6.99609375, + "learning_rate": 7.682795698924732e-06, + "loss": 2.7438, + "mean_token_accuracy": 0.4805482486501454, + "step": 12499 + }, + { + "epoch": 2.3173896922506487, + "grad_norm": 6.21484375, + "learning_rate": 7.682610307749352e-06, + "loss": 2.55, + "mean_token_accuracy": 0.4961343641695548, + "step": 12500 + }, + { + "epoch": 2.317575083426029, + "grad_norm": 8.5703125, + "learning_rate": 7.682424916573971e-06, + "loss": 2.5058, + "mean_token_accuracy": 0.5336538461538461, + "step": 12501 + }, + { + "epoch": 2.317760474601409, + "grad_norm": 6.68359375, + "learning_rate": 7.682239525398592e-06, + "loss": 3.3744, + "mean_token_accuracy": 0.4255942689677629, + "step": 12502 + }, + { + "epoch": 2.317945865776789, + "grad_norm": 7.1640625, + "learning_rate": 7.68205413422321e-06, + "loss": 3.7901, + "mean_token_accuracy": 0.42271019936434556, + "step": 12503 + }, + { + "epoch": 2.318131256952169, + "grad_norm": 8.4296875, + "learning_rate": 7.681868743047831e-06, + "loss": 3.4621, + "mean_token_accuracy": 0.4473953013278856, + "step": 12504 + }, + { + "epoch": 2.3183166481275492, + "grad_norm": 7.65625, + "learning_rate": 7.681683351872451e-06, + "loss": 3.3619, + "mean_token_accuracy": 0.42087095061072755, + "step": 12505 + }, + { + "epoch": 2.3185020393029294, + "grad_norm": 7.34375, + "learning_rate": 7.681497960697072e-06, + "loss": 2.6962, + "mean_token_accuracy": 0.5012060647829083, + "step": 12506 + }, + { + "epoch": 2.318687430478309, + "grad_norm": 7.53515625, + "learning_rate": 7.681312569521692e-06, + "loss": 2.1573, + "mean_token_accuracy": 0.5718550685303297, + "step": 12507 + }, + { + "epoch": 2.3188728216536894, + "grad_norm": 6.44140625, + "learning_rate": 7.681127178346311e-06, + "loss": 2.462, + "mean_token_accuracy": 0.497235219055721, + "step": 12508 + }, + { + "epoch": 2.319058212829069, + "grad_norm": 20.390625, + "learning_rate": 7.680941787170932e-06, + "loss": 3.3533, + "mean_token_accuracy": 0.42165206508135167, + "step": 12509 + }, + { + "epoch": 2.3192436040044493, + "grad_norm": 7.21484375, + "learning_rate": 7.68075639599555e-06, + "loss": 2.9096, + "mean_token_accuracy": 0.502943396226415, + "step": 12510 + }, + { + "epoch": 2.3194289951798295, + "grad_norm": 7.3515625, + "learning_rate": 7.680571004820171e-06, + "loss": 2.9026, + "mean_token_accuracy": 0.4769267083677564, + "step": 12511 + }, + { + "epoch": 2.3196143863552097, + "grad_norm": 7.0703125, + "learning_rate": 7.680385613644792e-06, + "loss": 2.6556, + "mean_token_accuracy": 0.4879154078549849, + "step": 12512 + }, + { + "epoch": 2.3197997775305894, + "grad_norm": 6.6015625, + "learning_rate": 7.68020022246941e-06, + "loss": 3.2014, + "mean_token_accuracy": 0.4446961620469083, + "step": 12513 + }, + { + "epoch": 2.3199851687059696, + "grad_norm": 6.0078125, + "learning_rate": 7.68001483129403e-06, + "loss": 2.6664, + "mean_token_accuracy": 0.48041336217255465, + "step": 12514 + }, + { + "epoch": 2.32017055988135, + "grad_norm": 6.46875, + "learning_rate": 7.679829440118651e-06, + "loss": 2.7995, + "mean_token_accuracy": 0.48399706816516, + "step": 12515 + }, + { + "epoch": 2.3203559510567295, + "grad_norm": 6.359375, + "learning_rate": 7.679644048943272e-06, + "loss": 2.9888, + "mean_token_accuracy": 0.4654921020656136, + "step": 12516 + }, + { + "epoch": 2.3205413422321097, + "grad_norm": 7.33984375, + "learning_rate": 7.67945865776789e-06, + "loss": 2.6512, + "mean_token_accuracy": 0.5023070803500398, + "step": 12517 + }, + { + "epoch": 2.32072673340749, + "grad_norm": 6.55859375, + "learning_rate": 7.679273266592511e-06, + "loss": 2.5576, + "mean_token_accuracy": 0.4957111234089651, + "step": 12518 + }, + { + "epoch": 2.3209121245828697, + "grad_norm": 6.98828125, + "learning_rate": 7.67908787541713e-06, + "loss": 3.1854, + "mean_token_accuracy": 0.4471495539089672, + "step": 12519 + }, + { + "epoch": 2.32109751575825, + "grad_norm": 5.79296875, + "learning_rate": 7.67890248424175e-06, + "loss": 2.8661, + "mean_token_accuracy": 0.4761671363803455, + "step": 12520 + }, + { + "epoch": 2.32128290693363, + "grad_norm": 6.7578125, + "learning_rate": 7.678717093066371e-06, + "loss": 2.7791, + "mean_token_accuracy": 0.5106717984604618, + "step": 12521 + }, + { + "epoch": 2.32146829810901, + "grad_norm": 7.32421875, + "learning_rate": 7.678531701890991e-06, + "loss": 3.1174, + "mean_token_accuracy": 0.48091497864131183, + "step": 12522 + }, + { + "epoch": 2.32165368928439, + "grad_norm": 6.5859375, + "learning_rate": 7.67834631071561e-06, + "loss": 2.8908, + "mean_token_accuracy": 0.4673913043478261, + "step": 12523 + }, + { + "epoch": 2.32183908045977, + "grad_norm": 9.1953125, + "learning_rate": 7.67816091954023e-06, + "loss": 2.5657, + "mean_token_accuracy": 0.5145728643216081, + "step": 12524 + }, + { + "epoch": 2.3220244716351504, + "grad_norm": 7.9140625, + "learning_rate": 7.677975528364851e-06, + "loss": 2.9351, + "mean_token_accuracy": 0.47720778337188735, + "step": 12525 + }, + { + "epoch": 2.32220986281053, + "grad_norm": 7.234375, + "learning_rate": 7.67779013718947e-06, + "loss": 2.899, + "mean_token_accuracy": 0.48561593312023604, + "step": 12526 + }, + { + "epoch": 2.3223952539859103, + "grad_norm": 7.33984375, + "learning_rate": 7.67760474601409e-06, + "loss": 3.185, + "mean_token_accuracy": 0.45740905057675246, + "step": 12527 + }, + { + "epoch": 2.3225806451612905, + "grad_norm": 10.9296875, + "learning_rate": 7.67741935483871e-06, + "loss": 2.6792, + "mean_token_accuracy": 0.48737972388788176, + "step": 12528 + }, + { + "epoch": 2.3227660363366702, + "grad_norm": 9.171875, + "learning_rate": 7.67723396366333e-06, + "loss": 3.3524, + "mean_token_accuracy": 0.43610665889552674, + "step": 12529 + }, + { + "epoch": 2.3229514275120504, + "grad_norm": 6.8359375, + "learning_rate": 7.67704857248795e-06, + "loss": 2.6895, + "mean_token_accuracy": 0.5094869992972593, + "step": 12530 + }, + { + "epoch": 2.3231368186874306, + "grad_norm": 7.29296875, + "learning_rate": 7.67686318131257e-06, + "loss": 3.3391, + "mean_token_accuracy": 0.44766523798781865, + "step": 12531 + }, + { + "epoch": 2.3233222098628104, + "grad_norm": 7.36328125, + "learning_rate": 7.67667779013719e-06, + "loss": 2.5552, + "mean_token_accuracy": 0.5299270072992701, + "step": 12532 + }, + { + "epoch": 2.3235076010381905, + "grad_norm": 7.29296875, + "learning_rate": 7.67649239896181e-06, + "loss": 3.0744, + "mean_token_accuracy": 0.4640101347460555, + "step": 12533 + }, + { + "epoch": 2.3236929922135707, + "grad_norm": 6.37890625, + "learning_rate": 7.67630700778643e-06, + "loss": 2.6446, + "mean_token_accuracy": 0.4767110977984125, + "step": 12534 + }, + { + "epoch": 2.3238783833889505, + "grad_norm": 6.57421875, + "learning_rate": 7.67612161661105e-06, + "loss": 2.8585, + "mean_token_accuracy": 0.5069143735158542, + "step": 12535 + }, + { + "epoch": 2.3240637745643307, + "grad_norm": 7.890625, + "learning_rate": 7.67593622543567e-06, + "loss": 3.0067, + "mean_token_accuracy": 0.45326887661141807, + "step": 12536 + }, + { + "epoch": 2.324249165739711, + "grad_norm": 7.4921875, + "learning_rate": 7.675750834260289e-06, + "loss": 3.4811, + "mean_token_accuracy": 0.4217931447388116, + "step": 12537 + }, + { + "epoch": 2.324434556915091, + "grad_norm": 8.3125, + "learning_rate": 7.67556544308491e-06, + "loss": 3.4629, + "mean_token_accuracy": 0.4188308100342579, + "step": 12538 + }, + { + "epoch": 2.324619948090471, + "grad_norm": 6.125, + "learning_rate": 7.67538005190953e-06, + "loss": 2.9007, + "mean_token_accuracy": 0.4603814128391083, + "step": 12539 + }, + { + "epoch": 2.324805339265851, + "grad_norm": 7.77734375, + "learning_rate": 7.67519466073415e-06, + "loss": 3.219, + "mean_token_accuracy": 0.46049469964664314, + "step": 12540 + }, + { + "epoch": 2.324990730441231, + "grad_norm": 7.21484375, + "learning_rate": 7.675009269558769e-06, + "loss": 1.9265, + "mean_token_accuracy": 0.6070070300795206, + "step": 12541 + }, + { + "epoch": 2.325176121616611, + "grad_norm": 8.6171875, + "learning_rate": 7.67482387838339e-06, + "loss": 2.6098, + "mean_token_accuracy": 0.4927643236857649, + "step": 12542 + }, + { + "epoch": 2.325361512791991, + "grad_norm": 6.44921875, + "learning_rate": 7.67463848720801e-06, + "loss": 2.6683, + "mean_token_accuracy": 0.4868654311039484, + "step": 12543 + }, + { + "epoch": 2.3255469039673713, + "grad_norm": 9.265625, + "learning_rate": 7.674453096032629e-06, + "loss": 2.7381, + "mean_token_accuracy": 0.4894758998105662, + "step": 12544 + }, + { + "epoch": 2.325732295142751, + "grad_norm": 6.4375, + "learning_rate": 7.674267704857249e-06, + "loss": 3.148, + "mean_token_accuracy": 0.4580379602103819, + "step": 12545 + }, + { + "epoch": 2.3259176863181312, + "grad_norm": 9.125, + "learning_rate": 7.67408231368187e-06, + "loss": 2.1669, + "mean_token_accuracy": 0.5620783405497936, + "step": 12546 + }, + { + "epoch": 2.3261030774935114, + "grad_norm": 6.91015625, + "learning_rate": 7.67389692250649e-06, + "loss": 2.8972, + "mean_token_accuracy": 0.4968562446413261, + "step": 12547 + }, + { + "epoch": 2.326288468668891, + "grad_norm": 7.37890625, + "learning_rate": 7.673711531331109e-06, + "loss": 3.0508, + "mean_token_accuracy": 0.4783404514948139, + "step": 12548 + }, + { + "epoch": 2.3264738598442714, + "grad_norm": 8.71875, + "learning_rate": 7.67352614015573e-06, + "loss": 2.3728, + "mean_token_accuracy": 0.528854961832061, + "step": 12549 + }, + { + "epoch": 2.3266592510196515, + "grad_norm": 6.48046875, + "learning_rate": 7.67334074898035e-06, + "loss": 2.5083, + "mean_token_accuracy": 0.5103196131619294, + "step": 12550 + }, + { + "epoch": 2.3268446421950317, + "grad_norm": 7.77734375, + "learning_rate": 7.673155357804969e-06, + "loss": 2.7138, + "mean_token_accuracy": 0.5061946902654867, + "step": 12551 + }, + { + "epoch": 2.3270300333704115, + "grad_norm": 8.828125, + "learning_rate": 7.67296996662959e-06, + "loss": 3.0901, + "mean_token_accuracy": 0.48267008985879334, + "step": 12552 + }, + { + "epoch": 2.3272154245457917, + "grad_norm": 6.4609375, + "learning_rate": 7.672784575454208e-06, + "loss": 3.0396, + "mean_token_accuracy": 0.47396828966602944, + "step": 12553 + }, + { + "epoch": 2.327400815721172, + "grad_norm": 6.6171875, + "learning_rate": 7.67259918427883e-06, + "loss": 2.7073, + "mean_token_accuracy": 0.5045335242185636, + "step": 12554 + }, + { + "epoch": 2.3275862068965516, + "grad_norm": 7.41796875, + "learning_rate": 7.672413793103449e-06, + "loss": 2.7639, + "mean_token_accuracy": 0.47121389539422326, + "step": 12555 + }, + { + "epoch": 2.327771598071932, + "grad_norm": 7.109375, + "learning_rate": 7.67222840192807e-06, + "loss": 3.0459, + "mean_token_accuracy": 0.4727802981205444, + "step": 12556 + }, + { + "epoch": 2.327956989247312, + "grad_norm": 8.1015625, + "learning_rate": 7.672043010752688e-06, + "loss": 3.6025, + "mean_token_accuracy": 0.4172959553118858, + "step": 12557 + }, + { + "epoch": 2.3281423804226917, + "grad_norm": 10.2734375, + "learning_rate": 7.671857619577309e-06, + "loss": 3.4901, + "mean_token_accuracy": 0.4558363731109953, + "step": 12558 + }, + { + "epoch": 2.328327771598072, + "grad_norm": 7.3984375, + "learning_rate": 7.67167222840193e-06, + "loss": 2.97, + "mean_token_accuracy": 0.4536532170119956, + "step": 12559 + }, + { + "epoch": 2.328513162773452, + "grad_norm": 7.51953125, + "learning_rate": 7.671486837226548e-06, + "loss": 3.004, + "mean_token_accuracy": 0.48123154787009703, + "step": 12560 + }, + { + "epoch": 2.328698553948832, + "grad_norm": 10.1171875, + "learning_rate": 7.671301446051169e-06, + "loss": 2.6856, + "mean_token_accuracy": 0.49578303290888043, + "step": 12561 + }, + { + "epoch": 2.328883945124212, + "grad_norm": 7.5078125, + "learning_rate": 7.671116054875789e-06, + "loss": 2.197, + "mean_token_accuracy": 0.5448673904081353, + "step": 12562 + }, + { + "epoch": 2.3290693362995922, + "grad_norm": 7.6640625, + "learning_rate": 7.67093066370041e-06, + "loss": 3.6846, + "mean_token_accuracy": 0.4306611490713117, + "step": 12563 + }, + { + "epoch": 2.3292547274749724, + "grad_norm": 8.546875, + "learning_rate": 7.670745272525028e-06, + "loss": 2.9073, + "mean_token_accuracy": 0.47362066883979237, + "step": 12564 + }, + { + "epoch": 2.329440118650352, + "grad_norm": 9.765625, + "learning_rate": 7.670559881349649e-06, + "loss": 2.6718, + "mean_token_accuracy": 0.4786224821312541, + "step": 12565 + }, + { + "epoch": 2.3296255098257324, + "grad_norm": 6.89453125, + "learning_rate": 7.670374490174268e-06, + "loss": 2.7653, + "mean_token_accuracy": 0.5083461583994318, + "step": 12566 + }, + { + "epoch": 2.329810901001112, + "grad_norm": 8.0, + "learning_rate": 7.670189098998888e-06, + "loss": 2.4453, + "mean_token_accuracy": 0.5380765589070743, + "step": 12567 + }, + { + "epoch": 2.3299962921764923, + "grad_norm": 7.109375, + "learning_rate": 7.670003707823509e-06, + "loss": 2.5508, + "mean_token_accuracy": 0.5171128898702733, + "step": 12568 + }, + { + "epoch": 2.3301816833518725, + "grad_norm": 9.3125, + "learning_rate": 7.669818316648127e-06, + "loss": 2.7306, + "mean_token_accuracy": 0.48519458544839256, + "step": 12569 + }, + { + "epoch": 2.3303670745272527, + "grad_norm": 7.6171875, + "learning_rate": 7.669632925472748e-06, + "loss": 2.0762, + "mean_token_accuracy": 0.5322864491674564, + "step": 12570 + }, + { + "epoch": 2.3305524657026324, + "grad_norm": 8.4375, + "learning_rate": 7.669447534297368e-06, + "loss": 2.9514, + "mean_token_accuracy": 0.4639956686518679, + "step": 12571 + }, + { + "epoch": 2.3307378568780126, + "grad_norm": 10.046875, + "learning_rate": 7.669262143121989e-06, + "loss": 2.9068, + "mean_token_accuracy": 0.4735836330935252, + "step": 12572 + }, + { + "epoch": 2.330923248053393, + "grad_norm": 8.53125, + "learning_rate": 7.669076751946608e-06, + "loss": 2.9762, + "mean_token_accuracy": 0.4944358360332441, + "step": 12573 + }, + { + "epoch": 2.3311086392287725, + "grad_norm": 7.12109375, + "learning_rate": 7.668891360771228e-06, + "loss": 2.6447, + "mean_token_accuracy": 0.4820792520035619, + "step": 12574 + }, + { + "epoch": 2.3312940304041527, + "grad_norm": 9.4609375, + "learning_rate": 7.668705969595847e-06, + "loss": 2.5006, + "mean_token_accuracy": 0.49803149606299213, + "step": 12575 + }, + { + "epoch": 2.331479421579533, + "grad_norm": 7.421875, + "learning_rate": 7.668520578420467e-06, + "loss": 3.3792, + "mean_token_accuracy": 0.4250453214335518, + "step": 12576 + }, + { + "epoch": 2.331664812754913, + "grad_norm": 6.94921875, + "learning_rate": 7.668335187245088e-06, + "loss": 3.6605, + "mean_token_accuracy": 0.4052940442002747, + "step": 12577 + }, + { + "epoch": 2.331850203930293, + "grad_norm": 7.81640625, + "learning_rate": 7.668149796069708e-06, + "loss": 2.8406, + "mean_token_accuracy": 0.49838237559698045, + "step": 12578 + }, + { + "epoch": 2.332035595105673, + "grad_norm": 7.71875, + "learning_rate": 7.667964404894327e-06, + "loss": 3.1928, + "mean_token_accuracy": 0.455985095482068, + "step": 12579 + }, + { + "epoch": 2.332220986281053, + "grad_norm": 7.14453125, + "learning_rate": 7.667779013718948e-06, + "loss": 2.7584, + "mean_token_accuracy": 0.48723949626657753, + "step": 12580 + }, + { + "epoch": 2.332406377456433, + "grad_norm": 8.015625, + "learning_rate": 7.667593622543568e-06, + "loss": 3.3051, + "mean_token_accuracy": 0.46434549715433543, + "step": 12581 + }, + { + "epoch": 2.332591768631813, + "grad_norm": 7.74609375, + "learning_rate": 7.667408231368187e-06, + "loss": 2.6728, + "mean_token_accuracy": 0.4903015892879489, + "step": 12582 + }, + { + "epoch": 2.3327771598071934, + "grad_norm": 7.42578125, + "learning_rate": 7.667222840192807e-06, + "loss": 3.0481, + "mean_token_accuracy": 0.4771587008185899, + "step": 12583 + }, + { + "epoch": 2.332962550982573, + "grad_norm": 12.5078125, + "learning_rate": 7.667037449017426e-06, + "loss": 3.3073, + "mean_token_accuracy": 0.4351243547630221, + "step": 12584 + }, + { + "epoch": 2.3331479421579533, + "grad_norm": 7.6015625, + "learning_rate": 7.666852057842047e-06, + "loss": 2.7259, + "mean_token_accuracy": 0.47663682148852826, + "step": 12585 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 9.2890625, + "learning_rate": 7.666666666666667e-06, + "loss": 2.5987, + "mean_token_accuracy": 0.5103199174406605, + "step": 12586 + }, + { + "epoch": 2.3335187245087132, + "grad_norm": 9.8125, + "learning_rate": 7.666481275491288e-06, + "loss": 3.2439, + "mean_token_accuracy": 0.45442207645329113, + "step": 12587 + }, + { + "epoch": 2.3337041156840934, + "grad_norm": 9.1875, + "learning_rate": 7.666295884315908e-06, + "loss": 2.8019, + "mean_token_accuracy": 0.4652027419542233, + "step": 12588 + }, + { + "epoch": 2.3338895068594736, + "grad_norm": 8.5859375, + "learning_rate": 7.666110493140527e-06, + "loss": 2.5149, + "mean_token_accuracy": 0.500669164882227, + "step": 12589 + }, + { + "epoch": 2.3340748980348534, + "grad_norm": 7.83984375, + "learning_rate": 7.665925101965148e-06, + "loss": 2.5843, + "mean_token_accuracy": 0.4925481643038895, + "step": 12590 + }, + { + "epoch": 2.3342602892102335, + "grad_norm": 6.82421875, + "learning_rate": 7.665739710789766e-06, + "loss": 2.1813, + "mean_token_accuracy": 0.5510982778178818, + "step": 12591 + }, + { + "epoch": 2.3344456803856137, + "grad_norm": 9.2578125, + "learning_rate": 7.665554319614387e-06, + "loss": 2.5247, + "mean_token_accuracy": 0.505558093497792, + "step": 12592 + }, + { + "epoch": 2.3346310715609935, + "grad_norm": 7.328125, + "learning_rate": 7.665368928439007e-06, + "loss": 3.126, + "mean_token_accuracy": 0.46557759626604434, + "step": 12593 + }, + { + "epoch": 2.3348164627363737, + "grad_norm": 9.5546875, + "learning_rate": 7.665183537263628e-06, + "loss": 2.3803, + "mean_token_accuracy": 0.5134812060034533, + "step": 12594 + }, + { + "epoch": 2.335001853911754, + "grad_norm": 8.6640625, + "learning_rate": 7.664998146088247e-06, + "loss": 2.9333, + "mean_token_accuracy": 0.48565719994291423, + "step": 12595 + }, + { + "epoch": 2.335187245087134, + "grad_norm": 6.80078125, + "learning_rate": 7.664812754912867e-06, + "loss": 2.6298, + "mean_token_accuracy": 0.5078457446808511, + "step": 12596 + }, + { + "epoch": 2.335372636262514, + "grad_norm": 7.03515625, + "learning_rate": 7.664627363737488e-06, + "loss": 2.7339, + "mean_token_accuracy": 0.4856924254016832, + "step": 12597 + }, + { + "epoch": 2.335558027437894, + "grad_norm": 6.8671875, + "learning_rate": 7.664441972562106e-06, + "loss": 3.4625, + "mean_token_accuracy": 0.42072538860103625, + "step": 12598 + }, + { + "epoch": 2.335743418613274, + "grad_norm": 7.4609375, + "learning_rate": 7.664256581386727e-06, + "loss": 2.8357, + "mean_token_accuracy": 0.48549596717135984, + "step": 12599 + }, + { + "epoch": 2.335928809788654, + "grad_norm": 7.171875, + "learning_rate": 7.664071190211346e-06, + "loss": 3.169, + "mean_token_accuracy": 0.4597300166843622, + "step": 12600 + }, + { + "epoch": 2.336114200964034, + "grad_norm": 6.48828125, + "learning_rate": 7.663885799035966e-06, + "loss": 2.7641, + "mean_token_accuracy": 0.47121653488694687, + "step": 12601 + }, + { + "epoch": 2.3362995921394143, + "grad_norm": 8.0390625, + "learning_rate": 7.663700407860587e-06, + "loss": 2.7544, + "mean_token_accuracy": 0.4953324716357892, + "step": 12602 + }, + { + "epoch": 2.336484983314794, + "grad_norm": 10.109375, + "learning_rate": 7.663515016685207e-06, + "loss": 2.5253, + "mean_token_accuracy": 0.5197780993728895, + "step": 12603 + }, + { + "epoch": 2.3366703744901742, + "grad_norm": 6.03125, + "learning_rate": 7.663329625509826e-06, + "loss": 2.1609, + "mean_token_accuracy": 0.5681818181818182, + "step": 12604 + }, + { + "epoch": 2.3368557656655544, + "grad_norm": 7.84765625, + "learning_rate": 7.663144234334446e-06, + "loss": 2.8367, + "mean_token_accuracy": 0.48933809934672745, + "step": 12605 + }, + { + "epoch": 2.337041156840934, + "grad_norm": 7.390625, + "learning_rate": 7.662958843159067e-06, + "loss": 2.4827, + "mean_token_accuracy": 0.5052050671014674, + "step": 12606 + }, + { + "epoch": 2.3372265480163144, + "grad_norm": 6.34375, + "learning_rate": 7.662773451983686e-06, + "loss": 2.7794, + "mean_token_accuracy": 0.4874662553027381, + "step": 12607 + }, + { + "epoch": 2.3374119391916945, + "grad_norm": 7.453125, + "learning_rate": 7.662588060808306e-06, + "loss": 3.0296, + "mean_token_accuracy": 0.45680839612486546, + "step": 12608 + }, + { + "epoch": 2.3375973303670747, + "grad_norm": 6.796875, + "learning_rate": 7.662402669632925e-06, + "loss": 2.8897, + "mean_token_accuracy": 0.4877407008731013, + "step": 12609 + }, + { + "epoch": 2.3377827215424545, + "grad_norm": 7.96484375, + "learning_rate": 7.662217278457547e-06, + "loss": 3.7091, + "mean_token_accuracy": 0.442681025931699, + "step": 12610 + }, + { + "epoch": 2.3379681127178347, + "grad_norm": 6.05859375, + "learning_rate": 7.662031887282166e-06, + "loss": 2.6445, + "mean_token_accuracy": 0.5008590197069227, + "step": 12611 + }, + { + "epoch": 2.338153503893215, + "grad_norm": 7.4921875, + "learning_rate": 7.661846496106786e-06, + "loss": 2.7118, + "mean_token_accuracy": 0.48599857853589196, + "step": 12612 + }, + { + "epoch": 2.3383388950685946, + "grad_norm": 6.140625, + "learning_rate": 7.661661104931405e-06, + "loss": 2.6083, + "mean_token_accuracy": 0.525648690661246, + "step": 12613 + }, + { + "epoch": 2.338524286243975, + "grad_norm": 8.3359375, + "learning_rate": 7.661475713756026e-06, + "loss": 2.3451, + "mean_token_accuracy": 0.5182229316307528, + "step": 12614 + }, + { + "epoch": 2.338709677419355, + "grad_norm": 6.55078125, + "learning_rate": 7.661290322580646e-06, + "loss": 3.0661, + "mean_token_accuracy": 0.4537600389389146, + "step": 12615 + }, + { + "epoch": 2.3388950685947347, + "grad_norm": 6.31640625, + "learning_rate": 7.661104931405265e-06, + "loss": 2.8766, + "mean_token_accuracy": 0.48340306834030683, + "step": 12616 + }, + { + "epoch": 2.339080459770115, + "grad_norm": 7.1015625, + "learning_rate": 7.660919540229886e-06, + "loss": 2.7529, + "mean_token_accuracy": 0.4805543770329515, + "step": 12617 + }, + { + "epoch": 2.339265850945495, + "grad_norm": 6.79296875, + "learning_rate": 7.660734149054506e-06, + "loss": 2.8647, + "mean_token_accuracy": 0.4725394235997825, + "step": 12618 + }, + { + "epoch": 2.339451242120875, + "grad_norm": 7.69140625, + "learning_rate": 7.660548757879127e-06, + "loss": 3.1733, + "mean_token_accuracy": 0.45658706310295744, + "step": 12619 + }, + { + "epoch": 2.339636633296255, + "grad_norm": 6.75390625, + "learning_rate": 7.660363366703745e-06, + "loss": 2.6518, + "mean_token_accuracy": 0.49564243027888444, + "step": 12620 + }, + { + "epoch": 2.3398220244716352, + "grad_norm": 7.296875, + "learning_rate": 7.660177975528366e-06, + "loss": 2.9179, + "mean_token_accuracy": 0.49985915492957744, + "step": 12621 + }, + { + "epoch": 2.3400074156470154, + "grad_norm": 7.4375, + "learning_rate": 7.659992584352985e-06, + "loss": 3.2764, + "mean_token_accuracy": 0.44620341832446064, + "step": 12622 + }, + { + "epoch": 2.340192806822395, + "grad_norm": 6.01171875, + "learning_rate": 7.659807193177605e-06, + "loss": 2.7113, + "mean_token_accuracy": 0.519356343283582, + "step": 12623 + }, + { + "epoch": 2.3403781979977754, + "grad_norm": 6.98828125, + "learning_rate": 7.659621802002226e-06, + "loss": 2.7203, + "mean_token_accuracy": 0.48994749647842234, + "step": 12624 + }, + { + "epoch": 2.3405635891731555, + "grad_norm": 6.60546875, + "learning_rate": 7.659436410826844e-06, + "loss": 2.2241, + "mean_token_accuracy": 0.569476637794315, + "step": 12625 + }, + { + "epoch": 2.3407489803485353, + "grad_norm": 8.5625, + "learning_rate": 7.659251019651465e-06, + "loss": 2.8383, + "mean_token_accuracy": 0.4978023216499493, + "step": 12626 + }, + { + "epoch": 2.3409343715239155, + "grad_norm": 7.109375, + "learning_rate": 7.659065628476085e-06, + "loss": 3.2374, + "mean_token_accuracy": 0.46051611474138887, + "step": 12627 + }, + { + "epoch": 2.3411197626992957, + "grad_norm": 7.18359375, + "learning_rate": 7.658880237300706e-06, + "loss": 2.7629, + "mean_token_accuracy": 0.4919365767719203, + "step": 12628 + }, + { + "epoch": 2.3413051538746754, + "grad_norm": 7.3203125, + "learning_rate": 7.658694846125325e-06, + "loss": 2.66, + "mean_token_accuracy": 0.500991817505579, + "step": 12629 + }, + { + "epoch": 2.3414905450500556, + "grad_norm": 6.98828125, + "learning_rate": 7.658509454949945e-06, + "loss": 2.6718, + "mean_token_accuracy": 0.5397245762711864, + "step": 12630 + }, + { + "epoch": 2.341675936225436, + "grad_norm": 8.828125, + "learning_rate": 7.658324063774566e-06, + "loss": 3.2498, + "mean_token_accuracy": 0.45516627078384797, + "step": 12631 + }, + { + "epoch": 2.3418613274008155, + "grad_norm": 7.0, + "learning_rate": 7.658138672599184e-06, + "loss": 2.8458, + "mean_token_accuracy": 0.5068824686129179, + "step": 12632 + }, + { + "epoch": 2.3420467185761957, + "grad_norm": 7.69921875, + "learning_rate": 7.657953281423805e-06, + "loss": 3.0113, + "mean_token_accuracy": 0.48765586034912717, + "step": 12633 + }, + { + "epoch": 2.342232109751576, + "grad_norm": 11.0078125, + "learning_rate": 7.657767890248424e-06, + "loss": 2.6734, + "mean_token_accuracy": 0.5218462549277266, + "step": 12634 + }, + { + "epoch": 2.342417500926956, + "grad_norm": 8.453125, + "learning_rate": 7.657582499073046e-06, + "loss": 2.969, + "mean_token_accuracy": 0.47216664705190065, + "step": 12635 + }, + { + "epoch": 2.342602892102336, + "grad_norm": 9.2734375, + "learning_rate": 7.657397107897665e-06, + "loss": 3.241, + "mean_token_accuracy": 0.44176239639377635, + "step": 12636 + }, + { + "epoch": 2.342788283277716, + "grad_norm": 6.9140625, + "learning_rate": 7.657211716722285e-06, + "loss": 2.8486, + "mean_token_accuracy": 0.47477781950181497, + "step": 12637 + }, + { + "epoch": 2.342973674453096, + "grad_norm": 6.92578125, + "learning_rate": 7.657026325546904e-06, + "loss": 2.9696, + "mean_token_accuracy": 0.4737189646064448, + "step": 12638 + }, + { + "epoch": 2.343159065628476, + "grad_norm": 7.828125, + "learning_rate": 7.656840934371524e-06, + "loss": 2.774, + "mean_token_accuracy": 0.5122652681441381, + "step": 12639 + }, + { + "epoch": 2.343344456803856, + "grad_norm": 7.68359375, + "learning_rate": 7.656655543196145e-06, + "loss": 2.7355, + "mean_token_accuracy": 0.4725025513096723, + "step": 12640 + }, + { + "epoch": 2.3435298479792364, + "grad_norm": 6.7109375, + "learning_rate": 7.656470152020764e-06, + "loss": 3.5417, + "mean_token_accuracy": 0.4424864864864865, + "step": 12641 + }, + { + "epoch": 2.343715239154616, + "grad_norm": 7.296875, + "learning_rate": 7.656284760845384e-06, + "loss": 2.7497, + "mean_token_accuracy": 0.491364168618267, + "step": 12642 + }, + { + "epoch": 2.3439006303299963, + "grad_norm": 6.37890625, + "learning_rate": 7.656099369670005e-06, + "loss": 3.2085, + "mean_token_accuracy": 0.4650660264105642, + "step": 12643 + }, + { + "epoch": 2.3440860215053765, + "grad_norm": 8.390625, + "learning_rate": 7.655913978494625e-06, + "loss": 3.5763, + "mean_token_accuracy": 0.4244721169463996, + "step": 12644 + }, + { + "epoch": 2.3442714126807562, + "grad_norm": 8.1875, + "learning_rate": 7.655728587319244e-06, + "loss": 2.9791, + "mean_token_accuracy": 0.4724703589410427, + "step": 12645 + }, + { + "epoch": 2.3444568038561364, + "grad_norm": 7.26171875, + "learning_rate": 7.655543196143865e-06, + "loss": 3.647, + "mean_token_accuracy": 0.41794538361508454, + "step": 12646 + }, + { + "epoch": 2.3446421950315166, + "grad_norm": 6.6328125, + "learning_rate": 7.655357804968483e-06, + "loss": 2.5335, + "mean_token_accuracy": 0.5119386014781125, + "step": 12647 + }, + { + "epoch": 2.344827586206897, + "grad_norm": 7.10546875, + "learning_rate": 7.655172413793104e-06, + "loss": 3.0111, + "mean_token_accuracy": 0.46762087624367255, + "step": 12648 + }, + { + "epoch": 2.3450129773822765, + "grad_norm": 7.55859375, + "learning_rate": 7.654987022617724e-06, + "loss": 3.3048, + "mean_token_accuracy": 0.46143922018348627, + "step": 12649 + }, + { + "epoch": 2.3451983685576567, + "grad_norm": 6.69921875, + "learning_rate": 7.654801631442343e-06, + "loss": 2.7533, + "mean_token_accuracy": 0.49252955697024814, + "step": 12650 + }, + { + "epoch": 2.3453837597330365, + "grad_norm": 7.79296875, + "learning_rate": 7.654616240266964e-06, + "loss": 2.5228, + "mean_token_accuracy": 0.49957032368948723, + "step": 12651 + }, + { + "epoch": 2.3455691509084167, + "grad_norm": 8.6328125, + "learning_rate": 7.654430849091584e-06, + "loss": 3.1885, + "mean_token_accuracy": 0.47089823164304956, + "step": 12652 + }, + { + "epoch": 2.345754542083797, + "grad_norm": 5.93359375, + "learning_rate": 7.654245457916205e-06, + "loss": 3.0121, + "mean_token_accuracy": 0.4635432511915627, + "step": 12653 + }, + { + "epoch": 2.345939933259177, + "grad_norm": 12.546875, + "learning_rate": 7.654060066740823e-06, + "loss": 3.2519, + "mean_token_accuracy": 0.45813253012048194, + "step": 12654 + }, + { + "epoch": 2.346125324434557, + "grad_norm": 8.265625, + "learning_rate": 7.653874675565444e-06, + "loss": 2.6091, + "mean_token_accuracy": 0.5333966874830302, + "step": 12655 + }, + { + "epoch": 2.346310715609937, + "grad_norm": 8.328125, + "learning_rate": 7.653689284390063e-06, + "loss": 3.9857, + "mean_token_accuracy": 0.39912562684839914, + "step": 12656 + }, + { + "epoch": 2.346496106785317, + "grad_norm": 8.1640625, + "learning_rate": 7.653503893214683e-06, + "loss": 2.7098, + "mean_token_accuracy": 0.4752280009859502, + "step": 12657 + }, + { + "epoch": 2.346681497960697, + "grad_norm": 6.76171875, + "learning_rate": 7.653318502039304e-06, + "loss": 2.4499, + "mean_token_accuracy": 0.5183239962651728, + "step": 12658 + }, + { + "epoch": 2.346866889136077, + "grad_norm": 8.4453125, + "learning_rate": 7.653133110863924e-06, + "loss": 2.5942, + "mean_token_accuracy": 0.5066455696202532, + "step": 12659 + }, + { + "epoch": 2.3470522803114573, + "grad_norm": 7.19140625, + "learning_rate": 7.652947719688543e-06, + "loss": 3.0368, + "mean_token_accuracy": 0.4773418071290412, + "step": 12660 + }, + { + "epoch": 2.347237671486837, + "grad_norm": 7.98046875, + "learning_rate": 7.652762328513163e-06, + "loss": 3.4078, + "mean_token_accuracy": 0.43305414651409696, + "step": 12661 + }, + { + "epoch": 2.3474230626622172, + "grad_norm": 6.78515625, + "learning_rate": 7.652576937337784e-06, + "loss": 2.7389, + "mean_token_accuracy": 0.48809055118110234, + "step": 12662 + }, + { + "epoch": 2.3476084538375974, + "grad_norm": 8.0390625, + "learning_rate": 7.652391546162403e-06, + "loss": 3.5226, + "mean_token_accuracy": 0.42998760842627015, + "step": 12663 + }, + { + "epoch": 2.347793845012977, + "grad_norm": 7.5, + "learning_rate": 7.652206154987023e-06, + "loss": 2.9686, + "mean_token_accuracy": 0.4614777806994478, + "step": 12664 + }, + { + "epoch": 2.3479792361883574, + "grad_norm": 6.38671875, + "learning_rate": 7.652020763811642e-06, + "loss": 3.1098, + "mean_token_accuracy": 0.44243851386708527, + "step": 12665 + }, + { + "epoch": 2.3481646273637375, + "grad_norm": 10.046875, + "learning_rate": 7.651835372636262e-06, + "loss": 2.9901, + "mean_token_accuracy": 0.4679245283018868, + "step": 12666 + }, + { + "epoch": 2.3483500185391177, + "grad_norm": 10.28125, + "learning_rate": 7.651649981460883e-06, + "loss": 2.6448, + "mean_token_accuracy": 0.5109712536145603, + "step": 12667 + }, + { + "epoch": 2.3485354097144975, + "grad_norm": 8.90625, + "learning_rate": 7.651464590285503e-06, + "loss": 4.7752, + "mean_token_accuracy": 0.40296992004061427, + "step": 12668 + }, + { + "epoch": 2.3487208008898777, + "grad_norm": 7.546875, + "learning_rate": 7.651279199110124e-06, + "loss": 2.6924, + "mean_token_accuracy": 0.5085440074906367, + "step": 12669 + }, + { + "epoch": 2.348906192065258, + "grad_norm": 7.19921875, + "learning_rate": 7.651093807934743e-06, + "loss": 2.8625, + "mean_token_accuracy": 0.48123502794783074, + "step": 12670 + }, + { + "epoch": 2.3490915832406376, + "grad_norm": 6.75, + "learning_rate": 7.650908416759363e-06, + "loss": 3.0149, + "mean_token_accuracy": 0.4653578214059531, + "step": 12671 + }, + { + "epoch": 2.349276974416018, + "grad_norm": 6.4765625, + "learning_rate": 7.650723025583982e-06, + "loss": 2.5441, + "mean_token_accuracy": 0.5249773960216998, + "step": 12672 + }, + { + "epoch": 2.349462365591398, + "grad_norm": 6.6328125, + "learning_rate": 7.650537634408603e-06, + "loss": 2.7856, + "mean_token_accuracy": 0.4722145804676754, + "step": 12673 + }, + { + "epoch": 2.3496477567667777, + "grad_norm": 7.14453125, + "learning_rate": 7.650352243233223e-06, + "loss": 2.729, + "mean_token_accuracy": 0.494606521206178, + "step": 12674 + }, + { + "epoch": 2.349833147942158, + "grad_norm": 6.49609375, + "learning_rate": 7.650166852057844e-06, + "loss": 3.0673, + "mean_token_accuracy": 0.4678887641850605, + "step": 12675 + }, + { + "epoch": 2.350018539117538, + "grad_norm": 7.58203125, + "learning_rate": 7.649981460882462e-06, + "loss": 2.4124, + "mean_token_accuracy": 0.5157462345960748, + "step": 12676 + }, + { + "epoch": 2.350203930292918, + "grad_norm": 9.828125, + "learning_rate": 7.649796069707083e-06, + "loss": 3.1404, + "mean_token_accuracy": 0.47819036808821147, + "step": 12677 + }, + { + "epoch": 2.350389321468298, + "grad_norm": 9.2734375, + "learning_rate": 7.649610678531703e-06, + "loss": 2.3369, + "mean_token_accuracy": 0.5272592409444925, + "step": 12678 + }, + { + "epoch": 2.3505747126436782, + "grad_norm": 6.09375, + "learning_rate": 7.649425287356322e-06, + "loss": 2.8798, + "mean_token_accuracy": 0.4819022722702594, + "step": 12679 + }, + { + "epoch": 2.3507601038190584, + "grad_norm": 10.7265625, + "learning_rate": 7.649239896180943e-06, + "loss": 2.4492, + "mean_token_accuracy": 0.5162975344755537, + "step": 12680 + }, + { + "epoch": 2.350945494994438, + "grad_norm": 10.0078125, + "learning_rate": 7.649054505005561e-06, + "loss": 2.9156, + "mean_token_accuracy": 0.49132009491694767, + "step": 12681 + }, + { + "epoch": 2.3511308861698184, + "grad_norm": 7.421875, + "learning_rate": 7.648869113830182e-06, + "loss": 2.5295, + "mean_token_accuracy": 0.49915824915824913, + "step": 12682 + }, + { + "epoch": 2.3513162773451985, + "grad_norm": 12.0859375, + "learning_rate": 7.648683722654802e-06, + "loss": 2.5398, + "mean_token_accuracy": 0.5206725786018657, + "step": 12683 + }, + { + "epoch": 2.3515016685205783, + "grad_norm": 12.3515625, + "learning_rate": 7.648498331479423e-06, + "loss": 2.5708, + "mean_token_accuracy": 0.5052802599512591, + "step": 12684 + }, + { + "epoch": 2.3516870596959585, + "grad_norm": 8.8046875, + "learning_rate": 7.648312940304042e-06, + "loss": 2.5489, + "mean_token_accuracy": 0.5174840085287846, + "step": 12685 + }, + { + "epoch": 2.3518724508713387, + "grad_norm": 10.171875, + "learning_rate": 7.648127549128662e-06, + "loss": 2.6896, + "mean_token_accuracy": 0.5274261603375527, + "step": 12686 + }, + { + "epoch": 2.3520578420467184, + "grad_norm": 11.6953125, + "learning_rate": 7.647942157953283e-06, + "loss": 2.8721, + "mean_token_accuracy": 0.4750089466778003, + "step": 12687 + }, + { + "epoch": 2.3522432332220986, + "grad_norm": 9.6015625, + "learning_rate": 7.647756766777901e-06, + "loss": 2.8832, + "mean_token_accuracy": 0.4763661922333509, + "step": 12688 + }, + { + "epoch": 2.352428624397479, + "grad_norm": 10.5234375, + "learning_rate": 7.647571375602522e-06, + "loss": 2.5286, + "mean_token_accuracy": 0.502900160434407, + "step": 12689 + }, + { + "epoch": 2.3526140155728585, + "grad_norm": 9.390625, + "learning_rate": 7.64738598442714e-06, + "loss": 2.9186, + "mean_token_accuracy": 0.45679699457353556, + "step": 12690 + }, + { + "epoch": 2.3527994067482387, + "grad_norm": 7.015625, + "learning_rate": 7.647200593251763e-06, + "loss": 2.9403, + "mean_token_accuracy": 0.4965166908563135, + "step": 12691 + }, + { + "epoch": 2.352984797923619, + "grad_norm": 6.78125, + "learning_rate": 7.647015202076382e-06, + "loss": 2.8664, + "mean_token_accuracy": 0.4742129119196374, + "step": 12692 + }, + { + "epoch": 2.353170189098999, + "grad_norm": 9.6640625, + "learning_rate": 7.646829810901002e-06, + "loss": 3.4814, + "mean_token_accuracy": 0.4728668941979522, + "step": 12693 + }, + { + "epoch": 2.353355580274379, + "grad_norm": 14.7890625, + "learning_rate": 7.646644419725621e-06, + "loss": 2.7433, + "mean_token_accuracy": 0.49991957535788967, + "step": 12694 + }, + { + "epoch": 2.353540971449759, + "grad_norm": 8.328125, + "learning_rate": 7.646459028550241e-06, + "loss": 3.249, + "mean_token_accuracy": 0.47268810586244037, + "step": 12695 + }, + { + "epoch": 2.3537263626251392, + "grad_norm": 8.4765625, + "learning_rate": 7.646273637374862e-06, + "loss": 2.2821, + "mean_token_accuracy": 0.5386757817445246, + "step": 12696 + }, + { + "epoch": 2.353911753800519, + "grad_norm": 9.2109375, + "learning_rate": 7.64608824619948e-06, + "loss": 3.786, + "mean_token_accuracy": 0.46812543073742247, + "step": 12697 + }, + { + "epoch": 2.354097144975899, + "grad_norm": 12.046875, + "learning_rate": 7.645902855024101e-06, + "loss": 2.8505, + "mean_token_accuracy": 0.477293620150763, + "step": 12698 + }, + { + "epoch": 2.3542825361512794, + "grad_norm": 11.765625, + "learning_rate": 7.645717463848722e-06, + "loss": 3.3823, + "mean_token_accuracy": 0.44162793972679903, + "step": 12699 + }, + { + "epoch": 2.354467927326659, + "grad_norm": 10.6875, + "learning_rate": 7.645532072673342e-06, + "loss": 2.5182, + "mean_token_accuracy": 0.5151852883811646, + "step": 12700 + }, + { + "epoch": 2.3546533185020393, + "grad_norm": 7.52734375, + "learning_rate": 7.645346681497961e-06, + "loss": 3.1233, + "mean_token_accuracy": 0.44495412844036697, + "step": 12701 + }, + { + "epoch": 2.3548387096774195, + "grad_norm": 13.359375, + "learning_rate": 7.645161290322582e-06, + "loss": 2.7183, + "mean_token_accuracy": 0.4849968612680477, + "step": 12702 + }, + { + "epoch": 2.3550241008527992, + "grad_norm": 14.6640625, + "learning_rate": 7.6449758991472e-06, + "loss": 3.1737, + "mean_token_accuracy": 0.43594009983361065, + "step": 12703 + }, + { + "epoch": 2.3552094920281794, + "grad_norm": 12.0859375, + "learning_rate": 7.64479050797182e-06, + "loss": 2.6796, + "mean_token_accuracy": 0.4673304293714997, + "step": 12704 + }, + { + "epoch": 2.3553948832035596, + "grad_norm": 8.9765625, + "learning_rate": 7.644605116796441e-06, + "loss": 3.1492, + "mean_token_accuracy": 0.4340004765308554, + "step": 12705 + }, + { + "epoch": 2.35558027437894, + "grad_norm": 11.046875, + "learning_rate": 7.64441972562106e-06, + "loss": 2.3386, + "mean_token_accuracy": 0.5374172185430464, + "step": 12706 + }, + { + "epoch": 2.3557656655543195, + "grad_norm": 11.0234375, + "learning_rate": 7.644234334445682e-06, + "loss": 2.5372, + "mean_token_accuracy": 0.5053523114927656, + "step": 12707 + }, + { + "epoch": 2.3559510567296997, + "grad_norm": 7.25, + "learning_rate": 7.644048943270301e-06, + "loss": 3.5891, + "mean_token_accuracy": 0.40726500050591924, + "step": 12708 + }, + { + "epoch": 2.3561364479050795, + "grad_norm": 8.4765625, + "learning_rate": 7.643863552094922e-06, + "loss": 3.3423, + "mean_token_accuracy": 0.45510756150972564, + "step": 12709 + }, + { + "epoch": 2.3563218390804597, + "grad_norm": 19.171875, + "learning_rate": 7.64367816091954e-06, + "loss": 2.6173, + "mean_token_accuracy": 0.47494902417710455, + "step": 12710 + }, + { + "epoch": 2.35650723025584, + "grad_norm": 11.6796875, + "learning_rate": 7.643492769744161e-06, + "loss": 2.7371, + "mean_token_accuracy": 0.4911774141803016, + "step": 12711 + }, + { + "epoch": 2.35669262143122, + "grad_norm": 6.29296875, + "learning_rate": 7.643307378568781e-06, + "loss": 2.3398, + "mean_token_accuracy": 0.5554495587884569, + "step": 12712 + }, + { + "epoch": 2.3568780126066, + "grad_norm": 9.7734375, + "learning_rate": 7.6431219873934e-06, + "loss": 3.3931, + "mean_token_accuracy": 0.4663665594855306, + "step": 12713 + }, + { + "epoch": 2.35706340378198, + "grad_norm": 9.3125, + "learning_rate": 7.64293659621802e-06, + "loss": 3.5506, + "mean_token_accuracy": 0.42844503428445035, + "step": 12714 + }, + { + "epoch": 2.35724879495736, + "grad_norm": 10.6953125, + "learning_rate": 7.642751205042641e-06, + "loss": 2.5488, + "mean_token_accuracy": 0.5016935904116727, + "step": 12715 + }, + { + "epoch": 2.35743418613274, + "grad_norm": 9.8203125, + "learning_rate": 7.642565813867262e-06, + "loss": 3.867, + "mean_token_accuracy": 0.4346470087893394, + "step": 12716 + }, + { + "epoch": 2.35761957730812, + "grad_norm": 11.9453125, + "learning_rate": 7.64238042269188e-06, + "loss": 2.7364, + "mean_token_accuracy": 0.46881720430107526, + "step": 12717 + }, + { + "epoch": 2.3578049684835003, + "grad_norm": 8.2265625, + "learning_rate": 7.642195031516501e-06, + "loss": 2.8633, + "mean_token_accuracy": 0.4728379464798438, + "step": 12718 + }, + { + "epoch": 2.3579903596588805, + "grad_norm": 6.046875, + "learning_rate": 7.64200964034112e-06, + "loss": 2.5168, + "mean_token_accuracy": 0.5271246551950611, + "step": 12719 + }, + { + "epoch": 2.3581757508342602, + "grad_norm": 10.0703125, + "learning_rate": 7.64182424916574e-06, + "loss": 3.2942, + "mean_token_accuracy": 0.4385458047429878, + "step": 12720 + }, + { + "epoch": 2.3583611420096404, + "grad_norm": 12.9375, + "learning_rate": 7.64163885799036e-06, + "loss": 2.4964, + "mean_token_accuracy": 0.5030032127392093, + "step": 12721 + }, + { + "epoch": 2.35854653318502, + "grad_norm": 6.7109375, + "learning_rate": 7.64145346681498e-06, + "loss": 2.6264, + "mean_token_accuracy": 0.49041132125380243, + "step": 12722 + }, + { + "epoch": 2.3587319243604004, + "grad_norm": 9.4609375, + "learning_rate": 7.6412680756396e-06, + "loss": 2.4269, + "mean_token_accuracy": 0.52453653217012, + "step": 12723 + }, + { + "epoch": 2.3589173155357805, + "grad_norm": 9.484375, + "learning_rate": 7.64108268446422e-06, + "loss": 2.5169, + "mean_token_accuracy": 0.5238573581115018, + "step": 12724 + }, + { + "epoch": 2.3591027067111607, + "grad_norm": 6.640625, + "learning_rate": 7.640897293288841e-06, + "loss": 2.8032, + "mean_token_accuracy": 0.4929658340511054, + "step": 12725 + }, + { + "epoch": 2.3592880978865405, + "grad_norm": 11.6875, + "learning_rate": 7.64071190211346e-06, + "loss": 2.2529, + "mean_token_accuracy": 0.5287253141831239, + "step": 12726 + }, + { + "epoch": 2.3594734890619207, + "grad_norm": 6.734375, + "learning_rate": 7.64052651093808e-06, + "loss": 3.1237, + "mean_token_accuracy": 0.4534263438654083, + "step": 12727 + }, + { + "epoch": 2.359658880237301, + "grad_norm": 8.3046875, + "learning_rate": 7.640341119762699e-06, + "loss": 3.2458, + "mean_token_accuracy": 0.4346762355243843, + "step": 12728 + }, + { + "epoch": 2.3598442714126806, + "grad_norm": 8.796875, + "learning_rate": 7.64015572858732e-06, + "loss": 3.3514, + "mean_token_accuracy": 0.43756637839478074, + "step": 12729 + }, + { + "epoch": 2.360029662588061, + "grad_norm": 6.9140625, + "learning_rate": 7.63997033741194e-06, + "loss": 2.5698, + "mean_token_accuracy": 0.5009693237541338, + "step": 12730 + }, + { + "epoch": 2.360215053763441, + "grad_norm": 5.97265625, + "learning_rate": 7.63978494623656e-06, + "loss": 2.5455, + "mean_token_accuracy": 0.5137276328370441, + "step": 12731 + }, + { + "epoch": 2.3604004449388207, + "grad_norm": 7.109375, + "learning_rate": 7.63959955506118e-06, + "loss": 2.7781, + "mean_token_accuracy": 0.4802336028751123, + "step": 12732 + }, + { + "epoch": 2.360585836114201, + "grad_norm": 7.70703125, + "learning_rate": 7.6394141638858e-06, + "loss": 3.3729, + "mean_token_accuracy": 0.45111414279217826, + "step": 12733 + }, + { + "epoch": 2.360771227289581, + "grad_norm": 6.36328125, + "learning_rate": 7.63922877271042e-06, + "loss": 2.9118, + "mean_token_accuracy": 0.48656029897944514, + "step": 12734 + }, + { + "epoch": 2.360956618464961, + "grad_norm": 6.5703125, + "learning_rate": 7.639043381535039e-06, + "loss": 2.3134, + "mean_token_accuracy": 0.525911708253359, + "step": 12735 + }, + { + "epoch": 2.361142009640341, + "grad_norm": 9.4140625, + "learning_rate": 7.63885799035966e-06, + "loss": 3.6539, + "mean_token_accuracy": 0.46134969325153374, + "step": 12736 + }, + { + "epoch": 2.3613274008157212, + "grad_norm": 6.94140625, + "learning_rate": 7.638672599184278e-06, + "loss": 2.5971, + "mean_token_accuracy": 0.47412312975226883, + "step": 12737 + }, + { + "epoch": 2.3615127919911014, + "grad_norm": 12.4609375, + "learning_rate": 7.638487208008899e-06, + "loss": 4.3308, + "mean_token_accuracy": 0.44086021505376344, + "step": 12738 + }, + { + "epoch": 2.361698183166481, + "grad_norm": 6.43359375, + "learning_rate": 7.63830181683352e-06, + "loss": 3.1351, + "mean_token_accuracy": 0.4503222341568206, + "step": 12739 + }, + { + "epoch": 2.3618835743418614, + "grad_norm": 6.48046875, + "learning_rate": 7.63811642565814e-06, + "loss": 2.9824, + "mean_token_accuracy": 0.4892031822199773, + "step": 12740 + }, + { + "epoch": 2.3620689655172415, + "grad_norm": 7.1484375, + "learning_rate": 7.637931034482759e-06, + "loss": 3.1858, + "mean_token_accuracy": 0.4563197026022305, + "step": 12741 + }, + { + "epoch": 2.3622543566926213, + "grad_norm": 6.44921875, + "learning_rate": 7.63774564330738e-06, + "loss": 2.7439, + "mean_token_accuracy": 0.49961695607763024, + "step": 12742 + }, + { + "epoch": 2.3624397478680015, + "grad_norm": 6.94921875, + "learning_rate": 7.637560252132e-06, + "loss": 2.6781, + "mean_token_accuracy": 0.5184993531694696, + "step": 12743 + }, + { + "epoch": 2.3626251390433817, + "grad_norm": 7.796875, + "learning_rate": 7.637374860956618e-06, + "loss": 2.5922, + "mean_token_accuracy": 0.4960051134547779, + "step": 12744 + }, + { + "epoch": 2.3628105302187614, + "grad_norm": 6.97265625, + "learning_rate": 7.637189469781239e-06, + "loss": 2.827, + "mean_token_accuracy": 0.4854996243425995, + "step": 12745 + }, + { + "epoch": 2.3629959213941416, + "grad_norm": 8.25, + "learning_rate": 7.637004078605858e-06, + "loss": 2.8077, + "mean_token_accuracy": 0.48614019958112603, + "step": 12746 + }, + { + "epoch": 2.363181312569522, + "grad_norm": 7.03515625, + "learning_rate": 7.636818687430478e-06, + "loss": 2.9392, + "mean_token_accuracy": 0.5073002754820937, + "step": 12747 + }, + { + "epoch": 2.3633667037449015, + "grad_norm": 6.7734375, + "learning_rate": 7.636633296255099e-06, + "loss": 3.2193, + "mean_token_accuracy": 0.4584670689433817, + "step": 12748 + }, + { + "epoch": 2.3635520949202817, + "grad_norm": 6.5546875, + "learning_rate": 7.63644790507972e-06, + "loss": 2.8194, + "mean_token_accuracy": 0.46354625550660794, + "step": 12749 + }, + { + "epoch": 2.363737486095662, + "grad_norm": 6.86328125, + "learning_rate": 7.63626251390434e-06, + "loss": 2.9762, + "mean_token_accuracy": 0.4776251661497563, + "step": 12750 + }, + { + "epoch": 2.363922877271042, + "grad_norm": 6.6953125, + "learning_rate": 7.636077122728959e-06, + "loss": 2.9847, + "mean_token_accuracy": 0.4688009313154831, + "step": 12751 + }, + { + "epoch": 2.364108268446422, + "grad_norm": 8.375, + "learning_rate": 7.635891731553579e-06, + "loss": 2.5015, + "mean_token_accuracy": 0.5233605989186192, + "step": 12752 + }, + { + "epoch": 2.364293659621802, + "grad_norm": 8.0, + "learning_rate": 7.635706340378198e-06, + "loss": 2.5795, + "mean_token_accuracy": 0.48991488865570715, + "step": 12753 + }, + { + "epoch": 2.3644790507971822, + "grad_norm": 7.1953125, + "learning_rate": 7.635520949202818e-06, + "loss": 3.5159, + "mean_token_accuracy": 0.4509702189478053, + "step": 12754 + }, + { + "epoch": 2.364664441972562, + "grad_norm": 6.859375, + "learning_rate": 7.635335558027439e-06, + "loss": 2.4924, + "mean_token_accuracy": 0.5078646029948408, + "step": 12755 + }, + { + "epoch": 2.364849833147942, + "grad_norm": 8.1953125, + "learning_rate": 7.63515016685206e-06, + "loss": 3.449, + "mean_token_accuracy": 0.44929415154134256, + "step": 12756 + }, + { + "epoch": 2.3650352243233224, + "grad_norm": 7.09765625, + "learning_rate": 7.634964775676678e-06, + "loss": 3.0195, + "mean_token_accuracy": 0.4636363636363636, + "step": 12757 + }, + { + "epoch": 2.365220615498702, + "grad_norm": 9.71875, + "learning_rate": 7.634779384501299e-06, + "loss": 2.7043, + "mean_token_accuracy": 0.48497495826377296, + "step": 12758 + }, + { + "epoch": 2.3654060066740823, + "grad_norm": 6.61328125, + "learning_rate": 7.634593993325919e-06, + "loss": 2.9491, + "mean_token_accuracy": 0.4749941023826374, + "step": 12759 + }, + { + "epoch": 2.3655913978494625, + "grad_norm": 6.0234375, + "learning_rate": 7.634408602150538e-06, + "loss": 2.8202, + "mean_token_accuracy": 0.4921353970051592, + "step": 12760 + }, + { + "epoch": 2.3657767890248422, + "grad_norm": 9.96875, + "learning_rate": 7.634223210975158e-06, + "loss": 2.6908, + "mean_token_accuracy": 0.49056603773584906, + "step": 12761 + }, + { + "epoch": 2.3659621802002224, + "grad_norm": 11.125, + "learning_rate": 7.634037819799777e-06, + "loss": 2.8311, + "mean_token_accuracy": 0.4874585896829153, + "step": 12762 + }, + { + "epoch": 2.3661475713756026, + "grad_norm": 9.4140625, + "learning_rate": 7.633852428624398e-06, + "loss": 3.456, + "mean_token_accuracy": 0.44687375016664443, + "step": 12763 + }, + { + "epoch": 2.366332962550983, + "grad_norm": 6.12890625, + "learning_rate": 7.633667037449018e-06, + "loss": 3.2571, + "mean_token_accuracy": 0.43760896637608965, + "step": 12764 + }, + { + "epoch": 2.3665183537263625, + "grad_norm": 7.86328125, + "learning_rate": 7.633481646273639e-06, + "loss": 3.0385, + "mean_token_accuracy": 0.4922884012539185, + "step": 12765 + }, + { + "epoch": 2.3667037449017427, + "grad_norm": 8.2578125, + "learning_rate": 7.633296255098257e-06, + "loss": 3.0516, + "mean_token_accuracy": 0.45058300943920043, + "step": 12766 + }, + { + "epoch": 2.3668891360771225, + "grad_norm": 7.69921875, + "learning_rate": 7.633110863922878e-06, + "loss": 2.814, + "mean_token_accuracy": 0.4796451766196737, + "step": 12767 + }, + { + "epoch": 2.3670745272525027, + "grad_norm": 7.53125, + "learning_rate": 7.632925472747498e-06, + "loss": 2.9657, + "mean_token_accuracy": 0.4828198301001648, + "step": 12768 + }, + { + "epoch": 2.367259918427883, + "grad_norm": 7.1328125, + "learning_rate": 7.632740081572117e-06, + "loss": 2.4662, + "mean_token_accuracy": 0.5026178010471204, + "step": 12769 + }, + { + "epoch": 2.367445309603263, + "grad_norm": 6.91015625, + "learning_rate": 7.632554690396738e-06, + "loss": 2.3976, + "mean_token_accuracy": 0.5290185676392573, + "step": 12770 + }, + { + "epoch": 2.367630700778643, + "grad_norm": 6.7109375, + "learning_rate": 7.632369299221356e-06, + "loss": 2.9553, + "mean_token_accuracy": 0.47660661468136595, + "step": 12771 + }, + { + "epoch": 2.367816091954023, + "grad_norm": 9.96875, + "learning_rate": 7.632183908045979e-06, + "loss": 2.6112, + "mean_token_accuracy": 0.5271551075673236, + "step": 12772 + }, + { + "epoch": 2.368001483129403, + "grad_norm": 6.4609375, + "learning_rate": 7.631998516870597e-06, + "loss": 2.513, + "mean_token_accuracy": 0.5075061944322985, + "step": 12773 + }, + { + "epoch": 2.368186874304783, + "grad_norm": 7.6875, + "learning_rate": 7.631813125695218e-06, + "loss": 2.5828, + "mean_token_accuracy": 0.5034639409426462, + "step": 12774 + }, + { + "epoch": 2.368372265480163, + "grad_norm": 8.890625, + "learning_rate": 7.631627734519837e-06, + "loss": 2.9286, + "mean_token_accuracy": 0.5000714387769681, + "step": 12775 + }, + { + "epoch": 2.3685576566555433, + "grad_norm": 7.15234375, + "learning_rate": 7.631442343344457e-06, + "loss": 3.4681, + "mean_token_accuracy": 0.42647552917459963, + "step": 12776 + }, + { + "epoch": 2.3687430478309235, + "grad_norm": 9.90625, + "learning_rate": 7.631256952169078e-06, + "loss": 2.1824, + "mean_token_accuracy": 0.5376392995224016, + "step": 12777 + }, + { + "epoch": 2.3689284390063032, + "grad_norm": 6.87890625, + "learning_rate": 7.631071560993697e-06, + "loss": 3.3282, + "mean_token_accuracy": 0.45767272028442396, + "step": 12778 + }, + { + "epoch": 2.3691138301816834, + "grad_norm": 9.296875, + "learning_rate": 7.630886169818317e-06, + "loss": 2.4789, + "mean_token_accuracy": 0.5095939933259177, + "step": 12779 + }, + { + "epoch": 2.369299221357063, + "grad_norm": 6.55859375, + "learning_rate": 7.630700778642938e-06, + "loss": 2.8441, + "mean_token_accuracy": 0.47527472527472525, + "step": 12780 + }, + { + "epoch": 2.3694846125324434, + "grad_norm": 9.4140625, + "learning_rate": 7.630515387467558e-06, + "loss": 2.7123, + "mean_token_accuracy": 0.4901937335565654, + "step": 12781 + }, + { + "epoch": 2.3696700037078235, + "grad_norm": 7.55859375, + "learning_rate": 7.630329996292177e-06, + "loss": 3.7741, + "mean_token_accuracy": 0.4423229912490056, + "step": 12782 + }, + { + "epoch": 2.3698553948832037, + "grad_norm": 7.38671875, + "learning_rate": 7.630144605116797e-06, + "loss": 3.007, + "mean_token_accuracy": 0.47838696312664, + "step": 12783 + }, + { + "epoch": 2.3700407860585835, + "grad_norm": 8.0859375, + "learning_rate": 7.629959213941416e-06, + "loss": 3.2904, + "mean_token_accuracy": 0.45183863885839737, + "step": 12784 + }, + { + "epoch": 2.3702261772339637, + "grad_norm": 9.9609375, + "learning_rate": 7.629773822766037e-06, + "loss": 2.3593, + "mean_token_accuracy": 0.5288651084201633, + "step": 12785 + }, + { + "epoch": 2.370411568409344, + "grad_norm": 6.515625, + "learning_rate": 7.629588431590657e-06, + "loss": 2.935, + "mean_token_accuracy": 0.45790634323517965, + "step": 12786 + }, + { + "epoch": 2.3705969595847236, + "grad_norm": 8.2265625, + "learning_rate": 7.629403040415276e-06, + "loss": 2.9545, + "mean_token_accuracy": 0.475384349628606, + "step": 12787 + }, + { + "epoch": 2.370782350760104, + "grad_norm": 6.53125, + "learning_rate": 7.629217649239897e-06, + "loss": 2.971, + "mean_token_accuracy": 0.46498467342607874, + "step": 12788 + }, + { + "epoch": 2.370967741935484, + "grad_norm": 6.67578125, + "learning_rate": 7.629032258064517e-06, + "loss": 2.6182, + "mean_token_accuracy": 0.5018462032428961, + "step": 12789 + }, + { + "epoch": 2.3711531331108637, + "grad_norm": 6.2734375, + "learning_rate": 7.6288468668891365e-06, + "loss": 3.0286, + "mean_token_accuracy": 0.4704869265695945, + "step": 12790 + }, + { + "epoch": 2.371338524286244, + "grad_norm": 7.84375, + "learning_rate": 7.628661475713757e-06, + "loss": 2.747, + "mean_token_accuracy": 0.4905637840420449, + "step": 12791 + }, + { + "epoch": 2.371523915461624, + "grad_norm": 6.19140625, + "learning_rate": 7.628476084538377e-06, + "loss": 2.7038, + "mean_token_accuracy": 0.48619224641529474, + "step": 12792 + }, + { + "epoch": 2.371709306637004, + "grad_norm": 7.65234375, + "learning_rate": 7.628290693362996e-06, + "loss": 2.4417, + "mean_token_accuracy": 0.5021608810818347, + "step": 12793 + }, + { + "epoch": 2.371894697812384, + "grad_norm": 7.1875, + "learning_rate": 7.628105302187616e-06, + "loss": 2.465, + "mean_token_accuracy": 0.490539916935856, + "step": 12794 + }, + { + "epoch": 2.3720800889877642, + "grad_norm": 9.1796875, + "learning_rate": 7.6279199110122356e-06, + "loss": 1.8646, + "mean_token_accuracy": 0.577714012434242, + "step": 12795 + }, + { + "epoch": 2.3722654801631444, + "grad_norm": 9.03125, + "learning_rate": 7.627734519836857e-06, + "loss": 3.0283, + "mean_token_accuracy": 0.459721146398141, + "step": 12796 + }, + { + "epoch": 2.372450871338524, + "grad_norm": 8.28125, + "learning_rate": 7.6275491286614766e-06, + "loss": 3.0281, + "mean_token_accuracy": 0.4505971769815418, + "step": 12797 + }, + { + "epoch": 2.3726362625139044, + "grad_norm": 7.25, + "learning_rate": 7.627363737486096e-06, + "loss": 2.6155, + "mean_token_accuracy": 0.5033410278216499, + "step": 12798 + }, + { + "epoch": 2.3728216536892845, + "grad_norm": 7.04296875, + "learning_rate": 7.627178346310717e-06, + "loss": 2.5541, + "mean_token_accuracy": 0.5141664558563117, + "step": 12799 + }, + { + "epoch": 2.3730070448646643, + "grad_norm": 8.9140625, + "learning_rate": 7.626992955135336e-06, + "loss": 3.4132, + "mean_token_accuracy": 0.45859133126934987, + "step": 12800 + }, + { + "epoch": 2.3731924360400445, + "grad_norm": 7.10546875, + "learning_rate": 7.626807563959956e-06, + "loss": 2.4952, + "mean_token_accuracy": 0.5132189489586706, + "step": 12801 + }, + { + "epoch": 2.3733778272154247, + "grad_norm": 6.3515625, + "learning_rate": 7.626622172784576e-06, + "loss": 2.9273, + "mean_token_accuracy": 0.47131593257205, + "step": 12802 + }, + { + "epoch": 2.3735632183908044, + "grad_norm": 6.890625, + "learning_rate": 7.626436781609195e-06, + "loss": 3.5297, + "mean_token_accuracy": 0.43121172353455817, + "step": 12803 + }, + { + "epoch": 2.3737486095661846, + "grad_norm": 5.94921875, + "learning_rate": 7.626251390433817e-06, + "loss": 2.8368, + "mean_token_accuracy": 0.490608284612514, + "step": 12804 + }, + { + "epoch": 2.373934000741565, + "grad_norm": 7.62890625, + "learning_rate": 7.626065999258436e-06, + "loss": 3.4998, + "mean_token_accuracy": 0.4524793388429752, + "step": 12805 + }, + { + "epoch": 2.3741193919169445, + "grad_norm": 6.06640625, + "learning_rate": 7.625880608083056e-06, + "loss": 2.3121, + "mean_token_accuracy": 0.5234814398200225, + "step": 12806 + }, + { + "epoch": 2.3743047830923247, + "grad_norm": 7.0703125, + "learning_rate": 7.6256952169076755e-06, + "loss": 3.0902, + "mean_token_accuracy": 0.43080900243309, + "step": 12807 + }, + { + "epoch": 2.374490174267705, + "grad_norm": 6.6015625, + "learning_rate": 7.625509825732296e-06, + "loss": 3.2591, + "mean_token_accuracy": 0.446748053041465, + "step": 12808 + }, + { + "epoch": 2.374675565443085, + "grad_norm": 9.34375, + "learning_rate": 7.625324434556916e-06, + "loss": 2.8188, + "mean_token_accuracy": 0.47324743393754093, + "step": 12809 + }, + { + "epoch": 2.374860956618465, + "grad_norm": 6.33203125, + "learning_rate": 7.625139043381535e-06, + "loss": 2.7375, + "mean_token_accuracy": 0.4955296667569764, + "step": 12810 + }, + { + "epoch": 2.375046347793845, + "grad_norm": 8.1640625, + "learning_rate": 7.624953652206155e-06, + "loss": 2.8593, + "mean_token_accuracy": 0.4711111111111111, + "step": 12811 + }, + { + "epoch": 2.3752317389692252, + "grad_norm": 8.640625, + "learning_rate": 7.624768261030776e-06, + "loss": 3.0338, + "mean_token_accuracy": 0.45329315540249676, + "step": 12812 + }, + { + "epoch": 2.375417130144605, + "grad_norm": 8.453125, + "learning_rate": 7.624582869855396e-06, + "loss": 2.7287, + "mean_token_accuracy": 0.49201791661881245, + "step": 12813 + }, + { + "epoch": 2.375602521319985, + "grad_norm": 6.98046875, + "learning_rate": 7.624397478680016e-06, + "loss": 2.9999, + "mean_token_accuracy": 0.46235619630115043, + "step": 12814 + }, + { + "epoch": 2.3757879124953654, + "grad_norm": 8.0078125, + "learning_rate": 7.624212087504635e-06, + "loss": 3.1702, + "mean_token_accuracy": 0.45966207899742645, + "step": 12815 + }, + { + "epoch": 2.375973303670745, + "grad_norm": 7.17578125, + "learning_rate": 7.624026696329255e-06, + "loss": 2.641, + "mean_token_accuracy": 0.49703703703703705, + "step": 12816 + }, + { + "epoch": 2.3761586948461253, + "grad_norm": 6.90625, + "learning_rate": 7.623841305153875e-06, + "loss": 2.6376, + "mean_token_accuracy": 0.5033916117321104, + "step": 12817 + }, + { + "epoch": 2.3763440860215055, + "grad_norm": 8.6171875, + "learning_rate": 7.623655913978495e-06, + "loss": 2.9636, + "mean_token_accuracy": 0.4859513930053349, + "step": 12818 + }, + { + "epoch": 2.3765294771968852, + "grad_norm": 6.7890625, + "learning_rate": 7.623470522803115e-06, + "loss": 2.6822, + "mean_token_accuracy": 0.4948914431673052, + "step": 12819 + }, + { + "epoch": 2.3767148683722654, + "grad_norm": 8.484375, + "learning_rate": 7.623285131627736e-06, + "loss": 2.4426, + "mean_token_accuracy": 0.5233412322274882, + "step": 12820 + }, + { + "epoch": 2.3769002595476456, + "grad_norm": 7.6875, + "learning_rate": 7.623099740452356e-06, + "loss": 2.562, + "mean_token_accuracy": 0.5269776440240757, + "step": 12821 + }, + { + "epoch": 2.377085650723026, + "grad_norm": 7.609375, + "learning_rate": 7.622914349276975e-06, + "loss": 2.6695, + "mean_token_accuracy": 0.47727876694127025, + "step": 12822 + }, + { + "epoch": 2.3772710418984055, + "grad_norm": 7.578125, + "learning_rate": 7.622728958101595e-06, + "loss": 2.6549, + "mean_token_accuracy": 0.5326473577235772, + "step": 12823 + }, + { + "epoch": 2.3774564330737857, + "grad_norm": 7.83984375, + "learning_rate": 7.6225435669262146e-06, + "loss": 2.8208, + "mean_token_accuracy": 0.48042574015170053, + "step": 12824 + }, + { + "epoch": 2.377641824249166, + "grad_norm": 8.1328125, + "learning_rate": 7.622358175750835e-06, + "loss": 3.0595, + "mean_token_accuracy": 0.45216606498194944, + "step": 12825 + }, + { + "epoch": 2.3778272154245457, + "grad_norm": 7.1953125, + "learning_rate": 7.622172784575455e-06, + "loss": 2.216, + "mean_token_accuracy": 0.5622107629247654, + "step": 12826 + }, + { + "epoch": 2.378012606599926, + "grad_norm": 8.59375, + "learning_rate": 7.621987393400074e-06, + "loss": 3.2191, + "mean_token_accuracy": 0.45646265898467225, + "step": 12827 + }, + { + "epoch": 2.378197997775306, + "grad_norm": 7.56640625, + "learning_rate": 7.621802002224695e-06, + "loss": 2.6798, + "mean_token_accuracy": 0.5193181818181818, + "step": 12828 + }, + { + "epoch": 2.378383388950686, + "grad_norm": 7.24609375, + "learning_rate": 7.621616611049315e-06, + "loss": 2.6441, + "mean_token_accuracy": 0.49170687575392036, + "step": 12829 + }, + { + "epoch": 2.378568780126066, + "grad_norm": 11.1484375, + "learning_rate": 7.621431219873935e-06, + "loss": 4.4324, + "mean_token_accuracy": 0.4349605378544285, + "step": 12830 + }, + { + "epoch": 2.378754171301446, + "grad_norm": 8.6171875, + "learning_rate": 7.621245828698555e-06, + "loss": 2.534, + "mean_token_accuracy": 0.48733862959285007, + "step": 12831 + }, + { + "epoch": 2.378939562476826, + "grad_norm": 8.203125, + "learning_rate": 7.621060437523174e-06, + "loss": 4.0336, + "mean_token_accuracy": 0.4362327358213341, + "step": 12832 + }, + { + "epoch": 2.379124953652206, + "grad_norm": 8.34375, + "learning_rate": 7.620875046347794e-06, + "loss": 2.8846, + "mean_token_accuracy": 0.47443216324547555, + "step": 12833 + }, + { + "epoch": 2.3793103448275863, + "grad_norm": 7.42578125, + "learning_rate": 7.620689655172414e-06, + "loss": 3.2789, + "mean_token_accuracy": 0.4586063132817153, + "step": 12834 + }, + { + "epoch": 2.3794957360029665, + "grad_norm": 6.84765625, + "learning_rate": 7.620504263997034e-06, + "loss": 2.6632, + "mean_token_accuracy": 0.48108907231735865, + "step": 12835 + }, + { + "epoch": 2.3796811271783462, + "grad_norm": 7.9453125, + "learning_rate": 7.6203188728216545e-06, + "loss": 2.7342, + "mean_token_accuracy": 0.5299502487562189, + "step": 12836 + }, + { + "epoch": 2.3798665183537264, + "grad_norm": 7.3984375, + "learning_rate": 7.620133481646275e-06, + "loss": 2.6479, + "mean_token_accuracy": 0.48641941265683164, + "step": 12837 + }, + { + "epoch": 2.380051909529106, + "grad_norm": 12.4375, + "learning_rate": 7.619948090470895e-06, + "loss": 3.1233, + "mean_token_accuracy": 0.4884450784593438, + "step": 12838 + }, + { + "epoch": 2.3802373007044864, + "grad_norm": 9.484375, + "learning_rate": 7.619762699295514e-06, + "loss": 3.0892, + "mean_token_accuracy": 0.4685679239497087, + "step": 12839 + }, + { + "epoch": 2.3804226918798665, + "grad_norm": 7.12109375, + "learning_rate": 7.619577308120134e-06, + "loss": 2.8422, + "mean_token_accuracy": 0.47885491216655823, + "step": 12840 + }, + { + "epoch": 2.3806080830552467, + "grad_norm": 6.81640625, + "learning_rate": 7.619391916944754e-06, + "loss": 2.6232, + "mean_token_accuracy": 0.4951985093879891, + "step": 12841 + }, + { + "epoch": 2.3807934742306265, + "grad_norm": 7.10546875, + "learning_rate": 7.619206525769374e-06, + "loss": 2.7345, + "mean_token_accuracy": 0.484733756717147, + "step": 12842 + }, + { + "epoch": 2.3809788654060067, + "grad_norm": 7.8046875, + "learning_rate": 7.619021134593994e-06, + "loss": 3.1829, + "mean_token_accuracy": 0.46738333740532617, + "step": 12843 + }, + { + "epoch": 2.381164256581387, + "grad_norm": 6.37109375, + "learning_rate": 7.618835743418614e-06, + "loss": 2.319, + "mean_token_accuracy": 0.5451876436047995, + "step": 12844 + }, + { + "epoch": 2.3813496477567666, + "grad_norm": 6.390625, + "learning_rate": 7.618650352243234e-06, + "loss": 2.8467, + "mean_token_accuracy": 0.49559533958510943, + "step": 12845 + }, + { + "epoch": 2.381535038932147, + "grad_norm": 5.94921875, + "learning_rate": 7.618464961067854e-06, + "loss": 2.7228, + "mean_token_accuracy": 0.4895330112721417, + "step": 12846 + }, + { + "epoch": 2.381720430107527, + "grad_norm": 7.09375, + "learning_rate": 7.618279569892474e-06, + "loss": 2.5239, + "mean_token_accuracy": 0.49325100516944287, + "step": 12847 + }, + { + "epoch": 2.381905821282907, + "grad_norm": 6.47265625, + "learning_rate": 7.618094178717094e-06, + "loss": 3.1132, + "mean_token_accuracy": 0.4339475733194913, + "step": 12848 + }, + { + "epoch": 2.382091212458287, + "grad_norm": 8.1171875, + "learning_rate": 7.617908787541713e-06, + "loss": 2.6658, + "mean_token_accuracy": 0.4815748031496063, + "step": 12849 + }, + { + "epoch": 2.382276603633667, + "grad_norm": 7.58984375, + "learning_rate": 7.617723396366333e-06, + "loss": 2.9789, + "mean_token_accuracy": 0.465514839069249, + "step": 12850 + }, + { + "epoch": 2.382461994809047, + "grad_norm": 8.8203125, + "learning_rate": 7.6175380051909534e-06, + "loss": 3.4512, + "mean_token_accuracy": 0.4434701877449969, + "step": 12851 + }, + { + "epoch": 2.382647385984427, + "grad_norm": 6.8046875, + "learning_rate": 7.617352614015574e-06, + "loss": 3.4752, + "mean_token_accuracy": 0.45186781609195403, + "step": 12852 + }, + { + "epoch": 2.3828327771598072, + "grad_norm": 7.76171875, + "learning_rate": 7.6171672228401936e-06, + "loss": 2.8936, + "mean_token_accuracy": 0.47450587224291035, + "step": 12853 + }, + { + "epoch": 2.3830181683351874, + "grad_norm": 7.13671875, + "learning_rate": 7.616981831664813e-06, + "loss": 3.2169, + "mean_token_accuracy": 0.45313782991202345, + "step": 12854 + }, + { + "epoch": 2.383203559510567, + "grad_norm": 6.23046875, + "learning_rate": 7.616796440489434e-06, + "loss": 2.547, + "mean_token_accuracy": 0.5054102177783866, + "step": 12855 + }, + { + "epoch": 2.3833889506859474, + "grad_norm": 8.0234375, + "learning_rate": 7.616611049314053e-06, + "loss": 3.2302, + "mean_token_accuracy": 0.46348019165595417, + "step": 12856 + }, + { + "epoch": 2.3835743418613276, + "grad_norm": 8.71875, + "learning_rate": 7.616425658138673e-06, + "loss": 3.3054, + "mean_token_accuracy": 0.4574215330749031, + "step": 12857 + }, + { + "epoch": 2.3837597330367073, + "grad_norm": 5.88671875, + "learning_rate": 7.616240266963293e-06, + "loss": 2.4971, + "mean_token_accuracy": 0.5349489103897278, + "step": 12858 + }, + { + "epoch": 2.3839451242120875, + "grad_norm": 7.33203125, + "learning_rate": 7.616054875787912e-06, + "loss": 3.0072, + "mean_token_accuracy": 0.4363283775048481, + "step": 12859 + }, + { + "epoch": 2.3841305153874677, + "grad_norm": 6.35546875, + "learning_rate": 7.615869484612534e-06, + "loss": 2.9194, + "mean_token_accuracy": 0.4763799104922924, + "step": 12860 + }, + { + "epoch": 2.3843159065628474, + "grad_norm": 7.11328125, + "learning_rate": 7.615684093437153e-06, + "loss": 2.6495, + "mean_token_accuracy": 0.5037143541816439, + "step": 12861 + }, + { + "epoch": 2.3845012977382276, + "grad_norm": 7.609375, + "learning_rate": 7.615498702261773e-06, + "loss": 2.6164, + "mean_token_accuracy": 0.5080191184280404, + "step": 12862 + }, + { + "epoch": 2.384686688913608, + "grad_norm": 7.1953125, + "learning_rate": 7.615313311086393e-06, + "loss": 3.049, + "mean_token_accuracy": 0.46779038718291055, + "step": 12863 + }, + { + "epoch": 2.3848720800889875, + "grad_norm": 7.828125, + "learning_rate": 7.615127919911013e-06, + "loss": 2.7287, + "mean_token_accuracy": 0.506372396642835, + "step": 12864 + }, + { + "epoch": 2.3850574712643677, + "grad_norm": 7.046875, + "learning_rate": 7.614942528735633e-06, + "loss": 3.0195, + "mean_token_accuracy": 0.46064209274673007, + "step": 12865 + }, + { + "epoch": 2.385242862439748, + "grad_norm": 10.875, + "learning_rate": 7.614757137560252e-06, + "loss": 2.9555, + "mean_token_accuracy": 0.4691485694588073, + "step": 12866 + }, + { + "epoch": 2.385428253615128, + "grad_norm": 8.328125, + "learning_rate": 7.614571746384872e-06, + "loss": 2.7083, + "mean_token_accuracy": 0.48321678321678324, + "step": 12867 + }, + { + "epoch": 2.385613644790508, + "grad_norm": 8.8125, + "learning_rate": 7.6143863552094925e-06, + "loss": 2.2453, + "mean_token_accuracy": 0.5366689513365319, + "step": 12868 + }, + { + "epoch": 2.385799035965888, + "grad_norm": 10.8125, + "learning_rate": 7.614200964034113e-06, + "loss": 2.9106, + "mean_token_accuracy": 0.49093581577658013, + "step": 12869 + }, + { + "epoch": 2.3859844271412682, + "grad_norm": 7.85546875, + "learning_rate": 7.614015572858733e-06, + "loss": 3.0141, + "mean_token_accuracy": 0.4716036228023442, + "step": 12870 + }, + { + "epoch": 2.386169818316648, + "grad_norm": 8.15625, + "learning_rate": 7.613830181683352e-06, + "loss": 2.4281, + "mean_token_accuracy": 0.5031372966074658, + "step": 12871 + }, + { + "epoch": 2.386355209492028, + "grad_norm": 8.0703125, + "learning_rate": 7.613644790507973e-06, + "loss": 3.343, + "mean_token_accuracy": 0.41998318385650224, + "step": 12872 + }, + { + "epoch": 2.3865406006674084, + "grad_norm": 7.6015625, + "learning_rate": 7.613459399332592e-06, + "loss": 2.7884, + "mean_token_accuracy": 0.4732039397450753, + "step": 12873 + }, + { + "epoch": 2.386725991842788, + "grad_norm": 7.98828125, + "learning_rate": 7.613274008157212e-06, + "loss": 2.8451, + "mean_token_accuracy": 0.48303055907618647, + "step": 12874 + }, + { + "epoch": 2.3869113830181683, + "grad_norm": 8.109375, + "learning_rate": 7.613088616981832e-06, + "loss": 2.248, + "mean_token_accuracy": 0.5320586214792764, + "step": 12875 + }, + { + "epoch": 2.3870967741935485, + "grad_norm": 6.4375, + "learning_rate": 7.612903225806451e-06, + "loss": 3.1429, + "mean_token_accuracy": 0.4531781754308925, + "step": 12876 + }, + { + "epoch": 2.3872821653689282, + "grad_norm": 7.55078125, + "learning_rate": 7.612717834631073e-06, + "loss": 2.7299, + "mean_token_accuracy": 0.4880334161210205, + "step": 12877 + }, + { + "epoch": 2.3874675565443084, + "grad_norm": 7.1328125, + "learning_rate": 7.612532443455692e-06, + "loss": 2.5029, + "mean_token_accuracy": 0.4981395348837209, + "step": 12878 + }, + { + "epoch": 2.3876529477196886, + "grad_norm": 6.88671875, + "learning_rate": 7.612347052280312e-06, + "loss": 2.3593, + "mean_token_accuracy": 0.5202710207708542, + "step": 12879 + }, + { + "epoch": 2.387838338895069, + "grad_norm": 6.671875, + "learning_rate": 7.6121616611049324e-06, + "loss": 2.7799, + "mean_token_accuracy": 0.47780496329954564, + "step": 12880 + }, + { + "epoch": 2.3880237300704485, + "grad_norm": 7.578125, + "learning_rate": 7.611976269929552e-06, + "loss": 2.6593, + "mean_token_accuracy": 0.48833795465666285, + "step": 12881 + }, + { + "epoch": 2.3882091212458287, + "grad_norm": 8.03125, + "learning_rate": 7.611790878754172e-06, + "loss": 2.0249, + "mean_token_accuracy": 0.601905239408373, + "step": 12882 + }, + { + "epoch": 2.388394512421209, + "grad_norm": 6.79296875, + "learning_rate": 7.611605487578791e-06, + "loss": 2.6771, + "mean_token_accuracy": 0.5175071857852104, + "step": 12883 + }, + { + "epoch": 2.3885799035965887, + "grad_norm": 7.52734375, + "learning_rate": 7.611420096403411e-06, + "loss": 2.5734, + "mean_token_accuracy": 0.4999361348831268, + "step": 12884 + }, + { + "epoch": 2.388765294771969, + "grad_norm": 7.36328125, + "learning_rate": 7.611234705228032e-06, + "loss": 2.7689, + "mean_token_accuracy": 0.46973754686663094, + "step": 12885 + }, + { + "epoch": 2.388950685947349, + "grad_norm": 6.9453125, + "learning_rate": 7.611049314052652e-06, + "loss": 2.588, + "mean_token_accuracy": 0.5028231797919762, + "step": 12886 + }, + { + "epoch": 2.389136077122729, + "grad_norm": 5.75390625, + "learning_rate": 7.610863922877272e-06, + "loss": 2.4097, + "mean_token_accuracy": 0.5393246067539325, + "step": 12887 + }, + { + "epoch": 2.389321468298109, + "grad_norm": 6.296875, + "learning_rate": 7.610678531701891e-06, + "loss": 2.4244, + "mean_token_accuracy": 0.4878986332574032, + "step": 12888 + }, + { + "epoch": 2.389506859473489, + "grad_norm": 6.97265625, + "learning_rate": 7.610493140526512e-06, + "loss": 3.2427, + "mean_token_accuracy": 0.4654140570633264, + "step": 12889 + }, + { + "epoch": 2.389692250648869, + "grad_norm": 7.37890625, + "learning_rate": 7.6103077493511314e-06, + "loss": 2.3137, + "mean_token_accuracy": 0.5266415827163491, + "step": 12890 + }, + { + "epoch": 2.389877641824249, + "grad_norm": 6.921875, + "learning_rate": 7.610122358175751e-06, + "loss": 2.5531, + "mean_token_accuracy": 0.5078793895024196, + "step": 12891 + }, + { + "epoch": 2.3900630329996293, + "grad_norm": 6.4765625, + "learning_rate": 7.609936967000371e-06, + "loss": 3.0244, + "mean_token_accuracy": 0.4844454463480613, + "step": 12892 + }, + { + "epoch": 2.3902484241750095, + "grad_norm": 7.140625, + "learning_rate": 7.609751575824992e-06, + "loss": 3.117, + "mean_token_accuracy": 0.4550881577120644, + "step": 12893 + }, + { + "epoch": 2.3904338153503892, + "grad_norm": 6.34765625, + "learning_rate": 7.609566184649612e-06, + "loss": 2.7601, + "mean_token_accuracy": 0.47757542810546344, + "step": 12894 + }, + { + "epoch": 2.3906192065257694, + "grad_norm": 7.42578125, + "learning_rate": 7.609380793474231e-06, + "loss": 2.8455, + "mean_token_accuracy": 0.4616951276829077, + "step": 12895 + }, + { + "epoch": 2.3908045977011496, + "grad_norm": 7.90625, + "learning_rate": 7.609195402298851e-06, + "loss": 3.1748, + "mean_token_accuracy": 0.45281594443605616, + "step": 12896 + }, + { + "epoch": 2.3909899888765294, + "grad_norm": 6.4765625, + "learning_rate": 7.609010011123471e-06, + "loss": 2.6226, + "mean_token_accuracy": 0.49319498719849075, + "step": 12897 + }, + { + "epoch": 2.3911753800519095, + "grad_norm": 7.83203125, + "learning_rate": 7.608824619948091e-06, + "loss": 2.7255, + "mean_token_accuracy": 0.4897150887951961, + "step": 12898 + }, + { + "epoch": 2.3913607712272897, + "grad_norm": 7.84765625, + "learning_rate": 7.608639228772711e-06, + "loss": 2.9013, + "mean_token_accuracy": 0.4596296296296296, + "step": 12899 + }, + { + "epoch": 2.3915461624026695, + "grad_norm": 7.375, + "learning_rate": 7.60845383759733e-06, + "loss": 2.4629, + "mean_token_accuracy": 0.5579504865131174, + "step": 12900 + }, + { + "epoch": 2.3917315535780497, + "grad_norm": 10.234375, + "learning_rate": 7.608268446421952e-06, + "loss": 2.7144, + "mean_token_accuracy": 0.47274696185777887, + "step": 12901 + }, + { + "epoch": 2.39191694475343, + "grad_norm": 7.5859375, + "learning_rate": 7.608083055246571e-06, + "loss": 2.435, + "mean_token_accuracy": 0.5075818036711891, + "step": 12902 + }, + { + "epoch": 2.3921023359288096, + "grad_norm": 8.796875, + "learning_rate": 7.607897664071191e-06, + "loss": 2.8358, + "mean_token_accuracy": 0.4858757062146893, + "step": 12903 + }, + { + "epoch": 2.39228772710419, + "grad_norm": 8.3203125, + "learning_rate": 7.607712272895811e-06, + "loss": 2.5283, + "mean_token_accuracy": 0.5091582424181763, + "step": 12904 + }, + { + "epoch": 2.39247311827957, + "grad_norm": 9.3359375, + "learning_rate": 7.60752688172043e-06, + "loss": 3.6304, + "mean_token_accuracy": 0.43239208856822986, + "step": 12905 + }, + { + "epoch": 2.39265850945495, + "grad_norm": 8.328125, + "learning_rate": 7.607341490545051e-06, + "loss": 3.168, + "mean_token_accuracy": 0.4575138678769541, + "step": 12906 + }, + { + "epoch": 2.39284390063033, + "grad_norm": 7.484375, + "learning_rate": 7.6071560993696705e-06, + "loss": 2.5914, + "mean_token_accuracy": 0.5168791312559018, + "step": 12907 + }, + { + "epoch": 2.39302929180571, + "grad_norm": 8.3515625, + "learning_rate": 7.60697070819429e-06, + "loss": 2.0653, + "mean_token_accuracy": 0.5919213973799127, + "step": 12908 + }, + { + "epoch": 2.39321468298109, + "grad_norm": 7.20703125, + "learning_rate": 7.606785317018911e-06, + "loss": 2.1766, + "mean_token_accuracy": 0.5245180926614812, + "step": 12909 + }, + { + "epoch": 2.39340007415647, + "grad_norm": 7.328125, + "learning_rate": 7.606599925843531e-06, + "loss": 2.9521, + "mean_token_accuracy": 0.4816683070866142, + "step": 12910 + }, + { + "epoch": 2.3935854653318502, + "grad_norm": 9.0859375, + "learning_rate": 7.606414534668151e-06, + "loss": 2.5194, + "mean_token_accuracy": 0.5206298828125, + "step": 12911 + }, + { + "epoch": 2.3937708565072304, + "grad_norm": 7.4375, + "learning_rate": 7.60622914349277e-06, + "loss": 2.9665, + "mean_token_accuracy": 0.45284872298624756, + "step": 12912 + }, + { + "epoch": 2.39395624768261, + "grad_norm": 6.859375, + "learning_rate": 7.60604375231739e-06, + "loss": 3.1054, + "mean_token_accuracy": 0.45099549004509953, + "step": 12913 + }, + { + "epoch": 2.3941416388579904, + "grad_norm": 8.015625, + "learning_rate": 7.60585836114201e-06, + "loss": 2.5806, + "mean_token_accuracy": 0.490507794412718, + "step": 12914 + }, + { + "epoch": 2.3943270300333706, + "grad_norm": 10.3828125, + "learning_rate": 7.60567296996663e-06, + "loss": 2.4481, + "mean_token_accuracy": 0.5089881288863765, + "step": 12915 + }, + { + "epoch": 2.3945124212087503, + "grad_norm": 8.0703125, + "learning_rate": 7.60548757879125e-06, + "loss": 2.5725, + "mean_token_accuracy": 0.5217889908256881, + "step": 12916 + }, + { + "epoch": 2.3946978123841305, + "grad_norm": 9.5625, + "learning_rate": 7.60530218761587e-06, + "loss": 3.5452, + "mean_token_accuracy": 0.4156528282954973, + "step": 12917 + }, + { + "epoch": 2.3948832035595107, + "grad_norm": 8.34375, + "learning_rate": 7.605116796440491e-06, + "loss": 3.6632, + "mean_token_accuracy": 0.437847866419295, + "step": 12918 + }, + { + "epoch": 2.395068594734891, + "grad_norm": 6.53515625, + "learning_rate": 7.6049314052651104e-06, + "loss": 2.99, + "mean_token_accuracy": 0.47394296951819076, + "step": 12919 + }, + { + "epoch": 2.3952539859102706, + "grad_norm": 7.1171875, + "learning_rate": 7.60474601408973e-06, + "loss": 3.126, + "mean_token_accuracy": 0.4464689265536723, + "step": 12920 + }, + { + "epoch": 2.395439377085651, + "grad_norm": 8.4375, + "learning_rate": 7.60456062291435e-06, + "loss": 2.533, + "mean_token_accuracy": 0.4963629638762175, + "step": 12921 + }, + { + "epoch": 2.3956247682610305, + "grad_norm": 7.63671875, + "learning_rate": 7.604375231738969e-06, + "loss": 2.3776, + "mean_token_accuracy": 0.5556081400851869, + "step": 12922 + }, + { + "epoch": 2.3958101594364107, + "grad_norm": 8.0234375, + "learning_rate": 7.60418984056359e-06, + "loss": 2.6208, + "mean_token_accuracy": 0.48827264481015936, + "step": 12923 + }, + { + "epoch": 2.395995550611791, + "grad_norm": 7.02734375, + "learning_rate": 7.6040044493882095e-06, + "loss": 2.8721, + "mean_token_accuracy": 0.48974189278623426, + "step": 12924 + }, + { + "epoch": 2.396180941787171, + "grad_norm": 7.4140625, + "learning_rate": 7.60381905821283e-06, + "loss": 2.7974, + "mean_token_accuracy": 0.46070460704607047, + "step": 12925 + }, + { + "epoch": 2.396366332962551, + "grad_norm": 8.8359375, + "learning_rate": 7.60363366703745e-06, + "loss": 3.4779, + "mean_token_accuracy": 0.4477413640389725, + "step": 12926 + }, + { + "epoch": 2.396551724137931, + "grad_norm": 6.96484375, + "learning_rate": 7.60344827586207e-06, + "loss": 2.7552, + "mean_token_accuracy": 0.4899527983816588, + "step": 12927 + }, + { + "epoch": 2.3967371153133112, + "grad_norm": 7.703125, + "learning_rate": 7.60326288468669e-06, + "loss": 3.2684, + "mean_token_accuracy": 0.4344903278913431, + "step": 12928 + }, + { + "epoch": 2.396922506488691, + "grad_norm": 9.59375, + "learning_rate": 7.603077493511309e-06, + "loss": 2.831, + "mean_token_accuracy": 0.4612857813233224, + "step": 12929 + }, + { + "epoch": 2.397107897664071, + "grad_norm": 6.94921875, + "learning_rate": 7.602892102335929e-06, + "loss": 2.6646, + "mean_token_accuracy": 0.48606581678031424, + "step": 12930 + }, + { + "epoch": 2.3972932888394514, + "grad_norm": 8.15625, + "learning_rate": 7.602706711160549e-06, + "loss": 2.9582, + "mean_token_accuracy": 0.4779817883266159, + "step": 12931 + }, + { + "epoch": 2.397478680014831, + "grad_norm": 9.09375, + "learning_rate": 7.602521319985169e-06, + "loss": 3.0835, + "mean_token_accuracy": 0.472478094810155, + "step": 12932 + }, + { + "epoch": 2.3976640711902113, + "grad_norm": 14.3046875, + "learning_rate": 7.60233592880979e-06, + "loss": 2.5612, + "mean_token_accuracy": 0.494176752683261, + "step": 12933 + }, + { + "epoch": 2.3978494623655915, + "grad_norm": 9.0546875, + "learning_rate": 7.602150537634409e-06, + "loss": 2.8401, + "mean_token_accuracy": 0.49625520110957005, + "step": 12934 + }, + { + "epoch": 2.3980348535409712, + "grad_norm": 7.4296875, + "learning_rate": 7.601965146459029e-06, + "loss": 2.7248, + "mean_token_accuracy": 0.5034584980237155, + "step": 12935 + }, + { + "epoch": 2.3982202447163514, + "grad_norm": 7.203125, + "learning_rate": 7.6017797552836495e-06, + "loss": 2.7904, + "mean_token_accuracy": 0.47932085328689594, + "step": 12936 + }, + { + "epoch": 2.3984056358917316, + "grad_norm": 8.328125, + "learning_rate": 7.601594364108269e-06, + "loss": 2.6128, + "mean_token_accuracy": 0.5543087020961254, + "step": 12937 + }, + { + "epoch": 2.398591027067112, + "grad_norm": 6.55859375, + "learning_rate": 7.601408972932889e-06, + "loss": 2.6674, + "mean_token_accuracy": 0.5333937354387781, + "step": 12938 + }, + { + "epoch": 2.3987764182424915, + "grad_norm": 6.65234375, + "learning_rate": 7.601223581757508e-06, + "loss": 2.1034, + "mean_token_accuracy": 0.5353776715783884, + "step": 12939 + }, + { + "epoch": 2.3989618094178717, + "grad_norm": 9.09375, + "learning_rate": 7.601038190582128e-06, + "loss": 2.7531, + "mean_token_accuracy": 0.474679682733374, + "step": 12940 + }, + { + "epoch": 2.399147200593252, + "grad_norm": 9.9921875, + "learning_rate": 7.600852799406749e-06, + "loss": 3.1425, + "mean_token_accuracy": 0.4515592515592516, + "step": 12941 + }, + { + "epoch": 2.3993325917686317, + "grad_norm": 8.15625, + "learning_rate": 7.600667408231369e-06, + "loss": 3.2147, + "mean_token_accuracy": 0.4527454817543456, + "step": 12942 + }, + { + "epoch": 2.399517982944012, + "grad_norm": 9.3359375, + "learning_rate": 7.600482017055989e-06, + "loss": 2.9758, + "mean_token_accuracy": 0.4651004426285325, + "step": 12943 + }, + { + "epoch": 2.399703374119392, + "grad_norm": 15.671875, + "learning_rate": 7.600296625880609e-06, + "loss": 3.8154, + "mean_token_accuracy": 0.4373692468619247, + "step": 12944 + }, + { + "epoch": 2.399888765294772, + "grad_norm": 7.83203125, + "learning_rate": 7.600111234705229e-06, + "loss": 3.4938, + "mean_token_accuracy": 0.4389093419236079, + "step": 12945 + }, + { + "epoch": 2.400074156470152, + "grad_norm": 7.83203125, + "learning_rate": 7.5999258435298484e-06, + "loss": 3.1224, + "mean_token_accuracy": 0.4634377967711301, + "step": 12946 + }, + { + "epoch": 2.400259547645532, + "grad_norm": 7.13671875, + "learning_rate": 7.599740452354468e-06, + "loss": 2.4604, + "mean_token_accuracy": 0.5184146341463415, + "step": 12947 + }, + { + "epoch": 2.400444938820912, + "grad_norm": 7.40234375, + "learning_rate": 7.599555061179088e-06, + "loss": 3.3107, + "mean_token_accuracy": 0.44049247606019154, + "step": 12948 + }, + { + "epoch": 2.400630329996292, + "grad_norm": 8.1171875, + "learning_rate": 7.599369670003709e-06, + "loss": 3.1559, + "mean_token_accuracy": 0.44666666666666666, + "step": 12949 + }, + { + "epoch": 2.4008157211716723, + "grad_norm": 8.3515625, + "learning_rate": 7.599184278828329e-06, + "loss": 2.5909, + "mean_token_accuracy": 0.4935864545920985, + "step": 12950 + }, + { + "epoch": 2.4010011123470525, + "grad_norm": 7.984375, + "learning_rate": 7.598998887652948e-06, + "loss": 3.0329, + "mean_token_accuracy": 0.45072402643047943, + "step": 12951 + }, + { + "epoch": 2.4011865035224322, + "grad_norm": 12.671875, + "learning_rate": 7.598813496477568e-06, + "loss": 2.7612, + "mean_token_accuracy": 0.4830166954519286, + "step": 12952 + }, + { + "epoch": 2.4013718946978124, + "grad_norm": 11.2578125, + "learning_rate": 7.5986281053021885e-06, + "loss": 3.1774, + "mean_token_accuracy": 0.5025528811086798, + "step": 12953 + }, + { + "epoch": 2.4015572858731926, + "grad_norm": 10.1171875, + "learning_rate": 7.598442714126808e-06, + "loss": 2.9938, + "mean_token_accuracy": 0.4864432109308284, + "step": 12954 + }, + { + "epoch": 2.4017426770485724, + "grad_norm": 9.9765625, + "learning_rate": 7.598257322951428e-06, + "loss": 2.7588, + "mean_token_accuracy": 0.49134014764338446, + "step": 12955 + }, + { + "epoch": 2.4019280682239526, + "grad_norm": 10.4140625, + "learning_rate": 7.5980719317760474e-06, + "loss": 2.8426, + "mean_token_accuracy": 0.46921965317919073, + "step": 12956 + }, + { + "epoch": 2.4021134593993327, + "grad_norm": 6.42578125, + "learning_rate": 7.597886540600669e-06, + "loss": 2.4938, + "mean_token_accuracy": 0.49924318869828455, + "step": 12957 + }, + { + "epoch": 2.4022988505747125, + "grad_norm": 7.1015625, + "learning_rate": 7.597701149425288e-06, + "loss": 3.2672, + "mean_token_accuracy": 0.46181935345290515, + "step": 12958 + }, + { + "epoch": 2.4024842417500927, + "grad_norm": 9.7578125, + "learning_rate": 7.597515758249908e-06, + "loss": 2.746, + "mean_token_accuracy": 0.4760833708202129, + "step": 12959 + }, + { + "epoch": 2.402669632925473, + "grad_norm": 9.0625, + "learning_rate": 7.597330367074528e-06, + "loss": 3.4525, + "mean_token_accuracy": 0.45295353724025217, + "step": 12960 + }, + { + "epoch": 2.4028550241008526, + "grad_norm": 6.9609375, + "learning_rate": 7.597144975899148e-06, + "loss": 3.0148, + "mean_token_accuracy": 0.4643744821872411, + "step": 12961 + }, + { + "epoch": 2.403040415276233, + "grad_norm": 9.796875, + "learning_rate": 7.596959584723768e-06, + "loss": 2.6426, + "mean_token_accuracy": 0.4912237330037083, + "step": 12962 + }, + { + "epoch": 2.403225806451613, + "grad_norm": 7.04296875, + "learning_rate": 7.5967741935483875e-06, + "loss": 2.7152, + "mean_token_accuracy": 0.4900288818594416, + "step": 12963 + }, + { + "epoch": 2.403411197626993, + "grad_norm": 6.73046875, + "learning_rate": 7.596588802373007e-06, + "loss": 2.7653, + "mean_token_accuracy": 0.49639159083777845, + "step": 12964 + }, + { + "epoch": 2.403596588802373, + "grad_norm": 9.3515625, + "learning_rate": 7.5964034111976285e-06, + "loss": 2.9292, + "mean_token_accuracy": 0.4719946272666219, + "step": 12965 + }, + { + "epoch": 2.403781979977753, + "grad_norm": 10.203125, + "learning_rate": 7.596218020022248e-06, + "loss": 2.996, + "mean_token_accuracy": 0.43944887599709936, + "step": 12966 + }, + { + "epoch": 2.4039673711531333, + "grad_norm": 7.55078125, + "learning_rate": 7.596032628846868e-06, + "loss": 2.9697, + "mean_token_accuracy": 0.46889101599890604, + "step": 12967 + }, + { + "epoch": 2.404152762328513, + "grad_norm": 8.6171875, + "learning_rate": 7.595847237671487e-06, + "loss": 2.9917, + "mean_token_accuracy": 0.46231865451277443, + "step": 12968 + }, + { + "epoch": 2.4043381535038932, + "grad_norm": 9.2890625, + "learning_rate": 7.595661846496107e-06, + "loss": 2.7733, + "mean_token_accuracy": 0.4686333084391337, + "step": 12969 + }, + { + "epoch": 2.4045235446792734, + "grad_norm": 8.0703125, + "learning_rate": 7.5954764553207275e-06, + "loss": 2.6628, + "mean_token_accuracy": 0.4991941982272361, + "step": 12970 + }, + { + "epoch": 2.404708935854653, + "grad_norm": 6.10546875, + "learning_rate": 7.595291064145347e-06, + "loss": 2.7914, + "mean_token_accuracy": 0.48505258182677535, + "step": 12971 + }, + { + "epoch": 2.4048943270300334, + "grad_norm": 6.92578125, + "learning_rate": 7.595105672969967e-06, + "loss": 2.4565, + "mean_token_accuracy": 0.5098551412966041, + "step": 12972 + }, + { + "epoch": 2.4050797182054136, + "grad_norm": 9.671875, + "learning_rate": 7.594920281794587e-06, + "loss": 2.869, + "mean_token_accuracy": 0.48788057936742535, + "step": 12973 + }, + { + "epoch": 2.4052651093807933, + "grad_norm": 6.85546875, + "learning_rate": 7.594734890619208e-06, + "loss": 3.1107, + "mean_token_accuracy": 0.45315990014452767, + "step": 12974 + }, + { + "epoch": 2.4054505005561735, + "grad_norm": 6.4921875, + "learning_rate": 7.5945494994438274e-06, + "loss": 2.8511, + "mean_token_accuracy": 0.4883617823099091, + "step": 12975 + }, + { + "epoch": 2.4056358917315537, + "grad_norm": 6.58203125, + "learning_rate": 7.594364108268447e-06, + "loss": 3.1219, + "mean_token_accuracy": 0.46508844261204946, + "step": 12976 + }, + { + "epoch": 2.405821282906934, + "grad_norm": 6.4921875, + "learning_rate": 7.594178717093067e-06, + "loss": 3.2758, + "mean_token_accuracy": 0.4416915293812427, + "step": 12977 + }, + { + "epoch": 2.4060066740823136, + "grad_norm": 8.4296875, + "learning_rate": 7.593993325917686e-06, + "loss": 2.6859, + "mean_token_accuracy": 0.5001744388882429, + "step": 12978 + }, + { + "epoch": 2.406192065257694, + "grad_norm": 11.09375, + "learning_rate": 7.593807934742307e-06, + "loss": 2.4943, + "mean_token_accuracy": 0.4905067434856619, + "step": 12979 + }, + { + "epoch": 2.4063774564330735, + "grad_norm": 7.64453125, + "learning_rate": 7.5936225435669265e-06, + "loss": 2.7061, + "mean_token_accuracy": 0.47316276537833424, + "step": 12980 + }, + { + "epoch": 2.4065628476084537, + "grad_norm": 7.7109375, + "learning_rate": 7.593437152391547e-06, + "loss": 2.385, + "mean_token_accuracy": 0.5164886205294937, + "step": 12981 + }, + { + "epoch": 2.406748238783834, + "grad_norm": 9.609375, + "learning_rate": 7.5932517612161675e-06, + "loss": 2.9225, + "mean_token_accuracy": 0.4590711660541263, + "step": 12982 + }, + { + "epoch": 2.406933629959214, + "grad_norm": 9.5625, + "learning_rate": 7.593066370040787e-06, + "loss": 3.3258, + "mean_token_accuracy": 0.45997267759562843, + "step": 12983 + }, + { + "epoch": 2.407119021134594, + "grad_norm": 8.6171875, + "learning_rate": 7.592880978865407e-06, + "loss": 2.4454, + "mean_token_accuracy": 0.5645561249793354, + "step": 12984 + }, + { + "epoch": 2.407304412309974, + "grad_norm": 7.2421875, + "learning_rate": 7.5926955876900264e-06, + "loss": 2.9038, + "mean_token_accuracy": 0.4656141868512111, + "step": 12985 + }, + { + "epoch": 2.4074898034853542, + "grad_norm": 8.671875, + "learning_rate": 7.592510196514646e-06, + "loss": 2.3729, + "mean_token_accuracy": 0.516890924706556, + "step": 12986 + }, + { + "epoch": 2.407675194660734, + "grad_norm": 7.046875, + "learning_rate": 7.5923248053392666e-06, + "loss": 2.8087, + "mean_token_accuracy": 0.4920895347474511, + "step": 12987 + }, + { + "epoch": 2.407860585836114, + "grad_norm": 7.6171875, + "learning_rate": 7.592139414163886e-06, + "loss": 2.3066, + "mean_token_accuracy": 0.5226023213194869, + "step": 12988 + }, + { + "epoch": 2.4080459770114944, + "grad_norm": 9.4453125, + "learning_rate": 7.591954022988507e-06, + "loss": 2.8436, + "mean_token_accuracy": 0.4660321237358715, + "step": 12989 + }, + { + "epoch": 2.408231368186874, + "grad_norm": 8.5859375, + "learning_rate": 7.591768631813126e-06, + "loss": 3.7846, + "mean_token_accuracy": 0.43788691704498556, + "step": 12990 + }, + { + "epoch": 2.4084167593622543, + "grad_norm": 8.5234375, + "learning_rate": 7.591583240637747e-06, + "loss": 2.9494, + "mean_token_accuracy": 0.4871457785568215, + "step": 12991 + }, + { + "epoch": 2.4086021505376345, + "grad_norm": 8.4375, + "learning_rate": 7.5913978494623665e-06, + "loss": 2.3989, + "mean_token_accuracy": 0.5166487647690655, + "step": 12992 + }, + { + "epoch": 2.4087875417130142, + "grad_norm": 6.4921875, + "learning_rate": 7.591212458286986e-06, + "loss": 2.4155, + "mean_token_accuracy": 0.511744738628649, + "step": 12993 + }, + { + "epoch": 2.4089729328883944, + "grad_norm": 7.58203125, + "learning_rate": 7.591027067111606e-06, + "loss": 2.3309, + "mean_token_accuracy": 0.513776102088167, + "step": 12994 + }, + { + "epoch": 2.4091583240637746, + "grad_norm": 8.3125, + "learning_rate": 7.590841675936225e-06, + "loss": 2.9935, + "mean_token_accuracy": 0.48462948980692483, + "step": 12995 + }, + { + "epoch": 2.409343715239155, + "grad_norm": 7.96484375, + "learning_rate": 7.590656284760846e-06, + "loss": 3.0165, + "mean_token_accuracy": 0.47132904608788856, + "step": 12996 + }, + { + "epoch": 2.4095291064145345, + "grad_norm": 7.171875, + "learning_rate": 7.5904708935854656e-06, + "loss": 3.0407, + "mean_token_accuracy": 0.4814032121724429, + "step": 12997 + }, + { + "epoch": 2.4097144975899147, + "grad_norm": 7.6796875, + "learning_rate": 7.590285502410086e-06, + "loss": 2.7852, + "mean_token_accuracy": 0.5058531394111387, + "step": 12998 + }, + { + "epoch": 2.409899888765295, + "grad_norm": 8.546875, + "learning_rate": 7.5901001112347065e-06, + "loss": 3.1236, + "mean_token_accuracy": 0.517587373167982, + "step": 12999 + }, + { + "epoch": 2.4100852799406747, + "grad_norm": 7.1640625, + "learning_rate": 7.589914720059326e-06, + "loss": 2.7384, + "mean_token_accuracy": 0.5039335015585572, + "step": 13000 + }, + { + "epoch": 2.410270671116055, + "grad_norm": 8.34375, + "learning_rate": 7.589729328883946e-06, + "loss": 3.5377, + "mean_token_accuracy": 0.4237006767973439, + "step": 13001 + }, + { + "epoch": 2.410456062291435, + "grad_norm": 7.6328125, + "learning_rate": 7.5895439377085655e-06, + "loss": 3.0899, + "mean_token_accuracy": 0.4553504747679505, + "step": 13002 + }, + { + "epoch": 2.410641453466815, + "grad_norm": 7.078125, + "learning_rate": 7.589358546533185e-06, + "loss": 2.0258, + "mean_token_accuracy": 0.5856121814791796, + "step": 13003 + }, + { + "epoch": 2.410826844642195, + "grad_norm": 9.5703125, + "learning_rate": 7.589173155357806e-06, + "loss": 2.7222, + "mean_token_accuracy": 0.5356884494815529, + "step": 13004 + }, + { + "epoch": 2.411012235817575, + "grad_norm": 6.5078125, + "learning_rate": 7.588987764182425e-06, + "loss": 3.0677, + "mean_token_accuracy": 0.45830202854996244, + "step": 13005 + }, + { + "epoch": 2.411197626992955, + "grad_norm": 7.078125, + "learning_rate": 7.588802373007046e-06, + "loss": 2.6529, + "mean_token_accuracy": 0.5003707548568886, + "step": 13006 + }, + { + "epoch": 2.411383018168335, + "grad_norm": 7.96875, + "learning_rate": 7.588616981831665e-06, + "loss": 2.8894, + "mean_token_accuracy": 0.47344759763978644, + "step": 13007 + }, + { + "epoch": 2.4115684093437153, + "grad_norm": 7.7890625, + "learning_rate": 7.588431590656286e-06, + "loss": 3.5992, + "mean_token_accuracy": 0.44351851851851853, + "step": 13008 + }, + { + "epoch": 2.4117538005190955, + "grad_norm": 8.53125, + "learning_rate": 7.5882461994809055e-06, + "loss": 2.5435, + "mean_token_accuracy": 0.5276387896116274, + "step": 13009 + }, + { + "epoch": 2.4119391916944752, + "grad_norm": 9.4453125, + "learning_rate": 7.588060808305525e-06, + "loss": 2.523, + "mean_token_accuracy": 0.4912124977809338, + "step": 13010 + }, + { + "epoch": 2.4121245828698554, + "grad_norm": 9.359375, + "learning_rate": 7.587875417130145e-06, + "loss": 2.4542, + "mean_token_accuracy": 0.5363849765258216, + "step": 13011 + }, + { + "epoch": 2.4123099740452356, + "grad_norm": 8.1796875, + "learning_rate": 7.5876900259547644e-06, + "loss": 2.6371, + "mean_token_accuracy": 0.5310610932475884, + "step": 13012 + }, + { + "epoch": 2.4124953652206154, + "grad_norm": 6.9140625, + "learning_rate": 7.587504634779385e-06, + "loss": 2.9038, + "mean_token_accuracy": 0.46286797502230154, + "step": 13013 + }, + { + "epoch": 2.4126807563959956, + "grad_norm": 7.26171875, + "learning_rate": 7.5873192436040054e-06, + "loss": 2.6516, + "mean_token_accuracy": 0.4775266948100323, + "step": 13014 + }, + { + "epoch": 2.4128661475713757, + "grad_norm": 9.0, + "learning_rate": 7.587133852428625e-06, + "loss": 3.1117, + "mean_token_accuracy": 0.442954627080091, + "step": 13015 + }, + { + "epoch": 2.4130515387467555, + "grad_norm": 9.4453125, + "learning_rate": 7.586948461253245e-06, + "loss": 2.4102, + "mean_token_accuracy": 0.5238639611317519, + "step": 13016 + }, + { + "epoch": 2.4132369299221357, + "grad_norm": 6.84765625, + "learning_rate": 7.586763070077865e-06, + "loss": 2.1421, + "mean_token_accuracy": 0.5573435156143461, + "step": 13017 + }, + { + "epoch": 2.413422321097516, + "grad_norm": 7.5234375, + "learning_rate": 7.586577678902485e-06, + "loss": 3.0548, + "mean_token_accuracy": 0.484260391198044, + "step": 13018 + }, + { + "epoch": 2.4136077122728956, + "grad_norm": 13.6484375, + "learning_rate": 7.5863922877271045e-06, + "loss": 2.4858, + "mean_token_accuracy": 0.5031133250311333, + "step": 13019 + }, + { + "epoch": 2.413793103448276, + "grad_norm": 9.5390625, + "learning_rate": 7.586206896551724e-06, + "loss": 2.9742, + "mean_token_accuracy": 0.4608003744441844, + "step": 13020 + }, + { + "epoch": 2.413978494623656, + "grad_norm": 8.453125, + "learning_rate": 7.586021505376344e-06, + "loss": 2.9751, + "mean_token_accuracy": 0.44908202351076476, + "step": 13021 + }, + { + "epoch": 2.414163885799036, + "grad_norm": 13.40625, + "learning_rate": 7.585836114200965e-06, + "loss": 3.6725, + "mean_token_accuracy": 0.40941603501654744, + "step": 13022 + }, + { + "epoch": 2.414349276974416, + "grad_norm": 9.265625, + "learning_rate": 7.585650723025585e-06, + "loss": 2.4709, + "mean_token_accuracy": 0.5056165593840717, + "step": 13023 + }, + { + "epoch": 2.414534668149796, + "grad_norm": 8.6171875, + "learning_rate": 7.585465331850204e-06, + "loss": 2.9383, + "mean_token_accuracy": 0.4823837049270575, + "step": 13024 + }, + { + "epoch": 2.4147200593251763, + "grad_norm": 7.46875, + "learning_rate": 7.585279940674825e-06, + "loss": 2.7085, + "mean_token_accuracy": 0.532874139010645, + "step": 13025 + }, + { + "epoch": 2.414905450500556, + "grad_norm": 8.5078125, + "learning_rate": 7.5850945494994446e-06, + "loss": 2.7346, + "mean_token_accuracy": 0.4952848469190027, + "step": 13026 + }, + { + "epoch": 2.4150908416759362, + "grad_norm": 7.59765625, + "learning_rate": 7.584909158324064e-06, + "loss": 2.8501, + "mean_token_accuracy": 0.49902504874756265, + "step": 13027 + }, + { + "epoch": 2.4152762328513164, + "grad_norm": 8.609375, + "learning_rate": 7.584723767148684e-06, + "loss": 2.7343, + "mean_token_accuracy": 0.4787338192102687, + "step": 13028 + }, + { + "epoch": 2.415461624026696, + "grad_norm": 8.1484375, + "learning_rate": 7.5845383759733035e-06, + "loss": 2.5705, + "mean_token_accuracy": 0.5290199809705043, + "step": 13029 + }, + { + "epoch": 2.4156470152020764, + "grad_norm": 7.1484375, + "learning_rate": 7.584352984797925e-06, + "loss": 3.3533, + "mean_token_accuracy": 0.44161073825503355, + "step": 13030 + }, + { + "epoch": 2.4158324063774566, + "grad_norm": 7.01171875, + "learning_rate": 7.5841675936225445e-06, + "loss": 2.3582, + "mean_token_accuracy": 0.5567706658089417, + "step": 13031 + }, + { + "epoch": 2.4160177975528363, + "grad_norm": 12.0390625, + "learning_rate": 7.583982202447164e-06, + "loss": 2.4232, + "mean_token_accuracy": 0.5548558170367915, + "step": 13032 + }, + { + "epoch": 2.4162031887282165, + "grad_norm": 14.765625, + "learning_rate": 7.583796811271784e-06, + "loss": 2.5361, + "mean_token_accuracy": 0.5014637319744117, + "step": 13033 + }, + { + "epoch": 2.4163885799035967, + "grad_norm": 7.78515625, + "learning_rate": 7.583611420096404e-06, + "loss": 2.9985, + "mean_token_accuracy": 0.45525068237322225, + "step": 13034 + }, + { + "epoch": 2.416573971078977, + "grad_norm": 8.703125, + "learning_rate": 7.583426028921024e-06, + "loss": 2.9385, + "mean_token_accuracy": 0.45631805520384905, + "step": 13035 + }, + { + "epoch": 2.4167593622543566, + "grad_norm": 13.3828125, + "learning_rate": 7.5832406377456435e-06, + "loss": 2.9345, + "mean_token_accuracy": 0.48874874587931777, + "step": 13036 + }, + { + "epoch": 2.416944753429737, + "grad_norm": 7.3515625, + "learning_rate": 7.583055246570263e-06, + "loss": 2.2558, + "mean_token_accuracy": 0.5548808538043978, + "step": 13037 + }, + { + "epoch": 2.4171301446051165, + "grad_norm": 6.8125, + "learning_rate": 7.5828698553948845e-06, + "loss": 2.9617, + "mean_token_accuracy": 0.446090655713657, + "step": 13038 + }, + { + "epoch": 2.4173155357804967, + "grad_norm": 10.1484375, + "learning_rate": 7.582684464219504e-06, + "loss": 2.6885, + "mean_token_accuracy": 0.5009214273747696, + "step": 13039 + }, + { + "epoch": 2.417500926955877, + "grad_norm": 7.39453125, + "learning_rate": 7.582499073044124e-06, + "loss": 3.102, + "mean_token_accuracy": 0.4498532942117898, + "step": 13040 + }, + { + "epoch": 2.417686318131257, + "grad_norm": 9.296875, + "learning_rate": 7.5823136818687435e-06, + "loss": 2.6843, + "mean_token_accuracy": 0.5035778175313059, + "step": 13041 + }, + { + "epoch": 2.417871709306637, + "grad_norm": 9.8515625, + "learning_rate": 7.582128290693364e-06, + "loss": 3.3557, + "mean_token_accuracy": 0.45001960015680126, + "step": 13042 + }, + { + "epoch": 2.418057100482017, + "grad_norm": 7.734375, + "learning_rate": 7.581942899517984e-06, + "loss": 3.454, + "mean_token_accuracy": 0.4577763286599283, + "step": 13043 + }, + { + "epoch": 2.4182424916573972, + "grad_norm": 8.1875, + "learning_rate": 7.581757508342603e-06, + "loss": 2.7699, + "mean_token_accuracy": 0.48417108153508204, + "step": 13044 + }, + { + "epoch": 2.418427882832777, + "grad_norm": 7.9140625, + "learning_rate": 7.581572117167223e-06, + "loss": 2.6982, + "mean_token_accuracy": 0.5237360092628329, + "step": 13045 + }, + { + "epoch": 2.418613274008157, + "grad_norm": 8.1328125, + "learning_rate": 7.581386725991844e-06, + "loss": 3.332, + "mean_token_accuracy": 0.43684527393136663, + "step": 13046 + }, + { + "epoch": 2.4187986651835374, + "grad_norm": 7.3359375, + "learning_rate": 7.581201334816464e-06, + "loss": 2.7519, + "mean_token_accuracy": 0.4999185800358248, + "step": 13047 + }, + { + "epoch": 2.4189840563589176, + "grad_norm": 7.01171875, + "learning_rate": 7.5810159436410835e-06, + "loss": 2.9073, + "mean_token_accuracy": 0.47575360419397117, + "step": 13048 + }, + { + "epoch": 2.4191694475342973, + "grad_norm": 7.390625, + "learning_rate": 7.580830552465703e-06, + "loss": 3.1208, + "mean_token_accuracy": 0.4566551377886868, + "step": 13049 + }, + { + "epoch": 2.4193548387096775, + "grad_norm": 6.8515625, + "learning_rate": 7.580645161290323e-06, + "loss": 2.5623, + "mean_token_accuracy": 0.5099206349206349, + "step": 13050 + }, + { + "epoch": 2.4195402298850572, + "grad_norm": 8.265625, + "learning_rate": 7.580459770114943e-06, + "loss": 2.7889, + "mean_token_accuracy": 0.49156305506216696, + "step": 13051 + }, + { + "epoch": 2.4197256210604374, + "grad_norm": 7.05078125, + "learning_rate": 7.580274378939563e-06, + "loss": 3.2181, + "mean_token_accuracy": 0.44080923028291447, + "step": 13052 + }, + { + "epoch": 2.4199110122358176, + "grad_norm": 8.375, + "learning_rate": 7.5800889877641826e-06, + "loss": 2.9596, + "mean_token_accuracy": 0.47565402428338965, + "step": 13053 + }, + { + "epoch": 2.420096403411198, + "grad_norm": 7.16015625, + "learning_rate": 7.579903596588803e-06, + "loss": 2.6662, + "mean_token_accuracy": 0.4963212370619778, + "step": 13054 + }, + { + "epoch": 2.4202817945865776, + "grad_norm": 5.890625, + "learning_rate": 7.5797182054134236e-06, + "loss": 2.2758, + "mean_token_accuracy": 0.5249330442545593, + "step": 13055 + }, + { + "epoch": 2.4204671857619577, + "grad_norm": 6.73828125, + "learning_rate": 7.579532814238043e-06, + "loss": 2.911, + "mean_token_accuracy": 0.46710080043413377, + "step": 13056 + }, + { + "epoch": 2.420652576937338, + "grad_norm": 6.921875, + "learning_rate": 7.579347423062663e-06, + "loss": 2.1998, + "mean_token_accuracy": 0.5659726499938401, + "step": 13057 + }, + { + "epoch": 2.4208379681127177, + "grad_norm": 7.47265625, + "learning_rate": 7.5791620318872825e-06, + "loss": 2.9534, + "mean_token_accuracy": 0.47557328015952144, + "step": 13058 + }, + { + "epoch": 2.421023359288098, + "grad_norm": 9.0390625, + "learning_rate": 7.578976640711902e-06, + "loss": 2.5074, + "mean_token_accuracy": 0.5118297872340426, + "step": 13059 + }, + { + "epoch": 2.421208750463478, + "grad_norm": 6.98828125, + "learning_rate": 7.578791249536523e-06, + "loss": 3.4705, + "mean_token_accuracy": 0.4349939246658566, + "step": 13060 + }, + { + "epoch": 2.421394141638858, + "grad_norm": 9.15625, + "learning_rate": 7.578605858361142e-06, + "loss": 3.0099, + "mean_token_accuracy": 0.46039787470653654, + "step": 13061 + }, + { + "epoch": 2.421579532814238, + "grad_norm": 7.35546875, + "learning_rate": 7.578420467185763e-06, + "loss": 3.0953, + "mean_token_accuracy": 0.4615287428932407, + "step": 13062 + }, + { + "epoch": 2.421764923989618, + "grad_norm": 6.15234375, + "learning_rate": 7.578235076010383e-06, + "loss": 3.1408, + "mean_token_accuracy": 0.4482036972445064, + "step": 13063 + }, + { + "epoch": 2.421950315164998, + "grad_norm": 7.1953125, + "learning_rate": 7.578049684835003e-06, + "loss": 3.2398, + "mean_token_accuracy": 0.45486547648981646, + "step": 13064 + }, + { + "epoch": 2.422135706340378, + "grad_norm": 8.6484375, + "learning_rate": 7.5778642936596225e-06, + "loss": 3.1131, + "mean_token_accuracy": 0.4656204282484373, + "step": 13065 + }, + { + "epoch": 2.4223210975157583, + "grad_norm": 8.09375, + "learning_rate": 7.577678902484242e-06, + "loss": 2.2597, + "mean_token_accuracy": 0.5251735154786824, + "step": 13066 + }, + { + "epoch": 2.4225064886911385, + "grad_norm": 9.5703125, + "learning_rate": 7.577493511308862e-06, + "loss": 2.5484, + "mean_token_accuracy": 0.49126186872161826, + "step": 13067 + }, + { + "epoch": 2.4226918798665182, + "grad_norm": 8.5, + "learning_rate": 7.577308120133482e-06, + "loss": 3.1984, + "mean_token_accuracy": 0.4782842399483982, + "step": 13068 + }, + { + "epoch": 2.4228772710418984, + "grad_norm": 8.6484375, + "learning_rate": 7.577122728958102e-06, + "loss": 2.6909, + "mean_token_accuracy": 0.48757894736842106, + "step": 13069 + }, + { + "epoch": 2.4230626622172786, + "grad_norm": 7.35546875, + "learning_rate": 7.5769373377827225e-06, + "loss": 2.9344, + "mean_token_accuracy": 0.4784245583550536, + "step": 13070 + }, + { + "epoch": 2.4232480533926584, + "grad_norm": 6.71484375, + "learning_rate": 7.576751946607342e-06, + "loss": 2.5882, + "mean_token_accuracy": 0.5107466063348416, + "step": 13071 + }, + { + "epoch": 2.4234334445680386, + "grad_norm": 8.390625, + "learning_rate": 7.576566555431963e-06, + "loss": 2.8166, + "mean_token_accuracy": 0.476245924545878, + "step": 13072 + }, + { + "epoch": 2.4236188357434187, + "grad_norm": 7.96875, + "learning_rate": 7.576381164256582e-06, + "loss": 3.8705, + "mean_token_accuracy": 0.43874099132225325, + "step": 13073 + }, + { + "epoch": 2.4238042269187985, + "grad_norm": 7.625, + "learning_rate": 7.576195773081202e-06, + "loss": 2.9153, + "mean_token_accuracy": 0.45453053184504266, + "step": 13074 + }, + { + "epoch": 2.4239896180941787, + "grad_norm": 6.67578125, + "learning_rate": 7.5760103819058215e-06, + "loss": 2.7217, + "mean_token_accuracy": 0.49364567526555386, + "step": 13075 + }, + { + "epoch": 2.424175009269559, + "grad_norm": 6.09765625, + "learning_rate": 7.575824990730441e-06, + "loss": 2.6107, + "mean_token_accuracy": 0.4810864442515193, + "step": 13076 + }, + { + "epoch": 2.4243604004449386, + "grad_norm": 7.30078125, + "learning_rate": 7.575639599555062e-06, + "loss": 2.9492, + "mean_token_accuracy": 0.4635499207606973, + "step": 13077 + }, + { + "epoch": 2.424545791620319, + "grad_norm": 7.91796875, + "learning_rate": 7.575454208379682e-06, + "loss": 2.4701, + "mean_token_accuracy": 0.5179214049229883, + "step": 13078 + }, + { + "epoch": 2.424731182795699, + "grad_norm": 7.9375, + "learning_rate": 7.575268817204302e-06, + "loss": 3.4104, + "mean_token_accuracy": 0.4479741553371921, + "step": 13079 + }, + { + "epoch": 2.424916573971079, + "grad_norm": 9.03125, + "learning_rate": 7.575083426028922e-06, + "loss": 3.0889, + "mean_token_accuracy": 0.4466310439202398, + "step": 13080 + }, + { + "epoch": 2.425101965146459, + "grad_norm": 6.51953125, + "learning_rate": 7.574898034853542e-06, + "loss": 2.6558, + "mean_token_accuracy": 0.5078426640926641, + "step": 13081 + }, + { + "epoch": 2.425287356321839, + "grad_norm": 7.1484375, + "learning_rate": 7.5747126436781616e-06, + "loss": 2.8147, + "mean_token_accuracy": 0.47686557546075525, + "step": 13082 + }, + { + "epoch": 2.4254727474972193, + "grad_norm": 7.3125, + "learning_rate": 7.574527252502781e-06, + "loss": 3.0744, + "mean_token_accuracy": 0.4490125332320547, + "step": 13083 + }, + { + "epoch": 2.425658138672599, + "grad_norm": 7.859375, + "learning_rate": 7.574341861327401e-06, + "loss": 2.7617, + "mean_token_accuracy": 0.5034332799267567, + "step": 13084 + }, + { + "epoch": 2.4258435298479792, + "grad_norm": 6.921875, + "learning_rate": 7.574156470152021e-06, + "loss": 2.744, + "mean_token_accuracy": 0.48842372343799556, + "step": 13085 + }, + { + "epoch": 2.4260289210233594, + "grad_norm": 7.390625, + "learning_rate": 7.573971078976642e-06, + "loss": 3.0766, + "mean_token_accuracy": 0.44488603156049095, + "step": 13086 + }, + { + "epoch": 2.426214312198739, + "grad_norm": 8.3671875, + "learning_rate": 7.5737856878012615e-06, + "loss": 2.8811, + "mean_token_accuracy": 0.48511511951899106, + "step": 13087 + }, + { + "epoch": 2.4263997033741194, + "grad_norm": 6.96484375, + "learning_rate": 7.573600296625881e-06, + "loss": 2.7444, + "mean_token_accuracy": 0.48018333782690753, + "step": 13088 + }, + { + "epoch": 2.4265850945494996, + "grad_norm": 7.16796875, + "learning_rate": 7.573414905450502e-06, + "loss": 3.3839, + "mean_token_accuracy": 0.42384823848238484, + "step": 13089 + }, + { + "epoch": 2.4267704857248793, + "grad_norm": 7.94140625, + "learning_rate": 7.573229514275121e-06, + "loss": 2.871, + "mean_token_accuracy": 0.47692557686057846, + "step": 13090 + }, + { + "epoch": 2.4269558769002595, + "grad_norm": 7.9921875, + "learning_rate": 7.573044123099741e-06, + "loss": 2.8172, + "mean_token_accuracy": 0.4931452149607347, + "step": 13091 + }, + { + "epoch": 2.4271412680756397, + "grad_norm": 7.38671875, + "learning_rate": 7.5728587319243606e-06, + "loss": 2.9111, + "mean_token_accuracy": 0.4990161949447556, + "step": 13092 + }, + { + "epoch": 2.42732665925102, + "grad_norm": 6.9765625, + "learning_rate": 7.57267334074898e-06, + "loss": 3.4719, + "mean_token_accuracy": 0.437948865268601, + "step": 13093 + }, + { + "epoch": 2.4275120504263996, + "grad_norm": 7.58984375, + "learning_rate": 7.5724879495736015e-06, + "loss": 3.1812, + "mean_token_accuracy": 0.45845523698069046, + "step": 13094 + }, + { + "epoch": 2.42769744160178, + "grad_norm": 7.68359375, + "learning_rate": 7.572302558398221e-06, + "loss": 2.9154, + "mean_token_accuracy": 0.4712913053667682, + "step": 13095 + }, + { + "epoch": 2.42788283277716, + "grad_norm": 6.484375, + "learning_rate": 7.572117167222841e-06, + "loss": 2.404, + "mean_token_accuracy": 0.5299557176348008, + "step": 13096 + }, + { + "epoch": 2.4280682239525397, + "grad_norm": 7.4921875, + "learning_rate": 7.5719317760474605e-06, + "loss": 2.5105, + "mean_token_accuracy": 0.5284144059869037, + "step": 13097 + }, + { + "epoch": 2.42825361512792, + "grad_norm": 8.3125, + "learning_rate": 7.571746384872081e-06, + "loss": 2.716, + "mean_token_accuracy": 0.4972862957937585, + "step": 13098 + }, + { + "epoch": 2.4284390063033, + "grad_norm": 8.671875, + "learning_rate": 7.571560993696701e-06, + "loss": 2.8334, + "mean_token_accuracy": 0.4999349720379763, + "step": 13099 + }, + { + "epoch": 2.42862439747868, + "grad_norm": 8.9609375, + "learning_rate": 7.57137560252132e-06, + "loss": 2.4092, + "mean_token_accuracy": 0.5411827526364972, + "step": 13100 + }, + { + "epoch": 2.42880978865406, + "grad_norm": 7.7109375, + "learning_rate": 7.57119021134594e-06, + "loss": 3.3386, + "mean_token_accuracy": 0.43820224719101125, + "step": 13101 + }, + { + "epoch": 2.4289951798294402, + "grad_norm": 7.03515625, + "learning_rate": 7.571004820170561e-06, + "loss": 2.6384, + "mean_token_accuracy": 0.49078040603464335, + "step": 13102 + }, + { + "epoch": 2.42918057100482, + "grad_norm": 6.97265625, + "learning_rate": 7.570819428995181e-06, + "loss": 3.2631, + "mean_token_accuracy": 0.4605446485117163, + "step": 13103 + }, + { + "epoch": 2.4293659621802, + "grad_norm": 8.34375, + "learning_rate": 7.5706340378198005e-06, + "loss": 2.6038, + "mean_token_accuracy": 0.4954159026504417, + "step": 13104 + }, + { + "epoch": 2.4295513533555804, + "grad_norm": 8.015625, + "learning_rate": 7.57044864664442e-06, + "loss": 3.1927, + "mean_token_accuracy": 0.4530847540782534, + "step": 13105 + }, + { + "epoch": 2.4297367445309606, + "grad_norm": 10.0390625, + "learning_rate": 7.570263255469041e-06, + "loss": 3.1332, + "mean_token_accuracy": 0.483695652173913, + "step": 13106 + }, + { + "epoch": 2.4299221357063403, + "grad_norm": 7.8125, + "learning_rate": 7.57007786429366e-06, + "loss": 2.3271, + "mean_token_accuracy": 0.5253504672897197, + "step": 13107 + }, + { + "epoch": 2.4301075268817205, + "grad_norm": 7.7421875, + "learning_rate": 7.56989247311828e-06, + "loss": 2.9811, + "mean_token_accuracy": 0.46444565811032223, + "step": 13108 + }, + { + "epoch": 2.4302929180571002, + "grad_norm": 8.1953125, + "learning_rate": 7.5697070819429e-06, + "loss": 3.1979, + "mean_token_accuracy": 0.4508400292184076, + "step": 13109 + }, + { + "epoch": 2.4304783092324804, + "grad_norm": 8.2109375, + "learning_rate": 7.569521690767521e-06, + "loss": 2.8162, + "mean_token_accuracy": 0.46913405848283657, + "step": 13110 + }, + { + "epoch": 2.4306637004078606, + "grad_norm": 8.3515625, + "learning_rate": 7.5693362995921406e-06, + "loss": 2.5909, + "mean_token_accuracy": 0.4937243852459016, + "step": 13111 + }, + { + "epoch": 2.430849091583241, + "grad_norm": 7.42578125, + "learning_rate": 7.56915090841676e-06, + "loss": 2.9309, + "mean_token_accuracy": 0.49145299145299143, + "step": 13112 + }, + { + "epoch": 2.4310344827586206, + "grad_norm": 8.421875, + "learning_rate": 7.56896551724138e-06, + "loss": 3.6261, + "mean_token_accuracy": 0.41516421401422865, + "step": 13113 + }, + { + "epoch": 2.4312198739340007, + "grad_norm": 9.1953125, + "learning_rate": 7.5687801260659995e-06, + "loss": 2.8108, + "mean_token_accuracy": 0.46534925209276795, + "step": 13114 + }, + { + "epoch": 2.431405265109381, + "grad_norm": 7.8203125, + "learning_rate": 7.56859473489062e-06, + "loss": 2.8305, + "mean_token_accuracy": 0.4826048171275647, + "step": 13115 + }, + { + "epoch": 2.4315906562847607, + "grad_norm": 8.3203125, + "learning_rate": 7.56840934371524e-06, + "loss": 3.3202, + "mean_token_accuracy": 0.4589609255621043, + "step": 13116 + }, + { + "epoch": 2.431776047460141, + "grad_norm": 9.046875, + "learning_rate": 7.568223952539859e-06, + "loss": 3.3454, + "mean_token_accuracy": 0.4475333257377236, + "step": 13117 + }, + { + "epoch": 2.431961438635521, + "grad_norm": 10.5, + "learning_rate": 7.568038561364479e-06, + "loss": 3.0209, + "mean_token_accuracy": 0.45747316267547483, + "step": 13118 + }, + { + "epoch": 2.4321468298109012, + "grad_norm": 7.3515625, + "learning_rate": 7.5678531701891e-06, + "loss": 2.7378, + "mean_token_accuracy": 0.49765896390273373, + "step": 13119 + }, + { + "epoch": 2.432332220986281, + "grad_norm": 9.40625, + "learning_rate": 7.56766777901372e-06, + "loss": 2.7954, + "mean_token_accuracy": 0.5068163592622293, + "step": 13120 + }, + { + "epoch": 2.432517612161661, + "grad_norm": 10.8046875, + "learning_rate": 7.5674823878383396e-06, + "loss": 2.966, + "mean_token_accuracy": 0.4535175879396985, + "step": 13121 + }, + { + "epoch": 2.432703003337041, + "grad_norm": 8.515625, + "learning_rate": 7.567296996662959e-06, + "loss": 2.1596, + "mean_token_accuracy": 0.5152173913043478, + "step": 13122 + }, + { + "epoch": 2.432888394512421, + "grad_norm": 6.38671875, + "learning_rate": 7.56711160548758e-06, + "loss": 2.462, + "mean_token_accuracy": 0.5202634245187436, + "step": 13123 + }, + { + "epoch": 2.4330737856878013, + "grad_norm": 8.40625, + "learning_rate": 7.566926214312199e-06, + "loss": 2.0985, + "mean_token_accuracy": 0.5561284274666042, + "step": 13124 + }, + { + "epoch": 2.4332591768631815, + "grad_norm": 6.43359375, + "learning_rate": 7.566740823136819e-06, + "loss": 2.7968, + "mean_token_accuracy": 0.45245036790226295, + "step": 13125 + }, + { + "epoch": 2.4334445680385612, + "grad_norm": 7.78125, + "learning_rate": 7.566555431961439e-06, + "loss": 2.8199, + "mean_token_accuracy": 0.4965419901199718, + "step": 13126 + }, + { + "epoch": 2.4336299592139414, + "grad_norm": 6.6328125, + "learning_rate": 7.56637004078606e-06, + "loss": 3.0973, + "mean_token_accuracy": 0.45958254269449716, + "step": 13127 + }, + { + "epoch": 2.4338153503893216, + "grad_norm": 6.9375, + "learning_rate": 7.56618464961068e-06, + "loss": 2.3314, + "mean_token_accuracy": 0.531118747613593, + "step": 13128 + }, + { + "epoch": 2.4340007415647014, + "grad_norm": 7.95703125, + "learning_rate": 7.565999258435299e-06, + "loss": 3.4013, + "mean_token_accuracy": 0.4623850835167681, + "step": 13129 + }, + { + "epoch": 2.4341861327400816, + "grad_norm": 7.734375, + "learning_rate": 7.565813867259919e-06, + "loss": 3.0837, + "mean_token_accuracy": 0.4850006437491953, + "step": 13130 + }, + { + "epoch": 2.4343715239154617, + "grad_norm": 7.06640625, + "learning_rate": 7.5656284760845385e-06, + "loss": 3.041, + "mean_token_accuracy": 0.4571510461450272, + "step": 13131 + }, + { + "epoch": 2.4345569150908415, + "grad_norm": 8.6796875, + "learning_rate": 7.565443084909159e-06, + "loss": 2.8106, + "mean_token_accuracy": 0.4821132075471698, + "step": 13132 + }, + { + "epoch": 2.4347423062662217, + "grad_norm": 7.1640625, + "learning_rate": 7.565257693733779e-06, + "loss": 3.0489, + "mean_token_accuracy": 0.4772200772200772, + "step": 13133 + }, + { + "epoch": 2.434927697441602, + "grad_norm": 7.46484375, + "learning_rate": 7.565072302558398e-06, + "loss": 2.8196, + "mean_token_accuracy": 0.48959795788130184, + "step": 13134 + }, + { + "epoch": 2.4351130886169816, + "grad_norm": 8.6328125, + "learning_rate": 7.564886911383019e-06, + "loss": 2.9486, + "mean_token_accuracy": 0.49554773244978256, + "step": 13135 + }, + { + "epoch": 2.435298479792362, + "grad_norm": 6.37890625, + "learning_rate": 7.564701520207639e-06, + "loss": 2.8255, + "mean_token_accuracy": 0.4834971025447216, + "step": 13136 + }, + { + "epoch": 2.435483870967742, + "grad_norm": 7.671875, + "learning_rate": 7.564516129032259e-06, + "loss": 3.2447, + "mean_token_accuracy": 0.44636429085673146, + "step": 13137 + }, + { + "epoch": 2.435669262143122, + "grad_norm": 11.5859375, + "learning_rate": 7.564330737856879e-06, + "loss": 2.5231, + "mean_token_accuracy": 0.5014048890137679, + "step": 13138 + }, + { + "epoch": 2.435854653318502, + "grad_norm": 6.16015625, + "learning_rate": 7.564145346681498e-06, + "loss": 2.6765, + "mean_token_accuracy": 0.49656235182550973, + "step": 13139 + }, + { + "epoch": 2.436040044493882, + "grad_norm": 7.7421875, + "learning_rate": 7.563959955506118e-06, + "loss": 3.1053, + "mean_token_accuracy": 0.46008073559093965, + "step": 13140 + }, + { + "epoch": 2.4362254356692623, + "grad_norm": 7.04296875, + "learning_rate": 7.563774564330738e-06, + "loss": 2.924, + "mean_token_accuracy": 0.4785336449747816, + "step": 13141 + }, + { + "epoch": 2.436410826844642, + "grad_norm": 8.2734375, + "learning_rate": 7.563589173155358e-06, + "loss": 3.3711, + "mean_token_accuracy": 0.4209643605870021, + "step": 13142 + }, + { + "epoch": 2.4365962180200222, + "grad_norm": 9.8515625, + "learning_rate": 7.5634037819799785e-06, + "loss": 3.2916, + "mean_token_accuracy": 0.4442758781654802, + "step": 13143 + }, + { + "epoch": 2.4367816091954024, + "grad_norm": 10.609375, + "learning_rate": 7.563218390804599e-06, + "loss": 2.7056, + "mean_token_accuracy": 0.4814107461166285, + "step": 13144 + }, + { + "epoch": 2.436967000370782, + "grad_norm": 7.7734375, + "learning_rate": 7.563032999629219e-06, + "loss": 3.1668, + "mean_token_accuracy": 0.5013934633899164, + "step": 13145 + }, + { + "epoch": 2.4371523915461624, + "grad_norm": 9.8828125, + "learning_rate": 7.562847608453838e-06, + "loss": 3.2758, + "mean_token_accuracy": 0.44922308546059936, + "step": 13146 + }, + { + "epoch": 2.4373377827215426, + "grad_norm": 12.6015625, + "learning_rate": 7.562662217278458e-06, + "loss": 2.7055, + "mean_token_accuracy": 0.4899139149428241, + "step": 13147 + }, + { + "epoch": 2.4375231738969223, + "grad_norm": 7.80859375, + "learning_rate": 7.562476826103078e-06, + "loss": 3.1387, + "mean_token_accuracy": 0.4623699363572078, + "step": 13148 + }, + { + "epoch": 2.4377085650723025, + "grad_norm": 7.6171875, + "learning_rate": 7.562291434927698e-06, + "loss": 3.2907, + "mean_token_accuracy": 0.4459366391184573, + "step": 13149 + }, + { + "epoch": 2.4378939562476827, + "grad_norm": 8.328125, + "learning_rate": 7.562106043752318e-06, + "loss": 3.6717, + "mean_token_accuracy": 0.45035987661373245, + "step": 13150 + }, + { + "epoch": 2.438079347423063, + "grad_norm": 7.78125, + "learning_rate": 7.561920652576938e-06, + "loss": 3.189, + "mean_token_accuracy": 0.462581905966203, + "step": 13151 + }, + { + "epoch": 2.4382647385984426, + "grad_norm": 7.66796875, + "learning_rate": 7.561735261401558e-06, + "loss": 2.9878, + "mean_token_accuracy": 0.47948260481712757, + "step": 13152 + }, + { + "epoch": 2.438450129773823, + "grad_norm": 10.3984375, + "learning_rate": 7.561549870226178e-06, + "loss": 2.6643, + "mean_token_accuracy": 0.5160837375542507, + "step": 13153 + }, + { + "epoch": 2.438635520949203, + "grad_norm": 7.42578125, + "learning_rate": 7.561364479050798e-06, + "loss": 3.1829, + "mean_token_accuracy": 0.45798155604623975, + "step": 13154 + }, + { + "epoch": 2.4388209121245827, + "grad_norm": 7.98046875, + "learning_rate": 7.561179087875418e-06, + "loss": 2.7662, + "mean_token_accuracy": 0.5057188669160877, + "step": 13155 + }, + { + "epoch": 2.439006303299963, + "grad_norm": 11.1640625, + "learning_rate": 7.560993696700037e-06, + "loss": 2.9334, + "mean_token_accuracy": 0.4752008382815229, + "step": 13156 + }, + { + "epoch": 2.439191694475343, + "grad_norm": 6.625, + "learning_rate": 7.560808305524657e-06, + "loss": 2.4537, + "mean_token_accuracy": 0.526532185532571, + "step": 13157 + }, + { + "epoch": 2.439377085650723, + "grad_norm": 7.29296875, + "learning_rate": 7.560622914349277e-06, + "loss": 3.5148, + "mean_token_accuracy": 0.43953318745441283, + "step": 13158 + }, + { + "epoch": 2.439562476826103, + "grad_norm": 11.21875, + "learning_rate": 7.560437523173898e-06, + "loss": 2.8551, + "mean_token_accuracy": 0.49088207483452656, + "step": 13159 + }, + { + "epoch": 2.4397478680014832, + "grad_norm": 7.64453125, + "learning_rate": 7.5602521319985175e-06, + "loss": 3.7279, + "mean_token_accuracy": 0.4224299065420561, + "step": 13160 + }, + { + "epoch": 2.439933259176863, + "grad_norm": 6.9921875, + "learning_rate": 7.560066740823138e-06, + "loss": 3.1398, + "mean_token_accuracy": 0.44332998996990974, + "step": 13161 + }, + { + "epoch": 2.440118650352243, + "grad_norm": 8.1328125, + "learning_rate": 7.559881349647758e-06, + "loss": 2.8059, + "mean_token_accuracy": 0.48989350141273635, + "step": 13162 + }, + { + "epoch": 2.4403040415276234, + "grad_norm": 7.1484375, + "learning_rate": 7.559695958472377e-06, + "loss": 2.8894, + "mean_token_accuracy": 0.4716376944190302, + "step": 13163 + }, + { + "epoch": 2.4404894327030036, + "grad_norm": 9.6640625, + "learning_rate": 7.559510567296997e-06, + "loss": 4.1782, + "mean_token_accuracy": 0.4034749034749035, + "step": 13164 + }, + { + "epoch": 2.4406748238783833, + "grad_norm": 7.12109375, + "learning_rate": 7.559325176121617e-06, + "loss": 2.4011, + "mean_token_accuracy": 0.5283325012481278, + "step": 13165 + }, + { + "epoch": 2.4408602150537635, + "grad_norm": 7.625, + "learning_rate": 7.559139784946237e-06, + "loss": 3.3419, + "mean_token_accuracy": 0.45691980127750176, + "step": 13166 + }, + { + "epoch": 2.4410456062291437, + "grad_norm": 6.4140625, + "learning_rate": 7.558954393770858e-06, + "loss": 2.8391, + "mean_token_accuracy": 0.47615885243724265, + "step": 13167 + }, + { + "epoch": 2.4412309974045234, + "grad_norm": 6.578125, + "learning_rate": 7.558769002595477e-06, + "loss": 3.0291, + "mean_token_accuracy": 0.47982690594374444, + "step": 13168 + }, + { + "epoch": 2.4414163885799036, + "grad_norm": 7.484375, + "learning_rate": 7.558583611420097e-06, + "loss": 3.0209, + "mean_token_accuracy": 0.49640490624559425, + "step": 13169 + }, + { + "epoch": 2.441601779755284, + "grad_norm": 6.8515625, + "learning_rate": 7.558398220244717e-06, + "loss": 2.6576, + "mean_token_accuracy": 0.512249443207127, + "step": 13170 + }, + { + "epoch": 2.4417871709306636, + "grad_norm": 6.765625, + "learning_rate": 7.558212829069337e-06, + "loss": 2.3558, + "mean_token_accuracy": 0.5250787224471435, + "step": 13171 + }, + { + "epoch": 2.4419725621060437, + "grad_norm": 13.5390625, + "learning_rate": 7.558027437893957e-06, + "loss": 3.4024, + "mean_token_accuracy": 0.4574479559918024, + "step": 13172 + }, + { + "epoch": 2.442157953281424, + "grad_norm": 9.2734375, + "learning_rate": 7.557842046718576e-06, + "loss": 2.6378, + "mean_token_accuracy": 0.5078770862579941, + "step": 13173 + }, + { + "epoch": 2.4423433444568037, + "grad_norm": 8.3515625, + "learning_rate": 7.557656655543196e-06, + "loss": 2.8637, + "mean_token_accuracy": 0.4762852793067104, + "step": 13174 + }, + { + "epoch": 2.442528735632184, + "grad_norm": 7.46875, + "learning_rate": 7.557471264367817e-06, + "loss": 3.0389, + "mean_token_accuracy": 0.44976974245266926, + "step": 13175 + }, + { + "epoch": 2.442714126807564, + "grad_norm": 8.828125, + "learning_rate": 7.557285873192437e-06, + "loss": 3.3449, + "mean_token_accuracy": 0.47508600599267564, + "step": 13176 + }, + { + "epoch": 2.4428995179829442, + "grad_norm": 8.2890625, + "learning_rate": 7.557100482017057e-06, + "loss": 2.9713, + "mean_token_accuracy": 0.45991253644314867, + "step": 13177 + }, + { + "epoch": 2.443084909158324, + "grad_norm": 6.78125, + "learning_rate": 7.556915090841676e-06, + "loss": 2.4336, + "mean_token_accuracy": 0.5197181720324885, + "step": 13178 + }, + { + "epoch": 2.443270300333704, + "grad_norm": 7.3828125, + "learning_rate": 7.556729699666297e-06, + "loss": 2.8244, + "mean_token_accuracy": 0.5099741844637409, + "step": 13179 + }, + { + "epoch": 2.443455691509084, + "grad_norm": 7.16796875, + "learning_rate": 7.556544308490916e-06, + "loss": 3.0526, + "mean_token_accuracy": 0.4602693602693603, + "step": 13180 + }, + { + "epoch": 2.443641082684464, + "grad_norm": 6.52734375, + "learning_rate": 7.556358917315536e-06, + "loss": 3.0262, + "mean_token_accuracy": 0.46614613681389117, + "step": 13181 + }, + { + "epoch": 2.4438264738598443, + "grad_norm": 9.7109375, + "learning_rate": 7.556173526140156e-06, + "loss": 2.6327, + "mean_token_accuracy": 0.4994246260069045, + "step": 13182 + }, + { + "epoch": 2.4440118650352245, + "grad_norm": 7.92578125, + "learning_rate": 7.555988134964777e-06, + "loss": 2.8908, + "mean_token_accuracy": 0.4602808865481677, + "step": 13183 + }, + { + "epoch": 2.4441972562106042, + "grad_norm": 9.109375, + "learning_rate": 7.555802743789397e-06, + "loss": 2.9989, + "mean_token_accuracy": 0.4976043805612594, + "step": 13184 + }, + { + "epoch": 2.4443826473859844, + "grad_norm": 6.984375, + "learning_rate": 7.555617352614016e-06, + "loss": 3.2671, + "mean_token_accuracy": 0.44524959742351045, + "step": 13185 + }, + { + "epoch": 2.4445680385613646, + "grad_norm": 7.4609375, + "learning_rate": 7.555431961438636e-06, + "loss": 3.231, + "mean_token_accuracy": 0.4870578669196368, + "step": 13186 + }, + { + "epoch": 2.4447534297367444, + "grad_norm": 7.73828125, + "learning_rate": 7.555246570263256e-06, + "loss": 4.489, + "mean_token_accuracy": 0.3473892274748806, + "step": 13187 + }, + { + "epoch": 2.4449388209121246, + "grad_norm": 6.5078125, + "learning_rate": 7.555061179087876e-06, + "loss": 2.8399, + "mean_token_accuracy": 0.46882640586797064, + "step": 13188 + }, + { + "epoch": 2.4451242120875047, + "grad_norm": 6.4453125, + "learning_rate": 7.554875787912496e-06, + "loss": 2.8271, + "mean_token_accuracy": 0.5100322991093276, + "step": 13189 + }, + { + "epoch": 2.445309603262885, + "grad_norm": 8.21875, + "learning_rate": 7.554690396737115e-06, + "loss": 4.06, + "mean_token_accuracy": 0.41890744409849984, + "step": 13190 + }, + { + "epoch": 2.4454949944382647, + "grad_norm": 7.86328125, + "learning_rate": 7.554505005561737e-06, + "loss": 3.1707, + "mean_token_accuracy": 0.4500276090557703, + "step": 13191 + }, + { + "epoch": 2.445680385613645, + "grad_norm": 6.96484375, + "learning_rate": 7.554319614386356e-06, + "loss": 2.8706, + "mean_token_accuracy": 0.488702201622248, + "step": 13192 + }, + { + "epoch": 2.4458657767890246, + "grad_norm": 7.03515625, + "learning_rate": 7.554134223210976e-06, + "loss": 2.7088, + "mean_token_accuracy": 0.4857250660294303, + "step": 13193 + }, + { + "epoch": 2.446051167964405, + "grad_norm": 6.5859375, + "learning_rate": 7.553948832035596e-06, + "loss": 2.7485, + "mean_token_accuracy": 0.49779179810725555, + "step": 13194 + }, + { + "epoch": 2.446236559139785, + "grad_norm": 7.859375, + "learning_rate": 7.553763440860215e-06, + "loss": 3.3263, + "mean_token_accuracy": 0.4362745098039216, + "step": 13195 + }, + { + "epoch": 2.446421950315165, + "grad_norm": 7.12109375, + "learning_rate": 7.553578049684836e-06, + "loss": 2.5866, + "mean_token_accuracy": 0.49486747357131916, + "step": 13196 + }, + { + "epoch": 2.446607341490545, + "grad_norm": 6.63671875, + "learning_rate": 7.553392658509455e-06, + "loss": 2.9705, + "mean_token_accuracy": 0.4706743080112181, + "step": 13197 + }, + { + "epoch": 2.446792732665925, + "grad_norm": 6.0859375, + "learning_rate": 7.553207267334075e-06, + "loss": 3.3024, + "mean_token_accuracy": 0.44313536907910556, + "step": 13198 + }, + { + "epoch": 2.4469781238413053, + "grad_norm": 7.0390625, + "learning_rate": 7.553021876158696e-06, + "loss": 3.6083, + "mean_token_accuracy": 0.43163824604141293, + "step": 13199 + }, + { + "epoch": 2.447163515016685, + "grad_norm": 7.64453125, + "learning_rate": 7.552836484983316e-06, + "loss": 2.9616, + "mean_token_accuracy": 0.47360084477296727, + "step": 13200 + }, + { + "epoch": 2.4473489061920652, + "grad_norm": 6.11328125, + "learning_rate": 7.552651093807936e-06, + "loss": 2.6068, + "mean_token_accuracy": 0.47780517879161527, + "step": 13201 + }, + { + "epoch": 2.4475342973674454, + "grad_norm": 6.859375, + "learning_rate": 7.552465702632555e-06, + "loss": 2.6885, + "mean_token_accuracy": 0.4832381788903528, + "step": 13202 + }, + { + "epoch": 2.447719688542825, + "grad_norm": 6.27734375, + "learning_rate": 7.552280311457175e-06, + "loss": 3.0123, + "mean_token_accuracy": 0.4588390501319261, + "step": 13203 + }, + { + "epoch": 2.4479050797182054, + "grad_norm": 7.07421875, + "learning_rate": 7.5520949202817954e-06, + "loss": 3.011, + "mean_token_accuracy": 0.46469651389134664, + "step": 13204 + }, + { + "epoch": 2.4480904708935856, + "grad_norm": 8.265625, + "learning_rate": 7.551909529106415e-06, + "loss": 2.5261, + "mean_token_accuracy": 0.4989381282741045, + "step": 13205 + }, + { + "epoch": 2.4482758620689653, + "grad_norm": 6.48828125, + "learning_rate": 7.551724137931035e-06, + "loss": 2.9973, + "mean_token_accuracy": 0.47969483568075116, + "step": 13206 + }, + { + "epoch": 2.4484612532443455, + "grad_norm": 7.26171875, + "learning_rate": 7.551538746755655e-06, + "loss": 2.6401, + "mean_token_accuracy": 0.4899672131147541, + "step": 13207 + }, + { + "epoch": 2.4486466444197257, + "grad_norm": 10.7109375, + "learning_rate": 7.551353355580276e-06, + "loss": 2.3393, + "mean_token_accuracy": 0.5124252491694352, + "step": 13208 + }, + { + "epoch": 2.448832035595106, + "grad_norm": 7.265625, + "learning_rate": 7.551167964404895e-06, + "loss": 3.3117, + "mean_token_accuracy": 0.47304881923117653, + "step": 13209 + }, + { + "epoch": 2.4490174267704856, + "grad_norm": 7.19140625, + "learning_rate": 7.550982573229515e-06, + "loss": 3.0107, + "mean_token_accuracy": 0.4800339847068819, + "step": 13210 + }, + { + "epoch": 2.449202817945866, + "grad_norm": 6.875, + "learning_rate": 7.550797182054135e-06, + "loss": 2.704, + "mean_token_accuracy": 0.48489405331510593, + "step": 13211 + }, + { + "epoch": 2.449388209121246, + "grad_norm": 7.734375, + "learning_rate": 7.550611790878754e-06, + "loss": 2.9996, + "mean_token_accuracy": 0.4691937276206939, + "step": 13212 + }, + { + "epoch": 2.4495736002966257, + "grad_norm": 7.3046875, + "learning_rate": 7.550426399703375e-06, + "loss": 2.7962, + "mean_token_accuracy": 0.5191304347826087, + "step": 13213 + }, + { + "epoch": 2.449758991472006, + "grad_norm": 7.56640625, + "learning_rate": 7.5502410085279944e-06, + "loss": 2.6208, + "mean_token_accuracy": 0.5085951393005335, + "step": 13214 + }, + { + "epoch": 2.449944382647386, + "grad_norm": 6.8359375, + "learning_rate": 7.550055617352615e-06, + "loss": 2.6724, + "mean_token_accuracy": 0.4931972789115646, + "step": 13215 + }, + { + "epoch": 2.450129773822766, + "grad_norm": 6.69921875, + "learning_rate": 7.5498702261772346e-06, + "loss": 2.8974, + "mean_token_accuracy": 0.4801796221740477, + "step": 13216 + }, + { + "epoch": 2.450315164998146, + "grad_norm": 6.66015625, + "learning_rate": 7.549684835001855e-06, + "loss": 3.2697, + "mean_token_accuracy": 0.4374600468783294, + "step": 13217 + }, + { + "epoch": 2.4505005561735262, + "grad_norm": 7.7578125, + "learning_rate": 7.549499443826475e-06, + "loss": 2.5174, + "mean_token_accuracy": 0.50613774389456, + "step": 13218 + }, + { + "epoch": 2.450685947348906, + "grad_norm": 7.3671875, + "learning_rate": 7.549314052651094e-06, + "loss": 2.6881, + "mean_token_accuracy": 0.5040331873703618, + "step": 13219 + }, + { + "epoch": 2.450871338524286, + "grad_norm": 7.0703125, + "learning_rate": 7.549128661475714e-06, + "loss": 3.4583, + "mean_token_accuracy": 0.42150170648464164, + "step": 13220 + }, + { + "epoch": 2.4510567296996664, + "grad_norm": 7.37109375, + "learning_rate": 7.548943270300334e-06, + "loss": 3.051, + "mean_token_accuracy": 0.451077246452969, + "step": 13221 + }, + { + "epoch": 2.4512421208750466, + "grad_norm": 8.2578125, + "learning_rate": 7.548757879124954e-06, + "loss": 2.6532, + "mean_token_accuracy": 0.5150265957446809, + "step": 13222 + }, + { + "epoch": 2.4514275120504263, + "grad_norm": 6.9296875, + "learning_rate": 7.548572487949575e-06, + "loss": 2.6949, + "mean_token_accuracy": 0.5034596375617792, + "step": 13223 + }, + { + "epoch": 2.4516129032258065, + "grad_norm": 8.484375, + "learning_rate": 7.548387096774194e-06, + "loss": 2.9165, + "mean_token_accuracy": 0.48109264119762807, + "step": 13224 + }, + { + "epoch": 2.4517982944011867, + "grad_norm": 8.078125, + "learning_rate": 7.548201705598815e-06, + "loss": 2.8765, + "mean_token_accuracy": 0.4824469478648153, + "step": 13225 + }, + { + "epoch": 2.4519836855765664, + "grad_norm": 8.4453125, + "learning_rate": 7.548016314423434e-06, + "loss": 3.0404, + "mean_token_accuracy": 0.46179775280898877, + "step": 13226 + }, + { + "epoch": 2.4521690767519466, + "grad_norm": 7.828125, + "learning_rate": 7.547830923248054e-06, + "loss": 2.717, + "mean_token_accuracy": 0.4817970565453137, + "step": 13227 + }, + { + "epoch": 2.452354467927327, + "grad_norm": 7.87109375, + "learning_rate": 7.547645532072674e-06, + "loss": 2.9725, + "mean_token_accuracy": 0.4736559139784946, + "step": 13228 + }, + { + "epoch": 2.4525398591027066, + "grad_norm": 7.078125, + "learning_rate": 7.547460140897293e-06, + "loss": 3.0439, + "mean_token_accuracy": 0.449438202247191, + "step": 13229 + }, + { + "epoch": 2.4527252502780867, + "grad_norm": 7.34375, + "learning_rate": 7.547274749721914e-06, + "loss": 2.3932, + "mean_token_accuracy": 0.5071455672189199, + "step": 13230 + }, + { + "epoch": 2.452910641453467, + "grad_norm": 6.60546875, + "learning_rate": 7.547089358546534e-06, + "loss": 2.5717, + "mean_token_accuracy": 0.506631299734748, + "step": 13231 + }, + { + "epoch": 2.4530960326288467, + "grad_norm": 7.05859375, + "learning_rate": 7.546903967371154e-06, + "loss": 3.4247, + "mean_token_accuracy": 0.44687610307094955, + "step": 13232 + }, + { + "epoch": 2.453281423804227, + "grad_norm": 8.203125, + "learning_rate": 7.546718576195774e-06, + "loss": 2.7161, + "mean_token_accuracy": 0.4849804092294297, + "step": 13233 + }, + { + "epoch": 2.453466814979607, + "grad_norm": 16.609375, + "learning_rate": 7.546533185020394e-06, + "loss": 2.7526, + "mean_token_accuracy": 0.5026485490557346, + "step": 13234 + }, + { + "epoch": 2.4536522061549872, + "grad_norm": 8.4296875, + "learning_rate": 7.546347793845014e-06, + "loss": 2.9868, + "mean_token_accuracy": 0.453990453990454, + "step": 13235 + }, + { + "epoch": 2.453837597330367, + "grad_norm": 9.5234375, + "learning_rate": 7.546162402669633e-06, + "loss": 3.3655, + "mean_token_accuracy": 0.4521408497272276, + "step": 13236 + }, + { + "epoch": 2.454022988505747, + "grad_norm": 8.4921875, + "learning_rate": 7.545977011494253e-06, + "loss": 2.8036, + "mean_token_accuracy": 0.4907233127184727, + "step": 13237 + }, + { + "epoch": 2.4542083796811274, + "grad_norm": 9.65625, + "learning_rate": 7.545791620318873e-06, + "loss": 2.5723, + "mean_token_accuracy": 0.5023023286409682, + "step": 13238 + }, + { + "epoch": 2.454393770856507, + "grad_norm": 8.40625, + "learning_rate": 7.545606229143493e-06, + "loss": 2.7374, + "mean_token_accuracy": 0.5142514251425142, + "step": 13239 + }, + { + "epoch": 2.4545791620318873, + "grad_norm": 7.21484375, + "learning_rate": 7.545420837968114e-06, + "loss": 2.9025, + "mean_token_accuracy": 0.47911007631612984, + "step": 13240 + }, + { + "epoch": 2.4547645532072675, + "grad_norm": 10.3046875, + "learning_rate": 7.545235446792733e-06, + "loss": 2.744, + "mean_token_accuracy": 0.4700448308653716, + "step": 13241 + }, + { + "epoch": 2.4549499443826472, + "grad_norm": 7.32421875, + "learning_rate": 7.545050055617354e-06, + "loss": 3.4312, + "mean_token_accuracy": 0.4547646237787083, + "step": 13242 + }, + { + "epoch": 2.4551353355580274, + "grad_norm": 8.578125, + "learning_rate": 7.5448646644419734e-06, + "loss": 2.874, + "mean_token_accuracy": 0.4737789781679046, + "step": 13243 + }, + { + "epoch": 2.4553207267334076, + "grad_norm": 10.2265625, + "learning_rate": 7.544679273266593e-06, + "loss": 3.0093, + "mean_token_accuracy": 0.4699094180419407, + "step": 13244 + }, + { + "epoch": 2.4555061179087874, + "grad_norm": 8.8515625, + "learning_rate": 7.544493882091213e-06, + "loss": 3.0177, + "mean_token_accuracy": 0.46644363341443634, + "step": 13245 + }, + { + "epoch": 2.4556915090841676, + "grad_norm": 9.4609375, + "learning_rate": 7.544308490915832e-06, + "loss": 2.2757, + "mean_token_accuracy": 0.5451754971306553, + "step": 13246 + }, + { + "epoch": 2.4558769002595477, + "grad_norm": 9.078125, + "learning_rate": 7.544123099740452e-06, + "loss": 3.0499, + "mean_token_accuracy": 0.46343107516009435, + "step": 13247 + }, + { + "epoch": 2.456062291434928, + "grad_norm": 6.71484375, + "learning_rate": 7.543937708565073e-06, + "loss": 3.0525, + "mean_token_accuracy": 0.4864864864864865, + "step": 13248 + }, + { + "epoch": 2.4562476826103077, + "grad_norm": 11.4921875, + "learning_rate": 7.543752317389693e-06, + "loss": 3.4827, + "mean_token_accuracy": 0.5011206328279499, + "step": 13249 + }, + { + "epoch": 2.456433073785688, + "grad_norm": 7.56640625, + "learning_rate": 7.543566926214313e-06, + "loss": 3.0248, + "mean_token_accuracy": 0.4866333725029377, + "step": 13250 + }, + { + "epoch": 2.4566184649610676, + "grad_norm": 8.6640625, + "learning_rate": 7.543381535038933e-06, + "loss": 2.2933, + "mean_token_accuracy": 0.5190633443410387, + "step": 13251 + }, + { + "epoch": 2.456803856136448, + "grad_norm": 6.83984375, + "learning_rate": 7.543196143863553e-06, + "loss": 2.2886, + "mean_token_accuracy": 0.5615132558832291, + "step": 13252 + }, + { + "epoch": 2.456989247311828, + "grad_norm": 7.12109375, + "learning_rate": 7.543010752688172e-06, + "loss": 2.9018, + "mean_token_accuracy": 0.4667919799498747, + "step": 13253 + }, + { + "epoch": 2.457174638487208, + "grad_norm": 6.97265625, + "learning_rate": 7.542825361512792e-06, + "loss": 3.039, + "mean_token_accuracy": 0.4797348011008256, + "step": 13254 + }, + { + "epoch": 2.457360029662588, + "grad_norm": 6.46875, + "learning_rate": 7.542639970337412e-06, + "loss": 2.7739, + "mean_token_accuracy": 0.4895278665246716, + "step": 13255 + }, + { + "epoch": 2.457545420837968, + "grad_norm": 9.1328125, + "learning_rate": 7.542454579162033e-06, + "loss": 2.5269, + "mean_token_accuracy": 0.5278604849000426, + "step": 13256 + }, + { + "epoch": 2.4577308120133483, + "grad_norm": 7.72265625, + "learning_rate": 7.542269187986653e-06, + "loss": 2.6555, + "mean_token_accuracy": 0.5005268703898841, + "step": 13257 + }, + { + "epoch": 2.457916203188728, + "grad_norm": 7.8515625, + "learning_rate": 7.542083796811272e-06, + "loss": 3.2479, + "mean_token_accuracy": 0.45065410779696496, + "step": 13258 + }, + { + "epoch": 2.4581015943641082, + "grad_norm": 7.44921875, + "learning_rate": 7.541898405635892e-06, + "loss": 3.301, + "mean_token_accuracy": 0.4671036659383345, + "step": 13259 + }, + { + "epoch": 2.4582869855394884, + "grad_norm": 7.74609375, + "learning_rate": 7.5417130144605125e-06, + "loss": 2.5827, + "mean_token_accuracy": 0.5135805130416038, + "step": 13260 + }, + { + "epoch": 2.458472376714868, + "grad_norm": 7.62109375, + "learning_rate": 7.541527623285132e-06, + "loss": 2.6281, + "mean_token_accuracy": 0.4961340206185567, + "step": 13261 + }, + { + "epoch": 2.4586577678902484, + "grad_norm": 9.578125, + "learning_rate": 7.541342232109752e-06, + "loss": 3.1316, + "mean_token_accuracy": 0.45230078563411896, + "step": 13262 + }, + { + "epoch": 2.4588431590656286, + "grad_norm": 8.671875, + "learning_rate": 7.541156840934371e-06, + "loss": 2.694, + "mean_token_accuracy": 0.47699468085106383, + "step": 13263 + }, + { + "epoch": 2.4590285502410083, + "grad_norm": 7.93359375, + "learning_rate": 7.540971449758993e-06, + "loss": 2.7149, + "mean_token_accuracy": 0.5159562077801072, + "step": 13264 + }, + { + "epoch": 2.4592139414163885, + "grad_norm": 10.34375, + "learning_rate": 7.540786058583612e-06, + "loss": 2.7619, + "mean_token_accuracy": 0.5025322283609577, + "step": 13265 + }, + { + "epoch": 2.4593993325917687, + "grad_norm": 8.9375, + "learning_rate": 7.540600667408232e-06, + "loss": 2.7486, + "mean_token_accuracy": 0.4464885188666577, + "step": 13266 + }, + { + "epoch": 2.459584723767149, + "grad_norm": 6.17578125, + "learning_rate": 7.540415276232852e-06, + "loss": 2.6317, + "mean_token_accuracy": 0.5076169029010466, + "step": 13267 + }, + { + "epoch": 2.4597701149425286, + "grad_norm": 6.5625, + "learning_rate": 7.540229885057472e-06, + "loss": 2.2256, + "mean_token_accuracy": 0.5350691619202603, + "step": 13268 + }, + { + "epoch": 2.459955506117909, + "grad_norm": 9.3125, + "learning_rate": 7.540044493882092e-06, + "loss": 2.823, + "mean_token_accuracy": 0.49676519091716353, + "step": 13269 + }, + { + "epoch": 2.460140897293289, + "grad_norm": 15.5078125, + "learning_rate": 7.5398591027067114e-06, + "loss": 3.2457, + "mean_token_accuracy": 0.443209574987787, + "step": 13270 + }, + { + "epoch": 2.4603262884686687, + "grad_norm": 8.890625, + "learning_rate": 7.539673711531331e-06, + "loss": 2.8765, + "mean_token_accuracy": 0.46920492721164614, + "step": 13271 + }, + { + "epoch": 2.460511679644049, + "grad_norm": 8.828125, + "learning_rate": 7.5394883203559524e-06, + "loss": 2.5411, + "mean_token_accuracy": 0.5048463356973996, + "step": 13272 + }, + { + "epoch": 2.460697070819429, + "grad_norm": 10.8359375, + "learning_rate": 7.539302929180572e-06, + "loss": 2.3473, + "mean_token_accuracy": 0.5099614395886889, + "step": 13273 + }, + { + "epoch": 2.460882461994809, + "grad_norm": 7.328125, + "learning_rate": 7.539117538005192e-06, + "loss": 3.1377, + "mean_token_accuracy": 0.46026069904256545, + "step": 13274 + }, + { + "epoch": 2.461067853170189, + "grad_norm": 8.09375, + "learning_rate": 7.538932146829811e-06, + "loss": 2.9261, + "mean_token_accuracy": 0.4772105742935278, + "step": 13275 + }, + { + "epoch": 2.4612532443455692, + "grad_norm": 6.44921875, + "learning_rate": 7.538746755654431e-06, + "loss": 3.537, + "mean_token_accuracy": 0.4269978401727862, + "step": 13276 + }, + { + "epoch": 2.461438635520949, + "grad_norm": 7.44140625, + "learning_rate": 7.5385613644790515e-06, + "loss": 2.6834, + "mean_token_accuracy": 0.5136465324384788, + "step": 13277 + }, + { + "epoch": 2.461624026696329, + "grad_norm": 7.81640625, + "learning_rate": 7.538375973303671e-06, + "loss": 2.7812, + "mean_token_accuracy": 0.4897326045206087, + "step": 13278 + }, + { + "epoch": 2.4618094178717094, + "grad_norm": 9.6875, + "learning_rate": 7.538190582128291e-06, + "loss": 3.0179, + "mean_token_accuracy": 0.45230582524271845, + "step": 13279 + }, + { + "epoch": 2.4619948090470896, + "grad_norm": 9.171875, + "learning_rate": 7.538005190952912e-06, + "loss": 2.5547, + "mean_token_accuracy": 0.5022950819672131, + "step": 13280 + }, + { + "epoch": 2.4621802002224693, + "grad_norm": 7.6171875, + "learning_rate": 7.537819799777532e-06, + "loss": 3.5734, + "mean_token_accuracy": 0.4427555773473891, + "step": 13281 + }, + { + "epoch": 2.4623655913978495, + "grad_norm": 10.8515625, + "learning_rate": 7.537634408602151e-06, + "loss": 3.0597, + "mean_token_accuracy": 0.4607632356893308, + "step": 13282 + }, + { + "epoch": 2.4625509825732297, + "grad_norm": 8.0859375, + "learning_rate": 7.537449017426771e-06, + "loss": 2.7501, + "mean_token_accuracy": 0.4774280273328106, + "step": 13283 + }, + { + "epoch": 2.4627363737486094, + "grad_norm": 7.625, + "learning_rate": 7.537263626251391e-06, + "loss": 2.8907, + "mean_token_accuracy": 0.4921441774491682, + "step": 13284 + }, + { + "epoch": 2.4629217649239896, + "grad_norm": 7.16796875, + "learning_rate": 7.537078235076011e-06, + "loss": 3.3606, + "mean_token_accuracy": 0.43273841236014915, + "step": 13285 + }, + { + "epoch": 2.46310715609937, + "grad_norm": 8.09375, + "learning_rate": 7.536892843900631e-06, + "loss": 2.9947, + "mean_token_accuracy": 0.47450684488519995, + "step": 13286 + }, + { + "epoch": 2.4632925472747496, + "grad_norm": 8.140625, + "learning_rate": 7.5367074527252505e-06, + "loss": 2.5571, + "mean_token_accuracy": 0.5103296193129062, + "step": 13287 + }, + { + "epoch": 2.4634779384501297, + "grad_norm": 6.9296875, + "learning_rate": 7.536522061549871e-06, + "loss": 3.0481, + "mean_token_accuracy": 0.4744993742177722, + "step": 13288 + }, + { + "epoch": 2.46366332962551, + "grad_norm": 8.2109375, + "learning_rate": 7.5363366703744915e-06, + "loss": 2.9223, + "mean_token_accuracy": 0.48070626753975676, + "step": 13289 + }, + { + "epoch": 2.4638487208008897, + "grad_norm": 7.265625, + "learning_rate": 7.536151279199111e-06, + "loss": 2.5837, + "mean_token_accuracy": 0.5, + "step": 13290 + }, + { + "epoch": 2.46403411197627, + "grad_norm": 7.51171875, + "learning_rate": 7.535965888023731e-06, + "loss": 2.6596, + "mean_token_accuracy": 0.49928140270192584, + "step": 13291 + }, + { + "epoch": 2.46421950315165, + "grad_norm": 6.70703125, + "learning_rate": 7.53578049684835e-06, + "loss": 2.4983, + "mean_token_accuracy": 0.4989667049368542, + "step": 13292 + }, + { + "epoch": 2.4644048943270302, + "grad_norm": 7.58984375, + "learning_rate": 7.53559510567297e-06, + "loss": 2.5259, + "mean_token_accuracy": 0.5089542892924233, + "step": 13293 + }, + { + "epoch": 2.46459028550241, + "grad_norm": 6.796875, + "learning_rate": 7.5354097144975905e-06, + "loss": 3.3845, + "mean_token_accuracy": 0.45827458018202794, + "step": 13294 + }, + { + "epoch": 2.46477567667779, + "grad_norm": 12.265625, + "learning_rate": 7.53522432332221e-06, + "loss": 3.9096, + "mean_token_accuracy": 0.4415058696532182, + "step": 13295 + }, + { + "epoch": 2.4649610678531704, + "grad_norm": 6.83203125, + "learning_rate": 7.535038932146831e-06, + "loss": 2.4868, + "mean_token_accuracy": 0.5420560747663551, + "step": 13296 + }, + { + "epoch": 2.46514645902855, + "grad_norm": 7.65234375, + "learning_rate": 7.53485354097145e-06, + "loss": 2.7698, + "mean_token_accuracy": 0.4740678145135735, + "step": 13297 + }, + { + "epoch": 2.4653318502039303, + "grad_norm": 8.046875, + "learning_rate": 7.534668149796071e-06, + "loss": 3.0664, + "mean_token_accuracy": 0.4593239943451998, + "step": 13298 + }, + { + "epoch": 2.4655172413793105, + "grad_norm": 11.7890625, + "learning_rate": 7.5344827586206904e-06, + "loss": 2.5011, + "mean_token_accuracy": 0.4934348449740704, + "step": 13299 + }, + { + "epoch": 2.4657026325546902, + "grad_norm": 12.5078125, + "learning_rate": 7.53429736744531e-06, + "loss": 2.2675, + "mean_token_accuracy": 0.5410469107551488, + "step": 13300 + }, + { + "epoch": 2.4658880237300704, + "grad_norm": 9.7890625, + "learning_rate": 7.53411197626993e-06, + "loss": 2.9737, + "mean_token_accuracy": 0.4888496056567854, + "step": 13301 + }, + { + "epoch": 2.4660734149054506, + "grad_norm": 6.9140625, + "learning_rate": 7.533926585094549e-06, + "loss": 2.9123, + "mean_token_accuracy": 0.46083532555196177, + "step": 13302 + }, + { + "epoch": 2.4662588060808304, + "grad_norm": 9.4453125, + "learning_rate": 7.53374119391917e-06, + "loss": 3.3758, + "mean_token_accuracy": 0.45414146401678807, + "step": 13303 + }, + { + "epoch": 2.4664441972562106, + "grad_norm": 11.234375, + "learning_rate": 7.53355580274379e-06, + "loss": 2.471, + "mean_token_accuracy": 0.5417844622865543, + "step": 13304 + }, + { + "epoch": 2.4666295884315907, + "grad_norm": 14.1953125, + "learning_rate": 7.53337041156841e-06, + "loss": 2.791, + "mean_token_accuracy": 0.4993684210526316, + "step": 13305 + }, + { + "epoch": 2.466814979606971, + "grad_norm": 7.63671875, + "learning_rate": 7.5331850203930305e-06, + "loss": 3.2095, + "mean_token_accuracy": 0.4437604924454393, + "step": 13306 + }, + { + "epoch": 2.4670003707823507, + "grad_norm": 12.0390625, + "learning_rate": 7.53299962921765e-06, + "loss": 2.7843, + "mean_token_accuracy": 0.4885530619149656, + "step": 13307 + }, + { + "epoch": 2.467185761957731, + "grad_norm": 9.25, + "learning_rate": 7.53281423804227e-06, + "loss": 2.6579, + "mean_token_accuracy": 0.5365567574612277, + "step": 13308 + }, + { + "epoch": 2.4673711531331106, + "grad_norm": 8.171875, + "learning_rate": 7.5326288468668894e-06, + "loss": 2.7935, + "mean_token_accuracy": 0.5261004514672686, + "step": 13309 + }, + { + "epoch": 2.467556544308491, + "grad_norm": 6.8046875, + "learning_rate": 7.532443455691509e-06, + "loss": 2.7979, + "mean_token_accuracy": 0.5002209944751381, + "step": 13310 + }, + { + "epoch": 2.467741935483871, + "grad_norm": 9.0078125, + "learning_rate": 7.5322580645161296e-06, + "loss": 2.6403, + "mean_token_accuracy": 0.49795261772448085, + "step": 13311 + }, + { + "epoch": 2.467927326659251, + "grad_norm": 7.921875, + "learning_rate": 7.53207267334075e-06, + "loss": 3.096, + "mean_token_accuracy": 0.4681624467285034, + "step": 13312 + }, + { + "epoch": 2.468112717834631, + "grad_norm": 7.27734375, + "learning_rate": 7.53188728216537e-06, + "loss": 2.1734, + "mean_token_accuracy": 0.5699150520187076, + "step": 13313 + }, + { + "epoch": 2.468298109010011, + "grad_norm": 7.68359375, + "learning_rate": 7.531701890989989e-06, + "loss": 3.0781, + "mean_token_accuracy": 0.45792450652199007, + "step": 13314 + }, + { + "epoch": 2.4684835001853913, + "grad_norm": 8.4609375, + "learning_rate": 7.53151649981461e-06, + "loss": 3.2745, + "mean_token_accuracy": 0.4541628304616599, + "step": 13315 + }, + { + "epoch": 2.468668891360771, + "grad_norm": 8.96875, + "learning_rate": 7.5313311086392295e-06, + "loss": 3.1627, + "mean_token_accuracy": 0.42383209207853756, + "step": 13316 + }, + { + "epoch": 2.4688542825361512, + "grad_norm": 9.2265625, + "learning_rate": 7.531145717463849e-06, + "loss": 2.4012, + "mean_token_accuracy": 0.5350802008081302, + "step": 13317 + }, + { + "epoch": 2.4690396737115314, + "grad_norm": 6.921875, + "learning_rate": 7.530960326288469e-06, + "loss": 2.5358, + "mean_token_accuracy": 0.503161159489443, + "step": 13318 + }, + { + "epoch": 2.4692250648869116, + "grad_norm": 7.78125, + "learning_rate": 7.530774935113088e-06, + "loss": 2.4126, + "mean_token_accuracy": 0.5125684420109508, + "step": 13319 + }, + { + "epoch": 2.4694104560622914, + "grad_norm": 7.74609375, + "learning_rate": 7.53058954393771e-06, + "loss": 3.2211, + "mean_token_accuracy": 0.4494884158532283, + "step": 13320 + }, + { + "epoch": 2.4695958472376716, + "grad_norm": 6.546875, + "learning_rate": 7.530404152762329e-06, + "loss": 2.8311, + "mean_token_accuracy": 0.48358938547486036, + "step": 13321 + }, + { + "epoch": 2.4697812384130513, + "grad_norm": 8.3671875, + "learning_rate": 7.530218761586949e-06, + "loss": 3.6416, + "mean_token_accuracy": 0.45073919422187114, + "step": 13322 + }, + { + "epoch": 2.4699666295884315, + "grad_norm": 8.796875, + "learning_rate": 7.5300333704115695e-06, + "loss": 2.8854, + "mean_token_accuracy": 0.46971153846153846, + "step": 13323 + }, + { + "epoch": 2.4701520207638117, + "grad_norm": 8.390625, + "learning_rate": 7.529847979236189e-06, + "loss": 2.5262, + "mean_token_accuracy": 0.5237226277372263, + "step": 13324 + }, + { + "epoch": 2.470337411939192, + "grad_norm": 10.6640625, + "learning_rate": 7.529662588060809e-06, + "loss": 2.5144, + "mean_token_accuracy": 0.5209748302037555, + "step": 13325 + }, + { + "epoch": 2.4705228031145716, + "grad_norm": 7.5546875, + "learning_rate": 7.5294771968854285e-06, + "loss": 3.1682, + "mean_token_accuracy": 0.46429780033840945, + "step": 13326 + }, + { + "epoch": 2.470708194289952, + "grad_norm": 7.890625, + "learning_rate": 7.529291805710048e-06, + "loss": 3.5393, + "mean_token_accuracy": 0.4307639445653419, + "step": 13327 + }, + { + "epoch": 2.470893585465332, + "grad_norm": 9.1953125, + "learning_rate": 7.5291064145346695e-06, + "loss": 2.8436, + "mean_token_accuracy": 0.46603908484270734, + "step": 13328 + }, + { + "epoch": 2.4710789766407117, + "grad_norm": 7.890625, + "learning_rate": 7.528921023359289e-06, + "loss": 2.5791, + "mean_token_accuracy": 0.5097365406643757, + "step": 13329 + }, + { + "epoch": 2.471264367816092, + "grad_norm": 6.765625, + "learning_rate": 7.528735632183909e-06, + "loss": 2.7928, + "mean_token_accuracy": 0.49418257756563244, + "step": 13330 + }, + { + "epoch": 2.471449758991472, + "grad_norm": 9.2265625, + "learning_rate": 7.528550241008528e-06, + "loss": 3.1077, + "mean_token_accuracy": 0.4617847138855542, + "step": 13331 + }, + { + "epoch": 2.471635150166852, + "grad_norm": 9.0859375, + "learning_rate": 7.528364849833149e-06, + "loss": 3.0796, + "mean_token_accuracy": 0.4658896658896659, + "step": 13332 + }, + { + "epoch": 2.471820541342232, + "grad_norm": 7.2265625, + "learning_rate": 7.5281794586577685e-06, + "loss": 2.9862, + "mean_token_accuracy": 0.4403870967741936, + "step": 13333 + }, + { + "epoch": 2.4720059325176122, + "grad_norm": 7.3515625, + "learning_rate": 7.527994067482388e-06, + "loss": 2.9313, + "mean_token_accuracy": 0.4601187403200826, + "step": 13334 + }, + { + "epoch": 2.472191323692992, + "grad_norm": 7.8515625, + "learning_rate": 7.527808676307008e-06, + "loss": 2.9619, + "mean_token_accuracy": 0.4835637480798771, + "step": 13335 + }, + { + "epoch": 2.472376714868372, + "grad_norm": 7.18359375, + "learning_rate": 7.527623285131629e-06, + "loss": 3.5066, + "mean_token_accuracy": 0.4232268121590023, + "step": 13336 + }, + { + "epoch": 2.4725621060437524, + "grad_norm": 8.1796875, + "learning_rate": 7.527437893956249e-06, + "loss": 3.3012, + "mean_token_accuracy": 0.46864857847591845, + "step": 13337 + }, + { + "epoch": 2.4727474972191326, + "grad_norm": 6.80078125, + "learning_rate": 7.5272525027808684e-06, + "loss": 2.7245, + "mean_token_accuracy": 0.4970202622169249, + "step": 13338 + }, + { + "epoch": 2.4729328883945123, + "grad_norm": 6.9453125, + "learning_rate": 7.527067111605488e-06, + "loss": 2.4637, + "mean_token_accuracy": 0.5198421164924258, + "step": 13339 + }, + { + "epoch": 2.4731182795698925, + "grad_norm": 9.1796875, + "learning_rate": 7.526881720430108e-06, + "loss": 2.4712, + "mean_token_accuracy": 0.5152079453755432, + "step": 13340 + }, + { + "epoch": 2.4733036707452727, + "grad_norm": 7.62890625, + "learning_rate": 7.526696329254728e-06, + "loss": 2.4651, + "mean_token_accuracy": 0.5261992619926199, + "step": 13341 + }, + { + "epoch": 2.4734890619206524, + "grad_norm": 7.48046875, + "learning_rate": 7.526510938079348e-06, + "loss": 2.9323, + "mean_token_accuracy": 0.4613366907984472, + "step": 13342 + }, + { + "epoch": 2.4736744530960326, + "grad_norm": 7.7578125, + "learning_rate": 7.5263255469039675e-06, + "loss": 2.7491, + "mean_token_accuracy": 0.4887230514096186, + "step": 13343 + }, + { + "epoch": 2.473859844271413, + "grad_norm": 8.8125, + "learning_rate": 7.526140155728589e-06, + "loss": 2.8553, + "mean_token_accuracy": 0.466578073089701, + "step": 13344 + }, + { + "epoch": 2.4740452354467926, + "grad_norm": 8.78125, + "learning_rate": 7.5259547645532085e-06, + "loss": 2.8389, + "mean_token_accuracy": 0.4698604060913706, + "step": 13345 + }, + { + "epoch": 2.4742306266221727, + "grad_norm": 10.0234375, + "learning_rate": 7.525769373377828e-06, + "loss": 2.4711, + "mean_token_accuracy": 0.5102293862368258, + "step": 13346 + }, + { + "epoch": 2.474416017797553, + "grad_norm": 6.609375, + "learning_rate": 7.525583982202448e-06, + "loss": 3.2931, + "mean_token_accuracy": 0.4564947172156619, + "step": 13347 + }, + { + "epoch": 2.4746014089729327, + "grad_norm": 6.77734375, + "learning_rate": 7.525398591027067e-06, + "loss": 2.0569, + "mean_token_accuracy": 0.572173124858373, + "step": 13348 + }, + { + "epoch": 2.474786800148313, + "grad_norm": 7.37109375, + "learning_rate": 7.525213199851688e-06, + "loss": 3.2619, + "mean_token_accuracy": 0.46810165975103735, + "step": 13349 + }, + { + "epoch": 2.474972191323693, + "grad_norm": 7.640625, + "learning_rate": 7.5250278086763076e-06, + "loss": 3.283, + "mean_token_accuracy": 0.4435272536687631, + "step": 13350 + }, + { + "epoch": 2.4751575824990732, + "grad_norm": 7.67578125, + "learning_rate": 7.524842417500927e-06, + "loss": 3.7357, + "mean_token_accuracy": 0.4129402671966435, + "step": 13351 + }, + { + "epoch": 2.475342973674453, + "grad_norm": 8.2578125, + "learning_rate": 7.524657026325548e-06, + "loss": 2.8798, + "mean_token_accuracy": 0.49326347305389223, + "step": 13352 + }, + { + "epoch": 2.475528364849833, + "grad_norm": 11.15625, + "learning_rate": 7.524471635150168e-06, + "loss": 3.0943, + "mean_token_accuracy": 0.4886431919240476, + "step": 13353 + }, + { + "epoch": 2.4757137560252134, + "grad_norm": 7.83984375, + "learning_rate": 7.524286243974788e-06, + "loss": 2.8252, + "mean_token_accuracy": 0.48654353562005276, + "step": 13354 + }, + { + "epoch": 2.475899147200593, + "grad_norm": 7.0703125, + "learning_rate": 7.5241008527994075e-06, + "loss": 3.3741, + "mean_token_accuracy": 0.4473328324567994, + "step": 13355 + }, + { + "epoch": 2.4760845383759733, + "grad_norm": 8.640625, + "learning_rate": 7.523915461624027e-06, + "loss": 2.7767, + "mean_token_accuracy": 0.48945416004239534, + "step": 13356 + }, + { + "epoch": 2.4762699295513535, + "grad_norm": 8.671875, + "learning_rate": 7.523730070448647e-06, + "loss": 2.9909, + "mean_token_accuracy": 0.4589222107319315, + "step": 13357 + }, + { + "epoch": 2.4764553207267332, + "grad_norm": 7.32421875, + "learning_rate": 7.523544679273267e-06, + "loss": 2.4897, + "mean_token_accuracy": 0.5608597553734711, + "step": 13358 + }, + { + "epoch": 2.4766407119021134, + "grad_norm": 7.8203125, + "learning_rate": 7.523359288097887e-06, + "loss": 3.1675, + "mean_token_accuracy": 0.46964476654389925, + "step": 13359 + }, + { + "epoch": 2.4768261030774936, + "grad_norm": 13.1015625, + "learning_rate": 7.523173896922507e-06, + "loss": 3.1675, + "mean_token_accuracy": 0.4482343499197432, + "step": 13360 + }, + { + "epoch": 2.4770114942528734, + "grad_norm": 6.80078125, + "learning_rate": 7.522988505747128e-06, + "loss": 2.694, + "mean_token_accuracy": 0.48408050260888086, + "step": 13361 + }, + { + "epoch": 2.4771968854282536, + "grad_norm": 7.1796875, + "learning_rate": 7.5228031145717475e-06, + "loss": 2.6752, + "mean_token_accuracy": 0.504384799362211, + "step": 13362 + }, + { + "epoch": 2.4773822766036337, + "grad_norm": 8.515625, + "learning_rate": 7.522617723396367e-06, + "loss": 3.0982, + "mean_token_accuracy": 0.4556381464494702, + "step": 13363 + }, + { + "epoch": 2.477567667779014, + "grad_norm": 8.359375, + "learning_rate": 7.522432332220987e-06, + "loss": 2.6557, + "mean_token_accuracy": 0.4727175938248603, + "step": 13364 + }, + { + "epoch": 2.4777530589543937, + "grad_norm": 6.66796875, + "learning_rate": 7.5222469410456065e-06, + "loss": 2.7096, + "mean_token_accuracy": 0.4813586574213647, + "step": 13365 + }, + { + "epoch": 2.477938450129774, + "grad_norm": 8.203125, + "learning_rate": 7.522061549870226e-06, + "loss": 2.7456, + "mean_token_accuracy": 0.4972761558876938, + "step": 13366 + }, + { + "epoch": 2.478123841305154, + "grad_norm": 7.52734375, + "learning_rate": 7.521876158694847e-06, + "loss": 3.4023, + "mean_token_accuracy": 0.4281520827509086, + "step": 13367 + }, + { + "epoch": 2.478309232480534, + "grad_norm": 10.3828125, + "learning_rate": 7.521690767519466e-06, + "loss": 2.8668, + "mean_token_accuracy": 0.5126627218934912, + "step": 13368 + }, + { + "epoch": 2.478494623655914, + "grad_norm": 8.7109375, + "learning_rate": 7.521505376344087e-06, + "loss": 2.2928, + "mean_token_accuracy": 0.5527035556490637, + "step": 13369 + }, + { + "epoch": 2.478680014831294, + "grad_norm": 7.66015625, + "learning_rate": 7.521319985168707e-06, + "loss": 2.7185, + "mean_token_accuracy": 0.5340132364109798, + "step": 13370 + }, + { + "epoch": 2.478865406006674, + "grad_norm": 7.015625, + "learning_rate": 7.521134593993327e-06, + "loss": 2.7987, + "mean_token_accuracy": 0.493445833961127, + "step": 13371 + }, + { + "epoch": 2.479050797182054, + "grad_norm": 8.34375, + "learning_rate": 7.5209492028179465e-06, + "loss": 2.3068, + "mean_token_accuracy": 0.5200969758836289, + "step": 13372 + }, + { + "epoch": 2.4792361883574343, + "grad_norm": 7.953125, + "learning_rate": 7.520763811642566e-06, + "loss": 3.0508, + "mean_token_accuracy": 0.46788263283108644, + "step": 13373 + }, + { + "epoch": 2.479421579532814, + "grad_norm": 8.1796875, + "learning_rate": 7.520578420467186e-06, + "loss": 3.2592, + "mean_token_accuracy": 0.43542822858857055, + "step": 13374 + }, + { + "epoch": 2.4796069707081942, + "grad_norm": 6.921875, + "learning_rate": 7.520393029291806e-06, + "loss": 2.8971, + "mean_token_accuracy": 0.4828077959118999, + "step": 13375 + }, + { + "epoch": 2.4797923618835744, + "grad_norm": 7.34375, + "learning_rate": 7.520207638116426e-06, + "loss": 2.9779, + "mean_token_accuracy": 0.45186615186615187, + "step": 13376 + }, + { + "epoch": 2.4799777530589546, + "grad_norm": 8.578125, + "learning_rate": 7.520022246941046e-06, + "loss": 2.9864, + "mean_token_accuracy": 0.4780915287244401, + "step": 13377 + }, + { + "epoch": 2.4801631442343344, + "grad_norm": 8.3359375, + "learning_rate": 7.519836855765666e-06, + "loss": 2.8084, + "mean_token_accuracy": 0.49474615639862846, + "step": 13378 + }, + { + "epoch": 2.4803485354097146, + "grad_norm": 7.640625, + "learning_rate": 7.5196514645902866e-06, + "loss": 2.9678, + "mean_token_accuracy": 0.4765432098765432, + "step": 13379 + }, + { + "epoch": 2.4805339265850943, + "grad_norm": 7.59765625, + "learning_rate": 7.519466073414906e-06, + "loss": 2.6507, + "mean_token_accuracy": 0.493480150680962, + "step": 13380 + }, + { + "epoch": 2.4807193177604745, + "grad_norm": 7.4140625, + "learning_rate": 7.519280682239526e-06, + "loss": 2.6835, + "mean_token_accuracy": 0.48588987833939545, + "step": 13381 + }, + { + "epoch": 2.4809047089358547, + "grad_norm": 6.97265625, + "learning_rate": 7.5190952910641455e-06, + "loss": 2.4351, + "mean_token_accuracy": 0.5155837295298468, + "step": 13382 + }, + { + "epoch": 2.481090100111235, + "grad_norm": 6.4453125, + "learning_rate": 7.518909899888765e-06, + "loss": 2.6197, + "mean_token_accuracy": 0.5044617643736586, + "step": 13383 + }, + { + "epoch": 2.4812754912866146, + "grad_norm": 7.70703125, + "learning_rate": 7.518724508713386e-06, + "loss": 2.7679, + "mean_token_accuracy": 0.4906679764243615, + "step": 13384 + }, + { + "epoch": 2.481460882461995, + "grad_norm": 9.9296875, + "learning_rate": 7.518539117538006e-06, + "loss": 3.82, + "mean_token_accuracy": 0.4376544596719838, + "step": 13385 + }, + { + "epoch": 2.481646273637375, + "grad_norm": 7.109375, + "learning_rate": 7.518353726362626e-06, + "loss": 2.8567, + "mean_token_accuracy": 0.4698809961967857, + "step": 13386 + }, + { + "epoch": 2.4818316648127547, + "grad_norm": 7.4921875, + "learning_rate": 7.518168335187246e-06, + "loss": 3.1748, + "mean_token_accuracy": 0.46855796804588284, + "step": 13387 + }, + { + "epoch": 2.482017055988135, + "grad_norm": 6.16015625, + "learning_rate": 7.517982944011866e-06, + "loss": 2.6274, + "mean_token_accuracy": 0.515479049906592, + "step": 13388 + }, + { + "epoch": 2.482202447163515, + "grad_norm": 7.67578125, + "learning_rate": 7.5177975528364855e-06, + "loss": 3.2758, + "mean_token_accuracy": 0.47297999480384517, + "step": 13389 + }, + { + "epoch": 2.4823878383388953, + "grad_norm": 7.75390625, + "learning_rate": 7.517612161661105e-06, + "loss": 2.3158, + "mean_token_accuracy": 0.5441895449417075, + "step": 13390 + }, + { + "epoch": 2.482573229514275, + "grad_norm": 7.0078125, + "learning_rate": 7.517426770485725e-06, + "loss": 3.1104, + "mean_token_accuracy": 0.46416107382550337, + "step": 13391 + }, + { + "epoch": 2.4827586206896552, + "grad_norm": 8.25, + "learning_rate": 7.517241379310345e-06, + "loss": 2.5368, + "mean_token_accuracy": 0.5137787337007662, + "step": 13392 + }, + { + "epoch": 2.482944011865035, + "grad_norm": 8.34375, + "learning_rate": 7.517055988134966e-06, + "loss": 3.7509, + "mean_token_accuracy": 0.4327950605556875, + "step": 13393 + }, + { + "epoch": 2.483129403040415, + "grad_norm": 8.75, + "learning_rate": 7.5168705969595855e-06, + "loss": 2.3938, + "mean_token_accuracy": 0.5130890052356021, + "step": 13394 + }, + { + "epoch": 2.4833147942157954, + "grad_norm": 6.50390625, + "learning_rate": 7.516685205784205e-06, + "loss": 3.1548, + "mean_token_accuracy": 0.44699950811608463, + "step": 13395 + }, + { + "epoch": 2.4835001853911756, + "grad_norm": 6.7265625, + "learning_rate": 7.516499814608826e-06, + "loss": 2.8864, + "mean_token_accuracy": 0.47256312596981237, + "step": 13396 + }, + { + "epoch": 2.4836855765665553, + "grad_norm": 7.33984375, + "learning_rate": 7.516314423433445e-06, + "loss": 2.8077, + "mean_token_accuracy": 0.523390643742503, + "step": 13397 + }, + { + "epoch": 2.4838709677419355, + "grad_norm": 8.40625, + "learning_rate": 7.516129032258065e-06, + "loss": 2.6146, + "mean_token_accuracy": 0.49390937144713337, + "step": 13398 + }, + { + "epoch": 2.4840563589173157, + "grad_norm": 7.3515625, + "learning_rate": 7.5159436410826845e-06, + "loss": 3.4517, + "mean_token_accuracy": 0.4265389082462253, + "step": 13399 + }, + { + "epoch": 2.4842417500926954, + "grad_norm": 9.7265625, + "learning_rate": 7.515758249907304e-06, + "loss": 2.4666, + "mean_token_accuracy": 0.4980188211986132, + "step": 13400 + }, + { + "epoch": 2.4844271412680756, + "grad_norm": 8.578125, + "learning_rate": 7.5155728587319255e-06, + "loss": 2.8473, + "mean_token_accuracy": 0.4804493103938575, + "step": 13401 + }, + { + "epoch": 2.484612532443456, + "grad_norm": 8.1875, + "learning_rate": 7.515387467556545e-06, + "loss": 2.9203, + "mean_token_accuracy": 0.4864145502938893, + "step": 13402 + }, + { + "epoch": 2.4847979236188356, + "grad_norm": 11.3359375, + "learning_rate": 7.515202076381165e-06, + "loss": 2.925, + "mean_token_accuracy": 0.46713742531233027, + "step": 13403 + }, + { + "epoch": 2.4849833147942157, + "grad_norm": 9.859375, + "learning_rate": 7.515016685205785e-06, + "loss": 3.007, + "mean_token_accuracy": 0.49192928516525747, + "step": 13404 + }, + { + "epoch": 2.485168705969596, + "grad_norm": 6.8984375, + "learning_rate": 7.514831294030405e-06, + "loss": 3.1074, + "mean_token_accuracy": 0.46310925622298404, + "step": 13405 + }, + { + "epoch": 2.4853540971449757, + "grad_norm": 8.8515625, + "learning_rate": 7.5146459028550246e-06, + "loss": 2.9993, + "mean_token_accuracy": 0.48634661968729354, + "step": 13406 + }, + { + "epoch": 2.485539488320356, + "grad_norm": 9.28125, + "learning_rate": 7.514460511679644e-06, + "loss": 3.0121, + "mean_token_accuracy": 0.4662534435261708, + "step": 13407 + }, + { + "epoch": 2.485724879495736, + "grad_norm": 6.15625, + "learning_rate": 7.514275120504264e-06, + "loss": 2.7469, + "mean_token_accuracy": 0.4930687686871432, + "step": 13408 + }, + { + "epoch": 2.4859102706711163, + "grad_norm": 7.125, + "learning_rate": 7.514089729328885e-06, + "loss": 3.0081, + "mean_token_accuracy": 0.4764559190259645, + "step": 13409 + }, + { + "epoch": 2.486095661846496, + "grad_norm": 6.91796875, + "learning_rate": 7.513904338153505e-06, + "loss": 2.8525, + "mean_token_accuracy": 0.4921892434724392, + "step": 13410 + }, + { + "epoch": 2.486281053021876, + "grad_norm": 9.0234375, + "learning_rate": 7.5137189469781245e-06, + "loss": 2.6243, + "mean_token_accuracy": 0.484802661424467, + "step": 13411 + }, + { + "epoch": 2.4864664441972564, + "grad_norm": 7.37890625, + "learning_rate": 7.513533555802744e-06, + "loss": 3.3506, + "mean_token_accuracy": 0.44198342383538153, + "step": 13412 + }, + { + "epoch": 2.486651835372636, + "grad_norm": 7.69140625, + "learning_rate": 7.513348164627365e-06, + "loss": 2.4477, + "mean_token_accuracy": 0.5536558242980262, + "step": 13413 + }, + { + "epoch": 2.4868372265480163, + "grad_norm": 7.6796875, + "learning_rate": 7.513162773451984e-06, + "loss": 2.3984, + "mean_token_accuracy": 0.5429239505622981, + "step": 13414 + }, + { + "epoch": 2.4870226177233965, + "grad_norm": 7.9765625, + "learning_rate": 7.512977382276604e-06, + "loss": 3.301, + "mean_token_accuracy": 0.4442921236291127, + "step": 13415 + }, + { + "epoch": 2.4872080088987762, + "grad_norm": 7.75390625, + "learning_rate": 7.5127919911012236e-06, + "loss": 3.0548, + "mean_token_accuracy": 0.4658942983598021, + "step": 13416 + }, + { + "epoch": 2.4873934000741564, + "grad_norm": 8.1015625, + "learning_rate": 7.512606599925845e-06, + "loss": 2.9517, + "mean_token_accuracy": 0.5045510455104552, + "step": 13417 + }, + { + "epoch": 2.4875787912495366, + "grad_norm": 7.25, + "learning_rate": 7.5124212087504645e-06, + "loss": 3.0516, + "mean_token_accuracy": 0.45899091343854614, + "step": 13418 + }, + { + "epoch": 2.4877641824249164, + "grad_norm": 8.4375, + "learning_rate": 7.512235817575084e-06, + "loss": 2.8354, + "mean_token_accuracy": 0.48569033648664833, + "step": 13419 + }, + { + "epoch": 2.4879495736002966, + "grad_norm": 7.06640625, + "learning_rate": 7.512050426399704e-06, + "loss": 2.4433, + "mean_token_accuracy": 0.5433648286560429, + "step": 13420 + }, + { + "epoch": 2.4881349647756767, + "grad_norm": 8.1484375, + "learning_rate": 7.5118650352243235e-06, + "loss": 2.8987, + "mean_token_accuracy": 0.5172338806132435, + "step": 13421 + }, + { + "epoch": 2.488320355951057, + "grad_norm": 7.01953125, + "learning_rate": 7.511679644048944e-06, + "loss": 2.8077, + "mean_token_accuracy": 0.4860454115421003, + "step": 13422 + }, + { + "epoch": 2.4885057471264367, + "grad_norm": 6.37109375, + "learning_rate": 7.511494252873564e-06, + "loss": 2.9741, + "mean_token_accuracy": 0.4604071067184375, + "step": 13423 + }, + { + "epoch": 2.488691138301817, + "grad_norm": 7.3828125, + "learning_rate": 7.511308861698183e-06, + "loss": 3.0482, + "mean_token_accuracy": 0.47602070155261644, + "step": 13424 + }, + { + "epoch": 2.488876529477197, + "grad_norm": 6.68359375, + "learning_rate": 7.511123470522805e-06, + "loss": 2.6006, + "mean_token_accuracy": 0.5005136106831022, + "step": 13425 + }, + { + "epoch": 2.489061920652577, + "grad_norm": 6.94140625, + "learning_rate": 7.510938079347424e-06, + "loss": 2.8546, + "mean_token_accuracy": 0.47346611415586154, + "step": 13426 + }, + { + "epoch": 2.489247311827957, + "grad_norm": 8.109375, + "learning_rate": 7.510752688172044e-06, + "loss": 3.4283, + "mean_token_accuracy": 0.4365817495619356, + "step": 13427 + }, + { + "epoch": 2.489432703003337, + "grad_norm": 7.7109375, + "learning_rate": 7.5105672969966635e-06, + "loss": 3.2388, + "mean_token_accuracy": 0.4547099400655886, + "step": 13428 + }, + { + "epoch": 2.489618094178717, + "grad_norm": 7.70703125, + "learning_rate": 7.510381905821283e-06, + "loss": 3.4889, + "mean_token_accuracy": 0.43380668804397615, + "step": 13429 + }, + { + "epoch": 2.489803485354097, + "grad_norm": 9.484375, + "learning_rate": 7.510196514645904e-06, + "loss": 2.4831, + "mean_token_accuracy": 0.4905131166474179, + "step": 13430 + }, + { + "epoch": 2.4899888765294773, + "grad_norm": 9.75, + "learning_rate": 7.510011123470523e-06, + "loss": 2.6115, + "mean_token_accuracy": 0.4971209213051823, + "step": 13431 + }, + { + "epoch": 2.490174267704857, + "grad_norm": 10.953125, + "learning_rate": 7.509825732295143e-06, + "loss": 3.3875, + "mean_token_accuracy": 0.45209918286841366, + "step": 13432 + }, + { + "epoch": 2.4903596588802372, + "grad_norm": 6.05859375, + "learning_rate": 7.5096403411197634e-06, + "loss": 2.3436, + "mean_token_accuracy": 0.5541093343250278, + "step": 13433 + }, + { + "epoch": 2.4905450500556174, + "grad_norm": 8.359375, + "learning_rate": 7.509454949944384e-06, + "loss": 2.8157, + "mean_token_accuracy": 0.49122030396930794, + "step": 13434 + }, + { + "epoch": 2.4907304412309976, + "grad_norm": 9.578125, + "learning_rate": 7.5092695587690036e-06, + "loss": 3.1524, + "mean_token_accuracy": 0.4540645879732739, + "step": 13435 + }, + { + "epoch": 2.4909158324063774, + "grad_norm": 7.84765625, + "learning_rate": 7.509084167593623e-06, + "loss": 2.2787, + "mean_token_accuracy": 0.5460733644270768, + "step": 13436 + }, + { + "epoch": 2.4911012235817576, + "grad_norm": 6.36328125, + "learning_rate": 7.508898776418243e-06, + "loss": 2.8285, + "mean_token_accuracy": 0.48437681496108725, + "step": 13437 + }, + { + "epoch": 2.4912866147571378, + "grad_norm": 7.26171875, + "learning_rate": 7.5087133852428625e-06, + "loss": 2.4766, + "mean_token_accuracy": 0.4917558886509636, + "step": 13438 + }, + { + "epoch": 2.4914720059325175, + "grad_norm": 6.91015625, + "learning_rate": 7.508527994067483e-06, + "loss": 3.4809, + "mean_token_accuracy": 0.43765529704088546, + "step": 13439 + }, + { + "epoch": 2.4916573971078977, + "grad_norm": 11.96875, + "learning_rate": 7.508342602892103e-06, + "loss": 2.8066, + "mean_token_accuracy": 0.46888639481232075, + "step": 13440 + }, + { + "epoch": 2.491842788283278, + "grad_norm": 12.6796875, + "learning_rate": 7.508157211716723e-06, + "loss": 2.9643, + "mean_token_accuracy": 0.4918774074694356, + "step": 13441 + }, + { + "epoch": 2.4920281794586576, + "grad_norm": 11.3515625, + "learning_rate": 7.507971820541344e-06, + "loss": 2.6283, + "mean_token_accuracy": 0.49481441048034935, + "step": 13442 + }, + { + "epoch": 2.492213570634038, + "grad_norm": 7.2109375, + "learning_rate": 7.507786429365963e-06, + "loss": 2.6463, + "mean_token_accuracy": 0.48945425784339575, + "step": 13443 + }, + { + "epoch": 2.492398961809418, + "grad_norm": 7.68359375, + "learning_rate": 7.507601038190583e-06, + "loss": 2.6996, + "mean_token_accuracy": 0.5244840145690004, + "step": 13444 + }, + { + "epoch": 2.4925843529847977, + "grad_norm": 8.921875, + "learning_rate": 7.5074156470152026e-06, + "loss": 3.3345, + "mean_token_accuracy": 0.4383633732403631, + "step": 13445 + }, + { + "epoch": 2.492769744160178, + "grad_norm": 12.5390625, + "learning_rate": 7.507230255839822e-06, + "loss": 2.6583, + "mean_token_accuracy": 0.48631717771183647, + "step": 13446 + }, + { + "epoch": 2.492955135335558, + "grad_norm": 7.08203125, + "learning_rate": 7.507044864664442e-06, + "loss": 2.8589, + "mean_token_accuracy": 0.48365101503176816, + "step": 13447 + }, + { + "epoch": 2.4931405265109383, + "grad_norm": 8.703125, + "learning_rate": 7.506859473489062e-06, + "loss": 2.7982, + "mean_token_accuracy": 0.5099128867527786, + "step": 13448 + }, + { + "epoch": 2.493325917686318, + "grad_norm": 7.84765625, + "learning_rate": 7.506674082313683e-06, + "loss": 3.0923, + "mean_token_accuracy": 0.45694704049844237, + "step": 13449 + }, + { + "epoch": 2.4935113088616983, + "grad_norm": 7.53125, + "learning_rate": 7.5064886911383025e-06, + "loss": 2.6751, + "mean_token_accuracy": 0.5052465143021417, + "step": 13450 + }, + { + "epoch": 2.493696700037078, + "grad_norm": 8.1484375, + "learning_rate": 7.506303299962923e-06, + "loss": 3.4327, + "mean_token_accuracy": 0.4412366691802219, + "step": 13451 + }, + { + "epoch": 2.493882091212458, + "grad_norm": 10.5234375, + "learning_rate": 7.506117908787543e-06, + "loss": 3.5384, + "mean_token_accuracy": 0.45236631837738167, + "step": 13452 + }, + { + "epoch": 2.4940674823878384, + "grad_norm": 9.2421875, + "learning_rate": 7.505932517612162e-06, + "loss": 3.0827, + "mean_token_accuracy": 0.4602706832569308, + "step": 13453 + }, + { + "epoch": 2.4942528735632186, + "grad_norm": 7.7578125, + "learning_rate": 7.505747126436782e-06, + "loss": 2.8087, + "mean_token_accuracy": 0.47281921618204803, + "step": 13454 + }, + { + "epoch": 2.4944382647385983, + "grad_norm": 7.90234375, + "learning_rate": 7.5055617352614015e-06, + "loss": 2.3976, + "mean_token_accuracy": 0.508042416299297, + "step": 13455 + }, + { + "epoch": 2.4946236559139785, + "grad_norm": 8.015625, + "learning_rate": 7.505376344086022e-06, + "loss": 2.6759, + "mean_token_accuracy": 0.49007849978194507, + "step": 13456 + }, + { + "epoch": 2.4948090470893587, + "grad_norm": 7.890625, + "learning_rate": 7.5051909529106425e-06, + "loss": 2.8726, + "mean_token_accuracy": 0.47721644378405154, + "step": 13457 + }, + { + "epoch": 2.4949944382647384, + "grad_norm": 8.9609375, + "learning_rate": 7.505005561735262e-06, + "loss": 3.6097, + "mean_token_accuracy": 0.4575740318906606, + "step": 13458 + }, + { + "epoch": 2.4951798294401186, + "grad_norm": 9.4453125, + "learning_rate": 7.504820170559882e-06, + "loss": 3.3394, + "mean_token_accuracy": 0.46460490849765923, + "step": 13459 + }, + { + "epoch": 2.495365220615499, + "grad_norm": 9.359375, + "learning_rate": 7.504634779384502e-06, + "loss": 3.1626, + "mean_token_accuracy": 0.4410997866793079, + "step": 13460 + }, + { + "epoch": 2.495550611790879, + "grad_norm": 6.77734375, + "learning_rate": 7.504449388209122e-06, + "loss": 2.5744, + "mean_token_accuracy": 0.4923963698798136, + "step": 13461 + }, + { + "epoch": 2.4957360029662587, + "grad_norm": 7.26953125, + "learning_rate": 7.504263997033742e-06, + "loss": 3.1093, + "mean_token_accuracy": 0.47325968788357003, + "step": 13462 + }, + { + "epoch": 2.495921394141639, + "grad_norm": 9.1640625, + "learning_rate": 7.504078605858361e-06, + "loss": 3.992, + "mean_token_accuracy": 0.40954167783436074, + "step": 13463 + }, + { + "epoch": 2.4961067853170187, + "grad_norm": 10.2734375, + "learning_rate": 7.503893214682981e-06, + "loss": 2.299, + "mean_token_accuracy": 0.5656646473882729, + "step": 13464 + }, + { + "epoch": 2.496292176492399, + "grad_norm": 7.4609375, + "learning_rate": 7.503707823507602e-06, + "loss": 3.6524, + "mean_token_accuracy": 0.4281974080377013, + "step": 13465 + }, + { + "epoch": 2.496477567667779, + "grad_norm": 6.921875, + "learning_rate": 7.503522432332222e-06, + "loss": 2.5368, + "mean_token_accuracy": 0.4874261371444277, + "step": 13466 + }, + { + "epoch": 2.4966629588431593, + "grad_norm": 7.0859375, + "learning_rate": 7.5033370411568415e-06, + "loss": 3.6824, + "mean_token_accuracy": 0.4521659468655935, + "step": 13467 + }, + { + "epoch": 2.496848350018539, + "grad_norm": 7.91015625, + "learning_rate": 7.503151649981462e-06, + "loss": 2.5876, + "mean_token_accuracy": 0.5202537119792418, + "step": 13468 + }, + { + "epoch": 2.497033741193919, + "grad_norm": 7.6328125, + "learning_rate": 7.502966258806082e-06, + "loss": 2.855, + "mean_token_accuracy": 0.48551521818848553, + "step": 13469 + }, + { + "epoch": 2.4972191323692994, + "grad_norm": 6.703125, + "learning_rate": 7.502780867630701e-06, + "loss": 3.0999, + "mean_token_accuracy": 0.4549696027145483, + "step": 13470 + }, + { + "epoch": 2.497404523544679, + "grad_norm": 7.15234375, + "learning_rate": 7.502595476455321e-06, + "loss": 3.0055, + "mean_token_accuracy": 0.4769097437167265, + "step": 13471 + }, + { + "epoch": 2.4975899147200593, + "grad_norm": 6.15234375, + "learning_rate": 7.502410085279941e-06, + "loss": 2.8982, + "mean_token_accuracy": 0.4797608668576411, + "step": 13472 + }, + { + "epoch": 2.4977753058954395, + "grad_norm": 6.90234375, + "learning_rate": 7.502224694104562e-06, + "loss": 2.7075, + "mean_token_accuracy": 0.5038260658326248, + "step": 13473 + }, + { + "epoch": 2.4979606970708192, + "grad_norm": 7.04296875, + "learning_rate": 7.5020393029291816e-06, + "loss": 2.1819, + "mean_token_accuracy": 0.5451341518872215, + "step": 13474 + }, + { + "epoch": 2.4981460882461994, + "grad_norm": 7.79296875, + "learning_rate": 7.501853911753801e-06, + "loss": 2.7286, + "mean_token_accuracy": 0.5014287226161499, + "step": 13475 + }, + { + "epoch": 2.4983314794215796, + "grad_norm": 7.00390625, + "learning_rate": 7.501668520578421e-06, + "loss": 2.8358, + "mean_token_accuracy": 0.4750593824228028, + "step": 13476 + }, + { + "epoch": 2.4985168705969594, + "grad_norm": 7.88671875, + "learning_rate": 7.501483129403041e-06, + "loss": 2.7956, + "mean_token_accuracy": 0.5177061634603602, + "step": 13477 + }, + { + "epoch": 2.4987022617723396, + "grad_norm": 7.48046875, + "learning_rate": 7.501297738227661e-06, + "loss": 3.5313, + "mean_token_accuracy": 0.4282934131736527, + "step": 13478 + }, + { + "epoch": 2.4988876529477198, + "grad_norm": 5.89453125, + "learning_rate": 7.501112347052281e-06, + "loss": 2.8049, + "mean_token_accuracy": 0.48816316620128686, + "step": 13479 + }, + { + "epoch": 2.4990730441231, + "grad_norm": 7.796875, + "learning_rate": 7.5009269558769e-06, + "loss": 2.7458, + "mean_token_accuracy": 0.4665671323450997, + "step": 13480 + }, + { + "epoch": 2.4992584352984797, + "grad_norm": 10.125, + "learning_rate": 7.500741564701522e-06, + "loss": 2.8412, + "mean_token_accuracy": 0.4630314232902033, + "step": 13481 + }, + { + "epoch": 2.49944382647386, + "grad_norm": 8.7421875, + "learning_rate": 7.500556173526141e-06, + "loss": 2.8524, + "mean_token_accuracy": 0.4822684194435952, + "step": 13482 + }, + { + "epoch": 2.49962921764924, + "grad_norm": 7.3671875, + "learning_rate": 7.500370782350761e-06, + "loss": 2.9411, + "mean_token_accuracy": 0.5023119605425401, + "step": 13483 + }, + { + "epoch": 2.49981460882462, + "grad_norm": 8.2890625, + "learning_rate": 7.5001853911753805e-06, + "loss": 2.4033, + "mean_token_accuracy": 0.5398520953163517, + "step": 13484 + }, + { + "epoch": 2.5, + "grad_norm": 6.35546875, + "learning_rate": 7.500000000000001e-06, + "loss": 2.9542, + "mean_token_accuracy": 0.46430566417160096, + "step": 13485 + }, + { + "epoch": 2.50018539117538, + "grad_norm": 7.25, + "learning_rate": 7.499814608824621e-06, + "loss": 2.5042, + "mean_token_accuracy": 0.5147773545251544, + "step": 13486 + }, + { + "epoch": 2.5003707823507604, + "grad_norm": 7.203125, + "learning_rate": 7.49962921764924e-06, + "loss": 3.3609, + "mean_token_accuracy": 0.4247676463200201, + "step": 13487 + }, + { + "epoch": 2.50055617352614, + "grad_norm": 7.76171875, + "learning_rate": 7.49944382647386e-06, + "loss": 2.2649, + "mean_token_accuracy": 0.5556672891825887, + "step": 13488 + }, + { + "epoch": 2.5007415647015203, + "grad_norm": 8.7265625, + "learning_rate": 7.49925843529848e-06, + "loss": 2.7262, + "mean_token_accuracy": 0.5074726745482936, + "step": 13489 + }, + { + "epoch": 2.5009269558769, + "grad_norm": 7.9375, + "learning_rate": 7.499073044123101e-06, + "loss": 2.9391, + "mean_token_accuracy": 0.46142414860681114, + "step": 13490 + }, + { + "epoch": 2.5011123470522802, + "grad_norm": 7.125, + "learning_rate": 7.498887652947721e-06, + "loss": 2.7601, + "mean_token_accuracy": 0.4780564263322884, + "step": 13491 + }, + { + "epoch": 2.5012977382276604, + "grad_norm": 7.078125, + "learning_rate": 7.49870226177234e-06, + "loss": 2.6352, + "mean_token_accuracy": 0.5016646223935518, + "step": 13492 + }, + { + "epoch": 2.5014831294030406, + "grad_norm": 8.5234375, + "learning_rate": 7.49851687059696e-06, + "loss": 3.3239, + "mean_token_accuracy": 0.4602649006622517, + "step": 13493 + }, + { + "epoch": 2.5016685205784204, + "grad_norm": 8.3359375, + "learning_rate": 7.49833147942158e-06, + "loss": 2.7484, + "mean_token_accuracy": 0.47357818069115143, + "step": 13494 + }, + { + "epoch": 2.5018539117538006, + "grad_norm": 7.1484375, + "learning_rate": 7.4981460882462e-06, + "loss": 2.6622, + "mean_token_accuracy": 0.4862629660779366, + "step": 13495 + }, + { + "epoch": 2.5020393029291803, + "grad_norm": 8.234375, + "learning_rate": 7.49796069707082e-06, + "loss": 3.0296, + "mean_token_accuracy": 0.48395527806011324, + "step": 13496 + }, + { + "epoch": 2.5022246941045605, + "grad_norm": 12.6484375, + "learning_rate": 7.497775305895439e-06, + "loss": 3.598, + "mean_token_accuracy": 0.43443620670576516, + "step": 13497 + }, + { + "epoch": 2.5024100852799407, + "grad_norm": 10.5625, + "learning_rate": 7.497589914720061e-06, + "loss": 2.4522, + "mean_token_accuracy": 0.502421457841798, + "step": 13498 + }, + { + "epoch": 2.502595476455321, + "grad_norm": 7.3828125, + "learning_rate": 7.49740452354468e-06, + "loss": 2.7098, + "mean_token_accuracy": 0.4844728956687551, + "step": 13499 + }, + { + "epoch": 2.5027808676307006, + "grad_norm": 9.28125, + "learning_rate": 7.4972191323693e-06, + "loss": 3.0257, + "mean_token_accuracy": 0.4826392894220787, + "step": 13500 + }, + { + "epoch": 2.502966258806081, + "grad_norm": 10.6328125, + "learning_rate": 7.49703374119392e-06, + "loss": 2.7512, + "mean_token_accuracy": 0.49948418156808805, + "step": 13501 + }, + { + "epoch": 2.503151649981461, + "grad_norm": 8.0625, + "learning_rate": 7.496848350018539e-06, + "loss": 3.6229, + "mean_token_accuracy": 0.4290468986384266, + "step": 13502 + }, + { + "epoch": 2.5033370411568407, + "grad_norm": 6.20703125, + "learning_rate": 7.49666295884316e-06, + "loss": 2.3191, + "mean_token_accuracy": 0.5459044790893344, + "step": 13503 + }, + { + "epoch": 2.503522432332221, + "grad_norm": 11.2109375, + "learning_rate": 7.496477567667779e-06, + "loss": 2.6314, + "mean_token_accuracy": 0.47930372807017546, + "step": 13504 + }, + { + "epoch": 2.503707823507601, + "grad_norm": 10.328125, + "learning_rate": 7.496292176492399e-06, + "loss": 2.5754, + "mean_token_accuracy": 0.5025178592341023, + "step": 13505 + }, + { + "epoch": 2.5038932146829813, + "grad_norm": 6.59765625, + "learning_rate": 7.49610678531702e-06, + "loss": 2.5662, + "mean_token_accuracy": 0.5095532831001076, + "step": 13506 + }, + { + "epoch": 2.504078605858361, + "grad_norm": 9.84375, + "learning_rate": 7.49592139414164e-06, + "loss": 2.5524, + "mean_token_accuracy": 0.5043478260869565, + "step": 13507 + }, + { + "epoch": 2.5042639970337413, + "grad_norm": 7.81640625, + "learning_rate": 7.49573600296626e-06, + "loss": 3.1508, + "mean_token_accuracy": 0.4525327878714956, + "step": 13508 + }, + { + "epoch": 2.504449388209121, + "grad_norm": 7.30078125, + "learning_rate": 7.495550611790879e-06, + "loss": 3.0043, + "mean_token_accuracy": 0.45413533834586467, + "step": 13509 + }, + { + "epoch": 2.504634779384501, + "grad_norm": 10.921875, + "learning_rate": 7.495365220615499e-06, + "loss": 2.8416, + "mean_token_accuracy": 0.47270471464019853, + "step": 13510 + }, + { + "epoch": 2.5048201705598814, + "grad_norm": 8.90625, + "learning_rate": 7.495179829440119e-06, + "loss": 2.9219, + "mean_token_accuracy": 0.49427853753837564, + "step": 13511 + }, + { + "epoch": 2.5050055617352616, + "grad_norm": 8.8671875, + "learning_rate": 7.494994438264739e-06, + "loss": 3.2972, + "mean_token_accuracy": 0.47516422726748875, + "step": 13512 + }, + { + "epoch": 2.5051909529106413, + "grad_norm": 8.4453125, + "learning_rate": 7.494809047089359e-06, + "loss": 3.1952, + "mean_token_accuracy": 0.46358519529251235, + "step": 13513 + }, + { + "epoch": 2.5053763440860215, + "grad_norm": 7.76171875, + "learning_rate": 7.494623655913979e-06, + "loss": 2.9123, + "mean_token_accuracy": 0.46158714086824454, + "step": 13514 + }, + { + "epoch": 2.5055617352614017, + "grad_norm": 9.1015625, + "learning_rate": 7.4944382647386e-06, + "loss": 3.9804, + "mean_token_accuracy": 0.40692307692307694, + "step": 13515 + }, + { + "epoch": 2.5057471264367814, + "grad_norm": 7.390625, + "learning_rate": 7.494252873563219e-06, + "loss": 2.8283, + "mean_token_accuracy": 0.5033697768458889, + "step": 13516 + }, + { + "epoch": 2.5059325176121616, + "grad_norm": 10.3984375, + "learning_rate": 7.494067482387839e-06, + "loss": 2.7655, + "mean_token_accuracy": 0.4876608808596056, + "step": 13517 + }, + { + "epoch": 2.506117908787542, + "grad_norm": 11.4375, + "learning_rate": 7.493882091212459e-06, + "loss": 4.025, + "mean_token_accuracy": 0.41453926574276684, + "step": 13518 + }, + { + "epoch": 2.506303299962922, + "grad_norm": 10.0234375, + "learning_rate": 7.493696700037078e-06, + "loss": 2.8974, + "mean_token_accuracy": 0.47974772249474423, + "step": 13519 + }, + { + "epoch": 2.5064886911383017, + "grad_norm": 6.4921875, + "learning_rate": 7.493511308861699e-06, + "loss": 2.9625, + "mean_token_accuracy": 0.4745103686635945, + "step": 13520 + }, + { + "epoch": 2.506674082313682, + "grad_norm": 7.87890625, + "learning_rate": 7.493325917686318e-06, + "loss": 2.9649, + "mean_token_accuracy": 0.5141062376019396, + "step": 13521 + }, + { + "epoch": 2.5068594734890617, + "grad_norm": 8.234375, + "learning_rate": 7.493140526510939e-06, + "loss": 3.3066, + "mean_token_accuracy": 0.44746600741656367, + "step": 13522 + }, + { + "epoch": 2.507044864664442, + "grad_norm": 6.9375, + "learning_rate": 7.492955135335559e-06, + "loss": 2.3291, + "mean_token_accuracy": 0.5433839479392625, + "step": 13523 + }, + { + "epoch": 2.507230255839822, + "grad_norm": 7.04296875, + "learning_rate": 7.492769744160179e-06, + "loss": 2.8499, + "mean_token_accuracy": 0.45614495798319327, + "step": 13524 + }, + { + "epoch": 2.5074156470152023, + "grad_norm": 8.3671875, + "learning_rate": 7.492584352984799e-06, + "loss": 2.9179, + "mean_token_accuracy": 0.46381773108251684, + "step": 13525 + }, + { + "epoch": 2.507601038190582, + "grad_norm": 6.2578125, + "learning_rate": 7.492398961809418e-06, + "loss": 2.4729, + "mean_token_accuracy": 0.518233259981138, + "step": 13526 + }, + { + "epoch": 2.507786429365962, + "grad_norm": 9.1640625, + "learning_rate": 7.492213570634038e-06, + "loss": 3.3009, + "mean_token_accuracy": 0.47492447129909365, + "step": 13527 + }, + { + "epoch": 2.5079718205413424, + "grad_norm": 7.10546875, + "learning_rate": 7.492028179458658e-06, + "loss": 2.9424, + "mean_token_accuracy": 0.47959183673469385, + "step": 13528 + }, + { + "epoch": 2.508157211716722, + "grad_norm": 6.6875, + "learning_rate": 7.491842788283278e-06, + "loss": 2.7751, + "mean_token_accuracy": 0.5188515081206496, + "step": 13529 + }, + { + "epoch": 2.5083426028921023, + "grad_norm": 6.9375, + "learning_rate": 7.491657397107899e-06, + "loss": 2.7138, + "mean_token_accuracy": 0.49895148888578217, + "step": 13530 + }, + { + "epoch": 2.5085279940674825, + "grad_norm": 7.18359375, + "learning_rate": 7.491472005932518e-06, + "loss": 2.7392, + "mean_token_accuracy": 0.47947650208209397, + "step": 13531 + }, + { + "epoch": 2.5087133852428627, + "grad_norm": 7.09765625, + "learning_rate": 7.491286614757139e-06, + "loss": 2.7898, + "mean_token_accuracy": 0.471240755957272, + "step": 13532 + }, + { + "epoch": 2.5088987764182424, + "grad_norm": 8.1953125, + "learning_rate": 7.491101223581758e-06, + "loss": 2.9585, + "mean_token_accuracy": 0.46684894053315107, + "step": 13533 + }, + { + "epoch": 2.5090841675936226, + "grad_norm": 6.56640625, + "learning_rate": 7.490915832406378e-06, + "loss": 2.7227, + "mean_token_accuracy": 0.4837103428652724, + "step": 13534 + }, + { + "epoch": 2.5092695587690024, + "grad_norm": 8.375, + "learning_rate": 7.490730441230998e-06, + "loss": 3.0762, + "mean_token_accuracy": 0.473107228502912, + "step": 13535 + }, + { + "epoch": 2.5094549499443826, + "grad_norm": 11.4296875, + "learning_rate": 7.490545050055617e-06, + "loss": 3.2919, + "mean_token_accuracy": 0.4486021505376344, + "step": 13536 + }, + { + "epoch": 2.5096403411197628, + "grad_norm": 11.40625, + "learning_rate": 7.490359658880238e-06, + "loss": 2.6599, + "mean_token_accuracy": 0.49650880388585306, + "step": 13537 + }, + { + "epoch": 2.509825732295143, + "grad_norm": 8.6796875, + "learning_rate": 7.490174267704858e-06, + "loss": 2.8254, + "mean_token_accuracy": 0.4617489737529544, + "step": 13538 + }, + { + "epoch": 2.5100111234705227, + "grad_norm": 8.6328125, + "learning_rate": 7.489988876529478e-06, + "loss": 3.3309, + "mean_token_accuracy": 0.46528035647976235, + "step": 13539 + }, + { + "epoch": 2.510196514645903, + "grad_norm": 10.4921875, + "learning_rate": 7.4898034853540976e-06, + "loss": 2.5423, + "mean_token_accuracy": 0.5006151574803149, + "step": 13540 + }, + { + "epoch": 2.510381905821283, + "grad_norm": 7.609375, + "learning_rate": 7.489618094178718e-06, + "loss": 2.7209, + "mean_token_accuracy": 0.5164339116941741, + "step": 13541 + }, + { + "epoch": 2.510567296996663, + "grad_norm": 15.1875, + "learning_rate": 7.489432703003338e-06, + "loss": 2.6962, + "mean_token_accuracy": 0.48496379035227694, + "step": 13542 + }, + { + "epoch": 2.510752688172043, + "grad_norm": 12.921875, + "learning_rate": 7.489247311827957e-06, + "loss": 2.7829, + "mean_token_accuracy": 0.4890592334494773, + "step": 13543 + }, + { + "epoch": 2.510938079347423, + "grad_norm": 10.4453125, + "learning_rate": 7.489061920652577e-06, + "loss": 3.534, + "mean_token_accuracy": 0.4190061763054464, + "step": 13544 + }, + { + "epoch": 2.5111234705228034, + "grad_norm": 7.71484375, + "learning_rate": 7.488876529477197e-06, + "loss": 2.2997, + "mean_token_accuracy": 0.507940640458214, + "step": 13545 + }, + { + "epoch": 2.511308861698183, + "grad_norm": 12.6875, + "learning_rate": 7.488691138301818e-06, + "loss": 2.7636, + "mean_token_accuracy": 0.5020363236103468, + "step": 13546 + }, + { + "epoch": 2.5114942528735633, + "grad_norm": 10.1171875, + "learning_rate": 7.488505747126438e-06, + "loss": 2.8945, + "mean_token_accuracy": 0.5111307031077805, + "step": 13547 + }, + { + "epoch": 2.511679644048943, + "grad_norm": 8.7109375, + "learning_rate": 7.488320355951057e-06, + "loss": 3.3234, + "mean_token_accuracy": 0.4332164058283864, + "step": 13548 + }, + { + "epoch": 2.5118650352243233, + "grad_norm": 8.8203125, + "learning_rate": 7.488134964775678e-06, + "loss": 2.6633, + "mean_token_accuracy": 0.4996640236527348, + "step": 13549 + }, + { + "epoch": 2.5120504263997034, + "grad_norm": 8.875, + "learning_rate": 7.487949573600297e-06, + "loss": 3.1899, + "mean_token_accuracy": 0.464041095890411, + "step": 13550 + }, + { + "epoch": 2.5122358175750836, + "grad_norm": 6.98828125, + "learning_rate": 7.487764182424917e-06, + "loss": 3.1297, + "mean_token_accuracy": 0.4735901509134233, + "step": 13551 + }, + { + "epoch": 2.5124212087504634, + "grad_norm": 9.4921875, + "learning_rate": 7.487578791249537e-06, + "loss": 2.8181, + "mean_token_accuracy": 0.46585160202360876, + "step": 13552 + }, + { + "epoch": 2.5126065999258436, + "grad_norm": 7.85546875, + "learning_rate": 7.487393400074156e-06, + "loss": 3.0745, + "mean_token_accuracy": 0.47018909899888767, + "step": 13553 + }, + { + "epoch": 2.5127919911012233, + "grad_norm": 9.109375, + "learning_rate": 7.487208008898778e-06, + "loss": 2.9643, + "mean_token_accuracy": 0.4624892703862661, + "step": 13554 + }, + { + "epoch": 2.5129773822766035, + "grad_norm": 8.796875, + "learning_rate": 7.487022617723397e-06, + "loss": 2.6436, + "mean_token_accuracy": 0.506547300908605, + "step": 13555 + }, + { + "epoch": 2.5131627734519837, + "grad_norm": 11.1796875, + "learning_rate": 7.486837226548017e-06, + "loss": 2.9021, + "mean_token_accuracy": 0.4730668983492615, + "step": 13556 + }, + { + "epoch": 2.513348164627364, + "grad_norm": 8.890625, + "learning_rate": 7.486651835372637e-06, + "loss": 3.1534, + "mean_token_accuracy": 0.4651646871556263, + "step": 13557 + }, + { + "epoch": 2.5135335558027436, + "grad_norm": 7.82421875, + "learning_rate": 7.486466444197257e-06, + "loss": 3.5317, + "mean_token_accuracy": 0.431175805539853, + "step": 13558 + }, + { + "epoch": 2.513718946978124, + "grad_norm": 9.4609375, + "learning_rate": 7.486281053021877e-06, + "loss": 2.5818, + "mean_token_accuracy": 0.4853761393007754, + "step": 13559 + }, + { + "epoch": 2.513904338153504, + "grad_norm": 7.0859375, + "learning_rate": 7.486095661846496e-06, + "loss": 3.1095, + "mean_token_accuracy": 0.46477641768673494, + "step": 13560 + }, + { + "epoch": 2.5140897293288837, + "grad_norm": 7.61328125, + "learning_rate": 7.485910270671116e-06, + "loss": 2.6238, + "mean_token_accuracy": 0.504364694471387, + "step": 13561 + }, + { + "epoch": 2.514275120504264, + "grad_norm": 8.28125, + "learning_rate": 7.485724879495737e-06, + "loss": 3.1256, + "mean_token_accuracy": 0.43881317433276545, + "step": 13562 + }, + { + "epoch": 2.514460511679644, + "grad_norm": 6.703125, + "learning_rate": 7.485539488320357e-06, + "loss": 2.6103, + "mean_token_accuracy": 0.5021589450344264, + "step": 13563 + }, + { + "epoch": 2.5146459028550243, + "grad_norm": 12.484375, + "learning_rate": 7.485354097144977e-06, + "loss": 2.314, + "mean_token_accuracy": 0.5506278386321133, + "step": 13564 + }, + { + "epoch": 2.514831294030404, + "grad_norm": 9.109375, + "learning_rate": 7.485168705969596e-06, + "loss": 3.0727, + "mean_token_accuracy": 0.43622363575493805, + "step": 13565 + }, + { + "epoch": 2.5150166852057843, + "grad_norm": 7.66015625, + "learning_rate": 7.484983314794216e-06, + "loss": 3.4117, + "mean_token_accuracy": 0.44035006909258406, + "step": 13566 + }, + { + "epoch": 2.515202076381164, + "grad_norm": 9.4375, + "learning_rate": 7.4847979236188364e-06, + "loss": 2.9408, + "mean_token_accuracy": 0.4542483660130719, + "step": 13567 + }, + { + "epoch": 2.515387467556544, + "grad_norm": 8.640625, + "learning_rate": 7.484612532443456e-06, + "loss": 3.6595, + "mean_token_accuracy": 0.43819530284301605, + "step": 13568 + }, + { + "epoch": 2.5155728587319244, + "grad_norm": 7.28515625, + "learning_rate": 7.484427141268076e-06, + "loss": 3.0287, + "mean_token_accuracy": 0.4590078328981723, + "step": 13569 + }, + { + "epoch": 2.5157582499073046, + "grad_norm": 8.78125, + "learning_rate": 7.484241750092697e-06, + "loss": 2.7276, + "mean_token_accuracy": 0.5053715308863026, + "step": 13570 + }, + { + "epoch": 2.5159436410826843, + "grad_norm": 8.0625, + "learning_rate": 7.484056358917317e-06, + "loss": 2.8367, + "mean_token_accuracy": 0.5202349869451697, + "step": 13571 + }, + { + "epoch": 2.5161290322580645, + "grad_norm": 8.46875, + "learning_rate": 7.483870967741936e-06, + "loss": 3.4563, + "mean_token_accuracy": 0.484456322970882, + "step": 13572 + }, + { + "epoch": 2.5163144234334447, + "grad_norm": 12.453125, + "learning_rate": 7.483685576566556e-06, + "loss": 3.3911, + "mean_token_accuracy": 0.4842529296875, + "step": 13573 + }, + { + "epoch": 2.5164998146088244, + "grad_norm": 9.921875, + "learning_rate": 7.483500185391176e-06, + "loss": 2.2968, + "mean_token_accuracy": 0.5326054883446444, + "step": 13574 + }, + { + "epoch": 2.5166852057842046, + "grad_norm": 7.03515625, + "learning_rate": 7.483314794215796e-06, + "loss": 2.2568, + "mean_token_accuracy": 0.5827769605069594, + "step": 13575 + }, + { + "epoch": 2.516870596959585, + "grad_norm": 8.828125, + "learning_rate": 7.483129403040416e-06, + "loss": 2.6532, + "mean_token_accuracy": 0.4973130951258603, + "step": 13576 + }, + { + "epoch": 2.517055988134965, + "grad_norm": 9.4296875, + "learning_rate": 7.482944011865035e-06, + "loss": 2.4652, + "mean_token_accuracy": 0.5212249208025344, + "step": 13577 + }, + { + "epoch": 2.5172413793103448, + "grad_norm": 13.390625, + "learning_rate": 7.482758620689656e-06, + "loss": 2.8524, + "mean_token_accuracy": 0.4833174451858913, + "step": 13578 + }, + { + "epoch": 2.517426770485725, + "grad_norm": 7.0078125, + "learning_rate": 7.482573229514276e-06, + "loss": 2.688, + "mean_token_accuracy": 0.4966638359820454, + "step": 13579 + }, + { + "epoch": 2.5176121616611047, + "grad_norm": 10.7421875, + "learning_rate": 7.482387838338896e-06, + "loss": 2.8541, + "mean_token_accuracy": 0.4936293182713779, + "step": 13580 + }, + { + "epoch": 2.517797552836485, + "grad_norm": 12.7109375, + "learning_rate": 7.482202447163516e-06, + "loss": 3.1062, + "mean_token_accuracy": 0.4555104589447393, + "step": 13581 + }, + { + "epoch": 2.517982944011865, + "grad_norm": 8.828125, + "learning_rate": 7.482017055988135e-06, + "loss": 4.0784, + "mean_token_accuracy": 0.3941176470588235, + "step": 13582 + }, + { + "epoch": 2.5181683351872453, + "grad_norm": 7.14453125, + "learning_rate": 7.481831664812755e-06, + "loss": 2.1591, + "mean_token_accuracy": 0.5567731713757592, + "step": 13583 + }, + { + "epoch": 2.518353726362625, + "grad_norm": 7.5703125, + "learning_rate": 7.4816462736373755e-06, + "loss": 2.9052, + "mean_token_accuracy": 0.47465498938428874, + "step": 13584 + }, + { + "epoch": 2.518539117538005, + "grad_norm": 7.0703125, + "learning_rate": 7.481460882461995e-06, + "loss": 2.642, + "mean_token_accuracy": 0.4950006328312872, + "step": 13585 + }, + { + "epoch": 2.5187245087133854, + "grad_norm": 10.109375, + "learning_rate": 7.481275491286616e-06, + "loss": 2.6972, + "mean_token_accuracy": 0.4760025597269625, + "step": 13586 + }, + { + "epoch": 2.518909899888765, + "grad_norm": 7.82421875, + "learning_rate": 7.481090100111236e-06, + "loss": 3.2981, + "mean_token_accuracy": 0.4650695258837326, + "step": 13587 + }, + { + "epoch": 2.5190952910641453, + "grad_norm": 8.078125, + "learning_rate": 7.480904708935856e-06, + "loss": 2.8837, + "mean_token_accuracy": 0.48855258051998446, + "step": 13588 + }, + { + "epoch": 2.5192806822395255, + "grad_norm": 8.0625, + "learning_rate": 7.480719317760475e-06, + "loss": 2.3827, + "mean_token_accuracy": 0.5568291505791506, + "step": 13589 + }, + { + "epoch": 2.5194660734149057, + "grad_norm": 6.84375, + "learning_rate": 7.480533926585095e-06, + "loss": 2.1944, + "mean_token_accuracy": 0.533155210104445, + "step": 13590 + }, + { + "epoch": 2.5196514645902854, + "grad_norm": 6.6328125, + "learning_rate": 7.480348535409715e-06, + "loss": 2.4606, + "mean_token_accuracy": 0.5226680562793121, + "step": 13591 + }, + { + "epoch": 2.5198368557656656, + "grad_norm": 6.53125, + "learning_rate": 7.480163144234335e-06, + "loss": 2.8942, + "mean_token_accuracy": 0.4800244910454615, + "step": 13592 + }, + { + "epoch": 2.5200222469410454, + "grad_norm": 6.52734375, + "learning_rate": 7.479977753058955e-06, + "loss": 2.2138, + "mean_token_accuracy": 0.5512655512655512, + "step": 13593 + }, + { + "epoch": 2.5202076381164256, + "grad_norm": 8.484375, + "learning_rate": 7.479792361883575e-06, + "loss": 3.1283, + "mean_token_accuracy": 0.4785069235400361, + "step": 13594 + }, + { + "epoch": 2.5203930292918058, + "grad_norm": 6.54296875, + "learning_rate": 7.479606970708195e-06, + "loss": 3.3479, + "mean_token_accuracy": 0.46871671991480296, + "step": 13595 + }, + { + "epoch": 2.520578420467186, + "grad_norm": 6.89453125, + "learning_rate": 7.4794215795328154e-06, + "loss": 3.3745, + "mean_token_accuracy": 0.4316820276497696, + "step": 13596 + }, + { + "epoch": 2.5207638116425657, + "grad_norm": 6.68359375, + "learning_rate": 7.479236188357435e-06, + "loss": 2.654, + "mean_token_accuracy": 0.4759564293304995, + "step": 13597 + }, + { + "epoch": 2.520949202817946, + "grad_norm": 7.44140625, + "learning_rate": 7.479050797182055e-06, + "loss": 2.4964, + "mean_token_accuracy": 0.4986636657757772, + "step": 13598 + }, + { + "epoch": 2.521134593993326, + "grad_norm": 8.2265625, + "learning_rate": 7.478865406006674e-06, + "loss": 3.299, + "mean_token_accuracy": 0.44449463579898363, + "step": 13599 + }, + { + "epoch": 2.521319985168706, + "grad_norm": 6.3515625, + "learning_rate": 7.478680014831294e-06, + "loss": 2.7694, + "mean_token_accuracy": 0.5060827250608273, + "step": 13600 + }, + { + "epoch": 2.521505376344086, + "grad_norm": 7.2421875, + "learning_rate": 7.4784946236559145e-06, + "loss": 2.6156, + "mean_token_accuracy": 0.48412103025756437, + "step": 13601 + }, + { + "epoch": 2.521690767519466, + "grad_norm": 7.66796875, + "learning_rate": 7.478309232480535e-06, + "loss": 3.1582, + "mean_token_accuracy": 0.4842410655413425, + "step": 13602 + }, + { + "epoch": 2.5218761586948464, + "grad_norm": 9.375, + "learning_rate": 7.478123841305155e-06, + "loss": 3.2825, + "mean_token_accuracy": 0.4431831904359377, + "step": 13603 + }, + { + "epoch": 2.522061549870226, + "grad_norm": 7.08984375, + "learning_rate": 7.477938450129775e-06, + "loss": 2.9868, + "mean_token_accuracy": 0.4723636363636364, + "step": 13604 + }, + { + "epoch": 2.5222469410456063, + "grad_norm": 8.015625, + "learning_rate": 7.477753058954395e-06, + "loss": 3.2567, + "mean_token_accuracy": 0.4584993002476047, + "step": 13605 + }, + { + "epoch": 2.522432332220986, + "grad_norm": 9.8984375, + "learning_rate": 7.477567667779014e-06, + "loss": 3.3552, + "mean_token_accuracy": 0.44371822803195354, + "step": 13606 + }, + { + "epoch": 2.5226177233963663, + "grad_norm": 10.2890625, + "learning_rate": 7.477382276603634e-06, + "loss": 2.6186, + "mean_token_accuracy": 0.48052095130237826, + "step": 13607 + }, + { + "epoch": 2.5228031145717464, + "grad_norm": 7.89453125, + "learning_rate": 7.477196885428254e-06, + "loss": 3.0229, + "mean_token_accuracy": 0.45211667527103766, + "step": 13608 + }, + { + "epoch": 2.5229885057471266, + "grad_norm": 10.4921875, + "learning_rate": 7.477011494252873e-06, + "loss": 3.7733, + "mean_token_accuracy": 0.4479822719850711, + "step": 13609 + }, + { + "epoch": 2.5231738969225064, + "grad_norm": 9.046875, + "learning_rate": 7.476826103077494e-06, + "loss": 2.4991, + "mean_token_accuracy": 0.49696969696969695, + "step": 13610 + }, + { + "epoch": 2.5233592880978866, + "grad_norm": 7.53515625, + "learning_rate": 7.476640711902114e-06, + "loss": 3.2552, + "mean_token_accuracy": 0.4511055052678439, + "step": 13611 + }, + { + "epoch": 2.5235446792732668, + "grad_norm": 7.04296875, + "learning_rate": 7.476455320726734e-06, + "loss": 2.6726, + "mean_token_accuracy": 0.5106801573917932, + "step": 13612 + }, + { + "epoch": 2.5237300704486465, + "grad_norm": 8.6640625, + "learning_rate": 7.4762699295513545e-06, + "loss": 2.9615, + "mean_token_accuracy": 0.48330182734719596, + "step": 13613 + }, + { + "epoch": 2.5239154616240267, + "grad_norm": 6.16796875, + "learning_rate": 7.476084538375974e-06, + "loss": 2.8263, + "mean_token_accuracy": 0.484375, + "step": 13614 + }, + { + "epoch": 2.524100852799407, + "grad_norm": 8.1328125, + "learning_rate": 7.475899147200594e-06, + "loss": 2.8008, + "mean_token_accuracy": 0.4865752642102257, + "step": 13615 + }, + { + "epoch": 2.524286243974787, + "grad_norm": 8.03125, + "learning_rate": 7.475713756025213e-06, + "loss": 2.7067, + "mean_token_accuracy": 0.48707671043538353, + "step": 13616 + }, + { + "epoch": 2.524471635150167, + "grad_norm": 8.515625, + "learning_rate": 7.475528364849833e-06, + "loss": 3.4345, + "mean_token_accuracy": 0.4487037719621575, + "step": 13617 + }, + { + "epoch": 2.524657026325547, + "grad_norm": 8.1171875, + "learning_rate": 7.4753429736744535e-06, + "loss": 2.8192, + "mean_token_accuracy": 0.4895811184350415, + "step": 13618 + }, + { + "epoch": 2.5248424175009268, + "grad_norm": 8.296875, + "learning_rate": 7.475157582499074e-06, + "loss": 2.9471, + "mean_token_accuracy": 0.4761296503273192, + "step": 13619 + }, + { + "epoch": 2.525027808676307, + "grad_norm": 7.828125, + "learning_rate": 7.474972191323694e-06, + "loss": 2.7374, + "mean_token_accuracy": 0.47462259239979177, + "step": 13620 + }, + { + "epoch": 2.525213199851687, + "grad_norm": 7.05078125, + "learning_rate": 7.474786800148313e-06, + "loss": 2.5386, + "mean_token_accuracy": 0.5518618734699137, + "step": 13621 + }, + { + "epoch": 2.5253985910270673, + "grad_norm": 7.52734375, + "learning_rate": 7.474601408972934e-06, + "loss": 2.7704, + "mean_token_accuracy": 0.5014114326040932, + "step": 13622 + }, + { + "epoch": 2.525583982202447, + "grad_norm": 8.2265625, + "learning_rate": 7.4744160177975535e-06, + "loss": 2.902, + "mean_token_accuracy": 0.47486755394753843, + "step": 13623 + }, + { + "epoch": 2.5257693733778273, + "grad_norm": 7.421875, + "learning_rate": 7.474230626622173e-06, + "loss": 2.9414, + "mean_token_accuracy": 0.5028134143596669, + "step": 13624 + }, + { + "epoch": 2.525954764553207, + "grad_norm": 6.61328125, + "learning_rate": 7.474045235446793e-06, + "loss": 2.8455, + "mean_token_accuracy": 0.483059624204764, + "step": 13625 + }, + { + "epoch": 2.526140155728587, + "grad_norm": 7.234375, + "learning_rate": 7.473859844271412e-06, + "loss": 2.813, + "mean_token_accuracy": 0.5222148978246539, + "step": 13626 + }, + { + "epoch": 2.5263255469039674, + "grad_norm": 6.35546875, + "learning_rate": 7.473674453096034e-06, + "loss": 2.5901, + "mean_token_accuracy": 0.4913294797687861, + "step": 13627 + }, + { + "epoch": 2.5265109380793476, + "grad_norm": 6.2578125, + "learning_rate": 7.473489061920653e-06, + "loss": 2.4609, + "mean_token_accuracy": 0.521511017838405, + "step": 13628 + }, + { + "epoch": 2.5266963292547273, + "grad_norm": 6.33984375, + "learning_rate": 7.473303670745273e-06, + "loss": 2.7149, + "mean_token_accuracy": 0.5002018570851837, + "step": 13629 + }, + { + "epoch": 2.5268817204301075, + "grad_norm": 8.171875, + "learning_rate": 7.4731182795698935e-06, + "loss": 3.0819, + "mean_token_accuracy": 0.4727933356965615, + "step": 13630 + }, + { + "epoch": 2.5270671116054877, + "grad_norm": 7.50390625, + "learning_rate": 7.472932888394513e-06, + "loss": 3.085, + "mean_token_accuracy": 0.4551306323362363, + "step": 13631 + }, + { + "epoch": 2.5272525027808674, + "grad_norm": 7.0703125, + "learning_rate": 7.472747497219133e-06, + "loss": 2.9557, + "mean_token_accuracy": 0.49679724069967973, + "step": 13632 + }, + { + "epoch": 2.5274378939562476, + "grad_norm": 8.3203125, + "learning_rate": 7.4725621060437524e-06, + "loss": 2.9449, + "mean_token_accuracy": 0.48145001416029454, + "step": 13633 + }, + { + "epoch": 2.527623285131628, + "grad_norm": 7.49609375, + "learning_rate": 7.472376714868372e-06, + "loss": 2.3127, + "mean_token_accuracy": 0.5291616846020991, + "step": 13634 + }, + { + "epoch": 2.527808676307008, + "grad_norm": 6.54296875, + "learning_rate": 7.472191323692993e-06, + "loss": 3.3442, + "mean_token_accuracy": 0.41216901408450707, + "step": 13635 + }, + { + "epoch": 2.5279940674823878, + "grad_norm": 7.44140625, + "learning_rate": 7.472005932517613e-06, + "loss": 3.656, + "mean_token_accuracy": 0.4393581291205374, + "step": 13636 + }, + { + "epoch": 2.528179458657768, + "grad_norm": 8.875, + "learning_rate": 7.471820541342233e-06, + "loss": 2.8291, + "mean_token_accuracy": 0.4743619489559165, + "step": 13637 + }, + { + "epoch": 2.5283648498331477, + "grad_norm": 7.35546875, + "learning_rate": 7.471635150166852e-06, + "loss": 3.6574, + "mean_token_accuracy": 0.44239860661856184, + "step": 13638 + }, + { + "epoch": 2.528550241008528, + "grad_norm": 7.453125, + "learning_rate": 7.471449758991473e-06, + "loss": 2.515, + "mean_token_accuracy": 0.5135363790186125, + "step": 13639 + }, + { + "epoch": 2.528735632183908, + "grad_norm": 9.484375, + "learning_rate": 7.4712643678160925e-06, + "loss": 2.6836, + "mean_token_accuracy": 0.494946201499837, + "step": 13640 + }, + { + "epoch": 2.5289210233592883, + "grad_norm": 8.6875, + "learning_rate": 7.471078976640712e-06, + "loss": 3.1493, + "mean_token_accuracy": 0.46058264592252274, + "step": 13641 + }, + { + "epoch": 2.529106414534668, + "grad_norm": 7.421875, + "learning_rate": 7.470893585465332e-06, + "loss": 3.1181, + "mean_token_accuracy": 0.46413338503295853, + "step": 13642 + }, + { + "epoch": 2.529291805710048, + "grad_norm": 6.921875, + "learning_rate": 7.470708194289953e-06, + "loss": 2.945, + "mean_token_accuracy": 0.47437689261588634, + "step": 13643 + }, + { + "epoch": 2.5294771968854284, + "grad_norm": 6.89453125, + "learning_rate": 7.470522803114573e-06, + "loss": 2.9656, + "mean_token_accuracy": 0.46724300959070647, + "step": 13644 + }, + { + "epoch": 2.529662588060808, + "grad_norm": 7.42578125, + "learning_rate": 7.470337411939192e-06, + "loss": 2.8079, + "mean_token_accuracy": 0.47449862567443757, + "step": 13645 + }, + { + "epoch": 2.5298479792361883, + "grad_norm": 5.74609375, + "learning_rate": 7.470152020763812e-06, + "loss": 2.6052, + "mean_token_accuracy": 0.5024642681123707, + "step": 13646 + }, + { + "epoch": 2.5300333704115685, + "grad_norm": 7.03125, + "learning_rate": 7.469966629588432e-06, + "loss": 3.2737, + "mean_token_accuracy": 0.4512279888785913, + "step": 13647 + }, + { + "epoch": 2.5302187615869487, + "grad_norm": 6.00390625, + "learning_rate": 7.469781238413052e-06, + "loss": 2.9769, + "mean_token_accuracy": 0.45360384410037374, + "step": 13648 + }, + { + "epoch": 2.5304041527623284, + "grad_norm": 7.1953125, + "learning_rate": 7.469595847237672e-06, + "loss": 3.7534, + "mean_token_accuracy": 0.42343559940532505, + "step": 13649 + }, + { + "epoch": 2.5305895439377086, + "grad_norm": 7.171875, + "learning_rate": 7.4694104560622915e-06, + "loss": 3.1812, + "mean_token_accuracy": 0.46565193965517243, + "step": 13650 + }, + { + "epoch": 2.5307749351130884, + "grad_norm": 8.9375, + "learning_rate": 7.469225064886913e-06, + "loss": 3.4261, + "mean_token_accuracy": 0.41765270227054685, + "step": 13651 + }, + { + "epoch": 2.5309603262884686, + "grad_norm": 8.5546875, + "learning_rate": 7.4690396737115325e-06, + "loss": 2.8297, + "mean_token_accuracy": 0.4800173761946134, + "step": 13652 + }, + { + "epoch": 2.5311457174638488, + "grad_norm": 7.29296875, + "learning_rate": 7.468854282536152e-06, + "loss": 3.5447, + "mean_token_accuracy": 0.4408883826879271, + "step": 13653 + }, + { + "epoch": 2.531331108639229, + "grad_norm": 7.03515625, + "learning_rate": 7.468668891360772e-06, + "loss": 2.5639, + "mean_token_accuracy": 0.5211361366914457, + "step": 13654 + }, + { + "epoch": 2.5315164998146087, + "grad_norm": 6.5390625, + "learning_rate": 7.468483500185391e-06, + "loss": 2.6471, + "mean_token_accuracy": 0.4964811963932263, + "step": 13655 + }, + { + "epoch": 2.531701890989989, + "grad_norm": 6.33984375, + "learning_rate": 7.468298109010012e-06, + "loss": 3.1685, + "mean_token_accuracy": 0.4529842680437132, + "step": 13656 + }, + { + "epoch": 2.531887282165369, + "grad_norm": 7.8046875, + "learning_rate": 7.4681127178346315e-06, + "loss": 3.0455, + "mean_token_accuracy": 0.46415525114155254, + "step": 13657 + }, + { + "epoch": 2.532072673340749, + "grad_norm": 6.80859375, + "learning_rate": 7.467927326659251e-06, + "loss": 2.5499, + "mean_token_accuracy": 0.5102534890344631, + "step": 13658 + }, + { + "epoch": 2.532258064516129, + "grad_norm": 6.6796875, + "learning_rate": 7.467741935483872e-06, + "loss": 2.954, + "mean_token_accuracy": 0.4601291485207989, + "step": 13659 + }, + { + "epoch": 2.532443455691509, + "grad_norm": 8.7109375, + "learning_rate": 7.467556544308492e-06, + "loss": 3.0062, + "mean_token_accuracy": 0.5111967966780365, + "step": 13660 + }, + { + "epoch": 2.5326288468668894, + "grad_norm": 12.3515625, + "learning_rate": 7.467371153133112e-06, + "loss": 2.6907, + "mean_token_accuracy": 0.49343683839096686, + "step": 13661 + }, + { + "epoch": 2.532814238042269, + "grad_norm": 9.40625, + "learning_rate": 7.4671857619577314e-06, + "loss": 2.8912, + "mean_token_accuracy": 0.485891035380942, + "step": 13662 + }, + { + "epoch": 2.5329996292176493, + "grad_norm": 6.72265625, + "learning_rate": 7.467000370782351e-06, + "loss": 3.0471, + "mean_token_accuracy": 0.4590987019348518, + "step": 13663 + }, + { + "epoch": 2.533185020393029, + "grad_norm": 9.6796875, + "learning_rate": 7.466814979606971e-06, + "loss": 3.8774, + "mean_token_accuracy": 0.41136576239476147, + "step": 13664 + }, + { + "epoch": 2.5333704115684093, + "grad_norm": 8.2578125, + "learning_rate": 7.466629588431591e-06, + "loss": 2.7784, + "mean_token_accuracy": 0.48436179205409974, + "step": 13665 + }, + { + "epoch": 2.5335558027437894, + "grad_norm": 7.5703125, + "learning_rate": 7.466444197256211e-06, + "loss": 2.8374, + "mean_token_accuracy": 0.47025557368134857, + "step": 13666 + }, + { + "epoch": 2.5337411939191696, + "grad_norm": 7.90625, + "learning_rate": 7.466258806080831e-06, + "loss": 2.655, + "mean_token_accuracy": 0.5195878758835509, + "step": 13667 + }, + { + "epoch": 2.5339265850945494, + "grad_norm": 9.7109375, + "learning_rate": 7.466073414905452e-06, + "loss": 2.2084, + "mean_token_accuracy": 0.5285680695979309, + "step": 13668 + }, + { + "epoch": 2.5341119762699296, + "grad_norm": 6.13671875, + "learning_rate": 7.4658880237300715e-06, + "loss": 2.512, + "mean_token_accuracy": 0.51995825723976, + "step": 13669 + }, + { + "epoch": 2.5342973674453098, + "grad_norm": 11.015625, + "learning_rate": 7.465702632554691e-06, + "loss": 2.9147, + "mean_token_accuracy": 0.5300930937432344, + "step": 13670 + }, + { + "epoch": 2.5344827586206895, + "grad_norm": 12.4296875, + "learning_rate": 7.465517241379311e-06, + "loss": 3.0565, + "mean_token_accuracy": 0.46942656524283205, + "step": 13671 + }, + { + "epoch": 2.5346681497960697, + "grad_norm": 7.69921875, + "learning_rate": 7.46533185020393e-06, + "loss": 3.3901, + "mean_token_accuracy": 0.44393837910247824, + "step": 13672 + }, + { + "epoch": 2.53485354097145, + "grad_norm": 8.1796875, + "learning_rate": 7.465146459028551e-06, + "loss": 2.4282, + "mean_token_accuracy": 0.5174624226348364, + "step": 13673 + }, + { + "epoch": 2.53503893214683, + "grad_norm": 9.765625, + "learning_rate": 7.4649610678531706e-06, + "loss": 3.6076, + "mean_token_accuracy": 0.4631484334874165, + "step": 13674 + }, + { + "epoch": 2.53522432332221, + "grad_norm": 8.328125, + "learning_rate": 7.464775676677791e-06, + "loss": 2.512, + "mean_token_accuracy": 0.5079872204472844, + "step": 13675 + }, + { + "epoch": 2.53540971449759, + "grad_norm": 9.765625, + "learning_rate": 7.464590285502411e-06, + "loss": 2.0045, + "mean_token_accuracy": 0.5813310652774825, + "step": 13676 + }, + { + "epoch": 2.5355951056729698, + "grad_norm": 7.859375, + "learning_rate": 7.464404894327031e-06, + "loss": 3.3312, + "mean_token_accuracy": 0.447840211767537, + "step": 13677 + }, + { + "epoch": 2.53578049684835, + "grad_norm": 7.890625, + "learning_rate": 7.464219503151651e-06, + "loss": 2.5785, + "mean_token_accuracy": 0.514599555061179, + "step": 13678 + }, + { + "epoch": 2.53596588802373, + "grad_norm": 6.67578125, + "learning_rate": 7.4640341119762705e-06, + "loss": 2.5208, + "mean_token_accuracy": 0.5176873928627085, + "step": 13679 + }, + { + "epoch": 2.5361512791991103, + "grad_norm": 8.140625, + "learning_rate": 7.46384872080089e-06, + "loss": 3.1449, + "mean_token_accuracy": 0.45553869499241273, + "step": 13680 + }, + { + "epoch": 2.53633667037449, + "grad_norm": 7.98828125, + "learning_rate": 7.46366332962551e-06, + "loss": 2.4454, + "mean_token_accuracy": 0.5124571428571428, + "step": 13681 + }, + { + "epoch": 2.5365220615498703, + "grad_norm": 6.78515625, + "learning_rate": 7.46347793845013e-06, + "loss": 2.5707, + "mean_token_accuracy": 0.5067470864853814, + "step": 13682 + }, + { + "epoch": 2.5367074527252504, + "grad_norm": 9.3515625, + "learning_rate": 7.463292547274751e-06, + "loss": 2.5917, + "mean_token_accuracy": 0.5055731549923735, + "step": 13683 + }, + { + "epoch": 2.53689284390063, + "grad_norm": 7.33984375, + "learning_rate": 7.46310715609937e-06, + "loss": 2.4356, + "mean_token_accuracy": 0.5091906721536351, + "step": 13684 + }, + { + "epoch": 2.5370782350760104, + "grad_norm": 6.87109375, + "learning_rate": 7.46292176492399e-06, + "loss": 3.259, + "mean_token_accuracy": 0.45609537238815534, + "step": 13685 + }, + { + "epoch": 2.5372636262513906, + "grad_norm": 7.93359375, + "learning_rate": 7.4627363737486105e-06, + "loss": 2.8888, + "mean_token_accuracy": 0.48690899847483476, + "step": 13686 + }, + { + "epoch": 2.5374490174267708, + "grad_norm": 7.78515625, + "learning_rate": 7.46255098257323e-06, + "loss": 2.447, + "mean_token_accuracy": 0.5214928694244968, + "step": 13687 + }, + { + "epoch": 2.5376344086021505, + "grad_norm": 8.3671875, + "learning_rate": 7.46236559139785e-06, + "loss": 2.5526, + "mean_token_accuracy": 0.531605158983527, + "step": 13688 + }, + { + "epoch": 2.5378197997775307, + "grad_norm": 9.3203125, + "learning_rate": 7.4621802002224695e-06, + "loss": 2.6413, + "mean_token_accuracy": 0.5223688821399369, + "step": 13689 + }, + { + "epoch": 2.5380051909529104, + "grad_norm": 8.3046875, + "learning_rate": 7.461994809047089e-06, + "loss": 3.0175, + "mean_token_accuracy": 0.4665192284064381, + "step": 13690 + }, + { + "epoch": 2.5381905821282906, + "grad_norm": 10.921875, + "learning_rate": 7.4618094178717104e-06, + "loss": 2.9656, + "mean_token_accuracy": 0.4708090075062552, + "step": 13691 + }, + { + "epoch": 2.538375973303671, + "grad_norm": 7.92578125, + "learning_rate": 7.46162402669633e-06, + "loss": 2.6859, + "mean_token_accuracy": 0.5093271581114113, + "step": 13692 + }, + { + "epoch": 2.538561364479051, + "grad_norm": 12.140625, + "learning_rate": 7.46143863552095e-06, + "loss": 2.5423, + "mean_token_accuracy": 0.5205352411234764, + "step": 13693 + }, + { + "epoch": 2.5387467556544308, + "grad_norm": 7.12109375, + "learning_rate": 7.46125324434557e-06, + "loss": 2.463, + "mean_token_accuracy": 0.5362103843560351, + "step": 13694 + }, + { + "epoch": 2.538932146829811, + "grad_norm": 9.21875, + "learning_rate": 7.46106785317019e-06, + "loss": 3.0837, + "mean_token_accuracy": 0.44954128440366975, + "step": 13695 + }, + { + "epoch": 2.5391175380051907, + "grad_norm": 9.4609375, + "learning_rate": 7.4608824619948095e-06, + "loss": 2.931, + "mean_token_accuracy": 0.4732313575525813, + "step": 13696 + }, + { + "epoch": 2.539302929180571, + "grad_norm": 7.25390625, + "learning_rate": 7.460697070819429e-06, + "loss": 2.8577, + "mean_token_accuracy": 0.4890343322999196, + "step": 13697 + }, + { + "epoch": 2.539488320355951, + "grad_norm": 9.0234375, + "learning_rate": 7.460511679644049e-06, + "loss": 2.846, + "mean_token_accuracy": 0.4860530773574252, + "step": 13698 + }, + { + "epoch": 2.5396737115313313, + "grad_norm": 9.71875, + "learning_rate": 7.46032628846867e-06, + "loss": 3.0379, + "mean_token_accuracy": 0.47923632833567037, + "step": 13699 + }, + { + "epoch": 2.539859102706711, + "grad_norm": 8.390625, + "learning_rate": 7.46014089729329e-06, + "loss": 2.5781, + "mean_token_accuracy": 0.5247789669613774, + "step": 13700 + }, + { + "epoch": 2.540044493882091, + "grad_norm": 7.58984375, + "learning_rate": 7.459955506117909e-06, + "loss": 3.3282, + "mean_token_accuracy": 0.44519653247832797, + "step": 13701 + }, + { + "epoch": 2.5402298850574714, + "grad_norm": 12.8046875, + "learning_rate": 7.459770114942529e-06, + "loss": 2.6228, + "mean_token_accuracy": 0.49311328949001115, + "step": 13702 + }, + { + "epoch": 2.540415276232851, + "grad_norm": 10.8046875, + "learning_rate": 7.4595847237671496e-06, + "loss": 3.8102, + "mean_token_accuracy": 0.4461622210125204, + "step": 13703 + }, + { + "epoch": 2.5406006674082313, + "grad_norm": 8.9921875, + "learning_rate": 7.459399332591769e-06, + "loss": 3.1903, + "mean_token_accuracy": 0.46397941680960547, + "step": 13704 + }, + { + "epoch": 2.5407860585836115, + "grad_norm": 7.90625, + "learning_rate": 7.459213941416389e-06, + "loss": 2.7562, + "mean_token_accuracy": 0.5047322253000923, + "step": 13705 + }, + { + "epoch": 2.5409714497589917, + "grad_norm": 9.3671875, + "learning_rate": 7.4590285502410085e-06, + "loss": 2.8957, + "mean_token_accuracy": 0.48219380746063617, + "step": 13706 + }, + { + "epoch": 2.5411568409343714, + "grad_norm": 8.0234375, + "learning_rate": 7.45884315906563e-06, + "loss": 2.7646, + "mean_token_accuracy": 0.5115737570195864, + "step": 13707 + }, + { + "epoch": 2.5413422321097516, + "grad_norm": 8.140625, + "learning_rate": 7.4586577678902495e-06, + "loss": 3.1256, + "mean_token_accuracy": 0.47494598209692357, + "step": 13708 + }, + { + "epoch": 2.5415276232851314, + "grad_norm": 10.6015625, + "learning_rate": 7.458472376714869e-06, + "loss": 2.5526, + "mean_token_accuracy": 0.5268019776216497, + "step": 13709 + }, + { + "epoch": 2.5417130144605116, + "grad_norm": 9.2109375, + "learning_rate": 7.458286985539489e-06, + "loss": 2.1826, + "mean_token_accuracy": 0.5614336917562724, + "step": 13710 + }, + { + "epoch": 2.5418984056358918, + "grad_norm": 7.84375, + "learning_rate": 7.458101594364109e-06, + "loss": 2.5114, + "mean_token_accuracy": 0.5068179075660674, + "step": 13711 + }, + { + "epoch": 2.542083796811272, + "grad_norm": 9.9609375, + "learning_rate": 7.457916203188729e-06, + "loss": 2.7617, + "mean_token_accuracy": 0.49123359580052495, + "step": 13712 + }, + { + "epoch": 2.5422691879866517, + "grad_norm": 9.609375, + "learning_rate": 7.4577308120133485e-06, + "loss": 3.2789, + "mean_token_accuracy": 0.47387677447680376, + "step": 13713 + }, + { + "epoch": 2.542454579162032, + "grad_norm": 6.703125, + "learning_rate": 7.457545420837968e-06, + "loss": 2.9395, + "mean_token_accuracy": 0.4611215834118756, + "step": 13714 + }, + { + "epoch": 2.542639970337412, + "grad_norm": 10.2109375, + "learning_rate": 7.4573600296625895e-06, + "loss": 2.8379, + "mean_token_accuracy": 0.4648496900222248, + "step": 13715 + }, + { + "epoch": 2.542825361512792, + "grad_norm": 13.375, + "learning_rate": 7.457174638487209e-06, + "loss": 2.7651, + "mean_token_accuracy": 0.47644554319461613, + "step": 13716 + }, + { + "epoch": 2.543010752688172, + "grad_norm": 8.8671875, + "learning_rate": 7.456989247311829e-06, + "loss": 3.3587, + "mean_token_accuracy": 0.4694264069264069, + "step": 13717 + }, + { + "epoch": 2.543196143863552, + "grad_norm": 10.828125, + "learning_rate": 7.4568038561364485e-06, + "loss": 2.8851, + "mean_token_accuracy": 0.4812841174941513, + "step": 13718 + }, + { + "epoch": 2.5433815350389324, + "grad_norm": 9.140625, + "learning_rate": 7.456618464961068e-06, + "loss": 3.4093, + "mean_token_accuracy": 0.42862394314422253, + "step": 13719 + }, + { + "epoch": 2.543566926214312, + "grad_norm": 7.16796875, + "learning_rate": 7.456433073785689e-06, + "loss": 2.2668, + "mean_token_accuracy": 0.5445887445887446, + "step": 13720 + }, + { + "epoch": 2.5437523173896923, + "grad_norm": 9.4375, + "learning_rate": 7.456247682610308e-06, + "loss": 3.5722, + "mean_token_accuracy": 0.45403949730700177, + "step": 13721 + }, + { + "epoch": 2.543937708565072, + "grad_norm": 11.546875, + "learning_rate": 7.456062291434928e-06, + "loss": 3.1793, + "mean_token_accuracy": 0.46894803548795944, + "step": 13722 + }, + { + "epoch": 2.5441230997404523, + "grad_norm": 11.6875, + "learning_rate": 7.455876900259549e-06, + "loss": 2.5427, + "mean_token_accuracy": 0.5072082500263074, + "step": 13723 + }, + { + "epoch": 2.5443084909158324, + "grad_norm": 7.44140625, + "learning_rate": 7.455691509084169e-06, + "loss": 3.3973, + "mean_token_accuracy": 0.43305582093626316, + "step": 13724 + }, + { + "epoch": 2.5444938820912126, + "grad_norm": 8.6328125, + "learning_rate": 7.4555061179087885e-06, + "loss": 2.9154, + "mean_token_accuracy": 0.46700143472022954, + "step": 13725 + }, + { + "epoch": 2.5446792732665924, + "grad_norm": 8.375, + "learning_rate": 7.455320726733408e-06, + "loss": 2.6136, + "mean_token_accuracy": 0.49379395945386845, + "step": 13726 + }, + { + "epoch": 2.5448646644419726, + "grad_norm": 7.60546875, + "learning_rate": 7.455135335558028e-06, + "loss": 2.5721, + "mean_token_accuracy": 0.508656103286385, + "step": 13727 + }, + { + "epoch": 2.5450500556173528, + "grad_norm": 9.9921875, + "learning_rate": 7.4549499443826474e-06, + "loss": 3.7884, + "mean_token_accuracy": 0.4765258215962441, + "step": 13728 + }, + { + "epoch": 2.5452354467927325, + "grad_norm": 13.46875, + "learning_rate": 7.454764553207268e-06, + "loss": 2.6743, + "mean_token_accuracy": 0.5086196786677613, + "step": 13729 + }, + { + "epoch": 2.5454208379681127, + "grad_norm": 9.0390625, + "learning_rate": 7.454579162031888e-06, + "loss": 3.4162, + "mean_token_accuracy": 0.4493441881501583, + "step": 13730 + }, + { + "epoch": 2.545606229143493, + "grad_norm": 8.6328125, + "learning_rate": 7.454393770856508e-06, + "loss": 3.1033, + "mean_token_accuracy": 0.46269173105754813, + "step": 13731 + }, + { + "epoch": 2.545791620318873, + "grad_norm": 7.94921875, + "learning_rate": 7.4542083796811286e-06, + "loss": 3.1873, + "mean_token_accuracy": 0.45701777659856724, + "step": 13732 + }, + { + "epoch": 2.545977011494253, + "grad_norm": 7.5546875, + "learning_rate": 7.454022988505748e-06, + "loss": 3.1608, + "mean_token_accuracy": 0.45866209262435675, + "step": 13733 + }, + { + "epoch": 2.546162402669633, + "grad_norm": 8.625, + "learning_rate": 7.453837597330368e-06, + "loss": 3.1048, + "mean_token_accuracy": 0.4539690925901466, + "step": 13734 + }, + { + "epoch": 2.5463477938450128, + "grad_norm": 7.3515625, + "learning_rate": 7.4536522061549875e-06, + "loss": 2.7198, + "mean_token_accuracy": 0.4927085823571599, + "step": 13735 + }, + { + "epoch": 2.546533185020393, + "grad_norm": 10.515625, + "learning_rate": 7.453466814979607e-06, + "loss": 2.9261, + "mean_token_accuracy": 0.46141845673826953, + "step": 13736 + }, + { + "epoch": 2.546718576195773, + "grad_norm": 10.7109375, + "learning_rate": 7.453281423804228e-06, + "loss": 2.7738, + "mean_token_accuracy": 0.4698316183348924, + "step": 13737 + }, + { + "epoch": 2.5469039673711533, + "grad_norm": 7.3515625, + "learning_rate": 7.453096032628847e-06, + "loss": 3.1993, + "mean_token_accuracy": 0.4743239552034963, + "step": 13738 + }, + { + "epoch": 2.547089358546533, + "grad_norm": 7.79296875, + "learning_rate": 7.452910641453467e-06, + "loss": 2.7701, + "mean_token_accuracy": 0.48983050847457626, + "step": 13739 + }, + { + "epoch": 2.5472747497219133, + "grad_norm": 8.1875, + "learning_rate": 7.452725250278087e-06, + "loss": 2.6402, + "mean_token_accuracy": 0.518609865470852, + "step": 13740 + }, + { + "epoch": 2.5474601408972934, + "grad_norm": 7.92578125, + "learning_rate": 7.452539859102708e-06, + "loss": 2.8256, + "mean_token_accuracy": 0.5, + "step": 13741 + }, + { + "epoch": 2.547645532072673, + "grad_norm": 7.0234375, + "learning_rate": 7.4523544679273275e-06, + "loss": 2.8078, + "mean_token_accuracy": 0.47328538402260467, + "step": 13742 + }, + { + "epoch": 2.5478309232480534, + "grad_norm": 8.7265625, + "learning_rate": 7.452169076751947e-06, + "loss": 2.5951, + "mean_token_accuracy": 0.5126547455295736, + "step": 13743 + }, + { + "epoch": 2.5480163144234336, + "grad_norm": 8.2421875, + "learning_rate": 7.451983685576567e-06, + "loss": 2.696, + "mean_token_accuracy": 0.4946119516708392, + "step": 13744 + }, + { + "epoch": 2.5482017055988138, + "grad_norm": 6.62890625, + "learning_rate": 7.4517982944011865e-06, + "loss": 2.2215, + "mean_token_accuracy": 0.5694678545629666, + "step": 13745 + }, + { + "epoch": 2.5483870967741935, + "grad_norm": 6.83203125, + "learning_rate": 7.451612903225807e-06, + "loss": 3.3826, + "mean_token_accuracy": 0.44591859680683604, + "step": 13746 + }, + { + "epoch": 2.5485724879495737, + "grad_norm": 8.53125, + "learning_rate": 7.451427512050427e-06, + "loss": 3.0809, + "mean_token_accuracy": 0.4765109705209315, + "step": 13747 + }, + { + "epoch": 2.5487578791249534, + "grad_norm": 6.8984375, + "learning_rate": 7.451242120875047e-06, + "loss": 2.8689, + "mean_token_accuracy": 0.4675194660734149, + "step": 13748 + }, + { + "epoch": 2.5489432703003336, + "grad_norm": 6.4140625, + "learning_rate": 7.451056729699668e-06, + "loss": 2.9036, + "mean_token_accuracy": 0.47838555496548063, + "step": 13749 + }, + { + "epoch": 2.549128661475714, + "grad_norm": 10.875, + "learning_rate": 7.450871338524287e-06, + "loss": 3.0931, + "mean_token_accuracy": 0.48893285519970325, + "step": 13750 + }, + { + "epoch": 2.549314052651094, + "grad_norm": 9.515625, + "learning_rate": 7.450685947348907e-06, + "loss": 2.9861, + "mean_token_accuracy": 0.47596675099385616, + "step": 13751 + }, + { + "epoch": 2.5494994438264738, + "grad_norm": 8.90625, + "learning_rate": 7.4505005561735265e-06, + "loss": 2.6401, + "mean_token_accuracy": 0.4944962784358947, + "step": 13752 + }, + { + "epoch": 2.549684835001854, + "grad_norm": 9.390625, + "learning_rate": 7.450315164998146e-06, + "loss": 3.3395, + "mean_token_accuracy": 0.46005370929842226, + "step": 13753 + }, + { + "epoch": 2.5498702261772337, + "grad_norm": 7.82421875, + "learning_rate": 7.450129773822767e-06, + "loss": 3.1013, + "mean_token_accuracy": 0.48082088750402924, + "step": 13754 + }, + { + "epoch": 2.550055617352614, + "grad_norm": 10.2890625, + "learning_rate": 7.449944382647386e-06, + "loss": 2.8467, + "mean_token_accuracy": 0.48082454458293383, + "step": 13755 + }, + { + "epoch": 2.550241008527994, + "grad_norm": 7.73828125, + "learning_rate": 7.449758991472007e-06, + "loss": 3.2417, + "mean_token_accuracy": 0.4435667135591386, + "step": 13756 + }, + { + "epoch": 2.5504263997033743, + "grad_norm": 9.515625, + "learning_rate": 7.4495736002966264e-06, + "loss": 3.027, + "mean_token_accuracy": 0.44071192833569073, + "step": 13757 + }, + { + "epoch": 2.5506117908787544, + "grad_norm": 9.78125, + "learning_rate": 7.449388209121247e-06, + "loss": 2.8907, + "mean_token_accuracy": 0.47465727581436296, + "step": 13758 + }, + { + "epoch": 2.550797182054134, + "grad_norm": 8.9765625, + "learning_rate": 7.449202817945867e-06, + "loss": 3.2299, + "mean_token_accuracy": 0.45731788520442135, + "step": 13759 + }, + { + "epoch": 2.5509825732295144, + "grad_norm": 8.703125, + "learning_rate": 7.449017426770486e-06, + "loss": 2.8205, + "mean_token_accuracy": 0.48021828103683495, + "step": 13760 + }, + { + "epoch": 2.551167964404894, + "grad_norm": 11.125, + "learning_rate": 7.448832035595106e-06, + "loss": 3.0661, + "mean_token_accuracy": 0.49720341477774505, + "step": 13761 + }, + { + "epoch": 2.5513533555802743, + "grad_norm": 9.171875, + "learning_rate": 7.4486466444197255e-06, + "loss": 2.885, + "mean_token_accuracy": 0.47912399558846697, + "step": 13762 + }, + { + "epoch": 2.5515387467556545, + "grad_norm": 6.625, + "learning_rate": 7.448461253244346e-06, + "loss": 3.1454, + "mean_token_accuracy": 0.47504720798489347, + "step": 13763 + }, + { + "epoch": 2.5517241379310347, + "grad_norm": 7.3671875, + "learning_rate": 7.4482758620689665e-06, + "loss": 3.2068, + "mean_token_accuracy": 0.4516082711085583, + "step": 13764 + }, + { + "epoch": 2.5519095291064144, + "grad_norm": 7.671875, + "learning_rate": 7.448090470893586e-06, + "loss": 2.9544, + "mean_token_accuracy": 0.4733085501858736, + "step": 13765 + }, + { + "epoch": 2.5520949202817946, + "grad_norm": 7.73828125, + "learning_rate": 7.447905079718206e-06, + "loss": 2.461, + "mean_token_accuracy": 0.5116017437772465, + "step": 13766 + }, + { + "epoch": 2.5522803114571744, + "grad_norm": 7.08984375, + "learning_rate": 7.447719688542826e-06, + "loss": 2.9461, + "mean_token_accuracy": 0.4699897107579742, + "step": 13767 + }, + { + "epoch": 2.5524657026325546, + "grad_norm": 8.078125, + "learning_rate": 7.447534297367446e-06, + "loss": 2.9221, + "mean_token_accuracy": 0.45645771981489275, + "step": 13768 + }, + { + "epoch": 2.5526510938079348, + "grad_norm": 7.7578125, + "learning_rate": 7.4473489061920656e-06, + "loss": 2.6815, + "mean_token_accuracy": 0.4994958951461904, + "step": 13769 + }, + { + "epoch": 2.552836484983315, + "grad_norm": 6.7109375, + "learning_rate": 7.447163515016685e-06, + "loss": 3.0948, + "mean_token_accuracy": 0.46786175710594313, + "step": 13770 + }, + { + "epoch": 2.5530218761586947, + "grad_norm": 8.6640625, + "learning_rate": 7.446978123841305e-06, + "loss": 2.414, + "mean_token_accuracy": 0.5302649930264993, + "step": 13771 + }, + { + "epoch": 2.553207267334075, + "grad_norm": 6.41015625, + "learning_rate": 7.446792732665926e-06, + "loss": 2.2122, + "mean_token_accuracy": 0.56563687749732, + "step": 13772 + }, + { + "epoch": 2.553392658509455, + "grad_norm": 6.51171875, + "learning_rate": 7.446607341490546e-06, + "loss": 3.0249, + "mean_token_accuracy": 0.4741901501185146, + "step": 13773 + }, + { + "epoch": 2.553578049684835, + "grad_norm": 7.96484375, + "learning_rate": 7.4464219503151655e-06, + "loss": 3.2153, + "mean_token_accuracy": 0.4674100623921412, + "step": 13774 + }, + { + "epoch": 2.553763440860215, + "grad_norm": 8.234375, + "learning_rate": 7.446236559139786e-06, + "loss": 2.7232, + "mean_token_accuracy": 0.4859452736318408, + "step": 13775 + }, + { + "epoch": 2.553948832035595, + "grad_norm": 7.4296875, + "learning_rate": 7.446051167964406e-06, + "loss": 2.9467, + "mean_token_accuracy": 0.4530834848272969, + "step": 13776 + }, + { + "epoch": 2.5541342232109754, + "grad_norm": 8.8984375, + "learning_rate": 7.445865776789025e-06, + "loss": 3.0162, + "mean_token_accuracy": 0.4883168316831683, + "step": 13777 + }, + { + "epoch": 2.554319614386355, + "grad_norm": 29.234375, + "learning_rate": 7.445680385613645e-06, + "loss": 4.0019, + "mean_token_accuracy": 0.49345417925478346, + "step": 13778 + }, + { + "epoch": 2.5545050055617353, + "grad_norm": 13.8984375, + "learning_rate": 7.4454949944382645e-06, + "loss": 3.038, + "mean_token_accuracy": 0.457682698313554, + "step": 13779 + }, + { + "epoch": 2.554690396737115, + "grad_norm": 7.28515625, + "learning_rate": 7.445309603262886e-06, + "loss": 3.2163, + "mean_token_accuracy": 0.45729213993639256, + "step": 13780 + }, + { + "epoch": 2.5548757879124953, + "grad_norm": 8.6484375, + "learning_rate": 7.4451242120875055e-06, + "loss": 3.2263, + "mean_token_accuracy": 0.47430710095083956, + "step": 13781 + }, + { + "epoch": 2.5550611790878754, + "grad_norm": 12.6875, + "learning_rate": 7.444938820912125e-06, + "loss": 2.3114, + "mean_token_accuracy": 0.5545127282077655, + "step": 13782 + }, + { + "epoch": 2.5552465702632556, + "grad_norm": 6.64453125, + "learning_rate": 7.444753429736745e-06, + "loss": 3.192, + "mean_token_accuracy": 0.46390516782099095, + "step": 13783 + }, + { + "epoch": 2.5554319614386354, + "grad_norm": 7.89453125, + "learning_rate": 7.444568038561365e-06, + "loss": 2.977, + "mean_token_accuracy": 0.4658491561181435, + "step": 13784 + }, + { + "epoch": 2.5556173526140156, + "grad_norm": 10.40625, + "learning_rate": 7.444382647385985e-06, + "loss": 2.4909, + "mean_token_accuracy": 0.5201331114808653, + "step": 13785 + }, + { + "epoch": 2.5558027437893958, + "grad_norm": 8.3828125, + "learning_rate": 7.444197256210605e-06, + "loss": 3.0655, + "mean_token_accuracy": 0.4624685929648241, + "step": 13786 + }, + { + "epoch": 2.5559881349647755, + "grad_norm": 7.94140625, + "learning_rate": 7.444011865035224e-06, + "loss": 2.3223, + "mean_token_accuracy": 0.5297484072994277, + "step": 13787 + }, + { + "epoch": 2.5561735261401557, + "grad_norm": 9.875, + "learning_rate": 7.443826473859846e-06, + "loss": 3.0607, + "mean_token_accuracy": 0.4629455909943715, + "step": 13788 + }, + { + "epoch": 2.556358917315536, + "grad_norm": 8.9140625, + "learning_rate": 7.443641082684465e-06, + "loss": 2.8328, + "mean_token_accuracy": 0.4606174433618969, + "step": 13789 + }, + { + "epoch": 2.556544308490916, + "grad_norm": 6.359375, + "learning_rate": 7.443455691509085e-06, + "loss": 2.5463, + "mean_token_accuracy": 0.5047575199508901, + "step": 13790 + }, + { + "epoch": 2.556729699666296, + "grad_norm": 12.8203125, + "learning_rate": 7.4432703003337045e-06, + "loss": 3.1607, + "mean_token_accuracy": 0.4719748559455212, + "step": 13791 + }, + { + "epoch": 2.556915090841676, + "grad_norm": 12.6640625, + "learning_rate": 7.443084909158325e-06, + "loss": 2.937, + "mean_token_accuracy": 0.46943550745832113, + "step": 13792 + }, + { + "epoch": 2.5571004820170558, + "grad_norm": 14.53125, + "learning_rate": 7.442899517982945e-06, + "loss": 4.2626, + "mean_token_accuracy": 0.41929133858267714, + "step": 13793 + }, + { + "epoch": 2.557285873192436, + "grad_norm": 8.6484375, + "learning_rate": 7.442714126807564e-06, + "loss": 2.541, + "mean_token_accuracy": 0.5419734904270986, + "step": 13794 + }, + { + "epoch": 2.557471264367816, + "grad_norm": 7.58203125, + "learning_rate": 7.442528735632184e-06, + "loss": 2.4456, + "mean_token_accuracy": 0.5135135135135135, + "step": 13795 + }, + { + "epoch": 2.5576566555431963, + "grad_norm": 7.4921875, + "learning_rate": 7.442343344456805e-06, + "loss": 2.781, + "mean_token_accuracy": 0.48956328645447816, + "step": 13796 + }, + { + "epoch": 2.557842046718576, + "grad_norm": 7.5703125, + "learning_rate": 7.442157953281425e-06, + "loss": 3.19, + "mean_token_accuracy": 0.46706859287908087, + "step": 13797 + }, + { + "epoch": 2.5580274378939563, + "grad_norm": 7.48828125, + "learning_rate": 7.4419725621060446e-06, + "loss": 3.2036, + "mean_token_accuracy": 0.4515965534718702, + "step": 13798 + }, + { + "epoch": 2.5582128290693364, + "grad_norm": 6.4765625, + "learning_rate": 7.441787170930664e-06, + "loss": 2.3134, + "mean_token_accuracy": 0.5537430167597766, + "step": 13799 + }, + { + "epoch": 2.558398220244716, + "grad_norm": 9.0390625, + "learning_rate": 7.441601779755284e-06, + "loss": 2.9859, + "mean_token_accuracy": 0.514644993099218, + "step": 13800 + }, + { + "epoch": 2.5585836114200964, + "grad_norm": 6.84765625, + "learning_rate": 7.441416388579904e-06, + "loss": 2.6691, + "mean_token_accuracy": 0.495105554900342, + "step": 13801 + }, + { + "epoch": 2.5587690025954766, + "grad_norm": 7.33984375, + "learning_rate": 7.441230997404524e-06, + "loss": 2.5071, + "mean_token_accuracy": 0.502056202878684, + "step": 13802 + }, + { + "epoch": 2.5589543937708568, + "grad_norm": 8.0703125, + "learning_rate": 7.441045606229144e-06, + "loss": 2.8847, + "mean_token_accuracy": 0.478494623655914, + "step": 13803 + }, + { + "epoch": 2.5591397849462365, + "grad_norm": 6.8671875, + "learning_rate": 7.440860215053764e-06, + "loss": 2.6381, + "mean_token_accuracy": 0.4953638662545659, + "step": 13804 + }, + { + "epoch": 2.5593251761216167, + "grad_norm": 6.68359375, + "learning_rate": 7.440674823878385e-06, + "loss": 2.4831, + "mean_token_accuracy": 0.5285391353489687, + "step": 13805 + }, + { + "epoch": 2.5595105672969964, + "grad_norm": 9.265625, + "learning_rate": 7.440489432703004e-06, + "loss": 2.7182, + "mean_token_accuracy": 0.4893338568250229, + "step": 13806 + }, + { + "epoch": 2.5596959584723766, + "grad_norm": 9.6015625, + "learning_rate": 7.440304041527624e-06, + "loss": 3.1842, + "mean_token_accuracy": 0.5038948393378773, + "step": 13807 + }, + { + "epoch": 2.559881349647757, + "grad_norm": 7.02734375, + "learning_rate": 7.4401186503522435e-06, + "loss": 2.8508, + "mean_token_accuracy": 0.4800404975412207, + "step": 13808 + }, + { + "epoch": 2.560066740823137, + "grad_norm": 6.74609375, + "learning_rate": 7.439933259176863e-06, + "loss": 2.3926, + "mean_token_accuracy": 0.5151197800759262, + "step": 13809 + }, + { + "epoch": 2.5602521319985168, + "grad_norm": 7.65625, + "learning_rate": 7.439747868001484e-06, + "loss": 2.9048, + "mean_token_accuracy": 0.48632218844984804, + "step": 13810 + }, + { + "epoch": 2.560437523173897, + "grad_norm": 9.8203125, + "learning_rate": 7.439562476826103e-06, + "loss": 3.113, + "mean_token_accuracy": 0.47706809229037705, + "step": 13811 + }, + { + "epoch": 2.560622914349277, + "grad_norm": 7.71875, + "learning_rate": 7.439377085650724e-06, + "loss": 2.8864, + "mean_token_accuracy": 0.49459920409323477, + "step": 13812 + }, + { + "epoch": 2.560808305524657, + "grad_norm": 8.6328125, + "learning_rate": 7.439191694475344e-06, + "loss": 3.495, + "mean_token_accuracy": 0.43187289359653347, + "step": 13813 + }, + { + "epoch": 2.560993696700037, + "grad_norm": 6.6015625, + "learning_rate": 7.439006303299964e-06, + "loss": 2.5041, + "mean_token_accuracy": 0.5275664130289681, + "step": 13814 + }, + { + "epoch": 2.5611790878754173, + "grad_norm": 6.57421875, + "learning_rate": 7.438820912124584e-06, + "loss": 3.0163, + "mean_token_accuracy": 0.4601349662584354, + "step": 13815 + }, + { + "epoch": 2.5613644790507974, + "grad_norm": 7.7109375, + "learning_rate": 7.438635520949203e-06, + "loss": 2.5587, + "mean_token_accuracy": 0.5119920354783238, + "step": 13816 + }, + { + "epoch": 2.561549870226177, + "grad_norm": 7.2421875, + "learning_rate": 7.438450129773823e-06, + "loss": 3.2185, + "mean_token_accuracy": 0.460569295380308, + "step": 13817 + }, + { + "epoch": 2.5617352614015574, + "grad_norm": 6.9609375, + "learning_rate": 7.438264738598443e-06, + "loss": 2.5738, + "mean_token_accuracy": 0.4888156345655757, + "step": 13818 + }, + { + "epoch": 2.561920652576937, + "grad_norm": 9.1640625, + "learning_rate": 7.438079347423063e-06, + "loss": 3.7603, + "mean_token_accuracy": 0.41190405685519693, + "step": 13819 + }, + { + "epoch": 2.5621060437523173, + "grad_norm": 6.34375, + "learning_rate": 7.4378939562476835e-06, + "loss": 2.8523, + "mean_token_accuracy": 0.4710240172063568, + "step": 13820 + }, + { + "epoch": 2.5622914349276975, + "grad_norm": 6.5, + "learning_rate": 7.437708565072303e-06, + "loss": 2.5775, + "mean_token_accuracy": 0.5075620767494357, + "step": 13821 + }, + { + "epoch": 2.5624768261030777, + "grad_norm": 6.62109375, + "learning_rate": 7.437523173896924e-06, + "loss": 2.7058, + "mean_token_accuracy": 0.5084475613643609, + "step": 13822 + }, + { + "epoch": 2.5626622172784574, + "grad_norm": 8.6484375, + "learning_rate": 7.437337782721543e-06, + "loss": 2.1548, + "mean_token_accuracy": 0.563770325203252, + "step": 13823 + }, + { + "epoch": 2.5628476084538376, + "grad_norm": 8.8203125, + "learning_rate": 7.437152391546163e-06, + "loss": 3.722, + "mean_token_accuracy": 0.42838829973196596, + "step": 13824 + }, + { + "epoch": 2.5630329996292174, + "grad_norm": 9.1171875, + "learning_rate": 7.436967000370783e-06, + "loss": 2.4887, + "mean_token_accuracy": 0.5232360097323601, + "step": 13825 + }, + { + "epoch": 2.5632183908045976, + "grad_norm": 8.1015625, + "learning_rate": 7.436781609195402e-06, + "loss": 2.449, + "mean_token_accuracy": 0.5107878391631252, + "step": 13826 + }, + { + "epoch": 2.5634037819799778, + "grad_norm": 7.4765625, + "learning_rate": 7.436596218020023e-06, + "loss": 3.0364, + "mean_token_accuracy": 0.4947598651234849, + "step": 13827 + }, + { + "epoch": 2.563589173155358, + "grad_norm": 8.15625, + "learning_rate": 7.436410826844643e-06, + "loss": 2.3949, + "mean_token_accuracy": 0.5049645390070922, + "step": 13828 + }, + { + "epoch": 2.5637745643307377, + "grad_norm": 7.8984375, + "learning_rate": 7.436225435669263e-06, + "loss": 2.7149, + "mean_token_accuracy": 0.4957854406130268, + "step": 13829 + }, + { + "epoch": 2.563959955506118, + "grad_norm": 9.140625, + "learning_rate": 7.436040044493883e-06, + "loss": 3.4269, + "mean_token_accuracy": 0.4505919587387176, + "step": 13830 + }, + { + "epoch": 2.564145346681498, + "grad_norm": 10.390625, + "learning_rate": 7.435854653318503e-06, + "loss": 2.9345, + "mean_token_accuracy": 0.4883227176220807, + "step": 13831 + }, + { + "epoch": 2.564330737856878, + "grad_norm": 7.44140625, + "learning_rate": 7.435669262143123e-06, + "loss": 2.7499, + "mean_token_accuracy": 0.49706933523945673, + "step": 13832 + }, + { + "epoch": 2.564516129032258, + "grad_norm": 7.69140625, + "learning_rate": 7.435483870967742e-06, + "loss": 3.1706, + "mean_token_accuracy": 0.4594022745305475, + "step": 13833 + }, + { + "epoch": 2.564701520207638, + "grad_norm": 9.015625, + "learning_rate": 7.435298479792362e-06, + "loss": 2.4651, + "mean_token_accuracy": 0.5189968652037618, + "step": 13834 + }, + { + "epoch": 2.5648869113830184, + "grad_norm": 9.40625, + "learning_rate": 7.435113088616982e-06, + "loss": 2.3811, + "mean_token_accuracy": 0.519605077574048, + "step": 13835 + }, + { + "epoch": 2.565072302558398, + "grad_norm": 7.73046875, + "learning_rate": 7.434927697441603e-06, + "loss": 2.987, + "mean_token_accuracy": 0.46234522942461764, + "step": 13836 + }, + { + "epoch": 2.5652576937337783, + "grad_norm": 7.8984375, + "learning_rate": 7.4347423062662225e-06, + "loss": 3.0258, + "mean_token_accuracy": 0.48636689787132265, + "step": 13837 + }, + { + "epoch": 2.565443084909158, + "grad_norm": 9.890625, + "learning_rate": 7.434556915090842e-06, + "loss": 3.7939, + "mean_token_accuracy": 0.4362322515212982, + "step": 13838 + }, + { + "epoch": 2.5656284760845383, + "grad_norm": 9.1796875, + "learning_rate": 7.434371523915463e-06, + "loss": 2.9287, + "mean_token_accuracy": 0.46857410881801126, + "step": 13839 + }, + { + "epoch": 2.5658138672599184, + "grad_norm": 7.11328125, + "learning_rate": 7.434186132740082e-06, + "loss": 3.0496, + "mean_token_accuracy": 0.4505854211255193, + "step": 13840 + }, + { + "epoch": 2.5659992584352986, + "grad_norm": 8.484375, + "learning_rate": 7.434000741564702e-06, + "loss": 2.9985, + "mean_token_accuracy": 0.4707586933614331, + "step": 13841 + }, + { + "epoch": 2.5661846496106784, + "grad_norm": 6.13671875, + "learning_rate": 7.433815350389322e-06, + "loss": 2.6593, + "mean_token_accuracy": 0.4863102998696219, + "step": 13842 + }, + { + "epoch": 2.5663700407860586, + "grad_norm": 5.8828125, + "learning_rate": 7.433629959213941e-06, + "loss": 3.045, + "mean_token_accuracy": 0.46725471242145966, + "step": 13843 + }, + { + "epoch": 2.5665554319614388, + "grad_norm": 7.1484375, + "learning_rate": 7.433444568038563e-06, + "loss": 3.2909, + "mean_token_accuracy": 0.473590828442746, + "step": 13844 + }, + { + "epoch": 2.5667408231368185, + "grad_norm": 6.87890625, + "learning_rate": 7.433259176863182e-06, + "loss": 2.794, + "mean_token_accuracy": 0.4644133412745682, + "step": 13845 + }, + { + "epoch": 2.5669262143121987, + "grad_norm": 13.609375, + "learning_rate": 7.433073785687802e-06, + "loss": 3.4907, + "mean_token_accuracy": 0.4208729216152019, + "step": 13846 + }, + { + "epoch": 2.567111605487579, + "grad_norm": 7.953125, + "learning_rate": 7.4328883945124215e-06, + "loss": 3.1017, + "mean_token_accuracy": 0.47587791270101737, + "step": 13847 + }, + { + "epoch": 2.567296996662959, + "grad_norm": 7.10546875, + "learning_rate": 7.432703003337042e-06, + "loss": 2.3971, + "mean_token_accuracy": 0.53634483855575, + "step": 13848 + }, + { + "epoch": 2.567482387838339, + "grad_norm": 7.28515625, + "learning_rate": 7.432517612161662e-06, + "loss": 2.5727, + "mean_token_accuracy": 0.5276028158577251, + "step": 13849 + }, + { + "epoch": 2.567667779013719, + "grad_norm": 7.32421875, + "learning_rate": 7.432332220986281e-06, + "loss": 2.7462, + "mean_token_accuracy": 0.4621802612800527, + "step": 13850 + }, + { + "epoch": 2.5678531701890988, + "grad_norm": 8.015625, + "learning_rate": 7.432146829810901e-06, + "loss": 2.8527, + "mean_token_accuracy": 0.4865229110512129, + "step": 13851 + }, + { + "epoch": 2.568038561364479, + "grad_norm": 8.34375, + "learning_rate": 7.431961438635522e-06, + "loss": 2.739, + "mean_token_accuracy": 0.4807002439374372, + "step": 13852 + }, + { + "epoch": 2.568223952539859, + "grad_norm": 7.19140625, + "learning_rate": 7.431776047460142e-06, + "loss": 2.6978, + "mean_token_accuracy": 0.5185643564356436, + "step": 13853 + }, + { + "epoch": 2.5684093437152393, + "grad_norm": 7.46484375, + "learning_rate": 7.431590656284762e-06, + "loss": 2.6608, + "mean_token_accuracy": 0.47228995788073597, + "step": 13854 + }, + { + "epoch": 2.568594734890619, + "grad_norm": 8.0546875, + "learning_rate": 7.431405265109381e-06, + "loss": 3.0315, + "mean_token_accuracy": 0.4684123025768911, + "step": 13855 + }, + { + "epoch": 2.5687801260659993, + "grad_norm": 7.74609375, + "learning_rate": 7.431219873934002e-06, + "loss": 3.5044, + "mean_token_accuracy": 0.4502816556453588, + "step": 13856 + }, + { + "epoch": 2.5689655172413794, + "grad_norm": 11.7265625, + "learning_rate": 7.431034482758621e-06, + "loss": 2.8327, + "mean_token_accuracy": 0.46732922092757595, + "step": 13857 + }, + { + "epoch": 2.569150908416759, + "grad_norm": 11.7421875, + "learning_rate": 7.430849091583241e-06, + "loss": 3.2985, + "mean_token_accuracy": 0.45017128620367486, + "step": 13858 + }, + { + "epoch": 2.5693362995921394, + "grad_norm": 7.75, + "learning_rate": 7.430663700407861e-06, + "loss": 3.0737, + "mean_token_accuracy": 0.42348837209302326, + "step": 13859 + }, + { + "epoch": 2.5695216907675196, + "grad_norm": 8.4140625, + "learning_rate": 7.43047830923248e-06, + "loss": 4.3692, + "mean_token_accuracy": 0.40856465336867065, + "step": 13860 + }, + { + "epoch": 2.5697070819428998, + "grad_norm": 6.265625, + "learning_rate": 7.430292918057102e-06, + "loss": 2.6703, + "mean_token_accuracy": 0.4928308070462925, + "step": 13861 + }, + { + "epoch": 2.5698924731182795, + "grad_norm": 7.921875, + "learning_rate": 7.430107526881721e-06, + "loss": 2.4017, + "mean_token_accuracy": 0.5153974695631416, + "step": 13862 + }, + { + "epoch": 2.5700778642936597, + "grad_norm": 10.4296875, + "learning_rate": 7.429922135706341e-06, + "loss": 2.8629, + "mean_token_accuracy": 0.47024504084014, + "step": 13863 + }, + { + "epoch": 2.5702632554690394, + "grad_norm": 6.29296875, + "learning_rate": 7.4297367445309606e-06, + "loss": 2.7099, + "mean_token_accuracy": 0.495089614534741, + "step": 13864 + }, + { + "epoch": 2.5704486466444196, + "grad_norm": 10.5078125, + "learning_rate": 7.429551353355581e-06, + "loss": 3.4971, + "mean_token_accuracy": 0.43264825238905613, + "step": 13865 + }, + { + "epoch": 2.5706340378198, + "grad_norm": 9.8671875, + "learning_rate": 7.429365962180201e-06, + "loss": 2.6106, + "mean_token_accuracy": 0.5007216742843397, + "step": 13866 + }, + { + "epoch": 2.57081942899518, + "grad_norm": 6.40625, + "learning_rate": 7.42918057100482e-06, + "loss": 2.9911, + "mean_token_accuracy": 0.4704772475027747, + "step": 13867 + }, + { + "epoch": 2.5710048201705598, + "grad_norm": 9.1328125, + "learning_rate": 7.42899517982944e-06, + "loss": 2.4868, + "mean_token_accuracy": 0.5120310183601323, + "step": 13868 + }, + { + "epoch": 2.57119021134594, + "grad_norm": 7.05078125, + "learning_rate": 7.428809788654061e-06, + "loss": 2.8921, + "mean_token_accuracy": 0.4736093327642623, + "step": 13869 + }, + { + "epoch": 2.57137560252132, + "grad_norm": 8.2265625, + "learning_rate": 7.428624397478681e-06, + "loss": 2.7207, + "mean_token_accuracy": 0.4735373268214891, + "step": 13870 + }, + { + "epoch": 2.5715609936967, + "grad_norm": 9.453125, + "learning_rate": 7.428439006303301e-06, + "loss": 3.2834, + "mean_token_accuracy": 0.4847742922723795, + "step": 13871 + }, + { + "epoch": 2.57174638487208, + "grad_norm": 6.8359375, + "learning_rate": 7.42825361512792e-06, + "loss": 2.4725, + "mean_token_accuracy": 0.5376717281272596, + "step": 13872 + }, + { + "epoch": 2.5719317760474603, + "grad_norm": 6.85546875, + "learning_rate": 7.428068223952541e-06, + "loss": 3.1751, + "mean_token_accuracy": 0.4665497707040734, + "step": 13873 + }, + { + "epoch": 2.5721171672228405, + "grad_norm": 6.796875, + "learning_rate": 7.42788283277716e-06, + "loss": 3.0615, + "mean_token_accuracy": 0.44154147615937295, + "step": 13874 + }, + { + "epoch": 2.57230255839822, + "grad_norm": 9.3984375, + "learning_rate": 7.42769744160178e-06, + "loss": 3.0767, + "mean_token_accuracy": 0.481562099871959, + "step": 13875 + }, + { + "epoch": 2.5724879495736004, + "grad_norm": 7.0859375, + "learning_rate": 7.4275120504264e-06, + "loss": 3.4327, + "mean_token_accuracy": 0.4141820212171971, + "step": 13876 + }, + { + "epoch": 2.57267334074898, + "grad_norm": 8.4453125, + "learning_rate": 7.427326659251021e-06, + "loss": 3.6529, + "mean_token_accuracy": 0.45190895741556536, + "step": 13877 + }, + { + "epoch": 2.5728587319243603, + "grad_norm": 8.765625, + "learning_rate": 7.427141268075641e-06, + "loss": 2.7644, + "mean_token_accuracy": 0.4810219874708267, + "step": 13878 + }, + { + "epoch": 2.5730441230997405, + "grad_norm": 7.65234375, + "learning_rate": 7.42695587690026e-06, + "loss": 2.5066, + "mean_token_accuracy": 0.5161910394795209, + "step": 13879 + }, + { + "epoch": 2.5732295142751207, + "grad_norm": 7.64453125, + "learning_rate": 7.42677048572488e-06, + "loss": 2.7141, + "mean_token_accuracy": 0.4954978467962939, + "step": 13880 + }, + { + "epoch": 2.5734149054505004, + "grad_norm": 9.7265625, + "learning_rate": 7.4265850945495e-06, + "loss": 3.3356, + "mean_token_accuracy": 0.4327260571619589, + "step": 13881 + }, + { + "epoch": 2.5736002966258806, + "grad_norm": 8.7265625, + "learning_rate": 7.42639970337412e-06, + "loss": 3.0133, + "mean_token_accuracy": 0.4539687703318152, + "step": 13882 + }, + { + "epoch": 2.573785687801261, + "grad_norm": 7.47265625, + "learning_rate": 7.42621431219874e-06, + "loss": 2.8698, + "mean_token_accuracy": 0.4681172911333488, + "step": 13883 + }, + { + "epoch": 2.5739710789766406, + "grad_norm": 7.625, + "learning_rate": 7.426028921023359e-06, + "loss": 2.883, + "mean_token_accuracy": 0.5178223336118073, + "step": 13884 + }, + { + "epoch": 2.5741564701520208, + "grad_norm": 8.25, + "learning_rate": 7.42584352984798e-06, + "loss": 2.9943, + "mean_token_accuracy": 0.5128030044383749, + "step": 13885 + }, + { + "epoch": 2.574341861327401, + "grad_norm": 8.359375, + "learning_rate": 7.4256581386726e-06, + "loss": 2.9865, + "mean_token_accuracy": 0.47829816435066047, + "step": 13886 + }, + { + "epoch": 2.574527252502781, + "grad_norm": 7.46484375, + "learning_rate": 7.42547274749722e-06, + "loss": 3.2705, + "mean_token_accuracy": 0.4473460463986396, + "step": 13887 + }, + { + "epoch": 2.574712643678161, + "grad_norm": 7.21875, + "learning_rate": 7.42528735632184e-06, + "loss": 2.7705, + "mean_token_accuracy": 0.4973309608540925, + "step": 13888 + }, + { + "epoch": 2.574898034853541, + "grad_norm": 6.9453125, + "learning_rate": 7.425101965146459e-06, + "loss": 2.9913, + "mean_token_accuracy": 0.4772191185599007, + "step": 13889 + }, + { + "epoch": 2.575083426028921, + "grad_norm": 7.84765625, + "learning_rate": 7.424916573971079e-06, + "loss": 3.0265, + "mean_token_accuracy": 0.4748159057437408, + "step": 13890 + }, + { + "epoch": 2.575268817204301, + "grad_norm": 7.66796875, + "learning_rate": 7.4247311827956994e-06, + "loss": 2.9358, + "mean_token_accuracy": 0.48886378308586187, + "step": 13891 + }, + { + "epoch": 2.575454208379681, + "grad_norm": 8.53125, + "learning_rate": 7.424545791620319e-06, + "loss": 2.9368, + "mean_token_accuracy": 0.48583494633116314, + "step": 13892 + }, + { + "epoch": 2.5756395995550614, + "grad_norm": 7.5390625, + "learning_rate": 7.4243604004449396e-06, + "loss": 2.9243, + "mean_token_accuracy": 0.4774090853434554, + "step": 13893 + }, + { + "epoch": 2.575824990730441, + "grad_norm": 9.2734375, + "learning_rate": 7.42417500926956e-06, + "loss": 1.7282, + "mean_token_accuracy": 0.6059786110037366, + "step": 13894 + }, + { + "epoch": 2.5760103819058213, + "grad_norm": 9.1484375, + "learning_rate": 7.42398961809418e-06, + "loss": 2.8561, + "mean_token_accuracy": 0.4573792111257643, + "step": 13895 + }, + { + "epoch": 2.576195773081201, + "grad_norm": 7.16796875, + "learning_rate": 7.423804226918799e-06, + "loss": 2.9652, + "mean_token_accuracy": 0.47495727619298017, + "step": 13896 + }, + { + "epoch": 2.5763811642565813, + "grad_norm": 7.296875, + "learning_rate": 7.423618835743419e-06, + "loss": 3.5156, + "mean_token_accuracy": 0.4018526687251875, + "step": 13897 + }, + { + "epoch": 2.5765665554319614, + "grad_norm": 7.56640625, + "learning_rate": 7.423433444568039e-06, + "loss": 3.0222, + "mean_token_accuracy": 0.46008907159986295, + "step": 13898 + }, + { + "epoch": 2.5767519466073416, + "grad_norm": 6.81640625, + "learning_rate": 7.423248053392659e-06, + "loss": 3.1452, + "mean_token_accuracy": 0.45694078491194007, + "step": 13899 + }, + { + "epoch": 2.5769373377827214, + "grad_norm": 10.6640625, + "learning_rate": 7.423062662217279e-06, + "loss": 2.6061, + "mean_token_accuracy": 0.4940143655227454, + "step": 13900 + }, + { + "epoch": 2.5771227289581016, + "grad_norm": 7.5625, + "learning_rate": 7.422877271041899e-06, + "loss": 3.0893, + "mean_token_accuracy": 0.47484358706986446, + "step": 13901 + }, + { + "epoch": 2.5773081201334818, + "grad_norm": 8.4140625, + "learning_rate": 7.422691879866519e-06, + "loss": 2.9624, + "mean_token_accuracy": 0.47853681052388647, + "step": 13902 + }, + { + "epoch": 2.5774935113088615, + "grad_norm": 7.3515625, + "learning_rate": 7.422506488691139e-06, + "loss": 3.4662, + "mean_token_accuracy": 0.4620510921880785, + "step": 13903 + }, + { + "epoch": 2.5776789024842417, + "grad_norm": 6.73046875, + "learning_rate": 7.422321097515759e-06, + "loss": 3.3028, + "mean_token_accuracy": 0.44192997438087106, + "step": 13904 + }, + { + "epoch": 2.577864293659622, + "grad_norm": 11.703125, + "learning_rate": 7.422135706340379e-06, + "loss": 2.8506, + "mean_token_accuracy": 0.4718213420415043, + "step": 13905 + }, + { + "epoch": 2.578049684835002, + "grad_norm": 8.3828125, + "learning_rate": 7.421950315164998e-06, + "loss": 2.9878, + "mean_token_accuracy": 0.46676036542515814, + "step": 13906 + }, + { + "epoch": 2.578235076010382, + "grad_norm": 10.390625, + "learning_rate": 7.421764923989618e-06, + "loss": 2.4272, + "mean_token_accuracy": 0.5113495200451722, + "step": 13907 + }, + { + "epoch": 2.578420467185762, + "grad_norm": 9.8671875, + "learning_rate": 7.4215795328142385e-06, + "loss": 2.5407, + "mean_token_accuracy": 0.5048792388387412, + "step": 13908 + }, + { + "epoch": 2.5786058583611418, + "grad_norm": 11.8828125, + "learning_rate": 7.421394141638859e-06, + "loss": 2.7853, + "mean_token_accuracy": 0.4998896490840874, + "step": 13909 + }, + { + "epoch": 2.578791249536522, + "grad_norm": 8.453125, + "learning_rate": 7.421208750463479e-06, + "loss": 2.8793, + "mean_token_accuracy": 0.47957017776598565, + "step": 13910 + }, + { + "epoch": 2.578976640711902, + "grad_norm": 12.171875, + "learning_rate": 7.421023359288099e-06, + "loss": 2.9128, + "mean_token_accuracy": 0.5126591541050783, + "step": 13911 + }, + { + "epoch": 2.5791620318872823, + "grad_norm": 12.1015625, + "learning_rate": 7.420837968112719e-06, + "loss": 2.7874, + "mean_token_accuracy": 0.46401485838105555, + "step": 13912 + }, + { + "epoch": 2.579347423062662, + "grad_norm": 7.76171875, + "learning_rate": 7.420652576937338e-06, + "loss": 2.5907, + "mean_token_accuracy": 0.4966266437964551, + "step": 13913 + }, + { + "epoch": 2.5795328142380423, + "grad_norm": 10.390625, + "learning_rate": 7.420467185761958e-06, + "loss": 3.5395, + "mean_token_accuracy": 0.441124212566629, + "step": 13914 + }, + { + "epoch": 2.5797182054134224, + "grad_norm": 11.359375, + "learning_rate": 7.420281794586578e-06, + "loss": 3.0276, + "mean_token_accuracy": 0.4985744390727656, + "step": 13915 + }, + { + "epoch": 2.579903596588802, + "grad_norm": 9.25, + "learning_rate": 7.420096403411198e-06, + "loss": 2.2037, + "mean_token_accuracy": 0.5579320299173337, + "step": 13916 + }, + { + "epoch": 2.5800889877641824, + "grad_norm": 8.875, + "learning_rate": 7.419911012235819e-06, + "loss": 3.328, + "mean_token_accuracy": 0.4683205217796413, + "step": 13917 + }, + { + "epoch": 2.5802743789395626, + "grad_norm": 14.078125, + "learning_rate": 7.419725621060438e-06, + "loss": 3.0099, + "mean_token_accuracy": 0.48199910554561715, + "step": 13918 + }, + { + "epoch": 2.5804597701149428, + "grad_norm": 16.5, + "learning_rate": 7.419540229885058e-06, + "loss": 2.81, + "mean_token_accuracy": 0.4898307148946535, + "step": 13919 + }, + { + "epoch": 2.5806451612903225, + "grad_norm": 8.703125, + "learning_rate": 7.4193548387096784e-06, + "loss": 2.3953, + "mean_token_accuracy": 0.5180497468633062, + "step": 13920 + }, + { + "epoch": 2.5808305524657027, + "grad_norm": 10.828125, + "learning_rate": 7.419169447534298e-06, + "loss": 2.4448, + "mean_token_accuracy": 0.5023769907297362, + "step": 13921 + }, + { + "epoch": 2.5810159436410824, + "grad_norm": 15.8828125, + "learning_rate": 7.418984056358918e-06, + "loss": 2.6862, + "mean_token_accuracy": 0.48659694674840726, + "step": 13922 + }, + { + "epoch": 2.5812013348164626, + "grad_norm": 10.7421875, + "learning_rate": 7.418798665183537e-06, + "loss": 3.193, + "mean_token_accuracy": 0.4764187757832534, + "step": 13923 + }, + { + "epoch": 2.581386725991843, + "grad_norm": 6.53125, + "learning_rate": 7.418613274008157e-06, + "loss": 2.8806, + "mean_token_accuracy": 0.5219277108433735, + "step": 13924 + }, + { + "epoch": 2.581572117167223, + "grad_norm": 12.9921875, + "learning_rate": 7.418427882832778e-06, + "loss": 3.1153, + "mean_token_accuracy": 0.4549132947976879, + "step": 13925 + }, + { + "epoch": 2.5817575083426028, + "grad_norm": 12.8984375, + "learning_rate": 7.418242491657398e-06, + "loss": 3.5043, + "mean_token_accuracy": 0.43431442928930364, + "step": 13926 + }, + { + "epoch": 2.581942899517983, + "grad_norm": 12.96875, + "learning_rate": 7.418057100482018e-06, + "loss": 2.4754, + "mean_token_accuracy": 0.5062525211778943, + "step": 13927 + }, + { + "epoch": 2.582128290693363, + "grad_norm": 8.7890625, + "learning_rate": 7.417871709306637e-06, + "loss": 2.3015, + "mean_token_accuracy": 0.5627637130801688, + "step": 13928 + }, + { + "epoch": 2.582313681868743, + "grad_norm": 14.3515625, + "learning_rate": 7.417686318131258e-06, + "loss": 2.551, + "mean_token_accuracy": 0.5056388483481491, + "step": 13929 + }, + { + "epoch": 2.582499073044123, + "grad_norm": 12.796875, + "learning_rate": 7.417500926955877e-06, + "loss": 2.8666, + "mean_token_accuracy": 0.4835680751173709, + "step": 13930 + }, + { + "epoch": 2.5826844642195033, + "grad_norm": 7.375, + "learning_rate": 7.417315535780497e-06, + "loss": 3.738, + "mean_token_accuracy": 0.41615109084988605, + "step": 13931 + }, + { + "epoch": 2.5828698553948835, + "grad_norm": 11.9609375, + "learning_rate": 7.417130144605117e-06, + "loss": 2.7118, + "mean_token_accuracy": 0.49779601763185893, + "step": 13932 + }, + { + "epoch": 2.583055246570263, + "grad_norm": 16.25, + "learning_rate": 7.416944753429738e-06, + "loss": 2.5849, + "mean_token_accuracy": 0.5102974828375286, + "step": 13933 + }, + { + "epoch": 2.5832406377456434, + "grad_norm": 7.75390625, + "learning_rate": 7.416759362254358e-06, + "loss": 2.7829, + "mean_token_accuracy": 0.4915697674418605, + "step": 13934 + }, + { + "epoch": 2.583426028921023, + "grad_norm": 7.75, + "learning_rate": 7.416573971078977e-06, + "loss": 3.2192, + "mean_token_accuracy": 0.4704918032786885, + "step": 13935 + }, + { + "epoch": 2.5836114200964033, + "grad_norm": 7.7265625, + "learning_rate": 7.416388579903597e-06, + "loss": 2.1098, + "mean_token_accuracy": 0.5558608058608059, + "step": 13936 + }, + { + "epoch": 2.5837968112717835, + "grad_norm": 8.703125, + "learning_rate": 7.4162031887282175e-06, + "loss": 3.5302, + "mean_token_accuracy": 0.42971204188481676, + "step": 13937 + }, + { + "epoch": 2.5839822024471637, + "grad_norm": 8.515625, + "learning_rate": 7.416017797552837e-06, + "loss": 3.2424, + "mean_token_accuracy": 0.4374258600237248, + "step": 13938 + }, + { + "epoch": 2.5841675936225434, + "grad_norm": 7.21875, + "learning_rate": 7.415832406377457e-06, + "loss": 2.4694, + "mean_token_accuracy": 0.5230387868183144, + "step": 13939 + }, + { + "epoch": 2.5843529847979236, + "grad_norm": 7.1875, + "learning_rate": 7.415647015202076e-06, + "loss": 2.5194, + "mean_token_accuracy": 0.5006126616746086, + "step": 13940 + }, + { + "epoch": 2.584538375973304, + "grad_norm": 7.5625, + "learning_rate": 7.415461624026698e-06, + "loss": 3.7236, + "mean_token_accuracy": 0.432829992189534, + "step": 13941 + }, + { + "epoch": 2.5847237671486836, + "grad_norm": 7.828125, + "learning_rate": 7.415276232851317e-06, + "loss": 2.9918, + "mean_token_accuracy": 0.5079330342488237, + "step": 13942 + }, + { + "epoch": 2.5849091583240638, + "grad_norm": 8.953125, + "learning_rate": 7.415090841675937e-06, + "loss": 2.8803, + "mean_token_accuracy": 0.47976111479761113, + "step": 13943 + }, + { + "epoch": 2.585094549499444, + "grad_norm": 6.5390625, + "learning_rate": 7.414905450500557e-06, + "loss": 2.3202, + "mean_token_accuracy": 0.5530612244897959, + "step": 13944 + }, + { + "epoch": 2.585279940674824, + "grad_norm": 8.4140625, + "learning_rate": 7.414720059325176e-06, + "loss": 2.7998, + "mean_token_accuracy": 0.5053997923156802, + "step": 13945 + }, + { + "epoch": 2.585465331850204, + "grad_norm": 6.9453125, + "learning_rate": 7.414534668149797e-06, + "loss": 3.0802, + "mean_token_accuracy": 0.4533551554828151, + "step": 13946 + }, + { + "epoch": 2.585650723025584, + "grad_norm": 7.74609375, + "learning_rate": 7.4143492769744165e-06, + "loss": 2.8331, + "mean_token_accuracy": 0.48632218844984804, + "step": 13947 + }, + { + "epoch": 2.585836114200964, + "grad_norm": 6.69140625, + "learning_rate": 7.414163885799036e-06, + "loss": 2.6185, + "mean_token_accuracy": 0.4991015274034142, + "step": 13948 + }, + { + "epoch": 2.586021505376344, + "grad_norm": 6.7421875, + "learning_rate": 7.4139784946236574e-06, + "loss": 2.8337, + "mean_token_accuracy": 0.4729305363647783, + "step": 13949 + }, + { + "epoch": 2.586206896551724, + "grad_norm": 8.5078125, + "learning_rate": 7.413793103448277e-06, + "loss": 2.951, + "mean_token_accuracy": 0.45045170257123, + "step": 13950 + }, + { + "epoch": 2.5863922877271044, + "grad_norm": 7.421875, + "learning_rate": 7.413607712272897e-06, + "loss": 2.8925, + "mean_token_accuracy": 0.46608803471791693, + "step": 13951 + }, + { + "epoch": 2.586577678902484, + "grad_norm": 7.984375, + "learning_rate": 7.413422321097516e-06, + "loss": 2.986, + "mean_token_accuracy": 0.48396024657189585, + "step": 13952 + }, + { + "epoch": 2.5867630700778643, + "grad_norm": 7.328125, + "learning_rate": 7.413236929922136e-06, + "loss": 2.6442, + "mean_token_accuracy": 0.5099665551839465, + "step": 13953 + }, + { + "epoch": 2.586948461253244, + "grad_norm": 6.9765625, + "learning_rate": 7.4130515387467565e-06, + "loss": 2.658, + "mean_token_accuracy": 0.4752606321363561, + "step": 13954 + }, + { + "epoch": 2.5871338524286243, + "grad_norm": 7.8125, + "learning_rate": 7.412866147571376e-06, + "loss": 2.597, + "mean_token_accuracy": 0.49253030160668987, + "step": 13955 + }, + { + "epoch": 2.5873192436040044, + "grad_norm": 7.35546875, + "learning_rate": 7.412680756395996e-06, + "loss": 2.4636, + "mean_token_accuracy": 0.5244667503136763, + "step": 13956 + }, + { + "epoch": 2.5875046347793846, + "grad_norm": 9.0234375, + "learning_rate": 7.412495365220616e-06, + "loss": 3.1874, + "mean_token_accuracy": 0.45848168140576545, + "step": 13957 + }, + { + "epoch": 2.587690025954765, + "grad_norm": 8.5234375, + "learning_rate": 7.412309974045237e-06, + "loss": 3.0442, + "mean_token_accuracy": 0.47606863335340155, + "step": 13958 + }, + { + "epoch": 2.5878754171301446, + "grad_norm": 8.984375, + "learning_rate": 7.412124582869856e-06, + "loss": 3.7709, + "mean_token_accuracy": 0.4407072587077905, + "step": 13959 + }, + { + "epoch": 2.5880608083055248, + "grad_norm": 9.9140625, + "learning_rate": 7.411939191694476e-06, + "loss": 2.5997, + "mean_token_accuracy": 0.5048709847288047, + "step": 13960 + }, + { + "epoch": 2.5882461994809045, + "grad_norm": 8.21875, + "learning_rate": 7.411753800519096e-06, + "loss": 3.2985, + "mean_token_accuracy": 0.47121732241880115, + "step": 13961 + }, + { + "epoch": 2.5884315906562847, + "grad_norm": 6.59375, + "learning_rate": 7.411568409343715e-06, + "loss": 2.8897, + "mean_token_accuracy": 0.4830540746382331, + "step": 13962 + }, + { + "epoch": 2.588616981831665, + "grad_norm": 7.4453125, + "learning_rate": 7.411383018168336e-06, + "loss": 3.1365, + "mean_token_accuracy": 0.47453310696095075, + "step": 13963 + }, + { + "epoch": 2.588802373007045, + "grad_norm": 7.4921875, + "learning_rate": 7.4111976269929555e-06, + "loss": 2.2326, + "mean_token_accuracy": 0.5432654141307224, + "step": 13964 + }, + { + "epoch": 2.588987764182425, + "grad_norm": 6.8515625, + "learning_rate": 7.411012235817576e-06, + "loss": 3.1662, + "mean_token_accuracy": 0.44380853277835586, + "step": 13965 + }, + { + "epoch": 2.589173155357805, + "grad_norm": 9.234375, + "learning_rate": 7.410826844642196e-06, + "loss": 2.869, + "mean_token_accuracy": 0.48666568439664065, + "step": 13966 + }, + { + "epoch": 2.5893585465331848, + "grad_norm": 7.44921875, + "learning_rate": 7.410641453466816e-06, + "loss": 2.7601, + "mean_token_accuracy": 0.5128174697365298, + "step": 13967 + }, + { + "epoch": 2.589543937708565, + "grad_norm": 7.42578125, + "learning_rate": 7.410456062291436e-06, + "loss": 2.3941, + "mean_token_accuracy": 0.5070015879890284, + "step": 13968 + }, + { + "epoch": 2.589729328883945, + "grad_norm": 7.734375, + "learning_rate": 7.410270671116055e-06, + "loss": 2.5153, + "mean_token_accuracy": 0.520937813440321, + "step": 13969 + }, + { + "epoch": 2.5899147200593253, + "grad_norm": 8.3203125, + "learning_rate": 7.410085279940675e-06, + "loss": 2.6845, + "mean_token_accuracy": 0.49912556838055266, + "step": 13970 + }, + { + "epoch": 2.590100111234705, + "grad_norm": 8.671875, + "learning_rate": 7.409899888765295e-06, + "loss": 2.7338, + "mean_token_accuracy": 0.5214756967820116, + "step": 13971 + }, + { + "epoch": 2.5902855024100853, + "grad_norm": 9.453125, + "learning_rate": 7.409714497589915e-06, + "loss": 3.0871, + "mean_token_accuracy": 0.46781163434903045, + "step": 13972 + }, + { + "epoch": 2.5904708935854655, + "grad_norm": 10.265625, + "learning_rate": 7.409529106414536e-06, + "loss": 2.7718, + "mean_token_accuracy": 0.511468204916907, + "step": 13973 + }, + { + "epoch": 2.590656284760845, + "grad_norm": 8.4765625, + "learning_rate": 7.409343715239155e-06, + "loss": 2.5169, + "mean_token_accuracy": 0.5114715189873418, + "step": 13974 + }, + { + "epoch": 2.5908416759362254, + "grad_norm": 7.74609375, + "learning_rate": 7.409158324063776e-06, + "loss": 3.3568, + "mean_token_accuracy": 0.444529262086514, + "step": 13975 + }, + { + "epoch": 2.5910270671116056, + "grad_norm": 10.8828125, + "learning_rate": 7.4089729328883955e-06, + "loss": 2.368, + "mean_token_accuracy": 0.5223726627981947, + "step": 13976 + }, + { + "epoch": 2.5912124582869858, + "grad_norm": 7.5234375, + "learning_rate": 7.408787541713015e-06, + "loss": 2.3371, + "mean_token_accuracy": 0.5459699833240689, + "step": 13977 + }, + { + "epoch": 2.5913978494623655, + "grad_norm": 7.89453125, + "learning_rate": 7.408602150537635e-06, + "loss": 2.256, + "mean_token_accuracy": 0.5705033058966401, + "step": 13978 + }, + { + "epoch": 2.5915832406377457, + "grad_norm": 10.109375, + "learning_rate": 7.408416759362254e-06, + "loss": 3.1308, + "mean_token_accuracy": 0.43333333333333335, + "step": 13979 + }, + { + "epoch": 2.5917686318131254, + "grad_norm": 11.203125, + "learning_rate": 7.408231368186875e-06, + "loss": 2.8321, + "mean_token_accuracy": 0.4740951029098652, + "step": 13980 + }, + { + "epoch": 2.5919540229885056, + "grad_norm": 8.0625, + "learning_rate": 7.4080459770114945e-06, + "loss": 2.9113, + "mean_token_accuracy": 0.46781901372648704, + "step": 13981 + }, + { + "epoch": 2.592139414163886, + "grad_norm": 8.984375, + "learning_rate": 7.407860585836115e-06, + "loss": 2.8399, + "mean_token_accuracy": 0.48662827895073574, + "step": 13982 + }, + { + "epoch": 2.592324805339266, + "grad_norm": 12.453125, + "learning_rate": 7.407675194660735e-06, + "loss": 2.8819, + "mean_token_accuracy": 0.4713804713804714, + "step": 13983 + }, + { + "epoch": 2.5925101965146458, + "grad_norm": 8.9765625, + "learning_rate": 7.407489803485355e-06, + "loss": 2.6188, + "mean_token_accuracy": 0.4950351174618552, + "step": 13984 + }, + { + "epoch": 2.592695587690026, + "grad_norm": 9.6875, + "learning_rate": 7.407304412309975e-06, + "loss": 2.9166, + "mean_token_accuracy": 0.4851043865822191, + "step": 13985 + }, + { + "epoch": 2.592880978865406, + "grad_norm": 10.671875, + "learning_rate": 7.4071190211345944e-06, + "loss": 2.4716, + "mean_token_accuracy": 0.5270096463022508, + "step": 13986 + }, + { + "epoch": 2.593066370040786, + "grad_norm": 7.70703125, + "learning_rate": 7.406933629959214e-06, + "loss": 3.4771, + "mean_token_accuracy": 0.4183262209577999, + "step": 13987 + }, + { + "epoch": 2.593251761216166, + "grad_norm": 7.5703125, + "learning_rate": 7.406748238783834e-06, + "loss": 2.8169, + "mean_token_accuracy": 0.4788005803643398, + "step": 13988 + }, + { + "epoch": 2.5934371523915463, + "grad_norm": 13.78125, + "learning_rate": 7.406562847608454e-06, + "loss": 3.0382, + "mean_token_accuracy": 0.4695856137607506, + "step": 13989 + }, + { + "epoch": 2.5936225435669265, + "grad_norm": 14.40625, + "learning_rate": 7.406377456433075e-06, + "loss": 2.449, + "mean_token_accuracy": 0.5082574031890661, + "step": 13990 + }, + { + "epoch": 2.593807934742306, + "grad_norm": 11.8828125, + "learning_rate": 7.406192065257694e-06, + "loss": 2.6672, + "mean_token_accuracy": 0.4993322451402285, + "step": 13991 + }, + { + "epoch": 2.5939933259176864, + "grad_norm": 8.2265625, + "learning_rate": 7.406006674082315e-06, + "loss": 2.9562, + "mean_token_accuracy": 0.4866153846153846, + "step": 13992 + }, + { + "epoch": 2.594178717093066, + "grad_norm": 9.7421875, + "learning_rate": 7.4058212829069345e-06, + "loss": 3.907, + "mean_token_accuracy": 0.39424420229114276, + "step": 13993 + }, + { + "epoch": 2.5943641082684463, + "grad_norm": 12.3046875, + "learning_rate": 7.405635891731554e-06, + "loss": 2.9289, + "mean_token_accuracy": 0.47823428711176325, + "step": 13994 + }, + { + "epoch": 2.5945494994438265, + "grad_norm": 7.328125, + "learning_rate": 7.405450500556174e-06, + "loss": 2.8235, + "mean_token_accuracy": 0.46825521241573914, + "step": 13995 + }, + { + "epoch": 2.5947348906192067, + "grad_norm": 9.90625, + "learning_rate": 7.4052651093807934e-06, + "loss": 2.6526, + "mean_token_accuracy": 0.5003327787021631, + "step": 13996 + }, + { + "epoch": 2.5949202817945864, + "grad_norm": 11.5, + "learning_rate": 7.405079718205414e-06, + "loss": 3.2845, + "mean_token_accuracy": 0.4846711614298435, + "step": 13997 + }, + { + "epoch": 2.5951056729699666, + "grad_norm": 13.0703125, + "learning_rate": 7.404894327030034e-06, + "loss": 2.832, + "mean_token_accuracy": 0.485972850678733, + "step": 13998 + }, + { + "epoch": 2.595291064145347, + "grad_norm": 7.4609375, + "learning_rate": 7.404708935854654e-06, + "loss": 2.9016, + "mean_token_accuracy": 0.4855574812247256, + "step": 13999 + }, + { + "epoch": 2.5954764553207266, + "grad_norm": 10.265625, + "learning_rate": 7.404523544679274e-06, + "loss": 2.9818, + "mean_token_accuracy": 0.47647538754419366, + "step": 14000 + }, + { + "epoch": 2.5956618464961068, + "grad_norm": 10.796875, + "learning_rate": 7.404338153503894e-06, + "loss": 3.0895, + "mean_token_accuracy": 0.47282847517211074, + "step": 14001 + }, + { + "epoch": 2.595847237671487, + "grad_norm": 8.6015625, + "learning_rate": 7.404152762328514e-06, + "loss": 2.5182, + "mean_token_accuracy": 0.5162206607863656, + "step": 14002 + }, + { + "epoch": 2.596032628846867, + "grad_norm": 6.34765625, + "learning_rate": 7.4039673711531335e-06, + "loss": 2.4964, + "mean_token_accuracy": 0.5136678906568748, + "step": 14003 + }, + { + "epoch": 2.596218020022247, + "grad_norm": 12.3515625, + "learning_rate": 7.403781979977753e-06, + "loss": 3.1507, + "mean_token_accuracy": 0.48750215480089637, + "step": 14004 + }, + { + "epoch": 2.596403411197627, + "grad_norm": 10.3515625, + "learning_rate": 7.403596588802373e-06, + "loss": 3.2799, + "mean_token_accuracy": 0.45151093690732075, + "step": 14005 + }, + { + "epoch": 2.596588802373007, + "grad_norm": 10.5390625, + "learning_rate": 7.403411197626994e-06, + "loss": 3.4013, + "mean_token_accuracy": 0.4314648409641991, + "step": 14006 + }, + { + "epoch": 2.596774193548387, + "grad_norm": 8.890625, + "learning_rate": 7.403225806451614e-06, + "loss": 3.4462, + "mean_token_accuracy": 0.4637015781922525, + "step": 14007 + }, + { + "epoch": 2.596959584723767, + "grad_norm": 7.609375, + "learning_rate": 7.403040415276233e-06, + "loss": 3.0131, + "mean_token_accuracy": 0.5031438935912939, + "step": 14008 + }, + { + "epoch": 2.5971449758991474, + "grad_norm": 12.3046875, + "learning_rate": 7.402855024100853e-06, + "loss": 2.436, + "mean_token_accuracy": 0.5123674911660777, + "step": 14009 + }, + { + "epoch": 2.597330367074527, + "grad_norm": 12.1171875, + "learning_rate": 7.4026696329254735e-06, + "loss": 2.6155, + "mean_token_accuracy": 0.4810971089696071, + "step": 14010 + }, + { + "epoch": 2.5975157582499073, + "grad_norm": 7.21484375, + "learning_rate": 7.402484241750093e-06, + "loss": 2.4449, + "mean_token_accuracy": 0.51972401379931, + "step": 14011 + }, + { + "epoch": 2.5977011494252875, + "grad_norm": 10.1796875, + "learning_rate": 7.402298850574713e-06, + "loss": 2.8226, + "mean_token_accuracy": 0.48535980148883373, + "step": 14012 + }, + { + "epoch": 2.5978865406006673, + "grad_norm": 13.8203125, + "learning_rate": 7.4021134593993325e-06, + "loss": 3.3392, + "mean_token_accuracy": 0.4419678036699313, + "step": 14013 + }, + { + "epoch": 2.5980719317760474, + "grad_norm": 11.96875, + "learning_rate": 7.401928068223954e-06, + "loss": 2.7862, + "mean_token_accuracy": 0.4786916557240062, + "step": 14014 + }, + { + "epoch": 2.5982573229514276, + "grad_norm": 8.6484375, + "learning_rate": 7.4017426770485734e-06, + "loss": 3.0507, + "mean_token_accuracy": 0.4789346802892894, + "step": 14015 + }, + { + "epoch": 2.598442714126808, + "grad_norm": 10.203125, + "learning_rate": 7.401557285873193e-06, + "loss": 3.0694, + "mean_token_accuracy": 0.465343347639485, + "step": 14016 + }, + { + "epoch": 2.5986281053021876, + "grad_norm": 13.859375, + "learning_rate": 7.401371894697813e-06, + "loss": 3.0966, + "mean_token_accuracy": 0.47842153886762573, + "step": 14017 + }, + { + "epoch": 2.5988134964775678, + "grad_norm": 9.21875, + "learning_rate": 7.401186503522433e-06, + "loss": 3.2729, + "mean_token_accuracy": 0.47618315918869086, + "step": 14018 + }, + { + "epoch": 2.5989988876529475, + "grad_norm": 7.78125, + "learning_rate": 7.401001112347053e-06, + "loss": 3.1098, + "mean_token_accuracy": 0.44264356573983515, + "step": 14019 + }, + { + "epoch": 2.5991842788283277, + "grad_norm": 13.6171875, + "learning_rate": 7.4008157211716725e-06, + "loss": 2.397, + "mean_token_accuracy": 0.5088184590903063, + "step": 14020 + }, + { + "epoch": 2.599369670003708, + "grad_norm": 16.046875, + "learning_rate": 7.400630329996292e-06, + "loss": 2.4798, + "mean_token_accuracy": 0.510983629494893, + "step": 14021 + }, + { + "epoch": 2.599555061179088, + "grad_norm": 7.57421875, + "learning_rate": 7.4004449388209135e-06, + "loss": 3.3235, + "mean_token_accuracy": 0.4503038619878455, + "step": 14022 + }, + { + "epoch": 2.599740452354468, + "grad_norm": 7.359375, + "learning_rate": 7.400259547645533e-06, + "loss": 2.757, + "mean_token_accuracy": 0.48173076923076924, + "step": 14023 + }, + { + "epoch": 2.599925843529848, + "grad_norm": 10.515625, + "learning_rate": 7.400074156470153e-06, + "loss": 2.8751, + "mean_token_accuracy": 0.4902627107567638, + "step": 14024 + }, + { + "epoch": 2.6001112347052278, + "grad_norm": 8.2109375, + "learning_rate": 7.3998887652947724e-06, + "loss": 2.72, + "mean_token_accuracy": 0.5054747801112907, + "step": 14025 + }, + { + "epoch": 2.600296625880608, + "grad_norm": 7.0234375, + "learning_rate": 7.399703374119392e-06, + "loss": 3.0959, + "mean_token_accuracy": 0.4595917225950783, + "step": 14026 + }, + { + "epoch": 2.600482017055988, + "grad_norm": 7.29296875, + "learning_rate": 7.3995179829440126e-06, + "loss": 2.6463, + "mean_token_accuracy": 0.49626181407814923, + "step": 14027 + }, + { + "epoch": 2.6006674082313683, + "grad_norm": 8.53125, + "learning_rate": 7.399332591768632e-06, + "loss": 3.1928, + "mean_token_accuracy": 0.4684243565599498, + "step": 14028 + }, + { + "epoch": 2.600852799406748, + "grad_norm": 9.34375, + "learning_rate": 7.399147200593252e-06, + "loss": 2.3868, + "mean_token_accuracy": 0.5161152917790715, + "step": 14029 + }, + { + "epoch": 2.6010381905821283, + "grad_norm": 8.7265625, + "learning_rate": 7.398961809417873e-06, + "loss": 3.1542, + "mean_token_accuracy": 0.4465753424657534, + "step": 14030 + }, + { + "epoch": 2.6012235817575085, + "grad_norm": 8.9765625, + "learning_rate": 7.398776418242493e-06, + "loss": 3.0323, + "mean_token_accuracy": 0.48227725176013597, + "step": 14031 + }, + { + "epoch": 2.601408972932888, + "grad_norm": 9.0390625, + "learning_rate": 7.3985910270671125e-06, + "loss": 2.2046, + "mean_token_accuracy": 0.546595715650157, + "step": 14032 + }, + { + "epoch": 2.6015943641082684, + "grad_norm": 9.09375, + "learning_rate": 7.398405635891732e-06, + "loss": 2.7814, + "mean_token_accuracy": 0.4890779489537825, + "step": 14033 + }, + { + "epoch": 2.6017797552836486, + "grad_norm": 7.03125, + "learning_rate": 7.398220244716352e-06, + "loss": 3.1435, + "mean_token_accuracy": 0.4759066572381049, + "step": 14034 + }, + { + "epoch": 2.6019651464590288, + "grad_norm": 7.26171875, + "learning_rate": 7.398034853540972e-06, + "loss": 3.458, + "mean_token_accuracy": 0.41332504403688736, + "step": 14035 + }, + { + "epoch": 2.6021505376344085, + "grad_norm": 8.5390625, + "learning_rate": 7.397849462365592e-06, + "loss": 3.0594, + "mean_token_accuracy": 0.4686639497742042, + "step": 14036 + }, + { + "epoch": 2.6023359288097887, + "grad_norm": 7.28515625, + "learning_rate": 7.3976640711902115e-06, + "loss": 3.0607, + "mean_token_accuracy": 0.4781514830508475, + "step": 14037 + }, + { + "epoch": 2.6025213199851684, + "grad_norm": 9.8828125, + "learning_rate": 7.397478680014832e-06, + "loss": 3.0082, + "mean_token_accuracy": 0.48933431408606104, + "step": 14038 + }, + { + "epoch": 2.6027067111605486, + "grad_norm": 7.30859375, + "learning_rate": 7.3972932888394525e-06, + "loss": 3.2307, + "mean_token_accuracy": 0.4556137437731511, + "step": 14039 + }, + { + "epoch": 2.602892102335929, + "grad_norm": 8.0859375, + "learning_rate": 7.397107897664072e-06, + "loss": 2.8999, + "mean_token_accuracy": 0.48298091799896853, + "step": 14040 + }, + { + "epoch": 2.603077493511309, + "grad_norm": 9.0, + "learning_rate": 7.396922506488692e-06, + "loss": 3.0363, + "mean_token_accuracy": 0.48129145288449504, + "step": 14041 + }, + { + "epoch": 2.6032628846866888, + "grad_norm": 9.59375, + "learning_rate": 7.3967371153133115e-06, + "loss": 2.9399, + "mean_token_accuracy": 0.49319517476028457, + "step": 14042 + }, + { + "epoch": 2.603448275862069, + "grad_norm": 7.625, + "learning_rate": 7.396551724137931e-06, + "loss": 2.8305, + "mean_token_accuracy": 0.47251992305578455, + "step": 14043 + }, + { + "epoch": 2.603633667037449, + "grad_norm": 9.328125, + "learning_rate": 7.396366332962552e-06, + "loss": 3.2587, + "mean_token_accuracy": 0.45565980629539954, + "step": 14044 + }, + { + "epoch": 2.603819058212829, + "grad_norm": 8.1796875, + "learning_rate": 7.396180941787171e-06, + "loss": 3.3015, + "mean_token_accuracy": 0.4554052567909381, + "step": 14045 + }, + { + "epoch": 2.604004449388209, + "grad_norm": 9.25, + "learning_rate": 7.395995550611792e-06, + "loss": 3.0196, + "mean_token_accuracy": 0.4747941643796042, + "step": 14046 + }, + { + "epoch": 2.6041898405635893, + "grad_norm": 7.23828125, + "learning_rate": 7.395810159436411e-06, + "loss": 3.1163, + "mean_token_accuracy": 0.4877472322428163, + "step": 14047 + }, + { + "epoch": 2.6043752317389695, + "grad_norm": 6.984375, + "learning_rate": 7.395624768261032e-06, + "loss": 2.8238, + "mean_token_accuracy": 0.472082329756952, + "step": 14048 + }, + { + "epoch": 2.604560622914349, + "grad_norm": 8.4453125, + "learning_rate": 7.3954393770856515e-06, + "loss": 2.5248, + "mean_token_accuracy": 0.5030164092664092, + "step": 14049 + }, + { + "epoch": 2.6047460140897294, + "grad_norm": 10.6875, + "learning_rate": 7.395253985910271e-06, + "loss": 3.261, + "mean_token_accuracy": 0.4514126047811239, + "step": 14050 + }, + { + "epoch": 2.604931405265109, + "grad_norm": 10.1328125, + "learning_rate": 7.395068594734891e-06, + "loss": 3.1654, + "mean_token_accuracy": 0.4627264988447039, + "step": 14051 + }, + { + "epoch": 2.6051167964404893, + "grad_norm": 8.0234375, + "learning_rate": 7.3948832035595104e-06, + "loss": 2.7106, + "mean_token_accuracy": 0.5234604105571847, + "step": 14052 + }, + { + "epoch": 2.6053021876158695, + "grad_norm": 8.2109375, + "learning_rate": 7.394697812384131e-06, + "loss": 2.7293, + "mean_token_accuracy": 0.5092974056341165, + "step": 14053 + }, + { + "epoch": 2.6054875787912497, + "grad_norm": 6.84375, + "learning_rate": 7.3945124212087514e-06, + "loss": 3.124, + "mean_token_accuracy": 0.44587706146926537, + "step": 14054 + }, + { + "epoch": 2.6056729699666294, + "grad_norm": 9.890625, + "learning_rate": 7.394327030033371e-06, + "loss": 3.0979, + "mean_token_accuracy": 0.4720161834120027, + "step": 14055 + }, + { + "epoch": 2.6058583611420096, + "grad_norm": 9.3671875, + "learning_rate": 7.3941416388579916e-06, + "loss": 2.7948, + "mean_token_accuracy": 0.4816664476278092, + "step": 14056 + }, + { + "epoch": 2.60604375231739, + "grad_norm": 7.7890625, + "learning_rate": 7.393956247682611e-06, + "loss": 2.7531, + "mean_token_accuracy": 0.5115388491383654, + "step": 14057 + }, + { + "epoch": 2.6062291434927696, + "grad_norm": 9.109375, + "learning_rate": 7.393770856507231e-06, + "loss": 2.9072, + "mean_token_accuracy": 0.48580668343514194, + "step": 14058 + }, + { + "epoch": 2.6064145346681498, + "grad_norm": 7.24609375, + "learning_rate": 7.3935854653318505e-06, + "loss": 2.9306, + "mean_token_accuracy": 0.47118816930137686, + "step": 14059 + }, + { + "epoch": 2.60659992584353, + "grad_norm": 8.859375, + "learning_rate": 7.39340007415647e-06, + "loss": 2.8998, + "mean_token_accuracy": 0.4931021389697507, + "step": 14060 + }, + { + "epoch": 2.60678531701891, + "grad_norm": 10.1796875, + "learning_rate": 7.393214682981091e-06, + "loss": 2.8502, + "mean_token_accuracy": 0.4877284030706022, + "step": 14061 + }, + { + "epoch": 2.60697070819429, + "grad_norm": 8.7890625, + "learning_rate": 7.393029291805711e-06, + "loss": 2.749, + "mean_token_accuracy": 0.4812680115273775, + "step": 14062 + }, + { + "epoch": 2.60715609936967, + "grad_norm": 6.31640625, + "learning_rate": 7.392843900630331e-06, + "loss": 3.3214, + "mean_token_accuracy": 0.44415243101182655, + "step": 14063 + }, + { + "epoch": 2.60734149054505, + "grad_norm": 13.8359375, + "learning_rate": 7.39265850945495e-06, + "loss": 2.5121, + "mean_token_accuracy": 0.4973715651135006, + "step": 14064 + }, + { + "epoch": 2.60752688172043, + "grad_norm": 9.2109375, + "learning_rate": 7.392473118279571e-06, + "loss": 3.4335, + "mean_token_accuracy": 0.43648365802608274, + "step": 14065 + }, + { + "epoch": 2.60771227289581, + "grad_norm": 7.5546875, + "learning_rate": 7.3922877271041905e-06, + "loss": 2.9448, + "mean_token_accuracy": 0.5109794353433252, + "step": 14066 + }, + { + "epoch": 2.6078976640711904, + "grad_norm": 8.2421875, + "learning_rate": 7.39210233592881e-06, + "loss": 2.8468, + "mean_token_accuracy": 0.4609544468546638, + "step": 14067 + }, + { + "epoch": 2.60808305524657, + "grad_norm": 7.1171875, + "learning_rate": 7.39191694475343e-06, + "loss": 3.2203, + "mean_token_accuracy": 0.4631093544137022, + "step": 14068 + }, + { + "epoch": 2.6082684464219503, + "grad_norm": 7.50390625, + "learning_rate": 7.3917315535780495e-06, + "loss": 2.4313, + "mean_token_accuracy": 0.540004638218924, + "step": 14069 + }, + { + "epoch": 2.6084538375973305, + "grad_norm": 7.33203125, + "learning_rate": 7.391546162402671e-06, + "loss": 2.9706, + "mean_token_accuracy": 0.4758477677482313, + "step": 14070 + }, + { + "epoch": 2.6086392287727103, + "grad_norm": 8.0, + "learning_rate": 7.3913607712272905e-06, + "loss": 2.6613, + "mean_token_accuracy": 0.515305490147478, + "step": 14071 + }, + { + "epoch": 2.6088246199480905, + "grad_norm": 6.21484375, + "learning_rate": 7.39117538005191e-06, + "loss": 2.756, + "mean_token_accuracy": 0.4924379915305505, + "step": 14072 + }, + { + "epoch": 2.6090100111234706, + "grad_norm": 7.37109375, + "learning_rate": 7.390989988876531e-06, + "loss": 2.9131, + "mean_token_accuracy": 0.5037259615384615, + "step": 14073 + }, + { + "epoch": 2.609195402298851, + "grad_norm": 8.921875, + "learning_rate": 7.39080459770115e-06, + "loss": 3.4159, + "mean_token_accuracy": 0.43231334149326806, + "step": 14074 + }, + { + "epoch": 2.6093807934742306, + "grad_norm": 7.32421875, + "learning_rate": 7.39061920652577e-06, + "loss": 3.4893, + "mean_token_accuracy": 0.43875636558563386, + "step": 14075 + }, + { + "epoch": 2.6095661846496108, + "grad_norm": 7.17578125, + "learning_rate": 7.3904338153503895e-06, + "loss": 3.0812, + "mean_token_accuracy": 0.45950238221281103, + "step": 14076 + }, + { + "epoch": 2.6097515758249905, + "grad_norm": 11.4296875, + "learning_rate": 7.390248424175009e-06, + "loss": 2.2841, + "mean_token_accuracy": 0.5572435050638486, + "step": 14077 + }, + { + "epoch": 2.6099369670003707, + "grad_norm": 9.390625, + "learning_rate": 7.3900630329996305e-06, + "loss": 2.4703, + "mean_token_accuracy": 0.4871969955616251, + "step": 14078 + }, + { + "epoch": 2.610122358175751, + "grad_norm": 9.4375, + "learning_rate": 7.38987764182425e-06, + "loss": 3.2738, + "mean_token_accuracy": 0.4497816593886463, + "step": 14079 + }, + { + "epoch": 2.610307749351131, + "grad_norm": 8.5, + "learning_rate": 7.38969225064887e-06, + "loss": 2.8264, + "mean_token_accuracy": 0.4925914726338071, + "step": 14080 + }, + { + "epoch": 2.610493140526511, + "grad_norm": 10.421875, + "learning_rate": 7.3895068594734894e-06, + "loss": 3.2832, + "mean_token_accuracy": 0.4251590289497598, + "step": 14081 + }, + { + "epoch": 2.610678531701891, + "grad_norm": 11.03125, + "learning_rate": 7.38932146829811e-06, + "loss": 2.1693, + "mean_token_accuracy": 0.5547879689658838, + "step": 14082 + }, + { + "epoch": 2.610863922877271, + "grad_norm": 11.1953125, + "learning_rate": 7.38913607712273e-06, + "loss": 2.8699, + "mean_token_accuracy": 0.48881880733944955, + "step": 14083 + }, + { + "epoch": 2.611049314052651, + "grad_norm": 8.75, + "learning_rate": 7.388950685947349e-06, + "loss": 2.6956, + "mean_token_accuracy": 0.48306852832611136, + "step": 14084 + }, + { + "epoch": 2.611234705228031, + "grad_norm": 9.3203125, + "learning_rate": 7.388765294771969e-06, + "loss": 2.465, + "mean_token_accuracy": 0.5628022172425994, + "step": 14085 + }, + { + "epoch": 2.6114200964034113, + "grad_norm": 9.28125, + "learning_rate": 7.38857990359659e-06, + "loss": 3.0678, + "mean_token_accuracy": 0.4595360824742268, + "step": 14086 + }, + { + "epoch": 2.6116054875787915, + "grad_norm": 9.21875, + "learning_rate": 7.38839451242121e-06, + "loss": 3.0063, + "mean_token_accuracy": 0.47842862153630306, + "step": 14087 + }, + { + "epoch": 2.6117908787541713, + "grad_norm": 8.9765625, + "learning_rate": 7.3882091212458295e-06, + "loss": 2.9347, + "mean_token_accuracy": 0.4759678597516435, + "step": 14088 + }, + { + "epoch": 2.6119762699295515, + "grad_norm": 10.640625, + "learning_rate": 7.388023730070449e-06, + "loss": 3.6015, + "mean_token_accuracy": 0.4211740579338631, + "step": 14089 + }, + { + "epoch": 2.612161661104931, + "grad_norm": 8.7109375, + "learning_rate": 7.387838338895069e-06, + "loss": 3.1343, + "mean_token_accuracy": 0.4515026517383618, + "step": 14090 + }, + { + "epoch": 2.6123470522803114, + "grad_norm": 10.09375, + "learning_rate": 7.387652947719689e-06, + "loss": 2.8525, + "mean_token_accuracy": 0.4834376630151278, + "step": 14091 + }, + { + "epoch": 2.6125324434556916, + "grad_norm": 9.6328125, + "learning_rate": 7.387467556544309e-06, + "loss": 2.4331, + "mean_token_accuracy": 0.5052425555710891, + "step": 14092 + }, + { + "epoch": 2.6127178346310718, + "grad_norm": 8.75, + "learning_rate": 7.3872821653689286e-06, + "loss": 2.7591, + "mean_token_accuracy": 0.5040108975329196, + "step": 14093 + }, + { + "epoch": 2.6129032258064515, + "grad_norm": 7.13671875, + "learning_rate": 7.38709677419355e-06, + "loss": 2.5322, + "mean_token_accuracy": 0.5232524552281917, + "step": 14094 + }, + { + "epoch": 2.6130886169818317, + "grad_norm": 8.953125, + "learning_rate": 7.3869113830181695e-06, + "loss": 2.4001, + "mean_token_accuracy": 0.5013999066728885, + "step": 14095 + }, + { + "epoch": 2.6132740081572114, + "grad_norm": 6.9921875, + "learning_rate": 7.386725991842789e-06, + "loss": 3.086, + "mean_token_accuracy": 0.4639605462822458, + "step": 14096 + }, + { + "epoch": 2.6134593993325916, + "grad_norm": 9.1171875, + "learning_rate": 7.386540600667409e-06, + "loss": 2.9732, + "mean_token_accuracy": 0.47910330066463896, + "step": 14097 + }, + { + "epoch": 2.613644790507972, + "grad_norm": 9.609375, + "learning_rate": 7.3863552094920285e-06, + "loss": 3.4957, + "mean_token_accuracy": 0.4449487744780184, + "step": 14098 + }, + { + "epoch": 2.613830181683352, + "grad_norm": 7.9921875, + "learning_rate": 7.386169818316649e-06, + "loss": 3.2253, + "mean_token_accuracy": 0.46657160963244615, + "step": 14099 + }, + { + "epoch": 2.6140155728587318, + "grad_norm": 12.3671875, + "learning_rate": 7.385984427141269e-06, + "loss": 2.1501, + "mean_token_accuracy": 0.5361332707648991, + "step": 14100 + }, + { + "epoch": 2.614200964034112, + "grad_norm": 9.9765625, + "learning_rate": 7.385799035965888e-06, + "loss": 2.635, + "mean_token_accuracy": 0.5020650490449148, + "step": 14101 + }, + { + "epoch": 2.614386355209492, + "grad_norm": 7.86328125, + "learning_rate": 7.385613644790509e-06, + "loss": 2.2437, + "mean_token_accuracy": 0.5814469596094097, + "step": 14102 + }, + { + "epoch": 2.614571746384872, + "grad_norm": 6.90625, + "learning_rate": 7.385428253615129e-06, + "loss": 3.1955, + "mean_token_accuracy": 0.457201646090535, + "step": 14103 + }, + { + "epoch": 2.614757137560252, + "grad_norm": 10.546875, + "learning_rate": 7.385242862439749e-06, + "loss": 3.0441, + "mean_token_accuracy": 0.46887694928580786, + "step": 14104 + }, + { + "epoch": 2.6149425287356323, + "grad_norm": 10.1953125, + "learning_rate": 7.3850574712643685e-06, + "loss": 2.9211, + "mean_token_accuracy": 0.4647692143694702, + "step": 14105 + }, + { + "epoch": 2.6151279199110125, + "grad_norm": 7.98828125, + "learning_rate": 7.384872080088988e-06, + "loss": 2.8921, + "mean_token_accuracy": 0.4710591941199633, + "step": 14106 + }, + { + "epoch": 2.615313311086392, + "grad_norm": 6.73046875, + "learning_rate": 7.384686688913608e-06, + "loss": 3.0889, + "mean_token_accuracy": 0.45594202898550723, + "step": 14107 + }, + { + "epoch": 2.6154987022617724, + "grad_norm": 10.453125, + "learning_rate": 7.384501297738228e-06, + "loss": 3.0201, + "mean_token_accuracy": 0.4977744807121662, + "step": 14108 + }, + { + "epoch": 2.615684093437152, + "grad_norm": 8.5390625, + "learning_rate": 7.384315906562848e-06, + "loss": 3.3037, + "mean_token_accuracy": 0.4407453416149068, + "step": 14109 + }, + { + "epoch": 2.6158694846125323, + "grad_norm": 14.578125, + "learning_rate": 7.384130515387468e-06, + "loss": 2.791, + "mean_token_accuracy": 0.47880299251870323, + "step": 14110 + }, + { + "epoch": 2.6160548757879125, + "grad_norm": 7.4765625, + "learning_rate": 7.383945124212089e-06, + "loss": 2.7112, + "mean_token_accuracy": 0.5003955249180698, + "step": 14111 + }, + { + "epoch": 2.6162402669632927, + "grad_norm": 9.3125, + "learning_rate": 7.383759733036709e-06, + "loss": 3.2263, + "mean_token_accuracy": 0.46655400647796086, + "step": 14112 + }, + { + "epoch": 2.6164256581386724, + "grad_norm": 6.57421875, + "learning_rate": 7.383574341861328e-06, + "loss": 2.8133, + "mean_token_accuracy": 0.4804353816478369, + "step": 14113 + }, + { + "epoch": 2.6166110493140526, + "grad_norm": 7.140625, + "learning_rate": 7.383388950685948e-06, + "loss": 2.8674, + "mean_token_accuracy": 0.47397744258730806, + "step": 14114 + }, + { + "epoch": 2.616796440489433, + "grad_norm": 8.25, + "learning_rate": 7.3832035595105675e-06, + "loss": 3.6098, + "mean_token_accuracy": 0.4327720207253886, + "step": 14115 + }, + { + "epoch": 2.6169818316648126, + "grad_norm": 13.125, + "learning_rate": 7.383018168335188e-06, + "loss": 2.9063, + "mean_token_accuracy": 0.5001217730150999, + "step": 14116 + }, + { + "epoch": 2.6171672228401928, + "grad_norm": 7.3203125, + "learning_rate": 7.382832777159808e-06, + "loss": 2.3687, + "mean_token_accuracy": 0.5429168532616808, + "step": 14117 + }, + { + "epoch": 2.617352614015573, + "grad_norm": 8.5078125, + "learning_rate": 7.382647385984427e-06, + "loss": 3.1127, + "mean_token_accuracy": 0.46412920996619506, + "step": 14118 + }, + { + "epoch": 2.617538005190953, + "grad_norm": 9.546875, + "learning_rate": 7.382461994809048e-06, + "loss": 2.7468, + "mean_token_accuracy": 0.4847380982288657, + "step": 14119 + }, + { + "epoch": 2.617723396366333, + "grad_norm": 7.12890625, + "learning_rate": 7.382276603633668e-06, + "loss": 2.9983, + "mean_token_accuracy": 0.46207232436381346, + "step": 14120 + }, + { + "epoch": 2.617908787541713, + "grad_norm": 6.93359375, + "learning_rate": 7.382091212458288e-06, + "loss": 3.0181, + "mean_token_accuracy": 0.47529831604773054, + "step": 14121 + }, + { + "epoch": 2.618094178717093, + "grad_norm": 14.5390625, + "learning_rate": 7.3819058212829076e-06, + "loss": 2.5823, + "mean_token_accuracy": 0.5047941342357586, + "step": 14122 + }, + { + "epoch": 2.618279569892473, + "grad_norm": 9.6171875, + "learning_rate": 7.381720430107527e-06, + "loss": 3.4154, + "mean_token_accuracy": 0.4461909132629647, + "step": 14123 + }, + { + "epoch": 2.618464961067853, + "grad_norm": 7.53125, + "learning_rate": 7.381535038932147e-06, + "loss": 2.7958, + "mean_token_accuracy": 0.4945054945054945, + "step": 14124 + }, + { + "epoch": 2.6186503522432334, + "grad_norm": 8.6640625, + "learning_rate": 7.381349647756767e-06, + "loss": 2.6396, + "mean_token_accuracy": 0.49286304117022534, + "step": 14125 + }, + { + "epoch": 2.618835743418613, + "grad_norm": 6.7578125, + "learning_rate": 7.381164256581387e-06, + "loss": 3.0076, + "mean_token_accuracy": 0.4779585410653372, + "step": 14126 + }, + { + "epoch": 2.6190211345939933, + "grad_norm": 7.390625, + "learning_rate": 7.3809788654060075e-06, + "loss": 3.3571, + "mean_token_accuracy": 0.44035304166271233, + "step": 14127 + }, + { + "epoch": 2.6192065257693735, + "grad_norm": 7.13671875, + "learning_rate": 7.380793474230627e-06, + "loss": 2.7956, + "mean_token_accuracy": 0.4814137023904838, + "step": 14128 + }, + { + "epoch": 2.6193919169447533, + "grad_norm": 7.36328125, + "learning_rate": 7.380608083055248e-06, + "loss": 2.483, + "mean_token_accuracy": 0.5256426491072559, + "step": 14129 + }, + { + "epoch": 2.6195773081201335, + "grad_norm": 6.87109375, + "learning_rate": 7.380422691879867e-06, + "loss": 2.672, + "mean_token_accuracy": 0.5210253456221198, + "step": 14130 + }, + { + "epoch": 2.6197626992955136, + "grad_norm": 8.8359375, + "learning_rate": 7.380237300704487e-06, + "loss": 2.2239, + "mean_token_accuracy": 0.5228364156534178, + "step": 14131 + }, + { + "epoch": 2.619948090470894, + "grad_norm": 9.5078125, + "learning_rate": 7.3800519095291066e-06, + "loss": 2.4832, + "mean_token_accuracy": 0.5028207724576884, + "step": 14132 + }, + { + "epoch": 2.6201334816462736, + "grad_norm": 7.453125, + "learning_rate": 7.379866518353726e-06, + "loss": 2.8541, + "mean_token_accuracy": 0.4826792963464141, + "step": 14133 + }, + { + "epoch": 2.6203188728216538, + "grad_norm": 10.4296875, + "learning_rate": 7.379681127178347e-06, + "loss": 2.5243, + "mean_token_accuracy": 0.5446159007044616, + "step": 14134 + }, + { + "epoch": 2.6205042639970335, + "grad_norm": 7.94140625, + "learning_rate": 7.379495736002967e-06, + "loss": 3.0413, + "mean_token_accuracy": 0.4830739992873263, + "step": 14135 + }, + { + "epoch": 2.6206896551724137, + "grad_norm": 7.69921875, + "learning_rate": 7.379310344827587e-06, + "loss": 2.5266, + "mean_token_accuracy": 0.5210634472129463, + "step": 14136 + }, + { + "epoch": 2.620875046347794, + "grad_norm": 9.78125, + "learning_rate": 7.379124953652207e-06, + "loss": 3.4824, + "mean_token_accuracy": 0.4475969889982629, + "step": 14137 + }, + { + "epoch": 2.621060437523174, + "grad_norm": 7.97265625, + "learning_rate": 7.378939562476827e-06, + "loss": 2.5992, + "mean_token_accuracy": 0.5103920907763173, + "step": 14138 + }, + { + "epoch": 2.621245828698554, + "grad_norm": 8.3046875, + "learning_rate": 7.378754171301447e-06, + "loss": 3.7747, + "mean_token_accuracy": 0.42805100182149364, + "step": 14139 + }, + { + "epoch": 2.621431219873934, + "grad_norm": 7.296875, + "learning_rate": 7.378568780126066e-06, + "loss": 3.4326, + "mean_token_accuracy": 0.4532384548358989, + "step": 14140 + }, + { + "epoch": 2.621616611049314, + "grad_norm": 7.5234375, + "learning_rate": 7.378383388950686e-06, + "loss": 3.032, + "mean_token_accuracy": 0.4861598988337783, + "step": 14141 + }, + { + "epoch": 2.621802002224694, + "grad_norm": 8.9296875, + "learning_rate": 7.378197997775306e-06, + "loss": 2.2753, + "mean_token_accuracy": 0.5338488994646045, + "step": 14142 + }, + { + "epoch": 2.621987393400074, + "grad_norm": 7.21875, + "learning_rate": 7.378012606599927e-06, + "loss": 2.4876, + "mean_token_accuracy": 0.5226213326021387, + "step": 14143 + }, + { + "epoch": 2.6221727845754543, + "grad_norm": 8.359375, + "learning_rate": 7.3778272154245465e-06, + "loss": 3.0206, + "mean_token_accuracy": 0.45583472920156337, + "step": 14144 + }, + { + "epoch": 2.6223581757508345, + "grad_norm": 8.0234375, + "learning_rate": 7.377641824249166e-06, + "loss": 3.1258, + "mean_token_accuracy": 0.4807268007787151, + "step": 14145 + }, + { + "epoch": 2.6225435669262143, + "grad_norm": 7.26171875, + "learning_rate": 7.377456433073787e-06, + "loss": 3.2833, + "mean_token_accuracy": 0.4493281712593942, + "step": 14146 + }, + { + "epoch": 2.6227289581015945, + "grad_norm": 7.6328125, + "learning_rate": 7.377271041898406e-06, + "loss": 3.1788, + "mean_token_accuracy": 0.49473420260782347, + "step": 14147 + }, + { + "epoch": 2.622914349276974, + "grad_norm": 7.03125, + "learning_rate": 7.377085650723026e-06, + "loss": 2.8422, + "mean_token_accuracy": 0.4773832326506557, + "step": 14148 + }, + { + "epoch": 2.6230997404523544, + "grad_norm": 8.3203125, + "learning_rate": 7.376900259547646e-06, + "loss": 3.5837, + "mean_token_accuracy": 0.443391757460919, + "step": 14149 + }, + { + "epoch": 2.6232851316277346, + "grad_norm": 6.95703125, + "learning_rate": 7.376714868372265e-06, + "loss": 3.4642, + "mean_token_accuracy": 0.4074117785908179, + "step": 14150 + }, + { + "epoch": 2.6234705228031148, + "grad_norm": 10.9609375, + "learning_rate": 7.3765294771968866e-06, + "loss": 2.7694, + "mean_token_accuracy": 0.4804005112910098, + "step": 14151 + }, + { + "epoch": 2.6236559139784945, + "grad_norm": 7.53515625, + "learning_rate": 7.376344086021506e-06, + "loss": 3.2535, + "mean_token_accuracy": 0.4327240416349327, + "step": 14152 + }, + { + "epoch": 2.6238413051538747, + "grad_norm": 7.03515625, + "learning_rate": 7.376158694846126e-06, + "loss": 2.7118, + "mean_token_accuracy": 0.485350897115603, + "step": 14153 + }, + { + "epoch": 2.624026696329255, + "grad_norm": 7.578125, + "learning_rate": 7.375973303670746e-06, + "loss": 3.4201, + "mean_token_accuracy": 0.44834893166166345, + "step": 14154 + }, + { + "epoch": 2.6242120875046346, + "grad_norm": 11.5703125, + "learning_rate": 7.375787912495366e-06, + "loss": 2.9165, + "mean_token_accuracy": 0.48405880106911037, + "step": 14155 + }, + { + "epoch": 2.624397478680015, + "grad_norm": 7.60546875, + "learning_rate": 7.375602521319986e-06, + "loss": 2.5367, + "mean_token_accuracy": 0.5033240997229916, + "step": 14156 + }, + { + "epoch": 2.624582869855395, + "grad_norm": 9.5546875, + "learning_rate": 7.375417130144605e-06, + "loss": 4.0943, + "mean_token_accuracy": 0.4101567920077702, + "step": 14157 + }, + { + "epoch": 2.624768261030775, + "grad_norm": 9.1640625, + "learning_rate": 7.375231738969225e-06, + "loss": 2.7376, + "mean_token_accuracy": 0.5143303064699205, + "step": 14158 + }, + { + "epoch": 2.624953652206155, + "grad_norm": 12.484375, + "learning_rate": 7.375046347793846e-06, + "loss": 2.9806, + "mean_token_accuracy": 0.4590370955011839, + "step": 14159 + }, + { + "epoch": 2.625139043381535, + "grad_norm": 10.953125, + "learning_rate": 7.374860956618466e-06, + "loss": 2.5853, + "mean_token_accuracy": 0.508390918065153, + "step": 14160 + }, + { + "epoch": 2.625324434556915, + "grad_norm": 8.625, + "learning_rate": 7.3746755654430856e-06, + "loss": 3.1424, + "mean_token_accuracy": 0.4803276487176044, + "step": 14161 + }, + { + "epoch": 2.625509825732295, + "grad_norm": 13.46875, + "learning_rate": 7.374490174267705e-06, + "loss": 3.0, + "mean_token_accuracy": 0.47415692255549907, + "step": 14162 + }, + { + "epoch": 2.6256952169076753, + "grad_norm": 10.234375, + "learning_rate": 7.374304783092326e-06, + "loss": 2.809, + "mean_token_accuracy": 0.501952819871895, + "step": 14163 + }, + { + "epoch": 2.6258806080830555, + "grad_norm": 8.3828125, + "learning_rate": 7.374119391916945e-06, + "loss": 3.6695, + "mean_token_accuracy": 0.44614612101431084, + "step": 14164 + }, + { + "epoch": 2.626065999258435, + "grad_norm": 10.3359375, + "learning_rate": 7.373934000741565e-06, + "loss": 3.1726, + "mean_token_accuracy": 0.4530751977334435, + "step": 14165 + }, + { + "epoch": 2.6262513904338154, + "grad_norm": 10.40625, + "learning_rate": 7.373748609566185e-06, + "loss": 2.3665, + "mean_token_accuracy": 0.5176578856551248, + "step": 14166 + }, + { + "epoch": 2.626436781609195, + "grad_norm": 7.8125, + "learning_rate": 7.373563218390806e-06, + "loss": 2.9746, + "mean_token_accuracy": 0.4461044401005306, + "step": 14167 + }, + { + "epoch": 2.6266221727845753, + "grad_norm": 11.2421875, + "learning_rate": 7.373377827215426e-06, + "loss": 2.6354, + "mean_token_accuracy": 0.5139511981617245, + "step": 14168 + }, + { + "epoch": 2.6268075639599555, + "grad_norm": 10.1796875, + "learning_rate": 7.373192436040045e-06, + "loss": 2.5387, + "mean_token_accuracy": 0.5271895285729488, + "step": 14169 + }, + { + "epoch": 2.6269929551353357, + "grad_norm": 7.05078125, + "learning_rate": 7.373007044864665e-06, + "loss": 2.6464, + "mean_token_accuracy": 0.49543446244477174, + "step": 14170 + }, + { + "epoch": 2.6271783463107155, + "grad_norm": 7.015625, + "learning_rate": 7.3728216536892845e-06, + "loss": 3.1725, + "mean_token_accuracy": 0.45266818824040866, + "step": 14171 + }, + { + "epoch": 2.6273637374860956, + "grad_norm": 8.9296875, + "learning_rate": 7.372636262513905e-06, + "loss": 3.0672, + "mean_token_accuracy": 0.4473927392739274, + "step": 14172 + }, + { + "epoch": 2.627549128661476, + "grad_norm": 8.5859375, + "learning_rate": 7.372450871338525e-06, + "loss": 3.082, + "mean_token_accuracy": 0.49716171617161714, + "step": 14173 + }, + { + "epoch": 2.6277345198368556, + "grad_norm": 6.37890625, + "learning_rate": 7.372265480163144e-06, + "loss": 2.8405, + "mean_token_accuracy": 0.5161555875694588, + "step": 14174 + }, + { + "epoch": 2.6279199110122358, + "grad_norm": 7.0078125, + "learning_rate": 7.372080088987766e-06, + "loss": 2.6976, + "mean_token_accuracy": 0.5046668812359189, + "step": 14175 + }, + { + "epoch": 2.628105302187616, + "grad_norm": 8.3828125, + "learning_rate": 7.371894697812385e-06, + "loss": 3.4402, + "mean_token_accuracy": 0.4577408103031929, + "step": 14176 + }, + { + "epoch": 2.628290693362996, + "grad_norm": 12.5078125, + "learning_rate": 7.371709306637005e-06, + "loss": 3.0743, + "mean_token_accuracy": 0.5206190343805023, + "step": 14177 + }, + { + "epoch": 2.628476084538376, + "grad_norm": 6.7265625, + "learning_rate": 7.371523915461625e-06, + "loss": 3.1043, + "mean_token_accuracy": 0.4465770953294946, + "step": 14178 + }, + { + "epoch": 2.628661475713756, + "grad_norm": 6.87890625, + "learning_rate": 7.371338524286244e-06, + "loss": 2.5344, + "mean_token_accuracy": 0.5078745281790967, + "step": 14179 + }, + { + "epoch": 2.628846866889136, + "grad_norm": 7.4453125, + "learning_rate": 7.371153133110865e-06, + "loss": 3.0331, + "mean_token_accuracy": 0.44755434782608694, + "step": 14180 + }, + { + "epoch": 2.629032258064516, + "grad_norm": 7.66796875, + "learning_rate": 7.370967741935484e-06, + "loss": 3.2128, + "mean_token_accuracy": 0.4474716202270382, + "step": 14181 + }, + { + "epoch": 2.629217649239896, + "grad_norm": 7.6328125, + "learning_rate": 7.370782350760104e-06, + "loss": 3.011, + "mean_token_accuracy": 0.4665823076038192, + "step": 14182 + }, + { + "epoch": 2.6294030404152764, + "grad_norm": 6.76171875, + "learning_rate": 7.3705969595847245e-06, + "loss": 2.8592, + "mean_token_accuracy": 0.4823046134400674, + "step": 14183 + }, + { + "epoch": 2.629588431590656, + "grad_norm": 7.72265625, + "learning_rate": 7.370411568409345e-06, + "loss": 3.0971, + "mean_token_accuracy": 0.44139886578449905, + "step": 14184 + }, + { + "epoch": 2.6297738227660363, + "grad_norm": 12.2890625, + "learning_rate": 7.370226177233965e-06, + "loss": 2.4161, + "mean_token_accuracy": 0.5474780701754386, + "step": 14185 + }, + { + "epoch": 2.6299592139414165, + "grad_norm": 8.046875, + "learning_rate": 7.370040786058584e-06, + "loss": 2.9119, + "mean_token_accuracy": 0.4832852536908196, + "step": 14186 + }, + { + "epoch": 2.6301446051167963, + "grad_norm": 10.8671875, + "learning_rate": 7.369855394883204e-06, + "loss": 3.7396, + "mean_token_accuracy": 0.4427446903312045, + "step": 14187 + }, + { + "epoch": 2.6303299962921765, + "grad_norm": 11.3671875, + "learning_rate": 7.3696700037078236e-06, + "loss": 2.7694, + "mean_token_accuracy": 0.478134110787172, + "step": 14188 + }, + { + "epoch": 2.6305153874675566, + "grad_norm": 7.8828125, + "learning_rate": 7.369484612532444e-06, + "loss": 2.5305, + "mean_token_accuracy": 0.5245682315738264, + "step": 14189 + }, + { + "epoch": 2.630700778642937, + "grad_norm": 9.640625, + "learning_rate": 7.369299221357064e-06, + "loss": 3.1076, + "mean_token_accuracy": 0.4753491703473797, + "step": 14190 + }, + { + "epoch": 2.6308861698183166, + "grad_norm": 10.2734375, + "learning_rate": 7.369113830181684e-06, + "loss": 3.0144, + "mean_token_accuracy": 0.4603174603174603, + "step": 14191 + }, + { + "epoch": 2.6310715609936968, + "grad_norm": 9.0, + "learning_rate": 7.368928439006305e-06, + "loss": 2.7946, + "mean_token_accuracy": 0.48688729316266, + "step": 14192 + }, + { + "epoch": 2.6312569521690765, + "grad_norm": 10.6015625, + "learning_rate": 7.368743047830924e-06, + "loss": 2.7617, + "mean_token_accuracy": 0.48682688600436963, + "step": 14193 + }, + { + "epoch": 2.6314423433444567, + "grad_norm": 8.578125, + "learning_rate": 7.368557656655544e-06, + "loss": 2.6715, + "mean_token_accuracy": 0.48141957052848144, + "step": 14194 + }, + { + "epoch": 2.631627734519837, + "grad_norm": 7.48046875, + "learning_rate": 7.368372265480164e-06, + "loss": 2.9129, + "mean_token_accuracy": 0.46754134208008746, + "step": 14195 + }, + { + "epoch": 2.631813125695217, + "grad_norm": 7.58984375, + "learning_rate": 7.368186874304783e-06, + "loss": 3.2122, + "mean_token_accuracy": 0.4572068707991038, + "step": 14196 + }, + { + "epoch": 2.631998516870597, + "grad_norm": 7.046875, + "learning_rate": 7.368001483129403e-06, + "loss": 2.9541, + "mean_token_accuracy": 0.47368421052631576, + "step": 14197 + }, + { + "epoch": 2.632183908045977, + "grad_norm": 9.3125, + "learning_rate": 7.367816091954023e-06, + "loss": 4.3136, + "mean_token_accuracy": 0.4010071561091969, + "step": 14198 + }, + { + "epoch": 2.632369299221357, + "grad_norm": 9.1796875, + "learning_rate": 7.367630700778644e-06, + "loss": 2.7876, + "mean_token_accuracy": 0.503665200586432, + "step": 14199 + }, + { + "epoch": 2.632554690396737, + "grad_norm": 8.359375, + "learning_rate": 7.3674453096032635e-06, + "loss": 3.2459, + "mean_token_accuracy": 0.4617397998460354, + "step": 14200 + }, + { + "epoch": 2.632740081572117, + "grad_norm": 7.1015625, + "learning_rate": 7.367259918427884e-06, + "loss": 3.0194, + "mean_token_accuracy": 0.463554667998003, + "step": 14201 + }, + { + "epoch": 2.6329254727474973, + "grad_norm": 8.640625, + "learning_rate": 7.367074527252504e-06, + "loss": 2.708, + "mean_token_accuracy": 0.5133768135232264, + "step": 14202 + }, + { + "epoch": 2.6331108639228775, + "grad_norm": 7.33203125, + "learning_rate": 7.366889136077123e-06, + "loss": 2.788, + "mean_token_accuracy": 0.48757342973339357, + "step": 14203 + }, + { + "epoch": 2.6332962550982573, + "grad_norm": 9.3046875, + "learning_rate": 7.366703744901743e-06, + "loss": 3.731, + "mean_token_accuracy": 0.428777166797591, + "step": 14204 + }, + { + "epoch": 2.6334816462736375, + "grad_norm": 8.0234375, + "learning_rate": 7.366518353726363e-06, + "loss": 3.1002, + "mean_token_accuracy": 0.4542433891734736, + "step": 14205 + }, + { + "epoch": 2.633667037449017, + "grad_norm": 6.9140625, + "learning_rate": 7.366332962550983e-06, + "loss": 3.0576, + "mean_token_accuracy": 0.4557739557739558, + "step": 14206 + }, + { + "epoch": 2.6338524286243974, + "grad_norm": 6.69921875, + "learning_rate": 7.366147571375604e-06, + "loss": 2.7178, + "mean_token_accuracy": 0.48120394654677157, + "step": 14207 + }, + { + "epoch": 2.6340378197997776, + "grad_norm": 6.73046875, + "learning_rate": 7.365962180200223e-06, + "loss": 2.5909, + "mean_token_accuracy": 0.5094905094905094, + "step": 14208 + }, + { + "epoch": 2.6342232109751578, + "grad_norm": 7.08984375, + "learning_rate": 7.365776789024843e-06, + "loss": 3.205, + "mean_token_accuracy": 0.4548702811868883, + "step": 14209 + }, + { + "epoch": 2.6344086021505375, + "grad_norm": 7.65625, + "learning_rate": 7.365591397849463e-06, + "loss": 2.9416, + "mean_token_accuracy": 0.4600360576923077, + "step": 14210 + }, + { + "epoch": 2.6345939933259177, + "grad_norm": 6.58984375, + "learning_rate": 7.365406006674083e-06, + "loss": 2.8407, + "mean_token_accuracy": 0.47643459566727303, + "step": 14211 + }, + { + "epoch": 2.634779384501298, + "grad_norm": 8.2890625, + "learning_rate": 7.365220615498703e-06, + "loss": 2.7153, + "mean_token_accuracy": 0.5127450980392156, + "step": 14212 + }, + { + "epoch": 2.6349647756766776, + "grad_norm": 8.3671875, + "learning_rate": 7.365035224323322e-06, + "loss": 2.1452, + "mean_token_accuracy": 0.5376921928646067, + "step": 14213 + }, + { + "epoch": 2.635150166852058, + "grad_norm": 6.765625, + "learning_rate": 7.364849833147942e-06, + "loss": 2.4932, + "mean_token_accuracy": 0.5292682926829269, + "step": 14214 + }, + { + "epoch": 2.635335558027438, + "grad_norm": 6.91796875, + "learning_rate": 7.364664441972563e-06, + "loss": 3.4865, + "mean_token_accuracy": 0.4329971181556196, + "step": 14215 + }, + { + "epoch": 2.635520949202818, + "grad_norm": 7.69140625, + "learning_rate": 7.364479050797183e-06, + "loss": 3.0733, + "mean_token_accuracy": 0.49531893646236425, + "step": 14216 + }, + { + "epoch": 2.635706340378198, + "grad_norm": 7.93359375, + "learning_rate": 7.3642936596218026e-06, + "loss": 2.7227, + "mean_token_accuracy": 0.49695307739183425, + "step": 14217 + }, + { + "epoch": 2.635891731553578, + "grad_norm": 6.63671875, + "learning_rate": 7.364108268446423e-06, + "loss": 2.395, + "mean_token_accuracy": 0.5192061459667093, + "step": 14218 + }, + { + "epoch": 2.636077122728958, + "grad_norm": 7.32421875, + "learning_rate": 7.363922877271043e-06, + "loss": 2.3654, + "mean_token_accuracy": 0.5334760012228676, + "step": 14219 + }, + { + "epoch": 2.636262513904338, + "grad_norm": 9.125, + "learning_rate": 7.363737486095662e-06, + "loss": 2.8788, + "mean_token_accuracy": 0.47878128400435255, + "step": 14220 + }, + { + "epoch": 2.6364479050797183, + "grad_norm": 7.75390625, + "learning_rate": 7.363552094920282e-06, + "loss": 2.9765, + "mean_token_accuracy": 0.4730473047304731, + "step": 14221 + }, + { + "epoch": 2.6366332962550985, + "grad_norm": 9.390625, + "learning_rate": 7.363366703744902e-06, + "loss": 3.1158, + "mean_token_accuracy": 0.4769910243624448, + "step": 14222 + }, + { + "epoch": 2.636818687430478, + "grad_norm": 8.796875, + "learning_rate": 7.363181312569523e-06, + "loss": 2.7525, + "mean_token_accuracy": 0.49052229646802126, + "step": 14223 + }, + { + "epoch": 2.6370040786058584, + "grad_norm": 7.0703125, + "learning_rate": 7.362995921394143e-06, + "loss": 2.7399, + "mean_token_accuracy": 0.507840038896317, + "step": 14224 + }, + { + "epoch": 2.637189469781238, + "grad_norm": 6.20703125, + "learning_rate": 7.362810530218762e-06, + "loss": 2.2014, + "mean_token_accuracy": 0.5724515166583789, + "step": 14225 + }, + { + "epoch": 2.6373748609566183, + "grad_norm": 8.5, + "learning_rate": 7.362625139043382e-06, + "loss": 3.4631, + "mean_token_accuracy": 0.4371327849588719, + "step": 14226 + }, + { + "epoch": 2.6375602521319985, + "grad_norm": 7.25390625, + "learning_rate": 7.362439747868002e-06, + "loss": 2.9858, + "mean_token_accuracy": 0.4936321134934709, + "step": 14227 + }, + { + "epoch": 2.6377456433073787, + "grad_norm": 7.41796875, + "learning_rate": 7.362254356692622e-06, + "loss": 2.8484, + "mean_token_accuracy": 0.4996793638578941, + "step": 14228 + }, + { + "epoch": 2.637931034482759, + "grad_norm": 8.796875, + "learning_rate": 7.362068965517242e-06, + "loss": 2.6857, + "mean_token_accuracy": 0.4891135303265941, + "step": 14229 + }, + { + "epoch": 2.6381164256581386, + "grad_norm": 6.4453125, + "learning_rate": 7.361883574341861e-06, + "loss": 2.8567, + "mean_token_accuracy": 0.4705810397553517, + "step": 14230 + }, + { + "epoch": 2.638301816833519, + "grad_norm": 6.64453125, + "learning_rate": 7.361698183166481e-06, + "loss": 2.8533, + "mean_token_accuracy": 0.49150458305387884, + "step": 14231 + }, + { + "epoch": 2.6384872080088986, + "grad_norm": 7.90625, + "learning_rate": 7.361512791991102e-06, + "loss": 2.9313, + "mean_token_accuracy": 0.4723267060720043, + "step": 14232 + }, + { + "epoch": 2.6386725991842788, + "grad_norm": 6.97265625, + "learning_rate": 7.361327400815722e-06, + "loss": 2.5938, + "mean_token_accuracy": 0.5021057557323351, + "step": 14233 + }, + { + "epoch": 2.638857990359659, + "grad_norm": 8.671875, + "learning_rate": 7.361142009640342e-06, + "loss": 2.8646, + "mean_token_accuracy": 0.48496748067247514, + "step": 14234 + }, + { + "epoch": 2.639043381535039, + "grad_norm": 7.29296875, + "learning_rate": 7.360956618464962e-06, + "loss": 3.2904, + "mean_token_accuracy": 0.4434846088331914, + "step": 14235 + }, + { + "epoch": 2.639228772710419, + "grad_norm": 8.3671875, + "learning_rate": 7.360771227289582e-06, + "loss": 2.7182, + "mean_token_accuracy": 0.4736575481256332, + "step": 14236 + }, + { + "epoch": 2.639414163885799, + "grad_norm": 7.68359375, + "learning_rate": 7.360585836114201e-06, + "loss": 3.3077, + "mean_token_accuracy": 0.458537845517954, + "step": 14237 + }, + { + "epoch": 2.639599555061179, + "grad_norm": 8.234375, + "learning_rate": 7.360400444938821e-06, + "loss": 3.8085, + "mean_token_accuracy": 0.43091693335188536, + "step": 14238 + }, + { + "epoch": 2.639784946236559, + "grad_norm": 7.7734375, + "learning_rate": 7.360215053763441e-06, + "loss": 3.3621, + "mean_token_accuracy": 0.4654111275233875, + "step": 14239 + }, + { + "epoch": 2.639970337411939, + "grad_norm": 7.140625, + "learning_rate": 7.360029662588062e-06, + "loss": 2.7482, + "mean_token_accuracy": 0.4964915016372992, + "step": 14240 + }, + { + "epoch": 2.6401557285873194, + "grad_norm": 7.4375, + "learning_rate": 7.359844271412682e-06, + "loss": 3.9045, + "mean_token_accuracy": 0.4398918918918919, + "step": 14241 + }, + { + "epoch": 2.640341119762699, + "grad_norm": 7.8046875, + "learning_rate": 7.359658880237301e-06, + "loss": 3.1542, + "mean_token_accuracy": 0.47436245252306025, + "step": 14242 + }, + { + "epoch": 2.6405265109380793, + "grad_norm": 7.09375, + "learning_rate": 7.359473489061921e-06, + "loss": 2.998, + "mean_token_accuracy": 0.4581550802139037, + "step": 14243 + }, + { + "epoch": 2.6407119021134595, + "grad_norm": 7.24609375, + "learning_rate": 7.3592880978865414e-06, + "loss": 3.3017, + "mean_token_accuracy": 0.4505801761498672, + "step": 14244 + }, + { + "epoch": 2.6408972932888393, + "grad_norm": 6.734375, + "learning_rate": 7.359102706711161e-06, + "loss": 3.1089, + "mean_token_accuracy": 0.45920985120574653, + "step": 14245 + }, + { + "epoch": 2.6410826844642195, + "grad_norm": 8.828125, + "learning_rate": 7.358917315535781e-06, + "loss": 3.0563, + "mean_token_accuracy": 0.459630001434103, + "step": 14246 + }, + { + "epoch": 2.6412680756395996, + "grad_norm": 6.7578125, + "learning_rate": 7.3587319243604e-06, + "loss": 3.0398, + "mean_token_accuracy": 0.5004841208365608, + "step": 14247 + }, + { + "epoch": 2.64145346681498, + "grad_norm": 8.2421875, + "learning_rate": 7.358546533185022e-06, + "loss": 3.1707, + "mean_token_accuracy": 0.4669477488330071, + "step": 14248 + }, + { + "epoch": 2.6416388579903596, + "grad_norm": 9.171875, + "learning_rate": 7.358361142009641e-06, + "loss": 3.4026, + "mean_token_accuracy": 0.4428493746601414, + "step": 14249 + }, + { + "epoch": 2.6418242491657398, + "grad_norm": 6.859375, + "learning_rate": 7.358175750834261e-06, + "loss": 2.7039, + "mean_token_accuracy": 0.4921121151784347, + "step": 14250 + }, + { + "epoch": 2.6420096403411195, + "grad_norm": 8.328125, + "learning_rate": 7.357990359658881e-06, + "loss": 2.615, + "mean_token_accuracy": 0.5124664365170695, + "step": 14251 + }, + { + "epoch": 2.6421950315164997, + "grad_norm": 10.90625, + "learning_rate": 7.3578049684835e-06, + "loss": 3.6903, + "mean_token_accuracy": 0.4239174697977467, + "step": 14252 + }, + { + "epoch": 2.64238042269188, + "grad_norm": 11.40625, + "learning_rate": 7.357619577308121e-06, + "loss": 2.4858, + "mean_token_accuracy": 0.5097087378640777, + "step": 14253 + }, + { + "epoch": 2.64256581386726, + "grad_norm": 7.890625, + "learning_rate": 7.35743418613274e-06, + "loss": 3.1588, + "mean_token_accuracy": 0.46310979707028627, + "step": 14254 + }, + { + "epoch": 2.64275120504264, + "grad_norm": 10.734375, + "learning_rate": 7.35724879495736e-06, + "loss": 2.6914, + "mean_token_accuracy": 0.4879037209873511, + "step": 14255 + }, + { + "epoch": 2.64293659621802, + "grad_norm": 7.16015625, + "learning_rate": 7.357063403781981e-06, + "loss": 2.3388, + "mean_token_accuracy": 0.5301734976112648, + "step": 14256 + }, + { + "epoch": 2.6431219873934, + "grad_norm": 6.62890625, + "learning_rate": 7.356878012606601e-06, + "loss": 3.2898, + "mean_token_accuracy": 0.446204259967231, + "step": 14257 + }, + { + "epoch": 2.64330737856878, + "grad_norm": 6.67578125, + "learning_rate": 7.356692621431221e-06, + "loss": 2.6662, + "mean_token_accuracy": 0.4984029484029484, + "step": 14258 + }, + { + "epoch": 2.64349276974416, + "grad_norm": 14.09375, + "learning_rate": 7.35650723025584e-06, + "loss": 2.6272, + "mean_token_accuracy": 0.48154381240884103, + "step": 14259 + }, + { + "epoch": 2.6436781609195403, + "grad_norm": 7.1875, + "learning_rate": 7.35632183908046e-06, + "loss": 3.0848, + "mean_token_accuracy": 0.463245492371706, + "step": 14260 + }, + { + "epoch": 2.6438635520949205, + "grad_norm": 7.7265625, + "learning_rate": 7.3561364479050805e-06, + "loss": 3.1732, + "mean_token_accuracy": 0.4572441293752769, + "step": 14261 + }, + { + "epoch": 2.6440489432703003, + "grad_norm": 8.3046875, + "learning_rate": 7.3559510567297e-06, + "loss": 3.6731, + "mean_token_accuracy": 0.43567378979502835, + "step": 14262 + }, + { + "epoch": 2.6442343344456805, + "grad_norm": 8.3671875, + "learning_rate": 7.35576566555432e-06, + "loss": 2.9744, + "mean_token_accuracy": 0.4995707101680363, + "step": 14263 + }, + { + "epoch": 2.64441972562106, + "grad_norm": 7.1328125, + "learning_rate": 7.35558027437894e-06, + "loss": 3.1067, + "mean_token_accuracy": 0.46995086304649114, + "step": 14264 + }, + { + "epoch": 2.6446051167964404, + "grad_norm": 8.28125, + "learning_rate": 7.355394883203561e-06, + "loss": 2.5237, + "mean_token_accuracy": 0.5141571553994733, + "step": 14265 + }, + { + "epoch": 2.6447905079718206, + "grad_norm": 7.27734375, + "learning_rate": 7.35520949202818e-06, + "loss": 2.478, + "mean_token_accuracy": 0.5173444976076556, + "step": 14266 + }, + { + "epoch": 2.6449758991472008, + "grad_norm": 9.1171875, + "learning_rate": 7.3550241008528e-06, + "loss": 3.1787, + "mean_token_accuracy": 0.47163073667504296, + "step": 14267 + }, + { + "epoch": 2.6451612903225805, + "grad_norm": 8.4140625, + "learning_rate": 7.35483870967742e-06, + "loss": 2.9707, + "mean_token_accuracy": 0.46900429057552895, + "step": 14268 + }, + { + "epoch": 2.6453466814979607, + "grad_norm": 6.4921875, + "learning_rate": 7.354653318502039e-06, + "loss": 3.2017, + "mean_token_accuracy": 0.45097402597402597, + "step": 14269 + }, + { + "epoch": 2.645532072673341, + "grad_norm": 7.96875, + "learning_rate": 7.35446792732666e-06, + "loss": 3.0517, + "mean_token_accuracy": 0.4843972754633296, + "step": 14270 + }, + { + "epoch": 2.6457174638487206, + "grad_norm": 7.484375, + "learning_rate": 7.3542825361512795e-06, + "loss": 2.7003, + "mean_token_accuracy": 0.4737973662993819, + "step": 14271 + }, + { + "epoch": 2.645902855024101, + "grad_norm": 7.81640625, + "learning_rate": 7.3540971449759e-06, + "loss": 3.5894, + "mean_token_accuracy": 0.43482466747279325, + "step": 14272 + }, + { + "epoch": 2.646088246199481, + "grad_norm": 6.19140625, + "learning_rate": 7.3539117538005204e-06, + "loss": 2.5726, + "mean_token_accuracy": 0.5312038794012228, + "step": 14273 + }, + { + "epoch": 2.646273637374861, + "grad_norm": 7.37890625, + "learning_rate": 7.35372636262514e-06, + "loss": 2.8285, + "mean_token_accuracy": 0.47429798987264077, + "step": 14274 + }, + { + "epoch": 2.646459028550241, + "grad_norm": 6.234375, + "learning_rate": 7.35354097144976e-06, + "loss": 2.3178, + "mean_token_accuracy": 0.5242883101150818, + "step": 14275 + }, + { + "epoch": 2.646644419725621, + "grad_norm": 10.0859375, + "learning_rate": 7.353355580274379e-06, + "loss": 2.7029, + "mean_token_accuracy": 0.47975475795120703, + "step": 14276 + }, + { + "epoch": 2.646829810901001, + "grad_norm": 7.57421875, + "learning_rate": 7.353170189098999e-06, + "loss": 2.9633, + "mean_token_accuracy": 0.4553066037735849, + "step": 14277 + }, + { + "epoch": 2.647015202076381, + "grad_norm": 7.02734375, + "learning_rate": 7.352984797923619e-06, + "loss": 3.2534, + "mean_token_accuracy": 0.4459183673469388, + "step": 14278 + }, + { + "epoch": 2.6472005932517613, + "grad_norm": 7.71875, + "learning_rate": 7.352799406748239e-06, + "loss": 2.6739, + "mean_token_accuracy": 0.5099650083675643, + "step": 14279 + }, + { + "epoch": 2.6473859844271415, + "grad_norm": 9.1015625, + "learning_rate": 7.35261401557286e-06, + "loss": 3.0928, + "mean_token_accuracy": 0.47110187110187113, + "step": 14280 + }, + { + "epoch": 2.647571375602521, + "grad_norm": 10.125, + "learning_rate": 7.352428624397479e-06, + "loss": 2.7022, + "mean_token_accuracy": 0.49726303658887927, + "step": 14281 + }, + { + "epoch": 2.6477567667779014, + "grad_norm": 7.9921875, + "learning_rate": 7.3522432332221e-06, + "loss": 3.222, + "mean_token_accuracy": 0.434966953485472, + "step": 14282 + }, + { + "epoch": 2.6479421579532816, + "grad_norm": 8.9140625, + "learning_rate": 7.3520578420467194e-06, + "loss": 2.5268, + "mean_token_accuracy": 0.5171227969623156, + "step": 14283 + }, + { + "epoch": 2.6481275491286613, + "grad_norm": 8.8671875, + "learning_rate": 7.351872450871339e-06, + "loss": 3.3809, + "mean_token_accuracy": 0.4357548509228585, + "step": 14284 + }, + { + "epoch": 2.6483129403040415, + "grad_norm": 11.4609375, + "learning_rate": 7.351687059695959e-06, + "loss": 3.2992, + "mean_token_accuracy": 0.46227278671374394, + "step": 14285 + }, + { + "epoch": 2.6484983314794217, + "grad_norm": 8.3515625, + "learning_rate": 7.351501668520578e-06, + "loss": 3.4116, + "mean_token_accuracy": 0.4252127506130102, + "step": 14286 + }, + { + "epoch": 2.648683722654802, + "grad_norm": 7.453125, + "learning_rate": 7.351316277345199e-06, + "loss": 2.658, + "mean_token_accuracy": 0.5203423304805793, + "step": 14287 + }, + { + "epoch": 2.6488691138301816, + "grad_norm": 7.6484375, + "learning_rate": 7.351130886169819e-06, + "loss": 2.8526, + "mean_token_accuracy": 0.4992313604919293, + "step": 14288 + }, + { + "epoch": 2.649054505005562, + "grad_norm": 6.49609375, + "learning_rate": 7.350945494994439e-06, + "loss": 2.9531, + "mean_token_accuracy": 0.47182353829883666, + "step": 14289 + }, + { + "epoch": 2.6492398961809416, + "grad_norm": 8.796875, + "learning_rate": 7.350760103819059e-06, + "loss": 3.0691, + "mean_token_accuracy": 0.4632891860614633, + "step": 14290 + }, + { + "epoch": 2.6494252873563218, + "grad_norm": 8.8203125, + "learning_rate": 7.350574712643679e-06, + "loss": 2.9039, + "mean_token_accuracy": 0.49181969949916526, + "step": 14291 + }, + { + "epoch": 2.649610678531702, + "grad_norm": 6.859375, + "learning_rate": 7.350389321468299e-06, + "loss": 2.7767, + "mean_token_accuracy": 0.47903316469926926, + "step": 14292 + }, + { + "epoch": 2.649796069707082, + "grad_norm": 9.5546875, + "learning_rate": 7.350203930292918e-06, + "loss": 2.4519, + "mean_token_accuracy": 0.5420047732696898, + "step": 14293 + }, + { + "epoch": 2.649981460882462, + "grad_norm": 10.3203125, + "learning_rate": 7.350018539117538e-06, + "loss": 2.6788, + "mean_token_accuracy": 0.486514657980456, + "step": 14294 + }, + { + "epoch": 2.650166852057842, + "grad_norm": 9.2109375, + "learning_rate": 7.349833147942158e-06, + "loss": 2.5203, + "mean_token_accuracy": 0.5370532915360502, + "step": 14295 + }, + { + "epoch": 2.650352243233222, + "grad_norm": 6.67578125, + "learning_rate": 7.349647756766779e-06, + "loss": 2.4733, + "mean_token_accuracy": 0.5176431424766977, + "step": 14296 + }, + { + "epoch": 2.650537634408602, + "grad_norm": 11.859375, + "learning_rate": 7.349462365591399e-06, + "loss": 2.5689, + "mean_token_accuracy": 0.5104194496393267, + "step": 14297 + }, + { + "epoch": 2.650723025583982, + "grad_norm": 9.0546875, + "learning_rate": 7.349276974416018e-06, + "loss": 2.7057, + "mean_token_accuracy": 0.4876954627018713, + "step": 14298 + }, + { + "epoch": 2.6509084167593624, + "grad_norm": 7.328125, + "learning_rate": 7.349091583240639e-06, + "loss": 2.8154, + "mean_token_accuracy": 0.48210227272727274, + "step": 14299 + }, + { + "epoch": 2.651093807934742, + "grad_norm": 7.04296875, + "learning_rate": 7.3489061920652585e-06, + "loss": 2.7913, + "mean_token_accuracy": 0.4715729304889252, + "step": 14300 + }, + { + "epoch": 2.6512791991101223, + "grad_norm": 7.65625, + "learning_rate": 7.348720800889878e-06, + "loss": 3.1335, + "mean_token_accuracy": 0.4390834763298678, + "step": 14301 + }, + { + "epoch": 2.6514645902855025, + "grad_norm": 8.1640625, + "learning_rate": 7.348535409714498e-06, + "loss": 2.7062, + "mean_token_accuracy": 0.5124170413187754, + "step": 14302 + }, + { + "epoch": 2.6516499814608823, + "grad_norm": 9.5859375, + "learning_rate": 7.348350018539117e-06, + "loss": 2.7289, + "mean_token_accuracy": 0.4888211382113821, + "step": 14303 + }, + { + "epoch": 2.6518353726362625, + "grad_norm": 8.890625, + "learning_rate": 7.348164627363739e-06, + "loss": 2.6641, + "mean_token_accuracy": 0.5122745490981964, + "step": 14304 + }, + { + "epoch": 2.6520207638116426, + "grad_norm": 8.0859375, + "learning_rate": 7.347979236188358e-06, + "loss": 3.1834, + "mean_token_accuracy": 0.4539279385705848, + "step": 14305 + }, + { + "epoch": 2.652206154987023, + "grad_norm": 7.73046875, + "learning_rate": 7.347793845012978e-06, + "loss": 3.3326, + "mean_token_accuracy": 0.45745159837910854, + "step": 14306 + }, + { + "epoch": 2.6523915461624026, + "grad_norm": 9.8203125, + "learning_rate": 7.347608453837598e-06, + "loss": 2.7702, + "mean_token_accuracy": 0.4783176214648296, + "step": 14307 + }, + { + "epoch": 2.6525769373377828, + "grad_norm": 7.45703125, + "learning_rate": 7.347423062662218e-06, + "loss": 2.6301, + "mean_token_accuracy": 0.515926493108729, + "step": 14308 + }, + { + "epoch": 2.6527623285131625, + "grad_norm": 8.015625, + "learning_rate": 7.347237671486838e-06, + "loss": 2.8802, + "mean_token_accuracy": 0.5093541564991052, + "step": 14309 + }, + { + "epoch": 2.6529477196885427, + "grad_norm": 10.171875, + "learning_rate": 7.3470522803114574e-06, + "loss": 2.8799, + "mean_token_accuracy": 0.48279608837377763, + "step": 14310 + }, + { + "epoch": 2.653133110863923, + "grad_norm": 7.69921875, + "learning_rate": 7.346866889136077e-06, + "loss": 2.5841, + "mean_token_accuracy": 0.5251419864247125, + "step": 14311 + }, + { + "epoch": 2.653318502039303, + "grad_norm": 7.859375, + "learning_rate": 7.3466814979606984e-06, + "loss": 2.7497, + "mean_token_accuracy": 0.4897693209956992, + "step": 14312 + }, + { + "epoch": 2.653503893214683, + "grad_norm": 9.3515625, + "learning_rate": 7.346496106785318e-06, + "loss": 3.2413, + "mean_token_accuracy": 0.42471207166216407, + "step": 14313 + }, + { + "epoch": 2.653689284390063, + "grad_norm": 8.2421875, + "learning_rate": 7.346310715609938e-06, + "loss": 2.6391, + "mean_token_accuracy": 0.5070757670632435, + "step": 14314 + }, + { + "epoch": 2.653874675565443, + "grad_norm": 8.453125, + "learning_rate": 7.346125324434557e-06, + "loss": 3.6969, + "mean_token_accuracy": 0.43725247524752475, + "step": 14315 + }, + { + "epoch": 2.654060066740823, + "grad_norm": 9.5703125, + "learning_rate": 7.345939933259177e-06, + "loss": 2.6861, + "mean_token_accuracy": 0.4831869130566495, + "step": 14316 + }, + { + "epoch": 2.654245457916203, + "grad_norm": 9.25, + "learning_rate": 7.3457545420837975e-06, + "loss": 2.2485, + "mean_token_accuracy": 0.5464669229749437, + "step": 14317 + }, + { + "epoch": 2.6544308490915833, + "grad_norm": 7.98828125, + "learning_rate": 7.345569150908417e-06, + "loss": 2.649, + "mean_token_accuracy": 0.49554896142433236, + "step": 14318 + }, + { + "epoch": 2.6546162402669635, + "grad_norm": 8.4609375, + "learning_rate": 7.345383759733037e-06, + "loss": 2.3793, + "mean_token_accuracy": 0.5355987055016181, + "step": 14319 + }, + { + "epoch": 2.6548016314423433, + "grad_norm": 8.6328125, + "learning_rate": 7.345198368557658e-06, + "loss": 3.2562, + "mean_token_accuracy": 0.4733650931139021, + "step": 14320 + }, + { + "epoch": 2.6549870226177235, + "grad_norm": 8.2421875, + "learning_rate": 7.345012977382278e-06, + "loss": 3.1708, + "mean_token_accuracy": 0.46873025900189513, + "step": 14321 + }, + { + "epoch": 2.655172413793103, + "grad_norm": 8.03125, + "learning_rate": 7.344827586206897e-06, + "loss": 2.8323, + "mean_token_accuracy": 0.4583420776495278, + "step": 14322 + }, + { + "epoch": 2.6553578049684834, + "grad_norm": 7.25390625, + "learning_rate": 7.344642195031517e-06, + "loss": 3.0198, + "mean_token_accuracy": 0.4683162341581171, + "step": 14323 + }, + { + "epoch": 2.6555431961438636, + "grad_norm": 9.828125, + "learning_rate": 7.344456803856137e-06, + "loss": 2.8613, + "mean_token_accuracy": 0.4884003400947407, + "step": 14324 + }, + { + "epoch": 2.6557285873192438, + "grad_norm": 8.515625, + "learning_rate": 7.344271412680757e-06, + "loss": 2.43, + "mean_token_accuracy": 0.49431311329170385, + "step": 14325 + }, + { + "epoch": 2.6559139784946235, + "grad_norm": 7.8515625, + "learning_rate": 7.344086021505377e-06, + "loss": 3.3819, + "mean_token_accuracy": 0.44514693829762325, + "step": 14326 + }, + { + "epoch": 2.6560993696700037, + "grad_norm": 7.88671875, + "learning_rate": 7.3439006303299965e-06, + "loss": 3.8092, + "mean_token_accuracy": 0.4363689011148111, + "step": 14327 + }, + { + "epoch": 2.656284760845384, + "grad_norm": 9.390625, + "learning_rate": 7.343715239154617e-06, + "loss": 2.7322, + "mean_token_accuracy": 0.5014848812095032, + "step": 14328 + }, + { + "epoch": 2.6564701520207636, + "grad_norm": 6.6484375, + "learning_rate": 7.3435298479792375e-06, + "loss": 2.7605, + "mean_token_accuracy": 0.48818137964302943, + "step": 14329 + }, + { + "epoch": 2.656655543196144, + "grad_norm": 9.171875, + "learning_rate": 7.343344456803857e-06, + "loss": 3.4169, + "mean_token_accuracy": 0.42405816259087903, + "step": 14330 + }, + { + "epoch": 2.656840934371524, + "grad_norm": 9.609375, + "learning_rate": 7.343159065628477e-06, + "loss": 2.5572, + "mean_token_accuracy": 0.527061556329849, + "step": 14331 + }, + { + "epoch": 2.657026325546904, + "grad_norm": 7.12109375, + "learning_rate": 7.342973674453096e-06, + "loss": 2.4386, + "mean_token_accuracy": 0.520631810946492, + "step": 14332 + }, + { + "epoch": 2.657211716722284, + "grad_norm": 8.0625, + "learning_rate": 7.342788283277716e-06, + "loss": 3.3779, + "mean_token_accuracy": 0.43190661478599224, + "step": 14333 + }, + { + "epoch": 2.657397107897664, + "grad_norm": 8.8515625, + "learning_rate": 7.3426028921023365e-06, + "loss": 3.3593, + "mean_token_accuracy": 0.4447821681864235, + "step": 14334 + }, + { + "epoch": 2.657582499073044, + "grad_norm": 8.3125, + "learning_rate": 7.342417500926956e-06, + "loss": 2.72, + "mean_token_accuracy": 0.49237855027963595, + "step": 14335 + }, + { + "epoch": 2.657767890248424, + "grad_norm": 7.30078125, + "learning_rate": 7.342232109751577e-06, + "loss": 2.8139, + "mean_token_accuracy": 0.5040805916857944, + "step": 14336 + }, + { + "epoch": 2.6579532814238043, + "grad_norm": 6.9375, + "learning_rate": 7.342046718576197e-06, + "loss": 3.1177, + "mean_token_accuracy": 0.4613195615514334, + "step": 14337 + }, + { + "epoch": 2.6581386725991845, + "grad_norm": 9.7109375, + "learning_rate": 7.341861327400817e-06, + "loss": 2.4282, + "mean_token_accuracy": 0.5263311878291399, + "step": 14338 + }, + { + "epoch": 2.658324063774564, + "grad_norm": 9.9921875, + "learning_rate": 7.3416759362254364e-06, + "loss": 2.5198, + "mean_token_accuracy": 0.5300751879699248, + "step": 14339 + }, + { + "epoch": 2.6585094549499444, + "grad_norm": 7.44140625, + "learning_rate": 7.341490545050056e-06, + "loss": 2.7297, + "mean_token_accuracy": 0.49147596375364766, + "step": 14340 + }, + { + "epoch": 2.6586948461253246, + "grad_norm": 9.078125, + "learning_rate": 7.341305153874676e-06, + "loss": 3.0315, + "mean_token_accuracy": 0.4886835409133521, + "step": 14341 + }, + { + "epoch": 2.6588802373007043, + "grad_norm": 7.5703125, + "learning_rate": 7.341119762699296e-06, + "loss": 2.7941, + "mean_token_accuracy": 0.46582185938832704, + "step": 14342 + }, + { + "epoch": 2.6590656284760845, + "grad_norm": 8.40625, + "learning_rate": 7.340934371523916e-06, + "loss": 3.0662, + "mean_token_accuracy": 0.4630030783263026, + "step": 14343 + }, + { + "epoch": 2.6592510196514647, + "grad_norm": 7.328125, + "learning_rate": 7.340748980348536e-06, + "loss": 2.8566, + "mean_token_accuracy": 0.4864437689969605, + "step": 14344 + }, + { + "epoch": 2.659436410826845, + "grad_norm": 6.8125, + "learning_rate": 7.340563589173156e-06, + "loss": 3.1324, + "mean_token_accuracy": 0.4594193946880791, + "step": 14345 + }, + { + "epoch": 2.6596218020022246, + "grad_norm": 9.875, + "learning_rate": 7.3403781979977765e-06, + "loss": 2.1975, + "mean_token_accuracy": 0.5395151515151515, + "step": 14346 + }, + { + "epoch": 2.659807193177605, + "grad_norm": 8.171875, + "learning_rate": 7.340192806822396e-06, + "loss": 2.3029, + "mean_token_accuracy": 0.5442312816413536, + "step": 14347 + }, + { + "epoch": 2.6599925843529846, + "grad_norm": 8.515625, + "learning_rate": 7.340007415647016e-06, + "loss": 2.9849, + "mean_token_accuracy": 0.45294768890119014, + "step": 14348 + }, + { + "epoch": 2.6601779755283648, + "grad_norm": 8.5234375, + "learning_rate": 7.3398220244716354e-06, + "loss": 3.3485, + "mean_token_accuracy": 0.4499722838137472, + "step": 14349 + }, + { + "epoch": 2.660363366703745, + "grad_norm": 8.046875, + "learning_rate": 7.339636633296255e-06, + "loss": 3.5782, + "mean_token_accuracy": 0.4275343531241898, + "step": 14350 + }, + { + "epoch": 2.660548757879125, + "grad_norm": 8.703125, + "learning_rate": 7.3394512421208756e-06, + "loss": 2.9482, + "mean_token_accuracy": 0.5027027027027027, + "step": 14351 + }, + { + "epoch": 2.660734149054505, + "grad_norm": 9.0, + "learning_rate": 7.339265850945495e-06, + "loss": 2.6088, + "mean_token_accuracy": 0.4948390267879086, + "step": 14352 + }, + { + "epoch": 2.660919540229885, + "grad_norm": 9.203125, + "learning_rate": 7.339080459770116e-06, + "loss": 4.6201, + "mean_token_accuracy": 0.368457459073092, + "step": 14353 + }, + { + "epoch": 2.6611049314052653, + "grad_norm": 8.3515625, + "learning_rate": 7.338895068594736e-06, + "loss": 3.0571, + "mean_token_accuracy": 0.47402360384474995, + "step": 14354 + }, + { + "epoch": 2.661290322580645, + "grad_norm": 7.359375, + "learning_rate": 7.338709677419356e-06, + "loss": 2.7064, + "mean_token_accuracy": 0.486412672402021, + "step": 14355 + }, + { + "epoch": 2.661475713756025, + "grad_norm": 9.6015625, + "learning_rate": 7.3385242862439755e-06, + "loss": 2.84, + "mean_token_accuracy": 0.4868532654792197, + "step": 14356 + }, + { + "epoch": 2.6616611049314054, + "grad_norm": 7.74609375, + "learning_rate": 7.338338895068595e-06, + "loss": 2.5029, + "mean_token_accuracy": 0.5128824476650563, + "step": 14357 + }, + { + "epoch": 2.6618464961067856, + "grad_norm": 7.59375, + "learning_rate": 7.338153503893215e-06, + "loss": 3.0158, + "mean_token_accuracy": 0.48630306021717673, + "step": 14358 + }, + { + "epoch": 2.6620318872821653, + "grad_norm": 7.4296875, + "learning_rate": 7.337968112717834e-06, + "loss": 2.9526, + "mean_token_accuracy": 0.4682871249525256, + "step": 14359 + }, + { + "epoch": 2.6622172784575455, + "grad_norm": 9.515625, + "learning_rate": 7.337782721542455e-06, + "loss": 2.4451, + "mean_token_accuracy": 0.5371870842907244, + "step": 14360 + }, + { + "epoch": 2.6624026696329253, + "grad_norm": 8.359375, + "learning_rate": 7.337597330367075e-06, + "loss": 2.6715, + "mean_token_accuracy": 0.5096519847743338, + "step": 14361 + }, + { + "epoch": 2.6625880608083055, + "grad_norm": 7.578125, + "learning_rate": 7.337411939191695e-06, + "loss": 3.0092, + "mean_token_accuracy": 0.4981901941428101, + "step": 14362 + }, + { + "epoch": 2.6627734519836856, + "grad_norm": 9.046875, + "learning_rate": 7.3372265480163155e-06, + "loss": 2.7157, + "mean_token_accuracy": 0.503098429168468, + "step": 14363 + }, + { + "epoch": 2.662958843159066, + "grad_norm": 13.7890625, + "learning_rate": 7.337041156840935e-06, + "loss": 3.1592, + "mean_token_accuracy": 0.4386603567528213, + "step": 14364 + }, + { + "epoch": 2.6631442343344456, + "grad_norm": 7.44921875, + "learning_rate": 7.336855765665555e-06, + "loss": 2.7982, + "mean_token_accuracy": 0.47955550760599847, + "step": 14365 + }, + { + "epoch": 2.6633296255098258, + "grad_norm": 7.33203125, + "learning_rate": 7.3366703744901745e-06, + "loss": 2.9848, + "mean_token_accuracy": 0.4629433250854248, + "step": 14366 + }, + { + "epoch": 2.6635150166852055, + "grad_norm": 7.80078125, + "learning_rate": 7.336484983314794e-06, + "loss": 2.6045, + "mean_token_accuracy": 0.5093696763202725, + "step": 14367 + }, + { + "epoch": 2.6637004078605857, + "grad_norm": 7.52734375, + "learning_rate": 7.336299592139415e-06, + "loss": 2.2891, + "mean_token_accuracy": 0.5421074904782057, + "step": 14368 + }, + { + "epoch": 2.663885799035966, + "grad_norm": 8.7890625, + "learning_rate": 7.336114200964035e-06, + "loss": 2.8799, + "mean_token_accuracy": 0.4768488326594496, + "step": 14369 + }, + { + "epoch": 2.664071190211346, + "grad_norm": 7.5390625, + "learning_rate": 7.335928809788655e-06, + "loss": 2.9566, + "mean_token_accuracy": 0.4744389607970947, + "step": 14370 + }, + { + "epoch": 2.664256581386726, + "grad_norm": 8.8671875, + "learning_rate": 7.335743418613274e-06, + "loss": 2.8167, + "mean_token_accuracy": 0.4772811230144071, + "step": 14371 + }, + { + "epoch": 2.664441972562106, + "grad_norm": 8.703125, + "learning_rate": 7.335558027437895e-06, + "loss": 2.7402, + "mean_token_accuracy": 0.5307187593200119, + "step": 14372 + }, + { + "epoch": 2.664627363737486, + "grad_norm": 7.8984375, + "learning_rate": 7.3353726362625145e-06, + "loss": 3.0046, + "mean_token_accuracy": 0.47235216504737915, + "step": 14373 + }, + { + "epoch": 2.664812754912866, + "grad_norm": 8.3671875, + "learning_rate": 7.335187245087134e-06, + "loss": 2.6115, + "mean_token_accuracy": 0.5032178217821782, + "step": 14374 + }, + { + "epoch": 2.664998146088246, + "grad_norm": 8.328125, + "learning_rate": 7.335001853911754e-06, + "loss": 2.8469, + "mean_token_accuracy": 0.5011686554311724, + "step": 14375 + }, + { + "epoch": 2.6651835372636263, + "grad_norm": 7.03515625, + "learning_rate": 7.3348164627363734e-06, + "loss": 2.9636, + "mean_token_accuracy": 0.46422018348623856, + "step": 14376 + }, + { + "epoch": 2.6653689284390065, + "grad_norm": 14.09375, + "learning_rate": 7.334631071560995e-06, + "loss": 2.856, + "mean_token_accuracy": 0.48548812664907653, + "step": 14377 + }, + { + "epoch": 2.6655543196143863, + "grad_norm": 10.5078125, + "learning_rate": 7.3344456803856144e-06, + "loss": 2.6905, + "mean_token_accuracy": 0.4989408099688474, + "step": 14378 + }, + { + "epoch": 2.6657397107897665, + "grad_norm": 8.4765625, + "learning_rate": 7.334260289210234e-06, + "loss": 2.6666, + "mean_token_accuracy": 0.5112605400789839, + "step": 14379 + }, + { + "epoch": 2.665925101965146, + "grad_norm": 7.63671875, + "learning_rate": 7.3340748980348546e-06, + "loss": 3.339, + "mean_token_accuracy": 0.4401919763721388, + "step": 14380 + }, + { + "epoch": 2.6661104931405264, + "grad_norm": 7.3984375, + "learning_rate": 7.333889506859474e-06, + "loss": 2.4887, + "mean_token_accuracy": 0.5197666882696047, + "step": 14381 + }, + { + "epoch": 2.6662958843159066, + "grad_norm": 7.80859375, + "learning_rate": 7.333704115684094e-06, + "loss": 2.4098, + "mean_token_accuracy": 0.5115528591712921, + "step": 14382 + }, + { + "epoch": 2.6664812754912868, + "grad_norm": 8.2109375, + "learning_rate": 7.3335187245087135e-06, + "loss": 2.9373, + "mean_token_accuracy": 0.4772420009013069, + "step": 14383 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 8.5703125, + "learning_rate": 7.333333333333333e-06, + "loss": 3.1502, + "mean_token_accuracy": 0.47878463391781556, + "step": 14384 + }, + { + "epoch": 2.6668520578420467, + "grad_norm": 8.4375, + "learning_rate": 7.3331479421579545e-06, + "loss": 2.3126, + "mean_token_accuracy": 0.5543495610534717, + "step": 14385 + }, + { + "epoch": 2.667037449017427, + "grad_norm": 11.140625, + "learning_rate": 7.332962550982574e-06, + "loss": 3.3788, + "mean_token_accuracy": 0.4505475404136972, + "step": 14386 + }, + { + "epoch": 2.6672228401928066, + "grad_norm": 10.046875, + "learning_rate": 7.332777159807194e-06, + "loss": 3.0358, + "mean_token_accuracy": 0.4616791101630352, + "step": 14387 + }, + { + "epoch": 2.667408231368187, + "grad_norm": 8.4140625, + "learning_rate": 7.332591768631813e-06, + "loss": 2.7218, + "mean_token_accuracy": 0.47732958098811756, + "step": 14388 + }, + { + "epoch": 2.667593622543567, + "grad_norm": 9.1171875, + "learning_rate": 7.332406377456434e-06, + "loss": 3.0542, + "mean_token_accuracy": 0.460020464844321, + "step": 14389 + }, + { + "epoch": 2.667779013718947, + "grad_norm": 9.1640625, + "learning_rate": 7.3322209862810535e-06, + "loss": 2.533, + "mean_token_accuracy": 0.5123526745240253, + "step": 14390 + }, + { + "epoch": 2.667964404894327, + "grad_norm": 7.09375, + "learning_rate": 7.332035595105673e-06, + "loss": 2.6494, + "mean_token_accuracy": 0.48421185054723864, + "step": 14391 + }, + { + "epoch": 2.668149796069707, + "grad_norm": 10.453125, + "learning_rate": 7.331850203930293e-06, + "loss": 3.4936, + "mean_token_accuracy": 0.4476819290943017, + "step": 14392 + }, + { + "epoch": 2.668335187245087, + "grad_norm": 9.5078125, + "learning_rate": 7.331664812754914e-06, + "loss": 2.6458, + "mean_token_accuracy": 0.500997150997151, + "step": 14393 + }, + { + "epoch": 2.668520578420467, + "grad_norm": 7.17578125, + "learning_rate": 7.331479421579534e-06, + "loss": 2.9601, + "mean_token_accuracy": 0.47411679884643115, + "step": 14394 + }, + { + "epoch": 2.6687059695958473, + "grad_norm": 9.953125, + "learning_rate": 7.3312940304041535e-06, + "loss": 3.2614, + "mean_token_accuracy": 0.4641962944416625, + "step": 14395 + }, + { + "epoch": 2.6688913607712275, + "grad_norm": 8.328125, + "learning_rate": 7.331108639228773e-06, + "loss": 3.2525, + "mean_token_accuracy": 0.4509118161378966, + "step": 14396 + }, + { + "epoch": 2.669076751946607, + "grad_norm": 11.703125, + "learning_rate": 7.330923248053393e-06, + "loss": 3.4304, + "mean_token_accuracy": 0.4450656611623976, + "step": 14397 + }, + { + "epoch": 2.6692621431219874, + "grad_norm": 7.65234375, + "learning_rate": 7.330737856878013e-06, + "loss": 3.2061, + "mean_token_accuracy": 0.4449357468564322, + "step": 14398 + }, + { + "epoch": 2.6694475342973676, + "grad_norm": 7.9765625, + "learning_rate": 7.330552465702633e-06, + "loss": 2.6651, + "mean_token_accuracy": 0.5024531668153435, + "step": 14399 + }, + { + "epoch": 2.6696329254727473, + "grad_norm": 9.3359375, + "learning_rate": 7.3303670745272525e-06, + "loss": 2.891, + "mean_token_accuracy": 0.4740767472633225, + "step": 14400 + }, + { + "epoch": 2.6698183166481275, + "grad_norm": 8.515625, + "learning_rate": 7.330181683351874e-06, + "loss": 2.4907, + "mean_token_accuracy": 0.5204359673024523, + "step": 14401 + }, + { + "epoch": 2.6700037078235077, + "grad_norm": 8.859375, + "learning_rate": 7.3299962921764935e-06, + "loss": 2.1434, + "mean_token_accuracy": 0.5657636435021594, + "step": 14402 + }, + { + "epoch": 2.670189098998888, + "grad_norm": 8.40625, + "learning_rate": 7.329810901001113e-06, + "loss": 3.3618, + "mean_token_accuracy": 0.4505754544476713, + "step": 14403 + }, + { + "epoch": 2.6703744901742676, + "grad_norm": 8.0, + "learning_rate": 7.329625509825733e-06, + "loss": 3.0273, + "mean_token_accuracy": 0.48359414437152953, + "step": 14404 + }, + { + "epoch": 2.670559881349648, + "grad_norm": 7.58203125, + "learning_rate": 7.3294401186503524e-06, + "loss": 2.9479, + "mean_token_accuracy": 0.4811188811188811, + "step": 14405 + }, + { + "epoch": 2.6707452725250276, + "grad_norm": 7.96875, + "learning_rate": 7.329254727474973e-06, + "loss": 2.9317, + "mean_token_accuracy": 0.4608429545138406, + "step": 14406 + }, + { + "epoch": 2.6709306637004078, + "grad_norm": 9.9296875, + "learning_rate": 7.329069336299593e-06, + "loss": 3.3822, + "mean_token_accuracy": 0.4625935162094763, + "step": 14407 + }, + { + "epoch": 2.671116054875788, + "grad_norm": 9.8203125, + "learning_rate": 7.328883945124212e-06, + "loss": 2.7646, + "mean_token_accuracy": 0.5155367231638418, + "step": 14408 + }, + { + "epoch": 2.671301446051168, + "grad_norm": 6.75390625, + "learning_rate": 7.328698553948833e-06, + "loss": 3.0527, + "mean_token_accuracy": 0.4783256570035221, + "step": 14409 + }, + { + "epoch": 2.671486837226548, + "grad_norm": 9.921875, + "learning_rate": 7.328513162773453e-06, + "loss": 2.2954, + "mean_token_accuracy": 0.5635332252836305, + "step": 14410 + }, + { + "epoch": 2.671672228401928, + "grad_norm": 9.9296875, + "learning_rate": 7.328327771598073e-06, + "loss": 3.0179, + "mean_token_accuracy": 0.5013857727332718, + "step": 14411 + }, + { + "epoch": 2.6718576195773083, + "grad_norm": 7.89453125, + "learning_rate": 7.3281423804226925e-06, + "loss": 3.0008, + "mean_token_accuracy": 0.47876546358882793, + "step": 14412 + }, + { + "epoch": 2.672043010752688, + "grad_norm": 7.6484375, + "learning_rate": 7.327956989247312e-06, + "loss": 2.8653, + "mean_token_accuracy": 0.4782608695652174, + "step": 14413 + }, + { + "epoch": 2.672228401928068, + "grad_norm": 9.2421875, + "learning_rate": 7.327771598071932e-06, + "loss": 2.3436, + "mean_token_accuracy": 0.527347036903474, + "step": 14414 + }, + { + "epoch": 2.6724137931034484, + "grad_norm": 7.359375, + "learning_rate": 7.327586206896552e-06, + "loss": 2.2808, + "mean_token_accuracy": 0.5082590456213949, + "step": 14415 + }, + { + "epoch": 2.6725991842788286, + "grad_norm": 7.73046875, + "learning_rate": 7.327400815721172e-06, + "loss": 3.5812, + "mean_token_accuracy": 0.42135817469960846, + "step": 14416 + }, + { + "epoch": 2.6727845754542083, + "grad_norm": 9.5703125, + "learning_rate": 7.327215424545792e-06, + "loss": 2.4771, + "mean_token_accuracy": 0.5110599078341014, + "step": 14417 + }, + { + "epoch": 2.6729699666295885, + "grad_norm": 8.5390625, + "learning_rate": 7.327030033370413e-06, + "loss": 3.0331, + "mean_token_accuracy": 0.48859375, + "step": 14418 + }, + { + "epoch": 2.6731553578049683, + "grad_norm": 8.2578125, + "learning_rate": 7.3268446421950326e-06, + "loss": 2.2458, + "mean_token_accuracy": 0.5618089027502127, + "step": 14419 + }, + { + "epoch": 2.6733407489803485, + "grad_norm": 11.6328125, + "learning_rate": 7.326659251019652e-06, + "loss": 3.7466, + "mean_token_accuracy": 0.44139727286655434, + "step": 14420 + }, + { + "epoch": 2.6735261401557286, + "grad_norm": 8.984375, + "learning_rate": 7.326473859844272e-06, + "loss": 3.6291, + "mean_token_accuracy": 0.4485969387755102, + "step": 14421 + }, + { + "epoch": 2.673711531331109, + "grad_norm": 7.42578125, + "learning_rate": 7.3262884686688915e-06, + "loss": 3.3756, + "mean_token_accuracy": 0.4565706570657066, + "step": 14422 + }, + { + "epoch": 2.6738969225064886, + "grad_norm": 7.91796875, + "learning_rate": 7.326103077493512e-06, + "loss": 2.4003, + "mean_token_accuracy": 0.5232634338138925, + "step": 14423 + }, + { + "epoch": 2.6740823136818688, + "grad_norm": 8.953125, + "learning_rate": 7.325917686318132e-06, + "loss": 2.391, + "mean_token_accuracy": 0.5338824821526633, + "step": 14424 + }, + { + "epoch": 2.6742677048572485, + "grad_norm": 6.7578125, + "learning_rate": 7.325732295142752e-06, + "loss": 3.005, + "mean_token_accuracy": 0.47583148558758315, + "step": 14425 + }, + { + "epoch": 2.6744530960326287, + "grad_norm": 7.09765625, + "learning_rate": 7.325546903967372e-06, + "loss": 2.9228, + "mean_token_accuracy": 0.47908163265306125, + "step": 14426 + }, + { + "epoch": 2.674638487208009, + "grad_norm": 6.41796875, + "learning_rate": 7.325361512791992e-06, + "loss": 2.4937, + "mean_token_accuracy": 0.4925915948275862, + "step": 14427 + }, + { + "epoch": 2.674823878383389, + "grad_norm": 8.4140625, + "learning_rate": 7.325176121616612e-06, + "loss": 2.5499, + "mean_token_accuracy": 0.5092957746478873, + "step": 14428 + }, + { + "epoch": 2.6750092695587693, + "grad_norm": 7.15234375, + "learning_rate": 7.3249907304412315e-06, + "loss": 3.3315, + "mean_token_accuracy": 0.4595533498759305, + "step": 14429 + }, + { + "epoch": 2.675194660734149, + "grad_norm": 8.2109375, + "learning_rate": 7.324805339265851e-06, + "loss": 3.4268, + "mean_token_accuracy": 0.4422024088847177, + "step": 14430 + }, + { + "epoch": 2.675380051909529, + "grad_norm": 9.25, + "learning_rate": 7.324619948090471e-06, + "loss": 3.3492, + "mean_token_accuracy": 0.440857762062279, + "step": 14431 + }, + { + "epoch": 2.675565443084909, + "grad_norm": 7.61328125, + "learning_rate": 7.324434556915091e-06, + "loss": 2.8591, + "mean_token_accuracy": 0.4747097844112769, + "step": 14432 + }, + { + "epoch": 2.675750834260289, + "grad_norm": 7.72265625, + "learning_rate": 7.324249165739712e-06, + "loss": 3.3891, + "mean_token_accuracy": 0.43711640486840736, + "step": 14433 + }, + { + "epoch": 2.6759362254356693, + "grad_norm": 10.7578125, + "learning_rate": 7.3240637745643315e-06, + "loss": 2.8709, + "mean_token_accuracy": 0.46384910932588197, + "step": 14434 + }, + { + "epoch": 2.6761216166110495, + "grad_norm": 8.640625, + "learning_rate": 7.323878383388951e-06, + "loss": 3.2718, + "mean_token_accuracy": 0.4524421593830334, + "step": 14435 + }, + { + "epoch": 2.6763070077864293, + "grad_norm": 8.0703125, + "learning_rate": 7.323692992213572e-06, + "loss": 2.4309, + "mean_token_accuracy": 0.5226392612451594, + "step": 14436 + }, + { + "epoch": 2.6764923989618095, + "grad_norm": 8.046875, + "learning_rate": 7.323507601038191e-06, + "loss": 2.9711, + "mean_token_accuracy": 0.47438370846730976, + "step": 14437 + }, + { + "epoch": 2.676677790137189, + "grad_norm": 9.21875, + "learning_rate": 7.323322209862811e-06, + "loss": 3.8425, + "mean_token_accuracy": 0.42394822006472493, + "step": 14438 + }, + { + "epoch": 2.6768631813125694, + "grad_norm": 8.625, + "learning_rate": 7.3231368186874305e-06, + "loss": 2.8295, + "mean_token_accuracy": 0.48242540087076563, + "step": 14439 + }, + { + "epoch": 2.6770485724879496, + "grad_norm": 8.9296875, + "learning_rate": 7.32295142751205e-06, + "loss": 2.6654, + "mean_token_accuracy": 0.4991403026134801, + "step": 14440 + }, + { + "epoch": 2.6772339636633298, + "grad_norm": 13.3359375, + "learning_rate": 7.3227660363366715e-06, + "loss": 2.7126, + "mean_token_accuracy": 0.4706781055353155, + "step": 14441 + }, + { + "epoch": 2.6774193548387095, + "grad_norm": 10.5859375, + "learning_rate": 7.322580645161291e-06, + "loss": 2.9525, + "mean_token_accuracy": 0.48564834847829225, + "step": 14442 + }, + { + "epoch": 2.6776047460140897, + "grad_norm": 8.9453125, + "learning_rate": 7.322395253985911e-06, + "loss": 3.0842, + "mean_token_accuracy": 0.4722810514513517, + "step": 14443 + }, + { + "epoch": 2.67779013718947, + "grad_norm": 8.7421875, + "learning_rate": 7.322209862810531e-06, + "loss": 2.6741, + "mean_token_accuracy": 0.5108436921449352, + "step": 14444 + }, + { + "epoch": 2.6779755283648496, + "grad_norm": 13.3046875, + "learning_rate": 7.322024471635151e-06, + "loss": 3.0802, + "mean_token_accuracy": 0.47243675099866844, + "step": 14445 + }, + { + "epoch": 2.67816091954023, + "grad_norm": 8.109375, + "learning_rate": 7.3218390804597706e-06, + "loss": 2.1807, + "mean_token_accuracy": 0.5350964737192282, + "step": 14446 + }, + { + "epoch": 2.67834631071561, + "grad_norm": 9.0625, + "learning_rate": 7.32165368928439e-06, + "loss": 3.0076, + "mean_token_accuracy": 0.46537008535049673, + "step": 14447 + }, + { + "epoch": 2.67853170189099, + "grad_norm": 17.09375, + "learning_rate": 7.32146829810901e-06, + "loss": 2.6424, + "mean_token_accuracy": 0.5080044865912103, + "step": 14448 + }, + { + "epoch": 2.67871709306637, + "grad_norm": 12.5703125, + "learning_rate": 7.321282906933631e-06, + "loss": 2.9962, + "mean_token_accuracy": 0.47645473283653084, + "step": 14449 + }, + { + "epoch": 2.67890248424175, + "grad_norm": 10.8046875, + "learning_rate": 7.321097515758251e-06, + "loss": 2.6283, + "mean_token_accuracy": 0.5406218655967904, + "step": 14450 + }, + { + "epoch": 2.67908787541713, + "grad_norm": 17.796875, + "learning_rate": 7.3209121245828705e-06, + "loss": 2.4369, + "mean_token_accuracy": 0.5225100215849522, + "step": 14451 + }, + { + "epoch": 2.67927326659251, + "grad_norm": 18.4375, + "learning_rate": 7.32072673340749e-06, + "loss": 3.3878, + "mean_token_accuracy": 0.4425444596443228, + "step": 14452 + }, + { + "epoch": 2.6794586577678903, + "grad_norm": 14.59375, + "learning_rate": 7.320541342232111e-06, + "loss": 2.8386, + "mean_token_accuracy": 0.47734843437708196, + "step": 14453 + }, + { + "epoch": 2.6796440489432705, + "grad_norm": 6.55859375, + "learning_rate": 7.32035595105673e-06, + "loss": 3.1307, + "mean_token_accuracy": 0.4610519658480025, + "step": 14454 + }, + { + "epoch": 2.67982944011865, + "grad_norm": 11.453125, + "learning_rate": 7.32017055988135e-06, + "loss": 2.8486, + "mean_token_accuracy": 0.4752519933804724, + "step": 14455 + }, + { + "epoch": 2.6800148312940304, + "grad_norm": 13.171875, + "learning_rate": 7.3199851687059696e-06, + "loss": 2.9795, + "mean_token_accuracy": 0.46078304690528493, + "step": 14456 + }, + { + "epoch": 2.6802002224694106, + "grad_norm": 8.8671875, + "learning_rate": 7.319799777530591e-06, + "loss": 1.547, + "mean_token_accuracy": 0.649205078348393, + "step": 14457 + }, + { + "epoch": 2.6803856136447903, + "grad_norm": 7.14453125, + "learning_rate": 7.3196143863552105e-06, + "loss": 2.7919, + "mean_token_accuracy": 0.4840520540954325, + "step": 14458 + }, + { + "epoch": 2.6805710048201705, + "grad_norm": 10.53125, + "learning_rate": 7.31942899517983e-06, + "loss": 2.1802, + "mean_token_accuracy": 0.5456793336803748, + "step": 14459 + }, + { + "epoch": 2.6807563959955507, + "grad_norm": 12.5625, + "learning_rate": 7.31924360400445e-06, + "loss": 3.0239, + "mean_token_accuracy": 0.4828891342059612, + "step": 14460 + }, + { + "epoch": 2.680941787170931, + "grad_norm": 8.3125, + "learning_rate": 7.31905821282907e-06, + "loss": 3.1144, + "mean_token_accuracy": 0.46226415094339623, + "step": 14461 + }, + { + "epoch": 2.6811271783463106, + "grad_norm": 6.75, + "learning_rate": 7.31887282165369e-06, + "loss": 3.073, + "mean_token_accuracy": 0.4622485207100592, + "step": 14462 + }, + { + "epoch": 2.681312569521691, + "grad_norm": 9.4609375, + "learning_rate": 7.31868743047831e-06, + "loss": 2.9195, + "mean_token_accuracy": 0.5194730813287515, + "step": 14463 + }, + { + "epoch": 2.6814979606970706, + "grad_norm": 15.7265625, + "learning_rate": 7.318502039302929e-06, + "loss": 3.233, + "mean_token_accuracy": 0.45287089279787585, + "step": 14464 + }, + { + "epoch": 2.6816833518724508, + "grad_norm": 7.00390625, + "learning_rate": 7.318316648127551e-06, + "loss": 3.1165, + "mean_token_accuracy": 0.44779639975170704, + "step": 14465 + }, + { + "epoch": 2.681868743047831, + "grad_norm": 8.421875, + "learning_rate": 7.31813125695217e-06, + "loss": 2.7542, + "mean_token_accuracy": 0.5258884319308236, + "step": 14466 + }, + { + "epoch": 2.682054134223211, + "grad_norm": 9.953125, + "learning_rate": 7.31794586577679e-06, + "loss": 3.4616, + "mean_token_accuracy": 0.44388789505068577, + "step": 14467 + }, + { + "epoch": 2.682239525398591, + "grad_norm": 9.0703125, + "learning_rate": 7.3177604746014095e-06, + "loss": 3.0452, + "mean_token_accuracy": 0.4639529883904257, + "step": 14468 + }, + { + "epoch": 2.682424916573971, + "grad_norm": 6.9453125, + "learning_rate": 7.317575083426029e-06, + "loss": 2.5903, + "mean_token_accuracy": 0.5081680280046674, + "step": 14469 + }, + { + "epoch": 2.6826103077493513, + "grad_norm": 7.04296875, + "learning_rate": 7.31738969225065e-06, + "loss": 2.6123, + "mean_token_accuracy": 0.4862068965517241, + "step": 14470 + }, + { + "epoch": 2.682795698924731, + "grad_norm": 8.0078125, + "learning_rate": 7.317204301075269e-06, + "loss": 2.6064, + "mean_token_accuracy": 0.49676354867368955, + "step": 14471 + }, + { + "epoch": 2.682981090100111, + "grad_norm": 5.95703125, + "learning_rate": 7.317018909899889e-06, + "loss": 2.5964, + "mean_token_accuracy": 0.4882771491893153, + "step": 14472 + }, + { + "epoch": 2.6831664812754914, + "grad_norm": 7.296875, + "learning_rate": 7.31683351872451e-06, + "loss": 2.9497, + "mean_token_accuracy": 0.47953774385072095, + "step": 14473 + }, + { + "epoch": 2.6833518724508716, + "grad_norm": 10.109375, + "learning_rate": 7.31664812754913e-06, + "loss": 3.2987, + "mean_token_accuracy": 0.46673490276356194, + "step": 14474 + }, + { + "epoch": 2.6835372636262513, + "grad_norm": 7.7265625, + "learning_rate": 7.3164627363737496e-06, + "loss": 3.3764, + "mean_token_accuracy": 0.434087307304423, + "step": 14475 + }, + { + "epoch": 2.6837226548016315, + "grad_norm": 7.4140625, + "learning_rate": 7.316277345198369e-06, + "loss": 2.4983, + "mean_token_accuracy": 0.5052967638949354, + "step": 14476 + }, + { + "epoch": 2.6839080459770113, + "grad_norm": 10.015625, + "learning_rate": 7.316091954022989e-06, + "loss": 2.8413, + "mean_token_accuracy": 0.4908882192567083, + "step": 14477 + }, + { + "epoch": 2.6840934371523915, + "grad_norm": 8.8359375, + "learning_rate": 7.3159065628476085e-06, + "loss": 2.6331, + "mean_token_accuracy": 0.4841803687095167, + "step": 14478 + }, + { + "epoch": 2.6842788283277716, + "grad_norm": 7.09375, + "learning_rate": 7.315721171672229e-06, + "loss": 2.1008, + "mean_token_accuracy": 0.5543035243346919, + "step": 14479 + }, + { + "epoch": 2.684464219503152, + "grad_norm": 7.12109375, + "learning_rate": 7.315535780496849e-06, + "loss": 3.3076, + "mean_token_accuracy": 0.4471839214769806, + "step": 14480 + }, + { + "epoch": 2.6846496106785316, + "grad_norm": 7.18359375, + "learning_rate": 7.315350389321468e-06, + "loss": 2.6224, + "mean_token_accuracy": 0.5089998646636893, + "step": 14481 + }, + { + "epoch": 2.6848350018539118, + "grad_norm": 7.921875, + "learning_rate": 7.31516499814609e-06, + "loss": 3.2929, + "mean_token_accuracy": 0.4492753623188406, + "step": 14482 + }, + { + "epoch": 2.685020393029292, + "grad_norm": 8.6796875, + "learning_rate": 7.314979606970709e-06, + "loss": 2.7632, + "mean_token_accuracy": 0.4961704497980783, + "step": 14483 + }, + { + "epoch": 2.6852057842046717, + "grad_norm": 7.45703125, + "learning_rate": 7.314794215795329e-06, + "loss": 3.1422, + "mean_token_accuracy": 0.46682431492558074, + "step": 14484 + }, + { + "epoch": 2.685391175380052, + "grad_norm": 7.31640625, + "learning_rate": 7.3146088246199486e-06, + "loss": 2.8186, + "mean_token_accuracy": 0.4923326452373931, + "step": 14485 + }, + { + "epoch": 2.685576566555432, + "grad_norm": 7.66015625, + "learning_rate": 7.314423433444568e-06, + "loss": 2.5064, + "mean_token_accuracy": 0.5172867096095701, + "step": 14486 + }, + { + "epoch": 2.6857619577308123, + "grad_norm": 10.5390625, + "learning_rate": 7.314238042269189e-06, + "loss": 3.4663, + "mean_token_accuracy": 0.46333514394350894, + "step": 14487 + }, + { + "epoch": 2.685947348906192, + "grad_norm": 6.11328125, + "learning_rate": 7.314052651093808e-06, + "loss": 2.6942, + "mean_token_accuracy": 0.4924263674614306, + "step": 14488 + }, + { + "epoch": 2.686132740081572, + "grad_norm": 6.625, + "learning_rate": 7.313867259918428e-06, + "loss": 2.9867, + "mean_token_accuracy": 0.4907253599114064, + "step": 14489 + }, + { + "epoch": 2.686318131256952, + "grad_norm": 8.671875, + "learning_rate": 7.3136818687430485e-06, + "loss": 3.0863, + "mean_token_accuracy": 0.47035148913335123, + "step": 14490 + }, + { + "epoch": 2.686503522432332, + "grad_norm": 7.265625, + "learning_rate": 7.313496477567669e-06, + "loss": 2.8924, + "mean_token_accuracy": 0.4836174339083393, + "step": 14491 + }, + { + "epoch": 2.6866889136077123, + "grad_norm": 7.71875, + "learning_rate": 7.313311086392289e-06, + "loss": 3.2416, + "mean_token_accuracy": 0.45787965616045845, + "step": 14492 + }, + { + "epoch": 2.6868743047830925, + "grad_norm": 7.37109375, + "learning_rate": 7.313125695216908e-06, + "loss": 3.0507, + "mean_token_accuracy": 0.4570300637716368, + "step": 14493 + }, + { + "epoch": 2.6870596959584723, + "grad_norm": 7.71875, + "learning_rate": 7.312940304041528e-06, + "loss": 3.0931, + "mean_token_accuracy": 0.4741368565374548, + "step": 14494 + }, + { + "epoch": 2.6872450871338525, + "grad_norm": 6.6875, + "learning_rate": 7.3127549128661475e-06, + "loss": 3.0696, + "mean_token_accuracy": 0.46319873893537045, + "step": 14495 + }, + { + "epoch": 2.687430478309232, + "grad_norm": 6.546875, + "learning_rate": 7.312569521690768e-06, + "loss": 2.8635, + "mean_token_accuracy": 0.4871698113207547, + "step": 14496 + }, + { + "epoch": 2.6876158694846124, + "grad_norm": 7.1640625, + "learning_rate": 7.312384130515388e-06, + "loss": 2.0691, + "mean_token_accuracy": 0.5727239421139403, + "step": 14497 + }, + { + "epoch": 2.6878012606599926, + "grad_norm": 7.8515625, + "learning_rate": 7.312198739340008e-06, + "loss": 3.037, + "mean_token_accuracy": 0.48949839387200395, + "step": 14498 + }, + { + "epoch": 2.6879866518353728, + "grad_norm": 7.17578125, + "learning_rate": 7.312013348164629e-06, + "loss": 3.1276, + "mean_token_accuracy": 0.4701355807988274, + "step": 14499 + }, + { + "epoch": 2.688172043010753, + "grad_norm": 7.06640625, + "learning_rate": 7.311827956989248e-06, + "loss": 2.86, + "mean_token_accuracy": 0.4744139508290452, + "step": 14500 + }, + { + "epoch": 2.6883574341861327, + "grad_norm": 8.234375, + "learning_rate": 7.311642565813868e-06, + "loss": 2.8777, + "mean_token_accuracy": 0.4667176740627391, + "step": 14501 + }, + { + "epoch": 2.688542825361513, + "grad_norm": 8.2734375, + "learning_rate": 7.311457174638488e-06, + "loss": 2.6723, + "mean_token_accuracy": 0.4946164199192463, + "step": 14502 + }, + { + "epoch": 2.6887282165368926, + "grad_norm": 7.23828125, + "learning_rate": 7.311271783463107e-06, + "loss": 3.2436, + "mean_token_accuracy": 0.4434731071269818, + "step": 14503 + }, + { + "epoch": 2.688913607712273, + "grad_norm": 7.359375, + "learning_rate": 7.311086392287728e-06, + "loss": 3.1714, + "mean_token_accuracy": 0.45684102035557844, + "step": 14504 + }, + { + "epoch": 2.689098998887653, + "grad_norm": 7.66015625, + "learning_rate": 7.310901001112347e-06, + "loss": 3.357, + "mean_token_accuracy": 0.4477184711230847, + "step": 14505 + }, + { + "epoch": 2.689284390063033, + "grad_norm": 7.59375, + "learning_rate": 7.310715609936968e-06, + "loss": 2.5964, + "mean_token_accuracy": 0.510383488342464, + "step": 14506 + }, + { + "epoch": 2.689469781238413, + "grad_norm": 10.796875, + "learning_rate": 7.3105302187615875e-06, + "loss": 3.3372, + "mean_token_accuracy": 0.4530110141610642, + "step": 14507 + }, + { + "epoch": 2.689655172413793, + "grad_norm": 6.90625, + "learning_rate": 7.310344827586208e-06, + "loss": 2.7863, + "mean_token_accuracy": 0.4843126827971284, + "step": 14508 + }, + { + "epoch": 2.689840563589173, + "grad_norm": 7.41015625, + "learning_rate": 7.310159436410828e-06, + "loss": 2.8269, + "mean_token_accuracy": 0.47622201616108095, + "step": 14509 + }, + { + "epoch": 2.690025954764553, + "grad_norm": 7.98828125, + "learning_rate": 7.309974045235447e-06, + "loss": 3.2778, + "mean_token_accuracy": 0.4499257736667809, + "step": 14510 + }, + { + "epoch": 2.6902113459399333, + "grad_norm": 8.1171875, + "learning_rate": 7.309788654060067e-06, + "loss": 2.885, + "mean_token_accuracy": 0.46481620405101276, + "step": 14511 + }, + { + "epoch": 2.6903967371153135, + "grad_norm": 7.06640625, + "learning_rate": 7.3096032628846866e-06, + "loss": 2.4226, + "mean_token_accuracy": 0.5253798342541437, + "step": 14512 + }, + { + "epoch": 2.690582128290693, + "grad_norm": 7.2578125, + "learning_rate": 7.309417871709307e-06, + "loss": 2.795, + "mean_token_accuracy": 0.4846288427893027, + "step": 14513 + }, + { + "epoch": 2.6907675194660734, + "grad_norm": 8.140625, + "learning_rate": 7.3092324805339276e-06, + "loss": 3.2857, + "mean_token_accuracy": 0.45914654316903736, + "step": 14514 + }, + { + "epoch": 2.6909529106414536, + "grad_norm": 13.6328125, + "learning_rate": 7.309047089358547e-06, + "loss": 3.4974, + "mean_token_accuracy": 0.4483457123565159, + "step": 14515 + }, + { + "epoch": 2.6911383018168333, + "grad_norm": 7.01171875, + "learning_rate": 7.308861698183167e-06, + "loss": 3.2945, + "mean_token_accuracy": 0.4274798927613941, + "step": 14516 + }, + { + "epoch": 2.6913236929922135, + "grad_norm": 7.453125, + "learning_rate": 7.308676307007787e-06, + "loss": 2.8742, + "mean_token_accuracy": 0.4873663751214772, + "step": 14517 + }, + { + "epoch": 2.6915090841675937, + "grad_norm": 10.0546875, + "learning_rate": 7.308490915832407e-06, + "loss": 3.6484, + "mean_token_accuracy": 0.45082944811883335, + "step": 14518 + }, + { + "epoch": 2.691694475342974, + "grad_norm": 8.0078125, + "learning_rate": 7.308305524657027e-06, + "loss": 3.3543, + "mean_token_accuracy": 0.452659311707558, + "step": 14519 + }, + { + "epoch": 2.6918798665183536, + "grad_norm": 6.97265625, + "learning_rate": 7.308120133481646e-06, + "loss": 2.6265, + "mean_token_accuracy": 0.5147899577429779, + "step": 14520 + }, + { + "epoch": 2.692065257693734, + "grad_norm": 7.48828125, + "learning_rate": 7.307934742306266e-06, + "loss": 2.569, + "mean_token_accuracy": 0.49503245462975226, + "step": 14521 + }, + { + "epoch": 2.6922506488691136, + "grad_norm": 10.3984375, + "learning_rate": 7.307749351130887e-06, + "loss": 3.2732, + "mean_token_accuracy": 0.4837240681396526, + "step": 14522 + }, + { + "epoch": 2.6924360400444938, + "grad_norm": 8.015625, + "learning_rate": 7.307563959955507e-06, + "loss": 2.3497, + "mean_token_accuracy": 0.5142639206712434, + "step": 14523 + }, + { + "epoch": 2.692621431219874, + "grad_norm": 6.87109375, + "learning_rate": 7.3073785687801265e-06, + "loss": 3.1149, + "mean_token_accuracy": 0.4634115884115884, + "step": 14524 + }, + { + "epoch": 2.692806822395254, + "grad_norm": 8.6328125, + "learning_rate": 7.307193177604747e-06, + "loss": 2.8724, + "mean_token_accuracy": 0.4808992065824273, + "step": 14525 + }, + { + "epoch": 2.692992213570634, + "grad_norm": 7.8671875, + "learning_rate": 7.307007786429367e-06, + "loss": 2.6963, + "mean_token_accuracy": 0.5126658624849216, + "step": 14526 + }, + { + "epoch": 2.693177604746014, + "grad_norm": 7.58203125, + "learning_rate": 7.306822395253986e-06, + "loss": 2.9362, + "mean_token_accuracy": 0.4609221253865617, + "step": 14527 + }, + { + "epoch": 2.6933629959213943, + "grad_norm": 6.94140625, + "learning_rate": 7.306637004078606e-06, + "loss": 2.9932, + "mean_token_accuracy": 0.45277481323372465, + "step": 14528 + }, + { + "epoch": 2.693548387096774, + "grad_norm": 8.75, + "learning_rate": 7.306451612903226e-06, + "loss": 3.0467, + "mean_token_accuracy": 0.4671849988566202, + "step": 14529 + }, + { + "epoch": 2.693733778272154, + "grad_norm": 7.38671875, + "learning_rate": 7.306266221727847e-06, + "loss": 2.0626, + "mean_token_accuracy": 0.5645017352503718, + "step": 14530 + }, + { + "epoch": 2.6939191694475344, + "grad_norm": 8.734375, + "learning_rate": 7.306080830552467e-06, + "loss": 2.828, + "mean_token_accuracy": 0.4762704104990081, + "step": 14531 + }, + { + "epoch": 2.6941045606229146, + "grad_norm": 16.609375, + "learning_rate": 7.305895439377086e-06, + "loss": 2.7623, + "mean_token_accuracy": 0.49904099736274277, + "step": 14532 + }, + { + "epoch": 2.6942899517982943, + "grad_norm": 7.7890625, + "learning_rate": 7.305710048201706e-06, + "loss": 3.1047, + "mean_token_accuracy": 0.474646623289208, + "step": 14533 + }, + { + "epoch": 2.6944753429736745, + "grad_norm": 7.98828125, + "learning_rate": 7.305524657026326e-06, + "loss": 3.0994, + "mean_token_accuracy": 0.4784189004997728, + "step": 14534 + }, + { + "epoch": 2.6946607341490543, + "grad_norm": 9.7421875, + "learning_rate": 7.305339265850946e-06, + "loss": 2.804, + "mean_token_accuracy": 0.4751961883408072, + "step": 14535 + }, + { + "epoch": 2.6948461253244345, + "grad_norm": 10.578125, + "learning_rate": 7.305153874675566e-06, + "loss": 2.4495, + "mean_token_accuracy": 0.543204252088079, + "step": 14536 + }, + { + "epoch": 2.6950315164998146, + "grad_norm": 6.98828125, + "learning_rate": 7.304968483500185e-06, + "loss": 3.0707, + "mean_token_accuracy": 0.4661075367647059, + "step": 14537 + }, + { + "epoch": 2.695216907675195, + "grad_norm": 8.0546875, + "learning_rate": 7.304783092324807e-06, + "loss": 3.2969, + "mean_token_accuracy": 0.4661928193912885, + "step": 14538 + }, + { + "epoch": 2.6954022988505746, + "grad_norm": 7.1171875, + "learning_rate": 7.304597701149426e-06, + "loss": 3.2735, + "mean_token_accuracy": 0.4479295809571039, + "step": 14539 + }, + { + "epoch": 2.6955876900259548, + "grad_norm": 7.3125, + "learning_rate": 7.304412309974046e-06, + "loss": 2.9544, + "mean_token_accuracy": 0.473592317765168, + "step": 14540 + }, + { + "epoch": 2.695773081201335, + "grad_norm": 11.03125, + "learning_rate": 7.3042269187986656e-06, + "loss": 1.9727, + "mean_token_accuracy": 0.5680222841225627, + "step": 14541 + }, + { + "epoch": 2.6959584723767147, + "grad_norm": 7.25, + "learning_rate": 7.304041527623286e-06, + "loss": 3.0139, + "mean_token_accuracy": 0.47965353815983003, + "step": 14542 + }, + { + "epoch": 2.696143863552095, + "grad_norm": 7.3828125, + "learning_rate": 7.303856136447906e-06, + "loss": 3.1985, + "mean_token_accuracy": 0.4551226551226551, + "step": 14543 + }, + { + "epoch": 2.696329254727475, + "grad_norm": 7.9375, + "learning_rate": 7.303670745272525e-06, + "loss": 3.5007, + "mean_token_accuracy": 0.44407412345020664, + "step": 14544 + }, + { + "epoch": 2.6965146459028553, + "grad_norm": 8.953125, + "learning_rate": 7.303485354097145e-06, + "loss": 2.8548, + "mean_token_accuracy": 0.4833788016570678, + "step": 14545 + }, + { + "epoch": 2.696700037078235, + "grad_norm": 14.1640625, + "learning_rate": 7.303299962921766e-06, + "loss": 2.5259, + "mean_token_accuracy": 0.4767274472168906, + "step": 14546 + }, + { + "epoch": 2.696885428253615, + "grad_norm": 8.8046875, + "learning_rate": 7.303114571746386e-06, + "loss": 2.7058, + "mean_token_accuracy": 0.48175182481751827, + "step": 14547 + }, + { + "epoch": 2.697070819428995, + "grad_norm": 7.21484375, + "learning_rate": 7.302929180571006e-06, + "loss": 3.2638, + "mean_token_accuracy": 0.4495754506182035, + "step": 14548 + }, + { + "epoch": 2.697256210604375, + "grad_norm": 9.109375, + "learning_rate": 7.302743789395625e-06, + "loss": 3.5182, + "mean_token_accuracy": 0.4602321319486866, + "step": 14549 + }, + { + "epoch": 2.6974416017797553, + "grad_norm": 12.609375, + "learning_rate": 7.302558398220245e-06, + "loss": 2.6387, + "mean_token_accuracy": 0.4958382877526754, + "step": 14550 + }, + { + "epoch": 2.6976269929551355, + "grad_norm": 7.7109375, + "learning_rate": 7.302373007044865e-06, + "loss": 3.6533, + "mean_token_accuracy": 0.44113778362380796, + "step": 14551 + }, + { + "epoch": 2.6978123841305153, + "grad_norm": 8.25, + "learning_rate": 7.302187615869485e-06, + "loss": 2.5149, + "mean_token_accuracy": 0.5139035550862372, + "step": 14552 + }, + { + "epoch": 2.6979977753058955, + "grad_norm": 8.921875, + "learning_rate": 7.302002224694105e-06, + "loss": 2.8418, + "mean_token_accuracy": 0.4843209591883791, + "step": 14553 + }, + { + "epoch": 2.6981831664812757, + "grad_norm": 7.828125, + "learning_rate": 7.301816833518726e-06, + "loss": 3.0839, + "mean_token_accuracy": 0.4679184425757401, + "step": 14554 + }, + { + "epoch": 2.6983685576566554, + "grad_norm": 10.2734375, + "learning_rate": 7.301631442343346e-06, + "loss": 2.8046, + "mean_token_accuracy": 0.47708489857250186, + "step": 14555 + }, + { + "epoch": 2.6985539488320356, + "grad_norm": 7.578125, + "learning_rate": 7.301446051167965e-06, + "loss": 2.7482, + "mean_token_accuracy": 0.4867027535890798, + "step": 14556 + }, + { + "epoch": 2.6987393400074158, + "grad_norm": 7.76171875, + "learning_rate": 7.301260659992585e-06, + "loss": 2.604, + "mean_token_accuracy": 0.49788484136310224, + "step": 14557 + }, + { + "epoch": 2.698924731182796, + "grad_norm": 6.88671875, + "learning_rate": 7.301075268817205e-06, + "loss": 3.3564, + "mean_token_accuracy": 0.45613128311151313, + "step": 14558 + }, + { + "epoch": 2.6991101223581757, + "grad_norm": 7.62109375, + "learning_rate": 7.300889877641824e-06, + "loss": 2.4932, + "mean_token_accuracy": 0.5212969489477394, + "step": 14559 + }, + { + "epoch": 2.699295513533556, + "grad_norm": 8.7421875, + "learning_rate": 7.300704486466445e-06, + "loss": 2.261, + "mean_token_accuracy": 0.5565659528698543, + "step": 14560 + }, + { + "epoch": 2.6994809047089356, + "grad_norm": 8.953125, + "learning_rate": 7.300519095291064e-06, + "loss": 2.8922, + "mean_token_accuracy": 0.483424047501237, + "step": 14561 + }, + { + "epoch": 2.699666295884316, + "grad_norm": 7.55078125, + "learning_rate": 7.300333704115685e-06, + "loss": 3.364, + "mean_token_accuracy": 0.4601726263871763, + "step": 14562 + }, + { + "epoch": 2.699851687059696, + "grad_norm": 7.0625, + "learning_rate": 7.300148312940305e-06, + "loss": 2.76, + "mean_token_accuracy": 0.49669635162309683, + "step": 14563 + }, + { + "epoch": 2.700037078235076, + "grad_norm": 9.3828125, + "learning_rate": 7.299962921764925e-06, + "loss": 2.578, + "mean_token_accuracy": 0.49458353394318727, + "step": 14564 + }, + { + "epoch": 2.700222469410456, + "grad_norm": 11.15625, + "learning_rate": 7.299777530589545e-06, + "loss": 3.2782, + "mean_token_accuracy": 0.4922945205479452, + "step": 14565 + }, + { + "epoch": 2.700407860585836, + "grad_norm": 6.73046875, + "learning_rate": 7.299592139414164e-06, + "loss": 2.3026, + "mean_token_accuracy": 0.5326549210206561, + "step": 14566 + }, + { + "epoch": 2.700593251761216, + "grad_norm": 10.2578125, + "learning_rate": 7.299406748238784e-06, + "loss": 2.8587, + "mean_token_accuracy": 0.514873417721519, + "step": 14567 + }, + { + "epoch": 2.700778642936596, + "grad_norm": 10.4765625, + "learning_rate": 7.2992213570634044e-06, + "loss": 2.652, + "mean_token_accuracy": 0.4986449864498645, + "step": 14568 + }, + { + "epoch": 2.7009640341119763, + "grad_norm": 9.453125, + "learning_rate": 7.299035965888024e-06, + "loss": 2.9433, + "mean_token_accuracy": 0.4885746929448729, + "step": 14569 + }, + { + "epoch": 2.7011494252873565, + "grad_norm": 11.53125, + "learning_rate": 7.298850574712645e-06, + "loss": 3.2667, + "mean_token_accuracy": 0.4507815800591466, + "step": 14570 + }, + { + "epoch": 2.701334816462736, + "grad_norm": 10.0546875, + "learning_rate": 7.298665183537264e-06, + "loss": 2.7379, + "mean_token_accuracy": 0.500070751379652, + "step": 14571 + }, + { + "epoch": 2.7015202076381164, + "grad_norm": 7.91796875, + "learning_rate": 7.298479792361885e-06, + "loss": 2.6368, + "mean_token_accuracy": 0.4911937377690802, + "step": 14572 + }, + { + "epoch": 2.7017055988134966, + "grad_norm": 10.8515625, + "learning_rate": 7.298294401186504e-06, + "loss": 3.1396, + "mean_token_accuracy": 0.4870702179176755, + "step": 14573 + }, + { + "epoch": 2.7018909899888763, + "grad_norm": 7.828125, + "learning_rate": 7.298109010011124e-06, + "loss": 2.6299, + "mean_token_accuracy": 0.4980030721966206, + "step": 14574 + }, + { + "epoch": 2.7020763811642565, + "grad_norm": 7.9609375, + "learning_rate": 7.297923618835744e-06, + "loss": 3.8773, + "mean_token_accuracy": 0.42565633943013564, + "step": 14575 + }, + { + "epoch": 2.7022617723396367, + "grad_norm": 10.546875, + "learning_rate": 7.297738227660363e-06, + "loss": 2.6699, + "mean_token_accuracy": 0.49074930619796486, + "step": 14576 + }, + { + "epoch": 2.702447163515017, + "grad_norm": 8.4765625, + "learning_rate": 7.297552836484984e-06, + "loss": 3.2974, + "mean_token_accuracy": 0.44717683222779486, + "step": 14577 + }, + { + "epoch": 2.7026325546903966, + "grad_norm": 8.484375, + "learning_rate": 7.297367445309604e-06, + "loss": 3.5365, + "mean_token_accuracy": 0.4262587256794891, + "step": 14578 + }, + { + "epoch": 2.702817945865777, + "grad_norm": 9.71875, + "learning_rate": 7.297182054134224e-06, + "loss": 3.2692, + "mean_token_accuracy": 0.45686705767350927, + "step": 14579 + }, + { + "epoch": 2.7030033370411566, + "grad_norm": 9.7890625, + "learning_rate": 7.296996662958844e-06, + "loss": 2.9154, + "mean_token_accuracy": 0.48645320197044334, + "step": 14580 + }, + { + "epoch": 2.7031887282165368, + "grad_norm": 10.6015625, + "learning_rate": 7.296811271783464e-06, + "loss": 2.7109, + "mean_token_accuracy": 0.5182587986239746, + "step": 14581 + }, + { + "epoch": 2.703374119391917, + "grad_norm": 7.90234375, + "learning_rate": 7.296625880608084e-06, + "loss": 3.5476, + "mean_token_accuracy": 0.4400372222868442, + "step": 14582 + }, + { + "epoch": 2.703559510567297, + "grad_norm": 10.53125, + "learning_rate": 7.296440489432703e-06, + "loss": 2.7235, + "mean_token_accuracy": 0.504010923365762, + "step": 14583 + }, + { + "epoch": 2.703744901742677, + "grad_norm": 10.1796875, + "learning_rate": 7.296255098257323e-06, + "loss": 2.37, + "mean_token_accuracy": 0.5096791661026127, + "step": 14584 + }, + { + "epoch": 2.703930292918057, + "grad_norm": 7.33203125, + "learning_rate": 7.2960697070819435e-06, + "loss": 3.3059, + "mean_token_accuracy": 0.45863488009081876, + "step": 14585 + }, + { + "epoch": 2.7041156840934373, + "grad_norm": 7.83203125, + "learning_rate": 7.295884315906564e-06, + "loss": 2.7381, + "mean_token_accuracy": 0.47505981962083565, + "step": 14586 + }, + { + "epoch": 2.704301075268817, + "grad_norm": 9.3984375, + "learning_rate": 7.295698924731184e-06, + "loss": 2.8303, + "mean_token_accuracy": 0.4859629421673217, + "step": 14587 + }, + { + "epoch": 2.704486466444197, + "grad_norm": 8.15625, + "learning_rate": 7.295513533555803e-06, + "loss": 3.9349, + "mean_token_accuracy": 0.4351234809878479, + "step": 14588 + }, + { + "epoch": 2.7046718576195774, + "grad_norm": 11.1875, + "learning_rate": 7.295328142380424e-06, + "loss": 2.5918, + "mean_token_accuracy": 0.4888063776345965, + "step": 14589 + }, + { + "epoch": 2.7048572487949576, + "grad_norm": 9.3203125, + "learning_rate": 7.295142751205043e-06, + "loss": 3.1253, + "mean_token_accuracy": 0.46685800191335247, + "step": 14590 + }, + { + "epoch": 2.7050426399703373, + "grad_norm": 6.56640625, + "learning_rate": 7.294957360029663e-06, + "loss": 2.4609, + "mean_token_accuracy": 0.5107930015905476, + "step": 14591 + }, + { + "epoch": 2.7052280311457175, + "grad_norm": 7.5078125, + "learning_rate": 7.294771968854283e-06, + "loss": 3.0263, + "mean_token_accuracy": 0.4997263273125342, + "step": 14592 + }, + { + "epoch": 2.7054134223210973, + "grad_norm": 8.875, + "learning_rate": 7.294586577678902e-06, + "loss": 3.4129, + "mean_token_accuracy": 0.44300961918566345, + "step": 14593 + }, + { + "epoch": 2.7055988134964775, + "grad_norm": 7.1015625, + "learning_rate": 7.294401186503524e-06, + "loss": 2.6876, + "mean_token_accuracy": 0.5092054263565892, + "step": 14594 + }, + { + "epoch": 2.7057842046718577, + "grad_norm": 9.578125, + "learning_rate": 7.294215795328143e-06, + "loss": 2.9596, + "mean_token_accuracy": 0.4739084132055378, + "step": 14595 + }, + { + "epoch": 2.705969595847238, + "grad_norm": 9.484375, + "learning_rate": 7.294030404152763e-06, + "loss": 2.5546, + "mean_token_accuracy": 0.5499793757734085, + "step": 14596 + }, + { + "epoch": 2.7061549870226176, + "grad_norm": 9.609375, + "learning_rate": 7.293845012977383e-06, + "loss": 3.0804, + "mean_token_accuracy": 0.4592448098091124, + "step": 14597 + }, + { + "epoch": 2.7063403781979978, + "grad_norm": 8.34375, + "learning_rate": 7.293659621802003e-06, + "loss": 3.1929, + "mean_token_accuracy": 0.46561021239132827, + "step": 14598 + }, + { + "epoch": 2.706525769373378, + "grad_norm": 8.8671875, + "learning_rate": 7.293474230626623e-06, + "loss": 2.5953, + "mean_token_accuracy": 0.4842642939528753, + "step": 14599 + }, + { + "epoch": 2.7067111605487577, + "grad_norm": 10.7265625, + "learning_rate": 7.293288839451242e-06, + "loss": 3.7388, + "mean_token_accuracy": 0.4416994706121895, + "step": 14600 + }, + { + "epoch": 2.706896551724138, + "grad_norm": 7.4921875, + "learning_rate": 7.293103448275862e-06, + "loss": 2.7389, + "mean_token_accuracy": 0.48456397199236156, + "step": 14601 + }, + { + "epoch": 2.707081942899518, + "grad_norm": 8.2578125, + "learning_rate": 7.292918057100482e-06, + "loss": 2.6578, + "mean_token_accuracy": 0.4995408631772268, + "step": 14602 + }, + { + "epoch": 2.7072673340748983, + "grad_norm": 7.23046875, + "learning_rate": 7.292732665925103e-06, + "loss": 2.919, + "mean_token_accuracy": 0.4654965135849964, + "step": 14603 + }, + { + "epoch": 2.707452725250278, + "grad_norm": 9.3125, + "learning_rate": 7.292547274749723e-06, + "loss": 3.2368, + "mean_token_accuracy": 0.45878524945770066, + "step": 14604 + }, + { + "epoch": 2.707638116425658, + "grad_norm": 7.4921875, + "learning_rate": 7.292361883574342e-06, + "loss": 2.9561, + "mean_token_accuracy": 0.4618917576961271, + "step": 14605 + }, + { + "epoch": 2.707823507601038, + "grad_norm": 7.1796875, + "learning_rate": 7.292176492398963e-06, + "loss": 2.7712, + "mean_token_accuracy": 0.49917970031718256, + "step": 14606 + }, + { + "epoch": 2.708008898776418, + "grad_norm": 7.4765625, + "learning_rate": 7.2919911012235824e-06, + "loss": 3.0656, + "mean_token_accuracy": 0.470503300330033, + "step": 14607 + }, + { + "epoch": 2.7081942899517983, + "grad_norm": 6.85546875, + "learning_rate": 7.291805710048202e-06, + "loss": 3.1917, + "mean_token_accuracy": 0.45124674358020095, + "step": 14608 + }, + { + "epoch": 2.7083796811271785, + "grad_norm": 7.39453125, + "learning_rate": 7.291620318872822e-06, + "loss": 2.8625, + "mean_token_accuracy": 0.4962579942849367, + "step": 14609 + }, + { + "epoch": 2.7085650723025583, + "grad_norm": 7.53515625, + "learning_rate": 7.291434927697441e-06, + "loss": 2.8933, + "mean_token_accuracy": 0.4896751101321586, + "step": 14610 + }, + { + "epoch": 2.7087504634779385, + "grad_norm": 7.6171875, + "learning_rate": 7.291249536522063e-06, + "loss": 2.9236, + "mean_token_accuracy": 0.4631633178773828, + "step": 14611 + }, + { + "epoch": 2.7089358546533187, + "grad_norm": 6.25390625, + "learning_rate": 7.291064145346682e-06, + "loss": 3.0096, + "mean_token_accuracy": 0.46203534430225474, + "step": 14612 + }, + { + "epoch": 2.7091212458286984, + "grad_norm": 7.09765625, + "learning_rate": 7.290878754171302e-06, + "loss": 2.9686, + "mean_token_accuracy": 0.4742857142857143, + "step": 14613 + }, + { + "epoch": 2.7093066370040786, + "grad_norm": 8.34375, + "learning_rate": 7.290693362995922e-06, + "loss": 2.8423, + "mean_token_accuracy": 0.4909592822636301, + "step": 14614 + }, + { + "epoch": 2.709492028179459, + "grad_norm": 10.1328125, + "learning_rate": 7.290507971820542e-06, + "loss": 2.7285, + "mean_token_accuracy": 0.52606043803715, + "step": 14615 + }, + { + "epoch": 2.709677419354839, + "grad_norm": 7.13671875, + "learning_rate": 7.290322580645162e-06, + "loss": 2.7659, + "mean_token_accuracy": 0.4726767520839765, + "step": 14616 + }, + { + "epoch": 2.7098628105302187, + "grad_norm": 6.71875, + "learning_rate": 7.290137189469781e-06, + "loss": 2.8133, + "mean_token_accuracy": 0.4822954822954823, + "step": 14617 + }, + { + "epoch": 2.710048201705599, + "grad_norm": 7.66796875, + "learning_rate": 7.289951798294401e-06, + "loss": 2.1624, + "mean_token_accuracy": 0.5574008171872941, + "step": 14618 + }, + { + "epoch": 2.7102335928809786, + "grad_norm": 7.5390625, + "learning_rate": 7.289766407119022e-06, + "loss": 3.0709, + "mean_token_accuracy": 0.43725652225239053, + "step": 14619 + }, + { + "epoch": 2.710418984056359, + "grad_norm": 7.75, + "learning_rate": 7.289581015943642e-06, + "loss": 2.9743, + "mean_token_accuracy": 0.45851306774930223, + "step": 14620 + }, + { + "epoch": 2.710604375231739, + "grad_norm": 10.2578125, + "learning_rate": 7.289395624768262e-06, + "loss": 2.6218, + "mean_token_accuracy": 0.4846275752773376, + "step": 14621 + }, + { + "epoch": 2.710789766407119, + "grad_norm": 8.296875, + "learning_rate": 7.289210233592881e-06, + "loss": 2.8298, + "mean_token_accuracy": 0.45516707521610117, + "step": 14622 + }, + { + "epoch": 2.710975157582499, + "grad_norm": 7.84375, + "learning_rate": 7.289024842417502e-06, + "loss": 2.5224, + "mean_token_accuracy": 0.5372015226669743, + "step": 14623 + }, + { + "epoch": 2.711160548757879, + "grad_norm": 6.62109375, + "learning_rate": 7.2888394512421215e-06, + "loss": 2.8167, + "mean_token_accuracy": 0.4714076246334311, + "step": 14624 + }, + { + "epoch": 2.7113459399332593, + "grad_norm": 7.65234375, + "learning_rate": 7.288654060066741e-06, + "loss": 2.8195, + "mean_token_accuracy": 0.5288387025351489, + "step": 14625 + }, + { + "epoch": 2.711531331108639, + "grad_norm": 9.3359375, + "learning_rate": 7.288468668891361e-06, + "loss": 2.7815, + "mean_token_accuracy": 0.49747715805263876, + "step": 14626 + }, + { + "epoch": 2.7117167222840193, + "grad_norm": 7.67578125, + "learning_rate": 7.288283277715982e-06, + "loss": 2.5575, + "mean_token_accuracy": 0.5338971674050208, + "step": 14627 + }, + { + "epoch": 2.7119021134593995, + "grad_norm": 7.13671875, + "learning_rate": 7.288097886540602e-06, + "loss": 2.3913, + "mean_token_accuracy": 0.5031122349737405, + "step": 14628 + }, + { + "epoch": 2.7120875046347797, + "grad_norm": 7.30859375, + "learning_rate": 7.287912495365221e-06, + "loss": 2.8039, + "mean_token_accuracy": 0.48686197523406827, + "step": 14629 + }, + { + "epoch": 2.7122728958101594, + "grad_norm": 7.85546875, + "learning_rate": 7.287727104189841e-06, + "loss": 2.1497, + "mean_token_accuracy": 0.574089874857793, + "step": 14630 + }, + { + "epoch": 2.7124582869855396, + "grad_norm": 7.546875, + "learning_rate": 7.287541713014461e-06, + "loss": 3.4947, + "mean_token_accuracy": 0.44298444343754717, + "step": 14631 + }, + { + "epoch": 2.7126436781609193, + "grad_norm": 7.51171875, + "learning_rate": 7.287356321839081e-06, + "loss": 2.5436, + "mean_token_accuracy": 0.5137345867415456, + "step": 14632 + }, + { + "epoch": 2.7128290693362995, + "grad_norm": 7.171875, + "learning_rate": 7.287170930663701e-06, + "loss": 2.6861, + "mean_token_accuracy": 0.5251538183134274, + "step": 14633 + }, + { + "epoch": 2.7130144605116797, + "grad_norm": 7.25390625, + "learning_rate": 7.2869855394883204e-06, + "loss": 3.1023, + "mean_token_accuracy": 0.45741056218057924, + "step": 14634 + }, + { + "epoch": 2.71319985168706, + "grad_norm": 6.5078125, + "learning_rate": 7.286800148312941e-06, + "loss": 2.5777, + "mean_token_accuracy": 0.4917541229385307, + "step": 14635 + }, + { + "epoch": 2.7133852428624397, + "grad_norm": 6.5, + "learning_rate": 7.2866147571375614e-06, + "loss": 2.8021, + "mean_token_accuracy": 0.48037331869338457, + "step": 14636 + }, + { + "epoch": 2.71357063403782, + "grad_norm": 9.1328125, + "learning_rate": 7.286429365962181e-06, + "loss": 3.5122, + "mean_token_accuracy": 0.4454772160507821, + "step": 14637 + }, + { + "epoch": 2.7137560252131996, + "grad_norm": 7.6171875, + "learning_rate": 7.286243974786801e-06, + "loss": 3.0706, + "mean_token_accuracy": 0.4664378860672615, + "step": 14638 + }, + { + "epoch": 2.7139414163885798, + "grad_norm": 7.0078125, + "learning_rate": 7.28605858361142e-06, + "loss": 2.9132, + "mean_token_accuracy": 0.4592250400678797, + "step": 14639 + }, + { + "epoch": 2.71412680756396, + "grad_norm": 7.40625, + "learning_rate": 7.28587319243604e-06, + "loss": 2.7205, + "mean_token_accuracy": 0.480814408770556, + "step": 14640 + }, + { + "epoch": 2.71431219873934, + "grad_norm": 10.3125, + "learning_rate": 7.2856878012606605e-06, + "loss": 2.9284, + "mean_token_accuracy": 0.5020358306188925, + "step": 14641 + }, + { + "epoch": 2.71449758991472, + "grad_norm": 8.8046875, + "learning_rate": 7.28550241008528e-06, + "loss": 2.8961, + "mean_token_accuracy": 0.4637138429752066, + "step": 14642 + }, + { + "epoch": 2.7146829810901, + "grad_norm": 7.33203125, + "learning_rate": 7.285317018909901e-06, + "loss": 2.858, + "mean_token_accuracy": 0.4799503927242662, + "step": 14643 + }, + { + "epoch": 2.7148683722654803, + "grad_norm": 9.5234375, + "learning_rate": 7.285131627734521e-06, + "loss": 3.4392, + "mean_token_accuracy": 0.4286858974358974, + "step": 14644 + }, + { + "epoch": 2.71505376344086, + "grad_norm": 14.4609375, + "learning_rate": 7.284946236559141e-06, + "loss": 3.4729, + "mean_token_accuracy": 0.44673003262874167, + "step": 14645 + }, + { + "epoch": 2.71523915461624, + "grad_norm": 7.703125, + "learning_rate": 7.28476084538376e-06, + "loss": 3.2568, + "mean_token_accuracy": 0.4333284435968901, + "step": 14646 + }, + { + "epoch": 2.7154245457916204, + "grad_norm": 9.1796875, + "learning_rate": 7.28457545420838e-06, + "loss": 2.7446, + "mean_token_accuracy": 0.498164733519307, + "step": 14647 + }, + { + "epoch": 2.7156099369670006, + "grad_norm": 8.15625, + "learning_rate": 7.284390063033e-06, + "loss": 2.4834, + "mean_token_accuracy": 0.5175438596491229, + "step": 14648 + }, + { + "epoch": 2.7157953281423803, + "grad_norm": 7.81640625, + "learning_rate": 7.28420467185762e-06, + "loss": 2.7912, + "mean_token_accuracy": 0.4874969355234126, + "step": 14649 + }, + { + "epoch": 2.7159807193177605, + "grad_norm": 6.80859375, + "learning_rate": 7.28401928068224e-06, + "loss": 2.8824, + "mean_token_accuracy": 0.4808070221407048, + "step": 14650 + }, + { + "epoch": 2.7161661104931403, + "grad_norm": 9.671875, + "learning_rate": 7.28383388950686e-06, + "loss": 2.4154, + "mean_token_accuracy": 0.5250737463126843, + "step": 14651 + }, + { + "epoch": 2.7163515016685205, + "grad_norm": 7.59375, + "learning_rate": 7.28364849833148e-06, + "loss": 3.0658, + "mean_token_accuracy": 0.4571004085675375, + "step": 14652 + }, + { + "epoch": 2.7165368928439007, + "grad_norm": 7.203125, + "learning_rate": 7.2834631071561005e-06, + "loss": 3.3442, + "mean_token_accuracy": 0.41854838709677417, + "step": 14653 + }, + { + "epoch": 2.716722284019281, + "grad_norm": 7.70703125, + "learning_rate": 7.28327771598072e-06, + "loss": 3.0118, + "mean_token_accuracy": 0.4775469585769374, + "step": 14654 + }, + { + "epoch": 2.7169076751946606, + "grad_norm": 12.0390625, + "learning_rate": 7.28309232480534e-06, + "loss": 3.2518, + "mean_token_accuracy": 0.49902912621359224, + "step": 14655 + }, + { + "epoch": 2.7170930663700408, + "grad_norm": 8.625, + "learning_rate": 7.282906933629959e-06, + "loss": 2.8118, + "mean_token_accuracy": 0.4954706577392674, + "step": 14656 + }, + { + "epoch": 2.717278457545421, + "grad_norm": 7.1171875, + "learning_rate": 7.282721542454579e-06, + "loss": 2.8463, + "mean_token_accuracy": 0.48488960157888245, + "step": 14657 + }, + { + "epoch": 2.7174638487208007, + "grad_norm": 7.8984375, + "learning_rate": 7.2825361512791995e-06, + "loss": 2.8991, + "mean_token_accuracy": 0.4812283100220843, + "step": 14658 + }, + { + "epoch": 2.717649239896181, + "grad_norm": 8.6328125, + "learning_rate": 7.28235076010382e-06, + "loss": 2.5967, + "mean_token_accuracy": 0.5195960567444097, + "step": 14659 + }, + { + "epoch": 2.717834631071561, + "grad_norm": 6.78125, + "learning_rate": 7.28216536892844e-06, + "loss": 2.8318, + "mean_token_accuracy": 0.47472289808056234, + "step": 14660 + }, + { + "epoch": 2.7180200222469413, + "grad_norm": 8.4296875, + "learning_rate": 7.28197997775306e-06, + "loss": 2.8769, + "mean_token_accuracy": 0.5005065856129686, + "step": 14661 + }, + { + "epoch": 2.718205413422321, + "grad_norm": 8.9375, + "learning_rate": 7.28179458657768e-06, + "loss": 2.5784, + "mean_token_accuracy": 0.5031138200609514, + "step": 14662 + }, + { + "epoch": 2.718390804597701, + "grad_norm": 7.640625, + "learning_rate": 7.2816091954022994e-06, + "loss": 2.543, + "mean_token_accuracy": 0.5456287935257161, + "step": 14663 + }, + { + "epoch": 2.718576195773081, + "grad_norm": 7.9140625, + "learning_rate": 7.281423804226919e-06, + "loss": 2.6385, + "mean_token_accuracy": 0.5221164613661814, + "step": 14664 + }, + { + "epoch": 2.718761586948461, + "grad_norm": 7.15234375, + "learning_rate": 7.281238413051539e-06, + "loss": 2.5316, + "mean_token_accuracy": 0.5045362903225806, + "step": 14665 + }, + { + "epoch": 2.7189469781238413, + "grad_norm": 8.2109375, + "learning_rate": 7.281053021876159e-06, + "loss": 2.5829, + "mean_token_accuracy": 0.508139023317202, + "step": 14666 + }, + { + "epoch": 2.7191323692992215, + "grad_norm": 8.25, + "learning_rate": 7.28086763070078e-06, + "loss": 3.2634, + "mean_token_accuracy": 0.45217391304347826, + "step": 14667 + }, + { + "epoch": 2.7193177604746013, + "grad_norm": 7.9140625, + "learning_rate": 7.280682239525399e-06, + "loss": 3.0041, + "mean_token_accuracy": 0.47413134784268807, + "step": 14668 + }, + { + "epoch": 2.7195031516499815, + "grad_norm": 8.71875, + "learning_rate": 7.280496848350019e-06, + "loss": 3.6179, + "mean_token_accuracy": 0.431859649122807, + "step": 14669 + }, + { + "epoch": 2.7196885428253617, + "grad_norm": 7.1171875, + "learning_rate": 7.2803114571746395e-06, + "loss": 3.5975, + "mean_token_accuracy": 0.4420352346926204, + "step": 14670 + }, + { + "epoch": 2.7198739340007414, + "grad_norm": 7.3125, + "learning_rate": 7.280126065999259e-06, + "loss": 2.4539, + "mean_token_accuracy": 0.5286009648518263, + "step": 14671 + }, + { + "epoch": 2.7200593251761216, + "grad_norm": 9.265625, + "learning_rate": 7.279940674823879e-06, + "loss": 3.4695, + "mean_token_accuracy": 0.45538802047309446, + "step": 14672 + }, + { + "epoch": 2.720244716351502, + "grad_norm": 6.93359375, + "learning_rate": 7.2797552836484984e-06, + "loss": 2.8769, + "mean_token_accuracy": 0.4605739760378936, + "step": 14673 + }, + { + "epoch": 2.720430107526882, + "grad_norm": 8.3515625, + "learning_rate": 7.279569892473118e-06, + "loss": 2.5538, + "mean_token_accuracy": 0.522322890514136, + "step": 14674 + }, + { + "epoch": 2.7206154987022617, + "grad_norm": 7.42578125, + "learning_rate": 7.279384501297739e-06, + "loss": 2.6921, + "mean_token_accuracy": 0.4814420803782506, + "step": 14675 + }, + { + "epoch": 2.720800889877642, + "grad_norm": 7.68359375, + "learning_rate": 7.279199110122359e-06, + "loss": 3.1958, + "mean_token_accuracy": 0.47414820109602096, + "step": 14676 + }, + { + "epoch": 2.7209862810530216, + "grad_norm": 7.32421875, + "learning_rate": 7.279013718946979e-06, + "loss": 2.8463, + "mean_token_accuracy": 0.47734487734487735, + "step": 14677 + }, + { + "epoch": 2.721171672228402, + "grad_norm": 8.9609375, + "learning_rate": 7.278828327771598e-06, + "loss": 3.6245, + "mean_token_accuracy": 0.45037121644774414, + "step": 14678 + }, + { + "epoch": 2.721357063403782, + "grad_norm": 9.6171875, + "learning_rate": 7.278642936596219e-06, + "loss": 2.5197, + "mean_token_accuracy": 0.5088744102448888, + "step": 14679 + }, + { + "epoch": 2.721542454579162, + "grad_norm": 7.4609375, + "learning_rate": 7.2784575454208385e-06, + "loss": 3.2373, + "mean_token_accuracy": 0.4440150176678445, + "step": 14680 + }, + { + "epoch": 2.721727845754542, + "grad_norm": 6.5703125, + "learning_rate": 7.278272154245458e-06, + "loss": 2.962, + "mean_token_accuracy": 0.4866598255515649, + "step": 14681 + }, + { + "epoch": 2.721913236929922, + "grad_norm": 10.7265625, + "learning_rate": 7.278086763070078e-06, + "loss": 3.6383, + "mean_token_accuracy": 0.4645633971291866, + "step": 14682 + }, + { + "epoch": 2.7220986281053023, + "grad_norm": 6.5546875, + "learning_rate": 7.277901371894699e-06, + "loss": 2.775, + "mean_token_accuracy": 0.5201890034364262, + "step": 14683 + }, + { + "epoch": 2.722284019280682, + "grad_norm": 7.4140625, + "learning_rate": 7.277715980719319e-06, + "loss": 2.7628, + "mean_token_accuracy": 0.48069586448954676, + "step": 14684 + }, + { + "epoch": 2.7224694104560623, + "grad_norm": 6.921875, + "learning_rate": 7.277530589543938e-06, + "loss": 2.2859, + "mean_token_accuracy": 0.5444877029721774, + "step": 14685 + }, + { + "epoch": 2.7226548016314425, + "grad_norm": 6.984375, + "learning_rate": 7.277345198368558e-06, + "loss": 2.2994, + "mean_token_accuracy": 0.5664910702813825, + "step": 14686 + }, + { + "epoch": 2.7228401928068227, + "grad_norm": 6.16796875, + "learning_rate": 7.2771598071931785e-06, + "loss": 2.7646, + "mean_token_accuracy": 0.4840130916414904, + "step": 14687 + }, + { + "epoch": 2.7230255839822024, + "grad_norm": 8.625, + "learning_rate": 7.276974416017798e-06, + "loss": 2.2574, + "mean_token_accuracy": 0.5229303156640858, + "step": 14688 + }, + { + "epoch": 2.7232109751575826, + "grad_norm": 8.953125, + "learning_rate": 7.276789024842418e-06, + "loss": 1.648, + "mean_token_accuracy": 0.6362763915547025, + "step": 14689 + }, + { + "epoch": 2.7233963663329623, + "grad_norm": 7.3828125, + "learning_rate": 7.2766036336670375e-06, + "loss": 2.8386, + "mean_token_accuracy": 0.4729913137893594, + "step": 14690 + }, + { + "epoch": 2.7235817575083425, + "grad_norm": 11.4296875, + "learning_rate": 7.276418242491659e-06, + "loss": 3.1823, + "mean_token_accuracy": 0.4911887215636014, + "step": 14691 + }, + { + "epoch": 2.7237671486837227, + "grad_norm": 8.421875, + "learning_rate": 7.2762328513162784e-06, + "loss": 2.8446, + "mean_token_accuracy": 0.4702199144777031, + "step": 14692 + }, + { + "epoch": 2.723952539859103, + "grad_norm": 7.98046875, + "learning_rate": 7.276047460140898e-06, + "loss": 2.9023, + "mean_token_accuracy": 0.4807436918990704, + "step": 14693 + }, + { + "epoch": 2.7241379310344827, + "grad_norm": 7.26953125, + "learning_rate": 7.275862068965518e-06, + "loss": 2.8204, + "mean_token_accuracy": 0.4970434782608696, + "step": 14694 + }, + { + "epoch": 2.724323322209863, + "grad_norm": 6.1171875, + "learning_rate": 7.275676677790137e-06, + "loss": 2.316, + "mean_token_accuracy": 0.5496489468405216, + "step": 14695 + }, + { + "epoch": 2.7245087133852426, + "grad_norm": 11.828125, + "learning_rate": 7.275491286614758e-06, + "loss": 3.4062, + "mean_token_accuracy": 0.48497221277078373, + "step": 14696 + }, + { + "epoch": 2.7246941045606228, + "grad_norm": 8.203125, + "learning_rate": 7.2753058954393775e-06, + "loss": 3.3733, + "mean_token_accuracy": 0.45524672462348476, + "step": 14697 + }, + { + "epoch": 2.724879495736003, + "grad_norm": 9.28125, + "learning_rate": 7.275120504263997e-06, + "loss": 2.7203, + "mean_token_accuracy": 0.47891963109354413, + "step": 14698 + }, + { + "epoch": 2.725064886911383, + "grad_norm": 8.4453125, + "learning_rate": 7.2749351130886185e-06, + "loss": 2.431, + "mean_token_accuracy": 0.5080235357047339, + "step": 14699 + }, + { + "epoch": 2.7252502780867633, + "grad_norm": 11.0, + "learning_rate": 7.274749721913238e-06, + "loss": 2.6897, + "mean_token_accuracy": 0.4839404822986147, + "step": 14700 + }, + { + "epoch": 2.725435669262143, + "grad_norm": 9.0390625, + "learning_rate": 7.274564330737858e-06, + "loss": 2.9027, + "mean_token_accuracy": 0.496449951273841, + "step": 14701 + }, + { + "epoch": 2.7256210604375233, + "grad_norm": 8.359375, + "learning_rate": 7.2743789395624774e-06, + "loss": 3.2311, + "mean_token_accuracy": 0.47159173051901115, + "step": 14702 + }, + { + "epoch": 2.725806451612903, + "grad_norm": 9.140625, + "learning_rate": 7.274193548387097e-06, + "loss": 3.228, + "mean_token_accuracy": 0.45765587445214195, + "step": 14703 + }, + { + "epoch": 2.725991842788283, + "grad_norm": 10.6640625, + "learning_rate": 7.2740081572117176e-06, + "loss": 2.5513, + "mean_token_accuracy": 0.5542971352431713, + "step": 14704 + }, + { + "epoch": 2.7261772339636634, + "grad_norm": 9.859375, + "learning_rate": 7.273822766036337e-06, + "loss": 3.0244, + "mean_token_accuracy": 0.45500848896434637, + "step": 14705 + }, + { + "epoch": 2.7263626251390436, + "grad_norm": 7.19921875, + "learning_rate": 7.273637374860957e-06, + "loss": 2.2374, + "mean_token_accuracy": 0.5652978600347022, + "step": 14706 + }, + { + "epoch": 2.7265480163144233, + "grad_norm": 10.734375, + "learning_rate": 7.273451983685577e-06, + "loss": 2.8502, + "mean_token_accuracy": 0.4843830665978317, + "step": 14707 + }, + { + "epoch": 2.7267334074898035, + "grad_norm": 7.54296875, + "learning_rate": 7.273266592510198e-06, + "loss": 2.7886, + "mean_token_accuracy": 0.5159301130524152, + "step": 14708 + }, + { + "epoch": 2.7269187986651833, + "grad_norm": 8.59375, + "learning_rate": 7.2730812013348175e-06, + "loss": 2.4436, + "mean_token_accuracy": 0.5071707953063885, + "step": 14709 + }, + { + "epoch": 2.7271041898405635, + "grad_norm": 9.5390625, + "learning_rate": 7.272895810159437e-06, + "loss": 2.8527, + "mean_token_accuracy": 0.4840880160029096, + "step": 14710 + }, + { + "epoch": 2.7272895810159437, + "grad_norm": 7.015625, + "learning_rate": 7.272710418984057e-06, + "loss": 3.0903, + "mean_token_accuracy": 0.4669728011303426, + "step": 14711 + }, + { + "epoch": 2.727474972191324, + "grad_norm": 9.546875, + "learning_rate": 7.272525027808676e-06, + "loss": 2.7678, + "mean_token_accuracy": 0.5144085521729026, + "step": 14712 + }, + { + "epoch": 2.7276603633667036, + "grad_norm": 7.34765625, + "learning_rate": 7.272339636633297e-06, + "loss": 2.3313, + "mean_token_accuracy": 0.5395047903446559, + "step": 14713 + }, + { + "epoch": 2.727845754542084, + "grad_norm": 7.1328125, + "learning_rate": 7.2721542454579166e-06, + "loss": 2.9809, + "mean_token_accuracy": 0.5058353144586125, + "step": 14714 + }, + { + "epoch": 2.728031145717464, + "grad_norm": 8.7890625, + "learning_rate": 7.271968854282537e-06, + "loss": 2.9453, + "mean_token_accuracy": 0.4981899887654475, + "step": 14715 + }, + { + "epoch": 2.7282165368928437, + "grad_norm": 8.578125, + "learning_rate": 7.271783463107157e-06, + "loss": 2.8812, + "mean_token_accuracy": 0.48714262897300753, + "step": 14716 + }, + { + "epoch": 2.728401928068224, + "grad_norm": 6.4375, + "learning_rate": 7.271598071931777e-06, + "loss": 2.7284, + "mean_token_accuracy": 0.48509410633328753, + "step": 14717 + }, + { + "epoch": 2.728587319243604, + "grad_norm": 7.18359375, + "learning_rate": 7.271412680756397e-06, + "loss": 2.6055, + "mean_token_accuracy": 0.5002770083102493, + "step": 14718 + }, + { + "epoch": 2.7287727104189843, + "grad_norm": 8.8046875, + "learning_rate": 7.2712272895810165e-06, + "loss": 3.0788, + "mean_token_accuracy": 0.47523519645821805, + "step": 14719 + }, + { + "epoch": 2.728958101594364, + "grad_norm": 10.140625, + "learning_rate": 7.271041898405636e-06, + "loss": 3.2674, + "mean_token_accuracy": 0.45637788862872847, + "step": 14720 + }, + { + "epoch": 2.729143492769744, + "grad_norm": 7.05859375, + "learning_rate": 7.270856507230256e-06, + "loss": 2.9036, + "mean_token_accuracy": 0.49731267783749605, + "step": 14721 + }, + { + "epoch": 2.729328883945124, + "grad_norm": 9.1875, + "learning_rate": 7.270671116054876e-06, + "loss": 3.5934, + "mean_token_accuracy": 0.4600160470714095, + "step": 14722 + }, + { + "epoch": 2.729514275120504, + "grad_norm": 7.34375, + "learning_rate": 7.270485724879496e-06, + "loss": 3.0136, + "mean_token_accuracy": 0.4784653200812226, + "step": 14723 + }, + { + "epoch": 2.7296996662958843, + "grad_norm": 7.57421875, + "learning_rate": 7.270300333704116e-06, + "loss": 3.2421, + "mean_token_accuracy": 0.4791891309036015, + "step": 14724 + }, + { + "epoch": 2.7298850574712645, + "grad_norm": 11.7265625, + "learning_rate": 7.270114942528737e-06, + "loss": 3.0774, + "mean_token_accuracy": 0.46306978848546543, + "step": 14725 + }, + { + "epoch": 2.7300704486466443, + "grad_norm": 7.1484375, + "learning_rate": 7.2699295513533565e-06, + "loss": 3.1081, + "mean_token_accuracy": 0.46954387990762125, + "step": 14726 + }, + { + "epoch": 2.7302558398220245, + "grad_norm": 8.3984375, + "learning_rate": 7.269744160177976e-06, + "loss": 2.6353, + "mean_token_accuracy": 0.5100682039623254, + "step": 14727 + }, + { + "epoch": 2.7304412309974047, + "grad_norm": 9.515625, + "learning_rate": 7.269558769002596e-06, + "loss": 2.7495, + "mean_token_accuracy": 0.5179581447963801, + "step": 14728 + }, + { + "epoch": 2.7306266221727844, + "grad_norm": 12.75, + "learning_rate": 7.2693733778272155e-06, + "loss": 2.5371, + "mean_token_accuracy": 0.509911599249933, + "step": 14729 + }, + { + "epoch": 2.7308120133481646, + "grad_norm": 6.8125, + "learning_rate": 7.269187986651836e-06, + "loss": 3.2643, + "mean_token_accuracy": 0.4514231197328669, + "step": 14730 + }, + { + "epoch": 2.730997404523545, + "grad_norm": 9.1640625, + "learning_rate": 7.269002595476456e-06, + "loss": 3.1893, + "mean_token_accuracy": 0.4823446327683616, + "step": 14731 + }, + { + "epoch": 2.731182795698925, + "grad_norm": 17.78125, + "learning_rate": 7.268817204301076e-06, + "loss": 2.7713, + "mean_token_accuracy": 0.47913500876680304, + "step": 14732 + }, + { + "epoch": 2.7313681868743047, + "grad_norm": 9.34375, + "learning_rate": 7.268631813125696e-06, + "loss": 2.3493, + "mean_token_accuracy": 0.5169420330439652, + "step": 14733 + }, + { + "epoch": 2.731553578049685, + "grad_norm": 6.8828125, + "learning_rate": 7.268446421950316e-06, + "loss": 3.2546, + "mean_token_accuracy": 0.43972257250945773, + "step": 14734 + }, + { + "epoch": 2.7317389692250647, + "grad_norm": 10.984375, + "learning_rate": 7.268261030774936e-06, + "loss": 3.12, + "mean_token_accuracy": 0.46378269617706236, + "step": 14735 + }, + { + "epoch": 2.731924360400445, + "grad_norm": 13.4375, + "learning_rate": 7.2680756395995555e-06, + "loss": 2.588, + "mean_token_accuracy": 0.5037073052145375, + "step": 14736 + }, + { + "epoch": 2.732109751575825, + "grad_norm": 8.0859375, + "learning_rate": 7.267890248424175e-06, + "loss": 2.748, + "mean_token_accuracy": 0.49677196218584274, + "step": 14737 + }, + { + "epoch": 2.732295142751205, + "grad_norm": 10.046875, + "learning_rate": 7.267704857248795e-06, + "loss": 2.7569, + "mean_token_accuracy": 0.5062130177514793, + "step": 14738 + }, + { + "epoch": 2.732480533926585, + "grad_norm": 10.9453125, + "learning_rate": 7.267519466073415e-06, + "loss": 2.6763, + "mean_token_accuracy": 0.4783178304729464, + "step": 14739 + }, + { + "epoch": 2.732665925101965, + "grad_norm": 12.203125, + "learning_rate": 7.267334074898036e-06, + "loss": 2.6665, + "mean_token_accuracy": 0.4830818109610802, + "step": 14740 + }, + { + "epoch": 2.7328513162773453, + "grad_norm": 9.03125, + "learning_rate": 7.267148683722655e-06, + "loss": 2.6469, + "mean_token_accuracy": 0.48718300205620285, + "step": 14741 + }, + { + "epoch": 2.733036707452725, + "grad_norm": 8.5, + "learning_rate": 7.266963292547276e-06, + "loss": 2.7229, + "mean_token_accuracy": 0.4663755458515284, + "step": 14742 + }, + { + "epoch": 2.7332220986281053, + "grad_norm": 9.4375, + "learning_rate": 7.2667779013718956e-06, + "loss": 3.566, + "mean_token_accuracy": 0.4352910602910603, + "step": 14743 + }, + { + "epoch": 2.7334074898034855, + "grad_norm": 8.0625, + "learning_rate": 7.266592510196515e-06, + "loss": 2.3909, + "mean_token_accuracy": 0.507703777335984, + "step": 14744 + }, + { + "epoch": 2.7335928809788657, + "grad_norm": 7.18359375, + "learning_rate": 7.266407119021135e-06, + "loss": 2.9623, + "mean_token_accuracy": 0.4589978509218414, + "step": 14745 + }, + { + "epoch": 2.7337782721542454, + "grad_norm": 6.5546875, + "learning_rate": 7.2662217278457545e-06, + "loss": 3.3152, + "mean_token_accuracy": 0.446102091560626, + "step": 14746 + }, + { + "epoch": 2.7339636633296256, + "grad_norm": 7.61328125, + "learning_rate": 7.266036336670375e-06, + "loss": 3.0274, + "mean_token_accuracy": 0.4671400903808909, + "step": 14747 + }, + { + "epoch": 2.7341490545050053, + "grad_norm": 7.25390625, + "learning_rate": 7.2658509454949955e-06, + "loss": 2.6932, + "mean_token_accuracy": 0.4874629280129415, + "step": 14748 + }, + { + "epoch": 2.7343344456803855, + "grad_norm": 8.8359375, + "learning_rate": 7.265665554319615e-06, + "loss": 2.7605, + "mean_token_accuracy": 0.48218430034129695, + "step": 14749 + }, + { + "epoch": 2.7345198368557657, + "grad_norm": 8.734375, + "learning_rate": 7.265480163144235e-06, + "loss": 3.3027, + "mean_token_accuracy": 0.44083671557914456, + "step": 14750 + }, + { + "epoch": 2.734705228031146, + "grad_norm": 8.0078125, + "learning_rate": 7.265294771968855e-06, + "loss": 3.1385, + "mean_token_accuracy": 0.4635405227322644, + "step": 14751 + }, + { + "epoch": 2.7348906192065257, + "grad_norm": 9.46875, + "learning_rate": 7.265109380793475e-06, + "loss": 2.5733, + "mean_token_accuracy": 0.4963627135848418, + "step": 14752 + }, + { + "epoch": 2.735076010381906, + "grad_norm": 9.390625, + "learning_rate": 7.2649239896180945e-06, + "loss": 2.8095, + "mean_token_accuracy": 0.48098192608578366, + "step": 14753 + }, + { + "epoch": 2.735261401557286, + "grad_norm": 8.125, + "learning_rate": 7.264738598442714e-06, + "loss": 2.5466, + "mean_token_accuracy": 0.5452485672397708, + "step": 14754 + }, + { + "epoch": 2.7354467927326658, + "grad_norm": 9.84375, + "learning_rate": 7.264553207267334e-06, + "loss": 2.694, + "mean_token_accuracy": 0.4850581073602656, + "step": 14755 + }, + { + "epoch": 2.735632183908046, + "grad_norm": 11.84375, + "learning_rate": 7.264367816091955e-06, + "loss": 2.6604, + "mean_token_accuracy": 0.515748031496063, + "step": 14756 + }, + { + "epoch": 2.735817575083426, + "grad_norm": 7.72265625, + "learning_rate": 7.264182424916575e-06, + "loss": 2.4028, + "mean_token_accuracy": 0.5078856263623541, + "step": 14757 + }, + { + "epoch": 2.7360029662588063, + "grad_norm": 8.5234375, + "learning_rate": 7.2639970337411945e-06, + "loss": 2.5033, + "mean_token_accuracy": 0.5126753313601853, + "step": 14758 + }, + { + "epoch": 2.736188357434186, + "grad_norm": 9.4609375, + "learning_rate": 7.263811642565814e-06, + "loss": 3.3074, + "mean_token_accuracy": 0.47218892351953967, + "step": 14759 + }, + { + "epoch": 2.7363737486095663, + "grad_norm": 10.109375, + "learning_rate": 7.263626251390435e-06, + "loss": 3.1522, + "mean_token_accuracy": 0.4745358755644757, + "step": 14760 + }, + { + "epoch": 2.736559139784946, + "grad_norm": 10.2421875, + "learning_rate": 7.263440860215054e-06, + "loss": 3.6617, + "mean_token_accuracy": 0.4317027532745255, + "step": 14761 + }, + { + "epoch": 2.736744530960326, + "grad_norm": 7.96484375, + "learning_rate": 7.263255469039674e-06, + "loss": 2.3974, + "mean_token_accuracy": 0.5107644305772231, + "step": 14762 + }, + { + "epoch": 2.7369299221357064, + "grad_norm": 11.5, + "learning_rate": 7.2630700778642935e-06, + "loss": 2.4602, + "mean_token_accuracy": 0.5172079495879787, + "step": 14763 + }, + { + "epoch": 2.7371153133110866, + "grad_norm": 11.1484375, + "learning_rate": 7.262884686688915e-06, + "loss": 3.3814, + "mean_token_accuracy": 0.45423040152963673, + "step": 14764 + }, + { + "epoch": 2.7373007044864663, + "grad_norm": 8.421875, + "learning_rate": 7.2626992955135345e-06, + "loss": 2.6465, + "mean_token_accuracy": 0.4916848807374323, + "step": 14765 + }, + { + "epoch": 2.7374860956618465, + "grad_norm": 11.0546875, + "learning_rate": 7.262513904338154e-06, + "loss": 3.1681, + "mean_token_accuracy": 0.48580697485806973, + "step": 14766 + }, + { + "epoch": 2.7376714868372263, + "grad_norm": 12.53125, + "learning_rate": 7.262328513162774e-06, + "loss": 3.4805, + "mean_token_accuracy": 0.4295480880648899, + "step": 14767 + }, + { + "epoch": 2.7378568780126065, + "grad_norm": 7.82421875, + "learning_rate": 7.262143121987394e-06, + "loss": 3.5222, + "mean_token_accuracy": 0.477874967841523, + "step": 14768 + }, + { + "epoch": 2.7380422691879867, + "grad_norm": 6.77734375, + "learning_rate": 7.261957730812014e-06, + "loss": 2.7751, + "mean_token_accuracy": 0.49986468200270634, + "step": 14769 + }, + { + "epoch": 2.738227660363367, + "grad_norm": 8.796875, + "learning_rate": 7.2617723396366336e-06, + "loss": 3.0339, + "mean_token_accuracy": 0.4964576226712149, + "step": 14770 + }, + { + "epoch": 2.7384130515387466, + "grad_norm": 7.39453125, + "learning_rate": 7.261586948461253e-06, + "loss": 3.0945, + "mean_token_accuracy": 0.4671249252839211, + "step": 14771 + }, + { + "epoch": 2.738598442714127, + "grad_norm": 7.84375, + "learning_rate": 7.2614015572858746e-06, + "loss": 2.7335, + "mean_token_accuracy": 0.4873688875757128, + "step": 14772 + }, + { + "epoch": 2.738783833889507, + "grad_norm": 7.5625, + "learning_rate": 7.261216166110494e-06, + "loss": 3.3132, + "mean_token_accuracy": 0.45591965100943493, + "step": 14773 + }, + { + "epoch": 2.7389692250648867, + "grad_norm": 15.203125, + "learning_rate": 7.261030774935114e-06, + "loss": 3.0972, + "mean_token_accuracy": 0.4681783045606802, + "step": 14774 + }, + { + "epoch": 2.739154616240267, + "grad_norm": 8.1484375, + "learning_rate": 7.2608453837597335e-06, + "loss": 2.8424, + "mean_token_accuracy": 0.48164067297064966, + "step": 14775 + }, + { + "epoch": 2.739340007415647, + "grad_norm": 8.7734375, + "learning_rate": 7.260659992584353e-06, + "loss": 2.5422, + "mean_token_accuracy": 0.48308065494238933, + "step": 14776 + }, + { + "epoch": 2.7395253985910273, + "grad_norm": 9.875, + "learning_rate": 7.260474601408974e-06, + "loss": 2.6006, + "mean_token_accuracy": 0.5117640033440821, + "step": 14777 + }, + { + "epoch": 2.739710789766407, + "grad_norm": 11.671875, + "learning_rate": 7.260289210233593e-06, + "loss": 3.1703, + "mean_token_accuracy": 0.47640414668110787, + "step": 14778 + }, + { + "epoch": 2.739896180941787, + "grad_norm": 8.9140625, + "learning_rate": 7.260103819058213e-06, + "loss": 3.6669, + "mean_token_accuracy": 0.4358468219502566, + "step": 14779 + }, + { + "epoch": 2.740081572117167, + "grad_norm": 9.8828125, + "learning_rate": 7.259918427882834e-06, + "loss": 2.4907, + "mean_token_accuracy": 0.5083477259643063, + "step": 14780 + }, + { + "epoch": 2.740266963292547, + "grad_norm": 8.34375, + "learning_rate": 7.259733036707454e-06, + "loss": 3.2599, + "mean_token_accuracy": 0.45718654434250766, + "step": 14781 + }, + { + "epoch": 2.7404523544679273, + "grad_norm": 7.25390625, + "learning_rate": 7.2595476455320735e-06, + "loss": 2.6959, + "mean_token_accuracy": 0.4856534090909091, + "step": 14782 + }, + { + "epoch": 2.7406377456433075, + "grad_norm": 7.53515625, + "learning_rate": 7.259362254356693e-06, + "loss": 2.5506, + "mean_token_accuracy": 0.5103173771865508, + "step": 14783 + }, + { + "epoch": 2.7408231368186873, + "grad_norm": 12.9375, + "learning_rate": 7.259176863181313e-06, + "loss": 2.3265, + "mean_token_accuracy": 0.5409335288367546, + "step": 14784 + }, + { + "epoch": 2.7410085279940675, + "grad_norm": 8.2890625, + "learning_rate": 7.258991472005933e-06, + "loss": 2.8334, + "mean_token_accuracy": 0.4832948608917507, + "step": 14785 + }, + { + "epoch": 2.7411939191694477, + "grad_norm": 11.671875, + "learning_rate": 7.258806080830553e-06, + "loss": 2.9943, + "mean_token_accuracy": 0.45862703034017777, + "step": 14786 + }, + { + "epoch": 2.7413793103448274, + "grad_norm": 13.3046875, + "learning_rate": 7.258620689655173e-06, + "loss": 2.7407, + "mean_token_accuracy": 0.5071673085151904, + "step": 14787 + }, + { + "epoch": 2.7415647015202076, + "grad_norm": 7.2578125, + "learning_rate": 7.258435298479793e-06, + "loss": 2.8574, + "mean_token_accuracy": 0.47388610209501325, + "step": 14788 + }, + { + "epoch": 2.741750092695588, + "grad_norm": 9.21875, + "learning_rate": 7.258249907304414e-06, + "loss": 2.8127, + "mean_token_accuracy": 0.4882510013351135, + "step": 14789 + }, + { + "epoch": 2.741935483870968, + "grad_norm": 9.53125, + "learning_rate": 7.258064516129033e-06, + "loss": 3.0661, + "mean_token_accuracy": 0.484226271416916, + "step": 14790 + }, + { + "epoch": 2.7421208750463477, + "grad_norm": 7.30859375, + "learning_rate": 7.257879124953653e-06, + "loss": 2.7312, + "mean_token_accuracy": 0.5017103762827823, + "step": 14791 + }, + { + "epoch": 2.742306266221728, + "grad_norm": 8.1484375, + "learning_rate": 7.2576937337782725e-06, + "loss": 4.276, + "mean_token_accuracy": 0.3920744138634047, + "step": 14792 + }, + { + "epoch": 2.7424916573971077, + "grad_norm": 7.08984375, + "learning_rate": 7.257508342602892e-06, + "loss": 2.9164, + "mean_token_accuracy": 0.49093075599099695, + "step": 14793 + }, + { + "epoch": 2.742677048572488, + "grad_norm": 7.5859375, + "learning_rate": 7.257322951427513e-06, + "loss": 2.6669, + "mean_token_accuracy": 0.5044818931516672, + "step": 14794 + }, + { + "epoch": 2.742862439747868, + "grad_norm": 6.75390625, + "learning_rate": 7.257137560252132e-06, + "loss": 2.5004, + "mean_token_accuracy": 0.5346334902488231, + "step": 14795 + }, + { + "epoch": 2.743047830923248, + "grad_norm": 6.8671875, + "learning_rate": 7.256952169076753e-06, + "loss": 2.7715, + "mean_token_accuracy": 0.4831610044313146, + "step": 14796 + }, + { + "epoch": 2.743233222098628, + "grad_norm": 9.6484375, + "learning_rate": 7.2567667779013724e-06, + "loss": 3.2188, + "mean_token_accuracy": 0.47224797986488276, + "step": 14797 + }, + { + "epoch": 2.743418613274008, + "grad_norm": 8.0703125, + "learning_rate": 7.256581386725993e-06, + "loss": 2.8451, + "mean_token_accuracy": 0.48976248976248976, + "step": 14798 + }, + { + "epoch": 2.7436040044493883, + "grad_norm": 7.5859375, + "learning_rate": 7.2563959955506126e-06, + "loss": 2.5137, + "mean_token_accuracy": 0.5175574204946997, + "step": 14799 + }, + { + "epoch": 2.743789395624768, + "grad_norm": 10.46875, + "learning_rate": 7.256210604375232e-06, + "loss": 2.8148, + "mean_token_accuracy": 0.46623959000640613, + "step": 14800 + }, + { + "epoch": 2.7439747868001483, + "grad_norm": 7.8671875, + "learning_rate": 7.256025213199852e-06, + "loss": 2.5828, + "mean_token_accuracy": 0.5147101660355374, + "step": 14801 + }, + { + "epoch": 2.7441601779755285, + "grad_norm": 7.65625, + "learning_rate": 7.2558398220244715e-06, + "loss": 3.693, + "mean_token_accuracy": 0.42519863791146423, + "step": 14802 + }, + { + "epoch": 2.7443455691509087, + "grad_norm": 7.61328125, + "learning_rate": 7.255654430849092e-06, + "loss": 2.6108, + "mean_token_accuracy": 0.50011460004584, + "step": 14803 + }, + { + "epoch": 2.7445309603262884, + "grad_norm": 6.46875, + "learning_rate": 7.2554690396737125e-06, + "loss": 2.4222, + "mean_token_accuracy": 0.5353299017313992, + "step": 14804 + }, + { + "epoch": 2.7447163515016686, + "grad_norm": 8.0390625, + "learning_rate": 7.255283648498332e-06, + "loss": 3.2055, + "mean_token_accuracy": 0.45304029304029303, + "step": 14805 + }, + { + "epoch": 2.7449017426770483, + "grad_norm": 10.65625, + "learning_rate": 7.255098257322953e-06, + "loss": 3.7001, + "mean_token_accuracy": 0.44077576534701074, + "step": 14806 + }, + { + "epoch": 2.7450871338524285, + "grad_norm": 7.890625, + "learning_rate": 7.254912866147572e-06, + "loss": 3.3108, + "mean_token_accuracy": 0.45675807236735355, + "step": 14807 + }, + { + "epoch": 2.7452725250278087, + "grad_norm": 8.1484375, + "learning_rate": 7.254727474972192e-06, + "loss": 2.2498, + "mean_token_accuracy": 0.5223503395987995, + "step": 14808 + }, + { + "epoch": 2.745457916203189, + "grad_norm": 7.2578125, + "learning_rate": 7.2545420837968116e-06, + "loss": 2.5858, + "mean_token_accuracy": 0.4963005780346821, + "step": 14809 + }, + { + "epoch": 2.7456433073785687, + "grad_norm": 8.1015625, + "learning_rate": 7.254356692621431e-06, + "loss": 2.8728, + "mean_token_accuracy": 0.4782804919327541, + "step": 14810 + }, + { + "epoch": 2.745828698553949, + "grad_norm": 10.1875, + "learning_rate": 7.254171301446052e-06, + "loss": 3.0346, + "mean_token_accuracy": 0.4612899503865236, + "step": 14811 + }, + { + "epoch": 2.746014089729329, + "grad_norm": 7.015625, + "learning_rate": 7.253985910270672e-06, + "loss": 2.7668, + "mean_token_accuracy": 0.47231681170387185, + "step": 14812 + }, + { + "epoch": 2.746199480904709, + "grad_norm": 8.296875, + "learning_rate": 7.253800519095292e-06, + "loss": 3.2158, + "mean_token_accuracy": 0.4533333333333333, + "step": 14813 + }, + { + "epoch": 2.746384872080089, + "grad_norm": 9.25, + "learning_rate": 7.2536151279199115e-06, + "loss": 3.1318, + "mean_token_accuracy": 0.4723242022027676, + "step": 14814 + }, + { + "epoch": 2.746570263255469, + "grad_norm": 6.52734375, + "learning_rate": 7.253429736744532e-06, + "loss": 2.7032, + "mean_token_accuracy": 0.4800173761946134, + "step": 14815 + }, + { + "epoch": 2.7467556544308493, + "grad_norm": 7.22265625, + "learning_rate": 7.253244345569152e-06, + "loss": 3.2994, + "mean_token_accuracy": 0.4453227931488801, + "step": 14816 + }, + { + "epoch": 2.746941045606229, + "grad_norm": 7.75, + "learning_rate": 7.253058954393771e-06, + "loss": 3.2332, + "mean_token_accuracy": 0.4534954407294833, + "step": 14817 + }, + { + "epoch": 2.7471264367816093, + "grad_norm": 7.5, + "learning_rate": 7.252873563218391e-06, + "loss": 3.1735, + "mean_token_accuracy": 0.44948849104859334, + "step": 14818 + }, + { + "epoch": 2.747311827956989, + "grad_norm": 8.7421875, + "learning_rate": 7.2526881720430105e-06, + "loss": 3.9921, + "mean_token_accuracy": 0.4187058183795541, + "step": 14819 + }, + { + "epoch": 2.747497219132369, + "grad_norm": 9.2265625, + "learning_rate": 7.252502780867632e-06, + "loss": 2.1735, + "mean_token_accuracy": 0.5685851926977687, + "step": 14820 + }, + { + "epoch": 2.7476826103077494, + "grad_norm": 7.0546875, + "learning_rate": 7.2523173896922515e-06, + "loss": 2.5724, + "mean_token_accuracy": 0.5310828025477707, + "step": 14821 + }, + { + "epoch": 2.7478680014831296, + "grad_norm": 7.2421875, + "learning_rate": 7.252131998516871e-06, + "loss": 3.1205, + "mean_token_accuracy": 0.4641886751440061, + "step": 14822 + }, + { + "epoch": 2.7480533926585093, + "grad_norm": 8.7578125, + "learning_rate": 7.251946607341492e-06, + "loss": 2.945, + "mean_token_accuracy": 0.46978321424246094, + "step": 14823 + }, + { + "epoch": 2.7482387838338895, + "grad_norm": 9.515625, + "learning_rate": 7.251761216166111e-06, + "loss": 2.8226, + "mean_token_accuracy": 0.48175074183976263, + "step": 14824 + }, + { + "epoch": 2.7484241750092697, + "grad_norm": 6.51953125, + "learning_rate": 7.251575824990731e-06, + "loss": 3.1383, + "mean_token_accuracy": 0.45470548408937034, + "step": 14825 + }, + { + "epoch": 2.7486095661846495, + "grad_norm": 9.1640625, + "learning_rate": 7.251390433815351e-06, + "loss": 3.2372, + "mean_token_accuracy": 0.457713557161071, + "step": 14826 + }, + { + "epoch": 2.7487949573600297, + "grad_norm": 9.4765625, + "learning_rate": 7.25120504263997e-06, + "loss": 2.791, + "mean_token_accuracy": 0.5096081856750686, + "step": 14827 + }, + { + "epoch": 2.74898034853541, + "grad_norm": 13.59375, + "learning_rate": 7.2510196514645916e-06, + "loss": 2.4653, + "mean_token_accuracy": 0.5443718228031954, + "step": 14828 + }, + { + "epoch": 2.74916573971079, + "grad_norm": 7.99609375, + "learning_rate": 7.250834260289211e-06, + "loss": 2.8527, + "mean_token_accuracy": 0.4794957715015159, + "step": 14829 + }, + { + "epoch": 2.74935113088617, + "grad_norm": 8.8984375, + "learning_rate": 7.250648869113831e-06, + "loss": 3.1235, + "mean_token_accuracy": 0.5080596291358623, + "step": 14830 + }, + { + "epoch": 2.74953652206155, + "grad_norm": 7.21875, + "learning_rate": 7.2504634779384505e-06, + "loss": 2.4544, + "mean_token_accuracy": 0.5145631067961165, + "step": 14831 + }, + { + "epoch": 2.7497219132369297, + "grad_norm": 8.0859375, + "learning_rate": 7.250278086763071e-06, + "loss": 2.5942, + "mean_token_accuracy": 0.48945147679324896, + "step": 14832 + }, + { + "epoch": 2.74990730441231, + "grad_norm": 7.1015625, + "learning_rate": 7.250092695587691e-06, + "loss": 3.0181, + "mean_token_accuracy": 0.45010551703346396, + "step": 14833 + }, + { + "epoch": 2.75009269558769, + "grad_norm": 7.43359375, + "learning_rate": 7.24990730441231e-06, + "loss": 2.7281, + "mean_token_accuracy": 0.49951899951899953, + "step": 14834 + }, + { + "epoch": 2.7502780867630703, + "grad_norm": 8.125, + "learning_rate": 7.24972191323693e-06, + "loss": 2.994, + "mean_token_accuracy": 0.4570645875039511, + "step": 14835 + }, + { + "epoch": 2.75046347793845, + "grad_norm": 7.86328125, + "learning_rate": 7.249536522061551e-06, + "loss": 2.7826, + "mean_token_accuracy": 0.493452380952381, + "step": 14836 + }, + { + "epoch": 2.75064886911383, + "grad_norm": 7.359375, + "learning_rate": 7.249351130886171e-06, + "loss": 2.9018, + "mean_token_accuracy": 0.48751512484875154, + "step": 14837 + }, + { + "epoch": 2.75083426028921, + "grad_norm": 7.0234375, + "learning_rate": 7.2491657397107906e-06, + "loss": 2.6354, + "mean_token_accuracy": 0.5045500505561172, + "step": 14838 + }, + { + "epoch": 2.75101965146459, + "grad_norm": 8.03125, + "learning_rate": 7.24898034853541e-06, + "loss": 2.7796, + "mean_token_accuracy": 0.4876443728176202, + "step": 14839 + }, + { + "epoch": 2.7512050426399703, + "grad_norm": 7.1953125, + "learning_rate": 7.24879495736003e-06, + "loss": 2.4115, + "mean_token_accuracy": 0.5340458748648973, + "step": 14840 + }, + { + "epoch": 2.7513904338153505, + "grad_norm": 7.93359375, + "learning_rate": 7.24860956618465e-06, + "loss": 2.9755, + "mean_token_accuracy": 0.47453299287730144, + "step": 14841 + }, + { + "epoch": 2.7515758249907303, + "grad_norm": 9.0859375, + "learning_rate": 7.24842417500927e-06, + "loss": 2.9474, + "mean_token_accuracy": 0.4656970362239298, + "step": 14842 + }, + { + "epoch": 2.7517612161661105, + "grad_norm": 7.375, + "learning_rate": 7.24823878383389e-06, + "loss": 3.5333, + "mean_token_accuracy": 0.43989248308462325, + "step": 14843 + }, + { + "epoch": 2.7519466073414907, + "grad_norm": 6.91015625, + "learning_rate": 7.248053392658511e-06, + "loss": 2.6833, + "mean_token_accuracy": 0.4842875099988573, + "step": 14844 + }, + { + "epoch": 2.7521319985168704, + "grad_norm": 8.171875, + "learning_rate": 7.247868001483131e-06, + "loss": 3.3693, + "mean_token_accuracy": 0.4667900948854432, + "step": 14845 + }, + { + "epoch": 2.7523173896922506, + "grad_norm": 7.66796875, + "learning_rate": 7.24768261030775e-06, + "loss": 2.9951, + "mean_token_accuracy": 0.47549958937859294, + "step": 14846 + }, + { + "epoch": 2.752502780867631, + "grad_norm": 7.46875, + "learning_rate": 7.24749721913237e-06, + "loss": 2.4631, + "mean_token_accuracy": 0.5106794032413867, + "step": 14847 + }, + { + "epoch": 2.752688172043011, + "grad_norm": 8.3515625, + "learning_rate": 7.2473118279569895e-06, + "loss": 3.2675, + "mean_token_accuracy": 0.4621815286624204, + "step": 14848 + }, + { + "epoch": 2.7528735632183907, + "grad_norm": 6.2421875, + "learning_rate": 7.24712643678161e-06, + "loss": 2.6296, + "mean_token_accuracy": 0.4986001866417811, + "step": 14849 + }, + { + "epoch": 2.753058954393771, + "grad_norm": 7.3984375, + "learning_rate": 7.24694104560623e-06, + "loss": 3.5314, + "mean_token_accuracy": 0.41505766489568485, + "step": 14850 + }, + { + "epoch": 2.7532443455691507, + "grad_norm": 10.2265625, + "learning_rate": 7.246755654430849e-06, + "loss": 2.9439, + "mean_token_accuracy": 0.4622986327331799, + "step": 14851 + }, + { + "epoch": 2.753429736744531, + "grad_norm": 9.5625, + "learning_rate": 7.246570263255469e-06, + "loss": 3.1751, + "mean_token_accuracy": 0.44319263725801333, + "step": 14852 + }, + { + "epoch": 2.753615127919911, + "grad_norm": 6.68359375, + "learning_rate": 7.24638487208009e-06, + "loss": 2.4508, + "mean_token_accuracy": 0.5114231014677728, + "step": 14853 + }, + { + "epoch": 2.753800519095291, + "grad_norm": 10.859375, + "learning_rate": 7.24619948090471e-06, + "loss": 3.0964, + "mean_token_accuracy": 0.4699074074074074, + "step": 14854 + }, + { + "epoch": 2.753985910270671, + "grad_norm": 7.234375, + "learning_rate": 7.24601408972933e-06, + "loss": 2.9494, + "mean_token_accuracy": 0.48335419274092617, + "step": 14855 + }, + { + "epoch": 2.754171301446051, + "grad_norm": 8.0546875, + "learning_rate": 7.245828698553949e-06, + "loss": 2.2639, + "mean_token_accuracy": 0.5391383495145631, + "step": 14856 + }, + { + "epoch": 2.7543566926214313, + "grad_norm": 7.44140625, + "learning_rate": 7.245643307378569e-06, + "loss": 3.084, + "mean_token_accuracy": 0.48740916004732127, + "step": 14857 + }, + { + "epoch": 2.754542083796811, + "grad_norm": 9.2265625, + "learning_rate": 7.245457916203189e-06, + "loss": 3.0937, + "mean_token_accuracy": 0.4755747126436782, + "step": 14858 + }, + { + "epoch": 2.7547274749721913, + "grad_norm": 7.27734375, + "learning_rate": 7.245272525027809e-06, + "loss": 2.705, + "mean_token_accuracy": 0.49926650366748165, + "step": 14859 + }, + { + "epoch": 2.7549128661475715, + "grad_norm": 7.17578125, + "learning_rate": 7.245087133852429e-06, + "loss": 3.0543, + "mean_token_accuracy": 0.4727251624883937, + "step": 14860 + }, + { + "epoch": 2.7550982573229517, + "grad_norm": 10.203125, + "learning_rate": 7.24490174267705e-06, + "loss": 2.6734, + "mean_token_accuracy": 0.5080932784636488, + "step": 14861 + }, + { + "epoch": 2.7552836484983314, + "grad_norm": 9.359375, + "learning_rate": 7.24471635150167e-06, + "loss": 3.613, + "mean_token_accuracy": 0.426006426006426, + "step": 14862 + }, + { + "epoch": 2.7554690396737116, + "grad_norm": 9.1171875, + "learning_rate": 7.244530960326289e-06, + "loss": 3.2086, + "mean_token_accuracy": 0.46356502242152464, + "step": 14863 + }, + { + "epoch": 2.7556544308490913, + "grad_norm": 7.42578125, + "learning_rate": 7.244345569150909e-06, + "loss": 2.6528, + "mean_token_accuracy": 0.4897891963109354, + "step": 14864 + }, + { + "epoch": 2.7558398220244715, + "grad_norm": 10.2421875, + "learning_rate": 7.244160177975529e-06, + "loss": 2.8749, + "mean_token_accuracy": 0.47155704843428325, + "step": 14865 + }, + { + "epoch": 2.7560252131998517, + "grad_norm": 11.2265625, + "learning_rate": 7.243974786800149e-06, + "loss": 2.3936, + "mean_token_accuracy": 0.5176266137040715, + "step": 14866 + }, + { + "epoch": 2.756210604375232, + "grad_norm": 7.03125, + "learning_rate": 7.243789395624769e-06, + "loss": 2.823, + "mean_token_accuracy": 0.5138888888888888, + "step": 14867 + }, + { + "epoch": 2.7563959955506117, + "grad_norm": 9.9921875, + "learning_rate": 7.243604004449388e-06, + "loss": 3.1354, + "mean_token_accuracy": 0.466977985323549, + "step": 14868 + }, + { + "epoch": 2.756581386725992, + "grad_norm": 9.703125, + "learning_rate": 7.243418613274009e-06, + "loss": 3.0735, + "mean_token_accuracy": 0.45927173226689116, + "step": 14869 + }, + { + "epoch": 2.756766777901372, + "grad_norm": 8.359375, + "learning_rate": 7.243233222098629e-06, + "loss": 2.9045, + "mean_token_accuracy": 0.46311213399820994, + "step": 14870 + }, + { + "epoch": 2.756952169076752, + "grad_norm": 7.6953125, + "learning_rate": 7.243047830923249e-06, + "loss": 2.5329, + "mean_token_accuracy": 0.5031289111389237, + "step": 14871 + }, + { + "epoch": 2.757137560252132, + "grad_norm": 13.0234375, + "learning_rate": 7.242862439747869e-06, + "loss": 2.5275, + "mean_token_accuracy": 0.4943509338252248, + "step": 14872 + }, + { + "epoch": 2.757322951427512, + "grad_norm": 8.796875, + "learning_rate": 7.242677048572488e-06, + "loss": 2.9005, + "mean_token_accuracy": 0.47593167701863354, + "step": 14873 + }, + { + "epoch": 2.7575083426028923, + "grad_norm": 8.8046875, + "learning_rate": 7.242491657397108e-06, + "loss": 3.3039, + "mean_token_accuracy": 0.46223820605034904, + "step": 14874 + }, + { + "epoch": 2.757693733778272, + "grad_norm": 11.2890625, + "learning_rate": 7.242306266221728e-06, + "loss": 3.0149, + "mean_token_accuracy": 0.49165275459098495, + "step": 14875 + }, + { + "epoch": 2.7578791249536523, + "grad_norm": 15.984375, + "learning_rate": 7.242120875046348e-06, + "loss": 2.3571, + "mean_token_accuracy": 0.5181973272675576, + "step": 14876 + }, + { + "epoch": 2.758064516129032, + "grad_norm": 19.0, + "learning_rate": 7.2419354838709685e-06, + "loss": 2.9314, + "mean_token_accuracy": 0.4584313183012865, + "step": 14877 + }, + { + "epoch": 2.758249907304412, + "grad_norm": 9.765625, + "learning_rate": 7.241750092695588e-06, + "loss": 3.0267, + "mean_token_accuracy": 0.4695033592412067, + "step": 14878 + }, + { + "epoch": 2.7584352984797924, + "grad_norm": 8.546875, + "learning_rate": 7.241564701520209e-06, + "loss": 2.9736, + "mean_token_accuracy": 0.45637839208724323, + "step": 14879 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 7.64453125, + "learning_rate": 7.241379310344828e-06, + "loss": 2.9132, + "mean_token_accuracy": 0.49442896935933145, + "step": 14880 + }, + { + "epoch": 2.7588060808305523, + "grad_norm": 8.5234375, + "learning_rate": 7.241193919169448e-06, + "loss": 2.7925, + "mean_token_accuracy": 0.49201499791695597, + "step": 14881 + }, + { + "epoch": 2.7589914720059325, + "grad_norm": 8.2109375, + "learning_rate": 7.241008527994068e-06, + "loss": 2.9784, + "mean_token_accuracy": 0.48815228966986157, + "step": 14882 + }, + { + "epoch": 2.7591768631813127, + "grad_norm": 9.6015625, + "learning_rate": 7.240823136818687e-06, + "loss": 3.2139, + "mean_token_accuracy": 0.4637998436278342, + "step": 14883 + }, + { + "epoch": 2.7593622543566925, + "grad_norm": 11.015625, + "learning_rate": 7.240637745643308e-06, + "loss": 2.8003, + "mean_token_accuracy": 0.48838090426875475, + "step": 14884 + }, + { + "epoch": 2.7595476455320727, + "grad_norm": 7.90234375, + "learning_rate": 7.240452354467928e-06, + "loss": 3.3024, + "mean_token_accuracy": 0.44165565830030823, + "step": 14885 + }, + { + "epoch": 2.759733036707453, + "grad_norm": 10.546875, + "learning_rate": 7.240266963292548e-06, + "loss": 3.3158, + "mean_token_accuracy": 0.4415365132967497, + "step": 14886 + }, + { + "epoch": 2.759918427882833, + "grad_norm": 8.1796875, + "learning_rate": 7.240081572117168e-06, + "loss": 2.2504, + "mean_token_accuracy": 0.5347212894560107, + "step": 14887 + }, + { + "epoch": 2.760103819058213, + "grad_norm": 8.7578125, + "learning_rate": 7.239896180941788e-06, + "loss": 2.7059, + "mean_token_accuracy": 0.4975833736104398, + "step": 14888 + }, + { + "epoch": 2.760289210233593, + "grad_norm": 8.328125, + "learning_rate": 7.239710789766408e-06, + "loss": 2.737, + "mean_token_accuracy": 0.4924965893587995, + "step": 14889 + }, + { + "epoch": 2.7604746014089727, + "grad_norm": 7.09375, + "learning_rate": 7.239525398591027e-06, + "loss": 2.3621, + "mean_token_accuracy": 0.5105291576673866, + "step": 14890 + }, + { + "epoch": 2.760659992584353, + "grad_norm": 9.7734375, + "learning_rate": 7.239340007415647e-06, + "loss": 3.343, + "mean_token_accuracy": 0.47875419705916405, + "step": 14891 + }, + { + "epoch": 2.760845383759733, + "grad_norm": 7.859375, + "learning_rate": 7.2391546162402674e-06, + "loss": 2.8276, + "mean_token_accuracy": 0.49747590637907296, + "step": 14892 + }, + { + "epoch": 2.7610307749351133, + "grad_norm": 7.7421875, + "learning_rate": 7.238969225064888e-06, + "loss": 2.9518, + "mean_token_accuracy": 0.46955974842767295, + "step": 14893 + }, + { + "epoch": 2.761216166110493, + "grad_norm": 9.625, + "learning_rate": 7.238783833889508e-06, + "loss": 3.1409, + "mean_token_accuracy": 0.4560214778828944, + "step": 14894 + }, + { + "epoch": 2.761401557285873, + "grad_norm": 6.86328125, + "learning_rate": 7.238598442714127e-06, + "loss": 2.4125, + "mean_token_accuracy": 0.5088172043010752, + "step": 14895 + }, + { + "epoch": 2.7615869484612534, + "grad_norm": 10.1484375, + "learning_rate": 7.238413051538748e-06, + "loss": 3.8387, + "mean_token_accuracy": 0.4669040084388186, + "step": 14896 + }, + { + "epoch": 2.761772339636633, + "grad_norm": 10.1328125, + "learning_rate": 7.238227660363367e-06, + "loss": 2.5197, + "mean_token_accuracy": 0.5096200485044462, + "step": 14897 + }, + { + "epoch": 2.7619577308120133, + "grad_norm": 7.1953125, + "learning_rate": 7.238042269187987e-06, + "loss": 3.0126, + "mean_token_accuracy": 0.4675385333956095, + "step": 14898 + }, + { + "epoch": 2.7621431219873935, + "grad_norm": 8.3984375, + "learning_rate": 7.237856878012607e-06, + "loss": 2.7853, + "mean_token_accuracy": 0.47955647955647956, + "step": 14899 + }, + { + "epoch": 2.7623285131627737, + "grad_norm": 6.53125, + "learning_rate": 7.237671486837226e-06, + "loss": 2.7794, + "mean_token_accuracy": 0.4776364382437423, + "step": 14900 + }, + { + "epoch": 2.7625139043381535, + "grad_norm": 7.3828125, + "learning_rate": 7.237486095661848e-06, + "loss": 3.0698, + "mean_token_accuracy": 0.4510502864417568, + "step": 14901 + }, + { + "epoch": 2.7626992955135337, + "grad_norm": 9.6015625, + "learning_rate": 7.237300704486467e-06, + "loss": 2.3116, + "mean_token_accuracy": 0.5308845958312151, + "step": 14902 + }, + { + "epoch": 2.7628846866889134, + "grad_norm": 9.515625, + "learning_rate": 7.237115313311087e-06, + "loss": 2.9323, + "mean_token_accuracy": 0.46164349553128103, + "step": 14903 + }, + { + "epoch": 2.7630700778642936, + "grad_norm": 7.2421875, + "learning_rate": 7.236929922135707e-06, + "loss": 2.7643, + "mean_token_accuracy": 0.4996724747805581, + "step": 14904 + }, + { + "epoch": 2.763255469039674, + "grad_norm": 8.1015625, + "learning_rate": 7.236744530960327e-06, + "loss": 2.7777, + "mean_token_accuracy": 0.4927242819182589, + "step": 14905 + }, + { + "epoch": 2.763440860215054, + "grad_norm": 8.7734375, + "learning_rate": 7.236559139784947e-06, + "loss": 2.4191, + "mean_token_accuracy": 0.5222454135999122, + "step": 14906 + }, + { + "epoch": 2.7636262513904337, + "grad_norm": 7.5703125, + "learning_rate": 7.236373748609566e-06, + "loss": 3.0185, + "mean_token_accuracy": 0.4634915366744109, + "step": 14907 + }, + { + "epoch": 2.763811642565814, + "grad_norm": 7.3046875, + "learning_rate": 7.236188357434186e-06, + "loss": 3.4494, + "mean_token_accuracy": 0.4301492537313433, + "step": 14908 + }, + { + "epoch": 2.7639970337411937, + "grad_norm": 8.171875, + "learning_rate": 7.236002966258807e-06, + "loss": 3.8328, + "mean_token_accuracy": 0.4139804418688881, + "step": 14909 + }, + { + "epoch": 2.764182424916574, + "grad_norm": 8.34375, + "learning_rate": 7.235817575083427e-06, + "loss": 3.0634, + "mean_token_accuracy": 0.45400452219445436, + "step": 14910 + }, + { + "epoch": 2.764367816091954, + "grad_norm": 7.1484375, + "learning_rate": 7.235632183908047e-06, + "loss": 3.3591, + "mean_token_accuracy": 0.44966905681191394, + "step": 14911 + }, + { + "epoch": 2.7645532072673342, + "grad_norm": 7.7578125, + "learning_rate": 7.235446792732666e-06, + "loss": 2.7236, + "mean_token_accuracy": 0.49956445993031356, + "step": 14912 + }, + { + "epoch": 2.764738598442714, + "grad_norm": 7.69921875, + "learning_rate": 7.235261401557287e-06, + "loss": 3.0516, + "mean_token_accuracy": 0.4497530599098132, + "step": 14913 + }, + { + "epoch": 2.764923989618094, + "grad_norm": 7.375, + "learning_rate": 7.235076010381906e-06, + "loss": 2.9983, + "mean_token_accuracy": 0.4583655581335396, + "step": 14914 + }, + { + "epoch": 2.7651093807934743, + "grad_norm": 8.9765625, + "learning_rate": 7.234890619206526e-06, + "loss": 3.0604, + "mean_token_accuracy": 0.44219292158223455, + "step": 14915 + }, + { + "epoch": 2.765294771968854, + "grad_norm": 7.39453125, + "learning_rate": 7.234705228031146e-06, + "loss": 2.8731, + "mean_token_accuracy": 0.4873995368478409, + "step": 14916 + }, + { + "epoch": 2.7654801631442343, + "grad_norm": 8.3984375, + "learning_rate": 7.234519836855767e-06, + "loss": 2.4908, + "mean_token_accuracy": 0.5011469437322899, + "step": 14917 + }, + { + "epoch": 2.7656655543196145, + "grad_norm": 9.9140625, + "learning_rate": 7.234334445680387e-06, + "loss": 3.6922, + "mean_token_accuracy": 0.42028316405047705, + "step": 14918 + }, + { + "epoch": 2.7658509454949947, + "grad_norm": 7.26171875, + "learning_rate": 7.234149054505006e-06, + "loss": 2.8883, + "mean_token_accuracy": 0.47289101268149236, + "step": 14919 + }, + { + "epoch": 2.7660363366703744, + "grad_norm": 6.9296875, + "learning_rate": 7.233963663329626e-06, + "loss": 3.3742, + "mean_token_accuracy": 0.4421929528312507, + "step": 14920 + }, + { + "epoch": 2.7662217278457546, + "grad_norm": 7.4296875, + "learning_rate": 7.233778272154246e-06, + "loss": 3.2222, + "mean_token_accuracy": 0.457037695138977, + "step": 14921 + }, + { + "epoch": 2.7664071190211343, + "grad_norm": 7.7890625, + "learning_rate": 7.233592880978866e-06, + "loss": 3.1198, + "mean_token_accuracy": 0.4516829533116178, + "step": 14922 + }, + { + "epoch": 2.7665925101965145, + "grad_norm": 8.453125, + "learning_rate": 7.233407489803486e-06, + "loss": 2.4589, + "mean_token_accuracy": 0.5199101973295521, + "step": 14923 + }, + { + "epoch": 2.7667779013718947, + "grad_norm": 7.375, + "learning_rate": 7.233222098628105e-06, + "loss": 2.9488, + "mean_token_accuracy": 0.47092020692898573, + "step": 14924 + }, + { + "epoch": 2.766963292547275, + "grad_norm": 9.7265625, + "learning_rate": 7.233036707452727e-06, + "loss": 3.2661, + "mean_token_accuracy": 0.4494038520330174, + "step": 14925 + }, + { + "epoch": 2.7671486837226547, + "grad_norm": 7.5625, + "learning_rate": 7.232851316277346e-06, + "loss": 2.6689, + "mean_token_accuracy": 0.49907749077490776, + "step": 14926 + }, + { + "epoch": 2.767334074898035, + "grad_norm": 7.2421875, + "learning_rate": 7.232665925101966e-06, + "loss": 3.589, + "mean_token_accuracy": 0.4372999709048589, + "step": 14927 + }, + { + "epoch": 2.767519466073415, + "grad_norm": 8.75, + "learning_rate": 7.232480533926586e-06, + "loss": 4.0577, + "mean_token_accuracy": 0.4266649720386375, + "step": 14928 + }, + { + "epoch": 2.767704857248795, + "grad_norm": 8.4921875, + "learning_rate": 7.232295142751205e-06, + "loss": 2.5097, + "mean_token_accuracy": 0.508110992529349, + "step": 14929 + }, + { + "epoch": 2.767890248424175, + "grad_norm": 15.328125, + "learning_rate": 7.232109751575826e-06, + "loss": 4.2298, + "mean_token_accuracy": 0.42609871534820826, + "step": 14930 + }, + { + "epoch": 2.768075639599555, + "grad_norm": 7.78125, + "learning_rate": 7.2319243604004454e-06, + "loss": 2.4759, + "mean_token_accuracy": 0.5060144346431436, + "step": 14931 + }, + { + "epoch": 2.7682610307749353, + "grad_norm": 7.1953125, + "learning_rate": 7.231738969225065e-06, + "loss": 3.1937, + "mean_token_accuracy": 0.46236828901154037, + "step": 14932 + }, + { + "epoch": 2.768446421950315, + "grad_norm": 8.9453125, + "learning_rate": 7.2315535780496856e-06, + "loss": 2.981, + "mean_token_accuracy": 0.5010895744924877, + "step": 14933 + }, + { + "epoch": 2.7686318131256953, + "grad_norm": 9.1875, + "learning_rate": 7.231368186874306e-06, + "loss": 2.5565, + "mean_token_accuracy": 0.5140819964349376, + "step": 14934 + }, + { + "epoch": 2.768817204301075, + "grad_norm": 7.1328125, + "learning_rate": 7.231182795698926e-06, + "loss": 2.5259, + "mean_token_accuracy": 0.5347545413601561, + "step": 14935 + }, + { + "epoch": 2.769002595476455, + "grad_norm": 7.40625, + "learning_rate": 7.230997404523545e-06, + "loss": 2.8142, + "mean_token_accuracy": 0.48696648524778, + "step": 14936 + }, + { + "epoch": 2.7691879866518354, + "grad_norm": 7.80859375, + "learning_rate": 7.230812013348165e-06, + "loss": 4.1087, + "mean_token_accuracy": 0.41726440761810535, + "step": 14937 + }, + { + "epoch": 2.7693733778272156, + "grad_norm": 8.4921875, + "learning_rate": 7.230626622172785e-06, + "loss": 3.0888, + "mean_token_accuracy": 0.46557295550584143, + "step": 14938 + }, + { + "epoch": 2.7695587690025953, + "grad_norm": 9.6015625, + "learning_rate": 7.230441230997405e-06, + "loss": 3.2259, + "mean_token_accuracy": 0.4753146176185866, + "step": 14939 + }, + { + "epoch": 2.7697441601779755, + "grad_norm": 7.85546875, + "learning_rate": 7.230255839822025e-06, + "loss": 2.6818, + "mean_token_accuracy": 0.496486151302191, + "step": 14940 + }, + { + "epoch": 2.7699295513533557, + "grad_norm": 10.984375, + "learning_rate": 7.230070448646645e-06, + "loss": 2.9597, + "mean_token_accuracy": 0.4701913640824338, + "step": 14941 + }, + { + "epoch": 2.7701149425287355, + "grad_norm": 7.75, + "learning_rate": 7.229885057471266e-06, + "loss": 2.2951, + "mean_token_accuracy": 0.5373205173845971, + "step": 14942 + }, + { + "epoch": 2.7703003337041157, + "grad_norm": 14.3359375, + "learning_rate": 7.229699666295885e-06, + "loss": 3.617, + "mean_token_accuracy": 0.4546084546084546, + "step": 14943 + }, + { + "epoch": 2.770485724879496, + "grad_norm": 8.2578125, + "learning_rate": 7.229514275120505e-06, + "loss": 3.4586, + "mean_token_accuracy": 0.4414523783246882, + "step": 14944 + }, + { + "epoch": 2.770671116054876, + "grad_norm": 6.8125, + "learning_rate": 7.229328883945125e-06, + "loss": 2.9606, + "mean_token_accuracy": 0.47267719959414406, + "step": 14945 + }, + { + "epoch": 2.770856507230256, + "grad_norm": 7.70703125, + "learning_rate": 7.229143492769744e-06, + "loss": 2.6706, + "mean_token_accuracy": 0.4928949568124826, + "step": 14946 + }, + { + "epoch": 2.771041898405636, + "grad_norm": 9.296875, + "learning_rate": 7.228958101594365e-06, + "loss": 3.4236, + "mean_token_accuracy": 0.45313527014591826, + "step": 14947 + }, + { + "epoch": 2.7712272895810157, + "grad_norm": 8.625, + "learning_rate": 7.2287727104189845e-06, + "loss": 2.8268, + "mean_token_accuracy": 0.49534219351633335, + "step": 14948 + }, + { + "epoch": 2.771412680756396, + "grad_norm": 7.09375, + "learning_rate": 7.228587319243605e-06, + "loss": 2.6657, + "mean_token_accuracy": 0.5211981566820276, + "step": 14949 + }, + { + "epoch": 2.771598071931776, + "grad_norm": 7.4140625, + "learning_rate": 7.228401928068225e-06, + "loss": 3.3849, + "mean_token_accuracy": 0.43249872253449156, + "step": 14950 + }, + { + "epoch": 2.7717834631071563, + "grad_norm": 8.0625, + "learning_rate": 7.228216536892845e-06, + "loss": 3.6337, + "mean_token_accuracy": 0.4524598521539638, + "step": 14951 + }, + { + "epoch": 2.771968854282536, + "grad_norm": 7.90234375, + "learning_rate": 7.228031145717465e-06, + "loss": 3.4224, + "mean_token_accuracy": 0.4568937150378414, + "step": 14952 + }, + { + "epoch": 2.772154245457916, + "grad_norm": 8.1171875, + "learning_rate": 7.227845754542084e-06, + "loss": 2.9356, + "mean_token_accuracy": 0.4798491165376216, + "step": 14953 + }, + { + "epoch": 2.7723396366332964, + "grad_norm": 8.7265625, + "learning_rate": 7.227660363366704e-06, + "loss": 2.6862, + "mean_token_accuracy": 0.47983392645314354, + "step": 14954 + }, + { + "epoch": 2.772525027808676, + "grad_norm": 7.49609375, + "learning_rate": 7.227474972191324e-06, + "loss": 2.8088, + "mean_token_accuracy": 0.47935085382100034, + "step": 14955 + }, + { + "epoch": 2.7727104189840563, + "grad_norm": 6.8984375, + "learning_rate": 7.227289581015944e-06, + "loss": 2.2604, + "mean_token_accuracy": 0.5435386168601716, + "step": 14956 + }, + { + "epoch": 2.7728958101594365, + "grad_norm": 7.26953125, + "learning_rate": 7.227104189840565e-06, + "loss": 2.6062, + "mean_token_accuracy": 0.49673008323424495, + "step": 14957 + }, + { + "epoch": 2.7730812013348167, + "grad_norm": 9.34375, + "learning_rate": 7.226918798665184e-06, + "loss": 3.0451, + "mean_token_accuracy": 0.48356360171510243, + "step": 14958 + }, + { + "epoch": 2.7732665925101965, + "grad_norm": 7.86328125, + "learning_rate": 7.226733407489804e-06, + "loss": 3.22, + "mean_token_accuracy": 0.4558114035087719, + "step": 14959 + }, + { + "epoch": 2.7734519836855767, + "grad_norm": 6.8515625, + "learning_rate": 7.2265480163144244e-06, + "loss": 2.8417, + "mean_token_accuracy": 0.4682766504715633, + "step": 14960 + }, + { + "epoch": 2.7736373748609564, + "grad_norm": 8.4765625, + "learning_rate": 7.226362625139044e-06, + "loss": 2.6572, + "mean_token_accuracy": 0.48473325131417067, + "step": 14961 + }, + { + "epoch": 2.7738227660363366, + "grad_norm": 6.10546875, + "learning_rate": 7.226177233963664e-06, + "loss": 3.2596, + "mean_token_accuracy": 0.42630846623140317, + "step": 14962 + }, + { + "epoch": 2.774008157211717, + "grad_norm": 8.171875, + "learning_rate": 7.225991842788283e-06, + "loss": 3.3752, + "mean_token_accuracy": 0.4343101343101343, + "step": 14963 + }, + { + "epoch": 2.774193548387097, + "grad_norm": 8.5, + "learning_rate": 7.225806451612903e-06, + "loss": 2.4757, + "mean_token_accuracy": 0.513355592654424, + "step": 14964 + }, + { + "epoch": 2.7743789395624767, + "grad_norm": 8.53125, + "learning_rate": 7.225621060437524e-06, + "loss": 3.7204, + "mean_token_accuracy": 0.4127665876777251, + "step": 14965 + }, + { + "epoch": 2.774564330737857, + "grad_norm": 7.04296875, + "learning_rate": 7.225435669262144e-06, + "loss": 3.33, + "mean_token_accuracy": 0.4666427117020002, + "step": 14966 + }, + { + "epoch": 2.7747497219132367, + "grad_norm": 8.4296875, + "learning_rate": 7.225250278086764e-06, + "loss": 3.875, + "mean_token_accuracy": 0.4513528041569611, + "step": 14967 + }, + { + "epoch": 2.774935113088617, + "grad_norm": 6.87109375, + "learning_rate": 7.225064886911384e-06, + "loss": 2.7644, + "mean_token_accuracy": 0.4735774647887324, + "step": 14968 + }, + { + "epoch": 2.775120504263997, + "grad_norm": 7.125, + "learning_rate": 7.224879495736004e-06, + "loss": 2.4641, + "mean_token_accuracy": 0.5253505933117584, + "step": 14969 + }, + { + "epoch": 2.7753058954393772, + "grad_norm": 8.5703125, + "learning_rate": 7.224694104560623e-06, + "loss": 3.3967, + "mean_token_accuracy": 0.45251084067701774, + "step": 14970 + }, + { + "epoch": 2.7754912866147574, + "grad_norm": 9.765625, + "learning_rate": 7.224508713385243e-06, + "loss": 2.9796, + "mean_token_accuracy": 0.4374850870913863, + "step": 14971 + }, + { + "epoch": 2.775676677790137, + "grad_norm": 8.6015625, + "learning_rate": 7.224323322209863e-06, + "loss": 3.2966, + "mean_token_accuracy": 0.5060849598163031, + "step": 14972 + }, + { + "epoch": 2.7758620689655173, + "grad_norm": 8.328125, + "learning_rate": 7.224137931034483e-06, + "loss": 3.4799, + "mean_token_accuracy": 0.42666493910339465, + "step": 14973 + }, + { + "epoch": 2.776047460140897, + "grad_norm": 7.8046875, + "learning_rate": 7.223952539859104e-06, + "loss": 2.4665, + "mean_token_accuracy": 0.5753994025198078, + "step": 14974 + }, + { + "epoch": 2.7762328513162773, + "grad_norm": 11.046875, + "learning_rate": 7.223767148683723e-06, + "loss": 2.1144, + "mean_token_accuracy": 0.5366124774211477, + "step": 14975 + }, + { + "epoch": 2.7764182424916575, + "grad_norm": 7.14453125, + "learning_rate": 7.223581757508343e-06, + "loss": 2.5525, + "mean_token_accuracy": 0.4911203683402763, + "step": 14976 + }, + { + "epoch": 2.7766036336670377, + "grad_norm": 9.1171875, + "learning_rate": 7.2233963663329635e-06, + "loss": 2.7829, + "mean_token_accuracy": 0.5451481696687972, + "step": 14977 + }, + { + "epoch": 2.7767890248424174, + "grad_norm": 9.3671875, + "learning_rate": 7.223210975157583e-06, + "loss": 3.4986, + "mean_token_accuracy": 0.4373851404568128, + "step": 14978 + }, + { + "epoch": 2.7769744160177976, + "grad_norm": 8.265625, + "learning_rate": 7.223025583982203e-06, + "loss": 2.8453, + "mean_token_accuracy": 0.46710905943450665, + "step": 14979 + }, + { + "epoch": 2.7771598071931773, + "grad_norm": 13.125, + "learning_rate": 7.222840192806822e-06, + "loss": 3.1908, + "mean_token_accuracy": 0.45592124595944755, + "step": 14980 + }, + { + "epoch": 2.7773451983685575, + "grad_norm": 10.25, + "learning_rate": 7.222654801631442e-06, + "loss": 2.9848, + "mean_token_accuracy": 0.4584593232541397, + "step": 14981 + }, + { + "epoch": 2.7775305895439377, + "grad_norm": 8.75, + "learning_rate": 7.222469410456063e-06, + "loss": 2.7519, + "mean_token_accuracy": 0.473582224518796, + "step": 14982 + }, + { + "epoch": 2.777715980719318, + "grad_norm": 11.4296875, + "learning_rate": 7.222284019280683e-06, + "loss": 2.0863, + "mean_token_accuracy": 0.5655685977533762, + "step": 14983 + }, + { + "epoch": 2.7779013718946977, + "grad_norm": 11.1484375, + "learning_rate": 7.222098628105303e-06, + "loss": 2.4308, + "mean_token_accuracy": 0.5125912408759125, + "step": 14984 + }, + { + "epoch": 2.778086763070078, + "grad_norm": 9.6015625, + "learning_rate": 7.221913236929923e-06, + "loss": 3.4935, + "mean_token_accuracy": 0.4396197327852004, + "step": 14985 + }, + { + "epoch": 2.778272154245458, + "grad_norm": 10.90625, + "learning_rate": 7.221727845754543e-06, + "loss": 3.1788, + "mean_token_accuracy": 0.46296296296296297, + "step": 14986 + }, + { + "epoch": 2.778457545420838, + "grad_norm": 7.67578125, + "learning_rate": 7.2215424545791624e-06, + "loss": 3.0737, + "mean_token_accuracy": 0.47532496144525227, + "step": 14987 + }, + { + "epoch": 2.778642936596218, + "grad_norm": 9.765625, + "learning_rate": 7.221357063403782e-06, + "loss": 2.587, + "mean_token_accuracy": 0.5180847399242163, + "step": 14988 + }, + { + "epoch": 2.778828327771598, + "grad_norm": 12.984375, + "learning_rate": 7.221171672228402e-06, + "loss": 2.7294, + "mean_token_accuracy": 0.4927049559981473, + "step": 14989 + }, + { + "epoch": 2.7790137189469784, + "grad_norm": 8.6640625, + "learning_rate": 7.220986281053023e-06, + "loss": 3.0906, + "mean_token_accuracy": 0.49673396674584325, + "step": 14990 + }, + { + "epoch": 2.779199110122358, + "grad_norm": 14.5703125, + "learning_rate": 7.220800889877643e-06, + "loss": 2.7881, + "mean_token_accuracy": 0.48788325076984873, + "step": 14991 + }, + { + "epoch": 2.7793845012977383, + "grad_norm": 13.0546875, + "learning_rate": 7.220615498702262e-06, + "loss": 5.5876, + "mean_token_accuracy": 0.4015338263489455, + "step": 14992 + }, + { + "epoch": 2.779569892473118, + "grad_norm": 8.4296875, + "learning_rate": 7.220430107526882e-06, + "loss": 3.1843, + "mean_token_accuracy": 0.48014077425842133, + "step": 14993 + }, + { + "epoch": 2.779755283648498, + "grad_norm": 8.9765625, + "learning_rate": 7.2202447163515025e-06, + "loss": 3.1532, + "mean_token_accuracy": 0.4792698457619924, + "step": 14994 + }, + { + "epoch": 2.7799406748238784, + "grad_norm": 11.609375, + "learning_rate": 7.220059325176122e-06, + "loss": 2.5985, + "mean_token_accuracy": 0.5000560726701806, + "step": 14995 + }, + { + "epoch": 2.7801260659992586, + "grad_norm": 8.21875, + "learning_rate": 7.219873934000742e-06, + "loss": 2.6073, + "mean_token_accuracy": 0.5141521043563869, + "step": 14996 + }, + { + "epoch": 2.7803114571746383, + "grad_norm": 6.44921875, + "learning_rate": 7.2196885428253614e-06, + "loss": 2.7888, + "mean_token_accuracy": 0.4883241758241758, + "step": 14997 + }, + { + "epoch": 2.7804968483500185, + "grad_norm": 9.015625, + "learning_rate": 7.219503151649983e-06, + "loss": 2.935, + "mean_token_accuracy": 0.4764511018291805, + "step": 14998 + }, + { + "epoch": 2.7806822395253987, + "grad_norm": 10.015625, + "learning_rate": 7.219317760474602e-06, + "loss": 3.5296, + "mean_token_accuracy": 0.4371382957493514, + "step": 14999 + }, + { + "epoch": 2.7808676307007785, + "grad_norm": 8.8671875, + "learning_rate": 7.219132369299222e-06, + "loss": 3.1964, + "mean_token_accuracy": 0.4735980352026197, + "step": 15000 + }, + { + "epoch": 2.7810530218761587, + "grad_norm": 8.578125, + "learning_rate": 7.218946978123842e-06, + "loss": 2.538, + "mean_token_accuracy": 0.49921996879875197, + "step": 15001 + }, + { + "epoch": 2.781238413051539, + "grad_norm": 10.828125, + "learning_rate": 7.218761586948461e-06, + "loss": 2.5354, + "mean_token_accuracy": 0.5108680210257138, + "step": 15002 + }, + { + "epoch": 2.781423804226919, + "grad_norm": 7.828125, + "learning_rate": 7.218576195773082e-06, + "loss": 3.3247, + "mean_token_accuracy": 0.4697816150330117, + "step": 15003 + }, + { + "epoch": 2.781609195402299, + "grad_norm": 6.66015625, + "learning_rate": 7.2183908045977015e-06, + "loss": 2.1294, + "mean_token_accuracy": 0.5935689797848446, + "step": 15004 + }, + { + "epoch": 2.781794586577679, + "grad_norm": 7.54296875, + "learning_rate": 7.218205413422321e-06, + "loss": 2.7886, + "mean_token_accuracy": 0.49103987884906614, + "step": 15005 + }, + { + "epoch": 2.7819799777530587, + "grad_norm": 7.3828125, + "learning_rate": 7.2180200222469425e-06, + "loss": 2.3306, + "mean_token_accuracy": 0.5389274634362535, + "step": 15006 + }, + { + "epoch": 2.782165368928439, + "grad_norm": 8.3203125, + "learning_rate": 7.217834631071562e-06, + "loss": 3.4698, + "mean_token_accuracy": 0.43117959342224527, + "step": 15007 + }, + { + "epoch": 2.782350760103819, + "grad_norm": 7.4609375, + "learning_rate": 7.217649239896182e-06, + "loss": 2.113, + "mean_token_accuracy": 0.6056830601092896, + "step": 15008 + }, + { + "epoch": 2.7825361512791993, + "grad_norm": 8.34375, + "learning_rate": 7.217463848720801e-06, + "loss": 3.5166, + "mean_token_accuracy": 0.45571004530542103, + "step": 15009 + }, + { + "epoch": 2.782721542454579, + "grad_norm": 7.1328125, + "learning_rate": 7.217278457545421e-06, + "loss": 2.8189, + "mean_token_accuracy": 0.46806822689408967, + "step": 15010 + }, + { + "epoch": 2.7829069336299592, + "grad_norm": 7.7265625, + "learning_rate": 7.2170930663700415e-06, + "loss": 2.3085, + "mean_token_accuracy": 0.5416575071444274, + "step": 15011 + }, + { + "epoch": 2.7830923248053394, + "grad_norm": 7.5078125, + "learning_rate": 7.216907675194661e-06, + "loss": 3.4242, + "mean_token_accuracy": 0.4415089546551505, + "step": 15012 + }, + { + "epoch": 2.783277715980719, + "grad_norm": 7.62890625, + "learning_rate": 7.216722284019281e-06, + "loss": 3.0251, + "mean_token_accuracy": 0.4640779751611382, + "step": 15013 + }, + { + "epoch": 2.7834631071560993, + "grad_norm": 9.9921875, + "learning_rate": 7.216536892843901e-06, + "loss": 3.2228, + "mean_token_accuracy": 0.46495726495726497, + "step": 15014 + }, + { + "epoch": 2.7836484983314795, + "grad_norm": 7.71484375, + "learning_rate": 7.216351501668522e-06, + "loss": 3.4503, + "mean_token_accuracy": 0.42964705882352944, + "step": 15015 + }, + { + "epoch": 2.7838338895068597, + "grad_norm": 7.24609375, + "learning_rate": 7.2161661104931415e-06, + "loss": 3.4894, + "mean_token_accuracy": 0.42843716433942, + "step": 15016 + }, + { + "epoch": 2.7840192806822395, + "grad_norm": 9.5859375, + "learning_rate": 7.215980719317761e-06, + "loss": 3.2611, + "mean_token_accuracy": 0.4607128113389294, + "step": 15017 + }, + { + "epoch": 2.7842046718576197, + "grad_norm": 9.4140625, + "learning_rate": 7.215795328142381e-06, + "loss": 2.9679, + "mean_token_accuracy": 0.5068640324698579, + "step": 15018 + }, + { + "epoch": 2.7843900630329994, + "grad_norm": 6.953125, + "learning_rate": 7.215609936967e-06, + "loss": 3.0607, + "mean_token_accuracy": 0.4737818451644818, + "step": 15019 + }, + { + "epoch": 2.7845754542083796, + "grad_norm": 7.83984375, + "learning_rate": 7.215424545791621e-06, + "loss": 2.5687, + "mean_token_accuracy": 0.5053626149131767, + "step": 15020 + }, + { + "epoch": 2.78476084538376, + "grad_norm": 11.4296875, + "learning_rate": 7.2152391546162405e-06, + "loss": 3.3697, + "mean_token_accuracy": 0.44071709900855627, + "step": 15021 + }, + { + "epoch": 2.78494623655914, + "grad_norm": 17.609375, + "learning_rate": 7.215053763440861e-06, + "loss": 3.9178, + "mean_token_accuracy": 0.4407461186194572, + "step": 15022 + }, + { + "epoch": 2.7851316277345197, + "grad_norm": 8.7109375, + "learning_rate": 7.2148683722654815e-06, + "loss": 2.8927, + "mean_token_accuracy": 0.46709792549631945, + "step": 15023 + }, + { + "epoch": 2.7853170189099, + "grad_norm": 9.9765625, + "learning_rate": 7.214682981090101e-06, + "loss": 2.5958, + "mean_token_accuracy": 0.5057632836660106, + "step": 15024 + }, + { + "epoch": 2.78550241008528, + "grad_norm": 9.1640625, + "learning_rate": 7.214497589914721e-06, + "loss": 3.4265, + "mean_token_accuracy": 0.4288164665523156, + "step": 15025 + }, + { + "epoch": 2.78568780126066, + "grad_norm": 8.796875, + "learning_rate": 7.2143121987393404e-06, + "loss": 2.9072, + "mean_token_accuracy": 0.5011644671176666, + "step": 15026 + }, + { + "epoch": 2.78587319243604, + "grad_norm": 7.63671875, + "learning_rate": 7.21412680756396e-06, + "loss": 2.8445, + "mean_token_accuracy": 0.5015437816475238, + "step": 15027 + }, + { + "epoch": 2.7860585836114202, + "grad_norm": 7.10546875, + "learning_rate": 7.21394141638858e-06, + "loss": 3.2668, + "mean_token_accuracy": 0.4474148459763876, + "step": 15028 + }, + { + "epoch": 2.7862439747868004, + "grad_norm": 9.5234375, + "learning_rate": 7.2137560252132e-06, + "loss": 2.1866, + "mean_token_accuracy": 0.5628367975365666, + "step": 15029 + }, + { + "epoch": 2.78642936596218, + "grad_norm": 6.87109375, + "learning_rate": 7.213570634037821e-06, + "loss": 2.8371, + "mean_token_accuracy": 0.47868999551368324, + "step": 15030 + }, + { + "epoch": 2.7866147571375603, + "grad_norm": 8.578125, + "learning_rate": 7.21338524286244e-06, + "loss": 2.3633, + "mean_token_accuracy": 0.5554833468724614, + "step": 15031 + }, + { + "epoch": 2.78680014831294, + "grad_norm": 8.1640625, + "learning_rate": 7.213199851687061e-06, + "loss": 2.6255, + "mean_token_accuracy": 0.5090479937057435, + "step": 15032 + }, + { + "epoch": 2.7869855394883203, + "grad_norm": 6.68359375, + "learning_rate": 7.2130144605116805e-06, + "loss": 2.8819, + "mean_token_accuracy": 0.47889930898321814, + "step": 15033 + }, + { + "epoch": 2.7871709306637005, + "grad_norm": 7.31640625, + "learning_rate": 7.2128290693363e-06, + "loss": 2.6332, + "mean_token_accuracy": 0.4926062846580407, + "step": 15034 + }, + { + "epoch": 2.7873563218390807, + "grad_norm": 8.4296875, + "learning_rate": 7.21264367816092e-06, + "loss": 2.9633, + "mean_token_accuracy": 0.47737989040397605, + "step": 15035 + }, + { + "epoch": 2.7875417130144604, + "grad_norm": 9.8203125, + "learning_rate": 7.212458286985539e-06, + "loss": 2.7091, + "mean_token_accuracy": 0.5327774214239898, + "step": 15036 + }, + { + "epoch": 2.7877271041898406, + "grad_norm": 8.421875, + "learning_rate": 7.21227289581016e-06, + "loss": 2.9695, + "mean_token_accuracy": 0.48814877841254406, + "step": 15037 + }, + { + "epoch": 2.7879124953652203, + "grad_norm": 7.53125, + "learning_rate": 7.21208750463478e-06, + "loss": 2.9865, + "mean_token_accuracy": 0.47276817428368456, + "step": 15038 + }, + { + "epoch": 2.7880978865406005, + "grad_norm": 9.25, + "learning_rate": 7.2119021134594e-06, + "loss": 2.7567, + "mean_token_accuracy": 0.48334174659262996, + "step": 15039 + }, + { + "epoch": 2.7882832777159807, + "grad_norm": 8.796875, + "learning_rate": 7.21171672228402e-06, + "loss": 3.1252, + "mean_token_accuracy": 0.46791775907271976, + "step": 15040 + }, + { + "epoch": 2.788468668891361, + "grad_norm": 9.0546875, + "learning_rate": 7.21153133110864e-06, + "loss": 2.6008, + "mean_token_accuracy": 0.4856509422342653, + "step": 15041 + }, + { + "epoch": 2.7886540600667407, + "grad_norm": 7.0625, + "learning_rate": 7.21134593993326e-06, + "loss": 2.8621, + "mean_token_accuracy": 0.4546548956661316, + "step": 15042 + }, + { + "epoch": 2.788839451242121, + "grad_norm": 8.71875, + "learning_rate": 7.2111605487578795e-06, + "loss": 2.4514, + "mean_token_accuracy": 0.5345345345345346, + "step": 15043 + }, + { + "epoch": 2.789024842417501, + "grad_norm": 10.9921875, + "learning_rate": 7.210975157582499e-06, + "loss": 3.3255, + "mean_token_accuracy": 0.4586641756188697, + "step": 15044 + }, + { + "epoch": 2.789210233592881, + "grad_norm": 8.7734375, + "learning_rate": 7.210789766407119e-06, + "loss": 2.3458, + "mean_token_accuracy": 0.5252192982456141, + "step": 15045 + }, + { + "epoch": 2.789395624768261, + "grad_norm": 8.2421875, + "learning_rate": 7.21060437523174e-06, + "loss": 2.5305, + "mean_token_accuracy": 0.5393713722483846, + "step": 15046 + }, + { + "epoch": 2.789581015943641, + "grad_norm": 10.1484375, + "learning_rate": 7.21041898405636e-06, + "loss": 3.818, + "mean_token_accuracy": 0.4524929444967074, + "step": 15047 + }, + { + "epoch": 2.7897664071190214, + "grad_norm": 8.4609375, + "learning_rate": 7.210233592880979e-06, + "loss": 3.044, + "mean_token_accuracy": 0.4785492671445384, + "step": 15048 + }, + { + "epoch": 2.789951798294401, + "grad_norm": 7.265625, + "learning_rate": 7.2100482017056e-06, + "loss": 2.505, + "mean_token_accuracy": 0.485740153915799, + "step": 15049 + }, + { + "epoch": 2.7901371894697813, + "grad_norm": 10.0703125, + "learning_rate": 7.2098628105302195e-06, + "loss": 3.4323, + "mean_token_accuracy": 0.45092804474955506, + "step": 15050 + }, + { + "epoch": 2.790322580645161, + "grad_norm": 10.609375, + "learning_rate": 7.209677419354839e-06, + "loss": 2.5447, + "mean_token_accuracy": 0.4824062095730918, + "step": 15051 + }, + { + "epoch": 2.790507971820541, + "grad_norm": 10.390625, + "learning_rate": 7.209492028179459e-06, + "loss": 2.843, + "mean_token_accuracy": 0.4775125832115603, + "step": 15052 + }, + { + "epoch": 2.7906933629959214, + "grad_norm": 11.9609375, + "learning_rate": 7.2093066370040785e-06, + "loss": 3.2672, + "mean_token_accuracy": 0.4845197142101085, + "step": 15053 + }, + { + "epoch": 2.7908787541713016, + "grad_norm": 17.46875, + "learning_rate": 7.2091212458287e-06, + "loss": 2.9087, + "mean_token_accuracy": 0.463458974669737, + "step": 15054 + }, + { + "epoch": 2.7910641453466813, + "grad_norm": 12.7578125, + "learning_rate": 7.2089358546533194e-06, + "loss": 2.923, + "mean_token_accuracy": 0.48713186521623775, + "step": 15055 + }, + { + "epoch": 2.7912495365220615, + "grad_norm": 9.4453125, + "learning_rate": 7.208750463477939e-06, + "loss": 3.3041, + "mean_token_accuracy": 0.45218340611353713, + "step": 15056 + }, + { + "epoch": 2.7914349276974417, + "grad_norm": 13.03125, + "learning_rate": 7.208565072302559e-06, + "loss": 2.4398, + "mean_token_accuracy": 0.5108866736621196, + "step": 15057 + }, + { + "epoch": 2.7916203188728215, + "grad_norm": 8.8515625, + "learning_rate": 7.208379681127179e-06, + "loss": 2.3937, + "mean_token_accuracy": 0.5191654536632703, + "step": 15058 + }, + { + "epoch": 2.7918057100482017, + "grad_norm": 7.109375, + "learning_rate": 7.208194289951799e-06, + "loss": 3.0377, + "mean_token_accuracy": 0.43545699269324445, + "step": 15059 + }, + { + "epoch": 2.791991101223582, + "grad_norm": 6.77734375, + "learning_rate": 7.2080088987764185e-06, + "loss": 3.2756, + "mean_token_accuracy": 0.461275925439748, + "step": 15060 + }, + { + "epoch": 2.792176492398962, + "grad_norm": 10.203125, + "learning_rate": 7.207823507601038e-06, + "loss": 2.8231, + "mean_token_accuracy": 0.4801780354000621, + "step": 15061 + }, + { + "epoch": 2.792361883574342, + "grad_norm": 7.87890625, + "learning_rate": 7.2076381164256595e-06, + "loss": 2.8007, + "mean_token_accuracy": 0.4960419341035516, + "step": 15062 + }, + { + "epoch": 2.792547274749722, + "grad_norm": 7.36328125, + "learning_rate": 7.207452725250279e-06, + "loss": 2.7081, + "mean_token_accuracy": 0.5051770207080828, + "step": 15063 + }, + { + "epoch": 2.7927326659251017, + "grad_norm": 7.9453125, + "learning_rate": 7.207267334074899e-06, + "loss": 3.7494, + "mean_token_accuracy": 0.4190836485918453, + "step": 15064 + }, + { + "epoch": 2.792918057100482, + "grad_norm": 10.7578125, + "learning_rate": 7.207081942899518e-06, + "loss": 2.8866, + "mean_token_accuracy": 0.5111214087117701, + "step": 15065 + }, + { + "epoch": 2.793103448275862, + "grad_norm": 6.9375, + "learning_rate": 7.206896551724139e-06, + "loss": 3.004, + "mean_token_accuracy": 0.4711782989545069, + "step": 15066 + }, + { + "epoch": 2.7932888394512423, + "grad_norm": 8.3359375, + "learning_rate": 7.2067111605487586e-06, + "loss": 3.2717, + "mean_token_accuracy": 0.46502000827928797, + "step": 15067 + }, + { + "epoch": 2.793474230626622, + "grad_norm": 6.734375, + "learning_rate": 7.206525769373378e-06, + "loss": 3.067, + "mean_token_accuracy": 0.47590435315757207, + "step": 15068 + }, + { + "epoch": 2.7936596218020022, + "grad_norm": 7.33203125, + "learning_rate": 7.206340378197998e-06, + "loss": 2.8713, + "mean_token_accuracy": 0.4911985648615316, + "step": 15069 + }, + { + "epoch": 2.7938450129773824, + "grad_norm": 6.31640625, + "learning_rate": 7.206154987022619e-06, + "loss": 2.8797, + "mean_token_accuracy": 0.47644764476447643, + "step": 15070 + }, + { + "epoch": 2.794030404152762, + "grad_norm": 8.3125, + "learning_rate": 7.205969595847239e-06, + "loss": 3.1379, + "mean_token_accuracy": 0.47233887164506116, + "step": 15071 + }, + { + "epoch": 2.7942157953281423, + "grad_norm": 8.875, + "learning_rate": 7.2057842046718585e-06, + "loss": 2.9323, + "mean_token_accuracy": 0.45787384208204673, + "step": 15072 + }, + { + "epoch": 2.7944011865035225, + "grad_norm": 6.67578125, + "learning_rate": 7.205598813496478e-06, + "loss": 2.9699, + "mean_token_accuracy": 0.4838482155452109, + "step": 15073 + }, + { + "epoch": 2.7945865776789027, + "grad_norm": 7.71484375, + "learning_rate": 7.205413422321098e-06, + "loss": 2.8495, + "mean_token_accuracy": 0.4518233717814284, + "step": 15074 + }, + { + "epoch": 2.7947719688542825, + "grad_norm": 8.296875, + "learning_rate": 7.205228031145718e-06, + "loss": 2.7054, + "mean_token_accuracy": 0.4866317725279389, + "step": 15075 + }, + { + "epoch": 2.7949573600296627, + "grad_norm": 9.2890625, + "learning_rate": 7.205042639970338e-06, + "loss": 2.8962, + "mean_token_accuracy": 0.45772241208246056, + "step": 15076 + }, + { + "epoch": 2.7951427512050424, + "grad_norm": 7.94921875, + "learning_rate": 7.2048572487949575e-06, + "loss": 2.7764, + "mean_token_accuracy": 0.48885761201032135, + "step": 15077 + }, + { + "epoch": 2.7953281423804226, + "grad_norm": 6.84375, + "learning_rate": 7.204671857619578e-06, + "loss": 3.0983, + "mean_token_accuracy": 0.4557510148849797, + "step": 15078 + }, + { + "epoch": 2.795513533555803, + "grad_norm": 10.078125, + "learning_rate": 7.2044864664441985e-06, + "loss": 3.5627, + "mean_token_accuracy": 0.424390243902439, + "step": 15079 + }, + { + "epoch": 2.795698924731183, + "grad_norm": 9.296875, + "learning_rate": 7.204301075268818e-06, + "loss": 3.1225, + "mean_token_accuracy": 0.45311916324243556, + "step": 15080 + }, + { + "epoch": 2.7958843159065627, + "grad_norm": 9.078125, + "learning_rate": 7.204115684093438e-06, + "loss": 3.2977, + "mean_token_accuracy": 0.44464896134630555, + "step": 15081 + }, + { + "epoch": 2.796069707081943, + "grad_norm": 9.1484375, + "learning_rate": 7.2039302929180575e-06, + "loss": 3.0301, + "mean_token_accuracy": 0.4696875, + "step": 15082 + }, + { + "epoch": 2.796255098257323, + "grad_norm": 8.8515625, + "learning_rate": 7.203744901742677e-06, + "loss": 3.1958, + "mean_token_accuracy": 0.4553934105611554, + "step": 15083 + }, + { + "epoch": 2.796440489432703, + "grad_norm": 10.5859375, + "learning_rate": 7.203559510567298e-06, + "loss": 3.6017, + "mean_token_accuracy": 0.42387399834208345, + "step": 15084 + }, + { + "epoch": 2.796625880608083, + "grad_norm": 8.953125, + "learning_rate": 7.203374119391917e-06, + "loss": 2.7305, + "mean_token_accuracy": 0.49435787211176785, + "step": 15085 + }, + { + "epoch": 2.7968112717834632, + "grad_norm": 11.265625, + "learning_rate": 7.203188728216538e-06, + "loss": 2.6826, + "mean_token_accuracy": 0.49415053318148877, + "step": 15086 + }, + { + "epoch": 2.7969966629588434, + "grad_norm": 8.8828125, + "learning_rate": 7.203003337041158e-06, + "loss": 3.0542, + "mean_token_accuracy": 0.4573622524585747, + "step": 15087 + }, + { + "epoch": 2.797182054134223, + "grad_norm": 6.73828125, + "learning_rate": 7.202817945865778e-06, + "loss": 3.0894, + "mean_token_accuracy": 0.4755719557195572, + "step": 15088 + }, + { + "epoch": 2.7973674453096034, + "grad_norm": 9.296875, + "learning_rate": 7.2026325546903975e-06, + "loss": 2.3044, + "mean_token_accuracy": 0.5953451043338683, + "step": 15089 + }, + { + "epoch": 2.797552836484983, + "grad_norm": 9.546875, + "learning_rate": 7.202447163515017e-06, + "loss": 3.764, + "mean_token_accuracy": 0.4209486166007905, + "step": 15090 + }, + { + "epoch": 2.7977382276603633, + "grad_norm": 8.765625, + "learning_rate": 7.202261772339637e-06, + "loss": 2.9009, + "mean_token_accuracy": 0.4816078877512325, + "step": 15091 + }, + { + "epoch": 2.7979236188357435, + "grad_norm": 9.3671875, + "learning_rate": 7.202076381164257e-06, + "loss": 3.3386, + "mean_token_accuracy": 0.43972761518400466, + "step": 15092 + }, + { + "epoch": 2.7981090100111237, + "grad_norm": 8.0078125, + "learning_rate": 7.201890989988877e-06, + "loss": 3.0692, + "mean_token_accuracy": 0.4766125732987863, + "step": 15093 + }, + { + "epoch": 2.7982944011865034, + "grad_norm": 9.4609375, + "learning_rate": 7.2017055988134966e-06, + "loss": 2.8689, + "mean_token_accuracy": 0.46838276440962506, + "step": 15094 + }, + { + "epoch": 2.7984797923618836, + "grad_norm": 6.984375, + "learning_rate": 7.201520207638117e-06, + "loss": 2.8181, + "mean_token_accuracy": 0.4955522609340252, + "step": 15095 + }, + { + "epoch": 2.798665183537264, + "grad_norm": 8.609375, + "learning_rate": 7.2013348164627376e-06, + "loss": 2.7688, + "mean_token_accuracy": 0.5122669283611384, + "step": 15096 + }, + { + "epoch": 2.7988505747126435, + "grad_norm": 7.69140625, + "learning_rate": 7.201149425287357e-06, + "loss": 2.5302, + "mean_token_accuracy": 0.5167961895211832, + "step": 15097 + }, + { + "epoch": 2.7990359658880237, + "grad_norm": 7.640625, + "learning_rate": 7.200964034111977e-06, + "loss": 2.8601, + "mean_token_accuracy": 0.4876747873720629, + "step": 15098 + }, + { + "epoch": 2.799221357063404, + "grad_norm": 7.51171875, + "learning_rate": 7.2007786429365965e-06, + "loss": 3.0752, + "mean_token_accuracy": 0.46550598476605004, + "step": 15099 + }, + { + "epoch": 2.799406748238784, + "grad_norm": 7.12890625, + "learning_rate": 7.200593251761216e-06, + "loss": 2.7282, + "mean_token_accuracy": 0.4918970448045758, + "step": 15100 + }, + { + "epoch": 2.799592139414164, + "grad_norm": 6.84375, + "learning_rate": 7.200407860585837e-06, + "loss": 2.718, + "mean_token_accuracy": 0.4981715229444379, + "step": 15101 + }, + { + "epoch": 2.799777530589544, + "grad_norm": 7.33203125, + "learning_rate": 7.200222469410456e-06, + "loss": 2.9067, + "mean_token_accuracy": 0.5020752826678117, + "step": 15102 + }, + { + "epoch": 2.799962921764924, + "grad_norm": 8.609375, + "learning_rate": 7.200037078235077e-06, + "loss": 3.3783, + "mean_token_accuracy": 0.4369772137294491, + "step": 15103 + }, + { + "epoch": 2.800148312940304, + "grad_norm": 10.0078125, + "learning_rate": 7.199851687059697e-06, + "loss": 2.4701, + "mean_token_accuracy": 0.5457623161335512, + "step": 15104 + }, + { + "epoch": 2.800333704115684, + "grad_norm": 7.625, + "learning_rate": 7.199666295884317e-06, + "loss": 3.1045, + "mean_token_accuracy": 0.47562008469449485, + "step": 15105 + }, + { + "epoch": 2.8005190952910644, + "grad_norm": 7.46484375, + "learning_rate": 7.1994809047089365e-06, + "loss": 3.0907, + "mean_token_accuracy": 0.4813863928112965, + "step": 15106 + }, + { + "epoch": 2.800704486466444, + "grad_norm": 7.828125, + "learning_rate": 7.199295513533556e-06, + "loss": 3.2976, + "mean_token_accuracy": 0.4475277497477296, + "step": 15107 + }, + { + "epoch": 2.8008898776418243, + "grad_norm": 8.1875, + "learning_rate": 7.199110122358176e-06, + "loss": 2.9235, + "mean_token_accuracy": 0.49774236387782206, + "step": 15108 + }, + { + "epoch": 2.801075268817204, + "grad_norm": 8.4140625, + "learning_rate": 7.1989247311827955e-06, + "loss": 2.9451, + "mean_token_accuracy": 0.4848860257680872, + "step": 15109 + }, + { + "epoch": 2.8012606599925842, + "grad_norm": 7.921875, + "learning_rate": 7.198739340007416e-06, + "loss": 2.8936, + "mean_token_accuracy": 0.4786989795918367, + "step": 15110 + }, + { + "epoch": 2.8014460511679644, + "grad_norm": 8.1875, + "learning_rate": 7.1985539488320365e-06, + "loss": 3.1427, + "mean_token_accuracy": 0.46887136636459925, + "step": 15111 + }, + { + "epoch": 2.8016314423433446, + "grad_norm": 7.64453125, + "learning_rate": 7.198368557656656e-06, + "loss": 2.6006, + "mean_token_accuracy": 0.5046799516908212, + "step": 15112 + }, + { + "epoch": 2.8018168335187243, + "grad_norm": 7.05859375, + "learning_rate": 7.198183166481277e-06, + "loss": 3.0696, + "mean_token_accuracy": 0.47130061814053237, + "step": 15113 + }, + { + "epoch": 2.8020022246941045, + "grad_norm": 6.87109375, + "learning_rate": 7.197997775305896e-06, + "loss": 2.9914, + "mean_token_accuracy": 0.4650147492625369, + "step": 15114 + }, + { + "epoch": 2.8021876158694847, + "grad_norm": 7.3125, + "learning_rate": 7.197812384130516e-06, + "loss": 2.788, + "mean_token_accuracy": 0.4853542234332425, + "step": 15115 + }, + { + "epoch": 2.8023730070448645, + "grad_norm": 7.62109375, + "learning_rate": 7.1976269929551355e-06, + "loss": 2.5486, + "mean_token_accuracy": 0.4947674418604651, + "step": 15116 + }, + { + "epoch": 2.8025583982202447, + "grad_norm": 7.9765625, + "learning_rate": 7.197441601779755e-06, + "loss": 2.9243, + "mean_token_accuracy": 0.48503801815060094, + "step": 15117 + }, + { + "epoch": 2.802743789395625, + "grad_norm": 7.1796875, + "learning_rate": 7.197256210604376e-06, + "loss": 2.8176, + "mean_token_accuracy": 0.48723868154418365, + "step": 15118 + }, + { + "epoch": 2.802929180571005, + "grad_norm": 7.1171875, + "learning_rate": 7.197070819428996e-06, + "loss": 2.9805, + "mean_token_accuracy": 0.4750103263114416, + "step": 15119 + }, + { + "epoch": 2.803114571746385, + "grad_norm": 8.3828125, + "learning_rate": 7.196885428253616e-06, + "loss": 3.0855, + "mean_token_accuracy": 0.4650530675640694, + "step": 15120 + }, + { + "epoch": 2.803299962921765, + "grad_norm": 8.0390625, + "learning_rate": 7.1967000370782354e-06, + "loss": 3.2994, + "mean_token_accuracy": 0.4635239777204184, + "step": 15121 + }, + { + "epoch": 2.8034853540971447, + "grad_norm": 8.2890625, + "learning_rate": 7.196514645902856e-06, + "loss": 3.2627, + "mean_token_accuracy": 0.4201581872595126, + "step": 15122 + }, + { + "epoch": 2.803670745272525, + "grad_norm": 6.65625, + "learning_rate": 7.196329254727476e-06, + "loss": 2.4691, + "mean_token_accuracy": 0.5175804032661223, + "step": 15123 + }, + { + "epoch": 2.803856136447905, + "grad_norm": 8.6015625, + "learning_rate": 7.196143863552095e-06, + "loss": 2.3659, + "mean_token_accuracy": 0.5158619080942384, + "step": 15124 + }, + { + "epoch": 2.8040415276232853, + "grad_norm": 7.83203125, + "learning_rate": 7.195958472376715e-06, + "loss": 3.032, + "mean_token_accuracy": 0.45323002240477966, + "step": 15125 + }, + { + "epoch": 2.804226918798665, + "grad_norm": 7.14453125, + "learning_rate": 7.1957730812013345e-06, + "loss": 2.7285, + "mean_token_accuracy": 0.5055790108564535, + "step": 15126 + }, + { + "epoch": 2.8044123099740452, + "grad_norm": 9.1015625, + "learning_rate": 7.195587690025956e-06, + "loss": 2.7358, + "mean_token_accuracy": 0.4887051700046577, + "step": 15127 + }, + { + "epoch": 2.8045977011494254, + "grad_norm": 7.56640625, + "learning_rate": 7.1954022988505755e-06, + "loss": 3.0218, + "mean_token_accuracy": 0.4885626493683851, + "step": 15128 + }, + { + "epoch": 2.804783092324805, + "grad_norm": 6.83984375, + "learning_rate": 7.195216907675195e-06, + "loss": 2.511, + "mean_token_accuracy": 0.4998406628425749, + "step": 15129 + }, + { + "epoch": 2.8049684835001854, + "grad_norm": 8.5703125, + "learning_rate": 7.195031516499816e-06, + "loss": 2.6137, + "mean_token_accuracy": 0.481037367540435, + "step": 15130 + }, + { + "epoch": 2.8051538746755655, + "grad_norm": 7.9609375, + "learning_rate": 7.194846125324435e-06, + "loss": 3.7774, + "mean_token_accuracy": 0.45529253035692385, + "step": 15131 + }, + { + "epoch": 2.8053392658509457, + "grad_norm": 8.5546875, + "learning_rate": 7.194660734149055e-06, + "loss": 3.5504, + "mean_token_accuracy": 0.44033302497687327, + "step": 15132 + }, + { + "epoch": 2.8055246570263255, + "grad_norm": 8.140625, + "learning_rate": 7.1944753429736746e-06, + "loss": 3.2215, + "mean_token_accuracy": 0.4429657794676806, + "step": 15133 + }, + { + "epoch": 2.8057100482017057, + "grad_norm": 8.2734375, + "learning_rate": 7.194289951798294e-06, + "loss": 3.0829, + "mean_token_accuracy": 0.48085306940025924, + "step": 15134 + }, + { + "epoch": 2.8058954393770854, + "grad_norm": 9.6875, + "learning_rate": 7.1941045606229155e-06, + "loss": 2.5775, + "mean_token_accuracy": 0.5050999592003264, + "step": 15135 + }, + { + "epoch": 2.8060808305524656, + "grad_norm": 9.6953125, + "learning_rate": 7.193919169447535e-06, + "loss": 3.6068, + "mean_token_accuracy": 0.4398965478466209, + "step": 15136 + }, + { + "epoch": 2.806266221727846, + "grad_norm": 7.83984375, + "learning_rate": 7.193733778272155e-06, + "loss": 3.025, + "mean_token_accuracy": 0.4621919119406557, + "step": 15137 + }, + { + "epoch": 2.806451612903226, + "grad_norm": 7.921875, + "learning_rate": 7.1935483870967745e-06, + "loss": 2.8591, + "mean_token_accuracy": 0.5216637781629117, + "step": 15138 + }, + { + "epoch": 2.8066370040786057, + "grad_norm": 9.328125, + "learning_rate": 7.193362995921395e-06, + "loss": 3.0351, + "mean_token_accuracy": 0.46036892118501954, + "step": 15139 + }, + { + "epoch": 2.806822395253986, + "grad_norm": 8.2578125, + "learning_rate": 7.193177604746015e-06, + "loss": 2.9143, + "mean_token_accuracy": 0.4710926694329184, + "step": 15140 + }, + { + "epoch": 2.807007786429366, + "grad_norm": 7.640625, + "learning_rate": 7.192992213570634e-06, + "loss": 3.4673, + "mean_token_accuracy": 0.44758862011258177, + "step": 15141 + }, + { + "epoch": 2.807193177604746, + "grad_norm": 7.21484375, + "learning_rate": 7.192806822395254e-06, + "loss": 2.5494, + "mean_token_accuracy": 0.492918961447679, + "step": 15142 + }, + { + "epoch": 2.807378568780126, + "grad_norm": 8.84375, + "learning_rate": 7.192621431219875e-06, + "loss": 2.9263, + "mean_token_accuracy": 0.4803836094158675, + "step": 15143 + }, + { + "epoch": 2.8075639599555062, + "grad_norm": 7.59765625, + "learning_rate": 7.192436040044495e-06, + "loss": 2.5769, + "mean_token_accuracy": 0.49435417240429635, + "step": 15144 + }, + { + "epoch": 2.8077493511308864, + "grad_norm": 8.9765625, + "learning_rate": 7.1922506488691145e-06, + "loss": 2.3778, + "mean_token_accuracy": 0.5434726411029729, + "step": 15145 + }, + { + "epoch": 2.807934742306266, + "grad_norm": 8.25, + "learning_rate": 7.192065257693734e-06, + "loss": 3.5127, + "mean_token_accuracy": 0.43467960288808666, + "step": 15146 + }, + { + "epoch": 2.8081201334816464, + "grad_norm": 11.3984375, + "learning_rate": 7.191879866518354e-06, + "loss": 3.3858, + "mean_token_accuracy": 0.47497232812692164, + "step": 15147 + }, + { + "epoch": 2.808305524657026, + "grad_norm": 8.4765625, + "learning_rate": 7.191694475342974e-06, + "loss": 2.9619, + "mean_token_accuracy": 0.47221255366051745, + "step": 15148 + }, + { + "epoch": 2.8084909158324063, + "grad_norm": 6.83984375, + "learning_rate": 7.191509084167594e-06, + "loss": 2.5992, + "mean_token_accuracy": 0.5071653768988249, + "step": 15149 + }, + { + "epoch": 2.8086763070077865, + "grad_norm": 8.734375, + "learning_rate": 7.191323692992214e-06, + "loss": 2.4153, + "mean_token_accuracy": 0.5380348652931854, + "step": 15150 + }, + { + "epoch": 2.8088616981831667, + "grad_norm": 8.515625, + "learning_rate": 7.191138301816835e-06, + "loss": 2.4626, + "mean_token_accuracy": 0.50625, + "step": 15151 + }, + { + "epoch": 2.8090470893585464, + "grad_norm": 7.6328125, + "learning_rate": 7.190952910641455e-06, + "loss": 2.422, + "mean_token_accuracy": 0.5230891719745223, + "step": 15152 + }, + { + "epoch": 2.8092324805339266, + "grad_norm": 7.58984375, + "learning_rate": 7.190767519466074e-06, + "loss": 3.325, + "mean_token_accuracy": 0.4875606796116505, + "step": 15153 + }, + { + "epoch": 2.809417871709307, + "grad_norm": 8.90625, + "learning_rate": 7.190582128290694e-06, + "loss": 2.7585, + "mean_token_accuracy": 0.5080741626794258, + "step": 15154 + }, + { + "epoch": 2.8096032628846865, + "grad_norm": 6.6953125, + "learning_rate": 7.1903967371153135e-06, + "loss": 2.7595, + "mean_token_accuracy": 0.49667908634375507, + "step": 15155 + }, + { + "epoch": 2.8097886540600667, + "grad_norm": 6.79296875, + "learning_rate": 7.190211345939934e-06, + "loss": 3.0433, + "mean_token_accuracy": 0.4691623197299785, + "step": 15156 + }, + { + "epoch": 2.809974045235447, + "grad_norm": 6.74609375, + "learning_rate": 7.190025954764554e-06, + "loss": 3.007, + "mean_token_accuracy": 0.4715125017078836, + "step": 15157 + }, + { + "epoch": 2.810159436410827, + "grad_norm": 9.203125, + "learning_rate": 7.189840563589173e-06, + "loss": 3.114, + "mean_token_accuracy": 0.456773766147731, + "step": 15158 + }, + { + "epoch": 2.810344827586207, + "grad_norm": 10.65625, + "learning_rate": 7.189655172413794e-06, + "loss": 3.6596, + "mean_token_accuracy": 0.43829617834394907, + "step": 15159 + }, + { + "epoch": 2.810530218761587, + "grad_norm": 8.5234375, + "learning_rate": 7.189469781238414e-06, + "loss": 3.2839, + "mean_token_accuracy": 0.44046218037873114, + "step": 15160 + }, + { + "epoch": 2.810715609936967, + "grad_norm": 7.04296875, + "learning_rate": 7.189284390063034e-06, + "loss": 3.0344, + "mean_token_accuracy": 0.4515198579986687, + "step": 15161 + }, + { + "epoch": 2.810901001112347, + "grad_norm": 6.77734375, + "learning_rate": 7.1890989988876536e-06, + "loss": 2.4698, + "mean_token_accuracy": 0.5353876306620209, + "step": 15162 + }, + { + "epoch": 2.811086392287727, + "grad_norm": 8.890625, + "learning_rate": 7.188913607712273e-06, + "loss": 3.6128, + "mean_token_accuracy": 0.4655195772930497, + "step": 15163 + }, + { + "epoch": 2.8112717834631074, + "grad_norm": 9.3515625, + "learning_rate": 7.188728216536893e-06, + "loss": 2.8803, + "mean_token_accuracy": 0.4988600740951838, + "step": 15164 + }, + { + "epoch": 2.811457174638487, + "grad_norm": 6.73046875, + "learning_rate": 7.188542825361513e-06, + "loss": 3.0146, + "mean_token_accuracy": 0.4668587896253602, + "step": 15165 + }, + { + "epoch": 2.8116425658138673, + "grad_norm": 9.171875, + "learning_rate": 7.188357434186133e-06, + "loss": 3.4488, + "mean_token_accuracy": 0.45190453230472516, + "step": 15166 + }, + { + "epoch": 2.811827956989247, + "grad_norm": 7.8125, + "learning_rate": 7.1881720430107535e-06, + "loss": 2.5176, + "mean_token_accuracy": 0.5082615306639635, + "step": 15167 + }, + { + "epoch": 2.8120133481646272, + "grad_norm": 8.15625, + "learning_rate": 7.187986651835374e-06, + "loss": 2.1421, + "mean_token_accuracy": 0.5490694717373714, + "step": 15168 + }, + { + "epoch": 2.8121987393400074, + "grad_norm": 7.51953125, + "learning_rate": 7.187801260659994e-06, + "loss": 3.1388, + "mean_token_accuracy": 0.46170742422023725, + "step": 15169 + }, + { + "epoch": 2.8123841305153876, + "grad_norm": 10.828125, + "learning_rate": 7.187615869484613e-06, + "loss": 3.336, + "mean_token_accuracy": 0.45003417634996584, + "step": 15170 + }, + { + "epoch": 2.812569521690768, + "grad_norm": 6.4375, + "learning_rate": 7.187430478309233e-06, + "loss": 2.8683, + "mean_token_accuracy": 0.46730370711489755, + "step": 15171 + }, + { + "epoch": 2.8127549128661475, + "grad_norm": 9.234375, + "learning_rate": 7.1872450871338525e-06, + "loss": 3.2277, + "mean_token_accuracy": 0.465564738292011, + "step": 15172 + }, + { + "epoch": 2.8129403040415277, + "grad_norm": 10.3046875, + "learning_rate": 7.187059695958473e-06, + "loss": 2.6012, + "mean_token_accuracy": 0.4836394948335247, + "step": 15173 + }, + { + "epoch": 2.8131256952169075, + "grad_norm": 9.03125, + "learning_rate": 7.186874304783093e-06, + "loss": 3.6846, + "mean_token_accuracy": 0.44680851063829785, + "step": 15174 + }, + { + "epoch": 2.8133110863922877, + "grad_norm": 8.2734375, + "learning_rate": 7.186688913607713e-06, + "loss": 2.9356, + "mean_token_accuracy": 0.48729216152019, + "step": 15175 + }, + { + "epoch": 2.813496477567668, + "grad_norm": 10.5390625, + "learning_rate": 7.186503522432333e-06, + "loss": 2.5329, + "mean_token_accuracy": 0.5306863434059499, + "step": 15176 + }, + { + "epoch": 2.813681868743048, + "grad_norm": 6.7578125, + "learning_rate": 7.186318131256953e-06, + "loss": 2.7566, + "mean_token_accuracy": 0.4843462246777164, + "step": 15177 + }, + { + "epoch": 2.813867259918428, + "grad_norm": 8.6171875, + "learning_rate": 7.186132740081573e-06, + "loss": 3.4107, + "mean_token_accuracy": 0.46724744513318817, + "step": 15178 + }, + { + "epoch": 2.814052651093808, + "grad_norm": 9.0390625, + "learning_rate": 7.185947348906193e-06, + "loss": 2.5768, + "mean_token_accuracy": 0.5283747886983821, + "step": 15179 + }, + { + "epoch": 2.8142380422691877, + "grad_norm": 7.515625, + "learning_rate": 7.185761957730812e-06, + "loss": 3.4027, + "mean_token_accuracy": 0.4648182665424045, + "step": 15180 + }, + { + "epoch": 2.814423433444568, + "grad_norm": 7.15625, + "learning_rate": 7.185576566555432e-06, + "loss": 2.3864, + "mean_token_accuracy": 0.5327638295655798, + "step": 15181 + }, + { + "epoch": 2.814608824619948, + "grad_norm": 9.140625, + "learning_rate": 7.185391175380052e-06, + "loss": 3.1763, + "mean_token_accuracy": 0.46565888925402565, + "step": 15182 + }, + { + "epoch": 2.8147942157953283, + "grad_norm": 8.84375, + "learning_rate": 7.185205784204673e-06, + "loss": 2.7451, + "mean_token_accuracy": 0.5076026355803345, + "step": 15183 + }, + { + "epoch": 2.814979606970708, + "grad_norm": 7.37890625, + "learning_rate": 7.1850203930292925e-06, + "loss": 2.8083, + "mean_token_accuracy": 0.49742853725630903, + "step": 15184 + }, + { + "epoch": 2.8151649981460882, + "grad_norm": 9.28125, + "learning_rate": 7.184835001853913e-06, + "loss": 2.9839, + "mean_token_accuracy": 0.46620011911852294, + "step": 15185 + }, + { + "epoch": 2.8153503893214684, + "grad_norm": 10.765625, + "learning_rate": 7.184649610678533e-06, + "loss": 3.2862, + "mean_token_accuracy": 0.46138691883372734, + "step": 15186 + }, + { + "epoch": 2.815535780496848, + "grad_norm": 7.80078125, + "learning_rate": 7.184464219503152e-06, + "loss": 2.8552, + "mean_token_accuracy": 0.47386581469648564, + "step": 15187 + }, + { + "epoch": 2.8157211716722284, + "grad_norm": 8.3125, + "learning_rate": 7.184278828327772e-06, + "loss": 3.495, + "mean_token_accuracy": 0.42829525483304043, + "step": 15188 + }, + { + "epoch": 2.8159065628476085, + "grad_norm": 16.546875, + "learning_rate": 7.184093437152392e-06, + "loss": 3.1005, + "mean_token_accuracy": 0.4606645839250213, + "step": 15189 + }, + { + "epoch": 2.8160919540229887, + "grad_norm": 11.5625, + "learning_rate": 7.183908045977011e-06, + "loss": 3.435, + "mean_token_accuracy": 0.44716657126502574, + "step": 15190 + }, + { + "epoch": 2.8162773451983685, + "grad_norm": 7.79296875, + "learning_rate": 7.1837226548016326e-06, + "loss": 2.4775, + "mean_token_accuracy": 0.5566850035876585, + "step": 15191 + }, + { + "epoch": 2.8164627363737487, + "grad_norm": 12.234375, + "learning_rate": 7.183537263626252e-06, + "loss": 2.7201, + "mean_token_accuracy": 0.5015587978550942, + "step": 15192 + }, + { + "epoch": 2.8166481275491284, + "grad_norm": 12.3203125, + "learning_rate": 7.183351872450872e-06, + "loss": 3.244, + "mean_token_accuracy": 0.45494887131005207, + "step": 15193 + }, + { + "epoch": 2.8168335187245086, + "grad_norm": 10.0546875, + "learning_rate": 7.183166481275492e-06, + "loss": 2.8202, + "mean_token_accuracy": 0.48370031455533313, + "step": 15194 + }, + { + "epoch": 2.817018909899889, + "grad_norm": 8.7578125, + "learning_rate": 7.182981090100112e-06, + "loss": 2.5972, + "mean_token_accuracy": 0.5006048038707448, + "step": 15195 + }, + { + "epoch": 2.817204301075269, + "grad_norm": 8.90625, + "learning_rate": 7.182795698924732e-06, + "loss": 2.8367, + "mean_token_accuracy": 0.5017137454201631, + "step": 15196 + }, + { + "epoch": 2.8173896922506487, + "grad_norm": 7.4453125, + "learning_rate": 7.182610307749351e-06, + "loss": 2.0616, + "mean_token_accuracy": 0.5739686228936665, + "step": 15197 + }, + { + "epoch": 2.817575083426029, + "grad_norm": 7.31640625, + "learning_rate": 7.182424916573971e-06, + "loss": 3.1897, + "mean_token_accuracy": 0.4597519455252918, + "step": 15198 + }, + { + "epoch": 2.817760474601409, + "grad_norm": 9.9140625, + "learning_rate": 7.182239525398592e-06, + "loss": 3.3992, + "mean_token_accuracy": 0.4260727318484963, + "step": 15199 + }, + { + "epoch": 2.817945865776789, + "grad_norm": 10.1171875, + "learning_rate": 7.182054134223212e-06, + "loss": 2.8729, + "mean_token_accuracy": 0.46733264439086003, + "step": 15200 + }, + { + "epoch": 2.818131256952169, + "grad_norm": 8.4765625, + "learning_rate": 7.1818687430478315e-06, + "loss": 3.1042, + "mean_token_accuracy": 0.49048072346501664, + "step": 15201 + }, + { + "epoch": 2.8183166481275492, + "grad_norm": 11.34375, + "learning_rate": 7.181683351872451e-06, + "loss": 3.5828, + "mean_token_accuracy": 0.4350017582932833, + "step": 15202 + }, + { + "epoch": 2.8185020393029294, + "grad_norm": 12.5390625, + "learning_rate": 7.181497960697072e-06, + "loss": 2.6933, + "mean_token_accuracy": 0.49958088851634536, + "step": 15203 + }, + { + "epoch": 2.818687430478309, + "grad_norm": 9.1953125, + "learning_rate": 7.181312569521691e-06, + "loss": 3.4056, + "mean_token_accuracy": 0.44741044946383757, + "step": 15204 + }, + { + "epoch": 2.8188728216536894, + "grad_norm": 7.796875, + "learning_rate": 7.181127178346311e-06, + "loss": 3.1323, + "mean_token_accuracy": 0.43470016591609384, + "step": 15205 + }, + { + "epoch": 2.819058212829069, + "grad_norm": 8.265625, + "learning_rate": 7.180941787170931e-06, + "loss": 2.7191, + "mean_token_accuracy": 0.5104555638536221, + "step": 15206 + }, + { + "epoch": 2.8192436040044493, + "grad_norm": 11.78125, + "learning_rate": 7.180756395995552e-06, + "loss": 3.0656, + "mean_token_accuracy": 0.4659610610218215, + "step": 15207 + }, + { + "epoch": 2.8194289951798295, + "grad_norm": 6.95703125, + "learning_rate": 7.180571004820172e-06, + "loss": 2.7586, + "mean_token_accuracy": 0.49056603773584906, + "step": 15208 + }, + { + "epoch": 2.8196143863552097, + "grad_norm": 7.78125, + "learning_rate": 7.180385613644791e-06, + "loss": 3.4233, + "mean_token_accuracy": 0.42899891186071815, + "step": 15209 + }, + { + "epoch": 2.8197997775305894, + "grad_norm": 9.390625, + "learning_rate": 7.180200222469411e-06, + "loss": 3.0736, + "mean_token_accuracy": 0.515790787666958, + "step": 15210 + }, + { + "epoch": 2.8199851687059696, + "grad_norm": 8.7734375, + "learning_rate": 7.180014831294031e-06, + "loss": 2.537, + "mean_token_accuracy": 0.494885598923284, + "step": 15211 + }, + { + "epoch": 2.82017055988135, + "grad_norm": 10.4609375, + "learning_rate": 7.179829440118651e-06, + "loss": 3.3401, + "mean_token_accuracy": 0.4673852957435047, + "step": 15212 + }, + { + "epoch": 2.8203559510567295, + "grad_norm": 6.921875, + "learning_rate": 7.179644048943271e-06, + "loss": 3.0698, + "mean_token_accuracy": 0.45839320705421294, + "step": 15213 + }, + { + "epoch": 2.8205413422321097, + "grad_norm": 6.72265625, + "learning_rate": 7.17945865776789e-06, + "loss": 2.4871, + "mean_token_accuracy": 0.5359680284191829, + "step": 15214 + }, + { + "epoch": 2.82072673340749, + "grad_norm": 12.2265625, + "learning_rate": 7.179273266592512e-06, + "loss": 4.7525, + "mean_token_accuracy": 0.42224480906054884, + "step": 15215 + }, + { + "epoch": 2.82091212458287, + "grad_norm": 8.6875, + "learning_rate": 7.179087875417131e-06, + "loss": 2.4929, + "mean_token_accuracy": 0.516763145200193, + "step": 15216 + }, + { + "epoch": 2.82109751575825, + "grad_norm": 7.203125, + "learning_rate": 7.178902484241751e-06, + "loss": 2.8199, + "mean_token_accuracy": 0.493015873015873, + "step": 15217 + }, + { + "epoch": 2.82128290693363, + "grad_norm": 7.25, + "learning_rate": 7.178717093066371e-06, + "loss": 3.0702, + "mean_token_accuracy": 0.45937961595273263, + "step": 15218 + }, + { + "epoch": 2.82146829810901, + "grad_norm": 9.1953125, + "learning_rate": 7.17853170189099e-06, + "loss": 2.6695, + "mean_token_accuracy": 0.4725123378859176, + "step": 15219 + }, + { + "epoch": 2.82165368928439, + "grad_norm": 7.17578125, + "learning_rate": 7.178346310715611e-06, + "loss": 2.2863, + "mean_token_accuracy": 0.5600731570061902, + "step": 15220 + }, + { + "epoch": 2.82183908045977, + "grad_norm": 6.8671875, + "learning_rate": 7.17816091954023e-06, + "loss": 3.1208, + "mean_token_accuracy": 0.47054392044598464, + "step": 15221 + }, + { + "epoch": 2.8220244716351504, + "grad_norm": 8.296875, + "learning_rate": 7.17797552836485e-06, + "loss": 3.0415, + "mean_token_accuracy": 0.45921186563908684, + "step": 15222 + }, + { + "epoch": 2.82220986281053, + "grad_norm": 11.0703125, + "learning_rate": 7.17779013718947e-06, + "loss": 3.9782, + "mean_token_accuracy": 0.4315453863465866, + "step": 15223 + }, + { + "epoch": 2.8223952539859103, + "grad_norm": 7.89453125, + "learning_rate": 7.177604746014091e-06, + "loss": 3.3349, + "mean_token_accuracy": 0.4629338493167251, + "step": 15224 + }, + { + "epoch": 2.8225806451612905, + "grad_norm": 8.265625, + "learning_rate": 7.177419354838711e-06, + "loss": 3.0677, + "mean_token_accuracy": 0.44997828290140435, + "step": 15225 + }, + { + "epoch": 2.8227660363366702, + "grad_norm": 7.6015625, + "learning_rate": 7.17723396366333e-06, + "loss": 2.7996, + "mean_token_accuracy": 0.4922202274087373, + "step": 15226 + }, + { + "epoch": 2.8229514275120504, + "grad_norm": 7.484375, + "learning_rate": 7.17704857248795e-06, + "loss": 2.6462, + "mean_token_accuracy": 0.5251582448235168, + "step": 15227 + }, + { + "epoch": 2.8231368186874306, + "grad_norm": 9.703125, + "learning_rate": 7.1768631813125696e-06, + "loss": 2.9348, + "mean_token_accuracy": 0.469034749034749, + "step": 15228 + }, + { + "epoch": 2.823322209862811, + "grad_norm": 9.25, + "learning_rate": 7.17667779013719e-06, + "loss": 2.9124, + "mean_token_accuracy": 0.4720428123269976, + "step": 15229 + }, + { + "epoch": 2.8235076010381905, + "grad_norm": 11.078125, + "learning_rate": 7.17649239896181e-06, + "loss": 3.0566, + "mean_token_accuracy": 0.47708138447146864, + "step": 15230 + }, + { + "epoch": 2.8236929922135707, + "grad_norm": 9.921875, + "learning_rate": 7.176307007786429e-06, + "loss": 2.7654, + "mean_token_accuracy": 0.4688361831218974, + "step": 15231 + }, + { + "epoch": 2.8238783833889505, + "grad_norm": 7.5625, + "learning_rate": 7.176121616611051e-06, + "loss": 2.9236, + "mean_token_accuracy": 0.47847180109157067, + "step": 15232 + }, + { + "epoch": 2.8240637745643307, + "grad_norm": 6.7421875, + "learning_rate": 7.17593622543567e-06, + "loss": 2.5024, + "mean_token_accuracy": 0.49814601713335893, + "step": 15233 + }, + { + "epoch": 2.824249165739711, + "grad_norm": 9.3671875, + "learning_rate": 7.17575083426029e-06, + "loss": 3.0272, + "mean_token_accuracy": 0.4809512254307207, + "step": 15234 + }, + { + "epoch": 2.824434556915091, + "grad_norm": 12.9453125, + "learning_rate": 7.17556544308491e-06, + "loss": 2.4487, + "mean_token_accuracy": 0.5411372096008215, + "step": 15235 + }, + { + "epoch": 2.824619948090471, + "grad_norm": 9.8515625, + "learning_rate": 7.175380051909529e-06, + "loss": 2.9255, + "mean_token_accuracy": 0.47164298047549474, + "step": 15236 + }, + { + "epoch": 2.824805339265851, + "grad_norm": 7.9296875, + "learning_rate": 7.17519466073415e-06, + "loss": 2.6418, + "mean_token_accuracy": 0.4936708860759494, + "step": 15237 + }, + { + "epoch": 2.8249907304412307, + "grad_norm": 9.109375, + "learning_rate": 7.175009269558769e-06, + "loss": 2.9923, + "mean_token_accuracy": 0.4757526323594839, + "step": 15238 + }, + { + "epoch": 2.825176121616611, + "grad_norm": 8.6796875, + "learning_rate": 7.174823878383389e-06, + "loss": 2.9743, + "mean_token_accuracy": 0.4585518102372035, + "step": 15239 + }, + { + "epoch": 2.825361512791991, + "grad_norm": 7.9296875, + "learning_rate": 7.1746384872080095e-06, + "loss": 2.7221, + "mean_token_accuracy": 0.4936163667128134, + "step": 15240 + }, + { + "epoch": 2.8255469039673713, + "grad_norm": 8.4609375, + "learning_rate": 7.17445309603263e-06, + "loss": 2.935, + "mean_token_accuracy": 0.4834771068347711, + "step": 15241 + }, + { + "epoch": 2.8257322951427515, + "grad_norm": 8.921875, + "learning_rate": 7.17426770485725e-06, + "loss": 2.6768, + "mean_token_accuracy": 0.5093951849677041, + "step": 15242 + }, + { + "epoch": 2.8259176863181312, + "grad_norm": 8.3359375, + "learning_rate": 7.174082313681869e-06, + "loss": 2.8721, + "mean_token_accuracy": 0.5156293608707787, + "step": 15243 + }, + { + "epoch": 2.8261030774935114, + "grad_norm": 7.9453125, + "learning_rate": 7.173896922506489e-06, + "loss": 2.7361, + "mean_token_accuracy": 0.504950495049505, + "step": 15244 + }, + { + "epoch": 2.826288468668891, + "grad_norm": 7.9609375, + "learning_rate": 7.173711531331109e-06, + "loss": 2.6629, + "mean_token_accuracy": 0.5042558679391282, + "step": 15245 + }, + { + "epoch": 2.8264738598442714, + "grad_norm": 7.47265625, + "learning_rate": 7.173526140155729e-06, + "loss": 3.2311, + "mean_token_accuracy": 0.4636731777036684, + "step": 15246 + }, + { + "epoch": 2.8266592510196515, + "grad_norm": 14.2265625, + "learning_rate": 7.173340748980349e-06, + "loss": 2.87, + "mean_token_accuracy": 0.4608982412060301, + "step": 15247 + }, + { + "epoch": 2.8268446421950317, + "grad_norm": 12.1875, + "learning_rate": 7.173155357804969e-06, + "loss": 2.8229, + "mean_token_accuracy": 0.4696688331785825, + "step": 15248 + }, + { + "epoch": 2.8270300333704115, + "grad_norm": 8.4140625, + "learning_rate": 7.17296996662959e-06, + "loss": 2.975, + "mean_token_accuracy": 0.4731856601573885, + "step": 15249 + }, + { + "epoch": 2.8272154245457917, + "grad_norm": 10.8046875, + "learning_rate": 7.172784575454209e-06, + "loss": 2.1435, + "mean_token_accuracy": 0.5584784254352763, + "step": 15250 + }, + { + "epoch": 2.8274008157211714, + "grad_norm": 7.921875, + "learning_rate": 7.172599184278829e-06, + "loss": 3.0244, + "mean_token_accuracy": 0.4827233694540994, + "step": 15251 + }, + { + "epoch": 2.8275862068965516, + "grad_norm": 8.0546875, + "learning_rate": 7.172413793103449e-06, + "loss": 2.8611, + "mean_token_accuracy": 0.4657322875144657, + "step": 15252 + }, + { + "epoch": 2.827771598071932, + "grad_norm": 7.16796875, + "learning_rate": 7.172228401928068e-06, + "loss": 3.2189, + "mean_token_accuracy": 0.4601990049751244, + "step": 15253 + }, + { + "epoch": 2.827956989247312, + "grad_norm": 8.8515625, + "learning_rate": 7.172043010752689e-06, + "loss": 2.8872, + "mean_token_accuracy": 0.4682074768345937, + "step": 15254 + }, + { + "epoch": 2.8281423804226917, + "grad_norm": 7.48046875, + "learning_rate": 7.1718576195773084e-06, + "loss": 3.0905, + "mean_token_accuracy": 0.4719029374201788, + "step": 15255 + }, + { + "epoch": 2.828327771598072, + "grad_norm": 7.57421875, + "learning_rate": 7.171672228401929e-06, + "loss": 2.532, + "mean_token_accuracy": 0.5068777575914871, + "step": 15256 + }, + { + "epoch": 2.828513162773452, + "grad_norm": 6.890625, + "learning_rate": 7.1714868372265486e-06, + "loss": 1.9936, + "mean_token_accuracy": 0.5862289218191109, + "step": 15257 + }, + { + "epoch": 2.828698553948832, + "grad_norm": 7.30859375, + "learning_rate": 7.171301446051169e-06, + "loss": 2.7233, + "mean_token_accuracy": 0.4974804736709499, + "step": 15258 + }, + { + "epoch": 2.828883945124212, + "grad_norm": 8.8515625, + "learning_rate": 7.171116054875789e-06, + "loss": 2.4227, + "mean_token_accuracy": 0.6034171035979027, + "step": 15259 + }, + { + "epoch": 2.8290693362995922, + "grad_norm": 8.2421875, + "learning_rate": 7.170930663700408e-06, + "loss": 3.2699, + "mean_token_accuracy": 0.47303353422963995, + "step": 15260 + }, + { + "epoch": 2.8292547274749724, + "grad_norm": 7.04296875, + "learning_rate": 7.170745272525028e-06, + "loss": 3.2004, + "mean_token_accuracy": 0.45066193530776577, + "step": 15261 + }, + { + "epoch": 2.829440118650352, + "grad_norm": 7.28515625, + "learning_rate": 7.170559881349648e-06, + "loss": 3.3746, + "mean_token_accuracy": 0.45788712011577426, + "step": 15262 + }, + { + "epoch": 2.8296255098257324, + "grad_norm": 6.8203125, + "learning_rate": 7.170374490174268e-06, + "loss": 3.0295, + "mean_token_accuracy": 0.4820002801512817, + "step": 15263 + }, + { + "epoch": 2.829810901001112, + "grad_norm": 7.3671875, + "learning_rate": 7.170189098998889e-06, + "loss": 3.0949, + "mean_token_accuracy": 0.46167056986729116, + "step": 15264 + }, + { + "epoch": 2.8299962921764923, + "grad_norm": 7.48828125, + "learning_rate": 7.170003707823508e-06, + "loss": 2.5157, + "mean_token_accuracy": 0.4998430469812703, + "step": 15265 + }, + { + "epoch": 2.8301816833518725, + "grad_norm": 13.5, + "learning_rate": 7.169818316648128e-06, + "loss": 3.2212, + "mean_token_accuracy": 0.43549835378629154, + "step": 15266 + }, + { + "epoch": 2.8303670745272527, + "grad_norm": 12.1015625, + "learning_rate": 7.169632925472748e-06, + "loss": 2.8169, + "mean_token_accuracy": 0.4647041735257093, + "step": 15267 + }, + { + "epoch": 2.8305524657026324, + "grad_norm": 9.8671875, + "learning_rate": 7.169447534297368e-06, + "loss": 3.6591, + "mean_token_accuracy": 0.440552016985138, + "step": 15268 + }, + { + "epoch": 2.8307378568780126, + "grad_norm": 10.21875, + "learning_rate": 7.169262143121988e-06, + "loss": 3.1163, + "mean_token_accuracy": 0.4661560106299828, + "step": 15269 + }, + { + "epoch": 2.830923248053393, + "grad_norm": 7.78125, + "learning_rate": 7.169076751946607e-06, + "loss": 2.594, + "mean_token_accuracy": 0.539269406392694, + "step": 15270 + }, + { + "epoch": 2.8311086392287725, + "grad_norm": 8.40625, + "learning_rate": 7.168891360771227e-06, + "loss": 2.8756, + "mean_token_accuracy": 0.521639306623809, + "step": 15271 + }, + { + "epoch": 2.8312940304041527, + "grad_norm": 13.375, + "learning_rate": 7.168705969595848e-06, + "loss": 2.2763, + "mean_token_accuracy": 0.5494162524563634, + "step": 15272 + }, + { + "epoch": 2.831479421579533, + "grad_norm": 7.05859375, + "learning_rate": 7.168520578420468e-06, + "loss": 3.4559, + "mean_token_accuracy": 0.435024154589372, + "step": 15273 + }, + { + "epoch": 2.831664812754913, + "grad_norm": 8.2109375, + "learning_rate": 7.168335187245088e-06, + "loss": 2.6001, + "mean_token_accuracy": 0.48295849756549963, + "step": 15274 + }, + { + "epoch": 2.831850203930293, + "grad_norm": 11.25, + "learning_rate": 7.168149796069708e-06, + "loss": 3.1598, + "mean_token_accuracy": 0.46556436184749933, + "step": 15275 + }, + { + "epoch": 2.832035595105673, + "grad_norm": 10.1953125, + "learning_rate": 7.167964404894328e-06, + "loss": 3.0354, + "mean_token_accuracy": 0.4671600370027752, + "step": 15276 + }, + { + "epoch": 2.832220986281053, + "grad_norm": 6.38671875, + "learning_rate": 7.167779013718947e-06, + "loss": 2.5175, + "mean_token_accuracy": 0.5075727643882524, + "step": 15277 + }, + { + "epoch": 2.832406377456433, + "grad_norm": 10.1640625, + "learning_rate": 7.167593622543567e-06, + "loss": 2.594, + "mean_token_accuracy": 0.5274872620005363, + "step": 15278 + }, + { + "epoch": 2.832591768631813, + "grad_norm": 13.3984375, + "learning_rate": 7.167408231368187e-06, + "loss": 2.8739, + "mean_token_accuracy": 0.45, + "step": 15279 + }, + { + "epoch": 2.8327771598071934, + "grad_norm": 8.5546875, + "learning_rate": 7.167222840192808e-06, + "loss": 3.0544, + "mean_token_accuracy": 0.4652767361631918, + "step": 15280 + }, + { + "epoch": 2.832962550982573, + "grad_norm": 10.4609375, + "learning_rate": 7.167037449017428e-06, + "loss": 2.741, + "mean_token_accuracy": 0.466893039049236, + "step": 15281 + }, + { + "epoch": 2.8331479421579533, + "grad_norm": 8.25, + "learning_rate": 7.166852057842047e-06, + "loss": 3.3713, + "mean_token_accuracy": 0.45990428388000465, + "step": 15282 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 9.21875, + "learning_rate": 7.166666666666667e-06, + "loss": 3.4191, + "mean_token_accuracy": 0.46594666078906766, + "step": 15283 + }, + { + "epoch": 2.8335187245087132, + "grad_norm": 7.48046875, + "learning_rate": 7.1664812754912874e-06, + "loss": 3.1798, + "mean_token_accuracy": 0.463968253968254, + "step": 15284 + }, + { + "epoch": 2.8337041156840934, + "grad_norm": 8.140625, + "learning_rate": 7.166295884315907e-06, + "loss": 3.5657, + "mean_token_accuracy": 0.4385147891755821, + "step": 15285 + }, + { + "epoch": 2.8338895068594736, + "grad_norm": 7.66796875, + "learning_rate": 7.166110493140527e-06, + "loss": 3.0722, + "mean_token_accuracy": 0.46045751633986925, + "step": 15286 + }, + { + "epoch": 2.834074898034854, + "grad_norm": 8.046875, + "learning_rate": 7.165925101965146e-06, + "loss": 3.1527, + "mean_token_accuracy": 0.46708041014570967, + "step": 15287 + }, + { + "epoch": 2.8342602892102335, + "grad_norm": 8.1171875, + "learning_rate": 7.165739710789768e-06, + "loss": 3.5991, + "mean_token_accuracy": 0.44224655598728363, + "step": 15288 + }, + { + "epoch": 2.8344456803856137, + "grad_norm": 9.1484375, + "learning_rate": 7.165554319614387e-06, + "loss": 3.0266, + "mean_token_accuracy": 0.4754125412541254, + "step": 15289 + }, + { + "epoch": 2.8346310715609935, + "grad_norm": 7.69921875, + "learning_rate": 7.165368928439007e-06, + "loss": 2.5108, + "mean_token_accuracy": 0.4887545684565645, + "step": 15290 + }, + { + "epoch": 2.8348164627363737, + "grad_norm": 7.02734375, + "learning_rate": 7.165183537263627e-06, + "loss": 2.2721, + "mean_token_accuracy": 0.5655388978930308, + "step": 15291 + }, + { + "epoch": 2.835001853911754, + "grad_norm": 6.69140625, + "learning_rate": 7.164998146088247e-06, + "loss": 2.4699, + "mean_token_accuracy": 0.5299379770992366, + "step": 15292 + }, + { + "epoch": 2.835187245087134, + "grad_norm": 7.00390625, + "learning_rate": 7.164812754912867e-06, + "loss": 2.4868, + "mean_token_accuracy": 0.5182687591956842, + "step": 15293 + }, + { + "epoch": 2.835372636262514, + "grad_norm": 9.2109375, + "learning_rate": 7.164627363737486e-06, + "loss": 3.2646, + "mean_token_accuracy": 0.4600687810470004, + "step": 15294 + }, + { + "epoch": 2.835558027437894, + "grad_norm": 12.8671875, + "learning_rate": 7.164441972562106e-06, + "loss": 2.74, + "mean_token_accuracy": 0.49096420659904394, + "step": 15295 + }, + { + "epoch": 2.835743418613274, + "grad_norm": 8.046875, + "learning_rate": 7.164256581386727e-06, + "loss": 2.8127, + "mean_token_accuracy": 0.49566236811254394, + "step": 15296 + }, + { + "epoch": 2.835928809788654, + "grad_norm": 11.9921875, + "learning_rate": 7.164071190211347e-06, + "loss": 3.2175, + "mean_token_accuracy": 0.45983338292174947, + "step": 15297 + }, + { + "epoch": 2.836114200964034, + "grad_norm": 11.3828125, + "learning_rate": 7.163885799035967e-06, + "loss": 2.8594, + "mean_token_accuracy": 0.46378795420607266, + "step": 15298 + }, + { + "epoch": 2.8362995921394143, + "grad_norm": 7.7734375, + "learning_rate": 7.163700407860586e-06, + "loss": 2.8611, + "mean_token_accuracy": 0.47993499187398425, + "step": 15299 + }, + { + "epoch": 2.8364849833147945, + "grad_norm": 9.125, + "learning_rate": 7.163515016685206e-06, + "loss": 2.474, + "mean_token_accuracy": 0.5115901715345387, + "step": 15300 + }, + { + "epoch": 2.8366703744901742, + "grad_norm": 10.9296875, + "learning_rate": 7.1633296255098265e-06, + "loss": 2.5771, + "mean_token_accuracy": 0.49374796129172555, + "step": 15301 + }, + { + "epoch": 2.8368557656655544, + "grad_norm": 8.296875, + "learning_rate": 7.163144234334446e-06, + "loss": 3.0117, + "mean_token_accuracy": 0.4800275482093664, + "step": 15302 + }, + { + "epoch": 2.837041156840934, + "grad_norm": 9.3203125, + "learning_rate": 7.162958843159066e-06, + "loss": 3.4338, + "mean_token_accuracy": 0.45917001338688085, + "step": 15303 + }, + { + "epoch": 2.8372265480163144, + "grad_norm": 8.6171875, + "learning_rate": 7.162773451983687e-06, + "loss": 3.3489, + "mean_token_accuracy": 0.4583008573655495, + "step": 15304 + }, + { + "epoch": 2.8374119391916945, + "grad_norm": 7.16796875, + "learning_rate": 7.162588060808307e-06, + "loss": 3.2071, + "mean_token_accuracy": 0.45114539504441326, + "step": 15305 + }, + { + "epoch": 2.8375973303670747, + "grad_norm": 8.1171875, + "learning_rate": 7.162402669632926e-06, + "loss": 2.8511, + "mean_token_accuracy": 0.5066331517944421, + "step": 15306 + }, + { + "epoch": 2.8377827215424545, + "grad_norm": 8.265625, + "learning_rate": 7.162217278457546e-06, + "loss": 3.1439, + "mean_token_accuracy": 0.46727748691099474, + "step": 15307 + }, + { + "epoch": 2.8379681127178347, + "grad_norm": 9.65625, + "learning_rate": 7.162031887282166e-06, + "loss": 2.8564, + "mean_token_accuracy": 0.46910202767943354, + "step": 15308 + }, + { + "epoch": 2.8381535038932144, + "grad_norm": 7.94921875, + "learning_rate": 7.161846496106785e-06, + "loss": 2.7972, + "mean_token_accuracy": 0.4878367773999477, + "step": 15309 + }, + { + "epoch": 2.8383388950685946, + "grad_norm": 7.86328125, + "learning_rate": 7.161661104931406e-06, + "loss": 3.2065, + "mean_token_accuracy": 0.45118570988604867, + "step": 15310 + }, + { + "epoch": 2.838524286243975, + "grad_norm": 8.6875, + "learning_rate": 7.1614757137560255e-06, + "loss": 3.5121, + "mean_token_accuracy": 0.44878317063110135, + "step": 15311 + }, + { + "epoch": 2.838709677419355, + "grad_norm": 8.59375, + "learning_rate": 7.161290322580646e-06, + "loss": 3.4376, + "mean_token_accuracy": 0.4389626818469323, + "step": 15312 + }, + { + "epoch": 2.8388950685947347, + "grad_norm": 8.0859375, + "learning_rate": 7.1611049314052664e-06, + "loss": 2.7883, + "mean_token_accuracy": 0.47029945999018163, + "step": 15313 + }, + { + "epoch": 2.839080459770115, + "grad_norm": 7.51171875, + "learning_rate": 7.160919540229886e-06, + "loss": 3.0305, + "mean_token_accuracy": 0.5053506375227687, + "step": 15314 + }, + { + "epoch": 2.839265850945495, + "grad_norm": 9.0859375, + "learning_rate": 7.160734149054506e-06, + "loss": 3.4296, + "mean_token_accuracy": 0.4365284974093264, + "step": 15315 + }, + { + "epoch": 2.839451242120875, + "grad_norm": 8.015625, + "learning_rate": 7.160548757879125e-06, + "loss": 3.4203, + "mean_token_accuracy": 0.4649099576271186, + "step": 15316 + }, + { + "epoch": 2.839636633296255, + "grad_norm": 7.390625, + "learning_rate": 7.160363366703745e-06, + "loss": 2.552, + "mean_token_accuracy": 0.5275647518448849, + "step": 15317 + }, + { + "epoch": 2.8398220244716352, + "grad_norm": 6.99609375, + "learning_rate": 7.1601779755283655e-06, + "loss": 3.0433, + "mean_token_accuracy": 0.47981444332998996, + "step": 15318 + }, + { + "epoch": 2.8400074156470154, + "grad_norm": 7.203125, + "learning_rate": 7.159992584352985e-06, + "loss": 2.7172, + "mean_token_accuracy": 0.49413362973940966, + "step": 15319 + }, + { + "epoch": 2.840192806822395, + "grad_norm": 7.9765625, + "learning_rate": 7.159807193177606e-06, + "loss": 2.697, + "mean_token_accuracy": 0.46564087242306545, + "step": 15320 + }, + { + "epoch": 2.8403781979977754, + "grad_norm": 9.8203125, + "learning_rate": 7.159621802002225e-06, + "loss": 2.5349, + "mean_token_accuracy": 0.5241062156950007, + "step": 15321 + }, + { + "epoch": 2.840563589173155, + "grad_norm": 6.8359375, + "learning_rate": 7.159436410826846e-06, + "loss": 2.8102, + "mean_token_accuracy": 0.4800826541154862, + "step": 15322 + }, + { + "epoch": 2.8407489803485353, + "grad_norm": 7.05859375, + "learning_rate": 7.159251019651465e-06, + "loss": 2.7281, + "mean_token_accuracy": 0.49166200335758253, + "step": 15323 + }, + { + "epoch": 2.8409343715239155, + "grad_norm": 7.859375, + "learning_rate": 7.159065628476085e-06, + "loss": 2.7788, + "mean_token_accuracy": 0.4970980392156863, + "step": 15324 + }, + { + "epoch": 2.8411197626992957, + "grad_norm": 7.23828125, + "learning_rate": 7.158880237300705e-06, + "loss": 3.1547, + "mean_token_accuracy": 0.4501797268152408, + "step": 15325 + }, + { + "epoch": 2.8413051538746754, + "grad_norm": 8.609375, + "learning_rate": 7.158694846125324e-06, + "loss": 2.4184, + "mean_token_accuracy": 0.5225821707259399, + "step": 15326 + }, + { + "epoch": 2.8414905450500556, + "grad_norm": 7.66796875, + "learning_rate": 7.158509454949945e-06, + "loss": 3.1083, + "mean_token_accuracy": 0.4604329311568488, + "step": 15327 + }, + { + "epoch": 2.841675936225436, + "grad_norm": 9.5078125, + "learning_rate": 7.158324063774565e-06, + "loss": 2.8636, + "mean_token_accuracy": 0.4778147131088308, + "step": 15328 + }, + { + "epoch": 2.8418613274008155, + "grad_norm": 7.13671875, + "learning_rate": 7.158138672599185e-06, + "loss": 3.3273, + "mean_token_accuracy": 0.43343996062992124, + "step": 15329 + }, + { + "epoch": 2.8420467185761957, + "grad_norm": 7.05078125, + "learning_rate": 7.1579532814238055e-06, + "loss": 2.55, + "mean_token_accuracy": 0.48985549554809515, + "step": 15330 + }, + { + "epoch": 2.842232109751576, + "grad_norm": 8.6875, + "learning_rate": 7.157767890248425e-06, + "loss": 2.8157, + "mean_token_accuracy": 0.5078813785733369, + "step": 15331 + }, + { + "epoch": 2.842417500926956, + "grad_norm": 7.89453125, + "learning_rate": 7.157582499073045e-06, + "loss": 2.8427, + "mean_token_accuracy": 0.5075866475003993, + "step": 15332 + }, + { + "epoch": 2.842602892102336, + "grad_norm": 7.42578125, + "learning_rate": 7.157397107897664e-06, + "loss": 2.9308, + "mean_token_accuracy": 0.4898137732783402, + "step": 15333 + }, + { + "epoch": 2.842788283277716, + "grad_norm": 7.97265625, + "learning_rate": 7.157211716722284e-06, + "loss": 2.7237, + "mean_token_accuracy": 0.5005841121495327, + "step": 15334 + }, + { + "epoch": 2.842973674453096, + "grad_norm": 7.76171875, + "learning_rate": 7.1570263255469045e-06, + "loss": 3.3254, + "mean_token_accuracy": 0.44100856327307325, + "step": 15335 + }, + { + "epoch": 2.843159065628476, + "grad_norm": 9.6171875, + "learning_rate": 7.156840934371525e-06, + "loss": 3.2085, + "mean_token_accuracy": 0.4795158286778399, + "step": 15336 + }, + { + "epoch": 2.843344456803856, + "grad_norm": 7.99609375, + "learning_rate": 7.156655543196145e-06, + "loss": 3.2065, + "mean_token_accuracy": 0.47353625436215585, + "step": 15337 + }, + { + "epoch": 2.8435298479792364, + "grad_norm": 22.171875, + "learning_rate": 7.156470152020764e-06, + "loss": 3.4936, + "mean_token_accuracy": 0.44387910857840646, + "step": 15338 + }, + { + "epoch": 2.843715239154616, + "grad_norm": 7.95703125, + "learning_rate": 7.156284760845385e-06, + "loss": 2.5127, + "mean_token_accuracy": 0.5081242688158065, + "step": 15339 + }, + { + "epoch": 2.8439006303299963, + "grad_norm": 8.1953125, + "learning_rate": 7.1560993696700045e-06, + "loss": 3.1522, + "mean_token_accuracy": 0.44326562297839306, + "step": 15340 + }, + { + "epoch": 2.8440860215053765, + "grad_norm": 11.7578125, + "learning_rate": 7.155913978494624e-06, + "loss": 2.5732, + "mean_token_accuracy": 0.5439847231063017, + "step": 15341 + }, + { + "epoch": 2.8442714126807562, + "grad_norm": 8.875, + "learning_rate": 7.155728587319244e-06, + "loss": 2.858, + "mean_token_accuracy": 0.49902152641878667, + "step": 15342 + }, + { + "epoch": 2.8444568038561364, + "grad_norm": 7.9921875, + "learning_rate": 7.155543196143863e-06, + "loss": 2.6091, + "mean_token_accuracy": 0.49125218695326167, + "step": 15343 + }, + { + "epoch": 2.8446421950315166, + "grad_norm": 6.73828125, + "learning_rate": 7.155357804968484e-06, + "loss": 2.5139, + "mean_token_accuracy": 0.514889943892965, + "step": 15344 + }, + { + "epoch": 2.844827586206897, + "grad_norm": 8.8359375, + "learning_rate": 7.155172413793104e-06, + "loss": 2.8087, + "mean_token_accuracy": 0.5017280240420736, + "step": 15345 + }, + { + "epoch": 2.8450129773822765, + "grad_norm": 7.96484375, + "learning_rate": 7.154987022617724e-06, + "loss": 2.5906, + "mean_token_accuracy": 0.5117145899893504, + "step": 15346 + }, + { + "epoch": 2.8451983685576567, + "grad_norm": 11.0234375, + "learning_rate": 7.154801631442344e-06, + "loss": 2.1383, + "mean_token_accuracy": 0.5921406180026869, + "step": 15347 + }, + { + "epoch": 2.8453837597330365, + "grad_norm": 7.84765625, + "learning_rate": 7.154616240266964e-06, + "loss": 2.6228, + "mean_token_accuracy": 0.514711137581894, + "step": 15348 + }, + { + "epoch": 2.8455691509084167, + "grad_norm": 7.53515625, + "learning_rate": 7.154430849091584e-06, + "loss": 2.8816, + "mean_token_accuracy": 0.4826195278206193, + "step": 15349 + }, + { + "epoch": 2.845754542083797, + "grad_norm": 9.3515625, + "learning_rate": 7.1542454579162034e-06, + "loss": 2.9497, + "mean_token_accuracy": 0.4782608695652174, + "step": 15350 + }, + { + "epoch": 2.845939933259177, + "grad_norm": 8.453125, + "learning_rate": 7.154060066740823e-06, + "loss": 3.5573, + "mean_token_accuracy": 0.4413212293557913, + "step": 15351 + }, + { + "epoch": 2.846125324434557, + "grad_norm": 8.4140625, + "learning_rate": 7.153874675565443e-06, + "loss": 2.5936, + "mean_token_accuracy": 0.49481193255512324, + "step": 15352 + }, + { + "epoch": 2.846310715609937, + "grad_norm": 10.2734375, + "learning_rate": 7.153689284390064e-06, + "loss": 2.8037, + "mean_token_accuracy": 0.49286669638876507, + "step": 15353 + }, + { + "epoch": 2.846496106785317, + "grad_norm": 9.09375, + "learning_rate": 7.153503893214684e-06, + "loss": 3.4853, + "mean_token_accuracy": 0.4278494248867201, + "step": 15354 + }, + { + "epoch": 2.846681497960697, + "grad_norm": 8.15625, + "learning_rate": 7.153318502039303e-06, + "loss": 3.448, + "mean_token_accuracy": 0.4424838362068966, + "step": 15355 + }, + { + "epoch": 2.846866889136077, + "grad_norm": 9.5703125, + "learning_rate": 7.153133110863924e-06, + "loss": 3.1107, + "mean_token_accuracy": 0.46409389297791925, + "step": 15356 + }, + { + "epoch": 2.8470522803114573, + "grad_norm": 9.546875, + "learning_rate": 7.1529477196885435e-06, + "loss": 2.8841, + "mean_token_accuracy": 0.5011965811965812, + "step": 15357 + }, + { + "epoch": 2.8472376714868375, + "grad_norm": 7.73828125, + "learning_rate": 7.152762328513163e-06, + "loss": 2.7977, + "mean_token_accuracy": 0.4729277142059726, + "step": 15358 + }, + { + "epoch": 2.8474230626622172, + "grad_norm": 7.66015625, + "learning_rate": 7.152576937337783e-06, + "loss": 2.2581, + "mean_token_accuracy": 0.5403341976375684, + "step": 15359 + }, + { + "epoch": 2.8476084538375974, + "grad_norm": 7.6484375, + "learning_rate": 7.152391546162402e-06, + "loss": 3.3615, + "mean_token_accuracy": 0.4239086087311302, + "step": 15360 + }, + { + "epoch": 2.847793845012977, + "grad_norm": 7.91796875, + "learning_rate": 7.152206154987024e-06, + "loss": 2.8529, + "mean_token_accuracy": 0.4927097661623109, + "step": 15361 + }, + { + "epoch": 2.8479792361883574, + "grad_norm": 7.38671875, + "learning_rate": 7.152020763811643e-06, + "loss": 2.7488, + "mean_token_accuracy": 0.4978601997146933, + "step": 15362 + }, + { + "epoch": 2.8481646273637375, + "grad_norm": 9.1484375, + "learning_rate": 7.151835372636263e-06, + "loss": 3.1621, + "mean_token_accuracy": 0.4585887384176764, + "step": 15363 + }, + { + "epoch": 2.8483500185391177, + "grad_norm": 9.109375, + "learning_rate": 7.151649981460883e-06, + "loss": 2.8604, + "mean_token_accuracy": 0.48059561496072223, + "step": 15364 + }, + { + "epoch": 2.8485354097144975, + "grad_norm": 7.25390625, + "learning_rate": 7.151464590285503e-06, + "loss": 3.4205, + "mean_token_accuracy": 0.4360484134772653, + "step": 15365 + }, + { + "epoch": 2.8487208008898777, + "grad_norm": 36.1875, + "learning_rate": 7.151279199110123e-06, + "loss": 4.6103, + "mean_token_accuracy": 0.4674168418449363, + "step": 15366 + }, + { + "epoch": 2.848906192065258, + "grad_norm": 8.9921875, + "learning_rate": 7.1510938079347425e-06, + "loss": 2.7873, + "mean_token_accuracy": 0.48792585824304924, + "step": 15367 + }, + { + "epoch": 2.8490915832406376, + "grad_norm": 6.63671875, + "learning_rate": 7.150908416759362e-06, + "loss": 3.3537, + "mean_token_accuracy": 0.4453447050461976, + "step": 15368 + }, + { + "epoch": 2.849276974416018, + "grad_norm": 7.51953125, + "learning_rate": 7.1507230255839835e-06, + "loss": 2.3749, + "mean_token_accuracy": 0.5031650706054211, + "step": 15369 + }, + { + "epoch": 2.849462365591398, + "grad_norm": 8.4765625, + "learning_rate": 7.150537634408603e-06, + "loss": 2.8218, + "mean_token_accuracy": 0.48682196905288216, + "step": 15370 + }, + { + "epoch": 2.849647756766778, + "grad_norm": 7.09765625, + "learning_rate": 7.150352243233223e-06, + "loss": 2.0373, + "mean_token_accuracy": 0.5493087557603686, + "step": 15371 + }, + { + "epoch": 2.849833147942158, + "grad_norm": 7.31640625, + "learning_rate": 7.150166852057842e-06, + "loss": 2.7381, + "mean_token_accuracy": 0.5044893378226711, + "step": 15372 + }, + { + "epoch": 2.850018539117538, + "grad_norm": 7.96875, + "learning_rate": 7.149981460882463e-06, + "loss": 2.6069, + "mean_token_accuracy": 0.4919678714859438, + "step": 15373 + }, + { + "epoch": 2.850203930292918, + "grad_norm": 9.921875, + "learning_rate": 7.1497960697070825e-06, + "loss": 3.1815, + "mean_token_accuracy": 0.4425312568545734, + "step": 15374 + }, + { + "epoch": 2.850389321468298, + "grad_norm": 7.7734375, + "learning_rate": 7.149610678531702e-06, + "loss": 3.3214, + "mean_token_accuracy": 0.46538311940879035, + "step": 15375 + }, + { + "epoch": 2.8505747126436782, + "grad_norm": 8.96875, + "learning_rate": 7.149425287356322e-06, + "loss": 2.8184, + "mean_token_accuracy": 0.5161367154936287, + "step": 15376 + }, + { + "epoch": 2.8507601038190584, + "grad_norm": 8.640625, + "learning_rate": 7.149239896180943e-06, + "loss": 2.7493, + "mean_token_accuracy": 0.4799309948246118, + "step": 15377 + }, + { + "epoch": 2.850945494994438, + "grad_norm": 7.13671875, + "learning_rate": 7.149054505005563e-06, + "loss": 3.0697, + "mean_token_accuracy": 0.4699612403100775, + "step": 15378 + }, + { + "epoch": 2.8511308861698184, + "grad_norm": 7.296875, + "learning_rate": 7.1488691138301824e-06, + "loss": 3.37, + "mean_token_accuracy": 0.46646380119183467, + "step": 15379 + }, + { + "epoch": 2.851316277345198, + "grad_norm": 6.91015625, + "learning_rate": 7.148683722654802e-06, + "loss": 3.1996, + "mean_token_accuracy": 0.452587274077742, + "step": 15380 + }, + { + "epoch": 2.8515016685205783, + "grad_norm": 6.8828125, + "learning_rate": 7.148498331479422e-06, + "loss": 3.1867, + "mean_token_accuracy": 0.4593460008837826, + "step": 15381 + }, + { + "epoch": 2.8516870596959585, + "grad_norm": 7.421875, + "learning_rate": 7.148312940304042e-06, + "loss": 3.5187, + "mean_token_accuracy": 0.4282899366643209, + "step": 15382 + }, + { + "epoch": 2.8518724508713387, + "grad_norm": 7.3203125, + "learning_rate": 7.148127549128662e-06, + "loss": 2.4259, + "mean_token_accuracy": 0.547911547911548, + "step": 15383 + }, + { + "epoch": 2.8520578420467184, + "grad_norm": 7.2578125, + "learning_rate": 7.1479421579532815e-06, + "loss": 3.5647, + "mean_token_accuracy": 0.4451204672669749, + "step": 15384 + }, + { + "epoch": 2.8522432332220986, + "grad_norm": 8.0546875, + "learning_rate": 7.147756766777902e-06, + "loss": 3.2734, + "mean_token_accuracy": 0.46436176875861207, + "step": 15385 + }, + { + "epoch": 2.852428624397479, + "grad_norm": 8.5625, + "learning_rate": 7.1475713756025225e-06, + "loss": 2.7901, + "mean_token_accuracy": 0.4898785425101215, + "step": 15386 + }, + { + "epoch": 2.8526140155728585, + "grad_norm": 11.109375, + "learning_rate": 7.147385984427142e-06, + "loss": 2.3004, + "mean_token_accuracy": 0.5293980672953779, + "step": 15387 + }, + { + "epoch": 2.8527994067482387, + "grad_norm": 9.7890625, + "learning_rate": 7.147200593251762e-06, + "loss": 2.6833, + "mean_token_accuracy": 0.49607843137254903, + "step": 15388 + }, + { + "epoch": 2.852984797923619, + "grad_norm": 9.0703125, + "learning_rate": 7.147015202076381e-06, + "loss": 2.774, + "mean_token_accuracy": 0.46868801360158685, + "step": 15389 + }, + { + "epoch": 2.853170189098999, + "grad_norm": 7.109375, + "learning_rate": 7.146829810901001e-06, + "loss": 2.7155, + "mean_token_accuracy": 0.4909630428918263, + "step": 15390 + }, + { + "epoch": 2.853355580274379, + "grad_norm": 8.46875, + "learning_rate": 7.1466444197256216e-06, + "loss": 2.7526, + "mean_token_accuracy": 0.5096236713588049, + "step": 15391 + }, + { + "epoch": 2.853540971449759, + "grad_norm": 11.5390625, + "learning_rate": 7.146459028550241e-06, + "loss": 2.9017, + "mean_token_accuracy": 0.4709328526363227, + "step": 15392 + }, + { + "epoch": 2.853726362625139, + "grad_norm": 9.484375, + "learning_rate": 7.146273637374862e-06, + "loss": 2.1755, + "mean_token_accuracy": 0.5425426663368785, + "step": 15393 + }, + { + "epoch": 2.853911753800519, + "grad_norm": 12.640625, + "learning_rate": 7.146088246199482e-06, + "loss": 3.0755, + "mean_token_accuracy": 0.463776465504825, + "step": 15394 + }, + { + "epoch": 2.854097144975899, + "grad_norm": 8.671875, + "learning_rate": 7.145902855024102e-06, + "loss": 3.12, + "mean_token_accuracy": 0.4854199912955172, + "step": 15395 + }, + { + "epoch": 2.8542825361512794, + "grad_norm": 7.7109375, + "learning_rate": 7.1457174638487215e-06, + "loss": 3.2935, + "mean_token_accuracy": 0.4643468339988591, + "step": 15396 + }, + { + "epoch": 2.854467927326659, + "grad_norm": 8.0, + "learning_rate": 7.145532072673341e-06, + "loss": 2.9534, + "mean_token_accuracy": 0.48430351790503234, + "step": 15397 + }, + { + "epoch": 2.8546533185020393, + "grad_norm": 9.1640625, + "learning_rate": 7.145346681497961e-06, + "loss": 2.866, + "mean_token_accuracy": 0.4780555235571939, + "step": 15398 + }, + { + "epoch": 2.8548387096774195, + "grad_norm": 7.04296875, + "learning_rate": 7.145161290322581e-06, + "loss": 3.2361, + "mean_token_accuracy": 0.44921232071478956, + "step": 15399 + }, + { + "epoch": 2.8550241008527992, + "grad_norm": 7.55078125, + "learning_rate": 7.144975899147201e-06, + "loss": 2.8261, + "mean_token_accuracy": 0.4678362573099415, + "step": 15400 + }, + { + "epoch": 2.8552094920281794, + "grad_norm": 7.5546875, + "learning_rate": 7.144790507971821e-06, + "loss": 2.6117, + "mean_token_accuracy": 0.4860059269015476, + "step": 15401 + }, + { + "epoch": 2.8553948832035596, + "grad_norm": 7.85546875, + "learning_rate": 7.144605116796441e-06, + "loss": 2.4388, + "mean_token_accuracy": 0.5091268968550693, + "step": 15402 + }, + { + "epoch": 2.85558027437894, + "grad_norm": 7.63671875, + "learning_rate": 7.1444197256210615e-06, + "loss": 2.5938, + "mean_token_accuracy": 0.5183447612173704, + "step": 15403 + }, + { + "epoch": 2.8557656655543195, + "grad_norm": 9.5546875, + "learning_rate": 7.144234334445681e-06, + "loss": 2.504, + "mean_token_accuracy": 0.5094386181369525, + "step": 15404 + }, + { + "epoch": 2.8559510567296997, + "grad_norm": 8.8984375, + "learning_rate": 7.144048943270301e-06, + "loss": 3.4258, + "mean_token_accuracy": 0.4336482231219073, + "step": 15405 + }, + { + "epoch": 2.8561364479050795, + "grad_norm": 9.125, + "learning_rate": 7.1438635520949205e-06, + "loss": 2.941, + "mean_token_accuracy": 0.47102803738317756, + "step": 15406 + }, + { + "epoch": 2.8563218390804597, + "grad_norm": 7.7421875, + "learning_rate": 7.14367816091954e-06, + "loss": 2.7892, + "mean_token_accuracy": 0.4851625727672867, + "step": 15407 + }, + { + "epoch": 2.85650723025584, + "grad_norm": 10.0703125, + "learning_rate": 7.143492769744161e-06, + "loss": 3.4092, + "mean_token_accuracy": 0.44240498882221435, + "step": 15408 + }, + { + "epoch": 2.85669262143122, + "grad_norm": 11.9140625, + "learning_rate": 7.143307378568781e-06, + "loss": 2.1659, + "mean_token_accuracy": 0.5501371913195311, + "step": 15409 + }, + { + "epoch": 2.8568780126066, + "grad_norm": 7.9296875, + "learning_rate": 7.143121987393401e-06, + "loss": 2.6286, + "mean_token_accuracy": 0.49921618925466726, + "step": 15410 + }, + { + "epoch": 2.85706340378198, + "grad_norm": 8.6484375, + "learning_rate": 7.142936596218021e-06, + "loss": 3.133, + "mean_token_accuracy": 0.46520107238605896, + "step": 15411 + }, + { + "epoch": 2.85724879495736, + "grad_norm": 8.75, + "learning_rate": 7.142751205042641e-06, + "loss": 2.5357, + "mean_token_accuracy": 0.5255925798694606, + "step": 15412 + }, + { + "epoch": 2.85743418613274, + "grad_norm": 10.515625, + "learning_rate": 7.1425658138672605e-06, + "loss": 2.5536, + "mean_token_accuracy": 0.4854693241287162, + "step": 15413 + }, + { + "epoch": 2.85761957730812, + "grad_norm": 7.890625, + "learning_rate": 7.14238042269188e-06, + "loss": 3.3162, + "mean_token_accuracy": 0.4305973356252686, + "step": 15414 + }, + { + "epoch": 2.8578049684835003, + "grad_norm": 7.953125, + "learning_rate": 7.1421950315165e-06, + "loss": 2.9712, + "mean_token_accuracy": 0.47223894977718894, + "step": 15415 + }, + { + "epoch": 2.8579903596588805, + "grad_norm": 9.9609375, + "learning_rate": 7.14200964034112e-06, + "loss": 3.318, + "mean_token_accuracy": 0.4513636895704085, + "step": 15416 + }, + { + "epoch": 2.8581757508342602, + "grad_norm": 7.3828125, + "learning_rate": 7.141824249165741e-06, + "loss": 2.8601, + "mean_token_accuracy": 0.49901356350184956, + "step": 15417 + }, + { + "epoch": 2.8583611420096404, + "grad_norm": 10.03125, + "learning_rate": 7.1416388579903604e-06, + "loss": 3.9528, + "mean_token_accuracy": 0.41701212958697986, + "step": 15418 + }, + { + "epoch": 2.85854653318502, + "grad_norm": 8.6171875, + "learning_rate": 7.14145346681498e-06, + "loss": 3.1183, + "mean_token_accuracy": 0.5009868421052631, + "step": 15419 + }, + { + "epoch": 2.8587319243604004, + "grad_norm": 8.296875, + "learning_rate": 7.1412680756396006e-06, + "loss": 3.5853, + "mean_token_accuracy": 0.44292721728755136, + "step": 15420 + }, + { + "epoch": 2.8589173155357805, + "grad_norm": 9.8203125, + "learning_rate": 7.14108268446422e-06, + "loss": 3.1178, + "mean_token_accuracy": 0.4773278835100268, + "step": 15421 + }, + { + "epoch": 2.8591027067111607, + "grad_norm": 8.8359375, + "learning_rate": 7.14089729328884e-06, + "loss": 3.6471, + "mean_token_accuracy": 0.46944724744926564, + "step": 15422 + }, + { + "epoch": 2.8592880978865405, + "grad_norm": 8.4296875, + "learning_rate": 7.1407119021134595e-06, + "loss": 2.7932, + "mean_token_accuracy": 0.48786789485508875, + "step": 15423 + }, + { + "epoch": 2.8594734890619207, + "grad_norm": 9.3515625, + "learning_rate": 7.140526510938079e-06, + "loss": 3.7751, + "mean_token_accuracy": 0.44979699330626577, + "step": 15424 + }, + { + "epoch": 2.859658880237301, + "grad_norm": 9.15625, + "learning_rate": 7.1403411197627005e-06, + "loss": 2.8293, + "mean_token_accuracy": 0.47691318327974275, + "step": 15425 + }, + { + "epoch": 2.8598442714126806, + "grad_norm": 10.40625, + "learning_rate": 7.14015572858732e-06, + "loss": 2.6012, + "mean_token_accuracy": 0.5078836046660684, + "step": 15426 + }, + { + "epoch": 2.860029662588061, + "grad_norm": 8.515625, + "learning_rate": 7.13997033741194e-06, + "loss": 2.8402, + "mean_token_accuracy": 0.47957662492546216, + "step": 15427 + }, + { + "epoch": 2.860215053763441, + "grad_norm": 9.9609375, + "learning_rate": 7.139784946236559e-06, + "loss": 2.4463, + "mean_token_accuracy": 0.5068536482496837, + "step": 15428 + }, + { + "epoch": 2.860400444938821, + "grad_norm": 8.1328125, + "learning_rate": 7.13959955506118e-06, + "loss": 2.6924, + "mean_token_accuracy": 0.5178592204770215, + "step": 15429 + }, + { + "epoch": 2.860585836114201, + "grad_norm": 8.3203125, + "learning_rate": 7.1394141638857995e-06, + "loss": 2.2011, + "mean_token_accuracy": 0.5719931497826374, + "step": 15430 + }, + { + "epoch": 2.860771227289581, + "grad_norm": 8.1015625, + "learning_rate": 7.139228772710419e-06, + "loss": 2.9747, + "mean_token_accuracy": 0.48650025471217523, + "step": 15431 + }, + { + "epoch": 2.860956618464961, + "grad_norm": 8.671875, + "learning_rate": 7.139043381535039e-06, + "loss": 2.926, + "mean_token_accuracy": 0.49029714978775013, + "step": 15432 + }, + { + "epoch": 2.861142009640341, + "grad_norm": 7.16796875, + "learning_rate": 7.13885799035966e-06, + "loss": 2.6574, + "mean_token_accuracy": 0.4761255115961801, + "step": 15433 + }, + { + "epoch": 2.8613274008157212, + "grad_norm": 7.8125, + "learning_rate": 7.13867259918428e-06, + "loss": 3.2222, + "mean_token_accuracy": 0.4931987098583649, + "step": 15434 + }, + { + "epoch": 2.8615127919911014, + "grad_norm": 9.359375, + "learning_rate": 7.1384872080088995e-06, + "loss": 3.3731, + "mean_token_accuracy": 0.43633231747714324, + "step": 15435 + }, + { + "epoch": 2.861698183166481, + "grad_norm": 9.8046875, + "learning_rate": 7.138301816833519e-06, + "loss": 2.7059, + "mean_token_accuracy": 0.499622641509434, + "step": 15436 + }, + { + "epoch": 2.8618835743418614, + "grad_norm": 8.21875, + "learning_rate": 7.13811642565814e-06, + "loss": 3.4409, + "mean_token_accuracy": 0.45554445554445555, + "step": 15437 + }, + { + "epoch": 2.862068965517241, + "grad_norm": 8.0703125, + "learning_rate": 7.137931034482759e-06, + "loss": 3.0493, + "mean_token_accuracy": 0.4828467644288982, + "step": 15438 + }, + { + "epoch": 2.8622543566926213, + "grad_norm": 6.75, + "learning_rate": 7.137745643307379e-06, + "loss": 2.4812, + "mean_token_accuracy": 0.5218516826557028, + "step": 15439 + }, + { + "epoch": 2.8624397478680015, + "grad_norm": 8.2421875, + "learning_rate": 7.1375602521319985e-06, + "loss": 2.7305, + "mean_token_accuracy": 0.49899611832418683, + "step": 15440 + }, + { + "epoch": 2.8626251390433817, + "grad_norm": 6.28125, + "learning_rate": 7.13737486095662e-06, + "loss": 2.7991, + "mean_token_accuracy": 0.476644245142003, + "step": 15441 + }, + { + "epoch": 2.862810530218762, + "grad_norm": 8.1328125, + "learning_rate": 7.1371894697812395e-06, + "loss": 3.0243, + "mean_token_accuracy": 0.49973168768446474, + "step": 15442 + }, + { + "epoch": 2.8629959213941416, + "grad_norm": 7.09375, + "learning_rate": 7.137004078605859e-06, + "loss": 3.2036, + "mean_token_accuracy": 0.465005931198102, + "step": 15443 + }, + { + "epoch": 2.863181312569522, + "grad_norm": 7.7109375, + "learning_rate": 7.136818687430479e-06, + "loss": 3.0596, + "mean_token_accuracy": 0.4708381171067738, + "step": 15444 + }, + { + "epoch": 2.8633667037449015, + "grad_norm": 6.64453125, + "learning_rate": 7.1366332962550984e-06, + "loss": 2.6432, + "mean_token_accuracy": 0.4973298644392715, + "step": 15445 + }, + { + "epoch": 2.8635520949202817, + "grad_norm": 7.4921875, + "learning_rate": 7.136447905079719e-06, + "loss": 2.9686, + "mean_token_accuracy": 0.4724025974025974, + "step": 15446 + }, + { + "epoch": 2.863737486095662, + "grad_norm": 7.80859375, + "learning_rate": 7.136262513904339e-06, + "loss": 2.6856, + "mean_token_accuracy": 0.4875851714978635, + "step": 15447 + }, + { + "epoch": 2.863922877271042, + "grad_norm": 8.9375, + "learning_rate": 7.136077122728958e-06, + "loss": 3.8066, + "mean_token_accuracy": 0.44271698113207547, + "step": 15448 + }, + { + "epoch": 2.864108268446422, + "grad_norm": 7.625, + "learning_rate": 7.1358917315535796e-06, + "loss": 2.8372, + "mean_token_accuracy": 0.49585234342596435, + "step": 15449 + }, + { + "epoch": 2.864293659621802, + "grad_norm": 11.1015625, + "learning_rate": 7.135706340378199e-06, + "loss": 3.1271, + "mean_token_accuracy": 0.5034405504880781, + "step": 15450 + }, + { + "epoch": 2.864479050797182, + "grad_norm": 7.53125, + "learning_rate": 7.135520949202819e-06, + "loss": 3.1014, + "mean_token_accuracy": 0.46648192135128, + "step": 15451 + }, + { + "epoch": 2.864664441972562, + "grad_norm": 6.63671875, + "learning_rate": 7.1353355580274385e-06, + "loss": 2.894, + "mean_token_accuracy": 0.45373525557011796, + "step": 15452 + }, + { + "epoch": 2.864849833147942, + "grad_norm": 7.92578125, + "learning_rate": 7.135150166852058e-06, + "loss": 3.4305, + "mean_token_accuracy": 0.42160560344827586, + "step": 15453 + }, + { + "epoch": 2.8650352243233224, + "grad_norm": 7.140625, + "learning_rate": 7.134964775676679e-06, + "loss": 3.0192, + "mean_token_accuracy": 0.4666581827437007, + "step": 15454 + }, + { + "epoch": 2.865220615498702, + "grad_norm": 7.05078125, + "learning_rate": 7.134779384501298e-06, + "loss": 2.7483, + "mean_token_accuracy": 0.482048901268957, + "step": 15455 + }, + { + "epoch": 2.8654060066740823, + "grad_norm": 7.49609375, + "learning_rate": 7.134593993325918e-06, + "loss": 2.423, + "mean_token_accuracy": 0.5359148684599468, + "step": 15456 + }, + { + "epoch": 2.8655913978494625, + "grad_norm": 7.46875, + "learning_rate": 7.134408602150538e-06, + "loss": 2.9269, + "mean_token_accuracy": 0.47923363764947924, + "step": 15457 + }, + { + "epoch": 2.8657767890248422, + "grad_norm": 8.90625, + "learning_rate": 7.134223210975159e-06, + "loss": 2.078, + "mean_token_accuracy": 0.572247237003684, + "step": 15458 + }, + { + "epoch": 2.8659621802002224, + "grad_norm": 7.15234375, + "learning_rate": 7.1340378197997785e-06, + "loss": 2.8335, + "mean_token_accuracy": 0.4735194992778045, + "step": 15459 + }, + { + "epoch": 2.8661475713756026, + "grad_norm": 7.06640625, + "learning_rate": 7.133852428624398e-06, + "loss": 2.8685, + "mean_token_accuracy": 0.4880315762668704, + "step": 15460 + }, + { + "epoch": 2.866332962550983, + "grad_norm": 9.4140625, + "learning_rate": 7.133667037449018e-06, + "loss": 2.8043, + "mean_token_accuracy": 0.47434715821812595, + "step": 15461 + }, + { + "epoch": 2.8665183537263625, + "grad_norm": 7.0703125, + "learning_rate": 7.1334816462736375e-06, + "loss": 2.6912, + "mean_token_accuracy": 0.47393193338160755, + "step": 15462 + }, + { + "epoch": 2.8667037449017427, + "grad_norm": 7.57421875, + "learning_rate": 7.133296255098258e-06, + "loss": 2.6968, + "mean_token_accuracy": 0.4786856127886323, + "step": 15463 + }, + { + "epoch": 2.8668891360771225, + "grad_norm": 7.12109375, + "learning_rate": 7.133110863922878e-06, + "loss": 2.8232, + "mean_token_accuracy": 0.46379897785349233, + "step": 15464 + }, + { + "epoch": 2.8670745272525027, + "grad_norm": 8.1796875, + "learning_rate": 7.132925472747497e-06, + "loss": 3.0142, + "mean_token_accuracy": 0.4540380047505938, + "step": 15465 + }, + { + "epoch": 2.867259918427883, + "grad_norm": 9.5859375, + "learning_rate": 7.132740081572118e-06, + "loss": 3.1617, + "mean_token_accuracy": 0.45264891572879495, + "step": 15466 + }, + { + "epoch": 2.867445309603263, + "grad_norm": 8.9609375, + "learning_rate": 7.132554690396738e-06, + "loss": 3.4215, + "mean_token_accuracy": 0.4589661604050093, + "step": 15467 + }, + { + "epoch": 2.867630700778643, + "grad_norm": 6.796875, + "learning_rate": 7.132369299221358e-06, + "loss": 2.649, + "mean_token_accuracy": 0.504007694773966, + "step": 15468 + }, + { + "epoch": 2.867816091954023, + "grad_norm": 7.63671875, + "learning_rate": 7.1321839080459775e-06, + "loss": 2.9383, + "mean_token_accuracy": 0.4481786133960047, + "step": 15469 + }, + { + "epoch": 2.868001483129403, + "grad_norm": 6.8828125, + "learning_rate": 7.131998516870597e-06, + "loss": 2.8922, + "mean_token_accuracy": 0.4793646291306198, + "step": 15470 + }, + { + "epoch": 2.868186874304783, + "grad_norm": 8.0078125, + "learning_rate": 7.131813125695217e-06, + "loss": 2.7249, + "mean_token_accuracy": 0.5054680664916885, + "step": 15471 + }, + { + "epoch": 2.868372265480163, + "grad_norm": 8.84375, + "learning_rate": 7.131627734519837e-06, + "loss": 2.662, + "mean_token_accuracy": 0.5258299836236415, + "step": 15472 + }, + { + "epoch": 2.8685576566555433, + "grad_norm": 11.71875, + "learning_rate": 7.131442343344457e-06, + "loss": 2.0144, + "mean_token_accuracy": 0.5530789573101624, + "step": 15473 + }, + { + "epoch": 2.8687430478309235, + "grad_norm": 8.171875, + "learning_rate": 7.1312569521690774e-06, + "loss": 3.3912, + "mean_token_accuracy": 0.43987751737133435, + "step": 15474 + }, + { + "epoch": 2.8689284390063032, + "grad_norm": 9.9140625, + "learning_rate": 7.131071560993698e-06, + "loss": 2.7687, + "mean_token_accuracy": 0.4986634589681903, + "step": 15475 + }, + { + "epoch": 2.8691138301816834, + "grad_norm": 9.4609375, + "learning_rate": 7.130886169818318e-06, + "loss": 4.1266, + "mean_token_accuracy": 0.416520979020979, + "step": 15476 + }, + { + "epoch": 2.869299221357063, + "grad_norm": 8.1015625, + "learning_rate": 7.130700778642937e-06, + "loss": 2.6428, + "mean_token_accuracy": 0.4990616202690022, + "step": 15477 + }, + { + "epoch": 2.8694846125324434, + "grad_norm": 10.34375, + "learning_rate": 7.130515387467557e-06, + "loss": 3.0755, + "mean_token_accuracy": 0.4734674667607954, + "step": 15478 + }, + { + "epoch": 2.8696700037078235, + "grad_norm": 8.4453125, + "learning_rate": 7.1303299962921765e-06, + "loss": 3.1005, + "mean_token_accuracy": 0.4685714285714286, + "step": 15479 + }, + { + "epoch": 2.8698553948832037, + "grad_norm": 8.84375, + "learning_rate": 7.130144605116797e-06, + "loss": 2.6627, + "mean_token_accuracy": 0.5176203966005666, + "step": 15480 + }, + { + "epoch": 2.8700407860585835, + "grad_norm": 10.3671875, + "learning_rate": 7.129959213941417e-06, + "loss": 3.112, + "mean_token_accuracy": 0.47888161808447355, + "step": 15481 + }, + { + "epoch": 2.8702261772339637, + "grad_norm": 8.578125, + "learning_rate": 7.129773822766037e-06, + "loss": 2.9368, + "mean_token_accuracy": 0.4525049603174603, + "step": 15482 + }, + { + "epoch": 2.870411568409344, + "grad_norm": 7.203125, + "learning_rate": 7.129588431590657e-06, + "loss": 3.0739, + "mean_token_accuracy": 0.4537876042044219, + "step": 15483 + }, + { + "epoch": 2.8705969595847236, + "grad_norm": 7.06640625, + "learning_rate": 7.129403040415277e-06, + "loss": 2.594, + "mean_token_accuracy": 0.5207286432160804, + "step": 15484 + }, + { + "epoch": 2.870782350760104, + "grad_norm": 8.5234375, + "learning_rate": 7.129217649239897e-06, + "loss": 3.1862, + "mean_token_accuracy": 0.4503126395712372, + "step": 15485 + }, + { + "epoch": 2.870967741935484, + "grad_norm": 8.1875, + "learning_rate": 7.1290322580645166e-06, + "loss": 2.7443, + "mean_token_accuracy": 0.4899571505088377, + "step": 15486 + }, + { + "epoch": 2.871153133110864, + "grad_norm": 7.67578125, + "learning_rate": 7.128846866889136e-06, + "loss": 3.5087, + "mean_token_accuracy": 0.4472661311220523, + "step": 15487 + }, + { + "epoch": 2.871338524286244, + "grad_norm": 7.8203125, + "learning_rate": 7.128661475713756e-06, + "loss": 2.9899, + "mean_token_accuracy": 0.4697558268590455, + "step": 15488 + }, + { + "epoch": 2.871523915461624, + "grad_norm": 9.90625, + "learning_rate": 7.128476084538376e-06, + "loss": 2.9245, + "mean_token_accuracy": 0.48659003831417624, + "step": 15489 + }, + { + "epoch": 2.871709306637004, + "grad_norm": 8.453125, + "learning_rate": 7.128290693362997e-06, + "loss": 2.9171, + "mean_token_accuracy": 0.47273430449916204, + "step": 15490 + }, + { + "epoch": 2.871894697812384, + "grad_norm": 8.5546875, + "learning_rate": 7.1281053021876165e-06, + "loss": 3.2799, + "mean_token_accuracy": 0.4607336956521739, + "step": 15491 + }, + { + "epoch": 2.8720800889877642, + "grad_norm": 8.71875, + "learning_rate": 7.127919911012237e-06, + "loss": 2.6558, + "mean_token_accuracy": 0.48501152959262106, + "step": 15492 + }, + { + "epoch": 2.8722654801631444, + "grad_norm": 10.015625, + "learning_rate": 7.127734519836857e-06, + "loss": 2.48, + "mean_token_accuracy": 0.5230312035661219, + "step": 15493 + }, + { + "epoch": 2.872450871338524, + "grad_norm": 8.6015625, + "learning_rate": 7.127549128661476e-06, + "loss": 3.0757, + "mean_token_accuracy": 0.47432550043516103, + "step": 15494 + }, + { + "epoch": 2.8726362625139044, + "grad_norm": 7.80078125, + "learning_rate": 7.127363737486096e-06, + "loss": 2.2756, + "mean_token_accuracy": 0.5087077418410654, + "step": 15495 + }, + { + "epoch": 2.8728216536892845, + "grad_norm": 8.5390625, + "learning_rate": 7.1271783463107155e-06, + "loss": 3.1052, + "mean_token_accuracy": 0.45910547582555383, + "step": 15496 + }, + { + "epoch": 2.8730070448646643, + "grad_norm": 6.5234375, + "learning_rate": 7.126992955135336e-06, + "loss": 3.311, + "mean_token_accuracy": 0.4493084786530367, + "step": 15497 + }, + { + "epoch": 2.8731924360400445, + "grad_norm": 12.40625, + "learning_rate": 7.1268075639599565e-06, + "loss": 3.5757, + "mean_token_accuracy": 0.4381711118808452, + "step": 15498 + }, + { + "epoch": 2.8733778272154247, + "grad_norm": 8.6796875, + "learning_rate": 7.126622172784576e-06, + "loss": 3.0308, + "mean_token_accuracy": 0.44908086162174354, + "step": 15499 + }, + { + "epoch": 2.873563218390805, + "grad_norm": 8.8671875, + "learning_rate": 7.126436781609196e-06, + "loss": 2.6993, + "mean_token_accuracy": 0.511962447001817, + "step": 15500 + }, + { + "epoch": 2.8737486095661846, + "grad_norm": 8.3984375, + "learning_rate": 7.126251390433816e-06, + "loss": 2.7436, + "mean_token_accuracy": 0.4775628871192011, + "step": 15501 + }, + { + "epoch": 2.873934000741565, + "grad_norm": 9.3984375, + "learning_rate": 7.126065999258436e-06, + "loss": 2.5437, + "mean_token_accuracy": 0.4955933069357517, + "step": 15502 + }, + { + "epoch": 2.8741193919169445, + "grad_norm": 12.5, + "learning_rate": 7.125880608083056e-06, + "loss": 3.8848, + "mean_token_accuracy": 0.45644955300127715, + "step": 15503 + }, + { + "epoch": 2.8743047830923247, + "grad_norm": 9.90625, + "learning_rate": 7.125695216907675e-06, + "loss": 2.44, + "mean_token_accuracy": 0.5202205882352942, + "step": 15504 + }, + { + "epoch": 2.874490174267705, + "grad_norm": 7.37109375, + "learning_rate": 7.125509825732295e-06, + "loss": 2.5429, + "mean_token_accuracy": 0.5170721297481861, + "step": 15505 + }, + { + "epoch": 2.874675565443085, + "grad_norm": 8.5234375, + "learning_rate": 7.125324434556916e-06, + "loss": 3.0047, + "mean_token_accuracy": 0.4664682993820096, + "step": 15506 + }, + { + "epoch": 2.874860956618465, + "grad_norm": 10.1640625, + "learning_rate": 7.125139043381536e-06, + "loss": 3.8155, + "mean_token_accuracy": 0.4349201328483315, + "step": 15507 + }, + { + "epoch": 2.875046347793845, + "grad_norm": 8.359375, + "learning_rate": 7.1249536522061555e-06, + "loss": 3.2474, + "mean_token_accuracy": 0.43636363636363634, + "step": 15508 + }, + { + "epoch": 2.875231738969225, + "grad_norm": 8.21875, + "learning_rate": 7.124768261030775e-06, + "loss": 3.1538, + "mean_token_accuracy": 0.4579958342976163, + "step": 15509 + }, + { + "epoch": 2.875417130144605, + "grad_norm": 7.39453125, + "learning_rate": 7.124582869855396e-06, + "loss": 2.7179, + "mean_token_accuracy": 0.4927060060454725, + "step": 15510 + }, + { + "epoch": 2.875602521319985, + "grad_norm": 7.28515625, + "learning_rate": 7.124397478680015e-06, + "loss": 2.7276, + "mean_token_accuracy": 0.500557880055788, + "step": 15511 + }, + { + "epoch": 2.8757879124953654, + "grad_norm": 11.2734375, + "learning_rate": 7.124212087504635e-06, + "loss": 3.0326, + "mean_token_accuracy": 0.4778938906752412, + "step": 15512 + }, + { + "epoch": 2.875973303670745, + "grad_norm": 9.296875, + "learning_rate": 7.124026696329255e-06, + "loss": 3.39, + "mean_token_accuracy": 0.4228552051542896, + "step": 15513 + }, + { + "epoch": 2.8761586948461253, + "grad_norm": 9.1796875, + "learning_rate": 7.123841305153876e-06, + "loss": 3.0318, + "mean_token_accuracy": 0.4806849315068493, + "step": 15514 + }, + { + "epoch": 2.8763440860215055, + "grad_norm": 6.8671875, + "learning_rate": 7.1236559139784956e-06, + "loss": 2.9514, + "mean_token_accuracy": 0.4666772201994618, + "step": 15515 + }, + { + "epoch": 2.8765294771968852, + "grad_norm": 7.8671875, + "learning_rate": 7.123470522803115e-06, + "loss": 2.623, + "mean_token_accuracy": 0.48609916881627974, + "step": 15516 + }, + { + "epoch": 2.8767148683722654, + "grad_norm": 8.2109375, + "learning_rate": 7.123285131627735e-06, + "loss": 3.0368, + "mean_token_accuracy": 0.5015454113171659, + "step": 15517 + }, + { + "epoch": 2.8769002595476456, + "grad_norm": 14.4609375, + "learning_rate": 7.123099740452355e-06, + "loss": 2.6978, + "mean_token_accuracy": 0.47697893972403776, + "step": 15518 + }, + { + "epoch": 2.877085650723026, + "grad_norm": 11.328125, + "learning_rate": 7.122914349276975e-06, + "loss": 4.5036, + "mean_token_accuracy": 0.3924077402981918, + "step": 15519 + }, + { + "epoch": 2.8772710418984055, + "grad_norm": 8.8046875, + "learning_rate": 7.122728958101595e-06, + "loss": 3.4734, + "mean_token_accuracy": 0.4639618138424821, + "step": 15520 + }, + { + "epoch": 2.8774564330737857, + "grad_norm": 7.7421875, + "learning_rate": 7.122543566926214e-06, + "loss": 2.8755, + "mean_token_accuracy": 0.47150180940892644, + "step": 15521 + }, + { + "epoch": 2.8776418242491655, + "grad_norm": 8.203125, + "learning_rate": 7.122358175750836e-06, + "loss": 2.768, + "mean_token_accuracy": 0.48633668471255476, + "step": 15522 + }, + { + "epoch": 2.8778272154245457, + "grad_norm": 10.4921875, + "learning_rate": 7.122172784575455e-06, + "loss": 3.1767, + "mean_token_accuracy": 0.5161994485294118, + "step": 15523 + }, + { + "epoch": 2.878012606599926, + "grad_norm": 7.83984375, + "learning_rate": 7.121987393400075e-06, + "loss": 2.6629, + "mean_token_accuracy": 0.5141992551210428, + "step": 15524 + }, + { + "epoch": 2.878197997775306, + "grad_norm": 8.296875, + "learning_rate": 7.1218020022246946e-06, + "loss": 3.6966, + "mean_token_accuracy": 0.42244372270131036, + "step": 15525 + }, + { + "epoch": 2.878383388950686, + "grad_norm": 8.3359375, + "learning_rate": 7.121616611049314e-06, + "loss": 2.6353, + "mean_token_accuracy": 0.5074689669682306, + "step": 15526 + }, + { + "epoch": 2.878568780126066, + "grad_norm": 7.33203125, + "learning_rate": 7.121431219873935e-06, + "loss": 2.738, + "mean_token_accuracy": 0.4708940883915452, + "step": 15527 + }, + { + "epoch": 2.878754171301446, + "grad_norm": 6.40234375, + "learning_rate": 7.121245828698554e-06, + "loss": 2.7045, + "mean_token_accuracy": 0.4862919808087731, + "step": 15528 + }, + { + "epoch": 2.878939562476826, + "grad_norm": 6.5703125, + "learning_rate": 7.121060437523174e-06, + "loss": 2.6032, + "mean_token_accuracy": 0.4879054425508521, + "step": 15529 + }, + { + "epoch": 2.879124953652206, + "grad_norm": 6.24609375, + "learning_rate": 7.120875046347795e-06, + "loss": 2.8447, + "mean_token_accuracy": 0.4871415566681839, + "step": 15530 + }, + { + "epoch": 2.8793103448275863, + "grad_norm": 9.3984375, + "learning_rate": 7.120689655172415e-06, + "loss": 3.0904, + "mean_token_accuracy": 0.49290973547859285, + "step": 15531 + }, + { + "epoch": 2.8794957360029665, + "grad_norm": 7.9296875, + "learning_rate": 7.120504263997035e-06, + "loss": 3.0391, + "mean_token_accuracy": 0.4795808704997313, + "step": 15532 + }, + { + "epoch": 2.8796811271783462, + "grad_norm": 7.62109375, + "learning_rate": 7.120318872821654e-06, + "loss": 2.8077, + "mean_token_accuracy": 0.4823159549384333, + "step": 15533 + }, + { + "epoch": 2.8798665183537264, + "grad_norm": 7.62890625, + "learning_rate": 7.120133481646274e-06, + "loss": 2.5638, + "mean_token_accuracy": 0.49315897808451387, + "step": 15534 + }, + { + "epoch": 2.880051909529106, + "grad_norm": 8.2109375, + "learning_rate": 7.119948090470894e-06, + "loss": 2.8838, + "mean_token_accuracy": 0.4901139799953478, + "step": 15535 + }, + { + "epoch": 2.8802373007044864, + "grad_norm": 7.515625, + "learning_rate": 7.119762699295514e-06, + "loss": 2.8844, + "mean_token_accuracy": 0.5025048705816866, + "step": 15536 + }, + { + "epoch": 2.8804226918798665, + "grad_norm": 8.171875, + "learning_rate": 7.119577308120134e-06, + "loss": 2.8269, + "mean_token_accuracy": 0.4758729388942774, + "step": 15537 + }, + { + "epoch": 2.8806080830552467, + "grad_norm": 8.2734375, + "learning_rate": 7.119391916944754e-06, + "loss": 3.271, + "mean_token_accuracy": 0.4321691613209789, + "step": 15538 + }, + { + "epoch": 2.8807934742306265, + "grad_norm": 12.3515625, + "learning_rate": 7.119206525769375e-06, + "loss": 3.1054, + "mean_token_accuracy": 0.44327573253193087, + "step": 15539 + }, + { + "epoch": 2.8809788654060067, + "grad_norm": 8.875, + "learning_rate": 7.119021134593994e-06, + "loss": 2.5713, + "mean_token_accuracy": 0.5065471275215289, + "step": 15540 + }, + { + "epoch": 2.881164256581387, + "grad_norm": 6.953125, + "learning_rate": 7.118835743418614e-06, + "loss": 2.4565, + "mean_token_accuracy": 0.5129848229342328, + "step": 15541 + }, + { + "epoch": 2.8813496477567666, + "grad_norm": 6.82421875, + "learning_rate": 7.118650352243234e-06, + "loss": 2.525, + "mean_token_accuracy": 0.5035698010025824, + "step": 15542 + }, + { + "epoch": 2.881535038932147, + "grad_norm": 8.2734375, + "learning_rate": 7.118464961067853e-06, + "loss": 2.6406, + "mean_token_accuracy": 0.504462388440289, + "step": 15543 + }, + { + "epoch": 2.881720430107527, + "grad_norm": 8.4296875, + "learning_rate": 7.118279569892474e-06, + "loss": 2.5626, + "mean_token_accuracy": 0.5145477545857052, + "step": 15544 + }, + { + "epoch": 2.881905821282907, + "grad_norm": 7.390625, + "learning_rate": 7.118094178717093e-06, + "loss": 3.4419, + "mean_token_accuracy": 0.4398822869955157, + "step": 15545 + }, + { + "epoch": 2.882091212458287, + "grad_norm": 7.87109375, + "learning_rate": 7.117908787541714e-06, + "loss": 2.9308, + "mean_token_accuracy": 0.4707492302429011, + "step": 15546 + }, + { + "epoch": 2.882276603633667, + "grad_norm": 10.53125, + "learning_rate": 7.1177233963663335e-06, + "loss": 2.997, + "mean_token_accuracy": 0.469044558697515, + "step": 15547 + }, + { + "epoch": 2.882461994809047, + "grad_norm": 9.6171875, + "learning_rate": 7.117538005190954e-06, + "loss": 3.95, + "mean_token_accuracy": 0.44687875150060025, + "step": 15548 + }, + { + "epoch": 2.882647385984427, + "grad_norm": 8.4609375, + "learning_rate": 7.117352614015574e-06, + "loss": 2.6059, + "mean_token_accuracy": 0.5539921465968587, + "step": 15549 + }, + { + "epoch": 2.8828327771598072, + "grad_norm": 8.1484375, + "learning_rate": 7.117167222840193e-06, + "loss": 3.1906, + "mean_token_accuracy": 0.45764576457645767, + "step": 15550 + }, + { + "epoch": 2.8830181683351874, + "grad_norm": 7.4453125, + "learning_rate": 7.116981831664813e-06, + "loss": 3.1178, + "mean_token_accuracy": 0.4699074074074074, + "step": 15551 + }, + { + "epoch": 2.883203559510567, + "grad_norm": 8.4765625, + "learning_rate": 7.1167964404894326e-06, + "loss": 2.4148, + "mean_token_accuracy": 0.5143184421534936, + "step": 15552 + }, + { + "epoch": 2.8833889506859474, + "grad_norm": 7.0078125, + "learning_rate": 7.116611049314053e-06, + "loss": 2.2574, + "mean_token_accuracy": 0.5437966685812752, + "step": 15553 + }, + { + "epoch": 2.8835743418613276, + "grad_norm": 7.41796875, + "learning_rate": 7.1164256581386736e-06, + "loss": 2.7018, + "mean_token_accuracy": 0.5048031389527804, + "step": 15554 + }, + { + "epoch": 2.8837597330367073, + "grad_norm": 9.875, + "learning_rate": 7.116240266963293e-06, + "loss": 3.5147, + "mean_token_accuracy": 0.4680436477007015, + "step": 15555 + }, + { + "epoch": 2.8839451242120875, + "grad_norm": 8.109375, + "learning_rate": 7.116054875787914e-06, + "loss": 3.1235, + "mean_token_accuracy": 0.49077076577700174, + "step": 15556 + }, + { + "epoch": 2.8841305153874677, + "grad_norm": 8.2265625, + "learning_rate": 7.115869484612533e-06, + "loss": 2.95, + "mean_token_accuracy": 0.4889652438218581, + "step": 15557 + }, + { + "epoch": 2.884315906562848, + "grad_norm": 7.9375, + "learning_rate": 7.115684093437153e-06, + "loss": 2.3968, + "mean_token_accuracy": 0.5134742951907131, + "step": 15558 + }, + { + "epoch": 2.8845012977382276, + "grad_norm": 7.3984375, + "learning_rate": 7.115498702261773e-06, + "loss": 2.2958, + "mean_token_accuracy": 0.5257232916807385, + "step": 15559 + }, + { + "epoch": 2.884686688913608, + "grad_norm": 10.421875, + "learning_rate": 7.115313311086392e-06, + "loss": 2.6557, + "mean_token_accuracy": 0.5358150786184652, + "step": 15560 + }, + { + "epoch": 2.8848720800889875, + "grad_norm": 9.21875, + "learning_rate": 7.115127919911013e-06, + "loss": 2.734, + "mean_token_accuracy": 0.5150399017802333, + "step": 15561 + }, + { + "epoch": 2.8850574712643677, + "grad_norm": 7.80859375, + "learning_rate": 7.114942528735633e-06, + "loss": 3.1243, + "mean_token_accuracy": 0.4711073754990544, + "step": 15562 + }, + { + "epoch": 2.885242862439748, + "grad_norm": 12.7421875, + "learning_rate": 7.114757137560253e-06, + "loss": 2.5693, + "mean_token_accuracy": 0.5060805675196351, + "step": 15563 + }, + { + "epoch": 2.885428253615128, + "grad_norm": 10.203125, + "learning_rate": 7.1145717463848725e-06, + "loss": 3.1214, + "mean_token_accuracy": 0.44798500468603564, + "step": 15564 + }, + { + "epoch": 2.885613644790508, + "grad_norm": 10.59375, + "learning_rate": 7.114386355209493e-06, + "loss": 2.3188, + "mean_token_accuracy": 0.5499074469599886, + "step": 15565 + }, + { + "epoch": 2.885799035965888, + "grad_norm": 11.0, + "learning_rate": 7.114200964034113e-06, + "loss": 2.4669, + "mean_token_accuracy": 0.5096315691041705, + "step": 15566 + }, + { + "epoch": 2.8859844271412682, + "grad_norm": 7.140625, + "learning_rate": 7.114015572858732e-06, + "loss": 2.8679, + "mean_token_accuracy": 0.46134146341463417, + "step": 15567 + }, + { + "epoch": 2.886169818316648, + "grad_norm": 9.2265625, + "learning_rate": 7.113830181683352e-06, + "loss": 2.8338, + "mean_token_accuracy": 0.4872831616016695, + "step": 15568 + }, + { + "epoch": 2.886355209492028, + "grad_norm": 8.1796875, + "learning_rate": 7.113644790507972e-06, + "loss": 3.3915, + "mean_token_accuracy": 0.437449815320379, + "step": 15569 + }, + { + "epoch": 2.8865406006674084, + "grad_norm": 13.7421875, + "learning_rate": 7.113459399332593e-06, + "loss": 3.0637, + "mean_token_accuracy": 0.4847715736040609, + "step": 15570 + }, + { + "epoch": 2.8867259918427886, + "grad_norm": 8.0625, + "learning_rate": 7.113274008157213e-06, + "loss": 2.8374, + "mean_token_accuracy": 0.510624387054593, + "step": 15571 + }, + { + "epoch": 2.8869113830181683, + "grad_norm": 7.64453125, + "learning_rate": 7.113088616981832e-06, + "loss": 3.1731, + "mean_token_accuracy": 0.44519846350832265, + "step": 15572 + }, + { + "epoch": 2.8870967741935485, + "grad_norm": 10.6875, + "learning_rate": 7.112903225806453e-06, + "loss": 2.9545, + "mean_token_accuracy": 0.4576030317385126, + "step": 15573 + }, + { + "epoch": 2.8872821653689282, + "grad_norm": 11.2421875, + "learning_rate": 7.112717834631072e-06, + "loss": 2.7228, + "mean_token_accuracy": 0.48887403304364435, + "step": 15574 + }, + { + "epoch": 2.8874675565443084, + "grad_norm": 10.3515625, + "learning_rate": 7.112532443455692e-06, + "loss": 2.6822, + "mean_token_accuracy": 0.49574151683114775, + "step": 15575 + }, + { + "epoch": 2.8876529477196886, + "grad_norm": 11.203125, + "learning_rate": 7.112347052280312e-06, + "loss": 2.9155, + "mean_token_accuracy": 0.4805950117436528, + "step": 15576 + }, + { + "epoch": 2.887838338895069, + "grad_norm": 10.09375, + "learning_rate": 7.112161661104931e-06, + "loss": 3.1315, + "mean_token_accuracy": 0.4517818367607613, + "step": 15577 + }, + { + "epoch": 2.8880237300704485, + "grad_norm": 12.0078125, + "learning_rate": 7.111976269929553e-06, + "loss": 3.4284, + "mean_token_accuracy": 0.437152133580705, + "step": 15578 + }, + { + "epoch": 2.8882091212458287, + "grad_norm": 11.7578125, + "learning_rate": 7.111790878754172e-06, + "loss": 2.8019, + "mean_token_accuracy": 0.5130685920577618, + "step": 15579 + }, + { + "epoch": 2.8883945124212085, + "grad_norm": 6.62890625, + "learning_rate": 7.111605487578792e-06, + "loss": 2.6654, + "mean_token_accuracy": 0.49685455460493205, + "step": 15580 + }, + { + "epoch": 2.8885799035965887, + "grad_norm": 8.3828125, + "learning_rate": 7.1114200964034116e-06, + "loss": 2.8834, + "mean_token_accuracy": 0.4901065449010654, + "step": 15581 + }, + { + "epoch": 2.888765294771969, + "grad_norm": 7.18359375, + "learning_rate": 7.111234705228032e-06, + "loss": 3.39, + "mean_token_accuracy": 0.4440852490421456, + "step": 15582 + }, + { + "epoch": 2.888950685947349, + "grad_norm": 8.5625, + "learning_rate": 7.111049314052652e-06, + "loss": 3.0451, + "mean_token_accuracy": 0.48722960646338287, + "step": 15583 + }, + { + "epoch": 2.889136077122729, + "grad_norm": 8.15625, + "learning_rate": 7.110863922877271e-06, + "loss": 3.6271, + "mean_token_accuracy": 0.4175592095338663, + "step": 15584 + }, + { + "epoch": 2.889321468298109, + "grad_norm": 8.6015625, + "learning_rate": 7.110678531701891e-06, + "loss": 3.1728, + "mean_token_accuracy": 0.44114963503649635, + "step": 15585 + }, + { + "epoch": 2.889506859473489, + "grad_norm": 6.8125, + "learning_rate": 7.110493140526512e-06, + "loss": 2.6439, + "mean_token_accuracy": 0.4919593464556799, + "step": 15586 + }, + { + "epoch": 2.889692250648869, + "grad_norm": 8.375, + "learning_rate": 7.110307749351132e-06, + "loss": 3.044, + "mean_token_accuracy": 0.47741574604950354, + "step": 15587 + }, + { + "epoch": 2.889877641824249, + "grad_norm": 7.33984375, + "learning_rate": 7.110122358175752e-06, + "loss": 3.1407, + "mean_token_accuracy": 0.46085710564022403, + "step": 15588 + }, + { + "epoch": 2.8900630329996293, + "grad_norm": 7.91015625, + "learning_rate": 7.109936967000371e-06, + "loss": 3.1418, + "mean_token_accuracy": 0.4704569481968754, + "step": 15589 + }, + { + "epoch": 2.8902484241750095, + "grad_norm": 8.046875, + "learning_rate": 7.109751575824991e-06, + "loss": 3.1488, + "mean_token_accuracy": 0.48166926677067085, + "step": 15590 + }, + { + "epoch": 2.8904338153503892, + "grad_norm": 9.390625, + "learning_rate": 7.109566184649611e-06, + "loss": 2.4945, + "mean_token_accuracy": 0.5378996527361992, + "step": 15591 + }, + { + "epoch": 2.8906192065257694, + "grad_norm": 7.96875, + "learning_rate": 7.109380793474231e-06, + "loss": 2.6331, + "mean_token_accuracy": 0.4932278396137857, + "step": 15592 + }, + { + "epoch": 2.890804597701149, + "grad_norm": 7.96484375, + "learning_rate": 7.109195402298851e-06, + "loss": 3.2704, + "mean_token_accuracy": 0.4530014261636199, + "step": 15593 + }, + { + "epoch": 2.8909899888765294, + "grad_norm": 10.1328125, + "learning_rate": 7.10901001112347e-06, + "loss": 3.0939, + "mean_token_accuracy": 0.45751850624240414, + "step": 15594 + }, + { + "epoch": 2.8911753800519095, + "grad_norm": 13.0703125, + "learning_rate": 7.108824619948092e-06, + "loss": 1.9098, + "mean_token_accuracy": 0.5603662321539417, + "step": 15595 + }, + { + "epoch": 2.8913607712272897, + "grad_norm": 10.75, + "learning_rate": 7.108639228772711e-06, + "loss": 2.6984, + "mean_token_accuracy": 0.48870776023093904, + "step": 15596 + }, + { + "epoch": 2.8915461624026695, + "grad_norm": 9.5546875, + "learning_rate": 7.108453837597331e-06, + "loss": 3.3815, + "mean_token_accuracy": 0.45513654096228867, + "step": 15597 + }, + { + "epoch": 2.8917315535780497, + "grad_norm": 10.8515625, + "learning_rate": 7.108268446421951e-06, + "loss": 2.442, + "mean_token_accuracy": 0.5482262430545661, + "step": 15598 + }, + { + "epoch": 2.89191694475343, + "grad_norm": 7.921875, + "learning_rate": 7.108083055246571e-06, + "loss": 3.6025, + "mean_token_accuracy": 0.3982974332516445, + "step": 15599 + }, + { + "epoch": 2.8921023359288096, + "grad_norm": 14.1796875, + "learning_rate": 7.107897664071191e-06, + "loss": 2.8858, + "mean_token_accuracy": 0.4919614147909968, + "step": 15600 + }, + { + "epoch": 2.89228772710419, + "grad_norm": 20.625, + "learning_rate": 7.10771227289581e-06, + "loss": 2.9847, + "mean_token_accuracy": 0.48926967226750956, + "step": 15601 + }, + { + "epoch": 2.89247311827957, + "grad_norm": 8.3515625, + "learning_rate": 7.10752688172043e-06, + "loss": 2.6511, + "mean_token_accuracy": 0.49352799518362434, + "step": 15602 + }, + { + "epoch": 2.89265850945495, + "grad_norm": 7.625, + "learning_rate": 7.107341490545051e-06, + "loss": 2.8708, + "mean_token_accuracy": 0.48746010031919745, + "step": 15603 + }, + { + "epoch": 2.89284390063033, + "grad_norm": 9.046875, + "learning_rate": 7.107156099369671e-06, + "loss": 3.1153, + "mean_token_accuracy": 0.4703951225231563, + "step": 15604 + }, + { + "epoch": 2.89302929180571, + "grad_norm": 10.765625, + "learning_rate": 7.106970708194291e-06, + "loss": 2.6005, + "mean_token_accuracy": 0.48886954358910584, + "step": 15605 + }, + { + "epoch": 2.89321468298109, + "grad_norm": 7.421875, + "learning_rate": 7.10678531701891e-06, + "loss": 2.7559, + "mean_token_accuracy": 0.4886822464211428, + "step": 15606 + }, + { + "epoch": 2.89340007415647, + "grad_norm": 7.34375, + "learning_rate": 7.10659992584353e-06, + "loss": 2.6573, + "mean_token_accuracy": 0.5344180225281602, + "step": 15607 + }, + { + "epoch": 2.8935854653318502, + "grad_norm": 13.25, + "learning_rate": 7.1064145346681504e-06, + "loss": 2.6019, + "mean_token_accuracy": 0.5141738821017673, + "step": 15608 + }, + { + "epoch": 2.8937708565072304, + "grad_norm": 8.1015625, + "learning_rate": 7.10622914349277e-06, + "loss": 3.2425, + "mean_token_accuracy": 0.4597875569044006, + "step": 15609 + }, + { + "epoch": 2.89395624768261, + "grad_norm": 7.68359375, + "learning_rate": 7.10604375231739e-06, + "loss": 3.1906, + "mean_token_accuracy": 0.45671794871794874, + "step": 15610 + }, + { + "epoch": 2.8941416388579904, + "grad_norm": 8.609375, + "learning_rate": 7.105858361142011e-06, + "loss": 3.5021, + "mean_token_accuracy": 0.4381243063263041, + "step": 15611 + }, + { + "epoch": 2.8943270300333706, + "grad_norm": 7.05078125, + "learning_rate": 7.105672969966631e-06, + "loss": 2.8265, + "mean_token_accuracy": 0.49204372492043724, + "step": 15612 + }, + { + "epoch": 2.8945124212087503, + "grad_norm": 7.84375, + "learning_rate": 7.10548757879125e-06, + "loss": 2.8022, + "mean_token_accuracy": 0.4990352146647371, + "step": 15613 + }, + { + "epoch": 2.8946978123841305, + "grad_norm": 9.5078125, + "learning_rate": 7.10530218761587e-06, + "loss": 2.844, + "mean_token_accuracy": 0.49016799062377914, + "step": 15614 + }, + { + "epoch": 2.8948832035595107, + "grad_norm": 7.72265625, + "learning_rate": 7.10511679644049e-06, + "loss": 2.8177, + "mean_token_accuracy": 0.48145810320340854, + "step": 15615 + }, + { + "epoch": 2.895068594734891, + "grad_norm": 7.50390625, + "learning_rate": 7.10493140526511e-06, + "loss": 2.7389, + "mean_token_accuracy": 0.5150162337662337, + "step": 15616 + }, + { + "epoch": 2.8952539859102706, + "grad_norm": 15.7578125, + "learning_rate": 7.10474601408973e-06, + "loss": 3.1561, + "mean_token_accuracy": 0.47682439791843156, + "step": 15617 + }, + { + "epoch": 2.895439377085651, + "grad_norm": 9.4296875, + "learning_rate": 7.104560622914349e-06, + "loss": 2.738, + "mean_token_accuracy": 0.5141163625019498, + "step": 15618 + }, + { + "epoch": 2.8956247682610305, + "grad_norm": 7.75390625, + "learning_rate": 7.10437523173897e-06, + "loss": 2.9768, + "mean_token_accuracy": 0.4658847089114576, + "step": 15619 + }, + { + "epoch": 2.8958101594364107, + "grad_norm": 7.05859375, + "learning_rate": 7.10418984056359e-06, + "loss": 2.4236, + "mean_token_accuracy": 0.5262858443331816, + "step": 15620 + }, + { + "epoch": 2.895995550611791, + "grad_norm": 7.7109375, + "learning_rate": 7.10400444938821e-06, + "loss": 3.1566, + "mean_token_accuracy": 0.4694946974422957, + "step": 15621 + }, + { + "epoch": 2.896180941787171, + "grad_norm": 12.8671875, + "learning_rate": 7.10381905821283e-06, + "loss": 2.6538, + "mean_token_accuracy": 0.5302201297875048, + "step": 15622 + }, + { + "epoch": 2.896366332962551, + "grad_norm": 8.0390625, + "learning_rate": 7.103633667037449e-06, + "loss": 3.3227, + "mean_token_accuracy": 0.46773971660475994, + "step": 15623 + }, + { + "epoch": 2.896551724137931, + "grad_norm": 7.67578125, + "learning_rate": 7.103448275862069e-06, + "loss": 3.6951, + "mean_token_accuracy": 0.4410604192355117, + "step": 15624 + }, + { + "epoch": 2.8967371153133112, + "grad_norm": 8.078125, + "learning_rate": 7.1032628846866895e-06, + "loss": 3.2513, + "mean_token_accuracy": 0.45279451731064635, + "step": 15625 + }, + { + "epoch": 2.896922506488691, + "grad_norm": 8.5390625, + "learning_rate": 7.103077493511309e-06, + "loss": 3.6513, + "mean_token_accuracy": 0.4430601092896175, + "step": 15626 + }, + { + "epoch": 2.897107897664071, + "grad_norm": 7.69140625, + "learning_rate": 7.10289210233593e-06, + "loss": 2.8634, + "mean_token_accuracy": 0.4756035578144854, + "step": 15627 + }, + { + "epoch": 2.8972932888394514, + "grad_norm": 7.5703125, + "learning_rate": 7.102706711160549e-06, + "loss": 3.2001, + "mean_token_accuracy": 0.44053064958828914, + "step": 15628 + }, + { + "epoch": 2.8974786800148316, + "grad_norm": 12.953125, + "learning_rate": 7.10252131998517e-06, + "loss": 2.7398, + "mean_token_accuracy": 0.5194516053433326, + "step": 15629 + }, + { + "epoch": 2.8976640711902113, + "grad_norm": 9.4296875, + "learning_rate": 7.102335928809789e-06, + "loss": 3.0298, + "mean_token_accuracy": 0.46626865671641793, + "step": 15630 + }, + { + "epoch": 2.8978494623655915, + "grad_norm": 6.73828125, + "learning_rate": 7.102150537634409e-06, + "loss": 3.107, + "mean_token_accuracy": 0.47746071133167906, + "step": 15631 + }, + { + "epoch": 2.8980348535409712, + "grad_norm": 9.1640625, + "learning_rate": 7.101965146459029e-06, + "loss": 3.3361, + "mean_token_accuracy": 0.4469902912621359, + "step": 15632 + }, + { + "epoch": 2.8982202447163514, + "grad_norm": 8.4921875, + "learning_rate": 7.101779755283648e-06, + "loss": 3.7476, + "mean_token_accuracy": 0.4521452145214521, + "step": 15633 + }, + { + "epoch": 2.8984056358917316, + "grad_norm": 7.63671875, + "learning_rate": 7.101594364108269e-06, + "loss": 3.2099, + "mean_token_accuracy": 0.4664274730907227, + "step": 15634 + }, + { + "epoch": 2.898591027067112, + "grad_norm": 8.40625, + "learning_rate": 7.101408972932889e-06, + "loss": 2.4616, + "mean_token_accuracy": 0.5061302225925485, + "step": 15635 + }, + { + "epoch": 2.8987764182424915, + "grad_norm": 7.703125, + "learning_rate": 7.101223581757509e-06, + "loss": 3.4479, + "mean_token_accuracy": 0.4773887673231218, + "step": 15636 + }, + { + "epoch": 2.8989618094178717, + "grad_norm": 7.94140625, + "learning_rate": 7.1010381905821294e-06, + "loss": 2.3262, + "mean_token_accuracy": 0.5262515262515263, + "step": 15637 + }, + { + "epoch": 2.899147200593252, + "grad_norm": 6.88671875, + "learning_rate": 7.100852799406749e-06, + "loss": 3.1037, + "mean_token_accuracy": 0.4700877785280216, + "step": 15638 + }, + { + "epoch": 2.8993325917686317, + "grad_norm": 7.61328125, + "learning_rate": 7.100667408231369e-06, + "loss": 2.6047, + "mean_token_accuracy": 0.4934540164861807, + "step": 15639 + }, + { + "epoch": 2.899517982944012, + "grad_norm": 7.15625, + "learning_rate": 7.100482017055988e-06, + "loss": 2.4837, + "mean_token_accuracy": 0.5245454545454545, + "step": 15640 + }, + { + "epoch": 2.899703374119392, + "grad_norm": 7.08984375, + "learning_rate": 7.100296625880608e-06, + "loss": 3.8267, + "mean_token_accuracy": 0.4403852033017426, + "step": 15641 + }, + { + "epoch": 2.8998887652947722, + "grad_norm": 8.203125, + "learning_rate": 7.1001112347052285e-06, + "loss": 3.2386, + "mean_token_accuracy": 0.4633674315731269, + "step": 15642 + }, + { + "epoch": 2.900074156470152, + "grad_norm": 6.78125, + "learning_rate": 7.099925843529849e-06, + "loss": 2.8229, + "mean_token_accuracy": 0.5052127359819667, + "step": 15643 + }, + { + "epoch": 2.900259547645532, + "grad_norm": 7.15625, + "learning_rate": 7.099740452354469e-06, + "loss": 2.8255, + "mean_token_accuracy": 0.46617697300990774, + "step": 15644 + }, + { + "epoch": 2.900444938820912, + "grad_norm": 7.859375, + "learning_rate": 7.099555061179088e-06, + "loss": 3.5256, + "mean_token_accuracy": 0.43240248226950356, + "step": 15645 + }, + { + "epoch": 2.900630329996292, + "grad_norm": 8.0, + "learning_rate": 7.099369670003709e-06, + "loss": 3.9688, + "mean_token_accuracy": 0.400925466864981, + "step": 15646 + }, + { + "epoch": 2.9008157211716723, + "grad_norm": 7.2109375, + "learning_rate": 7.099184278828328e-06, + "loss": 3.0766, + "mean_token_accuracy": 0.47669652345529073, + "step": 15647 + }, + { + "epoch": 2.9010011123470525, + "grad_norm": 6.6015625, + "learning_rate": 7.098998887652948e-06, + "loss": 2.5819, + "mean_token_accuracy": 0.5340175642789123, + "step": 15648 + }, + { + "epoch": 2.9011865035224322, + "grad_norm": 7.58984375, + "learning_rate": 7.098813496477568e-06, + "loss": 3.1715, + "mean_token_accuracy": 0.4451233059320151, + "step": 15649 + }, + { + "epoch": 2.9013718946978124, + "grad_norm": 7.19140625, + "learning_rate": 7.098628105302187e-06, + "loss": 2.5984, + "mean_token_accuracy": 0.5097444781290602, + "step": 15650 + }, + { + "epoch": 2.901557285873192, + "grad_norm": 8.390625, + "learning_rate": 7.098442714126809e-06, + "loss": 2.9536, + "mean_token_accuracy": 0.46746226030191756, + "step": 15651 + }, + { + "epoch": 2.9017426770485724, + "grad_norm": 7.3828125, + "learning_rate": 7.098257322951428e-06, + "loss": 2.9524, + "mean_token_accuracy": 0.4646727351538795, + "step": 15652 + }, + { + "epoch": 2.9019280682239526, + "grad_norm": 8.1796875, + "learning_rate": 7.098071931776048e-06, + "loss": 3.007, + "mean_token_accuracy": 0.45636172450052576, + "step": 15653 + }, + { + "epoch": 2.9021134593993327, + "grad_norm": 8.5390625, + "learning_rate": 7.0978865406006685e-06, + "loss": 3.7691, + "mean_token_accuracy": 0.42691256830601093, + "step": 15654 + }, + { + "epoch": 2.9022988505747125, + "grad_norm": 10.2890625, + "learning_rate": 7.097701149425288e-06, + "loss": 2.8142, + "mean_token_accuracy": 0.47863888472492877, + "step": 15655 + }, + { + "epoch": 2.9024842417500927, + "grad_norm": 6.9765625, + "learning_rate": 7.097515758249908e-06, + "loss": 2.9332, + "mean_token_accuracy": 0.4742959856201318, + "step": 15656 + }, + { + "epoch": 2.902669632925473, + "grad_norm": 9.2109375, + "learning_rate": 7.097330367074527e-06, + "loss": 3.0579, + "mean_token_accuracy": 0.4539943419102789, + "step": 15657 + }, + { + "epoch": 2.9028550241008526, + "grad_norm": 8.9296875, + "learning_rate": 7.097144975899147e-06, + "loss": 3.8363, + "mean_token_accuracy": 0.44075321494182484, + "step": 15658 + }, + { + "epoch": 2.903040415276233, + "grad_norm": 8.078125, + "learning_rate": 7.096959584723768e-06, + "loss": 3.1466, + "mean_token_accuracy": 0.4855249891977531, + "step": 15659 + }, + { + "epoch": 2.903225806451613, + "grad_norm": 8.1328125, + "learning_rate": 7.096774193548388e-06, + "loss": 3.2449, + "mean_token_accuracy": 0.4701403404001194, + "step": 15660 + }, + { + "epoch": 2.903411197626993, + "grad_norm": 8.3984375, + "learning_rate": 7.096588802373008e-06, + "loss": 2.1974, + "mean_token_accuracy": 0.5418590335487843, + "step": 15661 + }, + { + "epoch": 2.903596588802373, + "grad_norm": 8.8984375, + "learning_rate": 7.096403411197627e-06, + "loss": 2.7801, + "mean_token_accuracy": 0.47370671227020095, + "step": 15662 + }, + { + "epoch": 2.903781979977753, + "grad_norm": 10.8125, + "learning_rate": 7.096218020022248e-06, + "loss": 2.5378, + "mean_token_accuracy": 0.5436564309911485, + "step": 15663 + }, + { + "epoch": 2.903967371153133, + "grad_norm": 7.65234375, + "learning_rate": 7.0960326288468675e-06, + "loss": 2.6963, + "mean_token_accuracy": 0.5021338724168913, + "step": 15664 + }, + { + "epoch": 2.904152762328513, + "grad_norm": 8.140625, + "learning_rate": 7.095847237671487e-06, + "loss": 2.8906, + "mean_token_accuracy": 0.49257278669043375, + "step": 15665 + }, + { + "epoch": 2.9043381535038932, + "grad_norm": 7.953125, + "learning_rate": 7.095661846496107e-06, + "loss": 2.6803, + "mean_token_accuracy": 0.5120320855614974, + "step": 15666 + }, + { + "epoch": 2.9045235446792734, + "grad_norm": 9.828125, + "learning_rate": 7.095476455320728e-06, + "loss": 3.0065, + "mean_token_accuracy": 0.48968481375358164, + "step": 15667 + }, + { + "epoch": 2.904708935854653, + "grad_norm": 8.234375, + "learning_rate": 7.095291064145348e-06, + "loss": 2.8933, + "mean_token_accuracy": 0.5061124694376528, + "step": 15668 + }, + { + "epoch": 2.9048943270300334, + "grad_norm": 8.0859375, + "learning_rate": 7.095105672969967e-06, + "loss": 2.711, + "mean_token_accuracy": 0.5133070772168516, + "step": 15669 + }, + { + "epoch": 2.9050797182054136, + "grad_norm": 8.953125, + "learning_rate": 7.094920281794587e-06, + "loss": 2.6645, + "mean_token_accuracy": 0.5021128511061397, + "step": 15670 + }, + { + "epoch": 2.9052651093807933, + "grad_norm": 8.7109375, + "learning_rate": 7.094734890619207e-06, + "loss": 2.7712, + "mean_token_accuracy": 0.511797325214744, + "step": 15671 + }, + { + "epoch": 2.9054505005561735, + "grad_norm": 8.9375, + "learning_rate": 7.094549499443827e-06, + "loss": 2.28, + "mean_token_accuracy": 0.5417822661191877, + "step": 15672 + }, + { + "epoch": 2.9056358917315537, + "grad_norm": 9.2734375, + "learning_rate": 7.094364108268447e-06, + "loss": 3.3647, + "mean_token_accuracy": 0.4733620949132136, + "step": 15673 + }, + { + "epoch": 2.905821282906934, + "grad_norm": 13.4375, + "learning_rate": 7.0941787170930664e-06, + "loss": 2.5466, + "mean_token_accuracy": 0.514533258803801, + "step": 15674 + }, + { + "epoch": 2.9060066740823136, + "grad_norm": 10.3359375, + "learning_rate": 7.093993325917688e-06, + "loss": 2.9538, + "mean_token_accuracy": 0.46054003940201643, + "step": 15675 + }, + { + "epoch": 2.906192065257694, + "grad_norm": 7.4296875, + "learning_rate": 7.093807934742307e-06, + "loss": 2.5765, + "mean_token_accuracy": 0.4891860465116279, + "step": 15676 + }, + { + "epoch": 2.9063774564330735, + "grad_norm": 10.6796875, + "learning_rate": 7.093622543566927e-06, + "loss": 2.5058, + "mean_token_accuracy": 0.5269845802398629, + "step": 15677 + }, + { + "epoch": 2.9065628476084537, + "grad_norm": 9.53125, + "learning_rate": 7.093437152391547e-06, + "loss": 3.4802, + "mean_token_accuracy": 0.44841562269712604, + "step": 15678 + }, + { + "epoch": 2.906748238783834, + "grad_norm": 7.94921875, + "learning_rate": 7.093251761216166e-06, + "loss": 3.0488, + "mean_token_accuracy": 0.461505376344086, + "step": 15679 + }, + { + "epoch": 2.906933629959214, + "grad_norm": 8.0, + "learning_rate": 7.093066370040787e-06, + "loss": 3.2182, + "mean_token_accuracy": 0.46346220029826773, + "step": 15680 + }, + { + "epoch": 2.907119021134594, + "grad_norm": 11.7265625, + "learning_rate": 7.0928809788654065e-06, + "loss": 3.146, + "mean_token_accuracy": 0.5066336200612335, + "step": 15681 + }, + { + "epoch": 2.907304412309974, + "grad_norm": 8.0078125, + "learning_rate": 7.092695587690026e-06, + "loss": 2.6054, + "mean_token_accuracy": 0.5064624222115844, + "step": 15682 + }, + { + "epoch": 2.9074898034853542, + "grad_norm": 9.390625, + "learning_rate": 7.092510196514647e-06, + "loss": 3.2638, + "mean_token_accuracy": 0.47739478566546234, + "step": 15683 + }, + { + "epoch": 2.907675194660734, + "grad_norm": 10.59375, + "learning_rate": 7.092324805339267e-06, + "loss": 3.5701, + "mean_token_accuracy": 0.4445576655859194, + "step": 15684 + }, + { + "epoch": 2.907860585836114, + "grad_norm": 9.28125, + "learning_rate": 7.092139414163887e-06, + "loss": 3.027, + "mean_token_accuracy": 0.4532773564463705, + "step": 15685 + }, + { + "epoch": 2.9080459770114944, + "grad_norm": 9.1796875, + "learning_rate": 7.091954022988506e-06, + "loss": 2.8795, + "mean_token_accuracy": 0.5132016086728449, + "step": 15686 + }, + { + "epoch": 2.9082313681868746, + "grad_norm": 9.7109375, + "learning_rate": 7.091768631813126e-06, + "loss": 2.5916, + "mean_token_accuracy": 0.49929758838679467, + "step": 15687 + }, + { + "epoch": 2.9084167593622543, + "grad_norm": 10.015625, + "learning_rate": 7.091583240637746e-06, + "loss": 2.8945, + "mean_token_accuracy": 0.46384009691096306, + "step": 15688 + }, + { + "epoch": 2.9086021505376345, + "grad_norm": 7.76953125, + "learning_rate": 7.091397849462366e-06, + "loss": 2.6458, + "mean_token_accuracy": 0.4937413073713491, + "step": 15689 + }, + { + "epoch": 2.9087875417130142, + "grad_norm": 11.5, + "learning_rate": 7.091212458286986e-06, + "loss": 3.2189, + "mean_token_accuracy": 0.49583484244838827, + "step": 15690 + }, + { + "epoch": 2.9089729328883944, + "grad_norm": 10.0703125, + "learning_rate": 7.091027067111606e-06, + "loss": 2.7098, + "mean_token_accuracy": 0.4876215165262476, + "step": 15691 + }, + { + "epoch": 2.9091583240637746, + "grad_norm": 7.28125, + "learning_rate": 7.090841675936227e-06, + "loss": 2.4582, + "mean_token_accuracy": 0.5132585000813404, + "step": 15692 + }, + { + "epoch": 2.909343715239155, + "grad_norm": 8.359375, + "learning_rate": 7.0906562847608465e-06, + "loss": 2.6627, + "mean_token_accuracy": 0.5062416406598306, + "step": 15693 + }, + { + "epoch": 2.9095291064145345, + "grad_norm": 8.65625, + "learning_rate": 7.090470893585466e-06, + "loss": 2.3882, + "mean_token_accuracy": 0.5252609603340292, + "step": 15694 + }, + { + "epoch": 2.9097144975899147, + "grad_norm": 9.7265625, + "learning_rate": 7.090285502410086e-06, + "loss": 3.2907, + "mean_token_accuracy": 0.500881390078066, + "step": 15695 + }, + { + "epoch": 2.909899888765295, + "grad_norm": 8.0078125, + "learning_rate": 7.090100111234705e-06, + "loss": 3.5808, + "mean_token_accuracy": 0.4350093109869646, + "step": 15696 + }, + { + "epoch": 2.9100852799406747, + "grad_norm": 8.0703125, + "learning_rate": 7.089914720059326e-06, + "loss": 2.6988, + "mean_token_accuracy": 0.5089573398633844, + "step": 15697 + }, + { + "epoch": 2.910270671116055, + "grad_norm": 8.1328125, + "learning_rate": 7.0897293288839455e-06, + "loss": 3.0188, + "mean_token_accuracy": 0.47078154022003144, + "step": 15698 + }, + { + "epoch": 2.910456062291435, + "grad_norm": 9.078125, + "learning_rate": 7.089543937708566e-06, + "loss": 3.1678, + "mean_token_accuracy": 0.44476553264153423, + "step": 15699 + }, + { + "epoch": 2.9106414534668152, + "grad_norm": 7.98046875, + "learning_rate": 7.089358546533186e-06, + "loss": 3.8545, + "mean_token_accuracy": 0.4150114990591679, + "step": 15700 + }, + { + "epoch": 2.910826844642195, + "grad_norm": 9.1796875, + "learning_rate": 7.089173155357806e-06, + "loss": 3.9198, + "mean_token_accuracy": 0.4080142764438676, + "step": 15701 + }, + { + "epoch": 2.911012235817575, + "grad_norm": 9.9453125, + "learning_rate": 7.088987764182426e-06, + "loss": 3.1579, + "mean_token_accuracy": 0.4416674352116573, + "step": 15702 + }, + { + "epoch": 2.911197626992955, + "grad_norm": 10.3359375, + "learning_rate": 7.0888023730070454e-06, + "loss": 2.7957, + "mean_token_accuracy": 0.49638802889576883, + "step": 15703 + }, + { + "epoch": 2.911383018168335, + "grad_norm": 7.796875, + "learning_rate": 7.088616981831665e-06, + "loss": 3.6298, + "mean_token_accuracy": 0.4284026775167345, + "step": 15704 + }, + { + "epoch": 2.9115684093437153, + "grad_norm": 11.484375, + "learning_rate": 7.088431590656285e-06, + "loss": 3.1702, + "mean_token_accuracy": 0.44798785117691725, + "step": 15705 + }, + { + "epoch": 2.9117538005190955, + "grad_norm": 9.890625, + "learning_rate": 7.088246199480905e-06, + "loss": 3.2304, + "mean_token_accuracy": 0.45840130505709625, + "step": 15706 + }, + { + "epoch": 2.9119391916944752, + "grad_norm": 8.4453125, + "learning_rate": 7.088060808305526e-06, + "loss": 3.2358, + "mean_token_accuracy": 0.47026413871333655, + "step": 15707 + }, + { + "epoch": 2.9121245828698554, + "grad_norm": 10.25, + "learning_rate": 7.087875417130145e-06, + "loss": 2.7427, + "mean_token_accuracy": 0.49605878423513694, + "step": 15708 + }, + { + "epoch": 2.912309974045235, + "grad_norm": 10.21875, + "learning_rate": 7.087690025954765e-06, + "loss": 3.0675, + "mean_token_accuracy": 0.4659731252709146, + "step": 15709 + }, + { + "epoch": 2.9124953652206154, + "grad_norm": 7.0859375, + "learning_rate": 7.0875046347793855e-06, + "loss": 2.4612, + "mean_token_accuracy": 0.5276872964169381, + "step": 15710 + }, + { + "epoch": 2.9126807563959956, + "grad_norm": 7.93359375, + "learning_rate": 7.087319243604005e-06, + "loss": 3.0221, + "mean_token_accuracy": 0.4718160229971254, + "step": 15711 + }, + { + "epoch": 2.9128661475713757, + "grad_norm": 11.5703125, + "learning_rate": 7.087133852428625e-06, + "loss": 3.1443, + "mean_token_accuracy": 0.451984126984127, + "step": 15712 + }, + { + "epoch": 2.913051538746756, + "grad_norm": 10.0625, + "learning_rate": 7.0869484612532444e-06, + "loss": 3.3893, + "mean_token_accuracy": 0.43479289940828403, + "step": 15713 + }, + { + "epoch": 2.9132369299221357, + "grad_norm": 7.71484375, + "learning_rate": 7.086763070077864e-06, + "loss": 3.2152, + "mean_token_accuracy": 0.45918114143920596, + "step": 15714 + }, + { + "epoch": 2.913422321097516, + "grad_norm": 9.8984375, + "learning_rate": 7.0865776789024846e-06, + "loss": 2.4842, + "mean_token_accuracy": 0.5282472686525352, + "step": 15715 + }, + { + "epoch": 2.9136077122728956, + "grad_norm": 10.0859375, + "learning_rate": 7.086392287727105e-06, + "loss": 2.663, + "mean_token_accuracy": 0.484525748653051, + "step": 15716 + }, + { + "epoch": 2.913793103448276, + "grad_norm": 10.25, + "learning_rate": 7.086206896551725e-06, + "loss": 3.3332, + "mean_token_accuracy": 0.4789722785665991, + "step": 15717 + }, + { + "epoch": 2.913978494623656, + "grad_norm": 12.5234375, + "learning_rate": 7.086021505376345e-06, + "loss": 2.9367, + "mean_token_accuracy": 0.4922727917198355, + "step": 15718 + }, + { + "epoch": 2.914163885799036, + "grad_norm": 12.7734375, + "learning_rate": 7.085836114200965e-06, + "loss": 3.3091, + "mean_token_accuracy": 0.4472979552093476, + "step": 15719 + }, + { + "epoch": 2.914349276974416, + "grad_norm": 9.1875, + "learning_rate": 7.0856507230255845e-06, + "loss": 3.1505, + "mean_token_accuracy": 0.4708854944297691, + "step": 15720 + }, + { + "epoch": 2.914534668149796, + "grad_norm": 7.90625, + "learning_rate": 7.085465331850204e-06, + "loss": 2.6392, + "mean_token_accuracy": 0.5240837116598384, + "step": 15721 + }, + { + "epoch": 2.914720059325176, + "grad_norm": 13.84375, + "learning_rate": 7.085279940674824e-06, + "loss": 3.478, + "mean_token_accuracy": 0.4285520423395305, + "step": 15722 + }, + { + "epoch": 2.914905450500556, + "grad_norm": 10.625, + "learning_rate": 7.085094549499444e-06, + "loss": 2.8776, + "mean_token_accuracy": 0.48238644880827464, + "step": 15723 + }, + { + "epoch": 2.9150908416759362, + "grad_norm": 8.328125, + "learning_rate": 7.084909158324065e-06, + "loss": 3.1688, + "mean_token_accuracy": 0.46766917293233085, + "step": 15724 + }, + { + "epoch": 2.9152762328513164, + "grad_norm": 7.66796875, + "learning_rate": 7.084723767148684e-06, + "loss": 2.5762, + "mean_token_accuracy": 0.5095231275955893, + "step": 15725 + }, + { + "epoch": 2.915461624026696, + "grad_norm": 8.6640625, + "learning_rate": 7.084538375973304e-06, + "loss": 3.3095, + "mean_token_accuracy": 0.46163793103448275, + "step": 15726 + }, + { + "epoch": 2.9156470152020764, + "grad_norm": 10.125, + "learning_rate": 7.0843529847979245e-06, + "loss": 2.7697, + "mean_token_accuracy": 0.4928736752344987, + "step": 15727 + }, + { + "epoch": 2.9158324063774566, + "grad_norm": 11.6875, + "learning_rate": 7.084167593622544e-06, + "loss": 1.9251, + "mean_token_accuracy": 0.5755448296036273, + "step": 15728 + }, + { + "epoch": 2.9160177975528363, + "grad_norm": 7.30078125, + "learning_rate": 7.083982202447164e-06, + "loss": 2.6348, + "mean_token_accuracy": 0.5233333333333333, + "step": 15729 + }, + { + "epoch": 2.9162031887282165, + "grad_norm": 10.90625, + "learning_rate": 7.0837968112717835e-06, + "loss": 3.4017, + "mean_token_accuracy": 0.4537117903930131, + "step": 15730 + }, + { + "epoch": 2.9163885799035967, + "grad_norm": 11.8671875, + "learning_rate": 7.083611420096403e-06, + "loss": 2.9499, + "mean_token_accuracy": 0.4781433794309331, + "step": 15731 + }, + { + "epoch": 2.916573971078977, + "grad_norm": 7.734375, + "learning_rate": 7.0834260289210244e-06, + "loss": 2.9926, + "mean_token_accuracy": 0.48730538922155686, + "step": 15732 + }, + { + "epoch": 2.9167593622543566, + "grad_norm": 10.703125, + "learning_rate": 7.083240637745644e-06, + "loss": 2.9316, + "mean_token_accuracy": 0.4698461878942127, + "step": 15733 + }, + { + "epoch": 2.916944753429737, + "grad_norm": 10.90625, + "learning_rate": 7.083055246570264e-06, + "loss": 3.2342, + "mean_token_accuracy": 0.43428154631655724, + "step": 15734 + }, + { + "epoch": 2.9171301446051165, + "grad_norm": 10.1328125, + "learning_rate": 7.082869855394884e-06, + "loss": 2.8833, + "mean_token_accuracy": 0.48706624605678234, + "step": 15735 + }, + { + "epoch": 2.9173155357804967, + "grad_norm": 8.453125, + "learning_rate": 7.082684464219504e-06, + "loss": 3.3451, + "mean_token_accuracy": 0.47602996254681645, + "step": 15736 + }, + { + "epoch": 2.917500926955877, + "grad_norm": 9.625, + "learning_rate": 7.0824990730441235e-06, + "loss": 3.0043, + "mean_token_accuracy": 0.47522236340533675, + "step": 15737 + }, + { + "epoch": 2.917686318131257, + "grad_norm": 12.9765625, + "learning_rate": 7.082313681868743e-06, + "loss": 3.549, + "mean_token_accuracy": 0.44315967259971417, + "step": 15738 + }, + { + "epoch": 2.917871709306637, + "grad_norm": 10.3984375, + "learning_rate": 7.082128290693363e-06, + "loss": 2.6227, + "mean_token_accuracy": 0.4967019290603609, + "step": 15739 + }, + { + "epoch": 2.918057100482017, + "grad_norm": 7.73828125, + "learning_rate": 7.081942899517984e-06, + "loss": 3.1772, + "mean_token_accuracy": 0.4609550894655765, + "step": 15740 + }, + { + "epoch": 2.9182424916573972, + "grad_norm": 10.0, + "learning_rate": 7.081757508342604e-06, + "loss": 3.4737, + "mean_token_accuracy": 0.4441242395132885, + "step": 15741 + }, + { + "epoch": 2.918427882832777, + "grad_norm": 8.21875, + "learning_rate": 7.0815721171672234e-06, + "loss": 3.0331, + "mean_token_accuracy": 0.4755825087775295, + "step": 15742 + }, + { + "epoch": 2.918613274008157, + "grad_norm": 7.56640625, + "learning_rate": 7.081386725991843e-06, + "loss": 3.0874, + "mean_token_accuracy": 0.46127678875155914, + "step": 15743 + }, + { + "epoch": 2.9187986651835374, + "grad_norm": 9.203125, + "learning_rate": 7.0812013348164636e-06, + "loss": 3.0048, + "mean_token_accuracy": 0.49566947565543074, + "step": 15744 + }, + { + "epoch": 2.9189840563589176, + "grad_norm": 8.9140625, + "learning_rate": 7.081015943641083e-06, + "loss": 3.1536, + "mean_token_accuracy": 0.45977179637214743, + "step": 15745 + }, + { + "epoch": 2.9191694475342973, + "grad_norm": 9.6875, + "learning_rate": 7.080830552465703e-06, + "loss": 2.6859, + "mean_token_accuracy": 0.4985284708893154, + "step": 15746 + }, + { + "epoch": 2.9193548387096775, + "grad_norm": 7.4453125, + "learning_rate": 7.0806451612903225e-06, + "loss": 2.4681, + "mean_token_accuracy": 0.49156400642742365, + "step": 15747 + }, + { + "epoch": 2.9195402298850572, + "grad_norm": 7.6015625, + "learning_rate": 7.080459770114944e-06, + "loss": 2.5623, + "mean_token_accuracy": 0.49905587026546705, + "step": 15748 + }, + { + "epoch": 2.9197256210604374, + "grad_norm": 8.1875, + "learning_rate": 7.0802743789395635e-06, + "loss": 2.7275, + "mean_token_accuracy": 0.503462204270052, + "step": 15749 + }, + { + "epoch": 2.9199110122358176, + "grad_norm": 6.78515625, + "learning_rate": 7.080088987764183e-06, + "loss": 2.9298, + "mean_token_accuracy": 0.47989093387866394, + "step": 15750 + }, + { + "epoch": 2.920096403411198, + "grad_norm": 6.671875, + "learning_rate": 7.079903596588803e-06, + "loss": 2.3778, + "mean_token_accuracy": 0.530525372465554, + "step": 15751 + }, + { + "epoch": 2.9202817945865776, + "grad_norm": 8.6328125, + "learning_rate": 7.079718205413422e-06, + "loss": 2.6491, + "mean_token_accuracy": 0.5050431401142301, + "step": 15752 + }, + { + "epoch": 2.9204671857619577, + "grad_norm": 7.203125, + "learning_rate": 7.079532814238043e-06, + "loss": 2.4091, + "mean_token_accuracy": 0.5245714285714286, + "step": 15753 + }, + { + "epoch": 2.920652576937338, + "grad_norm": 7.01171875, + "learning_rate": 7.0793474230626625e-06, + "loss": 2.8524, + "mean_token_accuracy": 0.49049276914836637, + "step": 15754 + }, + { + "epoch": 2.9208379681127177, + "grad_norm": 7.390625, + "learning_rate": 7.079162031887282e-06, + "loss": 2.6931, + "mean_token_accuracy": 0.48771390960947786, + "step": 15755 + }, + { + "epoch": 2.921023359288098, + "grad_norm": 7.1484375, + "learning_rate": 7.0789766407119035e-06, + "loss": 2.5714, + "mean_token_accuracy": 0.4988766702140239, + "step": 15756 + }, + { + "epoch": 2.921208750463478, + "grad_norm": 7.265625, + "learning_rate": 7.078791249536523e-06, + "loss": 3.511, + "mean_token_accuracy": 0.48579676674364897, + "step": 15757 + }, + { + "epoch": 2.9213941416388582, + "grad_norm": 9.53125, + "learning_rate": 7.078605858361143e-06, + "loss": 2.9755, + "mean_token_accuracy": 0.4838300570703868, + "step": 15758 + }, + { + "epoch": 2.921579532814238, + "grad_norm": 7.125, + "learning_rate": 7.0784204671857625e-06, + "loss": 2.4836, + "mean_token_accuracy": 0.5111083059833376, + "step": 15759 + }, + { + "epoch": 2.921764923989618, + "grad_norm": 8.140625, + "learning_rate": 7.078235076010382e-06, + "loss": 2.8941, + "mean_token_accuracy": 0.46477570525666717, + "step": 15760 + }, + { + "epoch": 2.921950315164998, + "grad_norm": 11.2578125, + "learning_rate": 7.078049684835003e-06, + "loss": 2.9741, + "mean_token_accuracy": 0.49598163030998854, + "step": 15761 + }, + { + "epoch": 2.922135706340378, + "grad_norm": 9.515625, + "learning_rate": 7.077864293659622e-06, + "loss": 3.3953, + "mean_token_accuracy": 0.4868884540117417, + "step": 15762 + }, + { + "epoch": 2.9223210975157583, + "grad_norm": 7.75, + "learning_rate": 7.077678902484242e-06, + "loss": 3.0091, + "mean_token_accuracy": 0.4638176010573852, + "step": 15763 + }, + { + "epoch": 2.9225064886911385, + "grad_norm": 11.171875, + "learning_rate": 7.077493511308862e-06, + "loss": 2.6826, + "mean_token_accuracy": 0.5095168374816984, + "step": 15764 + }, + { + "epoch": 2.9226918798665182, + "grad_norm": 8.5625, + "learning_rate": 7.077308120133483e-06, + "loss": 3.1169, + "mean_token_accuracy": 0.4585965315263581, + "step": 15765 + }, + { + "epoch": 2.9228772710418984, + "grad_norm": 7.328125, + "learning_rate": 7.0771227289581025e-06, + "loss": 3.1019, + "mean_token_accuracy": 0.4423810735181888, + "step": 15766 + }, + { + "epoch": 2.9230626622172786, + "grad_norm": 7.21484375, + "learning_rate": 7.076937337782722e-06, + "loss": 2.6243, + "mean_token_accuracy": 0.5051173991571343, + "step": 15767 + }, + { + "epoch": 2.9232480533926584, + "grad_norm": 10.34375, + "learning_rate": 7.076751946607342e-06, + "loss": 3.1089, + "mean_token_accuracy": 0.4505407544183593, + "step": 15768 + }, + { + "epoch": 2.9234334445680386, + "grad_norm": 7.56640625, + "learning_rate": 7.0765665554319614e-06, + "loss": 2.5792, + "mean_token_accuracy": 0.506789413118527, + "step": 15769 + }, + { + "epoch": 2.9236188357434187, + "grad_norm": 7.75390625, + "learning_rate": 7.076381164256582e-06, + "loss": 3.2366, + "mean_token_accuracy": 0.45126353790613716, + "step": 15770 + }, + { + "epoch": 2.923804226918799, + "grad_norm": 9.3203125, + "learning_rate": 7.076195773081202e-06, + "loss": 2.9261, + "mean_token_accuracy": 0.4659159159159159, + "step": 15771 + }, + { + "epoch": 2.9239896180941787, + "grad_norm": 9.7578125, + "learning_rate": 7.076010381905822e-06, + "loss": 2.7783, + "mean_token_accuracy": 0.4992810353091548, + "step": 15772 + }, + { + "epoch": 2.924175009269559, + "grad_norm": 7.5234375, + "learning_rate": 7.0758249907304426e-06, + "loss": 2.9573, + "mean_token_accuracy": 0.4654565920923034, + "step": 15773 + }, + { + "epoch": 2.9243604004449386, + "grad_norm": 8.21875, + "learning_rate": 7.075639599555062e-06, + "loss": 2.8393, + "mean_token_accuracy": 0.49641010913268235, + "step": 15774 + }, + { + "epoch": 2.924545791620319, + "grad_norm": 7.43359375, + "learning_rate": 7.075454208379682e-06, + "loss": 2.5363, + "mean_token_accuracy": 0.4935323383084577, + "step": 15775 + }, + { + "epoch": 2.924731182795699, + "grad_norm": 9.28125, + "learning_rate": 7.0752688172043015e-06, + "loss": 3.3421, + "mean_token_accuracy": 0.4594017094017094, + "step": 15776 + }, + { + "epoch": 2.924916573971079, + "grad_norm": 8.828125, + "learning_rate": 7.075083426028921e-06, + "loss": 3.0977, + "mean_token_accuracy": 0.47050788526015147, + "step": 15777 + }, + { + "epoch": 2.925101965146459, + "grad_norm": 7.57421875, + "learning_rate": 7.074898034853541e-06, + "loss": 2.7739, + "mean_token_accuracy": 0.5073227885178676, + "step": 15778 + }, + { + "epoch": 2.925287356321839, + "grad_norm": 8.1875, + "learning_rate": 7.074712643678161e-06, + "loss": 3.2481, + "mean_token_accuracy": 0.46366013071895423, + "step": 15779 + }, + { + "epoch": 2.925472747497219, + "grad_norm": 8.4140625, + "learning_rate": 7.074527252502782e-06, + "loss": 3.1468, + "mean_token_accuracy": 0.4652077278167208, + "step": 15780 + }, + { + "epoch": 2.925658138672599, + "grad_norm": 7.2109375, + "learning_rate": 7.074341861327401e-06, + "loss": 3.0484, + "mean_token_accuracy": 0.4479315263908702, + "step": 15781 + }, + { + "epoch": 2.9258435298479792, + "grad_norm": 6.8125, + "learning_rate": 7.074156470152022e-06, + "loss": 2.6744, + "mean_token_accuracy": 0.484525748653051, + "step": 15782 + }, + { + "epoch": 2.9260289210233594, + "grad_norm": 8.75, + "learning_rate": 7.0739710789766415e-06, + "loss": 2.7703, + "mean_token_accuracy": 0.48855258051998446, + "step": 15783 + }, + { + "epoch": 2.926214312198739, + "grad_norm": 7.75, + "learning_rate": 7.073785687801261e-06, + "loss": 3.5682, + "mean_token_accuracy": 0.44030163385002097, + "step": 15784 + }, + { + "epoch": 2.9263997033741194, + "grad_norm": 7.2421875, + "learning_rate": 7.073600296625881e-06, + "loss": 2.7126, + "mean_token_accuracy": 0.5106864186362094, + "step": 15785 + }, + { + "epoch": 2.9265850945494996, + "grad_norm": 7.3984375, + "learning_rate": 7.0734149054505005e-06, + "loss": 3.0821, + "mean_token_accuracy": 0.46185714285714285, + "step": 15786 + }, + { + "epoch": 2.9267704857248793, + "grad_norm": 8.21875, + "learning_rate": 7.073229514275121e-06, + "loss": 2.8343, + "mean_token_accuracy": 0.49259478672985785, + "step": 15787 + }, + { + "epoch": 2.9269558769002595, + "grad_norm": 9.015625, + "learning_rate": 7.0730441230997415e-06, + "loss": 2.835, + "mean_token_accuracy": 0.4790157004830918, + "step": 15788 + }, + { + "epoch": 2.9271412680756397, + "grad_norm": 7.18359375, + "learning_rate": 7.072858731924361e-06, + "loss": 3.2031, + "mean_token_accuracy": 0.44110830172777077, + "step": 15789 + }, + { + "epoch": 2.92732665925102, + "grad_norm": 8.3125, + "learning_rate": 7.072673340748981e-06, + "loss": 3.371, + "mean_token_accuracy": 0.4430113556708257, + "step": 15790 + }, + { + "epoch": 2.9275120504263996, + "grad_norm": 7.82421875, + "learning_rate": 7.072487949573601e-06, + "loss": 2.3814, + "mean_token_accuracy": 0.5577318472026837, + "step": 15791 + }, + { + "epoch": 2.92769744160178, + "grad_norm": 10.8984375, + "learning_rate": 7.072302558398221e-06, + "loss": 3.3322, + "mean_token_accuracy": 0.4778652906029332, + "step": 15792 + }, + { + "epoch": 2.9278828327771595, + "grad_norm": 8.5703125, + "learning_rate": 7.0721171672228405e-06, + "loss": 3.1129, + "mean_token_accuracy": 0.514813073124853, + "step": 15793 + }, + { + "epoch": 2.9280682239525397, + "grad_norm": 9.1640625, + "learning_rate": 7.07193177604746e-06, + "loss": 3.5527, + "mean_token_accuracy": 0.4112914811090361, + "step": 15794 + }, + { + "epoch": 2.92825361512792, + "grad_norm": 15.234375, + "learning_rate": 7.07174638487208e-06, + "loss": 2.8418, + "mean_token_accuracy": 0.4838897798512903, + "step": 15795 + }, + { + "epoch": 2.9284390063033, + "grad_norm": 8.3984375, + "learning_rate": 7.071560993696701e-06, + "loss": 3.201, + "mean_token_accuracy": 0.4392664238161035, + "step": 15796 + }, + { + "epoch": 2.92862439747868, + "grad_norm": 7.6640625, + "learning_rate": 7.071375602521321e-06, + "loss": 3.0161, + "mean_token_accuracy": 0.4551588762115078, + "step": 15797 + }, + { + "epoch": 2.92880978865406, + "grad_norm": 7.78125, + "learning_rate": 7.0711902113459404e-06, + "loss": 2.9503, + "mean_token_accuracy": 0.4650073206442167, + "step": 15798 + }, + { + "epoch": 2.9289951798294402, + "grad_norm": 10.1796875, + "learning_rate": 7.071004820170561e-06, + "loss": 2.9974, + "mean_token_accuracy": 0.47019471986620476, + "step": 15799 + }, + { + "epoch": 2.92918057100482, + "grad_norm": 7.43359375, + "learning_rate": 7.070819428995181e-06, + "loss": 2.9785, + "mean_token_accuracy": 0.4455039227519614, + "step": 15800 + }, + { + "epoch": 2.9293659621802, + "grad_norm": 7.81640625, + "learning_rate": 7.0706340378198e-06, + "loss": 3.6277, + "mean_token_accuracy": 0.433665008291874, + "step": 15801 + }, + { + "epoch": 2.9295513533555804, + "grad_norm": 7.5234375, + "learning_rate": 7.07044864664442e-06, + "loss": 3.4402, + "mean_token_accuracy": 0.46136631330977623, + "step": 15802 + }, + { + "epoch": 2.9297367445309606, + "grad_norm": 7.984375, + "learning_rate": 7.0702632554690395e-06, + "loss": 2.8516, + "mean_token_accuracy": 0.5092699884125145, + "step": 15803 + }, + { + "epoch": 2.9299221357063403, + "grad_norm": 7.171875, + "learning_rate": 7.070077864293661e-06, + "loss": 2.3194, + "mean_token_accuracy": 0.5531094527363184, + "step": 15804 + }, + { + "epoch": 2.9301075268817205, + "grad_norm": 9.2265625, + "learning_rate": 7.0698924731182805e-06, + "loss": 3.6322, + "mean_token_accuracy": 0.4616822429906542, + "step": 15805 + }, + { + "epoch": 2.9302929180571002, + "grad_norm": 7.98828125, + "learning_rate": 7.0697070819429e-06, + "loss": 2.599, + "mean_token_accuracy": 0.5279367011564212, + "step": 15806 + }, + { + "epoch": 2.9304783092324804, + "grad_norm": 6.75390625, + "learning_rate": 7.06952169076752e-06, + "loss": 2.3356, + "mean_token_accuracy": 0.5168043292509257, + "step": 15807 + }, + { + "epoch": 2.9306637004078606, + "grad_norm": 9.90625, + "learning_rate": 7.06933629959214e-06, + "loss": 3.0674, + "mean_token_accuracy": 0.46885035324341684, + "step": 15808 + }, + { + "epoch": 2.930849091583241, + "grad_norm": 8.0546875, + "learning_rate": 7.06915090841676e-06, + "loss": 2.6481, + "mean_token_accuracy": 0.505949603359776, + "step": 15809 + }, + { + "epoch": 2.9310344827586206, + "grad_norm": 6.93359375, + "learning_rate": 7.0689655172413796e-06, + "loss": 2.8904, + "mean_token_accuracy": 0.48043995917904525, + "step": 15810 + }, + { + "epoch": 2.9312198739340007, + "grad_norm": 7.03125, + "learning_rate": 7.068780126065999e-06, + "loss": 2.7723, + "mean_token_accuracy": 0.48166877370417194, + "step": 15811 + }, + { + "epoch": 2.931405265109381, + "grad_norm": 11.8359375, + "learning_rate": 7.0685947348906205e-06, + "loss": 3.0394, + "mean_token_accuracy": 0.4785571142284569, + "step": 15812 + }, + { + "epoch": 2.9315906562847607, + "grad_norm": 18.90625, + "learning_rate": 7.06840934371524e-06, + "loss": 3.0292, + "mean_token_accuracy": 0.4750179985601152, + "step": 15813 + }, + { + "epoch": 2.931776047460141, + "grad_norm": 7.43359375, + "learning_rate": 7.06822395253986e-06, + "loss": 2.3933, + "mean_token_accuracy": 0.5435318998942545, + "step": 15814 + }, + { + "epoch": 2.931961438635521, + "grad_norm": 8.3125, + "learning_rate": 7.0680385613644795e-06, + "loss": 2.3059, + "mean_token_accuracy": 0.5369670100404225, + "step": 15815 + }, + { + "epoch": 2.9321468298109012, + "grad_norm": 9.6875, + "learning_rate": 7.0678531701891e-06, + "loss": 3.2994, + "mean_token_accuracy": 0.45899870522227015, + "step": 15816 + }, + { + "epoch": 2.932332220986281, + "grad_norm": 9.8046875, + "learning_rate": 7.06766777901372e-06, + "loss": 2.7043, + "mean_token_accuracy": 0.5157147174372922, + "step": 15817 + }, + { + "epoch": 2.932517612161661, + "grad_norm": 8.921875, + "learning_rate": 7.067482387838339e-06, + "loss": 2.612, + "mean_token_accuracy": 0.5244029075804777, + "step": 15818 + }, + { + "epoch": 2.932703003337041, + "grad_norm": 9.671875, + "learning_rate": 7.067296996662959e-06, + "loss": 3.0663, + "mean_token_accuracy": 0.4700064850843061, + "step": 15819 + }, + { + "epoch": 2.932888394512421, + "grad_norm": 7.5, + "learning_rate": 7.06711160548758e-06, + "loss": 2.752, + "mean_token_accuracy": 0.4924466593988475, + "step": 15820 + }, + { + "epoch": 2.9330737856878013, + "grad_norm": 9.2890625, + "learning_rate": 7.0669262143122e-06, + "loss": 3.1458, + "mean_token_accuracy": 0.46797608881298036, + "step": 15821 + }, + { + "epoch": 2.9332591768631815, + "grad_norm": 9.5859375, + "learning_rate": 7.0667408231368195e-06, + "loss": 3.012, + "mean_token_accuracy": 0.4552975713904457, + "step": 15822 + }, + { + "epoch": 2.9334445680385612, + "grad_norm": 7.7109375, + "learning_rate": 7.066555431961439e-06, + "loss": 2.8264, + "mean_token_accuracy": 0.4715383758493929, + "step": 15823 + }, + { + "epoch": 2.9336299592139414, + "grad_norm": 7.39453125, + "learning_rate": 7.066370040786059e-06, + "loss": 3.6664, + "mean_token_accuracy": 0.41659017898118406, + "step": 15824 + }, + { + "epoch": 2.9338153503893216, + "grad_norm": 7.19921875, + "learning_rate": 7.066184649610679e-06, + "loss": 3.2621, + "mean_token_accuracy": 0.4592394215318693, + "step": 15825 + }, + { + "epoch": 2.9340007415647014, + "grad_norm": 11.2265625, + "learning_rate": 7.065999258435299e-06, + "loss": 2.7286, + "mean_token_accuracy": 0.5185491493383743, + "step": 15826 + }, + { + "epoch": 2.9341861327400816, + "grad_norm": 8.1171875, + "learning_rate": 7.065813867259919e-06, + "loss": 2.1867, + "mean_token_accuracy": 0.5652795838751625, + "step": 15827 + }, + { + "epoch": 2.9343715239154617, + "grad_norm": 6.421875, + "learning_rate": 7.065628476084539e-06, + "loss": 2.8827, + "mean_token_accuracy": 0.48452183526810394, + "step": 15828 + }, + { + "epoch": 2.934556915090842, + "grad_norm": 8.7421875, + "learning_rate": 7.06544308490916e-06, + "loss": 2.4925, + "mean_token_accuracy": 0.556489413285922, + "step": 15829 + }, + { + "epoch": 2.9347423062662217, + "grad_norm": 8.671875, + "learning_rate": 7.065257693733779e-06, + "loss": 3.2094, + "mean_token_accuracy": 0.4568430070678946, + "step": 15830 + }, + { + "epoch": 2.934927697441602, + "grad_norm": 7.5859375, + "learning_rate": 7.065072302558399e-06, + "loss": 2.8042, + "mean_token_accuracy": 0.49071117561683597, + "step": 15831 + }, + { + "epoch": 2.9351130886169816, + "grad_norm": 8.90625, + "learning_rate": 7.0648869113830185e-06, + "loss": 2.8898, + "mean_token_accuracy": 0.462671905697446, + "step": 15832 + }, + { + "epoch": 2.935298479792362, + "grad_norm": 9.4921875, + "learning_rate": 7.064701520207638e-06, + "loss": 2.8209, + "mean_token_accuracy": 0.5168785021183545, + "step": 15833 + }, + { + "epoch": 2.935483870967742, + "grad_norm": 7.66796875, + "learning_rate": 7.064516129032259e-06, + "loss": 3.3073, + "mean_token_accuracy": 0.45827372436814495, + "step": 15834 + }, + { + "epoch": 2.935669262143122, + "grad_norm": 7.28515625, + "learning_rate": 7.064330737856878e-06, + "loss": 3.3815, + "mean_token_accuracy": 0.44579454000315605, + "step": 15835 + }, + { + "epoch": 2.935854653318502, + "grad_norm": 8.9609375, + "learning_rate": 7.064145346681498e-06, + "loss": 3.3173, + "mean_token_accuracy": 0.46916299559471364, + "step": 15836 + }, + { + "epoch": 2.936040044493882, + "grad_norm": 11.1796875, + "learning_rate": 7.063959955506119e-06, + "loss": 2.8192, + "mean_token_accuracy": 0.494113763890417, + "step": 15837 + }, + { + "epoch": 2.9362254356692623, + "grad_norm": 10.3515625, + "learning_rate": 7.063774564330739e-06, + "loss": 2.6047, + "mean_token_accuracy": 0.5044428434197886, + "step": 15838 + }, + { + "epoch": 2.936410826844642, + "grad_norm": 9.4140625, + "learning_rate": 7.0635891731553586e-06, + "loss": 2.9186, + "mean_token_accuracy": 0.49605939463163906, + "step": 15839 + }, + { + "epoch": 2.9365962180200222, + "grad_norm": 7.3359375, + "learning_rate": 7.063403781979978e-06, + "loss": 2.8512, + "mean_token_accuracy": 0.4851138353765324, + "step": 15840 + }, + { + "epoch": 2.9367816091954024, + "grad_norm": 10.8046875, + "learning_rate": 7.063218390804598e-06, + "loss": 2.8166, + "mean_token_accuracy": 0.4740200546946217, + "step": 15841 + }, + { + "epoch": 2.9369670003707826, + "grad_norm": 9.78125, + "learning_rate": 7.063032999629218e-06, + "loss": 2.9014, + "mean_token_accuracy": 0.49051735259966456, + "step": 15842 + }, + { + "epoch": 2.9371523915461624, + "grad_norm": 8.6953125, + "learning_rate": 7.062847608453838e-06, + "loss": 3.6371, + "mean_token_accuracy": 0.41403087992804677, + "step": 15843 + }, + { + "epoch": 2.9373377827215426, + "grad_norm": 8.1484375, + "learning_rate": 7.062662217278458e-06, + "loss": 3.2407, + "mean_token_accuracy": 0.4611777685836436, + "step": 15844 + }, + { + "epoch": 2.9375231738969223, + "grad_norm": 8.578125, + "learning_rate": 7.062476826103078e-06, + "loss": 3.1328, + "mean_token_accuracy": 0.48473991507430997, + "step": 15845 + }, + { + "epoch": 2.9377085650723025, + "grad_norm": 9.484375, + "learning_rate": 7.062291434927699e-06, + "loss": 2.9125, + "mean_token_accuracy": 0.47316830038212326, + "step": 15846 + }, + { + "epoch": 2.9378939562476827, + "grad_norm": 9.4609375, + "learning_rate": 7.062106043752318e-06, + "loss": 2.6308, + "mean_token_accuracy": 0.513713230069632, + "step": 15847 + }, + { + "epoch": 2.938079347423063, + "grad_norm": 10.859375, + "learning_rate": 7.061920652576938e-06, + "loss": 3.2246, + "mean_token_accuracy": 0.48979914597501184, + "step": 15848 + }, + { + "epoch": 2.9382647385984426, + "grad_norm": 7.23046875, + "learning_rate": 7.0617352614015576e-06, + "loss": 2.7237, + "mean_token_accuracy": 0.5023864721123688, + "step": 15849 + }, + { + "epoch": 2.938450129773823, + "grad_norm": 10.328125, + "learning_rate": 7.061549870226177e-06, + "loss": 4.1498, + "mean_token_accuracy": 0.4085137480588289, + "step": 15850 + }, + { + "epoch": 2.9386355209492026, + "grad_norm": 9.9453125, + "learning_rate": 7.061364479050798e-06, + "loss": 2.7981, + "mean_token_accuracy": 0.4875759978852762, + "step": 15851 + }, + { + "epoch": 2.9388209121245827, + "grad_norm": 9.015625, + "learning_rate": 7.061179087875417e-06, + "loss": 2.7035, + "mean_token_accuracy": 0.4703505799556888, + "step": 15852 + }, + { + "epoch": 2.939006303299963, + "grad_norm": 14.4765625, + "learning_rate": 7.060993696700038e-06, + "loss": 2.3924, + "mean_token_accuracy": 0.5200155965687548, + "step": 15853 + }, + { + "epoch": 2.939191694475343, + "grad_norm": 8.8828125, + "learning_rate": 7.060808305524658e-06, + "loss": 2.5691, + "mean_token_accuracy": 0.49460473844710295, + "step": 15854 + }, + { + "epoch": 2.939377085650723, + "grad_norm": 11.2734375, + "learning_rate": 7.060622914349278e-06, + "loss": 2.2658, + "mean_token_accuracy": 0.5290097988653946, + "step": 15855 + }, + { + "epoch": 2.939562476826103, + "grad_norm": 8.3125, + "learning_rate": 7.060437523173898e-06, + "loss": 2.8359, + "mean_token_accuracy": 0.5164335664335664, + "step": 15856 + }, + { + "epoch": 2.9397478680014832, + "grad_norm": 7.65234375, + "learning_rate": 7.060252131998517e-06, + "loss": 2.7946, + "mean_token_accuracy": 0.49559408754974416, + "step": 15857 + }, + { + "epoch": 2.939933259176863, + "grad_norm": 10.015625, + "learning_rate": 7.060066740823137e-06, + "loss": 2.8507, + "mean_token_accuracy": 0.4987984379693602, + "step": 15858 + }, + { + "epoch": 2.940118650352243, + "grad_norm": 8.6640625, + "learning_rate": 7.0598813496477565e-06, + "loss": 3.165, + "mean_token_accuracy": 0.4562431842966194, + "step": 15859 + }, + { + "epoch": 2.9403040415276234, + "grad_norm": 7.69921875, + "learning_rate": 7.059695958472377e-06, + "loss": 2.73, + "mean_token_accuracy": 0.5006079555323953, + "step": 15860 + }, + { + "epoch": 2.9404894327030036, + "grad_norm": 10.15625, + "learning_rate": 7.0595105672969975e-06, + "loss": 2.9164, + "mean_token_accuracy": 0.5049894163894769, + "step": 15861 + }, + { + "epoch": 2.9406748238783833, + "grad_norm": 8.625, + "learning_rate": 7.059325176121617e-06, + "loss": 3.0053, + "mean_token_accuracy": 0.48793893129770993, + "step": 15862 + }, + { + "epoch": 2.9408602150537635, + "grad_norm": 7.90234375, + "learning_rate": 7.059139784946238e-06, + "loss": 2.1494, + "mean_token_accuracy": 0.6004613018964634, + "step": 15863 + }, + { + "epoch": 2.9410456062291432, + "grad_norm": 7.61328125, + "learning_rate": 7.058954393770857e-06, + "loss": 2.721, + "mean_token_accuracy": 0.507703777335984, + "step": 15864 + }, + { + "epoch": 2.9412309974045234, + "grad_norm": 8.0234375, + "learning_rate": 7.058769002595477e-06, + "loss": 2.6721, + "mean_token_accuracy": 0.48899499243041805, + "step": 15865 + }, + { + "epoch": 2.9414163885799036, + "grad_norm": 8.1796875, + "learning_rate": 7.058583611420097e-06, + "loss": 3.2691, + "mean_token_accuracy": 0.4306041239599662, + "step": 15866 + }, + { + "epoch": 2.941601779755284, + "grad_norm": 7.875, + "learning_rate": 7.058398220244716e-06, + "loss": 2.4522, + "mean_token_accuracy": 0.5192002925758868, + "step": 15867 + }, + { + "epoch": 2.9417871709306636, + "grad_norm": 7.2109375, + "learning_rate": 7.058212829069337e-06, + "loss": 2.9984, + "mean_token_accuracy": 0.47703984819734346, + "step": 15868 + }, + { + "epoch": 2.9419725621060437, + "grad_norm": 7.9296875, + "learning_rate": 7.058027437893957e-06, + "loss": 3.3077, + "mean_token_accuracy": 0.4520570948782536, + "step": 15869 + }, + { + "epoch": 2.942157953281424, + "grad_norm": 8.7421875, + "learning_rate": 7.057842046718577e-06, + "loss": 3.0002, + "mean_token_accuracy": 0.46713540002130605, + "step": 15870 + }, + { + "epoch": 2.9423433444568037, + "grad_norm": 9.2578125, + "learning_rate": 7.0576566555431965e-06, + "loss": 3.7599, + "mean_token_accuracy": 0.43513257575757575, + "step": 15871 + }, + { + "epoch": 2.942528735632184, + "grad_norm": 7.84765625, + "learning_rate": 7.057471264367817e-06, + "loss": 2.444, + "mean_token_accuracy": 0.5206198450387404, + "step": 15872 + }, + { + "epoch": 2.942714126807564, + "grad_norm": 6.86328125, + "learning_rate": 7.057285873192437e-06, + "loss": 3.1746, + "mean_token_accuracy": 0.44261096605744127, + "step": 15873 + }, + { + "epoch": 2.9428995179829442, + "grad_norm": 7.90234375, + "learning_rate": 7.057100482017056e-06, + "loss": 3.2638, + "mean_token_accuracy": 0.4214473950942631, + "step": 15874 + }, + { + "epoch": 2.943084909158324, + "grad_norm": 8.421875, + "learning_rate": 7.056915090841676e-06, + "loss": 2.8237, + "mean_token_accuracy": 0.5019907100199071, + "step": 15875 + }, + { + "epoch": 2.943270300333704, + "grad_norm": 8.765625, + "learning_rate": 7.0567296996662956e-06, + "loss": 3.1895, + "mean_token_accuracy": 0.4807560608135872, + "step": 15876 + }, + { + "epoch": 2.943455691509084, + "grad_norm": 8.0, + "learning_rate": 7.056544308490917e-06, + "loss": 2.7218, + "mean_token_accuracy": 0.5040379438533521, + "step": 15877 + }, + { + "epoch": 2.943641082684464, + "grad_norm": 9.796875, + "learning_rate": 7.0563589173155366e-06, + "loss": 3.5964, + "mean_token_accuracy": 0.46086576300830784, + "step": 15878 + }, + { + "epoch": 2.9438264738598443, + "grad_norm": 6.8046875, + "learning_rate": 7.056173526140156e-06, + "loss": 2.2858, + "mean_token_accuracy": 0.5082347200195193, + "step": 15879 + }, + { + "epoch": 2.9440118650352245, + "grad_norm": 7.51171875, + "learning_rate": 7.055988134964777e-06, + "loss": 2.7866, + "mean_token_accuracy": 0.5028873917228104, + "step": 15880 + }, + { + "epoch": 2.9441972562106042, + "grad_norm": 9.1953125, + "learning_rate": 7.055802743789396e-06, + "loss": 3.2958, + "mean_token_accuracy": 0.44286459692147145, + "step": 15881 + }, + { + "epoch": 2.9443826473859844, + "grad_norm": 7.10546875, + "learning_rate": 7.055617352614016e-06, + "loss": 2.868, + "mean_token_accuracy": 0.4928187835643812, + "step": 15882 + }, + { + "epoch": 2.9445680385613646, + "grad_norm": 8.2265625, + "learning_rate": 7.055431961438636e-06, + "loss": 2.8836, + "mean_token_accuracy": 0.4991249562478124, + "step": 15883 + }, + { + "epoch": 2.9447534297367444, + "grad_norm": 11.84375, + "learning_rate": 7.055246570263255e-06, + "loss": 3.754, + "mean_token_accuracy": 0.4304707150556953, + "step": 15884 + }, + { + "epoch": 2.9449388209121246, + "grad_norm": 10.6015625, + "learning_rate": 7.055061179087877e-06, + "loss": 3.0254, + "mean_token_accuracy": 0.47985663082437274, + "step": 15885 + }, + { + "epoch": 2.9451242120875047, + "grad_norm": 9.8046875, + "learning_rate": 7.054875787912496e-06, + "loss": 3.5139, + "mean_token_accuracy": 0.4581447963800905, + "step": 15886 + }, + { + "epoch": 2.945309603262885, + "grad_norm": 8.5, + "learning_rate": 7.054690396737116e-06, + "loss": 3.1766, + "mean_token_accuracy": 0.48623853211009177, + "step": 15887 + }, + { + "epoch": 2.9454949944382647, + "grad_norm": 8.2734375, + "learning_rate": 7.0545050055617355e-06, + "loss": 3.1931, + "mean_token_accuracy": 0.4423548650858545, + "step": 15888 + }, + { + "epoch": 2.945680385613645, + "grad_norm": 8.6171875, + "learning_rate": 7.054319614386356e-06, + "loss": 2.6668, + "mean_token_accuracy": 0.4884244775372294, + "step": 15889 + }, + { + "epoch": 2.9458657767890246, + "grad_norm": 7.171875, + "learning_rate": 7.054134223210976e-06, + "loss": 3.1518, + "mean_token_accuracy": 0.46432889963724305, + "step": 15890 + }, + { + "epoch": 2.946051167964405, + "grad_norm": 7.859375, + "learning_rate": 7.053948832035595e-06, + "loss": 2.8087, + "mean_token_accuracy": 0.4903186768858411, + "step": 15891 + }, + { + "epoch": 2.946236559139785, + "grad_norm": 7.75390625, + "learning_rate": 7.053763440860215e-06, + "loss": 3.668, + "mean_token_accuracy": 0.430779392338177, + "step": 15892 + }, + { + "epoch": 2.946421950315165, + "grad_norm": 7.83203125, + "learning_rate": 7.053578049684836e-06, + "loss": 3.0425, + "mean_token_accuracy": 0.5017862459065198, + "step": 15893 + }, + { + "epoch": 2.946607341490545, + "grad_norm": 8.34375, + "learning_rate": 7.053392658509456e-06, + "loss": 2.96, + "mean_token_accuracy": 0.48180636777128005, + "step": 15894 + }, + { + "epoch": 2.946792732665925, + "grad_norm": 7.87890625, + "learning_rate": 7.053207267334076e-06, + "loss": 3.4177, + "mean_token_accuracy": 0.4449760765550239, + "step": 15895 + }, + { + "epoch": 2.9469781238413053, + "grad_norm": 8.7265625, + "learning_rate": 7.053021876158695e-06, + "loss": 3.4909, + "mean_token_accuracy": 0.4472532814778804, + "step": 15896 + }, + { + "epoch": 2.947163515016685, + "grad_norm": 9.421875, + "learning_rate": 7.052836484983316e-06, + "loss": 3.6805, + "mean_token_accuracy": 0.4512670565302144, + "step": 15897 + }, + { + "epoch": 2.9473489061920652, + "grad_norm": 14.3046875, + "learning_rate": 7.052651093807935e-06, + "loss": 3.5733, + "mean_token_accuracy": 0.4441244239631336, + "step": 15898 + }, + { + "epoch": 2.9475342973674454, + "grad_norm": 13.2890625, + "learning_rate": 7.052465702632555e-06, + "loss": 2.3244, + "mean_token_accuracy": 0.5438373570520966, + "step": 15899 + }, + { + "epoch": 2.9477196885428256, + "grad_norm": 8.2109375, + "learning_rate": 7.052280311457175e-06, + "loss": 2.7091, + "mean_token_accuracy": 0.5077808901338313, + "step": 15900 + }, + { + "epoch": 2.9479050797182054, + "grad_norm": 7.94140625, + "learning_rate": 7.052094920281796e-06, + "loss": 2.9722, + "mean_token_accuracy": 0.4889607650119533, + "step": 15901 + }, + { + "epoch": 2.9480904708935856, + "grad_norm": 7.6015625, + "learning_rate": 7.051909529106416e-06, + "loss": 2.5472, + "mean_token_accuracy": 0.5253927658019729, + "step": 15902 + }, + { + "epoch": 2.9482758620689653, + "grad_norm": 9.1875, + "learning_rate": 7.051724137931035e-06, + "loss": 2.5659, + "mean_token_accuracy": 0.5105979473449352, + "step": 15903 + }, + { + "epoch": 2.9484612532443455, + "grad_norm": 7.0234375, + "learning_rate": 7.051538746755655e-06, + "loss": 2.8341, + "mean_token_accuracy": 0.4900739587388089, + "step": 15904 + }, + { + "epoch": 2.9486466444197257, + "grad_norm": 7.5234375, + "learning_rate": 7.0513533555802746e-06, + "loss": 3.3262, + "mean_token_accuracy": 0.44781718963165074, + "step": 15905 + }, + { + "epoch": 2.948832035595106, + "grad_norm": 6.35546875, + "learning_rate": 7.051167964404895e-06, + "loss": 2.6839, + "mean_token_accuracy": 0.5138317329675355, + "step": 15906 + }, + { + "epoch": 2.9490174267704856, + "grad_norm": 8.515625, + "learning_rate": 7.050982573229515e-06, + "loss": 3.434, + "mean_token_accuracy": 0.4387510008006405, + "step": 15907 + }, + { + "epoch": 2.949202817945866, + "grad_norm": 7.4140625, + "learning_rate": 7.050797182054134e-06, + "loss": 3.2484, + "mean_token_accuracy": 0.47089724732086574, + "step": 15908 + }, + { + "epoch": 2.9493882091212456, + "grad_norm": 8.1328125, + "learning_rate": 7.050611790878755e-06, + "loss": 4.0605, + "mean_token_accuracy": 0.4290044864799321, + "step": 15909 + }, + { + "epoch": 2.9495736002966257, + "grad_norm": 7.4140625, + "learning_rate": 7.050426399703375e-06, + "loss": 2.8843, + "mean_token_accuracy": 0.4799949193445954, + "step": 15910 + }, + { + "epoch": 2.949758991472006, + "grad_norm": 8.484375, + "learning_rate": 7.050241008527995e-06, + "loss": 2.939, + "mean_token_accuracy": 0.4675903018307768, + "step": 15911 + }, + { + "epoch": 2.949944382647386, + "grad_norm": 7.98046875, + "learning_rate": 7.050055617352615e-06, + "loss": 3.5833, + "mean_token_accuracy": 0.44323971260613976, + "step": 15912 + }, + { + "epoch": 2.9501297738227663, + "grad_norm": 7.421875, + "learning_rate": 7.049870226177234e-06, + "loss": 2.9019, + "mean_token_accuracy": 0.4617356749134726, + "step": 15913 + }, + { + "epoch": 2.950315164998146, + "grad_norm": 7.1015625, + "learning_rate": 7.049684835001854e-06, + "loss": 3.5659, + "mean_token_accuracy": 0.42948324536639254, + "step": 15914 + }, + { + "epoch": 2.9505005561735262, + "grad_norm": 10.75, + "learning_rate": 7.049499443826474e-06, + "loss": 2.5778, + "mean_token_accuracy": 0.4866270430906389, + "step": 15915 + }, + { + "epoch": 2.950685947348906, + "grad_norm": 8.2734375, + "learning_rate": 7.049314052651094e-06, + "loss": 3.5153, + "mean_token_accuracy": 0.4488924472911663, + "step": 15916 + }, + { + "epoch": 2.950871338524286, + "grad_norm": 10.125, + "learning_rate": 7.0491286614757145e-06, + "loss": 3.4706, + "mean_token_accuracy": 0.45141514253601717, + "step": 15917 + }, + { + "epoch": 2.9510567296996664, + "grad_norm": 7.8515625, + "learning_rate": 7.048943270300335e-06, + "loss": 2.5834, + "mean_token_accuracy": 0.5095163806552262, + "step": 15918 + }, + { + "epoch": 2.9512421208750466, + "grad_norm": 7.68359375, + "learning_rate": 7.048757879124955e-06, + "loss": 3.3423, + "mean_token_accuracy": 0.45326460481099656, + "step": 15919 + }, + { + "epoch": 2.9514275120504263, + "grad_norm": 11.2890625, + "learning_rate": 7.048572487949574e-06, + "loss": 3.1328, + "mean_token_accuracy": 0.4556204974853881, + "step": 15920 + }, + { + "epoch": 2.9516129032258065, + "grad_norm": 12.3125, + "learning_rate": 7.048387096774194e-06, + "loss": 2.4693, + "mean_token_accuracy": 0.5223765432098766, + "step": 15921 + }, + { + "epoch": 2.9517982944011862, + "grad_norm": 7.76953125, + "learning_rate": 7.048201705598814e-06, + "loss": 2.8882, + "mean_token_accuracy": 0.5012445181936708, + "step": 15922 + }, + { + "epoch": 2.9519836855765664, + "grad_norm": 13.0546875, + "learning_rate": 7.048016314423434e-06, + "loss": 2.9771, + "mean_token_accuracy": 0.46348675752121365, + "step": 15923 + }, + { + "epoch": 2.9521690767519466, + "grad_norm": 11.1171875, + "learning_rate": 7.047830923248054e-06, + "loss": 2.6287, + "mean_token_accuracy": 0.5175762693972342, + "step": 15924 + }, + { + "epoch": 2.952354467927327, + "grad_norm": 8.7265625, + "learning_rate": 7.047645532072674e-06, + "loss": 3.4737, + "mean_token_accuracy": 0.4410942956926659, + "step": 15925 + }, + { + "epoch": 2.9525398591027066, + "grad_norm": 14.3125, + "learning_rate": 7.047460140897294e-06, + "loss": 3.2648, + "mean_token_accuracy": 0.43913376353494477, + "step": 15926 + }, + { + "epoch": 2.9527252502780867, + "grad_norm": 10.828125, + "learning_rate": 7.047274749721914e-06, + "loss": 3.4699, + "mean_token_accuracy": 0.4301675977653631, + "step": 15927 + }, + { + "epoch": 2.952910641453467, + "grad_norm": 10.7265625, + "learning_rate": 7.047089358546534e-06, + "loss": 3.2113, + "mean_token_accuracy": 0.47706422018348627, + "step": 15928 + }, + { + "epoch": 2.9530960326288467, + "grad_norm": 6.6953125, + "learning_rate": 7.046903967371154e-06, + "loss": 2.7962, + "mean_token_accuracy": 0.5187223352075607, + "step": 15929 + }, + { + "epoch": 2.953281423804227, + "grad_norm": 10.984375, + "learning_rate": 7.046718576195773e-06, + "loss": 2.9269, + "mean_token_accuracy": 0.45759463344513657, + "step": 15930 + }, + { + "epoch": 2.953466814979607, + "grad_norm": 8.4296875, + "learning_rate": 7.046533185020393e-06, + "loss": 3.4795, + "mean_token_accuracy": 0.43736001194564733, + "step": 15931 + }, + { + "epoch": 2.9536522061549872, + "grad_norm": 7.35546875, + "learning_rate": 7.0463477938450134e-06, + "loss": 2.5584, + "mean_token_accuracy": 0.49147588894301025, + "step": 15932 + }, + { + "epoch": 2.953837597330367, + "grad_norm": 8.3203125, + "learning_rate": 7.046162402669634e-06, + "loss": 2.9198, + "mean_token_accuracy": 0.45978517160073357, + "step": 15933 + }, + { + "epoch": 2.954022988505747, + "grad_norm": 7.67578125, + "learning_rate": 7.0459770114942536e-06, + "loss": 3.1011, + "mean_token_accuracy": 0.4479772521649218, + "step": 15934 + }, + { + "epoch": 2.954208379681127, + "grad_norm": 10.6484375, + "learning_rate": 7.045791620318874e-06, + "loss": 2.8662, + "mean_token_accuracy": 0.47896498688138966, + "step": 15935 + }, + { + "epoch": 2.954393770856507, + "grad_norm": 9.125, + "learning_rate": 7.045606229143494e-06, + "loss": 2.7003, + "mean_token_accuracy": 0.47024734982332156, + "step": 15936 + }, + { + "epoch": 2.9545791620318873, + "grad_norm": 8.53125, + "learning_rate": 7.045420837968113e-06, + "loss": 2.7965, + "mean_token_accuracy": 0.4832859656245771, + "step": 15937 + }, + { + "epoch": 2.9547645532072675, + "grad_norm": 9.734375, + "learning_rate": 7.045235446792733e-06, + "loss": 2.9187, + "mean_token_accuracy": 0.47946319642130947, + "step": 15938 + }, + { + "epoch": 2.9549499443826472, + "grad_norm": 7.44921875, + "learning_rate": 7.045050055617353e-06, + "loss": 3.5564, + "mean_token_accuracy": 0.4570008643042351, + "step": 15939 + }, + { + "epoch": 2.9551353355580274, + "grad_norm": 8.2421875, + "learning_rate": 7.044864664441972e-06, + "loss": 2.6324, + "mean_token_accuracy": 0.5231325601695972, + "step": 15940 + }, + { + "epoch": 2.9553207267334076, + "grad_norm": 9.328125, + "learning_rate": 7.044679273266594e-06, + "loss": 3.3131, + "mean_token_accuracy": 0.4732387923147301, + "step": 15941 + }, + { + "epoch": 2.9555061179087874, + "grad_norm": 9.953125, + "learning_rate": 7.044493882091213e-06, + "loss": 3.0333, + "mean_token_accuracy": 0.49283449587824985, + "step": 15942 + }, + { + "epoch": 2.9556915090841676, + "grad_norm": 9.8203125, + "learning_rate": 7.044308490915833e-06, + "loss": 3.4776, + "mean_token_accuracy": 0.43864057127743983, + "step": 15943 + }, + { + "epoch": 2.9558769002595477, + "grad_norm": 8.8125, + "learning_rate": 7.044123099740453e-06, + "loss": 2.5535, + "mean_token_accuracy": 0.5170561765132821, + "step": 15944 + }, + { + "epoch": 2.956062291434928, + "grad_norm": 8.0078125, + "learning_rate": 7.043937708565073e-06, + "loss": 3.1252, + "mean_token_accuracy": 0.472607349575986, + "step": 15945 + }, + { + "epoch": 2.9562476826103077, + "grad_norm": 10.5546875, + "learning_rate": 7.043752317389693e-06, + "loss": 3.488, + "mean_token_accuracy": 0.4619138922781942, + "step": 15946 + }, + { + "epoch": 2.956433073785688, + "grad_norm": 8.4375, + "learning_rate": 7.043566926214312e-06, + "loss": 2.9627, + "mean_token_accuracy": 0.46504001049455596, + "step": 15947 + }, + { + "epoch": 2.9566184649610676, + "grad_norm": 8.203125, + "learning_rate": 7.043381535038932e-06, + "loss": 2.8816, + "mean_token_accuracy": 0.5276369481884506, + "step": 15948 + }, + { + "epoch": 2.956803856136448, + "grad_norm": 8.8984375, + "learning_rate": 7.043196143863553e-06, + "loss": 3.1179, + "mean_token_accuracy": 0.43981431501673324, + "step": 15949 + }, + { + "epoch": 2.956989247311828, + "grad_norm": 9.6953125, + "learning_rate": 7.043010752688173e-06, + "loss": 2.6075, + "mean_token_accuracy": 0.4959016393442623, + "step": 15950 + }, + { + "epoch": 2.957174638487208, + "grad_norm": 8.1875, + "learning_rate": 7.042825361512793e-06, + "loss": 2.9207, + "mean_token_accuracy": 0.4968211767425731, + "step": 15951 + }, + { + "epoch": 2.957360029662588, + "grad_norm": 7.27734375, + "learning_rate": 7.042639970337412e-06, + "loss": 3.4572, + "mean_token_accuracy": 0.4350783432527633, + "step": 15952 + }, + { + "epoch": 2.957545420837968, + "grad_norm": 8.7421875, + "learning_rate": 7.042454579162033e-06, + "loss": 3.4821, + "mean_token_accuracy": 0.44428301612668897, + "step": 15953 + }, + { + "epoch": 2.9577308120133483, + "grad_norm": 8.8046875, + "learning_rate": 7.042269187986652e-06, + "loss": 3.2571, + "mean_token_accuracy": 0.4592705167173252, + "step": 15954 + }, + { + "epoch": 2.957916203188728, + "grad_norm": 11.2734375, + "learning_rate": 7.042083796811272e-06, + "loss": 3.7642, + "mean_token_accuracy": 0.43663609352796795, + "step": 15955 + }, + { + "epoch": 2.9581015943641082, + "grad_norm": 9.1328125, + "learning_rate": 7.041898405635892e-06, + "loss": 2.7363, + "mean_token_accuracy": 0.5093948000873935, + "step": 15956 + }, + { + "epoch": 2.9582869855394884, + "grad_norm": 12.09375, + "learning_rate": 7.041713014460513e-06, + "loss": 2.2971, + "mean_token_accuracy": 0.5192258721670486, + "step": 15957 + }, + { + "epoch": 2.9584723767148686, + "grad_norm": 6.8984375, + "learning_rate": 7.041527623285133e-06, + "loss": 2.7576, + "mean_token_accuracy": 0.5030736240171552, + "step": 15958 + }, + { + "epoch": 2.9586577678902484, + "grad_norm": 8.46875, + "learning_rate": 7.041342232109752e-06, + "loss": 2.5126, + "mean_token_accuracy": 0.5742867084203201, + "step": 15959 + }, + { + "epoch": 2.9588431590656286, + "grad_norm": 9.890625, + "learning_rate": 7.041156840934372e-06, + "loss": 3.125, + "mean_token_accuracy": 0.49262536873156343, + "step": 15960 + }, + { + "epoch": 2.9590285502410083, + "grad_norm": 7.26953125, + "learning_rate": 7.0409714497589924e-06, + "loss": 2.881, + "mean_token_accuracy": 0.4847213900539245, + "step": 15961 + }, + { + "epoch": 2.9592139414163885, + "grad_norm": 9.1328125, + "learning_rate": 7.040786058583612e-06, + "loss": 3.8667, + "mean_token_accuracy": 0.4388256777637309, + "step": 15962 + }, + { + "epoch": 2.9593993325917687, + "grad_norm": 12.109375, + "learning_rate": 7.040600667408232e-06, + "loss": 3.0998, + "mean_token_accuracy": 0.46522265572447197, + "step": 15963 + }, + { + "epoch": 2.959584723767149, + "grad_norm": 9.96875, + "learning_rate": 7.040415276232851e-06, + "loss": 2.9649, + "mean_token_accuracy": 0.47561106923656726, + "step": 15964 + }, + { + "epoch": 2.9597701149425286, + "grad_norm": 12.71875, + "learning_rate": 7.040229885057471e-06, + "loss": 3.1712, + "mean_token_accuracy": 0.44451434323100153, + "step": 15965 + }, + { + "epoch": 2.959955506117909, + "grad_norm": 11.9453125, + "learning_rate": 7.040044493882092e-06, + "loss": 3.1579, + "mean_token_accuracy": 0.44703165592225536, + "step": 15966 + }, + { + "epoch": 2.960140897293289, + "grad_norm": 11.96875, + "learning_rate": 7.039859102706712e-06, + "loss": 2.86, + "mean_token_accuracy": 0.47367840194239047, + "step": 15967 + }, + { + "epoch": 2.9603262884686687, + "grad_norm": 7.578125, + "learning_rate": 7.039673711531332e-06, + "loss": 3.3039, + "mean_token_accuracy": 0.4448326383410346, + "step": 15968 + }, + { + "epoch": 2.960511679644049, + "grad_norm": 8.625, + "learning_rate": 7.039488320355951e-06, + "loss": 2.9844, + "mean_token_accuracy": 0.4704473850031506, + "step": 15969 + }, + { + "epoch": 2.960697070819429, + "grad_norm": 13.1484375, + "learning_rate": 7.039302929180572e-06, + "loss": 2.9881, + "mean_token_accuracy": 0.47422940879231934, + "step": 15970 + }, + { + "epoch": 2.9608824619948093, + "grad_norm": 9.390625, + "learning_rate": 7.0391175380051914e-06, + "loss": 3.1128, + "mean_token_accuracy": 0.46662584200857316, + "step": 15971 + }, + { + "epoch": 2.961067853170189, + "grad_norm": 8.625, + "learning_rate": 7.038932146829811e-06, + "loss": 2.8655, + "mean_token_accuracy": 0.48585209003215435, + "step": 15972 + }, + { + "epoch": 2.9612532443455692, + "grad_norm": 10.234375, + "learning_rate": 7.038746755654431e-06, + "loss": 4.0741, + "mean_token_accuracy": 0.41977800201816345, + "step": 15973 + }, + { + "epoch": 2.961438635520949, + "grad_norm": 12.8125, + "learning_rate": 7.038561364479052e-06, + "loss": 2.6558, + "mean_token_accuracy": 0.4894084215964867, + "step": 15974 + }, + { + "epoch": 2.961624026696329, + "grad_norm": 8.4296875, + "learning_rate": 7.038375973303672e-06, + "loss": 2.656, + "mean_token_accuracy": 0.5058947368421053, + "step": 15975 + }, + { + "epoch": 2.9618094178717094, + "grad_norm": 7.4453125, + "learning_rate": 7.038190582128291e-06, + "loss": 2.7776, + "mean_token_accuracy": 0.47499155119972963, + "step": 15976 + }, + { + "epoch": 2.9619948090470896, + "grad_norm": 10.703125, + "learning_rate": 7.038005190952911e-06, + "loss": 3.1101, + "mean_token_accuracy": 0.4675461125379407, + "step": 15977 + }, + { + "epoch": 2.9621802002224693, + "grad_norm": 10.2421875, + "learning_rate": 7.037819799777531e-06, + "loss": 3.0675, + "mean_token_accuracy": 0.46943353897924844, + "step": 15978 + }, + { + "epoch": 2.9623655913978495, + "grad_norm": 8.828125, + "learning_rate": 7.037634408602151e-06, + "loss": 3.3477, + "mean_token_accuracy": 0.443212016175621, + "step": 15979 + }, + { + "epoch": 2.9625509825732292, + "grad_norm": 10.625, + "learning_rate": 7.037449017426771e-06, + "loss": 3.7612, + "mean_token_accuracy": 0.4231354642313546, + "step": 15980 + }, + { + "epoch": 2.9627363737486094, + "grad_norm": 7.42578125, + "learning_rate": 7.03726362625139e-06, + "loss": 2.4887, + "mean_token_accuracy": 0.525781910397295, + "step": 15981 + }, + { + "epoch": 2.9629217649239896, + "grad_norm": 7.6640625, + "learning_rate": 7.037078235076012e-06, + "loss": 2.5001, + "mean_token_accuracy": 0.5222390700025271, + "step": 15982 + }, + { + "epoch": 2.96310715609937, + "grad_norm": 9.1015625, + "learning_rate": 7.036892843900631e-06, + "loss": 3.262, + "mean_token_accuracy": 0.44072321320228125, + "step": 15983 + }, + { + "epoch": 2.9632925472747496, + "grad_norm": 8.3125, + "learning_rate": 7.036707452725251e-06, + "loss": 2.8988, + "mean_token_accuracy": 0.4717436250861475, + "step": 15984 + }, + { + "epoch": 2.9634779384501297, + "grad_norm": 10.9140625, + "learning_rate": 7.036522061549871e-06, + "loss": 2.8953, + "mean_token_accuracy": 0.48127544097693353, + "step": 15985 + }, + { + "epoch": 2.96366332962551, + "grad_norm": 8.6875, + "learning_rate": 7.03633667037449e-06, + "loss": 2.9986, + "mean_token_accuracy": 0.5012654223347042, + "step": 15986 + }, + { + "epoch": 2.9638487208008897, + "grad_norm": 9.0546875, + "learning_rate": 7.036151279199111e-06, + "loss": 3.3258, + "mean_token_accuracy": 0.45209257704760764, + "step": 15987 + }, + { + "epoch": 2.96403411197627, + "grad_norm": 9.1484375, + "learning_rate": 7.0359658880237305e-06, + "loss": 2.5993, + "mean_token_accuracy": 0.4924396090724691, + "step": 15988 + }, + { + "epoch": 2.96421950315165, + "grad_norm": 7.6171875, + "learning_rate": 7.03578049684835e-06, + "loss": 2.9731, + "mean_token_accuracy": 0.46687299403943144, + "step": 15989 + }, + { + "epoch": 2.9644048943270302, + "grad_norm": 8.2265625, + "learning_rate": 7.035595105672971e-06, + "loss": 3.6356, + "mean_token_accuracy": 0.430504884076733, + "step": 15990 + }, + { + "epoch": 2.96459028550241, + "grad_norm": 10.5625, + "learning_rate": 7.035409714497591e-06, + "loss": 2.7833, + "mean_token_accuracy": 0.48601190476190476, + "step": 15991 + }, + { + "epoch": 2.96477567667779, + "grad_norm": 9.265625, + "learning_rate": 7.035224323322211e-06, + "loss": 2.7849, + "mean_token_accuracy": 0.48442224267122386, + "step": 15992 + }, + { + "epoch": 2.96496106785317, + "grad_norm": 7.3125, + "learning_rate": 7.03503893214683e-06, + "loss": 2.9016, + "mean_token_accuracy": 0.46602518412924687, + "step": 15993 + }, + { + "epoch": 2.96514645902855, + "grad_norm": 8.1328125, + "learning_rate": 7.03485354097145e-06, + "loss": 3.4631, + "mean_token_accuracy": 0.45498185224212623, + "step": 15994 + }, + { + "epoch": 2.9653318502039303, + "grad_norm": 9.0859375, + "learning_rate": 7.03466814979607e-06, + "loss": 2.3814, + "mean_token_accuracy": 0.5729805854541796, + "step": 15995 + }, + { + "epoch": 2.9655172413793105, + "grad_norm": 10.7578125, + "learning_rate": 7.03448275862069e-06, + "loss": 3.2046, + "mean_token_accuracy": 0.45206151832460734, + "step": 15996 + }, + { + "epoch": 2.9657026325546902, + "grad_norm": 8.9140625, + "learning_rate": 7.03429736744531e-06, + "loss": 2.9143, + "mean_token_accuracy": 0.4868483412322275, + "step": 15997 + }, + { + "epoch": 2.9658880237300704, + "grad_norm": 9.6640625, + "learning_rate": 7.03411197626993e-06, + "loss": 3.0945, + "mean_token_accuracy": 0.4782448377581121, + "step": 15998 + }, + { + "epoch": 2.9660734149054506, + "grad_norm": 9.7421875, + "learning_rate": 7.033926585094551e-06, + "loss": 2.4094, + "mean_token_accuracy": 0.5456736286435621, + "step": 15999 + }, + { + "epoch": 2.9662588060808304, + "grad_norm": 10.5234375, + "learning_rate": 7.0337411939191704e-06, + "loss": 3.2685, + "mean_token_accuracy": 0.46460071513706797, + "step": 16000 + }, + { + "epoch": 2.9664441972562106, + "grad_norm": 7.34375, + "learning_rate": 7.03355580274379e-06, + "loss": 2.6325, + "mean_token_accuracy": 0.5374576097009557, + "step": 16001 + }, + { + "epoch": 2.9666295884315907, + "grad_norm": 11.1171875, + "learning_rate": 7.03337041156841e-06, + "loss": 2.57, + "mean_token_accuracy": 0.5259748197041927, + "step": 16002 + }, + { + "epoch": 2.966814979606971, + "grad_norm": 9.703125, + "learning_rate": 7.033185020393029e-06, + "loss": 3.2965, + "mean_token_accuracy": 0.462882096069869, + "step": 16003 + }, + { + "epoch": 2.9670003707823507, + "grad_norm": 8.8828125, + "learning_rate": 7.03299962921765e-06, + "loss": 2.7615, + "mean_token_accuracy": 0.4979187071498531, + "step": 16004 + }, + { + "epoch": 2.967185761957731, + "grad_norm": 9.265625, + "learning_rate": 7.0328142380422695e-06, + "loss": 2.9219, + "mean_token_accuracy": 0.47659334461364916, + "step": 16005 + }, + { + "epoch": 2.9673711531331106, + "grad_norm": 14.2265625, + "learning_rate": 7.03262884686689e-06, + "loss": 2.4375, + "mean_token_accuracy": 0.5098587610124458, + "step": 16006 + }, + { + "epoch": 2.967556544308491, + "grad_norm": 15.375, + "learning_rate": 7.03244345569151e-06, + "loss": 2.981, + "mean_token_accuracy": 0.4940902280672549, + "step": 16007 + }, + { + "epoch": 2.967741935483871, + "grad_norm": 9.890625, + "learning_rate": 7.03225806451613e-06, + "loss": 3.5057, + "mean_token_accuracy": 0.4452264381884945, + "step": 16008 + }, + { + "epoch": 2.967927326659251, + "grad_norm": 7.5703125, + "learning_rate": 7.03207267334075e-06, + "loss": 2.325, + "mean_token_accuracy": 0.5525699324789858, + "step": 16009 + }, + { + "epoch": 2.968112717834631, + "grad_norm": 10.046875, + "learning_rate": 7.031887282165369e-06, + "loss": 2.5036, + "mean_token_accuracy": 0.5273972602739726, + "step": 16010 + }, + { + "epoch": 2.968298109010011, + "grad_norm": 13.3671875, + "learning_rate": 7.031701890989989e-06, + "loss": 2.7802, + "mean_token_accuracy": 0.4963719099741729, + "step": 16011 + }, + { + "epoch": 2.9684835001853913, + "grad_norm": 8.7578125, + "learning_rate": 7.031516499814609e-06, + "loss": 4.1978, + "mean_token_accuracy": 0.4329847337420363, + "step": 16012 + }, + { + "epoch": 2.968668891360771, + "grad_norm": 8.046875, + "learning_rate": 7.031331108639229e-06, + "loss": 2.6592, + "mean_token_accuracy": 0.523026851098454, + "step": 16013 + }, + { + "epoch": 2.9688542825361512, + "grad_norm": 8.7890625, + "learning_rate": 7.03114571746385e-06, + "loss": 2.4782, + "mean_token_accuracy": 0.5320798047940289, + "step": 16014 + }, + { + "epoch": 2.9690396737115314, + "grad_norm": 6.7109375, + "learning_rate": 7.030960326288469e-06, + "loss": 2.9751, + "mean_token_accuracy": 0.48104300914131576, + "step": 16015 + }, + { + "epoch": 2.9692250648869116, + "grad_norm": 10.2734375, + "learning_rate": 7.03077493511309e-06, + "loss": 3.2134, + "mean_token_accuracy": 0.4644207066557108, + "step": 16016 + }, + { + "epoch": 2.9694104560622914, + "grad_norm": 7.9140625, + "learning_rate": 7.0305895439377095e-06, + "loss": 3.2282, + "mean_token_accuracy": 0.45654853620955316, + "step": 16017 + }, + { + "epoch": 2.9695958472376716, + "grad_norm": 6.90234375, + "learning_rate": 7.030404152762329e-06, + "loss": 2.8448, + "mean_token_accuracy": 0.500245941957698, + "step": 16018 + }, + { + "epoch": 2.9697812384130513, + "grad_norm": 7.83203125, + "learning_rate": 7.030218761586949e-06, + "loss": 2.1174, + "mean_token_accuracy": 0.5502103946320938, + "step": 16019 + }, + { + "epoch": 2.9699666295884315, + "grad_norm": 8.6640625, + "learning_rate": 7.030033370411568e-06, + "loss": 3.7031, + "mean_token_accuracy": 0.42154901400400113, + "step": 16020 + }, + { + "epoch": 2.9701520207638117, + "grad_norm": 7.4296875, + "learning_rate": 7.029847979236188e-06, + "loss": 3.3452, + "mean_token_accuracy": 0.4250132485426603, + "step": 16021 + }, + { + "epoch": 2.970337411939192, + "grad_norm": 9.1875, + "learning_rate": 7.029662588060809e-06, + "loss": 3.2503, + "mean_token_accuracy": 0.44337160751565763, + "step": 16022 + }, + { + "epoch": 2.9705228031145716, + "grad_norm": 9.921875, + "learning_rate": 7.029477196885429e-06, + "loss": 4.0854, + "mean_token_accuracy": 0.43558636626227615, + "step": 16023 + }, + { + "epoch": 2.970708194289952, + "grad_norm": 9.2109375, + "learning_rate": 7.029291805710049e-06, + "loss": 2.7537, + "mean_token_accuracy": 0.4854524844133762, + "step": 16024 + }, + { + "epoch": 2.970893585465332, + "grad_norm": 8.375, + "learning_rate": 7.029106414534669e-06, + "loss": 2.9821, + "mean_token_accuracy": 0.46059799055804385, + "step": 16025 + }, + { + "epoch": 2.9710789766407117, + "grad_norm": 13.171875, + "learning_rate": 7.028921023359289e-06, + "loss": 2.8012, + "mean_token_accuracy": 0.46776532630191164, + "step": 16026 + }, + { + "epoch": 2.971264367816092, + "grad_norm": 11.9375, + "learning_rate": 7.0287356321839084e-06, + "loss": 3.0087, + "mean_token_accuracy": 0.46303849095080296, + "step": 16027 + }, + { + "epoch": 2.971449758991472, + "grad_norm": 8.9765625, + "learning_rate": 7.028550241008528e-06, + "loss": 2.7989, + "mean_token_accuracy": 0.4931907852870052, + "step": 16028 + }, + { + "epoch": 2.9716351501668523, + "grad_norm": 8.5, + "learning_rate": 7.028364849833148e-06, + "loss": 2.8601, + "mean_token_accuracy": 0.49517241379310345, + "step": 16029 + }, + { + "epoch": 2.971820541342232, + "grad_norm": 11.4453125, + "learning_rate": 7.028179458657769e-06, + "loss": 3.0025, + "mean_token_accuracy": 0.4830301095861262, + "step": 16030 + }, + { + "epoch": 2.9720059325176122, + "grad_norm": 9.9609375, + "learning_rate": 7.027994067482389e-06, + "loss": 2.751, + "mean_token_accuracy": 0.47950068712780575, + "step": 16031 + }, + { + "epoch": 2.972191323692992, + "grad_norm": 7.42578125, + "learning_rate": 7.027808676307008e-06, + "loss": 3.3981, + "mean_token_accuracy": 0.4720408405556215, + "step": 16032 + }, + { + "epoch": 2.972376714868372, + "grad_norm": 8.0234375, + "learning_rate": 7.027623285131628e-06, + "loss": 3.1487, + "mean_token_accuracy": 0.4673048091338946, + "step": 16033 + }, + { + "epoch": 2.9725621060437524, + "grad_norm": 9.234375, + "learning_rate": 7.0274378939562485e-06, + "loss": 2.7589, + "mean_token_accuracy": 0.5017758046614872, + "step": 16034 + }, + { + "epoch": 2.9727474972191326, + "grad_norm": 9.421875, + "learning_rate": 7.027252502780868e-06, + "loss": 3.4721, + "mean_token_accuracy": 0.42004200420042004, + "step": 16035 + }, + { + "epoch": 2.9729328883945123, + "grad_norm": 10.4453125, + "learning_rate": 7.027067111605488e-06, + "loss": 3.3667, + "mean_token_accuracy": 0.475139146567718, + "step": 16036 + }, + { + "epoch": 2.9731182795698925, + "grad_norm": 7.30078125, + "learning_rate": 7.0268817204301074e-06, + "loss": 2.9765, + "mean_token_accuracy": 0.47670563601465715, + "step": 16037 + }, + { + "epoch": 2.9733036707452727, + "grad_norm": 8.3984375, + "learning_rate": 7.026696329254729e-06, + "loss": 2.813, + "mean_token_accuracy": 0.5112575507962658, + "step": 16038 + }, + { + "epoch": 2.9734890619206524, + "grad_norm": 13.109375, + "learning_rate": 7.026510938079348e-06, + "loss": 2.6759, + "mean_token_accuracy": 0.5167730322369498, + "step": 16039 + }, + { + "epoch": 2.9736744530960326, + "grad_norm": 8.3046875, + "learning_rate": 7.026325546903968e-06, + "loss": 2.5048, + "mean_token_accuracy": 0.5065089933419458, + "step": 16040 + }, + { + "epoch": 2.973859844271413, + "grad_norm": 13.75, + "learning_rate": 7.026140155728588e-06, + "loss": 3.1133, + "mean_token_accuracy": 0.4612871146469669, + "step": 16041 + }, + { + "epoch": 2.974045235446793, + "grad_norm": 9.7734375, + "learning_rate": 7.025954764553208e-06, + "loss": 3.5767, + "mean_token_accuracy": 0.4438731790916881, + "step": 16042 + }, + { + "epoch": 2.9742306266221727, + "grad_norm": 13.0625, + "learning_rate": 7.025769373377828e-06, + "loss": 3.3915, + "mean_token_accuracy": 0.43561134356113435, + "step": 16043 + }, + { + "epoch": 2.974416017797553, + "grad_norm": 9.796875, + "learning_rate": 7.0255839822024475e-06, + "loss": 2.8212, + "mean_token_accuracy": 0.48490329762070405, + "step": 16044 + }, + { + "epoch": 2.9746014089729327, + "grad_norm": 8.3984375, + "learning_rate": 7.025398591027067e-06, + "loss": 3.1151, + "mean_token_accuracy": 0.46595946801773275, + "step": 16045 + }, + { + "epoch": 2.974786800148313, + "grad_norm": 13.484375, + "learning_rate": 7.0252131998516885e-06, + "loss": 3.0944, + "mean_token_accuracy": 0.46706963858456113, + "step": 16046 + }, + { + "epoch": 2.974972191323693, + "grad_norm": 9.8515625, + "learning_rate": 7.025027808676308e-06, + "loss": 2.24, + "mean_token_accuracy": 0.5503073070794445, + "step": 16047 + }, + { + "epoch": 2.9751575824990732, + "grad_norm": 11.9609375, + "learning_rate": 7.024842417500928e-06, + "loss": 2.5877, + "mean_token_accuracy": 0.4979764209044519, + "step": 16048 + }, + { + "epoch": 2.975342973674453, + "grad_norm": 10.2421875, + "learning_rate": 7.024657026325547e-06, + "loss": 3.1334, + "mean_token_accuracy": 0.4633265628526292, + "step": 16049 + }, + { + "epoch": 2.975528364849833, + "grad_norm": 12.5390625, + "learning_rate": 7.024471635150167e-06, + "loss": 2.5981, + "mean_token_accuracy": 0.5037894446741078, + "step": 16050 + }, + { + "epoch": 2.975713756025213, + "grad_norm": 9.9453125, + "learning_rate": 7.0242862439747875e-06, + "loss": 2.7976, + "mean_token_accuracy": 0.493721524613846, + "step": 16051 + }, + { + "epoch": 2.975899147200593, + "grad_norm": 9.1640625, + "learning_rate": 7.024100852799407e-06, + "loss": 2.9564, + "mean_token_accuracy": 0.47714681440443213, + "step": 16052 + }, + { + "epoch": 2.9760845383759733, + "grad_norm": 10.6796875, + "learning_rate": 7.023915461624027e-06, + "loss": 3.7299, + "mean_token_accuracy": 0.44403393541324576, + "step": 16053 + }, + { + "epoch": 2.9762699295513535, + "grad_norm": 10.3125, + "learning_rate": 7.023730070448648e-06, + "loss": 2.6719, + "mean_token_accuracy": 0.4824371791407728, + "step": 16054 + }, + { + "epoch": 2.9764553207267332, + "grad_norm": 8.8984375, + "learning_rate": 7.023544679273268e-06, + "loss": 2.8284, + "mean_token_accuracy": 0.4823960880195599, + "step": 16055 + }, + { + "epoch": 2.9766407119021134, + "grad_norm": 7.52734375, + "learning_rate": 7.0233592880978874e-06, + "loss": 2.8217, + "mean_token_accuracy": 0.4842640669704355, + "step": 16056 + }, + { + "epoch": 2.9768261030774936, + "grad_norm": 10.1015625, + "learning_rate": 7.023173896922507e-06, + "loss": 2.6674, + "mean_token_accuracy": 0.5064112467171327, + "step": 16057 + }, + { + "epoch": 2.9770114942528734, + "grad_norm": 8.8359375, + "learning_rate": 7.022988505747127e-06, + "loss": 2.3997, + "mean_token_accuracy": 0.5429324629676123, + "step": 16058 + }, + { + "epoch": 2.9771968854282536, + "grad_norm": 9.4296875, + "learning_rate": 7.022803114571746e-06, + "loss": 3.2473, + "mean_token_accuracy": 0.4431498079385403, + "step": 16059 + }, + { + "epoch": 2.9773822766036337, + "grad_norm": 8.25, + "learning_rate": 7.022617723396367e-06, + "loss": 3.1666, + "mean_token_accuracy": 0.4750362271110526, + "step": 16060 + }, + { + "epoch": 2.977567667779014, + "grad_norm": 7.26953125, + "learning_rate": 7.0224323322209865e-06, + "loss": 2.9514, + "mean_token_accuracy": 0.4817637452367991, + "step": 16061 + }, + { + "epoch": 2.9777530589543937, + "grad_norm": 8.21875, + "learning_rate": 7.022246941045607e-06, + "loss": 2.9749, + "mean_token_accuracy": 0.477124183006536, + "step": 16062 + }, + { + "epoch": 2.977938450129774, + "grad_norm": 9.1875, + "learning_rate": 7.0220615498702275e-06, + "loss": 3.0178, + "mean_token_accuracy": 0.4651747881355932, + "step": 16063 + }, + { + "epoch": 2.9781238413051536, + "grad_norm": 8.5234375, + "learning_rate": 7.021876158694847e-06, + "loss": 3.2032, + "mean_token_accuracy": 0.46174981923355024, + "step": 16064 + }, + { + "epoch": 2.978309232480534, + "grad_norm": 7.734375, + "learning_rate": 7.021690767519467e-06, + "loss": 3.4314, + "mean_token_accuracy": 0.4638534147695725, + "step": 16065 + }, + { + "epoch": 2.978494623655914, + "grad_norm": 8.078125, + "learning_rate": 7.0215053763440864e-06, + "loss": 2.4013, + "mean_token_accuracy": 0.5224535734383793, + "step": 16066 + }, + { + "epoch": 2.978680014831294, + "grad_norm": 9.640625, + "learning_rate": 7.021319985168706e-06, + "loss": 2.583, + "mean_token_accuracy": 0.49778237729154345, + "step": 16067 + }, + { + "epoch": 2.978865406006674, + "grad_norm": 7.625, + "learning_rate": 7.0211345939933266e-06, + "loss": 2.7554, + "mean_token_accuracy": 0.4992889463477699, + "step": 16068 + }, + { + "epoch": 2.979050797182054, + "grad_norm": 8.1015625, + "learning_rate": 7.020949202817946e-06, + "loss": 2.3226, + "mean_token_accuracy": 0.5322555812163202, + "step": 16069 + }, + { + "epoch": 2.9792361883574343, + "grad_norm": 7.14453125, + "learning_rate": 7.020763811642567e-06, + "loss": 2.5209, + "mean_token_accuracy": 0.5455287872601061, + "step": 16070 + }, + { + "epoch": 2.979421579532814, + "grad_norm": 10.3203125, + "learning_rate": 7.020578420467186e-06, + "loss": 3.1868, + "mean_token_accuracy": 0.48586883029073696, + "step": 16071 + }, + { + "epoch": 2.9796069707081942, + "grad_norm": 7.80859375, + "learning_rate": 7.020393029291807e-06, + "loss": 2.8225, + "mean_token_accuracy": 0.4572822065253952, + "step": 16072 + }, + { + "epoch": 2.9797923618835744, + "grad_norm": 9.875, + "learning_rate": 7.0202076381164265e-06, + "loss": 2.9148, + "mean_token_accuracy": 0.47517433751743376, + "step": 16073 + }, + { + "epoch": 2.9799777530589546, + "grad_norm": 8.5859375, + "learning_rate": 7.020022246941046e-06, + "loss": 2.6503, + "mean_token_accuracy": 0.48443962970258025, + "step": 16074 + }, + { + "epoch": 2.9801631442343344, + "grad_norm": 7.15234375, + "learning_rate": 7.019836855765666e-06, + "loss": 2.3383, + "mean_token_accuracy": 0.5538851827492248, + "step": 16075 + }, + { + "epoch": 2.9803485354097146, + "grad_norm": 8.0078125, + "learning_rate": 7.019651464590285e-06, + "loss": 3.6542, + "mean_token_accuracy": 0.419425763062597, + "step": 16076 + }, + { + "epoch": 2.9805339265850943, + "grad_norm": 13.625, + "learning_rate": 7.019466073414906e-06, + "loss": 3.0616, + "mean_token_accuracy": 0.44610814022578726, + "step": 16077 + }, + { + "epoch": 2.9807193177604745, + "grad_norm": 9.1640625, + "learning_rate": 7.019280682239526e-06, + "loss": 2.9022, + "mean_token_accuracy": 0.4836117041316087, + "step": 16078 + }, + { + "epoch": 2.9809047089358547, + "grad_norm": 9.1875, + "learning_rate": 7.019095291064146e-06, + "loss": 3.2806, + "mean_token_accuracy": 0.47354441138971526, + "step": 16079 + }, + { + "epoch": 2.981090100111235, + "grad_norm": 12.640625, + "learning_rate": 7.0189098998887665e-06, + "loss": 2.7462, + "mean_token_accuracy": 0.5073055153350007, + "step": 16080 + }, + { + "epoch": 2.9812754912866146, + "grad_norm": 10.0859375, + "learning_rate": 7.018724508713386e-06, + "loss": 3.2699, + "mean_token_accuracy": 0.47749820273184757, + "step": 16081 + }, + { + "epoch": 2.981460882461995, + "grad_norm": 10.1171875, + "learning_rate": 7.018539117538006e-06, + "loss": 3.2011, + "mean_token_accuracy": 0.4343350686228112, + "step": 16082 + }, + { + "epoch": 2.981646273637375, + "grad_norm": 7.828125, + "learning_rate": 7.0183537263626255e-06, + "loss": 2.7871, + "mean_token_accuracy": 0.47734790737811017, + "step": 16083 + }, + { + "epoch": 2.9818316648127547, + "grad_norm": 9.6015625, + "learning_rate": 7.018168335187245e-06, + "loss": 3.187, + "mean_token_accuracy": 0.4684385382059801, + "step": 16084 + }, + { + "epoch": 2.982017055988135, + "grad_norm": 8.4921875, + "learning_rate": 7.017982944011866e-06, + "loss": 3.9207, + "mean_token_accuracy": 0.4404518453598208, + "step": 16085 + }, + { + "epoch": 2.982202447163515, + "grad_norm": 7.53125, + "learning_rate": 7.017797552836485e-06, + "loss": 3.185, + "mean_token_accuracy": 0.4580178940123882, + "step": 16086 + }, + { + "epoch": 2.9823878383388953, + "grad_norm": 8.4921875, + "learning_rate": 7.017612161661106e-06, + "loss": 3.0809, + "mean_token_accuracy": 0.4618580060422961, + "step": 16087 + }, + { + "epoch": 2.982573229514275, + "grad_norm": 9.1328125, + "learning_rate": 7.017426770485725e-06, + "loss": 3.334, + "mean_token_accuracy": 0.47300271915058917, + "step": 16088 + }, + { + "epoch": 2.9827586206896552, + "grad_norm": 7.6015625, + "learning_rate": 7.017241379310346e-06, + "loss": 2.8521, + "mean_token_accuracy": 0.4849163937252198, + "step": 16089 + }, + { + "epoch": 2.982944011865035, + "grad_norm": 7.09765625, + "learning_rate": 7.0170559881349655e-06, + "loss": 2.6224, + "mean_token_accuracy": 0.5033280507131537, + "step": 16090 + }, + { + "epoch": 2.983129403040415, + "grad_norm": 7.75, + "learning_rate": 7.016870596959585e-06, + "loss": 3.0301, + "mean_token_accuracy": 0.465818759936407, + "step": 16091 + }, + { + "epoch": 2.9833147942157954, + "grad_norm": 9.7734375, + "learning_rate": 7.016685205784205e-06, + "loss": 2.6424, + "mean_token_accuracy": 0.5103589145024803, + "step": 16092 + }, + { + "epoch": 2.9835001853911756, + "grad_norm": 10.15625, + "learning_rate": 7.0164998146088244e-06, + "loss": 2.9568, + "mean_token_accuracy": 0.4986690328305235, + "step": 16093 + }, + { + "epoch": 2.9836855765665553, + "grad_norm": 7.8203125, + "learning_rate": 7.016314423433445e-06, + "loss": 2.7042, + "mean_token_accuracy": 0.5057142857142857, + "step": 16094 + }, + { + "epoch": 2.9838709677419355, + "grad_norm": 6.52734375, + "learning_rate": 7.0161290322580654e-06, + "loss": 3.4039, + "mean_token_accuracy": 0.4137034849379799, + "step": 16095 + }, + { + "epoch": 2.9840563589173157, + "grad_norm": 8.765625, + "learning_rate": 7.015943641082685e-06, + "loss": 2.7366, + "mean_token_accuracy": 0.4836190062920373, + "step": 16096 + }, + { + "epoch": 2.9842417500926954, + "grad_norm": 8.515625, + "learning_rate": 7.015758249907305e-06, + "loss": 2.5543, + "mean_token_accuracy": 0.5060856498873028, + "step": 16097 + }, + { + "epoch": 2.9844271412680756, + "grad_norm": 7.66796875, + "learning_rate": 7.015572858731925e-06, + "loss": 3.3866, + "mean_token_accuracy": 0.45870462942838514, + "step": 16098 + }, + { + "epoch": 2.984612532443456, + "grad_norm": 8.984375, + "learning_rate": 7.015387467556545e-06, + "loss": 2.5897, + "mean_token_accuracy": 0.5014084507042254, + "step": 16099 + }, + { + "epoch": 2.984797923618836, + "grad_norm": 10.359375, + "learning_rate": 7.0152020763811645e-06, + "loss": 3.033, + "mean_token_accuracy": 0.5031285914953391, + "step": 16100 + }, + { + "epoch": 2.9849833147942157, + "grad_norm": 8.0234375, + "learning_rate": 7.015016685205784e-06, + "loss": 2.8215, + "mean_token_accuracy": 0.4845119091694434, + "step": 16101 + }, + { + "epoch": 2.985168705969596, + "grad_norm": 8.171875, + "learning_rate": 7.014831294030404e-06, + "loss": 2.7788, + "mean_token_accuracy": 0.4604266067920292, + "step": 16102 + }, + { + "epoch": 2.9853540971449757, + "grad_norm": 7.1015625, + "learning_rate": 7.014645902855025e-06, + "loss": 3.1886, + "mean_token_accuracy": 0.4381831085876508, + "step": 16103 + }, + { + "epoch": 2.985539488320356, + "grad_norm": 8.65625, + "learning_rate": 7.014460511679645e-06, + "loss": 2.8462, + "mean_token_accuracy": 0.48796972154636387, + "step": 16104 + }, + { + "epoch": 2.985724879495736, + "grad_norm": 9.4921875, + "learning_rate": 7.014275120504264e-06, + "loss": 3.5178, + "mean_token_accuracy": 0.4276771443793287, + "step": 16105 + }, + { + "epoch": 2.9859102706711163, + "grad_norm": 7.375, + "learning_rate": 7.014089729328885e-06, + "loss": 2.2633, + "mean_token_accuracy": 0.5501248662147699, + "step": 16106 + }, + { + "epoch": 2.986095661846496, + "grad_norm": 8.5625, + "learning_rate": 7.0139043381535046e-06, + "loss": 3.3164, + "mean_token_accuracy": 0.44238038665082624, + "step": 16107 + }, + { + "epoch": 2.986281053021876, + "grad_norm": 7.73046875, + "learning_rate": 7.013718946978124e-06, + "loss": 2.2746, + "mean_token_accuracy": 0.5262465465070385, + "step": 16108 + }, + { + "epoch": 2.9864664441972564, + "grad_norm": 8.2265625, + "learning_rate": 7.013533555802744e-06, + "loss": 3.2038, + "mean_token_accuracy": 0.46635200974421437, + "step": 16109 + }, + { + "epoch": 2.986651835372636, + "grad_norm": 10.2890625, + "learning_rate": 7.0133481646273635e-06, + "loss": 4.4025, + "mean_token_accuracy": 0.40478026214340784, + "step": 16110 + }, + { + "epoch": 2.9868372265480163, + "grad_norm": 8.2109375, + "learning_rate": 7.013162773451985e-06, + "loss": 2.2861, + "mean_token_accuracy": 0.5616968357054027, + "step": 16111 + }, + { + "epoch": 2.9870226177233965, + "grad_norm": 7.76171875, + "learning_rate": 7.0129773822766045e-06, + "loss": 2.6689, + "mean_token_accuracy": 0.4896465174649655, + "step": 16112 + }, + { + "epoch": 2.9872080088987767, + "grad_norm": 9.03125, + "learning_rate": 7.012791991101224e-06, + "loss": 3.5308, + "mean_token_accuracy": 0.44295400943396224, + "step": 16113 + }, + { + "epoch": 2.9873934000741564, + "grad_norm": 8.2890625, + "learning_rate": 7.012606599925844e-06, + "loss": 2.9336, + "mean_token_accuracy": 0.49085754783841246, + "step": 16114 + }, + { + "epoch": 2.9875787912495366, + "grad_norm": 12.0546875, + "learning_rate": 7.012421208750464e-06, + "loss": 3.447, + "mean_token_accuracy": 0.48001122177023425, + "step": 16115 + }, + { + "epoch": 2.9877641824249164, + "grad_norm": 7.43359375, + "learning_rate": 7.012235817575084e-06, + "loss": 3.0891, + "mean_token_accuracy": 0.46674182638105977, + "step": 16116 + }, + { + "epoch": 2.9879495736002966, + "grad_norm": 9.8984375, + "learning_rate": 7.0120504263997035e-06, + "loss": 4.0403, + "mean_token_accuracy": 0.4278874056877539, + "step": 16117 + }, + { + "epoch": 2.9881349647756767, + "grad_norm": 7.203125, + "learning_rate": 7.011865035224323e-06, + "loss": 3.0249, + "mean_token_accuracy": 0.46439222235476557, + "step": 16118 + }, + { + "epoch": 2.988320355951057, + "grad_norm": 8.203125, + "learning_rate": 7.0116796440489445e-06, + "loss": 2.3397, + "mean_token_accuracy": 0.546819438956934, + "step": 16119 + }, + { + "epoch": 2.9885057471264367, + "grad_norm": 7.88671875, + "learning_rate": 7.011494252873564e-06, + "loss": 2.8827, + "mean_token_accuracy": 0.492470174066106, + "step": 16120 + }, + { + "epoch": 2.988691138301817, + "grad_norm": 7.0390625, + "learning_rate": 7.011308861698184e-06, + "loss": 3.044, + "mean_token_accuracy": 0.47456170505328293, + "step": 16121 + }, + { + "epoch": 2.9888765294771966, + "grad_norm": 7.046875, + "learning_rate": 7.0111234705228035e-06, + "loss": 2.3503, + "mean_token_accuracy": 0.5177489177489177, + "step": 16122 + }, + { + "epoch": 2.989061920652577, + "grad_norm": 7.078125, + "learning_rate": 7.010938079347424e-06, + "loss": 2.7325, + "mean_token_accuracy": 0.49824663939216834, + "step": 16123 + }, + { + "epoch": 2.989247311827957, + "grad_norm": 9.4453125, + "learning_rate": 7.010752688172044e-06, + "loss": 3.3835, + "mean_token_accuracy": 0.4614837976122797, + "step": 16124 + }, + { + "epoch": 2.989432703003337, + "grad_norm": 9.515625, + "learning_rate": 7.010567296996663e-06, + "loss": 3.5263, + "mean_token_accuracy": 0.43215997355808955, + "step": 16125 + }, + { + "epoch": 2.989618094178717, + "grad_norm": 7.3671875, + "learning_rate": 7.010381905821283e-06, + "loss": 2.7007, + "mean_token_accuracy": 0.47962962962962963, + "step": 16126 + }, + { + "epoch": 2.989803485354097, + "grad_norm": 7.6015625, + "learning_rate": 7.010196514645904e-06, + "loss": 2.6938, + "mean_token_accuracy": 0.4968358517370528, + "step": 16127 + }, + { + "epoch": 2.9899888765294773, + "grad_norm": 7.21875, + "learning_rate": 7.010011123470524e-06, + "loss": 3.0763, + "mean_token_accuracy": 0.4418569020989735, + "step": 16128 + }, + { + "epoch": 2.990174267704857, + "grad_norm": 9.4296875, + "learning_rate": 7.0098257322951435e-06, + "loss": 3.4117, + "mean_token_accuracy": 0.45829665492957744, + "step": 16129 + }, + { + "epoch": 2.9903596588802372, + "grad_norm": 6.51953125, + "learning_rate": 7.009640341119763e-06, + "loss": 2.99, + "mean_token_accuracy": 0.4742086752637749, + "step": 16130 + }, + { + "epoch": 2.9905450500556174, + "grad_norm": 6.234375, + "learning_rate": 7.009454949944383e-06, + "loss": 2.4049, + "mean_token_accuracy": 0.5227242612958791, + "step": 16131 + }, + { + "epoch": 2.9907304412309976, + "grad_norm": 8.359375, + "learning_rate": 7.009269558769003e-06, + "loss": 3.0595, + "mean_token_accuracy": 0.4732666015625, + "step": 16132 + }, + { + "epoch": 2.9909158324063774, + "grad_norm": 8.03125, + "learning_rate": 7.009084167593623e-06, + "loss": 2.8865, + "mean_token_accuracy": 0.47548377709879236, + "step": 16133 + }, + { + "epoch": 2.9911012235817576, + "grad_norm": 7.1640625, + "learning_rate": 7.0088987764182426e-06, + "loss": 2.655, + "mean_token_accuracy": 0.4969901629716635, + "step": 16134 + }, + { + "epoch": 2.9912866147571373, + "grad_norm": 8.3359375, + "learning_rate": 7.008713385242864e-06, + "loss": 2.2886, + "mean_token_accuracy": 0.554019014693172, + "step": 16135 + }, + { + "epoch": 2.9914720059325175, + "grad_norm": 7.46875, + "learning_rate": 7.0085279940674836e-06, + "loss": 3.3715, + "mean_token_accuracy": 0.4410275879524171, + "step": 16136 + }, + { + "epoch": 2.9916573971078977, + "grad_norm": 7.3203125, + "learning_rate": 7.008342602892103e-06, + "loss": 3.336, + "mean_token_accuracy": 0.450425608440235, + "step": 16137 + }, + { + "epoch": 2.991842788283278, + "grad_norm": 8.1640625, + "learning_rate": 7.008157211716723e-06, + "loss": 2.7438, + "mean_token_accuracy": 0.5210192730565256, + "step": 16138 + }, + { + "epoch": 2.9920281794586576, + "grad_norm": 7.45703125, + "learning_rate": 7.0079718205413425e-06, + "loss": 3.3697, + "mean_token_accuracy": 0.45369592608147835, + "step": 16139 + }, + { + "epoch": 2.992213570634038, + "grad_norm": 16.71875, + "learning_rate": 7.007786429365962e-06, + "loss": 2.3761, + "mean_token_accuracy": 0.5435341136168998, + "step": 16140 + }, + { + "epoch": 2.992398961809418, + "grad_norm": 7.9296875, + "learning_rate": 7.007601038190583e-06, + "loss": 2.9515, + "mean_token_accuracy": 0.46755921730175076, + "step": 16141 + }, + { + "epoch": 2.9925843529847977, + "grad_norm": 7.87109375, + "learning_rate": 7.007415647015202e-06, + "loss": 2.7486, + "mean_token_accuracy": 0.48514357053682894, + "step": 16142 + }, + { + "epoch": 2.992769744160178, + "grad_norm": 8.28125, + "learning_rate": 7.007230255839823e-06, + "loss": 2.6572, + "mean_token_accuracy": 0.5198687568355814, + "step": 16143 + }, + { + "epoch": 2.992955135335558, + "grad_norm": 7.9296875, + "learning_rate": 7.007044864664443e-06, + "loss": 3.6763, + "mean_token_accuracy": 0.42642803558032366, + "step": 16144 + }, + { + "epoch": 2.9931405265109383, + "grad_norm": 9.59375, + "learning_rate": 7.006859473489063e-06, + "loss": 3.1856, + "mean_token_accuracy": 0.4782673414706758, + "step": 16145 + }, + { + "epoch": 2.993325917686318, + "grad_norm": 8.2734375, + "learning_rate": 7.0066740823136825e-06, + "loss": 3.5655, + "mean_token_accuracy": 0.44153577661431065, + "step": 16146 + }, + { + "epoch": 2.9935113088616983, + "grad_norm": 8.7578125, + "learning_rate": 7.006488691138302e-06, + "loss": 3.0041, + "mean_token_accuracy": 0.4998449612403101, + "step": 16147 + }, + { + "epoch": 2.993696700037078, + "grad_norm": 8.46875, + "learning_rate": 7.006303299962922e-06, + "loss": 3.0778, + "mean_token_accuracy": 0.4717532899109398, + "step": 16148 + }, + { + "epoch": 2.993882091212458, + "grad_norm": 9.5078125, + "learning_rate": 7.006117908787542e-06, + "loss": 2.6611, + "mean_token_accuracy": 0.5286360698125404, + "step": 16149 + }, + { + "epoch": 2.9940674823878384, + "grad_norm": 9.109375, + "learning_rate": 7.005932517612162e-06, + "loss": 3.1986, + "mean_token_accuracy": 0.47848875300283905, + "step": 16150 + }, + { + "epoch": 2.9942528735632186, + "grad_norm": 9.140625, + "learning_rate": 7.0057471264367825e-06, + "loss": 2.7963, + "mean_token_accuracy": 0.4821073558648111, + "step": 16151 + }, + { + "epoch": 2.9944382647385983, + "grad_norm": 10.375, + "learning_rate": 7.005561735261402e-06, + "loss": 3.4231, + "mean_token_accuracy": 0.4320261437908497, + "step": 16152 + }, + { + "epoch": 2.9946236559139785, + "grad_norm": 9.5703125, + "learning_rate": 7.005376344086023e-06, + "loss": 2.2001, + "mean_token_accuracy": 0.5798851248388232, + "step": 16153 + }, + { + "epoch": 2.9948090470893587, + "grad_norm": 8.0625, + "learning_rate": 7.005190952910642e-06, + "loss": 2.6064, + "mean_token_accuracy": 0.5100308641975309, + "step": 16154 + }, + { + "epoch": 2.9949944382647384, + "grad_norm": 8.328125, + "learning_rate": 7.005005561735262e-06, + "loss": 3.4784, + "mean_token_accuracy": 0.43902439024390244, + "step": 16155 + }, + { + "epoch": 2.9951798294401186, + "grad_norm": 10.21875, + "learning_rate": 7.0048201705598815e-06, + "loss": 3.7055, + "mean_token_accuracy": 0.4459493041749503, + "step": 16156 + }, + { + "epoch": 2.995365220615499, + "grad_norm": 9.625, + "learning_rate": 7.004634779384501e-06, + "loss": 2.9989, + "mean_token_accuracy": 0.4818711967545639, + "step": 16157 + }, + { + "epoch": 2.995550611790879, + "grad_norm": 8.7265625, + "learning_rate": 7.004449388209122e-06, + "loss": 3.4792, + "mean_token_accuracy": 0.4525516055045872, + "step": 16158 + }, + { + "epoch": 2.9957360029662587, + "grad_norm": 11.421875, + "learning_rate": 7.004263997033742e-06, + "loss": 3.0171, + "mean_token_accuracy": 0.46459918080748974, + "step": 16159 + }, + { + "epoch": 2.995921394141639, + "grad_norm": 10.6484375, + "learning_rate": 7.004078605858362e-06, + "loss": 2.5996, + "mean_token_accuracy": 0.49562644713146387, + "step": 16160 + }, + { + "epoch": 2.9961067853170187, + "grad_norm": 7.34375, + "learning_rate": 7.003893214682982e-06, + "loss": 2.9389, + "mean_token_accuracy": 0.46618106139438087, + "step": 16161 + }, + { + "epoch": 2.996292176492399, + "grad_norm": 9.578125, + "learning_rate": 7.003707823507602e-06, + "loss": 2.7988, + "mean_token_accuracy": 0.504957127545552, + "step": 16162 + }, + { + "epoch": 2.996477567667779, + "grad_norm": 7.62890625, + "learning_rate": 7.0035224323322216e-06, + "loss": 2.5946, + "mean_token_accuracy": 0.5185609157808667, + "step": 16163 + }, + { + "epoch": 2.9966629588431593, + "grad_norm": 8.046875, + "learning_rate": 7.003337041156841e-06, + "loss": 3.0388, + "mean_token_accuracy": 0.4623420170234718, + "step": 16164 + }, + { + "epoch": 2.996848350018539, + "grad_norm": 9.5859375, + "learning_rate": 7.003151649981461e-06, + "loss": 3.5104, + "mean_token_accuracy": 0.4433003658040916, + "step": 16165 + }, + { + "epoch": 2.997033741193919, + "grad_norm": 9.59375, + "learning_rate": 7.002966258806081e-06, + "loss": 2.7606, + "mean_token_accuracy": 0.48795928500496527, + "step": 16166 + }, + { + "epoch": 2.9972191323692994, + "grad_norm": 7.9140625, + "learning_rate": 7.002780867630702e-06, + "loss": 3.117, + "mean_token_accuracy": 0.4677132701421801, + "step": 16167 + }, + { + "epoch": 2.997404523544679, + "grad_norm": 8.296875, + "learning_rate": 7.0025954764553215e-06, + "loss": 3.6018, + "mean_token_accuracy": 0.45206800916435547, + "step": 16168 + }, + { + "epoch": 2.9975899147200593, + "grad_norm": 11.625, + "learning_rate": 7.002410085279941e-06, + "loss": 2.9749, + "mean_token_accuracy": 0.466768193190471, + "step": 16169 + }, + { + "epoch": 2.9977753058954395, + "grad_norm": 9.2421875, + "learning_rate": 7.002224694104562e-06, + "loss": 3.797, + "mean_token_accuracy": 0.45506514224940176, + "step": 16170 + }, + { + "epoch": 2.9979606970708197, + "grad_norm": 9.1953125, + "learning_rate": 7.002039302929181e-06, + "loss": 2.7444, + "mean_token_accuracy": 0.4881911262798635, + "step": 16171 + }, + { + "epoch": 2.9981460882461994, + "grad_norm": 9.0625, + "learning_rate": 7.001853911753801e-06, + "loss": 2.7995, + "mean_token_accuracy": 0.5065345989819783, + "step": 16172 + }, + { + "epoch": 2.9983314794215796, + "grad_norm": 9.0, + "learning_rate": 7.0016685205784206e-06, + "loss": 2.9254, + "mean_token_accuracy": 0.4580208081296879, + "step": 16173 + }, + { + "epoch": 2.9985168705969594, + "grad_norm": 8.296875, + "learning_rate": 7.00148312940304e-06, + "loss": 3.1217, + "mean_token_accuracy": 0.4549600912200684, + "step": 16174 + }, + { + "epoch": 2.9987022617723396, + "grad_norm": 8.0859375, + "learning_rate": 7.0012977382276615e-06, + "loss": 2.8073, + "mean_token_accuracy": 0.48674785100286533, + "step": 16175 + }, + { + "epoch": 2.9988876529477198, + "grad_norm": 8.5234375, + "learning_rate": 7.001112347052281e-06, + "loss": 3.1527, + "mean_token_accuracy": 0.4547996272134203, + "step": 16176 + }, + { + "epoch": 2.9990730441231, + "grad_norm": 9.1640625, + "learning_rate": 7.000926955876901e-06, + "loss": 3.1718, + "mean_token_accuracy": 0.46279128038085693, + "step": 16177 + }, + { + "epoch": 2.9992584352984797, + "grad_norm": 8.1328125, + "learning_rate": 7.0007415647015205e-06, + "loss": 2.3162, + "mean_token_accuracy": 0.555876100819921, + "step": 16178 + }, + { + "epoch": 2.99944382647386, + "grad_norm": 9.8203125, + "learning_rate": 7.000556173526141e-06, + "loss": 3.4382, + "mean_token_accuracy": 0.4462552873849216, + "step": 16179 + }, + { + "epoch": 2.9996292176492396, + "grad_norm": 9.6640625, + "learning_rate": 7.000370782350761e-06, + "loss": 3.5468, + "mean_token_accuracy": 0.439365975464119, + "step": 16180 + }, + { + "epoch": 2.99981460882462, + "grad_norm": 9.1953125, + "learning_rate": 7.00018539117538e-06, + "loss": 3.1937, + "mean_token_accuracy": 0.457538789529788, + "step": 16181 + }, + { + "epoch": 3.0, + "grad_norm": 7.328125, + "learning_rate": 7e-06, + "loss": 2.8511, + "mean_token_accuracy": 0.47609608208955223, + "step": 16182 + } + ], + "logging_steps": 1, + "max_steps": 53940, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6379480929583923e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}