diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21000 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 14968, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.680919294494922e-05, + "grad_norm": 0.3203125, + "learning_rate": 1.3360053440213764e-07, + "loss": 1.5739, + "step": 1 + }, + { + "epoch": 0.00033404596472474615, + "grad_norm": 0.431640625, + "learning_rate": 6.680026720106881e-07, + "loss": 1.6599, + "step": 5 + }, + { + "epoch": 0.0006680919294494923, + "grad_norm": 0.263671875, + "learning_rate": 1.3360053440213763e-06, + "loss": 1.6221, + "step": 10 + }, + { + "epoch": 0.0010021378941742383, + "grad_norm": 0.361328125, + "learning_rate": 2.004008016032064e-06, + "loss": 1.5389, + "step": 15 + }, + { + "epoch": 0.0013361838588989846, + "grad_norm": 0.318359375, + "learning_rate": 2.6720106880427525e-06, + "loss": 1.5856, + "step": 20 + }, + { + "epoch": 0.0016702298236237307, + "grad_norm": 0.33984375, + "learning_rate": 3.3400133600534405e-06, + "loss": 1.5469, + "step": 25 + }, + { + "epoch": 0.0020042757883484766, + "grad_norm": 0.361328125, + "learning_rate": 4.008016032064128e-06, + "loss": 1.5633, + "step": 30 + }, + { + "epoch": 0.0023383217530732227, + "grad_norm": 0.279296875, + "learning_rate": 4.676018704074817e-06, + "loss": 1.5587, + "step": 35 + }, + { + "epoch": 0.002672367717797969, + "grad_norm": 0.353515625, + "learning_rate": 5.344021376085505e-06, + "loss": 1.6141, + "step": 40 + }, + { + "epoch": 0.0030064136825227153, + "grad_norm": 0.318359375, + "learning_rate": 6.012024048096193e-06, + "loss": 1.5299, + "step": 45 + }, + { + "epoch": 0.0033404596472474614, + "grad_norm": 0.283203125, + "learning_rate": 6.680026720106881e-06, + "loss": 1.5561, + "step": 50 + }, + { + "epoch": 0.0036745056119722075, + "grad_norm": 0.35546875, + "learning_rate": 7.348029392117569e-06, + "loss": 1.5794, + "step": 55 + }, + { + "epoch": 0.004008551576696953, + "grad_norm": 0.310546875, + "learning_rate": 8.016032064128256e-06, + "loss": 1.5619, + "step": 60 + }, + { + "epoch": 0.004342597541421699, + "grad_norm": 0.2265625, + "learning_rate": 8.684034736138945e-06, + "loss": 1.5249, + "step": 65 + }, + { + "epoch": 0.004676643506146445, + "grad_norm": 0.77734375, + "learning_rate": 9.352037408149633e-06, + "loss": 1.5814, + "step": 70 + }, + { + "epoch": 0.0050106894708711915, + "grad_norm": 0.2138671875, + "learning_rate": 1.0020040080160322e-05, + "loss": 1.483, + "step": 75 + }, + { + "epoch": 0.005344735435595938, + "grad_norm": 0.267578125, + "learning_rate": 1.068804275217101e-05, + "loss": 1.3885, + "step": 80 + }, + { + "epoch": 0.0056787814003206845, + "grad_norm": 0.640625, + "learning_rate": 1.1356045424181697e-05, + "loss": 1.4409, + "step": 85 + }, + { + "epoch": 0.006012827365045431, + "grad_norm": 0.33984375, + "learning_rate": 1.2024048096192385e-05, + "loss": 1.4832, + "step": 90 + }, + { + "epoch": 0.006346873329770177, + "grad_norm": 0.236328125, + "learning_rate": 1.2692050768203074e-05, + "loss": 1.4454, + "step": 95 + }, + { + "epoch": 0.006680919294494923, + "grad_norm": 0.26171875, + "learning_rate": 1.3360053440213762e-05, + "loss": 1.5162, + "step": 100 + }, + { + "epoch": 0.007014965259219669, + "grad_norm": 0.251953125, + "learning_rate": 1.4028056112224449e-05, + "loss": 1.4367, + "step": 105 + }, + { + "epoch": 0.007349011223944415, + "grad_norm": 0.212890625, + "learning_rate": 1.4696058784235137e-05, + "loss": 1.4453, + "step": 110 + }, + { + "epoch": 0.007683057188669161, + "grad_norm": 0.23046875, + "learning_rate": 1.5364061456245828e-05, + "loss": 1.4518, + "step": 115 + }, + { + "epoch": 0.008017103153393906, + "grad_norm": 0.2060546875, + "learning_rate": 1.6032064128256513e-05, + "loss": 1.4095, + "step": 120 + }, + { + "epoch": 0.008351149118118652, + "grad_norm": 0.1904296875, + "learning_rate": 1.6700066800267204e-05, + "loss": 1.3303, + "step": 125 + }, + { + "epoch": 0.008685195082843399, + "grad_norm": 0.2265625, + "learning_rate": 1.736806947227789e-05, + "loss": 1.5248, + "step": 130 + }, + { + "epoch": 0.009019241047568145, + "grad_norm": 0.203125, + "learning_rate": 1.8036072144288578e-05, + "loss": 1.3274, + "step": 135 + }, + { + "epoch": 0.00935328701229289, + "grad_norm": 0.177734375, + "learning_rate": 1.8704074816299266e-05, + "loss": 1.4265, + "step": 140 + }, + { + "epoch": 0.009687332977017637, + "grad_norm": 0.259765625, + "learning_rate": 1.9372077488309955e-05, + "loss": 1.3648, + "step": 145 + }, + { + "epoch": 0.010021378941742383, + "grad_norm": 0.169921875, + "learning_rate": 2.0040080160320643e-05, + "loss": 1.2991, + "step": 150 + }, + { + "epoch": 0.010355424906467129, + "grad_norm": 0.1865234375, + "learning_rate": 2.070808283233133e-05, + "loss": 1.4394, + "step": 155 + }, + { + "epoch": 0.010689470871191877, + "grad_norm": 0.17578125, + "learning_rate": 2.137608550434202e-05, + "loss": 1.3224, + "step": 160 + }, + { + "epoch": 0.011023516835916623, + "grad_norm": 0.1767578125, + "learning_rate": 2.2044088176352705e-05, + "loss": 1.3659, + "step": 165 + }, + { + "epoch": 0.011357562800641369, + "grad_norm": 0.1689453125, + "learning_rate": 2.2712090848363394e-05, + "loss": 1.3688, + "step": 170 + }, + { + "epoch": 0.011691608765366115, + "grad_norm": 0.1865234375, + "learning_rate": 2.3380093520374082e-05, + "loss": 1.3416, + "step": 175 + }, + { + "epoch": 0.012025654730090861, + "grad_norm": 0.16796875, + "learning_rate": 2.404809619238477e-05, + "loss": 1.3394, + "step": 180 + }, + { + "epoch": 0.012359700694815607, + "grad_norm": 0.1572265625, + "learning_rate": 2.471609886439546e-05, + "loss": 1.2949, + "step": 185 + }, + { + "epoch": 0.012693746659540353, + "grad_norm": 0.1611328125, + "learning_rate": 2.5384101536406147e-05, + "loss": 1.4109, + "step": 190 + }, + { + "epoch": 0.0130277926242651, + "grad_norm": 0.1982421875, + "learning_rate": 2.6052104208416833e-05, + "loss": 1.3706, + "step": 195 + }, + { + "epoch": 0.013361838588989846, + "grad_norm": 0.166015625, + "learning_rate": 2.6720106880427524e-05, + "loss": 1.3242, + "step": 200 + }, + { + "epoch": 0.013695884553714592, + "grad_norm": 0.1962890625, + "learning_rate": 2.7388109552438213e-05, + "loss": 1.3187, + "step": 205 + }, + { + "epoch": 0.014029930518439338, + "grad_norm": 0.1806640625, + "learning_rate": 2.8056112224448898e-05, + "loss": 1.4374, + "step": 210 + }, + { + "epoch": 0.014363976483164084, + "grad_norm": 0.1572265625, + "learning_rate": 2.8724114896459586e-05, + "loss": 1.2938, + "step": 215 + }, + { + "epoch": 0.01469802244788883, + "grad_norm": 0.21875, + "learning_rate": 2.9392117568470275e-05, + "loss": 1.3603, + "step": 220 + }, + { + "epoch": 0.015032068412613576, + "grad_norm": 0.20703125, + "learning_rate": 3.0060120240480967e-05, + "loss": 1.3989, + "step": 225 + }, + { + "epoch": 0.015366114377338322, + "grad_norm": 0.1884765625, + "learning_rate": 3.0728122912491655e-05, + "loss": 1.3631, + "step": 230 + }, + { + "epoch": 0.015700160342063067, + "grad_norm": 0.1748046875, + "learning_rate": 3.139612558450234e-05, + "loss": 1.318, + "step": 235 + }, + { + "epoch": 0.016034206306787813, + "grad_norm": 0.2060546875, + "learning_rate": 3.2064128256513025e-05, + "loss": 1.344, + "step": 240 + }, + { + "epoch": 0.01636825227151256, + "grad_norm": 0.1875, + "learning_rate": 3.273213092852371e-05, + "loss": 1.3326, + "step": 245 + }, + { + "epoch": 0.016702298236237305, + "grad_norm": 0.216796875, + "learning_rate": 3.340013360053441e-05, + "loss": 1.2978, + "step": 250 + }, + { + "epoch": 0.01703634420096205, + "grad_norm": 0.228515625, + "learning_rate": 3.4068136272545094e-05, + "loss": 1.2572, + "step": 255 + }, + { + "epoch": 0.017370390165686797, + "grad_norm": 0.1943359375, + "learning_rate": 3.473613894455578e-05, + "loss": 1.2778, + "step": 260 + }, + { + "epoch": 0.017704436130411543, + "grad_norm": 0.30078125, + "learning_rate": 3.5404141616566464e-05, + "loss": 1.2495, + "step": 265 + }, + { + "epoch": 0.01803848209513629, + "grad_norm": 0.1826171875, + "learning_rate": 3.6072144288577156e-05, + "loss": 1.3069, + "step": 270 + }, + { + "epoch": 0.018372528059861035, + "grad_norm": 0.2099609375, + "learning_rate": 3.674014696058785e-05, + "loss": 1.3499, + "step": 275 + }, + { + "epoch": 0.01870657402458578, + "grad_norm": 0.2255859375, + "learning_rate": 3.740814963259853e-05, + "loss": 1.3404, + "step": 280 + }, + { + "epoch": 0.019040619989310528, + "grad_norm": 0.21875, + "learning_rate": 3.807615230460922e-05, + "loss": 1.2487, + "step": 285 + }, + { + "epoch": 0.019374665954035274, + "grad_norm": 0.212890625, + "learning_rate": 3.874415497661991e-05, + "loss": 1.3379, + "step": 290 + }, + { + "epoch": 0.01970871191876002, + "grad_norm": 0.1953125, + "learning_rate": 3.94121576486306e-05, + "loss": 1.2742, + "step": 295 + }, + { + "epoch": 0.020042757883484766, + "grad_norm": 0.193359375, + "learning_rate": 4.0080160320641287e-05, + "loss": 1.3057, + "step": 300 + }, + { + "epoch": 0.020376803848209512, + "grad_norm": 0.185546875, + "learning_rate": 4.074816299265197e-05, + "loss": 1.2915, + "step": 305 + }, + { + "epoch": 0.020710849812934258, + "grad_norm": 0.1962890625, + "learning_rate": 4.141616566466266e-05, + "loss": 1.2924, + "step": 310 + }, + { + "epoch": 0.021044895777659008, + "grad_norm": 0.19140625, + "learning_rate": 4.208416833667335e-05, + "loss": 1.2266, + "step": 315 + }, + { + "epoch": 0.021378941742383754, + "grad_norm": 0.2080078125, + "learning_rate": 4.275217100868404e-05, + "loss": 1.3178, + "step": 320 + }, + { + "epoch": 0.0217129877071085, + "grad_norm": 0.2060546875, + "learning_rate": 4.3420173680694725e-05, + "loss": 1.2772, + "step": 325 + }, + { + "epoch": 0.022047033671833246, + "grad_norm": 0.189453125, + "learning_rate": 4.408817635270541e-05, + "loss": 1.3171, + "step": 330 + }, + { + "epoch": 0.022381079636557992, + "grad_norm": 0.236328125, + "learning_rate": 4.47561790247161e-05, + "loss": 1.3287, + "step": 335 + }, + { + "epoch": 0.022715125601282738, + "grad_norm": 0.2119140625, + "learning_rate": 4.542418169672679e-05, + "loss": 1.3499, + "step": 340 + }, + { + "epoch": 0.023049171566007484, + "grad_norm": 0.2216796875, + "learning_rate": 4.609218436873748e-05, + "loss": 1.2967, + "step": 345 + }, + { + "epoch": 0.02338321753073223, + "grad_norm": 0.2236328125, + "learning_rate": 4.6760187040748164e-05, + "loss": 1.2266, + "step": 350 + }, + { + "epoch": 0.023717263495456976, + "grad_norm": 0.2021484375, + "learning_rate": 4.742818971275885e-05, + "loss": 1.265, + "step": 355 + }, + { + "epoch": 0.024051309460181722, + "grad_norm": 0.2021484375, + "learning_rate": 4.809619238476954e-05, + "loss": 1.2426, + "step": 360 + }, + { + "epoch": 0.02438535542490647, + "grad_norm": 0.244140625, + "learning_rate": 4.876419505678023e-05, + "loss": 1.3144, + "step": 365 + }, + { + "epoch": 0.024719401389631215, + "grad_norm": 0.208984375, + "learning_rate": 4.943219772879092e-05, + "loss": 1.2892, + "step": 370 + }, + { + "epoch": 0.02505344735435596, + "grad_norm": 0.216796875, + "learning_rate": 5.01002004008016e-05, + "loss": 1.1596, + "step": 375 + }, + { + "epoch": 0.025387493319080707, + "grad_norm": 0.2119140625, + "learning_rate": 5.0768203072812295e-05, + "loss": 1.3151, + "step": 380 + }, + { + "epoch": 0.025721539283805453, + "grad_norm": 0.2236328125, + "learning_rate": 5.143620574482299e-05, + "loss": 1.2643, + "step": 385 + }, + { + "epoch": 0.0260555852485302, + "grad_norm": 0.2041015625, + "learning_rate": 5.2104208416833665e-05, + "loss": 1.2695, + "step": 390 + }, + { + "epoch": 0.026389631213254945, + "grad_norm": 0.2041015625, + "learning_rate": 5.277221108884436e-05, + "loss": 1.2472, + "step": 395 + }, + { + "epoch": 0.02672367717797969, + "grad_norm": 0.2314453125, + "learning_rate": 5.344021376085505e-05, + "loss": 1.3066, + "step": 400 + }, + { + "epoch": 0.027057723142704437, + "grad_norm": 0.251953125, + "learning_rate": 5.4108216432865734e-05, + "loss": 1.2664, + "step": 405 + }, + { + "epoch": 0.027391769107429183, + "grad_norm": 0.251953125, + "learning_rate": 5.4776219104876426e-05, + "loss": 1.221, + "step": 410 + }, + { + "epoch": 0.02772581507215393, + "grad_norm": 0.201171875, + "learning_rate": 5.5444221776887104e-05, + "loss": 1.3246, + "step": 415 + }, + { + "epoch": 0.028059861036878676, + "grad_norm": 0.22265625, + "learning_rate": 5.6112224448897796e-05, + "loss": 1.2621, + "step": 420 + }, + { + "epoch": 0.028393907001603422, + "grad_norm": 0.2578125, + "learning_rate": 5.6780227120908494e-05, + "loss": 1.3305, + "step": 425 + }, + { + "epoch": 0.028727952966328168, + "grad_norm": 0.2080078125, + "learning_rate": 5.744822979291917e-05, + "loss": 1.2464, + "step": 430 + }, + { + "epoch": 0.029061998931052914, + "grad_norm": 0.2001953125, + "learning_rate": 5.8116232464929865e-05, + "loss": 1.284, + "step": 435 + }, + { + "epoch": 0.02939604489577766, + "grad_norm": 0.2109375, + "learning_rate": 5.878423513694055e-05, + "loss": 1.3277, + "step": 440 + }, + { + "epoch": 0.029730090860502406, + "grad_norm": 0.2216796875, + "learning_rate": 5.945223780895124e-05, + "loss": 1.2383, + "step": 445 + }, + { + "epoch": 0.030064136825227152, + "grad_norm": 0.2080078125, + "learning_rate": 6.012024048096193e-05, + "loss": 1.2604, + "step": 450 + }, + { + "epoch": 0.0303981827899519, + "grad_norm": 0.228515625, + "learning_rate": 6.078824315297261e-05, + "loss": 1.302, + "step": 455 + }, + { + "epoch": 0.030732228754676644, + "grad_norm": 0.1982421875, + "learning_rate": 6.145624582498331e-05, + "loss": 1.2275, + "step": 460 + }, + { + "epoch": 0.03106627471940139, + "grad_norm": 0.21484375, + "learning_rate": 6.212424849699398e-05, + "loss": 1.3504, + "step": 465 + }, + { + "epoch": 0.03140032068412613, + "grad_norm": 0.2080078125, + "learning_rate": 6.279225116900468e-05, + "loss": 1.2612, + "step": 470 + }, + { + "epoch": 0.03173436664885088, + "grad_norm": 0.23828125, + "learning_rate": 6.346025384101537e-05, + "loss": 1.2555, + "step": 475 + }, + { + "epoch": 0.032068412613575625, + "grad_norm": 0.1943359375, + "learning_rate": 6.412825651302605e-05, + "loss": 1.2519, + "step": 480 + }, + { + "epoch": 0.032402458578300375, + "grad_norm": 0.2255859375, + "learning_rate": 6.479625918503675e-05, + "loss": 1.2233, + "step": 485 + }, + { + "epoch": 0.03273650454302512, + "grad_norm": 0.2080078125, + "learning_rate": 6.546426185704742e-05, + "loss": 1.2148, + "step": 490 + }, + { + "epoch": 0.03307055050774987, + "grad_norm": 0.2021484375, + "learning_rate": 6.613226452905812e-05, + "loss": 1.2288, + "step": 495 + }, + { + "epoch": 0.03340459647247461, + "grad_norm": 0.224609375, + "learning_rate": 6.680026720106882e-05, + "loss": 1.3175, + "step": 500 + }, + { + "epoch": 0.03373864243719936, + "grad_norm": 0.193359375, + "learning_rate": 6.746826987307949e-05, + "loss": 1.2848, + "step": 505 + }, + { + "epoch": 0.0340726884019241, + "grad_norm": 0.2412109375, + "learning_rate": 6.813627254509019e-05, + "loss": 1.2759, + "step": 510 + }, + { + "epoch": 0.03440673436664885, + "grad_norm": 0.1943359375, + "learning_rate": 6.880427521710087e-05, + "loss": 1.3355, + "step": 515 + }, + { + "epoch": 0.034740780331373594, + "grad_norm": 0.201171875, + "learning_rate": 6.947227788911156e-05, + "loss": 1.2873, + "step": 520 + }, + { + "epoch": 0.035074826296098344, + "grad_norm": 0.21484375, + "learning_rate": 7.014028056112226e-05, + "loss": 1.27, + "step": 525 + }, + { + "epoch": 0.035408872260823086, + "grad_norm": 0.2060546875, + "learning_rate": 7.080828323313293e-05, + "loss": 1.1939, + "step": 530 + }, + { + "epoch": 0.035742918225547836, + "grad_norm": 0.212890625, + "learning_rate": 7.147628590514363e-05, + "loss": 1.2536, + "step": 535 + }, + { + "epoch": 0.03607696419027258, + "grad_norm": 0.2001953125, + "learning_rate": 7.214428857715431e-05, + "loss": 1.276, + "step": 540 + }, + { + "epoch": 0.03641101015499733, + "grad_norm": 0.2158203125, + "learning_rate": 7.2812291249165e-05, + "loss": 1.2621, + "step": 545 + }, + { + "epoch": 0.03674505611972207, + "grad_norm": 0.2451171875, + "learning_rate": 7.34802939211757e-05, + "loss": 1.2361, + "step": 550 + }, + { + "epoch": 0.03707910208444682, + "grad_norm": 0.2119140625, + "learning_rate": 7.414829659318637e-05, + "loss": 1.3944, + "step": 555 + }, + { + "epoch": 0.03741314804917156, + "grad_norm": 0.2099609375, + "learning_rate": 7.481629926519707e-05, + "loss": 1.2375, + "step": 560 + }, + { + "epoch": 0.03774719401389631, + "grad_norm": 0.2255859375, + "learning_rate": 7.548430193720776e-05, + "loss": 1.2653, + "step": 565 + }, + { + "epoch": 0.038081239978621055, + "grad_norm": 0.208984375, + "learning_rate": 7.615230460921844e-05, + "loss": 1.2265, + "step": 570 + }, + { + "epoch": 0.038415285943345805, + "grad_norm": 0.2001953125, + "learning_rate": 7.682030728122913e-05, + "loss": 1.263, + "step": 575 + }, + { + "epoch": 0.03874933190807055, + "grad_norm": 0.2021484375, + "learning_rate": 7.748830995323982e-05, + "loss": 1.1832, + "step": 580 + }, + { + "epoch": 0.0390833778727953, + "grad_norm": 0.2138671875, + "learning_rate": 7.81563126252505e-05, + "loss": 1.2563, + "step": 585 + }, + { + "epoch": 0.03941742383752004, + "grad_norm": 0.1865234375, + "learning_rate": 7.88243152972612e-05, + "loss": 1.2236, + "step": 590 + }, + { + "epoch": 0.03975146980224479, + "grad_norm": 0.1904296875, + "learning_rate": 7.949231796927187e-05, + "loss": 1.2766, + "step": 595 + }, + { + "epoch": 0.04008551576696953, + "grad_norm": 0.1982421875, + "learning_rate": 8.016032064128257e-05, + "loss": 1.2377, + "step": 600 + }, + { + "epoch": 0.04041956173169428, + "grad_norm": 0.2255859375, + "learning_rate": 8.082832331329326e-05, + "loss": 1.2083, + "step": 605 + }, + { + "epoch": 0.040753607696419024, + "grad_norm": 0.201171875, + "learning_rate": 8.149632598530394e-05, + "loss": 1.3272, + "step": 610 + }, + { + "epoch": 0.04108765366114377, + "grad_norm": 0.1953125, + "learning_rate": 8.216432865731464e-05, + "loss": 1.2428, + "step": 615 + }, + { + "epoch": 0.041421699625868516, + "grad_norm": 0.2197265625, + "learning_rate": 8.283233132932531e-05, + "loss": 1.3088, + "step": 620 + }, + { + "epoch": 0.041755745590593266, + "grad_norm": 0.1962890625, + "learning_rate": 8.350033400133601e-05, + "loss": 1.2174, + "step": 625 + }, + { + "epoch": 0.042089791555318015, + "grad_norm": 0.2490234375, + "learning_rate": 8.41683366733467e-05, + "loss": 1.1424, + "step": 630 + }, + { + "epoch": 0.04242383752004276, + "grad_norm": 0.234375, + "learning_rate": 8.483633934535738e-05, + "loss": 1.2562, + "step": 635 + }, + { + "epoch": 0.04275788348476751, + "grad_norm": 0.1982421875, + "learning_rate": 8.550434201736808e-05, + "loss": 1.2747, + "step": 640 + }, + { + "epoch": 0.04309192944949225, + "grad_norm": 0.1953125, + "learning_rate": 8.617234468937875e-05, + "loss": 1.2727, + "step": 645 + }, + { + "epoch": 0.043425975414217, + "grad_norm": 0.1953125, + "learning_rate": 8.684034736138945e-05, + "loss": 1.2413, + "step": 650 + }, + { + "epoch": 0.04376002137894174, + "grad_norm": 0.2158203125, + "learning_rate": 8.750835003340014e-05, + "loss": 1.2405, + "step": 655 + }, + { + "epoch": 0.04409406734366649, + "grad_norm": 0.1875, + "learning_rate": 8.817635270541082e-05, + "loss": 1.2667, + "step": 660 + }, + { + "epoch": 0.044428113308391234, + "grad_norm": 0.23828125, + "learning_rate": 8.884435537742152e-05, + "loss": 1.2387, + "step": 665 + }, + { + "epoch": 0.044762159273115984, + "grad_norm": 0.193359375, + "learning_rate": 8.95123580494322e-05, + "loss": 1.1747, + "step": 670 + }, + { + "epoch": 0.04509620523784073, + "grad_norm": 0.1962890625, + "learning_rate": 9.018036072144289e-05, + "loss": 1.1909, + "step": 675 + }, + { + "epoch": 0.045430251202565476, + "grad_norm": 0.2470703125, + "learning_rate": 9.084836339345357e-05, + "loss": 1.2968, + "step": 680 + }, + { + "epoch": 0.04576429716729022, + "grad_norm": 0.2041015625, + "learning_rate": 9.151636606546426e-05, + "loss": 1.2266, + "step": 685 + }, + { + "epoch": 0.04609834313201497, + "grad_norm": 0.2216796875, + "learning_rate": 9.218436873747496e-05, + "loss": 1.1976, + "step": 690 + }, + { + "epoch": 0.04643238909673971, + "grad_norm": 0.2080078125, + "learning_rate": 9.285237140948564e-05, + "loss": 1.2633, + "step": 695 + }, + { + "epoch": 0.04676643506146446, + "grad_norm": 0.2041015625, + "learning_rate": 9.352037408149633e-05, + "loss": 1.1995, + "step": 700 + }, + { + "epoch": 0.0471004810261892, + "grad_norm": 0.2021484375, + "learning_rate": 9.418837675350703e-05, + "loss": 1.2037, + "step": 705 + }, + { + "epoch": 0.04743452699091395, + "grad_norm": 0.2060546875, + "learning_rate": 9.48563794255177e-05, + "loss": 1.2941, + "step": 710 + }, + { + "epoch": 0.047768572955638695, + "grad_norm": 0.1923828125, + "learning_rate": 9.55243820975284e-05, + "loss": 1.1902, + "step": 715 + }, + { + "epoch": 0.048102618920363445, + "grad_norm": 0.205078125, + "learning_rate": 9.619238476953908e-05, + "loss": 1.214, + "step": 720 + }, + { + "epoch": 0.04843666488508819, + "grad_norm": 0.1865234375, + "learning_rate": 9.686038744154977e-05, + "loss": 1.1536, + "step": 725 + }, + { + "epoch": 0.04877071084981294, + "grad_norm": 0.189453125, + "learning_rate": 9.752839011356047e-05, + "loss": 1.2421, + "step": 730 + }, + { + "epoch": 0.04910475681453768, + "grad_norm": 0.2138671875, + "learning_rate": 9.819639278557115e-05, + "loss": 1.2786, + "step": 735 + }, + { + "epoch": 0.04943880277926243, + "grad_norm": 0.2001953125, + "learning_rate": 9.886439545758184e-05, + "loss": 1.2637, + "step": 740 + }, + { + "epoch": 0.04977284874398717, + "grad_norm": 0.197265625, + "learning_rate": 9.953239812959252e-05, + "loss": 1.2836, + "step": 745 + }, + { + "epoch": 0.05010689470871192, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001002004008016032, + "loss": 1.281, + "step": 750 + }, + { + "epoch": 0.050440940673436664, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001008684034736139, + "loss": 1.2277, + "step": 755 + }, + { + "epoch": 0.050774986638161414, + "grad_norm": 0.2265625, + "learning_rate": 0.00010153640614562459, + "loss": 1.2501, + "step": 760 + }, + { + "epoch": 0.051109032602886156, + "grad_norm": 0.181640625, + "learning_rate": 0.00010220440881763526, + "loss": 1.294, + "step": 765 + }, + { + "epoch": 0.051443078567610906, + "grad_norm": 0.2099609375, + "learning_rate": 0.00010287241148964597, + "loss": 1.2892, + "step": 770 + }, + { + "epoch": 0.05177712453233565, + "grad_norm": 0.1845703125, + "learning_rate": 0.00010354041416165665, + "loss": 1.2675, + "step": 775 + }, + { + "epoch": 0.0521111704970604, + "grad_norm": 0.2099609375, + "learning_rate": 0.00010420841683366733, + "loss": 1.2157, + "step": 780 + }, + { + "epoch": 0.05244521646178514, + "grad_norm": 0.1884765625, + "learning_rate": 0.00010487641950567804, + "loss": 1.2635, + "step": 785 + }, + { + "epoch": 0.05277926242650989, + "grad_norm": 0.17578125, + "learning_rate": 0.00010554442217768871, + "loss": 1.2452, + "step": 790 + }, + { + "epoch": 0.05311330839123463, + "grad_norm": 0.1953125, + "learning_rate": 0.0001062124248496994, + "loss": 1.1851, + "step": 795 + }, + { + "epoch": 0.05344735435595938, + "grad_norm": 0.251953125, + "learning_rate": 0.0001068804275217101, + "loss": 1.2847, + "step": 800 + }, + { + "epoch": 0.053781400320684125, + "grad_norm": 0.201171875, + "learning_rate": 0.00010754843019372078, + "loss": 1.2923, + "step": 805 + }, + { + "epoch": 0.054115446285408875, + "grad_norm": 0.1962890625, + "learning_rate": 0.00010821643286573147, + "loss": 1.1858, + "step": 810 + }, + { + "epoch": 0.05444949225013362, + "grad_norm": 0.20703125, + "learning_rate": 0.00010888443553774215, + "loss": 1.2945, + "step": 815 + }, + { + "epoch": 0.05478353821485837, + "grad_norm": 0.1982421875, + "learning_rate": 0.00010955243820975285, + "loss": 1.3139, + "step": 820 + }, + { + "epoch": 0.05511758417958311, + "grad_norm": 0.173828125, + "learning_rate": 0.00011022044088176354, + "loss": 1.1953, + "step": 825 + }, + { + "epoch": 0.05545163014430786, + "grad_norm": 0.19921875, + "learning_rate": 0.00011088844355377421, + "loss": 1.2724, + "step": 830 + }, + { + "epoch": 0.0557856761090326, + "grad_norm": 0.1904296875, + "learning_rate": 0.00011155644622578492, + "loss": 1.3072, + "step": 835 + }, + { + "epoch": 0.05611972207375735, + "grad_norm": 0.2255859375, + "learning_rate": 0.00011222444889779559, + "loss": 1.2644, + "step": 840 + }, + { + "epoch": 0.056453768038482094, + "grad_norm": 0.1904296875, + "learning_rate": 0.00011289245156980628, + "loss": 1.2475, + "step": 845 + }, + { + "epoch": 0.056787814003206843, + "grad_norm": 0.1796875, + "learning_rate": 0.00011356045424181699, + "loss": 1.1897, + "step": 850 + }, + { + "epoch": 0.057121859967931586, + "grad_norm": 0.2080078125, + "learning_rate": 0.00011422845691382766, + "loss": 1.2661, + "step": 855 + }, + { + "epoch": 0.057455905932656336, + "grad_norm": 0.2294921875, + "learning_rate": 0.00011489645958583835, + "loss": 1.2381, + "step": 860 + }, + { + "epoch": 0.05778995189738108, + "grad_norm": 0.173828125, + "learning_rate": 0.00011556446225784903, + "loss": 1.2789, + "step": 865 + }, + { + "epoch": 0.05812399786210583, + "grad_norm": 0.201171875, + "learning_rate": 0.00011623246492985973, + "loss": 1.2097, + "step": 870 + }, + { + "epoch": 0.05845804382683057, + "grad_norm": 0.185546875, + "learning_rate": 0.00011690046760187041, + "loss": 1.2372, + "step": 875 + }, + { + "epoch": 0.05879208979155532, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001175684702738811, + "loss": 1.2229, + "step": 880 + }, + { + "epoch": 0.05912613575628006, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001182364729458918, + "loss": 1.2605, + "step": 885 + }, + { + "epoch": 0.05946018172100481, + "grad_norm": 0.173828125, + "learning_rate": 0.00011890447561790248, + "loss": 1.2223, + "step": 890 + }, + { + "epoch": 0.059794227685729555, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011957247828991315, + "loss": 1.2466, + "step": 895 + }, + { + "epoch": 0.060128273650454304, + "grad_norm": 0.2216796875, + "learning_rate": 0.00012024048096192387, + "loss": 1.2771, + "step": 900 + }, + { + "epoch": 0.06046231961517905, + "grad_norm": 0.2109375, + "learning_rate": 0.00012090848363393454, + "loss": 1.2373, + "step": 905 + }, + { + "epoch": 0.0607963655799038, + "grad_norm": 0.220703125, + "learning_rate": 0.00012157648630594522, + "loss": 1.2916, + "step": 910 + }, + { + "epoch": 0.06113041154462854, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001222444889779559, + "loss": 1.2504, + "step": 915 + }, + { + "epoch": 0.06146445750935329, + "grad_norm": 0.1865234375, + "learning_rate": 0.00012291249164996662, + "loss": 1.2714, + "step": 920 + }, + { + "epoch": 0.06179850347407803, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001235804943219773, + "loss": 1.2904, + "step": 925 + }, + { + "epoch": 0.06213254943880278, + "grad_norm": 0.17578125, + "learning_rate": 0.00012424849699398796, + "loss": 1.254, + "step": 930 + }, + { + "epoch": 0.062466595403527524, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001249164996659987, + "loss": 1.3173, + "step": 935 + }, + { + "epoch": 0.06280064136825227, + "grad_norm": 0.189453125, + "learning_rate": 0.00012558450233800936, + "loss": 1.2083, + "step": 940 + }, + { + "epoch": 0.06313468733297702, + "grad_norm": 0.1845703125, + "learning_rate": 0.00012625250501002003, + "loss": 1.2594, + "step": 945 + }, + { + "epoch": 0.06346873329770177, + "grad_norm": 0.189453125, + "learning_rate": 0.00012692050768203073, + "loss": 1.2027, + "step": 950 + }, + { + "epoch": 0.06380277926242651, + "grad_norm": 0.171875, + "learning_rate": 0.00012758851035404143, + "loss": 1.1886, + "step": 955 + }, + { + "epoch": 0.06413682522715125, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001282565130260521, + "loss": 1.1855, + "step": 960 + }, + { + "epoch": 0.064470871191876, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001289245156980628, + "loss": 1.2326, + "step": 965 + }, + { + "epoch": 0.06480491715660075, + "grad_norm": 0.19140625, + "learning_rate": 0.0001295925183700735, + "loss": 1.2502, + "step": 970 + }, + { + "epoch": 0.0651389631213255, + "grad_norm": 0.18359375, + "learning_rate": 0.00013026052104208417, + "loss": 1.1563, + "step": 975 + }, + { + "epoch": 0.06547300908605024, + "grad_norm": 0.1748046875, + "learning_rate": 0.00013092852371409484, + "loss": 1.2203, + "step": 980 + }, + { + "epoch": 0.06580705505077498, + "grad_norm": 0.19140625, + "learning_rate": 0.00013159652638610557, + "loss": 1.3099, + "step": 985 + }, + { + "epoch": 0.06614110101549973, + "grad_norm": 0.1826171875, + "learning_rate": 0.00013226452905811624, + "loss": 1.2904, + "step": 990 + }, + { + "epoch": 0.06647514698022448, + "grad_norm": 0.197265625, + "learning_rate": 0.0001329325317301269, + "loss": 1.2602, + "step": 995 + }, + { + "epoch": 0.06680919294494922, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013360053440213764, + "loss": 1.1857, + "step": 1000 + }, + { + "epoch": 0.06714323890967397, + "grad_norm": 0.19921875, + "learning_rate": 0.0001342685370741483, + "loss": 1.2519, + "step": 1005 + }, + { + "epoch": 0.06747728487439872, + "grad_norm": 0.17578125, + "learning_rate": 0.00013493653974615898, + "loss": 1.1755, + "step": 1010 + }, + { + "epoch": 0.06781133083912347, + "grad_norm": 0.173828125, + "learning_rate": 0.00013560454241816968, + "loss": 1.2496, + "step": 1015 + }, + { + "epoch": 0.0681453768038482, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013627254509018038, + "loss": 1.2386, + "step": 1020 + }, + { + "epoch": 0.06847942276857295, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013694054776219105, + "loss": 1.2811, + "step": 1025 + }, + { + "epoch": 0.0688134687332977, + "grad_norm": 0.1787109375, + "learning_rate": 0.00013760855043420175, + "loss": 1.2174, + "step": 1030 + }, + { + "epoch": 0.06914751469802245, + "grad_norm": 0.21875, + "learning_rate": 0.00013827655310621244, + "loss": 1.2521, + "step": 1035 + }, + { + "epoch": 0.06948156066274719, + "grad_norm": 0.201171875, + "learning_rate": 0.00013894455577822312, + "loss": 1.2147, + "step": 1040 + }, + { + "epoch": 0.06981560662747194, + "grad_norm": 0.275390625, + "learning_rate": 0.0001396125584502338, + "loss": 1.2223, + "step": 1045 + }, + { + "epoch": 0.07014965259219669, + "grad_norm": 0.177734375, + "learning_rate": 0.0001402805611222445, + "loss": 1.1894, + "step": 1050 + }, + { + "epoch": 0.07048369855692144, + "grad_norm": 0.193359375, + "learning_rate": 0.00014094856379425518, + "loss": 1.2567, + "step": 1055 + }, + { + "epoch": 0.07081774452164617, + "grad_norm": 0.19921875, + "learning_rate": 0.00014161656646626586, + "loss": 1.2306, + "step": 1060 + }, + { + "epoch": 0.07115179048637092, + "grad_norm": 0.1806640625, + "learning_rate": 0.00014228456913827658, + "loss": 1.19, + "step": 1065 + }, + { + "epoch": 0.07148583645109567, + "grad_norm": 0.1796875, + "learning_rate": 0.00014295257181028725, + "loss": 1.2903, + "step": 1070 + }, + { + "epoch": 0.07181988241582042, + "grad_norm": 0.171875, + "learning_rate": 0.00014362057448229792, + "loss": 1.1799, + "step": 1075 + }, + { + "epoch": 0.07215392838054516, + "grad_norm": 0.1748046875, + "learning_rate": 0.00014428857715430862, + "loss": 1.2499, + "step": 1080 + }, + { + "epoch": 0.0724879743452699, + "grad_norm": 0.189453125, + "learning_rate": 0.00014495657982631932, + "loss": 1.256, + "step": 1085 + }, + { + "epoch": 0.07282202030999466, + "grad_norm": 0.193359375, + "learning_rate": 0.00014562458249833, + "loss": 1.3308, + "step": 1090 + }, + { + "epoch": 0.0731560662747194, + "grad_norm": 0.1875, + "learning_rate": 0.0001462925851703407, + "loss": 1.2816, + "step": 1095 + }, + { + "epoch": 0.07349011223944414, + "grad_norm": 0.16796875, + "learning_rate": 0.0001469605878423514, + "loss": 1.2938, + "step": 1100 + }, + { + "epoch": 0.07382415820416889, + "grad_norm": 0.1875, + "learning_rate": 0.00014762859051436206, + "loss": 1.3056, + "step": 1105 + }, + { + "epoch": 0.07415820416889364, + "grad_norm": 0.1533203125, + "learning_rate": 0.00014829659318637273, + "loss": 1.2525, + "step": 1110 + }, + { + "epoch": 0.07449225013361839, + "grad_norm": 0.1923828125, + "learning_rate": 0.00014896459585838346, + "loss": 1.1993, + "step": 1115 + }, + { + "epoch": 0.07482629609834313, + "grad_norm": 0.193359375, + "learning_rate": 0.00014963259853039413, + "loss": 1.2613, + "step": 1120 + }, + { + "epoch": 0.07516034206306788, + "grad_norm": 0.1875, + "learning_rate": 0.0001503006012024048, + "loss": 1.2884, + "step": 1125 + }, + { + "epoch": 0.07549438802779262, + "grad_norm": 0.1875, + "learning_rate": 0.00015096860387441553, + "loss": 1.2162, + "step": 1130 + }, + { + "epoch": 0.07582843399251737, + "grad_norm": 0.158203125, + "learning_rate": 0.0001516366065464262, + "loss": 1.2131, + "step": 1135 + }, + { + "epoch": 0.07616247995724211, + "grad_norm": 0.1728515625, + "learning_rate": 0.00015230460921843687, + "loss": 1.2212, + "step": 1140 + }, + { + "epoch": 0.07649652592196686, + "grad_norm": 0.1640625, + "learning_rate": 0.00015297261189044757, + "loss": 1.1528, + "step": 1145 + }, + { + "epoch": 0.07683057188669161, + "grad_norm": 0.16796875, + "learning_rate": 0.00015364061456245827, + "loss": 1.125, + "step": 1150 + }, + { + "epoch": 0.07716461785141636, + "grad_norm": 0.1943359375, + "learning_rate": 0.00015430861723446894, + "loss": 1.2868, + "step": 1155 + }, + { + "epoch": 0.0774986638161411, + "grad_norm": 0.216796875, + "learning_rate": 0.00015497661990647964, + "loss": 1.2277, + "step": 1160 + }, + { + "epoch": 0.07783270978086584, + "grad_norm": 0.1728515625, + "learning_rate": 0.00015564462257849034, + "loss": 1.2085, + "step": 1165 + }, + { + "epoch": 0.0781667557455906, + "grad_norm": 0.18359375, + "learning_rate": 0.000156312625250501, + "loss": 1.1623, + "step": 1170 + }, + { + "epoch": 0.07850080171031534, + "grad_norm": 0.2138671875, + "learning_rate": 0.00015698062792251168, + "loss": 1.2236, + "step": 1175 + }, + { + "epoch": 0.07883484767504008, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001576486305945224, + "loss": 1.2148, + "step": 1180 + }, + { + "epoch": 0.07916889363976483, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015831663326653308, + "loss": 1.2213, + "step": 1185 + }, + { + "epoch": 0.07950293960448958, + "grad_norm": 0.21875, + "learning_rate": 0.00015898463593854375, + "loss": 1.1879, + "step": 1190 + }, + { + "epoch": 0.07983698556921433, + "grad_norm": 0.16796875, + "learning_rate": 0.00015965263861055445, + "loss": 1.2233, + "step": 1195 + }, + { + "epoch": 0.08017103153393906, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016032064128256515, + "loss": 1.2497, + "step": 1200 + }, + { + "epoch": 0.08050507749866381, + "grad_norm": 0.16015625, + "learning_rate": 0.00016098864395457582, + "loss": 1.1905, + "step": 1205 + }, + { + "epoch": 0.08083912346338856, + "grad_norm": 0.1796875, + "learning_rate": 0.00016165664662658652, + "loss": 1.1979, + "step": 1210 + }, + { + "epoch": 0.08117316942811331, + "grad_norm": 0.1884765625, + "learning_rate": 0.00016232464929859721, + "loss": 1.2394, + "step": 1215 + }, + { + "epoch": 0.08150721539283805, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001629926519706079, + "loss": 1.2193, + "step": 1220 + }, + { + "epoch": 0.0818412613575628, + "grad_norm": 0.185546875, + "learning_rate": 0.00016366065464261859, + "loss": 1.169, + "step": 1225 + }, + { + "epoch": 0.08217530732228755, + "grad_norm": 0.1728515625, + "learning_rate": 0.00016432865731462928, + "loss": 1.2845, + "step": 1230 + }, + { + "epoch": 0.0825093532870123, + "grad_norm": 0.1630859375, + "learning_rate": 0.00016499665998663996, + "loss": 1.298, + "step": 1235 + }, + { + "epoch": 0.08284339925173703, + "grad_norm": 0.16796875, + "learning_rate": 0.00016566466265865063, + "loss": 1.2208, + "step": 1240 + }, + { + "epoch": 0.08317744521646178, + "grad_norm": 0.181640625, + "learning_rate": 0.00016633266533066135, + "loss": 1.2539, + "step": 1245 + }, + { + "epoch": 0.08351149118118653, + "grad_norm": 0.1728515625, + "learning_rate": 0.00016700066800267202, + "loss": 1.2138, + "step": 1250 + }, + { + "epoch": 0.08384553714591128, + "grad_norm": 0.17578125, + "learning_rate": 0.0001676686706746827, + "loss": 1.2358, + "step": 1255 + }, + { + "epoch": 0.08417958311063603, + "grad_norm": 0.177734375, + "learning_rate": 0.0001683366733466934, + "loss": 1.2247, + "step": 1260 + }, + { + "epoch": 0.08451362907536077, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001690046760187041, + "loss": 1.3043, + "step": 1265 + }, + { + "epoch": 0.08484767504008552, + "grad_norm": 0.193359375, + "learning_rate": 0.00016967267869071476, + "loss": 1.1915, + "step": 1270 + }, + { + "epoch": 0.08518172100481027, + "grad_norm": 0.171875, + "learning_rate": 0.00017034068136272546, + "loss": 1.151, + "step": 1275 + }, + { + "epoch": 0.08551576696953501, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017100868403473616, + "loss": 1.2174, + "step": 1280 + }, + { + "epoch": 0.08584981293425975, + "grad_norm": 0.162109375, + "learning_rate": 0.00017167668670674683, + "loss": 1.2237, + "step": 1285 + }, + { + "epoch": 0.0861838588989845, + "grad_norm": 0.171875, + "learning_rate": 0.0001723446893787575, + "loss": 1.2163, + "step": 1290 + }, + { + "epoch": 0.08651790486370925, + "grad_norm": 0.1884765625, + "learning_rate": 0.00017301269205076823, + "loss": 1.3327, + "step": 1295 + }, + { + "epoch": 0.086851950828434, + "grad_norm": 0.173828125, + "learning_rate": 0.0001736806947227789, + "loss": 1.2554, + "step": 1300 + }, + { + "epoch": 0.08718599679315873, + "grad_norm": 0.158203125, + "learning_rate": 0.00017434869739478957, + "loss": 1.2164, + "step": 1305 + }, + { + "epoch": 0.08752004275788348, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017501670006680027, + "loss": 1.1899, + "step": 1310 + }, + { + "epoch": 0.08785408872260823, + "grad_norm": 0.171875, + "learning_rate": 0.00017568470273881097, + "loss": 1.2598, + "step": 1315 + }, + { + "epoch": 0.08818813468733298, + "grad_norm": 0.1748046875, + "learning_rate": 0.00017635270541082164, + "loss": 1.2093, + "step": 1320 + }, + { + "epoch": 0.08852218065205772, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017702070808283234, + "loss": 1.1936, + "step": 1325 + }, + { + "epoch": 0.08885622661678247, + "grad_norm": 0.1845703125, + "learning_rate": 0.00017768871075484304, + "loss": 1.3092, + "step": 1330 + }, + { + "epoch": 0.08919027258150722, + "grad_norm": 0.16796875, + "learning_rate": 0.0001783567134268537, + "loss": 1.2603, + "step": 1335 + }, + { + "epoch": 0.08952431854623197, + "grad_norm": 0.169921875, + "learning_rate": 0.0001790247160988644, + "loss": 1.1828, + "step": 1340 + }, + { + "epoch": 0.0898583645109567, + "grad_norm": 0.19140625, + "learning_rate": 0.0001796927187708751, + "loss": 1.2985, + "step": 1345 + }, + { + "epoch": 0.09019241047568145, + "grad_norm": 0.16796875, + "learning_rate": 0.00018036072144288578, + "loss": 1.1845, + "step": 1350 + }, + { + "epoch": 0.0905264564404062, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018102872411489645, + "loss": 1.1018, + "step": 1355 + }, + { + "epoch": 0.09086050240513095, + "grad_norm": 0.1748046875, + "learning_rate": 0.00018169672678690715, + "loss": 1.1901, + "step": 1360 + }, + { + "epoch": 0.09119454836985569, + "grad_norm": 0.1689453125, + "learning_rate": 0.00018236472945891785, + "loss": 1.2401, + "step": 1365 + }, + { + "epoch": 0.09152859433458044, + "grad_norm": 0.1669921875, + "learning_rate": 0.00018303273213092852, + "loss": 1.2479, + "step": 1370 + }, + { + "epoch": 0.09186264029930519, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018370073480293922, + "loss": 1.1413, + "step": 1375 + }, + { + "epoch": 0.09219668626402994, + "grad_norm": 0.21875, + "learning_rate": 0.00018436873747494992, + "loss": 1.2119, + "step": 1380 + }, + { + "epoch": 0.09253073222875467, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001850367401469606, + "loss": 1.2831, + "step": 1385 + }, + { + "epoch": 0.09286477819347942, + "grad_norm": 0.158203125, + "learning_rate": 0.0001857047428189713, + "loss": 1.2524, + "step": 1390 + }, + { + "epoch": 0.09319882415820417, + "grad_norm": 0.1943359375, + "learning_rate": 0.00018637274549098199, + "loss": 1.2294, + "step": 1395 + }, + { + "epoch": 0.09353287012292892, + "grad_norm": 0.2041015625, + "learning_rate": 0.00018704074816299266, + "loss": 1.1901, + "step": 1400 + }, + { + "epoch": 0.09386691608765366, + "grad_norm": 0.2021484375, + "learning_rate": 0.00018770875083500336, + "loss": 1.2334, + "step": 1405 + }, + { + "epoch": 0.0942009620523784, + "grad_norm": 0.1923828125, + "learning_rate": 0.00018837675350701405, + "loss": 1.2637, + "step": 1410 + }, + { + "epoch": 0.09453500801710316, + "grad_norm": 0.171875, + "learning_rate": 0.00018904475617902473, + "loss": 1.2487, + "step": 1415 + }, + { + "epoch": 0.0948690539818279, + "grad_norm": 0.197265625, + "learning_rate": 0.0001897127588510354, + "loss": 1.2222, + "step": 1420 + }, + { + "epoch": 0.09520309994655264, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001903807615230461, + "loss": 1.2418, + "step": 1425 + }, + { + "epoch": 0.09553714591127739, + "grad_norm": 0.173828125, + "learning_rate": 0.0001910487641950568, + "loss": 1.2197, + "step": 1430 + }, + { + "epoch": 0.09587119187600214, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019171676686706747, + "loss": 1.1918, + "step": 1435 + }, + { + "epoch": 0.09620523784072689, + "grad_norm": 0.173828125, + "learning_rate": 0.00019238476953907816, + "loss": 1.2533, + "step": 1440 + }, + { + "epoch": 0.09653928380545163, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019305277221108886, + "loss": 1.2039, + "step": 1445 + }, + { + "epoch": 0.09687332977017638, + "grad_norm": 0.169921875, + "learning_rate": 0.00019372077488309953, + "loss": 1.218, + "step": 1450 + }, + { + "epoch": 0.09720737573490112, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019438877755511023, + "loss": 1.2209, + "step": 1455 + }, + { + "epoch": 0.09754142169962587, + "grad_norm": 0.173828125, + "learning_rate": 0.00019505678022712093, + "loss": 1.1822, + "step": 1460 + }, + { + "epoch": 0.09787546766435061, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001957247828991316, + "loss": 1.2394, + "step": 1465 + }, + { + "epoch": 0.09820951362907536, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001963927855711423, + "loss": 1.2377, + "step": 1470 + }, + { + "epoch": 0.09854355959380011, + "grad_norm": 0.171875, + "learning_rate": 0.00019706078824315297, + "loss": 1.2419, + "step": 1475 + }, + { + "epoch": 0.09887760555852486, + "grad_norm": 0.17578125, + "learning_rate": 0.00019772879091516367, + "loss": 1.2013, + "step": 1480 + }, + { + "epoch": 0.0992116515232496, + "grad_norm": 0.158203125, + "learning_rate": 0.00019839679358717434, + "loss": 1.2789, + "step": 1485 + }, + { + "epoch": 0.09954569748797434, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019906479625918504, + "loss": 1.2002, + "step": 1490 + }, + { + "epoch": 0.0998797434526991, + "grad_norm": 0.158203125, + "learning_rate": 0.00019973279893119574, + "loss": 1.2699, + "step": 1495 + }, + { + "epoch": 0.10021378941742384, + "grad_norm": 0.16796875, + "learning_rate": 0.00019999997552557096, + "loss": 1.2202, + "step": 1500 + }, + { + "epoch": 0.10054783538214858, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019999982595965903, + "loss": 1.1978, + "step": 1505 + }, + { + "epoch": 0.10088188134687333, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019999954042494334, + "loss": 1.1724, + "step": 1510 + }, + { + "epoch": 0.10121592731159808, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019999911892181214, + "loss": 1.1925, + "step": 1515 + }, + { + "epoch": 0.10154997327632283, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001999985614508385, + "loss": 1.194, + "step": 1520 + }, + { + "epoch": 0.10188401924104756, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001999978680127804, + "loss": 1.2429, + "step": 1525 + }, + { + "epoch": 0.10221806520577231, + "grad_norm": 0.1640625, + "learning_rate": 0.00019999703860858073, + "loss": 1.2303, + "step": 1530 + }, + { + "epoch": 0.10255211117049706, + "grad_norm": 0.15234375, + "learning_rate": 0.0001999960732393672, + "loss": 1.2445, + "step": 1535 + }, + { + "epoch": 0.10288615713522181, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001999949719064525, + "loss": 1.2214, + "step": 1540 + }, + { + "epoch": 0.10322020309994655, + "grad_norm": 0.19140625, + "learning_rate": 0.00019999373461133398, + "loss": 1.3003, + "step": 1545 + }, + { + "epoch": 0.1035542490646713, + "grad_norm": 0.16015625, + "learning_rate": 0.00019999236135569408, + "loss": 1.1941, + "step": 1550 + }, + { + "epoch": 0.10388829502939605, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019999085214139994, + "loss": 1.1791, + "step": 1555 + }, + { + "epoch": 0.1042223409941208, + "grad_norm": 0.16796875, + "learning_rate": 0.00019998920697050364, + "loss": 1.2118, + "step": 1560 + }, + { + "epoch": 0.10455638695884553, + "grad_norm": 0.181640625, + "learning_rate": 0.0001999874258452421, + "loss": 1.252, + "step": 1565 + }, + { + "epoch": 0.10489043292357028, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019998550876803708, + "loss": 1.1598, + "step": 1570 + }, + { + "epoch": 0.10522447888829503, + "grad_norm": 0.169921875, + "learning_rate": 0.00019998345574149526, + "loss": 1.2936, + "step": 1575 + }, + { + "epoch": 0.10555852485301978, + "grad_norm": 0.15625, + "learning_rate": 0.0001999812667684081, + "loss": 1.2732, + "step": 1580 + }, + { + "epoch": 0.10589257081774452, + "grad_norm": 0.169921875, + "learning_rate": 0.0001999789418517519, + "loss": 1.2097, + "step": 1585 + }, + { + "epoch": 0.10622661678246927, + "grad_norm": 0.162109375, + "learning_rate": 0.00019997648099468786, + "loss": 1.3343, + "step": 1590 + }, + { + "epoch": 0.10656066274719402, + "grad_norm": 0.205078125, + "learning_rate": 0.000199973884200562, + "loss": 1.2659, + "step": 1595 + }, + { + "epoch": 0.10689470871191876, + "grad_norm": 0.166015625, + "learning_rate": 0.00019997115147290506, + "loss": 1.2585, + "step": 1600 + }, + { + "epoch": 0.1072287546766435, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001999682828154328, + "loss": 1.2175, + "step": 1605 + }, + { + "epoch": 0.10756280064136825, + "grad_norm": 0.154296875, + "learning_rate": 0.00019996527823204567, + "loss": 1.208, + "step": 1610 + }, + { + "epoch": 0.107896846606093, + "grad_norm": 0.154296875, + "learning_rate": 0.000199962137726829, + "loss": 1.2164, + "step": 1615 + }, + { + "epoch": 0.10823089257081775, + "grad_norm": 0.169921875, + "learning_rate": 0.00019995886130405287, + "loss": 1.2604, + "step": 1620 + }, + { + "epoch": 0.10856493853554249, + "grad_norm": 0.1640625, + "learning_rate": 0.00019995544896817222, + "loss": 1.276, + "step": 1625 + }, + { + "epoch": 0.10889898450026723, + "grad_norm": 0.171875, + "learning_rate": 0.00019995190072382677, + "loss": 1.2216, + "step": 1630 + }, + { + "epoch": 0.10923303046499198, + "grad_norm": 0.1513671875, + "learning_rate": 0.000199948216575841, + "loss": 1.2231, + "step": 1635 + }, + { + "epoch": 0.10956707642971673, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001999443965292243, + "loss": 1.1956, + "step": 1640 + }, + { + "epoch": 0.10990112239444147, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019994044058917063, + "loss": 1.2923, + "step": 1645 + }, + { + "epoch": 0.11023516835916622, + "grad_norm": 0.193359375, + "learning_rate": 0.00019993634876105896, + "loss": 1.2428, + "step": 1650 + }, + { + "epoch": 0.11056921432389097, + "grad_norm": 0.15625, + "learning_rate": 0.0001999321210504528, + "loss": 1.1977, + "step": 1655 + }, + { + "epoch": 0.11090326028861572, + "grad_norm": 0.169921875, + "learning_rate": 0.00019992775746310062, + "loss": 1.201, + "step": 1660 + }, + { + "epoch": 0.11123730625334045, + "grad_norm": 0.162109375, + "learning_rate": 0.00019992325800493547, + "loss": 1.2272, + "step": 1665 + }, + { + "epoch": 0.1115713522180652, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019991862268207527, + "loss": 1.1801, + "step": 1670 + }, + { + "epoch": 0.11190539818278995, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019991385150082265, + "loss": 1.2178, + "step": 1675 + }, + { + "epoch": 0.1122394441475147, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019990894446766485, + "loss": 1.2739, + "step": 1680 + }, + { + "epoch": 0.11257349011223944, + "grad_norm": 0.185546875, + "learning_rate": 0.00019990390158927402, + "loss": 1.2091, + "step": 1685 + }, + { + "epoch": 0.11290753607696419, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019989872287250684, + "loss": 1.1765, + "step": 1690 + }, + { + "epoch": 0.11324158204168894, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019989340832440478, + "loss": 1.2185, + "step": 1695 + }, + { + "epoch": 0.11357562800641369, + "grad_norm": 0.18359375, + "learning_rate": 0.00019988795795219396, + "loss": 1.2335, + "step": 1700 + }, + { + "epoch": 0.11390967397113842, + "grad_norm": 0.158203125, + "learning_rate": 0.00019988237176328527, + "loss": 1.2053, + "step": 1705 + }, + { + "epoch": 0.11424371993586317, + "grad_norm": 0.15625, + "learning_rate": 0.00019987664976527412, + "loss": 1.2055, + "step": 1710 + }, + { + "epoch": 0.11457776590058792, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019987079196594069, + "loss": 1.25, + "step": 1715 + }, + { + "epoch": 0.11491181186531267, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001998647983732498, + "loss": 1.2435, + "step": 1720 + }, + { + "epoch": 0.11524585783003741, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019985866899535079, + "loss": 1.16, + "step": 1725 + }, + { + "epoch": 0.11557990379476216, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001998524038405778, + "loss": 1.2465, + "step": 1730 + }, + { + "epoch": 0.1159139497594869, + "grad_norm": 0.173828125, + "learning_rate": 0.00019984600291744948, + "loss": 1.2855, + "step": 1735 + }, + { + "epoch": 0.11624799572421166, + "grad_norm": 0.166015625, + "learning_rate": 0.0001998394662346691, + "loss": 1.2028, + "step": 1740 + }, + { + "epoch": 0.11658204168893639, + "grad_norm": 0.154296875, + "learning_rate": 0.00019983279380112454, + "loss": 1.2496, + "step": 1745 + }, + { + "epoch": 0.11691608765366114, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019982598562588822, + "loss": 1.2577, + "step": 1750 + }, + { + "epoch": 0.11725013361838589, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019981904171821716, + "loss": 1.2954, + "step": 1755 + }, + { + "epoch": 0.11758417958311064, + "grad_norm": 0.17578125, + "learning_rate": 0.0001998119620875529, + "loss": 1.2704, + "step": 1760 + }, + { + "epoch": 0.11791822554783538, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001998047467435216, + "loss": 1.2034, + "step": 1765 + }, + { + "epoch": 0.11825227151256013, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019979739569593385, + "loss": 1.2341, + "step": 1770 + }, + { + "epoch": 0.11858631747728487, + "grad_norm": 0.173828125, + "learning_rate": 0.00019978990895478483, + "loss": 1.2971, + "step": 1775 + }, + { + "epoch": 0.11892036344200962, + "grad_norm": 0.1640625, + "learning_rate": 0.00019978228653025416, + "loss": 1.2409, + "step": 1780 + }, + { + "epoch": 0.11925440940673436, + "grad_norm": 0.1650390625, + "learning_rate": 0.000199774528432706, + "loss": 1.2282, + "step": 1785 + }, + { + "epoch": 0.11958845537145911, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019976663467268893, + "loss": 1.2715, + "step": 1790 + }, + { + "epoch": 0.11992250133618386, + "grad_norm": 0.162109375, + "learning_rate": 0.00019975860526093604, + "loss": 1.2379, + "step": 1795 + }, + { + "epoch": 0.12025654730090861, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019975044020836485, + "loss": 1.2745, + "step": 1800 + }, + { + "epoch": 0.12059059326563334, + "grad_norm": 0.17578125, + "learning_rate": 0.0001997421395260773, + "loss": 1.2342, + "step": 1805 + }, + { + "epoch": 0.1209246392303581, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019973370322535976, + "loss": 1.2811, + "step": 1810 + }, + { + "epoch": 0.12125868519508284, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019972513131768298, + "loss": 1.2948, + "step": 1815 + }, + { + "epoch": 0.1215927311598076, + "grad_norm": 0.16796875, + "learning_rate": 0.0001997164238147021, + "loss": 1.2323, + "step": 1820 + }, + { + "epoch": 0.12192677712453233, + "grad_norm": 0.162109375, + "learning_rate": 0.00019970758072825658, + "loss": 1.2297, + "step": 1825 + }, + { + "epoch": 0.12226082308925708, + "grad_norm": 0.158203125, + "learning_rate": 0.00019969860207037034, + "loss": 1.1426, + "step": 1830 + }, + { + "epoch": 0.12259486905398183, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019968948785325158, + "loss": 1.3128, + "step": 1835 + }, + { + "epoch": 0.12292891501870658, + "grad_norm": 0.166015625, + "learning_rate": 0.00019968023808929276, + "loss": 1.247, + "step": 1840 + }, + { + "epoch": 0.12326296098343131, + "grad_norm": 0.17578125, + "learning_rate": 0.00019967085279107077, + "loss": 1.1878, + "step": 1845 + }, + { + "epoch": 0.12359700694815606, + "grad_norm": 0.1640625, + "learning_rate": 0.00019966133197134664, + "loss": 1.1013, + "step": 1850 + }, + { + "epoch": 0.12393105291288081, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019965167564306576, + "loss": 1.2437, + "step": 1855 + }, + { + "epoch": 0.12426509887760556, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019964188381935776, + "loss": 1.233, + "step": 1860 + }, + { + "epoch": 0.1245991448423303, + "grad_norm": 0.173828125, + "learning_rate": 0.00019963195651353645, + "loss": 1.2822, + "step": 1865 + }, + { + "epoch": 0.12493319080705505, + "grad_norm": 0.17578125, + "learning_rate": 0.00019962189373909996, + "loss": 1.2954, + "step": 1870 + }, + { + "epoch": 0.1252672367717798, + "grad_norm": 0.15234375, + "learning_rate": 0.0001996116955097305, + "loss": 1.2188, + "step": 1875 + }, + { + "epoch": 0.12560128273650453, + "grad_norm": 0.166015625, + "learning_rate": 0.0001996013618392945, + "loss": 1.2498, + "step": 1880 + }, + { + "epoch": 0.12593532870122928, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019959089274184256, + "loss": 1.1953, + "step": 1885 + }, + { + "epoch": 0.12626937466595403, + "grad_norm": 0.166015625, + "learning_rate": 0.00019958028823160946, + "loss": 1.2453, + "step": 1890 + }, + { + "epoch": 0.12660342063067878, + "grad_norm": 0.15625, + "learning_rate": 0.00019956954832301397, + "loss": 1.1966, + "step": 1895 + }, + { + "epoch": 0.12693746659540353, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001995586730306591, + "loss": 1.2389, + "step": 1900 + }, + { + "epoch": 0.12727151256012828, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019954766236933183, + "loss": 1.2609, + "step": 1905 + }, + { + "epoch": 0.12760555852485303, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019953651635400333, + "loss": 1.2263, + "step": 1910 + }, + { + "epoch": 0.12793960448957778, + "grad_norm": 0.16015625, + "learning_rate": 0.00019952523499982864, + "loss": 1.1892, + "step": 1915 + }, + { + "epoch": 0.1282736504543025, + "grad_norm": 0.16796875, + "learning_rate": 0.00019951381832214698, + "loss": 1.1666, + "step": 1920 + }, + { + "epoch": 0.12860769641902725, + "grad_norm": 0.154296875, + "learning_rate": 0.00019950226633648144, + "loss": 1.2589, + "step": 1925 + }, + { + "epoch": 0.128941742383752, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019949057905853919, + "loss": 1.1863, + "step": 1930 + }, + { + "epoch": 0.12927578834847675, + "grad_norm": 0.150390625, + "learning_rate": 0.00019947875650421127, + "loss": 1.1823, + "step": 1935 + }, + { + "epoch": 0.1296098343132015, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001994667986895727, + "loss": 1.2282, + "step": 1940 + }, + { + "epoch": 0.12994388027792625, + "grad_norm": 0.1640625, + "learning_rate": 0.00019945470563088248, + "loss": 1.1948, + "step": 1945 + }, + { + "epoch": 0.130277926242651, + "grad_norm": 0.166015625, + "learning_rate": 0.00019944247734458333, + "loss": 1.2441, + "step": 1950 + }, + { + "epoch": 0.13061197220737575, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019943011384730198, + "loss": 1.1913, + "step": 1955 + }, + { + "epoch": 0.13094601817210047, + "grad_norm": 0.169921875, + "learning_rate": 0.00019941761515584894, + "loss": 1.2447, + "step": 1960 + }, + { + "epoch": 0.13128006413682522, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019940498128721856, + "loss": 1.2564, + "step": 1965 + }, + { + "epoch": 0.13161411010154997, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019939221225858902, + "loss": 1.193, + "step": 1970 + }, + { + "epoch": 0.13194815606627472, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019937930808732222, + "loss": 1.3114, + "step": 1975 + }, + { + "epoch": 0.13228220203099947, + "grad_norm": 0.193359375, + "learning_rate": 0.00019936626879096383, + "loss": 1.1698, + "step": 1980 + }, + { + "epoch": 0.13261624799572422, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019935309438724326, + "loss": 1.2549, + "step": 1985 + }, + { + "epoch": 0.13295029396044897, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019933978489407366, + "loss": 1.1987, + "step": 1990 + }, + { + "epoch": 0.13328433992517372, + "grad_norm": 0.16015625, + "learning_rate": 0.00019932634032955178, + "loss": 1.1868, + "step": 1995 + }, + { + "epoch": 0.13361838588989844, + "grad_norm": 0.173828125, + "learning_rate": 0.00019931276071195804, + "loss": 1.2345, + "step": 2000 + }, + { + "epoch": 0.1339524318546232, + "grad_norm": 0.15625, + "learning_rate": 0.00019929904605975657, + "loss": 1.2149, + "step": 2005 + }, + { + "epoch": 0.13428647781934794, + "grad_norm": 0.16796875, + "learning_rate": 0.00019928519639159507, + "loss": 1.2072, + "step": 2010 + }, + { + "epoch": 0.1346205237840727, + "grad_norm": 31.375, + "learning_rate": 0.00019927121172630473, + "loss": 1.2617, + "step": 2015 + }, + { + "epoch": 0.13495456974879744, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001992570920829004, + "loss": 1.2062, + "step": 2020 + }, + { + "epoch": 0.1352886157135222, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001992428374805804, + "loss": 1.1609, + "step": 2025 + }, + { + "epoch": 0.13562266167824694, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001992284479387266, + "loss": 1.2839, + "step": 2030 + }, + { + "epoch": 0.13595670764297169, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019921392347690435, + "loss": 1.1998, + "step": 2035 + }, + { + "epoch": 0.1362907536076964, + "grad_norm": 0.162109375, + "learning_rate": 0.0001991992641148624, + "loss": 1.2281, + "step": 2040 + }, + { + "epoch": 0.13662479957242116, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019918446987253287, + "loss": 1.1933, + "step": 2045 + }, + { + "epoch": 0.1369588455371459, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019916954077003147, + "loss": 1.1923, + "step": 2050 + }, + { + "epoch": 0.13729289150187066, + "grad_norm": 0.3359375, + "learning_rate": 0.00019915447682765705, + "loss": 1.1975, + "step": 2055 + }, + { + "epoch": 0.1376269374665954, + "grad_norm": 0.1708984375, + "learning_rate": 0.000199139278065892, + "loss": 1.23, + "step": 2060 + }, + { + "epoch": 0.13796098343132016, + "grad_norm": 0.181640625, + "learning_rate": 0.0001991239445054019, + "loss": 1.2278, + "step": 2065 + }, + { + "epoch": 0.1382950293960449, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001991084761670356, + "loss": 1.1755, + "step": 2070 + }, + { + "epoch": 0.13862907536076965, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019909287307182534, + "loss": 1.1468, + "step": 2075 + }, + { + "epoch": 0.13896312132549438, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019907713524098638, + "loss": 1.2159, + "step": 2080 + }, + { + "epoch": 0.13929716729021913, + "grad_norm": 0.158203125, + "learning_rate": 0.0001990612626959174, + "loss": 1.1975, + "step": 2085 + }, + { + "epoch": 0.13963121325494388, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001990452554582001, + "loss": 1.2577, + "step": 2090 + }, + { + "epoch": 0.13996525921966863, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019902911354959936, + "loss": 1.1977, + "step": 2095 + }, + { + "epoch": 0.14029930518439337, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019901283699206323, + "loss": 1.2434, + "step": 2100 + }, + { + "epoch": 0.14063335114911812, + "grad_norm": 0.201171875, + "learning_rate": 0.00019899642580772274, + "loss": 1.2707, + "step": 2105 + }, + { + "epoch": 0.14096739711384287, + "grad_norm": 0.16796875, + "learning_rate": 0.000198979880018892, + "loss": 1.1902, + "step": 2110 + }, + { + "epoch": 0.14130144307856762, + "grad_norm": 0.16796875, + "learning_rate": 0.00019896319964806823, + "loss": 1.215, + "step": 2115 + }, + { + "epoch": 0.14163548904329235, + "grad_norm": 0.166015625, + "learning_rate": 0.00019894638471793153, + "loss": 1.2495, + "step": 2120 + }, + { + "epoch": 0.1419695350080171, + "grad_norm": 0.158203125, + "learning_rate": 0.000198929435251345, + "loss": 1.2628, + "step": 2125 + }, + { + "epoch": 0.14230358097274184, + "grad_norm": 0.17578125, + "learning_rate": 0.00019891235127135465, + "loss": 1.2156, + "step": 2130 + }, + { + "epoch": 0.1426376269374666, + "grad_norm": 0.1875, + "learning_rate": 0.00019889513280118946, + "loss": 1.1396, + "step": 2135 + }, + { + "epoch": 0.14297167290219134, + "grad_norm": 0.15234375, + "learning_rate": 0.00019887777986426117, + "loss": 1.2356, + "step": 2140 + }, + { + "epoch": 0.1433057188669161, + "grad_norm": 0.1640625, + "learning_rate": 0.00019886029248416441, + "loss": 1.2302, + "step": 2145 + }, + { + "epoch": 0.14363976483164084, + "grad_norm": 0.15625, + "learning_rate": 0.00019884267068467662, + "loss": 1.199, + "step": 2150 + }, + { + "epoch": 0.1439738107963656, + "grad_norm": 0.16015625, + "learning_rate": 0.00019882491448975796, + "loss": 1.3024, + "step": 2155 + }, + { + "epoch": 0.14430785676109031, + "grad_norm": 0.173828125, + "learning_rate": 0.00019880702392355138, + "loss": 1.188, + "step": 2160 + }, + { + "epoch": 0.14464190272581506, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019878899901038254, + "loss": 1.2895, + "step": 2165 + }, + { + "epoch": 0.1449759486905398, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019877083977475968, + "loss": 1.2214, + "step": 2170 + }, + { + "epoch": 0.14530999465526456, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019875254624137376, + "loss": 1.2429, + "step": 2175 + }, + { + "epoch": 0.1456440406199893, + "grad_norm": 0.20703125, + "learning_rate": 0.00019873411843509832, + "loss": 1.2168, + "step": 2180 + }, + { + "epoch": 0.14597808658471406, + "grad_norm": 0.154296875, + "learning_rate": 0.00019871555638098954, + "loss": 1.2398, + "step": 2185 + }, + { + "epoch": 0.1463121325494388, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019869686010428597, + "loss": 1.2839, + "step": 2190 + }, + { + "epoch": 0.14664617851416356, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019867802963040881, + "loss": 1.2139, + "step": 2195 + }, + { + "epoch": 0.14698022447888828, + "grad_norm": 0.169921875, + "learning_rate": 0.00019865906498496162, + "loss": 1.2241, + "step": 2200 + }, + { + "epoch": 0.14731427044361303, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019863996619373054, + "loss": 1.2731, + "step": 2205 + }, + { + "epoch": 0.14764831640833778, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019862073328268394, + "loss": 1.2639, + "step": 2210 + }, + { + "epoch": 0.14798236237306253, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019860136627797262, + "loss": 1.2519, + "step": 2215 + }, + { + "epoch": 0.14831640833778728, + "grad_norm": 0.166015625, + "learning_rate": 0.0001985818652059298, + "loss": 1.2553, + "step": 2220 + }, + { + "epoch": 0.14865045430251203, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001985622300930708, + "loss": 1.2233, + "step": 2225 + }, + { + "epoch": 0.14898450026723678, + "grad_norm": 0.1875, + "learning_rate": 0.0001985424609660933, + "loss": 1.1913, + "step": 2230 + }, + { + "epoch": 0.14931854623196153, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019852255785187724, + "loss": 1.217, + "step": 2235 + }, + { + "epoch": 0.14965259219668625, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019850252077748467, + "loss": 1.2496, + "step": 2240 + }, + { + "epoch": 0.149986638161411, + "grad_norm": 0.17578125, + "learning_rate": 0.00019848234977015984, + "loss": 1.251, + "step": 2245 + }, + { + "epoch": 0.15032068412613575, + "grad_norm": 0.1796875, + "learning_rate": 0.00019846204485732903, + "loss": 1.2931, + "step": 2250 + }, + { + "epoch": 0.1506547300908605, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001984416060666007, + "loss": 1.2315, + "step": 2255 + }, + { + "epoch": 0.15098877605558525, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019842103342576517, + "loss": 1.1459, + "step": 2260 + }, + { + "epoch": 0.15132282202031, + "grad_norm": 0.177734375, + "learning_rate": 0.00019840032696279494, + "loss": 1.2563, + "step": 2265 + }, + { + "epoch": 0.15165686798503475, + "grad_norm": 0.15234375, + "learning_rate": 0.00019837948670584437, + "loss": 1.212, + "step": 2270 + }, + { + "epoch": 0.1519909139497595, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019835851268324982, + "loss": 1.3146, + "step": 2275 + }, + { + "epoch": 0.15232495991448422, + "grad_norm": 0.15625, + "learning_rate": 0.00019833740492352934, + "loss": 1.2306, + "step": 2280 + }, + { + "epoch": 0.15265900587920897, + "grad_norm": 0.162109375, + "learning_rate": 0.00019831616345538305, + "loss": 1.2235, + "step": 2285 + }, + { + "epoch": 0.15299305184393372, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001982947883076927, + "loss": 1.1821, + "step": 2290 + }, + { + "epoch": 0.15332709780865847, + "grad_norm": 0.16796875, + "learning_rate": 0.00019827327950952195, + "loss": 1.2706, + "step": 2295 + }, + { + "epoch": 0.15366114377338322, + "grad_norm": 0.16015625, + "learning_rate": 0.00019825163709011605, + "loss": 1.1829, + "step": 2300 + }, + { + "epoch": 0.15399518973810797, + "grad_norm": 0.166015625, + "learning_rate": 0.000198229861078902, + "loss": 1.194, + "step": 2305 + }, + { + "epoch": 0.15432923570283272, + "grad_norm": 0.15625, + "learning_rate": 0.00019820795150548846, + "loss": 1.2001, + "step": 2310 + }, + { + "epoch": 0.15466328166755747, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019818590839966563, + "loss": 1.1742, + "step": 2315 + }, + { + "epoch": 0.1549973276322822, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019816373179140534, + "loss": 1.2324, + "step": 2320 + }, + { + "epoch": 0.15533137359700694, + "grad_norm": 0.166015625, + "learning_rate": 0.00019814142171086088, + "loss": 1.1786, + "step": 2325 + }, + { + "epoch": 0.1556654195617317, + "grad_norm": 0.154296875, + "learning_rate": 0.0001981189781883671, + "loss": 1.2989, + "step": 2330 + }, + { + "epoch": 0.15599946552645644, + "grad_norm": 0.162109375, + "learning_rate": 0.00019809640125444016, + "loss": 1.2714, + "step": 2335 + }, + { + "epoch": 0.1563335114911812, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019807369093977778, + "loss": 1.2568, + "step": 2340 + }, + { + "epoch": 0.15666755745590594, + "grad_norm": 0.154296875, + "learning_rate": 0.00019805084727525895, + "loss": 1.2131, + "step": 2345 + }, + { + "epoch": 0.1570016034206307, + "grad_norm": 0.173828125, + "learning_rate": 0.00019802787029194393, + "loss": 1.1464, + "step": 2350 + }, + { + "epoch": 0.15733564938535544, + "grad_norm": 0.2265625, + "learning_rate": 0.00019800476002107437, + "loss": 1.2525, + "step": 2355 + }, + { + "epoch": 0.15766969535008016, + "grad_norm": 0.1748046875, + "learning_rate": 0.000197981516494073, + "loss": 1.2007, + "step": 2360 + }, + { + "epoch": 0.1580037413148049, + "grad_norm": 0.171875, + "learning_rate": 0.0001979581397425439, + "loss": 1.2535, + "step": 2365 + }, + { + "epoch": 0.15833778727952966, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001979346297982722, + "loss": 1.235, + "step": 2370 + }, + { + "epoch": 0.1586718332442544, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001979109866932241, + "loss": 1.162, + "step": 2375 + }, + { + "epoch": 0.15900587920897916, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019788721045954692, + "loss": 1.2224, + "step": 2380 + }, + { + "epoch": 0.1593399251737039, + "grad_norm": 0.166015625, + "learning_rate": 0.000197863301129569, + "loss": 1.3007, + "step": 2385 + }, + { + "epoch": 0.15967397113842866, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019783925873579966, + "loss": 1.2242, + "step": 2390 + }, + { + "epoch": 0.1600080171031534, + "grad_norm": 0.1689453125, + "learning_rate": 0.000197815083310929, + "loss": 1.2477, + "step": 2395 + }, + { + "epoch": 0.16034206306787813, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019779077488782824, + "loss": 1.1975, + "step": 2400 + }, + { + "epoch": 0.16067610903260288, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001977663334995492, + "loss": 1.2646, + "step": 2405 + }, + { + "epoch": 0.16101015499732763, + "grad_norm": 0.1640625, + "learning_rate": 0.0001977417591793247, + "loss": 1.2167, + "step": 2410 + }, + { + "epoch": 0.16134420096205238, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019771705196056812, + "loss": 1.2048, + "step": 2415 + }, + { + "epoch": 0.16167824692677712, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019769221187687368, + "loss": 1.183, + "step": 2420 + }, + { + "epoch": 0.16201229289150187, + "grad_norm": 0.173828125, + "learning_rate": 0.0001976672389620162, + "loss": 1.281, + "step": 2425 + }, + { + "epoch": 0.16234633885622662, + "grad_norm": 0.158203125, + "learning_rate": 0.0001976421332499511, + "loss": 1.2045, + "step": 2430 + }, + { + "epoch": 0.16268038482095137, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019761689477481434, + "loss": 1.243, + "step": 2435 + }, + { + "epoch": 0.1630144307856761, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001975915235709225, + "loss": 1.1954, + "step": 2440 + }, + { + "epoch": 0.16334847675040085, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019756601967277256, + "loss": 1.1924, + "step": 2445 + }, + { + "epoch": 0.1636825227151256, + "grad_norm": 0.166015625, + "learning_rate": 0.00019754038311504187, + "loss": 1.1605, + "step": 2450 + }, + { + "epoch": 0.16401656867985034, + "grad_norm": 0.154296875, + "learning_rate": 0.00019751461393258829, + "loss": 1.3125, + "step": 2455 + }, + { + "epoch": 0.1643506146445751, + "grad_norm": 0.1796875, + "learning_rate": 0.00019748871216044984, + "loss": 1.2302, + "step": 2460 + }, + { + "epoch": 0.16468466060929984, + "grad_norm": 0.181640625, + "learning_rate": 0.00019746267783384496, + "loss": 1.2435, + "step": 2465 + }, + { + "epoch": 0.1650187065740246, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019743651098817227, + "loss": 1.1917, + "step": 2470 + }, + { + "epoch": 0.16535275253874934, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019741021165901054, + "loss": 1.2182, + "step": 2475 + }, + { + "epoch": 0.16568679850347406, + "grad_norm": 0.16796875, + "learning_rate": 0.00019738377988211877, + "loss": 1.2176, + "step": 2480 + }, + { + "epoch": 0.1660208444681988, + "grad_norm": 0.1884765625, + "learning_rate": 0.000197357215693436, + "loss": 1.2962, + "step": 2485 + }, + { + "epoch": 0.16635489043292356, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019733051912908126, + "loss": 1.2024, + "step": 2490 + }, + { + "epoch": 0.1666889363976483, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019730369022535362, + "loss": 1.1819, + "step": 2495 + }, + { + "epoch": 0.16702298236237306, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001972767290187321, + "loss": 1.1487, + "step": 2500 + }, + { + "epoch": 0.1673570283270978, + "grad_norm": 0.1796875, + "learning_rate": 0.0001972496355458756, + "loss": 1.2591, + "step": 2505 + }, + { + "epoch": 0.16769107429182256, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019722240984362284, + "loss": 1.2167, + "step": 2510 + }, + { + "epoch": 0.1680251202565473, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019719505194899233, + "loss": 1.2727, + "step": 2515 + }, + { + "epoch": 0.16835916622127206, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019716756189918234, + "loss": 1.2353, + "step": 2520 + }, + { + "epoch": 0.16869321218599678, + "grad_norm": 0.16015625, + "learning_rate": 0.0001971399397315709, + "loss": 1.1215, + "step": 2525 + }, + { + "epoch": 0.16902725815072153, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019711218548371546, + "loss": 1.2287, + "step": 2530 + }, + { + "epoch": 0.16936130411544628, + "grad_norm": 0.162109375, + "learning_rate": 0.00019708429919335335, + "loss": 1.2297, + "step": 2535 + }, + { + "epoch": 0.16969535008017103, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019705628089840122, + "loss": 1.252, + "step": 2540 + }, + { + "epoch": 0.17002939604489578, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001970281306369553, + "loss": 1.2608, + "step": 2545 + }, + { + "epoch": 0.17036344200962053, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001969998484472912, + "loss": 1.1916, + "step": 2550 + }, + { + "epoch": 0.17069748797434528, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019697143436786397, + "loss": 1.1868, + "step": 2555 + }, + { + "epoch": 0.17103153393907003, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019694288843730796, + "loss": 1.1567, + "step": 2560 + }, + { + "epoch": 0.17136557990379475, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001969142106944368, + "loss": 1.2043, + "step": 2565 + }, + { + "epoch": 0.1716996258685195, + "grad_norm": 0.15625, + "learning_rate": 0.00019688540117824332, + "loss": 1.1954, + "step": 2570 + }, + { + "epoch": 0.17203367183324425, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019685645992789956, + "loss": 1.1889, + "step": 2575 + }, + { + "epoch": 0.172367717797969, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019682738698275663, + "loss": 1.1758, + "step": 2580 + }, + { + "epoch": 0.17270176376269375, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001967981823823448, + "loss": 1.2197, + "step": 2585 + }, + { + "epoch": 0.1730358097274185, + "grad_norm": 0.154296875, + "learning_rate": 0.0001967688461663732, + "loss": 1.1922, + "step": 2590 + }, + { + "epoch": 0.17336985569214325, + "grad_norm": 0.1640625, + "learning_rate": 0.0001967393783747301, + "loss": 1.2168, + "step": 2595 + }, + { + "epoch": 0.173703901656868, + "grad_norm": 0.19140625, + "learning_rate": 0.00019670977904748252, + "loss": 1.2387, + "step": 2600 + }, + { + "epoch": 0.17403794762159272, + "grad_norm": 0.158203125, + "learning_rate": 0.00019668004822487634, + "loss": 1.186, + "step": 2605 + }, + { + "epoch": 0.17437199358631747, + "grad_norm": 0.1640625, + "learning_rate": 0.00019665018594733634, + "loss": 1.2346, + "step": 2610 + }, + { + "epoch": 0.17470603955104222, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019662019225546594, + "loss": 1.2375, + "step": 2615 + }, + { + "epoch": 0.17504008551576697, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019659006719004727, + "loss": 1.221, + "step": 2620 + }, + { + "epoch": 0.17537413148049172, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019655981079204113, + "loss": 1.1965, + "step": 2625 + }, + { + "epoch": 0.17570817744521647, + "grad_norm": 0.15625, + "learning_rate": 0.0001965294231025868, + "loss": 1.1957, + "step": 2630 + }, + { + "epoch": 0.17604222340994122, + "grad_norm": 0.1640625, + "learning_rate": 0.00019649890416300217, + "loss": 1.2214, + "step": 2635 + }, + { + "epoch": 0.17637626937466597, + "grad_norm": 0.166015625, + "learning_rate": 0.00019646825401478356, + "loss": 1.2766, + "step": 2640 + }, + { + "epoch": 0.1767103153393907, + "grad_norm": 0.16796875, + "learning_rate": 0.00019643747269960566, + "loss": 1.241, + "step": 2645 + }, + { + "epoch": 0.17704436130411544, + "grad_norm": 0.154296875, + "learning_rate": 0.0001964065602593215, + "loss": 1.1926, + "step": 2650 + }, + { + "epoch": 0.1773784072688402, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001963755167359625, + "loss": 1.2595, + "step": 2655 + }, + { + "epoch": 0.17771245323356494, + "grad_norm": 0.169921875, + "learning_rate": 0.00019634434217173817, + "loss": 1.2557, + "step": 2660 + }, + { + "epoch": 0.1780464991982897, + "grad_norm": 0.16796875, + "learning_rate": 0.0001963130366090363, + "loss": 1.2221, + "step": 2665 + }, + { + "epoch": 0.17838054516301444, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019628160009042275, + "loss": 1.2297, + "step": 2670 + }, + { + "epoch": 0.1787145911277392, + "grad_norm": 0.17578125, + "learning_rate": 0.00019625003265864147, + "loss": 1.2172, + "step": 2675 + }, + { + "epoch": 0.17904863709246394, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019621833435661437, + "loss": 1.2086, + "step": 2680 + }, + { + "epoch": 0.17938268305718866, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019618650522744137, + "loss": 1.1887, + "step": 2685 + }, + { + "epoch": 0.1797167290219134, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019615454531440017, + "loss": 1.2667, + "step": 2690 + }, + { + "epoch": 0.18005077498663816, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019612245466094641, + "loss": 1.2165, + "step": 2695 + }, + { + "epoch": 0.1803848209513629, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019609023331071344, + "loss": 1.2485, + "step": 2700 + }, + { + "epoch": 0.18071886691608766, + "grad_norm": 0.171875, + "learning_rate": 0.0001960578813075123, + "loss": 1.2353, + "step": 2705 + }, + { + "epoch": 0.1810529128808124, + "grad_norm": 0.169921875, + "learning_rate": 0.00019602539869533167, + "loss": 1.2373, + "step": 2710 + }, + { + "epoch": 0.18138695884553716, + "grad_norm": 0.166015625, + "learning_rate": 0.00019599278551833788, + "loss": 1.1801, + "step": 2715 + }, + { + "epoch": 0.1817210048102619, + "grad_norm": 0.216796875, + "learning_rate": 0.00019596004182087477, + "loss": 1.2516, + "step": 2720 + }, + { + "epoch": 0.18205505077498663, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019592716764746363, + "loss": 1.2321, + "step": 2725 + }, + { + "epoch": 0.18238909673971138, + "grad_norm": 0.193359375, + "learning_rate": 0.00019589416304280314, + "loss": 1.2326, + "step": 2730 + }, + { + "epoch": 0.18272314270443613, + "grad_norm": 0.169921875, + "learning_rate": 0.00019586102805176932, + "loss": 1.2251, + "step": 2735 + }, + { + "epoch": 0.18305718866916088, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019582776271941557, + "loss": 1.1592, + "step": 2740 + }, + { + "epoch": 0.18339123463388562, + "grad_norm": 0.15625, + "learning_rate": 0.00019579436709097237, + "loss": 1.2147, + "step": 2745 + }, + { + "epoch": 0.18372528059861037, + "grad_norm": 0.1640625, + "learning_rate": 0.00019576084121184745, + "loss": 1.1742, + "step": 2750 + }, + { + "epoch": 0.18405932656333512, + "grad_norm": 0.166015625, + "learning_rate": 0.00019572718512762566, + "loss": 1.1736, + "step": 2755 + }, + { + "epoch": 0.18439337252805987, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019569339888406883, + "loss": 1.1304, + "step": 2760 + }, + { + "epoch": 0.1847274184927846, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001956594825271158, + "loss": 1.2708, + "step": 2765 + }, + { + "epoch": 0.18506146445750934, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019562543610288232, + "loss": 1.1739, + "step": 2770 + }, + { + "epoch": 0.1853955104222341, + "grad_norm": 0.16015625, + "learning_rate": 0.00019559125965766096, + "loss": 1.2504, + "step": 2775 + }, + { + "epoch": 0.18572955638695884, + "grad_norm": 0.15625, + "learning_rate": 0.0001955569532379211, + "loss": 1.1925, + "step": 2780 + }, + { + "epoch": 0.1860636023516836, + "grad_norm": 0.166015625, + "learning_rate": 0.00019552251689030893, + "loss": 1.2344, + "step": 2785 + }, + { + "epoch": 0.18639764831640834, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019548795066164713, + "loss": 1.1737, + "step": 2790 + }, + { + "epoch": 0.1867316942811331, + "grad_norm": 0.17578125, + "learning_rate": 0.00019545325459893512, + "loss": 1.2059, + "step": 2795 + }, + { + "epoch": 0.18706574024585784, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001954184287493488, + "loss": 1.1795, + "step": 2800 + }, + { + "epoch": 0.18739978621058256, + "grad_norm": 0.16796875, + "learning_rate": 0.00019538347316024052, + "loss": 1.266, + "step": 2805 + }, + { + "epoch": 0.1877338321753073, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019534838787913902, + "loss": 1.2198, + "step": 2810 + }, + { + "epoch": 0.18806787814003206, + "grad_norm": 0.173828125, + "learning_rate": 0.0001953131729537495, + "loss": 1.2272, + "step": 2815 + }, + { + "epoch": 0.1884019241047568, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019527782843195335, + "loss": 1.1629, + "step": 2820 + }, + { + "epoch": 0.18873597006948156, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019524235436180814, + "loss": 1.2099, + "step": 2825 + }, + { + "epoch": 0.1890700160342063, + "grad_norm": 0.16015625, + "learning_rate": 0.00019520675079154763, + "loss": 1.2457, + "step": 2830 + }, + { + "epoch": 0.18940406199893106, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019517101776958166, + "loss": 1.1824, + "step": 2835 + }, + { + "epoch": 0.1897381079636558, + "grad_norm": 0.158203125, + "learning_rate": 0.00019513515534449606, + "loss": 1.2036, + "step": 2840 + }, + { + "epoch": 0.19007215392838053, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019509916356505268, + "loss": 1.1686, + "step": 2845 + }, + { + "epoch": 0.19040619989310528, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001950630424801891, + "loss": 1.1715, + "step": 2850 + }, + { + "epoch": 0.19074024585783003, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019502679213901893, + "loss": 1.3139, + "step": 2855 + }, + { + "epoch": 0.19107429182255478, + "grad_norm": 0.169921875, + "learning_rate": 0.00019499041259083132, + "loss": 1.1451, + "step": 2860 + }, + { + "epoch": 0.19140833778727953, + "grad_norm": 0.1640625, + "learning_rate": 0.00019495390388509122, + "loss": 1.2861, + "step": 2865 + }, + { + "epoch": 0.19174238375200428, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019491726607143918, + "loss": 1.2522, + "step": 2870 + }, + { + "epoch": 0.19207642971672903, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019488049919969127, + "loss": 1.2232, + "step": 2875 + }, + { + "epoch": 0.19241047568145378, + "grad_norm": 0.16796875, + "learning_rate": 0.00019484360331983907, + "loss": 1.265, + "step": 2880 + }, + { + "epoch": 0.1927445216461785, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019480657848204954, + "loss": 1.1885, + "step": 2885 + }, + { + "epoch": 0.19307856761090325, + "grad_norm": 0.17578125, + "learning_rate": 0.00019476942473666497, + "loss": 1.2748, + "step": 2890 + }, + { + "epoch": 0.193412613575628, + "grad_norm": 0.17578125, + "learning_rate": 0.000194732142134203, + "loss": 1.223, + "step": 2895 + }, + { + "epoch": 0.19374665954035275, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019469473072535642, + "loss": 1.1482, + "step": 2900 + }, + { + "epoch": 0.1940807055050775, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001946571905609931, + "loss": 1.1468, + "step": 2905 + }, + { + "epoch": 0.19441475146980225, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019461952169215615, + "loss": 1.334, + "step": 2910 + }, + { + "epoch": 0.194748797434527, + "grad_norm": 0.1640625, + "learning_rate": 0.00019458172417006347, + "loss": 1.2264, + "step": 2915 + }, + { + "epoch": 0.19508284339925175, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019454379804610805, + "loss": 1.2272, + "step": 2920 + }, + { + "epoch": 0.19541688936397647, + "grad_norm": 0.1796875, + "learning_rate": 0.00019450574337185765, + "loss": 1.235, + "step": 2925 + }, + { + "epoch": 0.19575093532870122, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019446756019905482, + "loss": 1.2242, + "step": 2930 + }, + { + "epoch": 0.19608498129342597, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019442924857961694, + "loss": 1.2805, + "step": 2935 + }, + { + "epoch": 0.19641902725815072, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019439080856563585, + "loss": 1.2176, + "step": 2940 + }, + { + "epoch": 0.19675307322287547, + "grad_norm": 0.173828125, + "learning_rate": 0.00019435224020937812, + "loss": 1.1892, + "step": 2945 + }, + { + "epoch": 0.19708711918760022, + "grad_norm": 0.177734375, + "learning_rate": 0.0001943135435632848, + "loss": 1.3424, + "step": 2950 + }, + { + "epoch": 0.19742116515232497, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019427471867997128, + "loss": 1.2189, + "step": 2955 + }, + { + "epoch": 0.19775521111704972, + "grad_norm": 0.162109375, + "learning_rate": 0.00019423576561222744, + "loss": 1.2175, + "step": 2960 + }, + { + "epoch": 0.19808925708177444, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019419668441301733, + "loss": 1.2744, + "step": 2965 + }, + { + "epoch": 0.1984233030464992, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019415747513547936, + "loss": 1.2161, + "step": 2970 + }, + { + "epoch": 0.19875734901122394, + "grad_norm": 0.16015625, + "learning_rate": 0.00019411813783292594, + "loss": 1.1341, + "step": 2975 + }, + { + "epoch": 0.1990913949759487, + "grad_norm": 0.154296875, + "learning_rate": 0.00019407867255884367, + "loss": 1.2168, + "step": 2980 + }, + { + "epoch": 0.19942544094067344, + "grad_norm": 0.166015625, + "learning_rate": 0.0001940390793668931, + "loss": 1.2508, + "step": 2985 + }, + { + "epoch": 0.1997594869053982, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019399935831090868, + "loss": 1.2179, + "step": 2990 + }, + { + "epoch": 0.20009353287012294, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019395950944489876, + "loss": 1.292, + "step": 2995 + }, + { + "epoch": 0.20042757883484769, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001939195328230455, + "loss": 1.2248, + "step": 3000 + }, + { + "epoch": 0.2007616247995724, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019387942849970465, + "loss": 1.2645, + "step": 3005 + }, + { + "epoch": 0.20109567076429716, + "grad_norm": 0.16796875, + "learning_rate": 0.0001938391965294058, + "loss": 1.1764, + "step": 3010 + }, + { + "epoch": 0.2014297167290219, + "grad_norm": 0.16796875, + "learning_rate": 0.00019379883696685183, + "loss": 1.1976, + "step": 3015 + }, + { + "epoch": 0.20176376269374666, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019375834986691933, + "loss": 1.2362, + "step": 3020 + }, + { + "epoch": 0.2020978086584714, + "grad_norm": 0.16796875, + "learning_rate": 0.0001937177352846582, + "loss": 1.2767, + "step": 3025 + }, + { + "epoch": 0.20243185462319616, + "grad_norm": 0.166015625, + "learning_rate": 0.0001936769932752917, + "loss": 1.2597, + "step": 3030 + }, + { + "epoch": 0.2027659005879209, + "grad_norm": 0.150390625, + "learning_rate": 0.00019363612389421638, + "loss": 1.2247, + "step": 3035 + }, + { + "epoch": 0.20309994655264565, + "grad_norm": 0.166015625, + "learning_rate": 0.00019359512719700192, + "loss": 1.1646, + "step": 3040 + }, + { + "epoch": 0.20343399251737038, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019355400323939112, + "loss": 1.1893, + "step": 3045 + }, + { + "epoch": 0.20376803848209513, + "grad_norm": 0.2373046875, + "learning_rate": 0.00019351275207729984, + "loss": 1.2971, + "step": 3050 + }, + { + "epoch": 0.20410208444681988, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001934713737668169, + "loss": 1.2324, + "step": 3055 + }, + { + "epoch": 0.20443613041154463, + "grad_norm": 0.1845703125, + "learning_rate": 0.000193429868364204, + "loss": 1.1936, + "step": 3060 + }, + { + "epoch": 0.20477017637626937, + "grad_norm": 0.158203125, + "learning_rate": 0.0001933882359258956, + "loss": 1.2243, + "step": 3065 + }, + { + "epoch": 0.20510422234099412, + "grad_norm": 0.146484375, + "learning_rate": 0.00019334647650849897, + "loss": 1.1963, + "step": 3070 + }, + { + "epoch": 0.20543826830571887, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019330459016879395, + "loss": 1.2023, + "step": 3075 + }, + { + "epoch": 0.20577231427044362, + "grad_norm": 0.166015625, + "learning_rate": 0.00019326257696373304, + "loss": 1.2018, + "step": 3080 + }, + { + "epoch": 0.20610636023516835, + "grad_norm": 0.173828125, + "learning_rate": 0.00019322043695044116, + "loss": 1.1572, + "step": 3085 + }, + { + "epoch": 0.2064404061998931, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001931781701862157, + "loss": 1.2123, + "step": 3090 + }, + { + "epoch": 0.20677445216461784, + "grad_norm": 0.15234375, + "learning_rate": 0.00019313577672852632, + "loss": 1.2391, + "step": 3095 + }, + { + "epoch": 0.2071084981293426, + "grad_norm": 0.166015625, + "learning_rate": 0.00019309325663501508, + "loss": 1.2305, + "step": 3100 + }, + { + "epoch": 0.20744254409406734, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019305060996349606, + "loss": 1.1387, + "step": 3105 + }, + { + "epoch": 0.2077765900587921, + "grad_norm": 0.166015625, + "learning_rate": 0.00019300783677195563, + "loss": 1.2658, + "step": 3110 + }, + { + "epoch": 0.20811063602351684, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019296493711855198, + "loss": 1.2224, + "step": 3115 + }, + { + "epoch": 0.2084446819882416, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019292191106161542, + "loss": 1.1874, + "step": 3120 + }, + { + "epoch": 0.20877872795296634, + "grad_norm": 0.166015625, + "learning_rate": 0.00019287875865964808, + "loss": 1.1604, + "step": 3125 + }, + { + "epoch": 0.20911277391769106, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019283547997132381, + "loss": 1.2352, + "step": 3130 + }, + { + "epoch": 0.2094468198824158, + "grad_norm": 0.15234375, + "learning_rate": 0.00019279207505548825, + "loss": 1.2506, + "step": 3135 + }, + { + "epoch": 0.20978086584714056, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019274854397115866, + "loss": 1.2556, + "step": 3140 + }, + { + "epoch": 0.2101149118118653, + "grad_norm": 0.18359375, + "learning_rate": 0.00019270488677752387, + "loss": 1.2294, + "step": 3145 + }, + { + "epoch": 0.21044895777659006, + "grad_norm": 0.29296875, + "learning_rate": 0.0001926611035339441, + "loss": 1.1891, + "step": 3150 + }, + { + "epoch": 0.2107830037413148, + "grad_norm": 0.16015625, + "learning_rate": 0.00019261719429995098, + "loss": 1.2341, + "step": 3155 + }, + { + "epoch": 0.21111704970603956, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019257315913524754, + "loss": 1.2232, + "step": 3160 + }, + { + "epoch": 0.2114510956707643, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019252899809970794, + "loss": 1.3083, + "step": 3165 + }, + { + "epoch": 0.21178514163548903, + "grad_norm": 0.181640625, + "learning_rate": 0.00019248471125337752, + "loss": 1.2538, + "step": 3170 + }, + { + "epoch": 0.21211918760021378, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019244029865647267, + "loss": 1.2017, + "step": 3175 + }, + { + "epoch": 0.21245323356493853, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019239576036938078, + "loss": 1.2138, + "step": 3180 + }, + { + "epoch": 0.21278727952966328, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019235109645266015, + "loss": 1.1828, + "step": 3185 + }, + { + "epoch": 0.21312132549438803, + "grad_norm": 0.158203125, + "learning_rate": 0.00019230630696703984, + "loss": 1.208, + "step": 3190 + }, + { + "epoch": 0.21345537145911278, + "grad_norm": 0.1640625, + "learning_rate": 0.0001922613919734197, + "loss": 1.141, + "step": 3195 + }, + { + "epoch": 0.21378941742383753, + "grad_norm": 0.15625, + "learning_rate": 0.00019221635153287024, + "loss": 1.2047, + "step": 3200 + }, + { + "epoch": 0.21412346338856228, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019217118570663249, + "loss": 1.2994, + "step": 3205 + }, + { + "epoch": 0.214457509353287, + "grad_norm": 0.158203125, + "learning_rate": 0.000192125894556118, + "loss": 1.1787, + "step": 3210 + }, + { + "epoch": 0.21479155531801175, + "grad_norm": 0.15625, + "learning_rate": 0.0001920804781429087, + "loss": 1.2072, + "step": 3215 + }, + { + "epoch": 0.2151256012827365, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019203493652875686, + "loss": 1.1978, + "step": 3220 + }, + { + "epoch": 0.21545964724746125, + "grad_norm": 0.154296875, + "learning_rate": 0.00019198926977558495, + "loss": 1.1938, + "step": 3225 + }, + { + "epoch": 0.215793693212186, + "grad_norm": 0.1875, + "learning_rate": 0.00019194347794548565, + "loss": 1.253, + "step": 3230 + }, + { + "epoch": 0.21612773917691075, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001918975611007217, + "loss": 1.198, + "step": 3235 + }, + { + "epoch": 0.2164617851416355, + "grad_norm": 0.17578125, + "learning_rate": 0.00019185151930372574, + "loss": 1.2049, + "step": 3240 + }, + { + "epoch": 0.21679583110636025, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019180535261710037, + "loss": 1.2952, + "step": 3245 + }, + { + "epoch": 0.21712987707108497, + "grad_norm": 0.2001953125, + "learning_rate": 0.000191759061103618, + "loss": 1.1782, + "step": 3250 + }, + { + "epoch": 0.21746392303580972, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019171264482622076, + "loss": 1.1968, + "step": 3255 + }, + { + "epoch": 0.21779796900053447, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019166610384802038, + "loss": 1.2566, + "step": 3260 + }, + { + "epoch": 0.21813201496525922, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019161943823229824, + "loss": 1.2565, + "step": 3265 + }, + { + "epoch": 0.21846606092998397, + "grad_norm": 0.15625, + "learning_rate": 0.00019157264804250506, + "loss": 1.1353, + "step": 3270 + }, + { + "epoch": 0.21880010689470872, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019152573334226114, + "loss": 1.2901, + "step": 3275 + }, + { + "epoch": 0.21913415285943347, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019147869419535577, + "loss": 1.1798, + "step": 3280 + }, + { + "epoch": 0.21946819882415822, + "grad_norm": 0.17578125, + "learning_rate": 0.0001914315306657478, + "loss": 1.2607, + "step": 3285 + }, + { + "epoch": 0.21980224478888294, + "grad_norm": 0.1640625, + "learning_rate": 0.0001913842428175649, + "loss": 1.1978, + "step": 3290 + }, + { + "epoch": 0.2201362907536077, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019133683071510395, + "loss": 1.1645, + "step": 3295 + }, + { + "epoch": 0.22047033671833244, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019128929442283074, + "loss": 1.1069, + "step": 3300 + }, + { + "epoch": 0.2208043826830572, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001912416340053799, + "loss": 1.279, + "step": 3305 + }, + { + "epoch": 0.22113842864778194, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019119384952755484, + "loss": 1.2085, + "step": 3310 + }, + { + "epoch": 0.2214724746125067, + "grad_norm": 0.173828125, + "learning_rate": 0.00019114594105432766, + "loss": 1.1412, + "step": 3315 + }, + { + "epoch": 0.22180652057723144, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019109790865083905, + "loss": 1.1815, + "step": 3320 + }, + { + "epoch": 0.22214056654195619, + "grad_norm": 0.1796875, + "learning_rate": 0.00019104975238239818, + "loss": 1.238, + "step": 3325 + }, + { + "epoch": 0.2224746125066809, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019100147231448274, + "loss": 1.2348, + "step": 3330 + }, + { + "epoch": 0.22280865847140566, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001909530685127386, + "loss": 1.2156, + "step": 3335 + }, + { + "epoch": 0.2231427044361304, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019090454104298004, + "loss": 1.1559, + "step": 3340 + }, + { + "epoch": 0.22347675040085516, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019085588997118927, + "loss": 1.1645, + "step": 3345 + }, + { + "epoch": 0.2238107963655799, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019080711536351676, + "loss": 1.2397, + "step": 3350 + }, + { + "epoch": 0.22414484233030466, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019075821728628087, + "loss": 1.2215, + "step": 3355 + }, + { + "epoch": 0.2244788882950294, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019070919580596783, + "loss": 1.242, + "step": 3360 + }, + { + "epoch": 0.22481293425975415, + "grad_norm": 0.19921875, + "learning_rate": 0.00019066005098923168, + "loss": 1.2694, + "step": 3365 + }, + { + "epoch": 0.22514698022447888, + "grad_norm": 0.15625, + "learning_rate": 0.00019061078290289415, + "loss": 1.2075, + "step": 3370 + }, + { + "epoch": 0.22548102618920363, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019056139161394462, + "loss": 1.2271, + "step": 3375 + }, + { + "epoch": 0.22581507215392838, + "grad_norm": 0.177734375, + "learning_rate": 0.00019051187718953992, + "loss": 1.2696, + "step": 3380 + }, + { + "epoch": 0.22614911811865313, + "grad_norm": 0.166015625, + "learning_rate": 0.00019046223969700434, + "loss": 1.2897, + "step": 3385 + }, + { + "epoch": 0.22648316408337787, + "grad_norm": 0.18359375, + "learning_rate": 0.0001904124792038295, + "loss": 1.1922, + "step": 3390 + }, + { + "epoch": 0.22681721004810262, + "grad_norm": 0.16015625, + "learning_rate": 0.00019036259577767426, + "loss": 1.2858, + "step": 3395 + }, + { + "epoch": 0.22715125601282737, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019031258948636466, + "loss": 1.1695, + "step": 3400 + }, + { + "epoch": 0.22748530197755212, + "grad_norm": 0.150390625, + "learning_rate": 0.00019026246039789376, + "loss": 1.1508, + "step": 3405 + }, + { + "epoch": 0.22781934794227685, + "grad_norm": 0.16015625, + "learning_rate": 0.0001902122085804216, + "loss": 1.1518, + "step": 3410 + }, + { + "epoch": 0.2281533939070016, + "grad_norm": 0.1640625, + "learning_rate": 0.0001901618341022751, + "loss": 1.213, + "step": 3415 + }, + { + "epoch": 0.22848743987172634, + "grad_norm": 0.162109375, + "learning_rate": 0.00019011133703194797, + "loss": 1.2272, + "step": 3420 + }, + { + "epoch": 0.2288214858364511, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019006071743810055, + "loss": 1.2391, + "step": 3425 + }, + { + "epoch": 0.22915553180117584, + "grad_norm": 0.16796875, + "learning_rate": 0.00019000997538955985, + "loss": 1.2277, + "step": 3430 + }, + { + "epoch": 0.2294895777659006, + "grad_norm": 0.150390625, + "learning_rate": 0.0001899591109553193, + "loss": 1.2127, + "step": 3435 + }, + { + "epoch": 0.22982362373062534, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018990812420453885, + "loss": 1.2484, + "step": 3440 + }, + { + "epoch": 0.2301576696953501, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018985701520654466, + "loss": 1.262, + "step": 3445 + }, + { + "epoch": 0.23049171566007481, + "grad_norm": 0.1845703125, + "learning_rate": 0.00018980578403082917, + "loss": 1.1882, + "step": 3450 + }, + { + "epoch": 0.23082576162479956, + "grad_norm": 0.1748046875, + "learning_rate": 0.00018975443074705086, + "loss": 1.217, + "step": 3455 + }, + { + "epoch": 0.2311598075895243, + "grad_norm": 0.1669921875, + "learning_rate": 0.00018970295542503434, + "loss": 1.2297, + "step": 3460 + }, + { + "epoch": 0.23149385355424906, + "grad_norm": 0.154296875, + "learning_rate": 0.0001896513581347701, + "loss": 1.1437, + "step": 3465 + }, + { + "epoch": 0.2318278995189738, + "grad_norm": 0.171875, + "learning_rate": 0.0001895996389464145, + "loss": 1.2221, + "step": 3470 + }, + { + "epoch": 0.23216194548369856, + "grad_norm": 0.16796875, + "learning_rate": 0.0001895477979302896, + "loss": 1.2822, + "step": 3475 + }, + { + "epoch": 0.2324959914484233, + "grad_norm": 0.17578125, + "learning_rate": 0.00018949583515688313, + "loss": 1.2558, + "step": 3480 + }, + { + "epoch": 0.23283003741314806, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001894437506968484, + "loss": 1.2, + "step": 3485 + }, + { + "epoch": 0.23316408337787278, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018939154462100418, + "loss": 1.1973, + "step": 3490 + }, + { + "epoch": 0.23349812934259753, + "grad_norm": 0.171875, + "learning_rate": 0.0001893392170003345, + "loss": 1.1924, + "step": 3495 + }, + { + "epoch": 0.23383217530732228, + "grad_norm": 0.166015625, + "learning_rate": 0.0001892867679059887, + "loss": 1.2833, + "step": 3500 + }, + { + "epoch": 0.23416622127204703, + "grad_norm": 0.173828125, + "learning_rate": 0.0001892341974092814, + "loss": 1.2072, + "step": 3505 + }, + { + "epoch": 0.23450026723677178, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018918150558169217, + "loss": 1.2041, + "step": 3510 + }, + { + "epoch": 0.23483431320149653, + "grad_norm": 0.162109375, + "learning_rate": 0.00018912869249486556, + "loss": 1.1624, + "step": 3515 + }, + { + "epoch": 0.23516835916622128, + "grad_norm": 0.18359375, + "learning_rate": 0.000189075758220611, + "loss": 1.2527, + "step": 3520 + }, + { + "epoch": 0.23550240513094603, + "grad_norm": 0.173828125, + "learning_rate": 0.0001890227028309028, + "loss": 1.0995, + "step": 3525 + }, + { + "epoch": 0.23583645109567075, + "grad_norm": 0.181640625, + "learning_rate": 0.00018896952639787978, + "loss": 1.2456, + "step": 3530 + }, + { + "epoch": 0.2361704970603955, + "grad_norm": 0.1533203125, + "learning_rate": 0.00018891622899384544, + "loss": 1.19, + "step": 3535 + }, + { + "epoch": 0.23650454302512025, + "grad_norm": 0.162109375, + "learning_rate": 0.0001888628106912678, + "loss": 1.2165, + "step": 3540 + }, + { + "epoch": 0.236838588989845, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018880927156277914, + "loss": 1.1449, + "step": 3545 + }, + { + "epoch": 0.23717263495456975, + "grad_norm": 0.1640625, + "learning_rate": 0.00018875561168117617, + "loss": 1.2019, + "step": 3550 + }, + { + "epoch": 0.2375066809192945, + "grad_norm": 0.162109375, + "learning_rate": 0.00018870183111941965, + "loss": 1.2256, + "step": 3555 + }, + { + "epoch": 0.23784072688401925, + "grad_norm": 0.16015625, + "learning_rate": 0.00018864792995063455, + "loss": 1.141, + "step": 3560 + }, + { + "epoch": 0.238174772848744, + "grad_norm": 0.16796875, + "learning_rate": 0.00018859390824810975, + "loss": 1.1647, + "step": 3565 + }, + { + "epoch": 0.23850881881346872, + "grad_norm": 0.1640625, + "learning_rate": 0.00018853976608529803, + "loss": 1.2427, + "step": 3570 + }, + { + "epoch": 0.23884286477819347, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018848550353581597, + "loss": 1.2271, + "step": 3575 + }, + { + "epoch": 0.23917691074291822, + "grad_norm": 0.166015625, + "learning_rate": 0.00018843112067344387, + "loss": 1.1665, + "step": 3580 + }, + { + "epoch": 0.23951095670764297, + "grad_norm": 0.158203125, + "learning_rate": 0.00018837661757212555, + "loss": 1.1786, + "step": 3585 + }, + { + "epoch": 0.23984500267236772, + "grad_norm": 0.162109375, + "learning_rate": 0.00018832199430596835, + "loss": 1.2068, + "step": 3590 + }, + { + "epoch": 0.24017904863709247, + "grad_norm": 0.1767578125, + "learning_rate": 0.000188267250949243, + "loss": 1.2176, + "step": 3595 + }, + { + "epoch": 0.24051309460181722, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018821238757638358, + "loss": 1.2054, + "step": 3600 + }, + { + "epoch": 0.24084714056654197, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018815740426198724, + "loss": 1.1923, + "step": 3605 + }, + { + "epoch": 0.2411811865312667, + "grad_norm": 0.1767578125, + "learning_rate": 0.00018810230108081425, + "loss": 1.18, + "step": 3610 + }, + { + "epoch": 0.24151523249599144, + "grad_norm": 0.169921875, + "learning_rate": 0.00018804707810778792, + "loss": 1.1351, + "step": 3615 + }, + { + "epoch": 0.2418492784607162, + "grad_norm": 0.166015625, + "learning_rate": 0.00018799173541799436, + "loss": 1.2466, + "step": 3620 + }, + { + "epoch": 0.24218332442544094, + "grad_norm": 0.1640625, + "learning_rate": 0.00018793627308668248, + "loss": 1.2114, + "step": 3625 + }, + { + "epoch": 0.2425173703901657, + "grad_norm": 0.16796875, + "learning_rate": 0.00018788069118926397, + "loss": 1.216, + "step": 3630 + }, + { + "epoch": 0.24285141635489044, + "grad_norm": 0.1640625, + "learning_rate": 0.0001878249898013129, + "loss": 1.2487, + "step": 3635 + }, + { + "epoch": 0.2431854623196152, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018776916899856594, + "loss": 1.2802, + "step": 3640 + }, + { + "epoch": 0.24351950828433994, + "grad_norm": 0.154296875, + "learning_rate": 0.00018771322885692213, + "loss": 1.2153, + "step": 3645 + }, + { + "epoch": 0.24385355424906466, + "grad_norm": 0.173828125, + "learning_rate": 0.00018765716945244273, + "loss": 1.1273, + "step": 3650 + }, + { + "epoch": 0.2441876002137894, + "grad_norm": 0.173828125, + "learning_rate": 0.00018760099086135115, + "loss": 1.1616, + "step": 3655 + }, + { + "epoch": 0.24452164617851416, + "grad_norm": 0.1640625, + "learning_rate": 0.00018754469316003292, + "loss": 1.2522, + "step": 3660 + }, + { + "epoch": 0.2448556921432389, + "grad_norm": 0.1689453125, + "learning_rate": 0.00018748827642503545, + "loss": 1.2216, + "step": 3665 + }, + { + "epoch": 0.24518973810796366, + "grad_norm": 0.162109375, + "learning_rate": 0.0001874317407330681, + "loss": 1.1305, + "step": 3670 + }, + { + "epoch": 0.2455237840726884, + "grad_norm": 0.18359375, + "learning_rate": 0.0001873750861610018, + "loss": 1.179, + "step": 3675 + }, + { + "epoch": 0.24585783003741316, + "grad_norm": 0.169921875, + "learning_rate": 0.00018731831278586932, + "loss": 1.2022, + "step": 3680 + }, + { + "epoch": 0.2461918760021379, + "grad_norm": 0.169921875, + "learning_rate": 0.0001872614206848648, + "loss": 1.1367, + "step": 3685 + }, + { + "epoch": 0.24652592196686263, + "grad_norm": 0.1669921875, + "learning_rate": 0.00018720440993534395, + "loss": 1.1631, + "step": 3690 + }, + { + "epoch": 0.24685996793158738, + "grad_norm": 0.185546875, + "learning_rate": 0.0001871472806148237, + "loss": 1.2256, + "step": 3695 + }, + { + "epoch": 0.24719401389631213, + "grad_norm": 0.158203125, + "learning_rate": 0.00018709003280098225, + "loss": 1.2209, + "step": 3700 + }, + { + "epoch": 0.24752805986103688, + "grad_norm": 0.16015625, + "learning_rate": 0.0001870326665716589, + "loss": 1.1896, + "step": 3705 + }, + { + "epoch": 0.24786210582576162, + "grad_norm": 0.166015625, + "learning_rate": 0.00018697518200485398, + "loss": 1.2845, + "step": 3710 + }, + { + "epoch": 0.24819615179048637, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018691757917872867, + "loss": 1.1432, + "step": 3715 + }, + { + "epoch": 0.24853019775521112, + "grad_norm": 0.1767578125, + "learning_rate": 0.00018685985817160503, + "loss": 1.265, + "step": 3720 + }, + { + "epoch": 0.24886424371993587, + "grad_norm": 0.201171875, + "learning_rate": 0.0001868020190619657, + "loss": 1.2452, + "step": 3725 + }, + { + "epoch": 0.2491982896846606, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018674406192845398, + "loss": 1.2468, + "step": 3730 + }, + { + "epoch": 0.24953233564938535, + "grad_norm": 0.1689453125, + "learning_rate": 0.00018668598684987368, + "loss": 1.1719, + "step": 3735 + }, + { + "epoch": 0.2498663816141101, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018662779390518885, + "loss": 1.1946, + "step": 3740 + }, + { + "epoch": 0.25020042757883487, + "grad_norm": 0.166015625, + "learning_rate": 0.00018656948317352393, + "loss": 1.3211, + "step": 3745 + }, + { + "epoch": 0.2505344735435596, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018651105473416345, + "loss": 1.2501, + "step": 3750 + }, + { + "epoch": 0.2508685195082843, + "grad_norm": 0.1650390625, + "learning_rate": 0.000186452508666552, + "loss": 1.2048, + "step": 3755 + }, + { + "epoch": 0.25120256547300907, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001863938450502941, + "loss": 1.2069, + "step": 3760 + }, + { + "epoch": 0.2515366114377338, + "grad_norm": 0.1640625, + "learning_rate": 0.00018633506396515407, + "loss": 1.1784, + "step": 3765 + }, + { + "epoch": 0.25187065740245856, + "grad_norm": 0.1728515625, + "learning_rate": 0.00018627616549105606, + "loss": 1.1697, + "step": 3770 + }, + { + "epoch": 0.2522047033671833, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018621714970808374, + "loss": 1.1564, + "step": 3775 + }, + { + "epoch": 0.25253874933190806, + "grad_norm": 0.185546875, + "learning_rate": 0.00018615801669648026, + "loss": 1.1939, + "step": 3780 + }, + { + "epoch": 0.2528727952966328, + "grad_norm": 0.169921875, + "learning_rate": 0.00018609876653664825, + "loss": 1.2784, + "step": 3785 + }, + { + "epoch": 0.25320684126135756, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018603939930914956, + "loss": 1.1739, + "step": 3790 + }, + { + "epoch": 0.2535408872260823, + "grad_norm": 0.162109375, + "learning_rate": 0.00018597991509470524, + "loss": 1.2513, + "step": 3795 + }, + { + "epoch": 0.25387493319080706, + "grad_norm": 0.154296875, + "learning_rate": 0.00018592031397419545, + "loss": 1.1615, + "step": 3800 + }, + { + "epoch": 0.2542089791555318, + "grad_norm": 0.1640625, + "learning_rate": 0.0001858605960286592, + "loss": 1.2272, + "step": 3805 + }, + { + "epoch": 0.25454302512025656, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018580076133929444, + "loss": 1.2155, + "step": 3810 + }, + { + "epoch": 0.2548770710849813, + "grad_norm": 0.1689453125, + "learning_rate": 0.00018574080998745785, + "loss": 1.2615, + "step": 3815 + }, + { + "epoch": 0.25521111704970606, + "grad_norm": 0.1669921875, + "learning_rate": 0.00018568074205466465, + "loss": 1.2998, + "step": 3820 + }, + { + "epoch": 0.2555451630144308, + "grad_norm": 0.15234375, + "learning_rate": 0.00018562055762258862, + "loss": 1.1872, + "step": 3825 + }, + { + "epoch": 0.25587920897915556, + "grad_norm": 0.1796875, + "learning_rate": 0.00018556025677306205, + "loss": 1.195, + "step": 3830 + }, + { + "epoch": 0.25621325494388025, + "grad_norm": 0.16015625, + "learning_rate": 0.00018549983958807533, + "loss": 1.1688, + "step": 3835 + }, + { + "epoch": 0.256547300908605, + "grad_norm": 0.173828125, + "learning_rate": 0.00018543930614977719, + "loss": 1.2799, + "step": 3840 + }, + { + "epoch": 0.25688134687332975, + "grad_norm": 0.1669921875, + "learning_rate": 0.00018537865654047432, + "loss": 1.1605, + "step": 3845 + }, + { + "epoch": 0.2572153928380545, + "grad_norm": 0.1689453125, + "learning_rate": 0.00018531789084263143, + "loss": 1.2179, + "step": 3850 + }, + { + "epoch": 0.25754943880277925, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001852570091388711, + "loss": 1.1944, + "step": 3855 + }, + { + "epoch": 0.257883484767504, + "grad_norm": 0.177734375, + "learning_rate": 0.00018519601151197352, + "loss": 1.2285, + "step": 3860 + }, + { + "epoch": 0.25821753073222875, + "grad_norm": 0.1640625, + "learning_rate": 0.00018513489804487666, + "loss": 1.2673, + "step": 3865 + }, + { + "epoch": 0.2585515766969535, + "grad_norm": 0.1796875, + "learning_rate": 0.00018507366882067583, + "loss": 1.2363, + "step": 3870 + }, + { + "epoch": 0.25888562266167825, + "grad_norm": 0.154296875, + "learning_rate": 0.00018501232392262385, + "loss": 1.266, + "step": 3875 + }, + { + "epoch": 0.259219668626403, + "grad_norm": 0.169921875, + "learning_rate": 0.00018495086343413083, + "loss": 1.2432, + "step": 3880 + }, + { + "epoch": 0.25955371459112775, + "grad_norm": 0.1845703125, + "learning_rate": 0.00018488928743876394, + "loss": 1.2437, + "step": 3885 + }, + { + "epoch": 0.2598877605558525, + "grad_norm": 0.16015625, + "learning_rate": 0.00018482759602024752, + "loss": 1.2451, + "step": 3890 + }, + { + "epoch": 0.26022180652057725, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001847657892624628, + "loss": 1.2399, + "step": 3895 + }, + { + "epoch": 0.260555852485302, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001847038672494478, + "loss": 1.2465, + "step": 3900 + }, + { + "epoch": 0.26088989845002675, + "grad_norm": 0.154296875, + "learning_rate": 0.0001846418300653973, + "loss": 1.2469, + "step": 3905 + }, + { + "epoch": 0.2612239444147515, + "grad_norm": 0.169921875, + "learning_rate": 0.0001845796777946627, + "loss": 1.1983, + "step": 3910 + }, + { + "epoch": 0.2615579903794762, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001845174105217518, + "loss": 1.1489, + "step": 3915 + }, + { + "epoch": 0.26189203634420094, + "grad_norm": 0.1904296875, + "learning_rate": 0.00018445502833132883, + "loss": 1.1875, + "step": 3920 + }, + { + "epoch": 0.2622260823089257, + "grad_norm": 0.1787109375, + "learning_rate": 0.00018439253130821427, + "loss": 1.1905, + "step": 3925 + }, + { + "epoch": 0.26256012827365044, + "grad_norm": 0.16015625, + "learning_rate": 0.0001843299195373847, + "loss": 1.2525, + "step": 3930 + }, + { + "epoch": 0.2628941742383752, + "grad_norm": 0.166015625, + "learning_rate": 0.00018426719310397273, + "loss": 1.215, + "step": 3935 + }, + { + "epoch": 0.26322822020309994, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001842043520932669, + "loss": 1.2433, + "step": 3940 + }, + { + "epoch": 0.2635622661678247, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001841413965907116, + "loss": 1.1854, + "step": 3945 + }, + { + "epoch": 0.26389631213254944, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018407832668190671, + "loss": 1.1176, + "step": 3950 + }, + { + "epoch": 0.2642303580972742, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018401514245260783, + "loss": 1.2202, + "step": 3955 + }, + { + "epoch": 0.26456440406199894, + "grad_norm": 0.162109375, + "learning_rate": 0.000183951843988726, + "loss": 1.2002, + "step": 3960 + }, + { + "epoch": 0.2648984500267237, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018388843137632748, + "loss": 1.2431, + "step": 3965 + }, + { + "epoch": 0.26523249599144844, + "grad_norm": 0.162109375, + "learning_rate": 0.00018382490470163378, + "loss": 1.1313, + "step": 3970 + }, + { + "epoch": 0.2655665419561732, + "grad_norm": 0.16015625, + "learning_rate": 0.00018376126405102153, + "loss": 1.2414, + "step": 3975 + }, + { + "epoch": 0.26590058792089794, + "grad_norm": 0.197265625, + "learning_rate": 0.00018369750951102232, + "loss": 1.2146, + "step": 3980 + }, + { + "epoch": 0.2662346338856227, + "grad_norm": 0.1796875, + "learning_rate": 0.00018363364116832256, + "loss": 1.1797, + "step": 3985 + }, + { + "epoch": 0.26656867985034743, + "grad_norm": 0.162109375, + "learning_rate": 0.00018356965910976348, + "loss": 1.236, + "step": 3990 + }, + { + "epoch": 0.26690272581507213, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018350556342234078, + "loss": 1.1745, + "step": 3995 + }, + { + "epoch": 0.2672367717797969, + "grad_norm": 0.1640625, + "learning_rate": 0.00018344135419320483, + "loss": 1.1889, + "step": 4000 + }, + { + "epoch": 0.2675708177445216, + "grad_norm": 0.16015625, + "learning_rate": 0.00018337703150966025, + "loss": 1.2263, + "step": 4005 + }, + { + "epoch": 0.2679048637092464, + "grad_norm": 0.15625, + "learning_rate": 0.000183312595459166, + "loss": 1.1455, + "step": 4010 + }, + { + "epoch": 0.2682389096739711, + "grad_norm": 0.171875, + "learning_rate": 0.0001832480461293352, + "loss": 1.1681, + "step": 4015 + }, + { + "epoch": 0.2685729556386959, + "grad_norm": 0.169921875, + "learning_rate": 0.0001831833836079349, + "loss": 1.2668, + "step": 4020 + }, + { + "epoch": 0.2689070016034206, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018311860798288609, + "loss": 1.2165, + "step": 4025 + }, + { + "epoch": 0.2692410475681454, + "grad_norm": 0.16796875, + "learning_rate": 0.00018305371934226362, + "loss": 1.2629, + "step": 4030 + }, + { + "epoch": 0.2695750935328701, + "grad_norm": 0.177734375, + "learning_rate": 0.00018298871777429595, + "loss": 1.1995, + "step": 4035 + }, + { + "epoch": 0.2699091394975949, + "grad_norm": 0.16015625, + "learning_rate": 0.00018292360336736506, + "loss": 1.1677, + "step": 4040 + }, + { + "epoch": 0.2702431854623196, + "grad_norm": 0.1640625, + "learning_rate": 0.00018285837621000636, + "loss": 1.2907, + "step": 4045 + }, + { + "epoch": 0.2705772314270444, + "grad_norm": 0.1689453125, + "learning_rate": 0.00018279303639090865, + "loss": 1.2465, + "step": 4050 + }, + { + "epoch": 0.2709112773917691, + "grad_norm": 0.1689453125, + "learning_rate": 0.00018272758399891383, + "loss": 1.1909, + "step": 4055 + }, + { + "epoch": 0.2712453233564939, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018266201912301688, + "loss": 1.2178, + "step": 4060 + }, + { + "epoch": 0.2715793693212186, + "grad_norm": 0.173828125, + "learning_rate": 0.00018259634185236574, + "loss": 1.2175, + "step": 4065 + }, + { + "epoch": 0.27191341528594337, + "grad_norm": 0.169921875, + "learning_rate": 0.00018253055227626116, + "loss": 1.1551, + "step": 4070 + }, + { + "epoch": 0.27224746125066807, + "grad_norm": 0.1669921875, + "learning_rate": 0.00018246465048415663, + "loss": 1.1781, + "step": 4075 + }, + { + "epoch": 0.2725815072153928, + "grad_norm": 0.1748046875, + "learning_rate": 0.00018239863656565813, + "loss": 1.2389, + "step": 4080 + }, + { + "epoch": 0.27291555318011756, + "grad_norm": 0.1640625, + "learning_rate": 0.00018233251061052421, + "loss": 1.3091, + "step": 4085 + }, + { + "epoch": 0.2732495991448423, + "grad_norm": 0.16015625, + "learning_rate": 0.00018226627270866562, + "loss": 1.2638, + "step": 4090 + }, + { + "epoch": 0.27358364510956706, + "grad_norm": 0.1767578125, + "learning_rate": 0.00018219992295014548, + "loss": 1.194, + "step": 4095 + }, + { + "epoch": 0.2739176910742918, + "grad_norm": 0.166015625, + "learning_rate": 0.00018213346142517884, + "loss": 1.2313, + "step": 4100 + }, + { + "epoch": 0.27425173703901656, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018206688822413288, + "loss": 1.1716, + "step": 4105 + }, + { + "epoch": 0.2745857830037413, + "grad_norm": 0.158203125, + "learning_rate": 0.00018200020343752646, + "loss": 1.2846, + "step": 4110 + }, + { + "epoch": 0.27491982896846606, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018193340715603033, + "loss": 1.154, + "step": 4115 + }, + { + "epoch": 0.2752538749331908, + "grad_norm": 0.1748046875, + "learning_rate": 0.00018186649947046668, + "loss": 1.211, + "step": 4120 + }, + { + "epoch": 0.27558792089791556, + "grad_norm": 0.169921875, + "learning_rate": 0.0001817994804718093, + "loss": 1.2262, + "step": 4125 + }, + { + "epoch": 0.2759219668626403, + "grad_norm": 0.16796875, + "learning_rate": 0.00018173235025118324, + "loss": 1.2007, + "step": 4130 + }, + { + "epoch": 0.27625601282736506, + "grad_norm": 0.162109375, + "learning_rate": 0.0001816651088998649, + "loss": 1.228, + "step": 4135 + }, + { + "epoch": 0.2765900587920898, + "grad_norm": 0.1640625, + "learning_rate": 0.00018159775650928155, + "loss": 1.2326, + "step": 4140 + }, + { + "epoch": 0.27692410475681456, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001815302931710117, + "loss": 1.1988, + "step": 4145 + }, + { + "epoch": 0.2772581507215393, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018146271897678457, + "loss": 1.2058, + "step": 4150 + }, + { + "epoch": 0.277592196686264, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018139503401848017, + "loss": 1.3223, + "step": 4155 + }, + { + "epoch": 0.27792624265098875, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018132723838812907, + "loss": 1.2199, + "step": 4160 + }, + { + "epoch": 0.2782602886157135, + "grad_norm": 0.2080078125, + "learning_rate": 0.00018125933217791234, + "loss": 1.2513, + "step": 4165 + }, + { + "epoch": 0.27859433458043825, + "grad_norm": 0.16015625, + "learning_rate": 0.00018119131548016137, + "loss": 1.2097, + "step": 4170 + }, + { + "epoch": 0.278928380545163, + "grad_norm": 0.19140625, + "learning_rate": 0.00018112318838735787, + "loss": 1.1601, + "step": 4175 + }, + { + "epoch": 0.27926242650988775, + "grad_norm": 0.15625, + "learning_rate": 0.00018105495099213353, + "loss": 1.2154, + "step": 4180 + }, + { + "epoch": 0.2795964724746125, + "grad_norm": 0.166015625, + "learning_rate": 0.00018098660338727017, + "loss": 1.1971, + "step": 4185 + }, + { + "epoch": 0.27993051843933725, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001809181456656993, + "loss": 1.1371, + "step": 4190 + }, + { + "epoch": 0.280264564404062, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018084957792050224, + "loss": 1.2547, + "step": 4195 + }, + { + "epoch": 0.28059861036878675, + "grad_norm": 0.185546875, + "learning_rate": 0.0001807809002449099, + "loss": 1.201, + "step": 4200 + }, + { + "epoch": 0.2809326563335115, + "grad_norm": 0.16796875, + "learning_rate": 0.00018071211273230263, + "loss": 1.2014, + "step": 4205 + }, + { + "epoch": 0.28126670229823625, + "grad_norm": 0.1669921875, + "learning_rate": 0.00018064321547621022, + "loss": 1.2495, + "step": 4210 + }, + { + "epoch": 0.281600748262961, + "grad_norm": 0.1689453125, + "learning_rate": 0.00018057420857031157, + "loss": 1.1429, + "step": 4215 + }, + { + "epoch": 0.28193479422768575, + "grad_norm": 0.173828125, + "learning_rate": 0.0001805050921084347, + "loss": 1.1956, + "step": 4220 + }, + { + "epoch": 0.2822688401924105, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001804358661845566, + "loss": 1.1969, + "step": 4225 + }, + { + "epoch": 0.28260288615713525, + "grad_norm": 0.1640625, + "learning_rate": 0.00018036653089280308, + "loss": 1.1448, + "step": 4230 + }, + { + "epoch": 0.28293693212185994, + "grad_norm": 0.1806640625, + "learning_rate": 0.00018029708632744871, + "loss": 1.2246, + "step": 4235 + }, + { + "epoch": 0.2832709780865847, + "grad_norm": 0.169921875, + "learning_rate": 0.00018022753258291658, + "loss": 1.2453, + "step": 4240 + }, + { + "epoch": 0.28360502405130944, + "grad_norm": 0.1533203125, + "learning_rate": 0.00018015786975377824, + "loss": 1.1488, + "step": 4245 + }, + { + "epoch": 0.2839390700160342, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018008809793475358, + "loss": 1.1986, + "step": 4250 + }, + { + "epoch": 0.28427311598075894, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018001821722071068, + "loss": 1.2035, + "step": 4255 + }, + { + "epoch": 0.2846071619454837, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017994822770666565, + "loss": 1.2267, + "step": 4260 + }, + { + "epoch": 0.28494120791020844, + "grad_norm": 0.18359375, + "learning_rate": 0.0001798781294877826, + "loss": 1.2708, + "step": 4265 + }, + { + "epoch": 0.2852752538749332, + "grad_norm": 0.166015625, + "learning_rate": 0.00017980792265937336, + "loss": 1.2181, + "step": 4270 + }, + { + "epoch": 0.28560929983965794, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017973760731689753, + "loss": 1.2263, + "step": 4275 + }, + { + "epoch": 0.2859433458043827, + "grad_norm": 0.2236328125, + "learning_rate": 0.00017966718355596218, + "loss": 1.1812, + "step": 4280 + }, + { + "epoch": 0.28627739176910744, + "grad_norm": 0.1708984375, + "learning_rate": 0.00017959665147232177, + "loss": 1.1684, + "step": 4285 + }, + { + "epoch": 0.2866114377338322, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017952601116187823, + "loss": 1.1948, + "step": 4290 + }, + { + "epoch": 0.28694548369855694, + "grad_norm": 0.166015625, + "learning_rate": 0.00017945526272068038, + "loss": 1.2083, + "step": 4295 + }, + { + "epoch": 0.2872795296632817, + "grad_norm": 0.1806640625, + "learning_rate": 0.00017938440624492427, + "loss": 1.2976, + "step": 4300 + }, + { + "epoch": 0.28761357562800643, + "grad_norm": 0.1708984375, + "learning_rate": 0.00017931344183095272, + "loss": 1.1737, + "step": 4305 + }, + { + "epoch": 0.2879476215927312, + "grad_norm": 0.1708984375, + "learning_rate": 0.00017924236957525544, + "loss": 1.3016, + "step": 4310 + }, + { + "epoch": 0.2882816675574559, + "grad_norm": 0.16796875, + "learning_rate": 0.00017917118957446864, + "loss": 1.183, + "step": 4315 + }, + { + "epoch": 0.28861571352218063, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017909990192537504, + "loss": 1.2075, + "step": 4320 + }, + { + "epoch": 0.2889497594869054, + "grad_norm": 0.166015625, + "learning_rate": 0.00017902850672490387, + "loss": 1.1567, + "step": 4325 + }, + { + "epoch": 0.2892838054516301, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017895700407013045, + "loss": 1.1603, + "step": 4330 + }, + { + "epoch": 0.2896178514163549, + "grad_norm": 0.171875, + "learning_rate": 0.00017888539405827624, + "loss": 1.1885, + "step": 4335 + }, + { + "epoch": 0.2899518973810796, + "grad_norm": 0.16015625, + "learning_rate": 0.0001788136767867087, + "loss": 1.2023, + "step": 4340 + }, + { + "epoch": 0.2902859433458044, + "grad_norm": 0.171875, + "learning_rate": 0.00017874185235294113, + "loss": 1.2503, + "step": 4345 + }, + { + "epoch": 0.2906199893105291, + "grad_norm": 0.171875, + "learning_rate": 0.0001786699208546325, + "loss": 1.229, + "step": 4350 + }, + { + "epoch": 0.2909540352752539, + "grad_norm": 0.1748046875, + "learning_rate": 0.00017859788238958738, + "loss": 1.2023, + "step": 4355 + }, + { + "epoch": 0.2912880812399786, + "grad_norm": 0.154296875, + "learning_rate": 0.00017852573705575583, + "loss": 1.2153, + "step": 4360 + }, + { + "epoch": 0.2916221272047034, + "grad_norm": 0.173828125, + "learning_rate": 0.00017845348495123308, + "loss": 1.1974, + "step": 4365 + }, + { + "epoch": 0.2919561731694281, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017838112617425968, + "loss": 1.1569, + "step": 4370 + }, + { + "epoch": 0.2922902191341529, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017830866082322116, + "loss": 1.2243, + "step": 4375 + }, + { + "epoch": 0.2926242650988776, + "grad_norm": 0.166015625, + "learning_rate": 0.00017823608899664796, + "loss": 1.2616, + "step": 4380 + }, + { + "epoch": 0.2929583110636024, + "grad_norm": 0.173828125, + "learning_rate": 0.0001781634107932153, + "loss": 1.2427, + "step": 4385 + }, + { + "epoch": 0.2932923570283271, + "grad_norm": 0.1669921875, + "learning_rate": 0.000178090626311743, + "loss": 1.2594, + "step": 4390 + }, + { + "epoch": 0.29362640299305187, + "grad_norm": 0.162109375, + "learning_rate": 0.0001780177356511955, + "loss": 1.1679, + "step": 4395 + }, + { + "epoch": 0.29396044895777657, + "grad_norm": 0.16796875, + "learning_rate": 0.00017794473891068142, + "loss": 1.1783, + "step": 4400 + }, + { + "epoch": 0.2942944949225013, + "grad_norm": 0.17578125, + "learning_rate": 0.0001778716361894538, + "loss": 1.2501, + "step": 4405 + }, + { + "epoch": 0.29462854088722606, + "grad_norm": 0.27734375, + "learning_rate": 0.00017779842758690973, + "loss": 1.2928, + "step": 4410 + }, + { + "epoch": 0.2949625868519508, + "grad_norm": 0.1640625, + "learning_rate": 0.00017772511320259023, + "loss": 1.1966, + "step": 4415 + }, + { + "epoch": 0.29529663281667556, + "grad_norm": 0.162109375, + "learning_rate": 0.0001776516931361801, + "loss": 1.28, + "step": 4420 + }, + { + "epoch": 0.2956306787814003, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017757816748750798, + "loss": 1.1887, + "step": 4425 + }, + { + "epoch": 0.29596472474612506, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017750453635654591, + "loss": 1.2065, + "step": 4430 + }, + { + "epoch": 0.2962987707108498, + "grad_norm": 0.173828125, + "learning_rate": 0.0001774307998434095, + "loss": 1.2641, + "step": 4435 + }, + { + "epoch": 0.29663281667557456, + "grad_norm": 0.16015625, + "learning_rate": 0.00017735695804835757, + "loss": 1.1513, + "step": 4440 + }, + { + "epoch": 0.2969668626402993, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001772830110717921, + "loss": 1.2108, + "step": 4445 + }, + { + "epoch": 0.29730090860502406, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017720895901425805, + "loss": 1.1339, + "step": 4450 + }, + { + "epoch": 0.2976349545697488, + "grad_norm": 0.162109375, + "learning_rate": 0.0001771348019764433, + "loss": 1.1995, + "step": 4455 + }, + { + "epoch": 0.29796900053447356, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001770605400591785, + "loss": 1.2197, + "step": 4460 + }, + { + "epoch": 0.2983030464991983, + "grad_norm": 0.16015625, + "learning_rate": 0.00017698617336343685, + "loss": 1.226, + "step": 4465 + }, + { + "epoch": 0.29863709246392306, + "grad_norm": 0.208984375, + "learning_rate": 0.000176911701990334, + "loss": 1.1742, + "step": 4470 + }, + { + "epoch": 0.2989711384286478, + "grad_norm": 0.1640625, + "learning_rate": 0.00017683712604112798, + "loss": 1.2294, + "step": 4475 + }, + { + "epoch": 0.2993051843933725, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017676244561721905, + "loss": 1.1323, + "step": 4480 + }, + { + "epoch": 0.29963923035809725, + "grad_norm": 0.169921875, + "learning_rate": 0.00017668766082014936, + "loss": 1.1976, + "step": 4485 + }, + { + "epoch": 0.299973276322822, + "grad_norm": 0.1572265625, + "learning_rate": 0.00017661277175160314, + "loss": 1.3004, + "step": 4490 + }, + { + "epoch": 0.30030732228754675, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017653777851340635, + "loss": 1.2209, + "step": 4495 + }, + { + "epoch": 0.3006413682522715, + "grad_norm": 0.169921875, + "learning_rate": 0.0001764626812075266, + "loss": 1.3061, + "step": 4500 + }, + { + "epoch": 0.30097541421699625, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017638747993607286, + "loss": 1.2313, + "step": 4505 + }, + { + "epoch": 0.301309460181721, + "grad_norm": 0.166015625, + "learning_rate": 0.00017631217480129573, + "loss": 1.2196, + "step": 4510 + }, + { + "epoch": 0.30164350614644575, + "grad_norm": 0.16796875, + "learning_rate": 0.00017623676590558675, + "loss": 1.2387, + "step": 4515 + }, + { + "epoch": 0.3019775521111705, + "grad_norm": 0.1884765625, + "learning_rate": 0.00017616125335147875, + "loss": 1.1669, + "step": 4520 + }, + { + "epoch": 0.30231159807589525, + "grad_norm": 0.1845703125, + "learning_rate": 0.00017608563724164536, + "loss": 1.2865, + "step": 4525 + }, + { + "epoch": 0.30264564404062, + "grad_norm": 0.162109375, + "learning_rate": 0.0001760099176789012, + "loss": 1.191, + "step": 4530 + }, + { + "epoch": 0.30297969000534475, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017593409476620127, + "loss": 1.2484, + "step": 4535 + }, + { + "epoch": 0.3033137359700695, + "grad_norm": 0.166015625, + "learning_rate": 0.00017585816860664135, + "loss": 1.2039, + "step": 4540 + }, + { + "epoch": 0.30364778193479425, + "grad_norm": 0.197265625, + "learning_rate": 0.00017578213930345753, + "loss": 1.2493, + "step": 4545 + }, + { + "epoch": 0.303981827899519, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001757060069600261, + "loss": 1.1464, + "step": 4550 + }, + { + "epoch": 0.30431587386424375, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017562977167986344, + "loss": 1.2524, + "step": 4555 + }, + { + "epoch": 0.30464991982896844, + "grad_norm": 0.1767578125, + "learning_rate": 0.00017555343356662597, + "loss": 1.1804, + "step": 4560 + }, + { + "epoch": 0.3049839657936932, + "grad_norm": 0.171875, + "learning_rate": 0.00017547699272410988, + "loss": 1.1869, + "step": 4565 + }, + { + "epoch": 0.30531801175841794, + "grad_norm": 0.16015625, + "learning_rate": 0.00017540044925625102, + "loss": 1.1751, + "step": 4570 + }, + { + "epoch": 0.3056520577231427, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017532380326712487, + "loss": 1.2452, + "step": 4575 + }, + { + "epoch": 0.30598610368786744, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001752470548609462, + "loss": 1.2203, + "step": 4580 + }, + { + "epoch": 0.3063201496525922, + "grad_norm": 0.171875, + "learning_rate": 0.00017517020414206913, + "loss": 1.2208, + "step": 4585 + }, + { + "epoch": 0.30665419561731694, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017509325121498677, + "loss": 1.1297, + "step": 4590 + }, + { + "epoch": 0.3069882415820417, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017501619618433135, + "loss": 1.2869, + "step": 4595 + }, + { + "epoch": 0.30732228754676644, + "grad_norm": 0.166015625, + "learning_rate": 0.00017493903915487377, + "loss": 1.2867, + "step": 4600 + }, + { + "epoch": 0.3076563335114912, + "grad_norm": 0.166015625, + "learning_rate": 0.0001748617802315238, + "loss": 1.1794, + "step": 4605 + }, + { + "epoch": 0.30799037947621594, + "grad_norm": 0.154296875, + "learning_rate": 0.0001747844195193296, + "loss": 1.1931, + "step": 4610 + }, + { + "epoch": 0.3083244254409407, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001747069571234778, + "loss": 1.1756, + "step": 4615 + }, + { + "epoch": 0.30865847140566544, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017462939314929327, + "loss": 1.2366, + "step": 4620 + }, + { + "epoch": 0.3089925173703902, + "grad_norm": 0.1806640625, + "learning_rate": 0.000174551727702239, + "loss": 1.1803, + "step": 4625 + }, + { + "epoch": 0.30932656333511493, + "grad_norm": 0.1875, + "learning_rate": 0.00017447396088791597, + "loss": 1.2106, + "step": 4630 + }, + { + "epoch": 0.3096606092998397, + "grad_norm": 0.15625, + "learning_rate": 0.00017439609281206297, + "loss": 1.2554, + "step": 4635 + }, + { + "epoch": 0.3099946552645644, + "grad_norm": 0.1845703125, + "learning_rate": 0.00017431812358055645, + "loss": 1.1544, + "step": 4640 + }, + { + "epoch": 0.31032870122928913, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017424005329941047, + "loss": 1.1944, + "step": 4645 + }, + { + "epoch": 0.3106627471940139, + "grad_norm": 0.1708984375, + "learning_rate": 0.00017416188207477638, + "loss": 1.2195, + "step": 4650 + }, + { + "epoch": 0.3109967931587386, + "grad_norm": 0.1708984375, + "learning_rate": 0.00017408361001294292, + "loss": 1.2621, + "step": 4655 + }, + { + "epoch": 0.3113308391234634, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001740052372203358, + "loss": 1.2301, + "step": 4660 + }, + { + "epoch": 0.3116648850881881, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017392676380351775, + "loss": 1.1779, + "step": 4665 + }, + { + "epoch": 0.3119989310529129, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017384818986918837, + "loss": 1.1954, + "step": 4670 + }, + { + "epoch": 0.3123329770176376, + "grad_norm": 0.1923828125, + "learning_rate": 0.00017376951552418386, + "loss": 1.2073, + "step": 4675 + }, + { + "epoch": 0.3126670229823624, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017369074087547696, + "loss": 1.1381, + "step": 4680 + }, + { + "epoch": 0.3130010689470871, + "grad_norm": 0.1728515625, + "learning_rate": 0.00017361186603017685, + "loss": 1.2583, + "step": 4685 + }, + { + "epoch": 0.3133351149118119, + "grad_norm": 0.1748046875, + "learning_rate": 0.00017353289109552883, + "loss": 1.2078, + "step": 4690 + }, + { + "epoch": 0.3136691608765366, + "grad_norm": 0.1796875, + "learning_rate": 0.00017345381617891442, + "loss": 1.2324, + "step": 4695 + }, + { + "epoch": 0.3140032068412614, + "grad_norm": 0.171875, + "learning_rate": 0.000173374641387851, + "loss": 1.1969, + "step": 4700 + }, + { + "epoch": 0.3143372528059861, + "grad_norm": 0.162109375, + "learning_rate": 0.0001732953668299918, + "loss": 1.1406, + "step": 4705 + }, + { + "epoch": 0.31467129877071087, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001732159926131256, + "loss": 1.1391, + "step": 4710 + }, + { + "epoch": 0.3150053447354356, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017313651884517684, + "loss": 1.1954, + "step": 4715 + }, + { + "epoch": 0.3153393907001603, + "grad_norm": 0.171875, + "learning_rate": 0.00017305694563420524, + "loss": 1.218, + "step": 4720 + }, + { + "epoch": 0.31567343666488507, + "grad_norm": 0.1640625, + "learning_rate": 0.00017297727308840564, + "loss": 1.2108, + "step": 4725 + }, + { + "epoch": 0.3160074826296098, + "grad_norm": 0.16796875, + "learning_rate": 0.00017289750131610813, + "loss": 1.2214, + "step": 4730 + }, + { + "epoch": 0.31634152859433456, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017281763042577763, + "loss": 1.3101, + "step": 4735 + }, + { + "epoch": 0.3166755745590593, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017273766052601378, + "loss": 1.2593, + "step": 4740 + }, + { + "epoch": 0.31700962052378406, + "grad_norm": 0.1728515625, + "learning_rate": 0.00017265759172555085, + "loss": 1.2348, + "step": 4745 + }, + { + "epoch": 0.3173436664885088, + "grad_norm": 0.177734375, + "learning_rate": 0.0001725774241332577, + "loss": 1.1378, + "step": 4750 + }, + { + "epoch": 0.31767771245323356, + "grad_norm": 0.15234375, + "learning_rate": 0.00017249715785813737, + "loss": 1.1839, + "step": 4755 + }, + { + "epoch": 0.3180117584179583, + "grad_norm": 0.1728515625, + "learning_rate": 0.00017241679300932717, + "loss": 1.2049, + "step": 4760 + }, + { + "epoch": 0.31834580438268306, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017233632969609842, + "loss": 1.2061, + "step": 4765 + }, + { + "epoch": 0.3186798503474078, + "grad_norm": 0.1865234375, + "learning_rate": 0.00017225576802785636, + "loss": 1.2253, + "step": 4770 + }, + { + "epoch": 0.31901389631213256, + "grad_norm": 0.154296875, + "learning_rate": 0.0001721751081141398, + "loss": 1.2239, + "step": 4775 + }, + { + "epoch": 0.3193479422768573, + "grad_norm": 0.1708984375, + "learning_rate": 0.00017209435006462136, + "loss": 1.1895, + "step": 4780 + }, + { + "epoch": 0.31968198824158206, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017201349398910694, + "loss": 1.3404, + "step": 4785 + }, + { + "epoch": 0.3200160342063068, + "grad_norm": 0.16015625, + "learning_rate": 0.00017193253999753575, + "loss": 1.199, + "step": 4790 + }, + { + "epoch": 0.32035008017103156, + "grad_norm": 0.171875, + "learning_rate": 0.00017185148819998022, + "loss": 1.2542, + "step": 4795 + }, + { + "epoch": 0.32068412613575625, + "grad_norm": 0.169921875, + "learning_rate": 0.0001717703387066456, + "loss": 1.2918, + "step": 4800 + }, + { + "epoch": 0.321018172100481, + "grad_norm": 0.16796875, + "learning_rate": 0.00017168909162787016, + "loss": 1.1468, + "step": 4805 + }, + { + "epoch": 0.32135221806520575, + "grad_norm": 0.1728515625, + "learning_rate": 0.00017160774707412476, + "loss": 1.2845, + "step": 4810 + }, + { + "epoch": 0.3216862640299305, + "grad_norm": 0.17578125, + "learning_rate": 0.00017152630515601281, + "loss": 1.1376, + "step": 4815 + }, + { + "epoch": 0.32202030999465525, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001714447659842701, + "loss": 1.2282, + "step": 4820 + }, + { + "epoch": 0.32235435595938, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017136312966976465, + "loss": 1.2559, + "step": 4825 + }, + { + "epoch": 0.32268840192410475, + "grad_norm": 0.1552734375, + "learning_rate": 0.00017128139632349658, + "loss": 1.1685, + "step": 4830 + }, + { + "epoch": 0.3230224478888295, + "grad_norm": 0.16015625, + "learning_rate": 0.00017119956605659792, + "loss": 1.1611, + "step": 4835 + }, + { + "epoch": 0.32335649385355425, + "grad_norm": 0.16015625, + "learning_rate": 0.0001711176389803325, + "loss": 1.2007, + "step": 4840 + }, + { + "epoch": 0.323690539818279, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001710356152060958, + "loss": 1.1714, + "step": 4845 + }, + { + "epoch": 0.32402458578300375, + "grad_norm": 0.17578125, + "learning_rate": 0.00017095349484541478, + "loss": 1.1959, + "step": 4850 + }, + { + "epoch": 0.3243586317477285, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017087127800994767, + "loss": 1.2585, + "step": 4855 + }, + { + "epoch": 0.32469267771245325, + "grad_norm": 0.19140625, + "learning_rate": 0.00017078896481148388, + "loss": 1.1676, + "step": 4860 + }, + { + "epoch": 0.325026723677178, + "grad_norm": 0.1708984375, + "learning_rate": 0.00017070655536194397, + "loss": 1.1425, + "step": 4865 + }, + { + "epoch": 0.32536076964190275, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017062404977337918, + "loss": 1.1537, + "step": 4870 + }, + { + "epoch": 0.3256948156066275, + "grad_norm": 0.1708984375, + "learning_rate": 0.00017054144815797164, + "loss": 1.1753, + "step": 4875 + }, + { + "epoch": 0.3260288615713522, + "grad_norm": 0.169921875, + "learning_rate": 0.00017045875062803395, + "loss": 1.2631, + "step": 4880 + }, + { + "epoch": 0.32636290753607694, + "grad_norm": 0.154296875, + "learning_rate": 0.00017037595729600913, + "loss": 1.2179, + "step": 4885 + }, + { + "epoch": 0.3266969535008017, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017029306827447049, + "loss": 1.1933, + "step": 4890 + }, + { + "epoch": 0.32703099946552644, + "grad_norm": 0.1845703125, + "learning_rate": 0.00017021008367612144, + "loss": 1.2329, + "step": 4895 + }, + { + "epoch": 0.3273650454302512, + "grad_norm": 0.1708984375, + "learning_rate": 0.00017012700361379533, + "loss": 1.2341, + "step": 4900 + }, + { + "epoch": 0.32769909139497594, + "grad_norm": 0.1708984375, + "learning_rate": 0.00017004382820045533, + "loss": 1.1853, + "step": 4905 + }, + { + "epoch": 0.3280331373597007, + "grad_norm": 0.166015625, + "learning_rate": 0.0001699605575491943, + "loss": 1.1806, + "step": 4910 + }, + { + "epoch": 0.32836718332442544, + "grad_norm": 0.1708984375, + "learning_rate": 0.00016987719177323445, + "loss": 1.2325, + "step": 4915 + }, + { + "epoch": 0.3287012292891502, + "grad_norm": 0.16015625, + "learning_rate": 0.0001697937309859275, + "loss": 1.2619, + "step": 4920 + }, + { + "epoch": 0.32903527525387494, + "grad_norm": 0.166015625, + "learning_rate": 0.00016971017530075427, + "loss": 1.2323, + "step": 4925 + }, + { + "epoch": 0.3293693212185997, + "grad_norm": 0.265625, + "learning_rate": 0.0001696265248313246, + "loss": 1.2028, + "step": 4930 + }, + { + "epoch": 0.32970336718332444, + "grad_norm": 0.1796875, + "learning_rate": 0.00016954277969137723, + "loss": 1.1464, + "step": 4935 + }, + { + "epoch": 0.3300374131480492, + "grad_norm": 0.1865234375, + "learning_rate": 0.00016945893999477965, + "loss": 1.2066, + "step": 4940 + }, + { + "epoch": 0.33037145911277394, + "grad_norm": 0.1796875, + "learning_rate": 0.00016937500585552785, + "loss": 1.1981, + "step": 4945 + }, + { + "epoch": 0.3307055050774987, + "grad_norm": 0.1689453125, + "learning_rate": 0.00016929097738774634, + "loss": 1.1917, + "step": 4950 + }, + { + "epoch": 0.33103955104222343, + "grad_norm": 0.1708984375, + "learning_rate": 0.00016920685470568777, + "loss": 1.1599, + "step": 4955 + }, + { + "epoch": 0.33137359700694813, + "grad_norm": 0.1796875, + "learning_rate": 0.00016912263792373295, + "loss": 1.2997, + "step": 4960 + }, + { + "epoch": 0.3317076429716729, + "grad_norm": 0.154296875, + "learning_rate": 0.00016903832715639062, + "loss": 1.1939, + "step": 4965 + }, + { + "epoch": 0.3320416889363976, + "grad_norm": 0.171875, + "learning_rate": 0.00016895392251829736, + "loss": 1.1861, + "step": 4970 + }, + { + "epoch": 0.3323757349011224, + "grad_norm": 0.158203125, + "learning_rate": 0.00016886942412421734, + "loss": 1.282, + "step": 4975 + }, + { + "epoch": 0.3327097808658471, + "grad_norm": 0.1796875, + "learning_rate": 0.00016878483208904217, + "loss": 1.2393, + "step": 4980 + }, + { + "epoch": 0.3330438268305719, + "grad_norm": 0.166015625, + "learning_rate": 0.00016870014652779086, + "loss": 1.2781, + "step": 4985 + }, + { + "epoch": 0.3333778727952966, + "grad_norm": 0.1923828125, + "learning_rate": 0.00016861536755560956, + "loss": 1.242, + "step": 4990 + }, + { + "epoch": 0.3337119187600214, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001685304952877714, + "loss": 1.1841, + "step": 4995 + }, + { + "epoch": 0.3340459647247461, + "grad_norm": 0.162109375, + "learning_rate": 0.0001684455298396764, + "loss": 1.2658, + "step": 5000 + }, + { + "epoch": 0.3343800106894709, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016836047132685132, + "loss": 1.3034, + "step": 5005 + }, + { + "epoch": 0.3347140566541956, + "grad_norm": 0.1640625, + "learning_rate": 0.0001682753198649493, + "loss": 1.1442, + "step": 5010 + }, + { + "epoch": 0.3350481026189204, + "grad_norm": 0.16796875, + "learning_rate": 0.00016819007556975003, + "loss": 1.2386, + "step": 5015 + }, + { + "epoch": 0.3353821485836451, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001681047385571594, + "loss": 1.3074, + "step": 5020 + }, + { + "epoch": 0.3357161945483699, + "grad_norm": 0.169921875, + "learning_rate": 0.0001680193089432092, + "loss": 1.215, + "step": 5025 + }, + { + "epoch": 0.3360502405130946, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016793378684405735, + "loss": 1.2023, + "step": 5030 + }, + { + "epoch": 0.33638428647781937, + "grad_norm": 0.16015625, + "learning_rate": 0.00016784817237598744, + "loss": 1.1826, + "step": 5035 + }, + { + "epoch": 0.3367183324425441, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001677624656554086, + "loss": 1.2126, + "step": 5040 + }, + { + "epoch": 0.3370523784072688, + "grad_norm": 0.16015625, + "learning_rate": 0.00016767666679885546, + "loss": 1.2062, + "step": 5045 + }, + { + "epoch": 0.33738642437199357, + "grad_norm": 0.1640625, + "learning_rate": 0.00016759077592298788, + "loss": 1.2673, + "step": 5050 + }, + { + "epoch": 0.3377204703367183, + "grad_norm": 0.16796875, + "learning_rate": 0.00016750479314459087, + "loss": 1.2311, + "step": 5055 + }, + { + "epoch": 0.33805451630144306, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016741871858057437, + "loss": 1.1168, + "step": 5060 + }, + { + "epoch": 0.3383885622661678, + "grad_norm": 0.1865234375, + "learning_rate": 0.00016733255234797318, + "loss": 1.1359, + "step": 5065 + }, + { + "epoch": 0.33872260823089256, + "grad_norm": 0.171875, + "learning_rate": 0.00016724629456394666, + "loss": 1.1967, + "step": 5070 + }, + { + "epoch": 0.3390566541956173, + "grad_norm": 0.169921875, + "learning_rate": 0.0001671599453457787, + "loss": 1.2424, + "step": 5075 + }, + { + "epoch": 0.33939070016034206, + "grad_norm": 0.1611328125, + "learning_rate": 0.00016707350481087754, + "loss": 1.1751, + "step": 5080 + }, + { + "epoch": 0.3397247461250668, + "grad_norm": 0.1728515625, + "learning_rate": 0.00016698697307677548, + "loss": 1.1932, + "step": 5085 + }, + { + "epoch": 0.34005879208979156, + "grad_norm": 0.189453125, + "learning_rate": 0.00016690035026112893, + "loss": 1.2002, + "step": 5090 + }, + { + "epoch": 0.3403928380545163, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001668136364817181, + "loss": 1.2033, + "step": 5095 + }, + { + "epoch": 0.34072688401924106, + "grad_norm": 0.189453125, + "learning_rate": 0.0001667268318564469, + "loss": 1.1139, + "step": 5100 + }, + { + "epoch": 0.3410609299839658, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001666399365033427, + "loss": 1.2557, + "step": 5105 + }, + { + "epoch": 0.34139497594869056, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016655295054055633, + "loss": 1.1648, + "step": 5110 + }, + { + "epoch": 0.3417290219134153, + "grad_norm": 0.1884765625, + "learning_rate": 0.0001664658740863617, + "loss": 1.2161, + "step": 5115 + }, + { + "epoch": 0.34206306787814006, + "grad_norm": 0.158203125, + "learning_rate": 0.00016637870725915593, + "loss": 1.2772, + "step": 5120 + }, + { + "epoch": 0.34239711384286475, + "grad_norm": 0.1640625, + "learning_rate": 0.00016629145017745878, + "loss": 1.2373, + "step": 5125 + }, + { + "epoch": 0.3427311598075895, + "grad_norm": 0.173828125, + "learning_rate": 0.000166204102959913, + "loss": 1.1672, + "step": 5130 + }, + { + "epoch": 0.34306520577231425, + "grad_norm": 0.1787109375, + "learning_rate": 0.00016611666572528372, + "loss": 1.2734, + "step": 5135 + }, + { + "epoch": 0.343399251737039, + "grad_norm": 0.20703125, + "learning_rate": 0.00016602913859245847, + "loss": 1.2255, + "step": 5140 + }, + { + "epoch": 0.34373329770176375, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001659415216804471, + "loss": 1.2051, + "step": 5145 + }, + { + "epoch": 0.3440673436664885, + "grad_norm": 0.1787109375, + "learning_rate": 0.00016585381510838144, + "loss": 1.203, + "step": 5150 + }, + { + "epoch": 0.34440138963121325, + "grad_norm": 0.166015625, + "learning_rate": 0.00016576601899551534, + "loss": 1.1811, + "step": 5155 + }, + { + "epoch": 0.344735435595938, + "grad_norm": 0.1708984375, + "learning_rate": 0.00016567813346122427, + "loss": 1.2409, + "step": 5160 + }, + { + "epoch": 0.34506948156066275, + "grad_norm": 0.162109375, + "learning_rate": 0.00016559015862500538, + "loss": 1.1553, + "step": 5165 + }, + { + "epoch": 0.3454035275253875, + "grad_norm": 0.1806640625, + "learning_rate": 0.00016550209460647718, + "loss": 1.2214, + "step": 5170 + }, + { + "epoch": 0.34573757349011225, + "grad_norm": 0.173828125, + "learning_rate": 0.00016541394152537952, + "loss": 1.2735, + "step": 5175 + }, + { + "epoch": 0.346071619454837, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016532569950157325, + "loss": 1.2076, + "step": 5180 + }, + { + "epoch": 0.34640566541956175, + "grad_norm": 0.1708984375, + "learning_rate": 0.00016523736865504021, + "loss": 1.1431, + "step": 5185 + }, + { + "epoch": 0.3467397113842865, + "grad_norm": 0.18359375, + "learning_rate": 0.00016514894910588305, + "loss": 1.2222, + "step": 5190 + }, + { + "epoch": 0.34707375734901125, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016506044097432495, + "loss": 1.1915, + "step": 5195 + }, + { + "epoch": 0.347407803313736, + "grad_norm": 0.1708984375, + "learning_rate": 0.00016497184438070956, + "loss": 1.2848, + "step": 5200 + }, + { + "epoch": 0.3477418492784607, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001648831594455008, + "loss": 1.1968, + "step": 5205 + }, + { + "epoch": 0.34807589524318544, + "grad_norm": 0.1982421875, + "learning_rate": 0.00016479438628928277, + "loss": 1.2443, + "step": 5210 + }, + { + "epoch": 0.3484099412079102, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016470552503275947, + "loss": 1.1879, + "step": 5215 + }, + { + "epoch": 0.34874398717263494, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001646165757967547, + "loss": 1.1934, + "step": 5220 + }, + { + "epoch": 0.3490780331373597, + "grad_norm": 0.1640625, + "learning_rate": 0.00016452753870221183, + "loss": 1.2376, + "step": 5225 + }, + { + "epoch": 0.34941207910208444, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001644384138701938, + "loss": 1.2077, + "step": 5230 + }, + { + "epoch": 0.3497461250668092, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016434920142188278, + "loss": 1.2144, + "step": 5235 + }, + { + "epoch": 0.35008017103153394, + "grad_norm": 0.16015625, + "learning_rate": 0.00016425990147858003, + "loss": 1.2423, + "step": 5240 + }, + { + "epoch": 0.3504142169962587, + "grad_norm": 0.1728515625, + "learning_rate": 0.00016417051416170594, + "loss": 1.1769, + "step": 5245 + }, + { + "epoch": 0.35074826296098344, + "grad_norm": 0.1640625, + "learning_rate": 0.00016408103959279945, + "loss": 1.1844, + "step": 5250 + }, + { + "epoch": 0.3510823089257082, + "grad_norm": 0.169921875, + "learning_rate": 0.00016399147789351837, + "loss": 1.1612, + "step": 5255 + }, + { + "epoch": 0.35141635489043294, + "grad_norm": 0.173828125, + "learning_rate": 0.00016390182918563887, + "loss": 1.2393, + "step": 5260 + }, + { + "epoch": 0.3517504008551577, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001638120935910554, + "loss": 1.187, + "step": 5265 + }, + { + "epoch": 0.35208444681988244, + "grad_norm": 0.177734375, + "learning_rate": 0.00016372227123178057, + "loss": 1.1734, + "step": 5270 + }, + { + "epoch": 0.3524184927846072, + "grad_norm": 0.1640625, + "learning_rate": 0.00016363236222994505, + "loss": 1.1329, + "step": 5275 + }, + { + "epoch": 0.35275253874933193, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001635423667077972, + "loss": 1.2187, + "step": 5280 + }, + { + "epoch": 0.35308658471405663, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001634522847877031, + "loss": 1.2226, + "step": 5285 + }, + { + "epoch": 0.3534206306787814, + "grad_norm": 0.166015625, + "learning_rate": 0.00016336211659214621, + "loss": 1.1586, + "step": 5290 + }, + { + "epoch": 0.3537546766435061, + "grad_norm": 0.1767578125, + "learning_rate": 0.00016327186224372747, + "loss": 1.1645, + "step": 5295 + }, + { + "epoch": 0.3540887226082309, + "grad_norm": 0.162109375, + "learning_rate": 0.00016318152186516472, + "loss": 1.2019, + "step": 5300 + }, + { + "epoch": 0.3544227685729556, + "grad_norm": 0.1689453125, + "learning_rate": 0.00016309109557929302, + "loss": 1.2287, + "step": 5305 + }, + { + "epoch": 0.3547568145376804, + "grad_norm": 0.1689453125, + "learning_rate": 0.00016300058350906404, + "loss": 1.1239, + "step": 5310 + }, + { + "epoch": 0.3550908605024051, + "grad_norm": 0.17578125, + "learning_rate": 0.00016290998577754622, + "loss": 1.1858, + "step": 5315 + }, + { + "epoch": 0.3554249064671299, + "grad_norm": 0.16796875, + "learning_rate": 0.00016281930250792442, + "loss": 1.2656, + "step": 5320 + }, + { + "epoch": 0.3557589524318546, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016272853382349979, + "loss": 1.1655, + "step": 5325 + }, + { + "epoch": 0.3560929983965794, + "grad_norm": 0.166015625, + "learning_rate": 0.00016263767984768965, + "loss": 1.2054, + "step": 5330 + }, + { + "epoch": 0.3564270443613041, + "grad_norm": 0.166015625, + "learning_rate": 0.00016254674070402731, + "loss": 1.1843, + "step": 5335 + }, + { + "epoch": 0.3567610903260289, + "grad_norm": 0.173828125, + "learning_rate": 0.0001624557165161618, + "loss": 1.2551, + "step": 5340 + }, + { + "epoch": 0.3570951362907536, + "grad_norm": 0.177734375, + "learning_rate": 0.00016236460740785784, + "loss": 1.179, + "step": 5345 + }, + { + "epoch": 0.3574291822554784, + "grad_norm": 0.169921875, + "learning_rate": 0.00016227341350299568, + "loss": 1.2391, + "step": 5350 + }, + { + "epoch": 0.3577632282202031, + "grad_norm": 0.17578125, + "learning_rate": 0.00016218213492557072, + "loss": 1.1763, + "step": 5355 + }, + { + "epoch": 0.35809727418492787, + "grad_norm": 0.1767578125, + "learning_rate": 0.00016209077179969356, + "loss": 1.1736, + "step": 5360 + }, + { + "epoch": 0.35843132014965257, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016199932424958984, + "loss": 1.1226, + "step": 5365 + }, + { + "epoch": 0.3587653661143773, + "grad_norm": 0.162109375, + "learning_rate": 0.00016190779239959988, + "loss": 1.1357, + "step": 5370 + }, + { + "epoch": 0.35909941207910206, + "grad_norm": 0.2080078125, + "learning_rate": 0.00016181617637417862, + "loss": 1.232, + "step": 5375 + }, + { + "epoch": 0.3594334580438268, + "grad_norm": 0.1796875, + "learning_rate": 0.00016172447629789555, + "loss": 1.1751, + "step": 5380 + }, + { + "epoch": 0.35976750400855156, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016163269229543437, + "loss": 1.2229, + "step": 5385 + }, + { + "epoch": 0.3601015499732763, + "grad_norm": 0.16015625, + "learning_rate": 0.00016154082449159284, + "loss": 1.2067, + "step": 5390 + }, + { + "epoch": 0.36043559593800106, + "grad_norm": 0.17578125, + "learning_rate": 0.00016144887301128283, + "loss": 1.2289, + "step": 5395 + }, + { + "epoch": 0.3607696419027258, + "grad_norm": 0.173828125, + "learning_rate": 0.00016135683797952982, + "loss": 1.1391, + "step": 5400 + }, + { + "epoch": 0.36110368786745056, + "grad_norm": 0.1796875, + "learning_rate": 0.00016126471952147297, + "loss": 1.2043, + "step": 5405 + }, + { + "epoch": 0.3614377338321753, + "grad_norm": 0.162109375, + "learning_rate": 0.00016117251776236492, + "loss": 1.2456, + "step": 5410 + }, + { + "epoch": 0.36177177979690006, + "grad_norm": 0.19921875, + "learning_rate": 0.00016108023282757143, + "loss": 1.3066, + "step": 5415 + }, + { + "epoch": 0.3621058257616248, + "grad_norm": 0.166015625, + "learning_rate": 0.00016098786484257147, + "loss": 1.1601, + "step": 5420 + }, + { + "epoch": 0.36243987172634956, + "grad_norm": 0.1640625, + "learning_rate": 0.00016089541393295696, + "loss": 1.2307, + "step": 5425 + }, + { + "epoch": 0.3627739176910743, + "grad_norm": 0.1875, + "learning_rate": 0.00016080288022443241, + "loss": 1.2327, + "step": 5430 + }, + { + "epoch": 0.36310796365579906, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001607102638428151, + "loss": 1.1261, + "step": 5435 + }, + { + "epoch": 0.3634420096205238, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016061756491403463, + "loss": 1.1447, + "step": 5440 + }, + { + "epoch": 0.3637760555852485, + "grad_norm": 0.1630859375, + "learning_rate": 0.00016052478356413282, + "loss": 1.2484, + "step": 5445 + }, + { + "epoch": 0.36411010154997325, + "grad_norm": 0.1640625, + "learning_rate": 0.00016043191991926356, + "loss": 1.2338, + "step": 5450 + }, + { + "epoch": 0.364444147514698, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001603389741056927, + "loss": 1.1808, + "step": 5455 + }, + { + "epoch": 0.36477819347942275, + "grad_norm": 0.1630859375, + "learning_rate": 0.00016024594624979775, + "loss": 1.2165, + "step": 5460 + }, + { + "epoch": 0.3651122394441475, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001601528364780678, + "loss": 1.1568, + "step": 5465 + }, + { + "epoch": 0.36544628540887225, + "grad_norm": 0.166015625, + "learning_rate": 0.00016005964491710328, + "loss": 1.2476, + "step": 5470 + }, + { + "epoch": 0.365780331373597, + "grad_norm": 0.1806640625, + "learning_rate": 0.00015996637169361593, + "loss": 1.2275, + "step": 5475 + }, + { + "epoch": 0.36611437733832175, + "grad_norm": 0.169921875, + "learning_rate": 0.00015987301693442838, + "loss": 1.1998, + "step": 5480 + }, + { + "epoch": 0.3664484233030465, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015977958076647428, + "loss": 1.2051, + "step": 5485 + }, + { + "epoch": 0.36678246926777125, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001596860633167978, + "loss": 1.2561, + "step": 5490 + }, + { + "epoch": 0.367116515232496, + "grad_norm": 0.162109375, + "learning_rate": 0.0001595924647125538, + "loss": 1.1694, + "step": 5495 + }, + { + "epoch": 0.36745056119722075, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015949878508100733, + "loss": 1.2307, + "step": 5500 + }, + { + "epoch": 0.3677846071619455, + "grad_norm": 0.162109375, + "learning_rate": 0.00015940502454953376, + "loss": 1.2637, + "step": 5505 + }, + { + "epoch": 0.36811865312667025, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015931118324561837, + "loss": 1.1909, + "step": 5510 + }, + { + "epoch": 0.368452699091395, + "grad_norm": 0.1708984375, + "learning_rate": 0.00015921726129685624, + "loss": 1.2717, + "step": 5515 + }, + { + "epoch": 0.36878674505611975, + "grad_norm": 0.1552734375, + "learning_rate": 0.00015912325883095217, + "loss": 1.138, + "step": 5520 + }, + { + "epoch": 0.36912079102084444, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001590291759757204, + "loss": 1.1574, + "step": 5525 + }, + { + "epoch": 0.3694548369855692, + "grad_norm": 0.17578125, + "learning_rate": 0.00015893501285908448, + "loss": 1.2592, + "step": 5530 + }, + { + "epoch": 0.36978888295029394, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015884076960907711, + "loss": 1.2071, + "step": 5535 + }, + { + "epoch": 0.3701229289150187, + "grad_norm": 0.162109375, + "learning_rate": 0.00015874644635383996, + "loss": 1.249, + "step": 5540 + }, + { + "epoch": 0.37045697487974344, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015865204322162337, + "loss": 1.1818, + "step": 5545 + }, + { + "epoch": 0.3707910208444682, + "grad_norm": 0.162109375, + "learning_rate": 0.00015855756034078647, + "loss": 1.2327, + "step": 5550 + }, + { + "epoch": 0.37112506680919294, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015846299783979669, + "loss": 1.2458, + "step": 5555 + }, + { + "epoch": 0.3714591127739177, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001583683558472297, + "loss": 1.2344, + "step": 5560 + }, + { + "epoch": 0.37179315873864244, + "grad_norm": 0.15625, + "learning_rate": 0.0001582736344917694, + "loss": 1.1543, + "step": 5565 + }, + { + "epoch": 0.3721272047033672, + "grad_norm": 0.177734375, + "learning_rate": 0.00015817883390220746, + "loss": 1.223, + "step": 5570 + }, + { + "epoch": 0.37246125066809194, + "grad_norm": 0.1767578125, + "learning_rate": 0.00015808395420744334, + "loss": 1.1616, + "step": 5575 + }, + { + "epoch": 0.3727952966328167, + "grad_norm": 0.171875, + "learning_rate": 0.00015798899553648403, + "loss": 1.1649, + "step": 5580 + }, + { + "epoch": 0.37312934259754144, + "grad_norm": 0.1806640625, + "learning_rate": 0.00015789395801844397, + "loss": 1.1598, + "step": 5585 + }, + { + "epoch": 0.3734633885622662, + "grad_norm": 0.1767578125, + "learning_rate": 0.00015779884178254472, + "loss": 1.1505, + "step": 5590 + }, + { + "epoch": 0.37379743452699093, + "grad_norm": 0.1826171875, + "learning_rate": 0.00015770364695811493, + "loss": 1.2566, + "step": 5595 + }, + { + "epoch": 0.3741314804917157, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001576083736745901, + "loss": 1.1727, + "step": 5600 + }, + { + "epoch": 0.3744655264564404, + "grad_norm": 0.1640625, + "learning_rate": 0.00015751302206151236, + "loss": 1.2458, + "step": 5605 + }, + { + "epoch": 0.37479957242116513, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001574175922485304, + "loss": 1.1991, + "step": 5610 + }, + { + "epoch": 0.3751336183858899, + "grad_norm": 0.177734375, + "learning_rate": 0.00015732208436539927, + "loss": 1.2558, + "step": 5615 + }, + { + "epoch": 0.3754676643506146, + "grad_norm": 0.166015625, + "learning_rate": 0.00015722649854198005, + "loss": 1.1166, + "step": 5620 + }, + { + "epoch": 0.3758017103153394, + "grad_norm": 0.173828125, + "learning_rate": 0.0001571308349082399, + "loss": 1.152, + "step": 5625 + }, + { + "epoch": 0.3761357562800641, + "grad_norm": 0.169921875, + "learning_rate": 0.00015703509359425176, + "loss": 1.2202, + "step": 5630 + }, + { + "epoch": 0.3764698022447889, + "grad_norm": 0.166015625, + "learning_rate": 0.00015693927473019417, + "loss": 1.2383, + "step": 5635 + }, + { + "epoch": 0.3768038482095136, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001568433784463511, + "loss": 1.1833, + "step": 5640 + }, + { + "epoch": 0.3771378941742384, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001567474048731118, + "loss": 1.1411, + "step": 5645 + }, + { + "epoch": 0.3774719401389631, + "grad_norm": 0.171875, + "learning_rate": 0.00015665135414097065, + "loss": 1.1714, + "step": 5650 + }, + { + "epoch": 0.3778059861036879, + "grad_norm": 0.16796875, + "learning_rate": 0.0001565552263805269, + "loss": 1.2308, + "step": 5655 + }, + { + "epoch": 0.3781400320684126, + "grad_norm": 0.166015625, + "learning_rate": 0.00015645902172248453, + "loss": 1.2254, + "step": 5660 + }, + { + "epoch": 0.3784740780331374, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015636274029765207, + "loss": 1.2096, + "step": 5665 + }, + { + "epoch": 0.3788081239978621, + "grad_norm": 0.1845703125, + "learning_rate": 0.00015626638223694252, + "loss": 1.2076, + "step": 5670 + }, + { + "epoch": 0.3791421699625869, + "grad_norm": 0.181640625, + "learning_rate": 0.00015616994767137294, + "loss": 1.2172, + "step": 5675 + }, + { + "epoch": 0.3794762159273116, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001560734367320645, + "loss": 1.2691, + "step": 5680 + }, + { + "epoch": 0.37981026189203637, + "grad_norm": 0.2119140625, + "learning_rate": 0.00015597684955024222, + "loss": 1.1919, + "step": 5685 + }, + { + "epoch": 0.38014430785676107, + "grad_norm": 0.162109375, + "learning_rate": 0.00015588018625723477, + "loss": 1.2075, + "step": 5690 + }, + { + "epoch": 0.3804783538214858, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015578344698447428, + "loss": 1.1873, + "step": 5695 + }, + { + "epoch": 0.38081239978621056, + "grad_norm": 0.169921875, + "learning_rate": 0.0001556866318634962, + "loss": 1.2008, + "step": 5700 + }, + { + "epoch": 0.3811464457509353, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015558974102593913, + "loss": 1.1994, + "step": 5705 + }, + { + "epoch": 0.38148049171566006, + "grad_norm": 0.193359375, + "learning_rate": 0.0001554927746035446, + "loss": 1.241, + "step": 5710 + }, + { + "epoch": 0.3818145376803848, + "grad_norm": 0.162109375, + "learning_rate": 0.00015539573272815697, + "loss": 1.2538, + "step": 5715 + }, + { + "epoch": 0.38214858364510956, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015529861553172314, + "loss": 1.1708, + "step": 5720 + }, + { + "epoch": 0.3824826296098343, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015520142314629239, + "loss": 1.1842, + "step": 5725 + }, + { + "epoch": 0.38281667557455906, + "grad_norm": 0.1796875, + "learning_rate": 0.00015510415570401626, + "loss": 1.1753, + "step": 5730 + }, + { + "epoch": 0.3831507215392838, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001550068133371484, + "loss": 1.2631, + "step": 5735 + }, + { + "epoch": 0.38348476750400856, + "grad_norm": 0.173828125, + "learning_rate": 0.0001549093961780443, + "loss": 1.2291, + "step": 5740 + }, + { + "epoch": 0.3838188134687333, + "grad_norm": 0.166015625, + "learning_rate": 0.0001548119043591611, + "loss": 1.228, + "step": 5745 + }, + { + "epoch": 0.38415285943345806, + "grad_norm": 0.2578125, + "learning_rate": 0.00015471433801305756, + "loss": 1.1866, + "step": 5750 + }, + { + "epoch": 0.3844869053981828, + "grad_norm": 0.173828125, + "learning_rate": 0.00015461669727239363, + "loss": 1.1626, + "step": 5755 + }, + { + "epoch": 0.38482095136290756, + "grad_norm": 0.162109375, + "learning_rate": 0.0001545189822699305, + "loss": 1.277, + "step": 5760 + }, + { + "epoch": 0.3851549973276323, + "grad_norm": 0.16015625, + "learning_rate": 0.00015442119313853033, + "loss": 1.219, + "step": 5765 + }, + { + "epoch": 0.385489043292357, + "grad_norm": 0.162109375, + "learning_rate": 0.0001543233300111561, + "loss": 1.2212, + "step": 5770 + }, + { + "epoch": 0.38582308925708175, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001542253930208713, + "loss": 1.2303, + "step": 5775 + }, + { + "epoch": 0.3861571352218065, + "grad_norm": 0.166015625, + "learning_rate": 0.00015412738230083993, + "loss": 1.2315, + "step": 5780 + }, + { + "epoch": 0.38649118118653125, + "grad_norm": 0.18359375, + "learning_rate": 0.00015402929798432629, + "loss": 1.224, + "step": 5785 + }, + { + "epoch": 0.386825227151256, + "grad_norm": 0.15625, + "learning_rate": 0.00015393114020469462, + "loss": 1.1687, + "step": 5790 + }, + { + "epoch": 0.38715927311598075, + "grad_norm": 0.17578125, + "learning_rate": 0.0001538329090954091, + "loss": 1.1959, + "step": 5795 + }, + { + "epoch": 0.3874933190807055, + "grad_norm": 0.1796875, + "learning_rate": 0.0001537346047900337, + "loss": 1.2247, + "step": 5800 + }, + { + "epoch": 0.38782736504543025, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015363622742223175, + "loss": 1.2288, + "step": 5805 + }, + { + "epoch": 0.388161411010155, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001535377771257661, + "loss": 1.2688, + "step": 5810 + }, + { + "epoch": 0.38849545697487975, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001534392540344986, + "loss": 1.1998, + "step": 5815 + }, + { + "epoch": 0.3888295029396045, + "grad_norm": 0.16796875, + "learning_rate": 0.00015334065828239023, + "loss": 1.1662, + "step": 5820 + }, + { + "epoch": 0.38916354890432925, + "grad_norm": 0.162109375, + "learning_rate": 0.00015324199000350062, + "loss": 1.2877, + "step": 5825 + }, + { + "epoch": 0.389497594869054, + "grad_norm": 0.173828125, + "learning_rate": 0.00015314324933198806, + "loss": 1.2341, + "step": 5830 + }, + { + "epoch": 0.38983164083377875, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001530444364021094, + "loss": 1.1665, + "step": 5835 + }, + { + "epoch": 0.3901656867985035, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015294555134821956, + "loss": 1.2566, + "step": 5840 + }, + { + "epoch": 0.39049973276322825, + "grad_norm": 0.166015625, + "learning_rate": 0.0001528465943047716, + "loss": 1.1577, + "step": 5845 + }, + { + "epoch": 0.39083377872795294, + "grad_norm": 0.1728515625, + "learning_rate": 0.00015274756540631644, + "loss": 1.1293, + "step": 5850 + }, + { + "epoch": 0.3911678246926777, + "grad_norm": 0.1796875, + "learning_rate": 0.00015264846478750278, + "loss": 1.1906, + "step": 5855 + }, + { + "epoch": 0.39150187065740244, + "grad_norm": 0.1845703125, + "learning_rate": 0.00015254929258307678, + "loss": 1.2896, + "step": 5860 + }, + { + "epoch": 0.3918359166221272, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001524500489278819, + "loss": 1.1557, + "step": 5865 + }, + { + "epoch": 0.39216996258685194, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015235073395685877, + "loss": 1.1846, + "step": 5870 + }, + { + "epoch": 0.3925040085515767, + "grad_norm": 0.1708984375, + "learning_rate": 0.00015225134780504505, + "loss": 1.2353, + "step": 5875 + }, + { + "epoch": 0.39283805451630144, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015215189060757507, + "loss": 1.1962, + "step": 5880 + }, + { + "epoch": 0.3931721004810262, + "grad_norm": 0.1767578125, + "learning_rate": 0.00015205236249967995, + "loss": 1.1898, + "step": 5885 + }, + { + "epoch": 0.39350614644575094, + "grad_norm": 0.1708984375, + "learning_rate": 0.000151952763616687, + "loss": 1.2293, + "step": 5890 + }, + { + "epoch": 0.3938401924104757, + "grad_norm": 0.1669921875, + "learning_rate": 0.00015185309409401985, + "loss": 1.1423, + "step": 5895 + }, + { + "epoch": 0.39417423837520044, + "grad_norm": 0.16015625, + "learning_rate": 0.00015175335406719827, + "loss": 1.1552, + "step": 5900 + }, + { + "epoch": 0.3945082843399252, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015165354367183777, + "loss": 1.1193, + "step": 5905 + }, + { + "epoch": 0.39484233030464994, + "grad_norm": 0.1591796875, + "learning_rate": 0.00015155366304364962, + "loss": 1.1415, + "step": 5910 + }, + { + "epoch": 0.3951763762693747, + "grad_norm": 0.17578125, + "learning_rate": 0.00015145371231844047, + "loss": 1.1684, + "step": 5915 + }, + { + "epoch": 0.39551042223409943, + "grad_norm": 0.177734375, + "learning_rate": 0.00015135369163211252, + "loss": 1.1763, + "step": 5920 + }, + { + "epoch": 0.3958444681988242, + "grad_norm": 0.1669921875, + "learning_rate": 0.00015125360112066275, + "loss": 1.11, + "step": 5925 + }, + { + "epoch": 0.3961785141635489, + "grad_norm": 0.169921875, + "learning_rate": 0.0001511534409201834, + "loss": 1.1879, + "step": 5930 + }, + { + "epoch": 0.39651256012827363, + "grad_norm": 0.1669921875, + "learning_rate": 0.00015105321116686132, + "loss": 1.2056, + "step": 5935 + }, + { + "epoch": 0.3968466060929984, + "grad_norm": 0.166015625, + "learning_rate": 0.00015095291199697784, + "loss": 1.187, + "step": 5940 + }, + { + "epoch": 0.3971806520577231, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001508525435469089, + "loss": 1.2731, + "step": 5945 + }, + { + "epoch": 0.3975146980224479, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015075210595312448, + "loss": 1.1658, + "step": 5950 + }, + { + "epoch": 0.3978487439871726, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001506515993521886, + "loss": 1.154, + "step": 5955 + }, + { + "epoch": 0.3981827899518974, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001505510238807591, + "loss": 1.195, + "step": 5960 + }, + { + "epoch": 0.3985168359166221, + "grad_norm": 0.185546875, + "learning_rate": 0.00015045037967558754, + "loss": 1.2198, + "step": 5965 + }, + { + "epoch": 0.3988508818813469, + "grad_norm": 0.1767578125, + "learning_rate": 0.00015034966687351884, + "loss": 1.1808, + "step": 5970 + }, + { + "epoch": 0.3991849278460716, + "grad_norm": 0.166015625, + "learning_rate": 0.00015024888561149125, + "loss": 1.1722, + "step": 5975 + }, + { + "epoch": 0.3995189738107964, + "grad_norm": 0.17578125, + "learning_rate": 0.00015014803602653607, + "loss": 1.1929, + "step": 5980 + }, + { + "epoch": 0.3998530197755211, + "grad_norm": 0.171875, + "learning_rate": 0.0001500471182557775, + "loss": 1.1078, + "step": 5985 + }, + { + "epoch": 0.4001870657402459, + "grad_norm": 0.158203125, + "learning_rate": 0.00014994613243643248, + "loss": 1.193, + "step": 5990 + }, + { + "epoch": 0.4005211117049706, + "grad_norm": 0.1728515625, + "learning_rate": 0.00014984507870581046, + "loss": 1.1894, + "step": 5995 + }, + { + "epoch": 0.40085515766969537, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014974395720131328, + "loss": 1.2523, + "step": 6000 + }, + { + "epoch": 0.4011892036344201, + "grad_norm": 0.2119140625, + "learning_rate": 0.00014964276806043477, + "loss": 1.2658, + "step": 6005 + }, + { + "epoch": 0.4015232495991448, + "grad_norm": 0.169921875, + "learning_rate": 0.0001495415114207609, + "loss": 1.2058, + "step": 6010 + }, + { + "epoch": 0.40185729556386957, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001494401874199694, + "loss": 1.1817, + "step": 6015 + }, + { + "epoch": 0.4021913415285943, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014933879619582943, + "loss": 1.1867, + "step": 6020 + }, + { + "epoch": 0.40252538749331906, + "grad_norm": 0.1513671875, + "learning_rate": 0.00014923733788620175, + "loss": 1.2503, + "step": 6025 + }, + { + "epoch": 0.4028594334580438, + "grad_norm": 0.16796875, + "learning_rate": 0.0001491358126290382, + "loss": 1.2104, + "step": 6030 + }, + { + "epoch": 0.40319347942276856, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001490342205623817, + "loss": 1.2023, + "step": 6035 + }, + { + "epoch": 0.4035275253874933, + "grad_norm": 0.1708984375, + "learning_rate": 0.00014893256182436609, + "loss": 1.154, + "step": 6040 + }, + { + "epoch": 0.40386157135221806, + "grad_norm": 0.1591796875, + "learning_rate": 0.00014883083655321567, + "loss": 1.0871, + "step": 6045 + }, + { + "epoch": 0.4041956173169428, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014872904488724535, + "loss": 1.1745, + "step": 6050 + }, + { + "epoch": 0.40452966328166756, + "grad_norm": 0.162109375, + "learning_rate": 0.0001486271869648603, + "loss": 1.2076, + "step": 6055 + }, + { + "epoch": 0.4048637092463923, + "grad_norm": 0.1640625, + "learning_rate": 0.00014852526292455576, + "loss": 1.2039, + "step": 6060 + }, + { + "epoch": 0.40519775521111706, + "grad_norm": 0.17578125, + "learning_rate": 0.00014842327290491688, + "loss": 1.1722, + "step": 6065 + }, + { + "epoch": 0.4055318011758418, + "grad_norm": 0.1748046875, + "learning_rate": 0.00014832121704461848, + "loss": 1.1733, + "step": 6070 + }, + { + "epoch": 0.40586584714056656, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014821909548242497, + "loss": 1.2495, + "step": 6075 + }, + { + "epoch": 0.4061998931052913, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014811690835718998, + "loss": 1.1307, + "step": 6080 + }, + { + "epoch": 0.40653393907001606, + "grad_norm": 0.162109375, + "learning_rate": 0.00014801465580785648, + "loss": 1.2543, + "step": 6085 + }, + { + "epoch": 0.40686798503474075, + "grad_norm": 0.166015625, + "learning_rate": 0.00014791233797345618, + "loss": 1.1583, + "step": 6090 + }, + { + "epoch": 0.4072020309994655, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014780995499310973, + "loss": 1.2537, + "step": 6095 + }, + { + "epoch": 0.40753607696419025, + "grad_norm": 0.17578125, + "learning_rate": 0.00014770750700602623, + "loss": 1.26, + "step": 6100 + }, + { + "epoch": 0.407870122928915, + "grad_norm": 0.185546875, + "learning_rate": 0.00014760499415150327, + "loss": 1.2814, + "step": 6105 + }, + { + "epoch": 0.40820416889363975, + "grad_norm": 0.162109375, + "learning_rate": 0.00014750241656892653, + "loss": 1.1918, + "step": 6110 + }, + { + "epoch": 0.4085382148583645, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014739977439776983, + "loss": 1.2569, + "step": 6115 + }, + { + "epoch": 0.40887226082308925, + "grad_norm": 0.166015625, + "learning_rate": 0.00014729706777759474, + "loss": 1.2649, + "step": 6120 + }, + { + "epoch": 0.409206306787814, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014719429684805041, + "loss": 1.2275, + "step": 6125 + }, + { + "epoch": 0.40954035275253875, + "grad_norm": 0.1884765625, + "learning_rate": 0.00014709146174887356, + "loss": 1.2521, + "step": 6130 + }, + { + "epoch": 0.4098743987172635, + "grad_norm": 0.193359375, + "learning_rate": 0.00014698856261988804, + "loss": 1.2256, + "step": 6135 + }, + { + "epoch": 0.41020844468198825, + "grad_norm": 0.177734375, + "learning_rate": 0.00014688559960100483, + "loss": 1.1749, + "step": 6140 + }, + { + "epoch": 0.410542490646713, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001467825728322217, + "loss": 1.1549, + "step": 6145 + }, + { + "epoch": 0.41087653661143775, + "grad_norm": 0.1708984375, + "learning_rate": 0.00014667948245362329, + "loss": 1.2315, + "step": 6150 + }, + { + "epoch": 0.4112105825761625, + "grad_norm": 0.18359375, + "learning_rate": 0.00014657632860538047, + "loss": 1.1947, + "step": 6155 + }, + { + "epoch": 0.41154462854088725, + "grad_norm": 0.1875, + "learning_rate": 0.00014647311142775056, + "loss": 1.2204, + "step": 6160 + }, + { + "epoch": 0.411878674505612, + "grad_norm": 0.1787109375, + "learning_rate": 0.00014636983106107703, + "loss": 1.2413, + "step": 6165 + }, + { + "epoch": 0.4122127204703367, + "grad_norm": 0.1708984375, + "learning_rate": 0.00014626648764578916, + "loss": 1.1824, + "step": 6170 + }, + { + "epoch": 0.41254676643506144, + "grad_norm": 0.17578125, + "learning_rate": 0.000146163081322402, + "loss": 1.2953, + "step": 6175 + }, + { + "epoch": 0.4128808123997862, + "grad_norm": 0.2021484375, + "learning_rate": 0.00014605961223151614, + "loss": 1.1684, + "step": 6180 + }, + { + "epoch": 0.41321485836451094, + "grad_norm": 0.171875, + "learning_rate": 0.00014595608051381752, + "loss": 1.1756, + "step": 6185 + }, + { + "epoch": 0.4135489043292357, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001458524863100772, + "loss": 1.2342, + "step": 6190 + }, + { + "epoch": 0.41388295029396044, + "grad_norm": 0.158203125, + "learning_rate": 0.00014574882976115124, + "loss": 1.2005, + "step": 6195 + }, + { + "epoch": 0.4142169962586852, + "grad_norm": 0.16796875, + "learning_rate": 0.00014564511100798044, + "loss": 1.2599, + "step": 6200 + }, + { + "epoch": 0.41455104222340994, + "grad_norm": 0.1708984375, + "learning_rate": 0.00014554133019159022, + "loss": 1.2165, + "step": 6205 + }, + { + "epoch": 0.4148850881881347, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014543748745309034, + "loss": 1.1746, + "step": 6210 + }, + { + "epoch": 0.41521913415285944, + "grad_norm": 0.1826171875, + "learning_rate": 0.00014533358293367481, + "loss": 1.2066, + "step": 6215 + }, + { + "epoch": 0.4155531801175842, + "grad_norm": 0.169921875, + "learning_rate": 0.00014522961677462153, + "loss": 1.1775, + "step": 6220 + }, + { + "epoch": 0.41588722608230894, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001451255891172924, + "loss": 1.1441, + "step": 6225 + }, + { + "epoch": 0.4162212720470337, + "grad_norm": 0.169921875, + "learning_rate": 0.0001450215001031327, + "loss": 1.2242, + "step": 6230 + }, + { + "epoch": 0.41655531801175844, + "grad_norm": 0.1640625, + "learning_rate": 0.00014491734987367137, + "loss": 1.2, + "step": 6235 + }, + { + "epoch": 0.4168893639764832, + "grad_norm": 0.162109375, + "learning_rate": 0.00014481313857052044, + "loss": 1.2246, + "step": 6240 + }, + { + "epoch": 0.41722340994120793, + "grad_norm": 0.1787109375, + "learning_rate": 0.00014470886633537498, + "loss": 1.1961, + "step": 6245 + }, + { + "epoch": 0.4175574559059327, + "grad_norm": 0.1982421875, + "learning_rate": 0.000144604533310013, + "loss": 1.2179, + "step": 6250 + }, + { + "epoch": 0.4178915018706574, + "grad_norm": 0.1640625, + "learning_rate": 0.00014450013963629508, + "loss": 1.1481, + "step": 6255 + }, + { + "epoch": 0.4182255478353821, + "grad_norm": 0.1748046875, + "learning_rate": 0.00014439568545616437, + "loss": 1.1695, + "step": 6260 + }, + { + "epoch": 0.4185595938001069, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001442911709116461, + "loss": 1.1626, + "step": 6265 + }, + { + "epoch": 0.4188936397648316, + "grad_norm": 0.16796875, + "learning_rate": 0.0001441865961448478, + "loss": 1.1622, + "step": 6270 + }, + { + "epoch": 0.4192276857295564, + "grad_norm": 0.173828125, + "learning_rate": 0.0001440819612979587, + "loss": 1.2383, + "step": 6275 + }, + { + "epoch": 0.4195617316942811, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014397726651324983, + "loss": 1.2379, + "step": 6280 + }, + { + "epoch": 0.4198957776590059, + "grad_norm": 0.169921875, + "learning_rate": 0.00014387251193307367, + "loss": 1.1498, + "step": 6285 + }, + { + "epoch": 0.4202298236237306, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014376769769986405, + "loss": 1.2617, + "step": 6290 + }, + { + "epoch": 0.4205638695884554, + "grad_norm": 0.1767578125, + "learning_rate": 0.00014366282395613587, + "loss": 1.3093, + "step": 6295 + }, + { + "epoch": 0.4208979155531801, + "grad_norm": 0.173828125, + "learning_rate": 0.0001435578908444849, + "loss": 1.1134, + "step": 6300 + }, + { + "epoch": 0.4212319615179049, + "grad_norm": 0.1767578125, + "learning_rate": 0.00014345289850758777, + "loss": 1.2403, + "step": 6305 + }, + { + "epoch": 0.4215660074826296, + "grad_norm": 0.1728515625, + "learning_rate": 0.00014334784708820144, + "loss": 1.1466, + "step": 6310 + }, + { + "epoch": 0.4219000534473544, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014324273672916343, + "loss": 1.1237, + "step": 6315 + }, + { + "epoch": 0.4222340994120791, + "grad_norm": 0.1611328125, + "learning_rate": 0.00014313756757339122, + "loss": 1.2737, + "step": 6320 + }, + { + "epoch": 0.42256814537680387, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014303233976388236, + "loss": 1.1712, + "step": 6325 + }, + { + "epoch": 0.4229021913415286, + "grad_norm": 0.19140625, + "learning_rate": 0.00014292705344371402, + "loss": 1.1599, + "step": 6330 + }, + { + "epoch": 0.4232362373062533, + "grad_norm": 0.16796875, + "learning_rate": 0.00014282170875604307, + "loss": 1.163, + "step": 6335 + }, + { + "epoch": 0.42357028327097807, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014271630584410558, + "loss": 1.1546, + "step": 6340 + }, + { + "epoch": 0.4239043292357028, + "grad_norm": 0.16796875, + "learning_rate": 0.00014261084485121697, + "loss": 1.232, + "step": 6345 + }, + { + "epoch": 0.42423837520042756, + "grad_norm": 0.1640625, + "learning_rate": 0.00014250532592077148, + "loss": 1.2507, + "step": 6350 + }, + { + "epoch": 0.4245724211651523, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014239974919624224, + "loss": 1.216, + "step": 6355 + }, + { + "epoch": 0.42490646712987706, + "grad_norm": 0.162109375, + "learning_rate": 0.00014229411482118083, + "loss": 1.2722, + "step": 6360 + }, + { + "epoch": 0.4252405130946018, + "grad_norm": 0.1748046875, + "learning_rate": 0.00014218842293921738, + "loss": 1.1772, + "step": 6365 + }, + { + "epoch": 0.42557455905932656, + "grad_norm": 0.1982421875, + "learning_rate": 0.00014208267369406012, + "loss": 1.2117, + "step": 6370 + }, + { + "epoch": 0.4259086050240513, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001419768672294952, + "loss": 1.1975, + "step": 6375 + }, + { + "epoch": 0.42624265098877606, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014187100368938678, + "loss": 1.1905, + "step": 6380 + }, + { + "epoch": 0.4265766969535008, + "grad_norm": 0.171875, + "learning_rate": 0.00014176508321767637, + "loss": 1.1521, + "step": 6385 + }, + { + "epoch": 0.42691074291822556, + "grad_norm": 0.1572265625, + "learning_rate": 0.00014165910595838313, + "loss": 1.2083, + "step": 6390 + }, + { + "epoch": 0.4272447888829503, + "grad_norm": 0.1708984375, + "learning_rate": 0.00014155307205560323, + "loss": 1.2467, + "step": 6395 + }, + { + "epoch": 0.42757883484767506, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014144698165351, + "loss": 1.2489, + "step": 6400 + }, + { + "epoch": 0.4279128808123998, + "grad_norm": 0.173828125, + "learning_rate": 0.00014134083489635355, + "loss": 1.2708, + "step": 6405 + }, + { + "epoch": 0.42824692677712456, + "grad_norm": 0.1728515625, + "learning_rate": 0.00014123463192846058, + "loss": 1.1689, + "step": 6410 + }, + { + "epoch": 0.42858097274184925, + "grad_norm": 0.158203125, + "learning_rate": 0.00014112837289423426, + "loss": 1.1623, + "step": 6415 + }, + { + "epoch": 0.428915018706574, + "grad_norm": 0.169921875, + "learning_rate": 0.00014102205793815398, + "loss": 1.1605, + "step": 6420 + }, + { + "epoch": 0.42924906467129875, + "grad_norm": 0.169921875, + "learning_rate": 0.00014091568720477518, + "loss": 1.288, + "step": 6425 + }, + { + "epoch": 0.4295831106360235, + "grad_norm": 0.1806640625, + "learning_rate": 0.00014080926083872907, + "loss": 1.1916, + "step": 6430 + }, + { + "epoch": 0.42991715660074825, + "grad_norm": 0.169921875, + "learning_rate": 0.00014070277898472263, + "loss": 1.2131, + "step": 6435 + }, + { + "epoch": 0.430251202565473, + "grad_norm": 0.1982421875, + "learning_rate": 0.00014059624178753817, + "loss": 1.1224, + "step": 6440 + }, + { + "epoch": 0.43058524853019775, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001404896493920333, + "loss": 1.244, + "step": 6445 + }, + { + "epoch": 0.4309192944949225, + "grad_norm": 0.15625, + "learning_rate": 0.0001403830019431407, + "loss": 1.232, + "step": 6450 + }, + { + "epoch": 0.43125334045964725, + "grad_norm": 0.166015625, + "learning_rate": 0.00014027629958586788, + "loss": 1.2099, + "step": 6455 + }, + { + "epoch": 0.431587386424372, + "grad_norm": 0.1640625, + "learning_rate": 0.00014016954246529696, + "loss": 1.229, + "step": 6460 + }, + { + "epoch": 0.43192143238909675, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014006273072658462, + "loss": 1.239, + "step": 6465 + }, + { + "epoch": 0.4322554783538215, + "grad_norm": 0.2255859375, + "learning_rate": 0.00013995586451496177, + "loss": 1.2018, + "step": 6470 + }, + { + "epoch": 0.43258952431854625, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001398489439757333, + "loss": 1.1465, + "step": 6475 + }, + { + "epoch": 0.432923570283271, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013974196925427816, + "loss": 1.2251, + "step": 6480 + }, + { + "epoch": 0.43325761624799575, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013963494049604871, + "loss": 1.2515, + "step": 6485 + }, + { + "epoch": 0.4335916622127205, + "grad_norm": 0.1748046875, + "learning_rate": 0.00013952785784657106, + "loss": 1.2291, + "step": 6490 + }, + { + "epoch": 0.4339257081774452, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001394207214514444, + "loss": 1.2463, + "step": 6495 + }, + { + "epoch": 0.43425975414216994, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013931353145634102, + "loss": 1.152, + "step": 6500 + }, + { + "epoch": 0.4345938001068947, + "grad_norm": 0.16796875, + "learning_rate": 0.0001392062880070062, + "loss": 1.2804, + "step": 6505 + }, + { + "epoch": 0.43492784607161944, + "grad_norm": 0.17578125, + "learning_rate": 0.00013909899124925774, + "loss": 1.1706, + "step": 6510 + }, + { + "epoch": 0.4352618920363442, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001389916413289861, + "loss": 1.1676, + "step": 6515 + }, + { + "epoch": 0.43559593800106894, + "grad_norm": 0.1826171875, + "learning_rate": 0.00013888423839215395, + "loss": 1.1549, + "step": 6520 + }, + { + "epoch": 0.4359299839657937, + "grad_norm": 0.177734375, + "learning_rate": 0.0001387767825847959, + "loss": 1.2364, + "step": 6525 + }, + { + "epoch": 0.43626402993051844, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001386692740530187, + "loss": 1.244, + "step": 6530 + }, + { + "epoch": 0.4365980758952432, + "grad_norm": 0.177734375, + "learning_rate": 0.00013856171294300066, + "loss": 1.1887, + "step": 6535 + }, + { + "epoch": 0.43693212185996794, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013845409940099152, + "loss": 1.0936, + "step": 6540 + }, + { + "epoch": 0.4372661678246927, + "grad_norm": 0.166015625, + "learning_rate": 0.00013834643357331245, + "loss": 1.2448, + "step": 6545 + }, + { + "epoch": 0.43760021378941744, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001382387156063556, + "loss": 1.1776, + "step": 6550 + }, + { + "epoch": 0.4379342597541422, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001381309456465841, + "loss": 1.2031, + "step": 6555 + }, + { + "epoch": 0.43826830571886694, + "grad_norm": 0.16015625, + "learning_rate": 0.0001380231238405317, + "loss": 1.2349, + "step": 6560 + }, + { + "epoch": 0.4386023516835917, + "grad_norm": 0.177734375, + "learning_rate": 0.00013791525033480268, + "loss": 1.1826, + "step": 6565 + }, + { + "epoch": 0.43893639764831643, + "grad_norm": 0.1611328125, + "learning_rate": 0.00013780732527607156, + "loss": 1.2273, + "step": 6570 + }, + { + "epoch": 0.43927044361304113, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013769934881108312, + "loss": 1.1712, + "step": 6575 + }, + { + "epoch": 0.4396044895777659, + "grad_norm": 0.177734375, + "learning_rate": 0.00013759132108665182, + "loss": 1.1598, + "step": 6580 + }, + { + "epoch": 0.4399385355424906, + "grad_norm": 0.1708984375, + "learning_rate": 0.000137483242249662, + "loss": 1.1881, + "step": 6585 + }, + { + "epoch": 0.4402725815072154, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013737511244706733, + "loss": 1.1738, + "step": 6590 + }, + { + "epoch": 0.4406066274719401, + "grad_norm": 0.1611328125, + "learning_rate": 0.00013726693182589093, + "loss": 1.2403, + "step": 6595 + }, + { + "epoch": 0.4409406734366649, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013715870053322492, + "loss": 1.1563, + "step": 6600 + }, + { + "epoch": 0.4412747194013896, + "grad_norm": 0.19140625, + "learning_rate": 0.0001370504187162304, + "loss": 1.1397, + "step": 6605 + }, + { + "epoch": 0.4416087653661144, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013694208652213703, + "loss": 1.1837, + "step": 6610 + }, + { + "epoch": 0.4419428113308391, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013683370409824317, + "loss": 1.196, + "step": 6615 + }, + { + "epoch": 0.4422768572955639, + "grad_norm": 0.1767578125, + "learning_rate": 0.00013672527159191525, + "loss": 1.2161, + "step": 6620 + }, + { + "epoch": 0.4426109032602886, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013661678915058797, + "loss": 1.2789, + "step": 6625 + }, + { + "epoch": 0.4429449492250134, + "grad_norm": 0.171875, + "learning_rate": 0.00013650825692176387, + "loss": 1.2474, + "step": 6630 + }, + { + "epoch": 0.4432789951897381, + "grad_norm": 0.1748046875, + "learning_rate": 0.00013639967505301313, + "loss": 1.2226, + "step": 6635 + }, + { + "epoch": 0.4436130411544629, + "grad_norm": 0.1748046875, + "learning_rate": 0.00013629104369197351, + "loss": 1.2119, + "step": 6640 + }, + { + "epoch": 0.4439470871191876, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013618236298635003, + "loss": 1.1872, + "step": 6645 + }, + { + "epoch": 0.44428113308391237, + "grad_norm": 0.197265625, + "learning_rate": 0.0001360736330839148, + "loss": 1.2023, + "step": 6650 + }, + { + "epoch": 0.44461517904863707, + "grad_norm": 0.171875, + "learning_rate": 0.00013596485413250683, + "loss": 1.1973, + "step": 6655 + }, + { + "epoch": 0.4449492250133618, + "grad_norm": 0.169921875, + "learning_rate": 0.0001358560262800318, + "loss": 1.1778, + "step": 6660 + }, + { + "epoch": 0.44528327097808656, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013574714967446192, + "loss": 1.1713, + "step": 6665 + }, + { + "epoch": 0.4456173169428113, + "grad_norm": 0.1640625, + "learning_rate": 0.00013563822446383564, + "loss": 1.2628, + "step": 6670 + }, + { + "epoch": 0.44595136290753606, + "grad_norm": 0.16796875, + "learning_rate": 0.00013552925079625755, + "loss": 1.2013, + "step": 6675 + }, + { + "epoch": 0.4462854088722608, + "grad_norm": 0.177734375, + "learning_rate": 0.00013542022881989803, + "loss": 1.2194, + "step": 6680 + }, + { + "epoch": 0.44661945483698556, + "grad_norm": 0.1640625, + "learning_rate": 0.00013531115868299336, + "loss": 1.1687, + "step": 6685 + }, + { + "epoch": 0.4469535008017103, + "grad_norm": 0.1650390625, + "learning_rate": 0.000135202040533845, + "loss": 1.1385, + "step": 6690 + }, + { + "epoch": 0.44728754676643506, + "grad_norm": 0.1806640625, + "learning_rate": 0.00013509287452081995, + "loss": 1.225, + "step": 6695 + }, + { + "epoch": 0.4476215927311598, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013498366079235015, + "loss": 1.2127, + "step": 6700 + }, + { + "epoch": 0.44795563869588456, + "grad_norm": 0.166015625, + "learning_rate": 0.0001348743994969325, + "loss": 1.2052, + "step": 6705 + }, + { + "epoch": 0.4482896846606093, + "grad_norm": 0.1953125, + "learning_rate": 0.00013476509078312845, + "loss": 1.2059, + "step": 6710 + }, + { + "epoch": 0.44862373062533406, + "grad_norm": 0.193359375, + "learning_rate": 0.0001346557347995641, + "loss": 1.1859, + "step": 6715 + }, + { + "epoch": 0.4489577765900588, + "grad_norm": 0.1640625, + "learning_rate": 0.00013454633169492967, + "loss": 1.1002, + "step": 6720 + }, + { + "epoch": 0.44929182255478356, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013443688161797953, + "loss": 1.1503, + "step": 6725 + }, + { + "epoch": 0.4496258685195083, + "grad_norm": 0.158203125, + "learning_rate": 0.00013432738471753195, + "loss": 1.1687, + "step": 6730 + }, + { + "epoch": 0.449959914484233, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013421784114246873, + "loss": 1.2168, + "step": 6735 + }, + { + "epoch": 0.45029396044895775, + "grad_norm": 0.158203125, + "learning_rate": 0.00013410825104173528, + "loss": 1.2876, + "step": 6740 + }, + { + "epoch": 0.4506280064136825, + "grad_norm": 0.16015625, + "learning_rate": 0.00013399861456434017, + "loss": 1.2127, + "step": 6745 + }, + { + "epoch": 0.45096205237840725, + "grad_norm": 0.255859375, + "learning_rate": 0.00013388893185935512, + "loss": 1.229, + "step": 6750 + }, + { + "epoch": 0.451296098343132, + "grad_norm": 0.1640625, + "learning_rate": 0.00013377920307591453, + "loss": 1.2186, + "step": 6755 + }, + { + "epoch": 0.45163014430785675, + "grad_norm": 0.162109375, + "learning_rate": 0.00013366942836321575, + "loss": 1.1846, + "step": 6760 + }, + { + "epoch": 0.4519641902725815, + "grad_norm": 0.177734375, + "learning_rate": 0.00013355960787051827, + "loss": 1.2155, + "step": 6765 + }, + { + "epoch": 0.45229823623730625, + "grad_norm": 0.1572265625, + "learning_rate": 0.000133449741747144, + "loss": 1.2372, + "step": 6770 + }, + { + "epoch": 0.452632282202031, + "grad_norm": 0.1904296875, + "learning_rate": 0.00013333983014247687, + "loss": 1.2568, + "step": 6775 + }, + { + "epoch": 0.45296632816675575, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001332298732059626, + "loss": 1.1603, + "step": 6780 + }, + { + "epoch": 0.4533003741314805, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001331198710871086, + "loss": 1.309, + "step": 6785 + }, + { + "epoch": 0.45363442009620525, + "grad_norm": 0.169921875, + "learning_rate": 0.00013300982393548368, + "loss": 1.1875, + "step": 6790 + }, + { + "epoch": 0.45396846606093, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013289973190071797, + "loss": 1.2092, + "step": 6795 + }, + { + "epoch": 0.45430251202565475, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013278959513250243, + "loss": 1.2222, + "step": 6800 + }, + { + "epoch": 0.4546365579903795, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001326794137805891, + "loss": 1.1878, + "step": 6805 + }, + { + "epoch": 0.45497060395510425, + "grad_norm": 0.173828125, + "learning_rate": 0.0001325691879947904, + "loss": 1.1878, + "step": 6810 + }, + { + "epoch": 0.45530464991982894, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001324589179249793, + "loss": 1.2343, + "step": 6815 + }, + { + "epoch": 0.4556386958845537, + "grad_norm": 0.1845703125, + "learning_rate": 0.000132348603721089, + "loss": 1.2661, + "step": 6820 + }, + { + "epoch": 0.45597274184927844, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013223824553311263, + "loss": 1.3178, + "step": 6825 + }, + { + "epoch": 0.4563067878140032, + "grad_norm": 0.1767578125, + "learning_rate": 0.00013212784351110312, + "loss": 1.1792, + "step": 6830 + }, + { + "epoch": 0.45664083377872794, + "grad_norm": 0.16796875, + "learning_rate": 0.00013201739780517311, + "loss": 1.1898, + "step": 6835 + }, + { + "epoch": 0.4569748797434527, + "grad_norm": 0.1875, + "learning_rate": 0.00013190690856549456, + "loss": 1.2211, + "step": 6840 + }, + { + "epoch": 0.45730892570817744, + "grad_norm": 0.162109375, + "learning_rate": 0.00013179637594229858, + "loss": 1.1925, + "step": 6845 + }, + { + "epoch": 0.4576429716729022, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013168580008587536, + "loss": 1.2215, + "step": 6850 + }, + { + "epoch": 0.45797701763762694, + "grad_norm": 0.171875, + "learning_rate": 0.0001315751811465738, + "loss": 1.1885, + "step": 6855 + }, + { + "epoch": 0.4583110636023517, + "grad_norm": 0.16796875, + "learning_rate": 0.00013146451927480146, + "loss": 1.221, + "step": 6860 + }, + { + "epoch": 0.45864510956707644, + "grad_norm": 0.173828125, + "learning_rate": 0.00013135381462102413, + "loss": 1.2224, + "step": 6865 + }, + { + "epoch": 0.4589791555318012, + "grad_norm": 0.1669921875, + "learning_rate": 0.000131243067335766, + "loss": 1.1484, + "step": 6870 + }, + { + "epoch": 0.45931320149652594, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013113227756960898, + "loss": 1.2461, + "step": 6875 + }, + { + "epoch": 0.4596472474612507, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013102144547319286, + "loss": 1.2175, + "step": 6880 + }, + { + "epoch": 0.45998129342597543, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013091057119721505, + "loss": 1.2129, + "step": 6885 + }, + { + "epoch": 0.4603153393907002, + "grad_norm": 0.1591796875, + "learning_rate": 0.00013079965489243015, + "loss": 1.1792, + "step": 6890 + }, + { + "epoch": 0.46064938535542493, + "grad_norm": 0.1640625, + "learning_rate": 0.00013068869670965008, + "loss": 1.195, + "step": 6895 + }, + { + "epoch": 0.46098343132014963, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013057769679974358, + "loss": 1.1676, + "step": 6900 + }, + { + "epoch": 0.4613174772848744, + "grad_norm": 0.16015625, + "learning_rate": 0.00013046665531363615, + "loss": 1.1518, + "step": 6905 + }, + { + "epoch": 0.4616515232495991, + "grad_norm": 0.1806640625, + "learning_rate": 0.00013035557240230982, + "loss": 1.2451, + "step": 6910 + }, + { + "epoch": 0.4619855692143239, + "grad_norm": 0.1953125, + "learning_rate": 0.00013024444821680304, + "loss": 1.2153, + "step": 6915 + }, + { + "epoch": 0.4623196151790486, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001301332829082102, + "loss": 1.2301, + "step": 6920 + }, + { + "epoch": 0.4626536611437734, + "grad_norm": 0.1591796875, + "learning_rate": 0.00013002207662768175, + "loss": 1.2094, + "step": 6925 + }, + { + "epoch": 0.4629877071084981, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001299108295264238, + "loss": 1.2514, + "step": 6930 + }, + { + "epoch": 0.4633217530732229, + "grad_norm": 0.1806640625, + "learning_rate": 0.00012979954175569797, + "loss": 1.2319, + "step": 6935 + }, + { + "epoch": 0.4636557990379476, + "grad_norm": 0.15625, + "learning_rate": 0.00012968821346682113, + "loss": 1.1269, + "step": 6940 + }, + { + "epoch": 0.4639898450026724, + "grad_norm": 0.16796875, + "learning_rate": 0.00012957684481116537, + "loss": 1.1569, + "step": 6945 + }, + { + "epoch": 0.4643238909673971, + "grad_norm": 0.197265625, + "learning_rate": 0.00012946543594015753, + "loss": 1.1848, + "step": 6950 + }, + { + "epoch": 0.4646579369321219, + "grad_norm": 0.1787109375, + "learning_rate": 0.00012935398700527915, + "loss": 1.1894, + "step": 6955 + }, + { + "epoch": 0.4649919828968466, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012924249815806632, + "loss": 1.2352, + "step": 6960 + }, + { + "epoch": 0.4653260288615714, + "grad_norm": 0.1748046875, + "learning_rate": 0.00012913096955010937, + "loss": 1.2918, + "step": 6965 + }, + { + "epoch": 0.4656600748262961, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012901940133305267, + "loss": 1.2738, + "step": 6970 + }, + { + "epoch": 0.46599412079102087, + "grad_norm": 0.1826171875, + "learning_rate": 0.00012890779365859443, + "loss": 1.0624, + "step": 6975 + }, + { + "epoch": 0.46632816675574557, + "grad_norm": 0.185546875, + "learning_rate": 0.00012879614667848655, + "loss": 1.2576, + "step": 6980 + }, + { + "epoch": 0.4666622127204703, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012868446054453434, + "loss": 1.2957, + "step": 6985 + }, + { + "epoch": 0.46699625868519506, + "grad_norm": 0.171875, + "learning_rate": 0.00012857273540859643, + "loss": 1.1472, + "step": 6990 + }, + { + "epoch": 0.4673303046499198, + "grad_norm": 0.205078125, + "learning_rate": 0.0001284609714225843, + "loss": 1.1081, + "step": 6995 + }, + { + "epoch": 0.46766435061464456, + "grad_norm": 0.1796875, + "learning_rate": 0.00012834916873846245, + "loss": 1.2406, + "step": 7000 + }, + { + "epoch": 0.4679983965793693, + "grad_norm": 0.1669921875, + "learning_rate": 0.00012823732750824794, + "loss": 1.2261, + "step": 7005 + }, + { + "epoch": 0.46833244254409406, + "grad_norm": 0.228515625, + "learning_rate": 0.00012812544788401014, + "loss": 1.2323, + "step": 7010 + }, + { + "epoch": 0.4686664885088188, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012801353001787072, + "loss": 1.1986, + "step": 7015 + }, + { + "epoch": 0.46900053447354356, + "grad_norm": 0.166015625, + "learning_rate": 0.0001279015740620034, + "loss": 1.1924, + "step": 7020 + }, + { + "epoch": 0.4693345804382683, + "grad_norm": 0.1708984375, + "learning_rate": 0.00012778958016863357, + "loss": 1.204, + "step": 7025 + }, + { + "epoch": 0.46966862640299306, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001276775484900382, + "loss": 1.206, + "step": 7030 + }, + { + "epoch": 0.4700026723677178, + "grad_norm": 0.18359375, + "learning_rate": 0.00012756547917854578, + "loss": 1.2013, + "step": 7035 + }, + { + "epoch": 0.47033671833244256, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001274533723865358, + "loss": 1.2634, + "step": 7040 + }, + { + "epoch": 0.4706707642971673, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012734122826643884, + "loss": 1.302, + "step": 7045 + }, + { + "epoch": 0.47100481026189206, + "grad_norm": 0.1708984375, + "learning_rate": 0.00012722904697073616, + "loss": 1.2133, + "step": 7050 + }, + { + "epoch": 0.4713388562266168, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012711682865195964, + "loss": 1.1755, + "step": 7055 + }, + { + "epoch": 0.4716729021913415, + "grad_norm": 0.166015625, + "learning_rate": 0.00012700457346269137, + "loss": 1.2221, + "step": 7060 + }, + { + "epoch": 0.47200694815606625, + "grad_norm": 0.1845703125, + "learning_rate": 0.00012689228155556373, + "loss": 1.2812, + "step": 7065 + }, + { + "epoch": 0.472340994120791, + "grad_norm": 0.16015625, + "learning_rate": 0.00012677995308325887, + "loss": 1.2279, + "step": 7070 + }, + { + "epoch": 0.47267504008551575, + "grad_norm": 0.17578125, + "learning_rate": 0.0001266675881985088, + "loss": 1.1915, + "step": 7075 + }, + { + "epoch": 0.4730090860502405, + "grad_norm": 0.1640625, + "learning_rate": 0.00012655518705409496, + "loss": 1.187, + "step": 7080 + }, + { + "epoch": 0.47334313201496525, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012644274980284806, + "loss": 1.1714, + "step": 7085 + }, + { + "epoch": 0.47367717797969, + "grad_norm": 0.22265625, + "learning_rate": 0.00012633027659764804, + "loss": 1.2443, + "step": 7090 + }, + { + "epoch": 0.47401122394441475, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012621776759142356, + "loss": 1.2601, + "step": 7095 + }, + { + "epoch": 0.4743452699091395, + "grad_norm": 0.177734375, + "learning_rate": 0.0001261052229371521, + "loss": 1.2128, + "step": 7100 + }, + { + "epoch": 0.47467931587386425, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012599264278785952, + "loss": 1.252, + "step": 7105 + }, + { + "epoch": 0.475013361838589, + "grad_norm": 0.173828125, + "learning_rate": 0.00012588002729661994, + "loss": 1.1306, + "step": 7110 + }, + { + "epoch": 0.47534740780331375, + "grad_norm": 0.166015625, + "learning_rate": 0.00012576737661655559, + "loss": 1.2137, + "step": 7115 + }, + { + "epoch": 0.4756814537680385, + "grad_norm": 0.166015625, + "learning_rate": 0.00012565469090083651, + "loss": 1.2275, + "step": 7120 + }, + { + "epoch": 0.47601549973276325, + "grad_norm": 0.166015625, + "learning_rate": 0.0001255419703026804, + "loss": 1.1858, + "step": 7125 + }, + { + "epoch": 0.476349545697488, + "grad_norm": 0.1669921875, + "learning_rate": 0.00012542921497535233, + "loss": 1.1734, + "step": 7130 + }, + { + "epoch": 0.47668359166221275, + "grad_norm": 0.17578125, + "learning_rate": 0.00012531642507216474, + "loss": 1.1865, + "step": 7135 + }, + { + "epoch": 0.47701763762693744, + "grad_norm": 0.1669921875, + "learning_rate": 0.00012520360074647687, + "loss": 1.2398, + "step": 7140 + }, + { + "epoch": 0.4773516835916622, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012509074215169493, + "loss": 1.2366, + "step": 7145 + }, + { + "epoch": 0.47768572955638694, + "grad_norm": 0.166015625, + "learning_rate": 0.00012497784944127166, + "loss": 1.1808, + "step": 7150 + }, + { + "epoch": 0.4780197755211117, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012486492276870615, + "loss": 1.1907, + "step": 7155 + }, + { + "epoch": 0.47835382148583644, + "grad_norm": 0.1806640625, + "learning_rate": 0.00012475196228754374, + "loss": 1.1947, + "step": 7160 + }, + { + "epoch": 0.4786878674505612, + "grad_norm": 0.1640625, + "learning_rate": 0.00012463896815137582, + "loss": 1.1483, + "step": 7165 + }, + { + "epoch": 0.47902191341528594, + "grad_norm": 0.1865234375, + "learning_rate": 0.00012452594051383923, + "loss": 1.1198, + "step": 7170 + }, + { + "epoch": 0.4793559593800107, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012441287952861673, + "loss": 1.2169, + "step": 7175 + }, + { + "epoch": 0.47969000534473544, + "grad_norm": 0.17578125, + "learning_rate": 0.00012429978534943617, + "loss": 1.2445, + "step": 7180 + }, + { + "epoch": 0.4800240513094602, + "grad_norm": 0.1787109375, + "learning_rate": 0.00012418665813007066, + "loss": 1.1686, + "step": 7185 + }, + { + "epoch": 0.48035809727418494, + "grad_norm": 0.1748046875, + "learning_rate": 0.00012407349802433818, + "loss": 1.2286, + "step": 7190 + }, + { + "epoch": 0.4806921432389097, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012396030518610143, + "loss": 1.2835, + "step": 7195 + }, + { + "epoch": 0.48102618920363444, + "grad_norm": 0.171875, + "learning_rate": 0.00012384707976926767, + "loss": 1.2347, + "step": 7200 + }, + { + "epoch": 0.4813602351683592, + "grad_norm": 0.169921875, + "learning_rate": 0.00012373382192778834, + "loss": 1.1946, + "step": 7205 + }, + { + "epoch": 0.48169428113308393, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012362053181565912, + "loss": 1.1201, + "step": 7210 + }, + { + "epoch": 0.4820283270978087, + "grad_norm": 0.1708984375, + "learning_rate": 0.00012350720958691943, + "loss": 1.1542, + "step": 7215 + }, + { + "epoch": 0.4823623730625334, + "grad_norm": 0.185546875, + "learning_rate": 0.00012339385539565244, + "loss": 1.1681, + "step": 7220 + }, + { + "epoch": 0.48269641902725813, + "grad_norm": 0.173828125, + "learning_rate": 0.00012328046939598475, + "loss": 1.2992, + "step": 7225 + }, + { + "epoch": 0.4830304649919829, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012316705174208624, + "loss": 1.1038, + "step": 7230 + }, + { + "epoch": 0.4833645109567076, + "grad_norm": 0.255859375, + "learning_rate": 0.00012305360258816977, + "loss": 1.141, + "step": 7235 + }, + { + "epoch": 0.4836985569214324, + "grad_norm": 0.1591796875, + "learning_rate": 0.00012294012208849105, + "loss": 1.1671, + "step": 7240 + }, + { + "epoch": 0.4840326028861571, + "grad_norm": 0.154296875, + "learning_rate": 0.00012282661039734848, + "loss": 1.2421, + "step": 7245 + }, + { + "epoch": 0.4843666488508819, + "grad_norm": 0.166015625, + "learning_rate": 0.0001227130676690828, + "loss": 1.2105, + "step": 7250 + }, + { + "epoch": 0.4847006948156066, + "grad_norm": 0.173828125, + "learning_rate": 0.00012259949405807696, + "loss": 1.1635, + "step": 7255 + }, + { + "epoch": 0.4850347407803314, + "grad_norm": 0.16796875, + "learning_rate": 0.00012248588971875587, + "loss": 1.2874, + "step": 7260 + }, + { + "epoch": 0.4853687867450561, + "grad_norm": 0.166015625, + "learning_rate": 0.00012237225480558637, + "loss": 1.1925, + "step": 7265 + }, + { + "epoch": 0.4857028327097809, + "grad_norm": 0.173828125, + "learning_rate": 0.00012225858947307664, + "loss": 1.208, + "step": 7270 + }, + { + "epoch": 0.4860368786745056, + "grad_norm": 0.162109375, + "learning_rate": 0.0001221448938757764, + "loss": 1.2509, + "step": 7275 + }, + { + "epoch": 0.4863709246392304, + "grad_norm": 0.169921875, + "learning_rate": 0.00012203116816827646, + "loss": 1.1224, + "step": 7280 + }, + { + "epoch": 0.4867049706039551, + "grad_norm": 0.16796875, + "learning_rate": 0.00012191741250520861, + "loss": 1.1747, + "step": 7285 + }, + { + "epoch": 0.48703901656867987, + "grad_norm": 0.16796875, + "learning_rate": 0.00012180362704124523, + "loss": 1.1535, + "step": 7290 + }, + { + "epoch": 0.4873730625334046, + "grad_norm": 0.166015625, + "learning_rate": 0.00012168981193109945, + "loss": 1.213, + "step": 7295 + }, + { + "epoch": 0.4877071084981293, + "grad_norm": 0.19140625, + "learning_rate": 0.00012157596732952448, + "loss": 1.1681, + "step": 7300 + }, + { + "epoch": 0.48804115446285407, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012146209339131376, + "loss": 1.1547, + "step": 7305 + }, + { + "epoch": 0.4883752004275788, + "grad_norm": 0.203125, + "learning_rate": 0.00012134819027130062, + "loss": 1.1921, + "step": 7310 + }, + { + "epoch": 0.48870924639230356, + "grad_norm": 0.1640625, + "learning_rate": 0.00012123425812435803, + "loss": 1.2203, + "step": 7315 + }, + { + "epoch": 0.4890432923570283, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012112029710539842, + "loss": 1.1512, + "step": 7320 + }, + { + "epoch": 0.48937733832175306, + "grad_norm": 0.181640625, + "learning_rate": 0.00012100630736937348, + "loss": 1.192, + "step": 7325 + }, + { + "epoch": 0.4897113842864778, + "grad_norm": 0.173828125, + "learning_rate": 0.00012089228907127403, + "loss": 1.1679, + "step": 7330 + }, + { + "epoch": 0.49004543025120256, + "grad_norm": 0.1591796875, + "learning_rate": 0.00012077824236612958, + "loss": 1.2825, + "step": 7335 + }, + { + "epoch": 0.4903794762159273, + "grad_norm": 0.1845703125, + "learning_rate": 0.00012066416740900836, + "loss": 1.1705, + "step": 7340 + }, + { + "epoch": 0.49071352218065206, + "grad_norm": 0.1669921875, + "learning_rate": 0.00012055006435501703, + "loss": 1.1813, + "step": 7345 + }, + { + "epoch": 0.4910475681453768, + "grad_norm": 0.18359375, + "learning_rate": 0.0001204359333593004, + "loss": 1.2041, + "step": 7350 + }, + { + "epoch": 0.49138161411010156, + "grad_norm": 0.16796875, + "learning_rate": 0.00012032177457704124, + "loss": 1.2289, + "step": 7355 + }, + { + "epoch": 0.4917156600748263, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012020758816346023, + "loss": 1.1914, + "step": 7360 + }, + { + "epoch": 0.49204970603955106, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012009337427381549, + "loss": 1.1316, + "step": 7365 + }, + { + "epoch": 0.4923837520042758, + "grad_norm": 0.1533203125, + "learning_rate": 0.00011997913306340257, + "loss": 1.1829, + "step": 7370 + }, + { + "epoch": 0.49271779796900056, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011986486468755412, + "loss": 1.2015, + "step": 7375 + }, + { + "epoch": 0.49305184393372525, + "grad_norm": 0.1591796875, + "learning_rate": 0.00011975056930163978, + "loss": 1.169, + "step": 7380 + }, + { + "epoch": 0.49338588989845, + "grad_norm": 0.25390625, + "learning_rate": 0.00011963624706106589, + "loss": 1.2223, + "step": 7385 + }, + { + "epoch": 0.49371993586317475, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001195218981212753, + "loss": 1.2101, + "step": 7390 + }, + { + "epoch": 0.4940539818278995, + "grad_norm": 0.173828125, + "learning_rate": 0.00011940752263774717, + "loss": 1.278, + "step": 7395 + }, + { + "epoch": 0.49438802779262425, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001192931207659967, + "loss": 1.173, + "step": 7400 + }, + { + "epoch": 0.494722073757349, + "grad_norm": 0.1572265625, + "learning_rate": 0.00011917869266157513, + "loss": 1.1914, + "step": 7405 + }, + { + "epoch": 0.49505611972207375, + "grad_norm": 0.15234375, + "learning_rate": 0.00011906423848006913, + "loss": 1.2184, + "step": 7410 + }, + { + "epoch": 0.4953901656867985, + "grad_norm": 0.1689453125, + "learning_rate": 0.000118949758377101, + "loss": 1.3098, + "step": 7415 + }, + { + "epoch": 0.49572421165152325, + "grad_norm": 0.1845703125, + "learning_rate": 0.00011883525250832828, + "loss": 1.2174, + "step": 7420 + }, + { + "epoch": 0.496058257616248, + "grad_norm": 0.1630859375, + "learning_rate": 0.00011872072102944346, + "loss": 1.2241, + "step": 7425 + }, + { + "epoch": 0.49639230358097275, + "grad_norm": 0.1591796875, + "learning_rate": 0.00011860616409617386, + "loss": 1.1798, + "step": 7430 + }, + { + "epoch": 0.4967263495456975, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001184915818642815, + "loss": 1.1564, + "step": 7435 + }, + { + "epoch": 0.49706039551042225, + "grad_norm": 0.158203125, + "learning_rate": 0.00011837697448956275, + "loss": 1.2022, + "step": 7440 + }, + { + "epoch": 0.497394441475147, + "grad_norm": 0.1640625, + "learning_rate": 0.0001182623421278481, + "loss": 1.1389, + "step": 7445 + }, + { + "epoch": 0.49772848743987175, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011814768493500213, + "loss": 1.1532, + "step": 7450 + }, + { + "epoch": 0.4980625334045965, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011803300306692306, + "loss": 1.1738, + "step": 7455 + }, + { + "epoch": 0.4983965793693212, + "grad_norm": 0.162109375, + "learning_rate": 0.00011791829667954277, + "loss": 1.243, + "step": 7460 + }, + { + "epoch": 0.49873062533404594, + "grad_norm": 0.177734375, + "learning_rate": 0.00011780356592882645, + "loss": 1.1898, + "step": 7465 + }, + { + "epoch": 0.4990646712987707, + "grad_norm": 0.169921875, + "learning_rate": 0.00011768881097077238, + "loss": 1.1964, + "step": 7470 + }, + { + "epoch": 0.49939871726349544, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011757403196141172, + "loss": 1.1461, + "step": 7475 + }, + { + "epoch": 0.4997327632282202, + "grad_norm": 0.181640625, + "learning_rate": 0.00011745922905680849, + "loss": 1.2334, + "step": 7480 + }, + { + "epoch": 0.500066809192945, + "grad_norm": 0.169921875, + "learning_rate": 0.00011734440241305902, + "loss": 1.2266, + "step": 7485 + }, + { + "epoch": 0.5004008551576697, + "grad_norm": 0.1640625, + "learning_rate": 0.00011722955218629204, + "loss": 1.2048, + "step": 7490 + }, + { + "epoch": 0.5007349011223945, + "grad_norm": 0.169921875, + "learning_rate": 0.00011711467853266826, + "loss": 1.2477, + "step": 7495 + }, + { + "epoch": 0.5010689470871192, + "grad_norm": 0.16796875, + "learning_rate": 0.00011699978160838032, + "loss": 1.1918, + "step": 7500 + }, + { + "epoch": 0.5014029930518439, + "grad_norm": 0.177734375, + "learning_rate": 0.00011688486156965246, + "loss": 1.1969, + "step": 7505 + }, + { + "epoch": 0.5017370390165686, + "grad_norm": 0.1630859375, + "learning_rate": 0.00011676991857274035, + "loss": 1.2102, + "step": 7510 + }, + { + "epoch": 0.5020710849812934, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001166549527739309, + "loss": 1.2321, + "step": 7515 + }, + { + "epoch": 0.5024051309460181, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011653996432954192, + "loss": 1.1598, + "step": 7520 + }, + { + "epoch": 0.5027391769107429, + "grad_norm": 0.177734375, + "learning_rate": 0.00011642495339592224, + "loss": 1.2661, + "step": 7525 + }, + { + "epoch": 0.5030732228754676, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011630992012945099, + "loss": 1.1201, + "step": 7530 + }, + { + "epoch": 0.5034072688401924, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011619486468653785, + "loss": 1.2572, + "step": 7535 + }, + { + "epoch": 0.5037413148049171, + "grad_norm": 0.1796875, + "learning_rate": 0.00011607978722362258, + "loss": 1.1534, + "step": 7540 + }, + { + "epoch": 0.5040753607696419, + "grad_norm": 0.177734375, + "learning_rate": 0.00011596468789717494, + "loss": 1.1787, + "step": 7545 + }, + { + "epoch": 0.5044094067343666, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001158495668636944, + "loss": 1.1895, + "step": 7550 + }, + { + "epoch": 0.5047434526990914, + "grad_norm": 0.185546875, + "learning_rate": 0.00011573442427970982, + "loss": 1.2308, + "step": 7555 + }, + { + "epoch": 0.5050774986638161, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011561926030177957, + "loss": 1.183, + "step": 7560 + }, + { + "epoch": 0.5054115446285409, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011550407508649095, + "loss": 1.1378, + "step": 7565 + }, + { + "epoch": 0.5057455905932656, + "grad_norm": 0.1806640625, + "learning_rate": 0.00011538886879046023, + "loss": 1.2704, + "step": 7570 + }, + { + "epoch": 0.5060796365579904, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011527364157033227, + "loss": 1.0969, + "step": 7575 + }, + { + "epoch": 0.5064136825227151, + "grad_norm": 0.17578125, + "learning_rate": 0.00011515839358278045, + "loss": 1.2111, + "step": 7580 + }, + { + "epoch": 0.5067477284874399, + "grad_norm": 0.1796875, + "learning_rate": 0.0001150431249845063, + "loss": 1.2049, + "step": 7585 + }, + { + "epoch": 0.5070817744521646, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011492783593223948, + "loss": 1.2238, + "step": 7590 + }, + { + "epoch": 0.5074158204168894, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011481252658273737, + "loss": 1.1828, + "step": 7595 + }, + { + "epoch": 0.5077498663816141, + "grad_norm": 0.173828125, + "learning_rate": 0.00011469719709278503, + "loss": 1.1336, + "step": 7600 + }, + { + "epoch": 0.5080839123463389, + "grad_norm": 0.185546875, + "learning_rate": 0.00011458184761919481, + "loss": 1.2439, + "step": 7605 + }, + { + "epoch": 0.5084179583110636, + "grad_norm": 0.1787109375, + "learning_rate": 0.00011446647831880633, + "loss": 1.2414, + "step": 7610 + }, + { + "epoch": 0.5087520042757884, + "grad_norm": 0.171875, + "learning_rate": 0.00011435108934848608, + "loss": 1.1807, + "step": 7615 + }, + { + "epoch": 0.5090860502405131, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011423568086512737, + "loss": 1.1715, + "step": 7620 + }, + { + "epoch": 0.5094200962052379, + "grad_norm": 0.1591796875, + "learning_rate": 0.00011412025302564999, + "loss": 1.2178, + "step": 7625 + }, + { + "epoch": 0.5097541421699626, + "grad_norm": 0.158203125, + "learning_rate": 0.00011400480598700006, + "loss": 1.1287, + "step": 7630 + }, + { + "epoch": 0.5100881881346874, + "grad_norm": 0.177734375, + "learning_rate": 0.00011388933990614982, + "loss": 1.1457, + "step": 7635 + }, + { + "epoch": 0.5104222340994121, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001137738549400974, + "loss": 1.1473, + "step": 7640 + }, + { + "epoch": 0.5107562800641369, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011365835124586657, + "loss": 1.2109, + "step": 7645 + }, + { + "epoch": 0.5110903260288616, + "grad_norm": 0.1640625, + "learning_rate": 0.00011354282898050661, + "loss": 1.1932, + "step": 7650 + }, + { + "epoch": 0.5114243719935864, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011342728830109209, + "loss": 1.2119, + "step": 7655 + }, + { + "epoch": 0.5117584179583111, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011331172936472243, + "loss": 1.1444, + "step": 7660 + }, + { + "epoch": 0.5120924639230358, + "grad_norm": 0.16796875, + "learning_rate": 0.00011319615232852217, + "loss": 1.2466, + "step": 7665 + }, + { + "epoch": 0.5124265098877605, + "grad_norm": 0.169921875, + "learning_rate": 0.00011308055734964018, + "loss": 1.1991, + "step": 7670 + }, + { + "epoch": 0.5127605558524853, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011296494458524986, + "loss": 1.149, + "step": 7675 + }, + { + "epoch": 0.51309460181721, + "grad_norm": 0.169921875, + "learning_rate": 0.00011284931419254883, + "loss": 1.2253, + "step": 7680 + }, + { + "epoch": 0.5134286477819348, + "grad_norm": 0.166015625, + "learning_rate": 0.00011273366632875859, + "loss": 1.2265, + "step": 7685 + }, + { + "epoch": 0.5137626937466595, + "grad_norm": 0.1572265625, + "learning_rate": 0.00011261800115112441, + "loss": 1.2184, + "step": 7690 + }, + { + "epoch": 0.5140967397113843, + "grad_norm": 0.1640625, + "learning_rate": 0.00011250231881691517, + "loss": 1.1543, + "step": 7695 + }, + { + "epoch": 0.514430785676109, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011238661948342302, + "loss": 1.1452, + "step": 7700 + }, + { + "epoch": 0.5147648316408338, + "grad_norm": 0.17578125, + "learning_rate": 0.00011227090330796317, + "loss": 1.1757, + "step": 7705 + }, + { + "epoch": 0.5150988776055585, + "grad_norm": 0.162109375, + "learning_rate": 0.00011215517044787387, + "loss": 1.2306, + "step": 7710 + }, + { + "epoch": 0.5154329235702833, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011203942106051595, + "loss": 1.1709, + "step": 7715 + }, + { + "epoch": 0.515766969535008, + "grad_norm": 0.171875, + "learning_rate": 0.00011192365530327275, + "loss": 1.2167, + "step": 7720 + }, + { + "epoch": 0.5161010154997328, + "grad_norm": 0.1611328125, + "learning_rate": 0.00011180787333354983, + "loss": 1.1912, + "step": 7725 + }, + { + "epoch": 0.5164350614644575, + "grad_norm": 0.1640625, + "learning_rate": 0.00011169207530877486, + "loss": 1.2212, + "step": 7730 + }, + { + "epoch": 0.5167691074291823, + "grad_norm": 0.1748046875, + "learning_rate": 0.00011157626138639727, + "loss": 1.2778, + "step": 7735 + }, + { + "epoch": 0.517103153393907, + "grad_norm": 0.169921875, + "learning_rate": 0.00011146043172388819, + "loss": 1.2223, + "step": 7740 + }, + { + "epoch": 0.5174371993586317, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011134458647874007, + "loss": 1.2156, + "step": 7745 + }, + { + "epoch": 0.5177712453233565, + "grad_norm": 0.1611328125, + "learning_rate": 0.00011122872580846652, + "loss": 1.1062, + "step": 7750 + }, + { + "epoch": 0.5181052912880812, + "grad_norm": 0.171875, + "learning_rate": 0.00011111284987060228, + "loss": 1.1876, + "step": 7755 + }, + { + "epoch": 0.518439337252806, + "grad_norm": 0.166015625, + "learning_rate": 0.00011099695882270272, + "loss": 1.1698, + "step": 7760 + }, + { + "epoch": 0.5187733832175307, + "grad_norm": 0.16796875, + "learning_rate": 0.00011088105282234376, + "loss": 1.1686, + "step": 7765 + }, + { + "epoch": 0.5191074291822555, + "grad_norm": 0.1640625, + "learning_rate": 0.00011076513202712167, + "loss": 1.1526, + "step": 7770 + }, + { + "epoch": 0.5194414751469802, + "grad_norm": 0.16796875, + "learning_rate": 0.00011064919659465289, + "loss": 1.1648, + "step": 7775 + }, + { + "epoch": 0.519775521111705, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011053324668257368, + "loss": 1.1674, + "step": 7780 + }, + { + "epoch": 0.5201095670764297, + "grad_norm": 0.162109375, + "learning_rate": 0.00011041728244854004, + "loss": 1.1817, + "step": 7785 + }, + { + "epoch": 0.5204436130411545, + "grad_norm": 0.1611328125, + "learning_rate": 0.00011030130405022738, + "loss": 1.1222, + "step": 7790 + }, + { + "epoch": 0.5207776590058792, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011018531164533048, + "loss": 1.1805, + "step": 7795 + }, + { + "epoch": 0.521111704970604, + "grad_norm": 0.16796875, + "learning_rate": 0.00011006930539156308, + "loss": 1.1889, + "step": 7800 + }, + { + "epoch": 0.5214457509353287, + "grad_norm": 0.16796875, + "learning_rate": 0.00010995328544665775, + "loss": 1.1897, + "step": 7805 + }, + { + "epoch": 0.5217797969000535, + "grad_norm": 0.169921875, + "learning_rate": 0.00010983725196836574, + "loss": 1.1815, + "step": 7810 + }, + { + "epoch": 0.5221138428647782, + "grad_norm": 0.1640625, + "learning_rate": 0.00010972120511445656, + "loss": 1.1831, + "step": 7815 + }, + { + "epoch": 0.522447888829503, + "grad_norm": 0.16796875, + "learning_rate": 0.00010960514504271813, + "loss": 1.1478, + "step": 7820 + }, + { + "epoch": 0.5227819347942276, + "grad_norm": 0.1611328125, + "learning_rate": 0.00010948907191095612, + "loss": 1.2321, + "step": 7825 + }, + { + "epoch": 0.5231159807589524, + "grad_norm": 0.1826171875, + "learning_rate": 0.00010937298587699409, + "loss": 1.1213, + "step": 7830 + }, + { + "epoch": 0.5234500267236771, + "grad_norm": 0.1787109375, + "learning_rate": 0.00010925688709867312, + "loss": 1.2076, + "step": 7835 + }, + { + "epoch": 0.5237840726884019, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001091407757338516, + "loss": 1.2197, + "step": 7840 + }, + { + "epoch": 0.5241181186531266, + "grad_norm": 0.171875, + "learning_rate": 0.00010902465194040501, + "loss": 1.2402, + "step": 7845 + }, + { + "epoch": 0.5244521646178514, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010890851587622577, + "loss": 1.2424, + "step": 7850 + }, + { + "epoch": 0.5247862105825761, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010879236769922301, + "loss": 1.2017, + "step": 7855 + }, + { + "epoch": 0.5251202565473009, + "grad_norm": 0.1796875, + "learning_rate": 0.00010867620756732219, + "loss": 1.2269, + "step": 7860 + }, + { + "epoch": 0.5254543025120256, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010856003563846526, + "loss": 1.1909, + "step": 7865 + }, + { + "epoch": 0.5257883484767504, + "grad_norm": 0.1650390625, + "learning_rate": 0.00010844385207061001, + "loss": 1.1847, + "step": 7870 + }, + { + "epoch": 0.5261223944414751, + "grad_norm": 0.162109375, + "learning_rate": 0.00010832765702173011, + "loss": 1.1651, + "step": 7875 + }, + { + "epoch": 0.5264564404061999, + "grad_norm": 0.166015625, + "learning_rate": 0.00010821145064981487, + "loss": 1.1334, + "step": 7880 + }, + { + "epoch": 0.5267904863709246, + "grad_norm": 0.1640625, + "learning_rate": 0.00010809523311286897, + "loss": 1.1816, + "step": 7885 + }, + { + "epoch": 0.5271245323356494, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010797900456891228, + "loss": 1.2023, + "step": 7890 + }, + { + "epoch": 0.5274585783003741, + "grad_norm": 0.171875, + "learning_rate": 0.00010786276517597968, + "loss": 1.1492, + "step": 7895 + }, + { + "epoch": 0.5277926242650989, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001077465150921207, + "loss": 1.2383, + "step": 7900 + }, + { + "epoch": 0.5281266702298236, + "grad_norm": 0.162109375, + "learning_rate": 0.00010763025447539948, + "loss": 1.1999, + "step": 7905 + }, + { + "epoch": 0.5284607161945484, + "grad_norm": 0.18359375, + "learning_rate": 0.00010751398348389446, + "loss": 1.1763, + "step": 7910 + }, + { + "epoch": 0.5287947621592731, + "grad_norm": 0.173828125, + "learning_rate": 0.00010739770227569821, + "loss": 1.2671, + "step": 7915 + }, + { + "epoch": 0.5291288081239979, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010728141100891716, + "loss": 1.2537, + "step": 7920 + }, + { + "epoch": 0.5294628540887226, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010716510984167142, + "loss": 1.2583, + "step": 7925 + }, + { + "epoch": 0.5297969000534474, + "grad_norm": 0.16796875, + "learning_rate": 0.00010704879893209463, + "loss": 1.177, + "step": 7930 + }, + { + "epoch": 0.5301309460181721, + "grad_norm": 0.1865234375, + "learning_rate": 0.00010693247843833352, + "loss": 1.2566, + "step": 7935 + }, + { + "epoch": 0.5304649919828969, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010681614851854802, + "loss": 1.1253, + "step": 7940 + }, + { + "epoch": 0.5307990379476216, + "grad_norm": 0.1845703125, + "learning_rate": 0.00010669980933091079, + "loss": 1.2334, + "step": 7945 + }, + { + "epoch": 0.5311330839123464, + "grad_norm": 0.228515625, + "learning_rate": 0.0001065834610336071, + "loss": 1.2108, + "step": 7950 + }, + { + "epoch": 0.5314671298770711, + "grad_norm": 0.18359375, + "learning_rate": 0.0001064671037848346, + "loss": 1.2085, + "step": 7955 + }, + { + "epoch": 0.5318011758417959, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010635073774280315, + "loss": 1.1995, + "step": 7960 + }, + { + "epoch": 0.5321352218065206, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010623436306573455, + "loss": 1.162, + "step": 7965 + }, + { + "epoch": 0.5324692677712454, + "grad_norm": 0.193359375, + "learning_rate": 0.00010611797991186229, + "loss": 1.2132, + "step": 7970 + }, + { + "epoch": 0.5328033137359701, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010600158843943149, + "loss": 1.2682, + "step": 7975 + }, + { + "epoch": 0.5331373597006949, + "grad_norm": 0.17578125, + "learning_rate": 0.00010588518880669842, + "loss": 1.1729, + "step": 7980 + }, + { + "epoch": 0.5334714056654195, + "grad_norm": 0.1787109375, + "learning_rate": 0.00010576878117193066, + "loss": 1.1858, + "step": 7985 + }, + { + "epoch": 0.5338054516301443, + "grad_norm": 0.185546875, + "learning_rate": 0.00010565236569340646, + "loss": 1.2354, + "step": 7990 + }, + { + "epoch": 0.534139497594869, + "grad_norm": 0.171875, + "learning_rate": 0.00010553594252941488, + "loss": 1.1875, + "step": 7995 + }, + { + "epoch": 0.5344735435595938, + "grad_norm": 0.1533203125, + "learning_rate": 0.00010541951183825536, + "loss": 1.1688, + "step": 8000 + }, + { + "epoch": 0.5348075895243185, + "grad_norm": 0.16015625, + "learning_rate": 0.00010530307377823762, + "loss": 1.1839, + "step": 8005 + }, + { + "epoch": 0.5351416354890433, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010518662850768133, + "loss": 1.1939, + "step": 8010 + }, + { + "epoch": 0.535475681453768, + "grad_norm": 0.169921875, + "learning_rate": 0.00010507017618491603, + "loss": 1.2179, + "step": 8015 + }, + { + "epoch": 0.5358097274184928, + "grad_norm": 0.16796875, + "learning_rate": 0.00010495371696828083, + "loss": 1.2316, + "step": 8020 + }, + { + "epoch": 0.5361437733832175, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010483725101612419, + "loss": 1.2384, + "step": 8025 + }, + { + "epoch": 0.5364778193479423, + "grad_norm": 0.197265625, + "learning_rate": 0.00010472077848680378, + "loss": 1.1438, + "step": 8030 + }, + { + "epoch": 0.536811865312667, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010460429953868614, + "loss": 1.2008, + "step": 8035 + }, + { + "epoch": 0.5371459112773918, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010448781433014663, + "loss": 1.1596, + "step": 8040 + }, + { + "epoch": 0.5374799572421165, + "grad_norm": 0.1787109375, + "learning_rate": 0.00010437132301956897, + "loss": 1.156, + "step": 8045 + }, + { + "epoch": 0.5378140032068413, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010425482576534545, + "loss": 1.1974, + "step": 8050 + }, + { + "epoch": 0.538148049171566, + "grad_norm": 0.1630859375, + "learning_rate": 0.00010413832272587609, + "loss": 1.091, + "step": 8055 + }, + { + "epoch": 0.5384820951362908, + "grad_norm": 0.173828125, + "learning_rate": 0.00010402181405956906, + "loss": 1.262, + "step": 8060 + }, + { + "epoch": 0.5388161411010155, + "grad_norm": 0.177734375, + "learning_rate": 0.00010390529992484004, + "loss": 1.169, + "step": 8065 + }, + { + "epoch": 0.5391501870657402, + "grad_norm": 0.1787109375, + "learning_rate": 0.00010378878048011218, + "loss": 1.1552, + "step": 8070 + }, + { + "epoch": 0.539484233030465, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010367225588381584, + "loss": 1.195, + "step": 8075 + }, + { + "epoch": 0.5398182789951897, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010355572629438846, + "loss": 1.239, + "step": 8080 + }, + { + "epoch": 0.5401523249599145, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010343919187027413, + "loss": 1.2839, + "step": 8085 + }, + { + "epoch": 0.5404863709246392, + "grad_norm": 0.1572265625, + "learning_rate": 0.00010332265276992362, + "loss": 1.203, + "step": 8090 + }, + { + "epoch": 0.540820416889364, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010320610915179402, + "loss": 1.1622, + "step": 8095 + }, + { + "epoch": 0.5411544628540887, + "grad_norm": 0.17578125, + "learning_rate": 0.00010308956117434858, + "loss": 1.1227, + "step": 8100 + }, + { + "epoch": 0.5414885088188135, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010297300899605644, + "loss": 1.2179, + "step": 8105 + }, + { + "epoch": 0.5418225547835382, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010285645277539252, + "loss": 1.1927, + "step": 8110 + }, + { + "epoch": 0.542156600748263, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010273989267083717, + "loss": 1.1551, + "step": 8115 + }, + { + "epoch": 0.5424906467129877, + "grad_norm": 0.1689453125, + "learning_rate": 0.000102623328840876, + "loss": 1.2515, + "step": 8120 + }, + { + "epoch": 0.5428246926777125, + "grad_norm": 0.16796875, + "learning_rate": 0.00010250676144399984, + "loss": 1.1713, + "step": 8125 + }, + { + "epoch": 0.5431587386424372, + "grad_norm": 0.1806640625, + "learning_rate": 0.00010239019063870416, + "loss": 1.1947, + "step": 8130 + }, + { + "epoch": 0.543492784607162, + "grad_norm": 0.177734375, + "learning_rate": 0.00010227361658348922, + "loss": 1.2157, + "step": 8135 + }, + { + "epoch": 0.5438268305718867, + "grad_norm": 0.1630859375, + "learning_rate": 0.00010215703943685964, + "loss": 1.1747, + "step": 8140 + }, + { + "epoch": 0.5441608765366115, + "grad_norm": 0.162109375, + "learning_rate": 0.0001020404593573242, + "loss": 1.2436, + "step": 8145 + }, + { + "epoch": 0.5444949225013361, + "grad_norm": 0.166015625, + "learning_rate": 0.00010192387650339579, + "loss": 1.1607, + "step": 8150 + }, + { + "epoch": 0.5448289684660609, + "grad_norm": 0.16796875, + "learning_rate": 0.00010180729103359094, + "loss": 1.1594, + "step": 8155 + }, + { + "epoch": 0.5451630144307856, + "grad_norm": 0.1640625, + "learning_rate": 0.00010169070310642983, + "loss": 1.1701, + "step": 8160 + }, + { + "epoch": 0.5454970603955104, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001015741128804359, + "loss": 1.2794, + "step": 8165 + }, + { + "epoch": 0.5458311063602351, + "grad_norm": 0.1845703125, + "learning_rate": 0.00010145752051413584, + "loss": 1.2377, + "step": 8170 + }, + { + "epoch": 0.5461651523249599, + "grad_norm": 0.185546875, + "learning_rate": 0.00010134092616605908, + "loss": 1.1754, + "step": 8175 + }, + { + "epoch": 0.5464991982896846, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001012243299947379, + "loss": 1.1771, + "step": 8180 + }, + { + "epoch": 0.5468332442544094, + "grad_norm": 0.1796875, + "learning_rate": 0.00010110773215870695, + "loss": 1.2177, + "step": 8185 + }, + { + "epoch": 0.5471672902191341, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010099113281650325, + "loss": 1.2478, + "step": 8190 + }, + { + "epoch": 0.5475013361838589, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010087453212666574, + "loss": 1.1897, + "step": 8195 + }, + { + "epoch": 0.5478353821485836, + "grad_norm": 0.16796875, + "learning_rate": 0.0001007579302477353, + "loss": 1.179, + "step": 8200 + }, + { + "epoch": 0.5481694281133084, + "grad_norm": 0.1650390625, + "learning_rate": 0.00010064132733825438, + "loss": 1.1982, + "step": 8205 + }, + { + "epoch": 0.5485034740780331, + "grad_norm": 0.1787109375, + "learning_rate": 0.00010052472355676683, + "loss": 1.1418, + "step": 8210 + }, + { + "epoch": 0.5488375200427579, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010040811906181769, + "loss": 1.1776, + "step": 8215 + }, + { + "epoch": 0.5491715660074826, + "grad_norm": 0.169921875, + "learning_rate": 0.00010029151401195298, + "loss": 1.133, + "step": 8220 + }, + { + "epoch": 0.5495056119722074, + "grad_norm": 0.18359375, + "learning_rate": 0.00010017490856571945, + "loss": 1.2514, + "step": 8225 + }, + { + "epoch": 0.5498396579369321, + "grad_norm": 0.158203125, + "learning_rate": 0.00010005830288166445, + "loss": 1.1875, + "step": 8230 + }, + { + "epoch": 0.5501737039016569, + "grad_norm": 0.18359375, + "learning_rate": 9.994169711833555e-05, + "loss": 1.2005, + "step": 8235 + }, + { + "epoch": 0.5505077498663816, + "grad_norm": 0.173828125, + "learning_rate": 9.982509143428054e-05, + "loss": 1.2274, + "step": 8240 + }, + { + "epoch": 0.5508417958311064, + "grad_norm": 0.1611328125, + "learning_rate": 9.970848598804705e-05, + "loss": 1.2221, + "step": 8245 + }, + { + "epoch": 0.5511758417958311, + "grad_norm": 0.1796875, + "learning_rate": 9.959188093818234e-05, + "loss": 1.2734, + "step": 8250 + }, + { + "epoch": 0.5515098877605559, + "grad_norm": 0.193359375, + "learning_rate": 9.947527644323319e-05, + "loss": 1.1843, + "step": 8255 + }, + { + "epoch": 0.5518439337252806, + "grad_norm": 0.1689453125, + "learning_rate": 9.935867266174566e-05, + "loss": 1.2689, + "step": 8260 + }, + { + "epoch": 0.5521779796900054, + "grad_norm": 0.18359375, + "learning_rate": 9.924206975226471e-05, + "loss": 1.3259, + "step": 8265 + }, + { + "epoch": 0.5525120256547301, + "grad_norm": 0.1787109375, + "learning_rate": 9.912546787333427e-05, + "loss": 1.1586, + "step": 8270 + }, + { + "epoch": 0.5528460716194549, + "grad_norm": 0.1669921875, + "learning_rate": 9.900886718349676e-05, + "loss": 1.1334, + "step": 8275 + }, + { + "epoch": 0.5531801175841796, + "grad_norm": 0.1669921875, + "learning_rate": 9.889226784129306e-05, + "loss": 1.2147, + "step": 8280 + }, + { + "epoch": 0.5535141635489044, + "grad_norm": 0.181640625, + "learning_rate": 9.877567000526213e-05, + "loss": 1.1871, + "step": 8285 + }, + { + "epoch": 0.5538482095136291, + "grad_norm": 0.1640625, + "learning_rate": 9.865907383394096e-05, + "loss": 1.2581, + "step": 8290 + }, + { + "epoch": 0.5541822554783539, + "grad_norm": 0.16796875, + "learning_rate": 9.854247948586417e-05, + "loss": 1.1312, + "step": 8295 + }, + { + "epoch": 0.5545163014430786, + "grad_norm": 0.1806640625, + "learning_rate": 9.84258871195641e-05, + "loss": 1.1728, + "step": 8300 + }, + { + "epoch": 0.5548503474078034, + "grad_norm": 0.1669921875, + "learning_rate": 9.830929689357019e-05, + "loss": 1.2742, + "step": 8305 + }, + { + "epoch": 0.555184393372528, + "grad_norm": 0.166015625, + "learning_rate": 9.819270896640908e-05, + "loss": 1.2569, + "step": 8310 + }, + { + "epoch": 0.5555184393372528, + "grad_norm": 0.17578125, + "learning_rate": 9.807612349660423e-05, + "loss": 1.2754, + "step": 8315 + }, + { + "epoch": 0.5558524853019775, + "grad_norm": 0.1669921875, + "learning_rate": 9.795954064267581e-05, + "loss": 1.1742, + "step": 8320 + }, + { + "epoch": 0.5561865312667023, + "grad_norm": 0.1728515625, + "learning_rate": 9.784296056314037e-05, + "loss": 1.1708, + "step": 8325 + }, + { + "epoch": 0.556520577231427, + "grad_norm": 0.44140625, + "learning_rate": 9.772638341651079e-05, + "loss": 1.2346, + "step": 8330 + }, + { + "epoch": 0.5568546231961518, + "grad_norm": 0.1982421875, + "learning_rate": 9.760980936129585e-05, + "loss": 1.1983, + "step": 8335 + }, + { + "epoch": 0.5571886691608765, + "grad_norm": 0.166015625, + "learning_rate": 9.749323855600017e-05, + "loss": 1.2003, + "step": 8340 + }, + { + "epoch": 0.5575227151256013, + "grad_norm": 0.1689453125, + "learning_rate": 9.737667115912402e-05, + "loss": 1.2915, + "step": 8345 + }, + { + "epoch": 0.557856761090326, + "grad_norm": 0.171875, + "learning_rate": 9.726010732916288e-05, + "loss": 1.2653, + "step": 8350 + }, + { + "epoch": 0.5581908070550508, + "grad_norm": 0.1806640625, + "learning_rate": 9.714354722460753e-05, + "loss": 1.2065, + "step": 8355 + }, + { + "epoch": 0.5585248530197755, + "grad_norm": 0.1865234375, + "learning_rate": 9.702699100394355e-05, + "loss": 1.1583, + "step": 8360 + }, + { + "epoch": 0.5588588989845003, + "grad_norm": 0.2060546875, + "learning_rate": 9.691043882565145e-05, + "loss": 1.189, + "step": 8365 + }, + { + "epoch": 0.559192944949225, + "grad_norm": 0.162109375, + "learning_rate": 9.6793890848206e-05, + "loss": 1.2118, + "step": 8370 + }, + { + "epoch": 0.5595269909139498, + "grad_norm": 0.1640625, + "learning_rate": 9.66773472300764e-05, + "loss": 1.1771, + "step": 8375 + }, + { + "epoch": 0.5598610368786745, + "grad_norm": 0.173828125, + "learning_rate": 9.656080812972591e-05, + "loss": 1.287, + "step": 8380 + }, + { + "epoch": 0.5601950828433993, + "grad_norm": 0.162109375, + "learning_rate": 9.644427370561157e-05, + "loss": 1.1679, + "step": 8385 + }, + { + "epoch": 0.560529128808124, + "grad_norm": 0.1826171875, + "learning_rate": 9.632774411618414e-05, + "loss": 1.1996, + "step": 8390 + }, + { + "epoch": 0.5608631747728487, + "grad_norm": 0.1767578125, + "learning_rate": 9.621121951988783e-05, + "loss": 1.2401, + "step": 8395 + }, + { + "epoch": 0.5611972207375735, + "grad_norm": 0.1640625, + "learning_rate": 9.609470007516e-05, + "loss": 1.1879, + "step": 8400 + }, + { + "epoch": 0.5615312667022982, + "grad_norm": 0.1708984375, + "learning_rate": 9.597818594043096e-05, + "loss": 1.2456, + "step": 8405 + }, + { + "epoch": 0.561865312667023, + "grad_norm": 0.18359375, + "learning_rate": 9.586167727412395e-05, + "loss": 1.1403, + "step": 8410 + }, + { + "epoch": 0.5621993586317477, + "grad_norm": 0.1748046875, + "learning_rate": 9.574517423465462e-05, + "loss": 1.2011, + "step": 8415 + }, + { + "epoch": 0.5625334045964725, + "grad_norm": 0.1708984375, + "learning_rate": 9.562867698043101e-05, + "loss": 1.1682, + "step": 8420 + }, + { + "epoch": 0.5628674505611972, + "grad_norm": 0.1767578125, + "learning_rate": 9.55121856698534e-05, + "loss": 1.1735, + "step": 8425 + }, + { + "epoch": 0.563201496525922, + "grad_norm": 0.162109375, + "learning_rate": 9.539570046131389e-05, + "loss": 1.1531, + "step": 8430 + }, + { + "epoch": 0.5635355424906467, + "grad_norm": 0.1640625, + "learning_rate": 9.527922151319626e-05, + "loss": 1.1076, + "step": 8435 + }, + { + "epoch": 0.5638695884553715, + "grad_norm": 0.173828125, + "learning_rate": 9.516274898387582e-05, + "loss": 1.1787, + "step": 8440 + }, + { + "epoch": 0.5642036344200962, + "grad_norm": 0.1640625, + "learning_rate": 9.504628303171922e-05, + "loss": 1.2271, + "step": 8445 + }, + { + "epoch": 0.564537680384821, + "grad_norm": 0.18359375, + "learning_rate": 9.492982381508398e-05, + "loss": 1.1947, + "step": 8450 + }, + { + "epoch": 0.5648717263495457, + "grad_norm": 0.173828125, + "learning_rate": 9.481337149231868e-05, + "loss": 1.1849, + "step": 8455 + }, + { + "epoch": 0.5652057723142705, + "grad_norm": 0.2001953125, + "learning_rate": 9.469692622176239e-05, + "loss": 1.2134, + "step": 8460 + }, + { + "epoch": 0.5655398182789952, + "grad_norm": 0.1572265625, + "learning_rate": 9.458048816174465e-05, + "loss": 1.2114, + "step": 8465 + }, + { + "epoch": 0.5658738642437199, + "grad_norm": 0.1865234375, + "learning_rate": 9.446405747058513e-05, + "loss": 1.2596, + "step": 8470 + }, + { + "epoch": 0.5662079102084446, + "grad_norm": 0.169921875, + "learning_rate": 9.434763430659357e-05, + "loss": 1.1842, + "step": 8475 + }, + { + "epoch": 0.5665419561731694, + "grad_norm": 0.1611328125, + "learning_rate": 9.423121882806934e-05, + "loss": 1.1554, + "step": 8480 + }, + { + "epoch": 0.5668760021378941, + "grad_norm": 0.1611328125, + "learning_rate": 9.411481119330156e-05, + "loss": 1.1946, + "step": 8485 + }, + { + "epoch": 0.5672100481026189, + "grad_norm": 0.158203125, + "learning_rate": 9.399841156056853e-05, + "loss": 1.134, + "step": 8490 + }, + { + "epoch": 0.5675440940673436, + "grad_norm": 0.193359375, + "learning_rate": 9.388202008813772e-05, + "loss": 1.1399, + "step": 8495 + }, + { + "epoch": 0.5678781400320684, + "grad_norm": 0.1669921875, + "learning_rate": 9.376563693426548e-05, + "loss": 1.1826, + "step": 8500 + }, + { + "epoch": 0.5682121859967931, + "grad_norm": 0.1640625, + "learning_rate": 9.364926225719686e-05, + "loss": 1.1893, + "step": 8505 + }, + { + "epoch": 0.5685462319615179, + "grad_norm": 0.1630859375, + "learning_rate": 9.353289621516539e-05, + "loss": 1.158, + "step": 8510 + }, + { + "epoch": 0.5688802779262426, + "grad_norm": 0.1787109375, + "learning_rate": 9.341653896639293e-05, + "loss": 1.1754, + "step": 8515 + }, + { + "epoch": 0.5692143238909674, + "grad_norm": 0.1787109375, + "learning_rate": 9.330019066908923e-05, + "loss": 1.2279, + "step": 8520 + }, + { + "epoch": 0.5695483698556921, + "grad_norm": 0.181640625, + "learning_rate": 9.318385148145199e-05, + "loss": 1.2369, + "step": 8525 + }, + { + "epoch": 0.5698824158204169, + "grad_norm": 0.1689453125, + "learning_rate": 9.306752156166652e-05, + "loss": 1.1594, + "step": 8530 + }, + { + "epoch": 0.5702164617851416, + "grad_norm": 0.1826171875, + "learning_rate": 9.295120106790542e-05, + "loss": 1.2326, + "step": 8535 + }, + { + "epoch": 0.5705505077498664, + "grad_norm": 0.166015625, + "learning_rate": 9.283489015832857e-05, + "loss": 1.2411, + "step": 8540 + }, + { + "epoch": 0.5708845537145911, + "grad_norm": 0.169921875, + "learning_rate": 9.271858899108285e-05, + "loss": 1.1679, + "step": 8545 + }, + { + "epoch": 0.5712185996793159, + "grad_norm": 0.1640625, + "learning_rate": 9.260229772430181e-05, + "loss": 1.194, + "step": 8550 + }, + { + "epoch": 0.5715526456440406, + "grad_norm": 0.1708984375, + "learning_rate": 9.248601651610556e-05, + "loss": 1.2654, + "step": 8555 + }, + { + "epoch": 0.5718866916087654, + "grad_norm": 0.1728515625, + "learning_rate": 9.236974552460055e-05, + "loss": 1.2777, + "step": 8560 + }, + { + "epoch": 0.5722207375734901, + "grad_norm": 0.1845703125, + "learning_rate": 9.225348490787935e-05, + "loss": 1.2713, + "step": 8565 + }, + { + "epoch": 0.5725547835382149, + "grad_norm": 0.1708984375, + "learning_rate": 9.213723482402035e-05, + "loss": 1.1932, + "step": 8570 + }, + { + "epoch": 0.5728888295029396, + "grad_norm": 0.1689453125, + "learning_rate": 9.202099543108771e-05, + "loss": 1.2309, + "step": 8575 + }, + { + "epoch": 0.5732228754676644, + "grad_norm": 0.1796875, + "learning_rate": 9.190476688713103e-05, + "loss": 1.1724, + "step": 8580 + }, + { + "epoch": 0.5735569214323891, + "grad_norm": 0.1748046875, + "learning_rate": 9.178854935018516e-05, + "loss": 1.1986, + "step": 8585 + }, + { + "epoch": 0.5738909673971139, + "grad_norm": 0.173828125, + "learning_rate": 9.167234297826992e-05, + "loss": 1.1978, + "step": 8590 + }, + { + "epoch": 0.5742250133618386, + "grad_norm": 0.1689453125, + "learning_rate": 9.155614792939004e-05, + "loss": 1.1005, + "step": 8595 + }, + { + "epoch": 0.5745590593265634, + "grad_norm": 0.189453125, + "learning_rate": 9.143996436153476e-05, + "loss": 1.2731, + "step": 8600 + }, + { + "epoch": 0.5748931052912881, + "grad_norm": 0.1708984375, + "learning_rate": 9.13237924326778e-05, + "loss": 1.2199, + "step": 8605 + }, + { + "epoch": 0.5752271512560129, + "grad_norm": 0.181640625, + "learning_rate": 9.120763230077703e-05, + "loss": 1.1892, + "step": 8610 + }, + { + "epoch": 0.5755611972207376, + "grad_norm": 0.208984375, + "learning_rate": 9.109148412377426e-05, + "loss": 1.261, + "step": 8615 + }, + { + "epoch": 0.5758952431854624, + "grad_norm": 0.16015625, + "learning_rate": 9.097534805959502e-05, + "loss": 1.2014, + "step": 8620 + }, + { + "epoch": 0.5762292891501871, + "grad_norm": 0.1611328125, + "learning_rate": 9.085922426614844e-05, + "loss": 1.2072, + "step": 8625 + }, + { + "epoch": 0.5765633351149118, + "grad_norm": 0.166015625, + "learning_rate": 9.074311290132693e-05, + "loss": 1.154, + "step": 8630 + }, + { + "epoch": 0.5768973810796365, + "grad_norm": 0.1796875, + "learning_rate": 9.062701412300592e-05, + "loss": 1.2357, + "step": 8635 + }, + { + "epoch": 0.5772314270443613, + "grad_norm": 0.201171875, + "learning_rate": 9.051092808904389e-05, + "loss": 1.1894, + "step": 8640 + }, + { + "epoch": 0.577565473009086, + "grad_norm": 0.2060546875, + "learning_rate": 9.03948549572819e-05, + "loss": 1.2215, + "step": 8645 + }, + { + "epoch": 0.5778995189738108, + "grad_norm": 0.1669921875, + "learning_rate": 9.027879488554346e-05, + "loss": 1.2391, + "step": 8650 + }, + { + "epoch": 0.5782335649385355, + "grad_norm": 0.1865234375, + "learning_rate": 9.016274803163431e-05, + "loss": 1.2081, + "step": 8655 + }, + { + "epoch": 0.5785676109032603, + "grad_norm": 0.16015625, + "learning_rate": 9.004671455334228e-05, + "loss": 1.1283, + "step": 8660 + }, + { + "epoch": 0.578901656867985, + "grad_norm": 0.173828125, + "learning_rate": 8.993069460843693e-05, + "loss": 1.2581, + "step": 8665 + }, + { + "epoch": 0.5792357028327098, + "grad_norm": 0.1748046875, + "learning_rate": 8.981468835466953e-05, + "loss": 1.1975, + "step": 8670 + }, + { + "epoch": 0.5795697487974345, + "grad_norm": 0.181640625, + "learning_rate": 8.969869594977263e-05, + "loss": 1.2193, + "step": 8675 + }, + { + "epoch": 0.5799037947621593, + "grad_norm": 0.1591796875, + "learning_rate": 8.958271755145999e-05, + "loss": 1.1285, + "step": 8680 + }, + { + "epoch": 0.580237840726884, + "grad_norm": 0.173828125, + "learning_rate": 8.946675331742636e-05, + "loss": 1.1692, + "step": 8685 + }, + { + "epoch": 0.5805718866916088, + "grad_norm": 0.1611328125, + "learning_rate": 8.935080340534715e-05, + "loss": 1.14, + "step": 8690 + }, + { + "epoch": 0.5809059326563335, + "grad_norm": 0.1708984375, + "learning_rate": 8.923486797287834e-05, + "loss": 1.2068, + "step": 8695 + }, + { + "epoch": 0.5812399786210583, + "grad_norm": 0.16796875, + "learning_rate": 8.911894717765625e-05, + "loss": 1.2347, + "step": 8700 + }, + { + "epoch": 0.581574024585783, + "grad_norm": 0.2041015625, + "learning_rate": 8.90030411772973e-05, + "loss": 1.1138, + "step": 8705 + }, + { + "epoch": 0.5819080705505077, + "grad_norm": 0.16015625, + "learning_rate": 8.888715012939773e-05, + "loss": 1.2134, + "step": 8710 + }, + { + "epoch": 0.5822421165152325, + "grad_norm": 0.1728515625, + "learning_rate": 8.87712741915335e-05, + "loss": 1.1751, + "step": 8715 + }, + { + "epoch": 0.5825761624799572, + "grad_norm": 0.173828125, + "learning_rate": 8.865541352125998e-05, + "loss": 1.2041, + "step": 8720 + }, + { + "epoch": 0.582910208444682, + "grad_norm": 0.388671875, + "learning_rate": 8.853956827611182e-05, + "loss": 1.2135, + "step": 8725 + }, + { + "epoch": 0.5832442544094067, + "grad_norm": 0.1806640625, + "learning_rate": 8.842373861360271e-05, + "loss": 1.2284, + "step": 8730 + }, + { + "epoch": 0.5835783003741315, + "grad_norm": 0.1767578125, + "learning_rate": 8.830792469122517e-05, + "loss": 1.2092, + "step": 8735 + }, + { + "epoch": 0.5839123463388562, + "grad_norm": 0.1767578125, + "learning_rate": 8.819212666645018e-05, + "loss": 1.2435, + "step": 8740 + }, + { + "epoch": 0.584246392303581, + "grad_norm": 0.1904296875, + "learning_rate": 8.807634469672727e-05, + "loss": 1.1988, + "step": 8745 + }, + { + "epoch": 0.5845804382683057, + "grad_norm": 0.1611328125, + "learning_rate": 8.796057893948409e-05, + "loss": 1.264, + "step": 8750 + }, + { + "epoch": 0.5849144842330305, + "grad_norm": 0.171875, + "learning_rate": 8.784482955212614e-05, + "loss": 1.1828, + "step": 8755 + }, + { + "epoch": 0.5852485301977552, + "grad_norm": 0.1650390625, + "learning_rate": 8.772909669203684e-05, + "loss": 1.1714, + "step": 8760 + }, + { + "epoch": 0.58558257616248, + "grad_norm": 0.1640625, + "learning_rate": 8.7613380516577e-05, + "loss": 1.2169, + "step": 8765 + }, + { + "epoch": 0.5859166221272047, + "grad_norm": 0.158203125, + "learning_rate": 8.749768118308485e-05, + "loss": 1.1714, + "step": 8770 + }, + { + "epoch": 0.5862506680919295, + "grad_norm": 0.16796875, + "learning_rate": 8.73819988488756e-05, + "loss": 1.1609, + "step": 8775 + }, + { + "epoch": 0.5865847140566542, + "grad_norm": 0.1650390625, + "learning_rate": 8.726633367124146e-05, + "loss": 1.1455, + "step": 8780 + }, + { + "epoch": 0.586918760021379, + "grad_norm": 0.1640625, + "learning_rate": 8.715068580745116e-05, + "loss": 1.2395, + "step": 8785 + }, + { + "epoch": 0.5872528059861037, + "grad_norm": 0.1865234375, + "learning_rate": 8.703505541475015e-05, + "loss": 1.2282, + "step": 8790 + }, + { + "epoch": 0.5875868519508284, + "grad_norm": 0.171875, + "learning_rate": 8.691944265035985e-05, + "loss": 1.2212, + "step": 8795 + }, + { + "epoch": 0.5879208979155531, + "grad_norm": 0.16796875, + "learning_rate": 8.680384767147785e-05, + "loss": 1.2372, + "step": 8800 + }, + { + "epoch": 0.5882549438802779, + "grad_norm": 0.1708984375, + "learning_rate": 8.668827063527758e-05, + "loss": 1.1653, + "step": 8805 + }, + { + "epoch": 0.5885889898450026, + "grad_norm": 0.1962890625, + "learning_rate": 8.657271169890797e-05, + "loss": 1.1536, + "step": 8810 + }, + { + "epoch": 0.5889230358097274, + "grad_norm": 0.162109375, + "learning_rate": 8.645717101949338e-05, + "loss": 1.1778, + "step": 8815 + }, + { + "epoch": 0.5892570817744521, + "grad_norm": 0.162109375, + "learning_rate": 8.634164875413343e-05, + "loss": 1.2575, + "step": 8820 + }, + { + "epoch": 0.5895911277391769, + "grad_norm": 0.1669921875, + "learning_rate": 8.622614505990263e-05, + "loss": 1.1312, + "step": 8825 + }, + { + "epoch": 0.5899251737039016, + "grad_norm": 0.1728515625, + "learning_rate": 8.61106600938502e-05, + "loss": 1.1868, + "step": 8830 + }, + { + "epoch": 0.5902592196686264, + "grad_norm": 0.1708984375, + "learning_rate": 8.599519401299997e-05, + "loss": 1.2483, + "step": 8835 + }, + { + "epoch": 0.5905932656333511, + "grad_norm": 0.20703125, + "learning_rate": 8.587974697435005e-05, + "loss": 1.2095, + "step": 8840 + }, + { + "epoch": 0.5909273115980759, + "grad_norm": 0.1640625, + "learning_rate": 8.576431913487265e-05, + "loss": 1.1909, + "step": 8845 + }, + { + "epoch": 0.5912613575628006, + "grad_norm": 0.169921875, + "learning_rate": 8.564891065151392e-05, + "loss": 1.2462, + "step": 8850 + }, + { + "epoch": 0.5915954035275254, + "grad_norm": 0.1669921875, + "learning_rate": 8.553352168119368e-05, + "loss": 1.2182, + "step": 8855 + }, + { + "epoch": 0.5919294494922501, + "grad_norm": 0.1630859375, + "learning_rate": 8.54181523808052e-05, + "loss": 1.1494, + "step": 8860 + }, + { + "epoch": 0.5922634954569749, + "grad_norm": 0.16796875, + "learning_rate": 8.530280290721499e-05, + "loss": 1.1958, + "step": 8865 + }, + { + "epoch": 0.5925975414216996, + "grad_norm": 0.171875, + "learning_rate": 8.518747341726265e-05, + "loss": 1.2156, + "step": 8870 + }, + { + "epoch": 0.5929315873864244, + "grad_norm": 0.171875, + "learning_rate": 8.507216406776056e-05, + "loss": 1.1586, + "step": 8875 + }, + { + "epoch": 0.5932656333511491, + "grad_norm": 0.169921875, + "learning_rate": 8.495687501549371e-05, + "loss": 1.1507, + "step": 8880 + }, + { + "epoch": 0.5935996793158739, + "grad_norm": 0.173828125, + "learning_rate": 8.484160641721958e-05, + "loss": 1.1466, + "step": 8885 + }, + { + "epoch": 0.5939337252805986, + "grad_norm": 0.1689453125, + "learning_rate": 8.472635842966776e-05, + "loss": 1.178, + "step": 8890 + }, + { + "epoch": 0.5942677712453234, + "grad_norm": 0.1728515625, + "learning_rate": 8.46111312095398e-05, + "loss": 1.2257, + "step": 8895 + }, + { + "epoch": 0.5946018172100481, + "grad_norm": 0.1689453125, + "learning_rate": 8.449592491350909e-05, + "loss": 1.1655, + "step": 8900 + }, + { + "epoch": 0.5949358631747729, + "grad_norm": 0.1767578125, + "learning_rate": 8.438073969822047e-05, + "loss": 1.1703, + "step": 8905 + }, + { + "epoch": 0.5952699091394976, + "grad_norm": 0.1943359375, + "learning_rate": 8.426557572029019e-05, + "loss": 1.3285, + "step": 8910 + }, + { + "epoch": 0.5956039551042224, + "grad_norm": 0.1669921875, + "learning_rate": 8.415043313630563e-05, + "loss": 1.1821, + "step": 8915 + }, + { + "epoch": 0.5959380010689471, + "grad_norm": 0.169921875, + "learning_rate": 8.403531210282507e-05, + "loss": 1.2104, + "step": 8920 + }, + { + "epoch": 0.5962720470336719, + "grad_norm": 0.1630859375, + "learning_rate": 8.392021277637743e-05, + "loss": 1.1807, + "step": 8925 + }, + { + "epoch": 0.5966060929983966, + "grad_norm": 0.1669921875, + "learning_rate": 8.380513531346218e-05, + "loss": 1.2005, + "step": 8930 + }, + { + "epoch": 0.5969401389631214, + "grad_norm": 0.171875, + "learning_rate": 8.369007987054906e-05, + "loss": 1.2966, + "step": 8935 + }, + { + "epoch": 0.5972741849278461, + "grad_norm": 0.1806640625, + "learning_rate": 8.357504660407779e-05, + "loss": 1.16, + "step": 8940 + }, + { + "epoch": 0.5976082308925709, + "grad_norm": 0.1650390625, + "learning_rate": 8.346003567045806e-05, + "loss": 1.1801, + "step": 8945 + }, + { + "epoch": 0.5979422768572956, + "grad_norm": 0.1708984375, + "learning_rate": 8.334504722606912e-05, + "loss": 1.2457, + "step": 8950 + }, + { + "epoch": 0.5982763228220203, + "grad_norm": 0.15234375, + "learning_rate": 8.323008142725967e-05, + "loss": 1.1636, + "step": 8955 + }, + { + "epoch": 0.598610368786745, + "grad_norm": 0.1767578125, + "learning_rate": 8.311513843034755e-05, + "loss": 1.1566, + "step": 8960 + }, + { + "epoch": 0.5989444147514698, + "grad_norm": 0.1640625, + "learning_rate": 8.300021839161969e-05, + "loss": 1.2084, + "step": 8965 + }, + { + "epoch": 0.5992784607161945, + "grad_norm": 0.169921875, + "learning_rate": 8.288532146733173e-05, + "loss": 1.1723, + "step": 8970 + }, + { + "epoch": 0.5996125066809193, + "grad_norm": 0.181640625, + "learning_rate": 8.277044781370799e-05, + "loss": 1.2041, + "step": 8975 + }, + { + "epoch": 0.599946552645644, + "grad_norm": 0.1611328125, + "learning_rate": 8.265559758694099e-05, + "loss": 1.2412, + "step": 8980 + }, + { + "epoch": 0.6002805986103688, + "grad_norm": 0.181640625, + "learning_rate": 8.254077094319153e-05, + "loss": 1.1691, + "step": 8985 + }, + { + "epoch": 0.6006146445750935, + "grad_norm": 0.162109375, + "learning_rate": 8.24259680385883e-05, + "loss": 1.1641, + "step": 8990 + }, + { + "epoch": 0.6009486905398183, + "grad_norm": 0.16015625, + "learning_rate": 8.231118902922767e-05, + "loss": 1.1753, + "step": 8995 + }, + { + "epoch": 0.601282736504543, + "grad_norm": 0.1640625, + "learning_rate": 8.219643407117356e-05, + "loss": 1.1704, + "step": 9000 + }, + { + "epoch": 0.6016167824692678, + "grad_norm": 0.232421875, + "learning_rate": 8.208170332045723e-05, + "loss": 1.1438, + "step": 9005 + }, + { + "epoch": 0.6019508284339925, + "grad_norm": 0.1630859375, + "learning_rate": 8.196699693307695e-05, + "loss": 1.1879, + "step": 9010 + }, + { + "epoch": 0.6022848743987173, + "grad_norm": 0.1728515625, + "learning_rate": 8.18523150649979e-05, + "loss": 1.2226, + "step": 9015 + }, + { + "epoch": 0.602618920363442, + "grad_norm": 0.166015625, + "learning_rate": 8.173765787215193e-05, + "loss": 1.2468, + "step": 9020 + }, + { + "epoch": 0.6029529663281668, + "grad_norm": 0.181640625, + "learning_rate": 8.162302551043728e-05, + "loss": 1.1547, + "step": 9025 + }, + { + "epoch": 0.6032870122928915, + "grad_norm": 0.201171875, + "learning_rate": 8.150841813571849e-05, + "loss": 1.1479, + "step": 9030 + }, + { + "epoch": 0.6036210582576162, + "grad_norm": 0.1826171875, + "learning_rate": 8.139383590382614e-05, + "loss": 1.2236, + "step": 9035 + }, + { + "epoch": 0.603955104222341, + "grad_norm": 0.16015625, + "learning_rate": 8.127927897055658e-05, + "loss": 1.2452, + "step": 9040 + }, + { + "epoch": 0.6042891501870657, + "grad_norm": 0.1708984375, + "learning_rate": 8.116474749167175e-05, + "loss": 1.1888, + "step": 9045 + }, + { + "epoch": 0.6046231961517905, + "grad_norm": 0.1748046875, + "learning_rate": 8.105024162289901e-05, + "loss": 1.1423, + "step": 9050 + }, + { + "epoch": 0.6049572421165152, + "grad_norm": 0.193359375, + "learning_rate": 8.093576151993092e-05, + "loss": 1.1972, + "step": 9055 + }, + { + "epoch": 0.60529128808124, + "grad_norm": 0.1650390625, + "learning_rate": 8.082130733842492e-05, + "loss": 1.1641, + "step": 9060 + }, + { + "epoch": 0.6056253340459647, + "grad_norm": 0.1650390625, + "learning_rate": 8.070687923400328e-05, + "loss": 1.2438, + "step": 9065 + }, + { + "epoch": 0.6059593800106895, + "grad_norm": 0.166015625, + "learning_rate": 8.059247736225285e-05, + "loss": 1.2421, + "step": 9070 + }, + { + "epoch": 0.6062934259754142, + "grad_norm": 0.173828125, + "learning_rate": 8.047810187872473e-05, + "loss": 1.2011, + "step": 9075 + }, + { + "epoch": 0.606627471940139, + "grad_norm": 0.1748046875, + "learning_rate": 8.036375293893413e-05, + "loss": 1.1614, + "step": 9080 + }, + { + "epoch": 0.6069615179048637, + "grad_norm": 0.1640625, + "learning_rate": 8.024943069836023e-05, + "loss": 1.121, + "step": 9085 + }, + { + "epoch": 0.6072955638695885, + "grad_norm": 0.15625, + "learning_rate": 8.013513531244593e-05, + "loss": 1.1455, + "step": 9090 + }, + { + "epoch": 0.6076296098343132, + "grad_norm": 0.1728515625, + "learning_rate": 8.002086693659746e-05, + "loss": 1.157, + "step": 9095 + }, + { + "epoch": 0.607963655799038, + "grad_norm": 0.1787109375, + "learning_rate": 7.990662572618452e-05, + "loss": 1.1843, + "step": 9100 + }, + { + "epoch": 0.6082977017637627, + "grad_norm": 0.20703125, + "learning_rate": 7.979241183653978e-05, + "loss": 1.2587, + "step": 9105 + }, + { + "epoch": 0.6086317477284875, + "grad_norm": 0.2060546875, + "learning_rate": 7.967822542295877e-05, + "loss": 1.2498, + "step": 9110 + }, + { + "epoch": 0.6089657936932121, + "grad_norm": 0.171875, + "learning_rate": 7.956406664069964e-05, + "loss": 1.2265, + "step": 9115 + }, + { + "epoch": 0.6092998396579369, + "grad_norm": 0.1748046875, + "learning_rate": 7.9449935644983e-05, + "loss": 1.1527, + "step": 9120 + }, + { + "epoch": 0.6096338856226616, + "grad_norm": 0.1708984375, + "learning_rate": 7.933583259099162e-05, + "loss": 1.1783, + "step": 9125 + }, + { + "epoch": 0.6099679315873864, + "grad_norm": 0.16015625, + "learning_rate": 7.922175763387043e-05, + "loss": 1.2366, + "step": 9130 + }, + { + "epoch": 0.6103019775521111, + "grad_norm": 0.1708984375, + "learning_rate": 7.910771092872599e-05, + "loss": 1.1738, + "step": 9135 + }, + { + "epoch": 0.6106360235168359, + "grad_norm": 0.1728515625, + "learning_rate": 7.899369263062654e-05, + "loss": 1.1757, + "step": 9140 + }, + { + "epoch": 0.6109700694815606, + "grad_norm": 0.1962890625, + "learning_rate": 7.887970289460161e-05, + "loss": 1.28, + "step": 9145 + }, + { + "epoch": 0.6113041154462854, + "grad_norm": 0.1708984375, + "learning_rate": 7.876574187564201e-05, + "loss": 1.1575, + "step": 9150 + }, + { + "epoch": 0.6116381614110101, + "grad_norm": 0.16796875, + "learning_rate": 7.865180972869938e-05, + "loss": 1.1488, + "step": 9155 + }, + { + "epoch": 0.6119722073757349, + "grad_norm": 0.1728515625, + "learning_rate": 7.853790660868625e-05, + "loss": 1.2271, + "step": 9160 + }, + { + "epoch": 0.6123062533404596, + "grad_norm": 0.1611328125, + "learning_rate": 7.842403267047556e-05, + "loss": 1.1493, + "step": 9165 + }, + { + "epoch": 0.6126402993051844, + "grad_norm": 0.1630859375, + "learning_rate": 7.831018806890058e-05, + "loss": 1.1521, + "step": 9170 + }, + { + "epoch": 0.6129743452699091, + "grad_norm": 0.232421875, + "learning_rate": 7.819637295875478e-05, + "loss": 1.0877, + "step": 9175 + }, + { + "epoch": 0.6133083912346339, + "grad_norm": 0.1728515625, + "learning_rate": 7.808258749479142e-05, + "loss": 1.2084, + "step": 9180 + }, + { + "epoch": 0.6136424371993586, + "grad_norm": 0.1826171875, + "learning_rate": 7.796883183172352e-05, + "loss": 1.2174, + "step": 9185 + }, + { + "epoch": 0.6139764831640834, + "grad_norm": 0.17578125, + "learning_rate": 7.785510612422359e-05, + "loss": 1.2459, + "step": 9190 + }, + { + "epoch": 0.6143105291288081, + "grad_norm": 0.1640625, + "learning_rate": 7.774141052692337e-05, + "loss": 1.232, + "step": 9195 + }, + { + "epoch": 0.6146445750935329, + "grad_norm": 0.169921875, + "learning_rate": 7.762774519441366e-05, + "loss": 1.1734, + "step": 9200 + }, + { + "epoch": 0.6149786210582576, + "grad_norm": 0.1728515625, + "learning_rate": 7.751411028124414e-05, + "loss": 1.1733, + "step": 9205 + }, + { + "epoch": 0.6153126670229824, + "grad_norm": 0.1826171875, + "learning_rate": 7.740050594192308e-05, + "loss": 1.2005, + "step": 9210 + }, + { + "epoch": 0.6156467129877071, + "grad_norm": 0.173828125, + "learning_rate": 7.728693233091721e-05, + "loss": 1.2093, + "step": 9215 + }, + { + "epoch": 0.6159807589524319, + "grad_norm": 0.1640625, + "learning_rate": 7.717338960265152e-05, + "loss": 1.1545, + "step": 9220 + }, + { + "epoch": 0.6163148049171566, + "grad_norm": 0.173828125, + "learning_rate": 7.705987791150895e-05, + "loss": 1.2138, + "step": 9225 + }, + { + "epoch": 0.6166488508818814, + "grad_norm": 0.169921875, + "learning_rate": 7.694639741183027e-05, + "loss": 1.2281, + "step": 9230 + }, + { + "epoch": 0.6169828968466061, + "grad_norm": 0.1806640625, + "learning_rate": 7.68329482579138e-05, + "loss": 1.2119, + "step": 9235 + }, + { + "epoch": 0.6173169428113309, + "grad_norm": 0.166015625, + "learning_rate": 7.671953060401528e-05, + "loss": 1.2096, + "step": 9240 + }, + { + "epoch": 0.6176509887760556, + "grad_norm": 0.1728515625, + "learning_rate": 7.660614460434754e-05, + "loss": 1.2063, + "step": 9245 + }, + { + "epoch": 0.6179850347407804, + "grad_norm": 0.1611328125, + "learning_rate": 7.649279041308057e-05, + "loss": 1.2487, + "step": 9250 + }, + { + "epoch": 0.6183190807055051, + "grad_norm": 0.173828125, + "learning_rate": 7.637946818434087e-05, + "loss": 1.2365, + "step": 9255 + }, + { + "epoch": 0.6186531266702299, + "grad_norm": 0.193359375, + "learning_rate": 7.626617807221166e-05, + "loss": 1.1823, + "step": 9260 + }, + { + "epoch": 0.6189871726349546, + "grad_norm": 0.1708984375, + "learning_rate": 7.615292023073235e-05, + "loss": 1.1834, + "step": 9265 + }, + { + "epoch": 0.6193212185996794, + "grad_norm": 0.1640625, + "learning_rate": 7.603969481389856e-05, + "loss": 1.248, + "step": 9270 + }, + { + "epoch": 0.619655264564404, + "grad_norm": 0.181640625, + "learning_rate": 7.592650197566181e-05, + "loss": 1.2045, + "step": 9275 + }, + { + "epoch": 0.6199893105291288, + "grad_norm": 0.16796875, + "learning_rate": 7.581334186992934e-05, + "loss": 1.184, + "step": 9280 + }, + { + "epoch": 0.6203233564938535, + "grad_norm": 0.1767578125, + "learning_rate": 7.570021465056384e-05, + "loss": 1.1881, + "step": 9285 + }, + { + "epoch": 0.6206574024585783, + "grad_norm": 0.16796875, + "learning_rate": 7.558712047138328e-05, + "loss": 1.2263, + "step": 9290 + }, + { + "epoch": 0.620991448423303, + "grad_norm": 0.1826171875, + "learning_rate": 7.547405948616079e-05, + "loss": 1.2007, + "step": 9295 + }, + { + "epoch": 0.6213254943880278, + "grad_norm": 0.189453125, + "learning_rate": 7.536103184862424e-05, + "loss": 1.1391, + "step": 9300 + }, + { + "epoch": 0.6216595403527525, + "grad_norm": 0.1650390625, + "learning_rate": 7.524803771245628e-05, + "loss": 1.2094, + "step": 9305 + }, + { + "epoch": 0.6219935863174773, + "grad_norm": 0.173828125, + "learning_rate": 7.513507723129386e-05, + "loss": 1.2207, + "step": 9310 + }, + { + "epoch": 0.622327632282202, + "grad_norm": 0.1689453125, + "learning_rate": 7.502215055872838e-05, + "loss": 1.2251, + "step": 9315 + }, + { + "epoch": 0.6226616782469268, + "grad_norm": 0.166015625, + "learning_rate": 7.49092578483051e-05, + "loss": 1.163, + "step": 9320 + }, + { + "epoch": 0.6229957242116515, + "grad_norm": 0.205078125, + "learning_rate": 7.479639925352318e-05, + "loss": 1.2131, + "step": 9325 + }, + { + "epoch": 0.6233297701763763, + "grad_norm": 0.1669921875, + "learning_rate": 7.468357492783531e-05, + "loss": 1.1521, + "step": 9330 + }, + { + "epoch": 0.623663816141101, + "grad_norm": 0.1669921875, + "learning_rate": 7.457078502464768e-05, + "loss": 1.1685, + "step": 9335 + }, + { + "epoch": 0.6239978621058258, + "grad_norm": 0.1689453125, + "learning_rate": 7.44580296973196e-05, + "loss": 1.2071, + "step": 9340 + }, + { + "epoch": 0.6243319080705505, + "grad_norm": 0.1640625, + "learning_rate": 7.434530909916351e-05, + "loss": 1.2186, + "step": 9345 + }, + { + "epoch": 0.6246659540352753, + "grad_norm": 0.1943359375, + "learning_rate": 7.423262338344444e-05, + "loss": 1.1182, + "step": 9350 + }, + { + "epoch": 0.625, + "grad_norm": 0.166015625, + "learning_rate": 7.411997270338008e-05, + "loss": 1.149, + "step": 9355 + }, + { + "epoch": 0.6253340459647247, + "grad_norm": 0.1748046875, + "learning_rate": 7.400735721214052e-05, + "loss": 1.15, + "step": 9360 + }, + { + "epoch": 0.6256680919294495, + "grad_norm": 0.1689453125, + "learning_rate": 7.389477706284793e-05, + "loss": 1.1869, + "step": 9365 + }, + { + "epoch": 0.6260021378941742, + "grad_norm": 0.166015625, + "learning_rate": 7.378223240857643e-05, + "loss": 1.2278, + "step": 9370 + }, + { + "epoch": 0.626336183858899, + "grad_norm": 0.1708984375, + "learning_rate": 7.366972340235197e-05, + "loss": 1.1563, + "step": 9375 + }, + { + "epoch": 0.6266702298236237, + "grad_norm": 0.1708984375, + "learning_rate": 7.355725019715195e-05, + "loss": 1.1964, + "step": 9380 + }, + { + "epoch": 0.6270042757883485, + "grad_norm": 0.1884765625, + "learning_rate": 7.344481294590509e-05, + "loss": 1.1799, + "step": 9385 + }, + { + "epoch": 0.6273383217530732, + "grad_norm": 0.1689453125, + "learning_rate": 7.333241180149123e-05, + "loss": 1.2004, + "step": 9390 + }, + { + "epoch": 0.627672367717798, + "grad_norm": 0.1796875, + "learning_rate": 7.322004691674118e-05, + "loss": 1.3072, + "step": 9395 + }, + { + "epoch": 0.6280064136825227, + "grad_norm": 0.169921875, + "learning_rate": 7.310771844443631e-05, + "loss": 1.3167, + "step": 9400 + }, + { + "epoch": 0.6283404596472475, + "grad_norm": 0.1591796875, + "learning_rate": 7.299542653730865e-05, + "loss": 1.1872, + "step": 9405 + }, + { + "epoch": 0.6286745056119722, + "grad_norm": 0.1650390625, + "learning_rate": 7.288317134804038e-05, + "loss": 1.1386, + "step": 9410 + }, + { + "epoch": 0.629008551576697, + "grad_norm": 0.1572265625, + "learning_rate": 7.277095302926385e-05, + "loss": 1.1987, + "step": 9415 + }, + { + "epoch": 0.6293425975414217, + "grad_norm": 0.181640625, + "learning_rate": 7.265877173356116e-05, + "loss": 1.1932, + "step": 9420 + }, + { + "epoch": 0.6296766435061465, + "grad_norm": 0.177734375, + "learning_rate": 7.254662761346423e-05, + "loss": 1.1853, + "step": 9425 + }, + { + "epoch": 0.6300106894708712, + "grad_norm": 0.173828125, + "learning_rate": 7.243452082145423e-05, + "loss": 1.1307, + "step": 9430 + }, + { + "epoch": 0.630344735435596, + "grad_norm": 0.173828125, + "learning_rate": 7.232245150996181e-05, + "loss": 1.1789, + "step": 9435 + }, + { + "epoch": 0.6306787814003206, + "grad_norm": 0.1611328125, + "learning_rate": 7.221041983136646e-05, + "loss": 1.2618, + "step": 9440 + }, + { + "epoch": 0.6310128273650454, + "grad_norm": 0.1728515625, + "learning_rate": 7.209842593799662e-05, + "loss": 1.2339, + "step": 9445 + }, + { + "epoch": 0.6313468733297701, + "grad_norm": 0.1796875, + "learning_rate": 7.198646998212928e-05, + "loss": 1.1588, + "step": 9450 + }, + { + "epoch": 0.6316809192944949, + "grad_norm": 0.1767578125, + "learning_rate": 7.187455211598989e-05, + "loss": 1.2706, + "step": 9455 + }, + { + "epoch": 0.6320149652592196, + "grad_norm": 0.162109375, + "learning_rate": 7.176267249175209e-05, + "loss": 1.2431, + "step": 9460 + }, + { + "epoch": 0.6323490112239444, + "grad_norm": 0.1748046875, + "learning_rate": 7.165083126153756e-05, + "loss": 1.1589, + "step": 9465 + }, + { + "epoch": 0.6326830571886691, + "grad_norm": 0.1650390625, + "learning_rate": 7.153902857741571e-05, + "loss": 1.3123, + "step": 9470 + }, + { + "epoch": 0.6330171031533939, + "grad_norm": 0.2255859375, + "learning_rate": 7.14272645914036e-05, + "loss": 1.2838, + "step": 9475 + }, + { + "epoch": 0.6333511491181186, + "grad_norm": 0.169921875, + "learning_rate": 7.131553945546568e-05, + "loss": 1.2397, + "step": 9480 + }, + { + "epoch": 0.6336851950828434, + "grad_norm": 0.189453125, + "learning_rate": 7.120385332151348e-05, + "loss": 1.1422, + "step": 9485 + }, + { + "epoch": 0.6340192410475681, + "grad_norm": 0.1748046875, + "learning_rate": 7.109220634140558e-05, + "loss": 1.2258, + "step": 9490 + }, + { + "epoch": 0.6343532870122929, + "grad_norm": 0.1669921875, + "learning_rate": 7.098059866694733e-05, + "loss": 1.1345, + "step": 9495 + }, + { + "epoch": 0.6346873329770176, + "grad_norm": 0.169921875, + "learning_rate": 7.086903044989064e-05, + "loss": 1.1749, + "step": 9500 + }, + { + "epoch": 0.6350213789417424, + "grad_norm": 0.1865234375, + "learning_rate": 7.075750184193368e-05, + "loss": 1.1437, + "step": 9505 + }, + { + "epoch": 0.6353554249064671, + "grad_norm": 0.1669921875, + "learning_rate": 7.064601299472087e-05, + "loss": 1.1846, + "step": 9510 + }, + { + "epoch": 0.6356894708711919, + "grad_norm": 0.16796875, + "learning_rate": 7.053456405984253e-05, + "loss": 1.2066, + "step": 9515 + }, + { + "epoch": 0.6360235168359166, + "grad_norm": 0.177734375, + "learning_rate": 7.042315518883467e-05, + "loss": 1.203, + "step": 9520 + }, + { + "epoch": 0.6363575628006414, + "grad_norm": 0.173828125, + "learning_rate": 7.031178653317886e-05, + "loss": 1.2101, + "step": 9525 + }, + { + "epoch": 0.6366916087653661, + "grad_norm": 0.1904296875, + "learning_rate": 7.020045824430205e-05, + "loss": 1.1811, + "step": 9530 + }, + { + "epoch": 0.6370256547300909, + "grad_norm": 0.177734375, + "learning_rate": 7.008917047357624e-05, + "loss": 1.2073, + "step": 9535 + }, + { + "epoch": 0.6373597006948156, + "grad_norm": 0.1767578125, + "learning_rate": 6.997792337231827e-05, + "loss": 1.1933, + "step": 9540 + }, + { + "epoch": 0.6376937466595404, + "grad_norm": 0.169921875, + "learning_rate": 6.986671709178985e-05, + "loss": 1.1735, + "step": 9545 + }, + { + "epoch": 0.6380277926242651, + "grad_norm": 0.1689453125, + "learning_rate": 6.975555178319701e-05, + "loss": 1.2503, + "step": 9550 + }, + { + "epoch": 0.6383618385889899, + "grad_norm": 0.177734375, + "learning_rate": 6.964442759769017e-05, + "loss": 1.2005, + "step": 9555 + }, + { + "epoch": 0.6386958845537146, + "grad_norm": 0.1884765625, + "learning_rate": 6.953334468636386e-05, + "loss": 1.2382, + "step": 9560 + }, + { + "epoch": 0.6390299305184394, + "grad_norm": 0.1689453125, + "learning_rate": 6.942230320025645e-05, + "loss": 1.0983, + "step": 9565 + }, + { + "epoch": 0.6393639764831641, + "grad_norm": 0.1796875, + "learning_rate": 6.931130329034993e-05, + "loss": 1.1712, + "step": 9570 + }, + { + "epoch": 0.6396980224478889, + "grad_norm": 0.1669921875, + "learning_rate": 6.920034510756986e-05, + "loss": 1.2003, + "step": 9575 + }, + { + "epoch": 0.6400320684126136, + "grad_norm": 0.162109375, + "learning_rate": 6.9089428802785e-05, + "loss": 1.15, + "step": 9580 + }, + { + "epoch": 0.6403661143773384, + "grad_norm": 0.18359375, + "learning_rate": 6.897855452680715e-05, + "loss": 1.2444, + "step": 9585 + }, + { + "epoch": 0.6407001603420631, + "grad_norm": 0.1796875, + "learning_rate": 6.886772243039105e-05, + "loss": 1.2818, + "step": 9590 + }, + { + "epoch": 0.6410342063067879, + "grad_norm": 0.181640625, + "learning_rate": 6.875693266423404e-05, + "loss": 1.1941, + "step": 9595 + }, + { + "epoch": 0.6413682522715125, + "grad_norm": 0.1689453125, + "learning_rate": 6.864618537897588e-05, + "loss": 1.184, + "step": 9600 + }, + { + "epoch": 0.6417022982362373, + "grad_norm": 0.1650390625, + "learning_rate": 6.853548072519859e-05, + "loss": 1.114, + "step": 9605 + }, + { + "epoch": 0.642036344200962, + "grad_norm": 0.189453125, + "learning_rate": 6.842481885342625e-05, + "loss": 1.2692, + "step": 9610 + }, + { + "epoch": 0.6423703901656868, + "grad_norm": 0.1669921875, + "learning_rate": 6.831419991412464e-05, + "loss": 1.1408, + "step": 9615 + }, + { + "epoch": 0.6427044361304115, + "grad_norm": 0.158203125, + "learning_rate": 6.820362405770143e-05, + "loss": 1.2433, + "step": 9620 + }, + { + "epoch": 0.6430384820951363, + "grad_norm": 0.1767578125, + "learning_rate": 6.809309143450545e-05, + "loss": 1.2076, + "step": 9625 + }, + { + "epoch": 0.643372528059861, + "grad_norm": 0.177734375, + "learning_rate": 6.798260219482691e-05, + "loss": 1.2467, + "step": 9630 + }, + { + "epoch": 0.6437065740245858, + "grad_norm": 0.1845703125, + "learning_rate": 6.787215648889689e-05, + "loss": 1.2093, + "step": 9635 + }, + { + "epoch": 0.6440406199893105, + "grad_norm": 0.181640625, + "learning_rate": 6.77617544668874e-05, + "loss": 1.2324, + "step": 9640 + }, + { + "epoch": 0.6443746659540353, + "grad_norm": 0.171875, + "learning_rate": 6.765139627891099e-05, + "loss": 1.1822, + "step": 9645 + }, + { + "epoch": 0.64470871191876, + "grad_norm": 0.1845703125, + "learning_rate": 6.754108207502069e-05, + "loss": 1.223, + "step": 9650 + }, + { + "epoch": 0.6450427578834848, + "grad_norm": 0.1611328125, + "learning_rate": 6.743081200520962e-05, + "loss": 1.1462, + "step": 9655 + }, + { + "epoch": 0.6453768038482095, + "grad_norm": 0.1630859375, + "learning_rate": 6.732058621941092e-05, + "loss": 1.2425, + "step": 9660 + }, + { + "epoch": 0.6457108498129343, + "grad_norm": 0.173828125, + "learning_rate": 6.721040486749756e-05, + "loss": 1.245, + "step": 9665 + }, + { + "epoch": 0.646044895777659, + "grad_norm": 0.1689453125, + "learning_rate": 6.710026809928206e-05, + "loss": 1.1829, + "step": 9670 + }, + { + "epoch": 0.6463789417423838, + "grad_norm": 0.1875, + "learning_rate": 6.69901760645163e-05, + "loss": 1.1876, + "step": 9675 + }, + { + "epoch": 0.6467129877071085, + "grad_norm": 0.1689453125, + "learning_rate": 6.68801289128914e-05, + "loss": 1.3096, + "step": 9680 + }, + { + "epoch": 0.6470470336718332, + "grad_norm": 0.1826171875, + "learning_rate": 6.677012679403743e-05, + "loss": 1.2622, + "step": 9685 + }, + { + "epoch": 0.647381079636558, + "grad_norm": 0.1748046875, + "learning_rate": 6.666016985752316e-05, + "loss": 1.2291, + "step": 9690 + }, + { + "epoch": 0.6477151256012827, + "grad_norm": 0.1806640625, + "learning_rate": 6.655025825285601e-05, + "loss": 1.2274, + "step": 9695 + }, + { + "epoch": 0.6480491715660075, + "grad_norm": 0.1767578125, + "learning_rate": 6.644039212948177e-05, + "loss": 1.2035, + "step": 9700 + }, + { + "epoch": 0.6483832175307322, + "grad_norm": 0.1884765625, + "learning_rate": 6.633057163678426e-05, + "loss": 1.229, + "step": 9705 + }, + { + "epoch": 0.648717263495457, + "grad_norm": 0.1767578125, + "learning_rate": 6.622079692408545e-05, + "loss": 1.319, + "step": 9710 + }, + { + "epoch": 0.6490513094601817, + "grad_norm": 0.166015625, + "learning_rate": 6.611106814064492e-05, + "loss": 1.1854, + "step": 9715 + }, + { + "epoch": 0.6493853554249065, + "grad_norm": 0.1689453125, + "learning_rate": 6.600138543565986e-05, + "loss": 1.1671, + "step": 9720 + }, + { + "epoch": 0.6497194013896312, + "grad_norm": 0.169921875, + "learning_rate": 6.589174895826475e-05, + "loss": 1.2083, + "step": 9725 + }, + { + "epoch": 0.650053447354356, + "grad_norm": 0.162109375, + "learning_rate": 6.57821588575313e-05, + "loss": 1.147, + "step": 9730 + }, + { + "epoch": 0.6503874933190807, + "grad_norm": 0.1708984375, + "learning_rate": 6.567261528246806e-05, + "loss": 1.2656, + "step": 9735 + }, + { + "epoch": 0.6507215392838055, + "grad_norm": 0.228515625, + "learning_rate": 6.556311838202046e-05, + "loss": 1.2546, + "step": 9740 + }, + { + "epoch": 0.6510555852485302, + "grad_norm": 0.173828125, + "learning_rate": 6.545366830507034e-05, + "loss": 1.2111, + "step": 9745 + }, + { + "epoch": 0.651389631213255, + "grad_norm": 0.16796875, + "learning_rate": 6.534426520043594e-05, + "loss": 1.2212, + "step": 9750 + }, + { + "epoch": 0.6517236771779797, + "grad_norm": 0.1689453125, + "learning_rate": 6.523490921687157e-05, + "loss": 1.128, + "step": 9755 + }, + { + "epoch": 0.6520577231427044, + "grad_norm": 0.17578125, + "learning_rate": 6.512560050306756e-05, + "loss": 1.197, + "step": 9760 + }, + { + "epoch": 0.6523917691074291, + "grad_norm": 0.1650390625, + "learning_rate": 6.501633920764989e-05, + "loss": 1.2123, + "step": 9765 + }, + { + "epoch": 0.6527258150721539, + "grad_norm": 0.173828125, + "learning_rate": 6.490712547918006e-05, + "loss": 1.2868, + "step": 9770 + }, + { + "epoch": 0.6530598610368786, + "grad_norm": 0.1728515625, + "learning_rate": 6.479795946615501e-05, + "loss": 1.2247, + "step": 9775 + }, + { + "epoch": 0.6533939070016034, + "grad_norm": 0.1708984375, + "learning_rate": 6.468884131700668e-05, + "loss": 1.2421, + "step": 9780 + }, + { + "epoch": 0.6537279529663281, + "grad_norm": 0.1748046875, + "learning_rate": 6.457977118010196e-05, + "loss": 1.2825, + "step": 9785 + }, + { + "epoch": 0.6540619989310529, + "grad_norm": 0.1767578125, + "learning_rate": 6.44707492037425e-05, + "loss": 1.1924, + "step": 9790 + }, + { + "epoch": 0.6543960448957776, + "grad_norm": 0.1953125, + "learning_rate": 6.436177553616438e-05, + "loss": 1.2154, + "step": 9795 + }, + { + "epoch": 0.6547300908605024, + "grad_norm": 0.1826171875, + "learning_rate": 6.42528503255381e-05, + "loss": 1.1975, + "step": 9800 + }, + { + "epoch": 0.6550641368252271, + "grad_norm": 0.1591796875, + "learning_rate": 6.414397371996821e-05, + "loss": 1.0709, + "step": 9805 + }, + { + "epoch": 0.6553981827899519, + "grad_norm": 0.1748046875, + "learning_rate": 6.403514586749318e-05, + "loss": 1.2159, + "step": 9810 + }, + { + "epoch": 0.6557322287546766, + "grad_norm": 0.1728515625, + "learning_rate": 6.392636691608521e-05, + "loss": 1.2559, + "step": 9815 + }, + { + "epoch": 0.6560662747194014, + "grad_norm": 0.169921875, + "learning_rate": 6.381763701365e-05, + "loss": 1.1734, + "step": 9820 + }, + { + "epoch": 0.6564003206841261, + "grad_norm": 0.1611328125, + "learning_rate": 6.370895630802652e-05, + "loss": 1.1594, + "step": 9825 + }, + { + "epoch": 0.6567343666488509, + "grad_norm": 0.154296875, + "learning_rate": 6.36003249469869e-05, + "loss": 1.1063, + "step": 9830 + }, + { + "epoch": 0.6570684126135756, + "grad_norm": 0.17578125, + "learning_rate": 6.349174307823616e-05, + "loss": 1.1498, + "step": 9835 + }, + { + "epoch": 0.6574024585783004, + "grad_norm": 0.1630859375, + "learning_rate": 6.338321084941205e-05, + "loss": 1.2358, + "step": 9840 + }, + { + "epoch": 0.6577365045430251, + "grad_norm": 0.1787109375, + "learning_rate": 6.327472840808478e-05, + "loss": 1.2042, + "step": 9845 + }, + { + "epoch": 0.6580705505077499, + "grad_norm": 0.171875, + "learning_rate": 6.316629590175688e-05, + "loss": 1.1814, + "step": 9850 + }, + { + "epoch": 0.6584045964724746, + "grad_norm": 0.1708984375, + "learning_rate": 6.305791347786299e-05, + "loss": 1.2021, + "step": 9855 + }, + { + "epoch": 0.6587386424371994, + "grad_norm": 0.1669921875, + "learning_rate": 6.294958128376962e-05, + "loss": 1.1821, + "step": 9860 + }, + { + "epoch": 0.6590726884019241, + "grad_norm": 0.17578125, + "learning_rate": 6.284129946677508e-05, + "loss": 1.1893, + "step": 9865 + }, + { + "epoch": 0.6594067343666489, + "grad_norm": 0.166015625, + "learning_rate": 6.27330681741091e-05, + "loss": 1.1647, + "step": 9870 + }, + { + "epoch": 0.6597407803313736, + "grad_norm": 0.173828125, + "learning_rate": 6.26248875529327e-05, + "loss": 1.2649, + "step": 9875 + }, + { + "epoch": 0.6600748262960984, + "grad_norm": 0.1728515625, + "learning_rate": 6.251675775033804e-05, + "loss": 1.2161, + "step": 9880 + }, + { + "epoch": 0.6604088722608231, + "grad_norm": 0.1640625, + "learning_rate": 6.24086789133482e-05, + "loss": 1.1798, + "step": 9885 + }, + { + "epoch": 0.6607429182255479, + "grad_norm": 0.1767578125, + "learning_rate": 6.23006511889169e-05, + "loss": 1.1734, + "step": 9890 + }, + { + "epoch": 0.6610769641902726, + "grad_norm": 0.1748046875, + "learning_rate": 6.219267472392843e-05, + "loss": 1.215, + "step": 9895 + }, + { + "epoch": 0.6614110101549974, + "grad_norm": 0.181640625, + "learning_rate": 6.208474966519735e-05, + "loss": 1.1571, + "step": 9900 + }, + { + "epoch": 0.6617450561197221, + "grad_norm": 0.1708984375, + "learning_rate": 6.197687615946832e-05, + "loss": 1.2906, + "step": 9905 + }, + { + "epoch": 0.6620791020844469, + "grad_norm": 0.1650390625, + "learning_rate": 6.186905435341592e-05, + "loss": 1.1861, + "step": 9910 + }, + { + "epoch": 0.6624131480491716, + "grad_norm": 0.166015625, + "learning_rate": 6.17612843936444e-05, + "loss": 1.1958, + "step": 9915 + }, + { + "epoch": 0.6627471940138963, + "grad_norm": 0.1708984375, + "learning_rate": 6.165356642668754e-05, + "loss": 1.2155, + "step": 9920 + }, + { + "epoch": 0.663081239978621, + "grad_norm": 0.1611328125, + "learning_rate": 6.154590059900849e-05, + "loss": 1.1376, + "step": 9925 + }, + { + "epoch": 0.6634152859433458, + "grad_norm": 0.1796875, + "learning_rate": 6.143828705699936e-05, + "loss": 1.2243, + "step": 9930 + }, + { + "epoch": 0.6637493319080705, + "grad_norm": 0.17578125, + "learning_rate": 6.13307259469813e-05, + "loss": 1.2159, + "step": 9935 + }, + { + "epoch": 0.6640833778727953, + "grad_norm": 0.181640625, + "learning_rate": 6.122321741520412e-05, + "loss": 1.2191, + "step": 9940 + }, + { + "epoch": 0.66441742383752, + "grad_norm": 0.1748046875, + "learning_rate": 6.111576160784611e-05, + "loss": 1.1936, + "step": 9945 + }, + { + "epoch": 0.6647514698022448, + "grad_norm": 0.205078125, + "learning_rate": 6.1008358671013885e-05, + "loss": 1.2535, + "step": 9950 + }, + { + "epoch": 0.6650855157669695, + "grad_norm": 0.169921875, + "learning_rate": 6.090100875074225e-05, + "loss": 1.1455, + "step": 9955 + }, + { + "epoch": 0.6654195617316943, + "grad_norm": 0.1748046875, + "learning_rate": 6.079371199299384e-05, + "loss": 1.1946, + "step": 9960 + }, + { + "epoch": 0.665753607696419, + "grad_norm": 0.16796875, + "learning_rate": 6.0686468543659005e-05, + "loss": 1.2062, + "step": 9965 + }, + { + "epoch": 0.6660876536611438, + "grad_norm": 0.1708984375, + "learning_rate": 6.057927854855565e-05, + "loss": 1.2187, + "step": 9970 + }, + { + "epoch": 0.6664216996258685, + "grad_norm": 0.1796875, + "learning_rate": 6.0472142153428954e-05, + "loss": 1.2224, + "step": 9975 + }, + { + "epoch": 0.6667557455905933, + "grad_norm": 0.1796875, + "learning_rate": 6.036505950395126e-05, + "loss": 1.1661, + "step": 9980 + }, + { + "epoch": 0.667089791555318, + "grad_norm": 0.169921875, + "learning_rate": 6.025803074572185e-05, + "loss": 1.2129, + "step": 9985 + }, + { + "epoch": 0.6674238375200428, + "grad_norm": 0.19140625, + "learning_rate": 6.0151056024266695e-05, + "loss": 1.2235, + "step": 9990 + }, + { + "epoch": 0.6677578834847675, + "grad_norm": 0.1669921875, + "learning_rate": 6.0044135485038265e-05, + "loss": 1.1978, + "step": 9995 + }, + { + "epoch": 0.6680919294494923, + "grad_norm": 0.17578125, + "learning_rate": 5.9937269273415386e-05, + "loss": 1.2076, + "step": 10000 + }, + { + "epoch": 0.668425975414217, + "grad_norm": 0.1767578125, + "learning_rate": 5.983045753470308e-05, + "loss": 1.2702, + "step": 10005 + }, + { + "epoch": 0.6687600213789417, + "grad_norm": 0.1689453125, + "learning_rate": 5.972370041413218e-05, + "loss": 1.241, + "step": 10010 + }, + { + "epoch": 0.6690940673436665, + "grad_norm": 0.1669921875, + "learning_rate": 5.961699805685932e-05, + "loss": 1.2256, + "step": 10015 + }, + { + "epoch": 0.6694281133083912, + "grad_norm": 0.1669921875, + "learning_rate": 5.95103506079667e-05, + "loss": 1.1762, + "step": 10020 + }, + { + "epoch": 0.669762159273116, + "grad_norm": 0.1728515625, + "learning_rate": 5.940375821246186e-05, + "loss": 1.1551, + "step": 10025 + }, + { + "epoch": 0.6700962052378407, + "grad_norm": 0.1689453125, + "learning_rate": 5.9297221015277394e-05, + "loss": 1.1554, + "step": 10030 + }, + { + "epoch": 0.6704302512025655, + "grad_norm": 0.17578125, + "learning_rate": 5.9190739161270956e-05, + "loss": 1.1697, + "step": 10035 + }, + { + "epoch": 0.6707642971672902, + "grad_norm": 0.1669921875, + "learning_rate": 5.9084312795224874e-05, + "loss": 1.1717, + "step": 10040 + }, + { + "epoch": 0.671098343132015, + "grad_norm": 0.169921875, + "learning_rate": 5.8977942061846034e-05, + "loss": 1.2296, + "step": 10045 + }, + { + "epoch": 0.6714323890967397, + "grad_norm": 0.177734375, + "learning_rate": 5.8871627105765746e-05, + "loss": 1.2098, + "step": 10050 + }, + { + "epoch": 0.6717664350614645, + "grad_norm": 0.166015625, + "learning_rate": 5.876536807153943e-05, + "loss": 1.1903, + "step": 10055 + }, + { + "epoch": 0.6721004810261892, + "grad_norm": 0.177734375, + "learning_rate": 5.865916510364648e-05, + "loss": 1.1912, + "step": 10060 + }, + { + "epoch": 0.672434526990914, + "grad_norm": 0.16015625, + "learning_rate": 5.855301834649003e-05, + "loss": 1.1724, + "step": 10065 + }, + { + "epoch": 0.6727685729556387, + "grad_norm": 0.16796875, + "learning_rate": 5.8446927944396815e-05, + "loss": 1.2208, + "step": 10070 + }, + { + "epoch": 0.6731026189203635, + "grad_norm": 0.1689453125, + "learning_rate": 5.834089404161689e-05, + "loss": 1.2138, + "step": 10075 + }, + { + "epoch": 0.6734366648850882, + "grad_norm": 0.16796875, + "learning_rate": 5.8234916782323646e-05, + "loss": 1.2382, + "step": 10080 + }, + { + "epoch": 0.6737707108498129, + "grad_norm": 0.173828125, + "learning_rate": 5.812899631061327e-05, + "loss": 1.1906, + "step": 10085 + }, + { + "epoch": 0.6741047568145376, + "grad_norm": 0.1708984375, + "learning_rate": 5.80231327705048e-05, + "loss": 1.2238, + "step": 10090 + }, + { + "epoch": 0.6744388027792624, + "grad_norm": 0.173828125, + "learning_rate": 5.791732630593991e-05, + "loss": 1.1796, + "step": 10095 + }, + { + "epoch": 0.6747728487439871, + "grad_norm": 0.1669921875, + "learning_rate": 5.781157706078264e-05, + "loss": 1.247, + "step": 10100 + }, + { + "epoch": 0.6751068947087119, + "grad_norm": 0.18359375, + "learning_rate": 5.770588517881918e-05, + "loss": 1.2604, + "step": 10105 + }, + { + "epoch": 0.6754409406734366, + "grad_norm": 0.1728515625, + "learning_rate": 5.760025080375777e-05, + "loss": 1.2595, + "step": 10110 + }, + { + "epoch": 0.6757749866381614, + "grad_norm": 0.16796875, + "learning_rate": 5.749467407922853e-05, + "loss": 1.1983, + "step": 10115 + }, + { + "epoch": 0.6761090326028861, + "grad_norm": 0.1767578125, + "learning_rate": 5.738915514878307e-05, + "loss": 1.2557, + "step": 10120 + }, + { + "epoch": 0.6764430785676109, + "grad_norm": 0.1689453125, + "learning_rate": 5.728369415589443e-05, + "loss": 1.1756, + "step": 10125 + }, + { + "epoch": 0.6767771245323356, + "grad_norm": 0.173828125, + "learning_rate": 5.717829124395699e-05, + "loss": 1.2396, + "step": 10130 + }, + { + "epoch": 0.6771111704970604, + "grad_norm": 0.1689453125, + "learning_rate": 5.707294655628599e-05, + "loss": 1.1818, + "step": 10135 + }, + { + "epoch": 0.6774452164617851, + "grad_norm": 0.1767578125, + "learning_rate": 5.696766023611768e-05, + "loss": 1.1824, + "step": 10140 + }, + { + "epoch": 0.6777792624265099, + "grad_norm": 0.1552734375, + "learning_rate": 5.686243242660876e-05, + "loss": 1.1367, + "step": 10145 + }, + { + "epoch": 0.6781133083912346, + "grad_norm": 0.1591796875, + "learning_rate": 5.6757263270836594e-05, + "loss": 1.2218, + "step": 10150 + }, + { + "epoch": 0.6784473543559594, + "grad_norm": 0.17578125, + "learning_rate": 5.6652152911798585e-05, + "loss": 1.2477, + "step": 10155 + }, + { + "epoch": 0.6787814003206841, + "grad_norm": 0.1611328125, + "learning_rate": 5.6547101492412265e-05, + "loss": 1.2255, + "step": 10160 + }, + { + "epoch": 0.6791154462854089, + "grad_norm": 0.1650390625, + "learning_rate": 5.644210915551509e-05, + "loss": 1.2308, + "step": 10165 + }, + { + "epoch": 0.6794494922501336, + "grad_norm": 0.1611328125, + "learning_rate": 5.633717604386415e-05, + "loss": 1.1479, + "step": 10170 + }, + { + "epoch": 0.6797835382148584, + "grad_norm": 0.16796875, + "learning_rate": 5.6232302300135966e-05, + "loss": 1.171, + "step": 10175 + }, + { + "epoch": 0.6801175841795831, + "grad_norm": 0.1806640625, + "learning_rate": 5.612748806692632e-05, + "loss": 1.2255, + "step": 10180 + }, + { + "epoch": 0.6804516301443079, + "grad_norm": 0.171875, + "learning_rate": 5.602273348675019e-05, + "loss": 1.1911, + "step": 10185 + }, + { + "epoch": 0.6807856761090326, + "grad_norm": 0.1611328125, + "learning_rate": 5.5918038702041343e-05, + "loss": 1.1586, + "step": 10190 + }, + { + "epoch": 0.6811197220737574, + "grad_norm": 0.16796875, + "learning_rate": 5.5813403855152224e-05, + "loss": 1.2018, + "step": 10195 + }, + { + "epoch": 0.6814537680384821, + "grad_norm": 0.16796875, + "learning_rate": 5.5708829088353875e-05, + "loss": 1.1946, + "step": 10200 + }, + { + "epoch": 0.6817878140032069, + "grad_norm": 0.169921875, + "learning_rate": 5.560431454383566e-05, + "loss": 1.2084, + "step": 10205 + }, + { + "epoch": 0.6821218599679316, + "grad_norm": 0.1650390625, + "learning_rate": 5.549986036370491e-05, + "loss": 1.2056, + "step": 10210 + }, + { + "epoch": 0.6824559059326564, + "grad_norm": 0.16796875, + "learning_rate": 5.5395466689987044e-05, + "loss": 1.1836, + "step": 10215 + }, + { + "epoch": 0.6827899518973811, + "grad_norm": 0.171875, + "learning_rate": 5.529113366462504e-05, + "loss": 1.1598, + "step": 10220 + }, + { + "epoch": 0.6831239978621059, + "grad_norm": 0.1748046875, + "learning_rate": 5.518686142947962e-05, + "loss": 1.1612, + "step": 10225 + }, + { + "epoch": 0.6834580438268306, + "grad_norm": 0.17578125, + "learning_rate": 5.508265012632865e-05, + "loss": 1.2547, + "step": 10230 + }, + { + "epoch": 0.6837920897915554, + "grad_norm": 0.1787109375, + "learning_rate": 5.497849989686732e-05, + "loss": 1.2068, + "step": 10235 + }, + { + "epoch": 0.6841261357562801, + "grad_norm": 0.1728515625, + "learning_rate": 5.4874410882707635e-05, + "loss": 1.2069, + "step": 10240 + }, + { + "epoch": 0.6844601817210048, + "grad_norm": 0.1640625, + "learning_rate": 5.4770383225378486e-05, + "loss": 1.1858, + "step": 10245 + }, + { + "epoch": 0.6847942276857295, + "grad_norm": 0.16796875, + "learning_rate": 5.466641706632525e-05, + "loss": 1.1981, + "step": 10250 + }, + { + "epoch": 0.6851282736504543, + "grad_norm": 0.1728515625, + "learning_rate": 5.456251254690967e-05, + "loss": 1.2036, + "step": 10255 + }, + { + "epoch": 0.685462319615179, + "grad_norm": 0.1572265625, + "learning_rate": 5.4458669808409766e-05, + "loss": 1.2206, + "step": 10260 + }, + { + "epoch": 0.6857963655799038, + "grad_norm": 0.1611328125, + "learning_rate": 5.435488899201957e-05, + "loss": 1.1534, + "step": 10265 + }, + { + "epoch": 0.6861304115446285, + "grad_norm": 0.1767578125, + "learning_rate": 5.42511702388488e-05, + "loss": 1.2095, + "step": 10270 + }, + { + "epoch": 0.6864644575093533, + "grad_norm": 0.1728515625, + "learning_rate": 5.4147513689922815e-05, + "loss": 1.1465, + "step": 10275 + }, + { + "epoch": 0.686798503474078, + "grad_norm": 0.162109375, + "learning_rate": 5.404391948618252e-05, + "loss": 1.2494, + "step": 10280 + }, + { + "epoch": 0.6871325494388028, + "grad_norm": 0.166015625, + "learning_rate": 5.39403877684839e-05, + "loss": 1.2441, + "step": 10285 + }, + { + "epoch": 0.6874665954035275, + "grad_norm": 0.1689453125, + "learning_rate": 5.3836918677598015e-05, + "loss": 1.1477, + "step": 10290 + }, + { + "epoch": 0.6878006413682523, + "grad_norm": 0.1787109375, + "learning_rate": 5.373351235421084e-05, + "loss": 1.244, + "step": 10295 + }, + { + "epoch": 0.688134687332977, + "grad_norm": 0.1787109375, + "learning_rate": 5.3630168938922984e-05, + "loss": 1.252, + "step": 10300 + }, + { + "epoch": 0.6884687332977018, + "grad_norm": 0.1865234375, + "learning_rate": 5.352688857224945e-05, + "loss": 1.1898, + "step": 10305 + }, + { + "epoch": 0.6888027792624265, + "grad_norm": 0.16015625, + "learning_rate": 5.3423671394619554e-05, + "loss": 1.1629, + "step": 10310 + }, + { + "epoch": 0.6891368252271513, + "grad_norm": 0.1650390625, + "learning_rate": 5.332051754637676e-05, + "loss": 1.2108, + "step": 10315 + }, + { + "epoch": 0.689470871191876, + "grad_norm": 0.15625, + "learning_rate": 5.321742716777829e-05, + "loss": 1.1881, + "step": 10320 + }, + { + "epoch": 0.6898049171566007, + "grad_norm": 0.1630859375, + "learning_rate": 5.311440039899521e-05, + "loss": 1.1336, + "step": 10325 + }, + { + "epoch": 0.6901389631213255, + "grad_norm": 0.1728515625, + "learning_rate": 5.301143738011197e-05, + "loss": 1.1887, + "step": 10330 + }, + { + "epoch": 0.6904730090860502, + "grad_norm": 0.1669921875, + "learning_rate": 5.290853825112647e-05, + "loss": 1.1783, + "step": 10335 + }, + { + "epoch": 0.690807055050775, + "grad_norm": 0.17578125, + "learning_rate": 5.2805703151949616e-05, + "loss": 1.2454, + "step": 10340 + }, + { + "epoch": 0.6911411010154997, + "grad_norm": 0.1591796875, + "learning_rate": 5.2702932222405286e-05, + "loss": 1.2103, + "step": 10345 + }, + { + "epoch": 0.6914751469802245, + "grad_norm": 0.173828125, + "learning_rate": 5.2600225602230166e-05, + "loss": 1.2498, + "step": 10350 + }, + { + "epoch": 0.6918091929449492, + "grad_norm": 0.1689453125, + "learning_rate": 5.249758343107348e-05, + "loss": 1.1575, + "step": 10355 + }, + { + "epoch": 0.692143238909674, + "grad_norm": 0.1708984375, + "learning_rate": 5.239500584849678e-05, + "loss": 1.2488, + "step": 10360 + }, + { + "epoch": 0.6924772848743987, + "grad_norm": 0.16796875, + "learning_rate": 5.229249299397378e-05, + "loss": 1.2236, + "step": 10365 + }, + { + "epoch": 0.6928113308391235, + "grad_norm": 0.181640625, + "learning_rate": 5.219004500689031e-05, + "loss": 1.1939, + "step": 10370 + }, + { + "epoch": 0.6931453768038482, + "grad_norm": 0.171875, + "learning_rate": 5.2087662026543846e-05, + "loss": 1.2497, + "step": 10375 + }, + { + "epoch": 0.693479422768573, + "grad_norm": 0.193359375, + "learning_rate": 5.1985344192143534e-05, + "loss": 1.1932, + "step": 10380 + }, + { + "epoch": 0.6938134687332977, + "grad_norm": 0.16796875, + "learning_rate": 5.188309164281e-05, + "loss": 1.1782, + "step": 10385 + }, + { + "epoch": 0.6941475146980225, + "grad_norm": 0.177734375, + "learning_rate": 5.1780904517575046e-05, + "loss": 1.2641, + "step": 10390 + }, + { + "epoch": 0.6944815606627472, + "grad_norm": 0.169921875, + "learning_rate": 5.1678782955381534e-05, + "loss": 1.1262, + "step": 10395 + }, + { + "epoch": 0.694815606627472, + "grad_norm": 0.1826171875, + "learning_rate": 5.157672709508312e-05, + "loss": 1.2646, + "step": 10400 + }, + { + "epoch": 0.6951496525921966, + "grad_norm": 0.166015625, + "learning_rate": 5.147473707544425e-05, + "loss": 1.1851, + "step": 10405 + }, + { + "epoch": 0.6954836985569214, + "grad_norm": 0.1875, + "learning_rate": 5.137281303513969e-05, + "loss": 1.1714, + "step": 10410 + }, + { + "epoch": 0.6958177445216461, + "grad_norm": 0.171875, + "learning_rate": 5.127095511275466e-05, + "loss": 1.2067, + "step": 10415 + }, + { + "epoch": 0.6961517904863709, + "grad_norm": 0.1650390625, + "learning_rate": 5.116916344678435e-05, + "loss": 1.2317, + "step": 10420 + }, + { + "epoch": 0.6964858364510956, + "grad_norm": 0.1650390625, + "learning_rate": 5.106743817563395e-05, + "loss": 1.2716, + "step": 10425 + }, + { + "epoch": 0.6968198824158204, + "grad_norm": 0.177734375, + "learning_rate": 5.0965779437618314e-05, + "loss": 1.1254, + "step": 10430 + }, + { + "epoch": 0.6971539283805451, + "grad_norm": 0.1806640625, + "learning_rate": 5.086418737096186e-05, + "loss": 1.211, + "step": 10435 + }, + { + "epoch": 0.6974879743452699, + "grad_norm": 0.1728515625, + "learning_rate": 5.076266211379826e-05, + "loss": 1.2339, + "step": 10440 + }, + { + "epoch": 0.6978220203099946, + "grad_norm": 0.169921875, + "learning_rate": 5.066120380417056e-05, + "loss": 1.2717, + "step": 10445 + }, + { + "epoch": 0.6981560662747194, + "grad_norm": 0.1748046875, + "learning_rate": 5.055981258003064e-05, + "loss": 1.2531, + "step": 10450 + }, + { + "epoch": 0.6984901122394441, + "grad_norm": 0.1630859375, + "learning_rate": 5.0458488579239116e-05, + "loss": 1.1889, + "step": 10455 + }, + { + "epoch": 0.6988241582041689, + "grad_norm": 0.1767578125, + "learning_rate": 5.035723193956523e-05, + "loss": 1.1753, + "step": 10460 + }, + { + "epoch": 0.6991582041688936, + "grad_norm": 0.1689453125, + "learning_rate": 5.025604279868676e-05, + "loss": 1.303, + "step": 10465 + }, + { + "epoch": 0.6994922501336184, + "grad_norm": 0.1728515625, + "learning_rate": 5.015492129418957e-05, + "loss": 1.2091, + "step": 10470 + }, + { + "epoch": 0.6998262960983431, + "grad_norm": 0.169921875, + "learning_rate": 5.005386756356754e-05, + "loss": 1.2594, + "step": 10475 + }, + { + "epoch": 0.7001603420630679, + "grad_norm": 0.171875, + "learning_rate": 4.995288174422251e-05, + "loss": 1.1741, + "step": 10480 + }, + { + "epoch": 0.7004943880277926, + "grad_norm": 0.1689453125, + "learning_rate": 4.985196397346397e-05, + "loss": 1.2114, + "step": 10485 + }, + { + "epoch": 0.7008284339925174, + "grad_norm": 0.181640625, + "learning_rate": 4.97511143885088e-05, + "loss": 1.1966, + "step": 10490 + }, + { + "epoch": 0.7011624799572421, + "grad_norm": 0.1728515625, + "learning_rate": 4.9650333126481174e-05, + "loss": 1.1899, + "step": 10495 + }, + { + "epoch": 0.7014965259219669, + "grad_norm": 0.1767578125, + "learning_rate": 4.954962032441249e-05, + "loss": 1.1736, + "step": 10500 + }, + { + "epoch": 0.7018305718866916, + "grad_norm": 0.1708984375, + "learning_rate": 4.9448976119240895e-05, + "loss": 1.1972, + "step": 10505 + }, + { + "epoch": 0.7021646178514164, + "grad_norm": 0.1591796875, + "learning_rate": 4.934840064781143e-05, + "loss": 1.2075, + "step": 10510 + }, + { + "epoch": 0.7024986638161411, + "grad_norm": 0.17578125, + "learning_rate": 4.924789404687552e-05, + "loss": 1.2075, + "step": 10515 + }, + { + "epoch": 0.7028327097808659, + "grad_norm": 0.17578125, + "learning_rate": 4.914745645309111e-05, + "loss": 1.198, + "step": 10520 + }, + { + "epoch": 0.7031667557455906, + "grad_norm": 0.173828125, + "learning_rate": 4.904708800302218e-05, + "loss": 1.2011, + "step": 10525 + }, + { + "epoch": 0.7035008017103154, + "grad_norm": 0.16796875, + "learning_rate": 4.8946788833138724e-05, + "loss": 1.1074, + "step": 10530 + }, + { + "epoch": 0.7038348476750401, + "grad_norm": 0.1669921875, + "learning_rate": 4.884655907981659e-05, + "loss": 1.1243, + "step": 10535 + }, + { + "epoch": 0.7041688936397649, + "grad_norm": 0.1748046875, + "learning_rate": 4.874639887933725e-05, + "loss": 1.1672, + "step": 10540 + }, + { + "epoch": 0.7045029396044896, + "grad_norm": 0.1591796875, + "learning_rate": 4.864630836788753e-05, + "loss": 1.1574, + "step": 10545 + }, + { + "epoch": 0.7048369855692144, + "grad_norm": 0.171875, + "learning_rate": 4.854628768155951e-05, + "loss": 1.1272, + "step": 10550 + }, + { + "epoch": 0.7051710315339391, + "grad_norm": 0.1689453125, + "learning_rate": 4.844633695635041e-05, + "loss": 1.2193, + "step": 10555 + }, + { + "epoch": 0.7055050774986639, + "grad_norm": 0.17578125, + "learning_rate": 4.834645632816227e-05, + "loss": 1.2342, + "step": 10560 + }, + { + "epoch": 0.7058391234633885, + "grad_norm": 0.173828125, + "learning_rate": 4.824664593280175e-05, + "loss": 1.2653, + "step": 10565 + }, + { + "epoch": 0.7061731694281133, + "grad_norm": 0.1728515625, + "learning_rate": 4.8146905905980144e-05, + "loss": 1.1794, + "step": 10570 + }, + { + "epoch": 0.706507215392838, + "grad_norm": 0.169921875, + "learning_rate": 4.804723638331303e-05, + "loss": 1.2321, + "step": 10575 + }, + { + "epoch": 0.7068412613575628, + "grad_norm": 0.1787109375, + "learning_rate": 4.7947637500320084e-05, + "loss": 1.1332, + "step": 10580 + }, + { + "epoch": 0.7071753073222875, + "grad_norm": 0.177734375, + "learning_rate": 4.7848109392424914e-05, + "loss": 1.2459, + "step": 10585 + }, + { + "epoch": 0.7075093532870123, + "grad_norm": 0.18359375, + "learning_rate": 4.774865219495498e-05, + "loss": 1.254, + "step": 10590 + }, + { + "epoch": 0.707843399251737, + "grad_norm": 0.1953125, + "learning_rate": 4.7649266043141236e-05, + "loss": 1.2396, + "step": 10595 + }, + { + "epoch": 0.7081774452164618, + "grad_norm": 0.1669921875, + "learning_rate": 4.7549951072118137e-05, + "loss": 1.125, + "step": 10600 + }, + { + "epoch": 0.7085114911811865, + "grad_norm": 0.1650390625, + "learning_rate": 4.7450707416923225e-05, + "loss": 1.2224, + "step": 10605 + }, + { + "epoch": 0.7088455371459113, + "grad_norm": 0.16796875, + "learning_rate": 4.7351535212497213e-05, + "loss": 1.2234, + "step": 10610 + }, + { + "epoch": 0.709179583110636, + "grad_norm": 0.173828125, + "learning_rate": 4.725243459368358e-05, + "loss": 1.1677, + "step": 10615 + }, + { + "epoch": 0.7095136290753608, + "grad_norm": 0.17578125, + "learning_rate": 4.7153405695228457e-05, + "loss": 1.2209, + "step": 10620 + }, + { + "epoch": 0.7098476750400855, + "grad_norm": 0.18359375, + "learning_rate": 4.7054448651780434e-05, + "loss": 1.1557, + "step": 10625 + }, + { + "epoch": 0.7101817210048103, + "grad_norm": 0.181640625, + "learning_rate": 4.695556359789061e-05, + "loss": 1.2074, + "step": 10630 + }, + { + "epoch": 0.710515766969535, + "grad_norm": 0.1708984375, + "learning_rate": 4.685675066801194e-05, + "loss": 1.0386, + "step": 10635 + }, + { + "epoch": 0.7108498129342598, + "grad_norm": 0.17578125, + "learning_rate": 4.675800999649943e-05, + "loss": 1.2289, + "step": 10640 + }, + { + "epoch": 0.7111838588989845, + "grad_norm": 0.1640625, + "learning_rate": 4.66593417176098e-05, + "loss": 1.2262, + "step": 10645 + }, + { + "epoch": 0.7115179048637092, + "grad_norm": 0.1689453125, + "learning_rate": 4.656074596550142e-05, + "loss": 1.1148, + "step": 10650 + }, + { + "epoch": 0.711851950828434, + "grad_norm": 0.158203125, + "learning_rate": 4.646222287423391e-05, + "loss": 1.2033, + "step": 10655 + }, + { + "epoch": 0.7121859967931587, + "grad_norm": 0.1708984375, + "learning_rate": 4.636377257776826e-05, + "loss": 1.1739, + "step": 10660 + }, + { + "epoch": 0.7125200427578835, + "grad_norm": 0.1669921875, + "learning_rate": 4.626539520996632e-05, + "loss": 1.1661, + "step": 10665 + }, + { + "epoch": 0.7128540887226082, + "grad_norm": 0.16015625, + "learning_rate": 4.6167090904590916e-05, + "loss": 1.2023, + "step": 10670 + }, + { + "epoch": 0.713188134687333, + "grad_norm": 0.1982421875, + "learning_rate": 4.606885979530544e-05, + "loss": 1.2215, + "step": 10675 + }, + { + "epoch": 0.7135221806520577, + "grad_norm": 0.181640625, + "learning_rate": 4.597070201567374e-05, + "loss": 1.2264, + "step": 10680 + }, + { + "epoch": 0.7138562266167825, + "grad_norm": 0.17578125, + "learning_rate": 4.5872617699160095e-05, + "loss": 1.1838, + "step": 10685 + }, + { + "epoch": 0.7141902725815072, + "grad_norm": 0.1904296875, + "learning_rate": 4.577460697912873e-05, + "loss": 1.2051, + "step": 10690 + }, + { + "epoch": 0.714524318546232, + "grad_norm": 0.169921875, + "learning_rate": 4.567666998884395e-05, + "loss": 1.1821, + "step": 10695 + }, + { + "epoch": 0.7148583645109567, + "grad_norm": 0.1728515625, + "learning_rate": 4.557880686146968e-05, + "loss": 1.2537, + "step": 10700 + }, + { + "epoch": 0.7151924104756815, + "grad_norm": 0.189453125, + "learning_rate": 4.548101773006953e-05, + "loss": 1.1674, + "step": 10705 + }, + { + "epoch": 0.7155264564404062, + "grad_norm": 0.1728515625, + "learning_rate": 4.5383302727606426e-05, + "loss": 1.1552, + "step": 10710 + }, + { + "epoch": 0.715860502405131, + "grad_norm": 0.1748046875, + "learning_rate": 4.528566198694246e-05, + "loss": 1.2517, + "step": 10715 + }, + { + "epoch": 0.7161945483698557, + "grad_norm": 0.193359375, + "learning_rate": 4.5188095640838865e-05, + "loss": 1.176, + "step": 10720 + }, + { + "epoch": 0.7165285943345805, + "grad_norm": 0.1611328125, + "learning_rate": 4.50906038219557e-05, + "loss": 1.1779, + "step": 10725 + }, + { + "epoch": 0.7168626402993051, + "grad_norm": 0.1923828125, + "learning_rate": 4.499318666285162e-05, + "loss": 1.1945, + "step": 10730 + }, + { + "epoch": 0.7171966862640299, + "grad_norm": 0.177734375, + "learning_rate": 4.489584429598375e-05, + "loss": 1.2406, + "step": 10735 + }, + { + "epoch": 0.7175307322287546, + "grad_norm": 0.1728515625, + "learning_rate": 4.4798576853707664e-05, + "loss": 1.207, + "step": 10740 + }, + { + "epoch": 0.7178647781934794, + "grad_norm": 0.1787109375, + "learning_rate": 4.470138446827692e-05, + "loss": 1.2622, + "step": 10745 + }, + { + "epoch": 0.7181988241582041, + "grad_norm": 0.1708984375, + "learning_rate": 4.4604267271843046e-05, + "loss": 1.1559, + "step": 10750 + }, + { + "epoch": 0.7185328701229289, + "grad_norm": 0.1728515625, + "learning_rate": 4.4507225396455385e-05, + "loss": 1.2233, + "step": 10755 + }, + { + "epoch": 0.7188669160876536, + "grad_norm": 0.169921875, + "learning_rate": 4.44102589740609e-05, + "loss": 1.2279, + "step": 10760 + }, + { + "epoch": 0.7192009620523784, + "grad_norm": 0.16796875, + "learning_rate": 4.431336813650385e-05, + "loss": 1.2143, + "step": 10765 + }, + { + "epoch": 0.7195350080171031, + "grad_norm": 0.21484375, + "learning_rate": 4.421655301552575e-05, + "loss": 1.2, + "step": 10770 + }, + { + "epoch": 0.7198690539818279, + "grad_norm": 0.1689453125, + "learning_rate": 4.411981374276527e-05, + "loss": 1.2192, + "step": 10775 + }, + { + "epoch": 0.7202030999465526, + "grad_norm": 0.1669921875, + "learning_rate": 4.402315044975778e-05, + "loss": 1.1967, + "step": 10780 + }, + { + "epoch": 0.7205371459112774, + "grad_norm": 0.1884765625, + "learning_rate": 4.3926563267935514e-05, + "loss": 1.2115, + "step": 10785 + }, + { + "epoch": 0.7208711918760021, + "grad_norm": 0.1796875, + "learning_rate": 4.383005232862707e-05, + "loss": 1.2096, + "step": 10790 + }, + { + "epoch": 0.7212052378407269, + "grad_norm": 0.1826171875, + "learning_rate": 4.37336177630575e-05, + "loss": 1.1461, + "step": 10795 + }, + { + "epoch": 0.7215392838054516, + "grad_norm": 0.177734375, + "learning_rate": 4.363725970234794e-05, + "loss": 1.2384, + "step": 10800 + }, + { + "epoch": 0.7218733297701764, + "grad_norm": 0.173828125, + "learning_rate": 4.354097827751552e-05, + "loss": 1.2205, + "step": 10805 + }, + { + "epoch": 0.7222073757349011, + "grad_norm": 0.2119140625, + "learning_rate": 4.344477361947309e-05, + "loss": 1.1431, + "step": 10810 + }, + { + "epoch": 0.7225414216996259, + "grad_norm": 0.177734375, + "learning_rate": 4.334864585902935e-05, + "loss": 1.1884, + "step": 10815 + }, + { + "epoch": 0.7228754676643506, + "grad_norm": 0.1650390625, + "learning_rate": 4.3252595126888205e-05, + "loss": 1.1578, + "step": 10820 + }, + { + "epoch": 0.7232095136290754, + "grad_norm": 0.1669921875, + "learning_rate": 4.31566215536489e-05, + "loss": 1.2039, + "step": 10825 + }, + { + "epoch": 0.7235435595938001, + "grad_norm": 0.181640625, + "learning_rate": 4.3060725269805846e-05, + "loss": 1.1527, + "step": 10830 + }, + { + "epoch": 0.7238776055585249, + "grad_norm": 0.173828125, + "learning_rate": 4.296490640574826e-05, + "loss": 1.174, + "step": 10835 + }, + { + "epoch": 0.7242116515232496, + "grad_norm": 0.169921875, + "learning_rate": 4.2869165091760086e-05, + "loss": 1.2951, + "step": 10840 + }, + { + "epoch": 0.7245456974879744, + "grad_norm": 0.1708984375, + "learning_rate": 4.2773501458019936e-05, + "loss": 1.2161, + "step": 10845 + }, + { + "epoch": 0.7248797434526991, + "grad_norm": 0.162109375, + "learning_rate": 4.267791563460074e-05, + "loss": 1.1598, + "step": 10850 + }, + { + "epoch": 0.7252137894174239, + "grad_norm": 0.1845703125, + "learning_rate": 4.258240775146961e-05, + "loss": 1.2698, + "step": 10855 + }, + { + "epoch": 0.7255478353821486, + "grad_norm": 0.173828125, + "learning_rate": 4.248697793848768e-05, + "loss": 1.2178, + "step": 10860 + }, + { + "epoch": 0.7258818813468734, + "grad_norm": 0.1630859375, + "learning_rate": 4.239162632540994e-05, + "loss": 1.1651, + "step": 10865 + }, + { + "epoch": 0.7262159273115981, + "grad_norm": 0.1728515625, + "learning_rate": 4.229635304188507e-05, + "loss": 1.2835, + "step": 10870 + }, + { + "epoch": 0.7265499732763229, + "grad_norm": 0.1669921875, + "learning_rate": 4.2201158217455296e-05, + "loss": 1.1627, + "step": 10875 + }, + { + "epoch": 0.7268840192410476, + "grad_norm": 0.15234375, + "learning_rate": 4.210604198155607e-05, + "loss": 1.2025, + "step": 10880 + }, + { + "epoch": 0.7272180652057724, + "grad_norm": 0.1689453125, + "learning_rate": 4.201100446351597e-05, + "loss": 1.2049, + "step": 10885 + }, + { + "epoch": 0.727552111170497, + "grad_norm": 0.17578125, + "learning_rate": 4.1916045792556694e-05, + "loss": 1.2161, + "step": 10890 + }, + { + "epoch": 0.7278861571352218, + "grad_norm": 0.1708984375, + "learning_rate": 4.182116609779259e-05, + "loss": 1.1577, + "step": 10895 + }, + { + "epoch": 0.7282202030999465, + "grad_norm": 0.189453125, + "learning_rate": 4.1726365508230616e-05, + "loss": 1.2406, + "step": 10900 + }, + { + "epoch": 0.7285542490646713, + "grad_norm": 0.1650390625, + "learning_rate": 4.163164415277029e-05, + "loss": 1.173, + "step": 10905 + }, + { + "epoch": 0.728888295029396, + "grad_norm": 0.2060546875, + "learning_rate": 4.1537002160203344e-05, + "loss": 1.2019, + "step": 10910 + }, + { + "epoch": 0.7292223409941208, + "grad_norm": 0.166015625, + "learning_rate": 4.1442439659213564e-05, + "loss": 1.2311, + "step": 10915 + }, + { + "epoch": 0.7295563869588455, + "grad_norm": 0.1669921875, + "learning_rate": 4.134795677837663e-05, + "loss": 1.1683, + "step": 10920 + }, + { + "epoch": 0.7298904329235703, + "grad_norm": 0.1572265625, + "learning_rate": 4.125355364616009e-05, + "loss": 1.1664, + "step": 10925 + }, + { + "epoch": 0.730224478888295, + "grad_norm": 0.171875, + "learning_rate": 4.115923039092293e-05, + "loss": 1.1981, + "step": 10930 + }, + { + "epoch": 0.7305585248530198, + "grad_norm": 0.17578125, + "learning_rate": 4.1064987140915544e-05, + "loss": 1.1545, + "step": 10935 + }, + { + "epoch": 0.7308925708177445, + "grad_norm": 0.1748046875, + "learning_rate": 4.097082402427962e-05, + "loss": 1.1627, + "step": 10940 + }, + { + "epoch": 0.7312266167824693, + "grad_norm": 0.171875, + "learning_rate": 4.087674116904786e-05, + "loss": 1.2434, + "step": 10945 + }, + { + "epoch": 0.731560662747194, + "grad_norm": 0.18359375, + "learning_rate": 4.07827387031438e-05, + "loss": 1.1772, + "step": 10950 + }, + { + "epoch": 0.7318947087119188, + "grad_norm": 0.1748046875, + "learning_rate": 4.068881675438165e-05, + "loss": 1.1721, + "step": 10955 + }, + { + "epoch": 0.7322287546766435, + "grad_norm": 0.1708984375, + "learning_rate": 4.0594975450466255e-05, + "loss": 1.141, + "step": 10960 + }, + { + "epoch": 0.7325628006413683, + "grad_norm": 0.1748046875, + "learning_rate": 4.050121491899266e-05, + "loss": 1.2159, + "step": 10965 + }, + { + "epoch": 0.732896846606093, + "grad_norm": 0.1708984375, + "learning_rate": 4.040753528744623e-05, + "loss": 1.219, + "step": 10970 + }, + { + "epoch": 0.7332308925708177, + "grad_norm": 0.166015625, + "learning_rate": 4.0313936683202205e-05, + "loss": 1.1963, + "step": 10975 + }, + { + "epoch": 0.7335649385355425, + "grad_norm": 0.16796875, + "learning_rate": 4.0220419233525754e-05, + "loss": 1.1704, + "step": 10980 + }, + { + "epoch": 0.7338989845002672, + "grad_norm": 0.1708984375, + "learning_rate": 4.0126983065571643e-05, + "loss": 1.1832, + "step": 10985 + }, + { + "epoch": 0.734233030464992, + "grad_norm": 0.1669921875, + "learning_rate": 4.003362830638409e-05, + "loss": 1.2385, + "step": 10990 + }, + { + "epoch": 0.7345670764297167, + "grad_norm": 0.171875, + "learning_rate": 3.9940355082896694e-05, + "loss": 1.1976, + "step": 10995 + }, + { + "epoch": 0.7349011223944415, + "grad_norm": 0.1767578125, + "learning_rate": 3.984716352193222e-05, + "loss": 1.1941, + "step": 11000 + }, + { + "epoch": 0.7352351683591662, + "grad_norm": 0.1669921875, + "learning_rate": 3.975405375020228e-05, + "loss": 1.1729, + "step": 11005 + }, + { + "epoch": 0.735569214323891, + "grad_norm": 0.171875, + "learning_rate": 3.96610258943073e-05, + "loss": 1.2462, + "step": 11010 + }, + { + "epoch": 0.7359032602886157, + "grad_norm": 0.1875, + "learning_rate": 3.956808008073646e-05, + "loss": 1.2269, + "step": 11015 + }, + { + "epoch": 0.7362373062533405, + "grad_norm": 0.1630859375, + "learning_rate": 3.9475216435867225e-05, + "loss": 1.1627, + "step": 11020 + }, + { + "epoch": 0.7365713522180652, + "grad_norm": 0.181640625, + "learning_rate": 3.938243508596539e-05, + "loss": 1.1939, + "step": 11025 + }, + { + "epoch": 0.73690539818279, + "grad_norm": 0.1650390625, + "learning_rate": 3.9289736157184876e-05, + "loss": 1.2428, + "step": 11030 + }, + { + "epoch": 0.7372394441475147, + "grad_norm": 0.171875, + "learning_rate": 3.9197119775567595e-05, + "loss": 1.1643, + "step": 11035 + }, + { + "epoch": 0.7375734901122395, + "grad_norm": 0.181640625, + "learning_rate": 3.910458606704309e-05, + "loss": 1.1663, + "step": 11040 + }, + { + "epoch": 0.7379075360769642, + "grad_norm": 0.17578125, + "learning_rate": 3.901213515742856e-05, + "loss": 1.2113, + "step": 11045 + }, + { + "epoch": 0.7382415820416889, + "grad_norm": 0.181640625, + "learning_rate": 3.891976717242861e-05, + "loss": 1.0892, + "step": 11050 + }, + { + "epoch": 0.7385756280064136, + "grad_norm": 0.1669921875, + "learning_rate": 3.8827482237635105e-05, + "loss": 1.2396, + "step": 11055 + }, + { + "epoch": 0.7389096739711384, + "grad_norm": 0.1669921875, + "learning_rate": 3.8735280478527035e-05, + "loss": 1.1426, + "step": 11060 + }, + { + "epoch": 0.7392437199358631, + "grad_norm": 0.1796875, + "learning_rate": 3.8643162020470224e-05, + "loss": 1.1233, + "step": 11065 + }, + { + "epoch": 0.7395777659005879, + "grad_norm": 0.1669921875, + "learning_rate": 3.85511269887172e-05, + "loss": 1.2431, + "step": 11070 + }, + { + "epoch": 0.7399118118653126, + "grad_norm": 0.171875, + "learning_rate": 3.8459175508407184e-05, + "loss": 1.283, + "step": 11075 + }, + { + "epoch": 0.7402458578300374, + "grad_norm": 0.1767578125, + "learning_rate": 3.8367307704565706e-05, + "loss": 1.1919, + "step": 11080 + }, + { + "epoch": 0.7405799037947621, + "grad_norm": 0.18359375, + "learning_rate": 3.827552370210448e-05, + "loss": 1.2189, + "step": 11085 + }, + { + "epoch": 0.7409139497594869, + "grad_norm": 0.1669921875, + "learning_rate": 3.818382362582137e-05, + "loss": 1.2248, + "step": 11090 + }, + { + "epoch": 0.7412479957242116, + "grad_norm": 0.1767578125, + "learning_rate": 3.809220760040014e-05, + "loss": 1.1446, + "step": 11095 + }, + { + "epoch": 0.7415820416889364, + "grad_norm": 0.1640625, + "learning_rate": 3.8000675750410186e-05, + "loss": 1.1236, + "step": 11100 + }, + { + "epoch": 0.7419160876536611, + "grad_norm": 0.1689453125, + "learning_rate": 3.7909228200306436e-05, + "loss": 1.1346, + "step": 11105 + }, + { + "epoch": 0.7422501336183859, + "grad_norm": 0.1640625, + "learning_rate": 3.7817865074429314e-05, + "loss": 1.1425, + "step": 11110 + }, + { + "epoch": 0.7425841795831106, + "grad_norm": 0.166015625, + "learning_rate": 3.7726586497004334e-05, + "loss": 1.1919, + "step": 11115 + }, + { + "epoch": 0.7429182255478354, + "grad_norm": 0.169921875, + "learning_rate": 3.7635392592142174e-05, + "loss": 1.1797, + "step": 11120 + }, + { + "epoch": 0.7432522715125601, + "grad_norm": 0.1708984375, + "learning_rate": 3.7544283483838215e-05, + "loss": 1.2874, + "step": 11125 + }, + { + "epoch": 0.7435863174772849, + "grad_norm": 0.166015625, + "learning_rate": 3.745325929597272e-05, + "loss": 1.1107, + "step": 11130 + }, + { + "epoch": 0.7439203634420096, + "grad_norm": 0.1748046875, + "learning_rate": 3.736232015231038e-05, + "loss": 1.1931, + "step": 11135 + }, + { + "epoch": 0.7442544094067344, + "grad_norm": 0.171875, + "learning_rate": 3.7271466176500224e-05, + "loss": 1.1346, + "step": 11140 + }, + { + "epoch": 0.7445884553714591, + "grad_norm": 0.171875, + "learning_rate": 3.718069749207559e-05, + "loss": 1.2507, + "step": 11145 + }, + { + "epoch": 0.7449225013361839, + "grad_norm": 0.1796875, + "learning_rate": 3.7090014222453794e-05, + "loss": 1.2236, + "step": 11150 + }, + { + "epoch": 0.7452565473009086, + "grad_norm": 0.1728515625, + "learning_rate": 3.699941649093599e-05, + "loss": 1.2067, + "step": 11155 + }, + { + "epoch": 0.7455905932656334, + "grad_norm": 0.1669921875, + "learning_rate": 3.6908904420707e-05, + "loss": 1.2222, + "step": 11160 + }, + { + "epoch": 0.7459246392303581, + "grad_norm": 0.1689453125, + "learning_rate": 3.6818478134835285e-05, + "loss": 1.1647, + "step": 11165 + }, + { + "epoch": 0.7462586851950829, + "grad_norm": 0.177734375, + "learning_rate": 3.672813775627259e-05, + "loss": 1.2522, + "step": 11170 + }, + { + "epoch": 0.7465927311598076, + "grad_norm": 0.1572265625, + "learning_rate": 3.663788340785379e-05, + "loss": 1.2151, + "step": 11175 + }, + { + "epoch": 0.7469267771245324, + "grad_norm": 0.166015625, + "learning_rate": 3.6547715212296906e-05, + "loss": 1.1831, + "step": 11180 + }, + { + "epoch": 0.7472608230892571, + "grad_norm": 0.1669921875, + "learning_rate": 3.645763329220281e-05, + "loss": 1.2159, + "step": 11185 + }, + { + "epoch": 0.7475948690539819, + "grad_norm": 0.1767578125, + "learning_rate": 3.636763777005499e-05, + "loss": 1.2279, + "step": 11190 + }, + { + "epoch": 0.7479289150187066, + "grad_norm": 0.17578125, + "learning_rate": 3.627772876821944e-05, + "loss": 1.1907, + "step": 11195 + }, + { + "epoch": 0.7482629609834314, + "grad_norm": 0.1708984375, + "learning_rate": 3.618790640894465e-05, + "loss": 1.1535, + "step": 11200 + }, + { + "epoch": 0.7485970069481561, + "grad_norm": 0.1689453125, + "learning_rate": 3.609817081436119e-05, + "loss": 1.1594, + "step": 11205 + }, + { + "epoch": 0.7489310529128808, + "grad_norm": 0.1767578125, + "learning_rate": 3.600852210648164e-05, + "loss": 1.2243, + "step": 11210 + }, + { + "epoch": 0.7492650988776055, + "grad_norm": 0.205078125, + "learning_rate": 3.591896040720054e-05, + "loss": 1.2602, + "step": 11215 + }, + { + "epoch": 0.7495991448423303, + "grad_norm": 0.171875, + "learning_rate": 3.5829485838294093e-05, + "loss": 1.2086, + "step": 11220 + }, + { + "epoch": 0.749933190807055, + "grad_norm": 0.1669921875, + "learning_rate": 3.5740098521419985e-05, + "loss": 1.2441, + "step": 11225 + }, + { + "epoch": 0.7502672367717798, + "grad_norm": 0.18359375, + "learning_rate": 3.565079857811728e-05, + "loss": 1.1788, + "step": 11230 + }, + { + "epoch": 0.7506012827365045, + "grad_norm": 0.1611328125, + "learning_rate": 3.556158612980624e-05, + "loss": 1.1962, + "step": 11235 + }, + { + "epoch": 0.7509353287012293, + "grad_norm": 0.1689453125, + "learning_rate": 3.5472461297788185e-05, + "loss": 1.2109, + "step": 11240 + }, + { + "epoch": 0.751269374665954, + "grad_norm": 0.1748046875, + "learning_rate": 3.538342420324534e-05, + "loss": 1.2655, + "step": 11245 + }, + { + "epoch": 0.7516034206306788, + "grad_norm": 0.1708984375, + "learning_rate": 3.529447496724053e-05, + "loss": 1.1728, + "step": 11250 + }, + { + "epoch": 0.7519374665954035, + "grad_norm": 0.1630859375, + "learning_rate": 3.5205613710717234e-05, + "loss": 1.121, + "step": 11255 + }, + { + "epoch": 0.7522715125601283, + "grad_norm": 0.18359375, + "learning_rate": 3.511684055449922e-05, + "loss": 1.1685, + "step": 11260 + }, + { + "epoch": 0.752605558524853, + "grad_norm": 0.1689453125, + "learning_rate": 3.5028155619290495e-05, + "loss": 1.162, + "step": 11265 + }, + { + "epoch": 0.7529396044895778, + "grad_norm": 0.1904296875, + "learning_rate": 3.493955902567505e-05, + "loss": 1.2662, + "step": 11270 + }, + { + "epoch": 0.7532736504543025, + "grad_norm": 0.16796875, + "learning_rate": 3.4851050894116946e-05, + "loss": 1.1836, + "step": 11275 + }, + { + "epoch": 0.7536076964190273, + "grad_norm": 0.1669921875, + "learning_rate": 3.476263134495978e-05, + "loss": 1.18, + "step": 11280 + }, + { + "epoch": 0.753941742383752, + "grad_norm": 0.162109375, + "learning_rate": 3.467430049842678e-05, + "loss": 1.1368, + "step": 11285 + }, + { + "epoch": 0.7542757883484768, + "grad_norm": 0.171875, + "learning_rate": 3.4586058474620495e-05, + "loss": 1.2558, + "step": 11290 + }, + { + "epoch": 0.7546098343132015, + "grad_norm": 0.1708984375, + "learning_rate": 3.4497905393522835e-05, + "loss": 1.1602, + "step": 11295 + }, + { + "epoch": 0.7549438802779262, + "grad_norm": 0.1708984375, + "learning_rate": 3.4409841374994634e-05, + "loss": 1.264, + "step": 11300 + }, + { + "epoch": 0.755277926242651, + "grad_norm": 0.1806640625, + "learning_rate": 3.432186653877575e-05, + "loss": 1.2065, + "step": 11305 + }, + { + "epoch": 0.7556119722073757, + "grad_norm": 0.1650390625, + "learning_rate": 3.423398100448466e-05, + "loss": 1.2051, + "step": 11310 + }, + { + "epoch": 0.7559460181721005, + "grad_norm": 0.166015625, + "learning_rate": 3.414618489161856e-05, + "loss": 1.112, + "step": 11315 + }, + { + "epoch": 0.7562800641368252, + "grad_norm": 0.1826171875, + "learning_rate": 3.4058478319552936e-05, + "loss": 1.2844, + "step": 11320 + }, + { + "epoch": 0.75661411010155, + "grad_norm": 0.1767578125, + "learning_rate": 3.397086140754153e-05, + "loss": 1.2591, + "step": 11325 + }, + { + "epoch": 0.7569481560662747, + "grad_norm": 0.1728515625, + "learning_rate": 3.388333427471627e-05, + "loss": 1.1908, + "step": 11330 + }, + { + "epoch": 0.7572822020309995, + "grad_norm": 0.1708984375, + "learning_rate": 3.3795897040087e-05, + "loss": 1.21, + "step": 11335 + }, + { + "epoch": 0.7576162479957242, + "grad_norm": 0.1669921875, + "learning_rate": 3.3708549822541225e-05, + "loss": 1.1525, + "step": 11340 + }, + { + "epoch": 0.757950293960449, + "grad_norm": 0.1748046875, + "learning_rate": 3.36212927408441e-05, + "loss": 1.2159, + "step": 11345 + }, + { + "epoch": 0.7582843399251737, + "grad_norm": 0.166015625, + "learning_rate": 3.3534125913638316e-05, + "loss": 1.2143, + "step": 11350 + }, + { + "epoch": 0.7586183858898985, + "grad_norm": 0.1630859375, + "learning_rate": 3.344704945944372e-05, + "loss": 1.2032, + "step": 11355 + }, + { + "epoch": 0.7589524318546232, + "grad_norm": 0.169921875, + "learning_rate": 3.336006349665731e-05, + "loss": 1.1575, + "step": 11360 + }, + { + "epoch": 0.759286477819348, + "grad_norm": 0.1708984375, + "learning_rate": 3.32731681435531e-05, + "loss": 1.1845, + "step": 11365 + }, + { + "epoch": 0.7596205237840727, + "grad_norm": 0.1748046875, + "learning_rate": 3.3186363518281907e-05, + "loss": 1.1324, + "step": 11370 + }, + { + "epoch": 0.7599545697487974, + "grad_norm": 0.17578125, + "learning_rate": 3.30996497388711e-05, + "loss": 1.2672, + "step": 11375 + }, + { + "epoch": 0.7602886157135221, + "grad_norm": 0.1787109375, + "learning_rate": 3.301302692322453e-05, + "loss": 1.2024, + "step": 11380 + }, + { + "epoch": 0.7606226616782469, + "grad_norm": 0.1806640625, + "learning_rate": 3.292649518912251e-05, + "loss": 1.2042, + "step": 11385 + }, + { + "epoch": 0.7609567076429716, + "grad_norm": 0.1796875, + "learning_rate": 3.284005465422134e-05, + "loss": 1.1876, + "step": 11390 + }, + { + "epoch": 0.7612907536076964, + "grad_norm": 0.1787109375, + "learning_rate": 3.275370543605337e-05, + "loss": 1.1774, + "step": 11395 + }, + { + "epoch": 0.7616247995724211, + "grad_norm": 0.19140625, + "learning_rate": 3.266744765202684e-05, + "loss": 1.1879, + "step": 11400 + }, + { + "epoch": 0.7619588455371459, + "grad_norm": 0.169921875, + "learning_rate": 3.2581281419425644e-05, + "loss": 1.1583, + "step": 11405 + }, + { + "epoch": 0.7622928915018706, + "grad_norm": 0.1728515625, + "learning_rate": 3.2495206855409165e-05, + "loss": 1.1837, + "step": 11410 + }, + { + "epoch": 0.7626269374665954, + "grad_norm": 0.1708984375, + "learning_rate": 3.2409224077012134e-05, + "loss": 1.1778, + "step": 11415 + }, + { + "epoch": 0.7629609834313201, + "grad_norm": 0.1728515625, + "learning_rate": 3.232333320114457e-05, + "loss": 1.1836, + "step": 11420 + }, + { + "epoch": 0.7632950293960449, + "grad_norm": 0.173828125, + "learning_rate": 3.223753434459139e-05, + "loss": 1.194, + "step": 11425 + }, + { + "epoch": 0.7636290753607696, + "grad_norm": 0.1640625, + "learning_rate": 3.2151827624012574e-05, + "loss": 1.2349, + "step": 11430 + }, + { + "epoch": 0.7639631213254944, + "grad_norm": 0.169921875, + "learning_rate": 3.206621315594264e-05, + "loss": 1.2218, + "step": 11435 + }, + { + "epoch": 0.7642971672902191, + "grad_norm": 0.17578125, + "learning_rate": 3.1980691056790814e-05, + "loss": 1.2077, + "step": 11440 + }, + { + "epoch": 0.7646312132549439, + "grad_norm": 0.1640625, + "learning_rate": 3.189526144284066e-05, + "loss": 1.1235, + "step": 11445 + }, + { + "epoch": 0.7649652592196686, + "grad_norm": 0.18359375, + "learning_rate": 3.180992443025001e-05, + "loss": 1.2072, + "step": 11450 + }, + { + "epoch": 0.7652993051843934, + "grad_norm": 0.1669921875, + "learning_rate": 3.17246801350507e-05, + "loss": 1.2245, + "step": 11455 + }, + { + "epoch": 0.7656333511491181, + "grad_norm": 0.1943359375, + "learning_rate": 3.163952867314871e-05, + "loss": 1.2003, + "step": 11460 + }, + { + "epoch": 0.7659673971138429, + "grad_norm": 0.1806640625, + "learning_rate": 3.155447016032361e-05, + "loss": 1.2126, + "step": 11465 + }, + { + "epoch": 0.7663014430785676, + "grad_norm": 0.166015625, + "learning_rate": 3.146950471222865e-05, + "loss": 1.2016, + "step": 11470 + }, + { + "epoch": 0.7666354890432924, + "grad_norm": 0.171875, + "learning_rate": 3.138463244439048e-05, + "loss": 1.2201, + "step": 11475 + }, + { + "epoch": 0.7669695350080171, + "grad_norm": 0.169921875, + "learning_rate": 3.1299853472209186e-05, + "loss": 1.2002, + "step": 11480 + }, + { + "epoch": 0.7673035809727419, + "grad_norm": 0.1826171875, + "learning_rate": 3.121516791095787e-05, + "loss": 1.2091, + "step": 11485 + }, + { + "epoch": 0.7676376269374666, + "grad_norm": 0.1767578125, + "learning_rate": 3.113057587578271e-05, + "loss": 1.2976, + "step": 11490 + }, + { + "epoch": 0.7679716729021914, + "grad_norm": 0.16796875, + "learning_rate": 3.1046077481702654e-05, + "loss": 1.1261, + "step": 11495 + }, + { + "epoch": 0.7683057188669161, + "grad_norm": 0.1630859375, + "learning_rate": 3.096167284360939e-05, + "loss": 1.2017, + "step": 11500 + }, + { + "epoch": 0.7686397648316409, + "grad_norm": 0.17578125, + "learning_rate": 3.087736207626709e-05, + "loss": 1.1789, + "step": 11505 + }, + { + "epoch": 0.7689738107963656, + "grad_norm": 0.1748046875, + "learning_rate": 3.0793145294312255e-05, + "loss": 1.2321, + "step": 11510 + }, + { + "epoch": 0.7693078567610904, + "grad_norm": 0.1728515625, + "learning_rate": 3.0709022612253656e-05, + "loss": 1.2757, + "step": 11515 + }, + { + "epoch": 0.7696419027258151, + "grad_norm": 0.181640625, + "learning_rate": 3.062499414447215e-05, + "loss": 1.2314, + "step": 11520 + }, + { + "epoch": 0.7699759486905399, + "grad_norm": 0.17578125, + "learning_rate": 3.054106000522039e-05, + "loss": 1.1666, + "step": 11525 + }, + { + "epoch": 0.7703099946552646, + "grad_norm": 0.16796875, + "learning_rate": 3.0457220308622782e-05, + "loss": 1.1709, + "step": 11530 + }, + { + "epoch": 0.7706440406199893, + "grad_norm": 0.1767578125, + "learning_rate": 3.0373475168675435e-05, + "loss": 1.2248, + "step": 11535 + }, + { + "epoch": 0.770978086584714, + "grad_norm": 0.1669921875, + "learning_rate": 3.0289824699245784e-05, + "loss": 1.2131, + "step": 11540 + }, + { + "epoch": 0.7713121325494388, + "grad_norm": 0.1787109375, + "learning_rate": 3.0206269014072518e-05, + "loss": 1.2146, + "step": 11545 + }, + { + "epoch": 0.7716461785141635, + "grad_norm": 0.1728515625, + "learning_rate": 3.0122808226765554e-05, + "loss": 1.1952, + "step": 11550 + }, + { + "epoch": 0.7719802244788883, + "grad_norm": 0.166015625, + "learning_rate": 3.003944245080573e-05, + "loss": 1.1605, + "step": 11555 + }, + { + "epoch": 0.772314270443613, + "grad_norm": 0.1669921875, + "learning_rate": 2.9956171799544686e-05, + "loss": 1.2247, + "step": 11560 + }, + { + "epoch": 0.7726483164083378, + "grad_norm": 0.171875, + "learning_rate": 2.9872996386204678e-05, + "loss": 1.1881, + "step": 11565 + }, + { + "epoch": 0.7729823623730625, + "grad_norm": 0.169921875, + "learning_rate": 2.9789916323878597e-05, + "loss": 1.2223, + "step": 11570 + }, + { + "epoch": 0.7733164083377873, + "grad_norm": 0.1806640625, + "learning_rate": 2.970693172552953e-05, + "loss": 1.2298, + "step": 11575 + }, + { + "epoch": 0.773650454302512, + "grad_norm": 0.1669921875, + "learning_rate": 2.9624042703990896e-05, + "loss": 1.2414, + "step": 11580 + }, + { + "epoch": 0.7739845002672368, + "grad_norm": 0.1708984375, + "learning_rate": 2.9541249371966064e-05, + "loss": 1.1649, + "step": 11585 + }, + { + "epoch": 0.7743185462319615, + "grad_norm": 0.169921875, + "learning_rate": 2.945855184202837e-05, + "loss": 1.1757, + "step": 11590 + }, + { + "epoch": 0.7746525921966863, + "grad_norm": 0.16796875, + "learning_rate": 2.937595022662083e-05, + "loss": 1.1983, + "step": 11595 + }, + { + "epoch": 0.774986638161411, + "grad_norm": 0.1767578125, + "learning_rate": 2.9293444638056045e-05, + "loss": 1.27, + "step": 11600 + }, + { + "epoch": 0.7753206841261358, + "grad_norm": 0.1689453125, + "learning_rate": 2.921103518851609e-05, + "loss": 1.2107, + "step": 11605 + }, + { + "epoch": 0.7756547300908605, + "grad_norm": 0.177734375, + "learning_rate": 2.9128721990052345e-05, + "loss": 1.167, + "step": 11610 + }, + { + "epoch": 0.7759887760555852, + "grad_norm": 0.17578125, + "learning_rate": 2.9046505154585235e-05, + "loss": 1.2401, + "step": 11615 + }, + { + "epoch": 0.77632282202031, + "grad_norm": 0.1611328125, + "learning_rate": 2.8964384793904188e-05, + "loss": 1.1416, + "step": 11620 + }, + { + "epoch": 0.7766568679850347, + "grad_norm": 0.1728515625, + "learning_rate": 2.8882361019667502e-05, + "loss": 1.2621, + "step": 11625 + }, + { + "epoch": 0.7769909139497595, + "grad_norm": 0.1845703125, + "learning_rate": 2.8800433943402115e-05, + "loss": 1.2146, + "step": 11630 + }, + { + "epoch": 0.7773249599144842, + "grad_norm": 0.1728515625, + "learning_rate": 2.8718603676503475e-05, + "loss": 1.2552, + "step": 11635 + }, + { + "epoch": 0.777659005879209, + "grad_norm": 0.203125, + "learning_rate": 2.8636870330235356e-05, + "loss": 1.2535, + "step": 11640 + }, + { + "epoch": 0.7779930518439337, + "grad_norm": 0.1728515625, + "learning_rate": 2.8555234015729904e-05, + "loss": 1.1534, + "step": 11645 + }, + { + "epoch": 0.7783270978086585, + "grad_norm": 0.201171875, + "learning_rate": 2.8473694843987198e-05, + "loss": 1.2312, + "step": 11650 + }, + { + "epoch": 0.7786611437733832, + "grad_norm": 0.1767578125, + "learning_rate": 2.839225292587525e-05, + "loss": 1.1423, + "step": 11655 + }, + { + "epoch": 0.778995189738108, + "grad_norm": 0.181640625, + "learning_rate": 2.831090837212984e-05, + "loss": 1.2034, + "step": 11660 + }, + { + "epoch": 0.7793292357028327, + "grad_norm": 0.1630859375, + "learning_rate": 2.8229661293354427e-05, + "loss": 1.1359, + "step": 11665 + }, + { + "epoch": 0.7796632816675575, + "grad_norm": 0.181640625, + "learning_rate": 2.8148511800019827e-05, + "loss": 1.2567, + "step": 11670 + }, + { + "epoch": 0.7799973276322822, + "grad_norm": 0.1728515625, + "learning_rate": 2.8067460002464252e-05, + "loss": 1.2415, + "step": 11675 + }, + { + "epoch": 0.780331373597007, + "grad_norm": 0.1689453125, + "learning_rate": 2.7986506010893088e-05, + "loss": 1.1646, + "step": 11680 + }, + { + "epoch": 0.7806654195617317, + "grad_norm": 0.1630859375, + "learning_rate": 2.7905649935378673e-05, + "loss": 1.1926, + "step": 11685 + }, + { + "epoch": 0.7809994655264565, + "grad_norm": 0.173828125, + "learning_rate": 2.7824891885860227e-05, + "loss": 1.2113, + "step": 11690 + }, + { + "epoch": 0.7813335114911811, + "grad_norm": 0.16796875, + "learning_rate": 2.7744231972143687e-05, + "loss": 1.1887, + "step": 11695 + }, + { + "epoch": 0.7816675574559059, + "grad_norm": 0.1669921875, + "learning_rate": 2.7663670303901566e-05, + "loss": 1.2085, + "step": 11700 + }, + { + "epoch": 0.7820016034206306, + "grad_norm": 0.16796875, + "learning_rate": 2.758320699067284e-05, + "loss": 1.2066, + "step": 11705 + }, + { + "epoch": 0.7823356493853554, + "grad_norm": 0.173828125, + "learning_rate": 2.7502842141862672e-05, + "loss": 1.2673, + "step": 11710 + }, + { + "epoch": 0.7826696953500801, + "grad_norm": 0.1767578125, + "learning_rate": 2.742257586674233e-05, + "loss": 1.2183, + "step": 11715 + }, + { + "epoch": 0.7830037413148049, + "grad_norm": 0.171875, + "learning_rate": 2.7342408274449184e-05, + "loss": 1.166, + "step": 11720 + }, + { + "epoch": 0.7833377872795296, + "grad_norm": 0.1630859375, + "learning_rate": 2.7262339473986286e-05, + "loss": 1.1372, + "step": 11725 + }, + { + "epoch": 0.7836718332442544, + "grad_norm": 0.1806640625, + "learning_rate": 2.71823695742224e-05, + "loss": 1.1694, + "step": 11730 + }, + { + "epoch": 0.7840058792089791, + "grad_norm": 0.181640625, + "learning_rate": 2.710249868389185e-05, + "loss": 1.219, + "step": 11735 + }, + { + "epoch": 0.7843399251737039, + "grad_norm": 0.1767578125, + "learning_rate": 2.7022726911594363e-05, + "loss": 1.2113, + "step": 11740 + }, + { + "epoch": 0.7846739711384286, + "grad_norm": 0.158203125, + "learning_rate": 2.6943054365794818e-05, + "loss": 1.1246, + "step": 11745 + }, + { + "epoch": 0.7850080171031534, + "grad_norm": 0.1923828125, + "learning_rate": 2.6863481154823168e-05, + "loss": 1.1696, + "step": 11750 + }, + { + "epoch": 0.7853420630678781, + "grad_norm": 0.173828125, + "learning_rate": 2.678400738687442e-05, + "loss": 1.1715, + "step": 11755 + }, + { + "epoch": 0.7856761090326029, + "grad_norm": 0.1728515625, + "learning_rate": 2.6704633170008232e-05, + "loss": 1.1638, + "step": 11760 + }, + { + "epoch": 0.7860101549973276, + "grad_norm": 0.169921875, + "learning_rate": 2.662535861214902e-05, + "loss": 1.2395, + "step": 11765 + }, + { + "epoch": 0.7863442009620524, + "grad_norm": 0.1865234375, + "learning_rate": 2.654618382108558e-05, + "loss": 1.2303, + "step": 11770 + }, + { + "epoch": 0.7866782469267771, + "grad_norm": 0.1611328125, + "learning_rate": 2.6467108904471184e-05, + "loss": 1.2723, + "step": 11775 + }, + { + "epoch": 0.7870122928915019, + "grad_norm": 0.1689453125, + "learning_rate": 2.6388133969823193e-05, + "loss": 1.1856, + "step": 11780 + }, + { + "epoch": 0.7873463388562266, + "grad_norm": 0.1640625, + "learning_rate": 2.6309259124523046e-05, + "loss": 1.2234, + "step": 11785 + }, + { + "epoch": 0.7876803848209514, + "grad_norm": 0.173828125, + "learning_rate": 2.6230484475816132e-05, + "loss": 1.1786, + "step": 11790 + }, + { + "epoch": 0.7880144307856761, + "grad_norm": 0.173828125, + "learning_rate": 2.6151810130811638e-05, + "loss": 1.2053, + "step": 11795 + }, + { + "epoch": 0.7883484767504009, + "grad_norm": 0.1640625, + "learning_rate": 2.6073236196482263e-05, + "loss": 1.1418, + "step": 11800 + }, + { + "epoch": 0.7886825227151256, + "grad_norm": 0.1962890625, + "learning_rate": 2.599476277966423e-05, + "loss": 1.1912, + "step": 11805 + }, + { + "epoch": 0.7890165686798504, + "grad_norm": 0.1650390625, + "learning_rate": 2.591638998705711e-05, + "loss": 1.1552, + "step": 11810 + }, + { + "epoch": 0.7893506146445751, + "grad_norm": 0.1572265625, + "learning_rate": 2.583811792522365e-05, + "loss": 1.148, + "step": 11815 + }, + { + "epoch": 0.7896846606092999, + "grad_norm": 0.1943359375, + "learning_rate": 2.5759946700589556e-05, + "loss": 1.2014, + "step": 11820 + }, + { + "epoch": 0.7900187065740246, + "grad_norm": 0.177734375, + "learning_rate": 2.568187641944354e-05, + "loss": 1.1741, + "step": 11825 + }, + { + "epoch": 0.7903527525387494, + "grad_norm": 0.1748046875, + "learning_rate": 2.5603907187937038e-05, + "loss": 1.1774, + "step": 11830 + }, + { + "epoch": 0.7906867985034741, + "grad_norm": 0.169921875, + "learning_rate": 2.5526039112084044e-05, + "loss": 1.2101, + "step": 11835 + }, + { + "epoch": 0.7910208444681989, + "grad_norm": 0.1796875, + "learning_rate": 2.5448272297761e-05, + "loss": 1.1802, + "step": 11840 + }, + { + "epoch": 0.7913548904329236, + "grad_norm": 0.1806640625, + "learning_rate": 2.5370606850706757e-05, + "loss": 1.194, + "step": 11845 + }, + { + "epoch": 0.7916889363976484, + "grad_norm": 0.1796875, + "learning_rate": 2.5293042876522245e-05, + "loss": 1.2415, + "step": 11850 + }, + { + "epoch": 0.7920229823623731, + "grad_norm": 0.1787109375, + "learning_rate": 2.521558048067042e-05, + "loss": 1.2278, + "step": 11855 + }, + { + "epoch": 0.7923570283270978, + "grad_norm": 0.1650390625, + "learning_rate": 2.5138219768476203e-05, + "loss": 1.2145, + "step": 11860 + }, + { + "epoch": 0.7926910742918225, + "grad_norm": 0.189453125, + "learning_rate": 2.5060960845126235e-05, + "loss": 1.179, + "step": 11865 + }, + { + "epoch": 0.7930251202565473, + "grad_norm": 0.1572265625, + "learning_rate": 2.4983803815668694e-05, + "loss": 1.1905, + "step": 11870 + }, + { + "epoch": 0.793359166221272, + "grad_norm": 0.19921875, + "learning_rate": 2.4906748785013267e-05, + "loss": 1.1982, + "step": 11875 + }, + { + "epoch": 0.7936932121859968, + "grad_norm": 0.1728515625, + "learning_rate": 2.4829795857930904e-05, + "loss": 1.2422, + "step": 11880 + }, + { + "epoch": 0.7940272581507215, + "grad_norm": 0.17578125, + "learning_rate": 2.4752945139053785e-05, + "loss": 1.2127, + "step": 11885 + }, + { + "epoch": 0.7943613041154463, + "grad_norm": 0.185546875, + "learning_rate": 2.4676196732875144e-05, + "loss": 1.1725, + "step": 11890 + }, + { + "epoch": 0.794695350080171, + "grad_norm": 0.1796875, + "learning_rate": 2.4599550743749e-05, + "loss": 1.2145, + "step": 11895 + }, + { + "epoch": 0.7950293960448958, + "grad_norm": 0.1689453125, + "learning_rate": 2.4523007275890152e-05, + "loss": 1.1748, + "step": 11900 + }, + { + "epoch": 0.7953634420096205, + "grad_norm": 0.171875, + "learning_rate": 2.4446566433374065e-05, + "loss": 1.2528, + "step": 11905 + }, + { + "epoch": 0.7956974879743453, + "grad_norm": 0.1689453125, + "learning_rate": 2.4370228320136613e-05, + "loss": 1.2329, + "step": 11910 + }, + { + "epoch": 0.79603153393907, + "grad_norm": 0.1767578125, + "learning_rate": 2.429399303997394e-05, + "loss": 1.2073, + "step": 11915 + }, + { + "epoch": 0.7963655799037948, + "grad_norm": 0.18359375, + "learning_rate": 2.4217860696542482e-05, + "loss": 1.1722, + "step": 11920 + }, + { + "epoch": 0.7966996258685195, + "grad_norm": 0.173828125, + "learning_rate": 2.414183139335866e-05, + "loss": 1.24, + "step": 11925 + }, + { + "epoch": 0.7970336718332443, + "grad_norm": 0.169921875, + "learning_rate": 2.406590523379877e-05, + "loss": 1.1457, + "step": 11930 + }, + { + "epoch": 0.797367717797969, + "grad_norm": 0.171875, + "learning_rate": 2.399008232109885e-05, + "loss": 1.2234, + "step": 11935 + }, + { + "epoch": 0.7977017637626937, + "grad_norm": 0.1767578125, + "learning_rate": 2.3914362758354658e-05, + "loss": 1.2298, + "step": 11940 + }, + { + "epoch": 0.7980358097274185, + "grad_norm": 0.1796875, + "learning_rate": 2.383874664852127e-05, + "loss": 1.1725, + "step": 11945 + }, + { + "epoch": 0.7983698556921432, + "grad_norm": 0.169921875, + "learning_rate": 2.3763234094413277e-05, + "loss": 1.171, + "step": 11950 + }, + { + "epoch": 0.798703901656868, + "grad_norm": 0.169921875, + "learning_rate": 2.3687825198704296e-05, + "loss": 1.1286, + "step": 11955 + }, + { + "epoch": 0.7990379476215927, + "grad_norm": 0.1669921875, + "learning_rate": 2.3612520063927145e-05, + "loss": 1.22, + "step": 11960 + }, + { + "epoch": 0.7993719935863175, + "grad_norm": 0.1669921875, + "learning_rate": 2.353731879247345e-05, + "loss": 1.1227, + "step": 11965 + }, + { + "epoch": 0.7997060395510422, + "grad_norm": 0.19921875, + "learning_rate": 2.346222148659365e-05, + "loss": 1.1955, + "step": 11970 + }, + { + "epoch": 0.800040085515767, + "grad_norm": 0.1630859375, + "learning_rate": 2.3387228248396842e-05, + "loss": 1.1408, + "step": 11975 + }, + { + "epoch": 0.8003741314804917, + "grad_norm": 0.177734375, + "learning_rate": 2.3312339179850652e-05, + "loss": 1.1877, + "step": 11980 + }, + { + "epoch": 0.8007081774452165, + "grad_norm": 0.1708984375, + "learning_rate": 2.3237554382781002e-05, + "loss": 1.1573, + "step": 11985 + }, + { + "epoch": 0.8010422234099412, + "grad_norm": 0.1787109375, + "learning_rate": 2.316287395887202e-05, + "loss": 1.1767, + "step": 11990 + }, + { + "epoch": 0.801376269374666, + "grad_norm": 0.171875, + "learning_rate": 2.3088298009666033e-05, + "loss": 1.3085, + "step": 11995 + }, + { + "epoch": 0.8017103153393907, + "grad_norm": 0.16015625, + "learning_rate": 2.3013826636563198e-05, + "loss": 1.1452, + "step": 12000 + }, + { + "epoch": 0.8020443613041155, + "grad_norm": 0.1865234375, + "learning_rate": 2.2939459940821518e-05, + "loss": 1.2353, + "step": 12005 + }, + { + "epoch": 0.8023784072688402, + "grad_norm": 0.181640625, + "learning_rate": 2.2865198023556698e-05, + "loss": 1.2847, + "step": 12010 + }, + { + "epoch": 0.802712453233565, + "grad_norm": 0.1728515625, + "learning_rate": 2.2791040985741974e-05, + "loss": 1.2256, + "step": 12015 + }, + { + "epoch": 0.8030464991982896, + "grad_norm": 0.185546875, + "learning_rate": 2.271698892820794e-05, + "loss": 1.1917, + "step": 12020 + }, + { + "epoch": 0.8033805451630144, + "grad_norm": 0.1826171875, + "learning_rate": 2.264304195164243e-05, + "loss": 1.2397, + "step": 12025 + }, + { + "epoch": 0.8037145911277391, + "grad_norm": 0.1708984375, + "learning_rate": 2.2569200156590507e-05, + "loss": 1.1771, + "step": 12030 + }, + { + "epoch": 0.8040486370924639, + "grad_norm": 0.177734375, + "learning_rate": 2.2495463643454085e-05, + "loss": 1.1452, + "step": 12035 + }, + { + "epoch": 0.8043826830571886, + "grad_norm": 0.1630859375, + "learning_rate": 2.2421832512492057e-05, + "loss": 1.2384, + "step": 12040 + }, + { + "epoch": 0.8047167290219134, + "grad_norm": 0.169921875, + "learning_rate": 2.23483068638199e-05, + "loss": 1.1587, + "step": 12045 + }, + { + "epoch": 0.8050507749866381, + "grad_norm": 0.1796875, + "learning_rate": 2.22748867974098e-05, + "loss": 1.2088, + "step": 12050 + }, + { + "epoch": 0.8053848209513629, + "grad_norm": 0.1748046875, + "learning_rate": 2.220157241309028e-05, + "loss": 1.2517, + "step": 12055 + }, + { + "epoch": 0.8057188669160876, + "grad_norm": 0.162109375, + "learning_rate": 2.2128363810546205e-05, + "loss": 1.0946, + "step": 12060 + }, + { + "epoch": 0.8060529128808124, + "grad_norm": 0.1669921875, + "learning_rate": 2.205526108931857e-05, + "loss": 1.1852, + "step": 12065 + }, + { + "epoch": 0.8063869588455371, + "grad_norm": 0.1640625, + "learning_rate": 2.1982264348804525e-05, + "loss": 1.1549, + "step": 12070 + }, + { + "epoch": 0.8067210048102619, + "grad_norm": 0.1748046875, + "learning_rate": 2.1909373688257008e-05, + "loss": 1.2467, + "step": 12075 + }, + { + "epoch": 0.8070550507749866, + "grad_norm": 0.173828125, + "learning_rate": 2.183658920678474e-05, + "loss": 1.1084, + "step": 12080 + }, + { + "epoch": 0.8073890967397114, + "grad_norm": 0.173828125, + "learning_rate": 2.1763911003352055e-05, + "loss": 1.2701, + "step": 12085 + }, + { + "epoch": 0.8077231427044361, + "grad_norm": 0.171875, + "learning_rate": 2.1691339176778856e-05, + "loss": 1.1389, + "step": 12090 + }, + { + "epoch": 0.8080571886691609, + "grad_norm": 0.1650390625, + "learning_rate": 2.161887382574035e-05, + "loss": 1.2096, + "step": 12095 + }, + { + "epoch": 0.8083912346338856, + "grad_norm": 0.1728515625, + "learning_rate": 2.1546515048766914e-05, + "loss": 1.2264, + "step": 12100 + }, + { + "epoch": 0.8087252805986104, + "grad_norm": 0.1669921875, + "learning_rate": 2.1474262944244196e-05, + "loss": 1.1749, + "step": 12105 + }, + { + "epoch": 0.8090593265633351, + "grad_norm": 0.16796875, + "learning_rate": 2.140211761041262e-05, + "loss": 1.2378, + "step": 12110 + }, + { + "epoch": 0.8093933725280599, + "grad_norm": 0.1669921875, + "learning_rate": 2.133007914536753e-05, + "loss": 1.1184, + "step": 12115 + }, + { + "epoch": 0.8097274184927846, + "grad_norm": 0.171875, + "learning_rate": 2.125814764705889e-05, + "loss": 1.1524, + "step": 12120 + }, + { + "epoch": 0.8100614644575094, + "grad_norm": 0.171875, + "learning_rate": 2.1186323213291316e-05, + "loss": 1.2009, + "step": 12125 + }, + { + "epoch": 0.8103955104222341, + "grad_norm": 0.1962890625, + "learning_rate": 2.1114605941723777e-05, + "loss": 1.1628, + "step": 12130 + }, + { + "epoch": 0.8107295563869589, + "grad_norm": 0.1650390625, + "learning_rate": 2.104299592986958e-05, + "loss": 1.2373, + "step": 12135 + }, + { + "epoch": 0.8110636023516836, + "grad_norm": 0.1748046875, + "learning_rate": 2.0971493275096133e-05, + "loss": 1.2578, + "step": 12140 + }, + { + "epoch": 0.8113976483164084, + "grad_norm": 0.1767578125, + "learning_rate": 2.0900098074624952e-05, + "loss": 1.1665, + "step": 12145 + }, + { + "epoch": 0.8117316942811331, + "grad_norm": 0.1845703125, + "learning_rate": 2.08288104255314e-05, + "loss": 1.1408, + "step": 12150 + }, + { + "epoch": 0.8120657402458579, + "grad_norm": 0.1650390625, + "learning_rate": 2.0757630424744568e-05, + "loss": 1.1506, + "step": 12155 + }, + { + "epoch": 0.8123997862105826, + "grad_norm": 0.173828125, + "learning_rate": 2.0686558169047256e-05, + "loss": 1.2123, + "step": 12160 + }, + { + "epoch": 0.8127338321753074, + "grad_norm": 0.1708984375, + "learning_rate": 2.0615593755075734e-05, + "loss": 1.1628, + "step": 12165 + }, + { + "epoch": 0.8130678781400321, + "grad_norm": 0.1708984375, + "learning_rate": 2.0544737279319636e-05, + "loss": 1.2352, + "step": 12170 + }, + { + "epoch": 0.8134019241047569, + "grad_norm": 0.1572265625, + "learning_rate": 2.0473988838121783e-05, + "loss": 1.1936, + "step": 12175 + }, + { + "epoch": 0.8137359700694815, + "grad_norm": 0.1669921875, + "learning_rate": 2.0403348527678222e-05, + "loss": 1.1521, + "step": 12180 + }, + { + "epoch": 0.8140700160342063, + "grad_norm": 0.1767578125, + "learning_rate": 2.0332816444037873e-05, + "loss": 1.2135, + "step": 12185 + }, + { + "epoch": 0.814404061998931, + "grad_norm": 0.1796875, + "learning_rate": 2.0262392683102493e-05, + "loss": 1.2112, + "step": 12190 + }, + { + "epoch": 0.8147381079636558, + "grad_norm": 0.1708984375, + "learning_rate": 2.0192077340626636e-05, + "loss": 1.1762, + "step": 12195 + }, + { + "epoch": 0.8150721539283805, + "grad_norm": 0.1708984375, + "learning_rate": 2.012187051221742e-05, + "loss": 1.1734, + "step": 12200 + }, + { + "epoch": 0.8154061998931053, + "grad_norm": 0.181640625, + "learning_rate": 2.005177229333437e-05, + "loss": 1.1807, + "step": 12205 + }, + { + "epoch": 0.81574024585783, + "grad_norm": 0.1650390625, + "learning_rate": 1.998178277928934e-05, + "loss": 1.1999, + "step": 12210 + }, + { + "epoch": 0.8160742918225548, + "grad_norm": 0.1904296875, + "learning_rate": 1.9911902065246447e-05, + "loss": 1.1723, + "step": 12215 + }, + { + "epoch": 0.8164083377872795, + "grad_norm": 0.162109375, + "learning_rate": 1.9842130246221768e-05, + "loss": 1.1958, + "step": 12220 + }, + { + "epoch": 0.8167423837520043, + "grad_norm": 0.16796875, + "learning_rate": 1.977246741708344e-05, + "loss": 1.2349, + "step": 12225 + }, + { + "epoch": 0.817076429716729, + "grad_norm": 0.1689453125, + "learning_rate": 1.9702913672551292e-05, + "loss": 1.1717, + "step": 12230 + }, + { + "epoch": 0.8174104756814538, + "grad_norm": 0.169921875, + "learning_rate": 1.9633469107196932e-05, + "loss": 1.2384, + "step": 12235 + }, + { + "epoch": 0.8177445216461785, + "grad_norm": 0.1767578125, + "learning_rate": 1.956413381544344e-05, + "loss": 1.1851, + "step": 12240 + }, + { + "epoch": 0.8180785676109033, + "grad_norm": 0.173828125, + "learning_rate": 1.9494907891565316e-05, + "loss": 1.2535, + "step": 12245 + }, + { + "epoch": 0.818412613575628, + "grad_norm": 0.1611328125, + "learning_rate": 1.942579142968842e-05, + "loss": 1.1402, + "step": 12250 + }, + { + "epoch": 0.8187466595403528, + "grad_norm": 0.1875, + "learning_rate": 1.9356784523789772e-05, + "loss": 1.2167, + "step": 12255 + }, + { + "epoch": 0.8190807055050775, + "grad_norm": 0.1728515625, + "learning_rate": 1.928788726769737e-05, + "loss": 1.1381, + "step": 12260 + }, + { + "epoch": 0.8194147514698022, + "grad_norm": 0.1669921875, + "learning_rate": 1.9219099755090107e-05, + "loss": 1.1904, + "step": 12265 + }, + { + "epoch": 0.819748797434527, + "grad_norm": 0.173828125, + "learning_rate": 1.915042207949779e-05, + "loss": 1.2316, + "step": 12270 + }, + { + "epoch": 0.8200828433992517, + "grad_norm": 0.1650390625, + "learning_rate": 1.908185433430074e-05, + "loss": 1.1822, + "step": 12275 + }, + { + "epoch": 0.8204168893639765, + "grad_norm": 0.19921875, + "learning_rate": 1.901339661272985e-05, + "loss": 1.2357, + "step": 12280 + }, + { + "epoch": 0.8207509353287012, + "grad_norm": 0.19140625, + "learning_rate": 1.8945049007866446e-05, + "loss": 1.1806, + "step": 12285 + }, + { + "epoch": 0.821084981293426, + "grad_norm": 0.1689453125, + "learning_rate": 1.887681161264214e-05, + "loss": 1.1728, + "step": 12290 + }, + { + "epoch": 0.8214190272581507, + "grad_norm": 0.1923828125, + "learning_rate": 1.880868451983865e-05, + "loss": 1.2062, + "step": 12295 + }, + { + "epoch": 0.8217530732228755, + "grad_norm": 0.166015625, + "learning_rate": 1.874066782208771e-05, + "loss": 1.1982, + "step": 12300 + }, + { + "epoch": 0.8220871191876002, + "grad_norm": 0.1748046875, + "learning_rate": 1.8672761611870958e-05, + "loss": 1.2017, + "step": 12305 + }, + { + "epoch": 0.822421165152325, + "grad_norm": 0.1708984375, + "learning_rate": 1.8604965981519827e-05, + "loss": 1.1544, + "step": 12310 + }, + { + "epoch": 0.8227552111170497, + "grad_norm": 0.1845703125, + "learning_rate": 1.8537281023215436e-05, + "loss": 1.0842, + "step": 12315 + }, + { + "epoch": 0.8230892570817745, + "grad_norm": 0.1943359375, + "learning_rate": 1.846970682898833e-05, + "loss": 1.2518, + "step": 12320 + }, + { + "epoch": 0.8234233030464992, + "grad_norm": 0.1689453125, + "learning_rate": 1.8402243490718474e-05, + "loss": 1.2632, + "step": 12325 + }, + { + "epoch": 0.823757349011224, + "grad_norm": 0.1787109375, + "learning_rate": 1.8334891100135166e-05, + "loss": 1.2021, + "step": 12330 + }, + { + "epoch": 0.8240913949759487, + "grad_norm": 0.1640625, + "learning_rate": 1.8267649748816772e-05, + "loss": 1.1112, + "step": 12335 + }, + { + "epoch": 0.8244254409406734, + "grad_norm": 0.16015625, + "learning_rate": 1.820051952819072e-05, + "loss": 1.2062, + "step": 12340 + }, + { + "epoch": 0.8247594869053981, + "grad_norm": 0.177734375, + "learning_rate": 1.8133500529533308e-05, + "loss": 1.2504, + "step": 12345 + }, + { + "epoch": 0.8250935328701229, + "grad_norm": 0.1728515625, + "learning_rate": 1.806659284396969e-05, + "loss": 1.2482, + "step": 12350 + }, + { + "epoch": 0.8254275788348476, + "grad_norm": 0.1708984375, + "learning_rate": 1.799979656247355e-05, + "loss": 1.1268, + "step": 12355 + }, + { + "epoch": 0.8257616247995724, + "grad_norm": 0.1650390625, + "learning_rate": 1.793311177586714e-05, + "loss": 1.1863, + "step": 12360 + }, + { + "epoch": 0.8260956707642971, + "grad_norm": 0.1904296875, + "learning_rate": 1.786653857482118e-05, + "loss": 1.2579, + "step": 12365 + }, + { + "epoch": 0.8264297167290219, + "grad_norm": 0.173828125, + "learning_rate": 1.780007704985457e-05, + "loss": 1.1932, + "step": 12370 + }, + { + "epoch": 0.8267637626937466, + "grad_norm": 0.1669921875, + "learning_rate": 1.77337272913344e-05, + "loss": 1.2517, + "step": 12375 + }, + { + "epoch": 0.8270978086584714, + "grad_norm": 0.1806640625, + "learning_rate": 1.766748938947581e-05, + "loss": 1.2773, + "step": 12380 + }, + { + "epoch": 0.8274318546231961, + "grad_norm": 0.1630859375, + "learning_rate": 1.760136343434188e-05, + "loss": 1.1525, + "step": 12385 + }, + { + "epoch": 0.8277659005879209, + "grad_norm": 0.169921875, + "learning_rate": 1.7535349515843392e-05, + "loss": 1.1876, + "step": 12390 + }, + { + "epoch": 0.8280999465526456, + "grad_norm": 0.169921875, + "learning_rate": 1.746944772373883e-05, + "loss": 1.1088, + "step": 12395 + }, + { + "epoch": 0.8284339925173704, + "grad_norm": 0.19140625, + "learning_rate": 1.740365814763427e-05, + "loss": 1.2222, + "step": 12400 + }, + { + "epoch": 0.8287680384820951, + "grad_norm": 0.173828125, + "learning_rate": 1.733798087698313e-05, + "loss": 1.2101, + "step": 12405 + }, + { + "epoch": 0.8291020844468199, + "grad_norm": 0.1669921875, + "learning_rate": 1.727241600108619e-05, + "loss": 1.2127, + "step": 12410 + }, + { + "epoch": 0.8294361304115446, + "grad_norm": 0.1689453125, + "learning_rate": 1.7206963609091352e-05, + "loss": 1.1727, + "step": 12415 + }, + { + "epoch": 0.8297701763762694, + "grad_norm": 0.1669921875, + "learning_rate": 1.7141623789993655e-05, + "loss": 1.1811, + "step": 12420 + }, + { + "epoch": 0.8301042223409941, + "grad_norm": 0.181640625, + "learning_rate": 1.7076396632634994e-05, + "loss": 1.179, + "step": 12425 + }, + { + "epoch": 0.8304382683057189, + "grad_norm": 0.166015625, + "learning_rate": 1.7011282225704074e-05, + "loss": 1.1736, + "step": 12430 + }, + { + "epoch": 0.8307723142704436, + "grad_norm": 0.16015625, + "learning_rate": 1.694628065773638e-05, + "loss": 1.1563, + "step": 12435 + }, + { + "epoch": 0.8311063602351684, + "grad_norm": 0.162109375, + "learning_rate": 1.6881392017113917e-05, + "loss": 1.2117, + "step": 12440 + }, + { + "epoch": 0.8314404061998931, + "grad_norm": 0.173828125, + "learning_rate": 1.6816616392065142e-05, + "loss": 1.1943, + "step": 12445 + }, + { + "epoch": 0.8317744521646179, + "grad_norm": 0.171875, + "learning_rate": 1.6751953870664817e-05, + "loss": 1.2046, + "step": 12450 + }, + { + "epoch": 0.8321084981293426, + "grad_norm": 0.1611328125, + "learning_rate": 1.6687404540833996e-05, + "loss": 1.1596, + "step": 12455 + }, + { + "epoch": 0.8324425440940674, + "grad_norm": 0.1728515625, + "learning_rate": 1.6622968490339773e-05, + "loss": 1.265, + "step": 12460 + }, + { + "epoch": 0.8327765900587921, + "grad_norm": 0.1630859375, + "learning_rate": 1.6558645806795193e-05, + "loss": 1.2546, + "step": 12465 + }, + { + "epoch": 0.8331106360235169, + "grad_norm": 0.162109375, + "learning_rate": 1.6494436577659222e-05, + "loss": 1.1452, + "step": 12470 + }, + { + "epoch": 0.8334446819882416, + "grad_norm": 0.1767578125, + "learning_rate": 1.643034089023655e-05, + "loss": 1.2115, + "step": 12475 + }, + { + "epoch": 0.8337787279529664, + "grad_norm": 0.169921875, + "learning_rate": 1.6366358831677454e-05, + "loss": 1.156, + "step": 12480 + }, + { + "epoch": 0.8341127739176911, + "grad_norm": 0.171875, + "learning_rate": 1.6302490488977705e-05, + "loss": 1.1572, + "step": 12485 + }, + { + "epoch": 0.8344468198824159, + "grad_norm": 0.1728515625, + "learning_rate": 1.623873594897848e-05, + "loss": 1.1823, + "step": 12490 + }, + { + "epoch": 0.8347808658471406, + "grad_norm": 0.1865234375, + "learning_rate": 1.6175095298366217e-05, + "loss": 1.1672, + "step": 12495 + }, + { + "epoch": 0.8351149118118654, + "grad_norm": 0.1689453125, + "learning_rate": 1.6111568623672533e-05, + "loss": 1.1418, + "step": 12500 + }, + { + "epoch": 0.83544895777659, + "grad_norm": 0.173828125, + "learning_rate": 1.6048156011274018e-05, + "loss": 1.2192, + "step": 12505 + }, + { + "epoch": 0.8357830037413148, + "grad_norm": 0.1708984375, + "learning_rate": 1.598485754739215e-05, + "loss": 1.2165, + "step": 12510 + }, + { + "epoch": 0.8361170497060395, + "grad_norm": 0.1669921875, + "learning_rate": 1.59216733180933e-05, + "loss": 1.2849, + "step": 12515 + }, + { + "epoch": 0.8364510956707643, + "grad_norm": 0.181640625, + "learning_rate": 1.585860340928844e-05, + "loss": 1.2269, + "step": 12520 + }, + { + "epoch": 0.836785141635489, + "grad_norm": 0.1669921875, + "learning_rate": 1.579564790673308e-05, + "loss": 1.2391, + "step": 12525 + }, + { + "epoch": 0.8371191876002138, + "grad_norm": 0.1669921875, + "learning_rate": 1.5732806896027287e-05, + "loss": 1.1784, + "step": 12530 + }, + { + "epoch": 0.8374532335649385, + "grad_norm": 0.1708984375, + "learning_rate": 1.5670080462615345e-05, + "loss": 1.1549, + "step": 12535 + }, + { + "epoch": 0.8377872795296633, + "grad_norm": 0.171875, + "learning_rate": 1.5607468691785776e-05, + "loss": 1.1978, + "step": 12540 + }, + { + "epoch": 0.838121325494388, + "grad_norm": 0.181640625, + "learning_rate": 1.554497166867118e-05, + "loss": 1.2579, + "step": 12545 + }, + { + "epoch": 0.8384553714591128, + "grad_norm": 0.166015625, + "learning_rate": 1.5482589478248222e-05, + "loss": 1.2783, + "step": 12550 + }, + { + "epoch": 0.8387894174238375, + "grad_norm": 0.162109375, + "learning_rate": 1.5420322205337333e-05, + "loss": 1.2231, + "step": 12555 + }, + { + "epoch": 0.8391234633885623, + "grad_norm": 0.169921875, + "learning_rate": 1.5358169934602706e-05, + "loss": 1.1347, + "step": 12560 + }, + { + "epoch": 0.839457509353287, + "grad_norm": 0.1826171875, + "learning_rate": 1.5296132750552207e-05, + "loss": 1.1165, + "step": 12565 + }, + { + "epoch": 0.8397915553180118, + "grad_norm": 0.1826171875, + "learning_rate": 1.5234210737537225e-05, + "loss": 1.2383, + "step": 12570 + }, + { + "epoch": 0.8401256012827365, + "grad_norm": 0.1689453125, + "learning_rate": 1.5172403979752492e-05, + "loss": 1.1735, + "step": 12575 + }, + { + "epoch": 0.8404596472474613, + "grad_norm": 0.1787109375, + "learning_rate": 1.5110712561236062e-05, + "loss": 1.2072, + "step": 12580 + }, + { + "epoch": 0.840793693212186, + "grad_norm": 0.1708984375, + "learning_rate": 1.5049136565869205e-05, + "loss": 1.1671, + "step": 12585 + }, + { + "epoch": 0.8411277391769107, + "grad_norm": 0.1572265625, + "learning_rate": 1.4987676077376156e-05, + "loss": 1.2047, + "step": 12590 + }, + { + "epoch": 0.8414617851416355, + "grad_norm": 0.1669921875, + "learning_rate": 1.4926331179324205e-05, + "loss": 1.1631, + "step": 12595 + }, + { + "epoch": 0.8417958311063602, + "grad_norm": 0.16796875, + "learning_rate": 1.4865101955123362e-05, + "loss": 1.1725, + "step": 12600 + }, + { + "epoch": 0.842129877071085, + "grad_norm": 0.17578125, + "learning_rate": 1.4803988488026487e-05, + "loss": 1.2299, + "step": 12605 + }, + { + "epoch": 0.8424639230358097, + "grad_norm": 0.1640625, + "learning_rate": 1.4742990861128924e-05, + "loss": 1.1771, + "step": 12610 + }, + { + "epoch": 0.8427979690005345, + "grad_norm": 0.1728515625, + "learning_rate": 1.4682109157368561e-05, + "loss": 1.2413, + "step": 12615 + }, + { + "epoch": 0.8431320149652592, + "grad_norm": 0.1796875, + "learning_rate": 1.4621343459525671e-05, + "loss": 1.1604, + "step": 12620 + }, + { + "epoch": 0.843466060929984, + "grad_norm": 0.169921875, + "learning_rate": 1.4560693850222828e-05, + "loss": 1.1893, + "step": 12625 + }, + { + "epoch": 0.8438001068947087, + "grad_norm": 0.17578125, + "learning_rate": 1.450016041192469e-05, + "loss": 1.1732, + "step": 12630 + }, + { + "epoch": 0.8441341528594335, + "grad_norm": 0.1767578125, + "learning_rate": 1.4439743226937975e-05, + "loss": 1.1983, + "step": 12635 + }, + { + "epoch": 0.8444681988241582, + "grad_norm": 0.1748046875, + "learning_rate": 1.437944237741139e-05, + "loss": 1.2534, + "step": 12640 + }, + { + "epoch": 0.844802244788883, + "grad_norm": 0.171875, + "learning_rate": 1.4319257945335408e-05, + "loss": 1.2628, + "step": 12645 + }, + { + "epoch": 0.8451362907536077, + "grad_norm": 0.1767578125, + "learning_rate": 1.425919001254219e-05, + "loss": 1.1942, + "step": 12650 + }, + { + "epoch": 0.8454703367183325, + "grad_norm": 0.158203125, + "learning_rate": 1.419923866070556e-05, + "loss": 1.1969, + "step": 12655 + }, + { + "epoch": 0.8458043826830572, + "grad_norm": 0.177734375, + "learning_rate": 1.4139403971340815e-05, + "loss": 1.2407, + "step": 12660 + }, + { + "epoch": 0.8461384286477819, + "grad_norm": 0.18359375, + "learning_rate": 1.4079686025804584e-05, + "loss": 1.2333, + "step": 12665 + }, + { + "epoch": 0.8464724746125066, + "grad_norm": 0.185546875, + "learning_rate": 1.4020084905294761e-05, + "loss": 1.2168, + "step": 12670 + }, + { + "epoch": 0.8468065205772314, + "grad_norm": 0.162109375, + "learning_rate": 1.3960600690850466e-05, + "loss": 1.2205, + "step": 12675 + }, + { + "epoch": 0.8471405665419561, + "grad_norm": 0.169921875, + "learning_rate": 1.3901233463351771e-05, + "loss": 1.1949, + "step": 12680 + }, + { + "epoch": 0.8474746125066809, + "grad_norm": 0.16015625, + "learning_rate": 1.3841983303519756e-05, + "loss": 1.22, + "step": 12685 + }, + { + "epoch": 0.8478086584714056, + "grad_norm": 0.18359375, + "learning_rate": 1.3782850291916271e-05, + "loss": 1.1553, + "step": 12690 + }, + { + "epoch": 0.8481427044361304, + "grad_norm": 0.1708984375, + "learning_rate": 1.3723834508943945e-05, + "loss": 1.2315, + "step": 12695 + }, + { + "epoch": 0.8484767504008551, + "grad_norm": 0.1767578125, + "learning_rate": 1.3664936034845933e-05, + "loss": 1.1529, + "step": 12700 + }, + { + "epoch": 0.8488107963655799, + "grad_norm": 0.1611328125, + "learning_rate": 1.360615494970594e-05, + "loss": 1.1605, + "step": 12705 + }, + { + "epoch": 0.8491448423303046, + "grad_norm": 0.1796875, + "learning_rate": 1.3547491333448003e-05, + "loss": 1.1639, + "step": 12710 + }, + { + "epoch": 0.8494788882950294, + "grad_norm": 0.205078125, + "learning_rate": 1.348894526583655e-05, + "loss": 1.189, + "step": 12715 + }, + { + "epoch": 0.8498129342597541, + "grad_norm": 0.1787109375, + "learning_rate": 1.343051682647607e-05, + "loss": 1.2305, + "step": 12720 + }, + { + "epoch": 0.8501469802244789, + "grad_norm": 0.1728515625, + "learning_rate": 1.3372206094811158e-05, + "loss": 1.1985, + "step": 12725 + }, + { + "epoch": 0.8504810261892036, + "grad_norm": 0.1767578125, + "learning_rate": 1.3314013150126336e-05, + "loss": 1.1517, + "step": 12730 + }, + { + "epoch": 0.8508150721539284, + "grad_norm": 0.171875, + "learning_rate": 1.3255938071546026e-05, + "loss": 1.1212, + "step": 12735 + }, + { + "epoch": 0.8511491181186531, + "grad_norm": 0.1787109375, + "learning_rate": 1.3197980938034305e-05, + "loss": 1.1748, + "step": 12740 + }, + { + "epoch": 0.8514831640833779, + "grad_norm": 0.1669921875, + "learning_rate": 1.3140141828394992e-05, + "loss": 1.2564, + "step": 12745 + }, + { + "epoch": 0.8518172100481026, + "grad_norm": 0.162109375, + "learning_rate": 1.3082420821271324e-05, + "loss": 1.2606, + "step": 12750 + }, + { + "epoch": 0.8521512560128274, + "grad_norm": 0.16796875, + "learning_rate": 1.3024817995146032e-05, + "loss": 1.2375, + "step": 12755 + }, + { + "epoch": 0.8524853019775521, + "grad_norm": 0.1787109375, + "learning_rate": 1.2967333428341121e-05, + "loss": 1.1506, + "step": 12760 + }, + { + "epoch": 0.8528193479422769, + "grad_norm": 0.173828125, + "learning_rate": 1.290996719901777e-05, + "loss": 1.2125, + "step": 12765 + }, + { + "epoch": 0.8531533939070016, + "grad_norm": 0.177734375, + "learning_rate": 1.2852719385176303e-05, + "loss": 1.231, + "step": 12770 + }, + { + "epoch": 0.8534874398717264, + "grad_norm": 0.1689453125, + "learning_rate": 1.279559006465607e-05, + "loss": 1.2045, + "step": 12775 + }, + { + "epoch": 0.8538214858364511, + "grad_norm": 0.1708984375, + "learning_rate": 1.2738579315135224e-05, + "loss": 1.1227, + "step": 12780 + }, + { + "epoch": 0.8541555318011759, + "grad_norm": 0.1640625, + "learning_rate": 1.268168721413071e-05, + "loss": 1.1242, + "step": 12785 + }, + { + "epoch": 0.8544895777659006, + "grad_norm": 0.1796875, + "learning_rate": 1.262491383899823e-05, + "loss": 1.1677, + "step": 12790 + }, + { + "epoch": 0.8548236237306254, + "grad_norm": 0.216796875, + "learning_rate": 1.2568259266931958e-05, + "loss": 1.2305, + "step": 12795 + }, + { + "epoch": 0.8551576696953501, + "grad_norm": 0.1728515625, + "learning_rate": 1.251172357496455e-05, + "loss": 1.1436, + "step": 12800 + }, + { + "epoch": 0.8554917156600749, + "grad_norm": 0.19140625, + "learning_rate": 1.2455306839967085e-05, + "loss": 1.178, + "step": 12805 + }, + { + "epoch": 0.8558257616247996, + "grad_norm": 0.1708984375, + "learning_rate": 1.2399009138648854e-05, + "loss": 1.1356, + "step": 12810 + }, + { + "epoch": 0.8561598075895244, + "grad_norm": 0.1796875, + "learning_rate": 1.2342830547557305e-05, + "loss": 1.2149, + "step": 12815 + }, + { + "epoch": 0.8564938535542491, + "grad_norm": 0.1669921875, + "learning_rate": 1.228677114307788e-05, + "loss": 1.185, + "step": 12820 + }, + { + "epoch": 0.8568278995189738, + "grad_norm": 0.185546875, + "learning_rate": 1.2230831001434084e-05, + "loss": 1.2487, + "step": 12825 + }, + { + "epoch": 0.8571619454836985, + "grad_norm": 0.173828125, + "learning_rate": 1.2175010198687143e-05, + "loss": 1.1808, + "step": 12830 + }, + { + "epoch": 0.8574959914484233, + "grad_norm": 0.1728515625, + "learning_rate": 1.2119308810736064e-05, + "loss": 1.234, + "step": 12835 + }, + { + "epoch": 0.857830037413148, + "grad_norm": 0.1728515625, + "learning_rate": 1.2063726913317508e-05, + "loss": 1.1794, + "step": 12840 + }, + { + "epoch": 0.8581640833778728, + "grad_norm": 0.169921875, + "learning_rate": 1.2008264582005657e-05, + "loss": 1.2039, + "step": 12845 + }, + { + "epoch": 0.8584981293425975, + "grad_norm": 0.1748046875, + "learning_rate": 1.195292189221211e-05, + "loss": 1.1237, + "step": 12850 + }, + { + "epoch": 0.8588321753073223, + "grad_norm": 0.1767578125, + "learning_rate": 1.189769891918575e-05, + "loss": 1.2079, + "step": 12855 + }, + { + "epoch": 0.859166221272047, + "grad_norm": 0.2080078125, + "learning_rate": 1.1842595738012774e-05, + "loss": 1.1594, + "step": 12860 + }, + { + "epoch": 0.8595002672367718, + "grad_norm": 0.173828125, + "learning_rate": 1.1787612423616412e-05, + "loss": 1.2106, + "step": 12865 + }, + { + "epoch": 0.8598343132014965, + "grad_norm": 0.1845703125, + "learning_rate": 1.1732749050756987e-05, + "loss": 1.2124, + "step": 12870 + }, + { + "epoch": 0.8601683591662213, + "grad_norm": 0.173828125, + "learning_rate": 1.1678005694031657e-05, + "loss": 1.2354, + "step": 12875 + }, + { + "epoch": 0.860502405130946, + "grad_norm": 0.1806640625, + "learning_rate": 1.1623382427874474e-05, + "loss": 1.1976, + "step": 12880 + }, + { + "epoch": 0.8608364510956708, + "grad_norm": 0.185546875, + "learning_rate": 1.156887932655616e-05, + "loss": 1.1636, + "step": 12885 + }, + { + "epoch": 0.8611704970603955, + "grad_norm": 0.1708984375, + "learning_rate": 1.1514496464184056e-05, + "loss": 1.2191, + "step": 12890 + }, + { + "epoch": 0.8615045430251203, + "grad_norm": 0.1728515625, + "learning_rate": 1.1460233914701968e-05, + "loss": 1.2312, + "step": 12895 + }, + { + "epoch": 0.861838588989845, + "grad_norm": 0.1767578125, + "learning_rate": 1.1406091751890257e-05, + "loss": 1.2598, + "step": 12900 + }, + { + "epoch": 0.8621726349545697, + "grad_norm": 0.1689453125, + "learning_rate": 1.135207004936546e-05, + "loss": 1.175, + "step": 12905 + }, + { + "epoch": 0.8625066809192945, + "grad_norm": 0.16796875, + "learning_rate": 1.1298168880580362e-05, + "loss": 1.2295, + "step": 12910 + }, + { + "epoch": 0.8628407268840192, + "grad_norm": 0.1640625, + "learning_rate": 1.1244388318823851e-05, + "loss": 1.173, + "step": 12915 + }, + { + "epoch": 0.863174772848744, + "grad_norm": 0.2236328125, + "learning_rate": 1.1190728437220877e-05, + "loss": 1.2834, + "step": 12920 + }, + { + "epoch": 0.8635088188134687, + "grad_norm": 0.173828125, + "learning_rate": 1.113718930873222e-05, + "loss": 1.2072, + "step": 12925 + }, + { + "epoch": 0.8638428647781935, + "grad_norm": 0.177734375, + "learning_rate": 1.1083771006154553e-05, + "loss": 1.2174, + "step": 12930 + }, + { + "epoch": 0.8641769107429182, + "grad_norm": 0.171875, + "learning_rate": 1.103047360212024e-05, + "loss": 1.1909, + "step": 12935 + }, + { + "epoch": 0.864510956707643, + "grad_norm": 0.1767578125, + "learning_rate": 1.0977297169097234e-05, + "loss": 1.1942, + "step": 12940 + }, + { + "epoch": 0.8648450026723677, + "grad_norm": 0.1708984375, + "learning_rate": 1.0924241779389011e-05, + "loss": 1.1526, + "step": 12945 + }, + { + "epoch": 0.8651790486370925, + "grad_norm": 0.1708984375, + "learning_rate": 1.0871307505134476e-05, + "loss": 1.2266, + "step": 12950 + }, + { + "epoch": 0.8655130946018172, + "grad_norm": 0.1669921875, + "learning_rate": 1.0818494418307845e-05, + "loss": 1.2367, + "step": 12955 + }, + { + "epoch": 0.865847140566542, + "grad_norm": 0.1689453125, + "learning_rate": 1.076580259071861e-05, + "loss": 1.2017, + "step": 12960 + }, + { + "epoch": 0.8661811865312667, + "grad_norm": 0.1728515625, + "learning_rate": 1.0713232094011316e-05, + "loss": 1.1665, + "step": 12965 + }, + { + "epoch": 0.8665152324959915, + "grad_norm": 0.1689453125, + "learning_rate": 1.0660782999665542e-05, + "loss": 1.2247, + "step": 12970 + }, + { + "epoch": 0.8668492784607162, + "grad_norm": 0.173828125, + "learning_rate": 1.0608455378995851e-05, + "loss": 1.1821, + "step": 12975 + }, + { + "epoch": 0.867183324425441, + "grad_norm": 0.1689453125, + "learning_rate": 1.0556249303151599e-05, + "loss": 1.1619, + "step": 12980 + }, + { + "epoch": 0.8675173703901656, + "grad_norm": 0.177734375, + "learning_rate": 1.050416484311686e-05, + "loss": 1.1964, + "step": 12985 + }, + { + "epoch": 0.8678514163548904, + "grad_norm": 0.1728515625, + "learning_rate": 1.0452202069710393e-05, + "loss": 1.1573, + "step": 12990 + }, + { + "epoch": 0.8681854623196151, + "grad_norm": 0.1708984375, + "learning_rate": 1.0400361053585506e-05, + "loss": 1.1311, + "step": 12995 + }, + { + "epoch": 0.8685195082843399, + "grad_norm": 0.1748046875, + "learning_rate": 1.0348641865229914e-05, + "loss": 1.1306, + "step": 13000 + }, + { + "epoch": 0.8688535542490646, + "grad_norm": 0.1728515625, + "learning_rate": 1.0297044574965675e-05, + "loss": 1.2016, + "step": 13005 + }, + { + "epoch": 0.8691876002137894, + "grad_norm": 0.1884765625, + "learning_rate": 1.024556925294916e-05, + "loss": 1.2006, + "step": 13010 + }, + { + "epoch": 0.8695216461785141, + "grad_norm": 0.1748046875, + "learning_rate": 1.0194215969170872e-05, + "loss": 1.1202, + "step": 13015 + }, + { + "epoch": 0.8698556921432389, + "grad_norm": 0.1748046875, + "learning_rate": 1.0142984793455346e-05, + "loss": 1.2478, + "step": 13020 + }, + { + "epoch": 0.8701897381079636, + "grad_norm": 0.171875, + "learning_rate": 1.0091875795461147e-05, + "loss": 1.1605, + "step": 13025 + }, + { + "epoch": 0.8705237840726884, + "grad_norm": 0.19140625, + "learning_rate": 1.00408890446807e-05, + "loss": 1.2451, + "step": 13030 + }, + { + "epoch": 0.8708578300374131, + "grad_norm": 0.173828125, + "learning_rate": 9.990024610440185e-06, + "loss": 1.2272, + "step": 13035 + }, + { + "epoch": 0.8711918760021379, + "grad_norm": 0.16796875, + "learning_rate": 9.939282561899466e-06, + "loss": 1.112, + "step": 13040 + }, + { + "epoch": 0.8715259219668626, + "grad_norm": 0.1728515625, + "learning_rate": 9.888662968052053e-06, + "loss": 1.2417, + "step": 13045 + }, + { + "epoch": 0.8718599679315874, + "grad_norm": 0.18359375, + "learning_rate": 9.838165897724894e-06, + "loss": 1.2227, + "step": 13050 + }, + { + "epoch": 0.8721940138963121, + "grad_norm": 0.16796875, + "learning_rate": 9.787791419578407e-06, + "loss": 1.2224, + "step": 13055 + }, + { + "epoch": 0.8725280598610369, + "grad_norm": 0.1630859375, + "learning_rate": 9.737539602106238e-06, + "loss": 1.1799, + "step": 13060 + }, + { + "epoch": 0.8728621058257616, + "grad_norm": 0.1962890625, + "learning_rate": 9.687410513635354e-06, + "loss": 1.2489, + "step": 13065 + }, + { + "epoch": 0.8731961517904864, + "grad_norm": 0.1728515625, + "learning_rate": 9.637404222325763e-06, + "loss": 1.1243, + "step": 13070 + }, + { + "epoch": 0.8735301977552111, + "grad_norm": 0.162109375, + "learning_rate": 9.587520796170524e-06, + "loss": 1.1682, + "step": 13075 + }, + { + "epoch": 0.8738642437199359, + "grad_norm": 0.1708984375, + "learning_rate": 9.53776030299568e-06, + "loss": 1.1071, + "step": 13080 + }, + { + "epoch": 0.8741982896846606, + "grad_norm": 0.1748046875, + "learning_rate": 9.488122810460097e-06, + "loss": 1.2338, + "step": 13085 + }, + { + "epoch": 0.8745323356493854, + "grad_norm": 0.1669921875, + "learning_rate": 9.438608386055403e-06, + "loss": 1.23, + "step": 13090 + }, + { + "epoch": 0.8748663816141101, + "grad_norm": 0.1630859375, + "learning_rate": 9.38921709710585e-06, + "loss": 1.1198, + "step": 13095 + }, + { + "epoch": 0.8752004275788349, + "grad_norm": 0.1767578125, + "learning_rate": 9.339949010768346e-06, + "loss": 1.2296, + "step": 13100 + }, + { + "epoch": 0.8755344735435596, + "grad_norm": 0.1943359375, + "learning_rate": 9.290804194032199e-06, + "loss": 1.1535, + "step": 13105 + }, + { + "epoch": 0.8758685195082844, + "grad_norm": 0.17578125, + "learning_rate": 9.24178271371915e-06, + "loss": 1.2012, + "step": 13110 + }, + { + "epoch": 0.8762025654730091, + "grad_norm": 0.1923828125, + "learning_rate": 9.192884636483246e-06, + "loss": 1.2506, + "step": 13115 + }, + { + "epoch": 0.8765366114377339, + "grad_norm": 0.1875, + "learning_rate": 9.144110028810737e-06, + "loss": 1.2568, + "step": 13120 + }, + { + "epoch": 0.8768706574024586, + "grad_norm": 0.1728515625, + "learning_rate": 9.095458957019987e-06, + "loss": 1.1589, + "step": 13125 + }, + { + "epoch": 0.8772047033671834, + "grad_norm": 0.173828125, + "learning_rate": 9.04693148726139e-06, + "loss": 1.2607, + "step": 13130 + }, + { + "epoch": 0.8775387493319081, + "grad_norm": 0.185546875, + "learning_rate": 8.998527685517255e-06, + "loss": 1.2026, + "step": 13135 + }, + { + "epoch": 0.8778727952966329, + "grad_norm": 0.16796875, + "learning_rate": 8.950247617601793e-06, + "loss": 1.1373, + "step": 13140 + }, + { + "epoch": 0.8782068412613576, + "grad_norm": 0.158203125, + "learning_rate": 8.902091349160968e-06, + "loss": 1.1837, + "step": 13145 + }, + { + "epoch": 0.8785408872260823, + "grad_norm": 0.169921875, + "learning_rate": 8.854058945672372e-06, + "loss": 1.1745, + "step": 13150 + }, + { + "epoch": 0.878874933190807, + "grad_norm": 0.1689453125, + "learning_rate": 8.806150472445185e-06, + "loss": 1.2169, + "step": 13155 + }, + { + "epoch": 0.8792089791555318, + "grad_norm": 0.181640625, + "learning_rate": 8.758365994620132e-06, + "loss": 1.1807, + "step": 13160 + }, + { + "epoch": 0.8795430251202565, + "grad_norm": 0.1650390625, + "learning_rate": 8.710705577169299e-06, + "loss": 1.1301, + "step": 13165 + }, + { + "epoch": 0.8798770710849813, + "grad_norm": 0.17578125, + "learning_rate": 8.663169284896078e-06, + "loss": 1.2744, + "step": 13170 + }, + { + "epoch": 0.880211117049706, + "grad_norm": 0.1787109375, + "learning_rate": 8.615757182435124e-06, + "loss": 1.1536, + "step": 13175 + }, + { + "epoch": 0.8805451630144308, + "grad_norm": 0.177734375, + "learning_rate": 8.568469334252238e-06, + "loss": 1.2433, + "step": 13180 + }, + { + "epoch": 0.8808792089791555, + "grad_norm": 0.185546875, + "learning_rate": 8.52130580464423e-06, + "loss": 1.2025, + "step": 13185 + }, + { + "epoch": 0.8812132549438803, + "grad_norm": 0.1640625, + "learning_rate": 8.474266657738895e-06, + "loss": 1.069, + "step": 13190 + }, + { + "epoch": 0.881547300908605, + "grad_norm": 0.171875, + "learning_rate": 8.427351957494921e-06, + "loss": 1.1772, + "step": 13195 + }, + { + "epoch": 0.8818813468733298, + "grad_norm": 0.1796875, + "learning_rate": 8.380561767701778e-06, + "loss": 1.2106, + "step": 13200 + }, + { + "epoch": 0.8822153928380545, + "grad_norm": 0.208984375, + "learning_rate": 8.333896151979636e-06, + "loss": 1.2206, + "step": 13205 + }, + { + "epoch": 0.8825494388027793, + "grad_norm": 0.1845703125, + "learning_rate": 8.287355173779265e-06, + "loss": 1.1584, + "step": 13210 + }, + { + "epoch": 0.882883484767504, + "grad_norm": 0.181640625, + "learning_rate": 8.240938896382022e-06, + "loss": 1.1826, + "step": 13215 + }, + { + "epoch": 0.8832175307322288, + "grad_norm": 0.2099609375, + "learning_rate": 8.194647382899656e-06, + "loss": 1.2083, + "step": 13220 + }, + { + "epoch": 0.8835515766969535, + "grad_norm": 0.1650390625, + "learning_rate": 8.148480696274275e-06, + "loss": 1.2252, + "step": 13225 + }, + { + "epoch": 0.8838856226616782, + "grad_norm": 0.1708984375, + "learning_rate": 8.102438899278298e-06, + "loss": 1.1475, + "step": 13230 + }, + { + "epoch": 0.884219668626403, + "grad_norm": 0.185546875, + "learning_rate": 8.056522054514337e-06, + "loss": 1.2059, + "step": 13235 + }, + { + "epoch": 0.8845537145911277, + "grad_norm": 0.1728515625, + "learning_rate": 8.010730224415064e-06, + "loss": 1.2262, + "step": 13240 + }, + { + "epoch": 0.8848877605558525, + "grad_norm": 0.166015625, + "learning_rate": 7.965063471243161e-06, + "loss": 1.1197, + "step": 13245 + }, + { + "epoch": 0.8852218065205772, + "grad_norm": 0.173828125, + "learning_rate": 7.919521857091328e-06, + "loss": 1.1822, + "step": 13250 + }, + { + "epoch": 0.885555852485302, + "grad_norm": 0.1611328125, + "learning_rate": 7.874105443882041e-06, + "loss": 1.1623, + "step": 13255 + }, + { + "epoch": 0.8858898984500267, + "grad_norm": 0.171875, + "learning_rate": 7.828814293367525e-06, + "loss": 1.2486, + "step": 13260 + }, + { + "epoch": 0.8862239444147515, + "grad_norm": 0.1591796875, + "learning_rate": 7.783648467129767e-06, + "loss": 1.1448, + "step": 13265 + }, + { + "epoch": 0.8865579903794762, + "grad_norm": 0.1611328125, + "learning_rate": 7.738608026580296e-06, + "loss": 1.1805, + "step": 13270 + }, + { + "epoch": 0.886892036344201, + "grad_norm": 0.171875, + "learning_rate": 7.693693032960181e-06, + "loss": 1.2759, + "step": 13275 + }, + { + "epoch": 0.8872260823089257, + "grad_norm": 0.1796875, + "learning_rate": 7.648903547339858e-06, + "loss": 1.1487, + "step": 13280 + }, + { + "epoch": 0.8875601282736505, + "grad_norm": 0.1865234375, + "learning_rate": 7.604239630619225e-06, + "loss": 1.1906, + "step": 13285 + }, + { + "epoch": 0.8878941742383752, + "grad_norm": 0.169921875, + "learning_rate": 7.559701343527348e-06, + "loss": 1.1496, + "step": 13290 + }, + { + "epoch": 0.8882282202031, + "grad_norm": 0.16796875, + "learning_rate": 7.515288746622495e-06, + "loss": 1.1543, + "step": 13295 + }, + { + "epoch": 0.8885622661678247, + "grad_norm": 0.17578125, + "learning_rate": 7.471001900292063e-06, + "loss": 1.1974, + "step": 13300 + }, + { + "epoch": 0.8888963121325495, + "grad_norm": 0.1650390625, + "learning_rate": 7.42684086475246e-06, + "loss": 1.2546, + "step": 13305 + }, + { + "epoch": 0.8892303580972741, + "grad_norm": 0.173828125, + "learning_rate": 7.382805700049023e-06, + "loss": 1.2219, + "step": 13310 + }, + { + "epoch": 0.8895644040619989, + "grad_norm": 0.1650390625, + "learning_rate": 7.338896466055934e-06, + "loss": 1.1687, + "step": 13315 + }, + { + "epoch": 0.8898984500267236, + "grad_norm": 0.166015625, + "learning_rate": 7.295113222476147e-06, + "loss": 1.1647, + "step": 13320 + }, + { + "epoch": 0.8902324959914484, + "grad_norm": 0.16015625, + "learning_rate": 7.251456028841319e-06, + "loss": 1.171, + "step": 13325 + }, + { + "epoch": 0.8905665419561731, + "grad_norm": 0.1796875, + "learning_rate": 7.207924944511757e-06, + "loss": 1.1786, + "step": 13330 + }, + { + "epoch": 0.8909005879208979, + "grad_norm": 0.1748046875, + "learning_rate": 7.1645200286762245e-06, + "loss": 1.2024, + "step": 13335 + }, + { + "epoch": 0.8912346338856226, + "grad_norm": 0.166015625, + "learning_rate": 7.121241340351947e-06, + "loss": 1.2028, + "step": 13340 + }, + { + "epoch": 0.8915686798503474, + "grad_norm": 0.2001953125, + "learning_rate": 7.078088938384597e-06, + "loss": 1.2511, + "step": 13345 + }, + { + "epoch": 0.8919027258150721, + "grad_norm": 0.173828125, + "learning_rate": 7.035062881448051e-06, + "loss": 1.1716, + "step": 13350 + }, + { + "epoch": 0.8922367717797969, + "grad_norm": 0.158203125, + "learning_rate": 6.99216322804439e-06, + "loss": 1.1883, + "step": 13355 + }, + { + "epoch": 0.8925708177445216, + "grad_norm": 0.177734375, + "learning_rate": 6.9493900365039335e-06, + "loss": 1.2068, + "step": 13360 + }, + { + "epoch": 0.8929048637092464, + "grad_norm": 0.181640625, + "learning_rate": 6.9067433649849465e-06, + "loss": 1.2287, + "step": 13365 + }, + { + "epoch": 0.8932389096739711, + "grad_norm": 0.173828125, + "learning_rate": 6.864223271473702e-06, + "loss": 1.2195, + "step": 13370 + }, + { + "epoch": 0.8935729556386959, + "grad_norm": 0.2216796875, + "learning_rate": 6.821829813784331e-06, + "loss": 1.2318, + "step": 13375 + }, + { + "epoch": 0.8939070016034206, + "grad_norm": 0.1669921875, + "learning_rate": 6.779563049558857e-06, + "loss": 1.1857, + "step": 13380 + }, + { + "epoch": 0.8942410475681454, + "grad_norm": 0.162109375, + "learning_rate": 6.7374230362669655e-06, + "loss": 1.1986, + "step": 13385 + }, + { + "epoch": 0.8945750935328701, + "grad_norm": 0.1708984375, + "learning_rate": 6.695409831206045e-06, + "loss": 1.1192, + "step": 13390 + }, + { + "epoch": 0.8949091394975949, + "grad_norm": 0.1767578125, + "learning_rate": 6.653523491501035e-06, + "loss": 1.1814, + "step": 13395 + }, + { + "epoch": 0.8952431854623196, + "grad_norm": 0.173828125, + "learning_rate": 6.611764074104409e-06, + "loss": 1.2354, + "step": 13400 + }, + { + "epoch": 0.8955772314270444, + "grad_norm": 0.177734375, + "learning_rate": 6.5701316357960285e-06, + "loss": 1.1647, + "step": 13405 + }, + { + "epoch": 0.8959112773917691, + "grad_norm": 0.18359375, + "learning_rate": 6.52862623318311e-06, + "loss": 1.1939, + "step": 13410 + }, + { + "epoch": 0.8962453233564939, + "grad_norm": 0.1796875, + "learning_rate": 6.487247922700157e-06, + "loss": 1.1808, + "step": 13415 + }, + { + "epoch": 0.8965793693212186, + "grad_norm": 0.1708984375, + "learning_rate": 6.445996760608896e-06, + "loss": 1.2396, + "step": 13420 + }, + { + "epoch": 0.8969134152859434, + "grad_norm": 0.16796875, + "learning_rate": 6.404872802998108e-06, + "loss": 1.182, + "step": 13425 + }, + { + "epoch": 0.8972474612506681, + "grad_norm": 0.16015625, + "learning_rate": 6.3638761057836285e-06, + "loss": 1.1529, + "step": 13430 + }, + { + "epoch": 0.8975815072153929, + "grad_norm": 0.1748046875, + "learning_rate": 6.323006724708302e-06, + "loss": 1.2241, + "step": 13435 + }, + { + "epoch": 0.8979155531801176, + "grad_norm": 0.1845703125, + "learning_rate": 6.2822647153418235e-06, + "loss": 1.2137, + "step": 13440 + }, + { + "epoch": 0.8982495991448424, + "grad_norm": 0.1650390625, + "learning_rate": 6.24165013308069e-06, + "loss": 1.1781, + "step": 13445 + }, + { + "epoch": 0.8985836451095671, + "grad_norm": 0.2060546875, + "learning_rate": 6.2011630331481826e-06, + "loss": 1.2033, + "step": 13450 + }, + { + "epoch": 0.8989176910742919, + "grad_norm": 0.1884765625, + "learning_rate": 6.160803470594234e-06, + "loss": 1.3137, + "step": 13455 + }, + { + "epoch": 0.8992517370390166, + "grad_norm": 0.1669921875, + "learning_rate": 6.120571500295347e-06, + "loss": 1.2285, + "step": 13460 + }, + { + "epoch": 0.8995857830037414, + "grad_norm": 0.16796875, + "learning_rate": 6.080467176954519e-06, + "loss": 1.1531, + "step": 13465 + }, + { + "epoch": 0.899919828968466, + "grad_norm": 0.1669921875, + "learning_rate": 6.040490555101241e-06, + "loss": 1.1854, + "step": 13470 + }, + { + "epoch": 0.9002538749331908, + "grad_norm": 0.1689453125, + "learning_rate": 6.000641689091324e-06, + "loss": 1.1357, + "step": 13475 + }, + { + "epoch": 0.9005879208979155, + "grad_norm": 0.1689453125, + "learning_rate": 5.960920633106926e-06, + "loss": 1.2068, + "step": 13480 + }, + { + "epoch": 0.9009219668626403, + "grad_norm": 0.173828125, + "learning_rate": 5.921327441156321e-06, + "loss": 1.2706, + "step": 13485 + }, + { + "epoch": 0.901256012827365, + "grad_norm": 0.171875, + "learning_rate": 5.8818621670740595e-06, + "loss": 1.2044, + "step": 13490 + }, + { + "epoch": 0.9015900587920898, + "grad_norm": 0.169921875, + "learning_rate": 5.84252486452066e-06, + "loss": 1.1731, + "step": 13495 + }, + { + "epoch": 0.9019241047568145, + "grad_norm": 0.1796875, + "learning_rate": 5.803315586982671e-06, + "loss": 1.1598, + "step": 13500 + }, + { + "epoch": 0.9022581507215393, + "grad_norm": 0.171875, + "learning_rate": 5.764234387772593e-06, + "loss": 1.1737, + "step": 13505 + }, + { + "epoch": 0.902592196686264, + "grad_norm": 0.1572265625, + "learning_rate": 5.7252813200287306e-06, + "loss": 1.1653, + "step": 13510 + }, + { + "epoch": 0.9029262426509888, + "grad_norm": 0.1552734375, + "learning_rate": 5.686456436715227e-06, + "loss": 1.1973, + "step": 13515 + }, + { + "epoch": 0.9032602886157135, + "grad_norm": 0.173828125, + "learning_rate": 5.647759790621876e-06, + "loss": 1.2162, + "step": 13520 + }, + { + "epoch": 0.9035943345804383, + "grad_norm": 0.1826171875, + "learning_rate": 5.609191434364159e-06, + "loss": 1.2118, + "step": 13525 + }, + { + "epoch": 0.903928380545163, + "grad_norm": 0.1708984375, + "learning_rate": 5.5707514203830915e-06, + "loss": 1.2077, + "step": 13530 + }, + { + "epoch": 0.9042624265098878, + "grad_norm": 0.1728515625, + "learning_rate": 5.532439800945188e-06, + "loss": 1.2367, + "step": 13535 + }, + { + "epoch": 0.9045964724746125, + "grad_norm": 0.177734375, + "learning_rate": 5.494256628142358e-06, + "loss": 1.2476, + "step": 13540 + }, + { + "epoch": 0.9049305184393373, + "grad_norm": 0.177734375, + "learning_rate": 5.45620195389196e-06, + "loss": 1.1336, + "step": 13545 + }, + { + "epoch": 0.905264564404062, + "grad_norm": 0.1650390625, + "learning_rate": 5.418275829936537e-06, + "loss": 1.2059, + "step": 13550 + }, + { + "epoch": 0.9055986103687867, + "grad_norm": 0.1845703125, + "learning_rate": 5.380478307843872e-06, + "loss": 1.181, + "step": 13555 + }, + { + "epoch": 0.9059326563335115, + "grad_norm": 0.162109375, + "learning_rate": 5.3428094390068906e-06, + "loss": 1.1992, + "step": 13560 + }, + { + "epoch": 0.9062667022982362, + "grad_norm": 0.1728515625, + "learning_rate": 5.3052692746436095e-06, + "loss": 1.1258, + "step": 13565 + }, + { + "epoch": 0.906600748262961, + "grad_norm": 0.1650390625, + "learning_rate": 5.2678578657970105e-06, + "loss": 1.1891, + "step": 13570 + }, + { + "epoch": 0.9069347942276857, + "grad_norm": 0.169921875, + "learning_rate": 5.2305752633350355e-06, + "loss": 1.1366, + "step": 13575 + }, + { + "epoch": 0.9072688401924105, + "grad_norm": 0.1650390625, + "learning_rate": 5.193421517950481e-06, + "loss": 1.1117, + "step": 13580 + }, + { + "epoch": 0.9076028861571352, + "grad_norm": 0.1796875, + "learning_rate": 5.1563966801609465e-06, + "loss": 1.228, + "step": 13585 + }, + { + "epoch": 0.90793693212186, + "grad_norm": 0.1640625, + "learning_rate": 5.119500800308741e-06, + "loss": 1.2213, + "step": 13590 + }, + { + "epoch": 0.9082709780865847, + "grad_norm": 0.169921875, + "learning_rate": 5.082733928560835e-06, + "loss": 1.2232, + "step": 13595 + }, + { + "epoch": 0.9086050240513095, + "grad_norm": 0.169921875, + "learning_rate": 5.0460961149087824e-06, + "loss": 1.1233, + "step": 13600 + }, + { + "epoch": 0.9089390700160342, + "grad_norm": 0.173828125, + "learning_rate": 5.009587409168703e-06, + "loss": 1.2048, + "step": 13605 + }, + { + "epoch": 0.909273115980759, + "grad_norm": 0.169921875, + "learning_rate": 4.9732078609811e-06, + "loss": 1.1446, + "step": 13610 + }, + { + "epoch": 0.9096071619454837, + "grad_norm": 0.1806640625, + "learning_rate": 4.936957519810892e-06, + "loss": 1.2648, + "step": 13615 + }, + { + "epoch": 0.9099412079102085, + "grad_norm": 0.169921875, + "learning_rate": 4.900836434947353e-06, + "loss": 1.2087, + "step": 13620 + }, + { + "epoch": 0.9102752538749332, + "grad_norm": 0.171875, + "learning_rate": 4.864844655503953e-06, + "loss": 1.2085, + "step": 13625 + }, + { + "epoch": 0.9106092998396579, + "grad_norm": 0.1806640625, + "learning_rate": 4.8289822304183665e-06, + "loss": 1.2029, + "step": 13630 + }, + { + "epoch": 0.9109433458043826, + "grad_norm": 0.169921875, + "learning_rate": 4.793249208452388e-06, + "loss": 1.1732, + "step": 13635 + }, + { + "epoch": 0.9112773917691074, + "grad_norm": 0.1923828125, + "learning_rate": 4.757645638191876e-06, + "loss": 1.2723, + "step": 13640 + }, + { + "epoch": 0.9116114377338321, + "grad_norm": 0.1748046875, + "learning_rate": 4.722171568046674e-06, + "loss": 1.2018, + "step": 13645 + }, + { + "epoch": 0.9119454836985569, + "grad_norm": 0.171875, + "learning_rate": 4.686827046250497e-06, + "loss": 1.1453, + "step": 13650 + }, + { + "epoch": 0.9122795296632816, + "grad_norm": 0.16796875, + "learning_rate": 4.651612120860993e-06, + "loss": 1.2069, + "step": 13655 + }, + { + "epoch": 0.9126135756280064, + "grad_norm": 0.1689453125, + "learning_rate": 4.616526839759516e-06, + "loss": 1.2072, + "step": 13660 + }, + { + "epoch": 0.9129476215927311, + "grad_norm": 0.17578125, + "learning_rate": 4.58157125065124e-06, + "loss": 1.2006, + "step": 13665 + }, + { + "epoch": 0.9132816675574559, + "grad_norm": 0.173828125, + "learning_rate": 4.546745401064889e-06, + "loss": 1.237, + "step": 13670 + }, + { + "epoch": 0.9136157135221806, + "grad_norm": 0.173828125, + "learning_rate": 4.512049338352875e-06, + "loss": 1.1559, + "step": 13675 + }, + { + "epoch": 0.9139497594869054, + "grad_norm": 0.1669921875, + "learning_rate": 4.477483109691083e-06, + "loss": 1.2091, + "step": 13680 + }, + { + "epoch": 0.9142838054516301, + "grad_norm": 0.1640625, + "learning_rate": 4.443046762078884e-06, + "loss": 1.1398, + "step": 13685 + }, + { + "epoch": 0.9146178514163549, + "grad_norm": 0.1796875, + "learning_rate": 4.408740342339046e-06, + "loss": 1.1943, + "step": 13690 + }, + { + "epoch": 0.9149518973810796, + "grad_norm": 0.1708984375, + "learning_rate": 4.3745638971177005e-06, + "loss": 1.1941, + "step": 13695 + }, + { + "epoch": 0.9152859433458044, + "grad_norm": 0.1865234375, + "learning_rate": 4.34051747288422e-06, + "loss": 1.2382, + "step": 13700 + }, + { + "epoch": 0.9156199893105291, + "grad_norm": 0.1669921875, + "learning_rate": 4.306601115931175e-06, + "loss": 1.1271, + "step": 13705 + }, + { + "epoch": 0.9159540352752539, + "grad_norm": 0.169921875, + "learning_rate": 4.2728148723743536e-06, + "loss": 1.2612, + "step": 13710 + }, + { + "epoch": 0.9162880812399786, + "grad_norm": 0.1787109375, + "learning_rate": 4.239158788152564e-06, + "loss": 1.1464, + "step": 13715 + }, + { + "epoch": 0.9166221272047034, + "grad_norm": 0.1689453125, + "learning_rate": 4.205632909027668e-06, + "loss": 1.1814, + "step": 13720 + }, + { + "epoch": 0.9169561731694281, + "grad_norm": 0.1708984375, + "learning_rate": 4.172237280584445e-06, + "loss": 1.2345, + "step": 13725 + }, + { + "epoch": 0.9172902191341529, + "grad_norm": 0.1767578125, + "learning_rate": 4.138971948230674e-06, + "loss": 1.1822, + "step": 13730 + }, + { + "epoch": 0.9176242650988776, + "grad_norm": 0.173828125, + "learning_rate": 4.105836957196873e-06, + "loss": 1.1968, + "step": 13735 + }, + { + "epoch": 0.9179583110636024, + "grad_norm": 0.1611328125, + "learning_rate": 4.072832352536382e-06, + "loss": 1.1399, + "step": 13740 + }, + { + "epoch": 0.9182923570283271, + "grad_norm": 0.16015625, + "learning_rate": 4.039958179125225e-06, + "loss": 1.1361, + "step": 13745 + }, + { + "epoch": 0.9186264029930519, + "grad_norm": 0.1748046875, + "learning_rate": 4.007214481662125e-06, + "loss": 1.1929, + "step": 13750 + }, + { + "epoch": 0.9189604489577766, + "grad_norm": 0.171875, + "learning_rate": 3.974601304668346e-06, + "loss": 1.1354, + "step": 13755 + }, + { + "epoch": 0.9192944949225014, + "grad_norm": 0.162109375, + "learning_rate": 3.94211869248774e-06, + "loss": 1.1908, + "step": 13760 + }, + { + "epoch": 0.9196285408872261, + "grad_norm": 0.1767578125, + "learning_rate": 3.909766689286576e-06, + "loss": 1.1672, + "step": 13765 + }, + { + "epoch": 0.9199625868519509, + "grad_norm": 0.169921875, + "learning_rate": 3.877545339053601e-06, + "loss": 1.1175, + "step": 13770 + }, + { + "epoch": 0.9202966328166756, + "grad_norm": 0.177734375, + "learning_rate": 3.845454685599847e-06, + "loss": 1.1686, + "step": 13775 + }, + { + "epoch": 0.9206306787814004, + "grad_norm": 0.171875, + "learning_rate": 3.813494772558657e-06, + "loss": 1.1441, + "step": 13780 + }, + { + "epoch": 0.9209647247461251, + "grad_norm": 0.1806640625, + "learning_rate": 3.7816656433856348e-06, + "loss": 1.1806, + "step": 13785 + }, + { + "epoch": 0.9212987707108499, + "grad_norm": 0.1787109375, + "learning_rate": 3.7499673413585513e-06, + "loss": 1.1616, + "step": 13790 + }, + { + "epoch": 0.9216328166755745, + "grad_norm": 0.166015625, + "learning_rate": 3.718399909577275e-06, + "loss": 1.1416, + "step": 13795 + }, + { + "epoch": 0.9219668626402993, + "grad_norm": 0.166015625, + "learning_rate": 3.686963390963727e-06, + "loss": 1.1897, + "step": 13800 + }, + { + "epoch": 0.922300908605024, + "grad_norm": 0.18359375, + "learning_rate": 3.65565782826186e-06, + "loss": 1.1332, + "step": 13805 + }, + { + "epoch": 0.9226349545697488, + "grad_norm": 0.1650390625, + "learning_rate": 3.624483264037537e-06, + "loss": 1.1628, + "step": 13810 + }, + { + "epoch": 0.9229690005344735, + "grad_norm": 0.162109375, + "learning_rate": 3.593439740678506e-06, + "loss": 1.2139, + "step": 13815 + }, + { + "epoch": 0.9233030464991983, + "grad_norm": 0.169921875, + "learning_rate": 3.5625273003943583e-06, + "loss": 1.176, + "step": 13820 + }, + { + "epoch": 0.923637092463923, + "grad_norm": 0.1796875, + "learning_rate": 3.5317459852164502e-06, + "loss": 1.2045, + "step": 13825 + }, + { + "epoch": 0.9239711384286478, + "grad_norm": 0.1669921875, + "learning_rate": 3.501095836997825e-06, + "loss": 1.2195, + "step": 13830 + }, + { + "epoch": 0.9243051843933725, + "grad_norm": 0.1689453125, + "learning_rate": 3.4705768974132024e-06, + "loss": 1.1469, + "step": 13835 + }, + { + "epoch": 0.9246392303580973, + "grad_norm": 0.1650390625, + "learning_rate": 3.4401892079588993e-06, + "loss": 1.1442, + "step": 13840 + }, + { + "epoch": 0.924973276322822, + "grad_norm": 0.18359375, + "learning_rate": 3.4099328099527427e-06, + "loss": 1.2137, + "step": 13845 + }, + { + "epoch": 0.9253073222875468, + "grad_norm": 0.171875, + "learning_rate": 3.3798077445340915e-06, + "loss": 1.2369, + "step": 13850 + }, + { + "epoch": 0.9256413682522715, + "grad_norm": 0.1689453125, + "learning_rate": 3.34981405266368e-06, + "loss": 1.1292, + "step": 13855 + }, + { + "epoch": 0.9259754142169963, + "grad_norm": 0.1826171875, + "learning_rate": 3.3199517751236753e-06, + "loss": 1.1946, + "step": 13860 + }, + { + "epoch": 0.926309460181721, + "grad_norm": 0.16796875, + "learning_rate": 3.29022095251752e-06, + "loss": 1.156, + "step": 13865 + }, + { + "epoch": 0.9266435061464458, + "grad_norm": 0.162109375, + "learning_rate": 3.2606216252699106e-06, + "loss": 1.2873, + "step": 13870 + }, + { + "epoch": 0.9269775521111705, + "grad_norm": 0.173828125, + "learning_rate": 3.2311538336267766e-06, + "loss": 1.2188, + "step": 13875 + }, + { + "epoch": 0.9273115980758952, + "grad_norm": 0.1640625, + "learning_rate": 3.2018176176552116e-06, + "loss": 1.1445, + "step": 13880 + }, + { + "epoch": 0.92764564404062, + "grad_norm": 0.1669921875, + "learning_rate": 3.1726130172433755e-06, + "loss": 1.1774, + "step": 13885 + }, + { + "epoch": 0.9279796900053447, + "grad_norm": 0.177734375, + "learning_rate": 3.143540072100459e-06, + "loss": 1.2333, + "step": 13890 + }, + { + "epoch": 0.9283137359700695, + "grad_norm": 0.171875, + "learning_rate": 3.114598821756698e-06, + "loss": 1.2043, + "step": 13895 + }, + { + "epoch": 0.9286477819347942, + "grad_norm": 0.177734375, + "learning_rate": 3.0857893055632246e-06, + "loss": 1.2579, + "step": 13900 + }, + { + "epoch": 0.928981827899519, + "grad_norm": 0.166015625, + "learning_rate": 3.0571115626920498e-06, + "loss": 1.1417, + "step": 13905 + }, + { + "epoch": 0.9293158738642437, + "grad_norm": 0.171875, + "learning_rate": 3.0285656321360267e-06, + "loss": 1.1325, + "step": 13910 + }, + { + "epoch": 0.9296499198289685, + "grad_norm": 0.1640625, + "learning_rate": 3.0001515527088074e-06, + "loss": 1.1791, + "step": 13915 + }, + { + "epoch": 0.9299839657936932, + "grad_norm": 0.1640625, + "learning_rate": 2.9718693630447214e-06, + "loss": 1.1859, + "step": 13920 + }, + { + "epoch": 0.930318011758418, + "grad_norm": 0.169921875, + "learning_rate": 2.9437191015987854e-06, + "loss": 1.1939, + "step": 13925 + }, + { + "epoch": 0.9306520577231427, + "grad_norm": 0.1845703125, + "learning_rate": 2.9157008066466707e-06, + "loss": 1.2171, + "step": 13930 + }, + { + "epoch": 0.9309861036878675, + "grad_norm": 0.1708984375, + "learning_rate": 2.8878145162845373e-06, + "loss": 1.2151, + "step": 13935 + }, + { + "epoch": 0.9313201496525922, + "grad_norm": 0.1796875, + "learning_rate": 2.8600602684291546e-06, + "loss": 1.2412, + "step": 13940 + }, + { + "epoch": 0.931654195617317, + "grad_norm": 0.171875, + "learning_rate": 2.832438100817658e-06, + "loss": 1.1997, + "step": 13945 + }, + { + "epoch": 0.9319882415820417, + "grad_norm": 0.1708984375, + "learning_rate": 2.8049480510076944e-06, + "loss": 1.2391, + "step": 13950 + }, + { + "epoch": 0.9323222875467664, + "grad_norm": 0.2119140625, + "learning_rate": 2.7775901563771967e-06, + "loss": 1.267, + "step": 13955 + }, + { + "epoch": 0.9326563335114911, + "grad_norm": 0.1728515625, + "learning_rate": 2.750364454124432e-06, + "loss": 1.1957, + "step": 13960 + }, + { + "epoch": 0.9329903794762159, + "grad_norm": 0.171875, + "learning_rate": 2.7232709812679114e-06, + "loss": 1.1725, + "step": 13965 + }, + { + "epoch": 0.9333244254409406, + "grad_norm": 0.1728515625, + "learning_rate": 2.6963097746463882e-06, + "loss": 1.085, + "step": 13970 + }, + { + "epoch": 0.9336584714056654, + "grad_norm": 0.166015625, + "learning_rate": 2.6694808709187613e-06, + "loss": 1.1619, + "step": 13975 + }, + { + "epoch": 0.9339925173703901, + "grad_norm": 0.189453125, + "learning_rate": 2.6427843065640167e-06, + "loss": 1.2746, + "step": 13980 + }, + { + "epoch": 0.9343265633351149, + "grad_norm": 0.1796875, + "learning_rate": 2.6162201178812294e-06, + "loss": 1.2403, + "step": 13985 + }, + { + "epoch": 0.9346606092998396, + "grad_norm": 0.1767578125, + "learning_rate": 2.589788340989474e-06, + "loss": 1.2107, + "step": 13990 + }, + { + "epoch": 0.9349946552645644, + "grad_norm": 0.1708984375, + "learning_rate": 2.5634890118277686e-06, + "loss": 1.2215, + "step": 13995 + }, + { + "epoch": 0.9353287012292891, + "grad_norm": 0.169921875, + "learning_rate": 2.537322166155065e-06, + "loss": 1.1664, + "step": 14000 + }, + { + "epoch": 0.9356627471940139, + "grad_norm": 0.1689453125, + "learning_rate": 2.5112878395501805e-06, + "loss": 1.209, + "step": 14005 + }, + { + "epoch": 0.9359967931587386, + "grad_norm": 0.203125, + "learning_rate": 2.4853860674117325e-06, + "loss": 1.1431, + "step": 14010 + }, + { + "epoch": 0.9363308391234634, + "grad_norm": 0.16796875, + "learning_rate": 2.4596168849581158e-06, + "loss": 1.2497, + "step": 14015 + }, + { + "epoch": 0.9366648850881881, + "grad_norm": 0.1748046875, + "learning_rate": 2.4339803272274366e-06, + "loss": 1.1754, + "step": 14020 + }, + { + "epoch": 0.9369989310529129, + "grad_norm": 0.1796875, + "learning_rate": 2.4084764290774775e-06, + "loss": 1.1658, + "step": 14025 + }, + { + "epoch": 0.9373329770176376, + "grad_norm": 0.16796875, + "learning_rate": 2.3831052251856445e-06, + "loss": 1.2201, + "step": 14030 + }, + { + "epoch": 0.9376670229823624, + "grad_norm": 0.1875, + "learning_rate": 2.3578667500489206e-06, + "loss": 1.2038, + "step": 14035 + }, + { + "epoch": 0.9380010689470871, + "grad_norm": 0.1630859375, + "learning_rate": 2.3327610379838105e-06, + "loss": 1.1809, + "step": 14040 + }, + { + "epoch": 0.9383351149118119, + "grad_norm": 0.17578125, + "learning_rate": 2.307788123126331e-06, + "loss": 1.1757, + "step": 14045 + }, + { + "epoch": 0.9386691608765366, + "grad_norm": 0.1796875, + "learning_rate": 2.282948039431898e-06, + "loss": 1.209, + "step": 14050 + }, + { + "epoch": 0.9390032068412614, + "grad_norm": 0.18359375, + "learning_rate": 2.258240820675317e-06, + "loss": 1.1629, + "step": 14055 + }, + { + "epoch": 0.9393372528059861, + "grad_norm": 0.1748046875, + "learning_rate": 2.233666500450793e-06, + "loss": 1.1975, + "step": 14060 + }, + { + "epoch": 0.9396712987707109, + "grad_norm": 0.1669921875, + "learning_rate": 2.2092251121717757e-06, + "loss": 1.1843, + "step": 14065 + }, + { + "epoch": 0.9400053447354356, + "grad_norm": 0.17578125, + "learning_rate": 2.1849166890709927e-06, + "loss": 1.1829, + "step": 14070 + }, + { + "epoch": 0.9403393907001604, + "grad_norm": 0.1728515625, + "learning_rate": 2.160741264200361e-06, + "loss": 1.1872, + "step": 14075 + }, + { + "epoch": 0.9406734366648851, + "grad_norm": 0.1728515625, + "learning_rate": 2.1366988704309976e-06, + "loss": 1.2531, + "step": 14080 + }, + { + "epoch": 0.9410074826296099, + "grad_norm": 0.158203125, + "learning_rate": 2.112789540453086e-06, + "loss": 1.1454, + "step": 14085 + }, + { + "epoch": 0.9413415285943346, + "grad_norm": 0.177734375, + "learning_rate": 2.089013306775922e-06, + "loss": 1.1905, + "step": 14090 + }, + { + "epoch": 0.9416755745590594, + "grad_norm": 0.1728515625, + "learning_rate": 2.065370201727823e-06, + "loss": 1.1781, + "step": 14095 + }, + { + "epoch": 0.9420096205237841, + "grad_norm": 0.17578125, + "learning_rate": 2.0418602574561074e-06, + "loss": 1.158, + "step": 14100 + }, + { + "epoch": 0.9423436664885089, + "grad_norm": 0.1806640625, + "learning_rate": 2.0184835059270047e-06, + "loss": 1.2032, + "step": 14105 + }, + { + "epoch": 0.9426777124532336, + "grad_norm": 0.1748046875, + "learning_rate": 1.9952399789256558e-06, + "loss": 1.135, + "step": 14110 + }, + { + "epoch": 0.9430117584179583, + "grad_norm": 0.1728515625, + "learning_rate": 1.9721297080560807e-06, + "loss": 1.225, + "step": 14115 + }, + { + "epoch": 0.943345804382683, + "grad_norm": 0.1884765625, + "learning_rate": 1.9491527247410657e-06, + "loss": 1.2238, + "step": 14120 + }, + { + "epoch": 0.9436798503474078, + "grad_norm": 0.2451171875, + "learning_rate": 1.92630906022222e-06, + "loss": 1.0923, + "step": 14125 + }, + { + "epoch": 0.9440138963121325, + "grad_norm": 0.173828125, + "learning_rate": 1.9035987455598425e-06, + "loss": 1.2037, + "step": 14130 + }, + { + "epoch": 0.9443479422768573, + "grad_norm": 0.1708984375, + "learning_rate": 1.8810218116329326e-06, + "loss": 1.2147, + "step": 14135 + }, + { + "epoch": 0.944681988241582, + "grad_norm": 0.162109375, + "learning_rate": 1.8585782891391345e-06, + "loss": 1.1872, + "step": 14140 + }, + { + "epoch": 0.9450160342063068, + "grad_norm": 0.1826171875, + "learning_rate": 1.8362682085946825e-06, + "loss": 1.2518, + "step": 14145 + }, + { + "epoch": 0.9453500801710315, + "grad_norm": 0.1787109375, + "learning_rate": 1.8140916003343778e-06, + "loss": 1.2239, + "step": 14150 + }, + { + "epoch": 0.9456841261357563, + "grad_norm": 0.169921875, + "learning_rate": 1.7920484945115557e-06, + "loss": 1.1243, + "step": 14155 + }, + { + "epoch": 0.946018172100481, + "grad_norm": 0.2060546875, + "learning_rate": 1.770138921098008e-06, + "loss": 1.1931, + "step": 14160 + }, + { + "epoch": 0.9463522180652058, + "grad_norm": 0.1884765625, + "learning_rate": 1.7483629098839605e-06, + "loss": 1.2633, + "step": 14165 + }, + { + "epoch": 0.9466862640299305, + "grad_norm": 0.1923828125, + "learning_rate": 1.7267204904780621e-06, + "loss": 1.2289, + "step": 14170 + }, + { + "epoch": 0.9470203099946553, + "grad_norm": 0.166015625, + "learning_rate": 1.7052116923072958e-06, + "loss": 1.2129, + "step": 14175 + }, + { + "epoch": 0.94735435595938, + "grad_norm": 0.1708984375, + "learning_rate": 1.683836544616979e-06, + "loss": 1.2649, + "step": 14180 + }, + { + "epoch": 0.9476884019241048, + "grad_norm": 0.162109375, + "learning_rate": 1.6625950764706743e-06, + "loss": 1.1021, + "step": 14185 + }, + { + "epoch": 0.9480224478888295, + "grad_norm": 0.1806640625, + "learning_rate": 1.641487316750212e-06, + "loss": 1.2487, + "step": 14190 + }, + { + "epoch": 0.9483564938535542, + "grad_norm": 0.17578125, + "learning_rate": 1.6205132941556122e-06, + "loss": 1.2798, + "step": 14195 + }, + { + "epoch": 0.948690539818279, + "grad_norm": 0.1630859375, + "learning_rate": 1.5996730372050627e-06, + "loss": 1.1935, + "step": 14200 + }, + { + "epoch": 0.9490245857830037, + "grad_norm": 0.1640625, + "learning_rate": 1.5789665742348415e-06, + "loss": 1.1511, + "step": 14205 + }, + { + "epoch": 0.9493586317477285, + "grad_norm": 0.177734375, + "learning_rate": 1.558393933399338e-06, + "loss": 1.2232, + "step": 14210 + }, + { + "epoch": 0.9496926777124532, + "grad_norm": 0.1689453125, + "learning_rate": 1.537955142670966e-06, + "loss": 1.2133, + "step": 14215 + }, + { + "epoch": 0.950026723677178, + "grad_norm": 0.173828125, + "learning_rate": 1.517650229840173e-06, + "loss": 1.1668, + "step": 14220 + }, + { + "epoch": 0.9503607696419027, + "grad_norm": 0.169921875, + "learning_rate": 1.4974792225153188e-06, + "loss": 1.2862, + "step": 14225 + }, + { + "epoch": 0.9506948156066275, + "grad_norm": 0.1708984375, + "learning_rate": 1.4774421481227762e-06, + "loss": 1.2089, + "step": 14230 + }, + { + "epoch": 0.9510288615713522, + "grad_norm": 0.162109375, + "learning_rate": 1.4575390339067296e-06, + "loss": 1.1126, + "step": 14235 + }, + { + "epoch": 0.951362907536077, + "grad_norm": 0.1796875, + "learning_rate": 1.4377699069292427e-06, + "loss": 1.1774, + "step": 14240 + }, + { + "epoch": 0.9516969535008017, + "grad_norm": 0.166015625, + "learning_rate": 1.418134794070236e-06, + "loss": 1.1765, + "step": 14245 + }, + { + "epoch": 0.9520309994655265, + "grad_norm": 0.173828125, + "learning_rate": 1.3986337220273759e-06, + "loss": 1.1659, + "step": 14250 + }, + { + "epoch": 0.9523650454302512, + "grad_norm": 0.173828125, + "learning_rate": 1.3792667173160855e-06, + "loss": 1.2359, + "step": 14255 + }, + { + "epoch": 0.952699091394976, + "grad_norm": 0.169921875, + "learning_rate": 1.3600338062694784e-06, + "loss": 1.1513, + "step": 14260 + }, + { + "epoch": 0.9530331373597007, + "grad_norm": 0.162109375, + "learning_rate": 1.3409350150383803e-06, + "loss": 1.1959, + "step": 14265 + }, + { + "epoch": 0.9533671833244255, + "grad_norm": 0.181640625, + "learning_rate": 1.32197036959123e-06, + "loss": 1.1792, + "step": 14270 + }, + { + "epoch": 0.9537012292891501, + "grad_norm": 0.171875, + "learning_rate": 1.3031398957140562e-06, + "loss": 1.2564, + "step": 14275 + }, + { + "epoch": 0.9540352752538749, + "grad_norm": 0.1767578125, + "learning_rate": 1.284443619010478e-06, + "loss": 1.2568, + "step": 14280 + }, + { + "epoch": 0.9543693212185996, + "grad_norm": 0.1826171875, + "learning_rate": 1.265881564901672e-06, + "loss": 1.2136, + "step": 14285 + }, + { + "epoch": 0.9547033671833244, + "grad_norm": 0.201171875, + "learning_rate": 1.2474537586262603e-06, + "loss": 1.2579, + "step": 14290 + }, + { + "epoch": 0.9550374131480491, + "grad_norm": 0.1708984375, + "learning_rate": 1.2291602252403444e-06, + "loss": 1.2214, + "step": 14295 + }, + { + "epoch": 0.9553714591127739, + "grad_norm": 0.189453125, + "learning_rate": 1.2110009896174944e-06, + "loss": 1.1595, + "step": 14300 + }, + { + "epoch": 0.9557055050774986, + "grad_norm": 0.169921875, + "learning_rate": 1.1929760764486264e-06, + "loss": 1.1858, + "step": 14305 + }, + { + "epoch": 0.9560395510422234, + "grad_norm": 0.162109375, + "learning_rate": 1.1750855102420578e-06, + "loss": 1.1783, + "step": 14310 + }, + { + "epoch": 0.9563735970069481, + "grad_norm": 0.1689453125, + "learning_rate": 1.1573293153233966e-06, + "loss": 1.257, + "step": 14315 + }, + { + "epoch": 0.9567076429716729, + "grad_norm": 0.16796875, + "learning_rate": 1.1397075158355975e-06, + "loss": 1.197, + "step": 14320 + }, + { + "epoch": 0.9570416889363976, + "grad_norm": 0.1708984375, + "learning_rate": 1.1222201357388496e-06, + "loss": 1.2113, + "step": 14325 + }, + { + "epoch": 0.9573757349011224, + "grad_norm": 0.16796875, + "learning_rate": 1.1048671988105553e-06, + "loss": 1.1578, + "step": 14330 + }, + { + "epoch": 0.9577097808658471, + "grad_norm": 0.181640625, + "learning_rate": 1.0876487286453408e-06, + "loss": 1.1104, + "step": 14335 + }, + { + "epoch": 0.9580438268305719, + "grad_norm": 0.1728515625, + "learning_rate": 1.0705647486550118e-06, + "loss": 1.23, + "step": 14340 + }, + { + "epoch": 0.9583778727952966, + "grad_norm": 0.181640625, + "learning_rate": 1.0536152820684874e-06, + "loss": 1.2529, + "step": 14345 + }, + { + "epoch": 0.9587119187600214, + "grad_norm": 0.16796875, + "learning_rate": 1.036800351931777e-06, + "loss": 1.2079, + "step": 14350 + }, + { + "epoch": 0.9590459647247461, + "grad_norm": 0.173828125, + "learning_rate": 1.0201199811080032e-06, + "loss": 1.2061, + "step": 14355 + }, + { + "epoch": 0.9593800106894709, + "grad_norm": 0.1630859375, + "learning_rate": 1.0035741922772902e-06, + "loss": 1.1448, + "step": 14360 + }, + { + "epoch": 0.9597140566541956, + "grad_norm": 0.173828125, + "learning_rate": 9.871630079367977e-07, + "loss": 1.2618, + "step": 14365 + }, + { + "epoch": 0.9600481026189204, + "grad_norm": 0.173828125, + "learning_rate": 9.708864504006433e-07, + "loss": 1.1832, + "step": 14370 + }, + { + "epoch": 0.9603821485836451, + "grad_norm": 0.1630859375, + "learning_rate": 9.547445417999123e-07, + "loss": 1.1876, + "step": 14375 + }, + { + "epoch": 0.9607161945483699, + "grad_norm": 0.16796875, + "learning_rate": 9.387373040826153e-07, + "loss": 1.1768, + "step": 14380 + }, + { + "epoch": 0.9610502405130946, + "grad_norm": 0.1650390625, + "learning_rate": 9.228647590136308e-07, + "loss": 1.1675, + "step": 14385 + }, + { + "epoch": 0.9613842864778194, + "grad_norm": 0.1689453125, + "learning_rate": 9.071269281746842e-07, + "loss": 1.1261, + "step": 14390 + }, + { + "epoch": 0.9617183324425441, + "grad_norm": 0.357421875, + "learning_rate": 8.915238329643805e-07, + "loss": 1.198, + "step": 14395 + }, + { + "epoch": 0.9620523784072689, + "grad_norm": 0.1728515625, + "learning_rate": 8.760554945981048e-07, + "loss": 1.2774, + "step": 14400 + }, + { + "epoch": 0.9623864243719936, + "grad_norm": 0.1748046875, + "learning_rate": 8.607219341079887e-07, + "loss": 1.2643, + "step": 14405 + }, + { + "epoch": 0.9627204703367184, + "grad_norm": 0.1708984375, + "learning_rate": 8.455231723429324e-07, + "loss": 1.1497, + "step": 14410 + }, + { + "epoch": 0.9630545163014431, + "grad_norm": 0.1708984375, + "learning_rate": 8.304592299685387e-07, + "loss": 1.2001, + "step": 14415 + }, + { + "epoch": 0.9633885622661679, + "grad_norm": 0.1611328125, + "learning_rate": 8.155301274671234e-07, + "loss": 1.2162, + "step": 14420 + }, + { + "epoch": 0.9637226082308926, + "grad_norm": 0.1650390625, + "learning_rate": 8.007358851376378e-07, + "loss": 1.2046, + "step": 14425 + }, + { + "epoch": 0.9640566541956174, + "grad_norm": 0.162109375, + "learning_rate": 7.860765230956579e-07, + "loss": 1.1698, + "step": 14430 + }, + { + "epoch": 0.9643907001603421, + "grad_norm": 0.1767578125, + "learning_rate": 7.71552061273395e-07, + "loss": 1.2303, + "step": 14435 + }, + { + "epoch": 0.9647247461250668, + "grad_norm": 0.1943359375, + "learning_rate": 7.571625194196074e-07, + "loss": 1.1499, + "step": 14440 + }, + { + "epoch": 0.9650587920897915, + "grad_norm": 0.17578125, + "learning_rate": 7.429079170996222e-07, + "loss": 1.1865, + "step": 14445 + }, + { + "epoch": 0.9653928380545163, + "grad_norm": 0.166015625, + "learning_rate": 7.287882736952912e-07, + "loss": 1.2239, + "step": 14450 + }, + { + "epoch": 0.965726884019241, + "grad_norm": 0.1708984375, + "learning_rate": 7.14803608404957e-07, + "loss": 1.134, + "step": 14455 + }, + { + "epoch": 0.9660609299839658, + "grad_norm": 0.189453125, + "learning_rate": 7.009539402434207e-07, + "loss": 1.2218, + "step": 14460 + }, + { + "epoch": 0.9663949759486905, + "grad_norm": 0.171875, + "learning_rate": 6.87239288041952e-07, + "loss": 1.2575, + "step": 14465 + }, + { + "epoch": 0.9667290219134153, + "grad_norm": 0.1748046875, + "learning_rate": 6.736596704482456e-07, + "loss": 1.1718, + "step": 14470 + }, + { + "epoch": 0.96706306787814, + "grad_norm": 0.1708984375, + "learning_rate": 6.60215105926365e-07, + "loss": 1.1994, + "step": 14475 + }, + { + "epoch": 0.9673971138428648, + "grad_norm": 0.16796875, + "learning_rate": 6.46905612756743e-07, + "loss": 1.1799, + "step": 14480 + }, + { + "epoch": 0.9677311598075895, + "grad_norm": 0.1650390625, + "learning_rate": 6.337312090361924e-07, + "loss": 1.2205, + "step": 14485 + }, + { + "epoch": 0.9680652057723143, + "grad_norm": 0.1689453125, + "learning_rate": 6.206919126777955e-07, + "loss": 1.2612, + "step": 14490 + }, + { + "epoch": 0.968399251737039, + "grad_norm": 0.1630859375, + "learning_rate": 6.077877414109923e-07, + "loss": 1.1688, + "step": 14495 + }, + { + "epoch": 0.9687332977017638, + "grad_norm": 0.1728515625, + "learning_rate": 5.950187127814477e-07, + "loss": 1.2077, + "step": 14500 + }, + { + "epoch": 0.9690673436664885, + "grad_norm": 0.1748046875, + "learning_rate": 5.823848441510737e-07, + "loss": 1.1738, + "step": 14505 + }, + { + "epoch": 0.9694013896312133, + "grad_norm": 0.173828125, + "learning_rate": 5.698861526980404e-07, + "loss": 1.1966, + "step": 14510 + }, + { + "epoch": 0.969735435595938, + "grad_norm": 0.1708984375, + "learning_rate": 5.575226554166757e-07, + "loss": 1.145, + "step": 14515 + }, + { + "epoch": 0.9700694815606627, + "grad_norm": 0.171875, + "learning_rate": 5.452943691175216e-07, + "loss": 1.1979, + "step": 14520 + }, + { + "epoch": 0.9704035275253875, + "grad_norm": 0.1689453125, + "learning_rate": 5.332013104272782e-07, + "loss": 1.1926, + "step": 14525 + }, + { + "epoch": 0.9707375734901122, + "grad_norm": 0.1796875, + "learning_rate": 5.212434957887369e-07, + "loss": 1.2438, + "step": 14530 + }, + { + "epoch": 0.971071619454837, + "grad_norm": 0.1728515625, + "learning_rate": 5.094209414608253e-07, + "loss": 1.2502, + "step": 14535 + }, + { + "epoch": 0.9714056654195617, + "grad_norm": 0.1767578125, + "learning_rate": 4.977336635185736e-07, + "loss": 1.1783, + "step": 14540 + }, + { + "epoch": 0.9717397113842865, + "grad_norm": 0.171875, + "learning_rate": 4.861816778530371e-07, + "loss": 1.2503, + "step": 14545 + }, + { + "epoch": 0.9720737573490112, + "grad_norm": 0.1787109375, + "learning_rate": 4.7476500017136215e-07, + "loss": 1.2635, + "step": 14550 + }, + { + "epoch": 0.972407803313736, + "grad_norm": 0.166015625, + "learning_rate": 4.6348364599668737e-07, + "loss": 1.1629, + "step": 14555 + }, + { + "epoch": 0.9727418492784607, + "grad_norm": 0.1669921875, + "learning_rate": 4.523376306681537e-07, + "loss": 1.1369, + "step": 14560 + }, + { + "epoch": 0.9730758952431855, + "grad_norm": 0.1748046875, + "learning_rate": 4.413269693409161e-07, + "loss": 1.2048, + "step": 14565 + }, + { + "epoch": 0.9734099412079102, + "grad_norm": 0.16796875, + "learning_rate": 4.3045167698603226e-07, + "loss": 1.2306, + "step": 14570 + }, + { + "epoch": 0.973743987172635, + "grad_norm": 0.1728515625, + "learning_rate": 4.197117683905627e-07, + "loss": 1.1437, + "step": 14575 + }, + { + "epoch": 0.9740780331373597, + "grad_norm": 0.166015625, + "learning_rate": 4.0910725815742626e-07, + "loss": 1.0792, + "step": 14580 + }, + { + "epoch": 0.9744120791020845, + "grad_norm": 0.1767578125, + "learning_rate": 3.986381607055112e-07, + "loss": 1.2339, + "step": 14585 + }, + { + "epoch": 0.9747461250668092, + "grad_norm": 0.171875, + "learning_rate": 3.883044902695199e-07, + "loss": 1.1773, + "step": 14590 + }, + { + "epoch": 0.975080171031534, + "grad_norm": 0.1767578125, + "learning_rate": 3.781062609000463e-07, + "loss": 1.1915, + "step": 14595 + }, + { + "epoch": 0.9754142169962586, + "grad_norm": 0.1767578125, + "learning_rate": 3.680434864635429e-07, + "loss": 1.1747, + "step": 14600 + }, + { + "epoch": 0.9757482629609834, + "grad_norm": 0.1689453125, + "learning_rate": 3.5811618064226505e-07, + "loss": 1.1539, + "step": 14605 + }, + { + "epoch": 0.9760823089257081, + "grad_norm": 0.166015625, + "learning_rate": 3.483243569342487e-07, + "loss": 1.2025, + "step": 14610 + }, + { + "epoch": 0.9764163548904329, + "grad_norm": 0.1748046875, + "learning_rate": 3.3866802865337723e-07, + "loss": 1.1391, + "step": 14615 + }, + { + "epoch": 0.9767504008551576, + "grad_norm": 0.1806640625, + "learning_rate": 3.29147208929248e-07, + "loss": 1.1749, + "step": 14620 + }, + { + "epoch": 0.9770844468198824, + "grad_norm": 0.1796875, + "learning_rate": 3.1976191070722803e-07, + "loss": 1.1628, + "step": 14625 + }, + { + "epoch": 0.9774184927846071, + "grad_norm": 0.166015625, + "learning_rate": 3.1051214674843174e-07, + "loss": 1.1674, + "step": 14630 + }, + { + "epoch": 0.9777525387493319, + "grad_norm": 0.1669921875, + "learning_rate": 3.013979296296543e-07, + "loss": 1.2079, + "step": 14635 + }, + { + "epoch": 0.9780865847140566, + "grad_norm": 0.177734375, + "learning_rate": 2.92419271743416e-07, + "loss": 1.2202, + "step": 14640 + }, + { + "epoch": 0.9784206306787814, + "grad_norm": 0.1748046875, + "learning_rate": 2.8357618529792905e-07, + "loss": 1.2067, + "step": 14645 + }, + { + "epoch": 0.9787546766435061, + "grad_norm": 0.1669921875, + "learning_rate": 2.74868682317031e-07, + "loss": 1.191, + "step": 14650 + }, + { + "epoch": 0.9790887226082309, + "grad_norm": 0.1708984375, + "learning_rate": 2.6629677464024006e-07, + "loss": 1.1576, + "step": 14655 + }, + { + "epoch": 0.9794227685729556, + "grad_norm": 0.1669921875, + "learning_rate": 2.5786047392268864e-07, + "loss": 1.1496, + "step": 14660 + }, + { + "epoch": 0.9797568145376804, + "grad_norm": 0.169921875, + "learning_rate": 2.4955979163514554e-07, + "loss": 1.2359, + "step": 14665 + }, + { + "epoch": 0.9800908605024051, + "grad_norm": 0.1787109375, + "learning_rate": 2.413947390639715e-07, + "loss": 1.2322, + "step": 14670 + }, + { + "epoch": 0.9804249064671299, + "grad_norm": 0.171875, + "learning_rate": 2.3336532731108584e-07, + "loss": 1.2106, + "step": 14675 + }, + { + "epoch": 0.9807589524318546, + "grad_norm": 0.18359375, + "learning_rate": 2.2547156729403329e-07, + "loss": 1.1849, + "step": 14680 + }, + { + "epoch": 0.9810929983965794, + "grad_norm": 0.18359375, + "learning_rate": 2.177134697458616e-07, + "loss": 1.225, + "step": 14685 + }, + { + "epoch": 0.9814270443613041, + "grad_norm": 0.1748046875, + "learning_rate": 2.100910452151883e-07, + "loss": 1.1283, + "step": 14690 + }, + { + "epoch": 0.9817610903260289, + "grad_norm": 0.1767578125, + "learning_rate": 2.0260430406615626e-07, + "loss": 1.1439, + "step": 14695 + }, + { + "epoch": 0.9820951362907536, + "grad_norm": 0.1689453125, + "learning_rate": 1.9525325647841152e-07, + "loss": 1.1861, + "step": 14700 + }, + { + "epoch": 0.9824291822554784, + "grad_norm": 0.1748046875, + "learning_rate": 1.8803791244710322e-07, + "loss": 1.2404, + "step": 14705 + }, + { + "epoch": 0.9827632282202031, + "grad_norm": 0.1669921875, + "learning_rate": 1.8095828178286145e-07, + "loss": 1.2448, + "step": 14710 + }, + { + "epoch": 0.9830972741849279, + "grad_norm": 0.1611328125, + "learning_rate": 1.7401437411179722e-07, + "loss": 1.2503, + "step": 14715 + }, + { + "epoch": 0.9834313201496526, + "grad_norm": 0.173828125, + "learning_rate": 1.6720619887548028e-07, + "loss": 1.2129, + "step": 14720 + }, + { + "epoch": 0.9837653661143774, + "grad_norm": 0.169921875, + "learning_rate": 1.605337653309058e-07, + "loss": 1.1759, + "step": 14725 + }, + { + "epoch": 0.9840994120791021, + "grad_norm": 0.1728515625, + "learning_rate": 1.539970825505277e-07, + "loss": 1.1244, + "step": 14730 + }, + { + "epoch": 0.9844334580438269, + "grad_norm": 0.1669921875, + "learning_rate": 1.4759615942220306e-07, + "loss": 1.1535, + "step": 14735 + }, + { + "epoch": 0.9847675040085516, + "grad_norm": 0.1845703125, + "learning_rate": 1.4133100464922555e-07, + "loss": 1.1768, + "step": 14740 + }, + { + "epoch": 0.9851015499732764, + "grad_norm": 0.1826171875, + "learning_rate": 1.352016267502365e-07, + "loss": 1.2574, + "step": 14745 + }, + { + "epoch": 0.9854355959380011, + "grad_norm": 0.1962890625, + "learning_rate": 1.292080340593249e-07, + "loss": 1.1745, + "step": 14750 + }, + { + "epoch": 0.9857696419027259, + "grad_norm": 0.166015625, + "learning_rate": 1.2335023472589414e-07, + "loss": 1.2315, + "step": 14755 + }, + { + "epoch": 0.9861036878674505, + "grad_norm": 0.1787109375, + "learning_rate": 1.1762823671475076e-07, + "loss": 1.2231, + "step": 14760 + }, + { + "epoch": 0.9864377338321753, + "grad_norm": 0.162109375, + "learning_rate": 1.12042047806038e-07, + "loss": 1.2253, + "step": 14765 + }, + { + "epoch": 0.9867717797969, + "grad_norm": 0.1767578125, + "learning_rate": 1.0659167559523564e-07, + "loss": 1.1431, + "step": 14770 + }, + { + "epoch": 0.9871058257616248, + "grad_norm": 0.1767578125, + "learning_rate": 1.012771274931823e-07, + "loss": 1.1837, + "step": 14775 + }, + { + "epoch": 0.9874398717263495, + "grad_norm": 0.19921875, + "learning_rate": 9.609841072599768e-08, + "loss": 1.2645, + "step": 14780 + }, + { + "epoch": 0.9877739176910743, + "grad_norm": 0.1669921875, + "learning_rate": 9.105553233513809e-08, + "loss": 1.2164, + "step": 14785 + }, + { + "epoch": 0.988107963655799, + "grad_norm": 0.169921875, + "learning_rate": 8.614849917737423e-08, + "loss": 1.2218, + "step": 14790 + }, + { + "epoch": 0.9884420096205238, + "grad_norm": 0.16796875, + "learning_rate": 8.137731792472458e-08, + "loss": 1.1711, + "step": 14795 + }, + { + "epoch": 0.9887760555852485, + "grad_norm": 0.1650390625, + "learning_rate": 7.674199506453317e-08, + "loss": 1.2162, + "step": 14800 + }, + { + "epoch": 0.9891101015499733, + "grad_norm": 0.1669921875, + "learning_rate": 7.224253689940286e-08, + "loss": 1.2323, + "step": 14805 + }, + { + "epoch": 0.989444147514698, + "grad_norm": 0.1708984375, + "learning_rate": 6.787894954720653e-08, + "loss": 1.1196, + "step": 14810 + }, + { + "epoch": 0.9897781934794228, + "grad_norm": 0.158203125, + "learning_rate": 6.365123894107594e-08, + "loss": 1.2014, + "step": 14815 + }, + { + "epoch": 0.9901122394441475, + "grad_norm": 0.1748046875, + "learning_rate": 5.955941082936845e-08, + "loss": 1.1848, + "step": 14820 + }, + { + "epoch": 0.9904462854088723, + "grad_norm": 0.1689453125, + "learning_rate": 5.560347077572248e-08, + "loss": 1.1779, + "step": 14825 + }, + { + "epoch": 0.990780331373597, + "grad_norm": 0.162109375, + "learning_rate": 5.1783424158990954e-08, + "loss": 1.208, + "step": 14830 + }, + { + "epoch": 0.9911143773383218, + "grad_norm": 0.17578125, + "learning_rate": 4.809927617324128e-08, + "loss": 1.2122, + "step": 14835 + }, + { + "epoch": 0.9914484233030465, + "grad_norm": 0.17578125, + "learning_rate": 4.4551031827788636e-08, + "loss": 1.1781, + "step": 14840 + }, + { + "epoch": 0.9917824692677712, + "grad_norm": 0.16796875, + "learning_rate": 4.113869594712938e-08, + "loss": 1.1899, + "step": 14845 + }, + { + "epoch": 0.992116515232496, + "grad_norm": 0.1689453125, + "learning_rate": 3.7862273171007657e-08, + "loss": 1.1715, + "step": 14850 + }, + { + "epoch": 0.9924505611972207, + "grad_norm": 0.1650390625, + "learning_rate": 3.472176795432658e-08, + "loss": 1.1051, + "step": 14855 + }, + { + "epoch": 0.9927846071619455, + "grad_norm": 0.1728515625, + "learning_rate": 3.171718456720374e-08, + "loss": 1.1639, + "step": 14860 + }, + { + "epoch": 0.9931186531266702, + "grad_norm": 0.1650390625, + "learning_rate": 2.8848527094949006e-08, + "loss": 1.1834, + "step": 14865 + }, + { + "epoch": 0.993452699091395, + "grad_norm": 0.1728515625, + "learning_rate": 2.611579943804232e-08, + "loss": 1.274, + "step": 14870 + }, + { + "epoch": 0.9937867450561197, + "grad_norm": 0.1591796875, + "learning_rate": 2.3519005312144792e-08, + "loss": 1.1908, + "step": 14875 + }, + { + "epoch": 0.9941207910208445, + "grad_norm": 0.193359375, + "learning_rate": 2.105814824810981e-08, + "loss": 1.2198, + "step": 14880 + }, + { + "epoch": 0.9944548369855692, + "grad_norm": 0.2001953125, + "learning_rate": 1.873323159191642e-08, + "loss": 1.1147, + "step": 14885 + }, + { + "epoch": 0.994788882950294, + "grad_norm": 0.169921875, + "learning_rate": 1.6544258504747058e-08, + "loss": 1.1683, + "step": 14890 + }, + { + "epoch": 0.9951229289150187, + "grad_norm": 0.1708984375, + "learning_rate": 1.4491231962920904e-08, + "loss": 1.2365, + "step": 14895 + }, + { + "epoch": 0.9954569748797435, + "grad_norm": 0.1796875, + "learning_rate": 1.2574154757916123e-08, + "loss": 1.1823, + "step": 14900 + }, + { + "epoch": 0.9957910208444682, + "grad_norm": 0.1689453125, + "learning_rate": 1.0793029496369844e-08, + "loss": 1.1889, + "step": 14905 + }, + { + "epoch": 0.996125066809193, + "grad_norm": 0.1728515625, + "learning_rate": 9.147858600067061e-09, + "loss": 1.2029, + "step": 14910 + }, + { + "epoch": 0.9964591127739177, + "grad_norm": 0.166015625, + "learning_rate": 7.63864430591843e-09, + "loss": 1.199, + "step": 14915 + }, + { + "epoch": 0.9967931587386424, + "grad_norm": 0.17578125, + "learning_rate": 6.265388666004679e-09, + "loss": 1.1928, + "step": 14920 + }, + { + "epoch": 0.9971272047033671, + "grad_norm": 0.1787109375, + "learning_rate": 5.028093547498891e-09, + "loss": 1.2173, + "step": 14925 + }, + { + "epoch": 0.9974612506680919, + "grad_norm": 0.166015625, + "learning_rate": 3.926760632777526e-09, + "loss": 1.1781, + "step": 14930 + }, + { + "epoch": 0.9977952966328166, + "grad_norm": 0.17578125, + "learning_rate": 2.9613914192760937e-09, + "loss": 1.1657, + "step": 14935 + }, + { + "epoch": 0.9981293425975414, + "grad_norm": 0.1953125, + "learning_rate": 2.131987219622378e-09, + "loss": 1.1723, + "step": 14940 + }, + { + "epoch": 0.9984633885622661, + "grad_norm": 0.1611328125, + "learning_rate": 1.4385491615365177e-09, + "loss": 1.1883, + "step": 14945 + }, + { + "epoch": 0.9987974345269909, + "grad_norm": 0.1708984375, + "learning_rate": 8.810781878865193e-10, + "loss": 1.23, + "step": 14950 + }, + { + "epoch": 0.9991314804917156, + "grad_norm": 0.16796875, + "learning_rate": 4.595750566660506e-10, + "loss": 1.1041, + "step": 14955 + }, + { + "epoch": 0.9994655264564404, + "grad_norm": 0.2021484375, + "learning_rate": 1.7404034097223688e-10, + "loss": 1.2243, + "step": 14960 + }, + { + "epoch": 0.9997995724211651, + "grad_norm": 0.1748046875, + "learning_rate": 2.4474429050069803e-11, + "loss": 1.2883, + "step": 14965 + }, + { + "epoch": 1.0, + "step": 14968, + "total_flos": 1.1104476526052114e+19, + "train_loss": 0.0, + "train_runtime": 10.0721, + "train_samples_per_second": 11888.555, + "train_steps_per_second": 1486.082 + } + ], + "logging_steps": 5, + "max_steps": 14968, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1104476526052114e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}