diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,21721 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 49.441786283891545,
+  "eval_steps": 500,
+  "global_step": 31000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02,
+      "grad_norm": 3.6600120067596436,
+      "learning_rate": 0.0019993620414673046,
+      "loss": 2.7717,
+      "step": 10
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 1.797737717628479,
+      "learning_rate": 0.0019987240829346096,
+      "loss": 2.2538,
+      "step": 20
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.503915548324585,
+      "learning_rate": 0.001998086124401914,
+      "loss": 2.5162,
+      "step": 30
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 2.3732383251190186,
+      "learning_rate": 0.0019974481658692187,
+      "loss": 2.7087,
+      "step": 40
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 4.7273030281066895,
+      "learning_rate": 0.0019968102073365233,
+      "loss": 2.3868,
+      "step": 50
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 4.214552402496338,
+      "learning_rate": 0.001996172248803828,
+      "loss": 2.4794,
+      "step": 60
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 5.231570243835449,
+      "learning_rate": 0.0019955342902711324,
+      "loss": 2.5851,
+      "step": 70
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.2388306856155396,
+      "learning_rate": 0.001994896331738437,
+      "loss": 2.3979,
+      "step": 80
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 5.343308448791504,
+      "learning_rate": 0.001994258373205742,
+      "loss": 2.5801,
+      "step": 90
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 5.010150909423828,
+      "learning_rate": 0.0019936204146730465,
+      "loss": 2.782,
+      "step": 100
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 4.422647953033447,
+      "learning_rate": 0.001992982456140351,
+      "loss": 2.5693,
+      "step": 110
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.5217037200927734,
+      "learning_rate": 0.0019923444976076557,
+      "loss": 2.6707,
+      "step": 120
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 4.945374011993408,
+      "learning_rate": 0.0019917065390749602,
+      "loss": 2.7236,
+      "step": 130
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.9850760698318481,
+      "learning_rate": 0.001991068580542265,
+      "loss": 2.5799,
+      "step": 140
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 5.4692206382751465,
+      "learning_rate": 0.0019904306220095693,
+      "loss": 2.6595,
+      "step": 150
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 2.3442928791046143,
+      "learning_rate": 0.0019897926634768743,
+      "loss": 2.8178,
+      "step": 160
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.8789024353027344,
+      "learning_rate": 0.001989154704944179,
+      "loss": 2.5928,
+      "step": 170
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 4.694440841674805,
+      "learning_rate": 0.0019885167464114835,
+      "loss": 2.7751,
+      "step": 180
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 6.304182529449463,
+      "learning_rate": 0.001987878787878788,
+      "loss": 2.5837,
+      "step": 190
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 5.747090816497803,
+      "learning_rate": 0.0019872408293460926,
+      "loss": 2.7569,
+      "step": 200
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 5.950802803039551,
+      "learning_rate": 0.001986602870813397,
+      "loss": 2.5465,
+      "step": 210
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 3.991403102874756,
+      "learning_rate": 0.0019859649122807017,
+      "loss": 2.3429,
+      "step": 220
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 15.909507751464844,
+      "learning_rate": 0.0019853269537480063,
+      "loss": 2.6886,
+      "step": 230
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 3.464792490005493,
+      "learning_rate": 0.0019846889952153113,
+      "loss": 2.6745,
+      "step": 240
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 2.647952079772949,
+      "learning_rate": 0.001984051036682616,
+      "loss": 2.666,
+      "step": 250
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 5.060638427734375,
+      "learning_rate": 0.0019834130781499204,
+      "loss": 2.502,
+      "step": 260
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.046036720275879,
+      "learning_rate": 0.001982775119617225,
+      "loss": 2.6536,
+      "step": 270
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 3.072054147720337,
+      "learning_rate": 0.0019821371610845295,
+      "loss": 2.7964,
+      "step": 280
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 4.118267059326172,
+      "learning_rate": 0.001981499202551834,
+      "loss": 2.7141,
+      "step": 290
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 6.113500118255615,
+      "learning_rate": 0.0019808612440191387,
+      "loss": 2.6802,
+      "step": 300
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 4.462850570678711,
+      "learning_rate": 0.0019802232854864437,
+      "loss": 2.611,
+      "step": 310
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 4.859583854675293,
+      "learning_rate": 0.001979585326953748,
+      "loss": 2.954,
+      "step": 320
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 2.9786384105682373,
+      "learning_rate": 0.0019789473684210528,
+      "loss": 2.7084,
+      "step": 330
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 2.122506618499756,
+      "learning_rate": 0.0019783094098883573,
+      "loss": 2.5917,
+      "step": 340
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.5119547843933105,
+      "learning_rate": 0.001977671451355662,
+      "loss": 2.7874,
+      "step": 350
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 5.032376289367676,
+      "learning_rate": 0.0019770334928229665,
+      "loss": 2.7288,
+      "step": 360
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 6.186531066894531,
+      "learning_rate": 0.001976395534290271,
+      "loss": 2.7858,
+      "step": 370
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.5598961114883423,
+      "learning_rate": 0.001975757575757576,
+      "loss": 2.8022,
+      "step": 380
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 3.7895145416259766,
+      "learning_rate": 0.0019751196172248806,
+      "loss": 2.9117,
+      "step": 390
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 9.171380043029785,
+      "learning_rate": 0.001974481658692185,
+      "loss": 2.8098,
+      "step": 400
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 3.090906858444214,
+      "learning_rate": 0.0019738437001594897,
+      "loss": 2.7351,
+      "step": 410
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 8.468884468078613,
+      "learning_rate": 0.0019732057416267943,
+      "loss": 2.868,
+      "step": 420
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 3.8941705226898193,
+      "learning_rate": 0.001972567783094099,
+      "loss": 2.9424,
+      "step": 430
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 2.694938898086548,
+      "learning_rate": 0.0019719298245614034,
+      "loss": 2.7966,
+      "step": 440
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 3.2257883548736572,
+      "learning_rate": 0.0019712918660287084,
+      "loss": 2.8685,
+      "step": 450
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 2.8044159412384033,
+      "learning_rate": 0.001970653907496013,
+      "loss": 2.9069,
+      "step": 460
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 3.731559991836548,
+      "learning_rate": 0.0019700159489633175,
+      "loss": 2.9604,
+      "step": 470
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 3.8493754863739014,
+      "learning_rate": 0.001969377990430622,
+      "loss": 3.1143,
+      "step": 480
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 2.380948781967163,
+      "learning_rate": 0.0019687400318979266,
+      "loss": 2.8833,
+      "step": 490
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 2.755617141723633,
+      "learning_rate": 0.001968102073365231,
+      "loss": 2.8669,
+      "step": 500
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 3.0759685039520264,
+      "learning_rate": 0.0019674641148325358,
+      "loss": 2.659,
+      "step": 510
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 2.367964744567871,
+      "learning_rate": 0.0019668261562998408,
+      "loss": 3.0106,
+      "step": 520
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 4.058963775634766,
+      "learning_rate": 0.0019661881977671453,
+      "loss": 2.8824,
+      "step": 530
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.9852606058120728,
+      "learning_rate": 0.00196555023923445,
+      "loss": 2.8171,
+      "step": 540
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 3.7825193405151367,
+      "learning_rate": 0.0019649122807017545,
+      "loss": 2.9357,
+      "step": 550
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 4.315491676330566,
+      "learning_rate": 0.001964274322169059,
+      "loss": 2.8921,
+      "step": 560
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 2.1023871898651123,
+      "learning_rate": 0.0019636363636363636,
+      "loss": 2.8568,
+      "step": 570
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 2.552720785140991,
+      "learning_rate": 0.001962998405103668,
+      "loss": 2.9858,
+      "step": 580
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 9.927117347717285,
+      "learning_rate": 0.001962360446570973,
+      "loss": 2.8938,
+      "step": 590
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 5.051787853240967,
+      "learning_rate": 0.0019617224880382777,
+      "loss": 2.7605,
+      "step": 600
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 3.3584322929382324,
+      "learning_rate": 0.0019610845295055823,
+      "loss": 2.7514,
+      "step": 610
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.7937440872192383,
+      "learning_rate": 0.001960446570972887,
+      "loss": 2.9748,
+      "step": 620
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 5.3826003074646,
+      "learning_rate": 0.0019598086124401914,
+      "loss": 2.7608,
+      "step": 630
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 4.8360724449157715,
+      "learning_rate": 0.001959170653907496,
+      "loss": 2.2607,
+      "step": 640
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 2.8146324157714844,
+      "learning_rate": 0.0019585326953748005,
+      "loss": 2.2973,
+      "step": 650
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 2.535956859588623,
+      "learning_rate": 0.0019578947368421055,
+      "loss": 2.2598,
+      "step": 660
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 4.664738655090332,
+      "learning_rate": 0.00195725677830941,
+      "loss": 2.3387,
+      "step": 670
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 3.6893537044525146,
+      "learning_rate": 0.0019566188197767146,
+      "loss": 2.249,
+      "step": 680
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 4.679712295532227,
+      "learning_rate": 0.001955980861244019,
+      "loss": 2.3068,
+      "step": 690
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 2.374504327774048,
+      "learning_rate": 0.0019553429027113238,
+      "loss": 2.5318,
+      "step": 700
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 3.3196609020233154,
+      "learning_rate": 0.0019547049441786283,
+      "loss": 2.4066,
+      "step": 710
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 5.714052200317383,
+      "learning_rate": 0.001954066985645933,
+      "loss": 2.4376,
+      "step": 720
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 6.177249431610107,
+      "learning_rate": 0.001953429027113238,
+      "loss": 2.2004,
+      "step": 730
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 2.936424970626831,
+      "learning_rate": 0.0019527910685805422,
+      "loss": 2.3424,
+      "step": 740
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 3.634345769882202,
+      "learning_rate": 0.0019521531100478468,
+      "loss": 2.5118,
+      "step": 750
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.8677217960357666,
+      "learning_rate": 0.0019515151515151514,
+      "loss": 2.3253,
+      "step": 760
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 2.1149327754974365,
+      "learning_rate": 0.0019508771929824564,
+      "loss": 2.3353,
+      "step": 770
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 4.144554138183594,
+      "learning_rate": 0.001950239234449761,
+      "loss": 2.6115,
+      "step": 780
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 4.8128814697265625,
+      "learning_rate": 0.0019496012759170655,
+      "loss": 2.5473,
+      "step": 790
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 2.2847745418548584,
+      "learning_rate": 0.0019489633173843703,
+      "loss": 2.4678,
+      "step": 800
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 4.829673767089844,
+      "learning_rate": 0.0019483253588516748,
+      "loss": 2.638,
+      "step": 810
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 2.4837145805358887,
+      "learning_rate": 0.0019476874003189794,
+      "loss": 2.4926,
+      "step": 820
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 2.7193591594696045,
+      "learning_rate": 0.001947049441786284,
+      "loss": 2.4601,
+      "step": 830
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 3.0565848350524902,
+      "learning_rate": 0.0019464114832535887,
+      "loss": 2.3951,
+      "step": 840
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 3.4434733390808105,
+      "learning_rate": 0.0019457735247208933,
+      "loss": 2.4247,
+      "step": 850
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 2.9921035766601562,
+      "learning_rate": 0.0019451355661881979,
+      "loss": 2.223,
+      "step": 860
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 4.521476745605469,
+      "learning_rate": 0.0019444976076555026,
+      "loss": 2.371,
+      "step": 870
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.987562656402588,
+      "learning_rate": 0.0019438596491228072,
+      "loss": 2.3996,
+      "step": 880
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 2.5876095294952393,
+      "learning_rate": 0.0019432216905901118,
+      "loss": 2.6735,
+      "step": 890
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 3.727102279663086,
+      "learning_rate": 0.0019425837320574163,
+      "loss": 2.3431,
+      "step": 900
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 2.775712728500366,
+      "learning_rate": 0.001941945773524721,
+      "loss": 2.5217,
+      "step": 910
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 4.316661357879639,
+      "learning_rate": 0.0019413078149920257,
+      "loss": 2.5654,
+      "step": 920
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 5.313731670379639,
+      "learning_rate": 0.0019406698564593302,
+      "loss": 2.5676,
+      "step": 930
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 3.3875491619110107,
+      "learning_rate": 0.0019400318979266348,
+      "loss": 2.701,
+      "step": 940
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 5.12388801574707,
+      "learning_rate": 0.0019393939393939396,
+      "loss": 2.6469,
+      "step": 950
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 5.213893890380859,
+      "learning_rate": 0.0019387559808612441,
+      "loss": 2.5527,
+      "step": 960
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 2.5714313983917236,
+      "learning_rate": 0.0019381180223285487,
+      "loss": 2.5112,
+      "step": 970
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 3.034376859664917,
+      "learning_rate": 0.0019374800637958535,
+      "loss": 2.7126,
+      "step": 980
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 4.801724910736084,
+      "learning_rate": 0.001936842105263158,
+      "loss": 2.6247,
+      "step": 990
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 2.8160829544067383,
+      "learning_rate": 0.0019362041467304626,
+      "loss": 2.5434,
+      "step": 1000
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 5.186509132385254,
+      "learning_rate": 0.0019355661881977672,
+      "loss": 2.6724,
+      "step": 1010
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 4.558096408843994,
+      "learning_rate": 0.001934928229665072,
+      "loss": 2.6487,
+      "step": 1020
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 4.698276519775391,
+      "learning_rate": 0.0019342902711323765,
+      "loss": 2.5392,
+      "step": 1030
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 3.624025821685791,
+      "learning_rate": 0.001933652312599681,
+      "loss": 2.6413,
+      "step": 1040
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 2.634162664413452,
+      "learning_rate": 0.0019330143540669858,
+      "loss": 2.7082,
+      "step": 1050
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 2.483462333679199,
+      "learning_rate": 0.0019323763955342904,
+      "loss": 2.716,
+      "step": 1060
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 3.256911277770996,
+      "learning_rate": 0.001931738437001595,
+      "loss": 2.7508,
+      "step": 1070
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 2.233299970626831,
+      "learning_rate": 0.0019311004784688995,
+      "loss": 2.7591,
+      "step": 1080
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 3.583534002304077,
+      "learning_rate": 0.0019304625199362043,
+      "loss": 2.5285,
+      "step": 1090
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 2.7138407230377197,
+      "learning_rate": 0.0019298245614035089,
+      "loss": 2.4167,
+      "step": 1100
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 4.423559665679932,
+      "learning_rate": 0.0019291866028708134,
+      "loss": 2.6601,
+      "step": 1110
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 4.424483776092529,
+      "learning_rate": 0.0019285486443381182,
+      "loss": 2.6824,
+      "step": 1120
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.5624890327453613,
+      "learning_rate": 0.0019279106858054228,
+      "loss": 2.7301,
+      "step": 1130
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 3.3539106845855713,
+      "learning_rate": 0.0019272727272727273,
+      "loss": 2.6942,
+      "step": 1140
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 3.6363375186920166,
+      "learning_rate": 0.001926634768740032,
+      "loss": 2.6089,
+      "step": 1150
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 4.147191047668457,
+      "learning_rate": 0.0019259968102073367,
+      "loss": 2.7383,
+      "step": 1160
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 2.932065486907959,
+      "learning_rate": 0.0019253588516746412,
+      "loss": 2.6189,
+      "step": 1170
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 4.373189449310303,
+      "learning_rate": 0.0019247208931419458,
+      "loss": 2.8137,
+      "step": 1180
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 3.7591166496276855,
+      "learning_rate": 0.0019240829346092506,
+      "loss": 2.7852,
+      "step": 1190
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 4.965326309204102,
+      "learning_rate": 0.0019234449760765552,
+      "loss": 2.6627,
+      "step": 1200
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 3.0201761722564697,
+      "learning_rate": 0.0019228070175438597,
+      "loss": 2.8458,
+      "step": 1210
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 4.8068695068359375,
+      "learning_rate": 0.0019221690590111643,
+      "loss": 2.6854,
+      "step": 1220
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 2.76481032371521,
+      "learning_rate": 0.001921531100478469,
+      "loss": 2.721,
+      "step": 1230
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 4.103845596313477,
+      "learning_rate": 0.0019208931419457736,
+      "loss": 2.7458,
+      "step": 1240
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 2.848653793334961,
+      "learning_rate": 0.0019202551834130782,
+      "loss": 2.8408,
+      "step": 1250
+    },
+    {
+      "epoch": 2.01,
+      "grad_norm": 2.1055376529693604,
+      "learning_rate": 0.0019196172248803827,
+      "loss": 2.2232,
+      "step": 1260
+    },
+    {
+      "epoch": 2.03,
+      "grad_norm": 2.4950308799743652,
+      "learning_rate": 0.0019189792663476875,
+      "loss": 1.9928,
+      "step": 1270
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 4.026719093322754,
+      "learning_rate": 0.001918341307814992,
+      "loss": 1.8887,
+      "step": 1280
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 2.439951181411743,
+      "learning_rate": 0.0019177033492822966,
+      "loss": 1.7856,
+      "step": 1290
+    },
+    {
+      "epoch": 2.07,
+      "grad_norm": 3.8327765464782715,
+      "learning_rate": 0.0019170653907496014,
+      "loss": 1.9734,
+      "step": 1300
+    },
+    {
+      "epoch": 2.09,
+      "grad_norm": 4.497558116912842,
+      "learning_rate": 0.001916427432216906,
+      "loss": 1.9576,
+      "step": 1310
+    },
+    {
+      "epoch": 2.11,
+      "grad_norm": 4.017326831817627,
+      "learning_rate": 0.0019157894736842106,
+      "loss": 2.0879,
+      "step": 1320
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 5.959986209869385,
+      "learning_rate": 0.0019151515151515151,
+      "loss": 2.0042,
+      "step": 1330
+    },
+    {
+      "epoch": 2.14,
+      "grad_norm": 2.2927639484405518,
+      "learning_rate": 0.00191451355661882,
+      "loss": 1.8132,
+      "step": 1340
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 6.273167133331299,
+      "learning_rate": 0.0019138755980861245,
+      "loss": 2.0617,
+      "step": 1350
+    },
+    {
+      "epoch": 2.17,
+      "grad_norm": 2.9032981395721436,
+      "learning_rate": 0.001913237639553429,
+      "loss": 2.0173,
+      "step": 1360
+    },
+    {
+      "epoch": 2.19,
+      "grad_norm": 5.651817798614502,
+      "learning_rate": 0.0019125996810207338,
+      "loss": 2.0464,
+      "step": 1370
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 4.344000339508057,
+      "learning_rate": 0.0019119617224880384,
+      "loss": 1.9377,
+      "step": 1380
+    },
+    {
+      "epoch": 2.22,
+      "grad_norm": 3.4183313846588135,
+      "learning_rate": 0.001911323763955343,
+      "loss": 2.1948,
+      "step": 1390
+    },
+    {
+      "epoch": 2.23,
+      "grad_norm": 8.772147178649902,
+      "learning_rate": 0.0019106858054226475,
+      "loss": 2.0348,
+      "step": 1400
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 2.020637273788452,
+      "learning_rate": 0.0019100478468899523,
+      "loss": 2.0723,
+      "step": 1410
+    },
+    {
+      "epoch": 2.26,
+      "grad_norm": 4.4110565185546875,
+      "learning_rate": 0.0019094098883572568,
+      "loss": 1.9483,
+      "step": 1420
+    },
+    {
+      "epoch": 2.28,
+      "grad_norm": 4.694215774536133,
+      "learning_rate": 0.0019087719298245614,
+      "loss": 2.0805,
+      "step": 1430
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 4.042151927947998,
+      "learning_rate": 0.0019081339712918662,
+      "loss": 2.0796,
+      "step": 1440
+    },
+    {
+      "epoch": 2.31,
+      "grad_norm": 3.466386318206787,
+      "learning_rate": 0.0019074960127591707,
+      "loss": 2.049,
+      "step": 1450
+    },
+    {
+      "epoch": 2.33,
+      "grad_norm": 5.676107406616211,
+      "learning_rate": 0.0019068580542264753,
+      "loss": 2.183,
+      "step": 1460
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 2.662849187850952,
+      "learning_rate": 0.0019062200956937799,
+      "loss": 2.0795,
+      "step": 1470
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 2.9790804386138916,
+      "learning_rate": 0.0019055821371610846,
+      "loss": 2.0264,
+      "step": 1480
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 5.430638790130615,
+      "learning_rate": 0.0019049441786283892,
+      "loss": 2.0712,
+      "step": 1490
+    },
+    {
+      "epoch": 2.39,
+      "grad_norm": 3.5230486392974854,
+      "learning_rate": 0.0019043062200956938,
+      "loss": 2.1533,
+      "step": 1500
+    },
+    {
+      "epoch": 2.41,
+      "grad_norm": 3.2345664501190186,
+      "learning_rate": 0.0019036682615629985,
+      "loss": 2.325,
+      "step": 1510
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 2.0495986938476562,
+      "learning_rate": 0.001903030303030303,
+      "loss": 2.225,
+      "step": 1520
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 4.178987979888916,
+      "learning_rate": 0.0019023923444976077,
+      "loss": 2.3576,
+      "step": 1530
+    },
+    {
+      "epoch": 2.46,
+      "grad_norm": 4.338198184967041,
+      "learning_rate": 0.0019017543859649122,
+      "loss": 2.1247,
+      "step": 1540
+    },
+    {
+      "epoch": 2.47,
+      "grad_norm": 2.644819736480713,
+      "learning_rate": 0.001901116427432217,
+      "loss": 2.3011,
+      "step": 1550
+    },
+    {
+      "epoch": 2.49,
+      "grad_norm": 2.83943772315979,
+      "learning_rate": 0.0019004784688995216,
+      "loss": 2.3449,
+      "step": 1560
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 5.573853492736816,
+      "learning_rate": 0.0018998405103668261,
+      "loss": 2.1616,
+      "step": 1570
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 5.958674430847168,
+      "learning_rate": 0.001899202551834131,
+      "loss": 2.2724,
+      "step": 1580
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 4.136911392211914,
+      "learning_rate": 0.0018985645933014355,
+      "loss": 2.1089,
+      "step": 1590
+    },
+    {
+      "epoch": 2.55,
+      "grad_norm": 2.5898241996765137,
+      "learning_rate": 0.00189792663476874,
+      "loss": 2.3466,
+      "step": 1600
+    },
+    {
+      "epoch": 2.57,
+      "grad_norm": 3.101346015930176,
+      "learning_rate": 0.0018972886762360446,
+      "loss": 2.2233,
+      "step": 1610
+    },
+    {
+      "epoch": 2.58,
+      "grad_norm": 3.3297476768493652,
+      "learning_rate": 0.0018966507177033494,
+      "loss": 2.1519,
+      "step": 1620
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 5.8525848388671875,
+      "learning_rate": 0.001896012759170654,
+      "loss": 2.4774,
+      "step": 1630
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 5.049089431762695,
+      "learning_rate": 0.0018953748006379585,
+      "loss": 2.2551,
+      "step": 1640
+    },
+    {
+      "epoch": 2.63,
+      "grad_norm": 3.721668004989624,
+      "learning_rate": 0.001894736842105263,
+      "loss": 2.1791,
+      "step": 1650
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 3.234546661376953,
+      "learning_rate": 0.0018940988835725679,
+      "loss": 2.3578,
+      "step": 1660
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 3.65110445022583,
+      "learning_rate": 0.0018934609250398724,
+      "loss": 2.4003,
+      "step": 1670
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 4.681038856506348,
+      "learning_rate": 0.001892822966507177,
+      "loss": 2.3949,
+      "step": 1680
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 4.4321136474609375,
+      "learning_rate": 0.0018921850079744818,
+      "loss": 2.3568,
+      "step": 1690
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 3.075857162475586,
+      "learning_rate": 0.0018915470494417863,
+      "loss": 2.3458,
+      "step": 1700
+    },
+    {
+      "epoch": 2.73,
+      "grad_norm": 2.5896382331848145,
+      "learning_rate": 0.0018909090909090909,
+      "loss": 2.4374,
+      "step": 1710
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 4.7238006591796875,
+      "learning_rate": 0.0018902711323763954,
+      "loss": 2.5829,
+      "step": 1720
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 2.7794413566589355,
+      "learning_rate": 0.0018896331738437002,
+      "loss": 2.2883,
+      "step": 1730
+    },
+    {
+      "epoch": 2.78,
+      "grad_norm": 2.269745349884033,
+      "learning_rate": 0.0018889952153110048,
+      "loss": 2.1972,
+      "step": 1740
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 4.9795918464660645,
+      "learning_rate": 0.0018883572567783093,
+      "loss": 2.3769,
+      "step": 1750
+    },
+    {
+      "epoch": 2.81,
+      "grad_norm": 2.1848745346069336,
+      "learning_rate": 0.0018877192982456141,
+      "loss": 2.3916,
+      "step": 1760
+    },
+    {
+      "epoch": 2.82,
+      "grad_norm": 3.246695041656494,
+      "learning_rate": 0.0018870813397129187,
+      "loss": 2.3303,
+      "step": 1770
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 4.342026710510254,
+      "learning_rate": 0.0018864433811802233,
+      "loss": 2.3351,
+      "step": 1780
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 3.7110838890075684,
+      "learning_rate": 0.0018858054226475278,
+      "loss": 2.2379,
+      "step": 1790
+    },
+    {
+      "epoch": 2.87,
+      "grad_norm": 3.2227859497070312,
+      "learning_rate": 0.0018851674641148326,
+      "loss": 2.3734,
+      "step": 1800
+    },
+    {
+      "epoch": 2.89,
+      "grad_norm": 3.257556915283203,
+      "learning_rate": 0.0018845295055821372,
+      "loss": 2.533,
+      "step": 1810
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 3.287235975265503,
+      "learning_rate": 0.0018838915470494417,
+      "loss": 2.4149,
+      "step": 1820
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 5.332248210906982,
+      "learning_rate": 0.0018832535885167465,
+      "loss": 2.4045,
+      "step": 1830
+    },
+    {
+      "epoch": 2.93,
+      "grad_norm": 6.954147815704346,
+      "learning_rate": 0.001882615629984051,
+      "loss": 2.4621,
+      "step": 1840
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 3.5278656482696533,
+      "learning_rate": 0.0018819776714513556,
+      "loss": 2.3869,
+      "step": 1850
+    },
+    {
+      "epoch": 2.97,
+      "grad_norm": 5.101337909698486,
+      "learning_rate": 0.0018813397129186602,
+      "loss": 2.507,
+      "step": 1860
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 3.60355544090271,
+      "learning_rate": 0.001880701754385965,
+      "loss": 2.409,
+      "step": 1870
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 4.093048572540283,
+      "learning_rate": 0.0018800637958532695,
+      "loss": 2.2993,
+      "step": 1880
+    },
+    {
+      "epoch": 3.01,
+      "grad_norm": 2.8062260150909424,
+      "learning_rate": 0.001879425837320574,
+      "loss": 1.7359,
+      "step": 1890
+    },
+    {
+      "epoch": 3.03,
+      "grad_norm": 6.747288227081299,
+      "learning_rate": 0.0018787878787878789,
+      "loss": 1.6095,
+      "step": 1900
+    },
+    {
+      "epoch": 3.05,
+      "grad_norm": 4.082904815673828,
+      "learning_rate": 0.0018781499202551834,
+      "loss": 1.69,
+      "step": 1910
+    },
+    {
+      "epoch": 3.06,
+      "grad_norm": 1.8365858793258667,
+      "learning_rate": 0.001877511961722488,
+      "loss": 1.5815,
+      "step": 1920
+    },
+    {
+      "epoch": 3.08,
+      "grad_norm": 2.940593719482422,
+      "learning_rate": 0.0018768740031897926,
+      "loss": 1.6635,
+      "step": 1930
+    },
+    {
+      "epoch": 3.09,
+      "grad_norm": 2.660888433456421,
+      "learning_rate": 0.0018762360446570973,
+      "loss": 1.632,
+      "step": 1940
+    },
+    {
+      "epoch": 3.11,
+      "grad_norm": 3.061300039291382,
+      "learning_rate": 0.001875598086124402,
+      "loss": 1.6964,
+      "step": 1950
+    },
+    {
+      "epoch": 3.13,
+      "grad_norm": 3.134197235107422,
+      "learning_rate": 0.0018749601275917065,
+      "loss": 1.6702,
+      "step": 1960
+    },
+    {
+      "epoch": 3.14,
+      "grad_norm": 5.188543796539307,
+      "learning_rate": 0.001874322169059011,
+      "loss": 1.7615,
+      "step": 1970
+    },
+    {
+      "epoch": 3.16,
+      "grad_norm": 3.115239381790161,
+      "learning_rate": 0.0018736842105263158,
+      "loss": 1.8079,
+      "step": 1980
+    },
+    {
+      "epoch": 3.17,
+      "grad_norm": 4.397618770599365,
+      "learning_rate": 0.0018730462519936204,
+      "loss": 1.8256,
+      "step": 1990
+    },
+    {
+      "epoch": 3.19,
+      "grad_norm": 6.745879650115967,
+      "learning_rate": 0.001872408293460925,
+      "loss": 1.6285,
+      "step": 2000
+    },
+    {
+      "epoch": 3.21,
+      "grad_norm": 4.609273433685303,
+      "learning_rate": 0.0018717703349282297,
+      "loss": 1.8885,
+      "step": 2010
+    },
+    {
+      "epoch": 3.22,
+      "grad_norm": 2.650247097015381,
+      "learning_rate": 0.0018711323763955343,
+      "loss": 1.6762,
+      "step": 2020
+    },
+    {
+      "epoch": 3.24,
+      "grad_norm": 5.857548713684082,
+      "learning_rate": 0.0018704944178628388,
+      "loss": 1.7824,
+      "step": 2030
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 3.2646751403808594,
+      "learning_rate": 0.0018698564593301434,
+      "loss": 1.8078,
+      "step": 2040
+    },
+    {
+      "epoch": 3.27,
+      "grad_norm": 3.6167776584625244,
+      "learning_rate": 0.0018692185007974482,
+      "loss": 1.7395,
+      "step": 2050
+    },
+    {
+      "epoch": 3.29,
+      "grad_norm": 3.98301100730896,
+      "learning_rate": 0.0018685805422647527,
+      "loss": 1.811,
+      "step": 2060
+    },
+    {
+      "epoch": 3.3,
+      "grad_norm": 5.3117594718933105,
+      "learning_rate": 0.0018679425837320573,
+      "loss": 1.7647,
+      "step": 2070
+    },
+    {
+      "epoch": 3.32,
+      "grad_norm": 6.290541172027588,
+      "learning_rate": 0.001867304625199362,
+      "loss": 1.8698,
+      "step": 2080
+    },
+    {
+      "epoch": 3.33,
+      "grad_norm": 6.5661091804504395,
+      "learning_rate": 0.0018666666666666666,
+      "loss": 2.0072,
+      "step": 2090
+    },
+    {
+      "epoch": 3.35,
+      "grad_norm": 6.150557994842529,
+      "learning_rate": 0.0018660287081339712,
+      "loss": 1.8055,
+      "step": 2100
+    },
+    {
+      "epoch": 3.37,
+      "grad_norm": 3.677581310272217,
+      "learning_rate": 0.0018653907496012758,
+      "loss": 1.8711,
+      "step": 2110
+    },
+    {
+      "epoch": 3.38,
+      "grad_norm": 2.2296063899993896,
+      "learning_rate": 0.0018647527910685806,
+      "loss": 1.8763,
+      "step": 2120
+    },
+    {
+      "epoch": 3.4,
+      "grad_norm": 3.410414695739746,
+      "learning_rate": 0.0018641148325358851,
+      "loss": 1.8947,
+      "step": 2130
+    },
+    {
+      "epoch": 3.41,
+      "grad_norm": 3.566406726837158,
+      "learning_rate": 0.0018634768740031897,
+      "loss": 1.9423,
+      "step": 2140
+    },
+    {
+      "epoch": 3.43,
+      "grad_norm": 5.5341668128967285,
+      "learning_rate": 0.0018628389154704945,
+      "loss": 2.0945,
+      "step": 2150
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 4.542388439178467,
+      "learning_rate": 0.001862200956937799,
+      "loss": 1.9816,
+      "step": 2160
+    },
+    {
+      "epoch": 3.46,
+      "grad_norm": 3.3940858840942383,
+      "learning_rate": 0.0018615629984051036,
+      "loss": 1.9789,
+      "step": 2170
+    },
+    {
+      "epoch": 3.48,
+      "grad_norm": 3.9412808418273926,
+      "learning_rate": 0.0018609250398724081,
+      "loss": 2.1294,
+      "step": 2180
+    },
+    {
+      "epoch": 3.49,
+      "grad_norm": 2.695256233215332,
+      "learning_rate": 0.001860287081339713,
+      "loss": 2.0233,
+      "step": 2190
+    },
+    {
+      "epoch": 3.51,
+      "grad_norm": 3.1621010303497314,
+      "learning_rate": 0.0018596491228070175,
+      "loss": 1.8246,
+      "step": 2200
+    },
+    {
+      "epoch": 3.52,
+      "grad_norm": 5.293850898742676,
+      "learning_rate": 0.001859011164274322,
+      "loss": 2.0004,
+      "step": 2210
+    },
+    {
+      "epoch": 3.54,
+      "grad_norm": 3.9184532165527344,
+      "learning_rate": 0.0018583732057416268,
+      "loss": 2.1011,
+      "step": 2220
+    },
+    {
+      "epoch": 3.56,
+      "grad_norm": 2.1356756687164307,
+      "learning_rate": 0.0018577352472089314,
+      "loss": 2.1129,
+      "step": 2230
+    },
+    {
+      "epoch": 3.57,
+      "grad_norm": 3.8817296028137207,
+      "learning_rate": 0.001857097288676236,
+      "loss": 2.0047,
+      "step": 2240
+    },
+    {
+      "epoch": 3.59,
+      "grad_norm": 3.2533388137817383,
+      "learning_rate": 0.0018564593301435405,
+      "loss": 1.9137,
+      "step": 2250
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 3.3586273193359375,
+      "learning_rate": 0.0018558213716108455,
+      "loss": 2.065,
+      "step": 2260
+    },
+    {
+      "epoch": 3.62,
+      "grad_norm": 4.144857406616211,
+      "learning_rate": 0.00185518341307815,
+      "loss": 2.0735,
+      "step": 2270
+    },
+    {
+      "epoch": 3.64,
+      "grad_norm": 3.9639623165130615,
+      "learning_rate": 0.0018545454545454546,
+      "loss": 2.118,
+      "step": 2280
+    },
+    {
+      "epoch": 3.65,
+      "grad_norm": 3.5141801834106445,
+      "learning_rate": 0.0018539074960127592,
+      "loss": 2.0006,
+      "step": 2290
+    },
+    {
+      "epoch": 3.67,
+      "grad_norm": 3.2397677898406982,
+      "learning_rate": 0.001853269537480064,
+      "loss": 2.1261,
+      "step": 2300
+    },
+    {
+      "epoch": 3.68,
+      "grad_norm": 5.273965835571289,
+      "learning_rate": 0.0018526315789473685,
+      "loss": 2.0134,
+      "step": 2310
+    },
+    {
+      "epoch": 3.7,
+      "grad_norm": 4.7644805908203125,
+      "learning_rate": 0.001851993620414673,
+      "loss": 1.9781,
+      "step": 2320
+    },
+    {
+      "epoch": 3.72,
+      "grad_norm": 3.042400598526001,
+      "learning_rate": 0.0018513556618819779,
+      "loss": 2.0429,
+      "step": 2330
+    },
+    {
+      "epoch": 3.73,
+      "grad_norm": 4.666615009307861,
+      "learning_rate": 0.0018507177033492824,
+      "loss": 2.052,
+      "step": 2340
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 2.8000500202178955,
+      "learning_rate": 0.001850079744816587,
+      "loss": 1.8426,
+      "step": 2350
+    },
+    {
+      "epoch": 3.76,
+      "grad_norm": 4.616471767425537,
+      "learning_rate": 0.0018494417862838916,
+      "loss": 2.1656,
+      "step": 2360
+    },
+    {
+      "epoch": 3.78,
+      "grad_norm": 4.575398921966553,
+      "learning_rate": 0.0018488038277511964,
+      "loss": 2.176,
+      "step": 2370
+    },
+    {
+      "epoch": 3.8,
+      "grad_norm": 4.790685176849365,
+      "learning_rate": 0.001848165869218501,
+      "loss": 2.2702,
+      "step": 2380
+    },
+    {
+      "epoch": 3.81,
+      "grad_norm": 4.581923007965088,
+      "learning_rate": 0.0018475279106858055,
+      "loss": 2.1888,
+      "step": 2390
+    },
+    {
+      "epoch": 3.83,
+      "grad_norm": 4.997535705566406,
+      "learning_rate": 0.0018468899521531103,
+      "loss": 2.0964,
+      "step": 2400
+    },
+    {
+      "epoch": 3.84,
+      "grad_norm": 3.024472951889038,
+      "learning_rate": 0.0018462519936204148,
+      "loss": 1.9286,
+      "step": 2410
+    },
+    {
+      "epoch": 3.86,
+      "grad_norm": 3.9244346618652344,
+      "learning_rate": 0.0018456140350877194,
+      "loss": 2.0429,
+      "step": 2420
+    },
+    {
+      "epoch": 3.88,
+      "grad_norm": 5.021399974822998,
+      "learning_rate": 0.001844976076555024,
+      "loss": 2.0872,
+      "step": 2430
+    },
+    {
+      "epoch": 3.89,
+      "grad_norm": 2.4256746768951416,
+      "learning_rate": 0.0018443381180223287,
+      "loss": 2.0974,
+      "step": 2440
+    },
+    {
+      "epoch": 3.91,
+      "grad_norm": 1.7723888158798218,
+      "learning_rate": 0.0018437001594896333,
+      "loss": 2.1741,
+      "step": 2450
+    },
+    {
+      "epoch": 3.92,
+      "grad_norm": 2.9281272888183594,
+      "learning_rate": 0.0018430622009569379,
+      "loss": 2.0566,
+      "step": 2460
+    },
+    {
+      "epoch": 3.94,
+      "grad_norm": 3.0242364406585693,
+      "learning_rate": 0.0018424242424242426,
+      "loss": 2.2248,
+      "step": 2470
+    },
+    {
+      "epoch": 3.96,
+      "grad_norm": 3.027165651321411,
+      "learning_rate": 0.0018417862838915472,
+      "loss": 2.0762,
+      "step": 2480
+    },
+    {
+      "epoch": 3.97,
+      "grad_norm": 4.249027729034424,
+      "learning_rate": 0.0018411483253588518,
+      "loss": 2.1017,
+      "step": 2490
+    },
+    {
+      "epoch": 3.99,
+      "grad_norm": 3.3154234886169434,
+      "learning_rate": 0.0018405103668261563,
+      "loss": 2.0766,
+      "step": 2500
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 1.4245625734329224,
+      "learning_rate": 0.001839872408293461,
+      "loss": 1.9103,
+      "step": 2510
+    },
+    {
+      "epoch": 4.02,
+      "grad_norm": 1.513168454170227,
+      "learning_rate": 0.0018392344497607657,
+      "loss": 1.4412,
+      "step": 2520
+    },
+    {
+      "epoch": 4.04,
+      "grad_norm": 4.4338507652282715,
+      "learning_rate": 0.0018385964912280702,
+      "loss": 1.3431,
+      "step": 2530
+    },
+    {
+      "epoch": 4.05,
+      "grad_norm": 4.030521869659424,
+      "learning_rate": 0.001837958532695375,
+      "loss": 1.4325,
+      "step": 2540
+    },
+    {
+      "epoch": 4.07,
+      "grad_norm": 4.0168137550354,
+      "learning_rate": 0.0018373205741626796,
+      "loss": 1.4083,
+      "step": 2550
+    },
+    {
+      "epoch": 4.08,
+      "grad_norm": 5.304862022399902,
+      "learning_rate": 0.0018366826156299841,
+      "loss": 1.5336,
+      "step": 2560
+    },
+    {
+      "epoch": 4.1,
+      "grad_norm": 3.5825703144073486,
+      "learning_rate": 0.0018360446570972887,
+      "loss": 1.4663,
+      "step": 2570
+    },
+    {
+      "epoch": 4.11,
+      "grad_norm": 3.8972997665405273,
+      "learning_rate": 0.0018354066985645935,
+      "loss": 1.5203,
+      "step": 2580
+    },
+    {
+      "epoch": 4.13,
+      "grad_norm": 5.68231725692749,
+      "learning_rate": 0.001834768740031898,
+      "loss": 1.6814,
+      "step": 2590
+    },
+    {
+      "epoch": 4.15,
+      "grad_norm": 3.8971197605133057,
+      "learning_rate": 0.0018341307814992026,
+      "loss": 1.3574,
+      "step": 2600
+    },
+    {
+      "epoch": 4.16,
+      "grad_norm": 3.819286346435547,
+      "learning_rate": 0.0018334928229665074,
+      "loss": 1.4937,
+      "step": 2610
+    },
+    {
+      "epoch": 4.18,
+      "grad_norm": 3.3106937408447266,
+      "learning_rate": 0.001832854864433812,
+      "loss": 1.5814,
+      "step": 2620
+    },
+    {
+      "epoch": 4.19,
+      "grad_norm": 5.2803754806518555,
+      "learning_rate": 0.0018322169059011165,
+      "loss": 1.641,
+      "step": 2630
+    },
+    {
+      "epoch": 4.21,
+      "grad_norm": 4.728196620941162,
+      "learning_rate": 0.001831578947368421,
+      "loss": 1.5647,
+      "step": 2640
+    },
+    {
+      "epoch": 4.23,
+      "grad_norm": 3.2671823501586914,
+      "learning_rate": 0.0018309409888357258,
+      "loss": 1.7673,
+      "step": 2650
+    },
+    {
+      "epoch": 4.24,
+      "grad_norm": 2.539050579071045,
+      "learning_rate": 0.0018303030303030304,
+      "loss": 1.5397,
+      "step": 2660
+    },
+    {
+      "epoch": 4.26,
+      "grad_norm": 2.7646982669830322,
+      "learning_rate": 0.001829665071770335,
+      "loss": 1.4788,
+      "step": 2670
+    },
+    {
+      "epoch": 4.27,
+      "grad_norm": 3.103675603866577,
+      "learning_rate": 0.0018290271132376395,
+      "loss": 1.5428,
+      "step": 2680
+    },
+    {
+      "epoch": 4.29,
+      "grad_norm": 5.560327053070068,
+      "learning_rate": 0.0018283891547049443,
+      "loss": 1.5389,
+      "step": 2690
+    },
+    {
+      "epoch": 4.31,
+      "grad_norm": 3.355659246444702,
+      "learning_rate": 0.0018277511961722489,
+      "loss": 1.6061,
+      "step": 2700
+    },
+    {
+      "epoch": 4.32,
+      "grad_norm": 5.579101085662842,
+      "learning_rate": 0.0018271132376395534,
+      "loss": 1.6554,
+      "step": 2710
+    },
+    {
+      "epoch": 4.34,
+      "grad_norm": 4.465839862823486,
+      "learning_rate": 0.0018264752791068582,
+      "loss": 1.7019,
+      "step": 2720
+    },
+    {
+      "epoch": 4.35,
+      "grad_norm": 2.9531333446502686,
+      "learning_rate": 0.0018258373205741628,
+      "loss": 1.6395,
+      "step": 2730
+    },
+    {
+      "epoch": 4.37,
+      "grad_norm": 3.912163257598877,
+      "learning_rate": 0.0018251993620414673,
+      "loss": 1.7232,
+      "step": 2740
+    },
+    {
+      "epoch": 4.39,
+      "grad_norm": 3.955035924911499,
+      "learning_rate": 0.001824561403508772,
+      "loss": 1.6601,
+      "step": 2750
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 5.796784400939941,
+      "learning_rate": 0.0018239234449760767,
+      "loss": 1.7742,
+      "step": 2760
+    },
+    {
+      "epoch": 4.42,
+      "grad_norm": 3.470076322555542,
+      "learning_rate": 0.0018232854864433812,
+      "loss": 1.7226,
+      "step": 2770
+    },
+    {
+      "epoch": 4.43,
+      "grad_norm": 4.716192245483398,
+      "learning_rate": 0.0018226475279106858,
+      "loss": 1.6537,
+      "step": 2780
+    },
+    {
+      "epoch": 4.45,
+      "grad_norm": 4.586126327514648,
+      "learning_rate": 0.0018220095693779906,
+      "loss": 1.6319,
+      "step": 2790
+    },
+    {
+      "epoch": 4.47,
+      "grad_norm": 4.049830913543701,
+      "learning_rate": 0.0018213716108452952,
+      "loss": 1.8295,
+      "step": 2800
+    },
+    {
+      "epoch": 4.48,
+      "grad_norm": 2.4487478733062744,
+      "learning_rate": 0.0018207336523125997,
+      "loss": 1.9085,
+      "step": 2810
+    },
+    {
+      "epoch": 4.5,
+      "grad_norm": 3.3505730628967285,
+      "learning_rate": 0.0018200956937799043,
+      "loss": 1.7341,
+      "step": 2820
+    },
+    {
+      "epoch": 4.51,
+      "grad_norm": 3.655205011367798,
+      "learning_rate": 0.001819457735247209,
+      "loss": 1.7667,
+      "step": 2830
+    },
+    {
+      "epoch": 4.53,
+      "grad_norm": 4.730507850646973,
+      "learning_rate": 0.0018188197767145136,
+      "loss": 1.6444,
+      "step": 2840
+    },
+    {
+      "epoch": 4.55,
+      "grad_norm": 3.635011911392212,
+      "learning_rate": 0.0018181818181818182,
+      "loss": 1.7706,
+      "step": 2850
+    },
+    {
+      "epoch": 4.56,
+      "grad_norm": 2.98230242729187,
+      "learning_rate": 0.001817543859649123,
+      "loss": 1.7443,
+      "step": 2860
+    },
+    {
+      "epoch": 4.58,
+      "grad_norm": 2.706557035446167,
+      "learning_rate": 0.0018169059011164275,
+      "loss": 1.738,
+      "step": 2870
+    },
+    {
+      "epoch": 4.59,
+      "grad_norm": 5.715457439422607,
+      "learning_rate": 0.001816267942583732,
+      "loss": 1.9244,
+      "step": 2880
+    },
+    {
+      "epoch": 4.61,
+      "grad_norm": 4.379674911499023,
+      "learning_rate": 0.0018156299840510366,
+      "loss": 1.8655,
+      "step": 2890
+    },
+    {
+      "epoch": 4.63,
+      "grad_norm": 3.1540908813476562,
+      "learning_rate": 0.0018149920255183414,
+      "loss": 1.8485,
+      "step": 2900
+    },
+    {
+      "epoch": 4.64,
+      "grad_norm": 4.2252373695373535,
+      "learning_rate": 0.001814354066985646,
+      "loss": 1.8581,
+      "step": 2910
+    },
+    {
+      "epoch": 4.66,
+      "grad_norm": 2.00207781791687,
+      "learning_rate": 0.0018137161084529506,
+      "loss": 2.0877,
+      "step": 2920
+    },
+    {
+      "epoch": 4.67,
+      "grad_norm": 2.710052013397217,
+      "learning_rate": 0.0018130781499202553,
+      "loss": 1.6488,
+      "step": 2930
+    },
+    {
+      "epoch": 4.69,
+      "grad_norm": 5.69435453414917,
+      "learning_rate": 0.00181244019138756,
+      "loss": 1.8438,
+      "step": 2940
+    },
+    {
+      "epoch": 4.7,
+      "grad_norm": 3.163170576095581,
+      "learning_rate": 0.0018118022328548645,
+      "loss": 1.8168,
+      "step": 2950
+    },
+    {
+      "epoch": 4.72,
+      "grad_norm": 2.5819127559661865,
+      "learning_rate": 0.001811164274322169,
+      "loss": 1.8733,
+      "step": 2960
+    },
+    {
+      "epoch": 4.74,
+      "grad_norm": 3.780280351638794,
+      "learning_rate": 0.0018105263157894738,
+      "loss": 1.7007,
+      "step": 2970
+    },
+    {
+      "epoch": 4.75,
+      "grad_norm": 4.294229030609131,
+      "learning_rate": 0.0018098883572567784,
+      "loss": 1.838,
+      "step": 2980
+    },
+    {
+      "epoch": 4.77,
+      "grad_norm": 4.328463077545166,
+      "learning_rate": 0.001809250398724083,
+      "loss": 1.8721,
+      "step": 2990
+    },
+    {
+      "epoch": 4.78,
+      "grad_norm": 3.204005241394043,
+      "learning_rate": 0.0018086124401913875,
+      "loss": 1.8796,
+      "step": 3000
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 6.089762210845947,
+      "learning_rate": 0.0018079744816586923,
+      "loss": 1.8764,
+      "step": 3010
+    },
+    {
+      "epoch": 4.82,
+      "grad_norm": 5.21134090423584,
+      "learning_rate": 0.0018073365231259968,
+      "loss": 1.8615,
+      "step": 3020
+    },
+    {
+      "epoch": 4.83,
+      "grad_norm": 5.567359924316406,
+      "learning_rate": 0.0018066985645933014,
+      "loss": 1.9394,
+      "step": 3030
+    },
+    {
+      "epoch": 4.85,
+      "grad_norm": 3.8925669193267822,
+      "learning_rate": 0.0018060606060606062,
+      "loss": 1.9556,
+      "step": 3040
+    },
+    {
+      "epoch": 4.86,
+      "grad_norm": 3.123612642288208,
+      "learning_rate": 0.0018054226475279107,
+      "loss": 1.8685,
+      "step": 3050
+    },
+    {
+      "epoch": 4.88,
+      "grad_norm": 3.970958709716797,
+      "learning_rate": 0.0018047846889952153,
+      "loss": 1.8955,
+      "step": 3060
+    },
+    {
+      "epoch": 4.9,
+      "grad_norm": 4.519131660461426,
+      "learning_rate": 0.0018041467304625199,
+      "loss": 1.9885,
+      "step": 3070
+    },
+    {
+      "epoch": 4.91,
+      "grad_norm": 3.834430456161499,
+      "learning_rate": 0.0018035087719298246,
+      "loss": 1.9572,
+      "step": 3080
+    },
+    {
+      "epoch": 4.93,
+      "grad_norm": 5.614201068878174,
+      "learning_rate": 0.0018028708133971292,
+      "loss": 1.9366,
+      "step": 3090
+    },
+    {
+      "epoch": 4.94,
+      "grad_norm": 3.8492119312286377,
+      "learning_rate": 0.0018022328548644338,
+      "loss": 1.8337,
+      "step": 3100
+    },
+    {
+      "epoch": 4.96,
+      "grad_norm": 5.122296333312988,
+      "learning_rate": 0.0018015948963317385,
+      "loss": 1.9093,
+      "step": 3110
+    },
+    {
+      "epoch": 4.98,
+      "grad_norm": 3.0235679149627686,
+      "learning_rate": 0.001800956937799043,
+      "loss": 1.8341,
+      "step": 3120
+    },
+    {
+      "epoch": 4.99,
+      "grad_norm": 3.4031426906585693,
+      "learning_rate": 0.0018003189792663477,
+      "loss": 1.8851,
+      "step": 3130
+    },
+    {
+      "epoch": 5.01,
+      "grad_norm": 3.6404995918273926,
+      "learning_rate": 0.0017996810207336522,
+      "loss": 1.4797,
+      "step": 3140
+    },
+    {
+      "epoch": 5.02,
+      "grad_norm": 3.4057798385620117,
+      "learning_rate": 0.001799043062200957,
+      "loss": 1.1585,
+      "step": 3150
+    },
+    {
+      "epoch": 5.04,
+      "grad_norm": 3.314164161682129,
+      "learning_rate": 0.0017984051036682616,
+      "loss": 1.3921,
+      "step": 3160
+    },
+    {
+      "epoch": 5.06,
+      "grad_norm": 4.028993606567383,
+      "learning_rate": 0.0017977671451355661,
+      "loss": 1.2938,
+      "step": 3170
+    },
+    {
+      "epoch": 5.07,
+      "grad_norm": 4.128094673156738,
+      "learning_rate": 0.001797129186602871,
+      "loss": 1.2979,
+      "step": 3180
+    },
+    {
+      "epoch": 5.09,
+      "grad_norm": 3.079228639602661,
+      "learning_rate": 0.0017964912280701755,
+      "loss": 1.292,
+      "step": 3190
+    },
+    {
+      "epoch": 5.1,
+      "grad_norm": 4.176467418670654,
+      "learning_rate": 0.00179585326953748,
+      "loss": 1.528,
+      "step": 3200
+    },
+    {
+      "epoch": 5.12,
+      "grad_norm": 3.689857244491577,
+      "learning_rate": 0.0017952153110047846,
+      "loss": 1.2832,
+      "step": 3210
+    },
+    {
+      "epoch": 5.14,
+      "grad_norm": 3.580005645751953,
+      "learning_rate": 0.0017945773524720894,
+      "loss": 1.3447,
+      "step": 3220
+    },
+    {
+      "epoch": 5.15,
+      "grad_norm": 3.8672592639923096,
+      "learning_rate": 0.001793939393939394,
+      "loss": 1.234,
+      "step": 3230
+    },
+    {
+      "epoch": 5.17,
+      "grad_norm": 3.5929276943206787,
+      "learning_rate": 0.0017933014354066985,
+      "loss": 1.2326,
+      "step": 3240
+    },
+    {
+      "epoch": 5.18,
+      "grad_norm": 3.1610376834869385,
+      "learning_rate": 0.0017926634768740033,
+      "loss": 1.3224,
+      "step": 3250
+    },
+    {
+      "epoch": 5.2,
+      "grad_norm": 3.908184289932251,
+      "learning_rate": 0.0017920255183413079,
+      "loss": 1.4446,
+      "step": 3260
+    },
+    {
+      "epoch": 5.22,
+      "grad_norm": 4.269443511962891,
+      "learning_rate": 0.0017913875598086124,
+      "loss": 1.4286,
+      "step": 3270
+    },
+    {
+      "epoch": 5.23,
+      "grad_norm": 3.0032732486724854,
+      "learning_rate": 0.001790749601275917,
+      "loss": 1.3768,
+      "step": 3280
+    },
+    {
+      "epoch": 5.25,
+      "grad_norm": 3.841958522796631,
+      "learning_rate": 0.0017901116427432218,
+      "loss": 1.3441,
+      "step": 3290
+    },
+    {
+      "epoch": 5.26,
+      "grad_norm": 3.0527617931365967,
+      "learning_rate": 0.0017894736842105263,
+      "loss": 1.4091,
+      "step": 3300
+    },
+    {
+      "epoch": 5.28,
+      "grad_norm": 3.266508102416992,
+      "learning_rate": 0.0017888357256778309,
+      "loss": 1.3933,
+      "step": 3310
+    },
+    {
+      "epoch": 5.3,
+      "grad_norm": 4.250580310821533,
+      "learning_rate": 0.0017881977671451357,
+      "loss": 1.5105,
+      "step": 3320
+    },
+    {
+      "epoch": 5.31,
+      "grad_norm": 3.375892162322998,
+      "learning_rate": 0.0017875598086124402,
+      "loss": 1.4668,
+      "step": 3330
+    },
+    {
+      "epoch": 5.33,
+      "grad_norm": 4.1522297859191895,
+      "learning_rate": 0.0017869218500797448,
+      "loss": 1.4022,
+      "step": 3340
+    },
+    {
+      "epoch": 5.34,
+      "grad_norm": 5.130900859832764,
+      "learning_rate": 0.0017862838915470493,
+      "loss": 1.4457,
+      "step": 3350
+    },
+    {
+      "epoch": 5.36,
+      "grad_norm": 3.176265239715576,
+      "learning_rate": 0.0017856459330143541,
+      "loss": 1.5113,
+      "step": 3360
+    },
+    {
+      "epoch": 5.37,
+      "grad_norm": 5.0800557136535645,
+      "learning_rate": 0.0017850079744816587,
+      "loss": 1.5258,
+      "step": 3370
+    },
+    {
+      "epoch": 5.39,
+      "grad_norm": 4.9642534255981445,
+      "learning_rate": 0.0017843700159489633,
+      "loss": 1.4101,
+      "step": 3380
+    },
+    {
+      "epoch": 5.41,
+      "grad_norm": 4.7204270362854,
+      "learning_rate": 0.0017837320574162678,
+      "loss": 1.5812,
+      "step": 3390
+    },
+    {
+      "epoch": 5.42,
+      "grad_norm": 3.163360834121704,
+      "learning_rate": 0.0017830940988835726,
+      "loss": 1.5647,
+      "step": 3400
+    },
+    {
+      "epoch": 5.44,
+      "grad_norm": 5.122838973999023,
+      "learning_rate": 0.0017824561403508772,
+      "loss": 1.5151,
+      "step": 3410
+    },
+    {
+      "epoch": 5.45,
+      "grad_norm": 3.543826103210449,
+      "learning_rate": 0.0017818181818181817,
+      "loss": 1.4134,
+      "step": 3420
+    },
+    {
+      "epoch": 5.47,
+      "grad_norm": 3.4644534587860107,
+      "learning_rate": 0.0017811802232854865,
+      "loss": 1.6267,
+      "step": 3430
+    },
+    {
+      "epoch": 5.49,
+      "grad_norm": 4.8260040283203125,
+      "learning_rate": 0.001780542264752791,
+      "loss": 1.4154,
+      "step": 3440
+    },
+    {
+      "epoch": 5.5,
+      "grad_norm": 2.8876681327819824,
+      "learning_rate": 0.0017799043062200956,
+      "loss": 1.675,
+      "step": 3450
+    },
+    {
+      "epoch": 5.52,
+      "grad_norm": 2.8691539764404297,
+      "learning_rate": 0.0017792663476874002,
+      "loss": 1.6627,
+      "step": 3460
+    },
+    {
+      "epoch": 5.53,
+      "grad_norm": 4.810047626495361,
+      "learning_rate": 0.001778628389154705,
+      "loss": 1.4778,
+      "step": 3470
+    },
+    {
+      "epoch": 5.55,
+      "grad_norm": 6.102086067199707,
+      "learning_rate": 0.0017779904306220095,
+      "loss": 1.597,
+      "step": 3480
+    },
+    {
+      "epoch": 5.57,
+      "grad_norm": 2.5562939643859863,
+      "learning_rate": 0.001777352472089314,
+      "loss": 1.6155,
+      "step": 3490
+    },
+    {
+      "epoch": 5.58,
+      "grad_norm": 4.305008888244629,
+      "learning_rate": 0.0017767145135566189,
+      "loss": 1.6084,
+      "step": 3500
+    },
+    {
+      "epoch": 5.6,
+      "grad_norm": 3.545440673828125,
+      "learning_rate": 0.0017760765550239234,
+      "loss": 1.7705,
+      "step": 3510
+    },
+    {
+      "epoch": 5.61,
+      "grad_norm": 3.9225101470947266,
+      "learning_rate": 0.001775438596491228,
+      "loss": 1.7765,
+      "step": 3520
+    },
+    {
+      "epoch": 5.63,
+      "grad_norm": 3.6406924724578857,
+      "learning_rate": 0.0017748006379585326,
+      "loss": 1.5835,
+      "step": 3530
+    },
+    {
+      "epoch": 5.65,
+      "grad_norm": 3.9222354888916016,
+      "learning_rate": 0.0017741626794258373,
+      "loss": 1.625,
+      "step": 3540
+    },
+    {
+      "epoch": 5.66,
+      "grad_norm": 3.7696895599365234,
+      "learning_rate": 0.001773524720893142,
+      "loss": 1.7595,
+      "step": 3550
+    },
+    {
+      "epoch": 5.68,
+      "grad_norm": 3.927811622619629,
+      "learning_rate": 0.0017728867623604465,
+      "loss": 1.5485,
+      "step": 3560
+    },
+    {
+      "epoch": 5.69,
+      "grad_norm": 5.417560577392578,
+      "learning_rate": 0.0017722488038277512,
+      "loss": 1.7188,
+      "step": 3570
+    },
+    {
+      "epoch": 5.71,
+      "grad_norm": 3.9058241844177246,
+      "learning_rate": 0.0017716108452950558,
+      "loss": 1.5654,
+      "step": 3580
+    },
+    {
+      "epoch": 5.73,
+      "grad_norm": 3.2079246044158936,
+      "learning_rate": 0.0017709728867623604,
+      "loss": 1.8261,
+      "step": 3590
+    },
+    {
+      "epoch": 5.74,
+      "grad_norm": 3.8155903816223145,
+      "learning_rate": 0.001770334928229665,
+      "loss": 1.5338,
+      "step": 3600
+    },
+    {
+      "epoch": 5.76,
+      "grad_norm": 4.09771203994751,
+      "learning_rate": 0.0017696969696969697,
+      "loss": 1.7654,
+      "step": 3610
+    },
+    {
+      "epoch": 5.77,
+      "grad_norm": 3.3263423442840576,
+      "learning_rate": 0.0017690590111642743,
+      "loss": 1.5809,
+      "step": 3620
+    },
+    {
+      "epoch": 5.79,
+      "grad_norm": 4.113112926483154,
+      "learning_rate": 0.0017684210526315788,
+      "loss": 1.7932,
+      "step": 3630
+    },
+    {
+      "epoch": 5.81,
+      "grad_norm": 4.0192694664001465,
+      "learning_rate": 0.0017677830940988836,
+      "loss": 1.7115,
+      "step": 3640
+    },
+    {
+      "epoch": 5.82,
+      "grad_norm": 3.165609121322632,
+      "learning_rate": 0.0017671451355661882,
+      "loss": 1.4939,
+      "step": 3650
+    },
+    {
+      "epoch": 5.84,
+      "grad_norm": 3.859196186065674,
+      "learning_rate": 0.0017665071770334927,
+      "loss": 1.7767,
+      "step": 3660
+    },
+    {
+      "epoch": 5.85,
+      "grad_norm": 4.11074686050415,
+      "learning_rate": 0.0017658692185007973,
+      "loss": 1.6754,
+      "step": 3670
+    },
+    {
+      "epoch": 5.87,
+      "grad_norm": 2.926147937774658,
+      "learning_rate": 0.001765231259968102,
+      "loss": 1.6904,
+      "step": 3680
+    },
+    {
+      "epoch": 5.89,
+      "grad_norm": 4.121160507202148,
+      "learning_rate": 0.0017645933014354066,
+      "loss": 1.575,
+      "step": 3690
+    },
+    {
+      "epoch": 5.9,
+      "grad_norm": 4.0827131271362305,
+      "learning_rate": 0.0017639553429027112,
+      "loss": 1.6392,
+      "step": 3700
+    },
+    {
+      "epoch": 5.92,
+      "grad_norm": 4.13917875289917,
+      "learning_rate": 0.0017633173843700158,
+      "loss": 1.6323,
+      "step": 3710
+    },
+    {
+      "epoch": 5.93,
+      "grad_norm": 3.052493095397949,
+      "learning_rate": 0.0017626794258373206,
+      "loss": 1.8932,
+      "step": 3720
+    },
+    {
+      "epoch": 5.95,
+      "grad_norm": 5.432674407958984,
+      "learning_rate": 0.0017620414673046251,
+      "loss": 1.58,
+      "step": 3730
+    },
+    {
+      "epoch": 5.96,
+      "grad_norm": 6.6524505615234375,
+      "learning_rate": 0.0017614035087719297,
+      "loss": 1.6361,
+      "step": 3740
+    },
+    {
+      "epoch": 5.98,
+      "grad_norm": 7.488154888153076,
+      "learning_rate": 0.0017607655502392347,
+      "loss": 1.823,
+      "step": 3750
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 3.211604118347168,
+      "learning_rate": 0.0017601275917065392,
+      "loss": 1.8265,
+      "step": 3760
+    },
+    {
+      "epoch": 6.01,
+      "grad_norm": 2.5021958351135254,
+      "learning_rate": 0.0017594896331738438,
+      "loss": 1.3958,
+      "step": 3770
+    },
+    {
+      "epoch": 6.03,
+      "grad_norm": 3.7297511100769043,
+      "learning_rate": 0.0017588516746411484,
+      "loss": 1.1193,
+      "step": 3780
+    },
+    {
+      "epoch": 6.04,
+      "grad_norm": 4.050276279449463,
+      "learning_rate": 0.0017582137161084531,
+      "loss": 1.2042,
+      "step": 3790
+    },
+    {
+      "epoch": 6.06,
+      "grad_norm": 4.484896659851074,
+      "learning_rate": 0.0017575757575757577,
+      "loss": 1.2483,
+      "step": 3800
+    },
+    {
+      "epoch": 6.08,
+      "grad_norm": 7.920963764190674,
+      "learning_rate": 0.0017569377990430623,
+      "loss": 1.2832,
+      "step": 3810
+    },
+    {
+      "epoch": 6.09,
+      "grad_norm": 2.772211790084839,
+      "learning_rate": 0.001756299840510367,
+      "loss": 1.1111,
+      "step": 3820
+    },
+    {
+      "epoch": 6.11,
+      "grad_norm": 3.2087087631225586,
+      "learning_rate": 0.0017556618819776716,
+      "loss": 1.108,
+      "step": 3830
+    },
+    {
+      "epoch": 6.12,
+      "grad_norm": 3.650775194168091,
+      "learning_rate": 0.0017550239234449762,
+      "loss": 1.2609,
+      "step": 3840
+    },
+    {
+      "epoch": 6.14,
+      "grad_norm": 3.6753830909729004,
+      "learning_rate": 0.0017543859649122807,
+      "loss": 1.1581,
+      "step": 3850
+    },
+    {
+      "epoch": 6.16,
+      "grad_norm": 3.568274974822998,
+      "learning_rate": 0.0017537480063795855,
+      "loss": 1.2661,
+      "step": 3860
+    },
+    {
+      "epoch": 6.17,
+      "grad_norm": 3.6179471015930176,
+      "learning_rate": 0.00175311004784689,
+      "loss": 1.092,
+      "step": 3870
+    },
+    {
+      "epoch": 6.19,
+      "grad_norm": 2.885768413543701,
+      "learning_rate": 0.0017524720893141946,
+      "loss": 1.1903,
+      "step": 3880
+    },
+    {
+      "epoch": 6.2,
+      "grad_norm": 2.389308214187622,
+      "learning_rate": 0.0017518341307814994,
+      "loss": 1.1977,
+      "step": 3890
+    },
+    {
+      "epoch": 6.22,
+      "grad_norm": 4.820352554321289,
+      "learning_rate": 0.001751196172248804,
+      "loss": 1.3104,
+      "step": 3900
+    },
+    {
+      "epoch": 6.24,
+      "grad_norm": 2.8304367065429688,
+      "learning_rate": 0.0017505582137161085,
+      "loss": 1.3522,
+      "step": 3910
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 4.1842732429504395,
+      "learning_rate": 0.001749920255183413,
+      "loss": 1.2901,
+      "step": 3920
+    },
+    {
+      "epoch": 6.27,
+      "grad_norm": 4.698485851287842,
+      "learning_rate": 0.0017492822966507179,
+      "loss": 1.2261,
+      "step": 3930
+    },
+    {
+      "epoch": 6.28,
+      "grad_norm": 5.434518814086914,
+      "learning_rate": 0.0017486443381180224,
+      "loss": 1.3348,
+      "step": 3940
+    },
+    {
+      "epoch": 6.3,
+      "grad_norm": 4.726064682006836,
+      "learning_rate": 0.001748006379585327,
+      "loss": 1.308,
+      "step": 3950
+    },
+    {
+      "epoch": 6.32,
+      "grad_norm": 3.2794930934906006,
+      "learning_rate": 0.0017473684210526318,
+      "loss": 1.232,
+      "step": 3960
+    },
+    {
+      "epoch": 6.33,
+      "grad_norm": 4.248810768127441,
+      "learning_rate": 0.0017467304625199364,
+      "loss": 1.2118,
+      "step": 3970
+    },
+    {
+      "epoch": 6.35,
+      "grad_norm": 5.226914882659912,
+      "learning_rate": 0.001746092503987241,
+      "loss": 1.273,
+      "step": 3980
+    },
+    {
+      "epoch": 6.36,
+      "grad_norm": 4.917492866516113,
+      "learning_rate": 0.0017454545454545455,
+      "loss": 1.2566,
+      "step": 3990
+    },
+    {
+      "epoch": 6.38,
+      "grad_norm": 6.164140224456787,
+      "learning_rate": 0.0017448165869218503,
+      "loss": 1.4062,
+      "step": 4000
+    },
+    {
+      "epoch": 6.4,
+      "grad_norm": 4.235147953033447,
+      "learning_rate": 0.0017441786283891548,
+      "loss": 1.2668,
+      "step": 4010
+    },
+    {
+      "epoch": 6.41,
+      "grad_norm": 4.627178192138672,
+      "learning_rate": 0.0017435406698564594,
+      "loss": 1.2448,
+      "step": 4020
+    },
+    {
+      "epoch": 6.43,
+      "grad_norm": 5.603235721588135,
+      "learning_rate": 0.0017429027113237642,
+      "loss": 1.4769,
+      "step": 4030
+    },
+    {
+      "epoch": 6.44,
+      "grad_norm": 2.3861303329467773,
+      "learning_rate": 0.0017422647527910687,
+      "loss": 1.6294,
+      "step": 4040
+    },
+    {
+      "epoch": 6.46,
+      "grad_norm": 3.891209840774536,
+      "learning_rate": 0.0017416267942583733,
+      "loss": 1.3206,
+      "step": 4050
+    },
+    {
+      "epoch": 6.48,
+      "grad_norm": 2.741506576538086,
+      "learning_rate": 0.0017409888357256779,
+      "loss": 1.3517,
+      "step": 4060
+    },
+    {
+      "epoch": 6.49,
+      "grad_norm": 3.152433156967163,
+      "learning_rate": 0.0017403508771929826,
+      "loss": 1.2853,
+      "step": 4070
+    },
+    {
+      "epoch": 6.51,
+      "grad_norm": 3.4589314460754395,
+      "learning_rate": 0.0017397129186602872,
+      "loss": 1.4094,
+      "step": 4080
+    },
+    {
+      "epoch": 6.52,
+      "grad_norm": 6.630537033081055,
+      "learning_rate": 0.0017390749601275918,
+      "loss": 1.3614,
+      "step": 4090
+    },
+    {
+      "epoch": 6.54,
+      "grad_norm": 3.220771551132202,
+      "learning_rate": 0.0017384370015948963,
+      "loss": 1.3354,
+      "step": 4100
+    },
+    {
+      "epoch": 6.56,
+      "grad_norm": 2.8003170490264893,
+      "learning_rate": 0.001737799043062201,
+      "loss": 1.398,
+      "step": 4110
+    },
+    {
+      "epoch": 6.57,
+      "grad_norm": 5.145318984985352,
+      "learning_rate": 0.0017371610845295057,
+      "loss": 1.4861,
+      "step": 4120
+    },
+    {
+      "epoch": 6.59,
+      "grad_norm": 3.6889803409576416,
+      "learning_rate": 0.0017365231259968102,
+      "loss": 1.3687,
+      "step": 4130
+    },
+    {
+      "epoch": 6.6,
+      "grad_norm": 3.3676440715789795,
+      "learning_rate": 0.001735885167464115,
+      "loss": 1.3656,
+      "step": 4140
+    },
+    {
+      "epoch": 6.62,
+      "grad_norm": 4.406673431396484,
+      "learning_rate": 0.0017352472089314196,
+      "loss": 1.4002,
+      "step": 4150
+    },
+    {
+      "epoch": 6.63,
+      "grad_norm": 4.088317394256592,
+      "learning_rate": 0.0017346092503987241,
+      "loss": 1.4265,
+      "step": 4160
+    },
+    {
+      "epoch": 6.65,
+      "grad_norm": 4.677865982055664,
+      "learning_rate": 0.0017339712918660287,
+      "loss": 1.5231,
+      "step": 4170
+    },
+    {
+      "epoch": 6.67,
+      "grad_norm": 4.6024322509765625,
+      "learning_rate": 0.0017333333333333335,
+      "loss": 1.5746,
+      "step": 4180
+    },
+    {
+      "epoch": 6.68,
+      "grad_norm": 4.752773284912109,
+      "learning_rate": 0.001732695374800638,
+      "loss": 1.3851,
+      "step": 4190
+    },
+    {
+      "epoch": 6.7,
+      "grad_norm": 3.870704412460327,
+      "learning_rate": 0.0017320574162679426,
+      "loss": 1.4281,
+      "step": 4200
+    },
+    {
+      "epoch": 6.71,
+      "grad_norm": 3.5712807178497314,
+      "learning_rate": 0.0017314194577352474,
+      "loss": 1.6449,
+      "step": 4210
+    },
+    {
+      "epoch": 6.73,
+      "grad_norm": 3.4372332096099854,
+      "learning_rate": 0.001730781499202552,
+      "loss": 1.5439,
+      "step": 4220
+    },
+    {
+      "epoch": 6.75,
+      "grad_norm": 5.638207912445068,
+      "learning_rate": 0.0017301435406698565,
+      "loss": 1.5426,
+      "step": 4230
+    },
+    {
+      "epoch": 6.76,
+      "grad_norm": 5.095453262329102,
+      "learning_rate": 0.001729505582137161,
+      "loss": 1.4252,
+      "step": 4240
+    },
+    {
+      "epoch": 6.78,
+      "grad_norm": 2.4728281497955322,
+      "learning_rate": 0.0017288676236044658,
+      "loss": 1.4374,
+      "step": 4250
+    },
+    {
+      "epoch": 6.79,
+      "grad_norm": 3.4558870792388916,
+      "learning_rate": 0.0017282296650717704,
+      "loss": 1.4457,
+      "step": 4260
+    },
+    {
+      "epoch": 6.81,
+      "grad_norm": 3.5765767097473145,
+      "learning_rate": 0.001727591706539075,
+      "loss": 1.5364,
+      "step": 4270
+    },
+    {
+      "epoch": 6.83,
+      "grad_norm": 4.479535102844238,
+      "learning_rate": 0.0017269537480063797,
+      "loss": 1.561,
+      "step": 4280
+    },
+    {
+      "epoch": 6.84,
+      "grad_norm": 3.1493709087371826,
+      "learning_rate": 0.0017263157894736843,
+      "loss": 1.4131,
+      "step": 4290
+    },
+    {
+      "epoch": 6.86,
+      "grad_norm": 4.1836042404174805,
+      "learning_rate": 0.0017256778309409889,
+      "loss": 1.5031,
+      "step": 4300
+    },
+    {
+      "epoch": 6.87,
+      "grad_norm": 3.2860119342803955,
+      "learning_rate": 0.0017250398724082934,
+      "loss": 1.6286,
+      "step": 4310
+    },
+    {
+      "epoch": 6.89,
+      "grad_norm": 2.8824214935302734,
+      "learning_rate": 0.0017244019138755982,
+      "loss": 1.5281,
+      "step": 4320
+    },
+    {
+      "epoch": 6.91,
+      "grad_norm": 5.243397235870361,
+      "learning_rate": 0.0017237639553429028,
+      "loss": 1.5868,
+      "step": 4330
+    },
+    {
+      "epoch": 6.92,
+      "grad_norm": 2.8732147216796875,
+      "learning_rate": 0.0017231259968102073,
+      "loss": 1.5813,
+      "step": 4340
+    },
+    {
+      "epoch": 6.94,
+      "grad_norm": 4.3689494132995605,
+      "learning_rate": 0.0017224880382775121,
+      "loss": 1.5706,
+      "step": 4350
+    },
+    {
+      "epoch": 6.95,
+      "grad_norm": 4.520773887634277,
+      "learning_rate": 0.0017218500797448167,
+      "loss": 1.5769,
+      "step": 4360
+    },
+    {
+      "epoch": 6.97,
+      "grad_norm": 3.988919734954834,
+      "learning_rate": 0.0017212121212121212,
+      "loss": 1.6688,
+      "step": 4370
+    },
+    {
+      "epoch": 6.99,
+      "grad_norm": 3.1639842987060547,
+      "learning_rate": 0.0017205741626794258,
+      "loss": 1.5045,
+      "step": 4380
+    },
+    {
+      "epoch": 7.0,
+      "grad_norm": 1.8472672700881958,
+      "learning_rate": 0.0017199362041467306,
+      "loss": 1.4904,
+      "step": 4390
+    },
+    {
+      "epoch": 7.02,
+      "grad_norm": 3.472080945968628,
+      "learning_rate": 0.0017192982456140352,
+      "loss": 1.0386,
+      "step": 4400
+    },
+    {
+      "epoch": 7.03,
+      "grad_norm": 3.553772211074829,
+      "learning_rate": 0.0017186602870813397,
+      "loss": 0.9889,
+      "step": 4410
+    },
+    {
+      "epoch": 7.05,
+      "grad_norm": 4.417268753051758,
+      "learning_rate": 0.0017180223285486443,
+      "loss": 0.9838,
+      "step": 4420
+    },
+    {
+      "epoch": 7.07,
+      "grad_norm": 5.340514659881592,
+      "learning_rate": 0.001717384370015949,
+      "loss": 1.0057,
+      "step": 4430
+    },
+    {
+      "epoch": 7.08,
+      "grad_norm": 3.2239015102386475,
+      "learning_rate": 0.0017167464114832536,
+      "loss": 1.1003,
+      "step": 4440
+    },
+    {
+      "epoch": 7.1,
+      "grad_norm": 3.5991039276123047,
+      "learning_rate": 0.0017161084529505582,
+      "loss": 1.0384,
+      "step": 4450
+    },
+    {
+      "epoch": 7.11,
+      "grad_norm": 3.8520448207855225,
+      "learning_rate": 0.001715470494417863,
+      "loss": 0.9872,
+      "step": 4460
+    },
+    {
+      "epoch": 7.13,
+      "grad_norm": 3.489706516265869,
+      "learning_rate": 0.0017148325358851675,
+      "loss": 1.1452,
+      "step": 4470
+    },
+    {
+      "epoch": 7.15,
+      "grad_norm": 2.60661244392395,
+      "learning_rate": 0.001714194577352472,
+      "loss": 1.0007,
+      "step": 4480
+    },
+    {
+      "epoch": 7.16,
+      "grad_norm": 5.66582727432251,
+      "learning_rate": 0.0017135566188197766,
+      "loss": 1.1072,
+      "step": 4490
+    },
+    {
+      "epoch": 7.18,
+      "grad_norm": 4.794973373413086,
+      "learning_rate": 0.0017129186602870814,
+      "loss": 1.1994,
+      "step": 4500
+    },
+    {
+      "epoch": 7.19,
+      "grad_norm": 5.310514450073242,
+      "learning_rate": 0.001712280701754386,
+      "loss": 1.104,
+      "step": 4510
+    },
+    {
+      "epoch": 7.21,
+      "grad_norm": 3.0956227779388428,
+      "learning_rate": 0.0017116427432216906,
+      "loss": 1.0067,
+      "step": 4520
+    },
+    {
+      "epoch": 7.22,
+      "grad_norm": 4.637990474700928,
+      "learning_rate": 0.0017110047846889953,
+      "loss": 1.0235,
+      "step": 4530
+    },
+    {
+      "epoch": 7.24,
+      "grad_norm": 2.7805848121643066,
+      "learning_rate": 0.0017103668261563,
+      "loss": 1.2051,
+      "step": 4540
+    },
+    {
+      "epoch": 7.26,
+      "grad_norm": 4.313024520874023,
+      "learning_rate": 0.0017097288676236045,
+      "loss": 1.1176,
+      "step": 4550
+    },
+    {
+      "epoch": 7.27,
+      "grad_norm": 3.135601282119751,
+      "learning_rate": 0.001709090909090909,
+      "loss": 1.2627,
+      "step": 4560
+    },
+    {
+      "epoch": 7.29,
+      "grad_norm": 3.0150604248046875,
+      "learning_rate": 0.0017084529505582138,
+      "loss": 1.0905,
+      "step": 4570
+    },
+    {
+      "epoch": 7.3,
+      "grad_norm": 3.5915615558624268,
+      "learning_rate": 0.0017078149920255184,
+      "loss": 1.0766,
+      "step": 4580
+    },
+    {
+      "epoch": 7.32,
+      "grad_norm": 3.981519937515259,
+      "learning_rate": 0.001707177033492823,
+      "loss": 1.2256,
+      "step": 4590
+    },
+    {
+      "epoch": 7.34,
+      "grad_norm": 3.5225601196289062,
+      "learning_rate": 0.0017065390749601277,
+      "loss": 1.2533,
+      "step": 4600
+    },
+    {
+      "epoch": 7.35,
+      "grad_norm": 4.971328258514404,
+      "learning_rate": 0.0017059011164274323,
+      "loss": 1.2287,
+      "step": 4610
+    },
+    {
+      "epoch": 7.37,
+      "grad_norm": 3.6815969944000244,
+      "learning_rate": 0.0017052631578947368,
+      "loss": 1.1224,
+      "step": 4620
+    },
+    {
+      "epoch": 7.38,
+      "grad_norm": 2.50472354888916,
+      "learning_rate": 0.0017046251993620414,
+      "loss": 1.3059,
+      "step": 4630
+    },
+    {
+      "epoch": 7.4,
+      "grad_norm": 2.376018524169922,
+      "learning_rate": 0.0017039872408293462,
+      "loss": 1.3294,
+      "step": 4640
+    },
+    {
+      "epoch": 7.42,
+      "grad_norm": 3.935692548751831,
+      "learning_rate": 0.0017033492822966507,
+      "loss": 1.3172,
+      "step": 4650
+    },
+    {
+      "epoch": 7.43,
+      "grad_norm": 2.974992513656616,
+      "learning_rate": 0.0017027113237639553,
+      "loss": 1.2235,
+      "step": 4660
+    },
+    {
+      "epoch": 7.45,
+      "grad_norm": 6.238065242767334,
+      "learning_rate": 0.00170207336523126,
+      "loss": 1.2355,
+      "step": 4670
+    },
+    {
+      "epoch": 7.46,
+      "grad_norm": 4.15529727935791,
+      "learning_rate": 0.0017014354066985646,
+      "loss": 1.1975,
+      "step": 4680
+    },
+    {
+      "epoch": 7.48,
+      "grad_norm": 3.069063663482666,
+      "learning_rate": 0.0017007974481658692,
+      "loss": 1.0709,
+      "step": 4690
+    },
+    {
+      "epoch": 7.5,
+      "grad_norm": 3.6260762214660645,
+      "learning_rate": 0.0017001594896331738,
+      "loss": 1.2178,
+      "step": 4700
+    },
+    {
+      "epoch": 7.51,
+      "grad_norm": 3.0013301372528076,
+      "learning_rate": 0.0016995215311004785,
+      "loss": 1.1398,
+      "step": 4710
+    },
+    {
+      "epoch": 7.53,
+      "grad_norm": 2.0015666484832764,
+      "learning_rate": 0.001698883572567783,
+      "loss": 1.3866,
+      "step": 4720
+    },
+    {
+      "epoch": 7.54,
+      "grad_norm": 3.997130870819092,
+      "learning_rate": 0.0016982456140350877,
+      "loss": 1.3066,
+      "step": 4730
+    },
+    {
+      "epoch": 7.56,
+      "grad_norm": 3.5671958923339844,
+      "learning_rate": 0.0016976076555023924,
+      "loss": 1.0524,
+      "step": 4740
+    },
+    {
+      "epoch": 7.58,
+      "grad_norm": 2.9513649940490723,
+      "learning_rate": 0.001696969696969697,
+      "loss": 1.173,
+      "step": 4750
+    },
+    {
+      "epoch": 7.59,
+      "grad_norm": 3.9709384441375732,
+      "learning_rate": 0.0016963317384370016,
+      "loss": 1.4574,
+      "step": 4760
+    },
+    {
+      "epoch": 7.61,
+      "grad_norm": 4.372689723968506,
+      "learning_rate": 0.0016956937799043061,
+      "loss": 1.4686,
+      "step": 4770
+    },
+    {
+      "epoch": 7.62,
+      "grad_norm": 3.748054265975952,
+      "learning_rate": 0.001695055821371611,
+      "loss": 1.4393,
+      "step": 4780
+    },
+    {
+      "epoch": 7.64,
+      "grad_norm": 3.7790236473083496,
+      "learning_rate": 0.0016944178628389155,
+      "loss": 1.2965,
+      "step": 4790
+    },
+    {
+      "epoch": 7.66,
+      "grad_norm": 4.572340965270996,
+      "learning_rate": 0.00169377990430622,
+      "loss": 1.5793,
+      "step": 4800
+    },
+    {
+      "epoch": 7.67,
+      "grad_norm": 3.838794231414795,
+      "learning_rate": 0.0016931419457735246,
+      "loss": 1.2758,
+      "step": 4810
+    },
+    {
+      "epoch": 7.69,
+      "grad_norm": 3.9073917865753174,
+      "learning_rate": 0.0016925039872408294,
+      "loss": 1.2657,
+      "step": 4820
+    },
+    {
+      "epoch": 7.7,
+      "grad_norm": 3.6725800037384033,
+      "learning_rate": 0.001691866028708134,
+      "loss": 1.1664,
+      "step": 4830
+    },
+    {
+      "epoch": 7.72,
+      "grad_norm": 2.742488384246826,
+      "learning_rate": 0.0016912280701754385,
+      "loss": 1.3705,
+      "step": 4840
+    },
+    {
+      "epoch": 7.74,
+      "grad_norm": 5.307029724121094,
+      "learning_rate": 0.0016905901116427433,
+      "loss": 1.3447,
+      "step": 4850
+    },
+    {
+      "epoch": 7.75,
+      "grad_norm": 3.2814066410064697,
+      "learning_rate": 0.0016899521531100479,
+      "loss": 1.2705,
+      "step": 4860
+    },
+    {
+      "epoch": 7.77,
+      "grad_norm": 4.674114227294922,
+      "learning_rate": 0.0016893141945773524,
+      "loss": 1.3062,
+      "step": 4870
+    },
+    {
+      "epoch": 7.78,
+      "grad_norm": 3.455000638961792,
+      "learning_rate": 0.001688676236044657,
+      "loss": 1.3788,
+      "step": 4880
+    },
+    {
+      "epoch": 7.8,
+      "grad_norm": 3.7969977855682373,
+      "learning_rate": 0.0016880382775119618,
+      "loss": 1.4107,
+      "step": 4890
+    },
+    {
+      "epoch": 7.81,
+      "grad_norm": 4.002437591552734,
+      "learning_rate": 0.0016874003189792663,
+      "loss": 1.3215,
+      "step": 4900
+    },
+    {
+      "epoch": 7.83,
+      "grad_norm": 2.509416103363037,
+      "learning_rate": 0.0016867623604465709,
+      "loss": 1.2652,
+      "step": 4910
+    },
+    {
+      "epoch": 7.85,
+      "grad_norm": 2.7716715335845947,
+      "learning_rate": 0.0016861244019138757,
+      "loss": 1.4095,
+      "step": 4920
+    },
+    {
+      "epoch": 7.86,
+      "grad_norm": 5.537817001342773,
+      "learning_rate": 0.0016854864433811802,
+      "loss": 1.2998,
+      "step": 4930
+    },
+    {
+      "epoch": 7.88,
+      "grad_norm": 3.2739720344543457,
+      "learning_rate": 0.0016848484848484848,
+      "loss": 1.3311,
+      "step": 4940
+    },
+    {
+      "epoch": 7.89,
+      "grad_norm": 3.1102712154388428,
+      "learning_rate": 0.0016842105263157893,
+      "loss": 1.2811,
+      "step": 4950
+    },
+    {
+      "epoch": 7.91,
+      "grad_norm": 4.807369709014893,
+      "learning_rate": 0.0016835725677830941,
+      "loss": 1.4339,
+      "step": 4960
+    },
+    {
+      "epoch": 7.93,
+      "grad_norm": 8.400796890258789,
+      "learning_rate": 0.0016829346092503987,
+      "loss": 1.5323,
+      "step": 4970
+    },
+    {
+      "epoch": 7.94,
+      "grad_norm": 3.6073365211486816,
+      "learning_rate": 0.0016822966507177033,
+      "loss": 1.5019,
+      "step": 4980
+    },
+    {
+      "epoch": 7.96,
+      "grad_norm": 3.30039644241333,
+      "learning_rate": 0.001681658692185008,
+      "loss": 1.4698,
+      "step": 4990
+    },
+    {
+      "epoch": 7.97,
+      "grad_norm": 3.7990474700927734,
+      "learning_rate": 0.0016810207336523126,
+      "loss": 1.5408,
+      "step": 5000
+    },
+    {
+      "epoch": 7.99,
+      "grad_norm": 4.0094499588012695,
+      "learning_rate": 0.0016803827751196172,
+      "loss": 1.4195,
+      "step": 5010
+    },
+    {
+      "epoch": 8.01,
+      "grad_norm": 2.0265750885009766,
+      "learning_rate": 0.0016797448165869217,
+      "loss": 1.181,
+      "step": 5020
+    },
+    {
+      "epoch": 8.02,
+      "grad_norm": 2.840583086013794,
+      "learning_rate": 0.0016791068580542265,
+      "loss": 0.8481,
+      "step": 5030
+    },
+    {
+      "epoch": 8.04,
+      "grad_norm": 2.409465789794922,
+      "learning_rate": 0.001678468899521531,
+      "loss": 0.9815,
+      "step": 5040
+    },
+    {
+      "epoch": 8.05,
+      "grad_norm": 5.790297031402588,
+      "learning_rate": 0.0016778309409888356,
+      "loss": 0.9621,
+      "step": 5050
+    },
+    {
+      "epoch": 8.07,
+      "grad_norm": 2.9008841514587402,
+      "learning_rate": 0.0016771929824561404,
+      "loss": 0.9734,
+      "step": 5060
+    },
+    {
+      "epoch": 8.09,
+      "grad_norm": 2.495950937271118,
+      "learning_rate": 0.001676555023923445,
+      "loss": 0.9353,
+      "step": 5070
+    },
+    {
+      "epoch": 8.1,
+      "grad_norm": 3.870645523071289,
+      "learning_rate": 0.0016759170653907495,
+      "loss": 1.023,
+      "step": 5080
+    },
+    {
+      "epoch": 8.12,
+      "grad_norm": 2.352860450744629,
+      "learning_rate": 0.001675279106858054,
+      "loss": 0.9064,
+      "step": 5090
+    },
+    {
+      "epoch": 8.13,
+      "grad_norm": 3.9795637130737305,
+      "learning_rate": 0.0016746411483253589,
+      "loss": 1.0378,
+      "step": 5100
+    },
+    {
+      "epoch": 8.15,
+      "grad_norm": 3.728628396987915,
+      "learning_rate": 0.0016740031897926634,
+      "loss": 0.8394,
+      "step": 5110
+    },
+    {
+      "epoch": 8.17,
+      "grad_norm": 4.232802391052246,
+      "learning_rate": 0.001673365231259968,
+      "loss": 1.0115,
+      "step": 5120
+    },
+    {
+      "epoch": 8.18,
+      "grad_norm": 4.09517765045166,
+      "learning_rate": 0.0016727272727272726,
+      "loss": 1.0139,
+      "step": 5130
+    },
+    {
+      "epoch": 8.2,
+      "grad_norm": 2.101757287979126,
+      "learning_rate": 0.0016720893141945773,
+      "loss": 1.0698,
+      "step": 5140
+    },
+    {
+      "epoch": 8.21,
+      "grad_norm": 4.124992370605469,
+      "learning_rate": 0.001671451355661882,
+      "loss": 1.0214,
+      "step": 5150
+    },
+    {
+      "epoch": 8.23,
+      "grad_norm": 4.000357151031494,
+      "learning_rate": 0.0016708133971291865,
+      "loss": 1.0302,
+      "step": 5160
+    },
+    {
+      "epoch": 8.25,
+      "grad_norm": 4.556628704071045,
+      "learning_rate": 0.0016701754385964912,
+      "loss": 1.1185,
+      "step": 5170
+    },
+    {
+      "epoch": 8.26,
+      "grad_norm": 4.288385391235352,
+      "learning_rate": 0.0016695374800637958,
+      "loss": 1.1112,
+      "step": 5180
+    },
+    {
+      "epoch": 8.28,
+      "grad_norm": 3.5157744884490967,
+      "learning_rate": 0.0016688995215311004,
+      "loss": 1.0085,
+      "step": 5190
+    },
+    {
+      "epoch": 8.29,
+      "grad_norm": 4.492936611175537,
+      "learning_rate": 0.001668261562998405,
+      "loss": 0.9928,
+      "step": 5200
+    },
+    {
+      "epoch": 8.31,
+      "grad_norm": 2.415928363800049,
+      "learning_rate": 0.0016676236044657097,
+      "loss": 1.0586,
+      "step": 5210
+    },
+    {
+      "epoch": 8.33,
+      "grad_norm": 4.508685111999512,
+      "learning_rate": 0.0016669856459330143,
+      "loss": 1.1178,
+      "step": 5220
+    },
+    {
+      "epoch": 8.34,
+      "grad_norm": 6.090748310089111,
+      "learning_rate": 0.0016663476874003188,
+      "loss": 1.0743,
+      "step": 5230
+    },
+    {
+      "epoch": 8.36,
+      "grad_norm": 4.638314723968506,
+      "learning_rate": 0.0016657097288676238,
+      "loss": 1.1763,
+      "step": 5240
+    },
+    {
+      "epoch": 8.37,
+      "grad_norm": 3.5104875564575195,
+      "learning_rate": 0.0016650717703349284,
+      "loss": 1.0157,
+      "step": 5250
+    },
+    {
+      "epoch": 8.39,
+      "grad_norm": 3.11543869972229,
+      "learning_rate": 0.001664433811802233,
+      "loss": 1.1719,
+      "step": 5260
+    },
+    {
+      "epoch": 8.41,
+      "grad_norm": 3.698253631591797,
+      "learning_rate": 0.0016637958532695375,
+      "loss": 1.3834,
+      "step": 5270
+    },
+    {
+      "epoch": 8.42,
+      "grad_norm": 4.070870876312256,
+      "learning_rate": 0.0016631578947368423,
+      "loss": 1.2485,
+      "step": 5280
+    },
+    {
+      "epoch": 8.44,
+      "grad_norm": 3.328082323074341,
+      "learning_rate": 0.0016625199362041469,
+      "loss": 1.0835,
+      "step": 5290
+    },
+    {
+      "epoch": 8.45,
+      "grad_norm": 4.319711208343506,
+      "learning_rate": 0.0016618819776714514,
+      "loss": 1.1718,
+      "step": 5300
+    },
+    {
+      "epoch": 8.47,
+      "grad_norm": 3.040421485900879,
+      "learning_rate": 0.0016612440191387562,
+      "loss": 1.1825,
+      "step": 5310
+    },
+    {
+      "epoch": 8.48,
+      "grad_norm": 3.8478896617889404,
+      "learning_rate": 0.0016606060606060608,
+      "loss": 1.2003,
+      "step": 5320
+    },
+    {
+      "epoch": 8.5,
+      "grad_norm": 3.5098345279693604,
+      "learning_rate": 0.0016599681020733653,
+      "loss": 1.3229,
+      "step": 5330
+    },
+    {
+      "epoch": 8.52,
+      "grad_norm": 1.7316700220108032,
+      "learning_rate": 0.00165933014354067,
+      "loss": 1.1779,
+      "step": 5340
+    },
+    {
+      "epoch": 8.53,
+      "grad_norm": 2.3097381591796875,
+      "learning_rate": 0.0016586921850079747,
+      "loss": 1.0794,
+      "step": 5350
+    },
+    {
+      "epoch": 8.55,
+      "grad_norm": 2.1922430992126465,
+      "learning_rate": 0.0016580542264752792,
+      "loss": 1.2068,
+      "step": 5360
+    },
+    {
+      "epoch": 8.56,
+      "grad_norm": 5.0043864250183105,
+      "learning_rate": 0.0016574162679425838,
+      "loss": 1.2837,
+      "step": 5370
+    },
+    {
+      "epoch": 8.58,
+      "grad_norm": 4.01829195022583,
+      "learning_rate": 0.0016567783094098886,
+      "loss": 1.0479,
+      "step": 5380
+    },
+    {
+      "epoch": 8.6,
+      "grad_norm": 3.4249794483184814,
+      "learning_rate": 0.0016561403508771931,
+      "loss": 1.3172,
+      "step": 5390
+    },
+    {
+      "epoch": 8.61,
+      "grad_norm": 3.52347993850708,
+      "learning_rate": 0.0016555023923444977,
+      "loss": 1.2796,
+      "step": 5400
+    },
+    {
+      "epoch": 8.63,
+      "grad_norm": 3.3641083240509033,
+      "learning_rate": 0.0016548644338118023,
+      "loss": 1.2373,
+      "step": 5410
+    },
+    {
+      "epoch": 8.64,
+      "grad_norm": 4.422176361083984,
+      "learning_rate": 0.001654226475279107,
+      "loss": 1.1601,
+      "step": 5420
+    },
+    {
+      "epoch": 8.66,
+      "grad_norm": 4.531619071960449,
+      "learning_rate": 0.0016535885167464116,
+      "loss": 1.1254,
+      "step": 5430
+    },
+    {
+      "epoch": 8.68,
+      "grad_norm": 2.5183939933776855,
+      "learning_rate": 0.0016529505582137162,
+      "loss": 1.2257,
+      "step": 5440
+    },
+    {
+      "epoch": 8.69,
+      "grad_norm": 2.493967294692993,
+      "learning_rate": 0.001652312599681021,
+      "loss": 1.1369,
+      "step": 5450
+    },
+    {
+      "epoch": 8.71,
+      "grad_norm": 4.203963279724121,
+      "learning_rate": 0.0016516746411483255,
+      "loss": 1.0819,
+      "step": 5460
+    },
+    {
+      "epoch": 8.72,
+      "grad_norm": 4.204017639160156,
+      "learning_rate": 0.00165103668261563,
+      "loss": 1.2126,
+      "step": 5470
+    },
+    {
+      "epoch": 8.74,
+      "grad_norm": 5.039621353149414,
+      "learning_rate": 0.0016503987240829346,
+      "loss": 1.265,
+      "step": 5480
+    },
+    {
+      "epoch": 8.76,
+      "grad_norm": 2.5682952404022217,
+      "learning_rate": 0.0016497607655502394,
+      "loss": 1.2403,
+      "step": 5490
+    },
+    {
+      "epoch": 8.77,
+      "grad_norm": 2.821531057357788,
+      "learning_rate": 0.001649122807017544,
+      "loss": 1.3182,
+      "step": 5500
+    },
+    {
+      "epoch": 8.79,
+      "grad_norm": 3.4752848148345947,
+      "learning_rate": 0.0016484848484848485,
+      "loss": 1.264,
+      "step": 5510
+    },
+    {
+      "epoch": 8.8,
+      "grad_norm": 2.5011346340179443,
+      "learning_rate": 0.001647846889952153,
+      "loss": 1.1792,
+      "step": 5520
+    },
+    {
+      "epoch": 8.82,
+      "grad_norm": 4.323322772979736,
+      "learning_rate": 0.0016472089314194579,
+      "loss": 1.1745,
+      "step": 5530
+    },
+    {
+      "epoch": 8.84,
+      "grad_norm": 4.5369768142700195,
+      "learning_rate": 0.0016465709728867624,
+      "loss": 1.2837,
+      "step": 5540
+    },
+    {
+      "epoch": 8.85,
+      "grad_norm": 4.292603492736816,
+      "learning_rate": 0.001645933014354067,
+      "loss": 1.3317,
+      "step": 5550
+    },
+    {
+      "epoch": 8.87,
+      "grad_norm": 2.312387228012085,
+      "learning_rate": 0.0016452950558213718,
+      "loss": 1.2081,
+      "step": 5560
+    },
+    {
+      "epoch": 8.88,
+      "grad_norm": 3.759363889694214,
+      "learning_rate": 0.0016446570972886764,
+      "loss": 1.2549,
+      "step": 5570
+    },
+    {
+      "epoch": 8.9,
+      "grad_norm": 4.08116340637207,
+      "learning_rate": 0.001644019138755981,
+      "loss": 1.2823,
+      "step": 5580
+    },
+    {
+      "epoch": 8.92,
+      "grad_norm": 3.29032039642334,
+      "learning_rate": 0.0016433811802232855,
+      "loss": 1.2339,
+      "step": 5590
+    },
+    {
+      "epoch": 8.93,
+      "grad_norm": 4.082303524017334,
+      "learning_rate": 0.0016427432216905903,
+      "loss": 1.2537,
+      "step": 5600
+    },
+    {
+      "epoch": 8.95,
+      "grad_norm": 3.470620632171631,
+      "learning_rate": 0.0016421052631578948,
+      "loss": 1.2605,
+      "step": 5610
+    },
+    {
+      "epoch": 8.96,
+      "grad_norm": 5.008780002593994,
+      "learning_rate": 0.0016414673046251994,
+      "loss": 1.3909,
+      "step": 5620
+    },
+    {
+      "epoch": 8.98,
+      "grad_norm": 3.3247218132019043,
+      "learning_rate": 0.0016408293460925042,
+      "loss": 1.2504,
+      "step": 5630
+    },
+    {
+      "epoch": 9.0,
+      "grad_norm": 3.653365135192871,
+      "learning_rate": 0.0016401913875598087,
+      "loss": 1.3043,
+      "step": 5640
+    },
+    {
+      "epoch": 9.01,
+      "grad_norm": 2.061579942703247,
+      "learning_rate": 0.0016395534290271133,
+      "loss": 1.0127,
+      "step": 5650
+    },
+    {
+      "epoch": 9.03,
+      "grad_norm": 3.8204243183135986,
+      "learning_rate": 0.0016389154704944179,
+      "loss": 0.8594,
+      "step": 5660
+    },
+    {
+      "epoch": 9.04,
+      "grad_norm": 3.1755354404449463,
+      "learning_rate": 0.0016382775119617226,
+      "loss": 0.8146,
+      "step": 5670
+    },
+    {
+      "epoch": 9.06,
+      "grad_norm": 6.688543319702148,
+      "learning_rate": 0.0016376395534290272,
+      "loss": 0.8972,
+      "step": 5680
+    },
+    {
+      "epoch": 9.07,
+      "grad_norm": 2.6439781188964844,
+      "learning_rate": 0.0016370015948963318,
+      "loss": 0.8627,
+      "step": 5690
+    },
+    {
+      "epoch": 9.09,
+      "grad_norm": 2.962597131729126,
+      "learning_rate": 0.0016363636363636365,
+      "loss": 0.8263,
+      "step": 5700
+    },
+    {
+      "epoch": 9.11,
+      "grad_norm": 4.008563995361328,
+      "learning_rate": 0.001635725677830941,
+      "loss": 0.849,
+      "step": 5710
+    },
+    {
+      "epoch": 9.12,
+      "grad_norm": 3.4718000888824463,
+      "learning_rate": 0.0016350877192982457,
+      "loss": 0.873,
+      "step": 5720
+    },
+    {
+      "epoch": 9.14,
+      "grad_norm": 3.4607927799224854,
+      "learning_rate": 0.0016344497607655502,
+      "loss": 0.9231,
+      "step": 5730
+    },
+    {
+      "epoch": 9.15,
+      "grad_norm": 3.6140999794006348,
+      "learning_rate": 0.001633811802232855,
+      "loss": 0.8654,
+      "step": 5740
+    },
+    {
+      "epoch": 9.17,
+      "grad_norm": 4.108109474182129,
+      "learning_rate": 0.0016331738437001596,
+      "loss": 1.1229,
+      "step": 5750
+    },
+    {
+      "epoch": 9.19,
+      "grad_norm": 3.7185311317443848,
+      "learning_rate": 0.0016325358851674641,
+      "loss": 0.8597,
+      "step": 5760
+    },
+    {
+      "epoch": 9.2,
+      "grad_norm": 2.866516351699829,
+      "learning_rate": 0.001631897926634769,
+      "loss": 0.9815,
+      "step": 5770
+    },
+    {
+      "epoch": 9.22,
+      "grad_norm": 3.452366590499878,
+      "learning_rate": 0.0016312599681020735,
+      "loss": 0.8871,
+      "step": 5780
+    },
+    {
+      "epoch": 9.23,
+      "grad_norm": 3.129293203353882,
+      "learning_rate": 0.001630622009569378,
+      "loss": 0.9316,
+      "step": 5790
+    },
+    {
+      "epoch": 9.25,
+      "grad_norm": 3.3350937366485596,
+      "learning_rate": 0.0016299840510366826,
+      "loss": 1.0546,
+      "step": 5800
+    },
+    {
+      "epoch": 9.27,
+      "grad_norm": 3.9579129219055176,
+      "learning_rate": 0.0016293460925039874,
+      "loss": 0.8929,
+      "step": 5810
+    },
+    {
+      "epoch": 9.28,
+      "grad_norm": 2.4661436080932617,
+      "learning_rate": 0.001628708133971292,
+      "loss": 0.8749,
+      "step": 5820
+    },
+    {
+      "epoch": 9.3,
+      "grad_norm": 5.519815444946289,
+      "learning_rate": 0.0016280701754385965,
+      "loss": 1.0323,
+      "step": 5830
+    },
+    {
+      "epoch": 9.31,
+      "grad_norm": 4.336925983428955,
+      "learning_rate": 0.001627432216905901,
+      "loss": 0.9915,
+      "step": 5840
+    },
+    {
+      "epoch": 9.33,
+      "grad_norm": 3.4402873516082764,
+      "learning_rate": 0.0016267942583732058,
+      "loss": 1.0824,
+      "step": 5850
+    },
+    {
+      "epoch": 9.35,
+      "grad_norm": 2.801079034805298,
+      "learning_rate": 0.0016261562998405104,
+      "loss": 1.0676,
+      "step": 5860
+    },
+    {
+      "epoch": 9.36,
+      "grad_norm": 4.0579729080200195,
+      "learning_rate": 0.001625518341307815,
+      "loss": 1.1761,
+      "step": 5870
+    },
+    {
+      "epoch": 9.38,
+      "grad_norm": 3.309401035308838,
+      "learning_rate": 0.0016248803827751197,
+      "loss": 0.9767,
+      "step": 5880
+    },
+    {
+      "epoch": 9.39,
+      "grad_norm": 2.3733794689178467,
+      "learning_rate": 0.0016242424242424243,
+      "loss": 0.874,
+      "step": 5890
+    },
+    {
+      "epoch": 9.41,
+      "grad_norm": 2.4089362621307373,
+      "learning_rate": 0.0016236044657097289,
+      "loss": 0.9179,
+      "step": 5900
+    },
+    {
+      "epoch": 9.43,
+      "grad_norm": 4.605165004730225,
+      "learning_rate": 0.0016229665071770334,
+      "loss": 1.1248,
+      "step": 5910
+    },
+    {
+      "epoch": 9.44,
+      "grad_norm": 5.862342357635498,
+      "learning_rate": 0.0016223285486443382,
+      "loss": 1.1098,
+      "step": 5920
+    },
+    {
+      "epoch": 9.46,
+      "grad_norm": 4.282538890838623,
+      "learning_rate": 0.0016216905901116428,
+      "loss": 0.9369,
+      "step": 5930
+    },
+    {
+      "epoch": 9.47,
+      "grad_norm": 4.155124187469482,
+      "learning_rate": 0.0016210526315789473,
+      "loss": 1.0002,
+      "step": 5940
+    },
+    {
+      "epoch": 9.49,
+      "grad_norm": 2.9133784770965576,
+      "learning_rate": 0.0016204146730462521,
+      "loss": 0.9377,
+      "step": 5950
+    },
+    {
+      "epoch": 9.51,
+      "grad_norm": 3.7865607738494873,
+      "learning_rate": 0.0016197767145135567,
+      "loss": 1.1844,
+      "step": 5960
+    },
+    {
+      "epoch": 9.52,
+      "grad_norm": 4.1235270500183105,
+      "learning_rate": 0.0016191387559808612,
+      "loss": 1.0376,
+      "step": 5970
+    },
+    {
+      "epoch": 9.54,
+      "grad_norm": 2.8695414066314697,
+      "learning_rate": 0.0016185007974481658,
+      "loss": 1.1122,
+      "step": 5980
+    },
+    {
+      "epoch": 9.55,
+      "grad_norm": 4.424979209899902,
+      "learning_rate": 0.0016178628389154706,
+      "loss": 1.0555,
+      "step": 5990
+    },
+    {
+      "epoch": 9.57,
+      "grad_norm": 4.249617576599121,
+      "learning_rate": 0.0016172248803827752,
+      "loss": 1.0784,
+      "step": 6000
+    },
+    {
+      "epoch": 9.59,
+      "grad_norm": 3.7470569610595703,
+      "learning_rate": 0.0016165869218500797,
+      "loss": 1.295,
+      "step": 6010
+    },
+    {
+      "epoch": 9.6,
+      "grad_norm": 3.8228983879089355,
+      "learning_rate": 0.0016159489633173845,
+      "loss": 1.005,
+      "step": 6020
+    },
+    {
+      "epoch": 9.62,
+      "grad_norm": 3.3642499446868896,
+      "learning_rate": 0.001615311004784689,
+      "loss": 1.002,
+      "step": 6030
+    },
+    {
+      "epoch": 9.63,
+      "grad_norm": 3.2379348278045654,
+      "learning_rate": 0.0016146730462519936,
+      "loss": 0.9958,
+      "step": 6040
+    },
+    {
+      "epoch": 9.65,
+      "grad_norm": 2.8118715286254883,
+      "learning_rate": 0.0016140350877192982,
+      "loss": 1.2711,
+      "step": 6050
+    },
+    {
+      "epoch": 9.67,
+      "grad_norm": 4.146730899810791,
+      "learning_rate": 0.001613397129186603,
+      "loss": 1.2352,
+      "step": 6060
+    },
+    {
+      "epoch": 9.68,
+      "grad_norm": 3.6763010025024414,
+      "learning_rate": 0.0016127591706539075,
+      "loss": 1.1935,
+      "step": 6070
+    },
+    {
+      "epoch": 9.7,
+      "grad_norm": 2.510589838027954,
+      "learning_rate": 0.001612121212121212,
+      "loss": 1.2449,
+      "step": 6080
+    },
+    {
+      "epoch": 9.71,
+      "grad_norm": 3.556995153427124,
+      "learning_rate": 0.0016114832535885169,
+      "loss": 1.3257,
+      "step": 6090
+    },
+    {
+      "epoch": 9.73,
+      "grad_norm": 3.670929193496704,
+      "learning_rate": 0.0016108452950558214,
+      "loss": 1.0892,
+      "step": 6100
+    },
+    {
+      "epoch": 9.74,
+      "grad_norm": 3.0864908695220947,
+      "learning_rate": 0.001610207336523126,
+      "loss": 1.0478,
+      "step": 6110
+    },
+    {
+      "epoch": 9.76,
+      "grad_norm": 2.65902042388916,
+      "learning_rate": 0.0016095693779904306,
+      "loss": 1.1211,
+      "step": 6120
+    },
+    {
+      "epoch": 9.78,
+      "grad_norm": 2.6973979473114014,
+      "learning_rate": 0.0016089314194577353,
+      "loss": 1.0727,
+      "step": 6130
+    },
+    {
+      "epoch": 9.79,
+      "grad_norm": 4.574107646942139,
+      "learning_rate": 0.00160829346092504,
+      "loss": 1.0997,
+      "step": 6140
+    },
+    {
+      "epoch": 9.81,
+      "grad_norm": 3.178717613220215,
+      "learning_rate": 0.0016076555023923445,
+      "loss": 1.0726,
+      "step": 6150
+    },
+    {
+      "epoch": 9.82,
+      "grad_norm": 3.5710108280181885,
+      "learning_rate": 0.0016070175438596492,
+      "loss": 1.0341,
+      "step": 6160
+    },
+    {
+      "epoch": 9.84,
+      "grad_norm": 3.28791880607605,
+      "learning_rate": 0.0016063795853269538,
+      "loss": 1.2142,
+      "step": 6170
+    },
+    {
+      "epoch": 9.86,
+      "grad_norm": 2.811490535736084,
+      "learning_rate": 0.0016057416267942584,
+      "loss": 1.2218,
+      "step": 6180
+    },
+    {
+      "epoch": 9.87,
+      "grad_norm": 2.8246653079986572,
+      "learning_rate": 0.001605103668261563,
+      "loss": 1.0756,
+      "step": 6190
+    },
+    {
+      "epoch": 9.89,
+      "grad_norm": 4.228902339935303,
+      "learning_rate": 0.0016044657097288677,
+      "loss": 1.133,
+      "step": 6200
+    },
+    {
+      "epoch": 9.9,
+      "grad_norm": 3.8225128650665283,
+      "learning_rate": 0.0016038277511961723,
+      "loss": 1.2428,
+      "step": 6210
+    },
+    {
+      "epoch": 9.92,
+      "grad_norm": 4.282769680023193,
+      "learning_rate": 0.0016031897926634768,
+      "loss": 1.2566,
+      "step": 6220
+    },
+    {
+      "epoch": 9.94,
+      "grad_norm": 4.843967437744141,
+      "learning_rate": 0.0016025518341307814,
+      "loss": 1.1299,
+      "step": 6230
+    },
+    {
+      "epoch": 9.95,
+      "grad_norm": 3.592618227005005,
+      "learning_rate": 0.0016019138755980862,
+      "loss": 1.2274,
+      "step": 6240
+    },
+    {
+      "epoch": 9.97,
+      "grad_norm": 4.132793426513672,
+      "learning_rate": 0.0016012759170653907,
+      "loss": 1.1314,
+      "step": 6250
+    },
+    {
+      "epoch": 9.98,
+      "grad_norm": 4.065629005432129,
+      "learning_rate": 0.0016006379585326953,
+      "loss": 1.2704,
+      "step": 6260
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 4.359400749206543,
+      "learning_rate": 0.0016,
+      "loss": 1.1157,
+      "step": 6270
+    },
+    {
+      "epoch": 10.02,
+      "grad_norm": 3.357485771179199,
+      "learning_rate": 0.0015993620414673046,
+      "loss": 0.8463,
+      "step": 6280
+    },
+    {
+      "epoch": 10.03,
+      "grad_norm": 3.480729579925537,
+      "learning_rate": 0.0015987240829346092,
+      "loss": 0.7863,
+      "step": 6290
+    },
+    {
+      "epoch": 10.05,
+      "grad_norm": 2.2290802001953125,
+      "learning_rate": 0.0015980861244019138,
+      "loss": 0.7799,
+      "step": 6300
+    },
+    {
+      "epoch": 10.06,
+      "grad_norm": 2.3109190464019775,
+      "learning_rate": 0.0015974481658692185,
+      "loss": 0.6763,
+      "step": 6310
+    },
+    {
+      "epoch": 10.08,
+      "grad_norm": 4.066445350646973,
+      "learning_rate": 0.001596810207336523,
+      "loss": 0.8017,
+      "step": 6320
+    },
+    {
+      "epoch": 10.1,
+      "grad_norm": 3.2394723892211914,
+      "learning_rate": 0.0015961722488038277,
+      "loss": 0.9444,
+      "step": 6330
+    },
+    {
+      "epoch": 10.11,
+      "grad_norm": 3.250441551208496,
+      "learning_rate": 0.0015955342902711324,
+      "loss": 0.8472,
+      "step": 6340
+    },
+    {
+      "epoch": 10.13,
+      "grad_norm": 2.4321706295013428,
+      "learning_rate": 0.001594896331738437,
+      "loss": 0.784,
+      "step": 6350
+    },
+    {
+      "epoch": 10.14,
+      "grad_norm": 2.8150861263275146,
+      "learning_rate": 0.0015942583732057416,
+      "loss": 0.8903,
+      "step": 6360
+    },
+    {
+      "epoch": 10.16,
+      "grad_norm": 2.4516990184783936,
+      "learning_rate": 0.0015936204146730461,
+      "loss": 0.7983,
+      "step": 6370
+    },
+    {
+      "epoch": 10.18,
+      "grad_norm": 4.499327659606934,
+      "learning_rate": 0.001592982456140351,
+      "loss": 0.8822,
+      "step": 6380
+    },
+    {
+      "epoch": 10.19,
+      "grad_norm": 3.4871532917022705,
+      "learning_rate": 0.0015923444976076555,
+      "loss": 0.8933,
+      "step": 6390
+    },
+    {
+      "epoch": 10.21,
+      "grad_norm": 2.1855833530426025,
+      "learning_rate": 0.00159170653907496,
+      "loss": 0.7781,
+      "step": 6400
+    },
+    {
+      "epoch": 10.22,
+      "grad_norm": 2.45394229888916,
+      "learning_rate": 0.0015910685805422648,
+      "loss": 0.7879,
+      "step": 6410
+    },
+    {
+      "epoch": 10.24,
+      "grad_norm": 4.465210914611816,
+      "learning_rate": 0.0015904306220095694,
+      "loss": 0.9825,
+      "step": 6420
+    },
+    {
+      "epoch": 10.26,
+      "grad_norm": 4.026763916015625,
+      "learning_rate": 0.001589792663476874,
+      "loss": 0.9108,
+      "step": 6430
+    },
+    {
+      "epoch": 10.27,
+      "grad_norm": 3.5490238666534424,
+      "learning_rate": 0.0015891547049441785,
+      "loss": 0.8742,
+      "step": 6440
+    },
+    {
+      "epoch": 10.29,
+      "grad_norm": 3.857203960418701,
+      "learning_rate": 0.0015885167464114833,
+      "loss": 0.9243,
+      "step": 6450
+    },
+    {
+      "epoch": 10.3,
+      "grad_norm": 5.296510696411133,
+      "learning_rate": 0.0015878787878787879,
+      "loss": 1.0742,
+      "step": 6460
+    },
+    {
+      "epoch": 10.32,
+      "grad_norm": 3.6743974685668945,
+      "learning_rate": 0.0015872408293460924,
+      "loss": 1.0541,
+      "step": 6470
+    },
+    {
+      "epoch": 10.33,
+      "grad_norm": 3.527785301208496,
+      "learning_rate": 0.0015866028708133972,
+      "loss": 0.958,
+      "step": 6480
+    },
+    {
+      "epoch": 10.35,
+      "grad_norm": 2.7961020469665527,
+      "learning_rate": 0.0015859649122807018,
+      "loss": 1.0794,
+      "step": 6490
+    },
+    {
+      "epoch": 10.37,
+      "grad_norm": 5.286695957183838,
+      "learning_rate": 0.0015853269537480063,
+      "loss": 1.0691,
+      "step": 6500
+    },
+    {
+      "epoch": 10.38,
+      "grad_norm": 2.792459011077881,
+      "learning_rate": 0.0015846889952153109,
+      "loss": 1.005,
+      "step": 6510
+    },
+    {
+      "epoch": 10.4,
+      "grad_norm": 4.287434101104736,
+      "learning_rate": 0.0015840510366826157,
+      "loss": 1.0826,
+      "step": 6520
+    },
+    {
+      "epoch": 10.41,
+      "grad_norm": 3.291612148284912,
+      "learning_rate": 0.0015834130781499202,
+      "loss": 0.9482,
+      "step": 6530
+    },
+    {
+      "epoch": 10.43,
+      "grad_norm": 2.1570498943328857,
+      "learning_rate": 0.0015827751196172248,
+      "loss": 0.8198,
+      "step": 6540
+    },
+    {
+      "epoch": 10.45,
+      "grad_norm": 5.364358901977539,
+      "learning_rate": 0.0015821371610845293,
+      "loss": 0.922,
+      "step": 6550
+    },
+    {
+      "epoch": 10.46,
+      "grad_norm": 2.493326187133789,
+      "learning_rate": 0.0015814992025518341,
+      "loss": 1.0143,
+      "step": 6560
+    },
+    {
+      "epoch": 10.48,
+      "grad_norm": 4.336993217468262,
+      "learning_rate": 0.0015808612440191387,
+      "loss": 0.9091,
+      "step": 6570
+    },
+    {
+      "epoch": 10.49,
+      "grad_norm": 3.2809929847717285,
+      "learning_rate": 0.0015802232854864433,
+      "loss": 0.947,
+      "step": 6580
+    },
+    {
+      "epoch": 10.51,
+      "grad_norm": 3.941453456878662,
+      "learning_rate": 0.001579585326953748,
+      "loss": 1.0424,
+      "step": 6590
+    },
+    {
+      "epoch": 10.53,
+      "grad_norm": 2.2481088638305664,
+      "learning_rate": 0.0015789473684210526,
+      "loss": 0.9591,
+      "step": 6600
+    },
+    {
+      "epoch": 10.54,
+      "grad_norm": 2.889963388442993,
+      "learning_rate": 0.0015783094098883572,
+      "loss": 1.0191,
+      "step": 6610
+    },
+    {
+      "epoch": 10.56,
+      "grad_norm": 3.2319366931915283,
+      "learning_rate": 0.0015776714513556617,
+      "loss": 1.1833,
+      "step": 6620
+    },
+    {
+      "epoch": 10.57,
+      "grad_norm": 2.6110410690307617,
+      "learning_rate": 0.0015770334928229665,
+      "loss": 1.082,
+      "step": 6630
+    },
+    {
+      "epoch": 10.59,
+      "grad_norm": 2.9207470417022705,
+      "learning_rate": 0.001576395534290271,
+      "loss": 0.9628,
+      "step": 6640
+    },
+    {
+      "epoch": 10.61,
+      "grad_norm": 4.300070285797119,
+      "learning_rate": 0.0015757575757575756,
+      "loss": 0.9607,
+      "step": 6650
+    },
+    {
+      "epoch": 10.62,
+      "grad_norm": 3.435377597808838,
+      "learning_rate": 0.0015751196172248804,
+      "loss": 0.9709,
+      "step": 6660
+    },
+    {
+      "epoch": 10.64,
+      "grad_norm": 3.129941940307617,
+      "learning_rate": 0.001574481658692185,
+      "loss": 1.0623,
+      "step": 6670
+    },
+    {
+      "epoch": 10.65,
+      "grad_norm": 3.273089647293091,
+      "learning_rate": 0.0015738437001594895,
+      "loss": 1.0186,
+      "step": 6680
+    },
+    {
+      "epoch": 10.67,
+      "grad_norm": 3.147507667541504,
+      "learning_rate": 0.001573205741626794,
+      "loss": 0.9143,
+      "step": 6690
+    },
+    {
+      "epoch": 10.69,
+      "grad_norm": 3.1906449794769287,
+      "learning_rate": 0.0015725677830940989,
+      "loss": 1.0698,
+      "step": 6700
+    },
+    {
+      "epoch": 10.7,
+      "grad_norm": 2.52282452583313,
+      "learning_rate": 0.0015719298245614034,
+      "loss": 1.2058,
+      "step": 6710
+    },
+    {
+      "epoch": 10.72,
+      "grad_norm": 3.526111602783203,
+      "learning_rate": 0.001571291866028708,
+      "loss": 1.0494,
+      "step": 6720
+    },
+    {
+      "epoch": 10.73,
+      "grad_norm": 4.391296863555908,
+      "learning_rate": 0.001570653907496013,
+      "loss": 1.0639,
+      "step": 6730
+    },
+    {
+      "epoch": 10.75,
+      "grad_norm": 3.623323678970337,
+      "learning_rate": 0.0015700159489633176,
+      "loss": 1.1105,
+      "step": 6740
+    },
+    {
+      "epoch": 10.77,
+      "grad_norm": 3.705646514892578,
+      "learning_rate": 0.0015693779904306221,
+      "loss": 0.964,
+      "step": 6750
+    },
+    {
+      "epoch": 10.78,
+      "grad_norm": 2.726846694946289,
+      "learning_rate": 0.0015687400318979267,
+      "loss": 1.1207,
+      "step": 6760
+    },
+    {
+      "epoch": 10.8,
+      "grad_norm": 2.3796093463897705,
+      "learning_rate": 0.0015681020733652315,
+      "loss": 1.038,
+      "step": 6770
+    },
+    {
+      "epoch": 10.81,
+      "grad_norm": 2.321793794631958,
+      "learning_rate": 0.001567464114832536,
+      "loss": 1.1098,
+      "step": 6780
+    },
+    {
+      "epoch": 10.83,
+      "grad_norm": 4.951314926147461,
+      "learning_rate": 0.0015668261562998406,
+      "loss": 1.0766,
+      "step": 6790
+    },
+    {
+      "epoch": 10.85,
+      "grad_norm": 2.3192481994628906,
+      "learning_rate": 0.0015661881977671454,
+      "loss": 1.1615,
+      "step": 6800
+    },
+    {
+      "epoch": 10.86,
+      "grad_norm": 3.576709508895874,
+      "learning_rate": 0.00156555023923445,
+      "loss": 1.0031,
+      "step": 6810
+    },
+    {
+      "epoch": 10.88,
+      "grad_norm": 3.2805440425872803,
+      "learning_rate": 0.0015649122807017545,
+      "loss": 0.9738,
+      "step": 6820
+    },
+    {
+      "epoch": 10.89,
+      "grad_norm": 3.367990016937256,
+      "learning_rate": 0.001564274322169059,
+      "loss": 0.9948,
+      "step": 6830
+    },
+    {
+      "epoch": 10.91,
+      "grad_norm": 4.358039379119873,
+      "learning_rate": 0.0015636363636363638,
+      "loss": 1.0906,
+      "step": 6840
+    },
+    {
+      "epoch": 10.93,
+      "grad_norm": 3.0704123973846436,
+      "learning_rate": 0.0015629984051036684,
+      "loss": 1.1433,
+      "step": 6850
+    },
+    {
+      "epoch": 10.94,
+      "grad_norm": 3.6105406284332275,
+      "learning_rate": 0.001562360446570973,
+      "loss": 1.0975,
+      "step": 6860
+    },
+    {
+      "epoch": 10.96,
+      "grad_norm": 2.0646121501922607,
+      "learning_rate": 0.0015617224880382775,
+      "loss": 1.1646,
+      "step": 6870
+    },
+    {
+      "epoch": 10.97,
+      "grad_norm": 3.911951780319214,
+      "learning_rate": 0.0015610845295055823,
+      "loss": 1.0135,
+      "step": 6880
+    },
+    {
+      "epoch": 10.99,
+      "grad_norm": 3.6417315006256104,
+      "learning_rate": 0.0015604465709728869,
+      "loss": 1.1479,
+      "step": 6890
+    },
+    {
+      "epoch": 11.0,
+      "grad_norm": 2.3251378536224365,
+      "learning_rate": 0.0015598086124401914,
+      "loss": 1.058,
+      "step": 6900
+    },
+    {
+      "epoch": 11.02,
+      "grad_norm": 2.8822855949401855,
+      "learning_rate": 0.0015591706539074962,
+      "loss": 0.7655,
+      "step": 6910
+    },
+    {
+      "epoch": 11.04,
+      "grad_norm": 3.3327693939208984,
+      "learning_rate": 0.0015585326953748008,
+      "loss": 0.8605,
+      "step": 6920
+    },
+    {
+      "epoch": 11.05,
+      "grad_norm": 2.5779995918273926,
+      "learning_rate": 0.0015578947368421053,
+      "loss": 0.8452,
+      "step": 6930
+    },
+    {
+      "epoch": 11.07,
+      "grad_norm": 3.0843000411987305,
+      "learning_rate": 0.00155725677830941,
+      "loss": 0.7461,
+      "step": 6940
+    },
+    {
+      "epoch": 11.08,
+      "grad_norm": 3.354552984237671,
+      "learning_rate": 0.0015566188197767147,
+      "loss": 0.8311,
+      "step": 6950
+    },
+    {
+      "epoch": 11.1,
+      "grad_norm": 2.7717132568359375,
+      "learning_rate": 0.0015559808612440192,
+      "loss": 0.8472,
+      "step": 6960
+    },
+    {
+      "epoch": 11.12,
+      "grad_norm": 2.2292239665985107,
+      "learning_rate": 0.0015553429027113238,
+      "loss": 0.759,
+      "step": 6970
+    },
+    {
+      "epoch": 11.13,
+      "grad_norm": 4.184642791748047,
+      "learning_rate": 0.0015547049441786286,
+      "loss": 0.7796,
+      "step": 6980
+    },
+    {
+      "epoch": 11.15,
+      "grad_norm": 3.987525463104248,
+      "learning_rate": 0.0015540669856459331,
+      "loss": 0.7885,
+      "step": 6990
+    },
+    {
+      "epoch": 11.16,
+      "grad_norm": 2.9014410972595215,
+      "learning_rate": 0.0015534290271132377,
+      "loss": 0.8425,
+      "step": 7000
+    },
+    {
+      "epoch": 11.18,
+      "grad_norm": 2.4290761947631836,
+      "learning_rate": 0.0015527910685805423,
+      "loss": 0.7046,
+      "step": 7010
+    },
+    {
+      "epoch": 11.2,
+      "grad_norm": 4.5738701820373535,
+      "learning_rate": 0.001552153110047847,
+      "loss": 0.794,
+      "step": 7020
+    },
+    {
+      "epoch": 11.21,
+      "grad_norm": 3.999741792678833,
+      "learning_rate": 0.0015515151515151516,
+      "loss": 0.8193,
+      "step": 7030
+    },
+    {
+      "epoch": 11.23,
+      "grad_norm": 3.0708727836608887,
+      "learning_rate": 0.0015508771929824562,
+      "loss": 0.8712,
+      "step": 7040
+    },
+    {
+      "epoch": 11.24,
+      "grad_norm": 3.396559715270996,
+      "learning_rate": 0.001550239234449761,
+      "loss": 0.8411,
+      "step": 7050
+    },
+    {
+      "epoch": 11.26,
+      "grad_norm": 3.517340898513794,
+      "learning_rate": 0.0015496012759170655,
+      "loss": 0.7907,
+      "step": 7060
+    },
+    {
+      "epoch": 11.28,
+      "grad_norm": 2.170309066772461,
+      "learning_rate": 0.00154896331738437,
+      "loss": 0.7761,
+      "step": 7070
+    },
+    {
+      "epoch": 11.29,
+      "grad_norm": 4.765143871307373,
+      "learning_rate": 0.0015483253588516746,
+      "loss": 1.0072,
+      "step": 7080
+    },
+    {
+      "epoch": 11.31,
+      "grad_norm": 2.595566749572754,
+      "learning_rate": 0.0015476874003189794,
+      "loss": 0.9483,
+      "step": 7090
+    },
+    {
+      "epoch": 11.32,
+      "grad_norm": 3.8784263134002686,
+      "learning_rate": 0.001547049441786284,
+      "loss": 0.8049,
+      "step": 7100
+    },
+    {
+      "epoch": 11.34,
+      "grad_norm": 3.033404588699341,
+      "learning_rate": 0.0015464114832535885,
+      "loss": 0.8574,
+      "step": 7110
+    },
+    {
+      "epoch": 11.36,
+      "grad_norm": 3.059054136276245,
+      "learning_rate": 0.0015457735247208933,
+      "loss": 0.8477,
+      "step": 7120
+    },
+    {
+      "epoch": 11.37,
+      "grad_norm": 4.744221210479736,
+      "learning_rate": 0.0015451355661881979,
+      "loss": 1.0191,
+      "step": 7130
+    },
+    {
+      "epoch": 11.39,
+      "grad_norm": 2.8809046745300293,
+      "learning_rate": 0.0015444976076555024,
+      "loss": 0.8683,
+      "step": 7140
+    },
+    {
+      "epoch": 11.4,
+      "grad_norm": 2.913546323776245,
+      "learning_rate": 0.001543859649122807,
+      "loss": 0.9145,
+      "step": 7150
+    },
+    {
+      "epoch": 11.42,
+      "grad_norm": 3.83941650390625,
+      "learning_rate": 0.0015432216905901118,
+      "loss": 0.8532,
+      "step": 7160
+    },
+    {
+      "epoch": 11.44,
+      "grad_norm": 3.471904754638672,
+      "learning_rate": 0.0015425837320574164,
+      "loss": 0.8605,
+      "step": 7170
+    },
+    {
+      "epoch": 11.45,
+      "grad_norm": 3.6713290214538574,
+      "learning_rate": 0.001541945773524721,
+      "loss": 1.013,
+      "step": 7180
+    },
+    {
+      "epoch": 11.47,
+      "grad_norm": 3.537461996078491,
+      "learning_rate": 0.0015413078149920257,
+      "loss": 0.9493,
+      "step": 7190
+    },
+    {
+      "epoch": 11.48,
+      "grad_norm": 3.101954460144043,
+      "learning_rate": 0.0015406698564593303,
+      "loss": 0.892,
+      "step": 7200
+    },
+    {
+      "epoch": 11.5,
+      "grad_norm": 4.835020542144775,
+      "learning_rate": 0.0015400318979266348,
+      "loss": 0.9719,
+      "step": 7210
+    },
+    {
+      "epoch": 11.52,
+      "grad_norm": 3.35196852684021,
+      "learning_rate": 0.0015393939393939394,
+      "loss": 0.84,
+      "step": 7220
+    },
+    {
+      "epoch": 11.53,
+      "grad_norm": 3.0783281326293945,
+      "learning_rate": 0.0015387559808612442,
+      "loss": 1.0172,
+      "step": 7230
+    },
+    {
+      "epoch": 11.55,
+      "grad_norm": 3.5924274921417236,
+      "learning_rate": 0.0015381180223285487,
+      "loss": 0.9569,
+      "step": 7240
+    },
+    {
+      "epoch": 11.56,
+      "grad_norm": 4.351842403411865,
+      "learning_rate": 0.0015374800637958533,
+      "loss": 0.9124,
+      "step": 7250
+    },
+    {
+      "epoch": 11.58,
+      "grad_norm": 5.1138200759887695,
+      "learning_rate": 0.0015368421052631579,
+      "loss": 1.0265,
+      "step": 7260
+    },
+    {
+      "epoch": 11.59,
+      "grad_norm": 4.592616558074951,
+      "learning_rate": 0.0015362041467304626,
+      "loss": 0.8753,
+      "step": 7270
+    },
+    {
+      "epoch": 11.61,
+      "grad_norm": 2.198404550552368,
+      "learning_rate": 0.0015355661881977672,
+      "loss": 0.9964,
+      "step": 7280
+    },
+    {
+      "epoch": 11.63,
+      "grad_norm": 3.718247175216675,
+      "learning_rate": 0.0015349282296650718,
+      "loss": 0.9982,
+      "step": 7290
+    },
+    {
+      "epoch": 11.64,
+      "grad_norm": 2.973299980163574,
+      "learning_rate": 0.0015342902711323765,
+      "loss": 0.9022,
+      "step": 7300
+    },
+    {
+      "epoch": 11.66,
+      "grad_norm": 3.1553690433502197,
+      "learning_rate": 0.001533652312599681,
+      "loss": 0.8882,
+      "step": 7310
+    },
+    {
+      "epoch": 11.67,
+      "grad_norm": 5.204711437225342,
+      "learning_rate": 0.0015330143540669857,
+      "loss": 0.9439,
+      "step": 7320
+    },
+    {
+      "epoch": 11.69,
+      "grad_norm": 2.575793981552124,
+      "learning_rate": 0.0015323763955342902,
+      "loss": 0.9633,
+      "step": 7330
+    },
+    {
+      "epoch": 11.71,
+      "grad_norm": 3.682734251022339,
+      "learning_rate": 0.001531738437001595,
+      "loss": 0.8831,
+      "step": 7340
+    },
+    {
+      "epoch": 11.72,
+      "grad_norm": 4.238563060760498,
+      "learning_rate": 0.0015311004784688996,
+      "loss": 1.2109,
+      "step": 7350
+    },
+    {
+      "epoch": 11.74,
+      "grad_norm": 4.091822147369385,
+      "learning_rate": 0.0015304625199362041,
+      "loss": 0.9663,
+      "step": 7360
+    },
+    {
+      "epoch": 11.75,
+      "grad_norm": 4.6950154304504395,
+      "learning_rate": 0.001529824561403509,
+      "loss": 0.9541,
+      "step": 7370
+    },
+    {
+      "epoch": 11.77,
+      "grad_norm": 2.6994404792785645,
+      "learning_rate": 0.0015291866028708135,
+      "loss": 0.938,
+      "step": 7380
+    },
+    {
+      "epoch": 11.79,
+      "grad_norm": 3.632509708404541,
+      "learning_rate": 0.001528548644338118,
+      "loss": 1.0444,
+      "step": 7390
+    },
+    {
+      "epoch": 11.8,
+      "grad_norm": 3.1459712982177734,
+      "learning_rate": 0.0015279106858054226,
+      "loss": 1.0143,
+      "step": 7400
+    },
+    {
+      "epoch": 11.82,
+      "grad_norm": 3.5480315685272217,
+      "learning_rate": 0.0015272727272727274,
+      "loss": 1.001,
+      "step": 7410
+    },
+    {
+      "epoch": 11.83,
+      "grad_norm": 2.908008575439453,
+      "learning_rate": 0.001526634768740032,
+      "loss": 1.0674,
+      "step": 7420
+    },
+    {
+      "epoch": 11.85,
+      "grad_norm": 3.147965431213379,
+      "learning_rate": 0.0015259968102073365,
+      "loss": 1.0779,
+      "step": 7430
+    },
+    {
+      "epoch": 11.87,
+      "grad_norm": 3.2961347103118896,
+      "learning_rate": 0.0015253588516746413,
+      "loss": 0.9077,
+      "step": 7440
+    },
+    {
+      "epoch": 11.88,
+      "grad_norm": 3.34252667427063,
+      "learning_rate": 0.0015247208931419458,
+      "loss": 1.1163,
+      "step": 7450
+    },
+    {
+      "epoch": 11.9,
+      "grad_norm": 3.7476675510406494,
+      "learning_rate": 0.0015240829346092504,
+      "loss": 1.0136,
+      "step": 7460
+    },
+    {
+      "epoch": 11.91,
+      "grad_norm": 3.686720609664917,
+      "learning_rate": 0.001523444976076555,
+      "loss": 1.0083,
+      "step": 7470
+    },
+    {
+      "epoch": 11.93,
+      "grad_norm": 3.023853302001953,
+      "learning_rate": 0.0015228070175438597,
+      "loss": 1.1149,
+      "step": 7480
+    },
+    {
+      "epoch": 11.95,
+      "grad_norm": 2.11389422416687,
+      "learning_rate": 0.0015221690590111643,
+      "loss": 0.9332,
+      "step": 7490
+    },
+    {
+      "epoch": 11.96,
+      "grad_norm": 2.868576765060425,
+      "learning_rate": 0.0015215311004784689,
+      "loss": 1.0661,
+      "step": 7500
+    },
+    {
+      "epoch": 11.98,
+      "grad_norm": 2.1617486476898193,
+      "learning_rate": 0.0015208931419457737,
+      "loss": 1.0098,
+      "step": 7510
+    },
+    {
+      "epoch": 11.99,
+      "grad_norm": 3.540294647216797,
+      "learning_rate": 0.0015202551834130782,
+      "loss": 1.032,
+      "step": 7520
+    },
+    {
+      "epoch": 12.01,
+      "grad_norm": 3.1346607208251953,
+      "learning_rate": 0.0015196172248803828,
+      "loss": 0.9046,
+      "step": 7530
+    },
+    {
+      "epoch": 12.03,
+      "grad_norm": 2.131230115890503,
+      "learning_rate": 0.0015189792663476873,
+      "loss": 0.6433,
+      "step": 7540
+    },
+    {
+      "epoch": 12.04,
+      "grad_norm": 1.7812432050704956,
+      "learning_rate": 0.0015183413078149921,
+      "loss": 0.6029,
+      "step": 7550
+    },
+    {
+      "epoch": 12.06,
+      "grad_norm": 3.244680643081665,
+      "learning_rate": 0.0015177033492822967,
+      "loss": 0.7685,
+      "step": 7560
+    },
+    {
+      "epoch": 12.07,
+      "grad_norm": 2.641512393951416,
+      "learning_rate": 0.0015170653907496012,
+      "loss": 0.5811,
+      "step": 7570
+    },
+    {
+      "epoch": 12.09,
+      "grad_norm": 2.1574976444244385,
+      "learning_rate": 0.0015164274322169058,
+      "loss": 0.7704,
+      "step": 7580
+    },
+    {
+      "epoch": 12.11,
+      "grad_norm": 2.7403526306152344,
+      "learning_rate": 0.0015157894736842106,
+      "loss": 0.822,
+      "step": 7590
+    },
+    {
+      "epoch": 12.12,
+      "grad_norm": 4.00333309173584,
+      "learning_rate": 0.0015151515151515152,
+      "loss": 0.6791,
+      "step": 7600
+    },
+    {
+      "epoch": 12.14,
+      "grad_norm": 3.1871447563171387,
+      "learning_rate": 0.0015145135566188197,
+      "loss": 0.6589,
+      "step": 7610
+    },
+    {
+      "epoch": 12.15,
+      "grad_norm": 2.847644567489624,
+      "learning_rate": 0.0015138755980861245,
+      "loss": 0.7128,
+      "step": 7620
+    },
+    {
+      "epoch": 12.17,
+      "grad_norm": 2.5338680744171143,
+      "learning_rate": 0.001513237639553429,
+      "loss": 0.9216,
+      "step": 7630
+    },
+    {
+      "epoch": 12.19,
+      "grad_norm": 2.299643039703369,
+      "learning_rate": 0.0015125996810207336,
+      "loss": 0.8705,
+      "step": 7640
+    },
+    {
+      "epoch": 12.2,
+      "grad_norm": 2.6167166233062744,
+      "learning_rate": 0.0015119617224880382,
+      "loss": 0.7226,
+      "step": 7650
+    },
+    {
+      "epoch": 12.22,
+      "grad_norm": 1.9708589315414429,
+      "learning_rate": 0.001511323763955343,
+      "loss": 0.7894,
+      "step": 7660
+    },
+    {
+      "epoch": 12.23,
+      "grad_norm": 2.8870623111724854,
+      "learning_rate": 0.0015106858054226475,
+      "loss": 0.7716,
+      "step": 7670
+    },
+    {
+      "epoch": 12.25,
+      "grad_norm": 2.571887493133545,
+      "learning_rate": 0.001510047846889952,
+      "loss": 0.6934,
+      "step": 7680
+    },
+    {
+      "epoch": 12.26,
+      "grad_norm": 3.059251070022583,
+      "learning_rate": 0.0015094098883572569,
+      "loss": 0.7272,
+      "step": 7690
+    },
+    {
+      "epoch": 12.28,
+      "grad_norm": 2.94647216796875,
+      "learning_rate": 0.0015087719298245614,
+      "loss": 0.7966,
+      "step": 7700
+    },
+    {
+      "epoch": 12.3,
+      "grad_norm": 2.6510915756225586,
+      "learning_rate": 0.001508133971291866,
+      "loss": 0.735,
+      "step": 7710
+    },
+    {
+      "epoch": 12.31,
+      "grad_norm": 2.9655959606170654,
+      "learning_rate": 0.0015074960127591706,
+      "loss": 0.8989,
+      "step": 7720
+    },
+    {
+      "epoch": 12.33,
+      "grad_norm": 2.72773814201355,
+      "learning_rate": 0.0015068580542264753,
+      "loss": 0.8239,
+      "step": 7730
+    },
+    {
+      "epoch": 12.34,
+      "grad_norm": 2.8079593181610107,
+      "learning_rate": 0.00150622009569378,
+      "loss": 0.7945,
+      "step": 7740
+    },
+    {
+      "epoch": 12.36,
+      "grad_norm": 2.3012099266052246,
+      "learning_rate": 0.0015055821371610845,
+      "loss": 0.8224,
+      "step": 7750
+    },
+    {
+      "epoch": 12.38,
+      "grad_norm": 3.559399127960205,
+      "learning_rate": 0.0015049441786283892,
+      "loss": 0.7912,
+      "step": 7760
+    },
+    {
+      "epoch": 12.39,
+      "grad_norm": 2.993138551712036,
+      "learning_rate": 0.0015043062200956938,
+      "loss": 0.8127,
+      "step": 7770
+    },
+    {
+      "epoch": 12.41,
+      "grad_norm": 3.5749433040618896,
+      "learning_rate": 0.0015036682615629984,
+      "loss": 0.8071,
+      "step": 7780
+    },
+    {
+      "epoch": 12.42,
+      "grad_norm": 2.879560947418213,
+      "learning_rate": 0.001503030303030303,
+      "loss": 0.8794,
+      "step": 7790
+    },
+    {
+      "epoch": 12.44,
+      "grad_norm": 3.648130416870117,
+      "learning_rate": 0.0015023923444976077,
+      "loss": 0.8795,
+      "step": 7800
+    },
+    {
+      "epoch": 12.46,
+      "grad_norm": 5.283175468444824,
+      "learning_rate": 0.0015017543859649123,
+      "loss": 1.0504,
+      "step": 7810
+    },
+    {
+      "epoch": 12.47,
+      "grad_norm": 3.602062940597534,
+      "learning_rate": 0.0015011164274322168,
+      "loss": 0.8283,
+      "step": 7820
+    },
+    {
+      "epoch": 12.49,
+      "grad_norm": 2.755488872528076,
+      "learning_rate": 0.0015004784688995216,
+      "loss": 0.8203,
+      "step": 7830
+    },
+    {
+      "epoch": 12.5,
+      "grad_norm": 3.24674654006958,
+      "learning_rate": 0.0014998405103668262,
+      "loss": 0.7843,
+      "step": 7840
+    },
+    {
+      "epoch": 12.52,
+      "grad_norm": 2.072895050048828,
+      "learning_rate": 0.0014992025518341307,
+      "loss": 0.7738,
+      "step": 7850
+    },
+    {
+      "epoch": 12.54,
+      "grad_norm": 4.0108208656311035,
+      "learning_rate": 0.0014985645933014353,
+      "loss": 0.8728,
+      "step": 7860
+    },
+    {
+      "epoch": 12.55,
+      "grad_norm": 2.896224021911621,
+      "learning_rate": 0.00149792663476874,
+      "loss": 0.823,
+      "step": 7870
+    },
+    {
+      "epoch": 12.57,
+      "grad_norm": 3.3562960624694824,
+      "learning_rate": 0.0014972886762360446,
+      "loss": 1.0389,
+      "step": 7880
+    },
+    {
+      "epoch": 12.58,
+      "grad_norm": 3.14931058883667,
+      "learning_rate": 0.0014966507177033492,
+      "loss": 0.9448,
+      "step": 7890
+    },
+    {
+      "epoch": 12.6,
+      "grad_norm": 6.942476272583008,
+      "learning_rate": 0.001496012759170654,
+      "loss": 0.8526,
+      "step": 7900
+    },
+    {
+      "epoch": 12.62,
+      "grad_norm": 2.516266107559204,
+      "learning_rate": 0.0014953748006379585,
+      "loss": 0.8342,
+      "step": 7910
+    },
+    {
+      "epoch": 12.63,
+      "grad_norm": 2.6325111389160156,
+      "learning_rate": 0.001494736842105263,
+      "loss": 0.9933,
+      "step": 7920
+    },
+    {
+      "epoch": 12.65,
+      "grad_norm": 3.630423069000244,
+      "learning_rate": 0.0014940988835725677,
+      "loss": 0.8403,
+      "step": 7930
+    },
+    {
+      "epoch": 12.66,
+      "grad_norm": 3.6334409713745117,
+      "learning_rate": 0.0014934609250398724,
+      "loss": 1.0628,
+      "step": 7940
+    },
+    {
+      "epoch": 12.68,
+      "grad_norm": 3.110170841217041,
+      "learning_rate": 0.001492822966507177,
+      "loss": 0.8604,
+      "step": 7950
+    },
+    {
+      "epoch": 12.7,
+      "grad_norm": 3.0557703971862793,
+      "learning_rate": 0.0014921850079744816,
+      "loss": 0.9121,
+      "step": 7960
+    },
+    {
+      "epoch": 12.71,
+      "grad_norm": 3.6271071434020996,
+      "learning_rate": 0.0014915470494417861,
+      "loss": 0.9177,
+      "step": 7970
+    },
+    {
+      "epoch": 12.73,
+      "grad_norm": 3.5513288974761963,
+      "learning_rate": 0.001490909090909091,
+      "loss": 0.8542,
+      "step": 7980
+    },
+    {
+      "epoch": 12.74,
+      "grad_norm": 4.270805358886719,
+      "learning_rate": 0.0014902711323763955,
+      "loss": 0.9907,
+      "step": 7990
+    },
+    {
+      "epoch": 12.76,
+      "grad_norm": 2.8084616661071777,
+      "learning_rate": 0.0014896331738437,
+      "loss": 0.9405,
+      "step": 8000
+    },
+    {
+      "epoch": 12.78,
+      "grad_norm": 5.405944347381592,
+      "learning_rate": 0.0014889952153110048,
+      "loss": 0.8483,
+      "step": 8010
+    },
+    {
+      "epoch": 12.79,
+      "grad_norm": 3.2791013717651367,
+      "learning_rate": 0.0014883572567783094,
+      "loss": 0.9408,
+      "step": 8020
+    },
+    {
+      "epoch": 12.81,
+      "grad_norm": 3.3789143562316895,
+      "learning_rate": 0.001487719298245614,
+      "loss": 1.0307,
+      "step": 8030
+    },
+    {
+      "epoch": 12.82,
+      "grad_norm": 3.513697624206543,
+      "learning_rate": 0.0014870813397129185,
+      "loss": 0.8657,
+      "step": 8040
+    },
+    {
+      "epoch": 12.84,
+      "grad_norm": 3.4501123428344727,
+      "learning_rate": 0.0014864433811802233,
+      "loss": 0.9202,
+      "step": 8050
+    },
+    {
+      "epoch": 12.85,
+      "grad_norm": 3.0335283279418945,
+      "learning_rate": 0.0014858054226475279,
+      "loss": 0.941,
+      "step": 8060
+    },
+    {
+      "epoch": 12.87,
+      "grad_norm": 3.0770187377929688,
+      "learning_rate": 0.0014851674641148324,
+      "loss": 0.9562,
+      "step": 8070
+    },
+    {
+      "epoch": 12.89,
+      "grad_norm": 2.967750310897827,
+      "learning_rate": 0.0014845295055821372,
+      "loss": 0.9318,
+      "step": 8080
+    },
+    {
+      "epoch": 12.9,
+      "grad_norm": 4.517429828643799,
+      "learning_rate": 0.0014838915470494418,
+      "loss": 0.9225,
+      "step": 8090
+    },
+    {
+      "epoch": 12.92,
+      "grad_norm": 4.639514923095703,
+      "learning_rate": 0.0014832535885167463,
+      "loss": 0.8997,
+      "step": 8100
+    },
+    {
+      "epoch": 12.93,
+      "grad_norm": 4.017191410064697,
+      "learning_rate": 0.0014826156299840509,
+      "loss": 1.0325,
+      "step": 8110
+    },
+    {
+      "epoch": 12.95,
+      "grad_norm": 4.688587188720703,
+      "learning_rate": 0.0014819776714513557,
+      "loss": 0.8542,
+      "step": 8120
+    },
+    {
+      "epoch": 12.97,
+      "grad_norm": 5.4787821769714355,
+      "learning_rate": 0.0014813397129186602,
+      "loss": 1.2567,
+      "step": 8130
+    },
+    {
+      "epoch": 12.98,
+      "grad_norm": 3.8270418643951416,
+      "learning_rate": 0.0014807017543859648,
+      "loss": 0.9071,
+      "step": 8140
+    },
+    {
+      "epoch": 13.0,
+      "grad_norm": 5.171020984649658,
+      "learning_rate": 0.0014800637958532696,
+      "loss": 0.9396,
+      "step": 8150
+    },
+    {
+      "epoch": 13.01,
+      "grad_norm": 2.2651660442352295,
+      "learning_rate": 0.0014794258373205741,
+      "loss": 0.6466,
+      "step": 8160
+    },
+    {
+      "epoch": 13.03,
+      "grad_norm": 1.7244137525558472,
+      "learning_rate": 0.0014787878787878787,
+      "loss": 0.5765,
+      "step": 8170
+    },
+    {
+      "epoch": 13.05,
+      "grad_norm": 2.143556833267212,
+      "learning_rate": 0.0014781499202551833,
+      "loss": 0.6964,
+      "step": 8180
+    },
+    {
+      "epoch": 13.06,
+      "grad_norm": 3.048412561416626,
+      "learning_rate": 0.001477511961722488,
+      "loss": 0.5957,
+      "step": 8190
+    },
+    {
+      "epoch": 13.08,
+      "grad_norm": 3.002617120742798,
+      "learning_rate": 0.0014768740031897926,
+      "loss": 0.6172,
+      "step": 8200
+    },
+    {
+      "epoch": 13.09,
+      "grad_norm": 2.4327642917633057,
+      "learning_rate": 0.0014762360446570972,
+      "loss": 0.6952,
+      "step": 8210
+    },
+    {
+      "epoch": 13.11,
+      "grad_norm": 3.3259124755859375,
+      "learning_rate": 0.0014755980861244022,
+      "loss": 0.7637,
+      "step": 8220
+    },
+    {
+      "epoch": 13.13,
+      "grad_norm": 2.1302742958068848,
+      "learning_rate": 0.0014749601275917067,
+      "loss": 0.8759,
+      "step": 8230
+    },
+    {
+      "epoch": 13.14,
+      "grad_norm": 2.8593993186950684,
+      "learning_rate": 0.0014743221690590113,
+      "loss": 0.6421,
+      "step": 8240
+    },
+    {
+      "epoch": 13.16,
+      "grad_norm": 3.1945838928222656,
+      "learning_rate": 0.0014736842105263158,
+      "loss": 0.8016,
+      "step": 8250
+    },
+    {
+      "epoch": 13.17,
+      "grad_norm": 2.6106722354888916,
+      "learning_rate": 0.0014730462519936206,
+      "loss": 0.9062,
+      "step": 8260
+    },
+    {
+      "epoch": 13.19,
+      "grad_norm": 2.938920021057129,
+      "learning_rate": 0.0014724082934609252,
+      "loss": 0.6848,
+      "step": 8270
+    },
+    {
+      "epoch": 13.21,
+      "grad_norm": 2.4809677600860596,
+      "learning_rate": 0.0014717703349282297,
+      "loss": 0.7019,
+      "step": 8280
+    },
+    {
+      "epoch": 13.22,
+      "grad_norm": 3.0914158821105957,
+      "learning_rate": 0.0014711323763955343,
+      "loss": 0.8232,
+      "step": 8290
+    },
+    {
+      "epoch": 13.24,
+      "grad_norm": 3.0564115047454834,
+      "learning_rate": 0.001470494417862839,
+      "loss": 0.7435,
+      "step": 8300
+    },
+    {
+      "epoch": 13.25,
+      "grad_norm": 3.3561959266662598,
+      "learning_rate": 0.0014698564593301437,
+      "loss": 0.7295,
+      "step": 8310
+    },
+    {
+      "epoch": 13.27,
+      "grad_norm": 1.9883933067321777,
+      "learning_rate": 0.0014692185007974482,
+      "loss": 0.7224,
+      "step": 8320
+    },
+    {
+      "epoch": 13.29,
+      "grad_norm": 2.7677059173583984,
+      "learning_rate": 0.001468580542264753,
+      "loss": 0.7052,
+      "step": 8330
+    },
+    {
+      "epoch": 13.3,
+      "grad_norm": 2.8097822666168213,
+      "learning_rate": 0.0014679425837320576,
+      "loss": 0.8795,
+      "step": 8340
+    },
+    {
+      "epoch": 13.32,
+      "grad_norm": 2.9403786659240723,
+      "learning_rate": 0.0014673046251993621,
+      "loss": 0.8753,
+      "step": 8350
+    },
+    {
+      "epoch": 13.33,
+      "grad_norm": 2.103468179702759,
+      "learning_rate": 0.0014666666666666667,
+      "loss": 0.7013,
+      "step": 8360
+    },
+    {
+      "epoch": 13.35,
+      "grad_norm": 4.1119489669799805,
+      "learning_rate": 0.0014660287081339715,
+      "loss": 0.7666,
+      "step": 8370
+    },
+    {
+      "epoch": 13.37,
+      "grad_norm": 2.627279758453369,
+      "learning_rate": 0.001465390749601276,
+      "loss": 0.912,
+      "step": 8380
+    },
+    {
+      "epoch": 13.38,
+      "grad_norm": 3.824855327606201,
+      "learning_rate": 0.0014647527910685806,
+      "loss": 0.8233,
+      "step": 8390
+    },
+    {
+      "epoch": 13.4,
+      "grad_norm": 2.9254772663116455,
+      "learning_rate": 0.0014641148325358854,
+      "loss": 0.7541,
+      "step": 8400
+    },
+    {
+      "epoch": 13.41,
+      "grad_norm": 3.6978065967559814,
+      "learning_rate": 0.00146347687400319,
+      "loss": 0.7604,
+      "step": 8410
+    },
+    {
+      "epoch": 13.43,
+      "grad_norm": 2.875696897506714,
+      "learning_rate": 0.0014628389154704945,
+      "loss": 0.7459,
+      "step": 8420
+    },
+    {
+      "epoch": 13.44,
+      "grad_norm": 3.1799988746643066,
+      "learning_rate": 0.001462200956937799,
+      "loss": 0.7081,
+      "step": 8430
+    },
+    {
+      "epoch": 13.46,
+      "grad_norm": 1.9684711694717407,
+      "learning_rate": 0.0014615629984051038,
+      "loss": 0.7558,
+      "step": 8440
+    },
+    {
+      "epoch": 13.48,
+      "grad_norm": 2.5012054443359375,
+      "learning_rate": 0.0014609250398724084,
+      "loss": 0.8078,
+      "step": 8450
+    },
+    {
+      "epoch": 13.49,
+      "grad_norm": 2.650980234146118,
+      "learning_rate": 0.001460287081339713,
+      "loss": 0.7813,
+      "step": 8460
+    },
+    {
+      "epoch": 13.51,
+      "grad_norm": 2.5888872146606445,
+      "learning_rate": 0.0014596491228070177,
+      "loss": 0.8004,
+      "step": 8470
+    },
+    {
+      "epoch": 13.52,
+      "grad_norm": 4.472870349884033,
+      "learning_rate": 0.0014590111642743223,
+      "loss": 0.868,
+      "step": 8480
+    },
+    {
+      "epoch": 13.54,
+      "grad_norm": 2.7261528968811035,
+      "learning_rate": 0.0014583732057416269,
+      "loss": 0.7751,
+      "step": 8490
+    },
+    {
+      "epoch": 13.56,
+      "grad_norm": 5.15382194519043,
+      "learning_rate": 0.0014577352472089314,
+      "loss": 0.8288,
+      "step": 8500
+    },
+    {
+      "epoch": 13.57,
+      "grad_norm": 3.0572078227996826,
+      "learning_rate": 0.0014570972886762362,
+      "loss": 0.9282,
+      "step": 8510
+    },
+    {
+      "epoch": 13.59,
+      "grad_norm": 2.779832363128662,
+      "learning_rate": 0.0014564593301435408,
+      "loss": 0.8,
+      "step": 8520
+    },
+    {
+      "epoch": 13.6,
+      "grad_norm": 3.26220965385437,
+      "learning_rate": 0.0014558213716108453,
+      "loss": 0.7932,
+      "step": 8530
+    },
+    {
+      "epoch": 13.62,
+      "grad_norm": 5.765030384063721,
+      "learning_rate": 0.0014551834130781501,
+      "loss": 0.7697,
+      "step": 8540
+    },
+    {
+      "epoch": 13.64,
+      "grad_norm": 3.393489122390747,
+      "learning_rate": 0.0014545454545454547,
+      "loss": 0.7436,
+      "step": 8550
+    },
+    {
+      "epoch": 13.65,
+      "grad_norm": 3.4582221508026123,
+      "learning_rate": 0.0014539074960127592,
+      "loss": 0.8099,
+      "step": 8560
+    },
+    {
+      "epoch": 13.67,
+      "grad_norm": 2.931617498397827,
+      "learning_rate": 0.0014532695374800638,
+      "loss": 0.8833,
+      "step": 8570
+    },
+    {
+      "epoch": 13.68,
+      "grad_norm": 3.149649143218994,
+      "learning_rate": 0.0014526315789473686,
+      "loss": 0.826,
+      "step": 8580
+    },
+    {
+      "epoch": 13.7,
+      "grad_norm": 2.6606719493865967,
+      "learning_rate": 0.0014519936204146731,
+      "loss": 0.9458,
+      "step": 8590
+    },
+    {
+      "epoch": 13.72,
+      "grad_norm": 3.6608333587646484,
+      "learning_rate": 0.0014513556618819777,
+      "loss": 0.8182,
+      "step": 8600
+    },
+    {
+      "epoch": 13.73,
+      "grad_norm": 4.276224136352539,
+      "learning_rate": 0.0014507177033492825,
+      "loss": 0.8656,
+      "step": 8610
+    },
+    {
+      "epoch": 13.75,
+      "grad_norm": 3.306110382080078,
+      "learning_rate": 0.001450079744816587,
+      "loss": 0.8356,
+      "step": 8620
+    },
+    {
+      "epoch": 13.76,
+      "grad_norm": 3.0018744468688965,
+      "learning_rate": 0.0014494417862838916,
+      "loss": 0.8602,
+      "step": 8630
+    },
+    {
+      "epoch": 13.78,
+      "grad_norm": 3.3632960319519043,
+      "learning_rate": 0.0014488038277511962,
+      "loss": 1.0159,
+      "step": 8640
+    },
+    {
+      "epoch": 13.8,
+      "grad_norm": 2.006432056427002,
+      "learning_rate": 0.001448165869218501,
+      "loss": 0.8172,
+      "step": 8650
+    },
+    {
+      "epoch": 13.81,
+      "grad_norm": 3.5842230319976807,
+      "learning_rate": 0.0014475279106858055,
+      "loss": 0.8222,
+      "step": 8660
+    },
+    {
+      "epoch": 13.83,
+      "grad_norm": 3.855170488357544,
+      "learning_rate": 0.00144688995215311,
+      "loss": 0.8993,
+      "step": 8670
+    },
+    {
+      "epoch": 13.84,
+      "grad_norm": 3.3235816955566406,
+      "learning_rate": 0.0014462519936204146,
+      "loss": 0.8374,
+      "step": 8680
+    },
+    {
+      "epoch": 13.86,
+      "grad_norm": 3.43414568901062,
+      "learning_rate": 0.0014456140350877194,
+      "loss": 0.9525,
+      "step": 8690
+    },
+    {
+      "epoch": 13.88,
+      "grad_norm": 3.4128949642181396,
+      "learning_rate": 0.001444976076555024,
+      "loss": 0.8527,
+      "step": 8700
+    },
+    {
+      "epoch": 13.89,
+      "grad_norm": 5.165436744689941,
+      "learning_rate": 0.0014443381180223285,
+      "loss": 0.8698,
+      "step": 8710
+    },
+    {
+      "epoch": 13.91,
+      "grad_norm": 3.940591812133789,
+      "learning_rate": 0.0014437001594896333,
+      "loss": 0.9243,
+      "step": 8720
+    },
+    {
+      "epoch": 13.92,
+      "grad_norm": 3.3081157207489014,
+      "learning_rate": 0.0014430622009569379,
+      "loss": 0.9268,
+      "step": 8730
+    },
+    {
+      "epoch": 13.94,
+      "grad_norm": 3.6998980045318604,
+      "learning_rate": 0.0014424242424242424,
+      "loss": 0.9113,
+      "step": 8740
+    },
+    {
+      "epoch": 13.96,
+      "grad_norm": 3.386359214782715,
+      "learning_rate": 0.001441786283891547,
+      "loss": 0.9067,
+      "step": 8750
+    },
+    {
+      "epoch": 13.97,
+      "grad_norm": 2.559299945831299,
+      "learning_rate": 0.0014411483253588518,
+      "loss": 0.9951,
+      "step": 8760
+    },
+    {
+      "epoch": 13.99,
+      "grad_norm": 2.8027663230895996,
+      "learning_rate": 0.0014405103668261564,
+      "loss": 1.0755,
+      "step": 8770
+    },
+    {
+      "epoch": 14.0,
+      "grad_norm": 2.2676618099212646,
+      "learning_rate": 0.001439872408293461,
+      "loss": 0.7299,
+      "step": 8780
+    },
+    {
+      "epoch": 14.02,
+      "grad_norm": 3.9087278842926025,
+      "learning_rate": 0.0014392344497607657,
+      "loss": 0.8103,
+      "step": 8790
+    },
+    {
+      "epoch": 14.04,
+      "grad_norm": 3.4694509506225586,
+      "learning_rate": 0.0014385964912280703,
+      "loss": 0.6432,
+      "step": 8800
+    },
+    {
+      "epoch": 14.05,
+      "grad_norm": 2.3048603534698486,
+      "learning_rate": 0.0014379585326953748,
+      "loss": 0.5185,
+      "step": 8810
+    },
+    {
+      "epoch": 14.07,
+      "grad_norm": 3.251046895980835,
+      "learning_rate": 0.0014373205741626794,
+      "loss": 0.6668,
+      "step": 8820
+    },
+    {
+      "epoch": 14.08,
+      "grad_norm": 1.8965840339660645,
+      "learning_rate": 0.0014366826156299842,
+      "loss": 0.5914,
+      "step": 8830
+    },
+    {
+      "epoch": 14.1,
+      "grad_norm": 4.089531421661377,
+      "learning_rate": 0.0014360446570972887,
+      "loss": 0.6342,
+      "step": 8840
+    },
+    {
+      "epoch": 14.11,
+      "grad_norm": 2.4392364025115967,
+      "learning_rate": 0.0014354066985645933,
+      "loss": 0.6801,
+      "step": 8850
+    },
+    {
+      "epoch": 14.13,
+      "grad_norm": 2.7692840099334717,
+      "learning_rate": 0.001434768740031898,
+      "loss": 0.7158,
+      "step": 8860
+    },
+    {
+      "epoch": 14.15,
+      "grad_norm": 2.2414801120758057,
+      "learning_rate": 0.0014341307814992026,
+      "loss": 0.6251,
+      "step": 8870
+    },
+    {
+      "epoch": 14.16,
+      "grad_norm": 2.941929578781128,
+      "learning_rate": 0.0014334928229665072,
+      "loss": 0.7273,
+      "step": 8880
+    },
+    {
+      "epoch": 14.18,
+      "grad_norm": 2.245312452316284,
+      "learning_rate": 0.0014328548644338118,
+      "loss": 0.6797,
+      "step": 8890
+    },
+    {
+      "epoch": 14.19,
+      "grad_norm": 2.1441662311553955,
+      "learning_rate": 0.0014322169059011165,
+      "loss": 0.6244,
+      "step": 8900
+    },
+    {
+      "epoch": 14.21,
+      "grad_norm": 3.0492477416992188,
+      "learning_rate": 0.001431578947368421,
+      "loss": 0.6268,
+      "step": 8910
+    },
+    {
+      "epoch": 14.23,
+      "grad_norm": 2.6444950103759766,
+      "learning_rate": 0.0014309409888357257,
+      "loss": 0.6373,
+      "step": 8920
+    },
+    {
+      "epoch": 14.24,
+      "grad_norm": 2.9322099685668945,
+      "learning_rate": 0.0014303030303030304,
+      "loss": 0.7353,
+      "step": 8930
+    },
+    {
+      "epoch": 14.26,
+      "grad_norm": 2.753868341445923,
+      "learning_rate": 0.001429665071770335,
+      "loss": 0.6592,
+      "step": 8940
+    },
+    {
+      "epoch": 14.27,
+      "grad_norm": 3.1307361125946045,
+      "learning_rate": 0.0014290271132376396,
+      "loss": 0.6678,
+      "step": 8950
+    },
+    {
+      "epoch": 14.29,
+      "grad_norm": 2.1127524375915527,
+      "learning_rate": 0.0014283891547049441,
+      "loss": 0.6473,
+      "step": 8960
+    },
+    {
+      "epoch": 14.31,
+      "grad_norm": 2.359909772872925,
+      "learning_rate": 0.001427751196172249,
+      "loss": 0.8442,
+      "step": 8970
+    },
+    {
+      "epoch": 14.32,
+      "grad_norm": 3.395587205886841,
+      "learning_rate": 0.0014271132376395535,
+      "loss": 0.6634,
+      "step": 8980
+    },
+    {
+      "epoch": 14.34,
+      "grad_norm": 3.500505208969116,
+      "learning_rate": 0.001426475279106858,
+      "loss": 0.6735,
+      "step": 8990
+    },
+    {
+      "epoch": 14.35,
+      "grad_norm": 1.948743224143982,
+      "learning_rate": 0.0014258373205741626,
+      "loss": 0.8773,
+      "step": 9000
+    },
+    {
+      "epoch": 14.37,
+      "grad_norm": 4.593191146850586,
+      "learning_rate": 0.0014251993620414674,
+      "loss": 0.7344,
+      "step": 9010
+    },
+    {
+      "epoch": 14.39,
+      "grad_norm": 2.9138360023498535,
+      "learning_rate": 0.001424561403508772,
+      "loss": 0.7962,
+      "step": 9020
+    },
+    {
+      "epoch": 14.4,
+      "grad_norm": 2.7665469646453857,
+      "learning_rate": 0.0014239234449760765,
+      "loss": 0.7066,
+      "step": 9030
+    },
+    {
+      "epoch": 14.42,
+      "grad_norm": 2.5287930965423584,
+      "learning_rate": 0.0014232854864433813,
+      "loss": 0.799,
+      "step": 9040
+    },
+    {
+      "epoch": 14.43,
+      "grad_norm": 1.9143520593643188,
+      "learning_rate": 0.0014226475279106858,
+      "loss": 0.877,
+      "step": 9050
+    },
+    {
+      "epoch": 14.45,
+      "grad_norm": 3.114867925643921,
+      "learning_rate": 0.0014220095693779904,
+      "loss": 0.7229,
+      "step": 9060
+    },
+    {
+      "epoch": 14.47,
+      "grad_norm": 4.132133960723877,
+      "learning_rate": 0.001421371610845295,
+      "loss": 0.7723,
+      "step": 9070
+    },
+    {
+      "epoch": 14.48,
+      "grad_norm": 2.8847928047180176,
+      "learning_rate": 0.0014207336523125997,
+      "loss": 0.8349,
+      "step": 9080
+    },
+    {
+      "epoch": 14.5,
+      "grad_norm": 4.3192009925842285,
+      "learning_rate": 0.0014200956937799043,
+      "loss": 0.7454,
+      "step": 9090
+    },
+    {
+      "epoch": 14.51,
+      "grad_norm": 2.5490753650665283,
+      "learning_rate": 0.0014194577352472089,
+      "loss": 0.8047,
+      "step": 9100
+    },
+    {
+      "epoch": 14.53,
+      "grad_norm": 3.995173215866089,
+      "learning_rate": 0.0014188197767145137,
+      "loss": 0.7209,
+      "step": 9110
+    },
+    {
+      "epoch": 14.55,
+      "grad_norm": 3.334613084793091,
+      "learning_rate": 0.0014181818181818182,
+      "loss": 0.9342,
+      "step": 9120
+    },
+    {
+      "epoch": 14.56,
+      "grad_norm": 2.7369375228881836,
+      "learning_rate": 0.0014175438596491228,
+      "loss": 0.7473,
+      "step": 9130
+    },
+    {
+      "epoch": 14.58,
+      "grad_norm": 4.180137634277344,
+      "learning_rate": 0.0014169059011164273,
+      "loss": 0.8705,
+      "step": 9140
+    },
+    {
+      "epoch": 14.59,
+      "grad_norm": 3.7026357650756836,
+      "learning_rate": 0.0014162679425837321,
+      "loss": 0.7836,
+      "step": 9150
+    },
+    {
+      "epoch": 14.61,
+      "grad_norm": 1.8971599340438843,
+      "learning_rate": 0.0014156299840510367,
+      "loss": 0.7062,
+      "step": 9160
+    },
+    {
+      "epoch": 14.63,
+      "grad_norm": 2.8083083629608154,
+      "learning_rate": 0.0014149920255183412,
+      "loss": 0.7808,
+      "step": 9170
+    },
+    {
+      "epoch": 14.64,
+      "grad_norm": 2.1013123989105225,
+      "learning_rate": 0.001414354066985646,
+      "loss": 0.919,
+      "step": 9180
+    },
+    {
+      "epoch": 14.66,
+      "grad_norm": 2.5876877307891846,
+      "learning_rate": 0.0014137161084529506,
+      "loss": 0.7824,
+      "step": 9190
+    },
+    {
+      "epoch": 14.67,
+      "grad_norm": 2.3595352172851562,
+      "learning_rate": 0.0014130781499202552,
+      "loss": 0.8545,
+      "step": 9200
+    },
+    {
+      "epoch": 14.69,
+      "grad_norm": 6.161678314208984,
+      "learning_rate": 0.0014124401913875597,
+      "loss": 0.764,
+      "step": 9210
+    },
+    {
+      "epoch": 14.7,
+      "grad_norm": 2.7124509811401367,
+      "learning_rate": 0.0014118022328548645,
+      "loss": 0.7967,
+      "step": 9220
+    },
+    {
+      "epoch": 14.72,
+      "grad_norm": 3.200411081314087,
+      "learning_rate": 0.001411164274322169,
+      "loss": 0.8589,
+      "step": 9230
+    },
+    {
+      "epoch": 14.74,
+      "grad_norm": 1.9819875955581665,
+      "learning_rate": 0.0014105263157894736,
+      "loss": 0.809,
+      "step": 9240
+    },
+    {
+      "epoch": 14.75,
+      "grad_norm": 3.223145008087158,
+      "learning_rate": 0.0014098883572567784,
+      "loss": 0.739,
+      "step": 9250
+    },
+    {
+      "epoch": 14.77,
+      "grad_norm": 3.0328469276428223,
+      "learning_rate": 0.001409250398724083,
+      "loss": 0.8469,
+      "step": 9260
+    },
+    {
+      "epoch": 14.78,
+      "grad_norm": 2.144221305847168,
+      "learning_rate": 0.0014086124401913875,
+      "loss": 0.9141,
+      "step": 9270
+    },
+    {
+      "epoch": 14.8,
+      "grad_norm": 2.3607845306396484,
+      "learning_rate": 0.001407974481658692,
+      "loss": 0.8812,
+      "step": 9280
+    },
+    {
+      "epoch": 14.82,
+      "grad_norm": 2.356010913848877,
+      "learning_rate": 0.0014073365231259969,
+      "loss": 0.7773,
+      "step": 9290
+    },
+    {
+      "epoch": 14.83,
+      "grad_norm": 3.326063394546509,
+      "learning_rate": 0.0014066985645933014,
+      "loss": 0.9775,
+      "step": 9300
+    },
+    {
+      "epoch": 14.85,
+      "grad_norm": 3.0373737812042236,
+      "learning_rate": 0.001406060606060606,
+      "loss": 0.8173,
+      "step": 9310
+    },
+    {
+      "epoch": 14.86,
+      "grad_norm": 3.7840776443481445,
+      "learning_rate": 0.0014054226475279108,
+      "loss": 0.8301,
+      "step": 9320
+    },
+    {
+      "epoch": 14.88,
+      "grad_norm": 3.13913893699646,
+      "learning_rate": 0.0014047846889952153,
+      "loss": 0.7767,
+      "step": 9330
+    },
+    {
+      "epoch": 14.9,
+      "grad_norm": 4.028443813323975,
+      "learning_rate": 0.00140414673046252,
+      "loss": 0.7516,
+      "step": 9340
+    },
+    {
+      "epoch": 14.91,
+      "grad_norm": 3.6890182495117188,
+      "learning_rate": 0.0014035087719298245,
+      "loss": 0.769,
+      "step": 9350
+    },
+    {
+      "epoch": 14.93,
+      "grad_norm": 4.084263801574707,
+      "learning_rate": 0.0014028708133971292,
+      "loss": 0.8712,
+      "step": 9360
+    },
+    {
+      "epoch": 14.94,
+      "grad_norm": 2.6253440380096436,
+      "learning_rate": 0.0014022328548644338,
+      "loss": 0.9161,
+      "step": 9370
+    },
+    {
+      "epoch": 14.96,
+      "grad_norm": 3.6379435062408447,
+      "learning_rate": 0.0014015948963317384,
+      "loss": 0.9507,
+      "step": 9380
+    },
+    {
+      "epoch": 14.98,
+      "grad_norm": 3.1507678031921387,
+      "learning_rate": 0.001400956937799043,
+      "loss": 0.9133,
+      "step": 9390
+    },
+    {
+      "epoch": 14.99,
+      "grad_norm": 2.170366048812866,
+      "learning_rate": 0.0014003189792663477,
+      "loss": 0.7949,
+      "step": 9400
+    },
+    {
+      "epoch": 15.01,
+      "grad_norm": 1.886562705039978,
+      "learning_rate": 0.0013996810207336523,
+      "loss": 0.7133,
+      "step": 9410
+    },
+    {
+      "epoch": 15.02,
+      "grad_norm": 2.3615992069244385,
+      "learning_rate": 0.0013990430622009568,
+      "loss": 0.598,
+      "step": 9420
+    },
+    {
+      "epoch": 15.04,
+      "grad_norm": 2.0564517974853516,
+      "learning_rate": 0.0013984051036682616,
+      "loss": 0.6673,
+      "step": 9430
+    },
+    {
+      "epoch": 15.06,
+      "grad_norm": 2.599745273590088,
+      "learning_rate": 0.0013977671451355662,
+      "loss": 0.5725,
+      "step": 9440
+    },
+    {
+      "epoch": 15.07,
+      "grad_norm": 2.5613441467285156,
+      "learning_rate": 0.0013971291866028707,
+      "loss": 0.5816,
+      "step": 9450
+    },
+    {
+      "epoch": 15.09,
+      "grad_norm": 2.8341970443725586,
+      "learning_rate": 0.0013964912280701753,
+      "loss": 0.5968,
+      "step": 9460
+    },
+    {
+      "epoch": 15.1,
+      "grad_norm": 3.303835391998291,
+      "learning_rate": 0.00139585326953748,
+      "loss": 0.6412,
+      "step": 9470
+    },
+    {
+      "epoch": 15.12,
+      "grad_norm": 3.2321808338165283,
+      "learning_rate": 0.0013952153110047846,
+      "loss": 0.6451,
+      "step": 9480
+    },
+    {
+      "epoch": 15.14,
+      "grad_norm": 2.747515916824341,
+      "learning_rate": 0.0013945773524720892,
+      "loss": 0.6491,
+      "step": 9490
+    },
+    {
+      "epoch": 15.15,
+      "grad_norm": 2.1695239543914795,
+      "learning_rate": 0.001393939393939394,
+      "loss": 0.6463,
+      "step": 9500
+    },
+    {
+      "epoch": 15.17,
+      "grad_norm": 2.5514535903930664,
+      "learning_rate": 0.0013933014354066985,
+      "loss": 0.696,
+      "step": 9510
+    },
+    {
+      "epoch": 15.18,
+      "grad_norm": 2.224310874938965,
+      "learning_rate": 0.001392663476874003,
+      "loss": 0.6733,
+      "step": 9520
+    },
+    {
+      "epoch": 15.2,
+      "grad_norm": 3.0674171447753906,
+      "learning_rate": 0.0013920255183413077,
+      "loss": 0.655,
+      "step": 9530
+    },
+    {
+      "epoch": 15.22,
+      "grad_norm": 1.9924139976501465,
+      "learning_rate": 0.0013913875598086124,
+      "loss": 0.6315,
+      "step": 9540
+    },
+    {
+      "epoch": 15.23,
+      "grad_norm": 3.7744829654693604,
+      "learning_rate": 0.001390749601275917,
+      "loss": 0.6991,
+      "step": 9550
+    },
+    {
+      "epoch": 15.25,
+      "grad_norm": 3.4672529697418213,
+      "learning_rate": 0.0013901116427432216,
+      "loss": 0.7007,
+      "step": 9560
+    },
+    {
+      "epoch": 15.26,
+      "grad_norm": 3.2644975185394287,
+      "learning_rate": 0.0013894736842105264,
+      "loss": 0.6403,
+      "step": 9570
+    },
+    {
+      "epoch": 15.28,
+      "grad_norm": 2.8029818534851074,
+      "learning_rate": 0.001388835725677831,
+      "loss": 0.7316,
+      "step": 9580
+    },
+    {
+      "epoch": 15.3,
+      "grad_norm": 1.8042049407958984,
+      "learning_rate": 0.0013881977671451355,
+      "loss": 0.6332,
+      "step": 9590
+    },
+    {
+      "epoch": 15.31,
+      "grad_norm": 2.2891921997070312,
+      "learning_rate": 0.00138755980861244,
+      "loss": 0.6348,
+      "step": 9600
+    },
+    {
+      "epoch": 15.33,
+      "grad_norm": 2.8570570945739746,
+      "learning_rate": 0.0013869218500797448,
+      "loss": 0.6607,
+      "step": 9610
+    },
+    {
+      "epoch": 15.34,
+      "grad_norm": 2.2186977863311768,
+      "learning_rate": 0.0013862838915470494,
+      "loss": 0.627,
+      "step": 9620
+    },
+    {
+      "epoch": 15.36,
+      "grad_norm": 2.3791275024414062,
+      "learning_rate": 0.001385645933014354,
+      "loss": 0.6721,
+      "step": 9630
+    },
+    {
+      "epoch": 15.37,
+      "grad_norm": 2.992490530014038,
+      "learning_rate": 0.0013850079744816587,
+      "loss": 0.6734,
+      "step": 9640
+    },
+    {
+      "epoch": 15.39,
+      "grad_norm": 5.538806438446045,
+      "learning_rate": 0.0013843700159489633,
+      "loss": 0.678,
+      "step": 9650
+    },
+    {
+      "epoch": 15.41,
+      "grad_norm": 2.7970008850097656,
+      "learning_rate": 0.0013837320574162679,
+      "loss": 0.6576,
+      "step": 9660
+    },
+    {
+      "epoch": 15.42,
+      "grad_norm": 5.550447940826416,
+      "learning_rate": 0.0013830940988835724,
+      "loss": 0.7144,
+      "step": 9670
+    },
+    {
+      "epoch": 15.44,
+      "grad_norm": 2.3102047443389893,
+      "learning_rate": 0.0013824561403508772,
+      "loss": 0.669,
+      "step": 9680
+    },
+    {
+      "epoch": 15.45,
+      "grad_norm": 3.720393419265747,
+      "learning_rate": 0.0013818181818181818,
+      "loss": 0.7442,
+      "step": 9690
+    },
+    {
+      "epoch": 15.47,
+      "grad_norm": 2.284290075302124,
+      "learning_rate": 0.0013811802232854863,
+      "loss": 0.7835,
+      "step": 9700
+    },
+    {
+      "epoch": 15.49,
+      "grad_norm": 3.2873239517211914,
+      "learning_rate": 0.0013805422647527909,
+      "loss": 0.6662,
+      "step": 9710
+    },
+    {
+      "epoch": 15.5,
+      "grad_norm": 2.7117483615875244,
+      "learning_rate": 0.0013799043062200959,
+      "loss": 0.7348,
+      "step": 9720
+    },
+    {
+      "epoch": 15.52,
+      "grad_norm": 3.2797791957855225,
+      "learning_rate": 0.0013792663476874004,
+      "loss": 0.8664,
+      "step": 9730
+    },
+    {
+      "epoch": 15.53,
+      "grad_norm": 3.7056384086608887,
+      "learning_rate": 0.001378628389154705,
+      "loss": 0.7128,
+      "step": 9740
+    },
+    {
+      "epoch": 15.55,
+      "grad_norm": 2.3162360191345215,
+      "learning_rate": 0.0013779904306220098,
+      "loss": 0.6596,
+      "step": 9750
+    },
+    {
+      "epoch": 15.57,
+      "grad_norm": 2.1081748008728027,
+      "learning_rate": 0.0013773524720893143,
+      "loss": 0.7492,
+      "step": 9760
+    },
+    {
+      "epoch": 15.58,
+      "grad_norm": 3.5717201232910156,
+      "learning_rate": 0.001376714513556619,
+      "loss": 0.8277,
+      "step": 9770
+    },
+    {
+      "epoch": 15.6,
+      "grad_norm": 3.751756429672241,
+      "learning_rate": 0.0013760765550239235,
+      "loss": 0.7028,
+      "step": 9780
+    },
+    {
+      "epoch": 15.61,
+      "grad_norm": 3.4455363750457764,
+      "learning_rate": 0.0013754385964912283,
+      "loss": 0.7339,
+      "step": 9790
+    },
+    {
+      "epoch": 15.63,
+      "grad_norm": 2.6450400352478027,
+      "learning_rate": 0.0013748006379585328,
+      "loss": 0.8658,
+      "step": 9800
+    },
+    {
+      "epoch": 15.65,
+      "grad_norm": 2.7757487297058105,
+      "learning_rate": 0.0013741626794258374,
+      "loss": 0.7462,
+      "step": 9810
+    },
+    {
+      "epoch": 15.66,
+      "grad_norm": 2.791318416595459,
+      "learning_rate": 0.0013735247208931422,
+      "loss": 0.7724,
+      "step": 9820
+    },
+    {
+      "epoch": 15.68,
+      "grad_norm": 2.722747802734375,
+      "learning_rate": 0.0013728867623604467,
+      "loss": 0.8533,
+      "step": 9830
+    },
+    {
+      "epoch": 15.69,
+      "grad_norm": 2.778831958770752,
+      "learning_rate": 0.0013722488038277513,
+      "loss": 0.7243,
+      "step": 9840
+    },
+    {
+      "epoch": 15.71,
+      "grad_norm": 2.3124783039093018,
+      "learning_rate": 0.0013716108452950558,
+      "loss": 0.87,
+      "step": 9850
+    },
+    {
+      "epoch": 15.73,
+      "grad_norm": 2.3077304363250732,
+      "learning_rate": 0.0013709728867623606,
+      "loss": 0.8462,
+      "step": 9860
+    },
+    {
+      "epoch": 15.74,
+      "grad_norm": 4.141488552093506,
+      "learning_rate": 0.0013703349282296652,
+      "loss": 0.8513,
+      "step": 9870
+    },
+    {
+      "epoch": 15.76,
+      "grad_norm": 2.998544454574585,
+      "learning_rate": 0.0013696969696969697,
+      "loss": 0.7472,
+      "step": 9880
+    },
+    {
+      "epoch": 15.77,
+      "grad_norm": 2.3463869094848633,
+      "learning_rate": 0.0013690590111642745,
+      "loss": 0.8363,
+      "step": 9890
+    },
+    {
+      "epoch": 15.79,
+      "grad_norm": 2.782196521759033,
+      "learning_rate": 0.001368421052631579,
+      "loss": 0.7076,
+      "step": 9900
+    },
+    {
+      "epoch": 15.81,
+      "grad_norm": 3.6227550506591797,
+      "learning_rate": 0.0013677830940988837,
+      "loss": 0.677,
+      "step": 9910
+    },
+    {
+      "epoch": 15.82,
+      "grad_norm": 3.1042935848236084,
+      "learning_rate": 0.0013671451355661882,
+      "loss": 0.8747,
+      "step": 9920
+    },
+    {
+      "epoch": 15.84,
+      "grad_norm": 2.9278554916381836,
+      "learning_rate": 0.001366507177033493,
+      "loss": 0.8876,
+      "step": 9930
+    },
+    {
+      "epoch": 15.85,
+      "grad_norm": 2.6750121116638184,
+      "learning_rate": 0.0013658692185007976,
+      "loss": 0.7374,
+      "step": 9940
+    },
+    {
+      "epoch": 15.87,
+      "grad_norm": 2.563796043395996,
+      "learning_rate": 0.0013652312599681021,
+      "loss": 0.6978,
+      "step": 9950
+    },
+    {
+      "epoch": 15.89,
+      "grad_norm": 2.839409112930298,
+      "learning_rate": 0.001364593301435407,
+      "loss": 0.7307,
+      "step": 9960
+    },
+    {
+      "epoch": 15.9,
+      "grad_norm": 2.651336908340454,
+      "learning_rate": 0.0013639553429027115,
+      "loss": 0.6775,
+      "step": 9970
+    },
+    {
+      "epoch": 15.92,
+      "grad_norm": 3.166914701461792,
+      "learning_rate": 0.001363317384370016,
+      "loss": 0.9131,
+      "step": 9980
+    },
+    {
+      "epoch": 15.93,
+      "grad_norm": 2.0613489151000977,
+      "learning_rate": 0.0013626794258373206,
+      "loss": 0.8707,
+      "step": 9990
+    },
+    {
+      "epoch": 15.95,
+      "grad_norm": 3.3213391304016113,
+      "learning_rate": 0.0013620414673046254,
+      "loss": 0.7631,
+      "step": 10000
+    },
+    {
+      "epoch": 15.96,
+      "grad_norm": 3.0203397274017334,
+      "learning_rate": 0.00136140350877193,
+      "loss": 0.8512,
+      "step": 10010
+    },
+    {
+      "epoch": 15.98,
+      "grad_norm": 2.070725202560425,
+      "learning_rate": 0.0013607655502392345,
+      "loss": 0.8031,
+      "step": 10020
+    },
+    {
+      "epoch": 16.0,
+      "grad_norm": 2.3090660572052,
+      "learning_rate": 0.0013601275917065393,
+      "loss": 0.7327,
+      "step": 10030
+    },
+    {
+      "epoch": 16.01,
+      "grad_norm": 2.2141530513763428,
+      "learning_rate": 0.0013594896331738438,
+      "loss": 0.6028,
+      "step": 10040
+    },
+    {
+      "epoch": 16.03,
+      "grad_norm": 2.6139416694641113,
+      "learning_rate": 0.0013588516746411484,
+      "loss": 0.5008,
+      "step": 10050
+    },
+    {
+      "epoch": 16.04,
+      "grad_norm": 4.001714706420898,
+      "learning_rate": 0.001358213716108453,
+      "loss": 0.5758,
+      "step": 10060
+    },
+    {
+      "epoch": 16.06,
+      "grad_norm": 3.021545886993408,
+      "learning_rate": 0.0013575757575757577,
+      "loss": 0.5445,
+      "step": 10070
+    },
+    {
+      "epoch": 16.08,
+      "grad_norm": 3.3995559215545654,
+      "learning_rate": 0.0013569377990430623,
+      "loss": 0.6745,
+      "step": 10080
+    },
+    {
+      "epoch": 16.09,
+      "grad_norm": 1.990356683731079,
+      "learning_rate": 0.0013562998405103669,
+      "loss": 0.57,
+      "step": 10090
+    },
+    {
+      "epoch": 16.11,
+      "grad_norm": 2.0073490142822266,
+      "learning_rate": 0.0013556618819776714,
+      "loss": 0.532,
+      "step": 10100
+    },
+    {
+      "epoch": 16.12,
+      "grad_norm": 3.74519681930542,
+      "learning_rate": 0.0013550239234449762,
+      "loss": 0.5446,
+      "step": 10110
+    },
+    {
+      "epoch": 16.14,
+      "grad_norm": 3.367241144180298,
+      "learning_rate": 0.0013543859649122808,
+      "loss": 0.6287,
+      "step": 10120
+    },
+    {
+      "epoch": 16.16,
+      "grad_norm": 2.2096097469329834,
+      "learning_rate": 0.0013537480063795853,
+      "loss": 0.6978,
+      "step": 10130
+    },
+    {
+      "epoch": 16.17,
+      "grad_norm": 1.8970123529434204,
+      "learning_rate": 0.0013531100478468901,
+      "loss": 0.6808,
+      "step": 10140
+    },
+    {
+      "epoch": 16.19,
+      "grad_norm": 3.6559560298919678,
+      "learning_rate": 0.0013524720893141947,
+      "loss": 0.6263,
+      "step": 10150
+    },
+    {
+      "epoch": 16.2,
+      "grad_norm": 2.3549561500549316,
+      "learning_rate": 0.0013518341307814992,
+      "loss": 0.6148,
+      "step": 10160
+    },
+    {
+      "epoch": 16.22,
+      "grad_norm": 1.73717200756073,
+      "learning_rate": 0.0013511961722488038,
+      "loss": 0.5918,
+      "step": 10170
+    },
+    {
+      "epoch": 16.24,
+      "grad_norm": 2.160614252090454,
+      "learning_rate": 0.0013505582137161086,
+      "loss": 0.6008,
+      "step": 10180
+    },
+    {
+      "epoch": 16.25,
+      "grad_norm": 3.5011887550354004,
+      "learning_rate": 0.0013499202551834131,
+      "loss": 0.7248,
+      "step": 10190
+    },
+    {
+      "epoch": 16.27,
+      "grad_norm": 2.6921088695526123,
+      "learning_rate": 0.0013492822966507177,
+      "loss": 0.7165,
+      "step": 10200
+    },
+    {
+      "epoch": 16.28,
+      "grad_norm": 1.8108500242233276,
+      "learning_rate": 0.0013486443381180225,
+      "loss": 0.5676,
+      "step": 10210
+    },
+    {
+      "epoch": 16.3,
+      "grad_norm": 2.7839293479919434,
+      "learning_rate": 0.001348006379585327,
+      "loss": 0.65,
+      "step": 10220
+    },
+    {
+      "epoch": 16.32,
+      "grad_norm": 2.6478052139282227,
+      "learning_rate": 0.0013473684210526316,
+      "loss": 0.7421,
+      "step": 10230
+    },
+    {
+      "epoch": 16.33,
+      "grad_norm": 2.5701370239257812,
+      "learning_rate": 0.0013467304625199362,
+      "loss": 0.7106,
+      "step": 10240
+    },
+    {
+      "epoch": 16.35,
+      "grad_norm": 2.3916168212890625,
+      "learning_rate": 0.001346092503987241,
+      "loss": 0.5939,
+      "step": 10250
+    },
+    {
+      "epoch": 16.36,
+      "grad_norm": 2.6145966053009033,
+      "learning_rate": 0.0013454545454545455,
+      "loss": 0.6589,
+      "step": 10260
+    },
+    {
+      "epoch": 16.38,
+      "grad_norm": 2.416173219680786,
+      "learning_rate": 0.00134481658692185,
+      "loss": 0.5956,
+      "step": 10270
+    },
+    {
+      "epoch": 16.4,
+      "grad_norm": 3.0522923469543457,
+      "learning_rate": 0.0013441786283891549,
+      "loss": 0.7672,
+      "step": 10280
+    },
+    {
+      "epoch": 16.41,
+      "grad_norm": 3.9606542587280273,
+      "learning_rate": 0.0013435406698564594,
+      "loss": 0.6298,
+      "step": 10290
+    },
+    {
+      "epoch": 16.43,
+      "grad_norm": 2.6333351135253906,
+      "learning_rate": 0.001342902711323764,
+      "loss": 0.758,
+      "step": 10300
+    },
+    {
+      "epoch": 16.44,
+      "grad_norm": 3.0208117961883545,
+      "learning_rate": 0.0013422647527910685,
+      "loss": 0.7356,
+      "step": 10310
+    },
+    {
+      "epoch": 16.46,
+      "grad_norm": 2.344989776611328,
+      "learning_rate": 0.0013416267942583733,
+      "loss": 0.6947,
+      "step": 10320
+    },
+    {
+      "epoch": 16.48,
+      "grad_norm": 1.7995355129241943,
+      "learning_rate": 0.0013409888357256779,
+      "loss": 0.7519,
+      "step": 10330
+    },
+    {
+      "epoch": 16.49,
+      "grad_norm": 1.8002946376800537,
+      "learning_rate": 0.0013403508771929824,
+      "loss": 0.5739,
+      "step": 10340
+    },
+    {
+      "epoch": 16.51,
+      "grad_norm": 2.094810962677002,
+      "learning_rate": 0.0013397129186602872,
+      "loss": 0.6613,
+      "step": 10350
+    },
+    {
+      "epoch": 16.52,
+      "grad_norm": 3.815561294555664,
+      "learning_rate": 0.0013390749601275918,
+      "loss": 0.6778,
+      "step": 10360
+    },
+    {
+      "epoch": 16.54,
+      "grad_norm": 2.7428698539733887,
+      "learning_rate": 0.0013384370015948964,
+      "loss": 0.6428,
+      "step": 10370
+    },
+    {
+      "epoch": 16.56,
+      "grad_norm": 2.3527848720550537,
+      "learning_rate": 0.001337799043062201,
+      "loss": 0.703,
+      "step": 10380
+    },
+    {
+      "epoch": 16.57,
+      "grad_norm": 2.305804967880249,
+      "learning_rate": 0.0013371610845295057,
+      "loss": 0.6954,
+      "step": 10390
+    },
+    {
+      "epoch": 16.59,
+      "grad_norm": 2.0628771781921387,
+      "learning_rate": 0.0013365231259968103,
+      "loss": 0.6499,
+      "step": 10400
+    },
+    {
+      "epoch": 16.6,
+      "grad_norm": 3.61171555519104,
+      "learning_rate": 0.0013358851674641148,
+      "loss": 0.767,
+      "step": 10410
+    },
+    {
+      "epoch": 16.62,
+      "grad_norm": 1.9354444742202759,
+      "learning_rate": 0.0013352472089314194,
+      "loss": 0.7736,
+      "step": 10420
+    },
+    {
+      "epoch": 16.63,
+      "grad_norm": 2.2509772777557373,
+      "learning_rate": 0.0013346092503987242,
+      "loss": 0.6941,
+      "step": 10430
+    },
+    {
+      "epoch": 16.65,
+      "grad_norm": 3.0013530254364014,
+      "learning_rate": 0.0013339712918660287,
+      "loss": 0.7728,
+      "step": 10440
+    },
+    {
+      "epoch": 16.67,
+      "grad_norm": 2.995089292526245,
+      "learning_rate": 0.0013333333333333333,
+      "loss": 0.6111,
+      "step": 10450
+    },
+    {
+      "epoch": 16.68,
+      "grad_norm": 2.9852423667907715,
+      "learning_rate": 0.001332695374800638,
+      "loss": 0.7604,
+      "step": 10460
+    },
+    {
+      "epoch": 16.7,
+      "grad_norm": 1.8430482149124146,
+      "learning_rate": 0.0013320574162679426,
+      "loss": 0.6248,
+      "step": 10470
+    },
+    {
+      "epoch": 16.71,
+      "grad_norm": 2.271106481552124,
+      "learning_rate": 0.0013314194577352472,
+      "loss": 0.6576,
+      "step": 10480
+    },
+    {
+      "epoch": 16.73,
+      "grad_norm": 3.168851852416992,
+      "learning_rate": 0.0013307814992025518,
+      "loss": 0.7078,
+      "step": 10490
+    },
+    {
+      "epoch": 16.75,
+      "grad_norm": 3.591390371322632,
+      "learning_rate": 0.0013301435406698565,
+      "loss": 0.7218,
+      "step": 10500
+    },
+    {
+      "epoch": 16.76,
+      "grad_norm": 2.9601821899414062,
+      "learning_rate": 0.001329505582137161,
+      "loss": 0.6973,
+      "step": 10510
+    },
+    {
+      "epoch": 16.78,
+      "grad_norm": 2.4465489387512207,
+      "learning_rate": 0.0013288676236044657,
+      "loss": 0.6793,
+      "step": 10520
+    },
+    {
+      "epoch": 16.79,
+      "grad_norm": 3.1582698822021484,
+      "learning_rate": 0.0013282296650717704,
+      "loss": 0.6457,
+      "step": 10530
+    },
+    {
+      "epoch": 16.81,
+      "grad_norm": 2.704655408859253,
+      "learning_rate": 0.001327591706539075,
+      "loss": 0.7386,
+      "step": 10540
+    },
+    {
+      "epoch": 16.83,
+      "grad_norm": 2.8489794731140137,
+      "learning_rate": 0.0013269537480063796,
+      "loss": 0.7504,
+      "step": 10550
+    },
+    {
+      "epoch": 16.84,
+      "grad_norm": 3.1505606174468994,
+      "learning_rate": 0.0013263157894736841,
+      "loss": 0.7292,
+      "step": 10560
+    },
+    {
+      "epoch": 16.86,
+      "grad_norm": 2.3454043865203857,
+      "learning_rate": 0.001325677830940989,
+      "loss": 0.7907,
+      "step": 10570
+    },
+    {
+      "epoch": 16.87,
+      "grad_norm": 3.128525972366333,
+      "learning_rate": 0.0013250398724082935,
+      "loss": 0.6484,
+      "step": 10580
+    },
+    {
+      "epoch": 16.89,
+      "grad_norm": 3.890327215194702,
+      "learning_rate": 0.001324401913875598,
+      "loss": 0.7387,
+      "step": 10590
+    },
+    {
+      "epoch": 16.91,
+      "grad_norm": 3.827643394470215,
+      "learning_rate": 0.0013237639553429028,
+      "loss": 0.8169,
+      "step": 10600
+    },
+    {
+      "epoch": 16.92,
+      "grad_norm": 2.757068395614624,
+      "learning_rate": 0.0013231259968102074,
+      "loss": 0.7266,
+      "step": 10610
+    },
+    {
+      "epoch": 16.94,
+      "grad_norm": 2.3636882305145264,
+      "learning_rate": 0.001322488038277512,
+      "loss": 0.8095,
+      "step": 10620
+    },
+    {
+      "epoch": 16.95,
+      "grad_norm": 2.3341169357299805,
+      "learning_rate": 0.0013218500797448165,
+      "loss": 0.757,
+      "step": 10630
+    },
+    {
+      "epoch": 16.97,
+      "grad_norm": 3.235461950302124,
+      "learning_rate": 0.0013212121212121213,
+      "loss": 0.7138,
+      "step": 10640
+    },
+    {
+      "epoch": 16.99,
+      "grad_norm": 3.797213315963745,
+      "learning_rate": 0.0013205741626794258,
+      "loss": 0.7479,
+      "step": 10650
+    },
+    {
+      "epoch": 17.0,
+      "grad_norm": 2.1070356369018555,
+      "learning_rate": 0.0013199362041467304,
+      "loss": 0.7187,
+      "step": 10660
+    },
+    {
+      "epoch": 17.02,
+      "grad_norm": 2.2326266765594482,
+      "learning_rate": 0.0013192982456140352,
+      "loss": 0.4728,
+      "step": 10670
+    },
+    {
+      "epoch": 17.03,
+      "grad_norm": 2.8324732780456543,
+      "learning_rate": 0.0013186602870813397,
+      "loss": 0.5915,
+      "step": 10680
+    },
+    {
+      "epoch": 17.05,
+      "grad_norm": 2.2015562057495117,
+      "learning_rate": 0.0013180223285486443,
+      "loss": 0.6181,
+      "step": 10690
+    },
+    {
+      "epoch": 17.07,
+      "grad_norm": 1.9790899753570557,
+      "learning_rate": 0.0013173843700159489,
+      "loss": 0.5131,
+      "step": 10700
+    },
+    {
+      "epoch": 17.08,
+      "grad_norm": 2.4350438117980957,
+      "learning_rate": 0.0013167464114832537,
+      "loss": 0.5058,
+      "step": 10710
+    },
+    {
+      "epoch": 17.1,
+      "grad_norm": 2.701519250869751,
+      "learning_rate": 0.0013161084529505582,
+      "loss": 0.5272,
+      "step": 10720
+    },
+    {
+      "epoch": 17.11,
+      "grad_norm": 2.316878318786621,
+      "learning_rate": 0.0013154704944178628,
+      "loss": 0.5489,
+      "step": 10730
+    },
+    {
+      "epoch": 17.13,
+      "grad_norm": 2.2858500480651855,
+      "learning_rate": 0.0013148325358851676,
+      "loss": 0.6087,
+      "step": 10740
+    },
+    {
+      "epoch": 17.15,
+      "grad_norm": 1.5047816038131714,
+      "learning_rate": 0.0013141945773524721,
+      "loss": 0.5726,
+      "step": 10750
+    },
+    {
+      "epoch": 17.16,
+      "grad_norm": 2.081256628036499,
+      "learning_rate": 0.0013135566188197767,
+      "loss": 0.5417,
+      "step": 10760
+    },
+    {
+      "epoch": 17.18,
+      "grad_norm": 1.8512243032455444,
+      "learning_rate": 0.0013129186602870812,
+      "loss": 0.7781,
+      "step": 10770
+    },
+    {
+      "epoch": 17.19,
+      "grad_norm": 2.651259422302246,
+      "learning_rate": 0.001312280701754386,
+      "loss": 0.6324,
+      "step": 10780
+    },
+    {
+      "epoch": 17.21,
+      "grad_norm": 1.8741660118103027,
+      "learning_rate": 0.0013116427432216906,
+      "loss": 0.5242,
+      "step": 10790
+    },
+    {
+      "epoch": 17.22,
+      "grad_norm": 2.223308801651001,
+      "learning_rate": 0.0013110047846889952,
+      "loss": 0.5837,
+      "step": 10800
+    },
+    {
+      "epoch": 17.24,
+      "grad_norm": 2.954585552215576,
+      "learning_rate": 0.0013103668261562997,
+      "loss": 0.7762,
+      "step": 10810
+    },
+    {
+      "epoch": 17.26,
+      "grad_norm": 2.075242519378662,
+      "learning_rate": 0.0013097288676236045,
+      "loss": 0.6981,
+      "step": 10820
+    },
+    {
+      "epoch": 17.27,
+      "grad_norm": 1.9512617588043213,
+      "learning_rate": 0.001309090909090909,
+      "loss": 0.5878,
+      "step": 10830
+    },
+    {
+      "epoch": 17.29,
+      "grad_norm": 2.4567389488220215,
+      "learning_rate": 0.0013084529505582136,
+      "loss": 0.5861,
+      "step": 10840
+    },
+    {
+      "epoch": 17.3,
+      "grad_norm": 2.4589033126831055,
+      "learning_rate": 0.0013078149920255184,
+      "loss": 0.5776,
+      "step": 10850
+    },
+    {
+      "epoch": 17.32,
+      "grad_norm": 3.0933573246002197,
+      "learning_rate": 0.001307177033492823,
+      "loss": 0.6265,
+      "step": 10860
+    },
+    {
+      "epoch": 17.34,
+      "grad_norm": 2.9563870429992676,
+      "learning_rate": 0.0013065390749601275,
+      "loss": 0.688,
+      "step": 10870
+    },
+    {
+      "epoch": 17.35,
+      "grad_norm": 2.6502304077148438,
+      "learning_rate": 0.001305901116427432,
+      "loss": 0.6685,
+      "step": 10880
+    },
+    {
+      "epoch": 17.37,
+      "grad_norm": 2.815063238143921,
+      "learning_rate": 0.0013052631578947369,
+      "loss": 0.651,
+      "step": 10890
+    },
+    {
+      "epoch": 17.38,
+      "grad_norm": 2.2861077785491943,
+      "learning_rate": 0.0013046251993620414,
+      "loss": 0.588,
+      "step": 10900
+    },
+    {
+      "epoch": 17.4,
+      "grad_norm": 2.0195345878601074,
+      "learning_rate": 0.001303987240829346,
+      "loss": 0.6129,
+      "step": 10910
+    },
+    {
+      "epoch": 17.42,
+      "grad_norm": 2.192063331604004,
+      "learning_rate": 0.0013033492822966508,
+      "loss": 0.6819,
+      "step": 10920
+    },
+    {
+      "epoch": 17.43,
+      "grad_norm": 3.0410258769989014,
+      "learning_rate": 0.0013027113237639553,
+      "loss": 0.6265,
+      "step": 10930
+    },
+    {
+      "epoch": 17.45,
+      "grad_norm": 1.9278006553649902,
+      "learning_rate": 0.00130207336523126,
+      "loss": 0.5889,
+      "step": 10940
+    },
+    {
+      "epoch": 17.46,
+      "grad_norm": 2.2657618522644043,
+      "learning_rate": 0.0013014354066985645,
+      "loss": 0.6477,
+      "step": 10950
+    },
+    {
+      "epoch": 17.48,
+      "grad_norm": 3.8989851474761963,
+      "learning_rate": 0.0013007974481658692,
+      "loss": 0.6831,
+      "step": 10960
+    },
+    {
+      "epoch": 17.5,
+      "grad_norm": 2.630307197570801,
+      "learning_rate": 0.0013001594896331738,
+      "loss": 0.7138,
+      "step": 10970
+    },
+    {
+      "epoch": 17.51,
+      "grad_norm": 2.4029276371002197,
+      "learning_rate": 0.0012995215311004784,
+      "loss": 0.6426,
+      "step": 10980
+    },
+    {
+      "epoch": 17.53,
+      "grad_norm": 2.127747058868408,
+      "learning_rate": 0.0012988835725677831,
+      "loss": 0.7466,
+      "step": 10990
+    },
+    {
+      "epoch": 17.54,
+      "grad_norm": 1.9066559076309204,
+      "learning_rate": 0.0012982456140350877,
+      "loss": 0.578,
+      "step": 11000
+    },
+    {
+      "epoch": 17.56,
+      "grad_norm": 2.585181713104248,
+      "learning_rate": 0.0012976076555023923,
+      "loss": 0.6634,
+      "step": 11010
+    },
+    {
+      "epoch": 17.58,
+      "grad_norm": 1.7290911674499512,
+      "learning_rate": 0.0012969696969696968,
+      "loss": 0.6455,
+      "step": 11020
+    },
+    {
+      "epoch": 17.59,
+      "grad_norm": 3.596162796020508,
+      "learning_rate": 0.0012963317384370016,
+      "loss": 0.6049,
+      "step": 11030
+    },
+    {
+      "epoch": 17.61,
+      "grad_norm": 1.8785613775253296,
+      "learning_rate": 0.0012956937799043062,
+      "loss": 0.6151,
+      "step": 11040
+    },
+    {
+      "epoch": 17.62,
+      "grad_norm": 3.3277037143707275,
+      "learning_rate": 0.0012950558213716107,
+      "loss": 0.5931,
+      "step": 11050
+    },
+    {
+      "epoch": 17.64,
+      "grad_norm": 2.3854615688323975,
+      "learning_rate": 0.0012944178628389155,
+      "loss": 0.6332,
+      "step": 11060
+    },
+    {
+      "epoch": 17.66,
+      "grad_norm": 2.8721373081207275,
+      "learning_rate": 0.00129377990430622,
+      "loss": 0.731,
+      "step": 11070
+    },
+    {
+      "epoch": 17.67,
+      "grad_norm": 3.060612440109253,
+      "learning_rate": 0.0012931419457735246,
+      "loss": 0.7137,
+      "step": 11080
+    },
+    {
+      "epoch": 17.69,
+      "grad_norm": 2.5586180686950684,
+      "learning_rate": 0.0012925039872408292,
+      "loss": 0.5945,
+      "step": 11090
+    },
+    {
+      "epoch": 17.7,
+      "grad_norm": 2.5695533752441406,
+      "learning_rate": 0.001291866028708134,
+      "loss": 0.7658,
+      "step": 11100
+    },
+    {
+      "epoch": 17.72,
+      "grad_norm": 4.104732036590576,
+      "learning_rate": 0.0012912280701754385,
+      "loss": 0.7704,
+      "step": 11110
+    },
+    {
+      "epoch": 17.74,
+      "grad_norm": 3.1808011531829834,
+      "learning_rate": 0.001290590111642743,
+      "loss": 0.6476,
+      "step": 11120
+    },
+    {
+      "epoch": 17.75,
+      "grad_norm": 2.210597038269043,
+      "learning_rate": 0.0012899521531100477,
+      "loss": 0.722,
+      "step": 11130
+    },
+    {
+      "epoch": 17.77,
+      "grad_norm": 2.6710522174835205,
+      "learning_rate": 0.0012893141945773524,
+      "loss": 0.6275,
+      "step": 11140
+    },
+    {
+      "epoch": 17.78,
+      "grad_norm": 2.2379961013793945,
+      "learning_rate": 0.001288676236044657,
+      "loss": 0.6321,
+      "step": 11150
+    },
+    {
+      "epoch": 17.8,
+      "grad_norm": 2.719963312149048,
+      "learning_rate": 0.0012880382775119616,
+      "loss": 0.6912,
+      "step": 11160
+    },
+    {
+      "epoch": 17.81,
+      "grad_norm": 2.1712732315063477,
+      "learning_rate": 0.0012874003189792664,
+      "loss": 0.6567,
+      "step": 11170
+    },
+    {
+      "epoch": 17.83,
+      "grad_norm": 4.386512279510498,
+      "learning_rate": 0.001286762360446571,
+      "loss": 0.6933,
+      "step": 11180
+    },
+    {
+      "epoch": 17.85,
+      "grad_norm": 4.8913373947143555,
+      "learning_rate": 0.0012861244019138755,
+      "loss": 0.6339,
+      "step": 11190
+    },
+    {
+      "epoch": 17.86,
+      "grad_norm": 3.154282569885254,
+      "learning_rate": 0.00128548644338118,
+      "loss": 0.6558,
+      "step": 11200
+    },
+    {
+      "epoch": 17.88,
+      "grad_norm": 4.05545711517334,
+      "learning_rate": 0.001284848484848485,
+      "loss": 0.7379,
+      "step": 11210
+    },
+    {
+      "epoch": 17.89,
+      "grad_norm": 2.9737448692321777,
+      "learning_rate": 0.0012842105263157896,
+      "loss": 0.7563,
+      "step": 11220
+    },
+    {
+      "epoch": 17.91,
+      "grad_norm": 4.375244617462158,
+      "learning_rate": 0.0012835725677830942,
+      "loss": 0.7779,
+      "step": 11230
+    },
+    {
+      "epoch": 17.93,
+      "grad_norm": 2.775324821472168,
+      "learning_rate": 0.001282934609250399,
+      "loss": 0.6884,
+      "step": 11240
+    },
+    {
+      "epoch": 17.94,
+      "grad_norm": 2.1626110076904297,
+      "learning_rate": 0.0012822966507177035,
+      "loss": 0.7123,
+      "step": 11250
+    },
+    {
+      "epoch": 17.96,
+      "grad_norm": 3.951596260070801,
+      "learning_rate": 0.001281658692185008,
+      "loss": 0.7645,
+      "step": 11260
+    },
+    {
+      "epoch": 17.97,
+      "grad_norm": 2.271362066268921,
+      "learning_rate": 0.0012810207336523126,
+      "loss": 0.8049,
+      "step": 11270
+    },
+    {
+      "epoch": 17.99,
+      "grad_norm": 3.3153727054595947,
+      "learning_rate": 0.0012803827751196174,
+      "loss": 0.7484,
+      "step": 11280
+    },
+    {
+      "epoch": 18.01,
+      "grad_norm": 2.451831340789795,
+      "learning_rate": 0.001279744816586922,
+      "loss": 0.6549,
+      "step": 11290
+    },
+    {
+      "epoch": 18.02,
+      "grad_norm": 2.1044692993164062,
+      "learning_rate": 0.0012791068580542265,
+      "loss": 0.5569,
+      "step": 11300
+    },
+    {
+      "epoch": 18.04,
+      "grad_norm": 5.281918525695801,
+      "learning_rate": 0.0012784688995215313,
+      "loss": 0.5877,
+      "step": 11310
+    },
+    {
+      "epoch": 18.05,
+      "grad_norm": 2.706597328186035,
+      "learning_rate": 0.0012778309409888359,
+      "loss": 0.541,
+      "step": 11320
+    },
+    {
+      "epoch": 18.07,
+      "grad_norm": 2.2525746822357178,
+      "learning_rate": 0.0012771929824561404,
+      "loss": 0.5092,
+      "step": 11330
+    },
+    {
+      "epoch": 18.09,
+      "grad_norm": 2.823735475540161,
+      "learning_rate": 0.001276555023923445,
+      "loss": 0.5205,
+      "step": 11340
+    },
+    {
+      "epoch": 18.1,
+      "grad_norm": 2.758739948272705,
+      "learning_rate": 0.0012759170653907498,
+      "loss": 0.5012,
+      "step": 11350
+    },
+    {
+      "epoch": 18.12,
+      "grad_norm": 3.0362417697906494,
+      "learning_rate": 0.0012752791068580543,
+      "loss": 0.6093,
+      "step": 11360
+    },
+    {
+      "epoch": 18.13,
+      "grad_norm": 1.967462182044983,
+      "learning_rate": 0.001274641148325359,
+      "loss": 0.6387,
+      "step": 11370
+    },
+    {
+      "epoch": 18.15,
+      "grad_norm": 2.352168083190918,
+      "learning_rate": 0.0012740031897926637,
+      "loss": 0.6707,
+      "step": 11380
+    },
+    {
+      "epoch": 18.17,
+      "grad_norm": 1.6705100536346436,
+      "learning_rate": 0.0012733652312599683,
+      "loss": 0.6675,
+      "step": 11390
+    },
+    {
+      "epoch": 18.18,
+      "grad_norm": 2.1992321014404297,
+      "learning_rate": 0.0012727272727272728,
+      "loss": 0.7195,
+      "step": 11400
+    },
+    {
+      "epoch": 18.2,
+      "grad_norm": 1.7198143005371094,
+      "learning_rate": 0.0012720893141945774,
+      "loss": 0.6287,
+      "step": 11410
+    },
+    {
+      "epoch": 18.21,
+      "grad_norm": 2.229097604751587,
+      "learning_rate": 0.0012714513556618822,
+      "loss": 0.5207,
+      "step": 11420
+    },
+    {
+      "epoch": 18.23,
+      "grad_norm": 3.074547529220581,
+      "learning_rate": 0.0012708133971291867,
+      "loss": 0.5929,
+      "step": 11430
+    },
+    {
+      "epoch": 18.25,
+      "grad_norm": 2.6688926219940186,
+      "learning_rate": 0.0012701754385964913,
+      "loss": 0.5927,
+      "step": 11440
+    },
+    {
+      "epoch": 18.26,
+      "grad_norm": 2.6187679767608643,
+      "learning_rate": 0.0012695374800637958,
+      "loss": 0.5733,
+      "step": 11450
+    },
+    {
+      "epoch": 18.28,
+      "grad_norm": 2.056699752807617,
+      "learning_rate": 0.0012688995215311006,
+      "loss": 0.6502,
+      "step": 11460
+    },
+    {
+      "epoch": 18.29,
+      "grad_norm": 3.4182140827178955,
+      "learning_rate": 0.0012682615629984052,
+      "loss": 0.5932,
+      "step": 11470
+    },
+    {
+      "epoch": 18.31,
+      "grad_norm": 2.9311532974243164,
+      "learning_rate": 0.0012676236044657097,
+      "loss": 0.5568,
+      "step": 11480
+    },
+    {
+      "epoch": 18.33,
+      "grad_norm": 1.7414332628250122,
+      "learning_rate": 0.0012669856459330145,
+      "loss": 0.5494,
+      "step": 11490
+    },
+    {
+      "epoch": 18.34,
+      "grad_norm": 2.6820008754730225,
+      "learning_rate": 0.001266347687400319,
+      "loss": 0.7878,
+      "step": 11500
+    },
+    {
+      "epoch": 18.36,
+      "grad_norm": 2.811760663986206,
+      "learning_rate": 0.0012657097288676237,
+      "loss": 0.5324,
+      "step": 11510
+    },
+    {
+      "epoch": 18.37,
+      "grad_norm": 3.026895046234131,
+      "learning_rate": 0.0012650717703349282,
+      "loss": 0.7353,
+      "step": 11520
+    },
+    {
+      "epoch": 18.39,
+      "grad_norm": 2.6072068214416504,
+      "learning_rate": 0.001264433811802233,
+      "loss": 0.6509,
+      "step": 11530
+    },
+    {
+      "epoch": 18.41,
+      "grad_norm": 2.0730879306793213,
+      "learning_rate": 0.0012637958532695376,
+      "loss": 0.5353,
+      "step": 11540
+    },
+    {
+      "epoch": 18.42,
+      "grad_norm": 3.863426923751831,
+      "learning_rate": 0.0012631578947368421,
+      "loss": 0.6443,
+      "step": 11550
+    },
+    {
+      "epoch": 18.44,
+      "grad_norm": 1.9193871021270752,
+      "learning_rate": 0.001262519936204147,
+      "loss": 0.5827,
+      "step": 11560
+    },
+    {
+      "epoch": 18.45,
+      "grad_norm": 3.126490354537964,
+      "learning_rate": 0.0012618819776714515,
+      "loss": 0.6556,
+      "step": 11570
+    },
+    {
+      "epoch": 18.47,
+      "grad_norm": 3.189641237258911,
+      "learning_rate": 0.001261244019138756,
+      "loss": 0.693,
+      "step": 11580
+    },
+    {
+      "epoch": 18.48,
+      "grad_norm": 3.374671220779419,
+      "learning_rate": 0.0012606060606060606,
+      "loss": 0.6156,
+      "step": 11590
+    },
+    {
+      "epoch": 18.5,
+      "grad_norm": 1.7221401929855347,
+      "learning_rate": 0.0012599681020733654,
+      "loss": 0.6865,
+      "step": 11600
+    },
+    {
+      "epoch": 18.52,
+      "grad_norm": 1.6494935750961304,
+      "learning_rate": 0.00125933014354067,
+      "loss": 0.6058,
+      "step": 11610
+    },
+    {
+      "epoch": 18.53,
+      "grad_norm": 2.8912765979766846,
+      "learning_rate": 0.0012586921850079745,
+      "loss": 0.6022,
+      "step": 11620
+    },
+    {
+      "epoch": 18.55,
+      "grad_norm": 2.1293585300445557,
+      "learning_rate": 0.0012580542264752793,
+      "loss": 0.5817,
+      "step": 11630
+    },
+    {
+      "epoch": 18.56,
+      "grad_norm": 3.3972530364990234,
+      "learning_rate": 0.0012574162679425838,
+      "loss": 0.6279,
+      "step": 11640
+    },
+    {
+      "epoch": 18.58,
+      "grad_norm": 4.464833736419678,
+      "learning_rate": 0.0012567783094098884,
+      "loss": 0.6584,
+      "step": 11650
+    },
+    {
+      "epoch": 18.6,
+      "grad_norm": 3.3168396949768066,
+      "learning_rate": 0.001256140350877193,
+      "loss": 0.7492,
+      "step": 11660
+    },
+    {
+      "epoch": 18.61,
+      "grad_norm": 1.7018378973007202,
+      "learning_rate": 0.0012555023923444977,
+      "loss": 0.6064,
+      "step": 11670
+    },
+    {
+      "epoch": 18.63,
+      "grad_norm": 2.8935000896453857,
+      "learning_rate": 0.0012548644338118023,
+      "loss": 0.6793,
+      "step": 11680
+    },
+    {
+      "epoch": 18.64,
+      "grad_norm": 3.3293614387512207,
+      "learning_rate": 0.0012542264752791069,
+      "loss": 0.6478,
+      "step": 11690
+    },
+    {
+      "epoch": 18.66,
+      "grad_norm": 2.4878737926483154,
+      "learning_rate": 0.0012535885167464116,
+      "loss": 0.5137,
+      "step": 11700
+    },
+    {
+      "epoch": 18.68,
+      "grad_norm": 2.662574529647827,
+      "learning_rate": 0.0012529505582137162,
+      "loss": 0.6051,
+      "step": 11710
+    },
+    {
+      "epoch": 18.69,
+      "grad_norm": 2.5218799114227295,
+      "learning_rate": 0.0012523125996810208,
+      "loss": 0.6176,
+      "step": 11720
+    },
+    {
+      "epoch": 18.71,
+      "grad_norm": 2.6172173023223877,
+      "learning_rate": 0.0012516746411483253,
+      "loss": 0.627,
+      "step": 11730
+    },
+    {
+      "epoch": 18.72,
+      "grad_norm": 2.4706501960754395,
+      "learning_rate": 0.0012510366826156301,
+      "loss": 0.6826,
+      "step": 11740
+    },
+    {
+      "epoch": 18.74,
+      "grad_norm": 1.9907801151275635,
+      "learning_rate": 0.0012503987240829347,
+      "loss": 0.6528,
+      "step": 11750
+    },
+    {
+      "epoch": 18.76,
+      "grad_norm": 4.803826808929443,
+      "learning_rate": 0.0012497607655502392,
+      "loss": 0.599,
+      "step": 11760
+    },
+    {
+      "epoch": 18.77,
+      "grad_norm": 2.5642504692077637,
+      "learning_rate": 0.001249122807017544,
+      "loss": 0.6657,
+      "step": 11770
+    },
+    {
+      "epoch": 18.79,
+      "grad_norm": 4.334081649780273,
+      "learning_rate": 0.0012484848484848486,
+      "loss": 0.6764,
+      "step": 11780
+    },
+    {
+      "epoch": 18.8,
+      "grad_norm": 2.7521369457244873,
+      "learning_rate": 0.0012478468899521531,
+      "loss": 0.6906,
+      "step": 11790
+    },
+    {
+      "epoch": 18.82,
+      "grad_norm": 2.13214373588562,
+      "learning_rate": 0.0012472089314194577,
+      "loss": 0.6823,
+      "step": 11800
+    },
+    {
+      "epoch": 18.84,
+      "grad_norm": 3.1697006225585938,
+      "learning_rate": 0.0012465709728867625,
+      "loss": 0.6231,
+      "step": 11810
+    },
+    {
+      "epoch": 18.85,
+      "grad_norm": 2.6898703575134277,
+      "learning_rate": 0.001245933014354067,
+      "loss": 0.8077,
+      "step": 11820
+    },
+    {
+      "epoch": 18.87,
+      "grad_norm": 3.177943706512451,
+      "learning_rate": 0.0012452950558213716,
+      "loss": 0.634,
+      "step": 11830
+    },
+    {
+      "epoch": 18.88,
+      "grad_norm": 2.5923023223876953,
+      "learning_rate": 0.0012446570972886762,
+      "loss": 0.5839,
+      "step": 11840
+    },
+    {
+      "epoch": 18.9,
+      "grad_norm": 1.8359884023666382,
+      "learning_rate": 0.001244019138755981,
+      "loss": 0.5992,
+      "step": 11850
+    },
+    {
+      "epoch": 18.92,
+      "grad_norm": 2.252401828765869,
+      "learning_rate": 0.0012433811802232855,
+      "loss": 0.5877,
+      "step": 11860
+    },
+    {
+      "epoch": 18.93,
+      "grad_norm": 2.945974588394165,
+      "learning_rate": 0.00124274322169059,
+      "loss": 0.8981,
+      "step": 11870
+    },
+    {
+      "epoch": 18.95,
+      "grad_norm": 2.5869786739349365,
+      "learning_rate": 0.0012421052631578949,
+      "loss": 0.7204,
+      "step": 11880
+    },
+    {
+      "epoch": 18.96,
+      "grad_norm": 2.0073652267456055,
+      "learning_rate": 0.0012414673046251994,
+      "loss": 0.7112,
+      "step": 11890
+    },
+    {
+      "epoch": 18.98,
+      "grad_norm": 2.726731777191162,
+      "learning_rate": 0.001240829346092504,
+      "loss": 0.6721,
+      "step": 11900
+    },
+    {
+      "epoch": 19.0,
+      "grad_norm": 2.646214246749878,
+      "learning_rate": 0.0012401913875598085,
+      "loss": 0.6353,
+      "step": 11910
+    },
+    {
+      "epoch": 19.01,
+      "grad_norm": 1.7647764682769775,
+      "learning_rate": 0.0012395534290271133,
+      "loss": 0.5528,
+      "step": 11920
+    },
+    {
+      "epoch": 19.03,
+      "grad_norm": 1.6511797904968262,
+      "learning_rate": 0.0012389154704944179,
+      "loss": 0.5311,
+      "step": 11930
+    },
+    {
+      "epoch": 19.04,
+      "grad_norm": 3.120816707611084,
+      "learning_rate": 0.0012382775119617224,
+      "loss": 0.4873,
+      "step": 11940
+    },
+    {
+      "epoch": 19.06,
+      "grad_norm": 1.2211092710494995,
+      "learning_rate": 0.0012376395534290272,
+      "loss": 0.5031,
+      "step": 11950
+    },
+    {
+      "epoch": 19.07,
+      "grad_norm": 2.295135021209717,
+      "learning_rate": 0.0012370015948963318,
+      "loss": 0.5656,
+      "step": 11960
+    },
+    {
+      "epoch": 19.09,
+      "grad_norm": 1.805337905883789,
+      "learning_rate": 0.0012363636363636364,
+      "loss": 0.5322,
+      "step": 11970
+    },
+    {
+      "epoch": 19.11,
+      "grad_norm": 1.8517502546310425,
+      "learning_rate": 0.001235725677830941,
+      "loss": 0.5287,
+      "step": 11980
+    },
+    {
+      "epoch": 19.12,
+      "grad_norm": 2.464036464691162,
+      "learning_rate": 0.0012350877192982457,
+      "loss": 0.5865,
+      "step": 11990
+    },
+    {
+      "epoch": 19.14,
+      "grad_norm": 1.964254379272461,
+      "learning_rate": 0.0012344497607655503,
+      "loss": 0.512,
+      "step": 12000
+    },
+    {
+      "epoch": 19.15,
+      "grad_norm": 2.386060953140259,
+      "learning_rate": 0.0012338118022328548,
+      "loss": 0.6416,
+      "step": 12010
+    },
+    {
+      "epoch": 19.17,
+      "grad_norm": 2.4723477363586426,
+      "learning_rate": 0.0012331738437001596,
+      "loss": 0.5147,
+      "step": 12020
+    },
+    {
+      "epoch": 19.19,
+      "grad_norm": 1.7513999938964844,
+      "learning_rate": 0.0012325358851674642,
+      "loss": 0.5413,
+      "step": 12030
+    },
+    {
+      "epoch": 19.2,
+      "grad_norm": 1.3666512966156006,
+      "learning_rate": 0.0012318979266347687,
+      "loss": 0.619,
+      "step": 12040
+    },
+    {
+      "epoch": 19.22,
+      "grad_norm": 2.0821938514709473,
+      "learning_rate": 0.0012312599681020733,
+      "loss": 0.5351,
+      "step": 12050
+    },
+    {
+      "epoch": 19.23,
+      "grad_norm": 2.403721570968628,
+      "learning_rate": 0.001230622009569378,
+      "loss": 0.5107,
+      "step": 12060
+    },
+    {
+      "epoch": 19.25,
+      "grad_norm": 2.3420348167419434,
+      "learning_rate": 0.0012299840510366826,
+      "loss": 0.5017,
+      "step": 12070
+    },
+    {
+      "epoch": 19.27,
+      "grad_norm": 1.8931384086608887,
+      "learning_rate": 0.0012293460925039872,
+      "loss": 0.5291,
+      "step": 12080
+    },
+    {
+      "epoch": 19.28,
+      "grad_norm": 1.815537691116333,
+      "learning_rate": 0.001228708133971292,
+      "loss": 0.6106,
+      "step": 12090
+    },
+    {
+      "epoch": 19.3,
+      "grad_norm": 2.327855348587036,
+      "learning_rate": 0.0012280701754385965,
+      "loss": 0.4726,
+      "step": 12100
+    },
+    {
+      "epoch": 19.31,
+      "grad_norm": 2.388517141342163,
+      "learning_rate": 0.001227432216905901,
+      "loss": 0.5529,
+      "step": 12110
+    },
+    {
+      "epoch": 19.33,
+      "grad_norm": 3.0234811305999756,
+      "learning_rate": 0.0012267942583732057,
+      "loss": 0.6617,
+      "step": 12120
+    },
+    {
+      "epoch": 19.35,
+      "grad_norm": 3.1139323711395264,
+      "learning_rate": 0.0012261562998405104,
+      "loss": 0.5682,
+      "step": 12130
+    },
+    {
+      "epoch": 19.36,
+      "grad_norm": 3.9127554893493652,
+      "learning_rate": 0.001225518341307815,
+      "loss": 0.6042,
+      "step": 12140
+    },
+    {
+      "epoch": 19.38,
+      "grad_norm": 3.9032232761383057,
+      "learning_rate": 0.0012248803827751196,
+      "loss": 0.6022,
+      "step": 12150
+    },
+    {
+      "epoch": 19.39,
+      "grad_norm": 1.7738832235336304,
+      "learning_rate": 0.0012242424242424241,
+      "loss": 0.5483,
+      "step": 12160
+    },
+    {
+      "epoch": 19.41,
+      "grad_norm": 3.865807294845581,
+      "learning_rate": 0.001223604465709729,
+      "loss": 0.6323,
+      "step": 12170
+    },
+    {
+      "epoch": 19.43,
+      "grad_norm": 2.653740406036377,
+      "learning_rate": 0.0012229665071770335,
+      "loss": 0.6286,
+      "step": 12180
+    },
+    {
+      "epoch": 19.44,
+      "grad_norm": 1.727924108505249,
+      "learning_rate": 0.001222328548644338,
+      "loss": 0.5744,
+      "step": 12190
+    },
+    {
+      "epoch": 19.46,
+      "grad_norm": 2.1040127277374268,
+      "learning_rate": 0.0012216905901116428,
+      "loss": 0.5941,
+      "step": 12200
+    },
+    {
+      "epoch": 19.47,
+      "grad_norm": 2.8161518573760986,
+      "learning_rate": 0.0012210526315789474,
+      "loss": 0.542,
+      "step": 12210
+    },
+    {
+      "epoch": 19.49,
+      "grad_norm": 2.4196929931640625,
+      "learning_rate": 0.001220414673046252,
+      "loss": 0.5612,
+      "step": 12220
+    },
+    {
+      "epoch": 19.51,
+      "grad_norm": 2.2649526596069336,
+      "learning_rate": 0.0012197767145135565,
+      "loss": 0.4941,
+      "step": 12230
+    },
+    {
+      "epoch": 19.52,
+      "grad_norm": 3.1256422996520996,
+      "learning_rate": 0.0012191387559808613,
+      "loss": 0.551,
+      "step": 12240
+    },
+    {
+      "epoch": 19.54,
+      "grad_norm": 2.1946921348571777,
+      "learning_rate": 0.0012185007974481658,
+      "loss": 0.5294,
+      "step": 12250
+    },
+    {
+      "epoch": 19.55,
+      "grad_norm": 2.897484064102173,
+      "learning_rate": 0.0012178628389154704,
+      "loss": 0.7128,
+      "step": 12260
+    },
+    {
+      "epoch": 19.57,
+      "grad_norm": 2.024834156036377,
+      "learning_rate": 0.0012172248803827752,
+      "loss": 0.5942,
+      "step": 12270
+    },
+    {
+      "epoch": 19.59,
+      "grad_norm": 1.7685123682022095,
+      "learning_rate": 0.0012165869218500797,
+      "loss": 0.5827,
+      "step": 12280
+    },
+    {
+      "epoch": 19.6,
+      "grad_norm": 2.0234525203704834,
+      "learning_rate": 0.0012159489633173843,
+      "loss": 0.6377,
+      "step": 12290
+    },
+    {
+      "epoch": 19.62,
+      "grad_norm": 4.358128070831299,
+      "learning_rate": 0.0012153110047846889,
+      "loss": 0.7277,
+      "step": 12300
+    },
+    {
+      "epoch": 19.63,
+      "grad_norm": 2.4699137210845947,
+      "learning_rate": 0.0012146730462519937,
+      "loss": 0.5023,
+      "step": 12310
+    },
+    {
+      "epoch": 19.65,
+      "grad_norm": 2.4853904247283936,
+      "learning_rate": 0.0012140350877192982,
+      "loss": 0.585,
+      "step": 12320
+    },
+    {
+      "epoch": 19.67,
+      "grad_norm": 2.560833215713501,
+      "learning_rate": 0.0012133971291866028,
+      "loss": 0.6899,
+      "step": 12330
+    },
+    {
+      "epoch": 19.68,
+      "grad_norm": 3.2827863693237305,
+      "learning_rate": 0.0012127591706539076,
+      "loss": 0.6114,
+      "step": 12340
+    },
+    {
+      "epoch": 19.7,
+      "grad_norm": 2.643315553665161,
+      "learning_rate": 0.0012121212121212121,
+      "loss": 0.6714,
+      "step": 12350
+    },
+    {
+      "epoch": 19.71,
+      "grad_norm": 2.2757856845855713,
+      "learning_rate": 0.0012114832535885167,
+      "loss": 0.5607,
+      "step": 12360
+    },
+    {
+      "epoch": 19.73,
+      "grad_norm": 2.054987668991089,
+      "learning_rate": 0.0012108452950558212,
+      "loss": 0.5481,
+      "step": 12370
+    },
+    {
+      "epoch": 19.74,
+      "grad_norm": 2.3429064750671387,
+      "learning_rate": 0.001210207336523126,
+      "loss": 0.6629,
+      "step": 12380
+    },
+    {
+      "epoch": 19.76,
+      "grad_norm": 1.6089274883270264,
+      "learning_rate": 0.0012095693779904306,
+      "loss": 0.6237,
+      "step": 12390
+    },
+    {
+      "epoch": 19.78,
+      "grad_norm": 4.483922004699707,
+      "learning_rate": 0.0012089314194577352,
+      "loss": 0.6287,
+      "step": 12400
+    },
+    {
+      "epoch": 19.79,
+      "grad_norm": 2.133923292160034,
+      "learning_rate": 0.00120829346092504,
+      "loss": 0.6648,
+      "step": 12410
+    },
+    {
+      "epoch": 19.81,
+      "grad_norm": 2.3778302669525146,
+      "learning_rate": 0.0012076555023923445,
+      "loss": 0.639,
+      "step": 12420
+    },
+    {
+      "epoch": 19.82,
+      "grad_norm": 2.589620351791382,
+      "learning_rate": 0.001207017543859649,
+      "loss": 0.6414,
+      "step": 12430
+    },
+    {
+      "epoch": 19.84,
+      "grad_norm": 1.5527355670928955,
+      "learning_rate": 0.0012063795853269536,
+      "loss": 0.6587,
+      "step": 12440
+    },
+    {
+      "epoch": 19.86,
+      "grad_norm": 3.8891091346740723,
+      "learning_rate": 0.0012057416267942584,
+      "loss": 0.6672,
+      "step": 12450
+    },
+    {
+      "epoch": 19.87,
+      "grad_norm": 2.5779592990875244,
+      "learning_rate": 0.001205103668261563,
+      "loss": 0.6224,
+      "step": 12460
+    },
+    {
+      "epoch": 19.89,
+      "grad_norm": 2.218827486038208,
+      "learning_rate": 0.0012044657097288675,
+      "loss": 0.6988,
+      "step": 12470
+    },
+    {
+      "epoch": 19.9,
+      "grad_norm": 3.827039957046509,
+      "learning_rate": 0.0012038277511961723,
+      "loss": 0.5554,
+      "step": 12480
+    },
+    {
+      "epoch": 19.92,
+      "grad_norm": 3.635878562927246,
+      "learning_rate": 0.0012031897926634769,
+      "loss": 0.6447,
+      "step": 12490
+    },
+    {
+      "epoch": 19.94,
+      "grad_norm": 1.9988211393356323,
+      "learning_rate": 0.0012025518341307814,
+      "loss": 0.5721,
+      "step": 12500
+    },
+    {
+      "epoch": 19.95,
+      "grad_norm": 4.294229984283447,
+      "learning_rate": 0.001201913875598086,
+      "loss": 0.6425,
+      "step": 12510
+    },
+    {
+      "epoch": 19.97,
+      "grad_norm": 2.2810208797454834,
+      "learning_rate": 0.0012012759170653908,
+      "loss": 0.609,
+      "step": 12520
+    },
+    {
+      "epoch": 19.98,
+      "grad_norm": 2.6013190746307373,
+      "learning_rate": 0.0012006379585326953,
+      "loss": 0.615,
+      "step": 12530
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 3.9176077842712402,
+      "learning_rate": 0.0012,
+      "loss": 0.6626,
+      "step": 12540
+    },
+    {
+      "epoch": 20.02,
+      "grad_norm": 1.4916435480117798,
+      "learning_rate": 0.0011993620414673045,
+      "loss": 0.4802,
+      "step": 12550
+    },
+    {
+      "epoch": 20.03,
+      "grad_norm": 1.8869787454605103,
+      "learning_rate": 0.0011987240829346092,
+      "loss": 0.4899,
+      "step": 12560
+    },
+    {
+      "epoch": 20.05,
+      "grad_norm": 1.645322561264038,
+      "learning_rate": 0.0011980861244019138,
+      "loss": 0.4875,
+      "step": 12570
+    },
+    {
+      "epoch": 20.06,
+      "grad_norm": 3.0053963661193848,
+      "learning_rate": 0.0011974481658692184,
+      "loss": 0.5344,
+      "step": 12580
+    },
+    {
+      "epoch": 20.08,
+      "grad_norm": 1.9125926494598389,
+      "learning_rate": 0.0011968102073365231,
+      "loss": 0.5512,
+      "step": 12590
+    },
+    {
+      "epoch": 20.1,
+      "grad_norm": 2.4130938053131104,
+      "learning_rate": 0.0011961722488038277,
+      "loss": 0.6046,
+      "step": 12600
+    },
+    {
+      "epoch": 20.11,
+      "grad_norm": 2.648345947265625,
+      "learning_rate": 0.0011955342902711323,
+      "loss": 0.5085,
+      "step": 12610
+    },
+    {
+      "epoch": 20.13,
+      "grad_norm": 3.288292646408081,
+      "learning_rate": 0.0011948963317384368,
+      "loss": 0.5101,
+      "step": 12620
+    },
+    {
+      "epoch": 20.14,
+      "grad_norm": 2.3620495796203613,
+      "learning_rate": 0.0011942583732057416,
+      "loss": 0.502,
+      "step": 12630
+    },
+    {
+      "epoch": 20.16,
+      "grad_norm": 2.2232260704040527,
+      "learning_rate": 0.0011936204146730462,
+      "loss": 0.5246,
+      "step": 12640
+    },
+    {
+      "epoch": 20.18,
+      "grad_norm": 3.120986223220825,
+      "learning_rate": 0.0011929824561403507,
+      "loss": 0.621,
+      "step": 12650
+    },
+    {
+      "epoch": 20.19,
+      "grad_norm": 1.5366686582565308,
+      "learning_rate": 0.0011923444976076555,
+      "loss": 0.4844,
+      "step": 12660
+    },
+    {
+      "epoch": 20.21,
+      "grad_norm": 2.0947461128234863,
+      "learning_rate": 0.00119170653907496,
+      "loss": 0.6025,
+      "step": 12670
+    },
+    {
+      "epoch": 20.22,
+      "grad_norm": 1.5178321599960327,
+      "learning_rate": 0.0011910685805422646,
+      "loss": 0.5421,
+      "step": 12680
+    },
+    {
+      "epoch": 20.24,
+      "grad_norm": 2.9309802055358887,
+      "learning_rate": 0.0011904306220095692,
+      "loss": 0.4816,
+      "step": 12690
+    },
+    {
+      "epoch": 20.26,
+      "grad_norm": 1.6734910011291504,
+      "learning_rate": 0.0011897926634768742,
+      "loss": 0.668,
+      "step": 12700
+    },
+    {
+      "epoch": 20.27,
+      "grad_norm": 1.755245327949524,
+      "learning_rate": 0.0011891547049441788,
+      "loss": 0.5436,
+      "step": 12710
+    },
+    {
+      "epoch": 20.29,
+      "grad_norm": 2.458543062210083,
+      "learning_rate": 0.0011885167464114833,
+      "loss": 0.5392,
+      "step": 12720
+    },
+    {
+      "epoch": 20.3,
+      "grad_norm": 2.2478575706481934,
+      "learning_rate": 0.001187878787878788,
+      "loss": 0.5686,
+      "step": 12730
+    },
+    {
+      "epoch": 20.32,
+      "grad_norm": 1.2976596355438232,
+      "learning_rate": 0.0011872408293460927,
+      "loss": 0.4879,
+      "step": 12740
+    },
+    {
+      "epoch": 20.33,
+      "grad_norm": 2.250114917755127,
+      "learning_rate": 0.0011866028708133972,
+      "loss": 0.6064,
+      "step": 12750
+    },
+    {
+      "epoch": 20.35,
+      "grad_norm": 2.2818796634674072,
+      "learning_rate": 0.0011859649122807018,
+      "loss": 0.5619,
+      "step": 12760
+    },
+    {
+      "epoch": 20.37,
+      "grad_norm": 2.3820178508758545,
+      "learning_rate": 0.0011853269537480066,
+      "loss": 0.5126,
+      "step": 12770
+    },
+    {
+      "epoch": 20.38,
+      "grad_norm": 3.2392003536224365,
+      "learning_rate": 0.0011846889952153111,
+      "loss": 0.544,
+      "step": 12780
+    },
+    {
+      "epoch": 20.4,
+      "grad_norm": 3.075946092605591,
+      "learning_rate": 0.0011840510366826157,
+      "loss": 0.6301,
+      "step": 12790
+    },
+    {
+      "epoch": 20.41,
+      "grad_norm": 4.1104230880737305,
+      "learning_rate": 0.0011834130781499205,
+      "loss": 0.6136,
+      "step": 12800
+    },
+    {
+      "epoch": 20.43,
+      "grad_norm": 1.715682029724121,
+      "learning_rate": 0.001182775119617225,
+      "loss": 0.5667,
+      "step": 12810
+    },
+    {
+      "epoch": 20.45,
+      "grad_norm": 1.5427650213241577,
+      "learning_rate": 0.0011821371610845296,
+      "loss": 0.6001,
+      "step": 12820
+    },
+    {
+      "epoch": 20.46,
+      "grad_norm": 2.109271764755249,
+      "learning_rate": 0.0011814992025518342,
+      "loss": 0.5762,
+      "step": 12830
+    },
+    {
+      "epoch": 20.48,
+      "grad_norm": 1.6117897033691406,
+      "learning_rate": 0.001180861244019139,
+      "loss": 0.5072,
+      "step": 12840
+    },
+    {
+      "epoch": 20.49,
+      "grad_norm": 2.525860071182251,
+      "learning_rate": 0.0011802232854864435,
+      "loss": 0.6302,
+      "step": 12850
+    },
+    {
+      "epoch": 20.51,
+      "grad_norm": 1.8826050758361816,
+      "learning_rate": 0.001179585326953748,
+      "loss": 0.5483,
+      "step": 12860
+    },
+    {
+      "epoch": 20.53,
+      "grad_norm": 1.7801835536956787,
+      "learning_rate": 0.0011789473684210526,
+      "loss": 0.5239,
+      "step": 12870
+    },
+    {
+      "epoch": 20.54,
+      "grad_norm": 3.124882459640503,
+      "learning_rate": 0.0011783094098883574,
+      "loss": 0.4724,
+      "step": 12880
+    },
+    {
+      "epoch": 20.56,
+      "grad_norm": 2.8056161403656006,
+      "learning_rate": 0.001177671451355662,
+      "loss": 0.5681,
+      "step": 12890
+    },
+    {
+      "epoch": 20.57,
+      "grad_norm": 2.165199041366577,
+      "learning_rate": 0.0011770334928229665,
+      "loss": 0.6012,
+      "step": 12900
+    },
+    {
+      "epoch": 20.59,
+      "grad_norm": 2.297102451324463,
+      "learning_rate": 0.0011763955342902713,
+      "loss": 0.517,
+      "step": 12910
+    },
+    {
+      "epoch": 20.61,
+      "grad_norm": 2.99562668800354,
+      "learning_rate": 0.0011757575757575759,
+      "loss": 0.5696,
+      "step": 12920
+    },
+    {
+      "epoch": 20.62,
+      "grad_norm": 2.0757791996002197,
+      "learning_rate": 0.0011751196172248804,
+      "loss": 0.6102,
+      "step": 12930
+    },
+    {
+      "epoch": 20.64,
+      "grad_norm": 2.441718816757202,
+      "learning_rate": 0.001174481658692185,
+      "loss": 0.6066,
+      "step": 12940
+    },
+    {
+      "epoch": 20.65,
+      "grad_norm": 1.4816184043884277,
+      "learning_rate": 0.0011738437001594898,
+      "loss": 0.5083,
+      "step": 12950
+    },
+    {
+      "epoch": 20.67,
+      "grad_norm": 2.349161386489868,
+      "learning_rate": 0.0011732057416267943,
+      "loss": 0.5705,
+      "step": 12960
+    },
+    {
+      "epoch": 20.69,
+      "grad_norm": 2.0626585483551025,
+      "learning_rate": 0.001172567783094099,
+      "loss": 0.5551,
+      "step": 12970
+    },
+    {
+      "epoch": 20.7,
+      "grad_norm": 2.3144423961639404,
+      "learning_rate": 0.0011719298245614037,
+      "loss": 0.7276,
+      "step": 12980
+    },
+    {
+      "epoch": 20.72,
+      "grad_norm": 2.9268980026245117,
+      "learning_rate": 0.0011712918660287083,
+      "loss": 0.6176,
+      "step": 12990
+    },
+    {
+      "epoch": 20.73,
+      "grad_norm": 2.339564323425293,
+      "learning_rate": 0.0011706539074960128,
+      "loss": 0.6206,
+      "step": 13000
+    },
+    {
+      "epoch": 20.75,
+      "grad_norm": 2.358088493347168,
+      "learning_rate": 0.0011700159489633174,
+      "loss": 0.5862,
+      "step": 13010
+    },
+    {
+      "epoch": 20.77,
+      "grad_norm": 2.127462863922119,
+      "learning_rate": 0.0011693779904306222,
+      "loss": 0.5488,
+      "step": 13020
+    },
+    {
+      "epoch": 20.78,
+      "grad_norm": 3.3488762378692627,
+      "learning_rate": 0.0011687400318979267,
+      "loss": 0.6345,
+      "step": 13030
+    },
+    {
+      "epoch": 20.8,
+      "grad_norm": 3.2236621379852295,
+      "learning_rate": 0.0011681020733652313,
+      "loss": 0.5897,
+      "step": 13040
+    },
+    {
+      "epoch": 20.81,
+      "grad_norm": 3.0065135955810547,
+      "learning_rate": 0.001167464114832536,
+      "loss": 0.6214,
+      "step": 13050
+    },
+    {
+      "epoch": 20.83,
+      "grad_norm": 1.797853946685791,
+      "learning_rate": 0.0011668261562998406,
+      "loss": 0.6224,
+      "step": 13060
+    },
+    {
+      "epoch": 20.85,
+      "grad_norm": 1.6769222021102905,
+      "learning_rate": 0.0011661881977671452,
+      "loss": 0.533,
+      "step": 13070
+    },
+    {
+      "epoch": 20.86,
+      "grad_norm": 2.111424207687378,
+      "learning_rate": 0.0011655502392344497,
+      "loss": 0.5674,
+      "step": 13080
+    },
+    {
+      "epoch": 20.88,
+      "grad_norm": 1.9882782697677612,
+      "learning_rate": 0.0011649122807017545,
+      "loss": 0.6407,
+      "step": 13090
+    },
+    {
+      "epoch": 20.89,
+      "grad_norm": 2.0077192783355713,
+      "learning_rate": 0.001164274322169059,
+      "loss": 0.4901,
+      "step": 13100
+    },
+    {
+      "epoch": 20.91,
+      "grad_norm": 1.3955817222595215,
+      "learning_rate": 0.0011636363636363637,
+      "loss": 0.6638,
+      "step": 13110
+    },
+    {
+      "epoch": 20.93,
+      "grad_norm": 2.236403226852417,
+      "learning_rate": 0.0011629984051036684,
+      "loss": 0.6267,
+      "step": 13120
+    },
+    {
+      "epoch": 20.94,
+      "grad_norm": 2.0299949645996094,
+      "learning_rate": 0.001162360446570973,
+      "loss": 0.6097,
+      "step": 13130
+    },
+    {
+      "epoch": 20.96,
+      "grad_norm": 3.4427030086517334,
+      "learning_rate": 0.0011617224880382776,
+      "loss": 0.6125,
+      "step": 13140
+    },
+    {
+      "epoch": 20.97,
+      "grad_norm": 2.427687168121338,
+      "learning_rate": 0.0011610845295055821,
+      "loss": 0.6464,
+      "step": 13150
+    },
+    {
+      "epoch": 20.99,
+      "grad_norm": 2.512589454650879,
+      "learning_rate": 0.001160446570972887,
+      "loss": 0.636,
+      "step": 13160
+    },
+    {
+      "epoch": 21.0,
+      "grad_norm": 1.6817240715026855,
+      "learning_rate": 0.0011598086124401915,
+      "loss": 0.5498,
+      "step": 13170
+    },
+    {
+      "epoch": 21.02,
+      "grad_norm": 2.4378724098205566,
+      "learning_rate": 0.001159170653907496,
+      "loss": 0.4912,
+      "step": 13180
+    },
+    {
+      "epoch": 21.04,
+      "grad_norm": 1.7349364757537842,
+      "learning_rate": 0.0011585326953748008,
+      "loss": 0.4135,
+      "step": 13190
+    },
+    {
+      "epoch": 21.05,
+      "grad_norm": 2.0071072578430176,
+      "learning_rate": 0.0011578947368421054,
+      "loss": 0.4892,
+      "step": 13200
+    },
+    {
+      "epoch": 21.07,
+      "grad_norm": 2.1567165851593018,
+      "learning_rate": 0.00115725677830941,
+      "loss": 0.4291,
+      "step": 13210
+    },
+    {
+      "epoch": 21.08,
+      "grad_norm": 1.5533453226089478,
+      "learning_rate": 0.0011566188197767145,
+      "loss": 0.4515,
+      "step": 13220
+    },
+    {
+      "epoch": 21.1,
+      "grad_norm": 1.481789231300354,
+      "learning_rate": 0.0011559808612440193,
+      "loss": 0.4933,
+      "step": 13230
+    },
+    {
+      "epoch": 21.12,
+      "grad_norm": 1.985859990119934,
+      "learning_rate": 0.0011553429027113238,
+      "loss": 0.6186,
+      "step": 13240
+    },
+    {
+      "epoch": 21.13,
+      "grad_norm": 0.9559075236320496,
+      "learning_rate": 0.0011547049441786284,
+      "loss": 0.4367,
+      "step": 13250
+    },
+    {
+      "epoch": 21.15,
+      "grad_norm": 3.953303575515747,
+      "learning_rate": 0.001154066985645933,
+      "loss": 0.5434,
+      "step": 13260
+    },
+    {
+      "epoch": 21.16,
+      "grad_norm": 1.7408164739608765,
+      "learning_rate": 0.0011534290271132377,
+      "loss": 0.5025,
+      "step": 13270
+    },
+    {
+      "epoch": 21.18,
+      "grad_norm": 2.5240061283111572,
+      "learning_rate": 0.0011527910685805423,
+      "loss": 0.5206,
+      "step": 13280
+    },
+    {
+      "epoch": 21.2,
+      "grad_norm": 1.7967180013656616,
+      "learning_rate": 0.0011521531100478469,
+      "loss": 0.4679,
+      "step": 13290
+    },
+    {
+      "epoch": 21.21,
+      "grad_norm": 1.5482749938964844,
+      "learning_rate": 0.0011515151515151516,
+      "loss": 0.5074,
+      "step": 13300
+    },
+    {
+      "epoch": 21.23,
+      "grad_norm": 2.0703771114349365,
+      "learning_rate": 0.0011508771929824562,
+      "loss": 0.5012,
+      "step": 13310
+    },
+    {
+      "epoch": 21.24,
+      "grad_norm": 1.4565823078155518,
+      "learning_rate": 0.0011502392344497608,
+      "loss": 0.5855,
+      "step": 13320
+    },
+    {
+      "epoch": 21.26,
+      "grad_norm": 2.0159592628479004,
+      "learning_rate": 0.0011496012759170653,
+      "loss": 0.5762,
+      "step": 13330
+    },
+    {
+      "epoch": 21.28,
+      "grad_norm": 1.8826504945755005,
+      "learning_rate": 0.0011489633173843701,
+      "loss": 0.5274,
+      "step": 13340
+    },
+    {
+      "epoch": 21.29,
+      "grad_norm": 1.7150112390518188,
+      "learning_rate": 0.0011483253588516747,
+      "loss": 0.5659,
+      "step": 13350
+    },
+    {
+      "epoch": 21.31,
+      "grad_norm": 2.8087666034698486,
+      "learning_rate": 0.0011476874003189792,
+      "loss": 0.4906,
+      "step": 13360
+    },
+    {
+      "epoch": 21.32,
+      "grad_norm": 1.7748334407806396,
+      "learning_rate": 0.001147049441786284,
+      "loss": 0.4622,
+      "step": 13370
+    },
+    {
+      "epoch": 21.34,
+      "grad_norm": 2.2863359451293945,
+      "learning_rate": 0.0011464114832535886,
+      "loss": 0.5015,
+      "step": 13380
+    },
+    {
+      "epoch": 21.36,
+      "grad_norm": 2.4490015506744385,
+      "learning_rate": 0.0011457735247208931,
+      "loss": 0.5743,
+      "step": 13390
+    },
+    {
+      "epoch": 21.37,
+      "grad_norm": 1.4806760549545288,
+      "learning_rate": 0.0011451355661881977,
+      "loss": 0.516,
+      "step": 13400
+    },
+    {
+      "epoch": 21.39,
+      "grad_norm": 1.909926176071167,
+      "learning_rate": 0.0011444976076555025,
+      "loss": 0.516,
+      "step": 13410
+    },
+    {
+      "epoch": 21.4,
+      "grad_norm": 2.3129677772521973,
+      "learning_rate": 0.001143859649122807,
+      "loss": 0.5169,
+      "step": 13420
+    },
+    {
+      "epoch": 21.42,
+      "grad_norm": 2.589088201522827,
+      "learning_rate": 0.0011432216905901116,
+      "loss": 0.5535,
+      "step": 13430
+    },
+    {
+      "epoch": 21.44,
+      "grad_norm": 2.4051127433776855,
+      "learning_rate": 0.0011425837320574164,
+      "loss": 0.5236,
+      "step": 13440
+    },
+    {
+      "epoch": 21.45,
+      "grad_norm": 2.466587781906128,
+      "learning_rate": 0.001141945773524721,
+      "loss": 0.5304,
+      "step": 13450
+    },
+    {
+      "epoch": 21.47,
+      "grad_norm": 1.5987040996551514,
+      "learning_rate": 0.0011413078149920255,
+      "loss": 0.4916,
+      "step": 13460
+    },
+    {
+      "epoch": 21.48,
+      "grad_norm": 3.281262159347534,
+      "learning_rate": 0.00114066985645933,
+      "loss": 0.5194,
+      "step": 13470
+    },
+    {
+      "epoch": 21.5,
+      "grad_norm": 2.3112425804138184,
+      "learning_rate": 0.0011400318979266349,
+      "loss": 0.5282,
+      "step": 13480
+    },
+    {
+      "epoch": 21.52,
+      "grad_norm": 1.8901677131652832,
+      "learning_rate": 0.0011393939393939394,
+      "loss": 0.5697,
+      "step": 13490
+    },
+    {
+      "epoch": 21.53,
+      "grad_norm": 2.8748323917388916,
+      "learning_rate": 0.001138755980861244,
+      "loss": 0.5219,
+      "step": 13500
+    },
+    {
+      "epoch": 21.55,
+      "grad_norm": 2.162447690963745,
+      "learning_rate": 0.0011381180223285488,
+      "loss": 0.5469,
+      "step": 13510
+    },
+    {
+      "epoch": 21.56,
+      "grad_norm": 2.3993029594421387,
+      "learning_rate": 0.0011374800637958533,
+      "loss": 0.5083,
+      "step": 13520
+    },
+    {
+      "epoch": 21.58,
+      "grad_norm": 2.262704372406006,
+      "learning_rate": 0.0011368421052631579,
+      "loss": 0.5152,
+      "step": 13530
+    },
+    {
+      "epoch": 21.59,
+      "grad_norm": 1.8415032625198364,
+      "learning_rate": 0.0011362041467304624,
+      "loss": 0.6413,
+      "step": 13540
+    },
+    {
+      "epoch": 21.61,
+      "grad_norm": 1.7143352031707764,
+      "learning_rate": 0.0011355661881977672,
+      "loss": 0.5897,
+      "step": 13550
+    },
+    {
+      "epoch": 21.63,
+      "grad_norm": 1.9199092388153076,
+      "learning_rate": 0.0011349282296650718,
+      "loss": 0.5259,
+      "step": 13560
+    },
+    {
+      "epoch": 21.64,
+      "grad_norm": 1.6734964847564697,
+      "learning_rate": 0.0011342902711323764,
+      "loss": 0.5658,
+      "step": 13570
+    },
+    {
+      "epoch": 21.66,
+      "grad_norm": 2.817392349243164,
+      "learning_rate": 0.001133652312599681,
+      "loss": 0.6579,
+      "step": 13580
+    },
+    {
+      "epoch": 21.67,
+      "grad_norm": 3.3291382789611816,
+      "learning_rate": 0.0011330143540669857,
+      "loss": 0.603,
+      "step": 13590
+    },
+    {
+      "epoch": 21.69,
+      "grad_norm": 2.5923471450805664,
+      "learning_rate": 0.0011323763955342903,
+      "loss": 0.659,
+      "step": 13600
+    },
+    {
+      "epoch": 21.71,
+      "grad_norm": 2.979832410812378,
+      "learning_rate": 0.0011317384370015948,
+      "loss": 0.6129,
+      "step": 13610
+    },
+    {
+      "epoch": 21.72,
+      "grad_norm": 3.666498899459839,
+      "learning_rate": 0.0011311004784688996,
+      "loss": 0.7636,
+      "step": 13620
+    },
+    {
+      "epoch": 21.74,
+      "grad_norm": 1.8010962009429932,
+      "learning_rate": 0.0011304625199362042,
+      "loss": 0.5779,
+      "step": 13630
+    },
+    {
+      "epoch": 21.75,
+      "grad_norm": 2.430271625518799,
+      "learning_rate": 0.0011298245614035087,
+      "loss": 0.5292,
+      "step": 13640
+    },
+    {
+      "epoch": 21.77,
+      "grad_norm": 2.2051026821136475,
+      "learning_rate": 0.0011291866028708133,
+      "loss": 0.5498,
+      "step": 13650
+    },
+    {
+      "epoch": 21.79,
+      "grad_norm": 3.7122042179107666,
+      "learning_rate": 0.001128548644338118,
+      "loss": 0.5772,
+      "step": 13660
+    },
+    {
+      "epoch": 21.8,
+      "grad_norm": 2.4475326538085938,
+      "learning_rate": 0.0011279106858054226,
+      "loss": 0.5221,
+      "step": 13670
+    },
+    {
+      "epoch": 21.82,
+      "grad_norm": 2.862783193588257,
+      "learning_rate": 0.0011272727272727272,
+      "loss": 0.5799,
+      "step": 13680
+    },
+    {
+      "epoch": 21.83,
+      "grad_norm": 2.2433278560638428,
+      "learning_rate": 0.001126634768740032,
+      "loss": 0.5334,
+      "step": 13690
+    },
+    {
+      "epoch": 21.85,
+      "grad_norm": 2.5554163455963135,
+      "learning_rate": 0.0011259968102073365,
+      "loss": 0.5254,
+      "step": 13700
+    },
+    {
+      "epoch": 21.87,
+      "grad_norm": 2.6535990238189697,
+      "learning_rate": 0.001125358851674641,
+      "loss": 0.6575,
+      "step": 13710
+    },
+    {
+      "epoch": 21.88,
+      "grad_norm": 2.348066806793213,
+      "learning_rate": 0.0011247208931419457,
+      "loss": 0.5326,
+      "step": 13720
+    },
+    {
+      "epoch": 21.9,
+      "grad_norm": 2.7629575729370117,
+      "learning_rate": 0.0011240829346092504,
+      "loss": 0.5429,
+      "step": 13730
+    },
+    {
+      "epoch": 21.91,
+      "grad_norm": 1.7561380863189697,
+      "learning_rate": 0.001123444976076555,
+      "loss": 0.612,
+      "step": 13740
+    },
+    {
+      "epoch": 21.93,
+      "grad_norm": 1.6795223951339722,
+      "learning_rate": 0.0011228070175438596,
+      "loss": 0.5954,
+      "step": 13750
+    },
+    {
+      "epoch": 21.95,
+      "grad_norm": 2.316612958908081,
+      "learning_rate": 0.0011221690590111643,
+      "loss": 0.6165,
+      "step": 13760
+    },
+    {
+      "epoch": 21.96,
+      "grad_norm": 2.91849422454834,
+      "learning_rate": 0.001121531100478469,
+      "loss": 0.5407,
+      "step": 13770
+    },
+    {
+      "epoch": 21.98,
+      "grad_norm": 1.6966789960861206,
+      "learning_rate": 0.0011208931419457735,
+      "loss": 0.5186,
+      "step": 13780
+    },
+    {
+      "epoch": 21.99,
+      "grad_norm": 2.0186002254486084,
+      "learning_rate": 0.001120255183413078,
+      "loss": 0.6633,
+      "step": 13790
+    },
+    {
+      "epoch": 22.01,
+      "grad_norm": 1.466770052909851,
+      "learning_rate": 0.0011196172248803828,
+      "loss": 0.4628,
+      "step": 13800
+    },
+    {
+      "epoch": 22.03,
+      "grad_norm": 1.2927073240280151,
+      "learning_rate": 0.0011189792663476874,
+      "loss": 0.3833,
+      "step": 13810
+    },
+    {
+      "epoch": 22.04,
+      "grad_norm": 1.5075204372406006,
+      "learning_rate": 0.001118341307814992,
+      "loss": 0.4408,
+      "step": 13820
+    },
+    {
+      "epoch": 22.06,
+      "grad_norm": 1.5921709537506104,
+      "learning_rate": 0.0011177033492822967,
+      "loss": 0.4546,
+      "step": 13830
+    },
+    {
+      "epoch": 22.07,
+      "grad_norm": 1.4669833183288574,
+      "learning_rate": 0.0011170653907496013,
+      "loss": 0.4423,
+      "step": 13840
+    },
+    {
+      "epoch": 22.09,
+      "grad_norm": 2.846984624862671,
+      "learning_rate": 0.0011164274322169058,
+      "loss": 0.5419,
+      "step": 13850
+    },
+    {
+      "epoch": 22.11,
+      "grad_norm": 1.1254881620407104,
+      "learning_rate": 0.0011157894736842104,
+      "loss": 0.4977,
+      "step": 13860
+    },
+    {
+      "epoch": 22.12,
+      "grad_norm": 1.3367946147918701,
+      "learning_rate": 0.0011151515151515152,
+      "loss": 0.4496,
+      "step": 13870
+    },
+    {
+      "epoch": 22.14,
+      "grad_norm": 1.633335828781128,
+      "learning_rate": 0.0011145135566188197,
+      "loss": 0.4433,
+      "step": 13880
+    },
+    {
+      "epoch": 22.15,
+      "grad_norm": 2.3413655757904053,
+      "learning_rate": 0.0011138755980861243,
+      "loss": 0.5003,
+      "step": 13890
+    },
+    {
+      "epoch": 22.17,
+      "grad_norm": 2.336428642272949,
+      "learning_rate": 0.001113237639553429,
+      "loss": 0.532,
+      "step": 13900
+    },
+    {
+      "epoch": 22.19,
+      "grad_norm": 1.713782787322998,
+      "learning_rate": 0.0011125996810207337,
+      "loss": 0.4474,
+      "step": 13910
+    },
+    {
+      "epoch": 22.2,
+      "grad_norm": 2.55415678024292,
+      "learning_rate": 0.0011119617224880382,
+      "loss": 0.4168,
+      "step": 13920
+    },
+    {
+      "epoch": 22.22,
+      "grad_norm": 1.7358187437057495,
+      "learning_rate": 0.0011113237639553428,
+      "loss": 0.5048,
+      "step": 13930
+    },
+    {
+      "epoch": 22.23,
+      "grad_norm": 1.6725515127182007,
+      "learning_rate": 0.0011106858054226476,
+      "loss": 0.508,
+      "step": 13940
+    },
+    {
+      "epoch": 22.25,
+      "grad_norm": 1.3164896965026855,
+      "learning_rate": 0.0011100478468899521,
+      "loss": 0.463,
+      "step": 13950
+    },
+    {
+      "epoch": 22.26,
+      "grad_norm": 1.724993348121643,
+      "learning_rate": 0.0011094098883572567,
+      "loss": 0.5165,
+      "step": 13960
+    },
+    {
+      "epoch": 22.28,
+      "grad_norm": 3.1152279376983643,
+      "learning_rate": 0.0011087719298245612,
+      "loss": 0.4615,
+      "step": 13970
+    },
+    {
+      "epoch": 22.3,
+      "grad_norm": 2.1127662658691406,
+      "learning_rate": 0.001108133971291866,
+      "loss": 0.4926,
+      "step": 13980
+    },
+    {
+      "epoch": 22.31,
+      "grad_norm": 2.012160062789917,
+      "learning_rate": 0.0011074960127591706,
+      "loss": 0.4331,
+      "step": 13990
+    },
+    {
+      "epoch": 22.33,
+      "grad_norm": 1.761988639831543,
+      "learning_rate": 0.0011068580542264752,
+      "loss": 0.4927,
+      "step": 14000
+    },
+    {
+      "epoch": 22.34,
+      "grad_norm": 1.8735899925231934,
+      "learning_rate": 0.00110622009569378,
+      "loss": 0.5162,
+      "step": 14010
+    },
+    {
+      "epoch": 22.36,
+      "grad_norm": 1.9117660522460938,
+      "learning_rate": 0.0011055821371610845,
+      "loss": 0.4583,
+      "step": 14020
+    },
+    {
+      "epoch": 22.38,
+      "grad_norm": 1.598494291305542,
+      "learning_rate": 0.001104944178628389,
+      "loss": 0.4943,
+      "step": 14030
+    },
+    {
+      "epoch": 22.39,
+      "grad_norm": 1.6611143350601196,
+      "learning_rate": 0.0011043062200956936,
+      "loss": 0.4889,
+      "step": 14040
+    },
+    {
+      "epoch": 22.41,
+      "grad_norm": 2.4984424114227295,
+      "learning_rate": 0.0011036682615629984,
+      "loss": 0.4838,
+      "step": 14050
+    },
+    {
+      "epoch": 22.42,
+      "grad_norm": 2.082078695297241,
+      "learning_rate": 0.001103030303030303,
+      "loss": 0.6166,
+      "step": 14060
+    },
+    {
+      "epoch": 22.44,
+      "grad_norm": 2.6350715160369873,
+      "learning_rate": 0.0011023923444976075,
+      "loss": 0.5082,
+      "step": 14070
+    },
+    {
+      "epoch": 22.46,
+      "grad_norm": 1.6463345289230347,
+      "learning_rate": 0.0011017543859649123,
+      "loss": 0.4486,
+      "step": 14080
+    },
+    {
+      "epoch": 22.47,
+      "grad_norm": 2.0142619609832764,
+      "learning_rate": 0.0011011164274322169,
+      "loss": 0.5573,
+      "step": 14090
+    },
+    {
+      "epoch": 22.49,
+      "grad_norm": 2.3120744228363037,
+      "learning_rate": 0.0011004784688995214,
+      "loss": 0.5478,
+      "step": 14100
+    },
+    {
+      "epoch": 22.5,
+      "grad_norm": 1.7484601736068726,
+      "learning_rate": 0.001099840510366826,
+      "loss": 0.5557,
+      "step": 14110
+    },
+    {
+      "epoch": 22.52,
+      "grad_norm": 2.2994306087493896,
+      "learning_rate": 0.0010992025518341308,
+      "loss": 0.507,
+      "step": 14120
+    },
+    {
+      "epoch": 22.54,
+      "grad_norm": 3.1111643314361572,
+      "learning_rate": 0.0010985645933014353,
+      "loss": 0.783,
+      "step": 14130
+    },
+    {
+      "epoch": 22.55,
+      "grad_norm": 2.3941569328308105,
+      "learning_rate": 0.00109792663476874,
+      "loss": 0.6618,
+      "step": 14140
+    },
+    {
+      "epoch": 22.57,
+      "grad_norm": 1.893367052078247,
+      "learning_rate": 0.0010972886762360447,
+      "loss": 0.5318,
+      "step": 14150
+    },
+    {
+      "epoch": 22.58,
+      "grad_norm": 2.1536896228790283,
+      "learning_rate": 0.0010966507177033492,
+      "loss": 0.5581,
+      "step": 14160
+    },
+    {
+      "epoch": 22.6,
+      "grad_norm": 2.7636032104492188,
+      "learning_rate": 0.0010960127591706538,
+      "loss": 0.5706,
+      "step": 14170
+    },
+    {
+      "epoch": 22.62,
+      "grad_norm": 2.516028642654419,
+      "learning_rate": 0.0010953748006379584,
+      "loss": 0.5557,
+      "step": 14180
+    },
+    {
+      "epoch": 22.63,
+      "grad_norm": 1.5299115180969238,
+      "learning_rate": 0.0010947368421052634,
+      "loss": 0.5118,
+      "step": 14190
+    },
+    {
+      "epoch": 22.65,
+      "grad_norm": 2.1962053775787354,
+      "learning_rate": 0.001094098883572568,
+      "loss": 0.5678,
+      "step": 14200
+    },
+    {
+      "epoch": 22.66,
+      "grad_norm": 4.639540195465088,
+      "learning_rate": 0.0010934609250398725,
+      "loss": 0.4226,
+      "step": 14210
+    },
+    {
+      "epoch": 22.68,
+      "grad_norm": 3.8349008560180664,
+      "learning_rate": 0.0010928229665071773,
+      "loss": 0.5659,
+      "step": 14220
+    },
+    {
+      "epoch": 22.7,
+      "grad_norm": 2.3924553394317627,
+      "learning_rate": 0.0010921850079744818,
+      "loss": 0.5681,
+      "step": 14230
+    },
+    {
+      "epoch": 22.71,
+      "grad_norm": 3.5269014835357666,
+      "learning_rate": 0.0010915470494417864,
+      "loss": 0.5181,
+      "step": 14240
+    },
+    {
+      "epoch": 22.73,
+      "grad_norm": 2.35038685798645,
+      "learning_rate": 0.001090909090909091,
+      "loss": 0.4825,
+      "step": 14250
+    },
+    {
+      "epoch": 22.74,
+      "grad_norm": 2.1526710987091064,
+      "learning_rate": 0.0010902711323763957,
+      "loss": 0.5347,
+      "step": 14260
+    },
+    {
+      "epoch": 22.76,
+      "grad_norm": 2.3087081909179688,
+      "learning_rate": 0.0010896331738437003,
+      "loss": 0.6681,
+      "step": 14270
+    },
+    {
+      "epoch": 22.78,
+      "grad_norm": 1.9781696796417236,
+      "learning_rate": 0.0010889952153110049,
+      "loss": 0.5765,
+      "step": 14280
+    },
+    {
+      "epoch": 22.79,
+      "grad_norm": 2.716538429260254,
+      "learning_rate": 0.0010883572567783094,
+      "loss": 0.6403,
+      "step": 14290
+    },
+    {
+      "epoch": 22.81,
+      "grad_norm": 2.0449490547180176,
+      "learning_rate": 0.0010877192982456142,
+      "loss": 0.4586,
+      "step": 14300
+    },
+    {
+      "epoch": 22.82,
+      "grad_norm": 2.0720322132110596,
+      "learning_rate": 0.0010870813397129188,
+      "loss": 0.597,
+      "step": 14310
+    },
+    {
+      "epoch": 22.84,
+      "grad_norm": 1.6174436807632446,
+      "learning_rate": 0.0010864433811802233,
+      "loss": 0.4503,
+      "step": 14320
+    },
+    {
+      "epoch": 22.85,
+      "grad_norm": 2.284149169921875,
+      "learning_rate": 0.001085805422647528,
+      "loss": 0.554,
+      "step": 14330
+    },
+    {
+      "epoch": 22.87,
+      "grad_norm": 1.6513159275054932,
+      "learning_rate": 0.0010851674641148327,
+      "loss": 0.4929,
+      "step": 14340
+    },
+    {
+      "epoch": 22.89,
+      "grad_norm": 3.105323076248169,
+      "learning_rate": 0.0010845295055821372,
+      "loss": 0.601,
+      "step": 14350
+    },
+    {
+      "epoch": 22.9,
+      "grad_norm": 1.6782584190368652,
+      "learning_rate": 0.0010838915470494418,
+      "loss": 0.5175,
+      "step": 14360
+    },
+    {
+      "epoch": 22.92,
+      "grad_norm": 2.065708875656128,
+      "learning_rate": 0.0010832535885167466,
+      "loss": 0.5765,
+      "step": 14370
+    },
+    {
+      "epoch": 22.93,
+      "grad_norm": 3.1577556133270264,
+      "learning_rate": 0.0010826156299840511,
+      "loss": 0.6024,
+      "step": 14380
+    },
+    {
+      "epoch": 22.95,
+      "grad_norm": 3.8669426441192627,
+      "learning_rate": 0.0010819776714513557,
+      "loss": 0.5703,
+      "step": 14390
+    },
+    {
+      "epoch": 22.97,
+      "grad_norm": 2.084577798843384,
+      "learning_rate": 0.0010813397129186605,
+      "loss": 0.5736,
+      "step": 14400
+    },
+    {
+      "epoch": 22.98,
+      "grad_norm": 2.3322348594665527,
+      "learning_rate": 0.001080701754385965,
+      "loss": 0.4955,
+      "step": 14410
+    },
+    {
+      "epoch": 23.0,
+      "grad_norm": 2.981834650039673,
+      "learning_rate": 0.0010800637958532696,
+      "loss": 0.5941,
+      "step": 14420
+    },
+    {
+      "epoch": 23.01,
+      "grad_norm": 1.649495244026184,
+      "learning_rate": 0.0010794258373205742,
+      "loss": 0.4623,
+      "step": 14430
+    },
+    {
+      "epoch": 23.03,
+      "grad_norm": 2.4361202716827393,
+      "learning_rate": 0.001078787878787879,
+      "loss": 0.5178,
+      "step": 14440
+    },
+    {
+      "epoch": 23.05,
+      "grad_norm": 1.9195847511291504,
+      "learning_rate": 0.0010781499202551835,
+      "loss": 0.5056,
+      "step": 14450
+    },
+    {
+      "epoch": 23.06,
+      "grad_norm": 1.472584843635559,
+      "learning_rate": 0.001077511961722488,
+      "loss": 0.422,
+      "step": 14460
+    },
+    {
+      "epoch": 23.08,
+      "grad_norm": 1.9220826625823975,
+      "learning_rate": 0.0010768740031897928,
+      "loss": 0.4461,
+      "step": 14470
+    },
+    {
+      "epoch": 23.09,
+      "grad_norm": 2.0163981914520264,
+      "learning_rate": 0.0010762360446570974,
+      "loss": 0.405,
+      "step": 14480
+    },
+    {
+      "epoch": 23.11,
+      "grad_norm": 2.0835061073303223,
+      "learning_rate": 0.001075598086124402,
+      "loss": 0.5177,
+      "step": 14490
+    },
+    {
+      "epoch": 23.13,
+      "grad_norm": 0.9891412258148193,
+      "learning_rate": 0.0010749601275917065,
+      "loss": 0.5585,
+      "step": 14500
+    },
+    {
+      "epoch": 23.14,
+      "grad_norm": 1.3112674951553345,
+      "learning_rate": 0.0010743221690590113,
+      "loss": 0.5358,
+      "step": 14510
+    },
+    {
+      "epoch": 23.16,
+      "grad_norm": 1.358392357826233,
+      "learning_rate": 0.0010736842105263159,
+      "loss": 0.3928,
+      "step": 14520
+    },
+    {
+      "epoch": 23.17,
+      "grad_norm": 1.7104527950286865,
+      "learning_rate": 0.0010730462519936204,
+      "loss": 0.4469,
+      "step": 14530
+    },
+    {
+      "epoch": 23.19,
+      "grad_norm": 2.007497787475586,
+      "learning_rate": 0.0010724082934609252,
+      "loss": 0.4123,
+      "step": 14540
+    },
+    {
+      "epoch": 23.21,
+      "grad_norm": 2.2213757038116455,
+      "learning_rate": 0.0010717703349282298,
+      "loss": 0.4139,
+      "step": 14550
+    },
+    {
+      "epoch": 23.22,
+      "grad_norm": 1.6128385066986084,
+      "learning_rate": 0.0010711323763955343,
+      "loss": 0.4715,
+      "step": 14560
+    },
+    {
+      "epoch": 23.24,
+      "grad_norm": 1.6998387575149536,
+      "learning_rate": 0.001070494417862839,
+      "loss": 0.433,
+      "step": 14570
+    },
+    {
+      "epoch": 23.25,
+      "grad_norm": 1.7560913562774658,
+      "learning_rate": 0.0010698564593301437,
+      "loss": 0.5458,
+      "step": 14580
+    },
+    {
+      "epoch": 23.27,
+      "grad_norm": 1.0924944877624512,
+      "learning_rate": 0.0010692185007974483,
+      "loss": 0.4552,
+      "step": 14590
+    },
+    {
+      "epoch": 23.29,
+      "grad_norm": 1.2721997499465942,
+      "learning_rate": 0.0010685805422647528,
+      "loss": 0.4142,
+      "step": 14600
+    },
+    {
+      "epoch": 23.3,
+      "grad_norm": 1.5277657508850098,
+      "learning_rate": 0.0010679425837320576,
+      "loss": 0.4749,
+      "step": 14610
+    },
+    {
+      "epoch": 23.32,
+      "grad_norm": 1.4912691116333008,
+      "learning_rate": 0.0010673046251993622,
+      "loss": 0.4994,
+      "step": 14620
+    },
+    {
+      "epoch": 23.33,
+      "grad_norm": 2.7884340286254883,
+      "learning_rate": 0.0010666666666666667,
+      "loss": 0.4654,
+      "step": 14630
+    },
+    {
+      "epoch": 23.35,
+      "grad_norm": 3.288153886795044,
+      "learning_rate": 0.0010660287081339713,
+      "loss": 0.4726,
+      "step": 14640
+    },
+    {
+      "epoch": 23.37,
+      "grad_norm": 1.869439721107483,
+      "learning_rate": 0.001065390749601276,
+      "loss": 0.5029,
+      "step": 14650
+    },
+    {
+      "epoch": 23.38,
+      "grad_norm": 1.9574953317642212,
+      "learning_rate": 0.0010647527910685806,
+      "loss": 0.5279,
+      "step": 14660
+    },
+    {
+      "epoch": 23.4,
+      "grad_norm": 3.001887321472168,
+      "learning_rate": 0.0010641148325358852,
+      "loss": 0.5283,
+      "step": 14670
+    },
+    {
+      "epoch": 23.41,
+      "grad_norm": 3.0924551486968994,
+      "learning_rate": 0.0010634768740031897,
+      "loss": 0.4474,
+      "step": 14680
+    },
+    {
+      "epoch": 23.43,
+      "grad_norm": 1.803222894668579,
+      "learning_rate": 0.0010628389154704945,
+      "loss": 0.4873,
+      "step": 14690
+    },
+    {
+      "epoch": 23.44,
+      "grad_norm": 2.564887762069702,
+      "learning_rate": 0.001062200956937799,
+      "loss": 0.4802,
+      "step": 14700
+    },
+    {
+      "epoch": 23.46,
+      "grad_norm": 2.3837051391601562,
+      "learning_rate": 0.0010615629984051037,
+      "loss": 0.5515,
+      "step": 14710
+    },
+    {
+      "epoch": 23.48,
+      "grad_norm": 2.775334358215332,
+      "learning_rate": 0.0010609250398724084,
+      "loss": 0.5124,
+      "step": 14720
+    },
+    {
+      "epoch": 23.49,
+      "grad_norm": 2.805455207824707,
+      "learning_rate": 0.001060287081339713,
+      "loss": 0.5103,
+      "step": 14730
+    },
+    {
+      "epoch": 23.51,
+      "grad_norm": 4.685495376586914,
+      "learning_rate": 0.0010596491228070176,
+      "loss": 0.5481,
+      "step": 14740
+    },
+    {
+      "epoch": 23.52,
+      "grad_norm": 1.6772174835205078,
+      "learning_rate": 0.0010590111642743221,
+      "loss": 0.5023,
+      "step": 14750
+    },
+    {
+      "epoch": 23.54,
+      "grad_norm": 3.1417901515960693,
+      "learning_rate": 0.001058373205741627,
+      "loss": 0.4909,
+      "step": 14760
+    },
+    {
+      "epoch": 23.56,
+      "grad_norm": 2.6341207027435303,
+      "learning_rate": 0.0010577352472089315,
+      "loss": 0.577,
+      "step": 14770
+    },
+    {
+      "epoch": 23.57,
+      "grad_norm": 1.981137990951538,
+      "learning_rate": 0.001057097288676236,
+      "loss": 0.527,
+      "step": 14780
+    },
+    {
+      "epoch": 23.59,
+      "grad_norm": 1.8690191507339478,
+      "learning_rate": 0.0010564593301435408,
+      "loss": 0.6524,
+      "step": 14790
+    },
+    {
+      "epoch": 23.6,
+      "grad_norm": 2.470585584640503,
+      "learning_rate": 0.0010558213716108454,
+      "loss": 0.5171,
+      "step": 14800
+    },
+    {
+      "epoch": 23.62,
+      "grad_norm": 1.5225473642349243,
+      "learning_rate": 0.00105518341307815,
+      "loss": 0.574,
+      "step": 14810
+    },
+    {
+      "epoch": 23.64,
+      "grad_norm": 1.244357705116272,
+      "learning_rate": 0.0010545454545454545,
+      "loss": 0.513,
+      "step": 14820
+    },
+    {
+      "epoch": 23.65,
+      "grad_norm": 3.7984049320220947,
+      "learning_rate": 0.0010539074960127593,
+      "loss": 0.4659,
+      "step": 14830
+    },
+    {
+      "epoch": 23.67,
+      "grad_norm": 2.2695350646972656,
+      "learning_rate": 0.0010532695374800638,
+      "loss": 0.5469,
+      "step": 14840
+    },
+    {
+      "epoch": 23.68,
+      "grad_norm": 2.1727049350738525,
+      "learning_rate": 0.0010526315789473684,
+      "loss": 0.4998,
+      "step": 14850
+    },
+    {
+      "epoch": 23.7,
+      "grad_norm": 2.2124183177948,
+      "learning_rate": 0.0010519936204146732,
+      "loss": 0.4202,
+      "step": 14860
+    },
+    {
+      "epoch": 23.72,
+      "grad_norm": 1.9910480976104736,
+      "learning_rate": 0.0010513556618819777,
+      "loss": 0.4944,
+      "step": 14870
+    },
+    {
+      "epoch": 23.73,
+      "grad_norm": 2.623316526412964,
+      "learning_rate": 0.0010507177033492823,
+      "loss": 0.5959,
+      "step": 14880
+    },
+    {
+      "epoch": 23.75,
+      "grad_norm": 3.7587718963623047,
+      "learning_rate": 0.0010500797448165869,
+      "loss": 0.5584,
+      "step": 14890
+    },
+    {
+      "epoch": 23.76,
+      "grad_norm": 2.0342280864715576,
+      "learning_rate": 0.0010494417862838916,
+      "loss": 0.5426,
+      "step": 14900
+    },
+    {
+      "epoch": 23.78,
+      "grad_norm": 2.8675320148468018,
+      "learning_rate": 0.0010488038277511962,
+      "loss": 0.6461,
+      "step": 14910
+    },
+    {
+      "epoch": 23.8,
+      "grad_norm": 2.026543617248535,
+      "learning_rate": 0.0010481658692185008,
+      "loss": 0.5184,
+      "step": 14920
+    },
+    {
+      "epoch": 23.81,
+      "grad_norm": 2.560939073562622,
+      "learning_rate": 0.0010475279106858055,
+      "loss": 0.5449,
+      "step": 14930
+    },
+    {
+      "epoch": 23.83,
+      "grad_norm": 2.085392951965332,
+      "learning_rate": 0.0010468899521531101,
+      "loss": 0.5984,
+      "step": 14940
+    },
+    {
+      "epoch": 23.84,
+      "grad_norm": 2.2556986808776855,
+      "learning_rate": 0.0010462519936204147,
+      "loss": 0.4763,
+      "step": 14950
+    },
+    {
+      "epoch": 23.86,
+      "grad_norm": 1.4370797872543335,
+      "learning_rate": 0.0010456140350877192,
+      "loss": 0.5311,
+      "step": 14960
+    },
+    {
+      "epoch": 23.88,
+      "grad_norm": 1.252243161201477,
+      "learning_rate": 0.001044976076555024,
+      "loss": 0.4934,
+      "step": 14970
+    },
+    {
+      "epoch": 23.89,
+      "grad_norm": 2.0001296997070312,
+      "learning_rate": 0.0010443381180223286,
+      "loss": 0.4836,
+      "step": 14980
+    },
+    {
+      "epoch": 23.91,
+      "grad_norm": 2.259216070175171,
+      "learning_rate": 0.0010437001594896331,
+      "loss": 0.5749,
+      "step": 14990
+    },
+    {
+      "epoch": 23.92,
+      "grad_norm": 1.5871505737304688,
+      "learning_rate": 0.0010430622009569377,
+      "loss": 0.5055,
+      "step": 15000
+    },
+    {
+      "epoch": 23.94,
+      "grad_norm": 3.0217132568359375,
+      "learning_rate": 0.0010424242424242425,
+      "loss": 0.4674,
+      "step": 15010
+    },
+    {
+      "epoch": 23.96,
+      "grad_norm": 2.425215482711792,
+      "learning_rate": 0.001041786283891547,
+      "loss": 0.4853,
+      "step": 15020
+    },
+    {
+      "epoch": 23.97,
+      "grad_norm": 2.7950572967529297,
+      "learning_rate": 0.0010411483253588516,
+      "loss": 0.5244,
+      "step": 15030
+    },
+    {
+      "epoch": 23.99,
+      "grad_norm": 1.8970431089401245,
+      "learning_rate": 0.0010405103668261564,
+      "loss": 0.538,
+      "step": 15040
+    },
+    {
+      "epoch": 24.0,
+      "grad_norm": 0.7786374688148499,
+      "learning_rate": 0.001039872408293461,
+      "loss": 0.4562,
+      "step": 15050
+    },
+    {
+      "epoch": 24.02,
+      "grad_norm": 1.385309100151062,
+      "learning_rate": 0.0010392344497607655,
+      "loss": 0.4353,
+      "step": 15060
+    },
+    {
+      "epoch": 24.04,
+      "grad_norm": 3.768200397491455,
+      "learning_rate": 0.00103859649122807,
+      "loss": 0.4483,
+      "step": 15070
+    },
+    {
+      "epoch": 24.05,
+      "grad_norm": 3.530329704284668,
+      "learning_rate": 0.0010379585326953749,
+      "loss": 0.4374,
+      "step": 15080
+    },
+    {
+      "epoch": 24.07,
+      "grad_norm": 1.2706865072250366,
+      "learning_rate": 0.0010373205741626794,
+      "loss": 0.3817,
+      "step": 15090
+    },
+    {
+      "epoch": 24.08,
+      "grad_norm": 0.9349244832992554,
+      "learning_rate": 0.001036682615629984,
+      "loss": 0.5057,
+      "step": 15100
+    },
+    {
+      "epoch": 24.1,
+      "grad_norm": 3.0068447589874268,
+      "learning_rate": 0.0010360446570972888,
+      "loss": 0.4718,
+      "step": 15110
+    },
+    {
+      "epoch": 24.11,
+      "grad_norm": 2.423353672027588,
+      "learning_rate": 0.0010354066985645933,
+      "loss": 0.4668,
+      "step": 15120
+    },
+    {
+      "epoch": 24.13,
+      "grad_norm": 1.5053311586380005,
+      "learning_rate": 0.0010347687400318979,
+      "loss": 0.4525,
+      "step": 15130
+    },
+    {
+      "epoch": 24.15,
+      "grad_norm": 1.660056710243225,
+      "learning_rate": 0.0010341307814992024,
+      "loss": 0.4734,
+      "step": 15140
+    },
+    {
+      "epoch": 24.16,
+      "grad_norm": 1.5876003503799438,
+      "learning_rate": 0.0010334928229665072,
+      "loss": 0.3713,
+      "step": 15150
+    },
+    {
+      "epoch": 24.18,
+      "grad_norm": 1.1910775899887085,
+      "learning_rate": 0.0010328548644338118,
+      "loss": 0.4542,
+      "step": 15160
+    },
+    {
+      "epoch": 24.19,
+      "grad_norm": 2.1305978298187256,
+      "learning_rate": 0.0010322169059011164,
+      "loss": 0.4496,
+      "step": 15170
+    },
+    {
+      "epoch": 24.21,
+      "grad_norm": 1.9429091215133667,
+      "learning_rate": 0.0010315789473684211,
+      "loss": 0.4965,
+      "step": 15180
+    },
+    {
+      "epoch": 24.23,
+      "grad_norm": 1.6149272918701172,
+      "learning_rate": 0.0010309409888357257,
+      "loss": 0.4369,
+      "step": 15190
+    },
+    {
+      "epoch": 24.24,
+      "grad_norm": 1.5995999574661255,
+      "learning_rate": 0.0010303030303030303,
+      "loss": 0.4579,
+      "step": 15200
+    },
+    {
+      "epoch": 24.26,
+      "grad_norm": 1.6771583557128906,
+      "learning_rate": 0.0010296650717703348,
+      "loss": 0.4289,
+      "step": 15210
+    },
+    {
+      "epoch": 24.27,
+      "grad_norm": 2.6575920581817627,
+      "learning_rate": 0.0010290271132376396,
+      "loss": 0.5622,
+      "step": 15220
+    },
+    {
+      "epoch": 24.29,
+      "grad_norm": 1.5423036813735962,
+      "learning_rate": 0.0010283891547049442,
+      "loss": 0.4558,
+      "step": 15230
+    },
+    {
+      "epoch": 24.31,
+      "grad_norm": 1.731204628944397,
+      "learning_rate": 0.0010277511961722487,
+      "loss": 0.4732,
+      "step": 15240
+    },
+    {
+      "epoch": 24.32,
+      "grad_norm": 2.5990333557128906,
+      "learning_rate": 0.0010271132376395535,
+      "loss": 0.5179,
+      "step": 15250
+    },
+    {
+      "epoch": 24.34,
+      "grad_norm": 1.5724194049835205,
+      "learning_rate": 0.001026475279106858,
+      "loss": 0.4896,
+      "step": 15260
+    },
+    {
+      "epoch": 24.35,
+      "grad_norm": 3.0556674003601074,
+      "learning_rate": 0.0010258373205741626,
+      "loss": 0.522,
+      "step": 15270
+    },
+    {
+      "epoch": 24.37,
+      "grad_norm": 2.603013515472412,
+      "learning_rate": 0.0010251993620414672,
+      "loss": 0.4626,
+      "step": 15280
+    },
+    {
+      "epoch": 24.39,
+      "grad_norm": 1.3041783571243286,
+      "learning_rate": 0.001024561403508772,
+      "loss": 0.3713,
+      "step": 15290
+    },
+    {
+      "epoch": 24.4,
+      "grad_norm": 1.5249779224395752,
+      "learning_rate": 0.0010239234449760765,
+      "loss": 0.4899,
+      "step": 15300
+    },
+    {
+      "epoch": 24.42,
+      "grad_norm": 1.814285159111023,
+      "learning_rate": 0.001023285486443381,
+      "loss": 0.534,
+      "step": 15310
+    },
+    {
+      "epoch": 24.43,
+      "grad_norm": 2.138099431991577,
+      "learning_rate": 0.0010226475279106859,
+      "loss": 0.4634,
+      "step": 15320
+    },
+    {
+      "epoch": 24.45,
+      "grad_norm": 1.3936606645584106,
+      "learning_rate": 0.0010220095693779904,
+      "loss": 0.4094,
+      "step": 15330
+    },
+    {
+      "epoch": 24.47,
+      "grad_norm": 1.609049677848816,
+      "learning_rate": 0.001021371610845295,
+      "loss": 0.4397,
+      "step": 15340
+    },
+    {
+      "epoch": 24.48,
+      "grad_norm": 1.38874351978302,
+      "learning_rate": 0.0010207336523125996,
+      "loss": 0.5197,
+      "step": 15350
+    },
+    {
+      "epoch": 24.5,
+      "grad_norm": 2.1596977710723877,
+      "learning_rate": 0.0010200956937799043,
+      "loss": 0.4791,
+      "step": 15360
+    },
+    {
+      "epoch": 24.51,
+      "grad_norm": 1.4566435813903809,
+      "learning_rate": 0.001019457735247209,
+      "loss": 0.4453,
+      "step": 15370
+    },
+    {
+      "epoch": 24.53,
+      "grad_norm": 1.784945011138916,
+      "learning_rate": 0.0010188197767145135,
+      "loss": 0.4974,
+      "step": 15380
+    },
+    {
+      "epoch": 24.55,
+      "grad_norm": 1.9153186082839966,
+      "learning_rate": 0.001018181818181818,
+      "loss": 0.4708,
+      "step": 15390
+    },
+    {
+      "epoch": 24.56,
+      "grad_norm": 1.7097647190093994,
+      "learning_rate": 0.0010175438596491228,
+      "loss": 0.4766,
+      "step": 15400
+    },
+    {
+      "epoch": 24.58,
+      "grad_norm": 1.6198031902313232,
+      "learning_rate": 0.0010169059011164274,
+      "loss": 0.4857,
+      "step": 15410
+    },
+    {
+      "epoch": 24.59,
+      "grad_norm": 2.2390496730804443,
+      "learning_rate": 0.001016267942583732,
+      "loss": 0.5341,
+      "step": 15420
+    },
+    {
+      "epoch": 24.61,
+      "grad_norm": 2.7094318866729736,
+      "learning_rate": 0.0010156299840510367,
+      "loss": 0.5316,
+      "step": 15430
+    },
+    {
+      "epoch": 24.63,
+      "grad_norm": 1.7831966876983643,
+      "learning_rate": 0.0010149920255183413,
+      "loss": 0.5638,
+      "step": 15440
+    },
+    {
+      "epoch": 24.64,
+      "grad_norm": 1.7682468891143799,
+      "learning_rate": 0.0010143540669856458,
+      "loss": 0.4232,
+      "step": 15450
+    },
+    {
+      "epoch": 24.66,
+      "grad_norm": 3.382634401321411,
+      "learning_rate": 0.0010137161084529504,
+      "loss": 0.5211,
+      "step": 15460
+    },
+    {
+      "epoch": 24.67,
+      "grad_norm": 1.6117042303085327,
+      "learning_rate": 0.0010130781499202552,
+      "loss": 0.5442,
+      "step": 15470
+    },
+    {
+      "epoch": 24.69,
+      "grad_norm": 2.2903084754943848,
+      "learning_rate": 0.0010124401913875597,
+      "loss": 0.5145,
+      "step": 15480
+    },
+    {
+      "epoch": 24.7,
+      "grad_norm": 1.3082456588745117,
+      "learning_rate": 0.0010118022328548643,
+      "loss": 0.5143,
+      "step": 15490
+    },
+    {
+      "epoch": 24.72,
+      "grad_norm": 1.9928056001663208,
+      "learning_rate": 0.001011164274322169,
+      "loss": 0.5222,
+      "step": 15500
+    },
+    {
+      "epoch": 24.74,
+      "grad_norm": 1.8907785415649414,
+      "learning_rate": 0.0010105263157894737,
+      "loss": 0.4507,
+      "step": 15510
+    },
+    {
+      "epoch": 24.75,
+      "grad_norm": 1.9465861320495605,
+      "learning_rate": 0.0010098883572567782,
+      "loss": 0.4636,
+      "step": 15520
+    },
+    {
+      "epoch": 24.77,
+      "grad_norm": 1.3651535511016846,
+      "learning_rate": 0.0010092503987240828,
+      "loss": 0.4484,
+      "step": 15530
+    },
+    {
+      "epoch": 24.78,
+      "grad_norm": 1.8174107074737549,
+      "learning_rate": 0.0010086124401913876,
+      "loss": 0.4479,
+      "step": 15540
+    },
+    {
+      "epoch": 24.8,
+      "grad_norm": 1.7005228996276855,
+      "learning_rate": 0.0010079744816586921,
+      "loss": 0.5416,
+      "step": 15550
+    },
+    {
+      "epoch": 24.82,
+      "grad_norm": 1.941279649734497,
+      "learning_rate": 0.0010073365231259967,
+      "loss": 0.5404,
+      "step": 15560
+    },
+    {
+      "epoch": 24.83,
+      "grad_norm": 2.1660587787628174,
+      "learning_rate": 0.0010066985645933015,
+      "loss": 0.5335,
+      "step": 15570
+    },
+    {
+      "epoch": 24.85,
+      "grad_norm": 2.4644267559051514,
+      "learning_rate": 0.001006060606060606,
+      "loss": 0.6635,
+      "step": 15580
+    },
+    {
+      "epoch": 24.86,
+      "grad_norm": 1.596439242362976,
+      "learning_rate": 0.0010054226475279106,
+      "loss": 0.4986,
+      "step": 15590
+    },
+    {
+      "epoch": 24.88,
+      "grad_norm": 1.4329978227615356,
+      "learning_rate": 0.0010047846889952152,
+      "loss": 0.4721,
+      "step": 15600
+    },
+    {
+      "epoch": 24.9,
+      "grad_norm": 1.454533338546753,
+      "learning_rate": 0.00100414673046252,
+      "loss": 0.4796,
+      "step": 15610
+    },
+    {
+      "epoch": 24.91,
+      "grad_norm": 2.5839779376983643,
+      "learning_rate": 0.0010035087719298245,
+      "loss": 0.5149,
+      "step": 15620
+    },
+    {
+      "epoch": 24.93,
+      "grad_norm": 2.21061110496521,
+      "learning_rate": 0.001002870813397129,
+      "loss": 0.5191,
+      "step": 15630
+    },
+    {
+      "epoch": 24.94,
+      "grad_norm": 1.6350433826446533,
+      "learning_rate": 0.0010022328548644338,
+      "loss": 0.4748,
+      "step": 15640
+    },
+    {
+      "epoch": 24.96,
+      "grad_norm": 1.638689637184143,
+      "learning_rate": 0.0010015948963317384,
+      "loss": 0.4639,
+      "step": 15650
+    },
+    {
+      "epoch": 24.98,
+      "grad_norm": 1.925967812538147,
+      "learning_rate": 0.001000956937799043,
+      "loss": 0.6168,
+      "step": 15660
+    },
+    {
+      "epoch": 24.99,
+      "grad_norm": 1.7674167156219482,
+      "learning_rate": 0.0010003189792663475,
+      "loss": 0.5219,
+      "step": 15670
+    },
+    {
+      "epoch": 25.01,
+      "grad_norm": 0.9835655689239502,
+      "learning_rate": 0.0009996810207336523,
+      "loss": 0.391,
+      "step": 15680
+    },
+    {
+      "epoch": 25.02,
+      "grad_norm": 1.7107539176940918,
+      "learning_rate": 0.000999043062200957,
+      "loss": 0.3556,
+      "step": 15690
+    },
+    {
+      "epoch": 25.04,
+      "grad_norm": 2.7259128093719482,
+      "learning_rate": 0.0009984051036682616,
+      "loss": 0.5067,
+      "step": 15700
+    },
+    {
+      "epoch": 25.06,
+      "grad_norm": 1.4780336618423462,
+      "learning_rate": 0.0009977671451355662,
+      "loss": 0.4045,
+      "step": 15710
+    },
+    {
+      "epoch": 25.07,
+      "grad_norm": 1.699403166770935,
+      "learning_rate": 0.000997129186602871,
+      "loss": 0.4629,
+      "step": 15720
+    },
+    {
+      "epoch": 25.09,
+      "grad_norm": 2.0610368251800537,
+      "learning_rate": 0.0009964912280701755,
+      "loss": 0.4457,
+      "step": 15730
+    },
+    {
+      "epoch": 25.1,
+      "grad_norm": 1.1959340572357178,
+      "learning_rate": 0.0009958532695374801,
+      "loss": 0.435,
+      "step": 15740
+    },
+    {
+      "epoch": 25.12,
+      "grad_norm": 2.4365720748901367,
+      "learning_rate": 0.0009952153110047847,
+      "loss": 0.444,
+      "step": 15750
+    },
+    {
+      "epoch": 25.14,
+      "grad_norm": 1.2574375867843628,
+      "learning_rate": 0.0009945773524720895,
+      "loss": 0.4484,
+      "step": 15760
+    },
+    {
+      "epoch": 25.15,
+      "grad_norm": 3.0744266510009766,
+      "learning_rate": 0.000993939393939394,
+      "loss": 0.5128,
+      "step": 15770
+    },
+    {
+      "epoch": 25.17,
+      "grad_norm": 1.3673443794250488,
+      "learning_rate": 0.0009933014354066986,
+      "loss": 0.528,
+      "step": 15780
+    },
+    {
+      "epoch": 25.18,
+      "grad_norm": 1.0166288614273071,
+      "learning_rate": 0.0009926634768740031,
+      "loss": 0.4576,
+      "step": 15790
+    },
+    {
+      "epoch": 25.2,
+      "grad_norm": 2.5745012760162354,
+      "learning_rate": 0.000992025518341308,
+      "loss": 0.4262,
+      "step": 15800
+    },
+    {
+      "epoch": 25.22,
+      "grad_norm": 1.265143871307373,
+      "learning_rate": 0.0009913875598086125,
+      "loss": 0.4196,
+      "step": 15810
+    },
+    {
+      "epoch": 25.23,
+      "grad_norm": 2.3100552558898926,
+      "learning_rate": 0.000990749601275917,
+      "loss": 0.5067,
+      "step": 15820
+    },
+    {
+      "epoch": 25.25,
+      "grad_norm": 1.1458524465560913,
+      "learning_rate": 0.0009901116427432218,
+      "loss": 0.4197,
+      "step": 15830
+    },
+    {
+      "epoch": 25.26,
+      "grad_norm": 1.4825867414474487,
+      "learning_rate": 0.0009894736842105264,
+      "loss": 0.4208,
+      "step": 15840
+    },
+    {
+      "epoch": 25.28,
+      "grad_norm": 2.505919933319092,
+      "learning_rate": 0.000988835725677831,
+      "loss": 0.5082,
+      "step": 15850
+    },
+    {
+      "epoch": 25.3,
+      "grad_norm": 1.547998070716858,
+      "learning_rate": 0.0009881977671451355,
+      "loss": 0.4517,
+      "step": 15860
+    },
+    {
+      "epoch": 25.31,
+      "grad_norm": 1.6311086416244507,
+      "learning_rate": 0.0009875598086124403,
+      "loss": 0.4928,
+      "step": 15870
+    },
+    {
+      "epoch": 25.33,
+      "grad_norm": 1.7544368505477905,
+      "learning_rate": 0.0009869218500797449,
+      "loss": 0.4175,
+      "step": 15880
+    },
+    {
+      "epoch": 25.34,
+      "grad_norm": 1.1133722066879272,
+      "learning_rate": 0.0009862838915470494,
+      "loss": 0.4709,
+      "step": 15890
+    },
+    {
+      "epoch": 25.36,
+      "grad_norm": 1.8425043821334839,
+      "learning_rate": 0.0009856459330143542,
+      "loss": 0.425,
+      "step": 15900
+    },
+    {
+      "epoch": 25.37,
+      "grad_norm": 1.6408649682998657,
+      "learning_rate": 0.0009850079744816588,
+      "loss": 0.4856,
+      "step": 15910
+    },
+    {
+      "epoch": 25.39,
+      "grad_norm": 2.6448709964752197,
+      "learning_rate": 0.0009843700159489633,
+      "loss": 0.5066,
+      "step": 15920
+    },
+    {
+      "epoch": 25.41,
+      "grad_norm": 3.6012330055236816,
+      "learning_rate": 0.0009837320574162679,
+      "loss": 0.4534,
+      "step": 15930
+    },
+    {
+      "epoch": 25.42,
+      "grad_norm": 3.10849666595459,
+      "learning_rate": 0.0009830940988835727,
+      "loss": 0.458,
+      "step": 15940
+    },
+    {
+      "epoch": 25.44,
+      "grad_norm": 1.3097262382507324,
+      "learning_rate": 0.0009824561403508772,
+      "loss": 0.4695,
+      "step": 15950
+    },
+    {
+      "epoch": 25.45,
+      "grad_norm": 1.4666467905044556,
+      "learning_rate": 0.0009818181818181818,
+      "loss": 0.4955,
+      "step": 15960
+    },
+    {
+      "epoch": 25.47,
+      "grad_norm": 2.279972791671753,
+      "learning_rate": 0.0009811802232854866,
+      "loss": 0.4531,
+      "step": 15970
+    },
+    {
+      "epoch": 25.49,
+      "grad_norm": 1.8388824462890625,
+      "learning_rate": 0.0009805422647527911,
+      "loss": 0.5209,
+      "step": 15980
+    },
+    {
+      "epoch": 25.5,
+      "grad_norm": 1.2906782627105713,
+      "learning_rate": 0.0009799043062200957,
+      "loss": 0.5712,
+      "step": 15990
+    },
+    {
+      "epoch": 25.52,
+      "grad_norm": 1.1561537981033325,
+      "learning_rate": 0.0009792663476874003,
+      "loss": 0.4498,
+      "step": 16000
+    },
+    {
+      "epoch": 25.53,
+      "grad_norm": 0.9394503831863403,
+      "learning_rate": 0.000978628389154705,
+      "loss": 0.4817,
+      "step": 16010
+    },
+    {
+      "epoch": 25.55,
+      "grad_norm": 1.3297114372253418,
+      "learning_rate": 0.0009779904306220096,
+      "loss": 0.4238,
+      "step": 16020
+    },
+    {
+      "epoch": 25.57,
+      "grad_norm": 1.832533597946167,
+      "learning_rate": 0.0009773524720893142,
+      "loss": 0.4492,
+      "step": 16030
+    },
+    {
+      "epoch": 25.58,
+      "grad_norm": 1.8517677783966064,
+      "learning_rate": 0.000976714513556619,
+      "loss": 0.4209,
+      "step": 16040
+    },
+    {
+      "epoch": 25.6,
+      "grad_norm": 1.57618248462677,
+      "learning_rate": 0.0009760765550239234,
+      "loss": 0.4237,
+      "step": 16050
+    },
+    {
+      "epoch": 25.61,
+      "grad_norm": 2.6795618534088135,
+      "learning_rate": 0.0009754385964912282,
+      "loss": 0.4727,
+      "step": 16060
+    },
+    {
+      "epoch": 25.63,
+      "grad_norm": 1.7043702602386475,
+      "learning_rate": 0.0009748006379585327,
+      "loss": 0.5071,
+      "step": 16070
+    },
+    {
+      "epoch": 25.65,
+      "grad_norm": 2.142303228378296,
+      "learning_rate": 0.0009741626794258374,
+      "loss": 0.5109,
+      "step": 16080
+    },
+    {
+      "epoch": 25.66,
+      "grad_norm": 1.5886117219924927,
+      "learning_rate": 0.000973524720893142,
+      "loss": 0.4755,
+      "step": 16090
+    },
+    {
+      "epoch": 25.68,
+      "grad_norm": 1.767467975616455,
+      "learning_rate": 0.0009728867623604466,
+      "loss": 0.455,
+      "step": 16100
+    },
+    {
+      "epoch": 25.69,
+      "grad_norm": 1.6067595481872559,
+      "learning_rate": 0.0009722488038277513,
+      "loss": 0.4701,
+      "step": 16110
+    },
+    {
+      "epoch": 25.71,
+      "grad_norm": 1.5716664791107178,
+      "learning_rate": 0.0009716108452950559,
+      "loss": 0.4849,
+      "step": 16120
+    },
+    {
+      "epoch": 25.73,
+      "grad_norm": 1.2191548347473145,
+      "learning_rate": 0.0009709728867623605,
+      "loss": 0.4129,
+      "step": 16130
+    },
+    {
+      "epoch": 25.74,
+      "grad_norm": 1.7351710796356201,
+      "learning_rate": 0.0009703349282296651,
+      "loss": 0.5078,
+      "step": 16140
+    },
+    {
+      "epoch": 25.76,
+      "grad_norm": 1.1435052156448364,
+      "learning_rate": 0.0009696969696969698,
+      "loss": 0.4075,
+      "step": 16150
+    },
+    {
+      "epoch": 25.77,
+      "grad_norm": 2.094747304916382,
+      "learning_rate": 0.0009690590111642743,
+      "loss": 0.3737,
+      "step": 16160
+    },
+    {
+      "epoch": 25.79,
+      "grad_norm": 2.186330556869507,
+      "learning_rate": 0.000968421052631579,
+      "loss": 0.4566,
+      "step": 16170
+    },
+    {
+      "epoch": 25.81,
+      "grad_norm": 2.0006825923919678,
+      "learning_rate": 0.0009677830940988836,
+      "loss": 0.458,
+      "step": 16180
+    },
+    {
+      "epoch": 25.82,
+      "grad_norm": 1.7449229955673218,
+      "learning_rate": 0.0009671451355661883,
+      "loss": 0.4487,
+      "step": 16190
+    },
+    {
+      "epoch": 25.84,
+      "grad_norm": 1.6336495876312256,
+      "learning_rate": 0.0009665071770334929,
+      "loss": 0.4743,
+      "step": 16200
+    },
+    {
+      "epoch": 25.85,
+      "grad_norm": 2.408162832260132,
+      "learning_rate": 0.0009658692185007975,
+      "loss": 0.5951,
+      "step": 16210
+    },
+    {
+      "epoch": 25.87,
+      "grad_norm": 1.5623067617416382,
+      "learning_rate": 0.0009652312599681022,
+      "loss": 0.4353,
+      "step": 16220
+    },
+    {
+      "epoch": 25.89,
+      "grad_norm": 1.187019944190979,
+      "learning_rate": 0.0009645933014354067,
+      "loss": 0.4622,
+      "step": 16230
+    },
+    {
+      "epoch": 25.9,
+      "grad_norm": 1.6125158071517944,
+      "learning_rate": 0.0009639553429027114,
+      "loss": 0.4263,
+      "step": 16240
+    },
+    {
+      "epoch": 25.92,
+      "grad_norm": 2.758575677871704,
+      "learning_rate": 0.000963317384370016,
+      "loss": 0.4074,
+      "step": 16250
+    },
+    {
+      "epoch": 25.93,
+      "grad_norm": 1.477206826210022,
+      "learning_rate": 0.0009626794258373206,
+      "loss": 0.5476,
+      "step": 16260
+    },
+    {
+      "epoch": 25.95,
+      "grad_norm": 2.584649085998535,
+      "learning_rate": 0.0009620414673046253,
+      "loss": 0.4626,
+      "step": 16270
+    },
+    {
+      "epoch": 25.96,
+      "grad_norm": 1.4972593784332275,
+      "learning_rate": 0.0009614035087719299,
+      "loss": 0.4884,
+      "step": 16280
+    },
+    {
+      "epoch": 25.98,
+      "grad_norm": 1.7186070680618286,
+      "learning_rate": 0.0009607655502392345,
+      "loss": 0.4263,
+      "step": 16290
+    },
+    {
+      "epoch": 26.0,
+      "grad_norm": 2.3209738731384277,
+      "learning_rate": 0.0009601275917065391,
+      "loss": 0.5385,
+      "step": 16300
+    },
+    {
+      "epoch": 26.01,
+      "grad_norm": 1.8050909042358398,
+      "learning_rate": 0.0009594896331738438,
+      "loss": 0.4111,
+      "step": 16310
+    },
+    {
+      "epoch": 26.03,
+      "grad_norm": 1.729257345199585,
+      "learning_rate": 0.0009588516746411483,
+      "loss": 0.3666,
+      "step": 16320
+    },
+    {
+      "epoch": 26.04,
+      "grad_norm": 2.2084038257598877,
+      "learning_rate": 0.000958213716108453,
+      "loss": 0.3862,
+      "step": 16330
+    },
+    {
+      "epoch": 26.06,
+      "grad_norm": 1.1707019805908203,
+      "learning_rate": 0.0009575757575757576,
+      "loss": 0.4629,
+      "step": 16340
+    },
+    {
+      "epoch": 26.08,
+      "grad_norm": 3.4062771797180176,
+      "learning_rate": 0.0009569377990430622,
+      "loss": 0.3774,
+      "step": 16350
+    },
+    {
+      "epoch": 26.09,
+      "grad_norm": 1.5490500926971436,
+      "learning_rate": 0.0009562998405103669,
+      "loss": 0.4169,
+      "step": 16360
+    },
+    {
+      "epoch": 26.11,
+      "grad_norm": 1.3803966045379639,
+      "learning_rate": 0.0009556618819776715,
+      "loss": 0.4544,
+      "step": 16370
+    },
+    {
+      "epoch": 26.12,
+      "grad_norm": 1.512718915939331,
+      "learning_rate": 0.0009550239234449761,
+      "loss": 0.4163,
+      "step": 16380
+    },
+    {
+      "epoch": 26.14,
+      "grad_norm": 1.462695837020874,
+      "learning_rate": 0.0009543859649122807,
+      "loss": 0.4556,
+      "step": 16390
+    },
+    {
+      "epoch": 26.16,
+      "grad_norm": 1.237164855003357,
+      "learning_rate": 0.0009537480063795854,
+      "loss": 0.3972,
+      "step": 16400
+    },
+    {
+      "epoch": 26.17,
+      "grad_norm": 1.3175599575042725,
+      "learning_rate": 0.0009531100478468899,
+      "loss": 0.4673,
+      "step": 16410
+    },
+    {
+      "epoch": 26.19,
+      "grad_norm": 1.3138153553009033,
+      "learning_rate": 0.0009524720893141946,
+      "loss": 0.3655,
+      "step": 16420
+    },
+    {
+      "epoch": 26.2,
+      "grad_norm": 1.4665474891662598,
+      "learning_rate": 0.0009518341307814993,
+      "loss": 0.4013,
+      "step": 16430
+    },
+    {
+      "epoch": 26.22,
+      "grad_norm": 1.0699955224990845,
+      "learning_rate": 0.0009511961722488038,
+      "loss": 0.4483,
+      "step": 16440
+    },
+    {
+      "epoch": 26.24,
+      "grad_norm": 1.7993961572647095,
+      "learning_rate": 0.0009505582137161085,
+      "loss": 0.5006,
+      "step": 16450
+    },
+    {
+      "epoch": 26.25,
+      "grad_norm": 2.075788736343384,
+      "learning_rate": 0.0009499202551834131,
+      "loss": 0.4183,
+      "step": 16460
+    },
+    {
+      "epoch": 26.27,
+      "grad_norm": 1.428053379058838,
+      "learning_rate": 0.0009492822966507177,
+      "loss": 0.4499,
+      "step": 16470
+    },
+    {
+      "epoch": 26.28,
+      "grad_norm": 0.9592264294624329,
+      "learning_rate": 0.0009486443381180223,
+      "loss": 0.4191,
+      "step": 16480
+    },
+    {
+      "epoch": 26.3,
+      "grad_norm": 1.708006739616394,
+      "learning_rate": 0.000948006379585327,
+      "loss": 0.4259,
+      "step": 16490
+    },
+    {
+      "epoch": 26.32,
+      "grad_norm": 2.1585805416107178,
+      "learning_rate": 0.0009473684210526315,
+      "loss": 0.5309,
+      "step": 16500
+    },
+    {
+      "epoch": 26.33,
+      "grad_norm": 1.7619798183441162,
+      "learning_rate": 0.0009467304625199362,
+      "loss": 0.4947,
+      "step": 16510
+    },
+    {
+      "epoch": 26.35,
+      "grad_norm": 2.408426523208618,
+      "learning_rate": 0.0009460925039872409,
+      "loss": 0.4576,
+      "step": 16520
+    },
+    {
+      "epoch": 26.36,
+      "grad_norm": 1.5698516368865967,
+      "learning_rate": 0.0009454545454545454,
+      "loss": 0.4161,
+      "step": 16530
+    },
+    {
+      "epoch": 26.38,
+      "grad_norm": 3.033655881881714,
+      "learning_rate": 0.0009448165869218501,
+      "loss": 0.5235,
+      "step": 16540
+    },
+    {
+      "epoch": 26.4,
+      "grad_norm": 2.2422995567321777,
+      "learning_rate": 0.0009441786283891547,
+      "loss": 0.5035,
+      "step": 16550
+    },
+    {
+      "epoch": 26.41,
+      "grad_norm": 2.3441011905670166,
+      "learning_rate": 0.0009435406698564593,
+      "loss": 0.4001,
+      "step": 16560
+    },
+    {
+      "epoch": 26.43,
+      "grad_norm": 1.529283881187439,
+      "learning_rate": 0.0009429027113237639,
+      "loss": 0.3795,
+      "step": 16570
+    },
+    {
+      "epoch": 26.44,
+      "grad_norm": 1.2047476768493652,
+      "learning_rate": 0.0009422647527910686,
+      "loss": 0.4504,
+      "step": 16580
+    },
+    {
+      "epoch": 26.46,
+      "grad_norm": 2.407144069671631,
+      "learning_rate": 0.0009416267942583733,
+      "loss": 0.3977,
+      "step": 16590
+    },
+    {
+      "epoch": 26.48,
+      "grad_norm": 1.3065524101257324,
+      "learning_rate": 0.0009409888357256778,
+      "loss": 0.4534,
+      "step": 16600
+    },
+    {
+      "epoch": 26.49,
+      "grad_norm": 2.119401693344116,
+      "learning_rate": 0.0009403508771929825,
+      "loss": 0.513,
+      "step": 16610
+    },
+    {
+      "epoch": 26.51,
+      "grad_norm": 1.1828601360321045,
+      "learning_rate": 0.000939712918660287,
+      "loss": 0.4113,
+      "step": 16620
+    },
+    {
+      "epoch": 26.52,
+      "grad_norm": 3.7420921325683594,
+      "learning_rate": 0.0009390749601275917,
+      "loss": 0.4948,
+      "step": 16630
+    },
+    {
+      "epoch": 26.54,
+      "grad_norm": 1.524720549583435,
+      "learning_rate": 0.0009384370015948963,
+      "loss": 0.4233,
+      "step": 16640
+    },
+    {
+      "epoch": 26.56,
+      "grad_norm": 2.136596918106079,
+      "learning_rate": 0.000937799043062201,
+      "loss": 0.472,
+      "step": 16650
+    },
+    {
+      "epoch": 26.57,
+      "grad_norm": 2.395744800567627,
+      "learning_rate": 0.0009371610845295055,
+      "loss": 0.4629,
+      "step": 16660
+    },
+    {
+      "epoch": 26.59,
+      "grad_norm": 1.2973766326904297,
+      "learning_rate": 0.0009365231259968102,
+      "loss": 0.4332,
+      "step": 16670
+    },
+    {
+      "epoch": 26.6,
+      "grad_norm": 2.164285659790039,
+      "learning_rate": 0.0009358851674641149,
+      "loss": 0.4451,
+      "step": 16680
+    },
+    {
+      "epoch": 26.62,
+      "grad_norm": 1.284764051437378,
+      "learning_rate": 0.0009352472089314194,
+      "loss": 0.5361,
+      "step": 16690
+    },
+    {
+      "epoch": 26.63,
+      "grad_norm": 1.187538743019104,
+      "learning_rate": 0.0009346092503987241,
+      "loss": 0.4414,
+      "step": 16700
+    },
+    {
+      "epoch": 26.65,
+      "grad_norm": 2.0321905612945557,
+      "learning_rate": 0.0009339712918660287,
+      "loss": 0.3823,
+      "step": 16710
+    },
+    {
+      "epoch": 26.67,
+      "grad_norm": 2.023181676864624,
+      "learning_rate": 0.0009333333333333333,
+      "loss": 0.5097,
+      "step": 16720
+    },
+    {
+      "epoch": 26.68,
+      "grad_norm": 1.6229287385940552,
+      "learning_rate": 0.0009326953748006379,
+      "loss": 0.4747,
+      "step": 16730
+    },
+    {
+      "epoch": 26.7,
+      "grad_norm": 1.848752498626709,
+      "learning_rate": 0.0009320574162679426,
+      "loss": 0.4416,
+      "step": 16740
+    },
+    {
+      "epoch": 26.71,
+      "grad_norm": 1.674248218536377,
+      "learning_rate": 0.0009314194577352472,
+      "loss": 0.5362,
+      "step": 16750
+    },
+    {
+      "epoch": 26.73,
+      "grad_norm": 0.7888638973236084,
+      "learning_rate": 0.0009307814992025518,
+      "loss": 0.4482,
+      "step": 16760
+    },
+    {
+      "epoch": 26.75,
+      "grad_norm": 1.2110415697097778,
+      "learning_rate": 0.0009301435406698565,
+      "loss": 0.4771,
+      "step": 16770
+    },
+    {
+      "epoch": 26.76,
+      "grad_norm": 2.884260654449463,
+      "learning_rate": 0.000929505582137161,
+      "loss": 0.44,
+      "step": 16780
+    },
+    {
+      "epoch": 26.78,
+      "grad_norm": 1.4633077383041382,
+      "learning_rate": 0.0009288676236044657,
+      "loss": 0.4565,
+      "step": 16790
+    },
+    {
+      "epoch": 26.79,
+      "grad_norm": 1.6688116788864136,
+      "learning_rate": 0.0009282296650717703,
+      "loss": 0.4524,
+      "step": 16800
+    },
+    {
+      "epoch": 26.81,
+      "grad_norm": 1.4576424360275269,
+      "learning_rate": 0.000927591706539075,
+      "loss": 0.4253,
+      "step": 16810
+    },
+    {
+      "epoch": 26.83,
+      "grad_norm": 1.47834312915802,
+      "learning_rate": 0.0009269537480063796,
+      "loss": 0.4857,
+      "step": 16820
+    },
+    {
+      "epoch": 26.84,
+      "grad_norm": 1.2933154106140137,
+      "learning_rate": 0.0009263157894736843,
+      "loss": 0.4483,
+      "step": 16830
+    },
+    {
+      "epoch": 26.86,
+      "grad_norm": 2.671135663986206,
+      "learning_rate": 0.0009256778309409889,
+      "loss": 0.4321,
+      "step": 16840
+    },
+    {
+      "epoch": 26.87,
+      "grad_norm": 0.900836169719696,
+      "learning_rate": 0.0009250398724082935,
+      "loss": 0.4142,
+      "step": 16850
+    },
+    {
+      "epoch": 26.89,
+      "grad_norm": 1.467921257019043,
+      "learning_rate": 0.0009244019138755982,
+      "loss": 0.4381,
+      "step": 16860
+    },
+    {
+      "epoch": 26.91,
+      "grad_norm": 1.2465593814849854,
+      "learning_rate": 0.0009237639553429027,
+      "loss": 0.4075,
+      "step": 16870
+    },
+    {
+      "epoch": 26.92,
+      "grad_norm": 1.7828130722045898,
+      "learning_rate": 0.0009231259968102074,
+      "loss": 0.4855,
+      "step": 16880
+    },
+    {
+      "epoch": 26.94,
+      "grad_norm": 2.368098735809326,
+      "learning_rate": 0.000922488038277512,
+      "loss": 0.5569,
+      "step": 16890
+    },
+    {
+      "epoch": 26.95,
+      "grad_norm": 1.8269487619400024,
+      "learning_rate": 0.0009218500797448166,
+      "loss": 0.5397,
+      "step": 16900
+    },
+    {
+      "epoch": 26.97,
+      "grad_norm": 1.5869868993759155,
+      "learning_rate": 0.0009212121212121213,
+      "loss": 0.447,
+      "step": 16910
+    },
+    {
+      "epoch": 26.99,
+      "grad_norm": 2.1379966735839844,
+      "learning_rate": 0.0009205741626794259,
+      "loss": 0.3943,
+      "step": 16920
+    },
+    {
+      "epoch": 27.0,
+      "grad_norm": 0.7937178611755371,
+      "learning_rate": 0.0009199362041467305,
+      "loss": 0.4762,
+      "step": 16930
+    },
+    {
+      "epoch": 27.02,
+      "grad_norm": 0.7434051036834717,
+      "learning_rate": 0.0009192982456140351,
+      "loss": 0.3192,
+      "step": 16940
+    },
+    {
+      "epoch": 27.03,
+      "grad_norm": 1.9126826524734497,
+      "learning_rate": 0.0009186602870813398,
+      "loss": 0.3383,
+      "step": 16950
+    },
+    {
+      "epoch": 27.05,
+      "grad_norm": 0.8884724378585815,
+      "learning_rate": 0.0009180223285486443,
+      "loss": 0.335,
+      "step": 16960
+    },
+    {
+      "epoch": 27.07,
+      "grad_norm": 0.7101998329162598,
+      "learning_rate": 0.000917384370015949,
+      "loss": 0.355,
+      "step": 16970
+    },
+    {
+      "epoch": 27.08,
+      "grad_norm": 1.6486220359802246,
+      "learning_rate": 0.0009167464114832537,
+      "loss": 0.4237,
+      "step": 16980
+    },
+    {
+      "epoch": 27.1,
+      "grad_norm": 1.5720986127853394,
+      "learning_rate": 0.0009161084529505583,
+      "loss": 0.3649,
+      "step": 16990
+    },
+    {
+      "epoch": 27.11,
+      "grad_norm": 1.331430196762085,
+      "learning_rate": 0.0009154704944178629,
+      "loss": 0.4673,
+      "step": 17000
+    },
+    {
+      "epoch": 27.13,
+      "grad_norm": 1.1665971279144287,
+      "learning_rate": 0.0009148325358851675,
+      "loss": 0.3722,
+      "step": 17010
+    },
+    {
+      "epoch": 27.15,
+      "grad_norm": 0.784977376461029,
+      "learning_rate": 0.0009141945773524722,
+      "loss": 0.3737,
+      "step": 17020
+    },
+    {
+      "epoch": 27.16,
+      "grad_norm": 1.2977066040039062,
+      "learning_rate": 0.0009135566188197767,
+      "loss": 0.4048,
+      "step": 17030
+    },
+    {
+      "epoch": 27.18,
+      "grad_norm": 0.9560506343841553,
+      "learning_rate": 0.0009129186602870814,
+      "loss": 0.4475,
+      "step": 17040
+    },
+    {
+      "epoch": 27.19,
+      "grad_norm": 1.2674915790557861,
+      "learning_rate": 0.000912280701754386,
+      "loss": 0.4099,
+      "step": 17050
+    },
+    {
+      "epoch": 27.21,
+      "grad_norm": 1.7103983163833618,
+      "learning_rate": 0.0009116427432216906,
+      "loss": 0.3881,
+      "step": 17060
+    },
+    {
+      "epoch": 27.22,
+      "grad_norm": 1.464312195777893,
+      "learning_rate": 0.0009110047846889953,
+      "loss": 0.3751,
+      "step": 17070
+    },
+    {
+      "epoch": 27.24,
+      "grad_norm": 1.2396901845932007,
+      "learning_rate": 0.0009103668261562999,
+      "loss": 0.3781,
+      "step": 17080
+    },
+    {
+      "epoch": 27.26,
+      "grad_norm": 1.704807162284851,
+      "learning_rate": 0.0009097288676236045,
+      "loss": 0.3462,
+      "step": 17090
+    },
+    {
+      "epoch": 27.27,
+      "grad_norm": 1.6080540418624878,
+      "learning_rate": 0.0009090909090909091,
+      "loss": 0.4836,
+      "step": 17100
+    },
+    {
+      "epoch": 27.29,
+      "grad_norm": 2.0571115016937256,
+      "learning_rate": 0.0009084529505582138,
+      "loss": 0.424,
+      "step": 17110
+    },
+    {
+      "epoch": 27.3,
+      "grad_norm": 1.9051276445388794,
+      "learning_rate": 0.0009078149920255183,
+      "loss": 0.4107,
+      "step": 17120
+    },
+    {
+      "epoch": 27.32,
+      "grad_norm": 1.0614899396896362,
+      "learning_rate": 0.000907177033492823,
+      "loss": 0.4258,
+      "step": 17130
+    },
+    {
+      "epoch": 27.34,
+      "grad_norm": 0.771443247795105,
+      "learning_rate": 0.0009065390749601277,
+      "loss": 0.4893,
+      "step": 17140
+    },
+    {
+      "epoch": 27.35,
+      "grad_norm": 1.2591562271118164,
+      "learning_rate": 0.0009059011164274322,
+      "loss": 0.3847,
+      "step": 17150
+    },
+    {
+      "epoch": 27.37,
+      "grad_norm": 2.423963785171509,
+      "learning_rate": 0.0009052631578947369,
+      "loss": 0.4034,
+      "step": 17160
+    },
+    {
+      "epoch": 27.38,
+      "grad_norm": 1.5385760068893433,
+      "learning_rate": 0.0009046251993620415,
+      "loss": 0.4347,
+      "step": 17170
+    },
+    {
+      "epoch": 27.4,
+      "grad_norm": 1.886620044708252,
+      "learning_rate": 0.0009039872408293461,
+      "loss": 0.5058,
+      "step": 17180
+    },
+    {
+      "epoch": 27.42,
+      "grad_norm": 1.3259475231170654,
+      "learning_rate": 0.0009033492822966507,
+      "loss": 0.4189,
+      "step": 17190
+    },
+    {
+      "epoch": 27.43,
+      "grad_norm": 2.392594814300537,
+      "learning_rate": 0.0009027113237639554,
+      "loss": 0.3627,
+      "step": 17200
+    },
+    {
+      "epoch": 27.45,
+      "grad_norm": 2.2240548133850098,
+      "learning_rate": 0.0009020733652312599,
+      "loss": 0.4367,
+      "step": 17210
+    },
+    {
+      "epoch": 27.46,
+      "grad_norm": 1.4467096328735352,
+      "learning_rate": 0.0009014354066985646,
+      "loss": 0.4677,
+      "step": 17220
+    },
+    {
+      "epoch": 27.48,
+      "grad_norm": 1.5662921667099,
+      "learning_rate": 0.0009007974481658693,
+      "loss": 0.508,
+      "step": 17230
+    },
+    {
+      "epoch": 27.5,
+      "grad_norm": 1.6414707899093628,
+      "learning_rate": 0.0009001594896331738,
+      "loss": 0.3679,
+      "step": 17240
+    },
+    {
+      "epoch": 27.51,
+      "grad_norm": 1.0565104484558105,
+      "learning_rate": 0.0008995215311004785,
+      "loss": 0.3557,
+      "step": 17250
+    },
+    {
+      "epoch": 27.53,
+      "grad_norm": 1.353499174118042,
+      "learning_rate": 0.0008988835725677831,
+      "loss": 0.4911,
+      "step": 17260
+    },
+    {
+      "epoch": 27.54,
+      "grad_norm": 3.16988205909729,
+      "learning_rate": 0.0008982456140350877,
+      "loss": 0.5339,
+      "step": 17270
+    },
+    {
+      "epoch": 27.56,
+      "grad_norm": 2.6091325283050537,
+      "learning_rate": 0.0008976076555023923,
+      "loss": 0.5084,
+      "step": 17280
+    },
+    {
+      "epoch": 27.58,
+      "grad_norm": 1.3425127267837524,
+      "learning_rate": 0.000896969696969697,
+      "loss": 0.4548,
+      "step": 17290
+    },
+    {
+      "epoch": 27.59,
+      "grad_norm": 1.9189682006835938,
+      "learning_rate": 0.0008963317384370016,
+      "loss": 0.4727,
+      "step": 17300
+    },
+    {
+      "epoch": 27.61,
+      "grad_norm": 3.497046709060669,
+      "learning_rate": 0.0008956937799043062,
+      "loss": 0.5446,
+      "step": 17310
+    },
+    {
+      "epoch": 27.62,
+      "grad_norm": 1.4161769151687622,
+      "learning_rate": 0.0008950558213716109,
+      "loss": 0.362,
+      "step": 17320
+    },
+    {
+      "epoch": 27.64,
+      "grad_norm": 1.7099406719207764,
+      "learning_rate": 0.0008944178628389154,
+      "loss": 0.3934,
+      "step": 17330
+    },
+    {
+      "epoch": 27.66,
+      "grad_norm": 2.129094362258911,
+      "learning_rate": 0.0008937799043062201,
+      "loss": 0.4169,
+      "step": 17340
+    },
+    {
+      "epoch": 27.67,
+      "grad_norm": 1.5544568300247192,
+      "learning_rate": 0.0008931419457735247,
+      "loss": 0.4201,
+      "step": 17350
+    },
+    {
+      "epoch": 27.69,
+      "grad_norm": 1.1022940874099731,
+      "learning_rate": 0.0008925039872408293,
+      "loss": 0.4958,
+      "step": 17360
+    },
+    {
+      "epoch": 27.7,
+      "grad_norm": 1.771380066871643,
+      "learning_rate": 0.0008918660287081339,
+      "loss": 0.3859,
+      "step": 17370
+    },
+    {
+      "epoch": 27.72,
+      "grad_norm": 2.647625207901001,
+      "learning_rate": 0.0008912280701754386,
+      "loss": 0.4444,
+      "step": 17380
+    },
+    {
+      "epoch": 27.74,
+      "grad_norm": 1.4977500438690186,
+      "learning_rate": 0.0008905901116427433,
+      "loss": 0.4672,
+      "step": 17390
+    },
+    {
+      "epoch": 27.75,
+      "grad_norm": 1.1140875816345215,
+      "learning_rate": 0.0008899521531100478,
+      "loss": 0.4828,
+      "step": 17400
+    },
+    {
+      "epoch": 27.77,
+      "grad_norm": 2.1960301399230957,
+      "learning_rate": 0.0008893141945773525,
+      "loss": 0.5277,
+      "step": 17410
+    },
+    {
+      "epoch": 27.78,
+      "grad_norm": 1.2357120513916016,
+      "learning_rate": 0.000888676236044657,
+      "loss": 0.3831,
+      "step": 17420
+    },
+    {
+      "epoch": 27.8,
+      "grad_norm": 2.183209180831909,
+      "learning_rate": 0.0008880382775119617,
+      "loss": 0.4646,
+      "step": 17430
+    },
+    {
+      "epoch": 27.81,
+      "grad_norm": 1.4991573095321655,
+      "learning_rate": 0.0008874003189792663,
+      "loss": 0.4445,
+      "step": 17440
+    },
+    {
+      "epoch": 27.83,
+      "grad_norm": 2.547933340072632,
+      "learning_rate": 0.000886762360446571,
+      "loss": 0.609,
+      "step": 17450
+    },
+    {
+      "epoch": 27.85,
+      "grad_norm": 1.751570224761963,
+      "learning_rate": 0.0008861244019138756,
+      "loss": 0.4723,
+      "step": 17460
+    },
+    {
+      "epoch": 27.86,
+      "grad_norm": 1.5204071998596191,
+      "learning_rate": 0.0008854864433811802,
+      "loss": 0.3944,
+      "step": 17470
+    },
+    {
+      "epoch": 27.88,
+      "grad_norm": 0.8941110372543335,
+      "learning_rate": 0.0008848484848484849,
+      "loss": 0.4384,
+      "step": 17480
+    },
+    {
+      "epoch": 27.89,
+      "grad_norm": 3.1257965564727783,
+      "learning_rate": 0.0008842105263157894,
+      "loss": 0.403,
+      "step": 17490
+    },
+    {
+      "epoch": 27.91,
+      "grad_norm": 1.1965994834899902,
+      "learning_rate": 0.0008835725677830941,
+      "loss": 0.4476,
+      "step": 17500
+    },
+    {
+      "epoch": 27.93,
+      "grad_norm": 2.3756983280181885,
+      "learning_rate": 0.0008829346092503987,
+      "loss": 0.5574,
+      "step": 17510
+    },
+    {
+      "epoch": 27.94,
+      "grad_norm": 2.028165578842163,
+      "learning_rate": 0.0008822966507177033,
+      "loss": 0.5269,
+      "step": 17520
+    },
+    {
+      "epoch": 27.96,
+      "grad_norm": 2.061138868331909,
+      "learning_rate": 0.0008816586921850079,
+      "loss": 0.3863,
+      "step": 17530
+    },
+    {
+      "epoch": 27.97,
+      "grad_norm": 1.6647778749465942,
+      "learning_rate": 0.0008810207336523126,
+      "loss": 0.4803,
+      "step": 17540
+    },
+    {
+      "epoch": 27.99,
+      "grad_norm": 1.8999004364013672,
+      "learning_rate": 0.0008803827751196173,
+      "loss": 0.4465,
+      "step": 17550
+    },
+    {
+      "epoch": 28.01,
+      "grad_norm": 1.4546337127685547,
+      "learning_rate": 0.0008797448165869219,
+      "loss": 0.4054,
+      "step": 17560
+    },
+    {
+      "epoch": 28.02,
+      "grad_norm": 0.5947902798652649,
+      "learning_rate": 0.0008791068580542266,
+      "loss": 0.3905,
+      "step": 17570
+    },
+    {
+      "epoch": 28.04,
+      "grad_norm": 1.231417179107666,
+      "learning_rate": 0.0008784688995215311,
+      "loss": 0.3693,
+      "step": 17580
+    },
+    {
+      "epoch": 28.05,
+      "grad_norm": 1.068305253982544,
+      "learning_rate": 0.0008778309409888358,
+      "loss": 0.3525,
+      "step": 17590
+    },
+    {
+      "epoch": 28.07,
+      "grad_norm": 1.7087610960006714,
+      "learning_rate": 0.0008771929824561404,
+      "loss": 0.3472,
+      "step": 17600
+    },
+    {
+      "epoch": 28.09,
+      "grad_norm": 1.412925362586975,
+      "learning_rate": 0.000876555023923445,
+      "loss": 0.469,
+      "step": 17610
+    },
+    {
+      "epoch": 28.1,
+      "grad_norm": 1.3570494651794434,
+      "learning_rate": 0.0008759170653907497,
+      "loss": 0.4222,
+      "step": 17620
+    },
+    {
+      "epoch": 28.12,
+      "grad_norm": 0.9123827219009399,
+      "learning_rate": 0.0008752791068580543,
+      "loss": 0.3517,
+      "step": 17630
+    },
+    {
+      "epoch": 28.13,
+      "grad_norm": 1.3093185424804688,
+      "learning_rate": 0.0008746411483253589,
+      "loss": 0.4117,
+      "step": 17640
+    },
+    {
+      "epoch": 28.15,
+      "grad_norm": 1.1676615476608276,
+      "learning_rate": 0.0008740031897926635,
+      "loss": 0.3839,
+      "step": 17650
+    },
+    {
+      "epoch": 28.17,
+      "grad_norm": 0.8572595119476318,
+      "learning_rate": 0.0008733652312599682,
+      "loss": 0.38,
+      "step": 17660
+    },
+    {
+      "epoch": 28.18,
+      "grad_norm": 1.9796086549758911,
+      "learning_rate": 0.0008727272727272727,
+      "loss": 0.4742,
+      "step": 17670
+    },
+    {
+      "epoch": 28.2,
+      "grad_norm": 1.599166989326477,
+      "learning_rate": 0.0008720893141945774,
+      "loss": 0.4556,
+      "step": 17680
+    },
+    {
+      "epoch": 28.21,
+      "grad_norm": 1.9437137842178345,
+      "learning_rate": 0.0008714513556618821,
+      "loss": 0.4952,
+      "step": 17690
+    },
+    {
+      "epoch": 28.23,
+      "grad_norm": 1.6551004648208618,
+      "learning_rate": 0.0008708133971291866,
+      "loss": 0.3986,
+      "step": 17700
+    },
+    {
+      "epoch": 28.25,
+      "grad_norm": 1.8391096591949463,
+      "learning_rate": 0.0008701754385964913,
+      "loss": 0.4194,
+      "step": 17710
+    },
+    {
+      "epoch": 28.26,
+      "grad_norm": 0.9920051097869873,
+      "learning_rate": 0.0008695374800637959,
+      "loss": 0.3206,
+      "step": 17720
+    },
+    {
+      "epoch": 28.28,
+      "grad_norm": 1.8732203245162964,
+      "learning_rate": 0.0008688995215311005,
+      "loss": 0.4823,
+      "step": 17730
+    },
+    {
+      "epoch": 28.29,
+      "grad_norm": 1.4714813232421875,
+      "learning_rate": 0.0008682615629984051,
+      "loss": 0.4007,
+      "step": 17740
+    },
+    {
+      "epoch": 28.31,
+      "grad_norm": 1.8994234800338745,
+      "learning_rate": 0.0008676236044657098,
+      "loss": 0.4203,
+      "step": 17750
+    },
+    {
+      "epoch": 28.33,
+      "grad_norm": 1.9376466274261475,
+      "learning_rate": 0.0008669856459330143,
+      "loss": 0.4652,
+      "step": 17760
+    },
+    {
+      "epoch": 28.34,
+      "grad_norm": 2.3434700965881348,
+      "learning_rate": 0.000866347687400319,
+      "loss": 0.3574,
+      "step": 17770
+    },
+    {
+      "epoch": 28.36,
+      "grad_norm": 1.5705221891403198,
+      "learning_rate": 0.0008657097288676237,
+      "loss": 0.3634,
+      "step": 17780
+    },
+    {
+      "epoch": 28.37,
+      "grad_norm": 2.1308560371398926,
+      "learning_rate": 0.0008650717703349283,
+      "loss": 0.4423,
+      "step": 17790
+    },
+    {
+      "epoch": 28.39,
+      "grad_norm": 0.966135561466217,
+      "learning_rate": 0.0008644338118022329,
+      "loss": 0.4221,
+      "step": 17800
+    },
+    {
+      "epoch": 28.41,
+      "grad_norm": 1.37132728099823,
+      "learning_rate": 0.0008637958532695375,
+      "loss": 0.4348,
+      "step": 17810
+    },
+    {
+      "epoch": 28.42,
+      "grad_norm": 1.676096796989441,
+      "learning_rate": 0.0008631578947368422,
+      "loss": 0.4338,
+      "step": 17820
+    },
+    {
+      "epoch": 28.44,
+      "grad_norm": 1.1030077934265137,
+      "learning_rate": 0.0008625199362041467,
+      "loss": 0.4399,
+      "step": 17830
+    },
+    {
+      "epoch": 28.45,
+      "grad_norm": 0.8978865146636963,
+      "learning_rate": 0.0008618819776714514,
+      "loss": 0.4306,
+      "step": 17840
+    },
+    {
+      "epoch": 28.47,
+      "grad_norm": 1.170512080192566,
+      "learning_rate": 0.0008612440191387561,
+      "loss": 0.4347,
+      "step": 17850
+    },
+    {
+      "epoch": 28.48,
+      "grad_norm": 1.0260136127471924,
+      "learning_rate": 0.0008606060606060606,
+      "loss": 0.3928,
+      "step": 17860
+    },
+    {
+      "epoch": 28.5,
+      "grad_norm": 1.04338800907135,
+      "learning_rate": 0.0008599681020733653,
+      "loss": 0.4193,
+      "step": 17870
+    },
+    {
+      "epoch": 28.52,
+      "grad_norm": 0.9068986177444458,
+      "learning_rate": 0.0008593301435406699,
+      "loss": 0.3889,
+      "step": 17880
+    },
+    {
+      "epoch": 28.53,
+      "grad_norm": 1.3259004354476929,
+      "learning_rate": 0.0008586921850079745,
+      "loss": 0.3873,
+      "step": 17890
+    },
+    {
+      "epoch": 28.55,
+      "grad_norm": 1.3916800022125244,
+      "learning_rate": 0.0008580542264752791,
+      "loss": 0.4631,
+      "step": 17900
+    },
+    {
+      "epoch": 28.56,
+      "grad_norm": 2.1619112491607666,
+      "learning_rate": 0.0008574162679425838,
+      "loss": 0.4275,
+      "step": 17910
+    },
+    {
+      "epoch": 28.58,
+      "grad_norm": 1.750162959098816,
+      "learning_rate": 0.0008567783094098883,
+      "loss": 0.4469,
+      "step": 17920
+    },
+    {
+      "epoch": 28.6,
+      "grad_norm": 1.2156579494476318,
+      "learning_rate": 0.000856140350877193,
+      "loss": 0.3898,
+      "step": 17930
+    },
+    {
+      "epoch": 28.61,
+      "grad_norm": 1.1427280902862549,
+      "learning_rate": 0.0008555023923444977,
+      "loss": 0.4424,
+      "step": 17940
+    },
+    {
+      "epoch": 28.63,
+      "grad_norm": 1.6410181522369385,
+      "learning_rate": 0.0008548644338118022,
+      "loss": 0.4153,
+      "step": 17950
+    },
+    {
+      "epoch": 28.64,
+      "grad_norm": 1.1331639289855957,
+      "learning_rate": 0.0008542264752791069,
+      "loss": 0.4638,
+      "step": 17960
+    },
+    {
+      "epoch": 28.66,
+      "grad_norm": 0.9264315366744995,
+      "learning_rate": 0.0008535885167464115,
+      "loss": 0.4634,
+      "step": 17970
+    },
+    {
+      "epoch": 28.68,
+      "grad_norm": 1.4615089893341064,
+      "learning_rate": 0.0008529505582137161,
+      "loss": 0.383,
+      "step": 17980
+    },
+    {
+      "epoch": 28.69,
+      "grad_norm": 1.291256070137024,
+      "learning_rate": 0.0008523125996810207,
+      "loss": 0.4415,
+      "step": 17990
+    },
+    {
+      "epoch": 28.71,
+      "grad_norm": 1.3759894371032715,
+      "learning_rate": 0.0008516746411483254,
+      "loss": 0.4275,
+      "step": 18000
+    },
+    {
+      "epoch": 28.72,
+      "grad_norm": 2.605381488800049,
+      "learning_rate": 0.00085103668261563,
+      "loss": 0.4614,
+      "step": 18010
+    },
+    {
+      "epoch": 28.74,
+      "grad_norm": 1.3442084789276123,
+      "learning_rate": 0.0008503987240829346,
+      "loss": 0.4276,
+      "step": 18020
+    },
+    {
+      "epoch": 28.76,
+      "grad_norm": 1.7800729274749756,
+      "learning_rate": 0.0008497607655502393,
+      "loss": 0.5137,
+      "step": 18030
+    },
+    {
+      "epoch": 28.77,
+      "grad_norm": 1.6473747491836548,
+      "learning_rate": 0.0008491228070175438,
+      "loss": 0.4381,
+      "step": 18040
+    },
+    {
+      "epoch": 28.79,
+      "grad_norm": 1.2551579475402832,
+      "learning_rate": 0.0008484848484848485,
+      "loss": 0.3784,
+      "step": 18050
+    },
+    {
+      "epoch": 28.8,
+      "grad_norm": 1.7706053256988525,
+      "learning_rate": 0.0008478468899521531,
+      "loss": 0.4137,
+      "step": 18060
+    },
+    {
+      "epoch": 28.82,
+      "grad_norm": 1.2189148664474487,
+      "learning_rate": 0.0008472089314194577,
+      "loss": 0.3987,
+      "step": 18070
+    },
+    {
+      "epoch": 28.84,
+      "grad_norm": 2.0609757900238037,
+      "learning_rate": 0.0008465709728867623,
+      "loss": 0.4321,
+      "step": 18080
+    },
+    {
+      "epoch": 28.85,
+      "grad_norm": 3.152968406677246,
+      "learning_rate": 0.000845933014354067,
+      "loss": 0.4866,
+      "step": 18090
+    },
+    {
+      "epoch": 28.87,
+      "grad_norm": 1.9931256771087646,
+      "learning_rate": 0.0008452950558213716,
+      "loss": 0.3915,
+      "step": 18100
+    },
+    {
+      "epoch": 28.88,
+      "grad_norm": 1.5088871717453003,
+      "learning_rate": 0.0008446570972886762,
+      "loss": 0.4172,
+      "step": 18110
+    },
+    {
+      "epoch": 28.9,
+      "grad_norm": 0.9786420464515686,
+      "learning_rate": 0.0008440191387559809,
+      "loss": 0.392,
+      "step": 18120
+    },
+    {
+      "epoch": 28.92,
+      "grad_norm": 1.9202160835266113,
+      "learning_rate": 0.0008433811802232854,
+      "loss": 0.3735,
+      "step": 18130
+    },
+    {
+      "epoch": 28.93,
+      "grad_norm": 1.7300411462783813,
+      "learning_rate": 0.0008427432216905901,
+      "loss": 0.493,
+      "step": 18140
+    },
+    {
+      "epoch": 28.95,
+      "grad_norm": 1.3852993249893188,
+      "learning_rate": 0.0008421052631578947,
+      "loss": 0.447,
+      "step": 18150
+    },
+    {
+      "epoch": 28.96,
+      "grad_norm": 1.5973821878433228,
+      "learning_rate": 0.0008414673046251993,
+      "loss": 0.4247,
+      "step": 18160
+    },
+    {
+      "epoch": 28.98,
+      "grad_norm": 1.521041989326477,
+      "learning_rate": 0.000840829346092504,
+      "loss": 0.514,
+      "step": 18170
+    },
+    {
+      "epoch": 29.0,
+      "grad_norm": 1.2108961343765259,
+      "learning_rate": 0.0008401913875598086,
+      "loss": 0.4258,
+      "step": 18180
+    },
+    {
+      "epoch": 29.01,
+      "grad_norm": 1.1732271909713745,
+      "learning_rate": 0.0008395534290271133,
+      "loss": 0.391,
+      "step": 18190
+    },
+    {
+      "epoch": 29.03,
+      "grad_norm": 2.4832112789154053,
+      "learning_rate": 0.0008389154704944178,
+      "loss": 0.4139,
+      "step": 18200
+    },
+    {
+      "epoch": 29.04,
+      "grad_norm": 1.2037804126739502,
+      "learning_rate": 0.0008382775119617225,
+      "loss": 0.3189,
+      "step": 18210
+    },
+    {
+      "epoch": 29.06,
+      "grad_norm": 1.1315257549285889,
+      "learning_rate": 0.000837639553429027,
+      "loss": 0.3455,
+      "step": 18220
+    },
+    {
+      "epoch": 29.07,
+      "grad_norm": 0.7903701663017273,
+      "learning_rate": 0.0008370015948963317,
+      "loss": 0.3511,
+      "step": 18230
+    },
+    {
+      "epoch": 29.09,
+      "grad_norm": 1.1077697277069092,
+      "learning_rate": 0.0008363636363636363,
+      "loss": 0.4795,
+      "step": 18240
+    },
+    {
+      "epoch": 29.11,
+      "grad_norm": 0.752619206905365,
+      "learning_rate": 0.000835725677830941,
+      "loss": 0.3194,
+      "step": 18250
+    },
+    {
+      "epoch": 29.12,
+      "grad_norm": 2.1113548278808594,
+      "learning_rate": 0.0008350877192982456,
+      "loss": 0.3937,
+      "step": 18260
+    },
+    {
+      "epoch": 29.14,
+      "grad_norm": 1.051826000213623,
+      "learning_rate": 0.0008344497607655502,
+      "loss": 0.3966,
+      "step": 18270
+    },
+    {
+      "epoch": 29.15,
+      "grad_norm": 1.329938530921936,
+      "learning_rate": 0.0008338118022328549,
+      "loss": 0.4212,
+      "step": 18280
+    },
+    {
+      "epoch": 29.17,
+      "grad_norm": 1.7144334316253662,
+      "learning_rate": 0.0008331738437001594,
+      "loss": 0.4333,
+      "step": 18290
+    },
+    {
+      "epoch": 29.19,
+      "grad_norm": 1.252589464187622,
+      "learning_rate": 0.0008325358851674642,
+      "loss": 0.379,
+      "step": 18300
+    },
+    {
+      "epoch": 29.2,
+      "grad_norm": 0.7238291501998901,
+      "learning_rate": 0.0008318979266347688,
+      "loss": 0.353,
+      "step": 18310
+    },
+    {
+      "epoch": 29.22,
+      "grad_norm": 1.5246005058288574,
+      "learning_rate": 0.0008312599681020734,
+      "loss": 0.361,
+      "step": 18320
+    },
+    {
+      "epoch": 29.23,
+      "grad_norm": 1.488550066947937,
+      "learning_rate": 0.0008306220095693781,
+      "loss": 0.4171,
+      "step": 18330
+    },
+    {
+      "epoch": 29.25,
+      "grad_norm": 1.30950129032135,
+      "learning_rate": 0.0008299840510366827,
+      "loss": 0.37,
+      "step": 18340
+    },
+    {
+      "epoch": 29.27,
+      "grad_norm": 2.5584652423858643,
+      "learning_rate": 0.0008293460925039873,
+      "loss": 0.4683,
+      "step": 18350
+    },
+    {
+      "epoch": 29.28,
+      "grad_norm": 1.2544807195663452,
+      "learning_rate": 0.0008287081339712919,
+      "loss": 0.3517,
+      "step": 18360
+    },
+    {
+      "epoch": 29.3,
+      "grad_norm": 1.9312729835510254,
+      "learning_rate": 0.0008280701754385966,
+      "loss": 0.3754,
+      "step": 18370
+    },
+    {
+      "epoch": 29.31,
+      "grad_norm": 1.9227901697158813,
+      "learning_rate": 0.0008274322169059011,
+      "loss": 0.353,
+      "step": 18380
+    },
+    {
+      "epoch": 29.33,
+      "grad_norm": 0.7560509443283081,
+      "learning_rate": 0.0008267942583732058,
+      "loss": 0.4141,
+      "step": 18390
+    },
+    {
+      "epoch": 29.35,
+      "grad_norm": 1.5966806411743164,
+      "learning_rate": 0.0008261562998405105,
+      "loss": 0.419,
+      "step": 18400
+    },
+    {
+      "epoch": 29.36,
+      "grad_norm": 1.8788731098175049,
+      "learning_rate": 0.000825518341307815,
+      "loss": 0.4134,
+      "step": 18410
+    },
+    {
+      "epoch": 29.38,
+      "grad_norm": 0.7582562565803528,
+      "learning_rate": 0.0008248803827751197,
+      "loss": 0.3092,
+      "step": 18420
+    },
+    {
+      "epoch": 29.39,
+      "grad_norm": 1.155375599861145,
+      "learning_rate": 0.0008242424242424243,
+      "loss": 0.4677,
+      "step": 18430
+    },
+    {
+      "epoch": 29.41,
+      "grad_norm": 0.581142246723175,
+      "learning_rate": 0.0008236044657097289,
+      "loss": 0.4426,
+      "step": 18440
+    },
+    {
+      "epoch": 29.43,
+      "grad_norm": 1.6078975200653076,
+      "learning_rate": 0.0008229665071770335,
+      "loss": 0.3892,
+      "step": 18450
+    },
+    {
+      "epoch": 29.44,
+      "grad_norm": 1.3083795309066772,
+      "learning_rate": 0.0008223285486443382,
+      "loss": 0.3718,
+      "step": 18460
+    },
+    {
+      "epoch": 29.46,
+      "grad_norm": 1.41934072971344,
+      "learning_rate": 0.0008216905901116427,
+      "loss": 0.3701,
+      "step": 18470
+    },
+    {
+      "epoch": 29.47,
+      "grad_norm": 1.7969826459884644,
+      "learning_rate": 0.0008210526315789474,
+      "loss": 0.44,
+      "step": 18480
+    },
+    {
+      "epoch": 29.49,
+      "grad_norm": 1.134151577949524,
+      "learning_rate": 0.0008204146730462521,
+      "loss": 0.3765,
+      "step": 18490
+    },
+    {
+      "epoch": 29.51,
+      "grad_norm": 1.9421136379241943,
+      "learning_rate": 0.0008197767145135566,
+      "loss": 0.3526,
+      "step": 18500
+    },
+    {
+      "epoch": 29.52,
+      "grad_norm": 1.0447088479995728,
+      "learning_rate": 0.0008191387559808613,
+      "loss": 0.3929,
+      "step": 18510
+    },
+    {
+      "epoch": 29.54,
+      "grad_norm": 2.2842037677764893,
+      "learning_rate": 0.0008185007974481659,
+      "loss": 0.428,
+      "step": 18520
+    },
+    {
+      "epoch": 29.55,
+      "grad_norm": 1.4780536890029907,
+      "learning_rate": 0.0008178628389154705,
+      "loss": 0.4503,
+      "step": 18530
+    },
+    {
+      "epoch": 29.57,
+      "grad_norm": 1.1551343202590942,
+      "learning_rate": 0.0008172248803827751,
+      "loss": 0.3608,
+      "step": 18540
+    },
+    {
+      "epoch": 29.59,
+      "grad_norm": 0.9097251892089844,
+      "learning_rate": 0.0008165869218500798,
+      "loss": 0.3289,
+      "step": 18550
+    },
+    {
+      "epoch": 29.6,
+      "grad_norm": 1.372117042541504,
+      "learning_rate": 0.0008159489633173845,
+      "loss": 0.4009,
+      "step": 18560
+    },
+    {
+      "epoch": 29.62,
+      "grad_norm": 1.3583626747131348,
+      "learning_rate": 0.000815311004784689,
+      "loss": 0.4491,
+      "step": 18570
+    },
+    {
+      "epoch": 29.63,
+      "grad_norm": 1.208733320236206,
+      "learning_rate": 0.0008146730462519937,
+      "loss": 0.4185,
+      "step": 18580
+    },
+    {
+      "epoch": 29.65,
+      "grad_norm": 1.2088313102722168,
+      "learning_rate": 0.0008140350877192983,
+      "loss": 0.3733,
+      "step": 18590
+    },
+    {
+      "epoch": 29.67,
+      "grad_norm": 1.3854396343231201,
+      "learning_rate": 0.0008133971291866029,
+      "loss": 0.4189,
+      "step": 18600
+    },
+    {
+      "epoch": 29.68,
+      "grad_norm": 2.3625354766845703,
+      "learning_rate": 0.0008127591706539075,
+      "loss": 0.4158,
+      "step": 18610
+    },
+    {
+      "epoch": 29.7,
+      "grad_norm": 1.1244155168533325,
+      "learning_rate": 0.0008121212121212122,
+      "loss": 0.4763,
+      "step": 18620
+    },
+    {
+      "epoch": 29.71,
+      "grad_norm": 1.6580774784088135,
+      "learning_rate": 0.0008114832535885167,
+      "loss": 0.441,
+      "step": 18630
+    },
+    {
+      "epoch": 29.73,
+      "grad_norm": 1.9566985368728638,
+      "learning_rate": 0.0008108452950558214,
+      "loss": 0.5087,
+      "step": 18640
+    },
+    {
+      "epoch": 29.74,
+      "grad_norm": 1.5847853422164917,
+      "learning_rate": 0.0008102073365231261,
+      "loss": 0.4876,
+      "step": 18650
+    },
+    {
+      "epoch": 29.76,
+      "grad_norm": 1.6287412643432617,
+      "learning_rate": 0.0008095693779904306,
+      "loss": 0.3967,
+      "step": 18660
+    },
+    {
+      "epoch": 29.78,
+      "grad_norm": 1.0776193141937256,
+      "learning_rate": 0.0008089314194577353,
+      "loss": 0.3406,
+      "step": 18670
+    },
+    {
+      "epoch": 29.79,
+      "grad_norm": 1.6697405576705933,
+      "learning_rate": 0.0008082934609250399,
+      "loss": 0.3976,
+      "step": 18680
+    },
+    {
+      "epoch": 29.81,
+      "grad_norm": 1.0817621946334839,
+      "learning_rate": 0.0008076555023923445,
+      "loss": 0.4045,
+      "step": 18690
+    },
+    {
+      "epoch": 29.82,
+      "grad_norm": 1.333869457244873,
+      "learning_rate": 0.0008070175438596491,
+      "loss": 0.4219,
+      "step": 18700
+    },
+    {
+      "epoch": 29.84,
+      "grad_norm": 1.135141134262085,
+      "learning_rate": 0.0008063795853269538,
+      "loss": 0.4283,
+      "step": 18710
+    },
+    {
+      "epoch": 29.86,
+      "grad_norm": 1.497247576713562,
+      "learning_rate": 0.0008057416267942584,
+      "loss": 0.4653,
+      "step": 18720
+    },
+    {
+      "epoch": 29.87,
+      "grad_norm": 1.3332675695419312,
+      "learning_rate": 0.000805103668261563,
+      "loss": 0.4842,
+      "step": 18730
+    },
+    {
+      "epoch": 29.89,
+      "grad_norm": 1.2101079225540161,
+      "learning_rate": 0.0008044657097288677,
+      "loss": 0.4352,
+      "step": 18740
+    },
+    {
+      "epoch": 29.9,
+      "grad_norm": 1.161740779876709,
+      "learning_rate": 0.0008038277511961722,
+      "loss": 0.4775,
+      "step": 18750
+    },
+    {
+      "epoch": 29.92,
+      "grad_norm": 2.115443229675293,
+      "learning_rate": 0.0008031897926634769,
+      "loss": 0.4047,
+      "step": 18760
+    },
+    {
+      "epoch": 29.94,
+      "grad_norm": 1.6414830684661865,
+      "learning_rate": 0.0008025518341307815,
+      "loss": 0.4002,
+      "step": 18770
+    },
+    {
+      "epoch": 29.95,
+      "grad_norm": 2.5109338760375977,
+      "learning_rate": 0.0008019138755980861,
+      "loss": 0.456,
+      "step": 18780
+    },
+    {
+      "epoch": 29.97,
+      "grad_norm": 1.0796329975128174,
+      "learning_rate": 0.0008012759170653907,
+      "loss": 0.3812,
+      "step": 18790
+    },
+    {
+      "epoch": 29.98,
+      "grad_norm": 1.581429362297058,
+      "learning_rate": 0.0008006379585326954,
+      "loss": 0.4038,
+      "step": 18800
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 1.5293798446655273,
+      "learning_rate": 0.0008,
+      "loss": 0.4527,
+      "step": 18810
+    },
+    {
+      "epoch": 30.02,
+      "grad_norm": 0.3642142117023468,
+      "learning_rate": 0.0007993620414673046,
+      "loss": 0.3424,
+      "step": 18820
+    },
+    {
+      "epoch": 30.03,
+      "grad_norm": 1.181534767150879,
+      "learning_rate": 0.0007987240829346093,
+      "loss": 0.3467,
+      "step": 18830
+    },
+    {
+      "epoch": 30.05,
+      "grad_norm": 1.3809243440628052,
+      "learning_rate": 0.0007980861244019138,
+      "loss": 0.3445,
+      "step": 18840
+    },
+    {
+      "epoch": 30.06,
+      "grad_norm": 0.9193634986877441,
+      "learning_rate": 0.0007974481658692185,
+      "loss": 0.418,
+      "step": 18850
+    },
+    {
+      "epoch": 30.08,
+      "grad_norm": 1.8701001405715942,
+      "learning_rate": 0.0007968102073365231,
+      "loss": 0.3377,
+      "step": 18860
+    },
+    {
+      "epoch": 30.1,
+      "grad_norm": 1.0947514772415161,
+      "learning_rate": 0.0007961722488038277,
+      "loss": 0.3047,
+      "step": 18870
+    },
+    {
+      "epoch": 30.11,
+      "grad_norm": 1.5115679502487183,
+      "learning_rate": 0.0007955342902711324,
+      "loss": 0.4687,
+      "step": 18880
+    },
+    {
+      "epoch": 30.13,
+      "grad_norm": 1.4967734813690186,
+      "learning_rate": 0.000794896331738437,
+      "loss": 0.3585,
+      "step": 18890
+    },
+    {
+      "epoch": 30.14,
+      "grad_norm": 1.0271711349487305,
+      "learning_rate": 0.0007942583732057416,
+      "loss": 0.3857,
+      "step": 18900
+    },
+    {
+      "epoch": 30.16,
+      "grad_norm": 0.6969228982925415,
+      "learning_rate": 0.0007936204146730462,
+      "loss": 0.3916,
+      "step": 18910
+    },
+    {
+      "epoch": 30.18,
+      "grad_norm": 1.0234570503234863,
+      "learning_rate": 0.0007929824561403509,
+      "loss": 0.2736,
+      "step": 18920
+    },
+    {
+      "epoch": 30.19,
+      "grad_norm": 0.3337653577327728,
+      "learning_rate": 0.0007923444976076554,
+      "loss": 0.3358,
+      "step": 18930
+    },
+    {
+      "epoch": 30.21,
+      "grad_norm": 0.8475213646888733,
+      "learning_rate": 0.0007917065390749601,
+      "loss": 0.4215,
+      "step": 18940
+    },
+    {
+      "epoch": 30.22,
+      "grad_norm": 1.190527319908142,
+      "learning_rate": 0.0007910685805422647,
+      "loss": 0.3912,
+      "step": 18950
+    },
+    {
+      "epoch": 30.24,
+      "grad_norm": 2.408203601837158,
+      "learning_rate": 0.0007904306220095693,
+      "loss": 0.4475,
+      "step": 18960
+    },
+    {
+      "epoch": 30.26,
+      "grad_norm": 0.8776838183403015,
+      "learning_rate": 0.000789792663476874,
+      "loss": 0.3504,
+      "step": 18970
+    },
+    {
+      "epoch": 30.27,
+      "grad_norm": 2.3490686416625977,
+      "learning_rate": 0.0007891547049441786,
+      "loss": 0.3283,
+      "step": 18980
+    },
+    {
+      "epoch": 30.29,
+      "grad_norm": 0.9696643948554993,
+      "learning_rate": 0.0007885167464114833,
+      "loss": 0.3629,
+      "step": 18990
+    },
+    {
+      "epoch": 30.3,
+      "grad_norm": 1.1829396486282349,
+      "learning_rate": 0.0007878787878787878,
+      "loss": 0.3908,
+      "step": 19000
+    },
+    {
+      "epoch": 30.32,
+      "grad_norm": 1.0642168521881104,
+      "learning_rate": 0.0007872408293460925,
+      "loss": 0.359,
+      "step": 19010
+    },
+    {
+      "epoch": 30.33,
+      "grad_norm": 1.3545867204666138,
+      "learning_rate": 0.000786602870813397,
+      "loss": 0.3656,
+      "step": 19020
+    },
+    {
+      "epoch": 30.35,
+      "grad_norm": 1.1527637243270874,
+      "learning_rate": 0.0007859649122807017,
+      "loss": 0.358,
+      "step": 19030
+    },
+    {
+      "epoch": 30.37,
+      "grad_norm": 1.6512736082077026,
+      "learning_rate": 0.0007853269537480065,
+      "loss": 0.3228,
+      "step": 19040
+    },
+    {
+      "epoch": 30.38,
+      "grad_norm": 0.9381676912307739,
+      "learning_rate": 0.0007846889952153111,
+      "loss": 0.3604,
+      "step": 19050
+    },
+    {
+      "epoch": 30.4,
+      "grad_norm": 0.786491870880127,
+      "learning_rate": 0.0007840510366826157,
+      "loss": 0.4115,
+      "step": 19060
+    },
+    {
+      "epoch": 30.41,
+      "grad_norm": 1.0724458694458008,
+      "learning_rate": 0.0007834130781499203,
+      "loss": 0.431,
+      "step": 19070
+    },
+    {
+      "epoch": 30.43,
+      "grad_norm": 1.1807036399841309,
+      "learning_rate": 0.000782775119617225,
+      "loss": 0.3628,
+      "step": 19080
+    },
+    {
+      "epoch": 30.45,
+      "grad_norm": 2.059079647064209,
+      "learning_rate": 0.0007821371610845295,
+      "loss": 0.4,
+      "step": 19090
+    },
+    {
+      "epoch": 30.46,
+      "grad_norm": 2.4132237434387207,
+      "learning_rate": 0.0007814992025518342,
+      "loss": 0.41,
+      "step": 19100
+    },
+    {
+      "epoch": 30.48,
+      "grad_norm": 1.3958185911178589,
+      "learning_rate": 0.0007808612440191388,
+      "loss": 0.4278,
+      "step": 19110
+    },
+    {
+      "epoch": 30.49,
+      "grad_norm": 1.9119430780410767,
+      "learning_rate": 0.0007802232854864434,
+      "loss": 0.3688,
+      "step": 19120
+    },
+    {
+      "epoch": 30.51,
+      "grad_norm": 0.8936794996261597,
+      "learning_rate": 0.0007795853269537481,
+      "loss": 0.3909,
+      "step": 19130
+    },
+    {
+      "epoch": 30.53,
+      "grad_norm": 0.8988013863563538,
+      "learning_rate": 0.0007789473684210527,
+      "loss": 0.413,
+      "step": 19140
+    },
+    {
+      "epoch": 30.54,
+      "grad_norm": 1.2821409702301025,
+      "learning_rate": 0.0007783094098883573,
+      "loss": 0.3746,
+      "step": 19150
+    },
+    {
+      "epoch": 30.56,
+      "grad_norm": 1.8209261894226074,
+      "learning_rate": 0.0007776714513556619,
+      "loss": 0.4328,
+      "step": 19160
+    },
+    {
+      "epoch": 30.57,
+      "grad_norm": 0.5443445444107056,
+      "learning_rate": 0.0007770334928229666,
+      "loss": 0.3094,
+      "step": 19170
+    },
+    {
+      "epoch": 30.59,
+      "grad_norm": 1.1508780717849731,
+      "learning_rate": 0.0007763955342902711,
+      "loss": 0.3781,
+      "step": 19180
+    },
+    {
+      "epoch": 30.61,
+      "grad_norm": 1.742360234260559,
+      "learning_rate": 0.0007757575757575758,
+      "loss": 0.4211,
+      "step": 19190
+    },
+    {
+      "epoch": 30.62,
+      "grad_norm": 1.645337462425232,
+      "learning_rate": 0.0007751196172248805,
+      "loss": 0.4973,
+      "step": 19200
+    },
+    {
+      "epoch": 30.64,
+      "grad_norm": 3.0447423458099365,
+      "learning_rate": 0.000774481658692185,
+      "loss": 0.4262,
+      "step": 19210
+    },
+    {
+      "epoch": 30.65,
+      "grad_norm": 1.7042852640151978,
+      "learning_rate": 0.0007738437001594897,
+      "loss": 0.3602,
+      "step": 19220
+    },
+    {
+      "epoch": 30.67,
+      "grad_norm": 1.517238974571228,
+      "learning_rate": 0.0007732057416267943,
+      "loss": 0.4683,
+      "step": 19230
+    },
+    {
+      "epoch": 30.69,
+      "grad_norm": 1.3933392763137817,
+      "learning_rate": 0.0007725677830940989,
+      "loss": 0.3464,
+      "step": 19240
+    },
+    {
+      "epoch": 30.7,
+      "grad_norm": 1.2021284103393555,
+      "learning_rate": 0.0007719298245614035,
+      "loss": 0.3205,
+      "step": 19250
+    },
+    {
+      "epoch": 30.72,
+      "grad_norm": 2.1373493671417236,
+      "learning_rate": 0.0007712918660287082,
+      "loss": 0.3901,
+      "step": 19260
+    },
+    {
+      "epoch": 30.73,
+      "grad_norm": 1.1844630241394043,
+      "learning_rate": 0.0007706539074960128,
+      "loss": 0.4874,
+      "step": 19270
+    },
+    {
+      "epoch": 30.75,
+      "grad_norm": 1.7633776664733887,
+      "learning_rate": 0.0007700159489633174,
+      "loss": 0.4525,
+      "step": 19280
+    },
+    {
+      "epoch": 30.77,
+      "grad_norm": 0.8574751615524292,
+      "learning_rate": 0.0007693779904306221,
+      "loss": 0.4477,
+      "step": 19290
+    },
+    {
+      "epoch": 30.78,
+      "grad_norm": 0.9806014895439148,
+      "learning_rate": 0.0007687400318979266,
+      "loss": 0.4231,
+      "step": 19300
+    },
+    {
+      "epoch": 30.8,
+      "grad_norm": 1.515453577041626,
+      "learning_rate": 0.0007681020733652313,
+      "loss": 0.4159,
+      "step": 19310
+    },
+    {
+      "epoch": 30.81,
+      "grad_norm": 1.195142149925232,
+      "learning_rate": 0.0007674641148325359,
+      "loss": 0.463,
+      "step": 19320
+    },
+    {
+      "epoch": 30.83,
+      "grad_norm": 1.2401401996612549,
+      "learning_rate": 0.0007668261562998405,
+      "loss": 0.3719,
+      "step": 19330
+    },
+    {
+      "epoch": 30.85,
+      "grad_norm": 1.4845614433288574,
+      "learning_rate": 0.0007661881977671451,
+      "loss": 0.4013,
+      "step": 19340
+    },
+    {
+      "epoch": 30.86,
+      "grad_norm": 1.5066015720367432,
+      "learning_rate": 0.0007655502392344498,
+      "loss": 0.3917,
+      "step": 19350
+    },
+    {
+      "epoch": 30.88,
+      "grad_norm": 1.3425683975219727,
+      "learning_rate": 0.0007649122807017545,
+      "loss": 0.4719,
+      "step": 19360
+    },
+    {
+      "epoch": 30.89,
+      "grad_norm": 1.1638840436935425,
+      "learning_rate": 0.000764274322169059,
+      "loss": 0.3526,
+      "step": 19370
+    },
+    {
+      "epoch": 30.91,
+      "grad_norm": 1.2238682508468628,
+      "learning_rate": 0.0007636363636363637,
+      "loss": 0.4084,
+      "step": 19380
+    },
+    {
+      "epoch": 30.93,
+      "grad_norm": 2.78072190284729,
+      "learning_rate": 0.0007629984051036683,
+      "loss": 0.4198,
+      "step": 19390
+    },
+    {
+      "epoch": 30.94,
+      "grad_norm": 1.495713472366333,
+      "learning_rate": 0.0007623604465709729,
+      "loss": 0.451,
+      "step": 19400
+    },
+    {
+      "epoch": 30.96,
+      "grad_norm": 1.5464080572128296,
+      "learning_rate": 0.0007617224880382775,
+      "loss": 0.492,
+      "step": 19410
+    },
+    {
+      "epoch": 30.97,
+      "grad_norm": 1.054413080215454,
+      "learning_rate": 0.0007610845295055822,
+      "loss": 0.372,
+      "step": 19420
+    },
+    {
+      "epoch": 30.99,
+      "grad_norm": 1.212849736213684,
+      "learning_rate": 0.0007604465709728868,
+      "loss": 0.4701,
+      "step": 19430
+    },
+    {
+      "epoch": 31.0,
+      "grad_norm": 1.4419000148773193,
+      "learning_rate": 0.0007598086124401914,
+      "loss": 0.4366,
+      "step": 19440
+    },
+    {
+      "epoch": 31.02,
+      "grad_norm": 1.8938413858413696,
+      "learning_rate": 0.0007591706539074961,
+      "loss": 0.3519,
+      "step": 19450
+    },
+    {
+      "epoch": 31.04,
+      "grad_norm": 0.6526773571968079,
+      "learning_rate": 0.0007585326953748006,
+      "loss": 0.3049,
+      "step": 19460
+    },
+    {
+      "epoch": 31.05,
+      "grad_norm": 1.1578338146209717,
+      "learning_rate": 0.0007578947368421053,
+      "loss": 0.32,
+      "step": 19470
+    },
+    {
+      "epoch": 31.07,
+      "grad_norm": 1.577438235282898,
+      "learning_rate": 0.0007572567783094099,
+      "loss": 0.3386,
+      "step": 19480
+    },
+    {
+      "epoch": 31.08,
+      "grad_norm": 1.2335830926895142,
+      "learning_rate": 0.0007566188197767145,
+      "loss": 0.3304,
+      "step": 19490
+    },
+    {
+      "epoch": 31.1,
+      "grad_norm": 1.2585694789886475,
+      "learning_rate": 0.0007559808612440191,
+      "loss": 0.347,
+      "step": 19500
+    },
+    {
+      "epoch": 31.12,
+      "grad_norm": 0.6757459044456482,
+      "learning_rate": 0.0007553429027113238,
+      "loss": 0.3262,
+      "step": 19510
+    },
+    {
+      "epoch": 31.13,
+      "grad_norm": 1.0831152200698853,
+      "learning_rate": 0.0007547049441786284,
+      "loss": 0.3829,
+      "step": 19520
+    },
+    {
+      "epoch": 31.15,
+      "grad_norm": 0.7511752843856812,
+      "learning_rate": 0.000754066985645933,
+      "loss": 0.3401,
+      "step": 19530
+    },
+    {
+      "epoch": 31.16,
+      "grad_norm": 1.0880722999572754,
+      "learning_rate": 0.0007534290271132377,
+      "loss": 0.3095,
+      "step": 19540
+    },
+    {
+      "epoch": 31.18,
+      "grad_norm": 0.5444307923316956,
+      "learning_rate": 0.0007527910685805422,
+      "loss": 0.3768,
+      "step": 19550
+    },
+    {
+      "epoch": 31.2,
+      "grad_norm": 0.3923839330673218,
+      "learning_rate": 0.0007521531100478469,
+      "loss": 0.2829,
+      "step": 19560
+    },
+    {
+      "epoch": 31.21,
+      "grad_norm": 1.0065523386001587,
+      "learning_rate": 0.0007515151515151515,
+      "loss": 0.3505,
+      "step": 19570
+    },
+    {
+      "epoch": 31.23,
+      "grad_norm": 1.3828843832015991,
+      "learning_rate": 0.0007508771929824561,
+      "loss": 0.3315,
+      "step": 19580
+    },
+    {
+      "epoch": 31.24,
+      "grad_norm": 1.0128448009490967,
+      "learning_rate": 0.0007502392344497608,
+      "loss": 0.3646,
+      "step": 19590
+    },
+    {
+      "epoch": 31.26,
+      "grad_norm": 0.8419422507286072,
+      "learning_rate": 0.0007496012759170654,
+      "loss": 0.3779,
+      "step": 19600
+    },
+    {
+      "epoch": 31.28,
+      "grad_norm": 0.926729679107666,
+      "learning_rate": 0.00074896331738437,
+      "loss": 0.3611,
+      "step": 19610
+    },
+    {
+      "epoch": 31.29,
+      "grad_norm": 0.6536783576011658,
+      "learning_rate": 0.0007483253588516746,
+      "loss": 0.4033,
+      "step": 19620
+    },
+    {
+      "epoch": 31.31,
+      "grad_norm": 0.8884857892990112,
+      "learning_rate": 0.0007476874003189793,
+      "loss": 0.3712,
+      "step": 19630
+    },
+    {
+      "epoch": 31.32,
+      "grad_norm": 0.7593963146209717,
+      "learning_rate": 0.0007470494417862838,
+      "loss": 0.3676,
+      "step": 19640
+    },
+    {
+      "epoch": 31.34,
+      "grad_norm": 0.4570366442203522,
+      "learning_rate": 0.0007464114832535885,
+      "loss": 0.3733,
+      "step": 19650
+    },
+    {
+      "epoch": 31.36,
+      "grad_norm": 0.5554494857788086,
+      "learning_rate": 0.0007457735247208931,
+      "loss": 0.3223,
+      "step": 19660
+    },
+    {
+      "epoch": 31.37,
+      "grad_norm": 0.5048463344573975,
+      "learning_rate": 0.0007451355661881977,
+      "loss": 0.3381,
+      "step": 19670
+    },
+    {
+      "epoch": 31.39,
+      "grad_norm": 0.8749020099639893,
+      "learning_rate": 0.0007444976076555024,
+      "loss": 0.4279,
+      "step": 19680
+    },
+    {
+      "epoch": 31.4,
+      "grad_norm": 1.8333324193954468,
+      "learning_rate": 0.000743859649122807,
+      "loss": 0.3923,
+      "step": 19690
+    },
+    {
+      "epoch": 31.42,
+      "grad_norm": 0.5864129662513733,
+      "learning_rate": 0.0007432216905901116,
+      "loss": 0.3666,
+      "step": 19700
+    },
+    {
+      "epoch": 31.44,
+      "grad_norm": 1.0455960035324097,
+      "learning_rate": 0.0007425837320574162,
+      "loss": 0.2746,
+      "step": 19710
+    },
+    {
+      "epoch": 31.45,
+      "grad_norm": 0.851701021194458,
+      "learning_rate": 0.0007419457735247209,
+      "loss": 0.4655,
+      "step": 19720
+    },
+    {
+      "epoch": 31.47,
+      "grad_norm": 1.4521914720535278,
+      "learning_rate": 0.0007413078149920254,
+      "loss": 0.3904,
+      "step": 19730
+    },
+    {
+      "epoch": 31.48,
+      "grad_norm": 0.7903003096580505,
+      "learning_rate": 0.0007406698564593301,
+      "loss": 0.3511,
+      "step": 19740
+    },
+    {
+      "epoch": 31.5,
+      "grad_norm": 0.5965768694877625,
+      "learning_rate": 0.0007400318979266348,
+      "loss": 0.348,
+      "step": 19750
+    },
+    {
+      "epoch": 31.52,
+      "grad_norm": 1.1456114053726196,
+      "learning_rate": 0.0007393939393939393,
+      "loss": 0.3759,
+      "step": 19760
+    },
+    {
+      "epoch": 31.53,
+      "grad_norm": 1.1218417882919312,
+      "learning_rate": 0.000738755980861244,
+      "loss": 0.3504,
+      "step": 19770
+    },
+    {
+      "epoch": 31.55,
+      "grad_norm": 1.06869375705719,
+      "learning_rate": 0.0007381180223285486,
+      "loss": 0.3465,
+      "step": 19780
+    },
+    {
+      "epoch": 31.56,
+      "grad_norm": 1.4235601425170898,
+      "learning_rate": 0.0007374800637958534,
+      "loss": 0.4782,
+      "step": 19790
+    },
+    {
+      "epoch": 31.58,
+      "grad_norm": 1.7712465524673462,
+      "learning_rate": 0.0007368421052631579,
+      "loss": 0.4207,
+      "step": 19800
+    },
+    {
+      "epoch": 31.59,
+      "grad_norm": 2.5546319484710693,
+      "learning_rate": 0.0007362041467304626,
+      "loss": 0.3945,
+      "step": 19810
+    },
+    {
+      "epoch": 31.61,
+      "grad_norm": 1.4989862442016602,
+      "learning_rate": 0.0007355661881977672,
+      "loss": 0.3541,
+      "step": 19820
+    },
+    {
+      "epoch": 31.63,
+      "grad_norm": 1.1824603080749512,
+      "learning_rate": 0.0007349282296650718,
+      "loss": 0.3834,
+      "step": 19830
+    },
+    {
+      "epoch": 31.64,
+      "grad_norm": 1.468544602394104,
+      "learning_rate": 0.0007342902711323765,
+      "loss": 0.4389,
+      "step": 19840
+    },
+    {
+      "epoch": 31.66,
+      "grad_norm": 1.5891380310058594,
+      "learning_rate": 0.0007336523125996811,
+      "loss": 0.3104,
+      "step": 19850
+    },
+    {
+      "epoch": 31.67,
+      "grad_norm": 1.6829235553741455,
+      "learning_rate": 0.0007330143540669857,
+      "loss": 0.4124,
+      "step": 19860
+    },
+    {
+      "epoch": 31.69,
+      "grad_norm": 0.8262020349502563,
+      "learning_rate": 0.0007323763955342903,
+      "loss": 0.4355,
+      "step": 19870
+    },
+    {
+      "epoch": 31.71,
+      "grad_norm": 1.0964784622192383,
+      "learning_rate": 0.000731738437001595,
+      "loss": 0.4051,
+      "step": 19880
+    },
+    {
+      "epoch": 31.72,
+      "grad_norm": 1.241181492805481,
+      "learning_rate": 0.0007311004784688995,
+      "loss": 0.3317,
+      "step": 19890
+    },
+    {
+      "epoch": 31.74,
+      "grad_norm": 0.7173839807510376,
+      "learning_rate": 0.0007304625199362042,
+      "loss": 0.303,
+      "step": 19900
+    },
+    {
+      "epoch": 31.75,
+      "grad_norm": 1.3341323137283325,
+      "learning_rate": 0.0007298245614035089,
+      "loss": 0.4809,
+      "step": 19910
+    },
+    {
+      "epoch": 31.77,
+      "grad_norm": 0.7707849144935608,
+      "learning_rate": 0.0007291866028708134,
+      "loss": 0.3865,
+      "step": 19920
+    },
+    {
+      "epoch": 31.79,
+      "grad_norm": 1.0697121620178223,
+      "learning_rate": 0.0007285486443381181,
+      "loss": 0.3991,
+      "step": 19930
+    },
+    {
+      "epoch": 31.8,
+      "grad_norm": 0.8612807989120483,
+      "learning_rate": 0.0007279106858054227,
+      "loss": 0.3753,
+      "step": 19940
+    },
+    {
+      "epoch": 31.82,
+      "grad_norm": 1.1503595113754272,
+      "learning_rate": 0.0007272727272727273,
+      "loss": 0.4917,
+      "step": 19950
+    },
+    {
+      "epoch": 31.83,
+      "grad_norm": 2.088966131210327,
+      "learning_rate": 0.0007266347687400319,
+      "loss": 0.4367,
+      "step": 19960
+    },
+    {
+      "epoch": 31.85,
+      "grad_norm": 1.2572288513183594,
+      "learning_rate": 0.0007259968102073366,
+      "loss": 0.4804,
+      "step": 19970
+    },
+    {
+      "epoch": 31.87,
+      "grad_norm": 1.781175136566162,
+      "learning_rate": 0.0007253588516746412,
+      "loss": 0.4508,
+      "step": 19980
+    },
+    {
+      "epoch": 31.88,
+      "grad_norm": 1.0523390769958496,
+      "learning_rate": 0.0007247208931419458,
+      "loss": 0.364,
+      "step": 19990
+    },
+    {
+      "epoch": 31.9,
+      "grad_norm": 1.5974029302597046,
+      "learning_rate": 0.0007240829346092505,
+      "loss": 0.3279,
+      "step": 20000
+    },
+    {
+      "epoch": 31.91,
+      "grad_norm": 1.0655145645141602,
+      "learning_rate": 0.000723444976076555,
+      "loss": 0.3507,
+      "step": 20010
+    },
+    {
+      "epoch": 31.93,
+      "grad_norm": 1.4828819036483765,
+      "learning_rate": 0.0007228070175438597,
+      "loss": 0.3621,
+      "step": 20020
+    },
+    {
+      "epoch": 31.95,
+      "grad_norm": 0.6305584907531738,
+      "learning_rate": 0.0007221690590111643,
+      "loss": 0.47,
+      "step": 20030
+    },
+    {
+      "epoch": 31.96,
+      "grad_norm": 0.873736560344696,
+      "learning_rate": 0.0007215311004784689,
+      "loss": 0.3372,
+      "step": 20040
+    },
+    {
+      "epoch": 31.98,
+      "grad_norm": 0.7609186768531799,
+      "learning_rate": 0.0007208931419457735,
+      "loss": 0.4067,
+      "step": 20050
+    },
+    {
+      "epoch": 31.99,
+      "grad_norm": 0.638640284538269,
+      "learning_rate": 0.0007202551834130782,
+      "loss": 0.4417,
+      "step": 20060
+    },
+    {
+      "epoch": 32.01,
+      "grad_norm": 1.3461627960205078,
+      "learning_rate": 0.0007196172248803828,
+      "loss": 0.3463,
+      "step": 20070
+    },
+    {
+      "epoch": 32.03,
+      "grad_norm": 0.9440305233001709,
+      "learning_rate": 0.0007189792663476874,
+      "loss": 0.3378,
+      "step": 20080
+    },
+    {
+      "epoch": 32.04,
+      "grad_norm": 1.406764268875122,
+      "learning_rate": 0.0007183413078149921,
+      "loss": 0.3547,
+      "step": 20090
+    },
+    {
+      "epoch": 32.06,
+      "grad_norm": 1.0603829622268677,
+      "learning_rate": 0.0007177033492822966,
+      "loss": 0.303,
+      "step": 20100
+    },
+    {
+      "epoch": 32.07,
+      "grad_norm": 0.5735631585121155,
+      "learning_rate": 0.0007170653907496013,
+      "loss": 0.321,
+      "step": 20110
+    },
+    {
+      "epoch": 32.09,
+      "grad_norm": 0.5425032377243042,
+      "learning_rate": 0.0007164274322169059,
+      "loss": 0.3486,
+      "step": 20120
+    },
+    {
+      "epoch": 32.11,
+      "grad_norm": 0.4281626045703888,
+      "learning_rate": 0.0007157894736842105,
+      "loss": 0.4408,
+      "step": 20130
+    },
+    {
+      "epoch": 32.12,
+      "grad_norm": 0.6306964755058289,
+      "learning_rate": 0.0007151515151515152,
+      "loss": 0.3664,
+      "step": 20140
+    },
+    {
+      "epoch": 32.14,
+      "grad_norm": 1.3869534730911255,
+      "learning_rate": 0.0007145135566188198,
+      "loss": 0.3345,
+      "step": 20150
+    },
+    {
+      "epoch": 32.15,
+      "grad_norm": 1.281069040298462,
+      "learning_rate": 0.0007138755980861245,
+      "loss": 0.3758,
+      "step": 20160
+    },
+    {
+      "epoch": 32.17,
+      "grad_norm": 0.7174175381660461,
+      "learning_rate": 0.000713237639553429,
+      "loss": 0.4612,
+      "step": 20170
+    },
+    {
+      "epoch": 32.19,
+      "grad_norm": 0.7497925758361816,
+      "learning_rate": 0.0007125996810207337,
+      "loss": 0.3671,
+      "step": 20180
+    },
+    {
+      "epoch": 32.2,
+      "grad_norm": 0.6942813992500305,
+      "learning_rate": 0.0007119617224880383,
+      "loss": 0.3147,
+      "step": 20190
+    },
+    {
+      "epoch": 32.22,
+      "grad_norm": 1.1213644742965698,
+      "learning_rate": 0.0007113237639553429,
+      "loss": 0.3739,
+      "step": 20200
+    },
+    {
+      "epoch": 32.23,
+      "grad_norm": 0.7664075493812561,
+      "learning_rate": 0.0007106858054226475,
+      "loss": 0.3105,
+      "step": 20210
+    },
+    {
+      "epoch": 32.25,
+      "grad_norm": 0.9661602973937988,
+      "learning_rate": 0.0007100478468899522,
+      "loss": 0.3266,
+      "step": 20220
+    },
+    {
+      "epoch": 32.26,
+      "grad_norm": 1.2888504266738892,
+      "learning_rate": 0.0007094098883572568,
+      "loss": 0.3405,
+      "step": 20230
+    },
+    {
+      "epoch": 32.28,
+      "grad_norm": 0.829325258731842,
+      "learning_rate": 0.0007087719298245614,
+      "loss": 0.3778,
+      "step": 20240
+    },
+    {
+      "epoch": 32.3,
+      "grad_norm": 0.8283563256263733,
+      "learning_rate": 0.0007081339712918661,
+      "loss": 0.389,
+      "step": 20250
+    },
+    {
+      "epoch": 32.31,
+      "grad_norm": 2.8405203819274902,
+      "learning_rate": 0.0007074960127591706,
+      "loss": 0.4051,
+      "step": 20260
+    },
+    {
+      "epoch": 32.33,
+      "grad_norm": 0.9580861330032349,
+      "learning_rate": 0.0007068580542264753,
+      "loss": 0.3376,
+      "step": 20270
+    },
+    {
+      "epoch": 32.34,
+      "grad_norm": 1.7252624034881592,
+      "learning_rate": 0.0007062200956937799,
+      "loss": 0.3626,
+      "step": 20280
+    },
+    {
+      "epoch": 32.36,
+      "grad_norm": 0.8010210990905762,
+      "learning_rate": 0.0007055821371610845,
+      "loss": 0.3524,
+      "step": 20290
+    },
+    {
+      "epoch": 32.38,
+      "grad_norm": 0.7796013951301575,
+      "learning_rate": 0.0007049441786283892,
+      "loss": 0.3778,
+      "step": 20300
+    },
+    {
+      "epoch": 32.39,
+      "grad_norm": 0.8661381602287292,
+      "learning_rate": 0.0007043062200956938,
+      "loss": 0.4093,
+      "step": 20310
+    },
+    {
+      "epoch": 32.41,
+      "grad_norm": 0.987169623374939,
+      "learning_rate": 0.0007036682615629984,
+      "loss": 0.2886,
+      "step": 20320
+    },
+    {
+      "epoch": 32.42,
+      "grad_norm": 1.090738296508789,
+      "learning_rate": 0.000703030303030303,
+      "loss": 0.3848,
+      "step": 20330
+    },
+    {
+      "epoch": 32.44,
+      "grad_norm": 2.1070291996002197,
+      "learning_rate": 0.0007023923444976077,
+      "loss": 0.3214,
+      "step": 20340
+    },
+    {
+      "epoch": 32.46,
+      "grad_norm": 1.216748833656311,
+      "learning_rate": 0.0007017543859649122,
+      "loss": 0.3446,
+      "step": 20350
+    },
+    {
+      "epoch": 32.47,
+      "grad_norm": 0.8944370150566101,
+      "learning_rate": 0.0007011164274322169,
+      "loss": 0.3595,
+      "step": 20360
+    },
+    {
+      "epoch": 32.49,
+      "grad_norm": 0.7445225119590759,
+      "learning_rate": 0.0007004784688995215,
+      "loss": 0.3376,
+      "step": 20370
+    },
+    {
+      "epoch": 32.5,
+      "grad_norm": 0.584930419921875,
+      "learning_rate": 0.0006998405103668261,
+      "loss": 0.361,
+      "step": 20380
+    },
+    {
+      "epoch": 32.52,
+      "grad_norm": 1.1852445602416992,
+      "learning_rate": 0.0006992025518341308,
+      "loss": 0.423,
+      "step": 20390
+    },
+    {
+      "epoch": 32.54,
+      "grad_norm": 0.8096782565116882,
+      "learning_rate": 0.0006985645933014354,
+      "loss": 0.3028,
+      "step": 20400
+    },
+    {
+      "epoch": 32.55,
+      "grad_norm": 0.7330004572868347,
+      "learning_rate": 0.00069792663476874,
+      "loss": 0.3499,
+      "step": 20410
+    },
+    {
+      "epoch": 32.57,
+      "grad_norm": 0.556844174861908,
+      "learning_rate": 0.0006972886762360446,
+      "loss": 0.4042,
+      "step": 20420
+    },
+    {
+      "epoch": 32.58,
+      "grad_norm": 1.2681604623794556,
+      "learning_rate": 0.0006966507177033493,
+      "loss": 0.2892,
+      "step": 20430
+    },
+    {
+      "epoch": 32.6,
+      "grad_norm": 0.985937237739563,
+      "learning_rate": 0.0006960127591706538,
+      "loss": 0.367,
+      "step": 20440
+    },
+    {
+      "epoch": 32.62,
+      "grad_norm": 0.685664176940918,
+      "learning_rate": 0.0006953748006379585,
+      "loss": 0.4311,
+      "step": 20450
+    },
+    {
+      "epoch": 32.63,
+      "grad_norm": 0.580774188041687,
+      "learning_rate": 0.0006947368421052632,
+      "loss": 0.3973,
+      "step": 20460
+    },
+    {
+      "epoch": 32.65,
+      "grad_norm": 0.5588364601135254,
+      "learning_rate": 0.0006940988835725677,
+      "loss": 0.3902,
+      "step": 20470
+    },
+    {
+      "epoch": 32.66,
+      "grad_norm": 2.530954599380493,
+      "learning_rate": 0.0006934609250398724,
+      "loss": 0.4405,
+      "step": 20480
+    },
+    {
+      "epoch": 32.68,
+      "grad_norm": 0.9018158316612244,
+      "learning_rate": 0.000692822966507177,
+      "loss": 0.4219,
+      "step": 20490
+    },
+    {
+      "epoch": 32.7,
+      "grad_norm": 1.1370121240615845,
+      "learning_rate": 0.0006921850079744816,
+      "loss": 0.3901,
+      "step": 20500
+    },
+    {
+      "epoch": 32.71,
+      "grad_norm": 0.9494215250015259,
+      "learning_rate": 0.0006915470494417862,
+      "loss": 0.3377,
+      "step": 20510
+    },
+    {
+      "epoch": 32.73,
+      "grad_norm": 0.7825329899787903,
+      "learning_rate": 0.0006909090909090909,
+      "loss": 0.3479,
+      "step": 20520
+    },
+    {
+      "epoch": 32.74,
+      "grad_norm": 1.0042078495025635,
+      "learning_rate": 0.0006902711323763954,
+      "loss": 0.3888,
+      "step": 20530
+    },
+    {
+      "epoch": 32.76,
+      "grad_norm": 1.4132115840911865,
+      "learning_rate": 0.0006896331738437002,
+      "loss": 0.3898,
+      "step": 20540
+    },
+    {
+      "epoch": 32.78,
+      "grad_norm": 0.9790666103363037,
+      "learning_rate": 0.0006889952153110049,
+      "loss": 0.4135,
+      "step": 20550
+    },
+    {
+      "epoch": 32.79,
+      "grad_norm": 1.599612832069397,
+      "learning_rate": 0.0006883572567783095,
+      "loss": 0.4875,
+      "step": 20560
+    },
+    {
+      "epoch": 32.81,
+      "grad_norm": 0.934172511100769,
+      "learning_rate": 0.0006877192982456141,
+      "loss": 0.4245,
+      "step": 20570
+    },
+    {
+      "epoch": 32.82,
+      "grad_norm": 1.2698485851287842,
+      "learning_rate": 0.0006870813397129187,
+      "loss": 0.3811,
+      "step": 20580
+    },
+    {
+      "epoch": 32.84,
+      "grad_norm": 1.3154641389846802,
+      "learning_rate": 0.0006864433811802234,
+      "loss": 0.3682,
+      "step": 20590
+    },
+    {
+      "epoch": 32.85,
+      "grad_norm": 0.9714843034744263,
+      "learning_rate": 0.0006858054226475279,
+      "loss": 0.3105,
+      "step": 20600
+    },
+    {
+      "epoch": 32.87,
+      "grad_norm": 0.6481144428253174,
+      "learning_rate": 0.0006851674641148326,
+      "loss": 0.3489,
+      "step": 20610
+    },
+    {
+      "epoch": 32.89,
+      "grad_norm": 1.0251431465148926,
+      "learning_rate": 0.0006845295055821373,
+      "loss": 0.382,
+      "step": 20620
+    },
+    {
+      "epoch": 32.9,
+      "grad_norm": 1.046749234199524,
+      "learning_rate": 0.0006838915470494418,
+      "loss": 0.4344,
+      "step": 20630
+    },
+    {
+      "epoch": 32.92,
+      "grad_norm": 1.45224928855896,
+      "learning_rate": 0.0006832535885167465,
+      "loss": 0.3381,
+      "step": 20640
+    },
+    {
+      "epoch": 32.93,
+      "grad_norm": 1.0642521381378174,
+      "learning_rate": 0.0006826156299840511,
+      "loss": 0.4037,
+      "step": 20650
+    },
+    {
+      "epoch": 32.95,
+      "grad_norm": 0.7638436555862427,
+      "learning_rate": 0.0006819776714513557,
+      "loss": 0.4348,
+      "step": 20660
+    },
+    {
+      "epoch": 32.97,
+      "grad_norm": 0.9343836307525635,
+      "learning_rate": 0.0006813397129186603,
+      "loss": 0.4039,
+      "step": 20670
+    },
+    {
+      "epoch": 32.98,
+      "grad_norm": 0.6389197707176208,
+      "learning_rate": 0.000680701754385965,
+      "loss": 0.3571,
+      "step": 20680
+    },
+    {
+      "epoch": 33.0,
+      "grad_norm": 1.0805469751358032,
+      "learning_rate": 0.0006800637958532696,
+      "loss": 0.3274,
+      "step": 20690
+    },
+    {
+      "epoch": 33.01,
+      "grad_norm": 1.199524998664856,
+      "learning_rate": 0.0006794258373205742,
+      "loss": 0.3143,
+      "step": 20700
+    },
+    {
+      "epoch": 33.03,
+      "grad_norm": 1.2967311143875122,
+      "learning_rate": 0.0006787878787878789,
+      "loss": 0.3555,
+      "step": 20710
+    },
+    {
+      "epoch": 33.05,
+      "grad_norm": 1.0752925872802734,
+      "learning_rate": 0.0006781499202551834,
+      "loss": 0.3797,
+      "step": 20720
+    },
+    {
+      "epoch": 33.06,
+      "grad_norm": 0.7720149159431458,
+      "learning_rate": 0.0006775119617224881,
+      "loss": 0.2873,
+      "step": 20730
+    },
+    {
+      "epoch": 33.08,
+      "grad_norm": 0.6133707761764526,
+      "learning_rate": 0.0006768740031897927,
+      "loss": 0.3092,
+      "step": 20740
+    },
+    {
+      "epoch": 33.09,
+      "grad_norm": 0.8874982595443726,
+      "learning_rate": 0.0006762360446570973,
+      "loss": 0.3187,
+      "step": 20750
+    },
+    {
+      "epoch": 33.11,
+      "grad_norm": 1.3732993602752686,
+      "learning_rate": 0.0006755980861244019,
+      "loss": 0.2782,
+      "step": 20760
+    },
+    {
+      "epoch": 33.13,
+      "grad_norm": 0.9452306032180786,
+      "learning_rate": 0.0006749601275917066,
+      "loss": 0.2779,
+      "step": 20770
+    },
+    {
+      "epoch": 33.14,
+      "grad_norm": 1.7680912017822266,
+      "learning_rate": 0.0006743221690590112,
+      "loss": 0.2884,
+      "step": 20780
+    },
+    {
+      "epoch": 33.16,
+      "grad_norm": 1.6482670307159424,
+      "learning_rate": 0.0006736842105263158,
+      "loss": 0.359,
+      "step": 20790
+    },
+    {
+      "epoch": 33.17,
+      "grad_norm": 0.7076551914215088,
+      "learning_rate": 0.0006730462519936205,
+      "loss": 0.3134,
+      "step": 20800
+    },
+    {
+      "epoch": 33.19,
+      "grad_norm": 0.630064845085144,
+      "learning_rate": 0.000672408293460925,
+      "loss": 0.4372,
+      "step": 20810
+    },
+    {
+      "epoch": 33.21,
+      "grad_norm": 1.7952457666397095,
+      "learning_rate": 0.0006717703349282297,
+      "loss": 0.3711,
+      "step": 20820
+    },
+    {
+      "epoch": 33.22,
+      "grad_norm": 2.3427815437316895,
+      "learning_rate": 0.0006711323763955343,
+      "loss": 0.3385,
+      "step": 20830
+    },
+    {
+      "epoch": 33.24,
+      "grad_norm": 0.6796151995658875,
+      "learning_rate": 0.0006704944178628389,
+      "loss": 0.2952,
+      "step": 20840
+    },
+    {
+      "epoch": 33.25,
+      "grad_norm": 1.292067527770996,
+      "learning_rate": 0.0006698564593301436,
+      "loss": 0.3058,
+      "step": 20850
+    },
+    {
+      "epoch": 33.27,
+      "grad_norm": 0.8857368230819702,
+      "learning_rate": 0.0006692185007974482,
+      "loss": 0.3184,
+      "step": 20860
+    },
+    {
+      "epoch": 33.29,
+      "grad_norm": 0.8489099740982056,
+      "learning_rate": 0.0006685805422647528,
+      "loss": 0.3405,
+      "step": 20870
+    },
+    {
+      "epoch": 33.3,
+      "grad_norm": 0.4213086664676666,
+      "learning_rate": 0.0006679425837320574,
+      "loss": 0.2919,
+      "step": 20880
+    },
+    {
+      "epoch": 33.32,
+      "grad_norm": 0.3164719343185425,
+      "learning_rate": 0.0006673046251993621,
+      "loss": 0.3406,
+      "step": 20890
+    },
+    {
+      "epoch": 33.33,
+      "grad_norm": 1.2104874849319458,
+      "learning_rate": 0.0006666666666666666,
+      "loss": 0.3277,
+      "step": 20900
+    },
+    {
+      "epoch": 33.35,
+      "grad_norm": 1.2871508598327637,
+      "learning_rate": 0.0006660287081339713,
+      "loss": 0.4363,
+      "step": 20910
+    },
+    {
+      "epoch": 33.37,
+      "grad_norm": 1.1551439762115479,
+      "learning_rate": 0.0006653907496012759,
+      "loss": 0.4146,
+      "step": 20920
+    },
+    {
+      "epoch": 33.38,
+      "grad_norm": 0.4967116713523865,
+      "learning_rate": 0.0006647527910685805,
+      "loss": 0.3614,
+      "step": 20930
+    },
+    {
+      "epoch": 33.4,
+      "grad_norm": 1.4939340353012085,
+      "learning_rate": 0.0006641148325358852,
+      "loss": 0.2574,
+      "step": 20940
+    },
+    {
+      "epoch": 33.41,
+      "grad_norm": 2.036379337310791,
+      "learning_rate": 0.0006634768740031898,
+      "loss": 0.2869,
+      "step": 20950
+    },
+    {
+      "epoch": 33.43,
+      "grad_norm": 0.6936982870101929,
+      "learning_rate": 0.0006628389154704945,
+      "loss": 0.3495,
+      "step": 20960
+    },
+    {
+      "epoch": 33.44,
+      "grad_norm": 1.4173444509506226,
+      "learning_rate": 0.000662200956937799,
+      "loss": 0.4171,
+      "step": 20970
+    },
+    {
+      "epoch": 33.46,
+      "grad_norm": 0.9318954348564148,
+      "learning_rate": 0.0006615629984051037,
+      "loss": 0.368,
+      "step": 20980
+    },
+    {
+      "epoch": 33.48,
+      "grad_norm": 0.6373530030250549,
+      "learning_rate": 0.0006609250398724083,
+      "loss": 0.3407,
+      "step": 20990
+    },
+    {
+      "epoch": 33.49,
+      "grad_norm": 0.5435881614685059,
+      "learning_rate": 0.0006602870813397129,
+      "loss": 0.3582,
+      "step": 21000
+    },
+    {
+      "epoch": 33.51,
+      "grad_norm": 0.5529409050941467,
+      "learning_rate": 0.0006596491228070176,
+      "loss": 0.397,
+      "step": 21010
+    },
+    {
+      "epoch": 33.52,
+      "grad_norm": 0.6477614641189575,
+      "learning_rate": 0.0006590111642743222,
+      "loss": 0.4706,
+      "step": 21020
+    },
+    {
+      "epoch": 33.54,
+      "grad_norm": 0.6400772929191589,
+      "learning_rate": 0.0006583732057416268,
+      "loss": 0.3936,
+      "step": 21030
+    },
+    {
+      "epoch": 33.56,
+      "grad_norm": 0.7568894624710083,
+      "learning_rate": 0.0006577352472089314,
+      "loss": 0.2936,
+      "step": 21040
+    },
+    {
+      "epoch": 33.57,
+      "grad_norm": 0.6574100852012634,
+      "learning_rate": 0.0006570972886762361,
+      "loss": 0.3454,
+      "step": 21050
+    },
+    {
+      "epoch": 33.59,
+      "grad_norm": 0.6999270915985107,
+      "learning_rate": 0.0006564593301435406,
+      "loss": 0.3597,
+      "step": 21060
+    },
+    {
+      "epoch": 33.6,
+      "grad_norm": 1.4358758926391602,
+      "learning_rate": 0.0006558213716108453,
+      "loss": 0.4673,
+      "step": 21070
+    },
+    {
+      "epoch": 33.62,
+      "grad_norm": 0.8735805749893188,
+      "learning_rate": 0.0006551834130781499,
+      "loss": 0.3329,
+      "step": 21080
+    },
+    {
+      "epoch": 33.64,
+      "grad_norm": 0.42257770895957947,
+      "learning_rate": 0.0006545454545454545,
+      "loss": 0.3934,
+      "step": 21090
+    },
+    {
+      "epoch": 33.65,
+      "grad_norm": 0.5228465795516968,
+      "learning_rate": 0.0006539074960127592,
+      "loss": 0.3563,
+      "step": 21100
+    },
+    {
+      "epoch": 33.67,
+      "grad_norm": 0.7257753014564514,
+      "learning_rate": 0.0006532695374800638,
+      "loss": 0.3194,
+      "step": 21110
+    },
+    {
+      "epoch": 33.68,
+      "grad_norm": 1.101475477218628,
+      "learning_rate": 0.0006526315789473684,
+      "loss": 0.3633,
+      "step": 21120
+    },
+    {
+      "epoch": 33.7,
+      "grad_norm": 1.2462613582611084,
+      "learning_rate": 0.000651993620414673,
+      "loss": 0.3845,
+      "step": 21130
+    },
+    {
+      "epoch": 33.72,
+      "grad_norm": 0.8615121841430664,
+      "learning_rate": 0.0006513556618819777,
+      "loss": 0.3183,
+      "step": 21140
+    },
+    {
+      "epoch": 33.73,
+      "grad_norm": 1.6341915130615234,
+      "learning_rate": 0.0006507177033492822,
+      "loss": 0.3739,
+      "step": 21150
+    },
+    {
+      "epoch": 33.75,
+      "grad_norm": 0.7129934430122375,
+      "learning_rate": 0.0006500797448165869,
+      "loss": 0.3184,
+      "step": 21160
+    },
+    {
+      "epoch": 33.76,
+      "grad_norm": 1.0505317449569702,
+      "learning_rate": 0.0006494417862838916,
+      "loss": 0.3377,
+      "step": 21170
+    },
+    {
+      "epoch": 33.78,
+      "grad_norm": 0.6486239433288574,
+      "learning_rate": 0.0006488038277511961,
+      "loss": 0.4064,
+      "step": 21180
+    },
+    {
+      "epoch": 33.8,
+      "grad_norm": 0.805962324142456,
+      "learning_rate": 0.0006481658692185008,
+      "loss": 0.4032,
+      "step": 21190
+    },
+    {
+      "epoch": 33.81,
+      "grad_norm": 0.8866637349128723,
+      "learning_rate": 0.0006475279106858054,
+      "loss": 0.4194,
+      "step": 21200
+    },
+    {
+      "epoch": 33.83,
+      "grad_norm": 2.0624029636383057,
+      "learning_rate": 0.00064688995215311,
+      "loss": 0.3906,
+      "step": 21210
+    },
+    {
+      "epoch": 33.84,
+      "grad_norm": 0.9357002377510071,
+      "learning_rate": 0.0006462519936204146,
+      "loss": 0.4074,
+      "step": 21220
+    },
+    {
+      "epoch": 33.86,
+      "grad_norm": 0.7102904915809631,
+      "learning_rate": 0.0006456140350877193,
+      "loss": 0.365,
+      "step": 21230
+    },
+    {
+      "epoch": 33.88,
+      "grad_norm": 1.7485020160675049,
+      "learning_rate": 0.0006449760765550238,
+      "loss": 0.3714,
+      "step": 21240
+    },
+    {
+      "epoch": 33.89,
+      "grad_norm": 1.0567692518234253,
+      "learning_rate": 0.0006443381180223285,
+      "loss": 0.3404,
+      "step": 21250
+    },
+    {
+      "epoch": 33.91,
+      "grad_norm": 1.1951782703399658,
+      "learning_rate": 0.0006437001594896332,
+      "loss": 0.3729,
+      "step": 21260
+    },
+    {
+      "epoch": 33.92,
+      "grad_norm": 1.056022047996521,
+      "learning_rate": 0.0006430622009569377,
+      "loss": 0.4126,
+      "step": 21270
+    },
+    {
+      "epoch": 33.94,
+      "grad_norm": 0.45082104206085205,
+      "learning_rate": 0.0006424242424242425,
+      "loss": 0.3527,
+      "step": 21280
+    },
+    {
+      "epoch": 33.96,
+      "grad_norm": 0.9164630174636841,
+      "learning_rate": 0.0006417862838915471,
+      "loss": 0.3731,
+      "step": 21290
+    },
+    {
+      "epoch": 33.97,
+      "grad_norm": 1.3435860872268677,
+      "learning_rate": 0.0006411483253588518,
+      "loss": 0.3889,
+      "step": 21300
+    },
+    {
+      "epoch": 33.99,
+      "grad_norm": 1.0820218324661255,
+      "learning_rate": 0.0006405103668261563,
+      "loss": 0.3823,
+      "step": 21310
+    },
+    {
+      "epoch": 34.0,
+      "grad_norm": 0.43368542194366455,
+      "learning_rate": 0.000639872408293461,
+      "loss": 0.2854,
+      "step": 21320
+    },
+    {
+      "epoch": 34.02,
+      "grad_norm": 1.282607913017273,
+      "learning_rate": 0.0006392344497607657,
+      "loss": 0.3346,
+      "step": 21330
+    },
+    {
+      "epoch": 34.04,
+      "grad_norm": 0.48912495374679565,
+      "learning_rate": 0.0006385964912280702,
+      "loss": 0.3253,
+      "step": 21340
+    },
+    {
+      "epoch": 34.05,
+      "grad_norm": 0.5396437644958496,
+      "learning_rate": 0.0006379585326953749,
+      "loss": 0.3848,
+      "step": 21350
+    },
+    {
+      "epoch": 34.07,
+      "grad_norm": 0.43667685985565186,
+      "learning_rate": 0.0006373205741626795,
+      "loss": 0.389,
+      "step": 21360
+    },
+    {
+      "epoch": 34.08,
+      "grad_norm": 0.7458956837654114,
+      "learning_rate": 0.0006366826156299841,
+      "loss": 0.3669,
+      "step": 21370
+    },
+    {
+      "epoch": 34.1,
+      "grad_norm": 0.5539690256118774,
+      "learning_rate": 0.0006360446570972887,
+      "loss": 0.3428,
+      "step": 21380
+    },
+    {
+      "epoch": 34.11,
+      "grad_norm": 0.5653215646743774,
+      "learning_rate": 0.0006354066985645934,
+      "loss": 0.3716,
+      "step": 21390
+    },
+    {
+      "epoch": 34.13,
+      "grad_norm": 0.7448431849479675,
+      "learning_rate": 0.0006347687400318979,
+      "loss": 0.3799,
+      "step": 21400
+    },
+    {
+      "epoch": 34.15,
+      "grad_norm": 1.6342990398406982,
+      "learning_rate": 0.0006341307814992026,
+      "loss": 0.281,
+      "step": 21410
+    },
+    {
+      "epoch": 34.16,
+      "grad_norm": 1.0939606428146362,
+      "learning_rate": 0.0006334928229665073,
+      "loss": 0.2995,
+      "step": 21420
+    },
+    {
+      "epoch": 34.18,
+      "grad_norm": 0.4550718665122986,
+      "learning_rate": 0.0006328548644338118,
+      "loss": 0.2839,
+      "step": 21430
+    },
+    {
+      "epoch": 34.19,
+      "grad_norm": 0.7015230655670166,
+      "learning_rate": 0.0006322169059011165,
+      "loss": 0.3624,
+      "step": 21440
+    },
+    {
+      "epoch": 34.21,
+      "grad_norm": 0.9311388731002808,
+      "learning_rate": 0.0006315789473684211,
+      "loss": 0.3116,
+      "step": 21450
+    },
+    {
+      "epoch": 34.23,
+      "grad_norm": 0.519597053527832,
+      "learning_rate": 0.0006309409888357257,
+      "loss": 0.3464,
+      "step": 21460
+    },
+    {
+      "epoch": 34.24,
+      "grad_norm": 0.687154233455658,
+      "learning_rate": 0.0006303030303030303,
+      "loss": 0.2955,
+      "step": 21470
+    },
+    {
+      "epoch": 34.26,
+      "grad_norm": 0.6777644753456116,
+      "learning_rate": 0.000629665071770335,
+      "loss": 0.3579,
+      "step": 21480
+    },
+    {
+      "epoch": 34.27,
+      "grad_norm": 1.1561830043792725,
+      "learning_rate": 0.0006290271132376396,
+      "loss": 0.324,
+      "step": 21490
+    },
+    {
+      "epoch": 34.29,
+      "grad_norm": 0.6058475375175476,
+      "learning_rate": 0.0006283891547049442,
+      "loss": 0.4582,
+      "step": 21500
+    },
+    {
+      "epoch": 34.31,
+      "grad_norm": 1.3952281475067139,
+      "learning_rate": 0.0006277511961722489,
+      "loss": 0.2972,
+      "step": 21510
+    },
+    {
+      "epoch": 34.32,
+      "grad_norm": 0.9021815061569214,
+      "learning_rate": 0.0006271132376395534,
+      "loss": 0.3736,
+      "step": 21520
+    },
+    {
+      "epoch": 34.34,
+      "grad_norm": 0.5777958631515503,
+      "learning_rate": 0.0006264752791068581,
+      "loss": 0.3418,
+      "step": 21530
+    },
+    {
+      "epoch": 34.35,
+      "grad_norm": 0.5624024271965027,
+      "learning_rate": 0.0006258373205741627,
+      "loss": 0.3897,
+      "step": 21540
+    },
+    {
+      "epoch": 34.37,
+      "grad_norm": 1.0554344654083252,
+      "learning_rate": 0.0006251993620414673,
+      "loss": 0.3636,
+      "step": 21550
+    },
+    {
+      "epoch": 34.39,
+      "grad_norm": 0.39624953269958496,
+      "learning_rate": 0.000624561403508772,
+      "loss": 0.3199,
+      "step": 21560
+    },
+    {
+      "epoch": 34.4,
+      "grad_norm": 0.8201066255569458,
+      "learning_rate": 0.0006239234449760766,
+      "loss": 0.4237,
+      "step": 21570
+    },
+    {
+      "epoch": 34.42,
+      "grad_norm": 0.7447034120559692,
+      "learning_rate": 0.0006232854864433812,
+      "loss": 0.3045,
+      "step": 21580
+    },
+    {
+      "epoch": 34.43,
+      "grad_norm": 0.37216076254844666,
+      "learning_rate": 0.0006226475279106858,
+      "loss": 0.287,
+      "step": 21590
+    },
+    {
+      "epoch": 34.45,
+      "grad_norm": 1.3851195573806763,
+      "learning_rate": 0.0006220095693779905,
+      "loss": 0.3752,
+      "step": 21600
+    },
+    {
+      "epoch": 34.47,
+      "grad_norm": 0.5135475397109985,
+      "learning_rate": 0.000621371610845295,
+      "loss": 0.2575,
+      "step": 21610
+    },
+    {
+      "epoch": 34.48,
+      "grad_norm": 1.3252980709075928,
+      "learning_rate": 0.0006207336523125997,
+      "loss": 0.3204,
+      "step": 21620
+    },
+    {
+      "epoch": 34.5,
+      "grad_norm": 1.036947250366211,
+      "learning_rate": 0.0006200956937799043,
+      "loss": 0.3715,
+      "step": 21630
+    },
+    {
+      "epoch": 34.51,
+      "grad_norm": 0.9725881218910217,
+      "learning_rate": 0.0006194577352472089,
+      "loss": 0.2895,
+      "step": 21640
+    },
+    {
+      "epoch": 34.53,
+      "grad_norm": 0.8383840322494507,
+      "learning_rate": 0.0006188197767145136,
+      "loss": 0.2823,
+      "step": 21650
+    },
+    {
+      "epoch": 34.55,
+      "grad_norm": 0.5011244416236877,
+      "learning_rate": 0.0006181818181818182,
+      "loss": 0.2946,
+      "step": 21660
+    },
+    {
+      "epoch": 34.56,
+      "grad_norm": 0.5851901769638062,
+      "learning_rate": 0.0006175438596491228,
+      "loss": 0.3679,
+      "step": 21670
+    },
+    {
+      "epoch": 34.58,
+      "grad_norm": 1.2106326818466187,
+      "learning_rate": 0.0006169059011164274,
+      "loss": 0.3548,
+      "step": 21680
+    },
+    {
+      "epoch": 34.59,
+      "grad_norm": 0.7996150255203247,
+      "learning_rate": 0.0006162679425837321,
+      "loss": 0.3279,
+      "step": 21690
+    },
+    {
+      "epoch": 34.61,
+      "grad_norm": 0.9852333664894104,
+      "learning_rate": 0.0006156299840510366,
+      "loss": 0.3803,
+      "step": 21700
+    },
+    {
+      "epoch": 34.63,
+      "grad_norm": 1.8588385581970215,
+      "learning_rate": 0.0006149920255183413,
+      "loss": 0.3173,
+      "step": 21710
+    },
+    {
+      "epoch": 34.64,
+      "grad_norm": 1.403646469116211,
+      "learning_rate": 0.000614354066985646,
+      "loss": 0.383,
+      "step": 21720
+    },
+    {
+      "epoch": 34.66,
+      "grad_norm": 0.7591367363929749,
+      "learning_rate": 0.0006137161084529505,
+      "loss": 0.3134,
+      "step": 21730
+    },
+    {
+      "epoch": 34.67,
+      "grad_norm": 0.8111428022384644,
+      "learning_rate": 0.0006130781499202552,
+      "loss": 0.4017,
+      "step": 21740
+    },
+    {
+      "epoch": 34.69,
+      "grad_norm": 0.7398600578308105,
+      "learning_rate": 0.0006124401913875598,
+      "loss": 0.3378,
+      "step": 21750
+    },
+    {
+      "epoch": 34.7,
+      "grad_norm": 2.6811933517456055,
+      "learning_rate": 0.0006118022328548645,
+      "loss": 0.4099,
+      "step": 21760
+    },
+    {
+      "epoch": 34.72,
+      "grad_norm": 0.5849010944366455,
+      "learning_rate": 0.000611164274322169,
+      "loss": 0.3977,
+      "step": 21770
+    },
+    {
+      "epoch": 34.74,
+      "grad_norm": 1.6892285346984863,
+      "learning_rate": 0.0006105263157894737,
+      "loss": 0.3775,
+      "step": 21780
+    },
+    {
+      "epoch": 34.75,
+      "grad_norm": 0.6772777438163757,
+      "learning_rate": 0.0006098883572567783,
+      "loss": 0.3051,
+      "step": 21790
+    },
+    {
+      "epoch": 34.77,
+      "grad_norm": 0.7815658450126648,
+      "learning_rate": 0.0006092503987240829,
+      "loss": 0.3252,
+      "step": 21800
+    },
+    {
+      "epoch": 34.78,
+      "grad_norm": 0.7828931212425232,
+      "learning_rate": 0.0006086124401913876,
+      "loss": 0.3276,
+      "step": 21810
+    },
+    {
+      "epoch": 34.8,
+      "grad_norm": 0.6614720821380615,
+      "learning_rate": 0.0006079744816586922,
+      "loss": 0.3114,
+      "step": 21820
+    },
+    {
+      "epoch": 34.82,
+      "grad_norm": 0.6951574087142944,
+      "learning_rate": 0.0006073365231259968,
+      "loss": 0.4708,
+      "step": 21830
+    },
+    {
+      "epoch": 34.83,
+      "grad_norm": 0.5724729895591736,
+      "learning_rate": 0.0006066985645933014,
+      "loss": 0.4992,
+      "step": 21840
+    },
+    {
+      "epoch": 34.85,
+      "grad_norm": 0.5912214517593384,
+      "learning_rate": 0.0006060606060606061,
+      "loss": 0.3571,
+      "step": 21850
+    },
+    {
+      "epoch": 34.86,
+      "grad_norm": 1.9406144618988037,
+      "learning_rate": 0.0006054226475279106,
+      "loss": 0.4081,
+      "step": 21860
+    },
+    {
+      "epoch": 34.88,
+      "grad_norm": 0.6928081512451172,
+      "learning_rate": 0.0006047846889952153,
+      "loss": 0.3651,
+      "step": 21870
+    },
+    {
+      "epoch": 34.9,
+      "grad_norm": 1.4750044345855713,
+      "learning_rate": 0.00060414673046252,
+      "loss": 0.3445,
+      "step": 21880
+    },
+    {
+      "epoch": 34.91,
+      "grad_norm": 1.0808309316635132,
+      "learning_rate": 0.0006035087719298245,
+      "loss": 0.3738,
+      "step": 21890
+    },
+    {
+      "epoch": 34.93,
+      "grad_norm": 0.8171405792236328,
+      "learning_rate": 0.0006028708133971292,
+      "loss": 0.3313,
+      "step": 21900
+    },
+    {
+      "epoch": 34.94,
+      "grad_norm": 0.9406991004943848,
+      "learning_rate": 0.0006022328548644338,
+      "loss": 0.3488,
+      "step": 21910
+    },
+    {
+      "epoch": 34.96,
+      "grad_norm": 0.7322232127189636,
+      "learning_rate": 0.0006015948963317384,
+      "loss": 0.3948,
+      "step": 21920
+    },
+    {
+      "epoch": 34.98,
+      "grad_norm": 1.1117455959320068,
+      "learning_rate": 0.000600956937799043,
+      "loss": 0.385,
+      "step": 21930
+    },
+    {
+      "epoch": 34.99,
+      "grad_norm": 1.0977877378463745,
+      "learning_rate": 0.0006003189792663477,
+      "loss": 0.419,
+      "step": 21940
+    },
+    {
+      "epoch": 35.01,
+      "grad_norm": 0.726335346698761,
+      "learning_rate": 0.0005996810207336522,
+      "loss": 0.3169,
+      "step": 21950
+    },
+    {
+      "epoch": 35.02,
+      "grad_norm": 0.49332767724990845,
+      "learning_rate": 0.0005990430622009569,
+      "loss": 0.2902,
+      "step": 21960
+    },
+    {
+      "epoch": 35.04,
+      "grad_norm": 0.9816588759422302,
+      "learning_rate": 0.0005984051036682616,
+      "loss": 0.3322,
+      "step": 21970
+    },
+    {
+      "epoch": 35.06,
+      "grad_norm": 0.8066359162330627,
+      "learning_rate": 0.0005977671451355661,
+      "loss": 0.3058,
+      "step": 21980
+    },
+    {
+      "epoch": 35.07,
+      "grad_norm": 0.38948720693588257,
+      "learning_rate": 0.0005971291866028708,
+      "loss": 0.2839,
+      "step": 21990
+    },
+    {
+      "epoch": 35.09,
+      "grad_norm": 0.2944769561290741,
+      "learning_rate": 0.0005964912280701754,
+      "loss": 0.3656,
+      "step": 22000
+    },
+    {
+      "epoch": 35.1,
+      "grad_norm": 0.3112677335739136,
+      "learning_rate": 0.00059585326953748,
+      "loss": 0.2358,
+      "step": 22010
+    },
+    {
+      "epoch": 35.12,
+      "grad_norm": 2.4940788745880127,
+      "learning_rate": 0.0005952153110047846,
+      "loss": 0.3809,
+      "step": 22020
+    },
+    {
+      "epoch": 35.14,
+      "grad_norm": 0.9833939671516418,
+      "learning_rate": 0.0005945773524720894,
+      "loss": 0.3437,
+      "step": 22030
+    },
+    {
+      "epoch": 35.15,
+      "grad_norm": 1.0946290493011475,
+      "learning_rate": 0.000593939393939394,
+      "loss": 0.3872,
+      "step": 22040
+    },
+    {
+      "epoch": 35.17,
+      "grad_norm": 1.2367923259735107,
+      "learning_rate": 0.0005933014354066986,
+      "loss": 0.3041,
+      "step": 22050
+    },
+    {
+      "epoch": 35.18,
+      "grad_norm": 1.032891035079956,
+      "learning_rate": 0.0005926634768740033,
+      "loss": 0.2409,
+      "step": 22060
+    },
+    {
+      "epoch": 35.2,
+      "grad_norm": 0.7659148573875427,
+      "learning_rate": 0.0005920255183413078,
+      "loss": 0.3928,
+      "step": 22070
+    },
+    {
+      "epoch": 35.22,
+      "grad_norm": 0.337522953748703,
+      "learning_rate": 0.0005913875598086125,
+      "loss": 0.3563,
+      "step": 22080
+    },
+    {
+      "epoch": 35.23,
+      "grad_norm": 0.6713753342628479,
+      "learning_rate": 0.0005907496012759171,
+      "loss": 0.3728,
+      "step": 22090
+    },
+    {
+      "epoch": 35.25,
+      "grad_norm": 1.1470608711242676,
+      "learning_rate": 0.0005901116427432218,
+      "loss": 0.3121,
+      "step": 22100
+    },
+    {
+      "epoch": 35.26,
+      "grad_norm": 0.5234013199806213,
+      "learning_rate": 0.0005894736842105263,
+      "loss": 0.42,
+      "step": 22110
+    },
+    {
+      "epoch": 35.28,
+      "grad_norm": 0.6255330443382263,
+      "learning_rate": 0.000588835725677831,
+      "loss": 0.2818,
+      "step": 22120
+    },
+    {
+      "epoch": 35.3,
+      "grad_norm": 1.1830130815505981,
+      "learning_rate": 0.0005881977671451357,
+      "loss": 0.3217,
+      "step": 22130
+    },
+    {
+      "epoch": 35.31,
+      "grad_norm": 0.47124946117401123,
+      "learning_rate": 0.0005875598086124402,
+      "loss": 0.3054,
+      "step": 22140
+    },
+    {
+      "epoch": 35.33,
+      "grad_norm": 0.5270739793777466,
+      "learning_rate": 0.0005869218500797449,
+      "loss": 0.3684,
+      "step": 22150
+    },
+    {
+      "epoch": 35.34,
+      "grad_norm": 1.9852588176727295,
+      "learning_rate": 0.0005862838915470495,
+      "loss": 0.3598,
+      "step": 22160
+    },
+    {
+      "epoch": 35.36,
+      "grad_norm": 1.0637511014938354,
+      "learning_rate": 0.0005856459330143541,
+      "loss": 0.3553,
+      "step": 22170
+    },
+    {
+      "epoch": 35.37,
+      "grad_norm": 0.7305306792259216,
+      "learning_rate": 0.0005850079744816587,
+      "loss": 0.3361,
+      "step": 22180
+    },
+    {
+      "epoch": 35.39,
+      "grad_norm": 1.0449053049087524,
+      "learning_rate": 0.0005843700159489634,
+      "loss": 0.45,
+      "step": 22190
+    },
+    {
+      "epoch": 35.41,
+      "grad_norm": 0.3895207643508911,
+      "learning_rate": 0.000583732057416268,
+      "loss": 0.38,
+      "step": 22200
+    },
+    {
+      "epoch": 35.42,
+      "grad_norm": 0.8981882333755493,
+      "learning_rate": 0.0005830940988835726,
+      "loss": 0.376,
+      "step": 22210
+    },
+    {
+      "epoch": 35.44,
+      "grad_norm": 1.1853015422821045,
+      "learning_rate": 0.0005824561403508773,
+      "loss": 0.4054,
+      "step": 22220
+    },
+    {
+      "epoch": 35.45,
+      "grad_norm": 0.6197064518928528,
+      "learning_rate": 0.0005818181818181818,
+      "loss": 0.3198,
+      "step": 22230
+    },
+    {
+      "epoch": 35.47,
+      "grad_norm": 0.5569806694984436,
+      "learning_rate": 0.0005811802232854865,
+      "loss": 0.4118,
+      "step": 22240
+    },
+    {
+      "epoch": 35.49,
+      "grad_norm": 0.48562178015708923,
+      "learning_rate": 0.0005805422647527911,
+      "loss": 0.2063,
+      "step": 22250
+    },
+    {
+      "epoch": 35.5,
+      "grad_norm": 0.5743929743766785,
+      "learning_rate": 0.0005799043062200957,
+      "loss": 0.2263,
+      "step": 22260
+    },
+    {
+      "epoch": 35.52,
+      "grad_norm": 0.5665689706802368,
+      "learning_rate": 0.0005792663476874004,
+      "loss": 0.3017,
+      "step": 22270
+    },
+    {
+      "epoch": 35.53,
+      "grad_norm": 0.7719668745994568,
+      "learning_rate": 0.000578628389154705,
+      "loss": 0.3591,
+      "step": 22280
+    },
+    {
+      "epoch": 35.55,
+      "grad_norm": 1.785213828086853,
+      "learning_rate": 0.0005779904306220096,
+      "loss": 0.3357,
+      "step": 22290
+    },
+    {
+      "epoch": 35.57,
+      "grad_norm": 0.3386642336845398,
+      "learning_rate": 0.0005773524720893142,
+      "loss": 0.3164,
+      "step": 22300
+    },
+    {
+      "epoch": 35.58,
+      "grad_norm": 0.8696405291557312,
+      "learning_rate": 0.0005767145135566189,
+      "loss": 0.3237,
+      "step": 22310
+    },
+    {
+      "epoch": 35.6,
+      "grad_norm": 0.32794955372810364,
+      "learning_rate": 0.0005760765550239234,
+      "loss": 0.2756,
+      "step": 22320
+    },
+    {
+      "epoch": 35.61,
+      "grad_norm": 0.3796286880970001,
+      "learning_rate": 0.0005754385964912281,
+      "loss": 0.3981,
+      "step": 22330
+    },
+    {
+      "epoch": 35.63,
+      "grad_norm": 0.31685948371887207,
+      "learning_rate": 0.0005748006379585327,
+      "loss": 0.3877,
+      "step": 22340
+    },
+    {
+      "epoch": 35.65,
+      "grad_norm": 0.3694205582141876,
+      "learning_rate": 0.0005741626794258373,
+      "loss": 0.2914,
+      "step": 22350
+    },
+    {
+      "epoch": 35.66,
+      "grad_norm": 0.6097325086593628,
+      "learning_rate": 0.000573524720893142,
+      "loss": 0.2863,
+      "step": 22360
+    },
+    {
+      "epoch": 35.68,
+      "grad_norm": 0.7454453110694885,
+      "learning_rate": 0.0005728867623604466,
+      "loss": 0.3531,
+      "step": 22370
+    },
+    {
+      "epoch": 35.69,
+      "grad_norm": 0.4996640086174011,
+      "learning_rate": 0.0005722488038277512,
+      "loss": 0.3063,
+      "step": 22380
+    },
+    {
+      "epoch": 35.71,
+      "grad_norm": 0.4868077337741852,
+      "learning_rate": 0.0005716108452950558,
+      "loss": 0.3087,
+      "step": 22390
+    },
+    {
+      "epoch": 35.73,
+      "grad_norm": 0.3814201056957245,
+      "learning_rate": 0.0005709728867623605,
+      "loss": 0.3242,
+      "step": 22400
+    },
+    {
+      "epoch": 35.74,
+      "grad_norm": 0.5458118915557861,
+      "learning_rate": 0.000570334928229665,
+      "loss": 0.2917,
+      "step": 22410
+    },
+    {
+      "epoch": 35.76,
+      "grad_norm": 0.7367342114448547,
+      "learning_rate": 0.0005696969696969697,
+      "loss": 0.3678,
+      "step": 22420
+    },
+    {
+      "epoch": 35.77,
+      "grad_norm": 0.876809298992157,
+      "learning_rate": 0.0005690590111642744,
+      "loss": 0.2996,
+      "step": 22430
+    },
+    {
+      "epoch": 35.79,
+      "grad_norm": 0.392926424741745,
+      "learning_rate": 0.0005684210526315789,
+      "loss": 0.2191,
+      "step": 22440
+    },
+    {
+      "epoch": 35.81,
+      "grad_norm": 0.5339792966842651,
+      "learning_rate": 0.0005677830940988836,
+      "loss": 0.3324,
+      "step": 22450
+    },
+    {
+      "epoch": 35.82,
+      "grad_norm": 0.31976428627967834,
+      "learning_rate": 0.0005671451355661882,
+      "loss": 0.3723,
+      "step": 22460
+    },
+    {
+      "epoch": 35.84,
+      "grad_norm": 1.3592702150344849,
+      "learning_rate": 0.0005665071770334928,
+      "loss": 0.3606,
+      "step": 22470
+    },
+    {
+      "epoch": 35.85,
+      "grad_norm": 0.4233976900577545,
+      "learning_rate": 0.0005658692185007974,
+      "loss": 0.329,
+      "step": 22480
+    },
+    {
+      "epoch": 35.87,
+      "grad_norm": 0.6980434656143188,
+      "learning_rate": 0.0005652312599681021,
+      "loss": 0.3132,
+      "step": 22490
+    },
+    {
+      "epoch": 35.89,
+      "grad_norm": 0.766575813293457,
+      "learning_rate": 0.0005645933014354066,
+      "loss": 0.3447,
+      "step": 22500
+    },
+    {
+      "epoch": 35.9,
+      "grad_norm": 0.6142354011535645,
+      "learning_rate": 0.0005639553429027113,
+      "loss": 0.2992,
+      "step": 22510
+    },
+    {
+      "epoch": 35.92,
+      "grad_norm": 0.41867053508758545,
+      "learning_rate": 0.000563317384370016,
+      "loss": 0.3276,
+      "step": 22520
+    },
+    {
+      "epoch": 35.93,
+      "grad_norm": 0.5943330526351929,
+      "learning_rate": 0.0005626794258373205,
+      "loss": 0.4112,
+      "step": 22530
+    },
+    {
+      "epoch": 35.95,
+      "grad_norm": 1.2840982675552368,
+      "learning_rate": 0.0005620414673046252,
+      "loss": 0.406,
+      "step": 22540
+    },
+    {
+      "epoch": 35.96,
+      "grad_norm": 0.5472711324691772,
+      "learning_rate": 0.0005614035087719298,
+      "loss": 0.3947,
+      "step": 22550
+    },
+    {
+      "epoch": 35.98,
+      "grad_norm": 0.49946820735931396,
+      "learning_rate": 0.0005607655502392345,
+      "loss": 0.3561,
+      "step": 22560
+    },
+    {
+      "epoch": 36.0,
+      "grad_norm": 0.5711825489997864,
+      "learning_rate": 0.000560127591706539,
+      "loss": 0.4207,
+      "step": 22570
+    },
+    {
+      "epoch": 36.01,
+      "grad_norm": 2.452195882797241,
+      "learning_rate": 0.0005594896331738437,
+      "loss": 0.3378,
+      "step": 22580
+    },
+    {
+      "epoch": 36.03,
+      "grad_norm": 0.39312276244163513,
+      "learning_rate": 0.0005588516746411484,
+      "loss": 0.3136,
+      "step": 22590
+    },
+    {
+      "epoch": 36.04,
+      "grad_norm": 0.7896597981452942,
+      "learning_rate": 0.0005582137161084529,
+      "loss": 0.3815,
+      "step": 22600
+    },
+    {
+      "epoch": 36.06,
+      "grad_norm": 0.5603874921798706,
+      "learning_rate": 0.0005575757575757576,
+      "loss": 0.3509,
+      "step": 22610
+    },
+    {
+      "epoch": 36.08,
+      "grad_norm": 0.3025873899459839,
+      "learning_rate": 0.0005569377990430622,
+      "loss": 0.2575,
+      "step": 22620
+    },
+    {
+      "epoch": 36.09,
+      "grad_norm": 0.2621009349822998,
+      "learning_rate": 0.0005562998405103668,
+      "loss": 0.2562,
+      "step": 22630
+    },
+    {
+      "epoch": 36.11,
+      "grad_norm": 0.6688899397850037,
+      "learning_rate": 0.0005556618819776714,
+      "loss": 0.3194,
+      "step": 22640
+    },
+    {
+      "epoch": 36.12,
+      "grad_norm": 2.3156378269195557,
+      "learning_rate": 0.0005550239234449761,
+      "loss": 0.3564,
+      "step": 22650
+    },
+    {
+      "epoch": 36.14,
+      "grad_norm": 0.35387060046195984,
+      "learning_rate": 0.0005543859649122806,
+      "loss": 0.3054,
+      "step": 22660
+    },
+    {
+      "epoch": 36.16,
+      "grad_norm": 0.3707694709300995,
+      "learning_rate": 0.0005537480063795853,
+      "loss": 0.3391,
+      "step": 22670
+    },
+    {
+      "epoch": 36.17,
+      "grad_norm": 0.675459086894989,
+      "learning_rate": 0.00055311004784689,
+      "loss": 0.3202,
+      "step": 22680
+    },
+    {
+      "epoch": 36.19,
+      "grad_norm": 0.46194231510162354,
+      "learning_rate": 0.0005524720893141945,
+      "loss": 0.2755,
+      "step": 22690
+    },
+    {
+      "epoch": 36.2,
+      "grad_norm": 0.4732086956501007,
+      "learning_rate": 0.0005518341307814992,
+      "loss": 0.2891,
+      "step": 22700
+    },
+    {
+      "epoch": 36.22,
+      "grad_norm": 0.5394445061683655,
+      "learning_rate": 0.0005511961722488038,
+      "loss": 0.2845,
+      "step": 22710
+    },
+    {
+      "epoch": 36.24,
+      "grad_norm": 0.7429685592651367,
+      "learning_rate": 0.0005505582137161084,
+      "loss": 0.3268,
+      "step": 22720
+    },
+    {
+      "epoch": 36.25,
+      "grad_norm": 0.4031120240688324,
+      "learning_rate": 0.000549920255183413,
+      "loss": 0.3197,
+      "step": 22730
+    },
+    {
+      "epoch": 36.27,
+      "grad_norm": 1.3633867502212524,
+      "learning_rate": 0.0005492822966507177,
+      "loss": 0.2912,
+      "step": 22740
+    },
+    {
+      "epoch": 36.28,
+      "grad_norm": 0.246135875582695,
+      "learning_rate": 0.0005486443381180223,
+      "loss": 0.3485,
+      "step": 22750
+    },
+    {
+      "epoch": 36.3,
+      "grad_norm": 0.7717587351799011,
+      "learning_rate": 0.0005480063795853269,
+      "loss": 0.2938,
+      "step": 22760
+    },
+    {
+      "epoch": 36.32,
+      "grad_norm": 0.5031578540802002,
+      "learning_rate": 0.0005473684210526317,
+      "loss": 0.3693,
+      "step": 22770
+    },
+    {
+      "epoch": 36.33,
+      "grad_norm": 0.46057426929473877,
+      "learning_rate": 0.0005467304625199362,
+      "loss": 0.2782,
+      "step": 22780
+    },
+    {
+      "epoch": 36.35,
+      "grad_norm": 0.33407339453697205,
+      "learning_rate": 0.0005460925039872409,
+      "loss": 0.3133,
+      "step": 22790
+    },
+    {
+      "epoch": 36.36,
+      "grad_norm": 0.7417854070663452,
+      "learning_rate": 0.0005454545454545455,
+      "loss": 0.3441,
+      "step": 22800
+    },
+    {
+      "epoch": 36.38,
+      "grad_norm": 0.3010425567626953,
+      "learning_rate": 0.0005448165869218501,
+      "loss": 0.314,
+      "step": 22810
+    },
+    {
+      "epoch": 36.4,
+      "grad_norm": 0.5968150496482849,
+      "learning_rate": 0.0005441786283891547,
+      "loss": 0.3526,
+      "step": 22820
+    },
+    {
+      "epoch": 36.41,
+      "grad_norm": 0.8175147771835327,
+      "learning_rate": 0.0005435406698564594,
+      "loss": 0.3451,
+      "step": 22830
+    },
+    {
+      "epoch": 36.43,
+      "grad_norm": 1.3906422853469849,
+      "learning_rate": 0.000542902711323764,
+      "loss": 0.2813,
+      "step": 22840
+    },
+    {
+      "epoch": 36.44,
+      "grad_norm": 0.47024595737457275,
+      "learning_rate": 0.0005422647527910686,
+      "loss": 0.3559,
+      "step": 22850
+    },
+    {
+      "epoch": 36.46,
+      "grad_norm": 0.3460497558116913,
+      "learning_rate": 0.0005416267942583733,
+      "loss": 0.2783,
+      "step": 22860
+    },
+    {
+      "epoch": 36.48,
+      "grad_norm": 0.5971447825431824,
+      "learning_rate": 0.0005409888357256778,
+      "loss": 0.3274,
+      "step": 22870
+    },
+    {
+      "epoch": 36.49,
+      "grad_norm": 0.9573736190795898,
+      "learning_rate": 0.0005403508771929825,
+      "loss": 0.2442,
+      "step": 22880
+    },
+    {
+      "epoch": 36.51,
+      "grad_norm": 0.4627261757850647,
+      "learning_rate": 0.0005397129186602871,
+      "loss": 0.2677,
+      "step": 22890
+    },
+    {
+      "epoch": 36.52,
+      "grad_norm": 0.45995354652404785,
+      "learning_rate": 0.0005390749601275918,
+      "loss": 0.3892,
+      "step": 22900
+    },
+    {
+      "epoch": 36.54,
+      "grad_norm": 0.2959776818752289,
+      "learning_rate": 0.0005384370015948964,
+      "loss": 0.3216,
+      "step": 22910
+    },
+    {
+      "epoch": 36.56,
+      "grad_norm": 0.4786494970321655,
+      "learning_rate": 0.000537799043062201,
+      "loss": 0.3105,
+      "step": 22920
+    },
+    {
+      "epoch": 36.57,
+      "grad_norm": 0.462162584066391,
+      "learning_rate": 0.0005371610845295057,
+      "loss": 0.4404,
+      "step": 22930
+    },
+    {
+      "epoch": 36.59,
+      "grad_norm": 0.37563401460647583,
+      "learning_rate": 0.0005365231259968102,
+      "loss": 0.2949,
+      "step": 22940
+    },
+    {
+      "epoch": 36.6,
+      "grad_norm": 0.4217167794704437,
+      "learning_rate": 0.0005358851674641149,
+      "loss": 0.425,
+      "step": 22950
+    },
+    {
+      "epoch": 36.62,
+      "grad_norm": 0.5127308964729309,
+      "learning_rate": 0.0005352472089314195,
+      "loss": 0.3215,
+      "step": 22960
+    },
+    {
+      "epoch": 36.63,
+      "grad_norm": 1.0700709819793701,
+      "learning_rate": 0.0005346092503987241,
+      "loss": 0.3379,
+      "step": 22970
+    },
+    {
+      "epoch": 36.65,
+      "grad_norm": 0.6836196184158325,
+      "learning_rate": 0.0005339712918660288,
+      "loss": 0.3811,
+      "step": 22980
+    },
+    {
+      "epoch": 36.67,
+      "grad_norm": 0.2946398854255676,
+      "learning_rate": 0.0005333333333333334,
+      "loss": 0.2362,
+      "step": 22990
+    },
+    {
+      "epoch": 36.68,
+      "grad_norm": 0.38813692331314087,
+      "learning_rate": 0.000532695374800638,
+      "loss": 0.3188,
+      "step": 23000
+    },
+    {
+      "epoch": 36.7,
+      "grad_norm": 0.483698308467865,
+      "learning_rate": 0.0005320574162679426,
+      "loss": 0.4241,
+      "step": 23010
+    },
+    {
+      "epoch": 36.71,
+      "grad_norm": 0.5879315733909607,
+      "learning_rate": 0.0005314194577352473,
+      "loss": 0.3343,
+      "step": 23020
+    },
+    {
+      "epoch": 36.73,
+      "grad_norm": 0.3913237154483795,
+      "learning_rate": 0.0005307814992025518,
+      "loss": 0.3438,
+      "step": 23030
+    },
+    {
+      "epoch": 36.75,
+      "grad_norm": 0.9392869472503662,
+      "learning_rate": 0.0005301435406698565,
+      "loss": 0.3663,
+      "step": 23040
+    },
+    {
+      "epoch": 36.76,
+      "grad_norm": 0.4291793704032898,
+      "learning_rate": 0.0005295055821371611,
+      "loss": 0.3078,
+      "step": 23050
+    },
+    {
+      "epoch": 36.78,
+      "grad_norm": 0.6778882741928101,
+      "learning_rate": 0.0005288676236044657,
+      "loss": 0.3618,
+      "step": 23060
+    },
+    {
+      "epoch": 36.79,
+      "grad_norm": 0.9089276194572449,
+      "learning_rate": 0.0005282296650717704,
+      "loss": 0.3424,
+      "step": 23070
+    },
+    {
+      "epoch": 36.81,
+      "grad_norm": 0.6602213978767395,
+      "learning_rate": 0.000527591706539075,
+      "loss": 0.3148,
+      "step": 23080
+    },
+    {
+      "epoch": 36.83,
+      "grad_norm": 0.4564104378223419,
+      "learning_rate": 0.0005269537480063796,
+      "loss": 0.4301,
+      "step": 23090
+    },
+    {
+      "epoch": 36.84,
+      "grad_norm": 0.23501376807689667,
+      "learning_rate": 0.0005263157894736842,
+      "loss": 0.3264,
+      "step": 23100
+    },
+    {
+      "epoch": 36.86,
+      "grad_norm": 1.654263973236084,
+      "learning_rate": 0.0005256778309409889,
+      "loss": 0.3627,
+      "step": 23110
+    },
+    {
+      "epoch": 36.87,
+      "grad_norm": 0.8504493236541748,
+      "learning_rate": 0.0005250398724082934,
+      "loss": 0.3381,
+      "step": 23120
+    },
+    {
+      "epoch": 36.89,
+      "grad_norm": 0.7040032744407654,
+      "learning_rate": 0.0005244019138755981,
+      "loss": 0.4432,
+      "step": 23130
+    },
+    {
+      "epoch": 36.91,
+      "grad_norm": 0.5224348902702332,
+      "learning_rate": 0.0005237639553429028,
+      "loss": 0.3652,
+      "step": 23140
+    },
+    {
+      "epoch": 36.92,
+      "grad_norm": 0.5879861116409302,
+      "learning_rate": 0.0005231259968102073,
+      "loss": 0.3216,
+      "step": 23150
+    },
+    {
+      "epoch": 36.94,
+      "grad_norm": 0.3892087936401367,
+      "learning_rate": 0.000522488038277512,
+      "loss": 0.3308,
+      "step": 23160
+    },
+    {
+      "epoch": 36.95,
+      "grad_norm": 0.2915053069591522,
+      "learning_rate": 0.0005218500797448166,
+      "loss": 0.3044,
+      "step": 23170
+    },
+    {
+      "epoch": 36.97,
+      "grad_norm": 0.515186607837677,
+      "learning_rate": 0.0005212121212121212,
+      "loss": 0.4347,
+      "step": 23180
+    },
+    {
+      "epoch": 36.99,
+      "grad_norm": 0.4125446677207947,
+      "learning_rate": 0.0005205741626794258,
+      "loss": 0.3828,
+      "step": 23190
+    },
+    {
+      "epoch": 37.0,
+      "grad_norm": 0.4284899830818176,
+      "learning_rate": 0.0005199362041467305,
+      "loss": 0.4189,
+      "step": 23200
+    },
+    {
+      "epoch": 37.02,
+      "grad_norm": 1.1735564470291138,
+      "learning_rate": 0.000519298245614035,
+      "loss": 0.3337,
+      "step": 23210
+    },
+    {
+      "epoch": 37.03,
+      "grad_norm": 1.21298348903656,
+      "learning_rate": 0.0005186602870813397,
+      "loss": 0.3879,
+      "step": 23220
+    },
+    {
+      "epoch": 37.05,
+      "grad_norm": 3.3211417198181152,
+      "learning_rate": 0.0005180223285486444,
+      "loss": 0.3724,
+      "step": 23230
+    },
+    {
+      "epoch": 37.07,
+      "grad_norm": 0.5634852647781372,
+      "learning_rate": 0.0005173843700159489,
+      "loss": 0.2916,
+      "step": 23240
+    },
+    {
+      "epoch": 37.08,
+      "grad_norm": 0.40934479236602783,
+      "learning_rate": 0.0005167464114832536,
+      "loss": 0.3133,
+      "step": 23250
+    },
+    {
+      "epoch": 37.1,
+      "grad_norm": 0.6190032958984375,
+      "learning_rate": 0.0005161084529505582,
+      "loss": 0.2639,
+      "step": 23260
+    },
+    {
+      "epoch": 37.11,
+      "grad_norm": 0.38555908203125,
+      "learning_rate": 0.0005154704944178628,
+      "loss": 0.3493,
+      "step": 23270
+    },
+    {
+      "epoch": 37.13,
+      "grad_norm": 0.2890884280204773,
+      "learning_rate": 0.0005148325358851674,
+      "loss": 0.3042,
+      "step": 23280
+    },
+    {
+      "epoch": 37.15,
+      "grad_norm": 0.3978734016418457,
+      "learning_rate": 0.0005141945773524721,
+      "loss": 0.276,
+      "step": 23290
+    },
+    {
+      "epoch": 37.16,
+      "grad_norm": 0.6064948439598083,
+      "learning_rate": 0.0005135566188197768,
+      "loss": 0.3809,
+      "step": 23300
+    },
+    {
+      "epoch": 37.18,
+      "grad_norm": 0.6788705587387085,
+      "learning_rate": 0.0005129186602870813,
+      "loss": 0.2914,
+      "step": 23310
+    },
+    {
+      "epoch": 37.19,
+      "grad_norm": 0.4636113941669464,
+      "learning_rate": 0.000512280701754386,
+      "loss": 0.3822,
+      "step": 23320
+    },
+    {
+      "epoch": 37.21,
+      "grad_norm": 0.6636508107185364,
+      "learning_rate": 0.0005116427432216905,
+      "loss": 0.3767,
+      "step": 23330
+    },
+    {
+      "epoch": 37.22,
+      "grad_norm": 0.435531347990036,
+      "learning_rate": 0.0005110047846889952,
+      "loss": 0.4045,
+      "step": 23340
+    },
+    {
+      "epoch": 37.24,
+      "grad_norm": 0.5816912651062012,
+      "learning_rate": 0.0005103668261562998,
+      "loss": 0.3083,
+      "step": 23350
+    },
+    {
+      "epoch": 37.26,
+      "grad_norm": 0.8348118662834167,
+      "learning_rate": 0.0005097288676236045,
+      "loss": 0.2738,
+      "step": 23360
+    },
+    {
+      "epoch": 37.27,
+      "grad_norm": 0.5250842571258545,
+      "learning_rate": 0.000509090909090909,
+      "loss": 0.3266,
+      "step": 23370
+    },
+    {
+      "epoch": 37.29,
+      "grad_norm": 0.3116588592529297,
+      "learning_rate": 0.0005084529505582137,
+      "loss": 0.2874,
+      "step": 23380
+    },
+    {
+      "epoch": 37.3,
+      "grad_norm": 0.5619212985038757,
+      "learning_rate": 0.0005078149920255184,
+      "loss": 0.2712,
+      "step": 23390
+    },
+    {
+      "epoch": 37.32,
+      "grad_norm": 0.34848636388778687,
+      "learning_rate": 0.0005071770334928229,
+      "loss": 0.4232,
+      "step": 23400
+    },
+    {
+      "epoch": 37.34,
+      "grad_norm": 0.38688668608665466,
+      "learning_rate": 0.0005065390749601276,
+      "loss": 0.2947,
+      "step": 23410
+    },
+    {
+      "epoch": 37.35,
+      "grad_norm": 1.7782784700393677,
+      "learning_rate": 0.0005059011164274322,
+      "loss": 0.3313,
+      "step": 23420
+    },
+    {
+      "epoch": 37.37,
+      "grad_norm": 0.2640959620475769,
+      "learning_rate": 0.0005052631578947368,
+      "loss": 0.2622,
+      "step": 23430
+    },
+    {
+      "epoch": 37.38,
+      "grad_norm": 0.2727811932563782,
+      "learning_rate": 0.0005046251993620414,
+      "loss": 0.3136,
+      "step": 23440
+    },
+    {
+      "epoch": 37.4,
+      "grad_norm": 0.5404552817344666,
+      "learning_rate": 0.0005039872408293461,
+      "loss": 0.3078,
+      "step": 23450
+    },
+    {
+      "epoch": 37.42,
+      "grad_norm": 0.38602226972579956,
+      "learning_rate": 0.0005033492822966507,
+      "loss": 0.2807,
+      "step": 23460
+    },
+    {
+      "epoch": 37.43,
+      "grad_norm": 0.30310848355293274,
+      "learning_rate": 0.0005027113237639553,
+      "loss": 0.3203,
+      "step": 23470
+    },
+    {
+      "epoch": 37.45,
+      "grad_norm": 0.41210854053497314,
+      "learning_rate": 0.00050207336523126,
+      "loss": 0.3025,
+      "step": 23480
+    },
+    {
+      "epoch": 37.46,
+      "grad_norm": 0.7113584876060486,
+      "learning_rate": 0.0005014354066985645,
+      "loss": 0.3876,
+      "step": 23490
+    },
+    {
+      "epoch": 37.48,
+      "grad_norm": 0.8924645185470581,
+      "learning_rate": 0.0005007974481658692,
+      "loss": 0.2642,
+      "step": 23500
+    },
+    {
+      "epoch": 37.5,
+      "grad_norm": 0.5637812614440918,
+      "learning_rate": 0.0005001594896331738,
+      "loss": 0.3732,
+      "step": 23510
+    },
+    {
+      "epoch": 37.51,
+      "grad_norm": 0.34932073950767517,
+      "learning_rate": 0.0004995215311004785,
+      "loss": 0.2753,
+      "step": 23520
+    },
+    {
+      "epoch": 37.53,
+      "grad_norm": 0.39498457312583923,
+      "learning_rate": 0.0004988835725677831,
+      "loss": 0.2949,
+      "step": 23530
+    },
+    {
+      "epoch": 37.54,
+      "grad_norm": 0.4476890563964844,
+      "learning_rate": 0.0004982456140350878,
+      "loss": 0.318,
+      "step": 23540
+    },
+    {
+      "epoch": 37.56,
+      "grad_norm": 0.3034002482891083,
+      "learning_rate": 0.0004976076555023923,
+      "loss": 0.2778,
+      "step": 23550
+    },
+    {
+      "epoch": 37.58,
+      "grad_norm": 0.7696762084960938,
+      "learning_rate": 0.000496969696969697,
+      "loss": 0.3525,
+      "step": 23560
+    },
+    {
+      "epoch": 37.59,
+      "grad_norm": 0.6639572978019714,
+      "learning_rate": 0.0004963317384370016,
+      "loss": 0.2929,
+      "step": 23570
+    },
+    {
+      "epoch": 37.61,
+      "grad_norm": 0.8098918199539185,
+      "learning_rate": 0.0004956937799043062,
+      "loss": 0.3162,
+      "step": 23580
+    },
+    {
+      "epoch": 37.62,
+      "grad_norm": 0.7061499357223511,
+      "learning_rate": 0.0004950558213716109,
+      "loss": 0.4247,
+      "step": 23590
+    },
+    {
+      "epoch": 37.64,
+      "grad_norm": 0.7736586928367615,
+      "learning_rate": 0.0004944178628389155,
+      "loss": 0.3253,
+      "step": 23600
+    },
+    {
+      "epoch": 37.66,
+      "grad_norm": 0.22601386904716492,
+      "learning_rate": 0.0004937799043062201,
+      "loss": 0.322,
+      "step": 23610
+    },
+    {
+      "epoch": 37.67,
+      "grad_norm": 0.34596702456474304,
+      "learning_rate": 0.0004931419457735247,
+      "loss": 0.381,
+      "step": 23620
+    },
+    {
+      "epoch": 37.69,
+      "grad_norm": 0.399099737405777,
+      "learning_rate": 0.0004925039872408294,
+      "loss": 0.367,
+      "step": 23630
+    },
+    {
+      "epoch": 37.7,
+      "grad_norm": 0.4223106801509857,
+      "learning_rate": 0.0004918660287081339,
+      "loss": 0.3502,
+      "step": 23640
+    },
+    {
+      "epoch": 37.72,
+      "grad_norm": 0.36701181530952454,
+      "learning_rate": 0.0004912280701754386,
+      "loss": 0.3696,
+      "step": 23650
+    },
+    {
+      "epoch": 37.74,
+      "grad_norm": 1.0397878885269165,
+      "learning_rate": 0.0004905901116427433,
+      "loss": 0.2922,
+      "step": 23660
+    },
+    {
+      "epoch": 37.75,
+      "grad_norm": 0.6061972975730896,
+      "learning_rate": 0.0004899521531100478,
+      "loss": 0.3406,
+      "step": 23670
+    },
+    {
+      "epoch": 37.77,
+      "grad_norm": 0.46018704771995544,
+      "learning_rate": 0.0004893141945773525,
+      "loss": 0.3847,
+      "step": 23680
+    },
+    {
+      "epoch": 37.78,
+      "grad_norm": 0.4098079204559326,
+      "learning_rate": 0.0004886762360446571,
+      "loss": 0.2944,
+      "step": 23690
+    },
+    {
+      "epoch": 37.8,
+      "grad_norm": 0.33187025785446167,
+      "learning_rate": 0.0004880382775119617,
+      "loss": 0.3696,
+      "step": 23700
+    },
+    {
+      "epoch": 37.81,
+      "grad_norm": 1.5086455345153809,
+      "learning_rate": 0.00048740031897926637,
+      "loss": 0.3415,
+      "step": 23710
+    },
+    {
+      "epoch": 37.83,
+      "grad_norm": 0.1812012493610382,
+      "learning_rate": 0.000486762360446571,
+      "loss": 0.3317,
+      "step": 23720
+    },
+    {
+      "epoch": 37.85,
+      "grad_norm": 0.4595651924610138,
+      "learning_rate": 0.00048612440191387566,
+      "loss": 0.3449,
+      "step": 23730
+    },
+    {
+      "epoch": 37.86,
+      "grad_norm": 0.7050609588623047,
+      "learning_rate": 0.0004854864433811803,
+      "loss": 0.3273,
+      "step": 23740
+    },
+    {
+      "epoch": 37.88,
+      "grad_norm": 0.4877799451351166,
+      "learning_rate": 0.0004848484848484849,
+      "loss": 0.2679,
+      "step": 23750
+    },
+    {
+      "epoch": 37.89,
+      "grad_norm": 0.4837338328361511,
+      "learning_rate": 0.0004842105263157895,
+      "loss": 0.3203,
+      "step": 23760
+    },
+    {
+      "epoch": 37.91,
+      "grad_norm": 0.5711174607276917,
+      "learning_rate": 0.0004835725677830941,
+      "loss": 0.3088,
+      "step": 23770
+    },
+    {
+      "epoch": 37.93,
+      "grad_norm": 0.7363555431365967,
+      "learning_rate": 0.00048293460925039874,
+      "loss": 0.3494,
+      "step": 23780
+    },
+    {
+      "epoch": 37.94,
+      "grad_norm": 0.4688860774040222,
+      "learning_rate": 0.00048229665071770336,
+      "loss": 0.406,
+      "step": 23790
+    },
+    {
+      "epoch": 37.96,
+      "grad_norm": 0.22278854250907898,
+      "learning_rate": 0.000481658692185008,
+      "loss": 0.2775,
+      "step": 23800
+    },
+    {
+      "epoch": 37.97,
+      "grad_norm": 0.5794351100921631,
+      "learning_rate": 0.00048102073365231265,
+      "loss": 0.3613,
+      "step": 23810
+    },
+    {
+      "epoch": 37.99,
+      "grad_norm": 0.7034667730331421,
+      "learning_rate": 0.00048038277511961726,
+      "loss": 0.3417,
+      "step": 23820
+    },
+    {
+      "epoch": 38.01,
+      "grad_norm": 0.5369040966033936,
+      "learning_rate": 0.0004797448165869219,
+      "loss": 0.2919,
+      "step": 23830
+    },
+    {
+      "epoch": 38.02,
+      "grad_norm": 0.4583072066307068,
+      "learning_rate": 0.0004791068580542265,
+      "loss": 0.2711,
+      "step": 23840
+    },
+    {
+      "epoch": 38.04,
+      "grad_norm": 0.32047978043556213,
+      "learning_rate": 0.0004784688995215311,
+      "loss": 0.3133,
+      "step": 23850
+    },
+    {
+      "epoch": 38.05,
+      "grad_norm": 0.4489063024520874,
+      "learning_rate": 0.00047783094098883573,
+      "loss": 0.3027,
+      "step": 23860
+    },
+    {
+      "epoch": 38.07,
+      "grad_norm": 0.29304754734039307,
+      "learning_rate": 0.00047719298245614035,
+      "loss": 0.2937,
+      "step": 23870
+    },
+    {
+      "epoch": 38.09,
+      "grad_norm": 0.5141634345054626,
+      "learning_rate": 0.00047655502392344496,
+      "loss": 0.2787,
+      "step": 23880
+    },
+    {
+      "epoch": 38.1,
+      "grad_norm": 0.6913502216339111,
+      "learning_rate": 0.00047591706539074964,
+      "loss": 0.3666,
+      "step": 23890
+    },
+    {
+      "epoch": 38.12,
+      "grad_norm": 0.49919384717941284,
+      "learning_rate": 0.00047527910685805425,
+      "loss": 0.3182,
+      "step": 23900
+    },
+    {
+      "epoch": 38.13,
+      "grad_norm": 0.27605143189430237,
+      "learning_rate": 0.00047464114832535887,
+      "loss": 0.2354,
+      "step": 23910
+    },
+    {
+      "epoch": 38.15,
+      "grad_norm": 1.246079921722412,
+      "learning_rate": 0.0004740031897926635,
+      "loss": 0.3429,
+      "step": 23920
+    },
+    {
+      "epoch": 38.17,
+      "grad_norm": 0.18399390578269958,
+      "learning_rate": 0.0004733652312599681,
+      "loss": 0.2916,
+      "step": 23930
+    },
+    {
+      "epoch": 38.18,
+      "grad_norm": 0.3015744388103485,
+      "learning_rate": 0.0004727272727272727,
+      "loss": 0.3216,
+      "step": 23940
+    },
+    {
+      "epoch": 38.2,
+      "grad_norm": 0.5281094312667847,
+      "learning_rate": 0.00047208931419457734,
+      "loss": 0.4193,
+      "step": 23950
+    },
+    {
+      "epoch": 38.21,
+      "grad_norm": 0.6574485301971436,
+      "learning_rate": 0.00047145135566188195,
+      "loss": 0.318,
+      "step": 23960
+    },
+    {
+      "epoch": 38.23,
+      "grad_norm": 0.5636985898017883,
+      "learning_rate": 0.0004708133971291866,
+      "loss": 0.3493,
+      "step": 23970
+    },
+    {
+      "epoch": 38.25,
+      "grad_norm": 0.3899206817150116,
+      "learning_rate": 0.00047017543859649124,
+      "loss": 0.313,
+      "step": 23980
+    },
+    {
+      "epoch": 38.26,
+      "grad_norm": 0.465703547000885,
+      "learning_rate": 0.00046953748006379586,
+      "loss": 0.3227,
+      "step": 23990
+    },
+    {
+      "epoch": 38.28,
+      "grad_norm": 0.9873224496841431,
+      "learning_rate": 0.0004688995215311005,
+      "loss": 0.3093,
+      "step": 24000
+    },
+    {
+      "epoch": 38.29,
+      "grad_norm": 0.545748233795166,
+      "learning_rate": 0.0004682615629984051,
+      "loss": 0.3427,
+      "step": 24010
+    },
+    {
+      "epoch": 38.31,
+      "grad_norm": 1.6173831224441528,
+      "learning_rate": 0.0004676236044657097,
+      "loss": 0.3487,
+      "step": 24020
+    },
+    {
+      "epoch": 38.33,
+      "grad_norm": 0.43845269083976746,
+      "learning_rate": 0.0004669856459330143,
+      "loss": 0.3368,
+      "step": 24030
+    },
+    {
+      "epoch": 38.34,
+      "grad_norm": 0.6073929071426392,
+      "learning_rate": 0.00046634768740031894,
+      "loss": 0.4022,
+      "step": 24040
+    },
+    {
+      "epoch": 38.36,
+      "grad_norm": 0.3742305636405945,
+      "learning_rate": 0.0004657097288676236,
+      "loss": 0.2748,
+      "step": 24050
+    },
+    {
+      "epoch": 38.37,
+      "grad_norm": 0.2694351375102997,
+      "learning_rate": 0.00046507177033492823,
+      "loss": 0.3874,
+      "step": 24060
+    },
+    {
+      "epoch": 38.39,
+      "grad_norm": 0.48228031396865845,
+      "learning_rate": 0.00046443381180223285,
+      "loss": 0.2613,
+      "step": 24070
+    },
+    {
+      "epoch": 38.41,
+      "grad_norm": 0.7061280608177185,
+      "learning_rate": 0.0004637958532695375,
+      "loss": 0.3152,
+      "step": 24080
+    },
+    {
+      "epoch": 38.42,
+      "grad_norm": 0.5890529751777649,
+      "learning_rate": 0.00046315789473684214,
+      "loss": 0.2593,
+      "step": 24090
+    },
+    {
+      "epoch": 38.44,
+      "grad_norm": 0.5934563875198364,
+      "learning_rate": 0.00046251993620414675,
+      "loss": 0.3712,
+      "step": 24100
+    },
+    {
+      "epoch": 38.45,
+      "grad_norm": 0.30438482761383057,
+      "learning_rate": 0.00046188197767145137,
+      "loss": 0.3082,
+      "step": 24110
+    },
+    {
+      "epoch": 38.47,
+      "grad_norm": 0.1404085010290146,
+      "learning_rate": 0.000461244019138756,
+      "loss": 0.2458,
+      "step": 24120
+    },
+    {
+      "epoch": 38.48,
+      "grad_norm": 0.45408958196640015,
+      "learning_rate": 0.00046060606060606066,
+      "loss": 0.302,
+      "step": 24130
+    },
+    {
+      "epoch": 38.5,
+      "grad_norm": 0.4974878430366516,
+      "learning_rate": 0.0004599681020733653,
+      "loss": 0.3416,
+      "step": 24140
+    },
+    {
+      "epoch": 38.52,
+      "grad_norm": 0.2546900510787964,
+      "learning_rate": 0.0004593301435406699,
+      "loss": 0.2762,
+      "step": 24150
+    },
+    {
+      "epoch": 38.53,
+      "grad_norm": 0.5472551584243774,
+      "learning_rate": 0.0004586921850079745,
+      "loss": 0.3176,
+      "step": 24160
+    },
+    {
+      "epoch": 38.55,
+      "grad_norm": 0.7795162200927734,
+      "learning_rate": 0.0004580542264752791,
+      "loss": 0.3699,
+      "step": 24170
+    },
+    {
+      "epoch": 38.56,
+      "grad_norm": 0.4223695397377014,
+      "learning_rate": 0.00045741626794258374,
+      "loss": 0.3011,
+      "step": 24180
+    },
+    {
+      "epoch": 38.58,
+      "grad_norm": 2.152009963989258,
+      "learning_rate": 0.00045677830940988836,
+      "loss": 0.36,
+      "step": 24190
+    },
+    {
+      "epoch": 38.6,
+      "grad_norm": 0.477445513010025,
+      "learning_rate": 0.000456140350877193,
+      "loss": 0.279,
+      "step": 24200
+    },
+    {
+      "epoch": 38.61,
+      "grad_norm": 0.546576738357544,
+      "learning_rate": 0.00045550239234449765,
+      "loss": 0.3519,
+      "step": 24210
+    },
+    {
+      "epoch": 38.63,
+      "grad_norm": 0.3089749217033386,
+      "learning_rate": 0.00045486443381180226,
+      "loss": 0.3306,
+      "step": 24220
+    },
+    {
+      "epoch": 38.64,
+      "grad_norm": 0.5986670851707458,
+      "learning_rate": 0.0004542264752791069,
+      "loss": 0.2692,
+      "step": 24230
+    },
+    {
+      "epoch": 38.66,
+      "grad_norm": 1.1655359268188477,
+      "learning_rate": 0.0004535885167464115,
+      "loss": 0.3717,
+      "step": 24240
+    },
+    {
+      "epoch": 38.68,
+      "grad_norm": 0.349162757396698,
+      "learning_rate": 0.0004529505582137161,
+      "loss": 0.4474,
+      "step": 24250
+    },
+    {
+      "epoch": 38.69,
+      "grad_norm": 0.3474232256412506,
+      "learning_rate": 0.00045231259968102073,
+      "loss": 0.3711,
+      "step": 24260
+    },
+    {
+      "epoch": 38.71,
+      "grad_norm": 0.38125041127204895,
+      "learning_rate": 0.00045167464114832535,
+      "loss": 0.3417,
+      "step": 24270
+    },
+    {
+      "epoch": 38.72,
+      "grad_norm": 0.49059632420539856,
+      "learning_rate": 0.00045103668261562996,
+      "loss": 0.3728,
+      "step": 24280
+    },
+    {
+      "epoch": 38.74,
+      "grad_norm": 0.34616127610206604,
+      "learning_rate": 0.00045039872408293464,
+      "loss": 0.3007,
+      "step": 24290
+    },
+    {
+      "epoch": 38.76,
+      "grad_norm": 0.6310774087905884,
+      "learning_rate": 0.00044976076555023925,
+      "loss": 0.2896,
+      "step": 24300
+    },
+    {
+      "epoch": 38.77,
+      "grad_norm": 1.5255939960479736,
+      "learning_rate": 0.00044912280701754387,
+      "loss": 0.3429,
+      "step": 24310
+    },
+    {
+      "epoch": 38.79,
+      "grad_norm": 0.38608258962631226,
+      "learning_rate": 0.0004484848484848485,
+      "loss": 0.3438,
+      "step": 24320
+    },
+    {
+      "epoch": 38.8,
+      "grad_norm": 1.0546627044677734,
+      "learning_rate": 0.0004478468899521531,
+      "loss": 0.3801,
+      "step": 24330
+    },
+    {
+      "epoch": 38.82,
+      "grad_norm": 0.3056943118572235,
+      "learning_rate": 0.0004472089314194577,
+      "loss": 0.2759,
+      "step": 24340
+    },
+    {
+      "epoch": 38.84,
+      "grad_norm": 0.7335503101348877,
+      "learning_rate": 0.00044657097288676234,
+      "loss": 0.3331,
+      "step": 24350
+    },
+    {
+      "epoch": 38.85,
+      "grad_norm": 0.36230140924453735,
+      "learning_rate": 0.00044593301435406695,
+      "loss": 0.2871,
+      "step": 24360
+    },
+    {
+      "epoch": 38.87,
+      "grad_norm": 0.3868005573749542,
+      "learning_rate": 0.0004452950558213716,
+      "loss": 0.3049,
+      "step": 24370
+    },
+    {
+      "epoch": 38.88,
+      "grad_norm": 0.4695385992527008,
+      "learning_rate": 0.00044465709728867624,
+      "loss": 0.3754,
+      "step": 24380
+    },
+    {
+      "epoch": 38.9,
+      "grad_norm": 0.2892504036426544,
+      "learning_rate": 0.00044401913875598086,
+      "loss": 0.3438,
+      "step": 24390
+    },
+    {
+      "epoch": 38.92,
+      "grad_norm": 0.7235500812530518,
+      "learning_rate": 0.0004433811802232855,
+      "loss": 0.3345,
+      "step": 24400
+    },
+    {
+      "epoch": 38.93,
+      "grad_norm": 0.48276352882385254,
+      "learning_rate": 0.0004427432216905901,
+      "loss": 0.3486,
+      "step": 24410
+    },
+    {
+      "epoch": 38.95,
+      "grad_norm": 0.384084016084671,
+      "learning_rate": 0.0004421052631578947,
+      "loss": 0.3657,
+      "step": 24420
+    },
+    {
+      "epoch": 38.96,
+      "grad_norm": 0.31639254093170166,
+      "learning_rate": 0.0004414673046251993,
+      "loss": 0.3392,
+      "step": 24430
+    },
+    {
+      "epoch": 38.98,
+      "grad_norm": 0.3250158131122589,
+      "learning_rate": 0.00044082934609250394,
+      "loss": 0.3391,
+      "step": 24440
+    },
+    {
+      "epoch": 39.0,
+      "grad_norm": 0.37524476647377014,
+      "learning_rate": 0.00044019138755980867,
+      "loss": 0.3313,
+      "step": 24450
+    },
+    {
+      "epoch": 39.01,
+      "grad_norm": 0.5987895131111145,
+      "learning_rate": 0.0004395534290271133,
+      "loss": 0.2781,
+      "step": 24460
+    },
+    {
+      "epoch": 39.03,
+      "grad_norm": 0.18153107166290283,
+      "learning_rate": 0.0004389154704944179,
+      "loss": 0.2496,
+      "step": 24470
+    },
+    {
+      "epoch": 39.04,
+      "grad_norm": 0.34211575984954834,
+      "learning_rate": 0.0004382775119617225,
+      "loss": 0.272,
+      "step": 24480
+    },
+    {
+      "epoch": 39.06,
+      "grad_norm": 0.391075074672699,
+      "learning_rate": 0.00043763955342902714,
+      "loss": 0.3307,
+      "step": 24490
+    },
+    {
+      "epoch": 39.07,
+      "grad_norm": 0.2632424831390381,
+      "learning_rate": 0.00043700159489633175,
+      "loss": 0.2906,
+      "step": 24500
+    },
+    {
+      "epoch": 39.09,
+      "grad_norm": 0.5995433926582336,
+      "learning_rate": 0.00043636363636363637,
+      "loss": 0.293,
+      "step": 24510
+    },
+    {
+      "epoch": 39.11,
+      "grad_norm": 0.6448796987533569,
+      "learning_rate": 0.00043572567783094104,
+      "loss": 0.251,
+      "step": 24520
+    },
+    {
+      "epoch": 39.12,
+      "grad_norm": 0.5249642729759216,
+      "learning_rate": 0.00043508771929824566,
+      "loss": 0.3272,
+      "step": 24530
+    },
+    {
+      "epoch": 39.14,
+      "grad_norm": 0.8031821250915527,
+      "learning_rate": 0.0004344497607655503,
+      "loss": 0.3417,
+      "step": 24540
+    },
+    {
+      "epoch": 39.15,
+      "grad_norm": 0.9898377656936646,
+      "learning_rate": 0.0004338118022328549,
+      "loss": 0.3799,
+      "step": 24550
+    },
+    {
+      "epoch": 39.17,
+      "grad_norm": 0.301408976316452,
+      "learning_rate": 0.0004331738437001595,
+      "loss": 0.2618,
+      "step": 24560
+    },
+    {
+      "epoch": 39.19,
+      "grad_norm": 0.3909609317779541,
+      "learning_rate": 0.0004325358851674641,
+      "loss": 0.3,
+      "step": 24570
+    },
+    {
+      "epoch": 39.2,
+      "grad_norm": 0.3314201533794403,
+      "learning_rate": 0.00043189792663476874,
+      "loss": 0.3461,
+      "step": 24580
+    },
+    {
+      "epoch": 39.22,
+      "grad_norm": 0.8803900480270386,
+      "learning_rate": 0.00043125996810207336,
+      "loss": 0.3438,
+      "step": 24590
+    },
+    {
+      "epoch": 39.23,
+      "grad_norm": 0.3051396906375885,
+      "learning_rate": 0.00043062200956937803,
+      "loss": 0.3489,
+      "step": 24600
+    },
+    {
+      "epoch": 39.25,
+      "grad_norm": 0.5020725131034851,
+      "learning_rate": 0.00042998405103668265,
+      "loss": 0.316,
+      "step": 24610
+    },
+    {
+      "epoch": 39.27,
+      "grad_norm": 0.7016777396202087,
+      "learning_rate": 0.00042934609250398726,
+      "loss": 0.3582,
+      "step": 24620
+    },
+    {
+      "epoch": 39.28,
+      "grad_norm": 0.21689297258853912,
+      "learning_rate": 0.0004287081339712919,
+      "loss": 0.2597,
+      "step": 24630
+    },
+    {
+      "epoch": 39.3,
+      "grad_norm": 0.6638566851615906,
+      "learning_rate": 0.0004280701754385965,
+      "loss": 0.3905,
+      "step": 24640
+    },
+    {
+      "epoch": 39.31,
+      "grad_norm": 0.24087496101856232,
+      "learning_rate": 0.0004274322169059011,
+      "loss": 0.3349,
+      "step": 24650
+    },
+    {
+      "epoch": 39.33,
+      "grad_norm": 0.14746366441249847,
+      "learning_rate": 0.00042679425837320573,
+      "loss": 0.2942,
+      "step": 24660
+    },
+    {
+      "epoch": 39.35,
+      "grad_norm": 0.3620028495788574,
+      "learning_rate": 0.00042615629984051035,
+      "loss": 0.2394,
+      "step": 24670
+    },
+    {
+      "epoch": 39.36,
+      "grad_norm": 0.5359326004981995,
+      "learning_rate": 0.000425518341307815,
+      "loss": 0.3661,
+      "step": 24680
+    },
+    {
+      "epoch": 39.38,
+      "grad_norm": 0.26914021372795105,
+      "learning_rate": 0.00042488038277511964,
+      "loss": 0.2544,
+      "step": 24690
+    },
+    {
+      "epoch": 39.39,
+      "grad_norm": 0.22984707355499268,
+      "learning_rate": 0.00042424242424242425,
+      "loss": 0.2545,
+      "step": 24700
+    },
+    {
+      "epoch": 39.41,
+      "grad_norm": 0.2788347601890564,
+      "learning_rate": 0.00042360446570972887,
+      "loss": 0.3354,
+      "step": 24710
+    },
+    {
+      "epoch": 39.43,
+      "grad_norm": 0.24124827980995178,
+      "learning_rate": 0.0004229665071770335,
+      "loss": 0.2551,
+      "step": 24720
+    },
+    {
+      "epoch": 39.44,
+      "grad_norm": 0.547863781452179,
+      "learning_rate": 0.0004223285486443381,
+      "loss": 0.2597,
+      "step": 24730
+    },
+    {
+      "epoch": 39.46,
+      "grad_norm": 0.25198522210121155,
+      "learning_rate": 0.0004216905901116427,
+      "loss": 0.3179,
+      "step": 24740
+    },
+    {
+      "epoch": 39.47,
+      "grad_norm": 0.3968208134174347,
+      "learning_rate": 0.00042105263157894734,
+      "loss": 0.2876,
+      "step": 24750
+    },
+    {
+      "epoch": 39.49,
+      "grad_norm": 0.33785438537597656,
+      "learning_rate": 0.000420414673046252,
+      "loss": 0.3097,
+      "step": 24760
+    },
+    {
+      "epoch": 39.51,
+      "grad_norm": 0.5009357333183289,
+      "learning_rate": 0.0004197767145135566,
+      "loss": 0.3202,
+      "step": 24770
+    },
+    {
+      "epoch": 39.52,
+      "grad_norm": 0.4793984889984131,
+      "learning_rate": 0.00041913875598086124,
+      "loss": 0.3518,
+      "step": 24780
+    },
+    {
+      "epoch": 39.54,
+      "grad_norm": 0.19300325214862823,
+      "learning_rate": 0.00041850079744816586,
+      "loss": 0.3672,
+      "step": 24790
+    },
+    {
+      "epoch": 39.55,
+      "grad_norm": 0.5630788803100586,
+      "learning_rate": 0.0004178628389154705,
+      "loss": 0.3217,
+      "step": 24800
+    },
+    {
+      "epoch": 39.57,
+      "grad_norm": 0.28488433361053467,
+      "learning_rate": 0.0004172248803827751,
+      "loss": 0.3063,
+      "step": 24810
+    },
+    {
+      "epoch": 39.59,
+      "grad_norm": 0.25450441241264343,
+      "learning_rate": 0.0004165869218500797,
+      "loss": 0.3638,
+      "step": 24820
+    },
+    {
+      "epoch": 39.6,
+      "grad_norm": 0.4360348880290985,
+      "learning_rate": 0.0004159489633173844,
+      "loss": 0.3122,
+      "step": 24830
+    },
+    {
+      "epoch": 39.62,
+      "grad_norm": 0.5293656587600708,
+      "learning_rate": 0.00041531100478468905,
+      "loss": 0.3944,
+      "step": 24840
+    },
+    {
+      "epoch": 39.63,
+      "grad_norm": 0.46485990285873413,
+      "learning_rate": 0.00041467304625199367,
+      "loss": 0.2323,
+      "step": 24850
+    },
+    {
+      "epoch": 39.65,
+      "grad_norm": 0.501832127571106,
+      "learning_rate": 0.0004140350877192983,
+      "loss": 0.3502,
+      "step": 24860
+    },
+    {
+      "epoch": 39.67,
+      "grad_norm": 0.4300176799297333,
+      "learning_rate": 0.0004133971291866029,
+      "loss": 0.2736,
+      "step": 24870
+    },
+    {
+      "epoch": 39.68,
+      "grad_norm": 0.253682941198349,
+      "learning_rate": 0.0004127591706539075,
+      "loss": 0.3306,
+      "step": 24880
+    },
+    {
+      "epoch": 39.7,
+      "grad_norm": 0.18599876761436462,
+      "learning_rate": 0.00041212121212121214,
+      "loss": 0.3534,
+      "step": 24890
+    },
+    {
+      "epoch": 39.71,
+      "grad_norm": 0.21810634434223175,
+      "learning_rate": 0.00041148325358851675,
+      "loss": 0.2772,
+      "step": 24900
+    },
+    {
+      "epoch": 39.73,
+      "grad_norm": 0.3228086233139038,
+      "learning_rate": 0.00041084529505582137,
+      "loss": 0.2433,
+      "step": 24910
+    },
+    {
+      "epoch": 39.74,
+      "grad_norm": 0.30225640535354614,
+      "learning_rate": 0.00041020733652312604,
+      "loss": 0.2261,
+      "step": 24920
+    },
+    {
+      "epoch": 39.76,
+      "grad_norm": 0.19185695052146912,
+      "learning_rate": 0.00040956937799043066,
+      "loss": 0.3096,
+      "step": 24930
+    },
+    {
+      "epoch": 39.78,
+      "grad_norm": 0.40327930450439453,
+      "learning_rate": 0.0004089314194577353,
+      "loss": 0.3472,
+      "step": 24940
+    },
+    {
+      "epoch": 39.79,
+      "grad_norm": 0.4578391909599304,
+      "learning_rate": 0.0004082934609250399,
+      "loss": 0.3177,
+      "step": 24950
+    },
+    {
+      "epoch": 39.81,
+      "grad_norm": 0.24900272488594055,
+      "learning_rate": 0.0004076555023923445,
+      "loss": 0.3526,
+      "step": 24960
+    },
+    {
+      "epoch": 39.82,
+      "grad_norm": 0.8984745144844055,
+      "learning_rate": 0.0004070175438596491,
+      "loss": 0.3307,
+      "step": 24970
+    },
+    {
+      "epoch": 39.84,
+      "grad_norm": 0.2043074071407318,
+      "learning_rate": 0.00040637958532695374,
+      "loss": 0.3346,
+      "step": 24980
+    },
+    {
+      "epoch": 39.86,
+      "grad_norm": 0.293965220451355,
+      "learning_rate": 0.00040574162679425836,
+      "loss": 0.3562,
+      "step": 24990
+    },
+    {
+      "epoch": 39.87,
+      "grad_norm": 0.1676713526248932,
+      "learning_rate": 0.00040510366826156303,
+      "loss": 0.2771,
+      "step": 25000
+    },
+    {
+      "epoch": 39.89,
+      "grad_norm": 0.7040833830833435,
+      "learning_rate": 0.00040446570972886765,
+      "loss": 0.3782,
+      "step": 25010
+    },
+    {
+      "epoch": 39.9,
+      "grad_norm": 1.6222413778305054,
+      "learning_rate": 0.00040382775119617226,
+      "loss": 0.2836,
+      "step": 25020
+    },
+    {
+      "epoch": 39.92,
+      "grad_norm": 0.3965054750442505,
+      "learning_rate": 0.0004031897926634769,
+      "loss": 0.4055,
+      "step": 25030
+    },
+    {
+      "epoch": 39.94,
+      "grad_norm": 0.5142346024513245,
+      "learning_rate": 0.0004025518341307815,
+      "loss": 0.34,
+      "step": 25040
+    },
+    {
+      "epoch": 39.95,
+      "grad_norm": 0.4719744622707367,
+      "learning_rate": 0.0004019138755980861,
+      "loss": 0.3643,
+      "step": 25050
+    },
+    {
+      "epoch": 39.97,
+      "grad_norm": 0.29006433486938477,
+      "learning_rate": 0.00040127591706539073,
+      "loss": 0.3195,
+      "step": 25060
+    },
+    {
+      "epoch": 39.98,
+      "grad_norm": 0.40275096893310547,
+      "learning_rate": 0.00040063795853269535,
+      "loss": 0.3247,
+      "step": 25070
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 0.19441524147987366,
+      "learning_rate": 0.0004,
+      "loss": 0.2905,
+      "step": 25080
+    },
+    {
+      "epoch": 40.02,
+      "grad_norm": 0.30110710859298706,
+      "learning_rate": 0.00039936204146730464,
+      "loss": 0.2916,
+      "step": 25090
+    },
+    {
+      "epoch": 40.03,
+      "grad_norm": 0.36237674951553345,
+      "learning_rate": 0.00039872408293460925,
+      "loss": 0.3144,
+      "step": 25100
+    },
+    {
+      "epoch": 40.05,
+      "grad_norm": 0.4144202172756195,
+      "learning_rate": 0.00039808612440191387,
+      "loss": 0.2537,
+      "step": 25110
+    },
+    {
+      "epoch": 40.06,
+      "grad_norm": 0.5469448566436768,
+      "learning_rate": 0.0003974481658692185,
+      "loss": 0.2778,
+      "step": 25120
+    },
+    {
+      "epoch": 40.08,
+      "grad_norm": 0.6350633502006531,
+      "learning_rate": 0.0003968102073365231,
+      "loss": 0.3139,
+      "step": 25130
+    },
+    {
+      "epoch": 40.1,
+      "grad_norm": 0.6425772905349731,
+      "learning_rate": 0.0003961722488038277,
+      "loss": 0.3137,
+      "step": 25140
+    },
+    {
+      "epoch": 40.11,
+      "grad_norm": 0.5132192373275757,
+      "learning_rate": 0.00039553429027113234,
+      "loss": 0.3182,
+      "step": 25150
+    },
+    {
+      "epoch": 40.13,
+      "grad_norm": 0.3655058443546295,
+      "learning_rate": 0.000394896331738437,
+      "loss": 0.3213,
+      "step": 25160
+    },
+    {
+      "epoch": 40.14,
+      "grad_norm": 0.3207656145095825,
+      "learning_rate": 0.0003942583732057416,
+      "loss": 0.2839,
+      "step": 25170
+    },
+    {
+      "epoch": 40.16,
+      "grad_norm": 0.4457024037837982,
+      "learning_rate": 0.00039362041467304624,
+      "loss": 0.3271,
+      "step": 25180
+    },
+    {
+      "epoch": 40.18,
+      "grad_norm": 0.457660049200058,
+      "learning_rate": 0.00039298245614035086,
+      "loss": 0.3049,
+      "step": 25190
+    },
+    {
+      "epoch": 40.19,
+      "grad_norm": 0.44609880447387695,
+      "learning_rate": 0.00039234449760765553,
+      "loss": 0.3159,
+      "step": 25200
+    },
+    {
+      "epoch": 40.21,
+      "grad_norm": 0.14960619807243347,
+      "learning_rate": 0.00039170653907496015,
+      "loss": 0.2678,
+      "step": 25210
+    },
+    {
+      "epoch": 40.22,
+      "grad_norm": 0.20554865896701813,
+      "learning_rate": 0.00039106858054226476,
+      "loss": 0.2969,
+      "step": 25220
+    },
+    {
+      "epoch": 40.24,
+      "grad_norm": 0.25997835397720337,
+      "learning_rate": 0.0003904306220095694,
+      "loss": 0.2159,
+      "step": 25230
+    },
+    {
+      "epoch": 40.26,
+      "grad_norm": 0.18251359462738037,
+      "learning_rate": 0.00038979266347687405,
+      "loss": 0.3482,
+      "step": 25240
+    },
+    {
+      "epoch": 40.27,
+      "grad_norm": 0.3024716377258301,
+      "learning_rate": 0.00038915470494417867,
+      "loss": 0.3027,
+      "step": 25250
+    },
+    {
+      "epoch": 40.29,
+      "grad_norm": 0.38427066802978516,
+      "learning_rate": 0.0003885167464114833,
+      "loss": 0.297,
+      "step": 25260
+    },
+    {
+      "epoch": 40.3,
+      "grad_norm": 0.4605743885040283,
+      "learning_rate": 0.0003878787878787879,
+      "loss": 0.2966,
+      "step": 25270
+    },
+    {
+      "epoch": 40.32,
+      "grad_norm": 0.3320145010948181,
+      "learning_rate": 0.0003872408293460925,
+      "loss": 0.2449,
+      "step": 25280
+    },
+    {
+      "epoch": 40.33,
+      "grad_norm": 0.23880721628665924,
+      "learning_rate": 0.00038660287081339714,
+      "loss": 0.2779,
+      "step": 25290
+    },
+    {
+      "epoch": 40.35,
+      "grad_norm": 0.442751407623291,
+      "learning_rate": 0.00038596491228070175,
+      "loss": 0.2729,
+      "step": 25300
+    },
+    {
+      "epoch": 40.37,
+      "grad_norm": 0.2670186758041382,
+      "learning_rate": 0.0003853269537480064,
+      "loss": 0.3296,
+      "step": 25310
+    },
+    {
+      "epoch": 40.38,
+      "grad_norm": 0.2149314135313034,
+      "learning_rate": 0.00038468899521531104,
+      "loss": 0.3094,
+      "step": 25320
+    },
+    {
+      "epoch": 40.4,
+      "grad_norm": 0.15769945085048676,
+      "learning_rate": 0.00038405103668261566,
+      "loss": 0.2962,
+      "step": 25330
+    },
+    {
+      "epoch": 40.41,
+      "grad_norm": 0.30012694001197815,
+      "learning_rate": 0.0003834130781499203,
+      "loss": 0.2864,
+      "step": 25340
+    },
+    {
+      "epoch": 40.43,
+      "grad_norm": 0.6400253772735596,
+      "learning_rate": 0.0003827751196172249,
+      "loss": 0.4076,
+      "step": 25350
+    },
+    {
+      "epoch": 40.45,
+      "grad_norm": 0.5464116334915161,
+      "learning_rate": 0.0003821371610845295,
+      "loss": 0.3281,
+      "step": 25360
+    },
+    {
+      "epoch": 40.46,
+      "grad_norm": 0.463392972946167,
+      "learning_rate": 0.0003814992025518341,
+      "loss": 0.3192,
+      "step": 25370
+    },
+    {
+      "epoch": 40.48,
+      "grad_norm": 0.1991080492734909,
+      "learning_rate": 0.00038086124401913874,
+      "loss": 0.2582,
+      "step": 25380
+    },
+    {
+      "epoch": 40.49,
+      "grad_norm": 0.5955290198326111,
+      "learning_rate": 0.0003802232854864434,
+      "loss": 0.4031,
+      "step": 25390
+    },
+    {
+      "epoch": 40.51,
+      "grad_norm": 0.22706195712089539,
+      "learning_rate": 0.00037958532695374803,
+      "loss": 0.2928,
+      "step": 25400
+    },
+    {
+      "epoch": 40.53,
+      "grad_norm": 0.4163839817047119,
+      "learning_rate": 0.00037894736842105265,
+      "loss": 0.2956,
+      "step": 25410
+    },
+    {
+      "epoch": 40.54,
+      "grad_norm": 0.2746015787124634,
+      "learning_rate": 0.00037830940988835726,
+      "loss": 0.2378,
+      "step": 25420
+    },
+    {
+      "epoch": 40.56,
+      "grad_norm": 0.23401568830013275,
+      "learning_rate": 0.0003776714513556619,
+      "loss": 0.3618,
+      "step": 25430
+    },
+    {
+      "epoch": 40.57,
+      "grad_norm": 1.4698227643966675,
+      "learning_rate": 0.0003770334928229665,
+      "loss": 0.3472,
+      "step": 25440
+    },
+    {
+      "epoch": 40.59,
+      "grad_norm": 0.29799923300743103,
+      "learning_rate": 0.0003763955342902711,
+      "loss": 0.321,
+      "step": 25450
+    },
+    {
+      "epoch": 40.61,
+      "grad_norm": 0.27735623717308044,
+      "learning_rate": 0.00037575757575757573,
+      "loss": 0.346,
+      "step": 25460
+    },
+    {
+      "epoch": 40.62,
+      "grad_norm": 0.34145793318748474,
+      "learning_rate": 0.0003751196172248804,
+      "loss": 0.3426,
+      "step": 25470
+    },
+    {
+      "epoch": 40.64,
+      "grad_norm": 0.24481597542762756,
+      "learning_rate": 0.000374481658692185,
+      "loss": 0.2791,
+      "step": 25480
+    },
+    {
+      "epoch": 40.65,
+      "grad_norm": 0.5041400194168091,
+      "learning_rate": 0.00037384370015948964,
+      "loss": 0.2817,
+      "step": 25490
+    },
+    {
+      "epoch": 40.67,
+      "grad_norm": 0.3849920332431793,
+      "learning_rate": 0.00037320574162679425,
+      "loss": 0.3522,
+      "step": 25500
+    },
+    {
+      "epoch": 40.69,
+      "grad_norm": 0.4459153413772583,
+      "learning_rate": 0.00037256778309409887,
+      "loss": 0.3244,
+      "step": 25510
+    },
+    {
+      "epoch": 40.7,
+      "grad_norm": 0.441022127866745,
+      "learning_rate": 0.0003719298245614035,
+      "loss": 0.2834,
+      "step": 25520
+    },
+    {
+      "epoch": 40.72,
+      "grad_norm": 0.16988414525985718,
+      "learning_rate": 0.0003712918660287081,
+      "loss": 0.2639,
+      "step": 25530
+    },
+    {
+      "epoch": 40.73,
+      "grad_norm": 0.3544873893260956,
+      "learning_rate": 0.0003706539074960127,
+      "loss": 0.3157,
+      "step": 25540
+    },
+    {
+      "epoch": 40.75,
+      "grad_norm": 0.6139649152755737,
+      "learning_rate": 0.0003700159489633174,
+      "loss": 0.3729,
+      "step": 25550
+    },
+    {
+      "epoch": 40.77,
+      "grad_norm": 0.22452673316001892,
+      "learning_rate": 0.000369377990430622,
+      "loss": 0.2209,
+      "step": 25560
+    },
+    {
+      "epoch": 40.78,
+      "grad_norm": 0.458019495010376,
+      "learning_rate": 0.0003687400318979267,
+      "loss": 0.3738,
+      "step": 25570
+    },
+    {
+      "epoch": 40.8,
+      "grad_norm": 0.24333609640598297,
+      "learning_rate": 0.0003681020733652313,
+      "loss": 0.3406,
+      "step": 25580
+    },
+    {
+      "epoch": 40.81,
+      "grad_norm": 0.135534405708313,
+      "learning_rate": 0.0003674641148325359,
+      "loss": 0.2845,
+      "step": 25590
+    },
+    {
+      "epoch": 40.83,
+      "grad_norm": 1.0264251232147217,
+      "learning_rate": 0.00036682615629984053,
+      "loss": 0.3014,
+      "step": 25600
+    },
+    {
+      "epoch": 40.85,
+      "grad_norm": 0.5027388334274292,
+      "learning_rate": 0.00036618819776714515,
+      "loss": 0.2648,
+      "step": 25610
+    },
+    {
+      "epoch": 40.86,
+      "grad_norm": 0.37629154324531555,
+      "learning_rate": 0.00036555023923444976,
+      "loss": 0.3263,
+      "step": 25620
+    },
+    {
+      "epoch": 40.88,
+      "grad_norm": 0.16155029833316803,
+      "learning_rate": 0.00036491228070175443,
+      "loss": 0.2677,
+      "step": 25630
+    },
+    {
+      "epoch": 40.89,
+      "grad_norm": 0.5950889587402344,
+      "learning_rate": 0.00036427432216905905,
+      "loss": 0.2737,
+      "step": 25640
+    },
+    {
+      "epoch": 40.91,
+      "grad_norm": 1.288246750831604,
+      "learning_rate": 0.00036363636363636367,
+      "loss": 0.3458,
+      "step": 25650
+    },
+    {
+      "epoch": 40.93,
+      "grad_norm": 0.21823683381080627,
+      "learning_rate": 0.0003629984051036683,
+      "loss": 0.2528,
+      "step": 25660
+    },
+    {
+      "epoch": 40.94,
+      "grad_norm": 0.2102632224559784,
+      "learning_rate": 0.0003623604465709729,
+      "loss": 0.3346,
+      "step": 25670
+    },
+    {
+      "epoch": 40.96,
+      "grad_norm": 0.753999650478363,
+      "learning_rate": 0.0003617224880382775,
+      "loss": 0.3758,
+      "step": 25680
+    },
+    {
+      "epoch": 40.97,
+      "grad_norm": 0.20464596152305603,
+      "learning_rate": 0.00036108452950558214,
+      "loss": 0.3724,
+      "step": 25690
+    },
+    {
+      "epoch": 40.99,
+      "grad_norm": 0.38693875074386597,
+      "learning_rate": 0.00036044657097288675,
+      "loss": 0.4987,
+      "step": 25700
+    },
+    {
+      "epoch": 41.0,
+      "grad_norm": 1.1584486961364746,
+      "learning_rate": 0.0003598086124401914,
+      "loss": 0.3108,
+      "step": 25710
+    },
+    {
+      "epoch": 41.02,
+      "grad_norm": 0.23398354649543762,
+      "learning_rate": 0.00035917065390749604,
+      "loss": 0.321,
+      "step": 25720
+    },
+    {
+      "epoch": 41.04,
+      "grad_norm": 0.265209823846817,
+      "learning_rate": 0.00035853269537480066,
+      "loss": 0.379,
+      "step": 25730
+    },
+    {
+      "epoch": 41.05,
+      "grad_norm": 0.5159454941749573,
+      "learning_rate": 0.0003578947368421053,
+      "loss": 0.2849,
+      "step": 25740
+    },
+    {
+      "epoch": 41.07,
+      "grad_norm": 0.3185652792453766,
+      "learning_rate": 0.0003572567783094099,
+      "loss": 0.2885,
+      "step": 25750
+    },
+    {
+      "epoch": 41.08,
+      "grad_norm": 0.6398610472679138,
+      "learning_rate": 0.0003566188197767145,
+      "loss": 0.3583,
+      "step": 25760
+    },
+    {
+      "epoch": 41.1,
+      "grad_norm": 0.5768219232559204,
+      "learning_rate": 0.0003559808612440191,
+      "loss": 0.3427,
+      "step": 25770
+    },
+    {
+      "epoch": 41.12,
+      "grad_norm": 0.5042071342468262,
+      "learning_rate": 0.00035534290271132374,
+      "loss": 0.3047,
+      "step": 25780
+    },
+    {
+      "epoch": 41.13,
+      "grad_norm": 0.20871587097644806,
+      "learning_rate": 0.0003547049441786284,
+      "loss": 0.2634,
+      "step": 25790
+    },
+    {
+      "epoch": 41.15,
+      "grad_norm": 0.20863570272922516,
+      "learning_rate": 0.00035406698564593303,
+      "loss": 0.3444,
+      "step": 25800
+    },
+    {
+      "epoch": 41.16,
+      "grad_norm": 0.43497905135154724,
+      "learning_rate": 0.00035342902711323765,
+      "loss": 0.3717,
+      "step": 25810
+    },
+    {
+      "epoch": 41.18,
+      "grad_norm": 0.5420474410057068,
+      "learning_rate": 0.00035279106858054226,
+      "loss": 0.2936,
+      "step": 25820
+    },
+    {
+      "epoch": 41.2,
+      "grad_norm": 0.16857664287090302,
+      "learning_rate": 0.0003521531100478469,
+      "loss": 0.2317,
+      "step": 25830
+    },
+    {
+      "epoch": 41.21,
+      "grad_norm": 0.176952064037323,
+      "learning_rate": 0.0003515151515151515,
+      "loss": 0.2895,
+      "step": 25840
+    },
+    {
+      "epoch": 41.23,
+      "grad_norm": 0.3629634380340576,
+      "learning_rate": 0.0003508771929824561,
+      "loss": 0.3161,
+      "step": 25850
+    },
+    {
+      "epoch": 41.24,
+      "grad_norm": 0.3649951219558716,
+      "learning_rate": 0.00035023923444976073,
+      "loss": 0.3293,
+      "step": 25860
+    },
+    {
+      "epoch": 41.26,
+      "grad_norm": 0.2517475187778473,
+      "learning_rate": 0.0003496012759170654,
+      "loss": 0.3179,
+      "step": 25870
+    },
+    {
+      "epoch": 41.28,
+      "grad_norm": 0.18728438019752502,
+      "learning_rate": 0.00034896331738437,
+      "loss": 0.278,
+      "step": 25880
+    },
+    {
+      "epoch": 41.29,
+      "grad_norm": 0.3795156180858612,
+      "learning_rate": 0.00034832535885167464,
+      "loss": 0.3076,
+      "step": 25890
+    },
+    {
+      "epoch": 41.31,
+      "grad_norm": 0.6630691289901733,
+      "learning_rate": 0.00034768740031897925,
+      "loss": 0.3059,
+      "step": 25900
+    },
+    {
+      "epoch": 41.32,
+      "grad_norm": 0.5528631210327148,
+      "learning_rate": 0.00034704944178628387,
+      "loss": 0.287,
+      "step": 25910
+    },
+    {
+      "epoch": 41.34,
+      "grad_norm": 0.43808212876319885,
+      "learning_rate": 0.0003464114832535885,
+      "loss": 0.2863,
+      "step": 25920
+    },
+    {
+      "epoch": 41.36,
+      "grad_norm": 0.18791545927524567,
+      "learning_rate": 0.0003457735247208931,
+      "loss": 0.3391,
+      "step": 25930
+    },
+    {
+      "epoch": 41.37,
+      "grad_norm": 0.3744913935661316,
+      "learning_rate": 0.0003451355661881977,
+      "loss": 0.3083,
+      "step": 25940
+    },
+    {
+      "epoch": 41.39,
+      "grad_norm": 0.48115044832229614,
+      "learning_rate": 0.00034449760765550245,
+      "loss": 0.3296,
+      "step": 25950
+    },
+    {
+      "epoch": 41.4,
+      "grad_norm": 0.38108351826667786,
+      "learning_rate": 0.00034385964912280706,
+      "loss": 0.2663,
+      "step": 25960
+    },
+    {
+      "epoch": 41.42,
+      "grad_norm": 0.3938140869140625,
+      "learning_rate": 0.0003432216905901117,
+      "loss": 0.3353,
+      "step": 25970
+    },
+    {
+      "epoch": 41.44,
+      "grad_norm": 0.2402111142873764,
+      "learning_rate": 0.0003425837320574163,
+      "loss": 0.3786,
+      "step": 25980
+    },
+    {
+      "epoch": 41.45,
+      "grad_norm": 0.39668262004852295,
+      "learning_rate": 0.0003419457735247209,
+      "loss": 0.2367,
+      "step": 25990
+    },
+    {
+      "epoch": 41.47,
+      "grad_norm": 0.3418915569782257,
+      "learning_rate": 0.00034130781499202553,
+      "loss": 0.2675,
+      "step": 26000
+    },
+    {
+      "epoch": 41.48,
+      "grad_norm": 0.5036392211914062,
+      "learning_rate": 0.00034066985645933015,
+      "loss": 0.4014,
+      "step": 26010
+    },
+    {
+      "epoch": 41.5,
+      "grad_norm": 0.4944436550140381,
+      "learning_rate": 0.0003400318979266348,
+      "loss": 0.3061,
+      "step": 26020
+    },
+    {
+      "epoch": 41.52,
+      "grad_norm": 0.36498111486434937,
+      "learning_rate": 0.00033939393939393943,
+      "loss": 0.2843,
+      "step": 26030
+    },
+    {
+      "epoch": 41.53,
+      "grad_norm": 0.4892807900905609,
+      "learning_rate": 0.00033875598086124405,
+      "loss": 0.349,
+      "step": 26040
+    },
+    {
+      "epoch": 41.55,
+      "grad_norm": 0.23948755860328674,
+      "learning_rate": 0.00033811802232854867,
+      "loss": 0.2737,
+      "step": 26050
+    },
+    {
+      "epoch": 41.56,
+      "grad_norm": 0.22319771349430084,
+      "learning_rate": 0.0003374800637958533,
+      "loss": 0.2962,
+      "step": 26060
+    },
+    {
+      "epoch": 41.58,
+      "grad_norm": 0.3389337956905365,
+      "learning_rate": 0.0003368421052631579,
+      "loss": 0.3462,
+      "step": 26070
+    },
+    {
+      "epoch": 41.59,
+      "grad_norm": 0.15749427676200867,
+      "learning_rate": 0.0003362041467304625,
+      "loss": 0.3707,
+      "step": 26080
+    },
+    {
+      "epoch": 41.61,
+      "grad_norm": 0.5987353324890137,
+      "learning_rate": 0.00033556618819776714,
+      "loss": 0.3666,
+      "step": 26090
+    },
+    {
+      "epoch": 41.63,
+      "grad_norm": 0.21494194865226746,
+      "learning_rate": 0.0003349282296650718,
+      "loss": 0.296,
+      "step": 26100
+    },
+    {
+      "epoch": 41.64,
+      "grad_norm": 0.4202018976211548,
+      "learning_rate": 0.0003342902711323764,
+      "loss": 0.3,
+      "step": 26110
+    },
+    {
+      "epoch": 41.66,
+      "grad_norm": 0.33832699060440063,
+      "learning_rate": 0.00033365231259968104,
+      "loss": 0.3593,
+      "step": 26120
+    },
+    {
+      "epoch": 41.67,
+      "grad_norm": 0.18312333524227142,
+      "learning_rate": 0.00033301435406698566,
+      "loss": 0.2593,
+      "step": 26130
+    },
+    {
+      "epoch": 41.69,
+      "grad_norm": 0.6346192359924316,
+      "learning_rate": 0.0003323763955342903,
+      "loss": 0.3646,
+      "step": 26140
+    },
+    {
+      "epoch": 41.71,
+      "grad_norm": 0.2041671872138977,
+      "learning_rate": 0.0003317384370015949,
+      "loss": 0.2913,
+      "step": 26150
+    },
+    {
+      "epoch": 41.72,
+      "grad_norm": 0.23247523605823517,
+      "learning_rate": 0.0003311004784688995,
+      "loss": 0.2864,
+      "step": 26160
+    },
+    {
+      "epoch": 41.74,
+      "grad_norm": 0.6074626445770264,
+      "learning_rate": 0.0003304625199362041,
+      "loss": 0.2761,
+      "step": 26170
+    },
+    {
+      "epoch": 41.75,
+      "grad_norm": 0.2906535267829895,
+      "learning_rate": 0.0003298245614035088,
+      "loss": 0.2929,
+      "step": 26180
+    },
+    {
+      "epoch": 41.77,
+      "grad_norm": 0.36293816566467285,
+      "learning_rate": 0.0003291866028708134,
+      "loss": 0.2971,
+      "step": 26190
+    },
+    {
+      "epoch": 41.79,
+      "grad_norm": 0.3410266041755676,
+      "learning_rate": 0.00032854864433811803,
+      "loss": 0.2934,
+      "step": 26200
+    },
+    {
+      "epoch": 41.8,
+      "grad_norm": 0.5327407717704773,
+      "learning_rate": 0.00032791068580542265,
+      "loss": 0.3566,
+      "step": 26210
+    },
+    {
+      "epoch": 41.82,
+      "grad_norm": 0.4243089556694031,
+      "learning_rate": 0.00032727272727272726,
+      "loss": 0.2588,
+      "step": 26220
+    },
+    {
+      "epoch": 41.83,
+      "grad_norm": 0.3032602369785309,
+      "learning_rate": 0.0003266347687400319,
+      "loss": 0.2629,
+      "step": 26230
+    },
+    {
+      "epoch": 41.85,
+      "grad_norm": 0.4830479621887207,
+      "learning_rate": 0.0003259968102073365,
+      "loss": 0.3939,
+      "step": 26240
+    },
+    {
+      "epoch": 41.87,
+      "grad_norm": 0.11178059130907059,
+      "learning_rate": 0.0003253588516746411,
+      "loss": 0.2336,
+      "step": 26250
+    },
+    {
+      "epoch": 41.88,
+      "grad_norm": 0.11337348073720932,
+      "learning_rate": 0.0003247208931419458,
+      "loss": 0.2311,
+      "step": 26260
+    },
+    {
+      "epoch": 41.9,
+      "grad_norm": 0.42159444093704224,
+      "learning_rate": 0.0003240829346092504,
+      "loss": 0.3213,
+      "step": 26270
+    },
+    {
+      "epoch": 41.91,
+      "grad_norm": 0.36887168884277344,
+      "learning_rate": 0.000323444976076555,
+      "loss": 0.2928,
+      "step": 26280
+    },
+    {
+      "epoch": 41.93,
+      "grad_norm": 0.4706740081310272,
+      "learning_rate": 0.00032280701754385964,
+      "loss": 0.2848,
+      "step": 26290
+    },
+    {
+      "epoch": 41.95,
+      "grad_norm": 0.5931901335716248,
+      "learning_rate": 0.00032216905901116425,
+      "loss": 0.3273,
+      "step": 26300
+    },
+    {
+      "epoch": 41.96,
+      "grad_norm": 0.21357150375843048,
+      "learning_rate": 0.00032153110047846887,
+      "loss": 0.2859,
+      "step": 26310
+    },
+    {
+      "epoch": 41.98,
+      "grad_norm": 0.48659244179725647,
+      "learning_rate": 0.00032089314194577354,
+      "loss": 0.3588,
+      "step": 26320
+    },
+    {
+      "epoch": 41.99,
+      "grad_norm": 0.28712713718414307,
+      "learning_rate": 0.00032025518341307816,
+      "loss": 0.3016,
+      "step": 26330
+    },
+    {
+      "epoch": 42.01,
+      "grad_norm": 0.5351189970970154,
+      "learning_rate": 0.00031961722488038283,
+      "loss": 0.3306,
+      "step": 26340
+    },
+    {
+      "epoch": 42.03,
+      "grad_norm": 0.22305412590503693,
+      "learning_rate": 0.00031897926634768745,
+      "loss": 0.283,
+      "step": 26350
+    },
+    {
+      "epoch": 42.04,
+      "grad_norm": 0.3026597797870636,
+      "learning_rate": 0.00031834130781499206,
+      "loss": 0.2912,
+      "step": 26360
+    },
+    {
+      "epoch": 42.06,
+      "grad_norm": 0.3411235809326172,
+      "learning_rate": 0.0003177033492822967,
+      "loss": 0.3273,
+      "step": 26370
+    },
+    {
+      "epoch": 42.07,
+      "grad_norm": 0.31902214884757996,
+      "learning_rate": 0.0003170653907496013,
+      "loss": 0.2511,
+      "step": 26380
+    },
+    {
+      "epoch": 42.09,
+      "grad_norm": 0.2367999106645584,
+      "learning_rate": 0.0003164274322169059,
+      "loss": 0.2427,
+      "step": 26390
+    },
+    {
+      "epoch": 42.11,
+      "grad_norm": 0.24773749709129333,
+      "learning_rate": 0.00031578947368421053,
+      "loss": 0.2991,
+      "step": 26400
+    },
+    {
+      "epoch": 42.12,
+      "grad_norm": 0.33940422534942627,
+      "learning_rate": 0.00031515151515151515,
+      "loss": 0.2688,
+      "step": 26410
+    },
+    {
+      "epoch": 42.14,
+      "grad_norm": 0.4297594130039215,
+      "learning_rate": 0.0003145135566188198,
+      "loss": 0.2882,
+      "step": 26420
+    },
+    {
+      "epoch": 42.15,
+      "grad_norm": 0.38739773631095886,
+      "learning_rate": 0.00031387559808612443,
+      "loss": 0.299,
+      "step": 26430
+    },
+    {
+      "epoch": 42.17,
+      "grad_norm": 0.19908225536346436,
+      "learning_rate": 0.00031323763955342905,
+      "loss": 0.2284,
+      "step": 26440
+    },
+    {
+      "epoch": 42.19,
+      "grad_norm": 0.2024683952331543,
+      "learning_rate": 0.00031259968102073367,
+      "loss": 0.27,
+      "step": 26450
+    },
+    {
+      "epoch": 42.2,
+      "grad_norm": 0.27837881445884705,
+      "learning_rate": 0.0003119617224880383,
+      "loss": 0.3353,
+      "step": 26460
+    },
+    {
+      "epoch": 42.22,
+      "grad_norm": 0.25491103529930115,
+      "learning_rate": 0.0003113237639553429,
+      "loss": 0.3187,
+      "step": 26470
+    },
+    {
+      "epoch": 42.23,
+      "grad_norm": 0.430846244096756,
+      "learning_rate": 0.0003106858054226475,
+      "loss": 0.2994,
+      "step": 26480
+    },
+    {
+      "epoch": 42.25,
+      "grad_norm": 0.3018259108066559,
+      "learning_rate": 0.00031004784688995214,
+      "loss": 0.3171,
+      "step": 26490
+    },
+    {
+      "epoch": 42.26,
+      "grad_norm": 0.29348355531692505,
+      "learning_rate": 0.0003094098883572568,
+      "loss": 0.3,
+      "step": 26500
+    },
+    {
+      "epoch": 42.28,
+      "grad_norm": 0.3258605897426605,
+      "learning_rate": 0.0003087719298245614,
+      "loss": 0.2795,
+      "step": 26510
+    },
+    {
+      "epoch": 42.3,
+      "grad_norm": 0.17465408146381378,
+      "learning_rate": 0.00030813397129186604,
+      "loss": 0.3106,
+      "step": 26520
+    },
+    {
+      "epoch": 42.31,
+      "grad_norm": 0.2361348271369934,
+      "learning_rate": 0.00030749601275917066,
+      "loss": 0.2802,
+      "step": 26530
+    },
+    {
+      "epoch": 42.33,
+      "grad_norm": 0.18255957961082458,
+      "learning_rate": 0.0003068580542264753,
+      "loss": 0.2396,
+      "step": 26540
+    },
+    {
+      "epoch": 42.34,
+      "grad_norm": 0.5694864988327026,
+      "learning_rate": 0.0003062200956937799,
+      "loss": 0.3536,
+      "step": 26550
+    },
+    {
+      "epoch": 42.36,
+      "grad_norm": 0.37303659319877625,
+      "learning_rate": 0.0003055821371610845,
+      "loss": 0.263,
+      "step": 26560
+    },
+    {
+      "epoch": 42.38,
+      "grad_norm": 0.3398790657520294,
+      "learning_rate": 0.0003049441786283891,
+      "loss": 0.2204,
+      "step": 26570
+    },
+    {
+      "epoch": 42.39,
+      "grad_norm": 0.28415796160697937,
+      "learning_rate": 0.0003043062200956938,
+      "loss": 0.3556,
+      "step": 26580
+    },
+    {
+      "epoch": 42.41,
+      "grad_norm": 0.4093596637248993,
+      "learning_rate": 0.0003036682615629984,
+      "loss": 0.2996,
+      "step": 26590
+    },
+    {
+      "epoch": 42.42,
+      "grad_norm": 0.25546014308929443,
+      "learning_rate": 0.00030303030303030303,
+      "loss": 0.3492,
+      "step": 26600
+    },
+    {
+      "epoch": 42.44,
+      "grad_norm": 0.7774071097373962,
+      "learning_rate": 0.00030239234449760765,
+      "loss": 0.3175,
+      "step": 26610
+    },
+    {
+      "epoch": 42.46,
+      "grad_norm": 0.7066117525100708,
+      "learning_rate": 0.00030175438596491226,
+      "loss": 0.3957,
+      "step": 26620
+    },
+    {
+      "epoch": 42.47,
+      "grad_norm": 0.42754918336868286,
+      "learning_rate": 0.0003011164274322169,
+      "loss": 0.2983,
+      "step": 26630
+    },
+    {
+      "epoch": 42.49,
+      "grad_norm": 0.5412092208862305,
+      "learning_rate": 0.0003004784688995215,
+      "loss": 0.3556,
+      "step": 26640
+    },
+    {
+      "epoch": 42.5,
+      "grad_norm": 0.19610168039798737,
+      "learning_rate": 0.0002998405103668261,
+      "loss": 0.309,
+      "step": 26650
+    },
+    {
+      "epoch": 42.52,
+      "grad_norm": 0.4178897738456726,
+      "learning_rate": 0.0002992025518341308,
+      "loss": 0.2874,
+      "step": 26660
+    },
+    {
+      "epoch": 42.54,
+      "grad_norm": 0.24159128963947296,
+      "learning_rate": 0.0002985645933014354,
+      "loss": 0.3137,
+      "step": 26670
+    },
+    {
+      "epoch": 42.55,
+      "grad_norm": 0.3273567259311676,
+      "learning_rate": 0.00029792663476874,
+      "loss": 0.3238,
+      "step": 26680
+    },
+    {
+      "epoch": 42.57,
+      "grad_norm": 0.48245471715927124,
+      "learning_rate": 0.0002972886762360447,
+      "loss": 0.42,
+      "step": 26690
+    },
+    {
+      "epoch": 42.58,
+      "grad_norm": 0.7114046216011047,
+      "learning_rate": 0.0002966507177033493,
+      "loss": 0.3498,
+      "step": 26700
+    },
+    {
+      "epoch": 42.6,
+      "grad_norm": 0.6506601572036743,
+      "learning_rate": 0.0002960127591706539,
+      "loss": 0.3406,
+      "step": 26710
+    },
+    {
+      "epoch": 42.62,
+      "grad_norm": 0.5458781719207764,
+      "learning_rate": 0.00029537480063795854,
+      "loss": 0.2737,
+      "step": 26720
+    },
+    {
+      "epoch": 42.63,
+      "grad_norm": 0.19456742703914642,
+      "learning_rate": 0.00029473684210526316,
+      "loss": 0.31,
+      "step": 26730
+    },
+    {
+      "epoch": 42.65,
+      "grad_norm": 0.178878054022789,
+      "learning_rate": 0.00029409888357256783,
+      "loss": 0.3827,
+      "step": 26740
+    },
+    {
+      "epoch": 42.66,
+      "grad_norm": 0.40357646346092224,
+      "learning_rate": 0.00029346092503987245,
+      "loss": 0.2755,
+      "step": 26750
+    },
+    {
+      "epoch": 42.68,
+      "grad_norm": 0.5037977695465088,
+      "learning_rate": 0.00029282296650717706,
+      "loss": 0.3228,
+      "step": 26760
+    },
+    {
+      "epoch": 42.7,
+      "grad_norm": 0.20705698430538177,
+      "learning_rate": 0.0002921850079744817,
+      "loss": 0.2309,
+      "step": 26770
+    },
+    {
+      "epoch": 42.71,
+      "grad_norm": 0.22491195797920227,
+      "learning_rate": 0.0002915470494417863,
+      "loss": 0.2569,
+      "step": 26780
+    },
+    {
+      "epoch": 42.73,
+      "grad_norm": 0.270967036485672,
+      "learning_rate": 0.0002909090909090909,
+      "loss": 0.2846,
+      "step": 26790
+    },
+    {
+      "epoch": 42.74,
+      "grad_norm": 0.1675962209701538,
+      "learning_rate": 0.00029027113237639553,
+      "loss": 0.3479,
+      "step": 26800
+    },
+    {
+      "epoch": 42.76,
+      "grad_norm": 0.24002137780189514,
+      "learning_rate": 0.0002896331738437002,
+      "loss": 0.3825,
+      "step": 26810
+    },
+    {
+      "epoch": 42.78,
+      "grad_norm": 0.7108230590820312,
+      "learning_rate": 0.0002889952153110048,
+      "loss": 0.4057,
+      "step": 26820
+    },
+    {
+      "epoch": 42.79,
+      "grad_norm": 0.5931742787361145,
+      "learning_rate": 0.00028835725677830943,
+      "loss": 0.3256,
+      "step": 26830
+    },
+    {
+      "epoch": 42.81,
+      "grad_norm": 0.4527370035648346,
+      "learning_rate": 0.00028771929824561405,
+      "loss": 0.2943,
+      "step": 26840
+    },
+    {
+      "epoch": 42.82,
+      "grad_norm": 0.6159200072288513,
+      "learning_rate": 0.00028708133971291867,
+      "loss": 0.3117,
+      "step": 26850
+    },
+    {
+      "epoch": 42.84,
+      "grad_norm": 0.1614978313446045,
+      "learning_rate": 0.0002864433811802233,
+      "loss": 0.2834,
+      "step": 26860
+    },
+    {
+      "epoch": 42.85,
+      "grad_norm": 0.37030118703842163,
+      "learning_rate": 0.0002858054226475279,
+      "loss": 0.3029,
+      "step": 26870
+    },
+    {
+      "epoch": 42.87,
+      "grad_norm": 0.13131965696811676,
+      "learning_rate": 0.0002851674641148325,
+      "loss": 0.27,
+      "step": 26880
+    },
+    {
+      "epoch": 42.89,
+      "grad_norm": 0.42525768280029297,
+      "learning_rate": 0.0002845295055821372,
+      "loss": 0.3307,
+      "step": 26890
+    },
+    {
+      "epoch": 42.9,
+      "grad_norm": 0.17870941758155823,
+      "learning_rate": 0.0002838915470494418,
+      "loss": 0.2564,
+      "step": 26900
+    },
+    {
+      "epoch": 42.92,
+      "grad_norm": 0.7622866630554199,
+      "learning_rate": 0.0002832535885167464,
+      "loss": 0.3327,
+      "step": 26910
+    },
+    {
+      "epoch": 42.93,
+      "grad_norm": 0.5731341242790222,
+      "learning_rate": 0.00028261562998405104,
+      "loss": 0.3194,
+      "step": 26920
+    },
+    {
+      "epoch": 42.95,
+      "grad_norm": 0.3763886094093323,
+      "learning_rate": 0.00028197767145135566,
+      "loss": 0.2775,
+      "step": 26930
+    },
+    {
+      "epoch": 42.97,
+      "grad_norm": 0.33604711294174194,
+      "learning_rate": 0.0002813397129186603,
+      "loss": 0.2825,
+      "step": 26940
+    },
+    {
+      "epoch": 42.98,
+      "grad_norm": 0.2752174437046051,
+      "learning_rate": 0.0002807017543859649,
+      "loss": 0.3769,
+      "step": 26950
+    },
+    {
+      "epoch": 43.0,
+      "grad_norm": 0.4602324962615967,
+      "learning_rate": 0.0002800637958532695,
+      "loss": 0.3297,
+      "step": 26960
+    },
+    {
+      "epoch": 43.01,
+      "grad_norm": 0.263231486082077,
+      "learning_rate": 0.0002794258373205742,
+      "loss": 0.226,
+      "step": 26970
+    },
+    {
+      "epoch": 43.03,
+      "grad_norm": 0.085409976541996,
+      "learning_rate": 0.0002787878787878788,
+      "loss": 0.2887,
+      "step": 26980
+    },
+    {
+      "epoch": 43.05,
+      "grad_norm": 0.3499665856361389,
+      "learning_rate": 0.0002781499202551834,
+      "loss": 0.3344,
+      "step": 26990
+    },
+    {
+      "epoch": 43.06,
+      "grad_norm": 0.6164402365684509,
+      "learning_rate": 0.00027751196172248803,
+      "loss": 0.3555,
+      "step": 27000
+    },
+    {
+      "epoch": 43.08,
+      "grad_norm": 0.22411352396011353,
+      "learning_rate": 0.00027687400318979265,
+      "loss": 0.3044,
+      "step": 27010
+    },
+    {
+      "epoch": 43.09,
+      "grad_norm": 0.5322696566581726,
+      "learning_rate": 0.00027623604465709726,
+      "loss": 0.2515,
+      "step": 27020
+    },
+    {
+      "epoch": 43.11,
+      "grad_norm": 0.382097989320755,
+      "learning_rate": 0.0002755980861244019,
+      "loss": 0.399,
+      "step": 27030
+    },
+    {
+      "epoch": 43.13,
+      "grad_norm": 0.13839659094810486,
+      "learning_rate": 0.0002749601275917065,
+      "loss": 0.3297,
+      "step": 27040
+    },
+    {
+      "epoch": 43.14,
+      "grad_norm": 0.22009891271591187,
+      "learning_rate": 0.00027432216905901117,
+      "loss": 0.277,
+      "step": 27050
+    },
+    {
+      "epoch": 43.16,
+      "grad_norm": 0.41159576177597046,
+      "learning_rate": 0.00027368421052631584,
+      "loss": 0.3107,
+      "step": 27060
+    },
+    {
+      "epoch": 43.17,
+      "grad_norm": 0.21699748933315277,
+      "learning_rate": 0.00027304625199362046,
+      "loss": 0.2765,
+      "step": 27070
+    },
+    {
+      "epoch": 43.19,
+      "grad_norm": 0.21291545033454895,
+      "learning_rate": 0.0002724082934609251,
+      "loss": 0.2479,
+      "step": 27080
+    },
+    {
+      "epoch": 43.21,
+      "grad_norm": 0.20848800241947174,
+      "learning_rate": 0.0002717703349282297,
+      "loss": 0.316,
+      "step": 27090
+    },
+    {
+      "epoch": 43.22,
+      "grad_norm": 0.41950148344039917,
+      "learning_rate": 0.0002711323763955343,
+      "loss": 0.3588,
+      "step": 27100
+    },
+    {
+      "epoch": 43.24,
+      "grad_norm": 0.33547741174697876,
+      "learning_rate": 0.0002704944178628389,
+      "loss": 0.2836,
+      "step": 27110
+    },
+    {
+      "epoch": 43.25,
+      "grad_norm": 1.4663803577423096,
+      "learning_rate": 0.00026985645933014354,
+      "loss": 0.3852,
+      "step": 27120
+    },
+    {
+      "epoch": 43.27,
+      "grad_norm": 0.2404787391424179,
+      "learning_rate": 0.0002692185007974482,
+      "loss": 0.2629,
+      "step": 27130
+    },
+    {
+      "epoch": 43.29,
+      "grad_norm": 0.08930987864732742,
+      "learning_rate": 0.00026858054226475283,
+      "loss": 0.1914,
+      "step": 27140
+    },
+    {
+      "epoch": 43.3,
+      "grad_norm": 0.3522126376628876,
+      "learning_rate": 0.00026794258373205745,
+      "loss": 0.3222,
+      "step": 27150
+    },
+    {
+      "epoch": 43.32,
+      "grad_norm": 2.05954909324646,
+      "learning_rate": 0.00026730462519936206,
+      "loss": 0.3058,
+      "step": 27160
+    },
+    {
+      "epoch": 43.33,
+      "grad_norm": 0.36962321400642395,
+      "learning_rate": 0.0002666666666666667,
+      "loss": 0.2829,
+      "step": 27170
+    },
+    {
+      "epoch": 43.35,
+      "grad_norm": 0.18911263346672058,
+      "learning_rate": 0.0002660287081339713,
+      "loss": 0.3731,
+      "step": 27180
+    },
+    {
+      "epoch": 43.37,
+      "grad_norm": 0.19024628400802612,
+      "learning_rate": 0.0002653907496012759,
+      "loss": 0.2726,
+      "step": 27190
+    },
+    {
+      "epoch": 43.38,
+      "grad_norm": 0.20783045887947083,
+      "learning_rate": 0.00026475279106858053,
+      "loss": 0.2787,
+      "step": 27200
+    },
+    {
+      "epoch": 43.4,
+      "grad_norm": 1.8203842639923096,
+      "learning_rate": 0.0002641148325358852,
+      "loss": 0.2755,
+      "step": 27210
+    },
+    {
+      "epoch": 43.41,
+      "grad_norm": 0.41969624161720276,
+      "learning_rate": 0.0002634768740031898,
+      "loss": 0.3315,
+      "step": 27220
+    },
+    {
+      "epoch": 43.43,
+      "grad_norm": 0.17119190096855164,
+      "learning_rate": 0.00026283891547049443,
+      "loss": 0.2718,
+      "step": 27230
+    },
+    {
+      "epoch": 43.44,
+      "grad_norm": 0.24514427781105042,
+      "learning_rate": 0.00026220095693779905,
+      "loss": 0.2831,
+      "step": 27240
+    },
+    {
+      "epoch": 43.46,
+      "grad_norm": 0.24649424850940704,
+      "learning_rate": 0.00026156299840510367,
+      "loss": 0.2647,
+      "step": 27250
+    },
+    {
+      "epoch": 43.48,
+      "grad_norm": 0.3236254155635834,
+      "learning_rate": 0.0002609250398724083,
+      "loss": 0.2733,
+      "step": 27260
+    },
+    {
+      "epoch": 43.49,
+      "grad_norm": 0.4180354177951813,
+      "learning_rate": 0.0002602870813397129,
+      "loss": 0.3352,
+      "step": 27270
+    },
+    {
+      "epoch": 43.51,
+      "grad_norm": 0.4652386009693146,
+      "learning_rate": 0.0002596491228070175,
+      "loss": 0.301,
+      "step": 27280
+    },
+    {
+      "epoch": 43.52,
+      "grad_norm": 0.30387723445892334,
+      "learning_rate": 0.0002590111642743222,
+      "loss": 0.2286,
+      "step": 27290
+    },
+    {
+      "epoch": 43.54,
+      "grad_norm": 0.13368535041809082,
+      "learning_rate": 0.0002583732057416268,
+      "loss": 0.2803,
+      "step": 27300
+    },
+    {
+      "epoch": 43.56,
+      "grad_norm": 0.10856983065605164,
+      "learning_rate": 0.0002577352472089314,
+      "loss": 0.3184,
+      "step": 27310
+    },
+    {
+      "epoch": 43.57,
+      "grad_norm": 0.3115447759628296,
+      "learning_rate": 0.00025709728867623604,
+      "loss": 0.3274,
+      "step": 27320
+    },
+    {
+      "epoch": 43.59,
+      "grad_norm": 0.3176775276660919,
+      "learning_rate": 0.00025645933014354066,
+      "loss": 0.3476,
+      "step": 27330
+    },
+    {
+      "epoch": 43.6,
+      "grad_norm": 0.17715303599834442,
+      "learning_rate": 0.0002558213716108453,
+      "loss": 0.2684,
+      "step": 27340
+    },
+    {
+      "epoch": 43.62,
+      "grad_norm": 0.39675870537757874,
+      "learning_rate": 0.0002551834130781499,
+      "loss": 0.317,
+      "step": 27350
+    },
+    {
+      "epoch": 43.64,
+      "grad_norm": 0.29539576172828674,
+      "learning_rate": 0.0002545454545454545,
+      "loss": 0.3124,
+      "step": 27360
+    },
+    {
+      "epoch": 43.65,
+      "grad_norm": 0.1516566276550293,
+      "learning_rate": 0.0002539074960127592,
+      "loss": 0.3537,
+      "step": 27370
+    },
+    {
+      "epoch": 43.67,
+      "grad_norm": 0.3762792944908142,
+      "learning_rate": 0.0002532695374800638,
+      "loss": 0.3026,
+      "step": 27380
+    },
+    {
+      "epoch": 43.68,
+      "grad_norm": 0.15428495407104492,
+      "learning_rate": 0.0002526315789473684,
+      "loss": 0.2849,
+      "step": 27390
+    },
+    {
+      "epoch": 43.7,
+      "grad_norm": 0.22668874263763428,
+      "learning_rate": 0.00025199362041467303,
+      "loss": 0.2174,
+      "step": 27400
+    },
+    {
+      "epoch": 43.72,
+      "grad_norm": 0.12878923118114471,
+      "learning_rate": 0.00025135566188197765,
+      "loss": 0.2367,
+      "step": 27410
+    },
+    {
+      "epoch": 43.73,
+      "grad_norm": 0.1742442101240158,
+      "learning_rate": 0.00025071770334928226,
+      "loss": 0.3225,
+      "step": 27420
+    },
+    {
+      "epoch": 43.75,
+      "grad_norm": 0.2178335189819336,
+      "learning_rate": 0.0002500797448165869,
+      "loss": 0.3674,
+      "step": 27430
+    },
+    {
+      "epoch": 43.76,
+      "grad_norm": 0.07598412036895752,
+      "learning_rate": 0.00024944178628389155,
+      "loss": 0.3825,
+      "step": 27440
+    },
+    {
+      "epoch": 43.78,
+      "grad_norm": 0.3597804605960846,
+      "learning_rate": 0.00024880382775119617,
+      "loss": 0.2679,
+      "step": 27450
+    },
+    {
+      "epoch": 43.8,
+      "grad_norm": 0.5584509968757629,
+      "learning_rate": 0.0002481658692185008,
+      "loss": 0.3459,
+      "step": 27460
+    },
+    {
+      "epoch": 43.81,
+      "grad_norm": 0.19587256014347076,
+      "learning_rate": 0.00024752791068580546,
+      "loss": 0.3732,
+      "step": 27470
+    },
+    {
+      "epoch": 43.83,
+      "grad_norm": 0.4442209303379059,
+      "learning_rate": 0.0002468899521531101,
+      "loss": 0.3049,
+      "step": 27480
+    },
+    {
+      "epoch": 43.84,
+      "grad_norm": 0.259143590927124,
+      "learning_rate": 0.0002462519936204147,
+      "loss": 0.2928,
+      "step": 27490
+    },
+    {
+      "epoch": 43.86,
+      "grad_norm": 0.19528359174728394,
+      "learning_rate": 0.0002456140350877193,
+      "loss": 0.2799,
+      "step": 27500
+    },
+    {
+      "epoch": 43.88,
+      "grad_norm": 0.47608378529548645,
+      "learning_rate": 0.0002449760765550239,
+      "loss": 0.3079,
+      "step": 27510
+    },
+    {
+      "epoch": 43.89,
+      "grad_norm": 0.2542645335197449,
+      "learning_rate": 0.00024433811802232854,
+      "loss": 0.2834,
+      "step": 27520
+    },
+    {
+      "epoch": 43.91,
+      "grad_norm": 0.37310686707496643,
+      "learning_rate": 0.00024370015948963318,
+      "loss": 0.3171,
+      "step": 27530
+    },
+    {
+      "epoch": 43.92,
+      "grad_norm": 0.6291790008544922,
+      "learning_rate": 0.00024306220095693783,
+      "loss": 0.3439,
+      "step": 27540
+    },
+    {
+      "epoch": 43.94,
+      "grad_norm": 0.5721063613891602,
+      "learning_rate": 0.00024242424242424245,
+      "loss": 0.3039,
+      "step": 27550
+    },
+    {
+      "epoch": 43.96,
+      "grad_norm": 0.1536693423986435,
+      "learning_rate": 0.00024178628389154706,
+      "loss": 0.2981,
+      "step": 27560
+    },
+    {
+      "epoch": 43.97,
+      "grad_norm": 0.3179001212120056,
+      "learning_rate": 0.00024114832535885168,
+      "loss": 0.2213,
+      "step": 27570
+    },
+    {
+      "epoch": 43.99,
+      "grad_norm": 0.19436044991016388,
+      "learning_rate": 0.00024051036682615632,
+      "loss": 0.2756,
+      "step": 27580
+    },
+    {
+      "epoch": 44.0,
+      "grad_norm": 0.21824301779270172,
+      "learning_rate": 0.00023987240829346094,
+      "loss": 0.2627,
+      "step": 27590
+    },
+    {
+      "epoch": 44.02,
+      "grad_norm": 0.38110193610191345,
+      "learning_rate": 0.00023923444976076556,
+      "loss": 0.248,
+      "step": 27600
+    },
+    {
+      "epoch": 44.04,
+      "grad_norm": 0.2160405069589615,
+      "learning_rate": 0.00023859649122807017,
+      "loss": 0.2404,
+      "step": 27610
+    },
+    {
+      "epoch": 44.05,
+      "grad_norm": 0.3136873245239258,
+      "learning_rate": 0.00023795853269537482,
+      "loss": 0.3034,
+      "step": 27620
+    },
+    {
+      "epoch": 44.07,
+      "grad_norm": 0.21699780225753784,
+      "learning_rate": 0.00023732057416267943,
+      "loss": 0.3371,
+      "step": 27630
+    },
+    {
+      "epoch": 44.08,
+      "grad_norm": 0.3122328221797943,
+      "learning_rate": 0.00023668261562998405,
+      "loss": 0.2569,
+      "step": 27640
+    },
+    {
+      "epoch": 44.1,
+      "grad_norm": 0.45483753085136414,
+      "learning_rate": 0.00023604465709728867,
+      "loss": 0.2873,
+      "step": 27650
+    },
+    {
+      "epoch": 44.11,
+      "grad_norm": 0.39906224608421326,
+      "learning_rate": 0.0002354066985645933,
+      "loss": 0.3227,
+      "step": 27660
+    },
+    {
+      "epoch": 44.13,
+      "grad_norm": 0.24932830035686493,
+      "learning_rate": 0.00023476874003189793,
+      "loss": 0.2343,
+      "step": 27670
+    },
+    {
+      "epoch": 44.15,
+      "grad_norm": 0.09502261132001877,
+      "learning_rate": 0.00023413078149920255,
+      "loss": 0.3065,
+      "step": 27680
+    },
+    {
+      "epoch": 44.16,
+      "grad_norm": 0.3910047709941864,
+      "learning_rate": 0.00023349282296650716,
+      "loss": 0.3902,
+      "step": 27690
+    },
+    {
+      "epoch": 44.18,
+      "grad_norm": 0.2578485310077667,
+      "learning_rate": 0.0002328548644338118,
+      "loss": 0.3062,
+      "step": 27700
+    },
+    {
+      "epoch": 44.19,
+      "grad_norm": 0.40186047554016113,
+      "learning_rate": 0.00023221690590111642,
+      "loss": 0.3129,
+      "step": 27710
+    },
+    {
+      "epoch": 44.21,
+      "grad_norm": 0.8674927353858948,
+      "learning_rate": 0.00023157894736842107,
+      "loss": 0.3522,
+      "step": 27720
+    },
+    {
+      "epoch": 44.23,
+      "grad_norm": 0.1684367060661316,
+      "learning_rate": 0.00023094098883572568,
+      "loss": 0.2683,
+      "step": 27730
+    },
+    {
+      "epoch": 44.24,
+      "grad_norm": 0.34888872504234314,
+      "learning_rate": 0.00023030303030303033,
+      "loss": 0.2477,
+      "step": 27740
+    },
+    {
+      "epoch": 44.26,
+      "grad_norm": 0.5431171655654907,
+      "learning_rate": 0.00022966507177033495,
+      "loss": 0.35,
+      "step": 27750
+    },
+    {
+      "epoch": 44.27,
+      "grad_norm": 0.3396085202693939,
+      "learning_rate": 0.00022902711323763956,
+      "loss": 0.3115,
+      "step": 27760
+    },
+    {
+      "epoch": 44.29,
+      "grad_norm": 0.327421635389328,
+      "learning_rate": 0.00022838915470494418,
+      "loss": 0.3153,
+      "step": 27770
+    },
+    {
+      "epoch": 44.31,
+      "grad_norm": 0.34646356105804443,
+      "learning_rate": 0.00022775119617224882,
+      "loss": 0.3603,
+      "step": 27780
+    },
+    {
+      "epoch": 44.32,
+      "grad_norm": 0.3496292233467102,
+      "learning_rate": 0.00022711323763955344,
+      "loss": 0.2877,
+      "step": 27790
+    },
+    {
+      "epoch": 44.34,
+      "grad_norm": 0.19173116981983185,
+      "learning_rate": 0.00022647527910685806,
+      "loss": 0.2755,
+      "step": 27800
+    },
+    {
+      "epoch": 44.35,
+      "grad_norm": 0.6964245438575745,
+      "learning_rate": 0.00022583732057416267,
+      "loss": 0.3568,
+      "step": 27810
+    },
+    {
+      "epoch": 44.37,
+      "grad_norm": 0.283237099647522,
+      "learning_rate": 0.00022519936204146732,
+      "loss": 0.3004,
+      "step": 27820
+    },
+    {
+      "epoch": 44.39,
+      "grad_norm": 0.3077571988105774,
+      "learning_rate": 0.00022456140350877193,
+      "loss": 0.3139,
+      "step": 27830
+    },
+    {
+      "epoch": 44.4,
+      "grad_norm": 0.44178569316864014,
+      "learning_rate": 0.00022392344497607655,
+      "loss": 0.2253,
+      "step": 27840
+    },
+    {
+      "epoch": 44.42,
+      "grad_norm": 0.23611438274383545,
+      "learning_rate": 0.00022328548644338117,
+      "loss": 0.3357,
+      "step": 27850
+    },
+    {
+      "epoch": 44.43,
+      "grad_norm": 0.402852326631546,
+      "learning_rate": 0.0002226475279106858,
+      "loss": 0.3024,
+      "step": 27860
+    },
+    {
+      "epoch": 44.45,
+      "grad_norm": 0.5001922249794006,
+      "learning_rate": 0.00022200956937799043,
+      "loss": 0.2424,
+      "step": 27870
+    },
+    {
+      "epoch": 44.47,
+      "grad_norm": 0.5164135098457336,
+      "learning_rate": 0.00022137161084529505,
+      "loss": 0.2952,
+      "step": 27880
+    },
+    {
+      "epoch": 44.48,
+      "grad_norm": 0.35648113489151,
+      "learning_rate": 0.00022073365231259966,
+      "loss": 0.2745,
+      "step": 27890
+    },
+    {
+      "epoch": 44.5,
+      "grad_norm": 0.6341779232025146,
+      "learning_rate": 0.00022009569377990433,
+      "loss": 0.3428,
+      "step": 27900
+    },
+    {
+      "epoch": 44.51,
+      "grad_norm": 0.5282499194145203,
+      "learning_rate": 0.00021945773524720895,
+      "loss": 0.3385,
+      "step": 27910
+    },
+    {
+      "epoch": 44.53,
+      "grad_norm": 0.34089717268943787,
+      "learning_rate": 0.00021881977671451357,
+      "loss": 0.3049,
+      "step": 27920
+    },
+    {
+      "epoch": 44.55,
+      "grad_norm": 0.44440943002700806,
+      "learning_rate": 0.00021818181818181818,
+      "loss": 0.3613,
+      "step": 27930
+    },
+    {
+      "epoch": 44.56,
+      "grad_norm": 0.3817773461341858,
+      "learning_rate": 0.00021754385964912283,
+      "loss": 0.3126,
+      "step": 27940
+    },
+    {
+      "epoch": 44.58,
+      "grad_norm": 0.24557062983512878,
+      "learning_rate": 0.00021690590111642745,
+      "loss": 0.2949,
+      "step": 27950
+    },
+    {
+      "epoch": 44.59,
+      "grad_norm": 0.7320693135261536,
+      "learning_rate": 0.00021626794258373206,
+      "loss": 0.3664,
+      "step": 27960
+    },
+    {
+      "epoch": 44.61,
+      "grad_norm": 0.4003210663795471,
+      "learning_rate": 0.00021562998405103668,
+      "loss": 0.3629,
+      "step": 27970
+    },
+    {
+      "epoch": 44.63,
+      "grad_norm": 0.30994275212287903,
+      "learning_rate": 0.00021499202551834132,
+      "loss": 0.2988,
+      "step": 27980
+    },
+    {
+      "epoch": 44.64,
+      "grad_norm": 0.2852626442909241,
+      "learning_rate": 0.00021435406698564594,
+      "loss": 0.2767,
+      "step": 27990
+    },
+    {
+      "epoch": 44.66,
+      "grad_norm": 0.2598101794719696,
+      "learning_rate": 0.00021371610845295056,
+      "loss": 0.3016,
+      "step": 28000
+    },
+    {
+      "epoch": 44.67,
+      "grad_norm": 0.1722613275051117,
+      "learning_rate": 0.00021307814992025517,
+      "loss": 0.1633,
+      "step": 28010
+    },
+    {
+      "epoch": 44.69,
+      "grad_norm": 0.2119804471731186,
+      "learning_rate": 0.00021244019138755982,
+      "loss": 0.2715,
+      "step": 28020
+    },
+    {
+      "epoch": 44.7,
+      "grad_norm": 0.27806442975997925,
+      "learning_rate": 0.00021180223285486443,
+      "loss": 0.2886,
+      "step": 28030
+    },
+    {
+      "epoch": 44.72,
+      "grad_norm": 0.47258105874061584,
+      "learning_rate": 0.00021116427432216905,
+      "loss": 0.2836,
+      "step": 28040
+    },
+    {
+      "epoch": 44.74,
+      "grad_norm": 0.295608252286911,
+      "learning_rate": 0.00021052631578947367,
+      "loss": 0.306,
+      "step": 28050
+    },
+    {
+      "epoch": 44.75,
+      "grad_norm": 0.2584683895111084,
+      "learning_rate": 0.0002098883572567783,
+      "loss": 0.3159,
+      "step": 28060
+    },
+    {
+      "epoch": 44.77,
+      "grad_norm": 0.41258344054222107,
+      "learning_rate": 0.00020925039872408293,
+      "loss": 0.3437,
+      "step": 28070
+    },
+    {
+      "epoch": 44.78,
+      "grad_norm": 0.13248884677886963,
+      "learning_rate": 0.00020861244019138755,
+      "loss": 0.3474,
+      "step": 28080
+    },
+    {
+      "epoch": 44.8,
+      "grad_norm": 0.2799845337867737,
+      "learning_rate": 0.0002079744816586922,
+      "loss": 0.3041,
+      "step": 28090
+    },
+    {
+      "epoch": 44.82,
+      "grad_norm": 0.34866270422935486,
+      "learning_rate": 0.00020733652312599683,
+      "loss": 0.2709,
+      "step": 28100
+    },
+    {
+      "epoch": 44.83,
+      "grad_norm": 0.22995953261852264,
+      "learning_rate": 0.00020669856459330145,
+      "loss": 0.3103,
+      "step": 28110
+    },
+    {
+      "epoch": 44.85,
+      "grad_norm": 0.6735871434211731,
+      "learning_rate": 0.00020606060606060607,
+      "loss": 0.3272,
+      "step": 28120
+    },
+    {
+      "epoch": 44.86,
+      "grad_norm": 0.8896424770355225,
+      "learning_rate": 0.00020542264752791068,
+      "loss": 0.2944,
+      "step": 28130
+    },
+    {
+      "epoch": 44.88,
+      "grad_norm": 0.12463853508234024,
+      "learning_rate": 0.00020478468899521533,
+      "loss": 0.3212,
+      "step": 28140
+    },
+    {
+      "epoch": 44.9,
+      "grad_norm": 0.29286473989486694,
+      "learning_rate": 0.00020414673046251995,
+      "loss": 0.2537,
+      "step": 28150
+    },
+    {
+      "epoch": 44.91,
+      "grad_norm": 0.4333782494068146,
+      "learning_rate": 0.00020350877192982456,
+      "loss": 0.2964,
+      "step": 28160
+    },
+    {
+      "epoch": 44.93,
+      "grad_norm": 0.25017327070236206,
+      "learning_rate": 0.00020287081339712918,
+      "loss": 0.3095,
+      "step": 28170
+    },
+    {
+      "epoch": 44.94,
+      "grad_norm": 0.18606650829315186,
+      "learning_rate": 0.00020223285486443382,
+      "loss": 0.2461,
+      "step": 28180
+    },
+    {
+      "epoch": 44.96,
+      "grad_norm": 0.14284665882587433,
+      "learning_rate": 0.00020159489633173844,
+      "loss": 0.2787,
+      "step": 28190
+    },
+    {
+      "epoch": 44.98,
+      "grad_norm": 0.6224771738052368,
+      "learning_rate": 0.00020095693779904306,
+      "loss": 0.3598,
+      "step": 28200
+    },
+    {
+      "epoch": 44.99,
+      "grad_norm": 0.32806506752967834,
+      "learning_rate": 0.00020031897926634767,
+      "loss": 0.2615,
+      "step": 28210
+    },
+    {
+      "epoch": 45.01,
+      "grad_norm": 0.45343583822250366,
+      "learning_rate": 0.00019968102073365232,
+      "loss": 0.3322,
+      "step": 28220
+    },
+    {
+      "epoch": 45.02,
+      "grad_norm": 0.18727990984916687,
+      "learning_rate": 0.00019904306220095693,
+      "loss": 0.2696,
+      "step": 28230
+    },
+    {
+      "epoch": 45.04,
+      "grad_norm": 0.28035393357276917,
+      "learning_rate": 0.00019840510366826155,
+      "loss": 0.256,
+      "step": 28240
+    },
+    {
+      "epoch": 45.06,
+      "grad_norm": 0.37490570545196533,
+      "learning_rate": 0.00019776714513556617,
+      "loss": 0.3105,
+      "step": 28250
+    },
+    {
+      "epoch": 45.07,
+      "grad_norm": 0.27727392315864563,
+      "learning_rate": 0.0001971291866028708,
+      "loss": 0.2573,
+      "step": 28260
+    },
+    {
+      "epoch": 45.09,
+      "grad_norm": 0.2856091856956482,
+      "learning_rate": 0.00019649122807017543,
+      "loss": 0.297,
+      "step": 28270
+    },
+    {
+      "epoch": 45.1,
+      "grad_norm": 0.3423827886581421,
+      "learning_rate": 0.00019585326953748007,
+      "loss": 0.3219,
+      "step": 28280
+    },
+    {
+      "epoch": 45.12,
+      "grad_norm": 0.2217862457036972,
+      "learning_rate": 0.0001952153110047847,
+      "loss": 0.2413,
+      "step": 28290
+    },
+    {
+      "epoch": 45.14,
+      "grad_norm": 0.49296557903289795,
+      "learning_rate": 0.00019457735247208933,
+      "loss": 0.2936,
+      "step": 28300
+    },
+    {
+      "epoch": 45.15,
+      "grad_norm": 0.28188827633857727,
+      "learning_rate": 0.00019393939393939395,
+      "loss": 0.2937,
+      "step": 28310
+    },
+    {
+      "epoch": 45.17,
+      "grad_norm": 0.6118289232254028,
+      "learning_rate": 0.00019330143540669857,
+      "loss": 0.3172,
+      "step": 28320
+    },
+    {
+      "epoch": 45.18,
+      "grad_norm": 0.38920632004737854,
+      "learning_rate": 0.0001926634768740032,
+      "loss": 0.3467,
+      "step": 28330
+    },
+    {
+      "epoch": 45.2,
+      "grad_norm": 0.2669709324836731,
+      "learning_rate": 0.00019202551834130783,
+      "loss": 0.2941,
+      "step": 28340
+    },
+    {
+      "epoch": 45.22,
+      "grad_norm": 0.17795272171497345,
+      "learning_rate": 0.00019138755980861245,
+      "loss": 0.3487,
+      "step": 28350
+    },
+    {
+      "epoch": 45.23,
+      "grad_norm": 0.3200840651988983,
+      "learning_rate": 0.00019074960127591706,
+      "loss": 0.3224,
+      "step": 28360
+    },
+    {
+      "epoch": 45.25,
+      "grad_norm": 0.3185681700706482,
+      "learning_rate": 0.0001901116427432217,
+      "loss": 0.2754,
+      "step": 28370
+    },
+    {
+      "epoch": 45.26,
+      "grad_norm": 0.35010969638824463,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.2843,
+      "step": 28380
+    },
+    {
+      "epoch": 45.28,
+      "grad_norm": 0.19338567554950714,
+      "learning_rate": 0.00018883572567783094,
+      "loss": 0.3434,
+      "step": 28390
+    },
+    {
+      "epoch": 45.3,
+      "grad_norm": 0.13185134530067444,
+      "learning_rate": 0.00018819776714513556,
+      "loss": 0.2991,
+      "step": 28400
+    },
+    {
+      "epoch": 45.31,
+      "grad_norm": 0.2024078220129013,
+      "learning_rate": 0.0001875598086124402,
+      "loss": 0.3248,
+      "step": 28410
+    },
+    {
+      "epoch": 45.33,
+      "grad_norm": 0.22243604063987732,
+      "learning_rate": 0.00018692185007974482,
+      "loss": 0.2409,
+      "step": 28420
+    },
+    {
+      "epoch": 45.34,
+      "grad_norm": 0.5372808575630188,
+      "learning_rate": 0.00018628389154704943,
+      "loss": 0.2738,
+      "step": 28430
+    },
+    {
+      "epoch": 45.36,
+      "grad_norm": 0.17532573640346527,
+      "learning_rate": 0.00018564593301435405,
+      "loss": 0.2954,
+      "step": 28440
+    },
+    {
+      "epoch": 45.37,
+      "grad_norm": 0.2568674087524414,
+      "learning_rate": 0.0001850079744816587,
+      "loss": 0.2461,
+      "step": 28450
+    },
+    {
+      "epoch": 45.39,
+      "grad_norm": 0.36683690547943115,
+      "learning_rate": 0.00018437001594896334,
+      "loss": 0.301,
+      "step": 28460
+    },
+    {
+      "epoch": 45.41,
+      "grad_norm": 0.32988253235816956,
+      "learning_rate": 0.00018373205741626796,
+      "loss": 0.2522,
+      "step": 28470
+    },
+    {
+      "epoch": 45.42,
+      "grad_norm": 0.28334781527519226,
+      "learning_rate": 0.00018309409888357257,
+      "loss": 0.2795,
+      "step": 28480
+    },
+    {
+      "epoch": 45.44,
+      "grad_norm": 0.26257234811782837,
+      "learning_rate": 0.00018245614035087722,
+      "loss": 0.3357,
+      "step": 28490
+    },
+    {
+      "epoch": 45.45,
+      "grad_norm": 0.376924067735672,
+      "learning_rate": 0.00018181818181818183,
+      "loss": 0.3157,
+      "step": 28500
+    },
+    {
+      "epoch": 45.47,
+      "grad_norm": 0.06856755167245865,
+      "learning_rate": 0.00018118022328548645,
+      "loss": 0.3126,
+      "step": 28510
+    },
+    {
+      "epoch": 45.49,
+      "grad_norm": 0.3555695414543152,
+      "learning_rate": 0.00018054226475279107,
+      "loss": 0.2759,
+      "step": 28520
+    },
+    {
+      "epoch": 45.5,
+      "grad_norm": 0.44711726903915405,
+      "learning_rate": 0.0001799043062200957,
+      "loss": 0.3947,
+      "step": 28530
+    },
+    {
+      "epoch": 45.52,
+      "grad_norm": 0.5563350319862366,
+      "learning_rate": 0.00017926634768740033,
+      "loss": 0.3077,
+      "step": 28540
+    },
+    {
+      "epoch": 45.53,
+      "grad_norm": 0.22353103756904602,
+      "learning_rate": 0.00017862838915470495,
+      "loss": 0.319,
+      "step": 28550
+    },
+    {
+      "epoch": 45.55,
+      "grad_norm": 0.23482950031757355,
+      "learning_rate": 0.00017799043062200956,
+      "loss": 0.2164,
+      "step": 28560
+    },
+    {
+      "epoch": 45.57,
+      "grad_norm": 0.3976686894893646,
+      "learning_rate": 0.0001773524720893142,
+      "loss": 0.2903,
+      "step": 28570
+    },
+    {
+      "epoch": 45.58,
+      "grad_norm": 0.31743720173835754,
+      "learning_rate": 0.00017671451355661882,
+      "loss": 0.3855,
+      "step": 28580
+    },
+    {
+      "epoch": 45.6,
+      "grad_norm": 0.2157888561487198,
+      "learning_rate": 0.00017607655502392344,
+      "loss": 0.2198,
+      "step": 28590
+    },
+    {
+      "epoch": 45.61,
+      "grad_norm": 0.42237186431884766,
+      "learning_rate": 0.00017543859649122806,
+      "loss": 0.3859,
+      "step": 28600
+    },
+    {
+      "epoch": 45.63,
+      "grad_norm": 0.1533055305480957,
+      "learning_rate": 0.0001748006379585327,
+      "loss": 0.2198,
+      "step": 28610
+    },
+    {
+      "epoch": 45.65,
+      "grad_norm": 0.16389824450016022,
+      "learning_rate": 0.00017416267942583732,
+      "loss": 0.2789,
+      "step": 28620
+    },
+    {
+      "epoch": 45.66,
+      "grad_norm": 0.4902271032333374,
+      "learning_rate": 0.00017352472089314193,
+      "loss": 0.3184,
+      "step": 28630
+    },
+    {
+      "epoch": 45.68,
+      "grad_norm": 0.31961241364479065,
+      "learning_rate": 0.00017288676236044655,
+      "loss": 0.2653,
+      "step": 28640
+    },
+    {
+      "epoch": 45.69,
+      "grad_norm": 1.2578412294387817,
+      "learning_rate": 0.00017224880382775122,
+      "loss": 0.2537,
+      "step": 28650
+    },
+    {
+      "epoch": 45.71,
+      "grad_norm": 0.19706355035305023,
+      "learning_rate": 0.00017161084529505584,
+      "loss": 0.2666,
+      "step": 28660
+    },
+    {
+      "epoch": 45.73,
+      "grad_norm": 0.17647922039031982,
+      "learning_rate": 0.00017097288676236046,
+      "loss": 0.2909,
+      "step": 28670
+    },
+    {
+      "epoch": 45.74,
+      "grad_norm": 0.20171548426151276,
+      "learning_rate": 0.00017033492822966507,
+      "loss": 0.3303,
+      "step": 28680
+    },
+    {
+      "epoch": 45.76,
+      "grad_norm": 0.1995372623205185,
+      "learning_rate": 0.00016969696969696972,
+      "loss": 0.2621,
+      "step": 28690
+    },
+    {
+      "epoch": 45.77,
+      "grad_norm": 0.23527149856090546,
+      "learning_rate": 0.00016905901116427433,
+      "loss": 0.3213,
+      "step": 28700
+    },
+    {
+      "epoch": 45.79,
+      "grad_norm": 0.2143118530511856,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.2584,
+      "step": 28710
+    },
+    {
+      "epoch": 45.81,
+      "grad_norm": 0.05645094811916351,
+      "learning_rate": 0.00016778309409888357,
+      "loss": 0.2731,
+      "step": 28720
+    },
+    {
+      "epoch": 45.82,
+      "grad_norm": 0.6314740777015686,
+      "learning_rate": 0.0001671451355661882,
+      "loss": 0.3243,
+      "step": 28730
+    },
+    {
+      "epoch": 45.84,
+      "grad_norm": 0.15495331585407257,
+      "learning_rate": 0.00016650717703349283,
+      "loss": 0.2936,
+      "step": 28740
+    },
+    {
+      "epoch": 45.85,
+      "grad_norm": 0.47223085165023804,
+      "learning_rate": 0.00016586921850079745,
+      "loss": 0.2497,
+      "step": 28750
+    },
+    {
+      "epoch": 45.87,
+      "grad_norm": 0.3611065447330475,
+      "learning_rate": 0.00016523125996810206,
+      "loss": 0.356,
+      "step": 28760
+    },
+    {
+      "epoch": 45.89,
+      "grad_norm": 0.38601604104042053,
+      "learning_rate": 0.0001645933014354067,
+      "loss": 0.2902,
+      "step": 28770
+    },
+    {
+      "epoch": 45.9,
+      "grad_norm": 0.36279168725013733,
+      "learning_rate": 0.00016395534290271132,
+      "loss": 0.2713,
+      "step": 28780
+    },
+    {
+      "epoch": 45.92,
+      "grad_norm": 0.15732410550117493,
+      "learning_rate": 0.00016331738437001594,
+      "loss": 0.323,
+      "step": 28790
+    },
+    {
+      "epoch": 45.93,
+      "grad_norm": 0.21142350137233734,
+      "learning_rate": 0.00016267942583732056,
+      "loss": 0.2383,
+      "step": 28800
+    },
+    {
+      "epoch": 45.95,
+      "grad_norm": 0.17822466790676117,
+      "learning_rate": 0.0001620414673046252,
+      "loss": 0.2362,
+      "step": 28810
+    },
+    {
+      "epoch": 45.96,
+      "grad_norm": 0.21047089993953705,
+      "learning_rate": 0.00016140350877192982,
+      "loss": 0.3138,
+      "step": 28820
+    },
+    {
+      "epoch": 45.98,
+      "grad_norm": 0.4122728109359741,
+      "learning_rate": 0.00016076555023923443,
+      "loss": 0.2932,
+      "step": 28830
+    },
+    {
+      "epoch": 46.0,
+      "grad_norm": 0.27697211503982544,
+      "learning_rate": 0.00016012759170653908,
+      "loss": 0.3276,
+      "step": 28840
+    },
+    {
+      "epoch": 46.01,
+      "grad_norm": 0.32727357745170593,
+      "learning_rate": 0.00015948963317384372,
+      "loss": 0.2403,
+      "step": 28850
+    },
+    {
+      "epoch": 46.03,
+      "grad_norm": 0.26314985752105713,
+      "learning_rate": 0.00015885167464114834,
+      "loss": 0.2484,
+      "step": 28860
+    },
+    {
+      "epoch": 46.04,
+      "grad_norm": 0.12502922117710114,
+      "learning_rate": 0.00015821371610845296,
+      "loss": 0.2411,
+      "step": 28870
+    },
+    {
+      "epoch": 46.06,
+      "grad_norm": 0.35499653220176697,
+      "learning_rate": 0.00015757575757575757,
+      "loss": 0.2731,
+      "step": 28880
+    },
+    {
+      "epoch": 46.08,
+      "grad_norm": 0.4001838266849518,
+      "learning_rate": 0.00015693779904306222,
+      "loss": 0.256,
+      "step": 28890
+    },
+    {
+      "epoch": 46.09,
+      "grad_norm": 0.49199333786964417,
+      "learning_rate": 0.00015629984051036683,
+      "loss": 0.3412,
+      "step": 28900
+    },
+    {
+      "epoch": 46.11,
+      "grad_norm": 0.22476720809936523,
+      "learning_rate": 0.00015566188197767145,
+      "loss": 0.2704,
+      "step": 28910
+    },
+    {
+      "epoch": 46.12,
+      "grad_norm": 0.42547646164894104,
+      "learning_rate": 0.00015502392344497607,
+      "loss": 0.3282,
+      "step": 28920
+    },
+    {
+      "epoch": 46.14,
+      "grad_norm": 0.14458052814006805,
+      "learning_rate": 0.0001543859649122807,
+      "loss": 0.3036,
+      "step": 28930
+    },
+    {
+      "epoch": 46.16,
+      "grad_norm": 0.23600299656391144,
+      "learning_rate": 0.00015374800637958533,
+      "loss": 0.3372,
+      "step": 28940
+    },
+    {
+      "epoch": 46.17,
+      "grad_norm": 0.31214261054992676,
+      "learning_rate": 0.00015311004784688995,
+      "loss": 0.2513,
+      "step": 28950
+    },
+    {
+      "epoch": 46.19,
+      "grad_norm": 0.6175329685211182,
+      "learning_rate": 0.00015247208931419456,
+      "loss": 0.297,
+      "step": 28960
+    },
+    {
+      "epoch": 46.2,
+      "grad_norm": 0.3160916864871979,
+      "learning_rate": 0.0001518341307814992,
+      "loss": 0.3279,
+      "step": 28970
+    },
+    {
+      "epoch": 46.22,
+      "grad_norm": 0.37880146503448486,
+      "learning_rate": 0.00015119617224880382,
+      "loss": 0.253,
+      "step": 28980
+    },
+    {
+      "epoch": 46.24,
+      "grad_norm": 0.16760538518428802,
+      "learning_rate": 0.00015055821371610844,
+      "loss": 0.2691,
+      "step": 28990
+    },
+    {
+      "epoch": 46.25,
+      "grad_norm": 0.34026291966438293,
+      "learning_rate": 0.00014992025518341306,
+      "loss": 0.4,
+      "step": 29000
+    },
+    {
+      "epoch": 46.27,
+      "grad_norm": 0.22685553133487701,
+      "learning_rate": 0.0001492822966507177,
+      "loss": 0.2779,
+      "step": 29010
+    },
+    {
+      "epoch": 46.28,
+      "grad_norm": 0.3551049530506134,
+      "learning_rate": 0.00014864433811802235,
+      "loss": 0.3183,
+      "step": 29020
+    },
+    {
+      "epoch": 46.3,
+      "grad_norm": 0.5112924575805664,
+      "learning_rate": 0.00014800637958532696,
+      "loss": 0.3357,
+      "step": 29030
+    },
+    {
+      "epoch": 46.32,
+      "grad_norm": 0.4620679020881653,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.2737,
+      "step": 29040
+    },
+    {
+      "epoch": 46.33,
+      "grad_norm": 0.25304239988327026,
+      "learning_rate": 0.00014673046251993622,
+      "loss": 0.2402,
+      "step": 29050
+    },
+    {
+      "epoch": 46.35,
+      "grad_norm": 0.20634669065475464,
+      "learning_rate": 0.00014609250398724084,
+      "loss": 0.2407,
+      "step": 29060
+    },
+    {
+      "epoch": 46.36,
+      "grad_norm": 0.4903095066547394,
+      "learning_rate": 0.00014545454545454546,
+      "loss": 0.3264,
+      "step": 29070
+    },
+    {
+      "epoch": 46.38,
+      "grad_norm": 0.09498465806245804,
+      "learning_rate": 0.0001448165869218501,
+      "loss": 0.2885,
+      "step": 29080
+    },
+    {
+      "epoch": 46.4,
+      "grad_norm": 0.292100191116333,
+      "learning_rate": 0.00014417862838915472,
+      "loss": 0.3582,
+      "step": 29090
+    },
+    {
+      "epoch": 46.41,
+      "grad_norm": 0.23083628714084625,
+      "learning_rate": 0.00014354066985645933,
+      "loss": 0.2363,
+      "step": 29100
+    },
+    {
+      "epoch": 46.43,
+      "grad_norm": 0.3492584228515625,
+      "learning_rate": 0.00014290271132376395,
+      "loss": 0.3224,
+      "step": 29110
+    },
+    {
+      "epoch": 46.44,
+      "grad_norm": 0.5817916393280029,
+      "learning_rate": 0.0001422647527910686,
+      "loss": 0.3166,
+      "step": 29120
+    },
+    {
+      "epoch": 46.46,
+      "grad_norm": 0.3647211194038391,
+      "learning_rate": 0.0001416267942583732,
+      "loss": 0.2748,
+      "step": 29130
+    },
+    {
+      "epoch": 46.48,
+      "grad_norm": 0.46294817328453064,
+      "learning_rate": 0.00014098883572567783,
+      "loss": 0.3511,
+      "step": 29140
+    },
+    {
+      "epoch": 46.49,
+      "grad_norm": 0.09461899846792221,
+      "learning_rate": 0.00014035087719298245,
+      "loss": 0.2238,
+      "step": 29150
+    },
+    {
+      "epoch": 46.51,
+      "grad_norm": 0.3371366262435913,
+      "learning_rate": 0.0001397129186602871,
+      "loss": 0.314,
+      "step": 29160
+    },
+    {
+      "epoch": 46.52,
+      "grad_norm": 0.4762924313545227,
+      "learning_rate": 0.0001390749601275917,
+      "loss": 0.2712,
+      "step": 29170
+    },
+    {
+      "epoch": 46.54,
+      "grad_norm": 0.22956405580043793,
+      "learning_rate": 0.00013843700159489632,
+      "loss": 0.3876,
+      "step": 29180
+    },
+    {
+      "epoch": 46.56,
+      "grad_norm": 0.11843441426753998,
+      "learning_rate": 0.00013779904306220094,
+      "loss": 0.2649,
+      "step": 29190
+    },
+    {
+      "epoch": 46.57,
+      "grad_norm": 0.38579946756362915,
+      "learning_rate": 0.00013716108452950558,
+      "loss": 0.2696,
+      "step": 29200
+    },
+    {
+      "epoch": 46.59,
+      "grad_norm": 0.33288395404815674,
+      "learning_rate": 0.00013652312599681023,
+      "loss": 0.3079,
+      "step": 29210
+    },
+    {
+      "epoch": 46.6,
+      "grad_norm": 0.2519354224205017,
+      "learning_rate": 0.00013588516746411485,
+      "loss": 0.3065,
+      "step": 29220
+    },
+    {
+      "epoch": 46.62,
+      "grad_norm": 0.1389375776052475,
+      "learning_rate": 0.00013524720893141946,
+      "loss": 0.2599,
+      "step": 29230
+    },
+    {
+      "epoch": 46.63,
+      "grad_norm": 0.33025360107421875,
+      "learning_rate": 0.0001346092503987241,
+      "loss": 0.3191,
+      "step": 29240
+    },
+    {
+      "epoch": 46.65,
+      "grad_norm": 0.1264023780822754,
+      "learning_rate": 0.00013397129186602872,
+      "loss": 0.2695,
+      "step": 29250
+    },
+    {
+      "epoch": 46.67,
+      "grad_norm": 0.4093076288700104,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.2471,
+      "step": 29260
+    },
+    {
+      "epoch": 46.68,
+      "grad_norm": 0.722159743309021,
+      "learning_rate": 0.00013269537480063796,
+      "loss": 0.28,
+      "step": 29270
+    },
+    {
+      "epoch": 46.7,
+      "grad_norm": 0.4530355632305145,
+      "learning_rate": 0.0001320574162679426,
+      "loss": 0.3053,
+      "step": 29280
+    },
+    {
+      "epoch": 46.71,
+      "grad_norm": 0.10739301890134811,
+      "learning_rate": 0.00013141945773524722,
+      "loss": 0.2983,
+      "step": 29290
+    },
+    {
+      "epoch": 46.73,
+      "grad_norm": 0.1723739355802536,
+      "learning_rate": 0.00013078149920255183,
+      "loss": 0.2239,
+      "step": 29300
+    },
+    {
+      "epoch": 46.75,
+      "grad_norm": 0.2270219624042511,
+      "learning_rate": 0.00013014354066985645,
+      "loss": 0.3093,
+      "step": 29310
+    },
+    {
+      "epoch": 46.76,
+      "grad_norm": 0.6445925831794739,
+      "learning_rate": 0.0001295055821371611,
+      "loss": 0.3212,
+      "step": 29320
+    },
+    {
+      "epoch": 46.78,
+      "grad_norm": 0.22848792374134064,
+      "learning_rate": 0.0001288676236044657,
+      "loss": 0.3183,
+      "step": 29330
+    },
+    {
+      "epoch": 46.79,
+      "grad_norm": 0.3686947822570801,
+      "learning_rate": 0.00012822966507177033,
+      "loss": 0.2628,
+      "step": 29340
+    },
+    {
+      "epoch": 46.81,
+      "grad_norm": 0.27950429916381836,
+      "learning_rate": 0.00012759170653907495,
+      "loss": 0.312,
+      "step": 29350
+    },
+    {
+      "epoch": 46.83,
+      "grad_norm": 0.13954879343509674,
+      "learning_rate": 0.0001269537480063796,
+      "loss": 0.315,
+      "step": 29360
+    },
+    {
+      "epoch": 46.84,
+      "grad_norm": 0.314480185508728,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 0.3344,
+      "step": 29370
+    },
+    {
+      "epoch": 46.86,
+      "grad_norm": 0.3248406946659088,
+      "learning_rate": 0.00012567783094098882,
+      "loss": 0.3106,
+      "step": 29380
+    },
+    {
+      "epoch": 46.87,
+      "grad_norm": 0.3097328543663025,
+      "learning_rate": 0.00012503987240829344,
+      "loss": 0.2933,
+      "step": 29390
+    },
+    {
+      "epoch": 46.89,
+      "grad_norm": 0.4338608384132385,
+      "learning_rate": 0.00012440191387559808,
+      "loss": 0.2809,
+      "step": 29400
+    },
+    {
+      "epoch": 46.91,
+      "grad_norm": 0.35394251346588135,
+      "learning_rate": 0.00012376395534290273,
+      "loss": 0.3051,
+      "step": 29410
+    },
+    {
+      "epoch": 46.92,
+      "grad_norm": 0.07790148258209229,
+      "learning_rate": 0.00012312599681020735,
+      "loss": 0.3595,
+      "step": 29420
+    },
+    {
+      "epoch": 46.94,
+      "grad_norm": 0.2738390564918518,
+      "learning_rate": 0.00012248803827751196,
+      "loss": 0.2533,
+      "step": 29430
+    },
+    {
+      "epoch": 46.95,
+      "grad_norm": 0.19870556890964508,
+      "learning_rate": 0.00012185007974481659,
+      "loss": 0.2967,
+      "step": 29440
+    },
+    {
+      "epoch": 46.97,
+      "grad_norm": 0.15914097428321838,
+      "learning_rate": 0.00012121212121212122,
+      "loss": 0.3222,
+      "step": 29450
+    },
+    {
+      "epoch": 46.99,
+      "grad_norm": 0.22630850970745087,
+      "learning_rate": 0.00012057416267942584,
+      "loss": 0.2994,
+      "step": 29460
+    },
+    {
+      "epoch": 47.0,
+      "grad_norm": 0.32556214928627014,
+      "learning_rate": 0.00011993620414673047,
+      "loss": 0.2816,
+      "step": 29470
+    },
+    {
+      "epoch": 47.02,
+      "grad_norm": 0.274972528219223,
+      "learning_rate": 0.00011929824561403509,
+      "loss": 0.2624,
+      "step": 29480
+    },
+    {
+      "epoch": 47.03,
+      "grad_norm": 0.3284093737602234,
+      "learning_rate": 0.00011866028708133972,
+      "loss": 0.2596,
+      "step": 29490
+    },
+    {
+      "epoch": 47.05,
+      "grad_norm": 0.2033546268939972,
+      "learning_rate": 0.00011802232854864433,
+      "loss": 0.2444,
+      "step": 29500
+    },
+    {
+      "epoch": 47.07,
+      "grad_norm": 0.3881695866584778,
+      "learning_rate": 0.00011738437001594896,
+      "loss": 0.2642,
+      "step": 29510
+    },
+    {
+      "epoch": 47.08,
+      "grad_norm": 0.3856006860733032,
+      "learning_rate": 0.00011674641148325358,
+      "loss": 0.2641,
+      "step": 29520
+    },
+    {
+      "epoch": 47.1,
+      "grad_norm": 0.3555915355682373,
+      "learning_rate": 0.00011610845295055821,
+      "loss": 0.2547,
+      "step": 29530
+    },
+    {
+      "epoch": 47.11,
+      "grad_norm": 0.39494889974594116,
+      "learning_rate": 0.00011547049441786284,
+      "loss": 0.3229,
+      "step": 29540
+    },
+    {
+      "epoch": 47.13,
+      "grad_norm": 0.39036959409713745,
+      "learning_rate": 0.00011483253588516747,
+      "loss": 0.2395,
+      "step": 29550
+    },
+    {
+      "epoch": 47.15,
+      "grad_norm": 0.14146322011947632,
+      "learning_rate": 0.00011419457735247209,
+      "loss": 0.1988,
+      "step": 29560
+    },
+    {
+      "epoch": 47.16,
+      "grad_norm": 0.22183720767498016,
+      "learning_rate": 0.00011355661881977672,
+      "loss": 0.3234,
+      "step": 29570
+    },
+    {
+      "epoch": 47.18,
+      "grad_norm": 0.19865743815898895,
+      "learning_rate": 0.00011291866028708134,
+      "loss": 0.2811,
+      "step": 29580
+    },
+    {
+      "epoch": 47.19,
+      "grad_norm": 0.457445353269577,
+      "learning_rate": 0.00011228070175438597,
+      "loss": 0.2627,
+      "step": 29590
+    },
+    {
+      "epoch": 47.21,
+      "grad_norm": 0.28596189618110657,
+      "learning_rate": 0.00011164274322169058,
+      "loss": 0.2277,
+      "step": 29600
+    },
+    {
+      "epoch": 47.22,
+      "grad_norm": 0.4201318025588989,
+      "learning_rate": 0.00011100478468899521,
+      "loss": 0.3182,
+      "step": 29610
+    },
+    {
+      "epoch": 47.24,
+      "grad_norm": 0.31965920329093933,
+      "learning_rate": 0.00011036682615629983,
+      "loss": 0.2838,
+      "step": 29620
+    },
+    {
+      "epoch": 47.26,
+      "grad_norm": 0.10794230550527573,
+      "learning_rate": 0.00010972886762360448,
+      "loss": 0.2655,
+      "step": 29630
+    },
+    {
+      "epoch": 47.27,
+      "grad_norm": 0.13269487023353577,
+      "learning_rate": 0.00010909090909090909,
+      "loss": 0.3302,
+      "step": 29640
+    },
+    {
+      "epoch": 47.29,
+      "grad_norm": 0.5231210589408875,
+      "learning_rate": 0.00010845295055821372,
+      "loss": 0.3293,
+      "step": 29650
+    },
+    {
+      "epoch": 47.3,
+      "grad_norm": 0.158706933259964,
+      "learning_rate": 0.00010781499202551834,
+      "loss": 0.2234,
+      "step": 29660
+    },
+    {
+      "epoch": 47.32,
+      "grad_norm": 0.2540994882583618,
+      "learning_rate": 0.00010717703349282297,
+      "loss": 0.3066,
+      "step": 29670
+    },
+    {
+      "epoch": 47.34,
+      "grad_norm": 0.32114022970199585,
+      "learning_rate": 0.00010653907496012759,
+      "loss": 0.26,
+      "step": 29680
+    },
+    {
+      "epoch": 47.35,
+      "grad_norm": 0.14222322404384613,
+      "learning_rate": 0.00010590111642743222,
+      "loss": 0.1941,
+      "step": 29690
+    },
+    {
+      "epoch": 47.37,
+      "grad_norm": 0.33291783928871155,
+      "learning_rate": 0.00010526315789473683,
+      "loss": 0.3337,
+      "step": 29700
+    },
+    {
+      "epoch": 47.38,
+      "grad_norm": 0.21735547482967377,
+      "learning_rate": 0.00010462519936204146,
+      "loss": 0.2755,
+      "step": 29710
+    },
+    {
+      "epoch": 47.4,
+      "grad_norm": 0.37341004610061646,
+      "learning_rate": 0.0001039872408293461,
+      "loss": 0.2765,
+      "step": 29720
+    },
+    {
+      "epoch": 47.42,
+      "grad_norm": 0.13885751366615295,
+      "learning_rate": 0.00010334928229665073,
+      "loss": 0.3081,
+      "step": 29730
+    },
+    {
+      "epoch": 47.43,
+      "grad_norm": 0.6437707543373108,
+      "learning_rate": 0.00010271132376395534,
+      "loss": 0.322,
+      "step": 29740
+    },
+    {
+      "epoch": 47.45,
+      "grad_norm": 0.13305498659610748,
+      "learning_rate": 0.00010207336523125997,
+      "loss": 0.2947,
+      "step": 29750
+    },
+    {
+      "epoch": 47.46,
+      "grad_norm": 0.6117695569992065,
+      "learning_rate": 0.00010143540669856459,
+      "loss": 0.3218,
+      "step": 29760
+    },
+    {
+      "epoch": 47.48,
+      "grad_norm": 0.34142374992370605,
+      "learning_rate": 0.00010079744816586922,
+      "loss": 0.3295,
+      "step": 29770
+    },
+    {
+      "epoch": 47.5,
+      "grad_norm": 0.37447261810302734,
+      "learning_rate": 0.00010015948963317384,
+      "loss": 0.2824,
+      "step": 29780
+    },
+    {
+      "epoch": 47.51,
+      "grad_norm": 0.14651019871234894,
+      "learning_rate": 9.952153110047847e-05,
+      "loss": 0.2739,
+      "step": 29790
+    },
+    {
+      "epoch": 47.53,
+      "grad_norm": 0.14142945408821106,
+      "learning_rate": 9.888357256778308e-05,
+      "loss": 0.217,
+      "step": 29800
+    },
+    {
+      "epoch": 47.54,
+      "grad_norm": 0.3807011544704437,
+      "learning_rate": 9.824561403508771e-05,
+      "loss": 0.3054,
+      "step": 29810
+    },
+    {
+      "epoch": 47.56,
+      "grad_norm": 0.2842819392681122,
+      "learning_rate": 9.760765550239235e-05,
+      "loss": 0.3078,
+      "step": 29820
+    },
+    {
+      "epoch": 47.58,
+      "grad_norm": 0.7402997016906738,
+      "learning_rate": 9.696969696969698e-05,
+      "loss": 0.3118,
+      "step": 29830
+    },
+    {
+      "epoch": 47.59,
+      "grad_norm": 0.13281618058681488,
+      "learning_rate": 9.63317384370016e-05,
+      "loss": 0.3055,
+      "step": 29840
+    },
+    {
+      "epoch": 47.61,
+      "grad_norm": 0.3724515736103058,
+      "learning_rate": 9.569377990430622e-05,
+      "loss": 0.3274,
+      "step": 29850
+    },
+    {
+      "epoch": 47.62,
+      "grad_norm": 0.33854445815086365,
+      "learning_rate": 9.505582137161085e-05,
+      "loss": 0.2533,
+      "step": 29860
+    },
+    {
+      "epoch": 47.64,
+      "grad_norm": 0.42690280079841614,
+      "learning_rate": 9.441786283891547e-05,
+      "loss": 0.3262,
+      "step": 29870
+    },
+    {
+      "epoch": 47.66,
+      "grad_norm": 0.6151228547096252,
+      "learning_rate": 9.37799043062201e-05,
+      "loss": 0.2889,
+      "step": 29880
+    },
+    {
+      "epoch": 47.67,
+      "grad_norm": 0.26469776034355164,
+      "learning_rate": 9.314194577352472e-05,
+      "loss": 0.3036,
+      "step": 29890
+    },
+    {
+      "epoch": 47.69,
+      "grad_norm": 0.2703404426574707,
+      "learning_rate": 9.250398724082935e-05,
+      "loss": 0.2673,
+      "step": 29900
+    },
+    {
+      "epoch": 47.7,
+      "grad_norm": 0.3791040778160095,
+      "learning_rate": 9.186602870813398e-05,
+      "loss": 0.3244,
+      "step": 29910
+    },
+    {
+      "epoch": 47.72,
+      "grad_norm": 0.29400941729545593,
+      "learning_rate": 9.122807017543861e-05,
+      "loss": 0.2538,
+      "step": 29920
+    },
+    {
+      "epoch": 47.74,
+      "grad_norm": 0.4795028567314148,
+      "learning_rate": 9.059011164274323e-05,
+      "loss": 0.2482,
+      "step": 29930
+    },
+    {
+      "epoch": 47.75,
+      "grad_norm": 0.36813196539878845,
+      "learning_rate": 8.995215311004786e-05,
+      "loss": 0.2548,
+      "step": 29940
+    },
+    {
+      "epoch": 47.77,
+      "grad_norm": 0.22788019478321075,
+      "learning_rate": 8.931419457735247e-05,
+      "loss": 0.2961,
+      "step": 29950
+    },
+    {
+      "epoch": 47.78,
+      "grad_norm": 0.24274033308029175,
+      "learning_rate": 8.86762360446571e-05,
+      "loss": 0.2933,
+      "step": 29960
+    },
+    {
+      "epoch": 47.8,
+      "grad_norm": 0.4556421637535095,
+      "learning_rate": 8.803827751196172e-05,
+      "loss": 0.3562,
+      "step": 29970
+    },
+    {
+      "epoch": 47.81,
+      "grad_norm": 0.5702171921730042,
+      "learning_rate": 8.740031897926635e-05,
+      "loss": 0.3005,
+      "step": 29980
+    },
+    {
+      "epoch": 47.83,
+      "grad_norm": 0.32142460346221924,
+      "learning_rate": 8.676236044657097e-05,
+      "loss": 0.2663,
+      "step": 29990
+    },
+    {
+      "epoch": 47.85,
+      "grad_norm": 0.17863740026950836,
+      "learning_rate": 8.612440191387561e-05,
+      "loss": 0.2509,
+      "step": 30000
+    },
+    {
+      "epoch": 47.86,
+      "grad_norm": 0.09491372853517532,
+      "learning_rate": 8.548644338118023e-05,
+      "loss": 0.3576,
+      "step": 30010
+    },
+    {
+      "epoch": 47.88,
+      "grad_norm": 0.4455479085445404,
+      "learning_rate": 8.484848484848486e-05,
+      "loss": 0.4137,
+      "step": 30020
+    },
+    {
+      "epoch": 47.89,
+      "grad_norm": 0.33983567357063293,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 0.2838,
+      "step": 30030
+    },
+    {
+      "epoch": 47.91,
+      "grad_norm": 0.4801020622253418,
+      "learning_rate": 8.35725677830941e-05,
+      "loss": 0.2983,
+      "step": 30040
+    },
+    {
+      "epoch": 47.93,
+      "grad_norm": 0.33874234557151794,
+      "learning_rate": 8.293460925039872e-05,
+      "loss": 0.2689,
+      "step": 30050
+    },
+    {
+      "epoch": 47.94,
+      "grad_norm": 0.269828736782074,
+      "learning_rate": 8.229665071770335e-05,
+      "loss": 0.3258,
+      "step": 30060
+    },
+    {
+      "epoch": 47.96,
+      "grad_norm": 0.0987486019730568,
+      "learning_rate": 8.165869218500797e-05,
+      "loss": 0.2461,
+      "step": 30070
+    },
+    {
+      "epoch": 47.97,
+      "grad_norm": 0.3457973897457123,
+      "learning_rate": 8.10207336523126e-05,
+      "loss": 0.2783,
+      "step": 30080
+    },
+    {
+      "epoch": 47.99,
+      "grad_norm": 0.10124126076698303,
+      "learning_rate": 8.038277511961722e-05,
+      "loss": 0.2794,
+      "step": 30090
+    },
+    {
+      "epoch": 48.01,
+      "grad_norm": 0.40085652470588684,
+      "learning_rate": 7.974481658692186e-05,
+      "loss": 0.326,
+      "step": 30100
+    },
+    {
+      "epoch": 48.02,
+      "grad_norm": 0.184198796749115,
+      "learning_rate": 7.910685805422648e-05,
+      "loss": 0.2469,
+      "step": 30110
+    },
+    {
+      "epoch": 48.04,
+      "grad_norm": 0.2005092054605484,
+      "learning_rate": 7.846889952153111e-05,
+      "loss": 0.3044,
+      "step": 30120
+    },
+    {
+      "epoch": 48.05,
+      "grad_norm": 0.35767000913619995,
+      "learning_rate": 7.783094098883573e-05,
+      "loss": 0.2981,
+      "step": 30130
+    },
+    {
+      "epoch": 48.07,
+      "grad_norm": 0.38873291015625,
+      "learning_rate": 7.719298245614036e-05,
+      "loss": 0.3039,
+      "step": 30140
+    },
+    {
+      "epoch": 48.09,
+      "grad_norm": 0.22854940593242645,
+      "learning_rate": 7.655502392344497e-05,
+      "loss": 0.2068,
+      "step": 30150
+    },
+    {
+      "epoch": 48.1,
+      "grad_norm": 0.1659734845161438,
+      "learning_rate": 7.59170653907496e-05,
+      "loss": 0.277,
+      "step": 30160
+    },
+    {
+      "epoch": 48.12,
+      "grad_norm": 0.1869482696056366,
+      "learning_rate": 7.527910685805422e-05,
+      "loss": 0.2194,
+      "step": 30170
+    },
+    {
+      "epoch": 48.13,
+      "grad_norm": 0.08279826492071152,
+      "learning_rate": 7.464114832535885e-05,
+      "loss": 0.2824,
+      "step": 30180
+    },
+    {
+      "epoch": 48.15,
+      "grad_norm": 0.4725863039493561,
+      "learning_rate": 7.400318979266348e-05,
+      "loss": 0.2503,
+      "step": 30190
+    },
+    {
+      "epoch": 48.17,
+      "grad_norm": 0.172104611992836,
+      "learning_rate": 7.336523125996811e-05,
+      "loss": 0.2658,
+      "step": 30200
+    },
+    {
+      "epoch": 48.18,
+      "grad_norm": 0.21676242351531982,
+      "learning_rate": 7.272727272727273e-05,
+      "loss": 0.2658,
+      "step": 30210
+    },
+    {
+      "epoch": 48.2,
+      "grad_norm": 0.3602610230445862,
+      "learning_rate": 7.208931419457736e-05,
+      "loss": 0.2367,
+      "step": 30220
+    },
+    {
+      "epoch": 48.21,
+      "grad_norm": 0.3500073552131653,
+      "learning_rate": 7.145135566188198e-05,
+      "loss": 0.3014,
+      "step": 30230
+    },
+    {
+      "epoch": 48.23,
+      "grad_norm": 0.3083650469779968,
+      "learning_rate": 7.08133971291866e-05,
+      "loss": 0.3093,
+      "step": 30240
+    },
+    {
+      "epoch": 48.25,
+      "grad_norm": 0.20540174841880798,
+      "learning_rate": 7.017543859649122e-05,
+      "loss": 0.2626,
+      "step": 30250
+    },
+    {
+      "epoch": 48.26,
+      "grad_norm": 0.26233381032943726,
+      "learning_rate": 6.953748006379585e-05,
+      "loss": 0.3743,
+      "step": 30260
+    },
+    {
+      "epoch": 48.28,
+      "grad_norm": 0.6622065901756287,
+      "learning_rate": 6.889952153110047e-05,
+      "loss": 0.3261,
+      "step": 30270
+    },
+    {
+      "epoch": 48.29,
+      "grad_norm": 0.50579833984375,
+      "learning_rate": 6.826156299840511e-05,
+      "loss": 0.2812,
+      "step": 30280
+    },
+    {
+      "epoch": 48.31,
+      "grad_norm": 0.22522664070129395,
+      "learning_rate": 6.762360446570973e-05,
+      "loss": 0.247,
+      "step": 30290
+    },
+    {
+      "epoch": 48.33,
+      "grad_norm": 0.334440141916275,
+      "learning_rate": 6.698564593301436e-05,
+      "loss": 0.2827,
+      "step": 30300
+    },
+    {
+      "epoch": 48.34,
+      "grad_norm": 0.10422962158918381,
+      "learning_rate": 6.634768740031898e-05,
+      "loss": 0.2475,
+      "step": 30310
+    },
+    {
+      "epoch": 48.36,
+      "grad_norm": 0.409278005361557,
+      "learning_rate": 6.570972886762361e-05,
+      "loss": 0.3053,
+      "step": 30320
+    },
+    {
+      "epoch": 48.37,
+      "grad_norm": 0.15748478472232819,
+      "learning_rate": 6.507177033492823e-05,
+      "loss": 0.3292,
+      "step": 30330
+    },
+    {
+      "epoch": 48.39,
+      "grad_norm": 0.1966976523399353,
+      "learning_rate": 6.443381180223286e-05,
+      "loss": 0.2462,
+      "step": 30340
+    },
+    {
+      "epoch": 48.41,
+      "grad_norm": 0.34300366044044495,
+      "learning_rate": 6.379585326953747e-05,
+      "loss": 0.3215,
+      "step": 30350
+    },
+    {
+      "epoch": 48.42,
+      "grad_norm": 0.15784505009651184,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 0.2191,
+      "step": 30360
+    },
+    {
+      "epoch": 48.44,
+      "grad_norm": 0.1942838877439499,
+      "learning_rate": 6.251993620414672e-05,
+      "loss": 0.2964,
+      "step": 30370
+    },
+    {
+      "epoch": 48.45,
+      "grad_norm": 0.23638346791267395,
+      "learning_rate": 6.188197767145136e-05,
+      "loss": 0.2913,
+      "step": 30380
+    },
+    {
+      "epoch": 48.47,
+      "grad_norm": 0.18222945928573608,
+      "learning_rate": 6.124401913875598e-05,
+      "loss": 0.2863,
+      "step": 30390
+    },
+    {
+      "epoch": 48.48,
+      "grad_norm": 0.13442523777484894,
+      "learning_rate": 6.060606060606061e-05,
+      "loss": 0.3137,
+      "step": 30400
+    },
+    {
+      "epoch": 48.5,
+      "grad_norm": 0.09403583407402039,
+      "learning_rate": 5.9968102073365235e-05,
+      "loss": 0.2212,
+      "step": 30410
+    },
+    {
+      "epoch": 48.52,
+      "grad_norm": 0.21507516503334045,
+      "learning_rate": 5.933014354066986e-05,
+      "loss": 0.2613,
+      "step": 30420
+    },
+    {
+      "epoch": 48.53,
+      "grad_norm": 0.41693365573883057,
+      "learning_rate": 5.869218500797448e-05,
+      "loss": 0.2824,
+      "step": 30430
+    },
+    {
+      "epoch": 48.55,
+      "grad_norm": 0.23327617347240448,
+      "learning_rate": 5.8054226475279106e-05,
+      "loss": 0.3104,
+      "step": 30440
+    },
+    {
+      "epoch": 48.56,
+      "grad_norm": 0.6092672348022461,
+      "learning_rate": 5.7416267942583736e-05,
+      "loss": 0.309,
+      "step": 30450
+    },
+    {
+      "epoch": 48.58,
+      "grad_norm": 0.14301355183124542,
+      "learning_rate": 5.677830940988836e-05,
+      "loss": 0.2445,
+      "step": 30460
+    },
+    {
+      "epoch": 48.6,
+      "grad_norm": 0.42832037806510925,
+      "learning_rate": 5.6140350877192984e-05,
+      "loss": 0.2872,
+      "step": 30470
+    },
+    {
+      "epoch": 48.61,
+      "grad_norm": 0.25466400384902954,
+      "learning_rate": 5.550239234449761e-05,
+      "loss": 0.2659,
+      "step": 30480
+    },
+    {
+      "epoch": 48.63,
+      "grad_norm": 0.2657581865787506,
+      "learning_rate": 5.486443381180224e-05,
+      "loss": 0.2374,
+      "step": 30490
+    },
+    {
+      "epoch": 48.64,
+      "grad_norm": 0.40479007363319397,
+      "learning_rate": 5.422647527910686e-05,
+      "loss": 0.3537,
+      "step": 30500
+    },
+    {
+      "epoch": 48.66,
+      "grad_norm": 0.28331390023231506,
+      "learning_rate": 5.3588516746411485e-05,
+      "loss": 0.3156,
+      "step": 30510
+    },
+    {
+      "epoch": 48.68,
+      "grad_norm": 0.27074429392814636,
+      "learning_rate": 5.295055821371611e-05,
+      "loss": 0.2869,
+      "step": 30520
+    },
+    {
+      "epoch": 48.69,
+      "grad_norm": 0.21443207561969757,
+      "learning_rate": 5.231259968102073e-05,
+      "loss": 0.2715,
+      "step": 30530
+    },
+    {
+      "epoch": 48.71,
+      "grad_norm": 0.28873592615127563,
+      "learning_rate": 5.167464114832536e-05,
+      "loss": 0.283,
+      "step": 30540
+    },
+    {
+      "epoch": 48.72,
+      "grad_norm": 0.2248823344707489,
+      "learning_rate": 5.1036682615629986e-05,
+      "loss": 0.2609,
+      "step": 30550
+    },
+    {
+      "epoch": 48.74,
+      "grad_norm": 0.16412192583084106,
+      "learning_rate": 5.039872408293461e-05,
+      "loss": 0.2673,
+      "step": 30560
+    },
+    {
+      "epoch": 48.76,
+      "grad_norm": 0.37860092520713806,
+      "learning_rate": 4.9760765550239234e-05,
+      "loss": 0.2795,
+      "step": 30570
+    },
+    {
+      "epoch": 48.77,
+      "grad_norm": 0.5846998691558838,
+      "learning_rate": 4.912280701754386e-05,
+      "loss": 0.3345,
+      "step": 30580
+    },
+    {
+      "epoch": 48.79,
+      "grad_norm": 0.4207826554775238,
+      "learning_rate": 4.848484848484849e-05,
+      "loss": 0.2574,
+      "step": 30590
+    },
+    {
+      "epoch": 48.8,
+      "grad_norm": 0.2351989895105362,
+      "learning_rate": 4.784688995215311e-05,
+      "loss": 0.2994,
+      "step": 30600
+    },
+    {
+      "epoch": 48.82,
+      "grad_norm": 0.29773497581481934,
+      "learning_rate": 4.7208931419457735e-05,
+      "loss": 0.3491,
+      "step": 30610
+    },
+    {
+      "epoch": 48.84,
+      "grad_norm": 0.3682696521282196,
+      "learning_rate": 4.657097288676236e-05,
+      "loss": 0.2976,
+      "step": 30620
+    },
+    {
+      "epoch": 48.85,
+      "grad_norm": 0.33122923970222473,
+      "learning_rate": 4.593301435406699e-05,
+      "loss": 0.314,
+      "step": 30630
+    },
+    {
+      "epoch": 48.87,
+      "grad_norm": 0.3438310921192169,
+      "learning_rate": 4.529505582137161e-05,
+      "loss": 0.3169,
+      "step": 30640
+    },
+    {
+      "epoch": 48.88,
+      "grad_norm": 0.32344672083854675,
+      "learning_rate": 4.4657097288676236e-05,
+      "loss": 0.2184,
+      "step": 30650
+    },
+    {
+      "epoch": 48.9,
+      "grad_norm": 0.4275621771812439,
+      "learning_rate": 4.401913875598086e-05,
+      "loss": 0.3592,
+      "step": 30660
+    },
+    {
+      "epoch": 48.92,
+      "grad_norm": 0.514369785785675,
+      "learning_rate": 4.3381180223285484e-05,
+      "loss": 0.2393,
+      "step": 30670
+    },
+    {
+      "epoch": 48.93,
+      "grad_norm": 0.23344865441322327,
+      "learning_rate": 4.2743221690590114e-05,
+      "loss": 0.3637,
+      "step": 30680
+    },
+    {
+      "epoch": 48.95,
+      "grad_norm": 0.2496626079082489,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 0.3157,
+      "step": 30690
+    },
+    {
+      "epoch": 48.96,
+      "grad_norm": 0.15069235861301422,
+      "learning_rate": 4.146730462519936e-05,
+      "loss": 0.2731,
+      "step": 30700
+    },
+    {
+      "epoch": 48.98,
+      "grad_norm": 0.5047960877418518,
+      "learning_rate": 4.0829346092503985e-05,
+      "loss": 0.2811,
+      "step": 30710
+    },
+    {
+      "epoch": 49.0,
+      "grad_norm": 0.34830254316329956,
+      "learning_rate": 4.019138755980861e-05,
+      "loss": 0.2925,
+      "step": 30720
+    },
+    {
+      "epoch": 49.01,
+      "grad_norm": 0.4893124997615814,
+      "learning_rate": 3.955342902711324e-05,
+      "loss": 0.308,
+      "step": 30730
+    },
+    {
+      "epoch": 49.03,
+      "grad_norm": 0.3630107045173645,
+      "learning_rate": 3.891547049441786e-05,
+      "loss": 0.2671,
+      "step": 30740
+    },
+    {
+      "epoch": 49.04,
+      "grad_norm": 0.16974857449531555,
+      "learning_rate": 3.8277511961722486e-05,
+      "loss": 0.3295,
+      "step": 30750
+    },
+    {
+      "epoch": 49.06,
+      "grad_norm": 0.34105682373046875,
+      "learning_rate": 3.763955342902711e-05,
+      "loss": 0.2895,
+      "step": 30760
+    },
+    {
+      "epoch": 49.07,
+      "grad_norm": 0.47773271799087524,
+      "learning_rate": 3.700159489633174e-05,
+      "loss": 0.2591,
+      "step": 30770
+    },
+    {
+      "epoch": 49.09,
+      "grad_norm": 0.3436296582221985,
+      "learning_rate": 3.6363636363636364e-05,
+      "loss": 0.2138,
+      "step": 30780
+    },
+    {
+      "epoch": 49.11,
+      "grad_norm": 0.1262790709733963,
+      "learning_rate": 3.572567783094099e-05,
+      "loss": 0.2472,
+      "step": 30790
+    },
+    {
+      "epoch": 49.12,
+      "grad_norm": 0.2755976915359497,
+      "learning_rate": 3.508771929824561e-05,
+      "loss": 0.2532,
+      "step": 30800
+    },
+    {
+      "epoch": 49.14,
+      "grad_norm": 0.16442789137363434,
+      "learning_rate": 3.4449760765550235e-05,
+      "loss": 0.2813,
+      "step": 30810
+    },
+    {
+      "epoch": 49.15,
+      "grad_norm": 0.29541754722595215,
+      "learning_rate": 3.3811802232854866e-05,
+      "loss": 0.3036,
+      "step": 30820
+    },
+    {
+      "epoch": 49.17,
+      "grad_norm": 0.07406118512153625,
+      "learning_rate": 3.317384370015949e-05,
+      "loss": 0.2552,
+      "step": 30830
+    },
+    {
+      "epoch": 49.19,
+      "grad_norm": 0.413967102766037,
+      "learning_rate": 3.253588516746411e-05,
+      "loss": 0.3118,
+      "step": 30840
+    },
+    {
+      "epoch": 49.2,
+      "grad_norm": 0.567054808139801,
+      "learning_rate": 3.1897926634768736e-05,
+      "loss": 0.259,
+      "step": 30850
+    },
+    {
+      "epoch": 49.22,
+      "grad_norm": 0.19133225083351135,
+      "learning_rate": 3.125996810207336e-05,
+      "loss": 0.216,
+      "step": 30860
+    },
+    {
+      "epoch": 49.23,
+      "grad_norm": 0.35869938135147095,
+      "learning_rate": 3.062200956937799e-05,
+      "loss": 0.3244,
+      "step": 30870
+    },
+    {
+      "epoch": 49.25,
+      "grad_norm": 0.3546787202358246,
+      "learning_rate": 2.9984051036682618e-05,
+      "loss": 0.3345,
+      "step": 30880
+    },
+    {
+      "epoch": 49.27,
+      "grad_norm": 0.3473091721534729,
+      "learning_rate": 2.934609250398724e-05,
+      "loss": 0.2133,
+      "step": 30890
+    },
+    {
+      "epoch": 49.28,
+      "grad_norm": 0.4771929979324341,
+      "learning_rate": 2.8708133971291868e-05,
+      "loss": 0.3053,
+      "step": 30900
+    },
+    {
+      "epoch": 49.3,
+      "grad_norm": 0.3776096701622009,
+      "learning_rate": 2.8070175438596492e-05,
+      "loss": 0.2765,
+      "step": 30910
+    },
+    {
+      "epoch": 49.31,
+      "grad_norm": 0.2937834560871124,
+      "learning_rate": 2.743221690590112e-05,
+      "loss": 0.2581,
+      "step": 30920
+    },
+    {
+      "epoch": 49.33,
+      "grad_norm": 0.2534268796443939,
+      "learning_rate": 2.6794258373205743e-05,
+      "loss": 0.2985,
+      "step": 30930
+    },
+    {
+      "epoch": 49.35,
+      "grad_norm": 0.18742942810058594,
+      "learning_rate": 2.6156299840510366e-05,
+      "loss": 0.2159,
+      "step": 30940
+    },
+    {
+      "epoch": 49.36,
+      "grad_norm": 0.3918183147907257,
+      "learning_rate": 2.5518341307814993e-05,
+      "loss": 0.3054,
+      "step": 30950
+    },
+    {
+      "epoch": 49.38,
+      "grad_norm": 0.33097043633461,
+      "learning_rate": 2.4880382775119617e-05,
+      "loss": 0.3201,
+      "step": 30960
+    },
+    {
+      "epoch": 49.39,
+      "grad_norm": 0.37174108624458313,
+      "learning_rate": 2.4242424242424244e-05,
+      "loss": 0.2708,
+      "step": 30970
+    },
+    {
+      "epoch": 49.41,
+      "grad_norm": 0.27249741554260254,
+      "learning_rate": 2.3604465709728868e-05,
+      "loss": 0.2347,
+      "step": 30980
+    },
+    {
+      "epoch": 49.43,
+      "grad_norm": 0.7410305738449097,
+      "learning_rate": 2.2966507177033495e-05,
+      "loss": 0.3456,
+      "step": 30990
+    },
+    {
+      "epoch": 49.44,
+      "grad_norm": 0.4471137225627899,
+      "learning_rate": 2.2328548644338118e-05,
+      "loss": 0.2472,
+      "step": 31000
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 31350,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 500,
+  "total_flos": 8.371975248433152e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}